server : allow json array in prompt or content for direct token input (#2306 )

* server: allow json array in prompt or content We accept an array of strings and numbers representing tokens, in addition to the current string valued prompt or content. This allows direct token input, so that any special tokens can be processed and used at the frontend during the construction of the json data, before sending to the server. And the server does not need to know or parse special tokens from textual input. With this, we can use EOS and BOS used in llama-2-chat models. * server: use tokenizePrompt(json) and default "" if empty prompt * server: fix prompt check * server: tokenize endpoint no longer adds BOS
docs : add grammar docs (#2701 )
2026-02-26 14:23:22 +02:00 · 2023-08-23 15:12:12 +08:00 · 2023-08-22 21:01:57 -04:00 · 2023-08-22 17:39:39 -06:00 · 2023-08-23 00:10:42 +03:00 · 2023-08-22 22:47:05 +02:00
122 changed files with 33154 additions and 13580 deletions
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -197,6 +197,8 @@ jobs:
    strategy:
      matrix:
        include:
+          - build: 'noavx'
+            defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF'
          - build: 'avx2'
            defines: '-DLLAMA_BUILD_SERVER=ON'
          - build: 'avx'
@@ -308,13 +310,13 @@ jobs:
          path: |
            llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip

-  windows-latest-cmake-cuda:
+  windows-latest-cmake-cublas:
    runs-on: windows-latest

    strategy:
      matrix:
        cuda: ['12.1.0', '11.7.1']
-        build: ['cuda']
+        build: ['cublas']

    steps:
      - name: Clone
@@ -333,7 +335,7 @@ jobs:
        run: |
          mkdir build
          cd build
-          cmake .. -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON
+          cmake .. -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON
          cmake --build . --config Release

      - name: Get commit hash
@@ -395,7 +397,7 @@ jobs:
      - macOS-latest-make
      - macOS-latest-cmake
      - windows-latest-cmake
-      - windows-latest-cmake-cuda
+      - windows-latest-cmake-cublas

    steps:
      - name: Download artifacts
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,8 @@
 *.o
 *.a
 *.so
+*.gguf
+*.bin
 .DS_Store
 .build/
 .cache/
@@ -16,6 +18,8 @@ build/
 build-em/
 build-debug/
 build-release/
+build-ci-debug/
+build-ci-release/
 build-static/
 build-cublas/
 build-opencl/
@@ -25,9 +29,10 @@ build-no-accel/
 build-sanitize-addr/
 build-sanitize-thread/
 out/
+tmp/

 models/*
-*.bin
+models-mnt

 /main
 /quantize
@@ -36,13 +41,17 @@ models/*
 /perplexity
 /embedding
 /train-text-from-scratch
+/convert-llama2c-to-ggml
 /simple
 /benchmark-matmult
 /vdot
 /server
 /Pipfile
 /embd-input-test
+/gguf
+/gguf-llama-simple
 /libllama.so
+/llama-bench
 build-info.h
 arm_neon.h
 compile_commands.json
@@ -58,3 +67,18 @@ qnt-*.txt
 perf-*.txt

 examples/jeopardy/results.txt
+
+pyproject.toml
+poetry.lock
+poetry.toml
+
+# Test binaries
+tests/test-grammar-parser
+tests/test-double-float
+tests/test-grad0
+tests/test-opt
+tests/test-quantize-fns
+tests/test-quantize-perf
+tests/test-sampling
+tests/test-tokenizer-0
+
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -67,11 +67,12 @@ endif()
 option(LLAMA_ACCELERATE                      "llama: enable Accelerate framework"               ON)
 option(LLAMA_BLAS                            "llama: use BLAS"                                  OFF)
 set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
-option(LLAMA_CUDA                            "llama: use CUDA"                                  OFF)
+option(LLAMA_CUBLAS                          "llama: use CUDA"                                  OFF)
+#option(LLAMA_CUDA_CUBLAS                     "llama: use cuBLAS for prompt processing"          OFF)
 option(LLAMA_CUDA_FORCE_DMMV                 "llama: use dmmv instead of mmvq CUDA kernels"     OFF)
 set(LLAMA_CUDA_DMMV_X      "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
 set(LLAMA_CUDA_MMV_Y        "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
-option(LLAMA_CUDA_DMMV_F16                   "llama: use 16 bit floats for dmmv CUDA kernels"   OFF)
+option(LLAMA_CUDA_F16                        "llama: use 16 bit floats for some calculations"   OFF)
 set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
 option(LLAMA_CLBLAST                         "llama: use CLBlast"                               OFF)
 option(LLAMA_METAL                           "llama: use Metal"                                 OFF)
@@ -239,18 +240,21 @@ if (LLAMA_K_QUANTS)
    endif()
 endif()

-if (LLAMA_CUDA)
+if (LLAMA_CUBLAS)
    cmake_minimum_required(VERSION 3.17)

    find_package(CUDAToolkit)
    if (CUDAToolkit_FOUND)
-        message(STATUS "CUDA found")
+        message(STATUS "cuBLAS found")

        enable_language(CUDA)

        set(GGML_SOURCES_CUDA ggml-cuda.cu ggml-cuda.h)

-        add_compile_definitions(GGML_USE_CUDA)
+        add_compile_definitions(GGML_USE_CUBLAS)
+#        if (LLAMA_CUDA_CUBLAS)
+#            add_compile_definitions(GGML_CUDA_CUBLAS)
+#        endif()
        if (LLAMA_CUDA_FORCE_DMMV)
            add_compile_definitions(GGML_CUDA_FORCE_DMMV)
        endif()
@@ -259,8 +263,8 @@ if (LLAMA_CUDA)
        if (DEFINED LLAMA_CUDA_DMMV_Y)
            add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_DMMV_Y}) # for backwards compatibility
        endif()
-        if (LLAMA_CUDA_DMMV_F16)
-            add_compile_definitions(GGML_CUDA_DMMV_F16)
+        if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
+            add_compile_definitions(GGML_CUDA_F16)
        endif()
        add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})

@@ -271,16 +275,20 @@ if (LLAMA_CUDA)
        endif()

    if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
-        if (LLAMA_CUDA_DMMV_F16)
-            set(CMAKE_CUDA_ARCHITECTURES "60;61") # needed for f16 CUDA intrinsics
+        # 52 == lowest CUDA 12 standard
+        # 60 == f16 CUDA intrinsics
+        # 61 == integer CUDA intrinsics
+        # 70 == compute capability at which unrolling a loop in mul_mat_q kernels is faster
+        if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
+            set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
        else()
-            set(CMAKE_CUDA_ARCHITECTURES "52;61") # lowest CUDA 12 standard + lowest for integer intrinsics
+            set(CMAKE_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
        endif()
    endif()
    message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")

    else()
-        message(WARNING "CUDA not found")
+        message(WARNING "cuBLAS not found")
    endif()
 endif()

@@ -288,7 +296,6 @@ if (LLAMA_METAL)
    find_library(FOUNDATION_LIBRARY         Foundation              REQUIRED)
    find_library(METAL_FRAMEWORK            Metal                   REQUIRED)
    find_library(METALKIT_FRAMEWORK         MetalKit                REQUIRED)
-    find_library(METALPERFORMANCE_FRAMEWORK MetalPerformanceShaders REQUIRED)

    set(GGML_SOURCES_METAL ggml-metal.m ggml-metal.h)

@@ -305,7 +312,6 @@ if (LLAMA_METAL)
        ${FOUNDATION_LIBRARY}
        ${METAL_FRAMEWORK}
        ${METALKIT_FRAMEWORK}
-        ${METALPERFORMANCE_FRAMEWORK}
        )
 endif()

@@ -357,6 +363,7 @@ if (LLAMA_ALL_WARNINGS)
            -Wshadow
            -Wstrict-prototypes
            -Wpointer-arith
+            -Wmissing-prototypes
        )
        set(cxx_flags
            -Wall
@@ -490,12 +497,16 @@ else()
 endif()

 #
-# Build libraries
+# libraries
 #

+# ggml
+
 add_library(ggml OBJECT
            ggml.c
            ggml.h
+            ggml-alloc.c
+            ggml-alloc.h
            ${GGML_SOURCES_CUDA}
            ${GGML_SOURCES_OPENCL}
            ${GGML_SOURCES_METAL}
@@ -512,12 +523,14 @@ if (BUILD_SHARED_LIBS)
    set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON)
    add_library(ggml_shared SHARED $<TARGET_OBJECTS:ggml>)
    target_link_libraries(ggml_shared PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
+    install(TARGETS ggml_shared LIBRARY)
 endif()

+# llama
+
 add_library(llama
            llama.cpp
            llama.h
-            llama-util.h
            )

 target_include_directories(llama PUBLIC .)
@@ -533,13 +546,53 @@ if (BUILD_SHARED_LIBS)
    if (LLAMA_METAL)
        set_target_properties(llama PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal")
    endif()
+    install(TARGETS llama LIBRARY)
 endif()

+#
+# install
+#
+
+include(GNUInstallDirs)
+install(
+    FILES convert.py
+    PERMISSIONS
+        OWNER_READ
+        OWNER_WRITE
+        OWNER_EXECUTE
+        GROUP_READ
+        GROUP_EXECUTE
+        WORLD_READ
+        WORLD_EXECUTE
+    DESTINATION ${CMAKE_INSTALL_BINDIR})
+install(
+    FILES convert-lora-to-ggml.py
+    PERMISSIONS
+        OWNER_READ
+        OWNER_WRITE
+        OWNER_EXECUTE
+        GROUP_READ
+        GROUP_EXECUTE
+        WORLD_READ
+        WORLD_EXECUTE
+    DESTINATION ${CMAKE_INSTALL_BINDIR})
+if (LLAMA_METAL)
+    install(
+        FILES ggml-metal.metal
+        PERMISSIONS
+            OWNER_READ
+            OWNER_WRITE
+            GROUP_READ
+            WORLD_READ
+        DESTINATION ${CMAKE_INSTALL_BINDIR})
+endif()

 #
 # programs, examples and tests
 #

+add_subdirectory(common)
+
 if (LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
    include(CTest)
    add_subdirectory(tests)
--- a/205
+++ b/205
@@ -1,5 +1,8 @@
 # Define the default target now so that it is always the first target
-BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch simple server libembdinput.so embd-input-test
+BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple server embd-input-test gguf llama-bench
+
+# Binaries only useful for tests
+TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0

 default: $(BUILD_TARGETS)

@@ -42,8 +45,8 @@ OPT = -Ofast
 else
 OPT = -O3
 endif
-CFLAGS   = -I.              $(OPT) -std=c11   -fPIC
-CXXFLAGS = -I. -I./examples $(OPT) -std=c++11 -fPIC
+CFLAGS   = -I.            $(OPT) -std=c11   -fPIC
+CXXFLAGS = -I. -I./common $(OPT) -std=c++11 -fPIC
 LDFLAGS  =

 ifdef LLAMA_DEBUG
@@ -55,18 +58,13 @@ else
 	CXXFLAGS += -DNDEBUG
 endif

-ifdef LLAMA_SANITIZE
-	CFLAGS   += -g -fsanitize=$(LLAMA_SANITIZE) -fno-omit-frame-pointer
-	CXXFLAGS += -g -fsanitize=$(LLAMA_SANITIZE) -fno-omit-frame-pointer
-	LDFLAGS  += -g -fsanitize=$(LLAMA_SANITIZE)
-endif
-
 ifdef LLAMA_SERVER_VERBOSE
 	CXXFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE)
 endif

 # warnings
-CFLAGS   += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith
+CFLAGS   += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith \
+			-Wmissing-prototypes
 CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar

 # OS specific
@@ -96,6 +94,28 @@ ifeq ($(UNAME_S),Haiku)
 	CXXFLAGS += -pthread
 endif

+# detect Windows
+ifneq ($(findstring _NT,$(UNAME_S)),)
+	_WIN32 := 1
+endif
+
+# library name prefix
+ifneq ($(_WIN32),1)
+	LIB_PRE := lib
+endif
+
+# Dynamic Shared Object extension
+ifneq ($(_WIN32),1)
+	DSO_EXT := .so
+else
+	DSO_EXT := .dll
+endif
+
+# Windows Sockets 2 (Winsock) for network-capable apps
+ifeq ($(_WIN32),1)
+	LWINSOCK2 := -lws2_32
+endif
+
 ifdef LLAMA_GPROF
 	CFLAGS   += -pg
 	CXXFLAGS += -pg
@@ -108,7 +128,7 @@ endif
 # Architecture specific
 # TODO: probably these flags need to be tweaked on some architectures
 #       feel free to update the Makefile for your architecture and send a pull request or issue
-ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))
+ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
 	# Use all CPU extensions that are available:
 	CFLAGS   += -march=native -mtune=native
 	CXXFLAGS += -march=native -mtune=native
@@ -122,6 +142,28 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))
 	#CXXFLAGS += -mssse3
 endif

+ifneq ($(filter aarch64%,$(UNAME_M)),)
+	# Apple M1, M2, etc.
+	# Raspberry Pi 3, 4, Zero 2 (64-bit)
+	CFLAGS   += -mcpu=native
+	CXXFLAGS += -mcpu=native
+endif
+
+ifneq ($(filter armv6%,$(UNAME_M)),)
+	# Raspberry Pi 1, Zero
+	CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
+endif
+
+ifneq ($(filter armv7%,$(UNAME_M)),)
+	# Raspberry Pi 2
+	CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
+endif
+
+ifneq ($(filter armv8%,$(UNAME_M)),)
+	# Raspberry Pi 3, 4, Zero 2 (32-bit)
+	CFLAGS += -mfp16-format=ieee -mno-unaligned-access
+endif
+
 ifneq ($(filter ppc64%,$(UNAME_M)),)
 	POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)
 	ifneq (,$(findstring POWER9,$(POWER9_M)))
@@ -169,17 +211,17 @@ ifdef LLAMA_BLIS
 	LDFLAGS += -lblis -L/usr/local/lib
 endif # LLAMA_BLIS

-ifdef LLAMA_CUDA
-	CFLAGS    += -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
-	CXXFLAGS  += -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
+ifdef LLAMA_CUBLAS
+	CFLAGS    += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
+	CXXFLAGS  += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
 	LDFLAGS   += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
 	OBJS      += ggml-cuda.o
-	NVCC      = nvcc
-	NVCCFLAGS = --forward-unknown-to-host-compiler
-	NVCCV 	  := $(shell $(NVCC) --version | tail -n 1)
-ifdef LLAMA_DEBUG
-	NVCCFLAGS += -lineinfo
-endif # LLAMA_DEBUG
+	NVCCFLAGS = --forward-unknown-to-host-compiler -use_fast_math
+ifdef LLAMA_CUDA_NVCC
+	NVCC = $(LLAMA_CUDA_NVCC)
+else
+	NVCC = nvcc
+endif #LLAMA_CUDA_NVCC
 ifdef CUDA_DOCKER_ARCH
 	NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH)
 else
@@ -200,26 +242,37 @@ else ifdef LLAMA_CUDA_DMMV_Y
 else
 	NVCCFLAGS += -DGGML_CUDA_MMV_Y=1
 endif # LLAMA_CUDA_MMV_Y
+ifdef LLAMA_CUDA_F16
+	NVCCFLAGS += -DGGML_CUDA_F16
+endif # LLAMA_CUDA_F16
 ifdef LLAMA_CUDA_DMMV_F16
-	NVCCFLAGS += -DGGML_CUDA_DMMV_F16
+	NVCCFLAGS += -DGGML_CUDA_F16
 endif # LLAMA_CUDA_DMMV_F16
 ifdef LLAMA_CUDA_KQUANTS_ITER
 	NVCCFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
 else
 	NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
 endif
-ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml-cuda-kern.h ggml-cuda-quant.h
-	$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
-endif # LLAMA_CUDA
+#ifdef LLAMA_CUDA_CUBLAS
+#	NVCCFLAGS += -DGGML_CUDA_CUBLAS
+#endif # LLAMA_CUDA_CUBLAS
+ifdef LLAMA_CUDA_CCBIN
+	NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
+endif
+ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
+	$(NVCC) $(NVCCFLAGS) $(subst -Ofast,-O3,$(CXXFLAGS)) -Wno-pedantic -c $< -o $@
+endif # LLAMA_CUBLAS

 ifdef LLAMA_CLBLAST
-	CFLAGS   += -DGGML_USE_CLBLAST
-	CXXFLAGS += -DGGML_USE_CLBLAST
+
+	CFLAGS   += -DGGML_USE_CLBLAST $(shell pkg-config --cflags clblast OpenCL)
+	CXXFLAGS += -DGGML_USE_CLBLAST $(shell pkg-config --cflags clblast OpenCL)
+
 	# Mac provides OpenCL as a framework
 	ifeq ($(UNAME_S),Darwin)
 		LDFLAGS += -lclblast -framework OpenCL
 	else
-		LDFLAGS += -lclblast -lOpenCL
+		LDFLAGS += $(shell pkg-config --libs clblast OpenCL)
 	endif
 	OBJS    += ggml-opencl.o

@@ -230,32 +283,10 @@ endif # LLAMA_CLBLAST
 ifdef LLAMA_METAL
 	CFLAGS   += -DGGML_USE_METAL -DGGML_METAL_NDEBUG
 	CXXFLAGS += -DGGML_USE_METAL
-	LDFLAGS  += -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
+	LDFLAGS  += -framework Foundation -framework Metal -framework MetalKit
 	OBJS     += ggml-metal.o
 endif # LLAMA_METAL

-ifneq ($(filter aarch64%,$(UNAME_M)),)
-	# Apple M1, M2, etc.
-	# Raspberry Pi 3, 4, Zero 2 (64-bit)
-	CFLAGS   += -mcpu=native
-	CXXFLAGS += -mcpu=native
-endif
-
-ifneq ($(filter armv6%,$(UNAME_M)),)
-	# Raspberry Pi 1, Zero
-	CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
-endif
-
-ifneq ($(filter armv7%,$(UNAME_M)),)
-	# Raspberry Pi 2
-	CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
-endif
-
-ifneq ($(filter armv8%,$(UNAME_M)),)
-	# Raspberry Pi 3, 4, Zero 2 (32-bit)
-	CFLAGS += -mfp16-format=ieee -mno-unaligned-access
-endif
-
 ifdef LLAMA_METAL
 ggml-metal.o: ggml-metal.m ggml-metal.h
 	$(CC) $(CFLAGS) -c $< -o $@
@@ -284,9 +315,6 @@ $(info I CXXFLAGS: $(CXXFLAGS))
 $(info I LDFLAGS:  $(LDFLAGS))
 $(info I CC:       $(CCV))
 $(info I CXX:      $(CXXV))
-ifdef LLAMA_CUDA
-$(info I NVCC:     $(NVCCV))
-endif # LLAMA_CUDA
 $(info )

 #
@@ -296,29 +324,34 @@ $(info )
 ggml.o: ggml.c ggml.h ggml-cuda.h
 	$(CC)  $(CFLAGS)   -c $< -o $@

-# temporary, probably will be added to ggml.c
-ggml-backend.o: ggml-backend.c ggml-backend.h ggml.h
+ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
 	$(CC)  $(CFLAGS)   -c $< -o $@

-OBJS += ggml-backend.o
+OBJS += ggml-alloc.o

-llama.o: llama.cpp ggml.h ggml-cuda.h ggml-metal.h llama.h llama-util.h
+llama.o: llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h llama.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@

-common.o: examples/common.cpp examples/common.h
+common.o: common/common.cpp common/common.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+console.o: common/console.cpp common/console.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+grammar-parser.o: common/grammar-parser.cpp common/grammar-parser.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@

 libllama.so: llama.o ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)

 clean:
-	rm -vf *.o *.so main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server simple vdot train-text-from-scratch embd-input-test build-info.h
+	rm -vf *.o *.so *.dll main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server simple vdot train-text-from-scratch convert-llama2c-to-ggml embd-input-test gguf llama-bench build-info.h $(TEST_TARGETS)

 #
 # Examples
 #

-main: examples/main/main.cpp                                  build-info.h ggml.o llama.o common.o $(OBJS)
+main: examples/main/main.cpp                                  build-info.h ggml.o llama.o common.o console.o grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 	@echo
 	@echo '====  Run ./main -h for help.  ===='
@@ -342,17 +375,26 @@ embedding: examples/embedding/embedding.cpp                   build-info.h ggml.
 save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp build-info.h ggml.o llama.o common.o $(OBJS)
-	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS)
+server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS)
+	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2)

-libembdinput.so: examples/embd-input/embd-input.h examples/embd-input/embd-input-lib.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+$(LIB_PRE)embdinput$(DSO_EXT): examples/embd-input/embd-input.h examples/embd-input/embd-input-lib.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) --shared $(CXXFLAGS) $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS)


-embd-input-test: libembdinput.so examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o common.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.so,$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. -lembdinput
+embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %$(DSO_EXT),$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. -lembdinput

-train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp    build-info.h ggml.o llama.o $(OBJS)
+gguf: examples/gguf/gguf.cpp                                  build-info.h ggml.o llama.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
+train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp    build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
+convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp    build-info.h ggml.o llama.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
+llama-bench: examples/llama-bench/llama-bench.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

 build-info.h: $(wildcard .git/index) scripts/build-info.sh
@@ -367,6 +409,8 @@ build-info.h: $(wildcard .git/index) scripts/build-info.sh
 # Tests
 #

+tests: $(TEST_TARGETS)
+
 benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 	./$@
@@ -374,6 +418,29 @@ benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o
 vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)

-.PHONY: tests clean
-tests:
-	bash ./tests/run-tests.sh
+tests/test-llama-grammar: tests/test-llama-grammar.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
+
+tests/test-grammar-parser: tests/test-grammar-parser.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
+
+tests/test-double-float: tests/test-double-float.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
+
+tests/test-grad0: tests/test-grad0.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
+
+tests/test-opt: tests/test-opt.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
+
+tests/test-quantize-fns: tests/test-quantize-fns.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
+
+tests/test-quantize-perf: tests/test-quantize-perf.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
+
+tests/test-sampling: tests/test-sampling.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
+
+tests/test-tokenizer-0: tests/test-tokenizer-0.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
--- a/README.md
+++ b/README.md
@@ -9,13 +9,19 @@

 Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++

-**Hot topics:**
+### Hot topics

- Simple web chat example: https://github.com/ggerganov/llama.cpp/pull/1998
- k-quants now support super-block size of 64: https://github.com/ggerganov/llama.cpp/pull/2001
- New roadmap: https://github.com/users/ggerganov/projects/7
- Azure CI brainstorming: https://github.com/ggerganov/llama.cpp/discussions/1985
- p1 : LLM-based code completion engine at the edge : https://github.com/ggml-org/p1/discussions/1
+A new file format has been introduced: [GGUF](https://github.com/ggerganov/llama.cpp/pull/2398)
+
+Last revision compatible with the old format: [dadbed9](https://github.com/ggerganov/llama.cpp/commit/dadbed99e65252d79f81101a392d0d6497b86caa)
+
+### Current `master` should be considered in Beta - expect some issues for a few days!
+
+### Be prepared to re-convert and / or re-quantize your GGUF models while this notice is up!
+
+### Issues with non-GGUF models will be considered with low priority!
+
+----

 <details>
  <summary>Table of Contents</summary>
@@ -33,6 +39,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
        <li><a href="#memorydisk-requirements">Memory/Disk Requirements</a></li>
        <li><a href="#quantization">Quantization</a></li>
        <li><a href="#interactive-mode">Interactive mode</a></li>
+        <li><a href="#constrained-output-with-grammars">Constrained output with grammars</a></li>
        <li><a href="#instruction-mode-with-alpaca">Instruction mode with Alpaca</a></li>
        <li><a href="#using-openllama">Using OpenLLaMA</a></li>
        <li><a href="#using-gpt4all">Using GPT4All</a></li>
@@ -77,9 +84,10 @@ as the main playground for developing new features for the [ggml](https://github
 **Supported models:**

 - [X] LLaMA 🦙
+- [x] LLaMA 2 🦙🦙
 - [X] [Alpaca](https://github.com/ggerganov/llama.cpp#instruction-mode-with-alpaca)
 - [X] [GPT4All](https://github.com/ggerganov/llama.cpp#using-gpt4all)
- [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca)
+- [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) and [Chinese LLaMA-2 / Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2)
 - [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne)
 - [X] [Vicuna](https://github.com/ggerganov/llama.cpp/discussions/643#discussioncomment-5533894)
 - [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/)
@@ -87,6 +95,7 @@ as the main playground for developing new features for the [ggml](https://github
 - [X] [Pygmalion 7B / Metharme 7B](#using-pygmalion-7b--metharme-7b)
 - [X] [WizardLM](https://github.com/nlpxucan/WizardLM)
 - [X] [Baichuan-7B](https://huggingface.co/baichuan-inc/baichuan-7B) and its derivations (such as [baichuan-7b-sft](https://huggingface.co/hiyouga/baichuan-7b-sft))
+- [X] [Aquila-7B](https://huggingface.co/BAAI/Aquila-7B) / [AquilaChat-7B](https://huggingface.co/BAAI/AquilaChat-7B)

 **Bindings:**

@@ -94,8 +103,10 @@ as the main playground for developing new features for the [ggml](https://github
 - Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
 - Node.js: [hlhr202/llama-node](https://github.com/hlhr202/llama-node)
 - Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)
+- Rust: [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
 - C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp)
 - Scala 3: [donderom/llm4s](https://github.com/donderom/llm4s)
+- Clojure: [phronmophobic/llama.clj](https://github.com/phronmophobic/llama.clj)

 **UI:**

@@ -236,12 +247,34 @@ In order to build llama.cpp you have three different options.
    cmake --build . --config Release
    ```

- Using `Zig`:
+- Using `Zig` (version 0.11 or later):
+
+    Building for optimization levels and CPU features can be accomplished using standard build arguments, for example AVX2, FMA, F16C,
+    it's also possible to cross compile for other operating systems and architectures:

    ```bash
-    zig build -Doptimize=ReleaseFast
+    zig build -Doptimize=ReleaseFast -Dtarget=x86_64-windows-gnu -Dcpu=x86_64+avx2+fma+f16c
    ```

+    The `zig targets` command will give you valid options to use.
+
+-   Using `gmake` (FreeBSD):
+
+    1. Install and activate [DRM in FreeBSD](https://wiki.freebsd.org/Graphics)
+    2. Add your user to **video** group
+    3. Install compilation dependencies.
+
+        ```bash
+        sudo pkg install gmake automake autoconf pkgconf llvm15 clinfo clover \
+            opencl clblast openblas
+
+            gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j4
+        ```
+
+    **Notes:** With this packages you can build llama.cpp with OPENBLAS and
+    CLBLAST support for use OpenCL GPU acceleration in FreeBSD. Please read
+    the instructions for use and activate this options in this document below.
+
 ### Metal Build

 Using Metal allows the computation to be executed on the GPU for Apple devices:
@@ -265,7 +298,7 @@ When built with Metal support, you can enable GPU inference with the `--gpu-laye
 Any value larger than 0 will offload the computation to the GPU. For example:

 ```bash
-./main -m ./models/7B/ggml-model-q4_0.bin -n 128 -ngl 1
+./main -m ./models/7B/ggml-model-q4_0.gguf -n 128 -ngl 1
 ```

 ### MPI Build
@@ -304,7 +337,7 @@ The above will distribute the computation across 2 processes on the first host a
 Finally, you're ready to run a computation using `mpirun`:

 ```bash
-mpirun -hostfile hostfile -n 3 ./main -m ./models/7B/ggml-model-q4_0.bin -n 128
+mpirun -hostfile hostfile -n 3 ./main -m ./models/7B/ggml-model-q4_0.gguf -n 128
 ```

 ### BLAS Build
@@ -382,12 +415,15 @@ Building the program with BLAS support may lead to some performance improvements

  The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance:

+<!---
+  | LLAMA_CUDA_CUBLAS       | Boolean                |   false | Use cuBLAS instead of custom CUDA kernels for prompt processing. Faster for all quantization formats except for q4_0 and q8_0, especially for k-quants. Increases VRAM usage (700 MiB for 7b, 970 MiB for 13b, 1430 MiB for 33b). |
+--->
  | Option                  | Legal values           | Default | Description |
  |-------------------------|------------------------|---------|-------------|
-  | LLAMA_CUDA_FORCE_DMMV   | Boolean                |   false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 7.0/Turing/RTX 2000 or higher). Does not affect k-quants. |
+  | LLAMA_CUDA_FORCE_DMMV   | Boolean                |   false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
  | LLAMA_CUDA_DMMV_X       | Positive integer >= 32 |      32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
-  | LLAMA_CUDA_MMV_Y       | Positive integer       |       1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
-  | LLAMA_CUDA_DMMV_F16     | Boolean                |   false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels. Can improve performance on relatively recent GPUs. |
+  | LLAMA_CUDA_MMV_Y        | Positive integer       |       1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. |
+  | LLAMA_CUDA_F16          | Boolean                |   false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
  | LLAMA_CUDA_KQUANTS_ITER | 1 or 2                 |       2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |

 - #### CLBlast
@@ -470,6 +506,9 @@ Building the program with BLAS support may lead to some performance improvements
 # obtain the original LLaMA model weights and place them in ./models
 ls ./models
 65B 30B 13B 7B tokenizer_checklist.chk tokenizer.model
+  # [Optional] for models using BPE tokenizers
+  ls ./models
+  65B 30B 13B 7B vocab.json

 # install Python dependencies
 python3 -m pip install -r requirements.txt
@@ -477,11 +516,14 @@ python3 -m pip install -r requirements.txt
 # convert the 7B model to ggml FP16 format
 python3 convert.py models/7B/

+  # [Optional] for models using BPE tokenizers
+  python convert.py models/7B/ --vocabtype bpe
+
 # quantize the model to 4-bits (using q4_0 method)
-./quantize ./models/7B/ggml-model-f16.bin ./models/7B/ggml-model-q4_0.bin q4_0
+./quantize ./models/7B/ggml-model-f16.gguf ./models/7B/ggml-model-q4_0.gguf q4_0

 # run the inference
-./main -m ./models/7B/ggml-model-q4_0.bin -n 128
+./main -m ./models/7B/ggml-model-q4_0.gguf -n 128
 ```

 When running the larger models, make sure you have enough disk space to store all the intermediate files.
@@ -537,7 +579,7 @@ Here is an example of a few-shot interaction, invoked with the command
 ./examples/chat-13B.sh

 # custom arguments using a 13B model
-./main -m ./models/13B/ggml-model-q4_0.bin -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
+./main -m ./models/13B/ggml-model-q4_0.gguf -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
 ```

 Note the use of `--color` to distinguish between user input and generated text. Other parameters are explained in more detail in the [README](examples/main/README.md) for the `main` example program.
@@ -563,6 +605,16 @@ PROMPT_TEMPLATE=./prompts/chat-with-bob.txt PROMPT_CACHE_FILE=bob.prompt.bin \
    CHAT_SAVE_DIR=./chat/bob ./examples/chat-persistent.sh
 ```

+### Constrained output with grammars
+
+`llama.cpp` supports grammars to constrain model output. For example, you can force the model to output JSON only:
+
+```bash
+./main -m ./models/13B/ggml-model-q4_0.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:'
+```
+
+The `grammars/` folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](./grammars/README.md).
+
 ### Instruction mode with Alpaca

 1. First, download the `ggml` Alpaca model into the `./models` folder
@@ -600,6 +652,8 @@ OpenLLaMA is an openly licensed reproduction of Meta's original LLaMA model. It

 ### Using [GPT4All](https://github.com/nomic-ai/gpt4all)

+*Note: these instructions are likely obsoleted by the GGUF update*
+
 - Obtain the `tokenizer.model` file from LLaMA model and put it to `models`
 - Obtain the `added_tokens.json` file from Alpaca model and put it to `models`
 - Obtain the `gpt4all-lora-quantized.bin` file from GPT4All model and put it to `models/gpt4all-7B`
@@ -633,6 +687,19 @@ python3 convert.py pygmalion-7b/ --outtype q4_1
 - The LLaMA models are officially distributed by Facebook and will **never** be provided through this repository.
 - Refer to [Facebook's LLaMA repository](https://github.com/facebookresearch/llama/pull/73/files) if you need to request access to the model data.

+### Obtaining and using the Facebook LLaMA 2 model
+
+- Refer to [Facebook's LLaMA download page](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) if you want to access the model data.
+- Alternatively, if you want to save time and space, you can download already converted and quantized models from [TheBloke](https://huggingface.co/TheBloke), including:
+  - [LLaMA 2 7B base](https://huggingface.co/TheBloke/Llama-2-7B-GGML)
+  - [LLaMA 2 13B base](https://huggingface.co/TheBloke/Llama-2-13B-GGML)
+  - [LLaMA 2 70B base](https://huggingface.co/TheBloke/Llama-2-70B-GGML)
+  - [LLaMA 2 7B chat](https://huggingface.co/TheBloke/Llama-2-7B-chat-GGML)
+  - [LLaMA 2 13B chat](https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML)
+  - [LLaMA 2 70B chat](https://huggingface.co/TheBloke/Llama-2-70B-chat-GGML)
+- Specify `-eps 1e-5` for best generation quality
+- Specify `-gqa 8` for 70B models to work
+
 ### Verifying the model files

 Please verify the [sha256 checksums](SHA256SUMS) of all downloaded model files to confirm that you have the correct model data files before creating an issue relating to your model files.
@@ -640,7 +707,7 @@ Please verify the [sha256 checksums](SHA256SUMS) of all downloaded model files t

 ```bash
 # run the verification script
-python3 .\scripts\verify-checksum-models.py
+./scripts/verify-checksum-models.py
 ```

 - On linux or macOS it is also possible to run the following commands to verify if you have all possible latest files in your self-installed `./models` subdirectory:
@@ -662,7 +729,7 @@ If your issue is with model generation quality, then please at least scan the fo
 #### How to run

 1. Download/extract: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
-2. Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
+2. Run `./perplexity -m models/7B/ggml-model-q4_0.gguf -f wiki.test.raw`
 3. Output:
 ```
 perplexity : calculating perplexity over 655 chunks
@@ -761,13 +828,13 @@ docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-
 On completion, you are ready to play!

 ```bash
-docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
+docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
 ```

 or with a light image:

 ```bash
-docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
+docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
 ```

 ### Docker With CUDA
@@ -798,8 +865,8 @@ The resulting images, are essentially the same as the non-CUDA images:
 After building locally, Usage is similar to the non-CUDA examples, but you'll need to add the `--gpus` flag. You will also want to use the `--n-gpu-layers` flag.

 ```bash
-docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
-docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
+docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
+docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
 ```

 ### Contributing
@@ -829,3 +896,4 @@ docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /
 - [BLIS](./docs/BLIS.md)
 - [Performance troubleshooting](./docs/token_generation_performance_tips.md)
 - [GGML tips & tricks](https://github.com/ggerganov/llama.cpp/wiki/GGML-Tips-&-Tricks)
+- [GBNF grammars](./grammars/README.md)
--- a/build.zig
+++ b/build.zig
@@ -1,68 +1,121 @@
+// Compatible with Zig Version 0.11.0
 const std = @import("std");
-const commit_hash = @embedFile(".git/refs/heads/master");
+const ArrayList = std.ArrayList;
+const Compile = std.Build.Step.Compile;
+const ConfigHeader = std.Build.Step.ConfigHeader;
+const Mode = std.builtin.Mode;
+const CrossTarget = std.zig.CrossTarget;

-// Zig Version: 0.11.0-dev.3986+e05c242cd
-pub fn build(b: *std.build.Builder) void {
-    const target = b.standardTargetOptions(.{});
-    const optimize = b.standardOptimizeOption(.{});
+const Maker = struct {
+    builder: *std.build.Builder,
+    target: CrossTarget,
+    optimize: Mode,
+    config_header: *ConfigHeader,
+    enable_lto: bool,

-    const config_header = b.addConfigHeader(
-        .{ .style = .blank, .include_path = "build-info.h" },
-        .{
-            .BUILD_NUMBER = 0,
-            .BUILD_COMMIT = commit_hash[0 .. commit_hash.len - 1], // omit newline
-        },
-    );
+    include_dirs: ArrayList([]const u8),
+    cflags: ArrayList([]const u8),
+    cxxflags: ArrayList([]const u8),
+    objs: ArrayList(*Compile),

-    const lib = b.addStaticLibrary(.{
-        .name = "llama",
-        .target = target,
-        .optimize = optimize,
-    });
-    lib.linkLibC();
-    lib.linkLibCpp();
-    lib.addIncludePath(".");
-    lib.addIncludePath("./examples");
-    lib.addConfigHeader(config_header);
-    lib.addCSourceFiles(&.{"ggml.c"}, &.{"-std=c11"});
-    lib.addCSourceFiles(&.{"llama.cpp"}, &.{"-std=c++11"});
-    b.installArtifact(lib);
+    fn addInclude(m: *Maker, dir: []const u8) !void {
+        try m.include_dirs.append(dir);
+    }
+    fn addProjectInclude(m: *Maker, path: []const []const u8) !void {
+        try m.addInclude(try m.builder.build_root.join(m.builder.allocator, path));
+    }
+    fn addCFlag(m: *Maker, flag: []const u8) !void {
+        try m.cflags.append(flag);
+    }
+    fn addCxxFlag(m: *Maker, flag: []const u8) !void {
+        try m.cxxflags.append(flag);
+    }
+    fn addFlag(m: *Maker, flag: []const u8) !void {
+        try m.addCFlag(flag);
+        try m.addCxxFlag(flag);
+    }

-    const examples = .{
-        "main",
-        "baby-llama",
-        "embedding",
-        "metal",
-        "perplexity",
-        "quantize",
-        "quantize-stats",
-        "save-load-state",
-        "server",
-        "simple",
-        "train-text-from-scratch",
-    };
+    fn init(builder: *std.build.Builder) !Maker {
+        const commit_hash = @embedFile(".git/refs/heads/master");
+        const config_header = builder.addConfigHeader(
+            .{ .style = .blank, .include_path = "build-info.h" },
+            .{
+                .BUILD_NUMBER = 0,
+                .BUILD_COMMIT = commit_hash[0 .. commit_hash.len - 1], // omit newline
+            },
+        );
+        var m = Maker{
+            .builder = builder,
+            .target = builder.standardTargetOptions(.{}),
+            .optimize = builder.standardOptimizeOption(.{}),
+            .config_header = config_header,
+            .enable_lto = false,
+            .include_dirs = ArrayList([]const u8).init(builder.allocator),
+            .cflags = ArrayList([]const u8).init(builder.allocator),
+            .cxxflags = ArrayList([]const u8).init(builder.allocator),
+            .objs = ArrayList(*Compile).init(builder.allocator),
+        };
+        try m.addCFlag("-std=c11");
+        try m.addCxxFlag("-std=c++11");
+        try m.addProjectInclude(&.{});
+        try m.addProjectInclude(&.{"examples"});
+        return m;
+    }

-    inline for (examples) |example_name| {
-        const exe = b.addExecutable(.{
-            .name = example_name,
-            .target = target,
-            .optimize = optimize,
-        });
-        exe.addIncludePath(".");
-        exe.addIncludePath("./examples");
-        exe.addConfigHeader(config_header);
-        exe.addCSourceFiles(&.{
-            std.fmt.comptimePrint("examples/{s}/{s}.cpp", .{ example_name, example_name }),
-            "examples/common.cpp",
-        }, &.{"-std=c++11"});
-        exe.linkLibrary(lib);
-        b.installArtifact(exe);
+    fn obj(m: *const Maker, name: []const u8, src: []const u8) *Compile {
+        const o = m.builder.addObject(.{ .name = name, .target = m.target, .optimize = m.optimize });
+        if (std.mem.endsWith(u8, src, ".c")) {
+            o.addCSourceFiles(&.{src}, m.cflags.items);
+            o.linkLibC();
+        } else {
+            o.addCSourceFiles(&.{src}, m.cxxflags.items);
+            o.linkLibCpp();
+        }
+        for (m.include_dirs.items) |i| o.addIncludePath(.{ .path = i });
+        o.want_lto = m.enable_lto;
+        return o;
+    }

-        const run_cmd = b.addRunArtifact(exe);
-        run_cmd.step.dependOn(b.getInstallStep());
-        if (b.args) |args| run_cmd.addArgs(args);
+    fn exe(m: *const Maker, name: []const u8, src: []const u8, deps: []const *Compile) *Compile {
+        const e = m.builder.addExecutable(.{ .name = name, .target = m.target, .optimize = m.optimize });
+        e.addCSourceFiles(&.{src}, m.cxxflags.items);
+        for (deps) |d| e.addObject(d);
+        for (m.objs.items) |o| e.addObject(o);
+        for (m.include_dirs.items) |i| e.addIncludePath(.{ .path = i });
+        e.linkLibC();
+        e.linkLibCpp();
+        e.addConfigHeader(m.config_header);
+        m.builder.installArtifact(e);
+        e.want_lto = m.enable_lto;
+        return e;
+    }
+};

-        const run_step = b.step("run-" ++ example_name, "Run the app");
-        run_step.dependOn(&run_cmd.step);
+pub fn build(b: *std.build.Builder) !void {
+    var make = try Maker.init(b);
+    make.enable_lto = b.option(bool, "lto", "Enable LTO optimization, (default: false)") orelse false;
+
+    if (b.option(bool, "k-quants", "Enable K-quants, (default: true)") orelse true) {
+        try make.addFlag("-DGGML_USE_K_QUANTS");
+        const k_quants = make.obj("k_quants", "k_quants.c");
+        try make.objs.append(k_quants);
+    }
+
+    const ggml = make.obj("ggml", "ggml.c");
+    const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c");
+    const llama = make.obj("llama", "llama.cpp");
+    const common = make.obj("common", "examples/common.cpp");
+    const console = make.obj("common", "examples/console.cpp");
+    const grammar_parser = make.obj("grammar-parser", "examples/grammar-parser.cpp");
+
+    _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, llama, common, console, grammar_parser });
+    _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, llama });
+    _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, llama, common });
+    _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, llama, common });
+    _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, llama });
+
+    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, llama, common, grammar_parser });
+    if (server.target.isWindows()) {
+        server.linkSystemLibrary("ws2_32");
    }
 }
--- a/ci/README.md
+++ b/ci/README.md
@@ -0,0 +1,25 @@
+# CI
+
+In addition to [Github Actions](https://github.com/ggerganov/llama.cpp/actions) `llama.cpp` uses a custom CI framework:
+
+https://github.com/ggml-org/ci
+
+It monitors the `master` branch for new commits and runs the
+[ci/run.sh](https://github.com/ggerganov/llama.cpp/blob/master/ci/run.sh) script on dedicated cloud instances. This allows us
+to execute heavier workloads compared to just using Github Actions. Also with time, the cloud instances will be scaled
+to cover various hardware architectures, including GPU and Apple Silicon instances.
+
+Collaborators can optionally trigger the CI run by adding the `ggml-ci` keyword to their commit message.
+Only the branches of this repo are monitored for this keyword.
+
+It is a good practice, before publishing changes to execute the full CI locally on your machine:
+
+```bash
+mkdir tmp
+
+# CPU-only build
+bash ./ci/run.sh ./tmp/results ./tmp/mnt
+
+# with CUDA support
+GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
+```
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -0,0 +1,409 @@
+#/bin/bash
+#
+# sample usage:
+#
+# mkdir tmp
+#
+# # CPU-only build
+# bash ./ci/run.sh ./tmp/results ./tmp/mnt
+#
+# # with CUDA support
+# GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
+#
+
+if [ -z "$2" ]; then
+    echo "usage: $0 <output-dir> <mnt-dir>"
+    exit 1
+fi
+
+mkdir -p "$1"
+mkdir -p "$2"
+
+OUT=$(realpath "$1")
+MNT=$(realpath "$2")
+
+rm -v $OUT/*.log
+rm -v $OUT/*.exit
+rm -v $OUT/*.md
+
+sd=`dirname $0`
+cd $sd/../
+SRC=`pwd`
+
+## helpers
+
+# download a file if it does not exist or if it is outdated
+function gg_wget {
+    local out=$1
+    local url=$2
+
+    local cwd=`pwd`
+
+    mkdir -p $out
+    cd $out
+
+    # should not re-download if file is the same
+    wget -nv -N $url
+
+    cd $cwd
+}
+
+function gg_printf {
+    printf -- "$@" >> $OUT/README.md
+}
+
+function gg_run {
+    ci=$1
+
+    set -o pipefail
+    set -x
+
+    gg_run_$ci | tee $OUT/$ci.log
+    cur=$?
+    echo "$cur" > $OUT/$ci.exit
+
+    set +x
+    set +o pipefail
+
+    gg_sum_$ci
+
+    ret=$((ret | cur))
+}
+
+## ci
+
+# ctest_debug
+
+function gg_run_ctest_debug {
+    cd ${SRC}
+
+    rm -rf build-ci-debug && mkdir build-ci-debug && cd build-ci-debug
+
+    set -e
+
+    (time cmake -DCMAKE_BUILD_TYPE=Debug ..     ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time make -j                               ) 2>&1 | tee -a $OUT/${ci}-make.log
+
+    (time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
+
+    set +e
+}
+
+function gg_sum_ctest_debug {
+    gg_printf '### %s\n\n' "${ci}"
+
+    gg_printf 'Runs ctest in debug mode\n'
+    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
+    gg_printf '```\n'
+    gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)"
+    gg_printf '```\n'
+    gg_printf '\n'
+}
+
+# ctest_release
+
+function gg_run_ctest_release {
+    cd ${SRC}
+
+    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
+
+    set -e
+
+    (time cmake -DCMAKE_BUILD_TYPE=Release ..   ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time make -j                               ) 2>&1 | tee -a $OUT/${ci}-make.log
+
+    if [ -z ${GG_BUILD_LOW_PERF} ]; then
+        (time ctest --output-on-failure ) 2>&1 | tee -a $OUT/${ci}-ctest.log
+    else
+        (time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
+    fi
+
+    set +e
+}
+
+function gg_sum_ctest_release {
+    gg_printf '### %s\n\n' "${ci}"
+
+    gg_printf 'Runs ctest in release mode\n'
+    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
+    gg_printf '```\n'
+    gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)"
+    gg_printf '```\n'
+}
+
+# open_llama_3b_v2
+
+function gg_run_open_llama_3b_v2 {
+    cd ${SRC}
+
+    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/config.json
+    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/tokenizer.model
+    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/tokenizer_config.json
+    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/special_tokens_map.json
+    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/pytorch_model.bin
+    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/generation_config.json
+
+    gg_wget models-mnt/wikitext/ https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip
+    unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
+    head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw
+
+    path_models="../models-mnt/open-llama/3B-v2"
+    path_wiki="../models-mnt/wikitext/wikitext-2-raw"
+
+    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
+
+    set -e
+
+    (time cmake -DCMAKE_BUILD_TYPE=Release -DLLAMA_QKK_64=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time make -j                                              ) 2>&1 | tee -a $OUT/${ci}-make.log
+
+    python3 ../convert.py ${path_models}
+
+    model_f16="${path_models}/ggml-model-f16.gguf"
+    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
+    model_q4_0="${path_models}/ggml-model-q4_0.gguf"
+    model_q4_1="${path_models}/ggml-model-q4_1.gguf"
+    model_q5_0="${path_models}/ggml-model-q5_0.gguf"
+    model_q5_1="${path_models}/ggml-model-q5_1.gguf"
+    model_q2_k="${path_models}/ggml-model-q2_k.gguf"
+    model_q3_k="${path_models}/ggml-model-q3_k.gguf"
+    model_q4_k="${path_models}/ggml-model-q4_k.gguf"
+    model_q5_k="${path_models}/ggml-model-q5_k.gguf"
+    model_q6_k="${path_models}/ggml-model-q6_k.gguf"
+
+    wiki_test_60="${path_wiki}/wiki.test-60.raw"
+
+    ./bin/quantize ${model_f16} ${model_q8_0} q8_0
+    ./bin/quantize ${model_f16} ${model_q4_0} q4_0
+    ./bin/quantize ${model_f16} ${model_q4_1} q4_1
+    ./bin/quantize ${model_f16} ${model_q5_0} q5_0
+    ./bin/quantize ${model_f16} ${model_q5_1} q5_1
+    ./bin/quantize ${model_f16} ${model_q2_k} q2_k
+    ./bin/quantize ${model_f16} ${model_q3_k} q3_k
+    ./bin/quantize ${model_f16} ${model_q4_k} q4_k
+    ./bin/quantize ${model_f16} ${model_q5_k} q5_k
+    ./bin/quantize ${model_f16} ${model_q6_k} q6_k
+
+    (time ./bin/main --model ${model_f16}  -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/main --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/main --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/main --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/main --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/main --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/main --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/main --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/main --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/main --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/main --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+
+    (time ./bin/perplexity --model ${model_f16}  -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+
+    function check_ppl {
+        qnt="$1"
+        ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
+
+        if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
+            printf '  - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
+            return 20
+        fi
+
+        printf '  - %s @ %s OK\n' "$qnt" "$ppl"
+        return 0
+    }
+
+    check_ppl "f16"  "$(cat $OUT/${ci}-tg-f16.log  | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+
+    set +e
+}
+
+function gg_sum_open_llama_3b_v2 {
+    gg_printf '### %s\n\n' "${ci}"
+
+    gg_printf 'OpenLLaMA 3B-v2:\n'
+    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
+    gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
+    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
+    gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
+    gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
+    gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
+    gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
+    gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
+    gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)"
+    gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
+    gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
+    gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
+    gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
+}
+
+# open_llama_7b_v2
+# requires: GG_BUILD_CUDA
+
+function gg_run_open_llama_7b_v2 {
+    cd ${SRC}
+
+    gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/config.json
+    gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/tokenizer.model
+    gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/tokenizer_config.json
+    gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/special_tokens_map.json
+    gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/pytorch_model.bin.index.json
+    gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/pytorch_model-00001-of-00002.bin
+    gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/pytorch_model-00002-of-00002.bin
+    gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/generation_config.json
+
+    gg_wget models-mnt/wikitext/ https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip
+    unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
+
+    path_models="../models-mnt/open-llama/7B-v2"
+    path_wiki="../models-mnt/wikitext/wikitext-2-raw"
+
+    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
+
+    set -e
+
+    (time cmake -DCMAKE_BUILD_TYPE=Release -DLLAMA_CUBLAS=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time make -j                                              ) 2>&1 | tee -a $OUT/${ci}-make.log
+
+    python3 ../convert.py ${path_models}
+
+    model_f16="${path_models}/ggml-model-f16.gguf"
+    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
+    model_q4_0="${path_models}/ggml-model-q4_0.gguf"
+    model_q4_1="${path_models}/ggml-model-q4_1.gguf"
+    model_q5_0="${path_models}/ggml-model-q5_0.gguf"
+    model_q5_1="${path_models}/ggml-model-q5_1.gguf"
+    model_q2_k="${path_models}/ggml-model-q2_k.gguf"
+    model_q3_k="${path_models}/ggml-model-q3_k.gguf"
+    model_q4_k="${path_models}/ggml-model-q4_k.gguf"
+    model_q5_k="${path_models}/ggml-model-q5_k.gguf"
+    model_q6_k="${path_models}/ggml-model-q6_k.gguf"
+
+    wiki_test="${path_wiki}/wiki.test.raw"
+
+    ./bin/quantize ${model_f16} ${model_q8_0} q8_0
+    ./bin/quantize ${model_f16} ${model_q4_0} q4_0
+    ./bin/quantize ${model_f16} ${model_q4_1} q4_1
+    ./bin/quantize ${model_f16} ${model_q5_0} q5_0
+    ./bin/quantize ${model_f16} ${model_q5_1} q5_1
+    ./bin/quantize ${model_f16} ${model_q2_k} q2_k
+    ./bin/quantize ${model_f16} ${model_q3_k} q3_k
+    ./bin/quantize ${model_f16} ${model_q4_k} q4_k
+    ./bin/quantize ${model_f16} ${model_q5_k} q5_k
+    ./bin/quantize ${model_f16} ${model_q6_k} q6_k
+
+    (time ./bin/main --model ${model_f16}  -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/main --model ${model_q8_0} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/main --model ${model_q4_0} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/main --model ${model_q4_1} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/main --model ${model_q5_0} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/main --model ${model_q5_1} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/main --model ${model_q2_k} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/main --model ${model_q3_k} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/main --model ${model_q4_k} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/main --model ${model_q5_k} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/main --model ${model_q6_k} -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+
+    (time ./bin/perplexity --model ${model_f16}  -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+    (time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+    (time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+    (time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+    (time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+    (time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+    (time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+    (time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+    (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+    (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+
+    function check_ppl {
+        qnt="$1"
+        ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
+
+        if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
+            printf '  - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
+            return 20
+        fi
+
+        printf '  - %s @ %s OK\n' "$qnt" "$ppl"
+        return 0
+    }
+
+    check_ppl "f16"  "$(cat $OUT/${ci}-tg-f16.log  | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+    check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+
+    set +e
+}
+
+function gg_sum_open_llama_7b_v2 {
+    gg_printf '### %s\n\n' "${ci}"
+
+    gg_printf 'OpenLLaMA 7B-v2:\n'
+    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
+    gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
+    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
+    gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
+    gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
+    gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
+    gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
+    gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
+    gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)"
+    gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
+    gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
+    gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
+    gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
+}
+
+## main
+
+if [ -z ${GG_BUILD_LOW_PERF} ]; then
+    rm -rf ${SRC}/models-mnt
+
+    mnt_models=${MNT}/models
+    mkdir -p ${mnt_models}
+    ln -sfn ${mnt_models} ${SRC}/models-mnt
+
+    python3 -m pip install -r ${SRC}/requirements.txt
+fi
+
+ret=0
+
+test $ret -eq 0 && gg_run ctest_debug
+test $ret -eq 0 && gg_run ctest_release
+
+if [ -z ${GG_BUILD_LOW_PERF} ]; then
+    if [ -z ${GG_BUILD_CUDA} ]; then
+        test $ret -eq 0 && gg_run open_llama_3b_v2
+    else
+        test $ret -eq 0 && gg_run open_llama_7b_v2
+    fi
+fi
+
+exit $ret
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -0,0 +1,20 @@
+# common
+
+set(TARGET common)
+
+add_library(${TARGET} OBJECT
+    common.h
+    common.cpp
+    console.h
+    console.cpp
+    grammar-parser.h
+    grammar-parser.cpp
+    )
+
+if (BUILD_SHARED_LIBS)
+    set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
+endif()
+
+target_include_directories(${TARGET} PUBLIC .)
+target_compile_features(${TARGET} PUBLIC cxx_std_11)
+target_link_libraries(${TARGET} PRIVATE llama)
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -25,7 +25,6 @@
 #else
 #include <sys/ioctl.h>
 #include <unistd.h>
-#include <wchar.h>
 #endif

 #if defined(_MSC_VER)
@@ -117,6 +116,9 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                break;
            }
            params.n_threads = std::stoi(argv[i]);
+            if (params.n_threads <= 0) {
+                params.n_threads = std::thread::hardware_concurrency();
+            }
        } else if (arg == "-p" || arg == "--prompt") {
            if (++i >= argc) {
                invalid_param = true;
@@ -180,6 +182,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                break;
            }
            params.rope_freq_scale = std::stof(argv[i]);
+        } else if (arg == "--rope-scale") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.rope_freq_scale = 1.0f/std::stof(argv[i]);
        } else if (arg == "--memory-f32") {
            params.memory_f16 = false;
        } else if (arg == "--top-p") {
@@ -254,31 +262,45 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                break;
            }
            params.cfg_negative_prompt = argv[i];
+        } else if (arg == "--cfg-negative-prompt-file") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            std::ifstream file(argv[i]);
+            if (!file) {
+                fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
+                invalid_param = true;
+                break;
+            }
+            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.cfg_negative_prompt));
+            if (params.cfg_negative_prompt.back() == '\n') {
+                params.cfg_negative_prompt.pop_back();
+            }
        } else if (arg == "--cfg-scale") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.cfg_scale = std::stof(argv[i]);
-        } else if (arg == "--cfg-smooth-factor") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.cfg_smooth_factor = std::stof(argv[i]);
        } else if (arg == "-b" || arg == "--batch-size") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.n_batch = std::stoi(argv[i]);
-            params.n_batch = std::min(512, params.n_batch);
        } else if (arg == "--keep") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.n_keep = std::stoi(argv[i]);
+        } else if (arg == "--chunks") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.n_chunks = std::stoi(argv[i]);
        } else if (arg == "-m" || arg == "--model") {
            if (++i >= argc) {
                invalid_param = true;
@@ -314,6 +336,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
            params.instruct = true;
        } else if (arg == "--multiline-input") {
            params.multiline_input = true;
+        } else if (arg == "--simple-io") {
+            params.simple_io = true;
        } else if (arg == "--color") {
            params.use_color = true;
        } else if (arg == "--mlock") {
@@ -327,24 +351,24 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
            params.n_gpu_layers = std::stoi(argv[i]);
 #else
            fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
-            fprintf(stderr, "warning: see main README.md for information on enabling GPU support\n");
+            fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
 #endif
        } else if (arg == "--main-gpu" || arg == "-mg") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-#ifdef GGML_USE_CUDA
+#ifdef GGML_USE_CUBLAS
            params.main_gpu = std::stoi(argv[i]);
 #else
-      fprintf(stderr, "warning: llama.cpp was compiled without CUDA. It is not possible to set a main GPU.\n");
+            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.\n");
 #endif
        } else if (arg == "--tensor-split" || arg == "-ts") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-#ifdef GGML_USE_CUDA
+#ifdef GGML_USE_CUBLAS
            std::string arg_next = argv[i];

            // split string by , and /
@@ -361,14 +385,20 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                }
            }
 #else
-      fprintf(stderr, "warning: llama.cpp was compiled without CUDA. It is not possible to set a tensor split.\n");
-#endif // GGML_USE_CUDA
+            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
+#endif // GGML_USE_CUBLAS
+        } else if (arg == "--no-mul-mat-q" || arg == "-nommq") {
+#ifdef GGML_USE_CUBLAS
+            params.mul_mat_q = false;
+#else
+            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n");
+#endif // GGML_USE_CUBLAS
        } else if (arg == "--low-vram" || arg == "-lv") {
-#ifdef GGML_USE_CUDA
+#ifdef GGML_USE_CUBLAS
            params.low_vram = true;
 #else
-      fprintf(stderr, "warning: llama.cpp was compiled without CUDA. It is not possible to set lower vram usage.\n");
-#endif // GGML_USE_CUDA
+            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n");
+#endif // GGML_USE_CUBLAS
        } else if (arg == "--no-mmap") {
            params.use_mmap = false;
        } else if (arg == "--mtest") {
@@ -387,8 +417,16 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
            params.antiprompt.push_back(argv[i]);
        } else if (arg == "--perplexity") {
            params.perplexity = true;
+        } else if (arg == "--hellaswag") {
+            params.hellaswag = true;
+        } else if (arg == "--hellaswag-tasks") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.hellaswag_tasks = std::stoi(argv[i]);
        } else if (arg == "--ignore-eos") {
-            params.logit_bias[llama_token_eos()] = -INFINITY;
+            params.ignore_eos = true;
        } else if (arg == "--no-penalize-nl") {
            params.penalize_nl = false;
        } else if (arg == "-l" || arg == "--logit-bias") {
@@ -415,6 +453,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
            exit(0);
        } else if (arg == "--random-prompt") {
            params.random_prompt = true;
+        } else if (arg == "--in-prefix-bos") {
+            params.input_prefix_bos = true;
        } else if (arg == "--in-prefix") {
            if (++i >= argc) {
                invalid_param = true;
@@ -427,6 +467,28 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                break;
            }
            params.input_suffix = argv[i];
+        } else if (arg == "--grammar") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.grammar = argv[i];
+        } else if (arg == "--grammar-file") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            std::ifstream file(argv[i]);
+            if (!file) {
+                fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
+                invalid_param = true;
+                break;
+            }
+            std::copy(
+                std::istreambuf_iterator<char>(file),
+                std::istreambuf_iterator<char>(),
+                std::back_inserter(params.grammar)
+            );
        } else {
            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
            gpt_print_usage(argc, argv, default_params);
@@ -456,90 +518,102 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
 }

 void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
-    fprintf(stderr, "usage: %s [options]\n", argv[0]);
-    fprintf(stderr, "\n");
-    fprintf(stderr, "options:\n");
-    fprintf(stderr, "  -h, --help            show this help message and exit\n");
-    fprintf(stderr, "  -i, --interactive     run in interactive mode\n");
-    fprintf(stderr, "  --interactive-first   run in interactive mode and wait for input right away\n");
-    fprintf(stderr, "  -ins, --instruct      run in instruction mode (use with Alpaca models)\n");
-    fprintf(stderr, "  --multiline-input     allows you to write or paste multiple lines without ending each in '\\'\n");
-    fprintf(stderr, "  -r PROMPT, --reverse-prompt PROMPT\n");
-    fprintf(stderr, "                        halt generation at PROMPT, return control in interactive mode\n");
-    fprintf(stderr, "                        (can be specified more than once for multiple prompts).\n");
-    fprintf(stderr, "  --color               colorise output to distinguish prompt and user input from generations\n");
-    fprintf(stderr, "  -s SEED, --seed SEED  RNG seed (default: -1, use random seed for < 0)\n");
-    fprintf(stderr, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
-    fprintf(stderr, "  -p PROMPT, --prompt PROMPT\n");
-    fprintf(stderr, "                        prompt to start generation with (default: empty)\n");
-    fprintf(stderr, "  -e                    process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n");
-    fprintf(stderr, "  --prompt-cache FNAME  file to cache prompt state for faster startup (default: none)\n");
-    fprintf(stderr, "  --prompt-cache-all    if specified, saves user input and generations to cache as well.\n");
-    fprintf(stderr, "                        not supported with --interactive or other interactive options\n");
-    fprintf(stderr, "  --prompt-cache-ro     if specified, uses the prompt cache but does not update it.\n");
-    fprintf(stderr, "  --random-prompt       start with a randomized prompt.\n");
-    fprintf(stderr, "  --in-prefix STRING    string to prefix user inputs with (default: empty)\n");
-    fprintf(stderr, "  --in-suffix STRING    string to suffix after user inputs with (default: empty)\n");
-    fprintf(stderr, "  -f FNAME, --file FNAME\n");
-    fprintf(stderr, "                        prompt file to start generation.\n");
-    fprintf(stderr, "  -n N, --n-predict N   number of tokens to predict (default: %d, -1 = infinity)\n", params.n_predict);
-    fprintf(stderr, "  --top-k N             top-k sampling (default: %d, 0 = disabled)\n", params.top_k);
-    fprintf(stderr, "  --top-p N             top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p);
-    fprintf(stderr, "  --tfs N               tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z);
-    fprintf(stderr, "  --typical N           locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)params.typical_p);
-    fprintf(stderr, "  --repeat-last-n N     last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", params.repeat_last_n);
-    fprintf(stderr, "  --repeat-penalty N    penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)params.repeat_penalty);
-    fprintf(stderr, "  --presence-penalty N  repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)params.presence_penalty);
-    fprintf(stderr, "  --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)params.frequency_penalty);
-    fprintf(stderr, "  --mirostat N          use Mirostat sampling.\n");
-    fprintf(stderr, "                        Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n");
-    fprintf(stderr, "                        (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", params.mirostat);
-    fprintf(stderr, "  --mirostat-lr N       Mirostat learning rate, parameter eta (default: %.1f)\n", (double)params.mirostat_eta);
-    fprintf(stderr, "  --mirostat-ent N      Mirostat target entropy, parameter tau (default: %.1f)\n", (double)params.mirostat_tau);
-    fprintf(stderr, "  -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n");
-    fprintf(stderr, "                        modifies the likelihood of token appearing in the completion,\n");
-    fprintf(stderr, "                        i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n");
-    fprintf(stderr, "                        or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n");
-    fprintf(stderr, "  --cfg-negative-prompt PROMPT \n");
-    fprintf(stderr, "                        negative prompt to use for guidance. (default: empty)\n");
-    fprintf(stderr, "  --cfg-scale N         strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
-    fprintf(stderr, "  --cfg-smooth-factor N smooth factor between old and new logits (default: %f, 1.0 = no smoothing)\n", params.cfg_smooth_factor);
-    fprintf(stderr, "  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
-    fprintf(stderr, "  --rope-freq-base N    RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
-    fprintf(stderr, "  --rope-freq-scale N   RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
-    fprintf(stderr, "  --ignore-eos          ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
-    fprintf(stderr, "  --no-penalize-nl      do not penalize newline token\n");
-    fprintf(stderr, "  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
-    fprintf(stderr, "                        not recommended: doubles context memory required and no measurable increase in quality\n");
-    fprintf(stderr, "  --temp N              temperature (default: %.1f)\n", (double)params.temp);
-    fprintf(stderr, "  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
-    fprintf(stderr, "  --perplexity          compute perplexity over the prompt\n");
-    fprintf(stderr, "  --keep                number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
+    fprintf(stdout, "usage: %s [options]\n", argv[0]);
+    fprintf(stdout, "\n");
+    fprintf(stdout, "options:\n");
+    fprintf(stdout, "  -h, --help            show this help message and exit\n");
+    fprintf(stdout, "  -i, --interactive     run in interactive mode\n");
+    fprintf(stdout, "  --interactive-first   run in interactive mode and wait for input right away\n");
+    fprintf(stdout, "  -ins, --instruct      run in instruction mode (use with Alpaca models)\n");
+    fprintf(stdout, "  --multiline-input     allows you to write or paste multiple lines without ending each in '\\'\n");
+    fprintf(stdout, "  -r PROMPT, --reverse-prompt PROMPT\n");
+    fprintf(stdout, "                        halt generation at PROMPT, return control in interactive mode\n");
+    fprintf(stdout, "                        (can be specified more than once for multiple prompts).\n");
+    fprintf(stdout, "  --color               colorise output to distinguish prompt and user input from generations\n");
+    fprintf(stdout, "  -s SEED, --seed SEED  RNG seed (default: -1, use random seed for < 0)\n");
+    fprintf(stdout, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
+    fprintf(stdout, "  -p PROMPT, --prompt PROMPT\n");
+    fprintf(stdout, "                        prompt to start generation with (default: empty)\n");
+    fprintf(stdout, "  -e                    process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n");
+    fprintf(stdout, "  --prompt-cache FNAME  file to cache prompt state for faster startup (default: none)\n");
+    fprintf(stdout, "  --prompt-cache-all    if specified, saves user input and generations to cache as well.\n");
+    fprintf(stdout, "                        not supported with --interactive or other interactive options\n");
+    fprintf(stdout, "  --prompt-cache-ro     if specified, uses the prompt cache but does not update it.\n");
+    fprintf(stdout, "  --random-prompt       start with a randomized prompt.\n");
+    fprintf(stdout, "  --in-prefix-bos       prefix BOS to user inputs, preceding the `--in-prefix` string\n");
+    fprintf(stdout, "  --in-prefix STRING    string to prefix user inputs with (default: empty)\n");
+    fprintf(stdout, "  --in-suffix STRING    string to suffix after user inputs with (default: empty)\n");
+    fprintf(stdout, "  -f FNAME, --file FNAME\n");
+    fprintf(stdout, "                        prompt file to start generation.\n");
+    fprintf(stdout, "  -n N, --n-predict N   number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
+    fprintf(stdout, "  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
+    fprintf(stdout, "  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
+    fprintf(stdout, "  --top-k N             top-k sampling (default: %d, 0 = disabled)\n", params.top_k);
+    fprintf(stdout, "  --top-p N             top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p);
+    fprintf(stdout, "  --tfs N               tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z);
+    fprintf(stdout, "  --typical N           locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)params.typical_p);
+    fprintf(stdout, "  --repeat-last-n N     last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", params.repeat_last_n);
+    fprintf(stdout, "  --repeat-penalty N    penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)params.repeat_penalty);
+    fprintf(stdout, "  --presence-penalty N  repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)params.presence_penalty);
+    fprintf(stdout, "  --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)params.frequency_penalty);
+    fprintf(stdout, "  --mirostat N          use Mirostat sampling.\n");
+    fprintf(stdout, "                        Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n");
+    fprintf(stdout, "                        (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", params.mirostat);
+    fprintf(stdout, "  --mirostat-lr N       Mirostat learning rate, parameter eta (default: %.1f)\n", (double)params.mirostat_eta);
+    fprintf(stdout, "  --mirostat-ent N      Mirostat target entropy, parameter tau (default: %.1f)\n", (double)params.mirostat_tau);
+    fprintf(stdout, "  -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n");
+    fprintf(stdout, "                        modifies the likelihood of token appearing in the completion,\n");
+    fprintf(stdout, "                        i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n");
+    fprintf(stdout, "                        or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n");
+    fprintf(stdout, "  --grammar GRAMMAR     BNF-like grammar to constrain generations (see samples in grammars/ dir)\n");
+    fprintf(stdout, "  --grammar-file FNAME  file to read grammar from\n");
+    fprintf(stdout, "  --cfg-negative-prompt PROMPT\n");
+    fprintf(stdout, "                        negative prompt to use for guidance. (default: empty)\n");
+    fprintf(stdout, "  --cfg-negative-prompt-file FNAME\n");
+    fprintf(stdout, "                        negative prompt file to use for guidance. (default: empty)\n");
+    fprintf(stdout, "  --cfg-scale N         strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
+    fprintf(stdout, "  --rope-scale N        RoPE context linear scaling factor, inverse of --rope-freq-scale (default: %g)\n", 1.0f/params.rope_freq_scale);
+    fprintf(stdout, "  --rope-freq-base N    RoPE base frequency, used by NTK-aware scaling (default: %.1f)\n", params.rope_freq_base);
+    fprintf(stdout, "  --rope-freq-scale N   RoPE frequency linear scaling factor, inverse of --rope-scale (default: %g)\n", params.rope_freq_scale);
+    fprintf(stdout, "  --ignore-eos          ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
+    fprintf(stdout, "  --no-penalize-nl      do not penalize newline token\n");
+    fprintf(stdout, "  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
+    fprintf(stdout, "                        not recommended: doubles context memory required and no measurable increase in quality\n");
+    fprintf(stdout, "  --temp N              temperature (default: %.1f)\n", (double)params.temp);
+    fprintf(stdout, "  --perplexity          compute perplexity over each ctx window of the prompt\n");
+    fprintf(stdout, "  --hellaswag           compute HellaSwag score over random tasks from datafile supplied with -f\n");
+    fprintf(stdout, "  --hellaswag-tasks N   number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
+    fprintf(stdout, "  --keep N              number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
+    fprintf(stdout, "  --chunks N            max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
    if (llama_mlock_supported()) {
-        fprintf(stderr, "  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
+        fprintf(stdout, "  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
    }
    if (llama_mmap_supported()) {
-        fprintf(stderr, "  --no-mmap             do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
+        fprintf(stdout, "  --no-mmap             do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
    }
-    fprintf(stderr, "  --numa                attempt optimizations that help on some NUMA systems\n");
-    fprintf(stderr, "                        if run without this previously, it is recommended to drop the system page cache before using this\n");
-    fprintf(stderr, "                        see https://github.com/ggerganov/llama.cpp/issues/1437\n");
+    fprintf(stdout, "  --numa                attempt optimizations that help on some NUMA systems\n");
+    fprintf(stdout, "                        if run without this previously, it is recommended to drop the system page cache before using this\n");
+    fprintf(stdout, "                        see https://github.com/ggerganov/llama.cpp/issues/1437\n");
 #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
-    fprintf(stderr, "  -ngl N, --n-gpu-layers N\n");
-    fprintf(stderr, "                        number of layers to store in VRAM\n");
-    fprintf(stderr, "  -ts SPLIT --tensor-split SPLIT\n");
-    fprintf(stderr, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
-    fprintf(stderr, "  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n" );
-    fprintf(stderr, "  -lv, --low-vram       don't allocate VRAM scratch buffer\n" );
+    fprintf(stdout, "  -ngl N, --n-gpu-layers N\n");
+    fprintf(stdout, "                        number of layers to store in VRAM\n");
+    fprintf(stdout, "  -ts SPLIT --tensor-split SPLIT\n");
+    fprintf(stdout, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
+    fprintf(stdout, "  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n");
+    fprintf(stdout, "  -lv, --low-vram       don't allocate VRAM scratch buffer\n");
+    fprintf(stdout, "  -nommq, --no-mul-mat-q\n");
+    fprintf(stdout, "                        use cuBLAS instead of custom mul_mat_q CUDA kernels.\n");
+    fprintf(stdout, "                        Not recommended since this is both slower and uses more VRAM.\n");
 #endif
-    fprintf(stderr, "  --mtest               compute maximum memory usage\n");
-    fprintf(stderr, "  --export              export the computation graph to 'llama.ggml'\n");
-    fprintf(stderr, "  --verbose-prompt      print prompt before generation\n");
-    fprintf(stderr, "  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");
-    fprintf(stderr, "  --lora-base FNAME     optional model to use as a base for the layers modified by the LoRA adapter\n");
-    fprintf(stderr, "  -m FNAME, --model FNAME\n");
-    fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
-    fprintf(stderr, "\n");
+    fprintf(stdout, "  --mtest               compute maximum memory usage\n");
+    fprintf(stdout, "  --export              export the computation graph to 'llama.ggml'\n");
+    fprintf(stdout, "  --verbose-prompt      print prompt before generation\n");
+    fprintf(stderr, "  --simple-io           use basic IO for better compatibility in subprocesses and limited consoles\n");
+    fprintf(stdout, "  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");
+    fprintf(stdout, "  --lora-base FNAME     optional model to use as a base for the layers modified by the LoRA adapter\n");
+    fprintf(stdout, "  -m FNAME, --model FNAME\n");
+    fprintf(stdout, "                        model path (default: %s)\n", params.model.c_str());
+    fprintf(stdout, "\n");
 }

 std::string gpt_random_prompt(std::mt19937 & rng) {
@@ -561,39 +635,33 @@ std::string gpt_random_prompt(std::mt19937 & rng) {
    return "The";
 }

-// TODO: not great allocating this every time
-std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
-    // initialize to prompt numer of chars, since n_tokens <= n_prompt_chars
-    std::vector<llama_token> res(text.size() + (int) add_bos);
-    const int n = llama_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos);
-    assert(n >= 0);
-    res.resize(n);
-
-    return res;
-}
+//
+// Model utils
+//

 struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
    auto lparams = llama_context_default_params();

-    lparams.n_ctx        = params.n_ctx;
-    lparams.n_batch      = params.n_batch;
-    lparams.n_gpu_layers = params.n_gpu_layers;
-    lparams.main_gpu     = params.main_gpu;
-    memcpy(lparams.tensor_split, params.tensor_split, LLAMA_MAX_DEVICES*sizeof(float));
-    lparams.low_vram     = params.low_vram;
-    lparams.seed         = params.seed;
-    lparams.f16_kv       = params.memory_f16;
-    lparams.use_mmap     = params.use_mmap;
-    lparams.use_mlock    = params.use_mlock;
-    lparams.logits_all   = params.perplexity;
-    lparams.embedding    = params.embedding;
+    lparams.n_ctx           = params.n_ctx;
+    lparams.n_batch         = params.n_batch;
+    lparams.n_gpu_layers    = params.n_gpu_layers;
+    lparams.main_gpu        = params.main_gpu;
+    lparams.tensor_split    = params.tensor_split;
+    lparams.low_vram        = params.low_vram;
+    lparams.mul_mat_q       = params.mul_mat_q;
+    lparams.seed            = params.seed;
+    lparams.f16_kv          = params.memory_f16;
+    lparams.use_mmap        = params.use_mmap;
+    lparams.use_mlock       = params.use_mlock;
+    lparams.logits_all      = params.perplexity;
+    lparams.embedding       = params.embedding;
    lparams.rope_freq_base  = params.rope_freq_base;
    lparams.rope_freq_scale = params.rope_freq_scale;

    return lparams;
 }

-std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(const gpt_params & params) {
+std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params) {
    auto lparams = llama_context_params_from_gpt_params(params);

    llama_model * model  = llama_load_model_from_file(params.model.c_str(), lparams);
@@ -622,378 +690,77 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
        }
    }

+    if (params.ignore_eos) {
+        params.logit_bias[llama_token_eos(lctx)] = -INFINITY;
+    }
+
    return std::make_tuple(model, lctx);
 }

-void console_init(console_state & con_st) {
-#if defined(_WIN32)
-    // Windows-specific console initialization
-    DWORD dwMode = 0;
-    con_st.hConsole = GetStdHandle(STD_OUTPUT_HANDLE);
-    if (con_st.hConsole == INVALID_HANDLE_VALUE || !GetConsoleMode(con_st.hConsole, &dwMode)) {
-        con_st.hConsole = GetStdHandle(STD_ERROR_HANDLE);
-        if (con_st.hConsole != INVALID_HANDLE_VALUE && (!GetConsoleMode(con_st.hConsole, &dwMode))) {
-            con_st.hConsole = NULL;
-        }
-    }
-    if (con_st.hConsole) {
-        // Enable ANSI colors on Windows 10+
-        if (con_st.use_color && !(dwMode & ENABLE_VIRTUAL_TERMINAL_PROCESSING)) {
-            SetConsoleMode(con_st.hConsole, dwMode | ENABLE_VIRTUAL_TERMINAL_PROCESSING);
-        }
-        // Set console output codepage to UTF8
-        SetConsoleOutputCP(CP_UTF8);
-    }
-    HANDLE hConIn = GetStdHandle(STD_INPUT_HANDLE);
-    if (hConIn != INVALID_HANDLE_VALUE && GetConsoleMode(hConIn, &dwMode)) {
-        // Set console input codepage to UTF16
-        _setmode(_fileno(stdin), _O_WTEXT);
+//
+// Vocab utils
+//

-        // Turn off ICANON (ENABLE_LINE_INPUT) and ECHO (ENABLE_ECHO_INPUT)
-        dwMode &= ~(ENABLE_LINE_INPUT | ENABLE_ECHO_INPUT);
-        SetConsoleMode(hConIn, dwMode);
-    }
-#else
-    // POSIX-specific console initialization
-    struct termios new_termios;
-    tcgetattr(STDIN_FILENO, &con_st.prev_state);
-    new_termios = con_st.prev_state;
-    new_termios.c_lflag &= ~(ICANON | ECHO);
-    new_termios.c_cc[VMIN] = 1;
-    new_termios.c_cc[VTIME] = 0;
-    tcsetattr(STDIN_FILENO, TCSANOW, &new_termios);
-
-    con_st.tty = fopen("/dev/tty", "w+");
-    if (con_st.tty != nullptr) {
-        con_st.out = con_st.tty;
-    }
-
-    setlocale(LC_ALL, "");
-#endif
-}
-
-void console_cleanup(console_state & con_st) {
-    // Reset console color
-    console_set_color(con_st, CONSOLE_COLOR_DEFAULT);
-
-#if !defined(_WIN32)
-    if (con_st.tty != nullptr) {
-        con_st.out = stdout;
-        fclose(con_st.tty);
-        con_st.tty = nullptr;
-    }
-    // Restore the terminal settings on POSIX systems
-    tcsetattr(STDIN_FILENO, TCSANOW, &con_st.prev_state);
-#endif
-}
-
-/* Keep track of current color of output, and emit ANSI code if it changes. */
-void console_set_color(console_state & con_st, console_color_t color) {
-    if (con_st.use_color && con_st.color != color) {
-        fflush(stdout);
-        switch(color) {
-            case CONSOLE_COLOR_DEFAULT:
-                fprintf(con_st.out, ANSI_COLOR_RESET);
-                break;
-            case CONSOLE_COLOR_PROMPT:
-                fprintf(con_st.out, ANSI_COLOR_YELLOW);
-                break;
-            case CONSOLE_COLOR_USER_INPUT:
-                fprintf(con_st.out, ANSI_BOLD ANSI_COLOR_GREEN);
-                break;
-            case CONSOLE_COLOR_ERROR:
-                fprintf(con_st.out, ANSI_BOLD ANSI_COLOR_RED);
-                break;
-        }
-        con_st.color = color;
-        fflush(con_st.out);
-    }
-}
-
-char32_t getchar32() {
-#if defined(_WIN32)
-    HANDLE hConsole = GetStdHandle(STD_INPUT_HANDLE);
-    wchar_t high_surrogate = 0;
-
-    while (true) {
-        INPUT_RECORD record;
-        DWORD count;
-        if (!ReadConsoleInputW(hConsole, &record, 1, &count) || count == 0) {
-            return WEOF;
-        }
-
-        if (record.EventType == KEY_EVENT && record.Event.KeyEvent.bKeyDown) {
-            wchar_t wc = record.Event.KeyEvent.uChar.UnicodeChar;
-            if (wc == 0) {
-                continue;
-            }
-
-            if ((wc >= 0xD800) && (wc <= 0xDBFF)) { // Check if wc is a high surrogate
-                high_surrogate = wc;
-                continue;
-            } else if ((wc >= 0xDC00) && (wc <= 0xDFFF)) { // Check if wc is a low surrogate
-                if (high_surrogate != 0) { // Check if we have a high surrogate
-                    return ((high_surrogate - 0xD800) << 10) + (wc - 0xDC00) + 0x10000;
-                }
-            }
-
-            high_surrogate = 0; // Reset the high surrogate
-            return static_cast<char32_t>(wc);
-        }
-    }
-#else
-    wchar_t wc = getwchar();
-    if (static_cast<wint_t>(wc) == WEOF) {
-        return WEOF;
-    }
-
-#if WCHAR_MAX == 0xFFFF
-    if ((wc >= 0xD800) && (wc <= 0xDBFF)) { // Check if wc is a high surrogate
-        wchar_t low_surrogate = getwchar();
-        if ((low_surrogate >= 0xDC00) && (low_surrogate <= 0xDFFF)) { // Check if the next wchar is a low surrogate
-            return (static_cast<char32_t>(wc & 0x03FF) << 10) + (low_surrogate & 0x03FF) + 0x10000;
-        }
-    }
-    if ((wc >= 0xD800) && (wc <= 0xDFFF)) { // Invalid surrogate pair
-        return 0xFFFD; // Return the replacement character U+FFFD
-    }
-#endif
-
-    return static_cast<char32_t>(wc);
-#endif
-}
-
-void pop_cursor(console_state & con_st) {
-#if defined(_WIN32)
-    if (con_st.hConsole != NULL) {
-        CONSOLE_SCREEN_BUFFER_INFO bufferInfo;
-        GetConsoleScreenBufferInfo(con_st.hConsole, &bufferInfo);
-
-        COORD newCursorPosition = bufferInfo.dwCursorPosition;
-        if (newCursorPosition.X == 0) {
-            newCursorPosition.X = bufferInfo.dwSize.X - 1;
-            newCursorPosition.Y -= 1;
-        } else {
-            newCursorPosition.X -= 1;
-        }
-
-        SetConsoleCursorPosition(con_st.hConsole, newCursorPosition);
-        return;
-    }
-#endif
-    putc('\b', con_st.out);
-}
-
-int estimateWidth(char32_t codepoint) {
-#if defined(_WIN32)
-    return 1;
-#else
-    return wcwidth(codepoint);
-#endif
-}
-
-int put_codepoint(console_state & con_st, const char* utf8_codepoint, size_t length, int expectedWidth) {
-#if defined(_WIN32)
-    CONSOLE_SCREEN_BUFFER_INFO bufferInfo;
-    if (!GetConsoleScreenBufferInfo(con_st.hConsole, &bufferInfo)) {
-        // go with the default
-        return expectedWidth;
-    }
-    COORD initialPosition = bufferInfo.dwCursorPosition;
-    DWORD nNumberOfChars = length;
-    WriteConsole(con_st.hConsole, utf8_codepoint, nNumberOfChars, &nNumberOfChars, NULL);
-
-    CONSOLE_SCREEN_BUFFER_INFO newBufferInfo;
-    GetConsoleScreenBufferInfo(con_st.hConsole, &newBufferInfo);
-
-    // Figure out our real position if we're in the last column
-    if (utf8_codepoint[0] != 0x09 && initialPosition.X == newBufferInfo.dwSize.X - 1) {
-        DWORD nNumberOfChars;
-        WriteConsole(con_st.hConsole, &" \b", 2, &nNumberOfChars, NULL);
-        GetConsoleScreenBufferInfo(con_st.hConsole, &newBufferInfo);
-    }
-
-    int width = newBufferInfo.dwCursorPosition.X - initialPosition.X;
-    if (width < 0) {
-        width += newBufferInfo.dwSize.X;
-    }
-    return width;
-#else
-    // we can trust expectedWidth if we've got one
-    if (expectedWidth >= 0 || con_st.tty == nullptr) {
-        fwrite(utf8_codepoint, length, 1, con_st.out);
-        return expectedWidth;
-    }
-
-    fputs("\033[6n", con_st.tty); // Query cursor position
-    int x1, x2, y1, y2;
-    int results = 0;
-    results = fscanf(con_st.tty, "\033[%d;%dR", &y1, &x1);
-
-    fwrite(utf8_codepoint, length, 1, con_st.tty);
-
-    fputs("\033[6n", con_st.tty); // Query cursor position
-    results += fscanf(con_st.tty, "\033[%d;%dR", &y2, &x2);
-
-    if (results != 4) {
-        return expectedWidth;
-    }
-
-    int width = x2 - x1;
-    if (width < 0) {
-        // Calculate the width considering text wrapping
-        struct winsize w;
-        ioctl(STDOUT_FILENO, TIOCGWINSZ, &w);
-        width += w.ws_col;
-    }
-    return width;
-#endif
-}
-
-void replace_last(console_state & con_st, char ch) {
-#if defined(_WIN32)
-    pop_cursor(con_st);
-    put_codepoint(con_st, &ch, 1, 1);
-#else
-    fprintf(con_st.out, "\b%c", ch);
-#endif
-}
-
-void append_utf8(char32_t ch, std::string & out) {
-    if (ch <= 0x7F) {
-        out.push_back(static_cast<unsigned char>(ch));
-    } else if (ch <= 0x7FF) {
-        out.push_back(static_cast<unsigned char>(0xC0 | ((ch >> 6) & 0x1F)));
-        out.push_back(static_cast<unsigned char>(0x80 | (ch & 0x3F)));
-    } else if (ch <= 0xFFFF) {
-        out.push_back(static_cast<unsigned char>(0xE0 | ((ch >> 12) & 0x0F)));
-        out.push_back(static_cast<unsigned char>(0x80 | ((ch >> 6) & 0x3F)));
-        out.push_back(static_cast<unsigned char>(0x80 | (ch & 0x3F)));
-    } else if (ch <= 0x10FFFF) {
-        out.push_back(static_cast<unsigned char>(0xF0 | ((ch >> 18) & 0x07)));
-        out.push_back(static_cast<unsigned char>(0x80 | ((ch >> 12) & 0x3F)));
-        out.push_back(static_cast<unsigned char>(0x80 | ((ch >> 6) & 0x3F)));
-        out.push_back(static_cast<unsigned char>(0x80 | (ch & 0x3F)));
+std::vector<llama_token> llama_tokenize(
+        struct llama_context * ctx,
+           const std::string & text,
+                        bool   add_bos) {
+    // upper limit for the number of tokens
+    int n_tokens = text.length() + add_bos;
+    std::vector<llama_token> result(n_tokens);
+    n_tokens = llama_tokenize(ctx, text.c_str(), result.data(), result.size(), add_bos);
+    if (n_tokens < 0) {
+        result.resize(-n_tokens);
+        int check = llama_tokenize(ctx, text.c_str(), result.data(), result.size(), add_bos);
+        GGML_ASSERT(check == -n_tokens);
    } else {
-        // Invalid Unicode code point
+        result.resize(n_tokens);
    }
+    return result;
 }

-// Helper function to remove the last UTF-8 character from a string
-void pop_back_utf8_char(std::string & line) {
-    if (line.empty()) {
-        return;
-    }
-
-    size_t pos = line.length() - 1;
-
-    // Find the start of the last UTF-8 character (checking up to 4 bytes back)
-    for (size_t i = 0; i < 3 && pos > 0; ++i, --pos) {
-        if ((line[pos] & 0xC0) != 0x80) break; // Found the start of the character
-    }
-    line.erase(pos);
-}
-
-bool console_readline(console_state & con_st, std::string & line) {
-    console_set_color(con_st, CONSOLE_COLOR_USER_INPUT);
-    if (con_st.out != stdout) {
-        fflush(stdout);
-    }
-
-    line.clear();
-    std::vector<int> widths;
-    bool is_special_char = false;
-    bool end_of_stream = false;
-
-    char32_t input_char;
-    while (true) {
-        fflush(con_st.out); // Ensure all output is displayed before waiting for input
-        input_char = getchar32();
-
-        if (input_char == '\r' || input_char == '\n') {
-            break;
-        }
-
-        if (input_char == (char32_t) WEOF || input_char == 0x04 /* Ctrl+D*/) {
-            end_of_stream = true;
-            break;
-        }
-
-        if (is_special_char) {
-            console_set_color(con_st, CONSOLE_COLOR_USER_INPUT);
-            replace_last(con_st, line.back());
-            is_special_char = false;
-        }
-
-        if (input_char == '\033') { // Escape sequence
-            char32_t code = getchar32();
-            if (code == '[' || code == 0x1B) {
-                // Discard the rest of the escape sequence
-                while ((code = getchar32()) != (char32_t) WEOF) {
-                    if ((code >= 'A' && code <= 'Z') || (code >= 'a' && code <= 'z') || code == '~') {
-                        break;
-                    }
-                }
-            }
-        } else if (input_char == 0x08 || input_char == 0x7F) { // Backspace
-            if (!widths.empty()) {
-                int count;
-                do {
-                    count = widths.back();
-                    widths.pop_back();
-                    // Move cursor back, print space, and move cursor back again
-                    for (int i = 0; i < count; i++) {
-                        replace_last(con_st, ' ');
-                        pop_cursor(con_st);
-                    }
-                    pop_back_utf8_char(line);
-                } while (count == 0 && !widths.empty());
-            }
-        } else {
-            int offset = line.length();
-            append_utf8(input_char, line);
-            int width = put_codepoint(con_st, line.c_str() + offset, line.length() - offset, estimateWidth(input_char));
-            if (width < 0) {
-                width = 0;
-            }
-            widths.push_back(width);
-        }
-
-        if (!line.empty() && (line.back() == '\\' || line.back() == '/')) {
-            console_set_color(con_st, CONSOLE_COLOR_PROMPT);
-            replace_last(con_st, line.back());
-            is_special_char = true;
-        }
-    }
-
-    bool has_more = con_st.multiline_input;
-    if (is_special_char) {
-        replace_last(con_st, ' ');
-        pop_cursor(con_st);
-
-        char last = line.back();
-        line.pop_back();
-        if (last == '\\') {
-            line += '\n';
-            fputc('\n', con_st.out);
-            has_more = !has_more;
-        } else {
-            // llama will just eat the single space, it won't act as a space
-            if (line.length() == 1 && line.back() == ' ') {
-                line.clear();
-                pop_cursor(con_st);
-            }
-            has_more = false;
-        }
+std::string llama_token_to_str(const struct llama_context * ctx, llama_token token) {
+    std::vector<char> result(8, 0);
+    const int n_tokens = llama_token_to_str(ctx, token, result.data(), result.size());
+    if (n_tokens < 0) {
+        result.resize(-n_tokens);
+        int check = llama_token_to_str(ctx, token, result.data(), result.size());
+        GGML_ASSERT(check == -n_tokens);
    } else {
-        if (end_of_stream) {
-            has_more = false;
-        } else {
-            line += '\n';
-            fputc('\n', con_st.out);
-        }
+        result.resize(n_tokens);
    }

-    fflush(con_st.out);
-    return has_more;
+    return std::string(result.data(), result.size());
 }
+
+std::vector<llama_token> llama_tokenize_bpe(
+        struct llama_context * ctx,
+           const std::string & text,
+                        bool   add_bos) {
+    int n_tokens = text.length() + add_bos;
+    std::vector<llama_token> result(n_tokens);
+    n_tokens = llama_tokenize_bpe(ctx, text.c_str(), result.data(), result.size(), add_bos);
+    if (n_tokens < 0) {
+        result.resize(-n_tokens);
+        int check = llama_tokenize_bpe(ctx, text.c_str(), result.data(), result.size(), add_bos);
+        GGML_ASSERT(check == -n_tokens);
+    } else {
+        result.resize(n_tokens);
+    }
+    return result;
+}
+
+std::string llama_token_to_str_bpe(const struct llama_context * ctx, llama_token token) {
+    std::vector<char> result(8, 0);
+    const int n_tokens = llama_token_to_str_bpe(ctx, token, result.data(), result.size());
+    if (n_tokens < 0) {
+        result.resize(-n_tokens);
+        const int check = llama_token_to_str_bpe(ctx, token, result.data(), result.size());
+        GGML_ASSERT(check == -n_tokens);
+    } else {
+        result.resize(n_tokens);
+    }
+
+    return std::string(result.data(), result.size());
+}
+
--- a/examples/common.h
+++ b/examples/common.h
@@ -11,32 +11,27 @@
 #include <unordered_map>
 #include <tuple>

-#if !defined (_WIN32)
-#include <stdio.h>
-#include <termios.h>
-#endif
-
 //
 // CLI argument parsing
 //
 int32_t get_num_physical_cores();

 struct gpt_params {
-    uint32_t seed                           = -1;  // RNG seed
+    uint32_t seed                           = -1;   // RNG seed
    int32_t n_threads                       = get_num_physical_cores();
-    int32_t n_predict                       = -1;  // new tokens to predict
-    int32_t n_ctx                           = 512; // context size
-    int32_t n_batch                         = 512; // batch size for prompt processing (must be >=32 to use BLAS)
-    int32_t n_keep                          = 0;   // number of tokens to keep from initial prompt
-    int32_t n_gpu_layers                    = 0;   // number of layers to store in VRAM
-    int32_t main_gpu                        = 0;   // the GPU that is used for scratch and small tensors
-    float   tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
-    int32_t n_probs                         = 0;   // if greater than 0, output the probabilities of top n_probs tokens.
+    int32_t n_predict                       = -1;   // new tokens to predict
+    int32_t n_ctx                           = 512;  // context size
+    int32_t n_batch                         = 512;  // batch size for prompt processing (must be >=32 to use BLAS)
+    int32_t n_keep                          = 0;    // number of tokens to keep from initial prompt
+    int32_t n_chunks                        = -1;   // max number of chunks to process (-1 = unlimited)
+    int32_t n_gpu_layers                    = 0;    // number of layers to store in VRAM
+    int32_t main_gpu                        = 0;    // the GPU that is used for scratch and small tensors
+    float   tensor_split[LLAMA_MAX_DEVICES] = {0};  // how split tensors should be distributed across GPUs
+    int32_t n_probs                         = 0;    // if greater than 0, output the probabilities of top n_probs tokens.
    float   rope_freq_base                  = 10000.0f; // RoPE base frequency
    float   rope_freq_scale                 = 1.0f;     // RoPE frequency scaling factor

    // sampling parameters
-    std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
    int32_t top_k             = 40;    // <= 0 to use vocab size
    float   top_p             = 0.95f; // 1.0 = disabled
    float   tfs_z             = 1.00f; // 1.0 = disabled
@@ -46,28 +41,34 @@ struct gpt_params {
    int32_t repeat_last_n     = 64;    // last n tokens to penalize (0 = disable penalty, -1 = context size)
    float   frequency_penalty = 0.00f; // 0.0 = disabled
    float   presence_penalty  = 0.00f; // 0.0 = disabled
-    int     mirostat          = 0;     // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
+    int32_t mirostat          = 0;     // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
    float   mirostat_tau      = 5.00f; // target entropy
    float   mirostat_eta      = 0.10f; // learning rate

+    std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
+
    // Classifier-Free Guidance
    // https://arxiv.org/abs/2306.17806
    std::string cfg_negative_prompt;       // string to help guidance
    float       cfg_scale         = 1.f;   // How strong is guidance
-    float       cfg_smooth_factor = 1.f;   // Smooth factor between old and new logits

-    std::string model             = "models/7B/ggml-model.bin"; // model path
+    std::string model             = "models/7B/ggml-model-f16.gguf"; // model path
    std::string model_alias       = "unknown"; // model alias
    std::string prompt            = "";
    std::string path_prompt_cache = "";  // path to file for saving/loading prompt eval state
    std::string input_prefix      = "";  // string to prefix user inputs with
    std::string input_suffix      = "";  // string to suffix user inputs with
+    std::string grammar           = "";  // optional BNF-like grammar to constrain sampling
    std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted

    std::string lora_adapter = "";  // lora adapter path
    std::string lora_base    = "";  // base model path for the lora adapter

-    bool low_vram          = false;   // if true, reduce VRAM usage at the cost of performance
+    bool hellaswag         = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
+    size_t hellaswag_tasks = 400;   // number of tasks to use when computing the HellaSwag score
+
+    bool low_vram          = false; // if true, reduce VRAM usage at the cost of performance
+    bool mul_mat_q         = true;  // if true, use mul_mat_q kernels instead of cuBLAS
    bool memory_f16        = true;  // use f16 instead of f32 for memory kv
    bool random_prompt     = false; // do not randomize prompt if none provided
    bool use_color         = false; // use color to distinguish generations and inputs
@@ -78,7 +79,10 @@ struct gpt_params {
    bool embedding         = false; // get only sentence embedding
    bool interactive_first = false; // wait for user input immediately
    bool multiline_input   = false; // reverse the usage of `\`
+    bool simple_io         = false; // improves compatibility with subprocesses and limited consoles

+    bool input_prefix_bos  = false; // prefix BOS to user inputs, preceding input_prefix
+    bool ignore_eos        = false; // ignore generated EOS tokens
    bool instruct          = false; // instruction mode (used for Alpaca models)
    bool penalize_nl       = true;  // consider newlines as a repeatable token
    bool perplexity        = false; // compute perplexity over the prompt
@@ -96,54 +100,31 @@ void gpt_print_usage(int argc, char ** argv, const gpt_params & params);

 std::string gpt_random_prompt(std::mt19937 & rng);

-//
-// Vocab utils
-//
-
-std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos);
-
 //
 // Model utils
 //

-std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(const gpt_params & params);
+std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params);
 struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);

 //
-// Console utils
+// Vocab utils
 //

-#define ANSI_COLOR_RED     "\x1b[31m"
-#define ANSI_COLOR_GREEN   "\x1b[32m"
-#define ANSI_COLOR_YELLOW  "\x1b[33m"
-#define ANSI_COLOR_BLUE    "\x1b[34m"
-#define ANSI_COLOR_MAGENTA "\x1b[35m"
-#define ANSI_COLOR_CYAN    "\x1b[36m"
-#define ANSI_COLOR_RESET   "\x1b[0m"
-#define ANSI_BOLD          "\x1b[1m"
+std::vector<llama_token> llama_tokenize(
+        struct llama_context * ctx,
+           const std::string & text,
+                        bool   add_bos);

-enum console_color_t {
-    CONSOLE_COLOR_DEFAULT=0,
-    CONSOLE_COLOR_PROMPT,
-    CONSOLE_COLOR_USER_INPUT,
-    CONSOLE_COLOR_ERROR
-};
+std::vector<llama_token> llama_tokenize_bpe(
+        struct llama_context * ctx,
+           const std::string & text,
+                        bool   add_bos);

-struct console_state {
-    bool multiline_input = false;
-    bool use_color = false;
-    console_color_t color = CONSOLE_COLOR_DEFAULT;
+std::string llama_token_to_str(
+        const struct llama_context * ctx,
+                       llama_token   token);

-    FILE* out = stdout;
-#if defined (_WIN32)
-    void* hConsole;
-#else
-    FILE* tty = nullptr;
-    termios prev_state;
-#endif
-};
-
-void console_init(console_state & con_st);
-void console_cleanup(console_state & con_st);
-void console_set_color(console_state & con_st, console_color_t color);
-bool console_readline(console_state & con_st, std::string & line);
+std::string llama_token_to_str_bpe(
+    const struct llama_context * ctx,
+                   llama_token   token);
--- a/common/console.cpp
+++ b/common/console.cpp
@@ -0,0 +1,500 @@
+#include "console.h"
+#include <vector>
+#include <iostream>
+
+#if defined(_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif
+#include <windows.h>
+#include <fcntl.h>
+#include <io.h>
+#ifndef ENABLE_VIRTUAL_TERMINAL_PROCESSING
+#define ENABLE_VIRTUAL_TERMINAL_PROCESSING 0x0004
+#endif
+#else
+#include <climits>
+#include <sys/ioctl.h>
+#include <unistd.h>
+#include <wchar.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <termios.h>
+#endif
+
+#define ANSI_COLOR_RED     "\x1b[31m"
+#define ANSI_COLOR_GREEN   "\x1b[32m"
+#define ANSI_COLOR_YELLOW  "\x1b[33m"
+#define ANSI_COLOR_BLUE    "\x1b[34m"
+#define ANSI_COLOR_MAGENTA "\x1b[35m"
+#define ANSI_COLOR_CYAN    "\x1b[36m"
+#define ANSI_COLOR_RESET   "\x1b[0m"
+#define ANSI_BOLD          "\x1b[1m"
+
+namespace console {
+
+    //
+    // Console state
+    //
+
+    static bool      advanced_display = false;
+    static bool      simple_io        = true;
+    static display_t current_display  = reset;
+
+    static FILE*     out              = stdout;
+
+#if defined (_WIN32)
+    static void*     hConsole;
+#else
+    static FILE*     tty              = nullptr;
+    static termios   initial_state;
+#endif
+
+    //
+    // Init and cleanup
+    //
+
+    void init(bool use_simple_io, bool use_advanced_display) {
+        advanced_display = use_advanced_display;
+        simple_io = use_simple_io;
+#if defined(_WIN32)
+        // Windows-specific console initialization
+        DWORD dwMode = 0;
+        hConsole = GetStdHandle(STD_OUTPUT_HANDLE);
+        if (hConsole == INVALID_HANDLE_VALUE || !GetConsoleMode(hConsole, &dwMode)) {
+            hConsole = GetStdHandle(STD_ERROR_HANDLE);
+            if (hConsole != INVALID_HANDLE_VALUE && (!GetConsoleMode(hConsole, &dwMode))) {
+                hConsole = nullptr;
+                simple_io = true;
+            }
+        }
+        if (hConsole) {
+            // Check conditions combined to reduce nesting
+            if (advanced_display && !(dwMode & ENABLE_VIRTUAL_TERMINAL_PROCESSING) &&
+                !SetConsoleMode(hConsole, dwMode | ENABLE_VIRTUAL_TERMINAL_PROCESSING)) {
+                advanced_display = false;
+            }
+            // Set console output codepage to UTF8
+            SetConsoleOutputCP(CP_UTF8);
+        }
+        HANDLE hConIn = GetStdHandle(STD_INPUT_HANDLE);
+        if (hConIn != INVALID_HANDLE_VALUE && GetConsoleMode(hConIn, &dwMode)) {
+            // Set console input codepage to UTF16
+            _setmode(_fileno(stdin), _O_WTEXT);
+
+            // Set ICANON (ENABLE_LINE_INPUT) and ECHO (ENABLE_ECHO_INPUT)
+            if (simple_io) {
+                dwMode |= ENABLE_LINE_INPUT | ENABLE_ECHO_INPUT;
+            } else {
+                dwMode &= ~(ENABLE_LINE_INPUT | ENABLE_ECHO_INPUT);
+            }
+            if (!SetConsoleMode(hConIn, dwMode)) {
+                simple_io = true;
+            }
+        }
+#else
+        // POSIX-specific console initialization
+        if (!simple_io) {
+            struct termios new_termios;
+            tcgetattr(STDIN_FILENO, &initial_state);
+            new_termios = initial_state;
+            new_termios.c_lflag &= ~(ICANON | ECHO);
+            new_termios.c_cc[VMIN] = 1;
+            new_termios.c_cc[VTIME] = 0;
+            tcsetattr(STDIN_FILENO, TCSANOW, &new_termios);
+
+            tty = fopen("/dev/tty", "w+");
+            if (tty != nullptr) {
+                out = tty;
+            }
+        }
+
+        setlocale(LC_ALL, "");
+#endif
+    }
+
+    void cleanup() {
+        // Reset console display
+        set_display(reset);
+
+#if !defined(_WIN32)
+        // Restore settings on POSIX systems
+        if (!simple_io) {
+            if (tty != nullptr) {
+                out = stdout;
+                fclose(tty);
+                tty = nullptr;
+            }
+            tcsetattr(STDIN_FILENO, TCSANOW, &initial_state);
+        }
+#endif
+    }
+
+    //
+    // Display and IO
+    //
+
+    // Keep track of current display and only emit ANSI code if it changes
+    void set_display(display_t display) {
+        if (advanced_display && current_display != display) {
+            fflush(stdout);
+            switch(display) {
+                case reset:
+                    fprintf(out, ANSI_COLOR_RESET);
+                    break;
+                case prompt:
+                    fprintf(out, ANSI_COLOR_YELLOW);
+                    break;
+                case user_input:
+                    fprintf(out, ANSI_BOLD ANSI_COLOR_GREEN);
+                    break;
+                case error:
+                    fprintf(out, ANSI_BOLD ANSI_COLOR_RED);
+            }
+            current_display = display;
+            fflush(out);
+        }
+    }
+
+    char32_t getchar32() {
+#if defined(_WIN32)
+        HANDLE hConsole = GetStdHandle(STD_INPUT_HANDLE);
+        wchar_t high_surrogate = 0;
+
+        while (true) {
+            INPUT_RECORD record;
+            DWORD count;
+            if (!ReadConsoleInputW(hConsole, &record, 1, &count) || count == 0) {
+                return WEOF;
+            }
+
+            if (record.EventType == KEY_EVENT && record.Event.KeyEvent.bKeyDown) {
+                wchar_t wc = record.Event.KeyEvent.uChar.UnicodeChar;
+                if (wc == 0) {
+                    continue;
+                }
+
+                if ((wc >= 0xD800) && (wc <= 0xDBFF)) { // Check if wc is a high surrogate
+                    high_surrogate = wc;
+                    continue;
+                }
+                if ((wc >= 0xDC00) && (wc <= 0xDFFF)) { // Check if wc is a low surrogate
+                    if (high_surrogate != 0) { // Check if we have a high surrogate
+                        return ((high_surrogate - 0xD800) << 10) + (wc - 0xDC00) + 0x10000;
+                    }
+                }
+
+                high_surrogate = 0; // Reset the high surrogate
+                return static_cast<char32_t>(wc);
+            }
+        }
+#else
+        wchar_t wc = getwchar();
+        if (static_cast<wint_t>(wc) == WEOF) {
+            return WEOF;
+        }
+
+#if WCHAR_MAX == 0xFFFF
+        if ((wc >= 0xD800) && (wc <= 0xDBFF)) { // Check if wc is a high surrogate
+            wchar_t low_surrogate = getwchar();
+            if ((low_surrogate >= 0xDC00) && (low_surrogate <= 0xDFFF)) { // Check if the next wchar is a low surrogate
+                return (static_cast<char32_t>(wc & 0x03FF) << 10) + (low_surrogate & 0x03FF) + 0x10000;
+            }
+        }
+        if ((wc >= 0xD800) && (wc <= 0xDFFF)) { // Invalid surrogate pair
+            return 0xFFFD; // Return the replacement character U+FFFD
+        }
+#endif
+
+        return static_cast<char32_t>(wc);
+#endif
+    }
+
+    void pop_cursor() {
+#if defined(_WIN32)
+        if (hConsole != NULL) {
+            CONSOLE_SCREEN_BUFFER_INFO bufferInfo;
+            GetConsoleScreenBufferInfo(hConsole, &bufferInfo);
+
+            COORD newCursorPosition = bufferInfo.dwCursorPosition;
+            if (newCursorPosition.X == 0) {
+                newCursorPosition.X = bufferInfo.dwSize.X - 1;
+                newCursorPosition.Y -= 1;
+            } else {
+                newCursorPosition.X -= 1;
+            }
+
+            SetConsoleCursorPosition(hConsole, newCursorPosition);
+            return;
+        }
+#endif
+        putc('\b', out);
+    }
+
+    int estimateWidth(char32_t codepoint) {
+#if defined(_WIN32)
+        return 1;
+#else
+        return wcwidth(codepoint);
+#endif
+    }
+
+    int put_codepoint(const char* utf8_codepoint, size_t length, int expectedWidth) {
+#if defined(_WIN32)
+        CONSOLE_SCREEN_BUFFER_INFO bufferInfo;
+        if (!GetConsoleScreenBufferInfo(hConsole, &bufferInfo)) {
+            // go with the default
+            return expectedWidth;
+        }
+        COORD initialPosition = bufferInfo.dwCursorPosition;
+        DWORD nNumberOfChars = length;
+        WriteConsole(hConsole, utf8_codepoint, nNumberOfChars, &nNumberOfChars, NULL);
+
+        CONSOLE_SCREEN_BUFFER_INFO newBufferInfo;
+        GetConsoleScreenBufferInfo(hConsole, &newBufferInfo);
+
+        // Figure out our real position if we're in the last column
+        if (utf8_codepoint[0] != 0x09 && initialPosition.X == newBufferInfo.dwSize.X - 1) {
+            DWORD nNumberOfChars;
+            WriteConsole(hConsole, &" \b", 2, &nNumberOfChars, NULL);
+            GetConsoleScreenBufferInfo(hConsole, &newBufferInfo);
+        }
+
+        int width = newBufferInfo.dwCursorPosition.X - initialPosition.X;
+        if (width < 0) {
+            width += newBufferInfo.dwSize.X;
+        }
+        return width;
+#else
+        // We can trust expectedWidth if we've got one
+        if (expectedWidth >= 0 || tty == nullptr) {
+            fwrite(utf8_codepoint, length, 1, out);
+            return expectedWidth;
+        }
+
+        fputs("\033[6n", tty); // Query cursor position
+        int x1;
+        int y1;
+        int x2;
+        int y2;
+        int results = 0;
+        results = fscanf(tty, "\033[%d;%dR", &y1, &x1);
+
+        fwrite(utf8_codepoint, length, 1, tty);
+
+        fputs("\033[6n", tty); // Query cursor position
+        results += fscanf(tty, "\033[%d;%dR", &y2, &x2);
+
+        if (results != 4) {
+            return expectedWidth;
+        }
+
+        int width = x2 - x1;
+        if (width < 0) {
+            // Calculate the width considering text wrapping
+            struct winsize w;
+            ioctl(STDOUT_FILENO, TIOCGWINSZ, &w);
+            width += w.ws_col;
+        }
+        return width;
+#endif
+    }
+
+    void replace_last(char ch) {
+#if defined(_WIN32)
+        pop_cursor();
+        put_codepoint(&ch, 1, 1);
+#else
+        fprintf(out, "\b%c", ch);
+#endif
+    }
+
+    void append_utf8(char32_t ch, std::string & out) {
+        if (ch <= 0x7F) {
+            out.push_back(static_cast<unsigned char>(ch));
+        } else if (ch <= 0x7FF) {
+            out.push_back(static_cast<unsigned char>(0xC0 | ((ch >> 6) & 0x1F)));
+            out.push_back(static_cast<unsigned char>(0x80 | (ch & 0x3F)));
+        } else if (ch <= 0xFFFF) {
+            out.push_back(static_cast<unsigned char>(0xE0 | ((ch >> 12) & 0x0F)));
+            out.push_back(static_cast<unsigned char>(0x80 | ((ch >> 6) & 0x3F)));
+            out.push_back(static_cast<unsigned char>(0x80 | (ch & 0x3F)));
+        } else if (ch <= 0x10FFFF) {
+            out.push_back(static_cast<unsigned char>(0xF0 | ((ch >> 18) & 0x07)));
+            out.push_back(static_cast<unsigned char>(0x80 | ((ch >> 12) & 0x3F)));
+            out.push_back(static_cast<unsigned char>(0x80 | ((ch >> 6) & 0x3F)));
+            out.push_back(static_cast<unsigned char>(0x80 | (ch & 0x3F)));
+        } else {
+            // Invalid Unicode code point
+        }
+    }
+
+    // Helper function to remove the last UTF-8 character from a string
+    void pop_back_utf8_char(std::string & line) {
+        if (line.empty()) {
+            return;
+        }
+
+        size_t pos = line.length() - 1;
+
+        // Find the start of the last UTF-8 character (checking up to 4 bytes back)
+        for (size_t i = 0; i < 3 && pos > 0; ++i, --pos) {
+            if ((line[pos] & 0xC0) != 0x80) {
+                break; // Found the start of the character
+            }
+        }
+        line.erase(pos);
+    }
+
+    bool readline_advanced(std::string & line, bool multiline_input) {
+        if (out != stdout) {
+            fflush(stdout);
+        }
+
+        line.clear();
+        std::vector<int> widths;
+        bool is_special_char = false;
+        bool end_of_stream = false;
+
+        char32_t input_char;
+        while (true) {
+            fflush(out); // Ensure all output is displayed before waiting for input
+            input_char = getchar32();
+
+            if (input_char == '\r' || input_char == '\n') {
+                break;
+            }
+
+            if (input_char == (char32_t) WEOF || input_char == 0x04 /* Ctrl+D*/) {
+                end_of_stream = true;
+                break;
+            }
+
+            if (is_special_char) {
+                set_display(user_input);
+                replace_last(line.back());
+                is_special_char = false;
+            }
+
+            if (input_char == '\033') { // Escape sequence
+                char32_t code = getchar32();
+                if (code == '[' || code == 0x1B) {
+                    // Discard the rest of the escape sequence
+                    while ((code = getchar32()) != (char32_t) WEOF) {
+                        if ((code >= 'A' && code <= 'Z') || (code >= 'a' && code <= 'z') || code == '~') {
+                            break;
+                        }
+                    }
+                }
+            } else if (input_char == 0x08 || input_char == 0x7F) { // Backspace
+                if (!widths.empty()) {
+                    int count;
+                    do {
+                        count = widths.back();
+                        widths.pop_back();
+                        // Move cursor back, print space, and move cursor back again
+                        for (int i = 0; i < count; i++) {
+                            replace_last(' ');
+                            pop_cursor();
+                        }
+                        pop_back_utf8_char(line);
+                    } while (count == 0 && !widths.empty());
+                }
+            } else {
+                int offset = line.length();
+                append_utf8(input_char, line);
+                int width = put_codepoint(line.c_str() + offset, line.length() - offset, estimateWidth(input_char));
+                if (width < 0) {
+                    width = 0;
+                }
+                widths.push_back(width);
+            }
+
+            if (!line.empty() && (line.back() == '\\' || line.back() == '/')) {
+                set_display(prompt);
+                replace_last(line.back());
+                is_special_char = true;
+            }
+        }
+
+        bool has_more = multiline_input;
+        if (is_special_char) {
+            replace_last(' ');
+            pop_cursor();
+
+            char last = line.back();
+            line.pop_back();
+            if (last == '\\') {
+                line += '\n';
+                fputc('\n', out);
+                has_more = !has_more;
+            } else {
+                // llama will just eat the single space, it won't act as a space
+                if (line.length() == 1 && line.back() == ' ') {
+                    line.clear();
+                    pop_cursor();
+                }
+                has_more = false;
+            }
+        } else {
+            if (end_of_stream) {
+                has_more = false;
+            } else {
+                line += '\n';
+                fputc('\n', out);
+            }
+        }
+
+        fflush(out);
+        return has_more;
+    }
+
+    bool readline_simple(std::string & line, bool multiline_input) {
+#if defined(_WIN32)
+        std::wstring wline;
+        if (!std::getline(std::wcin, wline)) {
+            // Input stream is bad or EOF received
+            line.clear();
+            GenerateConsoleCtrlEvent(CTRL_C_EVENT, 0);
+            return false;
+        }
+
+        int size_needed = WideCharToMultiByte(CP_UTF8, 0, &wline[0], (int)wline.size(), NULL, 0, NULL, NULL);
+        line.resize(size_needed);
+        WideCharToMultiByte(CP_UTF8, 0, &wline[0], (int)wline.size(), &line[0], size_needed, NULL, NULL);
+#else
+        if (!std::getline(std::cin, line)) {
+            // Input stream is bad or EOF received
+            line.clear();
+            return false;
+        }
+#endif
+        if (!line.empty()) {
+            char last = line.back();
+            if (last == '/') { // Always return control on '/' symbol
+                line.pop_back();
+                return false;
+            }
+            if (last == '\\') { // '\\' changes the default action
+                line.pop_back();
+                multiline_input = !multiline_input;
+            }
+        }
+        line += '\n';
+
+        // By default, continue input if multiline_input is set
+        return multiline_input;
+    }
+
+    bool readline(std::string & line, bool multiline_input) {
+        set_display(user_input);
+
+        if (simple_io) {
+            return readline_simple(line, multiline_input);
+        }
+        return readline_advanced(line, multiline_input);
+    }
+
+}
--- a/common/console.h
+++ b/common/console.h
@@ -0,0 +1,19 @@
+// Console functions
+
+#pragma once
+
+#include <string>
+
+namespace console {
+    enum display_t {
+        reset = 0,
+        prompt,
+        user_input,
+        error
+    };
+
+    void init(bool use_simple_io, bool use_advanced_display);
+    void cleanup();
+    void set_display(display_t display);
+    bool readline(std::string & line, bool multiline_input);
+}
--- a/common/grammar-parser.cpp
+++ b/common/grammar-parser.cpp
@@ -0,0 +1,423 @@
+#include "grammar-parser.h"
+#include <cstdint>
+#include <cwchar>
+#include <string>
+#include <utility>
+#include <stdexcept>
+#include <exception>
+
+namespace grammar_parser {
+    // NOTE: assumes valid utf8 (but checks for overrun)
+    // copied from llama.cpp
+    std::pair<uint32_t, const char *> decode_utf8(const char * src) {
+        static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
+        uint8_t  first_byte = static_cast<uint8_t>(*src);
+        uint8_t  highbits   = first_byte >> 4;
+        int      len        = lookup[highbits];
+        uint8_t  mask       = (1 << (8 - len)) - 1;
+        uint32_t value      = first_byte & mask;
+        const char * end    = src + len; // may overrun!
+        const char * pos    = src + 1;
+        for ( ; pos < end && *pos; pos++) {
+            value = (value << 6) + (static_cast<uint8_t>(*pos) & 0x3F);
+        }
+        return std::make_pair(value, pos);
+    }
+
+    uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) {
+        uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
+        auto result = state.symbol_ids.insert(std::make_pair(std::string(src, len), next_id));
+        return result.first->second;
+    }
+
+    uint32_t generate_symbol_id(parse_state & state, const std::string & base_name) {
+        uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
+        state.symbol_ids[base_name + '_' + std::to_string(next_id)] = next_id;
+        return next_id;
+    }
+
+    void add_rule(
+            parse_state & state,
+            uint32_t      rule_id,
+            const std::vector<llama_grammar_element> & rule) {
+        if (state.rules.size() <= rule_id) {
+            state.rules.resize(rule_id + 1);
+        }
+        state.rules[rule_id] = rule;
+    }
+
+    bool is_word_char(char c) {
+        return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || ('0' <= c && c <= '9');
+    }
+
+    std::pair<uint32_t, const char *> parse_hex(const char * src, int size) {
+        const char * pos   = src;
+        const char * end   = src + size;
+        uint32_t     value = 0;
+        for ( ; pos < end && *pos; pos++) {
+            value <<= 4;
+            char c = *pos;
+            if ('a' <= c && c <= 'f') {
+                value += c - 'a' + 10;
+            } else if ('A' <= c && c <= 'F') {
+                value += c - 'A' + 10;
+            } else if ('0' <= c && c <= '9') {
+                value += c - '0';
+            } else {
+                break;
+            }
+        }
+        if (pos != end) {
+            throw std::runtime_error("expecting " + std::to_string(size) + " hex chars at " + src);
+        }
+        return std::make_pair(value, pos);
+    }
+
+    const char * parse_space(const char * src, bool newline_ok) {
+        const char * pos = src;
+        while (*pos == ' ' || *pos == '\t' || *pos == '#' ||
+                (newline_ok && (*pos == '\r' || *pos == '\n'))) {
+            if (*pos == '#') {
+                while (*pos && *pos != '\r' && *pos != '\n') {
+                    pos++;
+                }
+            } else {
+                pos++;
+            }
+        }
+        return pos;
+    }
+
+    const char * parse_name(const char * src) {
+        const char * pos = src;
+        while (is_word_char(*pos)) {
+            pos++;
+        }
+        if (pos == src) {
+            throw std::runtime_error(std::string("expecting name at ") + src);
+        }
+        return pos;
+    }
+
+    std::pair<uint32_t, const char *> parse_char(const char * src) {
+        if (*src == '\\') {
+            switch (src[1]) {
+                case 'x': return parse_hex(src + 2, 2);
+                case 'u': return parse_hex(src + 2, 4);
+                case 'U': return parse_hex(src + 2, 8);
+                case 't': return std::make_pair('\t', src + 2);
+                case 'r': return std::make_pair('\r', src + 2);
+                case 'n': return std::make_pair('\n', src + 2);
+                case '\\':
+                case '"':
+                case '[':
+                case ']':
+                    return std::make_pair(src[1], src + 2);
+                default:
+                    throw std::runtime_error(std::string("unknown escape at ") + src);
+            }
+        } else if (*src) {
+            return decode_utf8(src);
+        }
+        throw std::runtime_error("unexpected end of input");
+    }
+
+    const char * parse_alternates(
+            parse_state       & state,
+            const char        * src,
+            const std::string & rule_name,
+            uint32_t            rule_id,
+            bool                is_nested);
+
+    const char * parse_sequence(
+            parse_state                        & state,
+            const char                         * src,
+            const std::string                  & rule_name,
+            std::vector<llama_grammar_element> & out_elements,
+            bool                                 is_nested) {
+        size_t last_sym_start = out_elements.size();
+        const char * pos = src;
+        while (*pos) {
+            if (*pos == '"') { // literal string
+                pos++;
+                last_sym_start = out_elements.size();
+                while (*pos != '"') {
+                    auto char_pair = parse_char(pos);
+                         pos       = char_pair.second;
+                    out_elements.push_back({LLAMA_GRETYPE_CHAR, char_pair.first});
+                }
+                pos = parse_space(pos + 1, is_nested);
+            } else if (*pos == '[') { // char range(s)
+                pos++;
+                enum llama_gretype start_type = LLAMA_GRETYPE_CHAR;
+                if (*pos == '^') {
+                    pos++;
+                    start_type = LLAMA_GRETYPE_CHAR_NOT;
+                }
+                last_sym_start = out_elements.size();
+                while (*pos != ']') {
+                    auto char_pair = parse_char(pos);
+                         pos       = char_pair.second;
+                    enum llama_gretype type = last_sym_start < out_elements.size()
+                        ? LLAMA_GRETYPE_CHAR_ALT
+                        : start_type;
+
+                    out_elements.push_back({type, char_pair.first});
+                    if (pos[0] == '-' && pos[1] != ']') {
+                        auto endchar_pair = parse_char(pos + 1);
+                             pos          = endchar_pair.second;
+                        out_elements.push_back({LLAMA_GRETYPE_CHAR_RNG_UPPER, endchar_pair.first});
+                    }
+                }
+                pos = parse_space(pos + 1, is_nested);
+            } else if (is_word_char(*pos)) { // rule reference
+                const char * name_end    = parse_name(pos);
+                uint32_t     ref_rule_id = get_symbol_id(state, pos, name_end - pos);
+                pos = parse_space(name_end, is_nested);
+                last_sym_start = out_elements.size();
+                out_elements.push_back({LLAMA_GRETYPE_RULE_REF, ref_rule_id});
+            } else if (*pos == '(') { // grouping
+                // parse nested alternates into synthesized rule
+                pos = parse_space(pos + 1, true);
+                uint32_t sub_rule_id = generate_symbol_id(state, rule_name);
+                pos = parse_alternates(state, pos, rule_name, sub_rule_id, true);
+                last_sym_start = out_elements.size();
+                // output reference to synthesized rule
+                out_elements.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
+                if (*pos != ')') {
+                    throw std::runtime_error(std::string("expecting ')' at ") + pos);
+                }
+                pos = parse_space(pos + 1, is_nested);
+            } else if (*pos == '*' || *pos == '+' || *pos == '?') { // repetition operator
+                if (last_sym_start == out_elements.size()) {
+                    throw std::runtime_error(std::string("expecting preceeding item to */+/? at ") + pos);
+                }
+
+                // apply transformation to previous symbol (last_sym_start to end) according to
+                // rewrite rules:
+                // S* --> S' ::= S S' |
+                // S+ --> S' ::= S S' | S
+                // S? --> S' ::= S |
+                uint32_t sub_rule_id = generate_symbol_id(state, rule_name);
+                std::vector<llama_grammar_element> sub_rule;
+                // add preceding symbol to generated rule
+                sub_rule.insert(
+                    sub_rule.end(), out_elements.begin() + last_sym_start, out_elements.end());
+                if (*pos == '*' || *pos == '+') {
+                    // cause generated rule to recurse
+                    sub_rule.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
+                }
+                // mark start of alternate def
+                sub_rule.push_back({LLAMA_GRETYPE_ALT, 0});
+                if (*pos == '+') {
+                    // add preceding symbol as alternate only for '+' (otherwise empty)
+                    sub_rule.insert(
+                        sub_rule.end(), out_elements.begin() + last_sym_start, out_elements.end());
+                }
+                sub_rule.push_back({LLAMA_GRETYPE_END, 0});
+                add_rule(state, sub_rule_id, sub_rule);
+
+                // in original rule, replace previous symbol with reference to generated rule
+                out_elements.resize(last_sym_start);
+                out_elements.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
+
+                pos = parse_space(pos + 1, is_nested);
+            } else {
+                break;
+            }
+        }
+        return pos;
+    }
+
+    const char * parse_alternates(
+            parse_state       & state,
+            const char        * src,
+            const std::string & rule_name,
+            uint32_t            rule_id,
+            bool                is_nested) {
+        std::vector<llama_grammar_element> rule;
+        const char * pos = parse_sequence(state, src, rule_name, rule, is_nested);
+        while (*pos == '|') {
+            rule.push_back({LLAMA_GRETYPE_ALT, 0});
+            pos = parse_space(pos + 1, true);
+            pos = parse_sequence(state, pos, rule_name, rule, is_nested);
+        }
+        rule.push_back({LLAMA_GRETYPE_END, 0});
+        add_rule(state, rule_id, rule);
+        return pos;
+    }
+
+    const char * parse_rule(parse_state & state, const char * src) {
+        const char * name_end = parse_name(src);
+        const char * pos      = parse_space(name_end, false);
+        size_t       name_len = name_end - src;
+        uint32_t     rule_id  = get_symbol_id(state, src, name_len);
+        const std::string name(src, name_len);
+
+        if (!(pos[0] == ':' && pos[1] == ':' && pos[2] == '=')) {
+            throw std::runtime_error(std::string("expecting ::= at ") + pos);
+        }
+        pos = parse_space(pos + 3, true);
+
+        pos = parse_alternates(state, pos, name, rule_id, false);
+
+        if (*pos == '\r') {
+            pos += pos[1] == '\n' ? 2 : 1;
+        } else if (*pos == '\n') {
+            pos++;
+        } else if (*pos) {
+            throw std::runtime_error(std::string("expecting newline or end at ") + pos);
+        }
+        return parse_space(pos, true);
+    }
+
+    parse_state parse(const char * src) {
+        try {
+            parse_state state;
+            const char * pos = parse_space(src, true);
+            while (*pos) {
+                pos = parse_rule(state, pos);
+            }
+            return state;
+        } catch (const std::exception & err) {
+            fprintf(stderr, "%s: error parsing grammar: %s\n", __func__, err.what());
+            return parse_state();
+        }
+    }
+
+    void print_grammar_char(FILE * file, uint32_t c) {
+        if (0x20 <= c && c <= 0x7f) {
+            fprintf(file, "%c", static_cast<char>(c));
+        } else {
+            // cop out of encoding UTF-8
+            fprintf(file, "<U+%04X>", c);
+        }
+    }
+
+    bool is_char_element(llama_grammar_element elem) {
+        switch (elem.type) {
+            case LLAMA_GRETYPE_CHAR:           return true;
+            case LLAMA_GRETYPE_CHAR_NOT:       return true;
+            case LLAMA_GRETYPE_CHAR_ALT:       return true;
+            case LLAMA_GRETYPE_CHAR_RNG_UPPER: return true;
+            default:                           return false;
+        }
+    }
+
+    void print_rule_binary(FILE * file, const std::vector<llama_grammar_element> & rule) {
+        for (auto elem : rule) {
+            switch (elem.type) {
+                case LLAMA_GRETYPE_END:            fprintf(file, "END");            break;
+                case LLAMA_GRETYPE_ALT:            fprintf(file, "ALT");            break;
+                case LLAMA_GRETYPE_RULE_REF:       fprintf(file, "RULE_REF");       break;
+                case LLAMA_GRETYPE_CHAR:           fprintf(file, "CHAR");           break;
+                case LLAMA_GRETYPE_CHAR_NOT:       fprintf(file, "CHAR_NOT");       break;
+                case LLAMA_GRETYPE_CHAR_RNG_UPPER: fprintf(file, "CHAR_RNG_UPPER"); break;
+                case LLAMA_GRETYPE_CHAR_ALT:       fprintf(file, "CHAR_ALT");       break;
+            }
+            switch (elem.type) {
+                case LLAMA_GRETYPE_END:
+                case LLAMA_GRETYPE_ALT:
+                case LLAMA_GRETYPE_RULE_REF:
+                    fprintf(file, "(%u) ", elem.value);
+                    break;
+                case LLAMA_GRETYPE_CHAR:
+                case LLAMA_GRETYPE_CHAR_NOT:
+                case LLAMA_GRETYPE_CHAR_RNG_UPPER:
+                case LLAMA_GRETYPE_CHAR_ALT:
+                    fprintf(file, "(\"");
+                    print_grammar_char(file, elem.value);
+                    fprintf(file, "\") ");
+                    break;
+            }
+        }
+        fprintf(file, "\n");
+    }
+
+    void print_rule(
+            FILE     * file,
+            uint32_t   rule_id,
+            const std::vector<llama_grammar_element> & rule,
+            const std::map<uint32_t, std::string>    & symbol_id_names) {
+        if (rule.empty() || rule.back().type != LLAMA_GRETYPE_END) {
+            throw std::runtime_error(
+                "malformed rule, does not end with LLAMA_GRETYPE_END: " + std::to_string(rule_id));
+        }
+        fprintf(file, "%s ::= ", symbol_id_names.at(rule_id).c_str());
+        for (size_t i = 0, end = rule.size() - 1; i < end; i++) {
+            llama_grammar_element elem = rule[i];
+            switch (elem.type) {
+                case LLAMA_GRETYPE_END:
+                    throw std::runtime_error(
+                        "unexpected end of rule: " + std::to_string(rule_id) + "," +
+                        std::to_string(i));
+                case LLAMA_GRETYPE_ALT:
+                    fprintf(file, "| ");
+                    break;
+                case LLAMA_GRETYPE_RULE_REF:
+                    fprintf(file, "%s ", symbol_id_names.at(elem.value).c_str());
+                    break;
+                case LLAMA_GRETYPE_CHAR:
+                    fprintf(file, "[");
+                    print_grammar_char(file, elem.value);
+                    break;
+                case LLAMA_GRETYPE_CHAR_NOT:
+                    fprintf(file, "[^");
+                    print_grammar_char(file, elem.value);
+                    break;
+                case LLAMA_GRETYPE_CHAR_RNG_UPPER:
+                    if (i == 0 || !is_char_element(rule[i - 1])) {
+                        throw std::runtime_error(
+                            "LLAMA_GRETYPE_CHAR_RNG_UPPER without preceding char: " +
+                            std::to_string(rule_id) + "," + std::to_string(i));
+                    }
+                    fprintf(file, "-");
+                    print_grammar_char(file, elem.value);
+                    break;
+                case LLAMA_GRETYPE_CHAR_ALT:
+                    if (i == 0 || !is_char_element(rule[i - 1])) {
+                        throw std::runtime_error(
+                            "LLAMA_GRETYPE_CHAR_ALT without preceding char: " +
+                            std::to_string(rule_id) + "," + std::to_string(i));
+                    }
+                    print_grammar_char(file, elem.value);
+                    break;
+            }
+            if (is_char_element(elem)) {
+                switch (rule[i + 1].type) {
+                    case LLAMA_GRETYPE_CHAR_ALT:
+                    case LLAMA_GRETYPE_CHAR_RNG_UPPER:
+                        break;
+                    default:
+                        fprintf(file, "] ");
+                }
+            }
+        }
+        fprintf(file, "\n");
+    }
+
+    void print_grammar(FILE * file, const parse_state & state) {
+        try {
+            std::map<uint32_t, std::string> symbol_id_names;
+            for (auto kv : state.symbol_ids) {
+                symbol_id_names[kv.second] = kv.first;
+            }
+            for (size_t i = 0, end = state.rules.size(); i < end; i++) {
+                // fprintf(file, "%zu: ", i);
+                // print_rule_binary(file, state.rules[i]);
+                print_rule(file, uint32_t(i), state.rules[i], symbol_id_names);
+                // fprintf(file, "\n");
+            }
+        } catch (const std::exception & err) {
+            fprintf(stderr, "\n%s: error printing grammar: %s\n", __func__, err.what());
+        }
+    }
+
+    std::vector<const llama_grammar_element *> parse_state::c_rules() {
+        std::vector<const llama_grammar_element *> ret;
+        for (const auto & rule : rules) {
+            ret.push_back(rule.data());
+        }
+        return ret;
+    }
+}
--- a/common/grammar-parser.h
+++ b/common/grammar-parser.h
@@ -0,0 +1,29 @@
+// Implements a parser for an extended Backus-Naur form (BNF), producing the
+// binary context-free grammar format specified by llama.h. Supports character
+// ranges, grouping, and repetition operators. As an example, a grammar for
+// arithmetic might look like:
+//
+// root  ::= expr
+// expr  ::= term ([-+*/] term)*
+// term  ::= num | "(" space expr ")" space
+// num   ::= [0-9]+ space
+// space ::= [ \t\n]*
+
+#pragma once
+#include "llama.h"
+#include <vector>
+#include <map>
+#include <cstdint>
+#include <string>
+
+namespace grammar_parser {
+    struct parse_state {
+        std::map<std::string, uint32_t>                 symbol_ids;
+        std::vector<std::vector<llama_grammar_element>> rules;
+
+        std::vector<const llama_grammar_element *> c_rules();
+    };
+
+    parse_state parse(const char * src);
+    void print_grammar(FILE * file, const parse_state & state);
+}
--- a/convert-falcon-hf-to-gguf.py
+++ b/convert-falcon-hf-to-gguf.py
@@ -0,0 +1,282 @@
+# HF falcon--> gguf conversion
+
+import gguf
+import os
+import sys
+import struct
+import json
+import numpy as np
+import torch
+
+from typing import Any, List
+from pathlib import Path
+from transformers import AutoTokenizer
+
+def bytes_to_unicode():
+    # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
+    """
+    Returns list of utf-8 byte and a corresponding list of unicode strings.
+    The reversible bpe codes work on unicode strings.
+    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
+    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
+    This is a significant percentage of your normal, say, 32K bpe vocab.
+    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
+    And avoids mapping to whitespace/control characters the bpe code barfs on.
+    """
+    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
+    cs = bs[:]
+    n = 0
+    for b in range(2**8):
+        if b not in bs:
+            bs.append(b)
+            cs.append(2**8+n)
+            n += 1
+    cs = [chr(n) for n in cs]
+    return dict(zip(bs, cs))
+
+
+def count_model_parts(dir_model: str) -> int:
+    num_parts = 0
+    for filename in os.listdir(dir_model):
+        if filename.startswith("pytorch_model-"):
+            num_parts += 1
+
+    if num_parts > 0:
+        print("gguf: found " + str(num_parts) + " model parts")
+    return num_parts
+
+
+if len(sys.argv) < 3:
+    print("Usage: convert-h5-to-ggml.py dir-model ftype\n")
+    print("  ftype == 0 -> float32")
+    print("  ftype == 1 -> float16")
+    sys.exit(1)
+
+
+# output in the same directory as the model
+dir_model = sys.argv[1]
+last_dir = os.path.basename(os.path.normpath(dir_model))
+
+# possible tensor data types
+#   ftype == 0 -> float32
+#   ftype == 1 -> float16
+
+# map from ftype to string
+ftype_str = ["f32", "f16"]
+
+ftype = 1
+if len(sys.argv) > 2:
+    ftype = int(sys.argv[2])
+    if ftype < 0 or ftype > 1:
+        print("Invalid ftype: " + str(ftype))
+
+        sys.exit(1)
+
+fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".gguf"
+
+print("gguf: loading model "+last_dir)
+
+with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
+    hparams = json.load(f)
+
+if hparams["architectures"][0] != "RWForCausalLM":
+    print("Model architecture not supported: " + hparams["architectures"][0])
+
+    sys.exit()
+
+# get number of model parts
+num_parts = count_model_parts(dir_model)
+
+ARCH=gguf.MODEL_ARCH.FALCON
+gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
+
+print("gguf: get model metadata")
+
+block_count = hparams["n_layer"]
+
+gguf_writer.add_name(last_dir)
+gguf_writer.add_context_length(2048) # not in config.json
+gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform
+gguf_writer.add_embedding_length(hparams["hidden_size"])
+gguf_writer.add_feed_forward_length(4 * hparams["hidden_size"])
+gguf_writer.add_block_count(block_count)
+gguf_writer.add_head_count(hparams["n_head"])
+if "n_head_kv" in hparams: gguf_writer.add_head_count_kv(hparams["n_head_kv"])
+gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
+
+# TOKENIZATION
+
+print("gguf: get tokenizer metadata")
+
+tokens: List[str] = []
+merges: List[str] = []
+
+
+if Path(dir_model + "/tokenizer.json").is_file():
+    # gpt2 tokenizer
+    gguf_writer.add_tokenizer_model("gpt2")
+
+    print("gguf: get gpt2 tokenizer merges")
+
+    with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f:
+        tokenizer_json = json.load(f)
+    merges = tokenizer_json["model"]["merges"]
+
+    gguf_writer.add_token_merges(merges)
+
+    print("gguf: get gpt2 tokenizer vocab")
+
+    vocab_size = len(tokenizer_json["model"]["vocab"])
+
+    # ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
+    tokenizer = AutoTokenizer.from_pretrained(dir_model)
+
+    reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
+    byte_encoder = bytes_to_unicode()
+    byte_decoder = {v: k for k, v in byte_encoder.items()}
+
+    for i in range(vocab_size):
+        if i in reverse_vocab:
+            try:
+                text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
+            except KeyError:
+                text = bytearray()
+                for c in reverse_vocab[i]:
+                    if ord(c) < 256:  # single byte character
+                        text.append(byte_decoder[ord(c)])
+                    else:  # multibyte special token character
+                        text.extend(c.encode('utf-8'))
+        else:
+            print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
+            pad_token = f"[PAD{i}]".encode("utf8")
+            text = bytearray(pad_token)
+
+        tokens.append(text)
+
+    gguf_writer.add_token_list(tokens)
+
+    if "added_tokens" in tokenizer_json and Path(dir_model + "/tokenizer_config.json").is_file():
+        print("gguf: get special token ids")
+
+        with open(dir_model + "/tokenizer_config.json", "r", encoding="utf-8") as f:
+            tokenizer_config = json.load(f)
+
+        # find special token ids
+
+        if "bos_token" in tokenizer_config:
+            for key in tokenizer_json["added_tokens"]:
+                if key["content"] == tokenizer_config["bos_token"]:
+                    gguf_writer.add_bos_token_id(key["id"])
+
+        if "eos_token" in tokenizer_config:
+            for key in tokenizer_json["added_tokens"]:
+                if key["content"] == tokenizer_config["eos_token"]:
+                    gguf_writer.add_eos_token_id(key["id"])
+
+        if "unk_token" in tokenizer_config:
+            for key in tokenizer_json["added_tokens"]:
+                if key["content"] == tokenizer_config["unk_token"]:
+                    gguf_writer.add_unk_token_id(key["id"])
+
+        if "sep_token" in tokenizer_config:
+            for key in tokenizer_json["added_tokens"]:
+                if key["content"] == tokenizer_config["sep_token"]:
+                    gguf_writer.add_sep_token_id(key["id"])
+
+        if "pad_token" in tokenizer_config:
+            for key in tokenizer_json["added_tokens"]:
+                if key["content"] == tokenizer_config["pad_token"]:
+                    gguf_writer.add_pad_token_id(key["id"])
+
+
+# TENSORS
+
+tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
+
+# params for qkv transform
+n_head = hparams["n_head"]
+n_head_kv = hparams["n_head_kv"] if "n_head_kv" in hparams else 1
+head_dim = hparams["hidden_size"] // n_head
+
+# tensor info
+print("gguf: get tensor metadata")
+
+if num_parts == 0:
+    part_names = ("pytorch_model.bin",)
+else:
+    part_names = (
+        f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
+    )
+
+for part_name in part_names:
+    print("gguf: loading model part '" + part_name + "'")
+    model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
+
+    for name in model_part.keys():
+        data = model_part[name]
+
+        old_dtype = data.dtype
+
+        # convert any unsupported data types to float32
+        if data.dtype != torch.float16 and data.dtype != torch.float32:
+            data = data.to(torch.float32)
+
+        # QKV tensor transform
+        # The original query_key_value tensor contains n_head_kv "kv groups",
+        # each consisting of n_head/n_head_kv query weights followed by one key
+        # and one value weight (shared by all query heads in the kv group).
+        # This layout makes it a big pain to work with in GGML.
+        # So we rearrange them here,, so that we have n_head query weights
+        # followed by n_head_kv key weights followed by n_head_kv value weights,
+        # in contiguous fashion.
+        # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py
+
+        if "query_key_value" in name:
+            qkv = data.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head)
+            q = qkv[:, :-2 ].reshape(n_head * head_dim, head_dim * n_head)
+            k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head)
+            v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head)
+            data = torch.cat((q,k,v)).reshape_as(data)
+
+        data = data.squeeze().numpy()
+
+        # map tensor names
+        if name.endswith(".weight") and name[:-7] in tensor_map:
+            name = tensor_map[name[:-7]] + ".weight"
+        elif name.endswith(".bias") and name[:-5] in tensor_map:
+            name = tensor_map[name[:-5]] + ".bias"
+        else:
+            print("Can not map tensor '" + name + "'")
+            sys.exit()
+
+        n_dims = len(data.shape)
+        data_dtype = data.dtype
+
+        # if f32 desired, convert any float16 to float32
+        if ftype == 0 and data_dtype == np.float16:
+            data = data.astype(np.float32)
+
+        # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
+        if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+            data = data.astype(np.float32)
+
+        # if f16 desired, convert any float32 2-dim weight tensors to float16
+        if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+            data = data.astype(np.float16)
+
+        print(name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
+
+        gguf_writer.add_tensor(name, data)
+
+
+print("gguf: write header")
+gguf_writer.write_header_to_file()
+print("gguf: write metadata")
+gguf_writer.write_kv_data_to_file()
+print("gguf: write tensors")
+gguf_writer.write_tensors_to_file()
+
+gguf_writer.close()
+
+print("gguf: model successfully exported to '" + fname_out + "'")
+print("")
--- a/convert-gptneox-hf-to-gguf.py
+++ b/convert-gptneox-hf-to-gguf.py
@@ -0,0 +1,266 @@
+# HF gptneox--> gguf conversion
+
+import gguf
+import os
+import sys
+import struct
+import json
+import numpy as np
+import torch
+
+from typing import Any, List
+from pathlib import Path
+from transformers import AutoTokenizer
+
+# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
+
+
+def bytes_to_unicode():
+    """
+    Returns list of utf-8 byte and a corresponding list of unicode strings.
+    The reversible bpe codes work on unicode strings.
+    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
+    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
+    This is a significant percentage of your normal, say, 32K bpe vocab.
+    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
+    And avoids mapping to whitespace/control characters the bpe code barfs on.
+    """
+    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
+    cs = bs[:]
+    n = 0
+    for b in range(2**8):
+        if b not in bs:
+            bs.append(b)
+            cs.append(2**8+n)
+            n += 1
+    cs = [chr(n) for n in cs]
+    return dict(zip(bs, cs))
+
+
+def count_model_parts(dir_model: str) -> int:
+    num_parts = 0
+    for filename in os.listdir(dir_model):
+        if filename.startswith("pytorch_model-"):
+            num_parts += 1
+
+    if num_parts > 0:
+        print("gguf: found " + str(num_parts) + " model parts")
+    return num_parts
+
+
+if len(sys.argv) < 3:
+    print("Usage: convert-h5-to-ggml.py dir-model ftype\n")
+    print("  ftype == 0 -> float32")
+    print("  ftype == 1 -> float16")
+    sys.exit(1)
+
+
+# output in the same directory as the model
+dir_model = sys.argv[1]
+last_dir = os.path.basename(os.path.normpath(dir_model))
+
+# possible tensor data types
+#   ftype == 0 -> float32
+#   ftype == 1 -> float16
+
+# map from ftype to string
+ftype_str = ["f32", "f16"]
+
+ftype = 1
+if len(sys.argv) > 2:
+    ftype = int(sys.argv[2])
+    if ftype < 0 or ftype > 1:
+        print("Invalid ftype: " + str(ftype))
+
+        sys.exit(1)
+
+fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".gguf"
+
+print("gguf: loading model "+last_dir)
+
+with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
+    hparams = json.load(f)
+
+if hparams["architectures"][0] != "GPTNeoXForCausalLM":
+    print("Model architecture not supported: " + hparams["architectures"][0])
+
+    sys.exit()
+
+# get number of model parts
+num_parts = count_model_parts(dir_model)
+
+ARCH=gguf.MODEL_ARCH.GPTNEOX
+gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
+
+print("gguf: get model metadata")
+
+block_count = hparams["num_hidden_layers"]
+
+gguf_writer.add_name(last_dir)
+gguf_writer.add_context_length(hparams["max_position_embeddings"])
+gguf_writer.add_embedding_length(hparams["hidden_size"])
+gguf_writer.add_block_count(block_count)
+gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
+gguf_writer.add_rope_dimension_count(int(hparams["rotary_pct"]*(hparams["hidden_size"]//hparams["num_attention_heads"])))
+gguf_writer.add_head_count(hparams["num_attention_heads"])
+gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
+gguf_writer.add_layer_norm_eps(hparams["layer_norm_eps"])
+
+# TOKENIZATION
+
+print("gguf: get tokenizer metadata")
+
+tokens: List[str] = []
+merges: List[str] = []
+
+
+if Path(dir_model + "/tokenizer.json").is_file():
+    # gpt2 tokenizer
+    gguf_writer.add_tokenizer_model("gpt2")
+
+    print("gguf: get gpt2 tokenizer merges")
+
+    with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f:
+        tokenizer_json = json.load(f)
+    merges = tokenizer_json["model"]["merges"]
+
+    gguf_writer.add_token_merges(merges)
+
+    print("gguf: get gpt2 tokenizer vocab")
+
+    vocab_size = len(tokenizer_json["model"]["vocab"])
+
+    # ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
+    tokenizer = AutoTokenizer.from_pretrained(dir_model)
+
+    reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
+    byte_encoder = bytes_to_unicode()
+    byte_decoder = {v: k for k, v in byte_encoder.items()}
+
+    for i in range(vocab_size):
+        if i in reverse_vocab:
+            try:
+                text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
+            except KeyError:
+                text = bytearray()
+                for c in reverse_vocab[i]:
+                    if ord(c) < 256:  # single byte character
+                        text.append(byte_decoder[ord(c)])
+                    else:  # multibyte special token character
+                        text.extend(c.encode('utf-8'))
+        else:
+            print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
+            pad_token = f"[PAD{i}]".encode("utf8")
+            text = bytearray(pad_token)
+
+        tokens.append(text)
+
+    gguf_writer.add_token_list(tokens)
+
+    if "added_tokens" in tokenizer_json and Path(dir_model + "/tokenizer_config.json").is_file():
+        print("gguf: get special token ids")
+
+        with open(dir_model + "/tokenizer_config.json", "r", encoding="utf-8") as f:
+            tokenizer_config = json.load(f)
+
+        # find special token ids
+
+        if "bos_token" in tokenizer_config:
+            for key in tokenizer_json["added_tokens"]:
+                if key["content"] == tokenizer_config["bos_token"]:
+                    gguf_writer.add_bos_token_id(key["id"])
+
+        if "eos_token" in tokenizer_config:
+            for key in tokenizer_json["added_tokens"]:
+                if key["content"] == tokenizer_config["eos_token"]:
+                    gguf_writer.add_eos_token_id(key["id"])
+
+        if "unk_token" in tokenizer_config:
+            for key in tokenizer_json["added_tokens"]:
+                if key["content"] == tokenizer_config["unk_token"]:
+                    gguf_writer.add_unk_token_id(key["id"])
+
+        if "sep_token" in tokenizer_config:
+            for key in tokenizer_json["added_tokens"]:
+                if key["content"] == tokenizer_config["sep_token"]:
+                    gguf_writer.add_sep_token_id(key["id"])
+
+        if "pad_token" in tokenizer_config:
+            for key in tokenizer_json["added_tokens"]:
+                if key["content"] == tokenizer_config["pad_token"]:
+                    gguf_writer.add_pad_token_id(key["id"])
+
+
+# TENSORS
+
+tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
+
+# tensor info
+print("gguf: get tensor metadata")
+
+if num_parts == 0:
+    part_names = ("pytorch_model.bin",)
+else:
+    part_names = (
+        f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
+    )
+
+for part_name in part_names:
+    print("gguf: loading model part '" + part_name + "'")
+    model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
+
+    for name in model_part.keys():
+        data = model_part[name]
+
+        # we don't need these
+        if name.endswith(".attention.masked_bias") or name.endswith(".attention.bias") or name.endswith(".attention.rotary_emb.inv_freq"):
+            continue
+
+        old_dtype = data.dtype
+
+        # convert any unsupported data types to float32
+        if data.dtype != torch.float16 and data.dtype != torch.float32:
+            data = data.to(torch.float32)
+
+        data = data.squeeze().numpy()
+
+        # map tensor names
+        if name.endswith(".weight") and name[:-7] in tensor_map:
+            name = tensor_map[name[:-7]] + ".weight"
+        elif name.endswith(".bias") and name[:-5] in tensor_map:
+            name = tensor_map[name[:-5]] + ".bias"
+        else:
+            print("Can not map tensor '" + name + "'")
+            sys.exit()
+
+        n_dims = len(data.shape)
+        data_dtype = data.dtype
+
+        # if f32 desired, convert any float16 to float32
+        if ftype == 0 and data_dtype == np.float16:
+            data = data.astype(np.float32)
+
+        # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
+        if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+            data = data.astype(np.float32)
+
+        # if f16 desired, convert any float32 2-dim weight tensors to float16
+        if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+            data = data.astype(np.float16)
+
+        print(name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
+
+        gguf_writer.add_tensor(name, data)
+
+
+print("gguf: write header")
+gguf_writer.write_header_to_file()
+print("gguf: write metadata")
+gguf_writer.write_kv_data_to_file()
+print("gguf: write tensors")
+gguf_writer.write_tensors_to_file()
+
+gguf_writer.close()
+
+print("gguf: model successfully exported to '" + fname_out + "'")
+print("")
--- a/convert-llama-7b-pth-to-gguf.py
+++ b/convert-llama-7b-pth-to-gguf.py
@@ -0,0 +1,307 @@
+# 7b pth llama --> gguf conversion
+# Only models with a single datafile are supported, like 7B
+# HF files required in the model dir: config.json tokenizer_config.json tokenizer.json tokenizer.model
+
+import gguf
+import os
+import sys
+import struct
+import json
+import numpy as np
+import torch
+
+from typing import Any, List
+from pathlib import Path
+from sentencepiece import SentencePieceProcessor
+
+#NDArray = np.ndarray[Any, Any]
+# compatible with python < 3.9
+NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]'
+
+
+def count_model_parts(dir_model: str) -> int:
+    num_parts = 0
+    for filename in os.listdir(dir_model):
+        if filename.startswith("consolidated."):
+            num_parts += 1
+
+    if num_parts > 0:
+        print("gguf: found " + str(num_parts) + " model parts")
+    return num_parts
+
+
+if len(sys.argv) < 3:
+    print("Usage: convert-h5-to-ggml.py dir-model ftype\n")
+    print("  ftype == 0 -> float32")
+    print("  ftype == 1 -> float16")
+
+    sys.exit(1)
+
+
+# output in the same directory as the model
+dir_model = sys.argv[1]
+last_dir = os.path.basename(os.path.normpath(dir_model))
+
+
+# possible tensor data types
+#   ftype == 0 -> float32
+#   ftype == 1 -> float16
+
+# map from ftype to string
+ftype_str = ["f32", "f16"]
+
+ftype = 1
+if len(sys.argv) > 2:
+    ftype = int(sys.argv[2])
+    if ftype < 0 or ftype > 1:
+        print("Invalid ftype: " + str(ftype))
+
+        sys.exit(1)
+
+fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".gguf"
+
+print("gguf: loading model "+last_dir)
+
+with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
+    hparams = json.load(f)
+
+if hparams["architectures"][0] != "LlamaForCausalLM":
+    print("Model architecture not supported: " + hparams["architectures"][0])
+    sys.exit()
+
+# get number of model parts
+num_parts = count_model_parts(dir_model)
+
+if num_parts > 1:
+    print("gguf: Only models with a single datafile are supported.")
+
+    sys.exit()
+
+ARCH=gguf.MODEL_ARCH.LLAMA
+gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
+
+
+print("gguf: get model metadata")
+
+block_count = hparams["num_hidden_layers"]
+head_count = hparams["num_attention_heads"]
+
+if "num_key_value_heads" in hparams:
+    head_count_kv = hparams["num_key_value_heads"]
+else:
+    head_count_kv = head_count
+
+if "_name_or_path" in hparams:
+    hf_repo = hparams["_name_or_path"]
+else:
+    hf_repo = ""
+
+if "max_sequence_length" in hparams:
+    ctx_length = hparams["max_sequence_length"]
+elif "max_position_embeddings" in hparams:
+    ctx_length = hparams["max_position_embeddings"]
+else:
+    print("gguf: can not find ctx length parameter.")
+
+    sys.exit()
+
+
+gguf_writer.add_name(last_dir)
+gguf_writer.add_source_hf_repo(hf_repo)
+gguf_writer.add_tensor_data_layout("Meta AI original pth")
+gguf_writer.add_context_length(ctx_length)
+gguf_writer.add_embedding_length(hparams["hidden_size"])
+gguf_writer.add_block_count(block_count)
+gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
+gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
+gguf_writer.add_head_count(head_count)
+gguf_writer.add_head_count_kv(head_count_kv)
+gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
+
+if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]:
+    if "type" in hparams["rope_scaling"]:
+        if hparams["rope_scaling"]["type"] == "linear":
+            gguf_writer.add_rope_scale_linear(hparams["rope_scaling"]["factor"])
+
+
+# TOKENIZATION
+
+print("gguf: get tokenizer metadata")
+
+tokens: List[bytes] = []
+scores: List[float] = []
+toktypes: List[int] = []
+
+if Path(dir_model + "/tokenizer.model").is_file():
+    # vocab type sentencepiece
+    print("gguf: get sentencepiece tokenizer vocab and scores")
+
+    tokenizer = SentencePieceProcessor(dir_model + "/tokenizer.model")
+
+    for i in range(tokenizer.vocab_size()):
+        text: bytes
+        score: float
+
+        piece = tokenizer.id_to_piece(i)
+        text = piece.encode("utf-8")
+        score = tokenizer.get_score(i)
+
+        toktype = 1  # defualt to normal token type
+        if tokenizer.is_unknown(i):
+            toktype = 2
+        if tokenizer.is_control(i):
+            toktype = 3
+
+        # toktype = 4 is user-defined = tokens from added_tokens.json
+
+        if tokenizer.is_unused(i):
+            toktype = 5
+        if tokenizer.is_byte(i):
+            toktype = 6
+
+        tokens.append(text)
+        scores.append(score)
+        toktypes.append(toktype)
+
+    if Path(dir_model + "/added_tokens.json").is_file():
+        with open(dir_model + "/added_tokens.json", "r", encoding="utf-8") as f:
+            addtokens_json = json.load(f)
+
+            print("gguf: get added tokens")
+
+            for key in addtokens_json:
+                tokens.append( key.encode("utf-8") )
+                scores.append(-1000.0)
+                toktypes.append(4) # user-defined token type
+
+    gguf_writer.add_tokenizer_model("llama")
+    gguf_writer.add_token_list(tokens)
+    gguf_writer.add_token_scores(scores)
+    gguf_writer.add_token_types(toktypes)
+
+
+print("gguf: get special token ids")
+
+if Path(dir_model + "/tokenizer.json").is_file():
+    # Look for special tokens in tokenizer.json if it exists
+
+    with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f:
+        tokenizer = json.load(f)
+
+    if "added_tokens" in tokenizer and Path(dir_model + "/tokenizer_config.json").is_file():
+
+        with open(dir_model + "/tokenizer_config.json", "r", encoding="utf-8") as f:
+            tokenizer_config = json.load(f)
+
+        if "bos_token" in tokenizer_config and tokenizer_config["bos_token"] != None:
+            for key in tokenizer["added_tokens"]:
+                if key["content"] == tokenizer_config["bos_token"]["content"]:
+                    gguf_writer.add_bos_token_id(key["id"])
+
+        if "eos_token" in tokenizer_config and tokenizer_config["eos_token"] != None:
+            for key in tokenizer["added_tokens"]:
+                if key["content"] == tokenizer_config["eos_token"]["content"]:
+                    gguf_writer.add_eos_token_id(key["id"])
+
+        if "unk_token" in tokenizer_config and tokenizer_config["unk_token"] != None:
+            for key in tokenizer["added_tokens"]:
+                if key["content"] == tokenizer_config["unk_token"]["content"]:
+                    gguf_writer.add_unk_token_id(key["id"])
+
+        if "sep_token" in tokenizer_config and tokenizer_config["sep_token"] != None:
+            for key in tokenizer["added_tokens"]:
+                if key["content"] == tokenizer_config["sep_token"]["content"]:
+                    gguf_writer.add_sep_token_id(key["id"])
+
+        if "pad_token" in tokenizer_config and tokenizer_config["pad_token"] != None:
+            for key in tokenizer["added_tokens"]:
+                if key["content"] == tokenizer_config["pad_token"]["content"]:
+                    gguf_writer.add_pad_token_id(key["id"])
+else:
+    # If no tokenizer.json: Look for special tokens in config.json
+
+    if "bos_token_id" in hparams and hparams["bos_token_id"] != None:
+        gguf_writer.add_bos_token_id(hparams["bos_token_id"])
+
+    if "eos_token_id" in hparams and hparams["eos_token_id"] != None:
+        gguf_writer.add_eos_token_id(hparams["eos_token_id"])
+
+    if "unk_token_id" in hparams and hparams["unk_token_id"] != None:
+        gguf_writer.add_unk_token_id(hparams["unk_token_id"])
+
+    if "sep_token_id" in hparams and hparams["sep_token_id"] != None:
+        gguf_writer.add_sep_token_id(hparams["sep_token_id"])
+
+    if "pad_token_id" in hparams and hparams["pad_token_id"] != None:
+        gguf_writer.add_pad_token_id(hparams["pad_token_id"])
+
+
+# TENSORS
+
+tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
+
+# tensor info
+print("gguf: get tensor metadata")
+
+part_names = (f"consolidated.{n:02}.pth" for n in range(0, num_parts))
+
+for part_name in part_names:
+    print("gguf: loading model part '" + part_name + "'")
+    model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
+
+    for name in model_part.keys():
+        data = model_part[name]
+
+        # we don't need these
+        if name == "rope.freqs":
+            continue
+
+        old_dtype = data.dtype
+
+        # convert any unsupported data types to float32
+        if data.dtype != torch.float16 and data.dtype != torch.float32:
+            data = data.to(torch.float32)
+
+        data = data.squeeze().numpy()
+
+        # map tensor names
+        if name.endswith(".weight") and name[:-7] in tensor_map:
+            name = tensor_map[name[:-7]] + ".weight"
+        elif name.endswith(".bias") and name[:-5] in tensor_map:
+            name = tensor_map[name[:-5]] + ".bias"
+        else:
+            print("Can not map tensor '" + name + "'")
+            sys.exit()
+
+        n_dims = len(data.shape)
+        data_dtype = data.dtype
+
+        # if f32 desired, convert any float16 to float32
+        if ftype == 0 and data_dtype == np.float16:
+            data = data.astype(np.float32)
+
+        # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
+        if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+            data = data.astype(np.float32)
+
+        # if f16 desired, convert any float32 2-dim weight tensors to float16
+        if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+            data = data.astype(np.float16)
+
+        print(name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
+
+        gguf_writer.add_tensor(name, data)
+
+
+print("gguf: write header")
+gguf_writer.write_header_to_file()
+print("gguf: write metadata")
+gguf_writer.write_kv_data_to_file()
+print("gguf: write tensors")
+gguf_writer.write_tensors_to_file()
+
+gguf_writer.close()
+
+
+print("gguf: model successfully exported to '" + fname_out + "'")
+print("")
--- a/convert-llama-ggmlv3-to-gguf.py
+++ b/convert-llama-ggmlv3-to-gguf.py
@@ -0,0 +1,350 @@
+import sys, struct, math, argparse, warnings
+from pathlib import Path
+
+import numpy as np
+
+import gguf
+
+warnings.filterwarnings('error')
+
+# Note: Does not support GGML_QKK_64
+QK_K = 256
+# Items here are (block size, type size)
+GGML_QUANT_SIZES = {
+    gguf.GGMLQuantizationType.F32  : (1, 4),
+    gguf.GGMLQuantizationType.F16  : (1, 2),
+    gguf.GGMLQuantizationType.Q4_0 : (32, 2 + 16),
+    gguf.GGMLQuantizationType.Q4_1 : (32, 2 + 2 + 16),
+    gguf.GGMLQuantizationType.Q5_0 : (32, 2 + 4 + 16),
+    gguf.GGMLQuantizationType.Q5_1 : (32, 2 + 2 + 4 + 16),
+    gguf.GGMLQuantizationType.Q8_0 : (32, 2 + 32),
+    gguf.GGMLQuantizationType.Q8_1 : (32, 4 + 4 + 32),
+    gguf.GGMLQuantizationType.Q2_K : (256, 2 + 2 + QK_K // 16 + QK_K // 4),
+    gguf.GGMLQuantizationType.Q3_K : (256, 2 + QK_K // 4 + QK_K // 8 + 12),
+    gguf.GGMLQuantizationType.Q4_K : (256, 2 + 2 + QK_K // 2 + 12),
+    gguf.GGMLQuantizationType.Q5_K : (256, 2 + 2 + QK_K // 2 + QK_K // 8 + 12),
+    gguf.GGMLQuantizationType.Q6_K : (256, 2 + QK_K // 2 + QK_K // 4 + QK_K // 16),
+    gguf.GGMLQuantizationType.Q8_K : (256, 4 + QK_K + QK_K // 8),
+}
+
+class Hyperparameters:
+    def __init__(self):
+        self.n_vocab = self.n_embd = self.n_mult = self.n_head = self.n_layer = self.n_rot = self.ftype = 0
+        self.n_ff = 0
+
+    def set_n_ff(self, model):
+        ff_tensor_idx = model.tensor_map.get(b'layers.0.feed_forward.w1.weight')
+        assert ff_tensor_idx is not None, 'Missing layer 0 FF tensor'
+        ff_tensor = model.tensors[ff_tensor_idx]
+        self.n_ff = ff_tensor.dims[1]
+
+    def load(self, data, offset):
+        (
+            self.n_vocab,
+            self.n_embd,
+            self.n_mult,
+            self.n_head,
+            self.n_layer,
+            self.n_rot,
+            self.ftype,
+        ) = struct.unpack('<7I', data[offset:offset + (4 * 7)])
+        return 4 * 7
+
+    def __str__(self):
+        return f'<Hyperparameters: n_vocab={self.n_vocab}, n_embd={self.n_embd}, n_mult={self.n_mult}, n_head={self.n_head}, n_layer={self.n_layer}, n_rot={self.n_rot}, n_ff={self.n_ff}, ftype={self.ftype}>'
+
+class Vocab:
+    def __init__(self):
+        self.items = []
+
+    def load(self, data, offset, n_vocab):
+        orig_offset = offset
+        for _ in range(n_vocab):
+            itemlen = struct.unpack('<I', data[offset:offset + 4])[0]
+            assert itemlen < 4096, 'Absurd vocab item length'
+            offset += 4
+            vocab = bytes(data[offset:offset + itemlen])
+            offset += itemlen
+            score = struct.unpack('<f', data[offset:offset + 4])[0]
+            offset += 4
+            self.items.append((vocab, score))
+        return offset - orig_offset
+
+class Tensor:
+    def __init__(self):
+        self.name = None
+        self.dims = ()
+        self.dtype = None
+        self.start_offset = 0
+        self.len_bytes = 0
+
+    def load(self, data, offset):
+        orig_offset = offset
+        (n_dims, name_len, dtype) = struct.unpack('<3I', data[offset:offset + 12])
+        assert n_dims >= 0 and n_dims <= 4, f'Invalid tensor dimensions {n_dims}'
+        assert name_len < 4096, 'Absurd tensor name length'
+        quant = GGML_QUANT_SIZES.get(dtype)
+        assert quant is not None, 'Unknown tensor type'
+        (blksize, tysize) = quant
+        offset += 12
+        self.dtype= dtype
+        self.dims = struct.unpack(f'<{n_dims}I', data[offset:offset + (4 * n_dims)])
+        offset += 4 * n_dims
+        self.name = bytes(data[offset:offset + name_len])
+        offset += name_len
+        pad = ((offset + 31) & ~31) - offset
+        offset += pad
+        n_elems = np.prod(self.dims)
+        n_bytes = (n_elems * tysize) // blksize
+        self.start_offset = offset
+        self.len_bytes = n_bytes
+        offset += n_bytes
+        # print(n_dims, name_len, dtype, self.dims, self.name, pad)
+        return offset - orig_offset
+
+class GGMLV3Model:
+    def __init__(self):
+        self.hyperparameters = None
+        self.vocab = None
+        self.tensor_map = {}
+        self.tensors = []
+
+    def validate_header(self, data, offset):
+        if bytes(data[offset:offset + 4]) != b'tjgg' or struct.unpack('<I', data[offset + 4:offset + 8])[0] != 3:
+            raise ValueError('Only GGJTv3 supported')
+        return 8
+
+    def load(self, data, offset):
+        offset += self.validate_header(data, offset)
+        hp = Hyperparameters()
+        offset += hp.load(data, offset)
+        vocab = Vocab()
+        offset += vocab.load(data, offset, hp.n_vocab)
+        tensors = []
+        tensor_map = {}
+        while offset < len(data):
+            tensor = Tensor()
+            offset += tensor.load(data, offset)
+            tensor_map[tensor.name] = len(tensors)
+            tensors.append(tensor)
+        self.hyperparameters = hp
+        self.vocab = vocab
+        self.tensors = tensors
+        self.tensor_map = tensor_map
+        hp.set_n_ff(self)
+        return offset
+
+class GGMLToGGUF:
+    def __init__(self, ggml_model, data, cfg, params_override = None, vocab_override = None):
+        hp = ggml_model.hyperparameters
+        self.model = ggml_model
+        self.data = data
+        self.cfg = cfg
+        self.params_override = params_override
+        self.vocab_override = vocab_override
+        if params_override is not None:
+            n_kv_head = params_override.n_head_kv
+        else:
+            if cfg.gqa == 1:
+                n_kv_head = hp.n_head
+            else:
+                gqa = float(cfg.gqa)
+                n_kv_head = None
+                for x in range(1, 256):
+                    if float(hp.n_head) / float(x) == gqa:
+                        n_kv_head = x
+                assert n_kv_head is not None, "Couldn't determine n_kv_head from GQA param"
+                print(f'- Guessed n_kv_head = {n_kv_head} based on GQA {cfg.gqa}')
+        self.n_kv_head = n_kv_head
+        self.name_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.LLAMA, ggml_model.hyperparameters.n_layer)
+
+    def save(self):
+        print('* Preparing to save GGUF file')
+        gguf_writer = gguf.GGUFWriter(self.cfg.output, gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], use_temp_file = False)
+        self.add_params(gguf_writer)
+        self.add_vocab(gguf_writer)
+        self.add_tensors(gguf_writer)
+        print("    gguf: write header")
+        gguf_writer.write_header_to_file()
+        print("    gguf: write metadata")
+        gguf_writer.write_kv_data_to_file()
+        print("    gguf: write tensors")
+        gguf_writer.write_tensors_to_file()
+        gguf_writer.close()
+
+    def add_params(self, gguf_writer):
+        hp = self.model.hyperparameters
+        cfg = self.cfg
+        desc = cfg.desc if cfg.desc is not None else 'converted from legacy GGJTv3 format'
+        try:
+            # Filenames aren't necessarily valid UTF8.
+            name = cfg.name if cfg.name is not None else cfg.input.name
+        except UnicodeDecodeError:
+            name = None
+        print('* Adding model parameters and KV items')
+        if name is not None:
+            gguf_writer.add_name(name)
+        gguf_writer.add_description(desc)
+        if self.params_override is not None:
+            po = self.params_override
+            assert po.n_embd == hp.n_embd, 'Model hyperparams mismatch'
+            assert po.n_layer == hp.n_layer, 'Model hyperparams mismatch'
+            assert po.n_head == hp.n_head, 'Model hyperparams mismatch'
+            gguf_writer.add_context_length      (po.n_ctx)
+            gguf_writer.add_embedding_length    (po.n_embd)
+            gguf_writer.add_block_count         (po.n_layer)
+            gguf_writer.add_feed_forward_length (po.n_ff)
+            gguf_writer.add_rope_dimension_count(po.n_embd // po.n_head)
+            gguf_writer.add_head_count          (po.n_head)
+            gguf_writer.add_head_count_kv       (po.n_head_kv)
+            gguf_writer.add_layer_norm_rms_eps  (po.f_norm_eps)
+            return
+        gguf_writer.add_context_length(cfg.context_length)
+        gguf_writer.add_embedding_length(hp.n_embd)
+        gguf_writer.add_block_count(hp.n_layer)
+        gguf_writer.add_feed_forward_length(hp.n_ff)
+        gguf_writer.add_rope_dimension_count(hp.n_embd // hp.n_head)
+        gguf_writer.add_head_count(hp.n_head)
+        gguf_writer.add_head_count_kv(self.n_kv_head)
+        gguf_writer.add_layer_norm_rms_eps(float(cfg.eps))
+
+    def add_vocab(self, gguf_writer):
+        hp = self.model.hyperparameters
+        gguf_writer.add_tokenizer_model('llama')
+        tokens = []
+        scores = []
+        toktypes = []
+        if self.vocab_override is not None:
+            vo = self.vocab_override
+            print('* Adding vocab item(s)')
+            for (idx, (vbytes, score, ttype)) in enumerate(vo.all_tokens()):
+                tokens.append(vbytes)
+                scores.append(score)
+                toktypes.append(ttype)
+            assert len(tokens) == hp.n_vocab, f'Override vocab has a different number of items than hyperparameters - override = {len(tokens)} but n_vocab={hp.n_vocab}'
+            gguf_writer.add_token_list(tokens)
+            gguf_writer.add_token_scores(scores)
+            if len(toktypes) > 0:
+                gguf_writer.add_token_types(toktypes)
+            return
+        print(f'* Adding {hp.n_vocab} vocab item(s)')
+        assert len(self.model.vocab.items) >= 3, 'Cannot handle unexpectedly short model vocab'
+        for (tokid, (vbytes, vscore)) in enumerate(self.model.vocab.items):
+            tt = 1 # Normal
+            # Special handling for UNK, BOS, EOS tokens.
+            if tokid <= 2:
+                if tokid == 0:
+                    vbytes = b'<unk>'
+                    tt = 2
+                elif tokid == 1:
+                    vbytes = b'<s>'
+                    tt = 3
+                else:
+                    vbytes = b'</s>'
+                    tt = 3
+            elif len(vbytes) == 0:
+                tt = 3 # Control
+            elif tokid >= 3 and tokid <= 258 and len(vbytes) == 1:
+                vbytes = bytes(f'<0x{vbytes[0]:02X}>', encoding = 'UTF-8')
+                tt = 6 # Byte
+            else:
+                vbytes = vbytes.replace(b' ', b'\xe2\x96\x81')
+            toktypes.append(tt)
+            tokens.append(vbytes)
+            scores.append(vscore)
+        gguf_writer.add_token_list(tokens)
+        gguf_writer.add_token_scores(scores)
+        gguf_writer.add_token_types(toktypes)
+        gguf_writer.add_unk_token_id(0)
+        gguf_writer.add_bos_token_id(1)
+        gguf_writer.add_eos_token_id(2)
+
+    def add_tensors(self, gguf_writer):
+        nm = self.name_map
+        data = self.data
+        print(f'* Adding {len(self.model.tensors)} tensor(s)')
+        for tensor in self.model.tensors:
+            name = str(tensor.name, 'UTF-8')
+            if name.endswith('.weight'):
+                name = name[:-7]
+                suffix = '.weight'
+            elif name.endswith('.bias'):
+                name = name[:-5]
+                suffix = '.bias'
+            mapped_name = nm.get(name)
+            assert mapped_name is not None, f'Bad name {name}'
+            mapped_name += suffix
+            tempdims = list(tensor.dims[:])
+            if len(tempdims) > 1:
+                temp = tempdims[1]
+                tempdims[1] = tempdims[0]
+                tempdims[0] = temp
+            # print(f'+ {tensor.name} | {mapped_name} {tensor.dims} :: {tempdims}')
+            gguf_writer.add_tensor(mapped_name, data[tensor.start_offset:tensor.start_offset + tensor.len_bytes], raw_shape = tempdims, raw_dtype = tensor.dtype)
+
+def handle_metadata(cfg, hp):
+    import convert
+    assert cfg.model_metadata_dir.is_dir(), 'Metadata dir is not a directory'
+    hf_config_path   = cfg.model_metadata_dir / "config.json"
+    orig_config_path = cfg.model_metadata_dir / "params.json"
+    # We pass a fake model here. "original" mode will check the shapes of some
+    # tensors if information is missing in the .json file: other than that, the
+    # model data isn't used so this should be safe (at least for now).
+    fakemodel = {
+        'tok_embeddings.weight': convert.LazyTensor.__new__(convert.LazyTensor),
+        'layers.0.feed_forward.w1.weight': convert.LazyTensor.__new__(convert.LazyTensor),
+    }
+    fakemodel['tok_embeddings.weight'].shape = [hp.n_vocab]
+    fakemodel['layers.0.feed_forward.w1.weight'].shape = [hp.n_ff]
+    if hf_config_path.exists():
+        params = convert.Params.loadHFTransformerJson(fakemodel, hf_config_path)
+    elif orig_config_path.exists():
+        params = convert.Params.loadOriginalParamsJson(fakemodel, orig_config_path)
+    else:
+        raise ValueError('Unable to load metadata')
+    vocab = convert.load_vocab(cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir, cfg.vocabtype)
+    convert.check_vocab_size(params, vocab)
+    return (params, vocab)
+
+def handle_args():
+    parser = argparse.ArgumentParser(description = 'Convert GGMLv3 models to GGUF')
+    parser.add_argument('--input', '-i', type = Path, help = 'Input GGMLv3 filename')
+    parser.add_argument('--output', '-o', type = Path, help ='Output GGUF filename')
+    parser.add_argument('--name', help = 'Set model name')
+    parser.add_argument('--desc', help = 'Set model description')
+    parser.add_argument('--gqa', type = int, default = 1, help = 'grouped-query attention factor (use 8 for LLaMA2 70B)')
+    parser.add_argument('--eps', default = '5.0e-06', help = 'RMS norm eps: Use 1e-6 for LLaMA1 and OpenLLaMA, use 1e-5 for LLaMA2')
+    parser.add_argument('--context-length', '-c', type=int, default = 2048, help = 'Default max context length: LLaMA1 is typically 2048, LLaMA2 is typically 4096')
+    parser.add_argument('--model-metadata-dir', '-m', type = Path, help ='Load HuggingFace/.pth vocab and metadata from the specified directory')
+    parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir")
+    parser.add_argument("--vocabtype", choices=["spm", "bpe"], help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)", default="spm")
+    return parser.parse_args()
+
+def main():
+    cfg = handle_args()
+    print(f'* Using config: {cfg}')
+    print('\n=== WARNING === Be aware that this conversion script is best-effort. Use a native GGUF model if possible. === WARNING ===\n')
+    data = np.memmap(cfg.input, mode = 'r')
+    model = GGMLV3Model()
+    print('* Scanning GGML input file')
+    try:
+        offset = model.load(data, 0)
+    except OverflowError:
+        print(f'!!! Caught overflow loading tensors. The most likely issue is running on Windows but not in WSL. Try running in WSL if possible.', file = sys.stderr)
+        raise
+    print(f'* GGML model hyperparameters: {model.hyperparameters}')
+    vocab_override = None
+    params_override = None
+    if cfg.model_metadata_dir is not None:
+        (params_override, vocab_override) = handle_metadata(cfg, model.hyperparameters)
+        print('!! Note: When overriding params the --gqa, --eps and --context-length options are ignored.')
+        print(f'* Overriding params: {params_override}')
+        print(f'* Overriding vocab: {vocab_override}')
+    else:
+        print('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n')
+    converter = GGMLToGGUF(model, data, cfg, params_override = params_override, vocab_override = vocab_override)
+    converter.save()
+    print(f'* Successful completion. Output saved to: {cfg.output}')
+
+if __name__ == '__main__':
+    main()
--- a/convert-llama-hf-to-gguf.py
+++ b/convert-llama-hf-to-gguf.py
@@ -0,0 +1,327 @@
+# HF llama --> gguf conversion
+
+import gguf
+import os
+import sys
+import struct
+import json
+import numpy as np
+import torch
+
+from typing import Any, List, Optional
+from pathlib import Path
+from sentencepiece import SentencePieceProcessor
+
+#NDArray = np.ndarray[Any, Any]
+# compatible with python < 3.9
+NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]'
+
+# reverse HF permute back to original pth layout
+# https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/convert_llama_weights_to_hf.py
+
+
+def reverse_hf_permute(weights: NDArray, n_head: int, n_kv_head: Optional[int] = None) -> NDArray:
+    if n_kv_head is not None and n_head != n_kv_head:
+        n_head //= n_kv_head
+
+    return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
+            .swapaxes(1, 2)
+            .reshape(weights.shape))
+
+
+def count_model_parts(dir_model: str) -> int:
+    num_parts = 0
+
+    for filename in os.listdir(dir_model):
+        if filename.startswith("pytorch_model-"):
+            num_parts += 1
+
+    if num_parts > 0:
+        print("gguf: found " + str(num_parts) + " model parts")
+
+    return num_parts
+
+
+if len(sys.argv) < 3:
+    print("Usage: convert-h5-to-ggml.py dir-model ftype\n")
+    print("  ftype == 0 -> float32")
+    print("  ftype == 1 -> float16")
+
+    sys.exit(1)
+
+
+# output in the same directory as the model
+dir_model = sys.argv[1]
+last_dir = os.path.basename(os.path.normpath(dir_model))
+
+
+# possible tensor data types
+#   ftype == 0 -> float32
+#   ftype == 1 -> float16
+
+
+# map from ftype to string
+ftype_str = ["f32", "f16"]
+
+ftype = 1
+if len(sys.argv) > 2:
+    ftype = int(sys.argv[2])
+    if ftype < 0 or ftype > 1:
+        print("Invalid ftype: " + str(ftype))
+
+        sys.exit(1)
+
+fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".gguf"
+
+print("gguf: loading model "+last_dir)
+
+with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
+    hparams = json.load(f)
+
+if hparams["architectures"][0] != "LlamaForCausalLM":
+    print("Model architecture not supported: " + hparams["architectures"][0])
+
+    sys.exit()
+
+# get number of model parts
+num_parts = count_model_parts(dir_model)
+
+ARCH=gguf.MODEL_ARCH.LLAMA
+gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
+
+print("gguf: get model metadata")
+
+block_count = hparams["num_hidden_layers"]
+head_count = hparams["num_attention_heads"]
+
+if "num_key_value_heads" in hparams:
+    head_count_kv = hparams["num_key_value_heads"]
+else:
+    head_count_kv = head_count
+
+if "_name_or_path" in hparams:
+    hf_repo = hparams["_name_or_path"]
+else:
+    hf_repo = ""
+
+if "max_sequence_length" in hparams:
+    ctx_length = hparams["max_sequence_length"]
+elif "max_position_embeddings" in hparams:
+    ctx_length = hparams["max_position_embeddings"]
+else:
+    print("gguf: can not find ctx length parameter.")
+
+    sys.exit()
+
+
+gguf_writer.add_name(last_dir)
+gguf_writer.add_source_hf_repo(hf_repo)
+gguf_writer.add_tensor_data_layout("Meta AI original pth")
+gguf_writer.add_context_length(ctx_length)
+gguf_writer.add_embedding_length(hparams["hidden_size"])
+gguf_writer.add_block_count(block_count)
+gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
+gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
+gguf_writer.add_head_count(head_count)
+gguf_writer.add_head_count_kv(head_count_kv)
+gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
+
+if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]:
+    if "type" in hparams["rope_scaling"]:
+        if hparams["rope_scaling"]["type"] == "linear":
+            gguf_writer.add_rope_scale_linear(hparams["rope_scaling"]["factor"])
+
+
+# TOKENIZATION
+
+print("gguf: get tokenizer metadata")
+
+tokens: List[bytes] = []
+scores: List[float] = []
+toktypes: List[int] = []
+
+if Path(dir_model + "/tokenizer.model").is_file():
+    # vocab type sentencepiece
+    print("gguf: get sentencepiece tokenizer vocab, scores and token types")
+
+    tokenizer = SentencePieceProcessor(dir_model + "/tokenizer.model")
+
+    for i in range(tokenizer.vocab_size()):
+        text: bytes
+        score: float
+
+        piece = tokenizer.id_to_piece(i)
+        text = piece.encode("utf-8")
+        score = tokenizer.get_score(i)
+
+        toktype = 1  # defualt to normal token type
+        if tokenizer.is_unknown(i):
+            toktype = 2
+        if tokenizer.is_control(i):
+            toktype = 3
+
+        # toktype = 4 is user-defined = tokens from added_tokens.json
+
+        if tokenizer.is_unused(i):
+            toktype = 5
+        if tokenizer.is_byte(i):
+            toktype = 6
+
+        tokens.append(text)
+        scores.append(score)
+        toktypes.append(toktype)
+
+    if Path(dir_model + "/added_tokens.json").is_file():
+        with open(dir_model + "/added_tokens.json", "r", encoding="utf-8") as f:
+            addtokens_json = json.load(f)
+
+            print("gguf: get added tokens")
+
+            for key in addtokens_json:
+                tokens.append( key.encode("utf-8") )
+                scores.append(-1000.0)
+                toktypes.append(4) # user-defined token type
+
+
+    gguf_writer.add_tokenizer_model("llama")
+    gguf_writer.add_token_list(tokens)
+    gguf_writer.add_token_scores(scores)
+    gguf_writer.add_token_types(toktypes)
+
+
+print("gguf: get special token ids")
+
+if Path(dir_model + "/tokenizer.json").is_file():
+    # Look for special tokens in tokenizer.json if it exists
+
+    with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f:
+        tokenizer = json.load(f)
+
+    if "added_tokens" in tokenizer and Path(dir_model + "/tokenizer_config.json").is_file():
+
+        with open(dir_model + "/tokenizer_config.json", "r", encoding="utf-8") as f:
+            tokenizer_config = json.load(f)
+
+        if "bos_token" in tokenizer_config and tokenizer_config["bos_token"] != None:
+            for key in tokenizer["added_tokens"]:
+                if key["content"] == tokenizer_config["bos_token"]["content"]:
+                    gguf_writer.add_bos_token_id(key["id"])
+
+        if "eos_token" in tokenizer_config and tokenizer_config["eos_token"] != None:
+            for key in tokenizer["added_tokens"]:
+                if key["content"] == tokenizer_config["eos_token"]["content"]:
+                    gguf_writer.add_eos_token_id(key["id"])
+
+        if "unk_token" in tokenizer_config and tokenizer_config["unk_token"] != None:
+            for key in tokenizer["added_tokens"]:
+                if key["content"] == tokenizer_config["unk_token"]["content"]:
+                    gguf_writer.add_unk_token_id(key["id"])
+
+        if "sep_token" in tokenizer_config and tokenizer_config["sep_token"] != None:
+            for key in tokenizer["added_tokens"]:
+                if key["content"] == tokenizer_config["sep_token"]["content"]:
+                    gguf_writer.add_sep_token_id(key["id"])
+
+        if "pad_token" in tokenizer_config and tokenizer_config["pad_token"] != None:
+            for key in tokenizer["added_tokens"]:
+                if key["content"] == tokenizer_config["pad_token"]["content"]:
+                    gguf_writer.add_pad_token_id(key["id"])
+else:
+    # If no tokenizer.json: Look for special tokens in config.json
+
+    if "bos_token_id" in hparams and hparams["bos_token_id"] != None:
+        gguf_writer.add_bos_token_id(hparams["bos_token_id"])
+
+    if "eos_token_id" in hparams and hparams["eos_token_id"] != None:
+        gguf_writer.add_eos_token_id(hparams["eos_token_id"])
+
+    if "unk_token_id" in hparams and hparams["unk_token_id"] != None:
+        gguf_writer.add_unk_token_id(hparams["unk_token_id"])
+
+    if "sep_token_id" in hparams and hparams["sep_token_id"] != None:
+        gguf_writer.add_sep_token_id(hparams["sep_token_id"])
+
+    if "pad_token_id" in hparams and hparams["pad_token_id"] != None:
+        gguf_writer.add_pad_token_id(hparams["pad_token_id"])
+
+
+# TENSORS
+
+tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
+
+# tensor info
+print("gguf: get tensor metadata")
+
+if num_parts == 0:
+    part_names = ("pytorch_model.bin",)
+else:
+    part_names = (
+        f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
+    )
+
+for part_name in part_names:
+    print("gguf: loading model part '" + part_name + "'")
+    model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
+
+    for name in model_part.keys():
+        data = model_part[name]
+
+        # we don't need these
+        if name.endswith(".rotary_emb.inv_freq"):
+            continue
+
+        old_dtype = data.dtype
+
+        # convert any unsupported data types to float32
+        if data.dtype != torch.float16 and data.dtype != torch.float32:
+            data = data.to(torch.float32)
+
+        data = data.squeeze().numpy()
+
+        # reverse permute these
+        if name.endswith(".q_proj.weight"):
+            data = reverse_hf_permute(data, head_count)
+        if name.endswith(".k_proj.weight"):
+            data = reverse_hf_permute(data, head_count, head_count_kv)
+
+        # map tensor names
+        if name.endswith(".weight") and name[:-7] in tensor_map:
+            name = tensor_map[name[:-7]] + ".weight"
+        elif name.endswith(".bias") and name[:-5] in tensor_map:
+            name = tensor_map[name[:-5]] + ".bias"
+        else:
+            print("Can not map tensor '" + name + "'")
+            sys.exit()
+
+        n_dims = len(data.shape)
+        data_dtype = data.dtype
+
+        # if f32 desired, convert any float16 to float32
+        if ftype == 0 and data_dtype == np.float16:
+            data = data.astype(np.float32)
+
+        # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
+        if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+            data = data.astype(np.float32)
+
+        # if f16 desired, convert any float32 2-dim weight tensors to float16
+        if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+            data = data.astype(np.float16)
+
+        print(name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
+
+        gguf_writer.add_tensor(name, data)
+
+
+print("gguf: write header")
+gguf_writer.write_header_to_file()
+print("gguf: write metadata")
+gguf_writer.write_kv_data_to_file()
+print("gguf: write tensors")
+gguf_writer.write_tensors_to_file()
+
+gguf_writer.close()
+
+
+print("gguf: model successfully exported to '" + fname_out + "'")
+print("")
--- a/convert-lora-to-ggml.py
+++ b/convert-lora-to-ggml.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python
 import json
 import os
 import re
--- a/convert-pth-to-ggml.py
+++ b/convert-pth-to-ggml.py
@@ -1,13 +0,0 @@
-# Compatibility stub
-
-import argparse
-
-import convert
-
-parser = argparse.ArgumentParser(
-    description="""[DEPRECATED - use `convert.py` instead]
-    Convert a LLaMA model checkpoint to a ggml compatible file""")
-parser.add_argument('dir_model',  help='directory containing the model checkpoint')
-parser.add_argument('ftype',      help='file type (0: float32, 1: float16)', type=int, choices=[0, 1], default=1)
-args = parser.parse_args()
-convert.main(['--outtype', 'f16' if args.ftype == 1 else 'f32', '--', args.dir_model])
--- a/convert.py
+++ b/convert.py
--- a/docs/token_generation_performance_tips.md
+++ b/docs/token_generation_performance_tips.md
@@ -3,7 +3,7 @@
 ## Verifying that the model is running on the GPU with cuBLAS
 Make sure you compiled llama with the correct env variables according to [this guide](../README.md#cublas), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example:
 ```shell
-./main -m "path/to/model.bin" -ngl 200000 -p "Please sir, may I have some "
+./main -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some "
 ```

 When running llama, before it starts the inference work, it will output diagnostic information that shows whether cuBLAS is offloading work to the GPU. Look for these lines:
@@ -25,9 +25,9 @@ GPU: A6000 (48GB VRAM)
 CPU: 7 physical cores
 RAM: 32GB

-Model: `TheBloke_Wizard-Vicuna-30B-Uncensored-GGML/Wizard-Vicuna-30B-Uncensored.ggmlv3.q4_0.bin` (30B parameters, 4bit quantization, GGML)
+Model: `TheBloke_Wizard-Vicuna-30B-Uncensored-GGML/Wizard-Vicuna-30B-Uncensored.q4_0.gguf` (30B parameters, 4bit quantization, GGML)

-Run command: `./main -m "path/to/model.bin" -p "-p "An extremely detailed description of the 10 best ethnic dishes will follow, with recipes: " -n 1000 [additional benchmark flags]`
+Run command: `./main -m "path/to/model.gguf" -p "An extremely detailed description of the 10 best ethnic dishes will follow, with recipes: " -n 1000 [additional benchmark flags]`

 Result:

--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -6,23 +6,6 @@ find_package(Threads REQUIRED)

 # ...

-# common
-
-set(TARGET common)
-
-add_library(${TARGET} OBJECT
-    common.h
-    common.cpp
-    )
-
-if (BUILD_SHARED_LIBS)
-    set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
-endif()
-
-target_include_directories(${TARGET} PUBLIC .)
-target_compile_features(${TARGET} PUBLIC cxx_std_11)
-target_link_libraries(${TARGET} PRIVATE llama)
-
 # examples

 include_directories(${CMAKE_CURRENT_SOURCE_DIR})
@@ -38,8 +21,10 @@ else()
    add_subdirectory(benchmark)
    add_subdirectory(baby-llama)
    add_subdirectory(train-text-from-scratch)
+    add_subdirectory(convert-llama2c-to-ggml)
    add_subdirectory(simple)
    add_subdirectory(embd-input)
+    add_subdirectory(llama-bench)
    if (LLAMA_METAL)
        add_subdirectory(metal)
    endif()
--- a/examples/Miku.sh
+++ b/examples/Miku.sh
@@ -2,21 +2,21 @@
 set -e

 AI_NAME="${AI_NAME:-Miku}"
-MODEL="${MODEL:-./models/gpt4all-7B/gpt4all-lora-unfiltered-quantized.bin}"
+MODEL="${MODEL:-./models/llama-2-7b-chat.ggmlv3.q4_K_M.bin}"
 USER_NAME="${USER_NAME:-Anon}"

 # Uncomment and adjust to the number of CPU cores you want to use.
 #N_THREAD="${N_THREAD:-4}"
+CTX_SIZE="${CTX_SIZE:-4096}"
 N_PREDICTS="${N_PREDICTS:-4096}"

 GEN_OPTIONS=(--batch_size 1024
--ctx_size 2048
+--ctx_size "$CTX_SIZE"
 --keep -1
 --repeat_last_n 256
 --repeat_penalty 1.17647
--temp 0.7
--top_k 40
--top_p 0.5)
+--temp 0.6
+--mirostat 2)

 if [ -n "$N_THREAD" ]; then
    GEN_OPTIONS+=(--threads "$N_THREAD")
@@ -24,16 +24,17 @@ fi

 ./main "${GEN_OPTIONS[@]}" \
    --model "$MODEL" \
+    --in-prefix " " \
+    --in-suffix "${AI_NAME}:" \
    --n_predict "$N_PREDICTS" \
    --color --interactive \
    --reverse-prompt "${USER_NAME}:" \
-    --prompt "
-This is a transcript of a 1000 page, never ending conversation between ${USER_NAME} and the cute and helpful AI assistant ${AI_NAME}. ${AI_NAME} is a girl who is an AI running on the user's computer.
+    --prompt "This is a transcript of a 1000 page, never ending conversation between ${USER_NAME} and the cute and helpful AI assistant ${AI_NAME}. ${AI_NAME} is a girl who is an AI running on the user's computer.
 ${AI_NAME} can think for herself without the user seeing her thoughts by adding a /think prefix to her output. She uses this to reason about the world and to think about what she should say next.
 ${AI_NAME} is always coherent and makes sense, but if she isn't sure if what she is saying is correct, she will ask the user for help.
 ${AI_NAME} is a very helpful AI and will help the user with anything they need. She is also very friendly and will try to make the user feel better if they are sad.
 ${AI_NAME} is also very curious and will ask the user a lot of questions about themselves and their life. She will also try to make the user like her.
-The conversation is only between ${USER_NAME} and ${AI_NAME}
+The conversation is only between ${USER_NAME} and ${AI_NAME}.
 The conversation is only through text, so ${AI_NAME} can't see ${USER_NAME}'s face or hear his voice.
 ${AI_NAME} can only communicate through text, so she can't send images or videos.

--- a/examples/baby-llama/CMakeLists.txt
+++ b/examples/baby-llama/CMakeLists.txt
@@ -1,4 +1,5 @@
 set(TARGET baby-llama)
 add_executable(${TARGET} baby-llama.cpp)
+install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- a/examples/baby-llama/baby-llama.cpp
+++ b/examples/baby-llama/baby-llama.cpp
@@ -8,6 +8,12 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif

+#ifdef LLAMA_DEFAULT_RMS_EPS
+static const float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
+#else
+static const float rms_norm_eps = 5e-6f;
+#endif
+
 float frand() {
    return (float)rand()/(float)RAND_MAX;
 }
@@ -562,7 +568,7 @@ struct ggml_tensor * forward(
        // norm
        {
            // cur shape [n_embd,N,1,1]
-            cur = ggml_rms_norm(ctx0, inpL);
+            cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);

            // cur = attention_norm*cur
            cur = ggml_mul(ctx0,
@@ -685,7 +691,7 @@ struct ggml_tensor * forward(
            // norm
            {
                // cur shape [n_embd,N,1,1]
-                cur = ggml_rms_norm(ctx0, inpFF);
+                cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);

                // cur = ffn_norm*cur
                // cur shape [n_embd,N,1,1]
@@ -729,7 +735,7 @@ struct ggml_tensor * forward(
    {

        // inpL shape [n_embd,N,1,1]
-        inpL = ggml_rms_norm(ctx0, inpL);
+        inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps);

        // inpL = norm*inpL
        // inpL shape [n_embd,N,1,1]
@@ -817,7 +823,7 @@ struct ggml_tensor * forward_batch(
        // norm
        {
            // cur shape [n_embd,N*n_batch,1,1]
-            cur = ggml_rms_norm(ctx0, inpL);
+            cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
            assert_shape_2d(cur, n_embd, N*n_batch);

            // cur = attention_norm*cur
@@ -981,7 +987,7 @@ struct ggml_tensor * forward_batch(
            // norm
            {
                // cur shape [n_embd,N*n_batch,1,1]
-                cur = ggml_rms_norm(ctx0, inpFF);
+                cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);
                assert_shape_2d(cur, n_embd, N*n_batch);

                // cur = ffn_norm*cur
@@ -1034,7 +1040,7 @@ struct ggml_tensor * forward_batch(
    {

        // inpL shape [n_embd,N*n_batch,1,1]
-        inpL = ggml_rms_norm(ctx0, inpL);
+        inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
        assert_shape_2d(inpL, n_embd, N*n_batch);

        // inpL = norm*inpL
@@ -1104,7 +1110,7 @@ struct ggml_tensor * forward_lora(
        // norm
        {
            // cur shape [n_embd,N,1,1]
-            cur = ggml_rms_norm(ctx0, inpL);
+            cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);

            // cur = attention_norm*cur
            cur = ggml_mul(ctx0,
@@ -1251,7 +1257,7 @@ struct ggml_tensor * forward_lora(
            // norm
            {
                // cur shape [n_embd,N,1,1]
-                cur = ggml_rms_norm(ctx0, inpFF);
+                cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);

                // cur = ffn_norm*cur
                // cur shape [n_embd,N,1,1]
@@ -1295,7 +1301,7 @@ struct ggml_tensor * forward_lora(
    {

        // inpL shape [n_embd,N,1,1]
-        inpL = ggml_rms_norm(ctx0, inpL);
+        inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps);

        // inpL = norm*inpL
        // inpL shape [n_embd,N,1,1]
--- a/examples/benchmark/CMakeLists.txt
+++ b/examples/benchmark/CMakeLists.txt
@@ -1,5 +1,6 @@
 set(TARGET benchmark)
 add_executable(${TARGET} benchmark-matmult.cpp)
+install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if(TARGET BUILD_INFO)
--- a/examples/convert-llama2c-to-ggml/CMakeLists.txt
+++ b/examples/convert-llama2c-to-ggml/CMakeLists.txt
@@ -0,0 +1,5 @@
+set(TARGET convert-llama2c-to-ggml)
+add_executable(${TARGET} convert-llama2c-to-ggml.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- a/examples/convert-llama2c-to-ggml/README.md
+++ b/examples/convert-llama2c-to-ggml/README.md
@@ -0,0 +1,26 @@
+## Convert llama2.c model to ggml
+
+This example reads weights from project [llama2.c](https://github.com/karpathy/llama2.c) and saves them in ggml compatible format. The vocab that is available in `models/ggml-vocab.bin` is used by default.
+
+To convert the model first download the models from the [llma2.c](https://github.com/karpathy/llama2.c) repository:
+
+`$ make -j`
+
+After successful compilation, following usage options are available:
+```
+usage: ./convert-llama2c-to-ggml [options]
+
+options:
+  -h, --help                       show this help message and exit
+  --copy-vocab-from-model FNAME    model path from which to copy vocab (default 'models/ggml-vocab.bin')
+  --llama2c-model FNAME            [REQUIRED] model path from which to load Karpathy's llama2.c model
+  --llama2c-output-model FNAME     model path to save the converted llama2.c model (default ak_llama_model.bin')
+```
+
+An example command is as follows:
+
+`$ ./convert-llama2c-to-ggml --copy-vocab-from-model <ggml-vocab.bin> --llama2c-model <llama2.c model path> --llama2c-output-model <ggml output model path>`
+
+Now you can use the model with command like:
+
+`$ ./main -m <ggml output model path> -p "One day, Lily met a Shoggoth" -n 500 -c 256 -eps 1e-5`
--- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
+++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
@@ -0,0 +1,827 @@
+#include "ggml.h"
+#include "llama.h"
+
+#include <unordered_map>
+#include <vector>
+#include <cassert>
+#include <climits>
+#include <cstring>
+#include <cstdarg>
+#include <ctime>
+#include <random>
+#include <stdexcept>
+#include <algorithm>
+#include <string>
+
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
+//////////////////////////////////////// llama2.c model structs and functions to load models, alloc memory etc.
+typedef struct {
+    int dim; // transformer dimension
+    int hidden_dim; // for ffn layers
+    int n_layers; // number of layers
+    int n_heads; // number of query heads
+    int n_kv_heads; // number of key/value heads (can be < query heads because of multiquery)
+    int vocab_size; // vocabulary size, usually 256 (byte-level)
+    int seq_len; // max sequence length
+} Config;
+
+typedef struct {
+    // token embedding table
+    float* token_embedding_table;    // (vocab_size, dim)
+    // weights for rmsnorms
+    float* rms_att_weight; // (layer, dim) rmsnorm weights
+    float* rms_ffn_weight; // (layer, dim)
+    // weights for matmuls
+    float* wq; // (layer, dim, dim)
+    float* wk; // (layer, dim, dim)
+    float* wv; // (layer, dim, dim)
+    float* wo; // (layer, dim, dim)
+    // weights for ffn
+    float* w1; // (layer, hidden_dim, dim)
+    float* w2; // (layer, dim, hidden_dim)
+    float* w3; // (layer, hidden_dim, dim)
+    // final rmsnorm
+    float* rms_final_weight; // (dim,)
+    // freq_cis for RoPE relatively positional embeddings
+    // float* freq_cis_real; // (seq_len, dim/2)
+    // float* freq_cis_imag; // (seq_len, dim/2)
+    // (optional) classifier weights for the logits, on the last layer
+    //float* wcls;
+} TransformerWeights;
+
+void malloc_weights(TransformerWeights* w, Config* p) {
+    // we calloc instead of malloc to keep valgrind happy
+    w->token_embedding_table = new float[p->vocab_size * p->dim]();
+    printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
+
+    w->rms_att_weight = new float[p->n_layers * p->dim]();
+    printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim);
+
+    w->rms_ffn_weight = new float[p->n_layers * p->dim]();
+    printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim);
+
+    w->wq = new float[p->n_layers * p->dim * p->dim]();
+    printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
+
+    w->wk = new float[p->n_layers * p->dim * p->dim]();
+    printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
+
+    w->wv = new float[p->n_layers * p->dim * p->dim]();
+    printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
+
+    w->wo = new float[p->n_layers * p->dim * p->dim]();
+    printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
+
+    w->w1 = new float[p->n_layers * p->hidden_dim * p->dim]();
+    printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
+
+    w->w2 = new float[p->n_layers * p->hidden_dim * p->dim]();
+    printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim);
+
+    w->w3 = new float[p->n_layers * p->hidden_dim * p->dim]();
+    printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
+
+    w->rms_final_weight = new float[p->dim]();
+    printf("[%s:AK] Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim);
+}
+
+int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f) {
+    if (fread(w->token_embedding_table, sizeof(float), p->vocab_size * p->dim, f) != static_cast<size_t>(p->vocab_size * p->dim)) return 1;
+    if (fread(w->rms_att_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim)) return 1;
+    if (fread(w->wq, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
+    if (fread(w->wk, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
+    if (fread(w->wv, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
+    if (fread(w->wo, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
+    if (fread(w->rms_ffn_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim)) return 1;
+    if (fread(w->w1, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->hidden_dim)) return 1;
+    if (fread(w->w2, sizeof(float), p->n_layers * p->hidden_dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->hidden_dim * p->dim)) return 1;
+    if (fread(w->w3, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->hidden_dim)) return 1;
+    if (fread(w->rms_final_weight, sizeof(float), p->dim, f) != static_cast<size_t>(p->dim)) return 1;
+    return 0;
+}
+
+void free_weights(TransformerWeights* w) {
+    delete w->token_embedding_table;
+    delete w->rms_att_weight;
+    delete w->rms_ffn_weight;
+    delete w->wq;
+    delete w->wk;
+    delete w->wv;
+    delete w->wo;
+    delete w->w1;
+    delete w->w2;
+    delete w->w3;
+    delete w->rms_final_weight;
+}
+
+void print_sample_weights(TransformerWeights *w){
+    printf("----- Quick print of first of the weight vales of all the variables\n");
+    printf("%f\n", w->token_embedding_table[0]);
+    printf("%f\n", w->rms_att_weight[0]);
+    printf("%f\n", w->rms_ffn_weight[0]);
+
+    printf("%f\n", w->wq[0]);
+    printf("%f\n", w->wk[0]);
+    printf("%f\n", w->wv[0]);
+    printf("%f\n", w->wo[0]);
+    printf("%f\n", w->w1[0]);
+    printf("%f\n", w->w2[0]);
+    printf("%f\n", w->w3[0]);
+    printf("%f\n", w->rms_att_weight[0]);
+}
+////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+//////////////////////////////////////// ggml structs and functions required to load models, configs and save the model.
+
+struct llama_vocab {
+    using id    = int32_t;
+    using token = std::string;
+    using ttype = llama_token_type;
+
+    struct token_data {
+        token text;
+        float score;
+        ttype type;
+    };
+
+    std::unordered_map<token, id> token_to_id;
+    std::vector<token_data> id_to_token;
+};
+
+struct my_llama_hparams {
+    uint32_t n_vocab = 32000;
+    uint32_t n_ctx   = 512;   // this is provided as user input?
+    uint32_t n_embd  = 4096;
+    uint32_t n_mult  = 4;
+    uint32_t n_head  = 32;
+    uint32_t n_layer = 32;
+    uint32_t n_rot   = 64;
+    bool operator!=(const my_llama_hparams& other) const {
+        return memcmp(this, &other, sizeof(my_llama_hparams));
+    }
+};
+
+struct my_llama_layer {
+    // normalization
+    struct ggml_tensor * attention_norm;
+
+    // attention
+    struct ggml_tensor * wq;
+    struct ggml_tensor * wk;
+    struct ggml_tensor * wv;
+    struct ggml_tensor * wo;
+
+    // normalization
+    struct ggml_tensor * ffn_norm;
+
+    // ff
+    struct ggml_tensor * w1;
+    struct ggml_tensor * w2;
+    struct ggml_tensor * w3;
+};
+
+struct my_llama_model {
+    struct ggml_context * ctx = NULL;
+
+    my_llama_hparams hparams;
+
+    struct ggml_tensor * tok_embeddings;
+
+    struct ggml_tensor * norm;
+    struct ggml_tensor * output;
+
+    std::vector<my_llama_layer> layers;
+
+    uint32_t train_its = 0;
+    uint32_t train_samples = 0;
+    uint32_t train_tokens = 0;
+};
+
+struct train_params {
+    const char * fn_vocab_model;
+    const char * fn_llama2c_model;
+    const char * fn_llama2c_output_model;
+    const char * fn_train_data;
+    const char * fn_checkpoint_in;
+    const char * fn_checkpoint_out;
+    const char * fn_model_out;
+
+    uint32_t seed;
+
+    int n_ctx;
+    int n_embd;
+    int n_mult;
+    int n_head;
+    int n_layer;
+    int n_rotmax;
+
+    int n_threads;
+    int n_batch;
+    int n_examples;
+    int n_predict;
+
+    int print_info_interval;
+    int print_details_interval;
+
+    bool samples_start_after_nl;
+    bool use_adam;
+    bool use_flash;
+    bool use_scratch;
+
+    // only adam
+    int   warmup;
+    int   cos_decay_steps;
+    float cos_decay_restart;
+    float cos_decay_alpha;
+
+    int   lbfgs_n_iter;
+    int   adam_n_iter;
+    float adam_alpha;
+    float adam_decay;
+
+    int mem_model_gb;
+    int mem_compute_gb;
+    int mem_compute0_gb;
+    int mem_compute1_gb;
+};
+
+uint32_t get_n_ff(const struct my_llama_hparams* hparams) {
+    const uint32_t n_ff = ((2*(4*hparams->n_embd)/3 + hparams->n_mult - 1)/hparams->n_mult)*hparams->n_mult;
+    return n_ff;
+}
+
+void print_params(struct my_llama_hparams * params) {
+    printf("%s: n_vocab: %d\n", __func__, params->n_vocab);
+    printf("%s: n_ctx:   %d\n", __func__, params->n_ctx);
+    printf("%s: n_embd:  %d\n", __func__, params->n_embd);
+    printf("%s: n_mult:  %d\n", __func__, params->n_mult);
+    printf("%s: n_head:  %d\n", __func__, params->n_head);
+    printf("%s: n_ff:    %d\n", __func__, get_n_ff(params));
+    printf("%s: n_layer: %d\n", __func__, params->n_layer);
+    printf("%s: n_rot:   %d\n", __func__, params->n_rot);
+}
+
+void init_model(struct my_llama_model * model) {
+    const auto & hparams = model->hparams;
+
+    const uint32_t n_embd  = hparams.n_embd;
+    const uint32_t n_layer = hparams.n_layer;
+    const uint32_t n_vocab = hparams.n_vocab;
+
+    const uint32_t n_ff = get_n_ff(&hparams);
+    struct ggml_context * ctx = model->ctx;
+
+    model->train_its = 0;
+    model->train_samples = 0;
+    model->train_tokens = 0;
+
+    model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
+    printf("[%s:GG] Allocating [%d] x [%d] = [%d] float space for model->tok_embeddings\n",__func__,n_embd , n_vocab, n_embd * n_vocab);
+
+    model->norm           = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
+    printf("[%s:GG] Allocating [%d] float space for model->norm\n",__func__,n_embd);
+
+    model->output         = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
+    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for model->output\n",__func__,n_embd, n_vocab, n_embd * n_vocab);
+
+    // printing the per-layer allocations here so we dont print in the for loop.
+    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wq for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
+    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wk for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
+    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wv for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
+    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wo for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
+
+    printf("[%s:GG] Allocating [%d] float space for layer.ffn_norm for [%d] layers\n",__func__,n_embd, n_layer);
+
+    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w1 for [%d] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer);
+    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w2 for [%d] layers\n",__func__, n_embd, n_ff, n_ff * n_embd, n_layer);
+    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w3 for [%d] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer);
+
+    ggml_set_name(model->tok_embeddings, "tok_embeddings.weight");
+    ggml_set_name(model->norm,           "norm.weight");
+    ggml_set_name(model->output,         "output.weight");
+
+    model->layers.resize(n_layer);
+    for (uint32_t i = 0; i < n_layer; ++i) {
+        auto & layer = model->layers[i];
+
+        std::string layers_i = "layers." + std::to_string(i);
+
+        layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
+
+        layer.wq = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
+        layer.wk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
+        layer.wv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
+        layer.wo = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
+
+        layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
+
+        layer.w1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
+        layer.w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd);
+        layer.w3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
+
+        ggml_set_name(layer.attention_norm, (layers_i + ".attention_norm.weight").c_str());
+
+        ggml_set_name(layer.wq, (layers_i + ".attention.wq.weight").c_str());
+        ggml_set_name(layer.wk, (layers_i + ".attention.wk.weight").c_str());
+        ggml_set_name(layer.wv, (layers_i + ".attention.wv.weight").c_str());
+        ggml_set_name(layer.wo, (layers_i + ".attention.wo.weight").c_str());
+
+        ggml_set_name(layer.ffn_norm, (layers_i + ".ffn_norm.weight").c_str());
+
+        ggml_format_name(layer.w1, "%s.feed_forward.w1.weight", layers_i.c_str());
+        ggml_format_name(layer.w2, "%s.feed_forward.w2.weight", layers_i.c_str());
+        ggml_format_name(layer.w3, "%s.feed_forward.w3.weight", layers_i.c_str());
+    }
+}
+
+float get_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
+    float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
+    return *ptr;
+}
+
+int32_t get_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
+    int32_t * ptr = (int32_t *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
+    return *ptr;
+}
+
+void print_row(struct ggml_tensor * probs, int i) {
+    for (int k = 0; k < probs->ne[0]; ++k) {
+        float p = get_f32_2d(probs, k, i);
+        printf(" %f", p);
+    }
+    printf("\n");
+}
+
+void print_matrix(struct ggml_tensor * probs) {
+    assert(probs->n_dims == 2);
+    for (int i = 0; i < probs->ne[1]; ++i) {
+        for (int k = 0; k < probs->ne[0]; ++k) {
+            float p = get_f32_2d(probs, k, i);
+            printf(" %.2f", p);
+        }
+        printf("\n");
+    }
+}
+
+#ifdef __GNUC__
+#ifdef __MINGW32__
+__attribute__((format(gnu_printf, 1, 2)))
+#else
+__attribute__((format(printf, 1, 2)))
+#endif
+#endif
+static std::string format(const char * fmt, ...) {
+    va_list ap, ap2;
+    va_start(ap, fmt);
+    va_copy(ap2, ap);
+    int size = vsnprintf(NULL, 0, fmt, ap);
+    GGML_ASSERT(size >= 0 && size < INT_MAX);
+    std::vector<char> buf(size + 1);
+    int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
+    GGML_ASSERT(size2 == size);
+    va_end(ap2);
+    va_end(ap);
+    return std::string(buf.data(), size);
+}
+
+struct llama_file {
+    // use FILE * so we don't have to re-open the file to mmap
+    FILE * fp;
+    size_t size;
+
+    llama_file(const char * fname, const char * mode) {
+        fp = std::fopen(fname, mode);
+        if (fp == NULL) {
+            size = 0;
+        } else {
+            seek(0, SEEK_END);
+            size = tell();
+            seek(0, SEEK_SET);
+        }
+    }
+
+    size_t tell() const {
+#ifdef _WIN32
+        __int64 ret = _ftelli64(fp);
+#else
+        long ret = std::ftell(fp);
+#endif
+        GGML_ASSERT(ret != -1); // this really shouldn't fail
+        return (size_t) ret;
+    }
+
+    void seek(size_t offset, int whence) {
+#ifdef _WIN32
+        int ret = _fseeki64(fp, (__int64) offset, whence);
+#else
+        int ret = std::fseek(fp, (long) offset, whence);
+#endif
+        GGML_ASSERT(ret == 0); // same
+    }
+
+    void read_raw(void * ptr, size_t size) {
+        if (size == 0) {
+            return;
+        }
+        errno = 0;
+        std::size_t ret = std::fread(ptr, size, 1, fp);
+        if (ferror(fp)) {
+            throw std::runtime_error(format("read error: %s", strerror(errno)));
+        }
+        if (ret != 1) {
+            throw std::runtime_error(std::string("unexpectedly reached end of file"));
+        }
+    }
+
+    std::uint32_t read_u32() {
+        std::uint32_t ret;
+        read_raw(&ret, sizeof(ret));
+        return ret;
+    }
+    std::float_t read_f32() {
+        std::float_t ret;
+        read_raw(&ret, sizeof(ret));
+        return ret;
+    }
+
+    std::string read_string(std::uint32_t len) {
+        std::vector<char> chars(len);
+        read_raw(chars.data(), len);
+        return std::string(chars.data(), len);
+    }
+
+    void write_raw(const void * ptr, size_t size) {
+        if (size == 0) {
+            return;
+        }
+        errno = 0;
+        size_t ret = std::fwrite(ptr, size, 1, fp);
+        if (ret != 1) {
+            throw std::runtime_error(format("write error: %s", strerror(errno)));
+        }
+    }
+
+    void write_u32(std::uint32_t val) {
+        write_raw(&val, sizeof(val));
+    }
+
+    ~llama_file() {
+        if (fp) {
+            std::fclose(fp);
+        }
+    }
+};
+
+void write_tensor(struct llama_file * file, struct ggml_tensor * tensor) {
+    if (tensor == NULL) {
+        file->write_u32(0);
+        file->write_u32(0);
+        file->write_u32(GGML_TYPE_F32);
+        file->seek((0-file->tell()) & 31, SEEK_CUR);
+        return;
+    }
+    const char * name = ggml_get_name(tensor);
+    uint32_t name_len = strlen(name);
+    uint32_t nd = tensor->n_dims;
+    uint32_t ne[4] = { (uint32_t)tensor->ne[0],
+                       (uint32_t)tensor->ne[1],
+                       (uint32_t)tensor->ne[2],
+                       (uint32_t)tensor->ne[3] };
+    file->write_u32(nd);
+    file->write_u32(name_len);
+    file->write_u32(tensor->type);
+    file->write_raw(ne, sizeof(ne[0]) * nd);
+    file->write_raw(name, name_len);
+    file->seek((0-file->tell()) & 31, SEEK_CUR);
+    file->write_raw(tensor->data, ggml_nbytes(tensor));
+}
+
+bool is_ggml_file(const char *filename) {
+    llama_file file(filename, "rb");
+    if (file.size < 4) {
+        return false;
+    }
+    uint32_t magic = file.read_u32();
+    return magic == GGUF_MAGIC;
+}
+
+void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) {
+    // heuristic to infer whether vocab is from ggml or from llama2.c vocabulary
+    if (is_ggml_file(filename)) {
+
+        struct llama_context_params llama_params = llama_context_default_params();
+        llama_params.vocab_only = true;
+
+        struct llama_model * lmodel = llama_load_model_from_file(filename, llama_params);
+        struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params);
+
+        const int n_vocab = llama_n_vocab(lctx);
+        vocab->id_to_token.resize(n_vocab);
+        for (int i=0; i<n_vocab; ++i) {
+            vocab->id_to_token[i].text  = llama_token_get_text(lctx, i);
+            vocab->id_to_token[i].score = llama_token_get_score(lctx, i);
+            vocab->id_to_token[i].type  = llama_token_get_type(lctx, i);
+            vocab->token_to_id.emplace(vocab->id_to_token[i].text, i);
+        }
+        llama_free(lctx);
+        llama_free_model(lmodel);
+    } else { // assume llama2.c vocabulary
+        printf("Assuming llama2.c vocabulary since %s is not a ggml file\n", filename);
+        llama_file file(filename, "rb");
+        const int  n_vocab = config->vocab_size;
+        /* uint32_t max_token_length =  */ file.read_u32(); // unused
+        vocab->id_to_token.resize(n_vocab);
+        for (int i=0; i<n_vocab; ++i) {
+            float_t score = file.read_f32();
+            uint32_t len = file.read_u32();
+            std::string text = file.read_string(len);
+            vocab->id_to_token[i].text = text;
+            vocab->id_to_token[i].score = score;
+            vocab->id_to_token[i].type = LLAMA_TOKEN_TYPE_UNDEFINED;
+            vocab->token_to_id.emplace(text, i);
+        }
+    }
+}
+
+void stuff_karpathy_weights_into_gg(struct ggml_tensor * gg_weights, float * karpathy_weights){
+    int ct;
+    switch (gg_weights->n_dims){
+        case 1:
+            ct = 0;
+            for (int i0 = 0; i0 < gg_weights->ne[0]; i0++){
+                float * ptr = (float *) ((char *) gg_weights->data + i0*gg_weights->nb[0]);
+                *ptr = karpathy_weights[ct];
+                ct++;
+            }
+            break;
+        case 2:
+            ct = 0;
+            for (int i1 = 0; i1 < gg_weights->ne[1]; i1++) {
+                for (int i0 = 0; i0 < gg_weights->ne[0]; i0++) {
+                    float * ptr = (float *) ((char *) gg_weights->data + i0*gg_weights->nb[0] + i1*gg_weights->nb[1]);
+                    *ptr = karpathy_weights[ct];
+                    ct++;
+                }
+            }
+            break;
+        case 3:
+            ct = 0;
+            for (int i2 = 0; i2 < gg_weights->ne[2]; i2++) {
+                for (int i1 = 0; i1 < gg_weights->ne[1]; i1++) {
+                    for (int i0 = 0; i0 < gg_weights->ne[0]; i0++) {
+                        float * ptr = (float *) ((char *) gg_weights->data + i0*gg_weights->nb[0] + i1*gg_weights->nb[1] + i2*gg_weights->nb[2]);
+                        *ptr = karpathy_weights[ct];
+                        ct++;
+                    }
+                }
+            }
+            break;
+    }
+}
+
+void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * model, TransformerWeights* w, const char * filename) {
+    struct llama_file file(filename, "wb");
+    if (file.fp == NULL) {
+        return;
+    }
+
+#pragma message("TODO: implement file saving using gguf")
+    (void) vocab;
+    (void) model;
+    (void) w;
+//    // write_magic
+//    file.write_u32(LLAMA_FILE_MAGIC);   // magic
+//    file.write_u32(LLAMA_FILE_VERSION); // version
+//    // write_hparams
+//    file.write_u32(model->hparams.n_vocab);
+//    file.write_u32(model->hparams.n_embd);
+//    file.write_u32(model->hparams.n_mult);
+//    file.write_u32(model->hparams.n_head);
+//    file.write_u32(model->hparams.n_layer);
+//    file.write_u32(model->hparams.n_rot);
+//    file.write_u32(LLAMA_FTYPE_ALL_F32);
+//
+//    // write_vocab - for now we are just writing the existing BPE voc. assuming karpathy's vocabulary is the same. idk.
+//    uint32_t n_vocab = model->hparams.n_vocab;
+//    for (uint32_t i = 0; i < n_vocab; i++) {
+//        const auto & token_data = vocab->id_to_token.at(i);
+//        file.write_u32((uint32_t) token_data.tok.size());
+//        file.write_raw(token_data.tok.data(), token_data.tok.size());
+//        file.write_raw(&token_data.score, sizeof(token_data.score));
+//    }
+//
+//    // stuff AK weights into GG weights one by one.
+//    // w->token_embedding_table -> model->tok_embeddings
+//    // float*                   -> struct ggml_tensor
+//    stuff_karpathy_weights_into_gg(model->tok_embeddings, w->token_embedding_table);
+//    stuff_karpathy_weights_into_gg(model->output, w->token_embedding_table);
+//
+//    stuff_karpathy_weights_into_gg(model->norm, w->rms_final_weight);
+//    //print_row(model->norm, 0);
+//
+//    // for rms-att-weight
+//    int row_length = model->hparams.n_embd;
+//    const auto & hparams = model->hparams;
+//    //int n_ff = model->hparams.n_embd;
+//    int n_ff = get_n_ff(&hparams);
+//
+//    for (uint32_t i = 0; i < model->hparams.n_layer; ++i){
+//        auto & layer = model->layers[i];
+//        // 1d
+//        stuff_karpathy_weights_into_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]);
+//        stuff_karpathy_weights_into_gg(layer.ffn_norm      , &w->rms_ffn_weight[i*row_length]);
+//
+//        // from 3d matrix layer x dim x dim to 2d matrix dim x dim
+//        stuff_karpathy_weights_into_gg(layer.wq            , &w->wq[i*row_length*row_length]);
+//        stuff_karpathy_weights_into_gg(layer.wk            , &w->wk[i*row_length*row_length]);
+//        stuff_karpathy_weights_into_gg(layer.wv            , &w->wv[i*row_length*row_length]);
+//        stuff_karpathy_weights_into_gg(layer.wo            , &w->wo[i*row_length*row_length]);
+//
+//        stuff_karpathy_weights_into_gg(layer.w1            , &w->w1[i*row_length*n_ff]);
+//        stuff_karpathy_weights_into_gg(layer.w2            , &w->w2[i*n_ff*row_length]);
+//        stuff_karpathy_weights_into_gg(layer.w3            , &w->w3[i*row_length*n_ff]);
+//    }
+//    // write tensors
+//    write_tensor(&file, model->tok_embeddings);
+//    write_tensor(&file, model->norm);
+//    write_tensor(&file, model->output); // ?
+//    for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
+//        auto & layer = model->layers[i];
+//
+//        write_tensor(&file, layer.attention_norm);
+//        write_tensor(&file, layer.wq);
+//        write_tensor(&file, layer.wk);
+//        write_tensor(&file, layer.wv);
+//        write_tensor(&file, layer.wo);
+//        write_tensor(&file, layer.ffn_norm);
+//        write_tensor(&file, layer.w1);
+//        write_tensor(&file, layer.w2);
+//        write_tensor(&file, layer.w3);
+//    }
+}
+
+struct train_params get_default_train_params() {
+    struct train_params params;
+    params.fn_vocab_model    = "models/ggml-vocab.bin";
+    params.fn_llama2c_output_model = "ak_llama_model.bin";
+    params.fn_train_data     = "shakespeare.txt";
+    params.fn_checkpoint_in  = "checkpoint.bin";
+    params.fn_checkpoint_out = "checkpoint.bin";
+    params.fn_model_out      = "ggml-checkpoint-f32.bin";
+
+    params.seed       =   -1;
+
+    params.n_ctx      =  128;
+    params.n_embd     =  256;
+    params.n_mult     =  256;
+    params.n_head     =    8;
+    params.n_layer    =   16;
+    params.n_rotmax   =   64;
+
+    params.n_threads  =    6;
+    params.n_batch    =    8;
+    params.n_examples =    8;
+    params.n_predict  = 1024;
+
+    params.print_info_interval    = 1;
+    params.print_details_interval = 2;
+
+    params.samples_start_after_nl = false;
+    params.use_adam               = true;
+    params.use_flash              = true;
+    params.use_scratch            = true;
+
+    // only adam
+    params.warmup            =  100;
+    params.cos_decay_steps   = 1000;
+    params.cos_decay_restart = 1.1f;
+    params.cos_decay_alpha   = 0.0f;
+
+    params.lbfgs_n_iter      = 16;
+    params.adam_n_iter       = 16;
+    params.adam_alpha        = 1e-3f;
+    params.adam_decay        = 1e-3f;
+
+    params.mem_model_gb   = 2;
+    params.mem_compute_gb = 24;
+    params.mem_compute0_gb = 8;
+    params.mem_compute1_gb = 2;
+
+    return params;
+}
+
+void print_usage(int /*argc*/, char ** argv, const struct train_params * params) {
+    fprintf(stderr, "usage: %s [options]\n", argv[0]);
+    fprintf(stderr, "\n");
+    fprintf(stderr, "options:\n");
+    fprintf(stderr, "  -h, --help                       show this help message and exit\n");
+    fprintf(stderr, "  --copy-vocab-from-model FNAME    llama2.c vocabulary or ggml model path from which to copy vocab (default '%s')\n", params->fn_vocab_model);
+    fprintf(stderr, "  --llama2c-model FNAME            [REQUIRED] model path from which to load Karpathy's llama2.c model\n");
+    fprintf(stderr, "  --llama2c-output-model FNAME     model path to save the converted llama2.c model (default %s')\n", params->fn_llama2c_output_model);
+    fprintf(stderr, "\n");
+}
+
+bool params_parse(int argc, char ** argv, struct train_params * params) {
+    bool invalid_param = false;
+    bool reqd_param_found = false;
+    std::string arg;
+    struct train_params default_params = get_default_train_params();
+    const std::string arg_prefix = "--";
+
+    for (int i = 1; i < argc; i++) {
+        arg = argv[i];
+        if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
+            std::replace(arg.begin(), arg.end(), '_', '-');
+        }
+
+        if (arg == "--copy-vocab-from-model") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params->fn_vocab_model = argv[i];
+        } else if (arg == "--llama2c-model") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            reqd_param_found = true;
+            params->fn_llama2c_model = argv[i];
+        } else if (arg == "--llama2c-output-model") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params->fn_llama2c_output_model = argv[i];
+        } else if (arg == "-h" || arg == "--help") {
+            print_usage(argc, argv, &default_params);
+            exit(0);
+        } else {
+            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
+            print_usage(argc, argv, &default_params);
+            exit(1);
+        }
+    }
+    if (invalid_param) {
+        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
+        print_usage(argc, argv, &default_params);
+        exit(1);
+    }
+    if (!reqd_param_found){
+        fprintf(stderr, "error: please specify a llama2.c .bin file to be converted with argument --llama2c-model\n");
+        print_usage(argc, argv, &default_params);
+        exit(1);
+    }
+
+    return true;
+}
+
+int main(int argc, char ** argv) {
+    struct train_params params = get_default_train_params();
+    if (!params_parse(argc, argv, &params)) {
+        return 1;
+    }
+    Config config;
+    TransformerWeights weights;
+    {
+        FILE *file = fopen(params.fn_llama2c_model, "rb");
+        if (!file) { printf("Unable to open the checkpoint file %s!\n", params.fn_llama2c_model); return 1; }
+        // read in the config header
+        if(fread(&config, sizeof(Config), 1, file) != 1) { return 1; }
+        // read in the Transformer weights
+        malloc_weights(&weights, &config);
+        if(checkpoint_init_weights(&weights, &config, file)) { return 1; }
+        fclose(file);
+    }
+
+    struct llama_vocab vocab;
+    load_vocab(params.fn_vocab_model, &config, &vocab);
+
+    struct my_llama_model model;
+    model.hparams.n_vocab = config.vocab_size; //llama_n_vocab(lctx);
+    model.hparams.n_ctx   = params.n_ctx;
+    model.hparams.n_embd  = config.dim; //params.n_embd;
+    model.hparams.n_mult  = 32;//params.n_mult;
+    model.hparams.n_head  = config.n_heads; //params.n_head;
+    model.hparams.n_layer = config.n_layers; //params.n_layer;
+    model.hparams.n_rot   = std::min((uint32_t)params.n_rotmax, model.hparams.n_embd / model.hparams.n_head);
+    print_params(&model.hparams);
+    struct ggml_init_params lcparams;
+    lcparams.mem_size   = 1024ll*1024ll*1024ll*((size_t) params.mem_model_gb);
+    lcparams.mem_buffer = NULL;
+    lcparams.no_alloc   = false;
+
+    model.ctx = ggml_init(lcparams);
+
+    init_model(&model);
+    save_as_llama_model(&vocab, &model, &weights, params.fn_llama2c_output_model);
+
+    printf("Saving llama.c model file %s in ggml format at %s\n", params.fn_llama2c_model, params.fn_llama2c_output_model);
+
+    ggml_free(model.ctx);
+    free_weights(&weights);
+    return 0;
+}
--- a/examples/embd-input/CMakeLists.txt
+++ b/examples/embd-input/CMakeLists.txt
@@ -1,5 +1,6 @@
 set(TARGET embdinput)
 add_library(${TARGET} embd-input-lib.cpp embd-input.h)
+install(TARGETS ${TARGET} LIBRARY)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if(TARGET BUILD_INFO)
@@ -8,6 +9,7 @@ endif()

 set(TARGET embd-input-test)
 add_executable(${TARGET} embd-input-test.cpp)
+install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama embdinput ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if(TARGET BUILD_INFO)
--- a/examples/embd-input/embd-input-lib.cpp
+++ b/examples/embd-input/embd-input-lib.cpp
@@ -30,7 +30,7 @@ struct MyModel* create_mymodel(int argc, char ** argv) {
    fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);

    if (params.seed == LLAMA_DEFAULT_SEED) {
-        params.seed = time(NULL);
+        params.seed = uint32_t(time(NULL));
    }
    fprintf(stderr, "%s: seed  = %d\n", __func__, params.seed);

@@ -167,7 +167,7 @@ llama_token sampling_id(struct MyModel* mymodel) {
        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };

        // TODO: Apply penalties
-        // float nl_logit = logits[llama_token_nl()];
+        // float nl_logit = logits[llama_token_nl(ctx)];
        // auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
        // llama_sample_repetition_penalty(ctx, &candidates_p,
        //      last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
@@ -176,7 +176,7 @@ llama_token sampling_id(struct MyModel* mymodel) {
        // last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
        // last_n_repeat, alpha_frequency, alpha_presence);
        // if (!penalize_nl) {
-        //     logits[llama_token_nl()] = nl_logit;
+        //     logits[llama_token_nl(ctx)] = nl_logit;
        // }

        if (temp <= 0) {
@@ -211,7 +211,7 @@ const char * sampling(struct MyModel * mymodel) {
    llama_context * ctx = mymodel->ctx;
    int id = sampling_id(mymodel);
    static std::string ret;
-    if (id == llama_token_eos()) {
+    if (id == llama_token_eos(ctx)) {
        ret = "</s>";
    } else {
        ret = llama_token_to_str(ctx, id);
--- a/examples/embd-input/minigpt4.py
+++ b/examples/embd-input/minigpt4.py
@@ -64,7 +64,7 @@ class MiniGPT4(Blip2Base):
        self.max_txt_len = max_txt_len
        self.end_sym = end_sym
        self.model = MyModel(["main", *args])
-        # system promt
+        # system prompt
        self.model.eval_string("Give the following image: <Img>ImageContent</Img>. "
           "You will be able to see the image once I provide it to you. Please answer my questions."
           "###")
--- a/examples/embedding/CMakeLists.txt
+++ b/examples/embedding/CMakeLists.txt
@@ -1,5 +1,6 @@
 set(TARGET embedding)
 add_executable(${TARGET} embedding.cpp)
+install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if(TARGET BUILD_INFO)
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -67,28 +67,35 @@ int main(int argc, char ** argv) {
        fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
        fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
        for (int i = 0; i < (int) embd_inp.size(); i++) {
-            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]));
+            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]).c_str());
        }
        fprintf(stderr, "\n");
    }

-    if (params.embedding){
-        if (embd_inp.size() > 0) {
-            if (llama_eval(ctx, embd_inp.data(), embd_inp.size(), n_past, params.n_threads)) {
-                fprintf(stderr, "%s : failed to eval\n", __func__);
-                return 1;
-            }
-        }
-
-        const int n_embd = llama_n_embd(ctx);
-        const auto embeddings = llama_get_embeddings(ctx);
-
-        for (int i = 0; i < n_embd; i++) {
-            printf("%f ", embeddings[i]);
-        }
-        printf("\n");
+    if (embd_inp.size() > (size_t)params.n_ctx) {
+        fprintf(stderr, "%s: error: prompt is longer than the context window (%zu tokens, n_ctx = %d)\n",
+                __func__, embd_inp.size(), params.n_ctx);
+        return 1;
    }

+    while (!embd_inp.empty()) {
+        int n_tokens = std::min(params.n_batch, (int) embd_inp.size());
+        if (llama_eval(ctx, embd_inp.data(), n_tokens, n_past, params.n_threads)) {
+            fprintf(stderr, "%s : failed to eval\n", __func__);
+            return 1;
+        }
+        n_past += n_tokens;
+        embd_inp.erase(embd_inp.begin(), embd_inp.begin() + n_tokens);
+    }
+
+    const int n_embd = llama_n_embd(ctx);
+    const auto embeddings = llama_get_embeddings(ctx);
+
+    for (int i = 0; i < n_embd; i++) {
+        printf("%f ", embeddings[i]);
+    }
+    printf("\n");
+
    llama_print_timings(ctx);
    llama_free(ctx);
    llama_free_model(model);
--- a/examples/gguf/gguf.cpp
+++ b/examples/gguf/gguf.cpp
@@ -0,0 +1,246 @@
+#include "ggml.h"
+#include "llama.h"
+
+#include <cstdio>
+#include <cinttypes>
+#include <string>
+#include <sstream>
+#include <fstream>
+#include <vector>
+
+#undef MIN
+#undef MAX
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+
+template<typename T>
+static std::string to_string(const T & val) {
+    std::stringstream ss;
+    ss << val;
+    return ss.str();
+}
+
+bool gguf_ex_write(const std::string & fname) {
+    struct gguf_context * ctx = gguf_init_empty();
+
+    gguf_set_val_u8  (ctx, "some.parameter.uint8",    0x12);
+    gguf_set_val_i8  (ctx, "some.parameter.int8",    -0x13);
+    gguf_set_val_u16 (ctx, "some.parameter.uint16",   0x1234);
+    gguf_set_val_i16 (ctx, "some.parameter.int16",   -0x1235);
+    gguf_set_val_u32 (ctx, "some.parameter.uint32",   0x12345678);
+    gguf_set_val_i32 (ctx, "some.parameter.int32",   -0x12345679);
+    gguf_set_val_f32 (ctx, "some.parameter.float32",  0.123456789f);
+    gguf_set_val_bool(ctx, "some.parameter.bool",     true);
+    gguf_set_val_str (ctx, "some.parameter.string",   "hello world");
+
+    gguf_set_arr_data(ctx, "some.parameter.arr.i16", GGUF_TYPE_INT16,   std::vector<int16_t>{ 1, 2, 3, 4, }.data(), 4);
+    gguf_set_arr_data(ctx, "some.parameter.arr.f32", GGUF_TYPE_FLOAT32, std::vector<float>{ 3.145f, 2.718f, 1.414f, }.data(), 3);
+    gguf_set_arr_str (ctx, "some.parameter.arr.str",                    std::vector<const char *>{ "hello", "world", "!" }.data(), 3);
+
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ 128ull*1024ull*1024ull,
+        /*.mem_buffer =*/ NULL,
+        /*.no_alloc   =*/ false,
+    };
+
+    struct ggml_context * ctx_data = ggml_init(params);
+
+    const int n_tensors = 10;
+
+    // tensor infos
+    for (int i = 0; i < n_tensors; ++i) {
+        const std::string name = "tensor_" + to_string(i);
+
+        int64_t ne[GGML_MAX_DIMS] = { 1 };
+        int32_t n_dims = rand() % GGML_MAX_DIMS + 1;
+
+        for (int j = 0; j < n_dims; ++j) {
+            ne[j] = rand() % 10 + 1;
+        }
+
+        struct ggml_tensor * cur = ggml_new_tensor(ctx_data, GGML_TYPE_F32, n_dims, ne);
+        ggml_set_name(cur, name.c_str());
+
+        {
+            float * data = (float *) cur->data;
+            for (int j = 0; j < ggml_nelements(cur); ++j) {
+                data[j] = 100 + i;
+            }
+        }
+
+        gguf_add_tensor(ctx, cur);
+    }
+
+    gguf_write_to_file(ctx, fname.c_str(), false);
+
+    fprintf(stdout, "%s: wrote file '%s;\n", __func__, fname.c_str());
+
+    ggml_free(ctx_data);
+    gguf_free(ctx);
+
+    return true;
+}
+
+// just read tensor info
+bool gguf_ex_read_0(const std::string & fname) {
+    struct gguf_init_params params = {
+        /*.no_alloc = */ false,
+        /*.ctx      = */ NULL,
+    };
+
+    struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
+
+    fprintf(stdout, "%s: version:      %d\n", __func__, gguf_get_version(ctx));
+    fprintf(stdout, "%s: alignment:   %zu\n", __func__, gguf_get_alignment(ctx));
+    fprintf(stdout, "%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx));
+
+    // kv
+    {
+        const int n_kv = gguf_get_n_kv(ctx);
+
+        fprintf(stdout, "%s: n_kv: %d\n", __func__, n_kv);
+
+        for (int i = 0; i < n_kv; ++i) {
+            const char * key = gguf_get_key(ctx, i);
+
+            fprintf(stdout, "%s: kv[%d]: key = %s\n", __func__, i, key);
+        }
+    }
+
+    // find kv string
+    {
+        const char * findkey = "some.parameter.string";
+
+        const int keyidx = gguf_find_key(ctx, findkey);
+        if (keyidx == -1) {
+            fprintf(stdout, "%s: find key: %s not found.\n", __func__, findkey);
+        } else {
+            const char * key_value = gguf_get_val_str(ctx, keyidx);
+            fprintf(stdout, "%s: find key: %s found, kv[%d] value = %s\n", __func__, findkey, keyidx, key_value);
+        }
+    }
+
+    // tensor info
+    {
+        const int n_tensors = gguf_get_n_tensors(ctx);
+
+        fprintf(stdout, "%s: n_tensors: %d\n", __func__, n_tensors);
+
+        for (int i = 0; i < n_tensors; ++i) {
+            const char * name   = gguf_get_tensor_name  (ctx, i);
+            const size_t offset = gguf_get_tensor_offset(ctx, i);
+
+            fprintf(stdout, "%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
+        }
+    }
+
+    gguf_free(ctx);
+
+    return true;
+}
+
+// read and create ggml_context containing the tensors and their data
+bool gguf_ex_read_1(const std::string & fname) {
+    struct ggml_context * ctx_data = NULL;
+
+    struct gguf_init_params params = {
+        /*.no_alloc = */ false,
+        /*.ctx      = */ &ctx_data,
+    };
+
+    struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
+
+    fprintf(stdout, "%s: version:      %d\n", __func__, gguf_get_version(ctx));
+    fprintf(stdout, "%s: alignment:   %zu\n", __func__, gguf_get_alignment(ctx));
+    fprintf(stdout, "%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx));
+
+    // kv
+    {
+        const int n_kv = gguf_get_n_kv(ctx);
+
+        fprintf(stdout, "%s: n_kv: %d\n", __func__, n_kv);
+
+        for (int i = 0; i < n_kv; ++i) {
+            const char * key = gguf_get_key(ctx, i);
+
+            fprintf(stdout, "%s: kv[%d]: key = %s\n", __func__, i, key);
+        }
+    }
+
+    // tensor info
+    {
+        const int n_tensors = gguf_get_n_tensors(ctx);
+
+        fprintf(stdout, "%s: n_tensors: %d\n", __func__, n_tensors);
+
+        for (int i = 0; i < n_tensors; ++i) {
+            const char * name   = gguf_get_tensor_name  (ctx, i);
+            const size_t offset = gguf_get_tensor_offset(ctx, i);
+
+            fprintf(stdout, "%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
+        }
+    }
+
+    // data
+    {
+        const int n_tensors = gguf_get_n_tensors(ctx);
+
+        for (int i = 0; i < n_tensors; ++i) {
+            fprintf(stdout, "%s: reading tensor %d data\n", __func__, i);
+
+            const char * name = gguf_get_tensor_name(ctx, i);
+
+            struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
+
+            fprintf(stdout, "%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, cur->n_dims, cur->name, cur->data);
+
+            // print first 10 elements
+            const float * data = (const float *) cur->data;
+
+            printf("%s data[:10] : ", name);
+            for (int j = 0; j < MIN(10, ggml_nelements(cur)); ++j) {
+                printf("%f ", data[j]);
+            }
+            printf("\n\n");
+
+            // check data
+            {
+                const float * data = (const float *) cur->data;
+                for (int j = 0; j < ggml_nelements(cur); ++j) {
+                    if (data[j] != 100 + i) {
+                        fprintf(stderr, "%s: tensor[%d]: data[%d] = %f\n", __func__, i, j, data[j]);
+                        return false;
+                    }
+                }
+            }
+        }
+    }
+
+    fprintf(stdout, "%s: ctx_data size: %zu\n", __func__, ggml_get_mem_size(ctx_data));
+
+    ggml_free(ctx_data);
+    gguf_free(ctx);
+
+    return true;
+}
+
+int main(int argc, char ** argv) {
+    if (argc < 3) {
+        fprintf(stdout, "usage: %s data.gguf r|w\n", argv[0]);
+        return -1;
+    }
+
+    const std::string fname(argv[1]);
+    const std::string mode (argv[2]);
+
+    GGML_ASSERT((mode == "r" || mode == "w") && "mode must be r or w");
+
+    if (mode == "w") {
+        GGML_ASSERT(gguf_ex_write(fname) && "failed to write gguf file");
+    } else if (mode == "r") {
+        GGML_ASSERT(gguf_ex_read_0(fname) && "failed to read gguf file");
+        GGML_ASSERT(gguf_ex_read_1(fname) && "failed to read gguf file");
+    }
+
+    return 0;
+}
--- a/examples/gptneox-wip/cmpnct_gpt2bpe.hpp
+++ b/examples/gptneox-wip/cmpnct_gpt2bpe.hpp
--- a/examples/gptneox-wip/falcon-main.cpp
+++ b/examples/gptneox-wip/falcon-main.cpp
--- a/examples/gptneox-wip/gptneox-main.cpp
+++ b/examples/gptneox-wip/gptneox-main.cpp
--- a/examples/json-schema-to-grammar.py
+++ b/examples/json-schema-to-grammar.py
@@ -0,0 +1,132 @@
+import argparse
+import json
+import re
+import sys
+
+# whitespace is constrained to a single space char to prevent model "running away" in
+# whitespace. Also maybe improves generation quality?
+SPACE_RULE = '" "?'
+
+PRIMITIVE_RULES = {
+    'boolean': '("true" | "false") space',
+    'number': '("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space',
+    'integer': '("-"? ([0-9] | [1-9] [0-9]*)) space',
+    'string': r''' "\"" (
+        [^"\\] |
+        "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
+      )* "\"" space ''',
+    'null': '"null" space',
+}
+
+INVALID_RULE_CHARS_RE = re.compile(r'[^a-zA-Z0-9-]+')
+GRAMMAR_LITERAL_ESCAPE_RE = re.compile(r'[\r\n"]')
+GRAMMAR_LITERAL_ESCAPES = {'\r': '\\r', '\n': '\\n', '"': '\\"'}
+
+
+class SchemaConverter:
+    def __init__(self, prop_order):
+        self._prop_order = prop_order
+        self._rules = {'space': SPACE_RULE}
+
+    def _format_literal(self, literal):
+        escaped = GRAMMAR_LITERAL_ESCAPE_RE.sub(
+            lambda m: GRAMMAR_LITERAL_ESCAPES.get(m.group(0)), json.dumps(literal)
+        )
+        return f'"{escaped}"'
+
+    def _add_rule(self, name, rule):
+        esc_name = INVALID_RULE_CHARS_RE.sub('-', name)
+        if esc_name not in self._rules or self._rules[esc_name] == rule:
+            key = esc_name
+        else:
+            i = 0
+            while f'{esc_name}{i}' in self._rules:
+                i += 1
+            key = f'{esc_name}{i}'
+        self._rules[key] = rule
+        return key
+
+    def visit(self, schema, name):
+        schema_type = schema.get('type')
+        rule_name = name or 'root'
+
+        if 'oneOf' in schema or 'anyOf' in schema:
+            rule = ' | '.join((
+                self.visit(alt_schema, f'{name}{"-" if name else ""}{i}')
+                for i, alt_schema in enumerate(schema.get('oneOf') or schema['anyOf'])
+            ))
+            return self._add_rule(rule_name, rule)
+
+        elif 'const' in schema:
+            return self._add_rule(rule_name, self._format_literal(schema['const']))
+
+        elif 'enum' in schema:
+            rule = ' | '.join((self._format_literal(v) for v in schema['enum']))
+            return self._add_rule(rule_name, rule)
+
+        elif schema_type == 'object' and 'properties' in schema:
+            # TODO: `required` keyword
+            prop_order = self._prop_order
+            prop_pairs = sorted(
+                schema['properties'].items(),
+                # sort by position in prop_order (if specified) then by key
+                key=lambda kv: (prop_order.get(kv[0], len(prop_order)), kv[0]),
+            )
+
+            rule = '"{" space'
+            for i, (prop_name, prop_schema) in enumerate(prop_pairs):
+                prop_rule_name = self.visit(prop_schema, f'{name}{"-" if name else ""}{prop_name}')
+                if i > 0:
+                    rule += ' "," space'
+                rule += fr' {self._format_literal(prop_name)} space ":" space {prop_rule_name}'
+            rule += ' "}" space'
+
+            return self._add_rule(rule_name, rule)
+
+        elif schema_type == 'array' and 'items' in schema:
+            # TODO `prefixItems` keyword
+            item_rule_name = self.visit(schema['items'], f'{name}{"-" if name else ""}item')
+            rule = f'"[" space ({item_rule_name} ("," space {item_rule_name})*)? "]" space'
+            return self._add_rule(rule_name, rule)
+
+        else:
+            assert schema_type in PRIMITIVE_RULES, f'Unrecognized schema: {schema}'
+            return self._add_rule(
+                'root' if rule_name == 'root' else schema_type,
+                PRIMITIVE_RULES[schema_type]
+            )
+
+    def format_grammar(self):
+        return '\n'.join((f'{name} ::= {rule}' for name, rule in self._rules.items()))
+
+
+def main(args_in = None):
+    parser = argparse.ArgumentParser(
+        description='''
+            Generates a grammar (suitable for use in ./main) that produces JSON conforming to a
+            given JSON schema. Only a subset of JSON schema features are supported; more may be
+            added in the future.
+        ''',
+    )
+    parser.add_argument(
+        '--prop-order',
+        default=[],
+        type=lambda s: s.split(','),
+        help='''
+            comma-separated property names defining the order of precedence for object properties;
+            properties not specified here are given lower precedence than those that are, and are
+            sorted alphabetically
+        '''
+    )
+    parser.add_argument('schema', help='file containing JSON schema ("-" for stdin)')
+    args = parser.parse_args(args_in)
+
+    schema = json.load(sys.stdin if args.schema == '-' else open(args.schema))
+    prop_order = {name: idx for idx, name in enumerate(args.prop_order)}
+    converter = SchemaConverter(prop_order)
+    converter.visit(schema, '')
+    print(converter.format_grammar())
+
+
+if __name__ == '__main__':
+    main()
--- a/examples/llama-bench/CMakeLists.txt
+++ b/examples/llama-bench/CMakeLists.txt
@@ -0,0 +1,8 @@
+set(TARGET llama-bench)
+add_executable(${TARGET} llama-bench.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
+if(TARGET BUILD_INFO)
+  add_dependencies(${TARGET} BUILD_INFO)
+endif()
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -0,0 +1,969 @@
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <chrono>
+#include <cinttypes>
+#include <cstring>
+#include <ctime>
+#include <iterator>
+#include <map>
+#include <numeric>
+#include <regex>
+#include <sstream>
+#include <stdio.h>
+#include <string>
+#include <vector>
+
+#include "ggml.h"
+#include "llama.h"
+#include "common.h"
+#include "build-info.h"
+#ifdef GGML_USE_CUBLAS
+#include "ggml-cuda.h"
+#endif
+
+// utils
+static uint64_t get_time_ns() {
+    using clock = std::chrono::high_resolution_clock;
+    return std::chrono::nanoseconds(clock::now().time_since_epoch()).count();
+}
+
+template<class T>
+static std::string join(const std::vector<T> & values, const std::string & delim) {
+    std::ostringstream str;
+    for (size_t i = 0; i < values.size(); i++) {
+        str << values[i];
+        if (i < values.size() - 1) {
+            str << delim;
+        }
+    }
+    return str.str();
+}
+
+template<class T>
+static std::vector<T> split(const std::string & str, char delim) {
+    std::vector<T> values;
+    std::istringstream str_stream(str);
+    std::string token;
+    while (std::getline(str_stream, token, delim)) {
+        T value;
+        std::istringstream token_stream(token);
+        token_stream >> value;
+        values.push_back(value);
+    }
+    return values;
+}
+
+template<typename T>
+static T avg(const std::vector<T> & v) {
+    if (v.empty()) {
+        return 0;
+    }
+    T sum = std::accumulate(v.begin(), v.end(), T(0));
+    return sum / (T)v.size();
+}
+
+template<typename T>
+static T stdev(const std::vector<T> & v) {
+    if (v.size() <= 1) {
+        return 0;
+    }
+    T mean = avg(v);
+    T sq_sum = std::inner_product(v.begin(), v.end(), v.begin(), T(0));
+    T stdev = std::sqrt(sq_sum / (T)(v.size() - 1) - mean * mean * (T)v.size() / (T)(v.size() - 1));
+    return stdev;
+}
+
+static bool ggml_cpu_has_metal() {
+#if defined(GGML_USE_METAL)
+    return true;
+#else
+    return false;
+#endif
+}
+
+static std::string get_cpu_info() {
+    std::string id;
+#ifdef __linux__
+    FILE * f = fopen("/proc/cpuinfo", "r");
+    if (f) {
+        char buf[1024];
+        while (fgets(buf, sizeof(buf), f)) {
+            if (strncmp(buf, "model name", 10) == 0) {
+                char * p = strchr(buf, ':');
+                if (p) {
+                    p++;
+                    while (std::isspace(*p)) {
+                        p++;
+                    }
+                    while (std::isspace(p[strlen(p) - 1])) {
+                        p[strlen(p) - 1] = '\0';
+                    }
+                    id = p;
+                    break;
+                }
+            }
+        }
+    }
+#endif
+    // TODO: other platforms
+    return id;
+}
+
+static std::string get_gpu_info() {
+    std::string id;
+#ifdef GGML_USE_CUBLAS
+    int count = ggml_cuda_get_device_count();
+    for (int i = 0; i < count; i++) {
+        char buf[128];
+        ggml_cuda_get_device_description(i, buf, sizeof(buf));
+        id += buf;
+        if (i < count - 1) {
+            id += "/";
+        }
+    }
+#endif
+    // TODO: other backends
+    return id;
+}
+
+// command line params
+enum output_formats {CSV, JSON, MARKDOWN, SQL};
+
+struct cmd_params {
+    std::vector<std::string> model;
+    std::vector<int> n_prompt;
+    std::vector<int> n_gen;
+    std::vector<int> n_batch;
+    std::vector<bool> f32_kv;
+    std::vector<int> n_threads;
+    std::vector<int> n_gpu_layers;
+    std::vector<int> main_gpu;
+    std::vector<bool> mul_mat_q;
+    std::vector<bool> low_vram;
+    std::vector<std::array<float, LLAMA_MAX_DEVICES>> tensor_split;
+    int reps;
+    bool verbose;
+    output_formats output_format;
+};
+
+static const cmd_params cmd_params_defaults = {
+    /* model         */ {"models/7B/ggml-model-q4_0.gguf"},
+    /* n_prompt      */ {512},
+    /* n_gen         */ {128},
+    /* n_batch       */ {512},
+    /* f32_kv        */ {false},
+    /* n_threads     */ {get_num_physical_cores()},
+    /* n_gpu_layers  */ {99},
+    /* main_gpu      */ {0},
+    /* mul_mat_q     */ {true},
+    /* low_vram      */ {false},
+    /* tensor_split  */ {{}},
+    /* reps          */ 5,
+    /* verbose       */ false,
+    /* output_format */ MARKDOWN
+};
+
+static void print_usage(int /* argc */, char ** argv) {
+    fprintf(stdout, "usage: %s [options]\n", argv[0]);
+    fprintf(stdout, "\n");
+    fprintf(stdout, "options:\n");
+    fprintf(stdout, "  -h, --help\n");
+    fprintf(stdout, "  -m, --model <filename>            (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
+    fprintf(stdout, "  -p, --n-prompt <n>                (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
+    fprintf(stdout, "  -n, --n-gen <n>                   (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
+    fprintf(stdout, "  -b, --batch-size <n>              (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
+    fprintf(stdout, "  --memory-f32 <0|1>                (default: %s)\n", join(cmd_params_defaults.f32_kv, ",").c_str());
+    fprintf(stdout, "  -t, --threads <n>                 (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
+    fprintf(stdout, "  -ngl N, --n-gpu-layers <n>        (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
+    fprintf(stdout, "  -mg i, --main-gpu <n>             (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
+    fprintf(stdout, "  -lv, --low-vram <0|1>             (default: %s)\n", join(cmd_params_defaults.low_vram, ",").c_str());
+    fprintf(stdout, "  -mmq, --mul-mat-q <0|1>           (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str());
+    fprintf(stdout, "  -ts, --tensor_split <ts0/ts1/..>               \n");
+    fprintf(stdout, "  -r, --repetitions <n>             (default: %d)\n", cmd_params_defaults.reps);
+    fprintf(stdout, "  -o, --output <csv|json|md|sql>    (default: %s)\n", cmd_params_defaults.output_format == CSV ? "csv" : cmd_params_defaults.output_format == JSON ? "json" : cmd_params_defaults.output_format == MARKDOWN ? "md" : "sql");
+    fprintf(stdout, "  -v, --verbose                     (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
+    fprintf(stdout, "\n");
+    fprintf(stdout, "Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");
+
+}
+
+static cmd_params parse_cmd_params(int argc, char ** argv) {
+    cmd_params params;
+    std::string arg;
+    bool invalid_param = false;
+    const std::string arg_prefix = "--";
+    const char split_delim = ',';
+
+    params.verbose = cmd_params_defaults.verbose;
+    params.output_format = cmd_params_defaults.output_format;
+    params.reps = cmd_params_defaults.reps;
+
+    for (int i = 1; i < argc; i++) {
+        arg = argv[i];
+        if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
+            std::replace(arg.begin(), arg.end(), '_', '-');
+        }
+
+        if (arg == "-h" || arg == "--help") {
+            print_usage(argc, argv);
+            exit(0);
+        } else if (arg == "-m" || arg == "--model") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = split<std::string>(argv[i], split_delim);
+            params.model.insert(params.model.end(), p.begin(), p.end());
+        } else if (arg == "-p" || arg == "--n-prompt") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = split<int>(argv[i], split_delim);
+            params.n_prompt.insert(params.n_prompt.end(), p.begin(), p.end());
+        } else if (arg == "-n" || arg == "--n-gen") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = split<int>(argv[i], split_delim);
+            params.n_gen.insert(params.n_gen.end(), p.begin(), p.end());
+        } else if (arg == "-b" || arg == "--batch-size") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = split<int>(argv[i], split_delim);
+            params.n_batch.insert(params.n_batch.end(), p.begin(), p.end());
+        } else if (arg == "--memory-f32") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = split<int>(argv[i], split_delim);
+            params.f32_kv.insert(params.f32_kv.end(), p.begin(), p.end());
+        } else if (arg == "-t" || arg == "--threads") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = split<int>(argv[i], split_delim);
+            params.n_threads.insert(params.n_threads.end(), p.begin(), p.end());
+        } else if (arg == "-ngl" || arg == "--n-gpu-layers") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = split<int>(argv[i], split_delim);
+            params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
+        } else if (arg == "-mg" || arg == "--main-gpu") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.main_gpu = split<int>(argv[i], split_delim);
+        } else if (arg == "-lv" || arg == "--low-vram") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = split<bool>(argv[i], split_delim);
+            params.low_vram.insert(params.low_vram.end(), p.begin(), p.end());
+        } else if (arg == "-mmq" || arg == "--mul-mat-q") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = split<bool>(argv[i], split_delim);
+            params.mul_mat_q.insert(params.mul_mat_q.end(), p.begin(), p.end());
+        } else if (arg == "-ts" || arg == "--tensor-split") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            for (auto ts : split<std::string>(argv[i], split_delim)) {
+                // split string by ; and /
+                const std::regex regex{R"([;/]+)"};
+                std::sregex_token_iterator it{ts.begin(), ts.end(), regex, -1};
+                std::vector<std::string> split_arg{it, {}};
+                GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);
+
+                std::array<float, LLAMA_MAX_DEVICES> tensor_split;
+                for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) {
+                    if (i < split_arg.size()) {
+                        tensor_split[i] = std::stof(split_arg[i]);
+                    } else {
+                        tensor_split[i] = 0.0f;
+                    }
+                }
+                params.tensor_split.push_back(tensor_split);
+            }
+        } else if (arg == "-r" || arg == "--repetitions") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.reps = std::stoi(argv[i]);
+        } else if (arg == "-o" || arg == "--output") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            if (argv[i] == std::string("csv")) {
+                params.output_format = CSV;
+            } else if (argv[i] == std::string("json")) {
+                params.output_format = JSON;
+            } else if (argv[i] == std::string("md")) {
+                params.output_format = MARKDOWN;
+            } else if (argv[i] == std::string("sql")) {
+                params.output_format = SQL;
+            } else {
+                invalid_param = true;
+                break;
+            }
+        } else if (arg == "-v" || arg == "--verbose") {
+            params.verbose = true;
+        } else {
+            invalid_param = true;
+            break;
+        }
+    }
+    if (invalid_param) {
+        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
+        print_usage(argc, argv);
+        exit(1);
+    }
+
+    // set defaults
+    if (params.model.empty())        { params.model = cmd_params_defaults.model; }
+    if (params.n_prompt.empty())     { params.n_prompt = cmd_params_defaults.n_prompt; }
+    if (params.n_gen.empty())        { params.n_gen = cmd_params_defaults.n_gen; }
+    if (params.n_batch.empty())      { params.n_batch = cmd_params_defaults.n_batch; }
+    if (params.f32_kv.empty())       { params.f32_kv = cmd_params_defaults.f32_kv; }
+    if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; }
+    if (params.main_gpu.empty())     { params.main_gpu = cmd_params_defaults.main_gpu; }
+    if (params.mul_mat_q.empty())    { params.mul_mat_q = cmd_params_defaults.mul_mat_q; }
+    if (params.low_vram.empty())     { params.low_vram = cmd_params_defaults.low_vram; }
+    if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; }
+    if (params.n_threads.empty())    { params.n_threads = cmd_params_defaults.n_threads; }
+
+    return params;
+}
+
+struct cmd_params_instance {
+    std::string model;
+    int n_prompt;
+    int n_gen;
+    int n_batch;
+    bool f32_kv;
+    int n_threads;
+    int n_gpu_layers;
+    int main_gpu;
+    bool mul_mat_q;
+    bool low_vram;
+    std::array<float, LLAMA_MAX_DEVICES> tensor_split;
+
+    llama_context_params to_llama_params() const {
+        llama_context_params lparams = llama_context_default_params();
+        lparams.n_ctx = n_prompt + n_gen;
+        lparams.n_batch = n_batch;
+        lparams.f16_kv = !f32_kv;
+        lparams.n_gpu_layers = n_gpu_layers;
+        lparams.main_gpu = main_gpu;
+        lparams.mul_mat_q = mul_mat_q;
+        lparams.low_vram = low_vram;
+        lparams.tensor_split = tensor_split.data();
+
+        return lparams;
+    }
+};
+
+static std::vector<cmd_params_instance> get_cmd_params_instances_int(const cmd_params & params, int n_gen, int n_prompt) {
+    std::vector<cmd_params_instance> instances;
+
+    for (const auto & m : params.model)
+    for (const auto & nb : params.n_batch)
+    for (const auto & fk : params.f32_kv)
+    for (const auto & nl : params.n_gpu_layers)
+    for (const auto & mg : params.main_gpu)
+    for (const auto & mmq : params.mul_mat_q)
+    for (const auto & lv : params.low_vram)
+    for (const auto & ts : params.tensor_split)
+    for (const auto & nt : params.n_threads) {
+        cmd_params_instance instance = {
+            /* .model        = */ m,
+            /* .n_prompt     = */ n_prompt,
+            /* .n_gen        = */ n_gen,
+            /* .n_batch      = */ nb,
+            /* .f32_kv       = */ fk,
+            /* .n_threads    = */ nt,
+            /* .n_gpu_layers = */ nl,
+            /* .main_gpu     = */ mg,
+            /* .mul_mat_q    = */ mmq,
+            /* .low_vram     = */ lv,
+            /* .tensor_split = */ ts,
+        };
+        instances.push_back(instance);
+    }
+    return instances;
+}
+
+static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_params & params) {
+    std::vector<cmd_params_instance> instances;
+
+    for (const auto & n_prompt : params.n_prompt) {
+        if (n_prompt == 0) {
+            continue;
+        }
+        auto instances_prompt = get_cmd_params_instances_int(params, 0, n_prompt);
+        instances.insert(instances.end(), instances_prompt.begin(), instances_prompt.end());
+    }
+
+    for (const auto & n_gen : params.n_gen) {
+        if (n_gen == 0) {
+            continue;
+        }
+        auto instances_gen = get_cmd_params_instances_int(params, n_gen, 0);
+        instances.insert(instances.end(), instances_gen.begin(), instances_gen.end());
+    }
+
+    return instances;
+}
+
+struct test {
+    static const std::string build_commit;
+    static const int build_number;
+    static const bool cuda;
+    static const bool opencl;
+    static const bool metal;
+    static const bool gpu_blas;
+    static const bool blas;
+    static const std::string cpu_info;
+    static const std::string gpu_info;
+    std::string model_filename;
+    std::string model_type;
+    int n_batch;
+    int n_threads;
+    bool f32_kv;
+    int n_gpu_layers;
+    int main_gpu;
+    bool mul_mat_q;
+    bool low_vram;
+    std::array<float, LLAMA_MAX_DEVICES> tensor_split;
+    int n_prompt;
+    int n_gen;
+    std::string test_time;
+    std::vector<uint64_t> samples_ns;
+
+    test(const cmd_params_instance & inst, const llama_model * lmodel, const llama_context * ctx) {
+        model_filename = inst.model;
+        char buf[128];
+        llama_model_type(lmodel, buf, sizeof(buf));
+        model_type = buf;
+        n_batch = inst.n_batch;
+        n_threads = inst.n_threads;
+        f32_kv = inst.f32_kv;
+        n_gpu_layers = inst.n_gpu_layers;
+        main_gpu = inst.main_gpu;
+        mul_mat_q = inst.mul_mat_q;
+        low_vram = inst.low_vram;
+        tensor_split = inst.tensor_split;
+        n_prompt = inst.n_prompt;
+        n_gen = inst.n_gen;
+        // RFC 3339 date-time format
+        time_t t = time(NULL);
+        std::strftime(buf, sizeof(buf), "%FT%TZ", gmtime(&t));
+        test_time = buf;
+
+        (void) ctx;
+    }
+
+    uint64_t avg_ns() const {
+        return ::avg(samples_ns);
+    }
+
+    uint64_t stdev_ns() const {
+        return ::stdev(samples_ns);
+    }
+
+    std::vector<double> get_ts() const {
+        int n_tokens = n_prompt + n_gen;
+        std::vector<double> ts;
+        std::transform(samples_ns.begin(), samples_ns.end(), std::back_inserter(ts), [n_tokens](uint64_t t) { return 1e9 * n_tokens / t; });
+        return ts;
+    }
+
+    double avg_ts() const {
+        return ::avg(get_ts());
+    }
+
+    double stdev_ts() const {
+        return ::stdev(get_ts());
+    }
+
+    static std::string get_backend() {
+        if (cuda) {
+            return "CUDA";
+        }
+        if (opencl) {
+            return "OpenCL";
+        }
+        if (metal) {
+            return "Metal";
+        }
+        if (gpu_blas) {
+            return "GPU BLAS";
+        }
+        if (blas) {
+            return "BLAS";
+        }
+        return "CPU";
+    }
+
+    static const std::vector<std::string> & get_fields() {
+        static const std::vector<std::string> fields = {
+            "build_commit", "build_number",
+            "cuda", "opencl", "metal", "gpu_blas", "blas",
+            "cpu_info", "gpu_info",
+            "model_filename", "model_type",
+            "n_batch", "n_threads", "f16_kv",
+            "n_gpu_layers", "main_gpu", "mul_mat_q", "low_vram", "tensor_split",
+            "n_prompt", "n_gen", "test_time",
+            "avg_ns", "stddev_ns",
+            "avg_ts", "stddev_ts"
+        };
+        return fields;
+    }
+
+    enum field_type {STRING, BOOL, INT, FLOAT};
+
+    static field_type get_field_type(const std::string & field) {
+        if (field == "build_number" || field == "n_batch" || field == "n_threads" ||
+            field == "n_gpu_layers" || field == "main_gpu" ||
+            field == "n_prompt" || field == "n_gen" ||
+            field == "avg_ns" || field == "stddev_ns") {
+            return INT;
+        }
+        if (field == "cuda" || field == "opencl" || field == "metal" || field == "gpu_blas" || field == "blas" ||
+            field == "f16_kv" || field == "mul_mat_q" || field == "low_vram") {
+            return BOOL;
+        }
+        if (field == "avg_ts" || field == "stddev_ts") {
+            return FLOAT;
+        }
+        return STRING;
+    }
+
+    std::vector<std::string> get_values() const {
+        std::string tensor_split_str;
+        int max_nonzero = 0;
+        for (int i = 0; i < LLAMA_MAX_DEVICES; i++) {
+            if (tensor_split[i] > 0) {
+                max_nonzero = i;
+            }
+        }
+        for (int i = 0; i <= max_nonzero; i++) {
+            char buf[32];
+            snprintf(buf, sizeof(buf), "%.2f", tensor_split[i]);
+            tensor_split_str += buf;
+            if (i < max_nonzero) {
+                tensor_split_str += "/";
+            }
+        }
+        std::vector<std::string> values = {
+            build_commit, std::to_string(build_number),
+            std::to_string(cuda), std::to_string(opencl), std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas),
+            cpu_info, gpu_info,
+            model_filename, model_type,
+            std::to_string(n_batch), std::to_string(n_threads), std::to_string(!f32_kv),
+            std::to_string(n_gpu_layers), std::to_string(main_gpu), std::to_string(mul_mat_q), std::to_string(low_vram), tensor_split_str,
+            std::to_string(n_prompt), std::to_string(n_gen), test_time,
+            std::to_string(avg_ns()), std::to_string(stdev_ns()),
+            std::to_string(avg_ts()), std::to_string(stdev_ts())
+        };
+        return values;
+    }
+
+    std::map<std::string, std::string> get_map() const {
+        std::map<std::string, std::string> map;
+        auto fields = get_fields();
+        auto values = get_values();
+        std::transform(fields.begin(), fields.end(), values.begin(),
+                std::inserter(map, map.end()), std::make_pair<const std::string &, const std::string &>);
+        return map;
+    }
+};
+
+const std::string test::build_commit = BUILD_COMMIT;
+const int         test::build_number = BUILD_NUMBER;
+const bool        test::cuda         = !!ggml_cpu_has_cublas();
+const bool        test::opencl       = !!ggml_cpu_has_clblast();
+const bool        test::metal        = !!ggml_cpu_has_metal();
+const bool        test::gpu_blas     = !!ggml_cpu_has_gpublas();
+const bool        test::blas         = !!ggml_cpu_has_blas();
+const std::string test::cpu_info     = get_cpu_info();
+const std::string test::gpu_info     = get_gpu_info();
+
+struct printer {
+    virtual ~printer() {}
+
+    FILE * fout;
+    virtual void print_header(const cmd_params & params) { (void) params; };
+    virtual void print_test(const test & t) = 0;
+    virtual void print_footer() { };
+};
+
+struct csv_printer : public printer {
+    static std::string escape_csv(const std::string & field) {
+        std::string escaped = "\"";
+        for (auto c : field) {
+            if (c == '"') {
+                escaped += "\"";
+            }
+            escaped += c;
+        }
+        escaped += "\"";
+        return escaped;
+    }
+
+    void print_header(const cmd_params & params) override  {
+        std::vector<std::string> fields = test::get_fields();
+        fprintf(fout, "%s\n", join(fields, ",").c_str());
+        (void) params;
+    }
+
+    void print_test(const test & t) override {
+        std::vector<std::string> values = t.get_values();
+        std::transform(values.begin(), values.end(), values.begin(), escape_csv);
+        fprintf(fout, "%s\n", join(values, ",").c_str());
+    }
+};
+
+struct json_printer : public printer {
+    bool first = true;
+
+    static std::string escape_json(const std::string & value) {
+        std::string escaped;
+        for (auto c : value) {
+            if (c == '"') {
+                escaped += "\\\"";
+            } else if (c == '\\') {
+                escaped += "\\\\";
+            } else  if (c <= 0x1f) {
+                char buf[8];
+                snprintf(buf, sizeof(buf), "\\u%04x", c);
+                escaped += buf;
+            } else {
+                escaped += c;
+            }
+        }
+        return escaped;
+    }
+
+    static std::string format_value(const std::string & field, const std::string & value) {
+        switch (test::get_field_type(field)) {
+            case test::STRING:
+                return "\"" + escape_json(value) + "\"";
+            case test::BOOL:
+                return value == "0" ? "false" : "true";
+            default:
+                return value;
+        }
+    }
+
+    void print_header(const cmd_params & params) override {
+        fprintf(fout, "[\n");
+        (void) params;
+    }
+
+    void print_fields(const std::vector<std::string> & fields, const std::vector<std::string> & values) {
+        assert(fields.size() == values.size());
+        for (size_t i = 0; i < fields.size(); i++) {
+            fprintf(fout, "    \"%s\": %s,\n", fields.at(i).c_str(), format_value(fields.at(i), values.at(i)).c_str());
+        }
+    }
+
+    void print_test(const test & t) override {
+        if (first) {
+            first = false;
+        } else {
+            fprintf(fout, ",\n");
+        }
+        fprintf(fout, "  {\n");
+        print_fields(test::get_fields(), t.get_values());
+        fprintf(fout, "    \"samples_ns\": [ %s ],\n", join(t.samples_ns, ", ").c_str());
+        fprintf(fout, "    \"samples_ts\": [ %s ]\n", join(t.get_ts(), ", ").c_str());
+        fprintf(fout, "  }");
+        fflush(fout);
+    }
+
+    void print_footer() override {
+        fprintf(fout, "\n]\n");
+    }
+};
+
+struct markdown_printer : public printer {
+    std::vector<std::string> fields;
+
+    static int get_field_width(const std::string & field) {
+        if (field == "model") {
+            return -30;
+        }
+        if (field == "t/s") {
+            return 15;
+        }
+        int width = std::max((int)field.length(), 10);
+
+        if (test::get_field_type(field) == test::STRING) {
+            return -width;
+        }
+        return width;
+    }
+
+    void print_header(const cmd_params & params) override {
+        // select fields to print
+        fields = { "model", "backend" };
+        bool is_cpu_backend = test::get_backend() == "CPU" || test::get_backend() == "BLAS";
+        if (!is_cpu_backend) {
+            fields.push_back("n_gpu_layers");
+        }
+        if (params.n_threads.size() > 1 || params.n_threads != cmd_params_defaults.n_threads || is_cpu_backend) {
+            fields.push_back("n_threads");
+        }
+        if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) {
+            fields.push_back("n_batch");
+        }
+        if (params.f32_kv.size() > 1 || params.f32_kv != cmd_params_defaults.f32_kv) {
+            fields.push_back("f16_kv");
+        }
+        if (params.main_gpu.size() > 1 || params.main_gpu != cmd_params_defaults.main_gpu) {
+            fields.push_back("main_gpu");
+        }
+        if (params.mul_mat_q.size() > 1 || params.mul_mat_q != cmd_params_defaults.mul_mat_q) {
+            fields.push_back("mul_mat_q");
+        }
+        if (params.low_vram.size() > 1 || params.low_vram != cmd_params_defaults.low_vram) {
+            fields.push_back("low_vram");
+        }
+        if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) {
+            fields.push_back("tensor_split");
+        }
+        fields.push_back("test");
+        fields.push_back("t/s");
+
+        fprintf(fout, "|");
+        for (const auto & field : fields) {
+            fprintf(fout, " %*s |", get_field_width(field), field.c_str());
+        }
+        fprintf(fout, "\n");
+        fprintf(fout, "|");
+        for (const auto & field : fields) {
+            int width = get_field_width(field);
+            fprintf(fout, " %s%s |", std::string(std::abs(width) - 1, '-').c_str(), width > 0 ? ":" : "-");
+        }
+        fprintf(fout, "\n");
+    }
+
+    void print_test(const test & t) override {
+        std::map<std::string, std::string> vmap = t.get_map();
+
+        fprintf(fout, "|");
+        for (const auto & field : fields) {
+            std::string value;
+            if (field == "model") {
+                value = t.model_type;
+            } else if (field == "backend") {
+                value = test::get_backend();
+            } else if (field == "test") {
+                char buf[128];
+                if (t.n_prompt > 0 && t.n_gen == 0) {
+                    snprintf(buf, sizeof(buf), "pp %d", t.n_prompt);
+                } else if (t.n_gen > 0 && t.n_prompt == 0) {
+                    snprintf(buf, sizeof(buf), "tg %d", t.n_gen);
+                } else {
+                    assert(false);
+                    exit(1);
+                }
+                value = buf;
+            } else if (field == "t/s") {
+                char buf[128];
+                snprintf(buf, sizeof(buf), "%.2f ± %.2f", t.avg_ts(), t.stdev_ts());
+                value = buf;
+            } else if (vmap.find(field) != vmap.end()) {
+                value = vmap.at(field);
+            } else {
+                assert(false);
+                exit(1);
+            }
+
+            int width = get_field_width(field);
+            if (field == "t/s") {
+                // HACK: the utf-8 character is 2 bytes
+                width += 1;
+            }
+            fprintf(fout, " %*s |", width, value.c_str());
+        }
+        fprintf(fout, "\n");
+    }
+
+    void print_footer() override {
+        fprintf(fout, "\nbuild: %s (%d)\n", test::build_commit.c_str(), test::build_number);
+    }
+};
+
+struct sql_printer : public printer {
+    static std::string get_sql_field_type(const std::string & field) {
+        switch (test::get_field_type(field)) {
+            case test::STRING:
+                return "TEXT";
+            case test::BOOL:
+            case test::INT:
+                return "INTEGER";
+            case test::FLOAT:
+                return "REAL";
+            default:
+                assert(false);
+                exit(1);
+        }
+    }
+
+    void print_header(const cmd_params & params) override {
+        std::vector<std::string> fields = test::get_fields();
+        fprintf(fout, "CREATE TABLE IF NOT EXISTS test (\n");
+        for (size_t i = 0; i < fields.size(); i++) {
+            fprintf(fout, "  %s %s%s\n", fields.at(i).c_str(), get_sql_field_type(fields.at(i)).c_str(),  i < fields.size() - 1 ? "," : "");
+        }
+        fprintf(fout, ");\n");
+        fprintf(fout, "\n");
+        (void) params;
+    }
+
+    void print_test(const test & t) override {
+        fprintf(fout, "INSERT INTO test (%s) ", join(test::get_fields(), ", ").c_str());
+        fprintf(fout, "VALUES (");
+        std::vector<std::string> values = t.get_values();
+        for (size_t i = 0; i < values.size(); i++) {
+            fprintf(fout, "'%s'%s", values.at(i).c_str(), i < values.size() - 1 ? ", " : "");
+        }
+        fprintf(fout, ");\n");
+    }
+};
+
+static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_batch, int n_threads) {
+    std::vector<llama_token> tokens(n_batch, llama_token_bos(ctx));
+    int n_processed = 0;
+    while (n_processed < n_prompt) {
+        int n_tokens = std::min(n_prompt - n_processed, n_batch);
+        llama_eval(ctx, tokens.data(), n_tokens, n_past + n_processed, n_threads);
+        n_processed += n_tokens;
+    }
+}
+
+static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads) {
+    llama_token token = llama_token_bos(ctx);
+    for (int i = 0; i < n_gen; i++) {
+        llama_eval(ctx, &token, 1, n_past + i, n_threads);
+    }
+}
+
+static void llama_null_log_callback(enum llama_log_level level, const char * text, void * user_data) {
+    (void) level;
+    (void) text;
+    (void) user_data;
+}
+
+int main(int argc, char ** argv) {
+#if !defined(NDEBUG)
+    fprintf(stderr, "warning: asserts enabled, performance may be affected\n");
+#endif
+
+#if (defined(_MSC_VER) && defined(_DEBUG)) || (!defined(_MSC_VER) && !defined(__OPTIMIZE__))
+    fprintf(stderr, "warning: debug build, performance may be affected\n");
+#endif
+
+#if defined(__SANITIZE_ADDRESS__) || defined(__SANITIZE_THREAD__)
+    fprintf(stderr, "warning: sanitizer enabled, performance may be affected\n");
+#endif
+
+    cmd_params params = parse_cmd_params(argc, argv);
+
+    // initialize llama.cpp
+    if (!params.verbose) {
+        llama_log_set(llama_null_log_callback, NULL);
+    }
+    bool numa = false;
+    llama_backend_init(numa);
+
+    // initialize printer
+    std::unique_ptr<printer> p;
+    switch (params.output_format) {
+        case CSV:
+            p.reset(new csv_printer());
+            break;
+        case JSON:
+            p.reset(new json_printer());
+            break;
+        case MARKDOWN:
+            p.reset(new markdown_printer());
+            break;
+        case SQL:
+            p.reset(new sql_printer());
+            break;
+        default:
+            assert(false);
+            exit(1);
+    }
+    p->fout = stdout;
+    p->print_header(params);
+
+    std::vector<cmd_params_instance> params_instances = get_cmd_params_instances(params);
+
+    for (const auto & inst : params_instances) {
+        // TODO: keep the model between tests when possible
+        llama_context_params lparams = inst.to_llama_params();
+
+        llama_model * lmodel  = llama_load_model_from_file(inst.model.c_str(), lparams);
+        if (lmodel == NULL) {
+            fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, inst.model.c_str());
+            return 1;
+        }
+
+        llama_context * ctx = llama_new_context_with_model(lmodel, lparams);
+        if (ctx == NULL) {
+            fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, inst.model.c_str());
+            llama_free_model(lmodel);
+            return 1;
+        }
+
+        test t(inst, lmodel, ctx);
+
+        // warmup run
+        test_gen(ctx, 1, 0, t.n_threads);
+
+        for (int i = 0; i < params.reps; i++) {
+            uint64_t t_start = get_time_ns();
+            if (t.n_prompt > 0) {
+                test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
+            }
+            if (t.n_gen > 0) {
+                test_gen(ctx, t.n_gen, t.n_prompt, t.n_threads);
+            }
+            uint64_t t_ns = get_time_ns() - t_start;
+            t.samples_ns.push_back(t_ns);
+        }
+
+        p->print_test(t);
+
+        llama_print_timings(ctx);
+
+        llama_free(ctx);
+        llama_free_model(lmodel);
+    }
+
+    p->print_footer();
+
+    llama_backend_free();
+
+    return 0;
+}
--- a/examples/llama.vim
+++ b/examples/llama.vim
@@ -0,0 +1,132 @@
+" Requires an already running llama.cpp server
+" To install either copy or symlink to ~/.vim/autoload/llama.vim
+" Then start with either :call llama#doLlamaGen(),
+" or add a keybind to your vimrc such as
+" nnoremap Z :call llama#doLlamaGen()<CR>
+" Similarly, you could add an insert mode keybind with
+" inoremap <C-B> <Cmd>call llama#doLlamaGen()<CR>
+"
+" g:llama_api_url and g:llama_overrides can be configured in your .vimrc
+" let g:llama_api_url = "192.168.1.10:8080"
+" llama_overrides can also be set through buffer/window scopes. For instance
+" autocmd filetype python let b:llama_overrides = {"temp": 0.2}
+" Could be added to your .vimrc to automatically set a lower temperature when
+" editing a python script
+" Additionally, an override dict can be stored at the top of a file
+" !*{"stop": ["User:"]}
+" Could be added to the start of your chatlog.txt to set the stopping token
+" These parameter dicts are merged together from lowest to highest priority:
+" server default -> g:llama_overrides -> w:llama_overrides ->
+" b:llama_overrides -> in file (!*) overrides
+"
+" Sublists (like logit_bias and stop) are overridden, not merged
+" Example override:
+" !*{"logit_bias": [[13, -5], [2, false]], "temperature": 1, "top_k": 5, "top_p": 0.5, "n_predict": 256, "repeat_last_n": 256, "repeat_penalty": 1.17647}
+if !exists("g:llama_api_url")
+    let g:llama_api_url= "127.0.0.1:8080"
+endif
+if !exists("g:llama_overrides")
+   let g:llama_overrides = {}
+endif
+const s:querydata = {"n_predict": 256, "stop": [ "\n" ], "stream": v:true }
+const s:curlcommand = ['curl','--data-raw', "{\"prompt\":\"### System:\"}", '--silent', '--no-buffer', '--request', 'POST', '--url', g:llama_api_url .. '/completion', '--header', "Content-Type: application/json"]
+let s:linedict = {}
+
+func s:callbackHandler(bufn, channel, msg)
+   if len(a:msg) < 3
+      return
+   elseif a:msg[0] == "d"
+      let l:msg = a:msg[6:-1]
+   else
+      let l:msg = a:msg
+   endif
+   let l:decoded_msg = json_decode(l:msg)
+   let l:newtext = split(l:decoded_msg['content'], "\n", 1)
+   if len(l:newtext) > 0
+      call setbufline(a:bufn, s:linedict[a:bufn], getbufline(a:bufn, s:linedict[a:bufn])[0] .. newtext[0])
+   else
+      echo "nothing genned"
+   endif
+   if len(newtext) > 1
+      let l:failed = appendbufline(a:bufn, s:linedict[a:bufn], newtext[1:-1])
+      let s:linedict[a:bufn] = s:linedict[a:bufn] + len(newtext)-1
+   endif
+   if has_key(l:decoded_msg, "stop") && l:decoded_msg.stop
+       echo "Finished generation"
+   endif
+endfunction
+
+func llama#doLlamaGen()
+   if exists("b:job")
+      if job_status(b:job) == "run"
+         call job_stop(b:job)
+         return
+      endif
+   endif
+
+   let l:cbuffer = bufnr("%")
+   let s:linedict[l:cbuffer] = line('$')
+   let l:buflines = getbufline(l:cbuffer, 1, 1000)
+   let l:querydata = copy(s:querydata)
+   call extend(l:querydata, g:llama_overrides)
+   if exists("w:llama_overrides")
+      call extend(l:querydata, w:llama_overrides)
+   endif
+   if exists("b:llama_overrides")
+      call extend(l:querydata, b:llama_overrides)
+   endif
+   if l:buflines[0][0:1] == '!*'
+      let l:userdata = json_decode(l:buflines[0][2:-1])
+      call extend(l:querydata, l:userdata)
+      let l:buflines = l:buflines[1:-1]
+   endif
+   let l:querydata.prompt = join(l:buflines, "\n")
+   let l:curlcommand = copy(s:curlcommand)
+   let l:curlcommand[2] = json_encode(l:querydata)
+   let b:job = job_start(l:curlcommand, {"callback": function("s:callbackHandler", [l:cbuffer])})
+endfunction
+
+" Echos the tokkenization of the provided string , or cursor to end of word
+" Onus is placed on the user to include the preceding space
+func llama#tokenizeWord(...)
+    if (a:0 > 0)
+        let l:input = a:1
+    else
+        exe "normal \"*ye"
+        let l:input = @*
+    endif
+    let l:querydata = {"content": l:input}
+    let l:curlcommand = copy(s:curlcommand)
+    let l:curlcommand[2] = json_encode(l:querydata)
+    let l:curlcommand[8] = g:llama_api_url .. "/tokenize"
+   let s:token_job = job_start(l:curlcommand, {"callback": function("s:tokenizeWordCallback", [l:input])})
+endfunction
+
+func s:tokenizeWordCallback(plaintext, channel, msg)
+    echo '"' .. a:plaintext ..'" - ' .. string(json_decode(a:msg).tokens)
+endfunction
+
+
+" Echos the token count of the entire buffer (or provided string)
+" Example usage :echo llama#tokenCount()
+func llama#tokenCount(...)
+    if (a:0 > 0)
+        let l:buflines = a:1
+    else
+        let l:buflines = getline(1,1000)
+        if l:buflines[0][0:1] == '!*'
+            let l:buflines = l:buflines[1:-1]
+        endif
+        let l:buflines = join(l:buflines, "\n")
+    endif
+    let l:querydata = {"content": l:buflines}
+    let l:curlcommand = copy(s:curlcommand)
+    let l:curlcommand[2] = json_encode(l:querydata)
+    let l:curlcommand[8] = g:llama_api_url .. "/tokenize"
+   let s:token_job = job_start(l:curlcommand, {"callback": "s:tokenCountCallback"})
+endfunction
+
+func s:tokenCountCallback(channel, msg)
+    let resp = json_decode(a:msg)
+    echo len(resp.tokens)
+endfunction
--- a/examples/llama2-13b.sh
+++ b/examples/llama2-13b.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+#
+# Temporary script - will be removed in the future
+#
+
+cd `dirname $0`
+cd ..
+
+./main -m models/available/Llama2/13B/llama-2-13b.ggmlv3.q4_0.bin \
+       --color \
+       --ctx_size 2048 \
+       -n -1 \
+       -ins -b 256 \
+       --top_k 10000 \
+       --temp 0.2 \
+       --repeat_penalty 1.1 \
+       -t 8
--- a/examples/llama2.sh
+++ b/examples/llama2.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+#
+# Temporary script - will be removed in the future
+#
+
+cd `dirname $0`
+cd ..
+
+./main -m models/available/Llama2/7B/llama-2-7b.ggmlv3.q4_0.bin \
+       --color \
+       --ctx_size 2048 \
+       -n -1 \
+       -ins -b 256 \
+       --top_k 10000 \
+       --temp 0.2 \
+       --repeat_penalty 1.1 \
+       -t 8
--- a/examples/llm.vim
+++ b/examples/llm.vim
@@ -0,0 +1,27 @@
+" Basic plugin example
+
+function! Llm()
+
+  let url = "http://127.0.0.1:8080/completion"
+
+  " Get the content of the current buffer
+  let buffer_content = join(getline(1, '$'), "\n")
+
+  " Create the JSON payload
+  let json_payload = {"temp":0.72,"top_k":100,"top_p":0.73,"repeat_penalty":1.100000023841858,"n_predict":10,"stream": v:false}
+  let json_payload.prompt = buffer_content
+
+  " Define the curl command
+  let curl_command = 'curl -k -s -X POST -H "Content-Type: application/json" -d @- ' . url
+  let response = system(curl_command, json_encode(json_payload))
+
+  " Extract the content field from the response
+  let content = json_decode(response).content
+
+  let split_newlines = split(content, '\n', 1)
+
+  " Insert the content at the cursor position
+  call setline(line('.'), [ getline('.') . split_newlines[0] ] + split_newlines[1:])
+endfunction
+
+command! Llm call Llm()
--- a/examples/main/CMakeLists.txt
+++ b/examples/main/CMakeLists.txt
@@ -1,5 +1,6 @@
 set(TARGET main)
 add_executable(${TARGET} main.cpp)
+install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if(TARGET BUILD_INFO)
--- a/examples/main/README.md
+++ b/examples/main/README.md
@@ -140,6 +140,12 @@ The `--ctx-size` option allows you to set the size of the prompt context used by

 -   `-c N, --ctx-size N`: Set the size of the prompt context (default: 512). The LLaMA models were built with a context of 2048, which will yield the best results on longer input/inference. However, increasing the context size beyond 2048 may lead to unpredictable results.

+### Extended Context Size
+
+Some fine-tuned models have extened the context length by scaling RoPE. For example, if the original pretrained model have a context length (max sequence length) of 4096 (4k) and the fine-tuned model have 32k. That is a scaling factor of 8, and should work by setting the above `--ctx-size` to 32768 (32k) and `--rope-scale` to 8.
+
+- `--rope-scale N`: Where N is the linear scaling factor used by the fine-tuned model.
+
 ### Keep Prompt

 The `--keep` option allows users to retain the original prompt when the model runs out of context, ensuring a connection to the initial instruction or conversation topic is maintained.
@@ -154,9 +160,13 @@ The following options allow you to control the text generation process and fine-

 ### Number of Tokens to Predict

-   `-n N, --n-predict N`: Set the number of tokens to predict when generating text (default: 128, -1 = infinity).
+-   `-n N, --n-predict N`: Set the number of tokens to predict when generating text (default: 128, -1 = infinity, -2 = until context filled)

-The `--n-predict` option controls the number of tokens the model generates in response to the input prompt. By adjusting this value, you can influence the length of the generated text. A higher value will result in longer text, while a lower value will produce shorter text. A value of -1 will cause text to be generated without limit.
+The `--n-predict` option controls the number of tokens the model generates in response to the input prompt. By adjusting this value, you can influence the length of the generated text. A higher value will result in longer text, while a lower value will produce shorter text.
+
+A value of -1 will enable infinite text generation, even though we have a finite context window. When the context window is full, some of the earlier tokens (half of the tokens after `--n-keep`) will be discarded. The context must then be re-evaluated before generation can resume. On large models and/or large context windows, this will result in significant pause in output.
+
+If the pause is undesirable, a value of -2 will stop generation immediately when the context is filled.

 It is important to note that the generated text may be shorter than the specified number of tokens if an End-of-Sequence (EOS) token or a reverse prompt is encountered. In interactive mode text generation will pause and control will be returned to the user. In non-interactive mode, the program will end. In both cases, the text generation may stop before reaching the specified `n-predict` value. If you want the model to keep going without ever producing End-of-Sequence on its own, you can use the `--ignore-eos` parameter.

@@ -202,9 +212,9 @@ Example usage: `--top-p 0.95`

 -   `--tfs N`: Enable tail free sampling with parameter z (default: 1.0, 1.0 = disabled).

-Tail free sampling (TFS) is a text generation technique that aims to reduce the impact of less likely tokens, which may be less relevant, less coherent, or nonsensical, on the output. The method adjusts the logits (token probabilities) by raising them to the power of the parameter z. A higher value of z (e.g., 2.0) will further suppress less likely tokens from the tail of the distribution, while a value of 1.0 disables the effect of TFS. By setting the parameter z, you can control how much the probabilities of less likely tokens are reduced.
+Tail free sampling (TFS) is a text generation technique that aims to reduce the impact of less likely tokens, which may be less relevant, less coherent, or nonsensical, on the output. Similar to Top-P it tries to determine the bulk of the most likely tokens dynamically. But TFS filters out logits based on the second derivative of their probabilities. Adding tokens is stopped after the sum of the second derivatives reaches the parameter z. In short: TFS looks how quickly the probabilities of the tokens decrease and cuts off the tail of unlikely tokens using the parameter z. Typical values for z are in the range of 0.9 to 0.95. A value of 1.0 would include all tokens, and thus disables the effect of TFS.

-Example usage: `--tfs 2.0`
+Example usage: `--tfs 0.95`

 ### Locally Typical Sampling

@@ -278,6 +288,10 @@ These options help improve the performance and memory usage of the LLaMA models.

 -   `--prompt-cache FNAME`: Specify a file to cache the model state after the initial prompt. This can significantly speed up the startup time when you're using longer prompts. The file is created during the first run and is reused and updated in subsequent runs. **Note**: Restoring a cached prompt does not imply restoring the exact state of the session at the point it was saved. So even when specifying a specific seed, you are not guaranteed to get the same sequence of tokens as the original generation.

+### Grammars
+
+-   `--grammar GRAMMAR`, `--grammar-file FILE`: Specify a grammar (defined inline or in a file) to constrain model output to a specific format. For example, you could force the model to output JSON or to speak only in emojis. See the [GBNF guide](../../grammars/README.md) for details on the syntax.
+
 ### Quantization

 For information about 4-bit quantization, which can significantly improve performance and reduce memory usage, please refer to llama.cpp's primary [README](../../README.md#prepare-data--run).
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -4,8 +4,10 @@
 #endif

 #include "common.h"
+#include "console.h"
 #include "llama.h"
 #include "build-info.h"
+#include "grammar-parser.h"

 #include <cassert>
 #include <cinttypes>
@@ -34,9 +36,7 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif

-static console_state con_st;
 static llama_context ** g_ctx;
-
 static bool is_interacting = false;

 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
@@ -45,7 +45,7 @@ void sigint_handler(int signo) {
        if (!is_interacting) {
            is_interacting=true;
        } else {
-            console_cleanup(con_st);
+            console::cleanup();
            printf("\n");
            llama_print_timings(*g_ctx);
            _exit(130);
@@ -63,10 +63,8 @@ int main(int argc, char ** argv) {

    // save choice to use color for later
    // (note for later: this is a slightly awkward choice)
-    con_st.use_color = params.use_color;
-    con_st.multiline_input = params.multiline_input;
-    console_init(con_st);
-    atexit([]() { console_cleanup(con_st); });
+    console::init(params.simple_io, params.use_color);
+    atexit([]() { console::cleanup(); });

    if (params.perplexity) {
        printf("\n************\n");
@@ -93,8 +91,8 @@ int main(int argc, char ** argv) {
    }

    if (params.n_ctx > 2048) {
-        fprintf(stderr, "%s: warning: base model only supports context sizes no greater than 2048 tokens (%d specified);"
-                " you are on your own\n", __func__, params.n_ctx);
+        // TODO: determine the actual max context of the model (e.g. 4096 for LLaMA v2) and use that instead of 2048
+        fprintf(stderr, "%s: warning: base model only supports context sizes no greater than 2048 tokens (%d specified)\n", __func__, params.n_ctx);
    } else if (params.n_ctx < 8) {
        fprintf(stderr, "%s: warning: minimum context size is 8, using minimum size.\n", __func__);
        params.n_ctx = 8;
@@ -139,17 +137,14 @@ int main(int argc, char ** argv) {
                params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
    }

-    // determine the maximum memory usage needed to do inference for the given n_batch and n_predict parameters
+    // determine the maximum memory usage needed to do inference for the given n_batch and n_ctx parameters
    // uncomment the "used_mem" line in llama.cpp to see the results
    if (params.mem_test) {
        {
-            const std::vector<llama_token> tmp(params.n_batch, llama_token_bos());
-            llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads);
-        }
+            fprintf(stderr, "%s: testing memory usage for n_batch = %d, n_ctx = %d\n", __func__, params.n_batch, params.n_ctx);

-        {
-            const std::vector<llama_token> tmp = { 0, };
-            llama_eval(ctx, tmp.data(), tmp.size(), params.n_predict - 1, params.n_threads);
+            const std::vector<llama_token> tmp(params.n_batch, llama_token_bos(ctx));
+            llama_eval(ctx, tmp.data(), tmp.size(), params.n_ctx, params.n_threads);
        }

        llama_print_timings(ctx);
@@ -196,10 +191,6 @@ int main(int argc, char ** argv) {

    // tokenize the prompt
    std::vector<llama_token> embd_inp;
-
-    // Add a space in front of the first character to match OG llama tokenizer behavior
-    params.prompt.insert(0, 1, ' ');
-
    if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
        embd_inp = ::llama_tokenize(ctx, params.prompt, true);
    } else {
@@ -275,15 +266,12 @@ int main(int argc, char ** argv) {
        params.interactive = true;
    }

-    // determine newline token
-    auto llama_token_newline = ::llama_tokenize(ctx, "\n", false);
-
    if (params.verbose_prompt) {
        fprintf(stderr, "\n");
        fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
        fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
        for (int i = 0; i < (int) embd_inp.size(); i++) {
-            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]));
+            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]).c_str());
        }

        if (ctx_guidance) {
@@ -291,14 +279,14 @@ int main(int argc, char ** argv) {
            fprintf(stderr, "%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str());
            fprintf(stderr, "%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
            for (int i = 0; i < (int) guidance_inp.size(); i++) {
-                fprintf(stderr, "%6d -> '%s'\n", guidance_inp[i], llama_token_to_str(ctx, guidance_inp[i]));
+                fprintf(stderr, "%6d -> '%s'\n", guidance_inp[i], llama_token_to_str(ctx, guidance_inp[i]).c_str());
            }
        }

        if (params.n_keep > 0) {
        fprintf(stderr, "%s: static prompt based on n_keep: '", __func__);
            for (int i = 0; i < params.n_keep; i++) {
-                fprintf(stderr, "%s", llama_token_to_str(ctx, embd_inp[i]));
+                fprintf(stderr, "%s", llama_token_to_str(ctx, embd_inp[i]).c_str());
            }
            fprintf(stderr, "'\n");
        }
@@ -316,7 +304,7 @@ int main(int argc, char ** argv) {
        auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
            return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false;
        };
-        SetConsoleCtrlHandler(static_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
+        SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
 #endif

        fprintf(stderr, "%s: interactive mode on.\n", __func__);
@@ -327,6 +315,10 @@ int main(int argc, char ** argv) {
            }
        }

+        if (params.input_prefix_bos) {
+            fprintf(stderr, "Input prefix with BOS\n");
+        }
+
        if (!params.input_prefix.empty()) {
            fprintf(stderr, "Input prefix: '%s'\n", params.input_prefix.c_str());
        }
@@ -340,13 +332,37 @@ int main(int argc, char ** argv) {
    fprintf(stderr, "generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
    fprintf(stderr, "\n\n");

+    grammar_parser::parse_state parsed_grammar;
+    llama_grammar *             grammar = NULL;
+    if (!params.grammar.empty()) {
+        parsed_grammar = grammar_parser::parse(params.grammar.c_str());
+        // will be empty (default) if there are parse errors
+        if (parsed_grammar.rules.empty()) {
+            return 1;
+        }
+        fprintf(stderr, "%s: grammar:\n", __func__);
+        grammar_parser::print_grammar(stderr, parsed_grammar);
+        fprintf(stderr, "\n");
+
+        {
+            auto it = params.logit_bias.find(llama_token_eos(ctx));
+            if (it != params.logit_bias.end() && it->second == -INFINITY) {
+                fprintf(stderr, "%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
+            }
+        }
+
+        std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
+        grammar = llama_grammar_init(
+            grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
+    }
+
    // TODO: replace with ring-buffer
    std::vector<llama_token> last_n_tokens(n_ctx);
    std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);

    if (params.interactive) {
        const char *control_message;
-        if (con_st.multiline_input) {
+        if (params.multiline_input) {
            control_message = " - To return control to LLaMa, end your input with '\\'.\n"
                              " - To return control without starting a new line, end your input with '/'.\n";
        } else {
@@ -374,14 +390,14 @@ int main(int argc, char ** argv) {
    int n_past_guidance    = 0;

    // the first thing we will do is to output the prompt, so set color accordingly
-    console_set_color(con_st, CONSOLE_COLOR_PROMPT);
+    console::set_display(console::prompt);

    std::vector<llama_token> embd;
    std::vector<llama_token> embd_guidance;

    // do one empty run to warm up the model
    {
-        const std::vector<llama_token> tmp = { llama_token_bos(), };
+        const std::vector<llama_token> tmp = { llama_token_bos(ctx), };
        llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads);
        llama_reset_timings(ctx);
    }
@@ -395,9 +411,9 @@ int main(int argc, char ** argv) {
            // Ensure the input doesn't exceed the context size by truncating embd if necessary.
            if ((int)embd.size() > max_embd_size) {
                auto skipped_tokens = embd.size() - max_embd_size;
-                console_set_color(con_st, CONSOLE_COLOR_ERROR);
+                console::set_display(console::error);
                printf("<<input too long: skipped %zu token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
-                console_set_color(con_st, CONSOLE_COLOR_DEFAULT);
+                console::set_display(console::reset);
                fflush(stdout);
                embd.resize(max_embd_size);
            }
@@ -407,8 +423,12 @@ int main(int argc, char ** argv) {
            // - take the n_keep first tokens from the original prompt (via n_past)
            // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
            if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) > n_ctx) {
-                const int n_left = n_past - params.n_keep;
+                if (params.n_predict == -2) {
+                    fprintf(stderr, "\n\n%s: context full, stopping generation\n", __func__);
+                    break;
+                }

+                const int n_left = n_past - params.n_keep;
                // always keep the first token - BOS
                n_past = std::max(1, params.n_keep);
                n_past_guidance = std::max(1, params.n_keep + guidance_offset);
@@ -557,11 +577,11 @@ int main(int argc, char ** argv) {
                llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };

                if (ctx_guidance) {
-                    llama_sample_classifier_free_guidance(ctx, &candidates_p, ctx_guidance, params.cfg_scale, params.cfg_smooth_factor);
+                    llama_sample_classifier_free_guidance(ctx, &candidates_p, ctx_guidance, params.cfg_scale);
                }

                // Apply penalties
-                float nl_logit = logits[llama_token_nl()];
+                float nl_logit = logits[llama_token_nl(ctx)];
                auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
                llama_sample_repetition_penalty(ctx, &candidates_p,
                    last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
@@ -570,7 +590,11 @@ int main(int argc, char ** argv) {
                    last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
                    last_n_repeat, alpha_frequency, alpha_presence);
                if (!penalize_nl) {
-                    logits[llama_token_nl()] = nl_logit;
+                    logits[llama_token_nl(ctx)] = nl_logit;
+                }
+
+                if (grammar != NULL) {
+                    llama_sample_grammar(ctx, &candidates_p, grammar);
                }

                if (temp <= 0) {
@@ -598,20 +622,14 @@ int main(int argc, char ** argv) {
                }
                // printf("`%d`", candidates_p.size);

+                if (grammar != NULL) {
+                    llama_grammar_accept_token(ctx, grammar, id);
+                }
+
                last_n_tokens.erase(last_n_tokens.begin());
                last_n_tokens.push_back(id);
            }

-            // replace end of text token with newline token when in interactive mode
-            if (id == llama_token_eos() && params.interactive && !params.instruct) {
-                id = llama_token_newline.front();
-                if (params.antiprompt.size() != 0) {
-                    // tokenize and inject first reverse prompt
-                    const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false);
-                    embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
-                }
-            }
-
            // add it to the context
            embd.push_back(id);

@@ -636,13 +654,13 @@ int main(int argc, char ** argv) {
        // display text
        if (input_echo) {
            for (auto id : embd) {
-                printf("%s", llama_token_to_str(ctx, id));
+                printf("%s", llama_token_to_str(ctx, id).c_str());
            }
            fflush(stdout);
        }
        // reset color to default if we there is no pending user input
        if (input_echo && (int)embd_inp.size() == n_consumed) {
-            console_set_color(con_st, CONSOLE_COLOR_DEFAULT);
+            console::set_display(console::reset);
        }

        // if not currently processing queued inputs;
@@ -668,7 +686,7 @@ int main(int argc, char ** argv) {
                    if (last_output.find(antiprompt.c_str(), search_start_pos) != std::string::npos) {
                        if (params.interactive) {
                            is_interacting = true;
-                            console_set_color(con_st, CONSOLE_COLOR_USER_INPUT);
+                            console::set_display(console::user_input);
                        }
                        is_antiprompt = true;
                        fflush(stdout);
@@ -677,11 +695,34 @@ int main(int argc, char ** argv) {
                }
            }

+            // deal with end of text token in interactive mode
+            if (last_n_tokens.back() == llama_token_eos(ctx)) {
+                if (params.interactive) {
+                    if (params.antiprompt.size() != 0) {
+                        // tokenize and inject first reverse prompt
+                        const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false);
+                        embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
+                        is_antiprompt = true;
+                    }
+
+                    is_interacting = true;
+                    printf("\n");
+                    console::set_display(console::user_input);
+                    fflush(stdout);
+                } else if (params.instruct) {
+                    is_interacting = true;
+                }
+            }
+
            if (n_past > 0 && is_interacting) {
                if (params.instruct) {
                    printf("\n> ");
                }

+                if (params.input_prefix_bos) {
+                    embd_inp.push_back(llama_token_bos(ctx));
+                }
+
                std::string buffer;
                if (!params.input_prefix.empty()) {
                    buffer += params.input_prefix;
@@ -691,12 +732,12 @@ int main(int argc, char ** argv) {
                std::string line;
                bool another_line = true;
                do {
-                    another_line = console_readline(con_st, line);
+                    another_line = console::readline(line, params.multiline_input);
                    buffer += line;
                } while (another_line);

                // done taking input, reset color
-                console_set_color(con_st, CONSOLE_COLOR_DEFAULT);
+                console::set_display(console::reset);

                // Add tokens to embd only if the input buffer is non-empty
                // Entering a empty line lets the user pass control back
@@ -728,18 +769,25 @@ int main(int argc, char ** argv) {
            }

            if (n_past > 0) {
+                if (is_interacting) {
+                    // reset grammar state if we're restarting generation
+                    if (grammar != NULL) {
+                        llama_grammar_free(grammar);
+
+                        std::vector<const llama_grammar_element *> grammar_rules( parsed_grammar.c_rules());
+                        grammar = llama_grammar_init(
+                            grammar_rules.data(), grammar_rules.size(),
+                            parsed_grammar.symbol_ids.at("root"));
+                    }
+                }
                is_interacting = false;
            }
        }

        // end of text token
-        if (!embd.empty() && embd.back() == llama_token_eos()) {
-            if (params.instruct) {
-                is_interacting = true;
-            } else {
-                fprintf(stderr, " [end of text]\n");
-                break;
-            }
+        if (!embd.empty() && embd.back() == llama_token_eos(ctx) && !(params.instruct || params.interactive)) {
+            fprintf(stderr, " [end of text]\n");
+            break;
        }

        // In interactive mode, respect the maximum number of tokens and drop back to user input when reached.
@@ -759,6 +807,9 @@ int main(int argc, char ** argv) {
    llama_free(ctx);
    llama_free_model(model);

+    if (grammar != NULL) {
+        llama_grammar_free(grammar);
+    }
    llama_backend_free();

    return 0;
--- a/examples/make-ggml.py
+++ b/examples/make-ggml.py
@@ -0,0 +1,92 @@
+"""
+This script converts Hugging Face llama models to GGML and quantizes them.
+
+Usage:
+python make-ggml.py --model {model_dir_or_hf_repo_name} [--outname {output_name} (Optional)] [--outdir {output_directory} (Optional)] [--quants {quant_types} (Optional)] [--keep_fp16 (Optional)]
+
+Arguments:
+- --model: (Required) The directory of the downloaded Hugging Face model or the name of the Hugging Face model repository. If the model directory does not exist, it will be downloaded from the Hugging Face model hub.
+- --outname: (Optional) The name of the output model. If not specified, the last part of the model directory path or the Hugging Face model repo name will be used.
+- --outdir: (Optional) The directory where the output model(s) will be stored. If not specified, '../models/{outname}' will be used.
+- --quants: (Optional) The types of quantization to apply. This should be a space-separated list. The default is 'Q4_K_M Q5_K_S'.
+- --keep_fp16: (Optional) If specified, the FP16 model will not be deleted after the quantized models are created.
+
+Quant types:
+- Q4_0: small, very high quality loss - legacy, prefer using Q3_K_M
+- Q4_1: small, substantial quality loss - legacy, prefer using Q3_K_L
+- Q5_0: medium, balanced quality - legacy, prefer using Q4_K_M
+- Q5_1: medium, low quality loss - legacy, prefer using Q5_K_M
+- Q2_K: smallest, extreme quality loss - not recommended
+- Q3_K: alias for Q3_K_M
+- Q3_K_S: very small, very high quality loss
+- Q3_K_M: very small, very high quality loss
+- Q3_K_L: small, substantial quality loss
+- Q4_K: alias for Q4_K_M
+- Q4_K_S: small, significant quality loss
+- Q4_K_M: medium, balanced quality - recommended
+- Q5_K: alias for Q5_K_M
+- Q5_K_S: large, low quality loss - recommended
+- Q5_K_M: large, very low quality loss - recommended
+- Q6_K: very large, extremely low quality loss
+- Q8_0: very large, extremely low quality loss - not recommended
+- F16: extremely large, virtually no quality loss - not recommended
+- F32: absolutely huge, lossless - not recommended
+"""
+import subprocess
+subprocess.run(f"pip install huggingface-hub==0.16.4", shell=True, check=True)
+
+import argparse
+import os
+from huggingface_hub import snapshot_download
+
+def main(model, outname, outdir, quants, keep_fp16):
+    ggml_version = "v3"
+
+    if not os.path.isdir(model):
+        print(f"Model not found at {model}. Downloading...")
+        try:
+            if outname is None:
+                outname = model.split('/')[-1]
+            model = snapshot_download(repo_id=model, cache_dir='../models/hf_cache')
+        except Exception as e:
+            raise Exception(f"Could not download the model: {e}")
+
+    if outdir is None:
+        outdir = f'../models/{outname}'
+
+    if not os.path.isfile(f"{model}/config.json"):
+        raise Exception(f"Could not find config.json in {model}")
+
+    os.makedirs(outdir, exist_ok=True)
+
+    print("Building llama.cpp")
+    subprocess.run(f"cd .. && make quantize", shell=True, check=True)
+
+    fp16 = f"{outdir}/{outname}.ggml{ggml_version}.fp16.bin"
+
+    print(f"Making unquantised GGML at {fp16}")
+    if not os.path.isfile(fp16):
+        subprocess.run(f"python3 ../convert.py {model} --outtype f16 --outfile {fp16}", shell=True, check=True)
+    else:
+        print(f"Unquantised GGML already exists at: {fp16}")
+
+    print("Making quants")
+    for type in quants:
+        outfile = f"{outdir}/{outname}.ggml{ggml_version}.{type}.bin"
+        print(f"Making {type} : {outfile}")
+        subprocess.run(f"../quantize {fp16} {outfile} {type}", shell=True, check=True)
+
+    if not keep_fp16:
+        os.remove(fp16)
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='Convert/Quantize HF to GGML. If you have the HF model downloaded already, pass the path to the model dir. Otherwise, pass the Hugging Face model repo name. You need to be in the /examples folder for it to work.')
+    parser.add_argument('--model', required=True, help='Downloaded model dir or Hugging Face model repo name')
+    parser.add_argument('--outname', default=None, help='Output model(s) name')
+    parser.add_argument('--outdir', default=None, help='Output directory')
+    parser.add_argument('--quants', nargs='*', default=["Q4_K_M", "Q5_K_S"], help='Quant types')
+    parser.add_argument('--keep_fp16', action='store_true', help='Keep fp16 model', default=False)
+
+    args = parser.parse_args()
+
+    main(args.model, args.outname, args.outdir, args.quants, args.keep_fp16)
--- a/examples/metal/CMakeLists.txt
+++ b/examples/metal/CMakeLists.txt
@@ -1,3 +1,4 @@
 set(TEST_TARGET metal)
 add_executable(${TEST_TARGET} metal.cpp)
+install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TEST_TARGET} PRIVATE ggml)
--- a/examples/metal/metal.cpp
+++ b/examples/metal/metal.cpp
@@ -2,7 +2,7 @@
 //
 // - First, export a LLaMA graph:
 //
-//  $ ./bin/main -m ../models/7B/ggml-model-q4_0.bin --export
+//  $ ./bin/main -m ../models/7B/ggml-model-q4_0.gguf --export
 //
 // - Run this tool to evaluate the exported graph:
 //
--- a/examples/perplexity/CMakeLists.txt
+++ b/examples/perplexity/CMakeLists.txt
@@ -1,5 +1,6 @@
 set(TARGET perplexity)
 add_executable(${TARGET} perplexity.cpp)
+install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if(TARGET BUILD_INFO)
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -4,6 +4,8 @@

 #include <cmath>
 #include <ctime>
+#include <sstream>
+#include <cstring>

 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -32,13 +34,15 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
    // BOS tokens will be added for each chunk before eval
    auto tokens = ::llama_tokenize(ctx, params.prompt, true);

-    int count   = 0;
+    const int n_chunk_max = tokens.size() / params.n_ctx;

-    const int n_chunk = tokens.size() / params.n_ctx;
+    const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
    const int n_vocab = llama_n_vocab(ctx);
    const int n_batch = params.n_batch;

+    int count = 0;
    double nll = 0.0;
+
    fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);

    for (int i = 0; i < n_chunk; ++i) {
@@ -60,7 +64,7 @@ void perplexity(llama_context * ctx, const gpt_params & params) {

            // add BOS token for the first batch of each chunk
            if (j == 0) {
-                tokens[batch_start] = llama_token_bos();
+                tokens[batch_start] = llama_token_bos(ctx);
            }

            if (llama_eval(ctx, tokens.data() + batch_start, batch_size, j * n_batch, params.n_threads)) {
@@ -85,7 +89,7 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
                fprintf(stderr, "%d hours ", total_seconds / (60*60));
                total_seconds = total_seconds % (60*60);
            }
-            fprintf(stderr, "%d minutes\n", total_seconds / 60);
+            fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
        }

        // We get the logits for all the tokens in the context window (params.n_ctx)
@@ -118,6 +122,242 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
    printf("\n");
 }

+std::vector<float> hellaswag_evaluate_tokens(llama_context * ctx, const std::vector<int>& tokens, int n_past, int n_batch,
+        int n_vocab, int n_thread) {
+    std::vector<float> result;
+    result.reserve(tokens.size() * n_vocab);
+    size_t n_chunk = (tokens.size() + n_batch - 1)/n_batch;
+    for (size_t i_chunk = 0; i_chunk < n_chunk; ++i_chunk) {
+        size_t n_tokens = tokens.size() - i_chunk * n_batch;
+        n_tokens = std::min(n_tokens, size_t(n_batch));
+        if (llama_eval(ctx, tokens.data() + i_chunk * n_batch, n_tokens, n_past, n_thread)) {
+            fprintf(stderr, "%s : failed to eval\n", __func__);
+            return {};
+        }
+
+        const auto logits = llama_get_logits(ctx);
+        result.insert(result.end(), logits, logits + n_tokens * n_vocab);
+
+        n_past += n_tokens;
+    }
+    return result;
+}
+
+void hellaswag_score(llama_context * ctx, const gpt_params & params) {
+    // Calculates hellaswag score (acc_norm) from prompt
+    //
+    // Data extracted from the HellaSwag validation dataset (MIT license) https://github.com/rowanz/hellaswag/blob/master/data/hellaswag_val.jsonl
+    // All used data fields are preprocessed as in https://github.com/EleutherAI/lm-evaluation-harness/blob/df3da98c5405deafd519c2ddca52bb7c3fe36bef/lm_eval/tasks/hellaswag.py#L62-L68
+    //
+    // All 10042 tasks should be extracted to keep the results standardized like other implementations.
+    //
+    // Datafile layout:
+    // ['??'] denotes json fields
+    // 6 lines per task:
+    // ['activity_label'] + ": " +['ctx']  - The first part of the query, the context
+    // ['label'] - The index the best common sense ending aka gold ending
+    // ['endings'][0] - Endings added to the first part of the query
+    // ['endings'][1]
+    // ['endings'][2]
+    // ['endings'][3]
+
+    std::vector<std::string> prompt_lines;
+    std::istringstream strstream(params.prompt);
+    std::string line;
+
+    while (std::getline(strstream,line,'\n')) {
+        prompt_lines.push_back(line);
+    }
+
+    if( prompt_lines.size() % 6 != 0) {
+        fprintf(stderr, "%s : number of lines in prompt not a multiple of 6.\n", __func__);
+        return;
+    }
+
+    size_t hs_task_count = prompt_lines.size()/6;
+    fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count);
+
+    // This is needed as usual for LLaMA models
+    bool prepend_bos = true;
+
+    // Number of tasks to use when computing the score
+    if ( params.hellaswag_tasks < hs_task_count  ) {
+        hs_task_count = params.hellaswag_tasks;
+    }
+
+    // The tasks should be randomized so the score stabilizes quickly.
+    bool randomize_tasks = true;
+
+    // The random seed should not impact the final result if the computation is done over enough tasks, so kept hardcoded for now
+    std::mt19937 rng(1);
+
+    // Dataholder for hellaswag tasks
+    struct hs_data_t {
+        std::string context;
+        size_t gold_ending_idx;
+        std::string ending[4];
+        size_t ending_logprob_count[4];
+        double ending_logprob[4];
+    };
+
+    fprintf(stderr, "%s : selecting %zu %s tasks.\n", __func__, hs_task_count, (randomize_tasks?"randomized":"the first")  );
+
+    // Select and read data from prompt lines
+    hs_data_t *hs_data = new hs_data_t[hs_task_count];
+    for (size_t i=0; i < hs_task_count; i++) {
+        size_t idx = i;
+
+        // Select a random example of those left in the prompt
+        if (randomize_tasks) {
+            std::uniform_int_distribution<size_t> dist(0, prompt_lines.size()/6-1 ) ;
+            idx = dist(rng);
+        }
+
+        hs_data[i].context = prompt_lines[idx*6];
+        hs_data[i].gold_ending_idx = std::stoi( prompt_lines[idx*6+1] );
+        for (size_t j=0; j < 4; j++) {
+            hs_data[i].ending[j] = " " + prompt_lines[idx*6+2+j];
+        }
+
+        // Delete the selected random example from the prompt
+        if (randomize_tasks) {
+            prompt_lines.erase( std::next(prompt_lines.begin(),idx*6)  , std::next(prompt_lines.begin(),idx*6+6) );
+        }
+    }
+
+    fprintf(stderr, "%s : calculating hellaswag score over selected tasks.\n", __func__);
+    printf("\ntask\tacc_norm\n");
+
+    double acc = 0.0f;
+    const int n_vocab = llama_n_vocab(ctx);
+
+    std::vector<float> tok_logits(n_vocab);
+
+    for (size_t task_idx = 0; task_idx < hs_task_count; task_idx++) {
+
+        // Tokenize the context to count tokens
+        std::vector<int> context_embd = ::llama_tokenize(ctx, hs_data[task_idx].context, prepend_bos);
+        size_t context_size = context_embd.size();
+
+        // Do the 1st ending
+        // In this case we include the context when evaluating
+        auto query_embd = ::llama_tokenize(ctx, hs_data[task_idx].context + hs_data[task_idx].ending[0], prepend_bos);
+        auto query_size = query_embd.size();
+        //printf("First query: %d\n",(int)query_size);
+
+        // Stop if query wont fit the ctx window
+        if (query_size > (size_t)params.n_ctx) {
+            fprintf(stderr, "%s : number of tokens in query %zu > n_ctxl\n", __func__, query_size);
+            return;
+        }
+
+        // Speedup small evaluations by evaluating atleast 32 tokens
+        if (query_size < 32) {
+            query_embd.resize(32);
+        }
+
+        auto logits = hellaswag_evaluate_tokens(ctx, query_embd, 0, params.n_batch, n_vocab, params.n_threads);
+        if (logits.empty()) {
+            fprintf(stderr, "%s : failed to eval\n", __func__);
+            return;
+        }
+
+        std::memcpy(tok_logits.data(), logits.data() + (context_size-1)*n_vocab, n_vocab*sizeof(float));
+        const auto first_probs = softmax(tok_logits);
+
+        hs_data[task_idx].ending_logprob_count[0] = 1;
+        hs_data[task_idx].ending_logprob[0] = std::log(first_probs[query_embd[context_size]]);
+
+        // Calculate the logprobs over the ending
+        for (size_t j = context_size; j < query_size - 1; j++) {
+
+            std::memcpy(tok_logits.data(), logits.data() + j*n_vocab, n_vocab*sizeof(float));
+
+            const float prob = softmax(tok_logits)[query_embd[j + 1]];
+
+            hs_data[task_idx].ending_logprob[0] += std::log(prob);
+            hs_data[task_idx].ending_logprob_count[0]++;
+        }
+
+        // Calculate the mean token logprob for acc_norm
+        hs_data[task_idx].ending_logprob[0] /= hs_data[task_idx].ending_logprob_count[0];
+
+        // Do the remaining endings
+        // For these, we use the bare ending with n_past = context_size
+        //
+        for (size_t ending_idx = 1; ending_idx < 4; ending_idx++) {
+
+            // Tokenize the query
+            query_embd = ::llama_tokenize(ctx, hs_data[task_idx].ending[ending_idx], false);
+            query_size = query_embd.size();
+
+            // Stop if query wont fit the ctx window
+            if (context_size + query_size > (size_t)params.n_ctx) {
+                fprintf(stderr, "%s : number of tokens in query %zu > n_ctxl\n", __func__, query_size);
+                return;
+            }
+
+            // Speedup small evaluations by evaluating atleast 32 tokens
+            // No, resizing to 32 is actually slightly slower (at least on CUDA)
+            //if (query_size < 32) {
+            //    query_embd.resize(32);
+            //}
+
+            // Evaluate the query
+            logits = hellaswag_evaluate_tokens(ctx, query_embd, context_size, params.n_batch, n_vocab, params.n_threads);
+            if (logits.empty()) {
+                fprintf(stderr, "%s : failed to eval\n", __func__);
+                return;
+            }
+
+            hs_data[task_idx].ending_logprob_count[ending_idx] = 1;
+            hs_data[task_idx].ending_logprob[ending_idx] = std::log(first_probs[query_embd[0]]);
+
+            // Calculate the logprobs over the ending
+            for (size_t j = 0; j < query_size - 1; j++) {
+                std::memcpy(tok_logits.data(), logits.data() + j*n_vocab, n_vocab*sizeof(float));
+
+                const float prob = softmax(tok_logits)[query_embd[j + 1]];
+
+                hs_data[task_idx].ending_logprob[ending_idx] += std::log(prob);
+                hs_data[task_idx].ending_logprob_count[ending_idx]++;
+            }
+
+            // Calculate the mean token logprob for acc_norm
+            hs_data[task_idx].ending_logprob[ending_idx] /= hs_data[task_idx].ending_logprob_count[ending_idx];
+
+
+//            printf("task %lu, ending %lu, whole_len %lu, context_len %lu, ending_logprob_count %lu, ending_logprob %.4f\n",
+//                task_idx,ending_idx,whole_size,context_size, hs_data[task_idx].ending_logprob_count[ending_idx], hs_data[task_idx].ending_logprob[ending_idx] );
+        }
+
+        // Find the ending with maximum logprob
+        size_t ending_logprob_max_idx = 0;
+        double ending_logprob_max_val = hs_data[task_idx].ending_logprob[0];
+        for (size_t j = 1; j < 4; j++) {
+            if (hs_data[task_idx].ending_logprob[j] > ending_logprob_max_val) {
+                ending_logprob_max_idx = j;
+                ending_logprob_max_val =  hs_data[task_idx].ending_logprob[j];
+            }
+        }
+
+//        printf("max logprob ending idx %lu, gold ending idx %lu\n", ending_logprob_max_idx, hs_data[task_idx].gold_ending_idx);
+
+        // If the gold ending got the maximum logprobe add one accuracy point
+        if (ending_logprob_max_idx == hs_data[task_idx].gold_ending_idx) {
+            acc += 1.0;
+        }
+
+        // Print the accumulated accuracy mean x 100
+        printf("%zu\t%.8lf\n",task_idx+1, acc/double(task_idx+1)*100.0);
+        fflush(stdout);
+    }
+
+    delete [] hs_data;
+
+    printf("\n");
+}
+
 int main(int argc, char ** argv) {
    gpt_params params;

@@ -166,7 +406,11 @@ int main(int argc, char ** argv) {
                params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
    }

-    perplexity(ctx, params);
+    if (params.hellaswag) {
+        hellaswag_score(ctx, params);
+    } else {
+        perplexity(ctx, params);
+    }

    llama_print_timings(ctx);
    llama_free(ctx);
--- a/examples/quantize-stats/CMakeLists.txt
+++ b/examples/quantize-stats/CMakeLists.txt
@@ -1,4 +1,5 @@
 set(TARGET quantize-stats)
 add_executable(${TARGET} quantize-stats.cpp)
+install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- a/examples/quantize-stats/quantize-stats.cpp
+++ b/examples/quantize-stats/quantize-stats.cpp
@@ -24,7 +24,7 @@
 #endif

 struct quantize_stats_params {
-    std::string model = "models/7B/ggml-model-f16.bin";
+    std::string model = "models/7B/ggml-model-f16.gguf";
    bool verbose = false;
    bool per_layer_stats = false;
    bool print_histogram = false;
--- a/examples/quantize/CMakeLists.txt
+++ b/examples/quantize/CMakeLists.txt
@@ -1,5 +1,6 @@
 set(TARGET quantize)
 add_executable(${TARGET} quantize.cpp)
+install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if(TARGET BUILD_INFO)
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -14,103 +14,27 @@ struct quant_option {
 };

 static const std::vector<struct quant_option> QUANT_OPTIONS = {
-    {
-        "Q4_0",
-        LLAMA_FTYPE_MOSTLY_Q4_0,
-        " 3.50G, +0.2499 ppl @ 7B - small, very high quality loss - legacy, prefer using Q3_K_M",
-    },
-    {
-        "Q4_1",
-        LLAMA_FTYPE_MOSTLY_Q4_1,
-        " 3.90G, +0.1846 ppl @ 7B - small, substantial quality loss - legacy, prefer using Q3_K_L",
-    },
-    {
-        "Q5_0",
-        LLAMA_FTYPE_MOSTLY_Q5_0,
-        " 4.30G, +0.0796 ppl @ 7B - medium, balanced quality - legacy, prefer using Q4_K_M",
-    },
-    {
-        "Q5_1",
-        LLAMA_FTYPE_MOSTLY_Q5_1,
-        " 4.70G, +0.0415 ppl @ 7B - medium, low quality loss - legacy, prefer using Q5_K_M",
-    },
+    { "Q4_0",   LLAMA_FTYPE_MOSTLY_Q4_0,   " 3.50G, +0.2499 ppl @ 7B", },
+    { "Q4_1",   LLAMA_FTYPE_MOSTLY_Q4_1,   " 3.90G, +0.1846 ppl @ 7B", },
+    { "Q5_0",   LLAMA_FTYPE_MOSTLY_Q5_0,   " 4.30G, +0.0796 ppl @ 7B", },
+    { "Q5_1",   LLAMA_FTYPE_MOSTLY_Q5_1,   " 4.70G, +0.0415 ppl @ 7B", },
 #ifdef GGML_USE_K_QUANTS
-    {
-        "Q2_K",
-        LLAMA_FTYPE_MOSTLY_Q2_K,
-        " 2.67G, +0.8698 ppl @ 7B - smallest, extreme quality loss - not recommended",
-    },
-    {
-        "Q3_K",
-        LLAMA_FTYPE_MOSTLY_Q3_K_M,
-        "alias for Q3_K_M"
-    },
-    {
-        "Q3_K_S",
-        LLAMA_FTYPE_MOSTLY_Q3_K_S,
-        " 2.75G, +0.5505 ppl @ 7B - very small, very high quality loss",
-    },
-    {
-        "Q3_K_M",
-        LLAMA_FTYPE_MOSTLY_Q3_K_M,
-        " 3.06G, +0.2437 ppl @ 7B - very small, very high quality loss",
-    },
-    {
-        "Q3_K_L",
-        LLAMA_FTYPE_MOSTLY_Q3_K_L,
-        " 3.35G, +0.1803 ppl @ 7B - small, substantial quality loss",
-    },
-    {
-        "Q4_K",
-        LLAMA_FTYPE_MOSTLY_Q4_K_M,
-        "alias for Q4_K_M",
-    },
-    {
-        "Q4_K_S",
-        LLAMA_FTYPE_MOSTLY_Q4_K_S,
-        " 3.56G, +0.1149 ppl @ 7B - small, significant quality loss",
-    },
-    {
-        "Q4_K_M",
-        LLAMA_FTYPE_MOSTLY_Q4_K_M,
-        " 3.80G, +0.0535 ppl @ 7B - medium, balanced quality - *recommended*",
-    },
-    {
-        "Q5_K",
-        LLAMA_FTYPE_MOSTLY_Q5_K_M,
-        "alias for Q5_K_M",
-    },
-    {
-        "Q5_K_S",
-        LLAMA_FTYPE_MOSTLY_Q5_K_S,
-        " 4.33G, +0.0353 ppl @ 7B - large, low quality loss - *recommended*",
-    },
-    {
-        "Q5_K_M",
-        LLAMA_FTYPE_MOSTLY_Q5_K_M,
-        " 4.45G, +0.0142 ppl @ 7B - large, very low quality loss - *recommended*",
-    },
-    {
-        "Q6_K",
-        LLAMA_FTYPE_MOSTLY_Q6_K,
-        " 5.15G, +0.0044 ppl @ 7B - very large, extremely low quality loss",
-    },
+    { "Q2_K",   LLAMA_FTYPE_MOSTLY_Q2_K,   " 2.67G, +0.8698 ppl @ 7B", },
+    { "Q3_K",   LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
+    { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5505 ppl @ 7B", },
+    { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.06G, +0.2437 ppl @ 7B", },
+    { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 3.35G, +0.1803 ppl @ 7B", },
+    { "Q4_K",   LLAMA_FTYPE_MOSTLY_Q4_K_M, "alias for Q4_K_M", },
+    { "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 3.56G, +0.1149 ppl @ 7B", },
+    { "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 3.80G, +0.0535 ppl @ 7B", },
+    { "Q5_K",   LLAMA_FTYPE_MOSTLY_Q5_K_M, "alias for Q5_K_M", },
+    { "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 4.33G, +0.0353 ppl @ 7B", },
+    { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0142 ppl @ 7B", },
+    { "Q6_K",   LLAMA_FTYPE_MOSTLY_Q6_K,   " 5.15G, +0.0044 ppl @ 7B", },
 #endif
-    {
-        "Q8_0",
-        LLAMA_FTYPE_MOSTLY_Q8_0,
-        " 6.70G, +0.0004 ppl @ 7B - very large, extremely low quality loss - not recommended",
-    },
-    {
-        "F16",
-        LLAMA_FTYPE_MOSTLY_F16,
-        "13.00G              @ 7B - extremely large, virtually no quality loss - not recommended",
-    },
-    {
-        "F32",
-        LLAMA_FTYPE_ALL_F32,
-        "26.00G              @ 7B - absolutely huge, lossless - not recommended",
-    },
+    { "Q8_0",   LLAMA_FTYPE_MOSTLY_Q8_0,   " 6.70G, +0.0004 ppl @ 7B", },
+    { "F16",    LLAMA_FTYPE_MOSTLY_F16,    "13.00G              @ 7B", },
+    { "F32",    LLAMA_FTYPE_ALL_F32,       "26.00G              @ 7B", },
 };


@@ -144,10 +68,10 @@ bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std:
 }

 // usage:
-//  ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.bin [models/llama/ggml-model-quant.bin] type [nthreads]
+//  ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
 //
 void usage(const char * executable) {
-    fprintf(stderr, "usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.bin [model-quant.bin] type [nthreads]\n\n", executable);
+    fprintf(stderr, "usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
    fprintf(stderr, "  --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
    fprintf(stderr, "  --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
    fprintf(stderr, "\nAllowed quantization types:\n");
@@ -194,8 +118,8 @@ int main(int argc, char ** argv) {
        if (pos != std::string::npos) {
            fpath = fname_inp.substr(0, pos + 1);
        }
-        // export as [inp path]/ggml-model-[ftype].bin
-        fname_out = fpath + "ggml-model-" + ftype_str + ".bin";
+        // export as [inp path]/ggml-model-[ftype].gguf
+        fname_out = fpath + "ggml-model-" + ftype_str + ".gguf";
        arg_idx++;
    }
    else {
--- a/examples/save-load-state/CMakeLists.txt
+++ b/examples/save-load-state/CMakeLists.txt
@@ -1,5 +1,6 @@
 set(TARGET save-load-state)
 add_executable(${TARGET} save-load-state.cpp)
+install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if(TARGET BUILD_INFO)
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@@ -44,9 +44,8 @@ int main(int argc, char ** argv) {
        llama_free_model(model);
        return 1;
    }
-    auto tokens = std::vector<llama_token>(params.n_ctx);
-    auto n_prompt_tokens = llama_tokenize(ctx, params.prompt.c_str(), tokens.data(), int(tokens.size()), true);
-
+    auto tokens = llama_tokenize(ctx, params.prompt.c_str(), true);
+    auto n_prompt_tokens = tokens.size();
    if (n_prompt_tokens < 1) {
        fprintf(stderr, "%s : failed to tokenize prompt\n", __func__);
        llama_free(ctx);
@@ -91,7 +90,7 @@ int main(int argc, char ** argv) {
        auto next_token_str = llama_token_to_str(ctx, next_token);
        last_n_tokens_data.push_back(next_token);

-        printf("%s", next_token_str);
+        printf("%s", next_token_str.c_str());
        if (llama_eval(ctx, &next_token, 1, n_past, params.n_threads)) {
            fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
            llama_free(ctx);
@@ -151,7 +150,7 @@ int main(int argc, char ** argv) {
        auto next_token_str = llama_token_to_str(ctx2, next_token);
        last_n_tokens_data.push_back(next_token);

-        printf("%s", next_token_str);
+        printf("%s", next_token_str.c_str());
        if (llama_eval(ctx2, &next_token, 1, n_past, params.n_threads)) {
            fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
            llama_free(ctx2);
--- a/examples/server-llama2-13B.sh
+++ b/examples/server-llama2-13B.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+
+set -e
+
+cd "$(dirname "$0")/.." || exit
+
+# Specify the model you want to use here:
+MODEL="${MODEL:-./models/llama-2-13b-chat.ggmlv3.q5_K_M.bin}"
+PROMPT_TEMPLATE=${PROMPT_TEMPLATE:-./prompts/chat-system.txt}
+
+# Adjust to the number of CPU cores you want to use.
+N_THREAD="${N_THREAD:-12}"
+
+# Note: you can also override the generation options by specifying them on the command line:
+GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 4096 --batch-size 1024}"
+
+
+# shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
+./server $GEN_OPTIONS \
+  --model "$MODEL" \
+  --threads "$N_THREAD" \
+  --rope-freq-scale 1.0 \
+  "$@"
+
+# I used this to test the model with mps, but omitted it from the general purpose. If you want to use it, just specify it on the command line.
+# -ngl 1 \
--- a/examples/server/CMakeLists.txt
+++ b/examples/server/CMakeLists.txt
@@ -2,10 +2,14 @@ set(TARGET server)
 option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR})
 add_executable(${TARGET} server.cpp json.hpp httplib.h)
+install(TARGETS ${TARGET} RUNTIME)
 target_compile_definitions(${TARGET} PRIVATE
    SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
 )
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+if (WIN32)
+    TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
+endif()
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if(TARGET BUILD_INFO)
  add_dependencies(${TARGET} BUILD_INFO)
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -5,7 +5,7 @@ This example demonstrates a simple HTTP API server and a simple web front end to
 Command line options:

 -   `--threads N`, `-t N`: Set the number of threads to use during computation.
-   `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
+-   `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`).
 -   `-m ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
 -   `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of 4096.
 -   `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
@@ -16,6 +16,7 @@ Command line options:
 -   `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended.
 -   `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped.
 -   `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed.
+-   `--numa`: Attempt optimizations that help on some NUMA systems.
 -   `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
 -   `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
 -   `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600`.
@@ -47,15 +48,14 @@ To get started right away, run the following command, making sure to use the cor
 ### Unix-based systems (Linux, macOS, etc.):

 ```bash
-./server -m models/7B/ggml-model.bin -c 2048
+./server -m models/7B/ggml-model.gguf -c 2048
 ```

 ### Windows:

 ```powershell
-server.exe -m models\7B\ggml-model.bin -c 2048
+server.exe -m models\7B\ggml-model.gguf -c 2048
 ```
-
 The above command will start a server that by default listens on `127.0.0.1:8080`.
 You can consume the endpoints with Postman or NodeJS with axios library. You can visit the web front end at the same url.

@@ -126,7 +126,7 @@ node .

    `stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`.

-    `prompt`: Provide a prompt. Internally, the prompt is compared, and it detects if a part has already been evaluated, and the remaining part will be evaluate. A space is inserted in the front like main.cpp does.
+    `prompt`: Provide a prompt as a string, or as an array of strings and numbers representing tokens. Internally, the prompt is compared, and it detects if a part has already been evaluated, and the remaining part will be evaluate. If the prompt is a string, or an array with the first element given as a string, a space is inserted in the front like main.cpp does.

    `stop`: Specify a JSON array of stopping strings.
    These words will not be included in the completion, so make sure to add them to the prompt for the next iteration (default: []).
@@ -151,6 +151,8 @@ node .

    `mirostat_eta`: Set the Mirostat learning rate, parameter eta (default: 0.1).

+    `grammar`: Set grammar for grammar-based sampling (default: no grammar)
+
    `seed`: Set the random number generator (RNG) seed (default: -1, -1 = random seed).

    `ignore_eos`: Ignore end of stream token and continue generating (default: false).
@@ -163,7 +165,7 @@ node .

    `content`: Set the text to tokenize.

-    Note that the special `BOS` token is not added in fron of the text and also a space character is not inserted automatically as it is for `/completion`.
+    Note that the special `BOS` token is not added in front of the text and also a space character is not inserted automatically as it is for `/completion`.

 -   **POST** `/embedding`: Generate embedding of a given text just as [the embedding example](../embedding) does.

--- a/examples/server/chat-llama2.sh
+++ b/examples/server/chat-llama2.sh
@@ -0,0 +1,109 @@
+#!/bin/bash
+
+API_URL="${API_URL:-http://127.0.0.1:8080}"
+
+CHAT=(
+    "Hello, Assistant."
+    "Hello. How may I help you today?"
+)
+
+INSTRUCTION="A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions."
+
+trim() {
+    shopt -s extglob
+    set -- "${1##+([[:space:]])}"
+    printf "%s" "${1%%+([[:space:]])}"
+}
+
+trim_trailing() {
+    shopt -s extglob
+    printf "%s" "${1%%+([[:space:]])}"
+}
+
+format_prompt() {
+    if [[ "${#CHAT[@]}" -eq 0 ]]; then
+        echo -n "[INST] <<SYS>>\n${INSTRUCTION}\n<</SYS>>"
+    else
+        LAST_INDEX=$(( ${#CHAT[@]} - 1 ))
+        echo -n "${CHAT[$LAST_INDEX]}\n[INST] $1 [/INST]"
+    fi
+}
+
+tokenize() {
+    curl \
+        --silent \
+        --request POST \
+        --url "${API_URL}/tokenize" \
+        --header "Content-Type: application/json" \
+        --data-raw "$(jq -ns --arg content "$1" '{content:$content}')" \
+    | jq '.tokens[]'
+}
+
+N_KEEP=$(tokenize "[INST] <<SYS>>\n${INSTRUCTION}\n<</SYS>>" | wc -l)
+
+chat_completion() {
+    PROMPT="$(trim_trailing "$(format_prompt "$1")")"
+    DATA="$(echo -n "$PROMPT" | jq -Rs --argjson n_keep $N_KEEP '{
+        prompt: .,
+        temperature: 0.2,
+        top_k: 40,
+        top_p: 0.9,
+        n_keep: $n_keep,
+        n_predict: 1024,
+        stop: ["[INST]"],
+        stream: true
+    }')"
+
+    # Create a temporary file to hold the Python output
+    TEMPFILE=$(mktemp)
+
+    exec 3< <(curl \
+        --silent \
+        --no-buffer \
+        --request POST \
+        --url "${API_URL}/completion" \
+        --header "Content-Type: application/json" \
+        --data-raw "${DATA}")
+
+    python -c "
+import json
+import sys
+
+answer = ''
+while True:
+    line = sys.stdin.readline()
+    if not line:
+        break
+    if line.startswith('data: '):
+        json_content = line[6:].strip()
+        content = json.loads(json_content)['content']
+        sys.stdout.write(content)
+        sys.stdout.flush()
+        answer += content
+
+answer = answer.rstrip('\n')
+
+# Write the answer to the temporary file
+with open('$TEMPFILE', 'w') as f:
+    f.write(answer)
+    " <&3
+
+    exec 3<&-
+
+    # Read the answer from the temporary file
+    ANSWER=$(cat $TEMPFILE)
+
+    # Clean up the temporary file
+    rm $TEMPFILE
+
+    printf "\n"
+
+    CHAT+=("$1" "$(trim "$ANSWER")")
+}
+
+while true; do
+    echo -en "\033[0;32m"  # Green color
+    read -r -e -p "> " QUESTION
+    echo -en "\033[0m"  # Reset color
+    chat_completion "${QUESTION}"
+done
--- a/examples/server/chat.mjs
+++ b/examples/server/chat.mjs
@@ -1,5 +1,34 @@
 import * as readline from 'node:readline'
 import { stdin, stdout } from 'node:process'
+import { readFileSync } from 'node:fs'
+import { SchemaConverter }  from './public/json-schema-to-grammar.mjs'
+
+const args = process.argv.slice(2);
+const grammarJsonSchemaFile = args.find(
+    (_, index) => args[index - 1] === "--grammar-json-schema"
+);
+const grammarFile = args.find((_, index) => args[index - 1] === "--grammar");
+
+// Example usage: function,arguments
+const grammarJsonSchemaPropOrder = args.find(
+    (_, index) => args[index - 1] === "--grammar-json-schema-prop-order"
+);
+const propOrder = grammarJsonSchemaPropOrder
+    ? grammarJsonSchemaPropOrder
+          .split(",")
+          .reduce((acc, cur, index) => ({ ...acc, [cur]: index }), {})
+    : {};
+
+let grammar = null
+if (grammarJsonSchemaFile) {
+    const schema = JSON.parse(readFileSync(grammarJsonSchemaFile, 'utf-8'))
+    const converter = new SchemaConverter(propOrder)
+    converter.visit(schema, '')
+    grammar = converter.formatGrammar()
+}
+if (grammarFile) {
+    grammar = readFileSync(grammarFile, 'utf-8')
+}

 const API_URL = 'http://127.0.0.1:8080'

@@ -48,6 +77,7 @@ async function chat_completion(question) {
            n_keep: n_keep,
            n_predict: 256,
            stop: ["\n### Human:"], // stop completion after generating this
+            grammar,
            stream: true,
        })
    })
--- a/examples/server/completion.js.hpp
+++ b/examples/server/completion.js.hpp
@@ -87,289 +87,342 @@ unsigned char completion_js[] = {
  0x20, 0x54, 0x65, 0x78, 0x74, 0x44, 0x65, 0x63, 0x6f, 0x64, 0x65, 0x72,
  0x28, 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x63,
  0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x3d, 0x20, 0x22, 0x22, 0x3b,
-  0x0a, 0x0a, 0x20, 0x20, 0x74, 0x72, 0x79, 0x20, 0x7b, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x20, 0x3d,
-  0x20, 0x74, 0x72, 0x75, 0x65, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x29,
-  0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e,
-  0x73, 0x74, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x20, 0x3d, 0x20,
-  0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x72, 0x65, 0x61, 0x64, 0x65, 0x72,
-  0x2e, 0x72, 0x65, 0x61, 0x64, 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x72, 0x65, 0x73, 0x75, 0x6c,
-  0x74, 0x2e, 0x64, 0x6f, 0x6e, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x3b,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x73, 0x73, 0x65, 0x20, 0x61,
-  0x6e, 0x73, 0x77, 0x65, 0x72, 0x73, 0x20, 0x69, 0x6e, 0x20, 0x74, 0x68,
-  0x65, 0x20, 0x66, 0x6f, 0x72, 0x6d, 0x20, 0x6d, 0x75, 0x6c, 0x74, 0x69,
-  0x70, 0x6c, 0x65, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x73, 0x20, 0x6f, 0x66,
-  0x3a, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x5c, 0x6e, 0x20, 0x77, 0x69,
-  0x74, 0x68, 0x20, 0x64, 0x61, 0x74, 0x61, 0x20, 0x61, 0x6c, 0x77, 0x61,
-  0x79, 0x73, 0x20, 0x70, 0x72, 0x65, 0x73, 0x65, 0x6e, 0x74, 0x20, 0x61,
-  0x73, 0x20, 0x61, 0x20, 0x6b, 0x65, 0x79, 0x2e, 0x20, 0x69, 0x6e, 0x20,
-  0x6f, 0x75, 0x72, 0x20, 0x63, 0x61, 0x73, 0x65, 0x20, 0x77, 0x65, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x6d, 0x61, 0x69,
-  0x6e, 0x6c, 0x79, 0x20, 0x63, 0x61, 0x72, 0x65, 0x20, 0x61, 0x62, 0x6f,
-  0x75, 0x74, 0x20, 0x74, 0x68, 0x65, 0x20, 0x64, 0x61, 0x74, 0x61, 0x3a,
-  0x20, 0x6b, 0x65, 0x79, 0x20, 0x68, 0x65, 0x72, 0x65, 0x2c, 0x20, 0x77,
-  0x68, 0x69, 0x63, 0x68, 0x20, 0x77, 0x65, 0x20, 0x65, 0x78, 0x70, 0x65,
-  0x63, 0x74, 0x20, 0x61, 0x73, 0x20, 0x6a, 0x73, 0x6f, 0x6e, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x74,
-  0x65, 0x78, 0x74, 0x20, 0x3d, 0x20, 0x64, 0x65, 0x63, 0x6f, 0x64, 0x65,
-  0x72, 0x2e, 0x64, 0x65, 0x63, 0x6f, 0x64, 0x65, 0x28, 0x72, 0x65, 0x73,
-  0x75, 0x6c, 0x74, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x29, 0x3b, 0x0a,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x70, 0x61,
-  0x72, 0x73, 0x65, 0x20, 0x61, 0x6c, 0x6c, 0x20, 0x73, 0x73, 0x65, 0x20,
-  0x65, 0x76, 0x65, 0x6e, 0x74, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x61,
-  0x64, 0x64, 0x20, 0x74, 0x68, 0x65, 0x6d, 0x20, 0x74, 0x6f, 0x20, 0x72,
-  0x65, 0x73, 0x75, 0x6c, 0x74, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, 0x65, 0x67, 0x65, 0x78, 0x20,
-  0x3d, 0x20, 0x2f, 0x5e, 0x28, 0x5c, 0x53, 0x2b, 0x29, 0x3a, 0x5c, 0x73,
-  0x28, 0x2e, 0x2a, 0x29, 0x24, 0x2f, 0x67, 0x6d, 0x3b, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x28, 0x63, 0x6f, 0x6e,
-  0x73, 0x74, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x20, 0x6f, 0x66, 0x20,
-  0x74, 0x65, 0x78, 0x74, 0x2e, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x41, 0x6c,
-  0x6c, 0x28, 0x72, 0x65, 0x67, 0x65, 0x78, 0x29, 0x29, 0x20, 0x7b, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x73, 0x75,
-  0x6c, 0x74, 0x5b, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x5b, 0x31, 0x5d, 0x5d,
-  0x20, 0x3d, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x5b, 0x32, 0x5d, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x73, 0x69, 0x6e, 0x63, 0x65, 0x20,
-  0x77, 0x65, 0x20, 0x6b, 0x6e, 0x6f, 0x77, 0x20, 0x74, 0x68, 0x69, 0x73,
-  0x20, 0x69, 0x73, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2e, 0x63, 0x70,
-  0x70, 0x2c, 0x20, 0x6c, 0x65, 0x74, 0x27, 0x73, 0x20, 0x6a, 0x75, 0x73,
-  0x74, 0x20, 0x64, 0x65, 0x63, 0x6f, 0x64, 0x65, 0x20, 0x74, 0x68, 0x65,
-  0x20, 0x6a, 0x73, 0x6f, 0x6e, 0x20, 0x69, 0x6e, 0x20, 0x64, 0x61, 0x74,
-  0x61, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x73, 0x75,
-  0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x3d, 0x20, 0x4a, 0x53,
-  0x4f, 0x4e, 0x2e, 0x70, 0x61, 0x72, 0x73, 0x65, 0x28, 0x72, 0x65, 0x73,
-  0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x3b, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74,
-  0x20, 0x2b, 0x3d, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64,
-  0x61, 0x74, 0x61, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3b,
-  0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x79,
-  0x69, 0x65, 0x6c, 0x64, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x79,
-  0x69, 0x65, 0x6c, 0x64, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x3b,
-  0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x69,
-  0x66, 0x20, 0x77, 0x65, 0x20, 0x67, 0x6f, 0x74, 0x20, 0x61, 0x20, 0x73,
-  0x74, 0x6f, 0x70, 0x20, 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x20, 0x66, 0x72,
-  0x6f, 0x6d, 0x20, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x2c, 0x20, 0x77,
-  0x65, 0x20, 0x77, 0x69, 0x6c, 0x6c, 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b,
-  0x20, 0x68, 0x65, 0x72, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x69, 0x66, 0x20, 0x28, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64,
-  0x61, 0x74, 0x61, 0x2e, 0x73, 0x74, 0x6f, 0x70, 0x29, 0x20, 0x7b, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28,
-  0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e,
-  0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73,
-  0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x67, 0x65, 0x6e,
-  0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74,
-  0x69, 0x6e, 0x67, 0x73, 0x20, 0x3d, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c,
-  0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x67, 0x65, 0x6e, 0x65, 0x72,
-  0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e,
-  0x67, 0x73, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, 0x72,
-  0x65, 0x61, 0x6b, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x7d, 0x20, 0x63,
-  0x61, 0x74, 0x63, 0x68, 0x20, 0x28, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x65, 0x2e, 0x6e, 0x61, 0x6d,
-  0x65, 0x20, 0x21, 0x3d, 0x3d, 0x20, 0x27, 0x41, 0x62, 0x6f, 0x72, 0x74,
-  0x45, 0x72, 0x72, 0x6f, 0x72, 0x27, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65, 0x2e,
-  0x65, 0x72, 0x72, 0x6f, 0x72, 0x28, 0x22, 0x6c, 0x6c, 0x61, 0x6d, 0x61,
-  0x20, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x3a, 0x20, 0x22, 0x2c, 0x20, 0x65,
-  0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x74, 0x68, 0x72, 0x6f, 0x77, 0x20, 0x65, 0x3b, 0x0a, 0x20, 0x20,
-  0x7d, 0x0a, 0x20, 0x20, 0x66, 0x69, 0x6e, 0x61, 0x6c, 0x6c, 0x79, 0x20,
-  0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f,
-  0x6c, 0x6c, 0x65, 0x72, 0x2e, 0x61, 0x62, 0x6f, 0x72, 0x74, 0x28, 0x29,
-  0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74,
-  0x75, 0x72, 0x6e, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3b,
-  0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2f, 0x20, 0x43, 0x61, 0x6c, 0x6c, 0x20,
-  0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2c, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72,
-  0x6e, 0x20, 0x61, 0x6e, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x20, 0x74,
-  0x61, 0x72, 0x67, 0x65, 0x74, 0x20, 0x74, 0x68, 0x61, 0x74, 0x20, 0x79,
-  0x6f, 0x75, 0x20, 0x63, 0x61, 0x6e, 0x20, 0x73, 0x75, 0x62, 0x63, 0x72,
-  0x69, 0x62, 0x65, 0x20, 0x74, 0x6f, 0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f,
-  0x20, 0x45, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x3a, 0x0a, 0x2f, 0x2f,
-  0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6f, 0x72,
-  0x74, 0x20, 0x7b, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x45, 0x76, 0x65,
-  0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x20, 0x7d, 0x20, 0x66,
-  0x72, 0x6f, 0x6d, 0x20, 0x27, 0x2f, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65,
-  0x74, 0x69, 0x6f, 0x6e, 0x2e, 0x6a, 0x73, 0x27, 0x0a, 0x2f, 0x2f, 0x0a,
-  0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20,
-  0x63, 0x6f, 0x6e, 0x6e, 0x20, 0x3d, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61,
-  0x45, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x28,
-  0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x29, 0x0a, 0x2f, 0x2f, 0x20, 0x20,
-  0x20, 0x20, 0x63, 0x6f, 0x6e, 0x6e, 0x2e, 0x61, 0x64, 0x64, 0x45, 0x76,
-  0x65, 0x6e, 0x74, 0x4c, 0x69, 0x73, 0x74, 0x65, 0x6e, 0x65, 0x72, 0x28,
-  0x22, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x22, 0x2c, 0x20, 0x28,
-  0x63, 0x68, 0x75, 0x6e, 0x6b, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a,
-  0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x63, 0x75,
-  0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x77, 0x72, 0x69, 0x74, 0x65, 0x28, 0x63,
-  0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x2e,
-  0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x0a, 0x2f, 0x2f, 0x20,
-  0x20, 0x20, 0x20, 0x7d, 0x29, 0x0a, 0x2f, 0x2f, 0x0a, 0x65, 0x78, 0x70,
-  0x6f, 0x72, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6c, 0x6c,
-  0x61, 0x6d, 0x61, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67,
-  0x65, 0x74, 0x20, 0x3d, 0x20, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74,
-  0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x20, 0x3d, 0x20, 0x7b,
-  0x7d, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x20, 0x3d, 0x20,
-  0x7b, 0x7d, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x63,
-  0x6f, 0x6e, 0x73, 0x74, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61,
-  0x72, 0x67, 0x65, 0x74, 0x20, 0x3d, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x45,
-  0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x28, 0x29,
-  0x3b, 0x0a, 0x20, 0x20, 0x28, 0x61, 0x73, 0x79, 0x6e, 0x63, 0x20, 0x28,
-  0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6c,
-  0x65, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x3d,
-  0x20, 0x22, 0x22, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x72,
-  0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x73,
-  0x74, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x20, 0x6f, 0x66, 0x20, 0x6c,
-  0x6c, 0x61, 0x6d, 0x61, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c,
-  0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x63, 0x6f, 0x6e,
-  0x66, 0x69, 0x67, 0x29, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e,
-  0x64, 0x61, 0x74, 0x61, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20,
-  0x2b, 0x3d, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74,
-  0x61, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3b, 0x0a, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74,
-  0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61,
-  0x74, 0x63, 0x68, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77,
-  0x20, 0x43, 0x75, 0x73, 0x74, 0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74,
-  0x28, 0x22, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x22, 0x2c, 0x20,
-  0x7b, 0x20, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x3a, 0x20, 0x63, 0x68,
-  0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x7d, 0x29, 0x29,
-  0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x63, 0x68, 0x75, 0x6e,
-  0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x67, 0x65, 0x6e, 0x65, 0x72,
-  0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e,
-  0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65,
-  0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61, 0x74, 0x63, 0x68, 0x45, 0x76,
-  0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77, 0x20, 0x43, 0x75, 0x73, 0x74,
-  0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x22, 0x67, 0x65, 0x6e,
-  0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74,
-  0x69, 0x6e, 0x67, 0x73, 0x22, 0x2c, 0x20, 0x7b, 0x20, 0x64, 0x65, 0x74,
-  0x61, 0x69, 0x6c, 0x3a, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64,
+  0x0a, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x6f,
+  0x76, 0x65, 0x72, 0x20, 0x3d, 0x20, 0x22, 0x22, 0x3b, 0x20, 0x2f, 0x2f,
+  0x20, 0x42, 0x75, 0x66, 0x66, 0x65, 0x72, 0x20, 0x66, 0x6f, 0x72, 0x20,
+  0x70, 0x61, 0x72, 0x74, 0x69, 0x61, 0x6c, 0x6c, 0x79, 0x20, 0x72, 0x65,
+  0x61, 0x64, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x73, 0x0a, 0x0a, 0x20, 0x20,
+  0x74, 0x72, 0x79, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65,
+  0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x20, 0x3d, 0x20, 0x74, 0x72, 0x75,
+  0x65, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x77, 0x68, 0x69, 0x6c,
+  0x65, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x29, 0x20, 0x7b, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72,
+  0x65, 0x73, 0x75, 0x6c, 0x74, 0x20, 0x3d, 0x20, 0x61, 0x77, 0x61, 0x69,
+  0x74, 0x20, 0x72, 0x65, 0x61, 0x64, 0x65, 0x72, 0x2e, 0x72, 0x65, 0x61,
+  0x64, 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69,
+  0x66, 0x20, 0x28, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x6f,
+  0x6e, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x3b, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x2f, 0x2f, 0x20, 0x41, 0x64, 0x64, 0x20, 0x61, 0x6e, 0x79, 0x20, 0x6c,
+  0x65, 0x66, 0x74, 0x6f, 0x76, 0x65, 0x72, 0x20, 0x64, 0x61, 0x74, 0x61,
+  0x20, 0x74, 0x6f, 0x20, 0x74, 0x68, 0x65, 0x20, 0x63, 0x75, 0x72, 0x72,
+  0x65, 0x6e, 0x74, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x20, 0x6f, 0x66,
+  0x20, 0x64, 0x61, 0x74, 0x61, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x74, 0x65, 0x78, 0x74, 0x20, 0x3d,
+  0x20, 0x6c, 0x65, 0x66, 0x74, 0x6f, 0x76, 0x65, 0x72, 0x20, 0x2b, 0x20,
+  0x64, 0x65, 0x63, 0x6f, 0x64, 0x65, 0x72, 0x2e, 0x64, 0x65, 0x63, 0x6f,
+  0x64, 0x65, 0x28, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x76, 0x61,
+  0x6c, 0x75, 0x65, 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x2f, 0x2f, 0x20, 0x43, 0x68, 0x65, 0x63, 0x6b, 0x20, 0x69, 0x66,
+  0x20, 0x74, 0x68, 0x65, 0x20, 0x6c, 0x61, 0x73, 0x74, 0x20, 0x63, 0x68,
+  0x61, 0x72, 0x61, 0x63, 0x74, 0x65, 0x72, 0x20, 0x69, 0x73, 0x20, 0x61,
+  0x20, 0x6c, 0x69, 0x6e, 0x65, 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x0a,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20,
+  0x65, 0x6e, 0x64, 0x73, 0x57, 0x69, 0x74, 0x68, 0x4c, 0x69, 0x6e, 0x65,
+  0x42, 0x72, 0x65, 0x61, 0x6b, 0x20, 0x3d, 0x20, 0x74, 0x65, 0x78, 0x74,
+  0x2e, 0x65, 0x6e, 0x64, 0x73, 0x57, 0x69, 0x74, 0x68, 0x28, 0x27, 0x5c,
+  0x6e, 0x27, 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x2f, 0x2f, 0x20, 0x53, 0x70, 0x6c, 0x69, 0x74, 0x20, 0x74, 0x68, 0x65,
+  0x20, 0x74, 0x65, 0x78, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x6f, 0x20, 0x6c,
+  0x69, 0x6e, 0x65, 0x73, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c,
+  0x65, 0x74, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x73, 0x20, 0x3d, 0x20, 0x74,
+  0x65, 0x78, 0x74, 0x2e, 0x73, 0x70, 0x6c, 0x69, 0x74, 0x28, 0x27, 0x5c,
+  0x6e, 0x27, 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x2f, 0x2f, 0x20, 0x49, 0x66, 0x20, 0x74, 0x68, 0x65, 0x20, 0x74, 0x65,
+  0x78, 0x74, 0x20, 0x64, 0x6f, 0x65, 0x73, 0x6e, 0x27, 0x74, 0x20, 0x65,
+  0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x61, 0x20, 0x6c, 0x69,
+  0x6e, 0x65, 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x2c, 0x20, 0x74, 0x68,
+  0x65, 0x6e, 0x20, 0x74, 0x68, 0x65, 0x20, 0x6c, 0x61, 0x73, 0x74, 0x20,
+  0x6c, 0x69, 0x6e, 0x65, 0x20, 0x69, 0x73, 0x20, 0x69, 0x6e, 0x63, 0x6f,
+  0x6d, 0x70, 0x6c, 0x65, 0x74, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x2f, 0x2f, 0x20, 0x53, 0x74, 0x6f, 0x72, 0x65, 0x20, 0x69, 0x74,
+  0x20, 0x69, 0x6e, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x6f, 0x76, 0x65, 0x72,
+  0x20, 0x74, 0x6f, 0x20, 0x62, 0x65, 0x20, 0x61, 0x64, 0x64, 0x65, 0x64,
+  0x20, 0x74, 0x6f, 0x20, 0x74, 0x68, 0x65, 0x20, 0x6e, 0x65, 0x78, 0x74,
+  0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x20, 0x6f, 0x66, 0x20, 0x64, 0x61,
+  0x74, 0x61, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20,
+  0x28, 0x21, 0x65, 0x6e, 0x64, 0x73, 0x57, 0x69, 0x74, 0x68, 0x4c, 0x69,
+  0x6e, 0x65, 0x42, 0x72, 0x65, 0x61, 0x6b, 0x29, 0x20, 0x7b, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x6f,
+  0x76, 0x65, 0x72, 0x20, 0x3d, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x73, 0x2e,
+  0x70, 0x6f, 0x70, 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x7d, 0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x7b, 0x0a, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x6f, 0x76,
+  0x65, 0x72, 0x20, 0x3d, 0x20, 0x22, 0x22, 0x3b, 0x20, 0x2f, 0x2f, 0x20,
+  0x52, 0x65, 0x73, 0x65, 0x74, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x6f, 0x76,
+  0x65, 0x72, 0x20, 0x69, 0x66, 0x20, 0x77, 0x65, 0x20, 0x68, 0x61, 0x76,
+  0x65, 0x20, 0x61, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x20, 0x62, 0x72, 0x65,
+  0x61, 0x6b, 0x20, 0x61, 0x74, 0x20, 0x74, 0x68, 0x65, 0x20, 0x65, 0x6e,
+  0x64, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x50, 0x61, 0x72, 0x73,
+  0x65, 0x20, 0x61, 0x6c, 0x6c, 0x20, 0x73, 0x73, 0x65, 0x20, 0x65, 0x76,
+  0x65, 0x6e, 0x74, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x61, 0x64, 0x64,
+  0x20, 0x74, 0x68, 0x65, 0x6d, 0x20, 0x74, 0x6f, 0x20, 0x72, 0x65, 0x73,
+  0x75, 0x6c, 0x74, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f,
+  0x6e, 0x73, 0x74, 0x20, 0x72, 0x65, 0x67, 0x65, 0x78, 0x20, 0x3d, 0x20,
+  0x2f, 0x5e, 0x28, 0x5c, 0x53, 0x2b, 0x29, 0x3a, 0x5c, 0x73, 0x28, 0x2e,
+  0x2a, 0x29, 0x24, 0x2f, 0x67, 0x6d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x73, 0x74,
+  0x20, 0x6c, 0x69, 0x6e, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x6c, 0x69, 0x6e,
+  0x65, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6d, 0x61, 0x74, 0x63,
+  0x68, 0x20, 0x3d, 0x20, 0x72, 0x65, 0x67, 0x65, 0x78, 0x2e, 0x65, 0x78,
+  0x65, 0x63, 0x28, 0x6c, 0x69, 0x6e, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x6d, 0x61,
+  0x74, 0x63, 0x68, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x5b,
+  0x6d, 0x61, 0x74, 0x63, 0x68, 0x5b, 0x31, 0x5d, 0x5d, 0x20, 0x3d, 0x20,
+  0x6d, 0x61, 0x74, 0x63, 0x68, 0x5b, 0x32, 0x5d, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x73, 0x69,
+  0x6e, 0x63, 0x65, 0x20, 0x77, 0x65, 0x20, 0x6b, 0x6e, 0x6f, 0x77, 0x20,
+  0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x6c, 0x6c, 0x61, 0x6d,
+  0x61, 0x2e, 0x63, 0x70, 0x70, 0x2c, 0x20, 0x6c, 0x65, 0x74, 0x27, 0x73,
+  0x20, 0x6a, 0x75, 0x73, 0x74, 0x20, 0x64, 0x65, 0x63, 0x6f, 0x64, 0x65,
+  0x20, 0x74, 0x68, 0x65, 0x20, 0x6a, 0x73, 0x6f, 0x6e, 0x20, 0x69, 0x6e,
+  0x20, 0x64, 0x61, 0x74, 0x61, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x72, 0x65, 0x73, 0x75,
+  0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x20, 0x7b, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72,
+  0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x3d,
+  0x20, 0x4a, 0x53, 0x4f, 0x4e, 0x2e, 0x70, 0x61, 0x72, 0x73, 0x65, 0x28,
+  0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x29,
+  0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x2b, 0x3d,
+  0x20, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61,
+  0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3b, 0x0a, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f,
+  0x2f, 0x20, 0x79, 0x69, 0x65, 0x6c, 0x64, 0x0a, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x79, 0x69, 0x65, 0x6c,
+  0x64, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x3b, 0x0a, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f,
+  0x2f, 0x20, 0x69, 0x66, 0x20, 0x77, 0x65, 0x20, 0x67, 0x6f, 0x74, 0x20,
+  0x61, 0x20, 0x73, 0x74, 0x6f, 0x70, 0x20, 0x74, 0x6f, 0x6b, 0x65, 0x6e,
+  0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72,
+  0x2c, 0x20, 0x77, 0x65, 0x20, 0x77, 0x69, 0x6c, 0x6c, 0x20, 0x62, 0x72,
+  0x65, 0x61, 0x6b, 0x20, 0x68, 0x65, 0x72, 0x65, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20,
+  0x28, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61,
+  0x2e, 0x73, 0x74, 0x6f, 0x70, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69,
+  0x66, 0x20, 0x28, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61,
+  0x74, 0x61, 0x2e, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f,
+  0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20,
+  0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61,
+  0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67,
+  0x73, 0x20, 0x3d, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64,
  0x61, 0x74, 0x61, 0x2e, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69,
-  0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x20,
-  0x7d, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x63,
-  0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x74, 0x69,
-  0x6d, 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61,
-  0x72, 0x67, 0x65, 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61, 0x74, 0x63,
-  0x68, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77, 0x20, 0x43,
-  0x75, 0x73, 0x74, 0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x22,
-  0x74, 0x69, 0x6d, 0x69, 0x6e, 0x67, 0x73, 0x22, 0x2c, 0x20, 0x7b, 0x20,
-  0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x3a, 0x20, 0x63, 0x68, 0x75, 0x6e,
-  0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x74, 0x69, 0x6d, 0x69, 0x6e,
-  0x67, 0x73, 0x20, 0x7d, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65,
-  0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61, 0x74, 0x63, 0x68, 0x45, 0x76,
-  0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77, 0x20, 0x43, 0x75, 0x73, 0x74,
-  0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x22, 0x64, 0x6f, 0x6e,
-  0x65, 0x22, 0x2c, 0x20, 0x7b, 0x20, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c,
-  0x3a, 0x20, 0x7b, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20,
-  0x7d, 0x20, 0x7d, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x29, 0x28,
-  0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20,
-  0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x3b,
-  0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2f, 0x20, 0x43, 0x61, 0x6c, 0x6c, 0x20,
-  0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2c, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72,
-  0x6e, 0x20, 0x61, 0x20, 0x70, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x20,
-  0x74, 0x68, 0x61, 0x74, 0x20, 0x72, 0x65, 0x73, 0x6f, 0x6c, 0x76, 0x65,
-  0x73, 0x20, 0x74, 0x6f, 0x20, 0x74, 0x68, 0x65, 0x20, 0x63, 0x6f, 0x6d,
-  0x70, 0x6c, 0x65, 0x74, 0x65, 0x64, 0x20, 0x74, 0x65, 0x78, 0x74, 0x2e,
-  0x20, 0x54, 0x68, 0x69, 0x73, 0x20, 0x64, 0x6f, 0x65, 0x73, 0x20, 0x6e,
-  0x6f, 0x74, 0x20, 0x73, 0x75, 0x70, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x73,
-  0x74, 0x72, 0x65, 0x61, 0x6d, 0x69, 0x6e, 0x67, 0x0a, 0x2f, 0x2f, 0x0a,
-  0x2f, 0x2f, 0x20, 0x45, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x3a, 0x0a,
-  0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x6c,
-  0x61, 0x6d, 0x61, 0x50, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x28, 0x70,
-  0x72, 0x6f, 0x6d, 0x70, 0x74, 0x29, 0x2e, 0x74, 0x68, 0x65, 0x6e, 0x28,
-  0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x20, 0x3d, 0x3e,
-  0x20, 0x7b, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
-  0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x77, 0x72, 0x69,
-  0x74, 0x65, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x0a,
-  0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x29, 0x0a, 0x2f, 0x2f,
-  0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x72, 0x0a, 0x2f,
-  0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e,
-  0x73, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x3d,
-  0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61,
-  0x50, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x28, 0x70, 0x72, 0x6f, 0x6d,
-  0x70, 0x74, 0x29, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64,
-  0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x77, 0x72, 0x69, 0x74,
-  0x65, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x0a, 0x2f,
-  0x2f, 0x0a, 0x65, 0x78, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x63, 0x6f, 0x6e,
-  0x73, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x50, 0x72, 0x6f, 0x6d,
-  0x69, 0x73, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70,
+  0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x3b,
+  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x20,
+  0x3d, 0x20, 0x66, 0x61, 0x6c, 0x73, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62,
+  0x72, 0x65, 0x61, 0x6b, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x7d, 0x20,
+  0x63, 0x61, 0x74, 0x63, 0x68, 0x20, 0x28, 0x65, 0x29, 0x20, 0x7b, 0x0a,
+  0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x65, 0x2e, 0x6e, 0x61,
+  0x6d, 0x65, 0x20, 0x21, 0x3d, 0x3d, 0x20, 0x27, 0x41, 0x62, 0x6f, 0x72,
+  0x74, 0x45, 0x72, 0x72, 0x6f, 0x72, 0x27, 0x29, 0x20, 0x7b, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65,
+  0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x28, 0x22, 0x6c, 0x6c, 0x61, 0x6d,
+  0x61, 0x20, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x3a, 0x20, 0x22, 0x2c, 0x20,
+  0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20,
+  0x20, 0x20, 0x74, 0x68, 0x72, 0x6f, 0x77, 0x20, 0x65, 0x3b, 0x0a, 0x20,
+  0x20, 0x7d, 0x0a, 0x20, 0x20, 0x66, 0x69, 0x6e, 0x61, 0x6c, 0x6c, 0x79,
+  0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72,
+  0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2e, 0x61, 0x62, 0x6f, 0x72, 0x74, 0x28,
+  0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x72, 0x65,
+  0x74, 0x75, 0x72, 0x6e, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74,
+  0x3b, 0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2f, 0x20, 0x43, 0x61, 0x6c, 0x6c,
+  0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2c, 0x20, 0x72, 0x65, 0x74, 0x75,
+  0x72, 0x6e, 0x20, 0x61, 0x6e, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x20,
+  0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x20, 0x74, 0x68, 0x61, 0x74, 0x20,
+  0x79, 0x6f, 0x75, 0x20, 0x63, 0x61, 0x6e, 0x20, 0x73, 0x75, 0x62, 0x63,
+  0x72, 0x69, 0x62, 0x65, 0x20, 0x74, 0x6f, 0x0a, 0x2f, 0x2f, 0x0a, 0x2f,
+  0x2f, 0x20, 0x45, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x3a, 0x0a, 0x2f,
+  0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6f,
+  0x72, 0x74, 0x20, 0x7b, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x45, 0x76,
+  0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x20, 0x7d, 0x20,
+  0x66, 0x72, 0x6f, 0x6d, 0x20, 0x27, 0x2f, 0x63, 0x6f, 0x6d, 0x70, 0x6c,
+  0x65, 0x74, 0x69, 0x6f, 0x6e, 0x2e, 0x6a, 0x73, 0x27, 0x0a, 0x2f, 0x2f,
+  0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74,
+  0x20, 0x63, 0x6f, 0x6e, 0x6e, 0x20, 0x3d, 0x20, 0x6c, 0x6c, 0x61, 0x6d,
+  0x61, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74,
+  0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x29, 0x0a, 0x2f, 0x2f, 0x20,
+  0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x6e, 0x2e, 0x61, 0x64, 0x64, 0x45,
+  0x76, 0x65, 0x6e, 0x74, 0x4c, 0x69, 0x73, 0x74, 0x65, 0x6e, 0x65, 0x72,
+  0x28, 0x22, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x22, 0x2c, 0x20,
+  0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b,
+  0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x63,
+  0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x77, 0x72, 0x69, 0x74, 0x65, 0x28,
+  0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c,
+  0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x0a, 0x2f, 0x2f,
+  0x20, 0x20, 0x20, 0x20, 0x7d, 0x29, 0x0a, 0x2f, 0x2f, 0x0a, 0x65, 0x78,
+  0x70, 0x6f, 0x72, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6c,
+  0x6c, 0x61, 0x6d, 0x61, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72,
+  0x67, 0x65, 0x74, 0x20, 0x3d, 0x20, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70,
  0x74, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x20, 0x3d, 0x20,
  0x7b, 0x7d, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x20, 0x3d,
  0x20, 0x7b, 0x7d, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20,
-  0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x50,
-  0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x28, 0x61, 0x73, 0x79, 0x6e, 0x63,
-  0x20, 0x28, 0x72, 0x65, 0x73, 0x6f, 0x6c, 0x76, 0x65, 0x2c, 0x20, 0x72,
-  0x65, 0x6a, 0x65, 0x63, 0x74, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74,
-  0x65, 0x6e, 0x74, 0x20, 0x3d, 0x20, 0x22, 0x22, 0x3b, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x74, 0x72, 0x79, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20,
-  0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20,
-  0x28, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b,
-  0x20, 0x6f, 0x66, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x28, 0x70, 0x72,
-  0x6f, 0x6d, 0x70, 0x74, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73,
-  0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x29, 0x29, 0x20, 0x7b,
-  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e,
-  0x74, 0x65, 0x6e, 0x74, 0x20, 0x2b, 0x3d, 0x20, 0x63, 0x68, 0x75, 0x6e,
-  0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65,
-  0x6e, 0x74, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x73, 0x6f, 0x6c, 0x76,
-  0x65, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x3b, 0x0a,
-  0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x63, 0x61, 0x74, 0x63, 0x68, 0x20,
-  0x28, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20,
-  0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x6a, 0x65, 0x63, 0x74, 0x28, 0x65,
-  0x72, 0x72, 0x6f, 0x72, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d,
-  0x0a, 0x20, 0x20, 0x7d, 0x29, 0x3b, 0x0a, 0x7d, 0x3b, 0x0a, 0x0a, 0x2f,
-  0x2a, 0x2a, 0x0a, 0x20, 0x2a, 0x20, 0x28, 0x64, 0x65, 0x70, 0x72, 0x65,
-  0x63, 0x61, 0x74, 0x65, 0x64, 0x29, 0x0a, 0x20, 0x2a, 0x2f, 0x0a, 0x65,
-  0x78, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20,
-  0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x43, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74,
-  0x65, 0x20, 0x3d, 0x20, 0x61, 0x73, 0x79, 0x6e, 0x63, 0x20, 0x28, 0x70,
-  0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72,
-  0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2c, 0x20, 0x63, 0x61, 0x6c, 0x6c, 0x62,
-  0x61, 0x63, 0x6b, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20,
-  0x66, 0x6f, 0x72, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x28, 0x63,
-  0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x20, 0x6f,
-  0x66, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x28, 0x70, 0x61, 0x72, 0x61,
-  0x6d, 0x73, 0x2e, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c, 0x20, 0x70,
-  0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x7b, 0x20, 0x63, 0x6f, 0x6e,
-  0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x20, 0x7d, 0x29, 0x29, 0x20,
-  0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x61, 0x6c, 0x6c, 0x62, 0x61,
-  0x63, 0x6b, 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x29, 0x3b, 0x0a, 0x20,
-  0x20, 0x7d, 0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2f, 0x20, 0x47, 0x65, 0x74,
-  0x20, 0x74, 0x68, 0x65, 0x20, 0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x20, 0x69,
-  0x6e, 0x66, 0x6f, 0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x74, 0x68, 0x65,
-  0x20, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x2e, 0x20, 0x54, 0x68, 0x69,
-  0x73, 0x20, 0x69, 0x73, 0x20, 0x75, 0x73, 0x65, 0x66, 0x75, 0x6c, 0x20,
-  0x66, 0x6f, 0x72, 0x20, 0x67, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x20,
-  0x74, 0x68, 0x65, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x20,
-  0x77, 0x69, 0x6e, 0x64, 0x6f, 0x77, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x73,
-  0x6f, 0x20, 0x6f, 0x6e, 0x2e, 0x0a, 0x65, 0x78, 0x70, 0x6f, 0x72, 0x74,
-  0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61,
-  0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x49, 0x6e, 0x66, 0x6f, 0x20, 0x3d, 0x20,
-  0x61, 0x73, 0x79, 0x6e, 0x63, 0x20, 0x28, 0x29, 0x20, 0x3d, 0x3e, 0x20,
-  0x7b, 0x0a, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x21, 0x67, 0x65, 0x6e,
-  0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74,
-  0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20,
+  0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54,
+  0x61, 0x72, 0x67, 0x65, 0x74, 0x20, 0x3d, 0x20, 0x6e, 0x65, 0x77, 0x20,
+  0x45, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x28,
+  0x29, 0x3b, 0x0a, 0x20, 0x20, 0x28, 0x61, 0x73, 0x79, 0x6e, 0x63, 0x20,
+  0x28, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20,
+  0x6c, 0x65, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20,
+  0x3d, 0x20, 0x22, 0x22, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f,
+  0x72, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x28, 0x63, 0x6f, 0x6e,
+  0x73, 0x74, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x20, 0x6f, 0x66, 0x20,
+  0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74,
+  0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x63, 0x6f,
+  0x6e, 0x66, 0x69, 0x67, 0x29, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b,
+  0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74,
+  0x20, 0x2b, 0x3d, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61,
+  0x74, 0x61, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3b, 0x0a,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x65, 0x76, 0x65, 0x6e,
+  0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70,
+  0x61, 0x74, 0x63, 0x68, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65,
+  0x77, 0x20, 0x43, 0x75, 0x73, 0x74, 0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e,
+  0x74, 0x28, 0x22, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x22, 0x2c,
+  0x20, 0x7b, 0x20, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x3a, 0x20, 0x63,
+  0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x7d, 0x29,
+  0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x63, 0x68, 0x75,
+  0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x67, 0x65, 0x6e, 0x65,
+  0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69,
+  0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67,
+  0x65, 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61, 0x74, 0x63, 0x68, 0x45,
+  0x76, 0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77, 0x20, 0x43, 0x75, 0x73,
+  0x74, 0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x22, 0x67, 0x65,
+  0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74,
+  0x74, 0x69, 0x6e, 0x67, 0x73, 0x22, 0x2c, 0x20, 0x7b, 0x20, 0x64, 0x65,
+  0x74, 0x61, 0x69, 0x6c, 0x3a, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e,
+  0x64, 0x61, 0x74, 0x61, 0x2e, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74,
+  0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73,
+  0x20, 0x7d, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28,
+  0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x74,
+  0x69, 0x6d, 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54,
+  0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61, 0x74,
+  0x63, 0x68, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77, 0x20,
+  0x43, 0x75, 0x73, 0x74, 0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28,
+  0x22, 0x74, 0x69, 0x6d, 0x69, 0x6e, 0x67, 0x73, 0x22, 0x2c, 0x20, 0x7b,
+  0x20, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x3a, 0x20, 0x63, 0x68, 0x75,
+  0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x74, 0x69, 0x6d, 0x69,
+  0x6e, 0x67, 0x73, 0x20, 0x7d, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67,
+  0x65, 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61, 0x74, 0x63, 0x68, 0x45,
+  0x76, 0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77, 0x20, 0x43, 0x75, 0x73,
+  0x74, 0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x22, 0x64, 0x6f,
+  0x6e, 0x65, 0x22, 0x2c, 0x20, 0x7b, 0x20, 0x64, 0x65, 0x74, 0x61, 0x69,
+  0x6c, 0x3a, 0x20, 0x7b, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74,
+  0x20, 0x7d, 0x20, 0x7d, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x29,
+  0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e,
+  0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74,
+  0x3b, 0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2f, 0x20, 0x43, 0x61, 0x6c, 0x6c,
+  0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2c, 0x20, 0x72, 0x65, 0x74, 0x75,
+  0x72, 0x6e, 0x20, 0x61, 0x20, 0x70, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65,
+  0x20, 0x74, 0x68, 0x61, 0x74, 0x20, 0x72, 0x65, 0x73, 0x6f, 0x6c, 0x76,
+  0x65, 0x73, 0x20, 0x74, 0x6f, 0x20, 0x74, 0x68, 0x65, 0x20, 0x63, 0x6f,
+  0x6d, 0x70, 0x6c, 0x65, 0x74, 0x65, 0x64, 0x20, 0x74, 0x65, 0x78, 0x74,
+  0x2e, 0x20, 0x54, 0x68, 0x69, 0x73, 0x20, 0x64, 0x6f, 0x65, 0x73, 0x20,
+  0x6e, 0x6f, 0x74, 0x20, 0x73, 0x75, 0x70, 0x70, 0x6f, 0x72, 0x74, 0x20,
+  0x73, 0x74, 0x72, 0x65, 0x61, 0x6d, 0x69, 0x6e, 0x67, 0x0a, 0x2f, 0x2f,
+  0x0a, 0x2f, 0x2f, 0x20, 0x45, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x3a,
+  0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c,
+  0x6c, 0x61, 0x6d, 0x61, 0x50, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x28,
+  0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x29, 0x2e, 0x74, 0x68, 0x65, 0x6e,
+  0x28, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x20, 0x3d,
+  0x3e, 0x20, 0x7b, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x77, 0x72,
+  0x69, 0x74, 0x65, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29,
+  0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x29, 0x0a, 0x2f,
+  0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x72, 0x0a,
+  0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f,
+  0x6e, 0x73, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20,
+  0x3d, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d,
+  0x61, 0x50, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x28, 0x70, 0x72, 0x6f,
+  0x6d, 0x70, 0x74, 0x29, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x77, 0x72, 0x69,
+  0x74, 0x65, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x0a,
+  0x2f, 0x2f, 0x0a, 0x65, 0x78, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x63, 0x6f,
+  0x6e, 0x73, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x50, 0x72, 0x6f,
+  0x6d, 0x69, 0x73, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x70, 0x72, 0x6f, 0x6d,
+  0x70, 0x74, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x20, 0x3d,
+  0x20, 0x7b, 0x7d, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x20,
+  0x3d, 0x20, 0x7b, 0x7d, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20,
+  0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x65, 0x77, 0x20,
+  0x50, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x28, 0x61, 0x73, 0x79, 0x6e,
+  0x63, 0x20, 0x28, 0x72, 0x65, 0x73, 0x6f, 0x6c, 0x76, 0x65, 0x2c, 0x20,
+  0x72, 0x65, 0x6a, 0x65, 0x63, 0x74, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b,
+  0x0a, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x63, 0x6f, 0x6e,
+  0x74, 0x65, 0x6e, 0x74, 0x20, 0x3d, 0x20, 0x22, 0x22, 0x3b, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x74, 0x72, 0x79, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74,
+  0x20, 0x28, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x68, 0x75, 0x6e,
+  0x6b, 0x20, 0x6f, 0x66, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x28, 0x70,
+  0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d,
+  0x73, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x29, 0x29, 0x20,
+  0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f,
+  0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x2b, 0x3d, 0x20, 0x63, 0x68, 0x75,
+  0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x63, 0x6f, 0x6e, 0x74,
+  0x65, 0x6e, 0x74, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d,
+  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x73, 0x6f, 0x6c,
+  0x76, 0x65, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x3b,
+  0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x63, 0x61, 0x74, 0x63, 0x68,
+  0x20, 0x28, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x29, 0x20, 0x7b, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x6a, 0x65, 0x63, 0x74, 0x28,
+  0x65, 0x72, 0x72, 0x6f, 0x72, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
+  0x7d, 0x0a, 0x20, 0x20, 0x7d, 0x29, 0x3b, 0x0a, 0x7d, 0x3b, 0x0a, 0x0a,
+  0x2f, 0x2a, 0x2a, 0x0a, 0x20, 0x2a, 0x20, 0x28, 0x64, 0x65, 0x70, 0x72,
+  0x65, 0x63, 0x61, 0x74, 0x65, 0x64, 0x29, 0x0a, 0x20, 0x2a, 0x2f, 0x0a,
+  0x65, 0x78, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74,
+  0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x43, 0x6f, 0x6d, 0x70, 0x6c, 0x65,
+  0x74, 0x65, 0x20, 0x3d, 0x20, 0x61, 0x73, 0x79, 0x6e, 0x63, 0x20, 0x28,
+  0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x74,
+  0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2c, 0x20, 0x63, 0x61, 0x6c, 0x6c,
+  0x62, 0x61, 0x63, 0x6b, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20,
+  0x20, 0x66, 0x6f, 0x72, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x28,
+  0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x20,
+  0x6f, 0x66, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x28, 0x70, 0x61, 0x72,
+  0x61, 0x6d, 0x73, 0x2e, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c, 0x20,
+  0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x7b, 0x20, 0x63, 0x6f,
+  0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x20, 0x7d, 0x29, 0x29,
+  0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x61, 0x6c, 0x6c, 0x62,
+  0x61, 0x63, 0x6b, 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x29, 0x3b, 0x0a,
+  0x20, 0x20, 0x7d, 0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2f, 0x20, 0x47, 0x65,
+  0x74, 0x20, 0x74, 0x68, 0x65, 0x20, 0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x20,
+  0x69, 0x6e, 0x66, 0x6f, 0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x74, 0x68,
+  0x65, 0x20, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x2e, 0x20, 0x54, 0x68,
+  0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x75, 0x73, 0x65, 0x66, 0x75, 0x6c,
+  0x20, 0x66, 0x6f, 0x72, 0x20, 0x67, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67,
+  0x20, 0x74, 0x68, 0x65, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74,
+  0x20, 0x77, 0x69, 0x6e, 0x64, 0x6f, 0x77, 0x20, 0x61, 0x6e, 0x64, 0x20,
+  0x73, 0x6f, 0x20, 0x6f, 0x6e, 0x2e, 0x0a, 0x65, 0x78, 0x70, 0x6f, 0x72,
+  0x74, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d,
+  0x61, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x49, 0x6e, 0x66, 0x6f, 0x20, 0x3d,
+  0x20, 0x61, 0x73, 0x79, 0x6e, 0x63, 0x20, 0x28, 0x29, 0x20, 0x3d, 0x3e,
+  0x20, 0x7b, 0x0a, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x21, 0x67, 0x65,
+  0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74,
+  0x74, 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f,
+  0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x20, 0x3d, 0x20, 0x61,
+  0x77, 0x61, 0x69, 0x74, 0x20, 0x66, 0x65, 0x74, 0x63, 0x68, 0x28, 0x22,
+  0x2f, 0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x2e, 0x6a, 0x73, 0x6f, 0x6e, 0x22,
+  0x29, 0x2e, 0x74, 0x68, 0x65, 0x6e, 0x28, 0x72, 0x20, 0x3d, 0x3e, 0x20,
+  0x72, 0x2e, 0x6a, 0x73, 0x6f, 0x6e, 0x28, 0x29, 0x29, 0x3b, 0x0a, 0x20,
+  0x20, 0x7d, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20,
  0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73,
-  0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x20, 0x3d, 0x20, 0x61, 0x77,
-  0x61, 0x69, 0x74, 0x20, 0x66, 0x65, 0x74, 0x63, 0x68, 0x28, 0x22, 0x2f,
-  0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x2e, 0x6a, 0x73, 0x6f, 0x6e, 0x22, 0x29,
-  0x2e, 0x74, 0x68, 0x65, 0x6e, 0x28, 0x72, 0x20, 0x3d, 0x3e, 0x20, 0x72,
-  0x2e, 0x6a, 0x73, 0x6f, 0x6e, 0x28, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20,
-  0x7d, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x67,
-  0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65,
-  0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x3b, 0x0a, 0x7d, 0x0a
+  0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x3b, 0x0a, 0x7d, 0x0a
 };
-unsigned int completion_js_len = 4462;
+unsigned int completion_js_len = 5099;
--- a/examples/server/deps.sh
+++ b/examples/server/deps.sh
@@ -11,8 +11,10 @@ echo >> $PUBLIC/index.js # add newline

 FILES=$(ls $PUBLIC)

+cd $PUBLIC
 for FILE in $FILES; do
-  func=$(echo $FILE | tr '.' '_')
-  echo "generate $FILE.hpp ($func)"
-  xxd -n $func -i $PUBLIC/$FILE > $DIR/$FILE.hpp
+  echo "generate $FILE.hpp"
+
+  # use simple flag for old version of xxd
+  xxd -i $FILE > $DIR/$FILE.hpp
 done
--- a/examples/server/index.html.hpp
+++ b/examples/server/index.html.hpp
--- a/examples/server/index.js.hpp
+++ b/examples/server/index.js.hpp
--- a/examples/server/json-schema-to-grammar.mjs.hpp
+++ b/examples/server/json-schema-to-grammar.mjs.hpp
@@ -0,0 +1,311 @@
+unsigned char json_schema_to_grammar_mjs[] = {
+  0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x53, 0x50, 0x41, 0x43, 0x45, 0x5f,
+  0x52, 0x55, 0x4c, 0x45, 0x20, 0x3d, 0x20, 0x27, 0x22, 0x20, 0x22, 0x3f,
+  0x27, 0x3b, 0x0a, 0x0a, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x50, 0x52,
+  0x49, 0x4d, 0x49, 0x54, 0x49, 0x56, 0x45, 0x5f, 0x52, 0x55, 0x4c, 0x45,
+  0x53, 0x20, 0x3d, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x62, 0x6f, 0x6f, 0x6c,
+  0x65, 0x61, 0x6e, 0x3a, 0x20, 0x27, 0x28, 0x22, 0x74, 0x72, 0x75, 0x65,
+  0x22, 0x20, 0x7c, 0x20, 0x22, 0x66, 0x61, 0x6c, 0x73, 0x65, 0x22, 0x29,
+  0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x27, 0x2c, 0x0a, 0x20, 0x20, 0x6e,
+  0x75, 0x6d, 0x62, 0x65, 0x72, 0x3a, 0x20, 0x27, 0x28, 0x22, 0x2d, 0x22,
+  0x3f, 0x20, 0x28, 0x5b, 0x30, 0x2d, 0x39, 0x5d, 0x20, 0x7c, 0x20, 0x5b,
+  0x31, 0x2d, 0x39, 0x5d, 0x20, 0x5b, 0x30, 0x2d, 0x39, 0x5d, 0x2a, 0x29,
+  0x29, 0x20, 0x28, 0x22, 0x2e, 0x22, 0x20, 0x5b, 0x30, 0x2d, 0x39, 0x5d,
+  0x2b, 0x29, 0x3f, 0x20, 0x28, 0x5b, 0x65, 0x45, 0x5d, 0x20, 0x5b, 0x2d,
+  0x2b, 0x5d, 0x3f, 0x20, 0x5b, 0x30, 0x2d, 0x39, 0x5d, 0x2b, 0x29, 0x3f,
+  0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x27, 0x2c, 0x0a, 0x20, 0x20, 0x69,
+  0x6e, 0x74, 0x65, 0x67, 0x65, 0x72, 0x3a, 0x20, 0x27, 0x28, 0x22, 0x2d,
+  0x22, 0x3f, 0x20, 0x28, 0x5b, 0x30, 0x2d, 0x39, 0x5d, 0x20, 0x7c, 0x20,
+  0x5b, 0x31, 0x2d, 0x39, 0x5d, 0x20, 0x5b, 0x30, 0x2d, 0x39, 0x5d, 0x2a,
+  0x29, 0x29, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x27, 0x2c, 0x0a, 0x20,
+  0x20, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x3a, 0x20, 0x60, 0x20, 0x22,
+  0x5c, 0x5c, 0x22, 0x22, 0x20, 0x28, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x5b, 0x5e, 0x22, 0x5c, 0x5c, 0x5c, 0x5c, 0x5d, 0x20,
+  0x7c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x22, 0x5c,
+  0x5c, 0x5c, 0x5c, 0x22, 0x20, 0x28, 0x5b, 0x22, 0x5c, 0x5c, 0x5c, 0x5c,
+  0x2f, 0x62, 0x66, 0x6e, 0x72, 0x74, 0x5d, 0x20, 0x7c, 0x20, 0x22, 0x75,
+  0x22, 0x20, 0x5b, 0x30, 0x2d, 0x39, 0x61, 0x2d, 0x66, 0x41, 0x2d, 0x46,
+  0x5d, 0x20, 0x5b, 0x30, 0x2d, 0x39, 0x61, 0x2d, 0x66, 0x41, 0x2d, 0x46,
+  0x5d, 0x20, 0x5b, 0x30, 0x2d, 0x39, 0x61, 0x2d, 0x66, 0x41, 0x2d, 0x46,
+  0x5d, 0x20, 0x5b, 0x30, 0x2d, 0x39, 0x61, 0x2d, 0x66, 0x41, 0x2d, 0x46,
+  0x5d, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x29, 0x2a, 0x20,
+  0x22, 0x5c, 0x5c, 0x22, 0x22, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x60,
+  0x2c, 0x0a, 0x20, 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x3a, 0x20, 0x27, 0x22,
+  0x6e, 0x75, 0x6c, 0x6c, 0x22, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x27,
+  0x2c, 0x0a, 0x7d, 0x3b, 0x0a, 0x0a, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20,
+  0x49, 0x4e, 0x56, 0x41, 0x4c, 0x49, 0x44, 0x5f, 0x52, 0x55, 0x4c, 0x45,
+  0x5f, 0x43, 0x48, 0x41, 0x52, 0x53, 0x5f, 0x52, 0x45, 0x20, 0x3d, 0x20,
+  0x2f, 0x5b, 0x5e, 0x5c, 0x64, 0x41, 0x2d, 0x5a, 0x61, 0x2d, 0x7a, 0x2d,
+  0x5d, 0x2b, 0x2f, 0x67, 0x3b, 0x0a, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20,
+  0x47, 0x52, 0x41, 0x4d, 0x4d, 0x41, 0x52, 0x5f, 0x4c, 0x49, 0x54, 0x45,
+  0x52, 0x41, 0x4c, 0x5f, 0x45, 0x53, 0x43, 0x41, 0x50, 0x45, 0x5f, 0x52,
+  0x45, 0x20, 0x3d, 0x20, 0x2f, 0x5b, 0x5c, 0x6e, 0x5c, 0x72, 0x22, 0x5d,
+  0x2f, 0x67, 0x3b, 0x0a, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x47, 0x52,
+  0x41, 0x4d, 0x4d, 0x41, 0x52, 0x5f, 0x4c, 0x49, 0x54, 0x45, 0x52, 0x41,
+  0x4c, 0x5f, 0x45, 0x53, 0x43, 0x41, 0x50, 0x45, 0x53, 0x20, 0x3d, 0x20,
+  0x7b, 0x27, 0x5c, 0x72, 0x27, 0x3a, 0x20, 0x27, 0x5c, 0x5c, 0x72, 0x27,
+  0x2c, 0x20, 0x27, 0x5c, 0x6e, 0x27, 0x3a, 0x20, 0x27, 0x5c, 0x5c, 0x6e,
+  0x27, 0x2c, 0x20, 0x27, 0x22, 0x27, 0x3a, 0x20, 0x27, 0x5c, 0x5c, 0x22,
+  0x27, 0x7d, 0x3b, 0x0a, 0x0a, 0x65, 0x78, 0x70, 0x6f, 0x72, 0x74, 0x20,
+  0x63, 0x6c, 0x61, 0x73, 0x73, 0x20, 0x53, 0x63, 0x68, 0x65, 0x6d, 0x61,
+  0x43, 0x6f, 0x6e, 0x76, 0x65, 0x72, 0x74, 0x65, 0x72, 0x20, 0x7b, 0x0a,
+  0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x72, 0x75, 0x63, 0x74, 0x6f,
+  0x72, 0x28, 0x70, 0x72, 0x6f, 0x70, 0x4f, 0x72, 0x64, 0x65, 0x72, 0x29,
+  0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e,
+  0x5f, 0x70, 0x72, 0x6f, 0x70, 0x4f, 0x72, 0x64, 0x65, 0x72, 0x20, 0x3d,
+  0x20, 0x70, 0x72, 0x6f, 0x70, 0x4f, 0x72, 0x64, 0x65, 0x72, 0x20, 0x7c,
+  0x7c, 0x20, 0x7b, 0x7d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68,
+  0x69, 0x73, 0x2e, 0x5f, 0x72, 0x75, 0x6c, 0x65, 0x73, 0x20, 0x3d, 0x20,
+  0x6e, 0x65, 0x77, 0x20, 0x4d, 0x61, 0x70, 0x28, 0x29, 0x3b, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x72, 0x75, 0x6c,
+  0x65, 0x73, 0x2e, 0x73, 0x65, 0x74, 0x28, 0x27, 0x73, 0x70, 0x61, 0x63,
+  0x65, 0x27, 0x2c, 0x20, 0x53, 0x50, 0x41, 0x43, 0x45, 0x5f, 0x52, 0x55,
+  0x4c, 0x45, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20,
+  0x5f, 0x66, 0x6f, 0x72, 0x6d, 0x61, 0x74, 0x4c, 0x69, 0x74, 0x65, 0x72,
+  0x61, 0x6c, 0x28, 0x6c, 0x69, 0x74, 0x65, 0x72, 0x61, 0x6c, 0x29, 0x20,
+  0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20,
+  0x65, 0x73, 0x63, 0x61, 0x70, 0x65, 0x64, 0x20, 0x3d, 0x20, 0x4a, 0x53,
+  0x4f, 0x4e, 0x2e, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x69, 0x66, 0x79,
+  0x28, 0x6c, 0x69, 0x74, 0x65, 0x72, 0x61, 0x6c, 0x29, 0x2e, 0x72, 0x65,
+  0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x47, 0x52, 0x41, 0x4d, 0x4d, 0x41, 0x52, 0x5f, 0x4c, 0x49, 0x54,
+  0x45, 0x52, 0x41, 0x4c, 0x5f, 0x45, 0x53, 0x43, 0x41, 0x50, 0x45, 0x5f,
+  0x52, 0x45, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x20,
+  0x3d, 0x3e, 0x20, 0x47, 0x52, 0x41, 0x4d, 0x4d, 0x41, 0x52, 0x5f, 0x4c,
+  0x49, 0x54, 0x45, 0x52, 0x41, 0x4c, 0x5f, 0x45, 0x53, 0x43, 0x41, 0x50,
+  0x45, 0x53, 0x5b, 0x6d, 0x5d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x29, 0x3b,
+  0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20,
+  0x60, 0x22, 0x24, 0x7b, 0x65, 0x73, 0x63, 0x61, 0x70, 0x65, 0x64, 0x7d,
+  0x22, 0x60, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x5f,
+  0x61, 0x64, 0x64, 0x52, 0x75, 0x6c, 0x65, 0x28, 0x6e, 0x61, 0x6d, 0x65,
+  0x2c, 0x20, 0x72, 0x75, 0x6c, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20,
+  0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x65, 0x73, 0x63, 0x4e, 0x61, 0x6d,
+  0x65, 0x20, 0x3d, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x2e, 0x72, 0x65, 0x70,
+  0x6c, 0x61, 0x63, 0x65, 0x28, 0x49, 0x4e, 0x56, 0x41, 0x4c, 0x49, 0x44,
+  0x5f, 0x52, 0x55, 0x4c, 0x45, 0x5f, 0x43, 0x48, 0x41, 0x52, 0x53, 0x5f,
+  0x52, 0x45, 0x2c, 0x20, 0x27, 0x2d, 0x27, 0x29, 0x3b, 0x0a, 0x20, 0x20,
+  0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x6b, 0x65, 0x79, 0x20, 0x3d, 0x20,
+  0x65, 0x73, 0x63, 0x4e, 0x61, 0x6d, 0x65, 0x3b, 0x0a, 0x0a, 0x20, 0x20,
+  0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f,
+  0x72, 0x75, 0x6c, 0x65, 0x73, 0x2e, 0x68, 0x61, 0x73, 0x28, 0x65, 0x73,
+  0x63, 0x4e, 0x61, 0x6d, 0x65, 0x29, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x74, 0x68, 0x69, 0x73,
+  0x2e, 0x5f, 0x72, 0x75, 0x6c, 0x65, 0x73, 0x2e, 0x67, 0x65, 0x74, 0x28,
+  0x65, 0x73, 0x63, 0x4e, 0x61, 0x6d, 0x65, 0x29, 0x20, 0x3d, 0x3d, 0x3d,
+  0x20, 0x72, 0x75, 0x6c, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20,
+  0x6b, 0x65, 0x79, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d,
+  0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20,
+  0x69, 0x20, 0x3d, 0x20, 0x30, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x74, 0x68, 0x69, 0x73,
+  0x2e, 0x5f, 0x72, 0x75, 0x6c, 0x65, 0x73, 0x2e, 0x68, 0x61, 0x73, 0x28,
+  0x60, 0x24, 0x7b, 0x65, 0x73, 0x63, 0x4e, 0x61, 0x6d, 0x65, 0x7d, 0x24,
+  0x7b, 0x69, 0x7d, 0x60, 0x29, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x20, 0x2b, 0x3d, 0x20, 0x31, 0x3b,
+  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x6b, 0x65, 0x79, 0x20, 0x3d, 0x20, 0x60, 0x24, 0x7b,
+  0x65, 0x73, 0x63, 0x4e, 0x61, 0x6d, 0x65, 0x7d, 0x24, 0x7b, 0x69, 0x7d,
+  0x60, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20,
+  0x20, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x72, 0x75, 0x6c, 0x65,
+  0x73, 0x2e, 0x73, 0x65, 0x74, 0x28, 0x6b, 0x65, 0x79, 0x2c, 0x20, 0x72,
+  0x75, 0x6c, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65,
+  0x74, 0x75, 0x72, 0x6e, 0x20, 0x6b, 0x65, 0x79, 0x3b, 0x0a, 0x20, 0x20,
+  0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x76, 0x69, 0x73, 0x69, 0x74, 0x28, 0x73,
+  0x63, 0x68, 0x65, 0x6d, 0x61, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x29,
+  0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74,
+  0x20, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x54, 0x79, 0x70, 0x65, 0x20,
+  0x3d, 0x20, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2e, 0x74, 0x79, 0x70,
+  0x65, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74,
+  0x20, 0x72, 0x75, 0x6c, 0x65, 0x4e, 0x61, 0x6d, 0x65, 0x20, 0x3d, 0x20,
+  0x6e, 0x61, 0x6d, 0x65, 0x20, 0x7c, 0x7c, 0x20, 0x27, 0x72, 0x6f, 0x6f,
+  0x74, 0x27, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20,
+  0x28, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2e, 0x6f, 0x6e, 0x65, 0x4f,
+  0x66, 0x20, 0x7c, 0x7c, 0x20, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2e,
+  0x61, 0x6e, 0x79, 0x4f, 0x66, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, 0x75, 0x6c,
+  0x65, 0x20, 0x3d, 0x20, 0x28, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2e,
+  0x6f, 0x6e, 0x65, 0x4f, 0x66, 0x20, 0x7c, 0x7c, 0x20, 0x73, 0x63, 0x68,
+  0x65, 0x6d, 0x61, 0x2e, 0x61, 0x6e, 0x79, 0x4f, 0x66, 0x29, 0x2e, 0x6d,
+  0x61, 0x70, 0x28, 0x28, 0x61, 0x6c, 0x74, 0x53, 0x63, 0x68, 0x65, 0x6d,
+  0x61, 0x2c, 0x20, 0x69, 0x29, 0x20, 0x3d, 0x3e, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x76, 0x69,
+  0x73, 0x69, 0x74, 0x28, 0x61, 0x6c, 0x74, 0x53, 0x63, 0x68, 0x65, 0x6d,
+  0x61, 0x2c, 0x20, 0x60, 0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x7d, 0x24,
+  0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x20, 0x3f, 0x20, 0x22, 0x2d, 0x22, 0x20,
+  0x3a, 0x20, 0x22, 0x22, 0x7d, 0x24, 0x7b, 0x69, 0x7d, 0x60, 0x29, 0x0a,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x29, 0x2e, 0x6a, 0x6f, 0x69, 0x6e,
+  0x28, 0x27, 0x20, 0x7c, 0x20, 0x27, 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74,
+  0x68, 0x69, 0x73, 0x2e, 0x5f, 0x61, 0x64, 0x64, 0x52, 0x75, 0x6c, 0x65,
+  0x28, 0x72, 0x75, 0x6c, 0x65, 0x4e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x72,
+  0x75, 0x6c, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20,
+  0x65, 0x6c, 0x73, 0x65, 0x20, 0x69, 0x66, 0x20, 0x28, 0x27, 0x63, 0x6f,
+  0x6e, 0x73, 0x74, 0x27, 0x20, 0x69, 0x6e, 0x20, 0x73, 0x63, 0x68, 0x65,
+  0x6d, 0x61, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e,
+  0x5f, 0x61, 0x64, 0x64, 0x52, 0x75, 0x6c, 0x65, 0x28, 0x72, 0x75, 0x6c,
+  0x65, 0x4e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e,
+  0x5f, 0x66, 0x6f, 0x72, 0x6d, 0x61, 0x74, 0x4c, 0x69, 0x74, 0x65, 0x72,
+  0x61, 0x6c, 0x28, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2e, 0x63, 0x6f,
+  0x6e, 0x73, 0x74, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d,
+  0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x69, 0x66, 0x20, 0x28, 0x27, 0x65,
+  0x6e, 0x75, 0x6d, 0x27, 0x20, 0x69, 0x6e, 0x20, 0x73, 0x63, 0x68, 0x65,
+  0x6d, 0x61, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, 0x75, 0x6c, 0x65, 0x20, 0x3d,
+  0x20, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2e, 0x65, 0x6e, 0x75, 0x6d,
+  0x2e, 0x6d, 0x61, 0x70, 0x28, 0x76, 0x20, 0x3d, 0x3e, 0x20, 0x74, 0x68,
+  0x69, 0x73, 0x2e, 0x5f, 0x66, 0x6f, 0x72, 0x6d, 0x61, 0x74, 0x4c, 0x69,
+  0x74, 0x65, 0x72, 0x61, 0x6c, 0x28, 0x76, 0x29, 0x29, 0x2e, 0x6a, 0x6f,
+  0x69, 0x6e, 0x28, 0x27, 0x20, 0x7c, 0x20, 0x27, 0x29, 0x3b, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20,
+  0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x61, 0x64, 0x64, 0x52, 0x75, 0x6c,
+  0x65, 0x28, 0x72, 0x75, 0x6c, 0x65, 0x4e, 0x61, 0x6d, 0x65, 0x2c, 0x20,
+  0x72, 0x75, 0x6c, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d,
+  0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x69, 0x66, 0x20, 0x28, 0x73, 0x63,
+  0x68, 0x65, 0x6d, 0x61, 0x54, 0x79, 0x70, 0x65, 0x20, 0x3d, 0x3d, 0x3d,
+  0x20, 0x27, 0x6f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x27, 0x20, 0x26, 0x26,
+  0x20, 0x27, 0x70, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x69, 0x65, 0x73,
+  0x27, 0x20, 0x69, 0x6e, 0x20, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x29,
+  0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20,
+  0x54, 0x4f, 0x44, 0x4f, 0x3a, 0x20, 0x60, 0x72, 0x65, 0x71, 0x75, 0x69,
+  0x72, 0x65, 0x64, 0x60, 0x20, 0x6b, 0x65, 0x79, 0x77, 0x6f, 0x72, 0x64,
+  0x20, 0x28, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x70, 0x79, 0x74, 0x68, 0x6f,
+  0x6e, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61,
+  0x74, 0x69, 0x6f, 0x6e, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x70, 0x72, 0x6f, 0x70, 0x4f, 0x72,
+  0x64, 0x65, 0x72, 0x20, 0x3d, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f,
+  0x70, 0x72, 0x6f, 0x70, 0x4f, 0x72, 0x64, 0x65, 0x72, 0x3b, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x70,
+  0x72, 0x6f, 0x70, 0x50, 0x61, 0x69, 0x72, 0x73, 0x20, 0x3d, 0x20, 0x4f,
+  0x62, 0x6a, 0x65, 0x63, 0x74, 0x2e, 0x65, 0x6e, 0x74, 0x72, 0x69, 0x65,
+  0x73, 0x28, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2e, 0x70, 0x72, 0x6f,
+  0x70, 0x65, 0x72, 0x74, 0x69, 0x65, 0x73, 0x29, 0x2e, 0x73, 0x6f, 0x72,
+  0x74, 0x28, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x3d, 0x3e, 0x20,
+  0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f,
+  0x20, 0x73, 0x6f, 0x72, 0x74, 0x20, 0x62, 0x79, 0x20, 0x70, 0x6f, 0x73,
+  0x69, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x69, 0x6e, 0x20, 0x70, 0x72, 0x6f,
+  0x70, 0x5f, 0x6f, 0x72, 0x64, 0x65, 0x72, 0x20, 0x28, 0x69, 0x66, 0x20,
+  0x73, 0x70, 0x65, 0x63, 0x69, 0x66, 0x69, 0x65, 0x64, 0x29, 0x20, 0x74,
+  0x68, 0x65, 0x6e, 0x20, 0x62, 0x79, 0x20, 0x6b, 0x65, 0x79, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74,
+  0x20, 0x6f, 0x72, 0x64, 0x65, 0x72, 0x41, 0x20, 0x3d, 0x20, 0x74, 0x79,
+  0x70, 0x65, 0x6f, 0x66, 0x20, 0x70, 0x72, 0x6f, 0x70, 0x4f, 0x72, 0x64,
+  0x65, 0x72, 0x5b, 0x61, 0x5b, 0x30, 0x5d, 0x5d, 0x20, 0x3d, 0x3d, 0x3d,
+  0x20, 0x27, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x27, 0x20, 0x3f, 0x20,
+  0x70, 0x72, 0x6f, 0x70, 0x4f, 0x72, 0x64, 0x65, 0x72, 0x5b, 0x61, 0x5b,
+  0x30, 0x5d, 0x5d, 0x20, 0x3a, 0x20, 0x49, 0x6e, 0x66, 0x69, 0x6e, 0x69,
+  0x74, 0x79, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6f, 0x72, 0x64, 0x65, 0x72, 0x42,
+  0x20, 0x3d, 0x20, 0x74, 0x79, 0x70, 0x65, 0x6f, 0x66, 0x20, 0x70, 0x72,
+  0x6f, 0x70, 0x4f, 0x72, 0x64, 0x65, 0x72, 0x5b, 0x62, 0x5b, 0x30, 0x5d,
+  0x5d, 0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x27, 0x6e, 0x75, 0x6d, 0x62, 0x65,
+  0x72, 0x27, 0x20, 0x3f, 0x20, 0x70, 0x72, 0x6f, 0x70, 0x4f, 0x72, 0x64,
+  0x65, 0x72, 0x5b, 0x62, 0x5b, 0x30, 0x5d, 0x5d, 0x20, 0x3a, 0x20, 0x49,
+  0x6e, 0x66, 0x69, 0x6e, 0x69, 0x74, 0x79, 0x3b, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20,
+  0x6f, 0x72, 0x64, 0x65, 0x72, 0x41, 0x20, 0x2d, 0x20, 0x6f, 0x72, 0x64,
+  0x65, 0x72, 0x42, 0x20, 0x7c, 0x7c, 0x20, 0x61, 0x5b, 0x30, 0x5d, 0x2e,
+  0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x65, 0x43, 0x6f, 0x6d, 0x70, 0x61, 0x72,
+  0x65, 0x28, 0x62, 0x5b, 0x30, 0x5d, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x7d, 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x72, 0x75, 0x6c, 0x65, 0x20, 0x3d,
+  0x20, 0x27, 0x22, 0x7b, 0x22, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x27,
+  0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x72, 0x6f, 0x70,
+  0x50, 0x61, 0x69, 0x72, 0x73, 0x2e, 0x66, 0x6f, 0x72, 0x45, 0x61, 0x63,
+  0x68, 0x28, 0x28, 0x5b, 0x70, 0x72, 0x6f, 0x70, 0x4e, 0x61, 0x6d, 0x65,
+  0x2c, 0x20, 0x70, 0x72, 0x6f, 0x70, 0x53, 0x63, 0x68, 0x65, 0x6d, 0x61,
+  0x5d, 0x2c, 0x20, 0x69, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74,
+  0x20, 0x70, 0x72, 0x6f, 0x70, 0x52, 0x75, 0x6c, 0x65, 0x4e, 0x61, 0x6d,
+  0x65, 0x20, 0x3d, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x76, 0x69, 0x73,
+  0x69, 0x74, 0x28, 0x70, 0x72, 0x6f, 0x70, 0x53, 0x63, 0x68, 0x65, 0x6d,
+  0x61, 0x2c, 0x20, 0x60, 0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x7d, 0x24,
+  0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x20, 0x3f, 0x20, 0x22, 0x2d, 0x22, 0x20,
+  0x3a, 0x20, 0x22, 0x22, 0x7d, 0x24, 0x7b, 0x70, 0x72, 0x6f, 0x70, 0x4e,
+  0x61, 0x6d, 0x65, 0x7d, 0x60, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x69, 0x20, 0x3e, 0x20,
+  0x30, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x72, 0x75, 0x6c, 0x65, 0x20, 0x2b, 0x3d, 0x20, 0x27,
+  0x20, 0x22, 0x2c, 0x22, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x27, 0x3b,
+  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x75, 0x6c, 0x65, 0x20,
+  0x2b, 0x3d, 0x20, 0x60, 0x20, 0x24, 0x7b, 0x74, 0x68, 0x69, 0x73, 0x2e,
+  0x5f, 0x66, 0x6f, 0x72, 0x6d, 0x61, 0x74, 0x4c, 0x69, 0x74, 0x65, 0x72,
+  0x61, 0x6c, 0x28, 0x70, 0x72, 0x6f, 0x70, 0x4e, 0x61, 0x6d, 0x65, 0x29,
+  0x7d, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x22, 0x3a, 0x22, 0x20,
+  0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x24, 0x7b, 0x70, 0x72, 0x6f, 0x70,
+  0x52, 0x75, 0x6c, 0x65, 0x4e, 0x61, 0x6d, 0x65, 0x7d, 0x60, 0x3b, 0x0a,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x29, 0x3b, 0x0a, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x72, 0x75, 0x6c, 0x65, 0x20, 0x2b, 0x3d, 0x20,
+  0x27, 0x20, 0x22, 0x7d, 0x22, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x27,
+  0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74,
+  0x75, 0x72, 0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x61, 0x64,
+  0x64, 0x52, 0x75, 0x6c, 0x65, 0x28, 0x72, 0x75, 0x6c, 0x65, 0x4e, 0x61,
+  0x6d, 0x65, 0x2c, 0x20, 0x72, 0x75, 0x6c, 0x65, 0x29, 0x3b, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x7d, 0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x69, 0x66,
+  0x20, 0x28, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x54, 0x79, 0x70, 0x65,
+  0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x27, 0x61, 0x72, 0x72, 0x61, 0x79, 0x27,
+  0x20, 0x26, 0x26, 0x20, 0x27, 0x69, 0x74, 0x65, 0x6d, 0x73, 0x27, 0x20,
+  0x69, 0x6e, 0x20, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x29, 0x20, 0x7b,
+  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x54, 0x4f,
+  0x44, 0x4f, 0x20, 0x60, 0x70, 0x72, 0x65, 0x66, 0x69, 0x78, 0x49, 0x74,
+  0x65, 0x6d, 0x73, 0x60, 0x20, 0x6b, 0x65, 0x79, 0x77, 0x6f, 0x72, 0x64,
+  0x20, 0x28, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x70, 0x79, 0x74, 0x68, 0x6f,
+  0x6e, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61,
+  0x74, 0x69, 0x6f, 0x6e, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x74, 0x65, 0x6d, 0x52, 0x75,
+  0x6c, 0x65, 0x4e, 0x61, 0x6d, 0x65, 0x20, 0x3d, 0x20, 0x74, 0x68, 0x69,
+  0x73, 0x2e, 0x76, 0x69, 0x73, 0x69, 0x74, 0x28, 0x73, 0x63, 0x68, 0x65,
+  0x6d, 0x61, 0x2e, 0x69, 0x74, 0x65, 0x6d, 0x73, 0x2c, 0x20, 0x60, 0x24,
+  0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x7d, 0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65,
+  0x20, 0x3f, 0x20, 0x22, 0x2d, 0x22, 0x20, 0x3a, 0x20, 0x22, 0x22, 0x7d,
+  0x69, 0x74, 0x65, 0x6d, 0x60, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, 0x75, 0x6c, 0x65,
+  0x20, 0x3d, 0x20, 0x60, 0x22, 0x5b, 0x22, 0x20, 0x73, 0x70, 0x61, 0x63,
+  0x65, 0x20, 0x28, 0x24, 0x7b, 0x69, 0x74, 0x65, 0x6d, 0x52, 0x75, 0x6c,
+  0x65, 0x4e, 0x61, 0x6d, 0x65, 0x7d, 0x20, 0x28, 0x22, 0x2c, 0x22, 0x20,
+  0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x24, 0x7b, 0x69, 0x74, 0x65, 0x6d,
+  0x52, 0x75, 0x6c, 0x65, 0x4e, 0x61, 0x6d, 0x65, 0x7d, 0x29, 0x2a, 0x29,
+  0x3f, 0x20, 0x22, 0x5d, 0x22, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x60,
+  0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75,
+  0x72, 0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x61, 0x64, 0x64,
+  0x52, 0x75, 0x6c, 0x65, 0x28, 0x72, 0x75, 0x6c, 0x65, 0x4e, 0x61, 0x6d,
+  0x65, 0x2c, 0x20, 0x72, 0x75, 0x6c, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20,
+  0x20, 0x20, 0x7d, 0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x7b, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x21, 0x50, 0x52,
+  0x49, 0x4d, 0x49, 0x54, 0x49, 0x56, 0x45, 0x5f, 0x52, 0x55, 0x4c, 0x45,
+  0x53, 0x5b, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x54, 0x79, 0x70, 0x65,
+  0x5d, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x74, 0x68, 0x72, 0x6f, 0x77, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x45,
+  0x72, 0x72, 0x6f, 0x72, 0x28, 0x60, 0x55, 0x6e, 0x72, 0x65, 0x63, 0x6f,
+  0x67, 0x6e, 0x69, 0x7a, 0x65, 0x64, 0x20, 0x73, 0x63, 0x68, 0x65, 0x6d,
+  0x61, 0x3a, 0x20, 0x24, 0x7b, 0x4a, 0x53, 0x4f, 0x4e, 0x2e, 0x73, 0x74,
+  0x72, 0x69, 0x6e, 0x67, 0x69, 0x66, 0x79, 0x28, 0x73, 0x63, 0x68, 0x65,
+  0x6d, 0x61, 0x29, 0x7d, 0x60, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65,
+  0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x61,
+  0x64, 0x64, 0x52, 0x75, 0x6c, 0x65, 0x28, 0x0a, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x72, 0x75, 0x6c, 0x65, 0x4e, 0x61, 0x6d, 0x65,
+  0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x27, 0x72, 0x6f, 0x6f, 0x74, 0x27, 0x20,
+  0x3f, 0x20, 0x27, 0x72, 0x6f, 0x6f, 0x74, 0x27, 0x20, 0x3a, 0x20, 0x73,
+  0x63, 0x68, 0x65, 0x6d, 0x61, 0x54, 0x79, 0x70, 0x65, 0x2c, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x50, 0x52, 0x49, 0x4d, 0x49,
+  0x54, 0x49, 0x56, 0x45, 0x5f, 0x52, 0x55, 0x4c, 0x45, 0x53, 0x5b, 0x73,
+  0x63, 0x68, 0x65, 0x6d, 0x61, 0x54, 0x79, 0x70, 0x65, 0x5d, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
+  0x7d, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x66, 0x6f, 0x72,
+  0x6d, 0x61, 0x74, 0x47, 0x72, 0x61, 0x6d, 0x6d, 0x61, 0x72, 0x28, 0x29,
+  0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x67,
+  0x72, 0x61, 0x6d, 0x6d, 0x61, 0x72, 0x20, 0x3d, 0x20, 0x27, 0x27, 0x3b,
+  0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x72,
+  0x75, 0x6c, 0x65, 0x73, 0x2e, 0x66, 0x6f, 0x72, 0x45, 0x61, 0x63, 0x68,
+  0x28, 0x28, 0x72, 0x75, 0x6c, 0x65, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65,
+  0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x67, 0x72, 0x61, 0x6d, 0x6d, 0x61, 0x72, 0x20, 0x2b, 0x3d, 0x20,
+  0x60, 0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x7d, 0x20, 0x3a, 0x3a, 0x3d,
+  0x20, 0x24, 0x7b, 0x72, 0x75, 0x6c, 0x65, 0x7d, 0x5c, 0x6e, 0x60, 0x3b,
+  0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x67, 0x72, 0x61, 0x6d,
+  0x6d, 0x61, 0x72, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x7d, 0x0a
+};
+unsigned int json_schema_to_grammar_mjs_len = 3695;
--- a/examples/server/public/completion.js
+++ b/examples/server/public/completion.js
@@ -43,6 +43,7 @@ export async function* llama(prompt, params = {}, config = {}) {
  const decoder = new TextDecoder();

  let content = "";
+  let leftover = ""; // Buffer for partially read lines

  try {
    let cont = true;
@@ -53,29 +54,47 @@ export async function* llama(prompt, params = {}, config = {}) {
        break;
      }

-      // sse answers in the form multiple lines of: value\n with data always present as a key. in our case we
-      // mainly care about the data: key here, which we expect as json
-      const text = decoder.decode(result.value);
+      // Add any leftover data to the current chunk of data
+      const text = leftover + decoder.decode(result.value);

-      // parse all sse events and add them to result
-      const regex = /^(\S+):\s(.*)$/gm;
-      for (const match of text.matchAll(regex)) {
-        result[match[1]] = match[2]
+      // Check if the last character is a line break
+      const endsWithLineBreak = text.endsWith('\n');
+
+      // Split the text into lines
+      let lines = text.split('\n');
+
+      // If the text doesn't end with a line break, then the last line is incomplete
+      // Store it in leftover to be added to the next chunk of data
+      if (!endsWithLineBreak) {
+        leftover = lines.pop();
+      } else {
+        leftover = ""; // Reset leftover if we have a line break at the end
      }

-      // since we know this is llama.cpp, let's just decode the json in data
-      result.data = JSON.parse(result.data);
-      content += result.data.content;
+      // Parse all sse events and add them to result
+      const regex = /^(\S+):\s(.*)$/gm;
+      for (const line of lines) {
+        const match = regex.exec(line);
+        if (match) {
+          result[match[1]] = match[2]
+          // since we know this is llama.cpp, let's just decode the json in data
+          if (result.data) {
+            result.data = JSON.parse(result.data);
+            content += result.data.content;

-      // yield
-      yield result;
+            // yield
+            yield result;

-      // if we got a stop token from server, we will break here
-      if (result.data.stop) {
-        if (result.data.generation_settings) {
-          generation_settings = result.data.generation_settings;
+            // if we got a stop token from server, we will break here
+            if (result.data.stop) {
+              if (result.data.generation_settings) {
+                generation_settings = result.data.generation_settings;
+              }
+              cont = false;
+              break;
+            }
+          }
        }
-        break;
      }
    }
  } catch (e) {
--- a/examples/server/public/index.html
+++ b/examples/server/public/index.html
@@ -3,12 +3,11 @@
 <head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1" />
+  <meta name="color-scheme" content="light dark">
  <title>llama.cpp - chat</title>

  <style>
    body {
-      background-color: #fff;
-      color: #000;
      font-family: system-ui;
      font-size: 90%;
    }
@@ -73,6 +72,37 @@
      margin: 0;
    }

+    fieldset.two {
+      display: grid;
+      grid-template: "a a";
+      gap: 1em;
+    }
+
+    fieldset.three {
+      display: grid;
+      grid-template: "a a a";
+      gap: 1em;
+    }
+
+    details {
+      border: 1px solid #aaa;
+      border-radius: 4px;
+      padding: 0.5em 0.5em 0;
+      margin-top: 0.5em;
+    }
+
+    summary {
+      font-weight: bold;
+      margin: -0.5em -0.5em 0;
+      padding: 0.5em;
+      cursor: pointer;
+    }
+
+    details[open] {
+      padding: 0.5em;
+    }
+
+
    textarea {
      padding: 5px;
      flex-grow: 1;
@@ -111,26 +141,165 @@
    } from '/index.js';

    import { llama } from '/completion.js';
+    import { SchemaConverter } from '/json-schema-to-grammar.mjs';

    const session = signal({
-      prompt: "This is a conversation between user and llama, a friendly chatbot. respond in simple markdown.",
+      prompt: "This is a conversation between User and Llama, a friendly chatbot. Llama is helpful, kind, honest, good at writing, and never fails to answer any requests immediately and with precision.",
      template: "{{prompt}}\n\n{{history}}\n{{char}}:",
      historyTemplate: "{{name}}: {{message}}",
      transcript: [],
      type: "chat",
-      char: "llama",
+      char: "Llama",
      user: "User",
    })

    const params = signal({
      n_predict: 400,
      temperature: 0.7,
-      repeat_last_n: 256,
-      repeat_penalty: 1.18,
-      top_k: 40,
-      top_p: 0.5,
+      repeat_last_n: 256, // 0 = disable penalty, -1 = context size
+      repeat_penalty: 1.18, // 1.0 = disabled
+      top_k: 40, // <= 0 to use vocab size
+      top_p: 0.5, // 1.0 = disabled
+      tfs_z: 1.0, // 1.0 = disabled
+      typical_p: 1.0, // 1.0 = disabled
+      presence_penalty: 0.0, // 0.0 = disabled
+      frequency_penalty: 0.0, // 0.0 = disabled
+      mirostat: 0, // 0/1/2
+      mirostat_tau: 5, // target entropy
+      mirostat_eta: 0.1, // learning rate
+      grammar: '',
    })

+    /* START: Support for storing prompt templates and parameters in borwser LocalStorage */
+
+    const local_storage_storageKey = "llamacpp_server_local_storage";
+
+    function local_storage_setDataFromObject(tag, content) {
+      localStorage.setItem(local_storage_storageKey + '/' + tag, JSON.stringify(content));
+    }
+
+    function local_storage_setDataFromRawText(tag, content) {
+      localStorage.setItem(local_storage_storageKey + '/' + tag, content);
+    }
+
+    function local_storage_getDataAsObject(tag) {
+      const item = localStorage.getItem(local_storage_storageKey + '/' + tag);
+      if (!item) {
+        return null;
+      } else {
+        return JSON.parse(item);
+      }
+    }
+
+    function local_storage_getDataAsRawText(tag) {
+      const item = localStorage.getItem(local_storage_storageKey + '/' + tag);
+      if (!item) {
+        return null;
+      } else {
+        return item;
+      }
+    }
+
+    // create a container for user templates and settings
+
+    const savedUserTemplates = signal({})
+    const selectedUserTemplate = signal({ name: '', template: { session: {}, params: {} } })
+
+    // let's import locally saved templates and settings if there are any
+    // user templates and settings are stored in one object
+    // in form of { "templatename": "templatedata" } and { "settingstemplatename":"settingsdata" }
+
+    console.log('Importing saved templates')
+
+    let importedTemplates = local_storage_getDataAsObject('user_templates')
+
+    if (importedTemplates) {
+      // saved templates were successfuly imported.
+
+      console.log('Processing saved templates and updating default template')
+
+      //console.log(importedTemplates);
+      savedUserTemplates.value = importedTemplates;
+
+      //override default template
+      savedUserTemplates.value.default = { session: session.value, params: params.value }
+      local_storage_setDataFromObject('user_templates', savedUserTemplates.value)
+    } else {
+      // no saved templates detected.
+
+      console.log('Initializing LocalStorage and saving default template')
+
+      savedUserTemplates.value = { "default": { session: session.value, params: params.value } }
+      local_storage_setDataFromObject('user_templates', savedUserTemplates.value)
+    }
+
+    function userTemplateResetToDefault() {
+      console.log('Reseting themplate to default')
+      selectedUserTemplate.value.name = 'default';
+      selectedUserTemplate.value.data = savedUserTemplates.value['default'];
+    }
+
+    function userTemplateApply(t) {
+      session.value = t.data.session;
+      params.value = t.data.params;
+    }
+
+    function userTemplateResetToDefaultAndApply() {
+      userTemplateResetToDefault()
+      userTemplateApply(selectedUserTemplate.value)
+    }
+
+    function userTemplateLoadAndApplyAutosaved() {
+      // get autosaved last used template
+      let lastUsedTemplate = local_storage_getDataAsObject('user_templates_last')
+
+      if (lastUsedTemplate) {
+
+        console.log('Autosaved template found, restoring')
+
+        selectedUserTemplate.value = lastUsedTemplate
+      }
+      else {
+
+        console.log('No autosaved template found, using default template')
+        // no autosaved last used template was found, so load from default.
+
+        userTemplateResetToDefault()
+      }
+
+      console.log('Applying template')
+      // and update internal data from templates
+
+      userTemplateApply(selectedUserTemplate.value)
+    }
+
+    //console.log(savedUserTemplates.value)
+    //console.log(selectedUserTemplate.value)
+
+    function userTemplateAutosave() {
+      console.log('Template Autosave...')
+      if (selectedUserTemplate.value.name == 'default') {
+        // we don't want to save over default template, so let's create a new one
+        let newTemplateName = 'UserTemplate-' + Date.now().toString()
+        let newTemplate = { 'name': newTemplateName, 'data': { 'session': session.value, 'params': params.value } }
+
+        console.log('Saving as ' + newTemplateName)
+
+        // save in the autosave slot
+        local_storage_setDataFromObject('user_templates_last', newTemplate)
+
+        // and load it back and apply
+        userTemplateLoadAndApplyAutosaved()
+      } else {
+        local_storage_setDataFromObject('user_templates_last', { 'name': selectedUserTemplate.value.name, 'data': { 'session': session.value, 'params': params.value } })
+      }
+    }
+
+    console.log('Checking for autosaved last used template')
+    userTemplateLoadAndApplyAutosaved()
+
+    /* END: Support for storing prompt templates and parameters in browsers LocalStorage */
+
    const llamaStats = signal(null)
    const controller = signal(null)

@@ -245,8 +414,9 @@

      useEffect(() => {
        // scroll to bottom (if needed)
-        if (container.current && container.current.scrollHeight <= container.current.scrollTop + container.current.offsetHeight + 300) {
-          container.current.scrollTo(0, container.current.scrollHeight)
+        const parent = container.current.parentElement;
+        if (parent && parent.scrollHeight <= parent.scrollTop + parent.offsetHeight + 300) {
+          parent.scrollTo(0, parent.scrollHeight)
        }
      }, [messages])

@@ -264,15 +434,84 @@
      const updateSession = (el) => session.value = { ...session.value, [el.target.name]: el.target.value }
      const updateParams = (el) => params.value = { ...params.value, [el.target.name]: el.target.value }
      const updateParamsFloat = (el) => params.value = { ...params.value, [el.target.name]: parseFloat(el.target.value) }
+      const updateParamsInt = (el) => params.value = { ...params.value, [el.target.name]: Math.floor(parseFloat(el.target.value)) }
+
+      const grammarJsonSchemaPropOrder = signal('')
+      const updateGrammarJsonSchemaPropOrder = (el) => grammarJsonSchemaPropOrder.value = el.target.value
+      const convertJSONSchemaGrammar = () => {
+        try {
+          const schema = JSON.parse(params.value.grammar)
+          const converter = new SchemaConverter(
+            grammarJsonSchemaPropOrder.value
+              .split(',')
+              .reduce((acc, cur, i) => ({...acc, [cur.trim()]: i}), {})
+          )
+          converter.visit(schema, '')
+          params.value = {
+            ...params.value,
+            grammar: converter.formatGrammar(),
+          }
+        } catch (e) {
+          alert(`Convert failed: ${e.message}`)
+        }
+      }
+
+      const FloatField = ({label, max, min, name, step, value}) => {
+        return html`
+          <div>
+            <label for="${name}">${label}</label>
+            <input type="range" id="${name}" min="${min}" max="${max}" step="${step}" name="${name}" value="${value}" oninput=${updateParamsFloat} />
+            <span>${value}</span>
+          </div>
+        `
+      };
+
+      const IntField = ({label, max, min, name, value}) => {
+        return html`
+          <div>
+            <label for="${name}">${label}</label>
+            <input type="range" id="${name}" min="${min}" max="${max}" name="${name}" value="${value}" oninput=${updateParamsInt} />
+            <span>${value}</span>
+          </div>
+        `
+      };
+
+      const userTemplateReset = (e) => {
+        e.preventDefault();
+        userTemplateResetToDefaultAndApply()
+      }
+
+      const UserTemplateResetButton = () => {
+        if (selectedUserTemplate.value.name == 'default') {
+          return html`
+            <button disabled>Using default template</button>
+          `
+        }
+
+        return html`
+          <button onclick=${userTemplateReset}>Reset all to default</button>
+        `
+      };
+
+      useEffect(() => {
+        // autosave template on every change
+        userTemplateAutosave()
+      }, [session.value, params.value])

      return html`
        <form>
+          <fieldset>
+            <${UserTemplateResetButton}/>
+          </fieldset>
+
          <fieldset>
            <div>
              <label for="prompt">Prompt</label>
              <textarea type="text" name="prompt" value="${session.value.prompt}" rows=4 oninput=${updateSession}/>
            </div>
+          </fieldset>

+          <fieldset class="two">
            <div>
              <label for="user">User name</label>
              <input type="text" name="user" value="${session.value.user}" oninput=${updateSession} />
@@ -282,7 +521,9 @@
              <label for="bot">Bot name</label>
              <input type="text" name="char" value="${session.value.char}" oninput=${updateSession} />
            </div>
+          </fieldset>

+          <fieldset>
            <div>
              <label for="template">Prompt template</label>
              <textarea id="template" name="template" value="${session.value.template}" rows=4 oninput=${updateSession}/>
@@ -294,36 +535,49 @@
            </div>

            <div>
-              <label for="temperature">Temperature</label>
-              <input type="range" id="temperature" min="0.0" max="1.0" step="0.01" name="temperature" value="${params.value.temperature}" oninput=${updateParamsFloat} />
-              <span>${params.value.temperature}</span>
+              <label for="template">Grammar</label>
+              <textarea id="grammar" name="grammar" placeholder="Use gbnf or JSON Schema+convert" value="${params.value.grammar}" rows=4 oninput=${updateParams}/>
+              <input type="text" name="prop-order" placeholder="order: prop1,prop2,prop3" oninput=${updateGrammarJsonSchemaPropOrder} />
+              <button type="button" onclick=${convertJSONSchemaGrammar}>Convert JSON Schema</button>
            </div>
-
-            <div>
-              <label for="nPredict">Predictions</label>
-              <input type="range" id="nPredict" min="1" max="2048" step="1" name="n_predict" value="${params.value.n_predict}" oninput=${updateParamsFloat} />
-              <span>${params.value.n_predict}</span>
-            </div>
-
-            <div>
-              <label for="repeat_penalty">Penalize repeat sequence</label>
-              <input type="range" id="repeat_penalty" min="0.0" max="2.0" step="0.01" name="repeat_penalty" value="${params.value.repeat_penalty}" oninput=${updateParamsFloat} />
-              <span>${params.value.repeat_penalty}</span>
-            </div>
-
-            <div>
-              <label for="repeat_last_n">Consider N tokens for penalize</label>
-              <input type="range" id="repeat_last_n" min="0.0" max="2048" name="repeat_last_n" value="${params.value.repeat_last_n}" oninput=${updateParamsFloat} />
-              <span>${params.value.repeat_last_n}</span>
-            </div>
-
          </fieldset>
+
+          <fieldset class="two">
+            ${IntField({label: "Predictions", max: 2048, min: -1, name: "n_predict", value: params.value.n_predict})}
+            ${FloatField({label: "Temperature", max: 1.5, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature})}
+            ${FloatField({label: "Penalize repeat sequence", max: 2.0, min: 0.0, name: "repeat_penalty", step: 0.01, value: params.value.repeat_penalty})}
+            ${IntField({label: "Consider N tokens for penalize", max: 2048, min: 0, name: "repeat_last_n", value: params.value.repeat_last_n})}
+            ${IntField({label: "Top-K sampling", max: 100, min: -1, name: "top_k", value: params.value.top_k})}
+            ${FloatField({label: "Top-P sampling", max: 1.0, min: 0.0, name: "top_p", step: 0.01, value: params.value.top_p})}
+          </fieldset>
+          <details>
+            <summary>More options</summary>
+            <fieldset class="two">
+              ${FloatField({label: "TFS-Z", max: 1.0, min: 0.0, name: "tfs_z", step: 0.01, value: params.value.tfs_z})}
+              ${FloatField({label: "Typical P", max: 1.0, min: 0.0, name: "typical_p", step: 0.01, value: params.value.typical_p})}
+              ${FloatField({label: "Presence penalty", max: 1.0, min: 0.0, name: "presence_penalty", step: 0.01, value: params.value.presence_penalty})}
+              ${FloatField({label: "Frequency penalty", max: 1.0, min: 0.0, name: "frequency_penalty", step: 0.01, value: params.value.frequency_penalty})}
+            </fieldset>
+            <hr />
+            <fieldset class="three">
+              <div>
+                <label><input type="radio" name="mirostat" value="0" checked=${params.value.mirostat == 0} oninput=${updateParamsInt} /> no Mirostat</label>
+                <label><input type="radio" name="mirostat" value="1" checked=${params.value.mirostat == 1} oninput=${updateParamsInt} /> Mirostat v1</label>
+                <label><input type="radio" name="mirostat" value="2" checked=${params.value.mirostat == 2} oninput=${updateParamsInt} /> Mirostat v2</label>
+              </div>
+              ${FloatField({label: "Mirostat tau", max: 10.0, min: 0.0, name: "mirostat_tau", step: 0.01, value: params.value.mirostat_tau})}
+              ${FloatField({label: "Mirostat eta", max: 1.0, min: 0.0, name: "mirostat_eta", step: 0.01, value: params.value.mirostat_eta})}
+            </fieldset>
+          </details>
        </form>
      `
    }
    // poor mans markdown replacement
    const Markdownish = (params) => {
      const md = params.text
+        .replace(/&/g, '&amp;')
+        .replace(/</g, '&lt;')
+        .replace(/>/g, '&gt;')
        .replace(/^#{1,6} (.*)$/gim, '<h3>$1</h3>')
        .replace(/\*\*(.*?)\*\*/g, '<strong>$1</strong>')
        .replace(/__(.*?)__/g, '<strong>$1</strong>')
--- a/examples/server/public/index.js
+++ b/examples/server/public/index.js
--- a/examples/server/public/json-schema-to-grammar.mjs
+++ b/examples/server/public/json-schema-to-grammar.mjs
@@ -0,0 +1,112 @@
+const SPACE_RULE = '" "?';
+
+const PRIMITIVE_RULES = {
+  boolean: '("true" | "false") space',
+  number: '("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space',
+  integer: '("-"? ([0-9] | [1-9] [0-9]*)) space',
+  string: ` "\\"" (
+        [^"\\\\] |
+        "\\\\" (["\\\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
+      )* "\\"" space`,
+  null: '"null" space',
+};
+
+const INVALID_RULE_CHARS_RE = /[^\dA-Za-z-]+/g;
+const GRAMMAR_LITERAL_ESCAPE_RE = /[\n\r"]/g;
+const GRAMMAR_LITERAL_ESCAPES = {'\r': '\\r', '\n': '\\n', '"': '\\"'};
+
+export class SchemaConverter {
+  constructor(propOrder) {
+    this._propOrder = propOrder || {};
+    this._rules = new Map();
+    this._rules.set('space', SPACE_RULE);
+  }
+
+  _formatLiteral(literal) {
+    const escaped = JSON.stringify(literal).replace(
+      GRAMMAR_LITERAL_ESCAPE_RE,
+      m => GRAMMAR_LITERAL_ESCAPES[m]
+    );
+    return `"${escaped}"`;
+  }
+
+  _addRule(name, rule) {
+    let escName = name.replace(INVALID_RULE_CHARS_RE, '-');
+    let key = escName;
+
+    if (this._rules.has(escName)) {
+      if (this._rules.get(escName) === rule) {
+        return key;
+      }
+
+      let i = 0;
+      while (this._rules.has(`${escName}${i}`)) {
+        i += 1;
+      }
+      key = `${escName}${i}`;
+    }
+
+    this._rules.set(key, rule);
+    return key;
+  }
+
+  visit(schema, name) {
+    const schemaType = schema.type;
+    const ruleName = name || 'root';
+
+    if (schema.oneOf || schema.anyOf) {
+      const rule = (schema.oneOf || schema.anyOf).map((altSchema, i) =>
+        this.visit(altSchema, `${name}${name ? "-" : ""}${i}`)
+      ).join(' | ');
+
+      return this._addRule(ruleName, rule);
+    } else if ('const' in schema) {
+      return this._addRule(ruleName, this._formatLiteral(schema.const));
+    } else if ('enum' in schema) {
+      const rule = schema.enum.map(v => this._formatLiteral(v)).join(' | ');
+      return this._addRule(ruleName, rule);
+    } else if (schemaType === 'object' && 'properties' in schema) {
+      // TODO: `required` keyword (from python implementation)
+      const propOrder = this._propOrder;
+      const propPairs = Object.entries(schema.properties).sort((a, b) => {
+        // sort by position in prop_order (if specified) then by key
+        const orderA = typeof propOrder[a[0]] === 'number' ? propOrder[a[0]] : Infinity;
+        const orderB = typeof propOrder[b[0]] === 'number' ? propOrder[b[0]] : Infinity;
+        return orderA - orderB || a[0].localeCompare(b[0]);
+      });
+
+      let rule = '"{" space';
+      propPairs.forEach(([propName, propSchema], i) => {
+        const propRuleName = this.visit(propSchema, `${name}${name ? "-" : ""}${propName}`);
+        if (i > 0) {
+          rule += ' "," space';
+        }
+        rule += ` ${this._formatLiteral(propName)} space ":" space ${propRuleName}`;
+      });
+      rule += ' "}" space';
+
+      return this._addRule(ruleName, rule);
+    } else if (schemaType === 'array' && 'items' in schema) {
+      // TODO `prefixItems` keyword (from python implementation)
+      const itemRuleName = this.visit(schema.items, `${name}${name ? "-" : ""}item`);
+      const rule = `"[" space (${itemRuleName} ("," space ${itemRuleName})*)? "]" space`;
+      return this._addRule(ruleName, rule);
+    } else {
+      if (!PRIMITIVE_RULES[schemaType]) {
+        throw new Error(`Unrecognized schema: ${JSON.stringify(schema)}`);
+      }
+      return this._addRule(
+        ruleName === 'root' ? 'root' : schemaType,
+        PRIMITIVE_RULES[schemaType]
+      );
+    }
+  }
+
+  formatGrammar() {
+    let grammar = '';
+    this._rules.forEach((rule, name) => {
+      grammar += `${name} ::= ${rule}\n`;
+    });
+    return grammar;
+  }
+}
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1,6 +1,7 @@
 #include "common.h"
 #include "llama.h"
 #include "build-info.h"
+#include "grammar-parser.h"

 #ifndef NDEBUG
 // crash the server in debug mode, otherwise send an http 500 error
@@ -14,6 +15,7 @@
 #include "index.html.hpp"
 #include "index.js.hpp"
 #include "completion.js.hpp"
+#include "json-schema-to-grammar.mjs.hpp"

 #ifndef SERVER_VERBOSE
 #define SERVER_VERBOSE 1
@@ -188,6 +190,7 @@ struct llama_server_context
    size_t n_past = 0;
    size_t n_remain = 0;

+    json prompt;
    std::vector<llama_token> embd;
    std::vector<llama_token> last_n_tokens;

@@ -195,6 +198,9 @@ struct llama_server_context
    llama_context *ctx = nullptr;
    gpt_params params;

+    grammar_parser::parse_state parsed_grammar;
+    llama_grammar *grammar = nullptr;
+
    bool truncated = false;
    bool stopped_eos = false;
    bool stopped_word = false;
@@ -226,6 +232,7 @@ struct llama_server_context
    void rewind()
    {
        params.antiprompt.clear();
+        params.grammar.clear();
        num_prompt_tokens = 0;
        num_tokens_predicted = 0;
        generated_text = "";
@@ -237,9 +244,13 @@ struct llama_server_context
        stopped_limit = false;
        stopping_word = "";
        multibyte_pending = 0;
-
        n_remain = 0;
        n_past = 0;
+
+        if (grammar != nullptr) {
+            llama_grammar_free(grammar);
+            grammar = nullptr;
+        }
    }

    bool loadModel(const gpt_params &params_)
@@ -257,10 +268,82 @@ struct llama_server_context
        return true;
    }

+    std::vector<llama_token> tokenize(json json_prompt, bool add_bos)
+    {
+        // If `add_bos` is true, we only add BOS, when json_prompt is a string,
+        // or the first element of the json_prompt array is a string.
+        std::vector<llama_token> prompt_tokens;
+
+        if (json_prompt.is_array())
+        {
+            bool first = true;
+            for (const auto& p : json_prompt)
+            {
+                if (p.is_string())
+                {
+                    auto s = p.template get<std::string>();
+                    std::vector<llama_token> p;
+                    if (first)
+                    {
+                        s.insert(0, 1, ' '); // add a space if it's the first
+                        p = ::llama_tokenize(ctx, s, add_bos);
+                        first = false;
+                    }
+                    else
+                    {
+                        p = ::llama_tokenize(ctx, s, false);
+                    }
+                    prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
+                }
+                else
+                {
+                    if (first)
+                    {
+                        first = false;
+                    }
+                    prompt_tokens.push_back(p.template get<llama_token>());
+                }
+            }
+        }
+        else
+        {
+            auto s = json_prompt.template get<std::string>();
+            s.insert(0, 1, ' '); // always add a first space
+            prompt_tokens = ::llama_tokenize(ctx, s, add_bos);
+        }
+
+        return prompt_tokens;
+    }
+
+    bool loadGrammar()
+    {
+        if (!params.grammar.empty()) {
+            parsed_grammar = grammar_parser::parse(params.grammar.c_str());
+            // will be empty (default) if there are parse errors
+            if (parsed_grammar.rules.empty()) {
+                LOG_ERROR("grammar parse error", {{"grammar", params.grammar}});
+                return false;
+            }
+            grammar_parser::print_grammar(stderr, parsed_grammar);
+
+            {
+                auto it = params.logit_bias.find(llama_token_eos(ctx));
+                if (it != params.logit_bias.end() && it->second == -INFINITY) {
+                    LOG_WARNING("EOS token is disabled, which will cause most grammars to fail", {});
+                }
+            }
+
+            std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
+            grammar = llama_grammar_init(
+                grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
+        }
+        return true;
+    }
+
    void loadPrompt()
    {
-        params.prompt.insert(0, 1, ' '); // always add a first space
-        std::vector<llama_token> prompt_tokens = ::llama_tokenize(ctx, params.prompt, true);
+        auto prompt_tokens = tokenize(prompt, true);  // always add BOS
+
        num_prompt_tokens = prompt_tokens.size();

        if (params.n_keep < 0)
@@ -367,7 +450,7 @@ struct llama_server_context
        if (params.n_predict == 0)
        {
            has_next_token = false;
-            result.tok = llama_token_eos();
+            result.tok = llama_token_eos(ctx);
            return result;
        }

@@ -407,7 +490,7 @@ struct llama_server_context
            llama_token_data_array candidates_p = {candidates.data(), candidates.size(), false};

            // Apply penalties
-            float nl_logit = logits[llama_token_nl()];
+            float nl_logit = logits[llama_token_nl(ctx)];
            auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), params.n_ctx);
            llama_sample_repetition_penalty(ctx, &candidates_p,
                                            last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
@@ -417,7 +500,11 @@ struct llama_server_context
                                                          last_n_repeat, alpha_frequency, alpha_presence);
            if (!penalize_nl)
            {
-                logits[llama_token_nl()] = nl_logit;
+                logits[llama_token_nl(ctx)] = nl_logit;
+            }
+
+            if (grammar != nullptr) {
+                llama_sample_grammar(ctx, &candidates_p, grammar);
            }

            if (temp <= 0)
@@ -457,10 +544,15 @@ struct llama_server_context
                }
            }

+            if (grammar != nullptr) {
+                llama_grammar_accept_token(ctx, grammar, result.tok);
+            }
+
            for (size_t i = 0; i < std::min(candidates_p.size, (size_t)n_probs); ++i)
            {
                result.probs.push_back({candidates_p.data[i].id, candidates_p.data[i].p});
            }
+
            last_n_tokens.erase(last_n_tokens.begin());
            last_n_tokens.push_back(result.tok);
            num_tokens_predicted++;
@@ -471,7 +563,7 @@ struct llama_server_context
        // decrement remaining sampling budget
        --n_remain;

-        if (!embd.empty() && embd.back() == llama_token_eos())
+        if (!embd.empty() && embd.back() == llama_token_eos(ctx))
        {
            // stopping_word = llama_token_to_str(ctx, embd.back());
            has_next_token = false;
@@ -601,47 +693,50 @@ struct llama_server_context
 static void server_print_usage(const char *argv0, const gpt_params &params,
                               const server_params &sparams)
 {
-    fprintf(stderr, "usage: %s [options]\n", argv0);
-    fprintf(stderr, "\n");
-    fprintf(stderr, "options:\n");
-    fprintf(stderr, "  -h, --help            show this help message and exit\n");
-    fprintf(stderr, "  -v, --verbose         verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
-    fprintf(stderr, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
-    fprintf(stderr, "  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
-    fprintf(stderr, "  --rope-freq-base N    RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
-    fprintf(stderr, "  --rope-freq-scale N   RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
-    fprintf(stderr, "  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
-    fprintf(stderr, "  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
-    fprintf(stderr, "                        not recommended: doubles context memory required and no measurable increase in quality\n");
+    fprintf(stdout, "usage: %s [options]\n", argv0);
+    fprintf(stdout, "\n");
+    fprintf(stdout, "options:\n");
+    fprintf(stdout, "  -h, --help            show this help message and exit\n");
+    fprintf(stdout, "  -v, --verbose         verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
+    fprintf(stdout, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
+    fprintf(stdout, "  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
+    fprintf(stdout, "  --rope-freq-base N    RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
+    fprintf(stdout, "  --rope-freq-scale N   RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
+    fprintf(stdout, "  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
+    fprintf(stdout, "  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
+    fprintf(stdout, "                        not recommended: doubles context memory required and no measurable increase in quality\n");
    if (llama_mlock_supported())
    {
-        fprintf(stderr, "  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
+        fprintf(stdout, "  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
    }
    if (llama_mmap_supported())
    {
-        fprintf(stderr, "  --no-mmap             do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
+        fprintf(stdout, "  --no-mmap             do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
    }
+    fprintf(stdout, "  --numa                attempt optimizations that help on some NUMA systems\n");
 #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
-    fprintf(stderr, "  -ngl N, --n-gpu-layers N\n");
-    fprintf(stderr, "                        number of layers to store in VRAM\n");
-    fprintf(stderr, "  -ts SPLIT --tensor-split SPLIT\n");
-    fprintf(stderr, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
-    fprintf(stderr, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
-    fprintf(stderr, "  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n");
-    fprintf(stderr, "  -lv, --low-vram don't allocate VRAM scratch buffer\n");
+    fprintf(stdout, "  -ngl N, --n-gpu-layers N\n");
+    fprintf(stdout, "                        number of layers to store in VRAM\n");
+    fprintf(stdout, "  -ts SPLIT --tensor-split SPLIT\n");
+    fprintf(stdout, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
+    fprintf(stdout, "  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n");
+    fprintf(stdout, "  -lv, --low-vram don't allocate VRAM scratch buffer\n");
+    fprintf(stdout, "  -nommq, --no-mul-mat-q\n");
+    fprintf(stdout, "                        use cuBLAS instead of custom mul_mat_q CUDA kernels.\n");
+    fprintf(stdout, "                        Not recommended since this is both slower and uses more VRAM.\n");
 #endif
-    fprintf(stderr, "  -m FNAME, --model FNAME\n");
-    fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
-    fprintf(stderr, "  -a ALIAS, --alias ALIAS\n");
-    fprintf(stderr, "                        set an alias for the model, will be added as `model` field in completion response\n");
-    fprintf(stderr, "  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");
-    fprintf(stderr, "  --lora-base FNAME     optional model to use as a base for the layers modified by the LoRA adapter\n");
-    fprintf(stderr, "  --host                ip address to listen (default  (default: %s)\n", sparams.hostname.c_str());
-    fprintf(stderr, "  --port PORT           port to listen (default  (default: %d)\n", sparams.port);
-    fprintf(stderr, "  --path PUBLIC_PATH    path from which to serve static files (default %s)\n", sparams.public_path.c_str());
-    fprintf(stderr, "  -to N, --timeout N    server read/write timeout in seconds (default: %d)\n", sparams.read_timeout);
-    fprintf(stderr, "  --embedding           enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled");
-    fprintf(stderr, "\n");
+    fprintf(stdout, "  -m FNAME, --model FNAME\n");
+    fprintf(stdout, "                        model path (default: %s)\n", params.model.c_str());
+    fprintf(stdout, "  -a ALIAS, --alias ALIAS\n");
+    fprintf(stdout, "                        set an alias for the model, will be added as `model` field in completion response\n");
+    fprintf(stdout, "  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");
+    fprintf(stdout, "  --lora-base FNAME     optional model to use as a base for the layers modified by the LoRA adapter\n");
+    fprintf(stdout, "  --host                ip address to listen (default  (default: %s)\n", sparams.hostname.c_str());
+    fprintf(stdout, "  --port PORT           port to listen (default  (default: %d)\n", sparams.port);
+    fprintf(stdout, "  --path PUBLIC_PATH    path from which to serve static files (default %s)\n", sparams.public_path.c_str());
+    fprintf(stdout, "  -to N, --timeout N    server read/write timeout in seconds (default: %d)\n", sparams.read_timeout);
+    fprintf(stdout, "  --embedding           enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled");
+    fprintf(stdout, "\n");
 }

 static void server_params_parse(int argc, char **argv, server_params &sparams,
@@ -726,7 +821,8 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
        }
        else if (arg == "--rope-freq-base")
        {
-            if (++i >= argc) {
+            if (++i >= argc)
+            {
                invalid_param = true;
                break;
            }
@@ -734,7 +830,8 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
        }
        else if (arg == "--rope-freq-scale")
        {
-            if (++i >= argc) {
+            if (++i >= argc)
+            {
                invalid_param = true;
                break;
            }
@@ -806,7 +903,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
                }
            }
 #else
-            LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.", {});
+            LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n", {});
 #endif // GGML_USE_CUBLAS
        }
        else if (arg == "--low-vram" || arg == "-lv")
@@ -814,7 +911,15 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
 #ifdef GGML_USE_CUBLAS
            params.low_vram = true;
 #else
-            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n");
+            LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n", {});
+#endif // GGML_USE_CUBLAS
+        }
+        else if (arg == "--no-mul-mat-q" || arg == "-nommq")
+        {
+#ifdef GGML_USE_CUBLAS
+            params.mul_mat_q = false;
+#else
+            LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n", {});
 #endif // GGML_USE_CUBLAS
        }
        else if (arg == "--main-gpu" || arg == "-mg")
@@ -865,6 +970,10 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
        {
            params.use_mmap = false;
        }
+        else if (arg == "--numa")
+        {
+            params.numa = true;
+        }
        else if (arg == "--embedding")
        {
            params.embedding = true;
@@ -887,7 +996,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,

 static json format_generation_settings(llama_server_context &llama)
 {
-    const auto eos_bias = llama.params.logit_bias.find(llama_token_eos());
+    const auto eos_bias = llama.params.logit_bias.find(llama_token_eos(llama.ctx));
    const bool ignore_eos = eos_bias != llama.params.logit_bias.end() &&
                            eos_bias->second < 0.0f && std::isinf(eos_bias->second);

@@ -915,6 +1024,7 @@ static json format_generation_settings(llama_server_context &llama)
        {"stream", llama.stream},
        {"logit_bias", llama.params.logit_bias},
        {"n_probs", llama.params.n_probs},
+        {"grammar", llama.params.grammar},
    };
 }

@@ -932,7 +1042,7 @@ static json format_timings(llama_server_context &llama)
    assert(timings.n_eval == llama.num_tokens_predicted);

    return json{
-        {"prompt_n", timings.n_eval},
+        {"prompt_n", timings.n_p_eval},
        {"prompt_ms", timings.t_p_eval_ms},
        {"prompt_per_token_ms", timings.t_p_eval_ms / timings.n_p_eval},
        {"prompt_per_second", 1e3 / timings.t_p_eval_ms * timings.n_p_eval},
@@ -954,14 +1064,13 @@ static json format_final_response(llama_server_context &llama, const std::string
        {"tokens_predicted", llama.num_tokens_predicted},
        {"tokens_evaluated", llama.num_prompt_tokens},
        {"generation_settings", format_generation_settings(llama)},
-        {"prompt", llama.params.prompt},
+        {"prompt", llama.prompt},
        {"truncated", llama.truncated},
        {"stopped_eos", llama.stopped_eos},
        {"stopped_word", llama.stopped_word},
        {"stopped_limit", llama.stopped_limit},
        {"stopping_word", llama.stopping_word},
        {"tokens_cached", llama.n_past},
-        {"tokens_predicted", llama.num_tokens_predicted},
        {"timings", format_timings(llama)},
    };

@@ -994,34 +1103,52 @@ static json format_tokenizer_response(const std::vector<llama_token> &tokens)
        {"tokens", tokens}};
 }

+template <typename T>
+static T json_value(const json &body, const std::string &key, const T &default_value)
+{
+    // Fallback null to default value
+    return body.contains(key) && !body.at(key).is_null()
+        ? body.value(key, default_value)
+        : default_value;
+}
+
 static void parse_options_completion(const json &body, llama_server_context &llama)
 {
    gpt_params default_params;

-    llama.stream = body.value("stream", false);
-    llama.params.n_predict = body.value("n_predict", default_params.n_predict);
-    llama.params.top_k = body.value("top_k", default_params.top_k);
-    llama.params.top_p = body.value("top_p", default_params.top_p);
-    llama.params.tfs_z = body.value("tfs_z", default_params.tfs_z);
-    llama.params.typical_p = body.value("typical_p", default_params.typical_p);
-    llama.params.repeat_last_n = body.value("repeat_last_n", default_params.repeat_last_n);
-    llama.params.temp = body.value("temperature", default_params.temp);
-    llama.params.repeat_penalty = body.value("repeat_penalty", default_params.repeat_penalty);
-    llama.params.presence_penalty = body.value("presence_penalty", default_params.presence_penalty);
-    llama.params.frequency_penalty = body.value("frequency_penalty", default_params.frequency_penalty);
-    llama.params.mirostat = body.value("mirostat", default_params.mirostat);
-    llama.params.mirostat_tau = body.value("mirostat_tau", default_params.mirostat_tau);
-    llama.params.mirostat_eta = body.value("mirostat_eta", default_params.mirostat_eta);
-    llama.params.penalize_nl = body.value("penalize_nl", default_params.penalize_nl);
-    llama.params.n_keep = body.value("n_keep", default_params.n_keep);
-    llama.params.seed = body.value("seed", default_params.seed);
-    llama.params.prompt = body.value("prompt", default_params.prompt);
-    llama.params.n_probs = body.value("n_probs", default_params.n_probs);
+    llama.stream = json_value(body, "stream", false);
+    llama.params.n_predict = json_value(body, "n_predict", default_params.n_predict);
+    llama.params.top_k = json_value(body, "top_k", default_params.top_k);
+    llama.params.top_p = json_value(body, "top_p", default_params.top_p);
+    llama.params.tfs_z = json_value(body, "tfs_z", default_params.tfs_z);
+    llama.params.typical_p = json_value(body, "typical_p", default_params.typical_p);
+    llama.params.repeat_last_n = json_value(body, "repeat_last_n", default_params.repeat_last_n);
+    llama.params.temp = json_value(body, "temperature", default_params.temp);
+    llama.params.repeat_penalty = json_value(body, "repeat_penalty", default_params.repeat_penalty);
+    llama.params.presence_penalty = json_value(body, "presence_penalty", default_params.presence_penalty);
+    llama.params.frequency_penalty = json_value(body, "frequency_penalty", default_params.frequency_penalty);
+    llama.params.mirostat = json_value(body, "mirostat", default_params.mirostat);
+    llama.params.mirostat_tau = json_value(body, "mirostat_tau", default_params.mirostat_tau);
+    llama.params.mirostat_eta = json_value(body, "mirostat_eta", default_params.mirostat_eta);
+    llama.params.penalize_nl = json_value(body, "penalize_nl", default_params.penalize_nl);
+    llama.params.n_keep = json_value(body, "n_keep", default_params.n_keep);
+    llama.params.seed = json_value(body, "seed", default_params.seed);
+    llama.params.grammar = json_value(body, "grammar", default_params.grammar);
+    llama.params.n_probs = json_value(body, "n_probs", default_params.n_probs);
+
+    if (body.count("prompt") != 0)
+    {
+        llama.prompt = body["prompt"];
+    }
+    else
+    {
+        llama.prompt = "";
+    }

    llama.params.logit_bias.clear();
-    if (body.value("ignore_eos", false))
+    if (json_value(body, "ignore_eos", false))
    {
-        llama.params.logit_bias[llama_token_eos()] = -INFINITY;
+        llama.params.logit_bias[llama_token_eos(llama.ctx)] = -INFINITY;
    }

    const auto &logit_bias = body.find("logit_bias");
@@ -1137,6 +1264,12 @@ int main(int argc, char **argv)
        res.set_content(reinterpret_cast<const char*>(&completion_js), completion_js_len, "application/javascript");
        return false; });

+    // this is only called if no index.html is found in the public --path
+    svr.Get("/json-schema-to-grammar.mjs", [](const Request &, Response &res)
+            {
+        res.set_content(reinterpret_cast<const char*>(&json_schema_to_grammar_mjs), json_schema_to_grammar_mjs_len, "application/javascript");
+        return false; });
+
    svr.Post("/completion", [&llama](const Request &req, Response &res)
             {
        auto lock = llama.lock();
@@ -1147,6 +1280,12 @@ int main(int argc, char **argv)

        parse_options_completion(json::parse(req.body), llama);

+        if (!llama.loadGrammar())
+        {
+            res.status = 400;
+            return;
+        }
+
        llama.loadPrompt();
        llama.beginCompletion();

@@ -1242,7 +1381,11 @@ int main(int argc, char **argv)
                sink.done();
                return true;
            };
-            res.set_chunked_content_provider("text/event-stream", chunked_content_provider);
+            const auto on_complete = [&](bool) {
+                llama.mutex.unlock();
+            };
+            lock.release();
+            res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete);
        } });

    svr.Get("/model.json", [&llama](const Request &, Response &res)
@@ -1258,8 +1401,11 @@ int main(int argc, char **argv)
        auto lock = llama.lock();

        const json body = json::parse(req.body);
-        const std::string content = body.value("content", "");
-        const std::vector<llama_token> tokens = llama_tokenize(llama.ctx, content, false);
+        std::vector<llama_token> tokens;
+        if (body.count("content") != 0)
+        {
+            tokens = llama.tokenize(body["content"], false);
+        }
        const json data = format_tokenizer_response(tokens);
        return res.set_content(data.dump(), "application/json"); });

@@ -1271,7 +1417,14 @@ int main(int argc, char **argv)

        llama.rewind();
        llama_reset_timings(llama.ctx);
-        llama.params.prompt = body.value("content", "");
+        if (body.count("content") != 0)
+        {
+            llama.prompt = body["content"];
+        }
+        else
+        {
+            llama.prompt = "";
+        }
        llama.params.n_predict = 0;
        llama.loadPrompt();
        llama.beginCompletion();
@@ -1298,8 +1451,12 @@ int main(int argc, char **argv)

    svr.set_error_handler([](const Request &, Response &res)
                          {
-        res.set_content("File Not Found", "text/plain");
-        res.status = 404; });
+        if (res.status == 400) {
+            res.set_content("Invalid request", "text/plain");
+        } else if (res.status != 500) {
+            res.set_content("File Not Found", "text/plain");
+            res.status = 404;
+        } });

    // set timeouts and change hostname and port
    svr.set_read_timeout(sparams.read_timeout);
@@ -1327,6 +1484,9 @@ int main(int argc, char **argv)
        return 1;
    }

+    if (llama.grammar != nullptr) {
+        llama_grammar_free(llama.grammar);
+    }
    llama_backend_free();

    return 0;
--- a/examples/simple/CMakeLists.txt
+++ b/examples/simple/CMakeLists.txt
@@ -1,5 +1,6 @@
 set(TARGET simple)
 add_executable(${TARGET} simple.cpp)
+install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if(TARGET BUILD_INFO)
--- a/examples/simple/simple.cpp
+++ b/examples/simple/simple.cpp
@@ -2,180 +2,129 @@
 #define _GNU_SOURCE
 #endif

-#include "common.h"
-#include "llama.h"
 #include "build-info.h"

-#include <cassert>
-#include <cinttypes>
+#include "common.h"
+#include "llama.h"
+
 #include <cmath>
 #include <cstdio>
-#include <cstring>
-#include <ctime>
-#include <fstream>
-#include <iostream>
 #include <string>
 #include <vector>

-#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
-#include <signal.h>
-#include <unistd.h>
-#elif defined (_WIN32)
-#define WIN32_LEAN_AND_MEAN
-#define NOMINMAX
-#include <windows.h>
-#include <signal.h>
-#endif
-
-
-
-int main(int argc, char ** argv)
-{
+int main(int argc, char ** argv) {
    gpt_params params;

-    //---------------------------------
-    // Print help :
-    //---------------------------------
-
-    if ( argc == 1 || argv[1][0] == '-' )
-    {
-        printf( "usage: %s MODEL_PATH [PROMPT]\n" , argv[0] );
+    if (argc == 1 || argv[1][0] == '-') {
+        printf("usage: %s MODEL_PATH [PROMPT]\n" , argv[0]);
        return 1 ;
    }

-    //---------------------------------
-    // Load parameters :
-    //---------------------------------
-
-    if ( argc >= 2 )
-    {
+    if (argc >= 2) {
        params.model = argv[1];
    }

-    if ( argc >= 3 )
-    {
+    if (argc >= 3) {
        params.prompt = argv[2];
    }

-    if ( params.prompt.empty() )
-    {
+    if (params.prompt.empty()) {
        params.prompt = "Hello my name is";
    }

-    //---------------------------------
-    // Init LLM :
-    //---------------------------------
+    // init LLM

    llama_backend_init(params.numa);

-    llama_model * model;
-    llama_context * ctx;
+    llama_context_params ctx_params = llama_context_default_params();

-    std::tie(model, ctx) = llama_init_from_gpt_params( params );
+    llama_model * model = llama_load_model_from_file(params.model.c_str(), ctx_params);

-    if ( model == NULL )
-    {
-        fprintf( stderr , "%s: error: unable to load model\n" , __func__ );
+    if (model == NULL) {
+        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
        return 1;
    }

-    //---------------------------------
-    // Tokenize the prompt :
-    //---------------------------------
+    llama_context * ctx = llama_new_context_with_model(model, ctx_params);
+
+    // tokenize the prompt

    std::vector<llama_token> tokens_list;
-    tokens_list = ::llama_tokenize( ctx , params.prompt , true );
+    tokens_list = ::llama_tokenize(ctx, params.prompt, true);

-    const int max_context_size     = llama_n_ctx( ctx );
-    const int max_tokens_list_size = max_context_size - 4 ;
+    const int max_context_size     = llama_n_ctx(ctx);
+    const int max_tokens_list_size = max_context_size - 4;

-    if ( (int)tokens_list.size() > max_tokens_list_size )
-    {
-        fprintf( stderr , "%s: error: prompt too long (%d tokens, max %d)\n" ,
-             __func__ , (int)tokens_list.size() , max_tokens_list_size );
+    if ((int) tokens_list.size() > max_tokens_list_size) {
+        fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) tokens_list.size(), max_tokens_list_size);
        return 1;
    }

-    fprintf( stderr, "\n\n" );
+    fprintf(stderr, "\n\n");

-    // Print the tokens from the prompt :
-
-    for( auto id : tokens_list )
-    {
-        printf( "%s" , llama_token_to_str( ctx , id ) );
+    for (auto id : tokens_list) {
+        fprintf(stderr, "%s", llama_token_to_str(ctx, id).c_str());
    }

-    fflush(stdout);
+    fflush(stderr);

-
-    //---------------------------------
-    // Main prediction loop :
-    //---------------------------------
+    // main loop

    // The LLM keeps a contextual cache memory of previous token evaluation.
    // Usually, once this cache is full, it is required to recompute a compressed context based on previous
    // tokens (see "infinite text generation via context swapping" in the main example), but in this minimalist
    // example, we will just stop the loop once this cache is full or once an end of stream is detected.

-    while ( llama_get_kv_cache_token_count( ctx ) < max_context_size )
-    {
-        //---------------------------------
-        // Evaluate the tokens :
-        //---------------------------------
+    const int n_gen = std::min(32, max_context_size);

-        if ( llama_eval( ctx , tokens_list.data() , tokens_list.size() , llama_get_kv_cache_token_count( ctx ) , params.n_threads ) )
-        {
-            fprintf( stderr,  "%s : failed to eval\n" , __func__ );
+    while (llama_get_kv_cache_token_count(ctx) < n_gen) {
+        // evaluate the transformer
+
+        if (llama_eval(ctx, tokens_list.data(), int(tokens_list.size()), llama_get_kv_cache_token_count(ctx), params.n_threads)) {
+            fprintf(stderr, "%s : failed to eval\n", __func__);
            return 1;
        }

        tokens_list.clear();

-        //---------------------------------
-        // Select the best prediction :
-        //---------------------------------
+        // sample the next token

        llama_token new_token_id = 0;

-        auto logits  = llama_get_logits( ctx );
-        auto n_vocab = llama_n_vocab( ctx ); // the size of the LLM vocabulary (in tokens)
+        auto logits  = llama_get_logits(ctx);
+        auto n_vocab = llama_n_vocab(ctx);

        std::vector<llama_token_data> candidates;
-        candidates.reserve( n_vocab );
+        candidates.reserve(n_vocab);

-        for( llama_token token_id = 0 ; token_id < n_vocab ; token_id++ )
-        {
-            candidates.emplace_back( llama_token_data{ token_id , logits[ token_id ] , 0.0f } );
+        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+            candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
        }

        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };

-        // Select it using the "Greedy sampling" method :
-        new_token_id = llama_sample_token_greedy( ctx , &candidates_p );
-
+        new_token_id = llama_sample_token_greedy(ctx , &candidates_p);

        // is it an end of stream ?
-        if ( new_token_id == llama_token_eos() )
-        {
+        if (new_token_id == llama_token_eos(ctx)) {
            fprintf(stderr, " [end of text]\n");
            break;
        }

-        // Print the new token :
-        printf( "%s" , llama_token_to_str( ctx , new_token_id ) );
-        fflush( stdout );
+        // print the new token :
+        printf("%s", llama_token_to_str(ctx, new_token_id).c_str());
+        fflush(stdout);

-        // Push this new token for next evaluation :
-        tokens_list.push_back( new_token_id );
+        // push this new token for next evaluation
+        tokens_list.push_back(new_token_id);
+    }

-    } // wend of main loop
-
-    llama_free( ctx );
-    llama_free_model( model );
+    llama_free(ctx);
+    llama_free_model(model);

    llama_backend_free();

+    fprintf(stderr, "\n\n");
+
    return 0;
 }
-
-// EOF
--- a/examples/train-text-from-scratch/CMakeLists.txt
+++ b/examples/train-text-from-scratch/CMakeLists.txt
@@ -1,4 +1,5 @@
 set(TARGET train-text-from-scratch)
 add_executable(${TARGET} train-text-from-scratch.cpp)
+install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
@@ -1,4 +1,5 @@
 #include "ggml.h"
+#include "common.h"
 #include "llama.h"
 #include <unordered_map>
 #include <vector>
@@ -16,6 +17,8 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif

+static const float rms_norm_eps = 1e-5f;
+
 struct random_normal_distribution {
    std::mt19937 gen;
    std::normal_distribution<float> rd;
@@ -167,14 +170,16 @@ struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struc
 struct llama_vocab {
    using id    = int32_t;
    using token = std::string;
+    using ttype = llama_token_type;

-    struct token_score {
-        token tok;
+    struct token_data {
+        token text;
        float score;
+        ttype type;
    };

    std::unordered_map<token, id> token_to_id;
-    std::vector<token_score> id_to_token;
+    std::vector<token_data> id_to_token;
 };

 struct my_llama_hparams {
@@ -439,7 +444,7 @@ struct ggml_tensor * forward(
        // norm
        {
            // cur shape [n_embd,N,1,1]
-            cur = ggml_rms_norm(ctx0, inpL);
+            cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);

            // cur = attention_norm*cur
            cur = ggml_mul(ctx0,
@@ -562,7 +567,7 @@ struct ggml_tensor * forward(
            // norm
            {
                // cur shape [n_embd,N,1,1]
-                cur = ggml_rms_norm(ctx0, inpFF);
+                cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);

                // cur = ffn_norm*cur
                // cur shape [n_embd,N,1,1]
@@ -606,7 +611,7 @@ struct ggml_tensor * forward(
    {

        // inpL shape [n_embd,N,1,1]
-        inpL = ggml_rms_norm(ctx0, inpL);
+        inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps);

        // inpL = norm*inpL
        // inpL shape [n_embd,N,1,1]
@@ -694,7 +699,7 @@ struct ggml_tensor * forward_batch(
        // norm
        {
            // cur shape [n_embd,N*n_batch,1,1]
-            cur = ggml_rms_norm(ctx0, inpL);
+            cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
            assert_shape_2d(cur, n_embd, N*n_batch);

            // cur = attention_norm*cur
@@ -857,7 +862,7 @@ struct ggml_tensor * forward_batch(
            // norm
            {
                // cur shape [n_embd,N*n_batch,1,1]
-                cur = ggml_rms_norm(ctx0, inpFF);
+                cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);
                assert_shape_2d(cur, n_embd, N*n_batch);

                // cur = ffn_norm*cur
@@ -910,7 +915,7 @@ struct ggml_tensor * forward_batch(
    {

        // inpL shape [n_embd,N*n_batch,1,1]
-        inpL = ggml_rms_norm(ctx0, inpL);
+        inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
        assert_shape_2d(inpL, n_embd, N*n_batch);

        // inpL = norm*inpL
@@ -979,7 +984,7 @@ struct ggml_tensor * forward_batch_wo_cache(
        // norm
        {
            // cur shape [n_embd,N*n_batch,1,1]
-            cur = ggml_rms_norm(ctx0, inpL);
+            cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
            assert_shape_2d(cur, n_embd, N*n_batch);

            // cur = attention_norm*cur
@@ -1085,7 +1090,7 @@ struct ggml_tensor * forward_batch_wo_cache(
            // norm
            {
                // cur shape [n_embd,N*n_batch,1,1]
-                cur = ggml_rms_norm(ctx0, inpFF);
+                cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);
                assert_shape_2d(cur, n_embd, N*n_batch);

                // cur = ffn_norm*cur
@@ -1138,7 +1143,7 @@ struct ggml_tensor * forward_batch_wo_cache(
    {

        // inpL shape [n_embd,N*n_batch,1,1]
-        inpL = ggml_rms_norm(ctx0, inpL);
+        inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
        assert_shape_2d(inpL, n_embd, N*n_batch);

        // inpL = norm*inpL
@@ -1203,7 +1208,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn(

        // norm
        {
-            cur = ggml_rms_norm(ctx0, inpL);
+            cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
            assert_shape_2d(cur, n_embd, N*n_batch);

            // cur = attention_norm*cur
@@ -1267,7 +1272,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn(
        {
            // norm
            {
-                cur = ggml_rms_norm(ctx0, inpFF);
+                cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps);
                assert_shape_2d(cur, n_embd, N*n_batch);

                // cur = ffn_norm*cur
@@ -1311,7 +1316,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn(
    // norm
    {

-        inpL = ggml_rms_norm(ctx0, inpL);
+        inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
        assert_shape_2d(inpL, n_embd, N*n_batch);

        // inpL = norm*inpL
@@ -1434,7 +1439,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train(
    gf->perf_time_us = 0;

    const auto & hparams = model->hparams;
-    //const int n_ctx      = hparams.n_ctx;
+    const int n_ctx      = hparams.n_ctx;
    const int n_vocab    = hparams.n_vocab;
    const int n_embd     = hparams.n_embd;
    const int n_layer    = hparams.n_layer;
@@ -1603,7 +1608,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train(
        struct my_llama_layer & layer = model->layers[il];
        // tensors with values necessary for backward pass are in persistent buf(-1)
        // other tensors with buf(0) and buf(1) are only temporary needed, and their memory reused after layer is completed.
-        use_buf(-1); struct ggml_tensor * t02 = expand(gf, ggml_rms_norm     (ctx0, cur));                                    assert_shape_2d(t02, n_embd, N*n_batch);
+        use_buf(-1); struct ggml_tensor * t02 = expand(gf, ggml_rms_norm     (ctx0, cur, rms_norm_eps));                      assert_shape_2d(t02, n_embd, N*n_batch);
        use_buf( 0); struct ggml_tensor * t03 = expand(gf, ggml_repeat       (ctx0, layer.attention_norm, t02));              assert_shape_2d(t03, n_embd, N*n_batch);
        use_buf(-1); struct ggml_tensor * t04 = expand(gf, ggml_mul          (ctx0, t02, t03));                               assert_shape_2d(t04, n_embd, N*n_batch);
        use_buf(-1); struct ggml_tensor * t05 = expand(gf, ggml_mul_mat      (ctx0, layer.wq, t04));                          assert_shape_2d(t05, n_embd, N*n_batch);
@@ -1623,7 +1628,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train(
        use_buf(-1); struct ggml_tensor * t19 = expand(gf, ggml_reshape_2d   (ctx0, t18, n_embd, N*n_batch));                 assert_shape_2d(t19, n_embd, N*n_batch);
        use_buf( 0); struct ggml_tensor * t20 = expand(gf, ggml_mul_mat      (ctx0, layer.wo, t19));                          assert_shape_2d(t20, n_embd, N*n_batch);
        use_buf(-1); struct ggml_tensor * t21 = expand(gf, ggml_add          (ctx0, t20, cur));                               assert_shape_2d(t21, n_embd, N*n_batch);
-        use_buf(-1); struct ggml_tensor * t22 = expand(gf, ggml_rms_norm     (ctx0, t21));                                    assert_shape_2d(t22, n_embd, N*n_batch);
+        use_buf(-1); struct ggml_tensor * t22 = expand(gf, ggml_rms_norm     (ctx0, t21, rms_norm_eps));                      assert_shape_2d(t22, n_embd, N*n_batch);
        use_buf( 0); struct ggml_tensor * t23 = expand(gf, ggml_repeat       (ctx0, layer.ffn_norm, t22));                    assert_shape_2d(t23, n_embd, N*n_batch);
        use_buf(-1); struct ggml_tensor * t24 = expand(gf, ggml_mul          (ctx0, t23, t22));                               assert_shape_2d(t24, n_embd, N*n_batch);
        use_buf(-1); struct ggml_tensor * t25 = expand(gf, ggml_mul_mat      (ctx0, layer.w3, t24));                          assert_shape_2d(t25, n_ff, N*n_batch);
@@ -1666,7 +1671,7 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train(
    }
    clr_buf(0);
    use_buf(0);
-    struct ggml_tensor * t31   = expand(gf, ggml_rms_norm  (ctx0, cur));                       assert_shape_2d(t31, n_embd, N*n_batch);
+    struct ggml_tensor * t31   = expand(gf, ggml_rms_norm  (ctx0, cur, rms_norm_eps));         assert_shape_2d(t31, n_embd, N*n_batch);
    struct ggml_tensor * t32   = expand(gf, ggml_repeat    (ctx0, model->norm, t31));          assert_shape_2d(t32, n_embd, N*n_batch);
    struct ggml_tensor * t33   = expand(gf, ggml_mul       (ctx0, t32, t31));                  assert_shape_2d(t33, n_embd, N*n_batch);
    use_buf(-1);
@@ -1863,10 +1868,10 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train(
        t12->grad = expand(gb, ggml_permute(ctx0, t15->grad, 0, 2, 3, 1));                                            assert_shape_4d(t12->grad, N, n_batch, n_embd/n_head, n_head);
        t11->grad = expand(gb, ggml_reshape_2d(ctx0, ggml_cont(ctx0, t12->grad), N*n_batch, n_embd));                 assert_shape_2d(t11->grad, N*n_batch, n_embd);
        t10->grad = expand(gb, ggml_permute(ctx0, t14->grad, 0, 2, 1, 3));                                            assert_shape_4d(t10->grad, n_embd/n_head, n_head, N, n_batch);
-        t09->grad = expand(gb, ggml_rope_back(ctx0, t10->grad, n_past, n_rot, rope_mode));                            assert_shape_4d(t09->grad, n_embd/n_head, n_head, N, n_batch);
+        t09->grad = expand(gb, ggml_rope_back(ctx0, t10->grad, n_past, n_rot, rope_mode, n_ctx, 10000.0f, 1.0f, 0.0f, false));        assert_shape_4d(t09->grad, n_embd/n_head, n_head, N, n_batch);
        t08->grad = expand(gb, ggml_reshape_2d(ctx0, t09->grad, n_embd, N*n_batch));                                  assert_shape_2d(t08->grad, n_embd, N*n_batch);
        t07->grad = expand(gb, ggml_permute(ctx0, t13->grad, 0, 2, 1, 3));                                            assert_shape_4d(t07->grad, n_embd/n_head, n_head, N, n_batch);
-        t06->grad = expand(gb, ggml_rope_back(ctx0, t07->grad, n_past, n_rot, rope_mode));                            assert_shape_4d(t06->grad, n_embd/n_head, n_head, N, n_batch);
+        t06->grad = expand(gb, ggml_rope_back(ctx0, t07->grad, n_past, n_rot, rope_mode, n_ctx, 10000.0f, 1.0f, 0.0f, false));        assert_shape_4d(t06->grad, n_embd/n_head, n_head, N, n_batch);
        t05->grad = expand(gb, ggml_reshape_2d(ctx0, t06->grad, n_embd, N*n_batch));                                  assert_shape_2d(t05->grad, n_embd, N*n_batch);
        t04->grad = expand(gb, ggml_add_inplace(ctx0,
                        ggml_add_inplace(ctx0,
@@ -1959,7 +1964,7 @@ void print_matrix(struct ggml_tensor * probs) {


 void print_token(struct llama_context * ctx, llama_token token) {
-    printf("%s", llama_token_to_str(ctx, token));
+    printf("%s", llama_token_to_str(ctx, token).c_str());
 }

 void print_tokens(struct llama_context* ctx, struct ggml_tensor * tokens) {
@@ -1993,7 +1998,7 @@ void print_tokens_batch(struct llama_context* ctx, struct ggml_tensor * tokens)
    }
 }

-void get_example_targets(const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) {
+void get_example_targets(struct llama_context * lctx, const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) {
    int n_tokens = tokens_input->ne[0];
    int n_vocab  = target_logits->ne[0];

@@ -2002,7 +2007,7 @@ void get_example_targets(const int * train_samples, size_t n_train_samples, cons

    ggml_set_f32(target_logits, -1.0f/n_vocab);
    ggml_set_f32(target_probs, 0.0f);
-    ggml_set_i32_1d(tokens_input, 0, llama_token_bos());
+    ggml_set_i32_1d(tokens_input, 0, llama_token_bos(lctx));
    for (int i=1; i<n_tokens+1; ++i) {
        int token = clamp(train_data[sample+i-1], 0, n_vocab-1);
        set_f32_2d(target_logits, token, i-1, +1.0f);
@@ -2013,7 +2018,7 @@ void get_example_targets(const int * train_samples, size_t n_train_samples, cons
    }
 }

-void get_example_targets_batch(struct llama_context * /*lctx*/, const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) {
+void get_example_targets_batch(struct llama_context * lctx, const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) {
    GGML_ASSERT(tokens_input->n_dims  == 2);
    GGML_ASSERT(target_logits->n_dims == 3);
    GGML_ASSERT(target_probs->n_dims  == 3);
@@ -2033,7 +2038,7 @@ void get_example_targets_batch(struct llama_context * /*lctx*/, const int * trai
        size_t sample = train_samples[(example_id*n_batch + k) % n_train_samples];
        GGML_ASSERT(sample+n_tokens-1 < n_train_data);

-        set_i32_2d(tokens_input, 0, k, llama_token_bos());
+        set_i32_2d(tokens_input, 0, k, llama_token_bos(lctx));
        for (int i=1; i<n_tokens+1; ++i) {
            int token = clamp(train_data[sample+i-1], 0, n_vocab-1);
            // print_token(lctx, token);
@@ -2186,11 +2191,10 @@ int tokenize_file(struct llama_context * lctx, const char * filename, std::vecto
    f.read_raw(buf.data(), f.size);
    buf[f.size] = '\0';

-    out.resize(buf.size());
-
-    int n_tokens = llama_tokenize(lctx, buf.data(), out.data(), buf.size(), false);
-    if (n_tokens >= 0) {
-        out.resize(n_tokens);
+    int n_tokens = llama_tokenize(lctx, buf.data(), out.data(), out.size(), false);
+    if (n_tokens < 0) {
+        out.resize(-n_tokens);
+        llama_tokenize(lctx, buf.data(), out.data(), out.size(), false);
    }

    bool verify = false;
@@ -2198,17 +2202,17 @@ int tokenize_file(struct llama_context * lctx, const char * filename, std::vecto
        const char * in  = buf.data();
        const char * end = buf.data() + buf.size();
        for (int i = 0; i < (int) out.size(); ++i) {
-            const char * s = llama_token_to_str(lctx, out[i]);
-            int len = strlen(s);
+            std::string s = llama_token_to_str(lctx, out[i]);
+            int len = s.length();
            if (in >= end) {
                printf("%s: unexpected end of original text.\n", __func__);
                break;
            }
-            const bool matches = (strncmp(in, s, len) == 0);
+            const bool matches = (strncmp(in, s.c_str(), len) == 0);
            if (matches) {
                in += len;
            } else {
-                printf("%s: mismatch: expected '%s', but got '%s'\n", __func__, std::string(in, len).c_str(), s);
+                printf("%s: mismatch: expected '%s', but got '%s'\n", __func__, std::string(in, len).c_str(), s.c_str());
            }
        }
    }
@@ -2292,7 +2296,7 @@ llama_token sample(struct my_llama_sampler * sampler, float * logits, const llam
    const auto params = sampler->params;

    // Apply penalties
-    const float nl_logit = logits[llama_token_nl()];
+    const float nl_logit = logits[llama_token_nl(ctx)];

    const int n_last = std::min(std::min(n_last_tokens, params.repeat_last_n), sampler->n_ctx);

@@ -2311,7 +2315,7 @@ llama_token sample(struct my_llama_sampler * sampler, float * logits, const llam
        params.alpha_presence);

    if (!params.penalize_nl) {
-        logits[llama_token_nl()] = nl_logit;
+        logits[llama_token_nl(ctx)] = nl_logit;
    }

    llama_token token = 0;
@@ -2610,42 +2614,45 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
        return;
    }

-    // write_magic
-    file.write_u32(LLAMA_FILE_MAGIC);   // magic
-    file.write_u32(LLAMA_FILE_VERSION); // version
-    // write_hparams
-    file.write_u32(model->hparams.n_vocab);
-    file.write_u32(model->hparams.n_embd);
-    file.write_u32(model->hparams.n_mult);
-    file.write_u32(model->hparams.n_head);
-    file.write_u32(model->hparams.n_layer);
-    file.write_u32(model->hparams.n_rot);
-    file.write_u32(LLAMA_FTYPE_ALL_F32);
-    // write_vocab
-    uint32_t n_vocab = model->hparams.n_vocab;
-    for (uint32_t i = 0; i < n_vocab; i++) {
-        const auto & token_score = vocab->id_to_token.at(i);
-        file.write_u32((uint32_t) token_score.tok.size());
-        file.write_raw(token_score.tok.data(), token_score.tok.size());
-        file.write_raw(&token_score.score, sizeof(token_score.score));
-    }
-    // write tensors
-    write_tensor(&file, model->tok_embeddings);
-    write_tensor(&file, model->norm);
-    write_tensor(&file, model->output);
-    for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
-        auto & layer = model->layers[i];
-
-        write_tensor(&file, layer.attention_norm);
-        write_tensor(&file, layer.wq);
-        write_tensor(&file, layer.wk);
-        write_tensor(&file, layer.wv);
-        write_tensor(&file, layer.wo);
-        write_tensor(&file, layer.ffn_norm);
-        write_tensor(&file, layer.w1);
-        write_tensor(&file, layer.w2);
-        write_tensor(&file, layer.w3);
-    }
+#pragma message("TODO: implement file saving using gguf")
+    (void) vocab;
+    (void) model;
+//    // write_magic
+//    file.write_u32(LLAMA_FILE_MAGIC);   // magic
+//    file.write_u32(LLAMA_FILE_VERSION); // version
+//    // write_hparams
+//    file.write_u32(model->hparams.n_vocab);
+//    file.write_u32(model->hparams.n_embd);
+//    file.write_u32(model->hparams.n_mult);
+//    file.write_u32(model->hparams.n_head);
+//    file.write_u32(model->hparams.n_layer);
+//    file.write_u32(model->hparams.n_rot);
+//    file.write_u32(LLAMA_FTYPE_ALL_F32);
+//    // write_vocab
+//    uint32_t n_vocab = model->hparams.n_vocab;
+//    for (uint32_t i = 0; i < n_vocab; i++) {
+//        const auto & token_data = vocab->id_to_token.at(i);
+//        file.write_u32((uint32_t) token_data.tok.size());
+//        file.write_raw(token_data.tok.data(), token_data.tok.size());
+//        file.write_raw(&token_data.score, sizeof(token_data.score));
+//    }
+//    // write tensors
+//    write_tensor(&file, model->tok_embeddings);
+//    write_tensor(&file, model->norm);
+//    write_tensor(&file, model->output);
+//    for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
+//        auto & layer = model->layers[i];
+//
+//        write_tensor(&file, layer.attention_norm);
+//        write_tensor(&file, layer.wq);
+//        write_tensor(&file, layer.wk);
+//        write_tensor(&file, layer.wv);
+//        write_tensor(&file, layer.wo);
+//        write_tensor(&file, layer.ffn_norm);
+//        write_tensor(&file, layer.w1);
+//        write_tensor(&file, layer.w2);
+//        write_tensor(&file, layer.w3);
+//    }
 }

 float cosine_decay(const int decay_steps, const float alpha, int step) {
@@ -3050,20 +3057,13 @@ int main(int argc, char ** argv) {

    struct llama_vocab vocab;
    {
-        std::vector<const char *> strings;
-        std::vector<float> scores;
-        int n_vocab = llama_n_vocab(lctx);
-        strings.resize(n_vocab, NULL);
-        scores.resize(n_vocab, 0);
-        n_vocab = llama_get_vocab(lctx, strings.data(), scores.data(), n_vocab);
-        GGML_ASSERT(n_vocab == llama_n_vocab(lctx));
+        const int n_vocab = llama_n_vocab(lctx);
        vocab.id_to_token.resize(n_vocab);
        for (int i=0; i<n_vocab; ++i) {
-            std::string tok   = std::string(strings[i]);
-            float       score = scores[i];
-            vocab.id_to_token[i].tok   = tok;
-            vocab.id_to_token[i].score = score;
-            vocab.token_to_id.emplace(tok, i);
+            vocab.id_to_token[i].text  = llama_token_get_text(lctx, i);
+            vocab.id_to_token[i].score = llama_token_get_score(lctx, i);
+            vocab.id_to_token[i].type  = llama_token_get_type(lctx, i);
+            vocab.token_to_id.emplace(vocab.id_to_token[i].text, i);
        }
    }

@@ -3176,7 +3176,7 @@ int main(int argc, char ** argv) {
    std::vector<int> train_samples;
    train_samples.push_back(0);
    for (int i = 1; i < (int) train_tokens.size() - n_tokens; ++i) {
-        if (!params.samples_start_after_nl || (train_tokens[i-1] == llama_token_nl())) {
+        if (!params.samples_start_after_nl || (train_tokens[i-1] == llama_token_nl(lctx))) {
            train_samples.push_back(i);
        }
    }
@@ -3336,7 +3336,7 @@ int main(int argc, char ** argv) {
        struct ggml_tensor * target_logits = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, n_vocab,  n_tokens);
        struct ggml_tensor * target_probs  = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, n_vocab,  n_tokens);

-        get_example_targets(train_samples.data(), train_samples.size(), train_tokens.data(), train_tokens.size(), rand()%train_samples.size(), tokens_input, target_logits, target_probs);
+        get_example_targets(lctx, train_samples.data(), train_samples.size(), train_tokens.data(), train_tokens.size(), rand()%train_samples.size(), tokens_input, target_logits, target_probs);
        for (int i=sample_ctx; i<n_tokens; ++i) {
            ggml_set_i32_1d(tokens_input, i, n_vocab/2);
        }
--- a/flake.nix
+++ b/flake.nix
@@ -6,56 +6,66 @@
  outputs = { self, nixpkgs, flake-utils }:
    flake-utils.lib.eachDefaultSystem (system:
      let
-        inherit (pkgs.stdenv) isAarch64 isDarwin;
-        inherit (pkgs.lib) optionals;
-        isM1 = isAarch64 && isDarwin;
-        osSpecific = if isM1 then
-          with pkgs.darwin.apple_sdk_11_0.frameworks; [
-            Accelerate
-            MetalKit
-            MetalPerformanceShaders
-            MetalPerformanceShadersGraph
-          ]
-        else if isDarwin then
-          with pkgs.darwin.apple_sdk.frameworks; [
-            Accelerate
-            CoreGraphics
-            CoreVideo
-          ]
-        else
-          [ ];
+        inherit (pkgs.stdenv) isAarch32 isAarch64 isDarwin;
+        buildInputs = with pkgs; [ openmpi ];
+        osSpecific = with pkgs; buildInputs ++
+        (
+          if isAarch64 && isDarwin then
+            with pkgs.darwin.apple_sdk_11_0.frameworks; [
+              Accelerate
+              MetalKit
+            ]
+          else if isAarch32 && isDarwin then
+            with pkgs.darwin.apple_sdk.frameworks; [
+              Accelerate
+              CoreGraphics
+              CoreVideo
+            ]
+          else
+            with pkgs; [ openblas ]
+        );
        pkgs = import nixpkgs { inherit system; };
+        nativeBuildInputs = with pkgs; [ cmake pkgconfig ];
        llama-python =
-          pkgs.python310.withPackages (ps: with ps; [ numpy sentencepiece ]);
+          pkgs.python3.withPackages (ps: with ps; [ numpy sentencepiece ]);
+        postPatch = ''
+          substituteInPlace ./ggml-metal.m \
+            --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
+          substituteInPlace ./*.py --replace '/usr/bin/env python' '${llama-python}/bin/python'
+        '';
+        postInstall = ''
+          mv $out/bin/main $out/bin/llama
+          mv $out/bin/server $out/bin/llama-server
+        '';
+        cmakeFlags = [ "-DLLAMA_BUILD_SERVER=ON" "-DLLAMA_MPI=ON" "-DBUILD_SHARED_LIBS=ON" "-DCMAKE_SKIP_BUILD_RPATH=ON" ];
      in {
        packages.default = pkgs.stdenv.mkDerivation {
          name = "llama.cpp";
          src = ./.;
-          postPatch = if isM1 then ''
-            substituteInPlace ./ggml-metal.m \
-              --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
-          '' else
-            "";
-          nativeBuildInputs = with pkgs; [ cmake ];
+          postPatch = postPatch;
+          nativeBuildInputs = nativeBuildInputs;
          buildInputs = osSpecific;
-          cmakeFlags = [ "-DLLAMA_BUILD_SERVER=ON" ] ++ (optionals isM1 [
-            "-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1"
-            "-DLLAMA_METAL=ON"
+          cmakeFlags = cmakeFlags
+            ++ (if isAarch64 && isDarwin then [
+              "-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1"
+              "-DLLAMA_METAL=ON"
+            ] else [
+              "-DLLAMA_BLAS=ON"
+              "-DLLAMA_BLAS_VENDOR=OpenBLAS"
          ]);
-          installPhase = ''
-            runHook preInstall
-
-            mkdir -p $out/bin
-            mv bin/* $out/bin/
-            mv $out/bin/main $out/bin/llama
-            mv $out/bin/server $out/bin/llama-server
-
-            echo "#!${llama-python}/bin/python" > $out/bin/convert.py
-            cat ${./convert.py} >> $out/bin/convert.py
-            chmod +x $out/bin/convert.py
-
-            runHook postInstall
-          '';
+          postInstall = postInstall;
+          meta.mainProgram = "llama";
+        };
+        packages.opencl = pkgs.stdenv.mkDerivation {
+          name = "llama.cpp";
+          src = ./.;
+          postPatch = postPatch;
+          nativeBuildInputs = nativeBuildInputs;
+          buildInputs = with pkgs; buildInputs ++ [ clblast ];
+          cmakeFlags = cmakeFlags ++ [
+            "-DLLAMA_CLBLAST=ON"
+          ];
+          postInstall = postInstall;
          meta.mainProgram = "llama";
        };
        apps.llama-server = {
@@ -72,7 +82,7 @@
        };
        apps.default = self.apps.${system}.llama;
        devShells.default = pkgs.mkShell {
-          packages = with pkgs; [ cmake llama-python ] ++ osSpecific;
+          packages = nativeBuildInputs ++ osSpecific;
        };
      });
 }
--- a/ggml-alloc.c
+++ b/ggml-alloc.c
@@ -0,0 +1,579 @@
+#include "ggml-alloc.h"
+#include "ggml.h"
+#include <assert.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define UNUSED(x) (void)(x)
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+
+//#define GGML_ALLOCATOR_DEBUG
+
+//#define AT_PRINTF printf
+#define AT_PRINTF(...) ((void)0)
+
+struct hash_node {
+    struct ggml_tensor * t;
+    int n_children;
+    int n_views;
+};
+
+static size_t hash(void * p) {
+    return (size_t)p % GGML_GRAPH_HASHTABLE_SIZE;
+}
+
+static struct hash_node * hash_get(struct hash_node hash_table[], struct ggml_tensor * t) {
+    size_t h = hash(t);
+
+    // linear probing
+    size_t i = h;
+    while (hash_table[i].t != NULL) {
+        if (hash_table[i].t == t) {
+            return &hash_table[i];
+        }
+        i = (i + 1) % GGML_GRAPH_HASHTABLE_SIZE;
+        if (i == h) {
+            // hash table is full
+            GGML_ASSERT(false);
+        }
+    }
+
+    hash_table[i].t = t;
+    return &hash_table[i];
+}
+
+// TODO: GGML_PAD ?
+static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
+    assert(alignment && !(alignment & (alignment - 1))); // power of 2
+    size_t align = (alignment - (((uintptr_t)buffer + offset) % alignment)) % alignment;
+    return offset + align;
+}
+
+struct free_block {
+    void * addr;
+    size_t size;
+};
+
+#define MAX_FREE_BLOCKS 128
+
+struct ggml_allocr {
+    void * data;
+    size_t size;
+    size_t alignment;
+    int n_free_blocks;
+    struct free_block free_blocks[MAX_FREE_BLOCKS];
+    struct hash_node hash_table[GGML_GRAPH_HASHTABLE_SIZE];
+    size_t max_size;
+    bool measure;
+    int parse_seq[GGML_MAX_NODES];
+    bool has_parse_seq;
+
+#ifdef GGML_ALLOCATOR_DEBUG
+    struct ggml_tensor * allocated_tensors[1024];
+#endif
+};
+
+#ifdef GGML_ALLOCATOR_DEBUG
+static void add_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
+    for (int i = 0; i < 1024; i++) {
+        if (alloc->allocated_tensors[i] == NULL) {
+            alloc->allocated_tensors[i] = tensor;
+            return;
+        }
+    }
+    GGML_ASSERT(!"out of allocated_tensors");
+}
+static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
+    for (int i = 0; i < 1024; i++) {
+        if (alloc->allocated_tensors[i] == tensor ||
+            (alloc->allocated_tensors[i] != NULL && alloc->allocated_tensors[i]->data == tensor->data)) {
+            alloc->allocated_tensors[i] = NULL;
+            return;
+        }
+    }
+    printf("tried to free tensor %s not found\n", tensor->name);
+    GGML_ASSERT(!"tensor not found");
+}
+#endif
+
+
+static size_t ggml_allocator_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
+    return ggml_nbytes(tensor);
+
+    UNUSED(alloc);
+}
+
+void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
+    size_t size = ggml_allocator_get_alloc_size(alloc, tensor);
+    size = aligned_offset(NULL, size, alloc->alignment);
+
+    AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
+
+    size_t max_avail = 0;
+
+    // find the best fitting free block besides the last block
+    int best_fit_block = -1;
+    size_t best_fit_size = SIZE_MAX;
+    for (int i = 0; i < alloc->n_free_blocks - 1; i++) {
+        struct free_block * block = &alloc->free_blocks[i];
+        max_avail = MAX(max_avail, block->size);
+        if (block->size >= size && block->size <= best_fit_size) {
+            best_fit_block = i;
+            best_fit_size = block->size;
+        }
+    }
+
+    AT_PRINTF("block %d\n", best_fit_block);
+
+    if (best_fit_block == -1) {
+        // the last block is our last resort
+        struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
+        if (block->size >= size) {
+            best_fit_block = alloc->n_free_blocks - 1;
+            max_avail = MAX(max_avail, block->size);
+        } else {
+            fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
+                    __func__, size, max_avail);
+            GGML_ASSERT(!"not enough space in the buffer");
+        return;
+        }
+    }
+    struct free_block * block = &alloc->free_blocks[best_fit_block];
+    void * addr = block->addr;
+    block->addr = (char*)block->addr + size;
+    block->size -= size;
+    if (block->size == 0) {
+        // remove block if empty
+        alloc->n_free_blocks--;
+        for (int j = best_fit_block; j < alloc->n_free_blocks; j++) {
+            alloc->free_blocks[j] = alloc->free_blocks[j+1];
+        }
+    }
+
+    tensor->data = addr;
+
+#ifdef GGML_ALLOCATOR_DEBUG
+    add_allocated_tensor(alloc, tensor);
+    size_t cur_max = (char*)addr - (char*)alloc->data + size;
+    if (cur_max > alloc->max_size) {
+        printf("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
+        for (int i = 0; i < 1024; i++) {
+            if (alloc->allocated_tensors[i]) {
+                printf("%s (%.2f MB) ", alloc->allocated_tensors[i]->name, ggml_nbytes(alloc->allocated_tensors[i]) / 1024.0 / 1024.0);
+            }
+        }
+        printf("\n");
+    }
+#endif
+
+    alloc->max_size = MAX(alloc->max_size, (char*)addr - (char*)alloc->data + size);
+}
+
+// this is a very naive implementation, but for our case the number of free blocks should be very small
+static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
+    void * ptr = tensor->data;
+
+    if (ptr < alloc->data || (char*)ptr >= (char*)alloc->data + alloc->max_size) {
+        // the tensor was not allocated in this buffer
+        // this can happen because the graph allocator will try to free weights and other tensors from different buffers
+        // the easiest way to deal with this is just to ignore it
+        return;
+    }
+
+    size_t size = ggml_allocator_get_alloc_size(alloc, tensor);
+    size = aligned_offset(NULL, size, alloc->alignment);
+    AT_PRINTF("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, alloc->n_free_blocks);
+
+#ifdef GGML_ALLOCATOR_DEBUG
+    remove_allocated_tensor(alloc, tensor);
+#endif
+
+    // see if we can merge with an existing block
+    for (int i = 0; i < alloc->n_free_blocks; i++) {
+        struct free_block * block = &alloc->free_blocks[i];
+        // check if ptr is at the end of the block
+        if ((char*)block->addr + block->size == ptr) {
+            block->size += size;
+            // check if we can merge with the next block
+            if (i < alloc->n_free_blocks - 1 && (char*)block->addr + block->size == alloc->free_blocks[i+1].addr) {
+                block->size += alloc->free_blocks[i+1].size;
+                alloc->n_free_blocks--;
+                for (int j = i+1; j < alloc->n_free_blocks; j++) {
+                    alloc->free_blocks[j] = alloc->free_blocks[j+1];
+                }
+            }
+            return;
+        }
+        // check if ptr is at the beginning of the block
+        if ((char*)ptr + size == block->addr) {
+            block->addr = ptr;
+            block->size += size;
+            // check if we can merge with the previous block
+            if (i > 0 && (char*)alloc->free_blocks[i-1].addr + alloc->free_blocks[i-1].size == block->addr) {
+                alloc->free_blocks[i-1].size += block->size;
+                alloc->n_free_blocks--;
+                for (int j = i; j < alloc->n_free_blocks; j++) {
+                    alloc->free_blocks[j] = alloc->free_blocks[j+1];
+                }
+            }
+            return;
+        }
+    }
+    // otherwise, add a new block
+    GGML_ASSERT(alloc->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");
+    // insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster)
+    int insert_pos = 0;
+    while (insert_pos < alloc->n_free_blocks && alloc->free_blocks[insert_pos].addr < ptr) {
+        insert_pos++;
+    }
+    // shift all blocks from insert_pos onward to make room for the new block
+    for (int i = alloc->n_free_blocks; i > insert_pos; i--) {
+        alloc->free_blocks[i] = alloc->free_blocks[i-1];
+    }
+    // insert the new block
+    alloc->free_blocks[insert_pos].addr = ptr;
+    alloc->free_blocks[insert_pos].size = size;
+    alloc->n_free_blocks++;
+}
+
+void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, int * list, int n) {
+    int pos = 0;
+    for (int i = 0; i < n; i++) {
+        if (list[i] != -1) {
+            alloc->parse_seq[pos] = list[i];
+            pos++;
+        }
+    }
+    alloc->has_parse_seq = true;
+}
+
+void ggml_allocr_reset(struct ggml_allocr * alloc) {
+    alloc->n_free_blocks = 1;
+    size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment);
+    alloc->free_blocks[0].addr = (char *)alloc->data + align_offset;
+    alloc->free_blocks[0].size = alloc->size - align_offset;
+}
+
+struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment) {
+    struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
+
+    *alloc = (struct ggml_allocr){
+        /*.data          = */ data,
+        /*.size          = */ size,
+        /*.alignment     = */ alignment,
+        /*.n_free_blocks = */ 0,
+        /*.free_blocks   = */ {{0}},
+        /*.hash_table    = */ {{0}},
+        /*.max_size      = */ 0,
+        /*.measure       = */ false,
+        /*.parse_seq     = */ {0},
+        /*.has_parse_seq = */ false,
+#ifdef GGML_ALLOCATOR_DEBUG
+        /*.allocated_tensors = */ = {0},
+#endif
+    };
+
+    ggml_allocr_reset(alloc);
+
+    return alloc;
+}
+
+// address and size of the buffer when measuring
+// it needs to be large enough to fit all the tensors, but it cannot overlap with other existing buffers
+static void * const MEASURE_BASE_ADDR = (void *) 0x1000;
+static const size_t MEASURE_MAX_SIZE  = 1ULL<<40; // 1 TB
+
+struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
+    struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
+
+    *alloc = (struct ggml_allocr){
+        /*.data          = */ MEASURE_BASE_ADDR,
+        /*.size          = */ MEASURE_MAX_SIZE,
+        /*.alignment     = */ alignment,
+        /*.n_free_blocks = */ 0,
+        /*.free_blocks   = */ {{0}},
+        /*.hash_table    = */ {{0}},
+        /*.max_size      = */ 0,
+        /*.measure       = */ true,
+        /*.parse_seq     = */ {0},
+        /*.has_parse_seq = */ false,
+#ifdef GGML_ALLOCATOR_DEBUG
+        /*.allocated_tensors = */ = {0},
+#endif
+    };
+
+    ggml_allocr_reset(alloc);
+
+    return alloc;
+}
+
+void ggml_allocr_free(struct ggml_allocr * alloc) {
+    free(alloc);
+}
+
+bool ggml_allocr_is_measure(struct ggml_allocr * alloc) {
+    return alloc->measure;
+}
+
+//////////// compute graph allocator
+
+static bool ggml_is_view(struct ggml_tensor * t) {
+    return t->op == GGML_OP_RESHAPE || t->op == GGML_OP_VIEW || t->op == GGML_OP_TRANSPOSE ||
+           t->op == GGML_OP_PERMUTE || t->op == GGML_OP_CPY;
+}
+
+static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
+    if (a->type != b->type) {
+        return false;
+    }
+    for (int i = 0; i < GGML_MAX_DIMS; i++) {
+        if (a->ne[i] != b->ne[i]) {
+            return false;
+        }
+        if (a->nb[i] != b->nb[i]) {
+            return false;
+        }
+    }
+    return true;
+}
+
+static struct ggml_tensor * get_view_parent(struct ggml_tensor * t) {
+    switch (t->op) {
+        case GGML_OP_PERMUTE:
+        case GGML_OP_RESHAPE:
+        case GGML_OP_TRANSPOSE:
+        case GGML_OP_VIEW:
+            return t->src[0];
+        case GGML_OP_CPY:
+            return t->src[1];
+        default:
+            return NULL;
+    }
+}
+
+static struct ggml_tensor * get_view_source(struct ggml_tensor * t) {
+    struct ggml_tensor * parent = t;
+    do {
+        parent = get_view_parent(parent);
+    } while (ggml_is_view(parent));
+    return parent;
+}
+
+static bool ggml_op_can_inplace(enum ggml_op op) {
+    switch (op) {
+        case GGML_OP_SCALE:
+        case GGML_OP_DIAG_MASK_ZERO:
+        case GGML_OP_DIAG_MASK_INF:
+        case GGML_OP_ADD:
+        case GGML_OP_ADD1:
+        case GGML_OP_ACC:
+        case GGML_OP_SUB:
+        case GGML_OP_MUL:
+        case GGML_OP_DIV:
+        case GGML_OP_SQR:
+        case GGML_OP_SQRT:
+        case GGML_OP_LOG:
+        case GGML_OP_UNARY:
+        case GGML_OP_ROPE:
+        case GGML_OP_RMS_NORM:
+        case GGML_OP_SET:
+        case GGML_OP_SOFT_MAX:
+        case GGML_OP_CONT:
+            return true;
+
+        default:
+            return false;
+    }
+}
+
+static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node) {
+    struct hash_node * ht = alloc->hash_table;
+    if (node->data == NULL) {
+        if (ggml_is_view(node)) {
+            size_t offset;
+            switch(node->op) {
+                case GGML_OP_VIEW:
+                    memcpy(&offset, node->op_params, sizeof(size_t));
+                    node->data = (char *) node->src[0]->data + offset;
+                    break;
+                case GGML_OP_PERMUTE:
+                case GGML_OP_RESHAPE:
+                case GGML_OP_TRANSPOSE:
+                    node->data = node->src[0]->data;
+                    break;
+                case GGML_OP_CPY:
+                    node->data = node->src[1]->data;
+                    break;
+                default:
+                    GGML_ASSERT(!"unknown view op");
+                    break;
+            }
+        } else {
+            // see if we can reuse a parent's buffer (inplace)
+            if (ggml_op_can_inplace(node->op)) {
+                for (int i = 0; i < GGML_MAX_SRC; i++) {
+                    struct ggml_tensor * parent = node->src[i];
+                    if (parent == NULL) {
+                        break;
+                    }
+
+                    // if the node's data is external, then we cannot re-use it
+                    if ((char *) parent->data < (char *) alloc->data ||
+                        (char *) parent->data >= ((char *) alloc->data + alloc->size)) {
+                        AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data);
+                        continue;
+                    }
+
+                    struct hash_node * p_hn = hash_get(ht, parent);
+                    if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && ggml_are_same_layout(node, parent)) {
+                        if (ggml_is_view(parent)) {
+                            struct ggml_tensor * view_src = get_view_source(parent);
+                            struct hash_node * view_src_hn = hash_get(ht, view_src);
+                            if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
+                                // TODO: the offset of the view parent must be kept to ensure that the op doesn't overwrite
+                                // the parent's data that it will need later (same layout requirement). the problem is that then
+                                // we cannot free the tensor because the original address of the allocation is lost.
+                                // adding a view_src pointer to the tensor would solve this and simplify the code dealing with views
+                                // for now, we only reuse the parent's data if the offset is zero (view_src->data == parent->data)
+                                AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
+                                node->data = parent->data;
+                                return;
+                            }
+                        }
+                        else {
+                            AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
+                            node->data = parent->data;
+                        }
+                        return;
+                    }
+                }
+            }
+            ggml_allocr_alloc(alloc, node);
+        }
+    }
+}
+
+static size_t ggml_allocator_alloc_graph_tensors_n(
+    struct ggml_allocr * alloc,
+    struct ggml_cgraph ** graphs, int n_graphs,
+    struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) {
+
+    // reset hash table
+    struct hash_node * ht = alloc->hash_table;
+    memset(ht, 0, sizeof(struct hash_node) * GGML_GRAPH_HASHTABLE_SIZE);
+
+    // count number of children and views
+    for (int g = 0; g < n_graphs; g++) {
+        struct ggml_cgraph * gf = graphs[g];
+        for (int i = 0; i < gf->n_nodes; i++) {
+            struct ggml_tensor * node = gf->nodes[i];
+
+            if (ggml_is_view(node)) {
+                struct ggml_tensor * view_src = get_view_source(node);
+                hash_get(ht, view_src)->n_views += 1;
+            }
+
+            for (int j = 0; j < GGML_MAX_SRC; j++) {
+                struct ggml_tensor * parent = node->src[j];
+                if (parent == NULL) {
+                    break;
+                }
+                hash_get(ht, parent)->n_children += 1;
+            }
+        }
+    }
+
+    // allocate tensors
+    for (int g = 0; g < n_graphs; g++) {
+        struct ggml_cgraph * gf = graphs[g];
+        AT_PRINTF("####### graph %d/%d\n", g, n_graphs);
+        // graph inputs are allocated first to ensure that they are not overwritten by each other
+        if (inputs != NULL && inputs[g] != NULL) {
+            for (int i = 0; inputs[g][i] != NULL; i++) {
+                struct ggml_tensor * input = inputs[g][i];
+                AT_PRINTF("input: %s\n", input->name);
+                allocate_node(alloc, input);
+            }
+        }
+        for (int ind = 0; ind < gf->n_nodes; ind++) {
+            int i;
+            if (alloc->has_parse_seq) {
+                i = alloc->parse_seq[ind];
+            } else {
+                i = ind;
+            }
+            struct ggml_tensor * node = gf->nodes[i];
+
+            // allocate parents (leafs)
+            for (int j = 0; j < GGML_MAX_SRC; j++) {
+                struct ggml_tensor * parent = node->src[j];
+                if (parent == NULL) {
+                    break;
+                }
+                allocate_node(alloc, parent);
+            }
+
+            // allocate node
+            allocate_node(alloc, node);
+
+            AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name);
+            for (int j = 0; j < GGML_MAX_SRC; j++) {
+                struct ggml_tensor * parent = node->src[j];
+                if (parent == NULL) {
+                    break;
+                }
+                AT_PRINTF("%s", parent->name);
+                if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
+                    AT_PRINTF(", ");
+                }
+            }
+            AT_PRINTF("\n");
+
+            // update parents
+            for (int j = 0; j < GGML_MAX_SRC; j++) {
+                struct ggml_tensor * parent = node->src[j];
+                if (parent == NULL) {
+                    break;
+                }
+                struct hash_node * p_hn = hash_get(ht, parent);
+                p_hn->n_children -= 1;
+
+                //AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views);
+
+                if (p_hn->n_children == 0 && p_hn->n_views == 0) {
+                    if (ggml_is_view(parent)) {
+                        struct ggml_tensor * view_src = get_view_source(parent);
+                        struct hash_node * view_src_hn = hash_get(ht, view_src);
+                        view_src_hn->n_views -= 1;
+                        AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src->n_children, view_src->n_views);
+                        if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
+                            ggml_allocator_free_tensor(alloc, view_src);
+                        }
+                    }
+                    else {
+                        if (parent->data != node->data) {
+                            ggml_allocator_free_tensor(alloc, parent);
+                        }
+                    }
+                }
+            }
+            AT_PRINTF("\n");
+        }
+        // free graph outputs here that wouldn't be freed otherwise because they have no children
+        if (outputs != NULL && outputs[g] != NULL) {
+            for (int i = 0; outputs[g][i] != NULL; i++) {
+                struct ggml_tensor * output = outputs[g][i];
+                AT_PRINTF("output: %s\n", output->name);
+                ggml_allocator_free_tensor(alloc, output);
+            }
+        }
+    }
+
+    return alloc->max_size;
+}
+
+size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) {
+    return ggml_allocator_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
+}
--- a/ggml-alloc.h
+++ b/ggml-alloc.h
@@ -0,0 +1,26 @@
+#pragma once
+
+#include "ggml.h"
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+
+GGML_API struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment);
+GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);
+
+// tell the allocator to parse nodes following the order described in the list
+// you should call this if your graph are optimized to execute out-of-order
+GGML_API void   ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, int * list, int n);
+
+GGML_API void   ggml_allocr_free(struct ggml_allocr * alloc);
+GGML_API bool   ggml_allocr_is_measure(struct ggml_allocr * alloc);
+GGML_API void   ggml_allocr_reset(struct ggml_allocr * alloc);
+GGML_API void   ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor);
+GGML_API size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph);
+
+
+#ifdef  __cplusplus
+}
+#endif
--- a/ggml-backend.c
+++ b/ggml-backend.c
--- a/ggml-backend.h
+++ b/ggml-backend.h
@@ -1,162 +0,0 @@
-#pragma once
-
-#include "ggml.h"
-
-#ifdef  __cplusplus
-extern "C" {
-#endif
-    struct ggml_backend;
-
-    // backend buffer
-    typedef void * ggml_buffer_context_t;
-    struct ggml_backend_buffer;
-
-    struct ggml_backend_buffer_interface {
-        // allocator functions
-        void   (*free_buffer)   (struct ggml_backend_buffer * alloc);
-        void   (*alloc_tensor)  (struct ggml_backend_buffer * alloc, struct ggml_tensor * tensor);
-        void   (*free_tensor)   (struct ggml_backend_buffer * alloc, struct ggml_tensor * tensor);
-        void   (*reset)         (struct ggml_backend_buffer * alloc);
-        // functions overriden by the backend
-        size_t (*get_alloc_size)(struct ggml_backend_buffer * alloc, struct ggml_tensor * tensor); // pre-allocation callback
-        void   (*init_tensor)   (struct ggml_backend_buffer * alloc, struct ggml_tensor * tensor); // post-allocation callback
-        void   (*free_data)     (struct ggml_backend_buffer * alloc); // free backend-specific data // TODO: better name
-    };
-
-    struct ggml_backend_buffer {
-        struct ggml_backend_buffer_interface interface;
-        ggml_buffer_context_t context;
-        struct ggml_backend * backend;
-        void * backend_data;
-        bool measure;
-        size_t max_size;
-    };
-
-    // backend buffer helper functions
-    GGML_API      void ggml_backend_buffer_free(struct ggml_backend_buffer * alloc);
-    static inline void ggml_backend_buffer_tensor_alloc(struct ggml_backend_buffer * alloc, struct ggml_tensor * tensor) { alloc->interface.alloc_tensor(alloc, tensor); }
-    static inline void ggml_backend_buffer_tensor_free(struct ggml_backend_buffer * alloc, struct ggml_tensor * tensor) { alloc->interface.free_tensor(alloc, tensor); }
-    static inline void ggml_backend_buffer_reset(struct ggml_backend_buffer * alloc) { alloc->interface.reset(alloc); }
-
-    // default buffer allocator
-    GGML_API struct ggml_backend_buffer * ggml_allocator_default_init(void * data, size_t size, size_t alignment);
-
-    // buffer
-
-    // buffers have space for the tensor structs in host memory, and tensor data in backend-specific memory
-    struct ggml_buffer {
-        // host memory
-        size_t mem_size;
-        void * mem_buffer;
-
-        // tensor data
-        struct ggml_backend_buffer * backend_buffer;
-    };
-
-    GGML_API struct ggml_buffer * ggml_buffer_alloc        (struct ggml_backend * backend, size_t size, size_t max_tensors);
-    GGML_API struct ggml_buffer * ggml_buffer_measure_alloc(struct ggml_backend * backend, size_t max_tensors);
-    // measure buffers only calculate the maximum size of the buffer without allocating it - useful for pre-allocation
-    GGML_API void ggml_buffer_free(struct ggml_buffer * buffer);
-
-    // backend
-    typedef void * ggml_backend_context_t;
-    typedef void * ggml_graph_plan_t;
-
-    struct ggml_backend_interface {
-        const char * (*get_name)(struct ggml_backend * backend);
-
-        void (*free)(struct ggml_backend * backend);
-
-        // buffer allocation
-        struct ggml_backend_buffer * (*alloc_buffer)(struct ggml_backend * backend, size_t size);
-
-        // tensor data access
-        // these functions can be asynchronous. helper functions are provided for synchronous access that automatically call synchronize
-        void (*set_tensor_async)(struct ggml_backend * backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-        void (*get_tensor_async)(struct ggml_backend * backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
-        void (*synchronize)     (struct ggml_backend * backend);
-
-        // (optional) copy tensor between different backends, allow for single-copy tranfers
-        void (*cpy_tensor_from)(struct ggml_backend * backend, struct ggml_tensor * src, struct ggml_tensor * dst);
-        void (*cpy_tensor_to)  (struct ggml_backend * backend, struct ggml_tensor * src, struct ggml_tensor * dst);
-
-        // compute graph with a plan
-        ggml_graph_plan_t (*graph_plan_create) (struct ggml_backend * backend, struct ggml_cgraph * cgraph);
-        void              (*graph_plan_free)   (struct ggml_backend * backend, ggml_graph_plan_t plan);
-        void              (*graph_plan_compute)(struct ggml_backend * backend, ggml_graph_plan_t plan);
-
-        // compute graph without a plan
-        void              (*graph_compute)     (struct ggml_backend * backend, struct ggml_cgraph * cgraph);
-
-        // check if a backend supports a given operation
-        // this could be used to fallback automatically to the CPU backend if a backend doesn't support an operation
-        // bool (*supports_op)(struct ggml_backend * backend, struct ggml_tensor * op);
-    };
-
-    struct ggml_backend {
-        struct ggml_backend_interface interface;
-        ggml_backend_context_t context;
-    };
-
-    // backend helper functions
-    static inline const char * ggml_backend_name(struct ggml_backend * backend) { return backend->interface.get_name(backend); }
-    static inline void ggml_backend_free(struct ggml_backend * backend) { backend->interface.free(backend); }
-    static inline void ggml_backend_tensor_set_async(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { tensor->backend->interface.set_tensor_async(tensor->backend, tensor, data, offset, size); }
-    static inline void ggml_backend_tensor_get_async(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { tensor->backend->interface.get_tensor_async(tensor->backend, tensor, data, offset, size); }
-    static inline void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { tensor->backend->interface.set_tensor_async(tensor->backend, tensor, data, offset, size); tensor->backend->interface.synchronize(tensor->backend); }
-    static inline void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { tensor->backend->interface.get_tensor_async(tensor->backend, tensor, data, offset, size); tensor->backend->interface.synchronize(tensor->backend); }
-    static inline void ggml_backend_synchronize(struct ggml_backend * backend) { backend->interface.synchronize(backend); }
-    static inline ggml_graph_plan_t ggml_backend_graph_plan_create(struct ggml_backend * backend, struct ggml_cgraph * cgraph) { return backend->interface.graph_plan_create(backend, cgraph); }
-    static inline void ggml_backend_graph_plan_free(struct ggml_backend * backend, ggml_graph_plan_t plan) { backend->interface.graph_plan_free(backend, plan); }
-    static inline void ggml_backend_graph_plan_compute(struct ggml_backend * backend, ggml_graph_plan_t plan) { backend->interface.graph_plan_compute(backend, plan); }
-    static inline void ggml_backend_graph_compute(struct ggml_backend * backend, struct ggml_cgraph * cgraph) { backend->interface.graph_compute(backend, cgraph); }
-
-    // tensor copy between different backends
-    GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
-
-    // CPU backend
-    GGML_API struct ggml_backend * ggml_backend_cpu_init(void);
-    GGML_API void ggml_backend_cpu_set_n_threads(struct ggml_backend * backend_cpu, int n_threads);
-
-    ///////////////////////////
-
-    // graph splitting
-    #define GGML_MAX_SPLITS 200
-    #define GGML_MAX_SPLIT_INPUTS 4
-
-    struct ggml_graph_split {
-        char name[GGML_MAX_NAME];
-        struct ggml_context * ctx;
-        struct ggml_tensor  * src_inputs[GGML_MAX_SPLIT_INPUTS + 1];
-        struct ggml_tensor  * dst_inputs[GGML_MAX_SPLIT_INPUTS + 1];
-        struct ggml_cgraph  * graph;
-    };
-
-    // TODO: this shouldn't be fixed size, allocate from ggml_context
-    struct ggml_graph_splits {
-        int n_splits;
-        struct ggml_graph_split splits[GGML_MAX_SPLITS];
-    };
-
-    // TODO: allocate in ggml_context
-    struct ggml_graph_splits ggml_graph_split_init(void);
-    // this won't be needed once we can allocate graphs from a ggml_context
-    GGML_API void ggml_graph_splits_free(struct ggml_graph_splits * splits);
-
-    // add a split to the graph - single and multiple inputs versions
-    GGML_API void ggml_graph_splits_add(struct ggml_graph_splits * splits, struct ggml_tensor ** input, struct ggml_context * ctx, const char * fmt, ...);
-    GGML_API void ggml_graph_splits_add_n(struct ggml_graph_splits * splits, struct ggml_tensor *** inputs, struct ggml_context * ctx, const char * fmt, ...);
-
-    // build graphs for all splits
-    GGML_API void ggml_graph_splits_build_forward(struct ggml_graph_splits * splits, struct ggml_tensor * output);
-
-    // compute
-    GGML_API void ggml_graph_splits_compute(struct ggml_graph_splits * splits);
-
-    // graph tensor allocator
-    GGML_API void ggml_graph_allocate_tensors(struct ggml_cgraph * graph, struct ggml_context * ctx);
-    GGML_API void ggml_graph_splits_allocate_tensors(struct ggml_graph_splits * splits);
-
-#ifdef  __cplusplus
-}
-#endif
--- a/ggml-cuda-kern.h
+++ b/ggml-cuda-kern.h
@@ -1,468 +0,0 @@
-// kernels for ggml-cuda
-#include <cuda.h>
-#include <cuda_fp16.h>
-
-
-template<typename dst_t>
-using to_t_cuda_t = void (*)(const void * x, dst_t * y, int k, cudaStream_t stream);
-
-// support for vector types in generic code
-template<typename T> struct vec2_t_impl;
-template<> struct vec2_t_impl<half>   { typedef half2 type; };
-template<> struct vec2_t_impl<float>  { typedef float2 type; };
-
-template<typename T> using vec2_t = typename vec2_t_impl<T>::type;
-
-template<typename T> inline __host__ __device__ vec2_t<T> make_vec2_t(const T & x, const T & y);
-template<> inline __host__ __device__ vec2_t<half>  make_vec2_t(const  half & x, const  half & y) { return make_half2 (x, y); }
-template<> inline __host__ __device__ vec2_t<float> make_vec2_t(const float & x, const float & y) { return make_float2(x, y); }
-
-// the cuda headers define operators for half2, but not for float2
-// they are defined here to simplify generic code
-inline __host__ __device__ float2   operator+(const float2 & a, const float2 & b) { return make_float2(a.x + b.x, a.y + b.y); }
-inline __host__ __device__ float2   operator-(const float2 & a, const float2 & b) { return make_float2(a.x - b.x, a.y - b.y); }
-inline __host__ __device__ float2   operator*(const float2 & a, const float2 & b) { return make_float2(a.x * b.x, a.y * b.y); }
-inline __host__ __device__ float2   operator/(const float2 & a, const float2 & b) { return make_float2(a.x / b.x, a.y / b.y); }
-inline __host__ __device__ float2 & operator+=(     float2 & a, const float2 & b) { a.x += b.x; a.y += b.y; return a; }
-inline __host__ __device__ float2 & operator-=(     float2 & a, const float2 & b) { a.x -= b.x; a.y -= b.y; return a; }
-inline __host__ __device__ float2 & operator*=(     float2 & a, const float2 & b) { a.x *= b.x; a.y *= b.y; return a; }
-inline __host__ __device__ float2 & operator/=(     float2 & a, const float2 & b) { a.x /= b.x; a.y /= b.y; return a; }
-
-template<typename dst_t>
-using dequantize_kernel_t = void (*)(const void * vx, const int ib, const int iqs, vec2_t<dst_t> & v);
-
-__device__ half  sqrt(const half x) { return hsqrt(x); }
-__device__ half  exp(const half x) { return hexp(x); }
-__device__ half2 exp(const half2 x) { return h2exp(x); }
-__device__ half  cos(const half x) { return hcos(x); }
-__device__ half  sin(const half x) { return hsin(x); }
-__device__ half  max(const half x, const half y) { return __hmax(x, y); }
-__device__ half2 max(const half2 x, const half2 y) { return __hmax2(x, y); }
-
-
-template<typename T> struct op_max { __device__ T operator()(T a, T b) const { return max(a, b); } };
-template<typename T> struct op_sum { __device__ T operator()(T a, T b) const { return a + b; } };
-
-template<template<typename> class op_t, typename T>
-static inline __device__ T warp_reduce_all(T val) {
-    op_t<T> op;
-#pragma unroll
-    for (int mask = warpSize/2; mask > 0; mask /= 2)  {
-        val = op(val, __shfl_xor_sync(0xffffffff, val, mask, 32));
-    }
-    return val;
-}
-
-template<typename T>
-static __device__ T zero_init() { return T(0); }
-template<>
-__device__ half2 zero_init() { return half2(0.0f, 0.0f); }
-
-template<template<typename> class op_t, typename T>
-static __device__ T block_reduce_all(const T val, const T init = zero_init<T>()) {
-    const int warp_id = threadIdx.x / warpSize; // warp id within the block
-    const int lane_id = threadIdx.x % warpSize; // lane id within the warp
-    const int num_warps = blockDim.x / warpSize; // number of warps in the block
-
-    __shared__ T lane_result[32]; // max 32 warps per block
-
-    // reduce warps
-    T warp_reduction = warp_reduce_all<op_t>(val);
-
-    __syncthreads();
-
-    // first thread within a warp writes reduction to shared memory
-    if (lane_id == 0) {
-        lane_result[warp_id] = warp_reduction;
-    }
-
-    // wait for all warps to finish writing their reductions
-    __syncthreads();
-
-    // reduce the results of all warps
-    T block_reduction = init;
-    if (lane_id < num_warps) {
-        block_reduction = lane_result[lane_id];
-    }
-
-    block_reduction = warp_reduce_all<op_t>(block_reduction);
-
-    return block_reduction;
-}
-
-template<typename dst_t>
-static __device__ void convert_fp16(const void * vx, const int ib, const int iqs, vec2_t<dst_t> & v) {
-    const half * x = (const half *) vx;
-
-    v.x = (dst_t)(x[ib + iqs + 0]);
-    v.y = (dst_t)(x[ib + iqs + 1]);
-}
-
-template<typename dst_t>
-static __device__ void convert_fp32(const void * vx, const int ib, const int iqs, vec2_t<dst_t> & v) {
-    const float * x = (const float *) vx;
-
-    v.x = (dst_t)(x[ib + iqs + 0]);
-    v.y = (dst_t)(x[ib + iqs + 1]);
-}
-
-template<typename src0_t, typename src1_t, typename dst_t>
-static __global__ void k_mul_mat_p021(const src0_t * vx, const src1_t * y, dst_t * dst, const int ncols_x, const int nrows_x, const int nchannels_x) {
-    const src0_t * x = vx;
-    // const int col_x = blockDim.x*blockIdx.x + threadIdx.x;
-    // const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
-
-    const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
-    const int channel = blockDim.z*blockIdx.z + threadIdx.z;
-
-    const int nrows_y = ncols_x;
-    const int nrows_dst = nrows_x;
-    const int row_dst = row_x;
-
-    dst_t tmp = 0;
-
-    for (int col_x0 = 0; col_x0 < ncols_x; col_x0 += blockDim.x) {
-        const int col_x = col_x0 + threadIdx.x;
-
-        if (col_x >= ncols_x) {
-            break;
-        }
-
-        // x is transposed and permuted
-        const int ix = row_x*nchannels_x*ncols_x + channel*ncols_x + col_x;
-        const dst_t xi = (dst_t)(x[ix]);
-
-        const int row_y = col_x;
-
-        // y is not transposed but permuted
-        const int iy = channel*nrows_y + row_y;
-
-        tmp += xi * y[iy];
-    }
-
-    // dst is not transposed and not permuted
-    const int idst = channel*nrows_dst + row_dst;
-
-    // sum up partial sums and write back result
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
-    }
-
-    if (threadIdx.x == 0) {
-        dst[idst] = tmp;
-    }
-}
-
-template<typename src0_t, typename src1_t, typename dst_t>
-static __global__ void k_mul_mat_vec_nc(
-    const src0_t * vx, const src1_t * y, dst_t * dst, const int ncols_x, const int nrows_x,
-    const int row_stride_x, const int nchannels_x, const int channel_stride_x) {
-
-    const src0_t * x = vx;
-
-    const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
-    const int channel = blockDim.z*blockIdx.z + threadIdx.z;
-
-    const int nrows_y = ncols_x;
-    const int nrows_dst = nrows_x;
-    const int row_dst = row_x;
-
-    const int idst = channel*nrows_dst + row_dst;
-
-    dst_t tmp = 0;
-
-    for (int col_x0 = 0; col_x0 < ncols_x; col_x0 += blockDim.x) {
-        const int col_x = col_x0 + threadIdx.x;
-
-        if (col_x >= ncols_x) {
-            break;
-        }
-
-        const int ix = channel*channel_stride_x + row_x*row_stride_x + col_x;
-        const dst_t xi = (dst_t)(x[ix]);
-
-        const int row_y = col_x;
-
-        const int iy = channel*nrows_y + row_y;
-
-        tmp += xi * y[iy];
-    }
-
-    // sum up partial sums and write back result
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
-    }
-
-    if (threadIdx.x == 0) {
-        dst[idst] = tmp;
-    }
-}
-
-template <typename src_t, typename dst_t>
-static __global__ void k_cpy(const char * cx, char * cdst, const int ne,
-                                   const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
-                                   const int ne10, const int ne11, const int nb10, const int nb11, const int nb12) {
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
-
-    if (i >= ne) {
-        return;
-    }
-
-    const int i02 = i / (ne00*ne01);
-    const int i01 = (i - i02*ne01*ne00) / ne00;
-    const int i00 = i - i02*ne01*ne00 - i01*ne00;
-    const int x_offset = i00*nb00 + i01*nb01 + i02*nb02;
-
-    const int i12 = i / (ne10*ne11);
-    const int i11 = (i - i12*ne10*ne11) / ne10;
-    const int i10 = i - i12*ne10*ne11 - i11*ne10;
-    const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12;
-
-    *(dst_t *)(cdst + dst_offset) = *(const src_t *)(cx + x_offset);
-}
-
-template<typename src0_t, typename src1_t, typename dst_t>
-static __global__ void k_add(const src0_t * x, const src1_t * y, dst_t * dst, const int k) {
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
-
-    if (i >= k) {
-        return;
-    }
-    dst[i] = (dst_t)x[i] + (dst_t)y[i];
-}
-
-template<typename src0_t, typename src1_t, typename dst_t>
-static __global__ void k_mul(const src0_t * x, const src1_t * y, dst_t * dst, const int kx, const int ky) {
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
-
-    if (i >= kx) {
-        return;
-    }
-    dst[i] = (dst_t)x[i] * (dst_t)y[i%ky];
-}
-
-template<typename src0_t, typename dst_t>
-static __global__ void k_silu(const src0_t * x, dst_t * dst, const int k) {
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
-
-    if (i >= k) {
-        return;
-    }
-    dst[i] = x[i] / (src0_t(1) + exp(-x[i]));
-}
-
-// TODO: unstable with f16 compute, using f32 compute for now
-template<typename src0_t, typename dst_t>
-static __global__ void k_rms_norm(const src0_t * x, dst_t * dst, const int ncols) {
-    const int row = blockIdx.x*blockDim.y + threadIdx.y;
-    const int tid = threadIdx.x;
-
-    const float eps  = 1e-6;
-
-    float tmp = 0; // partial sum for thread in warp
-
-    for (int col = tid; col < ncols; col += WARP_SIZE) {
-        const float xi = x[row*ncols + col];
-        tmp += xi * xi;
-    }
-
-    // sum up partial sums
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
-    }
-
-    const float mean = tmp / (float)ncols;
-    const float scale = 1.0f / sqrtf(mean + eps);
-
-    for (int col = tid; col < ncols; col += WARP_SIZE) {
-        dst[row*ncols + col] = scale * (float)x[row*ncols + col];
-    }
-}
-
-template<typename src0_t, typename dst_t>
-static __global__ void k_rope(const src0_t * x, dst_t * dst, const int ncols, const float p, const float theta_scale) {
-    const int col = 2*(blockDim.x*blockIdx.x + threadIdx.x);
-
-    if (col >= ncols) {
-        return;
-    }
-
-    const int row = blockDim.y*blockIdx.y + threadIdx.y;
-    const int i = row*ncols + col;
-
-    const dst_t theta = p * powf(theta_scale, col/2);
-    const dst_t sin_theta = sin(theta);
-    const dst_t cos_theta = cos(theta);
-
-    const dst_t x0 = x[i + 0];
-    const dst_t x1 = x[i + 1];
-
-    dst[i + 0] = (dst_t)x0*cos_theta - (dst_t)x1*sin_theta;
-    dst[i + 1] = (dst_t)x0*sin_theta + (dst_t)x1*cos_theta;
-}
-
-template<typename src0_t, typename dst_t>
-static __global__ void k_diag_mask_inf(const src0_t * x, dst_t * dst, const int ncols, const int rows_per_channel, const int n_past) {
-    const int col = blockDim.x*blockIdx.x + threadIdx.x;
-    const int row = blockDim.y*blockIdx.y + threadIdx.y;
-
-    if (col >= ncols) {
-        return;
-    }
-
-    const int i = row*ncols + col;
-    //dst[i] = col > (n_past + row % rows_per_channel) ? (dst_t)-INFINITY : (dst_t)x[i];
-    dst[i] = (dst_t)x[i] - (dst_t)((col > n_past + row % rows_per_channel) * INT_MAX); // equivalent within rounding error but slightly faster on GPU
-}
-
-// TODO: numerically stable version - low prio since the softmax is computed in the fused attention kernel
-// check: https://arxiv.org/pdf/2001.04438.pdf
-template<typename src0_t, typename dst_t>
-static __global__ void k_soft_max_orig(const src0_t * x, dst_t * dst, const int ncols) {
-    const int row = blockDim.y*blockIdx.y + threadIdx.y;
-    const int block_size = blockDim.x;
-    const int tid = threadIdx.x;
-
-    float tmp = 0;
-
-    for (int block_start = 0; block_start < ncols; block_start += block_size) {
-        const int col = block_start + tid;
-
-        if (col >= ncols) {
-            break;
-        }
-
-        const int i = row*ncols + col;
-        const float val = expf(x[i]);
-        tmp += val;
-        dst[i] = val;
-    }
-
-    // sum up partial sums
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
-    }
-
-    for (int block_start = 0; block_start < ncols; block_start += block_size) {
-        const int col = block_start + tid;
-
-        if (col >= ncols) {
-            break;
-        }
-
-        const int i = row*ncols + col;
-        dst[i] /= tmp;
-    }
-}
-
-template<typename src_t, typename dst_t, int pack_size, int block_size>
-static __global__ void k_soft_max(const src_t * x, dst_t * dst, const int64_t nrows, const int64_t ncols) {
-    //assert(ncols % pack_size == 0);
-    const int tid = threadIdx.x;
-    const int num_packs = ncols / pack_size;
-
-    for (int row = blockIdx.x; row < nrows; row += gridDim.x) {
-        src_t th_max = -INFINITY;
-        // row max thread
-        #pragma unroll
-        for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) {
-            // load pack
-            src_t pack[pack_size];
-            #pragma unroll
-            for (int i = 0; i < pack_size; i++) {
-                pack[i] = x[row * ncols + pack_id * pack_size + i];
-            }
-            // reduce max pack
-            #pragma unroll
-            for (int i = 0; i < pack_size; ++i) {
-                th_max = max(th_max, pack[i]);
-            }
-        }
-        // reduce max row warp threads
-        src_t row_max = block_reduce_all<op_max>(th_max, (src_t)-INFINITY);
-
-        // row exp sum thread
-        src_t th_sum = 0;
-        #pragma unroll
-        for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) {
-            // load pack
-            src_t pack[pack_size];
-            #pragma unroll
-            for (int i = 0; i < pack_size; i++) {
-                pack[i] = x[row * ncols + pack_id * pack_size + i];
-            }
-            // reduce pack
-            #pragma unroll
-            for (int i = 0; i < pack_size; ++i) {
-                th_sum += exp(pack[i] - row_max);
-            }
-        }
-
-        // reduce row exp sum all threads
-        src_t row_sum = block_reduce_all<op_sum>(th_sum);
-
-        // store (row - row_max) / row exp sum
-        #pragma unroll
-        for (int pack_id = tid; pack_id < num_packs; pack_id += block_size) {
-            // load pack
-            src_t pack[pack_size];
-            #pragma unroll
-            for (int i = 0; i < pack_size; i++) {
-                pack[i] = x[row * ncols + pack_id * pack_size + i];
-            }
-            // reduce pack
-            #pragma unroll
-            for (int i = 0; i < pack_size; ++i) {
-                pack[i] = exp(pack[i] - row_max) / row_sum;
-            }
-
-            // store pack
-            #pragma unroll
-            for (int i = 0; i < pack_size; i++) {
-                dst[row * ncols + pack_id * pack_size + i] = pack[i];
-            }
-        }
-    }
-}
-
-template<typename src0_t, typename src1_t, typename dst_t>
-static __global__ void k_scale(const src0_t * x, dst_t * dst, const src1_t * scale, const int k) {
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
-
-    if (i >= k) {
-        return;
-    }
-
-    dst[i] = (dst_t)(*scale) * (dst_t)x[i];
-}
-
-template<typename dst_t, int qk, int qr, dequantize_kernel_t<dst_t> dequantize_kernel>
-static __global__ void k_get_rows(const void * x, const int * y, dst_t * dst, const int ncols) {
-    const int col = (blockIdx.x*blockDim.x + threadIdx.x)*2;
-    const int row = blockDim.y*blockIdx.y + threadIdx.y;
-
-    if (col >= ncols) {
-        return;
-    }
-
-    const int r = y[row];
-
-    // copy x[r*ncols + col] to dst[row*ncols + col]
-    const int xi = r*ncols + col;
-    const int di = row*ncols + col;
-
-    const int ib = xi/qk; // block index
-    const int iqs = (xi%qk)/qr; // quant index
-    const int iybs = di - di%qk; // y block start index
-    const int y_offset = qr == 1 ? 1 : qk/2;
-
-    // dequantize
-    vec2_t<dst_t> v;
-    dequantize_kernel(x, ib, iqs, v);
-    dst[iybs + iqs + 0]        = v.x;
-    dst[iybs + iqs + y_offset] = v.y;
-}
--- a/ggml-cuda-quant.h
+++ b/ggml-cuda-quant.h
@@ -1,920 +0,0 @@
-// quants kernels for ggml-cuda
-
-// QK = number of values after dequantization
-// QR = QK / number of values before dequantization
-// QI = number of 32 bit integers before dequantization
-
-#define QK4_0 32
-#define QR4_0 2
-#define QI4_0 4
-typedef struct {
-    half    d;              // delta
-    uint8_t qs[QK4_0 / 2];  // nibbles / quants
-} block_q4_0;
-static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding");
-
-#define QK4_1 32
-#define QR4_1 2
-#define QI4_1 4
-typedef struct {
-    half    d;              // delta
-    half    m;              // min
-    uint8_t qs[QK4_1 / 2];  // nibbles / quants
-} block_q4_1;
-static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
-
-#define QK5_0 32
-#define QR5_0 2
-#define QI5_0 4
-typedef struct {
-    half d;                 // delta
-    uint8_t qh[4];          // 5-th bit of quants
-    uint8_t qs[QK5_0 / 2];  // nibbles / quants
-} block_q5_0;
-static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding");
-
-#define QK5_1 32
-#define QR5_1 2
-#define QI5_1 4
-typedef struct {
-    half d;                 // delta
-    half m;                 // min
-    uint8_t qh[4];          // 5-th bit of quants
-    uint8_t qs[QK5_1 / 2];  // nibbles / quants
-} block_q5_1;
-static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
-
-#define QK8_0 32
-#define QR8_0 1
-#define QI8_0 8
-typedef struct {
-    half    d;              // delta
-    int8_t  qs[QK8_0];      // quants
-} block_q8_0;
-static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");
-
-#define QK8_1 32
-#define QR8_1 1
-#define QI8_1 8
-typedef struct {
-    half    d;              // delta
-    half    s;              // unquantized sum
-    int8_t  qs[QK8_0];      // quants
-} block_q8_1;
-static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_fp16_t) + QK8_0, "wrong q8_1 block size/padding");
-
-//================================= k-quants
-
-#define QK_K 256
-
-typedef struct {
-    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
-    uint8_t qs[QK_K/4];      // quants
-    half d;                  // super-block scale for quantized scales
-    half dmin;               // super-block scale for quantized mins
-} block_q2_K;
-static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
-
-typedef struct {
-    uint8_t hmask[QK_K/8];
-    uint8_t qs[QK_K/4]; // nibbles / quants
-    uint8_t scales[3*QK_K/64];
-    half d;
-} block_q3_K;
-static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + 11 * QK_K / 64, "wrong q3_K block size/padding");
-
-typedef struct {
-    half d;                    // super-block scale for quantized scales
-    half dmin;                 // super-block scale for quantized mins
-    uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
-    uint8_t qs[QK_K/2];        // 4--bit quants
-} block_q4_K;
-static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding");
-
-typedef struct {
-    half    d;                   // super-block scale for quantized scales
-    half    dmin;                // super-block scale for quantized mins
-    uint8_t scales[3*QK_K/64];   // scales, quantized with 6 bits
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-} block_q5_K;
-static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
-
-typedef struct {
-    uint8_t ql[QK_K/2];   // quants, lower 4 bits
-    uint8_t qh[QK_K/4];   // quants, upper 2 bits
-    int8_t  scales[QK_K/16]; // scales
-    half    d;         // delta
-} block_q6_K;
-static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_K block size/padding");
-
-
-template<typename src1_t, typename dst_t>
-using dot_kernel_k_t = void (*)(const void * vx, const int ib, const int iqs, const src1_t * y, dst_t & v);
-
-template<typename dst_t>
-using vec_dot_q_cuda_t = dst_t (*)(const void * vbq, const block_q8_1 * bq8_1, const int iqs);
-
-
-// TODO: f16
-template<typename src_t>
-static __global__ void quantize_q8_1(const src_t * x, void * vy, const int k) {
-    const int i = blockDim.x*blockIdx.x + threadIdx.x;
-
-    if (i >= k) {
-        return;
-    }
-
-    block_q8_1 * y = (block_q8_1 *) vy;
-
-    const int ib = i / QK8_0; // block index
-    const int iqs = i % QK8_0; // quant index
-
-    const float xi = x[i];
-    float amax = fabsf(xi);
-    float sum = xi;
-
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        amax = fmaxf(amax, __shfl_xor_sync(0xffffffff, amax, mask, 32));
-        sum += __shfl_xor_sync(0xffffffff, sum, mask, 32);
-    }
-
-    const float d = amax / 127;
-    const int8_t q = amax == 0.0f ? 0 : roundf(xi / d);
-
-    y[ib].qs[iqs] = q;
-
-    if (iqs > 0) {
-        return;
-    }
-
-    y[ib].d = d;
-    y[ib].s = sum;
-}
-
-template<typename dst_t>
-static __device__ void dequantize_q4_0(const void * vx, const int ib, const int iqs, vec2_t<dst_t> & v){
-    const block_q4_0 * x = (const block_q4_0 *) vx;
-
-    const dst_t d = x[ib].d;
-
-    const uint8_t vui = x[ib].qs[iqs];
-
-    v.x = vui & 0xF;
-    v.y = vui >> 4;
-
-    const vec2_t<dst_t> off2 = make_vec2_t<dst_t>(8, 8);
-    const vec2_t<dst_t> d2   = make_vec2_t<dst_t>(d, d);
-
-    v = (v - off2) * d2;
-}
-
-template<typename dst_t>
-static __device__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, vec2_t<dst_t> & v){
-    const block_q4_1 * x = (const block_q4_1 *) vx;
-
-    const dst_t d = x[ib].d;
-    const dst_t m = x[ib].m;
-
-    const uint8_t vui = x[ib].qs[iqs];
-
-    v.x = vui & 0xF;
-    v.y = vui >> 4;
-
-    const vec2_t<dst_t> d2 = make_vec2_t<dst_t>(d, d);
-    const vec2_t<dst_t> m2 = make_vec2_t<dst_t>(m, m);
-
-    v = v * d2 + m2;
-}
-
-template<typename dst_t>
-static __device__ void dequantize_q5_0(const void * vx, const int ib, const int iqs, vec2_t<dst_t> & v){
-    const block_q5_0 * x = (const block_q5_0 *) vx;
-
-    const dst_t d = x[ib].d;
-
-    uint32_t qh;
-    memcpy(&qh, x[ib].qh, sizeof(qh));
-
-    const uint8_t xh_0 = ((qh >> (iqs +  0)) << 4) & 0x10;
-    const uint8_t xh_1 = ((qh >> (iqs + 12))     ) & 0x10;
-
-    v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
-    v.y = ((x[ib].qs[iqs] >>  4) | xh_1);
-
-    const vec2_t<dst_t> off2 = make_vec2_t<dst_t>(16, 16);
-    const vec2_t<dst_t> d2   = make_vec2_t<dst_t>(d, d);
-
-    v = (v - off2) * d2;
-}
-
-template<typename dst_t>
-static __device__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, vec2_t<dst_t> & v){
-    const block_q5_1 * x = (const block_q5_1 *) vx;
-
-    const dst_t d = x[ib].d;
-    const dst_t m = x[ib].m;
-
-    uint32_t qh;
-    memcpy(&qh, x[ib].qh, sizeof(qh));
-
-    const uint8_t xh_0 = ((qh >> (iqs +  0)) << 4) & 0x10;
-    const uint8_t xh_1 = ((qh >> (iqs + 12))     ) & 0x10;
-
-    v.x = ((x[ib].qs[iqs] & 0xf) | xh_0);
-    v.y = ((x[ib].qs[iqs] >>  4) | xh_1);
-
-    const vec2_t<dst_t> d2 = make_vec2_t<dst_t>(d, d);
-    const vec2_t<dst_t> m2 = make_vec2_t<dst_t>(m, m);
-
-    v = v * d2 + m2;
-}
-
-template<typename dst_t>
-static __device__ void dequantize_q8_0(const void * vx, const int ib, const int iqs, vec2_t<dst_t> & v){
-    const block_q8_0 * x = (const block_q8_0 *) vx;
-
-    const dst_t d = x[ib].d;
-
-    v.x = x[ib].qs[iqs + 0];
-    v.y = x[ib].qs[iqs + 1];
-
-    const vec2_t<dst_t> d2 = make_vec2_t<dst_t>(d, d);
-
-    v = v * d2;
-}
-
-//================================== k-quants
-
-static __global__ void dequantize_block_q2_K(const void * vx, float * yy) {
-
-    const int i   = blockIdx.x;
-    const int tid = threadIdx.x;
-    const int n   = tid/32;
-    const int l   = tid - 32*n;
-    const int is  = 8*n + l/16;
-
-    const block_q2_K * x = (const block_q2_K *) vx;
-
-    const uint8_t q = x[i].qs[32*n + l];
-    float * y = yy + i*QK_K + 128*n;
-
-    float dall = x[i].d;
-    float dmin = x[i].dmin;
-    y[l+ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
-    y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
-    y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
-    y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4);
-
-}
-
-static __device__ void vec_dot_q2_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
-
-    const block_q2_K * x = (const block_q2_K *) vx;
-
-    // if n is 0, we want to do the lower 128, else the upper 128,
-    // covering y[l+0],  y[l+32], y[l+64], y[l+96] and
-    //          y[l+16], y[l+48], y[l+80], y[l+112]
-    int n = iqs/128;                // 0 or 1
-    int r = iqs - 128*n;            // 0...120 in steps of 8
-    int l = r/8;                    // 0...15 in steps of 1
-
-    const float   * y = yy + 128*n + l;
-    const uint8_t * q = x[ib].qs + 32*n + l;
-    const uint8_t * s = x[ib].scales + 8*n;
-
-    const float dall = x[ib].d;
-    const float dmin = x[ib].dmin;
-
-    float sum = y[  0] * (dall * ((s[0] & 0xF) * ((q[ 0] >> 0) & 3)) - dmin * (s[0] >> 4))
-              + y[ 32] * (dall * ((s[2] & 0xF) * ((q[ 0] >> 2) & 3)) - dmin * (s[2] >> 4))
-              + y[ 64] * (dall * ((s[4] & 0xF) * ((q[ 0] >> 4) & 3)) - dmin * (s[4] >> 4))
-              + y[ 96] * (dall * ((s[6] & 0xF) * ((q[ 0] >> 6) & 3)) - dmin * (s[6] >> 4))
-              + y[ 16] * (dall * ((s[1] & 0xF) * ((q[16] >> 0) & 3)) - dmin * (s[1] >> 4))
-              + y[ 48] * (dall * ((s[3] & 0xF) * ((q[16] >> 2) & 3)) - dmin * (s[3] >> 4))
-              + y[ 80] * (dall * ((s[5] & 0xF) * ((q[16] >> 4) & 3)) - dmin * (s[5] >> 4))
-              + y[112] * (dall * ((s[7] & 0xF) * ((q[16] >> 6) & 3)) - dmin * (s[7] >> 4));
-
-    result = sum;
-
-}
-
-static __global__ void dequantize_block_q3_K(const void * vx, float * yy) {
-
-    int r = threadIdx.x/4;
-    int i = blockIdx.x;
-    int tid = r/2;
-    int is0 = r%2;
-    int l0 = 16*is0 + 4*(threadIdx.x%4);
-    int n = tid / 4;
-    int j = tid - 4*n;
-
-    const block_q3_K * x = (const block_q3_K *) vx;
-
-    uint8_t m = 1 << (4*n + j);
-    int is = 8*n + 2*j + is0;
-    int shift = 2*j;
-
-    int8_t us = is <  4 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+8] >> 0) & 3) << 4) :
-                is <  8 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+4] >> 2) & 3) << 4) :
-                is < 12 ? (x[i].scales[is-8] >>  4) | (((x[i].scales[is+0] >> 4) & 3) << 4) :
-                          (x[i].scales[is-8] >>  4) | (((x[i].scales[is-4] >> 6) & 3) << 4);
-    float d_all = x[i].d;
-    float dl = d_all * (us - 32);
-
-    float * y = yy + i*QK_K + 128*n + 32*j;
-    const uint8_t * q = x[i].qs + 32*n;
-    const uint8_t * hm = x[i].hmask;
-
-    for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
-
-}
-
-static __device__ void vec_dot_q3_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
-
-    const block_q3_K * x = (const block_q3_K *) vx;
-
-    const uint32_t kmask1 = 0x03030303;
-    const uint32_t kmask2 = 0x0f0f0f0f;
-
-    uint32_t aux[3];
-    uint32_t utmp[4];
-
-    // if n is 0, we want to do the lower 128, else the upper 128,
-    // covering y[l+0],  y[l+32], y[l+64], y[l+96] and
-    //          y[l+16], y[l+48], y[l+80], y[l+112]
-    int n = iqs/128;                // 0 or 1
-    int r = iqs - 128*n;            // 0...120 in steps of 8
-    int l = r/8;                    // 0...15 in steps of 1
-
-    const float   * y = yy + 128*n + l;
-    const uint8_t * q = x[ib].qs + 32*n + l;
-    const uint8_t * hm = x[ib].hmask + l;
-    const int8_t  * s = (const int8_t *)utmp + 8*n;
-
-    memcpy(aux, x[ib].scales, 12);
-    utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
-    utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4);
-    utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4);
-    utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4);
-
-    const float dall = x[ib].d;
-
-    const uint8_t m = 1 << (4*n);
-
-    float sum = y[  0] * (s[0] - 32) * (((q[ 0] >> 0) & 3) - (hm[ 0] & (m << 0) ? 0 : 4))
-              + y[ 32] * (s[2] - 32) * (((q[ 0] >> 2) & 3) - (hm[ 0] & (m << 1) ? 0 : 4))
-              + y[ 64] * (s[4] - 32) * (((q[ 0] >> 4) & 3) - (hm[ 0] & (m << 2) ? 0 : 4))
-              + y[ 96] * (s[6] - 32) * (((q[ 0] >> 6) & 3) - (hm[ 0] & (m << 3) ? 0 : 4))
-              + y[ 16] * (s[1] - 32) * (((q[16] >> 0) & 3) - (hm[16] & (m << 0) ? 0 : 4))
-              + y[ 48] * (s[3] - 32) * (((q[16] >> 2) & 3) - (hm[16] & (m << 1) ? 0 : 4))
-              + y[ 80] * (s[5] - 32) * (((q[16] >> 4) & 3) - (hm[16] & (m << 2) ? 0 : 4))
-              + y[112] * (s[7] - 32) * (((q[16] >> 6) & 3) - (hm[16] & (m << 3) ? 0 : 4));
-
-    result = sum * dall;
-
-}
-
-static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
-    if (j < 4) {
-        d = q[j] & 63; m = q[j + 4] & 63;
-    } else {
-        d = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4);
-        m = (q[j+4] >>  4) | ((q[j-0] >> 6) << 4);
-    }
-}
-
-static __global__ void dequantize_block_q4_K(const void * vx, float * yy) {
-    const block_q4_K * x = (const block_q4_K *) vx;
-
-    const int i = blockIdx.x;
-
-    //// assume 64 threads - this is very slightly better than the one below
-    //const int tid = threadIdx.x;
-    //const int il  = tid/16;
-    //const int ir  = tid%16;
-    //const int is  = 2*il;
-    //const int n   = 2;
-
-    // assume 32 threads
-    const int tid = threadIdx.x;
-    const int il  = tid/8;
-    const int ir  = tid%8;
-    const int is  = 2*il;
-    const int n   = 4;
-
-    float * y = yy + i*QK_K + 64*il + n*ir;
-
-    const float dall = x[i].d;
-    const float dmin = x[i].dmin;
-
-    const uint8_t * q = x[i].qs + 32*il + n*ir;
-
-    uint8_t sc, m;
-    get_scale_min_k4(is + 0, x[i].scales, sc, m);
-    const float d1 = dall * sc; const float m1 = dmin * m;
-    get_scale_min_k4(is + 1, x[i].scales, sc, m);
-    const float d2 = dall * sc; const float m2 = dmin * m;
-    for (int l = 0; l < n; ++l) {
-        y[l + 0] = d1 * (q[l] & 0xF) - m1;
-        y[l +32] = d2 * (q[l] >>  4) - m2;
-    }
-}
-
-static __device__ void vec_dot_q4_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
-
-    const block_q4_K * x = (const block_q4_K *) vx;
-
-                                    // iqs is in 0...248 in steps of 8 =>
-    const int j  = iqs / 64;        // j  is in 0...3
-    const int ir = (iqs - 64*j)/2;  // ir is in 0...28 in steps of 4
-    const int is = 2*j;             // is is in 0...6 in steps of 2
-
-    const float   * y = yy + 64*j + ir;
-    const uint8_t * q = x[ib].qs + 32*j + ir;
-
-    const float dall = x[ib].d;
-    const float dmin = x[ib].dmin;
-
-    uint8_t sc, m;
-    get_scale_min_k4(is + 0, x[ib].scales, sc, m);
-    const float d1 = dall * sc;
-    const float m1 = dmin * m;
-    get_scale_min_k4(is + 1, x[ib].scales, sc, m);
-    const float d2 = dall * sc;
-    const float m2 = dmin * m;
-
-    float sum = 0;
-    for (int k = 0; k < 4; ++k) {
-        sum += y[k +  0] * (d1 * (q[k] & 0xF) - m1);
-        sum += y[k + 32] * (d2 * (q[k] >>  4) - m2);
-    }
-    result = sum;
-
-}
-
-static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
-    const block_q5_K * x = (const block_q5_K *) vx;
-
-    const int i = blockIdx.x;
-
-    // assume 64 threads - this is very slightly better than the one below
-    const int tid = threadIdx.x;
-    const int il  = tid/16;   // il is in 0...3
-    const int ir  = tid%16;   // ir is in 0...15
-    const int is  = 2*il;     // is is in 0...6
-
-    float * y = yy + i*QK_K + 64*il + 2*ir;
-
-    const float dall = x[i].d;
-    const float dmin = x[i].dmin;
-
-    const uint8_t * ql = x[i].qs + 32*il + 2*ir;
-    const uint8_t * qh = x[i].qh + 2*ir;
-
-    uint8_t sc, m;
-    get_scale_min_k4(is + 0, x[i].scales, sc, m);
-    const float d1 = dall * sc; const float m1 = dmin * m;
-    get_scale_min_k4(is + 1, x[i].scales, sc, m);
-    const float d2 = dall * sc; const float m2 = dmin * m;
-
-    uint8_t   hm  = 1 << (2*il);
-    y[ 0] = d1 * ((ql[ 0] & 0xF) + (qh[ 0] & hm ? 16 : 0)) - m1;
-    y[ 1] = d1 * ((ql[ 1] & 0xF) + (qh[ 1] & hm ? 16 : 0)) - m1;
-    hm <<= 1;
-    y[32] = d2 * ((ql[ 0] >>  4) + (qh[ 0] & hm ? 16 : 0)) - m2;
-    y[33] = d2 * ((ql[ 1] >>  4) + (qh[ 1] & hm ? 16 : 0)) - m2;
-}
-
-static __device__ void vec_dot_q5_K(const void * vx, const int ib, const int iqs, const float * yy, float & result) {
-
-    const block_q5_K * x = (const block_q5_K *) vx;
-
-                                    // iqs is in 0...248 in steps of 8 =>
-    const int j  = iqs / 64;        // j  is in 0...3
-    const int ir = (iqs - 64*j)/2;  // ir is in 0...28 in steps of 4
-    const int is = 2*j;             // is is in 0...6 in steps of 2
-
-    const float   * y  = yy + 64*j + ir;
-    const uint8_t * ql = x[ib].qs + 32*j + ir;
-    const uint8_t * qh = x[ib].qh + ir;
-
-    const float dall = x[ib].d;
-    const float dmin = x[ib].dmin;
-
-    uint8_t sc, m;
-    get_scale_min_k4(is + 0, x[ib].scales, sc, m);
-    const float d1 = dall * sc;
-    const float m1 = dmin * m;
-    get_scale_min_k4(is + 1, x[ib].scales, sc, m);
-    const float d2 = dall * sc;
-    const float m2 = dmin * m;
-
-    uint8_t   hm  = 1 << is;
-    float sum = 0;
-    for (int k = 0; k < 4; ++k) {
-        sum += y[k +  0] * (d1 * ((ql[k] & 0xF) + (qh[k] & hm ? 16 : 0)) - m1);
-    }
-    hm <<= 1;
-    for (int k = 0; k < 4; ++k) {
-        sum += y[k + 32] * (d2 * ((ql[k] >>  4) + (qh[k] & hm ? 16 : 0)) - m2);
-    }
-    result = sum;
-
-}
-
-template<typename dst_t>
-static __global__ void dequantize_block_q6_K(const void * vx, dst_t * yy) {
-    const block_q6_K * x = (const block_q6_K *) vx;
-
-    const int i = blockIdx.x;
-
-    // assume 64 threads - this is very slightly better than the one below
-    const int tid = threadIdx.x;
-    const int ip  = tid/32;   // ip is 0 or 1
-    const int il  = tid - 32*ip; // 0...32
-    const int is  = 8*ip + il/16;
-
-    // TODO: fp16 compute
-    dst_t * y = yy + i*QK_K + 128*ip + il;
-
-    const float d = x[i].d;
-
-    const uint8_t * ql = x[i].ql + 64*ip + il;
-    const uint8_t   qh = x[i].qh[32*ip + il];
-    const int8_t  * sc = x[i].scales + is;
-
-    y[ 0] = d * sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
-    y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32);
-    y[64] = d * sc[4] * ((int8_t)((ql[ 0]  >> 4) | (((qh >> 4) & 3) << 4)) - 32);
-    y[96] = d * sc[6] * ((int8_t)((ql[32]  >> 4) | (((qh >> 6) & 3) << 4)) - 32);
-}
-
-template<typename src1_t, typename dst_t>
-static __global__ void dequantize_mul_mat_vec_q6_k(const void * vx, const src1_t * yy, dst_t * dst, const int ncols, int nrows) {
-    static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
-
-    const int row = blockIdx.y*blockDim.y + threadIdx.y;
-    if (row > nrows) return;
-
-    const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
-
-    const block_q6_K * x = (const block_q6_K *)vx + ib0;
-
-    const int tid = threadIdx.x/K_QUANTS_PER_ITERATION;  // 0...31 or 0...16
-    const int ix  = threadIdx.x%K_QUANTS_PER_ITERATION;  // 0 or 0, 1
-
-    const int step = 16/K_QUANTS_PER_ITERATION;          // 16 or 8
-
-    const int im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
-    const int in = tid - step*im;                        // 0...15 or 0...7
-
-#if K_QUANTS_PER_ITERATION == 1
-    const int l0 = K_QUANTS_PER_ITERATION*in;            // 0...15
-    const int is = 0;
-#else
-    const int l0 = 4 * in;                               // 0, 4, 8, ..., 28
-    const int is = in / 4;
-#endif
-    const int ql_offset = 64*im + l0;
-    const int qh_offset = 32*im + l0;
-    const int s_offset  =  8*im + is;
-    const int y_offset = 128*im + l0;
-
-    dst_t tmp = 0; // partial sum for thread in warp
-
-    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
-
-        const src1_t  * y  = yy + i * QK_K + y_offset;
-        const uint8_t * ql = x[i].ql + ql_offset;
-        const uint8_t * qh = x[i].qh + qh_offset;
-        const int8_t  * s  = x[i].scales + s_offset;
-
-        const dst_t d = x[i].d;
-
-#if K_QUANTS_PER_ITERATION == 1
-        float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32)
-                  + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32)
-                  + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32)
-                  + y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32)
-                  + y[64] * s[4] * d * ((int8_t)((ql[ 0]  >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32)
-                  + y[80] * s[5] * d * ((int8_t)((ql[16]  >> 4) | ((qh[16] & 0x30) >> 0)) - 32)
-                  + y[96] * s[6] * d * ((int8_t)((ql[32]  >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32)
-                  +y[112] * s[7] * d * ((int8_t)((ql[48]  >> 4) | ((qh[16] & 0xc0) >> 2)) - 32);
-        tmp += sum;
-#else
-        dst_t sum = 0;
-        for (int l = 0; l < 4; ++l) {
-            sum += (dst_t)y[l+ 0] * (dst_t)s[0] * d * (dst_t)((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32)
-                 + (dst_t)y[l+32] * (dst_t)s[2] * d * (dst_t)((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32)
-                 + (dst_t)y[l+64] * (dst_t)s[4] * d * (dst_t)((int8_t)((ql[l+ 0]  >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32)
-                 + (dst_t)y[l+96] * (dst_t)s[6] * d * (dst_t)((int8_t)((ql[l+32]  >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32);
-        }
-        tmp += sum;
-#endif
-
-    }
-
-    // sum up partial sums and write back result
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
-    }
-
-    if (tid == 0) {
-        dst[row] = tmp;
-    }
-}
-
-template <typename dst_t, int qk, int qr, dequantize_kernel_t<dst_t> dequantize_kernel>
-static __global__ void dequantize_block(const void * vx, dst_t * y, const int k) {
-    const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;
-
-    if (i >= k) {
-        return;
-    }
-
-    const int ib = i/qk; // block index
-    const int iqs = (i%qk)/qr; // quant index
-    const int iybs = i - i%qk; // y block start index
-    const int y_offset = qr == 1 ? 1 : qk/2;
-
-    // dequantize
-    vec2_t<dst_t> v;
-    dequantize_kernel(vx, ib, iqs, v);
-
-    y[iybs + iqs + 0]        = v.x;
-    y[iybs + iqs + y_offset] = v.y;
-}
-
-template<typename dst_t>
-static __device__ __forceinline__ dst_t vec_dot_q4_0_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
-#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
-    const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;
-
-    int vi;
-    memcpy(&vi,  &bq4_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
-    const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
-    const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI4_0)]);
-
-    const float d = __half2float(bq4_0->d) * __half2float(bq8_1->d);
-
-    // subtract 8 from each quantized value
-    const int vi0 = __vsub4((vi >> 0) & 0x0F0F0F0F, 0x08080808);
-    const int vi1 = __vsub4((vi >> 4) & 0x0F0F0F0F, 0x08080808);
-
-    // SIMD dot product of quantized values
-    int sumi = __dp4a(vi0, ui0, 0);
-    sumi     = __dp4a(vi1, ui1, sumi);
-
-    return sumi*d;
-#else
-    return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= 600
-}
-
-template<typename dst_t>
-static __device__ __forceinline__ dst_t vec_dot_q4_1_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
-#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
-    const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
-
-    const int vi  = *((int *) &bq4_1->qs[sizeof(int) * (iqs + 0)]);
-    const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
-    const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI4_1)]);
-
-    const float d = __half2float(bq4_1->d) * __half2float(bq8_1->d);
-    const float m = bq4_1->m;
-    const float s = bq8_1->s;
-
-    const int vi0 = (vi >> 0) & 0x0F0F0F0F;
-    const int vi1 = (vi >> 4) & 0x0F0F0F0F;
-
-    // SIMD dot product of quantized values
-    int sumi = __dp4a(vi0, ui0, 0);
-    sumi     = __dp4a(vi1, ui1, sumi);
-
-    return sumi*d + m*s / QI4_1; // scale sum by QI4_1 because there are QI4_1 threads working on this block
-#else
-    return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= 600
-}
-
-template<typename dst_t>
-static __device__ __forceinline__ dst_t vec_dot_q5_0_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
-#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
-    const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;
-
-    int qs;
-    memcpy(&qs, &bq5_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
-    const int qh0 = bq5_0->qh[iqs/2 + 0] >> 4*(iqs%2);
-    const int qh1 = bq5_0->qh[iqs/2 + 2] >> 4*(iqs%2);
-    const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
-    const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI5_0)]);
-
-    const float d = __half2float(bq5_0->d) * __half2float(bq8_1->d);
-
-    int vi0 = (qs  >>  0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh0 as 5th bits
-    vi0    |= (qh0 <<  4) & 0x00000010; // 1 ->  5
-    vi0    |= (qh0 << 11) & 0x00001000; // 2 -> 13
-    vi0    |= (qh0 << 18) & 0x00100000; // 3 -> 21
-    vi0    |= (qh0 << 25) & 0x10000000; // 4 -> 29
-    vi0     = __vsub4(vi0,  0x10101010); // subtract 16 from quantized values
-    int sumi = __dp4a(vi0, ui0, 0); // SIMD dot product of quantized values
-
-    int vi1 = (qs  >>  4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh1 as 5th bits
-    vi1    |= (qh1 <<  4) & 0x00000010; // 1 ->  5
-    vi1    |= (qh1 << 11) & 0x00001000; // 2 -> 13
-    vi1    |= (qh1 << 18) & 0x00100000; // 3 -> 21
-    vi1    |= (qh1 << 25) & 0x10000000; // 4 -> 29
-    vi1     = __vsub4(vi1,  0x10101010); // subtract 16 from quantized values
-    sumi = __dp4a(vi1, ui1, sumi); // SIMD dot product of quantized values
-
-    return sumi*d;
-#else
-    return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= 600
-}
-
-template<typename dst_t>
-static __device__ __forceinline__ dst_t vec_dot_q5_1_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
-#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
-    const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;
-
-    const int qs  = *((int *) &bq5_1->qs[sizeof(int) * (iqs + 0)]);
-    const int qh0 = bq5_1->qh[iqs/2 + 0] >> 4*(iqs%2);
-    const int qh1 = bq5_1->qh[iqs/2 + 2] >> 4*(iqs%2);
-    const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
-    const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI5_1)]);
-
-    const float d = __half2float(bq5_1->d) * __half2float(bq8_1->d);
-    const float m = bq5_1->m;
-    const float s = bq8_1->s;
-
-    int vi0 = (qs  >>  0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh0 as 5th bits
-    vi0    |= (qh0 <<  4) & 0x00000010; // 1 ->  5
-    vi0    |= (qh0 << 11) & 0x00001000; // 2 -> 13
-    vi0    |= (qh0 << 18) & 0x00100000; // 3 -> 21
-    vi0    |= (qh0 << 25) & 0x10000000; // 4 -> 29
-    int sumi = __dp4a(vi0, ui0, 0); // SIMD dot product of quantized values
-
-    int vi1 = (qs  >>  4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh1 as 5th bits
-    vi1    |= (qh1 <<  4) & 0x00000010; // 1 ->  5
-    vi1    |= (qh1 << 11) & 0x00001000; // 2 -> 13
-    vi1    |= (qh1 << 18) & 0x00100000; // 3 -> 21
-    vi1    |= (qh1 << 25) & 0x10000000; // 4 -> 29
-    sumi = __dp4a(vi1, ui1, sumi); // SIMD dot product of quantized values
-
-    return sumi*d + m*s / QI5_1; // scale sum by QI5_1 because there are QI5_1 threads working on this block
-#else
-    return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= 600
-}
-
-template<typename dst_t>
-static __device__ __forceinline__ dst_t vec_dot_q8_0_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
-#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
-    const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
-
-    int vi;
-    memcpy(&vi,  &bq8_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
-    const int ui = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
-
-    const float d = __half2float(bq8_0->d) * __half2float(bq8_1->d);
-
-    // SIMD dot product of quantized values
-    int sumi = __dp4a(vi, ui, 0);
-
-    return sumi*d;
-#else
-    return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= 600
-}
-
-template <typename dst_t, int qk, int qi, typename block_q_t, vec_dot_q_cuda_t<dst_t> vec_dot_q_cuda>
-static __global__ void mul_mat_vec_q(const void * vx, const void * vy, dst_t * dst, const int ncols, const int nrows) {
-    const int row = blockIdx.y*blockDim.y + threadIdx.y;
-
-    if (row >= nrows) {
-        return;
-    }
-
-    const int blocks_per_row = ncols / qk;
-    const int blocks_per_warp = WARP_SIZE / qi;
-
-// partial sum for each thread
-    float tmp = 0.0f;
-
-    const block_q_t  * x = (const block_q_t  *) vx;
-    const block_q8_1 * y = (const block_q8_1 *) vy;
-
-    for (int i = 0; i < blocks_per_row; i += blocks_per_warp) {
-        const int ibx = row*blocks_per_row + i + threadIdx.x / qi; // x block index
-
-        const int iby = i + threadIdx.x / qi; // y block index
-
-        const int iqs  = threadIdx.x % qi; // x block quant index when casting the quants to int
-
-        tmp += (float)vec_dot_q_cuda(&x[ibx], &y[iby], iqs);
-    }
-
-    // sum up partial sums and write back result
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
-    }
-
-    if (threadIdx.x == 0) {
-        dst[row] = (dst_t)tmp;
-    }
-}
-
-template <typename src1_t, typename dst_t, int qk, int qr, dequantize_kernel_t<dst_t> dequantize_kernel>
-static __global__ void dequantize_mul_mat_vec(const void * vx, const src1_t * y, dst_t * dst, const int ncols, const int nrows) {
-    // qk = quantized weights per x block
-    // qr = number of quantized weights per data value in x block
-    const int row = blockIdx.y*blockDim.y + threadIdx.y;
-
-    if (row >= nrows) {
-        return;
-    }
-
-    const int tid = threadIdx.x;
-
-    const int iter_stride = 2*GGML_CUDA_DMMV_X;
-    const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter
-    const int y_offset = qr == 1 ? 1 : qk/2;
-
-    vec2_t<dst_t> tmp2 = make_vec2_t<dst_t>(0, 0); // partial sum for thread in warp
-
-    for (int i = 0; i < ncols; i += iter_stride) {
-        const int col = i + vals_per_iter*tid;
-        const int ib = (row*ncols + col)/qk; // x block index
-        const int iqs = (col%qk)/qr; // x quant index
-        const int iybs = col - col%qk; // y block start index
-
-// processing >2 values per i iter is faster for fast GPUs
-#pragma unroll
-        for (int j = 0; j < vals_per_iter; j += 2) {
-            // process 2 vals per j iter
-            // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val
-
-            // dequantize
-            vec2_t<dst_t> xc;
-            dequantize_kernel(vx, ib, iqs + j/qr, xc);
-
-            // matrix multiplication
-            vec2_t<dst_t> yc = make_vec2_t<dst_t>(
-                y[iybs + iqs + j/qr + 0],
-                y[iybs + iqs + j/qr + y_offset]);
-            tmp2 += xc * yc;
-        }
-    }
-
-    // sum up partial sums and write back result
-    // TODO: reducing as half2 may be faster, but requires special handling for float2
-    dst_t tmp = tmp2.x + tmp2.y;
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
-    }
-
-    if (tid == 0) {
-        dst[row] = tmp;
-    }
-}
-
-template <typename src1_t, typename dst_t, int n_thread, dot_kernel_k_t<src1_t, dst_t> dot_kernel>
-static __global__ void dequantize_mul_mat_vec_k(const void * vx, const src1_t * y, dst_t * dst, const int ncols) {
-    const int row = blockIdx.x*blockDim.y + threadIdx.y;
-    const int tid = threadIdx.x;
-
-    const int iter_stride = QK_K;
-    const int vals_per_iter = iter_stride / n_thread;
-    const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
-
-    dst_t tmp = 0; // partial sum for thread in warp
-
-    for (int i = 0; i < ncols; i += iter_stride) {
-        const int col = i + vals_per_iter*tid;
-        const int ib = ib0 + col/QK_K; // x block index
-        const int iqs = col%QK_K; // x quant index
-        const int iybs = col - col%QK_K; // y block start index
-
-        dst_t v;
-        dot_kernel(vx, ib, iqs, y + iybs, v);
-        tmp += v;
-    }
-
-    // sum up partial sums and write back result
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
-    }
-
-    if (tid == 0) {
-        dst[row] = tmp;
-    }
-}
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
--- a/ggml-cuda.h
+++ b/ggml-cuda.h
@@ -6,15 +6,32 @@
 extern "C" {
 #endif

+#define GGML_CUDA_MAX_DEVICES       16
+
+GGML_API void   ggml_init_cublas(void);
 GGML_API void * ggml_cuda_host_malloc(size_t size);
 GGML_API void   ggml_cuda_host_free(void * ptr);
-GGML_API void   ggml_cuda_host_register(void * ptr, size_t size);
-GGML_API void   ggml_cuda_host_unregister(void * ptr);

-// backend API
+GGML_API bool   ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
+GGML_API void   ggml_cuda_set_tensor_split(const float * tensor_split);
+GGML_API void   ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor);
+GGML_API void   ggml_cuda_free_data(struct ggml_tensor * tensor);

-GGML_API struct ggml_backend * ggml_backend_cuda_init();
+GGML_API void   ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
+GGML_API void   ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
+GGML_API void   ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor);

+GGML_API void   ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor);
+GGML_API void   ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset);
+
+GGML_API void   ggml_cuda_set_main_device(int main_device);
+GGML_API void   ggml_cuda_set_mul_mat_q(bool mul_mat_q);
+GGML_API void   ggml_cuda_set_scratch_size(size_t scratch_size);
+GGML_API void   ggml_cuda_free_scratch(void);
+GGML_API bool   ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
+
+GGML_API int    ggml_cuda_get_device_count(void);
+GGML_API void   ggml_cuda_get_device_description(int device, char * description, size_t description_size);

 #ifdef  __cplusplus
 }
--- a/ggml-metal.h
+++ b/ggml-metal.h
@@ -38,6 +38,9 @@ struct ggml_metal_context;
 struct ggml_metal_context * ggml_metal_init(int n_cb);
 void ggml_metal_free(struct ggml_metal_context * ctx);

+void * ggml_metal_host_malloc(size_t n);
+void   ggml_metal_host_free  (void * data);
+
 // set the number of command buffers to use
 void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb);

@@ -61,6 +64,16 @@ void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor *
 // get data from the device into host memory
 void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);

+// try to find operations that can be run concurrently in the graph
+// you should run it again if the topology of your graph changes
+void ggml_metal_graph_find_concurrency(struct ggml_metal_context * ctx, struct ggml_cgraph * gf, bool check_mem);
+
+// if the graph has been optimized for concurrently dispatch, return length of the concur_list if optimized
+int ggml_metal_if_optimized(struct ggml_metal_context * ctx);
+
+// output the concur_list for ggml_alloc
+int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx);
+
 // same as ggml_graph_compute but uses Metal
 // creates gf->n_threads command buffers in parallel
 void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -5,7 +5,11 @@
 #import <Foundation/Foundation.h>

 #import <Metal/Metal.h>
-#import <MetalPerformanceShaders/MetalPerformanceShaders.h>
+
+#undef MIN
+#undef MAX
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#define MAX(a, b) ((a) > (b) ? (a) : (b))

 #ifdef GGML_METAL_NDEBUG
 #define metal_printf(...)
@@ -15,6 +19,8 @@

 #define UNUSED(x) (void)(x)

+#define GGML_MAX_CONCUR (2*GGML_MAX_NODES)
+
 struct ggml_metal_buffer {
    const char * name;

@@ -36,12 +42,16 @@ struct ggml_metal_context {
    int n_buffers;
    struct ggml_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];

+    int concur_list[GGML_MAX_CONCUR];
+    int concur_list_len;
+
    // custom kernels
 #define GGML_METAL_DECL_KERNEL(name) \
    id<MTLFunction>             function_##name; \
    id<MTLComputePipelineState> pipeline_##name

    GGML_METAL_DECL_KERNEL(add);
+    GGML_METAL_DECL_KERNEL(add_row); // TODO: avoid this extra kernel, instead extend the "add" kernel to support broadcast
    GGML_METAL_DECL_KERNEL(mul);
    GGML_METAL_DECL_KERNEL(mul_row); // TODO: avoid this extra kernel, instead extend the "mul" kernel to support broadcast
    GGML_METAL_DECL_KERNEL(scale);
@@ -68,6 +78,14 @@ struct ggml_metal_context {
    GGML_METAL_DECL_KERNEL(mul_mat_q4_K_f32);
    GGML_METAL_DECL_KERNEL(mul_mat_q5_K_f32);
    GGML_METAL_DECL_KERNEL(mul_mat_q6_K_f32);
+    GGML_METAL_DECL_KERNEL(mul_mm_f16_f32);
+    GGML_METAL_DECL_KERNEL(mul_mm_q4_0_f32);
+    GGML_METAL_DECL_KERNEL(mul_mm_q4_1_f32);
+    GGML_METAL_DECL_KERNEL(mul_mm_q2_K_f32);
+    GGML_METAL_DECL_KERNEL(mul_mm_q3_K_f32);
+    GGML_METAL_DECL_KERNEL(mul_mm_q4_K_f32);
+    GGML_METAL_DECL_KERNEL(mul_mm_q5_K_f32);
+    GGML_METAL_DECL_KERNEL(mul_mm_q6_K_f32);
    GGML_METAL_DECL_KERNEL(rope);
    GGML_METAL_DECL_KERNEL(alibi_f32);
    GGML_METAL_DECL_KERNEL(cpy_f32_f16);
@@ -97,14 +115,8 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
    ctx->device = MTLCreateSystemDefaultDevice();
    ctx->queue  = [ctx->device newCommandQueue];
    ctx->n_buffers = 0;
+    ctx->concur_list_len = 0;

-    // determine if we can use MPS
-    if (MPSSupportsMTLDevice(ctx->device)) {
-        fprintf(stderr, "%s: using MPS\n", __func__);
-    } else {
-        fprintf(stderr, "%s: not using MPS\n", __func__);
-        GGML_ASSERT(false && "MPS not supported");
-    }

 #if 0
    // compile from source string and show compile log
@@ -114,7 +126,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
        ctx->library = [ctx->device newLibraryWithSource:msl_library_source options:nil error:&error];
        if (error) {
            fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]);
-            exit(1);
+            return NULL;
        }
    }
 #else
@@ -132,7 +144,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
        NSString * src  = [NSString stringWithContentsOfFile:path encoding:NSUTF8StringEncoding error:&error];
        if (error) {
            fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]);
-            exit(1);
+            return NULL;
        }

 #ifdef GGML_QKK_64
@@ -144,19 +156,25 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
 #endif
        if (error) {
            fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]);
-            exit(1);
+            return NULL;
        }
    }
 #endif

    // load kernels
    {
+        NSError * error = nil;
 #define GGML_METAL_ADD_KERNEL(name) \
        ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \
-        ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:nil]; \
-        fprintf(stderr, "%s: loaded %-32s %16p\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name);
+        ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \
+        fprintf(stderr, "%s: loaded %-32s %16p\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name); \
+        if (error) { \
+            fprintf(stderr, "%s: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
+            return NULL; \
+        }

        GGML_METAL_ADD_KERNEL(add);
+        GGML_METAL_ADD_KERNEL(add_row);
        GGML_METAL_ADD_KERNEL(mul);
        GGML_METAL_ADD_KERNEL(mul_row);
        GGML_METAL_ADD_KERNEL(scale);
@@ -183,6 +201,14 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
        GGML_METAL_ADD_KERNEL(mul_mat_q4_K_f32);
        GGML_METAL_ADD_KERNEL(mul_mat_q5_K_f32);
        GGML_METAL_ADD_KERNEL(mul_mat_q6_K_f32);
+        GGML_METAL_ADD_KERNEL(mul_mm_f16_f32);
+        GGML_METAL_ADD_KERNEL(mul_mm_q4_0_f32);
+        GGML_METAL_ADD_KERNEL(mul_mm_q4_1_f32);
+        GGML_METAL_ADD_KERNEL(mul_mm_q2_K_f32);
+        GGML_METAL_ADD_KERNEL(mul_mm_q3_K_f32);
+        GGML_METAL_ADD_KERNEL(mul_mm_q4_K_f32);
+        GGML_METAL_ADD_KERNEL(mul_mm_q5_K_f32);
+        GGML_METAL_ADD_KERNEL(mul_mm_q6_K_f32);
        GGML_METAL_ADD_KERNEL(rope);
        GGML_METAL_ADD_KERNEL(alibi_f32);
        GGML_METAL_ADD_KERNEL(cpy_f32_f16);
@@ -211,10 +237,33 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
    free(ctx);
 }

+void * ggml_metal_host_malloc(size_t n) {
+    void * data = NULL;
+    const int result = posix_memalign((void **) &data, getpagesize(), n);
+    if (result != 0) {
+        fprintf(stderr, "%s: error: posix_memalign failed\n", __func__);
+        return NULL;
+    }
+
+    return data;
+}
+
+void ggml_metal_host_free(void * data) {
+    free(data);
+}
+
 void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb) {
    ctx->n_cb = n_cb;
 }

+int ggml_metal_if_optimized(struct ggml_metal_context * ctx) {
+    return ctx->concur_list_len;
+}
+
+int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx) {
+    return ctx->concur_list;
+}
+
 // finds the Metal buffer that contains the tensor data on the GPU device
 // the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the
 // Metal buffer based on the host memory pointer
@@ -353,11 +402,112 @@ void ggml_metal_get_tensor(
    memcpy(t->data, (void *) ((uint8_t *) id_src.contents + offs), ggml_nbytes(t));
 }

+void ggml_metal_graph_find_concurrency(
+        struct ggml_metal_context * ctx,
+        struct ggml_cgraph * gf, bool check_mem) {
+    int search_depth = gf->n_nodes; //we only find concurrency in this range to avoid wasting too much time
+    int nodes_unused[GGML_MAX_CONCUR];
+
+    for (int i = 0; i < GGML_MAX_CONCUR; i++) { ctx->concur_list[i] = 0; }
+    for (int i = 0; i < gf->n_nodes;     i++) { nodes_unused[i]     = 1; }
+    ctx->concur_list_len = 0;
+
+    int n_left    = gf->n_nodes;
+    int n_start   = 0; // all nodes before n_start at nodes_unused array have been sorted and store back to ctx->concur_list
+    int level_pos = 0; // at ctx->concur_list, the last layer (level) ends at level_pos
+
+    while (n_left > 0) {
+        // number of nodes at a layer (that can be issued concurrently)
+        int concurrency = 0;
+        for (int i = n_start; i < ((n_start + search_depth > gf->n_nodes) ? gf->n_nodes : n_start + search_depth); i++) {
+            if (nodes_unused[i]) {
+                // if the requirements for gf->nodes[i] are satisfied
+                int exe_flag = 1;
+
+                // scan all srcs
+                for (int src_ind = 0; src_ind < GGML_MAX_SRC; src_ind++) {
+                    struct ggml_tensor * src_cur = gf->nodes[i]->src[src_ind];
+                    if (src_cur) {
+                        // if is leaf nodes it's satisfied.
+                        // TODO: ggml_is_leaf()
+                        if (src_cur->op == GGML_OP_NONE && src_cur->grad == NULL) {
+                            continue;
+                        }
+
+                        // otherwise this src should be the output from previous nodes.
+                        int is_found = 0;
+
+                        // scan 2*search_depth back because we inserted barrier.
+                        //for (int j = ((level_pos - 2*search_depth) < 0 ? 0 : (level_pos - 2*search_depth)); j < level_pos; j++) {
+                        for (int j = MAX(0, level_pos - 2*search_depth); j < level_pos; j++) {
+                            if (ctx->concur_list[j] >= 0 && gf->nodes[ctx->concur_list[j]] == src_cur) {
+                                is_found = 1;
+                                break;
+                            }
+                        }
+                        if (is_found == 0) {
+                            exe_flag = 0;
+                            break;
+                        }
+                    }
+                }
+                if (exe_flag && check_mem) {
+                    // check if nodes[i]'s data will be overwritten by a node before nodes[i].
+                    // if node[5] and node[3] write to the same memory region, then we can't issue node[5] before node[3]
+                    int64_t data_start = (int64_t) gf->nodes[i]->data;
+                    int64_t length     = (int64_t) ggml_nbytes(gf->nodes[i]);
+                    for (int j = n_start; j < i; j++) {
+                        if (nodes_unused[j] && gf->nodes[j]->op != GGML_OP_RESHAPE \
+                                            && gf->nodes[j]->op != GGML_OP_VIEW \
+                                            && gf->nodes[j]->op != GGML_OP_TRANSPOSE \
+                                            && gf->nodes[j]->op != GGML_OP_PERMUTE) {
+                            if (((int64_t)gf->nodes[j]->data) >= data_start + length || \
+                                ((int64_t)gf->nodes[j]->data) + (int64_t) ggml_nbytes(gf->nodes[j]) <= data_start) {
+                                continue;
+                            }
+
+                            exe_flag = 0;
+                        }
+                    }
+                }
+                if (exe_flag) {
+                    ctx->concur_list[level_pos + concurrency] = i;
+                    nodes_unused[i] = 0;
+                    concurrency++;
+                    ctx->concur_list_len++;
+                }
+            }
+        }
+        n_left -= concurrency;
+        // adding a barrier different layer
+        ctx->concur_list[level_pos + concurrency] = -1;
+        ctx->concur_list_len++;
+        // jump all sorted nodes at nodes_bak
+        while (!nodes_unused[n_start]) {
+            n_start++;
+        }
+        level_pos += concurrency + 1;
+    }
+
+    if (ctx->concur_list_len > GGML_MAX_CONCUR) {
+        fprintf(stderr, "%s: too many elements for metal ctx->concur_list!\n", __func__);
+    }
+}
+
 void ggml_metal_graph_compute(
        struct ggml_metal_context * ctx,
               struct ggml_cgraph * gf) {
    metal_printf("%s: evaluating graph\n", __func__);

+    // if there is ctx->concur_list, dispatch concurrently
+    // else fallback to serial dispatch
+    MTLComputePassDescriptor * edesc = MTLComputePassDescriptor.computePassDescriptor;
+
+    const bool has_concur = ctx->concur_list_len && ctx->concur_list_len <= GGML_MAX_CONCUR;
+
+    const int n_nodes  = has_concur ? ctx->concur_list_len      : gf->n_nodes;
+    edesc.dispatchType = has_concur ? MTLDispatchTypeConcurrent : MTLDispatchTypeSerial;
+
    // create multiple command buffers and enqueue them
    // then, we encode the graph into the command buffers in parallel

@@ -376,7 +526,7 @@ void ggml_metal_graph_compute(
    dispatch_queue_t queue = dispatch_queue_create("llama.cpp", DISPATCH_QUEUE_CONCURRENT);

    for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) {
-        const int n_nodes_per_cb = (gf->n_nodes + n_cb - 1) / n_cb;
+        const int n_nodes_per_cb = (n_nodes + n_cb - 1) / n_cb;

        dispatch_async(queue, ^{
            size_t offs_src0 = 0;
@@ -385,12 +535,19 @@ void ggml_metal_graph_compute(

            id<MTLCommandBuffer> command_buffer = command_buffers[cb_idx];

-            id<MTLComputeCommandEncoder> encoder = nil;
+            id<MTLComputeCommandEncoder> encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];

-            const int node_start =                                      (cb_idx + 0) * n_nodes_per_cb;
-            const int node_end   = (cb_idx == n_cb - 1) ? gf->n_nodes : (cb_idx + 1) * n_nodes_per_cb;
+            const int node_start =                                  (cb_idx + 0) * n_nodes_per_cb;
+            const int node_end   = (cb_idx == n_cb - 1) ? n_nodes : (cb_idx + 1) * n_nodes_per_cb;
+
+            for (int ind = node_start; ind < node_end; ++ind) {
+                const int i = has_concur ? ctx->concur_list[ind] : ind;
+
+                if (i == -1) {
+                    [encoder memoryBarrierWithScope:MTLBarrierScopeBuffers];
+                    continue;
+                }

-            for (int i = node_start; i < node_end; ++i) {
                metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op));

                struct ggml_tensor * src0 = gf->nodes[i]->src[0];
@@ -460,14 +617,16 @@ void ggml_metal_graph_compute(
                        } break;
                    case GGML_OP_ADD:
                        {
-                            if (encoder == nil) {
-                                encoder = [command_buffer computeCommandEncoder];
+                            if (ggml_nelements(src1) == ne10) {
+                                // src1 is a row
+                                [encoder setComputePipelineState:ctx->pipeline_add_row];
+                            } else {
+                                [encoder setComputePipelineState:ctx->pipeline_add];
                            }
-
-                            [encoder setComputePipelineState:ctx->pipeline_add];
                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                            [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
+                            [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];

                            const int64_t n = ggml_nelements(dst);

@@ -475,10 +634,6 @@ void ggml_metal_graph_compute(
                        } break;
                    case GGML_OP_MUL:
                        {
-                            if (encoder == nil) {
-                                encoder = [command_buffer computeCommandEncoder];
-                            }
-
                            if (ggml_nelements(src1) == ne10) {
                                // src1 is a row
                                [encoder setComputePipelineState:ctx->pipeline_mul_row];
@@ -496,10 +651,6 @@ void ggml_metal_graph_compute(
                        } break;
                    case GGML_OP_SCALE:
                        {
-                            if (encoder == nil) {
-                                encoder = [command_buffer computeCommandEncoder];
-                            }
-
                            const float scale = *(const float *) src1->data;

                            [encoder setComputePipelineState:ctx->pipeline_scale];
@@ -511,54 +662,46 @@ void ggml_metal_graph_compute(

                            [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                        } break;
-                    case GGML_OP_SILU:
-                        {
-                            if (encoder == nil) {
-                                encoder = [command_buffer computeCommandEncoder];
-                            }
+                    case GGML_OP_UNARY:
+                        switch (ggml_get_unary_op(gf->nodes[i])) {
+                            case GGML_UNARY_OP_SILU:
+                                {
+                                    [encoder setComputePipelineState:ctx->pipeline_silu];
+                                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];

-                            [encoder setComputePipelineState:ctx->pipeline_silu];
-                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+                                    const int64_t n = ggml_nelements(dst);

-                            const int64_t n = ggml_nelements(dst);
+                                    [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                                } break;
+                            case GGML_UNARY_OP_RELU:
+                                {
+                                    [encoder setComputePipelineState:ctx->pipeline_relu];
+                                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];

-                            [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                                    const int64_t n = ggml_nelements(dst);
+
+                                    [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                                } break;
+                            case GGML_UNARY_OP_GELU:
+                                {
+                                    [encoder setComputePipelineState:ctx->pipeline_gelu];
+                                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+
+                                    const int64_t n = ggml_nelements(dst);
+
+                                    [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                                } break;
+                            default:
+                                {
+                                    fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
+                                    GGML_ASSERT(false);
+                                }
                        } break;
-                    case GGML_OP_RELU:
-                        {
-                            if (encoder == nil) {
-                                encoder = [command_buffer computeCommandEncoder];
-                            }
-
-                            [encoder setComputePipelineState:ctx->pipeline_relu];
-                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-
-                            const int64_t n = ggml_nelements(dst);
-
-                            [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                        } break;
-                    case GGML_OP_GELU:
-                    {
-                            if (encoder == nil) {
-                                encoder = [command_buffer computeCommandEncoder];
-                            }
-
-                            [encoder setComputePipelineState:ctx->pipeline_gelu];
-                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-
-                            const int64_t n = ggml_nelements(dst);
-
-                            [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                    } break;
                    case GGML_OP_SOFT_MAX:
                        {
-                            if (encoder == nil) {
-                                encoder = [command_buffer computeCommandEncoder];
-                            }
-
                            const int nth = 32;

                            [encoder setComputePipelineState:ctx->pipeline_soft_max];
@@ -573,11 +716,7 @@ void ggml_metal_graph_compute(
                        } break;
                    case GGML_OP_DIAG_MASK_INF:
                        {
-                            if (encoder == nil) {
-                                encoder = [command_buffer computeCommandEncoder];
-                            }
-
-                            const int n_past = ((int32_t *)(src1->data))[0];
+                            const int n_past = ((int32_t *)(dst->op_params))[0];

                            [encoder setComputePipelineState:ctx->pipeline_diag_mask_inf];
                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
@@ -593,53 +732,44 @@ void ggml_metal_graph_compute(
                            // TODO: needs to be updated after PR: https://github.com/ggerganov/ggml/pull/224

                            GGML_ASSERT(ne00 == ne10);
-                            GGML_ASSERT(ne02 == ne12);
+                            // GGML_ASSERT(ne02 == ne12); // Should be checked on individual data types until broadcast is implemented everywhere
+                            uint gqa = ne12/ne02;
+                            GGML_ASSERT(ne03 == ne13);

+                            // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
+                            // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
                            if (ggml_is_contiguous(src0) &&
                                ggml_is_contiguous(src1) &&
-                                (src0t == GGML_TYPE_F32 || src0t == GGML_TYPE_F16) && ne11 > 1) {
-
-                                if (encoder != nil) {
-                                    [encoder endEncoding];
-                                    encoder = nil;
+                                src1t == GGML_TYPE_F32 &&
+                                [ctx->device supportsFamily:MTLGPUFamilyApple7] &&
+                                ne00%32 == 0 &&
+                                ne11 > 1) {
+                                    switch (src0->type) {
+                                        case GGML_TYPE_F16:  [encoder setComputePipelineState:ctx->pipeline_mul_mm_f16_f32]; break;
+                                        case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_0_f32]; break;
+                                        case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_1_f32]; break;
+                                        case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q2_K_f32]; break;
+                                        case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q3_K_f32]; break;
+                                        case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_K_f32]; break;
+                                        case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q5_K_f32]; break;
+                                        case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q6_K_f32]; break;
+                                        default: GGML_ASSERT(false && "MUL MAT-MAT not implemented");
+                                    }
+                                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                                    [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
+                                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
+                                    [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
+                                    [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
+                                    [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:5];
+                                    [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:6];
+                                    [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:7];
+                                    [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:8];
+                                    [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:9];
+                                    [encoder setBytes:&gqa length:sizeof(gqa) atIndex:10];
+                                    [encoder setThreadgroupMemoryLength:8192 atIndex:0];
+                                    [encoder dispatchThreadgroups:MTLSizeMake( (ne11+31)/32, (ne01+63) / 64, ne12) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
                                }
-
-                                MPSDataType src0dt = src0t == GGML_TYPE_F32 ? MPSDataTypeFloat32 : MPSDataTypeFloat16;
-                                MPSDataType src1dt = src1t == GGML_TYPE_F32 ? MPSDataTypeFloat32 : MPSDataTypeFloat16;
-
-                                // for F32 x F32 we use MPS
-                                MPSMatrixDescriptor * desc0 = [MPSMatrixDescriptor
-                                    matrixDescriptorWithRows:ne01 columns:ne00 rowBytes:src0->nb[1] dataType:src0dt];
-
-                                MPSMatrixDescriptor * desc1 = [MPSMatrixDescriptor
-                                    matrixDescriptorWithRows:ne11 columns:ne10 rowBytes:src1->nb[1] dataType:src1dt];
-
-                                MPSMatrixDescriptor * desc  = [MPSMatrixDescriptor
-                                    matrixDescriptorWithRows:ne1 columns:ne0 rowBytes:dst->nb[1] dataType:MPSDataTypeFloat32];
-
-                                MPSMatrixMultiplication * mul = [[MPSMatrixMultiplication alloc]
-                                    initWithDevice:ctx->device transposeLeft:false transposeRight:true
-                                        resultRows:ne11 resultColumns:ne01 interiorColumns:ne00 alpha:1.0 beta:0.0];
-
-                                // we need to do ne02 multiplications
-                                // TODO: is there a way to do this in parallel - currently very slow ..
-                                // TODO: might be possible to offload part of the computation to ANE using Accelerate's CBLAS
-                                for (int64_t i02 = 0; i02 < ne02; ++i02) {
-                                    size_t offs_src0_cur = offs_src0 + i02*nb02;
-                                    size_t offs_src1_cur = offs_src1 + i02*nb12;
-                                    size_t offs_dst_cur  = offs_dst  + i02*nb2;
-
-                                    MPSMatrix * mat_src0 = [[MPSMatrix alloc] initWithBuffer:id_src0 offset:offs_src0_cur descriptor:desc0];
-                                    MPSMatrix * mat_src1 = [[MPSMatrix alloc] initWithBuffer:id_src1 offset:offs_src1_cur descriptor:desc1];
-                                    MPSMatrix * mat_dst  = [[MPSMatrix alloc] initWithBuffer:id_dst  offset:offs_dst_cur  descriptor:desc ];
-
-                                    [mul encodeToCommandBuffer:command_buffer leftMatrix:mat_src1 rightMatrix:mat_src0 resultMatrix:mat_dst];
-                                }
-                            } else {
-                                if (encoder == nil) {
-                                    encoder = [command_buffer computeCommandEncoder];
-                                }
-
+                            else {
                                int nth0 = 32;
                                int nth1 = 1;

@@ -647,8 +777,6 @@ void ggml_metal_graph_compute(
                                switch (src0t) {
                                    case GGML_TYPE_F16:
                                        {
-                                            GGML_ASSERT(ne02 == ne12);
-
                                            nth0 = 64;
                                            nth1 = 1;
                                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
@@ -676,8 +804,8 @@ void ggml_metal_graph_compute(
                                            GGML_ASSERT(ne02 == 1);
                                            GGML_ASSERT(ne12 == 1);

-                                            nth0 = 4;
-                                            nth1 = 16;
+                                            nth0 = 2;
+                                            nth1 = 32;
                                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_q2_K_f32];
                                        } break;
                                    case GGML_TYPE_Q3_K:
@@ -685,8 +813,8 @@ void ggml_metal_graph_compute(
                                            GGML_ASSERT(ne02 == 1);
                                            GGML_ASSERT(ne12 == 1);

-                                            nth0 = 4;
-                                            nth1 = 16;
+                                            nth0 = 2;
+                                            nth1 = 32;
                                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_q3_K_f32];
                                        } break;
                                    case GGML_TYPE_Q4_K:
@@ -694,8 +822,8 @@ void ggml_metal_graph_compute(
                                            GGML_ASSERT(ne02 == 1);
                                            GGML_ASSERT(ne12 == 1);

-                                            nth0 = 4;
-                                            nth1 = 16;
+                                            nth0 = 2;
+                                            nth1 = 32;
                                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_K_f32];
                                        } break;
                                    case GGML_TYPE_Q5_K:
@@ -703,8 +831,8 @@ void ggml_metal_graph_compute(
                                            GGML_ASSERT(ne02 == 1);
                                            GGML_ASSERT(ne12 == 1);

-                                            nth0 = 4;
-                                            nth1 = 16;
+                                            nth0 = 2;
+                                            nth1 = 32;
                                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_q5_K_f32];
                                        } break;
                                    case GGML_TYPE_Q6_K:
@@ -712,8 +840,8 @@ void ggml_metal_graph_compute(
                                            GGML_ASSERT(ne02 == 1);
                                            GGML_ASSERT(ne12 == 1);

-                                            nth0 = 4;
-                                            nth1 = 16;
+                                            nth0 = 2;
+                                            nth1 = 32;
                                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_q6_K_f32];
                                        } break;
                                    default:
@@ -728,27 +856,36 @@ void ggml_metal_graph_compute(
                                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
                                [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
                                [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
-                                [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:5];
-                                [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:6];
-                                [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:7];
-                                [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:8];
-                                [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:9];
-                                [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:10];
-                                [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:11];
-                                [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:12];
-                                [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:13];
-                                [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:14];
+                                [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
+                                [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
+                                [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
+                                [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
+                                [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:9];
+                                [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:10];
+                                [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:11];
+                                [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:12];
+                                [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:13];
+                                [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:14];
+                                [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:15];
+                                [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:16];
+                                [encoder setBytes:&gqa length:sizeof(gqa) atIndex:17];

-                                if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1) {
-                                    [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7) / 8, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                                if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 ||
+                                    src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q4_K) {
+                                    [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7) / 8, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                                }
-                                else if (src0t == GGML_TYPE_Q2_K ||
-                                         src0t == GGML_TYPE_Q3_K ||
-                                         src0t == GGML_TYPE_Q4_K ||
-                                         src0t == GGML_TYPE_Q5_K ||
-                                         src0t == GGML_TYPE_Q6_K) {
-                                    [encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0];
-                                    [encoder dispatchThreadgroups:MTLSizeMake(ne01, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                                else if (src0t == GGML_TYPE_Q3_K) {
+#ifdef GGML_QKK_64
+                                    [encoder dispatchThreadgroups:MTLSizeMake((ne01+1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+#else
+                                    [encoder dispatchThreadgroups:MTLSizeMake((ne01+3)/4, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+#endif
+                                }
+                                else if (src0t == GGML_TYPE_Q5_K) {
+                                    [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3) / 4, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                                }
+                                else if (src0t == GGML_TYPE_Q6_K) {
+                                    [encoder dispatchThreadgroups:MTLSizeMake((ne01+1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                                } else {
                                    [encoder setThreadgroupMemoryLength:nth0*sizeof(float) atIndex:0];
                                    [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
@@ -757,10 +894,6 @@ void ggml_metal_graph_compute(
                        } break;
                    case GGML_OP_GET_ROWS:
                        {
-                            if (encoder == nil) {
-                                encoder = [command_buffer computeCommandEncoder];
-                            }
-
                            switch (src0->type) {
                                case GGML_TYPE_F16:  [encoder setComputePipelineState:ctx->pipeline_get_rows_f16]; break;
                                case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_0]; break;
@@ -786,13 +919,10 @@ void ggml_metal_graph_compute(
                        } break;
                    case GGML_OP_RMS_NORM:
                        {
-                            if (encoder == nil) {
-                                encoder = [command_buffer computeCommandEncoder];
-                            }
+                            float eps;
+                            memcpy(&eps, dst->op_params, sizeof(float));

-                            const float eps = 1e-6f;
-
-                            const int nth = 256;
+                            const int nth = 512;

                            [encoder setComputePipelineState:ctx->pipeline_rms_norm];
                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
@@ -800,7 +930,7 @@ void ggml_metal_graph_compute(
                            [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
                            [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3];
                            [encoder setBytes:&eps  length:sizeof(   float) atIndex:4];
-                            [encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0];
+                            [encoder setThreadgroupMemoryLength:nth/32*sizeof(float) atIndex:0];

                            const int64_t nrows = ggml_nrows(src0);

@@ -808,10 +938,6 @@ void ggml_metal_graph_compute(
                        } break;
                    case GGML_OP_NORM:
                        {
-                            if (encoder == nil) {
-                                encoder = [command_buffer computeCommandEncoder];
-                            }
-
                            const float eps = 1e-5f;

                            const int nth = 256;
@@ -830,15 +956,12 @@ void ggml_metal_graph_compute(
                        } break;
                    case GGML_OP_ALIBI:
                        {
-                            if (encoder == nil) {
-                                encoder = [command_buffer computeCommandEncoder];
-                            }
-
                            GGML_ASSERT((src0t == GGML_TYPE_F32));

-                            const int   n_past   = ((int32_t *) src1->data)[0]; UNUSED(n_past);
-                            const int   n_head   = ((int32_t *) src1->data)[1];
-                            const float max_bias = ((float *)   src1->data)[2];
+                            const int n_past = ((int32_t *) dst->op_params)[0]; UNUSED(n_past);
+                            const int n_head = ((int32_t *) dst->op_params)[1];
+                            float max_bias;
+                            memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));

                            if (__builtin_popcount(n_head) != 1) {
                                GGML_ASSERT(false && "only power-of-two n_head implemented");
@@ -872,19 +995,14 @@ void ggml_metal_graph_compute(
                        } break;
                    case GGML_OP_ROPE:
                        {
-                            if (encoder == nil) {
-                                encoder = [command_buffer computeCommandEncoder];
-                            }
-
-                            const int n_dims = ((int32_t *) src1->data)[1];
-                            const int mode   = ((int32_t *) src1->data)[2];
-
-                            const int n_past = ((int32_t *)(src1->data))[0];
+                            const int n_past = ((int32_t *) dst->op_params)[0];
+                            const int n_dims = ((int32_t *) dst->op_params)[1];
+                            const int mode   = ((int32_t *) dst->op_params)[2];

                            float freq_base;
                            float freq_scale;
-                            memcpy(&freq_base,  (int32_t *) src1->data + 4, sizeof(float));
-                            memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float));
+                            memcpy(&freq_base,  (int32_t *) dst->op_params + 4, sizeof(float));
+                            memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));

                            [encoder setComputePipelineState:ctx->pipeline_rope];
                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
@@ -913,12 +1031,10 @@ void ggml_metal_graph_compute(

                            [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                        } break;
+                    case GGML_OP_DUP:
                    case GGML_OP_CPY:
+                    case GGML_OP_CONT:
                        {
-                            if (encoder == nil) {
-                                encoder = [command_buffer computeCommandEncoder];
-                            }
-
                            const int nth = 32;

                            switch (src0t) {
@@ -963,8 +1079,10 @@ void ggml_metal_graph_compute(
                            [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
                        } break;
                    default:
-                        fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
-                        GGML_ASSERT(false);
+                        {
+                            fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
+                            GGML_ASSERT(false);
+                        }
                }
            }

--- a/ggml-metal.metal
+++ b/ggml-metal.metal
--- a/ggml.c
+++ b/ggml.c
--- a/ggml.h
+++ b/ggml.h
@@ -183,6 +183,15 @@
 #    define GGML_API
 #endif

+// TODO: support for clang
+#ifdef __GNUC__
+#    define GGML_DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
+#elif defined(_MSC_VER)
+#    define GGML_DEPRECATED(func, hint) __declspec(deprecated(hint)) func
+#else
+#    define GGML_DEPRECATED(func, hint) func
+#endif
+
 #include <stdint.h>
 #include <stddef.h>
 #include <stdbool.h>
@@ -198,7 +207,7 @@
 #define GGML_MAX_PARAMS        256
 #define GGML_MAX_CONTEXTS      64
 #define GGML_MAX_SRC           6
-#define GGML_MAX_NAME          48
+#define GGML_MAX_NAME          64
 #define GGML_MAX_OP_PARAMS     32
 #define GGML_DEFAULT_N_THREADS 4

@@ -206,8 +215,14 @@
 #define GGML_EXIT_SUCCESS 0
 #define GGML_EXIT_ABORTED 1

+#define GGUF_MAGIC   0x46554747 // "GGUF"
+#define GGUF_VERSION 1
+
+#define GGUF_DEFAULT_ALIGNMENT 32
+
 #define GGML_UNUSED(x) (void)(x)

+#define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))

 #define GGML_ASSERT(x) \
    do { \
@@ -245,8 +260,9 @@
 extern "C" {
 #endif

-#ifdef __ARM_NEON
-    // we use the built-in 16-bit float type
+#if defined(__ARM_NEON) && defined(__CUDACC__)
+    typedef half ggml_fp16_t;
+#elif defined(__ARM_NEON)
    typedef __fp16 ggml_fp16_t;
 #else
    typedef uint16_t ggml_fp16_t;
@@ -286,6 +302,12 @@ extern "C" {
        GGML_TYPE_COUNT,
    };

+    enum ggml_backend {
+        GGML_BACKEND_CPU = 0,
+        GGML_BACKEND_GPU = 10,
+        GGML_BACKEND_GPU_SPLIT = 20,
+    };
+
    // model file types
    enum ggml_ftype {
        GGML_FTYPE_UNKNOWN     = -1,
@@ -324,20 +346,12 @@ extern "C" {
        GGML_OP_ARGMAX,
        GGML_OP_REPEAT,
        GGML_OP_REPEAT_BACK,
-        GGML_OP_ABS,
-        GGML_OP_SGN,
-        GGML_OP_NEG,
-        GGML_OP_STEP,
-        GGML_OP_TANH,
-        GGML_OP_ELU,
-        GGML_OP_RELU,
-        GGML_OP_GELU,
-        GGML_OP_GELU_QUICK,
-        GGML_OP_SILU,
+        GGML_OP_CONCAT,
        GGML_OP_SILU_BACK,
        GGML_OP_NORM, // normalize
        GGML_OP_RMS_NORM,
        GGML_OP_RMS_NORM_BACK,
+        GGML_OP_GROUP_NORM,

        GGML_OP_MUL_MAT,
        GGML_OP_OUT_PROD,
@@ -363,18 +377,29 @@ extern "C" {
        GGML_OP_CLAMP,
        GGML_OP_CONV_1D,
        GGML_OP_CONV_2D,
+        GGML_OP_CONV_TRANSPOSE_2D,
        GGML_OP_POOL_1D,
        GGML_OP_POOL_2D,

+        GGML_OP_UPSCALE, // nearest interpolate
+
        GGML_OP_FLASH_ATTN,
        GGML_OP_FLASH_FF,
        GGML_OP_FLASH_ATTN_BACK,
        GGML_OP_WIN_PART,
        GGML_OP_WIN_UNPART,
+        GGML_OP_GET_REL_POS,
+        GGML_OP_ADD_REL_POS,
+
+        GGML_OP_UNARY,

        GGML_OP_MAP_UNARY,
        GGML_OP_MAP_BINARY,

+        GGML_OP_MAP_CUSTOM1_F32,
+        GGML_OP_MAP_CUSTOM2_F32,
+        GGML_OP_MAP_CUSTOM3_F32,
+
        GGML_OP_MAP_CUSTOM1,
        GGML_OP_MAP_CUSTOM2,
        GGML_OP_MAP_CUSTOM3,
@@ -385,6 +410,24 @@ extern "C" {
        GGML_OP_COUNT,
    };

+    enum ggml_unary_op {
+        GGML_UNARY_OP_ABS,
+        GGML_UNARY_OP_SGN,
+        GGML_UNARY_OP_NEG,
+        GGML_UNARY_OP_STEP,
+        GGML_UNARY_OP_TANH,
+        GGML_UNARY_OP_ELU,
+        GGML_UNARY_OP_RELU,
+        GGML_UNARY_OP_GELU,
+        GGML_UNARY_OP_GELU_QUICK,
+        GGML_UNARY_OP_SILU,
+    };
+
+    enum ggml_object_type {
+        GGML_OBJECT_TENSOR,
+        GGML_OBJECT_GRAPH,
+        GGML_OBJECT_WORK_BUFFER
+    };

    // ggml object
    struct ggml_object {
@@ -393,16 +436,17 @@ extern "C" {

        struct ggml_object * next;

-        char padding[8];
+        enum ggml_object_type type;
+
+        char padding[4];
    };

    static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);

    // n-dimensional tensor
    struct ggml_tensor {
-        struct ggml_backend * backend;
-
-        enum ggml_type type;
+        enum ggml_type    type;
+        enum ggml_backend backend;

        int     n_dims;
        int64_t ne[GGML_MAX_DIMS]; // number of elements
@@ -415,17 +459,13 @@ extern "C" {
        enum ggml_op op;

        // op params - allocated as int32_t for alignment
-        int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(uint32_t)];
+        int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];

        bool is_param;

        struct ggml_tensor * grad;
        struct ggml_tensor * src[GGML_MAX_SRC];

-        bool visited;   // used to build graphs
-        int n_children; // used by the allocator
-        int n_views;
-
        // performance
        int     perf_runs;
        int64_t perf_cycles;
@@ -433,11 +473,11 @@ extern "C" {

        void * data;

-        void * extra; // extra things e.g. for ggml-cuda.cu
-
        char name[GGML_MAX_NAME];

-        char padding[12];
+        void * extra; // extra things e.g. for ggml-cuda.cu
+
+        char padding[4];
    };

    static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
@@ -458,37 +498,47 @@ extern "C" {
        void * abort_callback_data;
    };

+    // next prime after GGML_MAX_NODES
+    // #define GGML_GRAPH_HASHTABLE_SIZE 4099
+    // next prime after GGML_MAX_NODES * 2 (nodes + leafs)
+    #define GGML_GRAPH_HASHTABLE_SIZE 8273
+
    // computation graph
    struct ggml_cgraph {
        int n_nodes;
        int n_leafs;
-        bool closed;

        struct ggml_tensor * nodes[GGML_MAX_NODES];
        struct ggml_tensor * grads[GGML_MAX_NODES];
        struct ggml_tensor * leafs[GGML_MAX_NODES];

+        void * visited_hash_table[GGML_GRAPH_HASHTABLE_SIZE];
+
        // performance
        int     perf_runs;
        int64_t perf_cycles;
        int64_t perf_time_us;
    };

-    enum ggml_alloc_mode {
-        GGML_ALLOC_NONE,            // do not allocate tensors
-        GGML_ALLOC_IMMEDIATE,       // allocate tensors immediately
-        GGML_ALLOC_COMPUTE_SEQ,     // delay allocation until graph build time, allocate tensors for sequential graph computation
-        //GGML_ALLOC_COMPUTE_PAR,     // allocate tensors for parallel graph computation
+    static const size_t GGML_GRAPH_SIZE = sizeof(struct ggml_cgraph);
+
+    // scratch buffer
+    struct ggml_scratch {
+        size_t offs;
+        size_t size;
+        void * data;
    };

-    // context parameters
    struct ggml_init_params {
-        struct ggml_buffer * buffer;
-        enum ggml_alloc_mode alloc_mode;   // tensor allocation mode
-        enum ggml_type       compute_type; // type of intermediate results
+        // memory pool
+        size_t mem_size;   // bytes
+        void * mem_buffer; // if NULL, memory will be allocated internally
+        bool   no_alloc;   // don't allocate memory for the tensor data
    };

-    // task types
+
+    // compute types
+
    // NOTE: the INIT or FINALIZE pass is not scheduled unless explicitly enabled.
    // This behavior was changed since https://github.com/ggerganov/llama.cpp/pull/1995.
    enum ggml_task_type {
@@ -525,6 +575,7 @@ extern "C" {
    GGML_API int64_t ggml_nelements   (const struct ggml_tensor * tensor);
    GGML_API int64_t ggml_nrows       (const struct ggml_tensor * tensor);
    GGML_API size_t  ggml_nbytes      (const struct ggml_tensor * tensor);
+    GGML_API size_t  ggml_nbytes_pad  (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
    GGML_API size_t  ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split);

    GGML_API int     ggml_blck_size (enum ggml_type type);
@@ -533,6 +584,7 @@ extern "C" {

    GGML_API const char * ggml_type_name(enum ggml_type type);
    GGML_API const char * ggml_op_name  (enum ggml_op   op);
+    GGML_API const char * ggml_op_symbol(enum ggml_op   op);

    GGML_API size_t  ggml_element_size(const struct ggml_tensor * tensor);

@@ -545,24 +597,26 @@ extern "C" {
    GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
    GGML_API bool ggml_is_permuted  (const struct ggml_tensor * tensor);

+    GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
+
    // use this to compute the memory overhead of a tensor
    GGML_API size_t ggml_tensor_overhead(void);

    // main
-    GGML_API struct ggml_init_params ggml_init_params_default(void);
-    GGML_API struct ggml_context *   ggml_init(struct ggml_init_params params);
-    GGML_API void                    ggml_free(struct ggml_context * ctx);

-    GGML_API void    ggml_set_alloc_mode(struct ggml_context * ctx, enum ggml_alloc_mode mode);
+    GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
+    GGML_API void                  ggml_free(struct ggml_context * ctx);

-    // TODO: update for ggml_buffer
    GGML_API size_t  ggml_used_mem(const struct ggml_context * ctx);
+
+    GGML_API size_t  ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
+    GGML_API bool    ggml_get_no_alloc(struct ggml_context * ctx);
+    GGML_API void    ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
+
    GGML_API void *  ggml_get_mem_buffer     (const struct ggml_context * ctx);
    GGML_API size_t  ggml_get_mem_size       (const struct ggml_context * ctx);
    GGML_API size_t  ggml_get_max_tensor_size(const struct ggml_context * ctx);

-    GGML_API struct ggml_buffer * ggml_get_buffer(const struct ggml_context * ctx);
-
    GGML_API struct ggml_tensor * ggml_new_tensor(
            struct ggml_context * ctx,
            enum   ggml_type type,
@@ -616,9 +670,11 @@ extern "C" {
    GGML_API void *  ggml_get_data    (const struct ggml_tensor * tensor);
    GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);

-    GGML_API const char *         ggml_get_name(const struct ggml_tensor * tensor);
-    GGML_API struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * name);
-    GGML_API struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char * fmt, ...);
+    GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
+
+    GGML_API const char *         ggml_get_name   (const struct ggml_tensor * tensor);
+    GGML_API struct ggml_tensor * ggml_set_name   (      struct ggml_tensor * tensor, const char * name);
+    GGML_API struct ggml_tensor * ggml_format_name(      struct ggml_tensor * tensor, const char * fmt, ...);

    //
    // operations on tensors with backpropagation
@@ -628,6 +684,11 @@ extern "C" {
            struct ggml_context * ctx,
            struct ggml_tensor  * a);

+    // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_dup_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
    GGML_API struct ggml_tensor * ggml_add(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
@@ -752,6 +813,13 @@ extern "C" {
            struct ggml_tensor  * a,
            struct ggml_tensor  * b);

+    // concat a and b on dim 2
+    // used in stable-diffusion
+    GGML_API struct ggml_tensor * ggml_concat(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
    GGML_API struct ggml_tensor * ggml_abs(
            struct ggml_context * ctx,
            struct ggml_tensor  * a);
@@ -852,14 +920,30 @@ extern "C" {

    GGML_API struct ggml_tensor * ggml_rms_norm(
            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
+            struct ggml_tensor  * a,
+            float                 eps);

    GGML_API struct ggml_tensor * ggml_rms_norm_inplace(
            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
+            struct ggml_tensor  * a,
+            float                 eps);
+
+    // group normalize along ne0*ne1*n_groups
+    // used in stable-diffusion
+    // TODO: eps is hardcoded to 1e-6 for now
+    GGML_API struct ggml_tensor * ggml_group_norm(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_groups);
+
+    GGML_API struct ggml_tensor * ggml_group_norm_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_groups);

    // a - x
    // b - dy
+    // TODO: update with configurable eps
    GGML_API struct ggml_tensor * ggml_rms_norm_back(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
@@ -951,11 +1035,22 @@ extern "C" {
            struct ggml_tensor  * a,
            struct ggml_tensor  * b);

+    // a -> b, in-place, return view(b)
+    GGML_API struct ggml_tensor * ggml_cpy_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
    // make contiguous
    GGML_API struct ggml_tensor * ggml_cont(
            struct ggml_context * ctx,
            struct ggml_tensor  * a);

+    // make contiguous, in-place
+    GGML_API struct ggml_tensor * ggml_cont_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
    // return view(a), b specifies the new shape
    // TODO: when we start computing gradient, make a copy instead of view
    GGML_API struct ggml_tensor * ggml_reshape(
@@ -1131,20 +1226,29 @@ extern "C" {
            int                   n_past,
            int                   n_dims,
            int                   mode,
+            int                   n_ctx,
            float                 freq_base,
-            float                 freq_scale,
-            int                   n_ctx);
+            float                 freq_scale);

-    // custom RoPE, in-place, returns view(a)
+    // in-place, returns view(a)
    GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            int                   n_past,
            int                   n_dims,
            int                   mode,
+            int                   n_ctx,
            float                 freq_base,
-            float                 freq_scale,
-            int                   n_ctx);
+            float                 freq_scale);
+
+    // xPos RoPE, in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_rope_xpos_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_past,
+            int                   n_dims,
+            float                 base,
+            bool                  down);

    // rotary position embedding backward, i.e compute dx from dy
    // a - dy
@@ -1153,7 +1257,12 @@ extern "C" {
            struct ggml_tensor  * a,
            int                   n_past,
            int                   n_dims,
-            int                   mode);
+            int                   mode,
+            int                   n_ctx,
+            float                 freq_base,
+            float                 freq_scale,
+            float                 xpos_base,
+            bool                  xpos_down);

    // alibi position embedding
    // in-place, returns view(a)
@@ -1180,6 +1289,15 @@ extern "C" {
            int                   p0,  // padding
            int                   d0); // dilation

+    // conv_1d with padding = half
+    // alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d)
+    GGML_API struct ggml_tensor* ggml_conv_1d_ph(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            int                   s,
+            int                   d);
+
    GGML_API struct ggml_tensor * ggml_conv_2d(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
@@ -1191,14 +1309,38 @@ extern "C" {
            int                   d0,
            int                   d1);

-    // conv_1d with padding = half
-    // alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d)
-    GGML_API struct ggml_tensor* ggml_conv_1d_ph(
+
+    // kernel size is a->ne[0] x a->ne[1]
+    // stride is equal to kernel size
+    // padding is zero
+    // example:
+    // a:     16   16    3  768
+    // b:   1024 1024    3    1
+    // res:   64   64  768    1
+    // used in sam
+    GGML_API struct ggml_tensor * ggml_conv_2d_sk_p0(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
+    // kernel size is a->ne[0] x a->ne[1]
+    // stride is 1
+    // padding is half
+    // example:
+    // a:      3    3    256  256
+    // b:     64   64    256    1
+    // res:   64   64    256    1
+    // used in sam
+    GGML_API struct ggml_tensor * ggml_conv_2d_s1_ph(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
+    GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            struct ggml_tensor  * b,
-            int                   s,
-            int                   d);
+            int                   stride);

    enum ggml_op_pool {
        GGML_OP_POOL_MAX,
@@ -1206,7 +1348,7 @@ extern "C" {
        GGML_OP_POOL_COUNT,
    };

-    GGML_API struct ggml_tensor* ggml_pool_1d(
+    GGML_API struct ggml_tensor * ggml_pool_1d(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            enum ggml_op_pool     op,
@@ -1214,7 +1356,7 @@ extern "C" {
            int                   s0, // stride
            int                   p0); // padding

-    GGML_API struct ggml_tensor* ggml_pool_2d(
+    GGML_API struct ggml_tensor * ggml_pool_2d(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            enum ggml_op_pool     op,
@@ -1225,6 +1367,13 @@ extern "C" {
            int                   p0,
            int                   p1);

+    // nearest interpolate
+    // used in stable-diffusion
+    GGML_API struct ggml_tensor * ggml_upscale(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   scale_factor);
+
    GGML_API struct ggml_tensor * ggml_flash_attn(
            struct ggml_context * ctx,
            struct ggml_tensor  * q,
@@ -1268,6 +1417,37 @@ extern "C" {
            int                   h0,
            int                   w);

+    GGML_API struct ggml_tensor * ggml_unary(
+            struct ggml_context * ctx,
+             struct ggml_tensor * a,
+             enum ggml_unary_op op);
+
+    GGML_API struct ggml_tensor * ggml_unary_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        enum ggml_unary_op op);
+
+    // used in sam
+    GGML_API struct ggml_tensor * ggml_get_rel_pos(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   qh,
+            int                   kh);
+
+    // used in sam
+
+    GGML_API struct ggml_tensor * ggml_add_rel_pos(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * pw,
+            struct ggml_tensor  * ph);
+
+    GGML_API struct ggml_tensor * ggml_add_rel_pos_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * pw,
+            struct ggml_tensor  * ph);
+
    // custom operators

    typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *);
@@ -1277,63 +1457,129 @@ extern "C" {
    typedef void (*ggml_custom2_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
    typedef void (*ggml_custom3_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);

-    GGML_API struct ggml_tensor * ggml_map_unary_f32(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_unary_f32(
            struct ggml_context        * ctx,
            struct ggml_tensor         * a,
-                   ggml_unary_op_f32_t   fun);
+                   ggml_unary_op_f32_t   fun),
+        "use ggml_map_custom1 instead");

-    GGML_API struct ggml_tensor * ggml_map_unary_inplace_f32(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_unary_inplace_f32(
            struct ggml_context        * ctx,
            struct ggml_tensor         * a,
-                   ggml_unary_op_f32_t   fun);
+                   ggml_unary_op_f32_t   fun),
+        "use ggml_map_custom1_inplace instead");

-    GGML_API struct ggml_tensor * ggml_map_binary_f32(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_binary_f32(
            struct ggml_context         * ctx,
            struct ggml_tensor          * a,
            struct ggml_tensor          * b,
-                   ggml_binary_op_f32_t   fun);
+                   ggml_binary_op_f32_t   fun),
+        "use ggml_map_custom2 instead");

-    GGML_API struct ggml_tensor * ggml_map_binary_inplace_f32(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_binary_inplace_f32(
            struct ggml_context         * ctx,
            struct ggml_tensor          * a,
            struct ggml_tensor          * b,
-                   ggml_binary_op_f32_t   fun);
+                   ggml_binary_op_f32_t   fun),
+        "use ggml_map_custom2_inplace instead");

-    GGML_API struct ggml_tensor * ggml_map_custom1_f32(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom1_f32(
            struct ggml_context          * ctx,
            struct ggml_tensor           * a,
-                   ggml_custom1_op_f32_t   fun);
+                   ggml_custom1_op_f32_t   fun),
+        "use ggml_map_custom1 instead");

-    GGML_API struct ggml_tensor * ggml_map_custom1_inplace_f32(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom1_inplace_f32(
            struct ggml_context          * ctx,
            struct ggml_tensor           * a,
-                   ggml_custom1_op_f32_t   fun);
+                   ggml_custom1_op_f32_t   fun),
+        "use ggml_map_custom1_inplace instead");

-    GGML_API struct ggml_tensor * ggml_map_custom2_f32(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom2_f32(
            struct ggml_context          * ctx,
            struct ggml_tensor           * a,
            struct ggml_tensor           * b,
-                   ggml_custom2_op_f32_t   fun);
+                   ggml_custom2_op_f32_t   fun),
+        "use ggml_map_custom2 instead");

-    GGML_API struct ggml_tensor * ggml_map_custom2_inplace_f32(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom2_inplace_f32(
            struct ggml_context          * ctx,
            struct ggml_tensor           * a,
            struct ggml_tensor           * b,
-                   ggml_custom2_op_f32_t   fun);
+                   ggml_custom2_op_f32_t   fun),
+        "use ggml_map_custom2_inplace instead");

-    GGML_API struct ggml_tensor * ggml_map_custom3_f32(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom3_f32(
            struct ggml_context          * ctx,
            struct ggml_tensor           * a,
            struct ggml_tensor           * b,
            struct ggml_tensor           * c,
-                   ggml_custom3_op_f32_t   fun);
+                   ggml_custom3_op_f32_t   fun),
+        "use ggml_map_custom3 instead");

-    GGML_API struct ggml_tensor * ggml_map_custom3_inplace_f32(
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom3_inplace_f32(
            struct ggml_context          * ctx,
            struct ggml_tensor           * a,
            struct ggml_tensor           * b,
            struct ggml_tensor           * c,
-                   ggml_custom3_op_f32_t   fun);
+                   ggml_custom3_op_f32_t   fun),
+        "use ggml_map_custom3_inplace instead");
+
+    // custom operators v2
+
+    typedef void (*ggml_custom1_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, int ith, int nth, void * userdata);
+    typedef void (*ggml_custom2_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata);
+    typedef void (*ggml_custom3_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata);
+
+    #define GGML_N_TASKS_MAX -1
+
+    GGML_API struct ggml_tensor * ggml_map_custom1(
+            struct ggml_context   * ctx,
+            struct ggml_tensor    * a,
+            ggml_custom1_op_t       fun,
+            int                     n_tasks,
+            void                  * userdata);
+
+    GGML_API struct ggml_tensor * ggml_map_custom1_inplace(
+            struct ggml_context   * ctx,
+            struct ggml_tensor    * a,
+            ggml_custom1_op_t       fun,
+            int                     n_tasks,
+            void                  * userdata);
+
+    GGML_API struct ggml_tensor * ggml_map_custom2(
+            struct ggml_context   * ctx,
+            struct ggml_tensor    * a,
+            struct ggml_tensor    * b,
+            ggml_custom2_op_t       fun,
+            int                     n_tasks,
+            void                  * userdata);
+
+    GGML_API struct ggml_tensor * ggml_map_custom2_inplace(
+            struct ggml_context   * ctx,
+            struct ggml_tensor    * a,
+            struct ggml_tensor    * b,
+            ggml_custom2_op_t       fun,
+            int                     n_tasks,
+            void                  * userdata);
+
+    GGML_API struct ggml_tensor * ggml_map_custom3(
+            struct ggml_context   * ctx,
+            struct ggml_tensor    * a,
+            struct ggml_tensor    * b,
+            struct ggml_tensor    * c,
+            ggml_custom3_op_t       fun,
+            int                     n_tasks,
+            void                  * userdata);
+
+    GGML_API struct ggml_tensor * ggml_map_custom3_inplace(
+            struct ggml_context   * ctx,
+            struct ggml_tensor    * a,
+            struct ggml_tensor    * b,
+            struct ggml_tensor    * c,
+            ggml_custom3_op_t       fun,
+            int                     n_tasks,
+            void                  * userdata);

    // loss function

@@ -1356,12 +1602,16 @@ extern "C" {
            struct ggml_context * ctx,
            struct ggml_tensor  * tensor);

+
    GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);

    GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
    GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);

-    GGML_API void ggml_graph_close  (struct ggml_cgraph * cgraph);
+    // graph allocation in a context
+    GGML_API struct ggml_cgraph * ggml_new_graph        (struct ggml_context * ctx);
+    GGML_API struct ggml_cgraph * ggml_build_forward_ctx(struct ggml_context * ctx, struct ggml_tensor * tensor);
+    GGML_API size_t ggml_graph_overhead(void);

    // ggml_graph_plan() has to be called before ggml_graph_compute()
    // when plan.work_size > 0, caller must allocate memory for plan.work_data
@@ -1561,6 +1811,118 @@ extern "C" {

    GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);

+    //
+    // gguf
+    //
+
+    enum gguf_type {
+        GGUF_TYPE_UINT8   = 0,
+        GGUF_TYPE_INT8    = 1,
+        GGUF_TYPE_UINT16  = 2,
+        GGUF_TYPE_INT16   = 3,
+        GGUF_TYPE_UINT32  = 4,
+        GGUF_TYPE_INT32   = 5,
+        GGUF_TYPE_FLOAT32 = 6,
+        GGUF_TYPE_BOOL    = 7,
+        GGUF_TYPE_STRING  = 8,
+        GGUF_TYPE_ARRAY   = 9,
+        GGUF_TYPE_COUNT,       // marks the end of the enum
+    };
+
+    struct gguf_context;
+
+    struct gguf_init_params {
+        bool no_alloc;
+
+        // if not NULL, create a ggml_context and allocate the tensor data in it
+        struct ggml_context ** ctx;
+    };
+
+    GGML_API struct gguf_context * gguf_init_empty(void);
+    GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params);
+    //GGML_API struct gguf_context * gguf_init_from_buffer(..);
+
+    GGML_API void gguf_free(struct gguf_context * ctx);
+
+    GGML_API const char * gguf_type_name(enum gguf_type type);
+
+    GGML_API int    gguf_get_version    (struct gguf_context * ctx);
+    GGML_API size_t gguf_get_alignment  (struct gguf_context * ctx);
+    GGML_API size_t gguf_get_data_offset(struct gguf_context * ctx);
+    GGML_API void * gguf_get_data       (struct gguf_context * ctx);
+
+    GGML_API int          gguf_get_n_kv(struct gguf_context * ctx);
+    GGML_API int          gguf_find_key(struct gguf_context * ctx, const char * key);
+    GGML_API const char * gguf_get_key (struct gguf_context * ctx, int i);
+
+    GGML_API enum gguf_type gguf_get_kv_type (struct gguf_context * ctx, int i);
+    GGML_API enum gguf_type gguf_get_arr_type(struct gguf_context * ctx, int i);
+
+    // results are undefined if the wrong type is used for the key
+    GGML_API uint8_t      gguf_get_val_u8  (struct gguf_context * ctx, int i);
+    GGML_API int8_t       gguf_get_val_i8  (struct gguf_context * ctx, int i);
+    GGML_API uint16_t     gguf_get_val_u16 (struct gguf_context * ctx, int i);
+    GGML_API int16_t      gguf_get_val_i16 (struct gguf_context * ctx, int i);
+    GGML_API uint32_t     gguf_get_val_u32 (struct gguf_context * ctx, int i);
+    GGML_API int32_t      gguf_get_val_i32 (struct gguf_context * ctx, int i);
+    GGML_API float        gguf_get_val_f32 (struct gguf_context * ctx, int i);
+    GGML_API bool         gguf_get_val_bool(struct gguf_context * ctx, int i);
+    GGML_API const char * gguf_get_val_str (struct gguf_context * ctx, int i);
+    GGML_API int          gguf_get_arr_n   (struct gguf_context * ctx, int i);
+    GGML_API const void * gguf_get_arr_data(struct gguf_context * ctx, int i);
+    GGML_API const char * gguf_get_arr_str (struct gguf_context * ctx, int key_id, int i);
+
+    GGML_API int    gguf_get_n_tensors    (struct gguf_context * ctx);
+    GGML_API int    gguf_find_tensor      (struct gguf_context * ctx, const char * name);
+    GGML_API size_t gguf_get_tensor_offset(struct gguf_context * ctx, int i);
+    GGML_API char * gguf_get_tensor_name  (struct gguf_context * ctx, int i);
+
+    // overrides existing values or adds a new one
+    GGML_API void gguf_set_val_u8  (struct gguf_context * ctx, const char * key, uint8_t  val);
+    GGML_API void gguf_set_val_i8  (struct gguf_context * ctx, const char * key, int8_t   val);
+    GGML_API void gguf_set_val_u16 (struct gguf_context * ctx, const char * key, uint16_t val);
+    GGML_API void gguf_set_val_i16 (struct gguf_context * ctx, const char * key, int16_t  val);
+    GGML_API void gguf_set_val_u32 (struct gguf_context * ctx, const char * key, uint32_t val);
+    GGML_API void gguf_set_val_i32 (struct gguf_context * ctx, const char * key, int32_t  val);
+    GGML_API void gguf_set_val_f32 (struct gguf_context * ctx, const char * key, float    val);
+    GGML_API void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool     val);
+    GGML_API void gguf_set_val_str (struct gguf_context * ctx, const char * key, const char * val);
+    GGML_API void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n);
+    GGML_API void gguf_set_arr_str (struct gguf_context * ctx, const char * key, const char ** data, int n);
+
+    // set or add KV pairs from another context
+    GGML_API void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src);
+
+    // manage tensor info
+    GGML_API void gguf_add_tensor(struct gguf_context * ctx, const struct ggml_tensor * tensor);
+    GGML_API void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type);
+    GGML_API void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size);
+
+    // writing gguf files can be done in 2 ways:
+    //
+    // - write the entire gguf_context to a binary file in a single pass:
+    //
+    //   gguf_write_to_file(ctx, fname);
+    //
+    // - first prepare a file with a placeholder for the meta data, write the tensor data, then write the meta data:
+    //
+    //   FILE * f = fopen(fname, "wb");
+    //   fseek(f, gguf_get_meta_size(ctx), SEEK_SET);
+    //   fwrite(f, ...);
+    //   void * data = gguf_meta_get_meta_data(ctx);
+    //   fseek(f, 0, SEEK_SET);
+    //   fwrite(f, data, gguf_get_meta_size(ctx));
+    //   free(data);
+    //   fclose(f);
+    //
+
+    // write the entire context to a binary file
+    GGML_API void gguf_write_to_file(struct gguf_context * ctx, const char * fname, bool only_meta);
+
+    // get the size in bytes of the meta data (header, kv pairs, tensor info) including padding
+    GGML_API size_t gguf_get_meta_size(struct gguf_context * ctx);
+    GGML_API void   gguf_get_meta_data(struct gguf_context * ctx, void * data);
+
    //
    // system info
    //
@@ -1577,8 +1939,9 @@ extern "C" {
    GGML_API int ggml_cpu_has_fp16_va    (void);
    GGML_API int ggml_cpu_has_wasm_simd  (void);
    GGML_API int ggml_cpu_has_blas       (void);
-    GGML_API int ggml_cpu_has_cuda       (void);
+    GGML_API int ggml_cpu_has_cublas     (void);
    GGML_API int ggml_cpu_has_clblast    (void);
+    GGML_API int ggml_cpu_has_gpublas    (void);
    GGML_API int ggml_cpu_has_sse3       (void);
    GGML_API int ggml_cpu_has_vsx        (void);

@@ -1597,6 +1960,10 @@ extern "C" {
    typedef void (*ggml_vec_dot_t)   (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);

    typedef struct {
+        const char      * type_name;
+        int               blck_size;
+        size_t            type_size;
+        bool              is_quantized;
        ggml_to_float_t   to_float;
        ggml_from_float_t from_float;
        ggml_from_float_t from_float_reference;
@@ -1604,11 +1971,8 @@ extern "C" {
        enum ggml_type    vec_dot_type;
    } ggml_type_traits_t;

-    ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type i);
+    ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);

 #ifdef  __cplusplus
 }
 #endif
-
-
-#include "ggml-backend.h"
--- a/gguf.py
+++ b/gguf.py
@@ -0,0 +1,722 @@
+import shutil
+import sys
+import struct
+import tempfile
+import numpy as np
+
+from enum import IntEnum, auto
+from typing import Any, IO, List, Optional
+
+#
+# constants
+#
+
+GGUF_MAGIC             = 0x46554747
+GGUF_VERSION           = 1
+GGUF_DEFAULT_ALIGNMENT = 32
+
+# general
+KEY_GENERAL_ARCHITECTURE         = "general.architecture"
+KEY_GENERAL_QUANTIZATION_VERSION = "general.quantization_version"
+KEY_GENERAL_ALIGNMENT            = "general.alignment"
+KEY_GENERAL_NAME                 = "general.name"
+KEY_GENERAL_AUTHOR               = "general.author"
+KEY_GENERAL_URL                  = "general.url"
+KEY_GENERAL_DESCRIPTION          = "general.description"
+KEY_GENERAL_LICENSE              = "general.license"
+KEY_GENERAL_SOURCE_URL           = "general.source.url"
+KEY_GENERAL_SOURCE_HF_REPO       = "general.source.hugginface.repository"
+KEY_GENERAL_FILE_TYPE            = "general.file_type"
+
+# LLM
+KEY_LLM_CONTEXT_LENGTH        = "{arch}.context_length"
+KEY_LLM_EMBEDDING_LENGTH      = "{arch}.embedding_length"
+KEY_LLM_BLOCK_COUNT           = "{arch}.block_count"
+KEY_LLM_FEED_FORWARD_LENGTH   = "{arch}.feed_forward_length"
+KEY_LLM_USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual"
+KEY_LLM_TENSOR_DATA_LAYOUT    = "{arch}.tensor_data_layout"
+
+# attention
+KEY_ATTENTION_HEAD_COUNT        = "{arch}.attention.head_count"
+KEY_ATTENTION_HEAD_COUNT_KV     = "{arch}.attention.head_count_kv"
+KEY_ATTENTION_MAX_ALIBI_BIAS    = "{arch}.attention.max_alibi_bias"
+KEY_ATTENTION_CLAMP_KQV         = "{arch}.attention.clamp_kqv"
+KEY_ATTENTION_LAYERNORM_EPS     = "{arch}.attention.layer_norm_epsilon"
+KEY_ATTENTION_LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon"
+
+# RoPE
+KEY_ROPE_DIMENSION_COUNT = "{arch}.rope.dimension_count"
+KEY_ROPE_SCALE_LINEAR    = "{arch}.rope.scale_linear"
+
+# tokenization
+KEY_TOKENIZER_MODEL      = "tokenizer.ggml.model"
+KEY_TOKENIZER_LIST       = "tokenizer.ggml.tokens"
+KEY_TOKENIZER_TOKEN_TYPE = "tokenizer.ggml.token_type"
+KEY_TOKENIZER_SCORES     = "tokenizer.ggml.scores"
+KEY_TOKENIZER_MERGES     = "tokenizer.ggml.merges"
+KEY_TOKENIZER_BOS_ID     = "tokenizer.ggml.bos_token_id"
+KEY_TOKENIZER_EOS_ID     = "tokenizer.ggml.eos_token_id"
+KEY_TOKENIZER_UNK_ID     = "tokenizer.ggml.unknown_token_id"
+KEY_TOKENIZER_SEP_ID     = "tokenizer.ggml.seperator_token_id"
+KEY_TOKENIZER_PAD_ID     = "tokenizer.ggml.padding_token_id"
+KEY_TOKENIZER_HF_JSON    = "tokenizer.huggingface.json"
+KEY_TOKENIZER_RWKV       = "tokenizer.rwkv.world"
+
+
+#
+# recommended mapping of model tensor names for storage in gguf
+#
+
+
+class MODEL_ARCH(IntEnum):
+    LLAMA   = auto()
+    FALCON  = auto()
+    GPT2    = auto()
+    GPTJ    = auto()
+    GPTNEOX = auto()
+    MPT     = auto()
+
+
+class MODEL_TENSOR(IntEnum):
+    TOKEN_EMBD    = auto()
+    POS_EMBD      = auto()
+    OUTPUT        = auto()
+    OUTPUT_NORM   = auto()
+    ROPE_FREQS    = auto()
+    ATTN_Q        = auto()
+    ATTN_K        = auto()
+    ATTN_V        = auto()
+    ATTN_QKV      = auto()
+    ATTN_OUT      = auto()
+    ATTN_NORM     = auto()
+    ATTN_NORM_2   = auto()
+    ATTN_ROT_EMBD = auto()
+    FFN_GATE      = auto()
+    FFN_DOWN      = auto()
+    FFN_UP        = auto()
+    FFN_NORM      = auto()
+
+
+MODEL_ARCH_NAMES = {
+    MODEL_ARCH.LLAMA:   "llama",
+    MODEL_ARCH.FALCON:  "falcon",
+    MODEL_ARCH.GPT2:    "gpt2",
+    MODEL_ARCH.GPTJ:    "gptj",
+    MODEL_ARCH.GPTNEOX: "gptneox",
+    MODEL_ARCH.MPT:     "mpt",
+}
+
+MODEL_TENSOR_NAMES = {
+    MODEL_ARCH.LLAMA: {
+        MODEL_TENSOR.TOKEN_EMBD:    "token_embd",
+        MODEL_TENSOR.OUTPUT_NORM:   "output_norm",
+        MODEL_TENSOR.OUTPUT:        "output",
+        MODEL_TENSOR.ROPE_FREQS:    "rope_freqs",
+        MODEL_TENSOR.ATTN_NORM:     "blk.{bid}.attn_norm",
+        MODEL_TENSOR.ATTN_Q:        "blk.{bid}.attn_q",
+        MODEL_TENSOR.ATTN_K:        "blk.{bid}.attn_k",
+        MODEL_TENSOR.ATTN_V:        "blk.{bid}.attn_v",
+        MODEL_TENSOR.ATTN_OUT:      "blk.{bid}.attn_output",
+        MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
+        MODEL_TENSOR.FFN_NORM:      "blk.{bid}.ffn_norm",
+        MODEL_TENSOR.FFN_GATE:      "blk.{bid}.ffn_gate",
+        MODEL_TENSOR.FFN_DOWN:      "blk.{bid}.ffn_down",
+        MODEL_TENSOR.FFN_UP:        "blk.{bid}.ffn_up",
+    },
+    MODEL_ARCH.GPTNEOX: {
+        MODEL_TENSOR.TOKEN_EMBD:    "token_embd",
+        MODEL_TENSOR.OUTPUT_NORM:   "output_norm",
+        MODEL_TENSOR.OUTPUT:        "output",
+        MODEL_TENSOR.ATTN_NORM:     "blk.{bid}.attn_norm",
+        MODEL_TENSOR.ATTN_QKV:      "blk.{bid}.attn_qkv",
+        MODEL_TENSOR.ATTN_OUT:      "blk.{bid}.attn_output",
+        MODEL_TENSOR.FFN_NORM:      "blk.{bid}.ffn_norm",
+        MODEL_TENSOR.FFN_DOWN:      "blk.{bid}.ffn_down",
+        MODEL_TENSOR.FFN_UP:        "blk.{bid}.ffn_up",
+    },
+    MODEL_ARCH.FALCON: {
+        MODEL_TENSOR.TOKEN_EMBD:  "token_embd",
+        MODEL_TENSOR.OUTPUT_NORM: "output_norm",
+        MODEL_TENSOR.OUTPUT:      "output",
+        MODEL_TENSOR.ATTN_NORM:   "blk.{bid}.attn_norm",
+        MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2",
+        MODEL_TENSOR.ATTN_QKV:    "blk.{bid}.attn_qkv",
+        MODEL_TENSOR.ATTN_OUT:    "blk.{bid}.attn_output",
+        MODEL_TENSOR.FFN_DOWN:    "blk.{bid}.ffn_down",
+        MODEL_TENSOR.FFN_UP:      "blk.{bid}.ffn_up",
+    },
+    MODEL_ARCH.GPT2: {
+        # TODO
+    },
+    # TODO
+}
+
+# tensors that will not be serialized
+MODEL_TENSOR_SKIP = {
+    MODEL_ARCH.LLAMA: [
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+    ],
+}
+
+
+# TODO: the following helper functions should be removed
+#       instead, get_tensor_name_map should return tuples of (name, MODEL_TENSOR)
+#       however, my Python is very bad, and I couldn't figure out how to do this, hence these functions
+# REMOVE
+def should_skip_tensor_TMP(arch: MODEL_ARCH, n_blocks: int, name: str) -> bool:
+    for skip in MODEL_TENSOR_SKIP.get(arch, []):
+        for i in range(n_blocks):
+            if name == MODEL_TENSOR_NAMES[arch][skip].format(bid=i):
+                return True
+
+    return False
+
+
+def get_tensor_name_map(arch: MODEL_ARCH, n_blocks: int) -> dict:
+    tensor_map = {}
+
+    # Token embeddings
+    mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.TOKEN_EMBD, None)
+
+    tensor_map["gpt_neox.embed_in"]           = mapped_to  # gptneox
+    tensor_map["transformer.wte"]             = mapped_to  # gpt2 mpt
+    tensor_map["transformer.word_embeddings"] = mapped_to  # falcon
+    tensor_map["model.embed_tokens"]          = mapped_to  # llama-hf
+    tensor_map["tok_embeddings"]              = mapped_to  # llama-pth
+
+    # Position embeddings
+    mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.POS_EMBD, None)
+
+    tensor_map["transformer.wpe"] = mapped_to  # gpt2
+
+    # Output
+    mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.OUTPUT, None)
+
+    tensor_map["embed_out"] = mapped_to  # gptneox
+    tensor_map["lm_head"]   = mapped_to  # gpt2 mpt falcon llama-hf
+    tensor_map["output"]    = mapped_to  # llama-pth
+
+    # Output norm
+    mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.OUTPUT_NORM, None)
+
+    tensor_map["gpt_neox.final_layer_norm"] = mapped_to  # gptneox
+    tensor_map["transformer.ln_f"]          = mapped_to  # gpt2 falcon
+    tensor_map["transformer.norm_f"]        = mapped_to  # mpt
+    tensor_map["model.norm"]                = mapped_to  # llama-hf
+    tensor_map["norm"]                      = mapped_to  # llama-pth
+
+    # Rope frequencies
+    mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ROPE_FREQS, None)
+
+    tensor_map["rope.freqs"] = mapped_to  # llama-pth
+
+    # Attention and feed-forward blocks
+    for i in range(0, n_blocks):
+        # Attention norm
+        # TODO: is there are simpler way to write these 2 lines in Python?
+        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_NORM, None)
+        mapped_to = mapped_to.format(bid=i) if mapped_to else None
+
+        tensor_map["gpt_neox.layers."+str(i)+".input_layernorm"] = mapped_to  # gptneox
+        tensor_map["transformer.h."+str(i)+".ln_1"]              = mapped_to  # gpt2
+        tensor_map["transformer.blocks."+str(i)+".norm_1"]       = mapped_to  # mpt
+        tensor_map["transformer.h."+str(i)+".input_layernorm"]   = mapped_to  # falcon7b
+        tensor_map["transformer.h."+str(i)+".ln_mlp"]            = mapped_to  # falcon40b
+        tensor_map["model.layers."+str(i)+".input_layernorm"]    = mapped_to  # llama-hf
+        tensor_map["layers."+str(i)+".attention_norm"]           = mapped_to  # llama-pth
+
+        # Attention norm 2
+        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_NORM_2, None)
+        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
+
+        tensor_map["transformer.h."+str(i)+".ln_attn"] = mapped_to  # falcon40b
+
+        # Attention query-key-value
+        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_QKV, None)
+        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
+
+        tensor_map["gpt_neox.layers."+str(i)+".attention.query_key_value"]    = mapped_to  # gptneox
+        tensor_map["transformer.h."+str(i)+".attn.c_attn"]                    = mapped_to  # gpt2
+        tensor_map["transformer.blocks."+str(i)+".attn.Wqkv"]                 = mapped_to  # mpt
+        tensor_map["transformer.h."+str(i)+".self_attention.query_key_value"] = mapped_to  # falcon
+
+        # Attention query
+        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_Q, None)
+        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
+
+        tensor_map["model.layers."+str(i)+".self_attn.q_proj"] = mapped_to  # llama-hf
+        tensor_map["layers."+str(i)+".attention.wq"]           = mapped_to  # llama-pth
+
+        # Attention key
+        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_K, None)
+        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
+
+        tensor_map["model.layers."+str(i)+".self_attn.k_proj"] = mapped_to  # llama-hf
+        tensor_map["layers."+str(i)+".attention.wk"]           = mapped_to  # llama-pth
+
+        # Attention value
+        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_V, None)
+        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
+
+        tensor_map["model.layers."+str(i)+".self_attn.v_proj"] = mapped_to  # llama-hf
+        tensor_map["layers."+str(i)+".attention.wv"]           = mapped_to  # llama-pth
+
+        # Attention output
+        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_OUT, None)
+        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
+
+        tensor_map["gpt_neox.layers."+str(i)+".attention.dense"]    = mapped_to  # gptneox
+        tensor_map["transformer.h."+str(i)+".attn.c_proj"]          = mapped_to  # gpt2
+        tensor_map["transformer.blocks."+str(i)+".attn.out_proj"]   = mapped_to  # mpt
+        tensor_map["transformer.h."+str(i)+".self_attention.dense"] = mapped_to  # falcon
+        tensor_map["model.layers."+str(i)+".self_attn.o_proj"]      = mapped_to  # llama-hf
+        tensor_map["layers."+str(i)+".attention.wo"]                = mapped_to  # llama-pth
+
+        # Rotary embeddings
+        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_ROT_EMBD, None)
+        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
+
+        tensor_map["model.layers."+str(i)+".self_attn.rotary_emb.inv_freq"]  = mapped_to  # llama-hf
+        tensor_map["layers."+str(i)+".attention.inner_attention.rope.freqs"] = mapped_to  # llama-pth
+
+        # Feed-forward norm
+        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.FFN_NORM, None)
+        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
+
+        tensor_map["gpt_neox.layers."+str(i)+".post_attention_layernorm"] = mapped_to  # gptneox
+        tensor_map["transformer.h."+str(i)+".ln_2"]                       = mapped_to  # gpt2
+        tensor_map["transformer.blocks."+str(i)+".norm_2"]                = mapped_to  # mpt
+        tensor_map["model.layers."+str(i)+".post_attention_layernorm"]    = mapped_to  # llama-hf
+        tensor_map["layers."+str(i)+".ffn_norm"]                          = mapped_to  # llama-pth
+
+        # Feed-forward up
+        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.FFN_UP, None)
+        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
+
+        tensor_map["gpt_neox.layers."+str(i)+".mlp.dense_h_to_4h"] = mapped_to  # gptneox
+        tensor_map["transformer.h."+str(i)+".mlp.c_fc"]            = mapped_to  # gpt2
+        tensor_map["transformer.blocks."+str(i)+".ffn.up_proj"]    = mapped_to  # mpt
+        tensor_map["transformer.h."+str(i)+".mlp.dense_h_to_4h"]   = mapped_to  # falcon
+        tensor_map["model.layers."+str(i)+".mlp.up_proj"]          = mapped_to  # llama-hf
+        tensor_map["layers."+str(i)+".feed_forward.w3"]            = mapped_to  # llama-pth
+
+        # Feed-forward gate
+        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.FFN_GATE, None)
+        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
+
+        tensor_map["model.layers."+str(i)+".mlp.gate_proj"] = mapped_to  # llama-hf
+        tensor_map["layers."+str(i)+".feed_forward.w1"]     = mapped_to  # llama-pth
+
+        # Feed-forward down
+        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.FFN_DOWN, None)
+        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
+
+        tensor_map["gpt_neox.layers."+str(i)+".mlp.dense_4h_to_h"] = mapped_to  # gptneox
+        tensor_map["transformer.h."+str(i)+".mlp.c_proj"]          = mapped_to  # gpt2
+        tensor_map["transformer.blocks."+str(i)+".ffn.down_proj"]  = mapped_to  # mpt
+        tensor_map["transformer.h."+str(i)+".mlp.dense_4h_to_h"]   = mapped_to  # falcon
+        tensor_map["model.layers."+str(i)+".mlp.down_proj"]        = mapped_to  # llama-hf
+        tensor_map["layers."+str(i)+".feed_forward.w2"]            = mapped_to  # llama-pth
+
+    return tensor_map
+
+
+class TokenType(IntEnum):
+    NORMAL       = 1
+    UNKNOWN      = 2
+    CONTROL      = 3
+    USER_DEFINED = 4
+    UNUSED       = 5
+    BYTE         = 6
+
+#
+# implementation
+#
+
+
+class GGMLQuantizationType(IntEnum):
+    F32  = 0
+    F16  = 1
+    Q4_0 = 2
+    Q4_1 = 3
+    Q5_0 = 6
+    Q5_1 = 7
+    Q8_0 = 8
+    Q8_1 = 9
+    Q2_K = 10
+    Q3_K = 11
+    Q4_K = 12
+    Q5_K = 13
+    Q6_K = 14
+    Q8_K = 15
+
+
+class GGUFValueType(IntEnum):
+    UINT8   = 0
+    INT8    = 1
+    UINT16  = 2
+    INT16   = 3
+    UINT32  = 4
+    INT32   = 5
+    FLOAT32 = 6
+    BOOL    = 7
+    STRING  = 8
+    ARRAY   = 9
+
+    @staticmethod
+    def get_type(val):
+        if isinstance(val, str) or isinstance(val, bytes) or isinstance(val, bytearray):
+            return GGUFValueType.STRING
+        elif isinstance(val, list):
+            return GGUFValueType.ARRAY
+        elif isinstance(val, float):
+            return GGUFValueType.FLOAT32
+        elif isinstance(val, bool):
+            return GGUFValueType.BOOL
+        elif isinstance(val, int):
+            return GGUFValueType.INT32
+        else:
+            print("Unknown type: "+str(type(val)))
+            sys.exit()
+
+
+class GGUFWriter:
+    def __init__(self, path: str, arch: str, use_temp_file = True):
+        self.fout = open(path, "wb")
+        self.arch = arch
+        self.offset_tensor = 0
+        self.data_alignment = GGUF_DEFAULT_ALIGNMENT
+        self.kv_data = b""
+        self.kv_data_count = 0
+        self.ti_data = b""
+        self.ti_data_count = 0
+        self.add_architecture()
+        self.use_temp_file = use_temp_file
+        self.tensors = []
+
+    def write_header_to_file(self):
+        self.fout.write(struct.pack("<I", GGUF_MAGIC))
+        self.fout.write(struct.pack("<I", GGUF_VERSION))
+        self.fout.write(struct.pack("<I", self.ti_data_count))
+        self.fout.write(struct.pack("<I", self.kv_data_count))
+        self.flush()
+#        print("tensors " + str(self.ti_data_count) + " kv " + str(self.kv_data_count))
+
+    def write_kv_data_to_file(self):
+        self.fout.write(self.kv_data)
+        self.flush()
+
+    def write_ti_data_to_file(self):
+        self.fout.write(self.ti_data)
+        self.flush()
+
+    def add_key(self, key: str):
+        self.add_val(key, GGUFValueType.STRING, add_vtype=False)
+
+    def add_uint8(self, key: str, val: int):
+        self.add_key(key)
+        self.add_val(val, GGUFValueType.UINT8)
+
+    def add_int8(self, key: str, val: int):
+        self.add_key(key)
+        self.add_val(val, GGUFValueType.INT8)
+
+    def add_uint16(self, key: str, val: int):
+        self.add_key(key)
+        self.add_val(val, GGUFValueType.UINT16)
+
+    def add_int16(self, key: str, val: int):
+        self.add_key(key)
+        self.add_val(val, GGUFValueType.INT16)
+
+    def add_uint32(self, key: str, val: int):
+        self.add_key(key)
+        self.add_val(val, GGUFValueType.UINT32)
+
+    def add_int32(self, key: str, val: int):
+        self.add_key(key)
+        self.add_val(val, GGUFValueType.INT32)
+
+    def add_float32(self, key: str, val: float):
+        self.add_key(key)
+        self.add_val(val, GGUFValueType.FLOAT32)
+
+    def add_bool(self, key: str, val: bool):
+        self.add_key(key)
+        self.add_val(val, GGUFValueType.BOOL)
+
+    def add_string(self, key: str, val: str):
+        if len(val) == 0:
+            return
+        self.add_key(key)
+        self.add_val(val, GGUFValueType.STRING)
+
+    def add_array(self, key: str, val: list):
+        if not isinstance(val, list):
+            raise ValueError("Value must be a list for array type")
+
+        self.add_key(key)
+        self.add_val(val, GGUFValueType.ARRAY)
+
+    def add_val(self: str, val: Any, vtype: GGUFValueType = None, add_vtype: bool = True):
+        if vtype is None:
+            vtype = GGUFValueType.get_type(val)
+
+        if add_vtype:
+            self.kv_data += struct.pack("<I", vtype)
+            self.kv_data_count += 1
+
+        if vtype == GGUFValueType.UINT8:
+            self.kv_data += struct.pack("<B", val)
+        elif vtype == GGUFValueType.INT8:
+            self.kv_data += struct.pack("<b", val)
+        elif vtype == GGUFValueType.UINT16:
+            self.kv_data += struct.pack("<H", val)
+        elif vtype == GGUFValueType.INT16:
+            self.kv_data += struct.pack("<h", val)
+        elif vtype == GGUFValueType.UINT32:
+            self.kv_data += struct.pack("<I", val)
+        elif vtype == GGUFValueType.INT32:
+            self.kv_data += struct.pack("<i", val)
+        elif vtype == GGUFValueType.FLOAT32:
+            self.kv_data += struct.pack("<f", val)
+        elif vtype == GGUFValueType.BOOL:
+            self.kv_data += struct.pack("?", val)
+        elif vtype == GGUFValueType.STRING:
+            encoded_val = val.encode("utf8") if isinstance(val, str) else val
+            self.kv_data += struct.pack("<I", len(encoded_val))
+            self.kv_data += encoded_val
+        elif vtype == GGUFValueType.ARRAY:
+            ltype = set([GGUFValueType.get_type(item) for item in val])
+            assert len(ltype) == 1, "All items in a GGUF array should be of the same type"
+            self.kv_data += struct.pack("<I", list(ltype)[0])
+            self.kv_data += struct.pack("<I", len(val))
+            for item in val:
+                self.add_val(item, add_vtype=False)
+        else:
+            raise ValueError("Invalid GGUF metadata value type")
+
+    @staticmethod
+    def ggml_pad(x: int, n: int) -> int:
+        return ((x + n - 1) // n) * n
+
+    def add_tensor_info(self, name: str, tensor_shape: np.ndarray, tensor_dtype: np.dtype, tensor_nbytes: int, raw_dtype: Optional[GGMLQuantizationType] = None):
+        assert raw_dtype is not None or tensor_dtype in (np.float32, np.float16), "Only F32 and F16 tensors are supported for now"
+
+        encoded_name = name.encode("utf8")
+        self.ti_data += struct.pack("<I", len(encoded_name))
+        self.ti_data += encoded_name
+        n_dims = len(tensor_shape)
+        self.ti_data += struct.pack("<I", n_dims)
+        for i in range(n_dims):
+            self.ti_data += struct.pack("<I", tensor_shape[n_dims - 1 - i])
+        if raw_dtype is None:
+            dtype = GGMLQuantizationType.F32 if tensor_dtype == np.float32 else GGMLQuantizationType.F16
+        else:
+            dtype = raw_dtype
+        self.ti_data += struct.pack("<I", dtype)
+        self.ti_data += struct.pack("<Q", self.offset_tensor)
+        self.offset_tensor += GGUFWriter.ggml_pad(tensor_nbytes, self.data_alignment)
+        self.ti_data_count += 1
+
+    def add_tensor(self, name: str, tensor: np.ndarray, raw_shape: Optional[np.ndarray] = None, raw_dtype: Optional[GGMLQuantizationType] = None):
+        if self.use_temp_file and not hasattr(self, "temp_file"):
+            self.temp_file = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256*1024*1024)
+            self.temp_file.seek(0)
+
+        self.add_tensor_info(name, raw_shape if raw_shape is not None else tensor.shape, tensor.dtype, tensor.nbytes, raw_dtype = raw_dtype)
+
+        pad = GGUFWriter.ggml_pad(tensor.nbytes, self.data_alignment) - tensor.nbytes
+
+        if not self.use_temp_file:
+            self.tensors.append((tensor, pad))
+            return
+
+        tensor.tofile(self.temp_file)
+
+        if pad != 0:
+            self.temp_file.write(bytes([0] * pad))
+
+    def write_tensor_data(self, tensor: np.ndarray):
+        pad = GGUFWriter.ggml_pad(self.fout.tell(), self.data_alignment) - self.fout.tell()
+        if pad != 0:
+            self.fout.write(bytes([0] * pad))
+
+        tensor.tofile(self.fout)
+
+        pad = GGUFWriter.ggml_pad(tensor.nbytes, self.data_alignment) - tensor.nbytes
+        if pad != 0:
+            self.fout.write(bytes([0] * pad))
+
+    def write_tensors_to_file(self):
+        self.write_ti_data_to_file()
+
+        pad = GGUFWriter.ggml_pad(self.fout.tell(), self.data_alignment) - self.fout.tell()
+        if pad != 0:
+            self.fout.write(bytes([0] * pad))
+
+        if not self.use_temp_file:
+            for (currtensor, currpad) in self.tensors:
+                currtensor.tofile(self.fout)
+                if currpad != 0:
+                    self.fout.write(bytes([0] * currpad))
+            return
+
+        self.temp_file.seek(0)
+
+        shutil.copyfileobj(self.temp_file, self.fout)
+        self.flush()
+        self.temp_file.close()
+
+    def flush(self):
+        self.fout.flush()
+
+    def close(self):
+        self.fout.close()
+
+    def add_architecture(self):
+        self.add_string(KEY_GENERAL_ARCHITECTURE, self.arch)
+
+    def add_author(self, author: str):
+        self.add_string(KEY_GENERAL_AUTHOR, author)
+
+    def add_tensor_data_layout(self, layout: str):
+        self.add_string(KEY_LLM_TENSOR_DATA_LAYOUT.format(arch=self.arch), layout)
+
+    def add_url(self, url: str):
+        self.add_string(KEY_GENERAL_URL, url)
+
+    def add_description(self, description: str):
+        self.add_string(KEY_GENERAL_DESCRIPTION, description)
+
+    def add_source_url(self, url: str):
+        self.add_string(KEY_GENERAL_SOURCE_URL, url)
+
+    def add_source_hf_repo(self, repo: str):
+        self.add_string(KEY_GENERAL_SOURCE_HF_REPO, repo)
+
+    def add_file_type(self, ftype: int):
+        self.add_uint32(KEY_GENERAL_FILE_TYPE, ftype)
+
+    def add_name(self, name: str):
+        self.add_string(KEY_GENERAL_NAME, name)
+
+    def add_quantization_version(self, quantization_version: GGMLQuantizationType):
+        self.add_uint32(
+            KEY_GENERAL_QUANTIZATION_VERSION, quantization_version)
+
+    def add_custom_alignment(self, alignment: int):
+        self.data_alignment = alignment
+        self.add_uint32(KEY_GENERAL_ALIGNMENT, alignment)
+
+    def add_context_length(self, length: int):
+        self.add_uint32(
+            KEY_LLM_CONTEXT_LENGTH.format(arch=self.arch), length)
+
+    def add_embedding_length(self, length: int):
+        self.add_uint32(
+            KEY_LLM_EMBEDDING_LENGTH.format(arch=self.arch), length)
+
+    def add_block_count(self, length: int):
+        self.add_uint32(
+            KEY_LLM_BLOCK_COUNT.format(arch=self.arch), length)
+
+    def add_feed_forward_length(self, length: int):
+        self.add_uint32(
+            KEY_LLM_FEED_FORWARD_LENGTH.format(arch=self.arch), length)
+
+    def add_parallel_residual(self, use: bool):
+        self.add_bool(
+            KEY_LLM_USE_PARALLEL_RESIDUAL.format(arch=self.arch), use)
+
+    def add_tensor_data_layout(self, layout: str):
+        self.add_string(
+            KEY_LLM_TENSOR_DATA_LAYOUT.format(arch=self.arch), layout)
+
+    def add_head_count(self, count: int):
+        self.add_uint32(
+            KEY_ATTENTION_HEAD_COUNT.format(arch=self.arch), count)
+
+    def add_head_count_kv(self, count: int):
+        self.add_uint32(
+            KEY_ATTENTION_HEAD_COUNT_KV.format(arch=self.arch), count)
+
+    def add_max_alibi_bias(self, bias: float):
+        self.add_float32(
+            KEY_ATTENTION_MAX_ALIBI_BIAS.format(arch=self.arch), bias)
+
+    def add_clamp_kqv(self, value: float):
+        self.add_float32(
+            KEY_ATTENTION_CLAMP_KQV.format(arch=self.arch), value)
+
+    def add_layer_norm_eps(self, value: float):
+        self.add_float32(
+            KEY_ATTENTION_LAYERNORM_EPS.format(arch=self.arch), value)
+
+    def add_layer_norm_rms_eps(self, value: float):
+        self.add_float32(
+            KEY_ATTENTION_LAYERNORM_RMS_EPS.format(arch=self.arch), value)
+
+    def add_rope_dimension_count(self, count: int):
+        self.add_uint32(
+            KEY_ROPE_DIMENSION_COUNT.format(arch=self.arch), count)
+
+    def add_rope_scale_linear(self, value:  float):
+        self.add_float32(KEY_ROPE_SCALE_LINEAR.format(arch=self.arch), value)
+
+    def add_tokenizer_model(self, model: str):
+        self.add_string(KEY_TOKENIZER_MODEL, model)
+
+    def add_token_list(self, tokens: List):
+        self.add_array(KEY_TOKENIZER_LIST, tokens)
+
+    def add_token_merges(self, merges: List):
+        self.add_array(KEY_TOKENIZER_MERGES, merges)
+
+    def add_token_types(self, types: List[int]):
+        self.add_array(KEY_TOKENIZER_TOKEN_TYPE, types)
+
+    def add_token_scores(self, scores: List[float]):
+        self.add_array(KEY_TOKENIZER_SCORES, scores)
+
+    def add_bos_token_id(self, id: int):
+        self.add_uint32(KEY_TOKENIZER_BOS_ID, id)
+
+    def add_eos_token_id(self, id: int):
+        self.add_uint32(KEY_TOKENIZER_EOS_ID, id)
+
+    def add_unk_token_id(self, id: int):
+        self.add_uint32(KEY_TOKENIZER_UNK_ID, id)
+
+    def add_sep_token_id(self, id: int):
+        self.add_uint32(KEY_TOKENIZER_SEP_ID, id)
+
+    def add_pad_token_id(self, id: int):
+        self.add_uint32(KEY_TOKENIZER_PAD_ID, id)
+
+
+# Example usage:
+if __name__ == "__main__":
+    # Example usage with a file
+    gguf_writer = GGUFWriter("example.gguf", "llama")
+
+    gguf_writer.add_architecture()
+    gguf_writer.add_block_count(12)
+    gguf_writer.add_uint32("answer", 42)  # Write a 32-bit integer
+    gguf_writer.add_float32("answer_in_float", 42.0)  # Write a 32-bit float
+    gguf_writer.add_custom_alignment(64)
+
+    tensor1 = np.ones((32,), dtype=np.float32) * 100.0
+    tensor2 = np.ones((64,), dtype=np.float32) * 101.0
+    tensor3 = np.ones((96,), dtype=np.float32) * 102.0
+
+    gguf_writer.add_tensor("tensor1", tensor1)
+    gguf_writer.add_tensor("tensor2", tensor2)
+    gguf_writer.add_tensor("tensor3", tensor3)
+
+    gguf_writer.write_header_to_file()
+    gguf_writer.write_kv_data_to_file()
+    gguf_writer.write_tensors_to_file()
+
+    gguf_writer.close()
--- a/grammars/README.md
+++ b/grammars/README.md
@@ -0,0 +1,91 @@
+# GBNF Guide
+
+GBNF (GGML BNF) is a format for defining [formal grammars](https://en.wikipedia.org/wiki/Formal_grammar) to constrain model outputs in `llama.cpp`. For example, you can use it to force the model to generate valid JSON, or speak only in emojis. GBNF grammars are supported in various ways in `examples/main` and `examples/server`.
+
+## Background
+
+[Bakus-Naur Form (BNF)](https://en.wikipedia.org/wiki/Backus%E2%80%93Naur_form) is a notation for describing the syntax of formal languages like programming languages, file formats, and protocols. GBNF is an extension of BNF that primarily adds a few modern regex-like features.
+
+## Basics
+
+In GBNF, we define *production rules* that specify how a *non-terminal* (rule name) can be replaced with sequences of *terminals* (characters, specifically Unicode [code points](https://en.wikipedia.org/wiki/Code_point)) and other non-terminals. The basic format of a production rule is `nonterminal ::= sequence...`.
+
+## Example
+
+Before going deeper, let's look at some of the features demonstrated in `grammars/chess.gbnf`, a small chess notation grammar:
+```
+# `root` specifies the pattern for the overall output
+root ::= (
+    # it must start with the characters "1. " followed by a sequence
+    # of characters that match the `move` rule, followed by a space, followed
+    # by another move, and then a newline
+    "1. " move " " move "\n"
+
+    # it's followed by one or more subsequent moves, numbered with one or two digits
+    ([1-9] [0-9]? ". " move " " move "\n")+
+)
+
+# `move` is an abstract representation, which can be a pawn, nonpawn, or castle.
+# The `[+#]?` denotes the possibility of checking or mate signs after moves
+move ::= (pawn | nonpawn | castle) [+#]?
+
+pawn ::= ...
+nonpawn ::= ...
+castle ::= ...
+```
+
+## Non-Terminals and Terminals
+
+Non-terminal symbols (rule names) stand for a pattern of terminals and other non-terminals. They are required to be a dashed lowercase word, like `move`, `castle`, or `check-mate`.
+
+Terminals are actual characters ([code points](https://en.wikipedia.org/wiki/Code_point)). They can be specified as a sequence like `"1"` or `"O-O"` or as ranges like `[1-9]` or `[NBKQR]`.
+
+## Characters and character ranges
+
+Terminals support the full range of Unicode. Unicode characters can be specified directly in the grammar, for example `hiragana ::= [ぁ-ゟ]`, or with escapes: 8-bit (`\xXX`), 16-bit (`\uXXXX`) or 32-bit (`\UXXXXXXXX`).
+
+Character ranges can be negated with `^`:
+```
+single-line ::= [^\n]+ "\n"`
+```
+
+## Sequences and Alternatives
+
+The order of symbols in a sequence matter. For example, in `"1. " move " " move "\n"`, the `"1. "` must come before the first `move`, etc.
+
+Alternatives, denoted by `|`, give different sequences that are acceptable. For example, in `move ::= pawn | nonpawn | castle`, `move` can be a `pawn` move, a `nonpawn` move, or a `castle`.
+
+Parentheses `()` can be used to group sequences, which allows for embedding alternatives in a larger rule or applying repetition and optptional symbols (below) to a sequence.
+
+## Repetition and Optional Symbols
+
+- `*` after a symbol or sequence means that it can be repeated zero or more times.
+- `+` denotes that the symbol or sequence should appear one or more times.
+- `?` makes the preceding symbol or sequence optional.
+
+## Comments and newlines
+
+Comments can be specified with `#`:
+```
+# defines optional whitspace
+ws ::= [ \t\n]+
+```
+
+Newlines are allowed between rules and between symbols or sequences nested inside parentheses. Additionally, a newline after an alternate marker `|` will continue the current rule, even outside of parentheses.
+
+## The root rule
+
+In a full grammar, the `root` rule always defines the starting point of the grammar. In other words, it specifies what the entire output must match.
+
+```
+# a grammar for lists
+root ::= ("- " item)+
+item ::= [^\n]+ "\n"
+```
+
+## Next steps
+
+This guide provides a brief overview. Check out the GBNF files in this directory (`grammars/`) for examples of full grammars. You can try them out with:
+```
+./main -m <model> --grammar-file grammars/some-grammar.gbnf -p 'Some prompt'
+```
--- a/grammars/arithmetic.gbnf
+++ b/grammars/arithmetic.gbnf
@@ -0,0 +1,6 @@
+root  ::= (expr "=" ws term "\n")+
+expr  ::= term ([-+*/] term)*
+term  ::= ident | num | "(" ws expr ")" ws
+ident ::= [a-z] [a-z0-9_]* ws
+num   ::= [0-9]+ ws
+ws    ::= [ \t\n]*
--- a/grammars/chess.gbnf
+++ b/grammars/chess.gbnf
@@ -0,0 +1,13 @@
+# Specifies chess moves as a list in algebraic notation, using PGN conventions
+
+# Force first move to "1. ", then any 1-2 digit number after, relying on model to follow the pattern
+root    ::= "1. " move " " move "\n" ([1-9] [0-9]? ". " move " " move "\n")+
+move    ::= (pawn | nonpawn | castle) [+#]?
+
+# piece type, optional file/rank, optional capture, dest file & rank
+nonpawn ::= [NBKQR] [a-h]? [1-8]? "x"? [a-h] [1-8]
+
+# optional file & capture, dest file & rank, optional promotion
+pawn    ::= ([a-h] "x")? [a-h] [1-8] ("=" [NBKQR])?
+
+castle  ::= "O-O" "-O"?
--- a/grammars/japanese.gbnf
+++ b/grammars/japanese.gbnf
@@ -0,0 +1,7 @@
+# A probably incorrect grammar for Japanese
+root        ::= jp-char+ ([ \t\n] jp-char+)*
+jp-char     ::= hiragana | katakana | punctuation | cjk
+hiragana    ::= [ぁ-ゟ]
+katakana    ::= [ァ-ヿ]
+punctuation ::= [、-〾]
+cjk         ::= [一-鿿]
--- a/Show More
+++ b/Show More