Fix values shown in the quantize tool help (#2735 )

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Strided perplexity (#2714 )
2026-02-26 14:23:22 +02:00 · 2023-08-23 12:57:12 +03:00 · 2023-08-23 12:56:42 +03:00 · 2023-08-23 03:31:09 -06:00 · 2023-08-23 15:12:12 +08:00 · 2023-08-22 21:01:57 -04:00
168 changed files with 19519 additions and 20139 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,8 @@
 *.o
 *.a
 *.so
+*.gguf
+*.bin
 .DS_Store
 .build/
 .cache/
@@ -39,13 +41,17 @@ models-mnt
 /perplexity
 /embedding
 /train-text-from-scratch
+/convert-llama2c-to-ggml
 /simple
 /benchmark-matmult
 /vdot
 /server
 /Pipfile
 /embd-input-test
+/gguf
+/gguf-llama-simple
 /libllama.so
+/llama-bench
 build-info.h
 arm_neon.h
 compile_commands.json
@@ -62,12 +68,12 @@ perf-*.txt

 examples/jeopardy/results.txt

-
 pyproject.toml
 poetry.lock
 poetry.toml

 # Test binaries
+tests/test-grammar-parser
 tests/test-double-float
 tests/test-grad0
 tests/test-opt
@@ -75,3 +81,4 @@ tests/test-quantize-fns
 tests/test-quantize-perf
 tests/test-sampling
 tests/test-tokenizer-0
+
--- a/.gitmodules
+++ b/.gitmodules
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -69,14 +69,12 @@ option(LLAMA_BLAS                            "llama: use BLAS"
 set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
 option(LLAMA_CUBLAS                          "llama: use CUDA"                                  OFF)
 #option(LLAMA_CUDA_CUBLAS                     "llama: use cuBLAS for prompt processing"          OFF)
-set(LLAMA_CUDA_MMQ_Y       "64" CACHE STRING "llama: y tile size for mmq CUDA kernels")
 option(LLAMA_CUDA_FORCE_DMMV                 "llama: use dmmv instead of mmvq CUDA kernels"     OFF)
 set(LLAMA_CUDA_DMMV_X      "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
 set(LLAMA_CUDA_MMV_Y        "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
 option(LLAMA_CUDA_F16                        "llama: use 16 bit floats for some calculations"   OFF)
 set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
 option(LLAMA_CLBLAST                         "llama: use CLBlast"                               OFF)
-option(LLAMA_KOMPUTE                         "llama: use Kompute"                               OFF)
 option(LLAMA_METAL                           "llama: use Metal"                                 OFF)
 option(LLAMA_MPI                             "llama: use MPI"                                   OFF)
 option(LLAMA_K_QUANTS                        "llama: use k-quants"                              ON)
@@ -257,7 +255,6 @@ if (LLAMA_CUBLAS)
 #        if (LLAMA_CUDA_CUBLAS)
 #            add_compile_definitions(GGML_CUDA_CUBLAS)
 #        endif()
-        add_compile_definitions(GGML_CUDA_MMQ_Y=${LLAMA_CUDA_MMQ_Y})
        if (LLAMA_CUDA_FORCE_DMMV)
            add_compile_definitions(GGML_CUDA_FORCE_DMMV)
        endif()
@@ -299,7 +296,6 @@ if (LLAMA_METAL)
    find_library(FOUNDATION_LIBRARY         Foundation              REQUIRED)
    find_library(METAL_FRAMEWORK            Metal                   REQUIRED)
    find_library(METALKIT_FRAMEWORK         MetalKit                REQUIRED)
-    find_library(METALPERFORMANCE_FRAMEWORK MetalPerformanceShaders REQUIRED)

    set(GGML_SOURCES_METAL ggml-metal.m ggml-metal.h)

@@ -316,7 +312,6 @@ if (LLAMA_METAL)
        ${FOUNDATION_LIBRARY}
        ${METAL_FRAMEWORK}
        ${METALKIT_FRAMEWORK}
-        ${METALPERFORMANCE_FRAMEWORK}
        )
 endif()

@@ -357,126 +352,6 @@ if (LLAMA_CLBLAST)
    endif()
 endif()

-if (LLAMA_KOMPUTE)
-    find_package(Vulkan COMPONENTS glslc REQUIRED)
-    find_program(glslc_executable NAMES glslc HINTS Vulkan::glslc)
-    if (NOT glslc_executable)
-        message(FATAL_ERROR "glslc not found")
-    endif()
-
-    function(compile_shader)
-      set(options)
-      set(oneValueArgs)
-      set(multiValueArgs SOURCES)
-      cmake_parse_arguments(compile_shader "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-      foreach(source ${compile_shader_SOURCES})
-        set(spv_file ${source}.spv)
-        add_custom_command(
-            OUTPUT ${spv_file}
-            DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${source}
-            COMMAND ${glslc_executable} --target-env=vulkan1.2 -o ${spv_file} ${CMAKE_CURRENT_SOURCE_DIR}/${source}
-            COMMENT "Compiling ${source} to ${source}.spv"
-        )
-
-        get_filename_component(RAW_FILE_NAME ${spv_file} NAME)
-        set(FILE_NAME "shader${RAW_FILE_NAME}")
-        string(REPLACE ".comp.spv" ".h" HEADER_FILE ${FILE_NAME})
-        string(TOUPPER ${HEADER_FILE} HEADER_FILE_DEFINE)
-        string(REPLACE "." "_" HEADER_FILE_DEFINE "${HEADER_FILE_DEFINE}")
-        set(OUTPUT_HEADER_FILE "${HEADER_FILE}")
-        message(STATUS "${HEADER_FILE} generating ${HEADER_FILE_DEFINE}")
-        add_custom_command(
-          OUTPUT ${OUTPUT_HEADER_FILE}
-          COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE}
-          COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
-          COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
-          COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE}
-          COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE}
-          COMMAND xxd -i ${spv_file} >> ${OUTPUT_HEADER_FILE}
-          COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE}
-          COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
-          DEPENDS ${spv_file}
-          COMMENT "Converting to hpp: ${FILE_NAME}"
-        )
-      endforeach()
-    endfunction()
-
-    if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/kompute/CMakeLists.txt")
-        message(STATUS "Kompute found")
-        add_subdirectory(kompute)
-
-        # Compile our shaders
-        compile_shader(SOURCES
-          kompute/op_scale.comp
-          kompute/op_add.comp
-          kompute/op_addrow.comp
-          kompute/op_mul.comp
-          kompute/op_mulrow.comp
-          kompute/op_silu.comp
-          kompute/op_relu.comp
-          kompute/op_gelu.comp
-          kompute/op_softmax.comp
-          kompute/op_norm.comp
-          kompute/op_rmsnorm.comp
-          kompute/op_diagmask.comp
-          kompute/op_mul_mat_f16.comp
-          kompute/op_mul_mat_q4_0.comp
-          kompute/op_mul_mat_q4_1.comp
-          kompute/op_getrows_f16.comp
-          kompute/op_getrows_q4_0.comp
-          kompute/op_getrows_q4_1.comp
-          kompute/op_rope.comp
-          kompute/op_cpy_f16_f16.comp
-          kompute/op_cpy_f16_f32.comp
-          kompute/op_cpy_f32_f16.comp
-          kompute/op_cpy_f32_f32.comp
-        )
-
-        # Create a custom target for our generated shaders
-        add_custom_target(generated_shaders DEPENDS
-          shaderop_scale.h
-          shaderop_add.h
-          shaderop_addrow.h
-          shaderop_mul.h
-          shaderop_mulrow.h
-          shaderop_silu.h
-          shaderop_relu.h
-          shaderop_gelu.h
-          shaderop_softmax.h
-          shaderop_norm.h
-          shaderop_rmsnorm.h
-          shaderop_diagmask.h
-          shaderop_mul_mat_f16.h
-          shaderop_mul_mat_q4_0.h
-          shaderop_mul_mat_q4_1.h
-          shaderop_getrows_f16.h
-          shaderop_getrows_q4_0.h
-          shaderop_getrows_q4_1.h
-          shaderop_rope.h
-          shaderop_cpy_f16_f16.h
-          shaderop_cpy_f16_f32.h
-          shaderop_cpy_f32_f16.h
-          shaderop_cpy_f32_f32.h
-        )
-
-        # Create a custom command that depends on the generated_shaders
-        add_custom_command(
-            OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan.stamp
-            COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan.stamp
-            DEPENDS generated_shaders
-            COMMENT "Ensuring shaders are generated before compiling ggml-vulkan.cpp"
-        )
-
-        # Add the stamp to the main sources to ensure dependency tracking
-        set(GGML_SOURCES_KOMPUTE ggml-vulkan.cpp ggml-vulkan.h ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan.stamp)
-        add_compile_definitions(GGML_USE_KOMPUTE)
-        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} kompute)
-        set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${CMAKE_BINARY_DIR})
-    else()
-        message(WARNING "Kompute not found")
-    endif()
-endif()
-
 if (LLAMA_ALL_WARNINGS)
    if (NOT MSVC)
        set(c_flags
@@ -622,9 +497,11 @@ else()
 endif()

 #
-# Build libraries
+# libraries
 #

+# ggml
+
 add_library(ggml OBJECT
            ggml.c
            ggml.h
@@ -632,7 +509,6 @@ add_library(ggml OBJECT
            ggml-alloc.h
            ${GGML_SOURCES_CUDA}
            ${GGML_SOURCES_OPENCL}
-            ${GGML_SOURCES_KOMPUTE}
            ${GGML_SOURCES_METAL}
            ${GGML_SOURCES_MPI}
            ${GGML_SOURCES_EXTRA}
@@ -650,10 +526,11 @@ if (BUILD_SHARED_LIBS)
    install(TARGETS ggml_shared LIBRARY)
 endif()

+# llama
+
 add_library(llama
            llama.cpp
            llama.h
-            llama-util.h
            )

 target_include_directories(llama PUBLIC .)
@@ -672,6 +549,10 @@ if (BUILD_SHARED_LIBS)
    install(TARGETS llama LIBRARY)
 endif()

+#
+# install
+#
+
 include(GNUInstallDirs)
 install(
    FILES convert.py
@@ -695,11 +576,23 @@ install(
        WORLD_READ
        WORLD_EXECUTE
    DESTINATION ${CMAKE_INSTALL_BINDIR})
+if (LLAMA_METAL)
+    install(
+        FILES ggml-metal.metal
+        PERMISSIONS
+            OWNER_READ
+            OWNER_WRITE
+            GROUP_READ
+            WORLD_READ
+        DESTINATION ${CMAKE_INSTALL_BINDIR})
+endif()

 #
 # programs, examples and tests
 #

+add_subdirectory(common)
+
 if (LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
    include(CTest)
    add_subdirectory(tests)
--- a/LICENSE_SOM.txt
+++ b/LICENSE_SOM.txt
@@ -1,30 +0,0 @@
-Software for Open Models License (SOM)
-Version 1.0 dated August 30th, 2023
-
-This license governs use of the accompanying Software. If you use the Software, you accept this license. If you do not accept the license, do not use the Software.
-
-This license is intended to encourage open release of models created, modified, processed, or otherwise used via the Software under open licensing terms, and should be interpreted in light of that intent.
-
-1. Definitions
-The “Licensor” is the person or entity who is making the Software available under this license. “Software” is the software made available by Licensor under this license.
-A “Model” is the output of a machine learning algorithm, and excludes the Software.
-“Model Source Materials” must include the Model and model weights, and may include any input data, input data descriptions, documentation or training descriptions for the Model.
-“Open Licensing Terms” means: (a) any open source license approved by the Open Source Initiative, or (b) any other terms that make the Model Source Materials publicly available free of charge, and allow recipients to use, modify and distribute the Model Source Materials. Terms described in (b) may include reasonable restrictions such as non-commercial or non-production limitations, or require use in compliance with law.
-
-2. Grant of Rights. Subject to the conditions and limitations in section 3:
-(A) Copyright Grant. Licensor grants you a non-exclusive, worldwide, royalty-free copyright license to copy, modify, and distribute the Software and any modifications of the Software you create under this license. The foregoing license includes without limitation the right to create, modify, and use Models using this Software.
-
-(B) Patent Grant. Licensor grants you a non-exclusive, worldwide, royalty-free license, under any patents owned or controlled by Licensor, to make, have made, use, sell, offer for sale, import, or otherwise exploit the Software.  No license is granted to patent rights that are not embodied in the operation of the Software in the form provided by Licensor.
-
-3. Conditions and Limitations
-(A) Model Licensing and Access. If you use the Software to create, modify, process, or otherwise use any Model, including usage to create inferences with a Model, whether or not you make the Model available to others, you must make that Model Source Materials publicly available under Open Licensing Terms. 
-
-(B) No Re-Licensing. If you redistribute the Software, or modifications to the Software made under the license granted above, you must make it available only under the terms of this license. You may offer additional terms such as warranties, maintenance and support, but You, and not Licensor, are responsible for performing such terms.
-
-(C) No Trademark License. This license does not grant you rights to use the Licensor’s name, logo, or trademarks.
-
-(D) If you assert in writing a claim against any person or entity alleging that the use of the Software infringes any patent, all of your licenses to the Software under Section 2 end automatically as of the date you asserted the claim.
-
-(E) If you distribute any portion of the Software, you must retain all copyright, patent, trademark, and attribution notices that are present in the Software, and you must include a copy of this license.
-
-(F) The Software is licensed “as-is.” You bear the entire risk of using it. Licensor gives You no express warranties, guarantees or conditions. You may have additional consumer rights under your local laws that this license cannot change. To the extent permitted under your local laws, the Licensor disclaims and excludes the implied warranties of merchantability, fitness for a particular purpose and non-infringement. To the extent this disclaimer is unlawful, you, and not Licensor, are responsible for any liability.
--- a/44
+++ b/44
@@ -1,8 +1,8 @@
 # Define the default target now so that it is always the first target
-BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch simple server embd-input-test
+BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple server embd-input-test gguf llama-bench

 # Binaries only useful for tests
-TEST_TARGETS = tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0
+TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0

 default: $(BUILD_TARGETS)

@@ -45,8 +45,8 @@ OPT = -Ofast
 else
 OPT = -O3
 endif
-CFLAGS   = -I.              $(OPT) -std=c11   -fPIC
-CXXFLAGS = -I. -I./examples $(OPT) -std=c++11 -fPIC
+CFLAGS   = -I.            $(OPT) -std=c11   -fPIC
+CXXFLAGS = -I. -I./common $(OPT) -std=c++11 -fPIC
 LDFLAGS  =

 ifdef LLAMA_DEBUG
@@ -253,11 +253,6 @@ ifdef LLAMA_CUDA_KQUANTS_ITER
 else
 	NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
 endif
-ifdef LLAMA_CUDA_MMQ_Y
-	NVCCFLAGS += -DGGML_CUDA_MMQ_Y=$(LLAMA_CUDA_MMQ_Y)
-else
-	NVCCFLAGS += -DGGML_CUDA_MMQ_Y=64
-endif # LLAMA_CUDA_MMQ_Y
 #ifdef LLAMA_CUDA_CUBLAS
 #	NVCCFLAGS += -DGGML_CUDA_CUBLAS
 #endif # LLAMA_CUDA_CUBLAS
@@ -288,7 +283,7 @@ endif # LLAMA_CLBLAST
 ifdef LLAMA_METAL
 	CFLAGS   += -DGGML_USE_METAL -DGGML_METAL_NDEBUG
 	CXXFLAGS += -DGGML_USE_METAL
-	LDFLAGS  += -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
+	LDFLAGS  += -framework Foundation -framework Metal -framework MetalKit
 	OBJS     += ggml-metal.o
 endif # LLAMA_METAL

@@ -334,23 +329,23 @@ ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h

 OBJS += ggml-alloc.o

-llama.o: llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h llama.h llama-util.h
+llama.o: llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h llama.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@

-common.o: examples/common.cpp examples/common.h
+common.o: common/common.cpp common/common.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@

-console.o: examples/console.cpp examples/console.h
+console.o: common/console.cpp common/console.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@

-grammar-parser.o: examples/grammar-parser.cpp examples/grammar-parser.h
+grammar-parser.o: common/grammar-parser.cpp common/grammar-parser.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@

 libllama.so: llama.o ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)

 clean:
-	rm -vf *.o *.so *.dll main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server simple vdot train-text-from-scratch embd-input-test build-info.h $(TEST_TARGETS)
+	rm -vf *.o *.so *.dll main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server simple vdot train-text-from-scratch convert-llama2c-to-ggml embd-input-test gguf llama-bench build-info.h $(TEST_TARGETS)

 #
 # Examples
@@ -380,7 +375,7 @@ embedding: examples/embedding/embedding.cpp                   build-info.h ggml.
 save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp build-info.h ggml.o llama.o common.o $(OBJS)
+server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2)

 $(LIB_PRE)embdinput$(DSO_EXT): examples/embd-input/embd-input.h examples/embd-input/embd-input-lib.cpp build-info.h ggml.o llama.o common.o $(OBJS)
@@ -390,7 +385,16 @@ $(LIB_PRE)embdinput$(DSO_EXT): examples/embd-input/embd-input.h examples/embd-in
 embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %$(DSO_EXT),$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. -lembdinput

-train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp    build-info.h ggml.o llama.o $(OBJS)
+gguf: examples/gguf/gguf.cpp                                  build-info.h ggml.o llama.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
+train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp    build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
+convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp    build-info.h ggml.o llama.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
+llama-bench: examples/llama-bench/llama-bench.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

 build-info.h: $(wildcard .git/index) scripts/build-info.sh
@@ -414,6 +418,12 @@ benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o
 vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)

+tests/test-llama-grammar: tests/test-llama-grammar.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
+
+tests/test-grammar-parser: tests/test-grammar-parser.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
+
 tests/test-double-float: tests/test-double-float.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)

--- a/README.md
+++ b/README.md
@@ -9,13 +9,19 @@

 Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++

-**Hot topics:**
+### Hot topics

- Simple web chat example: https://github.com/ggerganov/llama.cpp/pull/1998
- k-quants now support super-block size of 64: https://github.com/ggerganov/llama.cpp/pull/2001
- New roadmap: https://github.com/users/ggerganov/projects/7
- Azure CI brainstorming: https://github.com/ggerganov/llama.cpp/discussions/1985
- p1 : LLM-based code completion engine at the edge : https://github.com/ggml-org/p1/discussions/1
+A new file format has been introduced: [GGUF](https://github.com/ggerganov/llama.cpp/pull/2398)
+
+Last revision compatible with the old format: [dadbed9](https://github.com/ggerganov/llama.cpp/commit/dadbed99e65252d79f81101a392d0d6497b86caa)
+
+### Current `master` should be considered in Beta - expect some issues for a few days!
+
+### Be prepared to re-convert and / or re-quantize your GGUF models while this notice is up!
+
+### Issues with non-GGUF models will be considered with low priority!
+
+----

 <details>
  <summary>Table of Contents</summary>
@@ -33,6 +39,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
        <li><a href="#memorydisk-requirements">Memory/Disk Requirements</a></li>
        <li><a href="#quantization">Quantization</a></li>
        <li><a href="#interactive-mode">Interactive mode</a></li>
+        <li><a href="#constrained-output-with-grammars">Constrained output with grammars</a></li>
        <li><a href="#instruction-mode-with-alpaca">Instruction mode with Alpaca</a></li>
        <li><a href="#using-openllama">Using OpenLLaMA</a></li>
        <li><a href="#using-gpt4all">Using GPT4All</a></li>
@@ -96,8 +103,10 @@ as the main playground for developing new features for the [ggml](https://github
 - Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
 - Node.js: [hlhr202/llama-node](https://github.com/hlhr202/llama-node)
 - Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)
+- Rust: [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
 - C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp)
 - Scala 3: [donderom/llm4s](https://github.com/donderom/llm4s)
+- Clojure: [phronmophobic/llama.clj](https://github.com/phronmophobic/llama.clj)

 **UI:**

@@ -238,12 +247,17 @@ In order to build llama.cpp you have three different options.
    cmake --build . --config Release
    ```

- Using `Zig`:
+- Using `Zig` (version 0.11 or later):
+
+    Building for optimization levels and CPU features can be accomplished using standard build arguments, for example AVX2, FMA, F16C,
+    it's also possible to cross compile for other operating systems and architectures:

    ```bash
-    zig build -Doptimize=ReleaseFast
+    zig build -Doptimize=ReleaseFast -Dtarget=x86_64-windows-gnu -Dcpu=x86_64+avx2+fma+f16c
    ```

+    The `zig targets` command will give you valid options to use.
+
 -   Using `gmake` (FreeBSD):

    1. Install and activate [DRM in FreeBSD](https://wiki.freebsd.org/Graphics)
@@ -284,7 +298,7 @@ When built with Metal support, you can enable GPU inference with the `--gpu-laye
 Any value larger than 0 will offload the computation to the GPU. For example:

 ```bash
-./main -m ./models/7B/ggml-model-q4_0.bin -n 128 -ngl 1
+./main -m ./models/7B/ggml-model-q4_0.gguf -n 128 -ngl 1
 ```

 ### MPI Build
@@ -323,7 +337,7 @@ The above will distribute the computation across 2 processes on the first host a
 Finally, you're ready to run a computation using `mpirun`:

 ```bash
-mpirun -hostfile hostfile -n 3 ./main -m ./models/7B/ggml-model-q4_0.bin -n 128
+mpirun -hostfile hostfile -n 3 ./main -m ./models/7B/ggml-model-q4_0.gguf -n 128
 ```

 ### BLAS Build
@@ -406,10 +420,9 @@ Building the program with BLAS support may lead to some performance improvements
 --->
  | Option                  | Legal values           | Default | Description |
  |-------------------------|------------------------|---------|-------------|
-  | LLAMA_CUDA_MMQ_Y        | Positive integer >= 32 |      64 | Tile size in y direction when using the custom CUDA kernels for prompt processing. Higher values can be faster depending on the amount of shared memory available. Power of 2 heavily recommended. |
  | LLAMA_CUDA_FORCE_DMMV   | Boolean                |   false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
  | LLAMA_CUDA_DMMV_X       | Positive integer >= 32 |      32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
-  | LLAMA_CUDA_MMV_Y        | Positive integer       |       1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
+  | LLAMA_CUDA_MMV_Y        | Positive integer       |       1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. |
  | LLAMA_CUDA_F16          | Boolean                |   false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
  | LLAMA_CUDA_KQUANTS_ITER | 1 or 2                 |       2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |

@@ -507,10 +520,10 @@ python3 convert.py models/7B/
  python convert.py models/7B/ --vocabtype bpe

 # quantize the model to 4-bits (using q4_0 method)
-./quantize ./models/7B/ggml-model-f16.bin ./models/7B/ggml-model-q4_0.bin q4_0
+./quantize ./models/7B/ggml-model-f16.gguf ./models/7B/ggml-model-q4_0.gguf q4_0

 # run the inference
-./main -m ./models/7B/ggml-model-q4_0.bin -n 128
+./main -m ./models/7B/ggml-model-q4_0.gguf -n 128
 ```

 When running the larger models, make sure you have enough disk space to store all the intermediate files.
@@ -566,7 +579,7 @@ Here is an example of a few-shot interaction, invoked with the command
 ./examples/chat-13B.sh

 # custom arguments using a 13B model
-./main -m ./models/13B/ggml-model-q4_0.bin -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
+./main -m ./models/13B/ggml-model-q4_0.gguf -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
 ```

 Note the use of `--color` to distinguish between user input and generated text. Other parameters are explained in more detail in the [README](examples/main/README.md) for the `main` example program.
@@ -592,6 +605,16 @@ PROMPT_TEMPLATE=./prompts/chat-with-bob.txt PROMPT_CACHE_FILE=bob.prompt.bin \
    CHAT_SAVE_DIR=./chat/bob ./examples/chat-persistent.sh
 ```

+### Constrained output with grammars
+
+`llama.cpp` supports grammars to constrain model output. For example, you can force the model to output JSON only:
+
+```bash
+./main -m ./models/13B/ggml-model-q4_0.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:'
+```
+
+The `grammars/` folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](./grammars/README.md).
+
 ### Instruction mode with Alpaca

 1. First, download the `ggml` Alpaca model into the `./models` folder
@@ -629,6 +652,8 @@ OpenLLaMA is an openly licensed reproduction of Meta's original LLaMA model. It

 ### Using [GPT4All](https://github.com/nomic-ai/gpt4all)

+*Note: these instructions are likely obsoleted by the GGUF update*
+
 - Obtain the `tokenizer.model` file from LLaMA model and put it to `models`
 - Obtain the `added_tokens.json` file from Alpaca model and put it to `models`
 - Obtain the `gpt4all-lora-quantized.bin` file from GPT4All model and put it to `models/gpt4all-7B`
@@ -704,7 +729,7 @@ If your issue is with model generation quality, then please at least scan the fo
 #### How to run

 1. Download/extract: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
-2. Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
+2. Run `./perplexity -m models/7B/ggml-model-q4_0.gguf -f wiki.test.raw`
 3. Output:
 ```
 perplexity : calculating perplexity over 655 chunks
@@ -803,13 +828,13 @@ docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-
 On completion, you are ready to play!

 ```bash
-docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
+docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
 ```

 or with a light image:

 ```bash
-docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
+docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
 ```

 ### Docker With CUDA
@@ -840,8 +865,8 @@ The resulting images, are essentially the same as the non-CUDA images:
 After building locally, Usage is similar to the non-CUDA examples, but you'll need to add the `--gpus` flag. You will also want to use the `--n-gpu-layers` flag.

 ```bash
-docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
-docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
+docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
+docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
 ```

 ### Contributing
@@ -871,3 +896,4 @@ docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /
 - [BLIS](./docs/BLIS.md)
 - [Performance troubleshooting](./docs/token_generation_performance_tips.md)
 - [GGML tips & tricks](https://github.com/ggerganov/llama.cpp/wiki/GGML-Tips-&-Tricks)
+- [GBNF grammars](./grammars/README.md)
--- a/build.zig
+++ b/build.zig
@@ -1,5 +1,6 @@
 // Compatible with Zig Version 0.11.0
 const std = @import("std");
+const ArrayList = std.ArrayList;
 const Compile = std.Build.Step.Compile;
 const ConfigHeader = std.Build.Step.ConfigHeader;
 const Mode = std.builtin.Mode;
@@ -10,11 +11,31 @@ const Maker = struct {
    target: CrossTarget,
    optimize: Mode,
    config_header: *ConfigHeader,
+    enable_lto: bool,

-    const cflags = .{"-std=c11"};
-    const cxxflags = .{"-std=c++11"};
+    include_dirs: ArrayList([]const u8),
+    cflags: ArrayList([]const u8),
+    cxxflags: ArrayList([]const u8),
+    objs: ArrayList(*Compile),

-    fn init(builder: *std.build.Builder) Maker {
+    fn addInclude(m: *Maker, dir: []const u8) !void {
+        try m.include_dirs.append(dir);
+    }
+    fn addProjectInclude(m: *Maker, path: []const []const u8) !void {
+        try m.addInclude(try m.builder.build_root.join(m.builder.allocator, path));
+    }
+    fn addCFlag(m: *Maker, flag: []const u8) !void {
+        try m.cflags.append(flag);
+    }
+    fn addCxxFlag(m: *Maker, flag: []const u8) !void {
+        try m.cxxflags.append(flag);
+    }
+    fn addFlag(m: *Maker, flag: []const u8) !void {
+        try m.addCFlag(flag);
+        try m.addCxxFlag(flag);
+    }
+
+    fn init(builder: *std.build.Builder) !Maker {
        const commit_hash = @embedFile(".git/refs/heads/master");
        const config_header = builder.addConfigHeader(
            .{ .style = .blank, .include_path = "build-info.h" },
@@ -23,58 +44,71 @@ const Maker = struct {
                .BUILD_COMMIT = commit_hash[0 .. commit_hash.len - 1], // omit newline
            },
        );
-        return Maker{
+        var m = Maker{
            .builder = builder,
            .target = builder.standardTargetOptions(.{}),
            .optimize = builder.standardOptimizeOption(.{}),
            .config_header = config_header,
+            .enable_lto = false,
+            .include_dirs = ArrayList([]const u8).init(builder.allocator),
+            .cflags = ArrayList([]const u8).init(builder.allocator),
+            .cxxflags = ArrayList([]const u8).init(builder.allocator),
+            .objs = ArrayList(*Compile).init(builder.allocator),
        };
+        try m.addCFlag("-std=c11");
+        try m.addCxxFlag("-std=c++11");
+        try m.addProjectInclude(&.{});
+        try m.addProjectInclude(&.{"examples"});
+        return m;
    }

    fn obj(m: *const Maker, name: []const u8, src: []const u8) *Compile {
        const o = m.builder.addObject(.{ .name = name, .target = m.target, .optimize = m.optimize });
        if (std.mem.endsWith(u8, src, ".c")) {
-            o.addCSourceFiles(&.{src}, &cflags);
+            o.addCSourceFiles(&.{src}, m.cflags.items);
            o.linkLibC();
        } else {
-            o.addCSourceFiles(&.{src}, &cxxflags);
+            o.addCSourceFiles(&.{src}, m.cxxflags.items);
            o.linkLibCpp();
        }
-        o.addIncludePath(.{ .path = "." });
-        o.addIncludePath(.{ .path = "./examples" });
+        for (m.include_dirs.items) |i| o.addIncludePath(.{ .path = i });
+        o.want_lto = m.enable_lto;
        return o;
    }

    fn exe(m: *const Maker, name: []const u8, src: []const u8, deps: []const *Compile) *Compile {
        const e = m.builder.addExecutable(.{ .name = name, .target = m.target, .optimize = m.optimize });
-        e.addIncludePath(.{ .path = "." });
-        e.addIncludePath(.{ .path = "./examples" });
-        e.addCSourceFiles(&.{src}, &cxxflags);
+        e.addCSourceFiles(&.{src}, m.cxxflags.items);
        for (deps) |d| e.addObject(d);
+        for (m.objs.items) |o| e.addObject(o);
+        for (m.include_dirs.items) |i| e.addIncludePath(.{ .path = i });
        e.linkLibC();
        e.linkLibCpp();
        e.addConfigHeader(m.config_header);
        m.builder.installArtifact(e);
-
-        // Currently a bug is preventing correct linking for optimized builds for Windows:
-        // https://github.com/ziglang/zig/issues/15958
-        if (e.target.isWindows()) {
-            e.want_lto = false;
-        }
+        e.want_lto = m.enable_lto;
        return e;
    }
 };

-pub fn build(b: *std.build.Builder) void {
-    const make = Maker.init(b);
+pub fn build(b: *std.build.Builder) !void {
+    var make = try Maker.init(b);
+    make.enable_lto = b.option(bool, "lto", "Enable LTO optimization, (default: false)") orelse false;
+
+    if (b.option(bool, "k-quants", "Enable K-quants, (default: true)") orelse true) {
+        try make.addFlag("-DGGML_USE_K_QUANTS");
+        const k_quants = make.obj("k_quants", "k_quants.c");
+        try make.objs.append(k_quants);
+    }

    const ggml = make.obj("ggml", "ggml.c");
    const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c");
    const llama = make.obj("llama", "llama.cpp");
    const common = make.obj("common", "examples/common.cpp");
+    const console = make.obj("common", "examples/console.cpp");
    const grammar_parser = make.obj("grammar-parser", "examples/grammar-parser.cpp");

-    _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, llama, common, grammar_parser });
+    _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, llama, common, console, grammar_parser });
    _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, llama });
    _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, llama, common });
    _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, llama, common });
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -159,17 +159,17 @@ function gg_run_open_llama_3b_v2 {

    python3 ../convert.py ${path_models}

-    model_f16="${path_models}/ggml-model-f16.bin"
-    model_q8_0="${path_models}/ggml-model-q8_0.bin"
-    model_q4_0="${path_models}/ggml-model-q4_0.bin"
-    model_q4_1="${path_models}/ggml-model-q4_1.bin"
-    model_q5_0="${path_models}/ggml-model-q5_0.bin"
-    model_q5_1="${path_models}/ggml-model-q5_1.bin"
-    model_q2_k="${path_models}/ggml-model-q2_k.bin"
-    model_q3_k="${path_models}/ggml-model-q3_k.bin"
-    model_q4_k="${path_models}/ggml-model-q4_k.bin"
-    model_q5_k="${path_models}/ggml-model-q5_k.bin"
-    model_q6_k="${path_models}/ggml-model-q6_k.bin"
+    model_f16="${path_models}/ggml-model-f16.gguf"
+    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
+    model_q4_0="${path_models}/ggml-model-q4_0.gguf"
+    model_q4_1="${path_models}/ggml-model-q4_1.gguf"
+    model_q5_0="${path_models}/ggml-model-q5_0.gguf"
+    model_q5_1="${path_models}/ggml-model-q5_1.gguf"
+    model_q2_k="${path_models}/ggml-model-q2_k.gguf"
+    model_q3_k="${path_models}/ggml-model-q3_k.gguf"
+    model_q4_k="${path_models}/ggml-model-q4_k.gguf"
+    model_q5_k="${path_models}/ggml-model-q5_k.gguf"
+    model_q6_k="${path_models}/ggml-model-q6_k.gguf"

    wiki_test_60="${path_wiki}/wiki.test-60.raw"

@@ -285,17 +285,17 @@ function gg_run_open_llama_7b_v2 {

    python3 ../convert.py ${path_models}

-    model_f16="${path_models}/ggml-model-f16.bin"
-    model_q8_0="${path_models}/ggml-model-q8_0.bin"
-    model_q4_0="${path_models}/ggml-model-q4_0.bin"
-    model_q4_1="${path_models}/ggml-model-q4_1.bin"
-    model_q5_0="${path_models}/ggml-model-q5_0.bin"
-    model_q5_1="${path_models}/ggml-model-q5_1.bin"
-    model_q2_k="${path_models}/ggml-model-q2_k.bin"
-    model_q3_k="${path_models}/ggml-model-q3_k.bin"
-    model_q4_k="${path_models}/ggml-model-q4_k.bin"
-    model_q5_k="${path_models}/ggml-model-q5_k.bin"
-    model_q6_k="${path_models}/ggml-model-q6_k.bin"
+    model_f16="${path_models}/ggml-model-f16.gguf"
+    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
+    model_q4_0="${path_models}/ggml-model-q4_0.gguf"
+    model_q4_1="${path_models}/ggml-model-q4_1.gguf"
+    model_q5_0="${path_models}/ggml-model-q5_0.gguf"
+    model_q5_1="${path_models}/ggml-model-q5_1.gguf"
+    model_q2_k="${path_models}/ggml-model-q2_k.gguf"
+    model_q3_k="${path_models}/ggml-model-q3_k.gguf"
+    model_q4_k="${path_models}/ggml-model-q4_k.gguf"
+    model_q5_k="${path_models}/ggml-model-q5_k.gguf"
+    model_q6_k="${path_models}/ggml-model-q6_k.gguf"

    wiki_test="${path_wiki}/wiki.test.raw"

--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -0,0 +1,20 @@
+# common
+
+set(TARGET common)
+
+add_library(${TARGET} OBJECT
+    common.h
+    common.cpp
+    console.h
+    console.cpp
+    grammar-parser.h
+    grammar-parser.cpp
+    )
+
+if (BUILD_SHARED_LIBS)
+    set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
+endif()
+
+target_include_directories(${TARGET} PUBLIC .)
+target_compile_features(${TARGET} PUBLIC cxx_std_11)
+target_link_libraries(${TARGET} PRIVATE llama)
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -170,18 +170,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                break;
            }
            params.n_ctx = std::stoi(argv[i]);
-        } else if (arg == "-gqa" || arg == "--gqa") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.n_gqa = std::stoi(argv[i]);
-        } else if (arg == "-eps" || arg == "--rms-norm-eps") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.rms_norm_eps = std::stof(argv[i]);
        } else if (arg == "--rope-freq-base") {
            if (++i >= argc) {
                invalid_param = true;
@@ -274,6 +262,21 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                break;
            }
            params.cfg_negative_prompt = argv[i];
+        } else if (arg == "--cfg-negative-prompt-file") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            std::ifstream file(argv[i]);
+            if (!file) {
+                fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
+                invalid_param = true;
+                break;
+            }
+            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.cfg_negative_prompt));
+            if (params.cfg_negative_prompt.back() == '\n') {
+                params.cfg_negative_prompt.pop_back();
+            }
        } else if (arg == "--cfg-scale") {
            if (++i >= argc) {
                invalid_param = true;
@@ -286,7 +289,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                break;
            }
            params.n_batch = std::stoi(argv[i]);
-            params.n_batch = std::min(512, params.n_batch);
        } else if (arg == "--keep") {
            if (++i >= argc) {
                invalid_param = true;
@@ -385,11 +387,11 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
 #else
            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
 #endif // GGML_USE_CUBLAS
-        } else if (arg == "--mul-mat-q" || arg == "-mmq") {
+        } else if (arg == "--no-mul-mat-q" || arg == "-nommq") {
 #ifdef GGML_USE_CUBLAS
-            params.mul_mat_q = true;
+            params.mul_mat_q = false;
 #else
-            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to use mul_mat_q kernels.\n");
+            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n");
 #endif // GGML_USE_CUBLAS
        } else if (arg == "--low-vram" || arg == "-lv") {
 #ifdef GGML_USE_CUBLAS
@@ -415,6 +417,18 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
            params.antiprompt.push_back(argv[i]);
        } else if (arg == "--perplexity") {
            params.perplexity = true;
+        } else if (arg == "--ppl-stride") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.ppl_stride = std::stoi(argv[i]);
+        } else if (arg == "--ppl-output-type") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.ppl_output_type = std::stoi(argv[i]);
        } else if (arg == "--hellaswag") {
            params.hellaswag = true;
        } else if (arg == "--hellaswag-tasks") {
@@ -424,7 +438,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
            }
            params.hellaswag_tasks = std::stoi(argv[i]);
        } else if (arg == "--ignore-eos") {
-            params.logit_bias[llama_token_eos()] = -INFINITY;
+            params.ignore_eos = true;
        } else if (arg == "--no-penalize-nl") {
            params.penalize_nl = false;
        } else if (arg == "-l" || arg == "--logit-bias") {
@@ -543,11 +557,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    fprintf(stdout, "  --in-suffix STRING    string to suffix after user inputs with (default: empty)\n");
    fprintf(stdout, "  -f FNAME, --file FNAME\n");
    fprintf(stdout, "                        prompt file to start generation.\n");
-    fprintf(stdout, "  -n N, --n-predict N   number of tokens to predict (default: %d, -1 = infinity)\n", params.n_predict);
+    fprintf(stdout, "  -n N, --n-predict N   number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
    fprintf(stdout, "  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
    fprintf(stdout, "  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
-    fprintf(stdout, "  -gqa N, --gqa N       grouped-query attention factor (TEMP!!! use 8 for LLaMAv2 70B) (default: %d)\n", params.n_gqa);
-    fprintf(stdout, "  -eps N, --rms-norm-eps N rms norm eps (TEMP!!! use 1e-5 for LLaMAv2) (default: %.1e)\n", params.rms_norm_eps);
    fprintf(stdout, "  --top-k N             top-k sampling (default: %d, 0 = disabled)\n", params.top_k);
    fprintf(stdout, "  --top-p N             top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p);
    fprintf(stdout, "  --tfs N               tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z);
@@ -567,8 +579,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    fprintf(stdout, "                        or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n");
    fprintf(stdout, "  --grammar GRAMMAR     BNF-like grammar to constrain generations (see samples in grammars/ dir)\n");
    fprintf(stdout, "  --grammar-file FNAME  file to read grammar from\n");
-    fprintf(stdout, "  --cfg-negative-prompt PROMPT \n");
+    fprintf(stdout, "  --cfg-negative-prompt PROMPT\n");
    fprintf(stdout, "                        negative prompt to use for guidance. (default: empty)\n");
+    fprintf(stdout, "  --cfg-negative-prompt-file FNAME\n");
+    fprintf(stdout, "                        negative prompt file to use for guidance. (default: empty)\n");
    fprintf(stdout, "  --cfg-scale N         strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
    fprintf(stdout, "  --rope-scale N        RoPE context linear scaling factor, inverse of --rope-freq-scale (default: %g)\n", 1.0f/params.rope_freq_scale);
    fprintf(stdout, "  --rope-freq-base N    RoPE base frequency, used by NTK-aware scaling (default: %.1f)\n", params.rope_freq_base);
@@ -597,11 +611,11 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    fprintf(stdout, "                        number of layers to store in VRAM\n");
    fprintf(stdout, "  -ts SPLIT --tensor-split SPLIT\n");
    fprintf(stdout, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
-    fprintf(stdout, "  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n" );
-    fprintf(stdout, "  -lv, --low-vram       don't allocate VRAM scratch buffer\n" );
-    fprintf(stdout, "  -mmq, --mul-mat-q     use experimental mul_mat_q CUDA kernels instead of cuBLAS. TEMP!!!\n" );
-    fprintf(stdout, "                        Reduces VRAM usage by 700/970/1430 MiB for 7b/13b/33b but prompt processing speed\n" );
-    fprintf(stdout, "                        is still suboptimal, especially q2_K, q3_K, q5_K, and q6_K.\n" );
+    fprintf(stdout, "  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n");
+    fprintf(stdout, "  -lv, --low-vram       don't allocate VRAM scratch buffer\n");
+    fprintf(stdout, "  -nommq, --no-mul-mat-q\n");
+    fprintf(stdout, "                        use cuBLAS instead of custom mul_mat_q CUDA kernels.\n");
+    fprintf(stdout, "                        Not recommended since this is both slower and uses more VRAM.\n");
 #endif
    fprintf(stdout, "  --mtest               compute maximum memory usage\n");
    fprintf(stdout, "  --export              export the computation graph to 'llama.ggml'\n");
@@ -633,24 +647,15 @@ std::string gpt_random_prompt(std::mt19937 & rng) {
    return "The";
 }

-// TODO: not great allocating this every time
-std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
-    // initialize to prompt numer of chars, since n_tokens <= n_prompt_chars
-    std::vector<llama_token> res(text.size() + (int) add_bos);
-    const int n = llama_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos);
-    assert(n >= 0);
-    res.resize(n);
-
-    return res;
-}
+//
+// Model utils
+//

 struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
    auto lparams = llama_context_default_params();

    lparams.n_ctx           = params.n_ctx;
    lparams.n_batch         = params.n_batch;
-    lparams.n_gqa           = params.n_gqa;
-    lparams.rms_norm_eps    = params.rms_norm_eps;
    lparams.n_gpu_layers    = params.n_gpu_layers;
    lparams.main_gpu        = params.main_gpu;
    lparams.tensor_split    = params.tensor_split;
@@ -668,7 +673,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
    return lparams;
 }

-std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(const gpt_params & params) {
+std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params) {
    auto lparams = llama_context_params_from_gpt_params(params);

    llama_model * model  = llama_load_model_from_file(params.model.c_str(), lparams);
@@ -697,5 +702,77 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
        }
    }

+    if (params.ignore_eos) {
+        params.logit_bias[llama_token_eos(lctx)] = -INFINITY;
+    }
+
    return std::make_tuple(model, lctx);
 }
+
+//
+// Vocab utils
+//
+
+std::vector<llama_token> llama_tokenize(
+        struct llama_context * ctx,
+           const std::string & text,
+                        bool   add_bos) {
+    // upper limit for the number of tokens
+    int n_tokens = text.length() + add_bos;
+    std::vector<llama_token> result(n_tokens);
+    n_tokens = llama_tokenize(ctx, text.c_str(), result.data(), result.size(), add_bos);
+    if (n_tokens < 0) {
+        result.resize(-n_tokens);
+        int check = llama_tokenize(ctx, text.c_str(), result.data(), result.size(), add_bos);
+        GGML_ASSERT(check == -n_tokens);
+    } else {
+        result.resize(n_tokens);
+    }
+    return result;
+}
+
+std::string llama_token_to_str(const struct llama_context * ctx, llama_token token) {
+    std::vector<char> result(8, 0);
+    const int n_tokens = llama_token_to_str(ctx, token, result.data(), result.size());
+    if (n_tokens < 0) {
+        result.resize(-n_tokens);
+        int check = llama_token_to_str(ctx, token, result.data(), result.size());
+        GGML_ASSERT(check == -n_tokens);
+    } else {
+        result.resize(n_tokens);
+    }
+
+    return std::string(result.data(), result.size());
+}
+
+std::vector<llama_token> llama_tokenize_bpe(
+        struct llama_context * ctx,
+           const std::string & text,
+                        bool   add_bos) {
+    int n_tokens = text.length() + add_bos;
+    std::vector<llama_token> result(n_tokens);
+    n_tokens = llama_tokenize_bpe(ctx, text.c_str(), result.data(), result.size(), add_bos);
+    if (n_tokens < 0) {
+        result.resize(-n_tokens);
+        int check = llama_tokenize_bpe(ctx, text.c_str(), result.data(), result.size(), add_bos);
+        GGML_ASSERT(check == -n_tokens);
+    } else {
+        result.resize(n_tokens);
+    }
+    return result;
+}
+
+std::string llama_token_to_str_bpe(const struct llama_context * ctx, llama_token token) {
+    std::vector<char> result(8, 0);
+    const int n_tokens = llama_token_to_str_bpe(ctx, token, result.data(), result.size());
+    if (n_tokens < 0) {
+        result.resize(-n_tokens);
+        const int check = llama_token_to_str_bpe(ctx, token, result.data(), result.size());
+        GGML_ASSERT(check == -n_tokens);
+    } else {
+        result.resize(n_tokens);
+    }
+
+    return std::string(result.data(), result.size());
+}
+
--- a/examples/common.h
+++ b/examples/common.h
@@ -22,19 +22,16 @@ struct gpt_params {
    int32_t n_predict                       = -1;   // new tokens to predict
    int32_t n_ctx                           = 512;  // context size
    int32_t n_batch                         = 512;  // batch size for prompt processing (must be >=32 to use BLAS)
-    int32_t n_gqa                           = 1;    // grouped-query attention factor (TODO: move to hparams)
    int32_t n_keep                          = 0;    // number of tokens to keep from initial prompt
    int32_t n_chunks                        = -1;   // max number of chunks to process (-1 = unlimited)
    int32_t n_gpu_layers                    = 0;    // number of layers to store in VRAM
    int32_t main_gpu                        = 0;    // the GPU that is used for scratch and small tensors
    float   tensor_split[LLAMA_MAX_DEVICES] = {0};  // how split tensors should be distributed across GPUs
    int32_t n_probs                         = 0;    // if greater than 0, output the probabilities of top n_probs tokens.
-    float   rms_norm_eps                    = LLAMA_DEFAULT_RMS_EPS; // rms norm epsilon
    float   rope_freq_base                  = 10000.0f; // RoPE base frequency
    float   rope_freq_scale                 = 1.0f;     // RoPE frequency scaling factor

    // sampling parameters
-    std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
    int32_t top_k             = 40;    // <= 0 to use vocab size
    float   top_p             = 0.95f; // 1.0 = disabled
    float   tfs_z             = 1.00f; // 1.0 = disabled
@@ -48,12 +45,14 @@ struct gpt_params {
    float   mirostat_tau      = 5.00f; // target entropy
    float   mirostat_eta      = 0.10f; // learning rate

+    std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
+
    // Classifier-Free Guidance
    // https://arxiv.org/abs/2306.17806
    std::string cfg_negative_prompt;       // string to help guidance
    float       cfg_scale         = 1.f;   // How strong is guidance

-    std::string model             = "models/7B/ggml-model.bin"; // model path
+    std::string model             = "models/7B/ggml-model-f16.gguf"; // model path
    std::string model_alias       = "unknown"; // model alias
    std::string prompt            = "";
    std::string path_prompt_cache = "";  // path to file for saving/loading prompt eval state
@@ -65,11 +64,15 @@ struct gpt_params {
    std::string lora_adapter = "";  // lora adapter path
    std::string lora_base    = "";  // base model path for the lora adapter

+    int  ppl_stride        = 0;     // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
+    int  ppl_output_type   = 0;     // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
+                                    //                                       (which is more convenient to use for plotting)
+                                    //
    bool hellaswag         = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
    size_t hellaswag_tasks = 400;   // number of tasks to use when computing the HellaSwag score

    bool low_vram          = false; // if true, reduce VRAM usage at the cost of performance
-    bool mul_mat_q         = false; // if true, use experimental mul_mat_q kernels
+    bool mul_mat_q         = true;  // if true, use mul_mat_q kernels instead of cuBLAS
    bool memory_f16        = true;  // use f16 instead of f32 for memory kv
    bool random_prompt     = false; // do not randomize prompt if none provided
    bool use_color         = false; // use color to distinguish generations and inputs
@@ -83,6 +86,7 @@ struct gpt_params {
    bool simple_io         = false; // improves compatibility with subprocesses and limited consoles

    bool input_prefix_bos  = false; // prefix BOS to user inputs, preceding input_prefix
+    bool ignore_eos        = false; // ignore generated EOS tokens
    bool instruct          = false; // instruction mode (used for Alpaca models)
    bool penalize_nl       = true;  // consider newlines as a repeatable token
    bool perplexity        = false; // compute perplexity over the prompt
@@ -100,15 +104,31 @@ void gpt_print_usage(int argc, char ** argv, const gpt_params & params);

 std::string gpt_random_prompt(std::mt19937 & rng);

-//
-// Vocab utils
-//
-
-std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos);
-
 //
 // Model utils
 //

-std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(const gpt_params & params);
+std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params);
 struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
+
+//
+// Vocab utils
+//
+
+std::vector<llama_token> llama_tokenize(
+        struct llama_context * ctx,
+           const std::string & text,
+                        bool   add_bos);
+
+std::vector<llama_token> llama_tokenize_bpe(
+        struct llama_context * ctx,
+           const std::string & text,
+                        bool   add_bos);
+
+std::string llama_token_to_str(
+        const struct llama_context * ctx,
+                       llama_token   token);
+
+std::string llama_token_to_str_bpe(
+    const struct llama_context * ctx,
+                   llama_token   token);
--- a/examples/console.cpp
+++ b/examples/console.cpp
@@ -10,6 +10,9 @@
 #include <windows.h>
 #include <fcntl.h>
 #include <io.h>
+#ifndef ENABLE_VIRTUAL_TERMINAL_PROCESSING
+#define ENABLE_VIRTUAL_TERMINAL_PROCESSING 0x0004
+#endif
 #else
 #include <climits>
 #include <sys/ioctl.h>
@@ -68,9 +71,10 @@ namespace console {
            }
        }
        if (hConsole) {
-            // Enable ANSI colors on Windows 10+
-            if (advanced_display && !(dwMode & ENABLE_VIRTUAL_TERMINAL_PROCESSING)) {
-                SetConsoleMode(hConsole, dwMode | ENABLE_VIRTUAL_TERMINAL_PROCESSING);
+            // Check conditions combined to reduce nesting
+            if (advanced_display && !(dwMode & ENABLE_VIRTUAL_TERMINAL_PROCESSING) &&
+                !SetConsoleMode(hConsole, dwMode | ENABLE_VIRTUAL_TERMINAL_PROCESSING)) {
+                advanced_display = false;
            }
            // Set console output codepage to UTF8
            SetConsoleOutputCP(CP_UTF8);
--- a/examples/console.h
+++ b/examples/console.h
--- a/examples/grammar-parser.cpp
+++ b/examples/grammar-parser.cpp
--- a/examples/grammar-parser.h
+++ b/examples/grammar-parser.h
--- a/convert-falcon-hf-to-gguf.py
+++ b/convert-falcon-hf-to-gguf.py
@@ -0,0 +1,282 @@
+# HF falcon--> gguf conversion
+
+import gguf
+import os
+import sys
+import struct
+import json
+import numpy as np
+import torch
+
+from typing import Any, List
+from pathlib import Path
+from transformers import AutoTokenizer
+
+def bytes_to_unicode():
+    # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
+    """
+    Returns list of utf-8 byte and a corresponding list of unicode strings.
+    The reversible bpe codes work on unicode strings.
+    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
+    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
+    This is a significant percentage of your normal, say, 32K bpe vocab.
+    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
+    And avoids mapping to whitespace/control characters the bpe code barfs on.
+    """
+    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
+    cs = bs[:]
+    n = 0
+    for b in range(2**8):
+        if b not in bs:
+            bs.append(b)
+            cs.append(2**8+n)
+            n += 1
+    cs = [chr(n) for n in cs]
+    return dict(zip(bs, cs))
+
+
+def count_model_parts(dir_model: str) -> int:
+    num_parts = 0
+    for filename in os.listdir(dir_model):
+        if filename.startswith("pytorch_model-"):
+            num_parts += 1
+
+    if num_parts > 0:
+        print("gguf: found " + str(num_parts) + " model parts")
+    return num_parts
+
+
+if len(sys.argv) < 3:
+    print("Usage: convert-h5-to-ggml.py dir-model ftype\n")
+    print("  ftype == 0 -> float32")
+    print("  ftype == 1 -> float16")
+    sys.exit(1)
+
+
+# output in the same directory as the model
+dir_model = sys.argv[1]
+last_dir = os.path.basename(os.path.normpath(dir_model))
+
+# possible tensor data types
+#   ftype == 0 -> float32
+#   ftype == 1 -> float16
+
+# map from ftype to string
+ftype_str = ["f32", "f16"]
+
+ftype = 1
+if len(sys.argv) > 2:
+    ftype = int(sys.argv[2])
+    if ftype < 0 or ftype > 1:
+        print("Invalid ftype: " + str(ftype))
+
+        sys.exit(1)
+
+fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".gguf"
+
+print("gguf: loading model "+last_dir)
+
+with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
+    hparams = json.load(f)
+
+if hparams["architectures"][0] != "RWForCausalLM":
+    print("Model architecture not supported: " + hparams["architectures"][0])
+
+    sys.exit()
+
+# get number of model parts
+num_parts = count_model_parts(dir_model)
+
+ARCH=gguf.MODEL_ARCH.FALCON
+gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
+
+print("gguf: get model metadata")
+
+block_count = hparams["n_layer"]
+
+gguf_writer.add_name(last_dir)
+gguf_writer.add_context_length(2048) # not in config.json
+gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform
+gguf_writer.add_embedding_length(hparams["hidden_size"])
+gguf_writer.add_feed_forward_length(4 * hparams["hidden_size"])
+gguf_writer.add_block_count(block_count)
+gguf_writer.add_head_count(hparams["n_head"])
+if "n_head_kv" in hparams: gguf_writer.add_head_count_kv(hparams["n_head_kv"])
+gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
+
+# TOKENIZATION
+
+print("gguf: get tokenizer metadata")
+
+tokens: List[str] = []
+merges: List[str] = []
+
+
+if Path(dir_model + "/tokenizer.json").is_file():
+    # gpt2 tokenizer
+    gguf_writer.add_tokenizer_model("gpt2")
+
+    print("gguf: get gpt2 tokenizer merges")
+
+    with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f:
+        tokenizer_json = json.load(f)
+    merges = tokenizer_json["model"]["merges"]
+
+    gguf_writer.add_token_merges(merges)
+
+    print("gguf: get gpt2 tokenizer vocab")
+
+    vocab_size = len(tokenizer_json["model"]["vocab"])
+
+    # ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
+    tokenizer = AutoTokenizer.from_pretrained(dir_model)
+
+    reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
+    byte_encoder = bytes_to_unicode()
+    byte_decoder = {v: k for k, v in byte_encoder.items()}
+
+    for i in range(vocab_size):
+        if i in reverse_vocab:
+            try:
+                text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
+            except KeyError:
+                text = bytearray()
+                for c in reverse_vocab[i]:
+                    if ord(c) < 256:  # single byte character
+                        text.append(byte_decoder[ord(c)])
+                    else:  # multibyte special token character
+                        text.extend(c.encode('utf-8'))
+        else:
+            print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
+            pad_token = f"[PAD{i}]".encode("utf8")
+            text = bytearray(pad_token)
+
+        tokens.append(text)
+
+    gguf_writer.add_token_list(tokens)
+
+    if "added_tokens" in tokenizer_json and Path(dir_model + "/tokenizer_config.json").is_file():
+        print("gguf: get special token ids")
+
+        with open(dir_model + "/tokenizer_config.json", "r", encoding="utf-8") as f:
+            tokenizer_config = json.load(f)
+
+        # find special token ids
+
+        if "bos_token" in tokenizer_config:
+            for key in tokenizer_json["added_tokens"]:
+                if key["content"] == tokenizer_config["bos_token"]:
+                    gguf_writer.add_bos_token_id(key["id"])
+
+        if "eos_token" in tokenizer_config:
+            for key in tokenizer_json["added_tokens"]:
+                if key["content"] == tokenizer_config["eos_token"]:
+                    gguf_writer.add_eos_token_id(key["id"])
+
+        if "unk_token" in tokenizer_config:
+            for key in tokenizer_json["added_tokens"]:
+                if key["content"] == tokenizer_config["unk_token"]:
+                    gguf_writer.add_unk_token_id(key["id"])
+
+        if "sep_token" in tokenizer_config:
+            for key in tokenizer_json["added_tokens"]:
+                if key["content"] == tokenizer_config["sep_token"]:
+                    gguf_writer.add_sep_token_id(key["id"])
+
+        if "pad_token" in tokenizer_config:
+            for key in tokenizer_json["added_tokens"]:
+                if key["content"] == tokenizer_config["pad_token"]:
+                    gguf_writer.add_pad_token_id(key["id"])
+
+
+# TENSORS
+
+tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
+
+# params for qkv transform
+n_head = hparams["n_head"]
+n_head_kv = hparams["n_head_kv"] if "n_head_kv" in hparams else 1
+head_dim = hparams["hidden_size"] // n_head
+
+# tensor info
+print("gguf: get tensor metadata")
+
+if num_parts == 0:
+    part_names = ("pytorch_model.bin",)
+else:
+    part_names = (
+        f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
+    )
+
+for part_name in part_names:
+    print("gguf: loading model part '" + part_name + "'")
+    model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
+
+    for name in model_part.keys():
+        data = model_part[name]
+
+        old_dtype = data.dtype
+
+        # convert any unsupported data types to float32
+        if data.dtype != torch.float16 and data.dtype != torch.float32:
+            data = data.to(torch.float32)
+
+        # QKV tensor transform
+        # The original query_key_value tensor contains n_head_kv "kv groups",
+        # each consisting of n_head/n_head_kv query weights followed by one key
+        # and one value weight (shared by all query heads in the kv group).
+        # This layout makes it a big pain to work with in GGML.
+        # So we rearrange them here,, so that we have n_head query weights
+        # followed by n_head_kv key weights followed by n_head_kv value weights,
+        # in contiguous fashion.
+        # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py
+
+        if "query_key_value" in name:
+            qkv = data.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head)
+            q = qkv[:, :-2 ].reshape(n_head * head_dim, head_dim * n_head)
+            k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head)
+            v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head)
+            data = torch.cat((q,k,v)).reshape_as(data)
+
+        data = data.squeeze().numpy()
+
+        # map tensor names
+        if name.endswith(".weight") and name[:-7] in tensor_map:
+            name = tensor_map[name[:-7]] + ".weight"
+        elif name.endswith(".bias") and name[:-5] in tensor_map:
+            name = tensor_map[name[:-5]] + ".bias"
+        else:
+            print("Can not map tensor '" + name + "'")
+            sys.exit()
+
+        n_dims = len(data.shape)
+        data_dtype = data.dtype
+
+        # if f32 desired, convert any float16 to float32
+        if ftype == 0 and data_dtype == np.float16:
+            data = data.astype(np.float32)
+
+        # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
+        if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+            data = data.astype(np.float32)
+
+        # if f16 desired, convert any float32 2-dim weight tensors to float16
+        if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+            data = data.astype(np.float16)
+
+        print(name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
+
+        gguf_writer.add_tensor(name, data)
+
+
+print("gguf: write header")
+gguf_writer.write_header_to_file()
+print("gguf: write metadata")
+gguf_writer.write_kv_data_to_file()
+print("gguf: write tensors")
+gguf_writer.write_tensors_to_file()
+
+gguf_writer.close()
+
+print("gguf: model successfully exported to '" + fname_out + "'")
+print("")
--- a/convert-gptneox-hf-to-gguf.py
+++ b/convert-gptneox-hf-to-gguf.py
@@ -0,0 +1,266 @@
+# HF gptneox--> gguf conversion
+
+import gguf
+import os
+import sys
+import struct
+import json
+import numpy as np
+import torch
+
+from typing import Any, List
+from pathlib import Path
+from transformers import AutoTokenizer
+
+# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
+
+
+def bytes_to_unicode():
+    """
+    Returns list of utf-8 byte and a corresponding list of unicode strings.
+    The reversible bpe codes work on unicode strings.
+    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
+    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
+    This is a significant percentage of your normal, say, 32K bpe vocab.
+    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
+    And avoids mapping to whitespace/control characters the bpe code barfs on.
+    """
+    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
+    cs = bs[:]
+    n = 0
+    for b in range(2**8):
+        if b not in bs:
+            bs.append(b)
+            cs.append(2**8+n)
+            n += 1
+    cs = [chr(n) for n in cs]
+    return dict(zip(bs, cs))
+
+
+def count_model_parts(dir_model: str) -> int:
+    num_parts = 0
+    for filename in os.listdir(dir_model):
+        if filename.startswith("pytorch_model-"):
+            num_parts += 1
+
+    if num_parts > 0:
+        print("gguf: found " + str(num_parts) + " model parts")
+    return num_parts
+
+
+if len(sys.argv) < 3:
+    print("Usage: convert-h5-to-ggml.py dir-model ftype\n")
+    print("  ftype == 0 -> float32")
+    print("  ftype == 1 -> float16")
+    sys.exit(1)
+
+
+# output in the same directory as the model
+dir_model = sys.argv[1]
+last_dir = os.path.basename(os.path.normpath(dir_model))
+
+# possible tensor data types
+#   ftype == 0 -> float32
+#   ftype == 1 -> float16
+
+# map from ftype to string
+ftype_str = ["f32", "f16"]
+
+ftype = 1
+if len(sys.argv) > 2:
+    ftype = int(sys.argv[2])
+    if ftype < 0 or ftype > 1:
+        print("Invalid ftype: " + str(ftype))
+
+        sys.exit(1)
+
+fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".gguf"
+
+print("gguf: loading model "+last_dir)
+
+with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
+    hparams = json.load(f)
+
+if hparams["architectures"][0] != "GPTNeoXForCausalLM":
+    print("Model architecture not supported: " + hparams["architectures"][0])
+
+    sys.exit()
+
+# get number of model parts
+num_parts = count_model_parts(dir_model)
+
+ARCH=gguf.MODEL_ARCH.GPTNEOX
+gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
+
+print("gguf: get model metadata")
+
+block_count = hparams["num_hidden_layers"]
+
+gguf_writer.add_name(last_dir)
+gguf_writer.add_context_length(hparams["max_position_embeddings"])
+gguf_writer.add_embedding_length(hparams["hidden_size"])
+gguf_writer.add_block_count(block_count)
+gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
+gguf_writer.add_rope_dimension_count(int(hparams["rotary_pct"]*(hparams["hidden_size"]//hparams["num_attention_heads"])))
+gguf_writer.add_head_count(hparams["num_attention_heads"])
+gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
+gguf_writer.add_layer_norm_eps(hparams["layer_norm_eps"])
+
+# TOKENIZATION
+
+print("gguf: get tokenizer metadata")
+
+tokens: List[str] = []
+merges: List[str] = []
+
+
+if Path(dir_model + "/tokenizer.json").is_file():
+    # gpt2 tokenizer
+    gguf_writer.add_tokenizer_model("gpt2")
+
+    print("gguf: get gpt2 tokenizer merges")
+
+    with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f:
+        tokenizer_json = json.load(f)
+    merges = tokenizer_json["model"]["merges"]
+
+    gguf_writer.add_token_merges(merges)
+
+    print("gguf: get gpt2 tokenizer vocab")
+
+    vocab_size = len(tokenizer_json["model"]["vocab"])
+
+    # ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
+    tokenizer = AutoTokenizer.from_pretrained(dir_model)
+
+    reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
+    byte_encoder = bytes_to_unicode()
+    byte_decoder = {v: k for k, v in byte_encoder.items()}
+
+    for i in range(vocab_size):
+        if i in reverse_vocab:
+            try:
+                text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
+            except KeyError:
+                text = bytearray()
+                for c in reverse_vocab[i]:
+                    if ord(c) < 256:  # single byte character
+                        text.append(byte_decoder[ord(c)])
+                    else:  # multibyte special token character
+                        text.extend(c.encode('utf-8'))
+        else:
+            print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
+            pad_token = f"[PAD{i}]".encode("utf8")
+            text = bytearray(pad_token)
+
+        tokens.append(text)
+
+    gguf_writer.add_token_list(tokens)
+
+    if "added_tokens" in tokenizer_json and Path(dir_model + "/tokenizer_config.json").is_file():
+        print("gguf: get special token ids")
+
+        with open(dir_model + "/tokenizer_config.json", "r", encoding="utf-8") as f:
+            tokenizer_config = json.load(f)
+
+        # find special token ids
+
+        if "bos_token" in tokenizer_config:
+            for key in tokenizer_json["added_tokens"]:
+                if key["content"] == tokenizer_config["bos_token"]:
+                    gguf_writer.add_bos_token_id(key["id"])
+
+        if "eos_token" in tokenizer_config:
+            for key in tokenizer_json["added_tokens"]:
+                if key["content"] == tokenizer_config["eos_token"]:
+                    gguf_writer.add_eos_token_id(key["id"])
+
+        if "unk_token" in tokenizer_config:
+            for key in tokenizer_json["added_tokens"]:
+                if key["content"] == tokenizer_config["unk_token"]:
+                    gguf_writer.add_unk_token_id(key["id"])
+
+        if "sep_token" in tokenizer_config:
+            for key in tokenizer_json["added_tokens"]:
+                if key["content"] == tokenizer_config["sep_token"]:
+                    gguf_writer.add_sep_token_id(key["id"])
+
+        if "pad_token" in tokenizer_config:
+            for key in tokenizer_json["added_tokens"]:
+                if key["content"] == tokenizer_config["pad_token"]:
+                    gguf_writer.add_pad_token_id(key["id"])
+
+
+# TENSORS
+
+tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
+
+# tensor info
+print("gguf: get tensor metadata")
+
+if num_parts == 0:
+    part_names = ("pytorch_model.bin",)
+else:
+    part_names = (
+        f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
+    )
+
+for part_name in part_names:
+    print("gguf: loading model part '" + part_name + "'")
+    model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
+
+    for name in model_part.keys():
+        data = model_part[name]
+
+        # we don't need these
+        if name.endswith(".attention.masked_bias") or name.endswith(".attention.bias") or name.endswith(".attention.rotary_emb.inv_freq"):
+            continue
+
+        old_dtype = data.dtype
+
+        # convert any unsupported data types to float32
+        if data.dtype != torch.float16 and data.dtype != torch.float32:
+            data = data.to(torch.float32)
+
+        data = data.squeeze().numpy()
+
+        # map tensor names
+        if name.endswith(".weight") and name[:-7] in tensor_map:
+            name = tensor_map[name[:-7]] + ".weight"
+        elif name.endswith(".bias") and name[:-5] in tensor_map:
+            name = tensor_map[name[:-5]] + ".bias"
+        else:
+            print("Can not map tensor '" + name + "'")
+            sys.exit()
+
+        n_dims = len(data.shape)
+        data_dtype = data.dtype
+
+        # if f32 desired, convert any float16 to float32
+        if ftype == 0 and data_dtype == np.float16:
+            data = data.astype(np.float32)
+
+        # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
+        if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+            data = data.astype(np.float32)
+
+        # if f16 desired, convert any float32 2-dim weight tensors to float16
+        if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+            data = data.astype(np.float16)
+
+        print(name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
+
+        gguf_writer.add_tensor(name, data)
+
+
+print("gguf: write header")
+gguf_writer.write_header_to_file()
+print("gguf: write metadata")
+gguf_writer.write_kv_data_to_file()
+print("gguf: write tensors")
+gguf_writer.write_tensors_to_file()
+
+gguf_writer.close()
+
+print("gguf: model successfully exported to '" + fname_out + "'")
+print("")
--- a/convert-llama-7b-pth-to-gguf.py
+++ b/convert-llama-7b-pth-to-gguf.py
@@ -0,0 +1,307 @@
+# 7b pth llama --> gguf conversion
+# Only models with a single datafile are supported, like 7B
+# HF files required in the model dir: config.json tokenizer_config.json tokenizer.json tokenizer.model
+
+import gguf
+import os
+import sys
+import struct
+import json
+import numpy as np
+import torch
+
+from typing import Any, List
+from pathlib import Path
+from sentencepiece import SentencePieceProcessor
+
+#NDArray = np.ndarray[Any, Any]
+# compatible with python < 3.9
+NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]'
+
+
+def count_model_parts(dir_model: str) -> int:
+    num_parts = 0
+    for filename in os.listdir(dir_model):
+        if filename.startswith("consolidated."):
+            num_parts += 1
+
+    if num_parts > 0:
+        print("gguf: found " + str(num_parts) + " model parts")
+    return num_parts
+
+
+if len(sys.argv) < 3:
+    print("Usage: convert-h5-to-ggml.py dir-model ftype\n")
+    print("  ftype == 0 -> float32")
+    print("  ftype == 1 -> float16")
+
+    sys.exit(1)
+
+
+# output in the same directory as the model
+dir_model = sys.argv[1]
+last_dir = os.path.basename(os.path.normpath(dir_model))
+
+
+# possible tensor data types
+#   ftype == 0 -> float32
+#   ftype == 1 -> float16
+
+# map from ftype to string
+ftype_str = ["f32", "f16"]
+
+ftype = 1
+if len(sys.argv) > 2:
+    ftype = int(sys.argv[2])
+    if ftype < 0 or ftype > 1:
+        print("Invalid ftype: " + str(ftype))
+
+        sys.exit(1)
+
+fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".gguf"
+
+print("gguf: loading model "+last_dir)
+
+with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
+    hparams = json.load(f)
+
+if hparams["architectures"][0] != "LlamaForCausalLM":
+    print("Model architecture not supported: " + hparams["architectures"][0])
+    sys.exit()
+
+# get number of model parts
+num_parts = count_model_parts(dir_model)
+
+if num_parts > 1:
+    print("gguf: Only models with a single datafile are supported.")
+
+    sys.exit()
+
+ARCH=gguf.MODEL_ARCH.LLAMA
+gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
+
+
+print("gguf: get model metadata")
+
+block_count = hparams["num_hidden_layers"]
+head_count = hparams["num_attention_heads"]
+
+if "num_key_value_heads" in hparams:
+    head_count_kv = hparams["num_key_value_heads"]
+else:
+    head_count_kv = head_count
+
+if "_name_or_path" in hparams:
+    hf_repo = hparams["_name_or_path"]
+else:
+    hf_repo = ""
+
+if "max_sequence_length" in hparams:
+    ctx_length = hparams["max_sequence_length"]
+elif "max_position_embeddings" in hparams:
+    ctx_length = hparams["max_position_embeddings"]
+else:
+    print("gguf: can not find ctx length parameter.")
+
+    sys.exit()
+
+
+gguf_writer.add_name(last_dir)
+gguf_writer.add_source_hf_repo(hf_repo)
+gguf_writer.add_tensor_data_layout("Meta AI original pth")
+gguf_writer.add_context_length(ctx_length)
+gguf_writer.add_embedding_length(hparams["hidden_size"])
+gguf_writer.add_block_count(block_count)
+gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
+gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
+gguf_writer.add_head_count(head_count)
+gguf_writer.add_head_count_kv(head_count_kv)
+gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
+
+if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]:
+    if "type" in hparams["rope_scaling"]:
+        if hparams["rope_scaling"]["type"] == "linear":
+            gguf_writer.add_rope_scale_linear(hparams["rope_scaling"]["factor"])
+
+
+# TOKENIZATION
+
+print("gguf: get tokenizer metadata")
+
+tokens: List[bytes] = []
+scores: List[float] = []
+toktypes: List[int] = []
+
+if Path(dir_model + "/tokenizer.model").is_file():
+    # vocab type sentencepiece
+    print("gguf: get sentencepiece tokenizer vocab and scores")
+
+    tokenizer = SentencePieceProcessor(dir_model + "/tokenizer.model")
+
+    for i in range(tokenizer.vocab_size()):
+        text: bytes
+        score: float
+
+        piece = tokenizer.id_to_piece(i)
+        text = piece.encode("utf-8")
+        score = tokenizer.get_score(i)
+
+        toktype = 1  # defualt to normal token type
+        if tokenizer.is_unknown(i):
+            toktype = 2
+        if tokenizer.is_control(i):
+            toktype = 3
+
+        # toktype = 4 is user-defined = tokens from added_tokens.json
+
+        if tokenizer.is_unused(i):
+            toktype = 5
+        if tokenizer.is_byte(i):
+            toktype = 6
+
+        tokens.append(text)
+        scores.append(score)
+        toktypes.append(toktype)
+
+    if Path(dir_model + "/added_tokens.json").is_file():
+        with open(dir_model + "/added_tokens.json", "r", encoding="utf-8") as f:
+            addtokens_json = json.load(f)
+
+            print("gguf: get added tokens")
+
+            for key in addtokens_json:
+                tokens.append( key.encode("utf-8") )
+                scores.append(-1000.0)
+                toktypes.append(4) # user-defined token type
+
+    gguf_writer.add_tokenizer_model("llama")
+    gguf_writer.add_token_list(tokens)
+    gguf_writer.add_token_scores(scores)
+    gguf_writer.add_token_types(toktypes)
+
+
+print("gguf: get special token ids")
+
+if Path(dir_model + "/tokenizer.json").is_file():
+    # Look for special tokens in tokenizer.json if it exists
+
+    with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f:
+        tokenizer = json.load(f)
+
+    if "added_tokens" in tokenizer and Path(dir_model + "/tokenizer_config.json").is_file():
+
+        with open(dir_model + "/tokenizer_config.json", "r", encoding="utf-8") as f:
+            tokenizer_config = json.load(f)
+
+        if "bos_token" in tokenizer_config and tokenizer_config["bos_token"] != None:
+            for key in tokenizer["added_tokens"]:
+                if key["content"] == tokenizer_config["bos_token"]["content"]:
+                    gguf_writer.add_bos_token_id(key["id"])
+
+        if "eos_token" in tokenizer_config and tokenizer_config["eos_token"] != None:
+            for key in tokenizer["added_tokens"]:
+                if key["content"] == tokenizer_config["eos_token"]["content"]:
+                    gguf_writer.add_eos_token_id(key["id"])
+
+        if "unk_token" in tokenizer_config and tokenizer_config["unk_token"] != None:
+            for key in tokenizer["added_tokens"]:
+                if key["content"] == tokenizer_config["unk_token"]["content"]:
+                    gguf_writer.add_unk_token_id(key["id"])
+
+        if "sep_token" in tokenizer_config and tokenizer_config["sep_token"] != None:
+            for key in tokenizer["added_tokens"]:
+                if key["content"] == tokenizer_config["sep_token"]["content"]:
+                    gguf_writer.add_sep_token_id(key["id"])
+
+        if "pad_token" in tokenizer_config and tokenizer_config["pad_token"] != None:
+            for key in tokenizer["added_tokens"]:
+                if key["content"] == tokenizer_config["pad_token"]["content"]:
+                    gguf_writer.add_pad_token_id(key["id"])
+else:
+    # If no tokenizer.json: Look for special tokens in config.json
+
+    if "bos_token_id" in hparams and hparams["bos_token_id"] != None:
+        gguf_writer.add_bos_token_id(hparams["bos_token_id"])
+
+    if "eos_token_id" in hparams and hparams["eos_token_id"] != None:
+        gguf_writer.add_eos_token_id(hparams["eos_token_id"])
+
+    if "unk_token_id" in hparams and hparams["unk_token_id"] != None:
+        gguf_writer.add_unk_token_id(hparams["unk_token_id"])
+
+    if "sep_token_id" in hparams and hparams["sep_token_id"] != None:
+        gguf_writer.add_sep_token_id(hparams["sep_token_id"])
+
+    if "pad_token_id" in hparams and hparams["pad_token_id"] != None:
+        gguf_writer.add_pad_token_id(hparams["pad_token_id"])
+
+
+# TENSORS
+
+tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
+
+# tensor info
+print("gguf: get tensor metadata")
+
+part_names = (f"consolidated.{n:02}.pth" for n in range(0, num_parts))
+
+for part_name in part_names:
+    print("gguf: loading model part '" + part_name + "'")
+    model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
+
+    for name in model_part.keys():
+        data = model_part[name]
+
+        # we don't need these
+        if name == "rope.freqs":
+            continue
+
+        old_dtype = data.dtype
+
+        # convert any unsupported data types to float32
+        if data.dtype != torch.float16 and data.dtype != torch.float32:
+            data = data.to(torch.float32)
+
+        data = data.squeeze().numpy()
+
+        # map tensor names
+        if name.endswith(".weight") and name[:-7] in tensor_map:
+            name = tensor_map[name[:-7]] + ".weight"
+        elif name.endswith(".bias") and name[:-5] in tensor_map:
+            name = tensor_map[name[:-5]] + ".bias"
+        else:
+            print("Can not map tensor '" + name + "'")
+            sys.exit()
+
+        n_dims = len(data.shape)
+        data_dtype = data.dtype
+
+        # if f32 desired, convert any float16 to float32
+        if ftype == 0 and data_dtype == np.float16:
+            data = data.astype(np.float32)
+
+        # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
+        if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+            data = data.astype(np.float32)
+
+        # if f16 desired, convert any float32 2-dim weight tensors to float16
+        if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+            data = data.astype(np.float16)
+
+        print(name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
+
+        gguf_writer.add_tensor(name, data)
+
+
+print("gguf: write header")
+gguf_writer.write_header_to_file()
+print("gguf: write metadata")
+gguf_writer.write_kv_data_to_file()
+print("gguf: write tensors")
+gguf_writer.write_tensors_to_file()
+
+gguf_writer.close()
+
+
+print("gguf: model successfully exported to '" + fname_out + "'")
+print("")
--- a/convert-llama-ggmlv3-to-gguf.py
+++ b/convert-llama-ggmlv3-to-gguf.py
@@ -0,0 +1,344 @@
+import sys, struct, math, argparse
+from pathlib import Path
+
+import numpy as np
+
+import gguf
+
+# Note: Does not support GGML_QKK_64
+QK_K = 256
+# Items here are (block size, type size)
+GGML_QUANT_SIZES = {
+    gguf.GGMLQuantizationType.F32  : (1, 4),
+    gguf.GGMLQuantizationType.F16  : (1, 2),
+    gguf.GGMLQuantizationType.Q4_0 : (32, 2 + 16),
+    gguf.GGMLQuantizationType.Q4_1 : (32, 2 + 2 + 16),
+    gguf.GGMLQuantizationType.Q5_0 : (32, 2 + 4 + 16),
+    gguf.GGMLQuantizationType.Q5_1 : (32, 2 + 2 + 4 + 16),
+    gguf.GGMLQuantizationType.Q8_0 : (32, 2 + 32),
+    gguf.GGMLQuantizationType.Q8_1 : (32, 4 + 4 + 32),
+    gguf.GGMLQuantizationType.Q2_K : (256, 2 + 2 + QK_K // 16 + QK_K // 4),
+    gguf.GGMLQuantizationType.Q3_K : (256, 2 + QK_K // 4 + QK_K // 8 + 12),
+    gguf.GGMLQuantizationType.Q4_K : (256, 2 + 2 + QK_K // 2 + 12),
+    gguf.GGMLQuantizationType.Q5_K : (256, 2 + 2 + QK_K // 2 + QK_K // 8 + 12),
+    gguf.GGMLQuantizationType.Q6_K : (256, 2 + QK_K // 2 + QK_K // 4 + QK_K // 16),
+    gguf.GGMLQuantizationType.Q8_K : (256, 4 + QK_K + QK_K // 8),
+}
+
+class Hyperparameters:
+    def __init__(self):
+        self.n_vocab = self.n_embd = self.n_mult = self.n_head = self.n_layer = self.n_rot = self.ftype = 0
+        self.n_ff = 0
+
+    def set_n_ff(self, model):
+        ff_tensor_idx = model.tensor_map.get(b'layers.0.feed_forward.w1.weight')
+        assert ff_tensor_idx is not None, 'Missing layer 0 FF tensor'
+        ff_tensor = model.tensors[ff_tensor_idx]
+        self.n_ff = ff_tensor.dims[1]
+
+    def load(self, data, offset):
+        (
+            self.n_vocab,
+            self.n_embd,
+            self.n_mult,
+            self.n_head,
+            self.n_layer,
+            self.n_rot,
+            self.ftype,
+        ) = struct.unpack('<7I', data[offset:offset + (4 * 7)])
+        return 4 * 7
+
+    def __str__(self):
+        return f'<Hyperparameters: n_vocab={self.n_vocab}, n_embd={self.n_embd}, n_mult={self.n_mult}, n_head={self.n_head}, n_layer={self.n_layer}, n_rot={self.n_rot}, n_ff={self.n_ff}, ftype={self.ftype}>'
+
+class Vocab:
+    def __init__(self):
+        self.items = []
+
+    def load(self, data, offset, n_vocab):
+        orig_offset = offset
+        for _ in range(n_vocab):
+            itemlen = struct.unpack('<I', data[offset:offset + 4])[0]
+            assert itemlen < 4096, 'Absurd vocab item length'
+            offset += 4
+            vocab = bytes(data[offset:offset + itemlen])
+            offset += itemlen
+            score = struct.unpack('<f', data[offset:offset + 4])[0]
+            offset += 4
+            self.items.append((vocab, score))
+        return offset - orig_offset
+
+class Tensor:
+    def __init__(self):
+        self.name = None
+        self.dims = ()
+        self.dtype = None
+        self.start_offset = 0
+        self.len_bytes = 0
+
+    def load(self, data, offset):
+        orig_offset = offset
+        (n_dims, name_len, dtype) = struct.unpack('<3I', data[offset:offset + 12])
+        assert n_dims >= 0 and n_dims <= 4, f'Invalid tensor dimensions {n_dims}'
+        assert name_len < 4096, 'Absurd tensor name length'
+        quant = GGML_QUANT_SIZES.get(dtype)
+        assert quant is not None, 'Unknown tensor type'
+        (blksize, tysize) = quant
+        offset += 12
+        self.dtype= dtype
+        self.dims = struct.unpack(f'<{n_dims}I', data[offset:offset + (4 * n_dims)])
+        offset += 4 * n_dims
+        self.name = bytes(data[offset:offset + name_len])
+        offset += name_len
+        pad = ((offset + 31) & ~31) - offset
+        offset += pad
+        n_elems = np.prod(self.dims)
+        n_bytes = np.int64(np.int64(n_elems) * np.int64(tysize)) // np.int64(blksize)
+        self.start_offset = offset
+        self.len_bytes = n_bytes
+        offset += n_bytes
+        # print(n_dims, name_len, dtype, self.dims, self.name, pad)
+        return offset - orig_offset
+
+class GGMLV3Model:
+    def __init__(self):
+        self.hyperparameters = None
+        self.vocab = None
+        self.tensor_map = {}
+        self.tensors = []
+
+    def validate_header(self, data, offset):
+        if bytes(data[offset:offset + 4]) != b'tjgg' or struct.unpack('<I', data[offset + 4:offset + 8])[0] != 3:
+            raise ValueError('Only GGJTv3 supported')
+        return 8
+
+    def load(self, data, offset):
+        offset += self.validate_header(data, offset)
+        hp = Hyperparameters()
+        offset += hp.load(data, offset)
+        vocab = Vocab()
+        offset += vocab.load(data, offset, hp.n_vocab)
+        tensors = []
+        tensor_map = {}
+        while offset < len(data):
+            tensor = Tensor()
+            offset += tensor.load(data, offset)
+            tensor_map[tensor.name] = len(tensors)
+            tensors.append(tensor)
+        self.hyperparameters = hp
+        self.vocab = vocab
+        self.tensors = tensors
+        self.tensor_map = tensor_map
+        hp.set_n_ff(self)
+        return offset
+
+class GGMLToGGUF:
+    def __init__(self, ggml_model, data, cfg, params_override = None, vocab_override = None):
+        hp = ggml_model.hyperparameters
+        self.model = ggml_model
+        self.data = data
+        self.cfg = cfg
+        self.params_override = params_override
+        self.vocab_override = vocab_override
+        if params_override is not None:
+            n_kv_head = params_override.n_head_kv
+        else:
+            if cfg.gqa == 1:
+                n_kv_head = hp.n_head
+            else:
+                gqa = float(cfg.gqa)
+                n_kv_head = None
+                for x in range(1, 256):
+                    if float(hp.n_head) / float(x) == gqa:
+                        n_kv_head = x
+                assert n_kv_head is not None, "Couldn't determine n_kv_head from GQA param"
+                print(f'- Guessed n_kv_head = {n_kv_head} based on GQA {cfg.gqa}')
+        self.n_kv_head = n_kv_head
+        self.name_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.LLAMA, ggml_model.hyperparameters.n_layer)
+
+    def save(self):
+        print('* Preparing to save GGUF file')
+        gguf_writer = gguf.GGUFWriter(self.cfg.output, gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], use_temp_file = False)
+        self.add_params(gguf_writer)
+        self.add_vocab(gguf_writer)
+        self.add_tensors(gguf_writer)
+        print("    gguf: write header")
+        gguf_writer.write_header_to_file()
+        print("    gguf: write metadata")
+        gguf_writer.write_kv_data_to_file()
+        print("    gguf: write tensors")
+        gguf_writer.write_tensors_to_file()
+        gguf_writer.close()
+
+    def add_params(self, gguf_writer):
+        hp = self.model.hyperparameters
+        cfg = self.cfg
+        desc = cfg.desc if cfg.desc is not None else 'converted from legacy GGJTv3 format'
+        try:
+            # Filenames aren't necessarily valid UTF8.
+            name = cfg.name if cfg.name is not None else cfg.input.name
+        except UnicodeDecodeError:
+            name = None
+        print('* Adding model parameters and KV items')
+        if name is not None:
+            gguf_writer.add_name(name)
+        gguf_writer.add_description(desc)
+        if self.params_override is not None:
+            po = self.params_override
+            assert po.n_embd == hp.n_embd, 'Model hyperparams mismatch'
+            assert po.n_layer == hp.n_layer, 'Model hyperparams mismatch'
+            assert po.n_head == hp.n_head, 'Model hyperparams mismatch'
+            gguf_writer.add_context_length      (po.n_ctx)
+            gguf_writer.add_embedding_length    (po.n_embd)
+            gguf_writer.add_block_count         (po.n_layer)
+            gguf_writer.add_feed_forward_length (po.n_ff)
+            gguf_writer.add_rope_dimension_count(po.n_embd // po.n_head)
+            gguf_writer.add_head_count          (po.n_head)
+            gguf_writer.add_head_count_kv       (po.n_head_kv)
+            gguf_writer.add_layer_norm_rms_eps  (po.f_norm_eps)
+            return
+        gguf_writer.add_context_length(cfg.context_length)
+        gguf_writer.add_embedding_length(hp.n_embd)
+        gguf_writer.add_block_count(hp.n_layer)
+        gguf_writer.add_feed_forward_length(hp.n_ff)
+        gguf_writer.add_rope_dimension_count(hp.n_embd // hp.n_head)
+        gguf_writer.add_head_count(hp.n_head)
+        gguf_writer.add_head_count_kv(self.n_kv_head)
+        gguf_writer.add_layer_norm_rms_eps(float(cfg.eps))
+
+    def add_vocab(self, gguf_writer):
+        hp = self.model.hyperparameters
+        gguf_writer.add_tokenizer_model('llama')
+        tokens = []
+        scores = []
+        toktypes = []
+        if self.vocab_override is not None:
+            vo = self.vocab_override
+            print('* Adding vocab item(s)')
+            for (idx, (vbytes, score, ttype)) in enumerate(vo.all_tokens()):
+                tokens.append(vbytes)
+                scores.append(score)
+                toktypes.append(ttype)
+            assert len(tokens) == hp.n_vocab, f'Override vocab has a different number of items than hyperparameters - override = {len(tokens)} but n_vocab={hp.n_vocab}'
+            gguf_writer.add_token_list(tokens)
+            gguf_writer.add_token_scores(scores)
+            if len(toktypes) > 0:
+                gguf_writer.add_token_types(toktypes)
+            return
+        print(f'* Adding {hp.n_vocab} vocab item(s)')
+        assert len(self.model.vocab.items) >= 3, 'Cannot handle unexpectedly short model vocab'
+        for (tokid, (vbytes, vscore)) in enumerate(self.model.vocab.items):
+            tt = 1 # Normal
+            # Special handling for UNK, BOS, EOS tokens.
+            if tokid <= 2:
+                if tokid == 0:
+                    vbytes = b'<unk>'
+                    tt = 2
+                elif tokid == 1:
+                    vbytes = b'<s>'
+                    tt = 3
+                else:
+                    vbytes = b'</s>'
+                    tt = 3
+            elif len(vbytes) == 0:
+                tt = 3 # Control
+            elif tokid >= 3 and tokid <= 258 and len(vbytes) == 1:
+                vbytes = bytes(f'<0x{vbytes[0]:02X}>', encoding = 'UTF-8')
+                tt = 6 # Byte
+            else:
+                vbytes = vbytes.replace(b' ', b'\xe2\x96\x81')
+            toktypes.append(tt)
+            tokens.append(vbytes)
+            scores.append(vscore)
+        gguf_writer.add_token_list(tokens)
+        gguf_writer.add_token_scores(scores)
+        gguf_writer.add_token_types(toktypes)
+        gguf_writer.add_unk_token_id(0)
+        gguf_writer.add_bos_token_id(1)
+        gguf_writer.add_eos_token_id(2)
+
+    def add_tensors(self, gguf_writer):
+        nm = self.name_map
+        data = self.data
+        print(f'* Adding {len(self.model.tensors)} tensor(s)')
+        for tensor in self.model.tensors:
+            name = str(tensor.name, 'UTF-8')
+            if name.endswith('.weight'):
+                name = name[:-7]
+                suffix = '.weight'
+            elif name.endswith('.bias'):
+                name = name[:-5]
+                suffix = '.bias'
+            mapped_name = nm.get(name)
+            assert mapped_name is not None, f'Bad name {name}'
+            mapped_name += suffix
+            tempdims = list(tensor.dims[:])
+            if len(tempdims) > 1:
+                temp = tempdims[1]
+                tempdims[1] = tempdims[0]
+                tempdims[0] = temp
+            # print(f'+ {tensor.name} | {mapped_name} {tensor.dims} :: {tempdims}')
+            gguf_writer.add_tensor(mapped_name, data[tensor.start_offset:tensor.start_offset + tensor.len_bytes], raw_shape = tempdims, raw_dtype = tensor.dtype)
+
+def handle_metadata(cfg, hp):
+    import convert
+    assert cfg.model_metadata_dir.is_dir(), 'Metadata dir is not a directory'
+    hf_config_path   = cfg.model_metadata_dir / "config.json"
+    orig_config_path = cfg.model_metadata_dir / "params.json"
+    # We pass a fake model here. "original" mode will check the shapes of some
+    # tensors if information is missing in the .json file: other than that, the
+    # model data isn't used so this should be safe (at least for now).
+    fakemodel = {
+        'tok_embeddings.weight': convert.LazyTensor.__new__(convert.LazyTensor),
+        'layers.0.feed_forward.w1.weight': convert.LazyTensor.__new__(convert.LazyTensor),
+    }
+    fakemodel['tok_embeddings.weight'].shape = [hp.n_vocab]
+    fakemodel['layers.0.feed_forward.w1.weight'].shape = [hp.n_ff]
+    if hf_config_path.exists():
+        params = convert.Params.loadHFTransformerJson(fakemodel, hf_config_path)
+    elif orig_config_path.exists():
+        params = convert.Params.loadOriginalParamsJson(fakemodel, orig_config_path)
+    else:
+        raise ValueError('Unable to load metadata')
+    vocab = convert.load_vocab(cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir, cfg.vocabtype)
+    convert.check_vocab_size(params, vocab)
+    return (params, vocab)
+
+def handle_args():
+    parser = argparse.ArgumentParser(description = 'Convert GGMLv3 models to GGUF')
+    parser.add_argument('--input', '-i', type = Path, help = 'Input GGMLv3 filename')
+    parser.add_argument('--output', '-o', type = Path, help ='Output GGUF filename')
+    parser.add_argument('--name', help = 'Set model name')
+    parser.add_argument('--desc', help = 'Set model description')
+    parser.add_argument('--gqa', type = int, default = 1, help = 'grouped-query attention factor (use 8 for LLaMA2 70B)')
+    parser.add_argument('--eps', default = '5.0e-06', help = 'RMS norm eps: Use 1e-6 for LLaMA1 and OpenLLaMA, use 1e-5 for LLaMA2')
+    parser.add_argument('--context-length', '-c', type=int, default = 2048, help = 'Default max context length: LLaMA1 is typically 2048, LLaMA2 is typically 4096')
+    parser.add_argument('--model-metadata-dir', '-m', type = Path, help ='Load HuggingFace/.pth vocab and metadata from the specified directory')
+    parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir")
+    parser.add_argument("--vocabtype", choices=["spm", "bpe"], help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)", default="spm")
+    return parser.parse_args()
+
+def main():
+    cfg = handle_args()
+    print(f'* Using config: {cfg}')
+    print('\n=== WARNING === Be aware that this conversion script is best-effort. Use a native GGUF model if possible. === WARNING ===\n')
+    data = np.memmap(cfg.input, mode = 'r')
+    model = GGMLV3Model()
+    print('* Scanning GGML input file')
+    offset = model.load(data, 0)
+    print(f'* GGML model hyperparameters: {model.hyperparameters}')
+    vocab_override = None
+    params_override = None
+    if cfg.model_metadata_dir is not None:
+        (params_override, vocab_override) = handle_metadata(cfg, model.hyperparameters)
+        print('!! Note: When overriding params the --gqa, --eps and --context-length options are ignored.')
+        print(f'* Overriding params: {params_override}')
+        print(f'* Overriding vocab: {vocab_override}')
+    else:
+        print('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n')
+    converter = GGMLToGGUF(model, data, cfg, params_override = params_override, vocab_override = vocab_override)
+    converter.save()
+    print(f'* Successful completion. Output saved to: {cfg.output}')
+
+if __name__ == '__main__':
+    main()
--- a/convert-llama-hf-to-gguf.py
+++ b/convert-llama-hf-to-gguf.py
@@ -0,0 +1,327 @@
+# HF llama --> gguf conversion
+
+import gguf
+import os
+import sys
+import struct
+import json
+import numpy as np
+import torch
+
+from typing import Any, List, Optional
+from pathlib import Path
+from sentencepiece import SentencePieceProcessor
+
+#NDArray = np.ndarray[Any, Any]
+# compatible with python < 3.9
+NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]'
+
+# reverse HF permute back to original pth layout
+# https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/convert_llama_weights_to_hf.py
+
+
+def reverse_hf_permute(weights: NDArray, n_head: int, n_kv_head: Optional[int] = None) -> NDArray:
+    if n_kv_head is not None and n_head != n_kv_head:
+        n_head //= n_kv_head
+
+    return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
+            .swapaxes(1, 2)
+            .reshape(weights.shape))
+
+
+def count_model_parts(dir_model: str) -> int:
+    num_parts = 0
+
+    for filename in os.listdir(dir_model):
+        if filename.startswith("pytorch_model-"):
+            num_parts += 1
+
+    if num_parts > 0:
+        print("gguf: found " + str(num_parts) + " model parts")
+
+    return num_parts
+
+
+if len(sys.argv) < 3:
+    print("Usage: convert-h5-to-ggml.py dir-model ftype\n")
+    print("  ftype == 0 -> float32")
+    print("  ftype == 1 -> float16")
+
+    sys.exit(1)
+
+
+# output in the same directory as the model
+dir_model = sys.argv[1]
+last_dir = os.path.basename(os.path.normpath(dir_model))
+
+
+# possible tensor data types
+#   ftype == 0 -> float32
+#   ftype == 1 -> float16
+
+
+# map from ftype to string
+ftype_str = ["f32", "f16"]
+
+ftype = 1
+if len(sys.argv) > 2:
+    ftype = int(sys.argv[2])
+    if ftype < 0 or ftype > 1:
+        print("Invalid ftype: " + str(ftype))
+
+        sys.exit(1)
+
+fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".gguf"
+
+print("gguf: loading model "+last_dir)
+
+with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
+    hparams = json.load(f)
+
+if hparams["architectures"][0] != "LlamaForCausalLM":
+    print("Model architecture not supported: " + hparams["architectures"][0])
+
+    sys.exit()
+
+# get number of model parts
+num_parts = count_model_parts(dir_model)
+
+ARCH=gguf.MODEL_ARCH.LLAMA
+gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
+
+print("gguf: get model metadata")
+
+block_count = hparams["num_hidden_layers"]
+head_count = hparams["num_attention_heads"]
+
+if "num_key_value_heads" in hparams:
+    head_count_kv = hparams["num_key_value_heads"]
+else:
+    head_count_kv = head_count
+
+if "_name_or_path" in hparams:
+    hf_repo = hparams["_name_or_path"]
+else:
+    hf_repo = ""
+
+if "max_sequence_length" in hparams:
+    ctx_length = hparams["max_sequence_length"]
+elif "max_position_embeddings" in hparams:
+    ctx_length = hparams["max_position_embeddings"]
+else:
+    print("gguf: can not find ctx length parameter.")
+
+    sys.exit()
+
+
+gguf_writer.add_name(last_dir)
+gguf_writer.add_source_hf_repo(hf_repo)
+gguf_writer.add_tensor_data_layout("Meta AI original pth")
+gguf_writer.add_context_length(ctx_length)
+gguf_writer.add_embedding_length(hparams["hidden_size"])
+gguf_writer.add_block_count(block_count)
+gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
+gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
+gguf_writer.add_head_count(head_count)
+gguf_writer.add_head_count_kv(head_count_kv)
+gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
+
+if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]:
+    if "type" in hparams["rope_scaling"]:
+        if hparams["rope_scaling"]["type"] == "linear":
+            gguf_writer.add_rope_scale_linear(hparams["rope_scaling"]["factor"])
+
+
+# TOKENIZATION
+
+print("gguf: get tokenizer metadata")
+
+tokens: List[bytes] = []
+scores: List[float] = []
+toktypes: List[int] = []
+
+if Path(dir_model + "/tokenizer.model").is_file():
+    # vocab type sentencepiece
+    print("gguf: get sentencepiece tokenizer vocab, scores and token types")
+
+    tokenizer = SentencePieceProcessor(dir_model + "/tokenizer.model")
+
+    for i in range(tokenizer.vocab_size()):
+        text: bytes
+        score: float
+
+        piece = tokenizer.id_to_piece(i)
+        text = piece.encode("utf-8")
+        score = tokenizer.get_score(i)
+
+        toktype = 1  # defualt to normal token type
+        if tokenizer.is_unknown(i):
+            toktype = 2
+        if tokenizer.is_control(i):
+            toktype = 3
+
+        # toktype = 4 is user-defined = tokens from added_tokens.json
+
+        if tokenizer.is_unused(i):
+            toktype = 5
+        if tokenizer.is_byte(i):
+            toktype = 6
+
+        tokens.append(text)
+        scores.append(score)
+        toktypes.append(toktype)
+
+    if Path(dir_model + "/added_tokens.json").is_file():
+        with open(dir_model + "/added_tokens.json", "r", encoding="utf-8") as f:
+            addtokens_json = json.load(f)
+
+            print("gguf: get added tokens")
+
+            for key in addtokens_json:
+                tokens.append( key.encode("utf-8") )
+                scores.append(-1000.0)
+                toktypes.append(4) # user-defined token type
+
+
+    gguf_writer.add_tokenizer_model("llama")
+    gguf_writer.add_token_list(tokens)
+    gguf_writer.add_token_scores(scores)
+    gguf_writer.add_token_types(toktypes)
+
+
+print("gguf: get special token ids")
+
+if Path(dir_model + "/tokenizer.json").is_file():
+    # Look for special tokens in tokenizer.json if it exists
+
+    with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f:
+        tokenizer = json.load(f)
+
+    if "added_tokens" in tokenizer and Path(dir_model + "/tokenizer_config.json").is_file():
+
+        with open(dir_model + "/tokenizer_config.json", "r", encoding="utf-8") as f:
+            tokenizer_config = json.load(f)
+
+        if "bos_token" in tokenizer_config and tokenizer_config["bos_token"] != None:
+            for key in tokenizer["added_tokens"]:
+                if key["content"] == tokenizer_config["bos_token"]["content"]:
+                    gguf_writer.add_bos_token_id(key["id"])
+
+        if "eos_token" in tokenizer_config and tokenizer_config["eos_token"] != None:
+            for key in tokenizer["added_tokens"]:
+                if key["content"] == tokenizer_config["eos_token"]["content"]:
+                    gguf_writer.add_eos_token_id(key["id"])
+
+        if "unk_token" in tokenizer_config and tokenizer_config["unk_token"] != None:
+            for key in tokenizer["added_tokens"]:
+                if key["content"] == tokenizer_config["unk_token"]["content"]:
+                    gguf_writer.add_unk_token_id(key["id"])
+
+        if "sep_token" in tokenizer_config and tokenizer_config["sep_token"] != None:
+            for key in tokenizer["added_tokens"]:
+                if key["content"] == tokenizer_config["sep_token"]["content"]:
+                    gguf_writer.add_sep_token_id(key["id"])
+
+        if "pad_token" in tokenizer_config and tokenizer_config["pad_token"] != None:
+            for key in tokenizer["added_tokens"]:
+                if key["content"] == tokenizer_config["pad_token"]["content"]:
+                    gguf_writer.add_pad_token_id(key["id"])
+else:
+    # If no tokenizer.json: Look for special tokens in config.json
+
+    if "bos_token_id" in hparams and hparams["bos_token_id"] != None:
+        gguf_writer.add_bos_token_id(hparams["bos_token_id"])
+
+    if "eos_token_id" in hparams and hparams["eos_token_id"] != None:
+        gguf_writer.add_eos_token_id(hparams["eos_token_id"])
+
+    if "unk_token_id" in hparams and hparams["unk_token_id"] != None:
+        gguf_writer.add_unk_token_id(hparams["unk_token_id"])
+
+    if "sep_token_id" in hparams and hparams["sep_token_id"] != None:
+        gguf_writer.add_sep_token_id(hparams["sep_token_id"])
+
+    if "pad_token_id" in hparams and hparams["pad_token_id"] != None:
+        gguf_writer.add_pad_token_id(hparams["pad_token_id"])
+
+
+# TENSORS
+
+tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
+
+# tensor info
+print("gguf: get tensor metadata")
+
+if num_parts == 0:
+    part_names = ("pytorch_model.bin",)
+else:
+    part_names = (
+        f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
+    )
+
+for part_name in part_names:
+    print("gguf: loading model part '" + part_name + "'")
+    model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
+
+    for name in model_part.keys():
+        data = model_part[name]
+
+        # we don't need these
+        if name.endswith(".rotary_emb.inv_freq"):
+            continue
+
+        old_dtype = data.dtype
+
+        # convert any unsupported data types to float32
+        if data.dtype != torch.float16 and data.dtype != torch.float32:
+            data = data.to(torch.float32)
+
+        data = data.squeeze().numpy()
+
+        # reverse permute these
+        if name.endswith(".q_proj.weight"):
+            data = reverse_hf_permute(data, head_count)
+        if name.endswith(".k_proj.weight"):
+            data = reverse_hf_permute(data, head_count, head_count_kv)
+
+        # map tensor names
+        if name.endswith(".weight") and name[:-7] in tensor_map:
+            name = tensor_map[name[:-7]] + ".weight"
+        elif name.endswith(".bias") and name[:-5] in tensor_map:
+            name = tensor_map[name[:-5]] + ".bias"
+        else:
+            print("Can not map tensor '" + name + "'")
+            sys.exit()
+
+        n_dims = len(data.shape)
+        data_dtype = data.dtype
+
+        # if f32 desired, convert any float16 to float32
+        if ftype == 0 and data_dtype == np.float16:
+            data = data.astype(np.float32)
+
+        # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
+        if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+            data = data.astype(np.float32)
+
+        # if f16 desired, convert any float32 2-dim weight tensors to float16
+        if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+            data = data.astype(np.float16)
+
+        print(name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
+
+        gguf_writer.add_tensor(name, data)
+
+
+print("gguf: write header")
+gguf_writer.write_header_to_file()
+print("gguf: write metadata")
+gguf_writer.write_kv_data_to_file()
+print("gguf: write tensors")
+gguf_writer.write_tensors_to_file()
+
+gguf_writer.close()
+
+
+print("gguf: model successfully exported to '" + fname_out + "'")
+print("")
--- a/convert-pth-to-ggml.py
+++ b/convert-pth-to-ggml.py
@@ -1,13 +0,0 @@
-# Compatibility stub
-
-import argparse
-
-import convert
-
-parser = argparse.ArgumentParser(
-    description="""[DEPRECATED - use `convert.py` instead]
-    Convert a LLaMA model checkpoint to a ggml compatible file""")
-parser.add_argument('dir_model',  help='directory containing the model checkpoint')
-parser.add_argument('ftype',      help='file type (0: float32, 1: float16)', type=int, choices=[0, 1], default=1)
-args = parser.parse_args()
-convert.main(['--outtype', 'f16' if args.ftype == 1 else 'f32', '--', args.dir_model])
--- a/convert.py
+++ b/convert.py
--- a/docs/token_generation_performance_tips.md
+++ b/docs/token_generation_performance_tips.md
@@ -3,7 +3,7 @@
 ## Verifying that the model is running on the GPU with cuBLAS
 Make sure you compiled llama with the correct env variables according to [this guide](../README.md#cublas), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example:
 ```shell
-./main -m "path/to/model.bin" -ngl 200000 -p "Please sir, may I have some "
+./main -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some "
 ```

 When running llama, before it starts the inference work, it will output diagnostic information that shows whether cuBLAS is offloading work to the GPU. Look for these lines:
@@ -25,9 +25,9 @@ GPU: A6000 (48GB VRAM)
 CPU: 7 physical cores
 RAM: 32GB

-Model: `TheBloke_Wizard-Vicuna-30B-Uncensored-GGML/Wizard-Vicuna-30B-Uncensored.ggmlv3.q4_0.bin` (30B parameters, 4bit quantization, GGML)
+Model: `TheBloke_Wizard-Vicuna-30B-Uncensored-GGML/Wizard-Vicuna-30B-Uncensored.q4_0.gguf` (30B parameters, 4bit quantization, GGML)

-Run command: `./main -m "path/to/model.bin" -p "-p "An extremely detailed description of the 10 best ethnic dishes will follow, with recipes: " -n 1000 [additional benchmark flags]`
+Run command: `./main -m "path/to/model.gguf" -p "An extremely detailed description of the 10 best ethnic dishes will follow, with recipes: " -n 1000 [additional benchmark flags]`

 Result:

--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -6,27 +6,6 @@ find_package(Threads REQUIRED)

 # ...

-# common
-
-set(TARGET common)
-
-add_library(${TARGET} OBJECT
-    common.h
-    common.cpp
-    console.h
-    console.cpp
-    grammar-parser.h
-    grammar-parser.cpp
-    )
-
-if (BUILD_SHARED_LIBS)
-    set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
-endif()
-
-target_include_directories(${TARGET} PUBLIC .)
-target_compile_features(${TARGET} PUBLIC cxx_std_11)
-target_link_libraries(${TARGET} PRIVATE llama)
-
 # examples

 include_directories(${CMAKE_CURRENT_SOURCE_DIR})
@@ -42,8 +21,10 @@ else()
    add_subdirectory(benchmark)
    add_subdirectory(baby-llama)
    add_subdirectory(train-text-from-scratch)
+    add_subdirectory(convert-llama2c-to-ggml)
    add_subdirectory(simple)
    add_subdirectory(embd-input)
+    add_subdirectory(llama-bench)
    if (LLAMA_METAL)
        add_subdirectory(metal)
    endif()
--- a/examples/convert-llama2c-to-ggml/CMakeLists.txt
+++ b/examples/convert-llama2c-to-ggml/CMakeLists.txt
@@ -0,0 +1,5 @@
+set(TARGET convert-llama2c-to-ggml)
+add_executable(${TARGET} convert-llama2c-to-ggml.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- a/examples/convert-llama2c-to-ggml/README.md
+++ b/examples/convert-llama2c-to-ggml/README.md
@@ -0,0 +1,26 @@
+## Convert llama2.c model to ggml
+
+This example reads weights from project [llama2.c](https://github.com/karpathy/llama2.c) and saves them in ggml compatible format. The vocab that is available in `models/ggml-vocab.bin` is used by default.
+
+To convert the model first download the models from the [llma2.c](https://github.com/karpathy/llama2.c) repository:
+
+`$ make -j`
+
+After successful compilation, following usage options are available:
+```
+usage: ./convert-llama2c-to-ggml [options]
+
+options:
+  -h, --help                       show this help message and exit
+  --copy-vocab-from-model FNAME    model path from which to copy vocab (default 'models/ggml-vocab.bin')
+  --llama2c-model FNAME            [REQUIRED] model path from which to load Karpathy's llama2.c model
+  --llama2c-output-model FNAME     model path to save the converted llama2.c model (default ak_llama_model.bin')
+```
+
+An example command is as follows:
+
+`$ ./convert-llama2c-to-ggml --copy-vocab-from-model <ggml-vocab.bin> --llama2c-model <llama2.c model path> --llama2c-output-model <ggml output model path>`
+
+Now you can use the model with command like:
+
+`$ ./main -m <ggml output model path> -p "One day, Lily met a Shoggoth" -n 500 -c 256 -eps 1e-5`
--- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
+++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
@@ -0,0 +1,827 @@
+#include "ggml.h"
+#include "llama.h"
+
+#include <unordered_map>
+#include <vector>
+#include <cassert>
+#include <climits>
+#include <cstring>
+#include <cstdarg>
+#include <ctime>
+#include <random>
+#include <stdexcept>
+#include <algorithm>
+#include <string>
+
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
+//////////////////////////////////////// llama2.c model structs and functions to load models, alloc memory etc.
+typedef struct {
+    int dim; // transformer dimension
+    int hidden_dim; // for ffn layers
+    int n_layers; // number of layers
+    int n_heads; // number of query heads
+    int n_kv_heads; // number of key/value heads (can be < query heads because of multiquery)
+    int vocab_size; // vocabulary size, usually 256 (byte-level)
+    int seq_len; // max sequence length
+} Config;
+
+typedef struct {
+    // token embedding table
+    float* token_embedding_table;    // (vocab_size, dim)
+    // weights for rmsnorms
+    float* rms_att_weight; // (layer, dim) rmsnorm weights
+    float* rms_ffn_weight; // (layer, dim)
+    // weights for matmuls
+    float* wq; // (layer, dim, dim)
+    float* wk; // (layer, dim, dim)
+    float* wv; // (layer, dim, dim)
+    float* wo; // (layer, dim, dim)
+    // weights for ffn
+    float* w1; // (layer, hidden_dim, dim)
+    float* w2; // (layer, dim, hidden_dim)
+    float* w3; // (layer, hidden_dim, dim)
+    // final rmsnorm
+    float* rms_final_weight; // (dim,)
+    // freq_cis for RoPE relatively positional embeddings
+    // float* freq_cis_real; // (seq_len, dim/2)
+    // float* freq_cis_imag; // (seq_len, dim/2)
+    // (optional) classifier weights for the logits, on the last layer
+    //float* wcls;
+} TransformerWeights;
+
+void malloc_weights(TransformerWeights* w, Config* p) {
+    // we calloc instead of malloc to keep valgrind happy
+    w->token_embedding_table = new float[p->vocab_size * p->dim]();
+    printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
+
+    w->rms_att_weight = new float[p->n_layers * p->dim]();
+    printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim);
+
+    w->rms_ffn_weight = new float[p->n_layers * p->dim]();
+    printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim);
+
+    w->wq = new float[p->n_layers * p->dim * p->dim]();
+    printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
+
+    w->wk = new float[p->n_layers * p->dim * p->dim]();
+    printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
+
+    w->wv = new float[p->n_layers * p->dim * p->dim]();
+    printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
+
+    w->wo = new float[p->n_layers * p->dim * p->dim]();
+    printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim);
+
+    w->w1 = new float[p->n_layers * p->hidden_dim * p->dim]();
+    printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
+
+    w->w2 = new float[p->n_layers * p->hidden_dim * p->dim]();
+    printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim);
+
+    w->w3 = new float[p->n_layers * p->hidden_dim * p->dim]();
+    printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim);
+
+    w->rms_final_weight = new float[p->dim]();
+    printf("[%s:AK] Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim);
+}
+
+int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f) {
+    if (fread(w->token_embedding_table, sizeof(float), p->vocab_size * p->dim, f) != static_cast<size_t>(p->vocab_size * p->dim)) return 1;
+    if (fread(w->rms_att_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim)) return 1;
+    if (fread(w->wq, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
+    if (fread(w->wk, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
+    if (fread(w->wv, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
+    if (fread(w->wo, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
+    if (fread(w->rms_ffn_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim)) return 1;
+    if (fread(w->w1, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->hidden_dim)) return 1;
+    if (fread(w->w2, sizeof(float), p->n_layers * p->hidden_dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->hidden_dim * p->dim)) return 1;
+    if (fread(w->w3, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->hidden_dim)) return 1;
+    if (fread(w->rms_final_weight, sizeof(float), p->dim, f) != static_cast<size_t>(p->dim)) return 1;
+    return 0;
+}
+
+void free_weights(TransformerWeights* w) {
+    delete w->token_embedding_table;
+    delete w->rms_att_weight;
+    delete w->rms_ffn_weight;
+    delete w->wq;
+    delete w->wk;
+    delete w->wv;
+    delete w->wo;
+    delete w->w1;
+    delete w->w2;
+    delete w->w3;
+    delete w->rms_final_weight;
+}
+
+void print_sample_weights(TransformerWeights *w){
+    printf("----- Quick print of first of the weight vales of all the variables\n");
+    printf("%f\n", w->token_embedding_table[0]);
+    printf("%f\n", w->rms_att_weight[0]);
+    printf("%f\n", w->rms_ffn_weight[0]);
+
+    printf("%f\n", w->wq[0]);
+    printf("%f\n", w->wk[0]);
+    printf("%f\n", w->wv[0]);
+    printf("%f\n", w->wo[0]);
+    printf("%f\n", w->w1[0]);
+    printf("%f\n", w->w2[0]);
+    printf("%f\n", w->w3[0]);
+    printf("%f\n", w->rms_att_weight[0]);
+}
+////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+//////////////////////////////////////// ggml structs and functions required to load models, configs and save the model.
+
+struct llama_vocab {
+    using id    = int32_t;
+    using token = std::string;
+    using ttype = llama_token_type;
+
+    struct token_data {
+        token text;
+        float score;
+        ttype type;
+    };
+
+    std::unordered_map<token, id> token_to_id;
+    std::vector<token_data> id_to_token;
+};
+
+struct my_llama_hparams {
+    uint32_t n_vocab = 32000;
+    uint32_t n_ctx   = 512;   // this is provided as user input?
+    uint32_t n_embd  = 4096;
+    uint32_t n_mult  = 4;
+    uint32_t n_head  = 32;
+    uint32_t n_layer = 32;
+    uint32_t n_rot   = 64;
+    bool operator!=(const my_llama_hparams& other) const {
+        return memcmp(this, &other, sizeof(my_llama_hparams));
+    }
+};
+
+struct my_llama_layer {
+    // normalization
+    struct ggml_tensor * attention_norm;
+
+    // attention
+    struct ggml_tensor * wq;
+    struct ggml_tensor * wk;
+    struct ggml_tensor * wv;
+    struct ggml_tensor * wo;
+
+    // normalization
+    struct ggml_tensor * ffn_norm;
+
+    // ff
+    struct ggml_tensor * w1;
+    struct ggml_tensor * w2;
+    struct ggml_tensor * w3;
+};
+
+struct my_llama_model {
+    struct ggml_context * ctx = NULL;
+
+    my_llama_hparams hparams;
+
+    struct ggml_tensor * tok_embeddings;
+
+    struct ggml_tensor * norm;
+    struct ggml_tensor * output;
+
+    std::vector<my_llama_layer> layers;
+
+    uint32_t train_its = 0;
+    uint32_t train_samples = 0;
+    uint32_t train_tokens = 0;
+};
+
+struct train_params {
+    const char * fn_vocab_model;
+    const char * fn_llama2c_model;
+    const char * fn_llama2c_output_model;
+    const char * fn_train_data;
+    const char * fn_checkpoint_in;
+    const char * fn_checkpoint_out;
+    const char * fn_model_out;
+
+    uint32_t seed;
+
+    int n_ctx;
+    int n_embd;
+    int n_mult;
+    int n_head;
+    int n_layer;
+    int n_rotmax;
+
+    int n_threads;
+    int n_batch;
+    int n_examples;
+    int n_predict;
+
+    int print_info_interval;
+    int print_details_interval;
+
+    bool samples_start_after_nl;
+    bool use_adam;
+    bool use_flash;
+    bool use_scratch;
+
+    // only adam
+    int   warmup;
+    int   cos_decay_steps;
+    float cos_decay_restart;
+    float cos_decay_alpha;
+
+    int   lbfgs_n_iter;
+    int   adam_n_iter;
+    float adam_alpha;
+    float adam_decay;
+
+    int mem_model_gb;
+    int mem_compute_gb;
+    int mem_compute0_gb;
+    int mem_compute1_gb;
+};
+
+uint32_t get_n_ff(const struct my_llama_hparams* hparams) {
+    const uint32_t n_ff = ((2*(4*hparams->n_embd)/3 + hparams->n_mult - 1)/hparams->n_mult)*hparams->n_mult;
+    return n_ff;
+}
+
+void print_params(struct my_llama_hparams * params) {
+    printf("%s: n_vocab: %d\n", __func__, params->n_vocab);
+    printf("%s: n_ctx:   %d\n", __func__, params->n_ctx);
+    printf("%s: n_embd:  %d\n", __func__, params->n_embd);
+    printf("%s: n_mult:  %d\n", __func__, params->n_mult);
+    printf("%s: n_head:  %d\n", __func__, params->n_head);
+    printf("%s: n_ff:    %d\n", __func__, get_n_ff(params));
+    printf("%s: n_layer: %d\n", __func__, params->n_layer);
+    printf("%s: n_rot:   %d\n", __func__, params->n_rot);
+}
+
+void init_model(struct my_llama_model * model) {
+    const auto & hparams = model->hparams;
+
+    const uint32_t n_embd  = hparams.n_embd;
+    const uint32_t n_layer = hparams.n_layer;
+    const uint32_t n_vocab = hparams.n_vocab;
+
+    const uint32_t n_ff = get_n_ff(&hparams);
+    struct ggml_context * ctx = model->ctx;
+
+    model->train_its = 0;
+    model->train_samples = 0;
+    model->train_tokens = 0;
+
+    model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
+    printf("[%s:GG] Allocating [%d] x [%d] = [%d] float space for model->tok_embeddings\n",__func__,n_embd , n_vocab, n_embd * n_vocab);
+
+    model->norm           = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
+    printf("[%s:GG] Allocating [%d] float space for model->norm\n",__func__,n_embd);
+
+    model->output         = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
+    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for model->output\n",__func__,n_embd, n_vocab, n_embd * n_vocab);
+
+    // printing the per-layer allocations here so we dont print in the for loop.
+    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wq for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
+    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wk for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
+    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wv for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
+    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wo for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
+
+    printf("[%s:GG] Allocating [%d] float space for layer.ffn_norm for [%d] layers\n",__func__,n_embd, n_layer);
+
+    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w1 for [%d] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer);
+    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w2 for [%d] layers\n",__func__, n_embd, n_ff, n_ff * n_embd, n_layer);
+    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w3 for [%d] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer);
+
+    ggml_set_name(model->tok_embeddings, "tok_embeddings.weight");
+    ggml_set_name(model->norm,           "norm.weight");
+    ggml_set_name(model->output,         "output.weight");
+
+    model->layers.resize(n_layer);
+    for (uint32_t i = 0; i < n_layer; ++i) {
+        auto & layer = model->layers[i];
+
+        std::string layers_i = "layers." + std::to_string(i);
+
+        layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
+
+        layer.wq = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
+        layer.wk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
+        layer.wv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
+        layer.wo = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
+
+        layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
+
+        layer.w1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
+        layer.w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd);
+        layer.w3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
+
+        ggml_set_name(layer.attention_norm, (layers_i + ".attention_norm.weight").c_str());
+
+        ggml_set_name(layer.wq, (layers_i + ".attention.wq.weight").c_str());
+        ggml_set_name(layer.wk, (layers_i + ".attention.wk.weight").c_str());
+        ggml_set_name(layer.wv, (layers_i + ".attention.wv.weight").c_str());
+        ggml_set_name(layer.wo, (layers_i + ".attention.wo.weight").c_str());
+
+        ggml_set_name(layer.ffn_norm, (layers_i + ".ffn_norm.weight").c_str());
+
+        ggml_format_name(layer.w1, "%s.feed_forward.w1.weight", layers_i.c_str());
+        ggml_format_name(layer.w2, "%s.feed_forward.w2.weight", layers_i.c_str());
+        ggml_format_name(layer.w3, "%s.feed_forward.w3.weight", layers_i.c_str());
+    }
+}
+
+float get_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
+    float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
+    return *ptr;
+}
+
+int32_t get_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
+    int32_t * ptr = (int32_t *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
+    return *ptr;
+}
+
+void print_row(struct ggml_tensor * probs, int i) {
+    for (int k = 0; k < probs->ne[0]; ++k) {
+        float p = get_f32_2d(probs, k, i);
+        printf(" %f", p);
+    }
+    printf("\n");
+}
+
+void print_matrix(struct ggml_tensor * probs) {
+    assert(probs->n_dims == 2);
+    for (int i = 0; i < probs->ne[1]; ++i) {
+        for (int k = 0; k < probs->ne[0]; ++k) {
+            float p = get_f32_2d(probs, k, i);
+            printf(" %.2f", p);
+        }
+        printf("\n");
+    }
+}
+
+#ifdef __GNUC__
+#ifdef __MINGW32__
+__attribute__((format(gnu_printf, 1, 2)))
+#else
+__attribute__((format(printf, 1, 2)))
+#endif
+#endif
+static std::string format(const char * fmt, ...) {
+    va_list ap, ap2;
+    va_start(ap, fmt);
+    va_copy(ap2, ap);
+    int size = vsnprintf(NULL, 0, fmt, ap);
+    GGML_ASSERT(size >= 0 && size < INT_MAX);
+    std::vector<char> buf(size + 1);
+    int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
+    GGML_ASSERT(size2 == size);
+    va_end(ap2);
+    va_end(ap);
+    return std::string(buf.data(), size);
+}
+
+struct llama_file {
+    // use FILE * so we don't have to re-open the file to mmap
+    FILE * fp;
+    size_t size;
+
+    llama_file(const char * fname, const char * mode) {
+        fp = std::fopen(fname, mode);
+        if (fp == NULL) {
+            size = 0;
+        } else {
+            seek(0, SEEK_END);
+            size = tell();
+            seek(0, SEEK_SET);
+        }
+    }
+
+    size_t tell() const {
+#ifdef _WIN32
+        __int64 ret = _ftelli64(fp);
+#else
+        long ret = std::ftell(fp);
+#endif
+        GGML_ASSERT(ret != -1); // this really shouldn't fail
+        return (size_t) ret;
+    }
+
+    void seek(size_t offset, int whence) {
+#ifdef _WIN32
+        int ret = _fseeki64(fp, (__int64) offset, whence);
+#else
+        int ret = std::fseek(fp, (long) offset, whence);
+#endif
+        GGML_ASSERT(ret == 0); // same
+    }
+
+    void read_raw(void * ptr, size_t size) {
+        if (size == 0) {
+            return;
+        }
+        errno = 0;
+        std::size_t ret = std::fread(ptr, size, 1, fp);
+        if (ferror(fp)) {
+            throw std::runtime_error(format("read error: %s", strerror(errno)));
+        }
+        if (ret != 1) {
+            throw std::runtime_error(std::string("unexpectedly reached end of file"));
+        }
+    }
+
+    std::uint32_t read_u32() {
+        std::uint32_t ret;
+        read_raw(&ret, sizeof(ret));
+        return ret;
+    }
+    std::float_t read_f32() {
+        std::float_t ret;
+        read_raw(&ret, sizeof(ret));
+        return ret;
+    }
+
+    std::string read_string(std::uint32_t len) {
+        std::vector<char> chars(len);
+        read_raw(chars.data(), len);
+        return std::string(chars.data(), len);
+    }
+
+    void write_raw(const void * ptr, size_t size) {
+        if (size == 0) {
+            return;
+        }
+        errno = 0;
+        size_t ret = std::fwrite(ptr, size, 1, fp);
+        if (ret != 1) {
+            throw std::runtime_error(format("write error: %s", strerror(errno)));
+        }
+    }
+
+    void write_u32(std::uint32_t val) {
+        write_raw(&val, sizeof(val));
+    }
+
+    ~llama_file() {
+        if (fp) {
+            std::fclose(fp);
+        }
+    }
+};
+
+void write_tensor(struct llama_file * file, struct ggml_tensor * tensor) {
+    if (tensor == NULL) {
+        file->write_u32(0);
+        file->write_u32(0);
+        file->write_u32(GGML_TYPE_F32);
+        file->seek((0-file->tell()) & 31, SEEK_CUR);
+        return;
+    }
+    const char * name = ggml_get_name(tensor);
+    uint32_t name_len = strlen(name);
+    uint32_t nd = tensor->n_dims;
+    uint32_t ne[4] = { (uint32_t)tensor->ne[0],
+                       (uint32_t)tensor->ne[1],
+                       (uint32_t)tensor->ne[2],
+                       (uint32_t)tensor->ne[3] };
+    file->write_u32(nd);
+    file->write_u32(name_len);
+    file->write_u32(tensor->type);
+    file->write_raw(ne, sizeof(ne[0]) * nd);
+    file->write_raw(name, name_len);
+    file->seek((0-file->tell()) & 31, SEEK_CUR);
+    file->write_raw(tensor->data, ggml_nbytes(tensor));
+}
+
+bool is_ggml_file(const char *filename) {
+    llama_file file(filename, "rb");
+    if (file.size < 4) {
+        return false;
+    }
+    uint32_t magic = file.read_u32();
+    return magic == GGUF_MAGIC;
+}
+
+void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) {
+    // heuristic to infer whether vocab is from ggml or from llama2.c vocabulary
+    if (is_ggml_file(filename)) {
+
+        struct llama_context_params llama_params = llama_context_default_params();
+        llama_params.vocab_only = true;
+
+        struct llama_model * lmodel = llama_load_model_from_file(filename, llama_params);
+        struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params);
+
+        const int n_vocab = llama_n_vocab(lctx);
+        vocab->id_to_token.resize(n_vocab);
+        for (int i=0; i<n_vocab; ++i) {
+            vocab->id_to_token[i].text  = llama_token_get_text(lctx, i);
+            vocab->id_to_token[i].score = llama_token_get_score(lctx, i);
+            vocab->id_to_token[i].type  = llama_token_get_type(lctx, i);
+            vocab->token_to_id.emplace(vocab->id_to_token[i].text, i);
+        }
+        llama_free(lctx);
+        llama_free_model(lmodel);
+    } else { // assume llama2.c vocabulary
+        printf("Assuming llama2.c vocabulary since %s is not a ggml file\n", filename);
+        llama_file file(filename, "rb");
+        const int  n_vocab = config->vocab_size;
+        /* uint32_t max_token_length =  */ file.read_u32(); // unused
+        vocab->id_to_token.resize(n_vocab);
+        for (int i=0; i<n_vocab; ++i) {
+            float_t score = file.read_f32();
+            uint32_t len = file.read_u32();
+            std::string text = file.read_string(len);
+            vocab->id_to_token[i].text = text;
+            vocab->id_to_token[i].score = score;
+            vocab->id_to_token[i].type = LLAMA_TOKEN_TYPE_UNDEFINED;
+            vocab->token_to_id.emplace(text, i);
+        }
+    }
+}
+
+void stuff_karpathy_weights_into_gg(struct ggml_tensor * gg_weights, float * karpathy_weights){
+    int ct;
+    switch (gg_weights->n_dims){
+        case 1:
+            ct = 0;
+            for (int i0 = 0; i0 < gg_weights->ne[0]; i0++){
+                float * ptr = (float *) ((char *) gg_weights->data + i0*gg_weights->nb[0]);
+                *ptr = karpathy_weights[ct];
+                ct++;
+            }
+            break;
+        case 2:
+            ct = 0;
+            for (int i1 = 0; i1 < gg_weights->ne[1]; i1++) {
+                for (int i0 = 0; i0 < gg_weights->ne[0]; i0++) {
+                    float * ptr = (float *) ((char *) gg_weights->data + i0*gg_weights->nb[0] + i1*gg_weights->nb[1]);
+                    *ptr = karpathy_weights[ct];
+                    ct++;
+                }
+            }
+            break;
+        case 3:
+            ct = 0;
+            for (int i2 = 0; i2 < gg_weights->ne[2]; i2++) {
+                for (int i1 = 0; i1 < gg_weights->ne[1]; i1++) {
+                    for (int i0 = 0; i0 < gg_weights->ne[0]; i0++) {
+                        float * ptr = (float *) ((char *) gg_weights->data + i0*gg_weights->nb[0] + i1*gg_weights->nb[1] + i2*gg_weights->nb[2]);
+                        *ptr = karpathy_weights[ct];
+                        ct++;
+                    }
+                }
+            }
+            break;
+    }
+}
+
+void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * model, TransformerWeights* w, const char * filename) {
+    struct llama_file file(filename, "wb");
+    if (file.fp == NULL) {
+        return;
+    }
+
+#pragma message("TODO: implement file saving using gguf")
+    (void) vocab;
+    (void) model;
+    (void) w;
+//    // write_magic
+//    file.write_u32(LLAMA_FILE_MAGIC);   // magic
+//    file.write_u32(LLAMA_FILE_VERSION); // version
+//    // write_hparams
+//    file.write_u32(model->hparams.n_vocab);
+//    file.write_u32(model->hparams.n_embd);
+//    file.write_u32(model->hparams.n_mult);
+//    file.write_u32(model->hparams.n_head);
+//    file.write_u32(model->hparams.n_layer);
+//    file.write_u32(model->hparams.n_rot);
+//    file.write_u32(LLAMA_FTYPE_ALL_F32);
+//
+//    // write_vocab - for now we are just writing the existing BPE voc. assuming karpathy's vocabulary is the same. idk.
+//    uint32_t n_vocab = model->hparams.n_vocab;
+//    for (uint32_t i = 0; i < n_vocab; i++) {
+//        const auto & token_data = vocab->id_to_token.at(i);
+//        file.write_u32((uint32_t) token_data.tok.size());
+//        file.write_raw(token_data.tok.data(), token_data.tok.size());
+//        file.write_raw(&token_data.score, sizeof(token_data.score));
+//    }
+//
+//    // stuff AK weights into GG weights one by one.
+//    // w->token_embedding_table -> model->tok_embeddings
+//    // float*                   -> struct ggml_tensor
+//    stuff_karpathy_weights_into_gg(model->tok_embeddings, w->token_embedding_table);
+//    stuff_karpathy_weights_into_gg(model->output, w->token_embedding_table);
+//
+//    stuff_karpathy_weights_into_gg(model->norm, w->rms_final_weight);
+//    //print_row(model->norm, 0);
+//
+//    // for rms-att-weight
+//    int row_length = model->hparams.n_embd;
+//    const auto & hparams = model->hparams;
+//    //int n_ff = model->hparams.n_embd;
+//    int n_ff = get_n_ff(&hparams);
+//
+//    for (uint32_t i = 0; i < model->hparams.n_layer; ++i){
+//        auto & layer = model->layers[i];
+//        // 1d
+//        stuff_karpathy_weights_into_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]);
+//        stuff_karpathy_weights_into_gg(layer.ffn_norm      , &w->rms_ffn_weight[i*row_length]);
+//
+//        // from 3d matrix layer x dim x dim to 2d matrix dim x dim
+//        stuff_karpathy_weights_into_gg(layer.wq            , &w->wq[i*row_length*row_length]);
+//        stuff_karpathy_weights_into_gg(layer.wk            , &w->wk[i*row_length*row_length]);
+//        stuff_karpathy_weights_into_gg(layer.wv            , &w->wv[i*row_length*row_length]);
+//        stuff_karpathy_weights_into_gg(layer.wo            , &w->wo[i*row_length*row_length]);
+//
+//        stuff_karpathy_weights_into_gg(layer.w1            , &w->w1[i*row_length*n_ff]);
+//        stuff_karpathy_weights_into_gg(layer.w2            , &w->w2[i*n_ff*row_length]);
+//        stuff_karpathy_weights_into_gg(layer.w3            , &w->w3[i*row_length*n_ff]);
+//    }
+//    // write tensors
+//    write_tensor(&file, model->tok_embeddings);
+//    write_tensor(&file, model->norm);
+//    write_tensor(&file, model->output); // ?
+//    for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
+//        auto & layer = model->layers[i];
+//
+//        write_tensor(&file, layer.attention_norm);
+//        write_tensor(&file, layer.wq);
+//        write_tensor(&file, layer.wk);
+//        write_tensor(&file, layer.wv);
+//        write_tensor(&file, layer.wo);
+//        write_tensor(&file, layer.ffn_norm);
+//        write_tensor(&file, layer.w1);
+//        write_tensor(&file, layer.w2);
+//        write_tensor(&file, layer.w3);
+//    }
+}
+
+struct train_params get_default_train_params() {
+    struct train_params params;
+    params.fn_vocab_model    = "models/ggml-vocab.bin";
+    params.fn_llama2c_output_model = "ak_llama_model.bin";
+    params.fn_train_data     = "shakespeare.txt";
+    params.fn_checkpoint_in  = "checkpoint.bin";
+    params.fn_checkpoint_out = "checkpoint.bin";
+    params.fn_model_out      = "ggml-checkpoint-f32.bin";
+
+    params.seed       =   -1;
+
+    params.n_ctx      =  128;
+    params.n_embd     =  256;
+    params.n_mult     =  256;
+    params.n_head     =    8;
+    params.n_layer    =   16;
+    params.n_rotmax   =   64;
+
+    params.n_threads  =    6;
+    params.n_batch    =    8;
+    params.n_examples =    8;
+    params.n_predict  = 1024;
+
+    params.print_info_interval    = 1;
+    params.print_details_interval = 2;
+
+    params.samples_start_after_nl = false;
+    params.use_adam               = true;
+    params.use_flash              = true;
+    params.use_scratch            = true;
+
+    // only adam
+    params.warmup            =  100;
+    params.cos_decay_steps   = 1000;
+    params.cos_decay_restart = 1.1f;
+    params.cos_decay_alpha   = 0.0f;
+
+    params.lbfgs_n_iter      = 16;
+    params.adam_n_iter       = 16;
+    params.adam_alpha        = 1e-3f;
+    params.adam_decay        = 1e-3f;
+
+    params.mem_model_gb   = 2;
+    params.mem_compute_gb = 24;
+    params.mem_compute0_gb = 8;
+    params.mem_compute1_gb = 2;
+
+    return params;
+}
+
+void print_usage(int /*argc*/, char ** argv, const struct train_params * params) {
+    fprintf(stderr, "usage: %s [options]\n", argv[0]);
+    fprintf(stderr, "\n");
+    fprintf(stderr, "options:\n");
+    fprintf(stderr, "  -h, --help                       show this help message and exit\n");
+    fprintf(stderr, "  --copy-vocab-from-model FNAME    llama2.c vocabulary or ggml model path from which to copy vocab (default '%s')\n", params->fn_vocab_model);
+    fprintf(stderr, "  --llama2c-model FNAME            [REQUIRED] model path from which to load Karpathy's llama2.c model\n");
+    fprintf(stderr, "  --llama2c-output-model FNAME     model path to save the converted llama2.c model (default %s')\n", params->fn_llama2c_output_model);
+    fprintf(stderr, "\n");
+}
+
+bool params_parse(int argc, char ** argv, struct train_params * params) {
+    bool invalid_param = false;
+    bool reqd_param_found = false;
+    std::string arg;
+    struct train_params default_params = get_default_train_params();
+    const std::string arg_prefix = "--";
+
+    for (int i = 1; i < argc; i++) {
+        arg = argv[i];
+        if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
+            std::replace(arg.begin(), arg.end(), '_', '-');
+        }
+
+        if (arg == "--copy-vocab-from-model") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params->fn_vocab_model = argv[i];
+        } else if (arg == "--llama2c-model") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            reqd_param_found = true;
+            params->fn_llama2c_model = argv[i];
+        } else if (arg == "--llama2c-output-model") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params->fn_llama2c_output_model = argv[i];
+        } else if (arg == "-h" || arg == "--help") {
+            print_usage(argc, argv, &default_params);
+            exit(0);
+        } else {
+            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
+            print_usage(argc, argv, &default_params);
+            exit(1);
+        }
+    }
+    if (invalid_param) {
+        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
+        print_usage(argc, argv, &default_params);
+        exit(1);
+    }
+    if (!reqd_param_found){
+        fprintf(stderr, "error: please specify a llama2.c .bin file to be converted with argument --llama2c-model\n");
+        print_usage(argc, argv, &default_params);
+        exit(1);
+    }
+
+    return true;
+}
+
+int main(int argc, char ** argv) {
+    struct train_params params = get_default_train_params();
+    if (!params_parse(argc, argv, &params)) {
+        return 1;
+    }
+    Config config;
+    TransformerWeights weights;
+    {
+        FILE *file = fopen(params.fn_llama2c_model, "rb");
+        if (!file) { printf("Unable to open the checkpoint file %s!\n", params.fn_llama2c_model); return 1; }
+        // read in the config header
+        if(fread(&config, sizeof(Config), 1, file) != 1) { return 1; }
+        // read in the Transformer weights
+        malloc_weights(&weights, &config);
+        if(checkpoint_init_weights(&weights, &config, file)) { return 1; }
+        fclose(file);
+    }
+
+    struct llama_vocab vocab;
+    load_vocab(params.fn_vocab_model, &config, &vocab);
+
+    struct my_llama_model model;
+    model.hparams.n_vocab = config.vocab_size; //llama_n_vocab(lctx);
+    model.hparams.n_ctx   = params.n_ctx;
+    model.hparams.n_embd  = config.dim; //params.n_embd;
+    model.hparams.n_mult  = 32;//params.n_mult;
+    model.hparams.n_head  = config.n_heads; //params.n_head;
+    model.hparams.n_layer = config.n_layers; //params.n_layer;
+    model.hparams.n_rot   = std::min((uint32_t)params.n_rotmax, model.hparams.n_embd / model.hparams.n_head);
+    print_params(&model.hparams);
+    struct ggml_init_params lcparams;
+    lcparams.mem_size   = 1024ll*1024ll*1024ll*((size_t) params.mem_model_gb);
+    lcparams.mem_buffer = NULL;
+    lcparams.no_alloc   = false;
+
+    model.ctx = ggml_init(lcparams);
+
+    init_model(&model);
+    save_as_llama_model(&vocab, &model, &weights, params.fn_llama2c_output_model);
+
+    printf("Saving llama.c model file %s in ggml format at %s\n", params.fn_llama2c_model, params.fn_llama2c_output_model);
+
+    ggml_free(model.ctx);
+    free_weights(&weights);
+    return 0;
+}
--- a/examples/embd-input/embd-input-lib.cpp
+++ b/examples/embd-input/embd-input-lib.cpp
@@ -167,7 +167,7 @@ llama_token sampling_id(struct MyModel* mymodel) {
        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };

        // TODO: Apply penalties
-        // float nl_logit = logits[llama_token_nl()];
+        // float nl_logit = logits[llama_token_nl(ctx)];
        // auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
        // llama_sample_repetition_penalty(ctx, &candidates_p,
        //      last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
@@ -176,7 +176,7 @@ llama_token sampling_id(struct MyModel* mymodel) {
        // last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
        // last_n_repeat, alpha_frequency, alpha_presence);
        // if (!penalize_nl) {
-        //     logits[llama_token_nl()] = nl_logit;
+        //     logits[llama_token_nl(ctx)] = nl_logit;
        // }

        if (temp <= 0) {
@@ -211,7 +211,7 @@ const char * sampling(struct MyModel * mymodel) {
    llama_context * ctx = mymodel->ctx;
    int id = sampling_id(mymodel);
    static std::string ret;
-    if (id == llama_token_eos()) {
+    if (id == llama_token_eos(ctx)) {
        ret = "</s>";
    } else {
        ret = llama_token_to_str(ctx, id);
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -67,28 +67,35 @@ int main(int argc, char ** argv) {
        fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
        fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
        for (int i = 0; i < (int) embd_inp.size(); i++) {
-            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]));
+            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]).c_str());
        }
        fprintf(stderr, "\n");
    }

-    if (params.embedding){
-        if (embd_inp.size() > 0) {
-            if (llama_eval(ctx, embd_inp.data(), embd_inp.size(), n_past, params.n_threads)) {
-                fprintf(stderr, "%s : failed to eval\n", __func__);
-                return 1;
-            }
-        }
-
-        const int n_embd = llama_n_embd(ctx);
-        const auto embeddings = llama_get_embeddings(ctx);
-
-        for (int i = 0; i < n_embd; i++) {
-            printf("%f ", embeddings[i]);
-        }
-        printf("\n");
+    if (embd_inp.size() > (size_t)params.n_ctx) {
+        fprintf(stderr, "%s: error: prompt is longer than the context window (%zu tokens, n_ctx = %d)\n",
+                __func__, embd_inp.size(), params.n_ctx);
+        return 1;
    }

+    while (!embd_inp.empty()) {
+        int n_tokens = std::min(params.n_batch, (int) embd_inp.size());
+        if (llama_eval(ctx, embd_inp.data(), n_tokens, n_past, params.n_threads)) {
+            fprintf(stderr, "%s : failed to eval\n", __func__);
+            return 1;
+        }
+        n_past += n_tokens;
+        embd_inp.erase(embd_inp.begin(), embd_inp.begin() + n_tokens);
+    }
+
+    const int n_embd = llama_n_embd(ctx);
+    const auto embeddings = llama_get_embeddings(ctx);
+
+    for (int i = 0; i < n_embd; i++) {
+        printf("%f ", embeddings[i]);
+    }
+    printf("\n");
+
    llama_print_timings(ctx);
    llama_free(ctx);
    llama_free_model(model);
--- a/examples/gguf/gguf.cpp
+++ b/examples/gguf/gguf.cpp
@@ -0,0 +1,246 @@
+#include "ggml.h"
+#include "llama.h"
+
+#include <cstdio>
+#include <cinttypes>
+#include <string>
+#include <sstream>
+#include <fstream>
+#include <vector>
+
+#undef MIN
+#undef MAX
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+
+template<typename T>
+static std::string to_string(const T & val) {
+    std::stringstream ss;
+    ss << val;
+    return ss.str();
+}
+
+bool gguf_ex_write(const std::string & fname) {
+    struct gguf_context * ctx = gguf_init_empty();
+
+    gguf_set_val_u8  (ctx, "some.parameter.uint8",    0x12);
+    gguf_set_val_i8  (ctx, "some.parameter.int8",    -0x13);
+    gguf_set_val_u16 (ctx, "some.parameter.uint16",   0x1234);
+    gguf_set_val_i16 (ctx, "some.parameter.int16",   -0x1235);
+    gguf_set_val_u32 (ctx, "some.parameter.uint32",   0x12345678);
+    gguf_set_val_i32 (ctx, "some.parameter.int32",   -0x12345679);
+    gguf_set_val_f32 (ctx, "some.parameter.float32",  0.123456789f);
+    gguf_set_val_bool(ctx, "some.parameter.bool",     true);
+    gguf_set_val_str (ctx, "some.parameter.string",   "hello world");
+
+    gguf_set_arr_data(ctx, "some.parameter.arr.i16", GGUF_TYPE_INT16,   std::vector<int16_t>{ 1, 2, 3, 4, }.data(), 4);
+    gguf_set_arr_data(ctx, "some.parameter.arr.f32", GGUF_TYPE_FLOAT32, std::vector<float>{ 3.145f, 2.718f, 1.414f, }.data(), 3);
+    gguf_set_arr_str (ctx, "some.parameter.arr.str",                    std::vector<const char *>{ "hello", "world", "!" }.data(), 3);
+
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ 128ull*1024ull*1024ull,
+        /*.mem_buffer =*/ NULL,
+        /*.no_alloc   =*/ false,
+    };
+
+    struct ggml_context * ctx_data = ggml_init(params);
+
+    const int n_tensors = 10;
+
+    // tensor infos
+    for (int i = 0; i < n_tensors; ++i) {
+        const std::string name = "tensor_" + to_string(i);
+
+        int64_t ne[GGML_MAX_DIMS] = { 1 };
+        int32_t n_dims = rand() % GGML_MAX_DIMS + 1;
+
+        for (int j = 0; j < n_dims; ++j) {
+            ne[j] = rand() % 10 + 1;
+        }
+
+        struct ggml_tensor * cur = ggml_new_tensor(ctx_data, GGML_TYPE_F32, n_dims, ne);
+        ggml_set_name(cur, name.c_str());
+
+        {
+            float * data = (float *) cur->data;
+            for (int j = 0; j < ggml_nelements(cur); ++j) {
+                data[j] = 100 + i;
+            }
+        }
+
+        gguf_add_tensor(ctx, cur);
+    }
+
+    gguf_write_to_file(ctx, fname.c_str(), false);
+
+    fprintf(stdout, "%s: wrote file '%s;\n", __func__, fname.c_str());
+
+    ggml_free(ctx_data);
+    gguf_free(ctx);
+
+    return true;
+}
+
+// just read tensor info
+bool gguf_ex_read_0(const std::string & fname) {
+    struct gguf_init_params params = {
+        /*.no_alloc = */ false,
+        /*.ctx      = */ NULL,
+    };
+
+    struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
+
+    fprintf(stdout, "%s: version:      %d\n", __func__, gguf_get_version(ctx));
+    fprintf(stdout, "%s: alignment:   %zu\n", __func__, gguf_get_alignment(ctx));
+    fprintf(stdout, "%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx));
+
+    // kv
+    {
+        const int n_kv = gguf_get_n_kv(ctx);
+
+        fprintf(stdout, "%s: n_kv: %d\n", __func__, n_kv);
+
+        for (int i = 0; i < n_kv; ++i) {
+            const char * key = gguf_get_key(ctx, i);
+
+            fprintf(stdout, "%s: kv[%d]: key = %s\n", __func__, i, key);
+        }
+    }
+
+    // find kv string
+    {
+        const char * findkey = "some.parameter.string";
+
+        const int keyidx = gguf_find_key(ctx, findkey);
+        if (keyidx == -1) {
+            fprintf(stdout, "%s: find key: %s not found.\n", __func__, findkey);
+        } else {
+            const char * key_value = gguf_get_val_str(ctx, keyidx);
+            fprintf(stdout, "%s: find key: %s found, kv[%d] value = %s\n", __func__, findkey, keyidx, key_value);
+        }
+    }
+
+    // tensor info
+    {
+        const int n_tensors = gguf_get_n_tensors(ctx);
+
+        fprintf(stdout, "%s: n_tensors: %d\n", __func__, n_tensors);
+
+        for (int i = 0; i < n_tensors; ++i) {
+            const char * name   = gguf_get_tensor_name  (ctx, i);
+            const size_t offset = gguf_get_tensor_offset(ctx, i);
+
+            fprintf(stdout, "%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
+        }
+    }
+
+    gguf_free(ctx);
+
+    return true;
+}
+
+// read and create ggml_context containing the tensors and their data
+bool gguf_ex_read_1(const std::string & fname) {
+    struct ggml_context * ctx_data = NULL;
+
+    struct gguf_init_params params = {
+        /*.no_alloc = */ false,
+        /*.ctx      = */ &ctx_data,
+    };
+
+    struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
+
+    fprintf(stdout, "%s: version:      %d\n", __func__, gguf_get_version(ctx));
+    fprintf(stdout, "%s: alignment:   %zu\n", __func__, gguf_get_alignment(ctx));
+    fprintf(stdout, "%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx));
+
+    // kv
+    {
+        const int n_kv = gguf_get_n_kv(ctx);
+
+        fprintf(stdout, "%s: n_kv: %d\n", __func__, n_kv);
+
+        for (int i = 0; i < n_kv; ++i) {
+            const char * key = gguf_get_key(ctx, i);
+
+            fprintf(stdout, "%s: kv[%d]: key = %s\n", __func__, i, key);
+        }
+    }
+
+    // tensor info
+    {
+        const int n_tensors = gguf_get_n_tensors(ctx);
+
+        fprintf(stdout, "%s: n_tensors: %d\n", __func__, n_tensors);
+
+        for (int i = 0; i < n_tensors; ++i) {
+            const char * name   = gguf_get_tensor_name  (ctx, i);
+            const size_t offset = gguf_get_tensor_offset(ctx, i);
+
+            fprintf(stdout, "%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
+        }
+    }
+
+    // data
+    {
+        const int n_tensors = gguf_get_n_tensors(ctx);
+
+        for (int i = 0; i < n_tensors; ++i) {
+            fprintf(stdout, "%s: reading tensor %d data\n", __func__, i);
+
+            const char * name = gguf_get_tensor_name(ctx, i);
+
+            struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
+
+            fprintf(stdout, "%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, cur->n_dims, cur->name, cur->data);
+
+            // print first 10 elements
+            const float * data = (const float *) cur->data;
+
+            printf("%s data[:10] : ", name);
+            for (int j = 0; j < MIN(10, ggml_nelements(cur)); ++j) {
+                printf("%f ", data[j]);
+            }
+            printf("\n\n");
+
+            // check data
+            {
+                const float * data = (const float *) cur->data;
+                for (int j = 0; j < ggml_nelements(cur); ++j) {
+                    if (data[j] != 100 + i) {
+                        fprintf(stderr, "%s: tensor[%d]: data[%d] = %f\n", __func__, i, j, data[j]);
+                        return false;
+                    }
+                }
+            }
+        }
+    }
+
+    fprintf(stdout, "%s: ctx_data size: %zu\n", __func__, ggml_get_mem_size(ctx_data));
+
+    ggml_free(ctx_data);
+    gguf_free(ctx);
+
+    return true;
+}
+
+int main(int argc, char ** argv) {
+    if (argc < 3) {
+        fprintf(stdout, "usage: %s data.gguf r|w\n", argv[0]);
+        return -1;
+    }
+
+    const std::string fname(argv[1]);
+    const std::string mode (argv[2]);
+
+    GGML_ASSERT((mode == "r" || mode == "w") && "mode must be r or w");
+
+    if (mode == "w") {
+        GGML_ASSERT(gguf_ex_write(fname) && "failed to write gguf file");
+    } else if (mode == "r") {
+        GGML_ASSERT(gguf_ex_read_0(fname) && "failed to read gguf file");
+        GGML_ASSERT(gguf_ex_read_1(fname) && "failed to read gguf file");
+    }
+
+    return 0;
+}
--- a/examples/gptneox-wip/cmpnct_gpt2bpe.hpp
+++ b/examples/gptneox-wip/cmpnct_gpt2bpe.hpp
--- a/examples/gptneox-wip/falcon-main.cpp
+++ b/examples/gptneox-wip/falcon-main.cpp
--- a/examples/gptneox-wip/gptneox-main.cpp
+++ b/examples/gptneox-wip/gptneox-main.cpp
--- a/examples/llama-bench/CMakeLists.txt
+++ b/examples/llama-bench/CMakeLists.txt
@@ -0,0 +1,8 @@
+set(TARGET llama-bench)
+add_executable(${TARGET} llama-bench.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
+if(TARGET BUILD_INFO)
+  add_dependencies(${TARGET} BUILD_INFO)
+endif()
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -0,0 +1,969 @@
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <chrono>
+#include <cinttypes>
+#include <cstring>
+#include <ctime>
+#include <iterator>
+#include <map>
+#include <numeric>
+#include <regex>
+#include <sstream>
+#include <stdio.h>
+#include <string>
+#include <vector>
+
+#include "ggml.h"
+#include "llama.h"
+#include "common.h"
+#include "build-info.h"
+#ifdef GGML_USE_CUBLAS
+#include "ggml-cuda.h"
+#endif
+
+// utils
+static uint64_t get_time_ns() {
+    using clock = std::chrono::high_resolution_clock;
+    return std::chrono::nanoseconds(clock::now().time_since_epoch()).count();
+}
+
+template<class T>
+static std::string join(const std::vector<T> & values, const std::string & delim) {
+    std::ostringstream str;
+    for (size_t i = 0; i < values.size(); i++) {
+        str << values[i];
+        if (i < values.size() - 1) {
+            str << delim;
+        }
+    }
+    return str.str();
+}
+
+template<class T>
+static std::vector<T> split(const std::string & str, char delim) {
+    std::vector<T> values;
+    std::istringstream str_stream(str);
+    std::string token;
+    while (std::getline(str_stream, token, delim)) {
+        T value;
+        std::istringstream token_stream(token);
+        token_stream >> value;
+        values.push_back(value);
+    }
+    return values;
+}
+
+template<typename T>
+static T avg(const std::vector<T> & v) {
+    if (v.empty()) {
+        return 0;
+    }
+    T sum = std::accumulate(v.begin(), v.end(), T(0));
+    return sum / (T)v.size();
+}
+
+template<typename T>
+static T stdev(const std::vector<T> & v) {
+    if (v.size() <= 1) {
+        return 0;
+    }
+    T mean = avg(v);
+    T sq_sum = std::inner_product(v.begin(), v.end(), v.begin(), T(0));
+    T stdev = std::sqrt(sq_sum / (T)(v.size() - 1) - mean * mean * (T)v.size() / (T)(v.size() - 1));
+    return stdev;
+}
+
+static bool ggml_cpu_has_metal() {
+#if defined(GGML_USE_METAL)
+    return true;
+#else
+    return false;
+#endif
+}
+
+static std::string get_cpu_info() {
+    std::string id;
+#ifdef __linux__
+    FILE * f = fopen("/proc/cpuinfo", "r");
+    if (f) {
+        char buf[1024];
+        while (fgets(buf, sizeof(buf), f)) {
+            if (strncmp(buf, "model name", 10) == 0) {
+                char * p = strchr(buf, ':');
+                if (p) {
+                    p++;
+                    while (std::isspace(*p)) {
+                        p++;
+                    }
+                    while (std::isspace(p[strlen(p) - 1])) {
+                        p[strlen(p) - 1] = '\0';
+                    }
+                    id = p;
+                    break;
+                }
+            }
+        }
+    }
+#endif
+    // TODO: other platforms
+    return id;
+}
+
+static std::string get_gpu_info() {
+    std::string id;
+#ifdef GGML_USE_CUBLAS
+    int count = ggml_cuda_get_device_count();
+    for (int i = 0; i < count; i++) {
+        char buf[128];
+        ggml_cuda_get_device_description(i, buf, sizeof(buf));
+        id += buf;
+        if (i < count - 1) {
+            id += "/";
+        }
+    }
+#endif
+    // TODO: other backends
+    return id;
+}
+
+// command line params
+enum output_formats {CSV, JSON, MARKDOWN, SQL};
+
+struct cmd_params {
+    std::vector<std::string> model;
+    std::vector<int> n_prompt;
+    std::vector<int> n_gen;
+    std::vector<int> n_batch;
+    std::vector<bool> f32_kv;
+    std::vector<int> n_threads;
+    std::vector<int> n_gpu_layers;
+    std::vector<int> main_gpu;
+    std::vector<bool> mul_mat_q;
+    std::vector<bool> low_vram;
+    std::vector<std::array<float, LLAMA_MAX_DEVICES>> tensor_split;
+    int reps;
+    bool verbose;
+    output_formats output_format;
+};
+
+static const cmd_params cmd_params_defaults = {
+    /* model         */ {"models/7B/ggml-model-q4_0.gguf"},
+    /* n_prompt      */ {512},
+    /* n_gen         */ {128},
+    /* n_batch       */ {512},
+    /* f32_kv        */ {false},
+    /* n_threads     */ {get_num_physical_cores()},
+    /* n_gpu_layers  */ {99},
+    /* main_gpu      */ {0},
+    /* mul_mat_q     */ {true},
+    /* low_vram      */ {false},
+    /* tensor_split  */ {{}},
+    /* reps          */ 5,
+    /* verbose       */ false,
+    /* output_format */ MARKDOWN
+};
+
+static void print_usage(int /* argc */, char ** argv) {
+    fprintf(stdout, "usage: %s [options]\n", argv[0]);
+    fprintf(stdout, "\n");
+    fprintf(stdout, "options:\n");
+    fprintf(stdout, "  -h, --help\n");
+    fprintf(stdout, "  -m, --model <filename>            (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
+    fprintf(stdout, "  -p, --n-prompt <n>                (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
+    fprintf(stdout, "  -n, --n-gen <n>                   (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
+    fprintf(stdout, "  -b, --batch-size <n>              (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
+    fprintf(stdout, "  --memory-f32 <0|1>                (default: %s)\n", join(cmd_params_defaults.f32_kv, ",").c_str());
+    fprintf(stdout, "  -t, --threads <n>                 (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
+    fprintf(stdout, "  -ngl N, --n-gpu-layers <n>        (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
+    fprintf(stdout, "  -mg i, --main-gpu <n>             (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
+    fprintf(stdout, "  -lv, --low-vram <0|1>             (default: %s)\n", join(cmd_params_defaults.low_vram, ",").c_str());
+    fprintf(stdout, "  -mmq, --mul-mat-q <0|1>           (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str());
+    fprintf(stdout, "  -ts, --tensor_split <ts0/ts1/..>               \n");
+    fprintf(stdout, "  -r, --repetitions <n>             (default: %d)\n", cmd_params_defaults.reps);
+    fprintf(stdout, "  -o, --output <csv|json|md|sql>    (default: %s)\n", cmd_params_defaults.output_format == CSV ? "csv" : cmd_params_defaults.output_format == JSON ? "json" : cmd_params_defaults.output_format == MARKDOWN ? "md" : "sql");
+    fprintf(stdout, "  -v, --verbose                     (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
+    fprintf(stdout, "\n");
+    fprintf(stdout, "Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");
+
+}
+
+static cmd_params parse_cmd_params(int argc, char ** argv) {
+    cmd_params params;
+    std::string arg;
+    bool invalid_param = false;
+    const std::string arg_prefix = "--";
+    const char split_delim = ',';
+
+    params.verbose = cmd_params_defaults.verbose;
+    params.output_format = cmd_params_defaults.output_format;
+    params.reps = cmd_params_defaults.reps;
+
+    for (int i = 1; i < argc; i++) {
+        arg = argv[i];
+        if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
+            std::replace(arg.begin(), arg.end(), '_', '-');
+        }
+
+        if (arg == "-h" || arg == "--help") {
+            print_usage(argc, argv);
+            exit(0);
+        } else if (arg == "-m" || arg == "--model") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = split<std::string>(argv[i], split_delim);
+            params.model.insert(params.model.end(), p.begin(), p.end());
+        } else if (arg == "-p" || arg == "--n-prompt") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = split<int>(argv[i], split_delim);
+            params.n_prompt.insert(params.n_prompt.end(), p.begin(), p.end());
+        } else if (arg == "-n" || arg == "--n-gen") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = split<int>(argv[i], split_delim);
+            params.n_gen.insert(params.n_gen.end(), p.begin(), p.end());
+        } else if (arg == "-b" || arg == "--batch-size") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = split<int>(argv[i], split_delim);
+            params.n_batch.insert(params.n_batch.end(), p.begin(), p.end());
+        } else if (arg == "--memory-f32") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = split<int>(argv[i], split_delim);
+            params.f32_kv.insert(params.f32_kv.end(), p.begin(), p.end());
+        } else if (arg == "-t" || arg == "--threads") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = split<int>(argv[i], split_delim);
+            params.n_threads.insert(params.n_threads.end(), p.begin(), p.end());
+        } else if (arg == "-ngl" || arg == "--n-gpu-layers") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = split<int>(argv[i], split_delim);
+            params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
+        } else if (arg == "-mg" || arg == "--main-gpu") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.main_gpu = split<int>(argv[i], split_delim);
+        } else if (arg == "-lv" || arg == "--low-vram") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = split<bool>(argv[i], split_delim);
+            params.low_vram.insert(params.low_vram.end(), p.begin(), p.end());
+        } else if (arg == "-mmq" || arg == "--mul-mat-q") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = split<bool>(argv[i], split_delim);
+            params.mul_mat_q.insert(params.mul_mat_q.end(), p.begin(), p.end());
+        } else if (arg == "-ts" || arg == "--tensor-split") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            for (auto ts : split<std::string>(argv[i], split_delim)) {
+                // split string by ; and /
+                const std::regex regex{R"([;/]+)"};
+                std::sregex_token_iterator it{ts.begin(), ts.end(), regex, -1};
+                std::vector<std::string> split_arg{it, {}};
+                GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);
+
+                std::array<float, LLAMA_MAX_DEVICES> tensor_split;
+                for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) {
+                    if (i < split_arg.size()) {
+                        tensor_split[i] = std::stof(split_arg[i]);
+                    } else {
+                        tensor_split[i] = 0.0f;
+                    }
+                }
+                params.tensor_split.push_back(tensor_split);
+            }
+        } else if (arg == "-r" || arg == "--repetitions") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.reps = std::stoi(argv[i]);
+        } else if (arg == "-o" || arg == "--output") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            if (argv[i] == std::string("csv")) {
+                params.output_format = CSV;
+            } else if (argv[i] == std::string("json")) {
+                params.output_format = JSON;
+            } else if (argv[i] == std::string("md")) {
+                params.output_format = MARKDOWN;
+            } else if (argv[i] == std::string("sql")) {
+                params.output_format = SQL;
+            } else {
+                invalid_param = true;
+                break;
+            }
+        } else if (arg == "-v" || arg == "--verbose") {
+            params.verbose = true;
+        } else {
+            invalid_param = true;
+            break;
+        }
+    }
+    if (invalid_param) {
+        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
+        print_usage(argc, argv);
+        exit(1);
+    }
+
+    // set defaults
+    if (params.model.empty())        { params.model = cmd_params_defaults.model; }
+    if (params.n_prompt.empty())     { params.n_prompt = cmd_params_defaults.n_prompt; }
+    if (params.n_gen.empty())        { params.n_gen = cmd_params_defaults.n_gen; }
+    if (params.n_batch.empty())      { params.n_batch = cmd_params_defaults.n_batch; }
+    if (params.f32_kv.empty())       { params.f32_kv = cmd_params_defaults.f32_kv; }
+    if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; }
+    if (params.main_gpu.empty())     { params.main_gpu = cmd_params_defaults.main_gpu; }
+    if (params.mul_mat_q.empty())    { params.mul_mat_q = cmd_params_defaults.mul_mat_q; }
+    if (params.low_vram.empty())     { params.low_vram = cmd_params_defaults.low_vram; }
+    if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; }
+    if (params.n_threads.empty())    { params.n_threads = cmd_params_defaults.n_threads; }
+
+    return params;
+}
+
+struct cmd_params_instance {
+    std::string model;
+    int n_prompt;
+    int n_gen;
+    int n_batch;
+    bool f32_kv;
+    int n_threads;
+    int n_gpu_layers;
+    int main_gpu;
+    bool mul_mat_q;
+    bool low_vram;
+    std::array<float, LLAMA_MAX_DEVICES> tensor_split;
+
+    llama_context_params to_llama_params() const {
+        llama_context_params lparams = llama_context_default_params();
+        lparams.n_ctx = n_prompt + n_gen;
+        lparams.n_batch = n_batch;
+        lparams.f16_kv = !f32_kv;
+        lparams.n_gpu_layers = n_gpu_layers;
+        lparams.main_gpu = main_gpu;
+        lparams.mul_mat_q = mul_mat_q;
+        lparams.low_vram = low_vram;
+        lparams.tensor_split = tensor_split.data();
+
+        return lparams;
+    }
+};
+
+static std::vector<cmd_params_instance> get_cmd_params_instances_int(const cmd_params & params, int n_gen, int n_prompt) {
+    std::vector<cmd_params_instance> instances;
+
+    for (const auto & m : params.model)
+    for (const auto & nb : params.n_batch)
+    for (const auto & fk : params.f32_kv)
+    for (const auto & nl : params.n_gpu_layers)
+    for (const auto & mg : params.main_gpu)
+    for (const auto & mmq : params.mul_mat_q)
+    for (const auto & lv : params.low_vram)
+    for (const auto & ts : params.tensor_split)
+    for (const auto & nt : params.n_threads) {
+        cmd_params_instance instance = {
+            /* .model        = */ m,
+            /* .n_prompt     = */ n_prompt,
+            /* .n_gen        = */ n_gen,
+            /* .n_batch      = */ nb,
+            /* .f32_kv       = */ fk,
+            /* .n_threads    = */ nt,
+            /* .n_gpu_layers = */ nl,
+            /* .main_gpu     = */ mg,
+            /* .mul_mat_q    = */ mmq,
+            /* .low_vram     = */ lv,
+            /* .tensor_split = */ ts,
+        };
+        instances.push_back(instance);
+    }
+    return instances;
+}
+
+static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_params & params) {
+    std::vector<cmd_params_instance> instances;
+
+    for (const auto & n_prompt : params.n_prompt) {
+        if (n_prompt == 0) {
+            continue;
+        }
+        auto instances_prompt = get_cmd_params_instances_int(params, 0, n_prompt);
+        instances.insert(instances.end(), instances_prompt.begin(), instances_prompt.end());
+    }
+
+    for (const auto & n_gen : params.n_gen) {
+        if (n_gen == 0) {
+            continue;
+        }
+        auto instances_gen = get_cmd_params_instances_int(params, n_gen, 0);
+        instances.insert(instances.end(), instances_gen.begin(), instances_gen.end());
+    }
+
+    return instances;
+}
+
+struct test {
+    static const std::string build_commit;
+    static const int build_number;
+    static const bool cuda;
+    static const bool opencl;
+    static const bool metal;
+    static const bool gpu_blas;
+    static const bool blas;
+    static const std::string cpu_info;
+    static const std::string gpu_info;
+    std::string model_filename;
+    std::string model_type;
+    int n_batch;
+    int n_threads;
+    bool f32_kv;
+    int n_gpu_layers;
+    int main_gpu;
+    bool mul_mat_q;
+    bool low_vram;
+    std::array<float, LLAMA_MAX_DEVICES> tensor_split;
+    int n_prompt;
+    int n_gen;
+    std::string test_time;
+    std::vector<uint64_t> samples_ns;
+
+    test(const cmd_params_instance & inst, const llama_model * lmodel, const llama_context * ctx) {
+        model_filename = inst.model;
+        char buf[128];
+        llama_model_type(lmodel, buf, sizeof(buf));
+        model_type = buf;
+        n_batch = inst.n_batch;
+        n_threads = inst.n_threads;
+        f32_kv = inst.f32_kv;
+        n_gpu_layers = inst.n_gpu_layers;
+        main_gpu = inst.main_gpu;
+        mul_mat_q = inst.mul_mat_q;
+        low_vram = inst.low_vram;
+        tensor_split = inst.tensor_split;
+        n_prompt = inst.n_prompt;
+        n_gen = inst.n_gen;
+        // RFC 3339 date-time format
+        time_t t = time(NULL);
+        std::strftime(buf, sizeof(buf), "%FT%TZ", gmtime(&t));
+        test_time = buf;
+
+        (void) ctx;
+    }
+
+    uint64_t avg_ns() const {
+        return ::avg(samples_ns);
+    }
+
+    uint64_t stdev_ns() const {
+        return ::stdev(samples_ns);
+    }
+
+    std::vector<double> get_ts() const {
+        int n_tokens = n_prompt + n_gen;
+        std::vector<double> ts;
+        std::transform(samples_ns.begin(), samples_ns.end(), std::back_inserter(ts), [n_tokens](uint64_t t) { return 1e9 * n_tokens / t; });
+        return ts;
+    }
+
+    double avg_ts() const {
+        return ::avg(get_ts());
+    }
+
+    double stdev_ts() const {
+        return ::stdev(get_ts());
+    }
+
+    static std::string get_backend() {
+        if (cuda) {
+            return "CUDA";
+        }
+        if (opencl) {
+            return "OpenCL";
+        }
+        if (metal) {
+            return "Metal";
+        }
+        if (gpu_blas) {
+            return "GPU BLAS";
+        }
+        if (blas) {
+            return "BLAS";
+        }
+        return "CPU";
+    }
+
+    static const std::vector<std::string> & get_fields() {
+        static const std::vector<std::string> fields = {
+            "build_commit", "build_number",
+            "cuda", "opencl", "metal", "gpu_blas", "blas",
+            "cpu_info", "gpu_info",
+            "model_filename", "model_type",
+            "n_batch", "n_threads", "f16_kv",
+            "n_gpu_layers", "main_gpu", "mul_mat_q", "low_vram", "tensor_split",
+            "n_prompt", "n_gen", "test_time",
+            "avg_ns", "stddev_ns",
+            "avg_ts", "stddev_ts"
+        };
+        return fields;
+    }
+
+    enum field_type {STRING, BOOL, INT, FLOAT};
+
+    static field_type get_field_type(const std::string & field) {
+        if (field == "build_number" || field == "n_batch" || field == "n_threads" ||
+            field == "n_gpu_layers" || field == "main_gpu" ||
+            field == "n_prompt" || field == "n_gen" ||
+            field == "avg_ns" || field == "stddev_ns") {
+            return INT;
+        }
+        if (field == "cuda" || field == "opencl" || field == "metal" || field == "gpu_blas" || field == "blas" ||
+            field == "f16_kv" || field == "mul_mat_q" || field == "low_vram") {
+            return BOOL;
+        }
+        if (field == "avg_ts" || field == "stddev_ts") {
+            return FLOAT;
+        }
+        return STRING;
+    }
+
+    std::vector<std::string> get_values() const {
+        std::string tensor_split_str;
+        int max_nonzero = 0;
+        for (int i = 0; i < LLAMA_MAX_DEVICES; i++) {
+            if (tensor_split[i] > 0) {
+                max_nonzero = i;
+            }
+        }
+        for (int i = 0; i <= max_nonzero; i++) {
+            char buf[32];
+            snprintf(buf, sizeof(buf), "%.2f", tensor_split[i]);
+            tensor_split_str += buf;
+            if (i < max_nonzero) {
+                tensor_split_str += "/";
+            }
+        }
+        std::vector<std::string> values = {
+            build_commit, std::to_string(build_number),
+            std::to_string(cuda), std::to_string(opencl), std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas),
+            cpu_info, gpu_info,
+            model_filename, model_type,
+            std::to_string(n_batch), std::to_string(n_threads), std::to_string(!f32_kv),
+            std::to_string(n_gpu_layers), std::to_string(main_gpu), std::to_string(mul_mat_q), std::to_string(low_vram), tensor_split_str,
+            std::to_string(n_prompt), std::to_string(n_gen), test_time,
+            std::to_string(avg_ns()), std::to_string(stdev_ns()),
+            std::to_string(avg_ts()), std::to_string(stdev_ts())
+        };
+        return values;
+    }
+
+    std::map<std::string, std::string> get_map() const {
+        std::map<std::string, std::string> map;
+        auto fields = get_fields();
+        auto values = get_values();
+        std::transform(fields.begin(), fields.end(), values.begin(),
+                std::inserter(map, map.end()), std::make_pair<const std::string &, const std::string &>);
+        return map;
+    }
+};
+
+const std::string test::build_commit = BUILD_COMMIT;
+const int         test::build_number = BUILD_NUMBER;
+const bool        test::cuda         = !!ggml_cpu_has_cublas();
+const bool        test::opencl       = !!ggml_cpu_has_clblast();
+const bool        test::metal        = !!ggml_cpu_has_metal();
+const bool        test::gpu_blas     = !!ggml_cpu_has_gpublas();
+const bool        test::blas         = !!ggml_cpu_has_blas();
+const std::string test::cpu_info     = get_cpu_info();
+const std::string test::gpu_info     = get_gpu_info();
+
+struct printer {
+    virtual ~printer() {}
+
+    FILE * fout;
+    virtual void print_header(const cmd_params & params) { (void) params; };
+    virtual void print_test(const test & t) = 0;
+    virtual void print_footer() { };
+};
+
+struct csv_printer : public printer {
+    static std::string escape_csv(const std::string & field) {
+        std::string escaped = "\"";
+        for (auto c : field) {
+            if (c == '"') {
+                escaped += "\"";
+            }
+            escaped += c;
+        }
+        escaped += "\"";
+        return escaped;
+    }
+
+    void print_header(const cmd_params & params) override  {
+        std::vector<std::string> fields = test::get_fields();
+        fprintf(fout, "%s\n", join(fields, ",").c_str());
+        (void) params;
+    }
+
+    void print_test(const test & t) override {
+        std::vector<std::string> values = t.get_values();
+        std::transform(values.begin(), values.end(), values.begin(), escape_csv);
+        fprintf(fout, "%s\n", join(values, ",").c_str());
+    }
+};
+
+struct json_printer : public printer {
+    bool first = true;
+
+    static std::string escape_json(const std::string & value) {
+        std::string escaped;
+        for (auto c : value) {
+            if (c == '"') {
+                escaped += "\\\"";
+            } else if (c == '\\') {
+                escaped += "\\\\";
+            } else  if (c <= 0x1f) {
+                char buf[8];
+                snprintf(buf, sizeof(buf), "\\u%04x", c);
+                escaped += buf;
+            } else {
+                escaped += c;
+            }
+        }
+        return escaped;
+    }
+
+    static std::string format_value(const std::string & field, const std::string & value) {
+        switch (test::get_field_type(field)) {
+            case test::STRING:
+                return "\"" + escape_json(value) + "\"";
+            case test::BOOL:
+                return value == "0" ? "false" : "true";
+            default:
+                return value;
+        }
+    }
+
+    void print_header(const cmd_params & params) override {
+        fprintf(fout, "[\n");
+        (void) params;
+    }
+
+    void print_fields(const std::vector<std::string> & fields, const std::vector<std::string> & values) {
+        assert(fields.size() == values.size());
+        for (size_t i = 0; i < fields.size(); i++) {
+            fprintf(fout, "    \"%s\": %s,\n", fields.at(i).c_str(), format_value(fields.at(i), values.at(i)).c_str());
+        }
+    }
+
+    void print_test(const test & t) override {
+        if (first) {
+            first = false;
+        } else {
+            fprintf(fout, ",\n");
+        }
+        fprintf(fout, "  {\n");
+        print_fields(test::get_fields(), t.get_values());
+        fprintf(fout, "    \"samples_ns\": [ %s ],\n", join(t.samples_ns, ", ").c_str());
+        fprintf(fout, "    \"samples_ts\": [ %s ]\n", join(t.get_ts(), ", ").c_str());
+        fprintf(fout, "  }");
+        fflush(fout);
+    }
+
+    void print_footer() override {
+        fprintf(fout, "\n]\n");
+    }
+};
+
+struct markdown_printer : public printer {
+    std::vector<std::string> fields;
+
+    static int get_field_width(const std::string & field) {
+        if (field == "model") {
+            return -30;
+        }
+        if (field == "t/s") {
+            return 15;
+        }
+        int width = std::max((int)field.length(), 10);
+
+        if (test::get_field_type(field) == test::STRING) {
+            return -width;
+        }
+        return width;
+    }
+
+    void print_header(const cmd_params & params) override {
+        // select fields to print
+        fields = { "model", "backend" };
+        bool is_cpu_backend = test::get_backend() == "CPU" || test::get_backend() == "BLAS";
+        if (!is_cpu_backend) {
+            fields.push_back("n_gpu_layers");
+        }
+        if (params.n_threads.size() > 1 || params.n_threads != cmd_params_defaults.n_threads || is_cpu_backend) {
+            fields.push_back("n_threads");
+        }
+        if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) {
+            fields.push_back("n_batch");
+        }
+        if (params.f32_kv.size() > 1 || params.f32_kv != cmd_params_defaults.f32_kv) {
+            fields.push_back("f16_kv");
+        }
+        if (params.main_gpu.size() > 1 || params.main_gpu != cmd_params_defaults.main_gpu) {
+            fields.push_back("main_gpu");
+        }
+        if (params.mul_mat_q.size() > 1 || params.mul_mat_q != cmd_params_defaults.mul_mat_q) {
+            fields.push_back("mul_mat_q");
+        }
+        if (params.low_vram.size() > 1 || params.low_vram != cmd_params_defaults.low_vram) {
+            fields.push_back("low_vram");
+        }
+        if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) {
+            fields.push_back("tensor_split");
+        }
+        fields.push_back("test");
+        fields.push_back("t/s");
+
+        fprintf(fout, "|");
+        for (const auto & field : fields) {
+            fprintf(fout, " %*s |", get_field_width(field), field.c_str());
+        }
+        fprintf(fout, "\n");
+        fprintf(fout, "|");
+        for (const auto & field : fields) {
+            int width = get_field_width(field);
+            fprintf(fout, " %s%s |", std::string(std::abs(width) - 1, '-').c_str(), width > 0 ? ":" : "-");
+        }
+        fprintf(fout, "\n");
+    }
+
+    void print_test(const test & t) override {
+        std::map<std::string, std::string> vmap = t.get_map();
+
+        fprintf(fout, "|");
+        for (const auto & field : fields) {
+            std::string value;
+            if (field == "model") {
+                value = t.model_type;
+            } else if (field == "backend") {
+                value = test::get_backend();
+            } else if (field == "test") {
+                char buf[128];
+                if (t.n_prompt > 0 && t.n_gen == 0) {
+                    snprintf(buf, sizeof(buf), "pp %d", t.n_prompt);
+                } else if (t.n_gen > 0 && t.n_prompt == 0) {
+                    snprintf(buf, sizeof(buf), "tg %d", t.n_gen);
+                } else {
+                    assert(false);
+                    exit(1);
+                }
+                value = buf;
+            } else if (field == "t/s") {
+                char buf[128];
+                snprintf(buf, sizeof(buf), "%.2f ± %.2f", t.avg_ts(), t.stdev_ts());
+                value = buf;
+            } else if (vmap.find(field) != vmap.end()) {
+                value = vmap.at(field);
+            } else {
+                assert(false);
+                exit(1);
+            }
+
+            int width = get_field_width(field);
+            if (field == "t/s") {
+                // HACK: the utf-8 character is 2 bytes
+                width += 1;
+            }
+            fprintf(fout, " %*s |", width, value.c_str());
+        }
+        fprintf(fout, "\n");
+    }
+
+    void print_footer() override {
+        fprintf(fout, "\nbuild: %s (%d)\n", test::build_commit.c_str(), test::build_number);
+    }
+};
+
+struct sql_printer : public printer {
+    static std::string get_sql_field_type(const std::string & field) {
+        switch (test::get_field_type(field)) {
+            case test::STRING:
+                return "TEXT";
+            case test::BOOL:
+            case test::INT:
+                return "INTEGER";
+            case test::FLOAT:
+                return "REAL";
+            default:
+                assert(false);
+                exit(1);
+        }
+    }
+
+    void print_header(const cmd_params & params) override {
+        std::vector<std::string> fields = test::get_fields();
+        fprintf(fout, "CREATE TABLE IF NOT EXISTS test (\n");
+        for (size_t i = 0; i < fields.size(); i++) {
+            fprintf(fout, "  %s %s%s\n", fields.at(i).c_str(), get_sql_field_type(fields.at(i)).c_str(),  i < fields.size() - 1 ? "," : "");
+        }
+        fprintf(fout, ");\n");
+        fprintf(fout, "\n");
+        (void) params;
+    }
+
+    void print_test(const test & t) override {
+        fprintf(fout, "INSERT INTO test (%s) ", join(test::get_fields(), ", ").c_str());
+        fprintf(fout, "VALUES (");
+        std::vector<std::string> values = t.get_values();
+        for (size_t i = 0; i < values.size(); i++) {
+            fprintf(fout, "'%s'%s", values.at(i).c_str(), i < values.size() - 1 ? ", " : "");
+        }
+        fprintf(fout, ");\n");
+    }
+};
+
+static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_batch, int n_threads) {
+    std::vector<llama_token> tokens(n_batch, llama_token_bos(ctx));
+    int n_processed = 0;
+    while (n_processed < n_prompt) {
+        int n_tokens = std::min(n_prompt - n_processed, n_batch);
+        llama_eval(ctx, tokens.data(), n_tokens, n_past + n_processed, n_threads);
+        n_processed += n_tokens;
+    }
+}
+
+static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads) {
+    llama_token token = llama_token_bos(ctx);
+    for (int i = 0; i < n_gen; i++) {
+        llama_eval(ctx, &token, 1, n_past + i, n_threads);
+    }
+}
+
+static void llama_null_log_callback(enum llama_log_level level, const char * text, void * user_data) {
+    (void) level;
+    (void) text;
+    (void) user_data;
+}
+
+int main(int argc, char ** argv) {
+#if !defined(NDEBUG)
+    fprintf(stderr, "warning: asserts enabled, performance may be affected\n");
+#endif
+
+#if (defined(_MSC_VER) && defined(_DEBUG)) || (!defined(_MSC_VER) && !defined(__OPTIMIZE__))
+    fprintf(stderr, "warning: debug build, performance may be affected\n");
+#endif
+
+#if defined(__SANITIZE_ADDRESS__) || defined(__SANITIZE_THREAD__)
+    fprintf(stderr, "warning: sanitizer enabled, performance may be affected\n");
+#endif
+
+    cmd_params params = parse_cmd_params(argc, argv);
+
+    // initialize llama.cpp
+    if (!params.verbose) {
+        llama_log_set(llama_null_log_callback, NULL);
+    }
+    bool numa = false;
+    llama_backend_init(numa);
+
+    // initialize printer
+    std::unique_ptr<printer> p;
+    switch (params.output_format) {
+        case CSV:
+            p.reset(new csv_printer());
+            break;
+        case JSON:
+            p.reset(new json_printer());
+            break;
+        case MARKDOWN:
+            p.reset(new markdown_printer());
+            break;
+        case SQL:
+            p.reset(new sql_printer());
+            break;
+        default:
+            assert(false);
+            exit(1);
+    }
+    p->fout = stdout;
+    p->print_header(params);
+
+    std::vector<cmd_params_instance> params_instances = get_cmd_params_instances(params);
+
+    for (const auto & inst : params_instances) {
+        // TODO: keep the model between tests when possible
+        llama_context_params lparams = inst.to_llama_params();
+
+        llama_model * lmodel  = llama_load_model_from_file(inst.model.c_str(), lparams);
+        if (lmodel == NULL) {
+            fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, inst.model.c_str());
+            return 1;
+        }
+
+        llama_context * ctx = llama_new_context_with_model(lmodel, lparams);
+        if (ctx == NULL) {
+            fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, inst.model.c_str());
+            llama_free_model(lmodel);
+            return 1;
+        }
+
+        test t(inst, lmodel, ctx);
+
+        // warmup run
+        test_gen(ctx, 1, 0, t.n_threads);
+
+        for (int i = 0; i < params.reps; i++) {
+            uint64_t t_start = get_time_ns();
+            if (t.n_prompt > 0) {
+                test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
+            }
+            if (t.n_gen > 0) {
+                test_gen(ctx, t.n_gen, t.n_prompt, t.n_threads);
+            }
+            uint64_t t_ns = get_time_ns() - t_start;
+            t.samples_ns.push_back(t_ns);
+        }
+
+        p->print_test(t);
+
+        llama_print_timings(ctx);
+
+        llama_free(ctx);
+        llama_free_model(lmodel);
+    }
+
+    p->print_footer();
+
+    llama_backend_free();
+
+    return 0;
+}
--- a/examples/main/README.md
+++ b/examples/main/README.md
@@ -160,9 +160,13 @@ The following options allow you to control the text generation process and fine-

 ### Number of Tokens to Predict

-   `-n N, --n-predict N`: Set the number of tokens to predict when generating text (default: 128, -1 = infinity).
+-   `-n N, --n-predict N`: Set the number of tokens to predict when generating text (default: 128, -1 = infinity, -2 = until context filled)

-The `--n-predict` option controls the number of tokens the model generates in response to the input prompt. By adjusting this value, you can influence the length of the generated text. A higher value will result in longer text, while a lower value will produce shorter text. A value of -1 will cause text to be generated without limit.
+The `--n-predict` option controls the number of tokens the model generates in response to the input prompt. By adjusting this value, you can influence the length of the generated text. A higher value will result in longer text, while a lower value will produce shorter text.
+
+A value of -1 will enable infinite text generation, even though we have a finite context window. When the context window is full, some of the earlier tokens (half of the tokens after `--n-keep`) will be discarded. The context must then be re-evaluated before generation can resume. On large models and/or large context windows, this will result in significant pause in output.
+
+If the pause is undesirable, a value of -2 will stop generation immediately when the context is filled.

 It is important to note that the generated text may be shorter than the specified number of tokens if an End-of-Sequence (EOS) token or a reverse prompt is encountered. In interactive mode text generation will pause and control will be returned to the user. In non-interactive mode, the program will end. In both cases, the text generation may stop before reaching the specified `n-predict` value. If you want the model to keep going without ever producing End-of-Sequence on its own, you can use the `--ignore-eos` parameter.

@@ -284,6 +288,10 @@ These options help improve the performance and memory usage of the LLaMA models.

 -   `--prompt-cache FNAME`: Specify a file to cache the model state after the initial prompt. This can significantly speed up the startup time when you're using longer prompts. The file is created during the first run and is reused and updated in subsequent runs. **Note**: Restoring a cached prompt does not imply restoring the exact state of the session at the point it was saved. So even when specifying a specific seed, you are not guaranteed to get the same sequence of tokens as the original generation.

+### Grammars
+
+-   `--grammar GRAMMAR`, `--grammar-file FILE`: Specify a grammar (defined inline or in a file) to constrain model output to a specific format. For example, you could force the model to output JSON or to speak only in emojis. See the [GBNF guide](../../grammars/README.md) for details on the syntax.
+
 ### Quantization

 For information about 4-bit quantization, which can significantly improve performance and reduce memory usage, please refer to llama.cpp's primary [README](../../README.md#prepare-data--run).
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -36,10 +36,6 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif

-#if defined(GGML_USE_KOMPUTE)
-#include "ggml-vulkan.h"
-#endif
-
 static llama_context ** g_ctx;
 static bool is_interacting = false;

@@ -122,10 +118,6 @@ int main(int argc, char ** argv) {
    llama_context * ctx_guidance = NULL;
    g_ctx = &ctx;

-#if defined(GGML_USE_KOMPUTE)
-    ggml_vk_init_device(0, "gpu");
-#endif
-
    // load the model and apply lora adapter, if any
    std::tie(model, ctx) = llama_init_from_gpt_params(params);
    if (params.cfg_scale > 1.f) {
@@ -151,7 +143,7 @@ int main(int argc, char ** argv) {
        {
            fprintf(stderr, "%s: testing memory usage for n_batch = %d, n_ctx = %d\n", __func__, params.n_batch, params.n_ctx);

-            const std::vector<llama_token> tmp(params.n_batch, llama_token_bos());
+            const std::vector<llama_token> tmp(params.n_batch, llama_token_bos(ctx));
            llama_eval(ctx, tmp.data(), tmp.size(), params.n_ctx, params.n_threads);
        }

@@ -199,10 +191,6 @@ int main(int argc, char ** argv) {

    // tokenize the prompt
    std::vector<llama_token> embd_inp;
-
-    // Add a space in front of the first character to match OG llama tokenizer behavior
-    params.prompt.insert(0, 1, ' ');
-
    if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
        embd_inp = ::llama_tokenize(ctx, params.prompt, true);
    } else {
@@ -278,15 +266,12 @@ int main(int argc, char ** argv) {
        params.interactive = true;
    }

-    // determine newline token
-    auto llama_token_newline = ::llama_tokenize(ctx, "\n", false);
-
    if (params.verbose_prompt) {
        fprintf(stderr, "\n");
        fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
        fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
        for (int i = 0; i < (int) embd_inp.size(); i++) {
-            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]));
+            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]).c_str());
        }

        if (ctx_guidance) {
@@ -294,14 +279,14 @@ int main(int argc, char ** argv) {
            fprintf(stderr, "%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str());
            fprintf(stderr, "%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
            for (int i = 0; i < (int) guidance_inp.size(); i++) {
-                fprintf(stderr, "%6d -> '%s'\n", guidance_inp[i], llama_token_to_str(ctx, guidance_inp[i]));
+                fprintf(stderr, "%6d -> '%s'\n", guidance_inp[i], llama_token_to_str(ctx, guidance_inp[i]).c_str());
            }
        }

        if (params.n_keep > 0) {
        fprintf(stderr, "%s: static prompt based on n_keep: '", __func__);
            for (int i = 0; i < params.n_keep; i++) {
-                fprintf(stderr, "%s", llama_token_to_str(ctx, embd_inp[i]));
+                fprintf(stderr, "%s", llama_token_to_str(ctx, embd_inp[i]).c_str());
            }
            fprintf(stderr, "'\n");
        }
@@ -319,7 +304,7 @@ int main(int argc, char ** argv) {
        auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
            return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false;
        };
-        SetConsoleCtrlHandler(static_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
+        SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
 #endif

        fprintf(stderr, "%s: interactive mode on.\n", __func__);
@@ -360,10 +345,9 @@ int main(int argc, char ** argv) {
        fprintf(stderr, "\n");

        {
-            auto it = params.logit_bias.find(llama_token_eos());
+            auto it = params.logit_bias.find(llama_token_eos(ctx));
            if (it != params.logit_bias.end() && it->second == -INFINITY) {
-                fprintf(stderr,
-                    "%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
+                fprintf(stderr, "%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
            }
        }

@@ -413,7 +397,7 @@ int main(int argc, char ** argv) {

    // do one empty run to warm up the model
    {
-        const std::vector<llama_token> tmp = { llama_token_bos(), };
+        const std::vector<llama_token> tmp = { llama_token_bos(ctx), };
        llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads);
        llama_reset_timings(ctx);
    }
@@ -439,8 +423,12 @@ int main(int argc, char ** argv) {
            // - take the n_keep first tokens from the original prompt (via n_past)
            // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
            if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) > n_ctx) {
-                const int n_left = n_past - params.n_keep;
+                if (params.n_predict == -2) {
+                    fprintf(stderr, "\n\n%s: context full, stopping generation\n", __func__);
+                    break;
+                }

+                const int n_left = n_past - params.n_keep;
                // always keep the first token - BOS
                n_past = std::max(1, params.n_keep);
                n_past_guidance = std::max(1, params.n_keep + guidance_offset);
@@ -593,7 +581,7 @@ int main(int argc, char ** argv) {
                }

                // Apply penalties
-                float nl_logit = logits[llama_token_nl()];
+                float nl_logit = logits[llama_token_nl(ctx)];
                auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
                llama_sample_repetition_penalty(ctx, &candidates_p,
                    last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
@@ -602,7 +590,7 @@ int main(int argc, char ** argv) {
                    last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
                    last_n_repeat, alpha_frequency, alpha_presence);
                if (!penalize_nl) {
-                    logits[llama_token_nl()] = nl_logit;
+                    logits[llama_token_nl(ctx)] = nl_logit;
                }

                if (grammar != NULL) {
@@ -666,7 +654,7 @@ int main(int argc, char ** argv) {
        // display text
        if (input_echo) {
            for (auto id : embd) {
-                printf("%s", llama_token_to_str(ctx, id));
+                printf("%s", llama_token_to_str(ctx, id).c_str());
            }
            fflush(stdout);
        }
@@ -708,7 +696,7 @@ int main(int argc, char ** argv) {
            }

            // deal with end of text token in interactive mode
-            if (last_n_tokens.back() == llama_token_eos()) {
+            if (last_n_tokens.back() == llama_token_eos(ctx)) {
                if (params.interactive) {
                    if (params.antiprompt.size() != 0) {
                        // tokenize and inject first reverse prompt
@@ -732,7 +720,7 @@ int main(int argc, char ** argv) {
                }

                if (params.input_prefix_bos) {
-                    embd_inp.push_back(llama_token_bos());
+                    embd_inp.push_back(llama_token_bos(ctx));
                }

                std::string buffer;
@@ -786,8 +774,7 @@ int main(int argc, char ** argv) {
                    if (grammar != NULL) {
                        llama_grammar_free(grammar);

-                        std::vector<const llama_grammar_element *> grammar_rules(
-                            parsed_grammar.c_rules());
+                        std::vector<const llama_grammar_element *> grammar_rules( parsed_grammar.c_rules());
                        grammar = llama_grammar_init(
                            grammar_rules.data(), grammar_rules.size(),
                            parsed_grammar.symbol_ids.at("root"));
@@ -798,7 +785,7 @@ int main(int argc, char ** argv) {
        }

        // end of text token
-        if (!embd.empty() && embd.back() == llama_token_eos() && !(params.instruct || params.interactive)) {
+        if (!embd.empty() && embd.back() == llama_token_eos(ctx) && !(params.instruct || params.interactive)) {
            fprintf(stderr, " [end of text]\n");
            break;
        }
--- a/examples/metal/metal.cpp
+++ b/examples/metal/metal.cpp
@@ -2,7 +2,7 @@
 //
 // - First, export a LLaMA graph:
 //
-//  $ ./bin/main -m ../models/7B/ggml-model-q4_0.bin --export
+//  $ ./bin/main -m ../models/7B/ggml-model-q4_0.gguf --export
 //
 // - Run this tool to evaluate the exported graph:
 //
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -5,6 +5,7 @@
 #include <cmath>
 #include <ctime>
 #include <sstream>
+#include <cstring>

 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -26,7 +27,121 @@ std::vector<float> softmax(const std::vector<float>& logits) {
    return probs;
 }

+void perplexity_v2(llama_context * ctx, const gpt_params & params) {
+
+    // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
+    // Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
+    // Output: `perplexity: 13.5106 [114/114]`
+    // BOS tokens will be added for each chunk before eval
+
+    if (params.ppl_stride <= 0) {
+        fprintf(stderr, "%s: stride is %d but must be greater than zero!\n",__func__,params.ppl_stride);
+        return;
+    }
+    auto tokens = ::llama_tokenize(ctx, params.prompt, true);
+
+    const int calc_chunk = params.n_ctx;
+
+    fprintf(stderr, "%s: have %zu tokens. Calculation chunk = %d\n", __func__, tokens.size(), calc_chunk);
+
+    if (int(tokens.size()) <= calc_chunk) {
+        fprintf(stderr, "%s: there are only %zu tokens, this is not enough for a context size of %d and stride %d\n",__func__,
+                tokens.size(), params.n_ctx, params.ppl_stride);
+        return;
+    }
+
+    const int n_chunk_max = (tokens.size() - calc_chunk + params.ppl_stride - 1)  / params.ppl_stride;
+
+    const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
+    const int n_vocab = llama_n_vocab(ctx);
+    const int n_batch = params.n_batch;
+
+    int count = 0;
+    double nll = 0.0;
+
+    fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);
+
+    for (int i = 0; i < n_chunk; ++i) {
+        const int start =     i * params.ppl_stride;
+        const int end   = start + calc_chunk;
+
+        const int num_batches = (calc_chunk + n_batch - 1) / n_batch;
+        //fprintf(stderr, "%s: evaluating %d...%d using %d batches\n", __func__, start, end, num_batches);
+
+        std::vector<float> logits;
+
+        const auto t_start = std::chrono::high_resolution_clock::now();
+
+        for (int j = 0; j < num_batches; ++j) {
+            const int batch_start = start + j * n_batch;
+            const int batch_size  = std::min(end - batch_start, n_batch);
+
+            //fprintf(stderr, "    Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch);
+            if (llama_eval(ctx, tokens.data() + batch_start, batch_size, j * n_batch, params.n_threads)) {
+                //fprintf(stderr, "%s : failed to eval\n", __func__);
+                return;
+            }
+
+            // save original token and restore it after eval
+            const auto token_org = tokens[batch_start];
+
+            // add BOS token for the first batch of each chunk
+            if (j == 0) {
+                tokens[batch_start] = llama_token_bos(ctx);
+            }
+
+            const auto batch_logits = llama_get_logits(ctx);
+            logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
+
+            if (j == 0) {
+                tokens[batch_start] = token_org;
+            }
+        }
+
+        const auto t_end = std::chrono::high_resolution_clock::now();
+
+        if (i == 0) {
+            const float t_total = std::chrono::duration<float>(t_end - t_start).count();
+            fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
+            int total_seconds = (int)(t_total * n_chunk);
+            if (total_seconds >= 60*60) {
+                fprintf(stderr, "%d hours ", total_seconds / (60*60));
+                total_seconds = total_seconds % (60*60);
+            }
+            fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
+        }
+
+        //fprintf(stderr, "%s: using tokens %d...%d\n",__func__,params.n_ctx - params.ppl_stride + start, params.n_ctx + start);
+        for (int j = params.n_ctx - params.ppl_stride - 1; j < params.n_ctx - 1; ++j) {
+
+            // Calculate probability of next token, given the previous ones.
+            const std::vector<float> tok_logits(
+                logits.begin() + (j + 0) * n_vocab,
+                logits.begin() + (j + 1) * n_vocab);
+
+            const float prob = softmax(tok_logits)[tokens[start + j + 1]];
+
+            nll += -std::log(prob);
+            ++count;
+        }
+        // perplexity is e^(average negative log-likelihood)
+        if (params.ppl_output_type == 0) {
+            printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
+        } else {
+            printf("%8d  %.4lf\n", i*params.ppl_stride, std::exp(nll / count));
+        }
+        fflush(stdout);
+    }
+    printf("\n");
+}
+
 void perplexity(llama_context * ctx, const gpt_params & params) {
+
+    if (params.ppl_stride > 0) {
+        perplexity_v2(ctx, params);
+        return;
+    }
+
    // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
    // Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
    // Output: `perplexity: 13.5106 [114/114]`
@@ -63,7 +178,7 @@ void perplexity(llama_context * ctx, const gpt_params & params) {

            // add BOS token for the first batch of each chunk
            if (j == 0) {
-                tokens[batch_start] = llama_token_bos();
+                tokens[batch_start] = llama_token_bos(ctx);
            }

            if (llama_eval(ctx, tokens.data() + batch_start, batch_size, j * n_batch, params.n_threads)) {
@@ -88,7 +203,7 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
                fprintf(stderr, "%d hours ", total_seconds / (60*60));
                total_seconds = total_seconds % (60*60);
            }
-            fprintf(stderr, "%d minutes\n", total_seconds / 60);
+            fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
        }

        // We get the logits for all the tokens in the context window (params.n_ctx)
@@ -115,12 +230,37 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
            ++count;
        }
        // perplexity is e^(average negative log-likelihood)
-        printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
+        if (params.ppl_output_type == 0) {
+            printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
+        } else {
+            printf("%8d  %.4lf\n", i*params.n_ctx, std::exp(nll / count));
+        }
        fflush(stdout);
    }
    printf("\n");
 }

+std::vector<float> hellaswag_evaluate_tokens(llama_context * ctx, const std::vector<int>& tokens, int n_past, int n_batch,
+        int n_vocab, int n_thread) {
+    std::vector<float> result;
+    result.reserve(tokens.size() * n_vocab);
+    size_t n_chunk = (tokens.size() + n_batch - 1)/n_batch;
+    for (size_t i_chunk = 0; i_chunk < n_chunk; ++i_chunk) {
+        size_t n_tokens = tokens.size() - i_chunk * n_batch;
+        n_tokens = std::min(n_tokens, size_t(n_batch));
+        if (llama_eval(ctx, tokens.data() + i_chunk * n_batch, n_tokens, n_past, n_thread)) {
+            fprintf(stderr, "%s : failed to eval\n", __func__);
+            return {};
+        }
+
+        const auto logits = llama_get_logits(ctx);
+        result.insert(result.end(), logits, logits + n_tokens * n_vocab);
+
+        n_past += n_tokens;
+    }
+    return result;
+}
+
 void hellaswag_score(llama_context * ctx, const gpt_params & params) {
    // Calculates hellaswag score (acc_norm) from prompt
    //
@@ -209,50 +349,93 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
    double acc = 0.0f;
    const int n_vocab = llama_n_vocab(ctx);

+    std::vector<float> tok_logits(n_vocab);
+
    for (size_t task_idx = 0; task_idx < hs_task_count; task_idx++) {

        // Tokenize the context to count tokens
        std::vector<int> context_embd = ::llama_tokenize(ctx, hs_data[task_idx].context, prepend_bos);
        size_t context_size = context_embd.size();

-        for (size_t ending_idx=0;ending_idx<4;ending_idx++) {
+        // Do the 1st ending
+        // In this case we include the context when evaluating
+        auto query_embd = ::llama_tokenize(ctx, hs_data[task_idx].context + hs_data[task_idx].ending[0], prepend_bos);
+        auto query_size = query_embd.size();
+        //printf("First query: %d\n",(int)query_size);
+
+        // Stop if query wont fit the ctx window
+        if (query_size > (size_t)params.n_ctx) {
+            fprintf(stderr, "%s : number of tokens in query %zu > n_ctxl\n", __func__, query_size);
+            return;
+        }
+
+        // Speedup small evaluations by evaluating atleast 32 tokens
+        if (query_size < 32) {
+            query_embd.resize(32);
+        }
+
+        auto logits = hellaswag_evaluate_tokens(ctx, query_embd, 0, params.n_batch, n_vocab, params.n_threads);
+        if (logits.empty()) {
+            fprintf(stderr, "%s : failed to eval\n", __func__);
+            return;
+        }
+
+        std::memcpy(tok_logits.data(), logits.data() + (context_size-1)*n_vocab, n_vocab*sizeof(float));
+        const auto first_probs = softmax(tok_logits);
+
+        hs_data[task_idx].ending_logprob_count[0] = 1;
+        hs_data[task_idx].ending_logprob[0] = std::log(first_probs[query_embd[context_size]]);
+
+        // Calculate the logprobs over the ending
+        for (size_t j = context_size; j < query_size - 1; j++) {
+
+            std::memcpy(tok_logits.data(), logits.data() + j*n_vocab, n_vocab*sizeof(float));
+
+            const float prob = softmax(tok_logits)[query_embd[j + 1]];
+
+            hs_data[task_idx].ending_logprob[0] += std::log(prob);
+            hs_data[task_idx].ending_logprob_count[0]++;
+        }
+
+        // Calculate the mean token logprob for acc_norm
+        hs_data[task_idx].ending_logprob[0] /= hs_data[task_idx].ending_logprob_count[0];
+
+        // Do the remaining endings
+        // For these, we use the bare ending with n_past = context_size
+        //
+        for (size_t ending_idx = 1; ending_idx < 4; ending_idx++) {

            // Tokenize the query
-            std::vector<int> query_embd = ::llama_tokenize(ctx, hs_data[task_idx].context + hs_data[task_idx].ending[ending_idx], prepend_bos);
-            size_t query_size = query_embd.size();
+            query_embd = ::llama_tokenize(ctx, hs_data[task_idx].ending[ending_idx], false);
+            query_size = query_embd.size();

            // Stop if query wont fit the ctx window
-            if (query_size > (size_t)params.n_ctx) {
+            if (context_size + query_size > (size_t)params.n_ctx) {
                fprintf(stderr, "%s : number of tokens in query %zu > n_ctxl\n", __func__, query_size);
                return;
            }

            // Speedup small evaluations by evaluating atleast 32 tokens
-            if (query_size < 32) {
-                query_embd.resize(32);
-            }
+            // No, resizing to 32 is actually slightly slower (at least on CUDA)
+            //if (query_size < 32) {
+            //    query_embd.resize(32);
+            //}

            // Evaluate the query
-            if (llama_eval(ctx, query_embd.data(), query_embd.size(), 0, params.n_threads)) {
+            logits = hellaswag_evaluate_tokens(ctx, query_embd, context_size, params.n_batch, n_vocab, params.n_threads);
+            if (logits.empty()) {
                fprintf(stderr, "%s : failed to eval\n", __func__);
                return;
            }

-            const auto query_logits = llama_get_logits(ctx);
-            std::vector<float> logits;
-            logits.insert(logits.end(), query_logits, query_logits + query_size * n_vocab);
-
-            hs_data[task_idx].ending_logprob_count[ending_idx] = 0;
-            hs_data[task_idx].ending_logprob[ending_idx] = 0.0f;
+            hs_data[task_idx].ending_logprob_count[ending_idx] = 1;
+            hs_data[task_idx].ending_logprob[ending_idx] = std::log(first_probs[query_embd[0]]);

            // Calculate the logprobs over the ending
-            for (size_t j = context_size-1; j < query_size - 1; j++) {
-                // Calculate probability of next token, given the previous ones.
-                const std::vector<float> tok_logits(
-                    logits.begin() + (j + 0) * n_vocab,
-                    logits.begin() + (j + 1) * n_vocab);
+            for (size_t j = 0; j < query_size - 1; j++) {
+                std::memcpy(tok_logits.data(), logits.data() + j*n_vocab, n_vocab*sizeof(float));

-                const float prob = softmax(tok_logits)[query_embd[ j + 1]];
+                const float prob = softmax(tok_logits)[query_embd[j + 1]];

                hs_data[task_idx].ending_logprob[ending_idx] += std::log(prob);
                hs_data[task_idx].ending_logprob_count[ending_idx]++;
@@ -267,9 +450,9 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
        }

        // Find the ending with maximum logprob
-        size_t ending_logprob_max_idx = -1;
-        double ending_logprob_max_val = -INFINITY;
-        for (size_t j=0; j < 4; j++) {
+        size_t ending_logprob_max_idx = 0;
+        double ending_logprob_max_val = hs_data[task_idx].ending_logprob[0];
+        for (size_t j = 1; j < 4; j++) {
            if (hs_data[task_idx].ending_logprob[j] > ending_logprob_max_val) {
                ending_logprob_max_idx = j;
                ending_logprob_max_val =  hs_data[task_idx].ending_logprob[j];
@@ -304,6 +487,12 @@ int main(int argc, char ** argv) {
    params.perplexity = true;
    params.n_batch = std::min(params.n_batch, params.n_ctx);

+    if (params.ppl_stride > 0) {
+        fprintf(stderr, "Will perform strided perplexity calculation -> adjusting context size from %d to %d\n",
+                params.n_ctx, params.n_ctx + params.ppl_stride/2);
+        params.n_ctx += params.ppl_stride/2;
+    }
+
    if (params.n_ctx > 2048) {
        fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);"
                "expect poor results\n", __func__, params.n_ctx);
--- a/examples/quantize-stats/quantize-stats.cpp
+++ b/examples/quantize-stats/quantize-stats.cpp
@@ -24,7 +24,7 @@
 #endif

 struct quantize_stats_params {
-    std::string model = "models/7B/ggml-model-f16.bin";
+    std::string model = "models/7B/ggml-model-f16.gguf";
    bool verbose = false;
    bool per_layer_stats = false;
    bool print_histogram = false;
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -14,25 +14,25 @@ struct quant_option {
 };

 static const std::vector<struct quant_option> QUANT_OPTIONS = {
-    { "Q4_0",   LLAMA_FTYPE_MOSTLY_Q4_0,   " 3.50G, +0.2499 ppl @ 7B", },
-    { "Q4_1",   LLAMA_FTYPE_MOSTLY_Q4_1,   " 3.90G, +0.1846 ppl @ 7B", },
-    { "Q5_0",   LLAMA_FTYPE_MOSTLY_Q5_0,   " 4.30G, +0.0796 ppl @ 7B", },
-    { "Q5_1",   LLAMA_FTYPE_MOSTLY_Q5_1,   " 4.70G, +0.0415 ppl @ 7B", },
+    { "Q4_0",   LLAMA_FTYPE_MOSTLY_Q4_0,   " 3.56G, +0.2166 ppl @ LLaMA-v1-7B", },
+    { "Q4_1",   LLAMA_FTYPE_MOSTLY_Q4_1,   " 3.90G, +0.1585 ppl @ LLaMA-v1-7B", },
+    { "Q5_0",   LLAMA_FTYPE_MOSTLY_Q5_0,   " 4.33G, +0.0683 ppl @ LLaMA-v1-7B", },
+    { "Q5_1",   LLAMA_FTYPE_MOSTLY_Q5_1,   " 4.70G, +0.0349 ppl @ LLaMA-v1-7B", },
 #ifdef GGML_USE_K_QUANTS
-    { "Q2_K",   LLAMA_FTYPE_MOSTLY_Q2_K,   " 2.67G, +0.8698 ppl @ 7B", },
+    { "Q2_K",   LLAMA_FTYPE_MOSTLY_Q2_K,   " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
    { "Q3_K",   LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
-    { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5505 ppl @ 7B", },
-    { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.06G, +0.2437 ppl @ 7B", },
-    { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 3.35G, +0.1803 ppl @ 7B", },
+    { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", },
+    { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.07G, +0.2496 ppl @ LLaMA-v1-7B", },
+    { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 3.35G, +0.1764 ppl @ LLaMA-v1-7B", },
    { "Q4_K",   LLAMA_FTYPE_MOSTLY_Q4_K_M, "alias for Q4_K_M", },
-    { "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 3.56G, +0.1149 ppl @ 7B", },
-    { "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 3.80G, +0.0535 ppl @ 7B", },
+    { "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 3.59G, +0.0992 ppl @ LLaMA-v1-7B", },
+    { "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 3.80G, +0.0532 ppl @ LLaMA-v1-7B", },
    { "Q5_K",   LLAMA_FTYPE_MOSTLY_Q5_K_M, "alias for Q5_K_M", },
-    { "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 4.33G, +0.0353 ppl @ 7B", },
-    { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0142 ppl @ 7B", },
-    { "Q6_K",   LLAMA_FTYPE_MOSTLY_Q6_K,   " 5.15G, +0.0044 ppl @ 7B", },
+    { "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 4.33G, +0.0400 ppl @ LLaMA-v1-7B", },
+    { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0122 ppl @ LLaMA-v1-7B", },
+    { "Q6_K",   LLAMA_FTYPE_MOSTLY_Q6_K,   " 5.15G, -0.0008 ppl @ LLaMA-v1-7B", },
 #endif
-    { "Q8_0",   LLAMA_FTYPE_MOSTLY_Q8_0,   " 6.70G, +0.0004 ppl @ 7B", },
+    { "Q8_0",   LLAMA_FTYPE_MOSTLY_Q8_0,   " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", },
    { "F16",    LLAMA_FTYPE_MOSTLY_F16,    "13.00G              @ 7B", },
    { "F32",    LLAMA_FTYPE_ALL_F32,       "26.00G              @ 7B", },
 };
@@ -68,10 +68,10 @@ bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std:
 }

 // usage:
-//  ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.bin [models/llama/ggml-model-quant.bin] type [nthreads]
+//  ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
 //
 void usage(const char * executable) {
-    fprintf(stderr, "usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.bin [model-quant.bin] type [nthreads]\n\n", executable);
+    fprintf(stderr, "usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
    fprintf(stderr, "  --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
    fprintf(stderr, "  --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
    fprintf(stderr, "\nAllowed quantization types:\n");
@@ -118,8 +118,8 @@ int main(int argc, char ** argv) {
        if (pos != std::string::npos) {
            fpath = fname_inp.substr(0, pos + 1);
        }
-        // export as [inp path]/ggml-model-[ftype].bin
-        fname_out = fpath + "ggml-model-" + ftype_str + ".bin";
+        // export as [inp path]/ggml-model-[ftype].gguf
+        fname_out = fpath + "ggml-model-" + ftype_str + ".gguf";
        arg_idx++;
    }
    else {
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@@ -26,7 +26,6 @@ int main(int argc, char ** argv) {
    auto lparams = llama_context_default_params();

    lparams.n_ctx     = params.n_ctx;
-    lparams.n_gqa     = params.n_gqa;
    lparams.seed      = params.seed;
    lparams.f16_kv    = params.memory_f16;
    lparams.use_mmap  = params.use_mmap;
@@ -45,9 +44,8 @@ int main(int argc, char ** argv) {
        llama_free_model(model);
        return 1;
    }
-    auto tokens = std::vector<llama_token>(params.n_ctx);
-    auto n_prompt_tokens = llama_tokenize(ctx, params.prompt.c_str(), tokens.data(), int(tokens.size()), true);
-
+    auto tokens = llama_tokenize(ctx, params.prompt.c_str(), true);
+    auto n_prompt_tokens = tokens.size();
    if (n_prompt_tokens < 1) {
        fprintf(stderr, "%s : failed to tokenize prompt\n", __func__);
        llama_free(ctx);
@@ -92,7 +90,7 @@ int main(int argc, char ** argv) {
        auto next_token_str = llama_token_to_str(ctx, next_token);
        last_n_tokens_data.push_back(next_token);

-        printf("%s", next_token_str);
+        printf("%s", next_token_str.c_str());
        if (llama_eval(ctx, &next_token, 1, n_past, params.n_threads)) {
            fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
            llama_free(ctx);
@@ -152,7 +150,7 @@ int main(int argc, char ** argv) {
        auto next_token_str = llama_token_to_str(ctx2, next_token);
        last_n_tokens_data.push_back(next_token);

-        printf("%s", next_token_str);
+        printf("%s", next_token_str.c_str());
        if (llama_eval(ctx2, &next_token, 1, n_past, params.n_threads)) {
            fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
            llama_free(ctx2);
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -5,7 +5,7 @@ This example demonstrates a simple HTTP API server and a simple web front end to
 Command line options:

 -   `--threads N`, `-t N`: Set the number of threads to use during computation.
-   `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
+-   `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`).
 -   `-m ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
 -   `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of 4096.
 -   `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
@@ -16,6 +16,7 @@ Command line options:
 -   `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended.
 -   `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped.
 -   `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed.
+-   `--numa`: Attempt optimizations that help on some NUMA systems.
 -   `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
 -   `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
 -   `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600`.
@@ -47,15 +48,14 @@ To get started right away, run the following command, making sure to use the cor
 ### Unix-based systems (Linux, macOS, etc.):

 ```bash
-./server -m models/7B/ggml-model.bin -c 2048
+./server -m models/7B/ggml-model.gguf -c 2048
 ```

 ### Windows:

 ```powershell
-server.exe -m models\7B\ggml-model.bin -c 2048
+server.exe -m models\7B\ggml-model.gguf -c 2048
 ```
-
 The above command will start a server that by default listens on `127.0.0.1:8080`.
 You can consume the endpoints with Postman or NodeJS with axios library. You can visit the web front end at the same url.

@@ -126,7 +126,7 @@ node .

    `stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`.

-    `prompt`: Provide a prompt. Internally, the prompt is compared, and it detects if a part has already been evaluated, and the remaining part will be evaluate. A space is inserted in the front like main.cpp does.
+    `prompt`: Provide a prompt as a string, or as an array of strings and numbers representing tokens. Internally, the prompt is compared, and it detects if a part has already been evaluated, and the remaining part will be evaluate. If the prompt is a string, or an array with the first element given as a string, a space is inserted in the front like main.cpp does.

    `stop`: Specify a JSON array of stopping strings.
    These words will not be included in the completion, so make sure to add them to the prompt for the next iteration (default: []).
@@ -151,6 +151,8 @@ node .

    `mirostat_eta`: Set the Mirostat learning rate, parameter eta (default: 0.1).

+    `grammar`: Set grammar for grammar-based sampling (default: no grammar)
+
    `seed`: Set the random number generator (RNG) seed (default: -1, -1 = random seed).

    `ignore_eos`: Ignore end of stream token and continue generating (default: false).
--- a/examples/server/chat.mjs
+++ b/examples/server/chat.mjs
@@ -1,5 +1,34 @@
 import * as readline from 'node:readline'
 import { stdin, stdout } from 'node:process'
+import { readFileSync } from 'node:fs'
+import { SchemaConverter }  from './public/json-schema-to-grammar.mjs'
+
+const args = process.argv.slice(2);
+const grammarJsonSchemaFile = args.find(
+    (_, index) => args[index - 1] === "--grammar-json-schema"
+);
+const grammarFile = args.find((_, index) => args[index - 1] === "--grammar");
+
+// Example usage: function,arguments
+const grammarJsonSchemaPropOrder = args.find(
+    (_, index) => args[index - 1] === "--grammar-json-schema-prop-order"
+);
+const propOrder = grammarJsonSchemaPropOrder
+    ? grammarJsonSchemaPropOrder
+          .split(",")
+          .reduce((acc, cur, index) => ({ ...acc, [cur]: index }), {})
+    : {};
+
+let grammar = null
+if (grammarJsonSchemaFile) {
+    const schema = JSON.parse(readFileSync(grammarJsonSchemaFile, 'utf-8'))
+    const converter = new SchemaConverter(propOrder)
+    converter.visit(schema, '')
+    grammar = converter.formatGrammar()
+}
+if (grammarFile) {
+    grammar = readFileSync(grammarFile, 'utf-8')
+}

 const API_URL = 'http://127.0.0.1:8080'

@@ -48,6 +77,7 @@ async function chat_completion(question) {
            n_keep: n_keep,
            n_predict: 256,
            stop: ["\n### Human:"], // stop completion after generating this
+            grammar,
            stream: true,
        })
    })
--- a/examples/server/deps.sh
+++ b/examples/server/deps.sh
@@ -11,8 +11,10 @@ echo >> $PUBLIC/index.js # add newline

 FILES=$(ls $PUBLIC)

+cd $PUBLIC
 for FILE in $FILES; do
-  func=$(echo $FILE | tr '.' '_')
-  echo "generate $FILE.hpp ($func)"
-  xxd -n $func -i $PUBLIC/$FILE > $DIR/$FILE.hpp
+  echo "generate $FILE.hpp"
+
+  # use simple flag for old version of xxd
+  xxd -i $FILE > $DIR/$FILE.hpp
 done
--- a/examples/server/index.html.hpp
+++ b/examples/server/index.html.hpp
--- a/examples/server/index.js.hpp
+++ b/examples/server/index.js.hpp
--- a/examples/server/json-schema-to-grammar.mjs.hpp
+++ b/examples/server/json-schema-to-grammar.mjs.hpp
@@ -0,0 +1,311 @@
+unsigned char json_schema_to_grammar_mjs[] = {
+  0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x53, 0x50, 0x41, 0x43, 0x45, 0x5f,
+  0x52, 0x55, 0x4c, 0x45, 0x20, 0x3d, 0x20, 0x27, 0x22, 0x20, 0x22, 0x3f,
+  0x27, 0x3b, 0x0a, 0x0a, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x50, 0x52,
+  0x49, 0x4d, 0x49, 0x54, 0x49, 0x56, 0x45, 0x5f, 0x52, 0x55, 0x4c, 0x45,
+  0x53, 0x20, 0x3d, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x62, 0x6f, 0x6f, 0x6c,
+  0x65, 0x61, 0x6e, 0x3a, 0x20, 0x27, 0x28, 0x22, 0x74, 0x72, 0x75, 0x65,
+  0x22, 0x20, 0x7c, 0x20, 0x22, 0x66, 0x61, 0x6c, 0x73, 0x65, 0x22, 0x29,
+  0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x27, 0x2c, 0x0a, 0x20, 0x20, 0x6e,
+  0x75, 0x6d, 0x62, 0x65, 0x72, 0x3a, 0x20, 0x27, 0x28, 0x22, 0x2d, 0x22,
+  0x3f, 0x20, 0x28, 0x5b, 0x30, 0x2d, 0x39, 0x5d, 0x20, 0x7c, 0x20, 0x5b,
+  0x31, 0x2d, 0x39, 0x5d, 0x20, 0x5b, 0x30, 0x2d, 0x39, 0x5d, 0x2a, 0x29,
+  0x29, 0x20, 0x28, 0x22, 0x2e, 0x22, 0x20, 0x5b, 0x30, 0x2d, 0x39, 0x5d,
+  0x2b, 0x29, 0x3f, 0x20, 0x28, 0x5b, 0x65, 0x45, 0x5d, 0x20, 0x5b, 0x2d,
+  0x2b, 0x5d, 0x3f, 0x20, 0x5b, 0x30, 0x2d, 0x39, 0x5d, 0x2b, 0x29, 0x3f,
+  0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x27, 0x2c, 0x0a, 0x20, 0x20, 0x69,
+  0x6e, 0x74, 0x65, 0x67, 0x65, 0x72, 0x3a, 0x20, 0x27, 0x28, 0x22, 0x2d,
+  0x22, 0x3f, 0x20, 0x28, 0x5b, 0x30, 0x2d, 0x39, 0x5d, 0x20, 0x7c, 0x20,
+  0x5b, 0x31, 0x2d, 0x39, 0x5d, 0x20, 0x5b, 0x30, 0x2d, 0x39, 0x5d, 0x2a,
+  0x29, 0x29, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x27, 0x2c, 0x0a, 0x20,
+  0x20, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x3a, 0x20, 0x60, 0x20, 0x22,
+  0x5c, 0x5c, 0x22, 0x22, 0x20, 0x28, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x5b, 0x5e, 0x22, 0x5c, 0x5c, 0x5c, 0x5c, 0x5d, 0x20,
+  0x7c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x22, 0x5c,
+  0x5c, 0x5c, 0x5c, 0x22, 0x20, 0x28, 0x5b, 0x22, 0x5c, 0x5c, 0x5c, 0x5c,
+  0x2f, 0x62, 0x66, 0x6e, 0x72, 0x74, 0x5d, 0x20, 0x7c, 0x20, 0x22, 0x75,
+  0x22, 0x20, 0x5b, 0x30, 0x2d, 0x39, 0x61, 0x2d, 0x66, 0x41, 0x2d, 0x46,
+  0x5d, 0x20, 0x5b, 0x30, 0x2d, 0x39, 0x61, 0x2d, 0x66, 0x41, 0x2d, 0x46,
+  0x5d, 0x20, 0x5b, 0x30, 0x2d, 0x39, 0x61, 0x2d, 0x66, 0x41, 0x2d, 0x46,
+  0x5d, 0x20, 0x5b, 0x30, 0x2d, 0x39, 0x61, 0x2d, 0x66, 0x41, 0x2d, 0x46,
+  0x5d, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x29, 0x2a, 0x20,
+  0x22, 0x5c, 0x5c, 0x22, 0x22, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x60,
+  0x2c, 0x0a, 0x20, 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x3a, 0x20, 0x27, 0x22,
+  0x6e, 0x75, 0x6c, 0x6c, 0x22, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x27,
+  0x2c, 0x0a, 0x7d, 0x3b, 0x0a, 0x0a, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20,
+  0x49, 0x4e, 0x56, 0x41, 0x4c, 0x49, 0x44, 0x5f, 0x52, 0x55, 0x4c, 0x45,
+  0x5f, 0x43, 0x48, 0x41, 0x52, 0x53, 0x5f, 0x52, 0x45, 0x20, 0x3d, 0x20,
+  0x2f, 0x5b, 0x5e, 0x5c, 0x64, 0x41, 0x2d, 0x5a, 0x61, 0x2d, 0x7a, 0x2d,
+  0x5d, 0x2b, 0x2f, 0x67, 0x3b, 0x0a, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20,
+  0x47, 0x52, 0x41, 0x4d, 0x4d, 0x41, 0x52, 0x5f, 0x4c, 0x49, 0x54, 0x45,
+  0x52, 0x41, 0x4c, 0x5f, 0x45, 0x53, 0x43, 0x41, 0x50, 0x45, 0x5f, 0x52,
+  0x45, 0x20, 0x3d, 0x20, 0x2f, 0x5b, 0x5c, 0x6e, 0x5c, 0x72, 0x22, 0x5d,
+  0x2f, 0x67, 0x3b, 0x0a, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x47, 0x52,
+  0x41, 0x4d, 0x4d, 0x41, 0x52, 0x5f, 0x4c, 0x49, 0x54, 0x45, 0x52, 0x41,
+  0x4c, 0x5f, 0x45, 0x53, 0x43, 0x41, 0x50, 0x45, 0x53, 0x20, 0x3d, 0x20,
+  0x7b, 0x27, 0x5c, 0x72, 0x27, 0x3a, 0x20, 0x27, 0x5c, 0x5c, 0x72, 0x27,
+  0x2c, 0x20, 0x27, 0x5c, 0x6e, 0x27, 0x3a, 0x20, 0x27, 0x5c, 0x5c, 0x6e,
+  0x27, 0x2c, 0x20, 0x27, 0x22, 0x27, 0x3a, 0x20, 0x27, 0x5c, 0x5c, 0x22,
+  0x27, 0x7d, 0x3b, 0x0a, 0x0a, 0x65, 0x78, 0x70, 0x6f, 0x72, 0x74, 0x20,
+  0x63, 0x6c, 0x61, 0x73, 0x73, 0x20, 0x53, 0x63, 0x68, 0x65, 0x6d, 0x61,
+  0x43, 0x6f, 0x6e, 0x76, 0x65, 0x72, 0x74, 0x65, 0x72, 0x20, 0x7b, 0x0a,
+  0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x72, 0x75, 0x63, 0x74, 0x6f,
+  0x72, 0x28, 0x70, 0x72, 0x6f, 0x70, 0x4f, 0x72, 0x64, 0x65, 0x72, 0x29,
+  0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e,
+  0x5f, 0x70, 0x72, 0x6f, 0x70, 0x4f, 0x72, 0x64, 0x65, 0x72, 0x20, 0x3d,
+  0x20, 0x70, 0x72, 0x6f, 0x70, 0x4f, 0x72, 0x64, 0x65, 0x72, 0x20, 0x7c,
+  0x7c, 0x20, 0x7b, 0x7d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68,
+  0x69, 0x73, 0x2e, 0x5f, 0x72, 0x75, 0x6c, 0x65, 0x73, 0x20, 0x3d, 0x20,
+  0x6e, 0x65, 0x77, 0x20, 0x4d, 0x61, 0x70, 0x28, 0x29, 0x3b, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x72, 0x75, 0x6c,
+  0x65, 0x73, 0x2e, 0x73, 0x65, 0x74, 0x28, 0x27, 0x73, 0x70, 0x61, 0x63,
+  0x65, 0x27, 0x2c, 0x20, 0x53, 0x50, 0x41, 0x43, 0x45, 0x5f, 0x52, 0x55,
+  0x4c, 0x45, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20,
+  0x5f, 0x66, 0x6f, 0x72, 0x6d, 0x61, 0x74, 0x4c, 0x69, 0x74, 0x65, 0x72,
+  0x61, 0x6c, 0x28, 0x6c, 0x69, 0x74, 0x65, 0x72, 0x61, 0x6c, 0x29, 0x20,
+  0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20,
+  0x65, 0x73, 0x63, 0x61, 0x70, 0x65, 0x64, 0x20, 0x3d, 0x20, 0x4a, 0x53,
+  0x4f, 0x4e, 0x2e, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x69, 0x66, 0x79,
+  0x28, 0x6c, 0x69, 0x74, 0x65, 0x72, 0x61, 0x6c, 0x29, 0x2e, 0x72, 0x65,
+  0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x47, 0x52, 0x41, 0x4d, 0x4d, 0x41, 0x52, 0x5f, 0x4c, 0x49, 0x54,
+  0x45, 0x52, 0x41, 0x4c, 0x5f, 0x45, 0x53, 0x43, 0x41, 0x50, 0x45, 0x5f,
+  0x52, 0x45, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x20,
+  0x3d, 0x3e, 0x20, 0x47, 0x52, 0x41, 0x4d, 0x4d, 0x41, 0x52, 0x5f, 0x4c,
+  0x49, 0x54, 0x45, 0x52, 0x41, 0x4c, 0x5f, 0x45, 0x53, 0x43, 0x41, 0x50,
+  0x45, 0x53, 0x5b, 0x6d, 0x5d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x29, 0x3b,
+  0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20,
+  0x60, 0x22, 0x24, 0x7b, 0x65, 0x73, 0x63, 0x61, 0x70, 0x65, 0x64, 0x7d,
+  0x22, 0x60, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x5f,
+  0x61, 0x64, 0x64, 0x52, 0x75, 0x6c, 0x65, 0x28, 0x6e, 0x61, 0x6d, 0x65,
+  0x2c, 0x20, 0x72, 0x75, 0x6c, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20,
+  0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x65, 0x73, 0x63, 0x4e, 0x61, 0x6d,
+  0x65, 0x20, 0x3d, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x2e, 0x72, 0x65, 0x70,
+  0x6c, 0x61, 0x63, 0x65, 0x28, 0x49, 0x4e, 0x56, 0x41, 0x4c, 0x49, 0x44,
+  0x5f, 0x52, 0x55, 0x4c, 0x45, 0x5f, 0x43, 0x48, 0x41, 0x52, 0x53, 0x5f,
+  0x52, 0x45, 0x2c, 0x20, 0x27, 0x2d, 0x27, 0x29, 0x3b, 0x0a, 0x20, 0x20,
+  0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x6b, 0x65, 0x79, 0x20, 0x3d, 0x20,
+  0x65, 0x73, 0x63, 0x4e, 0x61, 0x6d, 0x65, 0x3b, 0x0a, 0x0a, 0x20, 0x20,
+  0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f,
+  0x72, 0x75, 0x6c, 0x65, 0x73, 0x2e, 0x68, 0x61, 0x73, 0x28, 0x65, 0x73,
+  0x63, 0x4e, 0x61, 0x6d, 0x65, 0x29, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x74, 0x68, 0x69, 0x73,
+  0x2e, 0x5f, 0x72, 0x75, 0x6c, 0x65, 0x73, 0x2e, 0x67, 0x65, 0x74, 0x28,
+  0x65, 0x73, 0x63, 0x4e, 0x61, 0x6d, 0x65, 0x29, 0x20, 0x3d, 0x3d, 0x3d,
+  0x20, 0x72, 0x75, 0x6c, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20,
+  0x6b, 0x65, 0x79, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d,
+  0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20,
+  0x69, 0x20, 0x3d, 0x20, 0x30, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x74, 0x68, 0x69, 0x73,
+  0x2e, 0x5f, 0x72, 0x75, 0x6c, 0x65, 0x73, 0x2e, 0x68, 0x61, 0x73, 0x28,
+  0x60, 0x24, 0x7b, 0x65, 0x73, 0x63, 0x4e, 0x61, 0x6d, 0x65, 0x7d, 0x24,
+  0x7b, 0x69, 0x7d, 0x60, 0x29, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x20, 0x2b, 0x3d, 0x20, 0x31, 0x3b,
+  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x6b, 0x65, 0x79, 0x20, 0x3d, 0x20, 0x60, 0x24, 0x7b,
+  0x65, 0x73, 0x63, 0x4e, 0x61, 0x6d, 0x65, 0x7d, 0x24, 0x7b, 0x69, 0x7d,
+  0x60, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20,
+  0x20, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x72, 0x75, 0x6c, 0x65,
+  0x73, 0x2e, 0x73, 0x65, 0x74, 0x28, 0x6b, 0x65, 0x79, 0x2c, 0x20, 0x72,
+  0x75, 0x6c, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65,
+  0x74, 0x75, 0x72, 0x6e, 0x20, 0x6b, 0x65, 0x79, 0x3b, 0x0a, 0x20, 0x20,
+  0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x76, 0x69, 0x73, 0x69, 0x74, 0x28, 0x73,
+  0x63, 0x68, 0x65, 0x6d, 0x61, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x29,
+  0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74,
+  0x20, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x54, 0x79, 0x70, 0x65, 0x20,
+  0x3d, 0x20, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2e, 0x74, 0x79, 0x70,
+  0x65, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74,
+  0x20, 0x72, 0x75, 0x6c, 0x65, 0x4e, 0x61, 0x6d, 0x65, 0x20, 0x3d, 0x20,
+  0x6e, 0x61, 0x6d, 0x65, 0x20, 0x7c, 0x7c, 0x20, 0x27, 0x72, 0x6f, 0x6f,
+  0x74, 0x27, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20,
+  0x28, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2e, 0x6f, 0x6e, 0x65, 0x4f,
+  0x66, 0x20, 0x7c, 0x7c, 0x20, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2e,
+  0x61, 0x6e, 0x79, 0x4f, 0x66, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, 0x75, 0x6c,
+  0x65, 0x20, 0x3d, 0x20, 0x28, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2e,
+  0x6f, 0x6e, 0x65, 0x4f, 0x66, 0x20, 0x7c, 0x7c, 0x20, 0x73, 0x63, 0x68,
+  0x65, 0x6d, 0x61, 0x2e, 0x61, 0x6e, 0x79, 0x4f, 0x66, 0x29, 0x2e, 0x6d,
+  0x61, 0x70, 0x28, 0x28, 0x61, 0x6c, 0x74, 0x53, 0x63, 0x68, 0x65, 0x6d,
+  0x61, 0x2c, 0x20, 0x69, 0x29, 0x20, 0x3d, 0x3e, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x76, 0x69,
+  0x73, 0x69, 0x74, 0x28, 0x61, 0x6c, 0x74, 0x53, 0x63, 0x68, 0x65, 0x6d,
+  0x61, 0x2c, 0x20, 0x60, 0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x7d, 0x24,
+  0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x20, 0x3f, 0x20, 0x22, 0x2d, 0x22, 0x20,
+  0x3a, 0x20, 0x22, 0x22, 0x7d, 0x24, 0x7b, 0x69, 0x7d, 0x60, 0x29, 0x0a,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x29, 0x2e, 0x6a, 0x6f, 0x69, 0x6e,
+  0x28, 0x27, 0x20, 0x7c, 0x20, 0x27, 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74,
+  0x68, 0x69, 0x73, 0x2e, 0x5f, 0x61, 0x64, 0x64, 0x52, 0x75, 0x6c, 0x65,
+  0x28, 0x72, 0x75, 0x6c, 0x65, 0x4e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x72,
+  0x75, 0x6c, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20,
+  0x65, 0x6c, 0x73, 0x65, 0x20, 0x69, 0x66, 0x20, 0x28, 0x27, 0x63, 0x6f,
+  0x6e, 0x73, 0x74, 0x27, 0x20, 0x69, 0x6e, 0x20, 0x73, 0x63, 0x68, 0x65,
+  0x6d, 0x61, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e,
+  0x5f, 0x61, 0x64, 0x64, 0x52, 0x75, 0x6c, 0x65, 0x28, 0x72, 0x75, 0x6c,
+  0x65, 0x4e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e,
+  0x5f, 0x66, 0x6f, 0x72, 0x6d, 0x61, 0x74, 0x4c, 0x69, 0x74, 0x65, 0x72,
+  0x61, 0x6c, 0x28, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2e, 0x63, 0x6f,
+  0x6e, 0x73, 0x74, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d,
+  0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x69, 0x66, 0x20, 0x28, 0x27, 0x65,
+  0x6e, 0x75, 0x6d, 0x27, 0x20, 0x69, 0x6e, 0x20, 0x73, 0x63, 0x68, 0x65,
+  0x6d, 0x61, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, 0x75, 0x6c, 0x65, 0x20, 0x3d,
+  0x20, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2e, 0x65, 0x6e, 0x75, 0x6d,
+  0x2e, 0x6d, 0x61, 0x70, 0x28, 0x76, 0x20, 0x3d, 0x3e, 0x20, 0x74, 0x68,
+  0x69, 0x73, 0x2e, 0x5f, 0x66, 0x6f, 0x72, 0x6d, 0x61, 0x74, 0x4c, 0x69,
+  0x74, 0x65, 0x72, 0x61, 0x6c, 0x28, 0x76, 0x29, 0x29, 0x2e, 0x6a, 0x6f,
+  0x69, 0x6e, 0x28, 0x27, 0x20, 0x7c, 0x20, 0x27, 0x29, 0x3b, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20,
+  0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x61, 0x64, 0x64, 0x52, 0x75, 0x6c,
+  0x65, 0x28, 0x72, 0x75, 0x6c, 0x65, 0x4e, 0x61, 0x6d, 0x65, 0x2c, 0x20,
+  0x72, 0x75, 0x6c, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d,
+  0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x69, 0x66, 0x20, 0x28, 0x73, 0x63,
+  0x68, 0x65, 0x6d, 0x61, 0x54, 0x79, 0x70, 0x65, 0x20, 0x3d, 0x3d, 0x3d,
+  0x20, 0x27, 0x6f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x27, 0x20, 0x26, 0x26,
+  0x20, 0x27, 0x70, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x69, 0x65, 0x73,
+  0x27, 0x20, 0x69, 0x6e, 0x20, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x29,
+  0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20,
+  0x54, 0x4f, 0x44, 0x4f, 0x3a, 0x20, 0x60, 0x72, 0x65, 0x71, 0x75, 0x69,
+  0x72, 0x65, 0x64, 0x60, 0x20, 0x6b, 0x65, 0x79, 0x77, 0x6f, 0x72, 0x64,
+  0x20, 0x28, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x70, 0x79, 0x74, 0x68, 0x6f,
+  0x6e, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61,
+  0x74, 0x69, 0x6f, 0x6e, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x70, 0x72, 0x6f, 0x70, 0x4f, 0x72,
+  0x64, 0x65, 0x72, 0x20, 0x3d, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f,
+  0x70, 0x72, 0x6f, 0x70, 0x4f, 0x72, 0x64, 0x65, 0x72, 0x3b, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x70,
+  0x72, 0x6f, 0x70, 0x50, 0x61, 0x69, 0x72, 0x73, 0x20, 0x3d, 0x20, 0x4f,
+  0x62, 0x6a, 0x65, 0x63, 0x74, 0x2e, 0x65, 0x6e, 0x74, 0x72, 0x69, 0x65,
+  0x73, 0x28, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2e, 0x70, 0x72, 0x6f,
+  0x70, 0x65, 0x72, 0x74, 0x69, 0x65, 0x73, 0x29, 0x2e, 0x73, 0x6f, 0x72,
+  0x74, 0x28, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x3d, 0x3e, 0x20,
+  0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f,
+  0x20, 0x73, 0x6f, 0x72, 0x74, 0x20, 0x62, 0x79, 0x20, 0x70, 0x6f, 0x73,
+  0x69, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x69, 0x6e, 0x20, 0x70, 0x72, 0x6f,
+  0x70, 0x5f, 0x6f, 0x72, 0x64, 0x65, 0x72, 0x20, 0x28, 0x69, 0x66, 0x20,
+  0x73, 0x70, 0x65, 0x63, 0x69, 0x66, 0x69, 0x65, 0x64, 0x29, 0x20, 0x74,
+  0x68, 0x65, 0x6e, 0x20, 0x62, 0x79, 0x20, 0x6b, 0x65, 0x79, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74,
+  0x20, 0x6f, 0x72, 0x64, 0x65, 0x72, 0x41, 0x20, 0x3d, 0x20, 0x74, 0x79,
+  0x70, 0x65, 0x6f, 0x66, 0x20, 0x70, 0x72, 0x6f, 0x70, 0x4f, 0x72, 0x64,
+  0x65, 0x72, 0x5b, 0x61, 0x5b, 0x30, 0x5d, 0x5d, 0x20, 0x3d, 0x3d, 0x3d,
+  0x20, 0x27, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x27, 0x20, 0x3f, 0x20,
+  0x70, 0x72, 0x6f, 0x70, 0x4f, 0x72, 0x64, 0x65, 0x72, 0x5b, 0x61, 0x5b,
+  0x30, 0x5d, 0x5d, 0x20, 0x3a, 0x20, 0x49, 0x6e, 0x66, 0x69, 0x6e, 0x69,
+  0x74, 0x79, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6f, 0x72, 0x64, 0x65, 0x72, 0x42,
+  0x20, 0x3d, 0x20, 0x74, 0x79, 0x70, 0x65, 0x6f, 0x66, 0x20, 0x70, 0x72,
+  0x6f, 0x70, 0x4f, 0x72, 0x64, 0x65, 0x72, 0x5b, 0x62, 0x5b, 0x30, 0x5d,
+  0x5d, 0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x27, 0x6e, 0x75, 0x6d, 0x62, 0x65,
+  0x72, 0x27, 0x20, 0x3f, 0x20, 0x70, 0x72, 0x6f, 0x70, 0x4f, 0x72, 0x64,
+  0x65, 0x72, 0x5b, 0x62, 0x5b, 0x30, 0x5d, 0x5d, 0x20, 0x3a, 0x20, 0x49,
+  0x6e, 0x66, 0x69, 0x6e, 0x69, 0x74, 0x79, 0x3b, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20,
+  0x6f, 0x72, 0x64, 0x65, 0x72, 0x41, 0x20, 0x2d, 0x20, 0x6f, 0x72, 0x64,
+  0x65, 0x72, 0x42, 0x20, 0x7c, 0x7c, 0x20, 0x61, 0x5b, 0x30, 0x5d, 0x2e,
+  0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x65, 0x43, 0x6f, 0x6d, 0x70, 0x61, 0x72,
+  0x65, 0x28, 0x62, 0x5b, 0x30, 0x5d, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x7d, 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x72, 0x75, 0x6c, 0x65, 0x20, 0x3d,
+  0x20, 0x27, 0x22, 0x7b, 0x22, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x27,
+  0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x72, 0x6f, 0x70,
+  0x50, 0x61, 0x69, 0x72, 0x73, 0x2e, 0x66, 0x6f, 0x72, 0x45, 0x61, 0x63,
+  0x68, 0x28, 0x28, 0x5b, 0x70, 0x72, 0x6f, 0x70, 0x4e, 0x61, 0x6d, 0x65,
+  0x2c, 0x20, 0x70, 0x72, 0x6f, 0x70, 0x53, 0x63, 0x68, 0x65, 0x6d, 0x61,
+  0x5d, 0x2c, 0x20, 0x69, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74,
+  0x20, 0x70, 0x72, 0x6f, 0x70, 0x52, 0x75, 0x6c, 0x65, 0x4e, 0x61, 0x6d,
+  0x65, 0x20, 0x3d, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x76, 0x69, 0x73,
+  0x69, 0x74, 0x28, 0x70, 0x72, 0x6f, 0x70, 0x53, 0x63, 0x68, 0x65, 0x6d,
+  0x61, 0x2c, 0x20, 0x60, 0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x7d, 0x24,
+  0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x20, 0x3f, 0x20, 0x22, 0x2d, 0x22, 0x20,
+  0x3a, 0x20, 0x22, 0x22, 0x7d, 0x24, 0x7b, 0x70, 0x72, 0x6f, 0x70, 0x4e,
+  0x61, 0x6d, 0x65, 0x7d, 0x60, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x69, 0x20, 0x3e, 0x20,
+  0x30, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x72, 0x75, 0x6c, 0x65, 0x20, 0x2b, 0x3d, 0x20, 0x27,
+  0x20, 0x22, 0x2c, 0x22, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x27, 0x3b,
+  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x75, 0x6c, 0x65, 0x20,
+  0x2b, 0x3d, 0x20, 0x60, 0x20, 0x24, 0x7b, 0x74, 0x68, 0x69, 0x73, 0x2e,
+  0x5f, 0x66, 0x6f, 0x72, 0x6d, 0x61, 0x74, 0x4c, 0x69, 0x74, 0x65, 0x72,
+  0x61, 0x6c, 0x28, 0x70, 0x72, 0x6f, 0x70, 0x4e, 0x61, 0x6d, 0x65, 0x29,
+  0x7d, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x22, 0x3a, 0x22, 0x20,
+  0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x24, 0x7b, 0x70, 0x72, 0x6f, 0x70,
+  0x52, 0x75, 0x6c, 0x65, 0x4e, 0x61, 0x6d, 0x65, 0x7d, 0x60, 0x3b, 0x0a,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x29, 0x3b, 0x0a, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x72, 0x75, 0x6c, 0x65, 0x20, 0x2b, 0x3d, 0x20,
+  0x27, 0x20, 0x22, 0x7d, 0x22, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x27,
+  0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74,
+  0x75, 0x72, 0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x61, 0x64,
+  0x64, 0x52, 0x75, 0x6c, 0x65, 0x28, 0x72, 0x75, 0x6c, 0x65, 0x4e, 0x61,
+  0x6d, 0x65, 0x2c, 0x20, 0x72, 0x75, 0x6c, 0x65, 0x29, 0x3b, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x7d, 0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x69, 0x66,
+  0x20, 0x28, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x54, 0x79, 0x70, 0x65,
+  0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x27, 0x61, 0x72, 0x72, 0x61, 0x79, 0x27,
+  0x20, 0x26, 0x26, 0x20, 0x27, 0x69, 0x74, 0x65, 0x6d, 0x73, 0x27, 0x20,
+  0x69, 0x6e, 0x20, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x29, 0x20, 0x7b,
+  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x54, 0x4f,
+  0x44, 0x4f, 0x20, 0x60, 0x70, 0x72, 0x65, 0x66, 0x69, 0x78, 0x49, 0x74,
+  0x65, 0x6d, 0x73, 0x60, 0x20, 0x6b, 0x65, 0x79, 0x77, 0x6f, 0x72, 0x64,
+  0x20, 0x28, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x70, 0x79, 0x74, 0x68, 0x6f,
+  0x6e, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61,
+  0x74, 0x69, 0x6f, 0x6e, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x74, 0x65, 0x6d, 0x52, 0x75,
+  0x6c, 0x65, 0x4e, 0x61, 0x6d, 0x65, 0x20, 0x3d, 0x20, 0x74, 0x68, 0x69,
+  0x73, 0x2e, 0x76, 0x69, 0x73, 0x69, 0x74, 0x28, 0x73, 0x63, 0x68, 0x65,
+  0x6d, 0x61, 0x2e, 0x69, 0x74, 0x65, 0x6d, 0x73, 0x2c, 0x20, 0x60, 0x24,
+  0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x7d, 0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65,
+  0x20, 0x3f, 0x20, 0x22, 0x2d, 0x22, 0x20, 0x3a, 0x20, 0x22, 0x22, 0x7d,
+  0x69, 0x74, 0x65, 0x6d, 0x60, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, 0x75, 0x6c, 0x65,
+  0x20, 0x3d, 0x20, 0x60, 0x22, 0x5b, 0x22, 0x20, 0x73, 0x70, 0x61, 0x63,
+  0x65, 0x20, 0x28, 0x24, 0x7b, 0x69, 0x74, 0x65, 0x6d, 0x52, 0x75, 0x6c,
+  0x65, 0x4e, 0x61, 0x6d, 0x65, 0x7d, 0x20, 0x28, 0x22, 0x2c, 0x22, 0x20,
+  0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x24, 0x7b, 0x69, 0x74, 0x65, 0x6d,
+  0x52, 0x75, 0x6c, 0x65, 0x4e, 0x61, 0x6d, 0x65, 0x7d, 0x29, 0x2a, 0x29,
+  0x3f, 0x20, 0x22, 0x5d, 0x22, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x60,
+  0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75,
+  0x72, 0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x61, 0x64, 0x64,
+  0x52, 0x75, 0x6c, 0x65, 0x28, 0x72, 0x75, 0x6c, 0x65, 0x4e, 0x61, 0x6d,
+  0x65, 0x2c, 0x20, 0x72, 0x75, 0x6c, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20,
+  0x20, 0x20, 0x7d, 0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x7b, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x21, 0x50, 0x52,
+  0x49, 0x4d, 0x49, 0x54, 0x49, 0x56, 0x45, 0x5f, 0x52, 0x55, 0x4c, 0x45,
+  0x53, 0x5b, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x54, 0x79, 0x70, 0x65,
+  0x5d, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x74, 0x68, 0x72, 0x6f, 0x77, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x45,
+  0x72, 0x72, 0x6f, 0x72, 0x28, 0x60, 0x55, 0x6e, 0x72, 0x65, 0x63, 0x6f,
+  0x67, 0x6e, 0x69, 0x7a, 0x65, 0x64, 0x20, 0x73, 0x63, 0x68, 0x65, 0x6d,
+  0x61, 0x3a, 0x20, 0x24, 0x7b, 0x4a, 0x53, 0x4f, 0x4e, 0x2e, 0x73, 0x74,
+  0x72, 0x69, 0x6e, 0x67, 0x69, 0x66, 0x79, 0x28, 0x73, 0x63, 0x68, 0x65,
+  0x6d, 0x61, 0x29, 0x7d, 0x60, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65,
+  0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x61,
+  0x64, 0x64, 0x52, 0x75, 0x6c, 0x65, 0x28, 0x0a, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x72, 0x75, 0x6c, 0x65, 0x4e, 0x61, 0x6d, 0x65,
+  0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x27, 0x72, 0x6f, 0x6f, 0x74, 0x27, 0x20,
+  0x3f, 0x20, 0x27, 0x72, 0x6f, 0x6f, 0x74, 0x27, 0x20, 0x3a, 0x20, 0x73,
+  0x63, 0x68, 0x65, 0x6d, 0x61, 0x54, 0x79, 0x70, 0x65, 0x2c, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x50, 0x52, 0x49, 0x4d, 0x49,
+  0x54, 0x49, 0x56, 0x45, 0x5f, 0x52, 0x55, 0x4c, 0x45, 0x53, 0x5b, 0x73,
+  0x63, 0x68, 0x65, 0x6d, 0x61, 0x54, 0x79, 0x70, 0x65, 0x5d, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
+  0x7d, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x66, 0x6f, 0x72,
+  0x6d, 0x61, 0x74, 0x47, 0x72, 0x61, 0x6d, 0x6d, 0x61, 0x72, 0x28, 0x29,
+  0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x67,
+  0x72, 0x61, 0x6d, 0x6d, 0x61, 0x72, 0x20, 0x3d, 0x20, 0x27, 0x27, 0x3b,
+  0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x72,
+  0x75, 0x6c, 0x65, 0x73, 0x2e, 0x66, 0x6f, 0x72, 0x45, 0x61, 0x63, 0x68,
+  0x28, 0x28, 0x72, 0x75, 0x6c, 0x65, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65,
+  0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x67, 0x72, 0x61, 0x6d, 0x6d, 0x61, 0x72, 0x20, 0x2b, 0x3d, 0x20,
+  0x60, 0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x7d, 0x20, 0x3a, 0x3a, 0x3d,
+  0x20, 0x24, 0x7b, 0x72, 0x75, 0x6c, 0x65, 0x7d, 0x5c, 0x6e, 0x60, 0x3b,
+  0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x67, 0x72, 0x61, 0x6d,
+  0x6d, 0x61, 0x72, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x7d, 0x0a
+};
+unsigned int json_schema_to_grammar_mjs_len = 3695;
--- a/examples/server/public/index.html
+++ b/examples/server/public/index.html
@@ -141,14 +141,15 @@
    } from '/index.js';

    import { llama } from '/completion.js';
+    import { SchemaConverter } from '/json-schema-to-grammar.mjs';

    const session = signal({
-      prompt: "This is a conversation between user and llama, a friendly chatbot. respond in simple markdown.",
+      prompt: "This is a conversation between User and Llama, a friendly chatbot. Llama is helpful, kind, honest, good at writing, and never fails to answer any requests immediately and with precision.",
      template: "{{prompt}}\n\n{{history}}\n{{char}}:",
      historyTemplate: "{{name}}: {{message}}",
      transcript: [],
      type: "chat",
-      char: "llama",
+      char: "Llama",
      user: "User",
    })

@@ -166,8 +167,139 @@
      mirostat: 0, // 0/1/2
      mirostat_tau: 5, // target entropy
      mirostat_eta: 0.1, // learning rate
+      grammar: '',
    })

+    /* START: Support for storing prompt templates and parameters in borwser LocalStorage */
+
+    const local_storage_storageKey = "llamacpp_server_local_storage";
+
+    function local_storage_setDataFromObject(tag, content) {
+      localStorage.setItem(local_storage_storageKey + '/' + tag, JSON.stringify(content));
+    }
+
+    function local_storage_setDataFromRawText(tag, content) {
+      localStorage.setItem(local_storage_storageKey + '/' + tag, content);
+    }
+
+    function local_storage_getDataAsObject(tag) {
+      const item = localStorage.getItem(local_storage_storageKey + '/' + tag);
+      if (!item) {
+        return null;
+      } else {
+        return JSON.parse(item);
+      }
+    }
+
+    function local_storage_getDataAsRawText(tag) {
+      const item = localStorage.getItem(local_storage_storageKey + '/' + tag);
+      if (!item) {
+        return null;
+      } else {
+        return item;
+      }
+    }
+
+    // create a container for user templates and settings
+
+    const savedUserTemplates = signal({})
+    const selectedUserTemplate = signal({ name: '', template: { session: {}, params: {} } })
+
+    // let's import locally saved templates and settings if there are any
+    // user templates and settings are stored in one object
+    // in form of { "templatename": "templatedata" } and { "settingstemplatename":"settingsdata" }
+
+    console.log('Importing saved templates')
+
+    let importedTemplates = local_storage_getDataAsObject('user_templates')
+
+    if (importedTemplates) {
+      // saved templates were successfuly imported.
+
+      console.log('Processing saved templates and updating default template')
+
+      //console.log(importedTemplates);
+      savedUserTemplates.value = importedTemplates;
+
+      //override default template
+      savedUserTemplates.value.default = { session: session.value, params: params.value }
+      local_storage_setDataFromObject('user_templates', savedUserTemplates.value)
+    } else {
+      // no saved templates detected.
+
+      console.log('Initializing LocalStorage and saving default template')
+
+      savedUserTemplates.value = { "default": { session: session.value, params: params.value } }
+      local_storage_setDataFromObject('user_templates', savedUserTemplates.value)
+    }
+
+    function userTemplateResetToDefault() {
+      console.log('Reseting themplate to default')
+      selectedUserTemplate.value.name = 'default';
+      selectedUserTemplate.value.data = savedUserTemplates.value['default'];
+    }
+
+    function userTemplateApply(t) {
+      session.value = t.data.session;
+      params.value = t.data.params;
+    }
+
+    function userTemplateResetToDefaultAndApply() {
+      userTemplateResetToDefault()
+      userTemplateApply(selectedUserTemplate.value)
+    }
+
+    function userTemplateLoadAndApplyAutosaved() {
+      // get autosaved last used template
+      let lastUsedTemplate = local_storage_getDataAsObject('user_templates_last')
+
+      if (lastUsedTemplate) {
+
+        console.log('Autosaved template found, restoring')
+
+        selectedUserTemplate.value = lastUsedTemplate
+      }
+      else {
+
+        console.log('No autosaved template found, using default template')
+        // no autosaved last used template was found, so load from default.
+
+        userTemplateResetToDefault()
+      }
+
+      console.log('Applying template')
+      // and update internal data from templates
+
+      userTemplateApply(selectedUserTemplate.value)
+    }
+
+    //console.log(savedUserTemplates.value)
+    //console.log(selectedUserTemplate.value)
+
+    function userTemplateAutosave() {
+      console.log('Template Autosave...')
+      if (selectedUserTemplate.value.name == 'default') {
+        // we don't want to save over default template, so let's create a new one
+        let newTemplateName = 'UserTemplate-' + Date.now().toString()
+        let newTemplate = { 'name': newTemplateName, 'data': { 'session': session.value, 'params': params.value } }
+
+        console.log('Saving as ' + newTemplateName)
+
+        // save in the autosave slot
+        local_storage_setDataFromObject('user_templates_last', newTemplate)
+
+        // and load it back and apply
+        userTemplateLoadAndApplyAutosaved()
+      } else {
+        local_storage_setDataFromObject('user_templates_last', { 'name': selectedUserTemplate.value.name, 'data': { 'session': session.value, 'params': params.value } })
+      }
+    }
+
+    console.log('Checking for autosaved last used template')
+    userTemplateLoadAndApplyAutosaved()
+
+    /* END: Support for storing prompt templates and parameters in browsers LocalStorage */
+
    const llamaStats = signal(null)
    const controller = signal(null)

@@ -304,6 +436,26 @@
      const updateParamsFloat = (el) => params.value = { ...params.value, [el.target.name]: parseFloat(el.target.value) }
      const updateParamsInt = (el) => params.value = { ...params.value, [el.target.name]: Math.floor(parseFloat(el.target.value)) }

+      const grammarJsonSchemaPropOrder = signal('')
+      const updateGrammarJsonSchemaPropOrder = (el) => grammarJsonSchemaPropOrder.value = el.target.value
+      const convertJSONSchemaGrammar = () => {
+        try {
+          const schema = JSON.parse(params.value.grammar)
+          const converter = new SchemaConverter(
+            grammarJsonSchemaPropOrder.value
+              .split(',')
+              .reduce((acc, cur, i) => ({...acc, [cur.trim()]: i}), {})
+          )
+          converter.visit(schema, '')
+          params.value = {
+            ...params.value,
+            grammar: converter.formatGrammar(),
+          }
+        } catch (e) {
+          alert(`Convert failed: ${e.message}`)
+        }
+      }
+
      const FloatField = ({label, max, min, name, step, value}) => {
        return html`
          <div>
@@ -324,8 +476,34 @@
        `
      };

+      const userTemplateReset = (e) => {
+        e.preventDefault();
+        userTemplateResetToDefaultAndApply()
+      }
+
+      const UserTemplateResetButton = () => {
+        if (selectedUserTemplate.value.name == 'default') {
+          return html`
+            <button disabled>Using default template</button>
+          `
+        }
+
+        return html`
+          <button onclick=${userTemplateReset}>Reset all to default</button>
+        `
+      };
+
+      useEffect(() => {
+        // autosave template on every change
+        userTemplateAutosave()
+      }, [session.value, params.value])
+
      return html`
        <form>
+          <fieldset>
+            <${UserTemplateResetButton}/>
+          </fieldset>
+
          <fieldset>
            <div>
              <label for="prompt">Prompt</label>
@@ -355,6 +533,13 @@
              <label for="template">Chat history template</label>
              <textarea id="template" name="historyTemplate" value="${session.value.historyTemplate}" rows=1 oninput=${updateSession}/>
            </div>
+
+            <div>
+              <label for="template">Grammar</label>
+              <textarea id="grammar" name="grammar" placeholder="Use gbnf or JSON Schema+convert" value="${params.value.grammar}" rows=4 oninput=${updateParams}/>
+              <input type="text" name="prop-order" placeholder="order: prop1,prop2,prop3" oninput=${updateGrammarJsonSchemaPropOrder} />
+              <button type="button" onclick=${convertJSONSchemaGrammar}>Convert JSON Schema</button>
+            </div>
          </fieldset>

          <fieldset class="two">
--- a/examples/server/public/index.js
+++ b/examples/server/public/index.js
--- a/examples/server/public/json-schema-to-grammar.mjs
+++ b/examples/server/public/json-schema-to-grammar.mjs
@@ -0,0 +1,112 @@
+const SPACE_RULE = '" "?';
+
+const PRIMITIVE_RULES = {
+  boolean: '("true" | "false") space',
+  number: '("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space',
+  integer: '("-"? ([0-9] | [1-9] [0-9]*)) space',
+  string: ` "\\"" (
+        [^"\\\\] |
+        "\\\\" (["\\\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
+      )* "\\"" space`,
+  null: '"null" space',
+};
+
+const INVALID_RULE_CHARS_RE = /[^\dA-Za-z-]+/g;
+const GRAMMAR_LITERAL_ESCAPE_RE = /[\n\r"]/g;
+const GRAMMAR_LITERAL_ESCAPES = {'\r': '\\r', '\n': '\\n', '"': '\\"'};
+
+export class SchemaConverter {
+  constructor(propOrder) {
+    this._propOrder = propOrder || {};
+    this._rules = new Map();
+    this._rules.set('space', SPACE_RULE);
+  }
+
+  _formatLiteral(literal) {
+    const escaped = JSON.stringify(literal).replace(
+      GRAMMAR_LITERAL_ESCAPE_RE,
+      m => GRAMMAR_LITERAL_ESCAPES[m]
+    );
+    return `"${escaped}"`;
+  }
+
+  _addRule(name, rule) {
+    let escName = name.replace(INVALID_RULE_CHARS_RE, '-');
+    let key = escName;
+
+    if (this._rules.has(escName)) {
+      if (this._rules.get(escName) === rule) {
+        return key;
+      }
+
+      let i = 0;
+      while (this._rules.has(`${escName}${i}`)) {
+        i += 1;
+      }
+      key = `${escName}${i}`;
+    }
+
+    this._rules.set(key, rule);
+    return key;
+  }
+
+  visit(schema, name) {
+    const schemaType = schema.type;
+    const ruleName = name || 'root';
+
+    if (schema.oneOf || schema.anyOf) {
+      const rule = (schema.oneOf || schema.anyOf).map((altSchema, i) =>
+        this.visit(altSchema, `${name}${name ? "-" : ""}${i}`)
+      ).join(' | ');
+
+      return this._addRule(ruleName, rule);
+    } else if ('const' in schema) {
+      return this._addRule(ruleName, this._formatLiteral(schema.const));
+    } else if ('enum' in schema) {
+      const rule = schema.enum.map(v => this._formatLiteral(v)).join(' | ');
+      return this._addRule(ruleName, rule);
+    } else if (schemaType === 'object' && 'properties' in schema) {
+      // TODO: `required` keyword (from python implementation)
+      const propOrder = this._propOrder;
+      const propPairs = Object.entries(schema.properties).sort((a, b) => {
+        // sort by position in prop_order (if specified) then by key
+        const orderA = typeof propOrder[a[0]] === 'number' ? propOrder[a[0]] : Infinity;
+        const orderB = typeof propOrder[b[0]] === 'number' ? propOrder[b[0]] : Infinity;
+        return orderA - orderB || a[0].localeCompare(b[0]);
+      });
+
+      let rule = '"{" space';
+      propPairs.forEach(([propName, propSchema], i) => {
+        const propRuleName = this.visit(propSchema, `${name}${name ? "-" : ""}${propName}`);
+        if (i > 0) {
+          rule += ' "," space';
+        }
+        rule += ` ${this._formatLiteral(propName)} space ":" space ${propRuleName}`;
+      });
+      rule += ' "}" space';
+
+      return this._addRule(ruleName, rule);
+    } else if (schemaType === 'array' && 'items' in schema) {
+      // TODO `prefixItems` keyword (from python implementation)
+      const itemRuleName = this.visit(schema.items, `${name}${name ? "-" : ""}item`);
+      const rule = `"[" space (${itemRuleName} ("," space ${itemRuleName})*)? "]" space`;
+      return this._addRule(ruleName, rule);
+    } else {
+      if (!PRIMITIVE_RULES[schemaType]) {
+        throw new Error(`Unrecognized schema: ${JSON.stringify(schema)}`);
+      }
+      return this._addRule(
+        ruleName === 'root' ? 'root' : schemaType,
+        PRIMITIVE_RULES[schemaType]
+      );
+    }
+  }
+
+  formatGrammar() {
+    let grammar = '';
+    this._rules.forEach((rule, name) => {
+      grammar += `${name} ::= ${rule}\n`;
+    });
+    return grammar;
+  }
+}
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1,6 +1,7 @@
 #include "common.h"
 #include "llama.h"
 #include "build-info.h"
+#include "grammar-parser.h"

 #ifndef NDEBUG
 // crash the server in debug mode, otherwise send an http 500 error
@@ -14,6 +15,7 @@
 #include "index.html.hpp"
 #include "index.js.hpp"
 #include "completion.js.hpp"
+#include "json-schema-to-grammar.mjs.hpp"

 #ifndef SERVER_VERBOSE
 #define SERVER_VERBOSE 1
@@ -188,6 +190,7 @@ struct llama_server_context
    size_t n_past = 0;
    size_t n_remain = 0;

+    json prompt;
    std::vector<llama_token> embd;
    std::vector<llama_token> last_n_tokens;

@@ -195,6 +198,9 @@ struct llama_server_context
    llama_context *ctx = nullptr;
    gpt_params params;

+    grammar_parser::parse_state parsed_grammar;
+    llama_grammar *grammar = nullptr;
+
    bool truncated = false;
    bool stopped_eos = false;
    bool stopped_word = false;
@@ -226,6 +232,7 @@ struct llama_server_context
    void rewind()
    {
        params.antiprompt.clear();
+        params.grammar.clear();
        num_prompt_tokens = 0;
        num_tokens_predicted = 0;
        generated_text = "";
@@ -237,9 +244,13 @@ struct llama_server_context
        stopped_limit = false;
        stopping_word = "";
        multibyte_pending = 0;
-
        n_remain = 0;
        n_past = 0;
+
+        if (grammar != nullptr) {
+            llama_grammar_free(grammar);
+            grammar = nullptr;
+        }
    }

    bool loadModel(const gpt_params &params_)
@@ -257,10 +268,82 @@ struct llama_server_context
        return true;
    }

+    std::vector<llama_token> tokenize(json json_prompt, bool add_bos)
+    {
+        // If `add_bos` is true, we only add BOS, when json_prompt is a string,
+        // or the first element of the json_prompt array is a string.
+        std::vector<llama_token> prompt_tokens;
+
+        if (json_prompt.is_array())
+        {
+            bool first = true;
+            for (const auto& p : json_prompt)
+            {
+                if (p.is_string())
+                {
+                    auto s = p.template get<std::string>();
+                    std::vector<llama_token> p;
+                    if (first)
+                    {
+                        s.insert(0, 1, ' '); // add a space if it's the first
+                        p = ::llama_tokenize(ctx, s, add_bos);
+                        first = false;
+                    }
+                    else
+                    {
+                        p = ::llama_tokenize(ctx, s, false);
+                    }
+                    prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
+                }
+                else
+                {
+                    if (first)
+                    {
+                        first = false;
+                    }
+                    prompt_tokens.push_back(p.template get<llama_token>());
+                }
+            }
+        }
+        else
+        {
+            auto s = json_prompt.template get<std::string>();
+            s.insert(0, 1, ' '); // always add a first space
+            prompt_tokens = ::llama_tokenize(ctx, s, add_bos);
+        }
+
+        return prompt_tokens;
+    }
+
+    bool loadGrammar()
+    {
+        if (!params.grammar.empty()) {
+            parsed_grammar = grammar_parser::parse(params.grammar.c_str());
+            // will be empty (default) if there are parse errors
+            if (parsed_grammar.rules.empty()) {
+                LOG_ERROR("grammar parse error", {{"grammar", params.grammar}});
+                return false;
+            }
+            grammar_parser::print_grammar(stderr, parsed_grammar);
+
+            {
+                auto it = params.logit_bias.find(llama_token_eos(ctx));
+                if (it != params.logit_bias.end() && it->second == -INFINITY) {
+                    LOG_WARNING("EOS token is disabled, which will cause most grammars to fail", {});
+                }
+            }
+
+            std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
+            grammar = llama_grammar_init(
+                grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
+        }
+        return true;
+    }
+
    void loadPrompt()
    {
-        params.prompt.insert(0, 1, ' '); // always add a first space
-        std::vector<llama_token> prompt_tokens = ::llama_tokenize(ctx, params.prompt, true);
+        auto prompt_tokens = tokenize(prompt, true);  // always add BOS
+
        num_prompt_tokens = prompt_tokens.size();

        if (params.n_keep < 0)
@@ -367,7 +450,7 @@ struct llama_server_context
        if (params.n_predict == 0)
        {
            has_next_token = false;
-            result.tok = llama_token_eos();
+            result.tok = llama_token_eos(ctx);
            return result;
        }

@@ -407,7 +490,7 @@ struct llama_server_context
            llama_token_data_array candidates_p = {candidates.data(), candidates.size(), false};

            // Apply penalties
-            float nl_logit = logits[llama_token_nl()];
+            float nl_logit = logits[llama_token_nl(ctx)];
            auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), params.n_ctx);
            llama_sample_repetition_penalty(ctx, &candidates_p,
                                            last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
@@ -417,7 +500,11 @@ struct llama_server_context
                                                          last_n_repeat, alpha_frequency, alpha_presence);
            if (!penalize_nl)
            {
-                logits[llama_token_nl()] = nl_logit;
+                logits[llama_token_nl(ctx)] = nl_logit;
+            }
+
+            if (grammar != nullptr) {
+                llama_sample_grammar(ctx, &candidates_p, grammar);
            }

            if (temp <= 0)
@@ -457,10 +544,15 @@ struct llama_server_context
                }
            }

+            if (grammar != nullptr) {
+                llama_grammar_accept_token(ctx, grammar, result.tok);
+            }
+
            for (size_t i = 0; i < std::min(candidates_p.size, (size_t)n_probs); ++i)
            {
                result.probs.push_back({candidates_p.data[i].id, candidates_p.data[i].p});
            }
+
            last_n_tokens.erase(last_n_tokens.begin());
            last_n_tokens.push_back(result.tok);
            num_tokens_predicted++;
@@ -471,7 +563,7 @@ struct llama_server_context
        // decrement remaining sampling budget
        --n_remain;

-        if (!embd.empty() && embd.back() == llama_token_eos())
+        if (!embd.empty() && embd.back() == llama_token_eos(ctx))
        {
            // stopping_word = llama_token_to_str(ctx, embd.back());
            has_next_token = false;
@@ -608,8 +700,6 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
    fprintf(stdout, "  -v, --verbose         verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
    fprintf(stdout, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
    fprintf(stdout, "  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
-    fprintf(stdout, "  -gqa N, --gqa N       grouped-query attention factor (TEMP!!! use 8 for LLaMAv2 70B) (default: %d)\n", params.n_gqa);
-    fprintf(stdout, "  -eps N, --rms-norm-eps N rms norm eps (TEMP!!! use 1e-5 for LLaMAv2) (default: %.1e)\n", params.rms_norm_eps);
    fprintf(stdout, "  --rope-freq-base N    RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
    fprintf(stdout, "  --rope-freq-scale N   RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
    fprintf(stdout, "  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
@@ -623,17 +713,17 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
    {
        fprintf(stdout, "  --no-mmap             do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
    }
+    fprintf(stdout, "  --numa                attempt optimizations that help on some NUMA systems\n");
 #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
    fprintf(stdout, "  -ngl N, --n-gpu-layers N\n");
    fprintf(stdout, "                        number of layers to store in VRAM\n");
    fprintf(stdout, "  -ts SPLIT --tensor-split SPLIT\n");
    fprintf(stdout, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
-    fprintf(stdout, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
    fprintf(stdout, "  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n");
    fprintf(stdout, "  -lv, --low-vram don't allocate VRAM scratch buffer\n");
-    fprintf(stdout, "  -mmq, --mul-mat-q     use experimental mul_mat_q CUDA kernels instead of cuBLAS. TEMP!!!\n" );
-    fprintf(stdout, "                        Reduces VRAM usage by 700/970/1430 MiB for 7b/13b/33b but prompt processing speed\n" );
-    fprintf(stdout, "                        is still suboptimal, especially q2_K, q3_K, q5_K, and q6_K.\n" );
+    fprintf(stdout, "  -nommq, --no-mul-mat-q\n");
+    fprintf(stdout, "                        use cuBLAS instead of custom mul_mat_q CUDA kernels.\n");
+    fprintf(stdout, "                        Not recommended since this is both slower and uses more VRAM.\n");
 #endif
    fprintf(stdout, "  -m FNAME, --model FNAME\n");
    fprintf(stdout, "                        model path (default: %s)\n", params.model.c_str());
@@ -729,23 +819,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
            }
            params.n_ctx = std::stoi(argv[i]);
        }
-        else if (arg == "-gqa" || arg == "--gqa")
-        {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            params.n_gqa = std::stoi(argv[i]);
-        }
-        else if (arg == "-eps" || arg == "--rms-norm-eps") {
-            if (++i >= argc)
-            {
-                invalid_param = true;
-                break;
-            }
-            params.rms_norm_eps = std::stof(argv[i]);
-        }
        else if (arg == "--rope-freq-base")
        {
            if (++i >= argc)
@@ -841,12 +914,12 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
            LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n", {});
 #endif // GGML_USE_CUBLAS
        }
-        else if (arg == "--mul-mat-q" || arg == "-mmq")
+        else if (arg == "--no-mul-mat-q" || arg == "-nommq")
        {
 #ifdef GGML_USE_CUBLAS
-            params.mul_mat_q = true;
+            params.mul_mat_q = false;
 #else
-            LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. It is not possible to use mul_mat_q kernels.\n", {});
+            LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n", {});
 #endif // GGML_USE_CUBLAS
        }
        else if (arg == "--main-gpu" || arg == "-mg")
@@ -897,6 +970,10 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
        {
            params.use_mmap = false;
        }
+        else if (arg == "--numa")
+        {
+            params.numa = true;
+        }
        else if (arg == "--embedding")
        {
            params.embedding = true;
@@ -919,7 +996,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,

 static json format_generation_settings(llama_server_context &llama)
 {
-    const auto eos_bias = llama.params.logit_bias.find(llama_token_eos());
+    const auto eos_bias = llama.params.logit_bias.find(llama_token_eos(llama.ctx));
    const bool ignore_eos = eos_bias != llama.params.logit_bias.end() &&
                            eos_bias->second < 0.0f && std::isinf(eos_bias->second);

@@ -947,6 +1024,7 @@ static json format_generation_settings(llama_server_context &llama)
        {"stream", llama.stream},
        {"logit_bias", llama.params.logit_bias},
        {"n_probs", llama.params.n_probs},
+        {"grammar", llama.params.grammar},
    };
 }

@@ -964,7 +1042,7 @@ static json format_timings(llama_server_context &llama)
    assert(timings.n_eval == llama.num_tokens_predicted);

    return json{
-        {"prompt_n", timings.n_eval},
+        {"prompt_n", timings.n_p_eval},
        {"prompt_ms", timings.t_p_eval_ms},
        {"prompt_per_token_ms", timings.t_p_eval_ms / timings.n_p_eval},
        {"prompt_per_second", 1e3 / timings.t_p_eval_ms * timings.n_p_eval},
@@ -986,14 +1064,13 @@ static json format_final_response(llama_server_context &llama, const std::string
        {"tokens_predicted", llama.num_tokens_predicted},
        {"tokens_evaluated", llama.num_prompt_tokens},
        {"generation_settings", format_generation_settings(llama)},
-        {"prompt", llama.params.prompt},
+        {"prompt", llama.prompt},
        {"truncated", llama.truncated},
        {"stopped_eos", llama.stopped_eos},
        {"stopped_word", llama.stopped_word},
        {"stopped_limit", llama.stopped_limit},
        {"stopping_word", llama.stopping_word},
        {"tokens_cached", llama.n_past},
-        {"tokens_predicted", llama.num_tokens_predicted},
        {"timings", format_timings(llama)},
    };

@@ -1026,34 +1103,52 @@ static json format_tokenizer_response(const std::vector<llama_token> &tokens)
        {"tokens", tokens}};
 }

+template <typename T>
+static T json_value(const json &body, const std::string &key, const T &default_value)
+{
+    // Fallback null to default value
+    return body.contains(key) && !body.at(key).is_null()
+        ? body.value(key, default_value)
+        : default_value;
+}
+
 static void parse_options_completion(const json &body, llama_server_context &llama)
 {
    gpt_params default_params;

-    llama.stream = body.value("stream", false);
-    llama.params.n_predict = body.value("n_predict", default_params.n_predict);
-    llama.params.top_k = body.value("top_k", default_params.top_k);
-    llama.params.top_p = body.value("top_p", default_params.top_p);
-    llama.params.tfs_z = body.value("tfs_z", default_params.tfs_z);
-    llama.params.typical_p = body.value("typical_p", default_params.typical_p);
-    llama.params.repeat_last_n = body.value("repeat_last_n", default_params.repeat_last_n);
-    llama.params.temp = body.value("temperature", default_params.temp);
-    llama.params.repeat_penalty = body.value("repeat_penalty", default_params.repeat_penalty);
-    llama.params.presence_penalty = body.value("presence_penalty", default_params.presence_penalty);
-    llama.params.frequency_penalty = body.value("frequency_penalty", default_params.frequency_penalty);
-    llama.params.mirostat = body.value("mirostat", default_params.mirostat);
-    llama.params.mirostat_tau = body.value("mirostat_tau", default_params.mirostat_tau);
-    llama.params.mirostat_eta = body.value("mirostat_eta", default_params.mirostat_eta);
-    llama.params.penalize_nl = body.value("penalize_nl", default_params.penalize_nl);
-    llama.params.n_keep = body.value("n_keep", default_params.n_keep);
-    llama.params.seed = body.value("seed", default_params.seed);
-    llama.params.prompt = body.value("prompt", default_params.prompt);
-    llama.params.n_probs = body.value("n_probs", default_params.n_probs);
+    llama.stream = json_value(body, "stream", false);
+    llama.params.n_predict = json_value(body, "n_predict", default_params.n_predict);
+    llama.params.top_k = json_value(body, "top_k", default_params.top_k);
+    llama.params.top_p = json_value(body, "top_p", default_params.top_p);
+    llama.params.tfs_z = json_value(body, "tfs_z", default_params.tfs_z);
+    llama.params.typical_p = json_value(body, "typical_p", default_params.typical_p);
+    llama.params.repeat_last_n = json_value(body, "repeat_last_n", default_params.repeat_last_n);
+    llama.params.temp = json_value(body, "temperature", default_params.temp);
+    llama.params.repeat_penalty = json_value(body, "repeat_penalty", default_params.repeat_penalty);
+    llama.params.presence_penalty = json_value(body, "presence_penalty", default_params.presence_penalty);
+    llama.params.frequency_penalty = json_value(body, "frequency_penalty", default_params.frequency_penalty);
+    llama.params.mirostat = json_value(body, "mirostat", default_params.mirostat);
+    llama.params.mirostat_tau = json_value(body, "mirostat_tau", default_params.mirostat_tau);
+    llama.params.mirostat_eta = json_value(body, "mirostat_eta", default_params.mirostat_eta);
+    llama.params.penalize_nl = json_value(body, "penalize_nl", default_params.penalize_nl);
+    llama.params.n_keep = json_value(body, "n_keep", default_params.n_keep);
+    llama.params.seed = json_value(body, "seed", default_params.seed);
+    llama.params.grammar = json_value(body, "grammar", default_params.grammar);
+    llama.params.n_probs = json_value(body, "n_probs", default_params.n_probs);
+
+    if (body.count("prompt") != 0)
+    {
+        llama.prompt = body["prompt"];
+    }
+    else
+    {
+        llama.prompt = "";
+    }

    llama.params.logit_bias.clear();
-    if (body.value("ignore_eos", false))
+    if (json_value(body, "ignore_eos", false))
    {
-        llama.params.logit_bias[llama_token_eos()] = -INFINITY;
+        llama.params.logit_bias[llama_token_eos(llama.ctx)] = -INFINITY;
    }

    const auto &logit_bias = body.find("logit_bias");
@@ -1169,6 +1264,12 @@ int main(int argc, char **argv)
        res.set_content(reinterpret_cast<const char*>(&completion_js), completion_js_len, "application/javascript");
        return false; });

+    // this is only called if no index.html is found in the public --path
+    svr.Get("/json-schema-to-grammar.mjs", [](const Request &, Response &res)
+            {
+        res.set_content(reinterpret_cast<const char*>(&json_schema_to_grammar_mjs), json_schema_to_grammar_mjs_len, "application/javascript");
+        return false; });
+
    svr.Post("/completion", [&llama](const Request &req, Response &res)
             {
        auto lock = llama.lock();
@@ -1179,6 +1280,12 @@ int main(int argc, char **argv)

        parse_options_completion(json::parse(req.body), llama);

+        if (!llama.loadGrammar())
+        {
+            res.status = 400;
+            return;
+        }
+
        llama.loadPrompt();
        llama.beginCompletion();

@@ -1294,8 +1401,11 @@ int main(int argc, char **argv)
        auto lock = llama.lock();

        const json body = json::parse(req.body);
-        const std::string content = body.value("content", "");
-        const std::vector<llama_token> tokens = llama_tokenize(llama.ctx, content, false);
+        std::vector<llama_token> tokens;
+        if (body.count("content") != 0)
+        {
+            tokens = llama.tokenize(body["content"], false);
+        }
        const json data = format_tokenizer_response(tokens);
        return res.set_content(data.dump(), "application/json"); });

@@ -1307,7 +1417,14 @@ int main(int argc, char **argv)

        llama.rewind();
        llama_reset_timings(llama.ctx);
-        llama.params.prompt = body.value("content", "");
+        if (body.count("content") != 0)
+        {
+            llama.prompt = body["content"];
+        }
+        else
+        {
+            llama.prompt = "";
+        }
        llama.params.n_predict = 0;
        llama.loadPrompt();
        llama.beginCompletion();
@@ -1334,8 +1451,12 @@ int main(int argc, char **argv)

    svr.set_error_handler([](const Request &, Response &res)
                          {
-        res.set_content("File Not Found", "text/plain");
-        res.status = 404; });
+        if (res.status == 400) {
+            res.set_content("Invalid request", "text/plain");
+        } else if (res.status != 500) {
+            res.set_content("File Not Found", "text/plain");
+            res.status = 404;
+        } });

    // set timeouts and change hostname and port
    svr.set_read_timeout(sparams.read_timeout);
@@ -1363,6 +1484,9 @@ int main(int argc, char **argv)
        return 1;
    }

+    if (llama.grammar != nullptr) {
+        llama_grammar_free(llama.grammar);
+    }
    llama_backend_free();

    return 0;
--- a/examples/simple/simple.cpp
+++ b/examples/simple/simple.cpp
@@ -2,180 +2,129 @@
 #define _GNU_SOURCE
 #endif

-#include "common.h"
-#include "llama.h"
 #include "build-info.h"

-#include <cassert>
-#include <cinttypes>
+#include "common.h"
+#include "llama.h"
+
 #include <cmath>
 #include <cstdio>
-#include <cstring>
-#include <ctime>
-#include <fstream>
-#include <iostream>
 #include <string>
 #include <vector>

-#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
-#include <signal.h>
-#include <unistd.h>
-#elif defined (_WIN32)
-#define WIN32_LEAN_AND_MEAN
-#define NOMINMAX
-#include <windows.h>
-#include <signal.h>
-#endif
-
-
-
-int main(int argc, char ** argv)
-{
+int main(int argc, char ** argv) {
    gpt_params params;

-    //---------------------------------
-    // Print help :
-    //---------------------------------
-
-    if ( argc == 1 || argv[1][0] == '-' )
-    {
-        printf( "usage: %s MODEL_PATH [PROMPT]\n" , argv[0] );
+    if (argc == 1 || argv[1][0] == '-') {
+        printf("usage: %s MODEL_PATH [PROMPT]\n" , argv[0]);
        return 1 ;
    }

-    //---------------------------------
-    // Load parameters :
-    //---------------------------------
-
-    if ( argc >= 2 )
-    {
+    if (argc >= 2) {
        params.model = argv[1];
    }

-    if ( argc >= 3 )
-    {
+    if (argc >= 3) {
        params.prompt = argv[2];
    }

-    if ( params.prompt.empty() )
-    {
+    if (params.prompt.empty()) {
        params.prompt = "Hello my name is";
    }

-    //---------------------------------
-    // Init LLM :
-    //---------------------------------
+    // init LLM

    llama_backend_init(params.numa);

-    llama_model * model;
-    llama_context * ctx;
+    llama_context_params ctx_params = llama_context_default_params();

-    std::tie(model, ctx) = llama_init_from_gpt_params( params );
+    llama_model * model = llama_load_model_from_file(params.model.c_str(), ctx_params);

-    if ( model == NULL )
-    {
-        fprintf( stderr , "%s: error: unable to load model\n" , __func__ );
+    if (model == NULL) {
+        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
        return 1;
    }

-    //---------------------------------
-    // Tokenize the prompt :
-    //---------------------------------
+    llama_context * ctx = llama_new_context_with_model(model, ctx_params);
+
+    // tokenize the prompt

    std::vector<llama_token> tokens_list;
-    tokens_list = ::llama_tokenize( ctx , params.prompt , true );
+    tokens_list = ::llama_tokenize(ctx, params.prompt, true);

-    const int max_context_size     = llama_n_ctx( ctx );
-    const int max_tokens_list_size = max_context_size - 4 ;
+    const int max_context_size     = llama_n_ctx(ctx);
+    const int max_tokens_list_size = max_context_size - 4;

-    if ( (int)tokens_list.size() > max_tokens_list_size )
-    {
-        fprintf( stderr , "%s: error: prompt too long (%d tokens, max %d)\n" ,
-             __func__ , (int)tokens_list.size() , max_tokens_list_size );
+    if ((int) tokens_list.size() > max_tokens_list_size) {
+        fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) tokens_list.size(), max_tokens_list_size);
        return 1;
    }

-    fprintf( stderr, "\n\n" );
+    fprintf(stderr, "\n\n");

-    // Print the tokens from the prompt :
-
-    for( auto id : tokens_list )
-    {
-        printf( "%s" , llama_token_to_str( ctx , id ) );
+    for (auto id : tokens_list) {
+        fprintf(stderr, "%s", llama_token_to_str(ctx, id).c_str());
    }

-    fflush(stdout);
+    fflush(stderr);

-
-    //---------------------------------
-    // Main prediction loop :
-    //---------------------------------
+    // main loop

    // The LLM keeps a contextual cache memory of previous token evaluation.
    // Usually, once this cache is full, it is required to recompute a compressed context based on previous
    // tokens (see "infinite text generation via context swapping" in the main example), but in this minimalist
    // example, we will just stop the loop once this cache is full or once an end of stream is detected.

-    while ( llama_get_kv_cache_token_count( ctx ) < max_context_size )
-    {
-        //---------------------------------
-        // Evaluate the tokens :
-        //---------------------------------
+    const int n_gen = std::min(32, max_context_size);

-        if ( llama_eval( ctx , tokens_list.data() , int(tokens_list.size()) , llama_get_kv_cache_token_count( ctx ) , params.n_threads ) )
-        {
-            fprintf( stderr,  "%s : failed to eval\n" , __func__ );
+    while (llama_get_kv_cache_token_count(ctx) < n_gen) {
+        // evaluate the transformer
+
+        if (llama_eval(ctx, tokens_list.data(), int(tokens_list.size()), llama_get_kv_cache_token_count(ctx), params.n_threads)) {
+            fprintf(stderr, "%s : failed to eval\n", __func__);
            return 1;
        }

        tokens_list.clear();

-        //---------------------------------
-        // Select the best prediction :
-        //---------------------------------
+        // sample the next token

        llama_token new_token_id = 0;

-        auto logits  = llama_get_logits( ctx );
-        auto n_vocab = llama_n_vocab( ctx ); // the size of the LLM vocabulary (in tokens)
+        auto logits  = llama_get_logits(ctx);
+        auto n_vocab = llama_n_vocab(ctx);

        std::vector<llama_token_data> candidates;
-        candidates.reserve( n_vocab );
+        candidates.reserve(n_vocab);

-        for( llama_token token_id = 0 ; token_id < n_vocab ; token_id++ )
-        {
-            candidates.emplace_back( llama_token_data{ token_id , logits[ token_id ] , 0.0f } );
+        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+            candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
        }

        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };

-        // Select it using the "Greedy sampling" method :
-        new_token_id = llama_sample_token_greedy( ctx , &candidates_p );
-
+        new_token_id = llama_sample_token_greedy(ctx , &candidates_p);

        // is it an end of stream ?
-        if ( new_token_id == llama_token_eos() )
-        {
+        if (new_token_id == llama_token_eos(ctx)) {
            fprintf(stderr, " [end of text]\n");
            break;
        }

-        // Print the new token :
-        printf( "%s" , llama_token_to_str( ctx , new_token_id ) );
-        fflush( stdout );
+        // print the new token :
+        printf("%s", llama_token_to_str(ctx, new_token_id).c_str());
+        fflush(stdout);

-        // Push this new token for next evaluation :
-        tokens_list.push_back( new_token_id );
+        // push this new token for next evaluation
+        tokens_list.push_back(new_token_id);
+    }

-    } // wend of main loop
-
-    llama_free( ctx );
-    llama_free_model( model );
+    llama_free(ctx);
+    llama_free_model(model);

    llama_backend_free();

+    fprintf(stderr, "\n\n");
+
    return 0;
 }
-
-// EOF
--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
@@ -1,4 +1,5 @@
 #include "ggml.h"
+#include "common.h"
 #include "llama.h"
 #include <unordered_map>
 #include <vector>
@@ -16,7 +17,7 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif

-static const float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
+static const float rms_norm_eps = 1e-5f;

 struct random_normal_distribution {
    std::mt19937 gen;
@@ -169,14 +170,16 @@ struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struc
 struct llama_vocab {
    using id    = int32_t;
    using token = std::string;
+    using ttype = llama_token_type;

-    struct token_score {
-        token tok;
+    struct token_data {
+        token text;
        float score;
+        ttype type;
    };

    std::unordered_map<token, id> token_to_id;
-    std::vector<token_score> id_to_token;
+    std::vector<token_data> id_to_token;
 };

 struct my_llama_hparams {
@@ -1865,10 +1868,10 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train(
        t12->grad = expand(gb, ggml_permute(ctx0, t15->grad, 0, 2, 3, 1));                                            assert_shape_4d(t12->grad, N, n_batch, n_embd/n_head, n_head);
        t11->grad = expand(gb, ggml_reshape_2d(ctx0, ggml_cont(ctx0, t12->grad), N*n_batch, n_embd));                 assert_shape_2d(t11->grad, N*n_batch, n_embd);
        t10->grad = expand(gb, ggml_permute(ctx0, t14->grad, 0, 2, 1, 3));                                            assert_shape_4d(t10->grad, n_embd/n_head, n_head, N, n_batch);
-        t09->grad = expand(gb, ggml_rope_back(ctx0, t10->grad, n_past, n_rot, rope_mode, n_ctx));                     assert_shape_4d(t09->grad, n_embd/n_head, n_head, N, n_batch);
+        t09->grad = expand(gb, ggml_rope_back(ctx0, t10->grad, n_past, n_rot, rope_mode, n_ctx, 10000.0f, 1.0f, 0.0f, false));        assert_shape_4d(t09->grad, n_embd/n_head, n_head, N, n_batch);
        t08->grad = expand(gb, ggml_reshape_2d(ctx0, t09->grad, n_embd, N*n_batch));                                  assert_shape_2d(t08->grad, n_embd, N*n_batch);
        t07->grad = expand(gb, ggml_permute(ctx0, t13->grad, 0, 2, 1, 3));                                            assert_shape_4d(t07->grad, n_embd/n_head, n_head, N, n_batch);
-        t06->grad = expand(gb, ggml_rope_back(ctx0, t07->grad, n_past, n_rot, rope_mode, n_ctx));                     assert_shape_4d(t06->grad, n_embd/n_head, n_head, N, n_batch);
+        t06->grad = expand(gb, ggml_rope_back(ctx0, t07->grad, n_past, n_rot, rope_mode, n_ctx, 10000.0f, 1.0f, 0.0f, false));        assert_shape_4d(t06->grad, n_embd/n_head, n_head, N, n_batch);
        t05->grad = expand(gb, ggml_reshape_2d(ctx0, t06->grad, n_embd, N*n_batch));                                  assert_shape_2d(t05->grad, n_embd, N*n_batch);
        t04->grad = expand(gb, ggml_add_inplace(ctx0,
                        ggml_add_inplace(ctx0,
@@ -1961,7 +1964,7 @@ void print_matrix(struct ggml_tensor * probs) {


 void print_token(struct llama_context * ctx, llama_token token) {
-    printf("%s", llama_token_to_str(ctx, token));
+    printf("%s", llama_token_to_str(ctx, token).c_str());
 }

 void print_tokens(struct llama_context* ctx, struct ggml_tensor * tokens) {
@@ -1995,7 +1998,7 @@ void print_tokens_batch(struct llama_context* ctx, struct ggml_tensor * tokens)
    }
 }

-void get_example_targets(const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) {
+void get_example_targets(struct llama_context * lctx, const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) {
    int n_tokens = tokens_input->ne[0];
    int n_vocab  = target_logits->ne[0];

@@ -2004,7 +2007,7 @@ void get_example_targets(const int * train_samples, size_t n_train_samples, cons

    ggml_set_f32(target_logits, -1.0f/n_vocab);
    ggml_set_f32(target_probs, 0.0f);
-    ggml_set_i32_1d(tokens_input, 0, llama_token_bos());
+    ggml_set_i32_1d(tokens_input, 0, llama_token_bos(lctx));
    for (int i=1; i<n_tokens+1; ++i) {
        int token = clamp(train_data[sample+i-1], 0, n_vocab-1);
        set_f32_2d(target_logits, token, i-1, +1.0f);
@@ -2015,7 +2018,7 @@ void get_example_targets(const int * train_samples, size_t n_train_samples, cons
    }
 }

-void get_example_targets_batch(struct llama_context * /*lctx*/, const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) {
+void get_example_targets_batch(struct llama_context * lctx, const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) {
    GGML_ASSERT(tokens_input->n_dims  == 2);
    GGML_ASSERT(target_logits->n_dims == 3);
    GGML_ASSERT(target_probs->n_dims  == 3);
@@ -2035,7 +2038,7 @@ void get_example_targets_batch(struct llama_context * /*lctx*/, const int * trai
        size_t sample = train_samples[(example_id*n_batch + k) % n_train_samples];
        GGML_ASSERT(sample+n_tokens-1 < n_train_data);

-        set_i32_2d(tokens_input, 0, k, llama_token_bos());
+        set_i32_2d(tokens_input, 0, k, llama_token_bos(lctx));
        for (int i=1; i<n_tokens+1; ++i) {
            int token = clamp(train_data[sample+i-1], 0, n_vocab-1);
            // print_token(lctx, token);
@@ -2188,11 +2191,10 @@ int tokenize_file(struct llama_context * lctx, const char * filename, std::vecto
    f.read_raw(buf.data(), f.size);
    buf[f.size] = '\0';

-    out.resize(buf.size());
-
-    int n_tokens = llama_tokenize(lctx, buf.data(), out.data(), buf.size(), false);
-    if (n_tokens >= 0) {
-        out.resize(n_tokens);
+    int n_tokens = llama_tokenize(lctx, buf.data(), out.data(), out.size(), false);
+    if (n_tokens < 0) {
+        out.resize(-n_tokens);
+        llama_tokenize(lctx, buf.data(), out.data(), out.size(), false);
    }

    bool verify = false;
@@ -2200,17 +2202,17 @@ int tokenize_file(struct llama_context * lctx, const char * filename, std::vecto
        const char * in  = buf.data();
        const char * end = buf.data() + buf.size();
        for (int i = 0; i < (int) out.size(); ++i) {
-            const char * s = llama_token_to_str(lctx, out[i]);
-            int len = strlen(s);
+            std::string s = llama_token_to_str(lctx, out[i]);
+            int len = s.length();
            if (in >= end) {
                printf("%s: unexpected end of original text.\n", __func__);
                break;
            }
-            const bool matches = (strncmp(in, s, len) == 0);
+            const bool matches = (strncmp(in, s.c_str(), len) == 0);
            if (matches) {
                in += len;
            } else {
-                printf("%s: mismatch: expected '%s', but got '%s'\n", __func__, std::string(in, len).c_str(), s);
+                printf("%s: mismatch: expected '%s', but got '%s'\n", __func__, std::string(in, len).c_str(), s.c_str());
            }
        }
    }
@@ -2294,7 +2296,7 @@ llama_token sample(struct my_llama_sampler * sampler, float * logits, const llam
    const auto params = sampler->params;

    // Apply penalties
-    const float nl_logit = logits[llama_token_nl()];
+    const float nl_logit = logits[llama_token_nl(ctx)];

    const int n_last = std::min(std::min(n_last_tokens, params.repeat_last_n), sampler->n_ctx);

@@ -2313,7 +2315,7 @@ llama_token sample(struct my_llama_sampler * sampler, float * logits, const llam
        params.alpha_presence);

    if (!params.penalize_nl) {
-        logits[llama_token_nl()] = nl_logit;
+        logits[llama_token_nl(ctx)] = nl_logit;
    }

    llama_token token = 0;
@@ -2612,42 +2614,45 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
        return;
    }

-    // write_magic
-    file.write_u32(LLAMA_FILE_MAGIC);   // magic
-    file.write_u32(LLAMA_FILE_VERSION); // version
-    // write_hparams
-    file.write_u32(model->hparams.n_vocab);
-    file.write_u32(model->hparams.n_embd);
-    file.write_u32(model->hparams.n_mult);
-    file.write_u32(model->hparams.n_head);
-    file.write_u32(model->hparams.n_layer);
-    file.write_u32(model->hparams.n_rot);
-    file.write_u32(LLAMA_FTYPE_ALL_F32);
-    // write_vocab
-    uint32_t n_vocab = model->hparams.n_vocab;
-    for (uint32_t i = 0; i < n_vocab; i++) {
-        const auto & token_score = vocab->id_to_token.at(i);
-        file.write_u32((uint32_t) token_score.tok.size());
-        file.write_raw(token_score.tok.data(), token_score.tok.size());
-        file.write_raw(&token_score.score, sizeof(token_score.score));
-    }
-    // write tensors
-    write_tensor(&file, model->tok_embeddings);
-    write_tensor(&file, model->norm);
-    write_tensor(&file, model->output);
-    for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
-        auto & layer = model->layers[i];
-
-        write_tensor(&file, layer.attention_norm);
-        write_tensor(&file, layer.wq);
-        write_tensor(&file, layer.wk);
-        write_tensor(&file, layer.wv);
-        write_tensor(&file, layer.wo);
-        write_tensor(&file, layer.ffn_norm);
-        write_tensor(&file, layer.w1);
-        write_tensor(&file, layer.w2);
-        write_tensor(&file, layer.w3);
-    }
+#pragma message("TODO: implement file saving using gguf")
+    (void) vocab;
+    (void) model;
+//    // write_magic
+//    file.write_u32(LLAMA_FILE_MAGIC);   // magic
+//    file.write_u32(LLAMA_FILE_VERSION); // version
+//    // write_hparams
+//    file.write_u32(model->hparams.n_vocab);
+//    file.write_u32(model->hparams.n_embd);
+//    file.write_u32(model->hparams.n_mult);
+//    file.write_u32(model->hparams.n_head);
+//    file.write_u32(model->hparams.n_layer);
+//    file.write_u32(model->hparams.n_rot);
+//    file.write_u32(LLAMA_FTYPE_ALL_F32);
+//    // write_vocab
+//    uint32_t n_vocab = model->hparams.n_vocab;
+//    for (uint32_t i = 0; i < n_vocab; i++) {
+//        const auto & token_data = vocab->id_to_token.at(i);
+//        file.write_u32((uint32_t) token_data.tok.size());
+//        file.write_raw(token_data.tok.data(), token_data.tok.size());
+//        file.write_raw(&token_data.score, sizeof(token_data.score));
+//    }
+//    // write tensors
+//    write_tensor(&file, model->tok_embeddings);
+//    write_tensor(&file, model->norm);
+//    write_tensor(&file, model->output);
+//    for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
+//        auto & layer = model->layers[i];
+//
+//        write_tensor(&file, layer.attention_norm);
+//        write_tensor(&file, layer.wq);
+//        write_tensor(&file, layer.wk);
+//        write_tensor(&file, layer.wv);
+//        write_tensor(&file, layer.wo);
+//        write_tensor(&file, layer.ffn_norm);
+//        write_tensor(&file, layer.w1);
+//        write_tensor(&file, layer.w2);
+//        write_tensor(&file, layer.w3);
+//    }
 }

 float cosine_decay(const int decay_steps, const float alpha, int step) {
@@ -3052,20 +3057,13 @@ int main(int argc, char ** argv) {

    struct llama_vocab vocab;
    {
-        std::vector<const char *> strings;
-        std::vector<float> scores;
-        int n_vocab = llama_n_vocab(lctx);
-        strings.resize(n_vocab, NULL);
-        scores.resize(n_vocab, 0);
-        n_vocab = llama_get_vocab(lctx, strings.data(), scores.data(), n_vocab);
-        GGML_ASSERT(n_vocab == llama_n_vocab(lctx));
+        const int n_vocab = llama_n_vocab(lctx);
        vocab.id_to_token.resize(n_vocab);
        for (int i=0; i<n_vocab; ++i) {
-            std::string tok   = std::string(strings[i]);
-            float       score = scores[i];
-            vocab.id_to_token[i].tok   = tok;
-            vocab.id_to_token[i].score = score;
-            vocab.token_to_id.emplace(tok, i);
+            vocab.id_to_token[i].text  = llama_token_get_text(lctx, i);
+            vocab.id_to_token[i].score = llama_token_get_score(lctx, i);
+            vocab.id_to_token[i].type  = llama_token_get_type(lctx, i);
+            vocab.token_to_id.emplace(vocab.id_to_token[i].text, i);
        }
    }

@@ -3178,7 +3176,7 @@ int main(int argc, char ** argv) {
    std::vector<int> train_samples;
    train_samples.push_back(0);
    for (int i = 1; i < (int) train_tokens.size() - n_tokens; ++i) {
-        if (!params.samples_start_after_nl || (train_tokens[i-1] == llama_token_nl())) {
+        if (!params.samples_start_after_nl || (train_tokens[i-1] == llama_token_nl(lctx))) {
            train_samples.push_back(i);
        }
    }
@@ -3338,7 +3336,7 @@ int main(int argc, char ** argv) {
        struct ggml_tensor * target_logits = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, n_vocab,  n_tokens);
        struct ggml_tensor * target_probs  = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, n_vocab,  n_tokens);

-        get_example_targets(train_samples.data(), train_samples.size(), train_tokens.data(), train_tokens.size(), rand()%train_samples.size(), tokens_input, target_logits, target_probs);
+        get_example_targets(lctx, train_samples.data(), train_samples.size(), train_tokens.data(), train_tokens.size(), rand()%train_samples.size(), tokens_input, target_logits, target_probs);
        for (int i=sample_ctx; i<n_tokens; ++i) {
            ggml_set_i32_1d(tokens_input, i, n_vocab/2);
        }
--- a/flake.nix
+++ b/flake.nix
@@ -14,8 +14,6 @@
            with pkgs.darwin.apple_sdk_11_0.frameworks; [
              Accelerate
              MetalKit
-              MetalPerformanceShaders
-              MetalPerformanceShadersGraph
            ]
          else if isAarch32 && isDarwin then
            with pkgs.darwin.apple_sdk.frameworks; [
--- a/ggml-alloc.c
+++ b/ggml-alloc.c
@@ -67,6 +67,8 @@ struct ggml_allocr {
    struct hash_node hash_table[GGML_GRAPH_HASHTABLE_SIZE];
    size_t max_size;
    bool measure;
+    int parse_seq[GGML_MAX_NODES];
+    bool has_parse_seq;

 #ifdef GGML_ALLOCATOR_DEBUG
    struct ggml_tensor * allocated_tensors[1024];
@@ -74,7 +76,7 @@ struct ggml_allocr {
 };

 #ifdef GGML_ALLOCATOR_DEBUG
-static void add_allocated_tensor(struct ggml_allocator * alloc, struct ggml_tensor * tensor) {
+static void add_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
    for (int i = 0; i < 1024; i++) {
        if (alloc->allocated_tensors[i] == NULL) {
            alloc->allocated_tensors[i] = tensor;
@@ -83,7 +85,7 @@ static void add_allocated_tensor(struct ggml_allocator * alloc, struct ggml_tens
    }
    GGML_ASSERT(!"out of allocated_tensors");
 }
-static void remove_allocated_tensor(struct ggml_allocator * alloc, struct ggml_tensor * tensor) {
+static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
    for (int i = 0; i < 1024; i++) {
        if (alloc->allocated_tensors[i] == tensor ||
            (alloc->allocated_tensors[i] != NULL && alloc->allocated_tensors[i]->data == tensor->data)) {
@@ -111,10 +113,10 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)

    size_t max_avail = 0;

-    // find the best fitting free block
+    // find the best fitting free block besides the last block
    int best_fit_block = -1;
    size_t best_fit_size = SIZE_MAX;
-    for (int i = 0; i < alloc->n_free_blocks; i++) {
+    for (int i = 0; i < alloc->n_free_blocks - 1; i++) {
        struct free_block * block = &alloc->free_blocks[i];
        max_avail = MAX(max_avail, block->size);
        if (block->size >= size && block->size <= best_fit_size) {
@@ -126,10 +128,17 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
    AT_PRINTF("block %d\n", best_fit_block);

    if (best_fit_block == -1) {
-        fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
-                __func__, size, max_avail);
-        GGML_ASSERT(!"not enough space in the buffer");
+        // the last block is our last resort
+        struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
+        if (block->size >= size) {
+            best_fit_block = alloc->n_free_blocks - 1;
+            max_avail = MAX(max_avail, block->size);
+        } else {
+            fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
+                    __func__, size, max_avail);
+            GGML_ASSERT(!"not enough space in the buffer");
        return;
+        }
    }
    struct free_block * block = &alloc->free_blocks[best_fit_block];
    void * addr = block->addr;
@@ -229,6 +238,17 @@ static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_t
    alloc->n_free_blocks++;
 }

+void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, int * list, int n) {
+    int pos = 0;
+    for (int i = 0; i < n; i++) {
+        if (list[i] != -1) {
+            alloc->parse_seq[pos] = list[i];
+            pos++;
+        }
+    }
+    alloc->has_parse_seq = true;
+}
+
 void ggml_allocr_reset(struct ggml_allocr * alloc) {
    alloc->n_free_blocks = 1;
    size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment);
@@ -248,6 +268,8 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
        /*.hash_table    = */ {{0}},
        /*.max_size      = */ 0,
        /*.measure       = */ false,
+        /*.parse_seq     = */ {0},
+        /*.has_parse_seq = */ false,
 #ifdef GGML_ALLOCATOR_DEBUG
        /*.allocated_tensors = */ = {0},
 #endif
@@ -275,6 +297,8 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
        /*.hash_table    = */ {{0}},
        /*.max_size      = */ 0,
        /*.measure       = */ true,
+        /*.parse_seq     = */ {0},
+        /*.has_parse_seq = */ false,
 #ifdef GGML_ALLOCATOR_DEBUG
        /*.allocated_tensors = */ = {0},
 #endif
@@ -394,6 +418,14 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
                    if (parent == NULL) {
                        break;
                    }
+
+                    // if the node's data is external, then we cannot re-use it
+                    if ((char *) parent->data < (char *) alloc->data ||
+                        (char *) parent->data >= ((char *) alloc->data + alloc->size)) {
+                        AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data);
+                        continue;
+                    }
+
                    struct hash_node * p_hn = hash_get(ht, parent);
                    if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && ggml_are_same_layout(node, parent)) {
                        if (ggml_is_view(parent)) {
@@ -465,7 +497,13 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
                allocate_node(alloc, input);
            }
        }
-        for (int i = 0; i < gf->n_nodes; i++) {
+        for (int ind = 0; ind < gf->n_nodes; ind++) {
+            int i;
+            if (alloc->has_parse_seq) {
+                i = alloc->parse_seq[ind];
+            } else {
+                i = ind;
+            }
            struct ggml_tensor * node = gf->nodes[i];

            // allocate parents (leafs)
--- a/ggml-alloc.h
+++ b/ggml-alloc.h
@@ -10,6 +10,10 @@ extern "C" {
 GGML_API struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment);
 GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);

+// tell the allocator to parse nodes following the order described in the list
+// you should call this if your graph are optimized to execute out-of-order
+GGML_API void   ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, int * list, int n);
+
 GGML_API void   ggml_allocr_free(struct ggml_allocr * alloc);
 GGML_API bool   ggml_allocr_is_measure(struct ggml_allocr * alloc);
 GGML_API void   ggml_allocr_reset(struct ggml_allocr * alloc);
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
--- a/ggml-cuda.h
+++ b/ggml-cuda.h
@@ -8,29 +8,30 @@ extern "C" {

 #define GGML_CUDA_MAX_DEVICES       16

-void   ggml_init_cublas(void);
-void   ggml_cuda_set_tensor_split(const float * tensor_split);
+GGML_API void   ggml_init_cublas(void);
+GGML_API void * ggml_cuda_host_malloc(size_t size);
+GGML_API void   ggml_cuda_host_free(void * ptr);

-void   ggml_cuda_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
-bool   ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
-size_t ggml_cuda_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
-void   ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
+GGML_API bool   ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
+GGML_API void   ggml_cuda_set_tensor_split(const float * tensor_split);
+GGML_API void   ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor);
+GGML_API void   ggml_cuda_free_data(struct ggml_tensor * tensor);

-// TODO: export these with GGML_API
-void * ggml_cuda_host_malloc(size_t size);
-void   ggml_cuda_host_free(void * ptr);
+GGML_API void   ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
+GGML_API void   ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
+GGML_API void   ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor);

-void   ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor);
+GGML_API void   ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor);
+GGML_API void   ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset);

-void   ggml_cuda_free_data(struct ggml_tensor * tensor);
-void   ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
-void   ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
-void   ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor);
-void   ggml_cuda_set_main_device(int main_device);
-void   ggml_cuda_set_mul_mat_q(bool mul_mat_q);
-void   ggml_cuda_set_scratch_size(size_t scratch_size);
-void   ggml_cuda_free_scratch(void);
-bool   ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
+GGML_API void   ggml_cuda_set_main_device(int main_device);
+GGML_API void   ggml_cuda_set_mul_mat_q(bool mul_mat_q);
+GGML_API void   ggml_cuda_set_scratch_size(size_t scratch_size);
+GGML_API void   ggml_cuda_free_scratch(void);
+GGML_API bool   ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
+
+GGML_API int    ggml_cuda_get_device_count(void);
+GGML_API void   ggml_cuda_get_device_description(int device, char * description, size_t description_size);

 #ifdef  __cplusplus
 }
--- a/ggml-metal.h
+++ b/ggml-metal.h
@@ -38,6 +38,9 @@ struct ggml_metal_context;
 struct ggml_metal_context * ggml_metal_init(int n_cb);
 void ggml_metal_free(struct ggml_metal_context * ctx);

+void * ggml_metal_host_malloc(size_t n);
+void   ggml_metal_host_free  (void * data);
+
 // set the number of command buffers to use
 void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb);

@@ -63,10 +66,13 @@ void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor *

 // try to find operations that can be run concurrently in the graph
 // you should run it again if the topology of your graph changes
-void ggml_metal_graph_find_concurrency(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
+void ggml_metal_graph_find_concurrency(struct ggml_metal_context * ctx, struct ggml_cgraph * gf, bool check_mem);

-// if the graph has been optimized for concurrently dispatch
-bool ggml_metal_if_optimized(struct ggml_metal_context * ctx);
+// if the graph has been optimized for concurrently dispatch, return length of the concur_list if optimized
+int ggml_metal_if_optimized(struct ggml_metal_context * ctx);
+
+// output the concur_list for ggml_alloc
+int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx);

 // same as ggml_graph_compute but uses Metal
 // creates gf->n_threads command buffers in parallel
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -5,7 +5,6 @@
 #import <Foundation/Foundation.h>

 #import <Metal/Metal.h>
-#import <MetalPerformanceShaders/MetalPerformanceShaders.h>

 #undef MIN
 #undef MAX
@@ -79,6 +78,14 @@ struct ggml_metal_context {
    GGML_METAL_DECL_KERNEL(mul_mat_q4_K_f32);
    GGML_METAL_DECL_KERNEL(mul_mat_q5_K_f32);
    GGML_METAL_DECL_KERNEL(mul_mat_q6_K_f32);
+    GGML_METAL_DECL_KERNEL(mul_mm_f16_f32);
+    GGML_METAL_DECL_KERNEL(mul_mm_q4_0_f32);
+    GGML_METAL_DECL_KERNEL(mul_mm_q4_1_f32);
+    GGML_METAL_DECL_KERNEL(mul_mm_q2_K_f32);
+    GGML_METAL_DECL_KERNEL(mul_mm_q3_K_f32);
+    GGML_METAL_DECL_KERNEL(mul_mm_q4_K_f32);
+    GGML_METAL_DECL_KERNEL(mul_mm_q5_K_f32);
+    GGML_METAL_DECL_KERNEL(mul_mm_q6_K_f32);
    GGML_METAL_DECL_KERNEL(rope);
    GGML_METAL_DECL_KERNEL(alibi_f32);
    GGML_METAL_DECL_KERNEL(cpy_f32_f16);
@@ -110,13 +117,6 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
    ctx->n_buffers = 0;
    ctx->concur_list_len = 0;

-    // determine if we can use MPS
-    if (MPSSupportsMTLDevice(ctx->device)) {
-        fprintf(stderr, "%s: using MPS\n", __func__);
-    } else {
-        fprintf(stderr, "%s: not using MPS\n", __func__);
-        GGML_ASSERT(false && "MPS not supported");
-    }

 #if 0
    // compile from source string and show compile log
@@ -126,7 +126,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
        ctx->library = [ctx->device newLibraryWithSource:msl_library_source options:nil error:&error];
        if (error) {
            fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]);
-            exit(1);
+            return NULL;
        }
    }
 #else
@@ -144,7 +144,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
        NSString * src  = [NSString stringWithContentsOfFile:path encoding:NSUTF8StringEncoding error:&error];
        if (error) {
            fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]);
-            exit(1);
+            return NULL;
        }

 #ifdef GGML_QKK_64
@@ -156,17 +156,22 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
 #endif
        if (error) {
            fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]);
-            exit(1);
+            return NULL;
        }
    }
 #endif

    // load kernels
    {
+        NSError * error = nil;
 #define GGML_METAL_ADD_KERNEL(name) \
        ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \
-        ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:nil]; \
-        fprintf(stderr, "%s: loaded %-32s %16p\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name);
+        ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \
+        fprintf(stderr, "%s: loaded %-32s %16p\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name); \
+        if (error) { \
+            fprintf(stderr, "%s: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
+            return NULL; \
+        }

        GGML_METAL_ADD_KERNEL(add);
        GGML_METAL_ADD_KERNEL(add_row);
@@ -196,6 +201,14 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
        GGML_METAL_ADD_KERNEL(mul_mat_q4_K_f32);
        GGML_METAL_ADD_KERNEL(mul_mat_q5_K_f32);
        GGML_METAL_ADD_KERNEL(mul_mat_q6_K_f32);
+        GGML_METAL_ADD_KERNEL(mul_mm_f16_f32);
+        GGML_METAL_ADD_KERNEL(mul_mm_q4_0_f32);
+        GGML_METAL_ADD_KERNEL(mul_mm_q4_1_f32);
+        GGML_METAL_ADD_KERNEL(mul_mm_q2_K_f32);
+        GGML_METAL_ADD_KERNEL(mul_mm_q3_K_f32);
+        GGML_METAL_ADD_KERNEL(mul_mm_q4_K_f32);
+        GGML_METAL_ADD_KERNEL(mul_mm_q5_K_f32);
+        GGML_METAL_ADD_KERNEL(mul_mm_q6_K_f32);
        GGML_METAL_ADD_KERNEL(rope);
        GGML_METAL_ADD_KERNEL(alibi_f32);
        GGML_METAL_ADD_KERNEL(cpy_f32_f16);
@@ -224,15 +237,31 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
    free(ctx);
 }

+void * ggml_metal_host_malloc(size_t n) {
+    void * data = NULL;
+    const int result = posix_memalign((void **) &data, getpagesize(), n);
+    if (result != 0) {
+        fprintf(stderr, "%s: error: posix_memalign failed\n", __func__);
+        return NULL;
+    }
+
+    return data;
+}
+
+void ggml_metal_host_free(void * data) {
+    free(data);
+}
+
 void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb) {
    ctx->n_cb = n_cb;
 }

-bool ggml_metal_if_optimized(struct ggml_metal_context * ctx) {
-    if (ctx->concur_list_len) {
-        return true;
-    }
-    return false;
+int ggml_metal_if_optimized(struct ggml_metal_context * ctx) {
+    return ctx->concur_list_len;
+}
+
+int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx) {
+    return ctx->concur_list;
 }

 // finds the Metal buffer that contains the tensor data on the GPU device
@@ -375,7 +404,7 @@ void ggml_metal_get_tensor(

 void ggml_metal_graph_find_concurrency(
        struct ggml_metal_context * ctx,
-        struct ggml_cgraph * gf) {
+        struct ggml_cgraph * gf, bool check_mem) {
    int search_depth = gf->n_nodes; //we only find concurrency in this range to avoid wasting too much time
    int nodes_unused[GGML_MAX_CONCUR];

@@ -422,7 +451,7 @@ void ggml_metal_graph_find_concurrency(
                        }
                    }
                }
-                if (exe_flag) {
+                if (exe_flag && check_mem) {
                    // check if nodes[i]'s data will be overwritten by a node before nodes[i].
                    // if node[5] and node[3] write to the same memory region, then we can't issue node[5] before node[3]
                    int64_t data_start = (int64_t) gf->nodes[i]->data;
@@ -506,7 +535,7 @@ void ggml_metal_graph_compute(

            id<MTLCommandBuffer> command_buffer = command_buffers[cb_idx];

-            id<MTLComputeCommandEncoder> encoder = nil;
+            id<MTLComputeCommandEncoder> encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];

            const int node_start =                                  (cb_idx + 0) * n_nodes_per_cb;
            const int node_end   = (cb_idx == n_cb - 1) ? n_nodes : (cb_idx + 1) * n_nodes_per_cb;
@@ -515,10 +544,6 @@ void ggml_metal_graph_compute(
                const int i = has_concur ? ctx->concur_list[ind] : ind;

                if (i == -1) {
-                    if (encoder == nil) {
-                        encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
-                        continue;
-                    }
                    [encoder memoryBarrierWithScope:MTLBarrierScopeBuffers];
                    continue;
                }
@@ -592,10 +617,6 @@ void ggml_metal_graph_compute(
                        } break;
                    case GGML_OP_ADD:
                        {
-                            if (encoder == nil) {
-                                encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
-                            }
-
                            if (ggml_nelements(src1) == ne10) {
                                // src1 is a row
                                [encoder setComputePipelineState:ctx->pipeline_add_row];
@@ -613,10 +634,6 @@ void ggml_metal_graph_compute(
                        } break;
                    case GGML_OP_MUL:
                        {
-                            if (encoder == nil) {
-                                encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
-                            }
-
                            if (ggml_nelements(src1) == ne10) {
                                // src1 is a row
                                [encoder setComputePipelineState:ctx->pipeline_mul_row];
@@ -634,10 +651,6 @@ void ggml_metal_graph_compute(
                        } break;
                    case GGML_OP_SCALE:
                        {
-                            if (encoder == nil) {
-                                encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
-                            }
-
                            const float scale = *(const float *) src1->data;

                            [encoder setComputePipelineState:ctx->pipeline_scale];
@@ -653,10 +666,6 @@ void ggml_metal_graph_compute(
                        switch (ggml_get_unary_op(gf->nodes[i])) {
                            case GGML_UNARY_OP_SILU:
                                {
-                                    if (encoder == nil) {
-                                        encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
-                                    }
-
                                    [encoder setComputePipelineState:ctx->pipeline_silu];
                                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
@@ -667,10 +676,6 @@ void ggml_metal_graph_compute(
                                } break;
                            case GGML_UNARY_OP_RELU:
                                {
-                                    if (encoder == nil) {
-                                        encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
-                                    }
-
                                    [encoder setComputePipelineState:ctx->pipeline_relu];
                                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
@@ -681,10 +686,6 @@ void ggml_metal_graph_compute(
                                } break;
                            case GGML_UNARY_OP_GELU:
                                {
-                                    if (encoder == nil) {
-                                        encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
-                                    }
-
                                    [encoder setComputePipelineState:ctx->pipeline_gelu];
                                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
@@ -701,10 +702,6 @@ void ggml_metal_graph_compute(
                        } break;
                    case GGML_OP_SOFT_MAX:
                        {
-                            if (encoder == nil) {
-                                encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
-                            }
-
                            const int nth = 32;

                            [encoder setComputePipelineState:ctx->pipeline_soft_max];
@@ -719,10 +716,6 @@ void ggml_metal_graph_compute(
                        } break;
                    case GGML_OP_DIAG_MASK_INF:
                        {
-                            if (encoder == nil) {
-                                encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
-                            }
-
                            const int n_past = ((int32_t *)(dst->op_params))[0];

                            [encoder setComputePipelineState:ctx->pipeline_diag_mask_inf];
@@ -740,53 +733,43 @@ void ggml_metal_graph_compute(

                            GGML_ASSERT(ne00 == ne10);
                            // GGML_ASSERT(ne02 == ne12); // Should be checked on individual data types until broadcast is implemented everywhere
+                            uint gqa = ne12/ne02;
                            GGML_ASSERT(ne03 == ne13);

+                            // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
+                            // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
                            if (ggml_is_contiguous(src0) &&
                                ggml_is_contiguous(src1) &&
-                                (src0t == GGML_TYPE_F32 || src0t == GGML_TYPE_F16) && ne11 > 1) {
-
-                                if (encoder != nil) {
-                                    [encoder endEncoding];
-                                    encoder = nil;
+                                src1t == GGML_TYPE_F32 &&
+                                [ctx->device supportsFamily:MTLGPUFamilyApple7] &&
+                                ne00%32 == 0 &&
+                                ne11 > 1) {
+                                    switch (src0->type) {
+                                        case GGML_TYPE_F16:  [encoder setComputePipelineState:ctx->pipeline_mul_mm_f16_f32]; break;
+                                        case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_0_f32]; break;
+                                        case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_1_f32]; break;
+                                        case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q2_K_f32]; break;
+                                        case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q3_K_f32]; break;
+                                        case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_K_f32]; break;
+                                        case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q5_K_f32]; break;
+                                        case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q6_K_f32]; break;
+                                        default: GGML_ASSERT(false && "MUL MAT-MAT not implemented");
+                                    }
+                                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                                    [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
+                                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
+                                    [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
+                                    [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
+                                    [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:5];
+                                    [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:6];
+                                    [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:7];
+                                    [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:8];
+                                    [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:9];
+                                    [encoder setBytes:&gqa length:sizeof(gqa) atIndex:10];
+                                    [encoder setThreadgroupMemoryLength:8192 atIndex:0];
+                                    [encoder dispatchThreadgroups:MTLSizeMake( (ne11+31)/32, (ne01+63) / 64, ne12) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
                                }
-
-                                MPSDataType src0dt = src0t == GGML_TYPE_F32 ? MPSDataTypeFloat32 : MPSDataTypeFloat16;
-                                MPSDataType src1dt = src1t == GGML_TYPE_F32 ? MPSDataTypeFloat32 : MPSDataTypeFloat16;
-
-                                // for F32 x F32 we use MPS
-                                MPSMatrixDescriptor * desc0 = [MPSMatrixDescriptor
-                                    matrixDescriptorWithRows:ne01 columns:ne00 rowBytes:src0->nb[1] dataType:src0dt];
-
-                                MPSMatrixDescriptor * desc1 = [MPSMatrixDescriptor
-                                    matrixDescriptorWithRows:ne11 columns:ne10 rowBytes:src1->nb[1] dataType:src1dt];
-
-                                MPSMatrixDescriptor * desc  = [MPSMatrixDescriptor
-                                    matrixDescriptorWithRows:ne1 columns:ne0 rowBytes:dst->nb[1] dataType:MPSDataTypeFloat32];
-
-                                MPSMatrixMultiplication * mul = [[MPSMatrixMultiplication alloc]
-                                    initWithDevice:ctx->device transposeLeft:false transposeRight:true
-                                        resultRows:ne11 resultColumns:ne01 interiorColumns:ne00 alpha:1.0 beta:0.0];
-
-                                // we need to do ne12 multiplications
-                                // TODO: is there a way to do this in parallel - currently very slow ..
-                                // TODO: might be possible to offload part of the computation to ANE using Accelerate's CBLAS
-                                for (int64_t i02 = 0; i02 < ne12; ++i02) {
-                                    size_t offs_src0_cur = offs_src0 + i02/(ne12/ne02)*nb02; // gqa not used for now
-                                    size_t offs_src1_cur = offs_src1 + i02*nb12;
-                                    size_t offs_dst_cur  = offs_dst  + i02*nb2;
-
-                                    MPSMatrix * mat_src0 = [[MPSMatrix alloc] initWithBuffer:id_src0 offset:offs_src0_cur descriptor:desc0];
-                                    MPSMatrix * mat_src1 = [[MPSMatrix alloc] initWithBuffer:id_src1 offset:offs_src1_cur descriptor:desc1];
-                                    MPSMatrix * mat_dst  = [[MPSMatrix alloc] initWithBuffer:id_dst  offset:offs_dst_cur  descriptor:desc ];
-
-                                    [mul encodeToCommandBuffer:command_buffer leftMatrix:mat_src1 rightMatrix:mat_src0 resultMatrix:mat_dst];
-                                }
-                            } else {
-                                if (encoder == nil) {
-                                    encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
-                                }
-
+                            else {
                                int nth0 = 32;
                                int nth1 = 1;

@@ -885,23 +868,24 @@ void ggml_metal_graph_compute(
                                [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:14];
                                [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:15];
                                [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:16];
+                                [encoder setBytes:&gqa length:sizeof(gqa) atIndex:17];

                                if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 ||
                                    src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q4_K) {
-                                    [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7) / 8, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                                    [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7) / 8, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                                }
                                else if (src0t == GGML_TYPE_Q3_K) {
 #ifdef GGML_QKK_64
-                                    [encoder dispatchThreadgroups:MTLSizeMake((ne01+1)/2, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                                    [encoder dispatchThreadgroups:MTLSizeMake((ne01+1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
 #else
-                                    [encoder dispatchThreadgroups:MTLSizeMake((ne01+3)/4, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                                    [encoder dispatchThreadgroups:MTLSizeMake((ne01+3)/4, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
 #endif
                                }
                                else if (src0t == GGML_TYPE_Q5_K) {
-                                    [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3) / 4, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                                    [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3) / 4, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                                }
                                else if (src0t == GGML_TYPE_Q6_K) {
-                                    [encoder dispatchThreadgroups:MTLSizeMake((ne01+1)/2, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                                    [encoder dispatchThreadgroups:MTLSizeMake((ne01+1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
                                } else {
                                    [encoder setThreadgroupMemoryLength:nth0*sizeof(float) atIndex:0];
                                    [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
@@ -910,10 +894,6 @@ void ggml_metal_graph_compute(
                        } break;
                    case GGML_OP_GET_ROWS:
                        {
-                            if (encoder == nil) {
-                                encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
-                            }
-
                            switch (src0->type) {
                                case GGML_TYPE_F16:  [encoder setComputePipelineState:ctx->pipeline_get_rows_f16]; break;
                                case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_0]; break;
@@ -939,10 +919,6 @@ void ggml_metal_graph_compute(
                        } break;
                    case GGML_OP_RMS_NORM:
                        {
-                            if (encoder == nil) {
-                                encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
-                            }
-
                            float eps;
                            memcpy(&eps, dst->op_params, sizeof(float));

@@ -962,10 +938,6 @@ void ggml_metal_graph_compute(
                        } break;
                    case GGML_OP_NORM:
                        {
-                            if (encoder == nil) {
-                                encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
-                            }
-
                            const float eps = 1e-5f;

                            const int nth = 256;
@@ -984,10 +956,6 @@ void ggml_metal_graph_compute(
                        } break;
                    case GGML_OP_ALIBI:
                        {
-                            if (encoder == nil) {
-                                encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
-                            }
-
                            GGML_ASSERT((src0t == GGML_TYPE_F32));

                            const int n_past = ((int32_t *) dst->op_params)[0]; UNUSED(n_past);
@@ -1027,10 +995,6 @@ void ggml_metal_graph_compute(
                        } break;
                    case GGML_OP_ROPE:
                        {
-                            if (encoder == nil) {
-                                encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
-                            }
-
                            const int n_past = ((int32_t *) dst->op_params)[0];
                            const int n_dims = ((int32_t *) dst->op_params)[1];
                            const int mode   = ((int32_t *) dst->op_params)[2];
@@ -1071,10 +1035,6 @@ void ggml_metal_graph_compute(
                    case GGML_OP_CPY:
                    case GGML_OP_CONT:
                        {
-                            if (encoder == nil) {
-                                encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
-                            }
-
                            const int nth = 32;

                            switch (src0t) {
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
--- a/ggml-vulkan.h
+++ b/ggml-vulkan.h
@@ -1,64 +0,0 @@
-/**
- * Copyright (c) 2023 Nomic, Inc. All rights reserved.
- *
- * This software is licensed under the terms of the Software for Open Models License (SOM),
- * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
- * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
- */
-
-#pragma once
-
-#include <cstddef>
-#include <vector>
-#include <string>
-
-struct ggml_kompute_context;
-
-namespace vk {
-    class DeviceMemory;
-    class Buffer;
-};
-
-struct ggml_vk_memory {
-    void *data = nullptr;
-    size_t size = 0;
-    vk::DeviceMemory *primaryMemory = nullptr;
-    vk::Buffer *primaryBuffer = nullptr;
-    vk::DeviceMemory *stagingMemory = nullptr;
-    vk::Buffer *stagingBuffer = nullptr;
-};
-
-struct ggml_vk_device {
-    int index = 0;
-    int type = 0;           // same as VkPhysicalDeviceType
-    size_t heapSize = 0;
-    std::string name;
-    std::string vendor;
-};
-
-std::vector<ggml_vk_device> ggml_vk_available_devices(size_t memoryRequired);
-bool ggml_vk_init_device(size_t memoryRequired, const std::string &device);
-bool ggml_vk_init_device(const ggml_vk_device &device);
-bool ggml_vk_init_device(int device);
-bool ggml_vk_free_device();
-bool ggml_vk_has_vulkan();
-bool ggml_vk_has_device();
-bool ggml_vk_using_vulkan();
-ggml_vk_device ggml_vk_current_device();
-struct ggml_kompute_context * ggml_vk_init(void);
-bool ggml_vk_has_h2d_all(struct ggml_kompute_context * ctx);
-void ggml_vk_free(struct ggml_kompute_context * ctx);
-size_t ggml_vk_aligned_offset(size_t offset);
-ggml_vk_memory ggml_vk_allocate(size_t size);
-void ggml_vk_free_memory(ggml_vk_memory &memory);
-
-void ggml_vk_add_buffer(
-    struct ggml_kompute_context * ctx,
-    const char * name,
-    const ggml_vk_memory &memory);
-
-void ggml_vk_h2d_all(struct ggml_kompute_context * ctx);
-void ggml_vk_d2h_all(struct ggml_kompute_context * ctx);
-void ggml_vk_h2d_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t);
-void ggml_vk_d2h_tensor(struct ggml_kompute_context * ctx, struct ggml_tensor * t);
-void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph * gf);
--- a/ggml.c
+++ b/ggml.c
--- a/ggml.h
+++ b/ggml.h
@@ -207,7 +207,7 @@
 #define GGML_MAX_PARAMS        256
 #define GGML_MAX_CONTEXTS      64
 #define GGML_MAX_SRC           6
-#define GGML_MAX_NAME          48
+#define GGML_MAX_NAME          64
 #define GGML_MAX_OP_PARAMS     32
 #define GGML_DEFAULT_N_THREADS 4

@@ -215,6 +215,11 @@
 #define GGML_EXIT_SUCCESS 0
 #define GGML_EXIT_ABORTED 1

+#define GGUF_MAGIC   0x46554747 // "GGUF"
+#define GGUF_VERSION 1
+
+#define GGUF_DEFAULT_ALIGNMENT 32
+
 #define GGML_UNUSED(x) (void)(x)

 #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
@@ -255,8 +260,9 @@
 extern "C" {
 #endif

-#ifdef __ARM_NEON
-    // we use the built-in 16-bit float type
+#if defined(__ARM_NEON) && defined(__CUDACC__)
+    typedef half ggml_fp16_t;
+#elif defined(__ARM_NEON)
    typedef __fp16 ggml_fp16_t;
 #else
    typedef uint16_t ggml_fp16_t;
@@ -340,10 +346,12 @@ extern "C" {
        GGML_OP_ARGMAX,
        GGML_OP_REPEAT,
        GGML_OP_REPEAT_BACK,
+        GGML_OP_CONCAT,
        GGML_OP_SILU_BACK,
        GGML_OP_NORM, // normalize
        GGML_OP_RMS_NORM,
        GGML_OP_RMS_NORM_BACK,
+        GGML_OP_GROUP_NORM,

        GGML_OP_MUL_MAT,
        GGML_OP_OUT_PROD,
@@ -369,14 +377,19 @@ extern "C" {
        GGML_OP_CLAMP,
        GGML_OP_CONV_1D,
        GGML_OP_CONV_2D,
+        GGML_OP_CONV_TRANSPOSE_2D,
        GGML_OP_POOL_1D,
        GGML_OP_POOL_2D,

+        GGML_OP_UPSCALE, // nearest interpolate
+
        GGML_OP_FLASH_ATTN,
        GGML_OP_FLASH_FF,
        GGML_OP_FLASH_ATTN_BACK,
        GGML_OP_WIN_PART,
        GGML_OP_WIN_UNPART,
+        GGML_OP_GET_REL_POS,
+        GGML_OP_ADD_REL_POS,

        GGML_OP_UNARY,

@@ -562,6 +575,7 @@ extern "C" {
    GGML_API int64_t ggml_nelements   (const struct ggml_tensor * tensor);
    GGML_API int64_t ggml_nrows       (const struct ggml_tensor * tensor);
    GGML_API size_t  ggml_nbytes      (const struct ggml_tensor * tensor);
+    GGML_API size_t  ggml_nbytes_pad  (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
    GGML_API size_t  ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split);

    GGML_API int     ggml_blck_size (enum ggml_type type);
@@ -799,6 +813,13 @@ extern "C" {
            struct ggml_tensor  * a,
            struct ggml_tensor  * b);

+    // concat a and b on dim 2
+    // used in stable-diffusion
+    GGML_API struct ggml_tensor * ggml_concat(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
    GGML_API struct ggml_tensor * ggml_abs(
            struct ggml_context * ctx,
            struct ggml_tensor  * a);
@@ -907,6 +928,19 @@ extern "C" {
            struct ggml_tensor  * a,
            float                 eps);

+    // group normalize along ne0*ne1*n_groups
+    // used in stable-diffusion
+    // TODO: eps is hardcoded to 1e-6 for now
+    GGML_API struct ggml_tensor * ggml_group_norm(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_groups);
+
+    GGML_API struct ggml_tensor * ggml_group_norm_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_groups);
+
    // a - x
    // b - dy
    // TODO: update with configurable eps
@@ -1207,6 +1241,15 @@ extern "C" {
            float                 freq_base,
            float                 freq_scale);

+    // xPos RoPE, in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_rope_xpos_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_past,
+            int                   n_dims,
+            float                 base,
+            bool                  down);
+
    // rotary position embedding backward, i.e compute dx from dy
    // a - dy
    GGML_API struct ggml_tensor * ggml_rope_back(
@@ -1215,7 +1258,11 @@ extern "C" {
            int                   n_past,
            int                   n_dims,
            int                   mode,
-            int                   n_ctx);
+            int                   n_ctx,
+            float                 freq_base,
+            float                 freq_scale,
+            float                 xpos_base,
+            bool                  xpos_down);

    // alibi position embedding
    // in-place, returns view(a)
@@ -1242,6 +1289,15 @@ extern "C" {
            int                   p0,  // padding
            int                   d0); // dilation

+    // conv_1d with padding = half
+    // alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d)
+    GGML_API struct ggml_tensor* ggml_conv_1d_ph(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            int                   s,
+            int                   d);
+
    GGML_API struct ggml_tensor * ggml_conv_2d(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
@@ -1253,14 +1309,38 @@ extern "C" {
            int                   d0,
            int                   d1);

-    // conv_1d with padding = half
-    // alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d)
-    GGML_API struct ggml_tensor * ggml_conv_1d_ph(
+
+    // kernel size is a->ne[0] x a->ne[1]
+    // stride is equal to kernel size
+    // padding is zero
+    // example:
+    // a:     16   16    3  768
+    // b:   1024 1024    3    1
+    // res:   64   64  768    1
+    // used in sam
+    GGML_API struct ggml_tensor * ggml_conv_2d_sk_p0(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
+    // kernel size is a->ne[0] x a->ne[1]
+    // stride is 1
+    // padding is half
+    // example:
+    // a:      3    3    256  256
+    // b:     64   64    256    1
+    // res:   64   64    256    1
+    // used in sam
+    GGML_API struct ggml_tensor * ggml_conv_2d_s1_ph(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
+    GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            struct ggml_tensor  * b,
-            int                   s,
-            int                   d);
+            int                   stride);

    enum ggml_op_pool {
        GGML_OP_POOL_MAX,
@@ -1287,6 +1367,13 @@ extern "C" {
            int                   p0,
            int                   p1);

+    // nearest interpolate
+    // used in stable-diffusion
+    GGML_API struct ggml_tensor * ggml_upscale(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   scale_factor);
+
    GGML_API struct ggml_tensor * ggml_flash_attn(
            struct ggml_context * ctx,
            struct ggml_tensor  * q,
@@ -1340,6 +1427,27 @@ extern "C" {
        struct ggml_tensor  * a,
        enum ggml_unary_op op);

+    // used in sam
+    GGML_API struct ggml_tensor * ggml_get_rel_pos(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   qh,
+            int                   kh);
+
+    // used in sam
+
+    GGML_API struct ggml_tensor * ggml_add_rel_pos(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * pw,
+            struct ggml_tensor  * ph);
+
+    GGML_API struct ggml_tensor * ggml_add_rel_pos_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * pw,
+            struct ggml_tensor  * ph);
+
    // custom operators

    typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *);
@@ -1703,6 +1811,118 @@ extern "C" {

    GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);

+    //
+    // gguf
+    //
+
+    enum gguf_type {
+        GGUF_TYPE_UINT8   = 0,
+        GGUF_TYPE_INT8    = 1,
+        GGUF_TYPE_UINT16  = 2,
+        GGUF_TYPE_INT16   = 3,
+        GGUF_TYPE_UINT32  = 4,
+        GGUF_TYPE_INT32   = 5,
+        GGUF_TYPE_FLOAT32 = 6,
+        GGUF_TYPE_BOOL    = 7,
+        GGUF_TYPE_STRING  = 8,
+        GGUF_TYPE_ARRAY   = 9,
+        GGUF_TYPE_COUNT,       // marks the end of the enum
+    };
+
+    struct gguf_context;
+
+    struct gguf_init_params {
+        bool no_alloc;
+
+        // if not NULL, create a ggml_context and allocate the tensor data in it
+        struct ggml_context ** ctx;
+    };
+
+    GGML_API struct gguf_context * gguf_init_empty(void);
+    GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params);
+    //GGML_API struct gguf_context * gguf_init_from_buffer(..);
+
+    GGML_API void gguf_free(struct gguf_context * ctx);
+
+    GGML_API const char * gguf_type_name(enum gguf_type type);
+
+    GGML_API int    gguf_get_version    (struct gguf_context * ctx);
+    GGML_API size_t gguf_get_alignment  (struct gguf_context * ctx);
+    GGML_API size_t gguf_get_data_offset(struct gguf_context * ctx);
+    GGML_API void * gguf_get_data       (struct gguf_context * ctx);
+
+    GGML_API int          gguf_get_n_kv(struct gguf_context * ctx);
+    GGML_API int          gguf_find_key(struct gguf_context * ctx, const char * key);
+    GGML_API const char * gguf_get_key (struct gguf_context * ctx, int i);
+
+    GGML_API enum gguf_type gguf_get_kv_type (struct gguf_context * ctx, int i);
+    GGML_API enum gguf_type gguf_get_arr_type(struct gguf_context * ctx, int i);
+
+    // results are undefined if the wrong type is used for the key
+    GGML_API uint8_t      gguf_get_val_u8  (struct gguf_context * ctx, int i);
+    GGML_API int8_t       gguf_get_val_i8  (struct gguf_context * ctx, int i);
+    GGML_API uint16_t     gguf_get_val_u16 (struct gguf_context * ctx, int i);
+    GGML_API int16_t      gguf_get_val_i16 (struct gguf_context * ctx, int i);
+    GGML_API uint32_t     gguf_get_val_u32 (struct gguf_context * ctx, int i);
+    GGML_API int32_t      gguf_get_val_i32 (struct gguf_context * ctx, int i);
+    GGML_API float        gguf_get_val_f32 (struct gguf_context * ctx, int i);
+    GGML_API bool         gguf_get_val_bool(struct gguf_context * ctx, int i);
+    GGML_API const char * gguf_get_val_str (struct gguf_context * ctx, int i);
+    GGML_API int          gguf_get_arr_n   (struct gguf_context * ctx, int i);
+    GGML_API const void * gguf_get_arr_data(struct gguf_context * ctx, int i);
+    GGML_API const char * gguf_get_arr_str (struct gguf_context * ctx, int key_id, int i);
+
+    GGML_API int    gguf_get_n_tensors    (struct gguf_context * ctx);
+    GGML_API int    gguf_find_tensor      (struct gguf_context * ctx, const char * name);
+    GGML_API size_t gguf_get_tensor_offset(struct gguf_context * ctx, int i);
+    GGML_API char * gguf_get_tensor_name  (struct gguf_context * ctx, int i);
+
+    // overrides existing values or adds a new one
+    GGML_API void gguf_set_val_u8  (struct gguf_context * ctx, const char * key, uint8_t  val);
+    GGML_API void gguf_set_val_i8  (struct gguf_context * ctx, const char * key, int8_t   val);
+    GGML_API void gguf_set_val_u16 (struct gguf_context * ctx, const char * key, uint16_t val);
+    GGML_API void gguf_set_val_i16 (struct gguf_context * ctx, const char * key, int16_t  val);
+    GGML_API void gguf_set_val_u32 (struct gguf_context * ctx, const char * key, uint32_t val);
+    GGML_API void gguf_set_val_i32 (struct gguf_context * ctx, const char * key, int32_t  val);
+    GGML_API void gguf_set_val_f32 (struct gguf_context * ctx, const char * key, float    val);
+    GGML_API void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool     val);
+    GGML_API void gguf_set_val_str (struct gguf_context * ctx, const char * key, const char * val);
+    GGML_API void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n);
+    GGML_API void gguf_set_arr_str (struct gguf_context * ctx, const char * key, const char ** data, int n);
+
+    // set or add KV pairs from another context
+    GGML_API void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src);
+
+    // manage tensor info
+    GGML_API void gguf_add_tensor(struct gguf_context * ctx, const struct ggml_tensor * tensor);
+    GGML_API void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type);
+    GGML_API void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size);
+
+    // writing gguf files can be done in 2 ways:
+    //
+    // - write the entire gguf_context to a binary file in a single pass:
+    //
+    //   gguf_write_to_file(ctx, fname);
+    //
+    // - first prepare a file with a placeholder for the meta data, write the tensor data, then write the meta data:
+    //
+    //   FILE * f = fopen(fname, "wb");
+    //   fseek(f, gguf_get_meta_size(ctx), SEEK_SET);
+    //   fwrite(f, ...);
+    //   void * data = gguf_meta_get_meta_data(ctx);
+    //   fseek(f, 0, SEEK_SET);
+    //   fwrite(f, data, gguf_get_meta_size(ctx));
+    //   free(data);
+    //   fclose(f);
+    //
+
+    // write the entire context to a binary file
+    GGML_API void gguf_write_to_file(struct gguf_context * ctx, const char * fname, bool only_meta);
+
+    // get the size in bytes of the meta data (header, kv pairs, tensor info) including padding
+    GGML_API size_t gguf_get_meta_size(struct gguf_context * ctx);
+    GGML_API void   gguf_get_meta_data(struct gguf_context * ctx, void * data);
+
    //
    // system info
    //
@@ -1740,6 +1960,10 @@ extern "C" {
    typedef void (*ggml_vec_dot_t)   (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);

    typedef struct {
+        const char      * type_name;
+        int               blck_size;
+        size_t            type_size;
+        bool              is_quantized;
        ggml_to_float_t   to_float;
        ggml_from_float_t from_float;
        ggml_from_float_t from_float_reference;
@@ -1747,7 +1971,7 @@ extern "C" {
        enum ggml_type    vec_dot_type;
    } ggml_type_traits_t;

-    ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type i);
+    ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);

 #ifdef  __cplusplus
 }
--- a/gguf.py
+++ b/gguf.py
@@ -0,0 +1,722 @@
+import shutil
+import sys
+import struct
+import tempfile
+import numpy as np
+
+from enum import IntEnum, auto
+from typing import Any, IO, List, Optional
+
+#
+# constants
+#
+
+GGUF_MAGIC             = 0x46554747
+GGUF_VERSION           = 1
+GGUF_DEFAULT_ALIGNMENT = 32
+
+# general
+KEY_GENERAL_ARCHITECTURE         = "general.architecture"
+KEY_GENERAL_QUANTIZATION_VERSION = "general.quantization_version"
+KEY_GENERAL_ALIGNMENT            = "general.alignment"
+KEY_GENERAL_NAME                 = "general.name"
+KEY_GENERAL_AUTHOR               = "general.author"
+KEY_GENERAL_URL                  = "general.url"
+KEY_GENERAL_DESCRIPTION          = "general.description"
+KEY_GENERAL_LICENSE              = "general.license"
+KEY_GENERAL_SOURCE_URL           = "general.source.url"
+KEY_GENERAL_SOURCE_HF_REPO       = "general.source.hugginface.repository"
+KEY_GENERAL_FILE_TYPE            = "general.file_type"
+
+# LLM
+KEY_LLM_CONTEXT_LENGTH        = "{arch}.context_length"
+KEY_LLM_EMBEDDING_LENGTH      = "{arch}.embedding_length"
+KEY_LLM_BLOCK_COUNT           = "{arch}.block_count"
+KEY_LLM_FEED_FORWARD_LENGTH   = "{arch}.feed_forward_length"
+KEY_LLM_USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual"
+KEY_LLM_TENSOR_DATA_LAYOUT    = "{arch}.tensor_data_layout"
+
+# attention
+KEY_ATTENTION_HEAD_COUNT        = "{arch}.attention.head_count"
+KEY_ATTENTION_HEAD_COUNT_KV     = "{arch}.attention.head_count_kv"
+KEY_ATTENTION_MAX_ALIBI_BIAS    = "{arch}.attention.max_alibi_bias"
+KEY_ATTENTION_CLAMP_KQV         = "{arch}.attention.clamp_kqv"
+KEY_ATTENTION_LAYERNORM_EPS     = "{arch}.attention.layer_norm_epsilon"
+KEY_ATTENTION_LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon"
+
+# RoPE
+KEY_ROPE_DIMENSION_COUNT = "{arch}.rope.dimension_count"
+KEY_ROPE_SCALE_LINEAR    = "{arch}.rope.scale_linear"
+
+# tokenization
+KEY_TOKENIZER_MODEL      = "tokenizer.ggml.model"
+KEY_TOKENIZER_LIST       = "tokenizer.ggml.tokens"
+KEY_TOKENIZER_TOKEN_TYPE = "tokenizer.ggml.token_type"
+KEY_TOKENIZER_SCORES     = "tokenizer.ggml.scores"
+KEY_TOKENIZER_MERGES     = "tokenizer.ggml.merges"
+KEY_TOKENIZER_BOS_ID     = "tokenizer.ggml.bos_token_id"
+KEY_TOKENIZER_EOS_ID     = "tokenizer.ggml.eos_token_id"
+KEY_TOKENIZER_UNK_ID     = "tokenizer.ggml.unknown_token_id"
+KEY_TOKENIZER_SEP_ID     = "tokenizer.ggml.seperator_token_id"
+KEY_TOKENIZER_PAD_ID     = "tokenizer.ggml.padding_token_id"
+KEY_TOKENIZER_HF_JSON    = "tokenizer.huggingface.json"
+KEY_TOKENIZER_RWKV       = "tokenizer.rwkv.world"
+
+
+#
+# recommended mapping of model tensor names for storage in gguf
+#
+
+
+class MODEL_ARCH(IntEnum):
+    LLAMA   = auto()
+    FALCON  = auto()
+    GPT2    = auto()
+    GPTJ    = auto()
+    GPTNEOX = auto()
+    MPT     = auto()
+
+
+class MODEL_TENSOR(IntEnum):
+    TOKEN_EMBD    = auto()
+    POS_EMBD      = auto()
+    OUTPUT        = auto()
+    OUTPUT_NORM   = auto()
+    ROPE_FREQS    = auto()
+    ATTN_Q        = auto()
+    ATTN_K        = auto()
+    ATTN_V        = auto()
+    ATTN_QKV      = auto()
+    ATTN_OUT      = auto()
+    ATTN_NORM     = auto()
+    ATTN_NORM_2   = auto()
+    ATTN_ROT_EMBD = auto()
+    FFN_GATE      = auto()
+    FFN_DOWN      = auto()
+    FFN_UP        = auto()
+    FFN_NORM      = auto()
+
+
+MODEL_ARCH_NAMES = {
+    MODEL_ARCH.LLAMA:   "llama",
+    MODEL_ARCH.FALCON:  "falcon",
+    MODEL_ARCH.GPT2:    "gpt2",
+    MODEL_ARCH.GPTJ:    "gptj",
+    MODEL_ARCH.GPTNEOX: "gptneox",
+    MODEL_ARCH.MPT:     "mpt",
+}
+
+MODEL_TENSOR_NAMES = {
+    MODEL_ARCH.LLAMA: {
+        MODEL_TENSOR.TOKEN_EMBD:    "token_embd",
+        MODEL_TENSOR.OUTPUT_NORM:   "output_norm",
+        MODEL_TENSOR.OUTPUT:        "output",
+        MODEL_TENSOR.ROPE_FREQS:    "rope_freqs",
+        MODEL_TENSOR.ATTN_NORM:     "blk.{bid}.attn_norm",
+        MODEL_TENSOR.ATTN_Q:        "blk.{bid}.attn_q",
+        MODEL_TENSOR.ATTN_K:        "blk.{bid}.attn_k",
+        MODEL_TENSOR.ATTN_V:        "blk.{bid}.attn_v",
+        MODEL_TENSOR.ATTN_OUT:      "blk.{bid}.attn_output",
+        MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
+        MODEL_TENSOR.FFN_NORM:      "blk.{bid}.ffn_norm",
+        MODEL_TENSOR.FFN_GATE:      "blk.{bid}.ffn_gate",
+        MODEL_TENSOR.FFN_DOWN:      "blk.{bid}.ffn_down",
+        MODEL_TENSOR.FFN_UP:        "blk.{bid}.ffn_up",
+    },
+    MODEL_ARCH.GPTNEOX: {
+        MODEL_TENSOR.TOKEN_EMBD:    "token_embd",
+        MODEL_TENSOR.OUTPUT_NORM:   "output_norm",
+        MODEL_TENSOR.OUTPUT:        "output",
+        MODEL_TENSOR.ATTN_NORM:     "blk.{bid}.attn_norm",
+        MODEL_TENSOR.ATTN_QKV:      "blk.{bid}.attn_qkv",
+        MODEL_TENSOR.ATTN_OUT:      "blk.{bid}.attn_output",
+        MODEL_TENSOR.FFN_NORM:      "blk.{bid}.ffn_norm",
+        MODEL_TENSOR.FFN_DOWN:      "blk.{bid}.ffn_down",
+        MODEL_TENSOR.FFN_UP:        "blk.{bid}.ffn_up",
+    },
+    MODEL_ARCH.FALCON: {
+        MODEL_TENSOR.TOKEN_EMBD:  "token_embd",
+        MODEL_TENSOR.OUTPUT_NORM: "output_norm",
+        MODEL_TENSOR.OUTPUT:      "output",
+        MODEL_TENSOR.ATTN_NORM:   "blk.{bid}.attn_norm",
+        MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2",
+        MODEL_TENSOR.ATTN_QKV:    "blk.{bid}.attn_qkv",
+        MODEL_TENSOR.ATTN_OUT:    "blk.{bid}.attn_output",
+        MODEL_TENSOR.FFN_DOWN:    "blk.{bid}.ffn_down",
+        MODEL_TENSOR.FFN_UP:      "blk.{bid}.ffn_up",
+    },
+    MODEL_ARCH.GPT2: {
+        # TODO
+    },
+    # TODO
+}
+
+# tensors that will not be serialized
+MODEL_TENSOR_SKIP = {
+    MODEL_ARCH.LLAMA: [
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+    ],
+}
+
+
+# TODO: the following helper functions should be removed
+#       instead, get_tensor_name_map should return tuples of (name, MODEL_TENSOR)
+#       however, my Python is very bad, and I couldn't figure out how to do this, hence these functions
+# REMOVE
+def should_skip_tensor_TMP(arch: MODEL_ARCH, n_blocks: int, name: str) -> bool:
+    for skip in MODEL_TENSOR_SKIP.get(arch, []):
+        for i in range(n_blocks):
+            if name == MODEL_TENSOR_NAMES[arch][skip].format(bid=i):
+                return True
+
+    return False
+
+
+def get_tensor_name_map(arch: MODEL_ARCH, n_blocks: int) -> dict:
+    tensor_map = {}
+
+    # Token embeddings
+    mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.TOKEN_EMBD, None)
+
+    tensor_map["gpt_neox.embed_in"]           = mapped_to  # gptneox
+    tensor_map["transformer.wte"]             = mapped_to  # gpt2 mpt
+    tensor_map["transformer.word_embeddings"] = mapped_to  # falcon
+    tensor_map["model.embed_tokens"]          = mapped_to  # llama-hf
+    tensor_map["tok_embeddings"]              = mapped_to  # llama-pth
+
+    # Position embeddings
+    mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.POS_EMBD, None)
+
+    tensor_map["transformer.wpe"] = mapped_to  # gpt2
+
+    # Output
+    mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.OUTPUT, None)
+
+    tensor_map["embed_out"] = mapped_to  # gptneox
+    tensor_map["lm_head"]   = mapped_to  # gpt2 mpt falcon llama-hf
+    tensor_map["output"]    = mapped_to  # llama-pth
+
+    # Output norm
+    mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.OUTPUT_NORM, None)
+
+    tensor_map["gpt_neox.final_layer_norm"] = mapped_to  # gptneox
+    tensor_map["transformer.ln_f"]          = mapped_to  # gpt2 falcon
+    tensor_map["transformer.norm_f"]        = mapped_to  # mpt
+    tensor_map["model.norm"]                = mapped_to  # llama-hf
+    tensor_map["norm"]                      = mapped_to  # llama-pth
+
+    # Rope frequencies
+    mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ROPE_FREQS, None)
+
+    tensor_map["rope.freqs"] = mapped_to  # llama-pth
+
+    # Attention and feed-forward blocks
+    for i in range(0, n_blocks):
+        # Attention norm
+        # TODO: is there are simpler way to write these 2 lines in Python?
+        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_NORM, None)
+        mapped_to = mapped_to.format(bid=i) if mapped_to else None
+
+        tensor_map["gpt_neox.layers."+str(i)+".input_layernorm"] = mapped_to  # gptneox
+        tensor_map["transformer.h."+str(i)+".ln_1"]              = mapped_to  # gpt2
+        tensor_map["transformer.blocks."+str(i)+".norm_1"]       = mapped_to  # mpt
+        tensor_map["transformer.h."+str(i)+".input_layernorm"]   = mapped_to  # falcon7b
+        tensor_map["transformer.h."+str(i)+".ln_mlp"]            = mapped_to  # falcon40b
+        tensor_map["model.layers."+str(i)+".input_layernorm"]    = mapped_to  # llama-hf
+        tensor_map["layers."+str(i)+".attention_norm"]           = mapped_to  # llama-pth
+
+        # Attention norm 2
+        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_NORM_2, None)
+        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
+
+        tensor_map["transformer.h."+str(i)+".ln_attn"] = mapped_to  # falcon40b
+
+        # Attention query-key-value
+        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_QKV, None)
+        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
+
+        tensor_map["gpt_neox.layers."+str(i)+".attention.query_key_value"]    = mapped_to  # gptneox
+        tensor_map["transformer.h."+str(i)+".attn.c_attn"]                    = mapped_to  # gpt2
+        tensor_map["transformer.blocks."+str(i)+".attn.Wqkv"]                 = mapped_to  # mpt
+        tensor_map["transformer.h."+str(i)+".self_attention.query_key_value"] = mapped_to  # falcon
+
+        # Attention query
+        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_Q, None)
+        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
+
+        tensor_map["model.layers."+str(i)+".self_attn.q_proj"] = mapped_to  # llama-hf
+        tensor_map["layers."+str(i)+".attention.wq"]           = mapped_to  # llama-pth
+
+        # Attention key
+        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_K, None)
+        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
+
+        tensor_map["model.layers."+str(i)+".self_attn.k_proj"] = mapped_to  # llama-hf
+        tensor_map["layers."+str(i)+".attention.wk"]           = mapped_to  # llama-pth
+
+        # Attention value
+        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_V, None)
+        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
+
+        tensor_map["model.layers."+str(i)+".self_attn.v_proj"] = mapped_to  # llama-hf
+        tensor_map["layers."+str(i)+".attention.wv"]           = mapped_to  # llama-pth
+
+        # Attention output
+        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_OUT, None)
+        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
+
+        tensor_map["gpt_neox.layers."+str(i)+".attention.dense"]    = mapped_to  # gptneox
+        tensor_map["transformer.h."+str(i)+".attn.c_proj"]          = mapped_to  # gpt2
+        tensor_map["transformer.blocks."+str(i)+".attn.out_proj"]   = mapped_to  # mpt
+        tensor_map["transformer.h."+str(i)+".self_attention.dense"] = mapped_to  # falcon
+        tensor_map["model.layers."+str(i)+".self_attn.o_proj"]      = mapped_to  # llama-hf
+        tensor_map["layers."+str(i)+".attention.wo"]                = mapped_to  # llama-pth
+
+        # Rotary embeddings
+        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_ROT_EMBD, None)
+        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
+
+        tensor_map["model.layers."+str(i)+".self_attn.rotary_emb.inv_freq"]  = mapped_to  # llama-hf
+        tensor_map["layers."+str(i)+".attention.inner_attention.rope.freqs"] = mapped_to  # llama-pth
+
+        # Feed-forward norm
+        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.FFN_NORM, None)
+        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
+
+        tensor_map["gpt_neox.layers."+str(i)+".post_attention_layernorm"] = mapped_to  # gptneox
+        tensor_map["transformer.h."+str(i)+".ln_2"]                       = mapped_to  # gpt2
+        tensor_map["transformer.blocks."+str(i)+".norm_2"]                = mapped_to  # mpt
+        tensor_map["model.layers."+str(i)+".post_attention_layernorm"]    = mapped_to  # llama-hf
+        tensor_map["layers."+str(i)+".ffn_norm"]                          = mapped_to  # llama-pth
+
+        # Feed-forward up
+        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.FFN_UP, None)
+        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
+
+        tensor_map["gpt_neox.layers."+str(i)+".mlp.dense_h_to_4h"] = mapped_to  # gptneox
+        tensor_map["transformer.h."+str(i)+".mlp.c_fc"]            = mapped_to  # gpt2
+        tensor_map["transformer.blocks."+str(i)+".ffn.up_proj"]    = mapped_to  # mpt
+        tensor_map["transformer.h."+str(i)+".mlp.dense_h_to_4h"]   = mapped_to  # falcon
+        tensor_map["model.layers."+str(i)+".mlp.up_proj"]          = mapped_to  # llama-hf
+        tensor_map["layers."+str(i)+".feed_forward.w3"]            = mapped_to  # llama-pth
+
+        # Feed-forward gate
+        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.FFN_GATE, None)
+        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
+
+        tensor_map["model.layers."+str(i)+".mlp.gate_proj"] = mapped_to  # llama-hf
+        tensor_map["layers."+str(i)+".feed_forward.w1"]     = mapped_to  # llama-pth
+
+        # Feed-forward down
+        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.FFN_DOWN, None)
+        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
+
+        tensor_map["gpt_neox.layers."+str(i)+".mlp.dense_4h_to_h"] = mapped_to  # gptneox
+        tensor_map["transformer.h."+str(i)+".mlp.c_proj"]          = mapped_to  # gpt2
+        tensor_map["transformer.blocks."+str(i)+".ffn.down_proj"]  = mapped_to  # mpt
+        tensor_map["transformer.h."+str(i)+".mlp.dense_4h_to_h"]   = mapped_to  # falcon
+        tensor_map["model.layers."+str(i)+".mlp.down_proj"]        = mapped_to  # llama-hf
+        tensor_map["layers."+str(i)+".feed_forward.w2"]            = mapped_to  # llama-pth
+
+    return tensor_map
+
+
+class TokenType(IntEnum):
+    NORMAL       = 1
+    UNKNOWN      = 2
+    CONTROL      = 3
+    USER_DEFINED = 4
+    UNUSED       = 5
+    BYTE         = 6
+
+#
+# implementation
+#
+
+
+class GGMLQuantizationType(IntEnum):
+    F32  = 0
+    F16  = 1
+    Q4_0 = 2
+    Q4_1 = 3
+    Q5_0 = 6
+    Q5_1 = 7
+    Q8_0 = 8
+    Q8_1 = 9
+    Q2_K = 10
+    Q3_K = 11
+    Q4_K = 12
+    Q5_K = 13
+    Q6_K = 14
+    Q8_K = 15
+
+
+class GGUFValueType(IntEnum):
+    UINT8   = 0
+    INT8    = 1
+    UINT16  = 2
+    INT16   = 3
+    UINT32  = 4
+    INT32   = 5
+    FLOAT32 = 6
+    BOOL    = 7
+    STRING  = 8
+    ARRAY   = 9
+
+    @staticmethod
+    def get_type(val):
+        if isinstance(val, str) or isinstance(val, bytes) or isinstance(val, bytearray):
+            return GGUFValueType.STRING
+        elif isinstance(val, list):
+            return GGUFValueType.ARRAY
+        elif isinstance(val, float):
+            return GGUFValueType.FLOAT32
+        elif isinstance(val, bool):
+            return GGUFValueType.BOOL
+        elif isinstance(val, int):
+            return GGUFValueType.INT32
+        else:
+            print("Unknown type: "+str(type(val)))
+            sys.exit()
+
+
+class GGUFWriter:
+    def __init__(self, path: str, arch: str, use_temp_file = True):
+        self.fout = open(path, "wb")
+        self.arch = arch
+        self.offset_tensor = 0
+        self.data_alignment = GGUF_DEFAULT_ALIGNMENT
+        self.kv_data = b""
+        self.kv_data_count = 0
+        self.ti_data = b""
+        self.ti_data_count = 0
+        self.add_architecture()
+        self.use_temp_file = use_temp_file
+        self.tensors = []
+
+    def write_header_to_file(self):
+        self.fout.write(struct.pack("<I", GGUF_MAGIC))
+        self.fout.write(struct.pack("<I", GGUF_VERSION))
+        self.fout.write(struct.pack("<I", self.ti_data_count))
+        self.fout.write(struct.pack("<I", self.kv_data_count))
+        self.flush()
+#        print("tensors " + str(self.ti_data_count) + " kv " + str(self.kv_data_count))
+
+    def write_kv_data_to_file(self):
+        self.fout.write(self.kv_data)
+        self.flush()
+
+    def write_ti_data_to_file(self):
+        self.fout.write(self.ti_data)
+        self.flush()
+
+    def add_key(self, key: str):
+        self.add_val(key, GGUFValueType.STRING, add_vtype=False)
+
+    def add_uint8(self, key: str, val: int):
+        self.add_key(key)
+        self.add_val(val, GGUFValueType.UINT8)
+
+    def add_int8(self, key: str, val: int):
+        self.add_key(key)
+        self.add_val(val, GGUFValueType.INT8)
+
+    def add_uint16(self, key: str, val: int):
+        self.add_key(key)
+        self.add_val(val, GGUFValueType.UINT16)
+
+    def add_int16(self, key: str, val: int):
+        self.add_key(key)
+        self.add_val(val, GGUFValueType.INT16)
+
+    def add_uint32(self, key: str, val: int):
+        self.add_key(key)
+        self.add_val(val, GGUFValueType.UINT32)
+
+    def add_int32(self, key: str, val: int):
+        self.add_key(key)
+        self.add_val(val, GGUFValueType.INT32)
+
+    def add_float32(self, key: str, val: float):
+        self.add_key(key)
+        self.add_val(val, GGUFValueType.FLOAT32)
+
+    def add_bool(self, key: str, val: bool):
+        self.add_key(key)
+        self.add_val(val, GGUFValueType.BOOL)
+
+    def add_string(self, key: str, val: str):
+        if len(val) == 0:
+            return
+        self.add_key(key)
+        self.add_val(val, GGUFValueType.STRING)
+
+    def add_array(self, key: str, val: list):
+        if not isinstance(val, list):
+            raise ValueError("Value must be a list for array type")
+
+        self.add_key(key)
+        self.add_val(val, GGUFValueType.ARRAY)
+
+    def add_val(self: str, val: Any, vtype: GGUFValueType = None, add_vtype: bool = True):
+        if vtype is None:
+            vtype = GGUFValueType.get_type(val)
+
+        if add_vtype:
+            self.kv_data += struct.pack("<I", vtype)
+            self.kv_data_count += 1
+
+        if vtype == GGUFValueType.UINT8:
+            self.kv_data += struct.pack("<B", val)
+        elif vtype == GGUFValueType.INT8:
+            self.kv_data += struct.pack("<b", val)
+        elif vtype == GGUFValueType.UINT16:
+            self.kv_data += struct.pack("<H", val)
+        elif vtype == GGUFValueType.INT16:
+            self.kv_data += struct.pack("<h", val)
+        elif vtype == GGUFValueType.UINT32:
+            self.kv_data += struct.pack("<I", val)
+        elif vtype == GGUFValueType.INT32:
+            self.kv_data += struct.pack("<i", val)
+        elif vtype == GGUFValueType.FLOAT32:
+            self.kv_data += struct.pack("<f", val)
+        elif vtype == GGUFValueType.BOOL:
+            self.kv_data += struct.pack("?", val)
+        elif vtype == GGUFValueType.STRING:
+            encoded_val = val.encode("utf8") if isinstance(val, str) else val
+            self.kv_data += struct.pack("<I", len(encoded_val))
+            self.kv_data += encoded_val
+        elif vtype == GGUFValueType.ARRAY:
+            ltype = set([GGUFValueType.get_type(item) for item in val])
+            assert len(ltype) == 1, "All items in a GGUF array should be of the same type"
+            self.kv_data += struct.pack("<I", list(ltype)[0])
+            self.kv_data += struct.pack("<I", len(val))
+            for item in val:
+                self.add_val(item, add_vtype=False)
+        else:
+            raise ValueError("Invalid GGUF metadata value type")
+
+    @staticmethod
+    def ggml_pad(x: int, n: int) -> int:
+        return ((x + n - 1) // n) * n
+
+    def add_tensor_info(self, name: str, tensor_shape: np.ndarray, tensor_dtype: np.dtype, tensor_nbytes: int, raw_dtype: Optional[GGMLQuantizationType] = None):
+        assert raw_dtype is not None or tensor_dtype in (np.float32, np.float16), "Only F32 and F16 tensors are supported for now"
+
+        encoded_name = name.encode("utf8")
+        self.ti_data += struct.pack("<I", len(encoded_name))
+        self.ti_data += encoded_name
+        n_dims = len(tensor_shape)
+        self.ti_data += struct.pack("<I", n_dims)
+        for i in range(n_dims):
+            self.ti_data += struct.pack("<I", tensor_shape[n_dims - 1 - i])
+        if raw_dtype is None:
+            dtype = GGMLQuantizationType.F32 if tensor_dtype == np.float32 else GGMLQuantizationType.F16
+        else:
+            dtype = raw_dtype
+        self.ti_data += struct.pack("<I", dtype)
+        self.ti_data += struct.pack("<Q", self.offset_tensor)
+        self.offset_tensor += GGUFWriter.ggml_pad(tensor_nbytes, self.data_alignment)
+        self.ti_data_count += 1
+
+    def add_tensor(self, name: str, tensor: np.ndarray, raw_shape: Optional[np.ndarray] = None, raw_dtype: Optional[GGMLQuantizationType] = None):
+        if self.use_temp_file and not hasattr(self, "temp_file"):
+            self.temp_file = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256*1024*1024)
+            self.temp_file.seek(0)
+
+        self.add_tensor_info(name, raw_shape if raw_shape is not None else tensor.shape, tensor.dtype, tensor.nbytes, raw_dtype = raw_dtype)
+
+        pad = GGUFWriter.ggml_pad(tensor.nbytes, self.data_alignment) - tensor.nbytes
+
+        if not self.use_temp_file:
+            self.tensors.append((tensor, pad))
+            return
+
+        tensor.tofile(self.temp_file)
+
+        if pad != 0:
+            self.temp_file.write(bytes([0] * pad))
+
+    def write_tensor_data(self, tensor: np.ndarray):
+        pad = GGUFWriter.ggml_pad(self.fout.tell(), self.data_alignment) - self.fout.tell()
+        if pad != 0:
+            self.fout.write(bytes([0] * pad))
+
+        tensor.tofile(self.fout)
+
+        pad = GGUFWriter.ggml_pad(tensor.nbytes, self.data_alignment) - tensor.nbytes
+        if pad != 0:
+            self.fout.write(bytes([0] * pad))
+
+    def write_tensors_to_file(self):
+        self.write_ti_data_to_file()
+
+        pad = GGUFWriter.ggml_pad(self.fout.tell(), self.data_alignment) - self.fout.tell()
+        if pad != 0:
+            self.fout.write(bytes([0] * pad))
+
+        if not self.use_temp_file:
+            for (currtensor, currpad) in self.tensors:
+                currtensor.tofile(self.fout)
+                if currpad != 0:
+                    self.fout.write(bytes([0] * currpad))
+            return
+
+        self.temp_file.seek(0)
+
+        shutil.copyfileobj(self.temp_file, self.fout)
+        self.flush()
+        self.temp_file.close()
+
+    def flush(self):
+        self.fout.flush()
+
+    def close(self):
+        self.fout.close()
+
+    def add_architecture(self):
+        self.add_string(KEY_GENERAL_ARCHITECTURE, self.arch)
+
+    def add_author(self, author: str):
+        self.add_string(KEY_GENERAL_AUTHOR, author)
+
+    def add_tensor_data_layout(self, layout: str):
+        self.add_string(KEY_LLM_TENSOR_DATA_LAYOUT.format(arch=self.arch), layout)
+
+    def add_url(self, url: str):
+        self.add_string(KEY_GENERAL_URL, url)
+
+    def add_description(self, description: str):
+        self.add_string(KEY_GENERAL_DESCRIPTION, description)
+
+    def add_source_url(self, url: str):
+        self.add_string(KEY_GENERAL_SOURCE_URL, url)
+
+    def add_source_hf_repo(self, repo: str):
+        self.add_string(KEY_GENERAL_SOURCE_HF_REPO, repo)
+
+    def add_file_type(self, ftype: int):
+        self.add_uint32(KEY_GENERAL_FILE_TYPE, ftype)
+
+    def add_name(self, name: str):
+        self.add_string(KEY_GENERAL_NAME, name)
+
+    def add_quantization_version(self, quantization_version: GGMLQuantizationType):
+        self.add_uint32(
+            KEY_GENERAL_QUANTIZATION_VERSION, quantization_version)
+
+    def add_custom_alignment(self, alignment: int):
+        self.data_alignment = alignment
+        self.add_uint32(KEY_GENERAL_ALIGNMENT, alignment)
+
+    def add_context_length(self, length: int):
+        self.add_uint32(
+            KEY_LLM_CONTEXT_LENGTH.format(arch=self.arch), length)
+
+    def add_embedding_length(self, length: int):
+        self.add_uint32(
+            KEY_LLM_EMBEDDING_LENGTH.format(arch=self.arch), length)
+
+    def add_block_count(self, length: int):
+        self.add_uint32(
+            KEY_LLM_BLOCK_COUNT.format(arch=self.arch), length)
+
+    def add_feed_forward_length(self, length: int):
+        self.add_uint32(
+            KEY_LLM_FEED_FORWARD_LENGTH.format(arch=self.arch), length)
+
+    def add_parallel_residual(self, use: bool):
+        self.add_bool(
+            KEY_LLM_USE_PARALLEL_RESIDUAL.format(arch=self.arch), use)
+
+    def add_tensor_data_layout(self, layout: str):
+        self.add_string(
+            KEY_LLM_TENSOR_DATA_LAYOUT.format(arch=self.arch), layout)
+
+    def add_head_count(self, count: int):
+        self.add_uint32(
+            KEY_ATTENTION_HEAD_COUNT.format(arch=self.arch), count)
+
+    def add_head_count_kv(self, count: int):
+        self.add_uint32(
+            KEY_ATTENTION_HEAD_COUNT_KV.format(arch=self.arch), count)
+
+    def add_max_alibi_bias(self, bias: float):
+        self.add_float32(
+            KEY_ATTENTION_MAX_ALIBI_BIAS.format(arch=self.arch), bias)
+
+    def add_clamp_kqv(self, value: float):
+        self.add_float32(
+            KEY_ATTENTION_CLAMP_KQV.format(arch=self.arch), value)
+
+    def add_layer_norm_eps(self, value: float):
+        self.add_float32(
+            KEY_ATTENTION_LAYERNORM_EPS.format(arch=self.arch), value)
+
+    def add_layer_norm_rms_eps(self, value: float):
+        self.add_float32(
+            KEY_ATTENTION_LAYERNORM_RMS_EPS.format(arch=self.arch), value)
+
+    def add_rope_dimension_count(self, count: int):
+        self.add_uint32(
+            KEY_ROPE_DIMENSION_COUNT.format(arch=self.arch), count)
+
+    def add_rope_scale_linear(self, value:  float):
+        self.add_float32(KEY_ROPE_SCALE_LINEAR.format(arch=self.arch), value)
+
+    def add_tokenizer_model(self, model: str):
+        self.add_string(KEY_TOKENIZER_MODEL, model)
+
+    def add_token_list(self, tokens: List):
+        self.add_array(KEY_TOKENIZER_LIST, tokens)
+
+    def add_token_merges(self, merges: List):
+        self.add_array(KEY_TOKENIZER_MERGES, merges)
+
+    def add_token_types(self, types: List[int]):
+        self.add_array(KEY_TOKENIZER_TOKEN_TYPE, types)
+
+    def add_token_scores(self, scores: List[float]):
+        self.add_array(KEY_TOKENIZER_SCORES, scores)
+
+    def add_bos_token_id(self, id: int):
+        self.add_uint32(KEY_TOKENIZER_BOS_ID, id)
+
+    def add_eos_token_id(self, id: int):
+        self.add_uint32(KEY_TOKENIZER_EOS_ID, id)
+
+    def add_unk_token_id(self, id: int):
+        self.add_uint32(KEY_TOKENIZER_UNK_ID, id)
+
+    def add_sep_token_id(self, id: int):
+        self.add_uint32(KEY_TOKENIZER_SEP_ID, id)
+
+    def add_pad_token_id(self, id: int):
+        self.add_uint32(KEY_TOKENIZER_PAD_ID, id)
+
+
+# Example usage:
+if __name__ == "__main__":
+    # Example usage with a file
+    gguf_writer = GGUFWriter("example.gguf", "llama")
+
+    gguf_writer.add_architecture()
+    gguf_writer.add_block_count(12)
+    gguf_writer.add_uint32("answer", 42)  # Write a 32-bit integer
+    gguf_writer.add_float32("answer_in_float", 42.0)  # Write a 32-bit float
+    gguf_writer.add_custom_alignment(64)
+
+    tensor1 = np.ones((32,), dtype=np.float32) * 100.0
+    tensor2 = np.ones((64,), dtype=np.float32) * 101.0
+    tensor3 = np.ones((96,), dtype=np.float32) * 102.0
+
+    gguf_writer.add_tensor("tensor1", tensor1)
+    gguf_writer.add_tensor("tensor2", tensor2)
+    gguf_writer.add_tensor("tensor3", tensor3)
+
+    gguf_writer.write_header_to_file()
+    gguf_writer.write_kv_data_to_file()
+    gguf_writer.write_tensors_to_file()
+
+    gguf_writer.close()
--- a/grammars/README.md
+++ b/grammars/README.md
@@ -0,0 +1,91 @@
+# GBNF Guide
+
+GBNF (GGML BNF) is a format for defining [formal grammars](https://en.wikipedia.org/wiki/Formal_grammar) to constrain model outputs in `llama.cpp`. For example, you can use it to force the model to generate valid JSON, or speak only in emojis. GBNF grammars are supported in various ways in `examples/main` and `examples/server`.
+
+## Background
+
+[Bakus-Naur Form (BNF)](https://en.wikipedia.org/wiki/Backus%E2%80%93Naur_form) is a notation for describing the syntax of formal languages like programming languages, file formats, and protocols. GBNF is an extension of BNF that primarily adds a few modern regex-like features.
+
+## Basics
+
+In GBNF, we define *production rules* that specify how a *non-terminal* (rule name) can be replaced with sequences of *terminals* (characters, specifically Unicode [code points](https://en.wikipedia.org/wiki/Code_point)) and other non-terminals. The basic format of a production rule is `nonterminal ::= sequence...`.
+
+## Example
+
+Before going deeper, let's look at some of the features demonstrated in `grammars/chess.gbnf`, a small chess notation grammar:
+```
+# `root` specifies the pattern for the overall output
+root ::= (
+    # it must start with the characters "1. " followed by a sequence
+    # of characters that match the `move` rule, followed by a space, followed
+    # by another move, and then a newline
+    "1. " move " " move "\n"
+
+    # it's followed by one or more subsequent moves, numbered with one or two digits
+    ([1-9] [0-9]? ". " move " " move "\n")+
+)
+
+# `move` is an abstract representation, which can be a pawn, nonpawn, or castle.
+# The `[+#]?` denotes the possibility of checking or mate signs after moves
+move ::= (pawn | nonpawn | castle) [+#]?
+
+pawn ::= ...
+nonpawn ::= ...
+castle ::= ...
+```
+
+## Non-Terminals and Terminals
+
+Non-terminal symbols (rule names) stand for a pattern of terminals and other non-terminals. They are required to be a dashed lowercase word, like `move`, `castle`, or `check-mate`.
+
+Terminals are actual characters ([code points](https://en.wikipedia.org/wiki/Code_point)). They can be specified as a sequence like `"1"` or `"O-O"` or as ranges like `[1-9]` or `[NBKQR]`.
+
+## Characters and character ranges
+
+Terminals support the full range of Unicode. Unicode characters can be specified directly in the grammar, for example `hiragana ::= [ぁ-ゟ]`, or with escapes: 8-bit (`\xXX`), 16-bit (`\uXXXX`) or 32-bit (`\UXXXXXXXX`).
+
+Character ranges can be negated with `^`:
+```
+single-line ::= [^\n]+ "\n"`
+```
+
+## Sequences and Alternatives
+
+The order of symbols in a sequence matter. For example, in `"1. " move " " move "\n"`, the `"1. "` must come before the first `move`, etc.
+
+Alternatives, denoted by `|`, give different sequences that are acceptable. For example, in `move ::= pawn | nonpawn | castle`, `move` can be a `pawn` move, a `nonpawn` move, or a `castle`.
+
+Parentheses `()` can be used to group sequences, which allows for embedding alternatives in a larger rule or applying repetition and optptional symbols (below) to a sequence.
+
+## Repetition and Optional Symbols
+
+- `*` after a symbol or sequence means that it can be repeated zero or more times.
+- `+` denotes that the symbol or sequence should appear one or more times.
+- `?` makes the preceding symbol or sequence optional.
+
+## Comments and newlines
+
+Comments can be specified with `#`:
+```
+# defines optional whitspace
+ws ::= [ \t\n]+
+```
+
+Newlines are allowed between rules and between symbols or sequences nested inside parentheses. Additionally, a newline after an alternate marker `|` will continue the current rule, even outside of parentheses.
+
+## The root rule
+
+In a full grammar, the `root` rule always defines the starting point of the grammar. In other words, it specifies what the entire output must match.
+
+```
+# a grammar for lists
+root ::= ("- " item)+
+item ::= [^\n]+ "\n"
+```
+
+## Next steps
+
+This guide provides a brief overview. Check out the GBNF files in this directory (`grammars/`) for examples of full grammars. You can try them out with:
+```
+./main -m <model> --grammar-file grammars/some-grammar.gbnf -p 'Some prompt'
+```
--- a/k_quants.c
+++ b/k_quants.c
@@ -77,6 +77,11 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
        }
        return 1/iscale;
    }
+    bool return_early = false;
+    if (rmse_type < 0) {
+        rmse_type = -rmse_type;
+        return_early = true;
+    }
    int weight_type = rmse_type%2;
    float sumlx = 0;
    float suml2 = 0;
@@ -89,56 +94,9 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
        suml2 += w*l*l;
    }
    float scale = sumlx/suml2;
+    if (return_early) return suml2 > 0 ? 0.5f*(scale + 1/iscale) : 1/iscale;
    float best = scale * sumlx;
-    for (int itry = 0; itry < 3; ++itry) {
-        iscale = 1/scale;
-        float slx = 0;
-        float sl2 = 0;
-        bool changed = false;
-        for (int i = 0; i < n; ++i) {
-            int l = nearest_int(iscale * x[i]);
-            l = MAX(-nmax, MIN(nmax-1, l));
-            if (l + nmax != L[i]) { changed = true; }
-            float w = weight_type == 1 ? x[i] * x[i] : 1.f;
-            slx += w*x[i]*l;
-            sl2 += w*l*l;
-        }
-        if (!changed || sl2 == 0 || slx*slx <= best*sl2) { break; }
-        for (int i = 0; i < n; ++i) {
-            int l = nearest_int(iscale * x[i]);
-            L[i] = nmax + MAX(-nmax, MIN(nmax-1, l));
-        }
-        sumlx = slx; suml2 = sl2;
-        scale = sumlx/suml2;
-        best = scale * sumlx;
-    }
-    for (int itry = 0; itry < 5; ++itry) {
-        int n_changed = 0;
-        for (int i = 0; i < n; ++i) {
-            float w = weight_type == 1 ? x[i]*x[i] : 1;
-            int l = L[i] - nmax;
-            float slx = sumlx - w*x[i]*l;
-            if (slx > 0) {
-                float sl2 = suml2 - w*l*l;
-                int new_l = nearest_int(x[i] * sl2 / slx);
-                new_l = MAX(-nmax, MIN(nmax-1, new_l));
-                if (new_l != l) {
-                    slx += w*x[i]*new_l;
-                    sl2 += w*new_l*new_l;
-                    if (sl2 > 0 && slx*slx*suml2 > sumlx*sumlx*sl2) {
-                        L[i] = nmax + new_l; sumlx = slx; suml2 = sl2;
-                        scale = sumlx / suml2; best = scale * sumlx;
-                        ++n_changed;
-                    }
-                }
-            }
-        }
-        if (!n_changed) { break; }
-    }
-    if (rmse_type < 3) {
-        return scale;
-    }
-    for (int is = -4; is <= 4; ++is) {
+    for (int is = -9; is <= 9; ++is) {
        if (is == 0) {
            continue;
        }
@@ -221,12 +179,17 @@ static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t *
    return 1/iscale;
 }

-static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, float * restrict the_min, int ntry) {
+static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, float * restrict the_min,
+        int ntry, float alpha) {
    float min = x[0];
    float max = x[0];
+    float sum_x = 0;
+    float sum_x2 = 0;
    for (int i = 1; i < n; ++i) {
        if (x[i] < min) min = x[i];
        if (x[i] > max) max = x[i];
+        sum_x += x[i];
+        sum_x2 += x[i]*x[i];
    }
    if (max == min) {
        for (int i = 0; i < n; ++i) L[i] = 0;
@@ -254,7 +217,7 @@ static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t
        for (int i = 0; i < n; ++i) {
            sum += x[i] - scale*L[i];
        }
-        min = sum/n;
+        min = alpha*min + (1 - alpha)*sum/n;
        if (min > 0) min = 0;
        iscale = 1/scale;
        if (!did_change) break;
@@ -263,6 +226,82 @@ static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t
    return scale;
 }

+static float make_qkx2_quants(int n, int nmax, const float * restrict x, const float * restrict weights,
+        uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux,
+        float rmin, float rdelta, int nstep, bool use_mad) {
+    float min = x[0];
+    float max = x[0];
+    float sum_w = weights[0];
+    float sum_x = sum_w * x[0];
+    for (int i = 1; i < n; ++i) {
+        if (x[i] < min) min = x[i];
+        if (x[i] > max) max = x[i];
+        float w = weights[i];
+        sum_w += w;
+        sum_x += w * x[i];
+    }
+    if (min > 0) min = 0;
+    if (max == min) {
+        for (int i = 0; i < n; ++i) L[i] = 0;
+        *the_min = -min;
+        return 0.f;
+    }
+    float iscale = nmax/(max - min);
+    float scale = 1/iscale;
+    float best_mad = 0;
+    for (int i = 0; i < n; ++i) {
+        int l = nearest_int(iscale*(x[i] - min));
+        L[i] = MAX(0, MIN(nmax, l));
+        float diff = scale * L[i] + min - x[i];
+        diff = use_mad ? fabsf(diff) : diff * diff;
+        float w = weights[i];
+        best_mad += w * diff;
+    }
+    if (nstep < 1) {
+        *the_min = -min;
+        return scale;
+    }
+    for (int is = 0; is <= nstep; ++is) {
+        iscale = (rmin + rdelta*is + nmax)/(max - min);
+        float sum_l = 0, sum_l2 = 0, sum_xl = 0;
+        for (int i = 0; i < n; ++i) {
+            int l = nearest_int(iscale*(x[i] - min));
+            l = MAX(0, MIN(nmax, l));
+            Laux[i] = l;
+            float w = weights[i];
+            sum_l += w*l;
+            sum_l2 += w*l*l;
+            sum_xl += w*l*x[i];
+        }
+        float D = sum_w * sum_l2 - sum_l * sum_l;
+        if (D > 0) {
+            float this_scale = (sum_w * sum_xl - sum_x * sum_l)/D;
+            float this_min   = (sum_l2 * sum_x - sum_l * sum_xl)/D;
+            if (this_min > 0) {
+                this_min = 0;
+                this_scale = sum_xl / sum_l2;
+            }
+            float mad = 0;
+            for (int i = 0; i < n; ++i) {
+                float diff = this_scale * Laux[i] + this_min - x[i];
+                diff = use_mad ? fabsf(diff) : diff * diff;
+                float w = weights[i];
+                mad += w * diff;
+            }
+            if (mad < best_mad) {
+                for (int i = 0; i < n; ++i) {
+                    L[i] = Laux[i];
+                }
+                best_mad = mad;
+                scale = this_scale;
+                min = this_min;
+            }
+        }
+    }
+    *the_min = -min;
+    return scale;
+}
+
 #if QK_K == 256
 static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t * restrict d, uint8_t * restrict m) {
    if (j < 4) {
@@ -281,6 +320,8 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict
    const int nb = k / QK_K;

    uint8_t L[QK_K];
+    uint8_t Laux[16];
+    float   weights[16];
    float mins[QK_K/16];
    float scales[QK_K/16];

@@ -291,7 +332,8 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict
        float max_scale = 0; // as we are deducting the min, scales are always positive
        float max_min = 0;
        for (int j = 0; j < QK_K/16; ++j) {
-            scales[j] = make_qkx1_quants(16, 3, x + 16*j, L + 16*j, &mins[j], 5);
+            for (int l = 0; l < 16; ++l) weights[l] = fabsf(x[16*j + l]);
+            scales[j] = make_qkx2_quants(16, 3, x + 16*j, weights, L + 16*j, &mins[j], Laux, -0.5f, 0.1f, 15, true);
            float scale = scales[j];
            if (scale > max_scale) {
                max_scale = scale;
@@ -637,6 +679,8 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict
    const int nb = k / QK_K;

    uint8_t L[QK_K];
+    uint8_t Laux[32];
+    float   weights[32];
    float mins[QK_K/32];
    float scales[QK_K/32];

@@ -645,7 +689,12 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict
        float max_scale = 0; // as we are deducting the min, scales are always positive
        float max_min = 0;
        for (int j = 0; j < QK_K/32; ++j) {
-            scales[j] = make_qkx1_quants(32, 15, x + 32*j, L + 32*j, &mins[j], 5);
+            //scales[j] = make_qkx1_quants(32, 15, x + 32*j, L + 32*j, &mins[j], 9, 0.5f);
+            float sum_x2 = 0;
+            for (int l = 0; l < 32; ++l) sum_x2 += x[32*j + l] * x[32*j + l];
+            float av_x = sqrtf(sum_x2/32);
+            for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]);
+            scales[j] = make_qkx2_quants(32, 15, x + 32*j, weights, L + 32*j, &mins[j], Laux, -1.f, 0.1f, 20, false);
            float scale = scales[j];
            if (scale > max_scale) {
                max_scale = scale;
@@ -798,6 +847,8 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict
    uint8_t L[QK_K];
    float mins[QK_K/32];
    float scales[QK_K/32];
+    float weights[32];
+    uint8_t Laux[32];
 #else
    int8_t L[QK_K];
    float scales[QK_K/16];
@@ -810,7 +861,12 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict
        float max_scale = 0; // as we are deducting the min, scales are always positive
        float max_min = 0;
        for (int j = 0; j < QK_K/32; ++j) {
-            scales[j] = make_qkx1_quants(32, 31, x + 32*j, L + 32*j, &mins[j], 5);
+            //scales[j] = make_qkx1_quants(32, 31, x + 32*j, L + 32*j, &mins[j], 9, 0.5f);
+            float sum_x2 = 0;
+            for (int l = 0; l < 32; ++l) sum_x2 += x[32*j + l] * x[32*j + l];
+            float av_x = sqrtf(sum_x2/32);
+            for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]);
+            scales[j] = make_qkx2_quants(32, 31, x + 32*j, weights, L + 32*j, &mins[j], Laux, -0.5f, 0.1f, 15, false);
            float scale = scales[j];
            if (scale > max_scale) {
                max_scale = scale;
--- a/kompute/.ccls
+++ b/kompute/.ccls
@@ -1,27 +0,0 @@
-
-%clang
-
-fdeclspec
-fms-extensions
-Wall
-Wextra
-std=c++17
-
-%h -x
-%h c++-header
-
-DDEBUG=1
-DKOMPUTE_INCLUDE_FOR_SYNTAX
-
-I/usr/include/python3.6/
-I./python/pybind11/include/
-
-I./build/_deps/vulkan_header-src/include/
-I./build/_deps/spdlog-src/include/
-I./build/_deps/googletest-src/googletest/include/
-I./build/_deps/fmt-src/include/
-
-I./src/include/
-I./build/src/shaders/glsl/
-I./build/test/shaders/glsl/
-I./test/utils/
--- a/kompute/.clang-format
+++ b/kompute/.clang-format
@@ -1,5 +0,0 @@
---
-BasedOnStyle: Mozilla
-IndentWidth: 4
-
-...
--- a/kompute/.dockerignore
+++ b/kompute/.dockerignore
@@ -1,4 +0,0 @@
-build/*
-examples/*
-docker-builders/
-swiftshader/
--- a/kompute/.github/workflows/cpp_examples.yml
+++ b/kompute/.github/workflows/cpp_examples.yml
@@ -1,58 +0,0 @@
-name: C++ Tests
-
-on:
-  push:
-    branches: [ master ]
-  pull_request:
-    branches: [ master ]
-
-jobs:
-  array-multiplication-example:
-    runs-on: ubuntu-latest
-    container: axsauze/kompute-builder:0.4
-    env:
-      VK_ICD_FILENAMES: "/swiftshader/vk_swiftshader_icd.json"
-    steps:
-    - name: Checkout
-      uses: actions/checkout@v3
-      with:
-        submodules: false
-    - name: "[Release g++] Build & Test"
-      uses: KomputeProject/action-cmake-build@master
-      with:
-        build-dir: ${{github.workspace}}/examples/array_multiplication/build
-        source-dir: ${{github.workspace}}/examples/array_multiplication
-        cc: gcc
-        cxx: g++
-        build-type: Debug
-        run-test: false
-        ctest-options: -V
-        configure-options: -DKOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER=ON KOMPUTE_OPT_FROM_SOURCE=ON
-        build-options: --parallel # Given we don't build too many resources we can leverage parallel
-    - name: Run tests
-      run: ./examples/array_multiplication/build/src/kompute_array_mult
-
-  logistc-regression-example:
-    runs-on: ubuntu-latest
-    container: axsauze/kompute-builder:0.4
-    env:
-      VK_ICD_FILENAMES: "/swiftshader/vk_swiftshader_icd.json"
-    steps:
-    - name: Checkout
-      uses: actions/checkout@v3
-      with:
-        submodules: false
-    - name: "[Release g++] Build & Test"
-      uses: KomputeProject/action-cmake-build@master
-      with:
-        build-dir: ${{github.workspace}}/examples/logistic_regression/build
-        source-dir: ${{github.workspace}}/examples/logistic_regression
-        cc: gcc
-        cxx: g++
-        build-type: Debug
-        run-test: false
-        ctest-options: -V
-        configure-options: -DKOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER=ON KOMPUTE_OPT_FROM_SOURCE=ON
-        build-options: --parallel # Given we don't build too many resources we can leverage parallel
-    - name: Run tests
-      run: ./examples/logistic_regression/build/src/kompute_logistic_regression
--- a/kompute/.github/workflows/cpp_tests.yml
+++ b/kompute/.github/workflows/cpp_tests.yml
@@ -1,104 +0,0 @@
-name: C++ Tests
-
-on:
-  push:
-    branches: [ master ]
-  pull_request:
-    branches: [ master ]
-
-jobs:
-  cpp-tests-debug-with-debug-layers:
-    runs-on: ubuntu-latest
-    container: axsauze/kompute-builder:0.4
-    env:
-      VK_ICD_FILENAMES: "/swiftshader/vk_swiftshader_icd.json"
-    steps:
-    - name: Checkout
-      uses: actions/checkout@v3
-      with:
-        submodules: false
-    - name: "[Release g++] Build & Test"
-      uses: KomputeProject/action-cmake-build@master
-      with:
-        build-dir: ${{github.workspace}}/build
-        source-dir: ${{github.workspace}}
-        cc: gcc
-        cxx: g++
-        build-type: Debug
-        run-test: false
-        ctest-options: -V
-        configure-options: -DKOMPUTE_OPT_BUILD_TESTS=ON -DKOMPUTE_OPT_DISABLE_VK_DEBUG_LAYERS=OFF -DKOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER=ON
-    - name: Run tests
-      run: make mk_run_tests
-
-  cpp-tests-release-with-debug-layers:
-    runs-on: ubuntu-latest
-    container: axsauze/kompute-builder:0.4
-    env:
-      VK_ICD_FILENAMES: "/swiftshader/vk_swiftshader_icd.json"
-    steps:
-    - name: Checkout
-      uses: actions/checkout@v3
-      with:
-        submodules: false
-    - name: "[Release g++] Build & Test"
-      uses: KomputeProject/action-cmake-build@master
-      with:
-        build-dir: ${{github.workspace}}/build
-        source-dir: ${{github.workspace}}
-        cc: gcc
-        cxx: g++
-        build-type: Release
-        run-test: false
-        ctest-options: -V
-        configure-options: -DKOMPUTE_OPT_BUILD_TESTS=ON -DKOMPUTE_OPT_DISABLE_VK_DEBUG_LAYERS=OFF -DKOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER=ON
-    - name: Run tests
-      run: make mk_run_tests
-
-  cpp-tests-debug-without-debug-layers:
-    runs-on: ubuntu-latest
-    container: axsauze/kompute-builder:0.4
-    env:
-      VK_ICD_FILENAMES: "/swiftshader/vk_swiftshader_icd.json"
-    steps:
-    - name: Checkout
-      uses: actions/checkout@v3
-      with:
-        submodules: false
-    - name: "[Release g++] Build & Test"
-      uses: KomputeProject/action-cmake-build@master
-      with:
-        build-dir: ${{github.workspace}}/build
-        source-dir: ${{github.workspace}}
-        cc: gcc
-        cxx: g++
-        build-type: Debug
-        run-test: false
-        ctest-options: -V
-        configure-options: -DKOMPUTE_OPT_BUILD_TESTS=ON -DKOMPUTE_OPT_DISABLE_VK_DEBUG_LAYERS=ON -DKOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER=ON
-    - name: Run tests
-      run: make mk_run_tests
-  
-  cpp-tests-release-without-debug-layers:
-    runs-on: ubuntu-latest
-    container: axsauze/kompute-builder:0.4
-    env:
-      VK_ICD_FILENAMES: "/swiftshader/vk_swiftshader_icd.json"
-    steps:
-    - name: Checkout
-      uses: actions/checkout@v3
-      with:
-        submodules: false
-    - name: "[Release g++] Build & Test"
-      uses: KomputeProject/action-cmake-build@master
-      with:
-        build-dir: ${{github.workspace}}/build
-        source-dir: ${{github.workspace}}
-        cc: gcc
-        cxx: g++
-        build-type: Release
-        run-test: false
-        ctest-options: -V
-        configure-options: -DKOMPUTE_OPT_BUILD_TESTS=ON -DKOMPUTE_OPT_DISABLE_VK_DEBUG_LAYERS=ON -DKOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER=ON
-    - name: Run tests
-      run: make mk_run_tests
--- a/kompute/.github/workflows/python_tests.yml
+++ b/kompute/.github/workflows/python_tests.yml
@@ -1,28 +0,0 @@
-name: Python Tests
-
-on:
-  push:
-    branches: [ master ]
-  pull_request:
-    branches: [ master ]
-
-jobs:
-  python-tests:
-    runs-on: ubuntu-latest
-    container: axsauze/kompute-builder:0.4
-    steps:
-    - name: Checkout
-      uses: actions/checkout@v3
-      with:
-        submodules: false
-    - name: Install Python Requirements
-      run: pip3 install --user -r python/test/requirements-dev.txt
-    - name: Python Build
-      env:
-        KOMPUTE_PYTHON_NUM_PARALLEL_THREADS: 2
-        KOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER: ON
-      run: pip3 install --user . -v
-    - name: Python run Tests
-      run: |
-        export VK_ICD_FILENAMES=/swiftshader/vk_swiftshader_icd.json
-        make test_python
--- a/kompute/CMakeLists.txt
+++ b/kompute/CMakeLists.txt
@@ -1,189 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-cmake_minimum_required(VERSION 3.20)
-project(kompute VERSION 0.8.1 LANGUAGES CXX)
-
-set(CMAKE_CXX_STANDARD 14)
-
-# Only change the folder behavior if kompute is not a subproject
-if(${CMAKE_PROJECT_NAME} STREQUAL ${PROJECT_NAME})
-    set_property(GLOBAL PROPERTY USE_FOLDERS ON)
-    set_property(GLOBAL PROPERTY PREDEFINED_TARGETS_FOLDER "CMake")
-    set(EXECUTABLE_OUTPUT_PATH ${CMAKE_BINARY_DIR}/bin)
-    set(LIBRARY_OUTPUT_PATH ${CMAKE_BINARY_DIR}/lib)
-endif()
-
-# Avoid the dll boilerplate code for windows
-set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
-set(CMAKE_CXX_STANDARD 14)
-set(CMAKE_CXX_STANDARD_REQUIRED ON)
-
-set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake;${CMAKE_MODULE_PATH}")
-
-set(KOMPUTE_LIBRARIES kompute CACHE INTERNAL "")
-
-# ####################################################
-# Options
-# ####################################################
-macro(kompute_option OPTION_NAME OPTION_TEXT OPTION_DEFAULT)
-    option(${OPTION_NAME} ${OPTION_TEXT} ${OPTION_DEFAULT})
-
-    if(DEFINED ENV{${OPTION_NAME}})
-        # Allow overriding the option through an environment variable
-        set(${OPTION_NAME} $ENV{${OPTION_NAME}})
-    endif()
-
-    if(${OPTION_NAME})
-        add_definitions(-D${OPTION_NAME})
-    endif()
-
-    message(STATUS "  ${OPTION_NAME}: ${${OPTION_NAME}}")
-endmacro()
-
-macro(kompute_log_level OPTION_NAME OPTION_TEXT OPTION_DEFAULT)
-    set(${OPTION_NAME} ${OPTION_DEFAULT} CACHE STRING ${OPTION_TEXT})
-    set_property(CACHE ${OPTION_NAME} PROPERTY STRINGS "Trace" "Debug" "Info" "Warn" "Error" "Critical" "Default" "Off")
-
-    if(DEFINED ENV{${OPTION_NAME}})
-        # Allow setting the option through an environment variable
-        set(${OPTION_NAME} $ENV{${OPTION_NAME}})
-    endif()
-
-    if(${OPTION_NAME})
-        add_definitions(-D${OPTION_NAME})
-    endif()
-
-    # Allow disabling logging completely and prevent linking against it:
-    if(${KOMPUTE_OPT_LOG_LEVEL} STREQUAL "Off")
-        set(${OPTION_NAME}_DISABLED ON)
-        add_compile_definitions(${OPTION_NAME}_DISABLED=1)
-    endif()
-
-    message(STATUS "  ${OPTION_NAME}: ${${OPTION_NAME}}")
-endmacro()
-
-macro(kompute_option_string OPTION_NAME OPTION_TEXT OPTION_DEFAULT)
-    set(${OPTION_NAME} ${OPTION_DEFAULT} CACHE STRING ${OPTION_TEXT})
-
-    if(DEFINED ENV{${OPTION_NAME}})
-        # Allow setting the option through an environment variable
-        set(${OPTION_NAME} $ENV{${OPTION_NAME}})
-    endif()
-
-    if(${OPTION_NAME})
-        add_definitions(-D${OPTION_NAME})
-    endif()
-
-    message(STATUS "  ${OPTION_NAME}: ${${OPTION_NAME}}")
-endmacro()
-
-message(STATUS "General purpose GPU compute framework built on Vulkan")
-message(STATUS "=======================================================")
-
-# Build options
-kompute_log_level(KOMPUTE_OPT_LOG_LEVEL "Internally we use Spdlog or fmt for logging, depending on the value of 'KOMPUTE_OPT_USE_SPDLOG'. The log level used can be changed here. Possible values: 'Trace', 'Debug', 'Info', 'Warn', 'Error', 'Critical', 'Off', 'Default'. If set to 'Off' logging will be deactivated completely. If set to 'Default', the log level will be set to 'Info' for release builds and 'Debug' else." "Off")
-kompute_option(KOMPUTE_OPT_USE_SPDLOG "If enabled, logging via KP_LOG_<DEBUG, INFO, etc...> will happen through Spdlog instead of plan fmt." OFF)
-kompute_option(KOMPUTE_OPT_DISABLE_VK_DEBUG_LAYERS "Explicitly disable debug layers even on debug." ON)
-kompute_option(KOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK "Whether to check if your driver supports the Vulkan Header version you are linking against. This might be useful in case you build shared on a different system than you run later." OFF)
-kompute_option(KOMPUTE_OPT_BUILD_SHADERS "Rebuilds all compute shaders during compilation and does not use the already precompiled versions. Requires glslangValidator to be installed on your system." OFF)
-
-# External components
-kompute_option(KOMPUTE_OPT_USE_BUILT_IN_SPDLOG "Use the built-in version of Spdlog. Requires 'KOMPUTE_OPT_USE_SPDLOG' to be set to ON in order to have any effect." ON)
-kompute_option(KOMPUTE_OPT_SPDLOG_ASYNC_MODE "If spdlog is enabled this allows for selecting whether the default logger setup creates sync or async logger" OFF)
-kompute_option(KOMPUTE_OPT_USE_BUILT_IN_FMT "Use the built-in version of fmt." ON)
-kompute_option(KOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER "Use the built-in version of Vulkan Headers. This could be helpful in case your system Vulkan Headers are too new for your driver. If you set this to OFF, please make sure your system Vulkan Headers are supported by your driver." ON)
-kompute_option_string(KOMPUTE_OPT_BUILT_IN_VULKAN_HEADER_TAG "The git tag used for the built-in Vulkan Headers when 'KOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER' is enabled. A list of tags can be found here: https://github.com/KhronosGroup/Vulkan-Headers/tags" "v1.3.231")
-message(STATUS "=======================================================")
-
-# ####################################################
-# Deprecated Options
-# ####################################################
-include(cmake/deprecation_warnings.cmake)
-
-# ####################################################
-# Dependencies
-# ####################################################
-include(cmake/vulkan_shader_compiler.cmake)
-include(cmake/check_vulkan_version.cmake)
-include(FetchContent)
-
-# Vulkan Header
-if(KOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER)
-    FetchContent_Declare(vulkan_header GIT_REPOSITORY https://github.com/KhronosGroup/Vulkan-Headers.git
-        GIT_TAG ${KOMPUTE_OPT_BUILT_IN_VULKAN_HEADER_TAG}) # Source: https://github.com/KhronosGroup/Vulkan-Headers/tags
-    FetchContent_MakeAvailable(vulkan_header)
-
-    if(NOT KOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK)
-        # Ensure the driver supports this Vulkan version
-        check_vulkan_version(INCLUDE_DIR "${vulkan_header_SOURCE_DIR}/include")
-    endif()
-endif()
-
-find_package(Vulkan REQUIRED)
-
-if(Vulkan_FOUND AND NOT TARGET Vulkan::Headers)
-    add_library(Vulkan::Headers INTERFACE IMPORTED)
-    set_target_properties(Vulkan::Headers PROPERTIES
-        INTERFACE_INCLUDE_DIRECTORIES "${Vulkan_INCLUDE_DIRS}")
-endif()
-
-if(NOT KOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER AND NOT KOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK)
-    # Ensure the driver supports this Vulkan version
-    check_vulkan_version(INCLUDE_DIR ${Vulkan_INCLUDE_DIR})
-endif()
-
-# Spdlog
-if(KOMPUTE_OPT_USE_SPDLOG)
-    add_compile_definitions(KOMPUTE_OPT_USE_SPDLOG=1)
-
-    if(NOT KOMPUTE_OPT_LOG_LEVEL_DISABLED)
-        if(KOMPUTE_OPT_USE_BUILT_IN_SPDLOG)
-            set(SPDLOG_BUILD_SHARED ${BUILD_SHARED_LIBS})
-
-            FetchContent_Declare(spdlog GIT_REPOSITORY https://github.com/gabime/spdlog.git
-                GIT_TAG v1.10.0) # Source: https://github.com/gabime/spdlog/releases
-            FetchContent_MakeAvailable(spdlog)
-        else()
-            find_package(spdlog REQUIRED)
-        endif()
-    endif()
-endif()
-
-# fmt
-if(KOMPUTE_OPT_USE_BUILT_IN_FMT)
-    FetchContent_Declare(fmt GIT_REPOSITORY https://github.com/fmtlib/fmt.git
-        GIT_TAG 10.0.0) # Source: https://github.com/fmtlib/fmt/releases
-    FetchContent_MakeAvailable(fmt)
-else()
-    find_package(fmt REQUIRED)
-endif()
-
-add_compile_definitions(VULKAN_HPP_DISPATCH_LOADER_DYNAMIC=1)
-
-# ####################################################
-# Preprocessor Macros
-# ####################################################
-if(KOMPUTE_OPT_DISABLE_VK_DEBUG_LAYERS)
-    add_compile_definitions(KOMPUTE_DISABLE_VK_DEBUG_LAYERS=1)
-endif()
-
-if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
-else()
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Wpedantic -Werror")
-endif()
-
-# If glslang is cloned, then SPIRV/GlslangToSpv.h will be used instead of glslang/SPIRV/GlslangToSpv.h
-# As after installation, SPIRV/ header files will be found in glslang/SPIRV/ , more info in #193
-if(KOMPUTE_OPT_REPO_SUBMODULE_BUILD)
-    add_definitions(-DUSE_EXTERNAL_GLSLANG)
-endif()
-
-# Allow scripts to call main kompute Makefile
-function(kompute_make KOMPUTE_MAKE_TARGET)
-    add_custom_target(${KOMPUTE_MAKE_TARGET}
-        COMMAND make -C ${PROJECT_SOURCE_DIR} ${KOMPUTE_MAKE_TARGET})
-endfunction()
-
-add_executable(xxd external/bin/xxd.c)
-
-add_subdirectory(src)
--- a/kompute/LICENSE
+++ b/kompute/LICENSE
@@ -1,203 +0,0 @@
-
-                                 Apache License
-                           Version 2.0, January 2004
-                        http://www.apache.org/licenses/
-
-   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
-   1. Definitions.
-
-      "License" shall mean the terms and conditions for use, reproduction,
-      and distribution as defined by Sections 1 through 9 of this document.
-
-      "Licensor" shall mean the copyright owner or entity authorized by
-      the copyright owner that is granting the License.
-
-      "Legal Entity" shall mean the union of the acting entity and all
-      other entities that control, are controlled by, or are under common
-      control with that entity. For the purposes of this definition,
-      "control" means (i) the power, direct or indirect, to cause the
-      direction or management of such entity, whether by contract or
-      otherwise, or (ii) ownership of fifty percent (50%) or more of the
-      outstanding shares, or (iii) beneficial ownership of such entity.
-
-      "You" (or "Your") shall mean an individual or Legal Entity
-      exercising permissions granted by this License.
-
-      "Source" form shall mean the preferred form for making modifications,
-      including but not limited to software source code, documentation
-      source, and configuration files.
-
-      "Object" form shall mean any form resulting from mechanical
-      transformation or translation of a Source form, including but
-      not limited to compiled object code, generated documentation,
-      and conversions to other media types.
-
-      "Work" shall mean the work of authorship, whether in Source or
-      Object form, made available under the License, as indicated by a
-      copyright notice that is included in or attached to the work
-      (an example is provided in the Appendix below).
-
-      "Derivative Works" shall mean any work, whether in Source or Object
-      form, that is based on (or derived from) the Work and for which the
-      editorial revisions, annotations, elaborations, or other modifications
-      represent, as a whole, an original work of authorship. For the purposes
-      of this License, Derivative Works shall not include works that remain
-      separable from, or merely link (or bind by name) to the interfaces of,
-      the Work and Derivative Works thereof.
-
-      "Contribution" shall mean any work of authorship, including
-      the original version of the Work and any modifications or additions
-      to that Work or Derivative Works thereof, that is intentionally
-      submitted to Licensor for inclusion in the Work by the copyright owner
-      or by an individual or Legal Entity authorized to submit on behalf of
-      the copyright owner. For the purposes of this definition, "submitted"
-      means any form of electronic, verbal, or written communication sent
-      to the Licensor or its representatives, including but not limited to
-      communication on electronic mailing lists, source code control systems,
-      and issue tracking systems that are managed by, or on behalf of, the
-      Licensor for the purpose of discussing and improving the Work, but
-      excluding communication that is conspicuously marked or otherwise
-      designated in writing by the copyright owner as "Not a Contribution."
-
-      "Contributor" shall mean Licensor and any individual or Legal Entity
-      on behalf of whom a Contribution has been received by Licensor and
-      subsequently incorporated within the Work.
-
-   2. Grant of Copyright License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      copyright license to reproduce, prepare Derivative Works of,
-      publicly display, publicly perform, sublicense, and distribute the
-      Work and such Derivative Works in Source or Object form.
-
-   3. Grant of Patent License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      (except as stated in this section) patent license to make, have made,
-      use, offer to sell, sell, import, and otherwise transfer the Work,
-      where such license applies only to those patent claims licensable
-      by such Contributor that are necessarily infringed by their
-      Contribution(s) alone or by combination of their Contribution(s)
-      with the Work to which such Contribution(s) was submitted. If You
-      institute patent litigation against any entity (including a
-      cross-claim or counterclaim in a lawsuit) alleging that the Work
-      or a Contribution incorporated within the Work constitutes direct
-      or contributory patent infringement, then any patent licenses
-      granted to You under this License for that Work shall terminate
-      as of the date such litigation is filed.
-
-   4. Redistribution. You may reproduce and distribute copies of the
-      Work or Derivative Works thereof in any medium, with or without
-      modifications, and in Source or Object form, provided that You
-      meet the following conditions:
-
-      (a) You must give any other recipients of the Work or
-          Derivative Works a copy of this License; and
-
-      (b) You must cause any modified files to carry prominent notices
-          stating that You changed the files; and
-
-      (c) You must retain, in the Source form of any Derivative Works
-          that You distribute, all copyright, patent, trademark, and
-          attribution notices from the Source form of the Work,
-          excluding those notices that do not pertain to any part of
-          the Derivative Works; and
-
-      (d) If the Work includes a "NOTICE" text file as part of its
-          distribution, then any Derivative Works that You distribute must
-          include a readable copy of the attribution notices contained
-          within such NOTICE file, excluding those notices that do not
-          pertain to any part of the Derivative Works, in at least one
-          of the following places: within a NOTICE text file distributed
-          as part of the Derivative Works; within the Source form or
-          documentation, if provided along with the Derivative Works; or,
-          within a display generated by the Derivative Works, if and
-          wherever such third-party notices normally appear. The contents
-          of the NOTICE file are for informational purposes only and
-          do not modify the License. You may add Your own attribution
-          notices within Derivative Works that You distribute, alongside
-          or as an addendum to the NOTICE text from the Work, provided
-          that such additional attribution notices cannot be construed
-          as modifying the License.
-
-      You may add Your own copyright statement to Your modifications and
-      may provide additional or different license terms and conditions
-      for use, reproduction, or distribution of Your modifications, or
-      for any such Derivative Works as a whole, provided Your use,
-      reproduction, and distribution of the Work otherwise complies with
-      the conditions stated in this License.
-
-   5. Submission of Contributions. Unless You explicitly state otherwise,
-      any Contribution intentionally submitted for inclusion in the Work
-      by You to the Licensor shall be under the terms and conditions of
-      this License, without any additional terms or conditions.
-      Notwithstanding the above, nothing herein shall supersede or modify
-      the terms of any separate license agreement you may have executed
-      with Licensor regarding such Contributions.
-
-   6. Trademarks. This License does not grant permission to use the trade
-      names, trademarks, service marks, or product names of the Licensor,
-      except as required for reasonable and customary use in describing the
-      origin of the Work and reproducing the content of the NOTICE file.
-
-   7. Disclaimer of Warranty. Unless required by applicable law or
-      agreed to in writing, Licensor provides the Work (and each
-      Contributor provides its Contributions) on an "AS IS" BASIS,
-      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
-      implied, including, without limitation, any warranties or conditions
-      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
-      PARTICULAR PURPOSE. You are solely responsible for determining the
-      appropriateness of using or redistributing the Work and assume any
-      risks associated with Your exercise of permissions under this License.
-
-   8. Limitation of Liability. In no event and under no legal theory,
-      whether in tort (including negligence), contract, or otherwise,
-      unless required by applicable law (such as deliberate and grossly
-      negligent acts) or agreed to in writing, shall any Contributor be
-      liable to You for damages, including any direct, indirect, special,
-      incidental, or consequential damages of any character arising as a
-      result of this License or out of the use or inability to use the
-      Work (including but not limited to damages for loss of goodwill,
-      work stoppage, computer failure or malfunction, or any and all
-      other commercial damages or losses), even if such Contributor
-      has been advised of the possibility of such damages.
-
-   9. Accepting Warranty or Additional Liability. While redistributing
-      the Work or Derivative Works thereof, You may choose to offer,
-      and charge a fee for, acceptance of support, warranty, indemnity,
-      or other liability obligations and/or rights consistent with this
-      License. However, in accepting such obligations, You may act only
-      on Your own behalf and on Your sole responsibility, not on behalf
-      of any other Contributor, and only if You agree to indemnify,
-      defend, and hold each Contributor harmless for any liability
-      incurred by, or claims asserted against, such Contributor by reason
-      of your accepting any such warranty or additional liability.
-
-   END OF TERMS AND CONDITIONS
-
-   APPENDIX: How to apply the Apache License to your work.
-
-      To apply the Apache License to your work, attach the following
-      boilerplate notice, with the fields enclosed by brackets "[]"
-      replaced with your own identifying information. (Don't include
-      the brackets!)  The text should be enclosed in the appropriate
-      comment syntax for the file format. We also recommend that a
-      file or class name and description of purpose be included on the
-      same "printed page" as the copyright notice for easier
-      identification within third-party archives.
-
-   Copyright 2021 The Institute for Ethical AI & Machine Learning
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
-
--- a/kompute/Makefile
+++ b/kompute/Makefile
@@ -1,210 +0,0 @@
-# This makefile is optimized to be run from WSL and to interact with the 
-# Windows host as there are limitations when building GPU programs. This
-# makefile contains the commands for interacting with the visual studio
-# build via command line for faster iterations, as the intention is to 
-# support other editors (optimised for vim). There are also commands that
-# support the builds for linux-native compilations and these are the commands
-# starting with mk_.
-
-VERSION := $(shell cat ./VERSION)
-
-VCPKG_WIN_PATH ?= "C:\\Users\\axsau\\Programming\\lib\\vcpkg\\scripts\\buildsystems\\vcpkg.cmake"
-VCPKG_UNIX_PATH ?= "/c/Users/axsau/Programming/lib/vcpkg/scripts/buildsystems/vcpkg.cmake"
-
-# These are the tests that don't work with swiftshader but can be run directly with vulkan
-FILTER_TESTS ?= "-TestAsyncOperations.TestManagerParallelExecution:TestSequence.SequenceTimestamps:TestPushConstants.TestConstantsDouble"
-
-ifeq ($(OS),Windows_NT)     # is Windows_NT on XP, 2000, 7, Vista, 10...
-	CMAKE_BIN ?= "C:\Program Files\CMake\bin\cmake.exe"
-	SCMP_BIN="C:\\VulkanSDK\\1.2.141.2\\Bin32\\glslangValidator.exe"
-	MSBUILD_BIN ?= "C:\\Program Files (x86)\\Microsoft Visual Studio\\2019\\Community\\MSBuild\\Current\\Bin\\MSBuild.exe"
-else
-	CLANG_FORMAT_BIN ?= "/home/alejandro/Programming/lib/clang+llvm-10.0.0-x86_64-linux-gnu-ubuntu-18.04/bin/clang-format"
-	CMAKE_BIN ?= "/c/Program Files/CMake/bin/cmake.exe"
-	MSBUILD_BIN ?= "/c/Program Files (x86)/Microsoft Visual Studio/2019/Community/MSBuild/Current/Bin/MSBuild.exe"
-	# Choosing the binary based on whether it's on WSL or linux-native
-	KERNEL := $(shell uname -r)
-	IS_WSL := $(shell (if [[ "$(KERNEL)" =~ Microsoft$  ]]; then echo '0'; fi))
-	ifeq ($(IS_WSL),0)
-		SCMP_BIN ?= "/c/VulkanSDK/1.2.141.2/Bin32/glslangValidator.exe"
-	else
-		SCMP_BIN ?= "/usr/bin/glslangValidator"
-	endif
-endif
-
-
-####### Main Target Rules #######
-
-push_docs_to_ghpages:
-	GIT_DEPLOY_DIR="build/docs/sphinx/" \
-		GIT_DEPLOY_BRANCH="gh-pages" \
-		GIT_DEPLOY_REPO="origin" \
-			./scripts/push_folder_to_branch.sh
-
-####### CMAKE quickstart commands #######
-
-clean_cmake:
-	rm -rf build/
-
-####### Visual studio build shortcut commands #######
-
-MK_BUILD_TYPE ?= "Release"
-MK_INSTALL_PATH ?= "build/src/CMakeFiles/Export/" # Set to "" if prefer default
-MK_CMAKE_EXTRA_FLAGS ?= ""
-MK_KOMPUTE_EXTRA_CXX_FLAGS ?= ""
-
-mk_cmake:
-	cmake \
-		-Bbuild \
-		-DCMAKE_CXX_FLAGS=$(MK_KOMPUTE_EXTRA_CXX_FLAGS) \
-		-DCMAKE_BUILD_TYPE=$(MK_BUILD_TYPE) \
-		-DCMAKE_INSTALL_PREFIX=$(MK_INSTALL_PATH) \
-		-DKOMPUTE_OPT_INSTALL=ON \
-		-DKOMPUTE_OPT_BUILD_TESTS=ON \
-		-DKOMPUTE_OPT_BUILD_DOCS=ON \
-		-DKOMPUTE_OPT_BUILD_SHADERS=ON \
-		-DKOMPUTE_OPT_CODE_COVERAGE=ON \
-		-DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
-		-DKOMPUTE_OPT_LOG_LEVEL=Debug \
-		$(MK_CMAKE_EXTRA_FLAGS) \
-		-G "Unix Makefiles"
-
-mk_build_all:
-	cmake --build build/. --parallel
-
-mk_build_docs:
-	cmake --build build/. --target gendocsall --parallel
-
-mk_build_kompute:
-	cmake --build build/. --target kompute --parallel
-
-mk_build_tests:
-	cmake --build build/. --target kompute_tests --parallel
-
-mk_run_docs: mk_build_docs
-	(cd build/docs/sphinx && python2.7 -m SimpleHTTPServer)
-
-# An alternative would be: ctest -vv --test-dir build/.
-# But this is not possible since we need to filter specific tests, not complete executables, which is not possible with ctest.
-# https://gitlab.kitware.com/cmake/cmake/-/issues/13168 
-mk_run_tests: mk_build_tests
-	./build/bin/kompute_tests --gtest_filter=$(FILTER_TESTS)
-
-mk_build_swiftshader_library:
-	git clone https://github.com/google/swiftshader || echo "Assuming already cloned"
-	# GCC 8 or above is required otherwise error on "filesystem" lib will appear
-	CC="/usr/bin/gcc-8" CXX="/usr/bin/g++-8" cmake swiftshader/. -Bswiftshader/build/
-	cmake --build swiftshader/build/. --parallel
-
-mk_run_tests_cpu: export VK_ICD_FILENAMES=$(PWD)/swiftshader/build/vk_swiftshader_icd.json
-mk_run_tests_cpu: mk_build_swiftshader_library mk_build_tests mk_run_tests_cpu_only
-
-
-####### Visual studio build shortcut commands #######
-
-VS_BUILD_TYPE ?= "Debug"
-# Run with multiprocessin / parallel build by default
-VS_CMAKE_EXTRA_FLAGS ?= ""
-VS_KOMPUTE_EXTRA_CXX_FLAGS ?= ""
-VS_INSTALL_PATH ?= "build/src/CMakeFiles/Export/" # Set to "" if prefer default
-
-vs_cmake:
-	$(CMAKE_BIN) \
-		-Bbuild \
-		$(VS_CMAKE_EXTRA_FLAGS) \
-		-DCMAKE_TOOLCHAIN_FILE=$(VCPKG_WIN_PATH) \
-		-DCMAKE_CXX_FLAGS=$(VS_KOMPUTE_EXTRA_CXX_FLAGS) \
-		-DCMAKE_INSTALL_PREFIX=$(VS_INSTALL_PATH) \
-		-DKOMPUTE_OPT_INSTALL=ON \
-		-DKOMPUTE_OPT_BUILD_TESTS=ON \
-		-DKOMPUTE_OPT_BUILD_SHADERS=ON \
-		-DKOMPUTE_OPT_CODE_COVERAGE=OFF \
-		-DKOMPUTE_OPT_BUILD_DOCS=OFF \
-		-G "Visual Studio 16 2019" \
-		-DCMAKE_BUILD_TYPE=$(VS_BUILD_TYPE)
-
-vs_build_all:
-	cmake --build build/. --parallel
-
-vs_build_docs:
-	cmake --build build/. --target gendocsall --parallel
-
-vs_install_kompute:
-	cmake --build build/. --target install --parallel
-
-vs_build_kompute:
-	cmake --build build/. --target kompute --parallel
-
-vs_build_tests:
-	cmake --build build/. --target kompute_tests --parallel
-
-vs_run_docs: vs_build_docs
-	(cd build/docs/sphinx && python2.7 -m SimpleHTTPServer)
-
-vs_run_tests: vs_build_tests
-	./build/test/$(VS_BUILD_TYPE)/bin/kompute_tests.exe --gtest_filter=$(FILTER_TESTS)
-
-
-#### PYTHONG ####
-
-test_python:
-	python3 -m pytest -s --log-cli-level=DEBUG -v python/test/
-
-####### Run CI Commands #######
-
-# This command uses act to replicate github action
-# https://github.com/nektos/act
-run_ci:
-	act
-
-####### General project commands #######
-
-generate_python_docstrings:
-	python -m pybind11_mkdoc \
-		-o python/src/docstrings.hpp \
-		kompute/Kompute.hpp \
-		-Iexternal/fmt/include/ \
-		-Iexternal/spdlog/include/ \
-		-Iexternal/glslang/ \
-		-I/usr/include/c++/7.5.0/
-
-install_python_reqs:
-	python3 -m pip install -r scripts/requirements.txt
-
-install_lcov:
-	sudo apt install lcov -y
-
-build_shaders:
-	python3 scripts/convert_shaders.py \
-		--shader-path shaders/glsl \
-		--shader-binary $(SCMP_BIN) \
-		--header-path src/include/kompute/shaders/ \
-		-v
-	python3 scripts/convert_shaders.py \
-		--shader-path test/shaders/glsl \
-		--shader-binary $(SCMP_BIN) \
-		--header-path test/compiled_shaders_include/kompute_test/shaders/ \
-		-v
-
-build_single_header:
-	quom \
-		--include_directory \
-		"src/include/" \
-		"single_include/AggregateHeaders.cpp" \
-		"single_include/kompute/Kompute.hpp"
-
-win_build_xxd:
-	cd external/bin/ && gcc.exe -o xxd.exe xxd.c -DCYGWIN
-
-format:
-	for val in "examples single_include src test" ; do \
-    	find $$val -depth -iname *.h -or -iname *.c -or -iname *.hpp -or -iname *.cpp | grep -v "shaders" | xargs $(CLANG_FORMAT_BIN) -style=file -i; \
-	done
-
-static_scan:
-	cppcheck --project=build/compile_commands.json -iexternal/
-
-build_changelog:
-	docker run --rm -it -v "$(PWD)":/usr/local/src/your-app -e CHANGELOG_GITHUB_TOKEN=${CHANGELOG_GITHUB_TOKEN} ferrarimarco/github-changelog-generator:1.15.2 -u KomputeProject -p kompute
-	chmod 664 CHANGELOG.md # (Read+Write, Read+Write, Read)
-	sed -i -e 's/\(HEAD\|Unreleased\)/v${VERSION}/g' CHANGELOG.md # Replacing unreleased version with latest tag
--- a/kompute/README.md
+++ b/kompute/README.md
@@ -1,513 +0,0 @@
-
-![GitHub](https://img.shields.io/badge/Version-0.7.0-green.svg)
-![GitHub](https://img.shields.io/badge/C++-14—20-purple.svg)
-![GitHub](https://img.shields.io/badge/Build-cmake-red.svg)
-![GitHub](https://img.shields.io/badge/Python-3.7—3.9-blue.svg)
-![GitHub](https://img.shields.io/badge/License-Apache-black.svg)
-[![CII Best Practices](https://bestpractices.coreinfrastructure.org/projects/4834/badge)](https://bestpractices.coreinfrastructure.org/projects/4834)
-
-<table>
-<tr>
-
-<td width="20%">
-<img src="https://raw.githubusercontent.com/KomputeProject/kompute/master/docs/images/kompute.jpg">
-</td>
-
-<td>
-
-<h1>Kompute</h1>
-<h3>The general purpose GPU compute framework for cross vendor graphics cards (AMD, Qualcomm, NVIDIA & friends)</h3>
-
-</td>
-
-</tr>
-</table>
-
-<h4>Blazing fast, mobile-enabled, asynchronous, and optimized for advanced GPU acceleration usecases.</h4>
-
-💬 [Join the Discord & Community Calls](https://kompute.cc/overview/community.html) 🔋 [Documentation](https://kompute.cc) 💻 [Blog Post](https://medium.com/@AxSaucedo/machine-learning-and-data-processing-in-the-gpu-with-vulkan-kompute-c9350e5e5d3a) ⌨ [Examples](#more-examples) 💾
-
-<hr>
-
-##### Kompute is backed by the Linux Foundation as a <a href="https://lfaidata.foundation/blog/2021/08/26/kompute-joins-lf-ai-data-as-new-sandbox-project/">hosted project</a> by the LF AI & Data Foundation.
-
-<table>
-<tr>
-<td>
-<a href="https://www.linuxfoundation.org/projects/">
-<img src="https://upload.wikimedia.org/wikipedia/commons/b/b5/Linux_Foundation_logo.png">
-</a>
-</td>
-<td>
-<a href="https://lfaidata.foundation/projects/">
-<img src="https://raw.githubusercontent.com/lfai/artwork/main/lfaidata-assets/lfaidata/horizontal/color/lfaidata-horizontal-color.png">
-</a>
-</td>
-</tr>
-</table>
-
-
-## Principles & Features
-
-* [Flexible Python module](#your-first-kompute-python) with [C++ SDK](#your-first-kompute-c) for optimizations
-* [Asynchronous & parallel processing](#asynchronous-and-parallel-operations) support through GPU family queues
-* [Mobile enabled](#mobile-enabled) with examples via Android NDK across several architectures
-* BYOV: [Bring-your-own-Vulkan design](#motivations) to play nice with existing Vulkan applications
-* Explicit relationships for GPU and host [memory ownership and memory management](https://kompute.cc/overview/memory-management.html)
-* Robust codebase with [90% unit test code coverage](https://kompute.cc/codecov/)
-* Advanced use-cases on [machine learning 🤖](https://towardsdatascience.com/machine-learning-and-data-processing-in-the-gpu-with-vulkan-kompute-c9350e5e5d3a), [mobile development 📱](https://towardsdatascience.com/gpu-accelerated-machine-learning-in-your-mobile-applications-using-the-android-ndk-vulkan-kompute-1e9da37b7617) and [game development 🎮](https://towardsdatascience.com/supercharging-game-development-with-gpu-accelerated-ml-using-vulkan-kompute-the-godot-game-engine-4e75a84ea9f0).
-* Active community with [monthly calls, discord chat and more](https://kompute.cc/overview/community.html)
-
-![](https://raw.githubusercontent.com/KomputeProject/kompute/master/docs/images/komputer-logos.gif)
-
-## Getting Started
-
-Below you can find a GPU multiplication example using the C++ and Python Kompute interfaces.
-
-You can [join the Discord](https://discord.gg/MaH5Jv5zwv) for questions / discussion, open a [github issue](https://github.com/KomputeProject/kompute/issues/new), or read [the documentation](https://kompute.cc/).
-
-### Your First Kompute (C++)
-
-The C++ interface provides low level access to the native components of Kompute, enabling for [advanced optimizations](https://kompute.cc/overview/async-parallel.html) as well as [extension of components](https://kompute.cc/overview/reference.html).
-
-```c++
-
-void kompute(const std::string& shader) {
-
-    // 1. Create Kompute Manager with default settings (device 0, first queue and no extensions)
-    kp::Manager mgr; 
-
-    // 2. Create and initialise Kompute Tensors through manager
-
-    // Default tensor constructor simplifies creation of float values
-    auto tensorInA = mgr.tensor({ 2., 2., 2. });
-    auto tensorInB = mgr.tensor({ 1., 2., 3. });
-    // Explicit type constructor supports uint32, int32, double, float and bool
-    auto tensorOutA = mgr.tensorT<uint32_t>({ 0, 0, 0 });
-    auto tensorOutB = mgr.tensorT<uint32_t>({ 0, 0, 0 });
-
-    std::vector<std::shared_ptr<kp::Tensor>> params = {tensorInA, tensorInB, tensorOutA, tensorOutB};
-
-    // 3. Create algorithm based on shader (supports buffers & push/spec constants)
-    kp::Workgroup workgroup({3, 1, 1});
-    std::vector<float> specConsts({ 2 });
-    std::vector<float> pushConstsA({ 2.0 });
-    std::vector<float> pushConstsB({ 3.0 });
-
-    auto algorithm = mgr.algorithm(params,
-                                   // See documentation shader section for compileSource
-                                   compileSource(shader),
-                                   workgroup,
-                                   specConsts,
-                                   pushConstsA);
-
-    // 4. Run operation synchronously using sequence
-    mgr.sequence()
-        ->record<kp::OpTensorSyncDevice>(params)
-        ->record<kp::OpAlgoDispatch>(algorithm) // Binds default push consts
-        ->eval() // Evaluates the two recorded operations
-        ->record<kp::OpAlgoDispatch>(algorithm, pushConstsB) // Overrides push consts
-        ->eval(); // Evaluates only last recorded operation
-
-    // 5. Sync results from the GPU asynchronously
-    auto sq = mgr.sequence();
-    sq->evalAsync<kp::OpTensorSyncLocal>(params);
-
-    // ... Do other work asynchronously whilst GPU finishes
-
-    sq->evalAwait();
-
-    // Prints the first output which is: { 4, 8, 12 }
-    for (const float& elem : tensorOutA->vector()) std::cout << elem << "  ";
-    // Prints the second output which is: { 10, 10, 10 }
-    for (const float& elem : tensorOutB->vector()) std::cout << elem << "  ";
-
-} // Manages / releases all CPU and GPU memory resources
-
-int main() {
-
-    // Define a raw string shader (or use the Kompute tools to compile to SPIRV / C++ header
-    // files). This shader shows some of the main components including constants, buffers, etc
-    std::string shader = (R"(
-        #version 450
-
-        layout (local_size_x = 1) in;
-
-        // The input tensors bind index is relative to index in parameter passed
-        layout(set = 0, binding = 0) buffer buf_in_a { float in_a[]; };
-        layout(set = 0, binding = 1) buffer buf_in_b { float in_b[]; };
-        layout(set = 0, binding = 2) buffer buf_out_a { uint out_a[]; };
-        layout(set = 0, binding = 3) buffer buf_out_b { uint out_b[]; };
-
-        // Kompute supports push constants updated on dispatch
-        layout(push_constant) uniform PushConstants {
-            float val;
-        } push_const;
-
-        // Kompute also supports spec constants on initalization
-        layout(constant_id = 0) const float const_one = 0;
-
-        void main() {
-            uint index = gl_GlobalInvocationID.x;
-            out_a[index] += uint( in_a[index] * in_b[index] );
-            out_b[index] += uint( const_one * push_const.val );
-        }
-    )");
-
-    // Run the function declared above with our raw string shader
-    kompute(shader);
-}
-
-```
-
-### Your First Kompute (Python)
-
-The [Python package](https://kompute.cc/overview/python-package.html) provides a [high level interactive interface](https://kompute.cc/overview/python-reference.html) that enables for experimentation whilst ensuring high performance and fast development workflows.
-
-```python
-
-from .utils import compile_source # using util function from python/test/utils
-
-def kompute(shader):
-    # 1. Create Kompute Manager with default settings (device 0, first queue and no extensions)
-    mgr = kp.Manager()
-
-    # 2. Create and initialise Kompute Tensors through manager
-
-    # Default tensor constructor simplifies creation of float values
-    tensor_in_a = mgr.tensor([2, 2, 2])
-    tensor_in_b = mgr.tensor([1, 2, 3])
-    # Explicit type constructor supports uint32, int32, double, float and bool
-    tensor_out_a = mgr.tensor_t(np.array([0, 0, 0], dtype=np.uint32))
-    tensor_out_b = mgr.tensor_t(np.array([0, 0, 0], dtype=np.uint32))
-
-    params = [tensor_in_a, tensor_in_b, tensor_out_a, tensor_out_b]
-
-    # 3. Create algorithm based on shader (supports buffers & push/spec constants)
-    workgroup = (3, 1, 1)
-    spec_consts = [2]
-    push_consts_a = [2]
-    push_consts_b = [3]
-
-    # See documentation shader section for compile_source
-    spirv = compile_source(shader)
-
-    algo = mgr.algorithm(params, spirv, workgroup, spec_consts, push_consts_a)
-
-    # 4. Run operation synchronously using sequence
-    (mgr.sequence()
-        .record(kp.OpTensorSyncDevice(params))
-        .record(kp.OpAlgoDispatch(algo)) # Binds default push consts provided
-        .eval() # evaluates the two recorded ops
-        .record(kp.OpAlgoDispatch(algo, push_consts_b)) # Overrides push consts
-        .eval()) # evaluates only the last recorded op
-
-    # 5. Sync results from the GPU asynchronously
-    sq = mgr.sequence()
-    sq.eval_async(kp.OpTensorSyncLocal(params))
-
-    # ... Do other work asynchronously whilst GPU finishes
-
-    sq.eval_await()
-
-    # Prints the first output which is: { 4, 8, 12 }
-    print(tensor_out_a)
-    # Prints the first output which is: { 10, 10, 10 }
-    print(tensor_out_b)
-
-if __name__ == "__main__":
-
-    # Define a raw string shader (or use the Kompute tools to compile to SPIRV / C++ header
-    # files). This shader shows some of the main components including constants, buffers, etc
-    shader = """
-        #version 450
-
-        layout (local_size_x = 1) in;
-
-        // The input tensors bind index is relative to index in parameter passed
-        layout(set = 0, binding = 0) buffer buf_in_a { float in_a[]; };
-        layout(set = 0, binding = 1) buffer buf_in_b { float in_b[]; };
-        layout(set = 0, binding = 2) buffer buf_out_a { uint out_a[]; };
-        layout(set = 0, binding = 3) buffer buf_out_b { uint out_b[]; };
-
-        // Kompute supports push constants updated on dispatch
-        layout(push_constant) uniform PushConstants {
-            float val;
-        } push_const;
-
-        // Kompute also supports spec constants on initalization
-        layout(constant_id = 0) const float const_one = 0;
-
-        void main() {
-            uint index = gl_GlobalInvocationID.x;
-            out_a[index] += uint( in_a[index] * in_b[index] );
-            out_b[index] += uint( const_one * push_const.val );
-        }
-    """
-
-    kompute(shader)
-
-```
-
-### Interactive Notebooks & Hands on Videos
-
-You are able to try out the interactive Colab Notebooks which allow you to use a free GPU. The available examples are the Python and C++ examples below:
-
-<table>
-<tr>
-
-<td width="50%">
-<h5>Try the interactive <a href="https://colab.research.google.com/drive/1l3hNSq2AcJ5j2E3YIw__jKy5n6M615GP?usp=sharing">C++ Colab</a> from <a href="https://towardsdatascience.com/machine-learning-and-data-processing-in-the-gpu-with-vulkan-kompute-c9350e5e5d3a">Blog Post</a></h5>
-</td>
-
-<td>
-<h5>Try the interactive <a href="https://colab.research.google.com/drive/15uQ7qMZuOyk8JcXF-3SB2R5yNFW21I4P">Python Colab</a> from <a href="https://towardsdatascience.com/beyond-cuda-gpu-accelerated-python-for-machine-learning-in-cross-vendor-graphics-cards-made-simple-6cc828a45cc3">Blog Post</a></h5>
-</td>
-
-</tr>
-<tr>
-
-<td width="50%">
-<a href="https://colab.research.google.com/drive/1l3hNSq2AcJ5j2E3YIw__jKy5n6M615GP?authuser=1#scrollTo=1BipBsO-fQRD">
-<img src="https://raw.githubusercontent.com/KomputeProject/kompute/master/docs/images/binder-cpp.jpg">
-</a>
-</td>
-
-<td>
-<a href="https://colab.research.google.com/drive/15uQ7qMZuOyk8JcXF-3SB2R5yNFW21I4P">
-<img src="https://raw.githubusercontent.com/KomputeProject/kompute/master/docs/images/binder-python.jpg">
-</a>
-</td>
-
-</tr>
-</table>
-
-
-You can also check out the two following talks presented at the FOSDEM 2021 conference. 
-
-Both videos have timestamps which will allow you to skip to the most relevant section for you - the intro & motivations for both is almost the same so you can skip to the more specific content.
-
-<table>
-<tr>
-
-<td width="50%">
-<h5>Watch the video for <a href="https://www.youtube.com/watch?v=Xz4fiQNmGSA">C++ Enthusiasts</a> </h5>
-</td>
-
-<td>
-<h5>Watch the video for <a href="https://www.youtube.com/watch?v=AJRyZ09IUdg">Python & Machine Learning</a> Enthusiasts</h5>
-</td>
-
-</tr>
-<tr>
-
-<td width="50%">
-<a href="https://www.youtube.com/watch?v=Xz4fiQNmGSA">
-<img src="https://raw.githubusercontent.com/KomputeProject/kompute/master/docs/images/kompute-cpp-video.png">
-</a>
-</td>
-
-<td>
-<a href="https://www.youtube.com/watch?v=AJRyZ09IUdg">
-<img src="https://raw.githubusercontent.com/KomputeProject/kompute/master/docs/images/kompute-python-video.png">
-</a>
-</td>
-
-</tr>
-</table>
-
-
-## Architectural Overview
-
-The core architecture of Kompute includes the following:
-* [Kompute Manager](https://kompute.cc/overview/reference.html#manager) - Base orchestrator which creates and manages device and child components
-* [Kompute Sequence](https://kompute.cc/overview/reference.html#sequence) - Container of operations that can be sent to GPU as batch
-* [Kompute Operation (Base)](https://kompute.cc/overview/reference.html#algorithm) - Base class from which all operations inherit
-* [Kompute Tensor](https://kompute.cc/overview/reference.html#tensor) - Tensor structured data used in GPU operations
-* [Kompute Algorithm](https://kompute.cc/overview/reference.html#algorithm) - Abstraction for (shader) logic executed in the GPU
-
-To see a full breakdown you can read further in the [C++ Class Reference](https://kompute.cc/overview/reference.html).
-
-<table>
-<th>
-Full Architecture
-</th>
-<th>
-Simplified Kompute Components
-</th>
-<tr>
-<td width=30%>
-
-
-<img width="100%" src="https://raw.githubusercontent.com/KomputeProject/kompute/master/docs/images/kompute-vulkan-architecture.jpg">
-
-<br>
-<br>
-(very tiny, check the <a href="https://ethicalml.github.io/vulkan-kompute/overview/reference.html">full reference diagram in docs for details</a>)
-<br>
-<br>
-
-<img width="100%" src="https://raw.githubusercontent.com/KomputeProject/kompute/master/docs/images/suspicious.jfif">
-
-</td>
-<td>
-<img width="100%" src="https://raw.githubusercontent.com/KomputeProject/kompute/master/docs/images/kompute-architecture.jpg">
-</td>
-</tr>
-</table>
-
-
-## Asynchronous and Parallel Operations
-
-Kompute provides flexibility to run operations in an asynrchonous way through vk::Fences. Furthermore, Kompute enables for explicit allocation of queues, which allow for parallel execution of operations across queue families.
-
-The image below provides an intuition on how Kompute Sequences can be allocated to different queues to enable parallel execution based on hardware. You can see the [hands on example](https://kompute.cc/overview/advanced-examples.html#parallel-operations), as well as the [detailed documentation page](https://kompute.cc/overview/async-parallel.html) describing how it would work using an NVIDIA 1650 as an example. 
-
-![](https://raw.githubusercontent.com/KomputeProject/kompute/master/docs/images/queue-allocation.jpg)
-
-## Mobile Enabled
-
-Kompute has been optimized to work in mobile environments. The [build system](#build-overview) enables for dynamic loading of the Vulkan shared library for Android environments, together with a working [Android NDK wrapper](https://github.com/KomputeProject/kompute/tree/master/vk_ndk_wrapper_include) for the CPP headers.
-
-<table>
-<tr>
-
-<td width="70%">
-<p>
-For a full deep dive you can read the blog post "<a href="https://towardsdatascience.com/gpu-accelerated-machine-learning-in-your-mobile-applications-using-the-android-ndk-vulkan-kompute-1e9da37b7617">Supercharging your Mobile Apps with On-Device GPU Accelerated Machine Learning</a>". 
-
-You can also access the <a href="https://github.com/KomputeProject/kompute/tree/v0.4.0/examples/android/android-simple">end-to-end example code</a> in the repository, which can be run using android studio.
-
-</p>
-
-
-<img src="https://raw.githubusercontent.com/KomputeProject/kompute/android-example/docs/images/android-editor.jpg">
-
-</td>
-
-
-<td width="30%">
-<img src="https://raw.githubusercontent.com/KomputeProject/kompute/android-example/docs/images/android-kompute.jpg">
-</td>
-
-</tr>
-</table>
-
-## More examples
-
-### Simple examples
-
-* [Simple multiplication example](https://kompute.cc/overview/advanced-examples.html#simple-shader-example)
-* [Record batch commands with a Kompute Sequence](https://kompute.cc/overview/advanced-examples.html#record-batch-commands)
-* [Run Asynchronous Operations](https://kompute.cc/overview/advanced-examples.html#asynchronous-operations)
-* [Run Parallel Operations Across Multiple GPU Queues](https://kompute.cc/overview/advanced-examples.html#parallel-operations)
-* [Create your custom Kompute Operations](https://kompute.cc/overview/advanced-examples.html#your-custom-kompute-operation)
-* [Implementing logistic regression from scratch](https://kompute.cc/overview/advanced-examples.html#logistic-regression-example)
-
-### End-to-end examples
-
-* [Machine Learning Logistic Regression Implementation](https://towardsdatascience.com/machine-learning-and-data-processing-in-the-gpu-with-vulkan-kompute-c9350e5e5d3a)
-* [Parallelizing GPU-intensive Workloads via Multi-Queue Operations](https://towardsdatascience.com/parallelizing-heavy-gpu-workloads-via-multi-queue-operations-50a38b15a1dc)
-* [Android NDK Mobile Kompute ML Application](https://towardsdatascience.com/gpu-accelerated-machine-learning-in-your-mobile-applications-using-the-android-ndk-vulkan-kompute-1e9da37b7617)
-* [Game Development Kompute ML in Godot Engine](https://towardsdatascience.com/supercharging-game-development-with-gpu-accelerated-ml-using-vulkan-kompute-the-godot-game-engine-4e75a84ea9f0)
-
-## Python Package
-
-Besides the C++ core SDK you can also use the Python package of Kompute, which exposes the same core functionality, and supports interoperability with Python objects like Lists, Numpy Arrays, etc.
-
-The only dependencies are Python 3.5+ and Cmake 3.4.1+. You can install Kompute from the [Python pypi package](https://pypi.org/project/kp/) using the following command.
-
-```
-pip install kp
-```
-
-You can also install from master branch using:
-
-```
-pip install git+git://github.com/KomputeProject/kompute.git@master
-```
-
-For further details you can read the [Python Package documentation](https://kompute.cc/overview/python-package.html) or the [Python Class Reference documentation](https://kompute.cc/overview/python-reference.html).
-
-## C++ Build Overview
-
-The build system provided uses `cmake`, which allows for cross platform builds.
-
-The top level `Makefile` provides a set of optimized configurations for development as well as the docker image build, but you can start a build with the following command:
-
-```
-   cmake -Bbuild
-```
-
-You also are able to add Kompute in your repo with `add_subdirectory` - the [Android example CMakeLists.txt file](https://github.com/KomputeProject/kompute/blob/7c8c0eeba2cdc098349fcd999102bb2cca1bf711/examples/android/android-simple/app/src/main/cpp/CMakeLists.txt#L3) shows how this would be done.
-
-For a more advanced overview of the build configuration check out the [Build System Deep Dive](https://kompute.cc/overview/build-system.html) documentation.
-
-## Kompute Development
-
-We appreciate PRs and Issues. If you want to contribute try checking the "Good first issue" tag, but even using Kompute and reporting issues is a great contribution!
-
-### Contributing
-
-#### Dev Dependencies
-
-* Testing
-    + GTest
-* Documentation
-    + Doxygen (with Dot)
-    + Sphynx
-
-#### Development
-
-* Follows Mozilla C++ Style Guide https://www-archive.mozilla.org/hacking/mozilla-style-guide.html
-    + Uses post-commit hook to run the linter, you can set it up so it runs the linter before commit
-    + All dependencies are defined in vcpkg.json 
-* Uses cmake as build system, and provides a top level makefile with recommended command
-* Uses xxd (or xxd.exe windows 64bit port) to convert shader spirv to header files
-* Uses doxygen and sphinx for documentation and autodocs
-* Uses vcpkg for finding the dependencies, it's the recommended set up to retrieve the libraries
-
-If you want to run with debug layers you can add them with the `KOMPUTE_ENV_DEBUG_LAYERS` parameter as:
-
-```
-export KOMPUTE_ENV_DEBUG_LAYERS="VK_LAYER_LUNARG_api_dump"
-```
-
-##### Updating documentation
-
-To update the documentation you will need to:
-* Run the gendoxygen target in the build system
-* Run the gensphynx target in the build-system 
-* Push to github pages with `make push_docs_to_ghpages`
-
-##### Running tests
-
-Running the unit tests has been significantly simplified for contributors.
-
-The tests run on CPU, and can be triggered using the ACT command line interface (https://github.com/nektos/act) - once you install the command line (And start the Docker daemon) you just have to type:
-
-```
-$ act
-
-[Python Tests/python-tests] 🚀  Start image=axsauze/kompute-builder:0.2
-[C++ Tests/cpp-tests      ] 🚀  Start image=axsauze/kompute-builder:0.2
-[C++ Tests/cpp-tests      ]   🐳  docker run image=axsauze/kompute-builder:0.2 entrypoint=["/usr/bin/tail" "-f" "/dev/null"] cmd=[]
-[Python Tests/python-tests]   🐳  docker run image=axsauze/kompute-builder:0.2 entrypoint=["/usr/bin/tail" "-f" "/dev/null"] cmd=[]
-...
-```
-
-The repository contains unit tests for the C++ and Python code, and can be found under the `test/` and `python/test` folder.
-
-The tests are currently run through the CI using Github Actions. It uses the images found in `docker-builders/`.
-
-In order to minimise hardware requirements the tests can run without a GPU, directly in the CPU using [Swiftshader](https://github.com/google/swiftshader).
-
-For more information on how the CI and tests are setup, you can go to the [CI, Docker and Tests Section](https://kompute.cc/overview/ci-tests.html) in the documentation.
-
-## Motivations
-
-This project started after seeing that a lot of new and renowned ML & DL projects like Pytorch, Tensorflow, Alibaba DNN, Tencent NCNN - among others - have either integrated or are looking to integrate the Vulkan SDK to add mobile (and cross-vendor) GPU support.
-
-The Vulkan SDK offers a great low level interface that enables for highly specialized optimizations - however it comes at a cost of highly verbose code which requires 500-2000 lines of code to even begin writing application code. This has resulted in each of these projects having to implement the same baseline to abstract the non-compute related features of the Vulkan SDK. This large amount of non-standardised boiler-plate can result in limited knowledge transfer, higher chance of unique framework implementation bugs being introduced, etc.
-
-We are currently developing Kompute not to hide the Vulkan SDK interface (as it's incredibly well designed) but to augment it with a direct focus on the Vulkan SDK's GPU computing capabilities. [This article](https://towardsdatascience.com/machine-learning-and-data-processing-in-the-gpu-with-vulkan-kompute-c9350e5e5d3a) provides a high level overview of the motivations of Kompute, together with a set of hands on examples that introduce both GPU computing as well as the core Kompute architecture.
--- a/kompute/cmake/bin2h.cmake
+++ b/kompute/cmake/bin2h.cmake
@@ -1,106 +0,0 @@
-##################################################################################
-# Based on: https://github.com/sivachandran/cmake-bin2h
-#
-# Copyright 2020 Sivachandran Paramasivam
-# 
-# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-# 
-# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-# 
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-##################################################################################
-
-include(CMakeParseArguments)
-
-# Function to wrap a given string into multiple lines at the given column position.
-# Parameters:
-#   VARIABLE    - The name of the CMake variable holding the string.
-#   AT_COLUMN   - The column position at which string will be wrapped.
-function(WRAP_STRING)
-    set(oneValueArgs VARIABLE AT_COLUMN)
-    cmake_parse_arguments(WRAP_STRING "${options}" "${oneValueArgs}" "" ${ARGN})
-
-    string(LENGTH ${${WRAP_STRING_VARIABLE}} stringLength)
-    math(EXPR offset "0")
-
-    while(stringLength GREATER 0)
-
-        if(stringLength GREATER ${WRAP_STRING_AT_COLUMN})
-            math(EXPR length "${WRAP_STRING_AT_COLUMN}")
-        else()
-            math(EXPR length "${stringLength}")
-        endif()
-
-        string(SUBSTRING ${${WRAP_STRING_VARIABLE}} ${offset} ${length} line)
-        set(lines "${lines}\n${line}")
-
-        math(EXPR stringLength "${stringLength} - ${length}")
-        math(EXPR offset "${offset} + ${length}")
-    endwhile()
-
-    set(${WRAP_STRING_VARIABLE} "${lines}" PARENT_SCOPE)
-endfunction()
-
-# Function to embed contents of a file as byte array in C/C++ header file(.h). The header file
-# will contain a byte array and integer variable holding the size of the array.
-# Parameters
-#   SOURCE_FILE      - The path of source file whose contents will be embedded in the header file.
-#   VARIABLE_NAME    - The name of the variable for the byte array. The string "_SIZE" will be append
-#                      to this name and will be used a variable name for size variable.
-#   HEADER_FILE      - The path of header file.
-#   APPEND           - If specified appends to the header file instead of overwriting it
-#   NULL_TERMINATE   - If specified a null byte(zero) will be append to the byte array. This will be
-#                      useful if the source file is a text file and we want to use the file contents
-#                      as string. But the size variable holds size of the byte array without this
-#                      null byte.
-#   HEADER_NAMESPACE - The namespace, where the array should be located in.
-#   IS_BIG_ENDIAN    - If set to true, will not revers the byte order for the uint32_t to match the
-#                      big endian system architecture
-# Usage:
-#   bin2h(SOURCE_FILE "Logo.png" HEADER_FILE "Logo.h" VARIABLE_NAME "LOGO_PNG")
-function(BIN2H)
-    set(options APPEND NULL_TERMINATE)
-    set(oneValueArgs SOURCE_FILE VARIABLE_NAME HEADER_FILE)
-    cmake_parse_arguments(BIN2H "${options}" "${oneValueArgs}" "" ${ARGN})
-
-    # reads source file contents as hex string
-    file(READ ${BIN2H_SOURCE_FILE} hexString HEX)
-    string(LENGTH ${hexString} hexStringLength)
-
-    # appends null byte if asked
-    if(BIN2H_NULL_TERMINATE)
-        set(hexString "${hexString}00")
-    endif()
-
-    # wraps the hex string into multiple lines at column 32(i.e. 16 bytes per line)
-    wrap_string(VARIABLE hexString AT_COLUMN 32)
-    math(EXPR arraySize "${hexStringLength} / 8")
-
-    # adds '0x' prefix and comma suffix before and after every byte respectively
-    if(IS_BIG_ENDIAN)
-        message(STATUS "Interpreting shader in big endian...")
-        string(REGEX REPLACE "([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])" "0x\\1\\2\\3\\4, " arrayValues ${hexString})
-    else()
-        message(STATUS "Interpreting shader in little endian...")
-        string(REGEX REPLACE "([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])" "0x\\4\\3\\2\\1, " arrayValues ${hexString})
-    endif()
-    # removes trailing comma
-    string(REGEX REPLACE ", $" "" arrayValues ${arrayValues})
-
-    # converts the variable name into proper C identifier
-    string(MAKE_C_IDENTIFIER "${BIN2H_VARIABLE_NAME}" BIN2H_VARIABLE_NAME)
-    string(TOUPPER "${BIN2H_VARIABLE_NAME}" BIN2H_VARIABLE_NAME)
-
-    # declares byte array and the length variables
-    set(namespaceStart "namespace ${HEADER_NAMESPACE} {")
-    set(namespaceEnd "} // namespace ${HEADER_NAMESPACE}")
-    set(arrayIncludes "#pragma once\n#include <array>\n#include <cstdint>")
-    set(arrayDefinition "const std::array<uint32_t, ${arraySize}> ${BIN2H_VARIABLE_NAME} = { ${arrayValues} };")
-
-    set(declarations "${arrayIncludes}\n\n${namespaceStart}\n${arrayDefinition}\n${namespaceEnd}\n\n")
-    if(BIN2H_APPEND)
-        file(APPEND ${BIN2H_HEADER_FILE} "${declarations}")
-    else()
-        file(WRITE ${BIN2H_HEADER_FILE} "${declarations}")
-    endif()
-endfunction()
--- a/kompute/cmake/bin_file_to_header.cmake
+++ b/kompute/cmake/bin_file_to_header.cmake
@@ -1,19 +0,0 @@
-cmake_minimum_required(VERSION 3.20)
-
-if(${INPUT_SHADER_FILE} STREQUAL "")
-    message(FATAL_ERROR "No input file path provided via 'INPUT_SHADER_FILE'.")
-endif()
-
-if(${OUTPUT_HEADER_FILE} STREQUAL "")
-    message(FATAL_ERROR "No output file path provided via 'OUTPUT_HEADER_FILE'.")
-endif()
-
-if(${HEADER_NAMESPACE} STREQUAL "")
-    message(FATAL_ERROR "No header namespace provided via 'HEADER_NAMESPACE'.")
-endif()
-
-include(bin2h.cmake)
-
-get_filename_component(BINARY_FILE_CONTENT ${INPUT_SHADER_FILE} NAME)
-bin2h(SOURCE_FILE ${INPUT_SHADER_FILE} HEADER_FILE ${OUTPUT_HEADER_FILE} VARIABLE_NAME ${BINARY_FILE_CONTENT} HEADER_NAMESPACE ${HEADER_NAMESPACE})
-file(APPEND ${OUTPUT_HEADER_FILE} "\n")
--- a/kompute/cmake/check_vulkan_version.cmake
+++ b/kompute/cmake/check_vulkan_version.cmake
@@ -1,139 +0,0 @@
-# Current issue: Only checks the result of GPU0
-function(check_vulkan_version)
-    cmake_parse_arguments(VULKAN_CHECK_VERSION "" "INCLUDE_DIR" "" ${ARGN})
-    message(STATUS "Ensuring the currently installed driver supports the Vulkan version requested by the Vulkan Header.")
-
-    # Get the current Vulkan Header version (e.g. 1.2.189).
-    # This snippet is based on: https://gitlab.kitware.com/cmake/cmake/-/blob/v3.23.1/Modules/FindVulkan.cmake#L140-156
-    if(VULKAN_CHECK_VERSION_INCLUDE_DIR)
-        set(VULKAN_CORE_H ${VULKAN_CHECK_VERSION_INCLUDE_DIR}/vulkan/vulkan_core.h)
-        if(EXISTS ${VULKAN_CORE_H})
-            file(STRINGS ${VULKAN_CORE_H} VULKAN_HEADER_VERSION_LINE REGEX "^#define VK_HEADER_VERSION ")
-            string(REGEX MATCHALL "[0-9]+" VULKAN_HEADER_VERSION "${VULKAN_HEADER_VERSION_LINE}")
-            file(STRINGS ${VULKAN_CORE_H} VULKAN_HEADER_VERSION_LINE2 REGEX "^#define VK_HEADER_VERSION_COMPLETE ")
-            if(NOT ${VULKAN_HEADER_VERSION_LINE2} STREQUAL "")
-                string(REGEX MATCHALL "[0-9]+" VULKAN_HEADER_VERSION2 "${VULKAN_HEADER_VERSION_LINE2}")
-                list(LENGTH VULKAN_HEADER_VERSION2 _len)
-                # Versions >= 1.2.175 have an additional numbers in front of e.g. '0, 1, 2' instead of '1, 2'
-                if(_len EQUAL 3)
-                    list(REMOVE_AT VULKAN_HEADER_VERSION2 0)
-                endif()
-                list(APPEND VULKAN_HEADER_VERSION2 ${VULKAN_HEADER_VERSION})
-                list(JOIN VULKAN_HEADER_VERSION2 "." VULKAN_HEADER_VERSION)
-            else()
-                file(STRINGS ${VULKAN_CORE_H} VULKAN_HEADER_API_VERSION_1_2 REGEX "^#define VK_API_VERSION_1_2.*")
-                if(NOT ${VULKAN_HEADER_API_VERSION_1_2} STREQUAL "")
-                    set(VULKAN_HEADER_VERSION "1.2.${VULKAN_HEADER_VERSION}")
-                else()
-                    file(STRINGS ${VULKAN_CORE_H} VULKAN_HEADER_API_VERSION_1_1 REGEX "^#define VK_API_VERSION_1_1.*")
-                    if(NOT ${VULKAN_HEADER_API_VERSION_1_1} STREQUAL "")
-                        set(VULKAN_HEADER_VERSION "1.1.${VULKAN_HEADER_VERSION}")
-                    else()
-                        message(FATAL_ERROR "'${VULKAN_CORE_H}' does not contain a supported Vulkan version. Probably because its < 1.2.0.")
-                    endif()
-                endif()
-            endif()
-        else()
-            message(FATAL_ERROR "'${VULKAN_CORE_H}' does not exist. Try calling 'find_package(Vulkan REQUIRED)' before you call this function or set 'Vulkan_INCLUDE_DIR' manually!")
-            return()
-        endif()
-    else()
-        message(FATAL_ERROR "Invalid Vulkan include directory given. Try calling 'find_package(Vulkan REQUIRED)' before you call this function or set 'Vulkan_INCLUDE_DIR' manually!")
-        return()
-    endif()
-    message(STATUS "Found Vulkan Header version: ${VULKAN_HEADER_VERSION}")
-
-    # Get Vulkan version supported by driver
-    find_program(VULKAN_INFO_PATH NAMES vulkaninfo)
-    if(VULKAN_INFO_PATH STREQUAL "VULKAN_INFO_PATH-NOTFOUND")
-        message(FATAL_ERROR "vulkaninfo not found. The Vulkan SDK might not be installed properly. If you know what you are doing, you can disable the Vulkan version check by setting 'KOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK' to 'ON' (-DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON).")
-        return()
-    endif()
-
-    execute_process(COMMAND "vulkaninfo"
-                    OUTPUT_VARIABLE VULKAN_INFO_OUTPUT
-                    RESULT_VARIABLE VULKAN_INFO_RETURN)
-    if(NOT ${VULKAN_INFO_RETURN} EQUAL 0)
-        message(FATAL_ERROR "Running vulkaninfo failed with return code ${VULKAN_INFO_RETURN}. Make sure you have 'vulkan-tools' installed. Result:\n${VULKAN_INFO_OUTPUT}?")
-        return()
-    else()
-        message(STATUS "Running vulkaninfo was successful. Parsing the output...")
-    endif()
-
-    # Check if running vulkaninfo was successfully
-    string(FIND "${VULKAN_INFO_OUTPUT}" "Vulkan Instance Version" VULKAN_INFO_SUCCESSFUL)
-    if(VULKAN_INFO_SUCCESSFUL LESS 0)
-        message(FATAL_ERROR "Running vulkaninfo failed. Make sure you have 'vulkan-tools' installed and DISPLAY is configured. If you know what you are doing, you can disable the Vulkan version check by setting 'KOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK' to 'ON' (-DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON). Result:\n${VULKAN_INFO_OUTPUT}?")
-    endif()
-
-    string(REGEX MATCHALL "(GPU[0-9]+)" GPU_IDS "${VULKAN_INFO_OUTPUT}")
-    if(NOT GPU_IDS)
-        message(FATAL_ERROR "No GPU supporting Vulkan found in vulkaninfo. Does your GPU (driver) support Vulkan?")
-    endif()
-
-    string(REGEX MATCHALL "apiVersion[ ]*=[ ]*[0-9a-fA-F]*[ ]*[(]*([0-9]+[.][0-9]+[.][0-9]+)[)]*" GPU_API_VERSIONS ${VULKAN_INFO_OUTPUT})
-    if(NOT GPU_API_VERSIONS)
-        message(FATAL_ERROR "No valid Vulkan API version found in vulkaninfo. Does your GPU (driver) support Vulkan?")
-    endif()
-
-    # Check length
-    # message(FATAL_ERROR "GPUS: ${GPU_IDS}")
-    list(LENGTH GPU_IDS GPU_IDS_LENGTH)
-    list(LENGTH GPU_API_VERSIONS GPU_API_VERSIONS_LENGTH)
-    if(NOT ${GPU_IDS_LENGTH} EQUAL ${GPU_API_VERSIONS_LENGTH})
-        message(FATAL_ERROR "Found ${GPU_IDS_LENGTH} GPUs, but ${GPU_API_VERSIONS_LENGTH} API versions in vulkaninfo. We expected to find an equal amount of them.")
-    endif()
-
-    # Compare versions
-    set(VALID_GPU "")
-    set(VALID_VULKAN_VERSION "")
-    math(EXPR ITER_LEN "${GPU_IDS_LENGTH} - 1")
-    foreach(INDEX RANGE ${ITER_LEN})
-        list(GET GPU_IDS ${INDEX} GPU)
-        list(GET GPU_API_VERSIONS ${INDEX} API_VERSION)
-
-        # Extract API version
-        if(${API_VERSION} MATCHES "apiVersion[ ]*=[ ]*[0-9a-fA-F]*[ ]*[(]*([0-9]+[.][0-9]+[.][0-9]+)[)]*")
-            set(VULKAN_DRIVER_VERSION ${CMAKE_MATCH_1})
-        else()
-            message(FATAL_ERROR "API version match failed. This should not have happened...")
-        endif()
-
-        message(STATUS "${GPU} supports Vulkan API version '${VULKAN_DRIVER_VERSION}'.")
-
-        # Compare driver and header version
-        if(${VULKAN_DRIVER_VERSION} VERSION_LESS ${VULKAN_HEADER_VERSION})
-        # Version missmatch. Let us check if the minor version is the same.
-            if(${VULKAN_DRIVER_VERSION} MATCHES "[0-9]+[.]([0-9]+)[.][0-9]+")
-                set(VULKAN_DRIVER_MINOR_VERSION ${CMAKE_MATCH_1})
-            else()
-                message(FATAL_ERROR "Invalid Vulkan driver version '${VULKAN_DRIVER_VERSION}' found. Expected version in the following format: '[0-9]+.[0-9]+.[0-9]+'")
-            endif()
-            if(${VULKAN_HEADER_VERSION} MATCHES "[0-9]+[.]([0-9]+)[.][0-9]+")
-                set(VULKAN_HEADER_MINOR_VERSION ${CMAKE_MATCH_1})
-            else()
-                message(FATAL_ERROR "Invalid Vulkan Header version '${VULKAN_HEADER_VERSION}' found. Expected version in the following format: '[0-9]+.[0-9]+.[0-9]+'")
-            endif()
-
-            if(${VULKAN_DRIVER_MINOR_VERSION} EQUAL ${VULKAN_HEADER_MINOR_VERSION})
-                message(WARNING "Your GPU driver does not support Vulkan > ${VULKAN_DRIVER_VERSION}, but you try to use Vulkan Header ${VULKAN_HEADER_VERSION}. At least your driver supports the same minor version (${VULKAN_DRIVER_MINOR_VERSION}), so this should be fine but keep it in mind in case you encounter any strange behavior.")
-                set(VALID_GPU ${GPU})
-                set(VALID_VULKAN_VERSION ${VULKAN_DRIVER_VERSION})
-                break()
-            else()
-                message(STATUS "${GPU} does not support Vulkan > ${VULKAN_DRIVER_VERSION}.")
-            endif()
-        else()
-            set(VALID_GPU ${GPU})
-            set(VALID_VULKAN_VERSION ${VULKAN_DRIVER_VERSION})
-            break()
-        endif()
-    endforeach()
-
-    if("${VALID_GPU}" STREQUAL "")
-        message(FATAL_ERROR "None of your GPUs supports Vulkan Header ${VULKAN_HEADER_VERSION}. Please try updating your driver, or downgrade your Vulkan headers. If you know what you are doing, you can disable the Vulkan version check by setting 'KOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK' to 'ON' (-DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON).")
-    else()
-        message("Valid GPU (${VALID_GPU}) for Vulkan header version ${VULKAN_HEADER_VERSION} found. ${VALID_GPU} supports up to Vulkan ${VALID_VULKAN_VERSION}.")
-    endif()
-
-endfunction()
--- a/kompute/cmake/code_coverage.cmake
+++ b/kompute/cmake/code_coverage.cmake
@@ -1,35 +0,0 @@
-# Code coverage
-set(CMAKE_BUILD_TYPE COVERAGE CACHE INTERNAL "Coverage build enabled")
-message(STATUS "Enabling gcov support")
-
-if(NOT "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
-    set(COVERAGE_FLAG "--coverage")
-endif()
-
-set(CMAKE_CXX_FLAGS_COVERAGE
-    "-g -O0 ${COVERAGE_FLAG} -fprofile-arcs -ftest-coverage"
-    CACHE STRING "Flags used by the C++ compiler during coverage builds."
-    FORCE)
-set(CMAKE_C_FLAGS_COVERAGE
-    "-g -O0 ${COVERAGE_FLAG} -fprofile-arcs -ftest-coverage"
-    CACHE STRING "Flags used by the C compiler during coverage builds."
-    FORCE)
-set(CMAKE_EXE_LINKER_FLAGS_COVERAGE
-    ""
-    CACHE STRING "Flags used for linking binaries during coverage builds."
-    FORCE)
-set(CMAKE_SHARED_LINKER_FLAGS_COVERAGE
-    ""
-    CACHE STRING "Flags used by the shared libraries linker during coverage builds."
-    FORCE)
-
-set(CODECOV_DIR ${CMAKE_CURRENT_BINARY_DIR}/codecov/)
-set(CODECOV_DIR_LCOV ${CODECOV_DIR}lcov/)
-set(CODECOV_FILENAME_LCOV_INFO lcov.info)
-set(CODECOV_FILENAME_LCOV_INFO_FULL lcov_full.info)
-set(CODECOV_DIR_HTML ${CODECOV_DIR}html/)
-
-mark_as_advanced(CMAKE_CXX_FLAGS_COVERAGE
-    CMAKE_C_FLAGS_COVERAGE
-    CMAKE_EXE_LINKER_FLAGS_COVERAGE
-    CMAKE_SHARED_LINKER_FLAGS_COVERAGE)
--- a/kompute/cmake/deprecation_warnings.cmake
+++ b/kompute/cmake/deprecation_warnings.cmake
@@ -1,15 +0,0 @@
-if(KOMPUTE_OPT_REPO_SUBMODULE_BUILD)
-    message(FATAL_ERROR "'KOMPUTE_OPT_REPO_SUBMODULE_BUILD' got replaced by 'KOMPUTE_OPT_USE_BUILT_IN_SPDLOG', 'KOMPUTE_OPT_USE_BUILT_IN_FMT', 'KOMPUTE_OPT_USE_BUILT_IN_GOOGLE_TEST', 'KOMPUTE_OPT_USE_BUILT_IN_PYBIND11' and 'KOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER'. Please use them instead.")
-endif()
-
-if(KOMPUTE_OPT_BUILD_AS_SHARED_LIB)
-    message(FATAL_ERROR "'KOMPUTE_OPT_BUILD_AS_SHARED_LIB' is deprecated and should not be used. Instead use the default 'BUILD_SHARED_LIBS' CMake switch.")
-endif()
-
-if(KOMPUTE_OPT_BUILD_SINGLE_HEADER)
-    message(FATAL_ERROR "'KOMPUTE_OPT_BUILD_SINGLE_HEADER' is deprecated and should not be used. The single header will now always be build and can be included via '#include<kompute/kompute.h>'.")
-endif()
-
-if(KOMPUTE_OPT_ENABLE_SPDLOG)
-    message(FATAL_ERROR "'KOMPUTE_OPT_ENABLE_SPDLOG' is deprecated and should not be used. It got replaced by 'KOMPUTE_OPT_LOG_LEVEL'. This option can be set to a variety of log levels (e.g. 'Off', 'Trace', 'Debug', 'Default', ...).")
-endif()
--- a/kompute/cmake/komputeConfig.cmake.in
+++ b/kompute/cmake/komputeConfig.cmake.in
@@ -1,8 +0,0 @@
-include(CMakeFindDependencyMacro)
-@PACKAGE_INIT@
-
-find_dependency(VULKAN REQUIRED)
-
-include(${CMAKE_CURRENT_LIST_DIR}/komputeTargets.cmake)
-
-check_required_components(kompute)
--- a/kompute/cmake/vulkan_shader_compiler.cmake
+++ b/kompute/cmake/vulkan_shader_compiler.cmake
@@ -1,43 +0,0 @@
-function(vulkan_compile_shader)
-     find_program(GLS_LANG_VALIDATOR_PATH NAMES glslangValidator)
-     if(GLS_LANG_VALIDATOR_PATH STREQUAL "GLS_LANG_VALIDATOR_PATH-NOTFOUND")
-          message(FATAL_ERROR "glslangValidator not found.")
-          return()
-     endif()
-
-     cmake_parse_arguments(SHADER_COMPILE "" "INFILE;OUTFILE;NAMESPACE;RELATIVE_PATH" "" ${ARGN})
-     set(SHADER_COMPILE_INFILE_FULL "${CMAKE_CURRENT_SOURCE_DIR}/${SHADER_COMPILE_INFILE}")
-     set(SHADER_COMPILE_SPV_FILE_FULL "${CMAKE_CURRENT_BINARY_DIR}/${SHADER_COMPILE_INFILE}.spv")
-     set(SHADER_COMPILE_HEADER_FILE_FULL "${CMAKE_CURRENT_BINARY_DIR}/${SHADER_COMPILE_OUTFILE}")
-
-     if(NOT SHADER_COMPILE_RELATIVE_PATH)
-          set(SHADER_COMPILE_RELATIVE_PATH "${PROJECT_SOURCE_DIR}/cmake")
-     endif()
-    
-     # .comp -> .spv
-     add_custom_command(OUTPUT "${SHADER_COMPILE_SPV_FILE_FULL}"
-                        COMMAND "${GLS_LANG_VALIDATOR_PATH}"
-                        ARGS "-V"
-                             "${SHADER_COMPILE_INFILE_FULL}"
-                             "-o"
-                             "${SHADER_COMPILE_SPV_FILE_FULL}"
-                        COMMENT "Compile vulkan compute shader from file '${SHADER_COMPILE_INFILE_FULL}' to '${SHADER_COMPILE_SPV_FILE_FULL}'."
-                        MAIN_DEPENDENCY "${SHADER_COMPILE_INFILE_FULL}")
-
-     # Check if big or little endian
-     include (TestBigEndian)
-     TEST_BIG_ENDIAN(IS_BIG_ENDIAN)
-
-     # .spv -> .hpp
-     add_custom_command(OUTPUT "${SHADER_COMPILE_HEADER_FILE_FULL}"
-                        COMMAND ${CMAKE_COMMAND}
-                        ARGS "-DINPUT_SHADER_FILE=${SHADER_COMPILE_SPV_FILE_FULL}"
-                             "-DOUTPUT_HEADER_FILE=${SHADER_COMPILE_HEADER_FILE_FULL}"
-                             "-DHEADER_NAMESPACE=${SHADER_COMPILE_NAMESPACE}"
-                             "-DIS_BIG_ENDIAN=${IS_BIG_ENDIAN}"
-                             "-P"
-                             "${SHADER_COMPILE_RELATIVE_PATH}/bin_file_to_header.cmake"
-                        WORKING_DIRECTORY "${SHADER_COMPILE_RELATIVE_PATH}"
-                        COMMENT "Converting compiled shader '${SHADER_COMPILE_SPV_FILE_FULL}' to header file '${SHADER_COMPILE_HEADER_FILE_FULL}'."
-                        MAIN_DEPENDENCY "${SHADER_COMPILE_SPV_FILE_FULL}")
-endfunction()
--- a/kompute/config/FindSphinx.cmake
+++ b/kompute/config/FindSphinx.cmake
@@ -1,16 +0,0 @@
-# Look for an executable called sphinx-build
-find_program(SPHINX_EXECUTABLE
-    NAMES sphinx-build
-    DOC "Path to sphinx-build executable")
-
-if(SPHINX_EXECUTABLE STREQUAL "SPHINX_EXECUTABLE-NOTFOUND")
-    message(FATAL_ERROR "sphinx-build not found.")
-endif()
-
-include(FindPackageHandleStandardArgs)
-
-# Handle standard arguments to find_package like REQUIRED and QUIET
-find_package_handle_standard_args(
-    Sphinx
-    "Failed to find sphinx-build executable"
-    SPHINX_EXECUTABLE)
--- a/kompute/external/bin/xxd.c
+++ b/kompute/external/bin/xxd.c
@@ -1,819 +0,0 @@
-/*
-As indicated at https://lists.debian.org/debian-legal/2015/01/msg00037.html,
-the author has permitted redistribution of xxd under the MIT license, as follows:
-
-Permission is hereby granted, free of charge, to any person obtaining
-a copy of this software and associated documentation files (the
-"Software"), to deal in the Software without restriction, including
-without limitation the rights to use, copy, modify, merge, publish,
-distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to
-the following conditions:
-
-The above copyright notice and this permission notice shall be included
-in all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
-CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
-SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- * xxd: my hexdump facility. jw
- *
- *  2.10.90 changed to word output
- *  3.03.93 new indent style, dumb bug inserted and fixed.
- *	    -c option, mls
- * 26.04.94 better option parser, -ps, -l, -s added.
- *  1.07.94 -r badly needs - as input file.  Per default autoskip over
- *	       consecutive lines of zeroes, as unix od does.
- *	    -a shows them too.
- *	    -i dump as c-style #include "file.h"
- *  1.11.95 if "xxd -i" knows the filename, an 'unsigned char filename_bits[]'
- *	    array is written in correct c-syntax.
- *	    -s improved, now defaults to absolute seek, relative requires a '+'.
- *	    -r improved, now -r -s -0x... is supported.
- *	       change/suppress leading '\0' bytes.
- *	    -l n improved: stops exactly after n bytes.
- *	    -r improved, better handling of partial lines with trailing garbage.
- *	    -r improved, now -r -p works again!
- *	    -r improved, less flushing, much faster now! (that was silly)
- *  3.04.96 Per repeated request of a single person: autoskip defaults to off.
- * 15.05.96 -v added. They want to know the version.
- *	    -a fixed, to show last line inf file ends in all zeros.
- *	    -u added: Print upper case hex-letters, as preferred by unix bc.
- *	    -h added to usage message. Usage message extended.
- *	    Now using outfile if specified even in normal mode, aehem.
- *	    No longer mixing of ints and longs. May help doze people.
- *	    Added binify ioctl for same reason. (Enough Doze stress for 1996!)
- * 16.05.96 -p improved, removed occasional superfluous linefeed.
- * 20.05.96 -l 0 fixed. tried to read anyway.
- * 21.05.96 -i fixed. now honours -u, and prepends __ to numeric filenames.
- *	    compile -DWIN32 for NT or W95. George V. Reilly, * -v improved :-)
- *	    support --gnuish-longhorn-options
- * 25.05.96 MAC support added: CodeWarrior already uses ``outline'' in Types.h
- *	    which is included by MacHeaders (Axel Kielhorn). Renamed to
- *	    xxdline().
- *  7.06.96 -i printed 'int' instead of 'char'. *blush*
- *	    added Bram's OS2 ifdefs...
- * 18.07.96 gcc -Wall @ SunOS4 is now slient.
- *	    Added osver for MSDOS/DJGPP/WIN32.
- * 29.08.96 Added size_t to strncmp() for Amiga.
- * 24.03.97 Windows NT support (Phil Hanna). Clean exit for Amiga WB (Bram)
- * 02.04.97 Added -E option, to have EBCDIC translation instead of ASCII
- *	    (azc10@yahoo.com)
- * 22.05.97 added -g (group octets) option (jcook@namerica.kla.com).
- * 23.09.98 nasty -p -r misfeature fixed: slightly wrong output, when -c was
- *	    missing or wrong.
- * 26.09.98 Fixed: 'xxd -i infile outfile' did not truncate outfile.
- * 27.10.98 Fixed: -g option parser required blank.
- *	    option -b added: 01000101 binary output in normal format.
- * 16.05.00 Added VAXC changes by Stephen P. Wall
- * 16.05.00 Improved MMS file and merge for VMS by Zoltan Arpadffy
- *
- * (c) 1990-1998 by Juergen Weigert (jnweiger@informatik.uni-erlangen.de)
- *
- * Small changes made afterwards by Bram Moolenaar et al.
- *
- * Distribute freely and credit me,
- * make money and share with me,
- * lose money and don't ask me.
- *
- *
- */
-
-/* Visual Studio 2005 has 'deprecated' many of the standard CRT functions */
-#if _MSC_VER >= 1400
-# define _CRT_SECURE_NO_DEPRECATE
-# define _CRT_NONSTDC_NO_DEPRECATE
-#endif
-
-#include <stdio.h>
-#ifdef VAXC
-# include <file.h>
-#else
-# include <fcntl.h>
-#endif
-#ifdef __TSC__
-# define MSDOS
-#endif
-#if !defined(OS2) && defined(__EMX__)
-# define OS2
-#endif
-#if defined(MSDOS) || defined(WIN32) || defined(OS2) || defined(__BORLANDC__) || defined(CYGWIN)
-# include <io.h>	/* for setmode() */
-#else
-# ifdef UNIX
-#  include <unistd.h>
-# endif
-#endif
-#include <stdlib.h>
-#include <string.h>	/* for strncmp() */
-#include <ctype.h>	/* for isalnum() */
-#if __MWERKS__ && !defined(BEBOX)
-# include <unix.h>	/* for fdopen() on MAC */
-#endif
-
-#if defined(__BORLANDC__) && __BORLANDC__ <= 0x0410 && !defined(fileno)
-/* Missing define and prototype grabbed from the BC 4.0 <stdio.h> */
-# define fileno(f)       ((f)->fd)
-FILE   _FAR *_Cdecl _FARFUNC fdopen(int __handle, char _FAR *__type);
-#endif
-
-
-/*  This corrects the problem of missing prototypes for certain functions
- *  in some GNU installations (e.g. SunOS 4.1.x).
- *  Darren Hiebert <darren@hmi.com> (sparc-sun-sunos4.1.3_U1/2.7.2.2)
- */
-#if defined(__GNUC__) && defined(__STDC__)
-# ifndef __USE_FIXED_PROTOTYPES__
-#  define __USE_FIXED_PROTOTYPES__
-# endif
-#endif
-
-#ifndef __USE_FIXED_PROTOTYPES__
-/*
- * This is historic and works only if the compiler really has no prototypes:
- *
- * Include prototypes for Sun OS 4.x, when using an ANSI compiler.
- * FILE is defined on OS 4.x, not on 5.x (Solaris).
- * if __SVR4 is defined (some Solaris versions), don't include this.
- */
-#if defined(sun) && defined(FILE) && !defined(__SVR4) && defined(__STDC__)
-#  define __P(a) a
-/* excerpt from my sun_stdlib.h */
-extern int fprintf __P((FILE *, char *, ...));
-extern int fputs   __P((char *, FILE *));
-extern int _flsbuf __P((unsigned char, FILE *));
-extern int _filbuf __P((FILE *));
-extern int fflush  __P((FILE *));
-extern int fclose  __P((FILE *));
-extern int fseek   __P((FILE *, long, int));
-extern int rewind  __P((FILE *));
-
-extern void perror __P((char *));
-# endif
-#endif
-
-extern long int strtol();
-extern long int ftell();
-
-char version[] = "xxd V1.10 27oct98 by Juergen Weigert";
-#ifdef WIN32
-char osver[] = " (Win32)";
-#else
-# ifdef DJGPP
-char osver[] = " (dos 32 bit)";
-# else
-#  ifdef MSDOS
-char osver[] = " (dos 16 bit)";
-#  else
-char osver[] = "";
-#  endif
-# endif
-#endif
-
-#if !defined(CYGWIN) && (defined(CYGWIN32) || defined(__CYGWIN__) || defined(__CYGWIN32__))
-# define CYGWIN
-#endif
-#if defined(MSDOS) || defined(WIN32) || defined(OS2)
-# define BIN_READ(yes)  ((yes) ? "rb" : "rt")
-# define BIN_WRITE(yes) ((yes) ? "wb" : "wt")
-# define BIN_CREAT(yes) ((yes) ? (O_CREAT|O_BINARY) : O_CREAT)
-# define BIN_ASSIGN(fp, yes) setmode(fileno(fp), (yes) ? O_BINARY : O_TEXT)
-# define PATH_SEP '\\'
-#elif defined(CYGWIN)
-# define BIN_READ(yes)  ((yes) ? "rb" : "rt")
-# define BIN_WRITE(yes) ((yes) ? "wb" : "w")
-# define BIN_CREAT(yes) ((yes) ? (O_CREAT|O_BINARY) : O_CREAT)
-# define BIN_ASSIGN(fp, yes) ((yes) ? (void) setmode(fileno(fp), O_BINARY) : (void) (fp))
-# define PATH_SEP '/'
-#else
-# ifdef VMS
-#  define BIN_READ(dummy)  "r"
-#  define BIN_WRITE(dummy) "w"
-#  define BIN_CREAT(dummy) O_CREAT
-#  define BIN_ASSIGN(fp, dummy) fp
-#  define PATH_SEP ']'
-#  define FILE_SEP '.'
-# else
-#  define BIN_READ(dummy)  "r"
-#  define BIN_WRITE(dummy) "w"
-#  define BIN_CREAT(dummy) O_CREAT
-#  define BIN_ASSIGN(fp, dummy) fp
-#  define PATH_SEP '/'
-# endif
-#endif
-
-/* open has only to arguments on the Mac */
-#if __MWERKS__
-# define OPEN(name, mode, umask) open(name, mode)
-#else
-# define OPEN(name, mode, umask) open(name, mode, umask)
-#endif
-
-#ifdef AMIGA
-# define STRNCMP(s1, s2, l) strncmp(s1, s2, (size_t)l)
-#else
-# define STRNCMP(s1, s2, l) strncmp(s1, s2, l)
-#endif
-
-#ifndef __P
-# if defined(__STDC__) || defined(MSDOS) || defined(WIN32) || defined(OS2) \
-        || defined(__BORLANDC__)
-#  define __P(a) a
-# else
-#  define __P(a) ()
-# endif
-#endif
-
-/* Let's collect some prototypes */
-/* CodeWarrior is really picky about missing prototypes */
-static void exit_with_usage __P((char *));
-static int huntype __P((FILE *, FILE *, FILE *, char *, int, int, long));
-static void xxdline __P((FILE *, char *, int));
-
-#define TRY_SEEK	/* attempt to use lseek, or skip forward by reading */
-#define COLS 256	/* change here, if you ever need more columns */
-#define LLEN (11 + (9*COLS-1)/1 + COLS + 2)
-
-char hexxa[] = "0123456789abcdef0123456789ABCDEF", *hexx = hexxa;
-
-/* the different hextypes known by this program: */
-#define HEX_NORMAL 0
-#define HEX_POSTSCRIPT 1
-#define HEX_CINCLUDE 2
-#define HEX_BITS 3		/* not hex a dump, but bits: 01111001 */
-
-static void
-exit_with_usage(pname)
-char *pname;
-{
-  fprintf(stderr, "Usage:\n       %s [options] [infile [outfile]]\n", pname);
-  fprintf(stderr, "    or\n       %s -r [-s [-]offset] [-c cols] [-ps] [infile [outfile]]\n", pname);
-  fprintf(stderr, "Options:\n");
-  fprintf(stderr, "    -a          toggle autoskip: A single '*' replaces nul-lines. Default off.\n");
-  fprintf(stderr, "    -b          binary digit dump (incompatible with -p,-i,-r). Default hex.\n");
-  fprintf(stderr, "    -c cols     format <cols> octets per line. Default 16 (-i: 12, -ps: 30).\n");
-  fprintf(stderr, "    -E          show characters in EBCDIC. Default ASCII.\n");
-  fprintf(stderr, "    -g          number of octets per group in normal output. Default 2.\n");
-  fprintf(stderr, "    -h          print this summary.\n");
-  fprintf(stderr, "    -i          output in C include file style.\n");
-  fprintf(stderr, "    -l len      stop after <len> octets.\n");
-  fprintf(stderr, "    -ps         output in postscript plain hexdump style.\n");
-  fprintf(stderr, "    -r          reverse operation: convert (or patch) hexdump into binary.\n");
-  fprintf(stderr, "    -r -s off   revert with <off> added to file positions found in hexdump.\n");
-  fprintf(stderr, "    -s %sseek  start at <seek> bytes abs. %sinfile offset.\n",
-#ifdef TRY_SEEK
-      "[+][-]", "(or +: rel.) ");
-#else
-      "", "");
-#endif
-  fprintf(stderr, "    -u          use upper case hex letters.\n");
-  fprintf(stderr, "    -v          show version: \"%s%s\".\n", version, osver);
-  exit(1);
-}
-
-/*
- * Max. cols binary characters are decoded from the input stream per line.
- * Two adjacent garbage characters after evaluated data delimit valid data.
- * Everything up to the next newline is discarded.
- *
- * The name is historic and came from 'undo type opt h'.
- */
-static int
-huntype(fpi, fpo, fperr, pname, cols, hextype, base_off)
-FILE *fpi, *fpo, *fperr;
-char *pname;
-int cols, hextype;
-long base_off;
-{
-  int c, ign_garb = 1, n1 = -1, n2 = 0, n3, p = cols;
-  long have_off = 0, want_off = 0;
-
-  rewind(fpi);
-
-  while ((c = getc(fpi)) != EOF)
-    {
-      if (c == '\r')	/* Doze style input file? */
-    continue;
-
-#if 0	/* this doesn't work when there is normal text after the hex codes in
-       the last line that looks like hex */
-      if (c == ' ' || c == '\n' || c == '\t')  /* allow multiple spaces */
-    continue;
-#endif
-
-      n3 = n2;
-      n2 = n1;
-
-      if (c >= '0' && c <= '9')
-    n1 = c - '0';
-      else if (c >= 'a' && c <= 'f')
-    n1 = c - 'a' + 10;
-      else if (c >= 'A' && c <= 'F')
-    n1 = c - 'A' + 10;
-      else
-    {
-      n1 = -1;
-      if (ign_garb)
-        continue;
-    }
-
-      ign_garb = 0;
-
-      if (p >= cols)
-    {
-      if (!hextype)
-        {
-          if (n1 < 0)
-        {
-          p = 0;
-          continue;
-        }
-          want_off = (want_off << 4) | n1;
-          continue;
-        }
-      else
-        p = 0;
-    }
-
-      if (base_off + want_off != have_off)
-    {
-      fflush(fpo);
-#ifdef TRY_SEEK
-      c = fseek(fpo, base_off + want_off - have_off, 1);
-      if (c >= 0)
-        have_off = base_off + want_off;
-#endif
-      if (base_off + want_off < have_off)
-        {
-          fprintf(fperr, "%s: sorry, cannot seek backwards.\n", pname);
-          return 5;
-        }
-      for (; have_off < base_off + want_off; have_off++)
-        putc(0, fpo);
-    }
-
-      if (n2 >= 0 && n1 >= 0)
-    {
-      putc((n2 << 4) | n1, fpo);
-      have_off++;
-      want_off++;
-      n1 = -1;
-      if ((++p >= cols) && !hextype)
-        {
-          /* skip rest of line as garbage */
-          want_off = 0;
-          while ((c = getc(fpi)) != '\n' && c != EOF)
-        ;
-          ign_garb = 1;
-        }
-    }
-      else if (n1 < 0 && n2 < 0 && n3 < 0)
-    {
-      /* already stumbled into garbage, skip line, wait and see */
-      if (!hextype)
-        want_off = 0;
-      while ((c = getc(fpi)) != '\n' && c != EOF)
-        ;
-      ign_garb = 1;
-    }
-    }
-  fflush(fpo);
-#ifdef TRY_SEEK
-  fseek(fpo, 0L, 2);
-#endif
-  fclose(fpo);
-  fclose(fpi);
-  return 0;
-}
-
-/*
- * Print line l. If nz is false, xxdline regards the line a line of
- * zeroes. If there are three or more consecutive lines of zeroes,
- * they are replaced by a single '*' character.
- *
- * If the output ends with more than two lines of zeroes, you
- * should call xxdline again with l being the last line and nz
- * negative. This ensures that the last line is shown even when
- * it is all zeroes.
- *
- * If nz is always positive, lines are never suppressed.
- */
-static void
-xxdline(fp, l, nz)
-FILE *fp;
-char *l;
-int nz;
-{
-  static char z[LLEN+1];
-  static int zero_seen = 0;
-
-  if (!nz && zero_seen == 1)
-    strcpy(z, l);
-
-  if (nz || !zero_seen++)
-    {
-      if (nz)
-    {
-      if (nz < 0)
-        zero_seen--;
-      if (zero_seen == 2)
-        fputs(z, fp);
-      if (zero_seen > 2)
-        fputs("*\n", fp);
-    }
-      if (nz >= 0 || zero_seen > 0)
-    fputs(l, fp);
-      if (nz)
-    zero_seen = 0;
-    }
-}
-
-/* This is an EBCDIC to ASCII conversion table */
-/* from a proposed BTL standard April 16, 1979 */
-static unsigned char etoa64[] =
-{
-    0040,0240,0241,0242,0243,0244,0245,0246,
-    0247,0250,0325,0056,0074,0050,0053,0174,
-    0046,0251,0252,0253,0254,0255,0256,0257,
-    0260,0261,0041,0044,0052,0051,0073,0176,
-    0055,0057,0262,0263,0264,0265,0266,0267,
-    0270,0271,0313,0054,0045,0137,0076,0077,
-    0272,0273,0274,0275,0276,0277,0300,0301,
-    0302,0140,0072,0043,0100,0047,0075,0042,
-    0303,0141,0142,0143,0144,0145,0146,0147,
-    0150,0151,0304,0305,0306,0307,0310,0311,
-    0312,0152,0153,0154,0155,0156,0157,0160,
-    0161,0162,0136,0314,0315,0316,0317,0320,
-    0321,0345,0163,0164,0165,0166,0167,0170,
-    0171,0172,0322,0323,0324,0133,0326,0327,
-    0330,0331,0332,0333,0334,0335,0336,0337,
-    0340,0341,0342,0343,0344,0135,0346,0347,
-    0173,0101,0102,0103,0104,0105,0106,0107,
-    0110,0111,0350,0351,0352,0353,0354,0355,
-    0175,0112,0113,0114,0115,0116,0117,0120,
-    0121,0122,0356,0357,0360,0361,0362,0363,
-    0134,0237,0123,0124,0125,0126,0127,0130,
-    0131,0132,0364,0365,0366,0367,0370,0371,
-    0060,0061,0062,0063,0064,0065,0066,0067,
-    0070,0071,0372,0373,0374,0375,0376,0377
-};
-
-const char* extract_filename(const char* path) {
-    const char* filename = strrchr(path, '/');
-    if (filename) {
-        return filename + 1;
-    }
-    return path;
-}
-
-int
-main(argc, argv)
-int argc;
-char *argv[];
-{
-  FILE *fp, *fpo;
-  int c, e, p = 0, relseek = 1, negseek = 0, revert = 0;
-  int cols = 0, nonzero = 0, autoskip = 0, hextype = HEX_NORMAL;
-  int ebcdic = 0;
-  int octspergrp = -1;	/* number of octets grouped in output */
-  int grplen;		/* total chars per octet group */
-  long length = -1, n = 0, seekoff = 0;
-  char l[LLEN+1];
-  char *pname, *pp;
-
-#ifdef AMIGA
-  /* This program doesn't work when started from the Workbench */
-  if (argc == 0)
-    exit(1);
-#endif
-
-  pname = argv[0];
-  for (pp = pname; *pp; )
-    if (*pp++ == PATH_SEP)
-      pname = pp;
-#ifdef FILE_SEP
-  for (pp = pname; *pp; pp++)
-    if (*pp == FILE_SEP)
-      {
-    *pp = '\0';
-    break;
-      }
-#endif
-
-  while (argc >= 2)
-    {
-      pp = argv[1] + (!STRNCMP(argv[1], "--", 2) && argv[1][2]);
-       if (!STRNCMP(pp, "-a", 2)) autoskip = 1 - autoskip;
-      else if (!STRNCMP(pp, "-b", 2)) hextype = HEX_BITS;
-      else if (!STRNCMP(pp, "-u", 2)) hexx = hexxa + 16;
-      else if (!STRNCMP(pp, "-p", 2)) hextype = HEX_POSTSCRIPT;
-      else if (!STRNCMP(pp, "-i", 2)) hextype = HEX_CINCLUDE;
-      else if (!STRNCMP(pp, "-r", 2)) revert++;
-      else if (!STRNCMP(pp, "-E", 2)) ebcdic++;
-      else if (!STRNCMP(pp, "-v", 2))
-    {
-      fprintf(stderr, "%s%s\n", version, osver);
-      exit(0);
-    }
-      else if (!STRNCMP(pp, "-c", 2))
-    {
-      if (pp[2] && STRNCMP("ols", pp + 2, 3))
-        cols = (int)strtol(pp + 2, NULL, 0);
-      else
-        {
-          if (!argv[2])
-        exit_with_usage(pname);
-          cols = (int)strtol(argv[2], NULL, 0);
-          argv++;
-          argc--;
-        }
-    }
-      else if (!STRNCMP(pp, "-g", 2))
-    {
-      if (pp[2] && STRNCMP("roupsize", pp + 2, 8))
-        octspergrp = (int)strtol(pp + 2, NULL, 0);
-      else
-        {
-          if (!argv[2])
-        exit_with_usage(pname);
-          octspergrp = (int)strtol(argv[2], NULL, 0);
-          argv++;
-          argc--;
-        }
-    }
-      else if (!STRNCMP(pp, "-s", 2))
-    {
-      relseek = 0;
-      negseek = 0;
-      if (pp[2] && STRNCMP("kip", pp+2, 3) && STRNCMP("eek", pp+2, 3))
-        {
-#ifdef TRY_SEEK
-          if (pp[2] == '+')
-        relseek++;
-          if (pp[2+relseek] == '-')
-        negseek++;
-#endif
-          seekoff = strtol(pp + 2+relseek+negseek, (char **)NULL, 0);
-        }
-      else
-        {
-          if (!argv[2])
-        exit_with_usage(pname);
-#ifdef TRY_SEEK
-          if (argv[2][0] == '+')
-        relseek++;
-          if (argv[2][relseek] == '-')
-        negseek++;
-#endif
-          seekoff = strtol(argv[2] + relseek+negseek, (char **)NULL, 0);
-          argv++;
-          argc--;
-        }
-    }
-      else if (!STRNCMP(pp, "-l", 2))
-    {
-      if (pp[2] && STRNCMP("en", pp + 2, 2))
-        length = strtol(pp + 2, (char **)NULL, 0);
-      else
-        {
-          if (!argv[2])
-        exit_with_usage(pname);
-          length = strtol(argv[2], (char **)NULL, 0);
-          argv++;
-          argc--;
-        }
-    }
-      else if (!strcmp(pp, "--"))	/* end of options */
-    {
-      argv++;
-      argc--;
-      break;
-    }
-      else if (pp[0] == '-' && pp[1])	/* unknown option */
-    exit_with_usage(pname);
-      else
-    break;				/* not an option */
-
-      argv++;				/* advance to next argument */
-      argc--;
-    }
-
-  if (!cols)
-    switch (hextype)
-      {
-      case HEX_POSTSCRIPT:	cols = 30; break;
-      case HEX_CINCLUDE:	cols = 12; break;
-      case HEX_BITS:		cols = 6; break;
-      case HEX_NORMAL:
-      default:			cols = 16; break;
-      }
-
-  if (octspergrp < 0)
-    switch (hextype)
-      {
-      case HEX_BITS:		octspergrp = 1; break;
-      case HEX_NORMAL:		octspergrp = 2; break;
-      case HEX_POSTSCRIPT:
-      case HEX_CINCLUDE:
-      default:			octspergrp = 0; break;
-      }
-
-  if (cols < 1 || ((hextype == HEX_NORMAL || hextype == HEX_BITS)
-                                && (cols > COLS)))
-    {
-      fprintf(stderr, "%s: invalid number of columns (max. %d).\n", pname, COLS);
-      exit(1);
-    }
-
-  if (octspergrp < 1)
-    octspergrp = cols;
-
-  if (argc > 3)
-    exit_with_usage(pname);
-
-  if (argc == 1 || (argv[1][0] == '-' && !argv[1][1]))
-    BIN_ASSIGN(fp = stdin, !revert);
-  else
-    {
-      if ((fp = fopen(argv[1], BIN_READ(!revert))) == NULL)
-    {
-      fprintf(stderr,"%s: ", pname);
-      perror(argv[1]);
-      return 2;
-    }
-    }
-
-  if (argc < 3 || (argv[2][0] == '-' && !argv[2][1]))
-    BIN_ASSIGN(fpo = stdout, revert);
-  else
-    {
-      int fd;
-      int mode = revert ? O_WRONLY : (O_TRUNC|O_WRONLY);
-
-      if (((fd = OPEN(argv[2], mode | BIN_CREAT(revert), 0666)) < 0) ||
-      (fpo = fdopen(fd, BIN_WRITE(revert))) == NULL)
-    {
-      fprintf(stderr, "%s: ", pname);
-      perror(argv[2]);
-      return 3;
-    }
-      rewind(fpo);
-    }
-
-  if (revert)
-    {
-      if (hextype && (hextype != HEX_POSTSCRIPT))
-    {
-      fprintf(stderr, "%s: sorry, cannot revert this type of hexdump\n", pname);
-      return -1;
-    }
-      return huntype(fp, fpo, stderr, pname, cols, hextype,
-        negseek ? -seekoff : seekoff);
-    }
-
-  if (seekoff || negseek || !relseek)
-    {
-#ifdef TRY_SEEK
-      if (relseek)
-    e = fseek(fp, negseek ? -seekoff : seekoff, 1);
-      else
-    e = fseek(fp, negseek ? -seekoff : seekoff, negseek ? 2 : 0);
-      if (e < 0 && negseek)
-    {
-      fprintf(stderr, "%s: sorry cannot seek.\n", pname);
-      return 4;
-    }
-      if (e >= 0)
-    seekoff = ftell(fp);
-      else
-#endif
-    {
-      long s = seekoff;
-
-      while (s--)
-        (void)getc(fp);
-    }
-    }
-
-  if (hextype == HEX_CINCLUDE)
-    {
-      const char* filename = extract_filename(argv[1]);
-
-      if (fp != stdin)
-    {
-      fprintf(fpo, "unsigned char %s", isdigit((int)filename[0]) ? "__" : "");
-      for (e = 0; (c = filename[e]) != 0; e++)
-        putc(isalnum(c) ? c : '_', fpo);
-      fputs("[] = {\n", fpo);
-    }
-
-      p = 0;
-      while ((length < 0 || p < length) && (c = getc(fp)) != EOF)
-    {
-      fprintf(fpo, (hexx == hexxa) ? "%s0x%02x" : "%s0X%02X",
-        (p % cols) ? ", " : ",\n  "+2*!p,  c);
-      p++;
-    }
-
-      if (p)
-    fputs("\n};\n"+3*(fp == stdin), fpo);
-
-      if (fp != stdin)
-    {
-      fprintf(fpo, "unsigned int %s", isdigit((int)filename[0]) ? "__" : "");
-      for (e = 0; (c = filename[e]) != 0; e++)
-        putc(isalnum(c) ? c : '_', fpo);
-      fprintf(fpo, "_len = %d;\n", p);
-    }
-
-      fclose(fp);
-      fclose(fpo);
-      return 0;
-    }
-
-  if (hextype == HEX_POSTSCRIPT)
-    {
-      p = cols;
-      while ((length < 0 || n < length) && (e = getc(fp)) != EOF)
-    {
-      putchar(hexx[(e >> 4) & 0xf]);
-      putchar(hexx[(e     ) & 0xf]);
-      n++;
-      if (!--p)
-        {
-          putchar('\n');
-          p = cols;
-        }
-    }
-      if (p < cols)
-    putchar('\n');
-      fclose(fp);
-      fclose(fpo);
-      return 0;
-    }
-
-  /* hextype: HEX_NORMAL or HEX_BITS */
-
-  if (hextype == HEX_NORMAL)
-    grplen = octspergrp + octspergrp + 1;	/* chars per octet group */
-  else	/* hextype == HEX_BITS */
-    grplen = 8 * octspergrp + 1;
-
-  while ((length < 0 || n < length) && (e = getc(fp)) != EOF)
-    {
-      if (p == 0)
-    {
-      sprintf(l, "%07lx: ", n + seekoff);
-      for (c = 9; c < LLEN; l[c++] = ' ');
-    }
-      if (hextype == HEX_NORMAL)
-    {
-      l[c = (9 + (grplen * p) / octspergrp)] = hexx[(e >> 4) & 0xf];
-      l[++c]			       = hexx[ e       & 0xf];
-    }
-      else /* hextype == HEX_BITS */
-    {
-      int i;
-
-      c = (9 + (grplen * p) / octspergrp) - 1;
-      for (i = 7; i >= 0; i--)
-        l[++c] = (e & (1 << i)) ? '1' : '0';
-    }
-      if (ebcdic)
-    e = (e < 64) ? '.' : etoa64[e-64];
-      /* When changing this update definition of LLEN above. */
-      l[11 + (grplen * cols - 1)/octspergrp + p] =
-#ifdef __MVS__
-      (e >= 64)
-#else
-      (e > 31 && e < 127)
-#endif
-      ? e : '.';
-      if (e)
-    nonzero++;
-      n++;
-      if (++p == cols)
-    {
-      l[c = (11 + (grplen * cols - 1)/octspergrp + p)] = '\n'; l[++c] = '\0';
-      xxdline(fpo, l, autoskip ? nonzero : 1);
-      nonzero = 0;
-      p = 0;
-    }
-    }
-  if (p)
-    {
-      l[c = (11 + (grplen * cols - 1)/octspergrp + p)] = '\n'; l[++c] = '\0';
-      xxdline(fpo, l, 1);
-    }
-  else if (autoskip)
-    xxdline(fpo, l, -1);	/* last chance to flush out suppressed lines */
-
-  fclose(fp);
-  fclose(fpo);
-  return 0;
-}
--- a/kompute/kompute-config.cmake
+++ b/kompute/kompute-config.cmake
@@ -1,28 +0,0 @@
-# General purpose GPU compute framework built on Vulkan to
-# support 1000s of cross vendor graphics cards
-# (AMD, Qualcomm, NVIDIA & friends). Blazing fast, mobile-enabled,
-# asynchronous and optimized for advanced GPU data processing use cases.
-# Backed by the Linux Foundation. 
-#
-# Finding this module will define the following variables:
-#  KOMPUTE_FOUND - True if the core library has been found
-#  KOMPUTE_LIBRARIES - Path to the core library archive
-#  KOMPUTE_INCLUDE_DIRS - Path to the include directories. Gives access
-#                     to kompute.h, as a single include which must be included in every
-#                     file that uses this interface. Else it also points to the
-#                     directory for individual includes.
-
-find_path(KOMPUTE_INCLUDE_DIR
-          NAMES kompute.h)
-
-find_library(KOMPUTE_LIBRARY
-             NAMES kompute
-             HINTS ${KOMPUTE_LIBRARY_ROOT})
-
-include(FindPackageHandleStandardArgs)
-find_package_handle_standard_args(KOMPUTE REQUIRED_VARS KOMPUTE_LIBRARY KOMPUTE_INCLUDE_DIR)
-
-if(KOMPUTE_FOUND)
-    set(KOMPUTE_LIBRARIES ${KOMPUTE_LIBRARY})
-    set(KOMPUTE_INCLUDE_DIRS ${KOMPUTE_INCLUDE_DIR})
-endif()
--- a/kompute/op_add.comp
+++ b/kompute/op_add.comp
@@ -1,145 +0,0 @@
-/**
- * Copyright (c) 2023 Nomic, Inc. All rights reserved.
- *
- * This software is licensed under the terms of the Software for Open Models License (SOM),
- * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
- * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
- */
-
-#version 450
-
-#extension GL_EXT_shader_16bit_storage: require
-#extension GL_EXT_shader_8bit_storage: require
-#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
-#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
-#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
-#extension GL_EXT_control_flow_attributes: enable
-
-#define QK4_0 32
-#define QR4_0 2
-#define QK4_1 32
-
-#define GELU_COEF_A 0.044715
-#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
-
-#ifndef QK_K
-#define QK_K 256
-#endif
-
-#if QK_K == 256
-#define K_SCALE_SIZE 12
-#else
-#define K_SCALE_SIZE 4
-#endif
-
-#define BM 128
-#define BN 128
-#define BK 8
-#define TM 8
-#define TN 8
-
-#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
-#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
-#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
-#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
-
-#define sizeof_block_q4_0 0x12
-#define sizeof_block_q4_1 0x14
-struct block_q4_0 {
-    float16_t d;
-    uint8_t qs[QK4_0 / 2];
-};
-struct block_q4_1 {
-    float16_t d;
-    float16_t m;
-    uint8_t qs[QK4_1 / 2];
-};
-
-#ifndef QK_K
-#define QK_K 256
-#endif
-
-#if QK_K == 256
-#define K_SCALE_SIZE 12
-#else
-#define K_SCALE_SIZE 4
-#endif
-
-struct block_q2_K {
-    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
-    uint8_t qs[QK_K/4];      // quants
-    float16_t d;           // super-block scale for quantized scales
-    float16_t dmin;        // super-block scale for quantized mins
-};
-// 84 bytes / block
-
-struct block_q3_K {
-    uint8_t hmask[QK_K/8];     // quants - high bit
-    uint8_t qs[QK_K/4];        // quants - low 2 bits
-#if QK_K == 64
-    uint8_t scales[2];
-#else
-    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
-#endif
-    float16_t d;             // super-block scale
-};
-
-#if QK_K == 64
-typedef struct {
-    float16_t    d[2];          // super-block scales/mins
-    uint8_t scales[2];
-    uint8_t qs[QK_K/2];    // 4-bit quants
-} block_q4_K;
-#else
-struct block_q4_K {
-    float16_t d;             // super-block scale for quantized scales
-    float16_t dmin;          // super-block scale for quantized mins
-    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
-    uint8_t qs[QK_K/2];        // 4--bit quants
-};
-#endif
-
-#if QK_K == 64
-struct block_q5_K {
-    float16_t  d;                     // super-block scales/mins
-    int8_t  scales[QK_K/16];     // 8-bit block scales
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-};
-#else
-struct block_q5_K {
-    float16_t d;                      // super-block scale for quantized scales
-    float16_t dmin;                   // super-block scale for quantized mins
-    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-};
-// 176 bytes / block
-#endif
-
-struct block_q6_K {
-    uint8_t ql[QK_K/2];      // quants, lower 4 bits
-    uint8_t qh[QK_K/4];      // quants, upper 2 bits
-    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
-    float16_t d;                  // super-block scale
-};
-// 210 bytes / block
-
-layout(local_size_x = 1) in;
-
-layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
-layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; };
-layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; };
-
-layout(push_constant) uniform PushConstants {
-    uint inAOff;
-    uint inBOff;
-    uint outOff;
-    uint row;
-} pcs;
-
-void main() {
-    const uint i = gl_WorkGroupID.x;
-
-    out_[i + pcs.outOff] = inA[i + pcs.inAOff] + inB[(i) + pcs.inBOff];
-}
--- a/kompute/op_addrow.comp
+++ b/kompute/op_addrow.comp
@@ -1,145 +0,0 @@
-/**
- * Copyright (c) 2023 Nomic, Inc. All rights reserved.
- *
- * This software is licensed under the terms of the Software for Open Models License (SOM),
- * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
- * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
- */
-
-#version 450
-
-#extension GL_EXT_shader_16bit_storage: require
-#extension GL_EXT_shader_8bit_storage: require
-#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
-#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
-#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
-#extension GL_EXT_control_flow_attributes: enable
-
-#define QK4_0 32
-#define QR4_0 2
-#define QK4_1 32
-
-#define GELU_COEF_A 0.044715
-#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
-
-#ifndef QK_K
-#define QK_K 256
-#endif
-
-#if QK_K == 256
-#define K_SCALE_SIZE 12
-#else
-#define K_SCALE_SIZE 4
-#endif
-
-#define BM 128
-#define BN 128
-#define BK 8
-#define TM 8
-#define TN 8
-
-#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
-#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
-#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
-#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
-
-#define sizeof_block_q4_0 0x12
-#define sizeof_block_q4_1 0x14
-struct block_q4_0 {
-    float16_t d;
-    uint8_t qs[QK4_0 / 2];
-};
-struct block_q4_1 {
-    float16_t d;
-    float16_t m;
-    uint8_t qs[QK4_1 / 2];
-};
-
-#ifndef QK_K
-#define QK_K 256
-#endif
-
-#if QK_K == 256
-#define K_SCALE_SIZE 12
-#else
-#define K_SCALE_SIZE 4
-#endif
-
-struct block_q2_K {
-    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
-    uint8_t qs[QK_K/4];      // quants
-    float16_t d;           // super-block scale for quantized scales
-    float16_t dmin;        // super-block scale for quantized mins
-};
-// 84 bytes / block
-
-struct block_q3_K {
-    uint8_t hmask[QK_K/8];     // quants - high bit
-    uint8_t qs[QK_K/4];        // quants - low 2 bits
-#if QK_K == 64
-    uint8_t scales[2];
-#else
-    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
-#endif
-    float16_t d;             // super-block scale
-};
-
-#if QK_K == 64
-typedef struct {
-    float16_t    d[2];          // super-block scales/mins
-    uint8_t scales[2];
-    uint8_t qs[QK_K/2];    // 4-bit quants
-} block_q4_K;
-#else
-struct block_q4_K {
-    float16_t d;             // super-block scale for quantized scales
-    float16_t dmin;          // super-block scale for quantized mins
-    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
-    uint8_t qs[QK_K/2];        // 4--bit quants
-};
-#endif
-
-#if QK_K == 64
-struct block_q5_K {
-    float16_t  d;                     // super-block scales/mins
-    int8_t  scales[QK_K/16];     // 8-bit block scales
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-};
-#else
-struct block_q5_K {
-    float16_t d;                      // super-block scale for quantized scales
-    float16_t dmin;                   // super-block scale for quantized mins
-    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-};
-// 176 bytes / block
-#endif
-
-struct block_q6_K {
-    uint8_t ql[QK_K/2];      // quants, lower 4 bits
-    uint8_t qh[QK_K/4];      // quants, upper 2 bits
-    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
-    float16_t d;                  // super-block scale
-};
-// 210 bytes / block
-
-layout(local_size_x = 1) in;
-
-layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
-layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; };
-layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; };
-
-layout(push_constant) uniform PushConstants {
-    uint inAOff;
-    uint inBOff;
-    uint outOff;
-    uint row;
-} pcs;
-
-void main() {
-    const uint i = gl_WorkGroupID.x;
-
-    out_[i + pcs.outOff] = inA[i + pcs.inAOff] + inB[(i % pcs.row) + pcs.inBOff];
-}
--- a/kompute/op_cpy_f16_f16.comp
+++ b/kompute/op_cpy_f16_f16.comp
@@ -1,176 +0,0 @@
-/**
- * Copyright (c) 2023 Nomic, Inc. All rights reserved.
- *
- * This software is licensed under the terms of the Software for Open Models License (SOM),
- * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
- * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
- */
-
-#version 450
-
-#extension GL_EXT_shader_16bit_storage: require
-#extension GL_EXT_shader_8bit_storage: require
-#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
-#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
-#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
-#extension GL_EXT_control_flow_attributes: enable
-
-#define QK4_0 32
-#define QR4_0 2
-#define QK4_1 32
-
-#define GELU_COEF_A 0.044715
-#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
-
-#ifndef QK_K
-#define QK_K 256
-#endif
-
-#if QK_K == 256
-#define K_SCALE_SIZE 12
-#else
-#define K_SCALE_SIZE 4
-#endif
-
-#define BM 128
-#define BN 128
-#define BK 8
-#define TM 8
-#define TN 8
-
-#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
-#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
-#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
-#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
-
-#define sizeof_block_q4_0 0x12
-#define sizeof_block_q4_1 0x14
-struct block_q4_0 {
-    float16_t d;
-    uint8_t qs[QK4_0 / 2];
-};
-struct block_q4_1 {
-    float16_t d;
-    float16_t m;
-    uint8_t qs[QK4_1 / 2];
-};
-
-#ifndef QK_K
-#define QK_K 256
-#endif
-
-#if QK_K == 256
-#define K_SCALE_SIZE 12
-#else
-#define K_SCALE_SIZE 4
-#endif
-
-struct block_q2_K {
-    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
-    uint8_t qs[QK_K/4];      // quants
-    float16_t d;           // super-block scale for quantized scales
-    float16_t dmin;        // super-block scale for quantized mins
-};
-// 84 bytes / block
-
-struct block_q3_K {
-    uint8_t hmask[QK_K/8];     // quants - high bit
-    uint8_t qs[QK_K/4];        // quants - low 2 bits
-#if QK_K == 64
-    uint8_t scales[2];
-#else
-    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
-#endif
-    float16_t d;             // super-block scale
-};
-
-#if QK_K == 64
-typedef struct {
-    float16_t    d[2];          // super-block scales/mins
-    uint8_t scales[2];
-    uint8_t qs[QK_K/2];    // 4-bit quants
-} block_q4_K;
-#else
-struct block_q4_K {
-    float16_t d;             // super-block scale for quantized scales
-    float16_t dmin;          // super-block scale for quantized mins
-    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
-    uint8_t qs[QK_K/2];        // 4--bit quants
-};
-#endif
-
-#if QK_K == 64
-struct block_q5_K {
-    float16_t  d;                     // super-block scales/mins
-    int8_t  scales[QK_K/16];     // 8-bit block scales
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-};
-#else
-struct block_q5_K {
-    float16_t d;                      // super-block scale for quantized scales
-    float16_t dmin;                   // super-block scale for quantized mins
-    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-};
-// 176 bytes / block
-#endif
-
-struct block_q6_K {
-    uint8_t ql[QK_K/2];      // quants, lower 4 bits
-    uint8_t qh[QK_K/4];      // quants, upper 2 bits
-    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
-    float16_t d;                  // super-block scale
-};
-// 210 bytes / block
-
-#define nth 32
-#define IN_TYPE float16_t
-#define IN_TYPE_SIZE 2
-#define OUT_TYPE float16_t
-#define OUT_TYPE_SIZE 2
-
-layout(local_size_x = nth) in;
-
-layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
-layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
-
-layout (push_constant) uniform parameter {
-    uint inOff;
-    uint outOff;
-    int ne00;
-    int ne01;
-    int ne02;
-    uint nb00;
-    uint nb01;
-    uint nb02;
-    uint nb03;
-    int ne0;
-    int ne1;
-    int ne2;
-    uint nb0;
-    uint nb1;
-    uint nb2;
-    uint nb3;
-} pcs;
-
-void main() {
-    const uint i03 = gl_WorkGroupID.z;
-    const uint i02 = gl_WorkGroupID.y;
-    const uint i01 = gl_WorkGroupID.x;
-
-    const int n = int(i03)*pcs.ne02*pcs.ne01*pcs.ne00 + int(i02)*pcs.ne01*pcs.ne00 + int(i01)*pcs.ne00;
-
-    const int i3 = n / (pcs.ne2*pcs.ne1*pcs.ne0);
-    const int i2 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0) / (pcs.ne1*pcs.ne0);
-    const int i1 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0) / pcs.ne0;
-    const int i0 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0 - i1*pcs.ne0);
-
-    const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_
-
-    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
-        const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
-        out_[dst_data+i00] = OUT_TYPE(in_[src]);
-    }
-}
--- a/kompute/op_cpy_f16_f32.comp
+++ b/kompute/op_cpy_f16_f32.comp
@@ -1,176 +0,0 @@
-/**
- * Copyright (c) 2023 Nomic, Inc. All rights reserved.
- *
- * This software is licensed under the terms of the Software for Open Models License (SOM),
- * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
- * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
- */
-
-#version 450
-
-#extension GL_EXT_shader_16bit_storage: require
-#extension GL_EXT_shader_8bit_storage: require
-#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
-#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
-#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
-#extension GL_EXT_control_flow_attributes: enable
-
-#define QK4_0 32
-#define QR4_0 2
-#define QK4_1 32
-
-#define GELU_COEF_A 0.044715
-#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
-
-#ifndef QK_K
-#define QK_K 256
-#endif
-
-#if QK_K == 256
-#define K_SCALE_SIZE 12
-#else
-#define K_SCALE_SIZE 4
-#endif
-
-#define BM 128
-#define BN 128
-#define BK 8
-#define TM 8
-#define TN 8
-
-#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
-#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
-#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
-#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
-
-#define sizeof_block_q4_0 0x12
-#define sizeof_block_q4_1 0x14
-struct block_q4_0 {
-    float16_t d;
-    uint8_t qs[QK4_0 / 2];
-};
-struct block_q4_1 {
-    float16_t d;
-    float16_t m;
-    uint8_t qs[QK4_1 / 2];
-};
-
-#ifndef QK_K
-#define QK_K 256
-#endif
-
-#if QK_K == 256
-#define K_SCALE_SIZE 12
-#else
-#define K_SCALE_SIZE 4
-#endif
-
-struct block_q2_K {
-    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
-    uint8_t qs[QK_K/4];      // quants
-    float16_t d;           // super-block scale for quantized scales
-    float16_t dmin;        // super-block scale for quantized mins
-};
-// 84 bytes / block
-
-struct block_q3_K {
-    uint8_t hmask[QK_K/8];     // quants - high bit
-    uint8_t qs[QK_K/4];        // quants - low 2 bits
-#if QK_K == 64
-    uint8_t scales[2];
-#else
-    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
-#endif
-    float16_t d;             // super-block scale
-};
-
-#if QK_K == 64
-typedef struct {
-    float16_t    d[2];          // super-block scales/mins
-    uint8_t scales[2];
-    uint8_t qs[QK_K/2];    // 4-bit quants
-} block_q4_K;
-#else
-struct block_q4_K {
-    float16_t d;             // super-block scale for quantized scales
-    float16_t dmin;          // super-block scale for quantized mins
-    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
-    uint8_t qs[QK_K/2];        // 4--bit quants
-};
-#endif
-
-#if QK_K == 64
-struct block_q5_K {
-    float16_t  d;                     // super-block scales/mins
-    int8_t  scales[QK_K/16];     // 8-bit block scales
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-};
-#else
-struct block_q5_K {
-    float16_t d;                      // super-block scale for quantized scales
-    float16_t dmin;                   // super-block scale for quantized mins
-    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-};
-// 176 bytes / block
-#endif
-
-struct block_q6_K {
-    uint8_t ql[QK_K/2];      // quants, lower 4 bits
-    uint8_t qh[QK_K/4];      // quants, upper 2 bits
-    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
-    float16_t d;                  // super-block scale
-};
-// 210 bytes / block
-
-#define nth 32
-#define IN_TYPE float16_t
-#define IN_TYPE_SIZE 2
-#define OUT_TYPE float
-#define OUT_TYPE_SIZE 4
-
-layout(local_size_x = nth) in;
-
-layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
-layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
-
-layout (push_constant) uniform parameter {
-    uint inOff;
-    uint outOff;
-    int ne00;
-    int ne01;
-    int ne02;
-    uint nb00;
-    uint nb01;
-    uint nb02;
-    uint nb03;
-    int ne0;
-    int ne1;
-    int ne2;
-    uint nb0;
-    uint nb1;
-    uint nb2;
-    uint nb3;
-} pcs;
-
-void main() {
-    const uint i03 = gl_WorkGroupID.z;
-    const uint i02 = gl_WorkGroupID.y;
-    const uint i01 = gl_WorkGroupID.x;
-
-    const int n = int(i03)*pcs.ne02*pcs.ne01*pcs.ne00 + int(i02)*pcs.ne01*pcs.ne00 + int(i01)*pcs.ne00;
-
-    const int i3 = n / (pcs.ne2*pcs.ne1*pcs.ne0);
-    const int i2 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0) / (pcs.ne1*pcs.ne0);
-    const int i1 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0) / pcs.ne0;
-    const int i0 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0 - i1*pcs.ne0);
-
-    const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_
-
-    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
-        const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
-        out_[dst_data+i00] = OUT_TYPE(in_[src]);
-    }
-}
--- a/kompute/op_cpy_f32_f16.comp
+++ b/kompute/op_cpy_f32_f16.comp
@@ -1,176 +0,0 @@
-/**
- * Copyright (c) 2023 Nomic, Inc. All rights reserved.
- *
- * This software is licensed under the terms of the Software for Open Models License (SOM),
- * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
- * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
- */
-
-#version 450
-
-#extension GL_EXT_shader_16bit_storage: require
-#extension GL_EXT_shader_8bit_storage: require
-#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
-#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
-#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
-#extension GL_EXT_control_flow_attributes: enable
-
-#define QK4_0 32
-#define QR4_0 2
-#define QK4_1 32
-
-#define GELU_COEF_A 0.044715
-#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
-
-#ifndef QK_K
-#define QK_K 256
-#endif
-
-#if QK_K == 256
-#define K_SCALE_SIZE 12
-#else
-#define K_SCALE_SIZE 4
-#endif
-
-#define BM 128
-#define BN 128
-#define BK 8
-#define TM 8
-#define TN 8
-
-#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
-#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
-#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
-#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
-
-#define sizeof_block_q4_0 0x12
-#define sizeof_block_q4_1 0x14
-struct block_q4_0 {
-    float16_t d;
-    uint8_t qs[QK4_0 / 2];
-};
-struct block_q4_1 {
-    float16_t d;
-    float16_t m;
-    uint8_t qs[QK4_1 / 2];
-};
-
-#ifndef QK_K
-#define QK_K 256
-#endif
-
-#if QK_K == 256
-#define K_SCALE_SIZE 12
-#else
-#define K_SCALE_SIZE 4
-#endif
-
-struct block_q2_K {
-    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
-    uint8_t qs[QK_K/4];      // quants
-    float16_t d;           // super-block scale for quantized scales
-    float16_t dmin;        // super-block scale for quantized mins
-};
-// 84 bytes / block
-
-struct block_q3_K {
-    uint8_t hmask[QK_K/8];     // quants - high bit
-    uint8_t qs[QK_K/4];        // quants - low 2 bits
-#if QK_K == 64
-    uint8_t scales[2];
-#else
-    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
-#endif
-    float16_t d;             // super-block scale
-};
-
-#if QK_K == 64
-typedef struct {
-    float16_t    d[2];          // super-block scales/mins
-    uint8_t scales[2];
-    uint8_t qs[QK_K/2];    // 4-bit quants
-} block_q4_K;
-#else
-struct block_q4_K {
-    float16_t d;             // super-block scale for quantized scales
-    float16_t dmin;          // super-block scale for quantized mins
-    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
-    uint8_t qs[QK_K/2];        // 4--bit quants
-};
-#endif
-
-#if QK_K == 64
-struct block_q5_K {
-    float16_t  d;                     // super-block scales/mins
-    int8_t  scales[QK_K/16];     // 8-bit block scales
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-};
-#else
-struct block_q5_K {
-    float16_t d;                      // super-block scale for quantized scales
-    float16_t dmin;                   // super-block scale for quantized mins
-    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-};
-// 176 bytes / block
-#endif
-
-struct block_q6_K {
-    uint8_t ql[QK_K/2];      // quants, lower 4 bits
-    uint8_t qh[QK_K/4];      // quants, upper 2 bits
-    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
-    float16_t d;                  // super-block scale
-};
-// 210 bytes / block
-
-#define nth 32
-#define IN_TYPE float
-#define IN_TYPE_SIZE 4
-#define OUT_TYPE float16_t
-#define OUT_TYPE_SIZE 2
-
-layout(local_size_x = nth) in;
-
-layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
-layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
-
-layout (push_constant) uniform parameter {
-    uint inOff;
-    uint outOff;
-    int ne00;
-    int ne01;
-    int ne02;
-    uint nb00;
-    uint nb01;
-    uint nb02;
-    uint nb03;
-    int ne0;
-    int ne1;
-    int ne2;
-    uint nb0;
-    uint nb1;
-    uint nb2;
-    uint nb3;
-} pcs;
-
-void main() {
-    const uint i03 = gl_WorkGroupID.z;
-    const uint i02 = gl_WorkGroupID.y;
-    const uint i01 = gl_WorkGroupID.x;
-
-    const int n = int(i03)*pcs.ne02*pcs.ne01*pcs.ne00 + int(i02)*pcs.ne01*pcs.ne00 + int(i01)*pcs.ne00;
-
-    const int i3 = n / (pcs.ne2*pcs.ne1*pcs.ne0);
-    const int i2 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0) / (pcs.ne1*pcs.ne0);
-    const int i1 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0) / pcs.ne0;
-    const int i0 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0 - i1*pcs.ne0);
-
-    const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_
-
-    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
-        const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
-        out_[dst_data+i00] = OUT_TYPE(in_[src]);
-    }
-}
--- a/kompute/op_cpy_f32_f32.comp
+++ b/kompute/op_cpy_f32_f32.comp
@@ -1,168 +0,0 @@
-#version 450
-
-#extension GL_EXT_shader_16bit_storage: require
-#extension GL_EXT_shader_8bit_storage: require
-#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
-#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
-#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
-#extension GL_EXT_control_flow_attributes: enable
-
-#define QK4_0 32
-#define QR4_0 2
-#define QK4_1 32
-
-#define GELU_COEF_A 0.044715
-#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
-
-#ifndef QK_K
-#define QK_K 256
-#endif
-
-#if QK_K == 256
-#define K_SCALE_SIZE 12
-#else
-#define K_SCALE_SIZE 4
-#endif
-
-#define BM 128
-#define BN 128
-#define BK 8
-#define TM 8
-#define TN 8
-
-#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
-#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
-#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
-#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
-
-#define sizeof_block_q4_0 0x12
-#define sizeof_block_q4_1 0x14
-struct block_q4_0 {
-    float16_t d;
-    uint8_t qs[QK4_0 / 2];
-};
-struct block_q4_1 {
-    float16_t d;
-    float16_t m;
-    uint8_t qs[QK4_1 / 2];
-};
-
-#ifndef QK_K
-#define QK_K 256
-#endif
-
-#if QK_K == 256
-#define K_SCALE_SIZE 12
-#else
-#define K_SCALE_SIZE 4
-#endif
-
-struct block_q2_K {
-    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
-    uint8_t qs[QK_K/4];      // quants
-    float16_t d;           // super-block scale for quantized scales
-    float16_t dmin;        // super-block scale for quantized mins
-};
-// 84 bytes / block
-
-struct block_q3_K {
-    uint8_t hmask[QK_K/8];     // quants - high bit
-    uint8_t qs[QK_K/4];        // quants - low 2 bits
-#if QK_K == 64
-    uint8_t scales[2];
-#else
-    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
-#endif
-    float16_t d;             // super-block scale
-};
-
-#if QK_K == 64
-typedef struct {
-    float16_t    d[2];          // super-block scales/mins
-    uint8_t scales[2];
-    uint8_t qs[QK_K/2];    // 4-bit quants
-} block_q4_K;
-#else
-struct block_q4_K {
-    float16_t d;             // super-block scale for quantized scales
-    float16_t dmin;          // super-block scale for quantized mins
-    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
-    uint8_t qs[QK_K/2];        // 4--bit quants
-};
-#endif
-
-#if QK_K == 64
-struct block_q5_K {
-    float16_t  d;                     // super-block scales/mins
-    int8_t  scales[QK_K/16];     // 8-bit block scales
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-};
-#else
-struct block_q5_K {
-    float16_t d;                      // super-block scale for quantized scales
-    float16_t dmin;                   // super-block scale for quantized mins
-    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-};
-// 176 bytes / block
-#endif
-
-struct block_q6_K {
-    uint8_t ql[QK_K/2];      // quants, lower 4 bits
-    uint8_t qh[QK_K/4];      // quants, upper 2 bits
-    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
-    float16_t d;                  // super-block scale
-};
-// 210 bytes / block
-
-#define nth 32
-#define IN_TYPE float
-#define IN_TYPE_SIZE 4
-#define OUT_TYPE float
-#define OUT_TYPE_SIZE 4
-
-layout(local_size_x = nth) in;
-
-layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; };
-layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; };
-
-layout (push_constant) uniform parameter {
-    uint inOff;
-    uint outOff;
-    int ne00;
-    int ne01;
-    int ne02;
-    uint nb00;
-    uint nb01;
-    uint nb02;
-    uint nb03;
-    int ne0;
-    int ne1;
-    int ne2;
-    uint nb0;
-    uint nb1;
-    uint nb2;
-    uint nb3;
-} pcs;
-
-void main() {
-    const uint i03 = gl_WorkGroupID.z;
-    const uint i02 = gl_WorkGroupID.y;
-    const uint i01 = gl_WorkGroupID.x;
-
-    const int n = int(i03)*pcs.ne02*pcs.ne01*pcs.ne00 + int(i02)*pcs.ne01*pcs.ne00 + int(i01)*pcs.ne00;
-
-    const int i3 = n / (pcs.ne2*pcs.ne1*pcs.ne0);
-    const int i2 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0) / (pcs.ne1*pcs.ne0);
-    const int i1 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0) / pcs.ne0;
-    const int i0 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0 - i1*pcs.ne0);
-
-    const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_
-
-    for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += nth) {
-        const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_
-        out_[dst_data+i00] = OUT_TYPE(in_[src]);
-    }
-}
--- a/kompute/op_diagmask.comp
+++ b/kompute/op_diagmask.comp
@@ -1,153 +0,0 @@
-/**
- * Copyright (c) 2023 Nomic, Inc. All rights reserved.
- *
- * This software is licensed under the terms of the Software for Open Models License (SOM),
- * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
- * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
- */
-
-#version 450
-
-#extension GL_EXT_shader_16bit_storage: require
-#extension GL_EXT_shader_8bit_storage: require
-#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
-#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
-#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
-#extension GL_EXT_control_flow_attributes: enable
-
-#define QK4_0 32
-#define QR4_0 2
-#define QK4_1 32
-
-#define GELU_COEF_A 0.044715
-#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
-
-#ifndef QK_K
-#define QK_K 256
-#endif
-
-#if QK_K == 256
-#define K_SCALE_SIZE 12
-#else
-#define K_SCALE_SIZE 4
-#endif
-
-#define BM 128
-#define BN 128
-#define BK 8
-#define TM 8
-#define TN 8
-
-#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
-#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
-#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
-#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
-
-#define sizeof_block_q4_0 0x12
-#define sizeof_block_q4_1 0x14
-struct block_q4_0 {
-    float16_t d;
-    uint8_t qs[QK4_0 / 2];
-};
-struct block_q4_1 {
-    float16_t d;
-    float16_t m;
-    uint8_t qs[QK4_1 / 2];
-};
-
-#ifndef QK_K
-#define QK_K 256
-#endif
-
-#if QK_K == 256
-#define K_SCALE_SIZE 12
-#else
-#define K_SCALE_SIZE 4
-#endif
-
-struct block_q2_K {
-    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
-    uint8_t qs[QK_K/4];      // quants
-    float16_t d;           // super-block scale for quantized scales
-    float16_t dmin;        // super-block scale for quantized mins
-};
-// 84 bytes / block
-
-struct block_q3_K {
-    uint8_t hmask[QK_K/8];     // quants - high bit
-    uint8_t qs[QK_K/4];        // quants - low 2 bits
-#if QK_K == 64
-    uint8_t scales[2];
-#else
-    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
-#endif
-    float16_t d;             // super-block scale
-};
-
-#if QK_K == 64
-typedef struct {
-    float16_t    d[2];          // super-block scales/mins
-    uint8_t scales[2];
-    uint8_t qs[QK_K/2];    // 4-bit quants
-} block_q4_K;
-#else
-struct block_q4_K {
-    float16_t d;             // super-block scale for quantized scales
-    float16_t dmin;          // super-block scale for quantized mins
-    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
-    uint8_t qs[QK_K/2];        // 4--bit quants
-};
-#endif
-
-#if QK_K == 64
-struct block_q5_K {
-    float16_t  d;                     // super-block scales/mins
-    int8_t  scales[QK_K/16];     // 8-bit block scales
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-};
-#else
-struct block_q5_K {
-    float16_t d;                      // super-block scale for quantized scales
-    float16_t dmin;                   // super-block scale for quantized mins
-    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-};
-// 176 bytes / block
-#endif
-
-struct block_q6_K {
-    uint8_t ql[QK_K/2];      // quants, lower 4 bits
-    uint8_t qh[QK_K/4];      // quants, upper 2 bits
-    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
-    float16_t d;                  // super-block scale
-};
-// 210 bytes / block
-
-layout(local_size_x = 1) in;
-
-layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
-layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
-
-layout(push_constant) uniform PushConstants {
-    uint inOff;
-    uint outOff;
-    uint n_past;
-    int ne00;
-    int ne01;
-} pcs;
-
-void main() {
-    const uint i02 = gl_WorkGroupID.z;
-    const uint i01 = gl_WorkGroupID.y;
-    const uint i00 = gl_WorkGroupID.x;
-
-    const uint index = i02*pcs.ne01*pcs.ne00 + i01*pcs.ne00 + i00;
-
-    if (i00 > pcs.n_past + i01) {
-        out_[index + pcs.outOff] = uintBitsToFloat(0xFF800000);
-    } else {
-        out_[index + pcs.outOff] = in_[index + pcs.inOff];
-    }
-}
--- a/kompute/op_gelu.comp
+++ b/kompute/op_gelu.comp
@@ -1,142 +0,0 @@
-/**
- * Copyright (c) 2023 Nomic, Inc. All rights reserved.
- *
- * This software is licensed under the terms of the Software for Open Models License (SOM),
- * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
- * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
- */
-
-#version 450
-
-#extension GL_EXT_shader_16bit_storage: require
-#extension GL_EXT_shader_8bit_storage: require
-#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
-#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
-#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
-#extension GL_EXT_control_flow_attributes: enable
-
-#define QK4_0 32
-#define QR4_0 2
-#define QK4_1 32
-
-#define GELU_COEF_A 0.044715
-#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
-
-#ifndef QK_K
-#define QK_K 256
-#endif
-
-#if QK_K == 256
-#define K_SCALE_SIZE 12
-#else
-#define K_SCALE_SIZE 4
-#endif
-
-#define BM 128
-#define BN 128
-#define BK 8
-#define TM 8
-#define TN 8
-
-#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
-#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
-#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
-#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
-
-#define sizeof_block_q4_0 0x12
-#define sizeof_block_q4_1 0x14
-struct block_q4_0 {
-    float16_t d;
-    uint8_t qs[QK4_0 / 2];
-};
-struct block_q4_1 {
-    float16_t d;
-    float16_t m;
-    uint8_t qs[QK4_1 / 2];
-};
-
-#ifndef QK_K
-#define QK_K 256
-#endif
-
-#if QK_K == 256
-#define K_SCALE_SIZE 12
-#else
-#define K_SCALE_SIZE 4
-#endif
-
-struct block_q2_K {
-    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
-    uint8_t qs[QK_K/4];      // quants
-    float16_t d;           // super-block scale for quantized scales
-    float16_t dmin;        // super-block scale for quantized mins
-};
-// 84 bytes / block
-
-struct block_q3_K {
-    uint8_t hmask[QK_K/8];     // quants - high bit
-    uint8_t qs[QK_K/4];        // quants - low 2 bits
-#if QK_K == 64
-    uint8_t scales[2];
-#else
-    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
-#endif
-    float16_t d;             // super-block scale
-};
-
-#if QK_K == 64
-typedef struct {
-    float16_t    d[2];          // super-block scales/mins
-    uint8_t scales[2];
-    uint8_t qs[QK_K/2];    // 4-bit quants
-} block_q4_K;
-#else
-struct block_q4_K {
-    float16_t d;             // super-block scale for quantized scales
-    float16_t dmin;          // super-block scale for quantized mins
-    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
-    uint8_t qs[QK_K/2];        // 4--bit quants
-};
-#endif
-
-#if QK_K == 64
-struct block_q5_K {
-    float16_t  d;                     // super-block scales/mins
-    int8_t  scales[QK_K/16];     // 8-bit block scales
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-};
-#else
-struct block_q5_K {
-    float16_t d;                      // super-block scale for quantized scales
-    float16_t dmin;                   // super-block scale for quantized mins
-    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-};
-// 176 bytes / block
-#endif
-
-struct block_q6_K {
-    uint8_t ql[QK_K/2];      // quants, lower 4 bits
-    uint8_t qh[QK_K/4];      // quants, upper 2 bits
-    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
-    float16_t d;                  // super-block scale
-};
-// 210 bytes / block
-
-layout(local_size_x = 1) in;
-
-layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; };
-layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; };
-layout(push_constant) uniform PushConstants {
-    uint inOff;
-    uint outOff;
-} pcs;
-
-void main() {
-    const uint i = gl_WorkGroupID.x;
-    const float x = in_[i + pcs.inOff];
-
-    out_[i + pcs.outOff] = 0.5*x*(1.0 + tanh(SQRT_2_OVER_PI*x*(1.0 + GELU_COEF_A*x*x)));
-}
--- a/kompute/op_getrows_f16.comp
+++ b/kompute/op_getrows_f16.comp
@@ -1,150 +0,0 @@
-/**
- * Copyright (c) 2023 Nomic, Inc. All rights reserved.
- *
- * This software is licensed under the terms of the Software for Open Models License (SOM),
- * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
- * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
- */
-
-#version 450
-
-#extension GL_EXT_shader_16bit_storage: require
-#extension GL_EXT_shader_8bit_storage: require
-#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
-#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
-#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
-#extension GL_EXT_control_flow_attributes: enable
-
-#define QK4_0 32
-#define QR4_0 2
-#define QK4_1 32
-
-#define GELU_COEF_A 0.044715
-#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
-
-#ifndef QK_K
-#define QK_K 256
-#endif
-
-#if QK_K == 256
-#define K_SCALE_SIZE 12
-#else
-#define K_SCALE_SIZE 4
-#endif
-
-#define BM 128
-#define BN 128
-#define BK 8
-#define TM 8
-#define TN 8
-
-#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
-#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
-#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
-#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
-
-#define sizeof_block_q4_0 0x12
-#define sizeof_block_q4_1 0x14
-struct block_q4_0 {
-    float16_t d;
-    uint8_t qs[QK4_0 / 2];
-};
-struct block_q4_1 {
-    float16_t d;
-    float16_t m;
-    uint8_t qs[QK4_1 / 2];
-};
-
-#ifndef QK_K
-#define QK_K 256
-#endif
-
-#if QK_K == 256
-#define K_SCALE_SIZE 12
-#else
-#define K_SCALE_SIZE 4
-#endif
-
-struct block_q2_K {
-    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
-    uint8_t qs[QK_K/4];      // quants
-    float16_t d;           // super-block scale for quantized scales
-    float16_t dmin;        // super-block scale for quantized mins
-};
-// 84 bytes / block
-
-struct block_q3_K {
-    uint8_t hmask[QK_K/8];     // quants - high bit
-    uint8_t qs[QK_K/4];        // quants - low 2 bits
-#if QK_K == 64
-    uint8_t scales[2];
-#else
-    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
-#endif
-    float16_t d;             // super-block scale
-};
-
-#if QK_K == 64
-typedef struct {
-    float16_t    d[2];          // super-block scales/mins
-    uint8_t scales[2];
-    uint8_t qs[QK_K/2];    // 4-bit quants
-} block_q4_K;
-#else
-struct block_q4_K {
-    float16_t d;             // super-block scale for quantized scales
-    float16_t dmin;          // super-block scale for quantized mins
-    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
-    uint8_t qs[QK_K/2];        // 4--bit quants
-};
-#endif
-
-#if QK_K == 64
-struct block_q5_K {
-    float16_t  d;                     // super-block scales/mins
-    int8_t  scales[QK_K/16];     // 8-bit block scales
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-};
-#else
-struct block_q5_K {
-    float16_t d;                      // super-block scale for quantized scales
-    float16_t dmin;                   // super-block scale for quantized mins
-    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-};
-// 176 bytes / block
-#endif
-
-struct block_q6_K {
-    uint8_t ql[QK_K/2];      // quants, lower 4 bits
-    uint8_t qh[QK_K/4];      // quants, upper 2 bits
-    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
-    float16_t d;                  // super-block scale
-};
-// 210 bytes / block
-
-layout(local_size_x = 1) in;
-
-layout (binding = 0) readonly buffer tensorInA { float16_t inA[]; };
-layout (binding = 1) readonly buffer tensorInB { int inB[]; };
-layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
-
-layout (push_constant) uniform parameter {
-    uint inAOff;
-    uint inBOff;
-    uint outOff;
-    int ne00;
-    int nb01;
-    int nb1;
-} pcs;
-
-void main() {
-    const uint i = gl_WorkGroupID.x;
-    const int r = inB[i + pcs.inBOff];
-
-    for (int j = 0; j < pcs.ne00; j++) {
-        out_[i*pcs.nb1 + j + pcs.outOff] = inA[r*pcs.nb01/2+j + pcs.inAOff];
-    }
-}
--- a/kompute/op_getrows_q4_0.comp
+++ b/kompute/op_getrows_q4_0.comp
@@ -1,179 +0,0 @@
-/**
- * Copyright (c) 2023 Nomic, Inc. All rights reserved.
- *
- * This software is licensed under the terms of the Software for Open Models License (SOM),
- * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
- * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
- */
-
-#version 450
-
-#extension GL_EXT_shader_16bit_storage: require
-#extension GL_EXT_shader_8bit_storage: require
-#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
-#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
-#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
-#extension GL_EXT_control_flow_attributes: enable
-
-#define QK4_0 32
-#define QR4_0 2
-#define QK4_1 32
-
-#define GELU_COEF_A 0.044715
-#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
-
-#ifndef QK_K
-#define QK_K 256
-#endif
-
-#if QK_K == 256
-#define K_SCALE_SIZE 12
-#else
-#define K_SCALE_SIZE 4
-#endif
-
-#define BM 128
-#define BN 128
-#define BK 8
-#define TM 8
-#define TN 8
-
-#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
-#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
-#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
-#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
-
-#define sizeof_block_q4_0 0x12
-#define sizeof_block_q4_1 0x14
-struct block_q4_0 {
-    float16_t d;
-    uint8_t qs[QK4_0 / 2];
-};
-struct block_q4_1 {
-    float16_t d;
-    float16_t m;
-    uint8_t qs[QK4_1 / 2];
-};
-
-#ifndef QK_K
-#define QK_K 256
-#endif
-
-#if QK_K == 256
-#define K_SCALE_SIZE 12
-#else
-#define K_SCALE_SIZE 4
-#endif
-
-struct block_q2_K {
-    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
-    uint8_t qs[QK_K/4];      // quants
-    float16_t d;           // super-block scale for quantized scales
-    float16_t dmin;        // super-block scale for quantized mins
-};
-// 84 bytes / block
-
-struct block_q3_K {
-    uint8_t hmask[QK_K/8];     // quants - high bit
-    uint8_t qs[QK_K/4];        // quants - low 2 bits
-#if QK_K == 64
-    uint8_t scales[2];
-#else
-    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
-#endif
-    float16_t d;             // super-block scale
-};
-
-#if QK_K == 64
-typedef struct {
-    float16_t    d[2];          // super-block scales/mins
-    uint8_t scales[2];
-    uint8_t qs[QK_K/2];    // 4-bit quants
-} block_q4_K;
-#else
-struct block_q4_K {
-    float16_t d;             // super-block scale for quantized scales
-    float16_t dmin;          // super-block scale for quantized mins
-    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
-    uint8_t qs[QK_K/2];        // 4--bit quants
-};
-#endif
-
-#if QK_K == 64
-struct block_q5_K {
-    float16_t  d;                     // super-block scales/mins
-    int8_t  scales[QK_K/16];     // 8-bit block scales
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-};
-#else
-struct block_q5_K {
-    float16_t d;                      // super-block scale for quantized scales
-    float16_t dmin;                   // super-block scale for quantized mins
-    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-};
-// 176 bytes / block
-#endif
-
-struct block_q6_K {
-    uint8_t ql[QK_K/2];      // quants, lower 4 bits
-    uint8_t qh[QK_K/4];      // quants, upper 2 bits
-    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
-    float16_t d;                  // super-block scale
-};
-// 210 bytes / block
-
-layout(local_size_x = 1) in;
-
-layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
-layout (binding = 1) readonly buffer tensorInB { int inB[]; };
-layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
-
-layout (push_constant) uniform parameter {
-    uint inAOff;
-    uint inBOff;
-    uint outOff;
-    int ne00;
-    int nb01;
-    int nb1;
-} pcs;
-
-#define UNALIGNED_INPUT inA
-
-block_q4_0 get_unaligned_block_q4_0(uint index) {
-    block_q4_0 fres;
-    fres.d = u8BufToFloat16(UNALIGNED_INPUT, index);
-    [[unroll]] for (uint it = 0; it != QK4_0 / 2; it++) {
-        fres.qs[it] = UNALIGNED_INPUT[index+2+it];
-    }
-    return fres;
-}
-
-void dequantize_row_q4_0(uint x /*Based from inA unaligned*/, uint y /*Based from out_*/, int k) {
-    const uint qk = QK4_0;
-
-    const uint nb = k / qk;
-
-    for (uint i = 0; i < nb; i++) {
-        const block_q4_0 block = get_unaligned_block_q4_0(x + i*sizeof_block_q4_0);
-
-        const float16_t d = block.d;
-
-        for (uint j = 0; j < qk/2; ++j) {
-            const int x0 = (block.qs[j] & 0x0F) - 8;
-            const int x1 = (block.qs[j] >>   4) - 8;
-
-            out_[y+i*qk + j + 0   ] = float(x0)*d;
-            out_[y+i*qk + j + qk/2] = float(x1)*d;
-        }
-    }
-}
-
-void main() {
-    const uint i = gl_WorkGroupID.x;
-    const int r = inB[i + pcs.inBOff];
-
-    dequantize_row_q4_0(uint(r*pcs.nb01) + pcs.inAOff, uint(i*pcs.nb1/4) + pcs.outOff, pcs.ne00);
-}
--- a/kompute/op_getrows_q4_1.comp
+++ b/kompute/op_getrows_q4_1.comp
@@ -1,181 +0,0 @@
-/**
- * Copyright (c) 2023 Nomic, Inc. All rights reserved.
- *
- * This software is licensed under the terms of the Software for Open Models License (SOM),
- * version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
- * this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
- */
-
-#version 450
-
-#extension GL_EXT_shader_16bit_storage: require
-#extension GL_EXT_shader_8bit_storage: require
-#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
-#extension GL_EXT_shader_explicit_arithmetic_types_int8: require
-#extension GL_EXT_shader_explicit_arithmetic_types_int16: require
-#extension GL_EXT_control_flow_attributes: enable
-
-#define QK4_0 32
-#define QR4_0 2
-#define QK4_1 32
-
-#define GELU_COEF_A 0.044715
-#define SQRT_2_OVER_PI 0.79788456080286535587989211986876
-
-#ifndef QK_K
-#define QK_K 256
-#endif
-
-#if QK_K == 256
-#define K_SCALE_SIZE 12
-#else
-#define K_SCALE_SIZE 4
-#endif
-
-#define BM 128
-#define BN 128
-#define BK 8
-#define TM 8
-#define TN 8
-
-#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
-#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
-#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx])
-#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx)
-
-#define sizeof_block_q4_0 0x12
-#define sizeof_block_q4_1 0x14
-struct block_q4_0 {
-    float16_t d;
-    uint8_t qs[QK4_0 / 2];
-};
-struct block_q4_1 {
-    float16_t d;
-    float16_t m;
-    uint8_t qs[QK4_1 / 2];
-};
-
-#ifndef QK_K
-#define QK_K 256
-#endif
-
-#if QK_K == 256
-#define K_SCALE_SIZE 12
-#else
-#define K_SCALE_SIZE 4
-#endif
-
-struct block_q2_K {
-    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
-    uint8_t qs[QK_K/4];      // quants
-    float16_t d;           // super-block scale for quantized scales
-    float16_t dmin;        // super-block scale for quantized mins
-};
-// 84 bytes / block
-
-struct block_q3_K {
-    uint8_t hmask[QK_K/8];     // quants - high bit
-    uint8_t qs[QK_K/4];        // quants - low 2 bits
-#if QK_K == 64
-    uint8_t scales[2];
-#else
-    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
-#endif
-    float16_t d;             // super-block scale
-};
-
-#if QK_K == 64
-typedef struct {
-    float16_t    d[2];          // super-block scales/mins
-    uint8_t scales[2];
-    uint8_t qs[QK_K/2];    // 4-bit quants
-} block_q4_K;
-#else
-struct block_q4_K {
-    float16_t d;             // super-block scale for quantized scales
-    float16_t dmin;          // super-block scale for quantized mins
-    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
-    uint8_t qs[QK_K/2];        // 4--bit quants
-};
-#endif
-
-#if QK_K == 64
-struct block_q5_K {
-    float16_t  d;                     // super-block scales/mins
-    int8_t  scales[QK_K/16];     // 8-bit block scales
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-};
-#else
-struct block_q5_K {
-    float16_t d;                      // super-block scale for quantized scales
-    float16_t dmin;                   // super-block scale for quantized mins
-    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
-    uint8_t qh[QK_K/8];          // quants, high bit
-    uint8_t qs[QK_K/2];          // quants, low 4 bits
-};
-// 176 bytes / block
-#endif
-
-struct block_q6_K {
-    uint8_t ql[QK_K/2];      // quants, lower 4 bits
-    uint8_t qh[QK_K/4];      // quants, upper 2 bits
-    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
-    float16_t d;                  // super-block scale
-};
-// 210 bytes / block
-
-layout(local_size_x = 1) in;
-
-layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
-layout (binding = 1) readonly buffer tensorInB { int inB[]; };
-layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
-
-layout (push_constant) uniform parameter {
-    uint inAOff;
-    uint inBOff;
-    uint outOff;
-    int ne00;
-    int nb01;
-    int nb1;
-} pcs;
-
-#define UNALIGNED_INPUT inA
-
-block_q4_1 get_unaligned_block_q4_1(uint index) {
-    block_q4_1 fres;
-    fres.d = u8BufToFloat16(UNALIGNED_INPUT, index);
-    fres.m = u8BufToFloat16(UNALIGNED_INPUT, index+2);
-    [[unroll]] for (uint it = 0; it != QK4_1 / 2; it++) {
-        fres.qs[it] = UNALIGNED_INPUT[index+4+it];
-    }
-    return fres;
-}
-
-void dequantize_row_q4_1(uint x /*Based from inA unaligned*/, uint y /*Based from out_*/, int k) {
-    const uint qk = QK4_1;
-
-    const uint nb = k / qk;
-
-    for (uint i = 0; i < nb; i++) {
-        const block_q4_1 block = get_unaligned_block_q4_1(x + i*sizeof_block_q4_0);
-
-        const float16_t d = block.d;
-        const float16_t m = block.m;
-
-        for (uint j = 0; j < qk/2; ++j) {
-            const int x0 = (block.qs[j] & 0x0F);
-            const int x1 = (block.qs[j] >>   4);
-
-            out_[y+i*qk + j + 0   ] = float(x0)*d + m;
-            out_[y+i*qk + j + qk/2] = float(x1)*d + m;
-        }
-    }
-}
-
-void main() {
-    const uint i = gl_WorkGroupID.x;
-    const int r = inB[i + pcs.inBOff];
-
-    dequantize_row_q4_1(uint(r*pcs.nb01) + pcs.inAOff, uint(i*pcs.nb1/4) + pcs.outOff, pcs.ne00);
-}
--- a/Show More
+++ b/Show More