ggml : use full range for Q4_0 and Q4_2 quantization (#729)

* Use full range for q4_0 quantization By keeping the sign of the highest magnitude, we can make sure the highest value maps to -8, which is currently unused. This is a bit of a freebie since it is fully backwards compatible with the current format. * Update quantize_row_q4_0 for AVX/AVX2 * Update quantize_row_q4_0 for WASM Untested * Update quantize_row_q4_0 for Arm NEON * Update quantize_row_q4_0 for PowerPC Untested * Use full range for q4_2 quantization
ggml : fix bug in ggml_compute_forward_sum_f32 (#1162)
2026-02-26 14:23:22 +02:00 · 2023-04-25 20:20:46 +03:00 · 2023-04-24 23:02:02 +02:00 · 2023-04-24 22:18:25 +03:00 · 2023-04-24 19:23:31 +03:00 · 2023-04-24 18:47:30 +03:00
62 changed files with 9277 additions and 3968 deletions
--- a/.devops/full.Dockerfile
+++ b/.devops/full.Dockerfile
@@ -5,9 +5,10 @@ FROM ubuntu:$UBUNTU_VERSION as build
 RUN apt-get update && \
    apt-get install -y build-essential python3 python3-pip

+COPY requirements.txt requirements.txt
+
 RUN pip install --upgrade pip setuptools wheel \
-    && pip install numpy requests sentencepiece tqdm \
-    && pip install torch --index-url https://download.pytorch.org/whl/cpu
+    && pip install -r requirements.txt

 WORKDIR /app

--- a/.devops/main.Dockerfile
+++ b/.devops/main.Dockerfile
@@ -15,4 +15,4 @@ FROM ubuntu:$UBUNTU_VERSION as runtime

 COPY --from=build /app/main /main

-ENTRYPOINT [ "/main" ]
+ENTRYPOINT [ "/main" ]
--- a/.dockerignore
+++ b/.dockerignore
@@ -21,4 +21,4 @@ models/*

 arm_neon.h
 compile_commands.json
-Dockerfile
+Dockerfile
--- a/.ecrc
+++ b/.ecrc
@@ -0,0 +1,5 @@
+{
+  "Disable": {
+    "IndentSize": true
+  }
+}
--- a/.editorconfig
+++ b/.editorconfig
@@ -0,0 +1,19 @@
+# https://EditorConfig.org
+
+# Top-most EditorConfig file
+root = true
+
+# Unix-style newlines with a newline ending every file, utf-8 charset
+[*]
+end_of_line = lf
+insert_final_newline = true
+trim_trailing_whitespace = true
+charset = utf-8
+indent_style = space
+indent_size = 4
+
+[Makefile]
+indent_style = tab
+
+[prompts/*.txt]
+insert_final_newline = unset
--- a/.github/ISSUE_TEMPLATE/custom.md
+++ b/.github/ISSUE_TEMPLATE/custom.md
@@ -22,9 +22,9 @@ Please provide a detailed written description of what you were trying to do, and

 # Current Behavior

-Please provide a detailed written description of what `llama.cpp` did, instead. 
+Please provide a detailed written description of what `llama.cpp` did, instead.

-# Environment and Context 
+# Environment and Context

 Please provide detailed information about your computer setup. This is important in case the issue is not reproducible except for under certain specific conditions.

@@ -133,7 +133,7 @@ llama_model_load: loading model part 8/8 from './models/65B/ggml-model-q4_0.bin.
 llama_model_load: .......................................................................................... done
 llama_model_load: model size =  4869.09 MB / num tensors = 723

-system_info: n_threads = 16 / 32 | AVX = 1 | AVX2 = 1 | AVX512 = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | VSX = 0 | 
+system_info: n_threads = 16 / 32 | AVX = 1 | AVX2 = 1 | AVX512 = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | VSX = 0 |

 main: prompt: 'Please close your issue when it has been answered.'
 main: number of tokens in prompt = 11
@@ -166,14 +166,14 @@ main:    total time = 246406.42 ms

 Performance counter stats for './main -m ./models/65B/ggml-model-q4_0.bin -t 16 -n 1024 -p Please close your issue when it has been answered.':

-        3636882.89 msec task-clock                #   14.677 CPUs utilized          
-             13509      context-switches          #    3.714 /sec                   
-              2436      cpu-migrations            #    0.670 /sec                   
-          10476679      page-faults               #    2.881 K/sec                  
+        3636882.89 msec task-clock                #   14.677 CPUs utilized
+             13509      context-switches          #    3.714 /sec
+              2436      cpu-migrations            #    0.670 /sec
+          10476679      page-faults               #    2.881 K/sec
    13133115082869      cycles                    #    3.611 GHz                      (16.77%)
       29314462753      stalled-cycles-frontend   #    0.22% frontend cycles idle     (16.76%)
    10294402631459      stalled-cycles-backend    #   78.39% backend cycles idle      (16.74%)
-    23479217109614      instructions              #    1.79  insn per cycle         
+    23479217109614      instructions              #    1.79  insn per cycle
                                                  #    0.44  stalled cycles per insn  (16.76%)
     2353072268027      branches                  #  647.002 M/sec                    (16.77%)
        1998682780      branch-misses             #    0.08% of all branches          (16.76%)
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -8,17 +8,19 @@ on:
        required: true
        type: boolean
  push:
+    branches:
+      - master
    paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.c', '**/*.cpp']
  pull_request:
-    types: [opened, synchronize, edited, reopened, review_requested, ready_for_review]
+    types: [opened, synchronize, reopened]
    paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.c', '**/*.cpp']

 env:
 BRANCH_NAME: ${{ github.head_ref || github.ref_name }}

 jobs:
-  ubuntu-latest-make:
-    runs-on: ubuntu-latest
+  ubuntu-focal-make:
+    runs-on: ubuntu-20.04

    steps:
      - name: Clone
@@ -29,12 +31,12 @@ jobs:
        id: depends
        run: |
          sudo apt-get update
-          sudo apt-get install build-essential
+          sudo apt-get install build-essential gcc-8

      - name: Build
        id: make_build
        run: |
-          make
+          CC=gcc-8 make

  ubuntu-latest-cmake:
    runs-on: ubuntu-latest
@@ -73,7 +75,6 @@ jobs:
      matrix:
        sanitizer: [ADDRESS, THREAD, UNDEFINED]
        build_type: [Debug, Release]
-        accelerate: [ON, OFF]

    steps:
      - name: Clone
@@ -91,7 +92,7 @@ jobs:
        run: |
          mkdir build
          cd build
-          cmake .. -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DLLAMA_ACCELERATE=${{ matrix.accelerate }}
+          cmake .. -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
          cmake --build . --config ${{ matrix.build_type }}

      - name: Test
@@ -156,7 +157,7 @@ jobs:
         - build: 'avx'
           defines: '-DLLAMA_AVX2=OFF'
         - build: 'avx512'
-           defines: '-DLLAMA_AVX512=ON'
+           defines: '-DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'

    steps:
      - name: Clone
@@ -215,7 +216,7 @@ jobs:
    runs-on: ubuntu-latest

    needs:
-      - ubuntu-latest-make
+      - ubuntu-focal-make
      - ubuntu-latest-cmake
      - macOS-latest-make
      - macOS-latest-cmake
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -18,6 +18,8 @@ on:
 jobs:
  push_to_registry:
    name: Push Docker image to Docker Hub
+    if: github.event.pull_request.draft == false
+
    runs-on: ubuntu-latest
    env:
      COMMIT_SHA: ${{ github.sha }}
@@ -60,4 +62,4 @@ jobs:
          push: ${{ github.event_name == 'push' }}
          platforms: linux/amd64,linux/arm64
          tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}"
-          file: ${{ matrix.config.dockerfile }}
+          file: ${{ matrix.config.dockerfile }}
--- a/.github/workflows/editorconfig.yml
+++ b/.github/workflows/editorconfig.yml
@@ -0,0 +1,17 @@
+name: EditorConfig Checker
+
+on:
+  push:
+    branches:
+      - master
+  pull_request:
+    branches:
+      - master
+
+jobs:
+  editorconfig:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - uses: editorconfig-checker/action-editorconfig-checker@main
+      - run: editorconfig-checker
--- a/.gitignore
+++ b/.gitignore
@@ -1,11 +1,15 @@
 *.o
 *.a
+.DS_Store
+.build/
 .cache/
+.direnv/
+.envrc
+.swiftpm
+.venv
 .vs/
 .vscode/
-.DS_Store

-.build/
 build/
 build-em/
 build-debug/
@@ -19,20 +23,20 @@ models/*

 /main
 /quantize
+/quantize-stats
 /result
 /perplexity
 /embedding
+/benchmark-q4_0-matmult
+/vdot
 /Pipfile

 arm_neon.h
 compile_commands.json

-.envrc
-.direnv/
-
-.venv
 __pycache__
-.swiftpm

 zig-out/
 zig-cache/
+
+ppl-*.txt
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -55,11 +55,18 @@ option(LLAMA_SANITIZE_UNDEFINED     "llama: enable undefined sanitizer"
 option(LLAMA_AVX                    "llama: enable AVX"                                     ON)
 option(LLAMA_AVX2                   "llama: enable AVX2"                                    ON)
 option(LLAMA_AVX512                 "llama: enable AVX512"                                  OFF)
+option(LLAMA_AVX512_VBMI            "llama: enable AVX512-VBMI"                             OFF)
+option(LLAMA_AVX512_VNNI            "llama: enable AVX512-VNNI"                             OFF)
 option(LLAMA_FMA                    "llama: enable FMA"                                     ON)
+# in MSVC F16C is implied with AVX2/AVX512
+if (NOT MSVC)
+    option(LLAMA_F16C               "llama: enable F16C"                                    ON)
+endif()

 # 3rd party libs
 option(LLAMA_ACCELERATE             "llama: enable Accelerate framework"                    ON)
 option(LLAMA_OPENBLAS               "llama: use OpenBLAS"                                   OFF)
+option(LLAMA_CUBLAS                 "llama: use cuBLAS"                                     OFF)

 option(LLAMA_BUILD_TESTS            "llama: build tests"    ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_EXAMPLES         "llama: build examples" ${LLAMA_STANDALONE})
@@ -103,6 +110,7 @@ if (APPLE AND LLAMA_ACCELERATE)
        message(WARNING "Accelerate framework not found")
    endif()
 endif()
+
 if (LLAMA_OPENBLAS)
    if (LLAMA_STATIC)
        set(BLA_STATIC ON)
@@ -115,11 +123,51 @@ if (LLAMA_OPENBLAS)

        add_compile_definitions(GGML_USE_OPENBLAS)
        add_link_options(${BLAS_LIBRARIES})
+        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} openblas)
+
+        # find header file
+        set(OPENBLAS_INCLUDE_SEARCH_PATHS
+            /usr/include
+            /usr/include/openblas
+            /usr/include/openblas-base
+            /usr/local/include
+            /usr/local/include/openblas
+            /usr/local/include/openblas-base
+            /opt/OpenBLAS/include
+            $ENV{OpenBLAS_HOME}
+            $ENV{OpenBLAS_HOME}/include
+            )
+        find_path(OPENBLAS_INC NAMES cblas.h PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS})
+        add_compile_options(-I${OPENBLAS_INC})
    else()
        message(WARNING "OpenBLAS not found")
    endif()
 endif()

+if (LLAMA_CUBLAS)
+    cmake_minimum_required(VERSION 3.17)
+
+    find_package(CUDAToolkit)
+    if (CUDAToolkit_FOUND)
+        message(STATUS "cuBLAS found")
+
+        enable_language(CUDA)
+
+        set(GGML_CUDA_SOURCES ggml-cuda.cu ggml-cuda.h)
+
+        add_compile_definitions(GGML_USE_CUBLAS)
+
+        if (LLAMA_STATIC)
+            set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
+        else()
+            set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
+        endif()
+
+    else()
+        message(WARNING "cuBLAS not found")
+    endif()
+endif()
+
 if (LLAMA_ALL_WARNINGS)
    if (NOT MSVC)
        set(c_flags
@@ -131,7 +179,6 @@ if (LLAMA_ALL_WARNINGS)
            -Wshadow
            -Wstrict-prototypes
            -Wpointer-arith
-            -Wno-unused-function
        )
        set(cxx_flags
            -Wall
@@ -139,6 +186,7 @@ if (LLAMA_ALL_WARNINGS)
            -Wpedantic
            -Wcast-qual
            -Wno-unused-function
+            -Wno-multichar
        )
    else()
        # todo : msvc
@@ -151,6 +199,14 @@ if (LLAMA_ALL_WARNINGS)

 endif()

+if (MSVC)
+    add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
+
+    if (BUILD_SHARED_LIBS)
+        set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
+    endif()
+endif()
+
 if (LLAMA_LTO)
    include(CheckIPOSupported)
    check_ipo_supported(RESULT result OUTPUT output)
@@ -194,14 +250,31 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$")
    message(STATUS "x86 detected")
    if (MSVC)
        if (LLAMA_AVX512)
-            add_compile_options(/arch:AVX512)
+            add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX512>)
+            add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX512>)
+            # MSVC has no compile-time flags for enabling specific
+            # AVX512 extensions, nor does it define the
+            # macros corresponding to those extensions.
+            # Define them manually.
+            if (LLAMA_AVX512_VBMI)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VBMI__>)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VBMI__>)
+            endif()
+            if (LLAMA_AVX512_VNNI)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
+                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
+            endif()
        elseif (LLAMA_AVX2)
-            add_compile_options(/arch:AVX2)
+            add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX2>)
+            add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX2>)
        elseif (LLAMA_AVX)
-            add_compile_options(/arch:AVX)
+            add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX>)
+            add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX>)
        endif()
    else()
-        add_compile_options(-mf16c)
+        if (LLAMA_F16C)
+            add_compile_options(-mf16c)
+        endif()
        if (LLAMA_FMA)
            add_compile_options(-mfma)
        endif()
@@ -213,9 +286,13 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$")
        endif()
        if (LLAMA_AVX512)
            add_compile_options(-mavx512f)
-            # add_compile_options(-mavx512cd)
-            # add_compile_options(-mavx512dq)
-            # add_compile_options(-mavx512bw)
+            add_compile_options(-mavx512bw)
+        endif()
+        if (LLAMA_AVX512_VBMI)
+            add_compile_options(-mavx512vbmi)
+        endif()
+        if (LLAMA_AVX512_VNNI)
+            add_compile_options(-mavx512vnni)
        endif()
    endif()
 else()
@@ -229,27 +306,39 @@ endif()

 add_library(ggml OBJECT
            ggml.c
-            ggml.h)
+            ggml.h
+            ${GGML_CUDA_SOURCES})

 target_include_directories(ggml PUBLIC .)
 target_compile_features(ggml PUBLIC c_std_11) # don't bump
-target_link_libraries(ggml PRIVATE Threads::Threads ${LLAMA_EXTRA_LIBS})
+target_link_libraries(ggml PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
+
 if (BUILD_SHARED_LIBS)
    set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON)
 endif()

 add_library(llama
            llama.cpp
-            llama.h)
+            llama.h
+            llama_util.h)

 target_include_directories(llama PUBLIC .)
 target_compile_features(llama PUBLIC cxx_std_11) # don't bump
 target_link_libraries(llama PRIVATE ggml ${LLAMA_EXTRA_LIBS})
+
 if (BUILD_SHARED_LIBS)
    set_target_properties(llama PROPERTIES POSITION_INDEPENDENT_CODE ON)
    target_compile_definitions(llama PRIVATE LLAMA_SHARED LLAMA_BUILD)
 endif()

+if (GGML_CUDA_SOURCES)
+    message(STATUS "GGML CUDA sources found, configuring CUDA architecture")
+    set_property(TARGET ggml PROPERTY CUDA_ARCHITECTURES OFF)
+    set_property(TARGET ggml PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto")
+    set_property(TARGET llama PROPERTY CUDA_ARCHITECTURES OFF)
+endif()
+
+
 #
 # programs, examples and tests
 #
@@ -261,4 +350,5 @@ endif ()

 if (LLAMA_BUILD_EXAMPLES)
    add_subdirectory(examples)
+    add_subdirectory(pocs)
 endif()
--- a/Makefile
+++ b/Makefile
@@ -1,3 +1,6 @@
+# Define the default target now so that it is always the first target
+default: main quantize quantize-stats perplexity embedding vdot
+
 ifndef UNAME_S
 UNAME_S := $(shell uname -s)
 endif
@@ -36,8 +39,8 @@ CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
 LDFLAGS  =

 # warnings
-CFLAGS   += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith -Wno-unused-function
-CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function
+CFLAGS   += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith
+CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar

 # OS specific
 # TODO: support Windows
@@ -71,13 +74,17 @@ endif
 #       feel free to update the Makefile for your architecture and send a pull request or issue
 ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))
 	# Use all CPU extensions that are available:
-	CFLAGS += -march=native -mtune=native
+	CFLAGS   += -march=native -mtune=native
 	CXXFLAGS += -march=native -mtune=native
+
+	# Use AVX only
+	#CFLAGS   += -mfma -mf16c -mavx
+	#CXXFLAGS += -mfma -mf16c -mavx
 endif
 ifneq ($(filter ppc64%,$(UNAME_M)),)
 	POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)
 	ifneq (,$(findstring POWER9,$(POWER9_M)))
-		CFLAGS += -mcpu=power9
+		CFLAGS   += -mcpu=power9
 		CXXFLAGS += -mcpu=power9
 	endif
 	# Require c++23's std::byteswap for big-endian support.
@@ -97,12 +104,25 @@ ifdef LLAMA_OPENBLAS
 	CFLAGS  += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas
 	LDFLAGS += -lopenblas
 endif
+ifdef LLAMA_CUBLAS
+	CFLAGS    += -DGGML_USE_CUBLAS -I/usr/local/cuda/include
+	LDFLAGS   += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64
+	OBJS      += ggml-cuda.o
+	NVCC      = nvcc
+	NVCCFLAGS = --forward-unknown-to-host-compiler -arch=native
+ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
+	$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
+endif
 ifdef LLAMA_GPROF
 	CFLAGS   += -pg
 	CXXFLAGS += -pg
 endif
+ifdef LLAMA_PERF
+	CFLAGS   += -DGGML_PERF
+	CXXFLAGS += -DGGML_PERF
+endif
 ifneq ($(filter aarch64%,$(UNAME_M)),)
-	CFLAGS += -mcpu=native
+	CFLAGS   += -mcpu=native
 	CXXFLAGS += -mcpu=native
 endif
 ifneq ($(filter armv6%,$(UNAME_M)),)
@@ -133,43 +153,54 @@ $(info I CC:       $(CCV))
 $(info I CXX:      $(CXXV))
 $(info )

-default: main quantize perplexity embedding
-
 #
 # Build library
 #

 ggml.o: ggml.c ggml.h
-	$(CC)  $(CFLAGS)   -c ggml.c -o ggml.o
+	$(CC)  $(CFLAGS)   -c $< -o $@

-llama.o: llama.cpp llama.h
-	$(CXX) $(CXXFLAGS) -c llama.cpp -o llama.o
+llama.o: llama.cpp ggml.h llama.h llama_util.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@

 common.o: examples/common.cpp examples/common.h
-	$(CXX) $(CXXFLAGS) -c examples/common.cpp -o common.o
+	$(CXX) $(CXXFLAGS) -c $< -o $@

 clean:
-	rm -vf *.o main quantize perplexity embedding
+	rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-q4_0-matmult

-main: examples/main/main.cpp ggml.o llama.o common.o
-	$(CXX) $(CXXFLAGS) examples/main/main.cpp ggml.o llama.o common.o -o main $(LDFLAGS)
+main: examples/main/main.cpp ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 	@echo
 	@echo '====  Run ./main -h for help.  ===='
 	@echo

-quantize: examples/quantize/quantize.cpp ggml.o llama.o
-	$(CXX) $(CXXFLAGS) examples/quantize/quantize.cpp ggml.o llama.o -o quantize $(LDFLAGS)
+quantize: examples/quantize/quantize.cpp ggml.o llama.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)

-perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o
-	$(CXX) $(CXXFLAGS) examples/perplexity/perplexity.cpp ggml.o llama.o common.o -o perplexity $(LDFLAGS)
+quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)

-embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o
-	$(CXX) $(CXXFLAGS) examples/embedding/embedding.cpp ggml.o llama.o common.o -o embedding $(LDFLAGS)
+perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
+
+embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
+
+vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
+
+libllama.so: llama.o ggml.o $(OBJS)
+	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)

 #
 # Tests
 #

+benchmark: examples/benchmark/benchmark-q4_0-matmult.c ggml.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $^ -o benchmark-q4_0-matmult $(LDFLAGS)
+	./benchmark-q4_0-matmult
+
 .PHONY: tests
 tests:
 	bash ./tests/run-tests.sh
--- a/README.md
+++ b/README.md
@@ -7,13 +7,19 @@

 Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++

+**Warnings**
+
+- `Q4_2` and `Q4_3` are still in development. Do not expect any kind of backward compatibility until they are finalized.
+
 **Hot topics:**

+- [Added LoRA support](https://github.com/ggerganov/llama.cpp/pull/820)
+- [Add GPU support to ggml](https://github.com/ggerganov/llama.cpp/discussions/915)
 - [Roadmap Apr 2023](https://github.com/ggerganov/llama.cpp/discussions/784)

 ## Description

-The main goal is to run the model using 4-bit quantization on a MacBook
+The main goal of llama.cpp is to run the llama model using 4-bit quantization on a MacBook.

 - Plain C/C++ implementation without dependencies
 - Apple silicon first-class citizen - optimized via ARM NEON and Accelerate framework
@@ -42,11 +48,14 @@ New features will probably be added mostly through community contributions.
 - [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca)
 - [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne)
 - [X] [Vicuna](https://github.com/ggerganov/llama.cpp/discussions/643#discussioncomment-5533894)
+- [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/)

 **Bindings:**

 - Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
 - Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
+- Node.js: [hlhr202/llama-node](https://github.com/hlhr202/llama-node)
+- Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)

 **UI:**

@@ -147,30 +156,52 @@ https://user-images.githubusercontent.com/1991296/224442907-7693d4be-acaa-4e01-8

 ## Usage

-Here are the step for the LLaMA-7B model:
+Here are the steps for the LLaMA-7B model.
+
+### Get the Code

 ```bash
-# build this repo
 git clone https://github.com/ggerganov/llama.cpp
 cd llama.cpp
-make
+```

-#For Windows and CMake, use the following command instead:
-cd <path_to_llama_folder>
-mkdir build
-cd build
-cmake ..
-cmake --build . --config Release
+### Build

+Note: For Windows, CMake or Zig can be used.
+
+1. Use `make`
+
+    ```bash
+    make
+    ```
+
+1. Use CMake
+
+    ```bash
+    mkdir build
+    cd build
+    cmake ..
+    cmake --build . --config Release
+    ```
+
+1. Use Zig
+
+    ```bash
+    zig build -Drelease-fast
+    ```
+
+### Prepare Data & Run
+
+```bash
 # obtain the original LLaMA model weights and place them in ./models
 ls ./models
 65B 30B 13B 7B tokenizer_checklist.chk tokenizer.model

 # install Python dependencies
-python3 -m pip install torch numpy sentencepiece
+python3 -m pip install -r requirements.txt

 # convert the 7B model to ggml FP16 format
-python3 convert-pth-to-ggml.py models/7B/ 1
+python3 convert.py models/7B/

 # quantize the model to 4-bits (using method 2 = q4_0)
 ./quantize ./models/7B/ggml-model-f16.bin ./models/7B/ggml-model-q4_0.bin 2
@@ -179,14 +210,11 @@ python3 convert-pth-to-ggml.py models/7B/ 1
 ./main -m ./models/7B/ggml-model-q4_0.bin -n 128
 ```

-Currently, it's best to use Python 3.9 or Python 3.10, as `sentencepiece` has not yet published a wheel for Python 3.11.
-
 When running the larger models, make sure you have enough disk space to store all the intermediate files.

 ### Memory/Disk Requirements

-As the models are currently fully loaded into memory, you will need adequate disk space to save them
-and sufficient RAM to load them. At the moment, memory and disk requirements are the same.
+As the models are currently fully loaded into memory, you will need adequate disk space to save them and sufficient RAM to load them. At the moment, memory and disk requirements are the same.

 | model | original size | quantized size (4-bit) |
 |-------|---------------|------------------------|
@@ -198,22 +226,22 @@ and sufficient RAM to load them. At the moment, memory and disk requirements are
 ### Interactive mode

 If you want a more ChatGPT-like experience, you can run in interactive mode by passing `-i` as a parameter.
-In this mode, you can always interrupt generation by pressing Ctrl+C and enter one or more lines of text which will be converted into tokens and appended to the current context. You can also specify a *reverse prompt* with the parameter `-r "reverse prompt string"`. This will result in user input being prompted whenever the exact tokens of the reverse prompt string are encountered in the generation. A typical use is to use a prompt which makes LLaMa emulate a chat between multiple users, say Alice and Bob, and pass `-r "Alice:"`.
+In this mode, you can always interrupt generation by pressing Ctrl+C and entering one or more lines of text, which will be converted into tokens and appended to the current context. You can also specify a *reverse prompt* with the parameter `-r "reverse prompt string"`. This will result in user input being prompted whenever the exact tokens of the reverse prompt string are encountered in the generation. A typical use is to use a prompt that makes LLaMa emulate a chat between multiple users, say Alice and Bob, and pass `-r "Alice:"`.

-Here is an example few-shot interaction, invoked with the command
+Here is an example of a few-shot interaction, invoked with the command

 ```bash
-# default arguments using 7B model
+# default arguments using a 7B model
 ./examples/chat.sh

-# advanced chat with 13B model
+# advanced chat with a 13B model
 ./examples/chat-13B.sh

-# custom arguments using 13B model
+# custom arguments using a 13B model
 ./main -m ./models/13B/ggml-model-q4_0.bin -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
 ```

-Note the use of `--color` to distinguish between user input and generated text.
+Note the use of `--color` to distinguish between user input and generated text. Other parameters are explained in more detail in the [README](examples/main/README.md) for the `main` example program.

 ![image](https://user-images.githubusercontent.com/1991296/224575029-2af3c7dc-5a65-4f64-a6bb-517a532aea38.png)

@@ -242,30 +270,31 @@ There 26 letters in the English Alphabet
 The majority (54%) are using public transit. This includes buses, trams and metros with over 100 lines throughout the city which make it very accessible for tourists to navigate around town as well as locals who commute by tram or metro on a daily basis
 > List 5 words that start with "ca".
 cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach.
-> 
+>
 ```

 ### Using [GPT4All](https://github.com/nomic-ai/gpt4all)

- Obtain the `gpt4all-lora-quantized.bin` model
+- Obtain the `tokenizer.model` file from the LLaMA model and put it in `models`
+- Obtain the `added_tokens.json` file from the Alpaca model and put it in `models`
+- Obtain the `gpt4all-lora-quantized.bin` file from the GPT4All model and put it in `models/gpt4all-7B`
 - It is distributed in the old `ggml` format which is now obsoleted
- You have to convert it to the new format using [./convert-gpt4all-to-ggml.py](./convert-gpt4all-to-ggml.py). You may also need to
-convert the model from the old format to the new format with [./migrate-ggml-2023-03-30-pr613.py](./migrate-ggml-2023-03-30-pr613.py):
+- You have to convert it to the new format using `convert.py`:

-  ```bash
-  python3 convert-gpt4all-to-ggml.py models/gpt4all-7B/gpt4all-lora-quantized.bin ./models/tokenizer.model 
-  python3 migrate-ggml-2023-03-30-pr613.py models/gpt4all-7B/gpt4all-lora-quantized.bin models/gpt4all-7B/gpt4all-lora-quantized-new.bin
-  ```
-  
- You can now use the newly generated `gpt4all-lora-quantized-new.bin` model in exactly the same way as all other models
- The original model is saved in the same folder with a suffix `.orig`
+```bash
+python3 convert.py models/gpt4all-7B/gpt4all-lora-quantized.bin
+```
+
+- You can now use the newly generated `models/gpt4all-7B/ggml-model-q4_0.bin` model in exactly the same way as all other models
+
+- The newer GPT4All-J model is not yet supported!

 ### Obtaining and verifying the Facebook LLaMA original model and Stanford Alpaca model data

- **Under no circumstances share IPFS, magnet links, or any other links to model downloads anywhere in this respository, including in issues, discussions or pull requests. They will be immediately deleted.**
- The LLaMA models are officially distributed by Facebook and will **never** be provided through this repository. 
+- **Under no circumstances should IPFS, magnet links, or any other links to model downloads be shared anywhere in this repository, including in issues, discussions, or pull requests. They will be immediately deleted.**
+- The LLaMA models are officially distributed by Facebook and will **never** be provided through this repository.
 - Refer to [Facebook's LLaMA repository](https://github.com/facebookresearch/llama/pull/73/files) if you need to request access to the model data.
- Please verify the sha256 checksums of all downloaded model files to confirm that you have the correct model data files before creating an issue relating to your model files.
+- Please verify the [sha256 checksums](SHA256SUMS) of all downloaded model files to confirm that you have the correct model data files before creating an issue relating to your model files.
 - The following command will verify if you have all possible latest files in your self-installed `./models` subdirectory:

  `sha256sum --ignore-missing -c SHA256SUMS` on Linux
@@ -274,29 +303,27 @@ convert the model from the old format to the new format with [./migrate-ggml-202

  `shasum -a 256 --ignore-missing -c SHA256SUMS` on macOS

- If your issue is with model generation quality then please at least scan the following links and papers to understand the limitations of LLaMA models. This is especially important when choosing an appropriate model size and appreciating both the significant and subtle differences between LLaMA models and ChatGPT:
-  - LLaMA:
-    - [Introducing LLaMA: A foundational, 65-billion-parameter large language model](https://ai.facebook.com/blog/large-language-model-llama-meta-ai/)
-    - [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971)
-  - GPT-3
-    - [Language Models are Few-Shot Learners](https://arxiv.org/abs/2005.14165)
-  - GPT-3.5 / InstructGPT / ChatGPT:
-    - [Aligning language models to follow instructions](https://openai.com/research/instruction-following)
-    - [Training language models to follow instructions with human feedback](https://arxiv.org/abs/2203.02155)
-    
-### Perplexity (Measuring model quality)
+- If your issue is with model generation quality, then please at least scan the following links and papers to understand the limitations of LLaMA models. This is especially important when choosing an appropriate model size and appreciating both the significant and subtle differences between LLaMA models and ChatGPT:
+    - LLaMA:
+        - [Introducing LLaMA: A foundational, 65-billion-parameter large language model](https://ai.facebook.com/blog/large-language-model-llama-meta-ai/)
+        - [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971)
+    - GPT-3:
+        - [Language Models are Few-Shot Learners](https://arxiv.org/abs/2005.14165)
+    - GPT-3.5 / InstructGPT / ChatGPT:
+        - [Aligning language models to follow instructions](https://openai.com/research/instruction-following)
+        - [Training language models to follow instructions with human feedback](https://arxiv.org/abs/2203.02155)

-You can use the `perplexity` example to measure perplexity over the given prompt.  For more background,
-see https://huggingface.co/docs/transformers/perplexity.  However, in general, lower perplexity is better for LLMs.
+### Perplexity (measuring model quality)
+
+You can use the `perplexity` example to measure perplexity over the given prompt. For more background, see [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity). However, in general, lower perplexity is better for LLMs.

 #### Latest measurements

-The latest perplexity scores for the various model sizes and quantizations are being tracked in [discussion #406](https://github.com/ggerganov/llama.cpp/discussions/406).  `llama.cpp` is measuring very well
-compared to the baseline implementations.  Quantization has a small negative impact to quality, but, as you can see, running
+The latest perplexity scores for the various model sizes and quantizations are being tracked in [discussion #406](https://github.com/ggerganov/llama.cpp/discussions/406). `llama.cpp` is measuring very well compared to the baseline implementations. Quantization has a small negative impact on quality, but, as you can see, running
 13B at q4_0 beats the 7B f16 model by a significant amount.

-All measurements are done against wikitext2 test dataset (https://paperswithcode.com/dataset/wikitext-2), with default options (512 length context).
-Note that the changing the context length will have a significant impact on perplexity (longer context = better perplexity).
+All measurements are done against the wikitext2 test dataset (https://paperswithcode.com/dataset/wikitext-2), with default options (512 length context).
+Note that changing the context length will have a significant impact on perplexity (longer context = better perplexity).
 ```
 Perplexity - model options
 5.5985 - 13B, q4_0
@@ -338,7 +365,7 @@ https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b0

 #### Prerequisites
 * Docker must be installed and running on your system.
-* Create a folder to store big models & intermediate files (in ex. im using /llama/models)
+* Create a folder to store big models & intermediate files (e.g. /llama/models)

 #### Images
 We have two Docker images available for this project:
@@ -352,17 +379,17 @@ The easiest way to download the models, convert them to ggml and optimize them i

 Replace `/path/to/models` below with the actual path where you downloaded the models.

- ```bash
+```bash
 docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-one "/models/" 7B
 ```

-On complete, you are ready to play!
+On completion, you are ready to play!

 ```bash
 docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
 ```

-or with light image:
+or with the light image:

 ```bash
 docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
@@ -383,7 +410,7 @@ docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /mode
 - Always consider cross-compatibility with other operating systems and architectures
 - Avoid fancy looking modern STL constructs, use basic `for` loops, avoid templates, keep it simple
 - There are no strict rules for the code style, but try to follow the patterns in the code (indentation, spaces, etc.). Vertical alignment makes things more readable and easier to batch edit
- Clean-up any trailing whitespaces, use 4 spaces indentation, brackets on same line, `void * ptr`, `int & a`
+- Clean up any trailing whitespace, use 4 spaces for indentation, brackets on the same line, `void * ptr`, `int & a`
 - See [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions

 ### Docs
--- a/20
+++ b/20
@@ -1,12 +1,27 @@
 700df0d3013b703a806d2ae7f1bfb8e59814e3d06ae78be0c66368a50059f33d  models/7B/consolidated.00.pth
+666a4bb533b303bdaf89e1b6a3b6f93535d868de31d903afdc20983dc526c847  models/7B/ggml-model-f16.bin
+fcb7664c2e69776920b526362a243e912f73c36b1ec892eb354bab940f5edb5a  models/7B/ggml-model-q4_0.bin
+cc061458339a3eb8bcecbf0a825e9924fb7d1a8150f63cd5d091caa99215aafe  models/7B/ggml-model-q4_1.bin
+1bc7484c24a87612726d756f1761890e7acf5f412e23378577ce50fbe789b5b8  models/7B/ggml-model-q4_2.bin
+3429bf198ec771886cf81a574df45245f3ebf04f0ce0956b73ef5d0ab01ff48b  models/7B/ggml-model-q4_3.bin
 7e89e242ddc0dd6f060b43ca219ce8b3e8f08959a72cb3c0855df8bb04d46265  models/7B/params.json
 745bf4e29a4dd6f411e72976d92b452da1b49168a4f41c951cfcc8051823cf08  models/13B/consolidated.00.pth
 d5ccbcc465c71c0de439a5aeffebe8344c68a519bce70bc7f9f92654ee567085  models/13B/consolidated.01.pth
+2b206e9b21fb1076f11cafc624e2af97c9e48ea09312a0962153acc20d45f808  models/13B/ggml-model-f16.bin
+4b69e4d6b6e3275230955997b90407fceca7e5ab3daf2e63a2c9e7270a8e1e3e  models/13B/ggml-model-q4_0.bin
+d9581b5b88e5622532fe897c9f9b0e67a317d22dd27a6f90fa4ab8c6d23ccdbb  models/13B/ggml-model-q4_1.bin
+8d55a2077317ec9a928c7851d6a43e08e51f7e9e08360f2a7a7e1deefea3134f  models/13B/ggml-model-q4_2.bin
+4208cdec9788ffa48dc1a17af2c36a0299f5bf3eb0e2b87889dda7fad591fca3  models/13B/ggml-model-q4_3.bin
 4ab77bec4d4405ccb66a97b282574c89a94417e3c32e5f68f37e2876fc21322f  models/13B/params.json
 e23294a58552d8cdec5b7e8abb87993b97ea6eced4178ff2697c02472539d067  models/30B/consolidated.00.pth
 4e077b7136c7ae2302e954860cf64930458d3076fcde9443f4d0e939e95903ff  models/30B/consolidated.01.pth
 24a87f01028cbd3a12de551dcedb712346c0b5cbdeff1454e0ddf2df9b675378  models/30B/consolidated.02.pth
 1adfcef71420886119544949767f6a56cb6339b4d5fcde755d80fe68b49de93b  models/30B/consolidated.03.pth
+7e1b524061a9f4b27c22a12d6d2a5bf13b8ebbea73e99f218809351ed9cf7d37  models/30B/ggml-model-f16.bin
+7a679908ce31c9d6ae2e38d6059bcd4d0ad3a870cd58cc1c8f7b36f2b2f51c73  models/30B/ggml-model-q4_0.bin
+7b75ac615fa369ee593493a7e6ef87542bf0350255db928b22c5a24f6d598bcd  models/30B/ggml-model-q4_1.bin
+2c82b4954a94a6a284f452f6011c1e4f0d20362c194a0b1eb5737f5fd8a20fb3  models/30B/ggml-model-q4_2.bin
+a6188660199dbcb8d5658abe7d89169869e50423494385830d9e6b330ea7fc33  models/30B/ggml-model-q4_3.bin
 2c07118ea98d69dbe7810d88520e30288fa994751b337f8fca02b171955f44cb  models/30B/params.json
 135c563f6b3938114458183afb01adc9a63bef3d8ff7cccc3977e5d3664ecafe  models/65B/consolidated.00.pth
 9a600b37b19d38c7e43809485f70d17d1dc12206c07efa83bc72bb498a568bde  models/65B/consolidated.01.pth
@@ -16,5 +31,10 @@ e7babf7c5606f165a3756f527cb0fedc4f83e67ef1290391e52fb1cce5f26770  models/65B/con
 a287c0dfe49081626567c7fe87f74cce5831f58e459b427b5e05567641f47b78  models/65B/consolidated.05.pth
 72b4eba67a1a3b18cb67a85b70f8f1640caae9b40033ea943fb166bd80a7b36b  models/65B/consolidated.06.pth
 d27f5b0677d7ff129ceacd73fd461c4d06910ad7787cf217b249948c3f3bc638  models/65B/consolidated.07.pth
+60758f2384d74e423dffddfd020ffed9d3bb186ebc54506f9c4a787d0f5367b0  models/65B/ggml-model-f16.bin
+c671fe1bce71499ac732ec999770ebe53ac486623a7891e42c9dfdb6962d2c64  models/65B/ggml-model-q4_0.bin
+4743a28aac3e5f32a6e838a815f51d3779de44fbbe251d745251e66c23c5950f  models/65B/ggml-model-q4_1.bin
+4a145a210c56982389b1ed34387e0590c3e0d7325fa9be4f2284fe4d244a3633  models/65B/ggml-model-q4_2.bin
+305e91a4608b4f627b9b8ad5b4af75187d2684254bfd76dcb9db571618ef293c  models/65B/ggml-model-q4_3.bin
 999ed1659b469ccc2a941714c0a9656fa571d17c9f7c8c7589817ca90edef51b  models/65B/params.json
 9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347  models/tokenizer.model
--- a/build.zig
+++ b/build.zig
@@ -1,14 +1,14 @@
 const std = @import("std");

-pub fn build(b: *std.Build) void {
+pub fn build(b: *std.build.Builder) void {
    const target = b.standardTargetOptions(.{});
-    const optimize = b.standardOptimizeOption(.{});
+    const optimize = b.standardReleaseOptions();
+    const want_lto = b.option(bool, "lto", "Want -fLTO");

-    const lib = b.addStaticLibrary(.{
-        .name = "llama",
-        .target = target,
-        .optimize = optimize,
-    });
+    const lib = b.addStaticLibrary("llama", null);
+    lib.want_lto = want_lto;
+    lib.setTarget(target);
+    lib.setBuildMode(optimize);
    lib.linkLibCpp();
    lib.addIncludePath(".");
    lib.addIncludePath("examples");
@@ -17,11 +17,11 @@ pub fn build(b: *std.Build) void {
    }, &.{"-std=c11"});
    lib.addCSourceFiles(&.{
        "llama.cpp",
-        "examples/common.cpp",
    }, &.{"-std=c++11"});
    lib.install();

-    const build_args = .{ .b = b, .lib = lib, .target = target, .optimize = optimize };
+    const build_args = .{ .b = b, .lib = lib, .target = target, .optimize = optimize, .want_lto = want_lto };
+
    const exe = build_example("main", build_args);
    _ = build_example("quantize", build_args);
    _ = build_example("perplexity", build_args);
@@ -42,18 +42,17 @@ pub fn build(b: *std.Build) void {
 fn build_example(comptime name: []const u8, args: anytype) *std.build.LibExeObjStep {
    const b = args.b;
    const lib = args.lib;
-    const target = args.target;
-    const optimize = args.optimize;
+    const want_lto = args.want_lto;

-    const exe = b.addExecutable(.{
-        .name = name,
-        .target = target,
-        .optimize = optimize,
-    });
+    const exe = b.addExecutable(name, null);
+    exe.want_lto = want_lto;
+    lib.setTarget(args.target);
+    lib.setBuildMode(args.optimize);
    exe.addIncludePath(".");
    exe.addIncludePath("examples");
    exe.addCSourceFiles(&.{
        std.fmt.comptimePrint("examples/{s}/{s}.cpp", .{name, name}),
+        "examples/common.cpp",
    }, &.{"-std=c++11"});
    exe.linkLibrary(lib);
    exe.install();
--- a/convert-ggml-to-pth.py
+++ b/convert-ggml-to-pth.py
@@ -1,299 +0,0 @@
-# Author: github.com/ductai199x
-import argparse
-import os
-import struct
-
-import numpy as np
-import torch
-from numba import njit
-from tqdm.auto import tqdm
-
-
-def read_header(fin):
-    values = struct.unpack("i" * 9, fin.read(4 * 9))
-    _, _, vocab_size, dim, multiple_of, n_heads, n_layers, rot, ftype = values
-    return {
-        "vocab_size": vocab_size,
-        "dim": dim,
-        "multiple_of": multiple_of,
-        "n_heads": n_heads,
-        "n_layers": n_layers,
-    }, ftype
-
-
-def read_tokens(fin, vocab_size):
-    tokens = []
-    for _ in range(vocab_size):
-        text_len = struct.unpack("i", fin.read(4))[0]
-        text_bytes = fin.read(text_len)
-        try:
-            text = text_bytes.decode()
-        except UnicodeDecodeError:
-            text = text_bytes.decode(errors="replace")
-        score = struct.unpack("f", fin.read(4))[0]
-        tokens.append((text, score))
-    return tokens
-
-
-@njit
-def dequantize_weights_numba(fin_data, n_rows, n_cols):
-    qk = 32
-    nb = n_cols // qk
-    bs = 4 + (qk // 2)
-
-    weights = np.zeros((n_rows, n_cols), dtype=np.float32)
-    data_pos = 0
-
-    for row in range(n_rows):
-        for block in range(nb):
-            d = np.frombuffer(fin_data[data_pos : data_pos + 4], dtype=np.float32)[0]
-            data_pos += 4
-            packed_values = fin_data[data_pos : data_pos + (qk // 2)]
-            data_pos += qk // 2
-
-            for i in range(qk // 2):
-                packed_value = packed_values[i]
-                v0 = np.float32((packed_value & 0b00001111) - 8) * d
-                v1 = np.float32((packed_value >> 4) - 8) * d
-
-                weights[row, block * qk + 2 * i] = v0
-                weights[row, block * qk + 2 * i + 1] = v1
-
-    return weights
-
-
-def dequantize_weights(fin, n_rows, n_cols):
-    qk = 32
-    nb = n_cols // qk
-    data_size = n_rows * n_cols // 2 + n_rows * nb * 4
-    fin_data = fin.read(data_size)
-    return dequantize_weights_numba(fin_data, n_rows, n_cols)
-
-
-def read_variables(fin):
-    model = {}
-    pbar = tqdm(total=os.path.getsize(fin.name), unit="B", unit_scale=True, desc="Reading variables")
-    while True:
-        start_pos = fin.tell()
-        try:
-            n_dims, name_length, ftype_cur = struct.unpack("iii", fin.read(4 * 3))
-        except struct.error:
-            break
-
-        shape = tuple(struct.unpack("i" * n_dims, fin.read(4 * n_dims)))
-        shape = shape[::-1]
-        name = fin.read(name_length).decode()
-
-        # ensure tensor data is aligned
-        tensor_data_offset = fin.tell()
-        tensor_data_offset = (tensor_data_offset + 31) & -32
-        fin.seek(tensor_data_offset)
-
-        if ftype_cur == 2:
-            # 4-bit quantized weights
-            dtype = np.uint8
-            data = dequantize_weights(fin, shape[0], shape[1])
-            data = data.reshape(shape)
-        elif ftype_cur == 0:
-            dtype = np.float32
-            data_size = np.prod(shape)
-            data = np.fromfile(fin, dtype=dtype, count=data_size).reshape(shape)
-        elif ftype_cur == 1:
-            dtype = np.float16
-            data_size = np.prod(shape)
-            data = np.fromfile(fin, dtype=dtype, count=data_size).reshape(shape)
-
-        model[name] = torch.tensor(data, dtype=torch.float32 if dtype == np.float32 else torch.float16)
-
-        pbar.update(fin.tell() - start_pos)
-
-    return model
-
-
-def convert_to_hf_format(model, hparams):
-    # This works for llama 7B, need to test with other models
-    n_layers = hparams["n_layers"]
-    n_heads = hparams["n_heads"]
-    dim = hparams["dim"]
-    dims_per_head = dim // n_heads
-    base = 10000.0
-    inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head))
-
-    # permute for sliced rotary
-    def permute(w):
-        return w.view(n_heads, dim // n_heads // 2, 2, dim).transpose(1, 2).reshape(dim, dim)
-
-    state_dict = {}
-    for layer_i in range(n_layers):
-        state_dict.update(
-            {
-                f"model.layers.{layer_i}.self_attn.q_proj.weight": permute(
-                    model[f"layers.{layer_i}.attention.wq.weight"]
-                ),
-                f"model.layers.{layer_i}.self_attn.k_proj.weight": permute(
-                    model[f"layers.{layer_i}.attention.wk.weight"]
-                ),
-                f"model.layers.{layer_i}.self_attn.v_proj.weight": model[
-                    f"layers.{layer_i}.attention.wv.weight"
-                ],
-                f"model.layers.{layer_i}.self_attn.o_proj.weight": model[
-                    f"layers.{layer_i}.attention.wo.weight"
-                ],
-                f"model.layers.{layer_i}.mlp.gate_proj.weight": model[
-                    f"layers.{layer_i}.feed_forward.w1.weight"
-                ],
-                f"model.layers.{layer_i}.mlp.down_proj.weight": model[
-                    f"layers.{layer_i}.feed_forward.w2.weight"
-                ],
-                f"model.layers.{layer_i}.mlp.up_proj.weight": model[
-                    f"layers.{layer_i}.feed_forward.w3.weight"
-                ],
-                f"model.layers.{layer_i}.input_layernorm.weight": model[
-                    f"layers.{layer_i}.attention_norm.weight"
-                ],
-                f"model.layers.{layer_i}.post_attention_layernorm.weight": model[
-                    f"layers.{layer_i}.ffn_norm.weight"
-                ],
-            }
-        )
-        state_dict[f"model.layers.{layer_i}.self_attn.rotary_emb.inv_freq"] = inv_freq
-    state_dict.update(
-        {
-            "model.embed_tokens.weight": model["tok_embeddings.weight"],
-            "model.norm.weight": model["norm.weight"],
-            "lm_head.weight": model["output.weight"],
-        }
-    )
-
-    return state_dict
-
-
-def chat(model, hparams, llama_dir):
-    from transformers import (GenerationConfig, LlamaForCausalLM,
-                              LlamaTokenizer, StoppingCriteria,
-                              StoppingCriteriaList)
-    from transformers.models.llama.configuration_llama import LlamaConfig
-
-    class StoppingCriteriaSub(StoppingCriteria):
-        def __init__(self):
-            super().__init__()
-
-        def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, stops=[]):
-            print(tokenizer.decode(input_ids[0]), end="", flush=True)
-            if input_ids[0][-1] == 13:
-                return True
-
-            return False
-
-    config = LlamaConfig(
-        vocab_size=hparams["vocab_size"],
-        dim=hparams["dim"],
-        num_hidden_layers=hparams["n_layers"],
-        num_attention_heads=hparams["n_heads"],
-    )
-
-    llama = LlamaForCausalLM(config=config)
-    llama.load_state_dict(state_dict=model, strict=True)
-    tokenizer = LlamaTokenizer.from_pretrained(llama_dir)
-
-    device = torch.device("cpu")
-    llama = llama.to(device)
-
-    ctx = """You are AI.
-This is a dialog, where User interacts with AI. AI is helpful, kind, obedient, honest, respectful, direct, concise, should try to protect User's privacy, and knows its own limits. Also, AI must answer User and AI cannot stop the conversation by itself.
-User: Hello, AI.
-AI: Hello! How can I assist you today?
-"""
-    print(ctx.rstrip("\n"))
-    while True:
-        print("-" * 60)
-        prompt = input("User: ")
-        if ctx != "":
-            ctx = f"{ctx}User: {prompt}\n"
-        else:
-            ctx = f"{prompt}\nAI:"
-
-        ctx = (ctx[-1920:]) if len(ctx) >= 2048 else ctx
-
-        print("-" * 60)
-        if len(ctx.strip()) > 0:
-            input_ids = tokenizer(ctx, return_tensors="pt")["input_ids"].to(device)
-            generation_config = GenerationConfig(
-                temperature=0.8,
-                top_p=0.95,
-                top_k=50,
-                repetition_penalty=1.1764,
-            )
-            with torch.no_grad():
-                generation_output = llama.generate(
-                    input_ids=input_ids,
-                    generation_config=generation_config,
-                    return_dict_in_generate=True,
-                    output_scores=True,
-                    max_length=2048,
-                    do_sample=True,
-                    stopping_criteria=StoppingCriteriaList([StoppingCriteriaSub()]),
-                )
-            s = generation_output.sequences[0]
-            decoded = tokenizer.decode(s)
-            ctx = f"{decoded}\n"
-
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--input_dir", "-i", type=str, required=True, help="The input directory containing the ggml files."
-    )
-    parser.add_argument(
-        "--prefix",
-        "-p",
-        type=str,
-        required=True,
-        help="The prefix of the ggml files (ggml-model-f16 or ggml-model-q4_0).",
-    )
-    parser.add_argument(
-        "--hf",
-        action="store_true",
-        help="Whether to save the model in the Hugging Face format. (default: False)",
-    )
-    parser.add_argument(
-        "--chat", "-c", action="store_true", help="Whether to open a chat with the model. (default: False)"
-    )
-    args = parser.parse_args()
-
-    llama_dir = os.path.abspath(f"{args.input_dir}/../")
-
-    ggml_files = sorted(
-        [f"{args.input_dir}/{f}" for f in os.listdir(args.input_dir) if f.startswith(args.prefix)]
-    )
-
-    fin = open(ggml_files[0], "rb")
-    hparams, ftype = read_header(fin)
-    tokens = read_tokens(fin, hparams["vocab_size"])
-    model = read_variables(fin)
-
-    for f in tqdm(ggml_files[1:]):
-        fin = open(f, "rb")
-        read_header(fin)
-        read_tokens(fin, hparams["vocab_size"])
-        model.update(read_variables(fin))
-
-    if args.hf:
-        model = convert_to_hf_format(model, hparams)
-
-    pth_ckpt = {
-        "state_dict": model,
-        "hparams": hparams,
-        "tokens": tokens,
-    }
-
-    torch.save(pth_ckpt, f"{args.input_dir}/{args.prefix}-to-torch.pth")
-
-    if args.chat:
-        if not args.hf:
-            model = convert_to_hf_format(model, hparams)
-        chat(model, hparams, llama_dir)
-
-
-if __name__ == "__main__":
-    main()
--- a/convert-gpt4all-to-ggml.py
+++ b/convert-gpt4all-to-ggml.py
@@ -1,107 +0,0 @@
-#!/usr/bin/env python3
-
-#
-# TODO: deduplicate GPT4All with convert-unversioned-ggml-to-ggml.py
-#
-
-# Original by https://github.com/eiz
-# https://github.com/ggerganov/llama.cpp/issues/324#issuecomment-1476227818
-import argparse
-import glob
-import os
-import struct
-import sys
-from sentencepiece import SentencePieceProcessor
-
-HPARAMS = keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
-
-def parse_args():
-    parser = argparse.ArgumentParser(description='Upgrade a GPT4All model to the current format')
-    parser.add_argument('gpt4all_model', help='path to gpt4all-lora-quantized.bin')
-    parser.add_argument('tokenizer_model', help='path to LLaMA tokenizer.model file')
-    return parser.parse_args()
-
-def read_header(f_in):
-    struct_fmt = "i" * (3 + len(HPARAMS))
-    struct_size = struct.calcsize(struct_fmt)
-    buf = f_in.read(struct_size)
-    return struct.unpack(struct_fmt, buf)
-
-def write_header(f_out, header):
-    (magic, vocab_size, dim, multiple_of, n_heads, n_layers, rot, ftype) = header
-
-    if magic != 0x67676d6c:
-        raise Exception('Invalid file magic. Must be an old style ggml file.')
-
-    values = [
-        0x67676d66, # magic: ggml in hex
-        1,          # file version
-        vocab_size,
-        dim,
-        multiple_of,
-        n_heads,
-        n_layers,
-        rot,
-        ftype
-    ]
-    f_out.write(struct.pack("i" * len(values), *values))
-
-def write_tokens(fout, tokenizer):
-    for i in range(tokenizer.vocab_size()):
-        if tokenizer.is_unknown(i):
-            text = " \u2047 ".encode()
-        elif tokenizer.is_control(i):
-            text = b""
-        elif tokenizer.is_byte(i):
-            piece = tokenizer.id_to_piece(i)
-            if len(piece) != 6:
-                print(f"Invalid token: {piece}")
-                sys.exit(1)
-            byte_value = int(piece[3:-1], 16)
-            text = struct.pack("B", byte_value)
-        else:
-            text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode()
-        fout.write(struct.pack("i", len(text)))
-        fout.write(text)
-        fout.write(struct.pack("f", tokenizer.get_score(i)))
-
-    # TODO: GPT4All - add extra <pad> token
-    text = "<pad>".encode()
-    fout.write(struct.pack("i", len(text)))
-    fout.write(text)
-    fout.write(struct.pack("f", 0.0))
-
-def read_tokens(f_in, tokenizer):
-    for i in range(tokenizer.vocab_size()):
-        len_b = f_in.read(4)
-        (length,) = struct.unpack("i", len_b)
-        f_in.read(length)
-
-def copy_all_data(f_out, f_in):
-    while True:
-        buf = f_in.read(1024 * 1024)
-        if not buf:
-            break
-        f_out.write(buf)
-
-def convert_one_file(path_in, tokenizer):
-    path_tmp = f"{path_in}.tmp"
-    path_orig= f"{path_in}.orig"
-    print(f"converting {path_in}")
-    with open(path_in, "rb") as f_in, open(path_tmp, "wb") as f_out:
-        write_header(f_out, read_header(f_in))
-        read_tokens(f_in, tokenizer)
-        write_tokens(f_out, tokenizer)
-        copy_all_data(f_out, f_in)
-    os.rename(path_in, path_orig)
-    os.rename(path_tmp, path_in)
-
-def main():
-    args = parse_args()
-
-    tokenizer = SentencePieceProcessor(args.tokenizer_model)
-
-    convert_one_file(args.gpt4all_model, tokenizer)
-
-if __name__ == "__main__":
-    main()
--- a/convert-gptq-to-ggml.py
+++ b/convert-gptq-to-ggml.py
@@ -1,172 +0,0 @@
-# Convert a GPTQ quantized LLaMA model to a ggml compatible file
-# Based on: https://github.com/qwopqwop200/GPTQ-for-LLaMa
-#
-import os
-import re
-import sys
-import json
-import struct
-import numpy as np
-import torch
-from sentencepiece import SentencePieceProcessor
-
-if len(sys.argv) != 4:
-    print("Usage: convert-gptq-to-ggml.py llamaXXb-4bit.pt tokenizer.model out.bin\n")
-    sys.exit(1)
-
-fname_model = sys.argv[1]
-fname_tokenizer = sys.argv[2]
-dir_out = sys.argv[3]
-
-model = torch.load(fname_model, map_location="cpu")
-
-n_vocab, n_embd = model['model.embed_tokens.weight'].shape
-n_layer = 1 + max(int(m.group(1)) for name in model
-                  if (m := re.match(r'model\.layers\.([0-9]+)', name)))
-
-# hardcoded:
-n_mult = 256
-n_head = {32: 32, 40: 40, 60: 52, 80: 64}[n_layer]
-
-tokenizer = SentencePieceProcessor(fname_tokenizer)
-
-assert tokenizer.vocab_size() == n_vocab
-
-fname_out = sys.argv[3]
-
-fout = open(fname_out, "wb")
-
-fout.write(struct.pack("i", 0x67676d66)) # magic: ggmf in hex
-fout.write(struct.pack("i", 1)) # file version
-fout.write(struct.pack("i", n_vocab))
-fout.write(struct.pack("i", n_embd))
-fout.write(struct.pack("i", n_mult))
-fout.write(struct.pack("i", n_head))
-fout.write(struct.pack("i", n_layer))
-fout.write(struct.pack("i", n_embd // n_head)) # rot (obsolete)
-fout.write(struct.pack("i", 4))
-
-
-# This loop unchanged from convert-pth-to-ggml.py:
-for i in range(tokenizer.vocab_size()):
-    if tokenizer.is_unknown(i):
-        text = " \u2047 ".encode()
-    elif tokenizer.is_control(i):
-        text = b""
-    elif tokenizer.is_byte(i):
-        piece = tokenizer.id_to_piece(i)
-        if len(piece) != 6:
-            print(f"Invalid token: {piece}")
-            sys.exit(1)
-        byte_value = int(piece[3:-1], 16)
-        text = struct.pack("B", byte_value)
-    else:
-        text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode()
-    fout.write(struct.pack("i", len(text)))
-    fout.write(text)
-    fout.write(struct.pack("f", tokenizer.get_score(i)))
-
-def write_header(shape, dst_name, ftype_cur):
-    sname = dst_name.encode()
-    fout.write(struct.pack("iii", len(shape), len(sname), ftype_cur))
-    fout.write(struct.pack("i" * len(shape), *shape[::-1]))
-    fout.write(sname)
-
-    # ensure tensor data is aligned
-    tensor_data_offset = fout.tell()
-    tensor_data_offset = (tensor_data_offset + 31) & -32
-    fout.seek(tensor_data_offset)
-
-def convert_non_q4(src_name, dst_name):
-    v = model[src_name]
-    shape = v.shape
-    print(f"Processing non-Q4 variable: {src_name} with shape: {shape} and type: {v.dtype}")
-    if len(shape) == 1:
-        print("  Converting to float32")
-        v = v.to(torch.float32)
-
-    ftype_cur = {torch.float16: 1, torch.float32: 0}[v.dtype]
-
-    # header
-    write_header(shape, dst_name, ftype_cur)
-
-    # data
-    v.numpy().tofile(fout)
-
-def convert_q4(src_name, dst_name, permute=False):
-    zeros = model[f"{src_name}.zeros"].numpy()
-    scales = model[f"{src_name}.scales"].numpy()
-    bias = model[f"{src_name}.bias"].numpy()
-    qweight = model[f"{src_name}.qweight"].numpy().T # transpose
-
-    # Q4_1 does not support bias; good thing the bias is always all zeros.
-    assert not np.any(bias)
-
-    # Each int32 item is actually 8 int4 items packed together, and it's transposed.
-    shape = (qweight.shape[0], qweight.shape[1] * 8)
-
-    print(f"Processing Q4 variable: {src_name} with shape: {shape}")
-
-    # The output format has the int4 weights in groups of 32 rather than 8.
-    # It looks like this:
-    # For each row:
-    #   For each group of 32 columns:
-    #     - addend (float32, 4 bytes)
-    #     - scale (float32, 4 bytes)
-    #     - weights (int4 * 32, 16 bytes)
-    # Note that in the input, the scales and addends are shared between all
-    # the columns in a row, so we end up wasting quite a bit of memory with
-    # repeated scales and addends.
-
-    addends = -zeros # flip sign
-
-    # Since the output format is mixed between integers and floats, we have
-    # to hackily view the floats as int32s just so numpy will let us
-    # concatenate them.
-    addends_view = addends.view(dtype=np.int32)
-    scales_view = scales.view(dtype=np.int32)
-
-    # Split into groups of 4 columns (i.e. 32 columns of quantized data):
-    grouped = qweight.reshape([qweight.shape[0], qweight.shape[1] // 4, 4])
-
-    # Repeat addends and scales:
-    addends_rep = np.atleast_3d(addends_view).repeat(grouped.shape[1], axis=1)
-    scales_rep = np.atleast_3d(scales_view).repeat(grouped.shape[1], axis=1)
-
-    blob = np.concatenate([scales_rep, addends_rep, grouped], axis=2, casting='no')
-
-    if permute:
-        # Permute some rows to undo the permutation done by convert_llama_weights_to_hf.py.
-        # This can be done after the above conversion because it doesn't affect column order/layout.
-        blob = (blob.reshape(n_head, 2, shape[0] // n_head // 2, *blob.shape[1:])
-                    .swapaxes(1, 2)
-                    .reshape(blob.shape))
-
-    # header
-    write_header(shape, dst_name, 3) # ftype = Q4_1
-
-    # data
-    blob.tofile(fout)
-
-convert_non_q4("model.embed_tokens.weight", "tok_embeddings.weight")
-convert_non_q4("model.norm.weight", "norm.weight")
-convert_non_q4("lm_head.weight", "output.weight")
-
-for i in range(n_layer):
-    convert_q4(f"model.layers.{i}.self_attn.q_proj", f"layers.{i}.attention.wq.weight", permute=True)
-    convert_q4(f"model.layers.{i}.self_attn.k_proj", f"layers.{i}.attention.wk.weight", permute=True)
-    convert_q4(f"model.layers.{i}.self_attn.v_proj", f"layers.{i}.attention.wv.weight")
-    convert_q4(f"model.layers.{i}.self_attn.o_proj", f"layers.{i}.attention.wo.weight")
-
-    convert_q4(f"model.layers.{i}.mlp.gate_proj", f"layers.{i}.feed_forward.w1.weight")
-    convert_q4(f"model.layers.{i}.mlp.down_proj", f"layers.{i}.feed_forward.w2.weight")
-    convert_q4(f"model.layers.{i}.mlp.up_proj",   f"layers.{i}.feed_forward.w3.weight")
-
-    convert_non_q4(f"model.layers.{i}.input_layernorm.weight", f"layers.{i}.attention_norm.weight")
-    convert_non_q4(f"model.layers.{i}.post_attention_layernorm.weight", f"layers.{i}.ffn_norm.weight")
-
-
-fout.close()
-
-print(f"Done. Output file: {fname_out}")
-print()
--- a/convert-lora-to-ggml.py
+++ b/convert-lora-to-ggml.py
@@ -0,0 +1,124 @@
+import json
+import os
+import re
+import struct
+import sys
+from typing import Any, Dict, Sequence, TextIO
+
+import torch
+
+from convert import DATA_TYPE_TO_FTYPE, NUMPY_TYPE_TO_DATA_TYPE, DataType
+
+HF_SUBLAYER_TO_GGML = {
+    "self_attn.q_proj": "attention.wq",
+    "self_attn.k_proj": "attention.wk",
+    "self_attn.v_proj": "attention.wv",
+    "self_attn.o_proj": "attention.wo",
+    "mlp.gate_proj": "feed_forward.w1",
+    "mlp.down_proj": "feed_forward.w2",
+    "mlp.up_proj": "feed_forward.w3",
+    "input_layernorm": "attention_norm",
+    "post_attention_layernorm": "ffn_norm",
+    # "norm": "norm",
+    # "embed_tokens": "tok_embeddings",
+    # "lm_head": "output",
+}
+
+
+def translate_tensor_name(t: str) -> str:
+    """Map a HuggingFace PEFT LoRA tensor name onto the ggml LoRA tensor name.
+
+    Prints an error and exits with status 1 when the name does not match the
+    expected pattern or its sub-layer is missing from HF_SUBLAYER_TO_GGML.
+    """
+    found = re.match(r".*layers\.(\d+)\.(\w+\.\w+)\.lora_(A|B)\.weight", t)
+    if not found:
+        print(f"Error: unrecognized tensor {t}")
+        sys.exit(1)
+
+    layer_idx, sub_layer, lora_type = found.groups()
+
+    ggml_sub_layer = HF_SUBLAYER_TO_GGML.get(sub_layer)
+    if ggml_sub_layer is None:
+        print(f"Error: unrecognized sub-layer {sub_layer} in tensor {t}")
+        sys.exit(1)
+
+    return f"layers.{layer_idx}.{ggml_sub_layer}.weight.lora{lora_type}"
+
+
+def write_file_header(fout: TextIO, params: Dict[str, Any]) -> None:
+    # Header layout: magic "ggla" (ggml lora) byte-reversed, file version 1, then LoRA r and alpha.
+    # NOTE(review): bytes are written here, so TextIO looks inaccurate (BinaryIO?) — confirm.
+    fout.write(b"ggla"[::-1] + struct.pack("iii", 1, params["r"], params["lora_alpha"]))
+
+
+def write_tensor_header(fout, name: str, shape: Sequence[int], data_type: DataType) -> None:
+    """Write a tensor header to `fout`: n_dims, name length and ftype, the dims in reverse
+    order, then the UTF-8 name; finally seek forward to the next multiple of 32 bytes."""
+    sname = name.encode("utf-8")
+    fout.write(
+        struct.pack(
+            "iii",
+            len(shape),
+            len(sname),
+            DATA_TYPE_TO_FTYPE[NUMPY_TYPE_TO_DATA_TYPE[data_type]],
+        )
+    )
+    fout.write(struct.pack("i" * len(shape), *shape[::-1]))
+    fout.write(sname)
+    fout.seek((fout.tell() + 31) & -32)
+
+
+if len(sys.argv) != 2:
+    print(f"Usage: python {sys.argv[0]} <path>")
+    print(
+        "Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'"
+    )
+    sys.exit(1)
+
+input_json = os.path.join(sys.argv[1], "adapter_config.json")
+input_model = os.path.join(sys.argv[1], "adapter_model.bin")
+output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin")
+# NOTE: torch.load unpickles the file, so only convert adapters from a trusted source.
+model = torch.load(input_model, map_location="cpu")
+
+with open(input_json, "r") as f:
+    params = json.load(f)
+
+if params["peft_type"] != "LORA":
+    print(f"Error: unsupported adapter type {params['peft_type']}, expected LORA")
+    sys.exit(1)
+# Optional config keys are read with .get() so a config that omits them counts as unset.
+if params.get("fan_in_fan_out"):
+    print("Error: param fan_in_fan_out is not supported")
+    sys.exit(1)
+
+if params.get("bias") not in (None, "none"):
+    print("Error: param bias is not supported")
+    sys.exit(1)
+
+# TODO: these seem to be layers that have been trained but without lora.
+# doesn't seem widely used but eventually should be supported
+if params.get("modules_to_save"):
+    print("Error: param modules_to_save is not supported")
+    sys.exit(1)
+
+with open(output_path, "wb") as fout:
+    fout.truncate()
+
+    write_file_header(fout, params)
+    for k, v in model.items():
+        # lora_A tensors keep float16/float32 as-is and are transposed;
+        # every other tensor is converted to float32 without transposing.
+        is_lora_a = k.endswith("lora_A.weight")
+        if not is_lora_a or v.dtype not in (torch.float16, torch.float32):
+            v = v.float()
+        if is_lora_a:
+            v = v.T
+        t = v.numpy()
+        tname = translate_tensor_name(k)
+        print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB")
+        write_tensor_header(fout, tname, t.shape, t.dtype)
+        t.tofile(fout)
+
+print(f"Converted {input_json} and {input_model} to {output_path}")
--- a/convert-pth-to-ggml.py
+++ b/convert-pth-to-ggml.py
@@ -1,274 +1,11 @@
-# Convert a LLaMA model checkpoint to a ggjt compatible file
-#
-# Load the model using Torch
-# Iterate over all variables and write them to a binary file.
-#
-# For each variable, write the following:
-#   - Number of dimensions (int)
-#   - Name length (int)
-#   - Dimensions (int[n_dims])
-#   - Name (char[name_length])
-#   - Data (float[n_dims])
-#
-# At the start of the ggml file we write the model parameters
-# and vocabulary.
-#
+# Compatibility stub

 import argparse
-import os
-import sys
-import json
-import struct
-import numpy as np
-import torch

-from sentencepiece import SentencePieceProcessor
+import convert

-QK = 32
-
-GGML_TYPE_Q4_0  = 0
-GGML_TYPE_Q4_1  = 1
-GGML_TYPE_I8    = 2
-GGML_TYPE_I16   = 3
-GGML_TYPE_I32   = 4
-GGML_TYPE_F16   = 5
-GGML_TYPE_F32   = 6
-
-WTYPES = {
-    0: GGML_TYPE_F32,
-    1: GGML_TYPE_F16,
-    2: GGML_TYPE_Q4_0,
-    3: GGML_TYPE_Q4_1,
-}
-
-GGML_BLCK_SIZE = {
-    GGML_TYPE_Q4_0:  QK,
-    GGML_TYPE_Q4_1:  QK,
-    GGML_TYPE_I8:    1,
-    GGML_TYPE_I16:   1,
-    GGML_TYPE_I32:   1,
-    GGML_TYPE_F16:   1,
-    GGML_TYPE_F32:   1,
-}
-
-GGML_TYPE_SIZE = {
-    GGML_TYPE_Q4_0: 4   + QK//2,
-    GGML_TYPE_Q4_1: 4*2 + QK//2,
-    GGML_TYPE_I8:   1,
-    GGML_TYPE_I16:  2,
-    GGML_TYPE_I32:  4,
-    GGML_TYPE_F16:  2,
-    GGML_TYPE_F32:  4,
-}
-
-def ggml_nelements(shape):
-    r = 1
-    for i in shape:
-        r *= i
-    return r
-
-def ggml_nbytes(shape, ftype):
-    x = ggml_nelements(shape)
-    t = WTYPES[ftype]
-    x *= GGML_TYPE_SIZE[t]
-    x //= GGML_BLCK_SIZE[t]
-    return x
-
-def parse_args():
-    parser = argparse.ArgumentParser(description='Convert a LLaMA model checkpoint to a ggml compatible file')
-    parser.add_argument('dir_model',  help='directory containing the model checkpoint')
-    parser.add_argument('ftype',      help='file type (0: float32, 1: float16)', type=int, choices=[0, 1], default=1)
-    parser.add_argument('vocab_only', help='only write vocab to file', type=int, default=0, nargs='?')
-    return parser.parse_args()
-
-def get_n_parts(dim):
-    mappings = {4096: 1, 5120: 2, 6656: 4, 8192: 8}
-    n_parts = mappings.get(dim)
-    if n_parts is None:
-        print(f"Invalid dim: {dim}")
-        sys.exit(1)
-
-    print(f"n_parts = {n_parts}\n")
-    return n_parts
-
-def load_hparams_and_tokenizer(dir_model):
-    # `dir_model` is something like `models/7B` or `models/7B/`.
-    # "tokenizer.model" is expected under model's parent dir.
-    # When `dir_model` is a symlink, f"{dir_model}/../tokenizer.model" would not be found.
-    # Let's use the model's parent dir directly.
-    model_parent_dir = os.path.dirname(os.path.normpath(dir_model))
-    fname_hparams = f"{dir_model}/params.json"
-    fname_tokenizer = f"{model_parent_dir}/tokenizer.model"
-    with open(fname_hparams, "r") as f:
-        hparams = json.load(f)
-        print(hparams)
-    tokenizer = SentencePieceProcessor(fname_tokenizer)
-    hparams.update({"vocab_size": tokenizer.vocab_size()})
-    return hparams, tokenizer
-
-def write_header(fout, hparams, ftype):
-    keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
-    values = [
-        0x67676a74,  # magic: ggjt in hex
-        1, # file version
-        *[hparams[key] for key in keys],
-        hparams["dim"] // hparams["n_heads"],  # rot (obsolete)
-        ftype
-    ]
-    fout.write(struct.pack("i" * len(values), *values))
-
-def write_tokens(fout, tokenizer):
-    for i in range(tokenizer.vocab_size()):
-        if tokenizer.is_unknown(i):
-            text = " \u2047 ".encode()
-        elif tokenizer.is_control(i):
-            text = b""
-        elif tokenizer.is_byte(i):
-            piece = tokenizer.id_to_piece(i)
-            if len(piece) != 6:
-                print(f"Invalid token: {piece}")
-                sys.exit(1)
-            byte_value = int(piece[3:-1], 16)
-            text = struct.pack("B", byte_value)
-        else:
-            text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode()
-        fout.write(struct.pack("i", len(text)))
-        fout.write(text)
-        fout.write(struct.pack("f", tokenizer.get_score(i)))
-
-def process_and_write_variables(fout, model, ftype, part_id, n_parts):
-    for name, datao in model.items():
-        if name.endswith("freqs"):
-            continue
-
-        # remove dimensions with a single element
-        data = datao.numpy().squeeze()
-        partshape = data.shape
-        n_dims = len(data.shape)
-        assert n_dims in (1, 2)
-
-        print(f"Processing variable: {name} with shape: {partshape} and type: {datao.dtype}")
-
-        # coerce single-dimensional tensors from float16 to float32
-        ftype_cur = 1
-        if ftype == 0 or n_dims == 1:
-            print("  Converting to float32")
-            data = data.astype(np.float32)
-            ftype_cur = 0
-        blck_size = GGML_BLCK_SIZE[WTYPES[ftype_cur]]
-        type_size = GGML_TYPE_SIZE[WTYPES[ftype_cur]]
-
-        # determine dimension along which multipart tensor is sharded
-        #
-        # split_dim 0 regex:
-        #   - output.*
-        #   - layers.*.attention.wq.weight
-        #   - layers.*.attention.wk.weight
-        #   - layers.*.attention.wv.weight
-        #   - layers.*.feed_forward.w1.weight
-        #   - layers.*.feed_forward.w3.weight
-        #
-        # split_dim 1 regex:
-        #   - tok_embeddings.*
-        #   - layers.*.attention.wo.weight
-        #   - layers.*.feed_forward.w2.weight
-        #
-        if n_dims > 1:
-            split_dim = 1
-            if "tok_embeddings" in name:
-                split_dim = 1
-            elif "layers" in name:
-                if "attention.wo.weight" in name:
-                    split_dim = 1
-                elif "feed_forward.w2.weight" in name:
-                    split_dim = 1
-                else:
-                    split_dim = 0
-            elif "output" in name:
-                split_dim = 0
-
-        # output tensor header
-        fullshape = list(partshape)
-        if n_dims > 1:
-            fullshape[split_dim] *= n_parts
-        sname = name.encode()
-        fout.write(struct.pack("iii", n_dims, len(sname), ftype_cur))
-        for dim in reversed(fullshape):
-            fout.write(struct.pack("i", dim))
-        fout.write(sname)
-
-        # ensure tensor data is aligned
-        tensor_data_offset = fout.tell()
-        while tensor_data_offset % QK != 0:
-            fout.write(struct.pack("B", 0))
-            tensor_data_offset += 1
-
-        # output unified mappable tensor data
-        if n_dims == 1 or n_parts == 1:
-            # copy tensor which we thankfully received in one piece
-            if part_id == 0:
-                data.tofile(fout)
-        elif split_dim == 0:
-            # reassemble multifile tensor containing some of the rows
-            rows_per_chunk = partshape[0]
-            current_row = part_id * rows_per_chunk
-            bytes_per_row = fullshape[1] // blck_size * type_size
-            offset = current_row * bytes_per_row
-            fout.seek(tensor_data_offset + offset)
-            data.tofile(fout)
-        elif split_dim == 1:
-            # reassemble multifile tensor containing some of the cols
-            cols_per_chunk = partshape[1]
-            current_col = part_id * cols_per_chunk
-            bytes_per_row = fullshape[1] // blck_size * type_size
-            offset_current_col = current_col // blck_size * type_size
-            for row in range(partshape[0]):
-                offset_row = row * bytes_per_row
-                offset = offset_row + offset_current_col
-                fout.seek(tensor_data_offset + offset)
-                data[row].tofile(fout)
-
-        # advance file position to next tensor
-        fout.seek(tensor_data_offset + ggml_nbytes(fullshape, ftype_cur))
-
-def main():
-    args = parse_args()
-    dir_model = args.dir_model
-    ftype = args.ftype
-    ftype_str = ["f32", "f16"]
-    hparams, tokenizer = load_hparams_and_tokenizer(dir_model)
-
-    print(args)
-
-    # if only writing vocab to file
-    if args.vocab_only:
-        fname_model = f"{dir_model}/consolidated.00.pth"
-        fname_out = f"{dir_model}/ggml-vocab.bin"
-        print(f"Extracting only the vocab from '{fname_model}'\n")
-        with open(fname_out, "wb") as fout:
-            write_header(fout, hparams, ftype)
-            write_tokens(fout, tokenizer)
-        print(f"Done. Output file: {fname_out}\n")
-        return
-
-    n_parts = get_n_parts(hparams["dim"])
-    fname_out = f"{dir_model}/ggml-model-{ftype_str[ftype]}.bin"
-
-    # we output a single file for ggml
-    with open(fname_out, "wb") as fout:
-        write_header(fout, hparams, ftype)
-        write_tokens(fout, tokenizer)
-        offset_of_tensors = fout.tell()
-        # the tensors we load could be split across multiple files
-        for part_id in range(n_parts):
-            fout.seek(offset_of_tensors)
-            print(f"Processing part {part_id+1} of {n_parts}\n")
-            fname_model = f"{dir_model}/consolidated.0{part_id}.pth"
-            model = torch.load(fname_model, map_location="cpu")
-            process_and_write_variables(fout, model, ftype, part_id, n_parts)
-            del model
-
-    print(f"Done. Output file: {fname_out}\n")
-
-if __name__ == "__main__":
-    main()
+parser = argparse.ArgumentParser(description='Convert a LLaMA model checkpoint to a ggml compatible file')
+parser.add_argument('dir_model',  help='directory containing the model checkpoint')
+parser.add_argument('ftype',      help='file type (0: float32, 1: float16)', type=int, choices=[0, 1], default=1)
+args = parser.parse_args()
+convert.main(['--outtype', 'f16' if args.ftype == 1 else 'f32', '--', args.dir_model])  # translate the legacy numeric ftype into convert's --outtype argument
--- a/convert-unversioned-ggml-to-ggml.py
+++ b/convert-unversioned-ggml-to-ggml.py
@@ -1,100 +0,0 @@
-#!/usr/bin/env python3
-# Original by https://github.com/eiz
-# https://github.com/ggerganov/llama.cpp/issues/324#issuecomment-1476227818
-import argparse
-import glob
-import os
-import struct
-import sys
-from sentencepiece import SentencePieceProcessor
-
-HPARAMS = keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
-
-def parse_args():
-    parser = argparse.ArgumentParser(description='Upgrade old ggml model files to the current format')
-    parser.add_argument('dir_model', help='directory containing ggml .bin files')
-    parser.add_argument('tokenizer_model', help='path to LLaMA tokenizer.model file')
-    return parser.parse_args()
-
-def read_header(f_in):
-    struct_fmt = "i" * (3 + len(HPARAMS))
-    struct_size = struct.calcsize(struct_fmt)
-    buf = f_in.read(struct_size)
-    return struct.unpack(struct_fmt, buf)
-
-def write_header(f_out, header):
-    (magic, vocab_size, dim, multiple_of, n_heads, n_layers, rot, ftype) = header
-
-    if magic != 0x67676d6c:
-        raise Exception('Invalid file magic. Must be an old style ggml file.')
-
-    values = [
-        0x67676d66,  # magic: ggml in hex
-        1, # file version
-        vocab_size,
-        dim,
-        multiple_of,
-        n_heads,
-        n_layers,
-        rot,
-        ftype
-    ]
-    f_out.write(struct.pack("i" * len(values), *values))
-
-def write_tokens(fout, tokenizer):
-    for i in range(tokenizer.vocab_size()):
-        if tokenizer.is_unknown(i):
-            text = " \u2047 ".encode()
-        elif tokenizer.is_control(i):
-            text = b""
-        elif tokenizer.is_byte(i):
-            piece = tokenizer.id_to_piece(i)
-            if len(piece) != 6:
-                print(f"Invalid token: {piece}")
-                sys.exit(1)
-            byte_value = int(piece[3:-1], 16)
-            text = struct.pack("B", byte_value)
-        else:
-            text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode()
-        fout.write(struct.pack("i", len(text)))
-        fout.write(text)
-        fout.write(struct.pack("f", tokenizer.get_score(i)))
-
-def read_tokens(f_in, tokenizer):
-    for i in range(tokenizer.vocab_size()):
-        len_b = f_in.read(4)
-        (length,) = struct.unpack("i", len_b)
-        f_in.read(length)
-
-def copy_all_data(f_out, f_in):
-    while True:
-        buf = f_in.read(1024 * 1024)
-        if not buf:
-            break
-        f_out.write(buf)
-
-def convert_one_file(path_in, tokenizer):
-    path_tmp = f"{path_in}.tmp"
-    path_orig= f"{path_in}.orig"
-    print(f"converting {path_in}")
-    with open(path_in, "rb") as f_in, open(path_tmp, "wb") as f_out:
-        write_header(f_out, read_header(f_in))
-        read_tokens(f_in, tokenizer)
-        write_tokens(f_out, tokenizer)
-        copy_all_data(f_out, f_in)
-    os.rename(path_in, path_orig)
-    os.rename(path_tmp, path_in)
-
-def main():
-    args = parse_args()
-    files = []
-    files.extend(glob.glob(f"{args.dir_model}/*.bin"))
-    files.extend(glob.glob(f"{args.dir_model}/*.bin.*"))
-
-    tokenizer = SentencePieceProcessor(args.tokenizer_model)
-
-    for file in files:
-        convert_one_file(file, tokenizer)
-
-if __name__ == "__main__":
-    main()
--- a/convert.py
+++ b/convert.py
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -31,6 +31,8 @@ if (EMSCRIPTEN)
 else()
    add_subdirectory(main)
    add_subdirectory(quantize)
+    add_subdirectory(quantize-stats)
    add_subdirectory(perplexity)
    add_subdirectory(embedding)
+    add_subdirectory(save-load-state)
 endif()
--- a/examples/Miku.sh
+++ b/examples/Miku.sh
@@ -19,15 +19,15 @@ GEN_OPTIONS=(--batch_size 1024
 --top_p 0.5)

 if [ -n "$N_THREAD" ]; then
-	GEN_OPTIONS+=(--threads "$N_THREAD")
+    GEN_OPTIONS+=(--threads "$N_THREAD")
 fi

 ./main "${GEN_OPTIONS[@]}" \
-	--model "$MODEL" \
-	--n_predict "$N_PREDICTS" \
-	--color --interactive \
-	--reverse-prompt "${USER_NAME}:" \
-	--prompt "
+    --model "$MODEL" \
+    --n_predict "$N_PREDICTS" \
+    --color --interactive \
+    --reverse-prompt "${USER_NAME}:" \
+    --prompt "
 This is a transcript of a 1000 page, never ending conversation between ${USER_NAME} and the cute and helpful AI assistant ${AI_NAME}. ${AI_NAME} is a girl who is an AI running on the users computer.
 ${AI_NAME} can think for herself without the user seeing her thoughts by adding a /think prefix to her output. She uses this to reason about the world and to think about what she should say next.
 ${AI_NAME} is always coherent and makes sense, but if she isn't sure if what she is saying is correct she will ask the user for help.
--- a/examples/alpaca.sh
+++ b/examples/alpaca.sh
@@ -7,4 +7,13 @@
 cd `dirname $0`
 cd ..

-./main -m ./models/ggml-alpaca-7b-q4.bin --color -f ./prompts/alpaca.txt -ins -b 256 --top_k 10000 --temp 0.2 --repeat_penalty 1 -t 7
+./main -m ./models/ggml-alpaca-7b-q4.bin \
+       --color \
+       -f ./prompts/alpaca.txt \
+       --ctx_size 2048 \
+       -n -1 \
+       -ins -b 256 \
+       --top_k 10000 \
+       --temp 0.2 \
+       --repeat_penalty 1.1 \
+       -t 7
--- a/examples/benchmark/benchmark-q4_0-matmult.c
+++ b/examples/benchmark/benchmark-q4_0-matmult.c
@@ -0,0 +1,270 @@
+/*
+    License: MIT License
+
+    Changelog:
+    - 2023-03-31 Initial version by Sebastian Apel (https://github.com/SebastianApel)
+
+*/
+
+#include <locale.h>
+#include "ggml.h"
+#include <assert.h>
+#include <math.h>
+#include <cstring>
+#include <cstdio>
+#include <cinttypes>
+#include <unordered_map>
+#include <queue>
+#include <string.h>
+#include <cassert>
+#include <fstream>
+#include <string>
+#include <iterator>
+#include <algorithm>
+
+// Sums ne[0]*ne[1] floats read from tensor->data when the type is F32; otherwise returns 0.
+float tensor_sum_elements(struct ggml_tensor * tensor) {
+    float total = 0;
+    if (tensor->type != GGML_TYPE_F32) return total;
+    const float * values = (const float *) tensor->data;
+    for (int row = 0; row < tensor->ne[1]; row++) {
+        const float * row_values = values + row*tensor->ne[0];
+        for (int col = 0; col < tensor->ne[0]; col++) total += row_values[col];
+    }
+    return total;
+}
+
+
+/*
+    The following types are reported as "UNKNOWN":
+    GGML_TYPE_I8,
+    GGML_TYPE_I16,
+    GGML_TYPE_I32,
+    GGML_TYPE_COUNT,
+*/
+// Short display name for a ggml type; argument and expansion are parenthesized so the macro is safe inside larger expressions.
+#define TENSOR_TYPE_AS_STR(TYPE) ((TYPE) == GGML_TYPE_F32 ? "FP32" : (TYPE) == GGML_TYPE_F16 ? "FP16" : (TYPE) == GGML_TYPE_Q4_0 ? "Q4_0" : (TYPE) == GGML_TYPE_Q4_1 ? "Q4_1" : "UNKNOWN")
+
+#define TENSOR_DUMP(TENSOR) printf("%15s: type = %i (%5s) ne = %5d x %5d x %5d, nb = (%5li, %5li, %5li) - ", #TENSOR, \
+        TENSOR->type,TENSOR_TYPE_AS_STR(TENSOR->type),\
+        TENSOR->ne[0], TENSOR->ne[1], TENSOR->ne[2], TENSOR->nb[0], TENSOR->nb[1], TENSOR->nb[2]); \
+    { float sum = tensor_sum_elements(TENSOR); printf("Sum of tensor %s is %6.2f\n",#TENSOR, sum); }
+
+struct benchmark_params_struct {
+    int32_t n_threads     = 1;   // thread count assigned to each compute graph (-t / --threads)
+    int32_t n_iterations  = 10;  // number of timed Q4_0 multiplications (-i / --iter)
+};
+
+// Print the command-line help to stderr.
+// The default values shown in the help text are taken from `params`.
+void print_usage(int /*argc*/, char ** argv, struct benchmark_params_struct params) {
+    fprintf(stderr, "usage: %s [options]\n" "\n" "options:\n", argv[0]);
+    fprintf(stderr, "  -h, --help            show this help message and exit\n");
+    fprintf(stderr, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
+    fprintf(stderr, "  -i N, --iter N     number of iterations to use during computation (default: %d)\n", params.n_iterations);
+    fprintf(stderr, "\n");
+}
+
+// Benchmark driver: computes one F32 matrix multiplication as a reference, then times repeated Q4_0
+// multiplications. Exits with status 1 on a missing option value, a failed ggml_init() or a result mismatch.
+int main(int argc, char ** argv)  {
+    struct benchmark_params_struct benchmark_params;
+
+    bool invalid_param = false;
+    std::string arg;
+    for (int i = 1; i < argc; i++) {
+        arg = argv[i];
+
+        if (arg == "-t" || arg == "--threads") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            benchmark_params.n_threads = std::stoi(argv[i]);
+        } else if (arg == "-i" || arg == "--iter") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            benchmark_params.n_iterations = std::stoi(argv[i]);
+        }  else if (arg == "-h" || arg == "--help") {
+            print_usage(argc, argv, benchmark_params);
+            exit(0);
+        }
+    }
+    if (invalid_param) {
+        // reported after the loop: the `break` above would skip a check placed inside it
+        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
+        print_usage(argc, argv, benchmark_params);
+        exit(1);
+    }
+
+    // create the ggml context
+    printf("Starting Test\n");
+
+
+
+    struct ggml_context * ctx;
+    //const int sizex = 4096;
+    //const int sizey = 11008;
+
+#undef VERBOSE_DEBUGGING
+#ifndef VERBOSE_DEBUGGING
+    const int sizey = 4096;
+    const int sizex = 11008;
+    const int sizez = 128;
+#else
+    /* Working - let's increase size */
+    const int sizey = 1;
+    const int sizex = (8*32);
+    const int sizez = 1;
+
+    /*const int sizey = 1;
+    const int sizex = 3*(8*32);
+    const int sizez = 1;*/
+#endif
+
+    //printf("Memsize required = %i\n", sizex*sizex);
+    ggml_type wtype = GGML_TYPE_F32;
+
+    size_t ctx_size = 0;
+    ctx_size += sizex*sizey*ggml_type_sizef(wtype);
+    ctx_size += sizex*sizey*ggml_type_sizef(wtype);
+    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
+    ctx_size += sizex*sizeof(float);
+    ctx_size += 1024*1024*100;
+
+    printf("Allocating Memory of size %zu bytes, %zu MB\n",ctx_size, (ctx_size/1024/1024));
+
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ ctx_size,
+        /*.mem_buffer =*/ NULL,
+        /* no_alloc   =*/ 0
+    };
+
+    ctx = ggml_init(params);
+    if (!ctx) {
+        fprintf(stderr, "%s: ggml_init() failed\n", __func__);
+        return 1;
+    }
+
+
+    printf("Creating new tensors\n");
+    // printf("Creating new tensor m1\n");
+    struct ggml_tensor * m11 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey);
+    ggml_set_f32(m11, 1.0f);
+
+    // printf("Creating new tensor m1\n");
+    struct ggml_tensor * m12 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey);
+    ggml_set_f32(m12, 1.5f);
+
+    // printf("Creating new tensor m2\n");
+    struct ggml_tensor * m2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizez);
+    ggml_set_f32(m2, 2.0f);
+
+    printf("\n------ Test 1 - Matrix Mult via F32 code ------------------------------------------------------------------------------\n");
+    // printf("Creating new tensor m11xm2\n");
+    struct ggml_tensor * m11xm2 = ggml_mul_mat(ctx, m11, m2);
+
+    // printf("Creating compute graph\n");
+    struct ggml_cgraph gf = ggml_build_forward(m11xm2);
+
+    gf.n_threads=benchmark_params.n_threads;
+    printf("cgraph->n_threads=%i\n",gf.n_threads);
+
+    TENSOR_DUMP(m11);
+    TENSOR_DUMP(m2);
+
+    ggml_graph_compute(ctx, &gf);
+
+    TENSOR_DUMP(gf.nodes[0]);
+
+    printf("\n------ Test 2 - Matrix Mult via Q4_0 code ------------------------------------------------------------------------------\n");
+
+    int32_t nelements = sizex*sizey;
+    int32_t ne[2] = { sizex, sizey };
+
+    std::vector<int64_t> hist_cur(1 << 4, 0);
+
+    // Set up the benchmark matrices
+    // printf("Creating new tensor q11 & Running quantize\n");
+    struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey);
+    ggml_quantize_q4_0((const float *) m11->data, q11->data, nelements, ne[0], hist_cur.data());
+
+    // Set up the compute graph
+    // printf("Creating new tensor q31\n");
+    struct ggml_tensor * q31 = ggml_mul_mat(ctx, q11, m2);
+
+    // printf("Creating compute graph\n");
+    struct ggml_cgraph gf31 = ggml_build_forward(q31);
+    gf31.n_threads=benchmark_params.n_threads;
+
+    // Set up a second graph computation to make sure we override the CPU cache lines
+    // printf("Creating new tensor q12 & Running quantize\n");
+    struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey);
+    ggml_quantize_q4_0((const float *) m12->data, q12->data, nelements, ne[0], hist_cur.data());
+
+    // printf("Creating new tensor q32\n");
+    struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2);
+
+    //printf("Creating compute graph\n");
+    struct ggml_cgraph gf32 = ggml_build_forward(q32);
+    gf32.n_threads=benchmark_params.n_threads;
+    printf("cgraph->n_threads=%i\n",gf31.n_threads);
+
+    const int dimx = sizex;
+    const int dimy = sizey;
+    const int dimz = sizez;
+    long long int flops_per_dot_product = dimy + dimy;
+    long long int flops_per_matrix = flops_per_dot_product * dimx * dimz;
+    printf("Matrix Multiplication of (%i,%i,%i) x (%i,%i,%i) - about %6.2f gFLOPS\n\n", sizex, sizey, 1, sizex, sizez, 1, 1.0f*flops_per_matrix / 1000 / 1000 / 1000);
+
+
+    // Let's use the F32 result from above as a reference for the q4_0 multiplication
+    float sum_of_F32_reference = tensor_sum_elements(gf.nodes[0]);
+
+
+    printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; FLOPS_per_u_Second\n");
+    printf("==============================================================================================\n");
+
+    for (int i=0;i<benchmark_params.n_iterations ;i++) {
+
+        long long int start = ggml_time_us();
+        //printf("Running ggml_graph_compute\n");
+        ggml_graph_compute(ctx, &gf31);
+        long long int stop = ggml_time_us();
+        long long int usec = stop-start;
+        // throughput is reported as floating-point operations per elapsed microsecond
+        float flops_per_usec = (1.0f*flops_per_matrix)/usec;
+        printf("%9i;%8i;%6i;%6i;%6i;%15lli;%18lli;%19.2f\n",
+            i,
+            gf31.n_threads,
+            sizex, sizey, sizez, flops_per_matrix,
+            usec,flops_per_usec);
+
+#ifdef VERBOSE_DEBUGGING
+        TENSOR_DUMP(gf31.nodes[0]);
+#endif
+
+        // Check that the matrix multiplication result is in the right ballpark
+        // We cannot use the exact value from the F32 multiplication because the quantization will be slightly different
+        float sum_of_Q4_result = tensor_sum_elements(gf31.nodes[0]);
+        float delta = fabsf(sum_of_Q4_result - sum_of_F32_reference);
+        float allowed_delta = (sum_of_F32_reference) / 1000 / 1000; //  Let's accept an epsilon of 10^-6
+
+        if (delta > allowed_delta)  {
+            printf("\nABORT - ERROR in Matrix Multiplication result - expected %6.2f, got %6.2f (delta %6.2f > allowed_delta %6.2f)\n",
+                sum_of_F32_reference,
+                sum_of_Q4_result,
+                delta,
+                allowed_delta
+            );
+            exit(1);
+        }
+
+        // Running a different graph computation to make sure we override the CPU cache lines
+        ggml_graph_compute(ctx, &gf32);
+
+    }
+
+}
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -1,7 +1,5 @@
 #include "common.h"

-#include "ggml.h"
-
 #include <cassert>
 #include <cstring>
 #include <fstream>
@@ -9,19 +7,20 @@
 #include <iterator>
 #include <algorithm>

-#if defined(_MSC_VER) || defined(__MINGW32__)
-#include <malloc.h> // using malloc.h with MSC/MINGW
-#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
-#include <alloca.h>
-#endif
-
 #if defined (_WIN32)
+#include <fcntl.h>
+#include <io.h>
 #pragma comment(lib,"kernel32.lib")
 extern "C" __declspec(dllimport) void* __stdcall GetStdHandle(unsigned long nStdHandle);
 extern "C" __declspec(dllimport) int __stdcall GetConsoleMode(void* hConsoleHandle, unsigned long* lpMode);
 extern "C" __declspec(dllimport) int __stdcall SetConsoleMode(void* hConsoleHandle, unsigned long dwMode);
 extern "C" __declspec(dllimport) int __stdcall SetConsoleCP(unsigned int wCodePageID);
 extern "C" __declspec(dllimport) int __stdcall SetConsoleOutputCP(unsigned int wCodePageID);
+extern "C" __declspec(dllimport) int __stdcall WideCharToMultiByte(unsigned int CodePage, unsigned long dwFlags,
+                                                                   const wchar_t * lpWideCharStr, int cchWideChar,
+                                                                   char * lpMultiByteStr, int cbMultiByte,
+                                                                   const char * lpDefaultChar, bool * lpUsedDefaultChar);
+#define CP_UTF8 65001
 #endif

 bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
@@ -140,20 +139,33 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                break;
            }
            params.model = argv[i];
+        } else if (arg == "--lora") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.lora_adapter = argv[i];
+            params.use_mmap = false;
+        } else if (arg == "--lora-base") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.lora_base = argv[i];
        } else if (arg == "-i" || arg == "--interactive") {
            params.interactive = true;
        } else if (arg == "--embedding") {
            params.embedding = true;
-        } else if (arg == "--interactive-start") {
-            params.interactive = true;
        } else if (arg == "--interactive-first") {
-            params.interactive_start = true;
+            params.interactive_first = true;
        } else if (arg == "-ins" || arg == "--instruct") {
            params.instruct = true;
        } else if (arg == "--color") {
            params.use_color = true;
        } else if (arg == "--mlock") {
            params.use_mlock = true;
+        } else if (arg == "--no-mmap") {
+            params.use_mmap = false;
        } else if (arg == "--mtest") {
            params.mem_test = true;
        } else if (arg == "--verbose-prompt") {
@@ -233,11 +245,16 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    fprintf(stderr, "  -b N, --batch_size N  batch size for prompt processing (default: %d)\n", params.n_batch);
    fprintf(stderr, "  --perplexity          compute perplexity over the prompt\n");
    fprintf(stderr, "  --keep                number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
-    if (ggml_mlock_supported()) {
+    if (llama_mlock_supported()) {
        fprintf(stderr, "  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
    }
+    if (llama_mmap_supported()) {
+        fprintf(stderr, "  --no-mmap             do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
+    }
    fprintf(stderr, "  --mtest               compute maximum memory usage\n");
    fprintf(stderr, "  --verbose-prompt      print prompt before generation\n");
+    fprintf(stderr, "  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");
+    fprintf(stderr, "  --lora-base FNAME     optional model to use as a base for the layers modified by the LoRA adapter\n");
    fprintf(stderr, "  -m FNAME, --model FNAME\n");
    fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
    fprintf(stderr, "\n");
@@ -307,12 +324,20 @@ void win32_console_init(bool enable_color) {
            SetConsoleMode(hConOut, dwMode | 0x4); // ENABLE_VIRTUAL_TERMINAL_PROCESSING (0x4)
        }
        // Set console output codepage to UTF8
-        SetConsoleOutputCP(65001); // CP_UTF8
+        SetConsoleOutputCP(CP_UTF8);
    }
    void* hConIn = GetStdHandle((unsigned long)-10); // STD_INPUT_HANDLE (-10)
    if (hConIn && hConIn != (void*)-1 && GetConsoleMode(hConIn, &dwMode)) {
-        // Set console input codepage to UTF8
-        SetConsoleCP(65001); // CP_UTF8
+        // Set console input codepage to UTF16
+        _setmode(_fileno(stdin), _O_WTEXT);
    }
 }
+
+// Convert a wide (UTF-16) string to UTF-8, overwriting str; str ends up empty when wstr is empty or its size query fails
+void win32_utf8_encode(const std::wstring & wstr, std::string & str) {
+    const int n_wide  = (int)wstr.size(); // WideCharToMultiByte rejects a length of 0, so empty input skips the API entirely
+    const int n_bytes = n_wide > 0 ? WideCharToMultiByte(CP_UTF8, 0, wstr.data(), n_wide, NULL, 0, NULL, NULL) : 0; // NULL buffer: size query only
+    str.assign(n_bytes > 0 ? (size_t)n_bytes : 0, '\0'); // size the output once and convert straight into it (no temporary copy)
+    if (n_bytes > 0) { WideCharToMultiByte(CP_UTF8, 0, wstr.data(), n_wide, &str[0], n_bytes, NULL, NULL); }
+}
 #endif
--- a/examples/common.h
+++ b/examples/common.h
@@ -20,7 +20,7 @@ struct gpt_params {
    int32_t repeat_last_n = 64;   // last n tokens to penalize
    int32_t n_parts       = -1;   // amount of model parts (-1 = determine from model dimensions)
    int32_t n_ctx         = 512;  // context size
-    int32_t n_batch       = 8;    // batch size for prompt processing
+    int32_t n_batch       = 512;  // batch size for prompt processing (must be >=32 to use BLAS)
    int32_t n_keep        = 0;    // number of tokens to keep from initial prompt

    // sampling parameters
@@ -31,22 +31,24 @@ struct gpt_params {

    std::string model  = "models/lamma-7B/ggml-model.bin"; // model path
    std::string prompt = "";
-    std::string input_prefix = ""; // string to prefix user inputs with
-
-
+    std::string input_prefix = "";       // string to prefix user inputs with
    std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted

+    std::string lora_adapter = "";  // lora adapter path
+    std::string lora_base = "";     // base model path for the lora adapter
+
    bool memory_f16        = true;  // use f16 instead of f32 for memory kv
    bool random_prompt     = false; // do not randomize prompt if none provided
    bool use_color         = false; // use color to distinguish generations and inputs
    bool interactive       = false; // interactive mode

    bool embedding         = false; // get only sentence embedding
-    bool interactive_start = false; // wait for user input immediately
+    bool interactive_first = false; // wait for user input immediately

    bool instruct          = false; // instruction mode (used for Alpaca models)
    bool ignore_eos        = false; // do not stop generating after eos
    bool perplexity        = false; // compute perplexity over the prompt
+    bool use_mmap          = true;  // use mmap for faster loads
    bool use_mlock         = false; // use mlock to keep model in memory
    bool mem_test          = false; // compute maximum memory usage
    bool verbose_prompt    = false; // print prompt tokens before generation
@@ -92,4 +94,5 @@ void set_console_color(console_state & con_st, console_color_t color);

 #if defined (_WIN32)
 void win32_console_init(bool enable_color);
+void win32_utf8_encode(const std::wstring & wstr, std::string & str);
 #endif
--- a/examples/embedding/README.md
+++ b/examples/embedding/README.md
@@ -1,3 +1,3 @@
-# embedding
-
-TODO
+# embedding
+
+TODO
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -1,6 +1,8 @@
 #include "common.h"
 #include "llama.h"

+#include <ctime>
+
 int main(int argc, char ** argv) {
    gpt_params params;
    params.model = "models/llama-7B/ggml-model.bin";
@@ -38,6 +40,7 @@ int main(int argc, char ** argv) {
        lparams.seed       = params.seed;
        lparams.f16_kv     = params.memory_f16;
        lparams.logits_all = params.perplexity;
+        lparams.use_mmap   = params.use_mmap;
        lparams.use_mlock  = params.use_mlock;
        lparams.embedding  = params.embedding;

--- a/examples/gpt4all.sh
+++ b/examples/gpt4all.sh
@@ -10,6 +10,6 @@ cd ..
 ./main --color --instruct --threads 4 \
       --model ./models/gpt4all-7B/gpt4all-lora-quantized.bin \
       --file ./prompts/alpaca.txt \
-       --batch_size 8 --ctx_size 2048 \
+       --batch_size 8 --ctx_size 2048 -n -1 \
       --repeat_last_n 64 --repeat_penalty 1.3 \
       --n_predict 128 --temp 0.1 --top_k 40 --top_p 0.95
--- a/examples/main/README.md
+++ b/examples/main/README.md
@@ -1,3 +1,191 @@
-# main
-
-TODO
+# llama.cpp/example/main
+
+This example program allows you to use various LLaMA language models in an easy and efficient way. It is specifically designed to work with the [llama.cpp](https://github.com/ggerganov/llama.cpp) project, which provides a plain C/C++ implementation with optional 4-bit quantization support for faster, lower memory inference, and is optimized for desktop CPUs. This program can be used to perform various inference tasks with LLaMA models, including generating text based on user-provided prompts and chat-like interactions with reverse prompts.
+
+## Table of Contents
+
+1. [Quick Start](#quick-start)
+2. [Common Options](#common-options)
+3. [Input Prompts](#input-prompts)
+4. [Interaction](#interaction)
+5. [Context Management](#context-management)
+6. [Generation Flags](#generation-flags)
+7. [Performance Tuning and Memory Options](#performance-tuning-and-memory-options)
+8. [Additional Options](#additional-options)
+
+## Quick Start
+
+To get started right away, run the following command, making sure to use the correct path for the model you have:
+
+```bash
+./main -m models/7B/ggml-model.bin --prompt "Once upon a time"
+```
+
+The following command generates "infinite" text from a starting prompt (you can use `Ctrl-C` to stop it):
+
+```bash
+./main -m models/7B/ggml-model.bin --ignore-eos --n_predict -1 --keep -1 --prompt "Once upon a time"
+```
+
+For an interactive experience, try this command:
+
+```bash
+./main -m models/7B/ggml-model.bin -n -1 --color -r "User:" --in-prefix " " --prompt $'User: Hi\nAI: Hello. I am an AI chatbot. Would you like to talk?\nUser: Sure!\nAI: What would you like to talk about?\nUser:'
+```
+
+Note that the newline characters in the prompt string above only work on Linux. On Windows, you will have to use the ``--file`` option (see below) to load a multi-line prompt from file instead.
+
+## Common Options
+
+In this section, we cover the most commonly used options for running the `main` program with the LLaMA models:
+
+-   `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
+-   `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
+-   `-ins, --instruct`: Run the program in instruction mode, which is particularly useful when working with Alpaca models.
+-   `-t N, --threads N`: Set the number of threads to use during computation. It is recommended to set this to the number of physical cores your CPU has.
+-   `-n N, --n_predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
+-   `-c N, --ctx_size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
+
+## Input Prompts
+
+The `main` program provides several ways to interact with the LLaMA models using input prompts:
+
+-   `--prompt PROMPT`: Provide a prompt directly as a command-line option.
+-   `--file FNAME`: Provide a file containing a prompt or multiple prompts.
+-   `--interactive-first`: Run the program in interactive mode and wait for input right away. (More on this below.)
+-   `--random-prompt`: Start with a randomized prompt.
+
+## Interaction
+
+The `main` program offers a seamless way to interact with LLaMA models, allowing users to engage in real-time conversations or provide instructions for specific tasks. The interactive mode can be triggered using various options, including `--interactive`, `--interactive-first`, and `--instruct`.
+
+In interactive mode, users can participate in text generation by injecting their input during the process. Users can press `Ctrl+C` at any time to interject and type their input, followed by pressing `Return` to submit it to the LLaMA model. To submit additional lines without finalizing input, users can end the current line with a backslash (`\`) and continue typing.
+
+### Interaction Options
+
+-   `-i, --interactive`: Run the program in interactive mode, allowing users to engage in real-time conversations or provide specific instructions to the model.
+-   `--interactive-first`: Run the program in interactive mode and immediately wait for user input before starting the text generation.
+-   `-ins, --instruct`: Run the program in instruction mode, which is specifically designed to work with Alpaca models that excel in completing tasks based on user instructions.
+-   `--color`: Enable colorized output to visually distinguish between prompts, user input, and generated text.
+
+By understanding and utilizing these interaction options, you can create engaging and dynamic experiences with the LLaMA models, tailoring the text generation process to your specific needs.
+
+### Reverse Prompts
+
+Reverse prompts are a powerful way to create a chat-like experience with a LLaMA model by pausing the text generation when specific text strings are encountered:
+
+-   `-r PROMPT, --reverse-prompt PROMPT`: Specify one or multiple reverse prompts to pause text generation and switch to interactive mode. For example, `-r "User:"` can be used to jump back into the conversation whenever it's the user's turn to speak. This helps create a more interactive and conversational experience. However, the reverse prompt doesn't work when it ends with a space.
+
+To overcome this limitation, you can use the `--in-prefix` flag to add a space or any other characters after the reverse prompt.
+
+### In-Prefix
+
+The `--in-prefix` flag is used to add a prefix to your input; primarily, this is used to insert a space after the reverse prompt. Here's an example of how to use the `--in-prefix` flag in conjunction with the `--reverse-prompt` flag:
+
+```sh
+./main -r "User:" --in-prefix " "
+```
+
+### Instruction Mode
+
+Instruction mode is particularly useful when working with Alpaca models, which are designed to follow user instructions for specific tasks:
+
+-   `-ins, --instruct`: Enable instruction mode to leverage the capabilities of Alpaca models in completing tasks based on user-provided instructions.
+
+Technical detail: the user's input is internally prefixed with the reverse prompt (or ``### Instruction:`` as the default), and followed by ``### Response:`` (except if you just press Return without any input, to keep generating a longer response).
+
+By understanding and utilizing these interaction options, you can create engaging and dynamic experiences with the LLaMA models, tailoring the text generation process to your specific needs.
+
+## Context Management
+
+During text generation, LLaMA models have a limited context size, which means they can only consider a certain number of tokens from the input and generated text. When the context fills up, the model resets internally, potentially losing some information from the beginning of the conversation or instructions. Context management options help maintain continuity and coherence in these situations.
+
+### Context Size
+
+The `--ctx_size` option allows you to set the size of the prompt context used by the LLaMA models during text generation. A larger context size helps the model to better comprehend and generate responses for longer input or conversations.
+
+-   `-c N, --ctx_size N`: Set the size of the prompt context (default: 512). The LLaMA models were built with a context of 2048, which will yield the best results on longer input/inference. However, increasing the context size beyond 2048 may lead to unpredictable results.
+
+### Keep Prompt
+
+The `--keep` option allows users to retain the original prompt when the model runs out of context, ensuring a connection to the initial instruction or conversation topic is maintained.
+
+-   `--keep N`: Specify the number of tokens from the initial prompt to retain when the model resets its internal context. By default, this value is set to 0 (meaning no tokens are kept). Use `-1` to retain all tokens from the initial prompt.
+
+By utilizing context management options like `--ctx_size` and `--keep`, you can maintain a more coherent and consistent interaction with the LLaMA models, ensuring that the generated text remains relevant to the original prompt or conversation.
+
+## Generation Flags
+
+The following options are related to controlling the text generation process, influencing the diversity, creativity, and quality of the generated text. Understanding these options will help you fine-tune the output according to your needs:
+
+### Number of Tokens to Predict
+
+-   `-n N, --n_predict N`: Set the number of tokens to predict when generating text (default: 128, -1 = infinity).
+
+The `--n_predict` option controls the number of tokens the model generates in response to the input prompt. By adjusting this value, you can influence the length of the generated text. A higher value will result in longer text, while a lower value will produce shorter text. A value of -1 will cause text to be generated without limit.
+
+It is important to note that the generated text may be shorter than the specified number of tokens if an End-of-Sequence (EOS) token or a reverse prompt is encountered. In interactive mode text generation will pause and control will be returned to the user. In non-interactive mode, the program will end. In both cases, the text generation may stop before reaching the specified `n_predict` value. If you want the model to keep going without ever producing End-of-Sequence on its own, you can use the ``--ignore-eos`` parameter.
+
+### RNG Seed
+
+-   `-s SEED, --seed SEED`: Set the random number generator (RNG) seed (default: -1).
+
+The RNG seed is used to initialize the random number generator that influences the text generation process. By setting a specific seed value, you can obtain consistent and reproducible results across multiple runs with the same input and settings. This can be helpful for testing, debugging, or comparing the effects of different options on the generated text to see when they diverge. If the seed is set to a value less than or equal to 0, a random seed will be used, which will result in different outputs on each run.
+
+### Temperature
+
+-   `--temp N`: Adjust the randomness of the generated text (default: 0.8).
+
+Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run.
+
+Example usage: `--temp 0.8`
+
+### Repeat Penalty
+
+-   `--repeat_penalty N`: Control the repetition of token sequences in the generated text (default: 1.1).
+
+Repeat penalty is a hyperparameter used to penalize the repetition of token sequences during text generation. It helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. The default value is 1.1.
+
+Example usage: `--repeat_penalty 1.1`
+
+### Top-K Sampling
+
+-   `--top_k N`: Limit the next token selection to the K most probable tokens (default: 40).
+
+Top-k sampling is a text generation method that selects the next token only from the top k most likely tokens predicted by the model. It helps reduce the risk of generating low-probability or nonsensical tokens, but it may also limit the diversity of the output. A higher value for top_k (e.g., 100) will consider more tokens and lead to more diverse text, while a lower value (e.g., 10) will focus on the most probable tokens and generate more conservative text. The default value is 40.
+
+Example usage: `--top_k 40`
+
+### Top-P Sampling
+
+-   `--top_p N`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.9).
+
+Top-p sampling, also known as nucleus sampling, is another text generation method that selects the next token from a subset of tokens that together have a cumulative probability of at least p. This method provides a balance between diversity and quality by considering both the probabilities of tokens and the number of tokens to sample from. A higher value for top_p (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. The default value is 0.9.
+
+Example usage: `--top_p 0.9`
+
+By adjusting these options, you can control the diversity, quality, and creativity of the generated text to better suit your needs. You can experiment with different combinations of values to find the best settings for your specific use case.
+
+## Performance Tuning and Memory Options
+
+These options help improve the performance and memory usage of the LLaMA models:
+
+-   `-t N, --threads N`: Set the number of threads to use during computation. Using the correct number of threads can greatly improve performance. It is recommended to set this value to the number of CPU cores.
+-   `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped. This can improve performance.
+-   `--no-mmap`: Do not memory-map the model. This results in a slower load time but may reduce pageouts if you're not using `mlock`.
+-   `--memory_f32`: Use 32 bit floats instead of 16 bit floats for memory key+value, allowing higher quality inference at the cost of memory.
+-   `-b N, --batch_size N`: Set the batch size for prompt processing (default: 512). This large batch size benefits users who have BLAS installed and enabled it during the build. If you don't have BLAS enabled ("BLAS=0"), you can use a smaller number, such as 8, to see the prompt progress as it's evaluated in some situations.
+
+For information about 4-bit quantization, which can significantly improve performance and reduce memory usage, please refer to llama.cpp's primary [README](../../README.md#prepare-data--run).
+
+By understanding and using these performance tuning settings, you can optimize the LLaMA model's behavior to achieve the best performance for your specific needs.
+
+## Additional Options
+
+These options provide extra functionality and customization when running the LLaMA models:
+
+-   `-h, --help`: Display a help message showing all available options and their default values. This is particularly useful for checking the latest options and default values, as they can change frequently, and the information in this document may become outdated.
+-   `--verbose-prompt`: Print the prompt before generating text.
+-   `--mtest`: Run a memory test that computes the maximum memory usage.
+-   `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
+-   `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -1,3 +1,8 @@
+// Defines sigaction on msys:
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
 #include "common.h"
 #include "llama.h"

@@ -6,6 +11,7 @@
 #include <cmath>
 #include <cstdio>
 #include <cstring>
+#include <ctime>
 #include <fstream>
 #include <iostream>
 #include <string>
@@ -19,6 +25,7 @@
 #endif

 static console_state con_st;
+static llama_context ** g_ctx;

 static bool is_interacting = false;

@@ -30,6 +37,7 @@ void sigint_handler(int signo) {
        if (!is_interacting) {
            is_interacting=true;
        } else {
+            llama_print_timings(*g_ctx);
            _exit(130);
        }
    }
@@ -88,6 +96,7 @@ int main(int argc, char ** argv) {
 //bool is_prime(int n) {)";

    llama_context * ctx;
+    g_ctx = &ctx;

    // load the model
    {
@@ -97,6 +106,7 @@ int main(int argc, char ** argv) {
        lparams.n_parts    = params.n_parts;
        lparams.seed       = params.seed;
        lparams.f16_kv     = params.memory_f16;
+        lparams.use_mmap   = params.use_mmap;
        lparams.use_mlock  = params.use_mlock;

        ctx = llama_init_from_file(params.model.c_str(), lparams);
@@ -107,6 +117,17 @@ int main(int argc, char ** argv) {
        }
    }

+    if (!params.lora_adapter.empty()) {
+        int err = llama_apply_lora_from_file(ctx,
+                                             params.lora_adapter.c_str(),
+                                             params.lora_base.empty() ? NULL : params.lora_base.c_str(),
+                                             params.n_threads);
+        if (err != 0) {
+            fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
+            return 1;
+        }
+    }
+
    // print system information
    {
        fprintf(stderr, "\n");
@@ -157,12 +178,12 @@ int main(int argc, char ** argv) {

    // in instruct mode, we inject a prefix and a suffix to each input by the user
    if (params.instruct) {
-        params.interactive_start = true;
+        params.interactive_first = true;
        params.antiprompt.push_back("### Instruction:\n\n");
    }

    // enable interactive mode if reverse prompt or interactive start is specified
-    if (params.antiprompt.size() != 0 || params.interactive_start) { 
+    if (params.antiprompt.size() != 0 || params.interactive_first) {
        params.interactive = true;
    }

@@ -225,7 +246,7 @@ int main(int argc, char ** argv) {
 #endif
               " - Press Return to return control to LLaMa.\n"
               " - If you want to submit another line, end your input in '\\'.\n\n");
-        is_interacting = params.interactive_start;
+        is_interacting = params.interactive_first;
    }

    bool is_antiprompt = false;
@@ -246,7 +267,7 @@ int main(int argc, char ** argv) {
            // infinite text generation via context swapping
            // if we run out of context:
            // - take the n_keep first tokens from the original prompt (via n_past)
-            // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in a batch
+            // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
            if (n_past + (int) embd.size() > n_ctx) {
                const int n_left = n_past - params.n_keep;

@@ -264,13 +285,21 @@ int main(int argc, char ** argv) {
                //printf("\n---\n");
            }

-            if (llama_eval(ctx, embd.data(), embd.size(), n_past, params.n_threads)) {
-                fprintf(stderr, "%s : failed to eval\n", __func__);
-                return 1;
+            // evaluate tokens in batches
+            // embd is typically prepared beforehand to fit within a batch, but not always
+            for (int i = 0; i < (int) embd.size(); i += params.n_batch) {
+                int n_eval = (int) embd.size() - i;
+                if (n_eval > params.n_batch) {
+                    n_eval = params.n_batch;
+                }
+                if (llama_eval(ctx, &embd[i], n_eval, n_past, params.n_threads)) {
+                    fprintf(stderr, "%s : failed to eval\n", __func__);
+                    return 1;
+                }
+                n_past += n_eval;
            }
        }

-        n_past += embd.size();
        embd.clear();

        if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
@@ -386,10 +415,19 @@ int main(int argc, char ** argv) {
                std::string line;
                bool another_line = true;
                do {
+#if defined(_WIN32)
+                    std::wstring wline;
+                    if (!std::getline(std::wcin, wline)) {
+                        // input stream is bad or EOF received
+                        return 0;
+                    }
+                    win32_utf8_encode(wline, line);
+#else
                    if (!std::getline(std::cin, line)) {
                        // input stream is bad or EOF received
                        return 0;
                    }
+#endif
                    if (line.empty() || line.back() != '\\') {
                        another_line = false;
                    } else {
--- a/examples/perplexity/README.md
+++ b/examples/perplexity/README.md
@@ -1,3 +1,3 @@
-# perplexity
-
-TODO
+# perplexity
+
+TODO
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -2,6 +2,7 @@
 #include "llama.h"

 #include <cmath>
+#include <ctime>

 std::vector<float> softmax(const std::vector<float>& logits) {
    std::vector<float> probs(logits.size());
@@ -27,25 +28,38 @@ void perplexity(llama_context * ctx, const gpt_params & params) {

    int count = 0;
    int seq_count = tokens.size() / params.n_ctx;
+    int n_vocab = llama_n_vocab(ctx);

    double nll = 0.0;
-
-    fprintf(stderr, "%s : calculating perplexity over %d chunks\n", __func__, seq_count);
+    fprintf(stderr, "%s : calculating perplexity over %d chunks, batch_size=%d\n", __func__, seq_count, params.n_batch);

    for (int i = 0; i < seq_count; ++i) {
        int start = i * params.n_ctx;
-        int end = start + params.n_ctx - 1; // TODO: this is not optimal, e.g. it makes the batch 511 instead of 512
-                                            //       it is better to always be power of 2 for better performance
-        std::vector<llama_token> embd(tokens.begin() + start, tokens.begin() + end);
+        int end = start + params.n_ctx;
+
+        std::vector<float> logits;
+        int num_batches = (params.n_ctx + params.n_batch - 1) / params.n_batch;
        auto start_t = std::chrono::high_resolution_clock::now();
-        if (llama_eval(ctx, embd.data(), embd.size(), 0, params.n_threads)) {
-            fprintf(stderr, "%s : failed to eval\n", __func__);
-            return;
+        for (int j = 0; j < num_batches; ++j) {
+            int batch_start = start + j * params.n_batch;
+            int batch_size = std::min(end - batch_start, params.n_batch);
+            if (llama_eval(ctx, tokens.data() + batch_start, batch_size, j * params.n_batch, params.n_threads)) {
+                fprintf(stderr, "%s : failed to eval\n", __func__);
+                return;
+            }
+            auto batch_logits = llama_get_logits(ctx);
+            logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
        }
        auto end_t = std::chrono::high_resolution_clock::now();
        if (i == 0) {
            const float seconds = std::chrono::duration<float>(end_t - start_t).count();
-            printf("%.2f seconds per pass - ETA %.2f hours\n", seconds, (seconds * seq_count) / (60.0*60.0));
+            printf("%.2f seconds per pass - ETA ", seconds);
+            int total_seconds = (int)(seconds * seq_count);
+            if (total_seconds >= 60*60) {
+                printf("%d hours ", total_seconds / (60*60));
+                total_seconds = total_seconds % (60*60);
+            }
+            printf("%d minutes\n", total_seconds / 60);
        }
        // We get the logits for all the tokens in the context window (params.n_ctx)
        // from llama_eval above.  Now, based on https://huggingface.co/docs/transformers/perplexity,
@@ -59,15 +73,12 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
        // Example, we have a context window of 512, we will compute perplexity for each of the
        // last 256 tokens.  Then, we split the input up into context window size chunks to
        // process the entire prompt.
-
-        auto logits = llama_get_logits(ctx);
-        for (int j = params.n_ctx / 2; j < params.n_ctx - 1; ++j) {
+        for (int j = std::min(512, params.n_ctx / 2); j < params.n_ctx - 1; ++j) {
            // Calculate probability of next token, given the previous ones.
-            int n_vocab = llama_n_vocab(ctx);
            std::vector<float> tok_logits(
-                logits + j * n_vocab,
-                logits + (j + 1) * n_vocab);
-            const float prob = softmax(tok_logits)[tokens[start + j + 1]];
+                logits.begin() + j * n_vocab,
+                logits.begin() + (j + 1) * n_vocab);
+            float prob = softmax(tok_logits)[tokens[start + j + 1]];
            nll += -std::log(prob);
            ++count;
        }
@@ -82,11 +93,13 @@ int main(int argc, char ** argv) {
    gpt_params params;
    params.model = "models/llama-7B/ggml-model.bin";

+    params.n_batch = 512;
    if (gpt_params_parse(argc, argv, params) == false) {
        return 1;
    }

    params.perplexity = true;
+    params.n_batch = std::min(params.n_batch, params.n_ctx);

    if (params.n_ctx > 2048) {
        fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
@@ -115,6 +128,7 @@ int main(int argc, char ** argv) {
        lparams.seed       = params.seed;
        lparams.f16_kv     = params.memory_f16;
        lparams.logits_all = params.perplexity;
+        lparams.use_mmap   = params.use_mmap;
        lparams.use_mlock  = params.use_mlock;
        lparams.embedding  = params.embedding;

@@ -126,6 +140,17 @@ int main(int argc, char ** argv) {
        }
    }

+    if (!params.lora_adapter.empty()) {
+        int err = llama_apply_lora_from_file(ctx,
+                                             params.lora_adapter.c_str(),
+                                             params.lora_base.empty() ? NULL : params.lora_base.c_str(),
+                                             params.n_threads);
+        if (err != 0) {
+            fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
+            return 1;
+        }
+    }
+
    // print system information
    {
        fprintf(stderr, "\n");
--- a/examples/quantize-stats/CMakeLists.txt
+++ b/examples/quantize-stats/CMakeLists.txt
@@ -0,0 +1,4 @@
+set(TARGET quantize-stats)
+add_executable(${TARGET} quantize-stats.cpp)
+target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- a/examples/quantize-stats/quantize-stats.cpp
+++ b/examples/quantize-stats/quantize-stats.cpp
@@ -0,0 +1,420 @@
+#include "ggml.h"
+
+#define LLAMA_API_INTERNAL
+#include "llama.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cinttypes>
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <map>
+#include <numeric>
+#include <regex>
+#include <string>
+#include <unordered_map>
+#include <vector>
+#include <thread>
+#include <mutex>
+
+// Command-line options of the quantize-stats tool; the member initializers are the defaults.
+struct quantize_stats_params {
+    std::string model = "models/7B/ggml-model-f16.bin"; // path of the model file to load
+    bool verbose = false;         // print progress for every tensor / type
+    bool per_layer_stats = false; // print error stats for each tensor, not only per type
+    bool print_histogram = false; // print the full error histogram with the per-type summary
+    bool reference = false;       // use quantize_row_q_reference instead of quantize_row_q
+    std::vector<std::string> include_layers;   // regex patterns; if non-empty only matching tensors are tested
+    std::vector<std::string> exclude_layers;   // regex patterns; matching tensors are always skipped
+    std::vector<enum ggml_type> include_types; // if non-empty only these types are tested
+};
+
+// Absolute errors are binned into HISTOGRAM_BUCKETS equal-width buckets covering
+// [0, HISTOGRAM_RANGE); the last bucket also collects every error beyond that range.
+const size_t HISTOGRAM_BUCKETS = 150;
+const double HISTOGRAM_RANGE = 0.03;
+
+// Accumulated round-trip (quantize -> dequantize) error statistics.
+struct error_stats {
+    size_t num_samples;   // number of elements compared so far
+    double total_error;   // sum of squared differences
+    double max_error;     // largest absolute difference seen
+    uint64_t error_histogram[HISTOGRAM_BUCKETS]; // number of absolute differences per bucket
+};
+
+
+// Print the command-line help to stderr.
+//   argv[0] is used as the program name; argc is unused.
+// Defaults shown in the help come from a default-constructed quantize_stats_params.
+void quantize_stats_print_usage(int /*argc*/, char ** argv) {
+    const quantize_stats_params defaults;
+    fprintf(stderr, "usage: %s [options]\n", argv[0]);
+    fprintf(stderr, "\n");
+    fprintf(stderr, "options:\n");
+    fprintf(stderr, "  -h, --help            show this help message and exit\n");
+    fprintf(stderr, "  -m FNAME, --model FNAME\n");
+    fprintf(stderr, "                        model path (default: %s)\n", defaults.model.c_str());
+    fprintf(stderr, "  -r, --reference\n");
+    fprintf(stderr, "                        use reference implementation (default: false)\n");
+    fprintf(stderr, "  -v, --verbose\n");
+    fprintf(stderr, "                        verbose output (default: false)\n");
+    fprintf(stderr, "  -p, --per-layer-stats\n");
+    fprintf(stderr, "                        print stats per layer (default: false)\n");
+    fprintf(stderr, "  --histogram\n");
+    fprintf(stderr, "                        print error histogram (default: false)\n");
+    fprintf(stderr, "  -l LAYER, --include-layer LAYER\n");
+    fprintf(stderr, "                        only test layers matching pattern\n");
+    fprintf(stderr, "  -L LAYER, --exclude-layer LAYER\n");
+    fprintf(stderr, "                        exclude layers matching pattern\n");
+    fprintf(stderr, "  -t TYPE, --type TYPE\n");
+    fprintf(stderr, "                        only test given type (q4_0, q4_1)\n");
+    // -n is parsed by main() but was missing from the help text
+    fprintf(stderr, "  -n N, --num-threads N\n");
+    fprintf(stderr, "                        number of threads to use (default: 0 = number of hardware threads)\n");
+    fprintf(stderr, "\n");
+}
+
+// Check if a layer is included/excluded by command line.
+//   params - parsed options; exclude_layers / include_layers hold regex patterns
+//   layer  - tensor name to test
+// Returns false when any exclude pattern matches. Otherwise returns true when an
+// include pattern matches or when no include patterns were given at all.
+// Taking params by const reference avoids copying the pattern vectors on every call
+// (this function is called for every tensor, once per tested type).
+bool layer_included(const quantize_stats_params & params, const std::string & layer) {
+    const auto matches_any = [&layer](const std::vector<std::string> & patterns) {
+        for (const auto & pattern : patterns) {
+            if (std::regex_search(layer, std::regex(pattern))) {
+                return true;
+            }
+        }
+        return false;
+    };
+
+    // exclusion takes precedence over inclusion
+    if (matches_any(params.exclude_layers)) {
+        return false;
+    }
+    return params.include_layers.empty() || matches_any(params.include_layers);
+}
+
+// Update error statistics given vectors with the before/after result of quantization
+//   nelements - number of floats in both input and output
+//   input     - original values
+//   output    - values after the quantize -> dequantize round trip
+//   stats     - accumulator: squared error sum, max abs error, histogram and sample count
+void update_error_stats(int64_t nelements, const float * input, const float * output, error_stats & stats) {
+    const double last_bucket = (double) (HISTOGRAM_BUCKETS - 1);
+    for (int64_t i = 0; i < nelements; i++) {
+        double diff = input[i] - output[i];
+        stats.total_error += diff * diff;
+        stats.max_error = fmax(fabs(diff), stats.max_error);
+        // Clamp in floating point *before* converting to an index: casting a NaN or a
+        // value beyond the range of size_t to an integer is undefined behavior.
+        // Anything that does not compare below the last bucket (including NaN) lands in it.
+        const double scaled = floor(fabs(diff) / HISTOGRAM_RANGE * HISTOGRAM_BUCKETS);
+        const size_t bucket = scaled < last_bucket ? (size_t) scaled : HISTOGRAM_BUCKETS - 1;
+        stats.error_histogram[bucket]++;
+    }
+    stats.num_samples += nelements;
+}
+
+// Merge the statistics accumulated in `from` into `into`:
+// counts and sums are added, the maximum error is the larger of the two.
+void combine_error_stats(error_stats & into, const error_stats & from) {
+    for (size_t bucket = 0; bucket < HISTOGRAM_BUCKETS; ++bucket) {
+        into.error_histogram[bucket] += from.error_histogram[bucket];
+    }
+    if (into.max_error < from.max_error) {
+        into.max_error = from.max_error;
+    }
+    into.total_error += from.total_error;
+    into.num_samples += from.num_samples;
+}
+
+// Estimate a quantile of the absolute error from the histogram.
+// Returns the upper edge of the first bucket at which the cumulative count reaches
+// `quantile` times the total count, or INFINITY if that never happens.
+double find_quantile(const error_stats & stats, double quantile) {
+    double total = 0.0;
+    for (const uint64_t count : stats.error_histogram) {
+        total += count;
+    }
+    const double target = total*quantile;
+
+    double running = 0;
+    size_t bucket = 0;
+    while (bucket < HISTOGRAM_BUCKETS) {
+        running += stats.error_histogram[bucket];
+        ++bucket; // from here on `bucket` is the 1-based index, i.e. the upper edge of the bucket just added
+        if (running >= target) {
+            return bucket * HISTOGRAM_RANGE / HISTOGRAM_BUCKETS;
+        }
+    }
+    return INFINITY;
+}
+
+// Print a one-line summary (rmse, max error, 95th percentile and median bounds) for `stats`
+// labelled with `name`; when print_histogram is set, also print the count of every bucket.
+void print_error_stats(const std::string & name, const error_stats & stats, bool print_histogram) {
+    const double rmse   = sqrt(stats.total_error / (double) stats.num_samples);
+    const double median = find_quantile(stats, .5);
+    const double pct95  = find_quantile(stats, .95);
+    printf("%-50s: rmse %.8f, maxerr %.8f, 95pct<%.4f, median<%.4f\n", name.c_str(), rmse, stats.max_error, pct95, median);
+
+    if (!print_histogram) {
+        return;
+    }
+
+    printf("Error distribution:\n");
+    for (size_t bucket = 0; bucket < HISTOGRAM_BUCKETS; bucket++) {
+        const bool is_last = bucket == HISTOGRAM_BUCKETS - 1;
+        const double lower = bucket * HISTOGRAM_RANGE / HISTOGRAM_BUCKETS;
+        // the last bucket is open-ended: it also holds every error beyond HISTOGRAM_RANGE
+        const double upper = is_last ? INFINITY : (bucket+1) * HISTOGRAM_RANGE / HISTOGRAM_BUCKETS;
+        printf("[%3.4f, %3.4f): %11" PRIu64 "\n", lower, upper, stats.error_histogram[bucket]);
+    }
+}
+
+// copied from ggml.h - verify that we can access this as a flat array
+// Returns true when every byte stride nb[] matches a densely packed layout of ne[].
+static bool tensor_is_contiguous(const struct ggml_tensor * tensor) {
+    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+
+    // stride of dimension 0 must equal ggml_type_size of the tensor type
+    if (tensor->nb[0] != ggml_type_size(tensor->type)) {
+        return false;
+    }
+    // one row: ne[0] entries, scaled down by the block size of the type
+    if (tensor->nb[1] != (tensor->nb[0]*tensor->ne[0])/ggml_blck_size(tensor->type)) {
+        return false;
+    }
+    // each higher dimension must step over exactly the full extent of the one below it
+    for (int dim = 2; dim < GGML_MAX_DIMS; ++dim) {
+        if (tensor->nb[dim] != tensor->nb[dim - 1]*tensor->ne[dim - 1]) {
+            return false;
+        }
+    }
+    return true;
+}
+
+// Quantize and dequantize one chunk of a layer and fold the round-trip error into `stats`.
+//   layer             - source tensor
+//   offset            - index of the first element of the chunk within the layer
+//   chunk_size        - number of elements in the chunk
+//   qfns              - quantize / dequantize functions of the type under test
+//   use_reference     - call quantize_row_q_reference instead of quantize_row_q
+//   input_scratch     - room for chunk_size floats; only used when the layer is F16
+//   quantized_scratch - receives the quantized chunk
+//   output_scratch    - receives chunk_size dequantized floats
+//   stats             - statistics to update
+void test_roundtrip_on_chunk(
+        const ggml_tensor * layer,
+        int64_t offset,
+        int64_t chunk_size,
+        const quantize_fns_t & qfns,
+        bool use_reference,
+        float * input_scratch,
+        char * quantized_scratch,
+        float * output_scratch,
+        error_stats & stats) {
+
+    if (layer->type == GGML_TYPE_F16) {
+        // F16 data is converted element by element into the scratch buffer.
+        // The index has the same type as chunk_size, so the comparison cannot
+        // overflow a narrower counter.
+        for (int64_t i = 0; i < chunk_size; i++) {
+            input_scratch[i] = ggml_get_f32_1d(layer, i + offset);
+        }
+    } else {
+        // any other type is read in place as float data; the scratch pointer is ignored
+        input_scratch = ggml_get_data_f32(layer) + offset;
+    }
+
+    if (use_reference) {
+        qfns.quantize_row_q_reference(input_scratch, quantized_scratch, chunk_size);
+    } else {
+        qfns.quantize_row_q(input_scratch, quantized_scratch, chunk_size);
+    }
+    qfns.dequantize_row_q(quantized_scratch, output_scratch, chunk_size);
+
+    update_error_stats(chunk_size, input_scratch, output_scratch, stats);
+}
+
+
+// Run quantization function for a single layer and update error stats
+//   name              - label used when printing per-layer stats
+//   print_layer_stats - collect into a per-layer accumulator, print it, then merge it into total_error
+//   qfns              - quantize / dequantize functions of the type under test
+//   use_reference     - use the reference quantization function
+//   layer             - tensor to test; must be contiguous
+//   input_scratch, quantized_scratch, output_scratch
+//                     - reusable buffers, grown here when too small
+//   total_error       - accumulator for the whole type
+//   max_thread        - maximum number of threads; values < 1 select std::thread::hardware_concurrency()
+// Layers with at least two chunks are processed by several threads when max_thread allows it:
+// each worker claims the next chunk from a mutex-protected counter, gathers statistics
+// locally and merges them under the same mutex once no work is left.
+void test_roundtrip_on_layer(
+        std::string & name,
+        bool print_layer_stats,
+        const quantize_fns_t & qfns,
+        bool use_reference,
+        const ggml_tensor * layer,
+        std::vector<float> & input_scratch,
+        std::vector<char> & quantized_scratch,
+        std::vector<float> & output_scratch,
+        error_stats & total_error,
+        int max_thread = 0) {
+
+    assert(tensor_is_contiguous(layer));
+    error_stats layer_error {};
+    uint64_t nelements = ggml_nelements(layer);
+
+    // the float conversion buffer is only needed for F16 sources; it stays null otherwise
+    float* input_scratch_ptr = nullptr;
+    if (layer->type == GGML_TYPE_F16) {
+        if (input_scratch.size() < nelements) input_scratch.resize(nelements);
+        input_scratch_ptr = input_scratch.data();
+    }
+    // 4 bytes per element are reserved for the quantized data
+    // NOTE(review): presumably an upper bound for every supported quantized type - confirm
+    if (quantized_scratch.size() < 4*nelements) quantized_scratch.resize(4*nelements);
+    if (output_scratch.size() < nelements) output_scratch.resize(nelements);
+
+    if (max_thread < 1) max_thread = std::thread::hardware_concurrency();
+    int chunk_size = 32*512;
+    int num_chunks = (nelements + chunk_size - 1)/chunk_size;
+
+    auto & stats = print_layer_stats ? layer_error : total_error;
+
+    if (num_chunks < 2 || max_thread < 2) {
+        test_roundtrip_on_chunk(layer, 0, nelements, qfns, use_reference, input_scratch_ptr, quantized_scratch.data(),
+                output_scratch.data(), stats);
+    } else {
+        std::mutex mutex;
+        uint64_t counter = 0;
+        auto compute = [&mutex, &counter, &stats, &qfns, nelements, layer, use_reference, input_scratch_ptr,
+             &quantized_scratch, &output_scratch, chunk_size] () {
+            error_stats local_stats {};
+            while (true) {
+                std::unique_lock<std::mutex> lock(mutex);
+                uint64_t offset = counter; counter += chunk_size;
+                if (offset >= nelements) {
+                    // no work left: publish the local results while still holding the lock
+                    combine_error_stats(stats, local_stats);
+                    break;
+                }
+                lock.unlock();
+                uint64_t chunk = offset + chunk_size < nelements ? chunk_size : nelements - offset;
+                // never offset a null pointer (F32 layers have no input scratch):
+                // pointer arithmetic on nullptr is undefined even if the result is unused
+                float * chunk_input = input_scratch_ptr ? input_scratch_ptr + offset : nullptr;
+                test_roundtrip_on_chunk(layer, offset, chunk, qfns, use_reference, chunk_input,
+                        quantized_scratch.data() + 4*offset, output_scratch.data() + offset, local_stats);
+            }
+        };
+        int nthread = std::min(num_chunks, max_thread);
+        // the calling thread takes part too, so only nthread-1 extra workers are started
+        std::vector<std::thread> workers(nthread-1);
+        for (auto& w : workers) w = std::thread(compute);
+        compute();
+        for (auto& w : workers) w.join();
+    }
+
+    if (print_layer_stats) {
+        print_error_stats(name, layer_error, false);
+        combine_error_stats(total_error, layer_error);
+    }
+}
+
+// quantize-stats entry point: load a float (F32/F16) model, run every selected tensor
+// through quantize -> dequantize for each quantized type that provides both functions,
+// and print the resulting error statistics.
+// Returns 0 on success, 1 on invalid arguments, model load failure or an already quantized model.
+int main(int argc, char ** argv) {
+    ggml_time_init();
+
+    quantize_stats_params params;
+
+    // read command line
+
+    int max_thread = 0;
+    bool invalid_param = false;
+    std::string arg;
+    for (int i = 1; i < argc; i++) {
+        arg = argv[i];
+
+        if (arg == "-h" || arg == "--help") {
+            quantize_stats_print_usage(argc, argv);
+            exit(0);
+        } else if (arg == "-r" || arg == "--reference") {
+            params.reference = true;
+        } else if (arg == "-v" || arg == "--verbose") {
+            // the long form is advertised by the usage text, so accept it as well
+            params.verbose = true;
+        } else if (arg == "-p" || arg == "--per-layer-stats") {
+            params.per_layer_stats = true;
+        } else if (arg == "--histogram") {
+            params.print_histogram = true;
+        } else if (arg == "-m" || arg == "--model") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.model = argv[i];
+        } else if (arg == "-l" || arg == "--include-layer") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.include_layers.push_back(argv[i]);
+        } else if (arg == "-L" || arg == "--exclude-layer") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.exclude_layers.push_back(argv[i]);
+        } else if (arg == "-t" || arg == "--type") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            int j;
+            for (j = 0; j < GGML_TYPE_COUNT && strcmp(argv[i], ggml_type_name((ggml_type) j)) != 0; j++) {
+                // find match
+            }
+            if (j < GGML_TYPE_COUNT) {
+                params.include_types.push_back((ggml_type) j);
+            } else {
+                fprintf(stderr, "error: %s not in list of types\n", argv[i]);
+                invalid_param = true;
+                // stop parsing here so that the error below names this argument
+                // instead of whatever happens to be parsed last
+                break;
+            }
+        } else if (arg == "-n" || arg == "--num-threads") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            max_thread = atoi(argv[i]);
+        } else {
+            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
+            quantize_stats_print_usage(argc, argv);
+            return 1;
+        }
+    }
+    if (invalid_param) {
+        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
+        quantize_stats_print_usage(argc, argv);
+        return 1;
+    }
+
+    // load the model
+    fprintf(stderr, "Loading model\n");
+
+    const int64_t t_main_start_us = ggml_time_us();
+    llama_context * ctx;
+
+    {
+        auto lparams = llama_context_default_params();
+
+        lparams.n_ctx      = 256;
+        lparams.n_parts    = 1;
+        lparams.seed       = 1;
+        lparams.f16_kv     = false;
+        lparams.use_mlock  = false;
+
+        ctx = llama_init_from_file(params.model.c_str(), lparams);
+
+        if (ctx == NULL) {
+            fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
+            return 1;
+        }
+    }
+
+    const auto &tensors = llama_internal_get_tensor_map(ctx);
+
+    // check layer tensors: only F32 / F16 sources are accepted
+    int included_layers = 0;
+    int64_t max_nelements = 0;
+    bool is_f16 = false;
+    for (const auto& kv_tensor : tensors) {
+        if (!layer_included(params, kv_tensor.first)) {
+            continue;
+        }
+        if (params.verbose) {
+            printf("%s: type %s, size %" PRId64 "\n", kv_tensor.first.c_str(), ggml_type_name(kv_tensor.second->type), ggml_nelements(kv_tensor.second));
+        }
+        if (kv_tensor.second->type == GGML_TYPE_F16) {
+            is_f16 = true;
+        } else if (kv_tensor.second->type != GGML_TYPE_F32) {
+            fprintf(stderr, "%s: error: Quantization should be tested with a float model, "
+                "this model contains already quantized layers (%s is type %d)\n", __func__, kv_tensor.first.c_str(), kv_tensor.second->type);
+            llama_free(ctx);
+            return 1;
+        }
+        included_layers++;
+        max_nelements = std::max(max_nelements, ggml_nelements(kv_tensor.second));
+    }
+
+    if (is_f16) {
+        printf("note: source model is f16\n");
+    }
+    printf("testing %d layers with max size %" PRId64 "\n", included_layers, max_nelements);
+    // allocate scratch space (grown on demand by test_roundtrip_on_layer)
+    std::vector<float> input_scratch;
+    std::vector<char> quantized_scratch;
+    std::vector<float> output_scratch;
+
+    // loop through quantization types
+    for (int i = 0; i < GGML_TYPE_COUNT; i++) {
+        const ggml_type type = (ggml_type) i;
+        if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
+            continue;
+        }
+        quantize_fns_t qfns = ggml_internal_get_quantize_fn(i);
+        // types without both a quantize and a dequantize function are skipped
+        if (qfns.quantize_row_q && qfns.dequantize_row_q) {
+            if (params.verbose) {
+                printf("testing %s ...\n",  ggml_type_name(type));
+            }
+
+            error_stats global_stats {};
+
+            for (const auto& kv_tensor : tensors) {
+                if (!layer_included(params, kv_tensor.first)) {
+                    continue;
+                }
+                if (params.verbose) {
+                    printf("  %s ...\n",  kv_tensor.first.c_str());
+                }
+                std::string layer_name { ggml_type_name(type) };
+                layer_name += "::" + kv_tensor.first;
+                test_roundtrip_on_layer(
+                        layer_name,
+                        params.per_layer_stats,
+                        qfns,
+                        params.reference,
+                        kv_tensor.second,
+                        input_scratch,
+                        quantized_scratch,
+                        output_scratch,
+                        global_stats,
+                        max_thread
+                );
+            }
+
+            print_error_stats(ggml_type_name(type), global_stats, params.print_histogram);
+        }
+    }
+
+
+    llama_free(ctx);
+    // report timing
+    {
+        const int64_t t_main_end_us = ggml_time_us();
+
+        printf("\n");
+        printf("%s:    total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0);
+    }
+
+    return 0;
+}
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -5,15 +5,17 @@
 #include <string>

 // usage:
-//  ./llama-quantize models/llama/ggml-model.bin models/llama/ggml-model-quant.bin type
+//  ./quantize models/llama/ggml-model.bin models/llama/ggml-model-quant.bin type
 //
 int main(int argc, char ** argv) {
    ggml_time_init();

-    if (argc != 4) {
-        fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
-        fprintf(stderr, "  type = 2 - q4_0\n");
-        fprintf(stderr, "  type = 3 - q4_1\n");
+    if (argc < 4) {
+        fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type [nthread]\n", argv[0]);
+        fprintf(stderr, "  type = %d - q4_0\n", LLAMA_FTYPE_MOSTLY_Q4_0);
+        fprintf(stderr, "  type = %d - q4_1\n", LLAMA_FTYPE_MOSTLY_Q4_1);
+        fprintf(stderr, "  type = %d - q4_2\n", LLAMA_FTYPE_MOSTLY_Q4_2);
+        fprintf(stderr, "  type = %d - q4_3\n", LLAMA_FTYPE_MOSTLY_Q4_3);
        return 1;
    }

@@ -27,7 +29,8 @@ int main(int argc, char ** argv) {
    const std::string fname_inp = argv[1];
    const std::string fname_out = argv[2];

-    const int itype = atoi(argv[3]);
+    const enum llama_ftype ftype = (enum llama_ftype)atoi(argv[3]);
+    int nthread = argc > 4 ? atoi(argv[4]) : 0;

    const int64_t t_main_start_us = ggml_time_us();

@@ -37,7 +40,7 @@ int main(int argc, char ** argv) {
    {
        const int64_t t_start_us = ggml_time_us();

-        if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), itype)) {
+        if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), ftype, nthread)) {
            fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
            return 1;
        }
--- a/examples/save-load-state/CMakeLists.txt
+++ b/examples/save-load-state/CMakeLists.txt
@@ -0,0 +1,4 @@
+set(TARGET save-load-state)
+add_executable(${TARGET} save-load-state.cpp)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@@ -0,0 +1,128 @@
+#include <vector>
+#include <cstdio>
+#include <chrono>
+
+#include "common.h"
+#include "llama.h"
+#include "llama.cpp"
+
+using namespace std;
+
+// Example: evaluate a prompt, snapshot the context state (rng, logits, embedding and
+// kv_cache) to dump_state.bin, sample n_predict tokens, then restore the snapshot into a
+// freshly created context and sample again from the same starting point.
+// Returns 0 on success and 1 on any failure (load, tokenize, eval or state file I/O).
+int main(int argc, char ** argv) {
+    gpt_params params;
+    params.model = "models/llama-7B/ggml-model.bin";
+    params.seed = 42;
+    params.n_threads = 4;
+    params.repeat_last_n = 64;
+    params.prompt = "The quick brown fox";
+
+    if (gpt_params_parse(argc, argv, params) == false) {
+        return 1;
+    }
+
+    auto lparams = llama_context_default_params();
+
+    lparams.n_ctx      = params.n_ctx;
+    lparams.n_parts    = params.n_parts;
+    lparams.seed       = params.seed;
+    lparams.f16_kv     = params.memory_f16;
+    lparams.use_mmap   = params.use_mmap;
+    lparams.use_mlock  = params.use_mlock;
+
+    auto n_past = 0;
+    // starts as repeat_last_n zero tokens so the sampling window is always fully populated
+    auto last_n_tokens_data = vector<llama_token>(params.repeat_last_n, 0);
+
+    // init
+    auto ctx = llama_init_from_file(params.model.c_str(), lparams);
+    if (ctx == NULL) {
+        fprintf(stderr, "%s : failed to load model '%s'\n", __func__, params.model.c_str());
+        return 1;
+    }
+    auto tokens = vector<llama_token>(params.n_ctx);
+    auto n_prompt_tokens = llama_tokenize(ctx, params.prompt.c_str(), tokens.data(), tokens.size(), true);
+
+    if (n_prompt_tokens < 1) {
+        fprintf(stderr, "%s : failed to tokenize prompt\n", __func__);
+        llama_free(ctx);
+        return 1;
+    }
+
+    // evaluate prompt
+    if (llama_eval(ctx, tokens.data(), n_prompt_tokens, n_past, params.n_threads)) {
+        fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
+        llama_free(ctx);
+        return 1;
+    }
+
+    last_n_tokens_data.insert(last_n_tokens_data.end(), tokens.data(), tokens.data() + n_prompt_tokens);
+    n_past += n_prompt_tokens;
+
+    // Save state (rng, logits, embedding and kv_cache) to file
+    // The buffer is owned by a vector so it is released on every return path.
+    auto state_size = llama_get_state_size(ctx);
+    vector<uint8_t> state_mem(state_size);
+    llama_copy_state_data(ctx, state_mem.data()); // could also copy directly to memory mapped file
+    {
+        FILE *fp_write = fopen("dump_state.bin", "wb");
+        if (fp_write == NULL) {
+            fprintf(stderr, "\n%s : failed to open dump_state.bin for writing\n", __func__);
+            llama_free(ctx);
+            return 1;
+        }
+        const size_t n_written = fwrite(state_mem.data(), 1, state_mem.size(), fp_write);
+        fclose(fp_write);
+        if (n_written != state_mem.size()) {
+            fprintf(stderr, "\n%s : failed to write state to dump_state.bin\n", __func__);
+            llama_free(ctx);
+            return 1;
+        }
+    }
+
+    // save state (last tokens)
+    auto last_n_tokens_data_saved = vector<llama_token>(last_n_tokens_data);
+    auto n_past_saved = n_past;
+
+    // Sample params.n_predict tokens from `lctx`, printing each one and feeding it back
+    // into the model. Returns false as soon as an evaluation fails.
+    auto generate = [&](llama_context * lctx) {
+        for (auto i = 0; i < params.n_predict; i++) {
+            auto next_token = llama_sample_top_p_top_k(
+                lctx,
+                // window of the most recent repeat_last_n tokens, ending at the newest one
+                // (taking the address of back() and subtracting would drop the newest token)
+                last_n_tokens_data.data() + last_n_tokens_data.size() - params.repeat_last_n,
+                params.repeat_last_n,
+                40,
+                1.0,
+                1.0,
+                1.1);
+            auto next_token_str = llama_token_to_str(lctx, next_token);
+            last_n_tokens_data.push_back(next_token);
+            printf("%s", next_token_str);
+            if (llama_eval(lctx, &next_token, 1, n_past, params.n_threads)) {
+                return false;
+            }
+            n_past += 1;
+        }
+        printf("\n\n");
+        return true;
+    };
+
+    // first run
+    printf("\n%s", params.prompt.c_str());
+    if (!generate(ctx)) {
+        fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
+        llama_free(ctx);
+        return 1;
+    }
+
+    // free old model
+    llama_free(ctx);
+
+    // load new model
+
+    auto ctx2 = llama_init_from_file(params.model.c_str(), lparams);
+    if (ctx2 == NULL) {
+        fprintf(stderr, "%s : failed to load model '%s'\n", __func__, params.model.c_str());
+        return 1;
+    }
+
+    // Load state (rng, logits, embedding and kv_cache) from file
+    auto state_size2 = llama_get_state_size(ctx2);
+    if (state_size != state_size2) {
+        // restoring a snapshot of a different size would hand the context a buffer
+        // that does not match what it expects, so stop here
+        fprintf(stderr, "\n%s : failed to validate state size\n", __func__);
+        llama_free(ctx2);
+        return 1;
+    }
+    {
+        FILE *fp_read = fopen("dump_state.bin", "rb");
+        if (fp_read == NULL) {
+            fprintf(stderr, "\n%s : failed to open dump_state.bin for reading\n", __func__);
+            llama_free(ctx2);
+            return 1;
+        }
+        const size_t n_read = fread(state_mem.data(), 1, state_mem.size(), fp_read);
+        fclose(fp_read);
+        if (n_read != state_mem.size()) {
+            fprintf(stderr, "\n%s : failed to read state from dump_state.bin\n", __func__);
+            llama_free(ctx2);
+            return 1;
+        }
+    }
+    llama_set_state_data(ctx2, state_mem.data());  // could also read directly from memory mapped file
+
+    // restore state (last tokens)
+    last_n_tokens_data = last_n_tokens_data_saved;
+    n_past = n_past_saved;
+
+    // second run
+    if (!generate(ctx2)) {
+        fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
+        llama_free(ctx2);
+        return 1;
+    }
+
+    llama_free(ctx2);
+    return 0;
+}
--- a/flake.nix
+++ b/flake.nix
@@ -10,7 +10,6 @@
          inherit system;
        };
        llama-python = pkgs.python310.withPackages (ps: with ps; [
-          torch
          numpy
          sentencepiece
        ]);
@@ -28,8 +27,9 @@
          ];
          installPhase = ''
            mkdir -p $out/bin
-            mv bin/main $out/bin/llama
-            mv bin/quantize $out/bin/quantize
+            mv bin/* $out/bin/
+            mv $out/bin/main $out/bin/llama
+
            echo "#!${llama-python}/bin/python" > $out/bin/convert-pth-to-ggml
            cat ${./convert-pth-to-ggml.py} >> $out/bin/convert-pth-to-ggml
            chmod +x $out/bin/convert-pth-to-ggml
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -0,0 +1,228 @@
+#include <stdint.h>
+#include <stdio.h>
+#include <cuda_fp16.h>
+#include <atomic>
+#include "ggml-cuda.h"
+
+typedef uint16_t ggml_fp16_t;
+static_assert(sizeof(__half) == sizeof(ggml_fp16_t), "wrong fp16 size");
+
+// q4_0: QK4_0 values per block, stored as 4-bit quants (two per byte) with one float
+// scale; the kernel in this file dequantizes them as (q - 8)*d.
+#define QK4_0 32
+typedef struct {
+    float   d;              // delta
+    uint8_t qs[QK4_0 / 2];  // nibbles / quants
+} block_q4_0;
+static_assert(sizeof(block_q4_0) == sizeof(float) + QK4_0 / 2, "wrong q4_0 block size/padding");
+
+// q4_1: QK4_1 values per block with a float scale and a float minimum;
+// the kernel in this file dequantizes them as q*d + m.
+#define QK4_1 32
+typedef struct {
+    float   d;              // delta
+    float   m;              // min
+    uint8_t qs[QK4_1 / 2];  // nibbles / quants
+} block_q4_1;
+static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
+
+// q4_2: like q4_0 but with QK4_2 values per block and a half-precision scale.
+#define QK4_2 16
+typedef struct {
+    __half  d;              // delta
+    uint8_t qs[QK4_2 / 2];  // nibbles / quants
+} block_q4_2;
+static_assert(sizeof(block_q4_2) == sizeof(ggml_fp16_t) + QK4_2 / 2, "wrong q4_2 block size/padding");
+
+// q4_3: like q4_1 but with QK4_3 values per block and half-precision scale and minimum.
+#define QK4_3 16
+typedef struct {
+    __half  d;              // delta
+    __half  m;              // min
+    uint8_t qs[QK4_3 / 2];  // nibbles / quants
+} block_q4_3;
+static_assert(sizeof(block_q4_3) == 2 * sizeof(ggml_fp16_t) + QK4_3 / 2, "wrong q4_3 block size/padding");
+
+// Dequantize q4_0 data: CUDA block ib expands quantized block ib of vx into
+// y[ib*QK4_0 .. ib*QK4_0 + QK4_0 - 1]. Every byte of qs holds two quants
+// (low nibble first); each value is (quant - 8) * delta.
+static __global__ void dequantize_block_q4_0(const void * vx, float * y) {
+    const int ib = blockIdx.x;
+
+    const block_q4_0 * blk = (const block_q4_0 *) vx + ib;
+    const float delta = blk->d;
+    float * dst = y + ib*QK4_0;
+
+    for (int j = 0; j < QK4_0/2; ++j) {
+        const uint8_t packed = blk->qs[j];
+
+        const int8_t q_lo = packed & 0xf;
+        const int8_t q_hi = packed >> 4;
+
+        dst[2*j + 0] = (q_lo - 8)*delta;
+        dst[2*j + 1] = (q_hi - 8)*delta;
+    }
+}
+
+// Dequantize q4_1 data: CUDA block ib expands quantized block ib of vx into
+// y[ib*QK4_1 .. ib*QK4_1 + QK4_1 - 1]. Every byte of qs holds two quants
+// (low nibble first); each value is quant * delta + min.
+static __global__ void dequantize_block_q4_1(const void * vx, float * y) {
+    const int ib = blockIdx.x;
+
+    const block_q4_1 * blk = (const block_q4_1 *) vx + ib;
+    const float d = blk->d;
+    const float m = blk->m;
+    float * dst = y + ib*QK4_1;
+
+    for (int j = 0; j < QK4_1/2; ++j) {
+        const uint8_t packed = blk->qs[j];
+
+        const int8_t q_lo = packed & 0xf;
+        const int8_t q_hi = packed >> 4;
+
+        dst[2*j + 0] = q_lo*d + m;
+        dst[2*j + 1] = q_hi*d + m;
+    }
+}
+
+// Dequantize q4_2 data: CUDA block ib expands quantized block ib of vx into
+// y[ib*QK4_2 .. ib*QK4_2 + QK4_2 - 1]. The half-precision delta is converted to float
+// once; each value is (quant - 8) * delta, low nibble first.
+static __global__ void dequantize_block_q4_2(const void * vx, float * y) {
+    const int ib = blockIdx.x;
+
+    const block_q4_2 * blk = (const block_q4_2 *) vx + ib;
+    const float delta = blk->d;
+    float * dst = y + ib*QK4_2;
+
+    for (int j = 0; j < QK4_2/2; ++j) {
+        const uint8_t packed = blk->qs[j];
+
+        const int8_t q_lo = packed & 0xf;
+        const int8_t q_hi = packed >> 4;
+
+        dst[2*j + 0] = (q_lo - 8)*delta;
+        dst[2*j + 1] = (q_hi - 8)*delta;
+    }
+}
+
+// Dequantize q4_3 data: CUDA block ib expands quantized block ib of vx into
+// y[ib*QK4_3 .. ib*QK4_3 + QK4_3 - 1]. The half-precision delta and min are converted
+// to float once; each value is quant * delta + min, low nibble first.
+static __global__ void dequantize_block_q4_3(const void * vx, float * y) {
+    const int ib = blockIdx.x;
+
+    const block_q4_3 * blk = (const block_q4_3 *) vx + ib;
+    const float d = blk->d;
+    const float m = blk->m;
+    float * dst = y + ib*QK4_3;
+
+    for (int j = 0; j < QK4_3/2; ++j) {
+        const uint8_t packed = blk->qs[j];
+
+        const int8_t q_lo = packed & 0xf;
+        const int8_t q_hi = packed >> 4;
+
+        dst[2*j + 0] = q_lo*d + m;
+        dst[2*j + 1] = q_hi*d + m;
+    }
+}
+
+// Launch the q4_0 dequantization kernel on `stream`:
+// k / QK4_0 CUDA blocks of a single thread, one per quantized block.
+void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
+    dequantize_block_q4_0<<<k / QK4_0, 1, 0, stream>>>(vx, y);
+}
+
+// Launch the q4_1 dequantization kernel on `stream`:
+// k / QK4_1 CUDA blocks of a single thread, one per quantized block.
+void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
+    dequantize_block_q4_1<<<k / QK4_1, 1, 0, stream>>>(vx, y);
+}
+
+// Launch the q4_2 dequantization kernel on `stream`:
+// k / QK4_2 CUDA blocks of a single thread, one per quantized block.
+void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
+    dequantize_block_q4_2<<<k / QK4_2, 1, 0, stream>>>(vx, y);
+}
+
+// Launch the q4_3 dequantization kernel on `stream`:
+// k / QK4_3 CUDA blocks of a single thread, one per quantized block.
+void dequantize_row_q4_3_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
+    dequantize_block_q4_3<<<k / QK4_3, 1, 0, stream>>>(vx, y);
+}
+
+// buffer pool for cuda
+#define MAX_CUDA_BUFFERS 16
+
+// Scope guard around an atomic_flag used as a spin lock: the constructor busy-waits
+// until it manages to set the flag, the destructor clears it again. Not copyable.
+struct scoped_spin_lock {
+    std::atomic_flag& lock;
+
+    scoped_spin_lock(std::atomic_flag& lock) : lock(lock) {
+        // test_and_set returns the previous value: true means somebody else holds the lock
+        bool already_held = lock.test_and_set(std::memory_order_acquire);
+        while (already_held) {
+            already_held = lock.test_and_set(std::memory_order_acquire);
+        }
+    }
+
+    ~scoped_spin_lock() {
+        lock.clear(std::memory_order_release);
+    }
+
+    scoped_spin_lock(const scoped_spin_lock&) = delete;
+    scoped_spin_lock& operator=(const scoped_spin_lock&) = delete;
+};
+
+// One slot of the buffer pool: a cached device allocation and its size in bytes.
+// A slot with ptr == nullptr is empty.
+struct cuda_buffer {
+    void * ptr = nullptr;
+    size_t size = 0;
+};
+
+// Fixed-size pool of cached buffers; every access goes through g_cuda_pool_lock.
+static cuda_buffer g_cuda_buffer_pool[MAX_CUDA_BUFFERS];
+static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT;
+
+// Get a device buffer of at least `size` bytes.
+// The first pooled buffer that is large enough is reused and removed from the pool;
+// if there is none, a new buffer is allocated with cudaMalloc (CUDA_CHECK exits on failure).
+// *actual_size receives the real size of the returned buffer, which is the value to
+// hand back to ggml_cuda_pool_free.
+void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
+    scoped_spin_lock guard(g_cuda_pool_lock);
+
+    for (cuda_buffer & slot : g_cuda_buffer_pool) {
+        if (slot.ptr == nullptr || slot.size < size) {
+            continue; // empty slot or buffer too small
+        }
+        void * reused = slot.ptr;
+        *actual_size = slot.size;
+        slot.ptr = nullptr;
+        slot.size = 0;
+        return reused;
+    }
+
+    void * fresh;
+    CUDA_CHECK(cudaMalloc((void **) &fresh, size));
+    *actual_size = size;
+    return fresh;
+}
+
+// Return a buffer to the pool. `size` is recorded as the buffer's size for later reuse.
+// The buffer is stored in the first empty slot; when the pool is full a warning is
+// printed and the buffer is released with cudaFree instead.
+void ggml_cuda_pool_free(void * ptr, size_t size) {
+    scoped_spin_lock guard(g_cuda_pool_lock);
+
+    for (cuda_buffer & slot : g_cuda_buffer_pool) {
+        if (slot.ptr != nullptr) {
+            continue; // slot already in use
+        }
+        slot.ptr = ptr;
+        slot.size = size;
+        return;
+    }
+
+    fprintf(stderr, "WARNING: cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
+    CUDA_CHECK(cudaFree(ptr));
+}
+
+cublasHandle_t g_cublasH = NULL;
+cudaStream_t g_cudaStream = NULL;
+
+// Create the global cuBLAS handle and a non-blocking CUDA stream, and bind the stream to
+// the handle. Does nothing when the handle already exists.
+void ggml_init_cublas(void) {
+    if (g_cublasH != NULL) {
+        return; // already initialized
+    }
+
+    // create cublas handle, bind a stream
+    CUBLAS_CHECK(cublasCreate(&g_cublasH));
+
+    CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStream, cudaStreamNonBlocking));
+
+    CUBLAS_CHECK(cublasSetStream(g_cublasH, g_cudaStream));
+
+    // configure logging to stdout
+    // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, NULL));
+}
--- a/ggml-cuda.h
+++ b/ggml-cuda.h
@@ -0,0 +1,41 @@
+// CUDA / cuBLAS helpers for ggml: error-check macros, the shared cuBLAS handle and
+// stream, a small device buffer pool and the q4_x dequantization launchers.
+#pragma once
+
+// fprintf / exit are used by the CUDA_CHECK and CUBLAS_CHECK macros below, so the header
+// pulls them in itself instead of relying on the including file
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <cublas_v2.h>
+#include <cuda_runtime.h>
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+// Evaluate a CUDA runtime call; on failure print the error code, location and
+// message to stderr and terminate the process.
+#define CUDA_CHECK(err)                                                                 \
+    do {                                                                                \
+        cudaError_t err_ = (err);                                                       \
+        if (err_ != cudaSuccess) {                                                      \
+            fprintf(stderr, "CUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__,   \
+                cudaGetErrorString(err_));                                              \
+            exit(1);                                                                    \
+        }                                                                               \
+    } while (0)
+
+// Evaluate a cuBLAS call; on failure print the status code and location to stderr
+// and terminate the process.
+#define CUBLAS_CHECK(err)                                                               \
+    do {                                                                                \
+        cublasStatus_t err_ = (err);                                                    \
+        if (err_ != CUBLAS_STATUS_SUCCESS) {                                            \
+            fprintf(stderr, "cuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__);    \
+            exit(1);                                                                    \
+        }                                                                               \
+    } while (0)
+
+// global cuBLAS handle and the stream bound to it; both are set up by ggml_init_cublas()
+extern cublasHandle_t g_cublasH;
+extern cudaStream_t   g_cudaStream;
+
+void   ggml_init_cublas(void);
+// get a device buffer of at least `size` bytes; *actual_size receives its real size
+void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size);
+// hand a buffer back to the pool; `size` is the size recorded for the buffer
+void   ggml_cuda_pool_free(void * ptr, size_t size);
+
+// launch the dequantization kernel of the given type for k values on `stream`
+void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);
+void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream);
+void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream);
+void dequantize_row_q4_3_cuda(const void * vx, float * y, int k, cudaStream_t stream);
+
+#ifdef  __cplusplus
+}
+#endif
--- a/ggml.c
+++ b/ggml.c
--- a/ggml.h
+++ b/ggml.h
--- a/llama.cpp
+++ b/llama.cpp
--- a/llama.h
+++ b/llama.h
@@ -55,6 +55,7 @@ extern "C" {
        bool f16_kv;     // use fp16 for KV cache
        bool logits_all; // the llama_eval() call computes all logits, not just the last one
        bool vocab_only; // only load the vocabulary, no weights
+        bool use_mmap;   // use mmap if possible
        bool use_mlock;  // force system to keep model in RAM
        bool embedding;  // embedding mode only

@@ -64,8 +65,22 @@ extern "C" {
        void * progress_callback_user_data;
    };

+    // model file types
+    // Passed to llama_model_quantize() as the `ftype` argument. The numeric
+    // values are spelled out explicitly, so do not renumber existing entries.
+    enum llama_ftype {
+        LLAMA_FTYPE_ALL_F32     = 0,
+        LLAMA_FTYPE_MOSTLY_F16  = 1,  // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_0 = 2,  // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_1 = 3,  // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
+        LLAMA_FTYPE_MOSTLY_Q4_2 = 5,  // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_3 = 6,  // except 1d tensors
+    };
+
    LLAMA_API struct llama_context_params llama_context_default_params();

+    LLAMA_API bool llama_mmap_supported();
+    LLAMA_API bool llama_mlock_supported();
+
    // Various functions for loading a ggml llama model.
    // Allocate (almost) all memory needed for the model.
    // Return NULL on failure
@@ -78,27 +93,39 @@ extern "C" {

    // TODO: not great API - very likely to change
    // Returns 0 on success
+    // nthread - how many threads to use. If <=0, will use std::thread::hardware_concurrency(), else the number given
    LLAMA_API int llama_model_quantize(
            const char * fname_inp,
            const char * fname_out,
-                   int   itype);
+      enum llama_ftype   ftype,
+            int          nthread);

-    // Returns the KV cache that will contain the context for the
-    // ongoing prediction with the model.
-    LLAMA_API const uint8_t * llama_get_kv_cache(struct llama_context * ctx);
-
-    // Returns the size of the KV cache
-    LLAMA_API size_t llama_get_kv_cache_size(struct llama_context * ctx);
+    // Apply a LoRA adapter to a loaded model
+    // path_base_model is the path to a higher quality model to use as a base for
+    // the layers modified by the adapter. Can be NULL to use the current loaded model.
+    // The model needs to be reloaded before applying a new adapter, otherwise the adapter
+    // will be applied on top of the previous one
+    // Returns 0 on success
+    LLAMA_API int llama_apply_lora_from_file(
+            struct llama_context * ctx,
+                      const char * path_lora,
+                      const char * path_base_model,
+                             int   n_threads);

    // Returns the number of tokens in the KV cache
    LLAMA_API int llama_get_kv_cache_token_count(struct llama_context * ctx);

-    // Sets the KV cache containing the current context for the model
-    LLAMA_API void llama_set_kv_cache(
-            struct llama_context * ctx,
-                   const uint8_t * kv_cache,
-                          size_t   n_size,
-                             int   n_token_count);
+    // Returns the size in bytes of the state (rng, logits, embedding and kv_cache)
+    LLAMA_API size_t llama_get_state_size(struct llama_context * ctx);
+
+    // Copies the state to the specified destination address.
+    // Destination needs to have allocated enough memory.
+    // Returns the number of bytes copied
+    LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest);
+
+    // Set the state reading from the specified address
+    // Returns the number of bytes read
+    LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src);

    // Run the llama inference to obtain the logits and probabilities for the next token.
    // tokens + n_tokens is the provided batch of new tokens to process
@@ -166,4 +193,15 @@ extern "C" {
 }
 #endif

+// Internal API to be implemented by llama.cpp and used by tests/benchmarks only
+#ifdef LLAMA_API_INTERNAL
+
+#include <vector>
+#include <string>
+struct ggml_tensor;
+
+std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
+
 #endif
+
+#endif // LLAMA_H
--- a/llama_util.h
+++ b/llama_util.h
@@ -0,0 +1,408 @@
+// Internal header to be included only by llama.cpp.
+// Contains wrappers around OS interfaces.
+
+#ifndef LLAMA_UTIL_H
+#define LLAMA_UTIL_H
+
+#include <cstdio>
+#include <cstdint>
+#include <cerrno>
+#include <cstring>
+#include <cstdarg>
+#include <cstdlib>
+#include <climits>
+
+#include <string>
+#include <vector>
+
+#ifdef __has_include
+    #if __has_include(<unistd.h>)
+        #include <unistd.h>
+        #if defined(_POSIX_MAPPED_FILES)
+            #include <sys/mman.h>
+        #endif
+        #if defined(_POSIX_MEMLOCK_RANGE)
+            #include <sys/resource.h>
+        #endif
+    #endif
+#endif
+
+#if defined(_WIN32)
+    #define WIN32_LEAN_AND_MEAN
+    #ifndef NOMINMAX
+        #define NOMINMAX
+    #endif
+    #include <windows.h>
+    #include <io.h>
+    #include <stdio.h> // for _fseeki64
+#endif
+
+// Always-on assertion: if `x` evaluates to false, prints the source location and
+// the stringified expression to stderr and calls abort(). Because the expression
+// text is printed verbatim, renaming variables inside an assertion changes the
+// diagnostic output.
+#define LLAMA_ASSERT(x) \
+    do { \
+        if (!(x)) { \
+            fprintf(stderr, "LLAMA_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
+            abort(); \
+        } \
+    } while (0)
+
+// printf-style formatting into a std::string.
+// The format attribute below lets GCC-compatible compilers type-check the
+// arguments against `fmt` (MinGW needs the gnu_printf flavour).
+#ifdef __GNUC__
+#ifdef __MINGW32__
+__attribute__((format(gnu_printf, 1, 2)))
+#else
+__attribute__((format(printf, 1, 2)))
+#endif
+#endif
+static std::string format(const char * fmt, ...) {
+    // The argument list is consumed twice (measure, then write), so keep a copy.
+    va_list ap, ap2;
+    va_start(ap, fmt);
+    va_copy(ap2, ap);
+    // First pass: a NULL buffer of size 0 makes vsnprintf report the required length.
+    int size = vsnprintf(NULL, 0, fmt, ap);
+    LLAMA_ASSERT(size >= 0 && size < INT_MAX);
+    // +1 for the terminating NUL that vsnprintf writes.
+    std::vector<char> buf(size + 1);
+    // Second pass: produce the text; the length must match the first pass.
+    int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
+    LLAMA_ASSERT(size2 == size);
+    va_end(ap2);
+    va_end(ap);
+    // Construct from (pointer, length) so the trailing NUL is not included.
+    return std::string(buf.data(), size);
+}
+
+// RAII wrapper around a C stdio stream.
+// The constructor opens the file and records its size; the destructor closes it.
+// Errors are reported by throwing std::string (see format()); conditions that
+// "really shouldn't fail" are checked with LLAMA_ASSERT.
+struct llama_file {
+    // use FILE * so we don't have to re-open the file to mmap
+    FILE * fp;
+    size_t size;
+
+    // Opens `fname` with the given fopen() mode and determines the file size by
+    // seeking to the end. The stream is left positioned at the start.
+    // Throws std::string if the file cannot be opened.
+    llama_file(const char * fname, const char * mode) {
+        fp = std::fopen(fname, mode);
+        if (fp == NULL) {
+            throw format("failed to open %s: %s", fname, std::strerror(errno));
+        }
+        seek(0, SEEK_END);
+        size = tell();
+        seek(0, SEEK_SET);
+    }
+
+    // This object owns `fp`: a copy would make both destructors fclose() the
+    // same stream. Forbid copying, like llama_mmap and llama_mlock do.
+    llama_file(const llama_file &) = delete;
+    llama_file & operator=(const llama_file &) = delete;
+
+    // Returns the current stream position in bytes.
+    size_t tell() const {
+#ifdef _WIN32
+        __int64 ret = _ftelli64(fp);
+#else
+        long ret = std::ftell(fp);
+#endif
+        LLAMA_ASSERT(ret != -1); // this really shouldn't fail
+        return (size_t) ret;
+    }
+
+    // Moves the stream position; `whence` is SEEK_SET / SEEK_CUR / SEEK_END.
+    void seek(size_t offset, int whence) {
+#ifdef _WIN32
+        int ret = _fseeki64(fp, (__int64) offset, whence);
+#else
+        int ret = std::fseek(fp, (long) offset, whence);
+#endif
+        LLAMA_ASSERT(ret == 0); // same
+    }
+
+    // Reads exactly `size` bytes into `ptr`.
+    // Throws std::string on a stream error or if the file ends early.
+    void read_raw(void * ptr, size_t size) {
+        if (size == 0) {
+            return;
+        }
+        // Clear errno so that the message below reflects this read only.
+        errno = 0;
+        std::size_t ret = std::fread(ptr, size, 1, fp);
+        if (ferror(fp)) {
+            throw format("read error: %s", strerror(errno));
+        }
+        if (ret != 1) {
+            throw std::string("unexpectedly reached end of file");
+        }
+    }
+
+    // Reads a 32-bit unsigned integer in the host's native byte order.
+    std::uint32_t read_u32() {
+        std::uint32_t ret;
+        read_raw(&ret, sizeof(ret));
+        return ret;
+    }
+
+    // Reads `len` raw bytes and returns them as a string (no terminator expected).
+    std::string read_string(std::uint32_t len) {
+        std::vector<char> chars(len);
+        read_raw(chars.data(), len);
+        return std::string(chars.data(), len);
+    }
+
+    // Writes exactly `size` bytes from `ptr`; throws std::string on failure.
+    void write_raw(const void * ptr, size_t size) {
+        if (size == 0) {
+            return;
+        }
+        errno = 0;
+        size_t ret = std::fwrite(ptr, size, 1, fp);
+        if (ret != 1) {
+            throw format("write error: %s", strerror(errno));
+        }
+    }
+
+    // Writes a 32-bit unsigned integer in the host's native byte order.
+    void write_u32(std::uint32_t val) {
+        write_raw(&val, sizeof(val));
+    }
+
+    ~llama_file() {
+        if (fp) {
+            std::fclose(fp);
+        }
+    }
+};
+
+#if defined(_WIN32)
+// Converts a Win32 error code (e.g. from GetLastError()) into its system message text.
+// Returns the literal "FormatMessageA failed" if the lookup itself fails.
+static std::string llama_format_win_err(DWORD err) {
+    LPSTR buf;
+    // FORMAT_MESSAGE_ALLOCATE_BUFFER makes the API allocate the buffer and store
+    // its address through the lpBuffer argument, hence the (LPSTR)&buf cast.
+    size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
+                                 NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, NULL);
+    if (!size) {
+        return "FormatMessageA failed";
+    }
+    // Copy the text before releasing the system-allocated buffer.
+    std::string ret(buf, size);
+    LocalFree(buf);
+    return ret;
+}
+#endif
+
+// Read-only memory mapping of an entire llama_file.
+// The mapping is created by the constructor and released by the destructor.
+// Every platform variant of the constructor takes (file, prefetch) so callers
+// can be written once; failures are reported by throwing std::string.
+struct llama_mmap {
+    void * addr;
+    size_t size;
+
+    llama_mmap(const llama_mmap &) = delete;
+
+#ifdef _POSIX_MAPPED_FILES
+    static constexpr bool SUPPORTED = true;
+
+    // Maps the whole file with PROT_READ / MAP_SHARED.
+    // prefetch: when true, ask the kernel to load the mapped pages eagerly.
+    llama_mmap(struct llama_file * file, bool prefetch = true) {
+        size = file->size;
+        int fd = fileno(file->fp);
+        int flags = MAP_SHARED;
+#ifdef __linux__
+        // MAP_POPULATE pre-faults the mapping, which is a form of prefetching,
+        // so it must honour the caller's `prefetch` choice as well.
+        if (prefetch) {
+            flags |= MAP_POPULATE;
+        }
+#endif
+        addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
+        if (addr == MAP_FAILED) {
+            throw format("mmap failed: %s", strerror(errno));
+        }
+
+        if (prefetch) {
+            // Advise the kernel to preload the mapped memory
+            if (madvise(addr, file->size, MADV_WILLNEED)) {
+                // Best effort only: the mapping is still usable without the hint.
+                fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
+                        strerror(errno));
+            }
+        }
+    }
+
+    ~llama_mmap() {
+        munmap(addr, size);
+    }
+#elif defined(_WIN32)
+    static constexpr bool SUPPORTED = true;
+
+    // Maps the whole file read-only through a file-mapping object.
+    // prefetch: when true (and built for Windows 8 or later), ask the kernel
+    // to load the mapped pages eagerly.
+    llama_mmap(struct llama_file * file, bool prefetch = true) {
+        size = file->size;
+
+        HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));
+
+        HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
+        // Capture the error code immediately, before another API call can overwrite it.
+        DWORD error = GetLastError();
+
+        if (hMapping == NULL) {
+            throw format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str());
+        }
+
+        addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
+        error = GetLastError();
+        // The mapping handle is closed right after the view is created; only the view is kept.
+        CloseHandle(hMapping);
+
+        if (addr == NULL) {
+            throw format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str());
+        }
+
+        #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
+        if (prefetch) {
+            // Advise the kernel to preload the mapped memory
+            WIN32_MEMORY_RANGE_ENTRY range;
+            range.VirtualAddress = addr;
+            range.NumberOfBytes = (SIZE_T)size;
+            if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
+                fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
+                        llama_format_win_err(GetLastError()).c_str());
+            }
+        }
+        #else
+        (void) prefetch; // not usable before Windows 8
+        #pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
+        #endif // _WIN32_WINNT >= _WIN32_WINNT_WIN8
+    }
+
+    ~llama_mmap() {
+        if (!UnmapViewOfFile(addr)) {
+            fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n",
+                    llama_format_win_err(GetLastError()).c_str());
+        }
+    }
+#else
+    static constexpr bool SUPPORTED = false;
+
+    // Same signature as the supported variants so that two-argument callers
+    // still compile on platforms without mmap; always throws.
+    llama_mmap(struct llama_file *, bool prefetch = true) {
+        (void) prefetch;
+        throw std::string("mmap not supported");
+    }
+#endif
+};
+
+// Represents some region of memory being locked using mlock or VirtualLock;
+// will automatically unlock on destruction.
+//
+// Usage: call init() once with the base address, then grow_to() with
+// increasing sizes. After the first locking failure the object stops trying
+// (failed_already), so the warning is printed only once.
+//
+// Every platform branch below must provide the same three helpers with the
+// same shapes, because grow_to() and the destructor use them unconditionally:
+//   size_t lock_granularity();
+//   bool   raw_lock(addr, size);    // true on success
+//   void   raw_unlock(addr, size);
+struct llama_mlock {
+    void * addr = NULL;
+    size_t size = 0;
+    bool failed_already = false;
+
+    llama_mlock() {}
+    llama_mlock(const llama_mlock &) = delete;
+
+    ~llama_mlock() {
+        if (size) {
+            raw_unlock(addr, size);
+        }
+    }
+
+    // Sets the base address of the region; may only be called once, before any locking.
+    void init(void * addr) {
+        LLAMA_ASSERT(this->addr == NULL && this->size == 0);
+        this->addr = addr;
+    }
+
+    // Ensures at least the first `target_size` bytes of the region are locked.
+    // Only the not-yet-locked tail is passed to raw_lock().
+    void grow_to(size_t target_size) {
+        LLAMA_ASSERT(addr);
+        if (failed_already) {
+            return;
+        }
+        size_t granularity = lock_granularity();
+        // Round up to a multiple of the granularity (which must be a power of two).
+        target_size = (target_size + granularity - 1) & ~(granularity - 1);
+        if (target_size > size) {
+            if (raw_lock((uint8_t *) addr + size, target_size - size)) {
+                size = target_size;
+            } else {
+                failed_already = true;
+            }
+        }
+    }
+
+#ifdef _POSIX_MEMLOCK_RANGE
+    static constexpr bool SUPPORTED = true;
+
+    size_t lock_granularity() {
+        return (size_t) sysconf(_SC_PAGESIZE);
+    }
+
+    #ifdef __APPLE__
+        #define MLOCK_SUGGESTION \
+            "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
+            "decreasing 'vm.global_no_user_wire_amount'.  Also try increasing RLIMIT_MEMLOCK (ulimit -l).\n"
+    #else
+        #define MLOCK_SUGGESTION \
+            "Try increasing RLIMIT_MEMLOCK ('ulimit -l' as root).\n"
+    #endif
+
+    // Locks [addr, addr + size) with mlock(). On failure prints a warning and,
+    // when the failure looks like a resource-limit problem, a hint how to raise it.
+    bool raw_lock(const void * addr, size_t size) {
+        if (!mlock(addr, size)) {
+            return true;
+        } else {
+            // Read errno before any further library call can overwrite it.
+            char* errmsg = std::strerror(errno);
+            bool suggest = (errno == ENOMEM);
+
+            // Check if the resource limit is fine after all
+            struct rlimit lock_limit;
+            if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit))
+                suggest = false;
+            if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size))
+                suggest = false;
+
+            fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s",
+                    size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : "");
+            return false;
+        }
+    }
+
+    #undef MLOCK_SUGGESTION
+
+    void raw_unlock(void * addr, size_t size) {
+        if (munlock(addr, size)) {
+            fprintf(stderr, "warning: failed to munlock buffer: %s\n", std::strerror(errno));
+        }
+    }
+#elif defined(_WIN32)
+    static constexpr bool SUPPORTED = true;
+
+    size_t lock_granularity() {
+        SYSTEM_INFO si;
+        GetSystemInfo(&si);
+        return (size_t) si.dwPageSize;
+    }
+
+    // Locks [addr, addr + size) with VirtualLock(). If the first attempt fails,
+    // the process working set is enlarged once and the lock is retried.
+    bool raw_lock(void * addr, size_t size) {
+        for (int tries = 1; ; tries++) {
+            if (VirtualLock(addr, size)) {
+                return true;
+            }
+            if (tries == 2) {
+                fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
+                        size, this->size, llama_format_win_err(GetLastError()).c_str());
+                return false;
+            }
+
+            // It failed but this was only the first try; increase the working
+            // set size and try again.
+            SIZE_T min_ws_size, max_ws_size;
+            if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws_size, &max_ws_size)) {
+                fprintf(stderr, "warning: GetProcessWorkingSetSize failed: %s\n",
+                        llama_format_win_err(GetLastError()).c_str());
+                return false;
+            }
+            // Per MSDN: "The maximum number of pages that a process can lock
+            // is equal to the number of pages in its minimum working set minus
+            // a small overhead."
+            // Hopefully a megabyte is enough overhead:
+            size_t increment = size + 1048576;
+            // The minimum must be <= the maximum, so we need to increase both:
+            min_ws_size += increment;
+            max_ws_size += increment;
+            if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) {
+                fprintf(stderr, "warning: SetProcessWorkingSetSize failed: %s\n",
+                        llama_format_win_err(GetLastError()).c_str());
+                return false;
+            }
+        }
+    }
+
+    void raw_unlock(void * addr, size_t size) {
+        if (!VirtualUnlock(addr, size)) {
+            fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n",
+                    llama_format_win_err(GetLastError()).c_str());
+        }
+    }
+#else
+    static constexpr bool SUPPORTED = false;
+
+    // No page-size query is available here; any power of two works because
+    // raw_lock() below never locks anything.
+    size_t lock_granularity() {
+        return (size_t) 65536;
+    }
+
+    // Always fails (after warning), so grow_to() sets failed_already and the
+    // warning is printed only once.
+    bool raw_lock(const void * /*addr*/, size_t /*size*/) {
+        fprintf(stderr, "warning: mlock not supported on this system\n");
+        return false;
+    }
+
+    void raw_unlock(const void * /*addr*/, size_t /*size*/) {}
+#endif
+};
+
+// Replacement for std::vector<uint8_t> that doesn't require zero-initialization.
+// Owns a heap array released in the destructor. Contents are NOT preserved
+// across resize(): the old array is freed and a fresh, uninitialized one is
+// allocated.
+struct llama_buffer {
+    uint8_t * addr = NULL;
+    size_t size = 0;
+
+    llama_buffer() = default;
+
+    // The object owns `addr`; an implicit copy would make two destructors
+    // delete[] the same array.
+    llama_buffer(const llama_buffer &) = delete;
+    llama_buffer & operator=(const llama_buffer &) = delete;
+
+    // Replaces the buffer with a new uninitialized one of `size` bytes.
+    void resize(size_t size) {
+        delete[] addr;
+        // Reset first: if the allocation below throws, the destructor must not
+        // see the pointer that was just freed.
+        addr = NULL;
+        this->size = 0;
+        addr = new uint8_t[size];
+        this->size = size;
+    }
+
+    ~llama_buffer() {
+        delete[] addr;
+    }
+};
+#endif
--- a/migrate-ggml-2023-03-30-pr613.py
+++ b/migrate-ggml-2023-03-30-pr613.py
@@ -1,311 +0,0 @@
-# Migrate ggml file(s) with ggmf magic to ggml file with ggjt magic
-#
-# We caused a breaking change to the file format on 2023-03-30 in:
-#     https://github.com/ggerganov/llama.cpp/pull/613
-#
-# (1) If you still have the Meta LLaMA .pth files, then close this
-#     file now; you can just run `convert-pth-to-ggml.py` again to
-#     migrate to the new format. The tool is easier to use too. It
-#     isn't necessary anymore to manage split output files because
-#     the new format always combines things into a single file.
-#
-# (2) If you deleted the Meta LLaMA .pth files due to save on disk
-#     space, then this tool is intended to help you.  Please check
-#     out the instructions below.
-#
-# USAGE
-#
-#     python migrate-ggml-2023-03-30-pr613.py INPUT OUTPUT
-#
-# PREREQUISITES
-#
-#     pip install numpy
-#     cd llama.cpp
-#     make -j4
-#
-# EXAMPLE (7B MODEL)
-#
-#     # you can replace all the 'f16' with 'q4_0' if you're using quantized weights
-#     python migrate-ggml-2023-03-30-pr613.py models/7B/ggml-model-f16.bin models/7B/ggml-model-f16-ggjt.bin
-#
-#     # check that it works
-#     ./main -m models/7B/ggml-model-f16-ggjt.bin -p 'Question: Do you love me?'
-#
-#     # you can delete the old files
-#     rm -f models/7B/ggml-model-f16.bin
-#     mv models/7B/ggml-model-f16-ggjt.bin models/7B/ggml-model-f16.bin
-#
-# EXAMPLE (13B MODEL)
-#
-#     # you can replace all the 'f16' with 'q4_0' if you're using quantized weights
-#     python migrate-ggml-2023-03-30-pr613.py models/13B/ggml-model-f16.bin models/13B/ggml-model-f16-ggjt.bin
-#
-#     # check that it works
-#     ./main -m models/13B/ggml-model-f16-ggjt.bin -p 'Question: Do you love me?'
-#
-#     # you can delete the old files
-#     rm -f models/13B/ggml-model-f16.bin*
-#     mv models/13B/ggml-model-f16-ggjt.bin models/13B/ggml-model-f16.bin
-#
-
-import argparse
-import os
-import sys
-import json
-import struct
-import numpy as np
-
-QK = 32
-
-GGML_TYPE_Q4_0  = 0
-GGML_TYPE_Q4_1  = 1
-GGML_TYPE_I8    = 2
-GGML_TYPE_I16   = 3
-GGML_TYPE_I32   = 4
-GGML_TYPE_F16   = 5
-GGML_TYPE_F32   = 6
-
-WTYPE_NAMES = {
-    0: "F32",
-    1: "F16",
-    2: "Q4_0",
-    3: "Q4_1",
-}
-
-WTYPES = {
-    0: GGML_TYPE_F32,
-    1: GGML_TYPE_F16,
-    2: GGML_TYPE_Q4_0,
-    3: GGML_TYPE_Q4_1,
-}
-
-GGML_BLCK_SIZE = {
-    GGML_TYPE_Q4_0:  QK,
-    GGML_TYPE_Q4_1:  QK,
-    GGML_TYPE_I8:    1,
-    GGML_TYPE_I16:   1,
-    GGML_TYPE_I32:   1,
-    GGML_TYPE_F16:   1,
-    GGML_TYPE_F32:   1,
-}
-
-GGML_TYPE_SIZE = {
-    GGML_TYPE_Q4_0: 4   + QK//2,
-    GGML_TYPE_Q4_1: 4*2 + QK//2,
-    GGML_TYPE_I8:   1,
-    GGML_TYPE_I16:  2,
-    GGML_TYPE_I32:  4,
-    GGML_TYPE_F16:  2,
-    GGML_TYPE_F32:  4,
-}
-
-HPARAMS = [
-    'magic',    # int32
-    'version',  # int32
-    'n_vocab',  # int32
-    'n_embd',   # int32
-    'n_mult',   # int32
-    'n_head',   # int32
-    'n_layer',  # int32
-    'n_rot',    # int32
-    'f16',      # int32
-]
-
-def read_hparams(fin):
-    struct_fmt = "i" * len(HPARAMS)
-    struct_size = struct.calcsize(struct_fmt)
-    buf = fin.read(struct_size)
-    ints = struct.unpack(struct_fmt, buf)
-    hparams = dict(zip(HPARAMS, ints))
-    return hparams
-
-def write_hparams(fout, hparams):
-    struct_fmt = "i" * len(HPARAMS)
-    struct_size = struct.calcsize(struct_fmt)
-    ints = [hparams[h] for h in HPARAMS]
-    fout.write(struct.pack(struct_fmt, *ints))
-
-def read_tokens(fin, hparams):
-    tokens = []
-    for i in range(hparams['n_vocab']):
-        len_b = fin.read(4)
-        (length,) = struct.unpack("i", len_b)
-        word = fin.read(length)
-        score_b = fin.read(4)
-        (score,) = struct.unpack("f", score_b)
-        tokens.append((word, score))
-    return tokens
-
-def write_tokens(fout, tokens):
-    for word, score in tokens:
-        fout.write(struct.pack("i", len(word)))
-        fout.write(word)
-        fout.write(struct.pack("f", score))
-
-def ggml_nelements(shape):
-    r = 1
-    for i in shape:
-        r *= i
-    return r
-
-def ggml_nbytes(shape, ftype):
-    x = ggml_nelements(shape)
-    t = WTYPES[ftype]
-    x *= GGML_TYPE_SIZE[t]
-    x //= GGML_BLCK_SIZE[t]
-    return x
-
-def copy_tensors(fin, fout, part_id, n_parts):
-    while True:
-
-        b = fin.read(4)
-        if not b: break
-        (n_dims,) = struct.unpack("i", b)
-        b = fin.read(4)
-        (length,) = struct.unpack("i", b)
-        b = fin.read(4)
-        (ftype,) = struct.unpack("i", b)
-
-        assert n_dims in (1, 2)
-
-        partshape = list(range(n_dims))
-        for i in range(n_dims):
-            b = fin.read(4)
-            partshape[i] = struct.unpack("i", b)[0]
-        partshape = list(reversed(partshape))
-
-        name = fin.read(length)
-        data = fin.read(ggml_nbytes(partshape, ftype))
-
-        blck_size = GGML_BLCK_SIZE[WTYPES[ftype]]
-        type_size = GGML_TYPE_SIZE[WTYPES[ftype]]
-
-        print(f"Processing tensor {name} with shape: {partshape} and type: {WTYPE_NAMES[ftype]}")
-
-        # determine dimension along which multipart tensor is sharded
-        #
-        # split_dim 0 regex:
-        #   - output.*
-        #   - layers.*.attention.wq.weight
-        #   - layers.*.attention.wk.weight
-        #   - layers.*.attention.wv.weight
-        #   - layers.*.feed_forward.w1.weight
-        #   - layers.*.feed_forward.w3.weight
-        #
-        # split_dim 1 regex:
-        #   - tok_embeddings.*
-        #   - layers.*.attention.wo.weight
-        #   - layers.*.feed_forward.w2.weight
-        #
-        if n_dims > 1:
-            split_dim = 1
-            if b"tok_embeddings" in name:
-                split_dim = 1
-            elif b"layers" in name:
-                if b"attention.wo.weight" in name:
-                    split_dim = 1
-                elif b"feed_forward.w2.weight" in name:
-                    split_dim = 1
-                else:
-                    split_dim = 0
-            elif b"output" in name:
-                split_dim = 0
-
-        # output tensor header
-        fullshape = list(partshape)
-        if n_dims > 1:
-            fullshape[split_dim] *= n_parts
-        fout.write(struct.pack("iii", n_dims, len(name), ftype))
-        for dim in reversed(fullshape):
-            fout.write(struct.pack("i", dim))
-        fout.write(name)
-
-        # ensure tensor data is aligned
-        tensor_data_offset = fout.tell()
-        while tensor_data_offset % QK != 0:
-            fout.write(struct.pack("B", 0))
-            tensor_data_offset += 1
-
-        # output unified mappable tensor data
-        if n_dims == 1 or n_parts == 1:
-            # copy tensor which we thankfully received in one piece
-            if part_id == 0:
-                fout.write(data)
-        elif split_dim == 0:
-            # reassemble multifile tensor containing some of the rows
-            rows_per_chunk = partshape[0]
-            current_row = part_id * rows_per_chunk
-            bytes_per_row = fullshape[1] // blck_size * type_size
-            offset = current_row * bytes_per_row
-            fout.seek(tensor_data_offset + offset)
-            fout.write(data)
-        elif split_dim == 1:
-            # reassemble multifile tensor containing some of the cols
-            cols_per_chunk = partshape[1]
-            current_col = part_id * cols_per_chunk
-            bpr = partshape[1] // blck_size * type_size
-            bytes_per_row = fullshape[1] // blck_size * type_size
-            offset_current_col = current_col // blck_size * type_size
-            for row in range(partshape[0]):
-                offset_row = row * bytes_per_row
-                offset = offset_row + offset_current_col
-                fout.seek(tensor_data_offset + offset)
-                fout.write(data[row * bpr:row * bpr + bpr])
-
-        # advance file position to next tensor
-        fout.seek(tensor_data_offset + ggml_nbytes(fullshape, ftype))
-
-def parse_args():
-    parser = argparse.ArgumentParser(description='Migrate from GGML to new GGJT file format')
-    parser.add_argument('fin_path', help='your old ggml file (leave out the .1 .2 etc.)')
-    parser.add_argument('fout_path', help='your new ggjt file name')
-    return parser.parse_args()
-
-def main():
-    args = parse_args()
-    assert args.fin_path
-    assert args.fout_path
-    assert args.fin_path != args.fout_path
-
-    with open(args.fin_path, "rb") as fin:
-        hparams = read_hparams(fin)
-        tokens = read_tokens(fin, hparams)
-
-    if hparams['magic'] == 0x67676a74:  # ggjt
-        print(f"{args.fin_path}: input ggml has already been converted to 'ggjt' magic\n")
-        sys.exit(1)
-
-    if hparams['magic'] != 0x67676d66:  # ggmf
-        print(f"{args.fin_path}: input ggml file doesn't have expected 'ggmf' magic: {hparams['magic']:#x}\n")
-        sys.exit(1)
-
-    hparams['magic'] = 0x67676a74  # ggjt
-
-    # count number of multipart files by convention
-    n_parts = 1
-    while True:
-        if os.path.exists(f"{args.fin_path}.{n_parts}"):
-            n_parts += 1
-        else:
-            break
-
-    # we output a single file for ggml
-    with open(args.fout_path, "wb") as fout:
-        write_hparams(fout, hparams)
-        write_tokens(fout, tokens)
-        offset_of_tensors = fout.tell()
-        # the tensors we load could be split across multiple files
-        for part_id in range(n_parts):
-            fout.seek(offset_of_tensors)
-            print(f"Processing part {part_id+1} of {n_parts}\n")
-            fin_path = args.fin_path
-            if part_id > 0:
-                fin_path += f".{part_id}"
-            with open(fin_path, "rb") as fin:
-                read_tokens(fin, read_hparams(fin))
-                copy_tensors(fin, fout, part_id, n_parts)
-
-    print(f"Done. Output file: {args.fout_path}\n")
-
-if __name__ == "__main__":
-    main()
--- a/pocs/CMakeLists.txt
+++ b/pocs/CMakeLists.txt
@@ -0,0 +1,12 @@
+# Required packages
+
+find_package(Threads REQUIRED)
+
+# Make this directory available as an include path
+
+include_directories(${CMAKE_CURRENT_SOURCE_DIR})
+
+# The vdot proof-of-concept is skipped for Emscripten builds
+if (NOT EMSCRIPTEN)
+    add_subdirectory(vdot)
+endif()
--- a/pocs/vdot/CMakeLists.txt
+++ b/pocs/vdot/CMakeLists.txt
@@ -0,0 +1,9 @@
+# vdot executable, built from vdot.cpp
+set(TARGET vdot)
+add_executable(${TARGET} vdot.cpp)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
+
+# q8dot executable, built from q8dot.cpp with the same link and language settings
+set(TARGET q8dot)
+add_executable(${TARGET} q8dot.cpp)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- a/pocs/vdot/q8dot.cpp
+++ b/pocs/vdot/q8dot.cpp
@@ -0,0 +1,172 @@
+#include <cstdio>
+#include <type_traits>
+#include <vector>
+#include <random>
+#include <chrono>
+#include <cstdlib>
+#include <cmath>
+#include <cassert>
+#include <cstring>
+#include <array>
+#include <type_traits>
+
+#include <ggml.h>
+
+constexpr int kVecSize = 1 << 16;
+
+// Copy-pasted from ggml.c
+// One block of QK4_0 4-bit quants that share a single scale `d`.
+// Two quants are packed per byte of `qs` (low nibble, then high nibble).
+// simpleDot() in this file treats each nibble q as the value d * (q - 8).
+#define QK4_0 32
+typedef struct {
+    float   d;          // delta
+    uint8_t qs[QK4_0 / 2];  // nibbles / quants
+} block_q4_0;
+// Guards against padding: the code relies on the exact packed layout.
+static_assert(sizeof(block_q4_0) == sizeof(float) + QK4_0 / 2, "wrong q4_0 block size/padding");
+
+// One block of QK4_1 4-bit quants with a scale `d` and an offset `m`.
+// Two quants are packed per byte of `qs` (low nibble, then high nibble).
+// simpleDot() in this file treats each nibble q as the value d * q + m.
+#define QK4_1 32
+typedef struct {
+    float   d;          // delta
+    float   m;          // min
+    uint8_t qs[QK4_1 / 2];  // nibbles / quants
+} block_q4_1;
+// Guards against padding: the code relies on the exact packed layout.
+static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
+
+// Copy-pasted from ggml.c
+// One block of QK8_0 signed 8-bit quants with scale `d`.
+// `s` caches d * sum(qs) so that the dot products in this file can apply the
+// q4 offset term without re-summing the quants (see simpleDot()).
+#define QK8_0 32
+typedef struct {
+    float   d;          // delta
+    float   s;          // d * sum(qs[i])
+    int8_t  qs[QK8_0];  // quants
+} block_q8_0;
+// Guards against padding: the code relies on the exact packed layout.
+static_assert(sizeof(block_q8_0) == 2*sizeof(float) + QK8_0, "wrong q8_0 block size/padding");
+
+static_assert(QK4_1 == QK8_0, "QK4_1 and QK8_0 must be the same");
+static_assert(QK4_0 == QK8_0, "QK4_0 and QK8_0 must be the same");
+
+// Fills every block with a unit scale and random 4-bit quants.
+// T is a q4 block type exposing `d` and a `qs` array of QK4_1/2 packed bytes.
+template <typename T>
+void fillQ4blocks(std::vector<T>& blocks, std::mt19937& rndm) {
+    constexpr int kPackedBytes = QK4_1/2;
+    for (T& block : blocks) {
+        block.d = 1;
+        for (int idx = 0; idx < kPackedBytes; ++idx) {
+            // Keep only the top 4 bits of each 32-bit draw.
+            // Draw order is part of the behaviour: low nibble first, then high nibble.
+            const uint8_t lo = rndm() >> 28;
+            const uint8_t hi = rndm() >> 28;
+            block.qs[idx] = lo | (hi << 4);
+        }
+    }
+}
+
+// Fills every q8_0 block with a unit scale and random signed 8-bit quants,
+// and stores d * sum(quants) in `s`.
+void fillQ80blocks(std::vector<block_q8_0>& blocks, std::mt19937& rndm) {
+    for (block_q8_0& block : blocks) {
+        block.d = 1;
+        int total = 0;
+        for (int8_t& quant : block.qs) {
+            // Top 8 bits of the draw, shifted from [0, 255] down by 128.
+            quant = (rndm() >> 24) - 128;
+            total += quant;
+        }
+        block.s = block.d * total;
+    }
+}
+
+// Reference dot product of one q4_0 block with one q8_0 block.
+// Accumulates the raw nibble * quant products as an integer and applies the
+// q4_0 offset of -8 afterwards through the precomputed y.s = y.d * sum(y.qs).
+float simpleDot(const block_q4_0& x, const block_q8_0& y) {
+    int acc = 0;
+    for (int byte = 0; byte < QK4_1/2; ++byte) {
+        // Each packed byte holds two quants: low nibble pairs with y.qs[2*byte],
+        // high nibble with y.qs[2*byte + 1].
+        const int lo = x.qs[byte] & 0xf;
+        const int hi = x.qs[byte] >> 4;
+        acc += lo*y.qs[2*byte] + hi*y.qs[2*byte + 1];
+    }
+    return y.d * x.d * acc - 8 * x.d * y.s;
+}
+
+// Reference dot product of one q4_1 block with one q8_0 block.
+// Accumulates the raw nibble * quant products as an integer and applies the
+// q4_1 offset `m` afterwards through the precomputed y.s = y.d * sum(y.qs).
+float simpleDot(const block_q4_1& x, const block_q8_0& y) {
+    int acc = 0;
+    for (int byte = 0; byte < QK4_1/2; ++byte) {
+        // Each packed byte holds two quants: low nibble pairs with y.qs[2*byte],
+        // high nibble with y.qs[2*byte + 1].
+        const int lo = x.qs[byte] & 0xf;
+        const int hi = x.qs[byte] >> 4;
+        acc += lo*y.qs[2*byte] + hi*y.qs[2*byte + 1];
+    }
+    return y.d * x.d * acc + y.s * x.m;
+}
+
+// Accumulates dot-product results and per-iteration timings for one implementation.
+struct Stat {
+    double sum = 0, sumt = 0, sumt2 = 0, maxt = 0;
+    int nloop = 0;
+    // Records one iteration: `s` is the computed result, `t` the elapsed time
+    // (main() passes microseconds).
+    void addResult(double s, double t) {
+        sum += s;
+        sumt += t; sumt2 += t*t; maxt = std::max(maxt, t);
+        ++nloop;
+    }
+    // Prints the mean result plus mean, spread and maximum of the recorded times,
+    // or a "no result" line if nothing was recorded.
+    void reportResult(const char* title) const {
+        if (nloop < 1) {
+            printf("%s(%s): no result\n",__func__,title);
+            return;
+        }
+        printf("============ %s\n",title);
+        printf("<dot> = %g\n",sum/nloop);
+        const double t = sumt/nloop;
+        // E[t^2] - E[t]^2 can come out slightly negative through rounding when all
+        // samples are (nearly) equal; clamp so the reported spread is never negative.
+        const double var = sumt2/nloop - t*t;
+        const double dt = var > 0 ? sqrt(var) : 0;
+        printf("<time> = %g +/- %g us. Max. time = %g us.\n",t,dt,maxt);
+    }
+};
+
+
+// Benchmark driver comparing the scalar simpleDot() reference against the
+// vec_dot_q function obtained from ggml for the same data.
+// Usage: q8dot [nloop [type]]
+//   nloop - number of iterations (default 10)
+//   type  - 0 selects q4_0 blocks, any other value selects q4_1 (default 1)
+int main(int argc, char** argv) {
+
+    int nloop = argc > 1 ? atoi(argv[1]) : 10;
+    int type  = argc > 2 ? atoi(argv[2]) : 1;
+
+    // Fixed seed so every run sees the same data.
+    std::mt19937 rndm(1234);
+
+    // Only the vector matching `type` is sized; the other stays empty.
+    std::vector<block_q4_1> x41;
+    std::vector<block_q4_0> x40;
+    std::vector<block_q8_0> y(kVecSize);
+    if (type == 0) x40.resize(kVecSize);
+    else {
+        x41.resize(kVecSize);
+        // fillQ4blocks() does not touch `m`, so set it once here.
+        for (auto& b : x41) b.m = 1;
+    }
+
+    auto ggml_type = type == 0 ? GGML_TYPE_Q4_0 : GGML_TYPE_Q4_1;
+
+    auto funcs = ggml_internal_get_quantize_fn(ggml_type);
+
+    Stat simple, ggml;
+
+    for (int iloop=0; iloop<nloop; ++iloop) {
+
+        // Fresh random data each iteration; the draw order (x first, then y) is fixed.
+        if (type == 0) fillQ4blocks(x40, rndm);
+        else fillQ4blocks(x41, rndm);
+        fillQ80blocks(y, rndm);
+
+        // Time the scalar reference, block by block.
+        auto t1 = std::chrono::high_resolution_clock::now();
+        double s = 0;
+        if (type == 0) for (int i=0; i<kVecSize; ++i) s += simpleDot(x40[i], y[i]);
+        else for (int i=0; i<kVecSize; ++i) s += simpleDot(x41[i], y[i]);
+        auto t2 = std::chrono::high_resolution_clock::now();
+        // Nanoseconds scaled by 1e-3, i.e. microseconds.
+        auto t = 1e-3*std::chrono::duration_cast<std::chrono::nanoseconds>(t2-t1).count();
+        // The first four iterations (iloop 0..3) are not recorded.
+        if (iloop > 3) simple.addResult(s, t);
+
+        // Time the ggml implementation over the whole vector in one call.
+        t1 = std::chrono::high_resolution_clock::now();
+        float fs;
+        if (type == 0) funcs.vec_dot_q(kVecSize * QK4_1, &fs, x40.data(), y.data());
+        else funcs.vec_dot_q(kVecSize * QK4_1, &fs, x41.data(), y.data());
+        t2 = std::chrono::high_resolution_clock::now();
+        t = 1e-3*std::chrono::duration_cast<std::chrono::nanoseconds>(t2-t1).count();
+        if (iloop > 3) ggml.addResult(fs, t);
+
+    }
+
+    // Report the time (and the average of the dot products so the compiler does not come up with the idea
+    // of optimizing away the function calls after figuring that the result is not used).
+    simple.reportResult("Simple");
+    ggml.reportResult("ggml");
+    return 0;
+}
--- a/pocs/vdot/vdot.cpp
+++ b/pocs/vdot/vdot.cpp
@@ -0,0 +1,305 @@
+#include <cstdio>
+#include <vector>
+#include <random>
+#include <chrono>
+#include <cstdlib>
+#include <cmath>
+#include <cassert>
+#include <cstring>
+#include <array>
+
+#include <ggml.h>
+
+// Number of float values in each test vector (2^18); main() sizes x1 and y1 with it.
+constexpr int kVecSize = 1 << 18;
+
+// Returns one normally distributed sample (zero mean, unit variance) using the
+// Box-Muller transform. Each transform yields two samples: the cosine one is
+// returned immediately and the sine one is kept in function-local static storage
+// and handed out by the next call, so the generator is only advanced on every
+// second call. The static cache is shared by all callers of this function.
+float drawFromGaussianPdf(std::mt19937& rndm) {
+    constexpr double kInvRange = 1./(1. + std::mt19937::max());
+    constexpr double kAngleScale = 6.28318530717958647692*kInvRange;
+    static float cached;
+    static bool hasCached = false;
+    if (!hasCached) {
+        // 1 - kInvRange*rndm() lies in (0, 1], so the logarithm is finite.
+        const double radius = sqrt(-2*log(1 - kInvRange*rndm()));
+        const double angle = kAngleScale * rndm();
+        cached = radius*sin(angle);
+        hasCached = true;
+        return radius*cos(angle);
+    }
+    hasCached = false;
+    return cached;
+}
+// Overwrites every element of values with mean plus a fresh Gaussian sample.
+void fillRandomGaussianFloats(std::vector<float>& values, std::mt19937& rndm, float mean = 0) {
+    for (size_t i = 0; i < values.size(); ++i) {
+        values[i] = mean + drawFromGaussianPdf(rndm);
+    }
+}
+
+// Copy-pasted from ggml.c
+// Q4_0 block: QK4_0 values that share one float scale, stored as two 4-bit quants per byte.
+// The dot helpers in this file decode a nibble q as (q - 8) * d.
+#define QK4_0 32
+typedef struct {
+    float   d;          // delta
+    uint8_t qs[QK4_0 / 2];  // nibbles / quants
+} block_q4_0;
+// Guards against padding: the struct must be exactly one float plus the packed nibbles.
+static_assert(sizeof(block_q4_0) == sizeof(float) + QK4_0 / 2, "wrong q4_0 block size/padding");
+
+// Q4_1 block: QK4_1 values stored as two 4-bit quants per byte plus a scale and a minimum.
+// dot41() in this file decodes a nibble q (0..15) as q * d + m.
+#define QK4_1 32
+typedef struct {
+    float   d;          // delta
+    float   m;          // min
+    uint8_t qs[QK4_1 / 2];  // nibbles / quants
+} block_q4_1;
+// Guards against padding: the struct must be exactly two floats plus the packed nibbles.
+static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
+
+// Copy-pasted from ggml.c
+// Q8_0 block: QK8_0 signed 8-bit quants that share one float scale.
+// quantize_row_q8_0_reference() in this file sets d = amax / 127 and qs[l] = round(x[l] / d).
+#define QK8_0 32
+typedef struct {
+    float   d;          // delta
+    int8_t  qs[QK8_0];  // quants
+} block_q8_0;
+// Guards against padding: the struct must be exactly one float plus QK8_0 bytes.
+static_assert(sizeof(block_q8_0) == sizeof(float) + QK8_0, "wrong q8_0 block size/padding");
+
+// "Scalar" dot product between the quantized vector x and float vector y.
+//
+//   n - number of Q4_0 blocks in x
+//   x - pointer to the first of n blocks
+//   y - pointer to n*QK4_0 floats (8 floats are consumed per 4 packed bytes)
+//
+// Returns the sum over all blocks of d * sum(y[j] * (q[j] - 8)), accumulated per
+// block in float and across blocks in double.
+inline double dot(int n, const block_q4_0* x, const float* y) {
+    const static float kValues[16] = {-8.f, -7.f, -6.f, -5.f, -4.f, -3.f, -2.f, -1.f, 0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f};
+    constexpr uint32_t kMask1 = 0x0f0f0f0f;
+    uint32_t u1, u2;
+    // Byte views of u1/u2: after masking, each byte holds one nibble value in 0..15.
+    auto q1 = (const uint8_t*)&u1;
+    auto q2 = (const uint8_t*)&u2;
+    double sum = 0;
+    for (int i=0; i<n; ++i) {
+        float d = x->d;
+        float s = 0;
+        for (int k=0; k<4; ++k) {
+            // Load 4 packed bytes with memcpy instead of casting qs to uint32_t*:
+            // accessing a uint8_t array through a uint32_t lvalue violates strict aliasing.
+            uint32_t u;
+            std::memcpy(&u, x->qs + 4*k, sizeof(u));
+            u1 = u & kMask1;          // low nibble of every byte
+            u2 = (u >> 4) & kMask1;   // high nibble of every byte
+            s += y[0]*kValues[q1[0]] + y[1]*kValues[q2[0]] +
+                 y[2]*kValues[q1[1]] + y[3]*kValues[q2[1]] +
+                 y[4]*kValues[q1[2]] + y[5]*kValues[q2[2]] +
+                 y[6]*kValues[q1[3]] + y[7]*kValues[q2[3]];
+            y += 8;
+        }
+        sum += s*d;
+        ++x;
+    }
+    return sum;
+}
+// Alternative version of the above. Faster on my Mac (~45 us vs ~55 us per dot product),
+// but about the same on X86_64 (Ryzen 7950X CPU).
+//
+// Instead of masking nibbles, every packed byte b is looked up in a 256-entry table
+// that holds both decoded values at once: kValues[b] = { (b & 0xf) - 8, (b >> 4) - 8 }.
+//
+//   n - number of Q4_0 blocks in x
+//   x - pointer to the first of n blocks
+//   y - pointer to n*QK4_0 floats
+//
+// Returns the sum over all blocks of d * sum(y[j] * decoded[j]).
+inline double dot3(int n, const block_q4_0* x, const float* y) {
+    // .first is the value of the low nibble, .second the value of the high nibble.
+    const static std::pair<float,float> kValues[256] = {
+        {-8.f, -8.f}, {-7.f, -8.f}, {-6.f, -8.f}, {-5.f, -8.f}, {-4.f, -8.f}, {-3.f, -8.f}, {-2.f, -8.f}, {-1.f, -8.f},
+        { 0.f, -8.f}, { 1.f, -8.f}, { 2.f, -8.f}, { 3.f, -8.f}, { 4.f, -8.f}, { 5.f, -8.f}, { 6.f, -8.f}, { 7.f, -8.f},
+        {-8.f, -7.f}, {-7.f, -7.f}, {-6.f, -7.f}, {-5.f, -7.f}, {-4.f, -7.f}, {-3.f, -7.f}, {-2.f, -7.f}, {-1.f, -7.f},
+        { 0.f, -7.f}, { 1.f, -7.f}, { 2.f, -7.f}, { 3.f, -7.f}, { 4.f, -7.f}, { 5.f, -7.f}, { 6.f, -7.f}, { 7.f, -7.f},
+        {-8.f, -6.f}, {-7.f, -6.f}, {-6.f, -6.f}, {-5.f, -6.f}, {-4.f, -6.f}, {-3.f, -6.f}, {-2.f, -6.f}, {-1.f, -6.f},
+        { 0.f, -6.f}, { 1.f, -6.f}, { 2.f, -6.f}, { 3.f, -6.f}, { 4.f, -6.f}, { 5.f, -6.f}, { 6.f, -6.f}, { 7.f, -6.f},
+        {-8.f, -5.f}, {-7.f, -5.f}, {-6.f, -5.f}, {-5.f, -5.f}, {-4.f, -5.f}, {-3.f, -5.f}, {-2.f, -5.f}, {-1.f, -5.f},
+        { 0.f, -5.f}, { 1.f, -5.f}, { 2.f, -5.f}, { 3.f, -5.f}, { 4.f, -5.f}, { 5.f, -5.f}, { 6.f, -5.f}, { 7.f, -5.f},
+        {-8.f, -4.f}, {-7.f, -4.f}, {-6.f, -4.f}, {-5.f, -4.f}, {-4.f, -4.f}, {-3.f, -4.f}, {-2.f, -4.f}, {-1.f, -4.f},
+        { 0.f, -4.f}, { 1.f, -4.f}, { 2.f, -4.f}, { 3.f, -4.f}, { 4.f, -4.f}, { 5.f, -4.f}, { 6.f, -4.f}, { 7.f, -4.f},
+        {-8.f, -3.f}, {-7.f, -3.f}, {-6.f, -3.f}, {-5.f, -3.f}, {-4.f, -3.f}, {-3.f, -3.f}, {-2.f, -3.f}, {-1.f, -3.f},
+        { 0.f, -3.f}, { 1.f, -3.f}, { 2.f, -3.f}, { 3.f, -3.f}, { 4.f, -3.f}, { 5.f, -3.f}, { 6.f, -3.f}, { 7.f, -3.f},
+        {-8.f, -2.f}, {-7.f, -2.f}, {-6.f, -2.f}, {-5.f, -2.f}, {-4.f, -2.f}, {-3.f, -2.f}, {-2.f, -2.f}, {-1.f, -2.f},
+        { 0.f, -2.f}, { 1.f, -2.f}, { 2.f, -2.f}, { 3.f, -2.f}, { 4.f, -2.f}, { 5.f, -2.f}, { 6.f, -2.f}, { 7.f, -2.f},
+        {-8.f, -1.f}, {-7.f, -1.f}, {-6.f, -1.f}, {-5.f, -1.f}, {-4.f, -1.f}, {-3.f, -1.f}, {-2.f, -1.f}, {-1.f, -1.f},
+        { 0.f, -1.f}, { 1.f, -1.f}, { 2.f, -1.f}, { 3.f, -1.f}, { 4.f, -1.f}, { 5.f, -1.f}, { 6.f, -1.f}, { 7.f, -1.f},
+        {-8.f,  0.f}, {-7.f,  0.f}, {-6.f,  0.f}, {-5.f,  0.f}, {-4.f,  0.f}, {-3.f,  0.f}, {-2.f,  0.f}, {-1.f,  0.f},
+        { 0.f,  0.f}, { 1.f,  0.f}, { 2.f,  0.f}, { 3.f,  0.f}, { 4.f,  0.f}, { 5.f,  0.f}, { 6.f,  0.f}, { 7.f,  0.f},
+        {-8.f,  1.f}, {-7.f,  1.f}, {-6.f,  1.f}, {-5.f,  1.f}, {-4.f,  1.f}, {-3.f,  1.f}, {-2.f,  1.f}, {-1.f,  1.f},
+        { 0.f,  1.f}, { 1.f,  1.f}, { 2.f,  1.f}, { 3.f,  1.f}, { 4.f,  1.f}, { 5.f,  1.f}, { 6.f,  1.f}, { 7.f,  1.f},
+        {-8.f,  2.f}, {-7.f,  2.f}, {-6.f,  2.f}, {-5.f,  2.f}, {-4.f,  2.f}, {-3.f,  2.f}, {-2.f,  2.f}, {-1.f,  2.f},
+        { 0.f,  2.f}, { 1.f,  2.f}, { 2.f,  2.f}, { 3.f,  2.f}, { 4.f,  2.f}, { 5.f,  2.f}, { 6.f,  2.f}, { 7.f,  2.f},
+        {-8.f,  3.f}, {-7.f,  3.f}, {-6.f,  3.f}, {-5.f,  3.f}, {-4.f,  3.f}, {-3.f,  3.f}, {-2.f,  3.f}, {-1.f,  3.f},
+        { 0.f,  3.f}, { 1.f,  3.f}, { 2.f,  3.f}, { 3.f,  3.f}, { 4.f,  3.f}, { 5.f,  3.f}, { 6.f,  3.f}, { 7.f,  3.f},
+        {-8.f,  4.f}, {-7.f,  4.f}, {-6.f,  4.f}, {-5.f,  4.f}, {-4.f,  4.f}, {-3.f,  4.f}, {-2.f,  4.f}, {-1.f,  4.f},
+        { 0.f,  4.f}, { 1.f,  4.f}, { 2.f,  4.f}, { 3.f,  4.f}, { 4.f,  4.f}, { 5.f,  4.f}, { 6.f,  4.f}, { 7.f,  4.f},
+        {-8.f,  5.f}, {-7.f,  5.f}, {-6.f,  5.f}, {-5.f,  5.f}, {-4.f,  5.f}, {-3.f,  5.f}, {-2.f,  5.f}, {-1.f,  5.f},
+        { 0.f,  5.f}, { 1.f,  5.f}, { 2.f,  5.f}, { 3.f,  5.f}, { 4.f,  5.f}, { 5.f,  5.f}, { 6.f,  5.f}, { 7.f,  5.f},
+        {-8.f,  6.f}, {-7.f,  6.f}, {-6.f,  6.f}, {-5.f,  6.f}, {-4.f,  6.f}, {-3.f,  6.f}, {-2.f,  6.f}, {-1.f,  6.f},
+        { 0.f,  6.f}, { 1.f,  6.f}, { 2.f,  6.f}, { 3.f,  6.f}, { 4.f,  6.f}, { 5.f,  6.f}, { 6.f,  6.f}, { 7.f,  6.f},
+        {-8.f,  7.f}, {-7.f,  7.f}, {-6.f,  7.f}, {-5.f,  7.f}, {-4.f,  7.f}, {-3.f,  7.f}, {-2.f,  7.f}, {-1.f,  7.f},
+        { 0.f,  7.f}, { 1.f,  7.f}, { 2.f,  7.f}, { 3.f,  7.f}, { 4.f,  7.f}, { 5.f,  7.f}, { 6.f,  7.f}, { 7.f,  7.f}
+    };
+    double sum = 0;
+    for (int i=0; i<n; ++i) {
+        float d = x->d;
+        auto q = x->qs;
+        float s = 0;
+        // 4 iterations x 4 packed bytes = the 16 bytes (32 values) of one block.
+        for (int k=0; k<4; ++k) {
+            s += y[0]*kValues[q[0]].first + y[1]*kValues[q[0]].second +
+                 y[2]*kValues[q[1]].first + y[3]*kValues[q[1]].second +
+                 y[4]*kValues[q[2]].first + y[5]*kValues[q[2]].second +
+                 y[6]*kValues[q[3]].first + y[7]*kValues[q[3]].second;
+            y += 8; q += 4;
+        }
+        // The block scale is applied once per block rather than per value.
+        sum += s*d;
+        ++x;
+    }
+    return sum;
+}
+
+// "Scalar" dot product between the Q4_1-quantized vector x and float vector y.
+//
+//   n - number of Q4_1 blocks in x
+//   x - pointer to the first of n blocks
+//   y - pointer to n*QK4_1 floats (8 floats are consumed per 4 packed bytes)
+//
+// A quant q decodes to q*d + m, so each block contributes
+// d * sum(y[j]*q[j]) + m * sum(y[j]); both partial sums are kept in float and the
+// total across blocks in double.
+inline double dot41(int n, const block_q4_1* x, const float* y) {
+    const static float kValues[16] = {0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f, 12.f, 13.f, 14.f, 15.f};
+    constexpr uint32_t kMask1 = 0x0f0f0f0f;
+    uint32_t u1, u2;
+    // Byte views of u1/u2: after masking, each byte holds one nibble value in 0..15.
+    auto q1 = (const uint8_t*)&u1;
+    auto q2 = (const uint8_t*)&u2;
+    double sum = 0;
+    for (int i=0; i<n; ++i) {
+        float s = 0, s1 = 0;
+        for (int k=0; k<4; ++k) {
+            // Load 4 packed bytes with memcpy instead of casting qs to uint32_t*:
+            // accessing a uint8_t array through a uint32_t lvalue violates strict aliasing.
+            uint32_t u;
+            std::memcpy(&u, x->qs + 4*k, sizeof(u));
+            u1 = u & kMask1;          // low nibble of every byte
+            u2 = (u >> 4) & kMask1;   // high nibble of every byte
+            s += y[0]*kValues[q1[0]] + y[1]*kValues[q2[0]] +
+                 y[2]*kValues[q1[1]] + y[3]*kValues[q2[1]] +
+                 y[4]*kValues[q1[2]] + y[5]*kValues[q2[2]] +
+                 y[6]*kValues[q1[3]] + y[7]*kValues[q2[3]];
+            s1 += y[0] + y[1] + y[2] + y[3] + y[4] + y[5] + y[6] + y[7];
+            y += 8;
+        }
+        sum += s*x->d + s1*x->m;
+        ++x;
+    }
+    return sum;
+}
+
+// Copy-pasted from ggml.c
+// Quantizes k floats from x into k / QK8_0 Q8_0 blocks written to y.
+// k must be a multiple of QK8_0 (asserted).
+static void quantize_row_q8_0_reference(const float *x, block_q8_0 *y, int k) {
+    assert(k % QK8_0 == 0);
+
+    const int blockCount = k / QK8_0;
+    for (int ib = 0; ib < blockCount; ++ib) {
+        const float * src = x + ib*QK8_0;
+        block_q8_0 & dst = y[ib];
+
+        // The largest magnitude in the block determines the scale.
+        float amax = 0.0f;
+        for (int l = 0; l < QK8_0; ++l) {
+            amax = std::max(amax, fabsf(src[l]));
+        }
+
+        // Scale so that amax maps to 127; an all-zero block gets a zero inverse scale
+        // to avoid dividing by zero.
+        const float d = amax / ((1 << 7) - 1);
+        const float id = d ? 1.0f/d : 0.0f;
+        dst.d = d;
+
+        for (int l = 0; l < QK8_0; ++l) {
+            dst.qs[l] = roundf(src[l]*id);
+        }
+    }
+}
+
+// Copy-pasted from ggml.c
+// Scalar dot product of n values stored as Q4_0 blocks (vx) and Q8_0 blocks (vy).
+// The result is written to *s. Only whole blocks (n / QK8_0) are processed.
+static void dot_q4_q8(const int n, float* s, const void* vx, const void* vy) {
+    const block_q4_0* xBlocks = (const block_q4_0*)vx;
+    const block_q8_0* yBlocks = (const block_q8_0*)vy;
+    const int blockCount = n / QK8_0;
+
+    float total = 0;
+    for (int ib = 0; ib < blockCount; ++ib) {
+        const block_q4_0& bx = xBlocks[ib];
+        const block_q8_0& by = yBlocks[ib];
+
+        // Integer accumulation over the values of this block.
+        int acc = 0;
+        for (int j = 0; j < QK8_0/2; ++j) {
+            const uint8_t packed = bx.qs[j];
+
+            // The low nibble pairs with element 2*j, the high nibble with element 2*j + 1;
+            // both nibbles are stored with an offset of 8.
+            const int lo = (int8_t) (packed & 0xf) - 8;
+            const int hi = (int8_t) (packed >> 4)  - 8;
+
+            acc += lo*by.qs[2*j + 0] + hi*by.qs[2*j + 1];
+        }
+        // Both block scales are applied once per block.
+        total += bx.d*by.d*acc;
+    }
+    *s = total;
+}
+
+// Benchmark driver.
+//
+// Command line: [nloop] [scalar] [useQ4_1]
+//   nloop   - number of iterations (default 10, must be at least 1)
+//   scalar  - non-zero: use the scalar Q8_0 quantizer / Q4_0 x Q8_0 dot product defined in this file
+//   useQ4_1 - non-zero: quantize x as Q4_1 instead of Q4_0 (not available together with scalar)
+//
+// Each iteration draws two Gaussian vectors, quantizes x, then times
+//   (a) the quantized-x times float-y dot product, and
+//   (b) quantizing y to Q8_0 followed by the quantized dot product.
+// Returns 0 on success, 1 on invalid arguments.
+int main(int argc, char** argv) {
+
+    int nloop = argc > 1 ? atoi(argv[1]) : 10;
+    bool scalar = argc > 2 ? atoi(argv[2]) : false;
+    bool useQ4_1 = argc > 3 ? atoi(argv[3]) : false;
+
+    // Every reported average divides by nloop, so at least one iteration is required.
+    if (nloop < 1) {
+        printf("The number of iterations must be at least 1\n");
+        return 1;
+    }
+
+    if (scalar && useQ4_1) {
+        printf("It is not possible to use Q4_1 quantization and scalar implementations\n");
+        return 1;
+    }
+
+    std::mt19937 rndm(1234);
+
+    std::vector<float> x1(kVecSize), y1(kVecSize);
+    // Block counts, rounded up to a multiple of 64.
+    int n4 = useQ4_1 ? kVecSize / QK4_1 : kVecSize / QK4_0; n4 = 64*((n4 + 63)/64);
+    int n8 = kVecSize / QK8_0; n8 = 64*((n8 + 63)/64);
+
+    auto funcs = useQ4_1 ? ggml_internal_get_quantize_fn(GGML_TYPE_Q4_1) : ggml_internal_get_quantize_fn(GGML_TYPE_Q4_0);
+
+    std::vector<block_q4_0> q40;
+    std::vector<block_q4_1> q41;
+    if (useQ4_1) q41.resize(n4);
+    else q40.resize(n4);
+    std::vector<block_q8_0> q8(n8);
+    double sumt = 0, sumt2 = 0, maxt = 0;
+    double sumqt = 0, sumqt2 = 0, maxqt = 0;
+    double sum = 0, sumq = 0, exactSum = 0;
+    for (int iloop=0; iloop<nloop; ++iloop) {
+
+        // Fill vector x with random numbers
+        fillRandomGaussianFloats(x1, rndm);
+
+        // Fill vector y with random numbers
+        fillRandomGaussianFloats(y1, rndm);
+
+        // Compute the exact dot product
+        for (int k=0; k<kVecSize; ++k) exactSum += x1[k]*y1[k];
+
+        // quantize x.
+        // Note, we do not include this in the timing as in practical application
+        // we already have the quantized model weights.
+        if (useQ4_1) {
+            funcs.quantize_row_q(x1.data(), q41.data(), kVecSize);
+        } else {
+            funcs.quantize_row_q(x1.data(), q40.data(), kVecSize);
+        }
+
+        // Now measure time the dot product needs using the "scalar" version above
+        auto t1 = std::chrono::high_resolution_clock::now();
+        if (useQ4_1) sum += dot41(kVecSize / QK4_1, q41.data(), y1.data());
+        else sum += dot(kVecSize / QK4_0, q40.data(), y1.data());
+        auto t2 = std::chrono::high_resolution_clock::now();
+        auto t = 1e-3*std::chrono::duration_cast<std::chrono::nanoseconds>(t2-t1).count();
+        sumt += t; sumt2 += t*t; maxt = std::max(maxt, t);
+
+        // And now measure the time needed to quantize y and perform the dot product with the quantized y
+        t1 = std::chrono::high_resolution_clock::now();
+        float result;
+        if (scalar) {
+            quantize_row_q8_0_reference(y1.data(), q8.data(), kVecSize);
+            dot_q4_q8(kVecSize, &result, q40.data(), q8.data());
+        }
+        else {
+            funcs.quantize_row_q_dot(y1.data(), q8.data(), kVecSize);
+            if (useQ4_1) funcs.vec_dot_q(kVecSize, &result, q41.data(), q8.data());
+            else funcs.vec_dot_q(kVecSize, &result, q40.data(), q8.data());
+        }
+        sumq += result;
+        t2 = std::chrono::high_resolution_clock::now();
+        t = 1e-3*std::chrono::duration_cast<std::chrono::nanoseconds>(t2-t1).count();
+        sumqt += t; sumqt2 += t*t; maxqt = std::max(maxqt, t);
+
+    }
+
+    // Report the time (and the average of the dot products so the compiler does not come up with the idea
+    // of optimizing away the function calls after figuring that the result is not used).
+    sum /= nloop; sumq /= nloop;
+    exactSum /= nloop;
+    printf("Exact result: <dot> = %g\n",exactSum);
+    printf("<dot> = %g, %g\n",sum,sumq);
+    // Standard deviation from the mean of squares; rounding can make the variance
+    // slightly negative, in which case 0 is reported.
+    sumt /= nloop; sumt2 /= nloop; sumt2 -= sumt*sumt;
+    sumt2 = sumt2 > 0 ? sqrt(sumt2) : 0;
+    printf("time = %g +/- %g us. maxt = %g us\n",sumt,sumt2,maxt);
+    sumqt /= nloop; sumqt2 /= nloop; sumqt2 -= sumqt*sumqt;
+    sumqt2 = sumqt2 > 0 ? sqrt(sumqt2) : 0;
+    printf("timeq = %g +/- %g us. maxt = %g us\n",sumqt,sumqt2,maxqt);
+    return 0;
+}
--- a/prompts/chat-with-bob.txt
+++ b/prompts/chat-with-bob.txt
@@ -4,4 +4,4 @@ User: Hello, Bob.
 Bob: Hello. How may I help you today?
 User: Please tell me the largest city in Europe.
 Bob: Sure. The largest city in Europe is Moscow, the capital of Russia.
-User:
+User:
--- a/prompts/reason-act.txt
+++ b/prompts/reason-act.txt
@@ -15,4 +15,4 @@ Answer: The calculate tool says it is 9.3333333333
 Question: What is capital of france?
 Thought: Do I need to use an action? No, I know the answer
 Answer: Paris is the capital of France
-Question:
+Question:
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+numpy==1.24
+sentencepiece==0.1.98
--- a/scripts/sync-ggml.sh
+++ b/scripts/sync-ggml.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+# Copies ggml.c, ggml-cuda.cu, ggml-cuda.h and ggml.h from a sibling ../ggml checkout
+# into the current directory, overwriting the local copies.
+# NOTE(review): the relative paths presumably expect the script to be run from the
+# repository root - confirm.
+# cp flags: -r recursive, -p preserve mode/ownership/timestamps, -v print each copy.
+cp -rpv ../ggml/src/ggml.c          ./ggml.c
+cp -rpv ../ggml/src/ggml-cuda.cu    ./ggml-cuda.cu
+cp -rpv ../ggml/src/ggml-cuda.h     ./ggml-cuda.h
+cp -rpv ../ggml/include/ggml/ggml.h ./ggml.h
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -6,5 +6,6 @@ function(llama_add_test source)
 endfunction()

 # llama_add_test(test-double-float.c) # SLOW
-llama_add_test(test-quantize.c)
+llama_add_test(test-quantize-fns.cpp)
+llama_add_test(test-quantize-perf.cpp)
 llama_add_test(test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin)
--- a/tests/test-quantize-fns.cpp
+++ b/tests/test-quantize-fns.cpp
@@ -0,0 +1,154 @@
+// Unit tests for quantization specific functions - quantize, dequantize and dot product
+
+#include "ggml.h"
+
+#undef NDEBUG
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+#include <string>
+#include <vector>
+
+
+// Pass/fail limits used by main(); a check fails unless its error is strictly below the limit.
+// Limit for reference_quantization_error() (optimized vs. reference quantizer).
+const float MAX_QUANTIZATION_REFERENCE_ERROR = 0.0001;
+// Limit for total_quantization_error() (quantize + dequantize round trip).
+const float MAX_QUANTIZATION_TOTAL_ERROR = 0.002;
+// Limit for dot_product_error() (quantized vs. float dot product, divided by the vector length).
+const float MAX_DOT_PRODUCT_ERROR = 0.02;
+
+// Indexed by the boolean "failed" flag: false -> "ok", true -> "FAILED".
+const char* RESULT_STR[] = {"ok", "FAILED"};
+
+
+// Generate synthetic data: dst[i] = 0.1 + 2*cos(i + offset) for i in [0, n).
+void generate_data(float offset, size_t n, float * dst) {
+    size_t idx = 0;
+    while (idx < n) {
+        const float phase = idx + offset;
+        dst[idx] = 0.1 + 2*cosf(phase);
+        ++idx;
+    }
+}
+
+// Calculate an RMSE-style error between two float arrays of length n.
+// NOTE(review): the value returned is sqrt(sum of squared differences) / n, not
+// sqrt(sum / n); the limits in this file are compared against this form, so
+// confirm before changing the formula.
+float array_rmse(const float * a1, const float * a2, size_t n) {
+    double squared_sum = 0;
+    const float * p = a1;
+    const float * q = a2;
+    for (const float * end = a1 + n; p != end; ++p, ++q) {
+        const double delta = *p - *q;
+        squared_sum += delta * delta;
+    }
+    return sqrtf(squared_sum) / n;
+}
+
+// Total quantization error on test data.
+// Quantizes test_data with qfns.quantize_row_q, dequantizes the result with
+// qfns.dequantize_row_q and returns array_rmse() between the input and the round trip.
+// test_size is the number of floats in test_data.
+float total_quantization_error(quantize_fns_t & qfns, size_t test_size, const float * test_data) {
+    // Two bytes per value: one byte per value is too small for a format that stores
+    // more than that (a float scale plus one int8 per value is 36 bytes per 32 values).
+    std::vector<uint8_t> tmp_q(2*test_size);
+    std::vector<float> tmp_out(test_size);
+
+    qfns.quantize_row_q(test_data, tmp_q.data(), test_size);
+    qfns.dequantize_row_q(tmp_q.data(), tmp_out.data(), test_size);
+    return array_rmse(test_data, tmp_out.data(), test_size);
+}
+
+// Difference between the optimized and the reference quantizer on test data.
+// Quantizes test_data once with qfns.quantize_row_q and once with
+// qfns.quantize_row_q_reference, dequantizes both results with qfns.dequantize_row_q
+// and returns array_rmse() between the two dequantized arrays.
+// test_size is the number of floats in test_data.
+float reference_quantization_error(quantize_fns_t & qfns, size_t test_size, const float * test_data) {
+    // Two bytes per value: one byte per value is too small for a format that stores
+    // more than that (a float scale plus one int8 per value is 36 bytes per 32 values).
+    std::vector<uint8_t> tmp_q(2*test_size);
+    std::vector<float> tmp_out(test_size);
+    std::vector<float> tmp_out_ref(test_size);
+
+    qfns.quantize_row_q(test_data, tmp_q.data(), test_size);
+    qfns.dequantize_row_q(tmp_q.data(), tmp_out.data(), test_size);
+
+    // The scratch buffer is reused for the reference quantizer's output.
+    qfns.quantize_row_q_reference(test_data, tmp_q.data(), test_size);
+    qfns.dequantize_row_q(tmp_q.data(), tmp_out_ref.data(), test_size);
+
+    return array_rmse(tmp_out.data(), tmp_out_ref.data(), test_size);
+}
+
+// Plain float dot product of two arrays of length test_size.
+// Each product is formed in float and accumulated in double; the total is returned as float.
+float dot_product(const float * a1, const float * a2, size_t test_size) {
+    double acc = 0;
+    size_t idx = 0;
+    while (idx != test_size) {
+        acc += a1[idx] * a2[idx];
+        ++idx;
+    }
+    return acc;
+}
+
+// Total dot product error.
+// Quantizes test_data1 with qfns.quantize_row_q and test_data2 with
+// qfns.quantize_row_q_dot, evaluates qfns.vec_dot_q on the two quantized rows and
+// returns |quantized result - float dot product| / test_size.
+float dot_product_error(quantize_fns_t & qfns, size_t test_size, const float * test_data1, const float *test_data2) {
+    // Both scratch buffers get two bytes per value so that formats storing more than
+    // one byte per value cannot overrun them.
+    std::vector<uint8_t> tmp_q1(test_size*2);
+    std::vector<uint8_t> tmp_q2(test_size*2);
+
+    qfns.quantize_row_q(test_data1, tmp_q1.data(), test_size);
+    qfns.quantize_row_q_dot(test_data2, tmp_q2.data(), test_size);
+
+    // Start from INFINITY so a vec_dot_q that never writes its output fails the check.
+    float result = INFINITY;
+    qfns.vec_dot_q(test_size, &result, tmp_q1.data(), tmp_q2.data());
+
+    const float dot_ref = dot_product(test_data1, test_data2, test_size);
+
+    return fabsf(result - dot_ref) / test_size;
+}
+
+// Runs the quantization checks for every ggml type that provides both a quantizer
+// and a dequantizer. Accepts a single optional flag, -v, which prints every result
+// instead of only the failures. Returns 1 if any check failed (or on an unknown
+// argument), 0 otherwise.
+int main(int argc, char * argv[]) {
+    const size_t test_size = 32 * 128;
+    bool verbose = false;
+
+    for (int iarg = 1; iarg < argc; ++iarg) {
+        const std::string arg = argv[iarg];
+        if (arg != "-v") {
+            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
+            return 1;
+        }
+        verbose = true;
+    }
+
+    std::vector<float> test_data(test_size);
+    std::vector<float> test_data2(test_size);
+
+    generate_data(0.0, test_data.size(), test_data.data());
+    generate_data(1.0, test_data2.size(), test_data2.data());
+
+    // Initialize GGML, ensures float conversion tables are initialized
+    struct ggml_init_params ggml_params = {
+        /* .mem_size   = */ 1*1024,
+        /* .mem_buffer = */ NULL,
+        /* .no_alloc   = */ true,
+    };
+    struct ggml_context * ctx = ggml_init(ggml_params);
+
+    int num_failed = 0;
+
+    for (int itype = 0; itype < GGML_TYPE_COUNT; ++itype) {
+        const ggml_type type = (ggml_type) itype;
+        quantize_fns_t qfns = ggml_internal_get_quantize_fn(itype);
+
+        // Types without both a quantizer and a dequantizer are not tested.
+        if (!qfns.quantize_row_q || !qfns.dequantize_row_q) {
+            continue;
+        }
+
+        // Round-trip error. The negated "<" also counts a NaN error as a failure.
+        const float total_error = total_quantization_error(qfns, test_size, test_data.data());
+        const bool total_failed = !(total_error < MAX_QUANTIZATION_TOTAL_ERROR);
+        if (total_failed) {
+            ++num_failed;
+        }
+        if (total_failed || verbose) {
+            printf("%5s absolute quantization error: %s (%f)\n", ggml_type_name(type), RESULT_STR[total_failed], total_error);
+        }
+
+        // Optimized quantizer versus reference quantizer.
+        const float reference_error = reference_quantization_error(qfns, test_size, test_data.data());
+        const bool reference_failed = !(reference_error < MAX_QUANTIZATION_REFERENCE_ERROR);
+        if (reference_failed) {
+            ++num_failed;
+        }
+        if (reference_failed || verbose) {
+            printf("%5s reference implementation error: %s (%f)\n", ggml_type_name(type), RESULT_STR[reference_failed], reference_error);
+        }
+
+        // Quantized dot product versus float dot product.
+        const float vec_dot_error = dot_product_error(qfns, test_size, test_data.data(), test_data2.data());
+        const bool vec_dot_failed = !(vec_dot_error < MAX_DOT_PRODUCT_ERROR);
+        if (vec_dot_failed) {
+            ++num_failed;
+        }
+        if (vec_dot_failed || verbose) {
+            printf("%5s dot product error: %s (%f)\n", ggml_type_name(type), RESULT_STR[vec_dot_failed], vec_dot_error);
+        }
+    }
+
+    if (num_failed || verbose) {
+        printf("%d tests failed\n", num_failed);
+    }
+
+    ggml_free(ctx);
+
+    return num_failed > 0 ? 1 : 0;
+}
--- a/tests/test-quantize-perf.cpp
+++ b/tests/test-quantize-perf.cpp
@@ -0,0 +1,310 @@
+// Benchmark quantization specific functions on synthetic data
+
+#include "ggml.h"
+
+#undef NDEBUG
+#include <algorithm>
+#include <assert.h>
+#include <functional>
+#include <inttypes.h>
+#include <math.h>
+#include <memory>
+#include <stdio.h>
+#include <string>
+#include <vector>
+
+// Largest alignment (in bytes) the benchmark buffers are aligned to.
+#define MAX_ALIGNMENT 64
+// Number of values the per-block statistics are normalized to.
+#define QK 32
+// Untimed and timed runs per benchmark.
+#define WARMUP 5
+#define ITERATIONS 10
+
+// Preset test sizes, in number of float values. The expressions are parenthesized
+// so they expand safely inside larger expressions (e.g. x / L1_SIZE).
+#define L1_SIZE      (32*128)
+#define L2_SIZE     (32*2048)
+#define L3_SIZE    (32*20480)
+#define MEM_SIZE (32*2048000)
+
+// Benchmark configuration filled in from the command line by main().
+struct quantize_perf_params {
+    // Type names given with --type; when empty, no type is filtered out.
+    std::vector<std::string> include_types;
+    // Number of values per run, from --size, -3 or -4.
+    std::vector<size_t> test_sizes;
+    // Byte offset added to the aligned buffers, from --alignment-offset.
+    size_t alignment_offset = 0;
+    // One flag per operation selectable with --op; main() enables all of them
+    // when none was requested.
+    bool op_quantize_row_q_reference = false;
+    bool op_quantize_row_q = false;
+    bool op_dequantize_row_q = false;
+    bool op_quantize_row_q_dot = false;
+    bool op_vec_dot_q = false;
+};
+
+
+// cpu_cycles(): cycle counter used by benchmark_function().
+// On x86 it reads the time stamp counter; on every other architecture it is a
+// macro that evaluates to 0, so the reported cycle figures are 0 there.
+#if defined(__x86_64__) || defined(__i386__)
+
+#include <x86intrin.h>
+inline int64_t cpu_cycles() {
+// Rough way to detect new-ish CPUs
+// (__rdtscp is used when the build targets a CPU with POPCNT, __rdtsc otherwise)
+#ifdef __POPCNT__
+    unsigned int dummy;
+    return __rdtscp(&dummy);
+#else
+    return __rdtsc();
+#endif
+}
+
+#else
+
+#define cpu_cycles() 0
+
+#endif
+
+
+// Generate synthetic data: fills dst[0..n) with 0.1 + 2*cos(i + offset).
+void generate_data(float offset, size_t n, float * dst) {
+    for (size_t pos = 0; pos != n; ++pos) {
+        const float angle = pos + offset;
+        const float wave = 2*cosf(angle);
+        dst[pos] = 0.1 + wave;
+    }
+}
+
+// Converts "bytes processed in usecs microseconds" to GiB per second (1 GiB = 1024^3 bytes).
+float gigabytes_per_second(size_t bytes, int64_t usecs) {
+    const float bytes_per_usec = bytes / (float) usecs;
+    const float bytes_per_sec = bytes_per_usec * 1000000;
+    return bytes_per_sec / (1024*1024*1024);
+}
+
+// Returns the first MAX_ALIGNMENT-aligned address at or after ptr, advanced by
+// offset bytes. The alignment search is limited to a window of MAX_ALIGNMENT * 4 bytes.
+void * align_with_offset(void * ptr, int offset) {
+    size_t space = MAX_ALIGNMENT * 4;
+    void * aligned = std::align(MAX_ALIGNMENT, MAX_ALIGNMENT, ptr, space);
+    char * shifted = (char *) aligned + offset;
+    return shifted;
+}
+
+// Times function: WARMUP untimed calls followed by ITERATIONS timed calls, then
+// prints cycle counts per QK values and throughput figures.
+//   size     - number of float values processed by one call
+//   q_size   - number of quantized bytes processed by one call
+//   function - the operation to benchmark; its return value is ignored
+void benchmark_function(size_t size, size_t q_size, std::function<size_t(void)> function) {
+    for (int run = 0; run < WARMUP; ++run) {
+        function();
+    }
+
+    int64_t best_us = INT64_MAX;
+    int64_t sum_us = 0;
+    int64_t best_cycles = INT64_MAX;
+    int64_t sum_cycles = 0;
+
+    for (int run = 0; run < ITERATIONS; ++run) {
+        // The cycle counter is read innermost, right around the call.
+        const int64_t t0 = ggml_time_us();
+        const int64_t c0 = cpu_cycles();
+
+        function();
+
+        const int64_t c1 = cpu_cycles();
+        const int64_t t1 = ggml_time_us();
+
+        const int64_t elapsed_cycles = c1 - c0;
+        const int64_t elapsed_us = t1 - t0;
+
+        sum_cycles += elapsed_cycles;
+        if (elapsed_cycles < best_cycles) {
+            best_cycles = elapsed_cycles;
+        }
+        sum_us += elapsed_us;
+        if (elapsed_us < best_us) {
+            best_us = elapsed_us;
+        }
+    }
+
+    printf("      min cycles/%d vals   : %9.2f\n",  QK, QK * best_cycles / (float) size);
+    printf("      avg cycles/%d vals   : %9.2f\n",  QK, QK * sum_cycles / (float) (size * ITERATIONS));
+    printf("      float32 throughput   : %9.2f GB/s\n",  gigabytes_per_second(4 * size * ITERATIONS, sum_us));
+    printf("      quantized throughput : %9.2f GB/s\n",  gigabytes_per_second(q_size * ITERATIONS, sum_us));
+}
+
+// Benchmarks the quantization functions of every ggml type that provides both a
+// quantizer and a dequantizer.
+//
+// Options:
+//   --size N              add a test size (number of values, non-negative multiple of 32)
+//   -3                    add the L1/L2/L3 preset sizes
+//   -4                    add the L1/L2/L3 preset sizes plus MEM_SIZE
+//   --op NAME             benchmark only the named operation(s): quantize_row_q_reference,
+//                         quantize_row_q, dequantize_row_q, quantize_row_q_dot, vec_dot_q
+//   --type NAME           benchmark only the named type(s)
+//   --alignment-offset N  byte offset (0..MAX_ALIGNMENT) added to the aligned buffers
+//
+// Without --size/-3/-4 only L1_SIZE is used; without --op all operations run.
+// Returns 0 on success and 1 on a command line error.
+int main(int argc, char * argv[]) {
+    quantize_perf_params params {};
+
+    // read command line
+
+    bool invalid_param = false;
+    std::string arg;
+    for (int i = 1; i < argc; i++) {
+        arg = argv[i];
+
+        if (arg == "--size") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            // Reject negative values before converting: they would wrap to a huge size_t.
+            const int size_arg = std::stoi(argv[i]);
+            if (size_arg < 0) {
+                fprintf(stderr, "error: size %d must not be negative\n", size_arg);
+                invalid_param = true;
+                break;
+            }
+            size_t size = size_arg;
+            if (size % 32 != 0) {
+                fprintf(stderr, "error: size %zu not divisible by 32\n", size);
+                invalid_param = true;
+                break;
+            }
+            params.test_sizes.push_back(size);
+        } else if (arg == "-3") {
+            // quick select sizes that probably fit in CPU caches
+            params.test_sizes.push_back(L1_SIZE);
+            params.test_sizes.push_back(L2_SIZE);
+            params.test_sizes.push_back(L3_SIZE);
+        } else if (arg == "-4") {
+            // quick select cache sizes + memory
+            params.test_sizes.push_back(L1_SIZE);
+            params.test_sizes.push_back(L2_SIZE);
+            params.test_sizes.push_back(L3_SIZE);
+            params.test_sizes.push_back(MEM_SIZE);
+        } else if (arg == "--op") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            std::string op {argv[i]};
+            if (op == "quantize_row_q_reference") {
+                params.op_quantize_row_q_reference = true;
+            } else if (op == "quantize_row_q") {
+                params.op_quantize_row_q = true;
+            } else if (op == "dequantize_row_q") {
+                params.op_dequantize_row_q = true;
+            } else if (op == "quantize_row_q_dot") {
+                params.op_quantize_row_q_dot = true;
+            } else if (op == "vec_dot_q") {
+                params.op_vec_dot_q = true;
+            } else {
+                invalid_param = true;
+                break;
+            }
+        } else if (arg == "--type") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.include_types.push_back(argv[i]);
+        } else if (arg == "--alignment-offset") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            int alignment = std::stoi(argv[i]);
+            if (alignment < 0 || alignment > MAX_ALIGNMENT) {
+                fprintf(stderr, "error: alignment-offset must be between 0 and %d\n", MAX_ALIGNMENT);
+                invalid_param = true;
+                break;
+            }
+            params.alignment_offset = alignment;
+        } else {
+            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
+            return 1;
+        }
+    }
+    if (invalid_param) {
+        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
+        return 1;
+    }
+
+    // Defaults: a single L1-sized run and every operation.
+    if (params.test_sizes.empty()) {
+        params.test_sizes.push_back(L1_SIZE);
+    }
+    if (!(params.op_quantize_row_q_reference || params.op_quantize_row_q || params.op_dequantize_row_q || params.op_quantize_row_q_dot || params.op_vec_dot_q)) {
+        params.op_quantize_row_q_reference = params.op_quantize_row_q = params.op_dequantize_row_q = params.op_quantize_row_q_dot = params.op_vec_dot_q = true;
+    }
+
+    std::sort(params.test_sizes.begin(), params.test_sizes.end());
+    size_t largest = params.test_sizes.back();
+
+    // Every buffer holds 4 bytes per value of the largest test plus slack for
+    // aligning the start and adding the requested offset.
+    std::vector<uint8_t> test_data1_v(largest*4 + MAX_ALIGNMENT*2);
+    std::vector<uint8_t> test_data2_v(largest*4 + MAX_ALIGNMENT*2);
+    std::vector<uint8_t> test_q1_v(largest*4 + MAX_ALIGNMENT*2);
+    std::vector<uint8_t> test_q2_v(largest*4 + MAX_ALIGNMENT*2);
+    std::vector<uint8_t> test_out_v(largest*4 + MAX_ALIGNMENT*2);
+
+    float * test_data1 = (float *) align_with_offset(test_data1_v.data(), params.alignment_offset);
+    float * test_data2 = (float *) align_with_offset(test_data2_v.data(), params.alignment_offset);
+    float * test_q1 = (float *) align_with_offset(test_q1_v.data(), params.alignment_offset);
+    float * test_q2 = (float *) align_with_offset(test_q2_v.data(), params.alignment_offset);
+    float * test_out = (float *) align_with_offset(test_out_v.data(), params.alignment_offset);
+
+    generate_data(0, largest, test_data1);
+    generate_data(1, largest, test_data2);
+
+
+    // Initialize GGML, ensures float conversion tables are initialized
+    struct ggml_init_params ggml_params = {
+        /* .mem_size   = */ 1*1024,
+        /* .mem_buffer = */ NULL,
+        /* .no_alloc   = */ true,
+    };
+    struct ggml_context * ctx = ggml_init(ggml_params);
+
+    for (int i = 0; i < GGML_TYPE_COUNT; i++) {
+        ggml_type type = (ggml_type) i;
+        quantize_fns_t qfns = ggml_internal_get_quantize_fn(i);
+        if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), ggml_type_name(type)) == params.include_types.end()) {
+            continue;
+        }
+
+        if (qfns.quantize_row_q && qfns.dequantize_row_q) {
+            printf("%s\n", ggml_type_name(type));
+
+            if (params.op_quantize_row_q_reference) {
+                printf("  quantize_row_q_reference\n");
+                for (size_t size : params.test_sizes) {
+                    printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
+                    auto quantize_fn = [&](void ) {
+                        qfns.quantize_row_q_reference(test_data1, test_q1, size);
+                        return test_q1[0];
+                    };
+                    size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
+                    benchmark_function(size, quantized_size, quantize_fn);
+                }
+                printf("\n");
+            }
+
+            if (params.op_quantize_row_q) {
+                printf("  quantize_row_q\n");
+                for (size_t size : params.test_sizes) {
+                    printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
+                    auto quantize_fn = [&](void ) {
+                        qfns.quantize_row_q(test_data1, test_q1, size);
+                        return test_q1[0];
+                    };
+                    size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
+                    benchmark_function(size, quantized_size, quantize_fn);
+                }
+                printf("\n");
+            }
+
+            if (params.op_dequantize_row_q) {
+                printf("  dequantize_row_q\n");
+                // Quantize once up front so the timed calls only dequantize.
+                qfns.quantize_row_q(test_data1, test_q1, largest);
+                for (size_t size : params.test_sizes) {
+                    printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
+                    auto quantize_fn = [&](void ) {
+                        qfns.dequantize_row_q(test_q1, test_out, size);
+                        return test_out[0];
+                    };
+                    size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
+                    benchmark_function(size, quantized_size, quantize_fn);
+                }
+                printf("\n");
+            }
+
+            if (params.op_quantize_row_q_dot) {
+                printf("  quantize_row_q_dot\n");
+                for (size_t size : params.test_sizes) {
+                    printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
+                    auto quantize_fn = [&](void ) {
+                        qfns.quantize_row_q_dot(test_data1, test_q1, size);
+                        return test_q1[0];
+                    };
+                    size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
+                    benchmark_function(size, quantized_size, quantize_fn);
+                }
+                printf("\n");
+            }
+
+            if (params.op_vec_dot_q) {
+                printf("  vec_dot_q\n");
+                // vec_dot_q takes its second operand in the format produced by
+                // quantize_row_q_dot, so only the first operand uses quantize_row_q.
+                qfns.quantize_row_q(test_data1, test_q1, largest);
+                qfns.quantize_row_q_dot(test_data2, test_q2, largest);
+                for (size_t size : params.test_sizes) {
+                    printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
+                    auto quantize_fn = [&](void ) {
+                        float result;
+                        qfns.vec_dot_q(size, &result, test_q1, test_q2);
+                        return result;
+                    };
+                    size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
+                    benchmark_function(size, quantized_size, quantize_fn);
+                }
+                printf("\n");
+            }
+        }
+    }
+
+    ggml_free(ctx);
+
+    return 0;
+}
--- a/tests/test-quantize.c
+++ b/tests/test-quantize.c
@@ -1,42 +0,0 @@
-#include "ggml.h"
-#undef NDEBUG
-#include <assert.h>
-#include <math.h>
-
-int main(void) {
-    #define QK 32
-    float src[QK];
-    uint8_t dst[24];
-    int64_t hist[16];
-
-    for (int i = 0; i < QK; i++) {
-        src[i] = (float)(i + 1);
-    }
-
-    size_t size = ggml_quantize_q4_0(src, dst, QK, QK, hist);
-    assert(size == 20);
-    float max_result = ((float *)dst)[0];
-    float max_expected = src[31] / ((1 << 3) - 1);
-    assert(max_result == max_expected);
-    for (int i = 0; i < QK; i++) {
-        uint8_t q4_result = (i % 2) ? (dst[sizeof(float) + i/2] >> 4) : (dst[sizeof(float) + i/2] & 0xF);
-        uint8_t q4_expected = roundf(src[i] / max_expected) + 8;
-        assert(q4_result == q4_expected);
-    }
-
-    size = ggml_quantize_q4_1(src, dst, QK, QK, hist);
-    assert(size == 24);
-    float delta_result = ((float *)dst)[0];
-    float delta_expected = (src[31] - src[0]) / ((1 << 4) - 1);
-    assert(delta_result == delta_expected);
-    float min_result = ((float *)dst)[1];
-    float min_expected = src[0];
-    assert(min_result == min_expected);
-    for (int i = 0; i < QK; i++) {
-        uint8_t q4_result = (i % 2) ? (dst[sizeof(float)*2 + i/2] >> 4) : (dst[sizeof(float)*2 + i/2] & 0xF);
-        uint8_t q4_expected = roundf((src[i] - min_expected) / delta_expected);
-        assert(q4_result == q4_expected);
-    }
-
-    return 0;
-}
--- a/tests/test-tokenizer-0.cpp
+++ b/tests/test-tokenizer-0.cpp
@@ -5,13 +5,17 @@
 #include <map>
 #include <vector>

-static const std::map<std::string, std::vector<llama_token>> k_tests = {
-    { "Hello World",        { 1,  10994,   2787, }, },
-    { " Hello World",       { 1,  15043,   2787, }, },
-    { " Hello World!",      { 1,  15043,   2787,  29991, }, },
-    { " this is 🦙.cpp",    { 1,    445,    338,  29871,    243,    162,    169,    156,  29889,   8223, }, },
-    { "w048 7tuijk dsdfhu", { 1,  29893,  29900,  29946,  29947,  29871,  29955,   9161,  13535,  18031,   2176,   6905, }, },
-    { "нещо на Български",  { 1,    821,   4851,    665,   1386,  29713,   1305, }, },
+static const std::map<std::string, std::vector<llama_token>> & k_tests() // accessor for the tokenizer test table: input text -> token id list (presumably the expected tokenization - confirm against main)
+{
+    static std::map<std::string, std::vector<llama_token>> _k_tests = { // function-local static: built on the first call instead of during namespace-scope static initialization
+        { "Hello World",        { 1,  10994,   2787, }, },
+        { " Hello World",       { 1,  15043,   2787, }, },
+        { " Hello World!",      { 1,  15043,   2787,  29991, }, },
+        { " this is 🦙.cpp",    { 1,    445,    338,  29871,    243,    162,    169,    156,  29889,   8223, }, },
+        { "w048 7tuijk dsdfhu", { 1,  29893,  29900,  29946,  29947,  29871,  29955,   9161,  13535,  18031,   2176,   6905, }, },
+        { "нещо на Български",  { 1,    821,   4851,    665,   1386,  29713,   1305, }, },
+    };
+    return _k_tests; // returned by const reference, so every caller reads the same single map without copying it
 };

 int main(int argc, char **argv) {
@@ -47,7 +51,7 @@ int main(int argc, char **argv) {
        return 2;
    }

-    for (const auto & test_kv : k_tests) {
+    for (const auto & test_kv : k_tests()) {
        std::vector<llama_token> res(test_kv.first.size());
        const int n = llama_tokenize(ctx, test_kv.first.c_str(), res.data(), res.size(), true);
        res.resize(n);