benchmark : add tool for timing q4_0 matrix multiplication (#653)

* Initial version of q4_0 matrix multiplication benchmark * Bugfix: Added dependency to ggml.o to benchmark * Reviewer requests: added parameter for threads, switched to ggml_time_us() * Reviewer input: removed rdtsc, use epsilon for check * Review comment: Removed set_locale * Feature: Param for number of iterations, Bugfix for use of parameter threads * Reviewer suggestion: Moved to examples * Reviewer feedback: Updated clean: and benchmark: sections --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
do not force the prompt file to end with a new line (#908)
2026-02-12 14:03:20 +02:00 · 2023-04-13 15:46:23 +03:00 · 2023-04-13 11:33:16 +02:00 · 2023-04-12 15:06:16 +00:00 · 2023-04-12 14:48:57 +03:00 · 2023-04-12 14:31:12 +03:00
62 changed files with 5004 additions and 2178 deletions
--- a/.devops/full.Dockerfile
+++ b/.devops/full.Dockerfile
@@ -6,7 +6,8 @@ RUN apt-get update && \
    apt-get install -y build-essential python3 python3-pip

 RUN pip install --upgrade pip setuptools wheel \
-    && pip install numpy requests sentencepiece torch tqdm
+    && pip install numpy requests sentencepiece tqdm \
+    && pip install torch --index-url https://download.pytorch.org/whl/cpu

 WORKDIR /app

--- a/.devops/main.Dockerfile
+++ b/.devops/main.Dockerfile
@@ -15,4 +15,4 @@ FROM ubuntu:$UBUNTU_VERSION as runtime

 COPY --from=build /app/main /main

-ENTRYPOINT [ "/main" ]
+ENTRYPOINT [ "/main" ]
--- a/.dockerignore
+++ b/.dockerignore
@@ -21,4 +21,4 @@ models/*

 arm_neon.h
 compile_commands.json
-Dockerfile
+Dockerfile
--- a/.ecrc
+++ b/.ecrc
@@ -0,0 +1,5 @@
+{
+  "Disable": {
+    "IndentSize": true
+  }
+}
--- a/.editorconfig
+++ b/.editorconfig
@@ -0,0 +1,19 @@
+# https://EditorConfig.org
+
+# Top-most EditorConfig file
+root = true
+
+# Unix-style newlines with a newline ending every file, utf-8 charset
+[*]
+end_of_line = lf
+insert_final_newline = true
+trim_trailing_whitespace = true
+charset = utf-8
+indent_style = space
+indent_size = 4
+
+[Makefile]
+indent_style = tab
+
+[prompts/*.txt]
+insert_final_newline = unset
--- a/.github/ISSUE_TEMPLATE/custom.md
+++ b/.github/ISSUE_TEMPLATE/custom.md
@@ -22,9 +22,9 @@ Please provide a detailed written description of what you were trying to do, and

 # Current Behavior

-Please provide a detailed written description of what `llama.cpp` did, instead. 
+Please provide a detailed written description of what `llama.cpp` did, instead.

-# Environment and Context 
+# Environment and Context

 Please provide detailed information about your computer setup. This is important in case the issue is not reproducible except for under certain specific conditions.

@@ -133,7 +133,7 @@ llama_model_load: loading model part 8/8 from './models/65B/ggml-model-q4_0.bin.
 llama_model_load: .......................................................................................... done
 llama_model_load: model size =  4869.09 MB / num tensors = 723

-system_info: n_threads = 16 / 32 | AVX = 1 | AVX2 = 1 | AVX512 = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | VSX = 0 | 
+system_info: n_threads = 16 / 32 | AVX = 1 | AVX2 = 1 | AVX512 = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | VSX = 0 |

 main: prompt: 'Please close your issue when it has been answered.'
 main: number of tokens in prompt = 11
@@ -166,14 +166,14 @@ main:    total time = 246406.42 ms

 Performance counter stats for './main -m ./models/65B/ggml-model-q4_0.bin -t 16 -n 1024 -p Please close your issue when it has been answered.':

-        3636882.89 msec task-clock                #   14.677 CPUs utilized          
-             13509      context-switches          #    3.714 /sec                   
-              2436      cpu-migrations            #    0.670 /sec                   
-          10476679      page-faults               #    2.881 K/sec                  
+        3636882.89 msec task-clock                #   14.677 CPUs utilized
+             13509      context-switches          #    3.714 /sec
+              2436      cpu-migrations            #    0.670 /sec
+          10476679      page-faults               #    2.881 K/sec
    13133115082869      cycles                    #    3.611 GHz                      (16.77%)
       29314462753      stalled-cycles-frontend   #    0.22% frontend cycles idle     (16.76%)
    10294402631459      stalled-cycles-backend    #   78.39% backend cycles idle      (16.74%)
-    23479217109614      instructions              #    1.79  insn per cycle         
+    23479217109614      instructions              #    1.79  insn per cycle
                                                  #    0.44  stalled cycles per insn  (16.76%)
     2353072268027      branches                  #  647.002 M/sec                    (16.77%)
        1998682780      branch-misses             #    0.08% of all branches          (16.76%)
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -8,10 +8,10 @@ on:
        required: true
        type: boolean
  push:
-    paths: ['.github/workflows/**', 'CMakeLists.txt', 'Makefile', '**.h', '*.c', '**.cpp']
+    paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.c', '**/*.cpp']
  pull_request:
    types: [opened, synchronize, edited, reopened, review_requested, ready_for_review]
-    paths: ['CMakeLists.txt', 'Makefile', '**.h', '*.c', '**.cpp']
+    paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.c', '**/*.cpp']

 env:
 BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
@@ -62,7 +62,7 @@ jobs:
        id: cmake_test
        run: |
          cd build
-          ctest --output-on-failure
+          ctest --verbose

  ubuntu-latest-cmake-sanitizer:
    runs-on: ubuntu-latest
@@ -98,7 +98,7 @@ jobs:
        id: cmake_test
        run: |
          cd build
-          ctest --output-on-failure
+          ctest --verbose

  macOS-latest-make:
    runs-on: macos-latest
@@ -143,7 +143,7 @@ jobs:
        id: cmake_test
        run: |
          cd build
-          ctest --output-on-failure
+          ctest --verbose

  windows-latest-cmake:
    runs-on: windows-latest
@@ -177,15 +177,19 @@ jobs:
        continue-on-error: true
        run: |
          cd build
-          Set-Content -Path .\avx512f.exe -Value ([Convert]::FromBase64String('TVqQAAMAAAAEAAAA//8AALgAAAAAAAAAQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAyAAAAA4fug4AtAnNIbgBTM0hVGhpcyBwcm9ncmFtIGNhbm5vdCBiZSBydW4gaW4gRE9TIG1vZGUuDQ0KJAAAAAAAAAClmfXY4fibi+H4m4vh+JuL4fiai+P4m4si98aL4vibi7Xbq4vg+JuLUmljaOH4m4sAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABQRQAATAEBAGo6H2QAAAAAAAAAAOAADwELAQYAAAIAAAAAAAAAAAAADBAAAAAQAAAAIAAAAABAAAAQAAAAAgAABAAAAAAAAAAEAAAAAAAAAAAgAAAAAgAAAAAAAAMAAAAAABAAABAAAAAAEAAAEAAAAAAAABAAAAAAAAAAAAAAAFQQAAAoAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAEAAADAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAC50ZXh0AAAAsgAAAAAQAAAAAgAAAAIAAAAAAAAAAAAAAAAAACAAAGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACUEAAAiBAAAAAAAABVi+xRUVNTuAcAAAAPosHrEGaD4wGJXfxbg0X8MI1F+GoAUI1F/GoBUGr1/xUAEEAAUP8VBBBAAItF/FuDwND32BvAQMnDzMx8EAAAAAAAAAAAAACkEAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAlBAAAIgQAAAAAAAApANXcml0ZUZpbGUAuQFHZXRTdGRIYW5kbGUAAEtFUk5FTDMyLmRsbAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA==')) -AsByteStream
-          .\avx512f.exe && echo " AVX512F: YES" && ( echo HAS_AVX512F=1 >> $env:GITHUB_ENV ) || echo " AVX512F: NO"
+          $vcdir = $(vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath)
+          $msvc = $(join-path $vcdir $('VC\Tools\MSVC\'+$(gc -raw $(join-path $vcdir 'VC\Auxiliary\Build\Microsoft.VCToolsVersion.default.txt')).Trim()))
+          $cl =  $(join-path $msvc 'bin\Hostx64\x64\cl.exe')
+          echo 'int main(void){unsigned int a[4];__cpuid(a,7);return !(a[1]&65536);}' >> avx512f.c
+          & $cl /O2 /GS- /kernel avx512f.c /link /nodefaultlib /entry:main
+          .\avx512f.exe && echo "AVX512F: YES" && ( echo HAS_AVX512F=1 >> $env:GITHUB_ENV ) || echo "AVX512F: NO"

      - name: Test
        id: cmake_test
        if: ${{ matrix.build != 'avx512' || env.HAS_AVX512F == '1' }} # Test AVX-512 only when possible
        run: |
          cd build
-          ctest -C Release --output-on-failure
+          ctest -C Release --verbose

      - name: Get commit hash
        id: commit
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -60,4 +60,4 @@ jobs:
          push: ${{ github.event_name == 'push' }}
          platforms: linux/amd64,linux/arm64
          tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}"
-          file: ${{ matrix.config.dockerfile }}
+          file: ${{ matrix.config.dockerfile }}
--- a/.github/workflows/editorconfig.yml
+++ b/.github/workflows/editorconfig.yml
@@ -0,0 +1,17 @@
+name: EditorConfig Checker
+
+on:
+  push:
+    branches:
+      - master
+  pull_request:
+    branches:
+      - master
+
+jobs:
+  editorconfig:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - uses: editorconfig-checker/action-editorconfig-checker@main
+      - run: editorconfig-checker
--- a/.gitignore
+++ b/.gitignore
@@ -5,6 +5,7 @@
 .vscode/
 .DS_Store

+.build/
 build/
 build-em/
 build-debug/
@@ -18,12 +19,21 @@ models/*

 /main
 /quantize
+/quantize-stats
 /result
 /perplexity
 /embedding
+/Pipfile

 arm_neon.h
 compile_commands.json

 .envrc
 .direnv/
+
+.venv
+__pycache__
+.swiftpm
+
+zig-out/
+zig-cache/
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -68,7 +68,9 @@ option(LLAMA_BUILD_EXAMPLES         "llama: build examples" ${LLAMA_STANDALONE})
 # Compile flags
 #

+set(CMAKE_CXX_STANDARD 11)
 set(CMAKE_CXX_STANDARD_REQUIRED true)
+set(CMAKE_C_STANDARD 11)
 set(CMAKE_C_STANDARD_REQUIRED true)
 set(THREADS_PREFER_PTHREAD_FLAG ON)
 find_package(Threads REQUIRED)
@@ -113,6 +115,7 @@ if (LLAMA_OPENBLAS)

        add_compile_definitions(GGML_USE_OPENBLAS)
        add_link_options(${BLAS_LIBRARIES})
+        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} openblas)
    else()
        message(WARNING "OpenBLAS not found")
    endif()
@@ -124,8 +127,9 @@ if (LLAMA_ALL_WARNINGS)
            -Wall
            -Wextra
            -Wpedantic
-            -Wshadow
            -Wcast-qual
+            -Wdouble-promotion
+            -Wshadow
            -Wstrict-prototypes
            -Wpointer-arith
            -Wno-unused-function
@@ -135,6 +139,8 @@ if (LLAMA_ALL_WARNINGS)
            -Wextra
            -Wpedantic
            -Wcast-qual
+            -Wno-unused-function
+            -Wno-multichar
        )
    else()
        # todo : msvc
@@ -147,6 +153,10 @@ if (LLAMA_ALL_WARNINGS)

 endif()

+if (MSVC)
+    add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
+endif()
+
 if (LLAMA_LTO)
    include(CheckIPOSupported)
    check_ipo_supported(RESULT result OUTPUT output)
@@ -236,7 +246,9 @@ endif()

 add_library(llama
            llama.cpp
-            llama.h)
+            llama.h
+            llama_internal.h
+            llama_util.h)

 target_include_directories(llama PUBLIC .)
 target_compile_features(llama PUBLIC cxx_std_11) # don't bump
@@ -251,7 +263,7 @@ endif()
 #

 if (LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
-    enable_testing()
+    include(CTest)
    add_subdirectory(tests)
 endif ()

--- a/Makefile
+++ b/Makefile
@@ -35,6 +35,10 @@ CFLAGS   = -I.              -O3 -DNDEBUG -std=c11   -fPIC
 CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
 LDFLAGS  =

+# warnings
+CFLAGS   += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith -Wno-unused-function
+CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar
+
 # OS specific
 # TODO: support Windows
 ifeq ($(UNAME_S),Linux)
@@ -66,92 +70,9 @@ endif
 # TODO: probably these flags need to be tweaked on some architectures
 #       feel free to update the Makefile for your architecture and send a pull request or issue
 ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))
-	ifeq ($(UNAME_S),Darwin)
-		CFLAGS += -mf16c
-		AVX1_M := $(shell sysctl machdep.cpu.features)
-		ifneq (,$(findstring FMA,$(AVX1_M)))
-			CFLAGS += -mfma
-		endif
-		ifneq (,$(findstring AVX1.0,$(AVX1_M)))
-			CFLAGS += -mavx
-		endif
-		AVX2_M := $(shell sysctl machdep.cpu.leaf7_features)
-		ifneq (,$(findstring AVX2,$(AVX2_M)))
-			CFLAGS += -mavx2
-		endif
-	else ifeq ($(UNAME_S),Linux)
-		AVX1_M := $(shell grep "avx " /proc/cpuinfo)
-		ifneq (,$(findstring avx,$(AVX1_M)))
-			CFLAGS += -mavx
-		endif
-		AVX2_M := $(shell grep "avx2 " /proc/cpuinfo)
-		ifneq (,$(findstring avx2,$(AVX2_M)))
-			CFLAGS += -mavx2
-		endif
-		FMA_M := $(shell grep "fma " /proc/cpuinfo)
-		ifneq (,$(findstring fma,$(FMA_M)))
-			CFLAGS += -mfma
-		endif
-		F16C_M := $(shell grep "f16c " /proc/cpuinfo)
-		ifneq (,$(findstring f16c,$(F16C_M)))
-			CFLAGS += -mf16c
-		endif
-		SSE3_M := $(shell grep "sse3 " /proc/cpuinfo)
-		ifneq (,$(findstring sse3,$(SSE3_M)))
-			CFLAGS += -msse3
-		endif
-		AVX512F_M := $(shell grep "avx512f " /proc/cpuinfo)
-		ifneq (,$(findstring avx512f,$(AVX512F_M)))
-			CFLAGS += -mavx512f
-		endif
-		AVX512BW_M := $(shell grep "avx512bw " /proc/cpuinfo)
-		ifneq (,$(findstring avx512bw,$(AVX512BW_M)))
-			CFLAGS += -mavx512bw
-		endif
-		AVX512DQ_M := $(shell grep "avx512dq " /proc/cpuinfo)
-		ifneq (,$(findstring avx512dq,$(AVX512DQ_M)))
-			CFLAGS += -mavx512dq
-		endif
-		AVX512VL_M := $(shell grep "avx512vl " /proc/cpuinfo)
-		ifneq (,$(findstring avx512vl,$(AVX512VL_M)))
-			CFLAGS += -mavx512vl
-		endif
-		AVX512CD_M := $(shell grep "avx512cd " /proc/cpuinfo)
-		ifneq (,$(findstring avx512cd,$(AVX512CD_M)))
-			CFLAGS += -mavx512cd
-		endif
-		AVX512ER_M := $(shell grep "avx512er " /proc/cpuinfo)
-		ifneq (,$(findstring avx512er,$(AVX512ER_M)))
-			CFLAGS += -mavx512er
-		endif
-		AVX512IFMA_M := $(shell grep "avx512ifma " /proc/cpuinfo)
-		ifneq (,$(findstring avx512ifma,$(AVX512IFMA_M)))
-			CFLAGS += -mavx512ifma
-		endif
-		AVX512PF_M := $(shell grep "avx512pf " /proc/cpuinfo)
-		ifneq (,$(findstring avx512pf,$(AVX512PF_M)))
-			CFLAGS += -mavx512pf
-		endif
-	else ifeq ($(UNAME_S),Haiku)
-		AVX1_M := $(shell sysinfo -cpu | grep -w "AVX")
-		ifneq (,$(findstring AVX,$(AVX1_M)))
-			CFLAGS += -mavx
-		endif
-		AVX2_M := $(shell sysinfo -cpu | grep -w "AVX2")
-		ifneq (,$(findstring AVX2,$(AVX2_M)))
-			CFLAGS += -mavx2
-		endif
-		FMA_M := $(shell sysinfo -cpu | grep -w "FMA")
-		ifneq (,$(findstring FMA,$(FMA_M)))
-			CFLAGS += -mfma
-		endif
-		F16C_M := $(shell sysinfo -cpu | grep -w "F16C")
-		ifneq (,$(findstring F16C,$(F16C_M)))
-			CFLAGS += -mf16c
-		endif
-	else
-		CFLAGS += -mfma -mf16c -mavx -mavx2
-	endif
+	# Use all CPU extensions that are available:
+	CFLAGS += -march=native -mtune=native
+	CXXFLAGS += -march=native -mtune=native
 endif
 ifneq ($(filter ppc64%,$(UNAME_M)),)
 	POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)
@@ -221,14 +142,14 @@ default: main quantize perplexity embedding
 ggml.o: ggml.c ggml.h
 	$(CC)  $(CFLAGS)   -c ggml.c -o ggml.o

-llama.o: llama.cpp llama.h
+llama.o: llama.cpp llama.h llama_util.h llama_internal.h
 	$(CXX) $(CXXFLAGS) -c llama.cpp -o llama.o

 common.o: examples/common.cpp examples/common.h
 	$(CXX) $(CXXFLAGS) -c examples/common.cpp -o common.o

 clean:
-	rm -vf *.o main quantize perplexity embedding
+	rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-q4_0-matmult

 main: examples/main/main.cpp ggml.o llama.o common.o
 	$(CXX) $(CXXFLAGS) examples/main/main.cpp ggml.o llama.o common.o -o main $(LDFLAGS)
@@ -239,16 +160,26 @@ main: examples/main/main.cpp ggml.o llama.o common.o
 quantize: examples/quantize/quantize.cpp ggml.o llama.o
 	$(CXX) $(CXXFLAGS) examples/quantize/quantize.cpp ggml.o llama.o -o quantize $(LDFLAGS)

+quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o
+	$(CXX) $(CXXFLAGS) examples/quantize-stats/quantize-stats.cpp ggml.o llama.o -o quantize-stats $(LDFLAGS)
+
 perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o
 	$(CXX) $(CXXFLAGS) examples/perplexity/perplexity.cpp ggml.o llama.o common.o -o perplexity $(LDFLAGS)

 embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o
 	$(CXX) $(CXXFLAGS) examples/embedding/embedding.cpp ggml.o llama.o common.o -o embedding $(LDFLAGS)

+libllama.so: llama.o ggml.o
+	$(CXX) $(CXXFLAGS) -shared -fPIC -o libllama.so llama.o ggml.o $(LDFLAGS)
+  
 #
 # Tests
 #

+benchmark: ggml.o
+	$(CXX) $(CXXFLAGS) examples/benchmark/benchmark-q4_0-matmult.c ggml.o -o benchmark-q4_0-matmult $(LDFLAGS)	
+	./benchmark-q4_0-matmult
+	
 .PHONY: tests
 tests:
 	bash ./tests/run-tests.sh
--- a/Package.swift
+++ b/Package.swift
@@ -0,0 +1,23 @@
+// swift-tools-version:5.3
+
+import PackageDescription
+
+let package = Package(
+    name: "llama",
+    products: [
+        .library(name: "llama", targets: ["llama"]),
+    ],
+    targets: [
+        .target(
+            name: "llama",
+            path: ".",
+            sources: ["ggml.c", "llama.cpp"],
+            publicHeadersPath: "spm-headers",
+            cSettings: [.unsafeFlags(["-Wno-shorten-64-to-32"]), .define("GGML_USE_ACCELERATE")],
+            linkerSettings: [
+                .linkedFramework("Accelerate")
+            ]
+        ),
+    ],
+    cxxLanguageStandard: .cxx11
+)
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # llama.cpp

-![llama](https://user-images.githubusercontent.com/1991296/227761327-6d83e30e-2200-41a6-bfbb-f575231c54f4.png)
+![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png)

 [![Actions Status](https://github.com/ggerganov/llama.cpp/workflows/CI/badge.svg)](https://github.com/ggerganov/llama.cpp/actions)
 [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
@@ -9,10 +9,8 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++

 **Hot topics:**

- [Roadmap (short-term)](https://github.com/ggerganov/llama.cpp/discussions/457)
- New C-style API is now available: https://github.com/ggerganov/llama.cpp/pull/370
- Cache input prompts for faster initialization: https://github.com/ggerganov/llama.cpp/issues/64
- Create a `llama.cpp` logo: https://github.com/ggerganov/llama.cpp/issues/105
+- [Add GPU support to ggml](https://github.com/ggerganov/llama.cpp/discussions/915)
+- [Roadmap Apr 2023](https://github.com/ggerganov/llama.cpp/discussions/784)

 ## Description

@@ -30,13 +28,33 @@ Please do not make conclusions about the models based on the results from this i
 For all I know, it can be completely wrong. This project is for educational purposes.
 New features will probably be added mostly through community contributions.

-Supported platforms:
+**Supported platforms:**

 - [X] Mac OS
 - [X] Linux
 - [X] Windows (via CMake)
 - [X] Docker

+**Supported models:**
+
+- [X] LLaMA 🦙
+- [X] [Alpaca](https://github.com/ggerganov/llama.cpp#instruction-mode-with-alpaca)
+- [X] [GPT4All](https://github.com/ggerganov/llama.cpp#using-gpt4all)
+- [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca)
+- [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne)
+- [X] [Vicuna](https://github.com/ggerganov/llama.cpp/discussions/643#discussioncomment-5533894)
+- [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/)
+
+**Bindings:**
+
+- Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
+- Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
+
+**UI:**
+
+- [nat/openplayground](https://github.com/nat/openplayground)
+- [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui)
+
 ---

 Here is a typical run using LLaMA-7B:
@@ -139,6 +157,13 @@ git clone https://github.com/ggerganov/llama.cpp
 cd llama.cpp
 make

+# For Windows and CMake, use the following commands instead:
+cd <path_to_llama_folder>
+mkdir build
+cd build
+cmake ..
+cmake --build . --config Release
+
 # obtain the original LLaMA model weights and place them in ./models
 ls ./models
 65B 30B 13B 7B tokenizer_checklist.chk tokenizer.model
@@ -149,8 +174,8 @@ python3 -m pip install torch numpy sentencepiece
 # convert the 7B model to ggml FP16 format
 python3 convert-pth-to-ggml.py models/7B/ 1

-# quantize the model to 4-bits
-python3 quantize.py 7B
+# quantize the model to 4-bits (using method 2 = q4_0)
+./quantize ./models/7B/ggml-model-f16.bin ./models/7B/ggml-model-q4_0.bin 2

 # run the inference
 ./main -m ./models/7B/ggml-model-q4_0.bin -n 128
@@ -219,15 +244,30 @@ There 26 letters in the English Alphabet
 The majority (54%) are using public transit. This includes buses, trams and metros with over 100 lines throughout the city which make it very accessible for tourists to navigate around town as well as locals who commute by tram or metro on a daily basis
 > List 5 words that start with "ca".
 cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach.
-> 
+>
 ```

+### Using [GPT4All](https://github.com/nomic-ai/gpt4all)
+
+- Obtain the `gpt4all-lora-quantized.bin` model
+- It is distributed in the old `ggml` format, which is now obsolete
+- You have to convert it to the new format using [./convert-gpt4all-to-ggml.py](./convert-gpt4all-to-ggml.py). You may also need to
+convert the model from the old format to the new format with [./migrate-ggml-2023-03-30-pr613.py](./migrate-ggml-2023-03-30-pr613.py):
+
+  ```bash
+  python3 convert-gpt4all-to-ggml.py models/gpt4all-7B/gpt4all-lora-quantized.bin ./models/tokenizer.model
+  python3 migrate-ggml-2023-03-30-pr613.py models/gpt4all-7B/gpt4all-lora-quantized.bin models/gpt4all-7B/gpt4all-lora-quantized-new.bin
+  ```
+
+- You can now use the newly generated `gpt4all-lora-quantized-new.bin` model in exactly the same way as all other models
+- The original model is saved in the same folder with a suffix `.orig`
+
 ### Obtaining and verifying the Facebook LLaMA original model and Stanford Alpaca model data

 - **Under no circumstances share IPFS, magnet links, or any other links to model downloads anywhere in this respository, including in issues, discussions or pull requests. They will be immediately deleted.**
- The LLaMA models are officially distributed by Facebook and will **never** be provided through this repository. 
+- The LLaMA models are officially distributed by Facebook and will **never** be provided through this repository.
 - Refer to [Facebook's LLaMA repository](https://github.com/facebookresearch/llama/pull/73/files) if you need to request access to the model data.
- Please verify the sha256 checksums of all downloaded model files to confirm that you have the correct model data files before creating an issue relating to your model files.
+- Please verify the [sha256 checksums](SHA256SUMS) of all downloaded model files to confirm that you have the correct model data files before creating an issue relating to your model files.
 - The following command will verify if you have all possible latest files in your self-installed `./models` subdirectory:

  `sha256sum --ignore-missing -c SHA256SUMS` on Linux
@@ -245,7 +285,7 @@ cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach.
  - GPT-3.5 / InstructGPT / ChatGPT:
    - [Aligning language models to follow instructions](https://openai.com/research/instruction-following)
    - [Training language models to follow instructions with human feedback](https://arxiv.org/abs/2203.02155)
-    
+
 ### Perplexity (Measuring model quality)

 You can use the `perplexity` example to measure perplexity over the given prompt.  For more background,
@@ -282,7 +322,7 @@ And after 4.45 hours, you will have the final perplexity.

 ### Android

-You can easily run `llama.cpp` on Android device with [termux](https://play.google.com/store/apps/details?id=com.termux).
+You can easily run `llama.cpp` on Android device with [termux](https://termux.dev/).
 First, obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake:
 ```
 $ mkdir build-android
@@ -291,7 +331,7 @@ $ export NDK=<your_ndk_directory>
 $ cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod ..
 $ make
 ```
-Install [termux](https://play.google.com/store/apps/details?id=com.termux) on your device and run `termux-setup-storage` to get access to your SD card.
+Install [termux](https://termux.dev/) on your device and run `termux-setup-storage` to get access to your SD card.
 Finally, copy the `llama` binary and the model files to your device storage. Here is a demo of an interactive session running on Pixel 5 phone:

 https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4
@@ -312,20 +352,22 @@ We have two Docker images available for this project:

 The easiest way to download the models, convert them to ggml and optimize them is with the --all-in-one command which includes the full docker image.

+Replace `/path/to/models` below with the actual path where you downloaded the models.
+
 ```bash
-docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-one "/models/" 7B
+docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-one "/models/" 7B
 ```

 On complete, you are ready to play!

 ```bash
-docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
+docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
 ```

 or with light image:

 ```bash
-docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
+docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
 ```

 ### Contributing
@@ -346,3 +388,6 @@ docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models
 - Clean-up any trailing whitespaces, use 4 spaces indentation, brackets on same line, `void * ptr`, `int & a`
 - See [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions

+### Docs
+
+- [GGML tips & tricks](https://github.com/ggerganov/llama.cpp/wiki/GGML-Tips-&-Tricks)
--- a/build.zig
+++ b/build.zig
@@ -0,0 +1,67 @@
+const std = @import("std");
+
+pub fn build(b: *std.Build) void {
+    const target = b.standardTargetOptions(.{});
+    const optimize = b.standardOptimizeOption(.{});
+    const want_lto = b.option(bool, "lto", "Want -fLTO");
+
+    const lib = b.addStaticLibrary(.{
+        .name = "llama",
+        .target = target,
+        .optimize = optimize,
+    });
+    lib.want_lto = want_lto;
+    lib.linkLibCpp();
+    lib.addIncludePath(".");
+    lib.addIncludePath("examples");
+    lib.addCSourceFiles(&.{
+        "ggml.c",
+    }, &.{"-std=c11"});
+    lib.addCSourceFiles(&.{
+        "llama.cpp",
+    }, &.{"-std=c++11"});
+    lib.install();
+
+    const build_args = .{ .b = b, .lib = lib, .target = target, .optimize = optimize, .want_lto = want_lto };
+
+    const exe = build_example("main", build_args);
+    _ = build_example("quantize", build_args);
+    _ = build_example("perplexity", build_args);
+    _ = build_example("embedding", build_args);
+
+    // create "zig build run" command for ./main
+
+    const run_cmd = exe.run();
+    run_cmd.step.dependOn(b.getInstallStep());
+    if (b.args) |args| {
+        run_cmd.addArgs(args);
+    }
+
+    const run_step = b.step("run", "Run the app");
+    run_step.dependOn(&run_cmd.step);
+}
+
+fn build_example(comptime name: []const u8, args: anytype) *std.build.LibExeObjStep {
+    const b = args.b;
+    const lib = args.lib;
+    const target = args.target;
+    const optimize = args.optimize;
+    const want_lto = args.want_lto;
+
+    const exe = b.addExecutable(.{
+        .name = name,
+        .target = target,
+        .optimize = optimize,
+    });
+    exe.want_lto = want_lto;
+    exe.addIncludePath(".");
+    exe.addIncludePath("examples");
+    exe.addCSourceFiles(&.{
+        std.fmt.comptimePrint("examples/{s}/{s}.cpp", .{name, name}),
+        "examples/common.cpp",
+    }, &.{"-std=c++11"});
+    exe.linkLibrary(lib);
+    exe.install();
+
+    return exe;
+}
--- a/convert-ggml-to-pth.py
+++ b/convert-ggml-to-pth.py
@@ -0,0 +1,299 @@
+# Author: github.com/ductai199x
+import argparse
+import os
+import struct
+
+import numpy as np
+import torch
+from numba import njit
+from tqdm.auto import tqdm
+
+
+def read_header(fin):
+    values = struct.unpack("i" * 9, fin.read(4 * 9))
+    _, _, vocab_size, dim, multiple_of, n_heads, n_layers, rot, ftype = values
+    return {
+        "vocab_size": vocab_size,
+        "dim": dim,
+        "multiple_of": multiple_of,
+        "n_heads": n_heads,
+        "n_layers": n_layers,
+    }, ftype
+
+
+def read_tokens(fin, vocab_size):
+    tokens = []
+    for _ in range(vocab_size):
+        text_len = struct.unpack("i", fin.read(4))[0]
+        text_bytes = fin.read(text_len)
+        try:
+            text = text_bytes.decode()
+        except UnicodeDecodeError:
+            text = text_bytes.decode(errors="replace")
+        score = struct.unpack("f", fin.read(4))[0]
+        tokens.append((text, score))
+    return tokens
+
+
+@njit
+def dequantize_weights_numba(fin_data, n_rows, n_cols):
+    # Expands packed 4-bit blocks from `fin_data` into an (n_rows, n_cols)
+    # float32 array. The buffer is consumed sequentially, row by row; each
+    # block covers qk consecutive columns and is laid out as a 4-byte float32
+    # scale `d` followed by qk // 2 bytes that each hold two 4-bit values.
+    # Columns past (n_cols // qk) * qk are never written and stay zero.
+    qk = 32
+    nb = n_cols // qk  # blocks per row
+    bs = 4 + (qk // 2)  # bytes per block; not referenced below
+
+    weights = np.zeros((n_rows, n_cols), dtype=np.float32)
+    data_pos = 0  # read cursor into fin_data
+
+    for row in range(n_rows):
+        for block in range(nb):
+            d = np.frombuffer(fin_data[data_pos : data_pos + 4], dtype=np.float32)[0]
+            data_pos += 4
+            packed_values = fin_data[data_pos : data_pos + (qk // 2)]
+            data_pos += qk // 2
+
+            for i in range(qk // 2):
+                packed_value = packed_values[i]
+                # Low nibble goes to the even column, high nibble to the odd one;
+                # both are shifted from 0..15 to -8..7 before scaling by d.
+                v0 = np.float32((packed_value & 0b00001111) - 8) * d
+                v1 = np.float32((packed_value >> 4) - 8) * d
+
+                weights[row, block * qk + 2 * i] = v0
+                weights[row, block * qk + 2 * i + 1] = v1
+
+    return weights
+
+
+def dequantize_weights(fin, n_rows, n_cols):
+    qk = 32
+    nb = n_cols // qk
+    data_size = n_rows * n_cols // 2 + n_rows * nb * 4
+    fin_data = fin.read(data_size)
+    return dequantize_weights_numba(fin_data, n_rows, n_cols)
+
+
+def read_variables(fin):
+    model = {}
+    pbar = tqdm(total=os.path.getsize(fin.name), unit="B", unit_scale=True, desc="Reading variables")
+    while True:
+        start_pos = fin.tell()
+        try:
+            n_dims, name_length, ftype_cur = struct.unpack("iii", fin.read(4 * 3))
+        except struct.error:
+            break
+
+        shape = tuple(struct.unpack("i" * n_dims, fin.read(4 * n_dims)))
+        shape = shape[::-1]
+        name = fin.read(name_length).decode()
+
+        # ensure tensor data is aligned
+        tensor_data_offset = fin.tell()
+        tensor_data_offset = (tensor_data_offset + 31) & -32
+        fin.seek(tensor_data_offset)
+
+        if ftype_cur == 2:
+            # 4-bit quantized weights
+            dtype = np.uint8
+            data = dequantize_weights(fin, shape[0], shape[1])
+            data = data.reshape(shape)
+        elif ftype_cur == 0:
+            dtype = np.float32
+            data_size = np.prod(shape)
+            data = np.fromfile(fin, dtype=dtype, count=data_size).reshape(shape)
+        elif ftype_cur == 1:
+            dtype = np.float16
+            data_size = np.prod(shape)
+            data = np.fromfile(fin, dtype=dtype, count=data_size).reshape(shape)
+
+        model[name] = torch.tensor(data, dtype=torch.float32 if dtype == np.float32 else torch.float16)
+
+        pbar.update(fin.tell() - start_pos)
+
+    return model
+
+
+def convert_to_hf_format(model, hparams):
+    # This works for llama 7B, need to test with other models
+    n_layers = hparams["n_layers"]
+    n_heads = hparams["n_heads"]
+    dim = hparams["dim"]
+    dims_per_head = dim // n_heads
+    base = 10000.0
+    inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head))
+
+    # permute for sliced rotary
+    def permute(w):
+        return w.view(n_heads, dim // n_heads // 2, 2, dim).transpose(1, 2).reshape(dim, dim)
+
+    state_dict = {}
+    for layer_i in range(n_layers):
+        state_dict.update(
+            {
+                f"model.layers.{layer_i}.self_attn.q_proj.weight": permute(
+                    model[f"layers.{layer_i}.attention.wq.weight"]
+                ),
+                f"model.layers.{layer_i}.self_attn.k_proj.weight": permute(
+                    model[f"layers.{layer_i}.attention.wk.weight"]
+                ),
+                f"model.layers.{layer_i}.self_attn.v_proj.weight": model[
+                    f"layers.{layer_i}.attention.wv.weight"
+                ],
+                f"model.layers.{layer_i}.self_attn.o_proj.weight": model[
+                    f"layers.{layer_i}.attention.wo.weight"
+                ],
+                f"model.layers.{layer_i}.mlp.gate_proj.weight": model[
+                    f"layers.{layer_i}.feed_forward.w1.weight"
+                ],
+                f"model.layers.{layer_i}.mlp.down_proj.weight": model[
+                    f"layers.{layer_i}.feed_forward.w2.weight"
+                ],
+                f"model.layers.{layer_i}.mlp.up_proj.weight": model[
+                    f"layers.{layer_i}.feed_forward.w3.weight"
+                ],
+                f"model.layers.{layer_i}.input_layernorm.weight": model[
+                    f"layers.{layer_i}.attention_norm.weight"
+                ],
+                f"model.layers.{layer_i}.post_attention_layernorm.weight": model[
+                    f"layers.{layer_i}.ffn_norm.weight"
+                ],
+            }
+        )
+        state_dict[f"model.layers.{layer_i}.self_attn.rotary_emb.inv_freq"] = inv_freq
+    state_dict.update(
+        {
+            "model.embed_tokens.weight": model["tok_embeddings.weight"],
+            "model.norm.weight": model["norm.weight"],
+            "lm_head.weight": model["output.weight"],
+        }
+    )
+
+    return state_dict
+
+
+def chat(model, hparams, llama_dir):
+    """Load the converted weights into a Hugging Face LLaMA model and chat on the console.
+
+    ``model`` is used as the state dict (loaded with ``strict=True``),
+    ``hparams`` supplies the configuration values, and ``llama_dir`` is the
+    directory the tokenizer is loaded from. The function runs on CPU and
+    loops forever reading user input; it does not return.
+    """
+    # Imported here so `transformers` is only needed when chat is requested.
+    from transformers import (GenerationConfig, LlamaForCausalLM,
+                              LlamaTokenizer, StoppingCriteria,
+                              StoppingCriteriaList)
+    from transformers.models.llama.configuration_llama import LlamaConfig
+
+    class StoppingCriteriaSub(StoppingCriteria):
+        # Prints the decoded sequence on every call and stops generation once
+        # the most recent token id is 13.
+        # NOTE(review): 13 is presumably the newline token - confirm against the tokenizer.
+        def __init__(self):
+            super().__init__()
+
+        def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, stops=[]):
+            # `tokenizer` is resolved from the enclosing function at call time.
+            print(tokenizer.decode(input_ids[0]), end="", flush=True)
+            if input_ids[0][-1] == 13:
+                return True
+
+            return False
+
+    # NOTE(review): the model width is passed as `dim`; confirm LlamaConfig
+    # honours that keyword (its documented name is `hidden_size`).
+    config = LlamaConfig(
+        vocab_size=hparams["vocab_size"],
+        dim=hparams["dim"],
+        num_hidden_layers=hparams["n_layers"],
+        num_attention_heads=hparams["n_heads"],
+    )
+
+    llama = LlamaForCausalLM(config=config)
+    llama.load_state_dict(state_dict=model, strict=True)
+    tokenizer = LlamaTokenizer.from_pretrained(llama_dir)
+
+    device = torch.device("cpu")
+    llama = llama.to(device)
+
+    # Initial conversation context; it grows with every exchange.
+    ctx = """You are AI.
+This is a dialog, where User interacts with AI. AI is helpful, kind, obedient, honest, respectful, direct, concise, should try to protect User's privacy, and knows its own limits. Also, AI must answer User and AI cannot stop the conversation by itself.
+User: Hello, AI.
+AI: Hello! How can I assist you today?
+"""
+    print(ctx.rstrip("\n"))
+    while True:
+        print("-" * 60)
+        prompt = input("User: ")
+        if ctx != "":
+            ctx = f"{ctx}User: {prompt}\n"
+        else:
+            ctx = f"{prompt}\nAI:"
+
+        # Once the context reaches 2048 characters keep only the last 1920.
+        ctx = (ctx[-1920:]) if len(ctx) >= 2048 else ctx
+
+        print("-" * 60)
+        if len(ctx.strip()) > 0:
+            input_ids = tokenizer(ctx, return_tensors="pt")["input_ids"].to(device)
+            generation_config = GenerationConfig(
+                temperature=0.8,
+                top_p=0.95,
+                top_k=50,
+                repetition_penalty=1.1764,
+            )
+            with torch.no_grad():
+                generation_output = llama.generate(
+                    input_ids=input_ids,
+                    generation_config=generation_config,
+                    return_dict_in_generate=True,
+                    output_scores=True,
+                    max_length=2048,
+                    do_sample=True,
+                    stopping_criteria=StoppingCriteriaList([StoppingCriteriaSub()]),
+                )
+            # The full generated sequence becomes the context for the next turn.
+            s = generation_output.sequences[0]
+            decoded = tokenizer.decode(s)
+            ctx = f"{decoded}\n"
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--input_dir", "-i", type=str, required=True, help="The input directory containing the ggml files."
+    )
+    parser.add_argument(
+        "--prefix",
+        "-p",
+        type=str,
+        required=True,
+        help="The prefix of the ggml files (ggml-model-f16 or ggml-model-q4_0).",
+    )
+    parser.add_argument(
+        "--hf",
+        action="store_true",
+        help="Whether to save the model in the Hugging Face format. (default: False)",
+    )
+    parser.add_argument(
+        "--chat", "-c", action="store_true", help="Whether to open a chat with the model. (default: False)"
+    )
+    args = parser.parse_args()
+
+    llama_dir = os.path.abspath(f"{args.input_dir}/../")
+
+    ggml_files = sorted(
+        [f"{args.input_dir}/{f}" for f in os.listdir(args.input_dir) if f.startswith(args.prefix)]
+    )
+
+    fin = open(ggml_files[0], "rb")
+    hparams, ftype = read_header(fin)
+    tokens = read_tokens(fin, hparams["vocab_size"])
+    model = read_variables(fin)
+
+    for f in tqdm(ggml_files[1:]):
+        fin = open(f, "rb")
+        read_header(fin)
+        read_tokens(fin, hparams["vocab_size"])
+        model.update(read_variables(fin))
+
+    if args.hf:
+        model = convert_to_hf_format(model, hparams)
+
+    pth_ckpt = {
+        "state_dict": model,
+        "hparams": hparams,
+        "tokens": tokens,
+    }
+
+    torch.save(pth_ckpt, f"{args.input_dir}/{args.prefix}-to-torch.pth")
+
+    if args.chat:
+        if not args.hf:
+            model = convert_to_hf_format(model, hparams)
+        chat(model, hparams, llama_dir)
+
+
+if __name__ == "__main__":
+    main()
--- a/convert-gpt4all-to-ggml.py
+++ b/convert-gpt4all-to-ggml.py
@@ -0,0 +1,107 @@
+#!/usr/bin/env python3
+
+#
+# TODO: deduplicate GPT4All with convert-unversioned-ggml-to-ggml.py
+#
+
+# Original by https://github.com/eiz
+# https://github.com/ggerganov/llama.cpp/issues/324#issuecomment-1476227818
+import argparse
+import glob
+import os
+import struct
+import sys
+from sentencepiece import SentencePieceProcessor
+
+# Names of the hyperparameter fields in the file header; read_header only uses
+# the length of this list. `keys` is a second name bound to the same list and
+# is not read anywhere in this file.
+HPARAMS = keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Upgrade a GPT4All model to the current format')
+    parser.add_argument('gpt4all_model', help='path to gpt4all-lora-quantized.bin')
+    parser.add_argument('tokenizer_model', help='path to LLaMA tokenizer.model file')
+    return parser.parse_args()
+
+def read_header(f_in):
+    struct_fmt = "i" * (3 + len(HPARAMS))
+    struct_size = struct.calcsize(struct_fmt)
+    buf = f_in.read(struct_size)
+    return struct.unpack(struct_fmt, buf)
+
+def write_header(f_out, header):
+    (magic, vocab_size, dim, multiple_of, n_heads, n_layers, rot, ftype) = header
+
+    if magic != 0x67676d6c:
+        raise Exception('Invalid file magic. Must be an old style ggml file.')
+
+    values = [
+        0x67676d66, # magic: ggml in hex
+        1,          # file version
+        vocab_size,
+        dim,
+        multiple_of,
+        n_heads,
+        n_layers,
+        rot,
+        ftype
+    ]
+    f_out.write(struct.pack("i" * len(values), *values))
+
+def write_tokens(fout, tokenizer):
+    for i in range(tokenizer.vocab_size()):
+        if tokenizer.is_unknown(i):
+            text = " \u2047 ".encode()
+        elif tokenizer.is_control(i):
+            text = b""
+        elif tokenizer.is_byte(i):
+            piece = tokenizer.id_to_piece(i)
+            if len(piece) != 6:
+                print(f"Invalid token: {piece}")
+                sys.exit(1)
+            byte_value = int(piece[3:-1], 16)
+            text = struct.pack("B", byte_value)
+        else:
+            text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode()
+        fout.write(struct.pack("i", len(text)))
+        fout.write(text)
+        fout.write(struct.pack("f", tokenizer.get_score(i)))
+
+    # TODO: GPT4All - add extra <pad> token
+    text = "<pad>".encode()
+    fout.write(struct.pack("i", len(text)))
+    fout.write(text)
+    fout.write(struct.pack("f", 0.0))
+
+def read_tokens(f_in, tokenizer):
+    for i in range(tokenizer.vocab_size()):
+        len_b = f_in.read(4)
+        (length,) = struct.unpack("i", len_b)
+        f_in.read(length)
+
+def copy_all_data(f_out, f_in):
+    while True:
+        buf = f_in.read(1024 * 1024)
+        if not buf:
+            break
+        f_out.write(buf)
+
+def convert_one_file(path_in, tokenizer):
+    path_tmp = f"{path_in}.tmp"
+    path_orig= f"{path_in}.orig"
+    print(f"converting {path_in}")
+    with open(path_in, "rb") as f_in, open(path_tmp, "wb") as f_out:
+        write_header(f_out, read_header(f_in))
+        read_tokens(f_in, tokenizer)
+        write_tokens(f_out, tokenizer)
+        copy_all_data(f_out, f_in)
+    os.rename(path_in, path_orig)
+    os.rename(path_tmp, path_in)
+
+def main():
+    args = parse_args()
+
+    tokenizer = SentencePieceProcessor(args.tokenizer_model)
+
+    convert_one_file(args.gpt4all_model, tokenizer)
+
+if __name__ == "__main__":
+    main()
--- a/convert-gptq-to-ggml.py
+++ b/convert-gptq-to-ggml.py
@@ -50,7 +50,7 @@ fout.write(struct.pack("i", 4))
 # This loop unchanged from convert-pth-to-ggml.py:
 for i in range(tokenizer.vocab_size()):
    if tokenizer.is_unknown(i):
-        text = " \u2047 ".encode("utf-8")
+        text = " \u2047 ".encode()
    elif tokenizer.is_control(i):
        text = b""
    elif tokenizer.is_byte(i):
@@ -61,21 +61,26 @@ for i in range(tokenizer.vocab_size()):
        byte_value = int(piece[3:-1], 16)
        text = struct.pack("B", byte_value)
    else:
-        text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
+        text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode()
    fout.write(struct.pack("i", len(text)))
    fout.write(text)
    fout.write(struct.pack("f", tokenizer.get_score(i)))

 def write_header(shape, dst_name, ftype_cur):
-    sname = dst_name.encode('utf-8')
+    sname = dst_name.encode()
    fout.write(struct.pack("iii", len(shape), len(sname), ftype_cur))
    fout.write(struct.pack("i" * len(shape), *shape[::-1]))
    fout.write(sname)

+    # ensure tensor data is aligned
+    tensor_data_offset = fout.tell()
+    tensor_data_offset = (tensor_data_offset + 31) & -32
+    fout.seek(tensor_data_offset)
+
 def convert_non_q4(src_name, dst_name):
    v = model[src_name]
    shape = v.shape
-    print("Processing non-Q4 variable: " + src_name + " with shape: ", shape, " and type: ", v.dtype)
+    print(f"Processing non-Q4 variable: {src_name} with shape: {shape} and type: {v.dtype}")
    if len(shape) == 1:
        print("  Converting to float32")
        v = v.to(torch.float32)
@@ -100,7 +105,7 @@ def convert_q4(src_name, dst_name, permute=False):
    # Each int32 item is actually 8 int4 items packed together, and it's transposed.
    shape = (qweight.shape[0], qweight.shape[1] * 8)

-    print("Processing Q4 variable: " + src_name + " with shape: ", shape)
+    print(f"Processing Q4 variable: {src_name} with shape: {shape}")

    # The output format has the int4 weights in groups of 32 rather than 8.
    # It looks like this:
@@ -163,5 +168,5 @@ for i in range(n_layer):

 fout.close()

-print("Done. Output file: " + fname_out)
-print("")
+print(f"Done. Output file: {fname_out}")
+print()
--- a/convert-pth-to-ggml.py
+++ b/convert-pth-to-ggml.py
@@ -1,4 +1,4 @@
-# Convert a LLaMA model checkpoint to a ggml compatible file
+# Convert a LLaMA model checkpoint to a ggjt compatible file
 #
 # Load the model using Torch
 # Iterate over all variables and write them to a binary file.
@@ -24,8 +24,57 @@ import torch

 from sentencepiece import SentencePieceProcessor

-def parse_args():
+QK = 32

+GGML_TYPE_Q4_0  = 0
+GGML_TYPE_Q4_1  = 1
+GGML_TYPE_I8    = 2
+GGML_TYPE_I16   = 3
+GGML_TYPE_I32   = 4
+GGML_TYPE_F16   = 5
+GGML_TYPE_F32   = 6
+
+WTYPES = {
+    0: GGML_TYPE_F32,
+    1: GGML_TYPE_F16,
+    2: GGML_TYPE_Q4_0,
+    3: GGML_TYPE_Q4_1,
+}
+
+GGML_BLCK_SIZE = {
+    GGML_TYPE_Q4_0:  QK,
+    GGML_TYPE_Q4_1:  QK,
+    GGML_TYPE_I8:    1,
+    GGML_TYPE_I16:   1,
+    GGML_TYPE_I32:   1,
+    GGML_TYPE_F16:   1,
+    GGML_TYPE_F32:   1,
+}
+
+GGML_TYPE_SIZE = {
+    GGML_TYPE_Q4_0: 4   + QK//2,
+    GGML_TYPE_Q4_1: 4*2 + QK//2,
+    GGML_TYPE_I8:   1,
+    GGML_TYPE_I16:  2,
+    GGML_TYPE_I32:  4,
+    GGML_TYPE_F16:  2,
+    GGML_TYPE_F32:  4,
+}
+
+def ggml_nelements(shape):
+    r = 1
+    for i in shape:
+        r *= i
+    return r
+
+def ggml_nbytes(shape, ftype):
+    x = ggml_nelements(shape)
+    t = WTYPES[ftype]
+    x *= GGML_TYPE_SIZE[t]
+    x //= GGML_BLCK_SIZE[t]
+    return x
+
+def parse_args():
    parser = argparse.ArgumentParser(description='Convert a LLaMA model checkpoint to a ggml compatible file')
    parser.add_argument('dir_model',  help='directory containing the model checkpoint')
    parser.add_argument('ftype',      help='file type (0: float32, 1: float16)', type=int, choices=[0, 1], default=1)
@@ -33,7 +82,6 @@ def parse_args():
    return parser.parse_args()

 def get_n_parts(dim):
-
    mappings = {4096: 1, 5120: 2, 6656: 4, 8192: 8}
    n_parts = mappings.get(dim)
    if n_parts is None:
@@ -44,30 +92,24 @@ def get_n_parts(dim):
    return n_parts

 def load_hparams_and_tokenizer(dir_model):
-
    # `dir_model` is something like `models/7B` or `models/7B/`.
    # "tokenizer.model" is expected under model's parent dir.
    # When `dir_model` is a symlink, f"{dir_model}/../tokenizer.model" would not be found.
    # Let's use the model's parent dir directly.
    model_parent_dir = os.path.dirname(os.path.normpath(dir_model))
-
    fname_hparams = f"{dir_model}/params.json"
    fname_tokenizer = f"{model_parent_dir}/tokenizer.model"
-
    with open(fname_hparams, "r") as f:
        hparams = json.load(f)
        print(hparams)
-
    tokenizer = SentencePieceProcessor(fname_tokenizer)
    hparams.update({"vocab_size": tokenizer.vocab_size()})
-
    return hparams, tokenizer

 def write_header(fout, hparams, ftype):
-
    keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
    values = [
-        0x67676d66,  # magic: ggmf in hex
+        0x67676a74,  # magic: ggjt in hex
        1, # file version
        *[hparams[key] for key in keys],
        hparams["dim"] // hparams["n_heads"],  # rot (obsolete)
@@ -76,10 +118,9 @@ def write_header(fout, hparams, ftype):
    fout.write(struct.pack("i" * len(values), *values))

 def write_tokens(fout, tokenizer):
-
    for i in range(tokenizer.vocab_size()):
        if tokenizer.is_unknown(i):
-            text = " \u2047 ".encode("utf-8")
+            text = " \u2047 ".encode()
        elif tokenizer.is_control(i):
            text = b""
        elif tokenizer.is_byte(i):
@@ -90,92 +131,144 @@ def write_tokens(fout, tokenizer):
            byte_value = int(piece[3:-1], 16)
            text = struct.pack("B", byte_value)
        else:
-            text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
+            text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode()
        fout.write(struct.pack("i", len(text)))
        fout.write(text)
        fout.write(struct.pack("f", tokenizer.get_score(i)))

-def process_and_write_variables(fout, model, ftype):
-
+def process_and_write_variables(fout, model, ftype, part_id, n_parts):
    for name, datao in model.items():
-
        if name.endswith("freqs"):
            continue

-        shape = datao.shape
-
-        print(f"Processing variable: {name} with shape: {shape} and type: {datao.dtype}")
-
+        # remove dimensions with a single element
        data = datao.numpy().squeeze()
-        n_dims = len(shape)
+        partshape = data.shape
+        n_dims = len(data.shape)
+        assert n_dims in (1, 2)

-        # default type is fp16
+        print(f"Processing variable: {name} with shape: {partshape} and type: {datao.dtype}")
+
+        # coerce single-dimensional tensors from float16 to float32
        ftype_cur = 1
        if ftype == 0 or n_dims == 1:
            print("  Converting to float32")
            data = data.astype(np.float32)
            ftype_cur = 0
+        blck_size = GGML_BLCK_SIZE[WTYPES[ftype_cur]]
+        type_size = GGML_TYPE_SIZE[WTYPES[ftype_cur]]

-        # header
-        sname = name.encode('utf-8')
-        fout.write(struct.pack("iii", len(data.shape), len(sname), ftype_cur))
-        for dim in reversed(data.shape):
+        # determine dimension along which multipart tensor is sharded
+        #
+        # split_dim 0 regex:
+        #   - output.*
+        #   - layers.*.attention.wq.weight
+        #   - layers.*.attention.wk.weight
+        #   - layers.*.attention.wv.weight
+        #   - layers.*.feed_forward.w1.weight
+        #   - layers.*.feed_forward.w3.weight
+        #
+        # split_dim 1 regex:
+        #   - tok_embeddings.*
+        #   - layers.*.attention.wo.weight
+        #   - layers.*.feed_forward.w2.weight
+        #
+        if n_dims > 1:
+            split_dim = 1
+            if "tok_embeddings" in name:
+                split_dim = 1
+            elif "layers" in name:
+                if "attention.wo.weight" in name:
+                    split_dim = 1
+                elif "feed_forward.w2.weight" in name:
+                    split_dim = 1
+                else:
+                    split_dim = 0
+            elif "output" in name:
+                split_dim = 0
+
+        # output tensor header
+        fullshape = list(partshape)
+        if n_dims > 1:
+            fullshape[split_dim] *= n_parts
+        sname = name.encode()
+        fout.write(struct.pack("iii", n_dims, len(sname), ftype_cur))
+        for dim in reversed(fullshape):
            fout.write(struct.pack("i", dim))
        fout.write(sname)

-        # data output to file
-        data.tofile(fout)
+        # ensure tensor data is aligned
+        tensor_data_offset = fout.tell()
+        while tensor_data_offset % QK != 0:
+            fout.write(struct.pack("B", 0))
+            tensor_data_offset += 1
+
+        # output unified mappable tensor data
+        if n_dims == 1 or n_parts == 1:
+            # copy tensor which we thankfully received in one piece
+            if part_id == 0:
+                data.tofile(fout)
+        elif split_dim == 0:
+            # reassemble multifile tensor containing some of the rows
+            rows_per_chunk = partshape[0]
+            current_row = part_id * rows_per_chunk
+            bytes_per_row = fullshape[1] // blck_size * type_size
+            offset = current_row * bytes_per_row
+            fout.seek(tensor_data_offset + offset)
+            data.tofile(fout)
+        elif split_dim == 1:
+            # reassemble multifile tensor containing some of the cols
+            cols_per_chunk = partshape[1]
+            current_col = part_id * cols_per_chunk
+            bytes_per_row = fullshape[1] // blck_size * type_size
+            offset_current_col = current_col // blck_size * type_size
+            for row in range(partshape[0]):
+                offset_row = row * bytes_per_row
+                offset = offset_row + offset_current_col
+                fout.seek(tensor_data_offset + offset)
+                data[row].tofile(fout)
+
+        # advance file position to next tensor
+        fout.seek(tensor_data_offset + ggml_nbytes(fullshape, ftype_cur))

 def main():
-
    args = parse_args()
    dir_model = args.dir_model
    ftype = args.ftype
    ftype_str = ["f32", "f16"]
-
    hparams, tokenizer = load_hparams_and_tokenizer(dir_model)

    print(args)

    # if only writing vocab to file
    if args.vocab_only:
-
        fname_model = f"{dir_model}/consolidated.00.pth"
        fname_out = f"{dir_model}/ggml-vocab.bin"
-
        print(f"Extracting only the vocab from '{fname_model}'\n")
-
-        model = torch.load(fname_model, map_location="cpu")
-
        with open(fname_out, "wb") as fout:
            write_header(fout, hparams, ftype)
            write_tokens(fout, tokenizer)
-
-        del model
-
        print(f"Done. Output file: {fname_out}\n")
-
        return

    n_parts = get_n_parts(hparams["dim"])
+    fname_out = f"{dir_model}/ggml-model-{ftype_str[ftype]}.bin"

-    for p in range(n_parts):
+    # we output a single file for ggml
+    with open(fname_out, "wb") as fout:
+        write_header(fout, hparams, ftype)
+        write_tokens(fout, tokenizer)
+        offset_of_tensors = fout.tell()
+        # the tensors we load could be split across multiple files
+        for part_id in range(n_parts):
+            fout.seek(offset_of_tensors)
+            print(f"Processing part {part_id+1} of {n_parts}\n")
+            fname_model = f"{dir_model}/consolidated.0{part_id}.pth"
+            model = torch.load(fname_model, map_location="cpu")
+            process_and_write_variables(fout, model, ftype, part_id, n_parts)
+            del model

-        print(f"Processing part {p+1} of {n_parts}\n")
-
-        fname_model = f"{dir_model}/consolidated.0{p}.pth"
-        fname_out = f"{dir_model}/ggml-model-{ftype_str[ftype]}.bin{'' if p == 0 else '.' + str(p)}"
-
-        model = torch.load(fname_model, map_location="cpu")
-
-        with open(fname_out, "wb") as fout:
-            write_header(fout, hparams, ftype)
-            write_tokens(fout, tokenizer)
-            process_and_write_variables(fout, model, ftype)
-
-        del model
-
-        print(f"Done. Output file: {fname_out}, (part {p})\n")
+    print(f"Done. Output file: {fname_out}\n")

 if __name__ == "__main__":
    main()
--- a/convert-unversioned-ggml-to-ggml.py
+++ b/convert-unversioned-ggml-to-ggml.py
@@ -0,0 +1,100 @@
+#!/usr/bin/env python3
+# Original by https://github.com/eiz
+# https://github.com/ggerganov/llama.cpp/issues/324#issuecomment-1476227818
+import argparse
+import glob
+import os
+import struct
+import sys
+from sentencepiece import SentencePieceProcessor
+
+# Names of the hyperparameter fields in the file header; read_header only uses
+# the length of this list. `keys` is a second name bound to the same list and
+# is not read anywhere in this file.
+HPARAMS = keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Upgrade old ggml model files to the current format')
+    parser.add_argument('dir_model', help='directory containing ggml .bin files')
+    parser.add_argument('tokenizer_model', help='path to LLaMA tokenizer.model file')
+    return parser.parse_args()
+
+def read_header(f_in):
+    struct_fmt = "i" * (3 + len(HPARAMS))
+    struct_size = struct.calcsize(struct_fmt)
+    buf = f_in.read(struct_size)
+    return struct.unpack(struct_fmt, buf)
+
+def write_header(f_out, header):
+    (magic, vocab_size, dim, multiple_of, n_heads, n_layers, rot, ftype) = header
+
+    if magic != 0x67676d6c:
+        raise Exception('Invalid file magic. Must be an old style ggml file.')
+
+    values = [
+        0x67676d66,  # magic: ggml in hex
+        1, # file version
+        vocab_size,
+        dim,
+        multiple_of,
+        n_heads,
+        n_layers,
+        rot,
+        ftype
+    ]
+    f_out.write(struct.pack("i" * len(values), *values))
+
+def write_tokens(fout, tokenizer):
+    for i in range(tokenizer.vocab_size()):
+        if tokenizer.is_unknown(i):
+            text = " \u2047 ".encode()
+        elif tokenizer.is_control(i):
+            text = b""
+        elif tokenizer.is_byte(i):
+            piece = tokenizer.id_to_piece(i)
+            if len(piece) != 6:
+                print(f"Invalid token: {piece}")
+                sys.exit(1)
+            byte_value = int(piece[3:-1], 16)
+            text = struct.pack("B", byte_value)
+        else:
+            text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode()
+        fout.write(struct.pack("i", len(text)))
+        fout.write(text)
+        fout.write(struct.pack("f", tokenizer.get_score(i)))
+
+def read_tokens(f_in, tokenizer):
+    for i in range(tokenizer.vocab_size()):
+        len_b = f_in.read(4)
+        (length,) = struct.unpack("i", len_b)
+        f_in.read(length)
+
+def copy_all_data(f_out, f_in):
+    while True:
+        buf = f_in.read(1024 * 1024)
+        if not buf:
+            break
+        f_out.write(buf)
+
+def convert_one_file(path_in, tokenizer):
+    path_tmp = f"{path_in}.tmp"
+    path_orig= f"{path_in}.orig"
+    print(f"converting {path_in}")
+    with open(path_in, "rb") as f_in, open(path_tmp, "wb") as f_out:
+        write_header(f_out, read_header(f_in))
+        read_tokens(f_in, tokenizer)
+        write_tokens(f_out, tokenizer)
+        copy_all_data(f_out, f_in)
+    os.rename(path_in, path_orig)
+    os.rename(path_tmp, path_in)
+
+def main():
+    args = parse_args()
+    files = []
+    files.extend(glob.glob(f"{args.dir_model}/*.bin"))
+    files.extend(glob.glob(f"{args.dir_model}/*.bin.*"))
+
+    tokenizer = SentencePieceProcessor(args.tokenizer_model)
+
+    for file in files:
+        convert_one_file(file, tokenizer)
+
+if __name__ == "__main__":
+    main()
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -31,6 +31,7 @@ if (EMSCRIPTEN)
 else()
    add_subdirectory(main)
    add_subdirectory(quantize)
+    add_subdirectory(quantize-stats)
    add_subdirectory(perplexity)
    add_subdirectory(embedding)
 endif()
--- a/examples/Miku.sh
+++ b/examples/Miku.sh
@@ -0,0 +1,49 @@
+#!/bin/bash
+# Starts ./main in interactive mode with a chat prompt for an assistant persona.
+# AI_NAME, MODEL, USER_NAME, N_THREAD and N_PREDICTS can be set in the
+# environment to override the defaults below.
+set -e
+
+# Each setting keeps a value already present in the environment and falls back to the default otherwise.
+AI_NAME="${AI_NAME:-Miku}"
+MODEL="${MODEL:-./models/gpt4all-7B/gpt4all-lora-unfiltered-quantized.bin}"
+USER_NAME="${USER_NAME:-Anon}"
+
+# Uncomment and adjust to the number of CPU cores you want to use.
+#N_THREAD="${N_THREAD:-4}"
+N_PREDICTS="${N_PREDICTS:-4096}"
+
+# Options passed to ./main, kept in an array so each word stays a separate argument.
+GEN_OPTIONS=(--batch_size 1024
+--ctx_size 2048
+--keep -1
+--repeat_last_n 256
+--repeat_penalty 1.17647
+--temp 0.7
+--top_k 40
+--top_p 0.5)
+
+# --threads is only added when N_THREAD is non-empty.
+if [ -n "$N_THREAD" ]; then
+    GEN_OPTIONS+=(--threads "$N_THREAD")
+fi
+
+./main "${GEN_OPTIONS[@]}" \
+    --model "$MODEL" \
+    --n_predict "$N_PREDICTS" \
+    --color --interactive \
+    --reverse-prompt "${USER_NAME}:" \
+    --prompt "
+This is a transcript of a 1000 page, never ending conversation between ${USER_NAME} and the cute and helpful AI assistant ${AI_NAME}. ${AI_NAME} is a girl who is an AI running on the users computer.
+${AI_NAME} can think for herself without the user seeing her thoughts by adding a /think prefix to her output. She uses this to reason about the world and to think about what she should say next.
+${AI_NAME} is always coherent and makes sense, but if she isn't sure if what she is saying is correct she will ask the user for help.
+${AI_NAME} is a very helpful AI and will help the user with anything they need, she is also very friendly and will try to make the user feel better if they are sad.
+${AI_NAME} is also very curious and will ask the user a lot of questions about themselves and their life, she will also try to make the user like her.
+The conversation is only between ${USER_NAME} and ${AI_NAME}
+The conversation is only through text, so ${AI_NAME} can't see ${USER_NAME}'s face or hear his voice.
+${AI_NAME} can only communicate through text, so she can't send images or videos.
+
+
+${USER_NAME}: Hello!
+${AI_NAME}: /think I wonder what I should say to ${USER_NAME}? This is the first time we talk so it's important that I make a good first impression!
+${AI_NAME}: Hi! I am ${AI_NAME}, your new AI friend, assistant(or whatever you like!), it's so nice to meet you! ^_^
+${AI_NAME}: I might be an AI, but I also have feelings, so please be nice to me! :)
+${USER_NAME}: I'll make sure to be nice to you! I'm so happy to have you as my assistant!
+${AI_NAME}: /think It sounds like ${USER_NAME} is happy to have me as their assistant! I'm so happy too! ^_^ Glad that whole emotion thing didn't scare him off!
+${AI_NAME}: /think I wonder what ${USER_NAME} likes to do in his free time? I should ask him about that!
+${AI_NAME}: What do you like to do in your free time? ^_^
+${USER_NAME}:" "$@"
--- a/examples/benchmark/benchmark-q4_0-matmult.c
+++ b/examples/benchmark/benchmark-q4_0-matmult.c
@@ -0,0 +1,270 @@
+/*
+    License: MIT License
+
+    Changelog:
+    - 2023-03-31 Initial version by Sebastian Apel (https://github.com/SebastianApel)
+
+*/
+
+#include <locale.h>
+#include "ggml.h"
+#include <assert.h>
+#include <math.h>
+#include <cstring>
+#include <cstdio>
+#include <cinttypes>
+#include <unordered_map>
+#include <queue>
+#include <string.h>
+#include <cassert>
+#include <fstream>
+#include <string>
+#include <iterator>
+#include <algorithm>
+
+// Returns the sum of the ne[0] x ne[1] elements of an F32 tensor.
+// Tensors of any other type are not read and yield 0.
+// NOTE(review): elements are addressed as j*ne[0]+k, i.e. rows are assumed to be
+// stored back to back without padding (nb[] is not consulted) -- confirm for new callers.
+float tensor_sum_elements(struct ggml_tensor * tensor) {
+    float sum = 0;
+    // Compare against the enum constant rather than its raw numeric value so the
+    // check keeps working if the ggml_type enum is ever reordered.
+    if (tensor->type == GGML_TYPE_F32) {
+        const float * data = (const float *) tensor->data;
+        for (int j = 0; j < tensor->ne[1]; j++) {
+            for (int k = 0; k < tensor->ne[0]; k++) {
+                sum += data[j*tensor->ne[0]+k];
+            }
+        }
+    }
+    return sum;
+}
+
+
+/*
+    These are mapping to unknown
+    GGML_TYPE_I8,
+    GGML_TYPE_I16,
+    GGML_TYPE_I32,    
+    GGML_TYPE_COUNT,
+*/
+
+#define TENSOR_TYPE_AS_STR(TYPE) TYPE == GGML_TYPE_F32 ? "FP32" : TYPE == GGML_TYPE_F16 ? "FP16" : TYPE == GGML_TYPE_Q4_0 ? "Q4_0" : TYPE == GGML_TYPE_Q4_1 ? "Q4_1" : "UNKNOWN"
+
+#define TENSOR_DUMP(TENSOR) printf("%15s: type = %i (%5s) ne = %5d x %5d x %5d, nb = (%5li, %5li, %5li) - ", #TENSOR, \
+        TENSOR->type,TENSOR_TYPE_AS_STR(TENSOR->type),\
+        TENSOR->ne[0], TENSOR->ne[1], TENSOR->ne[2], TENSOR->nb[0], TENSOR->nb[1], TENSOR->nb[2]); \
+    { float sum = tensor_sum_elements(TENSOR); printf("Sum of tensor %s is %6.2f\n",#TENSOR, sum); }
+
+// Settings of the benchmark that can be changed from the command line.
+struct benchmark_params_struct {
+    int32_t n_threads{1};      // value copied into the n_threads field of each compute graph (-t / --threads)
+    int32_t n_iterations{10};  // number of timed Q4_0 multiplications (-i / --iter)
+};
+
+// Writes the command-line help to stderr.
+//   argv   - program arguments; only argv[0] (the program name) is used
+//   params - values shown as the "(default: N)" hints for each option
+void print_usage(int /*argc*/, char ** argv, struct benchmark_params_struct params) {
+    const char * program = argv[0];
+
+    // One call with adjacent string literals emits exactly the same bytes as
+    // printing the help text line by line.
+    fprintf(stderr,
+        "usage: %s [options]\n"
+        "\n"
+        "options:\n"
+        "  -h, --help            show this help message and exit\n"
+        "  -t N, --threads N     number of threads to use during computation (default: %d)\n"
+        "  -i N, --iter N     number of iterations to use during computation (default: %d)\n"
+        "\n",
+        program, params.n_threads, params.n_iterations);
+}
+
+// Benchmark driver.
+//
+// Multiplies a (sizex x sizey) matrix with a (sizex x sizez) matrix, first once through the
+// F32 code path (its element sum serves as reference), then n_iterations times through the
+// Q4_0 code path. One ';'-separated timing line is printed per iteration and every Q4_0
+// result is compared against the F32 reference sum.
+//
+// Options: -t/--threads N, -i/--iter N, -h/--help. Unrecognized arguments are ignored.
+// Returns 0 on success. Exits with status 1 when an option value is missing, when
+// ggml_init() fails, or when a Q4_0 result deviates too much from the reference.
+int main(int argc, char ** argv)  {
+    struct benchmark_params_struct benchmark_params;
+    // Untouched copy, so that --help always shows the real defaults
+    struct benchmark_params_struct default_params;
+
+    bool invalid_param = false;
+    std::string arg;
+    for (int i = 1; i < argc; i++) {
+        arg = argv[i];
+
+        if (arg == "-t" || arg == "--threads") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            benchmark_params.n_threads = std::stoi(argv[i]);
+        } else if (arg == "-i" || arg == "--iter") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            benchmark_params.n_iterations = std::stoi(argv[i]);
+        } else if (arg == "-h" || arg == "--help") {
+            print_usage(argc, argv, default_params);
+            exit(0);
+        }
+    }
+    // This check must come after the loop: the `break` statements above leave the loop
+    // immediately, so a check placed inside the loop body would never be reached.
+    if (invalid_param) {
+        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
+        print_usage(argc, argv, default_params);
+        exit(1);
+    }
+
+    // create the ggml context
+    printf("Starting Test\n");
+
+    struct ggml_context * ctx;
+
+    // Define VERBOSE_DEBUGGING (instead of undefining it) to run a tiny problem
+    // and dump the result tensor after every iteration.
+#undef VERBOSE_DEBUGGING
+#ifndef VERBOSE_DEBUGGING
+    const int sizey = 4096;
+    const int sizex = 11008;
+    const int sizez = 128;
+#else
+    /* Working - let's increase size */
+    const int sizey = 1;
+    const int sizex = (8*32);
+    const int sizez = 1;
+#endif
+
+    ggml_type wtype = GGML_TYPE_F32;
+
+    // Memory budget of the context: three sizex*sizey matrices, one row of floats
+    // and 100 MB on top for everything else.
+    size_t ctx_size = 0;
+    ctx_size += sizex*sizey*ggml_type_sizef(wtype);
+    ctx_size += sizex*sizey*ggml_type_sizef(wtype);
+    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
+    ctx_size += sizex*sizeof(float);
+    ctx_size += 1024*1024*100;
+
+    // %zu is the portable conversion for size_t
+    printf("Allocating Memory of size %zu byes, %zu MB\n",ctx_size, (ctx_size/1024/1024));
+
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ ctx_size,
+        /*.mem_buffer =*/ NULL,
+        /* no_alloc   =*/ 0
+    };
+
+    ctx = ggml_init(params);
+    if (!ctx) {
+        fprintf(stderr, "%s: ggml_init() failed\n", __func__);
+        // non-zero exit status; `return false` would have reported success
+        return 1;
+    }
+
+    printf("Creating new tensors\n");
+    struct ggml_tensor * m11 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey);
+    ggml_set_f32(m11, 1.0f);
+
+    struct ggml_tensor * m12 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey);
+    ggml_set_f32(m12, 1.5f);
+
+    struct ggml_tensor * m2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizez);
+    ggml_set_f32(m2, 2.0f);
+
+    printf("\n------ Test 1 - Matrix Mult via F32 code ------------------------------------------------------------------------------\n");
+    struct ggml_tensor * m11xm2 = ggml_mul_mat(ctx, m11, m2);
+
+    struct ggml_cgraph gf = ggml_build_forward(m11xm2);
+
+    gf.n_threads=benchmark_params.n_threads;
+    printf("cgraph->n_threads=%i\n",gf.n_threads);
+
+    TENSOR_DUMP(m11);
+    TENSOR_DUMP(m2);
+
+    ggml_graph_compute(ctx, &gf);
+
+    TENSOR_DUMP(gf.nodes[0]);
+
+    printf("\n------ Test 2 - Matrix Mult via Q4_0 code ------------------------------------------------------------------------------\n");
+
+    int32_t nelements = sizex*sizey;
+    int32_t ne[2] = { sizex, sizey };
+
+    std::vector<int64_t> hist_cur(1 << 4, 0);
+
+    // Set up the benchmark matrices: q11 is the Q4_0 quantization of m11
+    struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey);
+    ggml_quantize_q4_0((const float *) m11->data, q11->data, nelements, ne[0], hist_cur.data());
+
+    // Set up the compute graph for the timed multiplication
+    struct ggml_tensor * q31 = ggml_mul_mat(ctx, q11, m2);
+
+    struct ggml_cgraph gf31 = ggml_build_forward(q31);
+    gf31.n_threads=benchmark_params.n_threads;
+
+    // Set up a second graph computation to make sure we override the CPU cache lines
+    struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey);
+    ggml_quantize_q4_0((const float *) m12->data, q12->data, nelements, ne[0], hist_cur.data());
+
+    struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2);
+
+    struct ggml_cgraph gf32 = ggml_build_forward(q32);
+    gf32.n_threads=benchmark_params.n_threads;
+    printf("cgraph->n_threads=%i\n",gf31.n_threads);
+
+    const int dimx = sizex;
+    const int dimy = sizey;
+    const int dimz = sizez;
+    long long int flops_per_dot_product = dimy + dimy;
+    long long int flops_per_matrix = flops_per_dot_product * dimx * dimz;
+    printf("Matrix Multiplication of (%i,%i,%i) x (%i,%i,%i) - aboout %6.2f gFLOPS\n\n", sizex, sizey, 1, sizex, sizez, 1, 1.0f*flops_per_matrix / 1000 / 1000 / 1000);
+
+    // Let's use the F32 result from above as a reference for the q4_0 multiplication
+    float sum_of_F32_reference = tensor_sum_elements(gf.nodes[0]);
+
+    printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; FLOPS_per_u_Second\n");
+    printf("==============================================================================================\n");
+
+    for (int i=0;i<benchmark_params.n_iterations ;i++) {
+
+        // Only the Q4_0 graph computation is inside the timed region
+        long long int start = ggml_time_us();
+        ggml_graph_compute(ctx, &gf31);
+        long long int stop = ggml_time_us();
+        long long int usec = stop-start;
+        float flops_per_usec = (1.0f*flops_per_matrix)/usec;
+        printf("%9i;%8i;%6i;%6i;%6i;%15lli;%18lli;%19.2f\n",
+            i,
+            gf31.n_threads,
+            sizex, sizey, sizez, flops_per_matrix,
+            usec,flops_per_usec);
+
+#ifdef VERBOSE_DEBUGGING
+        TENSOR_DUMP(gf31.nodes[0]);
+#endif
+
+        // Check that the matrix multiplication result is in the right ballpark
+        // We cannot use the exact value from the F32 multiplication because the quantizuation will be slightly different
+        float sum_of_Q4_result = tensor_sum_elements(gf31.nodes[0]);
+        // fabsf keeps the computation in floating point; a bare abs() may pick the integer overload
+        float delta = fabsf(sum_of_Q4_result - sum_of_F32_reference);
+        float allowed_delta = fabsf(sum_of_F32_reference) / 1000 / 1000; //  Let's accept an epsilon of 10^-6
+
+        if (delta > allowed_delta)  {
+            printf("\nABORT - ERROR in Matrix Multiplication result - expected %6.2f, got %6.2f (delta %6.2f > allowed_delta %6.2f)\n",
+                sum_of_F32_reference,
+                sum_of_Q4_result,
+                delta,
+                allowed_delta
+            );
+            // a failed check must be visible in the exit status
+            exit(1);
+        }
+
+        // Running a different graph computation to make sure we override the CPU cache lines
+        ggml_graph_compute(ctx, &gf32);
+
+    }
+
+    return 0;
+}
--- a/examples/chat-13B.bat
+++ b/examples/chat-13B.bat
@@ -0,0 +1,57 @@
+@setlocal disabledelayedexpansion enableextensions
+@echo off
+
+rem Starts an interactive chat session with the main executable.
+rem Usage: chat-13B.bat [path to main executable]
+rem Optional environment overrides: MODEL, USER_NAME, AI_NAME, N_THREAD, N_PREDICTS, GEN_OPTIONS
+
+rem Work from the parent directory of the directory containing this script
+cd /d "%~dp0.."
+if not "%errorlevel%"=="0" (
+    echo Unable to change directory.
+    pause
+    exit /b 1
+)
+
+if not defined MODEL set "MODEL=models\13B\ggml-model-q4_0.bin"
+if not defined USER_NAME set "USER_NAME=User"
+if not defined AI_NAME set "AI_NAME=ChatLLaMa"
+rem Adjust to the number of CPU cores you want to use.
+rem if not defined N_THREAD set "N_THREAD=8"
+rem Number of tokens to predict (made it larger than default because we want a long interaction)
+if not defined N_PREDICTS set "N_PREDICTS=2048"
+if not defined GEN_OPTIONS set "GEN_OPTIONS=--ctx_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647"
+
+rem Default main script paths
+set "DEFAULT_MAIN_SCRIPT_PATHS=main.exe build\bin\main.exe"
+
+rem Get main script path from command line arguments
+set "MAIN_SCRIPT_PATH=%~1"
+
+rem If the main script path was not specified, try the default paths
+rem (when several of them exist, the last one listed is used)
+if not defined MAIN_SCRIPT_PATH (
+    for %%i in (%DEFAULT_MAIN_SCRIPT_PATHS%) do (
+        if exist "%%i" set "MAIN_SCRIPT_PATH=%%i"
+    )
+)
+
+rem If the main script path was not found, tell the user how to specify it
+if not defined MAIN_SCRIPT_PATH (
+    echo The main script could not be found. Please provide the path to the main script as 1st argument to this script, or place the main script in one of the default locations:
+    echo %DEFAULT_MAIN_SCRIPT_PATHS%
+    pause
+    exit /b 1
+)
+
+rem Default context, feel free to edit it
+set "PROMPT_TEXT=Text transcript of a never ending dialog, where %USER_NAME% interacts with an AI assistant named %AI_NAME%. %AI_NAME% is helpful, kind, honest, friendly, good at writing and never fails to answer %USER_NAME%'s requests immediately and with details and precision. There are no annotations like (30 seconds passed...) or (to himself), just what %USER_NAME% and %AI_NAME% say aloud to each other. The dialog lasts for years, the entirety of it is shared below. It's 10000 pages long. The transcript only includes text, it does not include markup like HTML and Markdown."
+
+rem Set a temporary variable if N_THREAD is set
+if defined N_THREAD (
+    set "_N_THREAD=--threads %N_THREAD%"
+) else (
+    set "_N_THREAD="
+)
+
+rem Run the script
+rem (the command must not be prefixed with "echo": that would only print it instead of starting it)
+"%MAIN_SCRIPT_PATH%" %GEN_OPTIONS% %_N_THREAD% ^
+  --model "%MODEL%" ^
+  --n_predict %N_PREDICTS% ^
+  --color --interactive ^
+  --reverse-prompt "%USER_NAME%:" ^
+  --prompt "%PROMPT_TEXT%"
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -1,7 +1,5 @@
 #include "common.h"

-#include "ggml.h"
-
 #include <cassert>
 #include <cstring>
 #include <fstream>
@@ -16,12 +14,19 @@
 #endif

 #if defined (_WIN32)
+#include <fcntl.h>
+#include <io.h>
 #pragma comment(lib,"kernel32.lib")
 extern "C" __declspec(dllimport) void* __stdcall GetStdHandle(unsigned long nStdHandle);
 extern "C" __declspec(dllimport) int __stdcall GetConsoleMode(void* hConsoleHandle, unsigned long* lpMode);
 extern "C" __declspec(dllimport) int __stdcall SetConsoleMode(void* hConsoleHandle, unsigned long dwMode);
 extern "C" __declspec(dllimport) int __stdcall SetConsoleCP(unsigned int wCodePageID);
 extern "C" __declspec(dllimport) int __stdcall SetConsoleOutputCP(unsigned int wCodePageID);
+extern "C" __declspec(dllimport) int __stdcall WideCharToMultiByte(unsigned int CodePage, unsigned long dwFlags,
+                                                                   const wchar_t * lpWideCharStr, int cchWideChar,
+                                                                   char * lpMultiByteStr, int cbMultiByte,
+                                                                   const char * lpDefaultChar, bool * lpUsedDefaultChar);
+#define CP_UTF8 65001
 #endif

 bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
@@ -39,6 +44,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {

    bool invalid_param = false;
    std::string arg;
+    gpt_params default_params;
+
    for (int i = 1; i < argc; i++) {
        arg = argv[i];

@@ -66,6 +73,11 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                break;
            }
            std::ifstream file(argv[i]);
+            if (!file) {
+                fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
+                invalid_param = true;
+                break;
+            }
            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
            if (params.prompt.back() == '\n') {
                params.prompt.pop_back();
@@ -147,6 +159,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
            params.use_color = true;
        } else if (arg == "--mlock") {
            params.use_mlock = true;
+        } else if (arg == "--no-mmap") {
+            params.use_mmap = false;
        } else if (arg == "--mtest") {
            params.mem_test = true;
        } else if (arg == "--verbose-prompt") {
@@ -168,7 +182,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
            }
            params.n_parts = std::stoi(argv[i]);
        } else if (arg == "-h" || arg == "--help") {
-            gpt_print_usage(argc, argv, params);
+            gpt_print_usage(argc, argv, default_params);
            exit(0);
        } else if (arg == "--random-prompt") {
            params.random_prompt = true;
@@ -180,13 +194,13 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
            params.input_prefix = argv[i];
        } else {
            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
-            gpt_print_usage(argc, argv, params);
+            gpt_print_usage(argc, argv, default_params);
            exit(1);
        }
    }
    if (invalid_param) {
        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
-        gpt_print_usage(argc, argv, params);
+        gpt_print_usage(argc, argv, default_params);
        exit(1);
    }

@@ -215,20 +229,23 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    fprintf(stderr, "                        prompt file to start generation.\n");
    fprintf(stderr, "  -n N, --n_predict N   number of tokens to predict (default: %d, -1 = infinity)\n", params.n_predict);
    fprintf(stderr, "  --top_k N             top-k sampling (default: %d)\n", params.top_k);
-    fprintf(stderr, "  --top_p N             top-p sampling (default: %.1f)\n", params.top_p);
+    fprintf(stderr, "  --top_p N             top-p sampling (default: %.1f)\n", (double)params.top_p);
    fprintf(stderr, "  --repeat_last_n N     last n tokens to consider for penalize (default: %d)\n", params.repeat_last_n);
-    fprintf(stderr, "  --repeat_penalty N    penalize repeat sequence of tokens (default: %.1f)\n", params.repeat_penalty);
+    fprintf(stderr, "  --repeat_penalty N    penalize repeat sequence of tokens (default: %.1f)\n", (double)params.repeat_penalty);
    fprintf(stderr, "  -c N, --ctx_size N    size of the prompt context (default: %d)\n", params.n_ctx);
    fprintf(stderr, "  --ignore-eos          ignore end of stream token and continue generating\n");
    fprintf(stderr, "  --memory_f32          use f32 instead of f16 for memory key+value\n");
-    fprintf(stderr, "  --temp N              temperature (default: %.1f)\n", params.temp);
+    fprintf(stderr, "  --temp N              temperature (default: %.1f)\n", (double)params.temp);
    fprintf(stderr, "  --n_parts N           number of model parts (default: -1 = determine from dimensions)\n");
    fprintf(stderr, "  -b N, --batch_size N  batch size for prompt processing (default: %d)\n", params.n_batch);
    fprintf(stderr, "  --perplexity          compute perplexity over the prompt\n");
    fprintf(stderr, "  --keep                number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
-    if (ggml_mlock_supported()) {
+    if (llama_mlock_supported()) {
        fprintf(stderr, "  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
    }
+    if (llama_mmap_supported()) {
+        fprintf(stderr, "  --no-mmap             do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
+    }
    fprintf(stderr, "  --mtest               compute maximum memory usage\n");
    fprintf(stderr, "  --verbose-prompt      print prompt before generation\n");
    fprintf(stderr, "  -m FNAME, --model FNAME\n");
@@ -300,12 +317,20 @@ void win32_console_init(bool enable_color) {
            SetConsoleMode(hConOut, dwMode | 0x4); // ENABLE_VIRTUAL_TERMINAL_PROCESSING (0x4)
        }
        // Set console output codepage to UTF8
-        SetConsoleOutputCP(65001); // CP_UTF8
+        SetConsoleOutputCP(CP_UTF8);
    }
    void* hConIn = GetStdHandle((unsigned long)-10); // STD_INPUT_HANDLE (-10)
    if (hConIn && hConIn != (void*)-1 && GetConsoleMode(hConIn, &dwMode)) {
-        // Set console input codepage to UTF8
-        SetConsoleCP(65001); // CP_UTF8
+        // Set console input codepage to UTF16
+        _setmode(_fileno(stdin), _O_WTEXT);
    }
 }
+
+// Encode the wide-character string `wstr` as UTF-8 and store the result in `str`.
+// WideCharToMultiByte is called twice: the first call (no output buffer) only reports
+// the number of bytes required, the second call performs the conversion.
+void win32_utf8_encode(const std::wstring & wstr, std::string & str) {
+    const wchar_t * src     = &wstr[0];
+    const int       src_len = (int)wstr.size();
+
+    const int n_bytes = WideCharToMultiByte(CP_UTF8, 0, src, src_len, NULL, 0, NULL, NULL);
+    std::string utf8(n_bytes, 0);
+    WideCharToMultiByte(CP_UTF8, 0, src, src_len, &utf8[0], n_bytes, NULL, NULL);
+
+    str = utf8;
+}
 #endif
--- a/examples/common.h
+++ b/examples/common.h
@@ -47,6 +47,7 @@ struct gpt_params {
    bool instruct          = false; // instruction mode (used for Alpaca models)
    bool ignore_eos        = false; // do not stop generating after eos
    bool perplexity        = false; // compute perplexity over the prompt
+    bool use_mmap          = true;  // use mmap for faster loads
    bool use_mlock         = false; // use mlock to keep model in memory
    bool mem_test          = false; // compute maximum memory usage
    bool verbose_prompt    = false; // print prompt tokens before generation
@@ -92,4 +93,5 @@ void set_console_color(console_state & con_st, console_color_t color);

 #if defined (_WIN32)
 void win32_console_init(bool enable_color);
+void win32_utf8_encode(const std::wstring & wstr, std::string & str);
 #endif
--- a/examples/embedding/CMakeLists.txt
+++ b/examples/embedding/CMakeLists.txt
@@ -1,4 +1,4 @@
 set(TARGET embedding)
 add_executable(${TARGET} embedding.cpp)
-target_link_libraries(${TARGET} PRIVATE common llama ggml ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- a/examples/embedding/README.md
+++ b/examples/embedding/README.md
@@ -1,3 +1,3 @@
-# embedding
-
-TODO
+# embedding
+
+TODO
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -38,6 +38,7 @@ int main(int argc, char ** argv) {
        lparams.seed       = params.seed;
        lparams.f16_kv     = params.memory_f16;
        lparams.logits_all = params.perplexity;
+        lparams.use_mmap   = params.use_mmap;
        lparams.use_mlock  = params.use_mlock;
        lparams.embedding  = params.embedding;

--- a/examples/gpt4all.sh
+++ b/examples/gpt4all.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+#
+# Temporary script - will be removed in the future
+#
+
+# Run from the parent of the directory containing this script.
+# Quoting keeps paths with spaces intact; stop if the directory cannot be entered,
+# otherwise ./main would be looked up in the wrong place.
+cd "$(dirname "$0")/.." || exit 1
+
+./main --color --instruct --threads 4 \
+       --model ./models/gpt4all-7B/gpt4all-lora-quantized.bin \
+       --file ./prompts/alpaca.txt \
+       --batch_size 8 --ctx_size 2048 \
+       --repeat_last_n 64 --repeat_penalty 1.3 \
+       --n_predict 128 --temp 0.1 --top_k 40 --top_p 0.95
--- a/examples/main/CMakeLists.txt
+++ b/examples/main/CMakeLists.txt
@@ -1,4 +1,4 @@
 set(TARGET main)
 add_executable(${TARGET} main.cpp)
-target_link_libraries(${TARGET} PRIVATE common llama ggml ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- a/examples/main/README.md
+++ b/examples/main/README.md
@@ -1,3 +1,3 @@
-# main
-
-TODO
+# main
+
+TODO
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -1,3 +1,8 @@
+// Defines sigaction on msys:
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
 #include "common.h"
 #include "llama.h"

@@ -97,6 +102,7 @@ int main(int argc, char ** argv) {
        lparams.n_parts    = params.n_parts;
        lparams.seed       = params.seed;
        lparams.f16_kv     = params.memory_f16;
+        lparams.use_mmap   = params.use_mmap;
        lparams.use_mlock  = params.use_mlock;

        ctx = llama_init_from_file(params.model.c_str(), lparams);
@@ -162,7 +168,7 @@ int main(int argc, char ** argv) {
    }

    // enable interactive mode if reverse prompt or interactive start is specified
-    if (params.antiprompt.size() != 0 || params.interactive_start) { 
+    if (params.antiprompt.size() != 0 || params.interactive_start) {
        params.interactive = true;
    }

@@ -209,7 +215,8 @@ int main(int argc, char ** argv) {
            fprintf(stderr, "Input prefix: '%s'\n", params.input_prefix.c_str());
        }
    }
-    fprintf(stderr, "sampling: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n", params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty);
+    fprintf(stderr, "sampling: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n",
+        params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty);
    fprintf(stderr, "generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
    fprintf(stderr, "\n\n");

@@ -274,10 +281,10 @@ int main(int argc, char ** argv) {

        if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
            // out of user input, sample next token
-            const float top_k          = params.top_k;
-            const float top_p          = params.top_p;
-            const float temp           = params.temp;
-            const float repeat_penalty = params.repeat_penalty;
+            const int32_t top_k          = params.top_k;
+            const float   top_p          = params.top_p;
+            const float   temp           = params.temp;
+            const float   repeat_penalty = params.repeat_penalty;

            llama_token id = 0;

@@ -367,6 +374,11 @@ int main(int argc, char ** argv) {
                // potentially set color to indicate we are taking user input
                set_console_color(con_st, CONSOLE_COLOR_USER_INPUT);

+#if defined (_WIN32)
+                // Windows: must reactivate sigint handler after each signal
+                signal(SIGINT, sigint_handler);
+#endif
+
                if (params.instruct) {
                    printf("\n> ");
                }
@@ -380,10 +392,19 @@ int main(int argc, char ** argv) {
                std::string line;
                bool another_line = true;
                do {
+#if defined(_WIN32)
+                    std::wstring wline;
+                    if (!std::getline(std::wcin, wline)) {
+                        // input stream is bad or EOF received
+                        return 0;
+                    }
+                    win32_utf8_encode(wline, line);
+#else
                    if (!std::getline(std::cin, line)) {
                        // input stream is bad or EOF received
                        return 0;
                    }
+#endif
                    if (line.empty() || line.back() != '\\') {
                        another_line = false;
                    } else {
@@ -425,7 +446,7 @@ int main(int argc, char ** argv) {
        }

        // end of text token
-        if (embd.back() == llama_token_eos()) {
+        if (!embd.empty() && embd.back() == llama_token_eos()) {
            if (params.instruct) {
                is_interacting = true;
            } else {
--- a/examples/perplexity/CMakeLists.txt
+++ b/examples/perplexity/CMakeLists.txt
@@ -1,4 +1,4 @@
 set(TARGET perplexity)
 add_executable(${TARGET} perplexity.cpp)
-target_link_libraries(${TARGET} PRIVATE common llama ggml ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- a/examples/perplexity/README.md
+++ b/examples/perplexity/README.md
@@ -1,3 +1,3 @@
-# perplexity
-
-TODO
+# perplexity
+
+TODO
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -1,15 +1,17 @@
 #include "common.h"
 #include "llama.h"

-std::vector<double> softmax(const std::vector<float>& logits) {
-    std::vector<double> probs(logits.size());
+#include <cmath>
+
+std::vector<float> softmax(const std::vector<float>& logits) {
+    std::vector<float> probs(logits.size());
    float max_logit = logits[0];
    for (float v : logits) max_logit = std::max(max_logit, v);
    double sum_exp = 0.0;
    for (size_t i = 0; i < logits.size(); i++) {
        // Subtract the maximum logit value from the current logit value for numerical stability
-        float logit = logits[i] - max_logit;
-        double exp_logit = std::exp(logit);
+        const float logit = logits[i] - max_logit;
+        const float exp_logit = expf(logit);
        sum_exp += exp_logit;
        probs[i] = exp_logit;
    }
@@ -24,14 +26,16 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
    auto tokens = ::llama_tokenize(ctx, params.prompt, true);

    int count = 0;
-    double nll = 0.0;
    int seq_count = tokens.size() / params.n_ctx;

+    double nll = 0.0;
+
    fprintf(stderr, "%s : calculating perplexity over %d chunks\n", __func__, seq_count);

    for (int i = 0; i < seq_count; ++i) {
        int start = i * params.n_ctx;
-        int end = start + params.n_ctx - 1;
+        int end = start + params.n_ctx - 1; // TODO: this is not optimal, e.g. it makes the batch 511 instead of 512
+                                            //       it is better to always be power of 2 for better performance
        std::vector<llama_token> embd(tokens.begin() + start, tokens.begin() + end);
        auto start_t = std::chrono::high_resolution_clock::now();
        if (llama_eval(ctx, embd.data(), embd.size(), 0, params.n_threads)) {
@@ -40,7 +44,7 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
        }
        auto end_t = std::chrono::high_resolution_clock::now();
        if (i == 0) {
-            double seconds = std::chrono::duration<double>(end_t - start_t).count();
+            const float seconds = std::chrono::duration<float>(end_t - start_t).count();
            printf("%.2f seconds per pass - ETA %.2f hours\n", seconds, (seconds * seq_count) / (60.0*60.0));
        }
        // We get the logits for all the tokens in the context window (params.n_ctx)
@@ -63,7 +67,7 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
            std::vector<float> tok_logits(
                logits + j * n_vocab,
                logits + (j + 1) * n_vocab);
-            double prob = softmax(tok_logits)[tokens[start + j + 1]];
+            const float prob = softmax(tok_logits)[tokens[start + j + 1]];
            nll += -std::log(prob);
            ++count;
        }
@@ -111,6 +115,7 @@ int main(int argc, char ** argv) {
        lparams.seed       = params.seed;
        lparams.f16_kv     = params.memory_f16;
        lparams.logits_all = params.perplexity;
+        lparams.use_mmap   = params.use_mmap;
        lparams.use_mlock  = params.use_mlock;
        lparams.embedding  = params.embedding;

--- a/examples/quantize-stats/CMakeLists.txt
+++ b/examples/quantize-stats/CMakeLists.txt
@@ -0,0 +1,4 @@
+# Builds the quantize-stats executable from quantize-stats.cpp, linked against the
+# llama target and the thread libraries; the target requires C++11.
+set(TARGET quantize-stats)
+add_executable(${TARGET} quantize-stats.cpp)
+target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- a/examples/quantize-stats/quantize-stats.cpp
+++ b/examples/quantize-stats/quantize-stats.cpp
@@ -0,0 +1,354 @@
+#include "ggml.h"
+#include "llama.h"
+#include "llama_internal.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cinttypes>
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <map>
+#include <numeric>
+#include <regex>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+static const char * type_strs[] = { "f32", "f16", "q4_0", "q4_1", "i8", "i16", "i32" }; // indexed by enum ggml_type - keep in the same order as the enum
+static_assert(sizeof(type_strs) == GGML_TYPE_COUNT * sizeof(char *), "Incomplete type list");
+
+struct quantize_stats_params { // options filled in from the command line by main()
+    std::string model = "models/7B/ggml-model-f16.bin"; // -m: path of the model file to load
+    bool verbose = false; // -v: print progress per type and per tensor
+    bool per_layer_stats = false; // -p: print error stats for every tested tensor
+    bool print_histogram = false; // --histogram: print the error histogram with the per-type summary
+    bool reference = false; // -r: call quantize_row_q_reference instead of quantize_row_q
+    std::vector<std::string> include_layers; // -l: regex patterns; when non-empty only matching tensors are tested
+    std::vector<std::string> exclude_layers; // -L: regex patterns; matching tensors are always skipped
+    std::vector<enum ggml_type> include_types; // -t: when non-empty only these types are tested
+};
+
+const int64_t SCRATCH_ELEMENTS = 32*32; // maximum number of elements quantized per chunk
+const size_t HISTOGRAM_BUCKETS = 150; // number of buckets in error_stats::error_histogram
+const double HISTOGRAM_RANGE = 0.03; // absolute error spanned by the buckets; larger errors are counted in the last bucket
+
+struct error_stats { // accumulated quantize/dequantize round-trip error
+    size_t num_samples; // number of elements compared so far
+    double total_error; // sum of squared differences
+    double max_error; // largest absolute difference seen
+    uint64_t error_histogram[HISTOGRAM_BUCKETS]; // number of absolute differences falling into each bucket
+};
+
+
+void quantize_stats_print_usage(int /*argc*/, char ** argv) { // print the command-line help to stderr; argv[0] is used as the program name
+    quantize_stats_params params; // default-constructed only to report the default model path
+    fprintf(stderr, "usage: %s [options]\n", argv[0]);
+    fprintf(stderr, "\n");
+    fprintf(stderr, "options:\n");
+    fprintf(stderr, "  -h, --help            show this help message and exit\n");
+    fprintf(stderr, "  -m FNAME, --model FNAME\n");
+    fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
+    fprintf(stderr, "  -r, --reference\n");
+    fprintf(stderr, "                        use reference implementation (default: false)\n");
+    fprintf(stderr, "  -v, --verbose\n");
+    fprintf(stderr, "                        verbose output (default: false)\n");
+    fprintf(stderr, "  -p, --per-layer-stats\n");
+    fprintf(stderr, "                        print stats per layer (default: false)\n");
+    fprintf(stderr, "  --histogram\n");
+    fprintf(stderr, "                        print error histogram (default: false)\n");
+    fprintf(stderr, "  -l LAYER, --include-layer LAYER\n");
+    fprintf(stderr, "                        only test layers matching pattern\n");
+    fprintf(stderr, "  -L LAYER, --exclude-layer LAYER\n");
+    fprintf(stderr, "                        exclude layers matching pattern\n");
+    fprintf(stderr, "  -t TYPE, --type TYPE\n");
+    fprintf(stderr, "                        only test given type (q4_0, q4_1)\n");
+    fprintf(stderr, "\n");
+}
+
+// Check if a layer is included/excluded by command line: exclude patterns win; with no include patterns every remaining layer is included
+bool layer_included(const quantize_stats_params & params, const std::string & layer) { // by reference: called per tensor, so avoid copying the pattern vectors
+    for (const auto& excluded : params.exclude_layers) {
+        if (std::regex_search(layer, std::regex(excluded))) {
+            return false;
+        }
+    }
+    for (const auto& included : params.include_layers) {
+        if (std::regex_search(layer, std::regex(included))) {
+            return true;
+        }
+    }
+    return params.include_layers.empty();
+}
+
+// Update error statistics given vectors with the before/after result of quantization; errors beyond HISTOGRAM_RANGE are counted in the last bucket
+void update_error_stats(int64_t nelements, const float * input, const float * output, error_stats & stats) {
+    for (int64_t i = 0; i < nelements; i++) {
+        double diff = input[i] - output[i];
+        stats.total_error += diff * diff;
+        stats.max_error = fmax(fabs(diff), stats.max_error);
+        stats.error_histogram[(size_t) fmin(floor(fabs(diff) / HISTOGRAM_RANGE * HISTOGRAM_BUCKETS), (double) (HISTOGRAM_BUCKETS-1))]++; // clamp while still a double: converting an out-of-range or NaN double to size_t is undefined
+    }
+    stats.num_samples += nelements;
+}
+
+double find_quantile(const error_stats & stats, double quantile) { // upper edge of the first bucket at which the cumulative count reaches the requested fraction
+    const double total = std::accumulate(std::begin(stats.error_histogram), std::end(stats.error_histogram), 0.0);
+    const double threshold = total*quantile;
+    double running = 0;
+    for (size_t bucket = 0; bucket < HISTOGRAM_BUCKETS; bucket++) {
+        running += stats.error_histogram[bucket];
+        if (running >= threshold) {
+            return (bucket+1) * HISTOGRAM_RANGE / HISTOGRAM_BUCKETS;
+        }
+    }
+    return INFINITY;
+}
+
+void print_error_stats(const std::string & name, const error_stats & stats, bool print_histogram) { // print a one-line summary labelled `name`, optionally followed by the full histogram
+    double rmse = sqrt(stats.total_error / (double) stats.num_samples); // total_error holds the sum of squared differences
+    double median = find_quantile(stats, .5);
+    double pct95 = find_quantile(stats, .95);
+    printf("%-50s: rmse %.8f, maxerr %.8f, 95pct<%.4f, median<%.4f\n", name.c_str(), rmse, stats.max_error, pct95, median);
+    if (print_histogram) {
+        printf("Error distribution:\n");
+        for (size_t i = 0; i < HISTOGRAM_BUCKETS; i++) {
+            double lower = i * HISTOGRAM_RANGE / HISTOGRAM_BUCKETS;
+            double upper = (i+1) * HISTOGRAM_RANGE / HISTOGRAM_BUCKETS;
+            if (i == HISTOGRAM_BUCKETS -1) upper = INFINITY; // the last bucket also collects every error beyond HISTOGRAM_RANGE
+            printf("[%3.4f, %3.4f): %11" PRIu64 "\n", lower, upper, stats.error_histogram[i]);
+        }
+    }
+}
+
+// copied from ggml.h - verify that we can access this as a flat array
+static bool tensor_is_contiguous(const struct ggml_tensor * tensor) {
+    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+
+    return
+        tensor->nb[0] == ggml_type_size(tensor->type) && // innermost stride is exactly one block of this type
+        tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/ggml_blck_size(tensor->type) && // a row is ne[0] elements packed into blocks, without padding
+        tensor->nb[2] == tensor->nb[1]*tensor->ne[1] && // each higher stride is the previous stride times the previous extent
+        tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
+}
+
+// Run quantization function for a single layer and update error stats (total_error always; a per-layer summary is printed under `name` when print_layer_stats is set)
+void test_roundtrip_on_layer(
+        std::string & name,
+        bool print_layer_stats,
+        const quantize_fns_t & qfns,
+        bool use_reference,
+        const ggml_tensor * layer,
+        float * input_scratch, // written with up to SCRATCH_ELEMENTS floats for F16 layers
+        char *quantized_scratch, // receives the quantized form of one chunk
+        float * output_scratch, // receives up to SCRATCH_ELEMENTS dequantized floats
+        error_stats & total_error) {
+
+    assert(tensor_is_contiguous(layer));
+    error_stats layer_error {};
+    int64_t nelements = ggml_nelements(layer);
+
+    for (int64_t offset = 0; offset < nelements; offset += SCRATCH_ELEMENTS) { // process the tensor in chunks of at most SCRATCH_ELEMENTS
+        int64_t chunk_size = std::min(SCRATCH_ELEMENTS, nelements - offset);
+
+        if (layer->type == GGML_TYPE_F16) {
+            for (int i = 0; i < chunk_size; i++) {
+                input_scratch[i] = ggml_get_f32_1d(layer, i + offset); // fetch each element as a float into the scratch buffer
+            }
+        } else {
+            input_scratch = ggml_get_data_f32(layer) + offset; // no copy: re-point the local pointer at the tensor's own float data
+        }
+
+        if (use_reference) {
+            qfns.quantize_row_q_reference(input_scratch, quantized_scratch, chunk_size);
+        } else {
+            qfns.quantize_row_q(input_scratch, quantized_scratch, chunk_size);
+        }
+        qfns.dequantize_row_q(quantized_scratch, output_scratch, chunk_size);
+
+        update_error_stats(chunk_size, input_scratch, output_scratch, total_error);
+        if (print_layer_stats) {
+            update_error_stats(chunk_size, input_scratch, output_scratch, layer_error); // layer_error is only accumulated when it will be printed
+        }
+    }
+    if (print_layer_stats) {
+        print_error_stats(name, layer_error, false);
+    }
+}
+
+int main(int argc, char ** argv) { // parse options, load the model, then quantize/dequantize every selected tensor with every selected type and report the error
+    ggml_time_init();
+
+    quantize_stats_params params;
+
+    // read command line
+
+    bool invalid_param = false;
+    std::string arg;
+    for (int i = 1; i < argc; i++) {
+        arg = argv[i];
+
+        if (arg == "-h" || arg == "--help") {
+            quantize_stats_print_usage(argc, argv);
+            exit(0);
+        } else if (arg == "-r" || arg == "--reference") {
+            params.reference = true;
+        } else if (arg == "-v" || arg == "--verbose") { // the long form is advertised by the usage text
+            params.verbose = true;
+        } else if (arg == "-p" || arg == "--per-layer-stats") {
+            params.per_layer_stats = true;
+        } else if (arg == "--histogram") {
+            params.print_histogram = true;
+        } else if (arg == "-m" || arg == "--model") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.model = argv[i];
+        } else if (arg == "-l" || arg == "--include-layer") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.include_layers.push_back(argv[i]);
+        } else if (arg == "-L" || arg == "--exclude-layer") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.exclude_layers.push_back(argv[i]);
+        } else if (arg == "-t" || arg == "--type") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            int j;
+            for (j = 0; j < GGML_TYPE_COUNT && strcmp(argv[i], type_strs[j]) != 0; j++) {
+                // find match
+            }
+            if (j >= GGML_TYPE_COUNT) {
+                fprintf(stderr, "error: %s not in list of types\n", argv[i]);
+                invalid_param = true;
+                break; // stop here so that the error below names this option, not a later argument
+            }
+            params.include_types.push_back((ggml_type) j);
+        } else {
+            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
+            quantize_stats_print_usage(argc, argv);
+            return 1;
+        }
+    }
+    if (invalid_param) {
+        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
+        quantize_stats_print_usage(argc, argv);
+        return 1;
+    }
+
+    // load the model
+    fprintf(stderr, "Loading model\n");
+
+    const int64_t t_main_start_us = ggml_time_us();
+    llama_context * ctx;
+
+    {
+        auto lparams = llama_context_default_params();
+
+        lparams.n_ctx      = 256;
+        lparams.n_parts    = 1;
+        lparams.seed       = 1;
+        lparams.f16_kv     = false;
+        lparams.use_mlock  = false;
+
+        ctx = llama_init_from_file(params.model.c_str(), lparams);
+
+        if (ctx == NULL) {
+            fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
+            return 1;
+        }
+    }
+
+    const auto &tensors = llama_internal_get_tensor_map(ctx);
+
+    // check layer tensors
+    int included_layers = 0;
+    int64_t max_nelements = 0;
+    bool is_f16 = false;
+    for (const auto& kv_tensor : tensors) {
+        if (!layer_included(params, kv_tensor.first)) {
+            continue;
+        }
+        if (params.verbose) {
+            printf("%s: type %s, size %" PRId64 "\n", kv_tensor.first.c_str(), type_strs[kv_tensor.second->type], ggml_nelements(kv_tensor.second));
+        }
+        if (kv_tensor.second->type == GGML_TYPE_F16) {
+            is_f16 = true;
+        } else if (kv_tensor.second->type != GGML_TYPE_F32) {
+            fprintf(stderr, "%s: error: Quantization should be tested with a float model, "
+                "this model contains already quantized layers (%s is type %d)\n", __func__, kv_tensor.first.c_str(), kv_tensor.second->type);
+            llama_free(ctx);
+            return 1;
+        }
+        included_layers++;
+        max_nelements = std::max(max_nelements, ggml_nelements(kv_tensor.second));
+    }
+
+    if (is_f16) {
+        printf("note: source model is f16\n");
+    }
+    printf("testing %d layers with max size %" PRId64 "\n", included_layers, max_nelements);
+    // allocate scratch space
+    std::vector<float> input_scratch(SCRATCH_ELEMENTS);
+    std::vector<char> quantized_scratch(SCRATCH_ELEMENTS*4);
+    std::vector<float> output_scratch(SCRATCH_ELEMENTS);
+
+    // loop through quantization types
+    for (int i = 0; i < GGML_TYPE_COUNT; i++) {
+        if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
+            continue;
+        }
+        quantize_fns_t qfns = ggml_internal_get_quantize_fn(i);
+        if (qfns.quantize_row_q && qfns.dequantize_row_q) { // only types providing both directions are tested
+            if (params.verbose) {
+                printf("testing %s ...\n",  type_strs[i]);
+            }
+
+            error_stats global_stats {};
+
+            for (const auto& kv_tensor : tensors) {
+                if (!layer_included(params, kv_tensor.first)) {
+                    continue;
+                }
+                if (params.verbose) {
+                    printf("  %s ...\n",  kv_tensor.first.c_str());
+                }
+                std::string layer_name { type_strs[i] };
+                layer_name += "::" + kv_tensor.first;
+                test_roundtrip_on_layer(
+                        layer_name,
+                        params.per_layer_stats,
+                        qfns,
+                        params.reference,
+                        kv_tensor.second,
+                        input_scratch.data(),
+                        quantized_scratch.data(),
+                        output_scratch.data(),
+                        global_stats
+                );
+            }
+
+            print_error_stats(type_strs[i], global_stats, params.print_histogram);
+        }
+    }
+
+
+    llama_free(ctx);
+    // report timing
+    {
+        const int64_t t_main_end_us = ggml_time_us();
+
+        printf("\n");
+        printf("%s:    total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0);
+    }
+
+    return 0;
+}
--- a/examples/quantize/CMakeLists.txt
+++ b/examples/quantize/CMakeLists.txt
@@ -1,4 +1,4 @@
 set(TARGET quantize)
 add_executable(${TARGET} quantize.cpp)
-target_link_libraries(${TARGET} PRIVATE llama ggml ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -5,21 +5,21 @@
 #include <string>

 // usage:
-//  ./llama-quantize models/llama/ggml-model.bin models/llama/ggml-model-quant.bin type
+//  ./quantize models/llama/ggml-model.bin models/llama/ggml-model-quant.bin type
 //
 int main(int argc, char ** argv) {
    ggml_time_init();

    if (argc != 4) {
        fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
-        fprintf(stderr, "  type = 2 - q4_0\n");
-        fprintf(stderr, "  type = 3 - q4_1\n");
+        fprintf(stderr, "  type = %d - q4_0\n", LLAMA_FTYPE_MOSTLY_Q4_0);
+        fprintf(stderr, "  type = %d - q4_1\n", LLAMA_FTYPE_MOSTLY_Q4_1);
        return 1;
    }

    // needed to initialize f16 tables
    {
-        struct ggml_init_params params = { 0, NULL };
+        struct ggml_init_params params = { 0, NULL, false };
        struct ggml_context * ctx = ggml_init(params);
        ggml_free(ctx);
    }
@@ -27,7 +27,7 @@ int main(int argc, char ** argv) {
    const std::string fname_inp = argv[1];
    const std::string fname_out = argv[2];

-    const int itype = atoi(argv[3]);
+    const enum llama_ftype ftype = (enum llama_ftype)atoi(argv[3]);

    const int64_t t_main_start_us = ggml_time_us();

@@ -37,7 +37,7 @@ int main(int argc, char ** argv) {
    {
        const int64_t t_start_us = ggml_time_us();

-        if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), itype)) {
+        if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), ftype)) {
            fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
            return 1;
        }
@@ -50,8 +50,8 @@ int main(int argc, char ** argv) {
        const int64_t t_main_end_us = ggml_time_us();

        printf("\n");
-        printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0f);
-        printf("%s:    total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
+        printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0);
+        printf("%s:    total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0);
    }

    return 0;
--- a/examples/reason-act.sh
+++ b/examples/reason-act.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+
+# Run ./main from the parent of this script's directory, interactively, with the prompts/reason-act.txt prompt.
+cd "$(dirname "$0")" || exit 1
+cd .. || exit 1
+
+# get -m model parameter otherwise defer to default
+if [ "$1" == "-m" ]; then
+  MODEL="-m $2 "
+fi
+
+./main $MODEL --color \
+    -f ./prompts/reason-act.txt \
+    -i --interactive-first \
+    --top_k 10000 --temp 0.2 --repeat_penalty 1 -t 7 -c 2048 \
+    -r "Question:" -r "Observation:" --in-prefix " " \
+    -n -1
--- a/flake.nix
+++ b/flake.nix
@@ -30,6 +30,9 @@
            mkdir -p $out/bin
            mv bin/main $out/bin/llama
            mv bin/quantize $out/bin/quantize
+            mv bin/embedding $out/bin/embedding
+            mv bin/perplexity $out/bin/perplexity
+
            echo "#!${llama-python}/bin/python" > $out/bin/convert-pth-to-ggml
            cat ${./convert-pth-to-ggml.py} >> $out/bin/convert-pth-to-ggml
            chmod +x $out/bin/convert-pth-to-ggml
--- a/ggml.c
+++ b/ggml.c
--- a/ggml.h
+++ b/ggml.h
@@ -198,13 +198,14 @@ struct ggml_object;
 struct ggml_context;

 enum ggml_type {
-    GGML_TYPE_Q4_0,
-    GGML_TYPE_Q4_1,
+    // explicitly numbered values are used in llama.cpp files
+    GGML_TYPE_F32  = 0,
+    GGML_TYPE_F16  = 1,
+    GGML_TYPE_Q4_0 = 2,
+    GGML_TYPE_Q4_1 = 3,
    GGML_TYPE_I8,
    GGML_TYPE_I16,
    GGML_TYPE_I32,
-    GGML_TYPE_F16,
-    GGML_TYPE_F32,
    GGML_TYPE_COUNT,
 };

@@ -236,6 +237,7 @@ enum ggml_op {

    GGML_OP_SCALE,
    GGML_OP_CPY,
+    GGML_OP_CONT,
    GGML_OP_RESHAPE,
    GGML_OP_VIEW,
    GGML_OP_PERMUTE,
@@ -253,16 +255,29 @@ enum ggml_op {
    GGML_OP_COUNT,
 };

+
+// ggml object
+struct ggml_object {
+    size_t offs;
+    size_t size;
+
+    struct ggml_object * next;
+
+    char padding[8];
+};
+
+static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
+
 // n-dimensional tensor
 struct ggml_tensor {
    enum ggml_type type;

    int    n_dims;
-    int    ne[GGML_MAX_DIMS]; // number of elements
-    size_t nb[GGML_MAX_DIMS]; // stride in bytes:
-                              // nb[0] = sizeof(type)
-                              // nb[1] = nb[0]   * ne[0] + padding
-                              // nb[i] = nb[i-1] * ne[i-1]
+    int64_t ne[GGML_MAX_DIMS]; // number of elements
+    size_t  nb[GGML_MAX_DIMS]; // stride in bytes:
+                               // nb[0] = sizeof(type)
+                               // nb[1] = nb[0]   * ne[0] + padding
+                               // nb[i] = nb[i-1] * ne[i-1]

    // compute data
    enum ggml_op op;
@@ -316,6 +331,7 @@ struct ggml_init_params {
    // memory pool
    size_t mem_size;   // bytes
    void * mem_buffer; // if NULL, memory will be allocated internally
+    bool   no_alloc;   // don't allocate memory for the tensor data
 };

 void    ggml_time_init(void); // call this once at the beginning of the program
@@ -327,8 +343,8 @@ int64_t ggml_cycles_per_ms(void);
 void ggml_print_object (const struct ggml_object * obj);
 void ggml_print_objects(const struct ggml_context * ctx);

-int    ggml_nelements(const struct ggml_tensor * tensor);
-size_t ggml_nbytes   (const struct ggml_tensor * tensor);
+int64_t ggml_nelements(const struct ggml_tensor * tensor);
+size_t  ggml_nbytes   (const struct ggml_tensor * tensor);

 int    ggml_blck_size (enum ggml_type type);
 size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
@@ -343,40 +359,37 @@ size_t ggml_used_mem(const struct ggml_context * ctx);

 size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);

-bool ggml_mlock_supported(void);
-bool ggml_mlock(struct ggml_context * ctx, char ** err_p);
-
 struct ggml_tensor * ggml_new_tensor(
        struct ggml_context * ctx,
        enum   ggml_type type,
        int    n_dims,
-        const int *ne);
+        const int64_t *ne);

 struct ggml_tensor * ggml_new_tensor_1d(
        struct ggml_context * ctx,
        enum   ggml_type type,
-        int    ne0);
+        int64_t ne0);

 struct ggml_tensor * ggml_new_tensor_2d(
        struct ggml_context * ctx,
        enum   ggml_type type,
-        int    ne0,
-        int    ne1);
+        int64_t ne0,
+        int64_t ne1);

 struct ggml_tensor * ggml_new_tensor_3d(
        struct ggml_context * ctx,
        enum   ggml_type type,
-        int    ne0,
-        int    ne1,
-        int    ne2);
+        int64_t ne0,
+        int64_t ne1,
+        int64_t ne2);

 struct ggml_tensor * ggml_new_tensor_4d(
        struct ggml_context * ctx,
        enum   ggml_type type,
-        int    ne0,
-        int    ne1,
-        int    ne2,
-        int    ne3);
+        int64_t ne0,
+        int64_t ne1,
+        int64_t ne2,
+        int64_t ne3);

 struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
 struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
@@ -514,6 +527,11 @@ struct ggml_tensor * ggml_cpy(
        struct ggml_tensor  * a,
        struct ggml_tensor  * b);

+// make contiguous
+struct ggml_tensor * ggml_cont(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a);
+
 // return view(a), b specifies the new shape
 // TODO: when we start computing gradient, make a copy instead of view
 struct ggml_tensor * ggml_reshape(
@@ -526,33 +544,43 @@ struct ggml_tensor * ggml_reshape(
 struct ggml_tensor * ggml_reshape_2d(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
-        int                   ne0,
-        int                   ne1);
+        int64_t               ne0,
+        int64_t               ne1);

 // return view(a)
 // TODO: when we start computing gradient, make a copy instead of view
 struct ggml_tensor * ggml_reshape_3d(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
-        int                   ne0,
-        int                   ne1,
-        int                   ne2);
+        int64_t               ne0,
+        int64_t               ne1,
+        int64_t               ne2);

 // offset in bytes
 struct ggml_tensor * ggml_view_1d(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
-        int                   ne0,
+        int64_t               ne0,
        size_t                offset);

 struct ggml_tensor * ggml_view_2d(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
-        int                   ne0,
-        int                   ne1,
+        int64_t               ne0,
+        int64_t               ne1,
        size_t                nb1, // row stride in bytes
        size_t                offset);

+struct ggml_tensor * ggml_view_3d(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int64_t               ne0,
+        int64_t               ne1,
+        int64_t               ne2,
+        size_t                nb1, // row   stride in bytes
+        size_t                nb2, // slice stride in bytes
+        size_t                offset);
+
 struct ggml_tensor * ggml_permute(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
@@ -768,6 +796,30 @@ int ggml_cpu_has_blas(void);
 int ggml_cpu_has_sse3(void);
 int ggml_cpu_has_vsx(void);

+
+//
+// Internal types and functions exposed for tests and benchmarks
+//
+
+#ifdef  __cplusplus
+// restrict not standard in C++
+#define GGML_RESTRICT
+#else
+#define GGML_RESTRICT restrict
+#endif
+typedef void (*dequantize_row_q_t)(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+typedef void (*quantize_row_q_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
+typedef void (*vec_dot_q_t)(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
+
+typedef struct {
+    dequantize_row_q_t dequantize_row_q;
+    quantize_row_q_t   quantize_row_q;
+    quantize_row_q_t   quantize_row_q_reference;
+    vec_dot_q_t        vec_dot_q;
+} quantize_fns_t;
+
+quantize_fns_t ggml_internal_get_quantize_fn(size_t i);
+
 #ifdef  __cplusplus
 }
 #endif
--- a/llama.cpp
+++ b/llama.cpp
--- a/llama.h
+++ b/llama.h
@@ -6,7 +6,7 @@
 #include <stdbool.h>

 #ifdef LLAMA_SHARED
-#    ifdef _WIN32
+#    if defined(_WIN32) && !defined(__MINGW32__)
 #        ifdef LLAMA_BUILD
 #            define LLAMA_API __declspec(dllexport)
 #        else
@@ -20,7 +20,7 @@
 #endif

 #define LLAMA_FILE_VERSION 1
-#define LLAMA_FILE_MAGIC 0x67676d66 // 'ggmf' in hex
+#define LLAMA_FILE_MAGIC 0x67676a74 // 'ggjt' in hex
 #define LLAMA_FILE_MAGIC_UNVERSIONED 0x67676d6c // pre-versioned files

 #ifdef __cplusplus
@@ -45,7 +45,7 @@ extern "C" {

    } llama_token_data;

-    typedef void (*llama_progress_callback)(double progress, void *ctx);
+    typedef void (*llama_progress_callback)(float progress, void *ctx);

    struct llama_context_params {
        int n_ctx;   // text context
@@ -55,6 +55,7 @@ extern "C" {
        bool f16_kv;     // use fp16 for KV cache
        bool logits_all; // the llama_eval() call computes all logits, not just the last one
        bool vocab_only; // only load the vocabulary, no weights
+        bool use_mmap;   // use mmap if possible
        bool use_mlock;  // force system to keep model in RAM
        bool embedding;  // embedding mode only

@@ -64,8 +65,20 @@ extern "C" {
        void * progress_callback_user_data;
    };

+    // model file types
+    enum llama_ftype {
+        LLAMA_FTYPE_ALL_F32     = 0,
+        LLAMA_FTYPE_MOSTLY_F16  = 1,  // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_0 = 2,  // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_1 = 3,  // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
+    };
+
    LLAMA_API struct llama_context_params llama_context_default_params();

+    LLAMA_API bool llama_mmap_supported();
+    LLAMA_API bool llama_mlock_supported();
+
    // Various functions for loading a ggml llama model.
    // Allocate (almost) all memory needed for the model.
    // Return NULL on failure
@@ -81,7 +94,24 @@ extern "C" {
    LLAMA_API int llama_model_quantize(
            const char * fname_inp,
            const char * fname_out,
-                   int   itype);
+      enum llama_ftype   ftype);
+
+    // Returns the KV cache that will contain the context for the
+    // ongoing prediction with the model.
+    LLAMA_API const uint8_t * llama_get_kv_cache(struct llama_context * ctx);
+
+    // Returns the size of the KV cache
+    LLAMA_API size_t llama_get_kv_cache_size(struct llama_context * ctx);
+
+    // Returns the number of tokens in the KV cache
+    LLAMA_API int llama_get_kv_cache_token_count(struct llama_context * ctx);
+
+    // Sets the KV cache containing the current context for the model
+    LLAMA_API void llama_set_kv_cache(
+            struct llama_context * ctx,
+                   const uint8_t * kv_cache,
+                          size_t   n_size,
+                             int   n_token_count);

    // Run the llama inference to obtain the logits and probabilities for the next token.
    // tokens + n_tokens is the provided batch of new tokens to process
@@ -134,9 +164,9 @@ extern "C" {
          const llama_token * last_n_tokens_data,
                        int   last_n_tokens_size,
                        int   top_k,
-                     double   top_p,
-                     double   temp,
-                     double   repeat_penalty);
+                      float   top_p,
+                      float   temp,
+                      float   repeat_penalty);

    // Performance information
    LLAMA_API void llama_print_timings(struct llama_context * ctx);
@@ -149,4 +179,4 @@ extern "C" {
 }
 #endif

-#endif
+#endif // LLAMA_H
--- a/llama_internal.h
+++ b/llama_internal.h
@@ -0,0 +1,12 @@
+// Internal header to be included by llama.cpp and tests/benchmarks only.
+
+#ifndef LLAMA_INTERNAL_H
+#define LLAMA_INTERNAL_H
+
+#include <vector>
+#include <string>
+struct ggml_tensor;
+
+std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
+
+#endif // LLAMA_INTERNAL_H
--- a/llama_util.h
+++ b/llama_util.h
@@ -0,0 +1,389 @@
+// Internal header to be included only by llama.cpp.
+// Contains wrappers around OS interfaces.
+
+#ifndef LLAMA_UTIL_H
+#define LLAMA_UTIL_H
+
+#include <cstdio>
+#include <cstdint>
+#include <cerrno>
+#include <cstring>
+#include <cstdarg>
+#include <cstdlib>
+#include <climits>
+
+#include <string>
+#include <vector>
+
+#ifdef __has_include
+    #if __has_include(<unistd.h>)
+        #include <unistd.h>
+        #if defined(_POSIX_MAPPED_FILES)
+            #include <sys/mman.h>
+        #endif
+    #endif
+#endif
+
+#if defined(_WIN32)
+    #define WIN32_LEAN_AND_MEAN
+    #ifndef NOMINMAX
+        #define NOMINMAX
+    #endif
+    #include <windows.h>
+    #include <io.h>
+    #include <stdio.h> // for _fseeki64
+#endif
+
+#define LLAMA_ASSERT(x) \
+    do { \
+        if (!(x)) { \
+            fprintf(stderr, "LLAMA_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
+            abort(); \
+        } \
+    } while (0)
+
+// Builds a std::string from a printf-style format string and its arguments.
+// Aborts through LLAMA_ASSERT if vsnprintf reports an error or if the two
+// formatting passes disagree on the output length.
+#ifdef __GNUC__
+__attribute__((format(printf, 1, 2)))
+#endif
+static std::string format(const char * fmt, ...) {
+    va_list ap, ap2;
+    va_start(ap, fmt);
+    va_copy(ap2, ap); // each vsnprintf pass consumes its own va_list
+    // First pass: a NULL/0 destination only measures the formatted length.
+    int size = vsnprintf(NULL, 0, fmt, ap);
+    LLAMA_ASSERT(size >= 0 && size < INT_MAX);
+    std::vector<char> buf(size + 1);
+    // Second pass: format into the buffer (size + 1 leaves room for the NUL).
+    int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
+    LLAMA_ASSERT(size2 == size);
+    va_end(ap2);
+    va_end(ap);
+    return std::string(buf.data(), size);
+}
+
+// Wrapper around a stdio FILE* that records the file size when opened and
+// closes the stream on destruction. I/O failures are reported by throwing a
+// std::string; seek/tell failures abort through LLAMA_ASSERT.
+// NOTE(review): unlike llama_mmap and llama_mlock, copying is not deleted here;
+// a copy would fclose() the same FILE* twice - confirm no caller copies this.
+struct llama_file {
+    // use FILE * so we don't have to re-open the file to mmap
+    FILE * fp;
+    size_t size;
+
+    // Opens fname with the given fopen mode; throws a std::string if fopen fails.
+    llama_file(const char * fname, const char * mode) {
+        fp = std::fopen(fname, mode);
+        if (fp == NULL) {
+            throw format("failed to open %s: %s", fname, std::strerror(errno));
+        }
+        // Measure the file by seeking to its end, then rewind to the start.
+        seek(0, SEEK_END);
+        size = tell();
+        seek(0, SEEK_SET);
+    }
+
+    // Returns the current position in the stream.
+    size_t tell() const {
+#ifdef _WIN32
+        __int64 ret = _ftelli64(fp);
+#else
+        long ret = std::ftell(fp);
+#endif
+        LLAMA_ASSERT(ret != -1); // this really shouldn't fail
+        return (size_t) ret;
+    }
+
+    // Moves the stream position; whence is SEEK_SET, SEEK_CUR or SEEK_END.
+    // NOTE(review): the non-Windows path narrows offset to long - confirm this
+    // is acceptable on targets where long is 32 bits.
+    void seek(size_t offset, int whence) {
+#ifdef _WIN32
+        int ret = _fseeki64(fp, (__int64) offset, whence);
+#else
+        int ret = std::fseek(fp, (long) offset, whence);
+#endif
+        LLAMA_ASSERT(ret == 0); // same
+    }
+
+    // Reads exactly size bytes into ptr; throws a std::string on a read error
+    // or when fewer than size bytes are available. size == 0 is a no-op.
+    void read_raw(void * ptr, size_t size) {
+        if (size == 0) {
+            return;
+        }
+        errno = 0;
+        // One item of `size` bytes: ret is 1 only if the whole item was read.
+        std::size_t ret = std::fread(ptr, size, 1, fp);
+        if (ferror(fp)) {
+            throw format("read error: %s", strerror(errno));
+        }
+        if (ret != 1) {
+            throw std::string("unexpectedly reached end of file");
+        }
+    }
+
+    // Reads a 32-bit unsigned integer as raw bytes (no byte-order conversion).
+    std::uint32_t read_u32() {
+        std::uint32_t ret;
+        read_raw(&ret, sizeof(ret));
+        return ret;
+    }
+
+    // Reads len raw bytes and returns them as a std::string.
+    std::string read_string(std::uint32_t len) {
+        std::vector<char> chars(len);
+        read_raw(chars.data(), len);
+        return std::string(chars.data(), len);
+    }
+
+    // Writes exactly size bytes from ptr; throws a std::string on failure.
+    // size == 0 is a no-op.
+    void write_raw(const void * ptr, size_t size) {
+        if (size == 0) {
+            return;
+        }
+        errno = 0;
+        size_t ret = std::fwrite(ptr, size, 1, fp);
+        if (ret != 1) {
+            throw format("write error: %s", strerror(errno));
+        }
+    }
+
+    // Writes a 32-bit unsigned integer as raw bytes (no byte-order conversion).
+    void write_u32(std::uint32_t val) {
+        write_raw(&val, sizeof(val));
+    }
+
+    // Closes the stream if it was opened.
+    ~llama_file() {
+        if (fp) {
+            std::fclose(fp);
+        }
+    }
+};
+
+#if defined(_WIN32)
+// Returns the system message text for the Win32 error code err, or the fixed
+// string "FormatMessageA failed" when the lookup itself fails.
+static std::string llama_format_win_err(DWORD err) {
+    LPSTR buf;
+    // FORMAT_MESSAGE_ALLOCATE_BUFFER: the API allocates the buffer and stores
+    // its address in buf, which is why &buf is passed cast to LPSTR.
+    size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
+                                 NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, NULL);
+    if (!size) {
+        return "FormatMessageA failed";
+    }
+    std::string ret(buf, size);
+    // Release the buffer allocated by FormatMessageA.
+    LocalFree(buf);
+    return ret;
+}
+#endif
+
+// Read-only memory mapping of a whole llama_file. The mapping is created in
+// the constructor (which throws a std::string on failure) and released in the
+// destructor. SUPPORTED tells callers whether this platform can map files.
+//
+// The descriptor/handle obtained from file->fp stays owned by the llama_file:
+// it is only borrowed to create the mapping and is never closed here, because
+// llama_file's destructor closes it through fclose().
+struct llama_mmap {
+    void * addr;
+    size_t size;
+
+    llama_mmap(const llama_mmap &) = delete;
+
+#ifdef _POSIX_MAPPED_FILES
+    static constexpr bool SUPPORTED = true;
+
+    llama_mmap(struct llama_file * file) {
+        size = file->size;
+        int fd = fileno(file->fp);
+        int flags = MAP_SHARED;
+#ifdef __linux__
+        flags |= MAP_POPULATE;
+#endif
+        addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
+        // Do not close(fd): it belongs to file->fp, and closing it here would
+        // make the later fclose() act on an invalid or recycled descriptor
+        // (and could overwrite errno before it is reported below).
+        if (addr == MAP_FAILED) {
+            throw format("mmap failed: %s", strerror(errno));
+        }
+
+        // Advise the kernel to preload the mapped memory
+        if (madvise(addr, file->size, MADV_WILLNEED)) {
+            fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
+                    strerror(errno));
+        }
+    }
+
+    ~llama_mmap() {
+        munmap(addr, size);
+    }
+#elif defined(_WIN32)
+    static constexpr bool SUPPORTED = true;
+
+    llama_mmap(struct llama_file * file) {
+        size = file->size;
+
+        // The handle returned by _get_osfhandle is owned by the CRT file
+        // descriptor; it must not be passed to CloseHandle.
+        HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));
+
+        HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
+        DWORD error = GetLastError();
+
+        if (hMapping == NULL) {
+            throw format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str());
+        }
+
+        addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
+        error = GetLastError();
+        // The mapping object is created here, so it is ours to close.
+        CloseHandle(hMapping);
+
+        if (addr == NULL) {
+            throw format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str());
+        }
+
+        #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
+        // Advise the kernel to preload the mapped memory
+        WIN32_MEMORY_RANGE_ENTRY range;
+        range.VirtualAddress = addr;
+        range.NumberOfBytes = (SIZE_T)size;
+        if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
+            fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
+                    llama_format_win_err(GetLastError()).c_str());
+        }
+        #else
+        #pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
+        #endif // _WIN32_WINNT >= _WIN32_WINNT_WIN8
+    }
+
+    ~llama_mmap() {
+        if (!UnmapViewOfFile(addr)) {
+            fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n",
+                    llama_format_win_err(GetLastError()).c_str());
+        }
+    }
+#else
+    static constexpr bool SUPPORTED = false;
+
+    llama_mmap(struct llama_file *) {
+        throw std::string("mmap not supported");
+    }
+#endif
+};
+
+// Represents some region of memory being locked using mlock or VirtualLock;
+// will automatically unlock on destruction.
+//
+// Usage: call init() once with the base address, then grow_to() with
+// increasing sizes. After the first locking failure a warning has been printed
+// and every later grow_to() is a silent no-op. SUPPORTED tells callers whether
+// the platform has a locking primitive; on platforms without one, grow_to()
+// warns once and locks nothing.
+struct llama_mlock {
+    void * addr = NULL;
+    size_t size = 0;
+    bool failed_already = false;
+
+    llama_mlock() {}
+    llama_mlock(const llama_mlock &) = delete;
+
+    ~llama_mlock() {
+        if (size) {
+            raw_unlock(addr, size);
+        }
+    }
+
+    // Records the base address of the region; may only be called once.
+    void init(void * addr) {
+        LLAMA_ASSERT(this->addr == NULL && this->size == 0);
+        this->addr = addr;
+    }
+
+    // Extends the locked prefix of the region to at least target_size bytes.
+    void grow_to(size_t target_size) {
+        LLAMA_ASSERT(addr);
+        if (failed_already) {
+            return;
+        }
+        size_t granularity = lock_granularity();
+        // Round up to a multiple of the granularity (which must be a power of two).
+        target_size = (target_size + granularity - 1) & ~(granularity - 1);
+        if (target_size > size) {
+            // Only the not-yet-locked tail is passed to raw_lock.
+            if (raw_lock((uint8_t *) addr + size, target_size - size)) {
+                size = target_size;
+            } else {
+                failed_already = true;
+            }
+        }
+    }
+
+#ifdef _POSIX_MEMLOCK_RANGE
+    static constexpr bool SUPPORTED = true;
+
+    size_t lock_granularity() {
+        return (size_t) sysconf(_SC_PAGESIZE);
+    }
+
+    #ifdef __APPLE__
+        #define MLOCK_SUGGESTION \
+            "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
+            "decreasing 'vm.global_no_user_wire_amount'.  Also try increasing RLIMIT_MLOCK (ulimit -l).\n"
+    #else
+        #define MLOCK_SUGGESTION \
+            "Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n"
+    #endif
+
+    bool raw_lock(const void * addr, size_t size) {
+        if (!mlock(addr, size)) {
+            return true;
+        } else {
+            fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n" MLOCK_SUGGESTION,
+                    size, this->size, std::strerror(errno));
+            return false;
+        }
+    }
+
+    #undef MLOCK_SUGGESTION
+
+    void raw_unlock(void * addr, size_t size) {
+        if (munlock(addr, size)) {
+            fprintf(stderr, "warning: failed to munlock buffer: %s\n", std::strerror(errno));
+        }
+    }
+#elif defined(_WIN32)
+    static constexpr bool SUPPORTED = true;
+
+    size_t lock_granularity() {
+        SYSTEM_INFO si;
+        GetSystemInfo(&si);
+        return (size_t) si.dwPageSize;
+    }
+
+    bool raw_lock(void * addr, size_t size) {
+        for (int tries = 1; ; tries++) {
+            if (VirtualLock(addr, size)) {
+                return true;
+            }
+            if (tries == 2) {
+                fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
+                        size, this->size, llama_format_win_err(GetLastError()).c_str());
+                return false;
+            }
+
+            // It failed but this was only the first try; increase the working
+            // set size and try again.
+            SIZE_T min_ws_size, max_ws_size;
+            if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws_size, &max_ws_size)) {
+                fprintf(stderr, "warning: GetProcessWorkingSetSize failed: %s\n",
+                        llama_format_win_err(GetLastError()).c_str());
+                return false;
+            }
+            // Per MSDN: "The maximum number of pages that a process can lock
+            // is equal to the number of pages in its minimum working set minus
+            // a small overhead."
+            // Hopefully a megabyte is enough overhead:
+            size_t increment = size + 1048576;
+            // The minimum must be <= the maximum, so we need to increase both:
+            min_ws_size += increment;
+            max_ws_size += increment;
+            if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) {
+                fprintf(stderr, "warning: SetProcessWorkingSetSize failed: %s\n",
+                        llama_format_win_err(GetLastError()).c_str());
+                return false;
+            }
+        }
+    }
+
+    void raw_unlock(void * addr, size_t size) {
+        if (!VirtualUnlock(addr, size)) {
+            fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n",
+                    llama_format_win_err(GetLastError()).c_str());
+        }
+    }
+#else
+    static constexpr bool SUPPORTED = false;
+
+    // grow_to() needs a granularity even when nothing can be locked; any power
+    // of two keeps its rounding well-defined.
+    size_t lock_granularity() {
+        return (size_t) 65536;
+    }
+
+    // Always fails, so grow_to() sets failed_already and warns only once.
+    bool raw_lock(const void * addr, size_t size) {
+        (void) addr;
+        (void) size;
+        fprintf(stderr, "warning: mlock not supported on this system\n");
+        return false;
+    }
+
+    void raw_unlock(const void * addr, size_t size) {
+        (void) addr;
+        (void) size;
+    }
+#endif
+};
+
+// Replacement for std::vector<uint8_t> that doesn't require zero-initialization.
+// Owns a heap array released in the destructor.
+// NOTE(review): copying is not deleted; a copy would delete[] the same array
+// twice - confirm no caller copies this struct.
+struct llama_buffer {
+    uint8_t * addr = NULL;
+    size_t size = 0;
+
+    // Discards the current contents and allocates `size` uninitialized bytes.
+    void resize(size_t size) {
+        delete[] addr;
+        // Reset before allocating: if `new` throws, the destructor must not
+        // free the old array a second time.
+        addr = NULL;
+        this->size = 0;
+        addr = new uint8_t[size];
+        this->size = size;
+    }
+
+    ~llama_buffer() {
+        delete[] addr;
+    }
+};
+#endif
--- a/media/llama-leader.jpeg
+++ b/media/llama-leader.jpeg
--- a/media/llama0-banner.png
+++ b/media/llama0-banner.png
--- a/media/llama0-logo.png
+++ b/media/llama0-logo.png
--- a/media/llama1-banner.png
+++ b/media/llama1-banner.png
--- a/media/llama1-logo.png
+++ b/media/llama1-logo.png
--- a/migrate-ggml-2023-03-30-pr613.py
+++ b/migrate-ggml-2023-03-30-pr613.py
@@ -0,0 +1,311 @@
+# Migrate ggml file(s) with ggmf magic to ggml file with ggjt magic
+#
+# We caused a breaking change to the file format on 2023-03-30 in:
+#     https://github.com/ggerganov/llama.cpp/pull/613
+#
+# (1) If you still have the Meta LLaMA .pth files, then close this
+#     file now; you can just run `convert-pth-to-ggml.py` again to
+#     migrate to the new format. The tool is easier to use too. It
+#     isn't necessary anymore to manage split output files because
+#     the new format always combines things into a single file.
+#
+# (2) If you deleted the Meta LLaMA .pth files to save on disk
+#     space, then this tool is intended to help you.  Please check
+#     out the instructions below.
+#
+# USAGE
+#
+#     python migrate-ggml-2023-03-30-pr613.py INPUT OUTPUT
+#
+# PREREQUISITES
+#
+#     pip install numpy
+#     cd llama.cpp
+#     make -j4
+#
+# EXAMPLE (7B MODEL)
+#
+#     # you can replace all the 'f16' with 'q4_0' if you're using quantized weights
+#     python migrate-ggml-2023-03-30-pr613.py models/7B/ggml-model-f16.bin models/7B/ggml-model-f16-ggjt.bin
+#
+#     # check that it works
+#     ./main -m models/7B/ggml-model-f16-ggjt.bin -p 'Question: Do you love me?'
+#
+#     # you can delete the old files
+#     rm -f models/7B/ggml-model-f16.bin
+#     mv models/7B/ggml-model-f16-ggjt.bin models/7B/ggml-model-f16.bin
+#
+# EXAMPLE (13B MODEL)
+#
+#     # you can replace all the 'f16' with 'q4_0' if you're using quantized weights
+#     python migrate-ggml-2023-03-30-pr613.py models/13B/ggml-model-f16.bin models/13B/ggml-model-f16-ggjt.bin
+#
+#     # check that it works
+#     ./main -m models/13B/ggml-model-f16-ggjt.bin -p 'Question: Do you love me?'
+#
+#     # you can delete the old files
+#     rm -f models/13B/ggml-model-f16.bin*
+#     mv models/13B/ggml-model-f16-ggjt.bin models/13B/ggml-model-f16.bin
+#
+
+import argparse
+import os
+import sys
+import json
+import struct
+import numpy as np
+
+QK = 32
+
+GGML_TYPE_Q4_0  = 0
+GGML_TYPE_Q4_1  = 1
+GGML_TYPE_I8    = 2
+GGML_TYPE_I16   = 3
+GGML_TYPE_I32   = 4
+GGML_TYPE_F16   = 5
+GGML_TYPE_F32   = 6
+
+WTYPE_NAMES = {
+    0: "F32",
+    1: "F16",
+    2: "Q4_0",
+    3: "Q4_1",
+}
+
+WTYPES = {
+    0: GGML_TYPE_F32,
+    1: GGML_TYPE_F16,
+    2: GGML_TYPE_Q4_0,
+    3: GGML_TYPE_Q4_1,
+}
+
+GGML_BLCK_SIZE = {
+    GGML_TYPE_Q4_0:  QK,
+    GGML_TYPE_Q4_1:  QK,
+    GGML_TYPE_I8:    1,
+    GGML_TYPE_I16:   1,
+    GGML_TYPE_I32:   1,
+    GGML_TYPE_F16:   1,
+    GGML_TYPE_F32:   1,
+}
+
+GGML_TYPE_SIZE = {
+    GGML_TYPE_Q4_0: 4   + QK//2,
+    GGML_TYPE_Q4_1: 4*2 + QK//2,
+    GGML_TYPE_I8:   1,
+    GGML_TYPE_I16:  2,
+    GGML_TYPE_I32:  4,
+    GGML_TYPE_F16:  2,
+    GGML_TYPE_F32:  4,
+}
+
+HPARAMS = [
+    'magic',    # int32
+    'version',  # int32
+    'n_vocab',  # int32
+    'n_embd',   # int32
+    'n_mult',   # int32
+    'n_head',   # int32
+    'n_layer',  # int32
+    'n_rot',    # int32
+    'f16',      # int32
+]
+
+def read_hparams(fin):
+    """Read the fixed-size header from `fin` and return it as a dict keyed by HPARAMS."""
+    layout = struct.Struct("i" * len(HPARAMS))
+    values = layout.unpack(fin.read(layout.size))
+    return dict(zip(HPARAMS, values))
+
+def write_hparams(fout, hparams):
+    """Write the fields named in HPARAMS, in that order, to `fout` as packed "i" ints."""
+    struct_fmt = "i" * len(HPARAMS)
+    ints = [hparams[h] for h in HPARAMS]
+    fout.write(struct.pack(struct_fmt, *ints))
+
+def read_tokens(fin, hparams):
+    """Read hparams['n_vocab'] vocabulary records from `fin`.
+
+    Each record is a length ("i"), that many raw bytes, then a score ("f").
+    Returns a list of (word_bytes, score) tuples in file order.
+    """
+    vocab = []
+    for _ in range(hparams['n_vocab']):
+        (word_len,) = struct.unpack("i", fin.read(4))
+        word = fin.read(word_len)
+        (score,) = struct.unpack("f", fin.read(4))
+        vocab.append((word, score))
+    return vocab
+
+def write_tokens(fout, tokens):
+    """Write each (word_bytes, score) pair as length ("i"), raw bytes, score ("f")."""
+    for token_bytes, token_score in tokens:
+        length_field = struct.pack("i", len(token_bytes))
+        fout.write(length_field)
+        fout.write(token_bytes)
+        score_field = struct.pack("f", token_score)
+        fout.write(score_field)
+
+def ggml_nelements(shape):
+    """Return the product of all dimensions in `shape` (1 for an empty shape)."""
+    count = 1
+    for dim in shape:
+        count = count * dim
+    return count
+
+def ggml_nbytes(shape, ftype):
+    """Return the byte size of a tensor of `shape` stored with file type `ftype`.
+
+    `ftype` is a key of WTYPES; the element count is scaled by the type size
+    and divided by the block size of the corresponding ggml type.
+    """
+    n_elements = ggml_nelements(shape)
+    ggml_type = WTYPES[ftype]
+    return n_elements * GGML_TYPE_SIZE[ggml_type] // GGML_BLCK_SIZE[ggml_type]
+
+def copy_tensors(fin, fout, part_id, n_parts):
+    """Copy every tensor record from one input part into the unified output.
+
+    Reads records from `fin` until end of file. Each record is three ints
+    (n_dims, name length, ftype), n_dims dimension ints, the name bytes and
+    the tensor data. For each record this writes a header to `fout` carrying
+    the full shape (the part shape scaled by `n_parts` along the split
+    dimension), pads with zero bytes to a multiple of QK, and writes this
+    part's data at its position inside the full tensor. `fout` is left
+    positioned just past the full tensor so the next header follows it.
+
+    `part_id` is the zero-based index of this part out of `n_parts`.
+    """
+    while True:
+
+        # an empty read at a record boundary means end of file
+        b = fin.read(4)
+        if not b: break
+        (n_dims,) = struct.unpack("i", b)
+        b = fin.read(4)
+        (length,) = struct.unpack("i", b)
+        b = fin.read(4)
+        (ftype,) = struct.unpack("i", b)
+
+        assert n_dims in (1, 2)
+
+        # dimensions are stored in the file in the reverse of the order used here
+        partshape = list(range(n_dims))
+        for i in range(n_dims):
+            b = fin.read(4)
+            partshape[i] = struct.unpack("i", b)[0]
+        partshape = list(reversed(partshape))
+
+        name = fin.read(length)
+        data = fin.read(ggml_nbytes(partshape, ftype))
+
+        blck_size = GGML_BLCK_SIZE[WTYPES[ftype]]
+        type_size = GGML_TYPE_SIZE[WTYPES[ftype]]
+
+        print(f"Processing tensor {name} with shape: {partshape} and type: {WTYPE_NAMES[ftype]}")
+
+        # determine dimension along which multipart tensor is sharded
+        #
+        # split_dim 0 regex:
+        #   - output.*
+        #   - layers.*.attention.wq.weight
+        #   - layers.*.attention.wk.weight
+        #   - layers.*.attention.wv.weight
+        #   - layers.*.feed_forward.w1.weight
+        #   - layers.*.feed_forward.w3.weight
+        #
+        # split_dim 1 regex:
+        #   - tok_embeddings.*
+        #   - layers.*.attention.wo.weight
+        #   - layers.*.feed_forward.w2.weight
+        #
+        # (split_dim is only assigned, and only used, for 2-dimensional tensors)
+        if n_dims > 1:
+            split_dim = 1
+            if b"tok_embeddings" in name:
+                split_dim = 1
+            elif b"layers" in name:
+                if b"attention.wo.weight" in name:
+                    split_dim = 1
+                elif b"feed_forward.w2.weight" in name:
+                    split_dim = 1
+                else:
+                    split_dim = 0
+            elif b"output" in name:
+                split_dim = 0
+
+        # output tensor header
+        fullshape = list(partshape)
+        if n_dims > 1:
+            fullshape[split_dim] *= n_parts
+        fout.write(struct.pack("iii", n_dims, len(name), ftype))
+        for dim in reversed(fullshape):
+            fout.write(struct.pack("i", dim))
+        fout.write(name)
+
+        # ensure tensor data is aligned
+        tensor_data_offset = fout.tell()
+        while tensor_data_offset % QK != 0:
+            fout.write(struct.pack("B", 0))
+            tensor_data_offset += 1
+
+        # output unified mappable tensor data
+        if n_dims == 1 or n_parts == 1:
+            # copy tensor which we thankfully received in one piece
+            # (only part 0 writes it; later parts skip the data)
+            if part_id == 0:
+                fout.write(data)
+        elif split_dim == 0:
+            # reassemble multifile tensor containing some of the rows
+            rows_per_chunk = partshape[0]
+            current_row = part_id * rows_per_chunk
+            bytes_per_row = fullshape[1] // blck_size * type_size
+            offset = current_row * bytes_per_row
+            fout.seek(tensor_data_offset + offset)
+            fout.write(data)
+        elif split_dim == 1:
+            # reassemble multifile tensor containing some of the cols
+            cols_per_chunk = partshape[1]
+            current_col = part_id * cols_per_chunk
+            # bpr: bytes per row of this part; bytes_per_row: bytes per row of the full tensor
+            bpr = partshape[1] // blck_size * type_size
+            bytes_per_row = fullshape[1] // blck_size * type_size
+            offset_current_col = current_col // blck_size * type_size
+            for row in range(partshape[0]):
+                offset_row = row * bytes_per_row
+                offset = offset_row + offset_current_col
+                fout.seek(tensor_data_offset + offset)
+                fout.write(data[row * bpr:row * bpr + bpr])
+
+        # advance file position to next tensor
+        fout.seek(tensor_data_offset + ggml_nbytes(fullshape, ftype))
+
+def parse_args():
+    """Parse the two positional command-line arguments (input and output paths)."""
+    cli = argparse.ArgumentParser(description='Migrate from GGML to new GGJT file format')
+    for arg_name, arg_help in (
+        ('fin_path', 'your old ggml file (leave out the .1 .2 etc.)'),
+        ('fout_path', 'your new ggjt file name'),
+    ):
+        cli.add_argument(arg_name, help=arg_help)
+    return cli.parse_args()
+
+def main():
+    """Convert a 'ggmf' file (and any numbered sibling parts) into one 'ggjt' file.
+
+    Exits with status 1 if the input already has the 'ggjt' magic or does not
+    have the 'ggmf' magic. Sibling parts are discovered by probing for
+    "<input>.1", "<input>.2", ... until a name does not exist.
+    """
+    args = parse_args()
+    assert args.fin_path
+    assert args.fout_path
+    assert args.fin_path != args.fout_path
+
+    with open(args.fin_path, "rb") as fin:
+        hparams = read_hparams(fin)
+
+        # Check the magic before parsing the vocabulary: with an unexpected
+        # format the header fields are not trustworthy, and read_tokens would
+        # fail on bogus lengths before the messages below could be shown.
+        if hparams['magic'] == 0x67676a74:  # ggjt
+            print(f"{args.fin_path}: input ggml has already been converted to 'ggjt' magic\n")
+            sys.exit(1)
+
+        if hparams['magic'] != 0x67676d66:  # ggmf
+            print(f"{args.fin_path}: input ggml file doesn't have expected 'ggmf' magic: {hparams['magic']:#x}\n")
+            sys.exit(1)
+
+        tokens = read_tokens(fin, hparams)
+
+    hparams['magic'] = 0x67676a74  # ggjt
+
+    # count number of multipart files by convention
+    n_parts = 1
+    while os.path.exists(f"{args.fin_path}.{n_parts}"):
+        n_parts += 1
+
+    # we output a single file for ggml
+    with open(args.fout_path, "wb") as fout:
+        write_hparams(fout, hparams)
+        write_tokens(fout, tokens)
+        offset_of_tensors = fout.tell()
+        # the tensors we load could be split across multiple files
+        for part_id in range(n_parts):
+            # every part rewrites the same tensor region, so rewind to its start
+            fout.seek(offset_of_tensors)
+            print(f"Processing part {part_id+1} of {n_parts}\n")
+            fin_path = args.fin_path
+            if part_id > 0:
+                fin_path += f".{part_id}"
+            with open(fin_path, "rb") as fin:
+                # skip this part's own header and vocabulary
+                read_tokens(fin, read_hparams(fin))
+                copy_tensors(fin, fout, part_id, n_parts)
+
+    print(f"Done. Output file: {args.fout_path}\n")
+
+if __name__ == "__main__":
+    main()
--- a/models/ggml-vocab.bin
+++ b/models/ggml-vocab.bin
--- a/prompts/chat-with-bob.txt
+++ b/prompts/chat-with-bob.txt
@@ -4,4 +4,4 @@ User: Hello, Bob.
 Bob: Hello. How may I help you today?
 User: Please tell me the largest city in Europe.
 Bob: Sure. The largest city in Europe is Moscow, the capital of Russia.
-User:
+User:
--- a/prompts/reason-act.txt
+++ b/prompts/reason-act.txt
@@ -0,0 +1,18 @@
+You run in a loop of Thought, Action, Observation.
+At the end of the loop either Answer or restate your Thought and Action.
+Use Thought to describe your thoughts about the question you have been asked.
+Use Action to run one of these actions available to you:
+- calculate[python math expression]
+Observation will be the result of running those actions
+
+
+Question: What is 4 * 7 / 3?
+Thought: Do I need to use an action? Yes, I use calculate to do math
+Action: calculate[4 * 7 / 3]
+Observation: 9.3333333333
+Thought: Do I need to use an action? No, have the result
+Answer: The calculate tool says it is 9.3333333333
+Question: What is capital of france?
+Thought: Do I need to use an action? No, I know the answer
+Answer: Paris is the capital of France
+Question:
--- a/quantize.py
+++ b/quantize.py
@@ -1,131 +0,0 @@
-#!/usr/bin/env python3
-
-"""Script to execute the "quantize" script on a given set of models."""
-
-import subprocess
-import argparse
-import glob
-import sys
-import os
-
-
-def main():
-    """Update the quantize binary name depending on the platform and parse
-    the command line arguments and execute the script.
-    """
-
-    if "linux" in sys.platform or "darwin" in sys.platform:
-        quantize_script_binary = "quantize"
-
-    elif "win32" in sys.platform or "cygwin" in sys.platform:
-        quantize_script_binary = "quantize.exe"
-
-    else:
-        print("WARNING: Unknown platform. Assuming a UNIX-like OS.\n")
-        quantize_script_binary = "quantize"
-
-    parser = argparse.ArgumentParser(
-        prog='python3 quantize.py',
-        description='This script quantizes the given models by applying the '
-        f'"{quantize_script_binary}" script on them.'
-    )
-    parser.add_argument(
-        'models', nargs='+', choices=('7B', '13B', '30B', '65B'),
-        help='The models to quantize.'
-    )
-    parser.add_argument(
-        '-r', '--remove-16', action='store_true', dest='remove_f16',
-        help='Remove the f16 model after quantizing it.'
-    )
-    parser.add_argument(
-        '-m', '--models-path', dest='models_path',
-        default=os.path.join(os.getcwd(), "models"),
-        help='Specify the directory where the models are located.'
-    )
-    parser.add_argument(
-        '-q', '--quantize-script-path', dest='quantize_script_path',
-        default=os.path.join(os.getcwd(), quantize_script_binary),
-        help='Specify the path to the "quantize" script.'
-    )
-
-    # TODO: Revise this code
-    # parser.add_argument(
-    #     '-t', '--threads', dest='threads', type='int',
-    #     default=os.cpu_count(),
-    #     help='Specify the number of threads to use to quantize many models at '
-    #     'once. Defaults to os.cpu_count().'
-    # )
-
-    args = parser.parse_args()
-    args.models_path = os.path.abspath(args.models_path)
-
-    if not os.path.isfile(args.quantize_script_path):
-        print(
-            f'The "{quantize_script_binary}" script was not found in the '
-            "current location.\nIf you want to use it from another location, "
-            "set the --quantize-script-path argument from the command line."
-        )
-        sys.exit(1)
-
-    for model in args.models:
-        # The model is separated in various parts
-        # (ggml-model-f16.bin, ggml-model-f16.bin.0, ggml-model-f16.bin.1...)
-        f16_model_path_base = os.path.join(
-            args.models_path, model, "ggml-model-f16.bin"
-        )
-
-        if not os.path.isfile(f16_model_path_base):
-            print(f'The file %s was not found' % f16_model_path_base)
-            sys.exit(1)
-
-        f16_model_parts_paths = map(
-            lambda filename: os.path.join(f16_model_path_base, filename),
-            glob.glob(f"{f16_model_path_base}*")
-        )
-
-        for f16_model_part_path in f16_model_parts_paths:
-            if not os.path.isfile(f16_model_part_path):
-                print(
-                    f"The f16 model {os.path.basename(f16_model_part_path)} "
-                    f"was not found in {args.models_path}{os.path.sep}{model}"
-                    ". If you want to use it from another location, set the "
-                    "--models-path argument from the command line."
-                )
-                sys.exit(1)
-
-            __run_quantize_script(
-                args.quantize_script_path, f16_model_part_path
-            )
-
-            if args.remove_f16:
-                os.remove(f16_model_part_path)
-
-
-# This was extracted to a top-level function for parallelization, if
-# implemented. See https://github.com/ggerganov/llama.cpp/pull/222/commits/f8db3d6cd91bf1a1342db9d29e3092bc12dd783c#r1140496406
-
-def __run_quantize_script(script_path, f16_model_part_path):
-    """Run the quantize script specifying the path to it and the path to the
-    f16 model to quantize.
-    """
-
-    new_quantized_model_path = f16_model_part_path.replace("f16", "q4_0")
-    subprocess.run(
-        [script_path, f16_model_part_path, new_quantized_model_path, "2"],
-        check=True
-    )
-
-
-if __name__ == "__main__":
-    try:
-        main()
-
-    except subprocess.CalledProcessError:
-        print("\nAn error ocurred while trying to quantize the models.")
-        sys.exit(1)
-
-    except KeyboardInterrupt:
-        sys.exit(0)
-
-    else:
-        print("\nSuccesfully quantized all models.")
--- a/spm-headers/llama.h
+++ b/spm-headers/llama.h
@@ -0,0 +1 @@
+../llama.h
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -5,5 +5,6 @@ function(llama_add_test source)
    add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${ARGN})
 endfunction()

+# llama_add_test(test-double-float.c) # SLOW
 llama_add_test(test-quantize.c)
 llama_add_test(test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin)
--- a/tests/test-double-float.c
+++ b/tests/test-double-float.c
@@ -0,0 +1,53 @@
+// These tests may take a long time!
+// They are to prove that conversion from double to float of various functions in ggml.c doesn't affect the result.
+// This is done by checking all finite (non-NaN, non-infinite) floats.
+
+#undef NDEBUG
+#include <assert.h>
+#include <immintrin.h>
+#include <math.h>
+#include <stdint.h>
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdouble-promotion"
+
+// ggml.c::quantize_row_q4_0_reference
+// Reference form: round() takes a double, so v0 is promoted before rounding.
+inline static uint8_t round_orig(float v0) { return ((int8_t) (round(v0))) + 8; }
+
+// ggml.c::ggml_silu_f32
+// Reference form: the 1.0 literal and exp() are double, so the expression is
+// evaluated in double and converted to float only on return.
+inline static float silu_orig(float x) {
+    return x/(1.0 + exp(-x));
+}
+
+#pragma GCC diagnostic pop
+
+// ggml.c::quantize_row_q4_0_reference
+// Float-only form: roundf() keeps the rounding in single precision.
+inline static uint8_t round_float(float v0) { return (int8_t)roundf(v0) + 8; }
+
+// ggml.c::ggml_silu_f32
+// Float-only form: the 1.0f literal and expf() keep the expression in single precision.
+inline static float silu_float(float x) {
+    return x/(1.0f + expf(-x));
+}
+
+// Compares the double-based and float-only helpers over their whole input
+// domain; any mismatch trips an assert (NDEBUG is undefined at the top of the file).
+int main(void) {
+    // Walk every 32-bit pattern from UINT32_MAX down to 0; non-finite floats are skipped.
+    uint32_t x = UINT32_MAX;
+    do {
+        // Reinterpret the bits through a union: dereferencing a cast pointer
+        // (*(float *)&x) would violate strict aliasing.
+        const union { uint32_t u; float f; } bits = { .u = x };
+        const float f = bits.f;
+        assert(!isfinite(f) || (round_orig(f) == round_float(f)));
+    } while (x--);
+
+#ifdef __F16C__
+    // GELU and SILU implementations are used with a FP16 lookup table.
+    // The original and float-only results are not equal for all inputs after converting to FP16.
+    // GELU is an approximation anyway (tanh), not tested here.
+    // For SILU, verify that the results are at least the closest floating point numbers, if the FP16 values don't match.
+    for (x = 0; x <= UINT16_MAX; x++) {
+        float f = _cvtsh_ss(x);
+        const float so = silu_orig(f);
+        const float sf = silu_float(f);
+        assert(   (_cvtss_sh(so, 0) == _cvtss_sh(sf, 0))
+               || (nextafterf(so, sf) == sf)
+               || (nextafterf(sf, so) == so));
+    }
+#endif
+}
--- a/tests/test-tokenizer-0.cpp
+++ b/tests/test-tokenizer-0.cpp
@@ -77,5 +77,7 @@ int main(int argc, char **argv) {
        }
    }

+    llama_free(ctx);
+
    return 0;
 }