cmake : set include path for OpenBlas (#1830 )

swift : Package compile breaks due to ggml-metal.metal (#1831 )
* Ignore metal file in spm * Add ggml.h to spm public Headers --------- Co-authored-by: Vogel Frederik <vogel.frederik@linecorp.com>
2026-02-26 14:23:22 +02:00 · 2023-06-15 20:51:26 +03:00 · 2023-06-15 20:47:04 +03:00 · 2023-06-15 20:42:48 +03:00 · 2023-06-15 20:36:38 +03:00 · 2023-06-15 20:36:06 +03:00
64 changed files with 58216 additions and 2031 deletions
--- a/.clang-tidy
+++ b/.clang-tidy
@@ -0,0 +1,18 @@
+---
+Checks: >
+    bugprone-*,
+    -bugprone-easily-swappable-parameters,
+    -bugprone-implicit-widening-of-multiplication-result,
+    -bugprone-narrowing-conversions,
+    readability-*,
+    -readability-avoid-unconditional-preprocessor-if,
+    -readability-function-cognitive-complexity,
+    -readability-identifier-length,
+    -readability-implicit-bool-conversion,
+    -readability-magic-numbers,
+    -readability-uppercase-literal-suffix,
+    clang-analyzer-*,
+    -clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,
+    performance-*,
+    portability-*,
+FormatStyle: none
--- a/.devops/full.Dockerfile
+++ b/.devops/full.Dockerfile
@@ -3,7 +3,7 @@ ARG UBUNTU_VERSION=22.04
 FROM ubuntu:$UBUNTU_VERSION as build

 RUN apt-get update && \
-    apt-get install -y build-essential python3 python3-pip
+    apt-get install -y build-essential python3 python3-pip git

 COPY requirements.txt requirements.txt

@@ -16,4 +16,6 @@ COPY . .

 RUN make

+ENV LC_ALL=C.utf8
+
 ENTRYPOINT ["/app/.devops/tools.sh"]
--- a/.devops/main.Dockerfile
+++ b/.devops/main.Dockerfile
@@ -3,7 +3,7 @@ ARG UBUNTU_VERSION=22.04
 FROM ubuntu:$UBUNTU_VERSION as build

 RUN apt-get update && \
-    apt-get install -y build-essential
+    apt-get install -y build-essential git

 WORKDIR /app

@@ -15,4 +15,6 @@ FROM ubuntu:$UBUNTU_VERSION as runtime

 COPY --from=build /app/main /main

+ENV LC_ALL=C.utf8
+
 ENTRYPOINT [ "/main" ]
--- a/.devops/tools.sh
+++ b/.devops/tools.sh
@@ -11,7 +11,7 @@ shift
 arg2="$@"

 if [[ $arg1 == '--convert' || $arg1 == '-c' ]]; then
-    python3 ./convert-pth-to-ggml.py $arg2
+    python3 ./convert.py $arg2
 elif [[ $arg1 == '--quantize' || $arg1 == '-q' ]]; then
    ./quantize $arg2
 elif [[ $arg1 == '--run' || $arg1 == '-r' ]]; then
@@ -32,7 +32,7 @@ else
    echo "  --run (-r): Run a model previously converted into ggml"
    echo "              ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -n 512"
    echo "  --convert (-c): Convert a llama model into ggml"
-    echo "              ex: \"/models/7B/\" 1"
+    echo "              ex: --outtype f16 \"/models/7B/\" "
    echo "  --quantize (-q): Optimize with quantization process ggml"
    echo "              ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2"
    echo "  --all-in-one (-a): Execute --convert & --quantize"
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -10,10 +10,10 @@ on:
  push:
    branches:
      - master
-    paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.c', '**/*.cpp']
+    paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu']
  pull_request:
    types: [opened, synchronize, reopened]
-    paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.c', '**/*.cpp']
+    paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu']

 env:
 BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
@@ -151,21 +151,21 @@ jobs:
    env:
      OPENBLAS_VERSION: 0.3.23
      OPENCL_VERSION: 2023.04.17
-      CLBLAST_VERSION: 1.5.3
+      CLBLAST_VERSION: 1.6.0

    strategy:
      matrix:
        include:
          - build: 'avx2'
-            defines: ''
+            defines: '-DLLAMA_BUILD_SERVER=ON'
          - build: 'avx'
-            defines: '-DLLAMA_AVX2=OFF'
+            defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF'
          - build: 'avx512'
-            defines: '-DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
+            defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
          - build: 'clblast'
-            defines: '-DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
+            defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
          - build: 'openblas'
-            defines: '-DLLAMA_OPENBLAS=ON -DBLAS_LIBRARIES="/LIBPATH:$env:RUNNER_TEMP/openblas/lib" -DOPENBLAS_INC="$env:RUNNER_TEMP/openblas/include"'
+            defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'

    steps:
      - name: Clone
@@ -184,13 +184,13 @@ jobs:
        id: get_clblast
        if: ${{ matrix.build == 'clblast' }}
        run: |
-          curl.exe -o $env:RUNNER_TEMP/clblast.zip -L "https://github.com/CNugteren/CLBlast/releases/download/${env:CLBLAST_VERSION}/CLBlast-${env:CLBLAST_VERSION}-Windows-x64.zip"
+          curl.exe -o $env:RUNNER_TEMP/clblast.7z -L "https://github.com/CNugteren/CLBlast/releases/download/${env:CLBLAST_VERSION}/CLBlast-${env:CLBLAST_VERSION}-windows-x64.7z"
          curl.exe -o $env:RUNNER_TEMP/CLBlast.LICENSE.txt -L "https://github.com/CNugteren/CLBlast/raw/${env:CLBLAST_VERSION}/LICENSE"
-          mkdir $env:RUNNER_TEMP/clblast
-          tar.exe -xvf $env:RUNNER_TEMP/clblast.zip -C $env:RUNNER_TEMP/clblast
+          7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/clblast.7z
+          rename-item $env:RUNNER_TEMP/CLBlast-${env:CLBLAST_VERSION}-windows-x64 clblast
          foreach ($f in (gci -Recurse -Path "$env:RUNNER_TEMP/clblast" -Filter '*.cmake')) {
            $txt = Get-Content -Path $f -Raw
-            $txt.Replace('C:/dependencies/opencl/', "$($env:RUNNER_TEMP.Replace('\','/'))/opencl/") | Set-Content -Path $f -Encoding UTF8
+            $txt.Replace('C:/vcpkg/packages/opencl_x64-windows/', "$($env:RUNNER_TEMP.Replace('\','/'))/opencl/") | Set-Content -Path $f -Encoding UTF8
          }

      - name: Download OpenBLAS
@@ -213,7 +213,6 @@ jobs:
          cd build
          cmake .. ${{ matrix.defines }}
          cmake --build . --config Release
-          cp ../LICENSE ./bin/Release/llama.cpp.txt

      - name: Add clblast.dll
        id: add_clblast_dll
@@ -258,6 +257,7 @@ jobs:
        id: pack_artifacts
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
        run: |
+          Copy-Item LICENSE .\build\bin\Release\llama.cpp.txt
          7z a llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip .\build\bin\Release\*

      - name: Upload artifacts
@@ -292,7 +292,7 @@ jobs:
        run: |
          mkdir build
          cd build
-          cmake .. -DLLAMA_CUBLAS=ON
+          cmake .. -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON
          cmake --build . --config Release

      - name: Get commit hash
--- a/.github/workflows/tidy-post.yml
+++ b/.github/workflows/tidy-post.yml
@@ -0,0 +1,20 @@
+name: clang-tidy review post comments
+
+on:
+  workflow_dispatch:
+    workflows: ["clang-tidy-review"]
+    types:
+      - completed
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: ZedThree/clang-tidy-review/post@v0.13.0
+        # lgtm_comment_body, max_comments, and annotations need to be set on the posting workflow in a split setup
+        with:
+          # adjust options as necessary
+          lgtm_comment_body: ''
+          annotations: false
+          max_comments: 25
--- a/.github/workflows/tidy-review.yml
+++ b/.github/workflows/tidy-review.yml
@@ -0,0 +1,23 @@
+name: clang-tidy-review
+
+on:
+  pull_request:
+    branches:
+      - master
+
+jobs:
+  clang-tidy-review:
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v3
+
+    - uses: ZedThree/clang-tidy-review@v0.13.0
+      id: review
+      with:
+        lgtm_comment_body: ''
+        build_dir: build
+        cmake_command: cmake . -B build -DCMAKE_EXPORT_COMPILE_COMMANDS=on
+        split_workflow: true
+
+    - uses: ZedThree/clang-tidy-review/upload@v0.13.0
--- a/.gitignore
+++ b/.gitignore
@@ -7,6 +7,7 @@
 .envrc
 .swiftpm
 .venv
+.clang-tidy
 .vs/
 .vscode/

@@ -16,6 +17,8 @@ build-debug/
 build-release/
 build-static/
 build-cublas/
+build-opencl/
+build-metal/
 build-no-accel/
 build-sanitize-addr/
 build-sanitize-thread/
@@ -29,9 +32,11 @@ models/*
 /result
 /perplexity
 /embedding
+/train-text-from-scratch
 /benchmark-matmult
 /vdot
 /Pipfile
+/libllama.so

 build-info.h
 arm_neon.h
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -37,40 +37,46 @@ endif()
 #

 # general
-option(LLAMA_STATIC                 "llama: static link libraries"                          OFF)
-option(LLAMA_NATIVE                 "llama: enable -march=native flag"                      OFF)
-option(LLAMA_LTO                    "llama: enable link time optimization"                  OFF)
+option(LLAMA_STATIC                     "llama: static link libraries"                          OFF)
+option(LLAMA_NATIVE                     "llama: enable -march=native flag"                      OFF)
+option(LLAMA_LTO                        "llama: enable link time optimization"                  OFF)

 # debug
-option(LLAMA_ALL_WARNINGS           "llama: enable all compiler warnings"                   ON)
-option(LLAMA_ALL_WARNINGS_3RD_PARTY "llama: enable all compiler warnings in 3rd party libs" OFF)
-option(LLAMA_GPROF                  "llama: enable gprof"                                   OFF)
+option(LLAMA_ALL_WARNINGS               "llama: enable all compiler warnings"                   ON)
+option(LLAMA_ALL_WARNINGS_3RD_PARTY     "llama: enable all compiler warnings in 3rd party libs" OFF)
+option(LLAMA_GPROF                      "llama: enable gprof"                                   OFF)

 # sanitizers
-option(LLAMA_SANITIZE_THREAD        "llama: enable thread sanitizer"                        OFF)
-option(LLAMA_SANITIZE_ADDRESS       "llama: enable address sanitizer"                       OFF)
-option(LLAMA_SANITIZE_UNDEFINED     "llama: enable undefined sanitizer"                     OFF)
+option(LLAMA_SANITIZE_THREAD            "llama: enable thread sanitizer"                        OFF)
+option(LLAMA_SANITIZE_ADDRESS           "llama: enable address sanitizer"                       OFF)
+option(LLAMA_SANITIZE_UNDEFINED         "llama: enable undefined sanitizer"                     OFF)

 # instruction set specific
-option(LLAMA_AVX                    "llama: enable AVX"                                     ON)
-option(LLAMA_AVX2                   "llama: enable AVX2"                                    ON)
-option(LLAMA_AVX512                 "llama: enable AVX512"                                  OFF)
-option(LLAMA_AVX512_VBMI            "llama: enable AVX512-VBMI"                             OFF)
-option(LLAMA_AVX512_VNNI            "llama: enable AVX512-VNNI"                             OFF)
-option(LLAMA_FMA                    "llama: enable FMA"                                     ON)
+option(LLAMA_AVX                        "llama: enable AVX"                                     ON)
+option(LLAMA_AVX2                       "llama: enable AVX2"                                    ON)
+option(LLAMA_AVX512                     "llama: enable AVX512"                                  OFF)
+option(LLAMA_AVX512_VBMI                "llama: enable AVX512-VBMI"                             OFF)
+option(LLAMA_AVX512_VNNI                "llama: enable AVX512-VNNI"                             OFF)
+option(LLAMA_FMA                        "llama: enable FMA"                                     ON)
 # in MSVC F16C is implied with AVX2/AVX512
 if (NOT MSVC)
-    option(LLAMA_F16C               "llama: enable F16C"                                    ON)
+    option(LLAMA_F16C                   "llama: enable F16C"                                    ON)
 endif()

 # 3rd party libs
-option(LLAMA_ACCELERATE             "llama: enable Accelerate framework"                    ON)
-option(LLAMA_OPENBLAS               "llama: use OpenBLAS"                                   OFF)
-option(LLAMA_CUBLAS                 "llama: use cuBLAS"                                     OFF)
-option(LLAMA_CLBLAST                "llama: use CLBlast"                                    OFF)
+option(LLAMA_ACCELERATE                      "llama: enable Accelerate framework"               ON)
+option(LLAMA_BLAS                            "llama: use BLAS"                                  OFF)
+set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
+option(LLAMA_CUBLAS                          "llama: use cuBLAS"                                OFF)
+set(LLAMA_CUDA_DMMV_X      "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
+set(LLAMA_CUDA_DMMV_Y       "1" CACHE STRING "llama: y block size for dmmv CUDA kernels")
+option(LLAMA_CLBLAST                         "llama: use CLBlast"                               OFF)
+option(LLAMA_METAL                           "llama: use Metal"                                 OFF)
+option(LLAMA_K_QUANTS                        "llama: use k-quants"                              ON)

-option(LLAMA_BUILD_TESTS            "llama: build tests"    ${LLAMA_STANDALONE})
-option(LLAMA_BUILD_EXAMPLES         "llama: build examples" ${LLAMA_STANDALONE})
+option(LLAMA_BUILD_TESTS                "llama: build tests"    ${LLAMA_STANDALONE})
+option(LLAMA_BUILD_EXAMPLES             "llama: build examples" ${LLAMA_STANDALONE})
+option(LLAMA_BUILD_SERVER               "llama: build server example"                           OFF)

 #
 # Build info header
@@ -145,36 +151,40 @@ if (APPLE AND LLAMA_ACCELERATE)
    endif()
 endif()

-if (LLAMA_OPENBLAS)
+if (LLAMA_BLAS)
    if (LLAMA_STATIC)
        set(BLA_STATIC ON)
    endif()
-
-    set(BLA_VENDOR OpenBLAS)
+    if ($(CMAKE_VERSION) VERSION_GREATER_EQUAL 3.22)
+        set(BLA_SIZEOF_INTEGER 8)
+    endif()
+    set(BLA_VENDOR ${LLAMA_BLAS_VENDOR})
    find_package(BLAS)
    if (BLAS_FOUND)
-        message(STATUS "OpenBLAS found")
+        message(STATUS "BLAS found, Libraries: ${BLAS_LIBRARIES}")

+        # BLAS_INCLUDE_DIRS is missing in FindBLAS.cmake.
+        # see https://gitlab.kitware.com/cmake/cmake/-/issues/20268
+        find_path(BLAS_INCLUDE_DIRS
+            NAMES cblas.h
+            HINTS
+                /usr/include
+                /usr/local/include
+                /usr/include/openblas
+        )
+
+        
+        message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}")
+
+        add_compile_options(${BLAS_LINKER_FLAGS})
        add_compile_definitions(GGML_USE_OPENBLAS)
-        add_link_options(${BLAS_LIBRARIES})
-        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} openblas)
+        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${BLAS_LIBRARIES})
+        set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS})

-        # find header file
-        set(OPENBLAS_INCLUDE_SEARCH_PATHS
-            /usr/include
-            /usr/include/openblas
-            /usr/include/openblas-base
-            /usr/local/include
-            /usr/local/include/openblas
-            /usr/local/include/openblas-base
-            /opt/OpenBLAS/include
-            $ENV{OpenBLAS_HOME}
-            $ENV{OpenBLAS_HOME}/include
-            )
-        find_path(OPENBLAS_INC NAMES cblas.h PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS})
-        add_compile_options(-I${OPENBLAS_INC})
    else()
-        message(WARNING "OpenBLAS not found")
+        message(WARNING "BLAS not found, please refer to "
+        "https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors"
+        " to set correct LLAMA_BLAS_VENDOR")
    endif()
 endif()

@@ -187,9 +197,11 @@ if (LLAMA_CUBLAS)

        enable_language(CUDA)

-        set(GGML_CUDA_SOURCES ggml-cuda.cu ggml-cuda.h)
+        set(GGML_SOURCES_CUDA ggml-cuda.cu ggml-cuda.h)

        add_compile_definitions(GGML_USE_CUBLAS)
+        add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
+        add_compile_definitions(GGML_CUDA_DMMV_Y=${LLAMA_CUDA_DMMV_Y})

        if (LLAMA_STATIC)
            set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
@@ -202,12 +214,42 @@ if (LLAMA_CUBLAS)
    endif()
 endif()

+if (LLAMA_METAL)
+    find_library(FOUNDATION_LIBRARY         Foundation              REQUIRED)
+    find_library(METAL_FRAMEWORK            Metal                   REQUIRED)
+    find_library(METALKIT_FRAMEWORK         MetalKit                REQUIRED)
+    find_library(METALPERFORMANCE_FRAMEWORK MetalPerformanceShaders REQUIRED)
+
+    set(GGML_SOURCES_METAL ggml-metal.m ggml-metal.h)
+
+    add_compile_definitions(GGML_USE_METAL)
+    add_compile_definitions(GGML_METAL_NDEBUG)
+
+    # get full path to the file
+    #add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/")
+
+    # copy ggml-metal.metal to bin directory
+    configure_file(ggml-metal.metal bin/ggml-metal.metal COPYONLY)
+
+    set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS}
+        ${FOUNDATION_LIBRARY}
+        ${METAL_FRAMEWORK}
+        ${METALKIT_FRAMEWORK}
+        ${METALPERFORMANCE_FRAMEWORK}
+        )
+endif()
+
+if (LLAMA_K_QUANTS)
+    set(GGML_SOURCES_EXTRA ${GGML_SOURCES_EXTRA} k_quants.c k_quants.h)
+    add_compile_definitions(GGML_USE_K_QUANTS)
+endif()
+
 if (LLAMA_CLBLAST)
    find_package(CLBlast)
    if (CLBlast_FOUND)
        message(STATUS "CLBlast found")

-        set(GGML_OPENCL_SOURCES ggml-opencl.c ggml-opencl.h)
+        set(GGML_SOURCES_OPENCL ggml-opencl.cpp ggml-opencl.h)

        add_compile_definitions(GGML_USE_CLBLAST)

@@ -372,10 +414,13 @@ endif()
 add_library(ggml OBJECT
            ggml.c
            ggml.h
-            ${GGML_CUDA_SOURCES}
-            ${GGML_OPENCL_SOURCES})
+            ${GGML_SOURCES_CUDA}
+            ${GGML_SOURCES_OPENCL}
+            ${GGML_SOURCES_METAL}
+            ${GGML_SOURCES_EXTRA}
+            )

-target_include_directories(ggml PUBLIC .)
+target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})
 target_compile_features(ggml PUBLIC c_std_11) # don't bump
 target_link_libraries(ggml PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})

@@ -386,21 +431,28 @@ endif()
 add_library(llama
            llama.cpp
            llama.h
-            llama-util.h)
+            llama-util.h
+            )

 target_include_directories(llama PUBLIC .)
 target_compile_features(llama PUBLIC cxx_std_11) # don't bump
-target_link_libraries(llama PRIVATE ggml ${LLAMA_EXTRA_LIBS})
+target_link_libraries(llama PRIVATE
+    ggml
+    ${LLAMA_EXTRA_LIBS}
+    )

 if (BUILD_SHARED_LIBS)
    set_target_properties(llama PROPERTIES POSITION_INDEPENDENT_CODE ON)
    target_compile_definitions(llama PRIVATE LLAMA_SHARED LLAMA_BUILD)
+    if (LLAMA_METAL)
+        set_target_properties(llama PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal")
+    endif()
 endif()

-if (GGML_CUDA_SOURCES)
+if (GGML_SOURCES_CUDA)
    message(STATUS "GGML CUDA sources found, configuring CUDA architecture")
-    set_property(TARGET ggml PROPERTY CUDA_ARCHITECTURES OFF)
-    set_property(TARGET ggml PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto")
+    set_property(TARGET ggml  PROPERTY CUDA_ARCHITECTURES OFF)
+    set_property(TARGET ggml  PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto")
    set_property(TARGET llama PROPERTY CUDA_ARCHITECTURES OFF)
 endif()

--- a/123
+++ b/123
@@ -1,5 +1,11 @@
 # Define the default target now so that it is always the first target
-default: main quantize quantize-stats perplexity embedding vdot
+BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch
+
+ifdef LLAMA_BUILD_SERVER
+	BUILD_TARGETS += server
+endif
+
+default: $(BUILD_TARGETS)

 ifndef UNAME_S
 UNAME_S := $(shell uname -s)
@@ -34,11 +40,18 @@ endif
 #

 # keep standard at C11 and C++11
-CFLAGS   = -I.              -O3 -std=c11   -fPIC
-CXXFLAGS = -I. -I./examples -O3 -std=c++11 -fPIC
+# -Ofast tends to produce faster code, but may not be available for some compilers.
+#OPT = -Ofast
+OPT = -O3
+CFLAGS   = -I.              $(OPT) -std=c11   -fPIC
+CXXFLAGS = -I. -I./examples $(OPT) -std=c++11 -fPIC
 LDFLAGS  =

-ifndef LLAMA_DEBUG
+ifdef LLAMA_DEBUG
+	CFLAGS   += -O0 -g
+	CXXFLAGS += -O0 -g
+	LDFLAGS  += -g
+else
 	CFLAGS   += -DNDEBUG
 	CXXFLAGS += -DNDEBUG
 endif
@@ -74,6 +87,15 @@ ifeq ($(UNAME_S),Haiku)
 	CXXFLAGS += -pthread
 endif

+ifdef LLAMA_GPROF
+	CFLAGS   += -pg
+	CXXFLAGS += -pg
+endif
+ifdef LLAMA_PERF
+	CFLAGS   += -DGGML_PERF
+	CXXFLAGS += -DGGML_PERF
+endif
+
 # Architecture specific
 # TODO: probably these flags need to be tweaked on some architectures
 #       feel free to update the Makefile for your architecture and send a pull request or issue
@@ -85,7 +107,12 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))
 	# Usage AVX-only
 	#CFLAGS   += -mfma -mf16c -mavx
 	#CXXFLAGS += -mfma -mf16c -mavx
+
+	# Usage SSSE3-only (Not is SSE3!)
+	#CFLAGS   += -mssse3
+	#CXXFLAGS += -mssse3
 endif
+
 ifneq ($(filter ppc64%,$(UNAME_M)),)
 	POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)
 	ifneq (,$(findstring POWER9,$(POWER9_M)))
@@ -97,6 +124,13 @@ ifneq ($(filter ppc64%,$(UNAME_M)),)
 		CXXFLAGS += -std=c++23 -DGGML_BIG_ENDIAN
 	endif
 endif
+
+ifndef LLAMA_NO_K_QUANTS
+	CFLAGS   += -DGGML_USE_K_QUANTS
+	CXXFLAGS += -DGGML_USE_K_QUANTS
+	OBJS     += k_quants.o
+endif
+
 ifndef LLAMA_NO_ACCELERATE
 	# Mac M1 - include Accelerate framework.
 	# `-framework Accelerate` works on Mac Intel as well, with negliable performance boost (as of the predict time).
@@ -104,15 +138,22 @@ ifndef LLAMA_NO_ACCELERATE
 		CFLAGS  += -DGGML_USE_ACCELERATE
 		LDFLAGS += -framework Accelerate
 	endif
-endif
+endif # LLAMA_NO_ACCELERATE
+
 ifdef LLAMA_OPENBLAS
-	CFLAGS  += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas
+	CFLAGS  += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas -I/usr/include/openblas
 	ifneq ($(shell grep -e "Arch Linux" -e "ID_LIKE=arch" /etc/os-release 2>/dev/null),)
 		LDFLAGS += -lopenblas -lcblas
 	else
 		LDFLAGS += -lopenblas
 	endif
-endif
+endif # LLAMA_OPENBLAS
+
+ifdef LLAMA_BLIS
+	CFLAGS  += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
+	LDFLAGS += -lblis -L/usr/local/lib
+endif # LLAMA_BLIS
+
 ifdef LLAMA_CUBLAS
 	CFLAGS    += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
 	CXXFLAGS  += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
@@ -120,11 +161,23 @@ ifdef LLAMA_CUBLAS
 	OBJS      += ggml-cuda.o
 	NVCC      = nvcc
 	NVCCFLAGS = --forward-unknown-to-host-compiler -arch=native
+ifdef LLAMA_CUDA_DMMV_X
+	NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
+else
+	NVCCFLAGS += -DGGML_CUDA_DMMV_X=32
+endif # LLAMA_CUDA_DMMV_X
+ifdef LLAMA_CUDA_DMMV_Y
+	NVCCFLAGS += -DGGML_CUDA_DMMV_Y=$(LLAMA_CUDA_DMMV_Y)
+else
+	NVCCFLAGS += -DGGML_CUDA_DMMV_Y=1
+endif # LLAMA_CUDA_DMMV_Y
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
 	$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
-endif
+endif # LLAMA_CUBLAS
+
 ifdef LLAMA_CLBLAST
-	CFLAGS  += -DGGML_USE_CLBLAST
+	CFLAGS   += -DGGML_USE_CLBLAST
+	CXXFLAGS += -DGGML_USE_CLBLAST
 	# Mac provides OpenCL as a framework
 	ifeq ($(UNAME_S),Darwin)
 		LDFLAGS += -lclblast -framework OpenCL
@@ -132,36 +185,48 @@ ifdef LLAMA_CLBLAST
 		LDFLAGS += -lclblast -lOpenCL
 	endif
 	OBJS    += ggml-opencl.o
-ggml-opencl.o: ggml-opencl.c ggml-opencl.h
+
+ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+endif # LLAMA_CLBLAST
+
+ifdef LLAMA_METAL
+	CFLAGS   += -DGGML_USE_METAL -DGGML_METAL_NDEBUG
+	CXXFLAGS += -DGGML_USE_METAL
+	LDFLAGS  += -framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
+	OBJS     += ggml-metal.o
+
+ggml-metal.o: ggml-metal.m ggml-metal.h
 	$(CC) $(CFLAGS) -c $< -o $@
-endif
-ifdef LLAMA_GPROF
-	CFLAGS   += -pg
-	CXXFLAGS += -pg
-endif
-ifdef LLAMA_PERF
-	CFLAGS   += -DGGML_PERF
-	CXXFLAGS += -DGGML_PERF
-endif
+endif # LLAMA_METAL
+
 ifneq ($(filter aarch64%,$(UNAME_M)),)
 	# Apple M1, M2, etc.
 	# Raspberry Pi 3, 4, Zero 2 (64-bit)
 	CFLAGS   += -mcpu=native
 	CXXFLAGS += -mcpu=native
 endif
+
 ifneq ($(filter armv6%,$(UNAME_M)),)
 	# Raspberry Pi 1, Zero
 	CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
 endif
+
 ifneq ($(filter armv7%,$(UNAME_M)),)
 	# Raspberry Pi 2
 	CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
 endif
+
 ifneq ($(filter armv8%,$(UNAME_M)),)
 	# Raspberry Pi 3, 4, Zero 2 (32-bit)
 	CFLAGS += -mfp16-format=ieee -mno-unaligned-access
 endif

+ifdef LLAMA_NO_K_QUANTS
+k_quants.o: k_quants.c k_quants.h
+	$(CC) $(CFLAGS) -c $< -o $@
+endif # LLAMA_NO_K_QUANTS
+
 #
 # Print build information
 #
@@ -194,33 +259,39 @@ libllama.so: llama.o ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)

 clean:
-	rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state build-info.h
+	rm -vf *.o *.so main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server vdot train-text-from-scratch build-info.h

 #
 # Examples
 #

-main: examples/main/main.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+main: examples/main/main.cpp                                  build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 	@echo
 	@echo '====  Run ./main -h for help.  ===='
 	@echo

-quantize: examples/quantize/quantize.cpp build-info.h ggml.o llama.o $(OBJS)
+quantize: examples/quantize/quantize.cpp                      build-info.h ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.h ggml.o llama.o $(OBJS)
+quantize-stats: examples/quantize-stats/quantize-stats.cpp    build-info.h ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-perplexity: examples/perplexity/perplexity.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+perplexity: examples/perplexity/perplexity.cpp                build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-embedding: examples/embedding/embedding.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+embedding: examples/embedding/embedding.cpp                   build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

 save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

+server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS)
+
+train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp    build-info.h ggml.o llama.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
 build-info.h: $(wildcard .git/index) scripts/build-info.sh
 	@sh scripts/build-info.sh > $@.tmp
 	@if ! cmp -s $@.tmp $@; then \
@@ -240,6 +311,6 @@ benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o
 vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)

-.PHONY: tests
+.PHONY: tests clean
 tests:
 	bash ./tests/run-tests.sh
--- a/Package.swift
+++ b/Package.swift
@@ -11,6 +11,7 @@ let package = Package(
        .target(
            name: "llama",
            path: ".",
+            exclude: ["ggml-metal.metal"],
            sources: ["ggml.c", "llama.cpp"],
            publicHeadersPath: "spm-headers",
            cSettings: [.unsafeFlags(["-Wno-shorten-64-to-32"]), .define("GGML_USE_ACCELERATE")],
--- a/README.md
+++ b/README.md
@@ -9,8 +9,12 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++

 **Hot topics:**

- Qauntization formats `Q4` and `Q5` have changed - requantize any old models [(info)](https://github.com/ggerganov/llama.cpp/pull/1405)
- [Roadmap May 2023](https://github.com/ggerganov/llama.cpp/discussions/1220)
+- Roadmap June 2023: https://github.com/ggerganov/llama.cpp/discussions/1729
+- GPU support with Metal (Apple Silicon): https://github.com/ggerganov/llama.cpp/pull/1642
+- High-quality 2,3,4,5,6-bit quantization: https://github.com/ggerganov/llama.cpp/pull/1684
+- Multi-GPU support: https://github.com/ggerganov/llama.cpp/pull/1607
+- Training LLaMA models from scratch: https://github.com/ggerganov/llama.cpp/pull/1652
+- CPU threading improvements: https://github.com/ggerganov/llama.cpp/pull/1632

 <details>
  <summary>Table of Contents</summary>
@@ -50,12 +54,11 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
 The main goal of `llama.cpp` is to run the LLaMA model using 4-bit integer quantization on a MacBook

 - Plain C/C++ implementation without dependencies
- Apple silicon first-class citizen - optimized via ARM NEON and Accelerate framework
+- Apple silicon first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks
 - AVX, AVX2 and AVX512 support for x86 architectures
 - Mixed F16 / F32 precision
 - 4-bit, 5-bit and 8-bit integer quantization support
- Runs on the CPU
- OpenBLAS support
+- Supports OpenBLAS/Apple BLAS/ARM Performance Lib/ATLAS/BLIS/Intel MKL/NVHPC/ACML/SCSL/SGIMATH and [more](https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors) in BLAS
 - cuBLAS and CLBlast support

 The original implementation of `llama.cpp` was [hacked in an evening](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022).
@@ -80,6 +83,7 @@ as the main playground for developing new features for the [ggml](https://github
 - [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/)
 - [X] [OpenBuddy 🐶 (Multilingual)](https://github.com/OpenBuddy/OpenBuddy)
 - [X] [Pygmalion 7B / Metharme 7B](#using-pygmalion-7b--metharme-7b)
+- [X] [WizardLM](https://github.com/nlpxucan/WizardLM)

 **Bindings:**

@@ -87,6 +91,7 @@ as the main playground for developing new features for the [ggml](https://github
 - Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
 - Node.js: [hlhr202/llama-node](https://github.com/hlhr202/llama-node)
 - Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)
+- C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp)

 **UI:**

@@ -233,15 +238,41 @@ In order to build llama.cpp you have three different options.
    zig build -Drelease-fast
    ```

+### Metal Build
+
+Using Metal allows the computation to be executed on the GPU for Apple devices:
+
+- Using `make`:
+
+  ```bash
+  LLAMA_METAL=1 make
+  ```
+
+- Using `CMake`:
+
+    ```bash
+    mkdir build-metal
+    cd build-metal
+    cmake -DLLAMA_METAL=ON ..
+    cmake --build . --config Release
+    ```
+
+When built with Metal support, you can enable GPU inference with the `--gpu-layers|-ngl` command-line argument.
+Any value larger than 0 will offload the computation to the GPU. For example:
+
+```bash
+./main -m ./models/7B/ggml-model-q4_0.bin -n 128 -ngl 1
+```
+
 ### BLAS Build

 Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). BLAS doesn't affect the normal generation performance. There are currently three different implementations of it:

- Accelerate Framework:
+- #### Accelerate Framework:

  This is only available on Mac PCs and it's enabled by default. You can just build using the normal instructions.

- OpenBLAS:
+- #### OpenBLAS:

  This provides BLAS acceleration using only the CPU. Make sure to have OpenBLAS installed on your machine.

@@ -271,11 +302,26 @@ Building the program with BLAS support may lead to some performance improvements
      ```bash
      mkdir build
      cd build
-      cmake .. -DLLAMA_OPENBLAS=ON
+      cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
      cmake --build . --config Release
      ```

- cuBLAS
+- #### BLIS
+
+  Check [BLIS.md](docs/BLIS.md) for more information.
+
+- #### Intel MKL
+
+  By default, `LLAMA_BLAS_VENDOR` is set to `Generic`, so if you already sourced intel environment script and assign `-DLLAMA_BLAS=ON` in cmake, the mkl version of Blas will automatically been selected. You may also specify it by:
+
+  ```bash
+  mkdir build
+  cd build
+  cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+  cmake --build . --config Release
+  ```
+
+- #### cuBLAS

  This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
  - Using `make`:
@@ -290,8 +336,83 @@ Building the program with BLAS support may lead to some performance improvements
    cmake .. -DLLAMA_CUBLAS=ON
    cmake --build . --config Release
    ```
+  Note: Because llama.cpp uses multiple CUDA streams for matrix multiplication results [are not guaranteed to be reproducible](https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility). If you need reproducibility, set `GGML_CUDA_MAX_STREAMS` in the file `ggml-cuda.cu` to 1.

-Note: Because llama.cpp uses multiple CUDA streams for matrix multiplication results [are not guaranteed to be reproducible](https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility). If you need reproducibility, set `GGML_CUDA_MAX_STREAMS` in the file `ggml-cuda.cu` to 1.
+  The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used.
+
+- #### CLBlast
+
+  OpenCL acceleration is provided by the matrix multiplication kernels from the [CLBlast](https://github.com/CNugteren/CLBlast) project and custom kernels for ggml that can generate tokens on the GPU.
+
+  You will need the [OpenCL SDK](https://github.com/KhronosGroup/OpenCL-SDK).
+    - For Ubuntu or Debian, the packages `opencl-headers`, `ocl-icd` may be needed.
+
+    - <details>
+        <summary>Installing the OpenCL SDK from source</summary>
+
+        ```sh
+        git clone --recurse-submodules https://github.com/KhronosGroup/OpenCL-SDK.git
+        mkdir OpenCL-SDK/build
+        cd OpenCL-SDK/build
+        cmake .. -DBUILD_DOCS=OFF \
+          -DBUILD_EXAMPLES=OFF \
+          -DBUILD_TESTING=OFF \
+          -DOPENCL_SDK_BUILD_SAMPLES=OFF \
+          -DOPENCL_SDK_TEST_SAMPLES=OFF
+        cmake --build . --config Release
+        cmake --install . --prefix /some/path
+        ```
+      </details>
+
+  Installing CLBlast: it may be found in your operating system's packages.
+
+  - <details>
+    <summary>If not, then installing from source:</summary>
+
+      ```sh
+      git clone https://github.com/CNugteren/CLBlast.git
+      mkdir CLBlast/build
+      cd CLBLast/build
+      cmake .. -DBUILD_SHARED_LIBS=OFF -DTUNERS=OFF
+      cmake --build . --config Release
+      cmake --install . --prefix /some/path
+      ```
+
+      Where `/some/path` is where the built library will be installed (default is `/usr/local`).
+    </details>
+
+  Building:
+
+  - Build with make:
+    ```sh
+    make LLAMA_CLBLAST=1
+    ```
+  - CMake:
+    ```sh
+    mkdir build
+    cd build
+    cmake .. -DLLAMA_CLBLAST=ON -DCLBlast_dir=/some/path
+    cmake --build . --config Release
+    ```
+
+  Running:
+
+  The CLBlast build supports `--gpu-layers|-ngl` like the CUDA version does.
+
+  To select the correct platform (driver) and device (GPU), you can use the environment variables `GGML_OPENCL_PLATFORM` and `GGML_OPENCL_DEVICE`.
+  The selection can be a number (starting from 0) or a text string to search:
+
+  ```sh
+  GGML_OPENCL_PLATFORM=1 ./main ...
+  GGML_OPENCL_DEVICE=2 ./main ...
+  GGML_OPENCL_PLATFORM=Intel ./main ...
+  GGML_OPENCL_PLATFORM=AMD GGML_OPENCL_DEVICE=1 ./main ...
+  ```
+
+  The default behavior is to find the first GPU device, but when it is an integrated GPU on a laptop, for instance, the selectors are useful.
+  Using the variables it is possible to select a CPU-based driver as well, if so desired.
+
+  You can get a list of platforms and devices from the `clinfo -l` command, etc.

 ### Prepare Data & Run

@@ -332,16 +453,16 @@ Several quantization methods are supported. They differ in the resulting model d

 | Model | Measure      | F16    | Q4_0   | Q4_1   | Q5_0   | Q5_1   | Q8_0   |
 |------:|--------------|-------:|-------:|-------:|-------:|-------:|-------:|
-|    7B | perplexity   | 5.9066 | 6.1620 | 6.0910 | 5.9862 | 5.9481 | 5.9069 |
-|    7B | file size    |  13.0G |   4.0G |   4.8G |   4.4G |   4.8G |   7.1G |
-|    7B | ms/tok @ 4th |    128 |     50 |     54 |     75 |     83 |     75 |
-|    7B | ms/tok @ 8th |    123 |     44 |     52 |     53 |     58 |     72 |
-|    7B | bits/weight  |   16.0 |    5.0 |    6.0 |    5.5 |    6.0 |    9.0 |
-|   13B | perplexity   | 5.2543 | 5.3863 | 5.3607 | 5.2856 | 5.2706 | 5.2548 |
-|   13B | file size    |  25.0G |   7.6G |   9.1G |   8.4G |   9.1G |    14G |
-|   13B | ms/tok @ 4th |    239 |     93 |    101 |    150 |    164 |    141 |
-|   13B | ms/tok @ 8th |    240 |     81 |     96 |     96 |    104 |    136 |
-|   13B | bits/weight  |   16.0 |    5.0 |    6.0 |    5.5 |    6.0 |    9.0 |
+|    7B | perplexity   | 5.9066 | 6.1565 | 6.0912 | 5.9862 | 5.9481 | 5.9070 |
+|    7B | file size    |  13.0G |   3.5G |   3.9G |   4.3G |   4.7G |   6.7G |
+|    7B | ms/tok @ 4th |    127 |     55 |     54 |     76 |     83 |     72 |
+|    7B | ms/tok @ 8th |    122 |     43 |     45 |     52 |     56 |     67 |
+|    7B | bits/weight  |   16.0 |    4.5 |    5.0 |    5.5 |    6.0 |    8.5 |
+|   13B | perplexity   | 5.2543 | 5.3860 | 5.3608 | 5.2856 | 5.2706 | 5.2548 |
+|   13B | file size    |  25.0G |   6.8G |   7.6G |   8.3G |   9.1G |    13G |
+|   13B | ms/tok @ 4th |      - |    103 |    105 |    148 |    160 |    131 |
+|   13B | ms/tok @ 8th |      - |     73 |     82 |     98 |    105 |    128 |
+|   13B | bits/weight  |   16.0 |    4.5 |    5.0 |    5.5 |    6.0 |    8.5 |

 ### Perplexity (measuring model quality)

@@ -373,6 +494,25 @@ Note the use of `--color` to distinguish between user input and generated text.

 ![image](https://user-images.githubusercontent.com/1991296/224575029-2af3c7dc-5a65-4f64-a6bb-517a532aea38.png)

+### Persistent Interaction
+
+The prompt, user inputs, and model generations can be saved and resumed across calls to `./main` by leveraging `--prompt-cache` and `--prompt-cache-all`. The `./examples/chat-persistent.sh` script demonstrates this with support for long-running, resumable chat sessions. To use this example, you must provide a file to cache the initial chat prompt and a directory to save the chat session, and may optionally provide the same variables as `chat-13B.sh`. The same prompt cache can be reused for new chat sessions. Note that both prompt cache and chat directory are tied to the initial prompt (`PROMPT_TEMPLATE`) and the model file.
+
+```bash
+# Start a new chat
+PROMPT_CACHE_FILE=chat.prompt.bin CHAT_SAVE_DIR=./chat/default ./examples/chat-persistent.sh
+
+# Resume that chat
+PROMPT_CACHE_FILE=chat.prompt.bin CHAT_SAVE_DIR=./chat/default ./examples/chat-persistent.sh
+
+# Start a different chat with the same prompt/model
+PROMPT_CACHE_FILE=chat.prompt.bin CHAT_SAVE_DIR=./chat/another ./examples/chat-persistent.sh
+
+# Different prompt cache for different prompt/model
+PROMPT_TEMPLATE=./prompts/chat-with-bob.txt PROMPT_CACHE_FILE=bob.prompt.bin \
+    CHAT_SAVE_DIR=./chat/bob ./examples/chat-persistent.sh
+```
+
 ### Instruction mode with Alpaca

 1. First, download the `ggml` Alpaca model into the `./models` folder
@@ -545,3 +685,4 @@ docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /mode
 ### Docs

 - [GGML tips & tricks](https://github.com/ggerganov/llama.cpp/wiki/GGML-Tips-&-Tricks)
+- [Performance troubleshooting](./docs/token_generation_performance_tips.md)
--- a/8
+++ b/8
@@ -1,6 +1,6 @@
 700df0d3013b703a806d2ae7f1bfb8e59814e3d06ae78be0c66368a50059f33d  models/7B/consolidated.00.pth
 666a4bb533b303bdaf89e1b6a3b6f93535d868de31d903afdc20983dc526c847  models/7B/ggml-model-f16.bin
-ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/7B/ggml-model-q4_0.bin
+ec2f2d1f0dfb73b72a4cbac7fa121abbe04c37ab327125a38248f930c0f09ddf  models/7B/ggml-model-q4_0.bin
 ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/7B/ggml-model-q4_1.bin
 ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/7B/ggml-model-q5_0.bin
 ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/7B/ggml-model-q5_1.bin
@@ -8,7 +8,7 @@ ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/7B/ggml
 745bf4e29a4dd6f411e72976d92b452da1b49168a4f41c951cfcc8051823cf08  models/13B/consolidated.00.pth
 d5ccbcc465c71c0de439a5aeffebe8344c68a519bce70bc7f9f92654ee567085  models/13B/consolidated.01.pth
 2b206e9b21fb1076f11cafc624e2af97c9e48ea09312a0962153acc20d45f808  models/13B/ggml-model-f16.bin
-ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/13B/ggml-model-q4_0.bin
+fad169e6f0f575402cf75945961cb4a8ecd824ba4da6be2af831f320c4348fa5  models/13B/ggml-model-q4_0.bin
 ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/13B/ggml-model-q4_1.bin
 ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/13B/ggml-model-q5_0.bin
 ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/13B/ggml-model-q5_1.bin
@@ -18,7 +18,7 @@ e23294a58552d8cdec5b7e8abb87993b97ea6eced4178ff2697c02472539d067  models/30B/con
 24a87f01028cbd3a12de551dcedb712346c0b5cbdeff1454e0ddf2df9b675378  models/30B/consolidated.02.pth
 1adfcef71420886119544949767f6a56cb6339b4d5fcde755d80fe68b49de93b  models/30B/consolidated.03.pth
 7e1b524061a9f4b27c22a12d6d2a5bf13b8ebbea73e99f218809351ed9cf7d37  models/30B/ggml-model-f16.bin
-ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/30B/ggml-model-q4_0.bin
+d2a441403944819492ec8c2002cc36fa38468149bfb4b7b4c52afc7bd9a7166d  models/30B/ggml-model-q4_0.bin
 ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/30B/ggml-model-q4_1.bin
 ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/30B/ggml-model-q5_0.bin
 ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/30B/ggml-model-q5_1.bin
@@ -32,7 +32,7 @@ a287c0dfe49081626567c7fe87f74cce5831f58e459b427b5e05567641f47b78  models/65B/con
 72b4eba67a1a3b18cb67a85b70f8f1640caae9b40033ea943fb166bd80a7b36b  models/65B/consolidated.06.pth
 d27f5b0677d7ff129ceacd73fd461c4d06910ad7787cf217b249948c3f3bc638  models/65B/consolidated.07.pth
 60758f2384d74e423dffddfd020ffed9d3bb186ebc54506f9c4a787d0f5367b0  models/65B/ggml-model-f16.bin
-ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/65B/ggml-model-q4_0.bin
+cde053439fa4910ae454407e2717cc46cc2c2b4995c00c93297a2b52e790fa92  models/65B/ggml-model-q4_0.bin
 ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/65B/ggml-model-q4_1.bin
 ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/65B/ggml-model-q5_0.bin
 ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/65B/ggml-model-q5_1.bin
--- a/convert-pth-to-ggml.py
+++ b/convert-pth-to-ggml.py
@@ -4,7 +4,9 @@ import argparse

 import convert

-parser = argparse.ArgumentParser(description='Convert a LLaMA model checkpoint to a ggml compatible file')
+parser = argparse.ArgumentParser(
+    description="""[DEPRECATED - use `convert.py` instead]
+    Convert a LLaMA model checkpoint to a ggml compatible file""")
 parser.add_argument('dir_model',  help='directory containing the model checkpoint')
 parser.add_argument('ftype',      help='file type (0: float32, 1: float16)', type=int, choices=[0, 1], default=1)
 args = parser.parse_args()
--- a/convert.py
+++ b/convert.py
@@ -121,7 +121,6 @@ def make_tensors_list() -> List[str]:
            f'layers.{i}.feed_forward.w1.weight',
            f'layers.{i}.feed_forward.w2.weight',
            f'layers.{i}.feed_forward.w3.weight',
-            f'layers.{i}.atttention_norm.weight',
            f'layers.{i}.ffn_norm.weight',
        ]
    return ret
@@ -1055,7 +1054,7 @@ def load_some_model(path: Path) -> ModelPlus:
        files = list(path.glob("model-00001-of-*.safetensors"))
        if not files:
            # Try the PyTorch patterns too, with lower priority
-            globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt"]
+            globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin" ]
            files = [file for glob in globs for file in path.glob(glob)]
        if not files:
            # Try GGML too, but with lower priority, since if both a non-GGML
--- a/docs/BLIS.md
+++ b/docs/BLIS.md
@@ -0,0 +1,67 @@
+BLIS Installation Manual
+------------------------
+
+BLIS is a portable software framework for high-performance BLAS-like dense linear algebra libraries. It has received awards and recognition, including the 2023 James H. Wilkinson Prize for Numerical Software and the 2020 SIAM Activity Group on Supercomputing Best Paper Prize. BLIS provides a new BLAS-like API and a compatibility layer for traditional BLAS routine calls. It offers features such as object-based API, typed API, BLAS and CBLAS compatibility layers.
+
+Project URL: https://github.com/flame/blis
+
+### Prepare:
+
+Compile BLIS:
+
+```bash
+git clone https://github.com/flame/blis
+cd blis
+./configure --enable-cblas -t openmp,pthreads auto
+# will install to /usr/local/ by default.
+make -j
+```
+
+Install BLIS:
+
+```bash
+sudo make install
+```
+
+We recommend using openmp since it's easier to modify the cores been used.
+
+### llama.cpp compilation
+
+Makefile:
+
+```bash
+make LLAMA_BLIS=1 -j
+# make LLAMA_BLIS=1 benchmark-matmult
+```
+
+CMake:
+
+```bash
+mkdir build
+cd build
+cmake -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=FLAME ..
+make -j
+```
+
+### llama.cpp execution
+
+According to the BLIS documentation, we could set the following
+environment variables to modify the behavior of openmp:
+
+```
+export GOMP_GPU_AFFINITY="0-19"
+export BLIS_NUM_THREADS=14
+```
+
+And then run the binaries as normal.
+
+
+### Intel specific issue
+
+Some might get the error message saying that `libimf.so` cannot be found.
+Please follow this [stackoverflow page](https://stackoverflow.com/questions/70687930/intel-oneapi-2022-libimf-so-no-such-file-or-directory-during-openmpi-compila).
+
+### Reference:
+
+1. https://github.com/flame/blis#getting-started
+2. https://github.com/flame/blis/blob/master/docs/Multithreading.md
--- a/docs/token_generation_performance_tips.md
+++ b/docs/token_generation_performance_tips.md
@@ -0,0 +1,40 @@
+# Token generation performance troubleshooting
+
+## Verifying that the model is running on the GPU with cuBLAS
+Make sure you compiled llama with the correct env variables according to [this guide](../README.md#cublas), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example:
+```shell
+./main -m "path/to/model.bin" -ngl 200000 -p "Please sir, may I have some "
+```
+
+When running llama, before it starts the inference work, it will output diagnostic information that shows whether cuBLAS is offloading work to the GPU. Look for these lines:
+```shell
+llama_model_load_internal: [cublas] offloading 60 layers to GPU
+llama_model_load_internal: [cublas] offloading output layer to GPU
+llama_model_load_internal: [cublas] total VRAM used: 17223 MB
+... rest of inference
+```
+
+If you see these lines, then the GPU is being used.
+
+## Verifying that the CPU is not oversaturated
+llama accepts a `-t N` (or `--threads N`) parameter. It's extremely important that this parameter is not too large. If your token generation is extremely slow, try setting this number to 1. If this significantly improves your token generation speed, then your CPU is being oversaturated and you need to explicitly set this parameter to the number of the physicial CPU cores on your machine (even if you utilize a GPU). If in doubt, start with 1 and double the amount until you hit a performance bottleneck, then scale the number down.
+
+# Example of runtime flags effect on inference speed benchmark
+These runs were tested on the following machine:
+GPU: A6000 (48GB VRAM)
+CPU: 7 physical cores
+RAM: 32GB
+
+Model: `TheBloke_Wizard-Vicuna-30B-Uncensored-GGML/Wizard-Vicuna-30B-Uncensored.ggmlv3.q4_0.bin` (30B parameters, 4bit quantization, GGML)
+
+Run command: `./main -m "path/to/model.bin" -p "-p "An extremely detailed description of the 10 best ethnic dishes will follow, with recipes: " -n 1000 [additional benchmark flags]`
+
+Result:
+
+| command | tokens/second (higher is better) |
+| - | - |
+| -ngl 2000000 | N/A (less than 0.1) |
+| -t 7 | 1.7 |
+| -t 1 -ngl 2000000 | 5.5 |
+| -t 7 -ngl 2000000 | 8.7 |
+| -t 4 -ngl 2000000 | 9.1 |
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -36,4 +36,12 @@ else()
    add_subdirectory(embedding)
    add_subdirectory(save-load-state)
    add_subdirectory(benchmark)
+    add_subdirectory(baby-llama)
+    add_subdirectory(train-text-from-scratch)
+    if (LLAMA_METAL)
+        add_subdirectory(metal)
+    endif()
+    if (LLAMA_BUILD_SERVER)
+        add_subdirectory(server)
+    endif()
 endif()
--- a/examples/baby-llama/CMakeLists.txt
+++ b/examples/baby-llama/CMakeLists.txt
@@ -0,0 +1,4 @@
+set(TARGET baby-llama)
+add_executable(${TARGET} baby-llama.cpp)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- a/examples/baby-llama/baby-llama.cpp
+++ b/examples/baby-llama/baby-llama.cpp
--- a/examples/benchmark/benchmark-matmult.cpp
+++ b/examples/benchmark/benchmark-matmult.cpp
@@ -1,6 +1,7 @@
-#include <locale.h>
 #include "ggml.h"
 #include "build-info.h"
+
+#include <locale.h>
 #include <assert.h>
 #include <math.h>
 #include <cstring>
@@ -15,7 +16,7 @@
 #include <iterator>
 #include <algorithm>

-float tensor_sum_elements(struct ggml_tensor * tensor) {
+float tensor_sum_elements(const ggml_tensor * tensor) {
    float sum = 0;
    if (tensor->type==GGML_TYPE_F32) {
        for (int j = 0; j < tensor->ne[1]; j++) {
@@ -27,21 +28,15 @@ float tensor_sum_elements(struct ggml_tensor * tensor) {
    return sum;
 }

+void tensor_dump(const ggml_tensor * tensor, const char * name) {
+    printf("%15s: type = %i (%5s) ne = %5d x %5d x %5d, nb = (%5li, %5li, %5li) - ", name,
+        tensor->type, ggml_type_name(tensor->type),
+        (int) tensor->ne[0], (int) tensor->ne[1], (int) tensor->ne[2], tensor->nb[0], tensor->nb[1], tensor->nb[2]);
+    float sum = tensor_sum_elements(tensor);
+    printf("Sum of tensor %s is %6.2f\n", name, sum);
+}

-/*
-    These are mapping to unknown
-    GGML_TYPE_I8,
-    GGML_TYPE_I16,
-    GGML_TYPE_I32,
-    GGML_TYPE_COUNT,
-*/
-
-#define TENSOR_TYPE_AS_STR(TYPE) TYPE == GGML_TYPE_F32 ? "FP32" : TYPE == GGML_TYPE_F16 ? "FP16" : TYPE == GGML_TYPE_Q4_0 ? "Q4_0" : TYPE == GGML_TYPE_Q4_1 ? "Q4_1" : "UNKNOWN"
-
-#define TENSOR_DUMP(TENSOR) printf("%15s: type = %i (%5s) ne = %5d x %5d x %5d, nb = (%5li, %5li, %5li) - ", #TENSOR, \
-        TENSOR->type,TENSOR_TYPE_AS_STR(TENSOR->type),\
-        (int) TENSOR->ne[0], (int) TENSOR->ne[1], (int) TENSOR->ne[2], TENSOR->nb[0], TENSOR->nb[1], TENSOR->nb[2]); \
-    { float sum = tensor_sum_elements(TENSOR); printf("Sum of tensor %s is %6.2f\n",#TENSOR, sum); }
+#define TENSOR_DUMP(tensor) tensor_dump(tensor, #tensor)

 struct benchmark_params_struct {
    int32_t n_threads     = 1;
@@ -59,8 +54,6 @@ void print_usage(int /*argc*/, char ** argv, struct benchmark_params_struct para
 }

 int main(int argc, char ** argv)  {
-
-
    struct benchmark_params_struct benchmark_params;

    bool invalid_param = false;
@@ -84,11 +77,11 @@ int main(int argc, char ** argv)  {
            print_usage(argc, argv, benchmark_params);
            exit(0);
        }
-        if (invalid_param) {
-            fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
-            print_usage(argc, argv, benchmark_params);
-            exit(1);
-        }
+    }
+    if (invalid_param) {
+        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
+        print_usage(argc, argv, benchmark_params);
+        exit(1);
    }

    fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
@@ -216,10 +209,10 @@ int main(int argc, char ** argv)  {
    // Let's use the F32 result from above as a reference for the q4_0 multiplication
    float sum_of_F32_reference = tensor_sum_elements(gf.nodes[0]);

+    printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n");
+    printf("=====================================================================================\n");

-    printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; FLOPS_per_u_Second\n");
-    printf("==============================================================================================\n");
-
+    double  gflops_sum = 0;
    for (int i=0;i<benchmark_params.n_iterations ;i++) {

        long long int start = ggml_time_us();
@@ -227,12 +220,13 @@ int main(int argc, char ** argv)  {
        ggml_graph_compute(ctx, &gf31);
        long long int stop = ggml_time_us();
        long long int usec = stop-start;
-        float flops_per_usec = (1.0f*flops_per_matrix)/usec;
-        printf("%9i;%8i;%6i;%6i;%6i;%15lli;%18lli;%19.2f\n",
+        double gflops = (double)(flops_per_matrix)/usec/1000.0;
+        gflops_sum += gflops;
+        printf("%9i;%8i;%6i;%6i;%6i;%15lli;%18lli;%10.2f\n",
            i,
            gf31.n_threads,
            sizex, sizey, sizez, flops_per_matrix,
-            usec,flops_per_usec);
+            usec,gflops);

 #ifdef VERBOSE_DEBUGGING
        TENSOR_DUMP("res",gf31.nodes[0])
@@ -256,7 +250,8 @@ int main(int argc, char ** argv)  {

        // Running a different graph computation to make sure we override the CPU cache lines
        ggml_graph_compute(ctx, &gf32);
-
    }
-
+    printf("\n");
+    printf("Average%78.2f\n",gflops_sum/((double)benchmark_params.n_iterations));
+    printf("=====================================================================================\n");
 }
--- a/examples/chat-persistent.sh
+++ b/examples/chat-persistent.sh
@@ -0,0 +1,151 @@
+#!/bin/bash
+
+set -euo pipefail
+
+cd "$(dirname "$0")/.." || exit
+
+if [[ -z "${PROMPT_CACHE_FILE+x}" || -z "${CHAT_SAVE_DIR+x}" ]]; then
+    echo >&2 "error: PROMPT_CACHE_FILE and CHAT_SAVE_DIR must be provided"
+    exit 1
+fi
+
+MODEL="${MODEL:-./models/13B/ggml-model-q4_0.bin}"
+PROMPT_TEMPLATE="${PROMPT_TEMPLATE:-./prompts/chat.txt}"
+USER_NAME="${USER_NAME:-User}"
+AI_NAME="${AI_NAME:-ChatLLaMa}"
+DATE_TIME="$(date +%H:%M)"
+DATE_YEAR="$(date +%Y)"
+
+LOG="${CHAT_SAVE_DIR}/main.log"
+LOG_BG="${CHAT_SAVE_DIR}/main-bg.log"
+CUR_PROMPT_FILE="${CHAT_SAVE_DIR}/current-prompt.txt"
+CUR_PROMPT_CACHE="${CHAT_SAVE_DIR}/current-cache.bin"
+NEXT_PROMPT_FILE="${CHAT_SAVE_DIR}/next-prompt.txt"
+NEXT_PROMPT_CACHE="${CHAT_SAVE_DIR}/next-cache.bin"
+
+SESSION_SIZE_MSG_PATTERN='main: session file matches [[:digit:]]+ / [[:digit:]]+'
+SAMPLE_TIME_MSG_PATTERN='sample time =[[:space:]]+[[:digit:]]+.[[:digit:]]+ ms /[[:space:]]+[[:digit:]]+'
+SED_DELETE_MESSAGES="/^(${USER_NAME}:|${AI_NAME}:|\\.\\.\\.)/,\$d"
+
+CTX_SIZE=2048
+CTX_ROTATE_POINT=$((CTX_SIZE * 3 / 5)) # REVIEW
+OPTS=(--model "$MODEL" --ctx_size "$CTX_SIZE" --repeat_last_n 256 "$@")
+
+# An unbuffered `tail -c+N`
+skip_bytes() {
+    LANG=C IFS= read -r -n "$1" -d '' c
+    while LANG=C IFS= read -r -n 1 -d '' c; do
+        printf '%s' "$c"
+    done
+}
+
+mkdir -p "$CHAT_SAVE_DIR"
+echo >"$LOG"
+trap "tail -n100 ${LOG}" EXIT
+
+if [[ ! -e "$CUR_PROMPT_FILE" ]]; then
+    sed -e "s/\[\[USER_NAME\]\]/${USER_NAME}/g" \
+        -e "s/\[\[AI_NAME\]\]/${AI_NAME}/g" \
+        -e "s/\[\[DATE_TIME\]\]/${DATE_TIME}/g" \
+        -e "s/\[\[DATE_YEAR\]\]/${DATE_YEAR}/g" \
+        "$PROMPT_TEMPLATE" >"$CUR_PROMPT_FILE"
+fi
+
+if [[ ! -e "$NEXT_PROMPT_FILE" ]]; then
+    sed -r "$SED_DELETE_MESSAGES" "$CUR_PROMPT_FILE" >"$NEXT_PROMPT_FILE"
+fi
+
+if [[ "$(tail -c4 "$NEXT_PROMPT_FILE")" != "..." ]]; then
+    echo '...' >>"$NEXT_PROMPT_FILE"
+fi
+
+if [[ ! -e "$PROMPT_CACHE_FILE" ]]; then
+    echo 'Prompt cache does not exist, building...'
+    # Default batch_size to 8 here for better user feedback during initial prompt processing
+    ./main 2>>"$LOG" \
+        --batch_size 8 \
+        "${OPTS[@]}" \
+        --prompt-cache "$PROMPT_CACHE_FILE" \
+        --file "$CUR_PROMPT_FILE" \
+        --n_predict 1
+    echo
+    echo 'Done!'
+fi
+
+if [[ ! -e "$CUR_PROMPT_CACHE" ]]; then
+    cp "$PROMPT_CACHE_FILE" "$CUR_PROMPT_CACHE"
+fi
+if [[ ! -e "$NEXT_PROMPT_CACHE" ]]; then
+    cp "$PROMPT_CACHE_FILE" "$NEXT_PROMPT_CACHE"
+fi
+
+printf '%s ' "$(< "$CUR_PROMPT_FILE")"
+n_tokens=0
+
+while read -e line; do
+    # Limit generation to remaining context, with a buffer and estimating 2 chars/token for input
+    n_predict=$((CTX_SIZE - n_tokens - ${#line} / 2 - 32))
+
+    # Swap prompts when we're about to run out of context
+    if ((n_predict <= 0)); then
+        wait # for background main (below) to finish with next prompt
+        mv "$NEXT_PROMPT_FILE"  "$CUR_PROMPT_FILE"
+        mv "$NEXT_PROMPT_CACHE" "$CUR_PROMPT_CACHE"
+
+        sed -r "$SED_DELETE_MESSAGES" "$CUR_PROMPT_FILE" >"$NEXT_PROMPT_FILE"
+        echo '...' >>"$NEXT_PROMPT_FILE"
+        cp "$PROMPT_CACHE_FILE" "$NEXT_PROMPT_CACHE"
+
+        n_tokens=0
+        n_predict=$((CTX_SIZE / 2))
+    fi
+
+    echo " ${line}" >>"$CUR_PROMPT_FILE"
+    if ((n_tokens > CTX_ROTATE_POINT)); then
+        echo " ${line}" >>"$NEXT_PROMPT_FILE"
+    fi
+
+    n_prompt_len_pre=$(($(wc -c <"$CUR_PROMPT_FILE")))
+
+    printf '%s: ' "$AI_NAME" >>"$CUR_PROMPT_FILE"
+
+    ./main 2>>"$LOG" "${OPTS[@]}" \
+            --prompt-cache "$CUR_PROMPT_CACHE" \
+            --prompt-cache-all \
+            --file "$CUR_PROMPT_FILE" \
+            --reverse-prompt "${USER_NAME}:" \
+            --n_predict "$n_predict" |
+        skip_bytes 1 |                  # skip BOS token added by ./main
+        tee "$CUR_PROMPT_FILE.tmp" |    # save prompt + generation to tmp file
+        skip_bytes "$n_prompt_len_pre"  # print generation
+
+    mv "$CUR_PROMPT_FILE.tmp" "$CUR_PROMPT_FILE"
+
+    # if we hit n_predict instead of reverse-prompt, we need to add the prompt
+    if [[ "$(tail -n1 "$CUR_PROMPT_FILE")" != "${USER_NAME}:" ]]; then
+        printf '\n%s:' "$USER_NAME"
+        printf '\n%s:' "$USER_NAME" >> "$CUR_PROMPT_FILE"
+    fi
+
+    printf ' '
+
+    # HACK get num tokens from debug message
+    # TODO get both messages in one go
+    if  ! session_size_msg="$(tail -n30 "$LOG" | grep -oE "$SESSION_SIZE_MSG_PATTERN")" ||
+        ! sample_time_msg="$( tail -n10 "$LOG" | grep -oE "$SAMPLE_TIME_MSG_PATTERN")"; then
+        echo >&2 "Couldn't get number of tokens from ./main output!"
+        exit 1
+    fi
+
+    n_tokens=$(($(cut -d/ -f2 <<<"$session_size_msg") + $(cut -d/ -f2 <<<"$sample_time_msg")))
+
+    if ((n_tokens > CTX_ROTATE_POINT)); then
+        tail -c+$((n_prompt_len_pre + 1)) "$CUR_PROMPT_FILE" >>"$NEXT_PROMPT_FILE"
+    fi
+
+    # Update cache for next prompt in background, ideally during user input
+    ./main >>"$LOG_BG" 2>&1 "${OPTS[@]}" \
+          --prompt-cache "$NEXT_PROMPT_CACHE" \
+          --file "$NEXT_PROMPT_FILE" \
+          --n_predict 1 &
+done
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -8,6 +8,8 @@
 #include <iterator>
 #include <algorithm>
 #include <sstream>
+#include <unordered_set>
+#include <regex>

 #if defined(__APPLE__) && defined(__MACH__)
 #include <sys/types.h>
@@ -28,21 +30,21 @@

 int32_t get_num_physical_cores() {
 #ifdef __linux__
-    std::ifstream cpuinfo("/proc/cpuinfo");
-    std::string line;
-    while (std::getline(cpuinfo, line)) {
-        std::size_t pos = line.find("cpu cores");
-        if (pos != std::string::npos) {
-            pos = line.find(": ", pos);
-            if (pos != std::string::npos) {
-                try {
-                    // Extract the number and return it
-                    return static_cast<int32_t>(std::stoul(line.substr(pos + 2)));
-                } catch (const std::invalid_argument &) {
-                    // Ignore if we could not parse
-                }
-            }
+    // enumerate the set of thread siblings, num entries is num cores
+    std::unordered_set<std::string> siblings;
+    for (uint32_t cpu=0; cpu < UINT32_MAX; ++cpu) {
+        std::ifstream thread_siblings("/sys/devices/system/cpu"
+            + std::to_string(cpu) + "/topology/thread_siblings");
+        if (!thread_siblings.is_open()) {
+            break; // no more cpus
        }
+        std::string line;
+        if (std::getline(thread_siblings, line)) {
+            siblings.insert(line);
+        }
+    }
+    if (siblings.size() > 0) {
+        return static_cast<int32_t>(siblings.size());
    }
 #elif defined(__APPLE__) && defined(__MACH__)
    int32_t num_physical_cores;
@@ -91,9 +93,13 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
    bool escape_prompt = false;
    std::string arg;
    gpt_params default_params;
+    const std::string arg_prefix = "--";

    for (int i = 1; i < argc; i++) {
        arg = argv[i];
+        if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
+            std::replace(arg.begin(), arg.end(), '_', '-');
+        }

        if (arg == "-s" || arg == "--seed") {
 #if defined(GGML_USE_CUBLAS)
@@ -126,6 +132,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
            params.path_prompt_cache = argv[i];
        } else if (arg == "--prompt-cache-all") {
            params.prompt_cache_all = true;
+        } else if (arg == "--prompt-cache-ro") {
+            params.prompt_cache_ro = true;
        } else if (arg == "-f" || arg == "--file") {
            if (++i >= argc) {
                invalid_param = true;
@@ -141,27 +149,27 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
            if (params.prompt.back() == '\n') {
                params.prompt.pop_back();
            }
-        } else if (arg == "-n" || arg == "--n_predict") {
+        } else if (arg == "-n" || arg == "--n-predict") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.n_predict = std::stoi(argv[i]);
-        } else if (arg == "--top_k") {
+        } else if (arg == "--top-k") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.top_k = std::stoi(argv[i]);
-        } else if (arg == "-c" || arg == "--ctx_size") {
+        } else if (arg == "-c" || arg == "--ctx-size") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.n_ctx = std::stoi(argv[i]);
-        } else if (arg == "--memory_f32") {
+        } else if (arg == "--memory-f32") {
            params.memory_f16 = false;
-        } else if (arg == "--top_p") {
+        } else if (arg == "--top-p") {
            if (++i >= argc) {
                invalid_param = true;
                break;
@@ -185,25 +193,25 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                break;
            }
            params.typical_p = std::stof(argv[i]);
-        } else if (arg == "--repeat_last_n") {
+        } else if (arg == "--repeat-last-n") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.repeat_last_n = std::stoi(argv[i]);
-        } else if (arg == "--repeat_penalty") {
+        } else if (arg == "--repeat-penalty") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.repeat_penalty = std::stof(argv[i]);
-        } else if (arg == "--frequency_penalty") {
+        } else if (arg == "--frequency-penalty") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.frequency_penalty = std::stof(argv[i]);
-        } else if (arg == "--presence_penalty") {
+        } else if (arg == "--presence-penalty") {
            if (++i >= argc) {
                invalid_param = true;
                break;
@@ -215,19 +223,19 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                break;
            }
            params.mirostat = std::stoi(argv[i]);
-        } else if (arg == "--mirostat_lr") {
+        } else if (arg == "--mirostat-lr") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.mirostat_eta = std::stof(argv[i]);
-        } else if (arg == "--mirostat_ent") {
+        } else if (arg == "--mirostat-ent") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.mirostat_tau = std::stof(argv[i]);
-        } else if (arg == "-b" || arg == "--batch_size") {
+        } else if (arg == "-b" || arg == "--batch-size") {
            if (++i >= argc) {
                invalid_param = true;
                break;
@@ -246,6 +254,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                break;
            }
            params.model = argv[i];
+        } else if (arg == "-a" || arg == "--alias") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.model_alias = argv[i];
        } else if (arg == "--lora") {
            if (++i >= argc) {
                invalid_param = true;
@@ -273,10 +287,63 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
            params.use_color = true;
        } else if (arg == "--mlock") {
            params.use_mlock = true;
+        } else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
+            params.n_gpu_layers = std::stoi(argv[i]);
+#else
+            fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
+            fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
+#endif
+        } else if (arg == "--main-gpu" || arg == "-mg") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+#ifdef GGML_USE_CUBLAS
+            params.main_gpu = std::stoi(argv[i]);
+#else
+      fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.\n");
+#endif
+        } else if (arg == "--tensor-split" || arg == "-ts") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+#ifdef GGML_USE_CUBLAS
+            std::string arg_next = argv[i];
+
+            // split string by , and /
+            const std::regex regex{R"([,/]+)"};
+            std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
+            std::vector<std::string> split_arg{it, {}};
+            GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);
+
+            for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) {
+                if (i < split_arg.size()) {
+                    params.tensor_split[i] = std::stof(split_arg[i]);
+                } else {
+                    params.tensor_split[i] = 0.0f;
+                }
+            }
+#else
+      fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
+#endif // GGML_USE_CUBLAS
+        } else if (arg == "--low-vram" || arg == "-lv") {
+#ifdef GGML_USE_CUBLAS
+            params.low_vram = true;
+#else
+      fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n");
+#endif // GGML_USE_CUBLAS
        } else if (arg == "--no-mmap") {
            params.use_mmap = false;
        } else if (arg == "--mtest") {
            params.mem_test = true;
+        } else if (arg == "--export") {
+            params.export_cgraph = true;
        } else if (arg == "--verbose-prompt") {
            params.verbose_prompt = true;
        } else if (arg == "-r" || arg == "--reverse-prompt") {
@@ -310,12 +377,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                invalid_param = true;
                break;
            }
-        } else if (arg == "--n_parts") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.n_parts = std::stoi(argv[i]);
        } else if (arg == "-h" || arg == "--help") {
            gpt_print_usage(argc, argv, default_params);
            exit(0);
@@ -346,11 +407,19 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
    }
    if (params.prompt_cache_all &&
            (params.interactive || params.interactive_first ||
-             params.instruct || params.antiprompt.size())) {
+             params.instruct)) {
        fprintf(stderr, "error: --prompt-cache-all not supported in interactive mode yet\n");
        gpt_print_usage(argc, argv, default_params);
        exit(1);
    }
+
+#ifdef GGML_USE_CUBLAS
+    if (!params.lora_adapter.empty() && params.n_gpu_layers > 0) {
+        fprintf(stderr, "%s: error: the simultaneous use of LoRAs and GPU acceleration is not supported", __func__);
+        exit(1);
+    }
+#endif // GGML_USE_CUBLAS
+
    if (escape_prompt) {
        process_escapes(params.prompt);
    }
@@ -368,8 +437,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    fprintf(stderr, "  -ins, --instruct      run in instruction mode (use with Alpaca models)\n");
    fprintf(stderr, "  --multiline-input     allows you to write or paste multiple lines without ending each in '\\'\n");
    fprintf(stderr, "  -r PROMPT, --reverse-prompt PROMPT\n");
-    fprintf(stderr, "                        run in interactive mode and poll user input upon seeing PROMPT (can be\n");
-    fprintf(stderr, "                        specified more than once for multiple prompts).\n");
+    fprintf(stderr, "                        halt generation at PROMPT, return control in interactive mode\n");
+    fprintf(stderr, "                        (can be specified more than once for multiple prompts).\n");
    fprintf(stderr, "  --color               colorise output to distinguish prompt and user input from generations\n");
    fprintf(stderr, "  -s SEED, --seed SEED  RNG seed (default: -1, use random seed for < 0)\n");
    fprintf(stderr, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
@@ -379,36 +448,37 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    fprintf(stderr, "  --prompt-cache FNAME  file to cache prompt state for faster startup (default: none)\n");
    fprintf(stderr, "  --prompt-cache-all    if specified, saves user input and generations to cache as well.\n");
    fprintf(stderr, "                        not supported with --interactive or other interactive options\n");
+    fprintf(stderr, "  --prompt-cache-ro     if specified, uses the prompt cache but does not update it.\n");
    fprintf(stderr, "  --random-prompt       start with a randomized prompt.\n");
    fprintf(stderr, "  --in-prefix STRING    string to prefix user inputs with (default: empty)\n");
    fprintf(stderr, "  --in-suffix STRING    string to suffix after user inputs with (default: empty)\n");
    fprintf(stderr, "  -f FNAME, --file FNAME\n");
    fprintf(stderr, "                        prompt file to start generation.\n");
-    fprintf(stderr, "  -n N, --n_predict N   number of tokens to predict (default: %d, -1 = infinity)\n", params.n_predict);
-    fprintf(stderr, "  --top_k N             top-k sampling (default: %d, 0 = disabled)\n", params.top_k);
-    fprintf(stderr, "  --top_p N             top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p);
+    fprintf(stderr, "  -n N, --n-predict N   number of tokens to predict (default: %d, -1 = infinity)\n", params.n_predict);
+    fprintf(stderr, "  --top-k N             top-k sampling (default: %d, 0 = disabled)\n", params.top_k);
+    fprintf(stderr, "  --top-p N             top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p);
    fprintf(stderr, "  --tfs N               tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z);
    fprintf(stderr, "  --typical N           locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)params.typical_p);
-    fprintf(stderr, "  --repeat_last_n N     last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", params.repeat_last_n);
-    fprintf(stderr, "  --repeat_penalty N    penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)params.repeat_penalty);
-    fprintf(stderr, "  --presence_penalty N  repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)params.presence_penalty);
-    fprintf(stderr, "  --frequency_penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)params.frequency_penalty);
+    fprintf(stderr, "  --repeat-last-n N     last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", params.repeat_last_n);
+    fprintf(stderr, "  --repeat-penalty N    penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)params.repeat_penalty);
+    fprintf(stderr, "  --presence-penalty N  repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)params.presence_penalty);
+    fprintf(stderr, "  --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)params.frequency_penalty);
    fprintf(stderr, "  --mirostat N          use Mirostat sampling.\n");
    fprintf(stderr, "                        Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n");
    fprintf(stderr, "                        (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", params.mirostat);
-    fprintf(stderr, "  --mirostat_lr N       Mirostat learning rate, parameter eta (default: %.1f)\n", (double)params.mirostat_eta);
-    fprintf(stderr, "  --mirostat_ent N      Mirostat target entropy, parameter tau (default: %.1f)\n", (double)params.mirostat_tau);
+    fprintf(stderr, "  --mirostat-lr N       Mirostat learning rate, parameter eta (default: %.1f)\n", (double)params.mirostat_eta);
+    fprintf(stderr, "  --mirostat-ent N      Mirostat target entropy, parameter tau (default: %.1f)\n", (double)params.mirostat_tau);
    fprintf(stderr, "  -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n");
    fprintf(stderr, "                        modifies the likelihood of token appearing in the completion,\n");
    fprintf(stderr, "                        i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n");
    fprintf(stderr, "                        or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n");
-    fprintf(stderr, "  -c N, --ctx_size N    size of the prompt context (default: %d)\n", params.n_ctx);
+    fprintf(stderr, "  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
    fprintf(stderr, "  --ignore-eos          ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
    fprintf(stderr, "  --no-penalize-nl      do not penalize newline token\n");
-    fprintf(stderr, "  --memory_f32          use f32 instead of f16 for memory key+value\n");
+    fprintf(stderr, "  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
+    fprintf(stderr, "                        not recommended: doubles context memory required and no measurable increase in quality\n");
    fprintf(stderr, "  --temp N              temperature (default: %.1f)\n", (double)params.temp);
-    fprintf(stderr, "  --n_parts N           number of model parts (default: -1 = determine from dimensions)\n");
-    fprintf(stderr, "  -b N, --batch_size N  batch size for prompt processing (default: %d)\n", params.n_batch);
+    fprintf(stderr, "  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
    fprintf(stderr, "  --perplexity          compute perplexity over the prompt\n");
    fprintf(stderr, "  --keep                number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
    if (llama_mlock_supported()) {
@@ -417,7 +487,16 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    if (llama_mmap_supported()) {
        fprintf(stderr, "  --no-mmap             do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
    }
+#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
+    fprintf(stderr, "  -ngl N, --n-gpu-layers N\n");
+    fprintf(stderr, "                        number of layers to store in VRAM\n");
+    fprintf(stderr, "  -ts SPLIT --tensor-split SPLIT\n");
+    fprintf(stderr, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
+    fprintf(stderr, "  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n" );
+    fprintf(stderr, "  -lv, --low-vram       don't allocate VRAM scratch buffer\n" );
+#endif
    fprintf(stderr, "  --mtest               compute maximum memory usage\n");
+    fprintf(stderr, "  --export              export the computation graph to 'llama.ggml'\n");
    fprintf(stderr, "  --verbose-prompt      print prompt before generation\n");
    fprintf(stderr, "  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");
    fprintf(stderr, "  --lora-base FNAME     optional model to use as a base for the layers modified by the LoRA adapter\n");
@@ -459,14 +538,18 @@ std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::s
 struct llama_context * llama_init_from_gpt_params(const gpt_params & params) {
    auto lparams = llama_context_default_params();

-    lparams.n_ctx      = params.n_ctx;
-    lparams.n_parts    = params.n_parts;
-    lparams.seed       = params.seed;
-    lparams.f16_kv     = params.memory_f16;
-    lparams.use_mmap   = params.use_mmap;
-    lparams.use_mlock  = params.use_mlock;
-    lparams.logits_all = params.perplexity;
-    lparams.embedding  = params.embedding;
+    lparams.n_ctx        = params.n_ctx;
+    lparams.n_batch      = params.n_batch;
+    lparams.n_gpu_layers = params.n_gpu_layers;
+    lparams.main_gpu     = params.main_gpu;
+    memcpy(lparams.tensor_split, params.tensor_split, LLAMA_MAX_DEVICES*sizeof(float));
+    lparams.low_vram     = params.low_vram;
+    lparams.seed         = params.seed;
+    lparams.f16_kv       = params.memory_f16;
+    lparams.use_mmap     = params.use_mmap;
+    lparams.use_mlock    = params.use_mlock;
+    lparams.logits_all   = params.perplexity;
+    lparams.embedding    = params.embedding;

    llama_context * lctx = llama_init_from_file(params.model.c_str(), lparams);

@@ -565,6 +648,9 @@ void console_set_color(console_state & con_st, console_color_t color) {
            case CONSOLE_COLOR_USER_INPUT:
                fprintf(con_st.out, ANSI_BOLD ANSI_COLOR_GREEN);
                break;
+            case CONSOLE_COLOR_ERROR:
+                fprintf(con_st.out, ANSI_BOLD ANSI_COLOR_RED);
+                break;
        }
        con_st.color = color;
        fflush(con_st.out);
@@ -572,6 +658,37 @@ void console_set_color(console_state & con_st, console_color_t color) {
 }

 char32_t getchar32() {
+#if defined(_WIN32)
+    HANDLE hConsole = GetStdHandle(STD_INPUT_HANDLE);
+    wchar_t high_surrogate = 0;
+
+    while (true) {
+        INPUT_RECORD record;
+        DWORD count;
+        if (!ReadConsoleInputW(hConsole, &record, 1, &count) || count == 0) {
+            return WEOF;
+        }
+
+        if (record.EventType == KEY_EVENT && record.Event.KeyEvent.bKeyDown) {
+            wchar_t wc = record.Event.KeyEvent.uChar.UnicodeChar;
+            if (wc == 0) {
+                continue;
+            }
+
+            if ((wc >= 0xD800) && (wc <= 0xDBFF)) { // Check if wc is a high surrogate
+                high_surrogate = wc;
+                continue;
+            } else if ((wc >= 0xDC00) && (wc <= 0xDFFF)) { // Check if wc is a low surrogate
+                if (high_surrogate != 0) { // Check if we have a high surrogate
+                    return ((high_surrogate - 0xD800) << 10) + (wc - 0xDC00) + 0x10000;
+                }
+            }
+
+            high_surrogate = 0; // Reset the high surrogate
+            return static_cast<char32_t>(wc);
+        }
+    }
+#else
    wchar_t wc = getwchar();
    if (static_cast<wint_t>(wc) == WEOF) {
        return WEOF;
@@ -590,6 +707,7 @@ char32_t getchar32() {
 #endif

    return static_cast<char32_t>(wc);
+#endif
 }

 void pop_cursor(console_state & con_st) {
@@ -743,7 +861,7 @@ bool console_readline(console_state & con_st, std::string & line) {
            break;
        }

-        if (input_char == WEOF || input_char == 0x04 /* Ctrl+D*/) {
+        if (input_char == (char32_t) WEOF || input_char == 0x04 /* Ctrl+D*/) {
            end_of_stream = true;
            break;
        }
@@ -758,7 +876,7 @@ bool console_readline(console_state & con_st, std::string & line) {
            char32_t code = getchar32();
            if (code == '[' || code == 0x1B) {
                // Discard the rest of the escape sequence
-                while ((code = getchar32()) != WEOF) {
+                while ((code = getchar32()) != (char32_t) WEOF) {
                    if ((code >= 'A' && code <= 'Z') || (code >= 'a' && code <= 'z') || code == '~') {
                        break;
                    }
--- a/examples/common.h
+++ b/examples/common.h
@@ -21,13 +21,16 @@
 int32_t get_num_physical_cores();

 struct gpt_params {
-    int32_t seed          = -1;   // RNG seed
-    int32_t n_threads     = get_num_physical_cores();
-    int32_t n_predict     = -1;  // new tokens to predict
-    int32_t n_parts       = -1;   // amount of model parts (-1 = determine from model dimensions)
-    int32_t n_ctx         = 512;  // context size
-    int32_t n_batch       = 512;  // batch size for prompt processing (must be >=32 to use BLAS)
-    int32_t n_keep        = 0;    // number of tokens to keep from initial prompt
+    int32_t seed                            = -1;  // RNG seed
+    int32_t n_threads                       = get_num_physical_cores();
+    int32_t n_predict                       = -1;  // new tokens to predict
+    int32_t n_ctx                           = 512; // context size
+    int32_t n_batch                         = 512; // batch size for prompt processing (must be >=32 to use BLAS)
+    int32_t n_keep                          = 0;   // number of tokens to keep from initial prompt
+    int32_t n_gpu_layers                    = 0;   // number of layers to store in VRAM
+    int32_t main_gpu                        = 0;   // the GPU that is used for scratch and small tensors
+    float   tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
+    bool    low_vram                        = 0;   // if true, reduce VRAM usage at the cost of performance

    // sampling parameters
    std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
@@ -44,21 +47,23 @@ struct gpt_params {
    float   mirostat_tau      = 5.00f; // target entropy
    float   mirostat_eta      = 0.10f; // learning rate

-    std::string model  = "models/lamma-7B/ggml-model.bin"; // model path
-    std::string prompt = "";
+    std::string model             = "models/7B/ggml-model.bin"; // model path
+    std::string model_alias       = "unknown"; // model alias
+    std::string prompt            = "";
    std::string path_prompt_cache = "";  // path to file for saving/loading prompt eval state
    std::string input_prefix      = "";  // string to prefix user inputs with
    std::string input_suffix      = "";  // string to suffix user inputs with
    std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted

    std::string lora_adapter = "";  // lora adapter path
-    std::string lora_base = "";     // base model path for the lora adapter
+    std::string lora_base    = "";  // base model path for the lora adapter

    bool memory_f16        = true;  // use f16 instead of f32 for memory kv
    bool random_prompt     = false; // do not randomize prompt if none provided
    bool use_color         = false; // use color to distinguish generations and inputs
    bool interactive       = false; // interactive mode
    bool prompt_cache_all  = false; // save user input and generations to prompt cache
+    bool prompt_cache_ro   = false; // open the prompt cache read-only and do not update it

    bool embedding         = false; // get only sentence embedding
    bool interactive_first = false; // wait for user input immediately
@@ -70,6 +75,7 @@ struct gpt_params {
    bool use_mmap          = true;  // use mmap for faster loads
    bool use_mlock         = false; // use mlock to keep model in memory
    bool mem_test          = false; // compute maximum memory usage
+    bool export_cgraph     = false; // export the computation graph
    bool verbose_prompt    = false; // print prompt tokens before generation
 };

@@ -107,7 +113,8 @@ struct llama_context * llama_init_from_gpt_params(const gpt_params & params);
 enum console_color_t {
    CONSOLE_COLOR_DEFAULT=0,
    CONSOLE_COLOR_PROMPT,
-    CONSOLE_COLOR_USER_INPUT
+    CONSOLE_COLOR_USER_INPUT,
+    CONSOLE_COLOR_ERROR
 };

 struct console_state {
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -6,7 +6,6 @@

 int main(int argc, char ** argv) {
    gpt_params params;
-    params.model = "models/llama-7B/ggml-model.bin";

    if (gpt_params_parse(argc, argv, params) == false) {
        return 1;
@@ -32,6 +31,8 @@ int main(int argc, char ** argv) {
        params.prompt = gpt_random_prompt(rng);
    }

+    llama_init_backend();
+
    llama_context * ctx;

    // load the model
@@ -56,9 +57,6 @@ int main(int argc, char ** argv) {
    // tokenize the prompt
    auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);

-    // determine newline token
-    auto llama_token_newline = ::llama_tokenize(ctx, "\n", false);
-
    if (params.verbose_prompt) {
        fprintf(stderr, "\n");
        fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
--- a/examples/main/README.md
+++ b/examples/main/README.md
@@ -69,8 +69,8 @@ In this section, we cover the most commonly used options for running the `main`
 -   `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
 -   `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
 -   `-ins, --instruct`: Run the program in instruction mode, which is particularly useful when working with Alpaca models.
-   `-n N, --n_predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
-   `-c N, --ctx_size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
+-   `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
+-   `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.

 ## Input Prompts

@@ -136,9 +136,9 @@ During text generation, LLaMA models have a limited context size, which means th

 ### Context Size

-The `--ctx_size` option allows you to set the size of the prompt context used by the LLaMA models during text generation. A larger context size helps the model to better comprehend and generate responses for longer input or conversations.
+The `--ctx-size` option allows you to set the size of the prompt context used by the LLaMA models during text generation. A larger context size helps the model to better comprehend and generate responses for longer input or conversations.

-   `-c N, --ctx_size N`: Set the size of the prompt context (default: 512). The LLaMA models were built with a context of 2048, which will yield the best results on longer input/inference. However, increasing the context size beyond 2048 may lead to unpredictable results.
+-   `-c N, --ctx-size N`: Set the size of the prompt context (default: 512). The LLaMA models were built with a context of 2048, which will yield the best results on longer input/inference. However, increasing the context size beyond 2048 may lead to unpredictable results.

 ### Keep Prompt

@@ -146,7 +146,7 @@ The `--keep` option allows users to retain the original prompt when the model ru

 -   `--keep N`: Specify the number of tokens from the initial prompt to retain when the model resets its internal context. By default, this value is set to 0 (meaning no tokens are kept). Use `-1` to retain all tokens from the initial prompt.

-By utilizing context management options like `--ctx_size` and `--keep`, you can maintain a more coherent and consistent interaction with the LLaMA models, ensuring that the generated text remains relevant to the original prompt or conversation.
+By utilizing context management options like `--ctx-size` and `--keep`, you can maintain a more coherent and consistent interaction with the LLaMA models, ensuring that the generated text remains relevant to the original prompt or conversation.

 ## Generation Flags

@@ -154,11 +154,11 @@ The following options allow you to control the text generation process and fine-

 ### Number of Tokens to Predict

-   `-n N, --n_predict N`: Set the number of tokens to predict when generating text (default: 128, -1 = infinity).
+-   `-n N, --n-predict N`: Set the number of tokens to predict when generating text (default: 128, -1 = infinity).

-The `--n_predict` option controls the number of tokens the model generates in response to the input prompt. By adjusting this value, you can influence the length of the generated text. A higher value will result in longer text, while a lower value will produce shorter text. A value of -1 will cause text to be generated without limit.
+The `--n-predict` option controls the number of tokens the model generates in response to the input prompt. By adjusting this value, you can influence the length of the generated text. A higher value will result in longer text, while a lower value will produce shorter text. A value of -1 will cause text to be generated without limit.

-It is important to note that the generated text may be shorter than the specified number of tokens if an End-of-Sequence (EOS) token or a reverse prompt is encountered. In interactive mode text generation will pause and control will be returned to the user. In non-interactive mode, the program will end. In both cases, the text generation may stop before reaching the specified `n_predict` value. If you want the model to keep going without ever producing End-of-Sequence on its own, you can use the `--ignore-eos` parameter.
+It is important to note that the generated text may be shorter than the specified number of tokens if an End-of-Sequence (EOS) token or a reverse prompt is encountered. In interactive mode text generation will pause and control will be returned to the user. In non-interactive mode, the program will end. In both cases, the text generation may stop before reaching the specified `n-predict` value. If you want the model to keep going without ever producing End-of-Sequence on its own, you can use the `--ignore-eos` parameter.

 ### Temperature

@@ -170,33 +170,33 @@ Example usage: `--temp 0.5`

 ### Repeat Penalty

-   `--repeat_penalty N`: Control the repetition of token sequences in the generated text (default: 1.1).
-   `--repeat_last_n N`: Last n tokens to consider for penalizing repetition (default: 64, 0 = disabled, -1 = ctx_size).
+-   `--repeat-penalty N`: Control the repetition of token sequences in the generated text (default: 1.1).
+-   `--repeat-last-n N`: Last n tokens to consider for penalizing repetition (default: 64, 0 = disabled, -1 = ctx-size).
 -   `--no-penalize-nl`: Disable penalization for newline tokens when applying the repeat penalty.

-The `repeat_penalty` option helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. The default value is 1.1.
+The `repeat-penalty` option helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. The default value is 1.1.

-The `repeat_last_n` option controls the number of tokens in the history to consider for penalizing repetition. A larger value will look further back in the generated text to prevent repetitions, while a smaller value will only consider recent tokens. A value of 0 disables the penalty, and a value of -1 sets the number of tokens considered equal to the context size (`ctx_size`).
+The `repeat-last-n` option controls the number of tokens in the history to consider for penalizing repetition. A larger value will look further back in the generated text to prevent repetitions, while a smaller value will only consider recent tokens. A value of 0 disables the penalty, and a value of -1 sets the number of tokens considered equal to the context size (`ctx-size`).

 Use the `--no-penalize-nl` option to disable newline penalization when applying the repeat penalty. This option is particularly useful for generating chat conversations, dialogues, code, poetry, or any text where newline tokens play a significant role in structure and formatting. Disabling newline penalization helps maintain the natural flow and intended formatting in these specific use cases.

-Example usage: `--repeat_penalty 1.15 --repeat_last_n 128 --no-penalize-nl`
+Example usage: `--repeat-penalty 1.15 --repeat-last-n 128 --no-penalize-nl`

 ### Top-K Sampling

-   `--top_k N`: Limit the next token selection to the K most probable tokens (default: 40).
+-   `--top-k N`: Limit the next token selection to the K most probable tokens (default: 40).

-Top-k sampling is a text generation method that selects the next token only from the top k most likely tokens predicted by the model. It helps reduce the risk of generating low-probability or nonsensical tokens, but it may also limit the diversity of the output. A higher value for top_k (e.g., 100) will consider more tokens and lead to more diverse text, while a lower value (e.g., 10) will focus on the most probable tokens and generate more conservative text. The default value is 40.
+Top-k sampling is a text generation method that selects the next token only from the top k most likely tokens predicted by the model. It helps reduce the risk of generating low-probability or nonsensical tokens, but it may also limit the diversity of the output. A higher value for top-k (e.g., 100) will consider more tokens and lead to more diverse text, while a lower value (e.g., 10) will focus on the most probable tokens and generate more conservative text. The default value is 40.

-Example usage: `--top_k 30`
+Example usage: `--top-k 30`

 ### Top-P Sampling

-   `--top_p N`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.9).
+-   `--top-p N`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.9).

-Top-p sampling, also known as nucleus sampling, is another text generation method that selects the next token from a subset of tokens that together have a cumulative probability of at least p. This method provides a balance between diversity and quality by considering both the probabilities of tokens and the number of tokens to sample from. A higher value for top_p (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. The default value is 0.9.
+Top-p sampling, also known as nucleus sampling, is another text generation method that selects the next token from a subset of tokens that together have a cumulative probability of at least p. This method provides a balance between diversity and quality by considering both the probabilities of tokens and the number of tokens to sample from. A higher value for top-p (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. The default value is 0.9.

-Example usage: `--top_p 0.95`
+Example usage: `--top-p 0.95`

 ### Tail Free Sampling (TFS)

@@ -217,16 +217,16 @@ Example usage: `--typical 0.9`
 ### Mirostat Sampling

 -   `--mirostat N`: Enable Mirostat sampling, controlling perplexity during text generation (default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0).
-   `--mirostat_lr N`: Set the Mirostat learning rate, parameter eta (default: 0.1).
-   `--mirostat_ent N`: Set the Mirostat target entropy, parameter tau (default: 5.0).
+-   `--mirostat-lr N`: Set the Mirostat learning rate, parameter eta (default: 0.1).
+-   `--mirostat-ent N`: Set the Mirostat target entropy, parameter tau (default: 5.0).

 Mirostat is an algorithm that actively maintains the quality of generated text within a desired range during text generation. It aims to strike a balance between coherence and diversity, avoiding low-quality output caused by excessive repetition (boredom traps) or incoherence (confusion traps).

-The `--mirostat_lr` option sets the Mirostat learning rate (eta). The learning rate influences how quickly the algorithm responds to feedback from the generated text. A lower learning rate will result in slower adjustments, while a higher learning rate will make the algorithm more responsive. The default value is `0.1`.
+The `--mirostat-lr` option sets the Mirostat learning rate (eta). The learning rate influences how quickly the algorithm responds to feedback from the generated text. A lower learning rate will result in slower adjustments, while a higher learning rate will make the algorithm more responsive. The default value is `0.1`.

-The `--mirostat_ent` option sets the Mirostat target entropy (tau), which represents the desired perplexity value for the generated text. Adjusting the target entropy allows you to control the balance between coherence and diversity in the generated text. A lower value will result in more focused and coherent text, while a higher value will lead to more diverse and potentially less coherent text. The default value is `5.0`.
+The `--mirostat-ent` option sets the Mirostat target entropy (tau), which represents the desired perplexity value for the generated text. Adjusting the target entropy allows you to control the balance between coherence and diversity in the generated text. A lower value will result in more focused and coherent text, while a higher value will lead to more diverse and potentially less coherent text. The default value is `5.0`.

-Example usage: `--mirostat 2 --mirostat_lr 0.05 --mirostat_ent 3.0`
+Example usage: `--mirostat 2 --mirostat-lr 0.05 --mirostat-ent 3.0`

 ### Logit Bias

@@ -264,15 +264,15 @@ These options help improve the performance and memory usage of the LLaMA models.

 ### Memory Float 32

-   `--memory_f32`: Use 32-bit floats instead of 16-bit floats for memory key+value, allowing higher quality inference at the cost of higher memory usage.
+-   `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. This doubles the context memory requirement and cached prompt file size but does not appear to increase generation quality in a measurable way. Not recommended.

 ### Batch Size

-   `-b N, --batch_size N`: Set the batch size for prompt processing (default: 512). This large batch size benefits users who have BLAS installed and enabled it during the build. If you don't have BLAS enabled ("BLAS=0"), you can use a smaller number, such as 8, to see the prompt progress as it's evaluated in some situations.
+-   `-b N, --batch-size N`: Set the batch size for prompt processing (default: 512). This large batch size benefits users who have BLAS installed and enabled it during the build. If you don't have BLAS enabled ("BLAS=0"), you can use a smaller number, such as 8, to see the prompt progress as it's evaluated in some situations.

 ### Prompt Caching

-   `--prompt-cache FNAME`: Specify a file to cache the model state after the initial prompt. This can significantly speed up the startup time when you're using longer prompts. The file is created during the first run and is reused and updated in subsequent runs.
+-   `--prompt-cache FNAME`: Specify a file to cache the model state after the initial prompt. This can significantly speed up the startup time when you're using longer prompts. The file is created during the first run and is reused and updated in subsequent runs. **Note**: Restoring a cached prompt does not imply restoring the exact state of the session at the point it was saved. So even when specifying a specific seed, you are not guaranteed to get the same sequence of tokens as the original generation.

 ### Quantization

@@ -285,5 +285,9 @@ These options provide extra functionality and customization when running the LLa
 -   `-h, --help`: Display a help message showing all available options and their default values. This is particularly useful for checking the latest options and default values, as they can change frequently, and the information in this document may become outdated.
 -   `--verbose-prompt`: Print the prompt before generating text.
 -   `--mtest`: Test the model's functionality by running a series of tests to ensure it's working properly.
+-   `-ngl N, --n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
+-   `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
+-   `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
+-   `-lv, --low-vram`: Do not allocate a VRAM scratch buffer for holding temporary results. Reduces VRAM usage at the cost of performance, particularly prompt processing speed. Requires cuBLAS.
 -   `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
 -   `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -50,7 +50,6 @@ void sigint_handler(int signo) {

 int main(int argc, char ** argv) {
    gpt_params params;
-    params.model = "models/llama-7B/ggml-model.bin";

    if (gpt_params_parse(argc, argv, params) == false) {
        return 1;
@@ -82,6 +81,9 @@ int main(int argc, char ** argv) {
    if (params.n_ctx > 2048) {
        fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
                "expect poor results\n", __func__, params.n_ctx);
+    } else if (params.n_ctx < 8) {
+        fprintf(stderr, "%s: warning: minimum context size is 8, using minimum size.\n", __func__);
+        params.n_ctx = 8;
    }

    fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
@@ -97,8 +99,7 @@ int main(int argc, char ** argv) {
        params.prompt = gpt_random_prompt(rng);
    }

-//    params.prompt = R"(// this function checks if the number n is prime
-//bool is_prime(int n) {)";
+    llama_init_backend();

    llama_context * ctx;
    g_ctx = &ctx;
@@ -121,7 +122,7 @@ int main(int argc, char ** argv) {
    // uncomment the "used_mem" line in llama.cpp to see the results
    if (params.mem_test) {
        {
-            const std::vector<llama_token> tmp(params.n_batch, 0);
+            const std::vector<llama_token> tmp(params.n_batch, llama_token_bos());
            llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads);
        }

@@ -136,8 +137,13 @@ int main(int argc, char ** argv) {
        return 0;
    }

-    // Add a space in front of the first character to match OG llama tokenizer behavior
-    params.prompt.insert(0, 1, ' ');
+    // export the cgraph and exit
+    if (params.export_cgraph) {
+        llama_eval_export(ctx, "llama.ggml");
+        llama_free(ctx);
+
+        return 0;
+    }

    std::string path_session = params.path_prompt_cache;
    std::vector<llama_token> session_tokens;
@@ -157,6 +163,7 @@ int main(int argc, char ** argv) {
                return 1;
            }
            session_tokens.resize(n_token_count_out);
+            llama_set_rng_seed(ctx, params.seed);

            fprintf(stderr, "%s: loaded a session with prompt size of %d tokens\n", __func__, (int) session_tokens.size());
        } else {
@@ -165,7 +172,16 @@ int main(int argc, char ** argv) {
    }

    // tokenize the prompt
-    auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);
+    std::vector<llama_token> embd_inp;
+
+    if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
+        // Add a space in front of the first character to match OG llama tokenizer behavior
+        params.prompt.insert(0, 1, ' ');
+
+        embd_inp = ::llama_tokenize(ctx, params.prompt, true);
+    } else {
+        embd_inp = session_tokens;
+    }

    const int n_ctx = llama_n_ctx(ctx);

@@ -183,7 +199,9 @@ int main(int argc, char ** argv) {
            }
            n_matching_session_tokens++;
        }
-        if (n_matching_session_tokens >= embd_inp.size()) {
+        if (params.prompt.empty() && n_matching_session_tokens == embd_inp.size()) {
+            fprintf(stderr, "%s: using full prompt from session file\n", __func__);
+        } else if (n_matching_session_tokens >= embd_inp.size()) {
            fprintf(stderr, "%s: session file has exact match for prompt!\n", __func__);
        } else if (n_matching_session_tokens < (embd_inp.size() / 2)) {
            fprintf(stderr, "%s: warning: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n",
@@ -194,6 +212,13 @@ int main(int argc, char ** argv) {
        }
    }

+    // if we will use the cache for the full prompt without reaching the end of the cache, force
+    // reevaluation of the last token token to recalculate the cached logits
+    if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() &&
+            session_tokens.size() > embd_inp.size()) {
+        session_tokens.resize(embd_inp.size() - 1);
+    }
+
    // number of tokens to keep when resetting context
    if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size() || params.instruct) {
        params.n_keep = (int)embd_inp.size();
@@ -209,8 +234,8 @@ int main(int argc, char ** argv) {
        params.antiprompt.push_back("### Instruction:\n\n");
    }

-    // enable interactive mode if reverse prompt or interactive start is specified
-    if (params.antiprompt.size() != 0 || params.interactive_first) {
+    // enable interactive mode if interactive start is specified
+    if (params.interactive_first) {
        params.interactive = true;
    }

@@ -242,7 +267,7 @@ int main(int argc, char ** argv) {
        sigint_action.sa_flags = 0;
        sigaction(SIGINT, &sigint_action, NULL);
 #elif defined (_WIN32)
-        auto console_ctrl_handler = [](DWORD ctrl_type) -> BOOL {
+        auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
            return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false;
        };
        SetConsoleCtrlHandler(static_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
@@ -306,9 +331,29 @@ int main(int argc, char ** argv) {

    std::vector<llama_token> embd;

-    while (n_remain != 0 || params.interactive) {
+    // do one empty run to warm up the model
+    {
+        const std::vector<llama_token> tmp = { llama_token_bos(), };
+        llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads);
+        llama_reset_timings(ctx);
+    }
+
+    while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
        // predict
        if (embd.size() > 0) {
+            // Note: n_ctx - 4 here is to match the logic for commandline prompt handling via
+            // --prompt or --file which uses the same value.
+            auto max_embd_size = n_ctx - 4;
+            // Ensure the input doesn't exceed the context size by truncating embd if necessary.
+            if ((int)embd.size() > max_embd_size) {
+                auto skipped_tokens = embd.size() - max_embd_size;
+                console_set_color(con_st, CONSOLE_COLOR_ERROR);
+                printf("<<input too long: skipped %ld token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
+                console_set_color(con_st, CONSOLE_COLOR_DEFAULT);
+                fflush(stdout);
+                embd.resize(max_embd_size);
+            }
+
            // infinite text generation via context swapping
            // if we run out of context:
            // - take the n_keep first tokens from the original prompt (via n_past)
@@ -395,7 +440,7 @@ int main(int argc, char ** argv) {
            const bool    penalize_nl     = params.penalize_nl;

            // optionally save the session on first sample (for faster prompt loading next time)
-            if (!path_session.empty() && need_to_save_session) {
+            if (!path_session.empty() && need_to_save_session && !params.prompt_cache_ro) {
                need_to_save_session = false;
                llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
            }
@@ -504,9 +549,8 @@ int main(int argc, char ** argv) {
            console_set_color(con_st, CONSOLE_COLOR_DEFAULT);
        }

-        // in interactive mode, and not currently processing queued inputs;
-        // check if we should prompt the user for more
-        if (params.interactive && (int) embd_inp.size() <= n_consumed) {
+        // if not currently processing queued inputs;
+        if ((int) embd_inp.size() <= n_consumed) {

            // check for reverse prompt
            if (params.antiprompt.size()) {
@@ -517,10 +561,21 @@ int main(int argc, char ** argv) {

                is_antiprompt = false;
                // Check if each of the reverse prompts appears at the end of the output.
+                // If we're not running interactively, the reverse prompt might be tokenized with some following characters
+                // so we'll compensate for that by widening the search window a bit.
                for (std::string & antiprompt : params.antiprompt) {
-                    if (last_output.find(antiprompt.c_str(), last_output.length() - antiprompt.length(), antiprompt.length()) != std::string::npos) {
-                        is_interacting = true;
+                    size_t extra_padding = params.interactive ? 0 : 2;
+                    size_t search_start_pos = last_output.length() > static_cast<size_t>(antiprompt.length() + extra_padding)
+                        ? last_output.length() - static_cast<size_t>(antiprompt.length() + extra_padding)
+                        : 0;
+
+                    if (last_output.find(antiprompt.c_str(), search_start_pos) != std::string::npos) {
+                        if (params.interactive) {
+                            is_interacting = true;
+                            console_set_color(con_st, CONSOLE_COLOR_USER_INPUT);
+                        }
                        is_antiprompt = true;
+                        fflush(stdout);
                        break;
                    }
                }
@@ -598,7 +653,7 @@ int main(int argc, char ** argv) {
        }
    }

-    if (!path_session.empty() && params.prompt_cache_all) {
+    if (!path_session.empty() && params.prompt_cache_all && !params.prompt_cache_ro) {
        fprintf(stderr, "\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str());
        llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
    }
--- a/examples/metal/CMakeLists.txt
+++ b/examples/metal/CMakeLists.txt
@@ -0,0 +1,3 @@
+set(TEST_TARGET metal)
+add_executable(${TEST_TARGET} metal.cpp)
+target_link_libraries(${TEST_TARGET} PRIVATE ggml)
--- a/examples/metal/metal.cpp
+++ b/examples/metal/metal.cpp
@@ -0,0 +1,102 @@
+// Evaluate a statically exported ggml computation graph with Metal
+//
+// - First, export a LLaMA graph:
+//
+//  $ ./bin/main -m ../models/7B/ggml-model-q4_0.bin --export
+//
+// - Run this tool to evaluate the exported graph:
+//
+//  $ ./bin/metal llama.ggml
+//
+// The purpose of this tool is mostly for debugging and demonstration purposes.
+// The main limitation of exporting computation graphs is that their sizes are static which often
+// can be a problem for real-world applications.
+//
+
+#include "ggml.h"
+#include "ggml-metal.h"
+
+#include <cstdio>
+#include <cstring>
+#include <cstdlib>
+
+int main(int argc, char ** argv) {
+    ggml_time_init();
+
+    if (argc != 2) {
+        fprintf(stderr, "Usage: %s llama.ggml\n", argv[0]);
+        return -1;
+    }
+
+    const char * fname_cgraph = argv[1];
+
+    // load the compute graph
+    struct ggml_context * ctx_data = NULL;
+    struct ggml_context * ctx_eval = NULL;
+
+    struct ggml_cgraph gf = ggml_graph_import(fname_cgraph, &ctx_data, &ctx_eval);
+    gf.n_threads = 1;
+
+    // this allocates all Metal resources and memory buffers
+    auto * ctx_metal = ggml_metal_init();
+
+    ggml_metal_add_buffer(ctx_metal, "data", ggml_get_mem_buffer(ctx_data), ggml_get_mem_size(ctx_data));
+    ggml_metal_add_buffer(ctx_metal, "eval", ggml_get_mem_buffer(ctx_eval), ggml_get_mem_size(ctx_eval));
+
+    // main
+    {
+        struct ggml_tensor * input = ggml_graph_get_tensor(&gf, "embd");
+        *(int32_t *) input->data = 1; // BOS
+
+        ggml_metal_set_tensor(ctx_metal, input);
+
+        // warmup
+        ggml_metal_graph_compute(ctx_metal, &gf);
+
+        const int n_iter = 16;
+
+        const int64_t t0 = ggml_time_us();
+
+        // the actual inference happens here
+        for (int i = 0; i < n_iter; ++i) {
+            ggml_metal_graph_compute(ctx_metal, &gf);
+        }
+
+        const int64_t t1 = ggml_time_us();
+
+        printf("time: %.2f ms, %.2f ms/tok\n", (t1 - t0) / 1000.0, (t1 - t0) / 1000.0 / n_iter);
+    }
+
+    // debug output
+    {
+        struct ggml_tensor * logits = gf.nodes[gf.n_nodes - 1];
+        ggml_metal_get_tensor(ctx_metal, logits);
+
+        float * ptr = (float *) ggml_get_data(logits);
+
+        printf("logits: ");
+        for (int i = 0; i < 10; i++) {
+            printf("%8.4f ", ptr[i]);
+        }
+        printf("\n");
+        int imax = 0;
+        double sum = 0.0;
+        double vmax = -1e9;
+        for (int i = 0; i < 32000; i++) {
+            sum += (double) ptr[i];
+            if (ptr[i] > vmax) {
+                vmax = ptr[i];
+                imax = i;
+            }
+        }
+        printf("sum: %f, imax = %d, vmax = %f\n", sum, imax, vmax);
+    }
+
+    ggml_metal_free(ctx_metal);
+
+    ggml_free(ctx_data);
+    ggml_free(ctx_eval);
+
+    return 0;
+}
+
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -116,7 +116,6 @@ void perplexity(llama_context * ctx, const gpt_params & params) {

 int main(int argc, char ** argv) {
    gpt_params params;
-    params.model = "models/llama-7B/ggml-model.bin";

    params.n_batch = 512;
    if (gpt_params_parse(argc, argv, params) == false) {
@@ -144,6 +143,8 @@ int main(int argc, char ** argv) {
        params.prompt = gpt_random_prompt(rng);
    }

+    llama_init_backend();
+
    llama_context * ctx;

    // load the model and apply lora adapter, if any
--- a/examples/quantize-stats/quantize-stats.cpp
+++ b/examples/quantize-stats/quantize-stats.cpp
@@ -282,8 +282,9 @@ int main(int argc, char ** argv) {
                break;
            }
            int j;
-            for (j = 0; j < GGML_TYPE_COUNT && strcmp(argv[i], ggml_type_name((ggml_type) j)) != 0; j++) {
-                // find match
+            for (j = 0; j < GGML_TYPE_COUNT; ++j) {
+               const auto * name = ggml_type_name((ggml_type) j);
+               if (name && strcmp(argv[i], name) == 0) break;
            }
            if (j < GGML_TYPE_COUNT) {
                params.include_types.push_back((ggml_type) j);
@@ -321,7 +322,6 @@ int main(int argc, char ** argv) {
        auto lparams = llama_context_default_params();

        lparams.n_ctx      = 256;
-        lparams.n_parts    = 1;
        lparams.seed       = 1;
        lparams.f16_kv     = false;
        lparams.use_mlock  = false;
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -1,33 +1,138 @@
-#include "ggml.h"
-#include "llama.h"
 #include "build-info.h"

+#include "llama.h"
+
 #include <cstdio>
-#include <map>
+#include <cstring>
+#include <vector>
 #include <string>

-static const std::map<std::string, llama_ftype> LLAMA_FTYPE_MAP = {
-  {"q4_0", LLAMA_FTYPE_MOSTLY_Q4_0},
-  {"q4_1", LLAMA_FTYPE_MOSTLY_Q4_1},
-  {"q5_0", LLAMA_FTYPE_MOSTLY_Q5_0},
-  {"q5_1", LLAMA_FTYPE_MOSTLY_Q5_1},
-  {"q8_0", LLAMA_FTYPE_MOSTLY_Q8_0},
+struct quant_option {
+    std::string name;
+    llama_ftype ftype;
+    std::string desc;
 };

-bool try_parse_ftype(const std::string & ftype_str, llama_ftype & ftype, std::string & ftype_str_out) {
-    auto it = LLAMA_FTYPE_MAP.find(ftype_str);
-    if (it != LLAMA_FTYPE_MAP.end()) {
-        ftype = it->second;
-        ftype_str_out = it->first;
-        return true;
+static const std::vector<struct quant_option> QUANT_OPTIONS = {
+    {
+        "Q4_0",
+        LLAMA_FTYPE_MOSTLY_Q4_0,
+        " 3.50G, +0.2499 ppl @ 7B - small, very high quality loss - legacy, prefer using Q3_K_M",
+    },
+    {
+        "Q4_1",
+        LLAMA_FTYPE_MOSTLY_Q4_1,
+        " 3.90G, +0.1846 ppl @ 7B - small, substantial quality loss - legacy, prefer using Q3_K_L",
+    },
+    {
+        "Q5_0",
+        LLAMA_FTYPE_MOSTLY_Q5_0,
+        " 4.30G, +0.0796 ppl @ 7B - medium, balanced quality - legacy, prefer using Q4_K_M",
+    },
+    {
+        "Q5_1",
+        LLAMA_FTYPE_MOSTLY_Q5_1,
+        " 4.70G, +0.0415 ppl @ 7B - medium, low quality loss - legacy, prefer using Q5_K_M",
+    },
+#ifdef GGML_USE_K_QUANTS
+    {
+        "Q2_K",
+        LLAMA_FTYPE_MOSTLY_Q2_K,
+        " 2.67G, +0.8698 ppl @ 7B - smallest, extreme quality loss - not recommended",
+    },
+    {
+        "Q3_K",
+        LLAMA_FTYPE_MOSTLY_Q3_K_M,
+        "alias for Q3_K_M"
+    },
+    {
+        "Q3_K_S",
+        LLAMA_FTYPE_MOSTLY_Q3_K_S,
+        " 2.75G, +0.5505 ppl @ 7B - very small, very high quality loss",
+    },
+    {
+        "Q3_K_M",
+        LLAMA_FTYPE_MOSTLY_Q3_K_M,
+        " 3.06G, +0.2437 ppl @ 7B - very small, very high quality loss",
+    },
+    {
+        "Q3_K_L",
+        LLAMA_FTYPE_MOSTLY_Q3_K_L,
+        " 3.35G, +0.1803 ppl @ 7B - small, substantial quality loss",
+    },
+    {
+        "Q4_K",
+        LLAMA_FTYPE_MOSTLY_Q4_K_M,
+        "alias for Q4_K_M",
+    },
+    {
+        "Q4_K_S",
+        LLAMA_FTYPE_MOSTLY_Q4_K_S,
+        " 3.56G, +0.1149 ppl @ 7B - small, significant quality loss",
+    },
+    {
+        "Q4_K_M",
+        LLAMA_FTYPE_MOSTLY_Q4_K_M,
+        " 3.80G, +0.0535 ppl @ 7B - medium, balanced quality - *recommended*",
+    },
+    {
+        "Q5_K",
+        LLAMA_FTYPE_MOSTLY_Q5_K_M,
+        "alias for Q5_K_M",
+    },
+    {
+        "Q5_K_S",
+        LLAMA_FTYPE_MOSTLY_Q5_K_S,
+        " 4.33G, +0.0353 ppl @ 7B - large, low quality loss - *recommended*",
+    },
+    {
+        "Q5_K_M",
+        LLAMA_FTYPE_MOSTLY_Q5_K_M,
+        " 4.45G, +0.0142 ppl @ 7B - large, very low quality loss - *recommended*",
+    },
+    {
+        "Q6_K",
+        LLAMA_FTYPE_MOSTLY_Q6_K,
+        " 5.15G, +0.0044 ppl @ 7B - very large, extremely low quality loss",
+    },
+#endif
+    {
+        "Q8_0",
+        LLAMA_FTYPE_MOSTLY_Q8_0,
+        " 6.70G, +0.0004 ppl @ 7B - very large, extremely low quality loss - not recommended",
+    },
+    {
+        "F16",
+        LLAMA_FTYPE_MOSTLY_F16,
+        "13.00G              @ 7B - extremely large, virtually no quality loss - not recommended",
+    },
+    {
+        "F32",
+        LLAMA_FTYPE_ALL_F32,
+        "26.00G              @ 7B - absolutely huge, lossless - not recommended",
+    },
+};
+
+
+bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) {
+    std::string ftype_str;
+
+    for (auto ch : ftype_str_in) {
+        ftype_str.push_back(std::toupper(ch));
+    }
+    for (auto & it : QUANT_OPTIONS) {
+        if (it.name == ftype_str) {
+            ftype = it.ftype;
+            ftype_str_out = it.name;
+            return true;
+        }
    }
-    // try to parse as an integer
    try {
        int ftype_int = std::stoi(ftype_str);
-        for (auto it = LLAMA_FTYPE_MAP.begin(); it != LLAMA_FTYPE_MAP.end(); it++) {
-            if (it->second == ftype_int) {
-                ftype = it->second;
-                ftype_str_out = it->first;
+        for (auto & it : QUANT_OPTIONS) {
+            if (it.ftype == ftype_int) {
+                ftype = it.ftype;
+                ftype_str_out = it.name;
                return true;
            }
        }
@@ -39,36 +144,51 @@ bool try_parse_ftype(const std::string & ftype_str, llama_ftype & ftype, std::st
 }

 // usage:
-//  ./quantize models/llama/ggml-model.bin [models/llama/ggml-model-quant.bin] type [nthreads]
+//  ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.bin [models/llama/ggml-model-quant.bin] type [nthreads]
 //
+void usage(const char * executable) {
+    fprintf(stderr, "usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.bin [model-quant.bin] type [nthreads]\n\n", executable);
+    fprintf(stderr, "  --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
+    fprintf(stderr, "  --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
+    fprintf(stderr, "\nAllowed quantization types:\n");
+    for (auto & it : QUANT_OPTIONS) {
+        printf("  %2d  or  %-6s : %s\n", it.ftype, it.name.c_str(), it.desc.c_str());
+    }
+    exit(1);
+}
+
 int main(int argc, char ** argv) {
-    ggml_time_init();
-
    if (argc < 3) {
-        fprintf(stderr, "usage: %s model-f32.bin [model-quant.bin] type [nthreads]\n", argv[0]);
-        for (auto it = LLAMA_FTYPE_MAP.begin(); it != LLAMA_FTYPE_MAP.end(); it++) {
-            fprintf(stderr, "  type = \"%s\" or %d\n", it->first.c_str(), it->second);
-        }
-        return 1;
+        usage(argv[0]);
    }

-    // needed to initialize f16 tables
-    {
-        struct ggml_init_params params = { 0, NULL, false };
-        struct ggml_context * ctx = ggml_init(params);
-        ggml_free(ctx);
+    llama_model_quantize_params params = llama_model_quantize_default_params();
+
+    int arg_idx = 1;
+
+    for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
+        if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) {
+            params.quantize_output_tensor = false;
+        } else if (strcmp(argv[arg_idx], "--allow-requantize") == 0) {
+            params.allow_requantize = true;
+        } else {
+            usage(argv[0]);
+        }
    }

+    if (argc - arg_idx < 3) {
+        usage(argv[0]);
+    }
+
+    llama_init_backend();
+
    // parse command line arguments
-    const std::string fname_inp = argv[1];
+    const std::string fname_inp = argv[arg_idx];
+    arg_idx++;
    std::string fname_out;
-    int nthread;
-    llama_ftype ftype;

-    int arg_idx = 2;
    std::string ftype_str;
-    if (try_parse_ftype(argv[arg_idx], ftype, ftype_str)) {
-        // argv[2] is the ftype
+    if (try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
        std::string fpath;
        const size_t pos = fname_inp.find_last_of('/');
        if (pos != std::string::npos) {
@@ -79,7 +199,6 @@ int main(int argc, char ** argv) {
        arg_idx++;
    }
    else {
-        // argv[2] is the output path
        fname_out = argv[arg_idx];
        arg_idx++;

@@ -87,8 +206,7 @@ int main(int argc, char ** argv) {
            fprintf(stderr, "%s: missing ftype\n", __func__);
            return 1;
        }
-        // argv[3] is the ftype
-        if (!try_parse_ftype(argv[arg_idx], ftype, ftype_str)) {
+        if (!try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
            fprintf(stderr, "%s: invalid ftype '%s'\n", __func__, argv[3]);
            return 1;
        }
@@ -98,43 +216,41 @@ int main(int argc, char ** argv) {
    // parse nthreads
    if (argc > arg_idx) {
        try {
-            nthread = std::stoi(argv[arg_idx]);
+            params.nthread = std::stoi(argv[arg_idx]);
        }
        catch (const std::exception & e) {
            fprintf(stderr, "%s: invalid nthread '%s' (%s)\n", __func__, argv[arg_idx], e.what());
            return 1;
        }
-    } else {
-        nthread = 0;
    }

    fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);

    fprintf(stderr, "%s: quantizing '%s' to '%s' as %s", __func__, fname_inp.c_str(), fname_out.c_str(), ftype_str.c_str());
-    if (nthread > 0) {
-        fprintf(stderr, " using %d threads", nthread);
+    if (params.nthread > 0) {
+        fprintf(stderr, " using %d threads", params.nthread);
    }
    fprintf(stderr, "\n");

-    const int64_t t_main_start_us = ggml_time_us();
+    const int64_t t_main_start_us = llama_time_us();

    int64_t t_quantize_us = 0;

    // load the model
    {
-        const int64_t t_start_us = ggml_time_us();
+        const int64_t t_start_us = llama_time_us();

-        if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), ftype, nthread)) {
+        if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), &params)) {
            fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
            return 1;
        }

-        t_quantize_us = ggml_time_us() - t_start_us;
+        t_quantize_us = llama_time_us() - t_start_us;
    }

    // report timing
    {
-        const int64_t t_main_end_us = ggml_time_us();
+        const int64_t t_main_end_us = llama_time_us();

        printf("\n");
        printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0);
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@@ -8,7 +8,6 @@

 int main(int argc, char ** argv) {
    gpt_params params;
-    params.model = "models/llama-7B/ggml-model.bin";
    params.seed = 42;
    params.n_threads = 4;
    params.repeat_last_n = 64;
@@ -27,7 +26,6 @@ int main(int argc, char ** argv) {
    auto lparams = llama_context_default_params();

    lparams.n_ctx     = params.n_ctx;
-    lparams.n_parts   = params.n_parts;
    lparams.seed      = params.seed;
    lparams.f16_kv    = params.memory_f16;
    lparams.use_mmap  = params.use_mmap;
--- a/examples/server/CMakeLists.txt
+++ b/examples/server/CMakeLists.txt
@@ -0,0 +1,8 @@
+set(TARGET server)
+include_directories(${CMAKE_CURRENT_SOURCE_DIR})
+add_executable(${TARGET} server.cpp json.hpp httplib.h)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
+if(TARGET BUILD_INFO)
+  add_dependencies(${TARGET} BUILD_INFO)
+endif()
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -0,0 +1,319 @@
+# llama.cpp/example/server
+
+This example allow you to have a llama.cpp http server to interact from a web page or consume the API.
+
+## Table of Contents
+
+1. [Quick Start](#quick-start)
+2. [Node JS Test](#node-js-test)
+3. [API Endpoints](#api-endpoints)
+4. [More examples](#more-examples)
+5. [Common Options](#common-options)
+6. [Performance Tuning and Memory Options](#performance-tuning-and-memory-options)
+
+## Quick Start
+
+To get started right away, run the following command, making sure to use the correct path for the model you have:
+
+#### Unix-based systems (Linux, macOS, etc.):
+Make sure to build with the server option on
+```bash
+LLAMA_BUILD_SERVER=1 make
+```
+
+```bash
+./server -m models/7B/ggml-model.bin --ctx_size 2048
+```
+
+#### Windows:
+
+```powershell
+server.exe -m models\7B\ggml-model.bin --ctx_size 2048
+```
+
+That will start a server that by default listens on `127.0.0.1:8080`. You can consume the endpoints with Postman or NodeJS with axios library.
+
+## Node JS Test
+
+You need to have [Node.js](https://nodejs.org/en) installed.
+
+```bash
+mkdir llama-client
+cd llama-client
+npm init
+npm install axios
+```
+
+Create a index.js file and put inside this:
+
+```javascript
+const axios = require("axios");
+
+const prompt = `Building a website can be done in 10 simple steps:`;
+
+async function Test() {
+    let result = await axios.post("http://127.0.0.1:8080/completion", {
+        prompt,
+        batch_size: 128,
+        n_predict: 512,
+    });
+
+    // the response is received until completion finish
+    console.log(result.data.content);
+}
+
+Test();
+```
+
+And run it:
+
+```bash
+node .
+```
+
+## API Endpoints
+
+You can interact with this API Endpoints. This implementations just support chat style interaction.
+
+-   **POST** `hostname:port/completion`: Setting up the Llama Context to begin the completions tasks.
+
+*Options:*
+
+`batch_size`: Set the batch size for prompt processing (default: 512).
+
+`temperature`: Adjust the randomness of the generated text (default: 0.8).
+
+`top_k`: Limit the next token selection to the K most probable tokens (default: 40).
+
+`top_p`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.9).
+
+`n_predict`: Set the number of tokens to predict when generating text (default: 128, -1 = infinity).
+
+`threads`: Set the number of threads to use during computation.
+
+`n_keep`: Specify the number of tokens from the initial prompt to retain when the model resets its internal context. By default, this value is set to 0 (meaning no tokens are kept). Use `-1` to retain all tokens from the initial prompt.
+
+`as_loop`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`.
+
+`interactive`: It allows interacting with the completion, and the completion stops as soon as it encounters a `stop word`. To enable this, set to `true`.
+
+`prompt`: Provide a prompt. Internally, the prompt is compared, and it detects if a part has already been evaluated, and the remaining part will be evaluate.
+
+`stop`: Specify the words or characters that indicate a stop. These words will not be included in the completion, so make sure to add them to the prompt for the next iteration.
+
+`exclude`: Specify the words or characters you do not want to appear in the completion. These words will not be included in the completion, so make sure to add them to the prompt for the next iteration.
+
+-   **POST** `hostname:port/embedding`: Generate embedding of a given text
+
+*Options:*
+
+`content`: Set the text to get generate the embedding.
+
+`threads`: Set the number of threads to use during computation.
+
+To use this endpoint, you need to start the server with the `--embedding` option added.
+
+-   **POST** `hostname:port/tokenize`: Tokenize a given text
+
+*Options:*
+
+`content`: Set the text to tokenize.
+
+-   **GET** `hostname:port/next-token`: Receive the next token predicted, execute this request in a loop. Make sure set `as_loop` as `true` in the completion request.
+
+*Options:*
+
+`stop`: Set `hostname:port/next-token?stop=true` to stop the token generation.
+
+## More examples
+
+### Interactive mode
+
+This mode allows interacting in a chat-like manner. It is recommended for models designed as assistants such as `Vicuna`, `WizardLM`, `Koala`, among others. Make sure to add the correct stop word for the corresponding model.
+
+The prompt should be generated by you, according to the model's guidelines. You should keep adding the model's completions to the context as well.
+
+This example works well for `Vicuna - version 1`.
+
+```javascript
+const axios = require("axios");
+
+let prompt = `A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
+### Human: Hello, Assistant.
+### Assistant: Hello. How may I help you today?
+### Human: Please tell me the largest city in Europe.
+### Assistant: Sure. The largest city in Europe is Moscow, the capital of Russia.`;
+
+async function ChatCompletion(answer) {
+    // the user's next question to the prompt
+    prompt += `\n### Human: ${answer}\n`
+
+    result = await axios.post("http://127.0.0.1:8080/completion", {
+        prompt,
+        batch_size: 128,
+        temperature: 0.2,
+        top_k: 40,
+        top_p: 0.9,
+        n_keep: -1,
+        n_predict: 2048,
+        stop: ["\n### Human:"], // when detect this, stop completion
+        exclude: ["### Assistant:"], // no show in the completion
+        threads: 8,
+        as_loop: true, // use this to request the completion token by token
+        interactive: true, // enable the detection of a stop word
+    });
+
+    // create a loop to receive every token predicted
+    // note: this operation is blocking, avoid use this in a ui thread
+
+    let message = "";
+    while (true) {
+        // you can stop the inference adding '?stop=true' like this http://127.0.0.1:8080/next-token?stop=true
+        result = await axios.get("http://127.0.0.1:8080/next-token");
+        process.stdout.write(result.data.content);
+        message += result.data.content;
+
+        // to avoid an infinite loop
+        if (result.data.stop) {
+            console.log("Completed");
+            // make sure to add the completion to the prompt.
+            prompt += `### Assistant: ${message}`;
+            break;
+        }
+    }
+}
+
+// This function should be called every time a question to the model is needed.
+async function Test() {
+    // the server can't inference in paralell
+    await ChatCompletion("Write a long story about a time magician in a fantasy world");
+    await ChatCompletion("Summary the story");
+}
+
+Test();
+```
+
+### Alpaca example
+
+**Temporaly note:** no tested, if you have the model, please test it and report me some issue
+
+```javascript
+const axios = require("axios");
+
+let prompt = `Below is an instruction that describes a task. Write a response that appropriately completes the request.
+`;
+
+async function DoInstruction(instruction) {
+    prompt += `\n\n### Instruction:\n\n${instruction}\n\n### Response:\n\n`;
+    result = await axios.post("http://127.0.0.1:8080/completion", {
+        prompt,
+        batch_size: 128,
+        temperature: 0.2,
+        top_k: 40,
+        top_p: 0.9,
+        n_keep: -1,
+        n_predict: 2048,
+        stop: ["### Instruction:\n\n"], // when detect this, stop completion
+        exclude: [], // no show in the completion
+        threads: 8,
+        as_loop: true, // use this to request the completion token by token
+        interactive: true, // enable the detection of a stop word
+    });
+
+    // create a loop to receive every token predicted
+    // note: this operation is blocking, avoid use this in a ui thread
+
+    let message = "";
+    while (true) {
+        result = await axios.get("http://127.0.0.1:8080/next-token");
+        process.stdout.write(result.data.content);
+        message += result.data.content;
+
+        // to avoid an infinite loop
+        if (result.data.stop) {
+            console.log("Completed");
+            // make sure to add the completion and the user's next question to the prompt.
+            prompt += message;
+            break;
+        }
+    }
+}
+
+// This function should be called every time a instruction to the model is needed.
+DoInstruction("Destroy the world"); // as joke
+```
+
+### Embeddings
+
+First, run the server with `--embedding` option:
+
+```bash
+server -m models/7B/ggml-model.bin --ctx_size 2048 --embedding
+```
+
+Run this code in NodeJS:
+
+```javascript
+const axios = require('axios');
+
+async function Test() {
+    let result = await axios.post("http://127.0.0.1:8080/embedding", {
+        content: `Hello`,
+        threads: 5
+    });
+    // print the embedding array
+    console.log(result.data.embedding);
+}
+
+Test();
+```
+
+### Tokenize
+
+Run this code in NodeJS:
+
+```javascript
+const axios = require('axios');
+
+async function Test() {
+    let result = await axios.post("http://127.0.0.1:8080/tokenize", {
+        content: `Hello`
+    });
+    // print the embedding array
+    console.log(result.data.tokens);
+}
+
+Test();
+```
+
+## Common Options
+
+-   `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
+-   `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
+-   `-ngl N, --n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
+-   `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
+-   `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
+-   `-lv, --low-vram`: Do not allocate a VRAM scratch buffer for holding temporary results. Reduces VRAM usage at the cost of performance, particularly prompt processing speed. Requires cuBLAS.
+-   `--embedding`: Enable the embedding mode. **Completion function doesn't work in this mode**.
+-   `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`;
+-   `--port`: Set the port to listen. Default: `8080`.
+
+### RNG Seed
+
+-   `-s SEED, --seed SEED`: Set the random number generator (RNG) seed (default: -1, < 0 = random seed).
+
+The RNG seed is used to initialize the random number generator that influences the text generation process. By setting a specific seed value, you can obtain consistent and reproducible results across multiple runs with the same input and settings. This can be helpful for testing, debugging, or comparing the effects of different options on the generated text to see when they diverge. If the seed is set to a value less than 0, a random seed will be used, which will result in different outputs on each run.
+
+## Performance Tuning and Memory Options
+
+### No Memory Mapping
+
+-   `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed. However, if the model is larger than your total amount of RAM or if your system is low on available memory, using mmap might increase the risk of pageouts, negatively impacting performance.
+
+### Memory Float 32
+
+-   `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. This doubles the context memory requirement but does not appear to increase generation quality in a measurable way. Not recommended.
+
+## Limitations:
+
+-   The actual implementation of llama.cpp need a `llama-state` for handle multiple contexts and clients, but this could require more powerful hardware.
--- a/examples/server/httplib.h
+++ b/examples/server/httplib.h
--- a/examples/server/json.hpp
+++ b/examples/server/json.hpp
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -0,0 +1,799 @@
+#include <httplib.h>
+#include <json.hpp>
+#include "common.h"
+#include "llama.h"
+
+struct server_params
+{
+  std::string hostname = "127.0.0.1";
+  int32_t port = 8080;
+};
+
+struct llama_server_context
+{
+  bool as_loop = false;
+  bool has_next_token = false;
+  std::string generated_text = "";
+
+  int32_t num_tokens_predicted = 0;
+  int32_t n_past = 0;
+  int32_t n_consumed = 0;
+  int32_t n_session_consumed = 0;
+  int32_t n_remain = 0;
+
+  std::vector<llama_token> embd;
+  std::vector<llama_token> last_n_tokens;
+  std::vector<llama_token> processed_tokens;
+  std::vector<llama_token> llama_token_newline;
+  std::vector<llama_token> embd_inp;
+  std::vector<std::vector<llama_token>> no_show_words;
+  std::vector<llama_token> tokens_predicted;
+
+  llama_context *ctx;
+  gpt_params params;
+
+  void rewind() {
+    as_loop = false;
+    params.antiprompt.clear();
+    no_show_words.clear();
+    num_tokens_predicted = 0;
+    generated_text = "";
+  }
+
+  bool loadModel(gpt_params params_)
+  {
+    params = params_;
+    ctx = llama_init_from_gpt_params(params);
+    if (ctx == NULL)
+    {
+      fprintf(stderr, "%s: error: unable to load model\n", __func__);
+      return false;
+    }
+    // determine newline token
+    llama_token_newline = ::llama_tokenize(ctx, "\n", false);
+    last_n_tokens.resize(params.n_ctx);
+    std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
+    return true;
+  }
+
+  bool loadPrompt() {
+    params.prompt.insert(0, 1, ' '); // always add a first space
+    std::vector<llama_token> prompt_tokens = ::llama_tokenize(ctx, params.prompt, true);
+    // compare the evaluated prompt with the new prompt
+    int new_prompt_len = 0;
+    for (size_t i = 0; i < prompt_tokens.size(); i++) {
+      if (i < processed_tokens.size() &&
+        processed_tokens[i] == prompt_tokens[i])
+      {
+        continue;
+      }
+      else
+      {
+        embd_inp.push_back(prompt_tokens[i]);
+        if(new_prompt_len == 0) {
+          if(int32_t(i) - 1 < n_past) {
+            processed_tokens.erase(processed_tokens.begin() + i, processed_tokens.end());
+          }
+          // Evaluate the new fragment prompt from the last token processed.
+          n_past = processed_tokens.size();
+        }
+        new_prompt_len ++;
+      }
+    }
+    if(n_past > 0 && params.interactive) {
+      n_remain -= new_prompt_len;
+    }
+    if ((int)embd_inp.size() > params.n_ctx - 4)
+    {
+      return false;
+    }
+    has_next_token = true;
+    return true;
+  }
+
+  void beginCompletion()
+  {
+    if(n_remain == 0) {
+      // number of tokens to keep when resetting context
+      if (params.n_keep < 0 || params.n_keep > (int)embd_inp.size())
+      {
+        params.n_keep = (int)embd_inp.size();
+      }
+    }
+    n_remain = params.n_predict;
+  }
+
+  llama_token nextToken() {
+    llama_token result = -1;
+    if (embd.size() > 0)
+    {
+      if (n_past + (int)embd.size() > params.n_ctx)
+      {
+        // Reset context
+        const int n_left = n_past - params.n_keep;
+        n_past = std::max(1, params.n_keep);
+        processed_tokens.erase(processed_tokens.begin() + n_past, processed_tokens.end());
+        embd.insert(embd.begin(), last_n_tokens.begin() + params.n_ctx - n_left / 2 - embd.size(), last_n_tokens.end() - embd.size());
+      }
+      for (int i = 0; i < (int)embd.size(); i += params.n_batch)
+      {
+        int n_eval = (int)embd.size() - i;
+        if (n_eval > params.n_batch)
+        {
+          n_eval = params.n_batch;
+        }
+        if (llama_eval(ctx, &embd[i], n_eval, n_past, params.n_threads))
+        {
+          fprintf(stderr, "%s : failed to eval\n", __func__);
+          has_next_token = false;
+          return result;
+        }
+        n_past += n_eval;
+      }
+    }
+    embd.clear();
+    if ((int)embd_inp.size() <= n_consumed && has_next_token)
+    {
+      // out of user input, sample next token
+      const float temp = params.temp;
+      // const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k;
+      const float top_p = params.top_p;
+      const float tfs_z = params.tfs_z;
+      const float typical_p = params.typical_p;
+      const int32_t repeat_last_n = params.repeat_last_n < 0 ? params.n_ctx : params.repeat_last_n;
+      const float repeat_penalty = params.repeat_penalty;
+      const float alpha_presence = params.presence_penalty;
+      const float alpha_frequency = params.frequency_penalty;
+      const int mirostat = params.mirostat;
+      const float mirostat_tau = params.mirostat_tau;
+      const float mirostat_eta = params.mirostat_eta;
+      const bool penalize_nl = params.penalize_nl;
+      llama_token id = 0;
+      {
+        auto logits = llama_get_logits(ctx);
+        auto n_vocab = llama_n_vocab(ctx);
+
+        // Apply params.logit_bias map
+        for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++)
+        {
+          logits[it->first] += it->second;
+        }
+
+        std::vector<llama_token_data> candidates;
+        candidates.reserve(n_vocab);
+        for (llama_token token_id = 0; token_id < n_vocab; token_id++)
+        {
+          candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+        }
+
+        llama_token_data_array candidates_p = {candidates.data(), candidates.size(), false};
+
+        // Apply penalties
+        float nl_logit = logits[llama_token_nl()];
+        auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), params.n_ctx);
+        llama_sample_repetition_penalty(ctx, &candidates_p,
+                                        last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
+                                        last_n_repeat, repeat_penalty);
+        llama_sample_frequency_and_presence_penalties(ctx, &candidates_p,
+                                                      last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
+                                                      last_n_repeat, alpha_frequency, alpha_presence);
+        if (!penalize_nl)
+        {
+          logits[llama_token_nl()] = nl_logit;
+        }
+
+        if (temp <= 0)
+        {
+          // Greedy sampling
+          id = llama_sample_token_greedy(ctx, &candidates_p);
+        }
+        else
+        {
+          if (mirostat == 1)
+          {
+            static float mirostat_mu = 2.0f * mirostat_tau;
+            const int mirostat_m = 100;
+            llama_sample_temperature(ctx, &candidates_p, temp);
+            id = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
+          }
+          else if (mirostat == 2)
+          {
+            static float mirostat_mu = 2.0f * mirostat_tau;
+            llama_sample_temperature(ctx, &candidates_p, temp);
+            id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
+          }
+          else
+          {
+            // Temperature sampling
+            llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1);
+            llama_sample_typical(ctx, &candidates_p, typical_p, 1);
+            llama_sample_top_p(ctx, &candidates_p, top_p, 1);
+            llama_sample_temperature(ctx, &candidates_p, temp);
+            id = llama_sample_token(ctx, &candidates_p);
+          }
+        }
+        last_n_tokens.erase(last_n_tokens.begin());
+        last_n_tokens.push_back(id);
+        processed_tokens.push_back(id);
+        num_tokens_predicted++;
+      }
+
+      // replace end of text token with newline token when in interactive mode
+      if (id == llama_token_eos() && params.interactive)
+      {
+        id = llama_token_newline.front();
+        if (params.antiprompt.size() != 0)
+        {
+          // tokenize and inject first reverse prompt
+          const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false);
+          embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
+        }
+      }
+
+      // add it to the context
+      embd.push_back(id);
+      for (auto id : embd)
+      {
+        result = id;
+      }
+      // decrement remaining sampling budget
+      --n_remain;
+    }
+    else
+    {
+      // some user input remains from prompt or interaction, forward it to processing
+      while ((int)embd_inp.size() > n_consumed)
+      {
+        embd.push_back(embd_inp[n_consumed]);
+        last_n_tokens.erase(last_n_tokens.begin());
+        last_n_tokens.push_back(embd_inp[n_consumed]);
+        processed_tokens.push_back(embd_inp[n_consumed]);
+        ++n_consumed;
+        if ((int)embd.size() >= params.n_batch)
+        {
+          break;
+        }
+      }
+    }
+    if (params.interactive && (int)embd_inp.size() <= n_consumed)
+    {
+      // check for reverse prompt
+      if (params.antiprompt.size())
+      {
+        std::string last_output;
+        for (auto id : last_n_tokens)
+        {
+          last_output += llama_token_to_str(ctx, id);
+        }
+        has_next_token = true;
+        // Check if each of the reverse prompts appears at the end of the output.
+        for (std::string &antiprompt : params.antiprompt)
+        {
+          if (last_output.find(antiprompt.c_str(), last_output.length() - antiprompt.length(), antiprompt.length()) != std::string::npos)
+          {
+            has_next_token = false;
+            return result;
+          }
+        }
+      }
+      if (n_past > 0)
+      {
+        has_next_token = true;
+      }
+    }
+
+    if (!embd.empty() && embd.back() == llama_token_eos()) {
+        has_next_token = false;
+    }
+
+    if (params.interactive && n_remain <= 0 && params.n_predict != -1)
+    {
+      n_remain = params.n_predict;
+    }
+    has_next_token = n_remain != 0;
+    return result;
+  }
+
+  std::string doCompletion()
+  {
+    llama_token token = nextToken();
+    if (token == -1) {
+      return "";
+    }
+    tokens_predicted.clear();
+    tokens_predicted.push_back(token);
+
+    // Avoid add the no show words to the response
+    for (std::vector<llama_token> word_tokens : no_show_words)
+    {
+      size_t match_token = 1;
+      if (tokens_predicted.front() == word_tokens.front())
+      {
+        bool execute_matching = true;
+        if (tokens_predicted.size() > 1) { // if previus tokens had been tested
+          for (size_t i = 1; i < word_tokens.size(); i++)
+          {
+            if (i >= tokens_predicted.size()) {
+              match_token = i;
+              break;
+            }
+            if (tokens_predicted[i] == word_tokens[i])
+            {
+              continue;
+            }
+            else
+            {
+              execute_matching = false;
+              break;
+            }
+          }
+        }
+        while (execute_matching) {
+          if (match_token == word_tokens.size()) {
+            return "";
+          }
+          token = nextToken();
+          tokens_predicted.push_back(token);
+          if (token == word_tokens[match_token])
+          { // the token follow the sequence
+            match_token++;
+          }
+          else if (match_token < word_tokens.size())
+          { // no complete all word sequence
+            break;
+          }
+        }
+      }
+    }
+    if(as_loop) {
+      generated_text = "";
+    }
+    for (llama_token tkn : tokens_predicted)
+    {
+      generated_text += llama_token_to_str(ctx, tkn);
+    }
+    return generated_text;
+  }
+
+  std::vector<float> embedding(std::string content, int threads) {
+    content.insert(0, 1, ' ');
+    std::vector<llama_token> tokens = ::llama_tokenize(ctx, content, true);
+    if (tokens.size() > 0)
+    {
+      if (llama_eval(ctx, tokens.data(), tokens.size(), 0, threads))
+      {
+        fprintf(stderr, "%s : failed to eval\n", __func__);
+        std::vector<float> embeddings_;
+        return embeddings_;
+      }
+    }
+    const int n_embd = llama_n_embd(ctx);
+    const auto embeddings = llama_get_embeddings(ctx);
+    std::vector<float> embeddings_(embeddings, embeddings + n_embd);
+    return embeddings_;
+  }
+};
+
+using namespace httplib;
+
+using json = nlohmann::json;
+
+void server_print_usage(int /*argc*/, char **argv, const gpt_params &params)
+{
+  fprintf(stderr, "usage: %s [options]\n", argv[0]);
+  fprintf(stderr, "\n");
+  fprintf(stderr, "options:\n");
+  fprintf(stderr, "  -h, --help            show this help message and exit\n");
+  fprintf(stderr, "  -s SEED, --seed SEED  RNG seed (default: -1, use random seed for < 0)\n");
+  fprintf(stderr, "  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
+  fprintf(stderr, "  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
+  fprintf(stderr, "                        not recommended: doubles context memory required and no measurable increase in quality\n");
+  fprintf(stderr, "  --embedding           enable embedding mode\n");
+  fprintf(stderr, "  --keep                number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
+  if (llama_mlock_supported())
+  {
+    fprintf(stderr, "  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
+  }
+  if (llama_mmap_supported())
+  {
+    fprintf(stderr, "  --no-mmap             do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
+  }
+#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
+  fprintf(stderr, "  -ngl N, --n-gpu-layers N\n");
+  fprintf(stderr, "                        number of layers to store in VRAM\n");
+  fprintf(stderr, "  -ts SPLIT --tensor-split SPLIT\n");
+  fprintf(stderr, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
+  fprintf(stderr, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
+  fprintf(stderr, "  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n" );
+  fprintf(stderr, "  -lv, --low-vram       don't allocate VRAM scratch buffer\n" );
+#endif
+  fprintf(stderr, "  -m FNAME, --model FNAME\n");
+  fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
+  fprintf(stderr, "  -a ALIAS, --alias ALIAS\n");
+  fprintf(stderr, "                        set an alias for the model, will be added as `model` field in completion response\n");
+  fprintf(stderr, "  --host                ip address to listen (default 127.0.0.1)\n");
+  fprintf(stderr, "  --port PORT           port to listen (default 8080)\n");
+  fprintf(stderr, "\n");
+}
+
+bool server_params_parse(int argc, char **argv, server_params &sparams, gpt_params &params)
+{
+  gpt_params default_params;
+  std::string arg;
+  bool invalid_param = false;
+
+  for (int i = 1; i < argc; i++)
+  {
+    arg = argv[i];
+    if (arg == "--port")
+    {
+      if (++i >= argc)
+      {
+        invalid_param = true;
+        break;
+      }
+      sparams.port = std::stoi(argv[i]);
+    }
+    else if (arg == "--host")
+    {
+      if (++i >= argc)
+      {
+        invalid_param = true;
+        break;
+      }
+      sparams.hostname = argv[i];
+    }
+    else if (arg == "-s" || arg == "--seed")
+    {
+#if defined(GGML_USE_CUBLAS)
+      fprintf(stderr, "WARNING: when using cuBLAS generation results are NOT guaranteed to be reproducible.\n");
+#endif
+      if (++i >= argc)
+      {
+        invalid_param = true;
+        break;
+      }
+      params.seed = std::stoi(argv[i]);
+    }
+    else if (arg == "-m" || arg == "--model")
+    {
+      if (++i >= argc)
+      {
+        invalid_param = true;
+        break;
+      }
+      params.model = argv[i];
+    }
+    else if (arg == "-a" || arg == "--alias")
+    {
+      if (++i >= argc)
+      {
+        invalid_param = true;
+        break;
+      }
+      params.model_alias = argv[i];
+    }
+    else if (arg == "--embedding")
+    {
+      params.embedding = true;
+    }
+    else if (arg == "-h" || arg == "--help")
+    {
+      server_print_usage(argc, argv, default_params);
+      exit(0);
+    }
+    else if (arg == "-c" || arg == "--ctx-size" || arg == "--ctx_size")
+    {
+      if (++i >= argc)
+      {
+        invalid_param = true;
+        break;
+      }
+      params.n_ctx = std::stoi(argv[i]);
+    }
+    else if (arg == "--memory-f32" || arg == "--memory_f32")
+    {
+      params.memory_f16 = false;
+    }
+    else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers")
+    {
+      if (++i >= argc)
+      {
+        invalid_param = true;
+        break;
+      }
+#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
+      params.n_gpu_layers = std::stoi(argv[i]);
+#else
+      fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
+      fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
+#endif
+    }
+    else if (arg == "--tensor-split" || arg == "-ts")
+    {
+      if (++i >= argc)
+      {
+        invalid_param = true;
+        break;
+      }
+#ifdef GGML_USE_CUBLAS
+      std::string arg_next = argv[i];
+
+      // split string by , and /
+      const std::regex regex{R"([,/]+)"};
+      std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
+      std::vector<std::string> split_arg{it, {}};
+      GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);
+
+      for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i)
+      {
+        if (i < split_arg.size())
+        {
+          params.tensor_split[i] = std::stof(split_arg[i]);
+        }
+        else
+        {
+          params.tensor_split[i] = 0.0f;
+        }
+      }
+#else
+      fprintf(stderr, "WARNING: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
+#endif // GGML_USE_CUBLAS
+    }
+    else if (arg == "--low-vram" || arg == "-lv")
+    {
+#ifdef GGML_USE_CUBLAS
+      params.low_vram = true;
+#else
+      fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n");
+#endif // GGML_USE_CUBLAS
+    }
+    else if (arg == "--main-gpu" || arg == "-mg")
+    {
+      if (++i >= argc)
+      {
+        invalid_param = true;
+        break;
+      }
+#ifdef GGML_USE_CUBLAS
+      params.main_gpu = std::stoi(argv[i]);
+#else
+      fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.\n");
+#endif
+    }
+    else
+    {
+      fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
+      server_print_usage(argc, argv, default_params);
+      exit(1);
+    }
+  }
+
+  if (invalid_param)
+  {
+    fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
+    server_print_usage(argc, argv, default_params);
+    exit(1);
+  }
+  return true;
+}
+
+bool parse_options_completion(json body, llama_server_context& llama, Response &res) {
+  if (!body["threads"].is_null())
+  {
+    llama.params.n_threads = body["threads"].get<int>();
+  }
+  if (!body["n_predict"].is_null())
+  {
+    llama.params.n_predict = body["n_predict"].get<int>();
+  }
+  if (!body["top_k"].is_null())
+  {
+    llama.params.top_k = body["top_k"].get<int>();
+  }
+  if (!body["top_p"].is_null())
+  {
+    llama.params.top_p = body["top_p"].get<float>();
+  }
+  if (!body["temperature"].is_null())
+  {
+    llama.params.temp = body["temperature"].get<float>();
+  }
+  if (!body["batch_size"].is_null())
+  {
+    llama.params.n_batch = body["batch_size"].get<int>();
+  }
+  if (!body["n_keep"].is_null())
+  {
+    llama.params.n_keep = body["n_keep"].get<int>();
+  }
+  if (!body["as_loop"].is_null())
+  {
+    llama.as_loop = body["as_loop"].get<bool>();
+  }
+  if (!body["interactive"].is_null())
+  {
+    llama.params.interactive = body["interactive"].get<bool>();
+  }
+  if (!body["prompt"].is_null())
+  {
+    llama.params.prompt = body["prompt"].get<std::string>();
+  }
+  else
+  {
+    json data = {
+        {"status", "error"},
+        {"reason", "You need to pass the prompt"}};
+    res.set_content(data.dump(), "application/json");
+    res.status = 400;
+    return false;
+  }
+  if (!body["stop"].is_null())
+  {
+    std::vector<std::string> stop_words = body["stop"].get<std::vector<std::string>>();
+    for (std::string stop_word : stop_words)
+    {
+      llama.params.antiprompt.push_back(stop_word);
+      llama.no_show_words.push_back(::llama_tokenize(llama.ctx, stop_word, false));
+    }
+  }
+  if (!body["exclude"].is_null())
+  {
+    std::vector<std::string> no_show_words = body["exclude"].get<std::vector<std::string>>();
+    for (std::string no_show : no_show_words)
+    {
+      llama.no_show_words.push_back(::llama_tokenize(llama.ctx, no_show, false));
+    }
+  }
+  return true;
+}
+
+int main(int argc, char **argv)
+{
+  // own arguments required by this example
+  gpt_params params;
+  server_params sparams;
+
+  // struct that contains llama context and inference
+  llama_server_context llama;
+  params.model = "ggml-model.bin";
+
+  if (server_params_parse(argc, argv, sparams, params) == false)
+  {
+    return 1;
+  }
+
+  if (params.seed <= 0)
+  {
+    params.seed = time(NULL);
+  }
+
+  fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
+
+  // load the model
+  if (!llama.loadModel(params))
+  {
+    return 1;
+  }
+
+  Server svr;
+
+  svr.Get("/", [](const Request &, Response &res)
+          { res.set_content("<h1>llama.cpp server works</h1>", "text/html"); });
+
+  svr.Post("/completion", [&llama](const Request &req, Response &res)
+            {
+              if(llama.params.embedding) {
+                json data = {
+                    {"status", "error"},
+                    {"reason", "To use completion function disable embedding mode"}};
+                res.set_content(data.dump(), "application/json");
+                res.status = 400;
+                return;
+              }
+
+              llama.rewind();
+
+              if(parse_options_completion(json::parse(req.body), llama, res) == false){
+                return;
+              }
+
+              if (!llama.loadPrompt())
+              {
+                json data = {
+                    {"status", "error"},
+                    {"reason", "Context too long, please be more specific"}};
+                res.set_content(data.dump(), "application/json");
+                res.status = 400;
+                return;
+              }
+
+              llama.beginCompletion();
+              if(llama.as_loop) {
+                json data = {
+                    {"status", "done" } };
+                return res.set_content(data.dump(), "application/json");
+              } else {
+                // loop inference until finish completion
+                while (llama.has_next_token)
+                {
+                  llama.doCompletion();
+                }
+                try
+                {
+                  json data = {
+                      {"model", llama.params.model_alias },
+                      {"content", llama.generated_text },
+                      {"tokens_predicted", llama.num_tokens_predicted}};
+                  return res.set_content(data.dump(), "application/json");
+                }
+                catch (const json::exception &e)
+                {
+                  // Some tokens have bad UTF-8 strings, the json parser is very sensitive
+                  json data = {
+                      {"content", "Bad encoding token"},
+                      {"tokens_predicted", 0}};
+                  return res.set_content(data.dump(), "application/json");
+                }
+              } });
+
+  svr.Post("/tokenize", [&llama](const Request &req, Response &res)
+            {
+              json body = json::parse(req.body);
+              json data = {
+                    {"tokens", ::llama_tokenize(llama.ctx, body["content"].get<std::string>(), false) } };
+                return res.set_content(data.dump(), "application/json");
+            });
+
+  svr.Post("/embedding", [&llama](const Request &req, Response &res)
+            {
+              if(!llama.params.embedding) {
+                std::vector<float> empty;
+                json data = {
+                    {"embedding", empty}};
+                fprintf(stderr, "[llama-server] : You need enable embedding mode adding: --embedding option\n");
+                return res.set_content(data.dump(), "application/json");
+              }
+              json body = json::parse(req.body);
+              std::string content = body["content"].get<std::string>();
+              int threads = body["threads"].get<int>();
+              json data = {
+                    {"embedding", llama.embedding(content, threads) } };
+              return res.set_content(data.dump(), "application/json");
+            });
+
+  svr.Get("/next-token", [&llama](const Request &req, Response &res)
+          {
+            if(llama.params.embedding) {
+                res.set_content("{}", "application/json");
+                return;
+            }
+            std::string result = "";
+            if (req.has_param("stop")) {
+                llama.has_next_token = false;
+            } else {
+              result = llama.doCompletion(); // inference next token
+            }
+            try {
+              json data = {
+                        {"content", result },
+                        {"stop", !llama.has_next_token }};
+              return res.set_content(data.dump(), "application/json");
+            } catch (const json::exception &e) {
+              // Some tokens have bad UTF-8 strings, the json parser is very sensitive
+              json data = {
+                        {"content", "" },
+                        {"stop", !llama.has_next_token }};
+              return res.set_content(data.dump(), "application/json");
+            }
+          });
+
+  fprintf(stderr, "%s: http server Listening at http://%s:%i\n", __func__, sparams.hostname.c_str(), sparams.port);
+
+  if(params.embedding) {
+    fprintf(stderr, "NOTE: Mode embedding enabled. Completion function doesn't work in this mode.\n");
+  }
+
+  // change hostname and port
+  svr.listen(sparams.hostname, sparams.port);
+}
--- a/examples/train-text-from-scratch/CMakeLists.txt
+++ b/examples/train-text-from-scratch/CMakeLists.txt
@@ -0,0 +1,4 @@
+set(TARGET train-text-from-scratch)
+add_executable(${TARGET} train-text-from-scratch.cpp)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- a/examples/train-text-from-scratch/README.md
+++ b/examples/train-text-from-scratch/README.md
@@ -0,0 +1,22 @@
+# train-text-from-scratch
+
+Basic usage instructions:
+
+```bash
+# get training data
+wget https://github.com/brunoklein99/deep-learning-notes/blob/master/shakespeare.txt
+
+# train
+./bin/train-text-from-scratch \
+        --vocab-model ../models/ggml-vocab.bin \
+        --ctx 64 --embd 256 --head 8 --layer 16 \
+        --checkpoint-in  chk-shakespeare-256x16.bin \
+        --checkpoint-out chk-shakespeare-256x16.bin \
+        --model-out ggml-shakespeare-256x16-f32.bin \
+        --train-data "shakespeare.txt" \
+        -t 6 -b 16 -n 32 --seed 1 --adam-iter 16 \
+        --print-details-interval 0 --predict 16 --use-flash
+
+# predict
+./bin/main -m ggml-shakespeare-256x16-f32.bin
+```
--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
--- a/flake.lock
+++ b/flake.lock
@@ -1,12 +1,15 @@
 {
  "nodes": {
    "flake-utils": {
+      "inputs": {
+        "systems": "systems"
+      },
      "locked": {
-        "lastModified": 1676283394,
-        "narHash": "sha256-XX2f9c3iySLCw54rJ/CZs+ZK6IQy7GXNY4nSOyu2QG4=",
+        "lastModified": 1685518550,
+        "narHash": "sha256-o2d0KcvaXzTrPRIo0kOLV0/QXHhDQ5DTi+OxcjO8xqY=",
        "owner": "numtide",
        "repo": "flake-utils",
-        "rev": "3db36a8b464d0c4532ba1c7dda728f4576d6d073",
+        "rev": "a1720a10a6cfe8234c0e93907ffe81be440f4cef",
        "type": "github"
      },
      "original": {
@@ -17,11 +20,11 @@
    },
    "nixpkgs": {
      "locked": {
-        "lastModified": 1678470307,
-        "narHash": "sha256-OEeMUr3ueLIXyW/OaFUX5jUdimyQwMg/7e+/Q0gC/QE=",
+        "lastModified": 1685931219,
+        "narHash": "sha256-8EWeOZ6LKQfgAjB/USffUSELPRjw88A+xTcXnOUvO5M=",
        "owner": "NixOS",
        "repo": "nixpkgs",
-        "rev": "0c4800d579af4ed98ecc47d464a5e7b0870c4b1f",
+        "rev": "7409480d5c8584a1a83c422530419efe4afb0d19",
        "type": "github"
      },
      "original": {
@@ -36,6 +39,21 @@
        "flake-utils": "flake-utils",
        "nixpkgs": "nixpkgs"
      }
+    },
+    "systems": {
+      "locked": {
+        "lastModified": 1681028828,
+        "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
+        "owner": "nix-systems",
+        "repo": "default",
+        "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
+        "type": "github"
+      },
+      "original": {
+        "owner": "nix-systems",
+        "repo": "default",
+        "type": "github"
+      }
    }
  },
  "root": "root",
--- a/flake.nix
+++ b/flake.nix
@@ -6,6 +6,13 @@
  outputs = { self, nixpkgs, flake-utils }:
    flake-utils.lib.eachDefaultSystem (system:
      let
+        inherit (pkgs.stdenv) isAarch64 isDarwin;
+        inherit (pkgs.lib) optionals;
+        isM1 = isAarch64 && isDarwin;
+        osSpecific =
+          if isM1 then with pkgs.darwin.apple_sdk_11_0.frameworks; [ Accelerate MetalKit MetalPerformanceShaders MetalPerformanceShadersGraph ]
+          else if isDarwin then with pkgs.darwin.apple_sdk.frameworks; [ Accelerate CoreGraphics CoreVideo ]
+          else [ ];
        pkgs = import nixpkgs {
          inherit system;
        };
@@ -18,17 +25,22 @@
        packages.default = pkgs.stdenv.mkDerivation {
          name = "llama.cpp";
          src = ./.;
+          postPatch =
+            if isM1 then ''
+              substituteInPlace ./ggml-metal.m \
+                --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/ggml-metal.metal\";"
+            '' else "";
          nativeBuildInputs = with pkgs; [ cmake ];
-          buildInputs = with pkgs; lib.optionals stdenv.isDarwin [
-            darwin.apple_sdk.frameworks.Accelerate
-          ];
-          cmakeFlags = with pkgs; lib.optionals (system == "aarch64-darwin") [
+          buildInputs = osSpecific;
+          cmakeFlags = [ "-DLLAMA_BUILD_SERVER=ON" ] ++ (optionals isM1 [
            "-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1"
-          ];
+            "-DLLAMA_METAL=ON"
+          ]);
          installPhase = ''
            mkdir -p $out/bin
            mv bin/* $out/bin/
            mv $out/bin/main $out/bin/llama
+            mv $out/bin/server $out/bin/llama-server

            echo "#!${llama-python}/bin/python" > $out/bin/convert.py
            cat ${./convert.py} >> $out/bin/convert.py
@@ -40,9 +52,7 @@
          packages = with pkgs; [
            cmake
            llama-python
-          ] ++ lib.optionals stdenv.isDarwin [
-            darwin.apple_sdk.frameworks.Accelerate
-          ];
+          ] ++ osSpecific;
        };
      }
    );
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
--- a/ggml-cuda.h
+++ b/ggml-cuda.h
@@ -1,11 +1,21 @@
+#pragma once
+
 #include "ggml.h"

 #ifdef  __cplusplus
 extern "C" {
 #endif

-void   ggml_init_cublas(void);
+#define GGML_CUDA_MAX_DEVICES       16

+struct ggml_tensor_extra_gpu {
+    void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
+};
+
+void   ggml_init_cublas(void);
+void   ggml_cuda_set_tensor_split(const float * tensor_split);
+
+void   ggml_cuda_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
 bool   ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
 size_t ggml_cuda_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
 void   ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
@@ -14,6 +24,16 @@ void   ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens
 void * ggml_cuda_host_malloc(size_t size);
 void   ggml_cuda_host_free(void * ptr);

+void   ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor);
+
+void   ggml_cuda_free_data(struct ggml_tensor * tensor);
+void   ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
+void   ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
+void   ggml_cuda_set_main_device(int main_device);
+void   ggml_cuda_set_scratch_size(size_t scratch_size);
+void   ggml_cuda_free_scratch(void);
+bool   ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
+
 #ifdef  __cplusplus
 }
 #endif
--- a/ggml-metal.h
+++ b/ggml-metal.h
@@ -0,0 +1,64 @@
+// An interface allowing to compute ggml_cgraph with Metal
+//
+// This is a fully functional interface that extends ggml with GPU support for Apple devices.
+// A similar interface can be created for other GPU backends (e.g. Vulkan, CUDA, OpenCL, etc.)
+//
+// How it works?
+//
+// As long as your program can create and evaluate a ggml_cgraph on the CPU, you can use this
+// interface to evaluate the same graph on the GPU. Instead of using ggml_graph_compute(), you
+// use ggml_metal_graph_compute() (or ggml_vulkan_graph_compute(), etc.)
+//
+// You only need to make sure that all memory buffers that you used during the graph creation
+// are mapped to the device memory with the ggml_metal_add_buffer() function. This mapping is
+// used during the graph evaluation to determine the arguments of the compute kernels.
+//
+// Synchronization between device and host memory (for example for input and output tensors)
+// is done with the ggml_metal_set_tensor() and ggml_metal_get_tensor() functions.
+//
+
+#pragma once
+
+#include <stddef.h>
+#include <stdbool.h>
+
+// max memory buffers that can be mapped to the device
+#define GGML_METAL_MAX_BUFFERS 16
+
+struct ggml_tensor;
+struct ggml_cgraph;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct ggml_metal_context;
+
+struct ggml_metal_context * ggml_metal_init(void);
+void ggml_metal_free(struct ggml_metal_context * ctx);
+
+// creates a mapping between a host memory buffer and a device memory buffer
+// - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute
+// - the mapping is used during computation to determine the arguments of the compute kernels
+// - you don't need to keep the host memory buffer allocated as it is never accessed by Metal
+//
+bool ggml_metal_add_buffer(
+        struct ggml_metal_context * ctx,
+                       const char * name,
+                             void * data,
+                           size_t   size);
+
+// set data from host memory into the device
+void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
+
+// get data from the device into host memory
+void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
+
+// same as ggml_graph_compute but uses Metal
+// creates gf->n_threads command buffers in parallel
+void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
+
+#ifdef __cplusplus
+}
+#endif
+
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -0,0 +1,834 @@
+#import "ggml-metal.h"
+
+#import "ggml.h"
+
+#import <Foundation/Foundation.h>
+
+#import <Metal/Metal.h>
+#import <MetalPerformanceShaders/MetalPerformanceShaders.h>
+
+#ifdef GGML_METAL_NDEBUG
+#define metal_printf(...)
+#else
+#define metal_printf(...) fprintf(stderr, __VA_ARGS__)
+#endif
+
+#define UNUSED(x) (void)(x)
+
+struct ggml_metal_buffer {
+    const char * name;
+
+    void   * data;
+    size_t   size;
+
+    id<MTLBuffer> metal;
+};
+
+struct ggml_metal_context {
+    float * logits;
+
+    id<MTLDevice>       device;
+    id<MTLCommandQueue> queue;
+    id<MTLLibrary>      library;
+
+    int n_buffers;
+    struct ggml_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];
+
+    // custom kernels
+#define GGML_METAL_DECL_KERNEL(name) \
+    id<MTLFunction>             function_##name; \
+    id<MTLComputePipelineState> pipeline_##name
+
+    GGML_METAL_DECL_KERNEL(add);
+    GGML_METAL_DECL_KERNEL(mul);
+    GGML_METAL_DECL_KERNEL(mul_row); // TODO: avoid this extra kernel, instead extend the "mul" kernel to support broadcast
+    GGML_METAL_DECL_KERNEL(scale);
+    GGML_METAL_DECL_KERNEL(silu);
+    GGML_METAL_DECL_KERNEL(relu);
+    GGML_METAL_DECL_KERNEL(gelu);
+    GGML_METAL_DECL_KERNEL(soft_max);
+    GGML_METAL_DECL_KERNEL(diag_mask_inf);
+    GGML_METAL_DECL_KERNEL(get_rows_f16);
+    GGML_METAL_DECL_KERNEL(get_rows_q4_0);
+    GGML_METAL_DECL_KERNEL(get_rows_q4_1);
+    GGML_METAL_DECL_KERNEL(get_rows_q2_k);
+    GGML_METAL_DECL_KERNEL(get_rows_q3_k);
+    GGML_METAL_DECL_KERNEL(get_rows_q4_k);
+    GGML_METAL_DECL_KERNEL(get_rows_q5_k);
+    GGML_METAL_DECL_KERNEL(get_rows_q6_k);
+    GGML_METAL_DECL_KERNEL(rms_norm);
+    GGML_METAL_DECL_KERNEL(mul_mat_f16_f32);
+    GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32);
+    GGML_METAL_DECL_KERNEL(mul_mat_q4_1_f32);
+    GGML_METAL_DECL_KERNEL(mul_mat_q2_k_f32);
+    GGML_METAL_DECL_KERNEL(mul_mat_q3_k_f32);
+    GGML_METAL_DECL_KERNEL(mul_mat_q4_k_f32);
+    GGML_METAL_DECL_KERNEL(mul_mat_q5_k_f32);
+    GGML_METAL_DECL_KERNEL(mul_mat_q6_k_f32);
+    GGML_METAL_DECL_KERNEL(rope);
+    GGML_METAL_DECL_KERNEL(cpy_f32_f16);
+    GGML_METAL_DECL_KERNEL(cpy_f32_f32);
+
+#undef GGML_METAL_DECL_KERNEL
+};
+
+// MSL code
+// TODO: move the contents here when ready
+//       for now it is easier to work in a separate file
+static NSString * const msl_library_source = @"see metal.metal";
+
+// Here to assist with NSBundle Path Hack
+@interface GGMLMetalClass : NSObject
+@end
+@implementation GGMLMetalClass
+@end
+
+struct ggml_metal_context * ggml_metal_init(void) {
+    fprintf(stderr, "%s: allocating\n", __func__);
+
+    struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
+
+    ctx->device = MTLCreateSystemDefaultDevice();
+    ctx->queue  = [ctx->device newCommandQueue];
+    ctx->n_buffers = 0;
+
+    // determine if we can use MPS
+    if (MPSSupportsMTLDevice(ctx->device)) {
+        fprintf(stderr, "%s: using MPS\n", __func__);
+    } else {
+        fprintf(stderr, "%s: not using MPS\n", __func__);
+        GGML_ASSERT(false && "MPS not supported");
+    }
+
+#if 0
+    // compile from source string and show compile log
+    {
+        NSError * error = nil;
+
+        ctx->library = [ctx->device newLibraryWithSource:msl_library_source options:nil error:&error];
+        if (error) {
+            fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]);
+            exit(1);
+        }
+    }
+#else
+    UNUSED(msl_library_source);
+
+    // read the source from "ggml-metal.metal" into a string and use newLibraryWithSource
+    {
+        NSError * error = nil;
+
+        //NSString * path = [[NSBundle mainBundle] pathForResource:@"../../examples/metal/metal" ofType:@"metal"];
+        NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
+        NSString * path = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
+        fprintf(stderr, "%s: loading '%s'\n", __func__, [path UTF8String]);
+
+        NSString * src  = [NSString stringWithContentsOfFile:path encoding:NSUTF8StringEncoding error:&error];
+        if (error) {
+            fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]);
+            exit(1);
+        }
+
+        ctx->library = [ctx->device newLibraryWithSource:src options:nil error:&error];
+        if (error) {
+            fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]);
+            exit(1);
+        }
+    }
+#endif
+
+    // load kernels
+    {
+#define GGML_METAL_ADD_KERNEL(name) \
+        ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \
+        ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:nil]; \
+        fprintf(stderr, "%s: loaded %-32s %16p\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name);
+
+        GGML_METAL_ADD_KERNEL(add);
+        GGML_METAL_ADD_KERNEL(mul);
+        GGML_METAL_ADD_KERNEL(mul_row);
+        GGML_METAL_ADD_KERNEL(scale);
+        GGML_METAL_ADD_KERNEL(silu);
+        GGML_METAL_ADD_KERNEL(relu);
+        GGML_METAL_ADD_KERNEL(gelu);
+        GGML_METAL_ADD_KERNEL(soft_max);
+        GGML_METAL_ADD_KERNEL(diag_mask_inf);
+        GGML_METAL_ADD_KERNEL(get_rows_f16);
+        GGML_METAL_ADD_KERNEL(get_rows_q4_0);
+        GGML_METAL_ADD_KERNEL(get_rows_q4_1);
+        GGML_METAL_ADD_KERNEL(get_rows_q2_k);
+        GGML_METAL_ADD_KERNEL(get_rows_q3_k);
+        GGML_METAL_ADD_KERNEL(get_rows_q4_k);
+        GGML_METAL_ADD_KERNEL(get_rows_q5_k);
+        GGML_METAL_ADD_KERNEL(get_rows_q6_k);
+        GGML_METAL_ADD_KERNEL(rms_norm);
+        GGML_METAL_ADD_KERNEL(mul_mat_f16_f32);
+        GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32);
+        GGML_METAL_ADD_KERNEL(mul_mat_q4_1_f32);
+        GGML_METAL_ADD_KERNEL(mul_mat_q2_k_f32);
+        GGML_METAL_ADD_KERNEL(mul_mat_q3_k_f32);
+        GGML_METAL_ADD_KERNEL(mul_mat_q4_k_f32);
+        GGML_METAL_ADD_KERNEL(mul_mat_q5_k_f32);
+        GGML_METAL_ADD_KERNEL(mul_mat_q6_k_f32);
+        GGML_METAL_ADD_KERNEL(rope);
+        GGML_METAL_ADD_KERNEL(cpy_f32_f16);
+        GGML_METAL_ADD_KERNEL(cpy_f32_f32);
+
+#undef GGML_METAL_ADD_KERNEL
+    }
+
+    return ctx;
+}
+
+void ggml_metal_free(struct ggml_metal_context * ctx) {
+    fprintf(stderr, "%s: deallocating\n", __func__);
+
+    free(ctx);
+}
+
+// finds the Metal buffer that contains the tensor data on the GPU device
+// the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the
+// Metal buffer based on the host memory pointer
+//
+static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, struct ggml_tensor * t, size_t * offs) {
+    //fprintf(stderr, "%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach);
+
+    for (int i = 0; i < ctx->n_buffers; ++i) {
+        const int64_t ioffs = (int64_t) t->data - (int64_t) ctx->buffers[i].data;
+
+        if (ioffs >= 0 && ioffs < (int64_t) ctx->buffers[i].size) {
+            *offs = (size_t) ioffs;
+
+            //fprintf(stderr, "%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs);
+
+            return ctx->buffers[i].metal;
+        }
+    }
+
+    fprintf(stderr, "%s: error: buffer is nil\n", __func__);
+
+    return nil;
+}
+
+bool ggml_metal_add_buffer(
+        struct ggml_metal_context * ctx,
+                     const char * name,
+                           void * data,
+                         size_t   size) {
+    if (ctx->n_buffers >= GGML_METAL_MAX_BUFFERS) {
+        fprintf(stderr, "%s: too many buffers\n", __func__);
+        return false;
+    }
+
+    if (data) {
+        // verify that the buffer does not overlap with any of the existing buffers
+        for (int i = 0; i < ctx->n_buffers; ++i) {
+            const int64_t ioffs = (int64_t) data - (int64_t) ctx->buffers[i].data;
+
+            if (ioffs >= 0 && ioffs < (int64_t) ctx->buffers[i].size) {
+                fprintf(stderr, "%s: error: buffer '%s' overlaps with '%s'\n", __func__, name, ctx->buffers[i].name);
+                return false;
+            }
+        }
+
+        size_t page_size = getpagesize();
+        size_t aligned_size = size;
+        if ((aligned_size % page_size) != 0) {
+            aligned_size += (page_size - (aligned_size % page_size));
+        }
+
+        ctx->buffers[ctx->n_buffers].name = name;
+        ctx->buffers[ctx->n_buffers].data = data;
+        ctx->buffers[ctx->n_buffers].size = size;
+
+        if (ctx->device.maxBufferLength < aligned_size) {
+            fprintf(stderr, "%s: buffer '%s' size %zu is larger than buffer maximum of %zu\n", __func__, name, aligned_size, ctx->device.maxBufferLength);
+            return false;
+        }
+        ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:aligned_size options:MTLResourceStorageModeShared deallocator:nil];
+
+        if (ctx->buffers[ctx->n_buffers].metal == nil) {
+            fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, aligned_size / 1024.0 / 1024.0);
+            return false;
+        } else {
+            fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB\n", __func__, name, aligned_size / 1024.0 / 1024.0);
+        }
+
+        ++ctx->n_buffers;
+    }
+
+    return true;
+}
+
+void ggml_metal_set_tensor(
+        struct ggml_metal_context * ctx,
+        struct ggml_tensor * t) {
+    metal_printf("%s: set input for tensor '%s'\n", __func__, t->name);
+
+    size_t offs;
+    id<MTLBuffer> id_dst = ggml_metal_get_buffer(ctx, t, &offs);
+
+    memcpy((void *) ((uint8_t *) id_dst.contents + offs), t->data, ggml_nbytes(t));
+}
+
+void ggml_metal_get_tensor(
+        struct ggml_metal_context * ctx,
+        struct ggml_tensor * t) {
+    metal_printf("%s: extract results for tensor '%s'\n", __func__, t->name);
+
+    size_t offs;
+    id<MTLBuffer> id_src = ggml_metal_get_buffer(ctx, t, &offs);
+
+    memcpy(t->data, (void *) ((uint8_t *) id_src.contents + offs), ggml_nbytes(t));
+}
+
+void ggml_metal_graph_compute(
+        struct ggml_metal_context * ctx,
+               struct ggml_cgraph * gf) {
+    metal_printf("%s: evaluating graph\n", __func__);
+
+    // create multiple command buffers and enqueue them
+    // then, we encode the graph into the command buffers in parallel
+
+    const int n_cb = gf->n_threads;
+
+    NSMutableArray * command_buffers = [NSMutableArray arrayWithCapacity:n_cb];
+
+    for (int i = 0; i < n_cb; ++i) {
+        command_buffers[i] = [ctx->queue commandBuffer];
+
+        // enqueue the command buffers in order to specify their execution order
+        [command_buffers[i] enqueue];
+    }
+
+    // TODO: is this the best way to start threads?
+    dispatch_queue_t queue = dispatch_queue_create("llama.cpp", DISPATCH_QUEUE_CONCURRENT);
+
+    for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) {
+        const int n_nodes_per_cb = (gf->n_nodes + n_cb - 1) / n_cb;
+
+        dispatch_async(queue, ^{
+            size_t offs_src0 = 0;
+            size_t offs_src1 = 0;
+            size_t offs_dst  = 0;
+
+            id<MTLCommandBuffer> command_buffer = command_buffers[cb_idx];
+
+            id<MTLComputeCommandEncoder> encoder = nil;
+
+            const int node_start =                                      (cb_idx + 0) * n_nodes_per_cb;
+            const int node_end   = (cb_idx == n_cb - 1) ? gf->n_nodes : (cb_idx + 1) * n_nodes_per_cb;
+
+            for (int i = node_start; i < node_end; ++i) {
+                metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op));
+
+                struct ggml_tensor * src0 = gf->nodes[i]->src0;
+                struct ggml_tensor * src1 = gf->nodes[i]->src1;
+                struct ggml_tensor * dst  = gf->nodes[i];
+
+                const int64_t  ne00 = src0 ? src0->ne[0] : 0;
+                const int64_t  ne01 = src0 ? src0->ne[1] : 0;
+                const int64_t  ne02 = src0 ? src0->ne[2] : 0;
+                const int64_t  ne03 = src0 ? src0->ne[3] : 0;
+
+                const uint64_t nb00 = src0 ? src0->nb[0] : 0;
+                const uint64_t nb01 = src0 ? src0->nb[1] : 0;
+                const uint64_t nb02 = src0 ? src0->nb[2] : 0;
+                const uint64_t nb03 = src0 ? src0->nb[3] : 0;
+
+                const int64_t  ne10 = src1 ? src1->ne[0] : 0;
+                const int64_t  ne11 = src1 ? src1->ne[1] : 0;
+                const int64_t  ne12 = src1 ? src1->ne[2] : 0;
+                const int64_t  ne13 = src1 ? src1->ne[3] : 0; UNUSED(ne13);
+
+                const uint64_t nb10 = src1 ? src1->nb[0] : 0;
+                const uint64_t nb11 = src1 ? src1->nb[1] : 0;
+                const uint64_t nb12 = src1 ? src1->nb[2] : 0;
+                const uint64_t nb13 = src1 ? src1->nb[3] : 0; UNUSED(nb13);
+
+                const int64_t  ne0  = dst ? dst->ne[0] : 0;
+                const int64_t  ne1  = dst ? dst->ne[1] : 0;
+                const int64_t  ne2  = dst ? dst->ne[2] : 0;
+                const int64_t  ne3  = dst ? dst->ne[3] : 0;
+
+                const uint64_t nb0  = dst ? dst->nb[0] : 0;
+                const uint64_t nb1  = dst ? dst->nb[1] : 0;
+                const uint64_t nb2  = dst ? dst->nb[2] : 0;
+                const uint64_t nb3  = dst ? dst->nb[3] : 0;
+
+                const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT;
+                const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
+                const enum ggml_type dstt  = dst  ? dst->type  : GGML_TYPE_COUNT;
+
+                id<MTLBuffer> id_src0 = src0 ? ggml_metal_get_buffer(ctx, src0, &offs_src0) : nil;
+                id<MTLBuffer> id_src1 = src1 ? ggml_metal_get_buffer(ctx, src1, &offs_src1) : nil;
+                id<MTLBuffer> id_dst  = dst  ? ggml_metal_get_buffer(ctx, dst,  &offs_dst)  : nil;
+
+                //metal_printf("%s: op - %s\n", __func__, ggml_op_name(dst->op));
+                //if (src0) {
+                //    metal_printf("%s: src0 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src0t), ne00, ne01, ne02,
+                //            ggml_is_contiguous(src0), src0->name);
+                //}
+                //if (src1) {
+                //    metal_printf("%s: src1 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src1t), ne10, ne11, ne12,
+                //            ggml_is_contiguous(src1), src1->name);
+                //}
+                //if (dst) {
+                //    metal_printf("%s: dst  - %4s [%5lld, %5lld, %5lld], 1, %s\n",  __func__, ggml_type_name(dstt),  ne0,  ne1,  ne2,
+                //            dst->name);
+                //}
+
+                switch (dst->op) {
+                    case GGML_OP_RESHAPE:
+                    case GGML_OP_VIEW:
+                    case GGML_OP_TRANSPOSE:
+                    case GGML_OP_PERMUTE:
+                        {
+                            // noop
+                        } break;
+                    case GGML_OP_ADD:
+                        {
+                            if (encoder == nil) {
+                                encoder = [command_buffer computeCommandEncoder];
+                            }
+
+                            [encoder setComputePipelineState:ctx->pipeline_add];
+                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                            [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
+                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
+
+                            const int64_t n = ggml_nelements(dst);
+
+                            [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                        } break;
+                    case GGML_OP_MUL:
+                        {
+                            if (encoder == nil) {
+                                encoder = [command_buffer computeCommandEncoder];
+                            }
+
+                            if (ggml_nelements(src1) == ne10) {
+                                // src1 is a row
+                                [encoder setComputePipelineState:ctx->pipeline_mul_row];
+                            } else {
+                                [encoder setComputePipelineState:ctx->pipeline_mul];
+                            }
+                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                            [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
+                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
+                            [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
+
+                            const int64_t n = ggml_nelements(dst);
+
+                            [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                        } break;
+                    case GGML_OP_SCALE:
+                        {
+                            if (encoder == nil) {
+                                encoder = [command_buffer computeCommandEncoder];
+                            }
+
+                            const float scale = *(const float *) src1->data;
+
+                            [encoder setComputePipelineState:ctx->pipeline_scale];
+                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+                            [encoder setBytes:&scale length:sizeof(scale) atIndex:2];
+
+                            const int64_t n = ggml_nelements(dst);
+
+                            [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                        } break;
+                    case GGML_OP_SILU:
+                        {
+                            if (encoder == nil) {
+                                encoder = [command_buffer computeCommandEncoder];
+                            }
+
+                            [encoder setComputePipelineState:ctx->pipeline_silu];
+                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+
+                            const int64_t n = ggml_nelements(dst);
+
+                            [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                        } break;
+                    case GGML_OP_RELU:
+                        {
+                            if (encoder == nil) {
+                                encoder = [command_buffer computeCommandEncoder];
+                            }
+
+                            [encoder setComputePipelineState:ctx->pipeline_relu];
+                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+
+                            const int64_t n = ggml_nelements(dst);
+
+                            [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                        } break;
+                    case GGML_OP_GELU:
+                    {
+                            if (encoder == nil) {
+                                encoder = [command_buffer computeCommandEncoder];
+                            }
+
+                            [encoder setComputePipelineState:ctx->pipeline_gelu];
+                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+
+                            const int64_t n = ggml_nelements(dst);
+
+                            [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                    } break;
+                    case GGML_OP_SOFT_MAX:
+                        {
+                            if (encoder == nil) {
+                                encoder = [command_buffer computeCommandEncoder];
+                            }
+
+                            const int nth = 32;
+
+                            [encoder setComputePipelineState:ctx->pipeline_soft_max];
+                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+                            [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
+                            [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
+                            [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
+                            [encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0];
+
+                            [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+                        } break;
+                    case GGML_OP_DIAG_MASK_INF:
+                        {
+                            if (encoder == nil) {
+                                encoder = [command_buffer computeCommandEncoder];
+                            }
+
+                            const int n_past = ((int32_t *)(src1->data))[0];
+
+                            [encoder setComputePipelineState:ctx->pipeline_diag_mask_inf];
+                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+                            [encoder setBytes:&ne00   length:sizeof(ne00) atIndex:2];
+                            [encoder setBytes:&ne01   length:sizeof(ne01) atIndex:3];
+                            [encoder setBytes:&n_past length:sizeof(int)  atIndex:4];
+
+                            [encoder dispatchThreadgroups:MTLSizeMake(ne00, ne01, ne02) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                        } break;
+                    case GGML_OP_MUL_MAT:
+                        {
+                            // TODO: needs to be updated after PR: https://github.com/ggerganov/ggml/pull/224
+
+                            GGML_ASSERT(ne00 == ne10);
+                            GGML_ASSERT(ne02 == ne12);
+
+                            if (ggml_is_contiguous(src0) &&
+                                ggml_is_contiguous(src1) &&
+                                (src0t == GGML_TYPE_F32 || src0t == GGML_TYPE_F16) && ne11 > 1) {
+
+                                if (encoder != nil) {
+                                    [encoder endEncoding];
+                                    encoder = nil;
+                                }
+
+                                MPSDataType src0dt = src0t == GGML_TYPE_F32 ? MPSDataTypeFloat32 : MPSDataTypeFloat16;
+                                MPSDataType src1dt = src1t == GGML_TYPE_F32 ? MPSDataTypeFloat32 : MPSDataTypeFloat16;
+
+                                // for F32 x F32 we use MPS
+                                MPSMatrixDescriptor * desc0 = [MPSMatrixDescriptor
+                                    matrixDescriptorWithRows:ne01 columns:ne00 rowBytes:src0->nb[1] dataType:src0dt];
+
+                                MPSMatrixDescriptor * desc1 = [MPSMatrixDescriptor
+                                    matrixDescriptorWithRows:ne11 columns:ne10 rowBytes:src1->nb[1] dataType:src1dt];
+
+                                MPSMatrixDescriptor * desc  = [MPSMatrixDescriptor
+                                    matrixDescriptorWithRows:ne1 columns:ne0 rowBytes:dst->nb[1] dataType:MPSDataTypeFloat32];
+
+                                MPSMatrixMultiplication * mul = [[MPSMatrixMultiplication alloc]
+                                    initWithDevice:ctx->device transposeLeft:false transposeRight:true
+                                        resultRows:ne11 resultColumns:ne01 interiorColumns:ne00 alpha:1.0 beta:0.0];
+
+                                // we need to do ne02 multiplications
+                                // TODO: is there a way to do this in parallel - currently very slow ..
+                                // TODO: might be possible to offload part of the computation to ANE using Accelerate's CBLAS
+                                for (int64_t i02 = 0; i02 < ne02; ++i02) {
+                                    size_t offs_src0_cur = offs_src0 + i02*nb02;
+                                    size_t offs_src1_cur = offs_src1 + i02*nb12;
+                                    size_t offs_dst_cur  = offs_dst  + i02*nb2;
+
+                                    MPSMatrix * mat_src0 = [[MPSMatrix alloc] initWithBuffer:id_src0 offset:offs_src0_cur descriptor:desc0];
+                                    MPSMatrix * mat_src1 = [[MPSMatrix alloc] initWithBuffer:id_src1 offset:offs_src1_cur descriptor:desc1];
+                                    MPSMatrix * mat_dst  = [[MPSMatrix alloc] initWithBuffer:id_dst  offset:offs_dst_cur  descriptor:desc ];
+
+                                    [mul encodeToCommandBuffer:command_buffer leftMatrix:mat_src1 rightMatrix:mat_src0 resultMatrix:mat_dst];
+                                }
+                            } else {
+                                if (encoder == nil) {
+                                    encoder = [command_buffer computeCommandEncoder];
+                                }
+
+                                int nth0 = 32;
+                                int nth1 = 1;
+
+                                // use custom matrix x vector kernel
+                                switch (src0t) {
+                                    case GGML_TYPE_F16:
+                                        {
+                                            GGML_ASSERT(ne02 == ne12);
+
+                                            nth0 = 64;
+                                            nth1 = 1;
+                                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
+                                        } break;
+                                    case GGML_TYPE_Q4_0:
+                                        {
+                                            GGML_ASSERT(ne02 == 1);
+                                            GGML_ASSERT(ne12 == 1);
+
+                                            nth0 = 8;
+                                            nth1 = 8;
+                                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_0_f32];
+                                        } break;
+                                    case GGML_TYPE_Q4_1:
+                                        {
+                                            GGML_ASSERT(ne02 == 1);
+                                            GGML_ASSERT(ne12 == 1);
+
+                                            nth0 = 8;
+                                            nth1 = 8;
+                                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_1_f32];
+                                        } break;
+                                    case GGML_TYPE_Q2_K:
+                                        {
+                                            GGML_ASSERT(ne02 == 1);
+                                            GGML_ASSERT(ne12 == 1);
+
+                                            nth0 = 4;
+                                            nth1 = 16;
+                                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_q2_k_f32];
+                                        } break;
+                                    case GGML_TYPE_Q3_K:
+                                        {
+                                            GGML_ASSERT(ne02 == 1);
+                                            GGML_ASSERT(ne12 == 1);
+
+                                            nth0 = 4;
+                                            nth1 = 16;
+                                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_q3_k_f32];
+                                        } break;
+                                    case GGML_TYPE_Q4_K:
+                                        {
+                                            GGML_ASSERT(ne02 == 1);
+                                            GGML_ASSERT(ne12 == 1);
+
+                                            nth0 = 4;
+                                            nth1 = 16;
+                                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_k_f32];
+                                        } break;
+                                    case GGML_TYPE_Q5_K:
+                                        {
+                                            GGML_ASSERT(ne02 == 1);
+                                            GGML_ASSERT(ne12 == 1);
+
+                                            nth0 = 4;
+                                            nth1 = 16;
+                                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_q5_k_f32];
+                                        } break;
+                                    case GGML_TYPE_Q6_K:
+                                        {
+                                            GGML_ASSERT(ne02 == 1);
+                                            GGML_ASSERT(ne12 == 1);
+
+                                            nth0 = 4;
+                                            nth1 = 16;
+                                            [encoder setComputePipelineState:ctx->pipeline_mul_mat_q6_k_f32];
+                                        } break;
+                                    default:
+                                        {
+                                            fprintf(stderr, "Asserting on type %d\n",(int)src0t);
+                                            GGML_ASSERT(false && "not implemented");
+                                        }
+                                };
+
+                                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                                [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
+                                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
+                                [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
+                                [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
+                                [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:5];
+                                [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:6];
+                                [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:7];
+                                [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:8];
+                                [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:9];
+                                [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:10];
+                                [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:11];
+                                [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:12];
+                                [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:13];
+                                [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:14];
+
+                                if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1) {
+                                    [encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0];
+                                    [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                                }
+                                else if (src0t == GGML_TYPE_Q2_K ||
+                                         src0t == GGML_TYPE_Q3_K ||
+                                         src0t == GGML_TYPE_Q4_K ||
+                                         src0t == GGML_TYPE_Q5_K ||
+                                         src0t == GGML_TYPE_Q6_K) {
+                                    [encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0];
+                                    [encoder dispatchThreadgroups:MTLSizeMake(ne01, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                                } else {
+                                    [encoder setThreadgroupMemoryLength:nth0*sizeof(float) atIndex:0];
+                                    [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                                }
+                            }
+                        } break;
+                    case GGML_OP_GET_ROWS:
+                        {
+                            if (encoder == nil) {
+                                encoder = [command_buffer computeCommandEncoder];
+                            }
+
+                            switch (src0->type) {
+                                case GGML_TYPE_F16:  [encoder setComputePipelineState:ctx->pipeline_get_rows_f16]; break;
+                                case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_0]; break;
+                                case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_1]; break;
+                                case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q2_k]; break;
+                                case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q3_k]; break;
+                                case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_k]; break;
+                                case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_k]; break;
+                                case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q6_k]; break;
+                                default: GGML_ASSERT(false && "not implemented");
+                            }
+
+                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                            [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
+                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
+                            [encoder setBytes:&(src0->ne[0]) length:sizeof( int64_t) atIndex:3];
+                            [encoder setBytes:&(src0->nb[1]) length:sizeof(uint64_t) atIndex:4];
+                            [encoder setBytes:&(dst->nb[1])  length:sizeof(uint64_t) atIndex:5];
+
+                            const int64_t n = ggml_nelements(src1);
+
+                            [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                        } break;
+                    case GGML_OP_RMS_NORM:
+                        {
+                            if (encoder == nil) {
+                                encoder = [command_buffer computeCommandEncoder];
+                            }
+
+                            const float eps = 1e-6f;
+
+                            const int nth = 256;
+
+                            [encoder setComputePipelineState:ctx->pipeline_rms_norm];
+                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+                            [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
+                            [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3];
+                            [encoder setBytes:&eps  length:sizeof(   float) atIndex:4];
+                            [encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0];
+
+                            const int64_t nrows = ggml_nrows(src0);
+
+                            [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+                        } break;
+                    case GGML_OP_ROPE:
+                        {
+                            if (encoder == nil) {
+                                encoder = [command_buffer computeCommandEncoder];
+                            }
+
+                            const int n_dims = ((int32_t *) src1->data)[1];
+                            const int mode   = ((int32_t *) src1->data)[2];
+
+                            const int n_past = ((int32_t *)(src1->data))[0];
+
+                            [encoder setComputePipelineState:ctx->pipeline_rope];
+                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+                            [encoder setBytes:&ne00   length:sizeof( int64_t) atIndex:2];
+                            [encoder setBytes:&ne01   length:sizeof( int64_t) atIndex:3];
+                            [encoder setBytes:&ne02   length:sizeof( int64_t) atIndex:4];
+                            [encoder setBytes:&ne03   length:sizeof( int64_t) atIndex:5];
+                            [encoder setBytes:&nb00   length:sizeof(uint64_t) atIndex:6];
+                            [encoder setBytes:&nb01   length:sizeof(uint64_t) atIndex:7];
+                            [encoder setBytes:&nb02   length:sizeof(uint64_t) atIndex:8];
+                            [encoder setBytes:&nb03   length:sizeof(uint64_t) atIndex:9];
+                            [encoder setBytes:&ne0    length:sizeof( int64_t) atIndex:10];
+                            [encoder setBytes:&ne1    length:sizeof( int64_t) atIndex:11];
+                            [encoder setBytes:&ne2    length:sizeof( int64_t) atIndex:12];
+                            [encoder setBytes:&ne3    length:sizeof( int64_t) atIndex:13];
+                            [encoder setBytes:&nb0    length:sizeof(uint64_t) atIndex:14];
+                            [encoder setBytes:&nb1    length:sizeof(uint64_t) atIndex:15];
+                            [encoder setBytes:&nb2    length:sizeof(uint64_t) atIndex:16];
+                            [encoder setBytes:&nb3    length:sizeof(uint64_t) atIndex:17];
+                            [encoder setBytes:&n_past length:sizeof(     int) atIndex:18];
+                            [encoder setBytes:&n_dims length:sizeof(     int) atIndex:19];
+                            [encoder setBytes:&mode   length:sizeof(     int) atIndex:20];
+
+                            [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                        } break;
+                    case GGML_OP_CPY:
+                        {
+                            if (encoder == nil) {
+                                encoder = [command_buffer computeCommandEncoder];
+                            }
+
+                            const int nth = 32;
+
+                            switch (src0t) {
+                                case GGML_TYPE_F32:
+                                    {
+                                        switch (dstt) {
+                                            case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_f16]; break;
+                                            case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_cpy_f32_f32]; break;
+                                            default: GGML_ASSERT(false && "not implemented");
+                                        };
+                                    } break;
+                                default: GGML_ASSERT(false && "not implemented");
+                            }
+
+                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+                            [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
+                            [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3];
+                            [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4];
+                            [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5];
+                            [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6];
+                            [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7];
+                            [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8];
+                            [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9];
+                            [encoder setBytes:&ne0  length:sizeof( int64_t) atIndex:10];
+                            [encoder setBytes:&ne1  length:sizeof( int64_t) atIndex:11];
+                            [encoder setBytes:&ne2  length:sizeof( int64_t) atIndex:12];
+                            [encoder setBytes:&ne3  length:sizeof( int64_t) atIndex:13];
+                            [encoder setBytes:&nb0  length:sizeof(uint64_t) atIndex:14];
+                            [encoder setBytes:&nb1  length:sizeof(uint64_t) atIndex:15];
+                            [encoder setBytes:&nb2  length:sizeof(uint64_t) atIndex:16];
+                            [encoder setBytes:&nb3  length:sizeof(uint64_t) atIndex:17];
+
+                            [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+                        } break;
+                    default:
+                        fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
+                        GGML_ASSERT(false);
+                }
+            }
+
+            if (encoder != nil) {
+                [encoder endEncoding];
+                encoder = nil;
+            }
+
+            [command_buffer commit];
+        });
+    }
+
+    // wait for all threads to finish
+    dispatch_barrier_sync(queue, ^{});
+
+    [command_buffers[n_cb - 1] waitUntilCompleted];
+}
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
--- a/ggml-opencl.c
+++ b/ggml-opencl.c
@@ -1,370 +0,0 @@
-#include "ggml-opencl.h"
-
-#define CL_TARGET_OPENCL_VERSION 110
-#include <clblast_c.h>
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-
-#include "ggml.h"
-
-#define MULTILINE_QUOTE(...) #__VA_ARGS__
-const char * clblast_dequant = MULTILINE_QUOTE(
-
-struct block_q4_0
-{
-    float d;
-    uchar qs[16];
-};
-
-__kernel void dequantize_row_q4_0(__global struct block_q4_0* blocks, __global float* result) {
-    const uint i = get_global_id(0) / 32;
-    const uint l = get_local_id(0);
-
-    const float d = blocks[i].d;
-
-    const uchar vi = blocks[i].qs[l];
-
-    const uint index = i*32 + l*2;
-    result[index + 0] = ((vi & 0xf) - 8)*d;
-    result[index + 1] = ((vi >> 4) - 8)*d;
-}
-
-struct block_q4_1
-{
-    float d;
-    float m;
-    uchar qs[16];
-};
-
-__kernel void dequantize_row_q4_1(__global struct block_q4_1* blocks, __global float* result) {
-    const uint i = get_global_id(0) / 32;
-    const uint l = get_local_id(0);
-
-    const float d = blocks[i].d;
-    const float m = blocks[i].m;
-
-    const uchar vi = blocks[i].qs[l];
-
-    const uint index = i*32 + l*2;
-    result[index + 0] = (vi & 0xf) * d + m;
-    result[index + 1] = (vi >> 4) * d + m;
-}
-
-struct block_q5_0
-{
-    float d;
-    uint qh;
-    uchar qs[16];
-};
-
-__kernel void dequantize_row_q5_0(__global struct block_q5_0* blocks, __global float* result) {
-    const uint i = get_global_id(0) / 32;
-    const uint l = get_local_id(0);
-
-    const float d = blocks[i].d;
-
-    const uchar vi = blocks[i].qs[l];
-
-    const uint l2 = l * 2;
-
-    const uchar vh0 = ((blocks[i].qh & (1 << (l2 + 0))) >> (l2 + 0)) << 4;
-    const uchar vh1 = ((blocks[i].qh & (1 << (l2 + 1))) >> (l2 + 1)) << 4;
-
-    const uint index = i*32 + l2;
-    result[index + 0] = (((vi & 0xf) | vh0) - 16)*d;
-    result[index + 1] = (((vi >>  4) | vh1) - 16)*d;
-}
-
-struct block_q5_1
-{
-    ushort d;
-    ushort m;
-    uint qh;
-    uchar qs[16];
-};
-
-__kernel void dequantize_row_q5_1(__global struct block_q5_1* blocks, __global float* result) {
-    const uint i = get_global_id(0) / 32;
-    const uint l = get_local_id(0);
-
-    const float d = vload_half(0, (__global half*) &blocks[i].d);
-    const float m = vload_half(0, (__global half*) &blocks[i].m);
-
-    const uchar vi = blocks[i].qs[l];
-
-    const uint l2 = l * 2;
-
-    const uchar vh0 = ((blocks[i].qh & (1 << (l2 + 0))) >> (l2 + 0)) << 4;
-    const uchar vh1 = ((blocks[i].qh & (1 << (l2 + 1))) >> (l2 + 1)) << 4;
-
-    const uint index = i*32 + l2;
-    result[index + 0] = ((vi & 0xf) | vh0)*d + m;
-    result[index + 1] = ((vi >>  4) | vh1)*d + m;
-}
-
-struct block_q8_0
-{
-    float d;
-    char qs[32];
-};
-
-__kernel void dequantize_row_q8_0(__global struct block_q8_0* blocks, __global float* result) {
-    const uint i = get_global_id(0) / 32;
-    const uint l = get_local_id(0);
-
-    result[i*32 + l] = blocks[i].qs[l] * blocks[i].d;
-}
-
-);
-
-#define CL_CHECK(err, name)                                                                     \
-    do {                                                                                        \
-        cl_int err_ = (err);                                                                    \
-        if (err_ != CL_SUCCESS) {                                                               \
-            fprintf(stderr, "OpenCL %s error %d at %s:%d\n", name, err_, __FILE__, __LINE__);   \
-            exit(1);                                                                            \
-        }                                                                                       \
-    } while (0)
-
-#define QK5_0 32
-typedef struct {
-    ggml_fp16_t d;         // delta
-    uint8_t qh[4];         // 5-th bit of quants
-    uint8_t qs[QK5_0 / 2]; // nibbles / quants
-} block_q5_0;
-
-
-typedef struct {
-    float d;                // delta
-    uint32_t qh;          // 5-th bit of quants
-    uint8_t qs[QK5_0 / 2];  // nibbles / quants
-} cl_block_q5_0;
-
-static cl_platform_id platform;
-static cl_device_id device;
-static cl_context context;
-static cl_command_queue queue;
-static cl_program program;
-static cl_kernel kernel_q4_0, kernel_q4_1, kernel_q5_0, kernel_q5_1, kernel_q8_0;
-static cl_mem cl_buffer_a, cl_buffer_qb, cl_buffer_b, cl_buffer_c;
-static size_t cl_size_a = 0, cl_size_qb = 0, cl_size_b = 0, cl_size_c = 0;
-
-static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer) {
-    cl_program p;
-    char *program_log;
-    size_t program_size, log_size;
-    int err;
-
-    program_size = strlen(program_buffer);
-
-    p = clCreateProgramWithSource(ctx, 1, (const char**)&program_buffer, &program_size, &err);
-    if(err < 0) {
-        fprintf(stderr, "OpenCL error creating program");
-        exit(1);
-    }
-
-    err = clBuildProgram(p, 0, NULL, NULL, NULL, NULL);
-    if(err < 0) {
-
-        clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
-        program_log = (char*) malloc(log_size + 1);
-        program_log[log_size] = '\0';
-        clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, log_size + 1, program_log, NULL);
-        printf("%s\n", program_log);
-        free(program_log);
-        exit(1);
-    }
-
-    return p;
-}
-
-void ggml_cl_init(void) {
-    cl_int err = 0;
-    char * GGML_CLBLAST_PLATFORM = getenv("GGML_CLBLAST_PLATFORM");
-    char * GGML_CLBLAST_DEVICE = getenv("GGML_CLBLAST_DEVICE");
-    int plat_num = (GGML_CLBLAST_PLATFORM == NULL ? 0 : atoi(GGML_CLBLAST_PLATFORM));
-    int dev_num = (GGML_CLBLAST_DEVICE == NULL ? 0 : atoi(GGML_CLBLAST_DEVICE));
-    printf("\nInitializing CLBlast (First Run)...");
-    printf("\nAttempting to use: Platform=%d, Device=%d (If invalid, program will crash)\n",plat_num,dev_num);
-    cl_uint num_platforms;
-    clGetPlatformIDs(0, NULL, &num_platforms);
-    cl_platform_id* platforms = (cl_platform_id*)malloc(num_platforms*sizeof(cl_platform_id));
-    clGetPlatformIDs(num_platforms, platforms, NULL);
-    platform = platforms[plat_num];
-    char platform_buffer[1024];
-    clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_buffer), &platform_buffer, NULL);
-    cl_uint num_devices;
-    clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices);
-    cl_device_id* devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id));
-    clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
-    device = devices[dev_num];
-    char device_buffer[1024];
-    clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_buffer), &device_buffer, NULL);
-    printf("Using Platform: %s Device: %s\n", platform_buffer, device_buffer);
-    context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
-    CL_CHECK(err, "clCreateContext");
-    queue = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err);
-    CL_CHECK(err, "clCreateCommandQueue");
-
-    free(platforms);
-    free(devices);
-
-    program = build_program_from_source(context, device, clblast_dequant);
-
-    // Prepare dequantize kernels
-    kernel_q4_0 = clCreateKernel(program, "dequantize_row_q4_0", &err);
-    CL_CHECK(err, "clCreateKernel");
-    kernel_q4_1 = clCreateKernel(program, "dequantize_row_q4_1", &err);
-    CL_CHECK(err, "clCreateKernel");
-    kernel_q5_0 = clCreateKernel(program, "dequantize_row_q5_0", &err);
-    CL_CHECK(err, "clCreateKernel");
-    kernel_q5_1 = clCreateKernel(program, "dequantize_row_q5_1", &err);
-    CL_CHECK(err, "clCreateKernel");
-    kernel_q8_0 = clCreateKernel(program, "dequantize_row_q8_0", &err);
-    CL_CHECK(err, "clCreateKernel");
-}
-
-static void ggml_cl_malloc(size_t req_size, size_t* cur_size, cl_mem_flags flags, cl_mem* buf) {
-    if (req_size <= *cur_size) {
-        return;
-    }
-
-    // Reallocate buffer with enough space
-    if (*cur_size > 0) {
-        clReleaseMemObject(*buf);
-    }
-    cl_int err;
-    *buf = clCreateBuffer(context, flags, req_size, NULL, &err);
-    *cur_size = req_size;
-    CL_CHECK(err, "clCreateBuffer");
-}
-
-void ggml_cl_sgemm_wrapper(
-        const enum ggml_blas_order order, const enum ggml_blas_op trans_a, const enum ggml_blas_op trans_b,
-        const int m, const int n, const int k,
-        const float alpha, const void *host_a, const int lda,
-        const float *host_b, const int ldb, const float beta,
-        float *host_c, const int ldc, const int btype) {
-    cl_int err = 0;
-
-    cl_kernel kernel;
-    size_t global = n * k, local, size_qb;
-    bool dequant;
-    cl_block_q5_0* cl_host_b;
-
-    switch (btype) {
-    case GGML_TYPE_F32:
-        dequant = false;
-        break;
-    case GGML_TYPE_Q4_0:
-        dequant = true;
-        kernel = kernel_q4_0;
-        local = 16;
-        size_qb = global * (sizeof(float) + local) / 32;
-        break;
-    case GGML_TYPE_Q4_1:
-        dequant = true;
-        kernel = kernel_q4_1;
-        local = 16;
-        size_qb = global * (sizeof(float) * 2 + local) / 32;
-        break;
-    case GGML_TYPE_Q5_0:
-        dequant = true;
-        kernel = kernel_q5_0;
-        local = 16;
-        // For some reason OpenCL seems to be incapable of working with structs of size 22.
-        // 20 and 24 bytes are fine. Workaround to do the fp16 to fp32 step on CPU...
-        // TODO Find the reason, fix and remove workaround.
-        const block_q5_0* b = (const block_q5_0*) host_b;
-        cl_host_b = (cl_block_q5_0*) malloc(sizeof(cl_block_q5_0) * global / 32);
-        for (size_t i = 0; i < global / 32; i++) {
-            cl_host_b[i].d = ggml_fp16_to_fp32(b[i].d);
-            memcpy(&cl_host_b[i].qh, b[i].qh, sizeof(uint32_t));
-            memcpy(&cl_host_b[i].qs, b[i].qs, QK5_0 / 2);
-        }
-        host_b = (const float*) cl_host_b;
-        size_qb = global * (sizeof(float) + sizeof(uint32_t) + local) / 32;
-        break;
-    case GGML_TYPE_Q5_1:
-        dequant = true;
-        kernel = kernel_q5_1;
-        local = 16;
-        size_qb = global * (sizeof(ggml_fp16_t) * 2 + sizeof(uint32_t) + local) / 32;
-        break;
-    case GGML_TYPE_Q8_0:
-        dequant = true;
-        kernel = kernel_q8_0;
-        local = 32;
-        size_qb = global * (sizeof(float) + local) / 32;
-        break;
-    default:
-        fprintf(stderr, "Error: Unsupported OpenCL btype %d\n", btype);
-        abort();
-    }
-
-    const size_t size_a =  m * k * sizeof(float);
-    const size_t size_b =  n * k * sizeof(float);
-    const size_t size_c =  m * n * sizeof(float);
-
-    // Prepare buffers
-    ggml_cl_malloc(size_a, &cl_size_a, CL_MEM_READ_ONLY, &cl_buffer_a);
-    if (dequant) {
-        ggml_cl_malloc(size_qb, &cl_size_qb, CL_MEM_READ_ONLY, &cl_buffer_qb);
-    }
-    ggml_cl_malloc(size_b, &cl_size_b, CL_MEM_READ_WRITE, &cl_buffer_b);
-    ggml_cl_malloc(size_c, &cl_size_c, CL_MEM_WRITE_ONLY, &cl_buffer_c);
-
-    cl_event ev_a, ev_qb, ev_b;
-
-    if (dequant) {
-        err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &cl_buffer_qb);
-        err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_buffer_b);
-        CL_CHECK(err, "clSetKernelArg");
-        err = clEnqueueWriteBuffer(queue, cl_buffer_qb, CL_FALSE, 0, size_qb, host_b, 0, NULL, &ev_qb);
-        CL_CHECK(err, "clEnqueueWriteBuffer qb");
-    } else {
-        err = clEnqueueWriteBuffer(queue, cl_buffer_b, CL_FALSE, 0, size_b, host_b, 0, NULL, &ev_b);
-        CL_CHECK(err, "clEnqueueWriteBuffer b");
-    }
-
-    err = clEnqueueWriteBuffer(queue, cl_buffer_a, CL_FALSE, 0, size_a, host_a, 0, NULL, &ev_a);
-    CL_CHECK(err, "clEnqueueWriteBuffer a");
-    if (dequant) {
-        err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 1, &ev_qb, &ev_b);
-        CL_CHECK(err, "clEnqueueNDRangeKernel");
-        clReleaseEvent(ev_qb);
-    }
-    clWaitForEvents(1, &ev_a);
-    clWaitForEvents(1, &ev_b);
-    clReleaseEvent(ev_a);
-    clReleaseEvent(ev_b);
-
-    cl_event ev_sgemm;
-    CLBlastStatusCode status = CLBlastSgemm((CLBlastLayout)order,
-                                            (CLBlastTranspose)trans_a, (CLBlastTranspose)trans_b,
-                                            m, n, k,
-                                            alpha,
-                                            cl_buffer_a, 0, lda,
-                                            cl_buffer_b, 0, ldb,
-                                            beta,
-                                            cl_buffer_c, 0, ldc,
-                                            &queue, &ev_sgemm);
-
-    if (status != CLBlastSuccess) {
-        fprintf(stderr, "Error: CLBlast SGEMM %d\n", status);
-        abort();
-    }
-
-    cl_event ev_c;
-    clEnqueueReadBuffer(queue, cl_buffer_c, CL_TRUE, 0, size_c, host_c, 1, &ev_sgemm, &ev_c);
-
-    // Wait for completion
-    clWaitForEvents(1, &ev_c);
-    clReleaseEvent(ev_sgemm);
-    clReleaseEvent(ev_c);
-    if (btype == GGML_TYPE_Q5_0) {
-        free((void*) cl_host_b);
-    }
-}
--- a/ggml-opencl.cpp
+++ b/ggml-opencl.cpp
--- a/ggml-opencl.h
+++ b/ggml-opencl.h
@@ -1,23 +1,24 @@
 #pragma once

+#include "ggml.h"
+
 #ifdef  __cplusplus
 extern "C" {
 #endif

 void ggml_cl_init(void);

-enum ggml_blas_order {
-    GGML_BLAS_ORDER_ROW_MAJOR = 101,
-    GGML_BLAS_ORDER_COLUMN_MAJOR = 102,
-};
+void   ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
+bool   ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
+size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
+void   ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);

-enum ggml_blas_op {
-    GGML_BLAS_OP_N = 111,
-    GGML_BLAS_OP_T = 112,
-    GGML_BLAS_OP_C = 113,
-};
+void * ggml_cl_host_malloc(size_t size);
+void   ggml_cl_host_free(void * ptr);

-void ggml_cl_sgemm_wrapper(const enum ggml_blas_order order, const enum ggml_blas_op trans_a, const enum ggml_blas_op trans_b, const int m, const int n, const int k, const float alpha, const void *host_a, const int lda, const float *host_b, const int ldb, const float beta, float *host_c, const int ldc, const int btype);
+void ggml_cl_free_data(const struct ggml_tensor* tensor);
+
+void ggml_cl_transform_tensor(void * data, struct ggml_tensor * tensor);

 #ifdef  __cplusplus
 }
--- a/ggml.c
+++ b/ggml.c
--- a/ggml.h
+++ b/ggml.h
@@ -190,11 +190,15 @@
 #define GGML_FILE_MAGIC   0x67676d6c // "ggml"
 #define GGML_FILE_VERSION 1

+#define GGML_QNT_VERSION        2    // bump this on quantization format changes
+#define GGML_QNT_VERSION_FACTOR 1000 // do not change this
+
 #define GGML_MAX_DIMS          4
 #define GGML_MAX_NODES         4096
-#define GGML_MAX_PARAMS        16
+#define GGML_MAX_PARAMS        256
 #define GGML_MAX_CONTEXTS      64
 #define GGML_MAX_OPT           4
+#define GGML_MAX_NAME          32
 #define GGML_DEFAULT_N_THREADS 4

 #define GGML_ASSERT(x) \
@@ -237,12 +241,25 @@ extern "C" {
        GGML_TYPE_Q5_1 = 7,
        GGML_TYPE_Q8_0 = 8,
        GGML_TYPE_Q8_1 = 9,
+        // k-quantizations
+        GGML_TYPE_Q2_K = 10,
+        GGML_TYPE_Q3_K = 11,
+        GGML_TYPE_Q4_K = 12,
+        GGML_TYPE_Q5_K = 13,
+        GGML_TYPE_Q6_K = 14,
+        GGML_TYPE_Q8_K = 15,
        GGML_TYPE_I8,
        GGML_TYPE_I16,
        GGML_TYPE_I32,
        GGML_TYPE_COUNT,
    };

+    enum ggml_backend {
+        GGML_BACKEND_CPU = 0,
+        GGML_BACKEND_GPU = 10,
+        GGML_BACKEND_GPU_SPLIT = 20,
+    };
+
    // model file types
    enum ggml_ftype {
        GGML_FTYPE_UNKNOWN     = -1,
@@ -254,6 +271,11 @@ extern "C" {
        GGML_FTYPE_MOSTLY_Q8_0 = 7,  // except 1d tensors
        GGML_FTYPE_MOSTLY_Q5_0 = 8,  // except 1d tensors
        GGML_FTYPE_MOSTLY_Q5_1 = 9,  // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q3_K = 11, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q4_K = 12, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q5_K = 13, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors
    };

    // available tensor operations:
@@ -262,14 +284,19 @@ extern "C" {

        GGML_OP_DUP,
        GGML_OP_ADD,
+        GGML_OP_ADD1,
+        GGML_OP_ACC,
        GGML_OP_SUB,
        GGML_OP_MUL,
        GGML_OP_DIV,
        GGML_OP_SQR,
        GGML_OP_SQRT,
+        GGML_OP_LOG,
        GGML_OP_SUM,
+        GGML_OP_SUM_ROWS,
        GGML_OP_MEAN,
        GGML_OP_REPEAT,
+        GGML_OP_REPEAT_BACK,
        GGML_OP_ABS,
        GGML_OP_SGN,
        GGML_OP_NEG,
@@ -277,12 +304,16 @@ extern "C" {
        GGML_OP_RELU,
        GGML_OP_GELU,
        GGML_OP_SILU,
+        GGML_OP_SILU_BACK,
        GGML_OP_NORM, // normalize
        GGML_OP_RMS_NORM,
+        GGML_OP_RMS_NORM_BACK,

        GGML_OP_MUL_MAT,
+        GGML_OP_OUT_PROD,

        GGML_OP_SCALE,
+        GGML_OP_SET,
        GGML_OP_CPY,
        GGML_OP_CONT,
        GGML_OP_RESHAPE,
@@ -290,19 +321,29 @@ extern "C" {
        GGML_OP_PERMUTE,
        GGML_OP_TRANSPOSE,
        GGML_OP_GET_ROWS,
+        GGML_OP_GET_ROWS_BACK,
+        GGML_OP_DIAG,
        GGML_OP_DIAG_MASK_INF,
+        GGML_OP_DIAG_MASK_ZERO,
        GGML_OP_SOFT_MAX,
+        GGML_OP_SOFT_MAX_BACK,
        GGML_OP_ROPE,
+        GGML_OP_ROPE_BACK,
        GGML_OP_ALIBI,
+        GGML_OP_CLAMP,
        GGML_OP_CONV_1D_1S,
        GGML_OP_CONV_1D_2S,

        GGML_OP_FLASH_ATTN,
        GGML_OP_FLASH_FF,
+        GGML_OP_FLASH_ATTN_BACK,

        GGML_OP_MAP_UNARY,
        GGML_OP_MAP_BINARY,

+        GGML_OP_CROSS_ENTROPY_LOSS,
+        GGML_OP_CROSS_ENTROPY_LOSS_BACK,
+
        GGML_OP_COUNT,
    };

@@ -321,7 +362,8 @@ extern "C" {

    // n-dimensional tensor
    struct ggml_tensor {
-        enum ggml_type type;
+        enum ggml_type    type;
+        enum ggml_backend backend;

        int     n_dims;
        int64_t ne[GGML_MAX_DIMS]; // number of elements
@@ -350,11 +392,15 @@ extern "C" {

        void * data;

-        char name[32];
+        char name[GGML_MAX_NAME];

-        char padding[8]; // TODO: remove and add padding to name?
+        void * extra; // extra things e.g. for ggml-cuda.cu
+
+        char padding[4];
    };

+    static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
+
    // computation graph
    struct ggml_cgraph {
        int n_nodes;
@@ -388,6 +434,25 @@ extern "C" {
        bool   no_alloc;   // don't allocate memory for the tensor data
    };

+
+    // compute types
+    enum ggml_task_type {
+        GGML_TASK_INIT = 0,
+        GGML_TASK_COMPUTE,
+        GGML_TASK_FINALIZE,
+    };
+
+    struct ggml_compute_params {
+        enum ggml_task_type type;
+
+        // ith = thread index, nth = number of threads
+        int ith, nth;
+
+        // work buffer for all threads
+        size_t wsize;
+        void * wdata;
+    };
+
    // misc

    GGML_API void    ggml_time_init(void); // call this once at the beginning of the program
@@ -399,14 +464,17 @@ extern "C" {
    GGML_API void    ggml_print_object (const struct ggml_object * obj);
    GGML_API void    ggml_print_objects(const struct ggml_context * ctx);

-    GGML_API int64_t ggml_nelements(const struct ggml_tensor * tensor);
-    GGML_API size_t  ggml_nbytes   (const struct ggml_tensor * tensor);
+    GGML_API int64_t ggml_nelements   (const struct ggml_tensor * tensor);
+    GGML_API int64_t ggml_nrows       (const struct ggml_tensor * tensor);
+    GGML_API size_t  ggml_nbytes      (const struct ggml_tensor * tensor);
+    GGML_API size_t  ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split);

    GGML_API int     ggml_blck_size (enum ggml_type type);
    GGML_API size_t  ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
    GGML_API float   ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float

    GGML_API const char * ggml_type_name(enum ggml_type type);
+    GGML_API const char * ggml_op_name  (enum ggml_op   op);

    GGML_API size_t  ggml_element_size(const struct ggml_tensor * tensor);

@@ -415,14 +483,25 @@ extern "C" {
    // TODO: temporary until model loading of ggml examples is refactored
    GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);

+    GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
+    GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
+    GGML_API bool ggml_is_permuted  (const struct ggml_tensor * tensor);
+
+    // use this to compute the memory overhead of a tensor
+    GGML_API size_t ggml_tensor_overhead(void);
+
    // main

    GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
-    GGML_API void    ggml_free(struct ggml_context * ctx);
+    GGML_API void                  ggml_free(struct ggml_context * ctx);

    GGML_API size_t  ggml_used_mem(const struct ggml_context * ctx);

-    GGML_API size_t  ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
+    GGML_API size_t  ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
+    GGML_API void    ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
+
+    GGML_API void *  ggml_get_mem_buffer(struct ggml_context * ctx);
+    GGML_API size_t  ggml_get_mem_size  (struct ggml_context * ctx);

    GGML_API struct ggml_tensor * ggml_new_tensor(
            struct ggml_context * ctx,
@@ -462,6 +541,8 @@ extern "C" {
    GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
    GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src);

+    GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
+
    GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
    GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
    GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
@@ -496,6 +577,34 @@ extern "C" {
            struct ggml_tensor  * a,
            struct ggml_tensor  * b);

+    GGML_API struct ggml_tensor * ggml_add1(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
+    GGML_API struct ggml_tensor * ggml_add1_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
+    GGML_API struct ggml_tensor * ggml_acc(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            size_t                nb1,
+            size_t                nb2,
+            size_t                nb3,
+            size_t                offset);
+
+    GGML_API struct ggml_tensor * ggml_acc_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            size_t                nb1,
+            size_t                nb2,
+            size_t                nb3,
+            size_t                offset);
+
    GGML_API struct ggml_tensor * ggml_sub(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
@@ -519,12 +628,24 @@ extern "C" {
            struct ggml_context * ctx,
            struct ggml_tensor  * a);

+    GGML_API struct ggml_tensor * ggml_log(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_log_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
    // return scalar
-    // TODO: compute sum along rows
    GGML_API struct ggml_tensor * ggml_sum(
            struct ggml_context * ctx,
            struct ggml_tensor  * a);

+    // sums along rows, with input shape [a,b,c,d] return shape [1,b,c,d]
+    GGML_API struct ggml_tensor * ggml_sum_rows(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
    // mean along rows
    GGML_API struct ggml_tensor * ggml_mean(
            struct ggml_context * ctx,
@@ -537,6 +658,11 @@ extern "C" {
            struct ggml_tensor  * a,
            struct ggml_tensor  * b);

+    GGML_API struct ggml_tensor * ggml_repeat_back(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
    GGML_API struct ggml_tensor * ggml_abs(
            struct ggml_context * ctx,
            struct ggml_tensor  * a);
@@ -566,6 +692,13 @@ extern "C" {
            struct ggml_context * ctx,
            struct ggml_tensor  * a);

+    // a - x
+    // b - dy
+    GGML_API struct ggml_tensor * ggml_silu_back(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
    // normalize along rows
    // TODO: eps is hardcoded to 1e-5 for now
    GGML_API struct ggml_tensor * ggml_norm(
@@ -576,24 +709,93 @@ extern "C" {
            struct ggml_context * ctx,
            struct ggml_tensor  * a);

-    // A: m rows, n columns
-    // B: p rows, n columns (i.e. we transpose it internally)
+    // a - x
+    // b - dy
+    GGML_API struct ggml_tensor * ggml_rms_norm_back(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
+    // A: n columns, m rows
+    // B: n columns, p rows  (i.e. we transpose it internally)
    // result is m columns, p rows
    GGML_API struct ggml_tensor * ggml_mul_mat(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            struct ggml_tensor  * b);

+    // A: m columns, n rows,
+    // B: p columns, n rows,
+    // result is m columns, p rows
+    GGML_API struct ggml_tensor * ggml_out_prod(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
    //
    // operations on tensors without backpropagation
    //

-    // in-place, returns view(a)
    GGML_API struct ggml_tensor * ggml_scale(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            struct ggml_tensor  * b);

+    // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_scale_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
+    // b -> view(a,offset,nb1,nb2,3), return modified a
+    GGML_API struct ggml_tensor * ggml_set(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            size_t                nb1,
+            size_t                nb2,
+            size_t                nb3,
+            size_t                offset);
+
+    // b -> view(a,offset,nb1,nb2,3), return view(a)
+    GGML_API struct ggml_tensor * ggml_set_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            size_t                nb1,
+            size_t                nb2,
+            size_t                nb3,
+            size_t                offset);
+
+    GGML_API struct ggml_tensor * ggml_set_1d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            size_t                offset);
+
+    GGML_API struct ggml_tensor * ggml_set_1d_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            size_t                offset);
+
+    // b -> view(a,offset,nb1,nb2,3), return modified a
+    GGML_API struct ggml_tensor * ggml_set_2d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            size_t                nb1,
+            size_t                offset);
+
+    // b -> view(a,offset,nb1,nb2,3), return view(a)
+    GGML_API struct ggml_tensor * ggml_set_2d_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            size_t                nb1,
+            size_t                offset);
+
+
    // a -> b, return view(b)
    GGML_API struct ggml_tensor * ggml_cpy(
            struct ggml_context * ctx,
@@ -614,6 +816,11 @@ extern "C" {

    // return view(a)
    // TODO: when we start computing gradient, make a copy instead of view
+    GGML_API struct ggml_tensor * ggml_reshape_1d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int64_t               ne0);
+
    GGML_API struct ggml_tensor * ggml_reshape_2d(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
@@ -629,6 +836,14 @@ extern "C" {
            int64_t               ne1,
            int64_t               ne2);

+    GGML_API struct ggml_tensor * ggml_reshape_4d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int64_t               ne0,
+            int64_t               ne1,
+            int64_t               ne2,
+            int64_t               ne3);
+
    // offset in bytes
    GGML_API struct ggml_tensor * ggml_view_1d(
            struct ggml_context * ctx,
@@ -654,6 +869,18 @@ extern "C" {
            size_t                nb2, // slice stride in bytes
            size_t                offset);

+    GGML_API struct ggml_tensor * ggml_view_4d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int64_t               ne0,
+            int64_t               ne1,
+            int64_t               ne2,
+            int64_t               ne3,
+            size_t                nb1, // row   stride in bytes
+            size_t                nb2, // slice stride in bytes
+            size_t                nb3,
+            size_t                offset);
+
    GGML_API struct ggml_tensor * ggml_permute(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
@@ -672,20 +899,61 @@ extern "C" {
            struct ggml_tensor  * a,
            struct ggml_tensor  * b);

+    GGML_API struct ggml_tensor * ggml_get_rows_back(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            struct ggml_tensor  * c);
+
+    GGML_API struct ggml_tensor * ggml_diag(
+        struct ggml_context     * ctx,
+        struct ggml_tensor      * a);
+
    // set elements above the diagonal to -INF
-    // in-place, returns view(a)
    GGML_API struct ggml_tensor * ggml_diag_mask_inf(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            int                   n_past);

    // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_diag_mask_inf_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_past);
+
+    // set elements above the diagonal to 0
+    GGML_API struct ggml_tensor * ggml_diag_mask_zero(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_past);
+
+    // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_diag_mask_zero_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_past);
+
    GGML_API struct ggml_tensor * ggml_soft_max(
            struct ggml_context * ctx,
            struct ggml_tensor  * a);

-    // rotary position embedding
    // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_soft_max_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_soft_max_back(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
+    // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_soft_max_back_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
+    // rotary position embedding
    // if mode & 1 == 1, skip n_past elements
    // if mode & 2 == 1, GPT-NeoX style
    // TODO: avoid creating a new tensor every time
@@ -696,13 +964,39 @@ extern "C" {
            int                   n_dims,
            int                   mode);

+    // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_rope_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_past,
+            int                   n_dims,
+            int                   mode);
+
+    // rotary position embedding backward, i.e compute dx from dy
+    // a - dy
+    GGML_API struct ggml_tensor * ggml_rope_back(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_past,
+            int                   n_dims,
+            int                   mode);
+
    // alibi position embedding
    // in-place, returns view(a)
    struct ggml_tensor * ggml_alibi(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            int                   n_past,
-            int                   n_head);
+            int                   n_head,
+            float                 bias_max);
+
+    // clamp
+    // in-place, returns view(a)
+    struct ggml_tensor * ggml_clamp(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            float                 min,
+            float                 max);

    // padding = 1
    // TODO: we don't support extra parameters for now
@@ -725,6 +1019,14 @@ extern "C" {
            struct ggml_tensor  * v,
            bool                  masked);

+    GGML_API struct ggml_tensor * ggml_flash_attn_back(
+           struct ggml_context * ctx,
+           struct ggml_tensor  * q,
+           struct ggml_tensor  * k,
+           struct ggml_tensor  * v,
+           struct ggml_tensor  * d,
+           bool                  masked);
+
    GGML_API struct ggml_tensor * ggml_flash_ff(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
@@ -740,13 +1042,26 @@ extern "C" {
    GGML_API struct ggml_tensor * ggml_map_unary_f32(
            struct ggml_context        * ctx,
            struct ggml_tensor         * a,
-            const  ggml_unary_op_f32_t fun);
+                   ggml_unary_op_f32_t   fun);

    GGML_API struct ggml_tensor * ggml_map_binary_f32(
            struct ggml_context         * ctx,
            struct ggml_tensor          * a,
            struct ggml_tensor          * b,
-            const  ggml_binary_op_f32_t fun);
+                   ggml_binary_op_f32_t   fun);
+
+    // loss function
+
+    GGML_API struct ggml_tensor * ggml_cross_entropy_loss(
+            struct ggml_context         * ctx,
+            struct ggml_tensor          * a,
+            struct ggml_tensor          * b);
+
+    GGML_API struct ggml_tensor * ggml_cross_entropy_loss_back(
+            struct ggml_context         * ctx,
+            struct ggml_tensor          * a,
+            struct ggml_tensor          * b,
+            struct ggml_tensor          * c);

    //
    // automatic differentiation
@@ -764,6 +1079,11 @@ extern "C" {
    GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
    GGML_API void ggml_graph_reset  (struct ggml_cgraph * cgraph);

+    GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
+
+    GGML_API void               ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname);
+    GGML_API struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval);
+
    // print info and performance information for the graph
    GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);

@@ -837,6 +1157,8 @@ extern "C" {
        struct {
            int n_iter;

+            float sched; // schedule multiplier (fixed, decay or warmup)
+            float decay; // weight decay for AdamW, use 0.0f to disable
            float alpha; // learning rate
            float beta1;
            float beta2;
@@ -861,6 +1183,49 @@ extern "C" {
        } lbfgs;
    };

+    struct ggml_opt_context {
+        struct ggml_context * ctx;
+        struct ggml_opt_params params;
+
+        int iter;
+        int64_t nx; // number of parameter elements
+
+        bool just_initialized;
+
+        struct {
+            struct ggml_tensor * x;  // view of the parameters
+            struct ggml_tensor * g1; // gradient
+            struct ggml_tensor * g2; // gradient squared
+            struct ggml_tensor * m;  // first moment
+            struct ggml_tensor * v;  // second moment
+            struct ggml_tensor * mh; // first moment hat
+            struct ggml_tensor * vh; // second moment hat
+            struct ggml_tensor * pf; // past function values
+            float fx_best;
+            float fx_prev;
+            int n_no_improvement;
+        } adam;
+
+        struct {
+            struct ggml_tensor * x;    // current parameters
+            struct ggml_tensor * xp;   // previous parameters
+            struct ggml_tensor * g;    // current gradient
+            struct ggml_tensor * gp;   // previous gradient
+            struct ggml_tensor * d;    // search direction
+            struct ggml_tensor * pf;   // past function values
+            struct ggml_tensor * lmal; // the L-BFGS memory alpha
+            struct ggml_tensor * lmys; // the L-BFGS memory ys
+            struct ggml_tensor * lms;  // the L-BFGS memory s
+            struct ggml_tensor * lmy;  // the L-BFGS memory y
+            float fx_best;
+            float step;
+            int j;
+            int k;
+            int end;
+            int n_no_improvement;
+        } lbfgs;
+    };
+
    GGML_API struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);

    // optimize the function defined by the tensor f
@@ -869,6 +1234,27 @@ extern "C" {
            struct ggml_opt_params params,
            struct ggml_tensor * f);

+    // initialize optimizer context
+    GGML_API void ggml_opt_init(
+            struct ggml_context * ctx,
+            struct ggml_opt_context * opt,
+            struct ggml_opt_params params,
+            int64_t nx);
+
+    // continue optimizing the function defined by the tensor f
+    GGML_API enum ggml_opt_result ggml_opt_resume(
+            struct ggml_context * ctx,
+            struct ggml_opt_context * opt,
+            struct ggml_tensor * f);
+
+    // continue optimizing the function defined by the tensor f
+    GGML_API enum ggml_opt_result ggml_opt_resume_g(
+            struct ggml_context * ctx,
+            struct ggml_opt_context * opt,
+            struct ggml_tensor * f,
+            struct ggml_cgraph * gf,
+            struct ggml_cgraph * gb);
+
    //
    // quantization
    //
--- a/k_quants.c
+++ b/k_quants.c
--- a/k_quants.h
+++ b/k_quants.h
@@ -0,0 +1,122 @@
+#pragma once
+
+#include "ggml.h"
+
+#include <stdint.h>
+#include <assert.h>
+#include <stddef.h>
+
+// Super-block size
+#define QK_K 256
+
+//
+// Super-block quantization structures
+//
+
+// 2-bit quantization
+// weight is represented as x = a * q + b
+// 16 blocks of 16 elemenets each
+// Effectively 2.5625 bits per weight
+typedef struct {
+    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
+    uint8_t qs[QK_K/4];      // quants
+    ggml_fp16_t d;           // super-block scale for quantized scales
+    ggml_fp16_t dmin;        // super-block scale for quantized mins
+} block_q2_K;
+static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
+
+// 3-bit quantization
+// weight is represented as x = a * q
+// 16 blocks of 16 elemenets each
+// Effectively 3.4375 bits per weight
+typedef struct {
+    uint8_t hmask[QK_K/8];     // quants - high bit
+    uint8_t qs[QK_K/4];        // quants - low 2 bits
+    uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
+    ggml_fp16_t d;             // super-block scale
+} block_q3_K;
+static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + 11 * QK_K / 64, "wrong q3_K block size/padding");
+
+// 4-bit quantization
+// 16 blocks of 32 elements each
+// weight is represented as x = a * q + b
+// Effectively 4.5 bits per weight
+typedef struct {
+    ggml_fp16_t d;             // super-block scale for quantized scales
+    ggml_fp16_t dmin;          // super-block scale for quantized mins
+    uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits
+    uint8_t qs[QK_K/2];        // 4--bit quants
+} block_q4_K;
+static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding");
+
+// 5-bit quantization
+// 16 blocks of 32 elements each
+// weight is represented as x = a * q + b
+// Effectively 5.5 bits per weight
+typedef struct {
+    ggml_fp16_t d;               // super-block scale for quantized scales
+    ggml_fp16_t dmin;            // super-block scale for quantized mins
+    uint8_t scales[3*QK_K/64];   // scales and mins, quantized with 6 bits
+    uint8_t qh[QK_K/8];          // quants, high bit
+    uint8_t qs[QK_K/2];          // quants, low 4 bits
+} block_q5_K;
+static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
+
+// 6-bit quantization
+// weight is represented as x = a * q
+// 16 blocks of 16 elemenets each
+// Effectively 6.5625 bits per weight
+typedef struct {
+    uint8_t ql[QK_K/2];      // quants, lower 4 bits
+    uint8_t qh[QK_K/4];      // quants, upper 2 bits
+    int8_t  scales[QK_K/16]; // scales, quantized with 8 bits
+    ggml_fp16_t d;           // super-block scale
+} block_q6_K;
+static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + QK_K / 16 + 3*QK_K/4, "wrong q6_K block size/padding");
+
+// This is only used for intermediate quantization and dot products
+typedef struct {
+    float   d;              // delta
+    int8_t  qs[QK_K];       // quants
+    int16_t bsums[QK_K/16]; // sum of quants in groups of 16
+} block_q8_K;
+static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_t), "wrong q8_K block size/padding");
+
+
+// Quantization
+void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k);
+void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k);
+void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k);
+void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k);
+void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k);
+void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k);
+
+void quantize_row_q2_K(const float * restrict x, void * restrict y, int k);
+void quantize_row_q3_K(const float * restrict x, void * restrict y, int k);
+void quantize_row_q4_K(const float * restrict x, void * restrict y, int k);
+void quantize_row_q5_K(const float * restrict x, void * restrict y, int k);
+void quantize_row_q6_K(const float * restrict x, void * restrict y, int k);
+void quantize_row_q8_K(const float * restrict x, void * restrict y, int k);
+
+// Dequantization
+void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k);
+void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k);
+void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k);
+void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k);
+void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k);
+void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k);
+
+// Dot product
+void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+
+// Quantization with histogram collection
+size_t ggml_quantize_q2_K(const float * src, void * dst, int n, int k, int64_t * hist);
+size_t ggml_quantize_q3_K(const float * src, void * dst, int n, int k, int64_t * hist);
+size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist);
+size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
+size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
+
--- a/llama-util.h
+++ b/llama-util.h
@@ -101,12 +101,12 @@ struct llama_file {
        LLAMA_ASSERT(ret == 0); // same
    }

-    void read_raw(void * ptr, size_t size) {
-        if (size == 0) {
+    void read_raw(void * ptr, size_t len) const {
+        if (len == 0) {
            return;
        }
        errno = 0;
-        std::size_t ret = std::fread(ptr, size, 1, fp);
+        std::size_t ret = std::fread(ptr, len, 1, fp);
        if (ferror(fp)) {
            throw std::runtime_error(format("read error: %s", strerror(errno)));
        }
@@ -127,12 +127,12 @@ struct llama_file {
        return std::string(chars.data(), len);
    }

-    void write_raw(const void * ptr, size_t size) {
-        if (size == 0) {
+    void write_raw(const void * ptr, size_t len) const {
+        if (len == 0) {
            return;
        }
        errno = 0;
-        size_t ret = std::fwrite(ptr, size, 1, fp);
+        size_t ret = std::fwrite(ptr, len, 1, fp);
        if (ret != 1) {
            throw std::runtime_error(format("write error: %s", strerror(errno)));
        }
@@ -172,7 +172,7 @@ struct llama_mmap {
 #ifdef _POSIX_MAPPED_FILES
    static constexpr bool SUPPORTED = true;

-    llama_mmap(struct llama_file * file, bool prefetch = true) {
+    llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */) {
        size = file->size;
        int fd = fileno(file->fp);
        int flags = MAP_SHARED;
@@ -184,9 +184,9 @@ struct llama_mmap {
            throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
        }

-        if (prefetch) {
+        if (prefetch > 0) {
            // Advise the kernel to preload the mapped memory
-            if (madvise(addr, file->size, MADV_WILLNEED)) {
+            if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) {
                fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
                        strerror(errno));
            }
@@ -267,9 +267,9 @@ struct llama_mlock {
        }
    }

-    void init(void * addr) {
-        LLAMA_ASSERT(this->addr == NULL && this->size == 0);
-        this->addr = addr;
+    void init(void * ptr) {
+        LLAMA_ASSERT(addr == NULL && size == 0);
+        addr = ptr;
    }

    void grow_to(size_t target_size) {
@@ -340,14 +340,14 @@ struct llama_mlock {
        return (size_t) si.dwPageSize;
    }

-    bool raw_lock(void * addr, size_t size) {
+    bool raw_lock(void * ptr, size_t len) {
        for (int tries = 1; ; tries++) {
-            if (VirtualLock(addr, size)) {
+            if (VirtualLock(ptr, len)) {
                return true;
            }
            if (tries == 2) {
                fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
-                        size, this->size, llama_format_win_err(GetLastError()).c_str());
+                    len, size, llama_format_win_err(GetLastError()).c_str());
                return false;
            }

@@ -363,7 +363,7 @@ struct llama_mlock {
            // is equal to the number of pages in its minimum working set minus
            // a small overhead."
            // Hopefully a megabyte is enough overhead:
-            size_t increment = size + 1048576;
+            size_t increment = len + 1048576;
            // The minimum must be <= the maximum, so we need to increase both:
            min_ws_size += increment;
            max_ws_size += increment;
@@ -375,8 +375,8 @@ struct llama_mlock {
        }
    }

-    void raw_unlock(void * addr, size_t size) {
-        if (!VirtualUnlock(addr, size)) {
+    void raw_unlock(void * ptr, size_t len) {
+        if (!VirtualUnlock(ptr, len)) {
            fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n",
                    llama_format_win_err(GetLastError()).c_str());
        }
@@ -388,12 +388,12 @@ struct llama_mlock {
        return (size_t) 65536;
    }

-    bool raw_lock(const void * addr, size_t size) {
+    bool raw_lock(const void * addr, size_t len) {
        fprintf(stderr, "warning: mlock not supported on this system\n");
        return false;
    }

-    void raw_unlock(const void * addr, size_t size) {}
+    void raw_unlock(const void * addr, size_t len) {}
 #endif
 };

@@ -404,14 +404,30 @@ struct llama_buffer {

    llama_buffer() = default;

-    void resize(size_t size) {
+    void resize(size_t len) {
+#ifdef GGML_USE_METAL
+        free(addr);
+        int result = posix_memalign((void **) &addr, getpagesize(), len);
+        if (result == 0) {
+            memset(addr, 0, len);
+        }
+        else {
+            addr = NULL;
+        }
+#else
        delete[] addr;
-        addr = new uint8_t[size];
-        this->size = size;
+        addr = new uint8_t[len];
+#endif
+        size = len;
    }

    ~llama_buffer() {
+#ifdef GGML_USE_METAL
+        free(addr);
+#else
        delete[] addr;
+#endif
+        addr = NULL;
    }

    // disable copy and move
--- a/llama.cpp
+++ b/llama.cpp
--- a/llama.h
+++ b/llama.h
@@ -1,6 +1,13 @@
 #ifndef LLAMA_H
 #define LLAMA_H

+#include "ggml.h"
+#ifdef GGML_USE_CUBLAS
+#include "ggml-cuda.h"
+#define LLAMA_MAX_DEVICES GGML_CUDA_MAX_DEVICES
+#else
+#define LLAMA_MAX_DEVICES 1
+#endif // GGML_USE_CUBLAS
 #include <stddef.h>
 #include <stdint.h>
 #include <stdbool.h>
@@ -19,12 +26,23 @@
 #    define LLAMA_API
 #endif

-#define LLAMA_FILE_VERSION           2
-#define LLAMA_FILE_MAGIC             'ggjt'
-#define LLAMA_FILE_MAGIC_UNVERSIONED 'ggml'
-#define LLAMA_SESSION_MAGIC          'ggsn'
+#define LLAMA_FILE_MAGIC_GGJT        0x67676a74u // 'ggjt'
+#define LLAMA_FILE_MAGIC_GGLA        0x67676c61u // 'ggla'
+#define LLAMA_FILE_MAGIC_GGMF        0x67676d66u // 'ggmf'
+#define LLAMA_FILE_MAGIC_GGML        0x67676d6cu // 'ggml'
+#define LLAMA_FILE_MAGIC_GGSN        0x6767736eu // 'ggsn'
+
+#define LLAMA_FILE_VERSION           3
+#define LLAMA_FILE_MAGIC             LLAMA_FILE_MAGIC_GGJT
+#define LLAMA_FILE_MAGIC_UNVERSIONED LLAMA_FILE_MAGIC_GGML
+#define LLAMA_SESSION_MAGIC          LLAMA_FILE_MAGIC_GGSN
 #define LLAMA_SESSION_VERSION        1

+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
+// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
+#define LLAMA_SUPPORTS_GPU_OFFLOAD
+#endif
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -40,9 +58,9 @@ extern "C" {
    typedef int llama_token;

    typedef struct llama_token_data {
-        llama_token id;  // token id
-        float logit; // log-odds of the token
-        float p;     // probability of the token
+        llama_token id; // token id
+        float logit;    // log-odds of the token
+        float p;        // probability of the token
    } llama_token_data;

    typedef struct llama_token_data_array {
@@ -54,9 +72,13 @@ extern "C" {
    typedef void (*llama_progress_callback)(float progress, void *ctx);

    struct llama_context_params {
-        int n_ctx;   // text context
-        int n_parts; // -1 for default
-        int seed;    // RNG seed, -1 for random
+        int n_ctx;                             // text context
+        int n_batch;                           // prompt processing batch size
+        int n_gpu_layers;                      // number of layers to store in VRAM
+        int main_gpu;                          // the GPU that is used for scratch and small tensors
+        float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
+        bool low_vram;                         // if true, reduce VRAM usage at the cost of performance
+        int seed;                              // RNG seed, -1 for random

        bool f16_kv;     // use fp16 for KV cache
        bool logits_all; // the llama_eval() call computes all logits, not just the last one
@@ -73,23 +95,48 @@ extern "C" {

    // model file types
    enum llama_ftype {
-        LLAMA_FTYPE_ALL_F32     = 0,
-        LLAMA_FTYPE_MOSTLY_F16  = 1,  // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_0 = 2,  // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_1 = 3,  // except 1d tensors
+        LLAMA_FTYPE_ALL_F32              = 0,
+        LLAMA_FTYPE_MOSTLY_F16           = 1, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_0          = 2, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_1          = 3, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
-        // LLAMA_FTYPE_MOSTLY_Q4_2 = 5,  // support has been removed
-        // LLAMA_FTYPE_MOSTLY_Q4_3 (6) support has been removed
-        LLAMA_FTYPE_MOSTLY_Q8_0 = 7,  // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q5_0 = 8,  // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q5_1 = 9,  // except 1d tensors
+        // LLAMA_FTYPE_MOSTLY_Q4_2       = 5, // support has been removed
+        // LLAMA_FTYPE_MOSTLY_Q4_3       = 6, // support has been removed
+        LLAMA_FTYPE_MOSTLY_Q8_0          = 7, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_0          = 8, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_1          = 9, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q2_K          = 10,// except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_S        = 11,// except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_M        = 12,// except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_L        = 13,// except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_K_S        = 14,// except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_K_M        = 15,// except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_K_S        = 16,// except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_K_M        = 17,// except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q6_K          = 18,// except 1d tensors
    };

+    // model quantization parameters
+    typedef struct llama_model_quantize_params {
+        int nthread;                 // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
+        enum llama_ftype   ftype;    // quantize to this llama_ftype
+        bool allow_requantize;       // allow quantizing non-f32/f16 tensors
+        bool quantize_output_tensor; // quantize output.weight
+    } llama_model_quantize_params;
+
    LLAMA_API struct llama_context_params llama_context_default_params();
+    LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();

    LLAMA_API bool llama_mmap_supported();
    LLAMA_API bool llama_mlock_supported();

+    // TODO: not great API - very likely to change
+    // Initialize the llama + ggml backend
+    // Call once at the start of the program
+    LLAMA_API void llama_init_backend();
+
+    LLAMA_API int64_t llama_time_us();
+
    // Various functions for loading a ggml llama model.
    // Allocate (almost) all memory needed for the model.
    // Return NULL on failure
@@ -100,14 +147,11 @@ extern "C" {
    // Frees all allocated memory
    LLAMA_API void llama_free(struct llama_context * ctx);

-    // TODO: not great API - very likely to change
    // Returns 0 on success
-    // nthread - how many threads to use. If <=0, will use std::thread::hardware_concurrency(), else the number given
    LLAMA_API int llama_model_quantize(
            const char * fname_inp,
            const char * fname_out,
-      enum llama_ftype   ftype,
-            int          nthread);
+            const llama_model_quantize_params * params);

    // Apply a LoRA adapter to a loaded model
    // path_base_model is the path to a higher quality model to use as a base for
@@ -134,11 +178,11 @@ extern "C" {
    // Copies the state to the specified destination address.
    // Destination needs to have allocated enough memory.
    // Returns the number of bytes copied
-    LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest);
+    LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst);

    // Set the state reading from the specified address
    // Returns the number of bytes read
-    LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src);
+    LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src);

    // Save/load session file
    LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
@@ -155,6 +199,12 @@ extern "C" {
                             int   n_past,
                             int   n_threads);

+    // Export a static computation graph for context of 511 and batch size of 1
+    // NOTE: since this functionality is mostly for debugging and demonstration purposes, we hardcode these
+    //       parameters here to keep things simple
+    // IMPORTANT: do not use for anything else other than debugging and testing!
+    LLAMA_API int llama_eval_export(struct llama_context * ctx, const char * fname);
+
    // Convert the provided text into tokens.
    // The tokens pointer must be large enough to hold the resulting tokens.
    // Returns the number of tokens on success, no more than n_max_tokens
@@ -171,6 +221,14 @@ extern "C" {
    LLAMA_API int llama_n_ctx  (const struct llama_context * ctx);
    LLAMA_API int llama_n_embd (const struct llama_context * ctx);

+    // Get the vocabulary as output parameters.
+    // Returns number of results.
+    LLAMA_API int llama_get_vocab(
+            const struct llama_context * ctx,
+                          const char * * strings,
+                                 float * scores,
+                                   int   capacity);
+
    // Token logits obtained from the last call to llama_eval()
    // The logits for the last token are stored in the last row
    // Can be mutated in order to change the probabilities of the next token
--- a/spm-headers/ggml.h
+++ b/spm-headers/ggml.h
@@ -0,0 +1 @@
+../ggml.h
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -10,3 +10,5 @@ llama_add_test(test-quantize-fns.cpp)
 llama_add_test(test-quantize-perf.cpp)
 llama_add_test(test-sampling.cpp)
 llama_add_test(test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin)
+# llama_add_test(test-grad0.c) # SLOW
+# llama_add_test(test-opt.c) # SLOW
--- a/tests/test-grad0.c
+++ b/tests/test-grad0.c
--- a/tests/test-opt.c
+++ b/tests/test-opt.c
@@ -0,0 +1,205 @@
+#include "ggml.h"
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+
+#define MAX_NARGS 2
+
+
+//
+// logging
+//
+#define GGML_DEBUG 0
+#if (GGML_DEBUG >= 1)
+#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
+#else
+#define GGML_PRINT_DEBUG(...)
+#endif
+
+#if (GGML_DEBUG >= 5)
+#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
+#else
+#define GGML_PRINT_DEBUG_5(...)
+#endif
+
+#if (GGML_DEBUG >= 10)
+#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
+#else
+#define GGML_PRINT_DEBUG_10(...)
+#endif
+
+#define GGML_PRINT(...) printf(__VA_ARGS__)
+
+
+float frand() {
+    return (float)rand()/(float)RAND_MAX;
+}
+
+int irand(int n) {
+    return rand()%n;
+}
+
+void get_random_dims(int64_t * dims, int ndims) {
+    dims[0] = dims[1] = dims[2] = dims[3] = 1;
+
+    for (int i = 0; i < ndims; i++) {
+        dims[i] = 1 + irand(4);
+    }
+}
+
+void get_random_dims_minmax(int64_t * dims, int ndims, int min, int max) {
+    dims[0] = dims[1] = dims[2] = dims[3] = 1;
+
+    for (int i = 0; i < ndims; i++) {
+        dims[i] = min + irand(max-min);
+    }
+}
+
+
+struct ggml_tensor * get_random_tensor(
+        struct ggml_context * ctx0,
+        int ndims,
+        int64_t ne[],
+        float fmin,
+        float fmax) {
+    struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_F32, ndims, ne);
+
+    switch (ndims) {
+        case 1:
+            for (int i0 = 0; i0 < ne[0]; i0++) {
+                ((float *)result->data)[i0] = frand()*(fmax - fmin) + fmin;
+            }
+            break;
+        case 2:
+            for (int i1 = 0; i1 < ne[1]; i1++) {
+                for (int i0 = 0; i0 < ne[0]; i0++) {
+                    ((float *)result->data)[i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
+                }
+            }
+            break;
+        case 3:
+            for (int i2 = 0; i2 < ne[2]; i2++) {
+                for (int i1 = 0; i1 < ne[1]; i1++) {
+                    for (int i0 = 0; i0 < ne[0]; i0++) {
+                        ((float *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
+                    }
+                }
+            }
+            break;
+        case 4:
+            for (int i3 = 0; i3 < ne[3]; i3++) {
+                for (int i2 = 0; i2 < ne[2]; i2++) {
+                    for (int i1 = 0; i1 < ne[1]; i1++) {
+                        for (int i0 = 0; i0 < ne[0]; i0++) {
+                            ((float *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin;
+                        }
+                    }
+                }
+            }
+            break;
+        default:
+            assert(false);
+    };
+
+    return result;
+}
+
+float get_element(const struct ggml_tensor * t, int idx) {
+    return ((float *)t->data)[idx];
+}
+
+void set_element(struct ggml_tensor * t, int idx, float value) {
+    ((float *)t->data)[idx] = value;
+}
+
+int main(int argc, const char ** argv) {
+    struct ggml_init_params params = {
+        .mem_size   = 1024*1024*1024,
+        .mem_buffer = NULL,
+        .no_alloc   = false,
+    };
+    struct ggml_context * ctx = ggml_init(params);
+
+    int64_t ne1[4] = {4, 1024, 1, 1};
+    int64_t ne2[4] = {4, 2048, 1, 1};;
+    int64_t ne3[4] = {1024, 2048, 1, 1};
+
+    struct ggml_tensor * a = get_random_tensor(ctx, 2, ne1, -1, +1);
+    struct ggml_tensor * b = get_random_tensor(ctx, 2, ne2, -1, +1);
+    ggml_set_param(ctx, a);
+    ggml_set_param(ctx, b);
+
+    struct ggml_tensor * c = get_random_tensor(ctx, 2, ne3, -1, +1);
+
+    struct ggml_tensor * ab = ggml_mul_mat(ctx, a, b);
+    struct ggml_tensor * d  = ggml_sub(ctx, c, ab);
+    struct ggml_tensor * e  = ggml_sum(ctx, ggml_sqr(ctx, d));
+
+
+    struct ggml_cgraph ge = ggml_build_forward(e);
+    ggml_graph_reset  (&ge);
+    ggml_graph_compute(ctx, &ge);
+    const float fe = ggml_get_f32_1d(e, 0);
+    printf("%s: e = %.4f\n", __func__, fe);
+
+    struct ggml_opt_params opt_params = ggml_opt_default_params(GGML_OPT_ADAM);
+
+    ggml_opt(ctx, opt_params, e);
+
+    ggml_graph_reset  (&ge);
+    ggml_graph_compute(ctx, &ge);
+    const float fe_opt = ggml_get_f32_1d(e, 0);
+    printf("%s: original  e = %.4f\n", __func__, fe);
+    printf("%s: optimized e = %.4f\n", __func__, fe_opt);
+
+    const bool success = (fe_opt <= fe);
+    assert(success);
+
+    ggml_free(ctx);
+    return success ? 0 : -1;
+}
+// int64_t ne1[4] = {4, 128, 1, 1};
+// int64_t ne2[4] = {4, 256, 1, 1};;
+// int64_t ne3[4] = {128, 256, 1, 1};
+// main: original  e = 25890.9375
+// main: optimized e = 10094.7031
+
+// int64_t ne1[4] = {8, 128, 1, 1};
+// int64_t ne2[4] = {8, 256, 1, 1};;
+// int64_t ne3[4] = {128, 256, 1, 1};
+// main: original  e = 39429.5078
+// main: optimized e = 9275.8936
+
+// int64_t ne1[4] = {16, 128, 1, 1};
+// int64_t ne2[4] = {16, 256, 1, 1};;
+// int64_t ne3[4] = {128, 256, 1, 1};
+// main: original  e = 68371.1328
+// main: optimized e = 7854.4502
+
+
+// int64_t ne1[4] = {32, 128, 1, 1};
+// int64_t ne2[4] = {32, 256, 1, 1};;
+// int64_t ne3[4] = {128, 256, 1, 1};
+// main: original  e = 126061.1953
+// main: optimized e = 5451.0166
+
+// int64_t ne1[4] = {4, 1024, 1, 1};
+// int64_t ne2[4] = {4, 2048, 1, 1};;
+// int64_t ne3[4] = {1024, 2048, 1, 1};
+// main: original  e = 1620817.8750
+// main: optimized e = 698387.6875
+
+// another run on M1
+// int64_t ne1[4] = {4, 1024, 1, 1};
+// int64_t ne2[4] = {4, 2048, 1, 1};;
+// int64_t ne3[4] = {1024, 2048, 1, 1};
+// main: original  e = 1629595.6250
+// main: optimized e = 698169.1250
+
+// int64_t ne1[4] = {32, 1024, 1, 1};
+// int64_t ne2[4] = {32, 2048, 1, 1};;
+// int64_t ne3[4] = {1024, 2048, 1, 1};
+// main: original  e = 8146770.5000
+// main: optimized e = 651119.1250
--- a/tests/test-quantize-fns.cpp
+++ b/tests/test-quantize-fns.cpp
@@ -12,6 +12,8 @@

 const float MAX_QUANTIZATION_REFERENCE_ERROR = 0.0001;
 const float MAX_QUANTIZATION_TOTAL_ERROR = 0.002;
+const float MAX_QUANTIZATION_TOTAL_ERROR_2BITS = 0.0075;
+const float MAX_QUANTIZATION_TOTAL_ERROR_3BITS = 0.0040;
 const float MAX_DOT_PRODUCT_ERROR = 0.02;

 const char* RESULT_STR[] = {"ok", "FAILED"};
@@ -122,7 +124,10 @@ int main(int argc, char * argv[]) {

        if (qfns.quantize_row_q && qfns.dequantize_row_q) {
            const float total_error = total_quantization_error(qfns, test_size, test_data.data());
-            failed = !(total_error < MAX_QUANTIZATION_TOTAL_ERROR);
+            const float max_quantization_error =
+                type == GGML_TYPE_Q2_K ? MAX_QUANTIZATION_TOTAL_ERROR_2BITS :
+                type == GGML_TYPE_Q3_K ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS : MAX_QUANTIZATION_TOTAL_ERROR;
+            failed = !(total_error < max_quantization_error);
            num_failed += failed;
            if (failed || verbose) {
                printf("%5s absolute quantization error:    %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], total_error);
--- a/tests/test-sampling.cpp
+++ b/tests/test-sampling.cpp
@@ -1,6 +1,10 @@
-#include "llama.h"
 #include "ggml.h"
-#include <cassert>
+#include "llama.h"
+
+#ifdef NDEBUG
+#undef NDEBUG
+#endif
+
 #include <cmath>
 #include <numeric>
 #include <cassert>
@@ -8,7 +12,6 @@
 #include <vector>
 #include <algorithm>

-
 void dump(const llama_token_data_array * candidates) {
    for (size_t i = 0; i < candidates->size; i++) {
        printf("%d: %f (%f)\n", candidates->data[i].id, candidates->data[i].p, candidates->data[i].logit);