Mirror of https://github.com/ggerganov/llama.cpp.git (synced 2026-04-23 16:37:33 +03:00)

Compare commits: b1263 ... ci_cublas_ (2 commits: cec628e7fd, 0e1730a90e)
@@ -3,7 +3,6 @@ Checks: >
bugprone-*,
-bugprone-easily-swappable-parameters,
-bugprone-implicit-widening-of-multiplication-result,
-bugprone-misplaced-widening-cast,
-bugprone-narrowing-conversions,
readability-*,
-readability-avoid-unconditional-preprocessor-if,
@@ -16,8 +15,4 @@ Checks: >
-clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,
performance-*,
portability-*,
misc-*,
-misc-const-correctness,
-misc-non-private-member-variables-in-classes,
-misc-no-recursion,
FormatStyle: none
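Aside (not part of the compare): after editing the Checks list above, the effective clang-tidy configuration can be inspected with clang-tidy itself. A minimal sketch, assuming clang-tidy is installed and is invoked from the directory that contains the .clang-tidy file (some versions also expect a source file argument):

clang-tidy --dump-config   # prints the merged configuration, including the expanded Checks string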
@@ -1,22 +0,0 @@
node('x86_runner1'){ // Running on x86 runner containing latest vector qemu, latest vector gcc and all the necessary libraries
stage('Cleanup'){
cleanWs() // Cleaning previous CI build in workspace
}
stage('checkout repo'){
retry(5){ // Retry if the cloning fails due to some reason
checkout scm // Clone the repo on Runner
}
}
stage('Compiling llama.cpp'){
sh'''#!/bin/bash
make RISCV=1 RISCV_CROSS_COMPILE=1 # Compiling llama for RISC-V
'''
}
stage('Running llama.cpp'){
sh'''#!/bin/bash
module load gnu-bin2/0.1 # loading latest versions of vector qemu and vector gcc
qemu-riscv64 -L /softwares/gnu-bin2/sysroot -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./main -m /home/alitariq/codellama-7b.Q4_K_M.gguf -p "Anything" -n 9 > llama_log.txt # Running llama.cpp on vector qemu-riscv64
cat llama_log.txt # Printing results
'''
}
}
@@ -12,7 +12,7 @@ FROM ${BASE_CUDA_DEV_CONTAINER} as build
ARG CUDA_DOCKER_ARCH=all

RUN apt-get update && \
apt-get install -y build-essential python3 python3-pip git
apt-get install -y build-essential python3 python3-pip

COPY requirements.txt requirements.txt

@@ -12,7 +12,7 @@ FROM ${BASE_CUDA_DEV_CONTAINER} as build
ARG CUDA_DOCKER_ARCH=all

RUN apt-get update && \
apt-get install -y build-essential git
apt-get install -y build-essential

WORKDIR /app

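Aside (not part of the compare): the two hunks above touch the build stage of the CUDA container images; the file names are not preserved in this extract, but they appear to correspond to the CUDA Dockerfiles under .devops/ that the docker workflow further down builds. A minimal sketch of building one of them locally, with an illustrative image tag:

# from the repository root; CUDA_DOCKER_ARCH=all matches the default ARG shown above
docker build -f .devops/main-cuda.Dockerfile --build-arg CUDA_DOCKER_ARCH=all -t llama.cpp:light-cuda-local .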
@@ -7,12 +7,15 @@ arg1="$1"
# Shift the arguments to remove the first one
shift

# Join the remaining arguments into a single string
arg2="$@"

if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then
python3 ./convert.py "$@"
python3 ./convert.py "$arg2"
elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
./quantize "$@"
./quantize "$arg2"
elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
./main "$@"
./main "$arg2"
elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
echo "Converting PTH to GGML..."
for i in `ls $1/$2/ggml-model-f16.bin*`; do
@@ -24,7 +27,7 @@ elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
fi
done
elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then
./server "$@"
./server "$arg2"
else
echo "Unknown command: $arg1"
echo "Available commands: "

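Aside (not part of the diff): the hunk above switches the commands between forwarding "$@" and the joined string "$arg2". Quoted "$@" passes every remaining argument through as its own word, while "$arg2" collapses them into a single word and breaks arguments that contain spaces. A minimal bash illustration with made-up arguments:

set -- --outtype q8_0 "my model.bin"   # three example arguments, one containing a space
arg2="$@"                              # joined into one string
printf '[%s]\n' "$arg2"                # prints one line:  [--outtype q8_0 my model.bin]
printf '[%s]\n' "$@"                   # prints three lines; the space in "my model.bin" survives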
@@ -17,6 +17,3 @@ indent_style = tab

[prompts/*.txt]
insert_final_newline = unset

[examples/server/public/*]
indent_size = 2

.github/workflows/build.yml: 329 changes (vendored)
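Aside (not part of the workflow): the "Determine tag name" steps added in this file derive artifact names from git metadata; a sketch of what they compute when run by hand (the branch name containing a slash is illustrative):

BUILD_NUMBER="$(git rev-list --count HEAD)"         # e.g. 1263, matching the b1263 ref above
SHORT_HASH="$(git rev-parse --short=7 HEAD)"        # e.g. cec628e
SAFE_NAME=$(echo "feature/ci-cublas" | tr '/' '-')  # slashes become dashes: feature-ci-cublas
echo "name=b${BUILD_NUMBER}"                        # artifact name used on master
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}"   # artifact name used on other branches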
@@ -18,6 +18,7 @@ on:
|
||||
env:
|
||||
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
|
||||
GGML_NLOOP: 3
|
||||
GGML_NITER: 1
|
||||
GGML_N_THREADS: 1
|
||||
|
||||
jobs:
|
||||
@@ -27,7 +28,7 @@ jobs:
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v1
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
@@ -40,19 +41,13 @@ jobs:
|
||||
run: |
|
||||
CC=gcc-8 make
|
||||
|
||||
- name: Test
|
||||
id: make_test
|
||||
run: |
|
||||
CC=gcc-8 make tests
|
||||
make test
|
||||
|
||||
ubuntu-latest-cmake:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v1
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
@@ -74,6 +69,165 @@ jobs:
|
||||
cd build
|
||||
ctest --verbose --timeout 900
|
||||
|
||||
ubuntu-focal-cmake:
|
||||
runs-on: ubuntu-20.04
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
- build: 'avx2'
|
||||
defines: ''
|
||||
- build: 'avx'
|
||||
defines: '-DLLAMA_AVX2=OFF'
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v1
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install build-essential
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
mkdir build
|
||||
cd build
|
||||
cmake .. ${{ matrix.defines }} -DCMAKE_BUILD_RPATH_USE_ORIGIN=ON -DBUILD_SHARED_LIBS=ON
|
||||
cmake --build . --config Release
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
run: |
|
||||
cd build
|
||||
ctest --verbose
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
shell: bash
|
||||
run: |
|
||||
BUILD_NUMBER="$(git rev-list --count HEAD)"
|
||||
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
|
||||
if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
|
||||
echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
|
||||
else
|
||||
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
|
||||
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
run: |
|
||||
7z a llama-${{ steps.tag.outputs.name }}-bin-ubuntu20.04-${{ matrix.build }}-x64.zip ./build/bin/*
|
||||
|
||||
- name: Upload artifacts
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
uses: actions/upload-artifact@v3
|
||||
with:
|
||||
path: |
|
||||
llama-${{ steps.tag.outputs.name }}-bin-ubuntu20.04-${{ matrix.build }}-x64.zip
|
||||
|
||||
ubuntu-focal-cmake-cublas:
|
||||
runs-on: ubuntu-20.04
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
cuda: ['12.1.0', '11.7.1']
|
||||
build: ['cublas']
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v1
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install build-essential
|
||||
|
||||
- uses: Jimver/cuda-toolkit@v0.2.11
|
||||
id: cuda-toolkit
|
||||
with:
|
||||
cuda: ${{ matrix.cuda }}
|
||||
method: 'network'
|
||||
linux-local-args: '["--toolkit"]'
|
||||
#TODO(green-sky): the action prefixes "cublas-dev" with "cuda-" instead of "lib"
|
||||
#sub-packages: '["nvcc", "cudart", "cublas-dev"]'
|
||||
sub-packages: '["nvcc", "cudart"]'
|
||||
non-cuda-sub-packages: '["libcublas", "libcublas-dev"]'
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
mkdir build
|
||||
cd build
|
||||
cmake .. -DLLAMA_CUBLAS=ON -DCMAKE_BUILD_RPATH_USE_ORIGIN=ON -DBUILD_SHARED_LIBS=ON
|
||||
cmake --build . --config Release
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
shell: bash
|
||||
run: |
|
||||
BUILD_NUMBER="$(git rev-list --count HEAD)"
|
||||
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
|
||||
if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
|
||||
echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
|
||||
else
|
||||
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
|
||||
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
run: |
|
||||
7z a llama-${{ steps.tag.outputs.name }}-bin-ubuntu20.04-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip ./build/bin/*
|
||||
|
||||
- name: Upload artifacts
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
uses: actions/upload-artifact@v3
|
||||
with:
|
||||
path: |
|
||||
llama-${{ steps.tag.outputs.name }}-bin-ubuntu20.04-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip
|
||||
|
||||
- name: Copy and pack Cuda runtime
|
||||
if: ${{ matrix.cuda == '12.1.0' }}
|
||||
# TODO(green-sky): paths are cuda 12 specific
|
||||
run: |
|
||||
echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}"
|
||||
mkdir './build/bin/cudart/'
|
||||
ls "${{steps.cuda-toolkit.outputs.CUDA_PATH}}"
|
||||
ls "${{steps.cuda-toolkit.outputs.CUDA_PATH}}/bin"
|
||||
ls "${{steps.cuda-toolkit.outputs.CUDA_PATH}}/lib64"
|
||||
cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}/lib64/libcudart.so.12" './build/bin/cudart/'
|
||||
cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}/lib64/libcublas.so.12" './build/bin/cudart/'
|
||||
cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}/lib64/libcublasLt.so.12" './build/bin/cudart/'
|
||||
7z a cudart-llama-bin-ubuntu20.04-cu${{ matrix.cuda }}-x64.zip ./build/bin/cudart/*
|
||||
|
||||
- name: Copy and pack Cuda runtime
|
||||
if: ${{ matrix.cuda == '11.7.1' }}
|
||||
# TODO(green-sky): paths are cuda 11 specific
|
||||
run: |
|
||||
echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}"
|
||||
mkdir './build/bin/cudart/'
|
||||
ls "${{steps.cuda-toolkit.outputs.CUDA_PATH}}/lib64"
|
||||
cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}/lib64/libcudart.so.11.0" './build/bin/cudart/'
|
||||
cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}/lib64/libcublas.so.11" './build/bin/cudart/'
|
||||
cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}/lib64/libcublasLt.so.11" './build/bin/cudart/'
|
||||
7z a cudart-llama-bin-ubuntu20.04-cu${{ matrix.cuda }}-x64.zip ./build/bin/cudart/*
|
||||
|
||||
- name: Upload Cuda runtime
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
uses: actions/upload-artifact@v3
|
||||
with:
|
||||
path: |
|
||||
cudart-llama-bin-ubuntu20.04-cu${{ matrix.cuda }}-x64.zip
|
||||
|
||||
ubuntu-latest-cmake-sanitizer:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
@@ -87,7 +241,7 @@ jobs:
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v1
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
@@ -121,7 +275,7 @@ jobs:
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v1
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
@@ -149,7 +303,7 @@ jobs:
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v1
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
@@ -162,19 +316,13 @@ jobs:
|
||||
run: |
|
||||
make
|
||||
|
||||
- name: Test
|
||||
id: make_test
|
||||
run: |
|
||||
make tests
|
||||
make test
|
||||
|
||||
macOS-latest-cmake:
|
||||
runs-on: macos-latest
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v1
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
@@ -197,62 +345,6 @@ jobs:
|
||||
cd build
|
||||
ctest --verbose --timeout 900
|
||||
|
||||
macOS-latest-cmake-ios:
|
||||
runs-on: macos-latest
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v1
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
continue-on-error: true
|
||||
run: |
|
||||
brew update
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
sysctl -a
|
||||
mkdir build
|
||||
cd build
|
||||
cmake -G Xcode .. \
|
||||
-DLLAMA_BUILD_EXAMPLES=OFF \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
-DLLAMA_BUILD_SERVER=OFF \
|
||||
-DCMAKE_SYSTEM_NAME=iOS \
|
||||
-DCMAKE_OSX_DEPLOYMENT_TARGET=14.0
|
||||
cmake --build . --config Release
|
||||
|
||||
macOS-latest-cmake-tvos:
|
||||
runs-on: macos-latest
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v1
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
continue-on-error: true
|
||||
run: |
|
||||
brew update
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
sysctl -a
|
||||
mkdir build
|
||||
cd build
|
||||
cmake -G Xcode .. \
|
||||
-DLLAMA_BUILD_EXAMPLES=OFF \
|
||||
-DLLAMA_BUILD_TESTS=OFF \
|
||||
-DLLAMA_BUILD_SERVER=OFF \
|
||||
-DCMAKE_SYSTEM_NAME=tvOS \
|
||||
-DCMAKE_OSX_DEPLOYMENT_TARGET=14.0
|
||||
cmake --build . --config Release
|
||||
|
||||
windows-latest-cmake:
|
||||
runs-on: windows-latest
|
||||
|
||||
@@ -265,24 +357,22 @@ jobs:
|
||||
matrix:
|
||||
include:
|
||||
- build: 'noavx'
|
||||
defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF -DBUILD_SHARED_LIBS=ON'
|
||||
defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF'
|
||||
- build: 'avx2'
|
||||
defines: '-DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
|
||||
defines: '-DLLAMA_BUILD_SERVER=ON'
|
||||
- build: 'avx'
|
||||
defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
|
||||
- build: 'avx512'
|
||||
defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
|
||||
defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF'
|
||||
#- build: 'avx512'
|
||||
#defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
|
||||
- build: 'clblast'
|
||||
defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
|
||||
defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
|
||||
- build: 'openblas'
|
||||
defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
|
||||
defines: '-DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
fetch-depth: 0
|
||||
uses: actions/checkout@v1
|
||||
|
||||
- name: Download OpenCL SDK
|
||||
id: get_opencl
|
||||
@@ -392,29 +482,27 @@ jobs:
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
cuda: ['12.2.0', '11.7.1']
|
||||
cuda: ['12.1.0', '11.7.1']
|
||||
build: ['cublas']
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
fetch-depth: 0
|
||||
uses: actions/checkout@v1
|
||||
|
||||
- uses: Jimver/cuda-toolkit@v0.2.11
|
||||
- uses: Jimver/cuda-toolkit@v0.2.10
|
||||
id: cuda-toolkit
|
||||
with:
|
||||
cuda: ${{ matrix.cuda }}
|
||||
method: 'network'
|
||||
sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]'
|
||||
# TODO(green-sky): _dev seems to fail, and non dev are not enought
|
||||
#sub-packages: '["nvcc", "cudart", "cublas", "cudart_dev", "cublas_dev"]'
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
mkdir build
|
||||
cd build
|
||||
cmake .. -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON -DBUILD_SHARED_LIBS=ON
|
||||
cmake .. -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON
|
||||
cmake --build . --config Release
|
||||
|
||||
- name: Determine tag name
|
||||
@@ -444,11 +532,27 @@ jobs:
|
||||
llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip
|
||||
|
||||
- name: Copy and pack Cuda runtime
|
||||
if: ${{ matrix.cuda == '12.1.0' }}
|
||||
# TODO(green-sky): paths are cuda 12 specific
|
||||
run: |
|
||||
echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}"
|
||||
$dst='.\build\bin\cudart\'
|
||||
robocopy "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
|
||||
7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip $dst\*
|
||||
mkdir '.\build\bin\cudart\'
|
||||
cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cudart64_12.dll" '.\build\bin\cudart\'
|
||||
cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cublas64_12.dll" '.\build\bin\cudart\'
|
||||
cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cublasLt64_12.dll" '.\build\bin\cudart\'
|
||||
7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip .\build\bin\cudart\*
|
||||
|
||||
- name: Copy and pack Cuda runtime
|
||||
if: ${{ matrix.cuda == '11.7.1' }}
|
||||
# TODO(green-sky): paths are cuda 11 specific
|
||||
run: |
|
||||
echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}"
|
||||
mkdir '.\build\bin\cudart\'
|
||||
ls "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin"
|
||||
cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cudart64_110.dll" '.\build\bin\cudart\'
|
||||
cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cublas64_11.dll" '.\build\bin\cudart\'
|
||||
cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cublasLt64_11.dll" '.\build\bin\cudart\'
|
||||
7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip .\build\bin\cudart\*
|
||||
|
||||
- name: Upload Cuda runtime
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
@@ -457,23 +561,6 @@ jobs:
|
||||
path: |
|
||||
cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip
|
||||
|
||||
freeBSD-latest:
|
||||
runs-on: macos-12
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Build
|
||||
uses: cross-platform-actions/action@v0.19.0
|
||||
with:
|
||||
operating_system: freebsd
|
||||
version: '13.2'
|
||||
hypervisor: 'qemu'
|
||||
run: |
|
||||
sudo pkg update
|
||||
sudo pkg install -y gmake automake autoconf pkgconf llvm15 clinfo clover opencl clblast openblas
|
||||
gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15
|
||||
|
||||
release:
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
|
||||
@@ -482,6 +569,8 @@ jobs:
|
||||
needs:
|
||||
- ubuntu-focal-make
|
||||
- ubuntu-latest-cmake
|
||||
- ubuntu-focal-cmake
|
||||
- ubuntu-focal-cmake-cublas
|
||||
- macOS-latest-make
|
||||
- macOS-latest-cmake
|
||||
- windows-latest-cmake
|
||||
@@ -490,9 +579,7 @@ jobs:
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
fetch-depth: 0
|
||||
uses: actions/checkout@v1
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
@@ -550,7 +637,7 @@ jobs:
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# uses: actions/checkout@v3
|
||||
# uses: actions/checkout@v1
|
||||
#
|
||||
# - name: Dependencies
|
||||
# run: |
|
||||
@@ -574,7 +661,7 @@ jobs:
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# uses: actions/checkout@v3
|
||||
# uses: actions/checkout@v1
|
||||
#
|
||||
# - name: Dependencies
|
||||
# run: |
|
||||
@@ -598,7 +685,7 @@ jobs:
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# uses: actions/checkout@v3
|
||||
# uses: actions/checkout@v1
|
||||
#
|
||||
# - name: Dependencies
|
||||
# run: |
|
||||
@@ -628,7 +715,7 @@ jobs:
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# uses: actions/checkout@v3
|
||||
# uses: actions/checkout@v1
|
||||
#
|
||||
# - name: Add msbuild to PATH
|
||||
# uses: microsoft/setup-msbuild@v1
|
||||
@@ -667,7 +754,7 @@ jobs:
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# uses: actions/checkout@v3
|
||||
# uses: actions/checkout@v1
|
||||
#
|
||||
# - name: Add msbuild to PATH
|
||||
# uses: microsoft/setup-msbuild@v1
|
||||
@@ -713,7 +800,7 @@ jobs:
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# uses: actions/checkout@v3
|
||||
# uses: actions/checkout@v1
|
||||
#
|
||||
# - name: Dependencies
|
||||
# run: |
|
||||
|
||||
.github/workflows/code-coverage.yml: 36 changes (vendored)
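Aside (not part of the workflow): the Code Coverage job below drives make targets that also appear in the Makefile at the end of this compare; a sketch of reproducing the run locally, assuming gcc-8 and lcov are installed:

CC=gcc-8 make -j LLAMA_CODE_COVERAGE=1 tests   # build the test binaries with coverage instrumentation
CC=gcc-8 make test                             # run them, producing gcov data
make coverage                                  # run gcov over tests/*.cpp
make lcov-report                               # write lcov-report/coverage.info and an HTML report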
@@ -1,36 +0,0 @@
|
||||
name: Code Coverage
|
||||
on: [push, pull_request]
|
||||
|
||||
env:
|
||||
GGML_NLOOP: 3
|
||||
GGML_N_THREADS: 1
|
||||
|
||||
jobs:
|
||||
run:
|
||||
runs-on: ubuntu-20.04
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Dependencies
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install build-essential gcc-8 lcov
|
||||
|
||||
- name: Build
|
||||
run: CC=gcc-8 make -j LLAMA_CODE_COVERAGE=1 tests
|
||||
|
||||
- name: Run tests
|
||||
run: CC=gcc-8 make test
|
||||
|
||||
- name: Generate coverage report
|
||||
run: |
|
||||
make coverage
|
||||
make lcov-report
|
||||
|
||||
- name: Upload coverage to Codecov
|
||||
uses: codecov/codecov-action@v3
|
||||
env:
|
||||
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
|
||||
with:
|
||||
files: lcov-report/coverage.info
|
||||
.github/workflows/docker.yml: 15 changes (vendored)
@@ -26,15 +26,8 @@ jobs:
|
||||
strategy:
|
||||
matrix:
|
||||
config:
|
||||
- { tag: "light", dockerfile: ".devops/main.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
||||
- { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
||||
# NOTE(canardletter): The CUDA builds on arm64 are very slow, so I
|
||||
# have disabled them for now until the reason why
|
||||
# is understood.
|
||||
- { tag: "light-cuda", dockerfile: ".devops/main-cuda.Dockerfile", platforms: "linux/amd64" }
|
||||
- { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
|
||||
- { tag: "light-rocm", dockerfile: ".devops/main-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
||||
- { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
||||
- { tag: "light", dockerfile: ".devops/main.Dockerfile" }
|
||||
- { tag: "full", dockerfile: ".devops/full.Dockerfile" }
|
||||
steps:
|
||||
- name: Check out the repo
|
||||
uses: actions/checkout@v3
|
||||
@@ -58,7 +51,7 @@ jobs:
|
||||
with:
|
||||
context: .
|
||||
push: true
|
||||
platforms: ${{ matrix.config.platforms }}
|
||||
platforms: linux/amd64,linux/arm64
|
||||
tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
|
||||
file: ${{ matrix.config.dockerfile }}
|
||||
|
||||
@@ -67,6 +60,6 @@ jobs:
|
||||
with:
|
||||
context: .
|
||||
push: ${{ github.event_name == 'push' }}
|
||||
platforms: ${{ matrix.config.platforms }}
|
||||
platforms: linux/amd64,linux/arm64
|
||||
tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}"
|
||||
file: ${{ matrix.config.dockerfile }}
|
||||
|
||||
.github/workflows/gguf-publish.yml: 43 changes (vendored)
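Aside (not part of the workflow): given the 'gguf-v*' tag trigger in the file below, a gguf-py release is started by pushing a matching tag; the version number here is purely illustrative:

git tag gguf-v0.0.0            # hypothetical version matching the 'gguf-v*' pattern
git push origin gguf-v0.0.0    # pushing the tag runs the Upload Python Package workflow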
@@ -1,43 +0,0 @@
|
||||
# This workflow will upload a Python Package using Twine when a GGUF release is created
|
||||
# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
|
||||
|
||||
# See `gguf-py/README.md` for how to make a release.
|
||||
|
||||
# This workflow uses actions that are not certified by GitHub.
|
||||
# They are provided by a third-party and are governed by
|
||||
# separate terms of service, privacy policy, and support
|
||||
# documentation.
|
||||
|
||||
name: Upload Python Package
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
push:
|
||||
# Pattern matched against refs/tags
|
||||
tags:
|
||||
- 'gguf-v*' # Push events to every version tag
|
||||
|
||||
|
||||
jobs:
|
||||
deploy:
|
||||
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v2
|
||||
with:
|
||||
python-version: '3.9.x'
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
cd gguf-py
|
||||
python -m pip install poetry
|
||||
poetry install
|
||||
|
||||
- name: Build package
|
||||
run: poetry build
|
||||
- name: Publish package
|
||||
uses: pypa/gh-action-pypi-publish@release/v1
|
||||
with:
|
||||
password: ${{ secrets.PYPI_API_TOKEN }}
|
||||
.gitignore: 44 changes (vendored)
@@ -5,11 +5,6 @@
|
||||
*.bin
|
||||
*.exe
|
||||
*.dll
|
||||
*.log
|
||||
*.gcov
|
||||
*.gcno
|
||||
*.gcda
|
||||
*.dot
|
||||
.DS_Store
|
||||
.build/
|
||||
.cache/
|
||||
@@ -21,9 +16,6 @@
|
||||
.vs/
|
||||
.vscode/
|
||||
|
||||
lcov-report/
|
||||
gcovr-report/
|
||||
|
||||
build*/
|
||||
out/
|
||||
tmp/
|
||||
@@ -31,29 +23,24 @@ tmp/
|
||||
models/*
|
||||
models-mnt
|
||||
|
||||
/Pipfile
|
||||
/baby-llama
|
||||
/beam-search
|
||||
/benchmark-matmult
|
||||
/convert-llama2c-to-ggml
|
||||
/embd-input-test
|
||||
/main
|
||||
/quantize
|
||||
/quantize-stats
|
||||
/result
|
||||
/perplexity
|
||||
/embedding
|
||||
/train-text-from-scratch
|
||||
/convert-llama2c-to-ggml
|
||||
/simple
|
||||
/benchmark-matmult
|
||||
/vdot
|
||||
/server
|
||||
/Pipfile
|
||||
/embd-input-test
|
||||
/gguf
|
||||
/gguf-llama-simple
|
||||
/libllama.so
|
||||
/llama-bench
|
||||
/main
|
||||
/metal
|
||||
/perplexity
|
||||
/quantize
|
||||
/quantize-stats
|
||||
/result
|
||||
/save-load-state
|
||||
/server
|
||||
/simple
|
||||
/speculative
|
||||
/train-text-from-scratch
|
||||
/vdot
|
||||
build-info.h
|
||||
arm_neon.h
|
||||
compile_commands.json
|
||||
@@ -76,13 +63,10 @@ poetry.toml
|
||||
|
||||
# Test binaries
|
||||
tests/test-grammar-parser
|
||||
tests/test-llama-grammar
|
||||
tests/test-double-float
|
||||
tests/test-grad0
|
||||
tests/test-opt
|
||||
tests/test-quantize-fns
|
||||
tests/test-quantize-perf
|
||||
tests/test-sampling
|
||||
tests/test-tokenizer-0-llama
|
||||
tests/test-tokenizer-0-falcon
|
||||
tests/test-tokenizer-1
|
||||
tests/test-tokenizer-0
|
||||
|
||||
CMakeLists.txt: 221 changes
@@ -8,6 +8,7 @@ if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
|
||||
set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
|
||||
endif()
|
||||
|
||||
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
|
||||
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
|
||||
|
||||
if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
|
||||
@@ -36,12 +37,6 @@ endif()
|
||||
# Option list
|
||||
#
|
||||
|
||||
if (APPLE)
|
||||
set(LLAMA_METAL_DEFAULT ON)
|
||||
else()
|
||||
set(LLAMA_METAL_DEFAULT OFF)
|
||||
endif()
|
||||
|
||||
# general
|
||||
option(LLAMA_STATIC "llama: static link libraries" OFF)
|
||||
option(LLAMA_NATIVE "llama: enable -march=native flag" OFF)
|
||||
@@ -80,12 +75,9 @@ set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kern
|
||||
set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
|
||||
option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some calculations" OFF)
|
||||
set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
|
||||
set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
|
||||
"llama: max. batch size for using peer access")
|
||||
option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
|
||||
option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
|
||||
option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT})
|
||||
option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging" OFF)
|
||||
option(LLAMA_METAL "llama: use Metal" OFF)
|
||||
option(LLAMA_MPI "llama: use MPI" OFF)
|
||||
option(LLAMA_K_QUANTS "llama: use k-quants" ON)
|
||||
option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
|
||||
@@ -137,7 +129,6 @@ set(CMAKE_C_STANDARD 11)
|
||||
set(CMAKE_C_STANDARD_REQUIRED true)
|
||||
set(THREADS_PREFER_PTHREAD_FLAG ON)
|
||||
find_package(Threads REQUIRED)
|
||||
include(CheckCXXCompilerFlag)
|
||||
|
||||
if (NOT MSVC)
|
||||
if (LLAMA_SANITIZE_THREAD)
|
||||
@@ -168,32 +159,6 @@ if (APPLE AND LLAMA_ACCELERATE)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (LLAMA_METAL)
|
||||
find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
|
||||
find_library(METAL_FRAMEWORK Metal REQUIRED)
|
||||
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
|
||||
|
||||
message(STATUS "Metal framework found")
|
||||
set(GGML_HEADERS_METAL ggml-metal.h)
|
||||
set(GGML_SOURCES_METAL ggml-metal.m)
|
||||
|
||||
add_compile_definitions(GGML_USE_METAL)
|
||||
if (LLAMA_METAL_NDEBUG)
|
||||
add_compile_definitions(GGML_METAL_NDEBUG)
|
||||
endif()
|
||||
|
||||
# get full path to the file
|
||||
#add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/")
|
||||
|
||||
# copy ggml-metal.metal to bin directory
|
||||
configure_file(ggml-metal.metal bin/ggml-metal.metal COPYONLY)
|
||||
|
||||
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS}
|
||||
${FOUNDATION_LIBRARY}
|
||||
${METAL_FRAMEWORK}
|
||||
${METALKIT_FRAMEWORK}
|
||||
)
|
||||
endif()
|
||||
if (LLAMA_BLAS)
|
||||
if (LLAMA_STATIC)
|
||||
set(BLA_STATIC ON)
|
||||
@@ -270,8 +235,7 @@ if (LLAMA_BLAS)
|
||||
endif()
|
||||
|
||||
if (LLAMA_K_QUANTS)
|
||||
set(GGML_HEADERS_EXTRA k_quants.h)
|
||||
set(GGML_SOURCES_EXTRA k_quants.c)
|
||||
set(GGML_SOURCES_EXTRA ${GGML_SOURCES_EXTRA} k_quants.c k_quants.h)
|
||||
add_compile_definitions(GGML_USE_K_QUANTS)
|
||||
if (LLAMA_QKK_64)
|
||||
add_compile_definitions(GGML_QKK_64)
|
||||
@@ -287,8 +251,7 @@ if (LLAMA_CUBLAS)
|
||||
|
||||
enable_language(CUDA)
|
||||
|
||||
set(GGML_HEADERS_CUDA ggml-cuda.h)
|
||||
set(GGML_SOURCES_CUDA ggml-cuda.cu)
|
||||
set(GGML_SOURCES_CUDA ggml-cuda.cu ggml-cuda.h)
|
||||
|
||||
add_compile_definitions(GGML_USE_CUBLAS)
|
||||
# if (LLAMA_CUDA_CUBLAS)
|
||||
@@ -306,7 +269,6 @@ if (LLAMA_CUBLAS)
|
||||
add_compile_definitions(GGML_CUDA_F16)
|
||||
endif()
|
||||
add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
|
||||
add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${LLAMA_CUDA_PEER_MAX_BATCH_SIZE})
|
||||
|
||||
if (LLAMA_STATIC)
|
||||
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
|
||||
@@ -332,12 +294,34 @@ if (LLAMA_CUBLAS)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (LLAMA_METAL)
|
||||
find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
|
||||
find_library(METAL_FRAMEWORK Metal REQUIRED)
|
||||
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
|
||||
|
||||
set(GGML_SOURCES_METAL ggml-metal.m ggml-metal.h)
|
||||
|
||||
add_compile_definitions(GGML_USE_METAL)
|
||||
add_compile_definitions(GGML_METAL_NDEBUG)
|
||||
|
||||
# get full path to the file
|
||||
#add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/")
|
||||
|
||||
# copy ggml-metal.metal to bin directory
|
||||
configure_file(ggml-metal.metal bin/ggml-metal.metal COPYONLY)
|
||||
|
||||
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS}
|
||||
${FOUNDATION_LIBRARY}
|
||||
${METAL_FRAMEWORK}
|
||||
${METALKIT_FRAMEWORK}
|
||||
)
|
||||
endif()
|
||||
|
||||
if (LLAMA_MPI)
|
||||
cmake_minimum_required(VERSION 3.10)
|
||||
find_package(MPI)
|
||||
if (MPI_C_FOUND)
|
||||
message(STATUS "MPI found")
|
||||
set(GGML_HEADERS_MPI ggml-mpi.h)
|
||||
set(GGML_SOURCES_MPI ggml-mpi.c ggml-mpi.h)
|
||||
add_compile_definitions(GGML_USE_MPI)
|
||||
add_compile_definitions(${MPI_C_COMPILE_DEFINITIONS})
|
||||
@@ -360,8 +344,7 @@ if (LLAMA_CLBLAST)
|
||||
if (CLBlast_FOUND)
|
||||
message(STATUS "CLBlast found")
|
||||
|
||||
set(GGML_HEADERS_OPENCL ggml-opencl.h)
|
||||
set(GGML_SOURCES_OPENCL ggml-opencl.cpp)
|
||||
set(GGML_SOURCES_OPENCL ggml-opencl.cpp ggml-opencl.h)
|
||||
|
||||
add_compile_definitions(GGML_USE_CLBLAST)
|
||||
|
||||
@@ -389,15 +372,13 @@ if (LLAMA_HIPBLAS)
|
||||
message(STATUS "HIP and hipBLAS found")
|
||||
add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS)
|
||||
add_library(ggml-rocm OBJECT ggml-cuda.cu ggml-cuda.h)
|
||||
if (BUILD_SHARED_LIBS)
|
||||
set_target_properties(ggml-rocm PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
endif()
|
||||
if (LLAMA_CUDA_FORCE_DMMV)
|
||||
target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_DMMV)
|
||||
endif()
|
||||
target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
|
||||
target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
|
||||
target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
|
||||
target_compile_definitions(ggml-rocm PRIVATE CC_TURING=1000000000)
|
||||
set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX)
|
||||
target_link_libraries(ggml-rocm PRIVATE hip::device PUBLIC hip::host roc::rocblas roc::hipblas)
|
||||
|
||||
@@ -422,22 +403,15 @@ if (LLAMA_ALL_WARNINGS)
|
||||
-Wstrict-prototypes
|
||||
-Wpointer-arith
|
||||
-Wmissing-prototypes
|
||||
-Werror=implicit-int
|
||||
-Wno-unused-function
|
||||
)
|
||||
set(cxx_flags
|
||||
-Wall
|
||||
-Wextra
|
||||
-Wpedantic
|
||||
-Wcast-qual
|
||||
-Wmissing-declarations
|
||||
-Wno-unused-function
|
||||
-Wno-multichar
|
||||
)
|
||||
if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
|
||||
# g++ only
|
||||
set(cxx_flags ${cxx_flags} -Wno-format-truncation -Wno-array-bounds)
|
||||
endif()
|
||||
else()
|
||||
# todo : msvc
|
||||
endif()
|
||||
@@ -449,7 +423,7 @@ if (LLAMA_ALL_WARNINGS)
|
||||
|
||||
endif()
|
||||
|
||||
if (WIN32)
|
||||
if (MSVC)
|
||||
add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
|
||||
|
||||
if (BUILD_SHARED_LIBS)
|
||||
@@ -471,13 +445,6 @@ endif()
|
||||
# TODO: probably these flags need to be tweaked on some architectures
|
||||
# feel free to update the Makefile for your architecture and send a pull request or issue
|
||||
message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}")
|
||||
if (MSVC)
|
||||
string(TOLOWER "${CMAKE_GENERATOR_PLATFORM}" CMAKE_GENERATOR_PLATFORM_LWR)
|
||||
message(STATUS "CMAKE_GENERATOR_PLATFORM: ${CMAKE_GENERATOR_PLATFORM}")
|
||||
else ()
|
||||
set(CMAKE_GENERATOR_PLATFORM_LWR "")
|
||||
endif ()
|
||||
|
||||
if (NOT MSVC)
|
||||
if (LLAMA_STATIC)
|
||||
add_link_options(-static)
|
||||
@@ -493,33 +460,25 @@ if (NOT MSVC)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") OR ("${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "arm64"))
|
||||
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64")
|
||||
message(STATUS "ARM detected")
|
||||
if (MSVC)
|
||||
add_compile_definitions(__ARM_NEON)
|
||||
add_compile_definitions(__ARM_FEATURE_FMA)
|
||||
add_compile_definitions(__ARM_FEATURE_DOTPROD)
|
||||
# add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) # MSVC doesn't support vdupq_n_f16, vld1q_f16, vst1q_f16
|
||||
add_compile_definitions(__aarch64__) # MSVC defines _M_ARM64 instead
|
||||
# TODO: arm msvc?
|
||||
else()
|
||||
check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E)
|
||||
if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
|
||||
add_compile_options(-mfp16-format=ieee)
|
||||
endif()
|
||||
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6")
|
||||
# Raspberry Pi 1, Zero
|
||||
add_compile_options(-mfpu=neon-fp-armv8 -mno-unaligned-access)
|
||||
add_compile_options(-mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access)
|
||||
endif()
|
||||
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7")
|
||||
# Raspberry Pi 2
|
||||
add_compile_options(-mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
|
||||
add_compile_options(-mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations)
|
||||
endif()
|
||||
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8")
|
||||
# Raspberry Pi 3, 4, Zero 2 (32-bit)
|
||||
add_compile_options(-mno-unaligned-access)
|
||||
add_compile_options(-mfp16-format=ieee -mno-unaligned-access)
|
||||
endif()
|
||||
endif()
|
||||
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "^(x86_64|i686|amd64|x64)$" )
|
||||
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$")
|
||||
message(STATUS "x86 detected")
|
||||
if (MSVC)
|
||||
if (LLAMA_AVX512)
|
||||
@@ -576,84 +535,27 @@ else()
|
||||
message(STATUS "Unknown architecture")
|
||||
endif()
|
||||
|
||||
#
|
||||
# POSIX conformance
|
||||
#
|
||||
|
||||
# clock_gettime came in POSIX.1b (1993)
|
||||
# CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional
|
||||
# posix_memalign came in POSIX.1-2001 / SUSv3
|
||||
# M_PI is an XSI extension since POSIX.1-2001 / SUSv3, came in XPG1 (1985)
|
||||
add_compile_definitions(_XOPEN_SOURCE=600)
|
||||
|
||||
# Somehow in OpenBSD whenever POSIX conformance is specified
|
||||
# some string functions rely on locale_t availability,
|
||||
# which was introduced in POSIX.1-2008, forcing us to go higher
|
||||
if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD")
|
||||
remove_definitions(-D_XOPEN_SOURCE=600)
|
||||
add_compile_definitions(_XOPEN_SOURCE=700)
|
||||
endif()
|
||||
|
||||
# Data types, macros and functions related to controlling CPU affinity and
|
||||
# some memory allocation are available on Linux through GNU extensions in libc
|
||||
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
|
||||
add_compile_definitions(_GNU_SOURCE)
|
||||
endif()
|
||||
|
||||
# RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1,
|
||||
# and on macOS its availability depends on enabling Darwin extensions
|
||||
# similarly on DragonFly, enabling BSD extensions is necessary
|
||||
if (
|
||||
CMAKE_SYSTEM_NAME MATCHES "Darwin" OR
|
||||
CMAKE_SYSTEM_NAME MATCHES "iOS" OR
|
||||
CMAKE_SYSTEM_NAME MATCHES "tvOS" OR
|
||||
CMAKE_SYSTEM_NAME MATCHES "DragonFly"
|
||||
)
|
||||
add_compile_definitions(_DARWIN_C_SOURCE)
|
||||
endif()
|
||||
|
||||
# alloca is a non-standard interface that is not visible on BSDs when
|
||||
# POSIX conformance is specified, but not all of them provide a clean way
|
||||
# to enable it in such cases
|
||||
if (CMAKE_SYSTEM_NAME MATCHES "FreeBSD")
|
||||
add_compile_definitions(__BSD_VISIBLE)
|
||||
endif()
|
||||
if (CMAKE_SYSTEM_NAME MATCHES "NetBSD")
|
||||
add_compile_definitions(_NETBSD_SOURCE)
|
||||
endif()
|
||||
if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD")
|
||||
add_compile_definitions(_BSD_SOURCE)
|
||||
endif()
|
||||
|
||||
#
|
||||
# libraries
|
||||
#
|
||||
|
||||
# ggml
|
||||
|
||||
if (GGML_USE_CPU_HBM)
|
||||
add_definitions(-DGGML_USE_CPU_HBM)
|
||||
find_library(memkind memkind REQUIRED)
|
||||
endif()
|
||||
|
||||
add_library(ggml OBJECT
|
||||
ggml.c
|
||||
ggml.h
|
||||
ggml-alloc.c
|
||||
ggml-alloc.h
|
||||
${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
|
||||
${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
|
||||
${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
|
||||
${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI}
|
||||
${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA}
|
||||
${GGML_SOURCES_CUDA}
|
||||
${GGML_SOURCES_OPENCL}
|
||||
${GGML_SOURCES_METAL}
|
||||
${GGML_SOURCES_MPI}
|
||||
${GGML_SOURCES_EXTRA}
|
||||
)
|
||||
|
||||
target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})
|
||||
target_compile_features(ggml PUBLIC c_std_11) # don't bump
|
||||
target_link_libraries(ggml PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
|
||||
if (GGML_USE_CPU_HBM)
|
||||
target_link_libraries(ggml PUBLIC memkind)
|
||||
endif()
|
||||
|
||||
add_library(ggml_static STATIC $<TARGET_OBJECTS:ggml>)
|
||||
if (BUILD_SHARED_LIBS)
|
||||
@@ -683,53 +585,14 @@ if (BUILD_SHARED_LIBS)
|
||||
if (LLAMA_METAL)
|
||||
set_target_properties(llama PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal")
|
||||
endif()
|
||||
install(TARGETS llama LIBRARY)
|
||||
endif()
|
||||
|
||||
|
||||
#
|
||||
# install
|
||||
#
|
||||
|
||||
include(GNUInstallDirs)
|
||||
include(CMakePackageConfigHelpers)
|
||||
|
||||
set(LLAMA_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR}
|
||||
CACHE PATH "Location of header files")
|
||||
set(LLAMA_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR}
|
||||
CACHE PATH "Location of library files")
|
||||
set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR}
|
||||
CACHE PATH "Location of binary files")
|
||||
set(LLAMA_BUILD_NUMBER ${BUILD_NUMBER})
|
||||
set(LLAMA_BUILD_COMMIT ${BUILD_COMMIT})
|
||||
set(LLAMA_INSTALL_VERSION 0.0.${BUILD_NUMBER})
|
||||
|
||||
configure_package_config_file(
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/scripts/LlamaConfig.cmake.in
|
||||
${CMAKE_CURRENT_BINARY_DIR}/LlamaConfig.cmake
|
||||
INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Llama
|
||||
PATH_VARS LLAMA_INCLUDE_INSTALL_DIR
|
||||
LLAMA_LIB_INSTALL_DIR
|
||||
LLAMA_BIN_INSTALL_DIR )
|
||||
|
||||
write_basic_package_version_file(
|
||||
${CMAKE_CURRENT_BINARY_DIR}/LlamaConfigVersion.cmake
|
||||
VERSION ${LLAMA_INSTALL_VERSION}
|
||||
COMPATIBILITY SameMajorVersion)
|
||||
|
||||
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfig.cmake
|
||||
${CMAKE_CURRENT_BINARY_DIR}/LlamaConfigVersion.cmake
|
||||
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Llama)
|
||||
|
||||
set(GGML_PUBLIC_HEADERS "ggml.h"
|
||||
"${GGML_HEADERS_CUDA}" "${GGML_HEADERS_OPENCL}"
|
||||
"${GGML_HEADERS_METAL}" "${GGML_HEADERS_MPI}" "${GGML_HEADERS_EXTRA}")
|
||||
|
||||
set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
|
||||
install(TARGETS ggml PUBLIC_HEADER)
|
||||
|
||||
set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/llama.h)
|
||||
install(TARGETS llama LIBRARY PUBLIC_HEADER)
|
||||
|
||||
install(
|
||||
FILES convert.py
|
||||
PERMISSIONS
|
||||
|
||||
Makefile: 402 changes
@@ -1,11 +1,10 @@
|
||||
# Define the default target now so that it is always the first target
|
||||
BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple save-load-state server embd-input-test gguf llama-bench baby-llama beam-search speculative tests/test-c.o
|
||||
BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple server embd-input-test gguf llama-bench
|
||||
|
||||
# Binaries only useful for tests
|
||||
TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama
|
||||
TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0
|
||||
|
||||
# Code coverage output files
|
||||
COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
|
||||
default: $(BUILD_TARGETS)
|
||||
|
||||
ifndef UNAME_S
|
||||
UNAME_S := $(shell uname -s)
|
||||
@@ -19,13 +18,12 @@ ifndef UNAME_M
|
||||
UNAME_M := $(shell uname -m)
|
||||
endif
|
||||
|
||||
CCV := $(shell $(CC) --version | head -n 1)
|
||||
CXXV := $(shell $(CXX) --version | head -n 1)
|
||||
|
||||
# Mac OS + Arm can report x86_64
|
||||
# ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
|
||||
ifeq ($(UNAME_S),Darwin)
|
||||
ifndef LLAMA_NO_METAL
|
||||
LLAMA_METAL := 1
|
||||
endif
|
||||
|
||||
ifneq ($(UNAME_P),arm)
|
||||
SYSCTL_M := $(shell sysctl -n hw.optional.arm64 2>/dev/null)
|
||||
ifeq ($(SYSCTL_M),1)
|
||||
@@ -36,164 +34,64 @@ ifeq ($(UNAME_S),Darwin)
|
||||
endif
|
||||
endif
|
||||
|
||||
ifneq '' '$(or $(filter clean,$(MAKECMDGOALS)),$(LLAMA_METAL))'
|
||||
BUILD_TARGETS += metal
|
||||
endif
|
||||
|
||||
default: $(BUILD_TARGETS)
|
||||
|
||||
test: $(TEST_TARGETS)
|
||||
@failures=0; \
|
||||
for test_target in $(TEST_TARGETS); do \
|
||||
if [ "$$test_target" = "tests/test-tokenizer-0-llama" ]; then \
|
||||
./$$test_target $(CURDIR)/models/ggml-vocab-llama.gguf; \
|
||||
elif [ "$$test_target" = "tests/test-tokenizer-0-falcon" ]; then \
|
||||
continue; \
|
||||
elif [ "$$test_target" = "tests/test-tokenizer-1-llama" ]; then \
|
||||
continue; \
|
||||
else \
|
||||
echo "Running test $$test_target..."; \
|
||||
./$$test_target; \
|
||||
fi; \
|
||||
if [ $$? -ne 0 ]; then \
|
||||
printf 'Test $$test_target FAILED!\n\n' $$test_target; \
|
||||
failures=$$(( failures + 1 )); \
|
||||
else \
|
||||
printf 'Test %s passed.\n\n' $$test_target; \
|
||||
fi; \
|
||||
done; \
|
||||
if [ $$failures -gt 0 ]; then \
|
||||
printf '\n%s tests failed.\n' $$failures; \
|
||||
exit 1; \
|
||||
fi
|
||||
@echo 'All tests passed.'
|
||||
|
||||
all: $(BUILD_TARGETS) $(TEST_TARGETS)
|
||||
|
||||
coverage: ## Run code coverage
|
||||
gcov -pb tests/*.cpp
|
||||
|
||||
lcov-report: coverage ## Generate lcov report
|
||||
mkdir -p lcov-report
|
||||
lcov --capture --directory . --output-file lcov-report/coverage.info
|
||||
genhtml lcov-report/coverage.info --output-directory lcov-report
|
||||
|
||||
gcovr-report: coverage ## Generate gcovr report
|
||||
mkdir -p gcovr-report
|
||||
gcovr --root . --html --html-details --output gcovr-report/coverage.html
|
||||
|
||||
ifdef RISCV_CROSS_COMPILE
|
||||
CC := riscv64-unknown-linux-gnu-gcc
|
||||
CXX := riscv64-unknown-linux-gnu-g++
|
||||
endif
|
||||
|
||||
CCV := $(shell $(CC) --version | head -n 1)
|
||||
CXXV := $(shell $(CXX) --version | head -n 1)
|
||||
|
||||
#
|
||||
# Compile flags
|
||||
#
|
||||
|
||||
# keep standard at C11 and C++11
|
||||
MK_CPPFLAGS = -I. -Icommon
|
||||
MK_CFLAGS = -std=c11 -fPIC
|
||||
MK_CXXFLAGS = -std=c++11 -fPIC
|
||||
|
||||
# -Ofast tends to produce faster code, but may not be available for some compilers.
|
||||
ifdef LLAMA_FAST
|
||||
MK_CFLAGS += -Ofast
|
||||
MK_HOST_CXXFLAGS += -Ofast
|
||||
MK_CUDA_CXXFLAGS += -O3
|
||||
OPT = -Ofast
|
||||
else
|
||||
MK_CFLAGS += -O3
|
||||
MK_CXXFLAGS += -O3
|
||||
endif
|
||||
|
||||
# clock_gettime came in POSIX.1b (1993)
|
||||
# CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional
|
||||
# posix_memalign came in POSIX.1-2001 / SUSv3
|
||||
# M_PI is an XSI extension since POSIX.1-2001 / SUSv3, came in XPG1 (1985)
|
||||
MK_CPPFLAGS += -D_XOPEN_SOURCE=600
|
||||
|
||||
# Somehow in OpenBSD whenever POSIX conformance is specified
|
||||
# some string functions rely on locale_t availability,
|
||||
# which was introduced in POSIX.1-2008, forcing us to go higher
|
||||
ifeq ($(UNAME_S),OpenBSD)
|
||||
MK_CPPFLAGS += -U_XOPEN_SOURCE -D_XOPEN_SOURCE=700
|
||||
endif
|
||||
|
||||
# Data types, macros and functions related to controlling CPU affinity and
|
||||
# some memory allocation are available on Linux through GNU extensions in libc
|
||||
ifeq ($(UNAME_S),Linux)
|
||||
MK_CPPFLAGS += -D_GNU_SOURCE
|
||||
endif
|
||||
|
||||
# RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1,
|
||||
# and on macOS its availability depends on enabling Darwin extensions
|
||||
# similarly on DragonFly, enabling BSD extensions is necessary
|
||||
ifeq ($(UNAME_S),Darwin)
|
||||
MK_CPPFLAGS += -D_DARWIN_C_SOURCE
|
||||
endif
|
||||
ifeq ($(UNAME_S),DragonFly)
|
||||
MK_CPPFLAGS += -D__BSD_VISIBLE
|
||||
endif
|
||||
|
||||
# alloca is a non-standard interface that is not visible on BSDs when
|
||||
# POSIX conformance is specified, but not all of them provide a clean way
|
||||
# to enable it in such cases
|
||||
ifeq ($(UNAME_S),FreeBSD)
|
||||
MK_CPPFLAGS += -D__BSD_VISIBLE
|
||||
endif
|
||||
ifeq ($(UNAME_S),NetBSD)
|
||||
MK_CPPFLAGS += -D_NETBSD_SOURCE
|
||||
endif
|
||||
ifeq ($(UNAME_S),OpenBSD)
|
||||
MK_CPPFLAGS += -D_BSD_SOURCE
|
||||
OPT = -O3
|
||||
endif
|
||||
CFLAGS = -I. $(OPT) -std=c11 -fPIC
|
||||
CXXFLAGS = -I. -I./common $(OPT) -std=c++11 -fPIC
|
||||
LDFLAGS =
|
||||
|
||||
ifdef LLAMA_DEBUG
|
||||
MK_CFLAGS += -O0 -g
|
||||
MK_CXXFLAGS += -O0 -g
|
||||
MK_LDFLAGS += -g
|
||||
CFLAGS += -O0 -g
|
||||
CXXFLAGS += -O0 -g
|
||||
LDFLAGS += -g
|
||||
else
|
||||
MK_CPPFLAGS += -DNDEBUG
|
||||
CFLAGS += -DNDEBUG
|
||||
CXXFLAGS += -DNDEBUG
|
||||
endif
|
||||
|
||||
ifdef LLAMA_SERVER_VERBOSE
|
||||
MK_CPPFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE)
|
||||
CXXFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE)
|
||||
endif
|
||||
|
||||
|
||||
ifdef LLAMA_CODE_COVERAGE
|
||||
MK_CXXFLAGS += -fprofile-arcs -ftest-coverage -dumpbase ''
|
||||
endif
|
||||
|
||||
ifdef LLAMA_DISABLE_LOGS
|
||||
MK_CPPFLAGS += -DLOG_DISABLE_LOGS
|
||||
endif # LLAMA_DISABLE_LOGS
|
||||
|
||||
# warnings
|
||||
MK_CFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith \
|
||||
-Wmissing-prototypes -Werror=implicit-int -Wno-unused-function
|
||||
MK_CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wmissing-declarations -Wno-unused-function -Wno-multichar
|
||||
|
||||
# TODO(cebtenzzre): remove this once PR #2632 gets merged
|
||||
TTFS_CXXFLAGS = $(CXXFLAGS) -Wno-missing-declarations
|
||||
|
||||
ifneq '' '$(findstring clang,$(shell $(CXX) --version))'
|
||||
# clang++ only
|
||||
MK_CXXFLAGS += -Wmissing-prototypes
|
||||
TTFS_CXXFLAGS += -Wno-missing-prototypes
|
||||
else
|
||||
# g++ only
|
||||
MK_CXXFLAGS += -Wno-format-truncation -Wno-array-bounds
|
||||
endif
|
||||
CFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith \
|
||||
-Wmissing-prototypes
|
||||
CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar
|
||||
|
||||
# OS specific
|
||||
# TODO: support Windows
|
||||
ifneq '' '$(filter $(UNAME_S),Linux Darwin FreeBSD NetBSD OpenBSD Haiku)'
|
||||
MK_CFLAGS += -pthread
|
||||
MK_CXXFLAGS += -pthread
|
||||
ifeq ($(UNAME_S),Linux)
|
||||
CFLAGS += -pthread
|
||||
CXXFLAGS += -pthread
|
||||
endif
|
||||
ifeq ($(UNAME_S),Darwin)
|
||||
CFLAGS += -pthread
|
||||
CXXFLAGS += -pthread
|
||||
endif
|
||||
ifeq ($(UNAME_S),FreeBSD)
|
||||
CFLAGS += -pthread
|
||||
CXXFLAGS += -pthread
|
||||
endif
|
||||
ifeq ($(UNAME_S),NetBSD)
|
||||
CFLAGS += -pthread
|
||||
CXXFLAGS += -pthread
|
||||
endif
|
||||
ifeq ($(UNAME_S),OpenBSD)
|
||||
CFLAGS += -pthread
|
||||
CXXFLAGS += -pthread
|
||||
endif
|
||||
ifeq ($(UNAME_S),Haiku)
|
||||
CFLAGS += -pthread
|
||||
CXXFLAGS += -pthread
|
||||
endif
|
||||
|
||||
# detect Windows
|
||||
@@ -219,117 +117,104 @@ ifeq ($(_WIN32),1)
|
||||
endif
|
||||
|
||||
ifdef LLAMA_GPROF
|
||||
MK_CFLAGS += -pg
|
||||
MK_CXXFLAGS += -pg
|
||||
CFLAGS += -pg
|
||||
CXXFLAGS += -pg
|
||||
endif
|
||||
ifdef LLAMA_PERF
|
||||
MK_CPPFLAGS += -DGGML_PERF
|
||||
CFLAGS += -DGGML_PERF
|
||||
CXXFLAGS += -DGGML_PERF
|
||||
endif
|
||||
|
||||
# Architecture specific
|
||||
# TODO: probably these flags need to be tweaked on some architectures
|
||||
# feel free to update the Makefile for your architecture and send a pull request or issue
|
||||
|
||||
ifndef RISCV
|
||||
|
||||
ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
|
||||
# Use all CPU extensions that are available:
|
||||
MK_CFLAGS += -march=native -mtune=native
|
||||
MK_HOST_CXXFLAGS += -march=native -mtune=native
|
||||
CFLAGS += -march=native -mtune=native
|
||||
CXXFLAGS += -march=native -mtune=native
|
||||
|
||||
# Usage AVX-only
|
||||
#MK_CFLAGS += -mfma -mf16c -mavx
|
||||
#MK_CXXFLAGS += -mfma -mf16c -mavx
|
||||
#CFLAGS += -mfma -mf16c -mavx
|
||||
#CXXFLAGS += -mfma -mf16c -mavx
|
||||
|
||||
# Usage SSSE3-only (Not is SSE3!)
|
||||
#MK_CFLAGS += -mssse3
|
||||
#MK_CXXFLAGS += -mssse3
|
||||
endif
|
||||
|
||||
# The stack is only 16-byte aligned on Windows, so don't let gcc emit aligned moves.
|
||||
# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412
|
||||
# https://github.com/ggerganov/llama.cpp/issues/2922
|
||||
ifneq '' '$(findstring mingw,$(shell $(CC) -dumpmachine))'
|
||||
MK_CFLAGS += -Xassembler -muse-unaligned-vector-move
|
||||
MK_CXXFLAGS += -Xassembler -muse-unaligned-vector-move
|
||||
#CFLAGS += -mssse3
|
||||
#CXXFLAGS += -mssse3
|
||||
endif
|
||||
|
||||
ifneq ($(filter aarch64%,$(UNAME_M)),)
|
||||
# Apple M1, M2, etc.
|
||||
# Raspberry Pi 3, 4, Zero 2 (64-bit)
|
||||
MK_CFLAGS += -mcpu=native
|
||||
MK_CXXFLAGS += -mcpu=native
|
||||
CFLAGS += -mcpu=native
|
||||
CXXFLAGS += -mcpu=native
|
||||
endif
|
||||
|
||||
ifneq ($(filter armv6%,$(UNAME_M)),)
|
||||
# Raspberry Pi 1, Zero
|
||||
MK_CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
|
||||
MK_CXXFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
|
||||
CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
|
||||
endif
|
||||
|
||||
ifneq ($(filter armv7%,$(UNAME_M)),)
|
||||
# Raspberry Pi 2
|
||||
MK_CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
|
||||
MK_CXXFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
|
||||
CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
|
||||
endif
|
||||
|
||||
ifneq ($(filter armv8%,$(UNAME_M)),)
|
||||
# Raspberry Pi 3, 4, Zero 2 (32-bit)
|
||||
MK_CFLAGS += -mfp16-format=ieee -mno-unaligned-access
|
||||
MK_CXXFLAGS += -mfp16-format=ieee -mno-unaligned-access
|
||||
CFLAGS += -mfp16-format=ieee -mno-unaligned-access
|
||||
endif
|
||||
|
||||
ifneq ($(filter ppc64%,$(UNAME_M)),)
|
||||
POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)
|
||||
ifneq (,$(findstring POWER9,$(POWER9_M)))
|
||||
MK_CFLAGS += -mcpu=power9
|
||||
MK_CXXFLAGS += -mcpu=power9
|
||||
CFLAGS += -mcpu=power9
|
||||
CXXFLAGS += -mcpu=power9
|
||||
endif
|
||||
# Require c++23's std::byteswap for big-endian support.
|
||||
ifeq ($(UNAME_M),ppc64)
|
||||
CXXFLAGS += -std=c++23 -DGGML_BIG_ENDIAN
|
||||
endif
|
||||
endif
|
||||
|
||||
else
|
||||
MK_CFLAGS += -march=rv64gcv -mabi=lp64d
|
||||
MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
|
||||
endif
|
||||
|
||||
ifndef LLAMA_NO_K_QUANTS
|
||||
MK_CPPFLAGS += -DGGML_USE_K_QUANTS
|
||||
CFLAGS += -DGGML_USE_K_QUANTS
|
||||
CXXFLAGS += -DGGML_USE_K_QUANTS
|
||||
OBJS += k_quants.o
|
||||
ifdef LLAMA_QKK_64
|
||||
MK_CPPFLAGS += -DGGML_QKK_64
|
||||
CFLAGS += -DGGML_QKK_64
|
||||
CXXFLAGS += -DGGML_QKK_64
|
||||
endif
|
||||
endif
|
||||
|
||||
ifndef LLAMA_NO_ACCELERATE
|
||||
# Mac OS - include Accelerate framework.
|
||||
# `-framework Accelerate` works both with Apple Silicon and Mac Intel
|
||||
# Mac M1 - include Accelerate framework.
|
||||
# `-framework Accelerate` works on Mac Intel as well, with negliable performance boost (as of the predict time).
|
||||
ifeq ($(UNAME_S),Darwin)
|
||||
MK_CPPFLAGS += -DGGML_USE_ACCELERATE
|
||||
MK_LDFLAGS += -framework Accelerate
|
||||
CFLAGS += -DGGML_USE_ACCELERATE
|
||||
LDFLAGS += -framework Accelerate
|
||||
endif
|
||||
endif # LLAMA_NO_ACCELERATE
|
||||
|
||||
ifdef LLAMA_MPI
|
||||
MK_CPPFLAGS += -DGGML_USE_MPI
|
||||
MK_CFLAGS += -Wno-cast-qual
|
||||
MK_CXXFLAGS += -Wno-cast-qual
|
||||
CFLAGS += -DGGML_USE_MPI -Wno-cast-qual
|
||||
CXXFLAGS += -DGGML_USE_MPI -Wno-cast-qual
|
||||
OBJS += ggml-mpi.o
|
||||
endif # LLAMA_MPI
|
||||
|
||||
ifdef LLAMA_OPENBLAS
|
||||
MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas)
|
||||
MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas)
|
||||
MK_LDFLAGS += $(shell pkg-config --libs openblas)
|
||||
CFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags openblas)
|
||||
LDFLAGS += $(shell pkg-config --libs openblas)
|
||||
endif # LLAMA_OPENBLAS
|
||||
|
||||
ifdef LLAMA_BLIS
|
||||
MK_CPPFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
|
||||
MK_LDFLAGS += -lblis -L/usr/local/lib
|
||||
CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
|
||||
LDFLAGS += -lblis -L/usr/local/lib
|
||||
endif # LLAMA_BLIS
|
||||
|
||||
ifdef LLAMA_CUBLAS
MK_CPPFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
MK_LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
CFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
CXXFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
OBJS += ggml-cuda.o
NVCCFLAGS = --forward-unknown-to-host-compiler -use_fast_math
ifdef LLAMA_CUDA_NVCC
@@ -368,11 +253,6 @@ ifdef LLAMA_CUDA_KQUANTS_ITER
else
NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
endif
ifdef LLAMA_CUDA_PEER_MAX_BATCH_SIZE
NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=$(LLAMA_CUDA_PEER_MAX_BATCH_SIZE)
else
NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128
endif # LLAMA_CUDA_PEER_MAX_BATCH_SIZE
#ifdef LLAMA_CUDA_CUBLAS
# NVCCFLAGS += -DGGML_CUDA_CUBLAS
#endif # LLAMA_CUDA_CUBLAS
@@ -380,20 +260,19 @@ ifdef LLAMA_CUDA_CCBIN
NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
endif
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
$(NVCC) $(NVCCFLAGS) -Wno-pedantic -c $< -o $@
$(NVCC) $(NVCCFLAGS) $(subst -Ofast,-O3,$(CXXFLAGS)) -Wno-pedantic -c $< -o $@
endif # LLAMA_CUBLAS

ifdef LLAMA_CLBLAST

MK_CPPFLAGS += -DGGML_USE_CLBLAST $(shell pkg-config --cflags-only-I clblast OpenCL)
MK_CFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
MK_CXXFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
CFLAGS += -DGGML_USE_CLBLAST $(shell pkg-config --cflags clblast OpenCL)
CXXFLAGS += -DGGML_USE_CLBLAST $(shell pkg-config --cflags clblast OpenCL)

# Mac provides OpenCL as a framework
ifeq ($(UNAME_S),Darwin)
MK_LDFLAGS += -lclblast -framework OpenCL
LDFLAGS += -lclblast -framework OpenCL
else
MK_LDFLAGS += $(shell pkg-config --libs clblast OpenCL)
LDFLAGS += $(shell pkg-config --libs clblast OpenCL)
endif
OBJS += ggml-opencl.o

@@ -408,13 +287,15 @@ ifdef LLAMA_HIPBLAS
LLAMA_CUDA_DMMV_X ?= 32
LLAMA_CUDA_MMV_Y ?= 1
LLAMA_CUDA_KQUANTS_ITER ?= 2
MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
MK_LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
MK_LDFLAGS += -lhipblas -lamdhip64 -lrocblas
CFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
CXXFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
LDFLAGS += -lhipblas -lamdhip64 -lrocblas
HIPFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS))
HIPFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
HIPFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
HIPFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
HIPFLAGS += -DCC_TURING=1000000000
ifdef LLAMA_CUDA_FORCE_DMMV
HIPFLAGS += -DGGML_CUDA_FORCE_DMMV
endif # LLAMA_CUDA_FORCE_DMMV
@@ -424,12 +305,10 @@ ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
endif # LLAMA_HIPBLAS

ifdef LLAMA_METAL
MK_CPPFLAGS += -DGGML_USE_METAL
MK_LDFLAGS += -framework Foundation -framework Metal -framework MetalKit
OBJS += ggml-metal.o
ifdef LLAMA_METAL_NDEBUG
MK_CPPFLAGS += -DGGML_METAL_NDEBUG
endif
CFLAGS += -DGGML_USE_METAL -DGGML_METAL_NDEBUG
CXXFLAGS += -DGGML_USE_METAL
LDFLAGS += -framework Foundation -framework Metal -framework MetalKit
OBJS += ggml-metal.o
endif # LLAMA_METAL

ifdef LLAMA_METAL
@@ -442,36 +321,24 @@ ggml-mpi.o: ggml-mpi.c ggml-mpi.h
$(CC) $(CFLAGS) -c $< -o $@
endif # LLAMA_MPI

ifndef LLAMA_NO_K_QUANTS
ifdef LLAMA_NO_K_QUANTS
k_quants.o: k_quants.c k_quants.h
$(CC) $(CFLAGS) -c $< -o $@
endif # LLAMA_NO_K_QUANTS

# combine build flags with cmdline overrides
override CFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CFLAGS) $(CFLAGS)
override CXXFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CXXFLAGS) $(CXXFLAGS)
override CUDA_CXXFLAGS := $(MK_CUDA_CXXFLAGS) $(CUDA_CXXFLAGS)
override HOST_CXXFLAGS := $(MK_HOST_CXXFLAGS) $(HOST_CXXFLAGS)
override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS)

# save CXXFLAGS before we add host-only options
NVCCFLAGS := $(NVCCFLAGS) $(CXXFLAGS) $(CUDA_CXXFLAGS) -Wno-pedantic -Xcompiler "$(HOST_CXXFLAGS)"
override CXXFLAGS += $(HOST_CXXFLAGS)

#
# Print build information
#

$(info I llama.cpp build info: )
$(info I UNAME_S:  $(UNAME_S))
$(info I UNAME_P:  $(UNAME_P))
$(info I UNAME_M:  $(UNAME_M))
$(info I CFLAGS:   $(CFLAGS))
$(info I CXXFLAGS: $(CXXFLAGS))
$(info I NVCCFLAGS: $(NVCCFLAGS))
$(info I LDFLAGS:  $(LDFLAGS))
$(info I CC:       $(CCV))
$(info I CXX:      $(CXXV))
$(info I UNAME_S:  $(UNAME_S))
$(info I UNAME_P:  $(UNAME_P))
$(info I UNAME_M:  $(UNAME_M))
$(info I CFLAGS:   $(CFLAGS))
$(info I CXXFLAGS: $(CXXFLAGS))
$(info I LDFLAGS:  $(LDFLAGS))
$(info I CC:       $(CCV))
$(info I CXX:      $(CXXV))
$(info )

#
@@ -489,7 +356,7 @@ OBJS += ggml-alloc.o
llama.o: llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h llama.h
$(CXX) $(CXXFLAGS) -c $< -o $@

common.o: common/common.cpp common/common.h build-info.h common/log.h
common.o: common/common.cpp common/common.h
$(CXX) $(CXXFLAGS) -c $< -o $@

console.o: common/console.cpp common/console.h
@@ -502,7 +369,7 @@ libllama.so: llama.o ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)

clean:
rm -vrf *.o tests/*.o *.so *.dll benchmark-matmult build-info.h *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
rm -vf *.o *.so *.dll main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server simple vdot train-text-from-scratch convert-llama2c-to-ggml embd-input-test gguf llama-bench build-info.h $(TEST_TARGETS)

#
# Examples
@@ -542,34 +409,20 @@ $(LIB_PRE)embdinput$(DSO_EXT): examples/embd-input/embd-input.h examples/embd-in
embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %$(DSO_EXT),$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. -lembdinput

gguf: examples/gguf/gguf.cpp ggml.o llama.o $(OBJS)
gguf: examples/gguf/gguf.cpp build-info.h ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o common.o $(OBJS)
$(CXX) $(TTFS_CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp ggml.o llama.o $(OBJS)
convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp build-info.h ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

llama-bench: examples/llama-bench/llama-bench.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

beam-search: examples/beam-search/beam-search.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

speculative: examples/speculative/speculative.cpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

ifdef LLAMA_METAL
metal: examples/metal/metal.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
endif

build-info.h: $(wildcard .git/index) scripts/build-info.sh
@sh scripts/build-info.sh $(CC) > $@.tmp
@sh scripts/build-info.sh > $@.tmp
@if ! cmp -s $@.tmp $@; then \
mv $@.tmp $@; \
else \
@@ -589,38 +442,29 @@ benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o
vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)

tests/test-llama-grammar: tests/test-llama-grammar.cpp build-info.h ggml.o common.o grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-llama-grammar: tests/test-llama-grammar.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)

tests/test-grammar-parser: tests/test-grammar-parser.cpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-grammar-parser: tests/test-grammar-parser.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)

tests/test-double-float: tests/test-double-float.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)

tests/test-grad0: tests/test-grad0.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)

tests/test-opt: tests/test-opt.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)

tests/test-quantize-fns: tests/test-quantize-fns.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)

tests/test-quantize-perf: tests/test-quantize-perf.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)

tests/test-sampling: tests/test-sampling.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)

tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

tests/test-c.o: tests/test-c.c llama.h
$(CC) $(CFLAGS) -c $(filter-out %.h,$^) -o $@
tests/test-tokenizer-0: tests/test-tokenizer-0.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)

@@ -2,30 +2,8 @@

import PackageDescription

#if arch(arm) || arch(arm64)
let platforms: [SupportedPlatform]? = [
.macOS(.v11),
.iOS(.v14),
.watchOS(.v4),
.tvOS(.v14)
]
let exclude: [String] = []
let additionalSources: [String] = ["ggml-metal.m"]
let additionalSettings: [CSetting] = [
.unsafeFlags(["-fno-objc-arc"]),
.define("GGML_SWIFT"),
.define("GGML_USE_METAL")
]
#else
let platforms: [SupportedPlatform]? = nil
let exclude: [String] = ["ggml-metal.metal"]
let additionalSources: [String] = []
let additionalSettings: [CSetting] = []
#endif

let package = Package(
name: "llama",
platforms: platforms,
products: [
.library(name: "llama", targets: ["llama"]),
],
@@ -33,23 +11,14 @@ let package = Package(
.target(
name: "llama",
path: ".",
exclude: exclude,
sources: [
"ggml.c",
"llama.cpp",
"ggml-alloc.c",
"k_quants.c",
] + additionalSources,
exclude: ["ggml-metal.metal"],
sources: ["ggml.c", "llama.cpp"],
publicHeadersPath: "spm-headers",
cSettings: [
.unsafeFlags(["-Wno-shorten-64-to-32"]),
.define("GGML_USE_K_QUANTS"),
.define("GGML_USE_ACCELERATE")
] + additionalSettings,
cSettings: [.unsafeFlags(["-Wno-shorten-64-to-32"]), .define("GGML_USE_ACCELERATE")],
linkerSettings: [
.linkedFramework("Accelerate")
]
)
),
],
cxxLanguageStandard: .cxx11
)

README.md
@@ -11,11 +21,21 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++

### Hot topics

- Parallel decoding + continuous batching support incoming: [#3228](https://github.com/ggerganov/llama.cpp/pull/3228) \
**Devs should become familiar with the new API**
- Local Falcon 180B inference on Mac Studio
- #### IMPORTANT: Tokenizer fixes and API change (developers and projects using `llama.cpp` built-in tokenization must read): https://github.com/ggerganov/llama.cpp/pull/2810

https://github.com/ggerganov/llama.cpp/assets/1991296/98abd4e8-7077-464c-ae89-aebabca7757e
- GGUFv2 adds support for 64-bit sizes + backwards compatible: https://github.com/ggerganov/llama.cpp/pull/2821

- Added support for Falcon models: https://github.com/ggerganov/llama.cpp/pull/2717

- A new file format has been introduced: [GGUF](https://github.com/ggerganov/llama.cpp/pull/2398)

Last revision compatible with the old format: [dadbed9](https://github.com/ggerganov/llama.cpp/commit/dadbed99e65252d79f81101a392d0d6497b86caa)

### Current `master` should be considered in Beta - expect some issues for a few days!

### Be prepared to re-convert and / or re-quantize your GGUF models while this notice is up!

### Issues with non-GGUF models will be considered with low priority!

----

@@ -97,20 +107,17 @@ as the main playground for developing new features for the [ggml](https://github

- Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
- Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
- Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp), [hlhr202/llama-node](https://github.com/hlhr202/llama-node)
- Node.js: [hlhr202/llama-node](https://github.com/hlhr202/llama-node)
- Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)
- Rust: [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
- C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp)
- Scala 3: [donderom/llm4s](https://github.com/donderom/llm4s)
- Clojure: [phronmophobic/llama.clj](https://github.com/phronmophobic/llama.clj)
- React Native: [mybigday/llama.rn](https://github.com/mybigday/llama.rn)
- Java: [kherud/java-llama.cpp](https://github.com/kherud/java-llama.cpp)

**UI:**

- [nat/openplayground](https://github.com/nat/openplayground)
- [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui)
- [withcatai/catai](https://github.com/withcatai/catai)

---

@@ -270,11 +277,29 @@ In order to build llama.cpp you have three different options.

### Metal Build

On MacOS, Metal is enabled by default. Using Metal makes the computation run on the GPU.
To disable the Metal build at compile time use the `LLAMA_NO_METAL=1` flag or the `LLAMA_METAL=OFF` cmake option.
Using Metal allows the computation to be executed on the GPU for Apple devices:

When built with Metal support, you can explicitly disable GPU inference with the `--gpu-layers|-ngl 0` command-line
argument.
- Using `make`:

```bash
LLAMA_METAL=1 make
```

- Using `CMake`:

```bash
mkdir build-metal
cd build-metal
cmake -DLLAMA_METAL=ON ..
cmake --build . --config Release
```

When built with Metal support, you can enable GPU inference with the `--gpu-layers|-ngl` command-line argument.
Any value larger than 0 will offload the computation to the GPU. For example:

```bash
./main -m ./models/7B/ggml-model-q4_0.gguf -n 128 -ngl 1
```

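Conversely, as noted above, a Metal-enabled binary can be kept entirely on the CPU by passing `-ngl 0` (a minimal sketch; the model path is just an example):

```bash
# Metal build, but run inference on the CPU only
./main -m ./models/7B/ggml-model-q4_0.gguf -n 128 -ngl 0
```
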
### MPI Build

@@ -393,18 +418,17 @@ Building the program with BLAS support may lead to some performance improvements
<!---
| LLAMA_CUDA_CUBLAS | Boolean | false | Use cuBLAS instead of custom CUDA kernels for prompt processing. Faster for all quantization formats except for q4_0 and q8_0, especially for k-quants. Increases VRAM usage (700 MiB for 7b, 970 MiB for 13b, 1430 MiB for 33b). |
--->
| Option | Legal values | Default | Description |
|--------------------------------|------------------------|---------|-------------|
| LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
| LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
| LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. |
| LLAMA_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
| LLAMA_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
| Option | Legal values | Default | Description |
|-------------------------|------------------------|---------|-------------|
| LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
| LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
| LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. |
| LLAMA_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |

- #### hipBLAS

This provides BLAS acceleration on HIP-supported AMD GPUs.
This provide BLAS acceleation on HIP supported GPU like AMD GPU.
Make sure to have ROCm installed.
You can download it from your Linux distro's package manager or from here: [ROCm Quick Start (Linux)](https://rocm.docs.amd.com/en/latest/deploy/linux/quick_start.html).
Windows support is coming soon...
@@ -438,8 +462,6 @@ Building the program with BLAS support may lead to some performance improvements
You will need the [OpenCL SDK](https://github.com/KhronosGroup/OpenCL-SDK).
- For Ubuntu or Debian, the packages `opencl-headers`, `ocl-icd` may be needed.

- For Windows, a pre-built SDK is available on the [OpenCL Releases](https://github.com/KhronosGroup/OpenCL-SDK/releases) page.

- <details>
<summary>Installing the OpenCL SDK from source</summary>

@@ -457,27 +479,10 @@ Building the program with BLAS support may lead to some performance improvements
```
</details>

##### Installing CLBlast

Pre-built CLBlast binaries may be found on the [CLBlast Releases](https://github.com/CNugteren/CLBlast/releases) page. For Unix variants, it may also be found in your operating system's packages.

Alternatively, they may be built from source.
Installing CLBlast: it may be found in your operating system's packages.

- <details>
<summary>Windows:</summary>

```cmd
set OPENCL_SDK_ROOT="C:/OpenCL-SDK-v2023.04.17-Win-x64"
git clone https://github.com/CNugteren/CLBlast.git
mkdir CLBlast\build
cd CLBlast\build
cmake .. -DBUILD_SHARED_LIBS=OFF -DOVERRIDE_MSVC_FLAGS_TO_MT=OFF -DTUNERS=OFF -DOPENCL_ROOT=%OPENCL_SDK_ROOT% -G "Visual Studio 17 2022" -A x64
cmake --build . --config Release
cmake --install . --prefix C:/CLBlast
```

- <details>
<summary>Unix:</summary>
<summary>If not, then installing from source:</summary>

```sh
git clone https://github.com/CNugteren/CLBlast.git
@@ -491,32 +496,21 @@ Building the program with BLAS support may lead to some performance improvements
Where `/some/path` is where the built library will be installed (default is `/usr/local`).
</details>

##### Building Llama with CLBlast
Building:

- Build with make:
```sh
make LLAMA_CLBLAST=1
```
- CMake (Unix):
- CMake:
```sh
mkdir build
cd build
cmake .. -DLLAMA_CLBLAST=ON -DCLBlast_dir=/some/path
cmake --build . --config Release
```
- CMake (Windows):
```cmd
set CL_BLAST_CMAKE_PKG="C:/CLBlast/lib/cmake/CLBlast"
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
mkdir build
cd build
cmake .. -DBUILD_SHARED_LIBS=OFF -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH=%CL_BLAST_CMAKE_PKG% -G "Visual Studio 17 2022" -A x64
cmake --build . --config Release
cmake --install . --prefix C:/LlamaCPP
```

##### Running Llama with CLBlast
Running:

The CLBlast build supports `--gpu-layers|-ngl` like the CUDA version does.

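To tie the BLAS build options above together, here is a minimal end-to-end sketch using the cuBLAS path; the model path and the option values are illustrative only:

```bash
# build with cuBLAS, tuning one of the compile-time options from the table above
make clean
make LLAMA_CUBLAS=1 LLAMA_CUDA_KQUANTS_ITER=1

# offload 32 layers to the GPU at run time
./main -m ./models/7B/ggml-model-q4_0.gguf -n 128 -ngl 32
```

The same `-ngl` run-time flag applies to the CLBlast build, as noted above.
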
@@ -728,12 +722,14 @@ python3 convert.py pygmalion-7b/ --outtype q4_1

- Refer to [Facebook's LLaMA download page](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) if you want to access the model data.
- Alternatively, if you want to save time and space, you can download already converted and quantized models from [TheBloke](https://huggingface.co/TheBloke), including:
- [LLaMA 2 7B base](https://huggingface.co/TheBloke/Llama-2-7B-GGUF)
- [LLaMA 2 13B base](https://huggingface.co/TheBloke/Llama-2-13B-GGUF)
- [LLaMA 2 70B base](https://huggingface.co/TheBloke/Llama-2-70B-GGUF)
- [LLaMA 2 7B chat](https://huggingface.co/TheBloke/Llama-2-7B-chat-GGUF)
- [LLaMA 2 13B chat](https://huggingface.co/TheBloke/Llama-2-13B-chat-GGUF)
- [LLaMA 2 70B chat](https://huggingface.co/TheBloke/Llama-2-70B-chat-GGUF)
- [LLaMA 2 7B base](https://huggingface.co/TheBloke/Llama-2-7B-GGML)
- [LLaMA 2 13B base](https://huggingface.co/TheBloke/Llama-2-13B-GGML)
- [LLaMA 2 70B base](https://huggingface.co/TheBloke/Llama-2-70B-GGML)
- [LLaMA 2 7B chat](https://huggingface.co/TheBloke/Llama-2-7B-chat-GGML)
- [LLaMA 2 13B chat](https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML)
- [LLaMA 2 70B chat](https://huggingface.co/TheBloke/Llama-2-70B-chat-GGML)
- Specify `-eps 1e-5` for best generation quality
- Specify `-gqa 8` for 70B models to work

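As an illustration of the notes above, an already-converted and quantized GGUF model downloaded from one of those links can be run directly; the file name below is a placeholder, so substitute whichever file you actually downloaded:

```bash
# run a pre-quantized GGUF model (placeholder file name)
./main -m ./models/llama-2-7b-chat.Q4_K_M.gguf -p "Building a website can be done in 10 simple steps:" -n 128
```
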
### Verifying the model files

@@ -847,17 +843,8 @@ Place your desired model into the `~/llama.cpp/models/` directory and execute th
#### Images
We have two Docker images available for this project:

1. `ghcr.io/ggerganov/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`)
2. `ghcr.io/ggerganov/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`)

Additionally, there the following images, similar to the above:

- `ghcr.io/ggerganov/llama.cpp:full-cuda`: Same as `full` but compiled with CUDA support. (platforms: `linux/amd64`)
- `ghcr.io/ggerganov/llama.cpp:light-cuda`: Same as `light` but compiled with CUDA support. (platforms: `linux/amd64`)
- `ghcr.io/ggerganov/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
- `ghcr.io/ggerganov/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)

The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](.devops/) and the Gitlab Action defined in [.github/workflows/docker.yml](.github/workflows/docker.yml). If you need different settings (for example, a different CUDA or ROCm library, you'll need to build the images locally for now).
1. `ghcr.io/ggerganov/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
2. `ghcr.io/ggerganov/llama.cpp:light`: This image only includes the main executable file.

#### Usage

codecov.yml
@@ -1,14 +0,0 @@
comment: off

coverage:
  status:
    project:
      default:
        target: auto
        threshold: 0
        base: auto
    patch:
      default:
        target: auto
        threshold: 0
        base: auto
@@ -1,21 +1,15 @@
#include "common.h"
#include "build-info.h"
#include "llama.h"

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstring>
#include <ctime>
#include <fstream>
#include <iterator>
#include <iostream>
#include <regex>
#include <sstream>
#include <cstring>
#include <fstream>
#include <string>
#include <iterator>
#include <algorithm>
#include <sstream>
#include <unordered_set>
#include <vector>
#include <cinttypes>
#include <regex>

#if defined(__APPLE__) && defined(__MACH__)
#include <sys/types.h>
@@ -24,17 +18,12 @@

#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
# define NOMINMAX
#endif
#include <codecvt>
#include <locale>
#define NOMINMAX
#include <windows.h>
#include <fcntl.h>
#include <io.h>
#else
#include <sys/ioctl.h>
#include <sys/stat.h>
#include <unistd.h>
#endif

@@ -57,7 +46,7 @@ int32_t get_num_physical_cores() {
|
||||
siblings.insert(line);
|
||||
}
|
||||
}
|
||||
if (!siblings.empty()) {
|
||||
if (siblings.size() > 0) {
|
||||
return static_cast<int32_t>(siblings.size());
|
||||
}
|
||||
#elif defined(__APPLE__) && defined(__MACH__)
|
||||
@@ -78,7 +67,7 @@ int32_t get_num_physical_cores() {
|
||||
return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
|
||||
}
|
||||
|
||||
static void process_escapes(std::string& input) {
|
||||
void process_escapes(std::string& input) {
|
||||
std::size_t input_len = input.length();
|
||||
std::size_t output_idx = 0;
|
||||
|
||||
@@ -104,6 +93,7 @@ static void process_escapes(std::string& input) {
|
||||
|
||||
bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
bool invalid_param = false;
|
||||
bool escape_prompt = false;
|
||||
std::string arg;
|
||||
gpt_params default_params;
|
||||
const std::string arg_prefix = "--";
|
||||
@@ -135,8 +125,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
break;
|
||||
}
|
||||
params.prompt = argv[i];
|
||||
} else if (arg == "-e" || arg == "--escape") {
|
||||
params.escape = true;
|
||||
} else if (arg == "-e") {
|
||||
escape_prompt = true;
|
||||
} else if (arg == "--prompt-cache") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
@@ -305,12 +295,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
break;
|
||||
}
|
||||
params.n_keep = std::stoi(argv[i]);
|
||||
} else if (arg == "--draft") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.n_draft = std::stoi(argv[i]);
|
||||
} else if (arg == "--chunks") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
@@ -323,12 +307,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
break;
|
||||
}
|
||||
params.model = argv[i];
|
||||
} else if (arg == "-md" || arg == "--model-draft") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.model_draft = argv[i];
|
||||
} else if (arg == "-a" || arg == "--alias") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
@@ -374,17 +352,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
#else
|
||||
fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
|
||||
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
|
||||
#endif
|
||||
} else if (arg == "--gpu-layers-draft" || arg == "-ngld" || arg == "--n-gpu-layers-draft") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
|
||||
params.n_gpu_layers_draft = std::stoi(argv[i]);
|
||||
#else
|
||||
fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n");
|
||||
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
|
||||
#endif
|
||||
} else if (arg == "--main-gpu" || arg == "-mg") {
|
||||
if (++i >= argc) {
|
||||
@@ -434,6 +401,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
#endif // GGML_USE_CUBLAS
|
||||
} else if (arg == "--no-mmap") {
|
||||
params.use_mmap = false;
|
||||
} else if (arg == "--mtest") {
|
||||
params.mem_test = true;
|
||||
} else if (arg == "--numa") {
|
||||
params.numa = true;
|
||||
} else if (arg == "--export") {
|
||||
@@ -446,16 +415,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
break;
|
||||
}
|
||||
params.antiprompt.push_back(argv[i]);
|
||||
} else if (arg == "-ld" || arg == "--logdir") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.logdir = argv[i];
|
||||
|
||||
if (params.logdir.back() != DIRECTORY_SEPARATOR) {
|
||||
params.logdir += DIRECTORY_SEPARATOR;
|
||||
}
|
||||
} else if (arg == "--perplexity") {
|
||||
params.perplexity = true;
|
||||
} else if (arg == "--ppl-stride") {
|
||||
@@ -503,9 +462,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
}
|
||||
} else if (arg == "-h" || arg == "--help") {
|
||||
gpt_print_usage(argc, argv, default_params);
|
||||
#ifndef LOG_DISABLE_LOGS
|
||||
log_print_usage();
|
||||
#endif // LOG_DISABLE_LOGS
|
||||
exit(0);
|
||||
} else if (arg == "--random-prompt") {
|
||||
params.random_prompt = true;
|
||||
@@ -545,25 +501,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
std::istreambuf_iterator<char>(),
|
||||
std::back_inserter(params.grammar)
|
||||
);
|
||||
#ifndef LOG_DISABLE_LOGS
|
||||
// Parse args for logging parameters
|
||||
} else if ( log_param_single_parse( argv[i] ) ) {
|
||||
// Do nothing, log_param_single_parse automatically does it's thing
|
||||
// and returns if a match was found and parsed.
|
||||
} else if ( log_param_pair_parse( /*check_but_dont_parse*/ true, argv[i] ) ) {
|
||||
// We have a matching known parameter requiring an argument,
|
||||
// now we need to check if there is anything after this argv
|
||||
// and flag invalid_param or parse it.
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
if( !log_param_pair_parse( /*check_but_dont_parse*/ false, argv[i-1], argv[i]) ) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
// End of Parse args for logging parameters
|
||||
#endif // LOG_DISABLE_LOGS
|
||||
} else {
|
||||
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
||||
gpt_print_usage(argc, argv, default_params);
|
||||
@@ -583,7 +520,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if (params.escape) {
|
||||
if (escape_prompt) {
|
||||
process_escapes(params.prompt);
|
||||
process_escapes(params.input_prefix);
|
||||
process_escapes(params.input_suffix);
|
||||
@@ -593,110 +530,104 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
}
|
||||
|
||||
void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||
printf("usage: %s [options]\n", argv[0]);
|
||||
printf("\n");
|
||||
printf("options:\n");
|
||||
printf(" -h, --help show this help message and exit\n");
|
||||
printf(" -i, --interactive run in interactive mode\n");
|
||||
printf(" --interactive-first run in interactive mode and wait for input right away\n");
|
||||
printf(" -ins, --instruct run in instruction mode (use with Alpaca models)\n");
|
||||
printf(" --multiline-input allows you to write or paste multiple lines without ending each in '\\'\n");
|
||||
printf(" -r PROMPT, --reverse-prompt PROMPT\n");
|
||||
printf(" halt generation at PROMPT, return control in interactive mode\n");
|
||||
printf(" (can be specified more than once for multiple prompts).\n");
|
||||
printf(" --color colorise output to distinguish prompt and user input from generations\n");
|
||||
printf(" -s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n");
|
||||
printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
|
||||
printf(" -p PROMPT, --prompt PROMPT\n");
|
||||
printf(" prompt to start generation with (default: empty)\n");
|
||||
printf(" -e, --escape process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n");
|
||||
printf(" --prompt-cache FNAME file to cache prompt state for faster startup (default: none)\n");
|
||||
printf(" --prompt-cache-all if specified, saves user input and generations to cache as well.\n");
|
||||
printf(" not supported with --interactive or other interactive options\n");
|
||||
printf(" --prompt-cache-ro if specified, uses the prompt cache but does not update it.\n");
|
||||
printf(" --random-prompt start with a randomized prompt.\n");
|
||||
printf(" --in-prefix-bos prefix BOS to user inputs, preceding the `--in-prefix` string\n");
|
||||
printf(" --in-prefix STRING string to prefix user inputs with (default: empty)\n");
|
||||
printf(" --in-suffix STRING string to suffix after user inputs with (default: empty)\n");
|
||||
printf(" -f FNAME, --file FNAME\n");
|
||||
printf(" prompt file to start generation.\n");
|
||||
printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
|
||||
printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
|
||||
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
|
||||
printf(" --top-k N top-k sampling (default: %d, 0 = disabled)\n", params.top_k);
|
||||
printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p);
|
||||
printf(" --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z);
|
||||
printf(" --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)params.typical_p);
|
||||
printf(" --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", params.repeat_last_n);
|
||||
printf(" --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)params.repeat_penalty);
|
||||
printf(" --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)params.presence_penalty);
|
||||
printf(" --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)params.frequency_penalty);
|
||||
printf(" --mirostat N use Mirostat sampling.\n");
|
||||
printf(" Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n");
|
||||
printf(" (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", params.mirostat);
|
||||
printf(" --mirostat-lr N Mirostat learning rate, parameter eta (default: %.1f)\n", (double)params.mirostat_eta);
|
||||
printf(" --mirostat-ent N Mirostat target entropy, parameter tau (default: %.1f)\n", (double)params.mirostat_tau);
|
||||
printf(" -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n");
|
||||
printf(" modifies the likelihood of token appearing in the completion,\n");
|
||||
printf(" i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n");
|
||||
printf(" or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n");
|
||||
printf(" --grammar GRAMMAR BNF-like grammar to constrain generations (see samples in grammars/ dir)\n");
|
||||
printf(" --grammar-file FNAME file to read grammar from\n");
|
||||
printf(" --cfg-negative-prompt PROMPT\n");
|
||||
printf(" negative prompt to use for guidance. (default: empty)\n");
|
||||
printf(" --cfg-negative-prompt-file FNAME\n");
|
||||
printf(" negative prompt file to use for guidance. (default: empty)\n");
|
||||
printf(" --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
|
||||
printf(" --rope-scale N RoPE context linear scaling factor, inverse of --rope-freq-scale\n");
|
||||
printf(" --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: loaded from model)\n");
|
||||
printf(" --rope-freq-scale N RoPE frequency linear scaling factor (default: loaded from model)\n");
|
||||
printf(" --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
|
||||
printf(" --no-penalize-nl do not penalize newline token\n");
|
||||
printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
|
||||
printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
|
||||
printf(" --temp N temperature (default: %.1f)\n", (double)params.temp);
|
||||
printf(" --perplexity compute perplexity over each ctx window of the prompt\n");
|
||||
printf(" --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n");
|
||||
printf(" --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
|
||||
printf(" --keep N number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
|
||||
printf(" --draft N number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft);
|
||||
printf(" --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
|
||||
fprintf(stdout, "usage: %s [options]\n", argv[0]);
|
||||
fprintf(stdout, "\n");
|
||||
fprintf(stdout, "options:\n");
|
||||
fprintf(stdout, " -h, --help show this help message and exit\n");
|
||||
fprintf(stdout, " -i, --interactive run in interactive mode\n");
|
||||
fprintf(stdout, " --interactive-first run in interactive mode and wait for input right away\n");
|
||||
fprintf(stdout, " -ins, --instruct run in instruction mode (use with Alpaca models)\n");
|
||||
fprintf(stdout, " --multiline-input allows you to write or paste multiple lines without ending each in '\\'\n");
|
||||
fprintf(stdout, " -r PROMPT, --reverse-prompt PROMPT\n");
|
||||
fprintf(stdout, " halt generation at PROMPT, return control in interactive mode\n");
|
||||
fprintf(stdout, " (can be specified more than once for multiple prompts).\n");
|
||||
fprintf(stdout, " --color colorise output to distinguish prompt and user input from generations\n");
|
||||
fprintf(stdout, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n");
|
||||
fprintf(stdout, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
|
||||
fprintf(stdout, " -p PROMPT, --prompt PROMPT\n");
|
||||
fprintf(stdout, " prompt to start generation with (default: empty)\n");
|
||||
fprintf(stdout, " -e process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n");
|
||||
fprintf(stdout, " --prompt-cache FNAME file to cache prompt state for faster startup (default: none)\n");
|
||||
fprintf(stdout, " --prompt-cache-all if specified, saves user input and generations to cache as well.\n");
|
||||
fprintf(stdout, " not supported with --interactive or other interactive options\n");
|
||||
fprintf(stdout, " --prompt-cache-ro if specified, uses the prompt cache but does not update it.\n");
|
||||
fprintf(stdout, " --random-prompt start with a randomized prompt.\n");
|
||||
fprintf(stdout, " --in-prefix-bos prefix BOS to user inputs, preceding the `--in-prefix` string\n");
|
||||
fprintf(stdout, " --in-prefix STRING string to prefix user inputs with (default: empty)\n");
|
||||
fprintf(stdout, " --in-suffix STRING string to suffix after user inputs with (default: empty)\n");
|
||||
fprintf(stdout, " -f FNAME, --file FNAME\n");
|
||||
fprintf(stdout, " prompt file to start generation.\n");
|
||||
fprintf(stdout, " -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
|
||||
fprintf(stdout, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
|
||||
fprintf(stdout, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
|
||||
fprintf(stdout, " --top-k N top-k sampling (default: %d, 0 = disabled)\n", params.top_k);
|
||||
fprintf(stdout, " --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p);
|
||||
fprintf(stdout, " --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z);
|
||||
fprintf(stdout, " --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)params.typical_p);
|
||||
fprintf(stdout, " --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", params.repeat_last_n);
|
||||
fprintf(stdout, " --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)params.repeat_penalty);
|
||||
fprintf(stdout, " --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)params.presence_penalty);
|
||||
fprintf(stdout, " --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)params.frequency_penalty);
|
||||
fprintf(stdout, " --mirostat N use Mirostat sampling.\n");
|
||||
fprintf(stdout, " Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n");
|
||||
fprintf(stdout, " (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", params.mirostat);
|
||||
fprintf(stdout, " --mirostat-lr N Mirostat learning rate, parameter eta (default: %.1f)\n", (double)params.mirostat_eta);
|
||||
fprintf(stdout, " --mirostat-ent N Mirostat target entropy, parameter tau (default: %.1f)\n", (double)params.mirostat_tau);
|
||||
fprintf(stdout, " -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n");
|
||||
fprintf(stdout, " modifies the likelihood of token appearing in the completion,\n");
|
||||
fprintf(stdout, " i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n");
|
||||
fprintf(stdout, " or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n");
|
||||
fprintf(stdout, " --grammar GRAMMAR BNF-like grammar to constrain generations (see samples in grammars/ dir)\n");
|
||||
fprintf(stdout, " --grammar-file FNAME file to read grammar from\n");
|
||||
fprintf(stdout, " --cfg-negative-prompt PROMPT\n");
|
||||
fprintf(stdout, " negative prompt to use for guidance. (default: empty)\n");
|
||||
fprintf(stdout, " --cfg-negative-prompt-file FNAME\n");
|
||||
fprintf(stdout, " negative prompt file to use for guidance. (default: empty)\n");
|
||||
fprintf(stdout, " --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
|
||||
fprintf(stdout, " --rope-scale N RoPE context linear scaling factor, inverse of --rope-freq-scale (default: %g)\n", 1.0f/params.rope_freq_scale);
|
||||
fprintf(stdout, " --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: %.1f)\n", params.rope_freq_base);
|
||||
fprintf(stdout, " --rope-freq-scale N RoPE frequency linear scaling factor, inverse of --rope-scale (default: %g)\n", params.rope_freq_scale);
|
||||
fprintf(stdout, " --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
|
||||
fprintf(stdout, " --no-penalize-nl do not penalize newline token\n");
|
||||
fprintf(stdout, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
|
||||
fprintf(stdout, " not recommended: doubles context memory required and no measurable increase in quality\n");
|
||||
fprintf(stdout, " --temp N temperature (default: %.1f)\n", (double)params.temp);
|
||||
fprintf(stdout, " --perplexity compute perplexity over each ctx window of the prompt\n");
|
||||
fprintf(stdout, " --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n");
|
||||
fprintf(stdout, " --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
|
||||
fprintf(stdout, " --keep N number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
|
||||
fprintf(stdout, " --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
|
||||
if (llama_mlock_supported()) {
|
||||
printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");
|
||||
fprintf(stdout, " --mlock force system to keep model in RAM rather than swapping or compressing\n");
|
||||
}
|
||||
if (llama_mmap_supported()) {
|
||||
printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
|
||||
fprintf(stdout, " --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
|
||||
}
|
||||
printf(" --numa attempt optimizations that help on some NUMA systems\n");
|
||||
printf(" if run without this previously, it is recommended to drop the system page cache before using this\n");
|
||||
printf(" see https://github.com/ggerganov/llama.cpp/issues/1437\n");
|
||||
fprintf(stdout, " --numa attempt optimizations that help on some NUMA systems\n");
|
||||
fprintf(stdout, " if run without this previously, it is recommended to drop the system page cache before using this\n");
|
||||
fprintf(stdout, " see https://github.com/ggerganov/llama.cpp/issues/1437\n");
|
||||
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
|
||||
printf(" -ngl N, --n-gpu-layers N\n");
|
||||
printf(" number of layers to store in VRAM\n");
|
||||
printf(" -ngld N, --n-gpu-layers-draft N\n");
|
||||
printf(" number of layers to store in VRAM for the draft model\n");
|
||||
printf(" -ts SPLIT --tensor-split SPLIT\n");
|
||||
printf(" how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
|
||||
printf(" -mg i, --main-gpu i the GPU to use for scratch and small tensors\n");
|
||||
printf(" -lv, --low-vram don't allocate VRAM scratch buffer\n");
|
||||
fprintf(stdout, " -ngl N, --n-gpu-layers N\n");
|
||||
fprintf(stdout, " number of layers to store in VRAM\n");
|
||||
fprintf(stdout, " -ts SPLIT --tensor-split SPLIT\n");
|
||||
fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
|
||||
fprintf(stdout, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n");
|
||||
fprintf(stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n");
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
printf(" -nommq, --no-mul-mat-q\n");
|
||||
printf(" use " GGML_CUBLAS_NAME " instead of custom mul_mat_q " GGML_CUDA_NAME " kernels.\n");
|
||||
printf(" Not recommended since this is both slower and uses more VRAM.\n");
|
||||
fprintf(stdout, " -nommq, --no-mul-mat-q\n");
|
||||
fprintf(stdout, " use " GGML_CUBLAS_NAME " instead of custom mul_mat_q " GGML_CUDA_NAME " kernels.\n");
|
||||
fprintf(stdout, " Not recommended since this is both slower and uses more VRAM.\n");
|
||||
#endif // GGML_USE_CUBLAS
|
||||
#endif
|
||||
printf(" --export export the computation graph to 'llama.ggml'\n");
|
||||
printf(" --verbose-prompt print prompt before generation\n");
|
||||
fprintf(stdout, " --mtest compute maximum memory usage\n");
|
||||
fprintf(stdout, " --export export the computation graph to 'llama.ggml'\n");
|
||||
fprintf(stdout, " --verbose-prompt print prompt before generation\n");
|
||||
fprintf(stderr, " --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n");
|
||||
printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
|
||||
printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
|
||||
printf(" -m FNAME, --model FNAME\n");
|
||||
printf(" model path (default: %s)\n", params.model.c_str());
|
||||
printf(" -md FNAME, --model-draft FNAME\n");
|
||||
printf(" draft model for speculative decoding (default: %s)\n", params.model.c_str());
|
||||
printf(" -ld LOGDIR, --logdir LOGDIR\n");
|
||||
printf(" path under which to save YAML logs (no logging if unset)\n");
|
||||
printf("\n");
|
||||
fprintf(stdout, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
|
||||
fprintf(stdout, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
|
||||
fprintf(stdout, " -m FNAME, --model FNAME\n");
|
||||
fprintf(stdout, " model path (default: %s)\n", params.model.c_str());
|
||||
fprintf(stdout, "\n");
|
||||
}
|
||||
|
||||
std::string gpt_random_prompt(std::mt19937 & rng) {
|
||||
@@ -727,9 +658,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
|
||||
|
||||
lparams.n_ctx = params.n_ctx;
|
||||
lparams.n_batch = params.n_batch;
|
||||
if (params.n_gpu_layers != -1) {
|
||||
lparams.n_gpu_layers = params.n_gpu_layers;
|
||||
}
|
||||
lparams.n_gpu_layers = params.n_gpu_layers;
|
||||
lparams.main_gpu = params.main_gpu;
|
||||
lparams.tensor_split = params.tensor_split;
|
||||
lparams.low_vram = params.low_vram;
|
||||
@@ -779,14 +708,6 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
|
||||
params.logit_bias[llama_token_eos(lctx)] = -INFINITY;
|
||||
}
|
||||
|
||||
{
|
||||
LOG("warming up the model with an empty run\n");
|
||||
|
||||
const std::vector<llama_token> tmp = { llama_token_bos(lctx), llama_token_eos(lctx), };
|
||||
llama_eval(lctx, tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, params.n_threads);
|
||||
llama_reset_timings(lctx);
|
||||
}
|
||||
|
||||
return std::make_tuple(model, lctx);
|
||||
}
|
||||
|
||||
@@ -801,10 +722,10 @@ std::vector<llama_token> llama_tokenize(
|
||||
// upper limit for the number of tokens
|
||||
int n_tokens = text.length() + add_bos;
|
||||
std::vector<llama_token> result(n_tokens);
|
||||
n_tokens = llama_tokenize(ctx, text.data(), text.length(), result.data(), result.size(), add_bos);
|
||||
n_tokens = llama_tokenize(ctx, text.c_str(), result.data(), result.size(), add_bos);
|
||||
if (n_tokens < 0) {
|
||||
result.resize(-n_tokens);
|
||||
int check = llama_tokenize(ctx, text.data(), text.length(), result.data(), result.size(), add_bos);
|
||||
int check = llama_tokenize(ctx, text.c_str(), result.data(), result.size(), add_bos);
|
||||
GGML_ASSERT(check == -n_tokens);
|
||||
} else {
|
||||
result.resize(n_tokens);
|
||||
@@ -858,413 +779,3 @@ std::string llama_detokenize_bpe(llama_context * ctx, const std::vector<llama_to
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
//
|
||||
// Sampling utils
|
||||
//
|
||||
|
||||
llama_token llama_sample_token(
|
||||
struct llama_context * ctx,
|
||||
struct llama_context * ctx_guidance,
|
||||
struct llama_grammar * grammar,
|
||||
const struct gpt_params & params,
|
||||
const std::vector<llama_token> & last_tokens,
|
||||
std::vector<llama_token_data> & candidates,
|
||||
int idx) {
|
||||
const int n_ctx = llama_n_ctx(ctx);
|
||||
const int n_vocab = llama_n_vocab(ctx);
|
||||
|
||||
const float temp = params.temp;
|
||||
const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k;
|
||||
const float top_p = params.top_p;
|
||||
const float tfs_z = params.tfs_z;
|
||||
const float typical_p = params.typical_p;
|
||||
const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
|
||||
const float repeat_penalty = params.repeat_penalty;
|
||||
const float alpha_presence = params.presence_penalty;
|
||||
const float alpha_frequency = params.frequency_penalty;
|
||||
const int mirostat = params.mirostat;
|
||||
const float mirostat_tau = params.mirostat_tau;
|
||||
const float mirostat_eta = params.mirostat_eta;
|
||||
const bool penalize_nl = params.penalize_nl;
|
||||
|
||||
llama_token id = 0;
|
||||
|
||||
float * logits = llama_get_logits(ctx) + idx * n_vocab;
|
||||
|
||||
// Apply params.logit_bias map
|
||||
for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
|
||||
logits[it->first] += it->second;
|
||||
}
|
||||
|
||||
candidates.clear();
|
||||
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
|
||||
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
|
||||
}
|
||||
|
||||
llama_token_data_array cur_p = { candidates.data(), candidates.size(), false };
|
||||
|
||||
if (ctx_guidance) {
|
||||
llama_sample_classifier_free_guidance(ctx, &cur_p, ctx_guidance, params.cfg_scale);
|
||||
}
|
||||
|
||||
// apply penalties
|
||||
if (!last_tokens.empty()) {
|
||||
const float nl_logit = logits[llama_token_nl(ctx)];
|
||||
const int last_n_repeat = std::min(std::min((int)last_tokens.size(), repeat_last_n), n_ctx);
|
||||
|
||||
llama_sample_repetition_penalty(ctx, &cur_p,
|
||||
last_tokens.data() + last_tokens.size() - last_n_repeat,
|
||||
last_n_repeat, repeat_penalty);
|
||||
llama_sample_frequency_and_presence_penalties(ctx, &cur_p,
|
||||
last_tokens.data() + last_tokens.size() - last_n_repeat,
|
||||
last_n_repeat, alpha_frequency, alpha_presence);
|
||||
|
||||
if (!penalize_nl) {
|
||||
for (size_t idx = 0; idx < cur_p.size; idx++) {
|
||||
if (cur_p.data[idx].id == llama_token_nl(ctx)) {
|
||||
cur_p.data[idx].logit = nl_logit;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (grammar != NULL) {
|
||||
llama_sample_grammar(ctx, &cur_p, grammar);
|
||||
}
|
||||
|
||||
if (temp <= 0) {
|
||||
// Greedy sampling
|
||||
id = llama_sample_token_greedy(ctx, &cur_p);
|
||||
} else {
|
||||
if (mirostat == 1) {
|
||||
static float mirostat_mu = 2.0f * mirostat_tau;
|
||||
const int mirostat_m = 100;
|
||||
llama_sample_temperature(ctx, &cur_p, temp);
|
||||
id = llama_sample_token_mirostat(ctx, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
|
||||
} else if (mirostat == 2) {
|
||||
static float mirostat_mu = 2.0f * mirostat_tau;
|
||||
llama_sample_temperature(ctx, &cur_p, temp);
|
||||
id = llama_sample_token_mirostat_v2(ctx, &cur_p, mirostat_tau, mirostat_eta, &mirostat_mu);
|
||||
} else {
|
||||
// Temperature sampling
|
||||
llama_sample_top_k (ctx, &cur_p, top_k, 1);
|
||||
llama_sample_tail_free (ctx, &cur_p, tfs_z, 1);
|
||||
llama_sample_typical (ctx, &cur_p, typical_p, 1);
|
||||
llama_sample_top_p (ctx, &cur_p, top_p, 1);
|
||||
llama_sample_temperature(ctx, &cur_p, temp);
|
||||
|
||||
{
|
||||
const int n_top = 10;
|
||||
LOG("top %d candidates:\n", n_top);
|
||||
|
||||
for (int i = 0; i < n_top; i++) {
|
||||
const llama_token id = cur_p.data[i].id;
|
||||
LOG(" - %5d: '%12s' (%.3f)\n", id, llama_token_to_piece(ctx, id).c_str(), cur_p.data[i].p);
|
||||
}
|
||||
}
|
||||
|
||||
id = llama_sample_token(ctx, &cur_p);
|
||||
|
||||
LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx, id).c_str());
|
||||
}
|
||||
}
|
||||
// printf("`%d`", candidates_p.size);
|
||||
|
||||
if (grammar != NULL) {
|
||||
llama_grammar_accept_token(ctx, grammar, id);
|
||||
}
|
||||
|
||||
return id;
|
||||
}
|
||||
|
||||
//
|
||||
// YAML utils
|
||||
//
|
||||
|
||||
// returns true if successful, false otherwise
|
||||
bool create_directory_with_parents(const std::string & path) {
|
||||
#ifdef _WIN32
|
||||
std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
|
||||
std::wstring wpath = converter.from_bytes(path);
|
||||
|
||||
// if the path already exists, check whether it's a directory
|
||||
const DWORD attributes = GetFileAttributesW(wpath.c_str());
|
||||
if ((attributes != INVALID_FILE_ATTRIBUTES) && (attributes & FILE_ATTRIBUTE_DIRECTORY)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
size_t pos_slash = 0;
|
||||
|
||||
// process path from front to back, procedurally creating directories
|
||||
while ((pos_slash = path.find('\\', pos_slash)) != std::string::npos) {
|
||||
const std::wstring subpath = wpath.substr(0, pos_slash);
|
||||
const wchar_t * test = subpath.c_str();
|
||||
|
||||
const bool success = CreateDirectoryW(test, NULL);
|
||||
if (!success) {
|
||||
const DWORD error = GetLastError();
|
||||
|
||||
// if the path already exists, ensure that it's a directory
|
||||
if (error == ERROR_ALREADY_EXISTS) {
|
||||
const DWORD attributes = GetFileAttributesW(subpath.c_str());
|
||||
if (attributes == INVALID_FILE_ATTRIBUTES || !(attributes & FILE_ATTRIBUTE_DIRECTORY)) {
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
pos_slash += 1;
|
||||
}
|
||||
|
||||
return true;
|
||||
#else
|
||||
// if the path already exists, check whether it's a directory
|
||||
struct stat info;
|
||||
if (stat(path.c_str(), &info) == 0) {
|
||||
return S_ISDIR(info.st_mode);
|
||||
}
|
||||
|
||||
size_t pos_slash = 1; // skip leading slashes for directory creation
|
||||
|
||||
// process path from front to back, procedurally creating directories
|
||||
while ((pos_slash = path.find('/', pos_slash)) != std::string::npos) {
|
||||
const std::string subpath = path.substr(0, pos_slash);
|
||||
struct stat info;
|
||||
|
||||
// if the path already exists, ensure that it's a directory
|
||||
if (stat(subpath.c_str(), &info) == 0) {
|
||||
if (!S_ISDIR(info.st_mode)) {
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
// create parent directories
|
||||
const int ret = mkdir(subpath.c_str(), 0755);
|
||||
if (ret != 0) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
pos_slash += 1;
|
||||
}
|
||||
|
||||
return true;
|
||||
#endif // _WIN32
|
||||
}
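// Example (illustrative; the path is a made-up placeholder): calling
// create_directory_with_parents("logs/2023_09_15/") walks the path component by
// component, creating "logs" and then "logs/2023_09_15" if they do not exist,
// and returns true on success or when the directories are already present.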
|
||||
|
||||
void dump_vector_float_yaml(FILE * stream, const char * prop_name, const std::vector<float> & data) {
|
||||
if (data.empty()) {
|
||||
fprintf(stream, "%s:\n", prop_name);
|
||||
return;
|
||||
}
|
||||
|
||||
fprintf(stream, "%s: [", prop_name);
|
||||
for (size_t i = 0; i < data.size() - 1; ++i) {
|
||||
fprintf(stream, "%e, ", data[i]);
|
||||
}
|
||||
fprintf(stream, "%e]\n", data.back());
|
||||
}
|
||||
|
||||
void dump_vector_int_yaml(FILE * stream, const char * prop_name, const std::vector<int> & data) {
|
||||
if (data.empty()) {
|
||||
fprintf(stream, "%s:\n", prop_name);
|
||||
return;
|
||||
}
|
||||
|
||||
fprintf(stream, "%s: [", prop_name);
|
||||
for (size_t i = 0; i < data.size() - 1; ++i) {
|
||||
fprintf(stream, "%d, ", data[i]);
|
||||
}
|
||||
fprintf(stream, "%d]\n", data.back());
|
||||
}
|
||||
|
||||
void dump_string_yaml_multiline(FILE * stream, const char * prop_name, const char * data) {
|
||||
std::string data_str(data == NULL ? "" : data);
|
||||
|
||||
if (data_str.empty()) {
|
||||
fprintf(stream, "%s:\n", prop_name);
|
||||
return;
|
||||
}
|
||||
|
||||
size_t pos_start = 0;
|
||||
size_t pos_found = 0;
|
||||
|
||||
if (!data_str.empty() && (std::isspace(data_str[0]) || std::isspace(data_str.back()))) {
|
||||
data_str = std::regex_replace(data_str, std::regex("\n"), "\\n");
|
||||
data_str = std::regex_replace(data_str, std::regex("\""), "\\\"");
|
||||
data_str = "\"" + data_str + "\"";
|
||||
fprintf(stream, "%s: %s\n", prop_name, data_str.c_str());
|
||||
return;
|
||||
}
|
||||
|
||||
if (data_str.find('\n') == std::string::npos) {
|
||||
fprintf(stream, "%s: %s\n", prop_name, data_str.c_str());
|
||||
return;
|
||||
}
|
||||
|
||||
fprintf(stream, "%s: |\n", prop_name);
|
||||
while ((pos_found = data_str.find('\n', pos_start)) != std::string::npos) {
|
||||
fprintf(stream, " %s\n", data_str.substr(pos_start, pos_found-pos_start).c_str());
|
||||
pos_start = pos_found + 1;
|
||||
}
|
||||
}
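// Example (illustrative; the values are made-up placeholders): a value without
// newlines or surrounding whitespace is emitted as a plain scalar, while a value
// with leading/trailing whitespace is escaped and quoted:
//
//   dump_string_yaml_multiline(stream, "prompt", "Once upon a time");   // -> prompt: Once upon a time
//   dump_string_yaml_multiline(stream, "prompt", "Once upon a time\n"); // -> prompt: "Once upon a time\n"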
|
||||
|
||||
std::string get_sortable_timestamp() {
|
||||
using clock = std::chrono::system_clock;
|
||||
|
||||
const clock::time_point current_time = clock::now();
|
||||
const time_t as_time_t = clock::to_time_t(current_time);
|
||||
char timestamp_no_ns[100];
|
||||
std::strftime(timestamp_no_ns, 100, "%Y_%m_%d-%H_%M_%S", std::localtime(&as_time_t));
|
||||
|
||||
const int64_t ns = std::chrono::duration_cast<std::chrono::nanoseconds>(
|
||||
current_time.time_since_epoch() % 1000000000).count();
|
||||
char timestamp_ns[11];
|
||||
snprintf(timestamp_ns, 11, "%09" PRId64, ns);
|
||||
|
||||
return std::string(timestamp_no_ns) + "." + std::string(timestamp_ns);
|
||||
}
|
||||
|
||||
void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const llama_context * lctx,
|
||||
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
|
||||
fprintf(stream, "build_commit: %s\n", BUILD_COMMIT);
|
||||
fprintf(stream, "build_number: %d\n", BUILD_NUMBER);
|
||||
fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_avx: %s\n", ggml_cpu_has_avx() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_avx2: %s\n", ggml_cpu_has_avx2() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_cublas: %s\n", ggml_cpu_has_cublas() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_gpublas: %s\n", ggml_cpu_has_gpublas() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_neon: %s\n", ggml_cpu_has_neon() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_f16c: %s\n", ggml_cpu_has_f16c() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_fp16_va: %s\n", ggml_cpu_has_fp16_va() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_wasm_simd: %s\n", ggml_cpu_has_wasm_simd() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_vsx: %s\n", ggml_cpu_has_vsx() ? "true" : "false");
|
||||
|
||||
#ifdef NDEBUG
|
||||
fprintf(stream, "debug: false\n");
|
||||
#else
|
||||
fprintf(stream, "debug: true\n");
|
||||
#endif // NDEBUG
|
||||
|
||||
fprintf(stream, "model_desc: %s\n", model_desc);
|
||||
fprintf(stream, "n_vocab: %d # output size of the final layer, 32001 for some models\n", llama_n_vocab(lctx));
|
||||
|
||||
#ifdef __OPTIMIZE__
|
||||
fprintf(stream, "optimize: true\n");
|
||||
#else
|
||||
fprintf(stream, "optimize: false\n");
|
||||
#endif // __OPTIMIZE__
|
||||
|
||||
fprintf(stream, "time: %s\n", timestamp.c_str());
|
||||
|
||||
fprintf(stream, "\n");
|
||||
fprintf(stream, "###############\n");
|
||||
fprintf(stream, "# User Inputs #\n");
|
||||
fprintf(stream, "###############\n");
|
||||
fprintf(stream, "\n");
|
||||
|
||||
fprintf(stream, "alias: %s # default: unknown\n", params.model_alias.c_str());
|
||||
fprintf(stream, "batch_size: %d # default: 512\n", params.n_batch);
|
||||
dump_string_yaml_multiline(stream, "cfg_negative_prompt", params.cfg_negative_prompt.c_str());
|
||||
fprintf(stream, "cfg_scale: %f # default: 1.0\n", params.cfg_scale);
|
||||
fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks);
|
||||
fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false");
|
||||
fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
|
||||
fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
|
||||
fprintf(stream, "export: %s # default: false\n", params.export_cgraph ? "true" : "false");
|
||||
fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
|
||||
fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", params.frequency_penalty);
|
||||
dump_string_yaml_multiline(stream, "grammar", params.grammar.c_str());
|
||||
fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n");
|
||||
fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false");
|
||||
fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks);
|
||||
|
||||
const auto logit_bias_eos = params.logit_bias.find(llama_token_eos(lctx));
|
||||
const bool ignore_eos = logit_bias_eos != params.logit_bias.end() && logit_bias_eos->second == -INFINITY;
|
||||
fprintf(stream, "ignore_eos: %s # default: false\n", ignore_eos ? "true" : "false");
|
||||
|
||||
dump_string_yaml_multiline(stream, "in_prefix", params.input_prefix.c_str());
|
||||
fprintf(stream, "in_prefix_bos: %s # default: false\n", params.input_prefix_bos ? "true" : "false");
|
||||
dump_string_yaml_multiline(stream, "in_suffix", params.input_prefix.c_str());
|
||||
fprintf(stream, "instruct: %s # default: false\n", params.instruct ? "true" : "false");
|
||||
fprintf(stream, "interactive: %s # default: false\n", params.interactive ? "true" : "false");
|
||||
fprintf(stream, "interactive_first: %s # default: false\n", params.interactive_first ? "true" : "false");
|
||||
fprintf(stream, "keep: %d # default: 0\n", params.n_keep);
|
||||
fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());
|
||||
|
||||
fprintf(stream, "logit_bias:\n");
|
||||
for (std::pair<llama_token, float> lb : params.logit_bias) {
|
||||
if (ignore_eos && lb.first == logit_bias_eos->first) {
|
||||
continue;
|
||||
}
|
||||
fprintf(stream, " %d: %f", lb.first, lb.second);
|
||||
}
|
||||
|
||||
fprintf(stream, "lora: %s\n", params.lora_adapter.c_str());
|
||||
fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
|
||||
fprintf(stream, "low_vram: %s # default: false\n", params.low_vram ? "true" : "false");
|
||||
fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
|
||||
fprintf(stream, "memory_f32: %s # default: false\n", !params.memory_f16 ? "true" : "false");
|
||||
fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", params.mirostat);
|
||||
fprintf(stream, "mirostat_ent: %f # default: 5.0\n", params.mirostat_tau);
|
||||
fprintf(stream, "mirostat_lr: %f # default: 0.1\n", params.mirostat_eta);
|
||||
fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false");
|
||||
fprintf(stream, "model: %s # default: models/7B/ggml-model.bin\n", params.model.c_str());
|
||||
fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
|
||||
fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");
|
||||
fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers);
|
||||
fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
|
||||
fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", params.n_probs);
|
||||
fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
|
||||
fprintf(stream, "no_mul_mat_q: %s # default: false\n", !params.mul_mat_q ? "true" : "false");
|
||||
fprintf(stream, "no_penalize_nl: %s # default: false\n", !params.penalize_nl ? "true" : "false");
|
||||
fprintf(stream, "numa: %s # default: false\n", params.numa ? "true" : "false");
|
||||
fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
|
||||
fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
|
||||
fprintf(stream, "presence_penalty: %f # default: 0.0\n", params.presence_penalty);
|
||||
dump_string_yaml_multiline(stream, "prompt", params.prompt.c_str());
|
||||
fprintf(stream, "prompt_cache: %s\n", params.path_prompt_cache.c_str());
|
||||
fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false");
|
||||
fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false");
|
||||
dump_vector_int_yaml(stream, "prompt_tokens", prompt_tokens);
|
||||
fprintf(stream, "random_prompt: %s # default: false\n", params.random_prompt ? "true" : "false");
|
||||
fprintf(stream, "repeat_penalty: %f # default: 1.1\n", params.repeat_penalty);
|
||||
|
||||
fprintf(stream, "reverse_prompt:\n");
|
||||
for (std::string ap : params.antiprompt) {
|
||||
size_t pos = 0;
|
||||
while ((pos = ap.find('\n', pos)) != std::string::npos) {
|
||||
ap.replace(pos, 1, "\\n");
|
||||
pos += 1;
|
||||
}
|
||||
|
||||
fprintf(stream, " - %s\n", ap.c_str());
|
||||
}
|
||||
|
||||
fprintf(stream, "rope_freq_base: %f # default: 10000.0\n", params.rope_freq_base);
|
||||
fprintf(stream, "rope_freq_scale: %f # default: 1.0\n", params.rope_freq_scale);
|
||||
fprintf(stream, "seed: %d # default: -1 (random seed)\n", params.seed);
|
||||
fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
|
||||
fprintf(stream, "temp: %f # default: 0.8\n", params.temp);
|
||||
|
||||
const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + LLAMA_MAX_DEVICES);
|
||||
dump_vector_float_yaml(stream, "tensor_split", tensor_split_vector);
|
||||
|
||||
fprintf(stream, "tfs: %f # default: 1.0\n", params.tfs_z);
|
||||
fprintf(stream, "threads: %d # default: %d\n", params.n_threads, std::thread::hardware_concurrency());
|
||||
fprintf(stream, "top_k: %d # default: 40\n", params.top_k);
|
||||
fprintf(stream, "top_p: %f # default: 0.95\n", params.top_p);
|
||||
fprintf(stream, "typical_p: %f # default: 1.0\n", params.typical_p);
|
||||
fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
|
||||
}
|
||||
|
||||
@@ -4,9 +4,6 @@
|
||||
|
||||
#include "llama.h"
|
||||
|
||||
#define LOG_NO_FILE_LINE_FUNCTION
|
||||
#include "log.h"
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <random>
|
||||
@@ -14,20 +11,6 @@
|
||||
#include <unordered_map>
|
||||
#include <tuple>
|
||||
|
||||
#ifdef _WIN32
|
||||
#define DIRECTORY_SEPARATOR '\\'
|
||||
#else
|
||||
#define DIRECTORY_SEPARATOR '/'
|
||||
#endif // _WIN32
|
||||
|
||||
#define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0)
|
||||
#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
|
||||
|
||||
#define print_build_info() do { \
|
||||
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); \
|
||||
fprintf(stderr, "%s: built with %s for %s\n", __func__, BUILD_COMPILER, BUILD_TARGET); \
|
||||
} while(0)
|
||||
|
||||
//
|
||||
// CLI argument parsing
|
||||
//
|
||||
@@ -40,10 +23,8 @@ struct gpt_params {
|
||||
int32_t n_ctx = 512; // context size
|
||||
int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
|
||||
int32_t n_keep = 0; // number of tokens to keep from initial prompt
|
||||
int32_t n_draft = 16; // number of tokens to draft during speculative decoding
|
||||
int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
|
||||
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
|
||||
int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
|
||||
int32_t n_gpu_layers = 0; // number of layers to store in VRAM
|
||||
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
|
||||
float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
|
||||
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
|
||||
@@ -73,7 +54,6 @@ struct gpt_params {
|
||||
float cfg_scale = 1.f; // How strong is guidance
|
||||
|
||||
std::string model = "models/7B/ggml-model-f16.gguf"; // model path
|
||||
std::string model_draft = ""; // draft model for speculative decoding
|
||||
std::string model_alias = "unknown"; // model alias
|
||||
std::string prompt = "";
|
||||
std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
|
||||
@@ -81,7 +61,6 @@ struct gpt_params {
|
||||
std::string input_suffix = ""; // string to suffix user inputs with
|
||||
std::string grammar = ""; // optional BNF-like grammar to constrain sampling
|
||||
std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
|
||||
std::string logdir = ""; // directory in which to save YAML log files
|
||||
|
||||
std::string lora_adapter = ""; // lora adapter path
|
||||
std::string lora_base = ""; // base model path for the lora adapter
|
||||
@@ -103,7 +82,6 @@ struct gpt_params {
|
||||
bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it
|
||||
|
||||
bool embedding = false; // get only sentence embedding
|
||||
bool escape = false; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
|
||||
bool interactive_first = false; // wait for user input immediately
|
||||
bool multiline_input = false; // reverse the usage of `\`
|
||||
bool simple_io = false; // improves compatibility with subprocesses and limited consoles
|
||||
@@ -115,6 +93,7 @@ struct gpt_params {
|
||||
bool perplexity = false; // compute perplexity over the prompt
|
||||
bool use_mmap = true; // use mmap for faster loads
|
||||
bool use_mlock = false; // use mlock to keep model in memory
|
||||
bool mem_test = false; // compute maximum memory usage
|
||||
bool numa = false; // attempt optimizations that help on some NUMA systems
|
||||
bool export_cgraph = false; // export the computation graph
|
||||
bool verbose_prompt = false; // print prompt tokens before generation
|
||||
@@ -165,47 +144,3 @@ std::string llama_detokenize_spm(
|
||||
std::string llama_detokenize_bpe(
|
||||
llama_context * ctx,
|
||||
const std::vector<llama_token> & tokens);
|
||||
|
||||
//
|
||||
// Sampling utils
|
||||
//
|
||||
|
||||
// this is a common sampling function used across the examples for convenience
|
||||
// it can serve as a starting point for implementing your own sampling function
|
||||
//
|
||||
// required:
|
||||
// - ctx: context to use for sampling
|
||||
// - params: sampling parameters
|
||||
//
|
||||
// optional:
|
||||
// - ctx_guidance: context to use for classifier-free guidance, ignore if NULL
|
||||
// - grammar: grammar to use for sampling, ignore if NULL
|
||||
// - last_tokens: needed for repetition penalty, ignore if empty
|
||||
// - idx: sample from llama_get_logits(ctx) + idx * n_vocab
|
||||
//
|
||||
// returns:
|
||||
// - token: sampled token
|
||||
// - candidates: vector of candidate tokens
|
||||
//
|
||||
llama_token llama_sample_token(
|
||||
struct llama_context * ctx,
|
||||
struct llama_context * ctx_guidance,
|
||||
struct llama_grammar * grammar,
|
||||
const struct gpt_params & params,
|
||||
const std::vector<llama_token> & last_tokens,
|
||||
std::vector<llama_token_data> & candidates,
|
||||
int idx = 0);
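// Example (illustrative sketch): how the helper above is typically driven from a
// generation loop. `ctx`, `params` and `last_tokens` are assumed to be set up by
// the caller, as in the existing examples.
//
//   std::vector<llama_token_data> candidates;
//   candidates.reserve(llama_n_vocab(ctx));
//
//   const llama_token id = llama_sample_token(ctx, /*ctx_guidance=*/nullptr, /*grammar=*/nullptr,
//                                             params, last_tokens, candidates);
//   last_tokens.push_back(id);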
|
||||
|
||||
//
|
||||
// YAML utils
|
||||
//
|
||||
|
||||
bool create_directory_with_parents(const std::string & path);
|
||||
void dump_vector_float_yaml(FILE * stream, const char * prop_name, const std::vector<float> & data);
|
||||
void dump_vector_int_yaml(FILE * stream, const char * prop_name, const std::vector<int> & data);
|
||||
void dump_string_yaml_multiline(FILE * stream, const char * prop_name, const char * data);
|
||||
std::string get_sortable_timestamp();
|
||||
|
||||
void dump_non_result_info_yaml(
|
||||
FILE * stream, const gpt_params & params, const llama_context * lctx,
|
||||
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
|
||||
|
||||
@@ -158,7 +158,7 @@ namespace console {
|
||||
}
|
||||
}
|
||||
|
||||
static char32_t getchar32() {
|
||||
char32_t getchar32() {
|
||||
#if defined(_WIN32)
|
||||
HANDLE hConsole = GetStdHandle(STD_INPUT_HANDLE);
|
||||
wchar_t high_surrogate = 0;
|
||||
@@ -212,7 +212,7 @@ namespace console {
|
||||
#endif
|
||||
}
|
||||
|
||||
static void pop_cursor() {
|
||||
void pop_cursor() {
|
||||
#if defined(_WIN32)
|
||||
if (hConsole != NULL) {
|
||||
CONSOLE_SCREEN_BUFFER_INFO bufferInfo;
|
||||
@@ -233,16 +233,15 @@ namespace console {
|
||||
putc('\b', out);
|
||||
}
|
||||
|
||||
static int estimateWidth(char32_t codepoint) {
|
||||
int estimateWidth(char32_t codepoint) {
|
||||
#if defined(_WIN32)
|
||||
(void)codepoint;
|
||||
return 1;
|
||||
#else
|
||||
return wcwidth(codepoint);
|
||||
#endif
|
||||
}
|
||||
|
||||
static int put_codepoint(const char* utf8_codepoint, size_t length, int expectedWidth) {
|
||||
int put_codepoint(const char* utf8_codepoint, size_t length, int expectedWidth) {
|
||||
#if defined(_WIN32)
|
||||
CONSOLE_SCREEN_BUFFER_INFO bufferInfo;
|
||||
if (!GetConsoleScreenBufferInfo(hConsole, &bufferInfo)) {
|
||||
@@ -303,7 +302,7 @@ namespace console {
|
||||
#endif
|
||||
}
|
||||
|
||||
static void replace_last(char ch) {
|
||||
void replace_last(char ch) {
|
||||
#if defined(_WIN32)
|
||||
pop_cursor();
|
||||
put_codepoint(&ch, 1, 1);
|
||||
@@ -312,7 +311,7 @@ namespace console {
|
||||
#endif
|
||||
}
|
||||
|
||||
static void append_utf8(char32_t ch, std::string & out) {
|
||||
void append_utf8(char32_t ch, std::string & out) {
|
||||
if (ch <= 0x7F) {
|
||||
out.push_back(static_cast<unsigned char>(ch));
|
||||
} else if (ch <= 0x7FF) {
|
||||
@@ -333,7 +332,7 @@ namespace console {
|
||||
}
|
||||
|
||||
// Helper function to remove the last UTF-8 character from a string
|
||||
static void pop_back_utf8_char(std::string & line) {
|
||||
void pop_back_utf8_char(std::string & line) {
|
||||
if (line.empty()) {
|
||||
return;
|
||||
}
|
||||
@@ -349,7 +348,7 @@ namespace console {
|
||||
line.erase(pos);
|
||||
}
|
||||
|
||||
static bool readline_advanced(std::string & line, bool multiline_input) {
|
||||
bool readline_advanced(std::string & line, bool multiline_input) {
|
||||
if (out != stdout) {
|
||||
fflush(stdout);
|
||||
}
|
||||
@@ -452,7 +451,7 @@ namespace console {
|
||||
return has_more;
|
||||
}
|
||||
|
||||
static bool readline_simple(std::string & line, bool multiline_input) {
|
||||
bool readline_simple(std::string & line, bool multiline_input) {
|
||||
#if defined(_WIN32)
|
||||
std::wstring wline;
|
||||
if (!std::getline(std::wcin, wline)) {
|
||||
|
||||
@@ -9,7 +9,7 @@
|
||||
namespace grammar_parser {
|
||||
// NOTE: assumes valid utf8 (but checks for overrun)
|
||||
// copied from llama.cpp
|
||||
static std::pair<uint32_t, const char *> decode_utf8(const char * src) {
|
||||
std::pair<uint32_t, const char *> decode_utf8(const char * src) {
|
||||
static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
|
||||
uint8_t first_byte = static_cast<uint8_t>(*src);
|
||||
uint8_t highbits = first_byte >> 4;
|
||||
@@ -24,19 +24,19 @@ namespace grammar_parser {
|
||||
return std::make_pair(value, pos);
|
||||
}
|
||||
|
||||
static uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) {
|
||||
uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) {
|
||||
uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
|
||||
auto result = state.symbol_ids.insert(std::make_pair(std::string(src, len), next_id));
|
||||
return result.first->second;
|
||||
}
|
||||
|
||||
static uint32_t generate_symbol_id(parse_state & state, const std::string & base_name) {
|
||||
uint32_t generate_symbol_id(parse_state & state, const std::string & base_name) {
|
||||
uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
|
||||
state.symbol_ids[base_name + '_' + std::to_string(next_id)] = next_id;
|
||||
return next_id;
|
||||
}
|
||||
|
||||
static void add_rule(
|
||||
void add_rule(
|
||||
parse_state & state,
|
||||
uint32_t rule_id,
|
||||
const std::vector<llama_grammar_element> & rule) {
|
||||
@@ -46,11 +46,11 @@ namespace grammar_parser {
|
||||
state.rules[rule_id] = rule;
|
||||
}
|
||||
|
||||
static bool is_word_char(char c) {
|
||||
bool is_word_char(char c) {
|
||||
return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || ('0' <= c && c <= '9');
|
||||
}
|
||||
|
||||
static std::pair<uint32_t, const char *> parse_hex(const char * src, int size) {
|
||||
std::pair<uint32_t, const char *> parse_hex(const char * src, int size) {
|
||||
const char * pos = src;
|
||||
const char * end = src + size;
|
||||
uint32_t value = 0;
|
||||
@@ -73,7 +73,7 @@ namespace grammar_parser {
|
||||
return std::make_pair(value, pos);
|
||||
}
|
||||
|
||||
static const char * parse_space(const char * src, bool newline_ok) {
|
||||
const char * parse_space(const char * src, bool newline_ok) {
|
||||
const char * pos = src;
|
||||
while (*pos == ' ' || *pos == '\t' || *pos == '#' ||
|
||||
(newline_ok && (*pos == '\r' || *pos == '\n'))) {
|
||||
@@ -88,7 +88,7 @@ namespace grammar_parser {
|
||||
return pos;
|
||||
}
|
||||
|
||||
static const char * parse_name(const char * src) {
|
||||
const char * parse_name(const char * src) {
|
||||
const char * pos = src;
|
||||
while (is_word_char(*pos)) {
|
||||
pos++;
|
||||
@@ -99,7 +99,7 @@ namespace grammar_parser {
|
||||
return pos;
|
||||
}
|
||||
|
||||
static std::pair<uint32_t, const char *> parse_char(const char * src) {
|
||||
std::pair<uint32_t, const char *> parse_char(const char * src) {
|
||||
if (*src == '\\') {
|
||||
switch (src[1]) {
|
||||
case 'x': return parse_hex(src + 2, 2);
|
||||
@@ -129,7 +129,7 @@ namespace grammar_parser {
|
||||
uint32_t rule_id,
|
||||
bool is_nested);
|
||||
|
||||
static const char * parse_sequence(
|
||||
const char * parse_sequence(
|
||||
parse_state & state,
|
||||
const char * src,
|
||||
const std::string & rule_name,
|
||||
@@ -247,7 +247,7 @@ namespace grammar_parser {
|
||||
return pos;
|
||||
}
|
||||
|
||||
static const char * parse_rule(parse_state & state, const char * src) {
|
||||
const char * parse_rule(parse_state & state, const char * src) {
|
||||
const char * name_end = parse_name(src);
|
||||
const char * pos = parse_space(name_end, false);
|
||||
size_t name_len = name_end - src;
|
||||
@@ -285,7 +285,7 @@ namespace grammar_parser {
|
||||
}
|
||||
}
|
||||
|
||||
static void print_grammar_char(FILE * file, uint32_t c) {
|
||||
void print_grammar_char(FILE * file, uint32_t c) {
|
||||
if (0x20 <= c && c <= 0x7f) {
|
||||
fprintf(file, "%c", static_cast<char>(c));
|
||||
} else {
|
||||
@@ -294,7 +294,7 @@ namespace grammar_parser {
|
||||
}
|
||||
}
|
||||
|
||||
static bool is_char_element(llama_grammar_element elem) {
|
||||
bool is_char_element(llama_grammar_element elem) {
|
||||
switch (elem.type) {
|
||||
case LLAMA_GRETYPE_CHAR: return true;
|
||||
case LLAMA_GRETYPE_CHAR_NOT: return true;
|
||||
@@ -304,7 +304,7 @@ namespace grammar_parser {
|
||||
}
|
||||
}
|
||||
|
||||
static void print_rule_binary(FILE * file, const std::vector<llama_grammar_element> & rule) {
|
||||
void print_rule_binary(FILE * file, const std::vector<llama_grammar_element> & rule) {
|
||||
for (auto elem : rule) {
|
||||
switch (elem.type) {
|
||||
case LLAMA_GRETYPE_END: fprintf(file, "END"); break;
|
||||
@@ -334,7 +334,7 @@ namespace grammar_parser {
|
||||
fprintf(file, "\n");
|
||||
}
|
||||
|
||||
static void print_rule(
|
||||
void print_rule(
|
||||
FILE * file,
|
||||
uint32_t rule_id,
|
||||
const std::vector<llama_grammar_element> & rule,
|
||||
@@ -415,7 +415,6 @@ namespace grammar_parser {
|
||||
|
||||
std::vector<const llama_grammar_element *> parse_state::c_rules() {
|
||||
std::vector<const llama_grammar_element *> ret;
|
||||
ret.reserve(rules.size());
|
||||
for (const auto & rule : rules) {
|
||||
ret.push_back(rule.data());
|
||||
}
|
||||
|
||||
common/log.h
@@ -1,643 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include <chrono>
|
||||
#include <cstring>
|
||||
#include <sstream>
|
||||
#include <iostream>
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
#include <cinttypes>
|
||||
|
||||
// --------------------------------
|
||||
//
|
||||
// Basic usage:
|
||||
//
|
||||
// --------
|
||||
//
|
||||
// The LOG() and LOG_TEE() macros are ready to go by default
|
||||
// they do not require any initialization.
|
||||
//
|
||||
// LOGLN() and LOG_TEELN() are variants which automatically
|
||||
// include \n character at the end of the log string.
|
||||
//
|
||||
// LOG() behaves exactly like printf, by default writing to a logfile.
|
||||
// LOG_TEE() additionally, prints to the screen too ( mimics Unix tee command ).
|
||||
//
|
||||
// Default logfile is named
|
||||
// "llama.<threadID>.log"
|
||||
// Default LOG_TEE() secondary output target is
|
||||
// stderr
|
||||
//
|
||||
// Logs can be dynamically disabled or enabled using functions:
|
||||
// log_disable()
|
||||
// and
|
||||
// log_enable()
|
||||
//
|
||||
// A log target can be changed with:
|
||||
// log_set_target( string )
|
||||
// creating and opening, or re-opening a file by string filename
|
||||
// or
|
||||
// log_set_target( FILE* )
|
||||
// allowing to point at stderr, stdout, or any valid FILE* file handler.
|
||||
//
|
||||
// --------
|
||||
//
|
||||
// End of Basic usage.
|
||||
//
|
||||
// --------------------------------
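// Example (illustrative; `n_tokens` and `model_path` are placeholder variables
// assumed to exist in the caller), using printf-style arguments:
//
//   LOG("processed %d tokens\n", n_tokens);       // written to "llama.<threadID>.log"
//   LOG_TEE("loading model '%s'\n", model_path);  // written to the log file and to stderr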
|
||||
|
||||
// Specifies a log target.
|
||||
// default uses log_handler() with "llama.log" log file
|
||||
// this can be changed, by defining LOG_TARGET
|
||||
// like so:
|
||||
//
|
||||
// #define LOG_TARGET (a valid FILE*)
|
||||
// #include "log.h"
|
||||
//
|
||||
// or it can be simply redirected to stdout or stderr
|
||||
// like so:
|
||||
//
|
||||
// #define LOG_TARGET stderr
|
||||
// #include "log.h"
|
||||
//
|
||||
// The log target can also be redirected to a different function
|
||||
// like so:
|
||||
//
|
||||
// #define LOG_TARGET log_handler_diffrent()
|
||||
// #include "log.h"
|
||||
//
|
||||
// FILE* log_handler_diffrent()
|
||||
// {
|
||||
// return stderr;
|
||||
// }
|
||||
//
|
||||
// or:
|
||||
//
|
||||
// #define LOG_TARGET log_handler_another_one("somelog.log")
|
||||
// #include "log.h"
|
||||
//
|
||||
// FILE* log_handler_another_one(char*filename)
|
||||
// {
|
||||
// static FILE* logfile = nullptr;
|
||||
// (...)
|
||||
// if( !logfile )
|
||||
// {
|
||||
// fopen(...)
|
||||
// }
|
||||
// (...)
|
||||
// return logfile
|
||||
// }
|
||||
//
|
||||
#ifndef LOG_TARGET
|
||||
#define LOG_TARGET log_handler()
|
||||
#endif
|
||||
|
||||
#ifndef LOG_TEE_TARGET
|
||||
#define LOG_TEE_TARGET stderr
|
||||
#endif
|
||||
|
||||
// Utility to obtain "pid" like unique process id and use it when creating log files.
|
||||
inline std::string log_get_pid()
|
||||
{
|
||||
static std::string pid;
|
||||
if (pid.empty())
|
||||
{
|
||||
// std::this_thread::get_id() is the most portable way of obtaining a "process id"
|
||||
// it's not the same as "pid" but is unique enough to solve multiple instances
|
||||
// trying to write to the same log.
|
||||
std::stringstream ss;
|
||||
ss << std::this_thread::get_id();
|
||||
pid = ss.str();
|
||||
}
|
||||
|
||||
return pid;
|
||||
}
|
||||
|
||||
// Utility function for generating log file names with unique id based on thread id.
|
||||
// invocation with log_filename_generator( "llama", "log" ) creates a string "llama.<number>.log"
|
||||
// where the number is a runtime id of the current thread.
|
||||
|
||||
#define log_filename_generator(log_file_basename, log_file_extension) log_filename_generator_impl(log_file_basename, log_file_extension)
|
||||
|
||||
// INTERNAL, DO NOT USE
|
||||
inline std::string log_filename_generator_impl(const std::string & log_file_basename, const std::string & log_file_extension)
|
||||
{
|
||||
std::stringstream buf;
|
||||
|
||||
buf << log_file_basename;
|
||||
buf << ".";
|
||||
buf << log_get_pid();
|
||||
buf << ".";
|
||||
buf << log_file_extension;
|
||||
|
||||
return buf.str();
|
||||
}
|
||||
|
||||
#ifndef LOG_DEFAULT_FILE_NAME
|
||||
#define LOG_DEFAULT_FILE_NAME log_filename_generator("llama", "log")
|
||||
#endif
|
||||
|
||||
// Utility for turning #define values into string literals
|
||||
// so we can have a define for stderr and
|
||||
// we can print "stderr" instead of literal stderr, etc.
|
||||
#define LOG_STRINGIZE1(s) #s
|
||||
#define LOG_STRINGIZE(s) LOG_STRINGIZE1(s)
|
||||
|
||||
#define LOG_TEE_TARGET_STRING LOG_STRINGIZE(LOG_TEE_TARGET)
|
||||
|
||||
// Allows disabling timestamps.
|
||||
// in order to disable, define LOG_NO_TIMESTAMPS
|
||||
// like so:
|
||||
//
|
||||
// #define LOG_NO_TIMESTAMPS
|
||||
// #include "log.h"
|
||||
//
|
||||
#ifndef LOG_NO_TIMESTAMPS
|
||||
#ifndef _MSC_VER
|
||||
#define LOG_TIMESTAMP_FMT "[%" PRIu64 "] "
|
||||
#define LOG_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
|
||||
#else
|
||||
#define LOG_TIMESTAMP_FMT "[%" PRIu64 "] "
|
||||
#define LOG_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
|
||||
#endif
|
||||
#else
|
||||
#define LOG_TIMESTAMP_FMT "%s"
|
||||
#define LOG_TIMESTAMP_VAL ,""
|
||||
#endif
|
||||
|
||||
#ifdef LOG_TEE_TIMESTAMPS
|
||||
#ifndef _MSC_VER
|
||||
#define LOG_TEE_TIMESTAMP_FMT "[%" PRIu64 "] "
|
||||
#define LOG_TEE_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
|
||||
#else
|
||||
#define LOG_TEE_TIMESTAMP_FMT "[%" PRIu64 "] "
|
||||
#define LOG_TEE_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
|
||||
#endif
|
||||
#else
|
||||
#define LOG_TEE_TIMESTAMP_FMT "%s"
|
||||
#define LOG_TEE_TIMESTAMP_VAL ,""
|
||||
#endif
|
||||
|
||||
// Allows disabling file/line/function prefix
|
||||
// in order to disable, define LOG_NO_FILE_LINE_FUNCTION
|
||||
// like so:
|
||||
//
|
||||
// #define LOG_NO_FILE_LINE_FUNCTION
|
||||
// #include "log.h"
|
||||
//
|
||||
#ifndef LOG_NO_FILE_LINE_FUNCTION
|
||||
#ifndef _MSC_VER
|
||||
#define LOG_FLF_FMT "[%24s:%5d][%24s] "
|
||||
#define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
|
||||
#else
|
||||
#define LOG_FLF_FMT "[%24s:%5ld][%24s] "
|
||||
#define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
|
||||
#endif
|
||||
#else
|
||||
#define LOG_FLF_FMT "%s"
|
||||
#define LOG_FLF_VAL ,""
|
||||
#endif
|
||||
|
||||
#ifdef LOG_TEE_FILE_LINE_FUNCTION
|
||||
#ifndef _MSC_VER
|
||||
#define LOG_TEE_FLF_FMT "[%24s:%5d][%24s] "
|
||||
#define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
|
||||
#else
|
||||
#define LOG_TEE_FLF_FMT "[%24s:%5ld][%24s] "
|
||||
#define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
|
||||
#endif
|
||||
#else
|
||||
#define LOG_TEE_FLF_FMT "%s"
|
||||
#define LOG_TEE_FLF_VAL ,""
|
||||
#endif
|
||||
|
||||
// Utility for synchronizing log configuration state
|
||||
// since std::optional was introduced only in c++17
|
||||
enum LogTriState
|
||||
{
|
||||
LogTriStateSame,
|
||||
LogTriStateFalse,
|
||||
LogTriStateTrue
|
||||
};
|
||||
|
||||
// INTERNAL, DO NOT USE
|
||||
// USE LOG() INSTEAD
|
||||
//
|
||||
#ifndef _MSC_VER
|
||||
#define LOG_IMPL(str, ...) \
|
||||
{ \
|
||||
if (LOG_TARGET != nullptr) \
|
||||
{ \
|
||||
fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL, __VA_ARGS__); \
|
||||
fflush(LOG_TARGET); \
|
||||
} \
|
||||
}
|
||||
#else
|
||||
#define LOG_IMPL(str, ...) \
|
||||
{ \
|
||||
if (LOG_TARGET != nullptr) \
|
||||
{ \
|
||||
fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL "", ##__VA_ARGS__); \
|
||||
fflush(LOG_TARGET); \
|
||||
} \
|
||||
}
|
||||
#endif
|
||||
|
||||
// INTERNAL, DO NOT USE
|
||||
// USE LOG_TEE() INSTEAD
|
||||
//
|
||||
#ifndef _MSC_VER
|
||||
#define LOG_TEE_IMPL(str, ...) \
|
||||
{ \
|
||||
if (LOG_TARGET != nullptr) \
|
||||
{ \
|
||||
fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL, __VA_ARGS__); \
|
||||
fflush(LOG_TARGET); \
|
||||
} \
|
||||
if (LOG_TARGET != nullptr && LOG_TARGET != stdout && LOG_TARGET != stderr && LOG_TEE_TARGET != nullptr) \
|
||||
{ \
|
||||
fprintf(LOG_TEE_TARGET, LOG_TEE_TIMESTAMP_FMT LOG_TEE_FLF_FMT str "%s" LOG_TEE_TIMESTAMP_VAL LOG_TEE_FLF_VAL, __VA_ARGS__); \
|
||||
fflush(LOG_TEE_TARGET); \
|
||||
} \
|
||||
}
|
||||
#else
|
||||
#define LOG_TEE_IMPL(str, ...) \
|
||||
{ \
|
||||
if (LOG_TARGET != nullptr) \
|
||||
{ \
|
||||
fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL "", ##__VA_ARGS__); \
|
||||
fflush(LOG_TARGET); \
|
||||
} \
|
||||
if (LOG_TARGET != nullptr && LOG_TARGET != stdout && LOG_TARGET != stderr && LOG_TEE_TARGET != nullptr) \
|
||||
{ \
|
||||
fprintf(LOG_TEE_TARGET, LOG_TEE_TIMESTAMP_FMT LOG_TEE_FLF_FMT str "%s" LOG_TEE_TIMESTAMP_VAL LOG_TEE_FLF_VAL "", ##__VA_ARGS__); \
|
||||
fflush(LOG_TEE_TARGET); \
|
||||
} \
|
||||
}
|
||||
#endif
|
||||
|
||||
// The '\0' as a last argument, is a trick to bypass the silly
|
||||
// "warning: ISO C++11 requires at least one argument for the "..." in a variadic macro"
|
||||
// so we can have a single macro which can be called just like printf.
|
||||
|
||||
// Main LOG macro.
|
||||
// behaves like printf, and supports arguments the exact same way.
|
||||
//
|
||||
#ifndef _MSC_VER
|
||||
#define LOG(...) LOG_IMPL(__VA_ARGS__, "")
|
||||
#else
|
||||
#define LOG(str, ...) LOG_IMPL("%s" str, "", __VA_ARGS__, "")
|
||||
#endif
|
||||
|
||||
// Main TEE macro.
|
||||
// does the same as LOG
|
||||
// and
|
||||
// simultaneously writes stderr.
|
||||
//
|
||||
// Secondary target can be changed just like LOG_TARGET
|
||||
// by defining LOG_TEE_TARGET
|
||||
//
|
||||
#ifndef _MSC_VER
|
||||
#define LOG_TEE(...) LOG_TEE_IMPL(__VA_ARGS__, "")
|
||||
#else
|
||||
#define LOG_TEE(str, ...) LOG_TEE_IMPL("%s" str, "", __VA_ARGS__, "")
|
||||
#endif
|
||||
|
||||
// LOG macro variants with auto endline.
|
||||
#ifndef _MSC_VER
|
||||
#define LOGLN(...) LOG_IMPL(__VA_ARGS__, "\n")
|
||||
#define LOG_TEELN(...) LOG_TEE_IMPL(__VA_ARGS__, "\n")
|
||||
#else
|
||||
#define LOGLN(str, ...) LOG_IMPL("%s" str, "", __VA_ARGS__, "\n")
|
||||
#define LOG_TEELN(str, ...) LOG_TEE_IMPL("%s" str, "", __VA_ARGS__, "\n")
|
||||
#endif
|
||||
|
||||
// INTERNAL, DO NOT USE
|
||||
inline FILE *log_handler1_impl(bool change = false, LogTriState disable = LogTriStateSame, const std::string & filename = LOG_DEFAULT_FILE_NAME, FILE *target = nullptr)
|
||||
{
|
||||
static bool _initialized{false};
|
||||
static bool _disabled{(filename.empty() && target == nullptr)};
|
||||
static std::string log_current_filename{filename};
|
||||
static FILE *log_current_target{target};
|
||||
static FILE *logfile = nullptr;
|
||||
|
||||
if (change)
|
||||
{
|
||||
if (disable == LogTriStateTrue)
|
||||
{
|
||||
// Disable primary target
|
||||
_disabled = true;
|
||||
}
|
||||
// If previously disabled, only enable, and keep previous target
|
||||
else if (disable == LogTriStateFalse)
|
||||
{
|
||||
_disabled = false;
|
||||
}
|
||||
// Otherwise, process the arguments
|
||||
else if (log_current_filename != filename || log_current_target != target)
|
||||
{
|
||||
_initialized = false;
|
||||
}
|
||||
}
|
||||
|
||||
if (_disabled)
|
||||
{
|
||||
// Log is disabled
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
if (_initialized)
|
||||
{
|
||||
// with fallback in case something went wrong
|
||||
return logfile ? logfile : stderr;
|
||||
}
|
||||
|
||||
// do the (re)initialization
|
||||
if (target != nullptr)
|
||||
{
|
||||
if (logfile != nullptr && logfile != stdout && logfile != stderr)
|
||||
{
|
||||
fclose(logfile);
|
||||
}
|
||||
|
||||
log_current_filename = LOG_DEFAULT_FILE_NAME;
|
||||
log_current_target = target;
|
||||
|
||||
logfile = target;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (log_current_filename != filename)
|
||||
{
|
||||
if (logfile != nullptr && logfile != stdout && logfile != stderr)
|
||||
{
|
||||
fclose(logfile);
|
||||
}
|
||||
}
|
||||
|
||||
logfile = fopen(filename.c_str(), "w");
|
||||
}
|
||||
|
||||
if (!logfile)
|
||||
{
|
||||
// Verify whether the file was opened, otherwise fallback to stderr
|
||||
logfile = stderr;
|
||||
|
||||
fprintf(stderr, "Failed to open logfile '%s' with error '%s'\n", filename.c_str(), std::strerror(errno));
|
||||
fflush(stderr);
|
||||
|
||||
// At this point we let the init flag be set to true below, and let the target fall back to stderr
// otherwise we would repeatedly call fopen() even though it already failed
|
||||
}
|
||||
|
||||
_initialized = true;
|
||||
|
||||
return logfile ? logfile : stderr;
|
||||
}
|
||||
|
||||
// INTERNAL, DO NOT USE
|
||||
inline FILE *log_handler2_impl(bool change = false, LogTriState disable = LogTriStateSame, FILE *target = nullptr, const std::string & filename = LOG_DEFAULT_FILE_NAME)
|
||||
{
|
||||
return log_handler1_impl(change, disable, filename, target);
|
||||
}
|
||||
|
||||
// Disables logs entirely at runtime.
|
||||
// Makes LOG() and LOG_TEE() produce no output,
|
||||
// until enabled again.
|
||||
#define log_disable() log_disable_impl()
|
||||
|
||||
// INTERNAL, DO NOT USE
|
||||
inline FILE *log_disable_impl()
|
||||
{
|
||||
return log_handler1_impl(true, LogTriStateTrue);
|
||||
}
|
||||
|
||||
// Enables logs at runtime.
|
||||
#define log_enable() log_enable_impl()
|
||||
|
||||
// INTERNAL, DO NOT USE
|
||||
inline FILE *log_enable_impl()
|
||||
{
|
||||
return log_handler1_impl(true, LogTriStateFalse);
|
||||
}
|
||||
|
||||
// Sets the target for logs, either by a file name or FILE* pointer (stdout, stderr, or any valid FILE*)
|
||||
#define log_set_target(target) log_set_target_impl(target)
|
||||
|
||||
// INTERNAL, DO NOT USE
|
||||
inline FILE *log_set_target_impl(const std::string & filename) { return log_handler1_impl(true, LogTriStateSame, filename); }
|
||||
inline FILE *log_set_target_impl(FILE *target) { return log_handler2_impl(true, LogTriStateSame, target); }
|
||||
|
||||
// INTERNAL, DO NOT USE
|
||||
inline FILE *log_handler() { return log_handler1_impl(); }
|
||||
|
||||
inline void log_test()
|
||||
{
|
||||
log_disable();
|
||||
LOG("01 Hello World to nobody, because logs are disabled!\n")
|
||||
log_enable();
|
||||
LOG("02 Hello World to default output, which is \"%s\" ( Yaaay, arguments! )!\n", LOG_STRINGIZE(LOG_TARGET))
|
||||
LOG_TEE("03 Hello World to **both** default output and " LOG_TEE_TARGET_STRING "!\n")
|
||||
log_set_target(stderr);
|
||||
LOG("04 Hello World to stderr!\n")
|
||||
LOG_TEE("05 Hello World TEE with double printing to stderr prevented!\n")
|
||||
log_set_target(LOG_DEFAULT_FILE_NAME);
|
||||
LOG("06 Hello World to default log file!\n")
|
||||
log_set_target(stdout);
|
||||
LOG("07 Hello World to stdout!\n")
|
||||
log_set_target(LOG_DEFAULT_FILE_NAME);
|
||||
LOG("08 Hello World to default log file again!\n")
|
||||
log_disable();
|
||||
LOG("09 Hello World _1_ into the void!\n")
|
||||
log_enable();
|
||||
LOG("10 Hello World back from the void ( you should not see _1_ in the log or the output )!\n")
|
||||
log_disable();
|
||||
log_set_target("llama.anotherlog.log");
|
||||
LOG("11 Hello World _2_ to nobody, new target was selected but logs are still disabled!\n")
|
||||
log_enable();
|
||||
LOG("12 Hello World this time in a new file ( you should not see _2_ in the log or the output )?\n")
|
||||
log_set_target("llama.yetanotherlog.log");
|
||||
LOG("13 Hello World this time in yet new file?\n")
|
||||
log_set_target(log_filename_generator("llama_autonamed", "log"));
|
||||
LOG("14 Hello World in log with generated filename!\n")
|
||||
#ifdef _MSC_VER
|
||||
LOG_TEE("15 Hello msvc TEE without arguments\n")
|
||||
LOG_TEE("16 Hello msvc TEE with (%d)(%s) arguments\n", 1, "test")
|
||||
LOG_TEELN("17 Hello msvc TEELN without arguments\n")
|
||||
LOG_TEELN("18 Hello msvc TEELN with (%d)(%s) arguments\n", 1, "test")
|
||||
LOG("19 Hello msvc LOG without arguments\n")
|
||||
LOG("20 Hello msvc LOG with (%d)(%s) arguments\n", 1, "test")
|
||||
LOGLN("21 Hello msvc LOGLN without arguments\n")
|
||||
LOGLN("22 Hello msvc LOGLN with (%d)(%s) arguments\n", 1, "test")
|
||||
#endif
|
||||
}
|
||||
|
||||
inline bool log_param_single_parse(const std::string & param)
|
||||
{
|
||||
if ( param == "--log-test")
|
||||
{
|
||||
log_test();
|
||||
return true;
|
||||
}
|
||||
|
||||
if ( param == "--log-disable")
|
||||
{
|
||||
log_disable();
|
||||
return true;
|
||||
}
|
||||
|
||||
if ( param == "--log-enable")
|
||||
{
|
||||
log_enable();
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
inline bool log_param_pair_parse(bool check_but_dont_parse, const std::string & param, const std::string & next = std::string())
|
||||
{
|
||||
if ( param == "--log-file")
|
||||
{
|
||||
if (!check_but_dont_parse)
|
||||
{
|
||||
log_set_target(log_filename_generator(next.empty() ? "unnamed" : next, "log"));
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
inline void log_print_usage()
|
||||
{
|
||||
printf("log options:\n");
|
||||
/* format
|
||||
printf(" -h, --help show this help message and exit\n");*/
|
||||
/* spacing
|
||||
printf("__-param----------------Description\n");*/
|
||||
printf(" --log-test Run simple logging test\n");
|
||||
printf(" --log-disable Disable trace logs\n");
|
||||
printf(" --log-enable Enable trace logs\n");
|
||||
printf(" --log-file Specify a log filename (without extension)\n");
|
||||
printf(" Log file will be tagged with unique ID and written as \"<name>.<ID>.log\"\n"); /* */
|
||||
}
|
||||
|
||||
#define log_dump_cmdline(argc, argv) log_dump_cmdline_impl(argc, argv)
|
||||
|
||||
// INTERNAL, DO NOT USE
|
||||
inline void log_dump_cmdline_impl(int argc, char **argv)
|
||||
{
|
||||
std::stringstream buf;
|
||||
for (int i = 0; i < argc; ++i)
|
||||
{
|
||||
if (std::string(argv[i]).find(' ') != std::string::npos)
|
||||
{
|
||||
buf << " \"" << argv[i] <<"\"";
|
||||
}
|
||||
else
|
||||
{
|
||||
buf << " " << argv[i];
|
||||
}
|
||||
}
|
||||
LOGLN("Cmd:%s", buf.str().c_str())
|
||||
}
|
||||
|
||||
#define log_tostr(var) log_var_to_string_impl(var).c_str()
|
||||
|
||||
inline std::string log_var_to_string_impl(bool var)
|
||||
{
|
||||
return var ? "true" : "false";
|
||||
}
|
||||
|
||||
inline std::string log_var_to_string_impl(std::string var)
|
||||
{
|
||||
return var;
|
||||
}
|
||||
|
||||
inline std::string log_var_to_string_impl(const std::vector<int> & var)
|
||||
{
|
||||
std::stringstream buf;
|
||||
buf << "[ ";
|
||||
bool first = true;
|
||||
for (auto e : var)
|
||||
{
|
||||
if (first)
|
||||
{
|
||||
first = false;
|
||||
}
|
||||
else
|
||||
{
|
||||
buf << ", ";
|
||||
}
|
||||
buf << std::to_string(e);
|
||||
}
|
||||
buf << " ]";
|
||||
|
||||
return buf.str();
|
||||
}
|
||||
|
||||
#define LOG_TOKENS_TOSTR_PRETTY(ctx, tokens) \
|
||||
[&tokens, &ctx]() \
|
||||
{ \
|
||||
std::stringstream buf; \
|
||||
buf << "[ "; \
|
||||
\
|
||||
bool first = true; \
|
||||
for (const auto &token : tokens) \
|
||||
{ \
|
||||
if (!first) \
|
||||
buf << ", "; \
|
||||
else \
|
||||
first = false; \
|
||||
\
|
||||
auto detokenized = llama_token_to_piece(ctx, token); \
|
||||
\
|
||||
detokenized.erase( \
|
||||
std::remove_if( \
|
||||
detokenized.begin(), \
|
||||
detokenized.end(), \
|
||||
[](const unsigned char c) { return !std::isprint(c); }), \
|
||||
detokenized.end()); \
|
||||
\
|
||||
buf \
|
||||
<< "'" << detokenized << "'" \
|
||||
<< ":" << std::to_string(token); \
|
||||
} \
|
||||
buf << " ]"; \
|
||||
\
|
||||
return buf.str(); \
|
||||
}() \
|
||||
.c_str()
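// Example (illustrative; `embd_inp` is assumed to be a std::vector<llama_token>
// available in the caller): logging a token list together with its detokenized pieces:
//
//   LOG("prompt tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp));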
|
||||
|
||||
#ifdef LOG_DISABLE_LOGS
|
||||
|
||||
#undef LOG
|
||||
#define LOG(...) // dummy stub
|
||||
#undef LOGLN
|
||||
#define LOGLN(...) // dummy stub
|
||||
|
||||
#undef LOG_TEE
|
||||
#define LOG_TEE(...) fprintf(stderr, __VA_ARGS__); // convert to normal fprintf
|
||||
|
||||
#undef LOG_TEELN
|
||||
#define LOG_TEELN(...) fprintf(stderr, __VA_ARGS__); // convert to normal fprintf
|
||||
|
||||
#undef LOG_DISABLE
|
||||
#define LOG_DISABLE() // dummy stub
|
||||
|
||||
#undef LOG_ENABLE
|
||||
#define LOG_ENABLE() // dummy stub
|
||||
|
||||
|
||||
#undef LOG_SET_TARGET
|
||||
#define LOG_SET_TARGET(...) // dummy stub
|
||||
|
||||
#undef LOG_DUMP_CMDLINE
|
||||
#define LOG_DUMP_CMDLINE(...) // dummy stub
|
||||
|
||||
#endif // LOG_DISABLE_LOGS
|
||||
@@ -1,304 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
# HF baichuan --> gguf conversion
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import struct
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Any
|
||||
import itertools
|
||||
import gguf
|
||||
import numpy as np
|
||||
import torch
|
||||
from sentencepiece import SentencePieceProcessor # type: ignore[import]
|
||||
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from typing import TypeAlias
|
||||
|
||||
NDArray: TypeAlias = 'np.ndarray[Any, Any]'
|
||||
|
||||
# reverse HF permute back to original pth layout
|
||||
|
||||
|
||||
def reverse_hf_permute(weights: NDArray, n_head: int, n_kv_head: int | None = None) -> NDArray:
|
||||
if n_kv_head is not None and n_head != n_kv_head:
|
||||
n_head //= n_kv_head
|
||||
|
||||
return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
|
||||
.swapaxes(1, 2)
|
||||
.reshape(weights.shape))
|
||||
|
||||
def reverse_hf_permute_part(weights: NDArray, n_part: int, n_head: int, n_head_kv: int| None = None) -> NDArray:
|
||||
r = weights.shape[0] // 3
|
||||
return (reverse_hf_permute(weights[r * n_part : r * n_part + r, ...], n_head, n_head_kv))
|
||||
|
||||
def reverse_hf_part(weights: NDArray, n_part: int) -> NDArray:
|
||||
r = weights.shape[0] // 3
|
||||
return weights[r * n_part : r * n_part + r, ...]
|
||||
|
||||
def count_model_parts(dir_model: str) -> int:
|
||||
num_parts = 0
|
||||
|
||||
for filename in os.listdir(dir_model):
|
||||
if filename.startswith("pytorch_model-"):
|
||||
num_parts += 1
|
||||
|
||||
if num_parts > 0:
|
||||
print("gguf: found " + str(num_parts) + " model parts")
|
||||
|
||||
return num_parts
|
||||
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
|
||||
parser = argparse.ArgumentParser(description="Convert a HuggingFace LLaMA model to a GGML compatible file")
|
||||
parser.add_argument(
|
||||
"--vocab-only", action="store_true",
|
||||
help="extract only the vocab",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--outfile", type=Path,
|
||||
help="path to write to; default: based on input",
|
||||
)
|
||||
parser.add_argument(
|
||||
"model", type=Path,
|
||||
help="directory containing model file, or model file itself (*.bin)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"ftype", type=int, choices=[0, 1], default=1, nargs='?',
|
||||
help="output format - use 0 for float32, 1 for float16",
|
||||
)
|
||||
return parser.parse_args()
|
||||
|
||||
args = parse_args()
|
||||
|
||||
dir_model = args.model
|
||||
ftype = args.ftype
|
||||
if not dir_model.is_dir():
|
||||
print(f'Error: {args.model} is not a directory', file = sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
# possible tensor data types
|
||||
# ftype == 0 -> float32
|
||||
# ftype == 1 -> float16
|
||||
|
||||
# map from ftype to string
|
||||
ftype_str = ["f32", "f16"]
|
||||
|
||||
if args.outfile is not None:
|
||||
fname_out = args.outfile
|
||||
else:
|
||||
# output in the same directory as the model by default
|
||||
fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
|
||||
|
||||
print("gguf: loading model "+dir_model.name)
|
||||
|
||||
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
|
||||
hparams = json.load(f)
|
||||
print("hello print: ",hparams["architectures"][0])
|
||||
if hparams["architectures"][0] != "BaichuanForCausalLM":
|
||||
print("Model architecture not supported: " + hparams["architectures"][0])
|
||||
|
||||
sys.exit()
|
||||
|
||||
# get number of model parts
|
||||
num_parts = count_model_parts(dir_model)
|
||||
print(f"num_parts:{num_parts}\n")
|
||||
ARCH=gguf.MODEL_ARCH.BAICHUAN
|
||||
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
|
||||
|
||||
print("gguf: get model metadata")
|
||||
|
||||
block_count = hparams["num_hidden_layers"]
|
||||
head_count = hparams["num_attention_heads"]
|
||||
|
||||
if "num_key_value_heads" in hparams:
|
||||
head_count_kv = hparams["num_key_value_heads"]
|
||||
else:
|
||||
head_count_kv = head_count
|
||||
|
||||
if "_name_or_path" in hparams:
|
||||
hf_repo = hparams["_name_or_path"]
|
||||
else:
|
||||
hf_repo = ""
|
||||
|
||||
if "max_sequence_length" in hparams:
|
||||
ctx_length = hparams["max_sequence_length"]
|
||||
elif "max_position_embeddings" in hparams:
|
||||
ctx_length = hparams["max_position_embeddings"]
|
||||
elif "model_max_length" in hparams:
|
||||
ctx_length = hparams["model_max_length"]
|
||||
else:
|
||||
print("gguf: can not find ctx length parameter.")
|
||||
|
||||
sys.exit()
|
||||
|
||||
|
||||
gguf_writer.add_name(dir_model.name)
|
||||
gguf_writer.add_source_hf_repo(hf_repo)
|
||||
gguf_writer.add_tensor_data_layout("Meta AI original pth")
|
||||
gguf_writer.add_context_length(ctx_length)
|
||||
gguf_writer.add_embedding_length(hparams["hidden_size"])
|
||||
gguf_writer.add_block_count(block_count)
|
||||
gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
|
||||
gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
|
||||
gguf_writer.add_head_count(head_count)
|
||||
gguf_writer.add_head_count_kv(head_count_kv)
|
||||
gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
|
||||
|
||||
if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]:
|
||||
if "type" in hparams["rope_scaling"]:
|
||||
if hparams["rope_scaling"]["type"] == "linear":
|
||||
gguf_writer.add_rope_scale_linear(hparams["rope_scaling"]["factor"])
|
||||
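# The rope_scaling block handled above comes straight from config.json; a
# hypothetical fragment that would trigger the linear branch looks like:
#   "rope_scaling": { "type": "linear", "factor": 2.0 }
# Other scaling types (or a missing "factor") are simply not recorded.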
|
||||
|
||||
# TOKENIZATION
|
||||
|
||||
print("gguf: get tokenizer metadata")
|
||||
|
||||
tokens: list[bytes] = []
|
||||
scores: list[float] = []
|
||||
toktypes: list[int] = []
|
||||
|
||||
tokenizer_model_file = dir_model / 'tokenizer.model'
|
||||
if not tokenizer_model_file.is_file():
|
||||
print(f'Error: Missing {tokenizer_model_file}', file = sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
# vocab type sentencepiece
|
||||
print("gguf: get sentencepiece tokenizer vocab, scores and token types")
|
||||
|
||||
tokenizer = SentencePieceProcessor(str(tokenizer_model_file))
|
||||
|
||||
for i in range(tokenizer.vocab_size()):
|
||||
text: bytes
|
||||
score: float
|
||||
|
||||
piece = tokenizer.id_to_piece(i)
|
||||
text = piece.encode("utf-8")
|
||||
score = tokenizer.get_score(i)
|
||||
|
||||
toktype = 1 # default to normal token type
|
||||
if tokenizer.is_unknown(i):
|
||||
toktype = 2
|
||||
if tokenizer.is_control(i):
|
||||
toktype = 3
|
||||
|
||||
# toktype = 4 is user-defined = tokens from added_tokens.json
|
||||
|
||||
if tokenizer.is_unused(i):
|
||||
toktype = 5
|
||||
if tokenizer.is_byte(i):
|
||||
toktype = 6
|
||||
|
||||
tokens.append(text)
|
||||
scores.append(score)
|
||||
toktypes.append(toktype)
|
||||
|
||||
added_tokens_file = dir_model / 'added_tokens.json'
|
||||
if added_tokens_file.is_file():
|
||||
with open(added_tokens_file, "r", encoding="utf-8") as f:
|
||||
addtokens_json = json.load(f)
|
||||
|
||||
print("gguf: get added tokens")
|
||||
|
||||
for key in addtokens_json:
|
||||
tokens.append( key.encode("utf-8") )
|
||||
scores.append(-1000.0)
|
||||
toktypes.append(4) # user-defined token type
|
||||
|
||||
|
||||
gguf_writer.add_tokenizer_model("llama")
|
||||
gguf_writer.add_token_list(tokens)
|
||||
gguf_writer.add_token_scores(scores)
|
||||
gguf_writer.add_token_types(toktypes)
|
||||
|
||||
special_vocab = gguf.SpecialVocab(dir_model)
|
||||
special_vocab.add_to_gguf(gguf_writer)
|
||||
|
||||
# TENSORS
|
||||
|
||||
tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
|
||||
|
||||
# tensor info
|
||||
print("gguf: get tensor metadata")
|
||||
|
||||
if num_parts == 0:
|
||||
part_names = iter(("pytorch_model.bin",))
|
||||
else:
|
||||
part_names = (
|
||||
f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
|
||||
)
|
||||
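# For sharded HuggingFace checkpoints the generator above yields names such as
# (illustrative, num_parts == 2):
#   pytorch_model-00001-of-00002.bin
#   pytorch_model-00002-of-00002.bin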
|
||||
|
||||
for part_name in part_names:
|
||||
if args.vocab_only:
|
||||
break
|
||||
print("gguf: loading model part '" + part_name + "'")
|
||||
model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
|
||||
|
||||
tmp=model_part
|
||||
for i in range(block_count):
|
||||
if f"model.layers.{i}.self_attn.W_pack.weight" in model_part:
|
||||
print(f"Unpacking and permuting layer {i}")
|
||||
tmp[f"model.layers.{i}.self_attn.q_proj.weight"]=reverse_hf_permute_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],0,head_count,head_count)
|
||||
tmp[f"model.layers.{i}.self_attn.k_proj.weight"]=reverse_hf_permute_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],1,head_count,head_count_kv)
|
||||
tmp[f"model.layers.{i}.self_attn.v_proj.weight"]=reverse_hf_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],2)
|
||||
del tmp[f"model.layers.{i}.self_attn.W_pack.weight"]
|
||||
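# reverse_hf_permute_part and reverse_hf_part are defined earlier in this
# script. A sketch consistent with how they are called here (W_pack stacks the
# Q, K and V projections along dim 0, so each part is one third of the rows,
# and Q/K additionally get the reverse HF rotary permutation):
#
#   def reverse_hf_part(weights, n_part):
#       r = weights.shape[0] // 3
#       return weights[r * n_part : r * n_part + r, ...]
#
#   def reverse_hf_permute_part(weights, n_part, n_head, n_head_kv):
#       return reverse_hf_permute(reverse_hf_part(weights, n_part), n_head, n_head_kv)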
|
||||
for name in model_part.keys():
|
||||
data = model_part[name]
|
||||
# we don't need these
|
||||
if name.endswith(".rotary_emb.inv_freq"):
|
||||
continue
|
||||
|
||||
old_dtype = data.dtype
|
||||
|
||||
# convert any unsupported data types to float32
|
||||
if data.dtype != torch.float16 and data.dtype != torch.float32:
|
||||
data = data.to(torch.float32)
|
||||
|
||||
data = data.squeeze().numpy()
|
||||
|
||||
# map tensor names
|
||||
new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
|
||||
if new_name is None:
|
||||
print("Can not map tensor '" + name + "'")
|
||||
sys.exit()
|
||||
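# tensor_map.get_name translates HF tensor names into the GGUF naming
# convention; typical mappings (illustrative) are:
#   model.embed_tokens.weight               -> token_embd.weight
#   model.layers.0.self_attn.q_proj.weight  -> blk.0.attn_q.weight
#   model.layers.0.mlp.down_proj.weight     -> blk.0.ffn_down.weight
#   lm_head.weight                          -> output.weight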
|
||||
n_dims = len(data.shape)
|
||||
data_dtype = data.dtype
|
||||
|
||||
# if f32 desired, convert any float16 to float32
|
||||
if ftype == 0 and data_dtype == np.float16:
|
||||
data = data.astype(np.float32)
|
||||
|
||||
# TODO: Why can't we use these float16 values as-is? There should be no reason to store float16 as float32
|
||||
if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
|
||||
data = data.astype(np.float32)
|
||||
|
||||
# if f16 desired, convert any float32 2-dim weight tensors to float16
|
||||
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
|
||||
data = data.astype(np.float16)
|
||||
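# In short, the casting policy above is: ftype == 0 stores everything as
# float32, while ftype == 1 stores only 2-dimensional ".weight" tensors as
# float16 and keeps 1-D tensors (biases, norms) as float32.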
|
||||
print(name + " -> " + new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
|
||||
gguf_writer.add_tensor(new_name, data)
|
||||
|
||||
|
||||
print("gguf: write header")
|
||||
gguf_writer.write_header_to_file()
|
||||
print("gguf: write metadata")
|
||||
gguf_writer.write_kv_data_to_file()
|
||||
if not args.vocab_only:
|
||||
print("gguf: write tensors")
|
||||
gguf_writer.write_tensors_to_file()
|
||||
|
||||
gguf_writer.close()
|
||||
|
||||
print(f"gguf: model successfully exported to '{fname_out}'")
|
||||
print("")
|
||||
@@ -1,24 +1,17 @@
|
||||
#!/usr/bin/env python3
|
||||
# HF falcon--> gguf conversion
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import gguf
|
||||
import os
|
||||
import struct
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import struct
|
||||
import json
|
||||
import numpy as np
|
||||
import torch
|
||||
from transformers import AutoTokenizer # type: ignore[import]
|
||||
|
||||
if 'NO_LOCAL_GGUF' not in os.environ:
|
||||
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
|
||||
import gguf
|
||||
|
||||
from typing import Any, List
|
||||
from pathlib import Path
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
def bytes_to_unicode():
|
||||
# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
|
||||
@@ -39,10 +32,11 @@ def bytes_to_unicode():
|
||||
bs.append(b)
|
||||
cs.append(2**8+n)
|
||||
n += 1
|
||||
return dict(zip(bs, (chr(n) for n in cs)))
|
||||
cs = [chr(n) for n in cs]
|
||||
return dict(zip(bs, cs))
|
||||
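# Quick illustration of how bytes_to_unicode is used further below (the
# _demo_* names exist only for this sketch): it gives a reversible mapping
# between all 256 byte values and printable unicode characters, as in GPT-2's
# encoder, and the inverted byte_decoder recovers raw bytes from token text.
_demo_encoder = bytes_to_unicode()
_demo_decoder = {v: k for k, v in _demo_encoder.items()}
assert len(_demo_encoder) == 256
assert bytes(_demo_decoder[c] for c in "".join(_demo_encoder[b] for b in b"hi")) == b"hi"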
|
||||
|
||||
def count_model_parts(dir_model: Path) -> int:
|
||||
def count_model_parts(dir_model: str) -> int:
|
||||
num_parts = 0
|
||||
for filename in os.listdir(dir_model):
|
||||
if filename.startswith("pytorch_model-"):
|
||||
@@ -53,34 +47,17 @@ def count_model_parts(dir_model: Path) -> int:
|
||||
return num_parts
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
|
||||
parser = argparse.ArgumentParser(description="Convert a Falcon model to a GGML compatible file")
|
||||
parser.add_argument(
|
||||
"--vocab-only", action="store_true",
|
||||
help="extract only the vocab",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--outfile", type=Path,
|
||||
help="path to write to; default: based on input",
|
||||
)
|
||||
parser.add_argument(
|
||||
"model", type=Path,
|
||||
help="directory containing model file, or model file itself (*.bin)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"ftype", type=int, choices=[0, 1], default=1, nargs='?',
|
||||
help="output format - use 0 for float32, 1 for float16",
|
||||
)
|
||||
return parser.parse_args()
|
||||
|
||||
args = parse_args()
|
||||
|
||||
dir_model = args.model
|
||||
ftype = args.ftype
|
||||
if not dir_model.is_dir():
|
||||
print(f'Error: {args.model} is not a directory', file = sys.stderr)
|
||||
if len(sys.argv) < 3:
|
||||
print("Usage: convert-h5-to-ggml.py dir-model ftype\n")
|
||||
print(" ftype == 0 -> float32")
|
||||
print(" ftype == 1 -> float16")
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
# output in the same directory as the model
|
||||
dir_model = sys.argv[1]
|
||||
last_dir = os.path.basename(os.path.normpath(dir_model))
|
||||
|
||||
# possible tensor data types
|
||||
# ftype == 0 -> float32
|
||||
# ftype == 1 -> float16
|
||||
@@ -88,21 +65,25 @@ if not dir_model.is_dir():
|
||||
# map from ftype to string
|
||||
ftype_str = ["f32", "f16"]
|
||||
|
||||
if args.outfile is not None:
|
||||
fname_out = args.outfile
|
||||
else:
|
||||
# output in the same directory as the model by default
|
||||
fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
|
||||
ftype = 1
|
||||
if len(sys.argv) > 2:
|
||||
ftype = int(sys.argv[2])
|
||||
if ftype < 0 or ftype > 1:
|
||||
print("Invalid ftype: " + str(ftype))
|
||||
|
||||
print("gguf: loading model "+dir_model.name)
|
||||
sys.exit(1)
|
||||
|
||||
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
|
||||
fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".gguf"
|
||||
|
||||
print("gguf: loading model "+last_dir)
|
||||
|
||||
with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
|
||||
hparams = json.load(f)
|
||||
|
||||
if hparams["architectures"][0] != "RWForCausalLM":
|
||||
print("Model architecture not supported: " + hparams["architectures"][0])
|
||||
|
||||
sys.exit(1)
|
||||
sys.exit()
|
||||
|
||||
# get number of model parts
|
||||
num_parts = count_model_parts(dir_model)
|
||||
@@ -132,60 +113,77 @@ gguf_writer.add_file_type(ftype)
|
||||
|
||||
print("gguf: get tokenizer metadata")
|
||||
|
||||
tokens: list[bytearray] = []
|
||||
scores: list[float] = []
|
||||
toktypes: list[int] = []
|
||||
tokens: List[str] = []
|
||||
scores: List[float] = []
|
||||
toktypes: List[int] = []
|
||||
merges: List[str] = []
|
||||
|
||||
tokenizer_json_file = dir_model / 'tokenizer.json'
|
||||
if not tokenizer_json_file.is_file():
|
||||
print(f'Error: Missing {tokenizer_json_file}', file = sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
# gpt2 tokenizer
|
||||
gguf_writer.add_tokenizer_model("gpt2")
|
||||
if Path(dir_model + "/tokenizer.json").is_file():
|
||||
# gpt2 tokenizer
|
||||
gguf_writer.add_tokenizer_model("gpt2")
|
||||
|
||||
with open(tokenizer_json_file, "r", encoding="utf-8") as f:
|
||||
tokenizer_json = json.load(f)
|
||||
print("gguf: get gpt2 tokenizer merges")
|
||||
|
||||
print("gguf: get gpt2 tokenizer vocab")
|
||||
with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f:
|
||||
tokenizer_json = json.load(f)
|
||||
merges = tokenizer_json["model"]["merges"]
|
||||
|
||||
# The number of tokens in tokenizer.json can differ from the expected vocab size.
|
||||
# This causes downstream issues with mismatched tensor sizes when running the inference
|
||||
vocab_size = hparams["vocab_size"] if "vocab_size" in hparams else len(tokenizer_json["model"]["vocab"])
|
||||
gguf_writer.add_token_merges(merges)
|
||||
|
||||
# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
|
||||
tokenizer = AutoTokenizer.from_pretrained(dir_model)
|
||||
print("gguf: get gpt2 tokenizer vocab")
|
||||
|
||||
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
|
||||
byte_encoder = bytes_to_unicode()
|
||||
byte_decoder = {v: k for k, v in byte_encoder.items()}
|
||||
vocab_size = len(tokenizer_json["model"]["vocab"])
|
||||
|
||||
for i in range(vocab_size):
|
||||
if i in reverse_vocab:
|
||||
try:
|
||||
text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
|
||||
except KeyError:
|
||||
text = bytearray()
|
||||
for c in reverse_vocab[i]:
|
||||
if ord(c) < 256: # single byte character
|
||||
text.append(byte_decoder[ord(c)])
|
||||
else: # multibyte special token character
|
||||
text.extend(c.encode('utf-8'))
|
||||
else:
|
||||
print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
|
||||
pad_token = f"[PAD{i}]".encode("utf8")
|
||||
text = bytearray(pad_token)
|
||||
# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
|
||||
tokenizer = AutoTokenizer.from_pretrained(dir_model)
|
||||
|
||||
tokens.append(text)
|
||||
scores.append(0.0) # dummy
|
||||
toktypes.append(gguf.TokenType.NORMAL) # dummy
|
||||
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
|
||||
byte_encoder = bytes_to_unicode()
|
||||
byte_decoder = {v: k for k, v in byte_encoder.items()}
|
||||
|
||||
gguf_writer.add_token_list(tokens)
|
||||
gguf_writer.add_token_scores(scores)
|
||||
gguf_writer.add_token_types(toktypes)
|
||||
for i in range(vocab_size):
|
||||
if i in reverse_vocab:
|
||||
try:
|
||||
text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
|
||||
except KeyError:
|
||||
text = bytearray()
|
||||
for c in reverse_vocab[i]:
|
||||
if ord(c) < 256: # single byte character
|
||||
text.append(byte_decoder[ord(c)])
|
||||
else: # multibyte special token character
|
||||
text.extend(c.encode('utf-8'))
|
||||
else:
|
||||
print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
|
||||
pad_token = f"[PAD{i}]".encode("utf8")
|
||||
text = bytearray(pad_token)
|
||||
|
||||
tokens.append(text)
|
||||
scores.append(0.0) # dummy
|
||||
toktypes.append(gguf.TokenType.NORMAL) # dummy
|
||||
|
||||
gguf_writer.add_token_list(tokens)
|
||||
gguf_writer.add_token_scores(scores)
|
||||
gguf_writer.add_token_types(toktypes)
|
||||
|
||||
print("gguf: get special token ids")
|
||||
# Look for special tokens in config.json
|
||||
|
||||
if "bos_token_id" in hparams and hparams["bos_token_id"] != None:
|
||||
gguf_writer.add_bos_token_id(hparams["bos_token_id"])
|
||||
|
||||
if "eos_token_id" in hparams and hparams["eos_token_id"] != None:
|
||||
gguf_writer.add_eos_token_id(hparams["eos_token_id"])
|
||||
|
||||
if "unk_token_id" in hparams and hparams["unk_token_id"] != None:
|
||||
gguf_writer.add_unk_token_id(hparams["unk_token_id"])
|
||||
|
||||
if "sep_token_id" in hparams and hparams["sep_token_id"] != None:
|
||||
gguf_writer.add_sep_token_id(hparams["sep_token_id"])
|
||||
|
||||
if "pad_token_id" in hparams and hparams["pad_token_id"] != None:
|
||||
gguf_writer.add_pad_token_id(hparams["pad_token_id"])
|
||||
|
||||
special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
|
||||
special_vocab.add_to_gguf(gguf_writer)
|
||||
|
||||
# TENSORS
|
||||
|
||||
@@ -201,17 +199,15 @@ head_dim = hparams["hidden_size"] // n_head
|
||||
print("gguf: get tensor metadata")
|
||||
|
||||
if num_parts == 0:
|
||||
part_names = iter(("pytorch_model.bin",))
|
||||
part_names = ("pytorch_model.bin",)
|
||||
else:
|
||||
part_names = (
|
||||
f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
|
||||
)
|
||||
|
||||
for part_name in part_names:
|
||||
if args.vocab_only:
|
||||
break
|
||||
print("gguf: loading model part '" + part_name + "'")
|
||||
model_part = torch.load(dir_model / part_name, map_location="cpu")
|
||||
model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
|
||||
|
||||
for name in model_part.keys():
|
||||
data = model_part[name]
|
||||
@@ -242,8 +238,11 @@ for part_name in part_names:
|
||||
data = data.squeeze().numpy()
|
||||
|
||||
# map tensor names
|
||||
new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
|
||||
if new_name is None:
|
||||
if name.endswith(".weight") and name[:-7] in tensor_map:
|
||||
name = tensor_map[name[:-7]] + ".weight"
|
||||
elif name.endswith(".bias") and name[:-5] in tensor_map:
|
||||
name = tensor_map[name[:-5]] + ".bias"
|
||||
else:
|
||||
print("Can not map tensor '" + name + "'")
|
||||
sys.exit()
|
||||
|
||||
@@ -262,20 +261,19 @@ for part_name in part_names:
|
||||
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
|
||||
data = data.astype(np.float16)
|
||||
|
||||
print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
|
||||
print(name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
|
||||
|
||||
gguf_writer.add_tensor(new_name, data)
|
||||
gguf_writer.add_tensor(name, data)
|
||||
|
||||
|
||||
print("gguf: write header")
|
||||
gguf_writer.write_header_to_file()
|
||||
print("gguf: write metadata")
|
||||
gguf_writer.write_kv_data_to_file()
|
||||
if not args.vocab_only:
|
||||
print("gguf: write tensors")
|
||||
gguf_writer.write_tensors_to_file()
|
||||
print("gguf: write tensors")
|
||||
gguf_writer.write_tensors_to_file()
|
||||
|
||||
gguf_writer.close()
|
||||
|
||||
print(f"gguf: model successfully exported to '{fname_out}'")
|
||||
print("gguf: model successfully exported to '" + fname_out + "'")
|
||||
print("")
|
||||
|
||||
@@ -1,23 +1,17 @@
|
||||
#!/usr/bin/env python3
|
||||
# HF gptneox--> gguf conversion
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import gguf
|
||||
import os
|
||||
import struct
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import struct
|
||||
import json
|
||||
import numpy as np
|
||||
import torch
|
||||
from transformers import AutoTokenizer # type: ignore[import]
|
||||
|
||||
if 'NO_LOCAL_GGUF' not in os.environ:
|
||||
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
|
||||
import gguf
|
||||
from typing import Any, List
|
||||
from pathlib import Path
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
|
||||
|
||||
@@ -40,10 +34,11 @@ def bytes_to_unicode():
|
||||
bs.append(b)
|
||||
cs.append(2**8+n)
|
||||
n += 1
|
||||
return dict(zip(bs, (chr(n) for n in cs)))
|
||||
cs = [chr(n) for n in cs]
|
||||
return dict(zip(bs, cs))
|
||||
|
||||
|
||||
def count_model_parts(dir_model: Path) -> int:
|
||||
def count_model_parts(dir_model: str) -> int:
|
||||
num_parts = 0
|
||||
for filename in os.listdir(dir_model):
|
||||
if filename.startswith("pytorch_model-"):
|
||||
@@ -54,34 +49,17 @@ def count_model_parts(dir_model: Path) -> int:
|
||||
return num_parts
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
|
||||
parser = argparse.ArgumentParser(description="Convert a GPT-NeoX model to a GGML compatible file")
|
||||
parser.add_argument(
|
||||
"--vocab-only", action="store_true",
|
||||
help="extract only the vocab",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--outfile", type=Path,
|
||||
help="path to write to; default: based on input",
|
||||
)
|
||||
parser.add_argument(
|
||||
"model", type=Path,
|
||||
help="directory containing model file, or model file itself (*.bin)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"ftype", type=int, choices=[0, 1], default=1, nargs='?',
|
||||
help="output format - use 0 for float32, 1 for float16",
|
||||
)
|
||||
return parser.parse_args()
|
||||
|
||||
args = parse_args()
|
||||
|
||||
dir_model = args.model
|
||||
ftype = args.ftype
|
||||
if not dir_model.is_dir():
|
||||
print(f'Error: {args.model} is not a directory', file = sys.stderr)
|
||||
if len(sys.argv) < 3:
|
||||
print("Usage: convert-h5-to-ggml.py dir-model ftype\n")
|
||||
print(" ftype == 0 -> float32")
|
||||
print(" ftype == 1 -> float16")
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
# output in the same directory as the model
|
||||
dir_model = sys.argv[1]
|
||||
last_dir = os.path.basename(os.path.normpath(dir_model))
|
||||
|
||||
# possible tensor data types
|
||||
# ftype == 0 -> float32
|
||||
# ftype == 1 -> float16
|
||||
@@ -89,15 +67,19 @@ if not dir_model.is_dir():
|
||||
# map from ftype to string
|
||||
ftype_str = ["f32", "f16"]
|
||||
|
||||
if args.outfile is not None:
|
||||
fname_out = args.outfile
|
||||
else:
|
||||
# output in the same directory as the model by default
|
||||
fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
|
||||
ftype = 1
|
||||
if len(sys.argv) > 2:
|
||||
ftype = int(sys.argv[2])
|
||||
if ftype < 0 or ftype > 1:
|
||||
print("Invalid ftype: " + str(ftype))
|
||||
|
||||
print("gguf: loading model "+dir_model.name)
|
||||
sys.exit(1)
|
||||
|
||||
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
|
||||
fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".gguf"
|
||||
|
||||
print("gguf: loading model "+last_dir)
|
||||
|
||||
with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
|
||||
hparams = json.load(f)
|
||||
|
||||
if hparams["architectures"][0] != "GPTNeoXForCausalLM":
|
||||
@@ -115,7 +97,7 @@ print("gguf: get model metadata")
|
||||
|
||||
block_count = hparams["num_hidden_layers"]
|
||||
|
||||
gguf_writer.add_name(dir_model.name)
|
||||
gguf_writer.add_name(last_dir)
|
||||
gguf_writer.add_context_length(hparams["max_position_embeddings"])
|
||||
gguf_writer.add_embedding_length(hparams["hidden_size"])
|
||||
gguf_writer.add_block_count(block_count)
|
||||
@@ -129,52 +111,86 @@ gguf_writer.add_layer_norm_eps(hparams["layer_norm_eps"])
|
||||
|
||||
print("gguf: get tokenizer metadata")
|
||||
|
||||
tokens: list[bytearray] = []
|
||||
tokens: List[str] = []
|
||||
merges: List[str] = []
|
||||
|
||||
tokenizer_json_file = dir_model / 'tokenizer.json'
|
||||
if not tokenizer_json_file.is_file():
|
||||
print(f'Error: Missing {tokenizer_json_file}', file = sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
# gpt2 tokenizer
|
||||
gguf_writer.add_tokenizer_model("gpt2")
|
||||
if Path(dir_model + "/tokenizer.json").is_file():
|
||||
# gpt2 tokenizer
|
||||
gguf_writer.add_tokenizer_model("gpt2")
|
||||
|
||||
with open(tokenizer_json_file, "r", encoding="utf-8") as f:
|
||||
tokenizer_json = json.load(f)
|
||||
print("gguf: get gpt2 tokenizer merges")
|
||||
|
||||
print("gguf: get gpt2 tokenizer vocab")
|
||||
with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f:
|
||||
tokenizer_json = json.load(f)
|
||||
merges = tokenizer_json["model"]["merges"]
|
||||
|
||||
vocab_size = len(tokenizer_json["model"]["vocab"])
|
||||
gguf_writer.add_token_merges(merges)
|
||||
|
||||
# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
|
||||
tokenizer = AutoTokenizer.from_pretrained(dir_model)
|
||||
print("gguf: get gpt2 tokenizer vocab")
|
||||
|
||||
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
|
||||
byte_encoder = bytes_to_unicode()
|
||||
byte_decoder = {v: k for k, v in byte_encoder.items()}
|
||||
vocab_size = len(tokenizer_json["model"]["vocab"])
|
||||
|
||||
for i in range(vocab_size):
|
||||
if i in reverse_vocab:
|
||||
try:
|
||||
text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
|
||||
except KeyError:
|
||||
text = bytearray()
|
||||
for c in reverse_vocab[i]:
|
||||
if ord(c) < 256: # single byte character
|
||||
text.append(byte_decoder[ord(c)])
|
||||
else: # multibyte special token character
|
||||
text.extend(c.encode('utf-8'))
|
||||
else:
|
||||
print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
|
||||
pad_token = f"[PAD{i}]".encode("utf8")
|
||||
text = bytearray(pad_token)
|
||||
# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
|
||||
tokenizer = AutoTokenizer.from_pretrained(dir_model)
|
||||
|
||||
tokens.append(text)
|
||||
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
|
||||
byte_encoder = bytes_to_unicode()
|
||||
byte_decoder = {v: k for k, v in byte_encoder.items()}
|
||||
|
||||
gguf_writer.add_token_list(tokens)
|
||||
for i in range(vocab_size):
|
||||
if i in reverse_vocab:
|
||||
try:
|
||||
text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
|
||||
except KeyError:
|
||||
text = bytearray()
|
||||
for c in reverse_vocab[i]:
|
||||
if ord(c) < 256: # single byte character
|
||||
text.append(byte_decoder[ord(c)])
|
||||
else: # multibyte special token character
|
||||
text.extend(c.encode('utf-8'))
|
||||
else:
|
||||
print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
|
||||
pad_token = f"[PAD{i}]".encode("utf8")
|
||||
text = bytearray(pad_token)
|
||||
|
||||
tokens.append(text)
|
||||
|
||||
gguf_writer.add_token_list(tokens)
|
||||
|
||||
if "added_tokens" in tokenizer_json and Path(dir_model + "/tokenizer_config.json").is_file():
|
||||
print("gguf: get special token ids")
|
||||
|
||||
with open(dir_model + "/tokenizer_config.json", "r", encoding="utf-8") as f:
|
||||
tokenizer_config = json.load(f)
|
||||
|
||||
# find special token ids
|
||||
|
||||
if "bos_token" in tokenizer_config:
|
||||
for key in tokenizer_json["added_tokens"]:
|
||||
if key["content"] == tokenizer_config["bos_token"]:
|
||||
gguf_writer.add_bos_token_id(key["id"])
|
||||
|
||||
if "eos_token" in tokenizer_config:
|
||||
for key in tokenizer_json["added_tokens"]:
|
||||
if key["content"] == tokenizer_config["eos_token"]:
|
||||
gguf_writer.add_eos_token_id(key["id"])
|
||||
|
||||
if "unk_token" in tokenizer_config:
|
||||
for key in tokenizer_json["added_tokens"]:
|
||||
if key["content"] == tokenizer_config["unk_token"]:
|
||||
gguf_writer.add_unk_token_id(key["id"])
|
||||
|
||||
if "sep_token" in tokenizer_config:
|
||||
for key in tokenizer_json["added_tokens"]:
|
||||
if key["content"] == tokenizer_config["sep_token"]:
|
||||
gguf_writer.add_sep_token_id(key["id"])
|
||||
|
||||
if "pad_token" in tokenizer_config:
|
||||
for key in tokenizer_json["added_tokens"]:
|
||||
if key["content"] == tokenizer_config["pad_token"]:
|
||||
gguf_writer.add_pad_token_id(key["id"])
|
||||
|
||||
special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
|
||||
special_vocab.add_to_gguf(gguf_writer)
|
||||
|
||||
# TENSORS
|
||||
|
||||
@@ -184,15 +200,13 @@ tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
|
||||
print("gguf: get tensor metadata")
|
||||
|
||||
if num_parts == 0:
|
||||
part_names = iter(("pytorch_model.bin",))
|
||||
part_names = ("pytorch_model.bin",)
|
||||
else:
|
||||
part_names = (
|
||||
f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
|
||||
)
|
||||
|
||||
for part_name in part_names:
|
||||
if args.vocab_only:
|
||||
break
|
||||
print("gguf: loading model part '" + part_name + "'")
|
||||
model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
|
||||
|
||||
@@ -212,8 +226,11 @@ for part_name in part_names:
|
||||
data = data.squeeze().numpy()
|
||||
|
||||
# map tensor names
|
||||
new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
|
||||
if new_name is None:
|
||||
if name.endswith(".weight") and name[:-7] in tensor_map:
|
||||
name = tensor_map[name[:-7]] + ".weight"
|
||||
elif name.endswith(".bias") and name[:-5] in tensor_map:
|
||||
name = tensor_map[name[:-5]] + ".bias"
|
||||
else:
|
||||
print("Can not map tensor '" + name + "'")
|
||||
sys.exit()
|
||||
|
||||
@@ -232,20 +249,19 @@ for part_name in part_names:
|
||||
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
|
||||
data = data.astype(np.float16)
|
||||
|
||||
print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
|
||||
print(name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
|
||||
|
||||
gguf_writer.add_tensor(new_name, data)
|
||||
gguf_writer.add_tensor(name, data)
|
||||
|
||||
|
||||
print("gguf: write header")
|
||||
gguf_writer.write_header_to_file()
|
||||
print("gguf: write metadata")
|
||||
gguf_writer.write_kv_data_to_file()
|
||||
if not args.vocab_only:
|
||||
print("gguf: write tensors")
|
||||
gguf_writer.write_tensors_to_file()
|
||||
print("gguf: write tensors")
|
||||
gguf_writer.write_tensors_to_file()
|
||||
|
||||
gguf_writer.close()
|
||||
|
||||
print(f"gguf: model successfully exported to '{fname_out}'")
|
||||
print("gguf: model successfully exported to '" + fname_out + "'")
|
||||
print("")
|
||||
|
||||
convert-llama-7b-pth-to-gguf.py (new executable file, 308 lines)
@@ -0,0 +1,308 @@
|
||||
#!/usr/bin/env python3
|
||||
# 7b pth llama --> gguf conversion
|
||||
# Only models with a single datafile are supported, like 7B
|
||||
# HF files required in the model dir: config.json tokenizer_config.json tokenizer.json tokenizer.model
|
||||
|
||||
import gguf
|
||||
import os
|
||||
import sys
|
||||
import struct
|
||||
import json
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
from typing import Any, List
|
||||
from pathlib import Path
|
||||
from sentencepiece import SentencePieceProcessor
|
||||
|
||||
#NDArray = np.ndarray[Any, Any]
|
||||
# compatible with python < 3.9
|
||||
NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]'
|
||||
|
||||
|
||||
def count_model_parts(dir_model: str) -> int:
|
||||
num_parts = 0
|
||||
for filename in os.listdir(dir_model):
|
||||
if filename.startswith("consolidated."):
|
||||
num_parts += 1
|
||||
|
||||
if num_parts > 0:
|
||||
print("gguf: found " + str(num_parts) + " model parts")
|
||||
return num_parts
|
||||
|
||||
|
||||
if len(sys.argv) < 3:
|
||||
print("Usage: convert-h5-to-ggml.py dir-model ftype\n")
|
||||
print(" ftype == 0 -> float32")
|
||||
print(" ftype == 1 -> float16")
|
||||
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
# output in the same directory as the model
|
||||
dir_model = sys.argv[1]
|
||||
last_dir = os.path.basename(os.path.normpath(dir_model))
|
||||
|
||||
|
||||
# possible tensor data types
|
||||
# ftype == 0 -> float32
|
||||
# ftype == 1 -> float16
|
||||
|
||||
# map from ftype to string
|
||||
ftype_str = ["f32", "f16"]
|
||||
|
||||
ftype = 1
|
||||
if len(sys.argv) > 2:
|
||||
ftype = int(sys.argv[2])
|
||||
if ftype < 0 or ftype > 1:
|
||||
print("Invalid ftype: " + str(ftype))
|
||||
|
||||
sys.exit(1)
|
||||
|
||||
fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".gguf"
|
||||
|
||||
print("gguf: loading model "+last_dir)
|
||||
|
||||
with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
|
||||
hparams = json.load(f)
|
||||
|
||||
if hparams["architectures"][0] != "LlamaForCausalLM":
|
||||
print("Model architecture not supported: " + hparams["architectures"][0])
|
||||
sys.exit()
|
||||
|
||||
# get number of model parts
|
||||
num_parts = count_model_parts(dir_model)
|
||||
|
||||
if num_parts > 1:
|
||||
print("gguf: Only models with a single datafile are supported.")
|
||||
|
||||
sys.exit()
|
||||
|
||||
ARCH=gguf.MODEL_ARCH.LLAMA
|
||||
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
|
||||
|
||||
|
||||
print("gguf: get model metadata")
|
||||
|
||||
block_count = hparams["num_hidden_layers"]
|
||||
head_count = hparams["num_attention_heads"]
|
||||
|
||||
if "num_key_value_heads" in hparams:
|
||||
head_count_kv = hparams["num_key_value_heads"]
|
||||
else:
|
||||
head_count_kv = head_count
|
||||
|
||||
if "_name_or_path" in hparams:
|
||||
hf_repo = hparams["_name_or_path"]
|
||||
else:
|
||||
hf_repo = ""
|
||||
|
||||
if "max_sequence_length" in hparams:
|
||||
ctx_length = hparams["max_sequence_length"]
|
||||
elif "max_position_embeddings" in hparams:
|
||||
ctx_length = hparams["max_position_embeddings"]
|
||||
else:
|
||||
print("gguf: can not find ctx length parameter.")
|
||||
|
||||
sys.exit()
|
||||
|
||||
|
||||
gguf_writer.add_name(last_dir)
|
||||
gguf_writer.add_source_hf_repo(hf_repo)
|
||||
gguf_writer.add_tensor_data_layout("Meta AI original pth")
|
||||
gguf_writer.add_context_length(ctx_length)
|
||||
gguf_writer.add_embedding_length(hparams["hidden_size"])
|
||||
gguf_writer.add_block_count(block_count)
|
||||
gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
|
||||
gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
|
||||
gguf_writer.add_head_count(head_count)
|
||||
gguf_writer.add_head_count_kv(head_count_kv)
|
||||
gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
|
||||
|
||||
if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]:
|
||||
if "type" in hparams["rope_scaling"]:
|
||||
if hparams["rope_scaling"]["type"] == "linear":
|
||||
gguf_writer.add_rope_scale_linear(hparams["rope_scaling"]["factor"])
|
||||
|
||||
|
||||
# TOKENIZATION
|
||||
|
||||
print("gguf: get tokenizer metadata")
|
||||
|
||||
tokens: List[bytes] = []
|
||||
scores: List[float] = []
|
||||
toktypes: List[int] = []
|
||||
|
||||
if Path(dir_model + "/tokenizer.model").is_file():
|
||||
# vocab type sentencepiece
|
||||
print("gguf: get sentencepiece tokenizer vocab and scores")
|
||||
|
||||
tokenizer = SentencePieceProcessor(dir_model + "/tokenizer.model")
|
||||
|
||||
for i in range(tokenizer.vocab_size()):
|
||||
text: bytes
|
||||
score: float
|
||||
|
||||
piece = tokenizer.id_to_piece(i)
|
||||
text = piece.encode("utf-8")
|
||||
score = tokenizer.get_score(i)
|
||||
|
||||
toktype = 1 # default to normal token type
|
||||
if tokenizer.is_unknown(i):
|
||||
toktype = 2
|
||||
if tokenizer.is_control(i):
|
||||
toktype = 3
|
||||
|
||||
# toktype = 4 is user-defined = tokens from added_tokens.json
|
||||
|
||||
if tokenizer.is_unused(i):
|
||||
toktype = 5
|
||||
if tokenizer.is_byte(i):
|
||||
toktype = 6
|
||||
|
||||
tokens.append(text)
|
||||
scores.append(score)
|
||||
toktypes.append(toktype)
|
||||
|
||||
if Path(dir_model + "/added_tokens.json").is_file():
|
||||
with open(dir_model + "/added_tokens.json", "r", encoding="utf-8") as f:
|
||||
addtokens_json = json.load(f)
|
||||
|
||||
print("gguf: get added tokens")
|
||||
|
||||
for key in addtokens_json:
|
||||
tokens.append( key.encode("utf-8") )
|
||||
scores.append(-1000.0)
|
||||
toktypes.append(4) # user-defined token type
|
||||
|
||||
gguf_writer.add_tokenizer_model("llama")
|
||||
gguf_writer.add_token_list(tokens)
|
||||
gguf_writer.add_token_scores(scores)
|
||||
gguf_writer.add_token_types(toktypes)
|
||||
|
||||
|
||||
print("gguf: get special token ids")
|
||||
|
||||
if Path(dir_model + "/tokenizer.json").is_file():
|
||||
# Look for special tokens in tokenizer.json if it exists
|
||||
|
||||
with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f:
|
||||
tokenizer = json.load(f)
|
||||
|
||||
if "added_tokens" in tokenizer and Path(dir_model + "/tokenizer_config.json").is_file():
|
||||
|
||||
with open(dir_model + "/tokenizer_config.json", "r", encoding="utf-8") as f:
|
||||
tokenizer_config = json.load(f)
|
||||
|
||||
if "bos_token" in tokenizer_config and tokenizer_config["bos_token"] != None:
|
||||
for key in tokenizer["added_tokens"]:
|
||||
if key["content"] == tokenizer_config["bos_token"]["content"]:
|
||||
gguf_writer.add_bos_token_id(key["id"])
|
||||
|
||||
if "eos_token" in tokenizer_config and tokenizer_config["eos_token"] != None:
|
||||
for key in tokenizer["added_tokens"]:
|
||||
if key["content"] == tokenizer_config["eos_token"]["content"]:
|
||||
gguf_writer.add_eos_token_id(key["id"])
|
||||
|
||||
if "unk_token" in tokenizer_config and tokenizer_config["unk_token"] != None:
|
||||
for key in tokenizer["added_tokens"]:
|
||||
if key["content"] == tokenizer_config["unk_token"]["content"]:
|
||||
gguf_writer.add_unk_token_id(key["id"])
|
||||
|
||||
if "sep_token" in tokenizer_config and tokenizer_config["sep_token"] != None:
|
||||
for key in tokenizer["added_tokens"]:
|
||||
if key["content"] == tokenizer_config["sep_token"]["content"]:
|
||||
gguf_writer.add_sep_token_id(key["id"])
|
||||
|
||||
if "pad_token" in tokenizer_config and tokenizer_config["pad_token"] != None:
|
||||
for key in tokenizer["added_tokens"]:
|
||||
if key["content"] == tokenizer_config["pad_token"]["content"]:
|
||||
gguf_writer.add_pad_token_id(key["id"])
|
||||
else:
|
||||
# If no tokenizer.json: Look for special tokens in config.json
|
||||
|
||||
if "bos_token_id" in hparams and hparams["bos_token_id"] != None:
|
||||
gguf_writer.add_bos_token_id(hparams["bos_token_id"])
|
||||
|
||||
if "eos_token_id" in hparams and hparams["eos_token_id"] != None:
|
||||
gguf_writer.add_eos_token_id(hparams["eos_token_id"])
|
||||
|
||||
if "unk_token_id" in hparams and hparams["unk_token_id"] != None:
|
||||
gguf_writer.add_unk_token_id(hparams["unk_token_id"])
|
||||
|
||||
if "sep_token_id" in hparams and hparams["sep_token_id"] != None:
|
||||
gguf_writer.add_sep_token_id(hparams["sep_token_id"])
|
||||
|
||||
if "pad_token_id" in hparams and hparams["pad_token_id"] != None:
|
||||
gguf_writer.add_pad_token_id(hparams["pad_token_id"])
|
||||
|
||||
|
||||
# TENSORS
|
||||
|
||||
tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
|
||||
|
||||
# tensor info
|
||||
print("gguf: get tensor metadata")
|
||||
|
||||
part_names = (f"consolidated.{n:02}.pth" for n in range(0, num_parts))
|
||||
|
||||
for part_name in part_names:
|
||||
print("gguf: loading model part '" + part_name + "'")
|
||||
model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
|
||||
|
||||
for name in model_part.keys():
|
||||
data = model_part[name]
|
||||
|
||||
# we don't need these
|
||||
if name == "rope.freqs":
|
||||
continue
|
||||
|
||||
old_dtype = data.dtype
|
||||
|
||||
# convert any unsupported data types to float32
|
||||
if data.dtype != torch.float16 and data.dtype != torch.float32:
|
||||
data = data.to(torch.float32)
|
||||
|
||||
data = data.squeeze().numpy()
|
||||
|
||||
# map tensor names
|
||||
if name.endswith(".weight") and name[:-7] in tensor_map:
|
||||
name = tensor_map[name[:-7]] + ".weight"
|
||||
elif name.endswith(".bias") and name[:-5] in tensor_map:
|
||||
name = tensor_map[name[:-5]] + ".bias"
|
||||
else:
|
||||
print("Can not map tensor '" + name + "'")
|
||||
sys.exit()
|
||||
|
||||
n_dims = len(data.shape)
|
||||
data_dtype = data.dtype
|
||||
|
||||
# if f32 desired, convert any float16 to float32
|
||||
if ftype == 0 and data_dtype == np.float16:
|
||||
data = data.astype(np.float32)
|
||||
|
||||
# TODO: Why can't we use these float16 values as-is? There should be no reason to store float16 as float32
|
||||
if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
|
||||
data = data.astype(np.float32)
|
||||
|
||||
# if f16 desired, convert any float32 2-dim weight tensors to float16
|
||||
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
|
||||
data = data.astype(np.float16)
|
||||
|
||||
print(name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
|
||||
|
||||
gguf_writer.add_tensor(name, data)
|
||||
|
||||
|
||||
print("gguf: write header")
|
||||
gguf_writer.write_header_to_file()
|
||||
print("gguf: write metadata")
|
||||
gguf_writer.write_kv_data_to_file()
|
||||
print("gguf: write tensors")
|
||||
gguf_writer.write_tensors_to_file()
|
||||
|
||||
gguf_writer.close()
|
||||
|
||||
|
||||
print("gguf: model successfully exported to '" + fname_out + "'")
|
||||
print("")
|
||||
@@ -1,18 +1,9 @@
|
||||
#!/usr/bin/env python3
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import math
|
||||
import struct
|
||||
import sys
|
||||
from enum import IntEnum
|
||||
import sys, struct, math, argparse
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
|
||||
import os
|
||||
if 'NO_LOCAL_GGUF' not in os.environ:
|
||||
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
|
||||
import gguf
|
||||
|
||||
# Note: Does not support GGML_QKK_64
|
||||
@@ -35,35 +26,10 @@ GGML_QUANT_SIZES = {
|
||||
gguf.GGMLQuantizationType.Q8_K : (256, 4 + QK_K + QK_K // 8),
|
||||
}
|
||||
|
||||
class GGMLFormat(IntEnum):
|
||||
GGML = 0
|
||||
GGMF = 1
|
||||
GGJT = 2
|
||||
|
||||
class GGMLFType(IntEnum):
|
||||
ALL_F32 = 0
|
||||
MOSTLY_F16 = 1
|
||||
MOSTLY_Q4_0 = 2
|
||||
MOSTLY_Q4_1 = 3
|
||||
MOSTLY_Q4_1_SOME_F16 = 4
|
||||
MOSTLY_Q8_0 = 7
|
||||
MOSTLY_Q5_0 = 8
|
||||
MOSTLY_Q5_1 = 9
|
||||
MOSTLY_Q2_K = 10
|
||||
MOSTLY_Q3_K_S = 11
|
||||
MOSTLY_Q3_K_M = 12
|
||||
MOSTLY_Q3_K_L = 13
|
||||
MOSTLY_Q4_K_S = 14
|
||||
MOSTLY_Q4_K_M = 15
|
||||
MOSTLY_Q5_K_S = 16
|
||||
MOSTLY_Q5_K_M = 17
|
||||
MOSTLY_Q6_K = 18
|
||||
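# These ftype values mirror llama.cpp's LLAMA_FTYPE numbering, so the raw
# integer read from an old GGML/GGJT header can be shown by name, e.g.
# (illustrative):
#   GGMLFType(2).name   -> 'MOSTLY_Q4_0'
#   GGMLFType(15).name  -> 'MOSTLY_Q4_K_M'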
|
||||
class Hyperparameters:
|
||||
def __init__(self):
|
||||
self.n_vocab = self.n_embd = self.n_mult = self.n_head = 0
|
||||
self.n_layer = self.n_rot = self.n_ff = 0
|
||||
self.ftype = GGMLFType.ALL_F32
|
||||
self.n_vocab = self.n_embd = self.n_mult = self.n_head = self.n_layer = self.n_rot = self.ftype = 0
|
||||
self.n_ff = 0
|
||||
|
||||
def set_n_ff(self, model):
|
||||
ff_tensor_idx = model.tensor_map.get(b'layers.0.feed_forward.w1.weight')
|
||||
@@ -79,21 +45,16 @@ class Hyperparameters:
|
||||
self.n_head,
|
||||
self.n_layer,
|
||||
self.n_rot,
|
||||
ftype,
|
||||
self.ftype,
|
||||
) = struct.unpack('<7I', data[offset:offset + (4 * 7)])
|
||||
try:
|
||||
self.ftype = GGMLFType(ftype)
|
||||
except ValueError:
|
||||
raise ValueError(f'Invalid ftype {ftype}')
|
||||
return 4 * 7
|
||||
|
||||
def __str__(self):
|
||||
return f'<Hyperparameters: n_vocab={self.n_vocab}, n_embd={self.n_embd}, n_mult={self.n_mult}, n_head={self.n_head}, n_layer={self.n_layer}, n_rot={self.n_rot}, n_ff={self.n_ff}, ftype={self.ftype.name}>'
|
||||
return f'<Hyperparameters: n_vocab={self.n_vocab}, n_embd={self.n_embd}, n_mult={self.n_mult}, n_head={self.n_head}, n_layer={self.n_layer}, n_rot={self.n_rot}, n_ff={self.n_ff}, ftype={self.ftype}>'
|
||||
|
||||
class Vocab:
|
||||
def __init__(self, load_scores = True):
|
||||
def __init__(self):
|
||||
self.items = []
|
||||
self.load_scores = load_scores
|
||||
|
||||
def load(self, data, offset, n_vocab):
|
||||
orig_offset = offset
|
||||
@@ -101,24 +62,20 @@ class Vocab:
|
||||
itemlen = struct.unpack('<I', data[offset:offset + 4])[0]
|
||||
assert itemlen < 4096, 'Absurd vocab item length'
|
||||
offset += 4
|
||||
item_text = bytes(data[offset:offset + itemlen])
|
||||
vocab = bytes(data[offset:offset + itemlen])
|
||||
offset += itemlen
|
||||
if self.load_scores:
|
||||
item_score = struct.unpack('<f', data[offset:offset + 4])[0]
|
||||
offset += 4
|
||||
else:
|
||||
item_score = 0.0
|
||||
self.items.append((item_text, item_score))
|
||||
score = struct.unpack('<f', data[offset:offset + 4])[0]
|
||||
offset += 4
|
||||
self.items.append((vocab, score))
|
||||
return offset - orig_offset
|
||||
|
||||
class Tensor:
|
||||
def __init__(self, use_padding = True):
|
||||
def __init__(self):
|
||||
self.name = None
|
||||
self.dims: tuple[int, ...] = ()
|
||||
self.dims = ()
|
||||
self.dtype = None
|
||||
self.start_offset = 0
|
||||
self.len_bytes = np.int64(0)
|
||||
self.use_padding = use_padding
|
||||
self.len_bytes = 0
|
||||
|
||||
def load(self, data, offset):
|
||||
orig_offset = offset
|
||||
@@ -134,7 +91,7 @@ class Tensor:
|
||||
offset += 4 * n_dims
|
||||
self.name = bytes(data[offset:offset + name_len])
|
||||
offset += name_len
|
||||
pad = ((offset + 31) & ~31) - offset if self.use_padding else 0
|
||||
pad = ((offset + 31) & ~31) - offset
|
||||
offset += pad
|
||||
n_elems = np.prod(self.dims)
|
||||
n_bytes = np.int64(np.int64(n_elems) * np.int64(tysize)) // np.int64(blksize)
|
||||
@@ -144,7 +101,7 @@ class Tensor:
|
||||
# print(n_dims, name_len, dtype, self.dims, self.name, pad)
|
||||
return offset - orig_offset
|
||||
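# Tensor data in GGJT files is padded to a 32-byte boundary; the rounding
# ((offset + 31) & ~31) - offset gives, for example:
#   offset 0 -> pad 0, offset 1 -> pad 31, offset 31 -> pad 1, offset 32 -> pad 0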
|
||||
class GGMLModel:
|
||||
class GGMLV3Model:
|
||||
def __init__(self):
|
||||
self.hyperparameters = None
|
||||
self.vocab = None
|
||||
@@ -152,52 +109,20 @@ class GGMLModel:
|
||||
self.tensors = []
|
||||
|
||||
def validate_header(self, data, offset):
|
||||
magic = bytes(data[offset:offset + 4])
|
||||
if magic == b'GGUF':
|
||||
raise ValueError('File is already in GGUF format.')
|
||||
if magic == b'lmgg':
|
||||
self.file_format = GGMLFormat.GGML
|
||||
self.format_version = 1
|
||||
return 4
|
||||
version = struct.unpack('<I', data[offset + 4:offset + 8])[0]
|
||||
if magic == b'fmgg':
|
||||
if version != 1:
|
||||
raise ValueError(f'Cannot handle unexpected GGMF file version {version}')
|
||||
self.file_format = GGMLFormat.GGMF
|
||||
self.format_version = version
|
||||
return 8
|
||||
if magic == b'tjgg':
|
||||
if version < 1 or version > 3:
|
||||
raise ValueError(f'Cannot handle unexpected GGJT file version {version}')
|
||||
self.file_format = GGMLFormat.GGJT
|
||||
self.format_version = version
|
||||
return 8
|
||||
raise ValueError(f"Unexpected file magic {magic!r}! This doesn't look like a GGML format file.")
|
||||
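# The magics checked above are the little-endian uint32 file tags read back as
# raw bytes; e.g. (illustrative check, constants as used by llama.cpp):
#   struct.pack('<I', 0x67676d6c) == b'lmgg'   # "ggml"
#   struct.pack('<I', 0x67676d66) == b'fmgg'   # "ggmf"
#   struct.pack('<I', 0x67676a74) == b'tjgg'   # "ggjt"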
|
||||
def validate_conversion(self, ftype):
|
||||
err = ''
|
||||
if (self.file_format < GGMLFormat.GGJT or self.format_version < 2):
|
||||
if ftype not in (GGMLFType.ALL_F32, GGMLFType.MOSTLY_F16):
|
||||
err = 'Quantizations changed in GGJTv2. Can only convert unquantized GGML files older than GGJTv2.'
|
||||
elif (self.file_format == GGMLFormat.GGJT and self.format_version == 2):
|
||||
if ftype in ( GGMLFType.MOSTLY_Q4_0, GGMLFType.MOSTLY_Q4_1,
|
||||
GGMLFType.MOSTLY_Q4_1_SOME_F16, GGMLFType.MOSTLY_Q8_0):
|
||||
err = 'Q4 and Q8 quantizations changed in GGJTv3.'
|
||||
if len(err) > 0:
|
||||
raise ValueError(f'{err} Sorry, your {self.file_format.name}v{self.format_version} file of type {ftype.name} is not eligible for conversion.')
|
||||
if bytes(data[offset:offset + 4]) != b'tjgg' or struct.unpack('<I', data[offset + 4:offset + 8])[0] != 3:
|
||||
raise ValueError('Only GGJTv3 supported')
|
||||
return 8
|
||||
|
||||
def load(self, data, offset):
|
||||
offset += self.validate_header(data, offset)
|
||||
hp = Hyperparameters()
|
||||
offset += hp.load(data, offset)
|
||||
print(f'* File format: {self.file_format.name}v{self.format_version} with ftype {hp.ftype.name}')
|
||||
self.validate_conversion(hp.ftype)
|
||||
vocab = Vocab(load_scores = self.file_format > GGMLFormat.GGML)
|
||||
vocab = Vocab()
|
||||
offset += vocab.load(data, offset, hp.n_vocab)
|
||||
tensors: list[Tensor] = []
|
||||
tensors = []
|
||||
tensor_map = {}
|
||||
while offset < len(data):
|
||||
tensor = Tensor(use_padding = self.file_format > GGMLFormat.GGMF)
|
||||
tensor = Tensor()
|
||||
offset += tensor.load(data, offset)
|
||||
tensor_map[tensor.name] = len(tensors)
|
||||
tensors.append(tensor)
|
||||
@@ -209,14 +134,13 @@ class GGMLModel:
|
||||
return offset
|
||||
|
||||
class GGMLToGGUF:
|
||||
def __init__(self, ggml_model, data, cfg, params_override = None, vocab_override = None, special_vocab = None):
|
||||
def __init__(self, ggml_model, data, cfg, params_override = None, vocab_override = None):
|
||||
hp = ggml_model.hyperparameters
|
||||
self.model = ggml_model
|
||||
self.data = data
|
||||
self.cfg = cfg
|
||||
self.params_override = params_override
|
||||
self.vocab_override = vocab_override
|
||||
self.special_vocab = special_vocab
|
||||
if params_override is not None:
|
||||
n_kv_head = params_override.n_head_kv
|
||||
else:
|
||||
@@ -235,14 +159,9 @@ class GGMLToGGUF:
|
||||
|
||||
def save(self):
|
||||
print('* Preparing to save GGUF file')
|
||||
gguf_writer = gguf.GGUFWriter(
|
||||
self.cfg.output,
|
||||
gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA],
|
||||
use_temp_file = False )
|
||||
gguf_writer = gguf.GGUFWriter(self.cfg.output, gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], use_temp_file = False)
|
||||
self.add_params(gguf_writer)
|
||||
self.add_vocab(gguf_writer)
|
||||
if self.special_vocab is not None:
|
||||
self.special_vocab.add_to_gguf(gguf_writer)
|
||||
self.add_tensors(gguf_writer)
|
||||
print(" gguf: write header")
|
||||
gguf_writer.write_header_to_file()
|
||||
@@ -255,10 +174,7 @@ class GGMLToGGUF:
|
||||
def add_params(self, gguf_writer):
|
||||
hp = self.model.hyperparameters
|
||||
cfg = self.cfg
|
||||
if cfg.desc is not None:
|
||||
desc = cfg.desc
|
||||
else:
|
||||
desc = f'converted from legacy {self.model.file_format.name}v{self.model.format_version} {hp.ftype.name} format'
|
||||
desc = cfg.desc if cfg.desc is not None else 'converted from legacy GGJTv3 format'
|
||||
try:
|
||||
# Filenames aren't necessarily valid UTF8.
|
||||
name = cfg.name if cfg.name is not None else cfg.input.name
|
||||
@@ -268,7 +184,6 @@ class GGMLToGGUF:
|
||||
if name is not None:
|
||||
gguf_writer.add_name(name)
|
||||
gguf_writer.add_description(desc)
|
||||
gguf_writer.add_file_type(int(hp.ftype))
|
||||
if self.params_override is not None:
|
||||
po = self.params_override
|
||||
assert po.n_embd == hp.n_embd, 'Model hyperparams mismatch'
|
||||
@@ -305,8 +220,7 @@ class GGMLToGGUF:
|
||||
tokens.append(vbytes)
|
||||
scores.append(score)
|
||||
toktypes.append(ttype)
|
||||
assert len(tokens) == hp.n_vocab, \
|
||||
f'Override vocab has a different number of items than hyperparameters - override = {len(tokens)} but n_vocab={hp.n_vocab}'
|
||||
assert len(tokens) == hp.n_vocab, f'Override vocab has a different number of items than hyperparameters - override = {len(tokens)} but n_vocab={hp.n_vocab}'
|
||||
gguf_writer.add_token_list(tokens)
|
||||
gguf_writer.add_token_scores(scores)
|
||||
if len(toktypes) > 0:
|
||||
@@ -345,24 +259,27 @@ class GGMLToGGUF:
|
||||
gguf_writer.add_eos_token_id(2)
|
||||
|
||||
def add_tensors(self, gguf_writer):
|
||||
tensor_map = self.name_map
|
||||
nm = self.name_map
|
||||
data = self.data
|
||||
print(f'* Adding {len(self.model.tensors)} tensor(s)')
|
||||
for tensor in self.model.tensors:
|
||||
name = str(tensor.name, 'UTF-8')
|
||||
mapped_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
|
||||
if name.endswith('.weight'):
|
||||
name = name[:-7]
|
||||
suffix = '.weight'
|
||||
elif name.endswith('.bias'):
|
||||
name = name[:-5]
|
||||
suffix = '.bias'
|
||||
mapped_name = nm.get(name)
|
||||
assert mapped_name is not None, f'Bad name {name}'
|
||||
mapped_name += suffix
|
||||
tempdims = list(tensor.dims[:])
|
||||
if len(tempdims) > 1:
|
||||
temp = tempdims[1]
|
||||
tempdims[1] = tempdims[0]
|
||||
tempdims[0] = temp
|
||||
# print(f'+ {tensor.name} | {mapped_name} {tensor.dims} :: {tempdims}')
|
||||
gguf_writer.add_tensor(
|
||||
mapped_name,
|
||||
data[tensor.start_offset:tensor.start_offset + tensor.len_bytes],
|
||||
raw_shape = tempdims,
|
||||
raw_dtype = tensor.dtype )
|
||||
gguf_writer.add_tensor(mapped_name, data[tensor.start_offset:tensor.start_offset + tensor.len_bytes], raw_shape = tempdims, raw_dtype = tensor.dtype)
|
||||
|
||||
def handle_metadata(cfg, hp):
|
||||
import convert
|
||||
@@ -384,66 +301,43 @@ def handle_metadata(cfg, hp):
|
||||
params = convert.Params.loadOriginalParamsJson(fakemodel, orig_config_path)
|
||||
else:
|
||||
raise ValueError('Unable to load metadata')
|
||||
vocab = convert.load_vocab(
|
||||
cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir,
|
||||
cfg.vocabtype )
|
||||
# FIXME: Respect cfg.vocab_dir?
|
||||
svocab = gguf.SpecialVocab(cfg.model_metadata_dir)
|
||||
vocab = convert.load_vocab(cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir, cfg.vocabtype)
|
||||
convert.check_vocab_size(params, vocab)
|
||||
return (params, vocab, svocab)
|
||||
return (params, vocab)
|
||||
|
||||
def handle_args():
|
||||
parser = argparse.ArgumentParser(description = 'Convert GGML models to GGUF')
|
||||
parser.add_argument('--input', '-i', type = Path, required = True,
|
||||
help = 'Input GGMLv3 filename')
|
||||
parser.add_argument('--output', '-o', type = Path, required = True,
|
||||
help ='Output GGUF filename')
|
||||
parser.add_argument('--name',
|
||||
help = 'Set model name')
|
||||
parser.add_argument('--desc',
|
||||
help = 'Set model description')
|
||||
parser.add_argument('--gqa', type = int, default = 1,
|
||||
help = 'grouped-query attention factor (use 8 for LLaMA2 70B)')
|
||||
parser.add_argument('--eps', default = '5.0e-06',
|
||||
help = 'RMS norm eps: Use 1e-6 for LLaMA1 and OpenLLaMA, use 1e-5 for LLaMA2')
|
||||
parser.add_argument('--context-length', '-c', type=int, default = 2048,
|
||||
help = 'Default max context length: LLaMA1 is typically 2048, LLaMA2 is typically 4096')
|
||||
parser.add_argument('--model-metadata-dir', '-m', type = Path,
|
||||
help ='Load HuggingFace/.pth vocab and metadata from the specified directory')
|
||||
parser.add_argument("--vocab-dir", type=Path,
|
||||
help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir")
|
||||
parser.add_argument("--vocabtype", choices=["spm", "bpe"], default="spm",
|
||||
help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)")
|
||||
parser = argparse.ArgumentParser(description = 'Convert GGMLv3 models to GGUF')
|
||||
parser.add_argument('--input', '-i', type = Path, help = 'Input GGMLv3 filename')
|
||||
parser.add_argument('--output', '-o', type = Path, help ='Output GGUF filename')
|
||||
parser.add_argument('--name', help = 'Set model name')
|
||||
parser.add_argument('--desc', help = 'Set model description')
|
||||
parser.add_argument('--gqa', type = int, default = 1, help = 'grouped-query attention factor (use 8 for LLaMA2 70B)')
|
||||
parser.add_argument('--eps', default = '5.0e-06', help = 'RMS norm eps: Use 1e-6 for LLaMA1 and OpenLLaMA, use 1e-5 for LLaMA2')
|
||||
parser.add_argument('--context-length', '-c', type=int, default = 2048, help = 'Default max context length: LLaMA1 is typically 2048, LLaMA2 is typically 4096')
|
||||
parser.add_argument('--model-metadata-dir', '-m', type = Path, help ='Load HuggingFace/.pth vocab and metadata from the specified directory')
|
||||
parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir")
|
||||
parser.add_argument("--vocabtype", choices=["spm", "bpe"], help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)", default="spm")
|
||||
return parser.parse_args()
|
||||
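# Typical invocation (hypothetical filenames; the --eps/--gqa hints follow the
# help text above), assuming this script is convert-llama-ggml-to-gguf.py:
#   python convert-llama-ggml-to-gguf.py -i llama2-7b.ggmlv3.q4_0.bin -o llama2-7b.q4_0.gguf --eps 1e-5
#   python convert-llama-ggml-to-gguf.py -i llama2-70b.ggmlv3.q4_0.bin -o llama2-70b.q4_0.gguf --eps 1e-5 --gqa 8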
|
||||
def main():
|
||||
cfg = handle_args()
|
||||
print(f'* Using config: {cfg}')
|
||||
print('\n=== WARNING === Be aware that this conversion script is best-effort. Use a native GGUF model if possible. === WARNING ===\n')
|
||||
if cfg.model_metadata_dir is None and (cfg.gqa == 1 or cfg.eps == '5.0e-06'):
|
||||
print('- Note: If converting LLaMA2, specifying "--eps 1e-5" is required. 70B models also need "--gqa 8".')
|
||||
data = np.memmap(cfg.input, mode = 'r')
|
||||
model = GGMLModel()
|
||||
model = GGMLV3Model()
|
||||
print('* Scanning GGML input file')
|
||||
offset = model.load(data, 0)
|
||||
print(f'* GGML model hyperparameters: {model.hyperparameters}')
|
||||
vocab_override = None
|
||||
params_override = None
|
||||
special_vocab = None
|
||||
if cfg.model_metadata_dir is not None:
|
||||
(params_override, vocab_override, special_vocab) = handle_metadata(cfg, model.hyperparameters)
|
||||
(params_override, vocab_override) = handle_metadata(cfg, model.hyperparameters)
|
||||
print('!! Note: When overriding params the --gqa, --eps and --context-length options are ignored.')
|
||||
print(f'* Overriding params: {params_override}')
|
||||
print(f'* Overriding vocab: {vocab_override}')
|
||||
print(f'* Special vocab: {special_vocab}')
|
||||
else:
|
||||
print('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n')
|
||||
if model.file_format == GGMLFormat.GGML:
|
||||
print('! This is a very old GGML file that does not contain vocab scores. Strongly recommend using model metadata!')
|
||||
converter = GGMLToGGUF(model, data, cfg,
|
||||
params_override = params_override,
|
||||
vocab_override = vocab_override,
|
||||
special_vocab = special_vocab )
|
||||
converter = GGMLToGGUF(model, data, cfg, params_override = params_override, vocab_override = vocab_override)
|
||||
converter.save()
|
||||
print(f'* Successful completion. Output saved to: {cfg.output}')
|
||||
|
||||
convert-llama-hf-to-gguf.py (new executable file, 328 lines)
@@ -0,0 +1,328 @@
|
||||
#!/usr/bin/env python3
|
||||
# HF llama --> gguf conversion
|
||||
|
||||
import gguf
|
||||
import os
|
||||
import sys
|
||||
import struct
|
||||
import json
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
from typing import Any, List, Optional
|
||||
from pathlib import Path
|
||||
from sentencepiece import SentencePieceProcessor
|
||||
|
||||
#NDArray = np.ndarray[Any, Any]
|
||||
# compatible with python < 3.9
|
||||
NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]'
|
||||
|
||||
# reverse HF permute back to original pth layout
|
||||
# https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/convert_llama_weights_to_hf.py
|
||||
|
||||
|
||||
def reverse_hf_permute(weights: NDArray, n_head: int, n_kv_head: Optional[int] = None) -> NDArray:
|
||||
if n_kv_head is not None and n_head != n_kv_head:
|
||||
n_head //= n_kv_head
|
||||
|
||||
return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
|
||||
.swapaxes(1, 2)
|
||||
.reshape(weights.shape))
|
||||
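# reverse_hf_permute undoes the rotary interleave that transformers'
# convert_llama_weights_to_hf.py applies to the q/k projection weights.
# A cheap self-check on a toy array (the _demo_* names exist only for this
# sketch; numpy is already imported above as np):
def _demo_hf_permute(w, n_head):
    # forward permutation as applied on the HF side (sketch)
    return (w.reshape(n_head, w.shape[0] // n_head // 2, 2, *w.shape[1:])
            .swapaxes(1, 2)
            .reshape(w.shape))

_demo_w = np.arange(64, dtype=np.float32).reshape(8, 8)  # toy 8x8 weight, 2 heads
assert np.array_equal(reverse_hf_permute(_demo_hf_permute(_demo_w, 2), 2), _demo_w)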
|
||||
|
||||
def count_model_parts(dir_model: str) -> int:
|
||||
num_parts = 0
|
||||
|
||||
for filename in os.listdir(dir_model):
|
||||
if filename.startswith("pytorch_model-"):
|
||||
num_parts += 1
|
||||
|
||||
if num_parts > 0:
|
||||
print("gguf: found " + str(num_parts) + " model parts")
|
||||
|
||||
return num_parts
|
||||
|
||||
|
||||
if len(sys.argv) < 3:
|
||||
print("Usage: convert-h5-to-ggml.py dir-model ftype\n")
|
||||
print(" ftype == 0 -> float32")
|
||||
print(" ftype == 1 -> float16")
|
||||
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
# output in the same directory as the model
|
||||
dir_model = sys.argv[1]
|
||||
last_dir = os.path.basename(os.path.normpath(dir_model))
|
||||
|
||||
|
||||
# possible tensor data types
|
||||
# ftype == 0 -> float32
|
||||
# ftype == 1 -> float16
|
||||
|
||||
|
||||
# map from ftype to string
|
||||
ftype_str = ["f32", "f16"]
|
||||
|
||||
ftype = 1
|
||||
if len(sys.argv) > 2:
|
||||
ftype = int(sys.argv[2])
|
||||
if ftype < 0 or ftype > 1:
|
||||
print("Invalid ftype: " + str(ftype))
|
||||
|
||||
sys.exit(1)
|
||||
|
||||
fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".gguf"
|
||||
|
||||
print("gguf: loading model "+last_dir)
|
||||
|
||||
with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
|
||||
hparams = json.load(f)
|
||||
|
||||
if hparams["architectures"][0] != "LlamaForCausalLM":
|
||||
print("Model architecture not supported: " + hparams["architectures"][0])
|
||||
|
||||
sys.exit()
|
||||
|
||||
# get number of model parts
|
||||
num_parts = count_model_parts(dir_model)
|
||||
|
||||
ARCH=gguf.MODEL_ARCH.LLAMA
|
||||
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
|
||||
|
||||
print("gguf: get model metadata")
|
||||
|
||||
block_count = hparams["num_hidden_layers"]
|
||||
head_count = hparams["num_attention_heads"]
|
||||
|
||||
if "num_key_value_heads" in hparams:
|
||||
head_count_kv = hparams["num_key_value_heads"]
|
||||
else:
|
||||
head_count_kv = head_count
|
||||
|
||||
if "_name_or_path" in hparams:
|
||||
hf_repo = hparams["_name_or_path"]
|
||||
else:
|
||||
hf_repo = ""
|
||||
|
||||
if "max_sequence_length" in hparams:
|
||||
ctx_length = hparams["max_sequence_length"]
|
||||
elif "max_position_embeddings" in hparams:
|
||||
ctx_length = hparams["max_position_embeddings"]
|
||||
else:
|
||||
print("gguf: can not find ctx length parameter.")
|
||||
|
||||
sys.exit()
|
||||
|
||||
|
||||
gguf_writer.add_name(last_dir)
|
||||
gguf_writer.add_source_hf_repo(hf_repo)
|
||||
gguf_writer.add_tensor_data_layout("Meta AI original pth")
|
||||
gguf_writer.add_context_length(ctx_length)
|
||||
gguf_writer.add_embedding_length(hparams["hidden_size"])
|
||||
gguf_writer.add_block_count(block_count)
|
||||
gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
|
||||
gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
|
||||
gguf_writer.add_head_count(head_count)
|
||||
gguf_writer.add_head_count_kv(head_count_kv)
|
||||
gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
|
||||
|
||||
if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]:
|
||||
if "type" in hparams["rope_scaling"]:
|
||||
if hparams["rope_scaling"]["type"] == "linear":
|
||||
gguf_writer.add_rope_scale_linear(hparams["rope_scaling"]["factor"])
|
||||
|
||||
|
||||
# TOKENIZATION
|
||||
|
||||
print("gguf: get tokenizer metadata")
|
||||
|
||||
tokens: List[bytes] = []
|
||||
scores: List[float] = []
|
||||
toktypes: List[int] = []
|
||||
|
||||
if Path(dir_model + "/tokenizer.model").is_file():
|
||||
# vocab type sentencepiece
|
||||
print("gguf: get sentencepiece tokenizer vocab, scores and token types")
|
||||
|
||||
tokenizer = SentencePieceProcessor(dir_model + "/tokenizer.model")
|
||||
|
||||
for i in range(tokenizer.vocab_size()):
|
||||
text: bytes
|
||||
score: float
|
||||
|
||||
piece = tokenizer.id_to_piece(i)
|
||||
text = piece.encode("utf-8")
|
||||
score = tokenizer.get_score(i)
|
||||
|
||||
toktype = 1 # default to normal token type
|
||||
if tokenizer.is_unknown(i):
|
||||
toktype = 2
|
||||
if tokenizer.is_control(i):
|
||||
toktype = 3
|
||||
|
||||
# toktype = 4 is user-defined = tokens from added_tokens.json
|
||||
|
||||
if tokenizer.is_unused(i):
|
||||
toktype = 5
|
||||
if tokenizer.is_byte(i):
|
||||
toktype = 6
|
||||
|
||||
tokens.append(text)
|
||||
scores.append(score)
|
||||
toktypes.append(toktype)
|
||||
|
||||
if Path(dir_model + "/added_tokens.json").is_file():
|
||||
with open(dir_model + "/added_tokens.json", "r", encoding="utf-8") as f:
|
||||
addtokens_json = json.load(f)
|
||||
|
||||
print("gguf: get added tokens")
|
||||
|
||||
for key in addtokens_json:
|
||||
tokens.append( key.encode("utf-8") )
|
||||
scores.append(-1000.0)
|
||||
toktypes.append(4) # user-defined token type
|
||||
|
||||
|
||||
gguf_writer.add_tokenizer_model("llama")
|
||||
gguf_writer.add_token_list(tokens)
|
||||
gguf_writer.add_token_scores(scores)
|
||||
gguf_writer.add_token_types(toktypes)
|
||||
|
||||
|
||||
print("gguf: get special token ids")
|
||||
|
||||
if Path(dir_model + "/tokenizer.json").is_file():
|
||||
# Look for special tokens in tokenizer.json if it exists
|
||||
|
||||
with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f:
|
||||
tokenizer = json.load(f)
|
||||
|
||||
if "added_tokens" in tokenizer and Path(dir_model + "/tokenizer_config.json").is_file():
|
||||
|
||||
with open(dir_model + "/tokenizer_config.json", "r", encoding="utf-8") as f:
|
||||
tokenizer_config = json.load(f)
|
||||
|
||||
if "bos_token" in tokenizer_config and tokenizer_config["bos_token"] != None:
|
||||
for key in tokenizer["added_tokens"]:
|
||||
if key["content"] == tokenizer_config["bos_token"]["content"]:
|
||||
gguf_writer.add_bos_token_id(key["id"])
|
||||
|
||||
if "eos_token" in tokenizer_config and tokenizer_config["eos_token"] != None:
|
||||
for key in tokenizer["added_tokens"]:
|
||||
if key["content"] == tokenizer_config["eos_token"]["content"]:
|
||||
gguf_writer.add_eos_token_id(key["id"])
|
||||
|
||||
if "unk_token" in tokenizer_config and tokenizer_config["unk_token"] != None:
|
||||
for key in tokenizer["added_tokens"]:
|
||||
if key["content"] == tokenizer_config["unk_token"]["content"]:
|
||||
gguf_writer.add_unk_token_id(key["id"])
|
||||
|
||||
if "sep_token" in tokenizer_config and tokenizer_config["sep_token"] != None:
|
||||
for key in tokenizer["added_tokens"]:
|
||||
if key["content"] == tokenizer_config["sep_token"]["content"]:
|
||||
gguf_writer.add_sep_token_id(key["id"])
|
||||
|
||||
if "pad_token" in tokenizer_config and tokenizer_config["pad_token"] != None:
|
||||
for key in tokenizer["added_tokens"]:
|
||||
if key["content"] == tokenizer_config["pad_token"]["content"]:
|
||||
gguf_writer.add_pad_token_id(key["id"])
|
||||
else:
|
||||
# If no tokenizer.json: Look for special tokens in config.json
|
||||
|
||||
if "bos_token_id" in hparams and hparams["bos_token_id"] != None:
|
||||
gguf_writer.add_bos_token_id(hparams["bos_token_id"])
|
||||
|
||||
if "eos_token_id" in hparams and hparams["eos_token_id"] != None:
|
||||
gguf_writer.add_eos_token_id(hparams["eos_token_id"])
|
||||
|
||||
if "unk_token_id" in hparams and hparams["unk_token_id"] != None:
|
||||
gguf_writer.add_unk_token_id(hparams["unk_token_id"])
|
||||
|
||||
if "sep_token_id" in hparams and hparams["sep_token_id"] != None:
|
||||
gguf_writer.add_sep_token_id(hparams["sep_token_id"])
|
||||
|
||||
if "pad_token_id" in hparams and hparams["pad_token_id"] != None:
|
||||
gguf_writer.add_pad_token_id(hparams["pad_token_id"])
|
||||
|
||||
|
||||
# TENSORS
|
||||
|
||||
tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
|
||||
|
||||
# tensor info
|
||||
print("gguf: get tensor metadata")
|
||||
|
||||
if num_parts == 0:
|
||||
part_names = ("pytorch_model.bin",)
|
||||
else:
|
||||
part_names = (
|
||||
f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
|
||||
)
|
||||
|
||||
for part_name in part_names:
|
||||
print("gguf: loading model part '" + part_name + "'")
|
||||
model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
|
||||
|
||||
for name in model_part.keys():
|
||||
data = model_part[name]
|
||||
|
||||
# we don't need these
|
||||
if name.endswith(".rotary_emb.inv_freq"):
|
||||
continue
|
||||
|
||||
old_dtype = data.dtype
|
||||
|
||||
# convert any unsupported data types to float32
|
||||
if data.dtype != torch.float16 and data.dtype != torch.float32:
|
||||
data = data.to(torch.float32)
|
||||
|
||||
data = data.squeeze().numpy()
|
||||
|
||||
# reverse permute these
|
||||
if name.endswith(".q_proj.weight"):
|
||||
data = reverse_hf_permute(data, head_count)
|
||||
if name.endswith(".k_proj.weight"):
|
||||
data = reverse_hf_permute(data, head_count, head_count_kv)
|
||||
|
||||
# map tensor names
|
||||
if name.endswith(".weight") and name[:-7] in tensor_map:
|
||||
name = tensor_map[name[:-7]] + ".weight"
|
||||
elif name.endswith(".bias") and name[:-5] in tensor_map:
|
||||
name = tensor_map[name[:-5]] + ".bias"
|
||||
else:
|
||||
print("Can not map tensor '" + name + "'")
|
||||
sys.exit()
|
||||
|
||||
n_dims = len(data.shape)
|
||||
data_dtype = data.dtype
|
||||
|
||||
# if f32 desired, convert any float16 to float32
|
||||
if ftype == 0 and data_dtype == np.float16:
|
||||
data = data.astype(np.float32)
|
||||
|
||||
# TODO: Why can't we use these float16 values as-is? There should be no reason to store float16 as float32
|
||||
if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
|
||||
data = data.astype(np.float32)
|
||||
|
||||
# if f16 desired, convert any float32 2-dim weight tensors to float16
|
||||
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
|
||||
data = data.astype(np.float16)
|
||||
|
||||
print(name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
|
||||
|
||||
gguf_writer.add_tensor(name, data)
|
||||
|
||||
|
||||
print("gguf: write header")
|
||||
gguf_writer.write_header_to_file()
|
||||
print("gguf: write metadata")
|
||||
gguf_writer.write_kv_data_to_file()
|
||||
print("gguf: write tensors")
|
||||
gguf_writer.write_tensors_to_file()
|
||||
|
||||
gguf_writer.close()
|
||||
|
||||
|
||||
print("gguf: model successfully exported to '" + fname_out + "'")
|
||||
print("")
|
||||
@@ -1,17 +1,15 @@
|
||||
#!/usr/bin/env python3
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import struct
|
||||
import sys
|
||||
from typing import Any, BinaryIO, Sequence
|
||||
from typing import Any, Dict, Sequence, TextIO
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1}
|
||||
NUMPY_TYPE_TO_FTYPE: Dict[str, int] = {"float32": 0, "float16": 1}
|
||||
|
||||
|
||||
HF_SUBLAYER_TO_GGML = {
|
||||
@@ -48,7 +46,7 @@ def translate_tensor_name(t: str) -> str:
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
def write_file_header(fout: BinaryIO, params: dict[str, Any]) -> None:
|
||||
def write_file_header(fout: TextIO, params: Dict[str, Any]) -> None:
|
||||
fout.write(b"ggla"[::-1]) # magic (ggml lora)
|
||||
fout.write(struct.pack("i", 1)) # file version
|
||||
fout.write(struct.pack("i", params["r"]))
|
||||
@@ -62,7 +60,7 @@ def write_file_header(fout: BinaryIO, params: dict[str, Any]) -> None:
|
||||
|
||||
|
||||
def write_tensor_header(
|
||||
self, name: str, shape: Sequence[int], data_type: np.dtype[Any]
|
||||
self, name: str, shape: Sequence[int], data_type: np.dtype
|
||||
) -> None:
|
||||
sname = name.encode("utf-8")
|
||||
fout.write(
|
||||
|
||||
@@ -1,248 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
# HF starcoder --> gguf conversion
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import struct
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from transformers import AutoTokenizer # type: ignore[import]
|
||||
|
||||
if 'NO_LOCAL_GGUF' not in os.environ:
|
||||
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
|
||||
import gguf
|
||||
|
||||
|
||||
def bytes_to_unicode():
|
||||
# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
|
||||
"""
|
||||
Returns a list of utf-8 bytes and a corresponding list of unicode strings.
|
||||
The reversible bpe codes work on unicode strings.
|
||||
This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
|
||||
When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
|
||||
This is a significant percentage of your normal, say, 32K bpe vocab.
|
||||
To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
|
||||
And avoids mapping to whitespace/control characters the bpe code barfs on.
|
||||
"""
|
||||
bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
|
||||
cs = bs[:]
|
||||
n = 0
|
||||
for b in range(2**8):
|
||||
if b not in bs:
|
||||
bs.append(b)
|
||||
cs.append(2**8+n)
|
||||
n += 1
|
||||
return dict(zip(bs, (chr(n) for n in cs)))
|
||||
|
||||
|
||||
def count_model_parts(dir_model: Path) -> int:
|
||||
num_parts = 0
|
||||
for filename in os.listdir(dir_model):
|
||||
if filename.startswith("pytorch_model-"):
|
||||
num_parts += 1
|
||||
|
||||
if num_parts > 0:
|
||||
print("gguf: found " + str(num_parts) + " model parts")
|
||||
return num_parts
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
|
||||
parser = argparse.ArgumentParser(description="Convert a StarCoder model to a GGML compatible file")
|
||||
parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
|
||||
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
|
||||
parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)")
|
||||
parser.add_argument("ftype", type=int, help="output format - use 0 for float32, 1 for float16", choices=[0, 1], default = 1)
|
||||
return parser.parse_args()
|
||||
|
||||
args = parse_args()
|
||||
|
||||
dir_model = args.model
|
||||
ftype = args.ftype
|
||||
if not dir_model.is_dir():
|
||||
print(f'Error: {args.model} is not a directory', file = sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
# possible tensor data types
|
||||
# ftype == 0 -> float32
|
||||
# ftype == 1 -> float16
|
||||
|
||||
# map from ftype to string
|
||||
ftype_str = ["f32", "f16"]
|
||||
|
||||
if args.outfile is not None:
|
||||
fname_out = args.outfile
|
||||
else:
|
||||
# output in the same directory as the model by default
|
||||
fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
|
||||
|
||||
print("gguf: loading model "+dir_model.name)
|
||||
|
||||
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
|
||||
hparams = json.load(f)
|
||||
|
||||
if hparams["architectures"][0] != "GPTBigCodeForCausalLM":
|
||||
print("Model architecture not supported: " + hparams["architectures"][0])
|
||||
|
||||
sys.exit(1)
|
||||
|
||||
# get number of model parts
|
||||
num_parts = count_model_parts(dir_model)
|
||||
|
||||
ARCH=gguf.MODEL_ARCH.STARCODER
|
||||
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
|
||||
|
||||
print("gguf: get model metadata")
|
||||
|
||||
block_count = hparams["n_layer"]
|
||||
|
||||
gguf_writer.add_name("StarCoder")
|
||||
gguf_writer.add_context_length(hparams["n_positions"])
|
||||
gguf_writer.add_embedding_length(hparams["n_embd"])
|
||||
gguf_writer.add_feed_forward_length(4 * hparams["n_embd"])
|
||||
gguf_writer.add_block_count(block_count)
|
||||
gguf_writer.add_head_count(hparams["n_head"])
|
||||
gguf_writer.add_head_count_kv(1)
|
||||
gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
|
||||
gguf_writer.add_file_type(ftype)
|
||||
|
||||
# TOKENIZATION
|
||||
|
||||
print("gguf: get tokenizer metadata")
|
||||
|
||||
tokens: list[bytearray] = []
|
||||
scores: list[float] = []
|
||||
toktypes: list[int] = []
|
||||
|
||||
tokenizer_json_file = dir_model / 'tokenizer.json'
|
||||
if not tokenizer_json_file.is_file():
|
||||
print(f'Error: Missing {tokenizer_json_file}', file = sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
# gpt2 tokenizer
|
||||
gguf_writer.add_tokenizer_model("gpt2")
|
||||
|
||||
with open(tokenizer_json_file, "r", encoding="utf-8") as f:
|
||||
tokenizer_json = json.load(f)
|
||||
|
||||
print("gguf: get gpt2 tokenizer vocab")
|
||||
|
||||
# The number of tokens in tokenizer.json can differ from the expected vocab size.
|
||||
# This causes downstream issues with mismatched tensor sizes when running the inference
|
||||
vocab_size = hparams["vocab_size"] if "vocab_size" in hparams else len(tokenizer_json["model"]["vocab"])
|
||||
|
||||
# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
|
||||
tokenizer = AutoTokenizer.from_pretrained(dir_model)
|
||||
|
||||
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
|
||||
byte_encoder = bytes_to_unicode()
|
||||
byte_decoder = {v: k for k, v in byte_encoder.items()}
|
||||
|
||||
for i in range(vocab_size):
|
||||
if i in reverse_vocab:
|
||||
try:
|
||||
text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
|
||||
except KeyError:
|
||||
text = bytearray()
|
||||
for c in reverse_vocab[i]:
|
||||
if ord(c) < 256: # single byte character
|
||||
text.append(byte_decoder[ord(c)])
|
||||
else: # multibyte special token character
|
||||
text.extend(c.encode('utf-8'))
|
||||
else:
|
||||
print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
|
||||
pad_token = f"[PAD{i}]".encode("utf8")
|
||||
text = bytearray(pad_token)
|
||||
|
||||
tokens.append(text)
|
||||
scores.append(0.0) # dummy
|
||||
toktypes.append(gguf.TokenType.NORMAL) # dummy
|
||||
|
||||
gguf_writer.add_token_list(tokens)
|
||||
gguf_writer.add_token_scores(scores)
|
||||
gguf_writer.add_token_types(toktypes)
|
||||
|
||||
special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
|
||||
special_vocab.add_to_gguf(gguf_writer)
|
||||
|
||||
# TENSORS
|
||||
|
||||
tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
|
||||
|
||||
# params for qkv transform
|
||||
n_head = hparams["n_head"]
|
||||
n_head_kv = hparams["n_head_kv"] if "n_head_kv" in hparams else 1
|
||||
|
||||
head_dim = hparams["n_embd"] // n_head
|
||||
|
||||
# tensor info
|
||||
print("gguf: get tensor metadata")
|
||||
|
||||
if num_parts == 0:
|
||||
part_names = iter(("pytorch_model.bin",))
|
||||
else:
|
||||
part_names = (
|
||||
f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
|
||||
)
|
||||
|
||||
for part_name in part_names:
|
||||
if args.vocab_only:
|
||||
break
|
||||
print("gguf: loading model part '" + part_name + "'")
|
||||
model_part = torch.load(dir_model / part_name, map_location="cpu")
|
||||
|
||||
for name in model_part.keys():
|
||||
data = model_part[name]
|
||||
|
||||
old_dtype = data.dtype
|
||||
|
||||
# convert any unsupported data types to float32
|
||||
if data.dtype != torch.float16 and data.dtype != torch.float32:
|
||||
data = data.to(torch.float32)
|
||||
|
||||
data = data.squeeze().numpy()
|
||||
|
||||
# map tensor names
|
||||
new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
|
||||
if new_name is None:
|
||||
print("Can not map tensor '" + name + "'")
|
||||
sys.exit()
|
||||
|
||||
n_dims = len(data.shape)
|
||||
data_dtype = data.dtype
|
||||
|
||||
# if f32 desired, convert any float16 to float32
|
||||
if ftype == 0 and data_dtype == np.float16:
|
||||
data = data.astype(np.float32)
|
||||
|
||||
# TODO: Why can't we use these float16 values as-is? There should be no reason to store float16 as float32
|
||||
if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
|
||||
data = data.astype(np.float32)
|
||||
|
||||
# if f16 desired, convert any float32 2-dim weight tensors to float16
|
||||
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
|
||||
data = data.astype(np.float16)
|
||||
|
||||
print(name, "=>", new_name + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype))
|
||||
|
||||
gguf_writer.add_tensor(new_name, data)
|
||||
|
||||
|
||||
print("gguf: write header")
|
||||
gguf_writer.write_header_to_file()
|
||||
print("gguf: write metadata")
|
||||
gguf_writer.write_kv_data_to_file()
|
||||
if not args.vocab_only:
|
||||
print("gguf: write tensors")
|
||||
gguf_writer.write_tensors_to_file()
|
||||
|
||||
gguf_writer.close()
|
||||
|
||||
print(f"gguf: model successfully exported to '{fname_out}'")
|
||||
print("")
|
||||
convert.py (350 lines)
@@ -1,8 +1,9 @@
|
||||
#!/usr/bin/env python3
|
||||
from __future__ import annotations
|
||||
|
||||
import gguf
|
||||
import argparse
|
||||
import concurrent.futures
|
||||
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
|
||||
import copy
|
||||
import enum
|
||||
import faulthandler
|
||||
@@ -19,27 +20,21 @@ import struct
|
||||
import sys
|
||||
import time
|
||||
import zipfile
|
||||
import numpy as np
|
||||
|
||||
from abc import ABCMeta, abstractmethod
|
||||
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import IO, TYPE_CHECKING, Any, Callable, Generator, Iterable, Literal, Sequence, TypeVar
|
||||
|
||||
import numpy as np
|
||||
from sentencepiece import SentencePieceProcessor # type: ignore[import]
|
||||
|
||||
import os
|
||||
if 'NO_LOCAL_GGUF' not in os.environ:
|
||||
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
|
||||
import gguf
|
||||
from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Generator, Iterable, List, Literal, Optional, Sequence, Set, Tuple, TypeVar, Union)
|
||||
from sentencepiece import SentencePieceProcessor # type: ignore
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from typing import TypeAlias
|
||||
from typing_extensions import TypeAlias
|
||||
|
||||
if hasattr(faulthandler, 'register') and hasattr(signal, 'SIGUSR1'):
|
||||
faulthandler.register(signal.SIGUSR1)
|
||||
|
||||
NDArray: TypeAlias = 'np.ndarray[Any, Any]'
|
||||
NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]'
|
||||
|
||||
ARCH=gguf.MODEL_ARCH.LLAMA
|
||||
NAMES=gguf.MODEL_TENSOR_NAMES[ARCH]
|
||||
@@ -52,8 +47,8 @@ DEFAULT_CONCURRENCY = 8
|
||||
@dataclass(frozen=True)
|
||||
class DataType:
|
||||
name: str
|
||||
dtype: np.dtype[Any]
|
||||
valid_conversions: list[str]
|
||||
dtype: 'np.dtype[Any]'
|
||||
valid_conversions: List[str]
|
||||
|
||||
def elements_to_bytes(self, n_elements: int) -> int:
|
||||
return n_elements * self.dtype.itemsize
|
||||
@@ -70,7 +65,7 @@ DT_BF16 = UnquantizedDataType('BF16', dtype = np.dtype(np.uint16), valid_convers
|
||||
@dataclass(frozen=True)
|
||||
class QuantizedDataType(DataType):
|
||||
block_size: int
|
||||
quantized_dtype: np.dtype[Any]
|
||||
quantized_dtype: 'np.dtype[Any]'
|
||||
ggml_type: gguf.GGMLQuantizationType
|
||||
|
||||
def quantize(self, arr: NDArray) -> NDArray:
|
||||
@@ -89,7 +84,7 @@ class Q8_0QuantizedDataType(QuantizedDataType):
|
||||
n_blocks = arr.size // self.block_size
|
||||
blocks = arr.reshape((n_blocks, self.block_size))
|
||||
# Much faster implementation of block quantization contributed by @Cebtenzzre
|
||||
def quantize_blocks_q8_0(blocks: NDArray) -> Iterable[tuple[Any, Any]]:
|
||||
def quantize_blocks_q8_0(blocks: NDArray) -> Iterable[Tuple[Any, Any]]:
|
||||
d = abs(blocks).max(axis = 1) / np.float32(127)
|
||||
with np.errstate(divide = 'ignore'):
|
||||
qs = (blocks / d[:, None]).round()
|
||||
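For reference, the Q8_0 scheme implemented by quantize_blocks_q8_0 stores one float16 scale per 32-element block, d = max|x| / 127, plus 32 signed 8-bit values q_i = round(x_i / d); dequantization is simply d * q_i. A minimal single-block sketch (the helper names here are illustrative, not part of convert.py):

import numpy as np

def q8_0_quantize_block(x: np.ndarray):
    # x: one block of 32 float32 values
    amax = np.abs(x).max()
    d = np.float32(amax / 127) if amax > 0 else np.float32(0)
    qs = np.zeros(32, dtype=np.int8) if d == 0 else (x / d).round().astype(np.int8)
    return d, qs

def q8_0_dequantize_block(d: np.float32, qs: np.ndarray) -> np.ndarray:
    # reconstruct the block approximately: each element is scale * quantized value
    return d * qs.astype(np.float32)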
@@ -103,13 +98,13 @@ DT_Q8_0 = Q8_0QuantizedDataType('Q8_0',
|
||||
quantized_dtype = np.dtype([('d', '<f2'), ('qs', 'i1', (32,))]))
|
||||
|
||||
# Quantized types skipped here because they may also map to np.float32
|
||||
NUMPY_TYPE_TO_DATA_TYPE: dict[np.dtype[Any], DataType] = {}
|
||||
NUMPY_TYPE_TO_DATA_TYPE: Dict['np.dtype[Any]', DataType] = {}
|
||||
for dt in (DT_BF16, DT_F16, DT_F32, DT_I32):
|
||||
if dt.dtype in NUMPY_TYPE_TO_DATA_TYPE:
|
||||
raise ValueError(f'Invalid duplicate data type {dt}')
|
||||
NUMPY_TYPE_TO_DATA_TYPE[dt.dtype] = dt
|
||||
|
||||
SAFETENSORS_DATA_TYPES: dict[str, DataType] = {
|
||||
SAFETENSORS_DATA_TYPES: Dict[str, DataType] = {
|
||||
'BF16': DT_BF16,
|
||||
'F16': DT_F16,
|
||||
'F32': DT_F32,
|
||||
@@ -124,14 +119,14 @@ class GGMLFileType(enum.IntEnum):
|
||||
MostlyF16 = 1 # except 1d tensors
|
||||
MostlyQ8_0 = 7 # except 1d tensors
|
||||
|
||||
def type_for_tensor(self, name: str, tensor: LazyTensor) -> DataType:
|
||||
def type_for_tensor(self, name: str, tensor: 'LazyTensor') -> DataType:
|
||||
dt = GGML_FILE_TYPE_TO_DATA_TYPE.get(self)
|
||||
if dt is None:
|
||||
raise ValueError(self)
|
||||
# 1D tensors are always F32.
|
||||
return dt if len(tensor.shape) > 1 else DT_F32
|
||||
|
||||
GGML_FILE_TYPE_TO_DATA_TYPE: dict[GGMLFileType, DataType] = {
|
||||
GGML_FILE_TYPE_TO_DATA_TYPE: Dict[GGMLFileType, DataType] = {
|
||||
GGMLFileType.AllF32 : DT_F32,
|
||||
GGMLFileType.MostlyF16 : DT_F16,
|
||||
GGMLFileType.MostlyQ8_0: DT_Q8_0,
|
||||
@@ -145,6 +140,7 @@ GGML_FILE_TYPE_TO_DATA_TYPE: dict[GGMLFileType, DataType] = {
|
||||
class Params:
|
||||
n_vocab: int
|
||||
n_embd: int
|
||||
n_mult: int
|
||||
n_layer: int
|
||||
n_ctx: int
|
||||
n_ff: int
|
||||
@@ -152,16 +148,25 @@ class Params:
|
||||
n_head_kv: int
|
||||
f_norm_eps: float
|
||||
|
||||
f_rope_freq_base: float | None = None
|
||||
f_rope_scale: float | None = None
|
||||
f_rope_freq_base: Optional[float] = None
|
||||
f_rope_scale: Optional[float] = None
|
||||
|
||||
ftype: GGMLFileType | None = None
|
||||
ftype: Optional[GGMLFileType] = None
|
||||
|
||||
# path to the directory containing the model files
|
||||
path_model: Path | None = None
|
||||
path_model: Optional['Path'] = None
|
||||
|
||||
@staticmethod
|
||||
def guessed(model: LazyModel) -> Params:
|
||||
def find_n_mult(n_ff: int, n_embd: int) -> int:
|
||||
# hardcoded magic range
|
||||
for n_mult in range(8192, 1, -1):
|
||||
calc_ff = (((8*n_embd) // 3 + n_mult - 1) // n_mult)*n_mult
|
||||
if calc_ff == n_ff:
|
||||
return n_mult
|
||||
raise Exception(f"failed to find n_mult for (n_ff={n_ff}, n_embd={n_embd}).")
|
||||
|
||||
@staticmethod
|
||||
def guessed(model: 'LazyModel') -> 'Params':
|
||||
# try transformer naming first
|
||||
n_vocab, n_embd = model["model.embed_tokens.weight"].shape if "model.embed_tokens.weight" in model else model["tok_embeddings.weight"].shape
|
||||
|
||||
@@ -187,6 +192,7 @@ class Params:
|
||||
return Params(
|
||||
n_vocab = n_vocab,
|
||||
n_embd = n_embd,
|
||||
n_mult = n_mult,
|
||||
n_layer = n_layer,
|
||||
n_ctx = -1,
|
||||
n_ff = n_ff,
|
||||
@@ -196,7 +202,7 @@ class Params:
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params:
|
||||
def loadHFTransformerJson(model: 'LazyModel', config_path: 'Path') -> 'Params':
|
||||
config = json.load(open(config_path))
|
||||
|
||||
n_vocab = config["vocab_size"]
|
||||
@@ -214,6 +220,8 @@ class Params:
|
||||
else:
|
||||
f_rope_scale = None
|
||||
|
||||
n_mult = Params.find_n_mult(n_ff, n_embd)
|
||||
|
||||
if "max_sequence_length" in config:
|
||||
n_ctx = config["max_sequence_length"]
|
||||
elif "max_position_embeddings" in config:
|
||||
@@ -225,6 +233,7 @@ class Params:
|
||||
return Params(
|
||||
n_vocab = n_vocab,
|
||||
n_embd = n_embd,
|
||||
n_mult = n_mult,
|
||||
n_layer = n_layer,
|
||||
n_ctx = n_ctx,
|
||||
n_ff = n_ff,
|
||||
@@ -236,14 +245,15 @@ class Params:
|
||||
)
|
||||
|
||||
# LLaMA v2 70B params.json
|
||||
# {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1}
|
||||
# {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1
|
||||
@staticmethod
|
||||
def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params:
|
||||
def loadOriginalParamsJson(model: 'LazyModel', config_path: 'Path') -> 'Params':
|
||||
config = json.load(open(config_path))
|
||||
|
||||
n_vocab = config["vocab_size"] if "vocab_size" in config else -1
|
||||
n_embd = config["dim"]
|
||||
n_layer = config["n_layers"]
|
||||
n_mult = config["multiple_of"]
|
||||
n_ff = -1
|
||||
n_head = config["n_heads"]
|
||||
n_head_kv = config["n_kv_heads"] if "n_kv_heads" in config else n_head
|
||||
@@ -251,7 +261,7 @@ class Params:
|
||||
f_rope_freq_base = config["rope_theta"] if "rope_theta" in config else None
|
||||
|
||||
# hack to determine LLaMA v1 vs v2 vs CodeLlama
|
||||
if f_rope_freq_base == 1000000:
|
||||
if f_rope_freq_base and f_rope_freq_base == 1000000:
|
||||
# CodeLlama
|
||||
n_ctx = 16384
|
||||
elif config["norm_eps"] == 1e-05:
|
||||
@@ -270,6 +280,7 @@ class Params:
|
||||
return Params(
|
||||
n_vocab = n_vocab,
|
||||
n_embd = n_embd,
|
||||
n_mult = n_mult,
|
||||
n_layer = n_layer,
|
||||
n_ctx = n_ctx,
|
||||
n_ff = n_ff,
|
||||
@@ -280,7 +291,7 @@ class Params:
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def load(model_plus: ModelPlus) -> Params:
|
||||
def load(model_plus: 'ModelPlus') -> 'Params':
|
||||
hf_config_path = model_plus.paths[0].parent / "config.json"
|
||||
orig_config_path = model_plus.paths[0].parent / "params.json"
|
||||
|
||||
@@ -288,10 +299,8 @@ class Params:
|
||||
params = Params.loadHFTransformerJson(model_plus.model, hf_config_path)
|
||||
elif orig_config_path.exists():
|
||||
params = Params.loadOriginalParamsJson(model_plus.model, orig_config_path)
|
||||
elif model_plus.format != 'none':
|
||||
params = Params.guessed(model_plus.model)
|
||||
else:
|
||||
raise ValueError('Cannot guess params when model format is none')
|
||||
params = Params.guessed(model_plus.model)
|
||||
|
||||
params.path_model = model_plus.paths[0].parent
|
||||
|
||||
@@ -303,31 +312,19 @@ class Params:
|
||||
#
|
||||
|
||||
class BpeVocab:
|
||||
def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
|
||||
def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) -> None:
|
||||
self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read())
|
||||
added_tokens: dict[str, int]
|
||||
added_tokens: Dict[str, int]
|
||||
if fname_added_tokens is not None:
|
||||
# FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
|
||||
added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
|
||||
else:
|
||||
# Fall back to trying to find the added tokens in tokenizer.json
|
||||
tokenizer_json_file = fname_tokenizer.parent / 'tokenizer.json'
|
||||
if not tokenizer_json_file.is_file():
|
||||
added_tokens = {}
|
||||
else:
|
||||
tokenizer_json = json.load(open(tokenizer_json_file, encoding="utf-8"))
|
||||
added_tokens = dict(
|
||||
(item['content'], item['id'])
|
||||
for item in tokenizer_json.get('added_tokens', [])
|
||||
# Added tokens here can be duplicates of the main vocabulary.
|
||||
if item['content'] not in self.bpe_tokenizer )
|
||||
added_tokens = {}
|
||||
|
||||
vocab_size: int = len(self.bpe_tokenizer)
|
||||
expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
|
||||
actual_ids = sorted(added_tokens.values())
|
||||
if expected_ids != actual_ids:
|
||||
expected_end_id = vocab_size + len(actual_ids) - 1
|
||||
raise Exception(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range {vocab_size} - {expected_end_id}; got {actual_ids}")
|
||||
raise Exception(f"Expected added token IDs to be sequential and start at {len(added_tokens)}; got {actual_ids}")
|
||||
|
||||
items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
|
||||
self.added_tokens_list = [text for (text, idx) in items]
|
||||
@@ -336,45 +333,33 @@ class BpeVocab:
|
||||
self.fname_tokenizer = fname_tokenizer
|
||||
self.fname_added_tokens = fname_added_tokens
|
||||
|
||||
def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
||||
def bpe_tokens(self) -> Iterable[Tuple[bytes, float, gguf.TokenType]]:
|
||||
tokenizer = self.bpe_tokenizer
|
||||
from transformers.models.gpt2 import tokenization_gpt2 # type: ignore[import]
|
||||
from transformers.models.gpt2 import tokenization_gpt2
|
||||
byte_encoder = tokenization_gpt2.bytes_to_unicode()
|
||||
byte_decoder = {v: k for k, v in byte_encoder.items()}
|
||||
score = 0.0
|
||||
for i, item in enumerate(tokenizer):
|
||||
text: bytes = item.encode("utf-8")
|
||||
# FIXME: These shouldn't be hardcoded, but it's probably better than the current behavior?
|
||||
if i <= 258 and text.startswith(b'<') and text.endswith(b'>'):
|
||||
if i == 0 and text == b'<unk>':
|
||||
toktype = gguf.TokenType.UNKNOWN
|
||||
elif i == 1 or i == 2:
|
||||
toktype = gguf.TokenType.CONTROL
|
||||
elif i >= 3 and text.startswith(b'<0x'):
|
||||
toktype = gguf.TokenType.BYTE
|
||||
else:
|
||||
toktype = gguf.TokenType.NORMAL
|
||||
else:
|
||||
toktype = gguf.TokenType.NORMAL
|
||||
yield text, score, toktype
|
||||
score: float = -i
|
||||
yield text, score, gguf.TokenType.USER_DEFINED
|
||||
|
||||
def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
||||
def added_tokens(self) -> Iterable[Tuple[bytes, float, gguf.TokenType]]:
|
||||
for text in self.added_tokens_list:
|
||||
score = -1000.0
|
||||
yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED
|
||||
|
||||
def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
||||
def all_tokens(self) -> Iterable[Tuple[bytes, float, gguf.TokenType]]:
|
||||
yield from self.bpe_tokens()
|
||||
yield from self.added_tokens()
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return f"<BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
|
||||
return f"BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
|
||||
|
||||
|
||||
class SentencePieceVocab:
|
||||
def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
|
||||
def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) -> None:
|
||||
self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
|
||||
added_tokens: dict[str, int]
|
||||
added_tokens: Dict[str, int]
|
||||
if fname_added_tokens is not None:
|
||||
added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
|
||||
else:
|
||||
@@ -393,7 +378,7 @@ class SentencePieceVocab:
|
||||
self.fname_tokenizer = fname_tokenizer
|
||||
self.fname_added_tokens = fname_added_tokens
|
||||
|
||||
def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
||||
def sentencepiece_tokens(self) -> Iterable[Tuple[bytes, float, gguf.TokenType]]:
|
||||
tokenizer = self.sentencepiece_tokenizer
|
||||
for i in range(tokenizer.vocab_size()):
|
||||
piece = tokenizer.id_to_piece(i)
|
||||
@@ -417,19 +402,20 @@ class SentencePieceVocab:
|
||||
|
||||
yield text, score, toktype
|
||||
|
||||
def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
||||
def added_tokens(self) -> Iterable[Tuple[bytes, float, gguf.TokenType]]:
|
||||
for text in self.added_tokens_list:
|
||||
score = -1000.0
|
||||
yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED
|
||||
|
||||
def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
||||
def all_tokens(self) -> Iterable[Tuple[bytes, float, gguf.TokenType]]:
|
||||
yield from self.sentencepiece_tokens()
|
||||
yield from self.added_tokens()
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
|
||||
|
||||
Vocab: TypeAlias = 'BpeVocab | SentencePieceVocab'
|
||||
Vocab = Union[BpeVocab, SentencePieceVocab]
|
||||
|
||||
|
||||
#
|
||||
# data loading
|
||||
@@ -449,18 +435,18 @@ class Tensor(metaclass=ABCMeta):
|
||||
data_type: DataType
|
||||
|
||||
@abstractmethod
|
||||
def astype(self, data_type: DataType) -> Tensor: ...
|
||||
def astype(self, data_type: DataType) -> 'Tensor': ...
|
||||
@abstractmethod
|
||||
def permute(self, n_head: int, n_head_kv: int) -> Tensor: ...
|
||||
def permute(self, n_head: int, n_head_kv: int) -> 'Tensor': ...
|
||||
@abstractmethod
|
||||
def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> UnquantizedTensor: ...
|
||||
def permute_part(self, n_part: int, n_head: int) -> 'UnquantizedTensor': ...
|
||||
@abstractmethod
|
||||
def part(self, n_part: int) -> UnquantizedTensor: ...
|
||||
def part(self, n_part: int) -> 'UnquantizedTensor': ...
|
||||
@abstractmethod
|
||||
def to_ggml(self) -> GGMLCompatibleTensor: ...
|
||||
def to_ggml(self) -> 'GGMLCompatibleTensor': ...
|
||||
|
||||
|
||||
def bf16_to_fp32(bf16_arr: np.ndarray[Any, np.dtype[np.uint16]]) -> NDArray:
|
||||
def bf16_to_fp32(bf16_arr: np.ndarray) -> np.ndarray:
|
||||
assert bf16_arr.dtype == np.uint16, f"Input array should be of dtype uint16, but got {bf16_arr.dtype}"
|
||||
fp32_arr = bf16_arr.astype(np.uint32) << 16
|
||||
return fp32_arr.view(np.float32)
|
||||
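The conversion above works because bfloat16 is just the top 16 bits of an IEEE-754 float32, so widening to uint32 and shifting left by 16 reconstructs the value exactly (the low mantissa bits become zero). A tiny illustrative check:

import numpy as np

one_bf16 = np.array([0x3F80], dtype=np.uint16)       # bf16 bit pattern of 1.0
assert bf16_to_fp32(one_bf16)[0] == np.float32(1.0)  # 0x3F80 << 16 == 0x3F800000 == 1.0f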
@@ -478,22 +464,22 @@ class UnquantizedTensor(Tensor):
|
||||
self.ndarray = bf16_to_fp32(self.ndarray)
|
||||
return UnquantizedTensor(self.ndarray.astype(dtype))
|
||||
|
||||
def to_ggml(self) -> UnquantizedTensor:
|
||||
def to_ggml(self) -> 'UnquantizedTensor':
|
||||
return self
|
||||
|
||||
def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> UnquantizedTensor:
|
||||
def permute_part(self, n_part: int, n_head: int) -> 'UnquantizedTensor':
|
||||
r = self.ndarray.shape[0] // 3
|
||||
return UnquantizedTensor(permute(self.ndarray[r * n_part : r * n_part + r, ...], n_head, n_head_kv))
|
||||
return UnquantizedTensor(permute(self.ndarray[r * n_part : r * n_part + r, ...], n_head))
|
||||
|
||||
def part(self, n_part: int) -> UnquantizedTensor:
|
||||
def part(self, n_part: int) -> 'UnquantizedTensor':
|
||||
r = self.ndarray.shape[0] // 3
|
||||
return UnquantizedTensor(self.ndarray[r * n_part : r * n_part + r, ...])
|
||||
|
||||
def permute(self, n_head: int, n_head_kv: int) -> UnquantizedTensor:
|
||||
def permute(self, n_head: int, n_head_kv: int) -> 'UnquantizedTensor':
|
||||
return UnquantizedTensor(permute(self.ndarray, n_head, n_head_kv))
|
||||
|
||||
|
||||
def load_unquantized(lazy_tensor: LazyTensor, expected_dtype: Any = None, convert: bool = False) -> NDArray:
|
||||
def load_unquantized(lazy_tensor: 'LazyTensor', expected_dtype: Any = None, convert: bool = False) -> NDArray:
|
||||
tensor = lazy_tensor.load()
|
||||
assert isinstance(tensor, UnquantizedTensor)
|
||||
|
||||
@@ -509,13 +495,13 @@ def load_unquantized(lazy_tensor: LazyTensor, expected_dtype: Any = None, conver
|
||||
return tensor.ndarray
|
||||
|
||||
|
||||
GGMLCompatibleTensor = UnquantizedTensor
|
||||
GGMLCompatibleTensor = Union[UnquantizedTensor]
|
||||
|
||||
|
||||
@dataclass
|
||||
class LazyTensor:
|
||||
_load: Callable[[], Tensor]
|
||||
shape: list[int]
|
||||
shape: List[int]
|
||||
data_type: DataType
|
||||
description: str
|
||||
|
||||
@@ -526,7 +512,7 @@ class LazyTensor:
|
||||
(self.data_type, ret.data_type, self.description)
|
||||
return ret
|
||||
|
||||
def astype(self, data_type: DataType) -> LazyTensor:
|
||||
def astype(self, data_type: DataType) -> 'LazyTensor':
|
||||
self.validate_conversion_to(data_type)
|
||||
|
||||
def load() -> Tensor:
|
||||
@@ -538,24 +524,24 @@ class LazyTensor:
|
||||
raise ValueError(f'Cannot validate conversion from {self.data_type} to {data_type}.')
|
||||
|
||||
|
||||
LazyModel: TypeAlias = 'dict[str, LazyTensor]'
|
||||
LazyModel = Dict[str, LazyTensor]
|
||||
|
||||
|
||||
@dataclass
|
||||
class ModelPlus:
|
||||
model: LazyModel
|
||||
paths: list[Path] # Where this was read from.
|
||||
format: Literal['ggml', 'torch', 'safetensors', 'none']
|
||||
vocab: Vocab | None # For GGML models (which have vocab built in), the vocab.
|
||||
paths: List[Path] # Where this was read from.
|
||||
format: Literal['ggml', 'torch', 'safetensors']
|
||||
vocab: Optional[Vocab] # For GGML models (which have vocab built in), the vocab.
|
||||
|
||||
|
||||
def merge_sharded(models: list[LazyModel]) -> LazyModel:
|
||||
def merge_sharded(models: List[LazyModel]) -> LazyModel:
|
||||
# Original LLaMA models have each file contain one part of each tensor.
|
||||
# Use a dict instead of a set to preserve order.
|
||||
names = {name: None for model in models for name in model}
|
||||
|
||||
def convert(name: str) -> LazyTensor:
|
||||
lazy_tensors: list[LazyTensor] = [model[name] for model in models]
|
||||
lazy_tensors: List[LazyTensor] = [model[name] for model in models]
|
||||
if len(lazy_tensors) == 1:
|
||||
# only one file; don't go through this procedure since there might
|
||||
# be quantized tensors
|
||||
@@ -583,7 +569,7 @@ def merge_sharded(models: list[LazyModel]) -> LazyModel:
|
||||
return {name: convert(name) for name in names}
|
||||
|
||||
|
||||
def merge_multifile_models(models_plus: list[ModelPlus]) -> ModelPlus:
|
||||
def merge_multifile_models(models_plus: List[ModelPlus]) -> ModelPlus:
|
||||
formats = set(mp.format for mp in models_plus)
|
||||
assert len(formats) == 1, "different formats?"
|
||||
format = formats.pop()
|
||||
@@ -611,12 +597,12 @@ def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_head_kv: int) -> LazyTe
|
||||
return lazy_tensor.load().permute(n_head, n_head_kv)
|
||||
return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description)
|
||||
|
||||
def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int, n_head_kv: int) -> LazyTensor:
|
||||
def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int) -> LazyTensor:
|
||||
def load() -> Tensor:
|
||||
return lazy_tensor.load().permute_part(n_part, n_head, n_head_kv)
|
||||
return lazy_tensor.load().permute_part(n_part, n_head)
|
||||
s = lazy_tensor.shape.copy()
|
||||
s[0] = s[0] // 3
|
||||
return LazyTensor(load, s, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description)
|
||||
return LazyTensor(load, s, lazy_tensor.data_type, f'permute({n_head}) ' + lazy_tensor.description)
|
||||
|
||||
def part_lazy(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor:
|
||||
def load() -> Tensor:
|
||||
@@ -657,7 +643,7 @@ class LazyUnpickler(pickle.Unpickler):
|
||||
assert isinstance(pid[1], LazyStorageKind)
|
||||
data_type = pid[1].data_type
|
||||
filename_stem = pid[2]
|
||||
filename = f'{self.data_base_path}/{filename_stem}'
|
||||
filename = self.data_base_path + '/' + filename_stem
|
||||
info = self.zip_file.getinfo(filename)
|
||||
|
||||
def load(offset: int, elm_count: int) -> NDArray:
|
||||
@@ -671,8 +657,9 @@ class LazyUnpickler(pickle.Unpickler):
|
||||
description = f'storage data_type={data_type} path-in-zip={filename} path={self.zip_file.filename}'
|
||||
return LazyStorage(load=load, kind=pid[1], description=description)
|
||||
|
||||
@staticmethod
|
||||
# @staticmethod
|
||||
def lazy_rebuild_tensor_v2(storage: Any, storage_offset: Any, size: Any, stride: Any,
|
||||
# pyright: ignore[reportSelfClsParameterName]
|
||||
requires_grad: Any, backward_hooks: Any, metadata: Any = None) -> LazyTensor:
|
||||
assert isinstance(storage, LazyStorage)
|
||||
|
||||
@@ -682,15 +669,13 @@ class LazyUnpickler(pickle.Unpickler):
|
||||
description = f'pickled storage_offset={storage_offset} in {storage.description}'
|
||||
return LazyTensor(load, list(size), storage.kind.data_type, description)
|
||||
|
||||
@staticmethod
|
||||
# @staticmethod
|
||||
def rebuild_from_type_v2(func, new_type, args, state):
|
||||
return func(*args)
|
||||
|
||||
CLASSES: dict[tuple[str, str], Any] = {
|
||||
# getattr used here as a workaround for mypy not being smart enough to determine
|
||||
# the staticmethods have a __func__ attribute.
|
||||
('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'),
|
||||
('torch._utils', '_rebuild_tensor_v2'): getattr(lazy_rebuild_tensor_v2, '__func__'),
|
||||
CLASSES: Dict[Any, Any] = {
|
||||
('torch._tensor', '_rebuild_from_type_v2'): rebuild_from_type_v2,
|
||||
('torch._utils', '_rebuild_tensor_v2'): lazy_rebuild_tensor_v2,
|
||||
('torch', 'BFloat16Storage'): LazyStorageKind(DT_BF16),
|
||||
('torch', 'HalfStorage'): LazyStorageKind(DT_F16),
|
||||
('torch', 'FloatStorage'): LazyStorageKind(DT_F32),
|
||||
@@ -719,15 +704,15 @@ def lazy_load_torch_file(outer_fp: IO[bytes], path: Path) -> ModelPlus:
|
||||
|
||||
def lazy_load_safetensors_file(fp: IO[bytes], path: Path) -> ModelPlus:
|
||||
header_size, = struct.unpack('<Q', fp.read(8))
|
||||
header: dict[str, dict[str, Any]] = json.loads(fp.read(header_size))
|
||||
header: Dict[str, Dict[str, Any]] = json.loads(fp.read(header_size))
|
||||
# Use mmap for the actual data to avoid race conditions with the file offset.
|
||||
mapped = memoryview(mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ))
|
||||
byte_buf = mapped[8 + header_size:]
|
||||
|
||||
def convert(info: dict[str, Any]) -> LazyTensor:
|
||||
def convert(info: Dict[str, Any]) -> LazyTensor:
|
||||
data_type = SAFETENSORS_DATA_TYPES[info['dtype']]
|
||||
numpy_dtype = data_type.dtype
|
||||
shape: list[int] = info['shape']
|
||||
shape: List[int] = info['shape']
|
||||
begin, end = info['data_offsets']
|
||||
assert 0 <= begin <= end <= len(byte_buf)
|
||||
assert end - begin == math.prod(shape) * numpy_dtype.itemsize
|
||||
@@ -766,7 +751,7 @@ def lazy_load_file(path: Path) -> ModelPlus:
|
||||
In = TypeVar('In')
|
||||
Out = TypeVar('Out')
|
||||
|
||||
def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], concurrency: int, max_workers: int | None = None, use_processpool_executor: bool = False) -> Iterable[Out]:
|
||||
def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], concurrency: int, max_workers: Optional[int] = None, factory: Callable = ThreadPoolExecutor) -> Iterable[Out]:
|
||||
'''Parallel map, but with backpressure. If the caller doesn't call `next`
|
||||
fast enough, this will stop calling `func` at some point rather than
|
||||
letting results pile up in memory. Specifically, there is a max of one
|
||||
@@ -775,13 +760,8 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc
|
||||
yield from map(func, iterable)
|
||||
# Not reached.
|
||||
iterable = iter(iterable)
|
||||
executor_class: type[ThreadPoolExecutor] | type[ProcessPoolExecutor]
|
||||
if use_processpool_executor:
|
||||
executor_class = ProcessPoolExecutor
|
||||
else:
|
||||
executor_class = ThreadPoolExecutor
|
||||
with executor_class(max_workers = max_workers) as executor:
|
||||
futures: list[concurrent.futures.Future[Out]] = []
|
||||
with factory(max_workers = max_workers) as executor:
|
||||
futures: List[concurrent.futures.Future[Out]] = []
|
||||
done = False
|
||||
for _ in range(concurrency):
|
||||
try:
|
||||
@@ -823,12 +803,10 @@ class OutputFile:
|
||||
|
||||
def add_meta_arch(self, params: Params) -> None:
|
||||
name = "LLaMA"
|
||||
|
||||
# TODO: better logic to determine model name
|
||||
if params.n_ctx == 4096:
|
||||
if (params.n_ctx == 4096):
|
||||
name = "LLaMA v2"
|
||||
elif params.path_model is not None:
|
||||
name = str(params.path_model.parent).split('/')[-1]
|
||||
if params.path_model:
|
||||
name = str(params.path_model.parent).split('/')[-1]
|
||||
|
||||
self.gguf.add_name (name)
|
||||
self.gguf.add_context_length (params.n_ctx)
|
||||
@@ -840,38 +818,31 @@ class OutputFile:
|
||||
self.gguf.add_head_count_kv (params.n_head_kv)
|
||||
self.gguf.add_layer_norm_rms_eps (params.f_norm_eps)
|
||||
|
||||
if params.f_rope_freq_base is not None:
|
||||
if params.f_rope_freq_base:
|
||||
self.gguf.add_rope_freq_base(params.f_rope_freq_base)
|
||||
|
||||
if params.f_rope_scale is not None:
|
||||
if params.f_rope_scale:
|
||||
self.gguf.add_rope_scale_linear(params.f_rope_scale)
|
||||
|
||||
if params.ftype is not None:
|
||||
if params.ftype:
|
||||
self.gguf.add_file_type(params.ftype)
|
||||
|
||||
def add_meta_vocab(self, vocab: Vocab) -> None:
|
||||
tokens = []
|
||||
scores = []
|
||||
toktypes = []
|
||||
# NOTE: `all_tokens` returns the base vocabulary and added tokens
|
||||
# NOTE: `all_tokens` returns the the base vocabulary and added tokens
|
||||
# TODO: add special tokens?
|
||||
for text, score, toktype in vocab.all_tokens():
|
||||
tokens.append(text)
|
||||
scores.append(score)
|
||||
toktypes.append(toktype)
|
||||
|
||||
if isinstance(vocab, SentencePieceVocab):
|
||||
self.gguf.add_tokenizer_model("llama")
|
||||
elif isinstance(vocab, BpeVocab):
|
||||
self.gguf.add_tokenizer_model("gpt2")
|
||||
else:
|
||||
raise ValueError(f'Unknown vocab type: Not BpeVocab or SentencePieceVocab')
|
||||
self.gguf.add_tokenizer_model("llama")
|
||||
self.gguf.add_token_list(tokens)
|
||||
self.gguf.add_token_scores(scores)
|
||||
self.gguf.add_token_types(toktypes)
|
||||
|
||||
def add_meta_special_vocab(self, svocab: gguf.SpecialVocab) -> None:
|
||||
svocab.add_to_gguf(self.gguf)
|
||||
|
||||
def add_tensor_info(self, name: str, tensor: LazyTensor) -> None:
|
||||
n_elements = int(np.prod(tensor.shape))
|
||||
raw_dtype = getattr(tensor.data_type, 'ggml_type', None)
|
||||
@@ -890,7 +861,7 @@ class OutputFile:
|
||||
self.gguf.close()
|
||||
|
||||
@staticmethod
|
||||
def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab) -> None:
|
||||
def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab) -> None:
|
||||
check_vocab_size(params, vocab)
|
||||
|
||||
of = OutputFile(fname_out)
|
||||
@@ -898,27 +869,25 @@ class OutputFile:
|
||||
# meta data
|
||||
of.add_meta_arch(params)
|
||||
of.add_meta_vocab(vocab)
|
||||
of.add_meta_special_vocab(svocab)
|
||||
|
||||
of.write_meta()
|
||||
|
||||
of.close()
|
||||
|
||||
@staticmethod
|
||||
def do_item(item: tuple[str, LazyTensor]) -> tuple[DataType, NDArray]:
|
||||
def do_item(item: Tuple[str, LazyTensor]) -> Tuple[DataType, NDArray]:
|
||||
name, lazy_tensor = item
|
||||
tensor = lazy_tensor.load().to_ggml()
|
||||
return (lazy_tensor.data_type, tensor.ndarray)
|
||||
|
||||
@staticmethod
|
||||
def maybe_do_quantize(item: tuple[DataType, NDArray]) -> NDArray:
|
||||
def maybe_do_quantize(item: Tuple[DataType, NDArray]) -> NDArray:
|
||||
dt, arr = item
|
||||
if not isinstance(dt, QuantizedDataType):
|
||||
return arr
|
||||
return dt.quantize(arr)
|
||||
|
||||
@staticmethod
|
||||
def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY) -> None:
|
||||
def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, concurrency: int = DEFAULT_CONCURRENCY) -> None:
|
||||
check_vocab_size(params, vocab)
|
||||
|
||||
of = OutputFile(fname_out)
|
||||
@@ -926,7 +895,6 @@ class OutputFile:
|
||||
# meta data
|
||||
of.add_meta_arch(params)
|
||||
of.add_meta_vocab(vocab)
|
||||
of.add_meta_special_vocab(svocab)
|
||||
|
||||
# tensor info
|
||||
for name, lazy_tensor in model.items():
|
||||
@@ -938,7 +906,7 @@ class OutputFile:
|
||||
# tensor data
|
||||
ndarrays_inner = bounded_parallel_map(OutputFile.do_item, model.items(), concurrency = concurrency)
|
||||
if ftype == GGMLFileType.MostlyQ8_0:
|
||||
ndarrays = bounded_parallel_map(OutputFile.maybe_do_quantize, ndarrays_inner, concurrency = concurrency, max_workers = concurrency, use_processpool_executor = True)
|
||||
ndarrays = bounded_parallel_map(OutputFile.maybe_do_quantize, ndarrays_inner, concurrency = concurrency, max_workers = concurrency, factory = ProcessPoolExecutor)
|
||||
else:
|
||||
ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner)
|
||||
|
||||
@@ -952,7 +920,7 @@ class OutputFile:
|
||||
|
||||
of.close()
|
||||
|
||||
def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileType:
|
||||
def pick_output_type(model: LazyModel, output_type_str: Optional[str]) -> GGMLFileType:
|
||||
wq_type = model[NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0)+".weight"].data_type
|
||||
|
||||
if output_type_str == "f32" or (output_type_str is None and wq_type == DT_F32):
|
||||
@@ -971,8 +939,7 @@ def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyM
|
||||
for (name, tensor) in model.items()}
|
||||
|
||||
def convert_model_names(model: LazyModel, params: Params) -> LazyModel:
|
||||
tmap = gguf.TensorNameMap(ARCH, params.n_layer)
|
||||
should_skip: set[gguf.MODEL_TENSOR] = set(gguf.MODEL_TENSOR_SKIP.get(ARCH, []))
|
||||
tmap = gguf.get_tensor_name_map(ARCH, params.n_layer)
|
||||
|
||||
tmp = model
|
||||
|
||||
@@ -988,31 +955,37 @@ def convert_model_names(model: LazyModel, params: Params) -> LazyModel:
|
||||
tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head, params.n_head)
|
||||
tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 1, params.n_head, params.n_head_kv)
|
||||
tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = part_lazy (model[f"model.layers.{i}.self_attn.W_pack.weight"], 2)
|
||||
del tmp[f"model.layers.{i}.self_attn.W_pack.weight"]
|
||||
else:
|
||||
break
|
||||
|
||||
out: LazyModel = {}
|
||||
for name, lazy_tensor in model.items():
|
||||
tensor_type, name_new = tmap.get_type_and_name(name, try_suffixes = (".weight", ".bias")) or (None, None)
|
||||
if name_new is None:
|
||||
name_new = name
|
||||
|
||||
if name in tmap:
|
||||
name_new = tmap[name]
|
||||
elif name.endswith(".weight") and name[:-7] in tmap:
|
||||
name_new = tmap[name[:-7]] + ".weight"
|
||||
elif name.endswith(".bias") and name[:-5] in tmap:
|
||||
name_new = tmap[name[:-5]] + ".bias"
|
||||
else:
|
||||
raise Exception(f"Unexpected tensor name: {name}")
|
||||
|
||||
if tensor_type in should_skip:
|
||||
if gguf.should_skip_tensor_TMP(ARCH, params.n_layer, name_new):
|
||||
print(f"skipping tensor {name_new}")
|
||||
continue
|
||||
|
||||
print(f"{name:48s} -> {name_new:40s} | {lazy_tensor.data_type.name:6s} | {lazy_tensor.shape}")
|
||||
out[name_new] = lazy_tensor
|
||||
else:
|
||||
print(f"{name:48s} -> {name_new:40s} | {lazy_tensor.data_type.name:6s} | {lazy_tensor.shape}")
|
||||
out[name_new] = lazy_tensor
|
||||
|
||||
return out
|
||||
|
||||
def nth_multifile_path(path: Path, n: int) -> Path | None:
|
||||
def nth_multifile_path(path: Path, n: int) -> Optional[Path]:
|
||||
'''Given any path belonging to a multi-file model (e.g. foo.bin.1), return
|
||||
the nth path in the model.
|
||||
'''
|
||||
# Support the following patterns:
|
||||
patterns: list[tuple[str, str]] = [
|
||||
patterns: List[Tuple[str, str]] = [
|
||||
# - x.00.pth, x.01.pth, etc.
|
||||
(r'\.[0-9]{2}\.pth$', f'.{n:02}.pth'),
|
||||
# - x-00001-of-00002.bin, x-00002-of-00002.bin, etc.
|
||||
@@ -1028,11 +1001,11 @@ def nth_multifile_path(path: Path, n: int) -> Path | None:
|
||||
return None
|
||||
|
||||
|
||||
def find_multifile_paths(path: Path) -> list[Path]:
|
||||
def find_multifile_paths(path: Path) -> List[Path]:
|
||||
'''Given any path belonging to a multi-file model (e.g. foo.bin.1), return
|
||||
the whole list of paths in the model.
|
||||
'''
|
||||
ret: list[Path] = []
|
||||
ret: List[Path] = []
|
||||
for i in itertools.count():
|
||||
nth_path = nth_multifile_path(path, i)
|
||||
if nth_path is None:
|
||||
@@ -1063,7 +1036,7 @@ def load_some_model(path: Path) -> ModelPlus:
|
||||
path = files[0]
|
||||
|
||||
paths = find_multifile_paths(path)
|
||||
models_plus: list[ModelPlus] = []
|
||||
models_plus: List[ModelPlus] = []
|
||||
for path in paths:
|
||||
print(f"Loading model file {path}")
|
||||
models_plus.append(lazy_load_file(path))
|
||||
@@ -1072,7 +1045,7 @@ def load_some_model(path: Path) -> ModelPlus:
|
||||
return model_plus
|
||||
|
||||
|
||||
def load_vocab(path: Path, vocabtype: str | None) -> Vocab:
def load_vocab(path: Path, vocabtype: Optional[str]) -> Union[BpeVocab, SentencePieceVocab]:
# Be extra-friendly and accept either a file or a directory. Also, if it's
# a directory, it might be the model directory, and tokenizer.model might
# be in the parent of that.
@@ -1103,7 +1076,7 @@ def load_vocab(path: Path, vocabtype: str | None) -> Vocab:
raise ValueError(f"Unsupported vocabulary type {vocabtype}")


def default_outfile(model_paths: list[Path], file_type: GGMLFileType) -> Path:
def default_outfile(model_paths: List[Path], file_type: GGMLFileType) -> Path:
namestr = {
GGMLFileType.AllF32: "f32",
GGMLFileType.MostlyF16: "f16",
@@ -1126,7 +1099,7 @@ def do_dump_model(model_plus: ModelPlus) -> None:
print(f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}")


def main(args_in: list[str] | None = None) -> None:
def main(args_in: Optional[List[str]] = None) -> None:
parser = argparse.ArgumentParser(description="Convert a LLaMa model to a GGML compatible file")
parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model")
parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file")
@@ -1143,16 +1116,8 @@ def main(args_in: list[str] | None = None) -> None:
if args.dump_single:
model_plus = lazy_load_file(args.model)
do_dump_model(model_plus)
return

if not args.vocab_only:
model_plus = load_some_model(args.model)
else:
model_plus = ModelPlus(model = {}, paths = [args.model / 'dummy'], format = 'none', vocab = None)

if args.dump:
do_dump_model(model_plus)
return
model_plus = load_some_model(args.model)

params = Params.load(model_plus)
if params.n_ctx == -1:
@@ -1174,34 +1139,33 @@ def main(args_in: list[str] | None = None) -> None:
|
||||
|
||||
vocab: Vocab
|
||||
if args.vocab_only:
|
||||
assert args.outfile, "need --outfile if using --vocab-only"
|
||||
# FIXME: Try to respect vocab_dir somehow?
|
||||
vocab = load_vocab(args.vocab_dir or args.model, args.vocabtype)
|
||||
special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, load_merges = args.vocabtype == 'bpe')
|
||||
assert args.outfile, "need --outfile if using --vocab-only"
|
||||
outfile = args.outfile
|
||||
OutputFile.write_vocab_only(outfile, params, vocab, special_vocab)
|
||||
OutputFile.write_vocab_only(outfile, params, vocab)
|
||||
print(f"Wrote {outfile}")
|
||||
return
|
||||
|
||||
if model_plus.vocab is not None and args.vocab_dir is None:
|
||||
vocab = model_plus.vocab
|
||||
else:
|
||||
vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
|
||||
vocab = load_vocab(vocab_dir, args.vocabtype)
|
||||
# FIXME: Try to respect vocab_dir somehow?
|
||||
special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, load_merges = args.vocabtype == 'bpe')
|
||||
if args.dump:
|
||||
do_dump_model(model_plus)
|
||||
return
|
||||
|
||||
model = model_plus.model
|
||||
model = convert_model_names(model, params)
|
||||
ftype = pick_output_type(model, args.outtype)
|
||||
model = convert_to_output_type(model, ftype)
|
||||
outfile = args.outfile or default_outfile(model_plus.paths, ftype)
|
||||
if model_plus.vocab is not None and args.vocab_dir is None:
|
||||
vocab = model_plus.vocab
|
||||
else:
|
||||
vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
|
||||
vocab = load_vocab(vocab_dir, args.vocabtype)
|
||||
|
||||
params.ftype = ftype
|
||||
print(f"Writing {outfile}, format {ftype}")
|
||||
model = model_plus.model
|
||||
model = convert_model_names(model, params)
|
||||
ftype = pick_output_type(model, args.outtype)
|
||||
model = convert_to_output_type(model, ftype)
|
||||
outfile = args.outfile or default_outfile(model_plus.paths, ftype)
|
||||
|
||||
OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, concurrency = args.concurrency)
|
||||
print(f"Wrote {outfile}")
|
||||
params.ftype = ftype
|
||||
print(f"Writing {outfile}, format {ftype}")
|
||||
|
||||
OutputFile.write_all(outfile, ftype, params, model, vocab, concurrency = args.concurrency)
|
||||
print(f"Wrote {outfile}")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
@@ -23,10 +23,9 @@ else()
add_subdirectory(train-text-from-scratch)
add_subdirectory(convert-llama2c-to-ggml)
add_subdirectory(simple)
add_subdirectory(speculative)
add_subdirectory(embd-input)
add_subdirectory(llama-bench)
add_subdirectory(beam-search)
add_subdirectory(beam_search)
if (LLAMA_METAL)
add_subdirectory(metal)
endif()

@@ -9,12 +9,12 @@
|
||||
#endif
|
||||
|
||||
#ifdef LLAMA_DEFAULT_RMS_EPS
|
||||
constexpr float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
|
||||
static const float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
|
||||
#else
|
||||
constexpr float rms_norm_eps = 5e-6f;
|
||||
static const float rms_norm_eps = 5e-6f;
|
||||
#endif
|
||||
|
||||
static float frand() {
|
||||
float frand() {
|
||||
return (float)rand()/(float)RAND_MAX;
|
||||
}
|
||||
|
||||
@@ -25,21 +25,19 @@ struct random_normal_distribution {
|
||||
float max;
|
||||
};
|
||||
|
||||
static void init_random_normal_distribution(
|
||||
struct random_normal_distribution * rnd, int seed, float mean, float std, float min, float max
|
||||
) {
|
||||
void init_random_normal_distribution(struct random_normal_distribution * rnd, int seed, float mean, float std, float min, float max) {
|
||||
rnd->gen = std::mt19937(seed);
|
||||
rnd->nd = std::normal_distribution<float>{mean, std};
|
||||
rnd->min = min;
|
||||
rnd->max = max;
|
||||
}
|
||||
|
||||
static float frand_normal(struct random_normal_distribution * rnd) {
|
||||
float frand_normal(struct random_normal_distribution * rnd) {
|
||||
const float r = rnd->nd(rnd->gen);
|
||||
return ((r < rnd->min) ? (rnd->min) : (r > rnd->max) ? (rnd->max) : r);
|
||||
}
|
||||
|
||||
static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
|
||||
void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
|
||||
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
|
||||
|
||||
if (plan.work_size > 0) {
|
||||
@@ -50,9 +48,13 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
|
||||
ggml_graph_compute(graph, &plan);
|
||||
}
|
||||
|
||||
static struct ggml_tensor * randomize_tensor(
|
||||
struct ggml_tensor * tensor, int ndims, const int64_t ne[], float fmin, float fmax
|
||||
) {
|
||||
struct ggml_tensor * randomize_tensor(
|
||||
struct ggml_tensor * tensor,
|
||||
int ndims,
|
||||
const int64_t ne[],
|
||||
float fmin,
|
||||
float fmax) {
|
||||
|
||||
switch (ndims) {
|
||||
case 1:
|
||||
for (int i0 = 0; i0 < ne[0]; i0++) {
|
||||
@@ -93,9 +95,11 @@ static struct ggml_tensor * randomize_tensor(
|
||||
return tensor;
|
||||
}
|
||||
|
||||
static struct ggml_tensor * randomize_tensor_normal(
|
||||
struct ggml_tensor * tensor, int ndims, const int64_t ne[], struct random_normal_distribution * rnd
|
||||
) {
|
||||
struct ggml_tensor * randomize_tensor_normal(
|
||||
struct ggml_tensor * tensor,
|
||||
int ndims,
|
||||
const int64_t ne[],
|
||||
struct random_normal_distribution * rnd) {
|
||||
float scale = 1.0; // xavier
|
||||
switch (ndims) {
|
||||
case 1:
|
||||
@@ -155,7 +159,7 @@ struct llama_hparams {
|
||||
}
|
||||
};
|
||||
|
||||
static uint32_t get_n_ff(const struct llama_hparams* hparams) {
|
||||
uint32_t get_n_ff(const struct llama_hparams* hparams) {
|
||||
const uint32_t n_ff = ((2*(4*hparams->n_embd)/3 + hparams->n_mult - 1)/hparams->n_mult)*hparams->n_mult;
|
||||
return n_ff;
|
||||
}
|
||||
@@ -256,7 +260,7 @@ struct llama_model_lora {
|
||||
std::vector<llama_layer_lora> layers;
|
||||
};
|
||||
|
||||
static void init_model(struct llama_model * model) {
|
||||
void init_model(struct llama_model * model) {
|
||||
const auto & hparams = model->hparams;
|
||||
|
||||
const uint32_t n_embd = hparams.n_embd;
|
||||
@@ -293,7 +297,7 @@ static void init_model(struct llama_model * model) {
|
||||
}
|
||||
|
||||
|
||||
static void init_model_lora(struct llama_model_lora * model) {
|
||||
void init_model_lora(struct llama_model_lora * model) {
|
||||
const auto & hparams = model->hparams;
|
||||
|
||||
const uint32_t n_embd = hparams.n_embd;
|
||||
@@ -336,7 +340,7 @@ static void init_model_lora(struct llama_model_lora * model) {
|
||||
}
|
||||
}
|
||||
|
||||
static void set_param_model(struct llama_model * model) {
|
||||
void set_param_model(struct llama_model * model) {
|
||||
const auto& hparams = model->hparams;
|
||||
|
||||
const uint32_t n_layer = hparams.n_layer;
|
||||
@@ -362,7 +366,7 @@ static void set_param_model(struct llama_model * model) {
|
||||
}
|
||||
}
|
||||
|
||||
static void set_param_model_lora(struct llama_model_lora * model) {
|
||||
void set_param_model_lora(struct llama_model_lora * model) {
|
||||
const auto& hparams = model->hparams;
|
||||
|
||||
const uint32_t n_layer = hparams.n_layer;
|
||||
@@ -393,7 +397,7 @@ static void set_param_model_lora(struct llama_model_lora * model) {
|
||||
}
|
||||
}
|
||||
|
||||
static void randomize_model(struct llama_model * model, int seed, float mean, float std, float min, float max) {
|
||||
void randomize_model(struct llama_model * model, int seed, float mean, float std, float min, float max) {
|
||||
const auto & hparams = model->hparams;
|
||||
|
||||
const uint32_t n_layer = hparams.n_layer;
|
||||
@@ -422,9 +426,7 @@ static void randomize_model(struct llama_model * model, int seed, float mean, fl
|
||||
}
|
||||
|
||||
|
||||
static void randomize_model_lora(
|
||||
struct llama_model_lora * model, int seed, float mean, float std, float min, float max
|
||||
) {
|
||||
void randomize_model_lora(struct llama_model_lora * model, int seed, float mean, float std, float min, float max) {
|
||||
const auto & hparams = model->hparams;
|
||||
|
||||
const uint32_t n_layer = hparams.n_layer;
|
||||
@@ -457,7 +459,7 @@ static void randomize_model_lora(
|
||||
}
|
||||
}
|
||||
|
||||
static bool init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int n_batch) {
|
||||
bool init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int n_batch) {
|
||||
const auto & hparams = model->hparams;
|
||||
|
||||
const uint32_t n_ctx = hparams.n_ctx;
|
||||
@@ -493,7 +495,7 @@ static bool init_kv_cache(struct llama_kv_cache* cache, struct llama_model * mod
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool init_kv_cache_lora(struct llama_kv_cache* cache, struct llama_model_lora * model, int n_batch) {
|
||||
bool init_kv_cache_lora(struct llama_kv_cache* cache, struct llama_model_lora * model, int n_batch) {
|
||||
const auto & hparams = model->hparams;
|
||||
|
||||
const uint32_t n_ctx = hparams.n_ctx;
|
||||
@@ -529,15 +531,15 @@ static bool init_kv_cache_lora(struct llama_kv_cache* cache, struct llama_model_
|
||||
return true;
|
||||
}
|
||||
|
||||
static struct ggml_tensor * forward(
|
||||
struct llama_model * model,
|
||||
struct llama_kv_cache * cache,
|
||||
struct ggml_context * ctx0,
|
||||
struct ggml_cgraph * gf,
|
||||
struct ggml_tensor * tokens_input,
|
||||
const int n_tokens,
|
||||
const int n_past
|
||||
) {
|
||||
struct ggml_tensor * forward(
|
||||
struct llama_model * model,
|
||||
struct llama_kv_cache * cache,
|
||||
struct ggml_context * ctx0,
|
||||
struct ggml_cgraph * gf,
|
||||
struct ggml_tensor * tokens_input,
|
||||
const int n_tokens,
|
||||
const int n_past) {
|
||||
|
||||
const int N = n_tokens;
|
||||
|
||||
struct llama_kv_cache& kv_self = *cache;
|
||||
@@ -754,25 +756,25 @@ static struct ggml_tensor * forward(
|
||||
return inpL;
|
||||
}
|
||||
|
||||
static void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) {
|
||||
void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) {
|
||||
GGML_ASSERT(tensor->n_dims == 1);
|
||||
GGML_ASSERT(tensor->ne[0] == ne0);
|
||||
}
|
||||
|
||||
static void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) {
|
||||
void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) {
|
||||
GGML_ASSERT(tensor->n_dims == 2);
|
||||
GGML_ASSERT(tensor->ne[0] == ne0);
|
||||
GGML_ASSERT(tensor->ne[1] == ne1);
|
||||
}
|
||||
|
||||
static void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) {
|
||||
void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) {
|
||||
GGML_ASSERT(tensor->n_dims == 3);
|
||||
GGML_ASSERT(tensor->ne[0] == ne0);
|
||||
GGML_ASSERT(tensor->ne[1] == ne1);
|
||||
GGML_ASSERT(tensor->ne[2] == ne2);
|
||||
}
|
||||
|
||||
static void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
|
||||
void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
|
||||
GGML_ASSERT(tensor->n_dims == 4);
|
||||
GGML_ASSERT(tensor->ne[0] == ne0);
|
||||
GGML_ASSERT(tensor->ne[1] == ne1);
|
||||
@@ -780,16 +782,16 @@ static void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne
|
||||
GGML_ASSERT(tensor->ne[3] == ne3);
|
||||
}
|
||||
|
||||
static struct ggml_tensor * forward_batch(
|
||||
struct llama_model * model,
|
||||
struct llama_kv_cache * cache,
|
||||
struct ggml_context * ctx0,
|
||||
struct ggml_cgraph * gf,
|
||||
struct ggml_tensor * tokens_input,
|
||||
const int n_tokens,
|
||||
const int n_past,
|
||||
const int n_batch
|
||||
) {
|
||||
struct ggml_tensor * forward_batch(
|
||||
struct llama_model * model,
|
||||
struct llama_kv_cache * cache,
|
||||
struct ggml_context * ctx0,
|
||||
struct ggml_cgraph * gf,
|
||||
struct ggml_tensor * tokens_input,
|
||||
const int n_tokens,
|
||||
const int n_past,
|
||||
const int n_batch) {
|
||||
|
||||
const int N = n_tokens;
|
||||
|
||||
struct llama_kv_cache& kv_self = *cache;
|
||||
@@ -1071,15 +1073,16 @@ static struct ggml_tensor * forward_batch(
|
||||
return inpL;
|
||||
}
|
||||
|
||||
static struct ggml_tensor * forward_lora(
|
||||
struct llama_model_lora * model,
|
||||
struct llama_kv_cache * cache,
|
||||
struct ggml_context * ctx0,
|
||||
struct ggml_cgraph * gf,
|
||||
struct ggml_tensor * tokens_input,
|
||||
const int n_tokens,
|
||||
const int n_past
|
||||
) {
|
||||
|
||||
struct ggml_tensor * forward_lora(
|
||||
struct llama_model_lora * model,
|
||||
struct llama_kv_cache * cache,
|
||||
struct ggml_context * ctx0,
|
||||
struct ggml_cgraph * gf,
|
||||
struct ggml_tensor * tokens_input,
|
||||
const int n_tokens,
|
||||
const int n_past) {
|
||||
|
||||
const int N = n_tokens;
|
||||
|
||||
struct llama_kv_cache& kv_self = *cache;
|
||||
@@ -1325,7 +1328,7 @@ static struct ggml_tensor * forward_lora(
|
||||
return inpL;
|
||||
}
|
||||
|
||||
static void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) {
|
||||
void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) {
|
||||
assert(logits->n_dims == 2);
|
||||
assert(probs->n_dims == 2);
|
||||
assert(best_samples->n_dims == 1);
|
||||
@@ -1356,10 +1359,7 @@ static void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * pro
|
||||
}
|
||||
}
|
||||
|
||||
static void sample_softmax_batch(
|
||||
struct ggml_context * ctx, struct ggml_tensor * logits, struct ggml_tensor * probs,
|
||||
struct ggml_tensor * best_samples
|
||||
) {
|
||||
void sample_softmax_batch(struct ggml_context * ctx, struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) {
|
||||
GGML_ASSERT(best_samples->n_dims == 2);
|
||||
GGML_ASSERT(logits->n_dims == 3);
|
||||
GGML_ASSERT(probs->n_dims == 3);
|
||||
@@ -1393,7 +1393,7 @@ static void sample_softmax_batch(
|
||||
}
|
||||
}
|
||||
|
||||
static void print_row(struct ggml_tensor * probs, int i) {
|
||||
void print_row(struct ggml_tensor * probs, int i) {
|
||||
for (int k = 0; k < probs->ne[0]; ++k) {
|
||||
float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k);
|
||||
printf(" %.2f", p);
|
||||
@@ -1401,7 +1401,7 @@ static void print_row(struct ggml_tensor * probs, int i) {
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
static void print_matrix(struct ggml_tensor * probs) {
|
||||
void print_matrix(struct ggml_tensor * probs) {
|
||||
assert(probs->n_dims == 2);
|
||||
for (int i = 0; i < probs->ne[1]; ++i) {
|
||||
for (int k = 0; k < probs->ne[0]; ++k) {
|
||||
@@ -1412,7 +1412,7 @@ static void print_matrix(struct ggml_tensor * probs) {
|
||||
}
|
||||
}
|
||||
|
||||
static void print_token(int token, int n_vocab) {
|
||||
void print_token(int token, int n_vocab) {
|
||||
for (int k = 0; k < token; ++k) {
|
||||
printf(" ");
|
||||
}
|
||||
@@ -1423,14 +1423,14 @@ static void print_token(int token, int n_vocab) {
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
static void print_tokens(struct ggml_tensor * tokens, int n_vocab) {
|
||||
void print_tokens(struct ggml_tensor * tokens, int n_vocab) {
|
||||
for (int i=0; i<tokens->ne[0]; ++i) {
|
||||
int token = ggml_get_i32_1d(tokens, i);
|
||||
print_token(token, n_vocab);
|
||||
}
|
||||
}
|
||||
|
||||
static void get_example_targets(int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets) {
|
||||
void get_example_targets(int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets) {
|
||||
int n_tokens = tokens_input->ne[0];
|
||||
int n_vocab = targets->ne[0];
|
||||
float randomness = 0.0f;
|
||||
@@ -1451,9 +1451,7 @@ static void get_example_targets(int example_id, struct ggml_tensor * tokens_inpu
|
||||
}
|
||||
}
|
||||
|
||||
static void get_example_targets_batch(
|
||||
struct ggml_context * ctx, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets
|
||||
) {
|
||||
void get_example_targets_batch(struct ggml_context * ctx, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets) {
|
||||
GGML_ASSERT(tokens_input->n_dims == 2);
|
||||
GGML_ASSERT( targets->n_dims == 3);
|
||||
int n_tokens = tokens_input->ne[0];
|
||||
@@ -1476,7 +1474,7 @@ static void get_example_targets_batch(
|
||||
}
|
||||
}
|
||||
|
||||
static void lshift_examples(struct ggml_tensor * tokens_input, struct ggml_tensor * targets, int n_shift) {
|
||||
void lshift_examples(struct ggml_tensor * tokens_input, struct ggml_tensor * targets, int n_shift) {
|
||||
int n_tokens = tokens_input->ne[0];
|
||||
int n_vocab = targets->ne[0];
|
||||
for (int i=0; i<n_tokens-n_shift; ++i) {
|
||||
@@ -1487,16 +1485,12 @@ static void lshift_examples(struct ggml_tensor * tokens_input, struct ggml_tenso
|
||||
}
|
||||
}
|
||||
|
||||
static struct ggml_tensor * square_error_loss(
|
||||
struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b
|
||||
) {
|
||||
struct ggml_tensor * square_error_loss(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) {
|
||||
// todo: instead of a-b: a[1:]-b[:-1]
|
||||
return ggml_sum(ctx, ggml_sqr(ctx, ggml_sub(ctx, a, b)));
|
||||
}
|
||||
|
||||
static struct ggml_tensor * cross_entropy_loss(
|
||||
struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b
|
||||
) {
|
||||
struct ggml_tensor * cross_entropy_loss(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) {
|
||||
const float eps = 1e-3f;
|
||||
return
|
||||
ggml_sum(ctx,
|
||||
@@ -1623,10 +1617,15 @@ int main(int argc, char ** argv) {
|
||||
|
||||
float error_before_opt = ggml_get_f32_1d(e, 0);
|
||||
|
||||
struct ggml_opt_params opt_params_adam = ggml_opt_default_params(GGML_OPT_ADAM);
|
||||
struct ggml_opt_params opt_params_lbfgs = ggml_opt_default_params(GGML_OPT_LBFGS);
|
||||
opt_params_adam.print_forward_graph = false;
|
||||
opt_params_adam.print_backward_graph = false;
|
||||
opt_params_lbfgs.print_forward_graph = false;
|
||||
opt_params_lbfgs.print_backward_graph = false;
|
||||
opt_params_adam.adam.n_iter = 16;
|
||||
opt_params_lbfgs.lbfgs.n_iter = 16;
|
||||
// ggml_opt(ctx0, opt_params_adam, e);
|
||||
ggml_opt(ctx0, opt_params_lbfgs, e);
|
||||
//
|
||||
ggml_build_forward_expand(&gf, e);
|
||||
|
||||
@@ -1,5 +0,0 @@
set(TARGET beam-search)
add_executable(${TARGET} beam-search.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
@@ -1,5 +1,5 @@
set(TARGET speculative)
add_executable(${TARGET} speculative.cpp)
set(TARGET beam_search)
add_executable(${TARGET} beam_search.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
@@ -1,5 +1,10 @@
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif

#include "common.h"
#include "llama.h"
#include "build-info.h"

#include <cassert>
#include <cinttypes>
@@ -17,9 +22,7 @@
#include <unistd.h>
#elif defined (_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
# define NOMINMAX
#endif
#define NOMINMAX
#include <windows.h>
#include <signal.h>
#endif
@@ -29,8 +32,7 @@ struct ostream_beam_view {
|
||||
llama_context * ctx;
|
||||
llama_beam_view beam_view;
|
||||
};
|
||||
|
||||
static std::ostream & operator<<(std::ostream & os, const ostream_beam_view & obv) {
|
||||
std::ostream& operator<<(std::ostream& os, const ostream_beam_view & obv) {
|
||||
os << "p(" << obv.beam_view.p << ") eob(" << std::boolalpha << obv.beam_view.eob << ") tokens(";
|
||||
for (size_t i = 0 ; i < obv.beam_view.n_tokens ; ++i) {
|
||||
os << llama_token_to_piece(obv.ctx, obv.beam_view.tokens[i]);
|
||||
@@ -46,7 +48,7 @@ struct beam_search_callback_data {
|
||||
|
||||
// In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same.
|
||||
// For example, eob can be flagged due to maximum token length, stop words, etc.
|
||||
static bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, size_t n_tokens) {
|
||||
bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, const size_t n_tokens) {
|
||||
return n_tokens && tokens[n_tokens-1] == llama_token_eos(callback_data.ctx);
|
||||
}
|
||||
|
||||
@@ -56,7 +58,7 @@ static bool is_at_eob(const beam_search_callback_data & callback_data, const lla
|
||||
// * When all beams converge to a common prefix, they are made available in beams_state.beams[0].
|
||||
// This is also called when the stop condition is met.
|
||||
// Collect tokens into std::vector<llama_token> response which is pointed to by callback_data.
|
||||
static void beam_search_callback(void * callback_data_ptr, llama_beams_state beams_state) {
|
||||
void beam_search_callback(void * callback_data_ptr, llama_beams_state beams_state) {
|
||||
auto& callback_data = *static_cast<beam_search_callback_data*>(callback_data_ptr);
|
||||
// Mark beams as EOS as needed.
|
||||
for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
|
||||
@@ -71,7 +73,7 @@ static void beam_search_callback(void * callback_data_ptr, llama_beams_state bea
|
||||
assert(0u < beams_state.n_beams);
|
||||
const llama_token * tokens = beams_state.beam_views[0].tokens;
|
||||
std::copy(tokens, tokens + n, callback_data.response.end() - n);
|
||||
printf("%zu", n);
|
||||
printf("%lu", n);
|
||||
}
|
||||
fflush(stdout);
|
||||
#if 1 // DEBUG: print current beams for this iteration
|
||||
@@ -143,7 +145,7 @@ int main(int argc, char ** argv)
|
||||
|
||||
if (tokens_list.size() > max_tokens_list_size)
|
||||
{
|
||||
fprintf( stderr , "%s: error: prompt too long (%zu tokens, max %zu)\n" ,
|
||||
fprintf( stderr , "%s: error: prompt too long (%lu tokens, max %lu)\n" ,
|
||||
__func__ , tokens_list.size() , max_tokens_list_size );
|
||||
return 1;
|
||||
}
|
||||
@@ -1,8 +1,7 @@
set(TARGET benchmark)
add_executable(${TARGET} benchmark-matmult.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
target_include_directories(${TARGET} PRIVATE ../../common)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)

@@ -1,6 +1,5 @@
#include "build-info.h"
#include "common.h"
#include "ggml.h"
#include "build-info.h"

#include <locale.h>
#include <assert.h>
@@ -21,7 +20,7 @@
#pragma warning(disable: 4244 4267) // possible loss of data
#endif

static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
|
||||
void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
|
||||
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
|
||||
|
||||
if (plan.work_size > 0) {
|
||||
@@ -32,19 +31,19 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
|
||||
ggml_graph_compute(graph, &plan);
|
||||
}
|
||||
|
||||
static float tensor_sum_elements(const ggml_tensor * tensor) {
|
||||
double sum = 0;
|
||||
if (tensor->type == GGML_TYPE_F32) {
|
||||
float tensor_sum_elements(const ggml_tensor * tensor) {
|
||||
float sum = 0;
|
||||
if (tensor->type==GGML_TYPE_F32) {
|
||||
for (int j = 0; j < tensor->ne[1]; j++) {
|
||||
for (int k = 0; k < tensor->ne[0]; k++) {
|
||||
sum += ((float *) tensor->data)[j*tensor->ne[0] + k];
|
||||
sum += ((float *) tensor->data)[j*tensor->ne[0]+k];
|
||||
}
|
||||
}
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
static void tensor_dump(const ggml_tensor * tensor, const char * name) {
|
||||
void tensor_dump(const ggml_tensor * tensor, const char * name) {
|
||||
printf("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi) - ", name,
|
||||
tensor->type, ggml_type_name(tensor->type),
|
||||
tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->nb[0], tensor->nb[1], tensor->nb[2]);
|
||||
@@ -59,7 +58,7 @@ struct benchmark_params_struct {
|
||||
int32_t n_iterations = 10;
|
||||
};
|
||||
|
||||
static void print_usage(int /*argc*/, char ** argv, struct benchmark_params_struct params) {
|
||||
void print_usage(int /*argc*/, char ** argv, struct benchmark_params_struct params) {
|
||||
fprintf(stderr, "usage: %s [options]\n", argv[0]);
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "options:\n");
|
||||
@@ -100,7 +99,7 @@ int main(int argc, char ** argv) {
|
||||
exit(1);
|
||||
}
|
||||
|
||||
print_build_info();
|
||||
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
|
||||
printf("Starting Test\n");
|
||||
|
||||
// create the ggml context
|
||||
@@ -126,15 +125,12 @@ int main(int argc, char ** argv) {
|
||||
|
||||
//printf("Memsize required = %i\n", sizex*sizex);
|
||||
|
||||
// TODO: perform the bench for all types or for a user specified type
|
||||
const ggml_type qtype = GGML_TYPE_Q4_1;
|
||||
|
||||
size_t ctx_size = 0;
|
||||
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
|
||||
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
|
||||
ctx_size += sizex*sizez*ggml_type_sizef(GGML_TYPE_F32);
|
||||
ctx_size += sizex*sizey*ggml_type_sizef(qtype);
|
||||
ctx_size += sizex*sizey*ggml_type_sizef(qtype);
|
||||
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_Q4_0);
|
||||
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_Q4_0);
|
||||
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
|
||||
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
|
||||
ctx_size += 1024*1024*16;
|
||||
@@ -167,7 +163,7 @@ int main(int argc, char ** argv) {
|
||||
struct ggml_tensor * m2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizez);
|
||||
ggml_set_f32(m2, 2.0f);
|
||||
|
||||
printf("\n------ Test 1 - Matrix Mult via F32 code\n");
|
||||
printf("\n------ Test 1 - Matrix Mult via F32 code ------------------------------------------------------------------------------\n");
|
||||
// printf("Creating new tensor m11xm2\n");
|
||||
struct ggml_tensor * m11xm2 = ggml_mul_mat(ctx, m11, m2);
|
||||
|
||||
@@ -185,16 +181,17 @@ int main(int argc, char ** argv) {
|
||||
|
||||
TENSOR_DUMP(gf.nodes[0]);
|
||||
|
||||
printf("\n------ Test 2 - Matrix Mult via %s code\n", ggml_type_name(qtype));
|
||||
printf("\n------ Test 2 - Matrix Mult via Q4_0 code ------------------------------------------------------------------------------\n");
|
||||
|
||||
int32_t nelements = sizex*sizey;
|
||||
int32_t ne[2] = { sizex, sizey };
|
||||
|
||||
std::vector<int64_t> hist_cur(1 << 4, 0);
|
||||
|
||||
// Set up a the benchmark matrices
|
||||
// printf("Creating new tensor q11 & Running quantize\n");
|
||||
struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
|
||||
ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements, hist_cur.data());
|
||||
struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey);
|
||||
ggml_quantize_q4_0((const float *) m11->data, q11->data, nelements, ne[0], hist_cur.data());
|
||||
|
||||
// Set up a the compute graph
|
||||
// printf("Creating new tensor q31\n");
|
||||
@@ -205,8 +202,8 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// Set up a second graph computation to make sure we override the CPU cache lines
|
||||
// printf("Creating new tensor q12 & Running quantize\n");
|
||||
struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
|
||||
ggml_quantize_chunk(qtype, (const float *) m12->data, q12->data, 0, nelements, hist_cur.data());
|
||||
struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey);
|
||||
ggml_quantize_q4_0((const float *) m12->data, q12->data, nelements, ne[0], hist_cur.data());
|
||||
|
||||
// printf("Creating new tensor q32\n");
|
||||
struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2);
|
||||
@@ -223,7 +220,7 @@ int main(int argc, char ** argv) {
|
||||
printf("Matrix Multiplication of (%i,%i,%i) x (%i,%i,%i) - about %6.2f gFLOPS\n\n", sizex, sizey, 1, sizex, sizez, 1, 1.0f*flops_per_matrix / 1000 / 1000 / 1000);
|
||||
|
||||
|
||||
// Let's use the F32 result from above as a reference for the quantized multiplication
|
||||
// Let's use the F32 result from above as a reference for the q4_0 multiplication
|
||||
float sum_of_F32_reference = tensor_sum_elements(gf.nodes[0]);
|
||||
|
||||
printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n");
|
||||
@@ -253,7 +250,7 @@ int main(int argc, char ** argv) {
|
||||
// Check that the matrix multiplication result is in the right ballpark
|
||||
// We cannot use the exact value from the F32 multiplication because the quantizuation will be slightly different
|
||||
float sum_of_Q4_result = tensor_sum_elements(gf31.nodes[0]);
|
||||
float delta = std::abs(sum_of_Q4_result - sum_of_F32_reference);
|
||||
float delta = abs(sum_of_Q4_result - sum_of_F32_reference);
|
||||
float allowed_delta = (sum_of_F32_reference) / 1000 / 1000; // Let's accept an epsilon of 10^-6
|
||||
|
||||
if (delta > allowed_delta) {
|
||||
|
||||
@@ -11,6 +11,6 @@ cd ..
#
# "--keep 48" is based on the contents of prompts/chat-with-bob.txt
#
./main -m ./models/llama-7b/ggml-model-q4_0.gguf -c 512 -b 1024 -n 256 --keep 48 \
./main -m ./models/7B/ggml-model-q4_0.bin -c 512 -b 1024 -n 256 --keep 48 \
--repeat_penalty 1.0 --color -i \
-r "User:" -f prompts/chat-with-bob.txt

@@ -1,6 +1,5 @@
|
||||
#include "ggml.h"
|
||||
#include "llama.h"
|
||||
#include "common.h"
|
||||
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
@@ -76,7 +75,7 @@ typedef struct {
|
||||
int seq_len; // max sequence length
|
||||
} Config;
|
||||
|
||||
struct TransformerWeights {
|
||||
typedef struct {
|
||||
// token embedding table
|
||||
float* token_embedding_table; // (vocab_size, dim)
|
||||
// weights for rmsnorms
|
||||
@@ -98,24 +97,9 @@ struct TransformerWeights {
|
||||
// float* freq_cis_imag; // (seq_len, dim/2)
|
||||
// (optional) classifier weights for the logits, on the last layer
|
||||
float* wcls;
|
||||
} TransformerWeights;
|
||||
|
||||
~TransformerWeights() {
|
||||
delete[] token_embedding_table;
|
||||
delete[] rms_att_weight;
|
||||
delete[] rms_ffn_weight;
|
||||
delete[] wq;
|
||||
delete[] wk;
|
||||
delete[] wv;
|
||||
delete[] wo;
|
||||
delete[] w1;
|
||||
delete[] w2;
|
||||
delete[] w3;
|
||||
delete[] rms_final_weight;
|
||||
delete[] wcls;
|
||||
}
|
||||
};
|
||||
|
||||
static void malloc_weights(TransformerWeights* w, Config* p, bool shared_weights) {
|
||||
void malloc_weights(TransformerWeights* w, Config* p, bool shared_weights) {
|
||||
// we calloc instead of malloc to keep valgrind happy
|
||||
w->token_embedding_table = new float[p->vocab_size * p->dim]();
|
||||
printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
|
||||
@@ -158,7 +142,7 @@ static void malloc_weights(TransformerWeights* w, Config* p, bool shared_weights
|
||||
}
|
||||
}
|
||||
|
||||
static int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bool shared_weights) {
|
||||
int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bool shared_weights) {
|
||||
if (fread(w->token_embedding_table, sizeof(float), p->vocab_size * p->dim, f) != static_cast<size_t>(p->vocab_size * p->dim)) return 1;
|
||||
if (fread(w->rms_att_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim)) return 1;
|
||||
if (fread(w->wq, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
|
||||
@@ -189,7 +173,22 @@ static int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bo
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void print_sample_weights(TransformerWeights *w){
|
||||
void free_weights(TransformerWeights* w) {
|
||||
delete w->token_embedding_table;
|
||||
delete w->rms_att_weight;
|
||||
delete w->rms_ffn_weight;
|
||||
delete w->wq;
|
||||
delete w->wk;
|
||||
delete w->wv;
|
||||
delete w->wo;
|
||||
delete w->w1;
|
||||
delete w->w2;
|
||||
delete w->w3;
|
||||
delete w->rms_final_weight;
|
||||
if (w->wcls) delete w->wcls;
|
||||
}
|
||||
|
||||
void print_sample_weights(TransformerWeights *w){
|
||||
printf("----- Quick print of first of the weight vales of all the variables\n");
|
||||
printf("%f\n", w->token_embedding_table[0]);
|
||||
printf("%f\n", w->rms_att_weight[0]);
|
||||
@@ -324,7 +323,7 @@ struct train_params {
|
||||
int mem_compute1_gb;
|
||||
};
|
||||
|
||||
static void print_params(struct my_llama_hparams * params) {
|
||||
void print_params(struct my_llama_hparams * params) {
|
||||
printf("%s: n_vocab: %d\n", __func__, params->n_vocab);
|
||||
printf("%s: n_ctx: %d\n", __func__, params->n_ctx);
|
||||
printf("%s: n_embd: %d\n", __func__, params->n_embd);
|
||||
@@ -335,7 +334,7 @@ static void print_params(struct my_llama_hparams * params) {
|
||||
printf("%s: n_rot: %d\n", __func__, params->n_rot);
|
||||
}
|
||||
|
||||
static void init_model(struct my_llama_model * model) {
|
||||
void init_model(struct my_llama_model * model) {
|
||||
const auto & hparams = model->hparams;
|
||||
|
||||
const uint32_t n_embd = hparams.n_embd;
|
||||
@@ -408,17 +407,17 @@ static void init_model(struct my_llama_model * model) {
|
||||
}
|
||||
}
|
||||
|
||||
static float get_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
|
||||
float get_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
|
||||
float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
|
||||
return *ptr;
|
||||
}
|
||||
|
||||
static int32_t get_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
|
||||
int32_t get_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
|
||||
int32_t * ptr = (int32_t *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
|
||||
return *ptr;
|
||||
}
|
||||
|
||||
static void print_row(struct ggml_tensor * probs, int i) {
|
||||
void print_row(struct ggml_tensor * probs, int i) {
|
||||
for (int k = 0; k < probs->ne[0]; ++k) {
|
||||
float p = get_f32_2d(probs, k, i);
|
||||
printf(" %f", p);
|
||||
@@ -426,7 +425,7 @@ static void print_row(struct ggml_tensor * probs, int i) {
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
static void print_matrix(struct ggml_tensor * probs) {
|
||||
void print_matrix(struct ggml_tensor * probs) {
|
||||
assert(probs->n_dims == 2);
|
||||
for (int i = 0; i < probs->ne[1]; ++i) {
|
||||
for (int k = 0; k < probs->ne[0]; ++k) {
|
||||
@@ -500,10 +499,10 @@ struct llama_file {
|
||||
errno = 0;
|
||||
std::size_t ret = std::fread(ptr, size, 1, fp);
|
||||
if (ferror(fp)) {
|
||||
die_fmt("fread failed: %s", strerror(errno));
|
||||
throw std::runtime_error(format("read error: %s", strerror(errno)));
|
||||
}
|
||||
if (ret != 1) {
|
||||
die("unexpectedly reached end of file");
|
||||
throw std::runtime_error(std::string("unexpectedly reached end of file"));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -531,7 +530,7 @@ struct llama_file {
|
||||
}
|
||||
};
|
||||
|
||||
static bool is_ggml_file(const char * filename) {
|
||||
bool is_ggml_file(const char *filename) {
|
||||
llama_file file(filename, "rb");
|
||||
if (file.size < 4) {
|
||||
return false;
|
||||
@@ -540,7 +539,7 @@ static bool is_ggml_file(const char * filename) {
|
||||
return magic == GGUF_MAGIC;
|
||||
}
|
||||
|
||||
static std::string llama_escape_whitespaces(const std::string & text) {
|
||||
static std::string llama_escape_whitespaces(const std::string& text) {
|
||||
std::ostringstream out;
|
||||
for (char c : text) {
|
||||
if (c == ' ') out << "\xe2\x96\x81";
|
||||
@@ -549,7 +548,7 @@ static std::string llama_escape_whitespaces(const std::string & text) {
|
||||
return out.str();
|
||||
}
|
||||
|
||||
static void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) {
|
||||
void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) {
|
||||
if (is_ggml_file(filename)) {
|
||||
struct ggml_context * ctx_data = NULL;
|
||||
|
||||
@@ -597,9 +596,6 @@ static void load_vocab(const char *filename, Config *config, struct llama_vocab
|
||||
// assume llama2.c vocabulary
|
||||
printf("Assuming llama2.c vocabulary since %s is not a gguf file\n", filename);
|
||||
llama_file file(filename, "rb");
|
||||
if (!file.fp) {
|
||||
die_fmt("%s: %s", strerror(errno), filename);
|
||||
}
|
||||
const int n_vocab = config->vocab_size;
|
||||
/* uint32_t max_token_length = */ file.read_u32(); // unused
|
||||
vocab->id_to_token.resize(n_vocab);
|
||||
@@ -637,7 +633,7 @@ static void load_vocab(const char *filename, Config *config, struct llama_vocab
|
||||
}
|
||||
}
|
||||
|
||||
static void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * karpathy_weights) {
|
||||
void stuff_karpathy_weights_into_gg(struct ggml_tensor * gg_weights, float * karpathy_weights){
|
||||
int ct;
|
||||
switch (gg_weights->n_dims){
|
||||
case 1:
|
||||
@@ -673,37 +669,36 @@ static void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const floa
|
||||
}
|
||||
}
|
||||
|
||||
static void save_as_llama_model(
|
||||
struct llama_vocab * vocab, struct my_llama_model * model, TransformerWeights* w, const char * filename
|
||||
) {
|
||||
// convert AK weights into GG weights one by one.
|
||||
void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * model, TransformerWeights* w, const char * filename) {
|
||||
// stuff AK weights into GG weights one by one.
|
||||
// w->token_embedding_table -> model->tok_embeddings
|
||||
// float* -> struct ggml_tensor
|
||||
convert_weights_ak_to_gg(model->tok_embeddings, w->token_embedding_table);
|
||||
convert_weights_ak_to_gg(model->output, w->wcls ? w->wcls : w->token_embedding_table);
|
||||
stuff_karpathy_weights_into_gg(model->tok_embeddings, w->token_embedding_table);
|
||||
stuff_karpathy_weights_into_gg(model->output, w->wcls ? w->wcls : w->token_embedding_table);
|
||||
|
||||
convert_weights_ak_to_gg(model->norm, w->rms_final_weight);
|
||||
stuff_karpathy_weights_into_gg(model->norm, w->rms_final_weight);
|
||||
//print_row(model->norm, 0);
|
||||
|
||||
// for rms-att-weight
|
||||
int row_length = model->hparams.n_embd;
|
||||
const auto & hparams = model->hparams;
|
||||
int n_ff = model->hparams.n_ff;
|
||||
|
||||
for (uint32_t i = 0; i < model->hparams.n_layer; ++i){
|
||||
auto & layer = model->layers[i];
|
||||
// 1d
|
||||
convert_weights_ak_to_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]);
|
||||
convert_weights_ak_to_gg(layer.ffn_norm , &w->rms_ffn_weight[i*row_length]);
|
||||
stuff_karpathy_weights_into_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]);
|
||||
stuff_karpathy_weights_into_gg(layer.ffn_norm , &w->rms_ffn_weight[i*row_length]);
|
||||
|
||||
// from 3d matrix layer x dim x dim to 2d matrix dim x dim
|
||||
convert_weights_ak_to_gg(layer.wq , &w->wq[i*row_length*row_length]);
|
||||
convert_weights_ak_to_gg(layer.wk , &w->wk[i*row_length*row_length]);
|
||||
convert_weights_ak_to_gg(layer.wv , &w->wv[i*row_length*row_length]);
|
||||
convert_weights_ak_to_gg(layer.wo , &w->wo[i*row_length*row_length]);
|
||||
stuff_karpathy_weights_into_gg(layer.wq , &w->wq[i*row_length*row_length]);
|
||||
stuff_karpathy_weights_into_gg(layer.wk , &w->wk[i*row_length*row_length]);
|
||||
stuff_karpathy_weights_into_gg(layer.wv , &w->wv[i*row_length*row_length]);
|
||||
stuff_karpathy_weights_into_gg(layer.wo , &w->wo[i*row_length*row_length]);
|
||||
|
||||
convert_weights_ak_to_gg(layer.w1 , &w->w1[i*row_length*n_ff]);
|
||||
convert_weights_ak_to_gg(layer.w2 , &w->w2[i*n_ff*row_length]);
|
||||
convert_weights_ak_to_gg(layer.w3 , &w->w3[i*row_length*n_ff]);
|
||||
stuff_karpathy_weights_into_gg(layer.w1 , &w->w1[i*row_length*n_ff]);
|
||||
stuff_karpathy_weights_into_gg(layer.w2 , &w->w2[i*n_ff*row_length]);
|
||||
stuff_karpathy_weights_into_gg(layer.w3 , &w->w3[i*row_length*n_ff]);
|
||||
}
|
||||
|
||||
struct gguf_context * ctx = gguf_init_empty();
|
||||
@@ -787,7 +782,7 @@ static void save_as_llama_model(
|
||||
gguf_free(ctx);
|
||||
}
|
||||
|
||||
static struct train_params get_default_train_params() {
|
||||
struct train_params get_default_train_params() {
|
||||
struct train_params params;
|
||||
params.fn_vocab_model = "models/7B/ggml-model-f16.gguf";
|
||||
params.fn_llama2c_output_model = "ak_llama_model.bin";
|
||||
@@ -837,7 +832,7 @@ static struct train_params get_default_train_params() {
|
||||
return params;
|
||||
}
|
||||
|
||||
static void print_usage(int /*argc*/, char ** argv, const struct train_params * params) {
|
||||
void print_usage(int /*argc*/, char ** argv, const struct train_params * params) {
|
||||
fprintf(stderr, "usage: %s [options]\n", argv[0]);
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "options:\n");
|
||||
@@ -848,7 +843,7 @@ static void print_usage(int /*argc*/, char ** argv, const struct train_params *
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
||||
static bool params_parse(int argc, char ** argv, struct train_params * params) {
|
||||
bool params_parse(int argc, char ** argv, struct train_params * params) {
|
||||
bool invalid_param = false;
|
||||
bool reqd_param_found = false;
|
||||
std::string arg;
|
||||
@@ -903,8 +898,8 @@ static bool params_parse(int argc, char ** argv, struct train_params * params) {
|
||||
return true;
|
||||
}
|
||||
|
||||
static std::string basename(const std::string &path) {
|
||||
size_t pos = path.find_last_of("/\\");
|
||||
std::string basename(const std::string &path) {
|
||||
size_t pos = path.find_last_of("/");
|
||||
if (pos == std::string::npos) {
|
||||
return path;
|
||||
}
|
||||
@@ -917,7 +912,7 @@ int main(int argc, char ** argv) {
|
||||
return 1;
|
||||
}
|
||||
Config config;
|
||||
TransformerWeights weights = {};
|
||||
TransformerWeights weights;
|
||||
{
|
||||
FILE *file = fopen(params.fn_llama2c_model, "rb");
|
||||
if (!file) { printf("Unable to open the checkpoint file %s!\n", params.fn_llama2c_model); return 1; }
|
||||
@@ -959,5 +954,6 @@ int main(int argc, char ** argv) {
|
||||
printf("Saving llama.c model file %s in ggml format at %s\n", params.fn_llama2c_model, params.fn_llama2c_output_model);
|
||||
|
||||
ggml_free(model.ctx);
|
||||
free_weights(&weights);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -1,5 +1,8 @@
#include "build-info.h"
#include "common.h"
// Defines sigaction on msys:
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif

#include "embd-input.h"

#include <cassert>
@@ -20,11 +23,11 @@ extern "C" {
|
||||
struct MyModel* create_mymodel(int argc, char ** argv) {
|
||||
gpt_params params;
|
||||
|
||||
if (!gpt_params_parse(argc, argv, params)) {
|
||||
if (gpt_params_parse(argc, argv, params) == false) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
print_build_info();
|
||||
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
|
||||
|
||||
if (params.seed == LLAMA_DEFAULT_SEED) {
|
||||
params.seed = uint32_t(time(NULL));
|
||||
|
||||
@@ -3,6 +3,7 @@

#include "common.h"
#include "llama.h"
#include "build-info.h"

extern "C" {


@@ -1,6 +1,6 @@
|
||||
#include "build-info.h"
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
#include "build-info.h"
|
||||
|
||||
#include <ctime>
|
||||
|
||||
@@ -11,13 +11,18 @@
|
||||
int main(int argc, char ** argv) {
|
||||
gpt_params params;
|
||||
|
||||
if (!gpt_params_parse(argc, argv, params)) {
|
||||
if (gpt_params_parse(argc, argv, params) == false) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
params.embedding = true;
|
||||
|
||||
print_build_info();
|
||||
if (params.n_ctx > 2048) {
|
||||
fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);"
|
||||
"expect poor results\n", __func__, params.n_ctx);
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
|
||||
|
||||
if (params.seed == LLAMA_DEFAULT_SEED) {
|
||||
params.seed = time(NULL);
|
||||
@@ -42,12 +47,6 @@ int main(int argc, char ** argv) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
const int n_ctx_train = llama_n_ctx_train(ctx);
|
||||
if (params.n_ctx > n_ctx_train) {
|
||||
fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
|
||||
__func__, n_ctx_train, params.n_ctx);
|
||||
}
|
||||
|
||||
// print system information
|
||||
{
|
||||
fprintf(stderr, "\n");
|
||||
|
||||
@@ -1,5 +0,0 @@
set(TARGET gguf)
add_executable(${TARGET} gguf.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
@@ -13,14 +13,14 @@
|
||||
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
||||
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||
|
||||
template <typename T>
|
||||
template<typename T>
|
||||
static std::string to_string(const T & val) {
|
||||
std::stringstream ss;
|
||||
ss << val;
|
||||
return ss.str();
|
||||
}
|
||||
|
||||
static bool gguf_ex_write(const std::string & fname) {
|
||||
bool gguf_ex_write(const std::string & fname) {
|
||||
struct gguf_context * ctx = gguf_init_empty();
|
||||
|
||||
gguf_set_val_u8 (ctx, "some.parameter.uint8", 0x12);
|
||||
@@ -76,7 +76,7 @@ static bool gguf_ex_write(const std::string & fname) {
|
||||
|
||||
gguf_write_to_file(ctx, fname.c_str(), false);
|
||||
|
||||
printf("%s: wrote file '%s;\n", __func__, fname.c_str());
|
||||
fprintf(stdout, "%s: wrote file '%s;\n", __func__, fname.c_str());
|
||||
|
||||
ggml_free(ctx_data);
|
||||
gguf_free(ctx);
|
||||
@@ -85,7 +85,7 @@ static bool gguf_ex_write(const std::string & fname) {
|
||||
}
|
||||
|
||||
// just read tensor info
|
||||
static bool gguf_ex_read_0(const std::string & fname) {
|
||||
bool gguf_ex_read_0(const std::string & fname) {
|
||||
struct gguf_init_params params = {
|
||||
/*.no_alloc = */ false,
|
||||
/*.ctx = */ NULL,
|
||||
@@ -93,20 +93,20 @@ static bool gguf_ex_read_0(const std::string & fname) {
|
||||
|
||||
struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
|
||||
|
||||
printf("%s: version: %d\n", __func__, gguf_get_version(ctx));
|
||||
printf("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
|
||||
printf("%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx));
|
||||
fprintf(stdout, "%s: version: %d\n", __func__, gguf_get_version(ctx));
|
||||
fprintf(stdout, "%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
|
||||
fprintf(stdout, "%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx));
|
||||
|
||||
// kv
|
||||
{
|
||||
const int n_kv = gguf_get_n_kv(ctx);
|
||||
|
||||
printf("%s: n_kv: %d\n", __func__, n_kv);
|
||||
fprintf(stdout, "%s: n_kv: %d\n", __func__, n_kv);
|
||||
|
||||
for (int i = 0; i < n_kv; ++i) {
|
||||
const char * key = gguf_get_key(ctx, i);
|
||||
|
||||
printf("%s: kv[%d]: key = %s\n", __func__, i, key);
|
||||
fprintf(stdout, "%s: kv[%d]: key = %s\n", __func__, i, key);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -116,10 +116,10 @@ static bool gguf_ex_read_0(const std::string & fname) {
|
||||
|
||||
const int keyidx = gguf_find_key(ctx, findkey);
|
||||
if (keyidx == -1) {
|
||||
printf("%s: find key: %s not found.\n", __func__, findkey);
|
||||
fprintf(stdout, "%s: find key: %s not found.\n", __func__, findkey);
|
||||
} else {
|
||||
const char * key_value = gguf_get_val_str(ctx, keyidx);
|
||||
printf("%s: find key: %s found, kv[%d] value = %s\n", __func__, findkey, keyidx, key_value);
|
||||
fprintf(stdout, "%s: find key: %s found, kv[%d] value = %s\n", __func__, findkey, keyidx, key_value);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -127,13 +127,13 @@ static bool gguf_ex_read_0(const std::string & fname) {
|
||||
{
|
||||
const int n_tensors = gguf_get_n_tensors(ctx);
|
||||
|
||||
printf("%s: n_tensors: %d\n", __func__, n_tensors);
|
||||
fprintf(stdout, "%s: n_tensors: %d\n", __func__, n_tensors);
|
||||
|
||||
for (int i = 0; i < n_tensors; ++i) {
|
||||
const char * name = gguf_get_tensor_name (ctx, i);
|
||||
const size_t offset = gguf_get_tensor_offset(ctx, i);
|
||||
|
||||
printf("%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
|
||||
fprintf(stdout, "%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -143,7 +143,7 @@ static bool gguf_ex_read_0(const std::string & fname) {
|
||||
}
|
||||
|
||||
// read and create ggml_context containing the tensors and their data
|
||||
static bool gguf_ex_read_1(const std::string & fname) {
|
||||
bool gguf_ex_read_1(const std::string & fname) {
|
||||
struct ggml_context * ctx_data = NULL;
|
||||
|
||||
struct gguf_init_params params = {
|
||||
@@ -153,20 +153,20 @@ static bool gguf_ex_read_1(const std::string & fname) {
|
||||
|
||||
struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
|
||||
|
||||
printf("%s: version: %d\n", __func__, gguf_get_version(ctx));
|
||||
printf("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
|
||||
printf("%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx));
|
||||
fprintf(stdout, "%s: version: %d\n", __func__, gguf_get_version(ctx));
|
||||
fprintf(stdout, "%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
|
||||
fprintf(stdout, "%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx));
|
||||
|
||||
// kv
|
||||
{
|
||||
const int n_kv = gguf_get_n_kv(ctx);
|
||||
|
||||
printf("%s: n_kv: %d\n", __func__, n_kv);
|
||||
fprintf(stdout, "%s: n_kv: %d\n", __func__, n_kv);
|
||||
|
||||
for (int i = 0; i < n_kv; ++i) {
|
||||
const char * key = gguf_get_key(ctx, i);
|
||||
|
||||
printf("%s: kv[%d]: key = %s\n", __func__, i, key);
|
||||
fprintf(stdout, "%s: kv[%d]: key = %s\n", __func__, i, key);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -174,13 +174,13 @@ static bool gguf_ex_read_1(const std::string & fname) {
|
||||
{
|
||||
const int n_tensors = gguf_get_n_tensors(ctx);
|
||||
|
||||
printf("%s: n_tensors: %d\n", __func__, n_tensors);
|
||||
fprintf(stdout, "%s: n_tensors: %d\n", __func__, n_tensors);
|
||||
|
||||
for (int i = 0; i < n_tensors; ++i) {
|
||||
const char * name = gguf_get_tensor_name (ctx, i);
|
||||
const size_t offset = gguf_get_tensor_offset(ctx, i);
|
||||
|
||||
printf("%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
|
||||
fprintf(stdout, "%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -189,13 +189,13 @@ static bool gguf_ex_read_1(const std::string & fname) {
|
||||
const int n_tensors = gguf_get_n_tensors(ctx);
|
||||
|
||||
for (int i = 0; i < n_tensors; ++i) {
|
||||
printf("%s: reading tensor %d data\n", __func__, i);
|
||||
fprintf(stdout, "%s: reading tensor %d data\n", __func__, i);
|
||||
|
||||
const char * name = gguf_get_tensor_name(ctx, i);
|
||||
|
||||
struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
|
||||
|
||||
printf("%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, cur->n_dims, cur->name, cur->data);
|
||||
fprintf(stdout, "%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, cur->n_dims, cur->name, cur->data);
|
||||
|
||||
// print first 10 elements
|
||||
const float * data = (const float *) cur->data;
|
||||
@@ -219,7 +219,7 @@ static bool gguf_ex_read_1(const std::string & fname) {
|
||||
}
|
||||
}
|
||||
|
||||
printf("%s: ctx_data size: %zu\n", __func__, ggml_get_mem_size(ctx_data));
|
||||
fprintf(stdout, "%s: ctx_data size: %zu\n", __func__, ggml_get_mem_size(ctx_data));
|
||||
|
||||
ggml_free(ctx_data);
|
||||
gguf_free(ctx);
|
||||
@@ -229,7 +229,7 @@ static bool gguf_ex_read_1(const std::string & fname) {
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
if (argc < 3) {
|
||||
printf("usage: %s data.gguf r|w\n", argv[0]);
|
||||
fprintf(stdout, "usage: %s data.gguf r|w\n", argv[0]);
|
||||
return -1;
|
||||
}
|
||||
|
||||
|
||||
@@ -305,9 +305,9 @@ struct ggml_tensor * get_tensor_ex( struct ggml_context * ctx, std::string name)
|
||||
|
||||
struct ggml_tensor * cur = ggml_get_tensor(ctx, name.c_str());
|
||||
if( cur == NULL ) {
|
||||
printf("%s: tensor '%s' not found!\n", __func__, name.c_str());
|
||||
fprintf(stdout, "%s: tensor '%s' not found!\n", __func__, name.c_str());
|
||||
} else {
|
||||
// printf("%s: n_dims = %d, name = '%s'\n", __func__, cur->n_dims, cur->name);
|
||||
// fprintf(stdout, "%s: n_dims = %d, name = '%s'\n", __func__, cur->n_dims, cur->name);
|
||||
}
|
||||
|
||||
return cur;
|
||||
@@ -333,21 +333,21 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt2bpe_
|
||||
return false;
|
||||
}
|
||||
|
||||
printf("%s: gguf version = %d\n", __func__, gguf_get_version(ggufctx));
|
||||
printf("%s: gguf alignment = %zu\n", __func__, gguf_get_alignment(ggufctx));
|
||||
printf("%s: gguf data offset = %zu\n", __func__, gguf_get_data_offset(ggufctx));
|
||||
fprintf(stdout, "%s: gguf version = %d\n", __func__, gguf_get_version(ggufctx));
|
||||
fprintf(stdout, "%s: gguf alignment = %zu\n", __func__, gguf_get_alignment(ggufctx));
|
||||
fprintf(stdout, "%s: gguf data offset = %zu\n", __func__, gguf_get_data_offset(ggufctx));
|
||||
|
||||
// print all kv
|
||||
#if 0
|
||||
{
|
||||
const int n_kv = gguf_get_n_kv(ggufctx);
|
||||
|
||||
printf("%s: n_kv: %d\n", __func__, n_kv);
|
||||
fprintf(stdout, "%s: n_kv: %d\n", __func__, n_kv);
|
||||
|
||||
for (int i = 0; i < n_kv; ++i) {
|
||||
const char * key = gguf_get_key(ggufctx, i);
|
||||
|
||||
printf("%s: kv[%d]: key = %s\n", __func__, i, key);
|
||||
fprintf(stdout, "%s: kv[%d]: key = %s\n", __func__, i, key);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
@@ -357,21 +357,21 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt2bpe_
|
||||
int keyidx;
|
||||
|
||||
keyidx = gguf_find_key(ggufctx, "general.name");
|
||||
if (keyidx != -1) { printf("%s: model name = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
|
||||
if (keyidx != -1) { fprintf(stdout, "%s: model name = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
|
||||
keyidx = gguf_find_key(ggufctx, "general.description");
|
||||
if (keyidx != -1) { printf("%s: model description = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
|
||||
if (keyidx != -1) { fprintf(stdout, "%s: model description = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
|
||||
keyidx = gguf_find_key(ggufctx, "general.author");
|
||||
if (keyidx != -1) { printf("%s: model author = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
|
||||
if (keyidx != -1) { fprintf(stdout, "%s: model author = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
|
||||
keyidx = gguf_find_key(ggufctx, "general.license");
|
||||
if (keyidx != -1) { printf("%s: model license = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
|
||||
if (keyidx != -1) { fprintf(stdout, "%s: model license = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
|
||||
keyidx = gguf_find_key(ggufctx, "general.architecture");
|
||||
if (keyidx != -1) { printf("%s: model architecture = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
|
||||
if (keyidx != -1) { fprintf(stdout, "%s: model architecture = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
|
||||
keyidx = gguf_find_key(ggufctx, "general.file_type");
|
||||
if (keyidx != -1) { printf("%s: model file type = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
|
||||
if (keyidx != -1) { fprintf(stdout, "%s: model file type = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
|
||||
keyidx = gguf_find_key(ggufctx, "gptneox.tensor_data_layout");
|
||||
if (keyidx != -1) { printf("%s: model data layout = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
|
||||
if (keyidx != -1) { fprintf(stdout, "%s: model data layout = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
|
||||
keyidx = gguf_find_key(ggufctx, "general.source.hugginface.repository");
|
||||
if (keyidx != -1) { printf("%s: model source HF repo = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
|
||||
if (keyidx != -1) { fprintf(stdout, "%s: model source HF repo = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
|
||||
}
|
||||
|
||||
// check required metadata
|
||||
@@ -382,11 +382,11 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt2bpe_
|
||||
keyidx = gguf_find_key(ggufctx, "general.architecture");
|
||||
if (keyidx != -1) {
|
||||
if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "falcon") != 0) {
|
||||
printf("%s: model architecture not supported!\n", __func__);
|
||||
fprintf(stdout, "%s: model architecture not supported!\n", __func__);
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
printf("%s: gguf model architecture not found!\n", __func__);
|
||||
fprintf(stdout, "%s: gguf model architecture not found!\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -394,11 +394,11 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt2bpe_
|
||||
keyidx = gguf_find_key(ggufctx, "falcon.tensor_data_layout");
|
||||
if (keyidx != -1) {
|
||||
if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "jploski") != 0) {
|
||||
printf("%s: model tensor data layout not supported!\n", __func__);
|
||||
fprintf(stdout, "%s: model tensor data layout not supported!\n", __func__);
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
printf("%s: gguf model tensor data layout not found!\n", __func__);
|
||||
fprintf(stdout, "%s: gguf model tensor data layout not found!\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -455,11 +455,11 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt2bpe_
|
||||
|
||||
if (keyidx != -1) {
|
||||
if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "gpt2") != 0) {
|
||||
printf("%s: tokenizer model not supported!\n", __func__);
|
||||
fprintf(stdout, "%s: tokenizer model not supported!\n", __func__);
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
printf("%s: tokenizer model not found!\n", __func__);
|
||||
fprintf(stdout, "%s: tokenizer model not found!\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -467,22 +467,22 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt2bpe_
|
||||
int tokens_keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.tokens");
|
||||
|
||||
if (tokens_keyidx == -1) {
|
||||
printf("%s: gpt2 tokenizer vocab not found!\n", __func__);
|
||||
fprintf(stdout, "%s: gpt2 tokenizer vocab not found!\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
int merges_keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.merges");
|
||||
|
||||
if (merges_keyidx == -1) {
|
||||
printf("%s: gpt2 tokenizer merges not found!\n", __func__);
|
||||
fprintf(stdout, "%s: gpt2 tokenizer merges not found!\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
hparams.n_vocab = gguf_get_arr_n(ggufctx,tokens_keyidx);
|
||||
hparams.n_merges = gguf_get_arr_n(ggufctx,merges_keyidx);
|
||||
|
||||
printf("%s: gpt2 tokenizer vocab = %zu\n", __func__, hparams.n_vocab);
|
||||
printf("%s: gpt2 tokenizer merges = %zu\n", __func__, hparams.n_merges);
|
||||
fprintf(stdout, "%s: gpt2 tokenizer vocab = %zu\n", __func__, hparams.n_vocab);
|
||||
fprintf(stdout, "%s: gpt2 tokenizer merges = %zu\n", __func__, hparams.n_merges);
|
||||
|
||||
for (size_t i = 0; i < hparams.n_vocab; i++) {
|
||||
std::string word = gguf_get_arr_str(ggufctx, tokens_keyidx, i);
|
||||
@@ -523,12 +523,12 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt2bpe_
|
||||
keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.separator_token_id"); if( keyidx != -1 ) { vocab.special_sep_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); }
|
||||
keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.padding_token_id"); if( keyidx != -1 ) { vocab.special_pad_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); }
|
||||
|
||||
if( vocab.special_bos_id != -1 ) { printf("%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].c_str() ); }
|
||||
if( vocab.special_eos_id != -1 ) { printf("%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].c_str() ); }
|
||||
if( vocab.special_unk_id != -1 ) { printf("%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].c_str() ); }
|
||||
if( vocab.special_sep_id != -1 ) { printf("%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].c_str() ); }
|
||||
if( vocab.special_pad_id != -1 ) { printf("%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].c_str() ); }
|
||||
if( vocab.linefeed_id != -1 ) { printf("%s: LF token = %d\n", __func__, vocab.linefeed_id ); }
|
||||
if( vocab.special_bos_id != -1 ) { fprintf(stdout, "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].c_str() ); }
|
||||
if( vocab.special_eos_id != -1 ) { fprintf(stdout, "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].c_str() ); }
|
||||
if( vocab.special_unk_id != -1 ) { fprintf(stdout, "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].c_str() ); }
|
||||
if( vocab.special_sep_id != -1 ) { fprintf(stdout, "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].c_str() ); }
|
||||
if( vocab.special_pad_id != -1 ) { fprintf(stdout, "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].c_str() ); }
|
||||
if( vocab.linefeed_id != -1 ) { fprintf(stdout, "%s: LF token = %d\n", __func__, vocab.linefeed_id ); }
|
||||
|
||||
}
|
||||
|
||||
@@ -543,13 +543,13 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt2bpe_
|
||||
{
|
||||
const int n_tensors = gguf_get_n_tensors(ggufctx);
|
||||
|
||||
printf("%s: n_tensors: %d\n", __func__, n_tensors);
|
||||
fprintf(stdout, "%s: n_tensors: %d\n", __func__, n_tensors);
|
||||
|
||||
for (int i = 0; i < n_tensors; ++i) {
|
||||
const char * name = gguf_get_tensor_name (ggufctx, i);
|
||||
const size_t offset = gguf_get_tensor_offset(ggufctx, i);
|
||||
|
||||
printf("%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
|
||||
fprintf(stdout, "%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
@@ -953,7 +953,7 @@ int main(int argc, char ** argv) {

gpt_params params;

if (!gpt_params_parse(argc, argv, params)) {
if (gpt_params_parse(argc, argv, params) == false) {
return 1;
}

@@ -318,9 +318,9 @@ struct ggml_tensor * get_tensor_ex( struct ggml_context * ctx, std::string name)
|
||||
|
||||
struct ggml_tensor * cur = ggml_get_tensor(ctx, name.c_str());
|
||||
if( cur == NULL ) {
|
||||
printf("%s: tensor '%s' not found!\n", __func__, name.c_str());
|
||||
fprintf(stdout, "%s: tensor '%s' not found!\n", __func__, name.c_str());
|
||||
} else {
|
||||
// printf("%s: n_dims = %d, name = '%s'\n", __func__, cur->n_dims, cur->name);
|
||||
// fprintf(stdout, "%s: n_dims = %d, name = '%s'\n", __func__, cur->n_dims, cur->name);
|
||||
}
|
||||
|
||||
return cur;
|
||||
@@ -346,21 +346,21 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
|
||||
return false;
|
||||
}
|
||||
|
||||
printf("%s: gguf version = %d\n", __func__, gguf_get_version(ggufctx));
|
||||
printf("%s: gguf alignment = %zu\n", __func__, gguf_get_alignment(ggufctx));
|
||||
printf("%s: gguf data offset = %zu\n", __func__, gguf_get_data_offset(ggufctx));
|
||||
fprintf(stdout, "%s: gguf version = %d\n", __func__, gguf_get_version(ggufctx));
|
||||
fprintf(stdout, "%s: gguf alignment = %zu\n", __func__, gguf_get_alignment(ggufctx));
|
||||
fprintf(stdout, "%s: gguf data offset = %zu\n", __func__, gguf_get_data_offset(ggufctx));
|
||||
|
||||
// print all kv
|
||||
#if 0
|
||||
{
|
||||
const int n_kv = gguf_get_n_kv(ggufctx);
|
||||
|
||||
printf("%s: n_kv: %d\n", __func__, n_kv);
|
||||
fprintf(stdout, "%s: n_kv: %d\n", __func__, n_kv);
|
||||
|
||||
for (int i = 0; i < n_kv; ++i) {
|
||||
const char * key = gguf_get_key(ggufctx, i);
|
||||
|
||||
printf("%s: kv[%d]: key = %s\n", __func__, i, key);
|
||||
fprintf(stdout, "%s: kv[%d]: key = %s\n", __func__, i, key);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
@@ -370,21 +370,21 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
|
||||
int keyidx;
|
||||
|
||||
keyidx = gguf_find_key(ggufctx, "general.name");
|
||||
if (keyidx != -1) { printf("%s: model name = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
|
||||
if (keyidx != -1) { fprintf(stdout, "%s: model name = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
|
||||
keyidx = gguf_find_key(ggufctx, "general.description");
|
||||
if (keyidx != -1) { printf("%s: model description = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
|
||||
if (keyidx != -1) { fprintf(stdout, "%s: model description = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
|
||||
keyidx = gguf_find_key(ggufctx, "general.author");
|
||||
if (keyidx != -1) { printf("%s: model author = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
|
||||
if (keyidx != -1) { fprintf(stdout, "%s: model author = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
|
||||
keyidx = gguf_find_key(ggufctx, "general.license");
|
||||
if (keyidx != -1) { printf("%s: model license = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
|
||||
if (keyidx != -1) { fprintf(stdout, "%s: model license = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
|
||||
keyidx = gguf_find_key(ggufctx, "general.architecture");
|
||||
if (keyidx != -1) { printf("%s: model architecture = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
|
||||
if (keyidx != -1) { fprintf(stdout, "%s: model architecture = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
|
||||
keyidx = gguf_find_key(ggufctx, "general.file_type");
|
||||
if (keyidx != -1) { printf("%s: model file type = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
|
||||
if (keyidx != -1) { fprintf(stdout, "%s: model file type = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
|
||||
keyidx = gguf_find_key(ggufctx, "gptneox.tensor_data_layout");
|
||||
if (keyidx != -1) { printf("%s: model data layout = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
|
||||
if (keyidx != -1) { fprintf(stdout, "%s: model data layout = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
|
||||
keyidx = gguf_find_key(ggufctx, "general.source.hugginface.repository");
|
||||
if (keyidx != -1) { printf("%s: model source HF repo = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
|
||||
if (keyidx != -1) { fprintf(stdout, "%s: model source HF repo = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
|
||||
}
|
||||
|
||||
// check required metadata
|
||||
@@ -395,11 +395,11 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
|
||||
keyidx = gguf_find_key(ggufctx, "general.architecture");
|
||||
if (keyidx != -1) {
|
||||
if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "gptneox") != 0) {
|
||||
printf("%s: model architecture not supported!\n", __func__);
|
||||
fprintf(stdout, "%s: model architecture not supported!\n", __func__);
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
printf("%s: gguf model architecture not found!\n", __func__);
|
||||
fprintf(stdout, "%s: gguf model architecture not found!\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -456,11 +456,11 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
|
||||
|
||||
if (keyidx != -1) {
|
||||
if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "gpt2") != 0) {
|
||||
printf("%s: tokenizer model not supported!\n", __func__);
|
||||
fprintf(stdout, "%s: tokenizer model not supported!\n", __func__);
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
printf("%s: tokenizer model not found!\n", __func__);
|
||||
fprintf(stdout, "%s: tokenizer model not found!\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -468,22 +468,22 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
|
||||
int tokens_keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.tokens");
|
||||
|
||||
if (tokens_keyidx == -1) {
|
||||
printf("%s: gpt2 tokenizer vocab not found!\n", __func__);
|
||||
fprintf(stdout, "%s: gpt2 tokenizer vocab not found!\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
int merges_keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.merges");
|
||||
|
||||
if (merges_keyidx == -1) {
|
||||
printf("%s: gpt2 tokenizer merges not found!\n", __func__);
|
||||
fprintf(stdout, "%s: gpt2 tokenizer merges not found!\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
hparams.n_vocab = gguf_get_arr_n(ggufctx,tokens_keyidx);
|
||||
hparams.n_merges = gguf_get_arr_n(ggufctx,merges_keyidx);
|
||||
|
||||
printf("%s: gpt2 tokenizer vocab = %zu\n", __func__, hparams.n_vocab);
|
||||
printf("%s: gpt2 tokenizer merges = %zu\n", __func__, hparams.n_merges);
|
||||
fprintf(stdout, "%s: gpt2 tokenizer vocab = %zu\n", __func__, hparams.n_vocab);
|
||||
fprintf(stdout, "%s: gpt2 tokenizer merges = %zu\n", __func__, hparams.n_merges);
|
||||
|
||||
for (size_t i = 0; i < hparams.n_vocab; i++) {
|
||||
std::string word = gguf_get_arr_str(ggufctx, tokens_keyidx, i);
|
||||
@@ -524,12 +524,12 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
|
||||
keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.separator_token_id"); if( keyidx != -1 ) { vocab.special_sep_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); }
|
||||
keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.padding_token_id"); if( keyidx != -1 ) { vocab.special_pad_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); }
|
||||
|
||||
if( vocab.special_bos_id != -1 ) { printf("%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].c_str() ); }
|
||||
if( vocab.special_eos_id != -1 ) { printf("%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].c_str() ); }
|
||||
if( vocab.special_unk_id != -1 ) { printf("%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].c_str() ); }
|
||||
if( vocab.special_sep_id != -1 ) { printf("%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].c_str() ); }
|
||||
if( vocab.special_pad_id != -1 ) { printf("%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].c_str() ); }
|
||||
if( vocab.linefeed_id != -1 ) { printf("%s: LF token = %d\n", __func__, vocab.linefeed_id ); }
|
||||
if( vocab.special_bos_id != -1 ) { fprintf(stdout, "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].c_str() ); }
|
||||
if( vocab.special_eos_id != -1 ) { fprintf(stdout, "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].c_str() ); }
|
||||
if( vocab.special_unk_id != -1 ) { fprintf(stdout, "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].c_str() ); }
|
||||
if( vocab.special_sep_id != -1 ) { fprintf(stdout, "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].c_str() ); }
|
||||
if( vocab.special_pad_id != -1 ) { fprintf(stdout, "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].c_str() ); }
|
||||
if( vocab.linefeed_id != -1 ) { fprintf(stdout, "%s: LF token = %d\n", __func__, vocab.linefeed_id ); }
|
||||
}
|
||||
|
||||
|
||||
@@ -543,13 +543,13 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
|
||||
{
|
||||
const int n_tensors = gguf_get_n_tensors(ggufctx);
|
||||
|
||||
printf("%s: n_tensors: %d\n", __func__, n_tensors);
|
||||
fprintf(stdout, "%s: n_tensors: %d\n", __func__, n_tensors);
|
||||
|
||||
for (int i = 0; i < n_tensors; ++i) {
|
||||
const char * name = gguf_get_tensor_name (ggufctx, i);
|
||||
const size_t offset = gguf_get_tensor_offset(ggufctx, i);
|
||||
|
||||
printf("%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
|
||||
fprintf(stdout, "%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
@@ -660,10 +660,9 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
|
||||
ggml_tensor * gpt_neox_ff(
|
||||
const gpt_neox_block &block,
|
||||
ggml_context * ctx0,
|
||||
ggml_tensor * inp,
|
||||
const gpt_neox_hparams &hparams) {
|
||||
ggml_tensor * inp) {
|
||||
|
||||
ggml_tensor * cur = ggml_norm(ctx0, inp, hparams.norm_eps);
|
||||
ggml_tensor * cur = ggml_norm(ctx0, inp);
|
||||
|
||||
cur = ggml_add(ctx0, ggml_mul(ctx0, ggml_repeat(ctx0, block.ln_2_g, cur), cur), ggml_repeat(ctx0, block.ln_2_b, cur));
|
||||
cur = ggml_mul_mat(ctx0, block.c_mlp_fc_w, cur);
|
||||
@@ -754,7 +753,7 @@ bool gpt_neox_eval(
|
||||
// self-attention
|
||||
{
|
||||
{
|
||||
cur = ggml_norm(ctx0, inpL, hparams.norm_eps);
|
||||
cur = ggml_norm(ctx0, inpL);
|
||||
|
||||
cur = ggml_add(ctx0,
|
||||
ggml_mul(ctx0, ggml_repeat(ctx0, model.blocks[il].ln_1_g, cur), cur),
|
||||
@@ -845,7 +844,7 @@ bool gpt_neox_eval(
|
||||
if (hparams.par_res == 0) {
|
||||
struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpL);
|
||||
|
||||
cur = gpt_neox_ff(model.blocks[il], ctx0, inpFF, hparams);
|
||||
cur = gpt_neox_ff(model.blocks[il], ctx0, inpFF);
|
||||
|
||||
// input for next layer
|
||||
inpL = ggml_add(ctx0, cur, inpFF);
|
||||
@@ -854,7 +853,7 @@ bool gpt_neox_eval(
|
||||
|
||||
// this is independent of the self-attention result, so it could be done in parallel to the self-attention
|
||||
// note here we pass inpL instead of cur
|
||||
cur = gpt_neox_ff(model.blocks[il], ctx0, inpL, hparams);
|
||||
cur = gpt_neox_ff(model.blocks[il], ctx0, inpL);
|
||||
|
||||
// layer input + FF
|
||||
cur = ggml_add(ctx0, cur, inpFF);
|
||||
@@ -868,7 +867,7 @@ bool gpt_neox_eval(
|
||||
|
||||
// norm
|
||||
{
|
||||
inpL = ggml_norm(ctx0, inpL, hparams.norm_eps);
|
||||
inpL = ggml_norm(ctx0, inpL);
|
||||
|
||||
// inpL = ln_f_g*inpL + ln_f_b
|
||||
inpL = ggml_add(ctx0,
|
||||
@@ -925,7 +924,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
gpt_params params;
|
||||
|
||||
if (!gpt_params_parse(argc, argv, params)) {
|
||||
if (gpt_params_parse(argc, argv, params) == false) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
||||
examples/llama-bench/llama-bench.cpp (62 changes; Normal file → Executable file)
@@ -3,9 +3,6 @@
#include <cassert>
#include <chrono>
#include <cinttypes>
#include <clocale>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <iterator>
@@ -13,6 +10,7 @@
#include <numeric>
#include <regex>
#include <sstream>
#include <stdio.h>
#include <string>
#include <vector>

@@ -74,6 +72,14 @@ static T stdev(const std::vector<T> & v) {
return stdev;
}

static bool ggml_cpu_has_metal() {
#if defined(GGML_USE_METAL)
return true;
#else
return false;
#endif
}

static std::string get_cpu_info() {
std::string id;
#ifdef __linux__
@@ -157,26 +163,26 @@ static const cmd_params cmd_params_defaults = {
|
||||
};
|
||||
|
||||
static void print_usage(int /* argc */, char ** argv) {
|
||||
printf("usage: %s [options]\n", argv[0]);
|
||||
printf("\n");
|
||||
printf("options:\n");
|
||||
printf(" -h, --help\n");
|
||||
printf(" -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
|
||||
printf(" -p, --n-prompt <n> (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
|
||||
printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
|
||||
printf(" -b, --batch-size <n> (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
|
||||
printf(" --memory-f32 <0|1> (default: %s)\n", join(cmd_params_defaults.f32_kv, ",").c_str());
|
||||
printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
|
||||
printf(" -ngl N, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
|
||||
printf(" -mg i, --main-gpu <n> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
|
||||
printf(" -lv, --low-vram <0|1> (default: %s)\n", join(cmd_params_defaults.low_vram, ",").c_str());
|
||||
printf(" -mmq, --mul-mat-q <0|1> (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str());
|
||||
printf(" -ts, --tensor_split <ts0/ts1/..> \n");
|
||||
printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
|
||||
printf(" -o, --output <csv|json|md|sql> (default: %s)\n", cmd_params_defaults.output_format == CSV ? "csv" : cmd_params_defaults.output_format == JSON ? "json" : cmd_params_defaults.output_format == MARKDOWN ? "md" : "sql");
|
||||
printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
|
||||
printf("\n");
|
||||
printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");
|
||||
fprintf(stdout, "usage: %s [options]\n", argv[0]);
|
||||
fprintf(stdout, "\n");
|
||||
fprintf(stdout, "options:\n");
|
||||
fprintf(stdout, " -h, --help\n");
|
||||
fprintf(stdout, " -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
|
||||
fprintf(stdout, " -p, --n-prompt <n> (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
|
||||
fprintf(stdout, " -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
|
||||
fprintf(stdout, " -b, --batch-size <n> (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
|
||||
fprintf(stdout, " --memory-f32 <0|1> (default: %s)\n", join(cmd_params_defaults.f32_kv, ",").c_str());
|
||||
fprintf(stdout, " -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
|
||||
fprintf(stdout, " -ngl N, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
|
||||
fprintf(stdout, " -mg i, --main-gpu <n> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
|
||||
fprintf(stdout, " -lv, --low-vram <0|1> (default: %s)\n", join(cmd_params_defaults.low_vram, ",").c_str());
|
||||
fprintf(stdout, " -mmq, --mul-mat-q <0|1> (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str());
|
||||
fprintf(stdout, " -ts, --tensor_split <ts0/ts1/..> \n");
|
||||
fprintf(stdout, " -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
|
||||
fprintf(stdout, " -o, --output <csv|json|md|sql> (default: %s)\n", cmd_params_defaults.output_format == CSV ? "csv" : cmd_params_defaults.output_format == JSON ? "json" : cmd_params_defaults.output_format == MARKDOWN ? "md" : "sql");
|
||||
fprintf(stdout, " -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
|
||||
fprintf(stdout, "\n");
|
||||
fprintf(stdout, "Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");
|
||||
|
||||
}
|
||||
|
||||
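To make the benchmark options listed in `print_usage` concrete, here is a minimal invocation sketch. The model path is a placeholder and the flag values are only illustrative; the flags themselves (`-m`, `-p`, `-n`, `-b`, `-t`, `-r`, `-o`) and the comma-separated multi-value syntax come from the usage text above.

```bash
# Hypothetical run: prompt 512 tokens, generate 128, two batch sizes, markdown output.
./llama-bench -m ./models/7B/ggml-model-q4_0.gguf \
    -p 512 -n 128 \
    -b 256,512 \
    -t 8 \
    -r 3 \
    -o md
```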
@@ -910,9 +916,6 @@ static void llama_null_log_callback(enum llama_log_level level, const char * tex
}

int main(int argc, char ** argv) {
// try to set locale for unicode characters in markdown
setlocale(LC_CTYPE, ".UTF-8");

#if !defined(NDEBUG)
fprintf(stderr, "warning: asserts enabled, performance may be affected\n");
#endif
@@ -978,12 +981,7 @@ int main(int argc, char ** argv) {
test t(inst, lmodel, ctx);

// warmup run
if (t.n_prompt > 0) {
test_prompt(ctx, std::min(2, t.n_batch), 0, t.n_batch, t.n_threads);
}
if (t.n_gen > 0) {
test_gen(ctx, 1, 0, t.n_threads);
}
test_gen(ctx, 1, 0, t.n_threads);

for (int i = 0; i < params.reps; i++) {
uint64_t t_start = get_time_ns();

@@ -8,7 +8,7 @@ function! Llm()
let buffer_content = join(getline(1, '$'), "\n")

" Create the JSON payload
let json_payload = {"temp":0.72,"top_k":100,"top_p":0.73,"repeat_penalty":1.100000023841858,"n_predict":256,"stop": ["\n\n\n"],"stream": v:false}
let json_payload = {"temp":0.72,"top_k":100,"top_p":0.73,"repeat_penalty":1.100000023841858,"n_predict":10,"stream": v:false}
let json_payload.prompt = buffer_content

" Define the curl command
@@ -25,4 +25,3 @@ function! Llm()
endfunction

command! Llm call Llm()
noremap <F2> :Llm<CR>

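For reference, the payload the `Llm()` function builds can also be exercised outside Vim against a running `./server` instance. This is only a sketch: the host, port, and `/completion` route are assumptions not taken from the diff above and may differ between server versions.

```bash
# Hypothetical stand-alone test of the same JSON payload the plugin sends.
# 127.0.0.1:8080 and the /completion endpoint are assumptions, not confirmed by this diff.
curl --silent http://127.0.0.1:8080/completion \
    --header "Content-Type: application/json" \
    --data '{"prompt": "Hello", "temp": 0.72, "top_k": 100, "top_p": 0.73, "repeat_penalty": 1.1, "n_predict": 10, "stream": false}'
```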
examples/main-cmake-pkg/.gitignore (vendored; 51 lines)
@@ -1,51 +0,0 @@
# Prerequisites
*.d

# Compiled Object files
*.slo
*.lo
*.o
*.obj

# Precompiled Headers
*.gch
*.pch

# Compiled Dynamic libraries
*.so
*.dylib
*.dll

# Fortran module files
*.mod
*.smod

# Compiled Static libraries
*.lai
*.la
*.a
*.lib

# Executables
*.exe
*.out
*.app

*.gguf

*.log
.DS_Store
.build/
.cache/
.direnv/
.envrc
.swiftpm
.venv
.clang-tidy
.vs/
.vscode/

build*/
out/
tmp/

@@ -1,36 +0,0 @@
cmake_minimum_required(VERSION 3.12)
project("main-cmake-pkg" C CXX)
set(TARGET main-cmake-pkg)

find_package(Llama 0.0.1 REQUIRED)

# Bake common functionality in with target. Because applications
# using the relocatable Llama package should be outside of the
# source tree, main-cmake-pkg pretends the dependencies are built-in.

set(_common_path "${CMAKE_CURRENT_LIST_DIR}/../../common")
add_library(common OBJECT
${_common_path}/common.h
${_common_path}/common.cpp
${_common_path}/console.h
${_common_path}/console.cpp
${_common_path}/grammar-parser.h
${_common_path}/grammar-parser.cpp
)

# WARNING: because build-info.h is auto-generated, it will only
# be available after the user has built the llama.cpp sources.
#
configure_file(${_common_path}/../build-info.h
${CMAKE_CURRENT_BINARY_DIR}/build-info.h
COPYONLY)

target_include_directories(common PUBLIC ${LLAMA_INCLUDE_DIR}
${CMAKE_CURRENT_BINARY_DIR})

add_executable(${TARGET} ${CMAKE_CURRENT_LIST_DIR}/../main/main.cpp)
target_include_directories(${TARGET} PRIVATE ${_common_path})
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

@@ -1,37 +0,0 @@
# llama.cpp/example/main-cmake-pkg

This program builds the [main](../main) application using a relocatable CMake package. It serves as an example of using the `find_package()` CMake command to conveniently include [llama.cpp](https://github.com/ggerganov/llama.cpp) in projects which live outside of the source tree.

## Building

Because this example is "outside of the source tree", it is important to first build/install llama.cpp using CMake. An example is provided here, but please see the [llama.cpp build instructions](../..) for more detailed build instructions.

### Considerations

When hardware acceleration libraries are used (e.g. CUBlas, Metal, CLBlast, etc.), CMake must be able to locate the associated CMake package. In the example below, when building _main-cmake-pkg_ notice the `CMAKE_PREFIX_PATH` includes the Llama CMake package location _in addition to_ the CLBlast package—which was used when compiling _llama.cpp_.

### Build llama.cpp and install to C:\LlamaCPP directory

In this case, CLBlast was already installed so the CMake package is referenced in `CMAKE_PREFIX_PATH`.

```cmd
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
mkdir build
cd build
cmake .. -DBUILD_SHARED_LIBS=OFF -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH=C:/CLBlast/lib/cmake/CLBlast -G "Visual Studio 17 2022" -A x64
cmake --build . --config Release
cmake --install . --prefix C:/LlamaCPP
```

### Build main-cmake-pkg


```cmd
cd ..\examples\main-cmake-pkg
mkdir build
cd build
cmake .. -DBUILD_SHARED_LIBS=OFF -DCMAKE_PREFIX_PATH="C:/CLBlast/lib/cmake/CLBlast;C:/LlamaCPP/lib/cmake/Llama" -G "Visual Studio 17 2022" -A x64
cmake --build . --config Release
cmake --install . --prefix C:/MyLlamaApp
```
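The removed README shows the flow for Windows with Visual Studio. A rough Unix equivalent is sketched below; the install prefixes and the CLBlast package location are placeholders chosen purely for illustration, while the CMake flags themselves are the ones used in the README.

```bash
# Hypothetical Linux/macOS flow mirroring the Windows steps above.
# Install prefixes and the CLBlast CMake path are placeholders.
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
mkdir build && cd build
cmake .. -DBUILD_SHARED_LIBS=OFF -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH=/usr/lib/cmake/CLBlast
cmake --build . --config Release
cmake --install . --prefix "$HOME/llama-install"

cd ../examples/main-cmake-pkg
mkdir build && cd build
cmake .. -DBUILD_SHARED_LIBS=OFF -DCMAKE_PREFIX_PATH="/usr/lib/cmake/CLBlast;$HOME/llama-install/lib/cmake/Llama"
cmake --build . --config Release
cmake --install . --prefix "$HOME/my-llama-app"
```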
@@ -34,7 +34,7 @@ For an interactive experience, try this command:

#### Unix-based systems (Linux, macOS, etc.):

```bash
./main -m models/7B/ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -i -p \
./main -m models/7B/ggml-model.bin -n -1 --color -r "User:" --in-prefix " " \
'User: Hi
AI: Hello. I am an AI chatbot. Would you like to talk?
User: Sure!
@@ -45,7 +45,7 @@ User:'

#### Windows:

```powershell
main.exe -m models\7B\ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -i -e -p "User: Hi\nAI: Hello. I am an AI chatbot. Would you like to talk?\nUser: Sure!\nAI: What would you like to talk about?\nUser:"
main.exe -m models\7B\ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -e --prompt "User: Hi\nAI: Hello. I am an AI chatbot. Would you like to talk?\nUser: Sure!\nAI: What would you like to talk about?\nUser:"
```

The following command generates "infinite" text from a starting prompt (you can use `Ctrl-C` to stop it):
@@ -144,7 +144,7 @@ The `--ctx-size` option allows you to set the size of the prompt context used by

Some fine-tuned models have extened the context length by scaling RoPE. For example, if the original pretrained model have a context length (max sequence length) of 4096 (4k) and the fine-tuned model have 32k. That is a scaling factor of 8, and should work by setting the above `--ctx-size` to 32768 (32k) and `--rope-scale` to 8.

- `--rope-scale N`: Where N is the linear scaling factor used by the fine-tuned model.
- `--rope-scale N`: Where N is the linear scaling factor used by the fine-tuned model.

### Keep Prompt

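As a concrete illustration of the arithmetic in the hunk above (a 32k fine-tuned context over a 4k base gives a factor of 32768 / 4096 = 8), an invocation might look like the sketch below. The model file name and prompt are placeholders; `--ctx-size` and `--rope-scale` are the flags described in the README text.

```bash
# Hypothetical run of a 32k-context fine-tune of a 4k base model: 32768 / 4096 = 8.
./main -m ./models/32k-finetune.Q4_K_M.gguf --ctx-size 32768 --rope-scale 8 -p "Your prompt here"
```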
@@ -274,7 +274,7 @@ These options help improve the performance and memory usage of the LLaMA models.

### NUMA support

- `--numa`: Attempt optimizations that help on some systems with non-uniform memory access. This currently consists of pinning an equal proportion of the threads to the cores on each NUMA node, and disabling prefetch and readahead for mmap. The latter causes mapped pages to be faulted in on first access instead of all at once, and in combination with pinning threads to NUMA nodes, more of the pages end up on the NUMA node where they are used. Note that if the model is already in the system page cache, for example because of a previous run without this option, this will have little effect unless you drop the page cache first. This can be done by rebooting the system or on Linux by writing '3' to '/proc/sys/vm/drop_caches' as root.
- `--numa`: Attempt optimizations that help on some systems with non-uniform memory access. This currently consists of pinning an equal proportion of the threads to the cores on each NUMA node, and disabling prefetch and readahead for mmap. The latter causes mapped pages to be faulted in on first access instead of all at once, and in combination with pinning threads to NUMA nodes, more of the pages end up on the NUMA node where they are used. Note that if the model is already in the system page cache, for example because of a previous run without this option, this will have little effect unless you drop the page cache first. This can be done by rebooting the system or on Linux by writing '3' to '/proc/sys/vm/drop\_caches' as root.

### Memory Float 32

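A minimal sketch of the workflow implied by the `--numa` note above: drop the page cache (on Linux, as root) so mapped pages fault in on the NUMA node that uses them, then run with `--numa`. The model path and prompt are placeholders; adding `sync` before dropping caches is a common precaution, not something stated in the README.

```bash
# Hypothetical: clear the Linux page cache, then run with NUMA-aware thread pinning.
sync
echo 3 | sudo tee /proc/sys/vm/drop_caches
./main -m ./models/7B/ggml-model-q4_0.gguf --numa -p "Your prompt here"
```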
@@ -302,6 +302,7 @@ These options provide extra functionality and customization when running the LLa

- `-h, --help`: Display a help message showing all available options and their default values. This is particularly useful for checking the latest options and default values, as they can change frequently, and the information in this document may become outdated.
- `--verbose-prompt`: Print the prompt before generating text.
- `--mtest`: Test the model's functionality by running a series of tests to ensure it's working properly.
- `-ngl N, --n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.

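To ground the multi-GPU flags just listed, here is a sketch of a two-GPU run splitting tensors 60/40 and keeping small tensors on GPU 0, per the `-ts` and `-mg` descriptions above. The model path, layer count, and prompt are placeholders, and the command assumes a cuBLAS build as the bullets require.

```bash
# Hypothetical two-GPU run (requires a cuBLAS build): 60% of data on GPU 0, 40% on GPU 1.
./main -m ./models/7B/ggml-model-q4_0.gguf -ngl 35 -ts 3,2 -mg 0 -p "Your prompt here"
```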
@@ -1,5 +1,9 @@
|
||||
#include "common.h"
|
||||
// Defines sigaction on msys:
|
||||
#ifndef _GNU_SOURCE
|
||||
#define _GNU_SOURCE
|
||||
#endif
|
||||
|
||||
#include "common.h"
|
||||
#include "console.h"
|
||||
#include "llama.h"
|
||||
#include "build-info.h"
|
||||
@@ -13,7 +17,6 @@
|
||||
#include <ctime>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
@@ -33,61 +36,11 @@
|
||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
#endif
|
||||
|
||||
static llama_context ** g_ctx;
|
||||
static llama_model ** g_model;
|
||||
static gpt_params * g_params;
|
||||
static std::vector<llama_token> * g_input_tokens;
|
||||
static std::ostringstream * g_output_ss;
|
||||
static std::vector<llama_token> * g_output_tokens;
|
||||
static llama_context ** g_ctx;
|
||||
static bool is_interacting = false;
|
||||
|
||||
|
||||
static void write_logfile(
|
||||
const llama_context * ctx, const gpt_params & params, const llama_model * model,
|
||||
const std::vector<llama_token> & input_tokens, const std::string & output,
|
||||
const std::vector<llama_token> & output_tokens
|
||||
) {
|
||||
if (params.logdir.empty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
const std::string timestamp = get_sortable_timestamp();
|
||||
|
||||
const bool success = create_directory_with_parents(params.logdir);
|
||||
if (!success) {
|
||||
fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
|
||||
__func__, params.logdir.c_str());
|
||||
return;
|
||||
}
|
||||
|
||||
const std::string logfile_path = params.logdir + timestamp + ".yml";
|
||||
FILE * logfile = fopen(logfile_path.c_str(), "w");
|
||||
|
||||
if (logfile == NULL) {
|
||||
fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
|
||||
return;
|
||||
}
|
||||
|
||||
fprintf(logfile, "binary: main\n");
|
||||
char model_desc[128];
|
||||
llama_model_desc(model, model_desc, sizeof(model_desc));
|
||||
dump_non_result_info_yaml(logfile, params, ctx, timestamp, input_tokens, model_desc);
|
||||
|
||||
fprintf(logfile, "\n");
|
||||
fprintf(logfile, "######################\n");
|
||||
fprintf(logfile, "# Generation Results #\n");
|
||||
fprintf(logfile, "######################\n");
|
||||
fprintf(logfile, "\n");
|
||||
|
||||
dump_string_yaml_multiline(logfile, "output", output.c_str());
|
||||
dump_vector_int_yaml(logfile, "output_tokens", output_tokens);
|
||||
|
||||
llama_dump_timing_info_yaml(logfile, ctx);
|
||||
fclose(logfile);
|
||||
}
|
||||
|
||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
|
||||
static void sigint_handler(int signo) {
|
||||
void sigint_handler(int signo) {
|
||||
if (signo == SIGINT) {
|
||||
if (!is_interacting) {
|
||||
is_interacting = true;
|
||||
@@ -95,7 +48,6 @@ static void sigint_handler(int signo) {
|
||||
console::cleanup();
|
||||
printf("\n");
|
||||
llama_print_timings(*g_ctx);
|
||||
write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
|
||||
_exit(130);
|
||||
}
|
||||
}
|
||||
@@ -104,21 +56,11 @@ static void sigint_handler(int signo) {
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
gpt_params params;
|
||||
g_params = ¶ms;
|
||||
|
||||
if (!gpt_params_parse(argc, argv, params)) {
|
||||
if (gpt_params_parse(argc, argv, params) == false) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
#ifndef LOG_DISABLE_LOGS
|
||||
log_set_target(log_filename_generator("main", "log"));
|
||||
LOG_TEE("Log start\n");
|
||||
log_dump_cmdline(argc, argv);
|
||||
#endif // LOG_DISABLE_LOGS
|
||||
|
||||
// TODO: Dump params ?
|
||||
//LOG("Params perplexity: %s\n", LOG_TOSTR(params.perplexity));
|
||||
|
||||
// save choice to use color for later
|
||||
// (note for later: this is a slightly awkward choice)
|
||||
console::init(params.simple_io, params.use_color);
|
||||
@@ -141,38 +83,42 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
if (params.rope_freq_base != 10000.0) {
|
||||
LOG_TEE("%s: warning: changing RoPE frequency base to %g (default 10000.0)\n", __func__, params.rope_freq_base);
|
||||
fprintf(stderr, "%s: warning: changing RoPE frequency base to %g (default 10000.0)\n", __func__, params.rope_freq_base);
|
||||
}
|
||||
|
||||
if (params.rope_freq_scale != 1.0) {
|
||||
LOG_TEE("%s: warning: scaling RoPE frequency by %g (default 1.0)\n", __func__, params.rope_freq_scale);
|
||||
fprintf(stderr, "%s: warning: scaling RoPE frequency by %g (default 1.0)\n", __func__, params.rope_freq_scale);
|
||||
}
|
||||
|
||||
LOG_TEE("%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
|
||||
LOG_TEE("%s: built with %s for %s\n", __func__, BUILD_COMPILER, BUILD_TARGET);
|
||||
if (params.n_ctx > 2048) {
|
||||
// TODO: determine the actual max context of the model (e.g. 4096 for LLaMA v2) and use that instead of 2048
|
||||
fprintf(stderr, "%s: warning: base model only supports context sizes no greater than 2048 tokens (%d specified)\n", __func__, params.n_ctx);
|
||||
} else if (params.n_ctx < 8) {
|
||||
fprintf(stderr, "%s: warning: minimum context size is 8, using minimum size.\n", __func__);
|
||||
params.n_ctx = 8;
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
|
||||
|
||||
if (params.seed == LLAMA_DEFAULT_SEED) {
|
||||
params.seed = time(NULL);
|
||||
}
|
||||
|
||||
LOG_TEE("%s: seed = %u\n", __func__, params.seed);
|
||||
fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
|
||||
|
||||
std::mt19937 rng(params.seed);
|
||||
if (params.random_prompt) {
|
||||
params.prompt = gpt_random_prompt(rng);
|
||||
}
|
||||
|
||||
LOG("%s: llama backend init\n", __func__);
|
||||
llama_backend_init(params.numa);
|
||||
|
||||
llama_model * model;
|
||||
llama_context * ctx;
|
||||
llama_context * ctx_guidance = NULL;
|
||||
g_model = &model;
|
||||
g_ctx = &ctx;
|
||||
|
||||
// load the model and apply lora adapter, if any
|
||||
LOG("%s: load the model and apply lora adapter, if any\n", __func__);
|
||||
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
||||
if (params.cfg_scale > 1.f) {
|
||||
struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
|
||||
@@ -180,26 +126,34 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
if (model == NULL) {
|
||||
LOG_TEE("%s: error: unable to load model\n", __func__);
|
||||
fprintf(stderr, "%s: error: unable to load model\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
const int n_ctx_train = llama_n_ctx_train(ctx);
|
||||
if (params.n_ctx > n_ctx_train) {
|
||||
LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n",
|
||||
__func__, n_ctx_train, params.n_ctx);
|
||||
} else if (params.n_ctx < 8) {
|
||||
LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
|
||||
params.n_ctx = 8;
|
||||
}
|
||||
|
||||
// print system information
|
||||
{
|
||||
LOG_TEE("\n");
|
||||
LOG_TEE("system_info: n_threads = %d / %d | %s\n",
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
|
||||
params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
|
||||
}
|
||||
|
||||
// determine the maximum memory usage needed to do inference for the given n_batch and n_ctx parameters
|
||||
// uncomment the "used_mem" line in llama.cpp to see the results
|
||||
if (params.mem_test) {
|
||||
{
|
||||
fprintf(stderr, "%s: testing memory usage for n_batch = %d, n_ctx = %d\n", __func__, params.n_batch, params.n_ctx);
|
||||
|
||||
const std::vector<llama_token> tmp(params.n_batch, llama_token_bos(ctx));
|
||||
llama_eval(ctx, tmp.data(), tmp.size(), params.n_ctx, params.n_threads);
|
||||
}
|
||||
|
||||
llama_print_timings(ctx);
|
||||
llama_free(ctx);
|
||||
llama_free_model(model);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// export the cgraph and exit
|
||||
if (params.export_cgraph) {
|
||||
llama_eval_export(ctx, "llama.ggml");
|
||||
@@ -213,7 +167,7 @@ int main(int argc, char ** argv) {
|
||||
std::vector<llama_token> session_tokens;
|
||||
|
||||
if (!path_session.empty()) {
|
||||
LOG_TEE("%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str());
|
||||
fprintf(stderr, "%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str());
|
||||
|
||||
// fopen to check for existing session
|
||||
FILE * fp = std::fopen(path_session.c_str(), "rb");
|
||||
@@ -223,38 +177,33 @@ int main(int argc, char ** argv) {
|
||||
session_tokens.resize(params.n_ctx);
|
||||
size_t n_token_count_out = 0;
|
||||
if (!llama_load_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
|
||||
LOG_TEE("%s: error: failed to load session file '%s'\n", __func__, path_session.c_str());
|
||||
fprintf(stderr, "%s: error: failed to load session file '%s'\n", __func__, path_session.c_str());
|
||||
return 1;
|
||||
}
|
||||
session_tokens.resize(n_token_count_out);
|
||||
llama_set_rng_seed(ctx, params.seed);
|
||||
|
||||
LOG_TEE("%s: loaded a session with prompt size of %d tokens\n", __func__, (int) session_tokens.size());
|
||||
fprintf(stderr, "%s: loaded a session with prompt size of %d tokens\n", __func__, (int) session_tokens.size());
|
||||
} else {
|
||||
LOG_TEE("%s: session file does not exist, will create\n", __func__);
|
||||
fprintf(stderr, "%s: session file does not exist, will create\n", __func__);
|
||||
}
|
||||
}
|
||||
|
||||
// Add BOS if SPM tokenizer
|
||||
const bool add_bos = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
|
||||
LOG("add_bos: %d\n", add_bos);
|
||||
|
||||
// tokenize the prompt
|
||||
std::vector<llama_token> embd_inp;
|
||||
|
||||
if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
|
||||
LOG("tokenize the prompt\n");
|
||||
embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
|
||||
} else {
|
||||
LOG("use session tokens\n");
|
||||
embd_inp = session_tokens;
|
||||
}
|
||||
|
||||
LOG("prompt: \"%s\"\n", log_tostr(params.prompt));
|
||||
LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp));
|
||||
|
||||
// Should not run without any tokens
|
||||
if (embd_inp.empty()) {
|
||||
embd_inp.push_back(llama_token_bos(ctx));
|
||||
LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp));
|
||||
}
|
||||
|
||||
// Tokenize negative prompt
|
||||
@@ -262,31 +211,23 @@ int main(int argc, char ** argv) {
|
||||
int guidance_offset = 0;
|
||||
int original_prompt_len = 0;
|
||||
if (ctx_guidance) {
|
||||
LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(params.cfg_negative_prompt));
|
||||
|
||||
guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, add_bos);
|
||||
LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp));
|
||||
|
||||
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
|
||||
LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp));
|
||||
|
||||
original_prompt_len = original_inp.size();
|
||||
guidance_offset = (int)guidance_inp.size() - original_prompt_len;
|
||||
LOG("original_prompt_len: %s", log_tostr(original_prompt_len));
|
||||
LOG("guidance_offset: %s", log_tostr(guidance_offset));
|
||||
}
|
||||
|
||||
const int n_ctx = llama_n_ctx(ctx);
|
||||
LOG("n_ctx: %d\n", n_ctx);
|
||||
|
||||
if ((int) embd_inp.size() > n_ctx - 4) {
|
||||
LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
|
||||
fprintf(stderr, "%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// debug message about similarity of saved session, if applicable
|
||||
size_t n_matching_session_tokens = 0;
|
||||
if (!session_tokens.empty()) {
|
||||
if (session_tokens.size()) {
|
||||
for (llama_token id : session_tokens) {
|
||||
if (n_matching_session_tokens >= embd_inp.size() || id != embd_inp[n_matching_session_tokens]) {
|
||||
break;
|
||||
@@ -294,27 +235,22 @@ int main(int argc, char ** argv) {
|
||||
n_matching_session_tokens++;
|
||||
}
|
||||
if (params.prompt.empty() && n_matching_session_tokens == embd_inp.size()) {
|
||||
LOG_TEE("%s: using full prompt from session file\n", __func__);
|
||||
fprintf(stderr, "%s: using full prompt from session file\n", __func__);
|
||||
} else if (n_matching_session_tokens >= embd_inp.size()) {
|
||||
LOG_TEE("%s: session file has exact match for prompt!\n", __func__);
|
||||
fprintf(stderr, "%s: session file has exact match for prompt!\n", __func__);
|
||||
} else if (n_matching_session_tokens < (embd_inp.size() / 2)) {
|
||||
LOG_TEE("%s: warning: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n",
|
||||
fprintf(stderr, "%s: warning: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n",
|
||||
__func__, n_matching_session_tokens, embd_inp.size());
|
||||
} else {
|
||||
LOG_TEE("%s: session file matches %zu / %zu tokens of prompt\n",
|
||||
fprintf(stderr, "%s: session file matches %zu / %zu tokens of prompt\n",
|
||||
__func__, n_matching_session_tokens, embd_inp.size());
|
||||
}
|
||||
}
|
||||
|
||||
LOGLN(
|
||||
"recalculate the cached logits (check): embd_inp.empty() %s, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu, embd_inp.size() %zu",
|
||||
log_tostr(embd_inp.empty()), n_matching_session_tokens, embd_inp.size(), session_tokens.size(), embd_inp.size());
|
||||
|
||||
// if we will use the cache for the full prompt without reaching the end of the cache, force
|
||||
// reevaluation of the last token token to recalculate the cached logits
|
||||
if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() && session_tokens.size() > embd_inp.size()) {
|
||||
LOGLN("recalculate the cached logits (do): session_tokens.resize( %zu )", embd_inp.size() - 1);
|
||||
|
||||
if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() &&
|
||||
session_tokens.size() > embd_inp.size()) {
|
||||
session_tokens.resize(embd_inp.size() - 1);
|
||||
}
|
||||
|
||||
@@ -327,9 +263,6 @@ int main(int argc, char ** argv) {
|
||||
const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", add_bos);
|
||||
const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false);
|
||||
|
||||
LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx));
|
||||
LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx));
|
||||
|
||||
// in instruct mode, we inject a prefix and a suffix to each input by the user
|
||||
if (params.instruct) {
|
||||
params.interactive_first = true;
|
||||
@@ -342,30 +275,30 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
if (params.verbose_prompt) {
|
||||
LOG_TEE("\n");
|
||||
LOG_TEE("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
|
||||
LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
|
||||
fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
|
||||
for (int i = 0; i < (int) embd_inp.size(); i++) {
|
||||
LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
|
||||
fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
|
||||
}
|
||||
|
||||
if (ctx_guidance) {
|
||||
LOG_TEE("\n");
|
||||
LOG_TEE("%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str());
|
||||
LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str());
|
||||
fprintf(stderr, "%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
|
||||
for (int i = 0; i < (int) guidance_inp.size(); i++) {
|
||||
LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
|
||||
fprintf(stderr, "%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
|
||||
}
|
||||
}
|
||||
|
||||
if (params.n_keep > 0) {
|
||||
LOG_TEE("%s: static prompt based on n_keep: '", __func__);
|
||||
fprintf(stderr, "%s: static prompt based on n_keep: '", __func__);
|
||||
for (int i = 0; i < params.n_keep; i++) {
|
||||
LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
|
||||
fprintf(stderr, "%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
|
||||
}
|
||||
LOG_TEE("'\n");
|
||||
fprintf(stderr, "'\n");
|
||||
}
|
||||
LOG_TEE("\n");
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
||||
if (params.interactive) {
|
||||
@@ -382,48 +315,47 @@ int main(int argc, char ** argv) {
|
||||
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
|
||||
#endif
|
||||
|
||||
LOG_TEE("%s: interactive mode on.\n", __func__);
|
||||
fprintf(stderr, "%s: interactive mode on.\n", __func__);
|
||||
|
||||
if (!params.antiprompt.empty()) {
|
||||
for (const auto & antiprompt : params.antiprompt) {
|
||||
LOG_TEE("Reverse prompt: '%s'\n", antiprompt.c_str());
|
||||
if (params.antiprompt.size()) {
|
||||
for (auto antiprompt : params.antiprompt) {
|
||||
fprintf(stderr, "Reverse prompt: '%s'\n", antiprompt.c_str());
|
||||
}
|
||||
}
|
||||
|
||||
if (params.input_prefix_bos) {
|
||||
LOG_TEE("Input prefix with BOS\n");
|
||||
fprintf(stderr, "Input prefix with BOS\n");
|
||||
}
|
||||
|
||||
if (!params.input_prefix.empty()) {
|
||||
LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str());
|
||||
fprintf(stderr, "Input prefix: '%s'\n", params.input_prefix.c_str());
|
||||
}
|
||||
|
||||
if (!params.input_suffix.empty()) {
|
||||
LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
|
||||
fprintf(stderr, "Input suffix: '%s'\n", params.input_suffix.c_str());
|
||||
}
|
||||
}
|
||||
LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n",
|
||||
fprintf(stderr, "sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n",
|
||||
params.repeat_last_n, params.repeat_penalty, params.presence_penalty, params.frequency_penalty, params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp, params.mirostat, params.mirostat_eta, params.mirostat_tau);
|
||||
LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
|
||||
LOG_TEE("\n\n");
|
||||
fprintf(stderr, "generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
|
||||
fprintf(stderr, "\n\n");
|
||||
|
||||
struct llama_grammar * grammar = NULL;
|
||||
grammar_parser::parse_state parsed_grammar;
|
||||
|
||||
llama_grammar * grammar = NULL;
|
||||
if (!params.grammar.empty()) {
|
||||
parsed_grammar = grammar_parser::parse(params.grammar.c_str());
|
||||
// will be empty (default) if there are parse errors
|
||||
if (parsed_grammar.rules.empty()) {
|
||||
return 1;
|
||||
}
|
||||
LOG_TEE("%s: grammar:\n", __func__);
|
||||
fprintf(stderr, "%s: grammar:\n", __func__);
|
||||
grammar_parser::print_grammar(stderr, parsed_grammar);
|
||||
LOG_TEE("\n");
|
||||
fprintf(stderr, "\n");
|
||||
|
||||
{
|
||||
auto it = params.logit_bias.find(llama_token_eos(ctx));
|
||||
if (it != params.logit_bias.end() && it->second == -INFINITY) {
|
||||
LOG_TEE("%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
|
||||
fprintf(stderr, "%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -433,8 +365,8 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
// TODO: replace with ring-buffer
|
||||
std::vector<llama_token> last_tokens(n_ctx);
|
||||
std::fill(last_tokens.begin(), last_tokens.end(), 0);
|
||||
std::vector<llama_token> last_n_tokens(n_ctx);
|
||||
std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
|
||||
|
||||
if (params.interactive) {
|
||||
const char *control_message;
|
||||
@@ -446,11 +378,11 @@ int main(int argc, char ** argv) {
|
||||
" - To return control without starting a new line, end your input with '/'.\n"
|
||||
" - If you want to submit another line, end your input with '\\'.\n";
|
||||
}
|
||||
LOG_TEE("== Running in interactive mode. ==\n");
|
||||
fprintf(stderr, "== Running in interactive mode. ==\n"
|
||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
|
||||
LOG_TEE( " - Press Ctrl+C to interject at any time.\n");
|
||||
" - Press Ctrl+C to interject at any time.\n"
|
||||
#endif
|
||||
LOG_TEE( "%s\n", control_message);
|
||||
"%s\n", control_message);
|
||||
|
||||
is_interacting = params.interactive_first;
|
||||
}
|
||||
@@ -465,37 +397,33 @@ int main(int argc, char ** argv) {
|
||||
int n_session_consumed = 0;
|
||||
int n_past_guidance = 0;
|
||||
|
||||
std::vector<int> input_tokens; g_input_tokens = &input_tokens;
|
||||
std::vector<int> output_tokens; g_output_tokens = &output_tokens;
|
||||
std::ostringstream output_ss; g_output_ss = &output_ss;
|
||||
|
||||
// the first thing we will do is to output the prompt, so set color accordingly
|
||||
console::set_display(console::prompt);
|
||||
|
||||
std::vector<llama_token> embd;
|
||||
std::vector<llama_token> embd_guidance;
|
||||
|
||||
const int n_vocab = llama_n_vocab(ctx);
|
||||
|
||||
std::vector<llama_token_data> candidates;
|
||||
candidates.reserve(n_vocab);
|
||||
// do one empty run to warm up the model
|
||||
{
|
||||
const std::vector<llama_token> tmp = { llama_token_bos(ctx), };
|
||||
llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads);
|
||||
llama_reset_timings(ctx);
|
||||
}
|
||||
|
||||
while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
|
||||
// predict
|
||||
if (!embd.empty()) {
|
||||
if (embd.size() > 0) {
|
||||
// Note: n_ctx - 4 here is to match the logic for commandline prompt handling via
|
||||
// --prompt or --file which uses the same value.
|
||||
int max_embd_size = n_ctx - 4;
|
||||
|
||||
auto max_embd_size = n_ctx - 4;
|
||||
// Ensure the input doesn't exceed the context size by truncating embd if necessary.
|
||||
if ((int) embd.size() > max_embd_size) {
|
||||
const int skipped_tokens = (int) embd.size() - max_embd_size;
|
||||
embd.resize(max_embd_size);
|
||||
|
||||
if ((int)embd.size() > max_embd_size) {
|
||||
auto skipped_tokens = embd.size() - max_embd_size;
|
||||
console::set_display(console::error);
|
||||
printf("<<input too long: skipped %d token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
|
||||
printf("<<input too long: skipped %zu token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
|
||||
console::set_display(console::reset);
|
||||
fflush(stdout);
|
||||
embd.resize(max_embd_size);
|
||||
}
|
||||
|
||||
// infinite text generation via context swapping
@@ -504,26 +432,28 @@ int main(int argc, char ** argv) {
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) > n_ctx) {
if (params.n_predict == -2) {
LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
fprintf(stderr, "\n\n%s: context full, stopping generation\n", __func__);
break;
}

const int n_left = n_past - params.n_keep;
LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d\n", n_past, n_left, n_ctx, params.n_keep);

// always keep the first token - BOS
n_past = std::max(1, params.n_keep);
n_past = std::max(1, params.n_keep);
n_past_guidance = std::max(1, params.n_keep + guidance_offset);

LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance);
// insert n_left/2 tokens at the start of embd from last_n_tokens
embd.insert(embd.begin(), last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_n_tokens.end() - embd.size());

// insert n_left/2 tokens at the start of embd from last_tokens
embd.insert(embd.begin(), last_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_tokens.end() - embd.size());

LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd));

LOG("clear session path\n");
// stop saving session if we run out of context
path_session.clear();

//printf("\n---\n");
//printf("resetting: '");
//for (int i = 0; i < (int) embd.size(); i++) {
// printf("%s", llama_token_to_piece(ctx, embd[i]));
//}
//printf("'\n");
//printf("\n---\n");
}

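Editor's note (not part of the diff): the context-swap arithmetic above can be exercised in isolation. A minimal standalone sketch with made-up values, assuming only the standard library:

#include <algorithm>
#include <cassert>
#include <vector>

// Illustrates the "infinite generation via context swapping" step shown in the hunk above:
// keep the first n_keep tokens, then re-insert the last n_left/2 tokens so that generation
// can continue once the context window is full.
int main() {
    const int n_ctx  = 8;
    const int n_keep = 2;
    std::vector<int> last_tokens = {1, 2, 3, 4, 5, 6, 7, 8}; // the last n_ctx tokens seen
    std::vector<int> embd        = {9};                      // tokens queued for evaluation

    int n_past = n_ctx;                    // context is full
    const int n_left = n_past - n_keep;    // tokens eligible to be recomputed
    n_past = std::max(1, n_keep);          // always keep at least the first (BOS) token

    // re-insert the last n_left/2 tokens, excluding what is already queued in embd
    embd.insert(embd.begin(),
                last_tokens.begin() + n_ctx - n_left/2 - embd.size(),
                last_tokens.end()   - embd.size());

    assert((int) embd.size() == n_left/2 + 1); // embd now holds {5, 6, 7, 9}
    return 0;
}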
// try to reuse a matching prefix from the loaded session instead of re-eval (via n_past)
|
||||
@@ -553,7 +483,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
if (ctx_guidance) {
|
||||
int input_size = 0;
|
||||
llama_token * input_buf = NULL;
|
||||
llama_token* input_buf = NULL;
|
||||
|
||||
if (n_past_guidance < (int) guidance_inp.size()) {
|
||||
// Guidance context should have the same data with these modifications:
|
||||
@@ -569,19 +499,22 @@ int main(int argc, char ** argv) {
|
||||
);
|
||||
}
|
||||
|
||||
input_buf = embd_guidance.data();
|
||||
input_buf = embd_guidance.data();
|
||||
input_size = embd_guidance.size();
|
||||
|
||||
LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance));
|
||||
//fprintf(stderr, "\n---------------------\n");
|
||||
//for (int i = 0; i < (int) embd_guidance.size(); i++) {
|
||||
//fprintf(stderr, "%s", llama_token_to_piece(ctx, embd_guidance[i]));
|
||||
//}
|
||||
//fprintf(stderr, "\n---------------------\n");
|
||||
} else {
|
||||
input_buf = embd.data();
|
||||
input_buf = embd.data();
|
||||
input_size = embd.size();
|
||||
}
|
||||
|
||||
for (int i = 0; i < input_size; i += params.n_batch) {
|
||||
int n_eval = std::min(input_size - i, params.n_batch);
|
||||
if (llama_eval(ctx_guidance, input_buf + i, n_eval, n_past_guidance, params.n_threads)) {
|
||||
LOG_TEE("%s : failed to eval\n", __func__);
|
||||
fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -594,20 +527,14 @@ int main(int argc, char ** argv) {
|
||||
if (n_eval > params.n_batch) {
|
||||
n_eval = params.n_batch;
|
||||
}
|
||||
|
||||
LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd));
|
||||
|
||||
if (llama_eval(ctx, &embd[i], n_eval, n_past, params.n_threads)) {
|
||||
LOG_TEE("%s : failed to eval\n", __func__);
|
||||
fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
n_past += n_eval;
|
||||
|
||||
LOG("n_past = %d\n", n_past);
|
||||
}
|
||||
|
||||
if (!embd.empty() && !path_session.empty()) {
|
||||
if (embd.size() > 0 && !path_session.empty()) {
|
||||
session_tokens.insert(session_tokens.end(), embd.begin(), embd.end());
|
||||
n_session_consumed = session_tokens.size();
|
||||
}
|
||||
@@ -617,21 +544,106 @@ int main(int argc, char ** argv) {
|
||||
embd_guidance.clear();
|
||||
|
||||
if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
|
||||
// out of user input, sample next token
|
||||
const float temp = params.temp;
|
||||
const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k;
|
||||
const float top_p = params.top_p;
|
||||
const float tfs_z = params.tfs_z;
|
||||
const float typical_p = params.typical_p;
|
||||
const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
|
||||
const float repeat_penalty = params.repeat_penalty;
|
||||
const float alpha_presence = params.presence_penalty;
|
||||
const float alpha_frequency = params.frequency_penalty;
|
||||
const int mirostat = params.mirostat;
|
||||
const float mirostat_tau = params.mirostat_tau;
|
||||
const float mirostat_eta = params.mirostat_eta;
|
||||
const bool penalize_nl = params.penalize_nl;
|
||||
|
||||
// optionally save the session on first sample (for faster prompt loading next time)
|
||||
if (!path_session.empty() && need_to_save_session && !params.prompt_cache_ro) {
|
||||
need_to_save_session = false;
|
||||
llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
|
||||
|
||||
LOG("saved session to %s\n", path_session.c_str());
|
||||
}
|
||||
|
||||
const llama_token id = llama_sample_token(ctx, ctx_guidance, grammar, params, last_tokens, candidates);
|
||||
llama_token id = 0;
|
||||
|
||||
last_tokens.erase(last_tokens.begin());
|
||||
last_tokens.push_back(id);
|
||||
{
|
||||
auto logits = llama_get_logits(ctx);
|
||||
auto n_vocab = llama_n_vocab(ctx);
|
||||
|
||||
LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, last_tokens));
|
||||
// Apply params.logit_bias map
|
||||
for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
|
||||
logits[it->first] += it->second;
|
||||
}
|
||||
|
||||
std::vector<llama_token_data> candidates;
|
||||
candidates.reserve(n_vocab);
|
||||
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
|
||||
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
|
||||
}
|
||||
|
||||
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
||||
|
||||
if (ctx_guidance) {
|
||||
llama_sample_classifier_free_guidance(ctx, &candidates_p, ctx_guidance, params.cfg_scale);
|
||||
}
|
||||
|
||||
// Apply penalties
|
||||
float nl_logit = logits[llama_token_nl(ctx)];
|
||||
auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
|
||||
llama_sample_repetition_penalty(ctx, &candidates_p,
|
||||
last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
|
||||
last_n_repeat, repeat_penalty);
|
||||
llama_sample_frequency_and_presence_penalties(ctx, &candidates_p,
|
||||
last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
|
||||
last_n_repeat, alpha_frequency, alpha_presence);
|
||||
if (!penalize_nl) {
|
||||
for (size_t idx = 0; idx < candidates_p.size; idx++) {
|
||||
if (candidates_p.data[idx].id == llama_token_nl(ctx)) {
|
||||
candidates_p.data[idx].logit = nl_logit;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (grammar != NULL) {
|
||||
llama_sample_grammar(ctx, &candidates_p, grammar);
|
||||
}
|
||||
|
||||
if (temp <= 0) {
|
||||
// Greedy sampling
|
||||
id = llama_sample_token_greedy(ctx, &candidates_p);
|
||||
} else {
|
||||
if (mirostat == 1) {
|
||||
static float mirostat_mu = 2.0f * mirostat_tau;
|
||||
const int mirostat_m = 100;
|
||||
llama_sample_temperature(ctx, &candidates_p, temp);
|
||||
id = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
|
||||
} else if (mirostat == 2) {
|
||||
static float mirostat_mu = 2.0f * mirostat_tau;
|
||||
llama_sample_temperature(ctx, &candidates_p, temp);
|
||||
id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
|
||||
} else {
|
||||
// Temperature sampling
|
||||
llama_sample_top_k(ctx, &candidates_p, top_k, 1);
|
||||
llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1);
|
||||
llama_sample_typical(ctx, &candidates_p, typical_p, 1);
|
||||
llama_sample_top_p(ctx, &candidates_p, top_p, 1);
|
||||
llama_sample_temperature(ctx, &candidates_p, temp);
|
||||
id = llama_sample_token(ctx, &candidates_p);
|
||||
}
|
||||
}
|
||||
// printf("`%d`", candidates_p.size);
|
||||
|
||||
if (grammar != NULL) {
|
||||
llama_grammar_accept_token(ctx, grammar, id);
|
||||
}
|
||||
|
||||
last_n_tokens.erase(last_n_tokens.begin());
|
||||
last_n_tokens.push_back(id);
|
||||
}
|
||||
|
||||
// add it to the context
|
||||
embd.push_back(id);
|
||||
|
||||
// echo this to console
|
||||
@@ -639,15 +651,12 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// decrement remaining sampling budget
|
||||
--n_remain;
|
||||
|
||||
LOG("n_remain: %d\n", n_remain);
|
||||
} else {
|
||||
// some user input remains from prompt or interaction, forward it to processing
|
||||
LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
|
||||
while ((int) embd_inp.size() > n_consumed) {
|
||||
embd.push_back(embd_inp[n_consumed]);
|
||||
last_tokens.erase(last_tokens.begin());
|
||||
last_tokens.push_back(embd_inp[n_consumed]);
|
||||
last_n_tokens.erase(last_n_tokens.begin());
|
||||
last_n_tokens.push_back(embd_inp[n_consumed]);
|
||||
++n_consumed;
|
||||
if ((int) embd.size() >= params.n_batch) {
|
||||
break;
|
||||
@@ -658,29 +667,22 @@ int main(int argc, char ** argv) {
|
||||
// display text
|
||||
if (input_echo) {
|
||||
for (auto id : embd) {
|
||||
const std::string token_str = llama_token_to_piece(ctx, id);
|
||||
printf("%s", token_str.c_str());
|
||||
|
||||
if (embd.size() > 1) {
|
||||
input_tokens.push_back(id);
|
||||
} else {
|
||||
output_tokens.push_back(id);
|
||||
output_ss << token_str;
|
||||
}
|
||||
printf("%s", llama_token_to_piece(ctx, id).c_str());
|
||||
}
|
||||
fflush(stdout);
|
||||
}
|
||||
// reset color to default if we there is no pending user input
|
||||
if (input_echo && (int) embd_inp.size() == n_consumed) {
|
||||
if (input_echo && (int)embd_inp.size() == n_consumed) {
|
||||
console::set_display(console::reset);
|
||||
}
|
||||
|
||||
// if not currently processing queued inputs;
|
||||
if ((int) embd_inp.size() <= n_consumed) {
|
||||
|
||||
// check for reverse prompt
|
||||
if (!params.antiprompt.empty()) {
|
||||
if (params.antiprompt.size()) {
|
||||
std::string last_output;
|
||||
for (auto id : last_tokens) {
|
||||
for (auto id : last_n_tokens) {
|
||||
last_output += llama_token_to_piece(ctx, id);
|
||||
}
|
||||
|
||||
@@ -694,7 +696,7 @@ int main(int argc, char ** argv) {
|
||||
? last_output.length() - static_cast<size_t>(antiprompt.length() + extra_padding)
|
||||
: 0;
|
||||
|
||||
if (last_output.find(antiprompt, search_start_pos) != std::string::npos) {
|
||||
if (last_output.find(antiprompt.c_str(), search_start_pos) != std::string::npos) {
|
||||
if (params.interactive) {
|
||||
is_interacting = true;
|
||||
console::set_display(console::user_input);
|
||||
@@ -704,18 +706,12 @@ int main(int argc, char ** argv) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (is_antiprompt) {
|
||||
LOG("found antiprompt: %s\n", last_output.c_str());
|
||||
}
|
||||
}
|
||||
|
||||
// deal with end of text token in interactive mode
|
||||
if (last_tokens.back() == llama_token_eos(ctx)) {
|
||||
LOG("found EOS token\n");
|
||||
|
||||
if (last_n_tokens.back() == llama_token_eos(ctx)) {
|
||||
if (params.interactive) {
|
||||
if (!params.antiprompt.empty()) {
|
||||
if (params.antiprompt.size() != 0) {
|
||||
// tokenize and inject first reverse prompt
|
||||
const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false);
|
||||
embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
|
||||
@@ -732,20 +728,16 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
if (n_past > 0 && is_interacting) {
|
||||
LOG("waiting for user input\n");
|
||||
|
||||
if (params.instruct) {
|
||||
printf("\n> ");
|
||||
}
|
||||
|
||||
if (params.input_prefix_bos) {
|
||||
LOG("adding input prefix BOS token\n");
|
||||
embd_inp.push_back(llama_token_bos(ctx));
|
||||
}
|
||||
|
||||
std::string buffer;
|
||||
if (!params.input_prefix.empty()) {
|
||||
LOG("appending input prefix: '%s'\n", params.input_prefix.c_str());
|
||||
buffer += params.input_prefix;
|
||||
printf("%s", buffer.c_str());
|
||||
}
|
||||
@@ -765,43 +757,25 @@ int main(int argc, char ** argv) {
|
||||
if (buffer.length() > 1) {
|
||||
// append input suffix if any
|
||||
if (!params.input_suffix.empty()) {
|
||||
LOG("appending input suffix: '%s'\n", params.input_suffix.c_str());
|
||||
buffer += params.input_suffix;
|
||||
printf("%s", params.input_suffix.c_str());
|
||||
}
|
||||
|
||||
LOG("buffer: '%s'\n", buffer.c_str());
|
||||
|
||||
const size_t original_size = embd_inp.size();
|
||||
|
||||
// instruct mode: insert instruction prefix
|
||||
if (params.instruct && !is_antiprompt) {
|
||||
LOG("inserting instruction prefix\n");
|
||||
n_consumed = embd_inp.size();
|
||||
embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end());
|
||||
}
|
||||
|
||||
const auto line_inp = ::llama_tokenize(ctx, buffer, false);
|
||||
LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp));
|
||||
|
||||
auto line_inp = ::llama_tokenize(ctx, buffer, false);
|
||||
embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
|
||||
|
||||
// instruct mode: insert response suffix
|
||||
if (params.instruct) {
|
||||
LOG("inserting instruction suffix\n");
|
||||
embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
|
||||
}
|
||||
|
||||
for (size_t i = original_size; i < embd_inp.size(); ++i) {
|
||||
const llama_token token = embd_inp[i];
|
||||
output_tokens.push_back(token);
|
||||
output_ss << llama_token_to_piece(ctx, token);
|
||||
}
|
||||
|
||||
n_remain -= line_inp.size();
|
||||
LOG("n_remain: %d\n", n_remain);
|
||||
} else {
|
||||
LOG("empty line, passing control back\n");
|
||||
}
|
||||
|
||||
input_echo = false; // do not echo this again
|
||||
@@ -813,7 +787,7 @@ int main(int argc, char ** argv) {
|
||||
if (grammar != NULL) {
|
||||
llama_grammar_free(grammar);
|
||||
|
||||
std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
|
||||
std::vector<const llama_grammar_element *> grammar_rules( parsed_grammar.c_rules());
|
||||
grammar = llama_grammar_init(
|
||||
grammar_rules.data(), grammar_rules.size(),
|
||||
parsed_grammar.symbol_ids.at("root"));
|
||||
@@ -825,7 +799,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// end of text token
|
||||
if (!embd.empty() && embd.back() == llama_token_eos(ctx) && !(params.instruct || params.interactive)) {
|
||||
LOG_TEE(" [end of text]\n");
|
||||
fprintf(stderr, " [end of text]\n");
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -838,13 +812,11 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
if (!path_session.empty() && params.prompt_cache_all && !params.prompt_cache_ro) {
|
||||
LOG_TEE("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str());
|
||||
fprintf(stderr, "\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str());
|
||||
llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
|
||||
}
|
||||
|
||||
llama_print_timings(ctx);
|
||||
write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
|
||||
|
||||
if (ctx_guidance) { llama_free(ctx_guidance); }
|
||||
llama_free(ctx);
|
||||
llama_free_model(model);
|
||||
@@ -854,9 +826,5 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
llama_backend_free();
|
||||
|
||||
#ifndef LOG_DISABLE_LOGS
|
||||
LOG_TEE("Log end\n")
|
||||
#endif // LOG_DISABLE_LOGS
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -1,83 +1,19 @@
|
||||
#include "build-info.h"
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
#include "build-info.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <ctime>
|
||||
#include <sstream>
|
||||
#include <cstring>
|
||||
#include <thread>
|
||||
#include <mutex>
|
||||
#include <vector>
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
#endif
|
||||
|
||||
struct results_perplexity {
|
||||
std::vector<llama_token> tokens;
|
||||
double ppl_value;
|
||||
std::vector<float> logits;
|
||||
std::vector<float> probs;
|
||||
};
|
||||
|
||||
struct results_log_softmax {
|
||||
double log_softmax;
|
||||
float logit;
|
||||
float prob;
|
||||
};
|
||||
|
||||
static void write_logfile(
|
||||
const llama_context * ctx, const gpt_params & params, const llama_model * model,
|
||||
const struct results_perplexity & results
|
||||
) {
|
||||
if (params.logdir.empty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (params.hellaswag) {
|
||||
fprintf(stderr, "%s: warning: logging results is not implemented for HellaSwag. No files will be written.\n", __func__);
|
||||
return;
|
||||
}
|
||||
|
||||
const std::string timestamp = get_sortable_timestamp();
|
||||
|
||||
const bool success = create_directory_with_parents(params.logdir);
|
||||
if (!success) {
|
||||
fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
|
||||
__func__, params.logdir.c_str());
|
||||
return;
|
||||
}
|
||||
|
||||
const std::string logfile_path = params.logdir + timestamp + ".yml";
|
||||
FILE * logfile = fopen(logfile_path.c_str(), "w");
|
||||
|
||||
if (logfile == NULL) {
|
||||
fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
|
||||
return;
|
||||
}
|
||||
|
||||
fprintf(logfile, "binary: main\n");
|
||||
char model_desc[128];
|
||||
llama_model_desc(model, model_desc, sizeof(model_desc));
|
||||
dump_non_result_info_yaml(logfile, params, ctx, timestamp, results.tokens, model_desc);
|
||||
|
||||
fprintf(logfile, "\n");
|
||||
fprintf(logfile, "######################\n");
|
||||
fprintf(logfile, "# Perplexity Results #\n");
|
||||
fprintf(logfile, "######################\n");
|
||||
fprintf(logfile, "\n");
|
||||
|
||||
dump_vector_float_yaml(logfile, "logits", results.logits);
|
||||
fprintf(logfile, "ppl_value: %f\n", results.ppl_value);
|
||||
dump_vector_float_yaml(logfile, "probs", results.probs);
|
||||
|
||||
llama_dump_timing_info_yaml(logfile, ctx);
|
||||
fclose(logfile);
|
||||
}
|
||||
|
||||
static std::vector<float> softmax(const std::vector<float>& logits) {
std::vector<float> softmax(const std::vector<float>& logits) {
std::vector<float> probs(logits.size());
float max_logit = logits[0];
for (float v : logits) max_logit = std::max(max_logit, v);
@@ -93,21 +29,20 @@ static std::vector<float> softmax(const std::vector<float>& logits) {
return probs;
}

static results_log_softmax log_softmax(int n_vocab, const float * logits, int tok) {
float log_softmax(int n_vocab, const float * logits, int tok) {
float max_logit = logits[0];
for (int i = 1; i < n_vocab; ++i) max_logit = std::max(max_logit, logits[i]);
double sum_exp = 0.0;
for (int i = 0; i < n_vocab; ++i) sum_exp += expf(logits[i] - max_logit);
return {logits[tok] - max_logit - log(sum_exp), logits[tok], expf(logits[tok] - max_logit) / (float) sum_exp};
return logits[tok] - max_logit - log(sum_exp);
}

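Editor's note (not part of the diff): both versions of log_softmax above compute the numerically stable log-probability of token t from the raw logits z,

\[
\log p(t) = z_t - \max_i z_i - \log \sum_i e^{\,z_i - \max_j z_j},
\]

and process_logits below accumulates v = -log p(t) into nll and v^2 into nll2, giving the mean and spread of the per-token negative log-likelihood.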
static void process_logits(
|
||||
int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread> & workers,
|
||||
double & nll, double & nll2, float * logit_history, float * prob_history
|
||||
) {
|
||||
void process_logits(int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread>& workers,
|
||||
double& nll, double& nll2) {
|
||||
|
||||
std::mutex mutex;
|
||||
int counter = 0;
|
||||
auto compute = [&mutex, &counter, &nll, &nll2, logit_history, prob_history, n_vocab, logits, tokens, n_token] () {
|
||||
auto compute = [&mutex, &counter, &nll, &nll2, n_vocab, logits, tokens, n_token] () {
|
||||
double local_nll = 0, local_nll2 = 0;
|
||||
while (true) {
|
||||
std::unique_lock<std::mutex> lock(mutex);
|
||||
@@ -117,51 +52,34 @@ static void process_logits(
|
||||
break;
|
||||
}
|
||||
lock.unlock();
|
||||
const results_log_softmax results = log_softmax(n_vocab, logits + i*n_vocab, tokens[i+1]);
|
||||
const double v = -results.log_softmax;
|
||||
double v = -log_softmax(n_vocab, logits + i*n_vocab, tokens[i+1]);
|
||||
local_nll += v;
|
||||
local_nll2 += v*v;
|
||||
|
||||
logit_history[i] = results.logit;
|
||||
prob_history[i] = results.prob;
|
||||
}
|
||||
};
|
||||
for (auto & w : workers) w = std::thread(compute);
|
||||
for (auto& w : workers) w = std::thread(compute);
|
||||
compute();
|
||||
for (auto & w : workers) w.join();
|
||||
for (auto& w : workers) w.join();
|
||||
|
||||
}
|
||||
|
||||
static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params) {
|
||||
void perplexity_v2(llama_context * ctx, const gpt_params & params) {
|
||||
// Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
|
||||
// Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
|
||||
// Output: `perplexity: 13.5106 [114/114]`
|
||||
// BOS tokens will be added for each chunk before eval
|
||||
|
||||
if (params.ppl_stride <= 0) {
|
||||
fprintf(stderr, "%s: stride is %d but must be greater than zero!\n",__func__,params.ppl_stride);
|
||||
return;
|
||||
}
|
||||
|
||||
const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
|
||||
const bool add_bos = is_spm;
|
||||
|
||||
fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
|
||||
|
||||
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
|
||||
|
||||
if (int(tokens.size()) < 2*params.n_ctx) {
|
||||
fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*params.n_ctx,
|
||||
params.n_ctx);
|
||||
fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
|
||||
return {std::move(tokens), 0., {}, {}};
|
||||
}
|
||||
|
||||
std::vector<float> logit_history;
|
||||
std::vector<float> prob_history;
|
||||
|
||||
logit_history.resize(tokens.size());
|
||||
prob_history.resize(tokens.size());
|
||||
|
||||
if (params.ppl_stride <= 0) {
|
||||
fprintf(stderr, "%s: stride is %d but must be greater than zero!\n",__func__,params.ppl_stride);
|
||||
return {tokens, -1, logit_history, prob_history};
|
||||
}
|
||||
auto tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
|
||||
|
||||
const int calc_chunk = params.n_ctx;
|
||||
|
||||
@@ -170,7 +88,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
|
||||
if (int(tokens.size()) <= calc_chunk) {
|
||||
fprintf(stderr, "%s: there are only %zu tokens, this is not enough for a context size of %d and stride %d\n",__func__,
|
||||
tokens.size(), params.n_ctx, params.ppl_stride);
|
||||
return {tokens, -1, logit_history, prob_history};
|
||||
return;
|
||||
}
|
||||
|
||||
const int n_chunk_max = (tokens.size() - calc_chunk + params.ppl_stride - 1) / params.ppl_stride;
|
||||
@@ -202,7 +120,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
|
||||
//fprintf(stderr, " Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch);
|
||||
if (llama_eval(ctx, tokens.data() + batch_start, batch_size, j * n_batch, params.n_threads)) {
|
||||
//fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||
return {tokens, -1, logit_history, prob_history};
|
||||
return;
|
||||
}
|
||||
|
||||
// save original token and restore it after eval
|
||||
@@ -243,8 +161,6 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
|
||||
logits.begin() + (j + 1) * n_vocab);
|
||||
|
||||
const float prob = softmax(tok_logits)[tokens[start + j + 1]];
|
||||
logit_history[start + j + 1] = tok_logits[tokens[start + j + 1]];
|
||||
prob_history[start + j + 1] = prob;
|
||||
|
||||
nll += -std::log(prob);
|
||||
++count;
|
||||
@@ -258,13 +174,12 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
|
||||
fflush(stdout);
|
||||
}
|
||||
printf("\n");
|
||||
|
||||
return {tokens, std::exp(nll / count), logit_history, prob_history};
|
||||
}
|
||||
|
||||
static results_perplexity perplexity(llama_context * ctx, const gpt_params & params) {
|
||||
void perplexity(llama_context * ctx, const gpt_params & params) {
|
||||
if (params.ppl_stride > 0) {
|
||||
return perplexity_v2(ctx, params);
|
||||
perplexity_v2(ctx, params);
|
||||
return;
|
||||
}
|
||||
|
||||
// Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
|
||||
@@ -278,24 +193,11 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
|
||||
auto tim1 = std::chrono::high_resolution_clock::now();
|
||||
fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
|
||||
|
||||
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
|
||||
auto tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
|
||||
|
||||
auto tim2 = std::chrono::high_resolution_clock::now();
|
||||
fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
|
||||
|
||||
if (int(tokens.size()) < 2*params.n_ctx) {
|
||||
fprintf(stderr, "%s: you need at least %d tokens to evaluate perplexity with a context of %d\n",__func__,2*params.n_ctx,
|
||||
params.n_ctx);
|
||||
fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
|
||||
return {std::move(tokens), 0., {}, {}};
|
||||
}
|
||||
|
||||
std::vector<float> logit_history;
|
||||
logit_history.resize(tokens.size());
|
||||
|
||||
std::vector<float> prob_history;
|
||||
prob_history.resize(tokens.size());
|
||||
|
||||
const int n_chunk_max = tokens.size() / params.n_ctx;
|
||||
|
||||
const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
|
||||
@@ -334,7 +236,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
|
||||
|
||||
if (llama_eval(ctx, tokens.data() + batch_start, batch_size, j * n_batch, params.n_threads)) {
|
||||
fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||
return {tokens, -1, logit_history, prob_history};
|
||||
return;
|
||||
}
|
||||
|
||||
// restore the original token in case it was set to BOS
|
||||
@@ -369,9 +271,8 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
|
||||
// Example, we have a context window of 512, we will compute perplexity for each of the
|
||||
// last 256 tokens. Then, we split the input up into context window size chunks to
|
||||
// process the entire prompt.
|
||||
const int first = params.n_ctx/2;
|
||||
process_logits(n_vocab, logits.data() + first*n_vocab, tokens.data() + start + first, params.n_ctx - 1 - first,
|
||||
workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
|
||||
const int first = std::min(512, params.n_ctx/2);
|
||||
process_logits(n_vocab, logits.data() + first*n_vocab, tokens.data() + start + first, params.n_ctx - 1 - first, workers, nll, nll2);
|
||||
count += params.n_ctx - first - 1;
|
||||
|
||||
// perplexity is e^(average negative log-likelihood)
|
||||
@@ -386,24 +287,20 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
|
||||
fflush(stdout);
}
printf("\n");

nll2 /= count;
nll /= count;
const double ppl = exp(nll);
nll2 -= nll * nll;
if (nll2 > 0) {
nll2 = sqrt(nll2/(count-1));
double ppl = exp(nll);
printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
} else {
printf("Unexpected negative standard deviation of log(prob)\n");
}

return {tokens, ppl, logit_history, prob_history};
}

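Editor's note (not part of the diff): with v_i = -log p(x_i | x_{<i}) averaged over the N = count evaluated tokens, the final estimate printed above is

\[
\mathrm{PPL} = \exp\!\Big(\tfrac{1}{N}\sum_{i=1}^{N} v_i\Big),
\qquad
\Delta\mathrm{PPL} \approx \mathrm{PPL} \cdot \sqrt{\frac{\overline{v^2} - \bar v^{\,2}}{N - 1}},
\]

i.e. the reported uncertainty is the standard error of the mean negative log-likelihood propagated through the exponential.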
static std::vector<float> hellaswag_evaluate_tokens(
|
||||
llama_context * ctx, const std::vector<int>& tokens, int n_past, int n_batch, int n_vocab, int n_thread
|
||||
) {
|
||||
std::vector<float> hellaswag_evaluate_tokens(llama_context * ctx, const std::vector<int>& tokens, int n_past, int n_batch,
|
||||
int n_vocab, int n_thread) {
|
||||
std::vector<float> result;
|
||||
result.reserve(tokens.size() * n_vocab);
|
||||
size_t n_chunk = (tokens.size() + n_batch - 1)/n_batch;
|
||||
@@ -423,7 +320,7 @@ static std::vector<float> hellaswag_evaluate_tokens(
|
||||
return result;
|
||||
}
|
||||
|
||||
static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
||||
void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
||||
// Calculates hellaswag score (acc_norm) from prompt
|
||||
//
|
||||
// Data extracted from the HellaSwag validation dataset (MIT license) https://github.com/rowanz/hellaswag/blob/master/data/hellaswag_val.jsonl
|
||||
@@ -657,7 +554,7 @@ int main(int argc, char ** argv) {
|
||||
gpt_params params;
|
||||
|
||||
params.n_batch = 512;
|
||||
if (!gpt_params_parse(argc, argv, params)) {
|
||||
if (gpt_params_parse(argc, argv, params) == false) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -670,7 +567,12 @@ int main(int argc, char ** argv) {
|
||||
params.n_ctx += params.ppl_stride/2;
|
||||
}
|
||||
|
||||
print_build_info();
|
||||
if (params.n_ctx > 2048) {
|
||||
fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);"
|
||||
"expect poor results\n", __func__, params.n_ctx);
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
|
||||
|
||||
if (params.seed == LLAMA_DEFAULT_SEED) {
|
||||
params.seed = time(NULL);
|
||||
@@ -695,12 +597,6 @@ int main(int argc, char ** argv) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
const int n_ctx_train = llama_n_ctx_train(ctx);
|
||||
if (params.n_ctx > n_ctx_train) {
|
||||
fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
|
||||
__func__, n_ctx_train, params.n_ctx);
|
||||
}
|
||||
|
||||
// print system information
|
||||
{
|
||||
fprintf(stderr, "\n");
|
||||
@@ -708,16 +604,13 @@ int main(int argc, char ** argv) {
|
||||
params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
|
||||
}
|
||||
|
||||
struct results_perplexity results;
|
||||
if (params.hellaswag) {
|
||||
hellaswag_score(ctx, params);
|
||||
} else {
|
||||
results = perplexity(ctx, params);
|
||||
perplexity(ctx, params);
|
||||
}
|
||||
|
||||
llama_print_timings(ctx);
|
||||
write_logfile(ctx, params, model, results);
|
||||
|
||||
llama_free(ctx);
|
||||
llama_free_model(model);
|
||||
|
||||
|
||||
@@ -2,5 +2,4 @@ set(TARGET quantize-stats)
|
||||
add_executable(${TARGET} quantize-stats.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_include_directories(${TARGET} PRIVATE ../../common)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
#define LLAMA_API_INTERNAL
|
||||
#include "build-info.h"
|
||||
#include "common.h"
|
||||
#include "ggml.h"
|
||||
#include "build-info.h"
|
||||
|
||||
#define LLAMA_API_INTERNAL
|
||||
#include "llama.h"
|
||||
|
||||
#include <algorithm>
|
||||
@@ -34,8 +34,8 @@ struct quantize_stats_params {
|
||||
std::vector<enum ggml_type> include_types;
|
||||
};
|
||||
|
||||
constexpr size_t HISTOGRAM_BUCKETS = 150;
|
||||
constexpr double HISTOGRAM_RANGE = 0.03;
|
||||
const size_t HISTOGRAM_BUCKETS = 150;
|
||||
const double HISTOGRAM_RANGE = 0.03;
|
||||
|
||||
struct error_stats {
|
||||
size_t num_samples;
|
||||
@@ -44,7 +44,8 @@ struct error_stats {
|
||||
uint64_t error_histogram[HISTOGRAM_BUCKETS];
|
||||
};
|
||||
|
||||
static void quantize_stats_print_usage(int /*argc*/, char ** argv) {
|
||||
|
||||
void quantize_stats_print_usage(int /*argc*/, char ** argv) {
|
||||
quantize_stats_params params;
|
||||
fprintf(stderr, "usage: %s [options]\n", argv[0]);
|
||||
fprintf(stderr, "\n");
|
||||
@@ -70,7 +71,7 @@ static void quantize_stats_print_usage(int /*argc*/, char ** argv) {
|
||||
}
|
||||
|
||||
// Check if a layer is included/excluded by command line
|
||||
static bool layer_included(const quantize_stats_params & params, const std::string & layer) {
|
||||
bool layer_included(const quantize_stats_params params, const std::string & layer) {
|
||||
for (const auto& excluded : params.exclude_layers) {
|
||||
if (std::regex_search(layer, std::regex(excluded))) {
|
||||
return false;
|
||||
@@ -85,7 +86,7 @@ static bool layer_included(const quantize_stats_params & params, const std::stri
|
||||
}
|
||||
|
||||
// Update error statistics given vectors with the before/after result of quantization
|
||||
static void update_error_stats(int64_t nelements, const float * input, const float * output, error_stats & stats) {
|
||||
void update_error_stats(int64_t nelements, const float * input, const float * output, error_stats & stats) {
|
||||
for (int64_t i = 0; i < nelements; i++) {
|
||||
double diff = input[i] - output[i];
|
||||
stats.total_error += diff * diff;
|
||||
@@ -95,14 +96,14 @@ static void update_error_stats(int64_t nelements, const float * input, const flo
|
||||
stats.num_samples += nelements;
|
||||
}
|
||||
|
||||
static void combine_error_stats(error_stats & into, const error_stats & from) {
|
||||
void combine_error_stats(error_stats & into, const error_stats & from) {
|
||||
into.num_samples += from.num_samples;
|
||||
into.total_error += from.total_error;
|
||||
if (from.max_error > into.max_error) into.max_error = from.max_error;
|
||||
for (size_t i=0; i<HISTOGRAM_BUCKETS; ++i) into.error_histogram[i] += from.error_histogram[i];
|
||||
}
|
||||
|
||||
static double find_quantile(const error_stats & stats, double quantile) {
|
||||
double find_quantile(const error_stats & stats, double quantile) {
|
||||
double sum = std::accumulate(std::begin(stats.error_histogram), std::end(stats.error_histogram), 0.0);
|
||||
|
||||
double accum = 0;
|
||||
@@ -115,7 +116,7 @@ static double find_quantile(const error_stats & stats, double quantile) {
|
||||
return INFINITY;
|
||||
}
|
||||
|
||||
static void print_error_stats(const std::string & name, const error_stats & stats, bool print_histogram) {
|
||||
void print_error_stats(const std::string & name, const error_stats & stats, bool print_histogram) {
|
||||
double rmse = sqrt(stats.total_error / (double) stats.num_samples);
|
||||
double median = find_quantile(stats, .5);
|
||||
double pct95 = find_quantile(stats, .95);
|
||||
@@ -142,10 +143,17 @@ static bool tensor_is_contiguous(const struct ggml_tensor * tensor) {
|
||||
tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
|
||||
}
|
||||
|
||||
static void test_roundtrip_on_chunk(
|
||||
const ggml_tensor * layer, int64_t offset, int64_t chunk_size, const ggml_type_traits_t & qfns, bool use_reference,
|
||||
float * input_scratch, char * quantized_scratch, float * output_scratch, error_stats & stats
|
||||
) {
|
||||
void test_roundtrip_on_chunk(
|
||||
const ggml_tensor * layer,
|
||||
int64_t offset,
|
||||
int64_t chunk_size,
|
||||
const ggml_type_traits_t & qfns,
|
||||
bool use_reference,
|
||||
float * input_scratch,
|
||||
char * quantized_scratch,
|
||||
float * output_scratch,
|
||||
error_stats & stats) {
|
||||
|
||||
if (layer->type == GGML_TYPE_F16) {
|
||||
for (int i = 0; i < chunk_size; i++) {
|
||||
input_scratch[i] = ggml_get_f32_1d(layer, i + offset);
|
||||
@@ -166,11 +174,18 @@ static void test_roundtrip_on_chunk(
|
||||
|
||||
|
||||
// Run quantization function for a single layer and update error stats
|
||||
static void test_roundtrip_on_layer(
|
||||
std::string & name, bool print_layer_stats, const ggml_type_traits_t & qfns, bool use_reference,
|
||||
const ggml_tensor * layer, std::vector<float> & input_scratch, std::vector<char> & quantized_scratch,
|
||||
std::vector<float> & output_scratch, error_stats & total_error, int max_thread = 0
|
||||
) {
|
||||
void test_roundtrip_on_layer(
|
||||
std::string & name,
|
||||
bool print_layer_stats,
|
||||
const ggml_type_traits_t & qfns,
|
||||
bool use_reference,
|
||||
const ggml_tensor * layer,
|
||||
std::vector<float> & input_scratch,
|
||||
std::vector<char> & quantized_scratch,
|
||||
std::vector<float> & output_scratch,
|
||||
error_stats & total_error,
|
||||
int max_thread = 0) {
|
||||
|
||||
assert(tensor_is_contiguous(layer));
|
||||
error_stats layer_error {};
|
||||
uint64_t nelements = ggml_nelements(layer);
|
||||
@@ -299,7 +314,7 @@ int main(int argc, char ** argv) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
print_build_info();
|
||||
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
|
||||
|
||||
// load the model
|
||||
fprintf(stderr, "Loading model\n");
|
||||
|
||||
@@ -2,7 +2,6 @@ set(TARGET quantize)
|
||||
add_executable(${TARGET} quantize.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_include_directories(${TARGET} PRIVATE ../../common)
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
if(TARGET BUILD_INFO)
|
||||
add_dependencies(${TARGET} BUILD_INFO)
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#include "build-info.h"
|
||||
#include "common.h"
|
||||
|
||||
#include "llama.h"
|
||||
|
||||
#include <cstdio>
|
||||
@@ -35,12 +35,10 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
|
||||
{ "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", },
|
||||
{ "F16", LLAMA_FTYPE_MOSTLY_F16, "13.00G @ 7B", },
|
||||
{ "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },
|
||||
// Note: Ensure COPY comes after F32 to avoid ftype 0 from matching.
|
||||
{ "COPY", LLAMA_FTYPE_ALL_F32, "only copy tensors, no quantizing", },
|
||||
};
|
||||
|
||||
|
||||
static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) {
|
||||
bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) {
|
||||
std::string ftype_str;
|
||||
|
||||
for (auto ch : ftype_str_in) {
|
||||
@@ -72,18 +70,13 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
|
||||
// usage:
|
||||
// ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
|
||||
//
|
||||
static void usage(const char * executable) {
|
||||
printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
|
||||
printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
|
||||
printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
|
||||
printf("\nAllowed quantization types:\n");
|
||||
void usage(const char * executable) {
|
||||
fprintf(stderr, "usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
|
||||
fprintf(stderr, " --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
|
||||
fprintf(stderr, " --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
|
||||
fprintf(stderr, "\nAllowed quantization types:\n");
|
||||
for (auto & it : QUANT_OPTIONS) {
|
||||
if (it.name != "COPY") {
|
||||
printf(" %2d or ", it.ftype);
|
||||
} else {
|
||||
printf(" ");
|
||||
}
|
||||
printf("%-6s : %s\n", it.name.c_str(), it.desc.c_str());
|
||||
printf(" %2d or %-6s : %s\n", it.ftype, it.name.c_str(), it.desc.c_str());
|
||||
}
|
||||
exit(1);
|
||||
}
|
||||
@@ -128,9 +121,6 @@ int main(int argc, char ** argv) {
|
||||
// export as [inp path]/ggml-model-[ftype].gguf
|
||||
fname_out = fpath + "ggml-model-" + ftype_str + ".gguf";
|
||||
arg_idx++;
|
||||
if (ftype_str == "COPY") {
|
||||
params.only_copy = true;
|
||||
}
|
||||
}
|
||||
else {
|
||||
fname_out = argv[arg_idx];
|
||||
@@ -144,9 +134,6 @@ int main(int argc, char ** argv) {
|
||||
fprintf(stderr, "%s: invalid ftype '%s'\n", __func__, argv[3]);
|
||||
return 1;
|
||||
}
|
||||
if (ftype_str == "COPY") {
|
||||
params.only_copy = true;
|
||||
}
|
||||
arg_idx++;
|
||||
}
|
||||
|
||||
@@ -161,7 +148,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
}
|
||||
|
||||
print_build_info();
|
||||
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
|
||||
|
||||
fprintf(stderr, "%s: quantizing '%s' to '%s' as %s", __func__, fname_inp.c_str(), fname_out.c_str(), ftype_str.c_str());
|
||||
if (params.nthread > 0) {
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
#include "build-info.h"
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
#include "build-info.h"
|
||||
|
||||
#include <vector>
|
||||
#include <cstdio>
|
||||
@@ -13,11 +13,11 @@ int main(int argc, char ** argv) {
|
||||
params.repeat_last_n = 64;
|
||||
params.prompt = "The quick brown fox";
|
||||
|
||||
if (!gpt_params_parse(argc, argv, params)) {
|
||||
if (gpt_params_parse(argc, argv, params) == false) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
print_build_info();
|
||||
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
|
||||
|
||||
if (params.n_predict < 0) {
|
||||
params.n_predict = 16;
|
||||
@@ -44,7 +44,7 @@ int main(int argc, char ** argv) {
|
||||
llama_free_model(model);
|
||||
return 1;
|
||||
}
|
||||
auto tokens = llama_tokenize(ctx, params.prompt, true);
|
||||
auto tokens = llama_tokenize(ctx, params.prompt.c_str(), true);
|
||||
auto n_prompt_tokens = tokens.size();
|
||||
if (n_prompt_tokens < 1) {
|
||||
fprintf(stderr, "%s : failed to tokenize prompt\n", __func__);
|
||||
|
||||
File diff suppressed because it is too large
@@ -145,29 +145,7 @@
|
||||
color: #888;
|
||||
}
|
||||
|
||||
|
||||
@keyframes loading-bg-wipe {
|
||||
0% {
|
||||
background-position: 0%;
|
||||
}
|
||||
100% {
|
||||
background-position: 100%;
|
||||
}
|
||||
}
|
||||
|
||||
.loading {
|
||||
--loading-color-1: #eeeeee00;
|
||||
--loading-color-2: #eeeeeeff;
|
||||
background-size: 50% 100%;
|
||||
background-image: linear-gradient(90deg, var(--loading-color-1), var(--loading-color-2), var(--loading-color-1));
|
||||
animation: loading-bg-wipe 2s linear infinite;
|
||||
}
|
||||
|
||||
@media (prefers-color-scheme: dark) {
|
||||
.loading {
|
||||
--loading-color-1: #22222200;
|
||||
--loading-color-2: #222222ff;
|
||||
}
|
||||
.popover-content {
|
||||
background-color: black;
|
||||
}
|
||||
@@ -343,10 +321,7 @@
|
||||
const llamaStats = signal(null)
|
||||
const controller = signal(null)
|
||||
|
||||
// currently generating a completion?
|
||||
const generating = computed(() => controller.value != null)
|
||||
|
||||
// has the user started a chat?
|
||||
const generating = computed(() => controller.value == null )
|
||||
const chatStarted = computed(() => session.value.transcript.length > 0)
|
||||
|
||||
const transcriptUpdate = (transcript) => {
|
||||
@@ -455,19 +430,11 @@
|
||||
return html`
|
||||
<form onsubmit=${submit}>
|
||||
<div>
|
||||
<textarea
|
||||
className=${generating.value ? "loading" : null}
|
||||
oninput=${(e) => message.value = e.target.value}
|
||||
onkeypress=${enterSubmits}
|
||||
placeholder="Say something..."
|
||||
rows=2
|
||||
type="text"
|
||||
value="${message}"
|
||||
/>
|
||||
<textarea type="text" rows=2 onkeypress=${enterSubmits} value="${message}" oninput=${(e) => message.value = e.target.value} placeholder="Say something..."/>
|
||||
</div>
|
||||
<div class="right">
|
||||
<button type="submit" disabled=${generating.value}>Send</button>
|
||||
<button onclick=${stop} disabled=${!generating.value}>Stop</button>
|
||||
<button type="submit" disabled=${!generating.value} >Send</button>
|
||||
<button onclick=${stop} disabled=${generating}>Stop</button>
|
||||
<button onclick=${reset}>Reset</button>
|
||||
</div>
|
||||
</form>
|
||||
|
||||
@@ -17,8 +17,6 @@
|
||||
#include "completion.js.hpp"
|
||||
#include "json-schema-to-grammar.mjs.hpp"
|
||||
|
||||
#include <cstddef>
|
||||
|
||||
#ifndef SERVER_VERBOSE
|
||||
#define SERVER_VERBOSE 1
|
||||
#endif
|
||||
@@ -118,7 +116,7 @@ static void server_log(const char *level, const char *function, int line,
|
||||
}
|
||||
|
||||
const std::string str = log.dump(-1, ' ', false, json::error_handler_t::replace);
|
||||
printf("%.*s\n", (int)str.size(), str.data());
|
||||
fprintf(stdout, "%.*s\n", (int)str.size(), str.data());
|
||||
fflush(stdout);
|
||||
}
|
||||
|
||||
@@ -139,7 +137,7 @@ static std::string tokens_to_output_formatted_string(const llama_context *ctx, c
|
||||
}
|
||||
|
||||
// convert a vector of completion_token_output to json
|
||||
static json probs_vector_to_json(const llama_context *ctx, const std::vector<completion_token_output> & probs)
|
||||
static json probs_vector_to_json(const llama_context *ctx, const std::vector<completion_token_output> probs)
|
||||
{
|
||||
json out = json::array();
|
||||
for (const auto &prob : probs)
|
||||
@@ -271,7 +269,7 @@ struct llama_server_context
|
||||
return true;
|
||||
}
|
||||
|
||||
std::vector<llama_token> tokenize(const json & json_prompt, bool add_bos) const
|
||||
std::vector<llama_token> tokenize(json json_prompt, bool add_bos)
|
||||
{
|
||||
// If `add_bos` is true, we only add BOS, when json_prompt is a string,
|
||||
// or the first element of the json_prompt array is a string.
|
||||
@@ -611,7 +609,7 @@ struct llama_server_context
|
||||
|
||||
completion_token_output doCompletion()
|
||||
{
|
||||
auto token_with_probs = nextToken();
|
||||
const completion_token_output token_with_probs = nextToken();
|
||||
|
||||
const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_piece(ctx, token_with_probs.tok);
|
||||
generated_text += token_text;
|
||||
@@ -694,50 +692,50 @@ struct llama_server_context
|
||||
static void server_print_usage(const char *argv0, const gpt_params ¶ms,
|
||||
const server_params &sparams)
|
||||
{
|
||||
printf("usage: %s [options]\n", argv0);
|
||||
printf("\n");
|
||||
printf("options:\n");
|
||||
printf(" -h, --help show this help message and exit\n");
|
||||
printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
|
||||
printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
|
||||
printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
|
||||
printf(" --rope-freq-base N RoPE base frequency (default: loaded from model)\n");
|
||||
printf(" --rope-freq-scale N RoPE frequency scaling factor (default: loaded from model)\n");
|
||||
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
|
||||
printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
|
||||
printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
|
||||
fprintf(stdout, "usage: %s [options]\n", argv0);
|
||||
fprintf(stdout, "\n");
|
||||
fprintf(stdout, "options:\n");
|
||||
fprintf(stdout, " -h, --help show this help message and exit\n");
|
||||
fprintf(stdout, " -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
|
||||
fprintf(stdout, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
|
||||
fprintf(stdout, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
|
||||
fprintf(stdout, " --rope-freq-base N RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
|
||||
fprintf(stdout, " --rope-freq-scale N RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
|
||||
fprintf(stdout, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
|
||||
fprintf(stdout, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
|
||||
fprintf(stdout, " not recommended: doubles context memory required and no measurable increase in quality\n");
|
||||
if (llama_mlock_supported())
|
||||
{
|
||||
printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");
|
||||
fprintf(stdout, " --mlock force system to keep model in RAM rather than swapping or compressing\n");
|
||||
}
|
||||
if (llama_mmap_supported())
|
||||
{
|
||||
printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
|
||||
fprintf(stdout, " --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
|
||||
}
|
||||
printf(" --numa attempt optimizations that help on some NUMA systems\n");
|
||||
fprintf(stdout, " --numa attempt optimizations that help on some NUMA systems\n");
|
||||
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
|
||||
printf(" -ngl N, --n-gpu-layers N\n");
|
||||
printf(" number of layers to store in VRAM\n");
|
||||
printf(" -ts SPLIT --tensor-split SPLIT\n");
|
||||
printf(" how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
|
||||
printf(" -mg i, --main-gpu i the GPU to use for scratch and small tensors\n");
|
||||
printf(" -lv, --low-vram don't allocate VRAM scratch buffer\n");
|
||||
printf(" -nommq, --no-mul-mat-q\n");
|
||||
printf(" use cuBLAS instead of custom mul_mat_q CUDA kernels.\n");
|
||||
printf(" Not recommended since this is both slower and uses more VRAM.\n");
|
||||
fprintf(stdout, " -ngl N, --n-gpu-layers N\n");
|
||||
fprintf(stdout, " number of layers to store in VRAM\n");
|
||||
fprintf(stdout, " -ts SPLIT --tensor-split SPLIT\n");
|
||||
fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
|
||||
fprintf(stdout, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n");
|
||||
fprintf(stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n");
|
||||
fprintf(stdout, " -nommq, --no-mul-mat-q\n");
|
||||
fprintf(stdout, " use cuBLAS instead of custom mul_mat_q CUDA kernels.\n");
|
||||
fprintf(stdout, " Not recommended since this is both slower and uses more VRAM.\n");
|
||||
#endif
|
||||
printf(" -m FNAME, --model FNAME\n");
|
||||
printf(" model path (default: %s)\n", params.model.c_str());
|
||||
printf(" -a ALIAS, --alias ALIAS\n");
|
||||
printf(" set an alias for the model, will be added as `model` field in completion response\n");
|
||||
printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
|
||||
printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
|
||||
printf(" --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str());
|
||||
printf(" --port PORT port to listen (default (default: %d)\n", sparams.port);
|
||||
printf(" --path PUBLIC_PATH path from which to serve static files (default %s)\n", sparams.public_path.c_str());
|
||||
printf(" -to N, --timeout N server read/write timeout in seconds (default: %d)\n", sparams.read_timeout);
|
||||
printf(" --embedding enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled");
|
||||
printf("\n");
|
||||
fprintf(stdout, " -m FNAME, --model FNAME\n");
|
||||
fprintf(stdout, " model path (default: %s)\n", params.model.c_str());
|
||||
fprintf(stdout, " -a ALIAS, --alias ALIAS\n");
|
||||
fprintf(stdout, " set an alias for the model, will be added as `model` field in completion response\n");
|
||||
fprintf(stdout, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
|
||||
fprintf(stdout, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
|
||||
fprintf(stdout, " --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str());
|
||||
fprintf(stdout, " --port PORT port to listen (default (default: %d)\n", sparams.port);
|
||||
fprintf(stdout, " --path PUBLIC_PATH path from which to serve static files (default %s)\n", sparams.public_path.c_str());
|
||||
fprintf(stdout, " -to N, --timeout N server read/write timeout in seconds (default: %d)\n", sparams.read_timeout);
|
||||
fprintf(stdout, " --embedding enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled");
|
||||
fprintf(stdout, "\n");
|
||||
}
|
||||
|
||||
static void server_params_parse(int argc, char **argv, server_params &sparams,
@@ -1040,7 +1038,7 @@ static json format_timings(llama_server_context &llama)
{
const auto timings = llama_get_timings(llama.ctx);

assert(timings.n_eval == ptrdiff_t(llama.num_tokens_predicted));
assert(timings.n_eval == llama.num_tokens_predicted);

return json{
{"prompt_n", timings.n_p_eval},
@@ -1083,9 +1081,8 @@ static json format_final_response(llama_server_context &llama, const std::string
return res;
}

static json format_partial_response(
llama_server_context &llama, const std::string &content, const std::vector<completion_token_output> &probs
) {
static json format_partial_response(llama_server_context &llama, const std::string &content, const std::vector<completion_token_output> &probs)
{
json res = json{
{"content", content},
{"stop", false},
@@ -1216,7 +1213,7 @@ static void log_server_request(const Request &req, const Response &res)
});
}

static bool is_at_eob(llama_server_context &server_context, const llama_token *tokens, const size_t n_tokens) {
bool is_at_eob(llama_server_context & server_context, const llama_token * tokens, const size_t n_tokens) {
return n_tokens && tokens[n_tokens-1] == llama_token_eos(server_context.ctx);
}

@@ -1226,7 +1223,7 @@ static bool is_at_eob(llama_server_context &server_context, const llama_token *t
// * When all beams converge to a common prefix, they are made available in beams_state.beams[0].
// This is also called when the stop condition is met.
// Collect tokens into std::vector<llama_token> response which is pointed to by callback_data.
static void beam_search_callback(void *callback_data, llama_beams_state beams_state) {
void beam_search_callback(void * callback_data, llama_beams_state beams_state) {
auto & llama = *static_cast<llama_server_context*>(callback_data);
// Mark beams as EOS as needed.
for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
@@ -1242,7 +1239,7 @@ static void beam_search_callback(void *callback_data, llama_beams_state beams_st
const llama_token * tokens = beams_state.beam_views[0].tokens;
const auto map = [](llama_token tok) { return completion_token_output{{},tok}; };
std::transform(tokens, tokens + n, llama.generated_token_probs.end() - n, map);
printf("%zu", n);
printf("%lu", n);
}
fflush(stdout);
#if 0 // DEBUG: print current beams for this iteration
@@ -1256,11 +1253,10 @@ static void beam_search_callback(void *callback_data, llama_beams_state beams_st
struct token_translator {
llama_context * ctx;
std::string operator()(llama_token tok) const { return llama_token_to_piece(ctx, tok); }
std::string operator()(const completion_token_output & cto) const { return (*this)(cto.tok); }
std::string operator()(completion_token_output cto) const { return (*this)(cto.tok); }
};

static void append_to_generated_text_from_generated_token_probs(llama_server_context &llama)
{
void append_to_generated_text_from_generated_token_probs(llama_server_context & llama) {
auto & gtps = llama.generated_token_probs;
auto translator = token_translator{llama.ctx};
auto add_strlen = [=](size_t sum, const completion_token_output & cto) { return sum + translator(cto).size(); };
@@ -1381,13 +1377,7 @@ int main(int argc, char **argv)
|
||||
}
|
||||
}
|
||||
|
||||
auto probs = llama.generated_token_probs;
|
||||
if (llama.params.n_probs > 0 && llama.stopped_word) {
|
||||
const std::vector<llama_token> stop_word_toks = llama_tokenize(llama.ctx, llama.stopping_word, false);
|
||||
probs = std::vector<completion_token_output>(llama.generated_token_probs.begin(), llama.generated_token_probs.end() - stop_word_toks.size());
|
||||
}
|
||||
|
||||
const json data = format_final_response(llama, llama.generated_text, probs);
|
||||
const json data = format_final_response(llama, llama.generated_text, llama.generated_token_probs);
|
||||
|
||||
llama_print_timings(llama.ctx);
|
||||
|
||||
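In the hunk above, the final response drops the probabilities that belong to the stop word before reporting them, by cutting the tail of generated_token_probs. A sketch of that trimming in isolation, with plain strings standing in for completion_token_output entries:

```python
# Sketch of trimming stop-word tokens from the reported probabilities,
# mirroring the hunk above (plain values instead of llama.cpp structs).

def trim_stop_word(token_probs: list, stop_word_token_count: int) -> list:
    if stop_word_token_count == 0:
        return token_probs
    return token_probs[:len(token_probs) - stop_word_token_count]

if __name__ == "__main__":
    probs = ["Hello", " world", "</s", ">"]
    print(trim_stop_word(probs, 2))  # -> ['Hello', ' world']
```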
@@ -1464,11 +1454,7 @@ int main(int argc, char **argv)
|
||||
|
||||
if (!llama.has_next_token) {
|
||||
// Generation is done, send extra information.
|
||||
const json data = format_final_response(
|
||||
llama,
|
||||
"",
|
||||
std::vector<completion_token_output>(llama.generated_token_probs.begin(), llama.generated_token_probs.begin() + sent_token_probs_index)
|
||||
);
|
||||
const json data = format_final_response(llama, "", llama.generated_token_probs);
|
||||
|
||||
const std::string str =
|
||||
"data: " +
|
||||
@@ -1562,7 +1548,7 @@ int main(int argc, char **argv)
|
||||
|
||||
svr.set_exception_handler([](const Request &, Response &res, std::exception_ptr ep)
|
||||
{
|
||||
const char fmt[] = "500 Internal Server Error\n%s";
|
||||
const auto * fmt = "500 Internal Server Error\n%s";
|
||||
char buf[BUFSIZ];
|
||||
try {
|
||||
std::rethrow_exception(std::move(ep));
|
||||
@@ -1597,7 +1583,7 @@ int main(int argc, char **argv)
|
||||
svr.set_base_dir(sparams.public_path);
|
||||
|
||||
// to make it ctrl+clickable:
|
||||
printf("\nllama server listening at http://%s:%d\n\n", sparams.hostname.c_str(), sparams.port);
|
||||
fprintf(stdout, "\nllama server listening at http://%s:%d\n\n", sparams.hostname.c_str(), sparams.port);
|
||||
|
||||
LOG_INFO("HTTP server listening", {
|
||||
{"hostname", sparams.hostname},
|
||||
|
||||
@@ -3,3 +3,6 @@ add_executable(${TARGET} simple.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
if(TARGET BUILD_INFO)
|
||||
add_dependencies(${TARGET} BUILD_INFO)
|
||||
endif()
|
||||
|
||||
@@ -1,3 +1,9 @@
|
||||
#ifndef _GNU_SOURCE
|
||||
#define _GNU_SOURCE
|
||||
#endif
|
||||
|
||||
#include "build-info.h"
|
||||
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
|
||||
|
||||
@@ -1,311 +0,0 @@
|
||||
#include "build-info.h"
|
||||
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
#include "grammar-parser.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
gpt_params params;
|
||||
|
||||
if (gpt_params_parse(argc, argv, params) == false) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (params.model_draft.empty()) {
|
||||
fprintf(stderr, "%s: error: --model-draft is required\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
#ifndef LOG_DISABLE_LOGS
|
||||
log_set_target(log_filename_generator("speculative", "log"));
|
||||
LOG_TEE("Log start\n");
|
||||
log_dump_cmdline(argc, argv);
|
||||
#endif // LOG_DISABLE_LOGS
|
||||
|
||||
// init llama.cpp
|
||||
llama_backend_init(params.numa);
|
||||
|
||||
llama_model * model_tgt = NULL;
|
||||
llama_model * model_dft = NULL;
|
||||
|
||||
llama_context * ctx_tgt = NULL;
|
||||
llama_context * ctx_dft = NULL;
|
||||
|
||||
// load the target model
|
||||
params.perplexity = true; // HACK: enable logits_all = true
|
||||
std::tie(model_tgt, ctx_tgt) = llama_init_from_gpt_params(params);
|
||||
|
||||
// load the draft model
|
||||
params.model = params.model_draft;
|
||||
params.n_gpu_layers = params.n_gpu_layers_draft;
|
||||
std::tie(model_dft, ctx_dft) = llama_init_from_gpt_params(params);
|
||||
|
||||
// tokenize the prompt
|
||||
std::vector<llama_token> inp;
|
||||
inp = ::llama_tokenize(ctx_tgt, params.prompt, true);
|
||||
|
||||
const int max_context_size = llama_n_ctx(ctx_tgt);
|
||||
const int max_tokens_list_size = max_context_size - 4;
|
||||
|
||||
if ((int) inp.size() > max_tokens_list_size) {
|
||||
fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
|
||||
return 1;
|
||||
}
|
||||
|
||||
fprintf(stderr, "\n\n");
|
||||
|
||||
for (auto id : inp) {
|
||||
fprintf(stderr, "%s", llama_token_to_piece(ctx_tgt, id).c_str());
|
||||
}
|
||||
|
||||
fflush(stderr);
|
||||
|
||||
const int n_input = inp.size();
|
||||
|
||||
const auto t_enc_start = ggml_time_us();
|
||||
|
||||
// eval the prompt with both models
|
||||
llama_eval(ctx_tgt, inp.data(), int(inp.size() - 1), 0, params.n_threads);
|
||||
llama_eval(ctx_tgt, &inp.back(), 1, inp.size() - 1, params.n_threads);
|
||||
llama_eval(ctx_dft, inp.data(), int(inp.size()), 0, params.n_threads);
|
||||
|
||||
const auto t_enc_end = ggml_time_us();
|
||||
|
||||
// the 2 models should have the same vocab
|
||||
const int n_ctx = llama_n_ctx(ctx_tgt);
|
||||
const int n_vocab = llama_n_vocab(ctx_tgt);
|
||||
//GGML_ASSERT(n_vocab == llama_n_vocab(ctx_dft));
|
||||
|
||||
// how many tokens to draft each time
|
||||
int n_draft = params.n_draft;
|
||||
|
||||
int n_predict = 0;
|
||||
int n_drafted = 0;
|
||||
int n_accept = 0;
|
||||
|
||||
int n_past_tgt = inp.size();
|
||||
int n_past_dft = inp.size();
|
||||
|
||||
std::vector<llama_token> drafted;
|
||||
|
||||
std::vector<llama_token> last_tokens(n_ctx);
|
||||
std::fill(last_tokens.begin(), last_tokens.end(), 0);
|
||||
|
||||
for (auto & id : inp) {
|
||||
last_tokens.erase(last_tokens.begin());
|
||||
last_tokens.push_back(id);
|
||||
}
|
||||
|
||||
std::vector<llama_token_data> candidates;
|
||||
candidates.reserve(n_vocab);
|
||||
|
||||
// used to determine end of generation
|
||||
bool has_eos = false;
|
||||
|
||||
// grammar stuff
|
||||
struct llama_grammar * grammar_dft = NULL;
|
||||
struct llama_grammar * grammar_tgt = NULL;
|
||||
|
||||
grammar_parser::parse_state parsed_grammar;
|
||||
|
||||
// if requested - load the grammar, error checking is omitted for brevity
|
||||
if (!params.grammar.empty()) {
|
||||
parsed_grammar = grammar_parser::parse(params.grammar.c_str());
|
||||
// will be empty (default) if there are parse errors
|
||||
if (parsed_grammar.rules.empty()) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
|
||||
grammar_tgt = llama_grammar_init(grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
|
||||
}
|
||||
|
||||
const auto t_dec_start = ggml_time_us();
|
||||
|
||||
while (true) {
|
||||
LOG("drafted: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_dft, drafted));
|
||||
|
||||
int i_dft = 0;
|
||||
|
||||
while (true) {
|
||||
// sample from the target model
|
||||
const llama_token id = llama_sample_token(ctx_tgt, NULL, grammar_tgt, params, last_tokens, candidates, i_dft);
|
||||
|
||||
// remember which tokens were sampled - used for repetition penalties during sampling
|
||||
last_tokens.erase(last_tokens.begin());
|
||||
last_tokens.push_back(id);
|
||||
|
||||
//LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, last_tokens));
|
||||
|
||||
const std::string token_str = llama_token_to_piece(ctx_tgt, id);
|
||||
printf("%s", token_str.c_str());
|
||||
fflush(stdout);
|
||||
|
||||
if (id == llama_token_eos(ctx_tgt)) {
|
||||
has_eos = true;
|
||||
}
|
||||
|
||||
++n_predict;
|
||||
|
||||
// check if the draft matches the target
|
||||
if (i_dft < (int) drafted.size() && id == drafted[i_dft]) {
|
||||
LOG("the sampled target token matches the %dth drafted token (%d, '%s') - accepted\n", i_dft, id, token_str.c_str());
|
||||
++n_accept;
|
||||
++n_past_tgt;
|
||||
++n_past_dft;
|
||||
++i_dft;
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
// the drafted token was rejected or we are out of drafted tokens
|
||||
|
||||
if (i_dft < (int) drafted.size()) {
|
||||
LOG("the %dth drafted token (%d, '%s') does not match the sampled target token (%d, '%s') - rejected\n",
|
||||
i_dft, drafted[i_dft], llama_token_to_piece(ctx_dft, drafted[i_dft]).c_str(), id, token_str.c_str());
|
||||
} else {
|
||||
LOG("out of drafted tokens\n");
|
||||
}
|
||||
|
||||
llama_eval(ctx_dft, &id, 1, n_past_dft, params.n_threads);
|
||||
++n_past_dft;
|
||||
|
||||
// heuristic for n_draft
|
||||
{
|
||||
const int n_draft_cur = (int) drafted.size();
|
||||
const bool all_accepted = i_dft == n_draft_cur;
|
||||
|
||||
LOG("n_draft = %d\n", n_draft);
|
||||
LOG("n_draft_cur = %d\n", n_draft_cur);
|
||||
LOG("i_dft = %d\n", i_dft);
|
||||
LOG("all_accepted = %d\n", all_accepted);
|
||||
|
||||
if (all_accepted && n_draft == n_draft_cur) {
|
||||
LOG(" - max drafted tokens accepted - n_draft += 8\n");
|
||||
n_draft = std::min(30, n_draft + 8);
|
||||
} else if (all_accepted) {
|
||||
LOG(" - partially drafted tokens accepted - no change\n");
|
||||
} else {
|
||||
LOG(" - drafted token rejected - n_draft -= 1\n");
|
||||
n_draft = std::max(2, n_draft - 1);
|
||||
}
|
||||
}
|
||||
|
||||
drafted.clear();
|
||||
drafted.push_back(id);
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
if (n_predict > params.n_predict || has_eos) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (grammar_tgt) {
|
||||
if (grammar_dft) {
|
||||
llama_grammar_free(grammar_dft);
|
||||
}
|
||||
grammar_dft = llama_grammar_copy(grammar_tgt);
|
||||
|
||||
LOG("copied target grammar to draft grammar\n");
|
||||
}
|
||||
|
||||
// sample n_draft tokens from the draft model using greedy decoding
|
||||
int n_past_cur = n_past_dft;
|
||||
for (int i = 0; i < n_draft; ++i) {
|
||||
float * logits = llama_get_logits(ctx_dft);
|
||||
|
||||
candidates.clear();
|
||||
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
|
||||
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
|
||||
}
|
||||
|
||||
llama_token_data_array cur_p = { candidates.data(), candidates.size(), false };
|
||||
|
||||
if (grammar_dft != NULL) {
|
||||
llama_sample_grammar(ctx_dft, &cur_p, grammar_dft);
|
||||
}
|
||||
|
||||
// computes softmax and sorts the candidates
|
||||
llama_sample_softmax(ctx_dft, &cur_p);
|
||||
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
LOG(" - draft candidate %3d: %6d (%8.3f) '%s'\n", i, cur_p.data[i].id, cur_p.data[i].p, llama_token_to_piece(ctx_dft, cur_p.data[i].id).c_str());
|
||||
}
|
||||
|
||||
// TODO: better logic?
|
||||
if (cur_p.data[0].p < 2*cur_p.data[1].p) {
|
||||
LOG("stopping drafting, probability too low: %.3f < 2*%.3f\n", cur_p.data[0].p, cur_p.data[1].p);
|
||||
break;
|
||||
}
|
||||
|
||||
// drafted token
|
||||
const llama_token id = cur_p.data[0].id;
|
||||
|
||||
drafted.push_back(id);
|
||||
++n_drafted;
|
||||
|
||||
// no need to evaluate the last drafted token, since we won't use the result
|
||||
if (i == n_draft - 1) {
|
||||
break;
|
||||
}
|
||||
|
||||
// evaluate the drafted token on the draft model
|
||||
llama_eval(ctx_dft, &drafted.back(), 1, n_past_cur, params.n_threads);
|
||||
++n_past_cur;
|
||||
|
||||
if (grammar_dft != NULL) {
|
||||
llama_grammar_accept_token(ctx_dft, grammar_dft, id);
|
||||
}
|
||||
}
|
||||
|
||||
// evaluate the target model on the drafted tokens
|
||||
llama_eval(ctx_tgt, drafted.data(), drafted.size(), n_past_tgt, params.n_threads);
|
||||
++n_past_tgt;
|
||||
|
||||
// the first token is always proposed by the target model before the speculation loop
|
||||
drafted.erase(drafted.begin());
|
||||
}
|
||||
|
||||
auto t_dec_end = ggml_time_us();
|
||||
|
||||
LOG_TEE("\n\n");
|
||||
|
||||
LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
|
||||
LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
|
||||
|
||||
// TODO: make sure these numbers are computed correctly
|
||||
LOG_TEE("\n");
|
||||
LOG_TEE("n_draft = %d\n", n_draft);
|
||||
LOG_TEE("n_predict = %d\n", n_predict);
|
||||
LOG_TEE("n_drafted = %d\n", n_drafted);
|
||||
LOG_TEE("n_accept = %d\n", n_accept);
|
||||
LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
|
||||
|
||||
LOG_TEE("\ndraft:\n");
|
||||
llama_print_timings(ctx_dft);
|
||||
|
||||
LOG_TEE("\ntarget:\n");
|
||||
llama_print_timings(ctx_tgt);
|
||||
|
||||
llama_free(ctx_tgt);
|
||||
llama_free_model(model_tgt);
|
||||
|
||||
llama_free(ctx_dft);
|
||||
llama_free_model(model_dft);
|
||||
|
||||
if (grammar_dft != NULL) {
|
||||
llama_grammar_free(grammar_dft);
|
||||
llama_grammar_free(grammar_tgt);
|
||||
}
|
||||
llama_backend_free();
|
||||
|
||||
fprintf(stderr, "\n\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
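The example above adapts the draft length at runtime: when every drafted token is accepted it drafts more next time (up to 30), when a drafted token is rejected it drafts less (down to 2), and it stops drafting early when the top draft candidate is not clearly ahead of the runner-up. A minimal sketch of just that control logic, with hypothetical helper names and plain numbers instead of llama.cpp types:

```python
# Minimal sketch of the draft-length heuristic from the speculative example above.
# The thresholds (grow by 8 up to 30, shrink by 1 down to 2, stop drafting when
# p_top < 2 * p_second) mirror the C++ code; everything else is illustrative.

def update_n_draft(n_draft: int, n_draft_cur: int, i_dft: int) -> int:
    all_accepted = (i_dft == n_draft_cur)
    if all_accepted and n_draft == n_draft_cur:
        return min(30, n_draft + 8)   # every drafted token accepted: draft more next time
    if all_accepted:
        return n_draft                # accepted a shorter draft: keep the current length
    return max(2, n_draft - 1)        # a drafted token was rejected: draft fewer

def should_stop_drafting(p_top: float, p_second: float) -> bool:
    # stop drafting when the draft model is not confident enough
    return p_top < 2 * p_second

if __name__ == "__main__":
    print(update_n_draft(n_draft=16, n_draft_cur=16, i_dft=16))  # -> 24
    print(update_n_draft(n_draft=16, n_draft_cur=10, i_dft=4))   # -> 15
    print(should_stop_drafting(0.30, 0.20))                      # -> True
```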
@@ -8,15 +8,15 @@ wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/s

# train
./bin/train-text-from-scratch \
--vocab-model ../models/ggml-vocab-llama.gguf \
--vocab-model ../models/ggml-vocab.bin \
--ctx 64 --embd 256 --head 8 --layer 16 \
--checkpoint-in chk-shakespeare-256x16.gguf \
--checkpoint-out chk-shakespeare-256x16.gguf \
--model-out ggml-shakespeare-256x16-f32.gguf \
--checkpoint-in chk-shakespeare-256x16.bin \
--checkpoint-out chk-shakespeare-256x16.bin \
--model-out ggml-shakespeare-256x16-f32.bin \
--train-data "shakespeare.txt" \
-t 6 -b 16 --seed 1 --adam-iter 256 \
--no-checkpointing
-t 6 -b 16 -n 32 --seed 1 --adam-iter 16 \
--print-details-interval 0 --predict 16 --use-flash

# predict
./bin/main -m ggml-shakespeare-256x16-f32.gguf
./bin/main -m ggml-shakespeare-256x16-f32.bin
```

@@ -1,495 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
# train-text-from-scratch checkpoint --> gguf conversion
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import struct
|
||||
import sys
|
||||
import numpy as np
|
||||
from pathlib import Path
|
||||
|
||||
if 'NO_LOCAL_GGUF' not in os.environ:
|
||||
sys.path.insert(1, str(Path(__file__).parent / '..' / '..' / 'gguf-py' / 'gguf'))
|
||||
import gguf
|
||||
|
||||
# gguf constants
|
||||
LLM_KV_OPTIMIZER_TYPE = "optimizer.type"
|
||||
LLM_KV_OPTIMIZER_TYPE_ADAM = "adam"
|
||||
LLM_KV_OPTIMIZER_TYPE_LBFGS = "lbfgs"
|
||||
LLM_KV_OPTIMIZER_FILE_VERSION = "optimizer.file_version"
|
||||
LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT = "optimizer.convergence_past_count"
|
||||
LLM_KV_OPTIMIZER_PARAMETER_COUNT = "optimizer.parameter_count"
|
||||
LLM_KV_OPTIMIZER_ITERATION_COUNT = "optimizer.iteration_count"
|
||||
LLM_KV_OPTIMIZER_JUST_INITIALIZED = "optimizer.just_initialized"
|
||||
LLM_KV_OPTIMIZER_ADAM_BEST_LOSS = "optimizer.adam.best_loss"
|
||||
LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS = "optimizer.adam.previous_loss"
|
||||
LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT = "optimizer.adam.no_improvement_count"
|
||||
LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT = "optimizer.lbfgs.approx_hessian_count"
|
||||
LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS = "optimizer.lbfgs.best_loss"
|
||||
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP = "optimizer.lbfgs.line_search_step"
|
||||
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J = "optimizer.lbfgs.line_search_j"
|
||||
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K = "optimizer.lbfgs.line_search_k"
|
||||
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END = "optimizer.lbfgs.line_search_end"
|
||||
LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT = "optimizer.lbfgs.no_improvement_count"
|
||||
|
||||
LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS = "optimizer.adam.first_moments"
|
||||
LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS = "optimizer.adam.second_moments"
|
||||
LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES = "optimizer.adam.past_loss_values"
|
||||
|
||||
LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS = "optimizer.lbfgs.current_parameters"
|
||||
LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS = "optimizer.lbfgs.previous_parameters"
|
||||
LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS = "optimizer.lbfgs.current_gradients"
|
||||
LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS = "optimizer.lbfgs.previous_gradients"
|
||||
LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION = "optimizer.lbfgs.search_direction"
|
||||
LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES = "optimizer.lbfgs.past_loss_values"
|
||||
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA = "optimizer.lbfgs.memory_alpha"
|
||||
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS = "optimizer.lbfgs.memory_ys"
|
||||
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S = "optimizer.lbfgs.memory_s"
|
||||
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y = "optimizer.lbfgs.memory_y"
|
||||
|
||||
LLM_KV_TRAINING_FILE_VERSION = "training.file_version"
|
||||
LLM_KV_TRAINING_ITERATION_COUNT = "training.iteration_count"
|
||||
LLM_KV_TRAINING_SAMPLE_COUNT = "training.sample_count"
|
||||
LLM_KV_TRAINING_TOKEN_COUNT = "training.token_count"
|
||||
|
||||
class Tensor:
|
||||
def __init__(self, dtype='f', ne=None):
|
||||
if ne is None:
|
||||
ne = []
|
||||
self.dtype = dtype
|
||||
self.ne = ne
|
||||
self.nbytes = 0
|
||||
if self.dtype == 'f':
|
||||
if len(self.ne) == 0:
|
||||
self.nbytes = 0
|
||||
else:
|
||||
self.nbytes = int(np.product(self.ne)) * 4
|
||||
else:
|
||||
raise ValueError(f"Unhandled data type '{self.dtype}'")
|
||||
|
||||
def load(self, data, offset):
|
||||
nd = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
namelen = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
dtype = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
|
||||
assert(nd == len(self.ne))
|
||||
ne = []
|
||||
for d in range(nd):
|
||||
n = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
ne.append(n)
|
||||
|
||||
assert(tuple(ne) == tuple(self.ne))
|
||||
|
||||
if self.dtype == 'f':
|
||||
assert(dtype == 0)
|
||||
else:
|
||||
raise ValueError(f"Unhandled data type '{self.dtype}'")
|
||||
|
||||
self.name = bytes(data[offset:offset+namelen]); offset += namelen
|
||||
# 32-byte alignment
|
||||
offset += (0 - offset) & 31
|
||||
self.data = data[offset:offset+self.nbytes]
|
||||
offset += self.nbytes
|
||||
return offset
|
||||
|
||||
def max_storage_size(self):
|
||||
result = 0
|
||||
result += 4 # nd
|
||||
result += 4 # namelen
|
||||
result += 4 # dtype
|
||||
result += len(self.ne)*8 # ne
|
||||
result += 48 # name (maximum as of commit 3b5515bbe0e2224425986ba24f1f5d84aa38dce9)
|
||||
result += 31 # 32-byte alignment
|
||||
result += self.nbytes
|
||||
return result
|
||||
|
||||
def save_gguf(self, gguf_writer, name):
|
||||
gguf_writer.add_tensor(
|
||||
name=name,
|
||||
tensor=self.data,
|
||||
raw_shape=np.array(list(reversed(self.ne))),
|
||||
raw_dtype=gguf.GGMLQuantizationType.F32)
|
||||
|
||||
class OptimizationParamsV0:
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def load(self, data, offset):
|
||||
self.type = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.n_threads = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.past = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.delta = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.print_forward_graph = struct.unpack('<?', bytes(data[offset:offset + 1]))[0]; offset += 4 # 32bit-aligned
|
||||
self.print_backward_graph = struct.unpack('<?', bytes(data[offset:offset + 1]))[0]; offset += 4 # 32bit-aligned
|
||||
self.adam_n_iter = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.adam_sched = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.adam_decay = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.adam_alpha = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.adam_beta1 = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.adam_beta2 = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.adam_eps = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.adam_eps_f = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.adam_eps_g = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_m = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_n_iter = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_max_linesearch = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_eps = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_ftol = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_wolfe = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_min_step = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_max_step = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_linesearch = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
return offset
|
||||
|
||||
class OptimizationContext:
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def load(self, data, offset):
|
||||
self.version = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]
|
||||
offset += 4
|
||||
|
||||
if self.version == 0:
|
||||
params = OptimizationParamsV0()
|
||||
offset = params.load(data, offset)
|
||||
self.past = params.past
|
||||
self.lbfgs_m = params.lbfgs_m
|
||||
self.nx = struct.unpack('N', bytes(data[offset:offset + 8]))[0]; offset += 8
|
||||
self.iter = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.just_initialized = bool(struct.unpack('<i', bytes(data[offset:offset + 4]))[0]); offset += 4
|
||||
self.type = params.type
|
||||
|
||||
self.adam_m = Tensor('f', [self.nx])
|
||||
self.adam_v = Tensor('f', [self.nx])
|
||||
self.adam_pf = Tensor('f', [self.past] if self.past > 0 else [])
|
||||
|
||||
self.lbfgs_x = Tensor('f', [self.nx])
|
||||
self.lbfgs_xp = Tensor('f', [self.nx])
|
||||
self.lbfgs_g = Tensor('f', [self.nx])
|
||||
self.lbfgs_gp = Tensor('f', [self.nx])
|
||||
self.lbfgs_d = Tensor('f', [self.nx])
|
||||
self.lbfgs_pf = Tensor('f', [self.past] if self.past > 0 else [])
|
||||
self.lbfgs_lmal = Tensor('f', [self.lbfgs_m])
|
||||
self.lbfgs_lmys = Tensor('f', [self.lbfgs_m])
|
||||
self.lbfgs_lms = Tensor('f', [self.nx, self.lbfgs_m])
|
||||
self.lbfgs_lmy = Tensor('f', [self.nx, self.lbfgs_m])
|
||||
|
||||
if self.type == 0:
|
||||
# these tensors are stored, but we don't need their data
|
||||
x = Tensor('f', [self.nx])
|
||||
g = Tensor('f', [self.nx])
|
||||
g2 = Tensor('f', [self.nx])
|
||||
mh = Tensor('f', [self.nx])
|
||||
vh = Tensor('f', [self.nx])
|
||||
|
||||
offset = x.load(data, offset)
|
||||
offset = g.load(data, offset)
|
||||
offset = g2.load(data, offset)
|
||||
offset = self.adam_m.load(data, offset)
|
||||
offset = self.adam_v.load(data, offset)
|
||||
offset = mh.load(data, offset)
|
||||
offset = vh.load(data, offset)
|
||||
offset = self.adam_pf.load(data, offset)
|
||||
|
||||
self.adam_fx_best = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.adam_fx_prev = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.adam_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
|
||||
elif self.type == 1:
|
||||
offset = self.lbfgs_x.load(data, offset)
|
||||
offset = self.lbfgs_xp.load(data, offset)
|
||||
offset = self.lbfgs_g.load(data, offset)
|
||||
offset = self.lbfgs_gp.load(data, offset)
|
||||
offset = self.lbfgs_d.load(data, offset)
|
||||
offset = self.lbfgs_pf.load(data, offset)
|
||||
offset = self.lbfgs_lmal.load(data, offset)
|
||||
offset = self.lbfgs_lmys.load(data, offset)
|
||||
offset = self.lbfgs_lms.load(data, offset)
|
||||
offset = self.lbfgs_lmy.load(data, offset)
|
||||
|
||||
self.lbfgs_fx_best = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_step = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_j = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_k = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_end = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
|
||||
else:
|
||||
raise ValueError('Unknown optimizer type')
|
||||
|
||||
|
||||
elif self.version == 1:
|
||||
self.past = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_m = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.nx = struct.unpack('N', bytes(data[offset:offset + 8]))[0]; offset += 8
|
||||
self.iter = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.just_initialized = bool(struct.unpack('<i', bytes(data[offset:offset + 4]))[0]); offset += 4
|
||||
|
||||
self.adam_m = Tensor('f', [self.nx])
|
||||
self.adam_v = Tensor('f', [self.nx])
|
||||
self.adam_pf = Tensor('f', [self.past] if self.past > 0 else [])
|
||||
|
||||
self.lbfgs_x = Tensor('f', [self.nx])
|
||||
self.lbfgs_xp = Tensor('f', [self.nx])
|
||||
self.lbfgs_g = Tensor('f', [self.nx])
|
||||
self.lbfgs_gp = Tensor('f', [self.nx])
|
||||
self.lbfgs_d = Tensor('f', [self.nx])
|
||||
self.lbfgs_pf = Tensor('f', [self.past] if self.past > 0 else [])
|
||||
self.lbfgs_lmal = Tensor('f', [self.lbfgs_m])
|
||||
self.lbfgs_lmys = Tensor('f', [self.lbfgs_m])
|
||||
self.lbfgs_lms = Tensor('f', [self.nx, self.lbfgs_m])
|
||||
self.lbfgs_lmy = Tensor('f', [self.nx, self.lbfgs_m])
|
||||
|
||||
# forgot to save type in version 1:
|
||||
# guess self.type from number of remaining bytes
|
||||
size_type_0 = 12 + sum([t.max_storage_size() for t in
|
||||
[self.adam_m, self.adam_v]
|
||||
+([self.adam_pf] if (self.past > 0) else [])])
|
||||
size_type_1 = 24 + sum([t.max_storage_size() for t in
|
||||
[self.lbfgs_x, self.lbfgs_xp, self.lbfgs_g,
|
||||
self.lbfgs_gp, self.lbfgs_d, self.lbfgs_pf,
|
||||
self.lbfgs_lmal, self.lbfgs_lmys,
|
||||
self.lbfgs_lms, self.lbfgs_lmy]
|
||||
+([self.lbfgs_pf] if (self.past > 0) else [])])
|
||||
# due to alignment padding the size might not be exact
|
||||
# but the difference in size for both types is significant,
|
||||
# so we can just use whichever is closest
|
||||
remaining = len(data) - offset
|
||||
if abs(remaining - size_type_0) < abs(remaining - size_type_1):
|
||||
self.type = 0
|
||||
else:
|
||||
self.type = 1
|
||||
|
||||
if self.type == 0:
|
||||
offset = self.adam_m.load(data, offset)
|
||||
offset = self.adam_v.load(data, offset)
|
||||
offset = self.adam_pf.load(data,offset)
|
||||
|
||||
self.adam_fx_best = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.adam_fx_prev = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.adam_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
|
||||
elif self.type == 1:
|
||||
offset = self.lbfgs_x.load(data, offset)
|
||||
offset = self.lbfgs_xp.load(data, offset)
|
||||
offset = self.lbfgs_g.load(data, offset)
|
||||
offset = self.lbfgs_gp.load(data, offset)
|
||||
offset = self.lbfgs_d.load(data, offset)
|
||||
offset = self.lbfgs_pf.load(data, offset)
|
||||
offset = self.lbfgs_lmal.load(data, offset)
|
||||
offset = self.lbfgs_lmys.load(data, offset)
|
||||
offset = self.lbfgs_lms.load(data, offset)
|
||||
offset = self.lbfgs_lmy.load(data, offset)
|
||||
|
||||
self.lbfgs_fx_best = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_step = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_j = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_k = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_end = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.lbfgs_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
|
||||
else:
|
||||
raise ValueError('Invalid version of checkpoint file')
|
||||
|
||||
return offset
|
||||
|
||||
def save_gguf(self, gguf_writer):
|
||||
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_FILE_VERSION, 0)
|
||||
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT, self.past)
|
||||
gguf_writer.add_uint64(LLM_KV_OPTIMIZER_PARAMETER_COUNT, self.nx)
|
||||
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_ITERATION_COUNT, self.iter)
|
||||
gguf_writer.add_bool(LLM_KV_OPTIMIZER_JUST_INITIALIZED, self.just_initialized)
|
||||
|
||||
if self.type == 0:
|
||||
gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_ADAM)
|
||||
gguf_writer.add_float32(LLM_KV_OPTIMIZER_ADAM_BEST_LOSS, self.adam_fx_best)
|
||||
gguf_writer.add_float32(LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS, self.adam_fx_prev)
|
||||
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT, self.adam_n_no_improvement)
|
||||
|
||||
self.adam_m.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS)
|
||||
self.adam_v.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS)
|
||||
if self.past > 0:
|
||||
self.adam_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES)
|
||||
|
||||
elif self.type == 1:
|
||||
gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_LBFGS)
|
||||
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT, self.lbfgs_m)
|
||||
gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS, self.lbfgs_fx_best)
|
||||
gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP, self.lbfgs_step)
|
||||
gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J, self.lbfgs_j)
|
||||
gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K, self.lbfgs_k)
|
||||
gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END, self.lbfgs_end)
|
||||
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT, self.lbfgs_n_no_improvement)
|
||||
|
||||
self.lbfgs_x.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS)
|
||||
self.lbfgs_xp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS)
|
||||
self.lbfgs_g.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS)
|
||||
self.lbfgs_gp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS)
|
||||
self.lbfgs_d.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION)
|
||||
if self.past > 0:
|
||||
self.lbfgs_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES)
|
||||
self.lbfgs_lmal.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA)
|
||||
self.lbfgs_lmys.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS)
|
||||
self.lbfgs_lms.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S)
|
||||
self.lbfgs_lmy.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y)
|
||||
else:
|
||||
raise ValueError('Unknown optimizer type')
|
||||
|
||||
class ModelParams:
|
||||
def __init__(self):
|
||||
pass
|
||||
|
||||
def load(self, data, offset):
|
||||
self.n_vocab = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.n_embd = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.n_mult = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.n_head = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.n_layer = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.n_rot = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
return offset
|
||||
|
||||
def get_n_ff(self):
|
||||
# struct my_llama_model::get_n_ff in train-text-from-scratch.cpp commit 3b5515bbe0e2224425986ba24f1f5d84aa38dce9
|
||||
return ((2*(4*self.n_embd)//3 + self.n_mult - 1)//self.n_mult)*self.n_mult
|
||||
|
||||
def save_gguf(self, gguf_writer):
|
||||
# self.n_vocab not saved
|
||||
gguf_writer.add_embedding_length(self.n_embd)
|
||||
gguf_writer.add_head_count(self.n_head)
|
||||
gguf_writer.add_block_count(self.n_layer)
|
||||
gguf_writer.add_rope_dimension_count(self.n_rot)
|
||||
gguf_writer.add_feed_forward_length(self.get_n_ff())
|
||||
|
||||
def tensor_name(key, bid=None):
|
||||
return gguf.MODEL_TENSOR_NAMES[gguf.MODEL_ARCH.LLAMA][key].format(bid=bid) + ".weight"
|
||||
|
||||
class Layer:
|
||||
def __init__(self, params, bid):
|
||||
self.bid = bid
|
||||
self.att_norm = Tensor('f', [params.n_embd])
|
||||
self.wq = Tensor('f', [params.n_embd, params.n_embd])
|
||||
self.wk = Tensor('f', [params.n_embd, params.n_embd])
|
||||
self.wv = Tensor('f', [params.n_embd, params.n_embd])
|
||||
self.wo = Tensor('f', [params.n_embd, params.n_embd])
|
||||
self.ffn_norm = Tensor('f', [params.n_embd])
|
||||
self.w1 = Tensor('f', [params.n_embd, params.get_n_ff()])
|
||||
self.w2 = Tensor('f', [params.get_n_ff(), params.n_embd])
|
||||
self.w3 = Tensor('f', [params.n_embd, params.get_n_ff()])
|
||||
|
||||
def load(self, data, offset):
|
||||
offset = self.att_norm.load(data, offset)
|
||||
offset = self.wq.load(data, offset)
|
||||
offset = self.wk.load(data, offset)
|
||||
offset = self.wv.load(data, offset)
|
||||
offset = self.wo.load(data, offset)
|
||||
offset = self.ffn_norm.load(data, offset)
|
||||
offset = self.w1.load(data, offset)
|
||||
offset = self.w2.load(data, offset)
|
||||
offset = self.w3.load(data, offset)
|
||||
return offset
|
||||
|
||||
def save_gguf(self, gguf_writer):
|
||||
self.att_norm.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_NORM, self.bid))
|
||||
self.wq.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_Q, self.bid))
|
||||
self.wk.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_K, self.bid))
|
||||
self.wv.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_V, self.bid))
|
||||
self.wo.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, self.bid))
|
||||
self.ffn_norm.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_NORM, self.bid))
|
||||
self.w1.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_GATE, self.bid))
|
||||
self.w2.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, self.bid))
|
||||
self.w3.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_UP, self.bid))
|
||||
|
||||
class Model:
|
||||
def __init__(self):
|
||||
self.params = ModelParams()
|
||||
self.layers = []
|
||||
|
||||
def load(self, data, offset):
|
||||
offset = self.params.load(data, offset)
|
||||
|
||||
self.tok_embd = Tensor('f', [self.params.n_embd, self.params.n_vocab])
|
||||
self.norm = Tensor('f', [self.params.n_embd])
|
||||
self.output = Tensor('f', [self.params.n_embd, self.params.n_vocab])
|
||||
|
||||
offset = self.tok_embd.load(data, offset)
|
||||
offset = self.norm.load(data, offset)
|
||||
offset = self.output.load(data, offset)
|
||||
|
||||
self.layers.clear()
|
||||
for bid in range(self.params.n_layer):
|
||||
layer = Layer(self.params, bid)
|
||||
offset = layer.load(data, offset)
|
||||
self.layers.append(layer)
|
||||
|
||||
return offset
|
||||
|
||||
def save_gguf(self, gguf_writer):
|
||||
self.params.save_gguf(gguf_writer)
|
||||
|
||||
self.tok_embd.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD))
|
||||
self.norm.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT_NORM))
|
||||
self.output.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT))
|
||||
|
||||
for layer in self.layers:
|
||||
layer.save_gguf(gguf_writer)
|
||||
|
||||
class Checkpoint:
|
||||
def __init__(self):
|
||||
self.model = Model()
|
||||
self.opt_ctx = OptimizationContext()
|
||||
|
||||
def load(self, data, offset):
|
||||
magic = bytes(reversed(data[offset:offset + 4])); offset += 4
|
||||
if magic != b'ggcp':
|
||||
raise ValueError(f"File header magic indicates, that this is no checkpoint file. Expected 'ggcp', Got '{str(magic)}'")
|
||||
|
||||
self.version = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
if self.version != 0:
|
||||
raise ValueError('Invalid version of checkpoint file')
|
||||
|
||||
self.train_its = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.train_samples = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
self.train_tokens = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||
|
||||
offset = self.model.load(data, offset)
|
||||
offset = self.opt_ctx.load(data, offset)
|
||||
|
||||
return offset
|
||||
|
||||
def save_gguf(self, gguf_writer):
|
||||
gguf_writer.add_file_type(gguf.GGMLQuantizationType.F32)
|
||||
gguf_writer.add_layer_norm_rms_eps(1e-5)
|
||||
gguf_writer.add_uint32(LLM_KV_TRAINING_FILE_VERSION, 0)
|
||||
gguf_writer.add_uint32(LLM_KV_TRAINING_ITERATION_COUNT, self.train_its)
|
||||
gguf_writer.add_uint32(LLM_KV_TRAINING_SAMPLE_COUNT, self.train_samples)
|
||||
gguf_writer.add_uint32(LLM_KV_TRAINING_TOKEN_COUNT, self.train_tokens)
|
||||
self.model.save_gguf(gguf_writer)
|
||||
self.opt_ctx.save_gguf(gguf_writer)
|
||||
|
||||
def handle_args():
|
||||
parser = argparse.ArgumentParser(description = 'Convert train-text-from-scratch checkpoints to GGUF')
|
||||
parser.add_argument('--input', '-i', type = Path, help = 'Input train checkpoint filename', required=True)
|
||||
parser.add_argument('--output', '-o', type = Path, help ='Output GGUF filename', required=True)
|
||||
return parser.parse_args()
|
||||
|
||||
def main():
|
||||
cfg = handle_args()
|
||||
data = np.memmap(cfg.input, mode = 'r')
|
||||
chk = Checkpoint()
|
||||
offset = 0
|
||||
offset = chk.load(data, offset)
|
||||
# we should have read all available data
|
||||
assert(offset == len(data))
|
||||
|
||||
gguf_writer = gguf.GGUFWriter(cfg.output, gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], use_temp_file = False)
|
||||
chk.save_gguf(gguf_writer)
|
||||
print(" gguf: write header")
|
||||
gguf_writer.write_header_to_file()
|
||||
print(" gguf: write metadata")
|
||||
gguf_writer.write_kv_data_to_file()
|
||||
print(" gguf: write tensors")
|
||||
gguf_writer.write_tensors_to_file()
|
||||
gguf_writer.close()
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
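Two pieces of arithmetic in the conversion script above are easy to misread: the feed-forward size computed by ModelParams.get_n_ff() and the 32-byte alignment applied in Tensor.load(). A small standalone check of both expressions (the example values are arbitrary):

```python
# Standalone check of two formulas from the checkpoint converter above.

def get_n_ff(n_embd: int, n_mult: int) -> int:
    # same expression as ModelParams.get_n_ff()
    return ((2 * (4 * n_embd) // 3 + n_mult - 1) // n_mult) * n_mult

def align32(offset: int) -> int:
    # same trick as Tensor.load(): round offset up to the next multiple of 32
    return offset + ((0 - offset) & 31)

if __name__ == "__main__":
    print(get_n_ff(n_embd=256, n_mult=256))  # -> 768
    print(align32(100))                      # -> 128
    print(align32(96))                       # -> 96 (already aligned)
```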
File diff suppressed because it is too large

flake.nix
@@ -34,7 +34,7 @@
with pkgs; [ openblas ]
);
pkgs = import nixpkgs { inherit system; };
nativeBuildInputs = with pkgs; [ cmake ninja pkg-config ];
nativeBuildInputs = with pkgs; [ cmake ninja pkgconfig ];
llama-python =
pkgs.python3.withPackages (ps: with ps; [ numpy sentencepiece ]);
postPatch = ''
@@ -45,15 +45,12 @@
postInstall = ''
mv $out/bin/main $out/bin/llama
mv $out/bin/server $out/bin/llama-server
mkdir -p $out/include
cp ${src}/llama.h $out/include/
'';
cmakeFlags = [ "-DLLAMA_BUILD_SERVER=ON" "-DLLAMA_MPI=ON" "-DBUILD_SHARED_LIBS=ON" "-DCMAKE_SKIP_BUILD_RPATH=ON" ];
in
{
packages.default = pkgs.stdenv.mkDerivation {
inherit name src meta postPatch nativeBuildInputs postInstall;
buildInputs = osSpecific;
inherit name src meta postPatch nativeBuildInputs buildInputs postInstall;
cmakeFlags = cmakeFlags
++ (if isAarch64 && isDarwin then [
"-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1"
@@ -96,10 +93,6 @@
type = "app";
program = "${self.packages.${system}.default}/bin/quantize";
};
apps.train-text-from-scratch = {
type = "app";
program = "${self.packages.${system}.default}/bin/train-text-from-scratch";
};
apps.default = self.apps.${system}.llama;
devShells.default = pkgs.mkShell {
buildInputs = [ llama-python ];

ggml-alloc.c
@@ -6,26 +6,6 @@
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#ifdef __has_include
|
||||
#if __has_include(<unistd.h>)
|
||||
#include <unistd.h>
|
||||
#if defined(_POSIX_MAPPED_FILES)
|
||||
#include <sys/types.h>
|
||||
#include <sys/mman.h>
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(_WIN32)
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#ifndef NOMINMAX
|
||||
#define NOMINMAX
|
||||
#endif
|
||||
#include <windows.h>
|
||||
#include <memoryapi.h>
|
||||
#endif
|
||||
|
||||
|
||||
#define UNUSED(x) (void)(x)
|
||||
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||
#define GGML_MAX_CONCUR (2*GGML_MAX_NODES)
|
||||
@@ -119,28 +99,15 @@ static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tens
|
||||
}
|
||||
#endif
|
||||
|
||||
static size_t ggml_allocr_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
|
||||
|
||||
static size_t ggml_allocator_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
|
||||
return ggml_nbytes(tensor);
|
||||
|
||||
UNUSED(alloc);
|
||||
}
|
||||
|
||||
// check if a tensor is allocated by this buffer
|
||||
static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_tensor * tensor) {
|
||||
void * ptr = tensor->data;
|
||||
return ptr >= alloc->data && (char *)ptr < (char *)alloc->data + alloc->max_size;
|
||||
}
|
||||
|
||||
static bool ggml_is_view(struct ggml_tensor * t) {
|
||||
return t->view_src != NULL;
|
||||
}
|
||||
|
||||
void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
|
||||
#ifdef GGML_ALLOCATOR_DEBUG
|
||||
GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
|
||||
GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated
|
||||
#endif
|
||||
size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
|
||||
size_t size = ggml_allocator_get_alloc_size(alloc, tensor);
|
||||
size = aligned_offset(NULL, size, alloc->alignment);
|
||||
|
||||
AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
|
||||
@@ -164,14 +131,14 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
|
||||
if (best_fit_block == -1) {
|
||||
// the last block is our last resort
|
||||
struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
|
||||
max_avail = MAX(max_avail, block->size);
|
||||
if (block->size >= size) {
|
||||
best_fit_block = alloc->n_free_blocks - 1;
|
||||
max_avail = MAX(max_avail, block->size);
|
||||
} else {
|
||||
fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
|
||||
__func__, size, max_avail);
|
||||
GGML_ASSERT(!"not enough space in the buffer");
|
||||
return;
|
||||
return;
|
||||
}
|
||||
}
|
||||
struct free_block * block = &alloc->free_blocks[best_fit_block];
|
||||
@@ -206,17 +173,17 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
|
||||
}
|
||||
|
||||
// this is a very naive implementation, but for our case the number of free blocks should be very small
|
||||
static void ggml_allocr_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
|
||||
static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
|
||||
void * ptr = tensor->data;
|
||||
|
||||
if (ggml_allocr_is_own(alloc, tensor) == false) {
|
||||
if (ptr < alloc->data || (char*)ptr >= (char*)alloc->data + alloc->max_size) {
|
||||
// the tensor was not allocated in this buffer
|
||||
// this can happen because the graph allocator will try to free weights and other tensors from different buffers
|
||||
// the easiest way to deal with this is just to ignore it
|
||||
return;
|
||||
}
|
||||
|
||||
size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
|
||||
size_t size = ggml_allocator_get_alloc_size(alloc, tensor);
|
||||
size = aligned_offset(NULL, size, alloc->alignment);
|
||||
AT_PRINTF("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, alloc->n_free_blocks);
|
||||
|
||||
@@ -310,68 +277,17 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment)
|
||||
return alloc;
|
||||
}
|
||||
|
||||
// OS specific functions to allocate and free uncommitted virtual memory
|
||||
static void * alloc_vmem(size_t size) {
|
||||
#if defined(_WIN32)
|
||||
return VirtualAlloc(NULL, size, MEM_RESERVE, PAGE_NOACCESS);
|
||||
#elif defined(_POSIX_MAPPED_FILES)
|
||||
void * ptr = mmap(NULL, size, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0);
|
||||
if (ptr == MAP_FAILED) {
|
||||
return NULL;
|
||||
}
|
||||
return ptr;
|
||||
#else
|
||||
// use a fixed address for other platforms
|
||||
uintptr_t base_addr = (uintptr_t)-size - 0x100;
|
||||
return (void *)base_addr;
|
||||
#endif
|
||||
}
|
||||
|
||||
static void free_vmem(void * base_addr, size_t size) {
|
||||
#if defined(_WIN32)
|
||||
VirtualFree(base_addr, 0, MEM_RELEASE);
|
||||
UNUSED(size);
|
||||
#elif defined(_POSIX_MAPPED_FILES)
|
||||
munmap(base_addr, size);
|
||||
#else
|
||||
// nothing to do
|
||||
UNUSED(base_addr);
|
||||
UNUSED(size);
|
||||
#endif
|
||||
}
|
||||
|
||||
// allocate uncommitted virtual memory to measure the size of the graph
|
||||
static void alloc_measure_vmem(void ** base_addr, size_t * size) {
|
||||
// 128GB for 64-bit, 1GB for 32-bit
|
||||
*size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<37;
|
||||
do {
|
||||
*base_addr = alloc_vmem(*size);
|
||||
if (*base_addr != NULL) {
|
||||
AT_PRINTF("allocated %.2f GB of virtual memory for measure buffer at %p\n", *size / 1024.0 / 1024.0 / 1024.0, *base_addr);
|
||||
return;
|
||||
}
|
||||
// try again with half the size
|
||||
*size /= 2;
|
||||
} while (*size > 0);
|
||||
|
||||
GGML_ASSERT(!"failed to allocate virtual memory for measure buffer");
|
||||
}
|
||||
|
||||
static void free_measure_vmem(void * base_addr, size_t size) {
|
||||
free_vmem(base_addr, size);
|
||||
}
|
||||
// address and size of the buffer when measuring
|
||||
// it needs to be large enough to fit all the tensors, but it cannot overlap with other existing buffers
|
||||
static void * const MEASURE_BASE_ADDR = (void *) 0x1000;
|
||||
static const size_t MEASURE_MAX_SIZE = 1ULL<<40; // 1 TB
|
||||
|
||||
struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
|
||||
struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
|
||||
|
||||
void * base_addr;
|
||||
size_t size;
|
||||
|
||||
alloc_measure_vmem(&base_addr, &size);
|
||||
|
||||
*alloc = (struct ggml_allocr){
|
||||
/*.data = */ base_addr,
|
||||
/*.size = */ size,
|
||||
/*.data = */ MEASURE_BASE_ADDR,
|
||||
/*.size = */ MEASURE_MAX_SIZE,
|
||||
/*.alignment = */ alignment,
|
||||
/*.n_free_blocks = */ 0,
|
||||
/*.free_blocks = */ {{0}},
|
||||
@@ -391,9 +307,6 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
|
||||
}
|
||||
|
||||
void ggml_allocr_free(struct ggml_allocr * alloc) {
|
||||
if (alloc->measure) {
|
||||
free_measure_vmem(alloc->data, alloc->size);
|
||||
}
|
||||
free(alloc);
|
||||
}
|
||||
|
||||
@@ -403,6 +316,11 @@ bool ggml_allocr_is_measure(struct ggml_allocr * alloc) {
|
||||
|
||||
//////////// compute graph allocator
|
||||
|
||||
static bool ggml_is_view(struct ggml_tensor * t) {
|
||||
return t->op == GGML_OP_RESHAPE || t->op == GGML_OP_VIEW || t->op == GGML_OP_TRANSPOSE ||
|
||||
t->op == GGML_OP_PERMUTE || t->op == GGML_OP_CPY;
|
||||
}
|
||||
|
||||
static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
|
||||
if (a->type != b->type) {
|
||||
return false;
|
||||
@@ -418,6 +336,28 @@ static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml
|
||||
return true;
|
||||
}
|
||||
|
||||
static struct ggml_tensor * get_view_parent(struct ggml_tensor * t) {
|
||||
switch (t->op) {
|
||||
case GGML_OP_PERMUTE:
|
||||
case GGML_OP_RESHAPE:
|
||||
case GGML_OP_TRANSPOSE:
|
||||
case GGML_OP_VIEW:
|
||||
return t->src[0];
|
||||
case GGML_OP_CPY:
|
||||
return t->src[1];
|
||||
default:
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
static struct ggml_tensor * get_view_source(struct ggml_tensor * t) {
|
||||
struct ggml_tensor * parent = t;
|
||||
do {
|
||||
parent = get_view_parent(parent);
|
||||
} while (ggml_is_view(parent));
|
||||
return parent;
|
||||
}
|
||||
|
||||
static bool ggml_op_can_inplace(enum ggml_op op) {
|
||||
switch (op) {
|
||||
case GGML_OP_SCALE:
|
||||
@@ -425,6 +365,7 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
|
||||
case GGML_OP_DIAG_MASK_INF:
|
||||
case GGML_OP_ADD:
|
||||
case GGML_OP_ADD1:
|
||||
case GGML_OP_ACC:
|
||||
case GGML_OP_SUB:
|
||||
case GGML_OP_MUL:
|
||||
case GGML_OP_DIV:
|
||||
@@ -434,6 +375,7 @@ static bool ggml_op_can_inplace(enum ggml_op op) {
|
||||
case GGML_OP_UNARY:
|
||||
case GGML_OP_ROPE:
|
||||
case GGML_OP_RMS_NORM:
|
||||
case GGML_OP_SET:
|
||||
case GGML_OP_SOFT_MAX:
|
||||
case GGML_OP_CONT:
|
||||
return true;
|
||||
@@ -447,8 +389,24 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
|
||||
struct hash_node * ht = alloc->hash_table;
|
||||
if (node->data == NULL) {
|
||||
if (ggml_is_view(node)) {
|
||||
assert(node->view_src->data != NULL);
|
||||
node->data = (char *)node->view_src->data + node->view_offs;
|
||||
size_t offset;
|
||||
switch(node->op) {
|
||||
case GGML_OP_VIEW:
|
||||
memcpy(&offset, node->op_params, sizeof(size_t));
|
||||
node->data = (char *) node->src[0]->data + offset;
break;
case GGML_OP_PERMUTE:
case GGML_OP_RESHAPE:
case GGML_OP_TRANSPOSE:
node->data = node->src[0]->data;
break;
case GGML_OP_CPY:
node->data = node->src[1]->data;
break;
default:
GGML_ASSERT(!"unknown view op");
break;
}
} else {
// see if we can reuse a parent's buffer (inplace)
if (ggml_op_can_inplace(node->op)) {
@@ -459,7 +417,8 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
}

// if the node's data is external, then we cannot re-use it
if (ggml_allocr_is_own(alloc, parent) == false) {
if ((char *) parent->data < (char *) alloc->data ||
(char *) parent->data >= ((char *) alloc->data + alloc->size)) {
AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data);
continue;
}
@@ -467,7 +426,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
struct hash_node * p_hn = hash_get(ht, parent);
if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && ggml_are_same_layout(node, parent)) {
if (ggml_is_view(parent)) {
struct ggml_tensor * view_src = parent->view_src;
struct ggml_tensor * view_src = get_view_source(parent);
struct hash_node * view_src_hn = hash_get(ht, view_src);
if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
// TODO: the offset of the view parent must be kept to ensure that the op doesn't overwrite
@@ -493,7 +452,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node)
}
}

static size_t ggml_allocr_alloc_graph_tensors_n(
static size_t ggml_allocator_alloc_graph_tensors_n(
struct ggml_allocr * alloc,
struct ggml_cgraph ** graphs, int n_graphs,
struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) {
@@ -509,7 +468,7 @@ static size_t ggml_allocr_alloc_graph_tensors_n(
struct ggml_tensor * node = gf->nodes[i];

if (ggml_is_view(node)) {
struct ggml_tensor * view_src = node->view_src;
struct ggml_tensor * view_src = get_view_source(node);
hash_get(ht, view_src)->n_views += 1;
}

@@ -571,10 +530,11 @@ static size_t ggml_allocr_alloc_graph_tensors_n(
AT_PRINTF("\n");
}

// update parents
// update immediately if there is no parse_seq
// update only at barriers if there is parse_seq
if ((alloc->parse_seq_len == 0) || alloc->parse_seq[ind] == -1) {
if ((alloc->parse_seq_len==0) || alloc->parse_seq[ind] == -1) {
int update_start = alloc->parse_seq_len ? last_barrier_pos : ind;
int update_end = alloc->parse_seq_len ? ind : ind + 1;
for (int i = update_start; i < update_end; i++) {
@@ -593,17 +553,17 @@ static size_t ggml_allocr_alloc_graph_tensors_n(

if (p_hn->n_children == 0 && p_hn->n_views == 0) {
if (ggml_is_view(parent)) {
struct ggml_tensor * view_src = parent->view_src;
struct ggml_tensor * view_src = get_view_source(parent);
struct hash_node * view_src_hn = hash_get(ht, view_src);
view_src_hn->n_views -= 1;
AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views);
if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
ggml_allocr_free_tensor(alloc, view_src);
ggml_allocator_free_tensor(alloc, view_src);
}
}
else {
if (parent->data != node->data) {
ggml_allocr_free_tensor(alloc, parent);
ggml_allocator_free_tensor(alloc, parent);
}
}
}
@@ -620,7 +580,7 @@ static size_t ggml_allocr_alloc_graph_tensors_n(
for (int i = 0; outputs[g][i] != NULL; i++) {
struct ggml_tensor * output = outputs[g][i];
AT_PRINTF("output: %s\n", output->name);
ggml_allocr_free_tensor(alloc, output);
ggml_allocator_free_tensor(alloc, output);
}
}
}
@@ -629,5 +589,5 @@ static size_t ggml_allocr_alloc_graph_tensors_n(
}

size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) {
return ggml_allocr_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
return ggml_allocator_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
}
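The allocator hunks above revolve around one idea: a node may take over a parent's buffer "inplace" only while that parent still has exactly one remaining child and no outstanding views, and any view is first resolved back to the tensor that actually owns the data. The following standalone C sketch illustrates just that bookkeeping; the struct and helper names are invented for the example and are not the ggml-alloc API.

```c
// Minimal sketch of the child/view bookkeeping behind inplace buffer reuse.
// Names and fields are illustrative only.
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct toy_tensor {
    const char        *name;
    void              *data;       // NULL until allocated
    size_t             nbytes;
    struct toy_tensor *view_src;   // non-NULL if this tensor is a view
    int                n_children; // consumers that still need this result
    int                n_views;    // views created on top of this tensor
};

// resolve a (possibly chained) view back to the tensor that owns the data
static struct toy_tensor * view_source(struct toy_tensor * t) {
    while (t->view_src != NULL) {
        t = t->view_src;
    }
    return t;
}

// a parent buffer can be reused only if it is allocated, this node is its
// last remaining consumer, nobody holds a view of it, and the sizes match
static bool can_reuse_parent(const struct toy_tensor * node,
                             const struct toy_tensor * parent) {
    return parent->data != NULL &&
           parent->n_children == 1 &&
           parent->n_views   == 0 &&
           parent->nbytes    == node->nbytes;
}

int main(void) {
    static char buf_a[1024];

    struct toy_tensor a = {"a", buf_a, sizeof(buf_a), NULL, 1, 0};
    struct toy_tensor b = {"b", NULL,  sizeof(buf_a), NULL, 0, 0};
    struct toy_tensor v = {"v", buf_a, 512,           &a,   0, 0};

    printf("b can reuse a: %s\n", can_reuse_parent(&b, &a) ? "yes" : "no");
    printf("view v resolves to: %s\n", view_source(&v)->name);
    return 0;
}
```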
1941 ggml-cuda.cu (file diff suppressed because it is too large)
248 ggml-metal.m
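A large share of the ggml-metal.m hunks that follow simply swap `metal_printf(...)` and `fprintf(stderr, ...)` back and forth. For context, a compile-time gated logging macro of the kind referenced by the `GGML_METAL_NDEBUG` block looks roughly like this; treat it as a sketch of the pattern, not the file's exact definition.

```c
// Sketch of an NDEBUG-gated logging macro similar in spirit to metal_printf.
#include <stdio.h>

#ifdef GGML_METAL_NDEBUG
#define metal_printf(...)                        // logging compiled out entirely
#else
#define metal_printf(...) fprintf(stderr, __VA_ARGS__)
#endif

int main(void) {
    metal_printf("%s: allocating\n", __func__);  // no-op when GGML_METAL_NDEBUG is set
    return 0;
}
```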
@@ -11,7 +11,6 @@
|
||||
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
||||
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||
|
||||
// TODO: temporary - reuse llama.cpp logging
|
||||
#ifdef GGML_METAL_NDEBUG
|
||||
#define metal_printf(...)
|
||||
#else
|
||||
@@ -63,10 +62,7 @@ struct ggml_metal_context {
|
||||
GGML_METAL_DECL_KERNEL(relu);
|
||||
GGML_METAL_DECL_KERNEL(gelu);
|
||||
GGML_METAL_DECL_KERNEL(soft_max);
|
||||
GGML_METAL_DECL_KERNEL(soft_max_4);
|
||||
GGML_METAL_DECL_KERNEL(diag_mask_inf);
|
||||
GGML_METAL_DECL_KERNEL(diag_mask_inf_8);
|
||||
GGML_METAL_DECL_KERNEL(get_rows_f32);
|
||||
GGML_METAL_DECL_KERNEL(get_rows_f16);
|
||||
GGML_METAL_DECL_KERNEL(get_rows_q4_0);
|
||||
GGML_METAL_DECL_KERNEL(get_rows_q4_1);
|
||||
@@ -78,10 +74,7 @@ struct ggml_metal_context {
|
||||
GGML_METAL_DECL_KERNEL(get_rows_q6_K);
|
||||
GGML_METAL_DECL_KERNEL(rms_norm);
|
||||
GGML_METAL_DECL_KERNEL(norm);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_f32_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_f16_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_1row);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_l4);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_q4_1_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_q8_0_f32);
|
||||
@@ -90,7 +83,6 @@ struct ggml_metal_context {
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_q4_K_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_q5_K_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_q6_K_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mm_f32_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mm_f16_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mm_q4_0_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mm_q4_1_f32);
|
||||
@@ -121,51 +113,26 @@ static NSString * const msl_library_source = @"see metal.metal";
|
||||
@end
|
||||
|
||||
struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
metal_printf("%s: allocating\n", __func__);
|
||||
fprintf(stderr, "%s: allocating\n", __func__);
|
||||
|
||||
id <MTLDevice> device;
|
||||
NSString * s;
|
||||
|
||||
#if TARGET_OS_OSX
|
||||
// Show all the Metal device instances in the system
|
||||
NSArray * devices = MTLCopyAllDevices();
|
||||
for (device in devices) {
|
||||
s = [device name];
|
||||
metal_printf("%s: found device: %s\n", __func__, [s UTF8String]);
|
||||
}
|
||||
#endif
|
||||
|
||||
// Pick and show default Metal device
|
||||
device = MTLCreateSystemDefaultDevice();
|
||||
s = [device name];
|
||||
metal_printf("%s: picking default device: %s\n", __func__, [s UTF8String]);
|
||||
|
||||
// Configure context
|
||||
struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
|
||||
ctx->device = device;
|
||||
|
||||
ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
|
||||
ctx->device = MTLCreateSystemDefaultDevice();
|
||||
ctx->queue = [ctx->device newCommandQueue];
|
||||
ctx->n_buffers = 0;
|
||||
ctx->concur_list_len = 0;
|
||||
|
||||
ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);
|
||||
ctx->d_queue = dispatch_queue_create("llama.cpp", DISPATCH_QUEUE_CONCURRENT);
|
||||
|
||||
#ifdef GGML_SWIFT
|
||||
// load the default.metallib file
|
||||
#if 0
|
||||
// compile from source string and show compile log
|
||||
{
|
||||
NSError * error = nil;
|
||||
|
||||
NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
|
||||
NSString * llamaBundlePath = [bundle pathForResource:@"llama_llama" ofType:@"bundle"];
|
||||
NSBundle * llamaBundle = [NSBundle bundleWithPath:llamaBundlePath];
|
||||
NSString * libPath = [llamaBundle pathForResource:@"default" ofType:@"metallib"];
|
||||
NSURL * libURL = [NSURL fileURLWithPath:libPath];
|
||||
|
||||
// Load the metallib file into a Metal library
|
||||
ctx->library = [ctx->device newLibraryWithURL:libURL error:&error];
|
||||
|
||||
ctx->library = [ctx->device newLibraryWithSource:msl_library_source options:nil error:&error];
|
||||
if (error) {
|
||||
metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]);
|
||||
fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
@@ -178,12 +145,12 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
|
||||
//NSString * path = [[NSBundle mainBundle] pathForResource:@"../../examples/metal/metal" ofType:@"metal"];
|
||||
NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
|
||||
NSString * path = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
|
||||
metal_printf("%s: loading '%s'\n", __func__, [path UTF8String]);
|
||||
NSString * path = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
|
||||
fprintf(stderr, "%s: loading '%s'\n", __func__, [path UTF8String]);
|
||||
|
||||
NSString * src = [NSString stringWithContentsOfFile:path encoding:NSUTF8StringEncoding error:&error];
|
||||
if (error) {
|
||||
metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]);
|
||||
fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@@ -195,7 +162,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
ctx->library = [ctx->device newLibraryWithSource:src options:nil error:&error];
|
||||
#endif
|
||||
if (error) {
|
||||
metal_printf("%s: error: %s\n", __func__, [[error description] UTF8String]);
|
||||
fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
@@ -207,11 +174,11 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
#define GGML_METAL_ADD_KERNEL(name) \
|
||||
ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \
|
||||
ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \
|
||||
metal_printf("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \
|
||||
fprintf(stderr, "%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \
|
||||
(int) ctx->pipeline_##name.maxTotalThreadsPerThreadgroup, \
|
||||
(int) ctx->pipeline_##name.threadExecutionWidth); \
|
||||
if (error) { \
|
||||
metal_printf("%s: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
|
||||
fprintf(stderr, "%s: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
|
||||
return NULL; \
|
||||
}
|
||||
|
||||
@@ -224,10 +191,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
GGML_METAL_ADD_KERNEL(relu);
|
||||
GGML_METAL_ADD_KERNEL(gelu);
|
||||
GGML_METAL_ADD_KERNEL(soft_max);
|
||||
GGML_METAL_ADD_KERNEL(soft_max_4);
|
||||
GGML_METAL_ADD_KERNEL(diag_mask_inf);
|
||||
GGML_METAL_ADD_KERNEL(diag_mask_inf_8);
|
||||
GGML_METAL_ADD_KERNEL(get_rows_f32);
|
||||
GGML_METAL_ADD_KERNEL(get_rows_f16);
|
||||
GGML_METAL_ADD_KERNEL(get_rows_q4_0);
|
||||
GGML_METAL_ADD_KERNEL(get_rows_q4_1);
|
||||
@@ -239,10 +203,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
GGML_METAL_ADD_KERNEL(get_rows_q6_K);
|
||||
GGML_METAL_ADD_KERNEL(rms_norm);
|
||||
GGML_METAL_ADD_KERNEL(norm);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_f32_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_f16_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_1row);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_l4);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_q4_1_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_q8_0_f32);
|
||||
@@ -251,7 +212,6 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_q4_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_q5_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_q6_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_f32_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_f16_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_q4_0_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_q8_0_f32);
|
||||
@@ -270,21 +230,19 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
#undef GGML_METAL_ADD_KERNEL
|
||||
}
|
||||
|
||||
metal_printf("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
|
||||
#if TARGET_OS_OSX
|
||||
metal_printf("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
|
||||
fprintf(stderr, "%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
|
||||
fprintf(stderr, "%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
|
||||
if (ctx->device.maxTransferRate != 0) {
|
||||
metal_printf("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
|
||||
fprintf(stderr, "%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
|
||||
} else {
|
||||
metal_printf("%s: maxTransferRate = built-in GPU\n", __func__);
|
||||
fprintf(stderr, "%s: maxTransferRate = built-in GPU\n", __func__);
|
||||
}
|
||||
#endif
|
||||
|
||||
return ctx;
|
||||
}
|
||||
|
||||
void ggml_metal_free(struct ggml_metal_context * ctx) {
|
||||
metal_printf("%s: deallocating\n", __func__);
|
||||
fprintf(stderr, "%s: deallocating\n", __func__);
|
||||
#define GGML_METAL_DEL_KERNEL(name) \
|
||||
[ctx->function_##name release]; \
|
||||
[ctx->pipeline_##name release];
|
||||
@@ -298,10 +256,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
|
||||
GGML_METAL_DEL_KERNEL(relu);
|
||||
GGML_METAL_DEL_KERNEL(gelu);
|
||||
GGML_METAL_DEL_KERNEL(soft_max);
|
||||
GGML_METAL_DEL_KERNEL(soft_max_4);
|
||||
GGML_METAL_DEL_KERNEL(diag_mask_inf);
|
||||
GGML_METAL_DEL_KERNEL(diag_mask_inf_8);
|
||||
GGML_METAL_DEL_KERNEL(get_rows_f32);
|
||||
GGML_METAL_DEL_KERNEL(get_rows_f16);
|
||||
GGML_METAL_DEL_KERNEL(get_rows_q4_0);
|
||||
GGML_METAL_DEL_KERNEL(get_rows_q4_1);
|
||||
@@ -313,10 +268,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
|
||||
GGML_METAL_DEL_KERNEL(get_rows_q6_K);
|
||||
GGML_METAL_DEL_KERNEL(rms_norm);
|
||||
GGML_METAL_DEL_KERNEL(norm);
|
||||
GGML_METAL_DEL_KERNEL(mul_mat_f32_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mat_f16_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_1row);
|
||||
GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_l4);
|
||||
GGML_METAL_DEL_KERNEL(mul_mat_q4_0_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mat_q4_1_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mat_q8_0_f32);
|
||||
@@ -325,7 +277,6 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
|
||||
GGML_METAL_DEL_KERNEL(mul_mat_q4_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mat_q5_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mat_q6_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_f32_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_f16_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_q4_0_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_q8_0_f32);
|
||||
@@ -358,9 +309,9 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {

void * ggml_metal_host_malloc(size_t n) {
void * data = NULL;
const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n);
const int result = posix_memalign((void **) &data, getpagesize(), n);
if (result != 0) {
metal_printf("%s: error: posix_memalign failed\n", __func__);
fprintf(stderr, "%s: error: posix_memalign failed\n", __func__);
return NULL;
}
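The small functional change in this hunk is how the page size is queried inside `ggml_metal_host_malloc`: `sysconf(_SC_PAGESIZE)` on one side, the older `getpagesize()` on the other. Below is a minimal, self-contained example of page-aligned allocation with `posix_memalign`, not the ggml function itself.

```c
// Page-aligned host allocation as a standalone example.
// posix_memalign requires the alignment to be a power of two and a multiple
// of sizeof(void *); the system page size satisfies both.
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void) {
    const size_t page = (size_t) sysconf(_SC_PAGESIZE); // typically 4096 or 16384
    void * data = NULL;

    const int result = posix_memalign(&data, page, 1 << 20); // 1 MiB, page-aligned
    if (result != 0) {
        fprintf(stderr, "posix_memalign failed: %d\n", result);
        return 1;
    }

    printf("allocated %u bytes aligned to %zu\n", 1u << 20, page);
    free(data);
    return 0;
}
```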
@@ -388,7 +339,7 @@ int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx) {
// Metal buffer based on the host memory pointer
//
static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, struct ggml_tensor * t, size_t * offs) {
//metal_printf("%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach);
//fprintf(stderr, "%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach);

const int64_t tsize = ggml_nbytes(t);

@@ -396,17 +347,16 @@ static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, stru
for (int i = 0; i < ctx->n_buffers; ++i) {
const int64_t ioffs = (int64_t) t->data - (int64_t) ctx->buffers[i].data;

//metal_printf("ioffs = %10ld, tsize = %10ld, sum = %10ld, ctx->buffers[%d].size = %10ld, name = %s\n", ioffs, tsize, ioffs + tsize, i, ctx->buffers[i].size, ctx->buffers[i].name);
if (ioffs >= 0 && ioffs + tsize <= (int64_t) ctx->buffers[i].size) {
*offs = (size_t) ioffs;

//metal_printf("%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs);
//fprintf(stderr, "%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs);

return ctx->buffers[i].metal;
}
}

metal_printf("%s: error: buffer is nil\n", __func__);
fprintf(stderr, "%s: error: buffer is nil\n", __func__);

return nil;
}
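`ggml_metal_get_buffer` in the hunk above locates the Metal buffer backing a tensor by plain pointer arithmetic: it walks the registered host regions and checks that the tensor's bytes fall entirely inside one of them. A hedged sketch of that lookup, with invented types standing in for the Metal objects:

```c
// Sketch of resolving a host pointer to (buffer index, offset) among a set of
// registered host regions. toy_buffer stands in for the real Metal-backed entry.
#include <stdint.h>
#include <stdio.h>

struct toy_buffer {
    const char *name;
    void       *data; // start of the registered host region
    int64_t     size; // size of the region in bytes
};

// returns the buffer index and writes the byte offset, or -1 if not found
static int find_buffer(const struct toy_buffer * bufs, int n_bufs,
                       const void * ptr, int64_t nbytes, int64_t * offs) {
    for (int i = 0; i < n_bufs; ++i) {
        const int64_t ioffs = (int64_t) (intptr_t) ptr - (int64_t) (intptr_t) bufs[i].data;
        if (ioffs >= 0 && ioffs + nbytes <= bufs[i].size) {
            *offs = ioffs;
            return i;
        }
    }
    return -1;
}

int main(void) {
    static char region[4096];
    const struct toy_buffer bufs[] = { {"weights", region, sizeof(region)} };

    int64_t offs = 0;
    const int idx = find_buffer(bufs, 1, region + 128, 256, &offs);
    printf("buffer %d, offset %lld\n", idx, (long long) offs);
    return 0;
}
```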
@@ -418,7 +368,7 @@ bool ggml_metal_add_buffer(
|
||||
size_t size,
|
||||
size_t max_size) {
|
||||
if (ctx->n_buffers >= GGML_METAL_MAX_BUFFERS) {
|
||||
metal_printf("%s: too many buffers\n", __func__);
|
||||
fprintf(stderr, "%s: too many buffers\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -428,12 +378,12 @@ bool ggml_metal_add_buffer(
|
||||
const int64_t ioffs = (int64_t) data - (int64_t) ctx->buffers[i].data;
|
||||
|
||||
if (ioffs >= 0 && ioffs < (int64_t) ctx->buffers[i].size) {
|
||||
metal_printf("%s: error: buffer '%s' overlaps with '%s'\n", __func__, name, ctx->buffers[i].name);
|
||||
fprintf(stderr, "%s: error: buffer '%s' overlaps with '%s'\n", __func__, name, ctx->buffers[i].name);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
const size_t size_page = sysconf(_SC_PAGESIZE);
|
||||
const size_t size_page = getpagesize();
|
||||
|
||||
size_t size_aligned = size;
|
||||
if ((size_aligned % size_page) != 0) {
|
||||
@@ -449,11 +399,11 @@ bool ggml_metal_add_buffer(
|
||||
ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
|
||||
|
||||
if (ctx->buffers[ctx->n_buffers].metal == nil) {
|
||||
metal_printf("%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0);
|
||||
fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0);
|
||||
return false;
|
||||
}
|
||||
|
||||
metal_printf("%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1024.0 / 1024.0);
|
||||
fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1024.0 / 1024.0);
|
||||
|
||||
++ctx->n_buffers;
|
||||
} else {
|
||||
@@ -473,32 +423,28 @@ bool ggml_metal_add_buffer(
|
||||
ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
|
||||
|
||||
if (ctx->buffers[ctx->n_buffers].metal == nil) {
|
||||
metal_printf("%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0);
|
||||
fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0);
|
||||
return false;
|
||||
}
|
||||
|
||||
metal_printf("%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i);
|
||||
fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i);
|
||||
if (i + size_step < size) {
|
||||
metal_printf("\n");
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
||||
++ctx->n_buffers;
|
||||
}
|
||||
}
|
||||
|
||||
#if TARGET_OS_OSX
|
||||
metal_printf(", (%8.2f / %8.2f)",
|
||||
fprintf(stderr, ", (%8.2f / %8.2f)",
|
||||
ctx->device.currentAllocatedSize / 1024.0 / 1024.0,
|
||||
ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
|
||||
|
||||
if (ctx->device.currentAllocatedSize > ctx->device.recommendedMaxWorkingSetSize) {
|
||||
metal_printf(", warning: current allocated size is greater than the recommended max working set size\n");
|
||||
fprintf(stderr, ", warning: current allocated size is greater than the recommended max working set size\n");
|
||||
} else {
|
||||
metal_printf("\n");
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
#else
|
||||
metal_printf(", (%8.2f)\n", ctx->device.currentAllocatedSize / 1024.0 / 1024.0);
|
||||
#endif
|
||||
}
|
||||
|
||||
return true;
|
||||
@@ -507,6 +453,8 @@ bool ggml_metal_add_buffer(
|
||||
void ggml_metal_set_tensor(
|
||||
struct ggml_metal_context * ctx,
|
||||
struct ggml_tensor * t) {
|
||||
metal_printf("%s: set input for tensor '%s'\n", __func__, t->name);
|
||||
|
||||
size_t offs;
|
||||
id<MTLBuffer> id_dst = ggml_metal_get_buffer(ctx, t, &offs);
|
||||
|
||||
@@ -516,6 +464,8 @@ void ggml_metal_set_tensor(
|
||||
void ggml_metal_get_tensor(
|
||||
struct ggml_metal_context * ctx,
|
||||
struct ggml_tensor * t) {
|
||||
metal_printf("%s: extract results for tensor '%s'\n", __func__, t->name);
|
||||
|
||||
size_t offs;
|
||||
id<MTLBuffer> id_src = ggml_metal_get_buffer(ctx, t, &offs);
|
||||
|
||||
@@ -610,13 +560,15 @@ void ggml_metal_graph_find_concurrency(
|
||||
}
|
||||
|
||||
if (ctx->concur_list_len > GGML_MAX_CONCUR) {
|
||||
metal_printf("%s: too many elements for metal ctx->concur_list!\n", __func__);
|
||||
fprintf(stderr, "%s: too many elements for metal ctx->concur_list!\n", __func__);
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_metal_graph_compute(
|
||||
struct ggml_metal_context * ctx,
|
||||
struct ggml_cgraph * gf) {
|
||||
metal_printf("%s: evaluating graph\n", __func__);
|
||||
|
||||
@autoreleasepool {
|
||||
|
||||
// if there is ctx->concur_list, dispatch concurrently
|
||||
@@ -664,7 +616,7 @@ void ggml_metal_graph_compute(
|
||||
continue;
|
||||
}
|
||||
|
||||
//metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op));
|
||||
metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op));
|
||||
|
||||
struct ggml_tensor * src0 = gf->nodes[i]->src[0];
|
||||
struct ggml_tensor * src1 = gf->nodes[i]->src[1];
|
||||
@@ -733,16 +685,8 @@ void ggml_metal_graph_compute(
|
||||
} break;
|
||||
case GGML_OP_ADD:
|
||||
{
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
GGML_ASSERT(ggml_is_contiguous(src1));
|
||||
|
||||
// utilize float4
|
||||
GGML_ASSERT(ne00 % 4 == 0);
|
||||
const int64_t nb = ne00/4;
|
||||
|
||||
if (ggml_nelements(src1) == ne10) {
|
||||
// src1 is a row
|
||||
GGML_ASSERT(ne11 == 1);
|
||||
[encoder setComputePipelineState:ctx->pipeline_add_row];
|
||||
} else {
|
||||
[encoder setComputePipelineState:ctx->pipeline_add];
|
||||
@@ -750,24 +694,16 @@ void ggml_metal_graph_compute(
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:2];
|
||||
[encoder setBytes:&nb length:sizeof(nb) atIndex:3];
|
||||
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
|
||||
|
||||
const int64_t n = ggml_nelements(dst)/4;
|
||||
const int64_t n = ggml_nelements(dst);
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
} break;
|
||||
case GGML_OP_MUL:
|
||||
{
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
GGML_ASSERT(ggml_is_contiguous(src1));
|
||||
|
||||
// utilize float4
|
||||
GGML_ASSERT(ne00 % 4 == 0);
|
||||
const int64_t nb = ne00/4;
|
||||
|
||||
if (ggml_nelements(src1) == ne10) {
|
||||
// src1 is a row
|
||||
GGML_ASSERT(ne11 == 1);
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_row];
|
||||
} else {
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul];
|
||||
@@ -775,16 +711,14 @@ void ggml_metal_graph_compute(
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:2];
|
||||
[encoder setBytes:&nb length:sizeof(nb) atIndex:3];
|
||||
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
|
||||
|
||||
const int64_t n = ggml_nelements(dst)/4;
|
||||
const int64_t n = ggml_nelements(dst);
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
} break;
|
||||
case GGML_OP_SCALE:
|
||||
{
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
|
||||
const float scale = *(const float *) src1->data;
|
||||
|
||||
[encoder setComputePipelineState:ctx->pipeline_scale];
|
||||
@@ -792,7 +726,7 @@ void ggml_metal_graph_compute(
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
[encoder setBytes:&scale length:sizeof(scale) atIndex:2];
|
||||
|
||||
const int64_t n = ggml_nelements(dst)/4;
|
||||
const int64_t n = ggml_nelements(dst);
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
} break;
|
||||
@@ -804,7 +738,7 @@ void ggml_metal_graph_compute(
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
|
||||
const int64_t n = ggml_nelements(dst)/4;
|
||||
const int64_t n = ggml_nelements(dst);
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
} break;
|
||||
@@ -824,13 +758,13 @@ void ggml_metal_graph_compute(
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
|
||||
const int64_t n = ggml_nelements(dst)/4;
|
||||
const int64_t n = ggml_nelements(dst);
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
metal_printf("%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
|
||||
fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
} break;
|
||||
@@ -838,16 +772,13 @@ void ggml_metal_graph_compute(
|
||||
{
|
||||
const int nth = 32;
|
||||
|
||||
if (ne00%4 == 0) {
|
||||
[encoder setComputePipelineState:ctx->pipeline_soft_max_4];
|
||||
} else {
|
||||
[encoder setComputePipelineState:ctx->pipeline_soft_max];
|
||||
}
|
||||
[encoder setComputePipelineState:ctx->pipeline_soft_max];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
|
||||
[encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
|
||||
[encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
|
||||
[encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0];
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
||||
} break;
|
||||
@@ -855,23 +786,14 @@ void ggml_metal_graph_compute(
|
||||
{
|
||||
const int n_past = ((int32_t *)(dst->op_params))[0];
|
||||
|
||||
if (ne00%8 == 0) {
|
||||
[encoder setComputePipelineState:ctx->pipeline_diag_mask_inf_8];
|
||||
} else {
|
||||
[encoder setComputePipelineState:ctx->pipeline_diag_mask_inf];
|
||||
}
|
||||
[encoder setComputePipelineState:ctx->pipeline_diag_mask_inf];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
|
||||
[encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
|
||||
[encoder setBytes:&n_past length:sizeof(int) atIndex:4];
|
||||
|
||||
if (ne00%8 == 0) {
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(ne00*ne01*ne02/8, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
}
|
||||
else {
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(ne00, ne01, ne02) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
}
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(ne00, ne01, ne02) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
} break;
|
||||
case GGML_OP_MUL_MAT:
|
||||
{
|
||||
@@ -884,14 +806,13 @@ void ggml_metal_graph_compute(
|
||||
|
||||
// for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
|
||||
// AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
|
||||
if (!ggml_is_transposed(src0) &&
|
||||
!ggml_is_transposed(src1) &&
|
||||
if (ggml_is_contiguous(src0) &&
|
||||
ggml_is_contiguous(src1) &&
|
||||
src1t == GGML_TYPE_F32 &&
|
||||
[ctx->device supportsFamily:MTLGPUFamilyApple7] &&
|
||||
ne00%32 == 0 &&
|
||||
ne11 > 1) {
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_mul_mm_f32_f32]; break;
|
||||
case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_mul_mm_f16_f32]; break;
|
||||
case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_0_f32]; break;
|
||||
case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_1_f32]; break;
|
||||
@@ -911,39 +832,22 @@ void ggml_metal_graph_compute(
|
||||
[encoder setBytes:&nb01 length:sizeof(nb01) atIndex:5];
|
||||
[encoder setBytes:&nb02 length:sizeof(nb02) atIndex:6];
|
||||
[encoder setBytes:&ne12 length:sizeof(ne12) atIndex:7];
|
||||
[encoder setBytes:&nb10 length:sizeof(nb10) atIndex:8];
|
||||
[encoder setBytes:&nb11 length:sizeof(nb11) atIndex:9];
|
||||
[encoder setBytes:&nb12 length:sizeof(nb12) atIndex:10];
|
||||
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:11];
|
||||
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:12];
|
||||
[encoder setBytes:&gqa length:sizeof(gqa) atIndex:13];
|
||||
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:8];
|
||||
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:9];
|
||||
[encoder setBytes:&gqa length:sizeof(gqa) atIndex:10];
|
||||
[encoder setThreadgroupMemoryLength:8192 atIndex:0];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake( (ne11+31)/32, (ne01+63) / 64, ne12) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
|
||||
} else {
|
||||
int nth0 = 32;
|
||||
int nth1 = 1;
|
||||
int nrows = 1;
|
||||
|
||||
// use custom matrix x vector kernel
|
||||
switch (src0t) {
|
||||
case GGML_TYPE_F32:
|
||||
{
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mat_f32_f32];
|
||||
nrows = 4;
|
||||
} break;
|
||||
case GGML_TYPE_F16:
|
||||
{
|
||||
nth0 = 32;
|
||||
nth0 = 64;
|
||||
nth1 = 1;
|
||||
if (ne11 * ne12 < 4) {
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_1row];
|
||||
} else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) {
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_l4];
|
||||
nrows = ne11;
|
||||
} else {
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
|
||||
nrows = 4;
|
||||
}
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
|
||||
} break;
|
||||
case GGML_TYPE_Q4_0:
|
||||
{
|
||||
@@ -995,8 +899,8 @@ void ggml_metal_graph_compute(
|
||||
GGML_ASSERT(ne02 == 1);
|
||||
GGML_ASSERT(ne12 == 1);
|
||||
|
||||
nth0 = 4; //1;
|
||||
nth1 = 8; //32;
|
||||
nth0 = 2;
|
||||
nth1 = 32;
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_K_f32];
|
||||
} break;
|
||||
case GGML_TYPE_Q5_K:
|
||||
@@ -1019,7 +923,7 @@ void ggml_metal_graph_compute(
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
metal_printf("Asserting on type %d\n",(int)src0t);
|
||||
fprintf(stderr, "Asserting on type %d\n",(int)src0t);
|
||||
GGML_ASSERT(false && "not implemented");
|
||||
}
|
||||
};
|
||||
@@ -1044,12 +948,9 @@ void ggml_metal_graph_compute(
|
||||
[encoder setBytes:&gqa length:sizeof(gqa) atIndex:17];
|
||||
|
||||
if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || src0t == GGML_TYPE_Q8_0 ||
|
||||
src0t == GGML_TYPE_Q2_K) {// || src0t == GGML_TYPE_Q4_K) {
|
||||
src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q4_K) {
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
else if (src0t == GGML_TYPE_Q4_K) {
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
else if (src0t == GGML_TYPE_Q3_K) {
|
||||
#ifdef GGML_QKK_64
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
@@ -1063,15 +964,14 @@ void ggml_metal_graph_compute(
|
||||
else if (src0t == GGML_TYPE_Q6_K) {
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
} else {
|
||||
int64_t ny = (ne11 + nrows - 1)/nrows;
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
[encoder setThreadgroupMemoryLength:nth0*sizeof(float) atIndex:0];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
}
|
||||
} break;
|
||||
case GGML_OP_GET_ROWS:
|
||||
{
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_get_rows_f32]; break;
|
||||
case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_get_rows_f16]; break;
|
||||
case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_0]; break;
|
||||
case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_1]; break;
|
||||
@@ -1087,9 +987,9 @@ void ggml_metal_graph_compute(
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:2];
|
||||
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:3];
|
||||
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:4];
|
||||
[encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:5];
|
||||
[encoder setBytes:&(src0->ne[0]) length:sizeof( int64_t) atIndex:3];
|
||||
[encoder setBytes:&(src0->nb[1]) length:sizeof(uint64_t) atIndex:4];
|
||||
[encoder setBytes:&(dst->nb[1]) length:sizeof(uint64_t) atIndex:5];
|
||||
|
||||
const int64_t n = ggml_nelements(src1);
|
||||
|
||||
@@ -1210,7 +1110,7 @@ void ggml_metal_graph_compute(
|
||||
[encoder setBytes:&freq_base length:sizeof(float) atIndex:21];
|
||||
[encoder setBytes:&freq_scale length:sizeof(float) atIndex:22];
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(32, 1, 1)];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
} break;
|
||||
case GGML_OP_DUP:
|
||||
case GGML_OP_CPY:
|
||||
@@ -1261,7 +1161,7 @@ void ggml_metal_graph_compute(
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
metal_printf("%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
|
||||
fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
}
|
||||
@@ -1286,7 +1186,7 @@ void ggml_metal_graph_compute(
|
||||
|
||||
MTLCommandBufferStatus status = (MTLCommandBufferStatus) [ctx->command_buffers[i] status];
|
||||
if (status != MTLCommandBufferStatusCompleted) {
|
||||
metal_printf("%s: command buffer %d failed with status %lu\n", __func__, i, status);
|
||||
fprintf(stderr, "%s: command buffer %d failed with status %lu\n", __func__, i, status);
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
}
|
||||
|
||||
804 ggml-metal.metal (file diff suppressed because it is too large)
@@ -1334,7 +1334,7 @@ void ggml_cl_free_data(const struct ggml_tensor* tensor) {
return;
}

cl_mem mem = (cl_mem)tensor->extra;
cl_mem mem = (cl_mem)tensor->data;
clReleaseMemObject(mem);
}

@@ -1393,7 +1393,7 @@ static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1,
size_t d_size;

cl_mem d_X = ggml_cl_pool_malloc(ne0 * sizeof(float), &x_size); // src0
cl_mem d_Y = (cl_mem) src1->extra; // src1 is already on device, broadcasted.
cl_mem d_Y = (cl_mem) src1->data; // src1 is already on device, broadcasted.
cl_mem d_D = ggml_cl_pool_malloc(ne0 * sizeof(float), &d_size); // dst

@@ -1491,9 +1491,9 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
size_t d_size;
cl_mem d_X;
if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
d_X = (cl_mem) src0->extra;
d_X = (cl_mem) src0->data;
} else {
d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size);
d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
}
cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
@@ -1567,7 +1567,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
size_t d_size;
cl_mem d_X;
if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
d_X = (cl_mem) src0->extra;
d_X = (cl_mem) src0->data;
} else {
d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
}
@@ -1697,7 +1697,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
events.emplace_back();
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
} else if (src0->backend == GGML_BACKEND_GPU) {
d_Q = (cl_mem) src0->extra;
d_Q = (cl_mem) src0->data;
} else {
GGML_ASSERT(false);
}
@@ -1860,6 +1860,6 @@ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) {

CL_CHECK(clFinish(queue));

tensor->extra = dst;
tensor->data = dst;
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
}
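The OpenCL hunks above consistently trade `tensor->extra` for `tensor->data` when storing and reading the `cl_mem` handle of a GPU-resident tensor. The sketch below shows the shape of the two designs with placeholder types; it is an illustration of the trade-off, not ggml code.

```c
// Two ways of attaching a device handle to a tensor, shown with toy types.
// Keeping the handle in a dedicated 'extra' slot leaves 'data' free to mean
// "host pointer"; overloading 'data' saves a field but changes its meaning
// for GPU-backed tensors.
#include <stdio.h>

typedef void * device_mem; // stand-in for cl_mem

enum toy_backend { BACKEND_CPU, BACKEND_GPU };

struct toy_tensor {
    enum toy_backend backend;
    void * data;   // host pointer, or the device handle when overloaded
    void * extra;  // backend-specific handle in the non-overloaded design
};

static device_mem get_device_handle(const struct toy_tensor * t, int use_extra) {
    return use_extra ? (device_mem) t->extra : (device_mem) t->data;
}

int main(void) {
    static int dev_a, dev_b; // dummy "device" objects for the example

    struct toy_tensor t = { BACKEND_GPU, &dev_a, &dev_b };
    printf("handle via data:  %p\n", get_device_handle(&t, 0));
    printf("handle via extra: %p\n", get_device_handle(&t, 1));
    return 0;
}
```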
106 ggml.h
@@ -195,14 +195,6 @@
# define GGML_DEPRECATED(func, hint) func
#endif

#ifndef __GNUC__
# define GGML_ATTRIBUTE_FORMAT(...)
#elif defined(__MINGW32__)
# define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
#else
# define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
#endif

#include <stdint.h>
#include <stddef.h>
#include <stdbool.h>
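The `GGML_ATTRIBUTE_FORMAT` macro in this hunk wraps GCC/Clang's `format` attribute so that printf-style declarations such as `ggml_format_name` get their arguments checked at compile time (with `gnu_printf` on MinGW). A small illustration of the attribute itself, independent of ggml:

```c
// The format(printf, ...) attribute: argument 1 is the format string and
// checking starts at argument 2. Compilers that support it warn on
// mismatched format specifiers at the call site.
#include <stdarg.h>
#include <stdio.h>

#if defined(__GNUC__)
#define ATTR_FORMAT(fmt, args) __attribute__((format(printf, fmt, args)))
#else
#define ATTR_FORMAT(fmt, args)
#endif

ATTR_FORMAT(1, 2)
static void log_msg(const char * fmt, ...) {
    va_list ap;
    va_start(ap, fmt);
    vfprintf(stderr, fmt, ap);
    va_end(ap);
}

int main(void) {
    log_msg("tensor %s has %d elements\n", "blk.0.attn_q", 4096); // OK
    // log_msg("%d\n", "oops"); // would trigger -Wformat with the attribute
    return 0;
}
```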
@@ -487,9 +479,6 @@ extern "C" {
|
||||
int64_t perf_cycles;
|
||||
int64_t perf_time_us;
|
||||
|
||||
struct ggml_tensor * view_src;
|
||||
size_t view_offs;
|
||||
|
||||
void * data;
|
||||
|
||||
char name[GGML_MAX_NAME];
|
||||
@@ -672,7 +661,7 @@ extern "C" {
|
||||
GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
|
||||
GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src);
|
||||
GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
|
||||
|
||||
@@ -693,7 +682,6 @@ extern "C" {
|
||||
|
||||
GGML_API const char * ggml_get_name (const struct ggml_tensor * tensor);
|
||||
GGML_API struct ggml_tensor * ggml_set_name ( struct ggml_tensor * tensor, const char * name);
|
||||
GGML_ATTRIBUTE_FORMAT(2, 3)
|
||||
GGML_API struct ggml_tensor * ggml_format_name( struct ggml_tensor * tensor, const char * fmt, ...);
|
||||
|
||||
//
|
||||
@@ -964,11 +952,11 @@ extern "C" {
|
||||
|
||||
// a - x
|
||||
// b - dy
|
||||
// TODO: update with configurable eps
|
||||
GGML_API struct ggml_tensor * ggml_rms_norm_back(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b,
|
||||
float eps);
|
||||
struct ggml_tensor * b);
|
||||
|
||||
// A: n columns, m rows
|
||||
// B: n columns, p rows (i.e. we transpose it internally)
|
||||
@@ -1624,8 +1612,7 @@ extern "C" {
|
||||
struct ggml_tensor * tensor);
|
||||
|
||||
|
||||
GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
|
||||
GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep);
|
||||
GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
|
||||
|
||||
GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
|
||||
GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
|
||||
@@ -1690,8 +1677,6 @@ extern "C" {
|
||||
GGML_LINESEARCH_INVALID_PARAMETERS,
|
||||
};
|
||||
|
||||
typedef void (*ggml_opt_callback)(void * data, float * sched);
|
||||
|
||||
// optimization parameters
|
||||
//
|
||||
// see ggml.c (ggml_opt_default_params) for default values
|
||||
@@ -1727,14 +1712,12 @@ extern "C" {
|
||||
|
||||
float sched; // schedule multiplier (fixed, decay or warmup)
|
||||
float decay; // weight decay for AdamW, use 0.0f to disable
|
||||
int decay_min_ndim; // minimum number of tensor dimension to apply weight decay
|
||||
float alpha; // learning rate
|
||||
float beta1;
|
||||
float beta2;
|
||||
float eps; // epsilon for numerical stability
|
||||
float eps_f; // epsilon for convergence test
|
||||
float eps_g; // epsilon for convergence test
|
||||
float gclip; // gradient clipping
|
||||
} adam;
|
||||
|
||||
// LBFGS parameters
|
||||
@@ -1762,12 +1745,14 @@ extern "C" {
|
||||
|
||||
bool just_initialized;
|
||||
|
||||
float loss_before;
|
||||
float loss_after;
|
||||
|
||||
struct {
|
||||
struct ggml_tensor * x; // view of the parameters
|
||||
struct ggml_tensor * g1; // gradient
|
||||
struct ggml_tensor * g2; // gradient squared
|
||||
struct ggml_tensor * m; // first moment
|
||||
struct ggml_tensor * v; // second moment
|
||||
struct ggml_tensor * mh; // first moment hat
|
||||
struct ggml_tensor * vh; // second moment hat
|
||||
struct ggml_tensor * pf; // past function values
|
||||
float fx_best;
|
||||
float fx_prev;
|
||||
@@ -1804,10 +1789,10 @@ extern "C" {
|
||||
|
||||
// initialize optimizer context
|
||||
GGML_API void ggml_opt_init(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_opt_context * opt,
|
||||
struct ggml_opt_params params,
|
||||
int64_t nx);
|
||||
struct ggml_opt_params params,
|
||||
int64_t nx);
|
||||
|
||||
// continue optimizing the function defined by the tensor f
|
||||
GGML_API enum ggml_opt_result ggml_opt_resume(
|
||||
@@ -1821,9 +1806,7 @@ extern "C" {
|
||||
struct ggml_opt_context * opt,
|
||||
struct ggml_tensor * f,
|
||||
struct ggml_cgraph * gf,
|
||||
struct ggml_cgraph * gb,
|
||||
ggml_opt_callback callback,
|
||||
void * callback_data);
|
||||
struct ggml_cgraph * gb);
|
||||
|
||||
//
|
||||
// quantization
|
||||
@@ -1875,39 +1858,39 @@ extern "C" {
|
||||
|
||||
GGML_API const char * gguf_type_name(enum gguf_type type);
|
||||
|
||||
GGML_API int gguf_get_version (const struct gguf_context * ctx);
|
||||
GGML_API size_t gguf_get_alignment (const struct gguf_context * ctx);
|
||||
GGML_API size_t gguf_get_data_offset(const struct gguf_context * ctx);
|
||||
GGML_API void * gguf_get_data (const struct gguf_context * ctx);
|
||||
GGML_API int gguf_get_version (struct gguf_context * ctx);
|
||||
GGML_API size_t gguf_get_alignment (struct gguf_context * ctx);
|
||||
GGML_API size_t gguf_get_data_offset(struct gguf_context * ctx);
|
||||
GGML_API void * gguf_get_data (struct gguf_context * ctx);
|
||||
|
||||
GGML_API int gguf_get_n_kv(const struct gguf_context * ctx);
|
||||
GGML_API int gguf_find_key(const struct gguf_context * ctx, const char * key);
|
||||
GGML_API const char * gguf_get_key (const struct gguf_context * ctx, int i);
|
||||
GGML_API int gguf_get_n_kv(struct gguf_context * ctx);
|
||||
GGML_API int gguf_find_key(struct gguf_context * ctx, const char * key);
|
||||
GGML_API const char * gguf_get_key (struct gguf_context * ctx, int i);
|
||||
|
||||
GGML_API enum gguf_type gguf_get_kv_type (const struct gguf_context * ctx, int i);
|
||||
GGML_API enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int i);
|
||||
GGML_API enum gguf_type gguf_get_kv_type (struct gguf_context * ctx, int i);
|
||||
GGML_API enum gguf_type gguf_get_arr_type(struct gguf_context * ctx, int i);
|
||||
|
||||
// results are undefined if the wrong type is used for the key
|
||||
GGML_API uint8_t gguf_get_val_u8 (const struct gguf_context * ctx, int i);
|
||||
GGML_API int8_t gguf_get_val_i8 (const struct gguf_context * ctx, int i);
|
||||
GGML_API uint16_t gguf_get_val_u16 (const struct gguf_context * ctx, int i);
|
||||
GGML_API int16_t gguf_get_val_i16 (const struct gguf_context * ctx, int i);
|
||||
GGML_API uint32_t gguf_get_val_u32 (const struct gguf_context * ctx, int i);
|
||||
GGML_API int32_t gguf_get_val_i32 (const struct gguf_context * ctx, int i);
|
||||
GGML_API float gguf_get_val_f32 (const struct gguf_context * ctx, int i);
|
||||
GGML_API uint64_t gguf_get_val_u64 (const struct gguf_context * ctx, int i);
|
||||
GGML_API int64_t gguf_get_val_i64 (const struct gguf_context * ctx, int i);
|
||||
GGML_API double gguf_get_val_f64 (const struct gguf_context * ctx, int i);
|
||||
GGML_API bool gguf_get_val_bool(const struct gguf_context * ctx, int i);
|
||||
GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int i);
|
||||
GGML_API int gguf_get_arr_n (const struct gguf_context * ctx, int i);
|
||||
GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int i);
|
||||
GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i);
|
||||
GGML_API uint8_t gguf_get_val_u8 (struct gguf_context * ctx, int i);
|
||||
GGML_API int8_t gguf_get_val_i8 (struct gguf_context * ctx, int i);
|
||||
GGML_API uint16_t gguf_get_val_u16 (struct gguf_context * ctx, int i);
|
||||
GGML_API int16_t gguf_get_val_i16 (struct gguf_context * ctx, int i);
|
||||
GGML_API uint32_t gguf_get_val_u32 (struct gguf_context * ctx, int i);
|
||||
GGML_API int32_t gguf_get_val_i32 (struct gguf_context * ctx, int i);
|
||||
GGML_API float gguf_get_val_f32 (struct gguf_context * ctx, int i);
|
||||
GGML_API uint64_t gguf_get_val_u64 (struct gguf_context * ctx, int i);
|
||||
GGML_API int64_t gguf_get_val_i64 (struct gguf_context * ctx, int i);
|
||||
GGML_API double gguf_get_val_f64 (struct gguf_context * ctx, int i);
|
||||
GGML_API bool gguf_get_val_bool(struct gguf_context * ctx, int i);
|
||||
GGML_API const char * gguf_get_val_str (struct gguf_context * ctx, int i);
|
||||
GGML_API int gguf_get_arr_n (struct gguf_context * ctx, int i);
|
||||
GGML_API const void * gguf_get_arr_data(struct gguf_context * ctx, int i);
|
||||
GGML_API const char * gguf_get_arr_str (struct gguf_context * ctx, int key_id, int i);
|
||||
|
||||
GGML_API int gguf_get_n_tensors (const struct gguf_context * ctx);
|
||||
GGML_API int gguf_find_tensor (const struct gguf_context * ctx, const char * name);
|
||||
GGML_API size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i);
|
||||
GGML_API char * gguf_get_tensor_name (const struct gguf_context * ctx, int i);
|
||||
GGML_API int gguf_get_n_tensors (struct gguf_context * ctx);
|
||||
GGML_API int gguf_find_tensor (struct gguf_context * ctx, const char * name);
|
||||
GGML_API size_t gguf_get_tensor_offset(struct gguf_context * ctx, int i);
|
||||
GGML_API char * gguf_get_tensor_name (struct gguf_context * ctx, int i);
|
||||
|
||||
// overrides existing values or adds a new one
|
||||
GGML_API void gguf_set_val_u8 (struct gguf_context * ctx, const char * key, uint8_t val);
|
||||
@@ -1952,11 +1935,11 @@ extern "C" {
|
||||
//
|
||||
|
||||
// write the entire context to a binary file
|
||||
GGML_API void gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta);
|
||||
GGML_API void gguf_write_to_file(struct gguf_context * ctx, const char * fname, bool only_meta);
|
||||
|
||||
// get the size in bytes of the meta data (header, kv pairs, tensor info) including padding
|
||||
GGML_API size_t gguf_get_meta_size(const struct gguf_context * ctx);
|
||||
GGML_API void gguf_get_meta_data(const struct gguf_context * ctx, void * data);
|
||||
GGML_API size_t gguf_get_meta_size(struct gguf_context * ctx);
|
||||
GGML_API void gguf_get_meta_data(struct gguf_context * ctx, void * data);
|
||||
|
||||
//
|
||||
// system info
|
||||
@@ -1970,7 +1953,6 @@ extern "C" {
|
||||
GGML_API int ggml_cpu_has_fma (void);
|
||||
GGML_API int ggml_cpu_has_neon (void);
|
||||
GGML_API int ggml_cpu_has_arm_fma (void);
|
||||
GGML_API int ggml_cpu_has_metal (void);
|
||||
GGML_API int ggml_cpu_has_f16c (void);
|
||||
GGML_API int ggml_cpu_has_fp16_va (void);
|
||||
GGML_API int ggml_cpu_has_wasm_simd (void);
|
||||
|
||||
@@ -27,25 +27,8 @@ In this case, upgrade Pip to the latest:
pip install --upgrade pip
```

## Automatic publishing with CI

There's a GitHub workflow to make a release automatically upon creation of tags in a specified format.

1. Bump the version in `pyproject.toml`.
2. Create a tag named `gguf-vx.x.x` where `x.x.x` is the semantic version number.

```sh
git tag -a gguf-v1.0.0 -m "Version 1.0 release"
```

3. Push the tags.

```sh
git push origin --tags
```

## Manual publishing
If you want to publish the package manually for any reason, you need to have `twine` and `build` installed:
## Publishing
To publish the package, you need to have `twine` and `build` installed:

```sh
pip install build twine
@@ -53,7 +36,7 @@ pip install build twine

Then, follow these steps to release a new version:

1. Bump the version in `pyproject.toml`.
1. Update the version in `pyproject.toml`.
2. Build the package:

```sh
@@ -1,19 +1,13 @@
|
||||
#!/usr/bin/env python3
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import struct
|
||||
import sys
|
||||
import struct
|
||||
import tempfile
|
||||
from enum import IntEnum, auto
|
||||
from io import BufferedWriter
|
||||
from pathlib import Path
|
||||
from typing import IO, Any, BinaryIO, Callable, Sequence
|
||||
|
||||
import numpy as np
|
||||
|
||||
from enum import IntEnum, auto
|
||||
from typing import Any, IO, List, Optional
|
||||
|
||||
#
|
||||
# constants
|
||||
#
|
||||
@@ -77,48 +71,44 @@ KEY_TOKENIZER_RWKV = "tokenizer.rwkv.world"
|
||||
|
||||
|
||||
class MODEL_ARCH(IntEnum):
|
||||
LLAMA : int = auto()
|
||||
FALCON : int = auto()
|
||||
BAICHUAN : int = auto()
|
||||
GPT2 : int = auto()
|
||||
GPTJ : int = auto()
|
||||
GPTNEOX : int = auto()
|
||||
MPT : int = auto()
|
||||
STARCODER : int = auto()
|
||||
LLAMA = auto()
|
||||
FALCON = auto()
|
||||
GPT2 = auto()
|
||||
GPTJ = auto()
|
||||
GPTNEOX = auto()
|
||||
MPT = auto()
|
||||
|
||||
|
||||
class MODEL_TENSOR(IntEnum):
|
||||
TOKEN_EMBD : int = auto()
|
||||
POS_EMBD : int = auto()
|
||||
OUTPUT : int = auto()
|
||||
OUTPUT_NORM : int = auto()
|
||||
ROPE_FREQS : int = auto()
|
||||
ATTN_Q : int = auto()
|
||||
ATTN_K : int = auto()
|
||||
ATTN_V : int = auto()
|
||||
ATTN_QKV : int = auto()
|
||||
ATTN_OUT : int = auto()
|
||||
ATTN_NORM : int = auto()
|
||||
ATTN_NORM_2 : int = auto()
|
||||
ATTN_ROT_EMBD: int = auto()
|
||||
FFN_GATE : int = auto()
|
||||
FFN_DOWN : int = auto()
|
||||
FFN_UP : int = auto()
|
||||
FFN_NORM : int = auto()
|
||||
TOKEN_EMBD = auto()
|
||||
POS_EMBD = auto()
|
||||
OUTPUT = auto()
|
||||
OUTPUT_NORM = auto()
|
||||
ROPE_FREQS = auto()
|
||||
ATTN_Q = auto()
|
||||
ATTN_K = auto()
|
||||
ATTN_V = auto()
|
||||
ATTN_QKV = auto()
|
||||
ATTN_OUT = auto()
|
||||
ATTN_NORM = auto()
|
||||
ATTN_NORM_2 = auto()
|
||||
ATTN_ROT_EMBD = auto()
|
||||
FFN_GATE = auto()
|
||||
FFN_DOWN = auto()
|
||||
FFN_UP = auto()
|
||||
FFN_NORM = auto()
|
||||
|
||||
|
||||
MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||
MODEL_ARCH.LLAMA: "llama",
|
||||
MODEL_ARCH.FALCON: "falcon",
|
||||
MODEL_ARCH.BAICHUAN: "baichuan",
|
||||
MODEL_ARCH.GPT2: "gpt2",
|
||||
MODEL_ARCH.GPTJ: "gptj",
|
||||
MODEL_ARCH.GPTNEOX: "gptneox",
|
||||
MODEL_ARCH.MPT: "mpt",
|
||||
MODEL_ARCH.STARCODER: "starcoder",
|
||||
MODEL_ARCH_NAMES = {
|
||||
MODEL_ARCH.LLAMA: "llama",
|
||||
MODEL_ARCH.FALCON: "falcon",
|
||||
MODEL_ARCH.GPT2: "gpt2",
|
||||
MODEL_ARCH.GPTJ: "gptj",
|
||||
MODEL_ARCH.GPTNEOX: "gptneox",
|
||||
MODEL_ARCH.MPT: "mpt",
|
||||
}
|
||||
|
||||
MODEL_TENSOR_NAMES: dict[MODEL_ARCH, dict[MODEL_TENSOR, str]] = {
|
||||
MODEL_TENSOR_NAMES = {
|
||||
MODEL_ARCH.LLAMA: {
|
||||
MODEL_TENSOR.TOKEN_EMBD: "token_embd",
|
||||
MODEL_TENSOR.OUTPUT_NORM: "output_norm",
|
||||
@@ -157,34 +147,6 @@ MODEL_TENSOR_NAMES: dict[MODEL_ARCH, dict[MODEL_TENSOR, str]] = {
|
||||
MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
|
||||
MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
|
||||
},
|
||||
MODEL_ARCH.BAICHUAN: {
|
||||
MODEL_TENSOR.TOKEN_EMBD: "token_embd",
|
||||
MODEL_TENSOR.OUTPUT_NORM: "output_norm",
|
||||
MODEL_TENSOR.OUTPUT: "output",
|
||||
MODEL_TENSOR.ROPE_FREQS: "rope_freqs",
|
||||
MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
|
||||
MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q",
|
||||
MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k",
|
||||
MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v",
|
||||
MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
|
||||
MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
|
||||
MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
|
||||
MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
|
||||
MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
|
||||
MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
|
||||
},
|
||||
MODEL_ARCH.STARCODER: {
|
||||
MODEL_TENSOR.TOKEN_EMBD: "token_embd",
|
||||
MODEL_TENSOR.POS_EMBD: "position_embd",
|
||||
MODEL_TENSOR.OUTPUT_NORM: "output_norm",
|
||||
MODEL_TENSOR.OUTPUT: "output",
|
||||
MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
|
||||
MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv",
|
||||
MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
|
||||
MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
|
||||
MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
|
||||
MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
|
||||
},
|
||||
MODEL_ARCH.GPT2: {
|
||||
# TODO
|
||||
},
|
||||
@@ -192,212 +154,175 @@ MODEL_TENSOR_NAMES: dict[MODEL_ARCH, dict[MODEL_TENSOR, str]] = {
|
||||
}
|
||||
|
||||
# tensors that will not be serialized
|
||||
MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_TENSOR_SKIP = {
|
||||
MODEL_ARCH.LLAMA: [
|
||||
MODEL_TENSOR.ROPE_FREQS,
|
||||
MODEL_TENSOR.ATTN_ROT_EMBD,
|
||||
],
|
||||
MODEL_ARCH.BAICHUAN: [
|
||||
MODEL_TENSOR.ROPE_FREQS,
|
||||
MODEL_TENSOR.ATTN_ROT_EMBD,
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
class TensorNameMap:
|
||||
mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
|
||||
# Token embeddings
|
||||
MODEL_TENSOR.TOKEN_EMBD: (
|
||||
"gpt_neox.embed_in", # gptneox
|
||||
"transformer.wte", # gpt2 mpt
|
||||
"transformer.word_embeddings", # falcon
|
||||
"model.embed_tokens", # llama-hf
|
||||
"tok_embeddings", # llama-pth
|
||||
),
|
||||
# TODO: the following helper functions should be removed
|
||||
# instead, get_tensor_name_map should return tuples of (name, MODEL_TENSOR)
|
||||
# however, my Python is very bad, and I couldn't figure out how to do this, hence these functions
|
||||
# REMOVE
|
||||
def should_skip_tensor_TMP(arch: MODEL_ARCH, n_blocks: int, name: str) -> bool:
|
||||
for skip in MODEL_TENSOR_SKIP.get(arch, []):
|
||||
for i in range(n_blocks):
|
||||
if name == MODEL_TENSOR_NAMES[arch][skip].format(bid=i):
|
||||
return True
|
||||
|
||||
# Position embeddings
|
||||
MODEL_TENSOR.POS_EMBD: (
|
||||
"transformer.wpe", # gpt2
|
||||
),
|
||||
return False
|
||||
|
||||
# Output
|
||||
MODEL_TENSOR.OUTPUT: (
|
||||
"embed_out", # gptneox
|
||||
"lm_head", # gpt2 mpt falcon llama-hf baichuan
|
||||
"output", # llama-pth
|
||||
),
|
||||
|
||||
# Output norm
|
||||
MODEL_TENSOR.OUTPUT_NORM: (
|
||||
"gpt_neox.final_layer_norm", # gptneox
|
||||
"transformer.ln_f", # gpt2 falcon
|
||||
"model.norm", # llama-hf baichuan
|
||||
"norm", # llama-pth
|
||||
),
|
||||
def get_tensor_name_map(arch: MODEL_ARCH, n_blocks: int) -> dict:
|
||||
tensor_map = {}
|
||||
|
||||
# Rope frequencies
|
||||
MODEL_TENSOR.ROPE_FREQS: (
|
||||
"rope.freqs", # llama-pth
|
||||
),
|
||||
}
|
||||
# Token embeddings
|
||||
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.TOKEN_EMBD, None)
|
||||
|
||||
block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
|
||||
tensor_map["gpt_neox.embed_in"] = mapped_to # gptneox
|
||||
tensor_map["transformer.wte"] = mapped_to # gpt2 mpt
|
||||
tensor_map["transformer.word_embeddings"] = mapped_to # falcon
|
||||
tensor_map["model.embed_tokens"] = mapped_to # llama-hf
|
||||
tensor_map["tok_embeddings"] = mapped_to # llama-pth
|
||||
|
||||
# Position embeddings
|
||||
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.POS_EMBD, None)
|
||||
|
||||
tensor_map["transformer.wpe"] = mapped_to # gpt2
|
||||
|
||||
# Output
|
||||
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.OUTPUT, None)
|
||||
|
||||
tensor_map["embed_out"] = mapped_to # gptneox
|
||||
tensor_map["lm_head"] = mapped_to # gpt2 mpt falcon llama-hf
|
||||
tensor_map["output"] = mapped_to # llama-pth
|
||||
|
||||
# Output norm
|
||||
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.OUTPUT_NORM, None)
|
||||
|
||||
tensor_map["gpt_neox.final_layer_norm"] = mapped_to # gptneox
|
||||
tensor_map["transformer.ln_f"] = mapped_to # gpt2 falcon
|
||||
tensor_map["transformer.norm_f"] = mapped_to # mpt
|
||||
tensor_map["model.norm"] = mapped_to # llama-hf
|
||||
tensor_map["norm"] = mapped_to # llama-pth
|
||||
|
||||
# Rope frequencies
|
||||
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ROPE_FREQS, None)
|
||||
|
||||
tensor_map["rope.freqs"] = mapped_to # llama-pth
|
||||
|
||||
# Attention and feed-forward blocks
|
||||
for i in range(0, n_blocks):
|
||||
# Attention norm
|
||||
MODEL_TENSOR.ATTN_NORM: (
|
||||
"gpt_neox.layers.{bid}.input_layernorm", # gptneox
|
||||
"transformer.h.{bid}.ln_1", # gpt2
|
||||
"transformer.blocks.{bid}.norm_1", # mpt
|
||||
"transformer.h.{bid}.input_layernorm", # falcon7b
|
||||
"transformer.h.{bid}.ln_mlp", # falcon40b
|
||||
"model.layers.{bid}.input_layernorm", # llama-hf
|
||||
"layers.{bid}.attention_norm", # llama-pth
|
||||
),
|
||||
# TODO: is there are simpler way to write these 2 lines in Python?
|
||||
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_NORM, None)
|
||||
mapped_to = mapped_to.format(bid=i) if mapped_to else None
|
||||
|
||||
tensor_map["gpt_neox.layers."+str(i)+".input_layernorm"] = mapped_to # gptneox
|
||||
tensor_map["transformer.h."+str(i)+".ln_1"] = mapped_to # gpt2
|
||||
tensor_map["transformer.blocks."+str(i)+".norm_1"] = mapped_to # mpt
|
||||
tensor_map["transformer.h."+str(i)+".input_layernorm"] = mapped_to # falcon7b
|
||||
tensor_map["transformer.h."+str(i)+".ln_mlp"] = mapped_to # falcon40b
|
||||
tensor_map["model.layers."+str(i)+".input_layernorm"] = mapped_to # llama-hf
|
||||
tensor_map["layers."+str(i)+".attention_norm"] = mapped_to # llama-pth
|
||||
|
||||
# Attention norm 2
|
||||
MODEL_TENSOR.ATTN_NORM_2: (
|
||||
"transformer.h.{bid}.ln_attn", # falcon40b
|
||||
),
|
||||
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_NORM_2, None)
|
||||
mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
|
||||
|
||||
tensor_map["transformer.h."+str(i)+".ln_attn"] = mapped_to # falcon40b
|
||||
|
||||
# Attention query-key-value
|
||||
MODEL_TENSOR.ATTN_QKV: (
|
||||
"gpt_neox.layers.{bid}.attention.query_key_value", # gptneox
|
||||
"transformer.h.{bid}.attn.c_attn", # gpt2
|
||||
"transformer.blocks.{bid}.attn.Wqkv", # mpt
|
||||
"transformer.h.{bid}.self_attention.query_key_value", # falcon
|
||||
),
|
||||
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_QKV, None)
|
||||
mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
|
||||
|
||||
tensor_map["gpt_neox.layers."+str(i)+".attention.query_key_value"] = mapped_to # gptneox
|
||||
tensor_map["transformer.h."+str(i)+".attn.c_attn"] = mapped_to # gpt2
|
||||
tensor_map["transformer.blocks."+str(i)+".attn.Wqkv"] = mapped_to # mpt
|
||||
tensor_map["transformer.h."+str(i)+".self_attention.query_key_value"] = mapped_to # falcon
|
||||
|
||||
# Attention query
|
||||
MODEL_TENSOR.ATTN_Q: (
|
||||
"model.layers.{bid}.self_attn.q_proj", # llama-hf
|
||||
"layers.{bid}.attention.wq", # llama-pth
|
||||
),
|
||||
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_Q, None)
|
||||
mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
|
||||
|
||||
tensor_map["model.layers."+str(i)+".self_attn.q_proj"] = mapped_to # llama-hf
|
||||
tensor_map["layers."+str(i)+".attention.wq"] = mapped_to # llama-pth
|
||||
|
||||
# Attention key
|
||||
MODEL_TENSOR.ATTN_K: (
|
||||
"model.layers.{bid}.self_attn.k_proj", # llama-hf
|
||||
"layers.{bid}.attention.wk", # llama-pth
|
||||
),
|
||||
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_K, None)
|
||||
mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
|
||||
|
||||
tensor_map["model.layers."+str(i)+".self_attn.k_proj"] = mapped_to # llama-hf
|
||||
tensor_map["layers."+str(i)+".attention.wk"] = mapped_to # llama-pth
|
||||
|
||||
# Attention value
|
||||
MODEL_TENSOR.ATTN_V: (
|
||||
"model.layers.{bid}.self_attn.v_proj", # llama-hf
|
||||
"layers.{bid}.attention.wv", # llama-pth
|
||||
),
|
||||
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_V, None)
|
||||
mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
|
||||
|
||||
tensor_map["model.layers."+str(i)+".self_attn.v_proj"] = mapped_to # llama-hf
|
||||
tensor_map["layers."+str(i)+".attention.wv"] = mapped_to # llama-pth
|
||||
|
||||
# Attention output
|
||||
MODEL_TENSOR.ATTN_OUT: (
|
||||
"gpt_neox.layers.{bid}.attention.dense", # gptneox
|
||||
"transformer.h.{bid}.attn.c_proj", # gpt2
|
||||
"transformer.blocks.{bid}.attn.out_proj", # mpt
|
||||
"transformer.h.{bid}.self_attention.dense", # falcon
|
||||
"model.layers.{bid}.self_attn.o_proj", # llama-hf
|
||||
"layers.{bid}.attention.wo", # llama-pth
|
||||
),
|
||||
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_OUT, None)
|
||||
mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
|
||||
|
||||
tensor_map["gpt_neox.layers."+str(i)+".attention.dense"] = mapped_to # gptneox
|
||||
tensor_map["transformer.h."+str(i)+".attn.c_proj"] = mapped_to # gpt2
|
||||
tensor_map["transformer.blocks."+str(i)+".attn.out_proj"] = mapped_to # mpt
|
||||
tensor_map["transformer.h."+str(i)+".self_attention.dense"] = mapped_to # falcon
|
||||
tensor_map["model.layers."+str(i)+".self_attn.o_proj"] = mapped_to # llama-hf
|
||||
tensor_map["layers."+str(i)+".attention.wo"] = mapped_to # llama-pth
|
||||
|
||||
# Rotary embeddings
|
||||
MODEL_TENSOR.ATTN_ROT_EMBD: (
|
||||
"model.layers.{bid}.self_attn.rotary_emb.inv_freq", # llama-hf
|
||||
"layers.{bid}.attention.inner_attention.rope.freqs", # llama-pth
|
||||
),
|
||||
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_ROT_EMBD, None)
|
||||
mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
|
||||
|
||||
tensor_map["model.layers."+str(i)+".self_attn.rotary_emb.inv_freq"] = mapped_to # llama-hf
|
||||
tensor_map["layers."+str(i)+".attention.inner_attention.rope.freqs"] = mapped_to # llama-pth
|
||||
|
||||
# Feed-forward norm
|
||||
MODEL_TENSOR.FFN_NORM: (
|
||||
"gpt_neox.layers.{bid}.post_attention_layernorm", # gptneox
|
||||
"transformer.h.{bid}.ln_2", # gpt2
|
||||
"transformer.blocks.{bid}.norm_2", # mpt
|
||||
"model.layers.{bid}.post_attention_layernorm", # llama-hf
|
||||
"layers.{bid}.ffn_norm", # llama-pth
|
||||
),
|
||||
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.FFN_NORM, None)
|
||||
mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
|
||||
|
||||
tensor_map["gpt_neox.layers."+str(i)+".post_attention_layernorm"] = mapped_to # gptneox
|
||||
tensor_map["transformer.h."+str(i)+".ln_2"] = mapped_to # gpt2
|
||||
tensor_map["transformer.blocks."+str(i)+".norm_2"] = mapped_to # mpt
|
||||
tensor_map["model.layers."+str(i)+".post_attention_layernorm"] = mapped_to # llama-hf
|
||||
tensor_map["layers."+str(i)+".ffn_norm"] = mapped_to # llama-pth
|
||||
|
||||
# Feed-forward up
|
||||
MODEL_TENSOR.FFN_UP: (
|
||||
"gpt_neox.layers.{bid}.mlp.dense_h_to_4h", # gptneox
|
||||
"transformer.h.{bid}.mlp.c_fc", # gpt2
|
||||
"transformer.blocks.{bid}.ffn.up_proj", # mpt
|
||||
"transformer.h.{bid}.mlp.dense_h_to_4h", # falcon
|
||||
"model.layers.{bid}.mlp.up_proj", # llama-hf
|
||||
"layers.{bid}.feed_forward.w3", # llama-pth
|
||||
),
|
||||
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.FFN_UP, None)
|
||||
mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
|
||||
|
||||
tensor_map["gpt_neox.layers."+str(i)+".mlp.dense_h_to_4h"] = mapped_to # gptneox
|
||||
tensor_map["transformer.h."+str(i)+".mlp.c_fc"] = mapped_to # gpt2
|
||||
tensor_map["transformer.blocks."+str(i)+".ffn.up_proj"] = mapped_to # mpt
|
||||
tensor_map["transformer.h."+str(i)+".mlp.dense_h_to_4h"] = mapped_to # falcon
|
||||
tensor_map["model.layers."+str(i)+".mlp.up_proj"] = mapped_to # llama-hf
|
||||
tensor_map["layers."+str(i)+".feed_forward.w3"] = mapped_to # llama-pth
|
||||
|
||||
# Feed-forward gate
|
||||
MODEL_TENSOR.FFN_GATE: (
|
||||
"model.layers.{bid}.mlp.gate_proj", # llama-hf
|
||||
"layers.{bid}.feed_forward.w1", # llama-pth
|
||||
),
|
||||
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.FFN_GATE, None)
|
||||
mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
|
||||
|
||||
tensor_map["model.layers."+str(i)+".mlp.gate_proj"] = mapped_to # llama-hf
|
||||
tensor_map["layers."+str(i)+".feed_forward.w1"] = mapped_to # llama-pth
|
||||
|
||||
# Feed-forward down
|
||||
MODEL_TENSOR.FFN_DOWN: (
|
||||
"gpt_neox.layers.{bid}.mlp.dense_4h_to_h", # gptneox
|
||||
"transformer.h.{bid}.mlp.c_proj", # gpt2
|
||||
"transformer.blocks.{bid}.ffn.down_proj", # mpt
|
||||
"transformer.h.{bid}.mlp.dense_4h_to_h", # falcon
|
||||
"model.layers.{bid}.mlp.down_proj", # llama-hf
|
||||
"layers.{bid}.feed_forward.w2", # llama-pth
|
||||
),
|
||||
}
|
||||
mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.FFN_DOWN, None)
|
||||
mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
|
||||
|
||||
mapping: dict[str, tuple[MODEL_TENSOR, str]]
|
||||
tensor_map["gpt_neox.layers."+str(i)+".mlp.dense_4h_to_h"] = mapped_to # gptneox
|
||||
tensor_map["transformer.h."+str(i)+".mlp.c_proj"] = mapped_to # gpt2
|
||||
tensor_map["transformer.blocks."+str(i)+".ffn.down_proj"] = mapped_to # mpt
|
||||
tensor_map["transformer.h."+str(i)+".mlp.dense_4h_to_h"] = mapped_to # falcon
|
||||
tensor_map["model.layers."+str(i)+".mlp.down_proj"] = mapped_to # llama-hf
|
||||
tensor_map["layers."+str(i)+".feed_forward.w2"] = mapped_to # llama-pth
|
||||
|
||||
tensor_names: dict[MODEL_TENSOR, str]
|
||||
return tensor_map
|
||||
|
||||
def __init__(self, arch: MODEL_ARCH, n_blocks: int):
|
||||
mapping = self.mapping = {}
|
||||
tensor_names = self.tensor_names = MODEL_TENSOR_NAMES[arch]
|
||||
for tensor, keys in self.mappings_cfg.items():
|
||||
tensor_name = tensor_names.get(tensor)
|
||||
if tensor_name is None:
|
||||
continue
|
||||
mapping[tensor_name] = (tensor, tensor_name)
|
||||
for key in keys:
|
||||
mapping[key] = (tensor, tensor_name)
|
||||
for bid in range(n_blocks):
|
||||
for tensor, keys in self.block_mappings_cfg.items():
|
||||
tensor_name = tensor_names.get(tensor)
|
||||
if tensor_name is None:
|
||||
continue
|
||||
tensor_name = tensor_name.format(bid = bid)
|
||||
mapping[tensor_name] = (tensor, tensor_name)
|
||||
for key in keys:
|
||||
key = key.format(bid = bid)
|
||||
mapping[key] = (tensor, tensor_name)
|
||||
|
||||
def get_type_and_name(self, key: str, try_suffixes: Sequence[str] = ()) -> tuple[MODEL_TENSOR, str] | None:
|
||||
result = self.mapping.get(key)
|
||||
if result is not None:
|
||||
return result
|
||||
for suffix in try_suffixes:
|
||||
if key.endswith(suffix):
|
||||
result = self.mapping.get(key[:-len(suffix)])
|
||||
if result is not None:
|
||||
return (result[0], result[1] + suffix)
|
||||
return None
|
||||
|
||||
def get_name(self, key: str, try_suffixes: Sequence[str] = ()) -> str | None:
|
||||
result = self.get_type_and_name(key, try_suffixes = try_suffixes)
|
||||
if result is None:
|
||||
return None
|
||||
return result[1]
|
||||
|
||||
def get_type(self, key: str, try_suffixes: Sequence[str] = ()) -> MODEL_TENSOR | None:
|
||||
result = self.get_type_and_name(key, try_suffixes = try_suffixes)
|
||||
if result is None:
|
||||
return None
|
||||
return result[0]
|
||||
|
||||
def __getitem__(self, key: str) -> str:
|
||||
try:
|
||||
return self.mapping[key][1]
|
||||
except KeyError:
|
||||
raise KeyError(key)
|
||||
|
||||
def __contains__(self, key: str) -> bool:
|
||||
return key in self.mapping
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return repr(self.mapping)
|
||||
|
||||
def get_tensor_name_map(arch: MODEL_ARCH, n_blocks: int) -> TensorNameMap:
|
||||
return TensorNameMap(arch, n_blocks)
|
||||
|
||||
class TokenType(IntEnum):
|
||||
NORMAL = 1
|
||||
@@ -463,21 +388,15 @@ class GGUFValueType(IntEnum):
|
||||
|
||||
|
||||
class GGUFWriter:
|
||||
fout: BufferedWriter
|
||||
arch: str
|
||||
offset_tensor = 0
|
||||
data_alignment = GGUF_DEFAULT_ALIGNMENT
|
||||
kv_data = b""
|
||||
kv_data_count = 0
|
||||
ti_data = b""
|
||||
ti_data_count = 0
|
||||
use_temp_file: bool
|
||||
temp_file: tempfile.SpooledTemporaryFile[bytes] | None = None
|
||||
tensors: list[tuple[np.ndarray[Any, Any], int]]
|
||||
|
||||
def __init__(self, path: os.PathLike[str] | str, arch: str, use_temp_file = True):
|
||||
def __init__(self, path: str, arch: str, use_temp_file = True):
|
||||
self.fout = open(path, "wb")
|
||||
self.arch = arch
|
||||
self.offset_tensor = 0
|
||||
self.data_alignment = GGUF_DEFAULT_ALIGNMENT
|
||||
self.kv_data = b""
|
||||
self.kv_data_count = 0
|
||||
self.ti_data = b""
|
||||
self.ti_data_count = 0
|
||||
self.add_architecture()
|
||||
self.use_temp_file = use_temp_file
|
||||
self.tensors = []
|
||||
@@ -551,27 +470,14 @@ class GGUFWriter:
|
||||
self.add_key(key)
|
||||
self.add_val(val, GGUFValueType.STRING)
|
||||
|
||||
def add_array(self, key: str, val: Sequence[Any]):
|
||||
if not isinstance(val, Sequence):
|
||||
raise ValueError("Value must be a sequence for array type")
|
||||
def add_array(self, key: str, val: list):
|
||||
if not isinstance(val, list):
|
||||
raise ValueError("Value must be a list for array type")
|
||||
|
||||
self.add_key(key)
|
||||
self.add_val(val, GGUFValueType.ARRAY)
|
||||
|
||||
_simple_value_packing = {
|
||||
GGUFValueType.UINT8: "<B",
|
||||
GGUFValueType.INT8: "<b",
|
||||
GGUFValueType.UINT16: "<H",
|
||||
GGUFValueType.INT16: "<h",
|
||||
GGUFValueType.UINT32: "<I",
|
||||
GGUFValueType.INT32: "<i",
|
||||
GGUFValueType.FLOAT32: "<f",
|
||||
GGUFValueType.UINT64: "<Q",
|
||||
GGUFValueType.INT64: "<q",
|
||||
GGUFValueType.FLOAT64: "<d",
|
||||
GGUFValueType.BOOL: "?" ,
|
||||
}
|
||||
def add_val(self, val: Any, vtype: GGUFValueType | None = None, add_vtype: bool = True):
|
||||
def add_val(self: str, val: Any, vtype: GGUFValueType = None, add_vtype: bool = True):
|
||||
if vtype is None:
|
||||
vtype = GGUFValueType.get_type(val)
|
||||
|
||||
@@ -579,29 +485,47 @@ class GGUFWriter:
|
||||
self.kv_data += struct.pack("<I", vtype)
|
||||
self.kv_data_count += 1
|
||||
|
||||
pack_fmt = self._simple_value_packing.get(vtype)
|
||||
if pack_fmt is not None:
|
||||
self.kv_data += struct.pack(pack_fmt, val)
|
||||
if vtype == GGUFValueType.UINT8:
|
||||
self.kv_data += struct.pack("<B", val)
|
||||
elif vtype == GGUFValueType.INT8:
|
||||
self.kv_data += struct.pack("<b", val)
|
||||
elif vtype == GGUFValueType.UINT16:
|
||||
self.kv_data += struct.pack("<H", val)
|
||||
elif vtype == GGUFValueType.INT16:
|
||||
self.kv_data += struct.pack("<h", val)
|
||||
elif vtype == GGUFValueType.UINT32:
|
||||
self.kv_data += struct.pack("<I", val)
|
||||
elif vtype == GGUFValueType.INT32:
|
||||
self.kv_data += struct.pack("<i", val)
|
||||
elif vtype == GGUFValueType.FLOAT32:
|
||||
self.kv_data += struct.pack("<f", val)
|
||||
elif vtype == GGUFValueType.UINT64:
|
||||
self.kv_data += struct.pack("<Q", val)
|
||||
elif vtype == GGUFValueType.INT64:
|
||||
self.kv_data += struct.pack("<q", val)
|
||||
elif vtype == GGUFValueType.FLOAT64:
|
||||
self.kv_data += struct.pack("<d", val)
|
||||
elif vtype == GGUFValueType.BOOL:
|
||||
self.kv_data += struct.pack("?", val)
|
||||
elif vtype == GGUFValueType.STRING:
|
||||
encoded_val = val.encode("utf8") if isinstance(val, str) else val
|
||||
self.kv_data += struct.pack("<Q", len(encoded_val))
|
||||
self.kv_data += encoded_val
|
||||
elif vtype == GGUFValueType.ARRAY and isinstance(val, Sequence) and len(val) > 0:
|
||||
ltype = GGUFValueType.get_type(val[0])
|
||||
if not all(GGUFValueType.get_type(i) is ltype for i in val[1:]):
|
||||
raise ValueError("All items in a GGUF array should be of the same type")
|
||||
self.kv_data += struct.pack("<I", ltype)
|
||||
elif vtype == GGUFValueType.ARRAY:
|
||||
ltype = set([GGUFValueType.get_type(item) for item in val])
|
||||
assert len(ltype) == 1, "All items in a GGUF array should be of the same type"
|
||||
self.kv_data += struct.pack("<I", list(ltype)[0])
|
||||
self.kv_data += struct.pack("<Q", len(val))
|
||||
for item in val:
|
||||
self.add_val(item, add_vtype=False)
|
||||
else:
|
||||
raise ValueError("Invalid GGUF metadata value type or value")
|
||||
raise ValueError("Invalid GGUF metadata value type")
|
||||
|
||||
@staticmethod
|
||||
def ggml_pad(x: int, n: int) -> int:
|
||||
return ((x + n - 1) // n) * n
|
||||
|
||||
def add_tensor_info(self, name: str, tensor_shape: Sequence[int], tensor_dtype: np.dtype[np.float16] | np.dtype[np.float32], tensor_nbytes: int, raw_dtype: GGMLQuantizationType | None = None):
|
||||
def add_tensor_info(self, name: str, tensor_shape: np.ndarray, tensor_dtype: np.dtype, tensor_nbytes: int, raw_dtype: Optional[GGMLQuantizationType] = None):
|
||||
assert raw_dtype is not None or tensor_dtype in (np.float32, np.float16), "Only F32 and F16 tensors are supported for now"
|
||||
|
||||
encoded_name = name.encode("utf8")
|
||||
@@ -620,18 +544,16 @@ class GGUFWriter:
|
||||
self.offset_tensor += GGUFWriter.ggml_pad(tensor_nbytes, self.data_alignment)
|
||||
self.ti_data_count += 1
|
||||
|
||||
def add_tensor(self, name: str, tensor: np.ndarray[Any, Any], raw_shape: Sequence[int] | None = None, raw_dtype: GGMLQuantizationType | None = None):
|
||||
if self.use_temp_file and self.temp_file is None:
|
||||
fp = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256*1024*1024)
|
||||
fp.seek(0)
|
||||
self.temp_file = fp
|
||||
def add_tensor(self, name: str, tensor: np.ndarray, raw_shape: Optional[np.ndarray] = None, raw_dtype: Optional[GGMLQuantizationType] = None):
|
||||
if self.use_temp_file and not hasattr(self, "temp_file"):
|
||||
self.temp_file = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256*1024*1024)
|
||||
self.temp_file.seek(0)
|
||||
|
||||
shape: Sequence[int] = raw_shape if raw_shape is not None else tensor.shape
|
||||
self.add_tensor_info(name, shape, tensor.dtype, tensor.nbytes, raw_dtype = raw_dtype)
|
||||
self.add_tensor_info(name, raw_shape if raw_shape is not None else tensor.shape, tensor.dtype, tensor.nbytes, raw_dtype = raw_dtype)
|
||||
|
||||
pad = GGUFWriter.ggml_pad(tensor.nbytes, self.data_alignment) - tensor.nbytes
|
||||
|
||||
if self.temp_file is None:
|
||||
if not self.use_temp_file:
|
||||
self.tensors.append((tensor, pad))
|
||||
return
|
||||
|
||||
@@ -640,22 +562,25 @@ class GGUFWriter:
|
||||
if pad != 0:
|
||||
self.temp_file.write(bytes([0] * pad))
|
||||
|
||||
def write_padding(self, fp: BinaryIO, n: int, align: int | None = None):
|
||||
pad = GGUFWriter.ggml_pad(n, align if align is not None else self.data_alignment) - n
|
||||
def write_tensor_data(self, tensor: np.ndarray):
|
||||
pad = GGUFWriter.ggml_pad(self.fout.tell(), self.data_alignment) - self.fout.tell()
|
||||
if pad != 0:
|
||||
fp.write(bytes([0] * pad))
|
||||
self.fout.write(bytes([0] * pad))
|
||||
|
||||
def write_tensor_data(self, tensor: np.ndarray[Any, Any]):
|
||||
self.write_padding(self.fout, self.fout.tell())
|
||||
tensor.tofile(self.fout)
|
||||
self.write_padding(self.fout, tensor.nbytes)
|
||||
|
||||
pad = GGUFWriter.ggml_pad(tensor.nbytes, self.data_alignment) - tensor.nbytes
|
||||
if pad != 0:
|
||||
self.fout.write(bytes([0] * pad))
|
||||
|
||||
def write_tensors_to_file(self):
|
||||
self.write_ti_data_to_file()
|
||||
|
||||
self.write_padding(self.fout, self.fout.tell())
|
||||
pad = GGUFWriter.ggml_pad(self.fout.tell(), self.data_alignment) - self.fout.tell()
|
||||
if pad != 0:
|
||||
self.fout.write(bytes([0] * pad))
|
||||
|
||||
if self.temp_file is None:
|
||||
if not self.use_temp_file:
|
||||
for (currtensor, currpad) in self.tensors:
|
||||
currtensor.tofile(self.fout)
|
||||
if currpad != 0:
|
||||
@@ -729,6 +654,10 @@ class GGUFWriter:
|
||||
self.add_bool(
|
||||
KEY_USE_PARALLEL_RESIDUAL.format(arch=self.arch), use)
|
||||
|
||||
def add_tensor_data_layout(self, layout: str):
|
||||
self.add_string(
|
||||
KEY_TENSOR_DATA_LAYOUT.format(arch=self.arch), layout)
|
||||
|
||||
def add_head_count(self, count: int):
|
||||
self.add_uint32(
|
||||
KEY_ATTENTION_HEAD_COUNT.format(arch=self.arch), count)
|
||||
@@ -766,16 +695,16 @@ class GGUFWriter:
|
||||
def add_tokenizer_model(self, model: str):
|
||||
self.add_string(KEY_TOKENIZER_MODEL, model)
|
||||
|
||||
def add_token_list(self, tokens: Sequence[str] | Sequence[bytes] | Sequence[bytearray]):
|
||||
def add_token_list(self, tokens: List):
|
||||
self.add_array(KEY_TOKENIZER_LIST, tokens)
|
||||
|
||||
def add_token_merges(self, merges: Sequence[str] | Sequence[bytes] | Sequence[bytearray]):
|
||||
def add_token_merges(self, merges: List):
|
||||
self.add_array(KEY_TOKENIZER_MERGES, merges)
|
||||
|
||||
def add_token_types(self, types: Sequence[TokenType] | Sequence[int]):
|
||||
def add_token_types(self, types: List[int]):
|
||||
self.add_array(KEY_TOKENIZER_TOKEN_TYPE, types)
|
||||
|
||||
def add_token_scores(self, scores: Sequence[float]):
|
||||
def add_token_scores(self, scores: List[float]):
|
||||
self.add_array(KEY_TOKENIZER_SCORES, scores)
|
||||
|
||||
def add_bos_token_id(self, id: int):
|
||||
@@ -794,84 +723,6 @@ class GGUFWriter:
|
||||
self.add_uint32(KEY_TOKENIZER_PAD_ID, id)
|
||||
|
||||
|
||||
class SpecialVocab:
|
||||
load_merges: bool = False
|
||||
merges: list[str] = []
|
||||
special_token_types: tuple[str, ...] = ('bos', 'eos', 'unk', 'sep', 'pad')
|
||||
special_token_ids: dict[str, int] = {}
|
||||
|
||||
def __init__(self, path: Path, load_merges: bool = False, special_token_types: tuple[str, ...] | None = None):
|
||||
self.special_token_ids = {}
|
||||
self.load_merges = load_merges
|
||||
if special_token_types is not None:
|
||||
self.special_token_types = special_token_types
|
||||
self.load(path)
|
||||
|
||||
def load(self, path: Path):
|
||||
if not self.try_load_from_tokenizer_json(path):
|
||||
self.try_load_from_config_json(path)
|
||||
|
||||
def try_load_from_tokenizer_json(self, path: Path) -> bool:
|
||||
tokenizer_file = path / 'tokenizer.json'
|
||||
if not tokenizer_file.is_file():
|
||||
return False
|
||||
with open(tokenizer_file, 'r', encoding = 'utf-8') as f:
|
||||
tokenizer = json.load(f)
|
||||
if self.load_merges:
|
||||
merges = tokenizer.get('model', {}).get('merges')
|
||||
if isinstance(merges, list) and len(merges) > 0 and isinstance(merges[0], str):
|
||||
self.merges = merges
|
||||
tokenizer_config_file = path / 'tokenizer_config.json'
|
||||
added_tokens = tokenizer.get('added_tokens')
|
||||
if added_tokens is None or not tokenizer_config_file.is_file():
|
||||
return True
|
||||
with open(tokenizer_config_file, 'r', encoding = 'utf-8') as f:
|
||||
tokenizer_config = json.load(f)
|
||||
for typ in self.special_token_types:
|
||||
entry = tokenizer_config.get(f'{typ}_token')
|
||||
if isinstance(entry, str):
|
||||
tc_content = entry
|
||||
elif isinstance(entry, dict):
|
||||
entry_content = entry.get('content')
|
||||
if not isinstance(entry_content, str):
|
||||
continue
|
||||
tc_content = entry_content
|
||||
else:
|
||||
continue
|
||||
for maybe_token_id in (atok.get('id') for atok in added_tokens if atok.get('content') == tc_content):
|
||||
if isinstance(maybe_token_id, int) and maybe_token_id >= 0:
|
||||
self.special_token_ids[typ] = maybe_token_id
|
||||
break
|
||||
return True
|
||||
|
||||
def try_load_from_config_json(self, path: Path) -> bool:
|
||||
config_file = path / 'config.json'
|
||||
if not config_file.is_file():
|
||||
return False
|
||||
with open(config_file, 'r', encoding = 'utf-8') as f:
|
||||
config = json.load(f)
|
||||
for typ in self.special_token_types:
|
||||
maybe_token_id = config.get(f'{typ}_token_id')
|
||||
if isinstance(maybe_token_id, int) and maybe_token_id >= 0:
|
||||
self.special_token_ids[typ] = maybe_token_id
|
||||
return True
|
||||
|
||||
def add_to_gguf(self, gw: GGUFWriter):
|
||||
if len(self.merges) > 0:
|
||||
print(f'gguf: Adding {len(self.merges)} merge(s).')
|
||||
gw.add_token_merges(self.merges)
|
||||
for typ, tokid in self.special_token_ids.items():
|
||||
handler: Callable[[int], None] | None = getattr(gw, f'add_{typ}_token_id', None)
|
||||
if handler is None:
|
||||
print(f'gguf: WARNING: No handler for special token type {typ} with id {tokid} - skipping')
|
||||
continue
|
||||
print(f'gguf: Setting special token type {typ} to {tokid}')
|
||||
handler(tokid)
|
||||
|
||||
def __repr__(self):
|
||||
return f'<SpecialVocab with {len(self.merges)} merges and special tokens {self.special_token_ids if self.special_token_ids else "unset"}>'
|
||||
|
||||
|
||||
# Example usage:
|
||||
if __name__ == "__main__":
|
||||
# Example usage with a file
|
||||
|
||||
@@ -1,11 +1,10 @@
|
||||
[tool.poetry]
|
||||
name = "gguf"
|
||||
version = "0.3.3"
|
||||
version = "0.2.1"
|
||||
description = "Write ML models in GGUF for GGML"
|
||||
authors = ["GGML <ggml@ggml.ai>"]
|
||||
packages = [
|
||||
{include = "gguf"},
|
||||
{include = "gguf/py.typed"},
|
||||
]
|
||||
readme = "README.md"
|
||||
homepage = "https://ggml.ai"
|
||||
|
||||
@@ -1,42 +0,0 @@
|
||||
root ::= (declaration)*
|
||||
|
||||
declaration ::= dataType identifier "(" parameter? ")" "{" statement* "}"
|
||||
|
||||
dataType ::= "int" ws | "float" ws | "char" ws
|
||||
identifier ::= [a-zA-Z_] [a-zA-Z_0-9]*
|
||||
|
||||
parameter ::= dataType identifier
|
||||
|
||||
statement ::=
|
||||
( dataType identifier ws "=" ws expression ";" ) |
|
||||
( identifier ws "=" ws expression ";" ) |
|
||||
( identifier ws "(" argList? ")" ";" ) |
|
||||
( "return" ws expression ";" ) |
|
||||
( "while" "(" condition ")" "{" statement* "}" ) |
|
||||
( "for" "(" forInit ";" ws condition ";" ws forUpdate ")" "{" statement* "}" ) |
|
||||
( "if" "(" condition ")" "{" statement* "}" ("else" "{" statement* "}")? ) |
|
||||
( singleLineComment ) |
|
||||
( multiLineComment )
|
||||
|
||||
forInit ::= dataType identifier ws "=" ws expression | identifier ws "=" ws expression
|
||||
forUpdate ::= identifier ws "=" ws expression
|
||||
|
||||
condition ::= expression relationOperator expression
|
||||
relationOperator ::= ("<=" | "<" | "==" | "!=" | ">=" | ">")
|
||||
|
||||
expression ::= term (("+" | "-") term)*
|
||||
term ::= factor(("*" | "/") factor)*
|
||||
|
||||
factor ::= identifier | number | unaryTerm | funcCall | parenExpression
|
||||
unaryTerm ::= "-" factor
|
||||
funcCall ::= identifier "(" argList? ")"
|
||||
parenExpression ::= "(" ws expression ws ")"
|
||||
|
||||
argList ::= expression ("," ws expression)*
|
||||
|
||||
number ::= [0-9]+
|
||||
|
||||
singleLineComment ::= "//" [^\n]* "\n"
|
||||
multiLineComment ::= "/*" ( [^*] | ("*" [^/]) )* "*/"
|
||||
|
||||
ws ::= ([ \t\n]+)
|
||||
@@ -1,34 +0,0 @@
|
||||
# This is the same as json.gbnf but we restrict whitespaces at the end of the root array
|
||||
# Useful for generating JSON arrays
|
||||
|
||||
root ::= arr
|
||||
value ::= object | array | string | number | ("true" | "false" | "null") ws
|
||||
|
||||
arr ::=
|
||||
"[\n" ws (
|
||||
value
|
||||
(",\n" ws value)*
|
||||
)? "]"
|
||||
|
||||
object ::=
|
||||
"{" ws (
|
||||
string ":" ws value
|
||||
("," ws string ":" ws value)*
|
||||
)? "}" ws
|
||||
|
||||
array ::=
|
||||
"[" ws (
|
||||
value
|
||||
("," ws value)*
|
||||
)? "]" ws
|
||||
|
||||
string ::=
|
||||
"\"" (
|
||||
[^"\\] |
|
||||
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes
|
||||
)* "\"" ws
|
||||
|
||||
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws
|
||||
|
||||
# Optional space: by convention, applied in this grammar after literal chars when allowed
|
||||
ws ::= ([ \t\n] ws)?
|
||||
k_quants.c
@@ -13,26 +13,6 @@
|
||||
//
|
||||
#include <arm_neon.h>
|
||||
|
||||
#if !defined(__aarch64__)
|
||||
inline static int32_t vaddvq_s16(int16x8_t v) {
|
||||
return
|
||||
(int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
|
||||
(int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
|
||||
(int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
|
||||
(int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
|
||||
}
|
||||
|
||||
inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
|
||||
int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
|
||||
int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
|
||||
return vcombine_s16(a0, b0);
|
||||
}
|
||||
|
||||
inline static int32_t vaddvq_s32(int32x4_t v) {
|
||||
return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
|
||||
}
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
#ifdef __wasm_simd128__
|
||||
@@ -83,7 +63,7 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
|
||||
float ax = fabsf(x[i]);
|
||||
if (ax > amax) { amax = ax; max = x[i]; }
|
||||
}
|
||||
if (amax < 1e-30f) { // all zero
|
||||
if (!amax) { // all zero
|
||||
for (int i = 0; i < n; ++i) {
|
||||
L[i] = 0;
|
||||
}
|
||||
@@ -203,9 +183,13 @@ static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t
|
||||
int ntry, float alpha) {
|
||||
float min = x[0];
|
||||
float max = x[0];
|
||||
float sum_x = 0;
|
||||
float sum_x2 = 0;
|
||||
for (int i = 1; i < n; ++i) {
|
||||
if (x[i] < min) min = x[i];
|
||||
if (x[i] > max) max = x[i];
|
||||
sum_x += x[i];
|
||||
sum_x2 += x[i]*x[i];
|
||||
}
|
||||
if (max == min) {
|
||||
for (int i = 0; i < n; ++i) L[i] = 0;
|
||||
@@ -1086,13 +1070,6 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict
|
||||
|
||||
}
|
||||
|
||||
if (!max_abs_scale) {
|
||||
memset(&y[i], 0, sizeof(block_q6_K));
|
||||
y[i].d = ggml_fp32_to_fp16(0.f);
|
||||
x += QK_K;
|
||||
continue;
|
||||
}
|
||||
|
||||
float iscale = -128.f/max_scale;
|
||||
y[i].d = ggml_fp32_to_fp16(1/iscale);
|
||||
for (int ib = 0; ib < QK_K/16; ++ib) {
|
||||
@@ -1329,9 +1306,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
|
||||
const uint8x16_t m3 = vdupq_n_u8(0x3);
|
||||
const uint8x16_t m4 = vdupq_n_u8(0xF);
|
||||
#if defined(__ARM_FEATURE_DOTPROD)
|
||||
const int32x4_t vzero = vdupq_n_s32(0);
|
||||
#endif
|
||||
|
||||
int8x16x2_t q2bytes;
|
||||
uint8_t aux[16];
|
||||
@@ -1637,9 +1612,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
#ifdef __ARM_NEON
|
||||
|
||||
const uint8x16_t m3 = vdupq_n_u8(0x3);
|
||||
#if defined(__ARM_FEATURE_DOTPROD)
|
||||
const int32x4_t vzero = vdupq_n_s32(0);
|
||||
#endif
|
||||
|
||||
int8x16x4_t q2bytes;
|
||||
|
||||
@@ -2087,7 +2060,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
|
||||
__m256 acc = _mm256_setzero_ps();
|
||||
|
||||
const uint32_t *aux;
|
||||
uint32_t *aux;
|
||||
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
|
||||
@@ -2097,7 +2070,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
const int8_t * restrict q8 = y[i].qs;
|
||||
|
||||
// Set up scales
|
||||
aux = (const uint32_t *)x[i].scales;
|
||||
aux = (uint32_t *)x[i].scales;
|
||||
__m128i scales128 = _mm_set_epi32(
|
||||
((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4),
|
||||
((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4),
|
||||
@@ -2609,10 +2582,7 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
|
||||
memcpy(utmp, x[i].scales, 12);
|
||||
|
||||
uint32x2_t mins8 = { 0 };
|
||||
mins8 = vset_lane_u32(utmp[1] & kmask1, mins8, 0);
|
||||
mins8 = vset_lane_u32(((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4), mins8, 1);
|
||||
|
||||
const uint32x2_t mins8 = {utmp[1] & kmask1, ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4)};
|
||||
utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
|
||||
utmp[0] &= kmask1;
|
||||
|
||||
@@ -2626,6 +2596,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
const uint8_t * restrict q4 = x[i].qs;
|
||||
const int8_t * restrict q8 = y[i].qs;
|
||||
|
||||
//int32x4_t isum = mzero;
|
||||
|
||||
int32_t sumi1 = 0;
|
||||
int32_t sumi2 = 0;
|
||||
|
||||
@@ -2722,13 +2694,13 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
const __m256i q8l = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
|
||||
__m256i p16l = _mm256_maddubs_epi16(q4l, q8l);
|
||||
p16l = _mm256_madd_epi16(scale_l, p16l);
|
||||
sumi = _mm256_add_epi32(sumi, p16l);
|
||||
|
||||
const __m256i q8h = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
|
||||
__m256i p16h = _mm256_maddubs_epi16(q4h, q8h);
|
||||
p16h = _mm256_madd_epi16(scale_h, p16h);
|
||||
const __m256i sumj = _mm256_add_epi32(p16l, p16h);
|
||||
sumi = _mm256_add_epi32(sumi, p16h);
|
||||
|
||||
sumi = _mm256_add_epi32(sumi, sumj);
|
||||
}
|
||||
|
||||
__m256 vd = _mm256_set1_ps(d);
|
||||
@@ -3124,11 +3096,9 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
#ifdef __ARM_NEON
|
||||
|
||||
const uint8x16_t m4b = vdupq_n_u8(0xf);
|
||||
const int32x4_t mzero = vdupq_n_s32(0);
|
||||
const uint8x16_t mone = vdupq_n_u8(1);
|
||||
const uint8x16_t mtwo = vdupq_n_u8(2);
|
||||
#if defined(__ARM_FEATURE_DOTPROD)
|
||||
const int32x4_t mzero = vdupq_n_s32(0);
|
||||
#endif
|
||||
|
||||
int8x16x4_t q5bytes;
|
||||
|
||||
@@ -3471,10 +3441,8 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
#ifdef __ARM_NEON
|
||||
|
||||
const uint8x16_t m4b = vdupq_n_u8(0xf);
|
||||
const uint8x16_t mh = vdupq_n_u8(16);
|
||||
#if defined(__ARM_FEATURE_DOTPROD)
|
||||
const int32x4_t mzero = vdupq_n_s32(0);
|
||||
#endif
|
||||
const uint8x16_t mh = vdupq_n_u8(16);
|
||||
|
||||
int8x16x4_t q5bytes;
|
||||
uint8x16x4_t q5h;
|
||||
@@ -3692,9 +3660,7 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
float sum = 0;
|
||||
|
||||
const uint8x16_t m4b = vdupq_n_u8(0xF);
|
||||
#if defined(__ARM_FEATURE_DOTPROD)
|
||||
const int32x4_t vzero = vdupq_n_s32(0);
|
||||
#endif
|
||||
//const int8x16_t m32s = vdupq_n_s8(32);
|
||||
|
||||
const uint8x16_t mone = vdupq_n_u8(3);
|
||||
@@ -4083,10 +4049,8 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
|
||||
float sum = 0;
|
||||
|
||||
const uint8x16_t m4b = vdupq_n_u8(0xF);
|
||||
const int8x16_t m32s = vdupq_n_s8(32);
|
||||
#if defined(__ARM_FEATURE_DOTPROD)
|
||||
const int32x4_t vzero = vdupq_n_s32(0);
|
||||
#endif
|
||||
const int8x16_t m32s = vdupq_n_s8(32);
|
||||
|
||||
const uint8x16_t mone = vdupq_n_u8(3);
|
||||
|
||||
|
||||
llama.h
@@ -10,7 +10,6 @@
|
||||
#endif // GGML_USE_CUBLAS
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#ifdef LLAMA_SHARED
|
||||
@@ -164,7 +163,6 @@ extern "C" {
|
||||
enum llama_ftype ftype; // quantize to this llama_ftype
|
||||
bool allow_requantize; // allow quantizing non-f32/f16 tensors
|
||||
bool quantize_output_tensor; // quantize output.weight
|
||||
bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
|
||||
} llama_model_quantize_params;
|
||||
|
||||
// grammar types
|
||||
@@ -245,17 +243,15 @@ extern "C" {
|
||||
LLAMA_API bool llama_mmap_supported (void);
|
||||
LLAMA_API bool llama_mlock_supported(void);
|
||||
|
||||
LLAMA_API int llama_n_vocab (const struct llama_context * ctx);
|
||||
LLAMA_API int llama_n_ctx (const struct llama_context * ctx);
|
||||
LLAMA_API int llama_n_ctx_train(const struct llama_context * ctx);
|
||||
LLAMA_API int llama_n_embd (const struct llama_context * ctx);
|
||||
LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
|
||||
LLAMA_API int llama_n_ctx (const struct llama_context * ctx);
|
||||
LLAMA_API int llama_n_embd (const struct llama_context * ctx);
|
||||
|
||||
LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_context * ctx);
|
||||
|
||||
LLAMA_API int llama_model_n_vocab (const struct llama_model * model);
|
||||
LLAMA_API int llama_model_n_ctx (const struct llama_model * model);
|
||||
LLAMA_API int llama_model_n_ctx_train(const struct llama_model * model);
|
||||
LLAMA_API int llama_model_n_embd (const struct llama_model * model);
|
||||
LLAMA_API int llama_model_n_vocab(const struct llama_model * model);
|
||||
LLAMA_API int llama_model_n_ctx (const struct llama_model * model);
|
||||
LLAMA_API int llama_model_n_embd (const struct llama_model * model);
|
||||
|
||||
// Get a string describing the model type
|
||||
LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);
|
||||
@@ -374,7 +370,6 @@ extern "C" {
|
||||
LLAMA_API int llama_tokenize(
|
||||
struct llama_context * ctx,
|
||||
const char * text,
|
||||
int text_len,
|
||||
llama_token * tokens,
|
||||
int n_max_tokens,
|
||||
bool add_bos);
|
||||
@@ -382,7 +377,6 @@ extern "C" {
|
||||
LLAMA_API int llama_tokenize_with_model(
|
||||
const struct llama_model * model,
|
||||
const char * text,
|
||||
int text_len,
|
||||
llama_token * tokens,
|
||||
int n_max_tokens,
|
||||
bool add_bos);
|
||||
@@ -414,8 +408,6 @@ extern "C" {
|
||||
|
||||
LLAMA_API void llama_grammar_free(struct llama_grammar * grammar);
|
||||
|
||||
LLAMA_API struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar);
|
||||
|
||||
//
|
||||
// Sampling functions
|
||||
//
|
||||
@@ -528,8 +520,6 @@ extern "C" {
|
||||
// If this is not called, or NULL is supplied, everything is output on stderr.
|
||||
LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data);
|
||||
|
||||
LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
@@ -542,9 +532,7 @@ extern "C" {
|
||||
|
||||
struct ggml_tensor;
|
||||
|
||||
const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
|
||||
struct llama_context * ctx
|
||||
);
|
||||
const std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
|
||||
|
||||
#endif // LLAMA_API_INTERNAL
|
||||
|
||||
|
||||
mypy.ini
@@ -1,5 +0,0 @@
|
||||
[mypy]
|
||||
strict = true
|
||||
allow_untyped_calls = true
|
||||
allow_untyped_defs = true
|
||||
allow_incomplete_defs = true
|
||||
@@ -16,7 +16,7 @@
|
||||
|
||||
constexpr int kVecSize = 1 << 18;
|
||||
|
||||
static float drawFromGaussianPdf(std::mt19937& rndm) {
|
||||
float drawFromGaussianPdf(std::mt19937& rndm) {
|
||||
constexpr double kScale = 1./(1. + std::mt19937::max());
|
||||
constexpr double kTwoPiTimesScale = 6.28318530717958647692*kScale;
|
||||
static float lastX;
|
||||
@@ -28,8 +28,7 @@ static float drawFromGaussianPdf(std::mt19937& rndm) {
|
||||
haveX = true;
|
||||
return r*cos(phi);
|
||||
}
|
||||
|
||||
static void fillRandomGaussianFloats(std::vector<float>& values, std::mt19937& rndm, float mean = 0) {
|
||||
void fillRandomGaussianFloats(std::vector<float>& values, std::mt19937& rndm, float mean = 0) {
|
||||
for (auto& v : values) v = mean + drawFromGaussianPdf(rndm);
|
||||
}
|
||||
|
||||
|
||||
@@ -1,4 +0,0 @@
|
||||
以下内容为人类用户与与一位智能助手的对话。
|
||||
|
||||
用户:你好!
|
||||
助手:
|
||||
@@ -1,140 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
import yaml
|
||||
|
||||
CLI_ARGS_MAIN_PERPLEXITY = [
|
||||
"batch-size", "cfg-negative-prompt", "cfg-scale", "chunks", "color", "ctx-size", "escape",
|
||||
"export", "file", "frequency-penalty", "grammar", "grammar-file", "hellaswag",
|
||||
"hellaswag-tasks", "ignore-eos", "in-prefix", "in-prefix-bos", "in-suffix", "instruct",
|
||||
"interactive", "interactive-first", "keep", "logdir", "logit-bias", "lora", "lora-base",
|
||||
"low-vram", "main-gpu", "memory-f32", "mirostat", "mirostat-ent", "mirostat-lr", "mlock",
|
||||
"model", "multiline-input", "n-gpu-layers", "n-predict", "no-mmap", "no-mul-mat-q",
|
||||
"np-penalize-nl", "numa", "ppl-output-type", "ppl-stride", "presence-penalty", "prompt",
|
||||
"prompt-cache", "prompt-cache-all", "prompt-cache-ro", "random-prompt", "repeat-last-n",
|
||||
"repeat-penalty", "reverse-prompt", "rope-freq-base", "rope-freq-scale", "rope-scale", "seed",
|
||||
"simple-io", "tensor-split", "threads", "temp", "tfs", "top-k", "top-p", "typical",
|
||||
"verbose-prompt"
|
||||
]
|
||||
|
||||
CLI_ARGS_LLAMA_BENCH = [
|
||||
"batch-size", "memory-f32", "low-vram", "model", "mul-mat-q", "n-gen", "n-gpu-layers",
|
||||
"n-prompt", "output", "repetitions", "tensor-split", "threads", "verbose"
|
||||
]
|
||||
|
||||
CLI_ARGS_SERVER = [
|
||||
"alias", "batch-size", "ctx-size", "embedding", "host", "memory-f32", "lora", "lora-base",
|
||||
"low-vram", "main-gpu", "mlock", "model", "n-gpu-layers", "n-probs", "no-mmap", "no-mul-mat-q",
|
||||
"numa", "path", "port", "rope-freq-base", "timeout", "rope-freq-scale", "tensor-split",
|
||||
"threads", "verbose"
|
||||
]
|
||||
|
||||
description = """Run llama.cpp binaries with presets from YAML file(s).
|
||||
To specify which binary should be run, specify the "binary" property (main, perplexity, llama-bench, and server are supported).
|
||||
To get a preset file template, run a llama.cpp binary with the "--logdir" CLI argument.
|
||||
|
||||
Formatting considerations:
|
||||
- The YAML property names are the same as the CLI argument names of the corresponding binary.
|
||||
- Properties must use the long name of their corresponding llama.cpp CLI arguments.
|
||||
- Like the llama.cpp binaries the property names do not differentiate between hyphens and underscores.
|
||||
- Flags must be defined as "<PROPERTY_NAME>: true" to be effective.
|
||||
- To define the logit_bias property, the expected format is "<TOKEN_ID>: <BIAS>" in the "logit_bias" namespace.
|
||||
- To define multiple "reverse_prompt" properties simultaneously the expected format is a list of strings.
|
||||
- To define a tensor split, pass a list of floats.
|
||||
"""
|
||||
usage = "run_with_preset.py [-h] [yaml_files ...] [--<ARG_NAME> <ARG_VALUE> ...]"
|
||||
epilog = ("  --<ARG_NAME> specify additional CLI args to be passed to the binary (override all preset files). "
|
||||
"Unknown args will be ignored.")
|
||||
|
||||
parser = argparse.ArgumentParser(
|
||||
description=description, usage=usage, epilog=epilog, formatter_class=argparse.RawTextHelpFormatter)
|
||||
parser.add_argument("-bin", "--binary", help="The binary to run.")
|
||||
parser.add_argument("yaml_files", nargs="*",
|
||||
help="Arbitrary number of YAML files from which to read preset values. "
|
||||
"If two files specify the same values the later one will be used.")
|
||||
|
||||
known_args, unknown_args = parser.parse_known_args()
|
||||
|
||||
if not known_args.yaml_files and not unknown_args:
|
||||
parser.print_help()
|
||||
sys.exit(0)
|
||||
|
||||
props = dict()
|
||||
|
||||
for yaml_file in known_args.yaml_files:
|
||||
with open(yaml_file, "r") as f:
|
||||
props.update(yaml.load(f, yaml.SafeLoader))
|
||||
|
||||
props = {prop.replace("_", "-"): val for prop, val in props.items()}
|
||||
|
||||
binary = props.pop("binary", "main")
|
||||
if known_args.binary:
|
||||
binary = known_args.binary
|
||||
|
||||
if os.path.exists(f"./{binary}"):
|
||||
binary = f"./{binary}"
|
||||
|
||||
if binary.lower().endswith("main") or binary.lower().endswith("perplexity"):
|
||||
cli_args = CLI_ARGS_MAIN_PERPLEXITY
|
||||
elif binary.lower().endswith("llama-bench"):
|
||||
cli_args = CLI_ARGS_LLAMA_BENCH
|
||||
elif binary.lower().endswith("server"):
|
||||
cli_args = CLI_ARGS_SERVER
|
||||
else:
|
||||
print(f"Unknown binary: {binary}")
|
||||
sys.exit(1)
|
||||
|
||||
command_list = [binary]
|
||||
|
||||
for cli_arg in cli_args:
|
||||
value = props.pop(cli_arg, None)
|
||||
|
||||
if not value or value == -1:
|
||||
continue
|
||||
|
||||
if cli_arg == "logit-bias":
|
||||
for token, bias in value.items():
|
||||
command_list.append("--logit-bias")
|
||||
command_list.append(f"{token}{bias:+}")
|
||||
continue
|
||||
|
||||
if cli_arg == "reverse-prompt" and not isinstance(value, str):
|
||||
for rp in value:
|
||||
command_list.append("--reverse-prompt")
|
||||
command_list.append(str(rp))
|
||||
continue
|
||||
|
||||
command_list.append(f"--{cli_arg}")
|
||||
|
||||
if cli_arg == "tensor-split":
|
||||
command_list.append(",".join([str(v) for v in value]))
|
||||
continue
|
||||
|
||||
value = str(value)
|
||||
|
||||
if value != "True":
|
||||
command_list.append(str(value))
|
||||
|
||||
num_unused = len(props)
|
||||
if num_unused > 10:
|
||||
print(f"The preset file contained a total of {num_unused} unused properties.")
|
||||
elif num_unused > 0:
|
||||
print("The preset file contained the following unused properties:")
|
||||
for prop, value in props.items():
|
||||
print(f" {prop}: {value}")
|
||||
|
||||
command_list += unknown_args
|
||||
|
||||
sp = subprocess.Popen(command_list)
|
||||
|
||||
while sp.returncode is None:
|
||||
try:
|
||||
sp.wait()
|
||||
except KeyboardInterrupt:
|
||||
pass
|
||||
|
||||
sys.exit(sp.returncode)
|
||||
@@ -1,69 +0,0 @@
|
||||
set(LLAMA_VERSION @LLAMA_INSTALL_VERSION@)
|
||||
set(LLAMA_BUILD_COMMIT @LLAMA_BUILD_COMMIT@)
|
||||
set(LLAMA_BUILD_NUMBER @LLAMA_BUILD_NUMBER@)
|
||||
set(LLAMA_SHARED_LIB @BUILD_SHARED_LIBS@)
|
||||
set(LLAMA_BLAS @LLAMA_BLAS@)
|
||||
set(LLAMA_CUBLAS @LLAMA_CUBLAS@)
|
||||
set(LLAMA_METAL @LLAMA_METAL@)
|
||||
set(LLAMA_MPI @LLAMA_MPI@)
|
||||
set(LLAMA_CLBLAST @LLAMA_CLBLAST@)
|
||||
set(LLAMA_HIPBLAS @LLAMA_HIPBLAS@)
|
||||
set(LLAMA_ACCELERATE @LLAMA_ACCELERATE@)
|
||||
|
||||
@PACKAGE_INIT@
|
||||
|
||||
set_and_check(LLAMA_INCLUDE_DIR "@PACKAGE_LLAMA_INCLUDE_INSTALL_DIR@")
|
||||
set_and_check(LLAMA_LIB_DIR "@PACKAGE_LLAMA_LIB_INSTALL_DIR@")
|
||||
set_and_check(LLAMA_BIN_DIR "@PACKAGE_LLAMA_BIN_INSTALL_DIR@")
|
||||
|
||||
# Ensure transitive dependencies satisfied
|
||||
|
||||
find_package(Threads REQUIRED)
|
||||
if (APPLE AND LLAMA_ACCELERATE)
|
||||
find_library(ACCELERATE_FRAMEWORK Accelerate REQUIRED)
|
||||
endif()
|
||||
|
||||
if (LLAMA_BLAS)
|
||||
find_package(BLAS REQUIRED)
|
||||
endif()
|
||||
|
||||
if (LLAMA_CUBLAS)
|
||||
find_package(CUDAToolkit REQUIRED)
|
||||
endif()
|
||||
|
||||
if (LLAMA_METAL)
|
||||
find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
|
||||
find_library(METAL_FRAMEWORK Metal REQUIRED)
|
||||
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
|
||||
endif()
|
||||
|
||||
if (LLAMA_MPI)
|
||||
find_package(MPI REQUIRED)
|
||||
endif()
|
||||
|
||||
if (LLAMA_CLBLAST)
|
||||
find_package(CLBlast REQUIRED)
|
||||
endif()
|
||||
|
||||
if (LLAMA_HIPBLAS)
|
||||
find_package(hip REQUIRED)
|
||||
find_package(hipblas REQUIRED)
|
||||
find_package(rocblas REQUIRED)
|
||||
endif()
|
||||
|
||||
find_library(llama_LIBRARY llama
|
||||
REQUIRED
|
||||
HINTS ${LLAMA_LIB_DIR})
|
||||
|
||||
set(_llama_link_deps "Threads::Threads" "@LLAMA_EXTRA_LIBS@")
|
||||
add_library(llama UNKNOWN IMPORTED)
|
||||
set_target_properties(llama
|
||||
PROPERTIES
|
||||
INTERFACE_INCLUDE_DIRECTORIES "${LLAMA_INCLUDE_DIR}"
|
||||
INTERFACE_LINK_LIBRARIES "${_llama_link_deps}"
|
||||
IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
|
||||
IMPORTED_LOCATION "${llama_LIBRARY}"
|
||||
INTERFACE_COMPILE_FEATURES cxx_std_11
|
||||
POSITION_INDEPENDENT_CODE ON )
|
||||
|
||||
check_required_components(Llama)
|
||||
@@ -2,8 +2,6 @@ set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.h.in")
|
||||
set(HEADER_FILE "${CMAKE_CURRENT_SOURCE_DIR}/build-info.h")
|
||||
set(BUILD_NUMBER 0)
|
||||
set(BUILD_COMMIT "unknown")
|
||||
set(BUILD_COMPILER "unknown")
|
||||
set(BUILD_TARGET "unknown")
|
||||
|
||||
# Look for git
|
||||
find_package(Git)
|
||||
@@ -43,45 +41,11 @@ if(Git_FOUND)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if(GIT_HEAD_RESULT EQUAL 0 AND GIT_COUNT_RESULT EQUAL 0)
|
||||
set(BUILD_COMMIT ${HEAD})
|
||||
set(BUILD_NUMBER ${COUNT})
|
||||
endif()
|
||||
|
||||
execute_process(
|
||||
COMMAND sh -c "$@ --version | head -1" _ ${CMAKE_C_COMPILER}
|
||||
OUTPUT_VARIABLE OUT
|
||||
OUTPUT_STRIP_TRAILING_WHITESPACE
|
||||
RESULT_VARIABLE RES
|
||||
)
|
||||
if (RES EQUAL 0)
|
||||
set(BUILD_COMPILER ${OUT})
|
||||
endif()
|
||||
|
||||
execute_process(
|
||||
COMMAND ${CMAKE_C_COMPILER} -dumpmachine
|
||||
OUTPUT_VARIABLE OUT
|
||||
OUTPUT_STRIP_TRAILING_WHITESPACE
|
||||
RESULT_VARIABLE RES
|
||||
)
|
||||
if (RES EQUAL 0)
|
||||
set(BUILD_TARGET ${OUT})
|
||||
endif()
|
||||
|
||||
# Only write the header if it's changed to prevent unnecessary recompilation
|
||||
if(EXISTS ${HEADER_FILE})
|
||||
file(READ ${HEADER_FILE} CONTENTS)
|
||||
string(REGEX MATCH "BUILD_COMMIT \"([^\"]*)\"" _ ${CONTENTS})
|
||||
set(OLD_COMMIT ${CMAKE_MATCH_1})
|
||||
string(REGEX MATCH "BUILD_COMPILER \"([^\"]*)\"" _ ${CONTENTS})
|
||||
set(OLD_COMPILER ${CMAKE_MATCH_1})
|
||||
string(REGEX MATCH "BUILD_TARGET \"([^\"]*)\"" _ ${CONTENTS})
|
||||
set(OLD_TARGET ${CMAKE_MATCH_1})
|
||||
if (
|
||||
NOT OLD_COMMIT STREQUAL BUILD_COMMIT OR
|
||||
NOT OLD_COMPILER STREQUAL BUILD_COMPILER OR
|
||||
NOT OLD_TARGET STREQUAL BUILD_TARGET
|
||||
)
|
||||
file(STRINGS ${HEADER_FILE} CONTENTS REGEX "BUILD_COMMIT \"([^\"]*)\"")
|
||||
list(GET CONTENTS 0 EXISTING)
|
||||
if(NOT EXISTING STREQUAL "#define BUILD_COMMIT \"${BUILD_COMMIT}\"")
|
||||
configure_file(${TEMPLATE_FILE} ${HEADER_FILE})
|
||||
endif()
|
||||
else()
|
||||
|
||||
@@ -3,7 +3,5 @@
|
||||
|
||||
#define BUILD_NUMBER @BUILD_NUMBER@
|
||||
#define BUILD_COMMIT "@BUILD_COMMIT@"
|
||||
#define BUILD_COMPILER "@BUILD_COMPILER@"
|
||||
#define BUILD_TARGET "@BUILD_TARGET@"
|
||||
|
||||
#endif // BUILD_INFO_H
|
||||
|
||||
@@ -1,35 +1,23 @@
|
||||
#!/bin/sh
|
||||
|
||||
CC=$1
|
||||
BUILD_NUMBER="0"
|
||||
BUILD_COMMIT="unknown"
|
||||
|
||||
build_number="0"
|
||||
build_commit="unknown"
|
||||
build_compiler="unknown"
|
||||
build_target="unknown"
|
||||
|
||||
if out=$(git rev-list --count HEAD); then
|
||||
# git is broken on WSL so we need to strip extra newlines
|
||||
build_number=$(printf '%s' "$out" | tr -d '\n')
|
||||
REV_LIST=$(git rev-list --count HEAD)
|
||||
if [ $? -eq 0 ]; then
|
||||
BUILD_NUMBER=$REV_LIST
|
||||
fi
|
||||
|
||||
if out=$(git rev-parse --short HEAD); then
|
||||
build_commit=$(printf '%s' "$out" | tr -d '\n')
|
||||
fi
|
||||
|
||||
if out=$($CC --version | head -1); then
|
||||
build_compiler=$out
|
||||
fi
|
||||
|
||||
if out=$($CC -dumpmachine); then
|
||||
build_target=$out
|
||||
REV_PARSE=$(git rev-parse --short HEAD)
|
||||
if [ $? -eq 0 ]; then
|
||||
BUILD_COMMIT=$REV_PARSE
|
||||
fi
|
||||
|
||||
echo "#ifndef BUILD_INFO_H"
|
||||
echo "#define BUILD_INFO_H"
|
||||
echo
|
||||
echo "#define BUILD_NUMBER $build_number"
|
||||
echo "#define BUILD_COMMIT \"$build_commit\""
|
||||
echo "#define BUILD_COMPILER \"$build_compiler\""
|
||||
echo "#define BUILD_TARGET \"$build_target\""
|
||||
echo
|
||||
echo ""
|
||||
echo "#define BUILD_NUMBER $BUILD_NUMBER" | tr -d '\n'
|
||||
echo ""
|
||||
echo "#define BUILD_COMMIT \"$BUILD_COMMIT\"" | tr -d '\n'
|
||||
echo ""
|
||||
echo "#endif // BUILD_INFO_H"
|
||||
|
||||
@@ -20,7 +20,6 @@ fi
|
||||
model="$1"
|
||||
out="../tmp/results-${model}"
|
||||
|
||||
set -o pipefail
|
||||
set -e
|
||||
|
||||
mkdir -p ${out}
|
||||
|
||||
@@ -20,7 +20,6 @@ fi
|
||||
model="$1"
|
||||
out="../tmp/results-${model}"
|
||||
|
||||
set -o pipefail
|
||||
set -e
|
||||
|
||||
mkdir -p ${out}
|
||||
|
||||
@@ -17,7 +17,6 @@ if [ ! -z "$3" ]; then
|
||||
args="$3"
|
||||
fi
|
||||
|
||||
set -o pipefail
|
||||
set -e
|
||||
|
||||
model="$1"
|
||||
|
||||
@@ -17,7 +17,7 @@ function(llama_build_and_test_executable source)
|
||||
get_filename_component(TEST_TARGET ${source} NAME_WE)
|
||||
add_executable(${TEST_TARGET} ${source})
|
||||
install(TARGETS ${TEST_TARGET} RUNTIME)
|
||||
target_link_libraries(${TEST_TARGET} PRIVATE llama common)
|
||||
target_link_libraries(${TEST_TARGET} PRIVATE ggml llama common)
|
||||
add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${ARGN})
|
||||
endfunction()
|
||||
|
||||
@@ -29,15 +29,11 @@ llama_build_executable(test-tokenizer-0-llama.cpp)
|
||||
llama_test_executable (test-tokenizer-0-llama test-tokenizer-0-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
|
||||
llama_build_executable(test-tokenizer-0-falcon.cpp)
|
||||
#llama_test_executable (test-tokenizer-0-falcon test-tokenizer-0-falcon.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
|
||||
llama_build_executable(test-tokenizer-1-llama.cpp)
|
||||
llama_test_executable (test-tokenizer-1-llama test-tokenizer-1-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
|
||||
llama_build_executable(test-tokenizer-1.cpp)
|
||||
# test-tokenizer-1 requires a BPE vocab. re-enable when we have one.
|
||||
#llama_test_executable (test-tokenizer-1.llama test-tokenizer-1.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
|
||||
#llama_test_executable(test-tokenizer-1.aquila test-tokenizer-1.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
|
||||
llama_build_and_test_executable(test-grammar-parser.cpp)
|
||||
llama_build_and_test_executable(test-llama-grammar.cpp)
|
||||
llama_build_and_test_executable(test-grad0.cpp) # SLOW
|
||||
# llama_build_and_test_executable(test-opt.cpp) # SLOW
|
||||
|
||||
# dummy executable - not installed
|
||||
get_filename_component(TEST_TARGET test-c.c NAME_WE)
|
||||
add_executable(${TEST_TARGET} test-c.c)
|
||||
target_link_libraries(${TEST_TARGET} PRIVATE llama)
|
||||
|
||||
@@ -1,3 +0,0 @@
|
||||
#include "llama.h"
|
||||
|
||||
int main(void) {}
|
||||
@@ -275,14 +275,14 @@ static bool check_gradient(
|
||||
|
||||
ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
|
||||
|
||||
const double f0 = ggml_get_f32_1d(f, 0);
|
||||
const float f0 = ggml_get_f32_1d(f, 0);
|
||||
|
||||
ggml_set_f32_1d(x[i], k, xm);
|
||||
|
||||
ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
|
||||
|
||||
const double f1 = ggml_get_f32_1d(f, 0);
|
||||
const double g0 = (f0 - f1)/(2.0*(double) eps);
|
||||
const float f1 = ggml_get_f32_1d(f, 0);
|
||||
const float g0 = (f0 - f1)/(2.0f*eps);
|
||||
|
||||
ggml_set_f32_1d(x[i], k, x0);
|
||||
|
||||
@@ -292,10 +292,10 @@ static bool check_gradient(
|
||||
|
||||
ggml_graph_compute_with_ctx(ctx0, &gb, n_threads);
|
||||
|
||||
const double g1 = ggml_get_f32_1d(x[i]->grad, k);
|
||||
const float g1 = ggml_get_f32_1d(x[i]->grad, k);
|
||||
|
||||
const double error_abs = fabs(g0 - g1);
|
||||
const double error_rel = g0 != 0 ? fabs(g0 - g1)/fabs(g0) : 0;
|
||||
const float error_abs = fabsf(g0 - g1);
|
||||
const float error_rel = g0 != 0 ? fabsf(g0 - g1)/fabsf(g0) : 0;
|
||||
|
||||
if (error_abs > max_error_abs || error_rel > max_error_rel) {
|
||||
printf("%s: ndims=%d, i=%d, k=%d, x0=%f, xm=%f, xp=%f, f0=%f, f1=%f, g0=%f, g1=%f, eps=%f, error_abs=%f, error_rel=%f\n",
|
||||
@@ -531,7 +531,7 @@ int main(int argc, const char ** argv) {
|
||||
|
||||
struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqrt(ctx0, x[0]));
|
||||
|
||||
check_gradient("sqrt", ctx0, x, f, ndims, nargs, 1e-3f, 2e-2f, 1e-1f);
|
||||
check_gradient("sqrt", ctx0, x, f, ndims, nargs, 1e-3f, INFINITY, 1e-1f);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1345,18 +1345,9 @@ int main(int argc, const char ** argv) {
|
||||
x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
|
||||
ggml_set_param(ctx0, x[0]);
|
||||
|
||||
float eps = 1e-6f;
|
||||
// dont use only sum as aggregation, because sum of softmax is always 1 -> finite differences should not work
|
||||
// instead use sum(log(soft_max()*(1-eps)+eps)); use eps to avoid log(0)
|
||||
struct ggml_tensor * f = ggml_sum(ctx0,
|
||||
ggml_log(ctx0,
|
||||
ggml_add1(ctx0,
|
||||
ggml_scale(ctx0,
|
||||
ggml_soft_max(ctx0, x[0]),
|
||||
ggml_new_f32(ctx0, 1.0f - eps)),
|
||||
ggml_new_f32(ctx0, eps))));
|
||||
struct ggml_tensor * f = ggml_sum(ctx0, ggml_soft_max(ctx0, x[0]));
|
||||
|
||||
check_gradient("softmax", ctx0, x, f, ndims, nargs, 1e-3f, 2e-1f, INFINITY);
|
||||
check_gradient("softmax", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1367,26 +1358,15 @@ int main(int argc, const char ** argv) {
|
||||
int64_t ne2[4];
|
||||
get_random_dims(ne2, 4);
|
||||
|
||||
for (int ndims = 1; ndims <= 4; ++ndims) {
|
||||
x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -0.1f, 0.1f);
|
||||
for (int ndims = 1; ndims <= 3; ++ndims) {
|
||||
x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
|
||||
x[1] = get_random_tensor_f32(ctx0, ndims, ne2, 0.0f, 1.0f);
|
||||
// the second argument to cross_entropy_loss must sum up to 1 for each row
|
||||
int nr = ggml_nrows(x[1]);
|
||||
int nc = ggml_nelements(x[1]) / nr;
|
||||
for (int ir = 0; ir < nr; ++ir) {
|
||||
float sum = 0;
|
||||
for (int ic = 0; ic < nc; ++ic) {
|
||||
sum += ((float *) x[1]->data)[ic + ir*nc];
|
||||
}
|
||||
for (int ic = 0; ic < nc; ++ic) {
|
||||
((float *) x[1]->data)[ic + ir*nc] /= sum;
|
||||
}
|
||||
}
|
||||
ggml_set_param(ctx0, x[0]);
|
||||
|
||||
struct ggml_tensor * f = ggml_cross_entropy_loss(ctx0, x[0], x[1]);
|
||||
struct ggml_tensor * f = ggml_sum(ctx0, ggml_cross_entropy_loss(ctx0, x[0], x[1]));
|
||||
|
||||
check_gradient("cross_entropy_loss", ctx0, x, f, ndims, nargs, 1e-4f, 1e-3f, INFINITY);
|
||||
check_gradient("cross_entropy_loss", ctx0, x, f, ndims, nargs, 1e-1f, 1e-2f, INFINITY);
|
||||
// finite differences regularly fails!
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1493,7 +1473,7 @@ int main(int argc, const char ** argv) {
|
||||
|
||||
struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0)));
|
||||
|
||||
check_gradient("flash_attn f32", ctx0, x, f, ndims, nargs, 1.5e-4f, 1e-3f, INFINITY);
|
||||
check_gradient("flash_attn f32", ctx0, x, f, ndims, nargs, 1.5e-4f, INFINITY, 3.5f);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1534,7 +1514,7 @@ int main(int argc, const char ** argv) {
|
||||
|
||||
struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0)));
|
||||
|
||||
check_gradient("flash_attn f16", ctx0, x, f, ndims, nargs, 1.5e-4f, 1e-3f, INFINITY);
|
||||
check_gradient("flash_attn f16", ctx0, x, f, ndims, nargs, 1.5e-4f, INFINITY, 3.5f);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -36,15 +36,15 @@
|
||||
#define GGML_PRINT(...) printf(__VA_ARGS__)
|
||||
|
||||
|
||||
static float frand(void) {
|
||||
float frand(void) {
|
||||
return (float)rand()/(float)RAND_MAX;
|
||||
}
|
||||
|
||||
static int irand(int n) {
|
||||
int irand(int n) {
|
||||
return rand()%n;
|
||||
}
|
||||
|
||||
static void get_random_dims(int64_t * dims, int ndims) {
|
||||
void get_random_dims(int64_t * dims, int ndims) {
|
||||
dims[0] = dims[1] = dims[2] = dims[3] = 1;
|
||||
|
||||
for (int i = 0; i < ndims; i++) {
|
||||
@@ -52,7 +52,7 @@ static void get_random_dims(int64_t * dims, int ndims) {
|
||||
}
|
||||
}
|
||||
|
||||
static void get_random_dims_minmax(int64_t * dims, int ndims, int min, int max) {
|
||||
void get_random_dims_minmax(int64_t * dims, int ndims, int min, int max) {
|
||||
dims[0] = dims[1] = dims[2] = dims[3] = 1;
|
||||
|
||||
for (int i = 0; i < ndims; i++) {
|
||||
@@ -61,9 +61,12 @@ static void get_random_dims_minmax(int64_t * dims, int ndims, int min, int max)
|
||||
}
|
||||
|
||||
|
||||
static struct ggml_tensor * get_random_tensor(
|
||||
struct ggml_context * ctx0, int ndims, int64_t ne[], float fmin, float fmax
|
||||
) {
|
||||
struct ggml_tensor * get_random_tensor(
|
||||
struct ggml_context * ctx0,
|
||||
int ndims,
|
||||
int64_t ne[],
|
||||
float fmin,
|
||||
float fmax) {
|
||||
struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_F32, ndims, ne);
|
||||
|
||||
switch (ndims) {
|
||||
@@ -106,11 +109,11 @@ static struct ggml_tensor * get_random_tensor(
|
||||
return result;
|
||||
}
|
||||
|
||||
static float get_element(const struct ggml_tensor * t, int idx) {
|
||||
float get_element(const struct ggml_tensor * t, int idx) {
|
||||
return ((float *)t->data)[idx];
|
||||
}
|
||||
|
||||
static void set_element(struct ggml_tensor * t, int idx, float value) {
|
||||
void set_element(struct ggml_tensor * t, int idx, float value) {
|
||||
((float *)t->data)[idx] = value;
|
||||
}
|
||||
|
||||
|
||||
Some files were not shown because too many files have changed in this diff.