Implement server mode.

This new mode works by first loading the model, then listening for TCP connections on a port. When a connection is received, arguments will be parsed using a simple protocol:

- First, the number of arguments will be read, followed by a newline character.
- Then each argument will be read, separated by the 0 byte.
- With this we build an argument vector, similar to what is passed to the program entry point. We pass this to gpt_params_parse.

Finally, `run` will be executed with the input/output streams connected to the socket.

Signed-off-by: Thiago Padilha <thiago@padilha.cc>
Remove direct access to std streams from "run"
2026-02-05 13:53:23 +02:00 · 2023-03-22 14:34:19 -03:00 · 2023-03-22 14:34:18 -03:00 · 2023-03-22 14:31:41 -03:00 · 2023-03-22 14:31:41 -03:00 · 2023-03-22 14:31:35 -03:00
428 changed files with 9806 additions and 171313 deletions
--- a/.clang-tidy
+++ b/.clang-tidy
@@ -1,23 +0,0 @@
---
-Checks: >
-    bugprone-*,
-    -bugprone-easily-swappable-parameters,
-    -bugprone-implicit-widening-of-multiplication-result,
-    -bugprone-misplaced-widening-cast,
-    -bugprone-narrowing-conversions,
-    readability-*,
-    -readability-avoid-unconditional-preprocessor-if,
-    -readability-function-cognitive-complexity,
-    -readability-identifier-length,
-    -readability-implicit-bool-conversion,
-    -readability-magic-numbers,
-    -readability-uppercase-literal-suffix,
-    clang-analyzer-*,
-    -clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,
-    performance-*,
-    portability-*,
-    misc-*,
-    -misc-const-correctness,
-    -misc-non-private-member-variables-in-classes,
-    -misc-no-recursion,
-FormatStyle: none
--- a/.devops/cloud-v-pipeline
+++ b/.devops/cloud-v-pipeline
@@ -1,22 +0,0 @@
-node('x86_runner1'){            // Running on x86 runner containing latest vector qemu, latest vector gcc and all the necessary libraries
-    stage('Cleanup'){
-        cleanWs()               // Cleaning previous CI build in workspace
-    }
-    stage('checkout repo'){
-        retry(5){               // Retry if the cloning fails due to some reason
-            checkout scm        // Clone the repo on Runner
-        }
-    }
-    stage('Compiling llama.cpp'){
-        sh'''#!/bin/bash
-            make RISCV=1 RISCV_CROSS_COMPILE=1 # Compiling llama for RISC-V
-        '''
-    }
-    stage('Running llama.cpp'){
-        sh'''#!/bin/bash
-            module load gnu-bin2/0.1            # loading latest versions of vector qemu and vector gcc
-            qemu-riscv64 -L /softwares/gnu-bin2/sysroot  -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./main -m /home/alitariq/codellama-7b.Q4_K_M.gguf -p "Anything" -n 9 > llama_log.txt            # Running llama.cpp on vector qemu-riscv64
-            cat llama_log.txt                   # Printing results
-        '''
-    }
-}
--- a/.devops/full-cuda.Dockerfile
+++ b/.devops/full-cuda.Dockerfile
@@ -1,34 +0,0 @@
-ARG UBUNTU_VERSION=22.04
-
-# This needs to generally match the container host's environment.
-ARG CUDA_VERSION=11.7.1
-
-# Target the CUDA build image
-ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
-
-FROM ${BASE_CUDA_DEV_CONTAINER} as build
-
-# Unless otherwise specified, we make a fat build.
-ARG CUDA_DOCKER_ARCH=all
-
-RUN apt-get update && \
-    apt-get install -y build-essential python3 python3-pip git
-
-COPY requirements.txt   requirements.txt
-COPY requirements       requirements
-
-RUN pip install --upgrade pip setuptools wheel \
-    && pip install -r requirements.txt
-
-WORKDIR /app
-
-COPY . .
-
-# Set nvcc architecture
-ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
-# Enable cuBLAS
-ENV LLAMA_CUBLAS=1
-
-RUN make
-
-ENTRYPOINT ["/app/.devops/tools.sh"]
--- a/.devops/full-rocm.Dockerfile
+++ b/.devops/full-rocm.Dockerfile
@@ -1,45 +0,0 @@
-ARG UBUNTU_VERSION=22.04
-
-# This needs to generally match the container host's environment.
-ARG ROCM_VERSION=5.6
-
-# Target the CUDA build image
-ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
-
-FROM ${BASE_ROCM_DEV_CONTAINER} as build
-
-# Unless otherwise specified, we make a fat build.
-# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
-# This is mostly tied to rocBLAS supported archs.
-ARG ROCM_DOCKER_ARCH=\
-    gfx803 \
-    gfx900 \
-    gfx906 \
-    gfx908 \
-    gfx90a \
-    gfx1010 \
-    gfx1030 \
-    gfx1100 \
-    gfx1101 \
-    gfx1102
-
-COPY requirements.txt   requirements.txt
-COPY requirements       requirements
-
-RUN pip install --upgrade pip setuptools wheel \
-    && pip install -r requirements.txt
-
-WORKDIR /app
-
-COPY . .
-
-# Set nvcc architecture
-ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
-# Enable ROCm
-ENV LLAMA_HIPBLAS=1
-ENV CC=/opt/rocm/llvm/bin/clang
-ENV CXX=/opt/rocm/llvm/bin/clang++
-
-RUN make
-
-ENTRYPOINT ["/app/.devops/tools.sh"]
--- a/.devops/full.Dockerfile
+++ b/.devops/full.Dockerfile
@@ -3,13 +3,10 @@ ARG UBUNTU_VERSION=22.04
 FROM ubuntu:$UBUNTU_VERSION as build

 RUN apt-get update && \
-    apt-get install -y build-essential python3 python3-pip git
-
-COPY requirements.txt   requirements.txt
-COPY requirements       requirements
+    apt-get install -y build-essential python3 python3-pip

 RUN pip install --upgrade pip setuptools wheel \
-    && pip install -r requirements.txt
+    && pip install numpy requests sentencepiece torch tqdm

 WORKDIR /app

@@ -17,6 +14,4 @@ COPY . .

 RUN make

-ENV LC_ALL=C.utf8
-
 ENTRYPOINT ["/app/.devops/tools.sh"]
--- a/.devops/llama-cpp-clblast.srpm.spec
+++ b/.devops/llama-cpp-clblast.srpm.spec
@@ -1,84 +0,0 @@
-# SRPM for building from source and packaging an RPM for RPM-based distros.
-# https://fedoraproject.org/wiki/How_to_create_an_RPM_package
-# Built and maintained by John Boero - boeroboy@gmail.com
-# In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal
-
-# Notes for llama.cpp:
-# 1. Tags are currently based on hash - which will not sort asciibetically.
-#    We need to declare standard versioning if people want to sort latest releases.
-# 2. Builds for CUDA/OpenCL support are separate, with different depenedencies.
-# 3. NVidia's developer repo must be enabled with nvcc, cublas, clblas, etc installed.
-#    Example: https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo
-# 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
-#    It is up to the user to install the correct vendor-specific support.
-
-Name:           llama.cpp-clblast
-Version:        %( date "+%%Y%%m%%d" )
-Release:        1%{?dist}
-Summary:        OpenCL Inference of LLaMA model in C/C++
-License:        MIT
-Source0:        https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
-BuildRequires:  coreutils make gcc-c++ git mesa-libOpenCL-devel clblast-devel
-Requires:       clblast
-URL:            https://github.com/ggerganov/llama.cpp
-
-%define debug_package %{nil}
-%define source_date_epoch_from_changelog 0
-
-%description
-CPU inference for Meta's Lllama2 models using default options.
-
-%prep
-%setup -n llama.cpp-master
-
-%build
-make -j LLAMA_CLBLAST=1
-
-%install
-mkdir -p %{buildroot}%{_bindir}/
-cp -p main %{buildroot}%{_bindir}/llamaclblast
-cp -p server %{buildroot}%{_bindir}/llamaclblastserver
-cp -p simple %{buildroot}%{_bindir}/llamaclblastsimple
-
-mkdir -p %{buildroot}/usr/lib/systemd/system
-%{__cat} <<EOF  > %{buildroot}/usr/lib/systemd/system/llamaclblast.service
-[Unit]
-Description=Llama.cpp server, CPU only (no GPU support in this build).
-After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target
-
-[Service]
-Type=simple
-EnvironmentFile=/etc/sysconfig/llama
-ExecStart=/usr/bin/llamaclblastserver $LLAMA_ARGS
-ExecReload=/bin/kill -s HUP $MAINPID
-Restart=never
-
-[Install]
-WantedBy=default.target
-EOF
-
-mkdir -p %{buildroot}/etc/sysconfig
-%{__cat} <<EOF  > %{buildroot}/etc/sysconfig/llama
-LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin"
-EOF
-
-%clean
-rm -rf %{buildroot}
-rm -rf %{_builddir}/*
-
-%files
-%{_bindir}/llamaclblast
-%{_bindir}/llamaclblastserver
-%{_bindir}/llamaclblastsimple
-/usr/lib/systemd/system/llamaclblast.service
-%config /etc/sysconfig/llama
-
-
-%pre
-
-%post
-
-%preun
-%postun
-
-%changelog
--- a/.devops/llama-cpp-cublas.srpm.spec
+++ b/.devops/llama-cpp-cublas.srpm.spec
@@ -1,83 +0,0 @@
-# SRPM for building from source and packaging an RPM for RPM-based distros.
-# https://fedoraproject.org/wiki/How_to_create_an_RPM_package
-# Built and maintained by John Boero - boeroboy@gmail.com
-# In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal
-
-# Notes for llama.cpp:
-# 1. Tags are currently based on hash - which will not sort asciibetically.
-#    We need to declare standard versioning if people want to sort latest releases.
-# 2. Builds for CUDA/OpenCL support are separate, with different depenedencies.
-# 3. NVidia's developer repo must be enabled with nvcc, cublas, clblas, etc installed.
-#    Example: https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo
-# 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
-#    It is up to the user to install the correct vendor-specific support.
-
-Name:           llama.cpp-cublas
-Version:        %( date "+%%Y%%m%%d" )
-Release:        1%{?dist}
-Summary:        CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL)
-License:        MIT
-Source0:        https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
-BuildRequires:  coreutils make gcc-c++ git cuda-toolkit
-Requires:       cuda-toolkit
-URL:            https://github.com/ggerganov/llama.cpp
-
-%define debug_package %{nil}
-%define source_date_epoch_from_changelog 0
-
-%description
-CPU inference for Meta's Lllama2 models using default options.
-
-%prep
-%setup -n llama.cpp-master
-
-%build
-make -j LLAMA_CUBLAS=1
-
-%install
-mkdir -p %{buildroot}%{_bindir}/
-cp -p main %{buildroot}%{_bindir}/llamacppcublas
-cp -p server %{buildroot}%{_bindir}/llamacppcublasserver
-cp -p simple %{buildroot}%{_bindir}/llamacppcublassimple
-
-mkdir -p %{buildroot}/usr/lib/systemd/system
-%{__cat} <<EOF  > %{buildroot}/usr/lib/systemd/system/llamacublas.service
-[Unit]
-Description=Llama.cpp server, CPU only (no GPU support in this build).
-After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target
-
-[Service]
-Type=simple
-EnvironmentFile=/etc/sysconfig/llama
-ExecStart=/usr/bin/llamacppcublasserver $LLAMA_ARGS
-ExecReload=/bin/kill -s HUP $MAINPID
-Restart=never
-
-[Install]
-WantedBy=default.target
-EOF
-
-mkdir -p %{buildroot}/etc/sysconfig
-%{__cat} <<EOF  > %{buildroot}/etc/sysconfig/llama
-LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin"
-EOF
-
-%clean
-rm -rf %{buildroot}
-rm -rf %{_builddir}/*
-
-%files
-%{_bindir}/llamacppcublas
-%{_bindir}/llamacppcublasserver
-%{_bindir}/llamacppcublassimple
-/usr/lib/systemd/system/llamacublas.service
-%config /etc/sysconfig/llama
-
-%pre
-
-%post
-
-%preun
-%postun
-
-%changelog
--- a/.devops/llama-cpp.srpm.spec
+++ b/.devops/llama-cpp.srpm.spec
@@ -1,85 +0,0 @@
-# SRPM for building from source and packaging an RPM for RPM-based distros.
-# https://fedoraproject.org/wiki/How_to_create_an_RPM_package
-# Built and maintained by John Boero - boeroboy@gmail.com
-# In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal
-
-# Notes for llama.cpp:
-# 1. Tags are currently based on hash - which will not sort asciibetically.
-#    We need to declare standard versioning if people want to sort latest releases.
-#    In the meantime, YYYYMMDD format will be used.
-# 2. Builds for CUDA/OpenCL support are separate, with different depenedencies.
-# 3. NVidia's developer repo must be enabled with nvcc, cublas, clblas, etc installed.
-#    Example: https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo
-# 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
-#    It is up to the user to install the correct vendor-specific support.
-
-Name:           llama.cpp
-Version:        %( date "+%%Y%%m%%d" )
-Release:        1%{?dist}
-Summary:        CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL)
-License:        MIT
-Source0:        https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
-BuildRequires:  coreutils make gcc-c++ git libstdc++-devel
-Requires:       libstdc++
-URL:            https://github.com/ggerganov/llama.cpp
-
-%define debug_package %{nil}
-%define source_date_epoch_from_changelog 0
-
-%description
-CPU inference for Meta's Lllama2 models using default options.
-Models are not included in this package and must be downloaded separately.
-
-%prep
-%setup -n llama.cpp-master
-
-%build
-make -j
-
-%install
-mkdir -p %{buildroot}%{_bindir}/
-cp -p main %{buildroot}%{_bindir}/llama
-cp -p server %{buildroot}%{_bindir}/llamaserver
-cp -p simple %{buildroot}%{_bindir}/llamasimple
-
-mkdir -p %{buildroot}/usr/lib/systemd/system
-%{__cat} <<EOF  > %{buildroot}/usr/lib/systemd/system/llama.service
-[Unit]
-Description=Llama.cpp server, CPU only (no GPU support in this build).
-After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target
-
-[Service]
-Type=simple
-EnvironmentFile=/etc/sysconfig/llama
-ExecStart=/usr/bin/llamaserver $LLAMA_ARGS
-ExecReload=/bin/kill -s HUP $MAINPID
-Restart=never
-
-[Install]
-WantedBy=default.target
-EOF
-
-mkdir -p %{buildroot}/etc/sysconfig
-%{__cat} <<EOF  > %{buildroot}/etc/sysconfig/llama
-LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin"
-EOF
-
-%clean
-rm -rf %{buildroot}
-rm -rf %{_builddir}/*
-
-%files
-%{_bindir}/llama
-%{_bindir}/llamaserver
-%{_bindir}/llamasimple
-/usr/lib/systemd/system/llama.service
-%config /etc/sysconfig/llama
-
-%pre
-
-%post
-
-%preun
-%postun
-
-%changelog
--- a/.devops/main-cuda.Dockerfile
+++ b/.devops/main-cuda.Dockerfile
@@ -1,32 +0,0 @@
-ARG UBUNTU_VERSION=22.04
-# This needs to generally match the container host's environment.
-ARG CUDA_VERSION=11.7.1
-# Target the CUDA build image
-ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
-# Target the CUDA runtime image
-ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
-
-FROM ${BASE_CUDA_DEV_CONTAINER} as build
-
-# Unless otherwise specified, we make a fat build.
-ARG CUDA_DOCKER_ARCH=all
-
-RUN apt-get update && \
-    apt-get install -y build-essential git
-
-WORKDIR /app
-
-COPY . .
-
-# Set nvcc architecture
-ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
-# Enable cuBLAS
-ENV LLAMA_CUBLAS=1
-
-RUN make
-
-FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
-
-COPY --from=build /app/main /main
-
-ENTRYPOINT [ "/main" ]
--- a/.devops/main-intel.Dockerfile
+++ b/.devops/main-intel.Dockerfile
@@ -1,26 +0,0 @@
-ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
-ARG UBUNTU_VERSION=22.04
-
-FROM intel/hpckit:$ONEAPI_VERSION as build
-
-RUN apt-get update && \
-    apt-get install -y git
-
-WORKDIR /app
-
-COPY . .
-
-# for some reasons, "-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DLLAMA_NATIVE=ON" give worse performance
-RUN mkdir build && \
-    cd build && \
-    cmake .. -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx && \
-    cmake --build . --config Release --target main server
-
-FROM ubuntu:$UBUNTU_VERSION as runtime
-
-COPY --from=build /app/build/bin/main /main
-COPY --from=build /app/build/bin/server /server
-
-ENV LC_ALL=C.utf8
-
-ENTRYPOINT [ "/main" ]
--- a/.devops/main-rocm.Dockerfile
+++ b/.devops/main-rocm.Dockerfile
@@ -1,45 +0,0 @@
-ARG UBUNTU_VERSION=22.04
-
-# This needs to generally match the container host's environment.
-ARG ROCM_VERSION=5.6
-
-# Target the CUDA build image
-ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
-
-FROM ${BASE_ROCM_DEV_CONTAINER} as build
-
-# Unless otherwise specified, we make a fat build.
-# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
-# This is mostly tied to rocBLAS supported archs.
-ARG ROCM_DOCKER_ARCH=\
-    gfx803 \
-    gfx900 \
-    gfx906 \
-    gfx908 \
-    gfx90a \
-    gfx1010 \
-    gfx1030 \
-    gfx1100 \
-    gfx1101 \
-    gfx1102
-
-COPY requirements.txt   requirements.txt
-COPY requirements       requirements
-
-RUN pip install --upgrade pip setuptools wheel \
-    && pip install -r requirements.txt
-
-WORKDIR /app
-
-COPY . .
-
-# Set nvcc architecture
-ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
-# Enable ROCm
-ENV LLAMA_HIPBLAS=1
-ENV CC=/opt/rocm/llvm/bin/clang
-ENV CXX=/opt/rocm/llvm/bin/clang++
-
-RUN make
-
-ENTRYPOINT [ "/app/main" ]
--- a/.devops/main.Dockerfile
+++ b/.devops/main.Dockerfile
@@ -3,7 +3,7 @@ ARG UBUNTU_VERSION=22.04
 FROM ubuntu:$UBUNTU_VERSION as build

 RUN apt-get update && \
-    apt-get install -y build-essential git
+    apt-get install -y build-essential

 WORKDIR /app

@@ -15,6 +15,4 @@ FROM ubuntu:$UBUNTU_VERSION as runtime

 COPY --from=build /app/main /main

-ENV LC_ALL=C.utf8
-
-ENTRYPOINT [ "/main" ]
+ENTRYPOINT [ "/main" ]
--- a/.devops/nix/apps.nix
+++ b/.devops/nix/apps.nix
@@ -1,22 +0,0 @@
-{
-  perSystem =
-    { config, lib, ... }:
-    {
-      apps =
-        let
-          inherit (config.packages) default;
-          binaries = [
-            "llama"
-            "llama-embedding"
-            "llama-server"
-            "quantize"
-            "train-text-from-scratch"
-          ];
-          mkApp = name: {
-            type = "app";
-            program = "${default}/bin/${name}";
-          };
-        in
-        lib.genAttrs binaries mkApp;
-    };
-}
--- a/.devops/nix/devshells.nix
+++ b/.devops/nix/devshells.nix
@@ -1,13 +0,0 @@
-{
-  perSystem =
-    { config, lib, ... }:
-    {
-      devShells =
-        lib.concatMapAttrs
-          (name: package: {
-            ${name} = package.passthru.shell;
-            ${name + "-extra"} = package.passthru.shell-extra;
-          })
-          config.packages;
-    };
-}
--- a/.devops/nix/jetson-support.nix
+++ b/.devops/nix/jetson-support.nix
@@ -1,39 +0,0 @@
-{ inputs, ... }:
-{
-  perSystem =
-    {
-      config,
-      system,
-      lib,
-      pkgsCuda,
-      ...
-    }:
-    {
-      legacyPackages =
-        let
-          caps.llamaPackagesXavier = "7.2";
-          caps.llamaPackagesOrin = "8.7";
-          caps.llamaPackagesTX2 = "6.2";
-          caps.llamaPackagesNano = "5.3";
-
-          pkgsFor =
-            cap:
-            import inputs.nixpkgs {
-              inherit system;
-              config = {
-                cudaSupport = true;
-                cudaCapabilities = [ cap ];
-                cudaEnableForwardCompat = false;
-                inherit (pkgsCuda.config) allowUnfreePredicate;
-              };
-            };
-        in
-        builtins.mapAttrs (name: cap: (pkgsFor cap).callPackage ./scope.nix { }) caps;
-
-      packages = lib.optionalAttrs (system == "aarch64-linux") {
-        jetson-xavier = config.legacyPackages.llamaPackagesXavier.llama-cpp;
-        jetson-orin = config.legacyPackages.llamaPackagesOrin.llama-cpp;
-        jetson-nano = config.legacyPackages.llamaPackagesNano.llama-cpp;
-      };
-    };
-}
--- a/.devops/nix/nixpkgs-instances.nix
+++ b/.devops/nix/nixpkgs-instances.nix
@@ -1,47 +0,0 @@
-{ inputs, ... }:
-{
-  # The _module.args definitions are passed on to modules as arguments. E.g.
-  # the module `{ pkgs ... }: { /* config */ }` implicitly uses
-  # `_module.args.pkgs` (defined in this case by flake-parts).
-  perSystem =
-    { system, ... }:
-    {
-      _module.args = {
-        # Note: bringing up https://zimbatm.com/notes/1000-instances-of-nixpkgs
-        # again, the below creates several nixpkgs instances which the
-        # flake-centric CLI will be forced to evaluate e.g. on `nix flake show`.
-        #
-        # This is currently "slow" and "expensive", on a certain scale.
-        # This also isn't "right" in that this hinders dependency injection at
-        # the level of flake inputs. This might get removed in the foreseeable
-        # future.
-        #
-        # Note that you can use these expressions without Nix
-        # (`pkgs.callPackage ./devops/nix/scope.nix { }` is the entry point).
-
-        pkgsCuda = import inputs.nixpkgs {
-          inherit system;
-          # Ensure dependencies use CUDA consistently (e.g. that openmpi, ucc,
-          # and ucx are built with CUDA support)
-          config.cudaSupport = true;
-          config.allowUnfreePredicate =
-            p:
-            builtins.all
-              (
-                license:
-                license.free
-                || builtins.elem license.shortName [
-                  "CUDA EULA"
-                  "cuDNN EULA"
-                ]
-              )
-              (p.meta.licenses or [ p.meta.license ]);
-        };
-        # Ensure dependencies use ROCm consistently
-        pkgsRocm = import inputs.nixpkgs {
-          inherit system;
-          config.rocmSupport = true;
-        };
-      };
-    };
-}
--- a/.devops/nix/package.nix
+++ b/.devops/nix/package.nix
@@ -1,277 +0,0 @@
-{
-  lib,
-  config,
-  stdenv,
-  mkShell,
-  cmake,
-  ninja,
-  pkg-config,
-  git,
-  python3,
-  mpi,
-  openblas, # TODO: Use the generic `blas` so users could switch between alternative implementations
-  cudaPackages,
-  darwin,
-  rocmPackages,
-  clblast,
-  useBlas ? builtins.all (x: !x) [
-    useCuda
-    useMetalKit
-    useOpenCL
-    useRocm
-  ],
-  useCuda ? config.cudaSupport,
-  useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin && !useOpenCL,
-  useMpi ? false, # Increases the runtime closure size by ~700M
-  useOpenCL ? false,
-  useRocm ? config.rocmSupport,
-  llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake
-}@inputs:
-
-let
-  inherit (lib)
-    cmakeBool
-    cmakeFeature
-    optionals
-    strings
-    versionOlder
-    ;
-
-  # It's necessary to consistently use backendStdenv when building with CUDA support,
-  # otherwise we get libstdc++ errors downstream.
-  stdenv = throw "Use effectiveStdenv instead";
-  effectiveStdenv = if useCuda then cudaPackages.backendStdenv else inputs.stdenv;
-
-  suffices =
-    lib.optionals useBlas [ "BLAS" ]
-    ++ lib.optionals useCuda [ "CUDA" ]
-    ++ lib.optionals useMetalKit [ "MetalKit" ]
-    ++ lib.optionals useMpi [ "MPI" ]
-    ++ lib.optionals useOpenCL [ "OpenCL" ]
-    ++ lib.optionals useRocm [ "ROCm" ];
-
-  pnameSuffix =
-    strings.optionalString (suffices != [ ])
-      "-${strings.concatMapStringsSep "-" strings.toLower suffices}";
-  descriptionSuffix =
-    strings.optionalString (suffices != [ ])
-      ", accelerated with ${strings.concatStringsSep ", " suffices}";
-
-  # TODO: package the Python in this repository in a Nix-like way.
-  # It'd be nice to migrate to buildPythonPackage, as well as ensure this repo
-  # is PEP 517-compatible, and ensure the correct .dist-info is generated.
-  # https://peps.python.org/pep-0517/
-  llama-python = python3.withPackages (
-    ps: [
-      ps.numpy
-      ps.sentencepiece
-    ]
-  );
-
-  # TODO(Green-Sky): find a better way to opt-into the heavy ml python runtime
-  llama-python-extra = python3.withPackages (
-    ps: [
-      ps.numpy
-      ps.sentencepiece
-      ps.tiktoken
-      ps.torchWithoutCuda
-      ps.transformers
-    ]
-  );
-
-  # apple_sdk is supposed to choose sane defaults, no need to handle isAarch64
-  # separately
-  darwinBuildInputs =
-    with darwin.apple_sdk.frameworks;
-    [
-      Accelerate
-      CoreVideo
-      CoreGraphics
-    ]
-    ++ optionals useMetalKit [ MetalKit ];
-
-  cudaBuildInputs = with cudaPackages; [
-    cuda_cccl.dev # <nv/target>
-
-    # A temporary hack for reducing the closure size, remove once cudaPackages
-    # have stopped using lndir: https://github.com/NixOS/nixpkgs/issues/271792
-    cuda_cudart.dev
-    cuda_cudart.lib
-    cuda_cudart.static
-    libcublas.dev
-    libcublas.lib
-    libcublas.static
-  ];
-
-  rocmBuildInputs = with rocmPackages; [
-    clr
-    hipblas
-    rocblas
-  ];
-in
-
-effectiveStdenv.mkDerivation (
-  finalAttrs: {
-    pname = "llama-cpp${pnameSuffix}";
-    version = llamaVersion;
-
-    # Note: none of the files discarded here are visible in the sandbox or
-    # affect the output hash. This also means they can be modified without
-    # triggering a rebuild.
-    src = lib.cleanSourceWith {
-      filter =
-        name: type:
-        let
-          noneOf = builtins.all (x: !x);
-          baseName = baseNameOf name;
-        in
-        noneOf [
-          (lib.hasSuffix ".nix" name) # Ignore *.nix files when computing outPaths
-          (lib.hasSuffix ".md" name) # Ignore *.md changes whe computing outPaths
-          (lib.hasPrefix "." baseName) # Skip hidden files and directories
-          (baseName == "flake.lock")
-        ];
-      src = lib.cleanSource ../../.;
-    };
-
-    postPatch = ''
-      substituteInPlace ./ggml-metal.m \
-        --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
-
-      # TODO: Package up each Python script or service appropriately.
-      # If we were to migrate to buildPythonPackage and prepare the `pyproject.toml`,
-      # we could make those *.py into setuptools' entrypoints
-      substituteInPlace ./*.py --replace "/usr/bin/env python" "${llama-python}/bin/python"
-    '';
-
-    nativeBuildInputs =
-      [
-        cmake
-        ninja
-        pkg-config
-        git
-      ]
-      ++ optionals useCuda [
-        cudaPackages.cuda_nvcc
-
-        # TODO: Replace with autoAddDriverRunpath
-        # once https://github.com/NixOS/nixpkgs/pull/275241 has been merged
-        cudaPackages.autoAddOpenGLRunpathHook
-      ];
-
-    buildInputs =
-      optionals effectiveStdenv.isDarwin darwinBuildInputs
-      ++ optionals useCuda cudaBuildInputs
-      ++ optionals useMpi [ mpi ]
-      ++ optionals useOpenCL [ clblast ]
-      ++ optionals useRocm rocmBuildInputs;
-
-    cmakeFlags =
-      [
-        (cmakeBool "LLAMA_NATIVE" false)
-        (cmakeBool "LLAMA_BUILD_SERVER" true)
-        (cmakeBool "BUILD_SHARED_LIBS" true)
-        (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
-        (cmakeBool "LLAMA_BLAS" useBlas)
-        (cmakeBool "LLAMA_CLBLAST" useOpenCL)
-        (cmakeBool "LLAMA_CUBLAS" useCuda)
-        (cmakeBool "LLAMA_HIPBLAS" useRocm)
-        (cmakeBool "LLAMA_METAL" useMetalKit)
-        (cmakeBool "LLAMA_MPI" useMpi)
-      ]
-      ++ optionals useCuda [
-        (
-          with cudaPackages.flags;
-          cmakeFeature "CMAKE_CUDA_ARCHITECTURES" (
-            builtins.concatStringsSep ";" (map dropDot cudaCapabilities)
-          )
-        )
-      ]
-      ++ optionals useRocm [
-        (cmakeFeature "CMAKE_C_COMPILER" "hipcc")
-        (cmakeFeature "CMAKE_CXX_COMPILER" "hipcc")
-
-        # Build all targets supported by rocBLAS. When updating search for TARGET_LIST_ROCM
-        # in https://github.com/ROCmSoftwarePlatform/rocBLAS/blob/develop/CMakeLists.txt
-        # and select the line that matches the current nixpkgs version of rocBLAS.
-        # Should likely use `rocmPackages.clr.gpuTargets`.
-        "-DAMDGPU_TARGETS=gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102"
-      ]
-      ++ optionals useMetalKit [ (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1") ]
-      ++ optionals useBlas [ (lib.cmakeFeature "LLAMA_BLAS_VENDOR" "OpenBLAS") ];
-
-    # TODO(SomeoneSerge): It's better to add proper install targets at the CMake level,
-    # if they haven't been added yet.
-    postInstall = ''
-      mv $out/bin/main $out/bin/llama
-      mv $out/bin/server $out/bin/llama-server
-      mkdir -p $out/include
-      cp $src/llama.h $out/include/
-    '';
-
-    # Define the shells here, but don't add in the inputsFrom to avoid recursion.
-    passthru = {
-      inherit
-        useBlas
-        useCuda
-        useMetalKit
-        useMpi
-        useOpenCL
-        useRocm
-        ;
-
-      shell = mkShell {
-        name = "shell-${finalAttrs.finalPackage.name}";
-        description = "contains numpy and sentencepiece";
-        buildInputs = [ llama-python ];
-        inputsFrom = [ finalAttrs.finalPackage ];
-        shellHook = ''
-          addToSearchPath "LD_LIBRARY_PATH" "${lib.getLib effectiveStdenv.cc.cc}/lib"
-        '';
-      };
-
-      shell-extra = mkShell {
-        name = "shell-extra-${finalAttrs.finalPackage.name}";
-        description = "contains numpy, sentencepiece, torchWithoutCuda, and transformers";
-        buildInputs = [ llama-python-extra ];
-        inputsFrom = [ finalAttrs.finalPackage ];
-      };
-    };
-
-    meta = {
-      # Configurations we don't want even the CI to evaluate. Results in the
-      # "unsupported platform" messages. This is mostly a no-op, because
-      # cudaPackages would've refused to evaluate anyway.
-      badPlatforms = optionals (useCuda || useOpenCL) lib.platforms.darwin;
-
-      # Configurations that are known to result in build failures. Can be
-      # overridden by importing Nixpkgs with `allowBroken = true`.
-      broken = (useMetalKit && !effectiveStdenv.isDarwin);
-
-      description = "Inference of LLaMA model in pure C/C++${descriptionSuffix}";
-      homepage = "https://github.com/ggerganov/llama.cpp/";
-      license = lib.licenses.mit;
-
-      # Accommodates `nix run` and `lib.getExe`
-      mainProgram = "llama";
-
-      # These people might respond, on the best effort basis, if you ping them
-      # in case of Nix-specific regressions or for reviewing Nix-specific PRs.
-      # Consider adding yourself to this list if you want to ensure this flake
-      # stays maintained and you're willing to invest your time. Do not add
-      # other people without their consent. Consider removing people after
-      # they've been unreachable for long periods of time.
-
-      # Note that lib.maintainers is defined in Nixpkgs, but you may just add
-      # an attrset following the same format as in
-      # https://github.com/NixOS/nixpkgs/blob/f36a80e54da29775c78d7eff0e628c2b4e34d1d7/maintainers/maintainer-list.nix
-      maintainers = with lib.maintainers; [
-        philiptaron
-        SomeoneSerge
-      ];
-
-      # Extend `badPlatforms` instead
-      platforms = lib.platforms.all;
-    };
-  }
-)
--- a/.devops/nix/scope.nix
+++ b/.devops/nix/scope.nix
@@ -1,16 +0,0 @@
-{
-  lib,
-  newScope,
-  llamaVersion ? "0.0.0",
-}:
-
-# We're using `makeScope` instead of just writing out an attrset
-# because it allows users to apply overlays later using `overrideScope'`.
-# Cf. https://noogle.dev/f/lib/makeScope
-
-lib.makeScope newScope (
-  self: {
-    inherit llamaVersion;
-    llama-cpp = self.callPackage ./package.nix { };
-  }
-)
--- a/.devops/tools.sh
+++ b/.devops/tools.sh
@@ -7,39 +7,40 @@ arg1="$1"
 # Shift the arguments to remove the first one
 shift

-if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then
-    python3 ./convert.py "$@"
-elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
-    ./quantize "$@"
-elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
-    ./main "$@"
-elif [[ "$arg1" == '--finetune' || "$arg1" == '-f' ]]; then
-    ./finetune "$@"
-elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
+# Join the remaining arguments into a single string
+arg2="$@"
+
+if [[ $arg1 == '--convert' || $arg1 == '-c' ]]; then
+    python3 ./convert-pth-to-ggml.py $arg2
+elif [[ $arg1 == '--quantize' || $arg1 == '-q' ]]; then
+    ./quantize $arg2
+elif [[ $arg1 == '--run' || $arg1 == '-r' ]]; then
+    ./main $arg2
+elif [[ $arg1 == '--download' || $arg1 == '-d' ]]; then
+    python3 ./download-pth.py $arg2
+elif [[ $arg1 == '--all-in-one' || $arg1 == '-a' ]]; then
+    echo "Downloading model..."
+    python3 ./download-pth.py "$1" "$2"
    echo "Converting PTH to GGML..."
    for i in `ls $1/$2/ggml-model-f16.bin*`; do
        if [ -f "${i/f16/q4_0}" ]; then
            echo "Skip model quantization, it already exists: ${i/f16/q4_0}"
        else
            echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..."
-            ./quantize "$i" "${i/f16/q4_0}" q4_0
+            ./quantize "$i" "${i/f16/q4_0}" 2
        fi
    done
-elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then
-    ./server "$@"
 else
    echo "Unknown command: $arg1"
    echo "Available commands: "
    echo "  --run (-r): Run a model previously converted into ggml"
    echo "              ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -n 512"
    echo "  --convert (-c): Convert a llama model into ggml"
-    echo "              ex: --outtype f16 \"/models/7B/\" "
+    echo "              ex: \"/models/7B/\" 1"
    echo "  --quantize (-q): Optimize with quantization process ggml"
    echo "              ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2"
-    echo "  --finetune (-f): Run finetune command to create a lora finetune of the model"
-    echo "              See documentation for finetune for command-line parameters"
-    echo "  --all-in-one (-a): Execute --convert & --quantize"
+    echo "  --download (-d): Download original llama model from CDN: https://agi.gpt4.org/llama/"
+    echo "              ex: \"/models/\" 7B"
+    echo "  --all-in-one (-a): Execute --download, --convert & --quantize"
    echo "              ex: \"/models/\" 7B"
-    echo "  --server (-s): Run a model on the server"
-    echo "              ex: -m /models/7B/ggml-model-q4_0.bin -c 2048 -ngl 43 -mg 1 --port 8080"
 fi
--- a/.dockerignore
+++ b/.dockerignore
@@ -1,14 +1,18 @@
 *.o
 *.a
 .cache/
-.git/
-.github/
-.gitignore
 .vs/
 .vscode/
 .DS_Store

-build*/
+build/
+build-em/
+build-debug/
+build-release/
+build-static/
+build-no-accel/
+build-sanitize-addr/
+build-sanitize-thread/

 models/*

@@ -17,4 +21,4 @@ models/*

 arm_neon.h
 compile_commands.json
-Dockerfile
+Dockerfile
--- a/.ecrc
+++ b/.ecrc
@@ -1,5 +0,0 @@
-{
-  "Disable": {
-    "IndentSize": true
-  }
-}
--- a/.editorconfig
+++ b/.editorconfig
@@ -1,28 +0,0 @@
-# https://EditorConfig.org
-
-# Top-most EditorConfig file
-root = true
-
-# Unix-style newlines with a newline ending every file, utf-8 charset
-[*]
-end_of_line = lf
-insert_final_newline = true
-trim_trailing_whitespace = true
-charset = utf-8
-indent_style = space
-indent_size = 4
-
-[Makefile]
-indent_style = tab
-
-[scripts/*.mk]
-indent_style = tab
-
-[prompts/*.txt]
-insert_final_newline = unset
-
-[examples/server/public/*]
-indent_size = 2
-
-[examples/llama.swiftui/llama.swiftui.xcodeproj/*]
-indent_style = tab
--- a/.flake8
+++ b/.flake8
@@ -1,2 +0,0 @@
-[flake8]
-max-line-length = 125
--- a/.github/ISSUE_TEMPLATE/bug.md
+++ b/.github/ISSUE_TEMPLATE/bug.md
@@ -1,9 +0,0 @@
---
-name: Bug template
-about: Used to report bugs in llama.cpp
-labels: ["bug-unconfirmed"]
-assignees: ''
-
---
-
-Please include information about your system, the steps to reproduce the bug, and the version of llama.cpp that you are using. If possible, please provide a minimal code example that reproduces the bug.
--- a/.github/ISSUE_TEMPLATE/custom.md
+++ b/.github/ISSUE_TEMPLATE/custom.md
@@ -0,0 +1,198 @@
+---
+name: Custom issue template
+about: Used to report user-related issues with the software
+title: "[User] I encountered a problem .."
+labels: ''
+assignees: ''
+
+---
+
+# Prerequisites
+
+Please answer the following questions for yourself before submitting an issue.
+
+- [ ] I am running the latest code. Development is very rapid so there are no tagged versions as of now.
+- [ ] I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md).
+- [ ] I [searched using keywords relevant to my issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/filtering-and-searching-issues-and-pull-requests) to make sure that I am creating a new issue that is not already open (or closed).
+- [ ] I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new bug or useful enhancement to share.
+
+# Expected Behavior
+
+Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do.
+
+# Current Behavior
+
+Please provide a detailed written description of what `llama.cpp` did instead.
+
+# Environment and Context 
+
+Please provide detailed information about your computer setup. This is important in case the issue is not reproducible except for under certain specific conditions.
+
+* Physical (or virtual) hardware you are using, e.g. for Linux:
+
+`$ lscpu`
+
+* Operating System, e.g. for Linux:
+
+`$ uname -a`
+
+* SDK version, e.g. for Linux:
+
+```
+$ python3 --version
+$ make --version
+$ g++ --version
+```
+
+# Models
+
+* The LLaMA models are officially distributed by Facebook and will never be provided through this repository. See this [pull request in Facebook's LLaMA repository](https://github.com/facebookresearch/llama/pull/73/files) if you need to obtain access to the model data.
+* If your issue is with model conversion please verify the `sha256sum` of each of your `consolidated*.pth` and `ggml-model-XXX.bin` files to confirm that you have the correct model data files before logging an issue. [Latest sha256 sums for your reference](https://github.com/ggerganov/llama.cpp/issues/238).
+* If your issue is with model generation quality then please at least scan the following links and papers to understand the limitations of LLaMA models. This is especially important when choosing an appropriate model size and appreciating both the significant and subtle differences between LLaMA models and ChatGPT:
+  * LLaMA:
+    * [Introducing LLaMA: A foundational, 65-billion-parameter large language model](https://ai.facebook.com/blog/large-language-model-llama-meta-ai/)
+    * [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971)
+  * GPT-3:
+    * [Language Models are Few-Shot Learners](https://arxiv.org/abs/2005.14165)
+  * GPT-3.5 / InstructGPT / ChatGPT:
+    * [Aligning language models to follow instructions](https://openai.com/research/instruction-following)
+    * [Training language models to follow instructions with human feedback](https://arxiv.org/abs/2203.02155)
+
+# Failure Information (for bugs)
+
+Please help provide information about the failure if this is a bug. If it is not a bug, please remove the rest of this template.
+
+# Steps to Reproduce
+
+Please provide detailed steps for reproducing the issue. We are not sitting in front of your screen, so the more detail the better.
+
+1. step 1
+2. step 2
+3. step 3
+4. etc.
+
+# Failure Logs
+
+Please include any relevant log snippets or files. If it works under one configuration but not under another, please provide logs for both configurations and their corresponding outputs so it is easy to see where behavior changes.
+
+Also, please try to **avoid using screenshots** if at all possible. Instead, copy/paste the console output and use [GitHub's Markdown](https://docs.github.com/en/get-started/writing-on-github/getting-started-with-writing-and-formatting-on-github/basic-writing-and-formatting-syntax) to cleanly format your logs for easy readability, for example:
+
+```
+llama.cpp$ git log | head -1
+commit 2af23d30434a677c6416812eea52ccc0af65119c
+
+llama.cpp$ lscpu | egrep "AMD|Flags"
+Vendor ID:                       AuthenticAMD
+Model name:                      AMD Ryzen Threadripper 1950X 16-Core Processor
+Flags:                           fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid amd_dcm aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb hw_pstate ssbd ibpb vmmcall fsgsbase bmi1 avx2 smep bmi2 rdseed adx smap clflushopt sha_ni xsaveopt xsavec xgetbv1 xsaves clzero irperf xsaveerptr arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif overflow_recov succor smca sme sev
+Virtualization:                  AMD-V
+
+llama.cpp$ python3 --version
+Python 3.10.9
+
+llama.cpp$ pip list | egrep "torch|numpy|sentencepiece"
+numpy                         1.24.2
+numpydoc                      1.5.0
+sentencepiece                 0.1.97
+torch                         1.13.1
+torchvision                   0.14.1
+
+llama.cpp$ make --version | head -1
+GNU Make 4.3
+
+$ md5sum ./models/65B/ggml-model-q4_0.bin
+dbdd682cce80e2d6e93cefc7449df487  ./models/65B/ggml-model-q4_0.bin
+```
+Here's a run with the Linux command [perf](https://www.brendangregg.com/perf.html):
+
+```
+llama.cpp$ perf stat ./main -m ./models/65B/ggml-model-q4_0.bin -t 16 -n 1024 -p "Please close your issue when it has been answered."
+main: seed = 1679149377
+llama_model_load: loading model from './models/65B/ggml-model-q4_0.bin' - please wait ...
+llama_model_load: n_vocab = 32000
+llama_model_load: n_ctx   = 512
+llama_model_load: n_embd  = 8192
+llama_model_load: n_mult  = 256
+llama_model_load: n_head  = 64
+llama_model_load: n_layer = 80
+llama_model_load: n_rot   = 128
+llama_model_load: f16     = 2
+llama_model_load: n_ff    = 22016
+llama_model_load: n_parts = 8
+llama_model_load: ggml ctx size = 41477.73 MB
+llama_model_load: memory_size =  2560.00 MB, n_mem = 40960
+llama_model_load: loading model part 1/8 from './models/65B/ggml-model-q4_0.bin'
+llama_model_load: .......................................................................................... done
+llama_model_load: model size =  4869.09 MB / num tensors = 723
+llama_model_load: loading model part 2/8 from './models/65B/ggml-model-q4_0.bin.1'
+llama_model_load: .......................................................................................... done
+llama_model_load: model size =  4869.09 MB / num tensors = 723
+llama_model_load: loading model part 3/8 from './models/65B/ggml-model-q4_0.bin.2'
+llama_model_load: .......................................................................................... done
+llama_model_load: model size =  4869.09 MB / num tensors = 723
+llama_model_load: loading model part 4/8 from './models/65B/ggml-model-q4_0.bin.3'
+llama_model_load: .......................................................................................... done
+llama_model_load: model size =  4869.09 MB / num tensors = 723
+llama_model_load: loading model part 5/8 from './models/65B/ggml-model-q4_0.bin.4'
+llama_model_load: .......................................................................................... done
+llama_model_load: model size =  4869.09 MB / num tensors = 723
+llama_model_load: loading model part 6/8 from './models/65B/ggml-model-q4_0.bin.5'
+llama_model_load: .......................................................................................... done
+llama_model_load: model size =  4869.09 MB / num tensors = 723
+llama_model_load: loading model part 7/8 from './models/65B/ggml-model-q4_0.bin.6'
+llama_model_load: .......................................................................................... done
+llama_model_load: model size =  4869.09 MB / num tensors = 723
+llama_model_load: loading model part 8/8 from './models/65B/ggml-model-q4_0.bin.7'
+llama_model_load: .......................................................................................... done
+llama_model_load: model size =  4869.09 MB / num tensors = 723
+
+system_info: n_threads = 16 / 32 | AVX = 1 | AVX2 = 1 | AVX512 = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | VSX = 0 | 
+
+main: prompt: 'Please close your issue when it has been answered.'
+main: number of tokens in prompt = 11
+     1 -> ''
+ 12148 -> 'Please'
+  3802 -> ' close'
+   596 -> ' your'
+  2228 -> ' issue'
+   746 -> ' when'
+   372 -> ' it'
+   756 -> ' has'
+  1063 -> ' been'
+  7699 -> ' answered'
+ 29889 -> '.'
+
+sampling parameters: temp = 0.800000, top_k = 40, top_p = 0.950000, repeat_last_n = 64, repeat_penalty = 1.300000
+
+
+Please close your issue when it has been answered.
+@duncan-donut: I'm trying to figure out what kind of "support" you need for this script and why, exactly? Is there a question about how the code works that hasn't already been addressed in one or more comments below this ticket, or are we talking something else entirely like some sorta bugfixing job because your server setup is different from mine??
+I can understand if your site needs to be running smoothly and you need help with a fix of sorts but there should really be nothing wrong here that the code itself could not handle. And given that I'm getting reports about how it works perfectly well on some other servers, what exactly are we talking? A detailed report will do wonders in helping us get this resolved for ya quickly so please take your time and describe the issue(s) you see as clearly & concisely as possible!!
+@duncan-donut: I'm not sure if you have access to cPanel but you could try these instructions. It is worth a shot! Let me know how it goes (or what error message, exactly!) when/if ya give that code a go? [end of text]
+
+
+main: mem per token = 71159620 bytes
+main:     load time = 19309.95 ms
+main:   sample time =   168.62 ms
+main:  predict time = 223895.61 ms / 888.47 ms per token
+main:    total time = 246406.42 ms
+
+ Performance counter stats for './main -m ./models/65B/ggml-model-q4_0.bin -t 16 -n 1024 -p Please close your issue when it has been answered.':
+
+        3636882.89 msec task-clock                #   14.677 CPUs utilized          
+             13509      context-switches          #    3.714 /sec                   
+              2436      cpu-migrations            #    0.670 /sec                   
+          10476679      page-faults               #    2.881 K/sec                  
+    13133115082869      cycles                    #    3.611 GHz                      (16.77%)
+       29314462753      stalled-cycles-frontend   #    0.22% frontend cycles idle     (16.76%)
+    10294402631459      stalled-cycles-backend    #   78.39% backend cycles idle      (16.74%)
+    23479217109614      instructions              #    1.79  insn per cycle         
+                                                  #    0.44  stalled cycles per insn  (16.76%)
+     2353072268027      branches                  #  647.002 M/sec                    (16.77%)
+        1998682780      branch-misses             #    0.08% of all branches          (16.76%)
+
+     247.802177522 seconds time elapsed
+
+    3618.573072000 seconds user
+      18.491698000 seconds sys
+```
--- a/.github/ISSUE_TEMPLATE/enhancement.md
+++ b/.github/ISSUE_TEMPLATE/enhancement.md
@@ -1,28 +0,0 @@
---
-name: Enhancement template
-about: Used to request enhancements for llama.cpp
-labels: ["enhancement"]
-assignees: ''
-
---
-
-# Prerequisites
-
-Please answer the following questions for yourself before submitting an issue.
-
- [ ] I am running the latest code. Development is very rapid so there are no tagged versions as of now.
- [ ] I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md).
- [ ] I [searched using keywords relevant to my issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/filtering-and-searching-issues-and-pull-requests) to make sure that I am creating a new issue that is not already open (or closed).
- [ ] I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new bug or useful enhancement to share.
-
-# Feature Description
-
-Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do as an enhancement.
-
-# Motivation
-
-Please provide a detailed written description of reasons why this feature is necessary and how it is useful to `llama.cpp` users.
-
-# Possible Implementation
-
-If you have an idea as to how it can be implemented, please write a detailed description. Feel free to give links to external sources or share visuals that might be helpful to understand the details better.
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -8,51 +8,22 @@ on:
        required: true
        type: boolean
  push:
-    branches:
-      - master
-    paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m']
+    paths: ['.github/workflows/**', 'CMakeLists.txt', 'Makefile', '**.h', '*.c', '**.cpp']
  pull_request:
-    types: [opened, synchronize, reopened]
-    paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m']
+    types: [opened, synchronize, edited, reopened, review_requested, ready_for_review]
+    paths: ['CMakeLists.txt', 'Makefile', '**.h', '*.c', '**.cpp']

 env:
-  BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
-  GGML_NLOOP: 3
-  GGML_N_THREADS: 1
+ BRANCH_NAME: ${{ github.head_ref || github.ref_name }}

 jobs:
-  ubuntu-focal-make:
-    runs-on: ubuntu-20.04
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v3
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential gcc-8
-
-      - name: Build
-        id: make_build
-        run: |
-          CC=gcc-8 make -j $(nproc)
-
-      - name: Test
-        id: make_test
-        run: |
-          CC=gcc-8 make tests -j $(nproc)
-          make test -j $(nproc)
-
-  ubuntu-latest-cmake:
+  ubuntu-latest-make:
    runs-on: ubuntu-latest

    steps:
      - name: Clone
        id: checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v1

      - name: Dependencies
        id: depends
@@ -60,569 +31,116 @@ jobs:
          sudo apt-get update
          sudo apt-get install build-essential

+      - name: Build
+        id: make_build
+        run: |
+          make
+
+  ubuntu-latest-cmake:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Clone
+        uses: actions/checkout@v1
+
+      - name: Dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential
+
+      - name: Build
+        run: |
+          mkdir build
+          cd build
+          cmake ..
+          cmake --build . --config Release
+          ctest --output-on-failure
+
+  macOS-latest-make:
+    runs-on: macos-latest
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v1
+
+      - name: Dependencies
+        id: depends
+        run: |
+          brew update
+
+      - name: Build
+        id: make_build
+        run: |
+          make
+
+  macOS-latest-cmake:
+    runs-on: macOS-latest
+
+    steps:
+      - name: Clone
+        uses: actions/checkout@v1
+
+      - name: Dependencies
+        run: |
+          brew update
+
+      - name: Build
+        run: |
+          mkdir build
+          cd build
+          cmake -DLLAMA_AVX2=OFF ..
+          cmake --build . --config Release
+          ctest --output-on-failure
+
+  windows-latest-cmake:
+    runs-on: windows-latest
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v1
+
      - name: Build
        id: cmake_build
        run: |
          mkdir build
          cd build
          cmake ..
-          cmake --build . --config Release -j $(nproc)
+          cmake --build . --config Release
+          ctest -C Release --output-on-failure

-      - name: Test
-        id: cmake_test
-        run: |
-          cd build
-          ctest -L main --verbose --timeout 900
-
-  ubuntu-latest-cmake-sanitizer:
-    runs-on: ubuntu-latest
-
-    continue-on-error: true
-
-    strategy:
-      matrix:
-        sanitizer: [ADDRESS, THREAD, UNDEFINED]
-        build_type: [Debug, Release]
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v3
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential
-
-      - name: Build
-        id: cmake_build
-        run: |
-          mkdir build
-          cd build
-          cmake .. -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
-          cmake --build . --config ${{ matrix.build_type }} -j $(nproc)
-
-      - name: Test
-        id: cmake_test
-        run: |
-          cd build
-          ctest -L main --verbose --timeout 900
-
-  ubuntu-latest-cmake-mpi:
-    runs-on: ubuntu-latest
-
-    continue-on-error: true
-
-    strategy:
-      matrix:
-        mpi_library: [mpich, libopenmpi-dev]
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v3
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential ${{ matrix.mpi_library }}
-
-      - name: Build
-        id: cmake_build
-        run: |
-          mkdir build
-          cd build
-          cmake -DLLAMA_MPI=ON ..
-          cmake --build . --config Release -j $(nproc)
-
-      - name: Test
-        id: cmake_test
-        run: |
-          cd build
-          ctest -L main --verbose
-
-  # TODO: build with LLAMA_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know
-  #       how to debug it.
-  #       ref: https://github.com/ggerganov/llama.cpp/actions/runs/7131777249/job/19420981052#step:5:1124
-  macOS-latest-make:
-    runs-on: macos-latest
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v3
-
-      - name: Dependencies
-        id: depends
-        continue-on-error: true
-        run: |
-          brew update
-
-      - name: Build
-        id: make_build
-        run: |
-          LLAMA_NO_METAL=1 make -j $(sysctl -n hw.logicalcpu)
-
-      - name: Test
-        id: make_test
-        run: |
-          LLAMA_NO_METAL=1 make tests -j $(sysctl -n hw.logicalcpu)
-          LLAMA_NO_METAL=1 make test  -j $(sysctl -n hw.logicalcpu)
-
-  # TODO: build with LLAMA_METAL=OFF because test-backend-ops fail on "Apple Paravirtual device" and I don't know
-  #       how to debug it.
-  #       ref: https://github.com/ggerganov/llama.cpp/actions/runs/7132125951/job/19422043567?pr=4359#step:5:6584
-  #       would be great if we fix these
-  macOS-latest-cmake:
-    runs-on: macos-latest
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v3
-
-      - name: Dependencies
-        id: depends
-        continue-on-error: true
-        run: |
-          brew update
-
-      - name: Build
-        id: cmake_build
-        run: |
-          sysctl -a
-          mkdir build
-          cd build
-          cmake -DLLAMA_METAL=OFF ..
-          cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
-
-      - name: Test
-        id: cmake_test
-        run: |
-          cd build
-          ctest -L main --verbose --timeout 900
-
-  macOS-latest-cmake-ios:
-    runs-on: macos-latest
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v1
-
-      - name: Dependencies
-        id: depends
-        continue-on-error: true
-        run: |
-          brew update
-
-      - name: Build
-        id: cmake_build
-        run: |
-          sysctl -a
-          mkdir build
-          cd build
-          cmake -G Xcode .. \
-            -DLLAMA_BUILD_EXAMPLES=OFF \
-            -DLLAMA_BUILD_TESTS=OFF \
-            -DLLAMA_BUILD_SERVER=OFF \
-            -DCMAKE_SYSTEM_NAME=iOS \
-            -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0
-          cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
-
-  macOS-latest-cmake-tvos:
-    runs-on: macos-latest
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v1
-
-      - name: Dependencies
-        id: depends
-        continue-on-error: true
-        run: |
-          brew update
-
-      - name: Build
-        id: cmake_build
-        run: |
-          sysctl -a
-          mkdir build
-          cd build
-          cmake -G Xcode .. \
-            -DLLAMA_BUILD_EXAMPLES=OFF \
-            -DLLAMA_BUILD_TESTS=OFF \
-            -DLLAMA_BUILD_SERVER=OFF \
-            -DCMAKE_SYSTEM_NAME=tvOS \
-            -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0
-          cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
-
-  macOS-latest-swift:
-    runs-on: macos-latest
-
-    strategy:
-      matrix:
-        destination: ['generic/platform=macOS', 'generic/platform=iOS', 'generic/platform=tvOS']
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v1
-
-      - name: Dependencies
-        id: depends
-        continue-on-error: true
-        run: |
-          brew update
-
-      - name: xcodebuild for swift package
-        id: xcodebuild
-        run: |
-          xcodebuild -scheme llama -destination "${{ matrix.destination }}"
-
-      - name: Build Swift Example
-        id: make_build_swift_example
-        run: |
-            make swift
-
-  windows-latest-cmake:
-    runs-on: windows-latest
-
-    env:
-      OPENBLAS_VERSION: 0.3.23
-      OPENCL_VERSION: 2023.04.17
-      CLBLAST_VERSION: 1.6.0
-      SDE_VERSION: 9.33.0-2024-01-07
-
-    strategy:
-      matrix:
-        include:
-          - build: 'noavx'
-            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF -DBUILD_SHARED_LIBS=ON'
-          - build: 'avx2'
-            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
-          - build: 'avx'
-            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
-          - build: 'avx512'
-            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
-          - build: 'clblast'
-            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
-          - build: 'openblas'
-            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v3
-        with:
-          fetch-depth: 0
-
-      - name: Download OpenCL SDK
-        id: get_opencl
-        if: ${{ matrix.build == 'clblast' }}
-        run: |
-          curl.exe -o $env:RUNNER_TEMP/opencl.zip -L "https://github.com/KhronosGroup/OpenCL-SDK/releases/download/v${env:OPENCL_VERSION}/OpenCL-SDK-v${env:OPENCL_VERSION}-Win-x64.zip"
-          mkdir $env:RUNNER_TEMP/opencl
-          tar.exe -xvf $env:RUNNER_TEMP/opencl.zip --strip-components=1 -C $env:RUNNER_TEMP/opencl
-
-      - name: Download CLBlast
-        id: get_clblast
-        if: ${{ matrix.build == 'clblast' }}
-        run: |
-          curl.exe -o $env:RUNNER_TEMP/clblast.7z -L "https://github.com/CNugteren/CLBlast/releases/download/${env:CLBLAST_VERSION}/CLBlast-${env:CLBLAST_VERSION}-windows-x64.7z"
-          curl.exe -o $env:RUNNER_TEMP/CLBlast.LICENSE.txt -L "https://github.com/CNugteren/CLBlast/raw/${env:CLBLAST_VERSION}/LICENSE"
-          7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/clblast.7z
-          rename-item $env:RUNNER_TEMP/CLBlast-${env:CLBLAST_VERSION}-windows-x64 clblast
-          foreach ($f in (gci -Recurse -Path "$env:RUNNER_TEMP/clblast" -Filter '*.cmake')) {
-            $txt = Get-Content -Path $f -Raw
-            $txt.Replace('C:/vcpkg/packages/opencl_x64-windows/', "$($env:RUNNER_TEMP.Replace('\','/'))/opencl/") | Set-Content -Path $f -Encoding UTF8
-          }
-
-      - name: Download OpenBLAS
-        id: get_openblas
-        if: ${{ matrix.build == 'openblas' }}
-        run: |
-          curl.exe -o $env:RUNNER_TEMP/openblas.zip -L "https://github.com/xianyi/OpenBLAS/releases/download/v${env:OPENBLAS_VERSION}/OpenBLAS-${env:OPENBLAS_VERSION}-x64.zip"
-          curl.exe -o $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt -L "https://github.com/xianyi/OpenBLAS/raw/v${env:OPENBLAS_VERSION}/LICENSE"
-          mkdir $env:RUNNER_TEMP/openblas
-          tar.exe -xvf $env:RUNNER_TEMP/openblas.zip -C $env:RUNNER_TEMP/openblas
-          $vcdir = $(vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath)
-          $msvc = $(join-path $vcdir $('VC\Tools\MSVC\'+$(gc -raw $(join-path $vcdir 'VC\Auxiliary\Build\Microsoft.VCToolsVersion.default.txt')).Trim()))
-          $lib =  $(join-path $msvc 'bin\Hostx64\x64\lib.exe')
-          & $lib /machine:x64 "/def:${env:RUNNER_TEMP}/openblas/lib/libopenblas.def" "/out:${env:RUNNER_TEMP}/openblas/lib/openblas.lib" /name:openblas.dll
-
-      - name: Build
-        id: cmake_build
-        run: |
-          mkdir build
-          cd build
-          cmake .. ${{ matrix.defines }}
-          cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
-
-      - name: Add clblast.dll
-        id: add_clblast_dll
-        if: ${{ matrix.build == 'clblast' }}
-        run: |
-          cp $env:RUNNER_TEMP/clblast/lib/clblast.dll ./build/bin/Release
-          cp $env:RUNNER_TEMP/CLBlast.LICENSE.txt ./build/bin/Release/CLBlast-${env:CLBLAST_VERSION}.txt
-
-      - name: Add libopenblas.dll
-        id: add_libopenblas_dll
-        if: ${{ matrix.build == 'openblas' }}
-        run: |
-          cp $env:RUNNER_TEMP/openblas/bin/libopenblas.dll ./build/bin/Release/openblas.dll
-          cp $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt ./build/bin/Release/OpenBLAS-${env:OPENBLAS_VERSION}.txt
-
-      - name: Check AVX512F support
-        id: check_avx512f
-        if: ${{ matrix.build == 'avx512' }}
-        continue-on-error: true
-        run: |
-          cd build
-          $vcdir = $(vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath)
-          $msvc = $(join-path $vcdir $('VC\Tools\MSVC\'+$(gc -raw $(join-path $vcdir 'VC\Auxiliary\Build\Microsoft.VCToolsVersion.default.txt')).Trim()))
-          $cl =  $(join-path $msvc 'bin\Hostx64\x64\cl.exe')
-          echo 'int main(void){unsigned int a[4];__cpuid(a,7);return !(a[1]&65536);}' >> avx512f.c
-          & $cl /O2 /GS- /kernel avx512f.c /link /nodefaultlib /entry:main
-          .\avx512f.exe && echo "AVX512F: YES" && ( echo HAS_AVX512F=1 >> $env:GITHUB_ENV ) || echo "AVX512F: NO"
-
-      - name: Test
-        id: cmake_test
-        if: ${{ matrix.build != 'clblast' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }} # not all machines have native AVX-512
-        run: |
-          cd build
-          ctest -L main -C Release --verbose --timeout 900
-
-      - name: Test (Intel SDE)
-        id: cmake_test_sde
-        if: ${{ matrix.build == 'avx512' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation
-        run: |
-          curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/813591/sde-external-${env:SDE_VERSION}-win.tar.xz"
-          # for some weird reason windows tar doesn't like sde tar.xz
-          7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar.xz
-          7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar
-          $sde = $(join-path $env:RUNNER_TEMP sde-external-${env:SDE_VERSION}-win/sde.exe)
-          cd build
-          & $sde -future -- ctest -L main -C Release --verbose --timeout 900
-
-      - name: Determine tag name
-        id: tag
-        shell: bash
-        run: |
-          BUILD_NUMBER="$(git rev-list --count HEAD)"
-          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
-          if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
-            echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
-          else
-            SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
-            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
-          fi
+      - name: Get commit hash
+        id: commit
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: pr-mpt/actions-commit-hash@v2

      - name: Pack artifacts
        id: pack_artifacts
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
        run: |
-          Copy-Item LICENSE .\build\bin\Release\llama.cpp.txt
-          7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-x64.zip .\build\bin\Release\*
-
-      - name: Upload artifacts
-        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-        uses: actions/upload-artifact@v3
-        with:
-          path: |
-            llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-x64.zip
-
-  windows-latest-cmake-cublas:
-    runs-on: windows-latest
-
-    strategy:
-      matrix:
-        cuda: ['12.2.0', '11.7.1']
-        build: ['cublas']
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v3
-        with:
-          fetch-depth: 0
-
-      - uses: Jimver/cuda-toolkit@v0.2.11
-        id: cuda-toolkit
-        with:
-          cuda: ${{ matrix.cuda }}
-          method: 'network'
-          sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]'
-
-      - name: Build
-        id: cmake_build
-        run: |
-          mkdir build
-          cd build
-          cmake .. -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON -DBUILD_SHARED_LIBS=ON
-          cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
-
-      - name: Determine tag name
-        id: tag
-        shell: bash
-        run: |
-          BUILD_NUMBER="$(git rev-list --count HEAD)"
-          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
-          if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
-            echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
-          else
-            SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
-            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
-          fi
-
-      - name: Pack artifacts
-        id: pack_artifacts
-        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-        run: |
-          7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip .\build\bin\Release\*
-
-      - name: Upload artifacts
-        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-        uses: actions/upload-artifact@v3
-        with:
-          path: |
-            llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip
-
-      - name: Copy and pack Cuda runtime
-        run: |
-          echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}"
-          $dst='.\build\bin\cudart\'
-          robocopy "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
-          7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip $dst\*
-
-      - name: Upload Cuda runtime
-        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-        uses: actions/upload-artifact@v3
-        with:
-          path: |
-            cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip
-
-  ios-xcode-build:
-    runs-on: macos-latest
-
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v3
-
-      - name: Build Xcode project
-        run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build
-
-  android-build:
-    runs-on: ubuntu-latest
-
-    steps:
-      - name: Clone
-        uses: actions/checkout@v3
-
-      - name: Set up JDK
-        uses: actions/setup-java@v3
-        with:
-          java-version: 17
-          distribution: zulu
-
-      - name: Setup Android SDK
-        uses: android-actions/setup-android@v3
-        with:
-          log-accepted-android-sdk-licenses: false
-
-      - name: Build
-        run: |
-          cd examples/llama.android
-
-          # Skip armeabi-v7a for now (https://github.com/llvm/llvm-project/issues/65820).
-          ./gradlew build --no-daemon -Pskip-armeabi-v7a
-
-#  freeBSD-latest:
-#    runs-on: macos-12
-#    steps:
-#    - name: Clone
-#      uses: actions/checkout@v3
-#
-#    - name: Build
-#      uses: cross-platform-actions/action@v0.19.0
-#      with:
-#        operating_system: freebsd
-#        version: '13.2'
-#        hypervisor: 'qemu'
-#        run: |
-#            sudo pkg update
-#            sudo pkg install -y gmake automake autoconf pkgconf llvm15 clinfo clover opencl clblast openblas
-#            gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j `sysctl -n hw.ncpu`
-
-  release:
-    if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-
-    runs-on: ubuntu-latest
-
-    needs:
-      - ubuntu-focal-make
-      - ubuntu-latest-cmake
-      - macOS-latest-make
-      - macOS-latest-cmake
-      - windows-latest-cmake
-      - windows-latest-cmake-cublas
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v3
-        with:
-          fetch-depth: 0
-
-      - name: Determine tag name
-        id: tag
-        shell: bash
-        run: |
-          BUILD_NUMBER="$(git rev-list --count HEAD)"
-          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
-          if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
-            echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
-          else
-            SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
-            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
-          fi
-
-      - name: Download artifacts
-        id: download-artifact
-        uses: actions/download-artifact@v3
+          7z a llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-x64.zip .\build\bin\Release\*

      - name: Create release
        id: create_release
-        uses: anzz1/action-create-release@v1
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: zendesk/action-create-release@v1
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
-          tag_name: ${{ steps.tag.outputs.name }}
+          tag_name: ${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}

      - name: Upload release
        id: upload_release
-        uses: actions/github-script@v3
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: actions/upload-release-asset@v1
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
-          github-token: ${{secrets.GITHUB_TOKEN}}
-          script: |
-            const path = require('path');
-            const fs = require('fs');
-            const release_id = '${{ steps.create_release.outputs.id }}';
-            for (let file of await fs.readdirSync('./artifact')) {
-              if (path.extname(file) === '.zip') {
-                console.log('uploadReleaseAsset', file);
-                await github.repos.uploadReleaseAsset({
-                  owner: context.repo.owner,
-                  repo: context.repo.repo,
-                  release_id: release_id,
-                  name: file,
-                  data: await fs.readFileSync(`./artifact/${file}`)
-                });
-              }
-            }
+          upload_url: ${{ steps.create_release.outputs.upload_url }}
+          asset_path: .\llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-x64.zip
+          asset_name: llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-x64.zip
+          asset_content_type: application/octet-stream

 #  ubuntu-latest-gcc:
 #    runs-on: ubuntu-latest
@@ -633,7 +151,7 @@ jobs:
 #
 #    steps:
 #      - name: Clone
-#        uses: actions/checkout@v3
+#        uses: actions/checkout@v1
 #
 #      - name: Dependencies
 #        run: |
@@ -657,7 +175,7 @@ jobs:
 #
 #    steps:
 #      - name: Clone
-#        uses: actions/checkout@v3
+#        uses: actions/checkout@v1
 #
 #      - name: Dependencies
 #        run: |
@@ -681,7 +199,7 @@ jobs:
 #
 #    steps:
 #      - name: Clone
-#        uses: actions/checkout@v3
+#        uses: actions/checkout@v1
 #
 #      - name: Dependencies
 #        run: |
@@ -711,7 +229,7 @@ jobs:
 #
 #    steps:
 #      - name: Clone
-#        uses: actions/checkout@v3
+#        uses: actions/checkout@v1
 #
 #      - name: Add msbuild to PATH
 #        uses: microsoft/setup-msbuild@v1
@@ -750,7 +268,7 @@ jobs:
 #
 #    steps:
 #      - name: Clone
-#        uses: actions/checkout@v3
+#        uses: actions/checkout@v1
 #
 #      - name: Add msbuild to PATH
 #        uses: microsoft/setup-msbuild@v1
@@ -796,7 +314,7 @@ jobs:
 #
 #    steps:
 #      - name: Clone
-#        uses: actions/checkout@v3
+#        uses: actions/checkout@v1
 #
 #      - name: Dependencies
 #        run: |
--- a/.github/workflows/code-coverage.yml
+++ b/.github/workflows/code-coverage.yml
@@ -1,36 +0,0 @@
-name: Code Coverage
-on: [push, pull_request]
-
-env:
-  GGML_NLOOP: 3
-  GGML_N_THREADS: 1
-
-jobs:
-  run:
-    runs-on: ubuntu-20.04
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v3
-
-      - name: Dependencies
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential gcc-8 lcov
-
-      - name: Build
-        run: CC=gcc-8 make -j LLAMA_CODE_COVERAGE=1 tests
-
-      - name: Run tests
-        run: CC=gcc-8 make test
-
-      - name: Generate coverage report
-        run: |
-          make coverage
-          make lcov-report
-
-      - name: Upload coverage to Codecov
-        uses: codecov/codecov-action@v3
-        env:
-           CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
-        with:
-          files: lcov-report/coverage.info
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -18,24 +18,14 @@ on:
 jobs:
  push_to_registry:
    name: Push Docker image to Docker Hub
-    if: github.event.pull_request.draft == false
-
    runs-on: ubuntu-latest
    env:
      COMMIT_SHA: ${{ github.sha }}
    strategy:
      matrix:
        config:
-          - { tag: "light", dockerfile: ".devops/main.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-          - { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-          # NOTE(canardletter): The CUDA builds on arm64 are very slow, so I
-          #                     have disabled them for now until the reason why
-          #                     is understood.
-          - { tag: "light-cuda", dockerfile: ".devops/main-cuda.Dockerfile", platforms: "linux/amd64" }
-          - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
-          - { tag: "light-rocm", dockerfile: ".devops/main-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-          - { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-          - { tag: "light-intel", dockerfile: ".devops/main-intel.Dockerfile", platforms: "linux/amd64" }
+          - { tag: "light", dockerfile: ".devops/main.Dockerfile" }
+          - { tag: "full", dockerfile: ".devops/full.Dockerfile" }
    steps:
      - name: Check out the repo
        uses: actions/checkout@v3
@@ -53,44 +43,13 @@ jobs:
          username: ${{ github.repository_owner }}
          password: ${{ secrets.GITHUB_TOKEN }}

-      # https://github.com/jlumbroso/free-disk-space/tree/54081f138730dfa15788a46383842cd2f914a1be#example
-      - name: Free Disk Space (Ubuntu)
-        uses: jlumbroso/free-disk-space@main
-        with:
-          # this might remove tools that are actually needed,
-          # if set to "true" but frees about 6 GB
-          tool-cache: false
-
-          # all of these default to true, but feel free to set to
-          # "false" if necessary for your workflow
-          android: true
-          dotnet: true
-          haskell: true
-          large-packages: true
-          docker-images: true
-          swap-storage: true
-
-      - name: Determine tag name
-        id: tag
-        shell: bash
-        run: |
-          BUILD_NUMBER="$(git rev-list --count HEAD)"
-          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
-          if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
-            echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
-          else
-            SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
-            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
-          fi
-
      - name: Build and push Docker image (versioned)
        if: github.event_name == 'push'
        uses: docker/build-push-action@v4
        with:
          context: .
          push: true
-          platforms: ${{ matrix.config.platforms }}
-          tags: "ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
+          tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
          file: ${{ matrix.config.dockerfile }}

      - name: Build and push Docker image (tagged)
@@ -98,6 +57,5 @@ jobs:
        with:
          context: .
          push: ${{ github.event_name == 'push' }}
-          platforms: ${{ matrix.config.platforms }}
-          tags: "ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }},ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}"
-          file: ${{ matrix.config.dockerfile }}
+          tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}"
+          file: ${{ matrix.config.dockerfile }}
--- a/.github/workflows/editorconfig.yml
+++ b/.github/workflows/editorconfig.yml
@@ -1,17 +0,0 @@
-name: EditorConfig Checker
-
-on:
-  push:
-    branches:
-      - master
-  pull_request:
-    branches:
-      - master
-
-jobs:
-  editorconfig:
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v3
-      - uses: editorconfig-checker/action-editorconfig-checker@main
-      - run: editorconfig-checker
--- a/.github/workflows/gguf-publish.yml
+++ b/.github/workflows/gguf-publish.yml
@@ -1,44 +0,0 @@
-# This workflow will upload a Python Package using Twine when a GGUF release is created
-# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
-
-# See `gguf-py/README.md` for how to make a release.
-
-# This workflow uses actions that are not certified by GitHub.
-# They are provided by a third-party and are governed by
-# separate terms of service, privacy policy, and support
-# documentation.
-
-name: Upload Python Package
-
-on:
-  workflow_dispatch:
-  push:
-    # Pattern matched against refs/tags
-    tags:
-      - 'gguf-v*'           # Push events to every version tag
-
-
-jobs:
-  deploy:
-
-    runs-on: ubuntu-latest
-
-    steps:
-    - uses: actions/checkout@v3
-    - name: Set up Python
-      uses: actions/setup-python@v2
-      with:
-        python-version: '3.9.x'
-    - name: Install dependencies
-      run: |
-        cd gguf-py
-        python -m pip install poetry
-        poetry install
-
-    - name: Build package
-      run: cd gguf-py && poetry build
-    - name: Publish package
-      uses: pypa/gh-action-pypi-publish@release/v1
-      with:
-        password: ${{ secrets.PYPI_API_TOKEN }}
-        packages-dir: gguf-py/dist
--- a/.github/workflows/nix-ci-aarch64.yml
+++ b/.github/workflows/nix-ci-aarch64.yml
@@ -1,62 +0,0 @@
-name: Nix aarch64 builds
-
-on:
-  workflow_dispatch: # allows manual triggering
-  schedule:
-    # Rebuild daily rather than on every push because QEMU is expensive (e.g.
-    # 1.5h instead of minutes with the cold cache).
-    #
-    # randint(0, 59), randint(0, 23)
-    - cron: '26 12 * * *'
-  # But also rebuild if we touched any of the Nix expressions:
-  push:
-    branches:
-      - master
-    paths: ['**/*.nix', 'flake.lock']
-  pull_request:
-    types: [opened, synchronize, reopened]
-    paths: ['**/*.nix', 'flake.lock']
-
-jobs:
-  nix-build-aarch64:
-    if: ${{ vars.CACHIX_NAME != '' }}
-    runs-on: ubuntu-latest
-    steps:
-    - name: Checkout repository
-      uses: actions/checkout@v4
-    - name: Install QEMU
-      # Copy-paste from https://github.com/orgs/community/discussions/8305#discussioncomment-5888654
-      run: |
-        sudo apt-get update
-        sudo apt-get install -y qemu-user-static qemu-system-aarch64
-        sudo usermod -a -G kvm $USER
-    - name: Install Nix
-      uses: DeterminateSystems/nix-installer-action@v9
-      with:
-        github-token: ${{ secrets.GITHUB_TOKEN }}
-        extra-conf: |
-          extra-platforms = aarch64-linux
-          extra-system-features = nixos-test kvm
-          extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org
-          extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
-    - uses: DeterminateSystems/magic-nix-cache-action@v2
-      with:
-        upstream-cache: https://${{ matrix.cachixName }}.cachix.org
-    - name: Set-up cachix to push the results to
-      uses: cachix/cachix-action@v13
-      with:
-        authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
-        name: ${{ vars.CACHIX_NAME }}
-    - name: Show all output paths
-      run: >
-          nix run github:nix-community/nix-eval-jobs
-          -- --gc-roots-dir gcroot
-          --flake
-          ".#packages.aarch64-linux"
-    - name: Build
-      run: >
-          nix run github:Mic92/nix-fast-build
-          -- --skip-cached --no-nom
-          --systems aarch64-linux
-          --flake
-          ".#checks.aarch64-linux"
--- a/.github/workflows/nix-ci.yml
+++ b/.github/workflows/nix-ci.yml
@@ -1,69 +0,0 @@
-name: Nix CI
-
-on:
-  workflow_dispatch: # allows manual triggering
-  push:
-    branches:
-      - master
-  pull_request:
-    types: [opened, synchronize, reopened]
-
-jobs:
-  nix-eval:
-    strategy:
-      fail-fast: false
-      matrix:
-        os: [ ubuntu-latest, macos-latest ]
-    runs-on: ${{ matrix.os }}
-    steps:
-    - name: Checkout repository
-      uses: actions/checkout@v4
-    - name: Install Nix
-      uses: DeterminateSystems/nix-installer-action@v9
-      with:
-        github-token: ${{ secrets.GITHUB_TOKEN }}
-        extra-conf: |
-          extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org
-          extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
-    - uses: DeterminateSystems/magic-nix-cache-action@v2
-      with:
-        upstream-cache: https://${{ matrix.cachixName }}.cachix.org
-    - name: List all flake outputs
-      run: nix flake show --all-systems
-    - name: Show all output paths
-      run: >
-          nix run github:nix-community/nix-eval-jobs
-          -- --gc-roots-dir gcroot
-          --flake
-          ".#packages.$(nix eval --raw --impure --expr builtins.currentSystem)"
-  nix-build:
-    if: ${{ vars.CACHIX_NAME != '' }}
-    strategy:
-      fail-fast: false
-      matrix:
-        os: [ ubuntu-latest, macos-latest ]
-    runs-on: ${{ matrix.os }}
-    steps:
-    - name: Checkout repository
-      uses: actions/checkout@v4
-    - name: Install Nix
-      uses: DeterminateSystems/nix-installer-action@v9
-      with:
-        github-token: ${{ secrets.GITHUB_TOKEN }}
-        extra-conf: |
-          extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org
-          extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
-    - uses: DeterminateSystems/magic-nix-cache-action@v2
-      with:
-        upstream-cache: https://${{ matrix.cachixName }}.cachix.org
-    - name: Set-up cachix to push the results to
-      uses: cachix/cachix-action@v13
-      with:
-        authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
-        name: ${{ vars.CACHIX_NAME }}
-    - name: Build
-      run: >
-          nix run github:Mic92/nix-fast-build
-          -- --skip-cached --no-nom
-          --flake
-          ".#checks.$(nix eval --raw --impure --expr builtins.currentSystem)"
--- a/.github/workflows/nix-flake-update.yml
+++ b/.github/workflows/nix-flake-update.yml
@@ -1,22 +0,0 @@
-name: update-flake-lock
-on:
-  workflow_dispatch:
-  schedule:
-    - cron: '0 0 * * 0' # runs weekly on Sunday at 00:00
-
-jobs:
-  lockfile:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - name: Install Nix
-        uses: DeterminateSystems/nix-installer-action@main
-      - name: Update flake.lock
-        uses: DeterminateSystems/update-flake-lock@main
-        with:
-          pr-title: "nix: update flake.lock"
-          pr-labels: |
-            nix
-          pr-reviewers: philiptaron,SomeoneSerge
-          token: ${{ secrets.FLAKE_TOKEN }}
--- a/.github/workflows/nix-publish-flake.yml
+++ b/.github/workflows/nix-publish-flake.yml
@@ -1,36 +0,0 @@
-# Make the flake discoverable on https://flakestry.dev and https://flakehub.com/flakes
-name: "Publish a flake to flakestry & flakehub"
-on:
-    push:
-        tags:
-        - "*"
-    workflow_dispatch:
-        inputs:
-            tag:
-                description: "The existing tag to publish"
-                type: "string"
-                required: true
-jobs:
-    flakestry-publish:
-        runs-on: ubuntu-latest
-        permissions:
-            id-token: "write"
-            contents: "read"
-        steps:
-            - uses: flakestry/flakestry-publish@main
-              with:
-                version: "${{ inputs.tag || github.ref_name }}"
-    flakehub-publish:
-      runs-on: "ubuntu-latest"
-      permissions:
-        id-token: "write"
-        contents: "read"
-      steps:
-        - uses: "actions/checkout@v4"
-          with:
-            ref: "${{ (inputs.tag != null) && format('refs/tags/{0}', inputs.tag) || '' }}"
-        - uses: "DeterminateSystems/nix-installer-action@main"
-        - uses: "DeterminateSystems/flakehub-push@main"
-          with:
-            visibility: "public"
-            tag: "${{ inputs.tag }}"
--- a/.github/workflows/python-check-requirements.yml
+++ b/.github/workflows/python-check-requirements.yml
@@ -1,29 +0,0 @@
-name: Python check requirements.txt
-
-on:
-  push:
-    paths:
-      - 'scripts/check-requirements.sh'
-      - 'convert*.py'
-      - 'requirements.txt'
-      - 'requirements/*.txt'
-  pull_request:
-    paths:
-      - 'scripts/check-requirements.sh'
-      - 'convert*.py'
-      - 'requirements.txt'
-      - 'requirements/*.txt'
-
-jobs:
-  python-check-requirements:
-    runs-on: ubuntu-latest
-    name: check-requirements
-    steps:
-      - name: Check out source repository
-        uses: actions/checkout@v3
-      - name: Set up Python environment
-        uses: actions/setup-python@v4
-        with:
-          python-version: "3.11"
-      - name: Run check-requirements.sh script
-        run:  bash scripts/check-requirements.sh nocleanup
--- a/.github/workflows/python-lint.yml
+++ b/.github/workflows/python-lint.yml
@@ -1,20 +0,0 @@
-name: flake8 Lint
-
-on: [push, pull_request]
-
-jobs:
-  flake8-lint:
-    runs-on: ubuntu-latest
-    name: Lint
-    steps:
-      - name: Check out source repository
-        uses: actions/checkout@v3
-      - name: Set up Python environment
-        uses: actions/setup-python@v4
-        with:
-          python-version: "3.11"
-      - name: flake8 Lint
-        uses: py-actions/flake8@v2
-        with:
-            ignore: "E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704"
-            exclude: "examples/*,examples/*/**,*/**/__init__.py"
--- a/.github/workflows/tidy-post.yml
+++ b/.github/workflows/tidy-post.yml
@@ -1,20 +0,0 @@
-name: clang-tidy review post comments
-
-on:
-  workflow_dispatch:
-    workflows: ["clang-tidy-review"]
-    types:
-      - completed
-
-jobs:
-  build:
-    runs-on: ubuntu-latest
-
-    steps:
-      - uses: ZedThree/clang-tidy-review/post@v0.13.0
-        # lgtm_comment_body, max_comments, and annotations need to be set on the posting workflow in a split setup
-        with:
-          # adjust options as necessary
-          lgtm_comment_body: ''
-          annotations: false
-          max_comments: 25
--- a/.github/workflows/tidy-review.yml
+++ b/.github/workflows/tidy-review.yml
@@ -1,23 +0,0 @@
-name: clang-tidy-review
-
-on:
-  pull_request:
-    branches:
-      - master
-
-jobs:
-  clang-tidy-review:
-    runs-on: ubuntu-latest
-
-    steps:
-    - uses: actions/checkout@v3
-
-    - uses: ZedThree/clang-tidy-review@v0.13.0
-      id: review
-      with:
-        lgtm_comment_body: ''
-        build_dir: build
-        cmake_command: cmake . -B build -DCMAKE_EXPORT_COMPILE_COMMANDS=on
-        split_workflow: true
-
-    - uses: ZedThree/clang-tidy-review/upload@v0.13.0
--- a/.github/workflows/zig-build.yml
+++ b/.github/workflows/zig-build.yml
@@ -1,25 +0,0 @@
-name: Zig CI
-
-on:
-  pull_request:
-  push:
-    branches:
-      - master
-
-jobs:
-  build:
-    strategy:
-      fail-fast: false
-      matrix:
-        runs-on: [ubuntu-latest, macos-latest, windows-latest]
-    runs-on: ${{ matrix.runs-on }}
-    steps:
-      - uses: actions/checkout@v3
-        with:
-          submodules: recursive
-          fetch-depth: 0
-      - uses: goto-bus-stop/setup-zig@v2
-        with:
-          version: 0.11.0
-      - name: Build Summary
-        run: zig build --summary all -freference-trace
--- a/.gitignore
+++ b/.gitignore
@@ -1,91 +1,27 @@
 *.o
 *.a
-*.so
-*.gguf
-*.bin
-*.exe
-*.dll
-*.log
-*.gcov
-*.gcno
-*.gcda
-*.dot
-*.bat
-*.metallib
-.DS_Store
-.build/
 .cache/
-.ccls-cache/
-.direnv/
-.envrc
-.swiftpm
-.venv
-.clang-tidy
 .vs/
 .vscode/
+.DS_Store

-lcov-report/
-gcovr-report/
-
-build*
-out/
-tmp/
+build/
+build-em/
+build-debug/
+build-release/
+build-static/
+build-no-accel/
+build-sanitize-addr/
+build-sanitize-thread/

 models/*
-models-mnt

-/Pipfile
-/baby-llama
-/beam-search
-/benchmark-matmult
-/convert-llama2c-to-ggml
-/embd-input-test
-/embedding
-/gguf
-/gguf-llama-simple
-/imatrix
-/infill
-/libllama.so
-/llama-bench
-/llava-cli
-/lookahead
-/lookup
 /main
-/metal
-/passkey
-/perplexity
-/q8dot
 /quantize
-/quantize-stats
 /result
-/save-load-state
-/server
-/simple
-/batched
-/batched-bench
-/export-lora
-/finetune
-/speculative
-/parallel
-/train-text-from-scratch
-/tokenize
-/vdot
-/common/build-info.cpp
+
 arm_neon.h
 compile_commands.json
-CMakeSettings.json

-__pycache__
-dist
-
-zig-out/
-zig-cache/
-
-ppl-*.txt
-qnt-*.txt
-perf-*.txt
-
-examples/jeopardy/results.txt
-
-poetry.lock
-poetry.toml
+.envrc
+.direnv/
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,15 +0,0 @@
-# See https://pre-commit.com for more information
-# See https://pre-commit.com/hooks.html for more hooks
-exclude: prompts/.*.txt
-repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
-  rev: v3.2.0
-  hooks:
-  - id: trailing-whitespace
-  - id: end-of-file-fixer
-  - id: check-yaml
-  - id: check-added-large-files
- repo: https://github.com/PyCQA/flake8
-  rev: 6.0.0
-  hooks:
-  -   id: flake8
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.14)  # for add_link_options and implicit target directories.
+cmake_minimum_required(VERSION 3.12) # Don't bump this version for no reason
 project("llama.cpp" C CXX)

 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
@@ -10,7 +10,7 @@ endif()

 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)

-if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
+if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
    set(LLAMA_STANDALONE ON)

    # configure project version
@@ -36,119 +36,53 @@ endif()
 # Option list
 #

-if (APPLE)
-    set(LLAMA_METAL_DEFAULT ON)
-else()
-    set(LLAMA_METAL_DEFAULT OFF)
-endif()
-
 # general
-option(BUILD_SHARED_LIBS                "build shared libraries"                                OFF)
-option(LLAMA_STATIC                     "llama: static link libraries"                          OFF)
-option(LLAMA_NATIVE                     "llama: enable -march=native flag"                      ON)
-option(LLAMA_LTO                        "llama: enable link time optimization"                  OFF)
-option(LLAMA_CCACHE                     "llama: use ccache if available"                        ON)
+option(LLAMA_STATIC                 "llama: static link libraries"                          OFF)
+option(LLAMA_NATIVE                 "llama: enable -march=native flag"                      OFF)
+option(LLAMA_LTO                    "llama: enable link time optimization"                  OFF)

 # debug
-option(LLAMA_ALL_WARNINGS               "llama: enable all compiler warnings"                   ON)
-option(LLAMA_ALL_WARNINGS_3RD_PARTY     "llama: enable all compiler warnings in 3rd party libs" OFF)
-option(LLAMA_GPROF                      "llama: enable gprof"                                   OFF)
+option(LLAMA_ALL_WARNINGS           "llama: enable all compiler warnings"                   ON)
+option(LLAMA_ALL_WARNINGS_3RD_PARTY "llama: enable all compiler warnings in 3rd party libs" OFF)
+option(LLAMA_GPROF                  "llama: enable gprof"                                   OFF)

 # sanitizers
-option(LLAMA_SANITIZE_THREAD            "llama: enable thread sanitizer"                        OFF)
-option(LLAMA_SANITIZE_ADDRESS           "llama: enable address sanitizer"                       OFF)
-option(LLAMA_SANITIZE_UNDEFINED         "llama: enable undefined sanitizer"                     OFF)
+option(LLAMA_SANITIZE_THREAD        "llama: enable thread sanitizer"                        OFF)
+option(LLAMA_SANITIZE_ADDRESS       "llama: enable address sanitizer"                       OFF)
+option(LLAMA_SANITIZE_UNDEFINED     "llama: enable undefined sanitizer"                     OFF)

 # instruction set specific
-if (LLAMA_NATIVE)
-    set(INS_ENB OFF)
-else()
-    set(INS_ENB ON)
-endif()
-
-option(LLAMA_AVX                             "llama: enable AVX"                                ${INS_ENB})
-option(LLAMA_AVX2                            "llama: enable AVX2"                               ${INS_ENB})
-option(LLAMA_AVX512                          "llama: enable AVX512"                             OFF)
-option(LLAMA_AVX512_VBMI                     "llama: enable AVX512-VBMI"                        OFF)
-option(LLAMA_AVX512_VNNI                     "llama: enable AVX512-VNNI"                        OFF)
-option(LLAMA_FMA                             "llama: enable FMA"                                ${INS_ENB})
-# in MSVC F16C is implied with AVX2/AVX512
-if (NOT MSVC)
-    option(LLAMA_F16C                        "llama: enable F16C"                               ${INS_ENB})
-endif()
-
-if (WIN32)
-    option(LLAMA_WIN_VER                     "llama: Windows Version"                           0x602)
-endif()
+option(LLAMA_AVX                    "llama: enable AVX"                                     ON)
+option(LLAMA_AVX2                   "llama: enable AVX2"                                    ON)
+option(LLAMA_FMA                    "llama: enable FMA"                                     ON)

 # 3rd party libs
-option(LLAMA_ACCELERATE                      "llama: enable Accelerate framework"               ON)
-option(LLAMA_BLAS                            "llama: use BLAS"                                  OFF)
-set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
-option(LLAMA_CUBLAS                          "llama: use CUDA"                                  OFF)
-#option(LLAMA_CUDA_CUBLAS                     "llama: use cuBLAS for prompt processing"          OFF)
-option(LLAMA_CUDA_FORCE_DMMV                 "llama: use dmmv instead of mmvq CUDA kernels"     OFF)
-option(LLAMA_CUDA_FORCE_MMQ                  "llama: use mmq kernels instead of cuBLAS"         OFF)
-set(LLAMA_CUDA_DMMV_X      "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
-set(LLAMA_CUDA_MMV_Y        "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
-option(LLAMA_CUDA_F16                        "llama: use 16 bit floats for some calculations"   OFF)
-set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
-set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
-                                             "llama: max. batch size for using peer access")
-option(LLAMA_HIPBLAS                         "llama: use hipBLAS"                               OFF)
-option(LLAMA_HIP_UMA                         "llama: use HIP unified memory architecture"       OFF)
-option(LLAMA_CLBLAST                         "llama: use CLBlast"                               OFF)
-option(LLAMA_METAL                           "llama: use Metal"                                 ${LLAMA_METAL_DEFAULT})
-option(LLAMA_METAL_NDEBUG                    "llama: disable Metal debugging"                   OFF)
-option(LLAMA_METAL_SHADER_DEBUG              "llama: compile Metal with -fno-fast-math"         OFF)
-option(LLAMA_MPI                             "llama: use MPI"                                   OFF)
-option(LLAMA_QKK_64                          "llama: use super-block size of 64 for k-quants"   OFF)
+option(LLAMA_ACCELERATE             "llama: enable Accelerate framework"                    ON)
+option(LLAMA_OPENBLAS               "llama: use OpenBLAS"                                   OFF)

-option(LLAMA_BUILD_TESTS                     "llama: build tests"    ${LLAMA_STANDALONE})
-option(LLAMA_BUILD_EXAMPLES                  "llama: build examples" ${LLAMA_STANDALONE})
-option(LLAMA_BUILD_SERVER                    "llama: build server example"                      ON)
-
-
-# add perf arguments
-option(LLAMA_PERF                            "llama: enable perf"                               OFF)
-if (LLAMA_PERF)
-    add_definitions(-DGGML_PERF)
-endif()
-
-# Required for relocatable CMake package
-include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
+option(LLAMA_BUILD_TESTS            "llama: build tests"    ${LLAMA_STANDALONE})
+option(LLAMA_BUILD_EXAMPLES         "llama: build examples" ${LLAMA_STANDALONE})

 #
 # Compile flags
 #

-set(CMAKE_CXX_STANDARD 11)
 set(CMAKE_CXX_STANDARD_REQUIRED true)
-set(CMAKE_C_STANDARD 11)
 set(CMAKE_C_STANDARD_REQUIRED true)
 set(THREADS_PREFER_PTHREAD_FLAG ON)
 find_package(Threads REQUIRED)
-include(CheckCXXCompilerFlag)
-
-# enable libstdc++ assertions for debug builds
-if (CMAKE_SYSTEM_NAME MATCHES "Linux")
-    add_compile_definitions($<$<CONFIG:Debug>:_GLIBCXX_ASSERTIONS>)
-endif()

 if (NOT MSVC)
    if (LLAMA_SANITIZE_THREAD)
        add_compile_options(-fsanitize=thread)
-        link_libraries(-fsanitize=thread)
    endif()

    if (LLAMA_SANITIZE_ADDRESS)
        add_compile_options(-fsanitize=address -fno-omit-frame-pointer)
-        link_libraries(-fsanitize=address)
    endif()

    if (LLAMA_SANITIZE_UNDEFINED)
        add_compile_options(-fsanitize=undefined)
-        link_libraries(-fsanitize=undefined)
    endif()
 endif()

@@ -158,407 +92,55 @@ if (APPLE AND LLAMA_ACCELERATE)
        message(STATUS "Accelerate framework found")

        add_compile_definitions(GGML_USE_ACCELERATE)
-        add_compile_definitions(ACCELERATE_NEW_LAPACK)
-        add_compile_definitions(ACCELERATE_LAPACK_ILP64)
        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${ACCELERATE_FRAMEWORK})
    else()
        message(WARNING "Accelerate framework not found")
    endif()
 endif()
-
-if (LLAMA_METAL)
-    find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
-    find_library(METAL_FRAMEWORK    Metal      REQUIRED)
-    find_library(METALKIT_FRAMEWORK MetalKit   REQUIRED)
-
-    message(STATUS "Metal framework found")
-    set(GGML_HEADERS_METAL ggml-metal.h)
-    set(GGML_SOURCES_METAL ggml-metal.m)
-
-    add_compile_definitions(GGML_USE_METAL)
-    if (LLAMA_METAL_NDEBUG)
-        add_compile_definitions(GGML_METAL_NDEBUG)
-    endif()
-
-    # get full path to the file
-    #add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/")
-
-    # copy ggml-metal.metal to bin directory
-    configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY)
-
-    if (LLAMA_METAL_SHADER_DEBUG)
-        # custom command to do the following:
-        #   xcrun -sdk macosx metal    -fno-fast-math -c ggml-metal.metal -o ggml-metal.air
-        #   xcrun -sdk macosx metallib                   ggml-metal.air   -o default.metallib
-        #
-        # note: this is the only way I found to disable fast-math in Metal. it's ugly, but at least it works
-        #       disabling fast math is needed in order to pass tests/test-backend-ops
-        # note: adding -fno-inline fixes the tests when using MTL_SHADER_VALIDATION=1
-        # note: unfortunately, we have to call it default.metallib instead of ggml.metallib
-        #       ref: https://github.com/ggerganov/whisper.cpp/issues/1720
-        set(XC_FLAGS -fno-fast-math -fno-inline -g)
-        if (LLAMA_QKK_64)
-            set(XC_FLAGS ${XC_FLAGS} -DQK_K=64)
-        endif()
-
-        add_custom_command(
-            OUTPUT ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
-            COMMAND xcrun -sdk macosx metal    ${XC_FLAGS} -c ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air
-            COMMAND xcrun -sdk macosx metallib                ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air   -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
-            DEPENDS ggml-metal.metal
-            COMMENT "Compiling Metal kernels"
-        )
-
-        add_custom_target(
-            ggml-metal ALL
-            DEPENDS ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
-        )
-    endif()
-
-    set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS}
-        ${FOUNDATION_LIBRARY}
-        ${METAL_FRAMEWORK}
-        ${METALKIT_FRAMEWORK}
-        )
-endif()
-if (LLAMA_BLAS)
+if (LLAMA_OPENBLAS)
    if (LLAMA_STATIC)
        set(BLA_STATIC ON)
    endif()
-    if ($(CMAKE_VERSION) VERSION_GREATER_EQUAL 3.22)
-        set(BLA_SIZEOF_INTEGER 8)
-    endif()

-    set(BLA_VENDOR ${LLAMA_BLAS_VENDOR})
+    set(BLA_VENDOR OpenBLAS)
    find_package(BLAS)
-
    if (BLAS_FOUND)
-        message(STATUS "BLAS found, Libraries: ${BLAS_LIBRARIES}")
+        message(STATUS "OpenBLAS found")

-        if ("${BLAS_INCLUDE_DIRS}" STREQUAL "")
-            # BLAS_INCLUDE_DIRS is missing in FindBLAS.cmake.
-            # see https://gitlab.kitware.com/cmake/cmake/-/issues/20268
-            find_package(PkgConfig REQUIRED)
-            if (${LLAMA_BLAS_VENDOR} MATCHES "Generic")
-                pkg_check_modules(DepBLAS REQUIRED blas)
-            elseif (${LLAMA_BLAS_VENDOR} MATCHES "OpenBLAS")
-                # As of openblas v0.3.22, the 64-bit is named openblas64.pc
-                pkg_check_modules(DepBLAS openblas64)
-                if (NOT DepBLAS_FOUND)
-                    pkg_check_modules(DepBLAS REQUIRED openblas)
-                endif()
-            elseif (${LLAMA_BLAS_VENDOR} MATCHES "FLAME")
-                pkg_check_modules(DepBLAS REQUIRED blis)
-            elseif (${LLAMA_BLAS_VENDOR} MATCHES "ATLAS")
-                pkg_check_modules(DepBLAS REQUIRED blas-atlas)
-            elseif (${LLAMA_BLAS_VENDOR} MATCHES "FlexiBLAS")
-                pkg_check_modules(DepBLAS REQUIRED flexiblas_api)
-            elseif (${LLAMA_BLAS_VENDOR} MATCHES "Intel")
-                # all Intel* libraries share the same include path
-                pkg_check_modules(DepBLAS REQUIRED mkl-sdl)
-            elseif (${LLAMA_BLAS_VENDOR} MATCHES "NVHPC")
-                # this doesn't provide pkg-config
-                # suggest to assign BLAS_INCLUDE_DIRS on your own
-                if ("${NVHPC_VERSION}" STREQUAL "")
-                    message(WARNING "Better to set NVHPC_VERSION")
-                else()
-                    set(DepBLAS_FOUND ON)
-                    set(DepBLAS_INCLUDE_DIRS "/opt/nvidia/hpc_sdk/${CMAKE_SYSTEM_NAME}_${CMAKE_SYSTEM_PROCESSOR}/${NVHPC_VERSION}/math_libs/include")
-                endif()
-            endif()
-            if (DepBLAS_FOUND)
-                set(BLAS_INCLUDE_DIRS ${DepBLAS_INCLUDE_DIRS})
-            else()
-                message(WARNING "BLAS_INCLUDE_DIRS neither been provided nor been automatically"
-                " detected by pkgconfig, trying to find cblas.h from possible paths...")
-                find_path(BLAS_INCLUDE_DIRS
-                    NAMES cblas.h
-                    HINTS
-                        /usr/include
-                        /usr/local/include
-                        /usr/include/openblas
-                        /opt/homebrew/opt/openblas/include
-                        /usr/local/opt/openblas/include
-                        /usr/include/x86_64-linux-gnu/openblas/include
-                )
-            endif()
-        endif()
-
-        message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}")
-        add_compile_options(${BLAS_LINKER_FLAGS})
        add_compile_definitions(GGML_USE_OPENBLAS)
-        if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${LLAMA_BLAS_VENDOR} MATCHES "Generic" OR ${LLAMA_BLAS_VENDOR} MATCHES "Intel"))
-            add_compile_definitions(GGML_BLAS_USE_MKL)
-        endif()
-        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${BLAS_LIBRARIES})
-        set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS})
-
+        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${BLAS_LIBRARIES}) # BLAS_LIBRARIES are libraries, not linker flags: link them like Accelerate above
    else()
-        message(WARNING "BLAS not found, please refer to "
-        "https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors"
-        " to set correct LLAMA_BLAS_VENDOR")
+        message(WARNING "OpenBLAS not found")
    endif()
 endif()

-if (LLAMA_QKK_64)
-    add_compile_definitions(GGML_QKK_64)
-endif()
-
-if (LLAMA_CUBLAS)
-    cmake_minimum_required(VERSION 3.17)
-
-    find_package(CUDAToolkit)
-    if (CUDAToolkit_FOUND)
-        message(STATUS "cuBLAS found")
-
-        enable_language(CUDA)
-
-        set(GGML_HEADERS_CUDA ggml-cuda.h)
-        set(GGML_SOURCES_CUDA ggml-cuda.cu)
-
-        add_compile_definitions(GGML_USE_CUBLAS)
-#        if (LLAMA_CUDA_CUBLAS)
-#            add_compile_definitions(GGML_CUDA_CUBLAS)
-#        endif()
-        if (LLAMA_CUDA_FORCE_DMMV)
-            add_compile_definitions(GGML_CUDA_FORCE_DMMV)
-        endif()
-        if (LLAMA_CUDA_FORCE_MMQ)
-            add_compile_definitions(GGML_CUDA_FORCE_MMQ)
-        endif()
-        add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
-        add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
-        if (DEFINED LLAMA_CUDA_DMMV_Y)
-            add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_DMMV_Y}) # for backwards compatibility
-        endif()
-        if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
-            add_compile_definitions(GGML_CUDA_F16)
-        endif()
-        add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
-        add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${LLAMA_CUDA_PEER_MAX_BATCH_SIZE})
-
-        if (LLAMA_STATIC)
-            if (WIN32)
-                # As of 12.3.1 CUDA Tookit for Windows does not offer a static cublas library
-                set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
-            else ()
-                set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
-            endif()
-        else()
-            set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
-        endif()
-
-        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cuda_driver)
-
-    if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
-        # 52 == lowest CUDA 12 standard
-        # 60 == f16 CUDA intrinsics
-        # 61 == integer CUDA intrinsics
-        # 70 == compute capability at which unrolling a loop in mul_mat_q kernels is faster
-        if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
-            set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
-        else()
-            set(CMAKE_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
-            #set(CMAKE_CUDA_ARCHITECTURES "") # use this to compile much faster, but only F16 models work
-        endif()
-    endif()
-    message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
-
-    else()
-        message(WARNING "cuBLAS not found")
-    endif()
-endif()
-
-if (LLAMA_MPI)
-    cmake_minimum_required(VERSION 3.10)
-    find_package(MPI)
-    if (MPI_C_FOUND)
-        message(STATUS "MPI found")
-        set(GGML_HEADERS_MPI ggml-mpi.h)
-        set(GGML_SOURCES_MPI ggml-mpi.c ggml-mpi.h)
-        add_compile_definitions(GGML_USE_MPI)
-        add_compile_definitions(${MPI_C_COMPILE_DEFINITIONS})
-        if (NOT MSVC)
-            add_compile_options(-Wno-cast-qual)
-        endif()
-        set(LLAMA_EXTRA_LIBS     ${LLAMA_EXTRA_LIBS}     ${MPI_C_LIBRARIES})
-        set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${MPI_C_INCLUDE_DIRS})
-        # Even if you're only using the C header, C++ programs may bring in MPI
-        # C++ functions, so more linkage is needed
-        if (MPI_CXX_FOUND)
-            set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS}     ${MPI_CXX_LIBRARIES})
-        endif()
-    else()
-        message(WARNING "MPI not found")
-    endif()
-endif()
-
-if (LLAMA_CLBLAST)
-    find_package(CLBlast)
-    if (CLBlast_FOUND)
-        message(STATUS "CLBlast found")
-
-        set(GGML_HEADERS_OPENCL ggml-opencl.h)
-        set(GGML_SOURCES_OPENCL ggml-opencl.cpp)
-
-        add_compile_definitions(GGML_USE_CLBLAST)
-
-        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} clblast)
-    else()
-        message(WARNING "CLBlast not found")
-    endif()
-endif()
-
-if (LLAMA_HIPBLAS)
-    list(APPEND CMAKE_PREFIX_PATH /opt/rocm)
-
-    if (NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang")
-        message(WARNING "Only LLVM is supported for HIP, hint: CC=/opt/rocm/llvm/bin/clang")
-    endif()
-    if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
-        message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++")
-    endif()
-
-    find_package(hip)
-    find_package(hipblas)
-    find_package(rocblas)
-
-    if (${hipblas_FOUND} AND ${hip_FOUND})
-        message(STATUS "HIP and hipBLAS found")
-        add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS)
-        if (LLAMA_HIP_UMA)
-            add_compile_definitions(GGML_HIP_UMA)
-        endif()
-        add_library(ggml-rocm OBJECT ggml-cuda.cu ggml-cuda.h)
-        if (BUILD_SHARED_LIBS)
-            set_target_properties(ggml-rocm PROPERTIES POSITION_INDEPENDENT_CODE ON)
-        endif()
-        if (LLAMA_CUDA_FORCE_DMMV)
-            target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_DMMV)
-        endif()
-        if (LLAMA_CUDA_FORCE_MMQ)
-            target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_MMQ)
-        endif()
-        target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
-        target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
-        target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
-        set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX)
-        target_link_libraries(ggml-rocm PRIVATE hip::device PUBLIC hip::host roc::rocblas roc::hipblas)
-
-        if (LLAMA_STATIC)
-            message(FATAL_ERROR "Static linking not supported for HIP/ROCm")
-        endif()
-        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ggml-rocm)
-    else()
-        message(WARNING "hipBLAS or HIP not found. Try setting CMAKE_PREFIX_PATH=/opt/rocm")
-    endif()
-endif()
-
-function(get_flags CCID CCVER)
-    set(C_FLAGS "")
-    set(CXX_FLAGS "")
-
-    if (CCID MATCHES "Clang")
-        set(C_FLAGS   -Wunreachable-code-break -Wunreachable-code-return)
-        set(CXX_FLAGS -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi)
-
-        if (
-            (CCID STREQUAL "Clang"      AND CCVER VERSION_GREATER_EQUAL 3.8.0) OR
-            (CCID STREQUAL "AppleClang" AND CCVER VERSION_GREATER_EQUAL 7.3.0)
-        )
-            list(APPEND C_FLAGS -Wdouble-promotion)
-        endif()
-    elseif (CCID STREQUAL "GNU")
-        set(C_FLAGS   -Wdouble-promotion)
-        set(CXX_FLAGS -Wno-array-bounds)
-
-        if (CCVER VERSION_GREATER_EQUAL 7.1.0)
-            list(APPEND CXX_FLAGS -Wno-format-truncation)
-        endif()
-        if (CCVER VERSION_GREATER_EQUAL 8.1.0)
-            list(APPEND CXX_FLAGS -Wextra-semi)
-        endif()
-    elseif (CCID MATCHES "Intel")
-        # enable max optimization level when using Intel compiler
-        set(C_FLAGS   -ipo -O3 -static -fp-model=fast -flto -fno-stack-protector)
-        set(CXX_FLAGS -ipo -O3 -static -fp-model=fast -flto -fno-stack-protector)
-        add_link_options(-fuse-ld=lld -static-intel)
-    endif()
-
-    set(GF_C_FLAGS   ${C_FLAGS}   PARENT_SCOPE)
-    set(GF_CXX_FLAGS ${CXX_FLAGS} PARENT_SCOPE)
-endfunction()
-
 if (LLAMA_ALL_WARNINGS)
    if (NOT MSVC)
-        set(WARNING_FLAGS -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function)
-        set(C_FLAGS       -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes
-                          -Werror=implicit-int -Werror=implicit-function-declaration)
-        set(CXX_FLAGS     -Wmissing-declarations -Wmissing-noreturn)
-
-        set(C_FLAGS   ${WARNING_FLAGS} ${C_FLAGS})
-        set(CXX_FLAGS ${WARNING_FLAGS} ${CXX_FLAGS})
-
-        get_flags(${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION})
-
-        add_compile_options("$<$<COMPILE_LANGUAGE:C>:${C_FLAGS};${GF_C_FLAGS}>"
-                            "$<$<COMPILE_LANGUAGE:CXX>:${CXX_FLAGS};${GF_CXX_FLAGS}>")
+        set(c_flags
+            -Wall
+            -Wextra
+            -Wpedantic
+            -Wshadow
+            -Wcast-qual
+            -Wstrict-prototypes
+            -Wpointer-arith
+            -Wno-unused-function
+        )
+        set(cxx_flags
+            -Wall
+            -Wextra
+            -Wpedantic
+            -Wcast-qual
+        )
    else()
        # todo : msvc
-        set(C_FLAGS   "")
-        set(CXX_FLAGS "")
-    endif()
-endif()
-
-set(CUDA_CXX_FLAGS "")
-
-if (LLAMA_CUBLAS)
-    set(CUDA_FLAGS ${CXX_FLAGS} -use_fast_math)
-    if (NOT MSVC)
-        list(APPEND CUDA_FLAGS -Wno-pedantic)
    endif()

-    if (LLAMA_ALL_WARNINGS AND NOT MSVC)
-        set(NVCC_CMD ${CMAKE_CUDA_COMPILER} .c)
-        if (NOT CMAKE_CUDA_HOST_COMPILER STREQUAL "")
-            list(APPEND NVCC_CMD -ccbin ${CMAKE_CUDA_HOST_COMPILER})
-        endif()
+    add_compile_options(
+            "$<$<COMPILE_LANGUAGE:C>:${c_flags}>"
+            "$<$<COMPILE_LANGUAGE:CXX>:${cxx_flags}>"
+    )

-        execute_process(
-            COMMAND ${NVCC_CMD} -Xcompiler --version
-            OUTPUT_VARIABLE CUDA_CCFULLVER
-            ERROR_QUIET
-        )
-
-        if (NOT CUDA_CCFULLVER MATCHES clang)
-            set(CUDA_CCID "GNU")
-            execute_process(
-                COMMAND ${NVCC_CMD} -Xcompiler "-dumpfullversion -dumpversion"
-                OUTPUT_VARIABLE CUDA_CCVER
-                ERROR_QUIET
-            )
-        else()
-            if (CUDA_CCFULLVER MATCHES Apple)
-                set(CUDA_CCID "AppleClang")
-            else()
-                set(CUDA_CCID "Clang")
-            endif()
-            string(REGEX REPLACE "^.* version ([0-9.]*).*$" "\\1" CUDA_CCVER ${CUDA_CCFULLVER})
-        endif()
-
-        message("-- CUDA host compiler is ${CUDA_CCID} ${CUDA_CCVER}")
-
-        get_flags(${CUDA_CCID} ${CUDA_CCVER})
-        list(APPEND CUDA_CXX_FLAGS ${GF_CXX_FLAGS})  # This is passed to -Xcompiler later
-    endif()
-endif()
-
-if (WIN32)
-    add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
-
-    if (BUILD_SHARED_LIBS)
-        set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
-    endif()
 endif()

 if (LLAMA_LTO)
@@ -571,38 +153,10 @@ if (LLAMA_LTO)
    endif()
 endif()

-if (LLAMA_CCACHE)
-    find_program(LLAMA_CCACHE_FOUND ccache)
-    if (LLAMA_CCACHE_FOUND)
-        set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ccache)
-        set(ENV{CCACHE_SLOPPINESS} time_macros)
-        message(STATUS "Using ccache")
-    else()
-        message(STATUS "Warning: ccache not found - consider installing it or use LLAMA_CCACHE=OFF")
-    endif ()
-endif()
-
-# this version of Apple ld64 is buggy
-execute_process(
-    COMMAND ${CMAKE_C_COMPILER} ${CMAKE_EXE_LINKER_FLAGS} -Wl,-v
-    ERROR_VARIABLE output
-    OUTPUT_QUIET
-)
-if (output MATCHES "dyld-1015\.7")
-    add_compile_definitions(HAVE_BUGGY_APPLE_LINKER)
-endif()
-
 # Architecture specific
 # TODO: probably these flags need to be tweaked on some architectures
 #       feel free to update the Makefile for your architecture and send a pull request or issue
 message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}")
-if (MSVC)
-  string(TOLOWER "${CMAKE_GENERATOR_PLATFORM}" CMAKE_GENERATOR_PLATFORM_LWR)
-  message(STATUS "CMAKE_GENERATOR_PLATFORM: ${CMAKE_GENERATOR_PLATFORM}")
-else ()
-  set(CMAKE_GENERATOR_PLATFORM_LWR "")
-endif ()
-
 if (NOT MSVC)
    if (LLAMA_STATIC)
        add_link_options(-static)
@@ -613,322 +167,99 @@ if (NOT MSVC)
    if (LLAMA_GPROF)
        add_compile_options(-pg)
    endif()
+    if (LLAMA_NATIVE)
+        add_compile_options(-march=native)
+    endif()
 endif()

-set(ARCH_FLAGS "")
-
-if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") OR ("${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "arm64"))
+if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64")
    message(STATUS "ARM detected")
    if (MSVC)
-        add_compile_definitions(__ARM_NEON)
-        add_compile_definitions(__ARM_FEATURE_FMA)
-        add_compile_definitions(__ARM_FEATURE_DOTPROD)
-        # add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) # MSVC doesn't support vdupq_n_f16, vld1q_f16, vst1q_f16
-        add_compile_definitions(__aarch64__) # MSVC defines _M_ARM64 instead
+        # TODO: arm msvc?
    else()
-        check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E)
-        if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
-            list(APPEND ARCH_FLAGS -mfp16-format=ieee)
-        endif()
-        if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6")
-            # Raspberry Pi 1, Zero
-            list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access)
-        endif()
-        if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7")
-            # Raspberry Pi 2
-            list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
-        endif()
-        if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8")
-            # Raspberry Pi 3, 4, Zero 2 (32-bit)
-            list(APPEND ARCH_FLAGS -mno-unaligned-access)
+        if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64")
+            add_compile_options(-mcpu=native)
        endif()
+        # TODO: armv6,7,8 version specific flags
    endif()
-elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "^(x86_64|i686|amd64|x64)$" )
+elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$")
    message(STATUS "x86 detected")
    if (MSVC)
-        # instruction set detection for MSVC only
-        if (LLAMA_NATIVE)
-            include(cmake/FindSIMD.cmake)
-        endif ()
-        if (LLAMA_AVX512)
-            list(APPEND ARCH_FLAGS /arch:AVX512)
-            # MSVC has no compile-time flags enabling specific
-            # AVX512 extensions, neither it defines the
-            # macros corresponding to the extensions.
-            # Do it manually.
-            if (LLAMA_AVX512_VBMI)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VBMI__>)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VBMI__>)
-            endif()
-            if (LLAMA_AVX512_VNNI)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
-                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
-            endif()
-        elseif (LLAMA_AVX2)
-            list(APPEND ARCH_FLAGS /arch:AVX2)
+        if (LLAMA_AVX2)
+            add_compile_options(/arch:AVX2)
        elseif (LLAMA_AVX)
-            list(APPEND ARCH_FLAGS /arch:AVX)
+            add_compile_options(/arch:AVX)
        endif()
    else()
-        if (LLAMA_NATIVE)
-            list(APPEND ARCH_FLAGS -march=native)
-        endif()
-        if (LLAMA_F16C)
-            list(APPEND ARCH_FLAGS -mf16c)
-        endif()
+        add_compile_options(-mf16c)
        if (LLAMA_FMA)
-            list(APPEND ARCH_FLAGS -mfma)
+            add_compile_options(-mfma)
        endif()
        if (LLAMA_AVX)
-            list(APPEND ARCH_FLAGS -mavx)
+            add_compile_options(-mavx)
        endif()
        if (LLAMA_AVX2)
-            list(APPEND ARCH_FLAGS -mavx2)
+            add_compile_options(-mavx2)
        endif()
-        if (LLAMA_AVX512)
-            list(APPEND ARCH_FLAGS -mavx512f)
-            list(APPEND ARCH_FLAGS -mavx512bw)
-        endif()
-        if (LLAMA_AVX512_VBMI)
-            list(APPEND ARCH_FLAGS -mavx512vbmi)
-        endif()
-        if (LLAMA_AVX512_VNNI)
-            list(APPEND ARCH_FLAGS -mavx512vnni)
-        endif()
-    endif()
-elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
-    message(STATUS "PowerPC detected")
-    if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
-        list(APPEND ARCH_FLAGS -mcpu=powerpc64le)
-    else()
-        list(APPEND ARCH_FLAGS -mcpu=native -mtune=native)
-        #TODO: Add  targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
    endif()
 else()
+    # TODO: support PowerPC
    message(STATUS "Unknown architecture")
 endif()

-add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:${ARCH_FLAGS}>")
-add_compile_options("$<$<COMPILE_LANGUAGE:C>:${ARCH_FLAGS}>")
-
-if (LLAMA_CUBLAS)
-    list(APPEND CUDA_CXX_FLAGS ${ARCH_FLAGS})
-    list(JOIN CUDA_CXX_FLAGS " " CUDA_CXX_FLAGS_JOINED)  # pass host compiler flags as a single argument
-    if (NOT CUDA_CXX_FLAGS_JOINED STREQUAL "")
-        list(APPEND CUDA_FLAGS -Xcompiler ${CUDA_CXX_FLAGS_JOINED})
-    endif()
-    add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:${CUDA_FLAGS}>")
-endif()
-
-if (MINGW)
-    # Target Windows 8 for PrefetchVirtualMemory
-    add_compile_definitions(_WIN32_WINNT=${LLAMA_WIN_VER})
-endif()
-
 #
-# POSIX conformance
+# Build libraries
 #

-# clock_gettime came in POSIX.1b (1993)
-# CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional
-# posix_memalign came in POSIX.1-2001 / SUSv3
-# M_PI is an XSI extension since POSIX.1-2001 / SUSv3, came in XPG1 (1985)
-add_compile_definitions(_XOPEN_SOURCE=600)
+add_library(utils OBJECT
+            utils.cpp
+            utils.h)

-# Somehow in OpenBSD whenever POSIX conformance is specified
-# some string functions rely on locale_t availability,
-# which was introduced in POSIX.1-2008, forcing us to go higher
-if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD")
-    remove_definitions(-D_XOPEN_SOURCE=600)
-    add_compile_definitions(_XOPEN_SOURCE=700)
-endif()
-
-# Data types, macros and functions related to controlling CPU affinity and
-# some memory allocation are available on Linux through GNU extensions in libc
-if (CMAKE_SYSTEM_NAME MATCHES "Linux")
-    add_compile_definitions(_GNU_SOURCE)
-endif()
-
-# RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1,
-# and on macOS its availability depends on enabling Darwin extensions
-# similarly on DragonFly, enabling BSD extensions is necessary
-if (
-    CMAKE_SYSTEM_NAME MATCHES "Darwin" OR
-    CMAKE_SYSTEM_NAME MATCHES "iOS" OR
-    CMAKE_SYSTEM_NAME MATCHES "tvOS" OR
-    CMAKE_SYSTEM_NAME MATCHES "DragonFly"
-)
-    add_compile_definitions(_DARWIN_C_SOURCE)
-endif()
-
-# alloca is a non-standard interface that is not visible on BSDs when
-# POSIX conformance is specified, but not all of them provide a clean way
-# to enable it in such cases
-if (CMAKE_SYSTEM_NAME MATCHES "FreeBSD")
-    add_compile_definitions(__BSD_VISIBLE)
-endif()
-if (CMAKE_SYSTEM_NAME MATCHES "NetBSD")
-    add_compile_definitions(_NETBSD_SOURCE)
-endif()
-if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD")
-    add_compile_definitions(_BSD_SOURCE)
-endif()
-
-#
-# libraries
-#
-
-# ggml
-
-if (GGML_USE_CPU_HBM)
-    add_definitions(-DGGML_USE_CPU_HBM)
-    find_library(memkind memkind REQUIRED)
-endif()
+target_include_directories(utils PUBLIC .)
+target_compile_features(utils PUBLIC cxx_std_11) # don't bump
+target_link_libraries(utils PRIVATE ${LLAMA_EXTRA_LIBS})

 add_library(ggml OBJECT
            ggml.c
-            ggml.h
-            ggml-alloc.c
-            ggml-alloc.h
-            ggml-backend.c
-            ggml-backend.h
-            ggml-quants.c
-            ggml-quants.h
-            ${GGML_SOURCES_CUDA}   ${GGML_HEADERS_CUDA}
-            ${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
-            ${GGML_SOURCES_METAL}  ${GGML_HEADERS_METAL}
-            ${GGML_SOURCES_MPI}    ${GGML_HEADERS_MPI}
-            ${GGML_SOURCES_EXTRA}  ${GGML_HEADERS_EXTRA}
-            )
+            ggml.h)

-target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})
+target_include_directories(ggml PUBLIC .)
 target_compile_features(ggml PUBLIC c_std_11) # don't bump
-target_link_libraries(ggml PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
-if (GGML_USE_CPU_HBM)
-    target_link_libraries(ggml PUBLIC memkind)
-endif()
-
-add_library(ggml_static STATIC $<TARGET_OBJECTS:ggml>)
-if (BUILD_SHARED_LIBS)
-    set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON)
-    add_library(ggml_shared SHARED $<TARGET_OBJECTS:ggml>)
-    target_link_libraries(ggml_shared PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
-    install(TARGETS ggml_shared LIBRARY)
-endif()
-
-# llama
+target_link_libraries(ggml PRIVATE Threads::Threads ${LLAMA_EXTRA_LIBS})

 add_library(llama
            llama.cpp
-            llama.h
-            )
+            llama.h)

 target_include_directories(llama PUBLIC .)
 target_compile_features(llama PUBLIC cxx_std_11) # don't bump
-target_link_libraries(llama PRIVATE
-    ggml
-    ${LLAMA_EXTRA_LIBS}
-    )
-
-if (BUILD_SHARED_LIBS)
-    set_target_properties(llama PROPERTIES POSITION_INDEPENDENT_CODE ON)
-    target_compile_definitions(llama PRIVATE LLAMA_SHARED LLAMA_BUILD)
-    if (LLAMA_METAL)
-        set_target_properties(llama PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal")
-    endif()
-endif()
-
+target_link_libraries(llama PRIVATE utils ggml ${LLAMA_EXTRA_LIBS})

 #
-# install
+# Executables
 #

-include(GNUInstallDirs)
-include(CMakePackageConfigHelpers)
+add_executable(main
+               main.cpp
+               run.cpp)
+target_link_libraries(main PRIVATE llama ggml utils)

-set(LLAMA_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR}
-    CACHE PATH "Location of header files")
-set(LLAMA_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR}
-    CACHE PATH "Location of library files")
-set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR}
-    CACHE PATH "Location of binary files")
-set(LLAMA_BUILD_NUMBER ${BUILD_NUMBER})
-set(LLAMA_BUILD_COMMIT ${BUILD_COMMIT})
-set(LLAMA_INSTALL_VERSION 0.0.${BUILD_NUMBER})
-get_directory_property(LLAMA_TRANSIENT_DEFINES COMPILE_DEFINITIONS)
-
-configure_package_config_file(
-        ${CMAKE_CURRENT_SOURCE_DIR}/scripts/LlamaConfig.cmake.in
-        ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfig.cmake
-    INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Llama
-    PATH_VARS LLAMA_INCLUDE_INSTALL_DIR
-              LLAMA_LIB_INSTALL_DIR
-              LLAMA_BIN_INSTALL_DIR )
-
-write_basic_package_version_file(
-        ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfigVersion.cmake
-    VERSION ${LLAMA_INSTALL_VERSION}
-    COMPATIBILITY SameMajorVersion)
-
-install(FILES ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfig.cmake
-              ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfigVersion.cmake
-        DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Llama)
-
-set(GGML_PUBLIC_HEADERS "ggml.h" "ggml-alloc.h" "ggml-backend.h"
-        "${GGML_HEADERS_CUDA}" "${GGML_HEADERS_OPENCL}"
-        "${GGML_HEADERS_METAL}" "${GGML_HEADERS_MPI}" "${GGML_HEADERS_EXTRA}")
-
-set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
-install(TARGETS ggml PUBLIC_HEADER)
-
-set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/llama.h)
-install(TARGETS llama LIBRARY PUBLIC_HEADER)
-
-install(
-    FILES convert.py
-    PERMISSIONS
-        OWNER_READ
-        OWNER_WRITE
-        OWNER_EXECUTE
-        GROUP_READ
-        GROUP_EXECUTE
-        WORLD_READ
-        WORLD_EXECUTE
-    DESTINATION ${CMAKE_INSTALL_BINDIR})
-install(
-    FILES convert-lora-to-ggml.py
-    PERMISSIONS
-        OWNER_READ
-        OWNER_WRITE
-        OWNER_EXECUTE
-        GROUP_READ
-        GROUP_EXECUTE
-        WORLD_READ
-        WORLD_EXECUTE
-    DESTINATION ${CMAKE_INSTALL_BINDIR})
-if (LLAMA_METAL)
-    install(
-        FILES ggml-metal.metal
-        PERMISSIONS
-            OWNER_READ
-            OWNER_WRITE
-            GROUP_READ
-            WORLD_READ
-        DESTINATION ${CMAKE_INSTALL_BINDIR})
+if(NOT WIN32)
+    target_sources(main PRIVATE tcp_server.cpp)
 endif()

+add_executable(quantize quantize.cpp)
+target_link_libraries(quantize PRIVATE llama ggml utils)
+
 #
 # programs, examples and tests
 #

-add_subdirectory(common)
-
 if (LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
-    include(CTest)
+    enable_testing()
    add_subdirectory(tests)
 endif ()

-if (LLAMA_BUILD_EXAMPLES)
-    add_subdirectory(examples)
-    add_subdirectory(pocs)
-endif()
+#if (LLAMA_BUILD_EXAMPLES)
+#    add_subdirectory(examples)
+#endif()
--- a/838
+++ b/838
@@ -1,19 +1,3 @@
-# Define the default target now so that it is always the first target
-BUILD_TARGETS = \
-	main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
-	simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search  \
-	speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey tests/test-c.o
-
-# Binaries only useful for tests
-TEST_TARGETS = \
-	tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
-	tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama          \
-	tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope      \
-	tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease
-
-# Code coverage output files
-COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
-
 ifndef UNAME_S
 UNAME_S := $(shell uname -s)
 endif
@@ -26,13 +10,12 @@ ifndef UNAME_M
 UNAME_M := $(shell uname -m)
 endif

+CCV := $(shell $(CC) --version | head -n 1)
+CXXV := $(shell $(CXX) --version | head -n 1)
+
 # Mac OS + Arm can report x86_64
 # ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
 ifeq ($(UNAME_S),Darwin)
-	ifndef LLAMA_NO_METAL
-		LLAMA_METAL := 1
-	endif
-
 	ifneq ($(UNAME_P),arm)
 		SYSCTL_M := $(shell sysctl -n hw.optional.arm64 2>/dev/null)
 		ifeq ($(SYSCTL_M),1)
@@ -43,476 +26,174 @@ ifeq ($(UNAME_S),Darwin)
 	endif
 endif

-default: $(BUILD_TARGETS)
-
-test: $(TEST_TARGETS)
-	@failures=0; \
-	for test_target in $(TEST_TARGETS); do \
-		if [ "$$test_target" = "tests/test-tokenizer-0-llama" ]; then \
-			./$$test_target $(CURDIR)/models/ggml-vocab-llama.gguf; \
-		elif [ "$$test_target" = "tests/test-tokenizer-0-falcon" ]; then \
-			./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \
-		elif [ "$$test_target" = "tests/test-tokenizer-1-llama" ]; then \
-			continue; \
-		elif [ "$$test_target" = "tests/test-tokenizer-1-bpe" ]; then \
-			continue; \
-		else \
-			echo "Running test $$test_target..."; \
-			./$$test_target; \
-		fi; \
-		if [ $$? -ne 0 ]; then \
-			printf 'Test %s FAILED!\n\n' $$test_target; \
-			failures=$$(( failures + 1 )); \
-		else \
-			printf 'Test %s passed.\n\n' $$test_target; \
-		fi; \
-	done; \
-	if [ $$failures -gt 0 ]; then \
-		printf '\n%s tests failed.\n' $$failures; \
-		exit 1; \
-	fi
-	@echo 'All tests passed.'
-
-all: $(BUILD_TARGETS) $(TEST_TARGETS)
-
-coverage: ## Run code coverage
-	gcov -pb tests/*.cpp
-
-lcov-report: coverage ## Generate lcov report
-	mkdir -p lcov-report
-	lcov --capture --directory . --output-file lcov-report/coverage.info
-	genhtml lcov-report/coverage.info --output-directory lcov-report
-
-gcovr-report: coverage ## Generate gcovr report
-	mkdir -p gcovr-report
-	gcovr --root . --html --html-details --output gcovr-report/coverage.html
-
-ifdef RISCV_CROSS_COMPILE
-CC	:= riscv64-unknown-linux-gnu-gcc
-CXX	:= riscv64-unknown-linux-gnu-g++
-endif
-
 #
 # Compile flags
 #

 # keep standard at C11 and C++11
-MK_CPPFLAGS = -I. -Icommon
-MK_CFLAGS   = -std=c11   -fPIC
-MK_CXXFLAGS = -std=c++11 -fPIC
-
-# -Ofast tends to produce faster code, but may not be available for some compilers.
-ifdef LLAMA_FAST
-MK_CFLAGS     += -Ofast
-HOST_CXXFLAGS += -Ofast
-MK_NVCCFLAGS  += -O3
-else
-MK_CFLAGS     += -O3
-MK_CXXFLAGS   += -O3
-endif
-
-# clock_gettime came in POSIX.1b (1993)
-# CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional
-# posix_memalign came in POSIX.1-2001 / SUSv3
-# M_PI is an XSI extension since POSIX.1-2001 / SUSv3, came in XPG1 (1985)
-MK_CPPFLAGS += -D_XOPEN_SOURCE=600
-
-# Somehow in OpenBSD whenever POSIX conformance is specified
-# some string functions rely on locale_t availability,
-# which was introduced in POSIX.1-2008, forcing us to go higher
-ifeq ($(UNAME_S),OpenBSD)
-	MK_CPPFLAGS += -U_XOPEN_SOURCE -D_XOPEN_SOURCE=700
-endif
-
-# Data types, macros and functions related to controlling CPU affinity and
-# some memory allocation are available on Linux through GNU extensions in libc
-ifeq ($(UNAME_S),Linux)
-	MK_CPPFLAGS += -D_GNU_SOURCE
-endif
-
-# RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1,
-# and on macOS its availability depends on enabling Darwin extensions
-# similarly on DragonFly, enabling BSD extensions is necessary
-ifeq ($(UNAME_S),Darwin)
-	MK_CPPFLAGS += -D_DARWIN_C_SOURCE
-endif
-ifeq ($(UNAME_S),DragonFly)
-	MK_CPPFLAGS += -D__BSD_VISIBLE
-endif
-
-# alloca is a non-standard interface that is not visible on BSDs when
-# POSIX conformance is specified, but not all of them provide a clean way
-# to enable it in such cases
-ifeq ($(UNAME_S),FreeBSD)
-	MK_CPPFLAGS += -D__BSD_VISIBLE
-endif
-ifeq ($(UNAME_S),NetBSD)
-	MK_CPPFLAGS += -D_NETBSD_SOURCE
-endif
-ifeq ($(UNAME_S),OpenBSD)
-	MK_CPPFLAGS += -D_BSD_SOURCE
-endif
-
-ifdef LLAMA_DEBUG
-	MK_CFLAGS   += -O0 -g
-	MK_CXXFLAGS += -O0 -g
-	MK_LDFLAGS  += -g
-
-	ifeq ($(UNAME_S),Linux)
-		MK_CXXFLAGS += -Wp,-D_GLIBCXX_ASSERTIONS
-	endif
-else
-	MK_CPPFLAGS += -DNDEBUG
-endif
-
-ifdef LLAMA_SANITIZE_THREAD
-	MK_CFLAGS   += -fsanitize=thread -g
-	MK_CXXFLAGS += -fsanitize=thread -g
-	MK_LDFLAGS  += -fsanitize=thread -g
-endif
-
-ifdef LLAMA_SANITIZE_ADDRESS
-	MK_CFLAGS   += -fsanitize=address -fno-omit-frame-pointer -g
-	MK_CXXFLAGS += -fsanitize=address -fno-omit-frame-pointer -g
-	MK_LDFLAGS  += -fsanitize=address -fno-omit-frame-pointer -g
-endif
-
-ifdef LLAMA_SANITIZE_UNDEFINED
-	MK_CFLAGS   += -fsanitize=undefined -g
-	MK_CXXFLAGS += -fsanitize=undefined -g
-	MK_LDFLAGS  += -fsanitize=undefined -g
-endif
-
-ifdef LLAMA_SERVER_VERBOSE
-	MK_CPPFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE)
-endif
-
-
-ifdef LLAMA_CODE_COVERAGE
-	MK_CXXFLAGS += -fprofile-arcs -ftest-coverage -dumpbase ''
-endif
-
-ifdef LLAMA_DISABLE_LOGS
-	MK_CPPFLAGS += -DLOG_DISABLE_LOGS
-endif # LLAMA_DISABLE_LOGS
-
-# warnings
-WARN_FLAGS    = -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function
-MK_CFLAGS    += $(WARN_FLAGS) -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int \
-				-Werror=implicit-function-declaration
-MK_CXXFLAGS  += $(WARN_FLAGS) -Wmissing-declarations -Wmissing-noreturn
-
-# this version of Apple ld64 is buggy
-ifneq '' '$(findstring dyld-1015.7,$(shell $(CC) $(LDFLAGS) -Wl,-v 2>&1))'
-	MK_CPPFLAGS += -DHAVE_BUGGY_APPLE_LINKER
-endif
+CFLAGS   = -I.              -O3 -DNDEBUG -std=c11   -fPIC
+CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
+LDFLAGS  =

 # OS specific
 # TODO: support Windows
-ifneq '' '$(filter $(UNAME_S),Linux Darwin FreeBSD NetBSD OpenBSD Haiku)'
-	MK_CFLAGS   += -pthread
-	MK_CXXFLAGS += -pthread
+ifeq ($(UNAME_S),Linux)
+	CFLAGS   += -pthread
+	CXXFLAGS += -pthread
 endif
-
-# detect Windows
-ifneq ($(findstring _NT,$(UNAME_S)),)
-	_WIN32 := 1
+ifeq ($(UNAME_S),Darwin)
+	CFLAGS   += -pthread
+	CXXFLAGS += -pthread
 endif
-
-# library name prefix
-ifneq ($(_WIN32),1)
-	LIB_PRE := lib
+ifeq ($(UNAME_S),FreeBSD)
+	CFLAGS   += -pthread
+	CXXFLAGS += -pthread
 endif
-
-# Dynamic Shared Object extension
-ifneq ($(_WIN32),1)
-	DSO_EXT := .so
-else
-	DSO_EXT := .dll
+ifeq ($(UNAME_S),NetBSD)
+	CFLAGS   += -pthread
+	CXXFLAGS += -pthread
 endif
-
-# Windows Sockets 2 (Winsock) for network-capable apps
-ifeq ($(_WIN32),1)
-	LWINSOCK2 := -lws2_32
+ifeq ($(UNAME_S),OpenBSD)
+	CFLAGS   += -pthread
+	CXXFLAGS += -pthread
 endif
-
-ifdef LLAMA_GPROF
-	MK_CFLAGS   += -pg
-	MK_CXXFLAGS += -pg
-endif
-ifdef LLAMA_PERF
-	MK_CPPFLAGS += -DGGML_PERF
+ifeq ($(UNAME_S),Haiku)
+	CFLAGS   += -pthread
+	CXXFLAGS += -pthread
 endif

 # Architecture specific
 # TODO: probably these flags need to be tweaked on some architectures
 #       feel free to update the Makefile for your architecture and send a pull request or issue
-
-ifndef RISCV
-
-ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
-	# Use all CPU extensions that are available:
-	MK_CFLAGS     += -march=native -mtune=native
-	HOST_CXXFLAGS += -march=native -mtune=native
-
-	# Usage AVX-only
-	#MK_CFLAGS   += -mfma -mf16c -mavx
-	#MK_CXXFLAGS += -mfma -mf16c -mavx
-
-	# Usage SSSE3-only (Not is SSE3!)
-	#MK_CFLAGS   += -mssse3
-	#MK_CXXFLAGS += -mssse3
-endif
-
-ifneq '' '$(findstring mingw,$(shell $(CC) -dumpmachine))'
-	# The stack is only 16-byte aligned on Windows, so don't let gcc emit aligned moves.
-	# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412
-	# https://github.com/ggerganov/llama.cpp/issues/2922
-	MK_CFLAGS   += -Xassembler -muse-unaligned-vector-move
-	MK_CXXFLAGS += -Xassembler -muse-unaligned-vector-move
-
-	# Target Windows 8 for PrefetchVirtualMemory
-	MK_CPPFLAGS += -D_WIN32_WINNT=0x602
-endif
-
-ifneq ($(filter aarch64%,$(UNAME_M)),)
-	# Apple M1, M2, etc.
-	# Raspberry Pi 3, 4, Zero 2 (64-bit)
-	# Nvidia Jetson
-	MK_CFLAGS   += -mcpu=native
-	MK_CXXFLAGS += -mcpu=native
-	JETSON_RELEASE_INFO = $(shell jetson_release)
-	ifdef JETSON_RELEASE_INFO
-		ifneq ($(filter TX2%,$(JETSON_RELEASE_INFO)),)
-			JETSON_EOL_MODULE_DETECT = 1
-			CC = aarch64-unknown-linux-gnu-gcc
-			cxx = aarch64-unknown-linux-gnu-g++
+ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))
+	ifeq ($(UNAME_S),Darwin)
+		CFLAGS += -mf16c
+		AVX1_M := $(shell sysctl machdep.cpu.features)
+		ifneq (,$(findstring FMA,$(AVX1_M)))
+			CFLAGS += -mfma
 		endif
+		ifneq (,$(findstring AVX1.0,$(AVX1_M)))
+			CFLAGS += -mavx
+		endif
+		AVX2_M := $(shell sysctl machdep.cpu.leaf7_features)
+		ifneq (,$(findstring AVX2,$(AVX2_M)))
+			CFLAGS += -mavx2
+		endif
+	else ifeq ($(UNAME_S),Linux)
+		AVX1_M := $(shell grep "avx " /proc/cpuinfo)
+		ifneq (,$(findstring avx,$(AVX1_M)))
+			CFLAGS += -mavx
+		endif
+		AVX2_M := $(shell grep "avx2 " /proc/cpuinfo)
+		ifneq (,$(findstring avx2,$(AVX2_M)))
+			CFLAGS += -mavx2
+		endif
+		FMA_M := $(shell grep "fma " /proc/cpuinfo)
+		ifneq (,$(findstring fma,$(FMA_M)))
+			CFLAGS += -mfma
+		endif
+		F16C_M := $(shell grep "f16c " /proc/cpuinfo)
+		ifneq (,$(findstring f16c,$(F16C_M)))
+			CFLAGS += -mf16c
+		endif
+		SSE3_M := $(shell grep "sse3 " /proc/cpuinfo)
+		ifneq (,$(findstring sse3,$(SSE3_M)))
+			CFLAGS += -msse3
+		endif
+		AVX512F_M := $(shell grep "avx512f " /proc/cpuinfo)
+		ifneq (,$(findstring avx512f,$(AVX512F_M)))
+			CFLAGS += -mavx512f
+		endif
+		AVX512BW_M := $(shell grep "avx512bw " /proc/cpuinfo)
+		ifneq (,$(findstring avx512bw,$(AVX512BW_M)))
+			CFLAGS += -mavx512bw
+		endif
+		AVX512DQ_M := $(shell grep "avx512dq " /proc/cpuinfo)
+		ifneq (,$(findstring avx512dq,$(AVX512DQ_M)))
+			CFLAGS += -mavx512dq
+		endif
+		AVX512VL_M := $(shell grep "avx512vl " /proc/cpuinfo)
+		ifneq (,$(findstring avx512vl,$(AVX512VL_M)))
+			CFLAGS += -mavx512vl
+		endif
+		AVX512CD_M := $(shell grep "avx512cd " /proc/cpuinfo)
+		ifneq (,$(findstring avx512cd,$(AVX512CD_M)))
+			CFLAGS += -mavx512cd
+		endif
+		AVX512ER_M := $(shell grep "avx512er " /proc/cpuinfo)
+		ifneq (,$(findstring avx512er,$(AVX512ER_M)))
+			CFLAGS += -mavx512er
+		endif
+		AVX512IFMA_M := $(shell grep "avx512ifma " /proc/cpuinfo)
+		ifneq (,$(findstring avx512ifma,$(AVX512IFMA_M)))
+			CFLAGS += -mavx512ifma
+		endif
+		AVX512PF_M := $(shell grep "avx512pf " /proc/cpuinfo)
+		ifneq (,$(findstring avx512pf,$(AVX512PF_M)))
+			CFLAGS += -mavx512pf
+		endif
+	else ifeq ($(UNAME_S),Haiku)
+		AVX1_M := $(shell sysinfo -cpu | grep -w "AVX")
+		ifneq (,$(findstring AVX,$(AVX1_M)))
+			CFLAGS += -mavx
+		endif
+		AVX2_M := $(shell sysinfo -cpu | grep -w "AVX2")
+		ifneq (,$(findstring AVX2,$(AVX2_M)))
+			CFLAGS += -mavx2
+		endif
+		FMA_M := $(shell sysinfo -cpu | grep -w "FMA")
+		ifneq (,$(findstring FMA,$(FMA_M)))
+			CFLAGS += -mfma
+		endif
+		F16C_M := $(shell sysinfo -cpu | grep -w "F16C")
+		ifneq (,$(findstring F16C,$(F16C_M)))
+			CFLAGS += -mf16c
+		endif
+	else
+		CFLAGS += -mfma -mf16c -mavx -mavx2
 	endif
 endif
-
-ifneq ($(filter armv6%,$(UNAME_M)),)
-	# Raspberry Pi 1, Zero
-	MK_CFLAGS   += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
-	MK_CXXFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
-endif
-
-ifneq ($(filter armv7%,$(UNAME_M)),)
-	# Raspberry Pi 2
-	MK_CFLAGS   += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
-	MK_CXXFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
-endif
-
-ifneq ($(filter armv8%,$(UNAME_M)),)
-	# Raspberry Pi 3, 4, Zero 2 (32-bit)
-	MK_CFLAGS   += -mfp16-format=ieee -mno-unaligned-access
-	MK_CXXFLAGS += -mfp16-format=ieee -mno-unaligned-access
-endif
-
 ifneq ($(filter ppc64%,$(UNAME_M)),)
 	POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)
 	ifneq (,$(findstring POWER9,$(POWER9_M)))
-		MK_CFLAGS   += -mcpu=power9
-		MK_CXXFLAGS += -mcpu=power9
+		CFLAGS += -mpower9-vector
+	endif
+	# Require c++23's std::byteswap for big-endian support.
+	ifeq ($(UNAME_M),ppc64)
+		CXXFLAGS += -std=c++23 -DGGML_BIG_ENDIAN
 	endif
 endif
-
-ifneq ($(filter ppc64le%,$(UNAME_M)),)
-	MK_CFLAGS   += -mcpu=powerpc64le
-	MK_CXXFLAGS += -mcpu=powerpc64le
-	CUDA_POWER_ARCH = 1
-endif
-
-else
-	MK_CFLAGS   += -march=rv64gcv -mabi=lp64d
-	MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
-endif
-
-ifdef LLAMA_QKK_64
-	MK_CPPFLAGS += -DGGML_QKK_64
-endif
-
 ifndef LLAMA_NO_ACCELERATE
-	# Mac OS - include Accelerate framework.
-	# `-framework Accelerate` works both with Apple Silicon and Mac Intel
+	# Mac M1 - include Accelerate framework.
+	# `-framework Accelerate` works on Mac Intel as well, with negligible performance boost (as of the predict time).
 	ifeq ($(UNAME_S),Darwin)
-		MK_CPPFLAGS += -DGGML_USE_ACCELERATE
-		MK_CPPFLAGS += -DACCELERATE_NEW_LAPACK
-		MK_CPPFLAGS += -DACCELERATE_LAPACK_ILP64
-		MK_LDFLAGS  += -framework Accelerate
+		CFLAGS  += -DGGML_USE_ACCELERATE
+		LDFLAGS += -framework Accelerate
 	endif
-endif # LLAMA_NO_ACCELERATE
-
-ifdef LLAMA_MPI
-	MK_CPPFLAGS += -DGGML_USE_MPI
-	MK_CFLAGS   += -Wno-cast-qual
-	MK_CXXFLAGS += -Wno-cast-qual
-	OBJS        += ggml-mpi.o
-endif # LLAMA_MPI
-
+endif
 ifdef LLAMA_OPENBLAS
-	MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas)
-	MK_CFLAGS   += $(shell pkg-config --cflags-only-other openblas)
-	MK_LDFLAGS  += $(shell pkg-config --libs openblas)
-endif # LLAMA_OPENBLAS
-
-ifdef LLAMA_BLIS
-	MK_CPPFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
-	MK_LDFLAGS  += -lblis -L/usr/local/lib
-endif # LLAMA_BLIS
-
-ifdef LLAMA_CUBLAS
-	MK_CPPFLAGS  += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include -I/usr/local/cuda/targets/aarch64-linux/include
-	MK_LDFLAGS   += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib -L/usr/local/cuda/targets/aarch64-linux/lib -L/usr/lib/wsl/lib
-	OBJS         += ggml-cuda.o
-	MK_NVCCFLAGS  = -use_fast_math
-ifndef JETSON_EOL_MODULE_DETECT
-	MK_NVCCFLAGS += --forward-unknown-to-host-compiler
-endif # JETSON_EOL_MODULE_DETECT
-ifdef LLAMA_DEBUG
-	MK_NVCCFLAGS += -lineinfo
-endif # LLAMA_DEBUG
-ifdef LLAMA_CUDA_NVCC
-	NVCC = $(LLAMA_CUDA_NVCC)
-else
-	NVCC = nvcc
-endif #LLAMA_CUDA_NVCC
-ifdef CUDA_DOCKER_ARCH
-	MK_NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH)
-else ifndef CUDA_POWER_ARCH
-	MK_NVCCFLAGS += -arch=native
-endif # CUDA_DOCKER_ARCH
-ifdef LLAMA_CUDA_FORCE_DMMV
-	MK_NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV
-endif # LLAMA_CUDA_FORCE_DMMV
-ifdef LLAMA_CUDA_FORCE_MMQ
-	MK_NVCCFLAGS += -DGGML_CUDA_FORCE_MMQ
-endif # LLAMA_CUDA_FORCE_MMQ
-ifdef LLAMA_CUDA_DMMV_X
-	MK_NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
-else
-	MK_NVCCFLAGS += -DGGML_CUDA_DMMV_X=32
-endif # LLAMA_CUDA_DMMV_X
-ifdef LLAMA_CUDA_MMV_Y
-	MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
-else ifdef LLAMA_CUDA_DMMV_Y
-	MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_DMMV_Y) # for backwards compatibility
-else
-	MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=1
-endif # LLAMA_CUDA_MMV_Y
-ifdef LLAMA_CUDA_F16
-	MK_NVCCFLAGS += -DGGML_CUDA_F16
-endif # LLAMA_CUDA_F16
-ifdef LLAMA_CUDA_DMMV_F16
-	MK_NVCCFLAGS += -DGGML_CUDA_F16
-endif # LLAMA_CUDA_DMMV_F16
-ifdef LLAMA_CUDA_KQUANTS_ITER
-	MK_NVCCFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
-else
-	MK_NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
+	CFLAGS  += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas
+	LDFLAGS += -lopenblas
 endif
-ifdef LLAMA_CUDA_PEER_MAX_BATCH_SIZE
-	MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=$(LLAMA_CUDA_PEER_MAX_BATCH_SIZE)
-else
-	MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128
-endif # LLAMA_CUDA_PEER_MAX_BATCH_SIZE
-#ifdef LLAMA_CUDA_CUBLAS
-#	MK_NVCCFLAGS += -DGGML_CUDA_CUBLAS
-#endif # LLAMA_CUDA_CUBLAS
-ifdef LLAMA_CUDA_CCBIN
-	MK_NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
+ifdef LLAMA_GPROF
+	CFLAGS   += -pg
+	CXXFLAGS += -pg
 endif
-ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
-ifdef JETSON_EOL_MODULE_DETECT
-	$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
-else
-	$(NVCC) $(BASE_CXXFLAGS) $(NVCCFLAGS) -Wno-pedantic -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
-endif # JETSON_EOL_MODULE_DETECT
-endif # LLAMA_CUBLAS
-
-ifdef LLAMA_CLBLAST
-
-	MK_CPPFLAGS += -DGGML_USE_CLBLAST $(shell pkg-config --cflags-only-I clblast OpenCL)
-	MK_CFLAGS   += $(shell pkg-config --cflags-only-other clblast OpenCL)
-	MK_CXXFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
-
-	# Mac provides OpenCL as a framework
-	ifeq ($(UNAME_S),Darwin)
-		MK_LDFLAGS += -lclblast -framework OpenCL
-	else
-		MK_LDFLAGS += $(shell pkg-config --libs clblast OpenCL)
-	endif
-	OBJS    += ggml-opencl.o
-
-ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
-endif # LLAMA_CLBLAST
-
-ifdef LLAMA_HIPBLAS
-
-	ifeq ($(wildcard /opt/rocm),)
-		ROCM_PATH	?= /usr
-		GPU_TARGETS ?= $(shell $(shell which amdgpu-arch))
-	else
-		ROCM_PATH	?= /opt/rocm
-		GPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
-	endif
-	HIPCC                   ?= $(ROCM_PATH)/bin/hipcc
-	LLAMA_CUDA_DMMV_X       ?= 32
-	LLAMA_CUDA_MMV_Y        ?= 1
-	LLAMA_CUDA_KQUANTS_ITER ?= 2
-	MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
-ifdef LLAMA_HIP_UMA
-	MK_CPPFLAGS += -DGGML_HIP_UMA
-endif # LLAMA_HIP_UMA
-	MK_LDFLAGS  += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
-	MK_LDFLAGS	+= -lhipblas -lamdhip64 -lrocblas
-	HIPFLAGS    += $(addprefix --offload-arch=,$(GPU_TARGETS))
-	HIPFLAGS    += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
-	HIPFLAGS    += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
-	HIPFLAGS    += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
-ifdef LLAMA_CUDA_FORCE_DMMV
-	HIPFLAGS 	+= -DGGML_CUDA_FORCE_DMMV
-endif # LLAMA_CUDA_FORCE_DMMV
-	OBJS        += ggml-cuda.o
-ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
-	$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
-endif # LLAMA_HIPBLAS
-
-ifdef LLAMA_METAL
-	MK_CPPFLAGS += -DGGML_USE_METAL
-	MK_LDFLAGS  += -framework Foundation -framework Metal -framework MetalKit
-	OBJS		+= ggml-metal.o
-ifdef LLAMA_METAL_NDEBUG
-	MK_CPPFLAGS += -DGGML_METAL_NDEBUG
+ifneq ($(filter aarch64%,$(UNAME_M)),)
+	CFLAGS += -mcpu=native
+	CXXFLAGS += -mcpu=native
 endif
-endif # LLAMA_METAL
-
-ifdef LLAMA_METAL
-ggml-metal.o: ggml-metal.m ggml-metal.h
-	$(CC) $(CFLAGS) -c $< -o $@
-endif # LLAMA_METAL
-
-ifdef LLAMA_MPI
-ggml-mpi.o: ggml-mpi.c ggml-mpi.h
-	$(CC) $(CFLAGS) -c $< -o $@
-endif # LLAMA_MPI
-
-GF_CC := $(CC)
-include scripts/get-flags.mk
-
-# combine build flags with cmdline overrides
-override CFLAGS    := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CFLAGS) $(GF_CFLAGS) $(CFLAGS)
-BASE_CXXFLAGS      := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CXXFLAGS) $(CXXFLAGS)
-override CXXFLAGS  := $(BASE_CXXFLAGS) $(HOST_CXXFLAGS) $(GF_CXXFLAGS)
-override NVCCFLAGS := $(MK_NVCCFLAGS) $(NVCCFLAGS)
-override LDFLAGS   := $(MK_LDFLAGS) $(LDFLAGS)
-
-# identify CUDA host compiler
-ifdef LLAMA_CUBLAS
-GF_CC := $(NVCC) $(NVCCFLAGS) 2>/dev/null .c -Xcompiler
-include scripts/get-flags.mk
-CUDA_CXXFLAGS := $(GF_CXXFLAGS)
+ifneq ($(filter armv6%,$(UNAME_M)),)
+	# Raspberry Pi 1, Zero
+	CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
+endif
+ifneq ($(filter armv7%,$(UNAME_M)),)
+	# Raspberry Pi 4
+	CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
+endif
+ifneq ($(filter armv8%,$(UNAME_M)),)
+	# Raspberry Pi 4
+	CFLAGS += -mfp16-format=ieee -mno-unaligned-access
 endif

 #
@@ -520,236 +201,51 @@ endif
 #

 $(info I llama.cpp build info: )
-$(info I UNAME_S:   $(UNAME_S))
-$(info I UNAME_P:   $(UNAME_P))
-$(info I UNAME_M:   $(UNAME_M))
-$(info I CFLAGS:    $(CFLAGS))
-$(info I CXXFLAGS:  $(CXXFLAGS))
-$(info I NVCCFLAGS: $(NVCCFLAGS))
-$(info I LDFLAGS:   $(LDFLAGS))
-$(info I CC:        $(shell $(CC) --version | head -n 1))
-$(info I CXX:       $(shell $(CXX) --version | head -n 1))
+$(info I UNAME_S:  $(UNAME_S))
+$(info I UNAME_P:  $(UNAME_P))
+$(info I UNAME_M:  $(UNAME_M))
+$(info I CFLAGS:   $(CFLAGS))
+$(info I CXXFLAGS: $(CXXFLAGS))
+$(info I LDFLAGS:  $(LDFLAGS))
+$(info I CC:       $(CCV))
+$(info I CXX:      $(CXXV))
 $(info )

+default: main quantize
+
 #
 # Build library
 #

-ggml.o: ggml.c ggml.h ggml-cuda.h
-	$(CC)  $(CFLAGS)   -c $< -o $@
+ggml.o: ggml.c ggml.h
+	$(CC)  $(CFLAGS)   -c ggml.c -o ggml.o

-ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
-	$(CC)  $(CFLAGS)   -c $< -o $@
+llama.o: llama.cpp llama.h
+	$(CXX) $(CXXFLAGS) -c llama.cpp -o llama.o

-ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h
-	$(CC)  $(CFLAGS)   -c $< -o $@
+utils.o: utils.cpp utils.h
+	$(CXX) $(CXXFLAGS) -c utils.cpp -o utils.o

-ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h
-	$(CC) $(CFLAGS)    -c $< -o $@
+run.o: run.cpp run.h
+	$(CXX) $(CXXFLAGS) -c run.cpp -o run.o

-OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o
-
-llama.o: llama.cpp ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
-
-COMMON_H_DEPS = common/common.h common/sampling.h common/log.h
-COMMON_DEPS   = common.o sampling.o grammar-parser.o build-info.o
-
-common.o: common/common.cpp $(COMMON_H_DEPS)
-	$(CXX) $(CXXFLAGS) -c $< -o $@
-
-sampling.o: common/sampling.cpp $(COMMON_H_DEPS)
-	$(CXX) $(CXXFLAGS) -c $< -o $@
-
-console.o: common/console.cpp common/console.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
-
-grammar-parser.o: common/grammar-parser.cpp common/grammar-parser.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
-
-train.o: common/train.cpp common/train.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
-
-libllama.so: llama.o ggml.o $(OBJS)
-	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
+tcp_server.o: tcp_server.cpp tcp_server.h
+	$(CXX) $(CXXFLAGS) -c tcp_server.cpp -o tcp_server.o

 clean:
-	rm -vrf *.o tests/*.o *.so *.dll benchmark-matmult common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
+	rm -f *.o main quantize

-#
-# Examples
-#
+main: main.cpp ggml.o llama.o utils.o run.o tcp_server.o
+	$(CXX) $(CXXFLAGS) main.cpp ggml.o llama.o utils.o run.o tcp_server.o -o main $(LDFLAGS)
+	@printf '\033[36mrun ./main -h for help\033[0m\n'

-main: examples/main/main.cpp                                  ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-	@echo
-	@echo '====  Run ./main -h for help.  ===='
-	@echo
-
-infill: examples/infill/infill.cpp                            ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-
-simple: examples/simple/simple.cpp                            ggml.o llama.o $(COMMON_DEPS) $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-
-tokenize: examples/tokenize/tokenize.cpp                      ggml.o llama.o $(COMMON_DEPS) $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-
-batched: examples/batched/batched.cpp                         ggml.o llama.o $(COMMON_DEPS) $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-
-batched-bench: examples/batched-bench/batched-bench.cpp       build-info.o ggml.o llama.o common.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-
-quantize: examples/quantize/quantize.cpp                      build-info.o ggml.o llama.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-
-quantize-stats: examples/quantize-stats/quantize-stats.cpp    build-info.o ggml.o llama.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-
-perplexity: examples/perplexity/perplexity.cpp                ggml.o llama.o $(COMMON_DEPS) $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-
-imatrix: examples/imatrix/imatrix.cpp                         ggml.o llama.o $(COMMON_DEPS) $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-
-embedding: examples/embedding/embedding.cpp                   ggml.o llama.o $(COMMON_DEPS) $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-
-save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-
-server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
-	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2) -Wno-cast-qual
-
-gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-
-train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-
-convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp ggml.o llama.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-
-llama-bench: examples/llama-bench/llama-bench.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-
-libllava.a: examples/llava/llava.cpp examples/llava/llava.h examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h common/base64.hpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
-	$(CXX) $(CXXFLAGS) -static -fPIC -c $< -o $@ -Wno-cast-qual
-
-llava-cli: examples/llava/llava-cli.cpp examples/llava/clip.h examples/llava/clip.cpp examples/llava/llava.h examples/llava/llava.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
-
-baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-
-beam-search: examples/beam-search/beam-search.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-
-finetune: examples/finetune/finetune.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-
-export-lora: examples/export-lora/export-lora.cpp ggml.o common/common.h $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-
-speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-
-parallel: examples/parallel/parallel.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-
-lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-
-lookup: examples/lookup/lookup.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-
-passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-
-ifeq ($(UNAME_S),Darwin)
-swift: examples/batched.swift
-	(cd examples/batched.swift; make build)
-endif
-
-common/build-info.cpp: $(wildcard .git/index) scripts/build-info.sh
-	@sh scripts/build-info.sh $(CC) > $@.tmp
-	@if ! cmp -s $@.tmp $@; then \
-		mv $@.tmp $@; \
-	else \
-		rm $@.tmp; \
-	fi
-
-build-info.o: common/build-info.cpp
-	$(CXX) $(CXXFLAGS) -c $(filter-out %.h,$^) -o $@
+quantize: quantize.cpp ggml.o llama.o utils.o
+	$(CXX) $(CXXFLAGS) quantize.cpp ggml.o llama.o utils.o -o quantize $(LDFLAGS)

 #
 # Tests
 #

-tests: $(TEST_TARGETS)
-
-benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.o ggml.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-
-run-benchmark-matmult: benchmark-matmult
-	./$@
-
-.PHONY: run-benchmark-matmult swift
-
-vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
-
-q8dot: pocs/vdot/q8dot.cpp ggml.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
-
-tests/test-llama-grammar: tests/test-llama-grammar.cpp ggml.o grammar-parser.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-
-tests/test-grammar-parser: tests/test-grammar-parser.cpp ggml.o llama.o grammar-parser.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-
-tests/test-double-float: tests/test-double-float.cpp ggml.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-
-tests/test-grad0: tests/test-grad0.cpp ggml.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-
-tests/test-opt: tests/test-opt.cpp ggml.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-
-tests/test-quantize-fns: tests/test-quantize-fns.cpp ggml.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-
-tests/test-quantize-perf: tests/test-quantize-perf.cpp ggml.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-
-tests/test-sampling: tests/test-sampling.cpp ggml.o llama.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-
-tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-
-tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-
-tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-
-tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-
-tests/test-rope: tests/test-rope.cpp ggml.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-
-tests/test-c.o: tests/test-c.c llama.h
-	$(CC) $(CFLAGS) -c $(filter-out %.h,$^) -o $@
-
-tests/test-backend-ops: tests/test-backend-ops.cpp ggml.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-
-tests/test-model-load-cancel: tests/test-model-load-cancel.cpp ggml.o llama.o tests/get-model.cpp $(COMMON_DEPS) $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-
-tests/test-autorelease: tests/test-autorelease.cpp ggml.o llama.o tests/get-model.cpp $(COMMON_DEPS) $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+.PHONY: tests
+tests:
+	bash ./tests/run-tests.sh
--- a/Package.swift
+++ b/Package.swift
@@ -1,46 +0,0 @@
-// swift-tools-version:5.5
-
-import PackageDescription
-
-let package = Package(
-    name: "llama",
-    platforms: [
-        .macOS(.v12),
-        .iOS(.v14),
-        .watchOS(.v4),
-        .tvOS(.v14)
-    ],
-    products: [
-        .library(name: "llama", targets: ["llama"]),
-    ],
-    dependencies: [
-        .package(url: "https://github.com/ggerganov/ggml.git", .branch("release"))
-    ],
-    targets: [
-        .target(
-            name: "llama",
-            dependencies: ["ggml"],
-            path: ".",
-            exclude: ["ggml-metal.metal"],
-            sources: [
-                "llama.cpp",
-            ],
-            publicHeadersPath: "spm-headers",
-            cSettings: [
-                .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
-                .define("GGML_USE_ACCELERATE"),
-                .unsafeFlags(["-fno-objc-arc"]),
-                .define("GGML_USE_METAL"),
-                // NOTE: NEW_LAPACK will required iOS version 16.4+
-                // We should consider add this in the future when we drop support for iOS 14
-                // (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
-                // .define("ACCELERATE_NEW_LAPACK"),
-                // .define("ACCELERATE_LAPACK_ILP64")
-            ],
-            linkerSettings: [
-                .linkedFramework("Accelerate")
-            ]
-        )
-    ],
-    cxxLanguageStandard: .cxx11
-)
--- a/README.md
+++ b/README.md
--- a/SHA256SUMS
+++ b/SHA256SUMS
@@ -1,27 +1,26 @@
 700df0d3013b703a806d2ae7f1bfb8e59814e3d06ae78be0c66368a50059f33d  models/7B/consolidated.00.pth
-666a4bb533b303bdaf89e1b6a3b6f93535d868de31d903afdc20983dc526c847  models/7B/ggml-model-f16.bin
-ec2f2d1f0dfb73b72a4cbac7fa121abbe04c37ab327125a38248f930c0f09ddf  models/7B/ggml-model-q4_0.bin
-ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/7B/ggml-model-q4_1.bin
-ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/7B/ggml-model-q5_0.bin
-ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/7B/ggml-model-q5_1.bin
+abe4aec2cdc297e2916011f66c7efd6fb4424e0e84315503005b5c118358cc22  models/7B/ggml-model-f16.bin
+f495fa02a0b5ef265e1864d9680eede7fd23a60b0a2f93edba8091e2a4ca68b9  models/7B/ggml-model-q4_0.bin
 7e89e242ddc0dd6f060b43ca219ce8b3e8f08959a72cb3c0855df8bb04d46265  models/7B/params.json
 745bf4e29a4dd6f411e72976d92b452da1b49168a4f41c951cfcc8051823cf08  models/13B/consolidated.00.pth
 d5ccbcc465c71c0de439a5aeffebe8344c68a519bce70bc7f9f92654ee567085  models/13B/consolidated.01.pth
-2b206e9b21fb1076f11cafc624e2af97c9e48ea09312a0962153acc20d45f808  models/13B/ggml-model-f16.bin
-fad169e6f0f575402cf75945961cb4a8ecd824ba4da6be2af831f320c4348fa5  models/13B/ggml-model-q4_0.bin
-ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/13B/ggml-model-q4_1.bin
-ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/13B/ggml-model-q5_0.bin
-ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/13B/ggml-model-q5_1.bin
+a6bd0537c6873f36c47292df0b6f794e1135f5aafb89c3343bcc9e93264bf167  models/13B/ggml-model-f16.bin
+0fb0951b90f2ec46c1f2f2372af5dacb4614b27e9fb6c10c69fbec58d7dd0e36  models/13B/ggml-model-f16.bin.1
+1c218ba37ae61e15e35efd9949c78d6edf553b6280824c263cad56ae0b9d5a8f  models/13B/ggml-model-q4_0.bin
+c37a20c2ab9fa74b006b389085660269ee06110d1e45a494eb57d4602c9bcdb2  models/13B/ggml-model-q4_0.bin.1
 4ab77bec4d4405ccb66a97b282574c89a94417e3c32e5f68f37e2876fc21322f  models/13B/params.json
 e23294a58552d8cdec5b7e8abb87993b97ea6eced4178ff2697c02472539d067  models/30B/consolidated.00.pth
 4e077b7136c7ae2302e954860cf64930458d3076fcde9443f4d0e939e95903ff  models/30B/consolidated.01.pth
 24a87f01028cbd3a12de551dcedb712346c0b5cbdeff1454e0ddf2df9b675378  models/30B/consolidated.02.pth
 1adfcef71420886119544949767f6a56cb6339b4d5fcde755d80fe68b49de93b  models/30B/consolidated.03.pth
-7e1b524061a9f4b27c22a12d6d2a5bf13b8ebbea73e99f218809351ed9cf7d37  models/30B/ggml-model-f16.bin
-d2a441403944819492ec8c2002cc36fa38468149bfb4b7b4c52afc7bd9a7166d  models/30B/ggml-model-q4_0.bin
-ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/30B/ggml-model-q4_1.bin
-ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/30B/ggml-model-q5_0.bin
-ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/30B/ggml-model-q5_1.bin
+def20ea508f4e36793719f857471e85b85f96e497a2cbffbbaa1b60e2b18202c  models/30B/ggml-model-f16.bin
+b37040aa67fa8608cb2d8e0719132cf3e267fd35ec1e2f0d37dbc9fa43d674f1  models/30B/ggml-model-f16.bin.1
+e7f263557e99069fe29003262ea5fa9ed885dbe79069083e6eb569b328cf30d3  models/30B/ggml-model-f16.bin.2
+2ad6a23af05eb720f202f63d130f4fc5de9b6d2efc95b921be003209a56695aa  models/30B/ggml-model-f16.bin.3
+7de31d005e6d02ebd9603b2cf5329ad2f832b65d08873a098c5cafc4046cb9ed  models/30B/ggml-model-q4_0.bin
+f91feef9f30f9a023616db2e91297ca6d5d5d7b9eb351e452a82115c46f7da9e  models/30B/ggml-model-q4_0.bin.1
+66f3a0916ac7a81839153eb061fa861030ed1892477c2f7af2ce4f98d2f6d06f  models/30B/ggml-model-q4_0.bin.2
+e3c587ba97f83d2088b001bcda3026571065649ee3090bef6743a51390b01d3b  models/30B/ggml-model-q4_0.bin.3
 2c07118ea98d69dbe7810d88520e30288fa994751b337f8fca02b171955f44cb  models/30B/params.json
 135c563f6b3938114458183afb01adc9a63bef3d8ff7cccc3977e5d3664ecafe  models/65B/consolidated.00.pth
 9a600b37b19d38c7e43809485f70d17d1dc12206c07efa83bc72bb498a568bde  models/65B/consolidated.01.pth
@@ -31,10 +30,24 @@ e7babf7c5606f165a3756f527cb0fedc4f83e67ef1290391e52fb1cce5f26770  models/65B/con
 a287c0dfe49081626567c7fe87f74cce5831f58e459b427b5e05567641f47b78  models/65B/consolidated.05.pth
 72b4eba67a1a3b18cb67a85b70f8f1640caae9b40033ea943fb166bd80a7b36b  models/65B/consolidated.06.pth
 d27f5b0677d7ff129ceacd73fd461c4d06910ad7787cf217b249948c3f3bc638  models/65B/consolidated.07.pth
-60758f2384d74e423dffddfd020ffed9d3bb186ebc54506f9c4a787d0f5367b0  models/65B/ggml-model-f16.bin
-cde053439fa4910ae454407e2717cc46cc2c2b4995c00c93297a2b52e790fa92  models/65B/ggml-model-q4_0.bin
-ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/65B/ggml-model-q4_1.bin
-ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/65B/ggml-model-q5_0.bin
-ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/65B/ggml-model-q5_1.bin
+7eba2625260cd91f8de901fd9704a1aa39448425514a335a0d3878de4ab9dc77  models/65B/ggml-model-f16.bin
+f6aa886575df0785d4231f30cc776d499ccde18857818effc0378c65b178e0b5  models/65B/ggml-model-f16.bin.1
+076037141682f5d7537955058c4740ab27f285aa4588915f830874a589c0693d  models/65B/ggml-model-f16.bin.2
+7853d96d2903ad7de2b2a89c4acf5a33a2f8e3c24ac39c9df6b44cdb42bf530a  models/65B/ggml-model-f16.bin.3
+b16b7b941abb3bc03a14df1656140855e9360a5371c83e919b9da83a72362314  models/65B/ggml-model-f16.bin.4
+5291270216f888697695acb78ef28df0c080f9e85d3245c92fb9992d1fde6678  models/65B/ggml-model-f16.bin.5
+0685ee77715f34686841006f8f94d3e7eaf148b97cecc9d3eee72808b0f7989c  models/65B/ggml-model-f16.bin.6
+00d993d73bb21d7c29388ffe0dced008cbaa0d391831dea77d7eb8f0b5c404b9  models/65B/ggml-model-f16.bin.7
+4e398f05842206e08cdc5e7bb4f6c7c34b9dc373435ece6f261b14b7b4fe9b89  models/65B/ggml-model-q4_0.bin
+4c4e899e3b12d9f57c9dcea5a1fb41bbc72023323535551f6273582ca7d7294b  models/65B/ggml-model-q4_0.bin.1
+d7b4594bbbd192043b3db0e5acc2561c42e6944e1cb91cc6e61510eee89dbcd8  models/65B/ggml-model-q4_0.bin.2
+9a099d271648863d923d0d097391ea0bc75591f27a2ca3a327760f42e6b69af2  models/65B/ggml-model-q4_0.bin.3
+5ee474051e418c5732b7949190b084d9d679db447f83c1de0d2a82daaa1a0cfa  models/65B/ggml-model-q4_0.bin.4
+a45aa05e7212bd6782790722d68056c5419667ea6b564ccc94bbcb8111d79b8b  models/65B/ggml-model-q4_0.bin.5
+a58fda714b759c28ad5e4c1d8bf8fda7b158fd5e4c4a49f851f36342fa97a105  models/65B/ggml-model-q4_0.bin.6
+a3540cfcbcda33c223c6b0d606034adbd78f17e0e5de1582b78795e78754f7a8  models/65B/ggml-model-q4_0.bin.7
 999ed1659b469ccc2a941714c0a9656fa571d17c9f7c8c7589817ca90edef51b  models/65B/params.json
-9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347  models/tokenizer.model
+1f582babc2bd56bb63b33141898748657d369fd110c4358b2bc280907882bf13  models/alpaca-7B/ggml-model-q4_0.bin
+e17730c6b62b565b098af023ca446dcb9e3535d4222ead6369c7aae67207eb3d  models/alpaca-13B/ggml-model-q4_0.bin
+9bcd1bb30e679c939f367be11b030fe20b3eb9a3606b9bc4106420f1827b6ae4  models/alpaca-30B/ggml-model-q4_0.bin
+36079249f53c292a4c2302d7784005dcae94c865f0bedfdbfa51d9ddad402935  models/alpaca-30B/params.json
--- a/alpaca.sh
+++ b/alpaca.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+#
+# Temporary script - will be removed in the future
+#
+
+./main -m ./models/ggml-alpaca-7b-q4.bin --color -f ./prompts/alpaca.txt -ins --top_k 10000 --temp 0.2 --repeat_penalty 1 -t 7
--- a/awq-py/README.md
+++ b/awq-py/README.md
@@ -1,116 +0,0 @@
-# AWQ: Activation-aware Weight Quantization for LLM - version apply to llamacpp
-[[Paper](https://arxiv.org/abs/2306.00978)][[Original Repo](https://github.com/mit-han-lab/llm-awq)][[Easy-to-use Repo](https://github.com/casper-hansen/AutoAWQ)]
-
-**Supported models:**
-
- [X] LLaMA
- [x] LLaMA 2
- [X] MPT
- [X] Mistral AI v0.1
- [ ] Bloom
- [ ] Mixtral MoE
-
-**TODO:**
- [x] Update version work with both MPT and MPT-AWQ model
- [ ] Add OPT model
- [ ] Add Bloom model
- [ ] Add Mixtral MoE
- [ ] Support w3, w2
-
-
-## Contents
-
- [Install](##Install)
- [Convert](##Convert)
- [Quantize](##Quantize)
- [Test](##Test)
- [Benchmark](##Benchmark)
- [Results](##Results)
-
-## Install
-Install requirements
-```bash
-pip install -r requirements.txt
-```
-Get the pre-computed AWQ search results for multiple model families, including LLaMA, LLaMA2, MPT, OPT
-```bash
-git clone https://huggingface.co/datasets/mit-han-lab/awq-model-zoo awq_cache
-```
-
-## Convert
-Example for llama model
-```bash
-# For llama7b and llama2 models
-python convert.py models/llama-7b/ --awq-path awq_cache/llama-7b-w4-g128.pt --outfile models/llama_7b_fp16.gguf
-# For mistral and mpt models
-python convert-hf-to-gguf.py models/mpt-7b/ --awq-path awq_cache/mpt-7b-w4-g128.pt --outfile models/mpt_7b_fp16.gguf
-```
-
-## Quantize
-```bash
-# We only benchmark and confirm the results on q4_0, q4_1, and q2_k types.
-./quantize models/llama_7b_fp16.gguf models/llama_7b_q4_0.gguf q4_0
-```
-
-## Test
-```bash
-# For all models.
-./build/bin/main -m models/llama_7b_q4_0.gguf -n 128 --prompt "Once upon a time"
-```
-
-## Benchmark
-The perplexity measurements in table above are done against the `wikitext2` test dataset (https://paperswithcode.com/dataset/wikitext-2), with context length of 512.
-```bash
-# For llama and llama2, and mistral models.
-./perplexity -m models/llama_7b_q4_0.gguf -f datasets/wikitext-2-raw/wiki.test.raw
-```
-
-## Results
-Results are run on OpenBLAS (CPU) and CuBLAS (GPU) for fair comparison
-We use three types of llamacpp quantization methods to work with our version, including q4_0, q4_1, and q2_k
-
-### Llama 7B (Build with OpenBLAS)
-
-| Model      | Measure      | F16    | Q4_0   | Q4_1   | Q2_K   |
-|-----------:|--------------|-------:|-------:|-------:|-------:|
-|Llama 7B    | perplexity   | 5.9066 | 6.1214 | 6.0643 | 6.5808 |
-|Llama 7B    | file size    |  12.9G  |   3.5G |   3.9G |   2.7G |
-|Llama 7B    | bits/weight  |   16.0 |    4.5 |    5.0 |    2.6 |
-|AWQ-LLama 7B| perplexity   | 5.9175 | 6.0252 | 5.9987 | 6.3692 |
-|AWQ-LLama 7B| file size    |  12.9G  |   3.5G |   3.9G |   2.7G |
-|AWQ-LLama 7B| bits/weight  |   16.0 |    4.5 |    5.0 |    2.6 |
-
-
-### Llama2 7B (Build with CuBLAS)
-
-| Model       | Measure      | F16    | Q4_0   | Q4_1   | Q2_K   |
-|------------:|--------------|-------:|-------:|-------:|-------:|
-|Llama2 7B    | perplexity   | 5.8664 | 6.0260 | 6.0656 | 6.4496 |
-|Llama2 7B    | file size    |  12.9G  |   3.5G |   3.9G |   2.7G |
-|Llama2 7B    | bits/weight  |   16.0 |    4.5 |    5.0 |    2.6 |
-|AWQ-LLama2 7B| perplexity   | 5.8801 | 6.0054 | 5.9849 | 6.3650 |
-|AWQ-LLama2 7B| file size    |  12.9G  |   3.5G |   3.9G |   2.7G |
-|AWQ-LLama2 7B| bits/weight  |   16.0 |    4.5 |    5.0 |    2.6 |
-
-
-### Mistral 7B v0.1 (Build with CuBLAS)
-
-| Model        | Measure      | F16    | Q4_0   | Q4_1   | Q2_K   |
-|-------------:|--------------|-------:|-------:|-------:|-------:|
-|Mistral 7B    | perplexity   | 5.6931 | 5.8202 | 5.8268 | 6.1645 |
-|Mistral 7B    | file size     |  14.5G |   4.1G |   4.5G |   3.1G |
-|Mistral 7B    | bits/weight  |   16.0 |    4.5 |    5.0 |    2.6 |
-|AWQ-Mistral 7B| perplexity   | 5.6934 | 5.8020 | 5.7691 | 6.0426 |
-|AWQ-Mistral 7B| file size     |  14.5G |   4.1G |   4.5G |   3.1G |
-|AWQ-Mistral 7B| bits/weight  |   16.0 |    4.5 |    5.0 |    2.6 |
-
-### MPT 7B (Build with OpenBLAS)
-
-| Model    | Measure      | F16    | Q4_0   | Q4_1   | Q2_K    |
-|---------:|--------------|-------:|-------:|-------:|--------:|
-|MPT 7B    | perplexity   | 8.4369 | 8.7956 | 8.6265 | 11.4913 |
-|MPT 7B    | file size    |  13.7G  |   3.9G |   4.3G |   2.8G  |
-|MPT 7B    | bits/weight  |   16.0 |    4.5 |    5.0 |    2.6  |
-|AWQ-MPT 7B| perplexity   | 8.4944 | 8.7053 |  8.6750 | 10.2873|
-|AWQ-MPT 7B| file size    |  13.7G  |   3.9G |   4.3G |   2.8G  |
-|AWQ-MPT 7B| bits/weight  |   16.0 |    4.5 |    5.0 |    2.6  |
--- a/awq-py/awq/apply_awq.py
+++ b/awq-py/awq/apply_awq.py
@@ -1,254 +0,0 @@
-"""
-Implements the AWQ for llama.cpp use cases.
-Original paper: https://arxiv.org/abs/2306.00978
-
-This code is based on versions of the AWQ implementation found in the following repositories:
-* https://github.com/mit-han-lab/llm-awq
-* https://github.com/casper-hansen/AutoAWQ
-"""
-
-import os
-import torch
-import torch.nn as nn
-
-from transformers import AutoModelForCausalLM, AutoConfig
-from transformers.models.bloom.modeling_bloom import BloomGelu
-from transformers.models.llama.modeling_llama import LlamaRMSNorm
-from transformers.activations import GELUActivation
-
-
-class ScaledActivation(nn.Module):
-    """
-    ScaledActivation module wraps an existing activation function and applies a
-    scale factor to its output.
-
-    Args:
-        module (nn.Module): The activation function to be scaled.
-        scales (torch.Tensor): A tensor of size (num_features,) containing the initial
-            scale factors for each feature.
-
-    Returns:
-        torch.Tensor: The scaled output of the activation function.
-    """
-
-    def __init__(self, module, scales):
-        super().__init__()
-        self.act = module
-        self.scales = nn.Parameter(scales.data)
-
-    def forward(self, x):
-        return self.act(x) / self.scales.view(1, 1, -1).to(x.device)
-
-
-def set_op_by_name(layer, name, new_module):
-    """
-    Set the new module for given module's name.
-
-    Args:
-        layer (nn.Module): The layer in which to replace the submodule.
-        name (str): The path to the submodule to be replaced, using dot notation
-            to access nested modules.
-        new_module (nn.Module): The new module to replace the existing one.
-    """
-    levels = name.split(".")
-    if len(levels) > 1:
-        mod_ = layer
-        for l_idx in range(len(levels) - 1):
-            if levels[l_idx].isdigit():
-                mod_ = mod_[int(levels[l_idx])]
-            else:
-                mod_ = getattr(mod_, levels[l_idx])
-        setattr(mod_, levels[-1], new_module)
-    else:
-        setattr(layer, name, new_module)
-
-
-def get_op_by_name(module, op_name):
-    """
-    Retrieves a submodule within a given layer based on its name.
-
-    Args:
-        module (nn.Module): The layer containing the submodule to find.
-        op_name (str): The name of the submodule.
-
-    Returns:
-        nn.Module: The requested submodule found within the given layer.
-
-    Raises:
-        ValueError: If the specified submodule cannot be found within the layer.
-    """
-    for name, m in module.named_modules():
-        if name == op_name:
-            return m
-    raise ValueError(f"Cannot find op {op_name} in module {module}")
-
-
-@torch.no_grad()
-def scale_ln_fcs(ln, fcs, scales):
-    """
-    Scales the weights of a LayerNorm and a list of fully-connected layers proportionally.
-
-    Args:
-        ln (nn.LayerNorm): The LayerNorm module to be scaled.
-        fcs (List[nn.Linear]): A list of fully-connected layers to be scaled.
-        scales (torch.Tensor): A 1D tensor of size (num_features,).
-    """
-
-    if not isinstance(fcs, list):
-        fcs = [fcs]
-
-    scales = scales.to(ln.weight.device)
-
-    ln.weight.div_(scales)
-    if hasattr(ln, "bias") and ln.bias is not None:
-        ln.bias.div_(scales)
-
-    for fc in fcs:
-        fc.weight.mul_(scales.view(1, -1))
-
-    for p in ln.parameters():
-        assert torch.isnan(p).sum() == 0
-    for fc in fcs:
-        for p in fc.parameters():
-            assert torch.isnan(p).sum() == 0
-
-
-@torch.no_grad()
-def scale_fc_fc(fc1, fc2, scales):
-    """
-    Scales the weights of two fully-connected layers in a specific pattern.
-
-    Args:
-        fc1 (nn.Linear): The first fully-connected layer to be scaled.
-        fc2 (nn.Linear): The second fully-connected layer to be scaled.
-        scales (torch.Tensor): A 1D tensor of size (num_features,).
-    """
-    assert isinstance(fc1, nn.Linear)
-    assert isinstance(fc2, nn.Linear)
-
-    scales = scales.to(fc1.weight.device)
-
-    fc1.weight[-scales.size(0):].div_(scales.view(-1, 1))
-    if fc1.bias is not None:
-        fc1.bias.div_(scales.view(-1))
-
-    fc2.weight.mul_(scales.view(1, -1))
-
-    for p in fc1.parameters():
-        assert torch.isnan(p).sum() == 0
-    for p in fc2.parameters():
-        assert torch.isnan(p).sum() == 0
-
-
-@torch.no_grad()
-def scale_gelu_fc(gelu, fc, scales):
-    """
-    Scales the weight of a GELU activation and a fully-connected layer proportionally.
-
-    Args:
-        gelu (Union[nn.GELU, BloomGelu, GELUActivation]): The GELU activation module to be scaled.
-        fc (nn.Linear): The fully-connected layer to be scaled.
-        scales (torch.Tensor): A 1D tensor of size (num_features,).
-
-    Raises:
-        TypeError: If the `gelu` module is not of type `nn.GELU`, `BloomGelu`, or `GELUActivation`.
-        TypeError: If the `fc` module is not of type `nn.Linear`.
-    """
-    assert isinstance(gelu, (nn.GELU, BloomGelu, GELUActivation))
-    assert isinstance(fc, nn.Linear)
-
-    fc.weight.mul_(scales.view(1, -1).to(fc.weight.device))
-
-    for p in fc.parameters():
-        assert torch.isnan(p).sum() == 0
-
-
-def apply_scale(module, scales_list, input_feat_dict=None):
-    """
-    Applies different scaling strategies to layers based on their type and hierarchy within a given module.
-
-    Args:
-        module (nn.Module): The module containing the layers to be scaled.
-        scales_list (List[Tuple[str, List[str], torch.Tensor]]): A list of tuples containing:
-            * prev_op_name (str): The name of the preceding operation or module,
-                relative to which the layers to be scaled are located.
-            * layer_names (List[str]): A list of names of the layers to be scaled, relative to the preceding operation.
-            * scales (torch.Tensor): A 1D tensor of size (num_features,) containing the scaling factors for each feature.
-        input_feat_dict (Optional[Dict[str, torch.Tensor]]): A dictionary mapping layer names to their corresponding
-            input features (optional).
-    """
-    for prev_op_name, layer_names, scales in scales_list:
-        prev_op = get_op_by_name(module, prev_op_name)
-        layers = [get_op_by_name(module, name) for name in layer_names]
-
-        prev_op.cuda()
-        for layer in layers:
-            layer.cuda()
-        scales.cuda()
-
-        if isinstance(prev_op, nn.Linear):
-            assert len(layers) == 1
-            scale_fc_fc(prev_op, layers[0], scales)
-        elif isinstance(prev_op, (nn.LayerNorm, LlamaRMSNorm)) or "rmsnorm" in str(prev_op.__class__).lower():
-            scale_ln_fcs(prev_op, layers, scales)
-        elif isinstance(prev_op, (nn.GELU, BloomGelu, GELUActivation)):
-            new_module = ScaledActivation(prev_op, scales)
-            set_op_by_name(module, prev_op_name, new_module)
-            scale_gelu_fc(prev_op, layers[0], scales)
-        else:
-            raise NotImplementedError(f"prev_op {type(prev_op)} not supported yet!")
-
-        # apply the scaling to input feat if given; prepare it for clipping
-        if input_feat_dict is not None:
-            for layer_name in layer_names:
-                inp = input_feat_dict[layer_name]
-                inp.div_(scales.view(1, -1).to(inp.device))
-
-        prev_op.cpu()
-        for layer in layers:
-            layer.cpu()
-        scales.cpu()
-
-
-@torch.no_grad()
-def apply_clip(module, clip_list):
-    """
-    Applies element-wise clipping to the weight of a specific layer within a given module.
-
-    Args:
-        module (nn.Module): The module containing the layer to be clipped.
-        clip_list (List[Tuple[str, torch.Tensor]]): A list of tuples containing:
-            * name (str): The name of the layer to be clipped, relative to the root of the module.
-            * max_val (torch.Tensor): A 1D or 2D tensor defining the upper bound for each element of the layer's weight.
-    """
-    for name, max_val in clip_list:
-        layer = get_op_by_name(module, name)
-        layer.cuda()
-        max_val = max_val.to(layer.weight.device)
-        org_shape = layer.weight.shape
-        layer.weight.data = layer.weight.data.reshape(*max_val.shape[:2], -1)
-        layer.weight.data = torch.clamp(layer.weight.data, -max_val, max_val)
-        layer.weight.data = layer.weight.data.reshape(org_shape)
-        layer.cpu()
-
-
-def add_scale_weights(model_path, scale_path, tmp_path):
-    """
-    Adds pre-computed Activation Weight Quantization (AWQ) results to a model,
-    including scaling factors and clipping bounds.
-
-    Args:
-        model_path (str): Path to the pre-trained model to be equipped with AWQ.
-        scale_path (str): Path to the AWQ scale factors (.pt file).
-        tmp_path (str): Path to the temporary directory where the equipped model will be saved.
-    """
-    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
-    model = AutoModelForCausalLM.from_pretrained(
-        model_path, config=config, trust_remote_code=True
-    )
-    model.eval()
-    awq_results = torch.load(str(scale_path), map_location="cpu")
-    apply_scale(model, awq_results["scale"])
-    apply_clip(model, awq_results["clip"])
-    model.save_pretrained(str(tmp_path))
-    os.system(f"cp {str(model_path)}/tokenizer* {str(tmp_path)}")
--- a/awq-py/requirements.txt
+++ b/awq-py/requirements.txt
@@ -1,2 +0,0 @@
-torch>=2.1.1
-transformers>=4.32.0
--- a/build.zig
+++ b/build.zig
@@ -1,138 +0,0 @@
-// Compatible with Zig Version 0.11.0
-const std = @import("std");
-const ArrayList = std.ArrayList;
-const Compile = std.Build.Step.Compile;
-const ConfigHeader = std.Build.Step.ConfigHeader;
-const Mode = std.builtin.Mode;
-const CrossTarget = std.zig.CrossTarget;
-
-const Maker = struct {
-    builder: *std.build.Builder,
-    target: CrossTarget,
-    optimize: Mode,
-    enable_lto: bool,
-
-    include_dirs: ArrayList([]const u8),
-    cflags: ArrayList([]const u8),
-    cxxflags: ArrayList([]const u8),
-    objs: ArrayList(*Compile),
-
-    fn addInclude(m: *Maker, dir: []const u8) !void {
-        try m.include_dirs.append(dir);
-    }
-    fn addProjectInclude(m: *Maker, path: []const []const u8) !void {
-        try m.addInclude(try m.builder.build_root.join(m.builder.allocator, path));
-    }
-    fn addCFlag(m: *Maker, flag: []const u8) !void {
-        try m.cflags.append(flag);
-    }
-    fn addCxxFlag(m: *Maker, flag: []const u8) !void {
-        try m.cxxflags.append(flag);
-    }
-    fn addFlag(m: *Maker, flag: []const u8) !void {
-        try m.addCFlag(flag);
-        try m.addCxxFlag(flag);
-    }
-
-    fn init(builder: *std.build.Builder) !Maker {
-        const target = builder.standardTargetOptions(.{});
-        const zig_version = @import("builtin").zig_version_string;
-        const commit_hash = try std.ChildProcess.exec(
-            .{ .allocator = builder.allocator, .argv = &.{ "git", "rev-parse", "HEAD" } },
-        );
-        try std.fs.cwd().writeFile("common/build-info.cpp", builder.fmt(
-            \\int LLAMA_BUILD_NUMBER = {};
-            \\char const *LLAMA_COMMIT = "{s}";
-            \\char const *LLAMA_COMPILER = "Zig {s}";
-            \\char const *LLAMA_BUILD_TARGET = "{s}";
-            \\
-        , .{ 0, commit_hash.stdout[0 .. commit_hash.stdout.len - 1], zig_version, try target.allocDescription(builder.allocator) }));
-        var m = Maker{
-            .builder = builder,
-            .target = target,
-            .optimize = builder.standardOptimizeOption(.{}),
-            .enable_lto = false,
-            .include_dirs = ArrayList([]const u8).init(builder.allocator),
-            .cflags = ArrayList([]const u8).init(builder.allocator),
-            .cxxflags = ArrayList([]const u8).init(builder.allocator),
-            .objs = ArrayList(*Compile).init(builder.allocator),
-        };
-
-        try m.addCFlag("-std=c11");
-        try m.addCxxFlag("-std=c++11");
-        try m.addProjectInclude(&.{});
-        try m.addProjectInclude(&.{"common"});
-        return m;
-    }
-
-    fn obj(m: *const Maker, name: []const u8, src: []const u8) *Compile {
-        const o = m.builder.addObject(.{ .name = name, .target = m.target, .optimize = m.optimize });
-        if (o.target.getAbi() != .msvc)
-            o.defineCMacro("_GNU_SOURCE", null);
-
-        if (std.mem.endsWith(u8, src, ".c")) {
-            o.addCSourceFiles(&.{src}, m.cflags.items);
-            o.linkLibC();
-        } else {
-            o.addCSourceFiles(&.{src}, m.cxxflags.items);
-            if (o.target.getAbi() == .msvc) {
-                o.linkLibC(); // need winsdk + crt
-            } else {
-                // linkLibCpp already add (libc++ + libunwind + libc)
-                o.linkLibCpp();
-            }
-        }
-        for (m.include_dirs.items) |i| o.addIncludePath(.{ .path = i });
-        o.want_lto = m.enable_lto;
-        return o;
-    }
-
-    fn exe(m: *const Maker, name: []const u8, src: []const u8, deps: []const *Compile) *Compile {
-        const e = m.builder.addExecutable(.{ .name = name, .target = m.target, .optimize = m.optimize });
-        e.addCSourceFiles(&.{src}, m.cxxflags.items);
-        for (deps) |d| e.addObject(d);
-        for (m.objs.items) |o| e.addObject(o);
-        for (m.include_dirs.items) |i| e.addIncludePath(.{ .path = i });
-
-        // https://github.com/ziglang/zig/issues/15448
-        if (e.target.getAbi() == .msvc) {
-            e.linkLibC(); // need winsdk + crt
-        } else {
-            // linkLibCpp already add (libc++ + libunwind + libc)
-            e.linkLibCpp();
-        }
-        m.builder.installArtifact(e);
-        e.want_lto = m.enable_lto;
-        return e;
-    }
-};
-
-pub fn build(b: *std.build.Builder) !void {
-    var make = try Maker.init(b);
-    make.enable_lto = b.option(bool, "lto", "Enable LTO optimization, (default: false)") orelse false;
-
-    const ggml = make.obj("ggml", "ggml.c");
-    const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c");
-    const ggml_backend = make.obj("ggml-backend", "ggml-backend.c");
-    const ggml_quants = make.obj("ggml-quants", "ggml-quants.c");
-    const llama = make.obj("llama", "llama.cpp");
-    const buildinfo = make.obj("common", "common/build-info.cpp");
-    const common = make.obj("common", "common/common.cpp");
-    const console = make.obj("console", "common/console.cpp");
-    const sampling = make.obj("sampling", "common/sampling.cpp");
-    const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp");
-    const train = make.obj("train", "common/train.cpp");
-    const clip = make.obj("clip", "examples/llava/clip.cpp");
-
-    _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, console, grammar_parser });
-    _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo });
-    _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo });
-    _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo });
-    _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, train });
-    _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, train });
-
-    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, grammar_parser, clip });
-    if (server.target.isWindows()) {
-        server.linkSystemLibrary("ws2_32");
-    }
-}
--- a/chat.sh
+++ b/chat.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+#
+# Temporary script - will be removed in the future
+#
+# Runs ./main in interactive mode, seeded with prompts/chat-with-bob.txt.
+#
+
+./main \
+    -m ./models/7B/ggml-model-q4_0.bin \
+    -n 256 \
+    --repeat_penalty 1.0 \
+    --color \
+    -i \
+    -r "User:" \
+    -f prompts/chat-with-bob.txt
--- a/chat_tcp_client.sh
+++ b/chat_tcp_client.sh
@@ -0,0 +1,57 @@
+#!/usr/bin/env bash
+#
+# Minimal client for the TCP server mode of ./main (see chat_tcp_server.sh).
+#
+# Environment overrides: PORT, PROMPT, RPROMPT, N_PREDICT, REPEAT_PENALTY,
+# N_THREADS.
+
+PORT=${PORT:-8080}
+PROMPT="${PROMPT:-"Transcript of a dialog, where the User interacts with an Assistant named Bob. Bob is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.
+
+User:Hello, Bob.
+Bob:Hello. How may I help you today?
+User:Please tell me the largest city in Europe.
+Bob:Sure. The largest city in Europe is Moscow, the capital of Russia.
+User:"}"
+RPROMPT="${RPROMPT:-"User:"}"
+N_PREDICT="${N_PREDICT:-"4096"}"
+REPEAT_PENALTY="${REPEAT_PENALTY:-"1.0"}"
+N_THREADS="${N_THREADS:-"4"}"
+
+# Argument vector forwarded to the server, which hands it to its normal
+# command-line parser.
+ARGS=(
+    -t "$N_THREADS"
+    -n "$N_PREDICT"
+    --repeat_penalty "$REPEAT_PENALTY"
+    --color
+    -i
+    -r "$RPROMPT"
+    -p "$PROMPT"
+)
+
+# Open connection to the chat server
+exec 3<>"/dev/tcp/127.0.0.1/${PORT}" || {
+    echo "error: could not connect to 127.0.0.1:${PORT}" >&2
+    exit 1
+}
+
+# Pass the arguments. The protocol is really simple:
+# 1. Pass the number of arguments followed by a linefeed
+# 2. Pass the arguments, with each being followed by a NUL byte (\x00)
+# The count is derived from the array so it cannot drift from the list, and
+# printf '%s' sends each value verbatim (echo -e would expand backslash
+# escapes inside user-supplied values such as PROMPT).
+{
+    printf '%d\n' "${#ARGS[@]}"
+    printf '%s\0' "${ARGS[@]}"
+} >&3
+
+trap exit TERM
+
+# When we have passed the arguments, start printing socket data to the screen.
+# This is done in a background job because we also want to send data when
+# running in interactive mode.
+cat <&3 && echo "(disconnected, press \"enter\" twice to exit)" &
+cat >&3
+wait
--- a/chat_tcp_server.sh
+++ b/chat_tcp_server.sh
@@ -0,0 +1,10 @@
+#!/usr/bin/env bash
+#
+# Starts ./main in server mode (-l), listening for TCP connections on PORT.
+# PORT and MODEL can be overridden from the environment.
+
+PORT=${PORT:-8080}
+MODEL=${MODEL:-models/7B/ggml-model-q4_0.bin}
+
+# Quote the expansions so a model path containing spaces stays one argument.
+./main -l "${PORT}" -m "${MODEL}"
--- a/ci/README.md
+++ b/ci/README.md
@@ -1,25 +0,0 @@
-# CI
-
-In addition to [Github Actions](https://github.com/ggerganov/llama.cpp/actions) `llama.cpp` uses a custom CI framework:
-
-https://github.com/ggml-org/ci
-
-It monitors the `master` branch for new commits and runs the
-[ci/run.sh](https://github.com/ggerganov/llama.cpp/blob/master/ci/run.sh) script on dedicated cloud instances. This allows us
-to execute heavier workloads compared to just using Github Actions. Also with time, the cloud instances will be scaled
-to cover various hardware architectures, including GPU and Apple Silicon instances.
-
-Collaborators can optionally trigger the CI run by adding the `ggml-ci` keyword to their commit message.
-Only the branches of this repo are monitored for this keyword.
-
-It is a good practice, before publishing changes to execute the full CI locally on your machine:
-
-```bash
-mkdir tmp
-
-# CPU-only build
-bash ./ci/run.sh ./tmp/results ./tmp/mnt
-
-# with CUDA support
-GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
-```
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -1,594 +0,0 @@
-#/bin/bash
-#
-# sample usage:
-#
-# mkdir tmp
-#
-# # CPU-only build
-# bash ./ci/run.sh ./tmp/results ./tmp/mnt
-#
-# # with CUDA support
-# GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
-#
-
-if [ -z "$2" ]; then
-    echo "usage: $0 <output-dir> <mnt-dir>"
-    exit 1
-fi
-
-mkdir -p "$1"
-mkdir -p "$2"
-
-OUT=$(realpath "$1")
-MNT=$(realpath "$2")
-
-rm -f "$OUT/*.log"
-rm -f "$OUT/*.exit"
-rm -f "$OUT/*.md"
-
-sd=`dirname $0`
-cd $sd/../
-SRC=`pwd`
-
-CMAKE_EXTRA=""
-
-if [ ! -z ${GG_BUILD_METAL} ]; then
-    CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_METAL_SHADER_DEBUG=ON"
-fi
-
-if [ ! -z ${GG_BUILD_CUDA} ]; then
-    CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_CUBLAS=1"
-fi
-
-## helpers
-
-# download a file if it does not exist or if it is outdated
-function gg_wget {
-    local out=$1
-    local url=$2
-
-    local cwd=`pwd`
-
-    mkdir -p $out
-    cd $out
-
-    # should not re-download if file is the same
-    wget -nv -N $url
-
-    cd $cwd
-}
-
-function gg_printf {
-    printf -- "$@" >> $OUT/README.md
-}
-
-function gg_run {
-    ci=$1
-
-    set -o pipefail
-    set -x
-
-    gg_run_$ci | tee $OUT/$ci.log
-    cur=$?
-    echo "$cur" > $OUT/$ci.exit
-
-    set +x
-    set +o pipefail
-
-    gg_sum_$ci
-
-    ret=$((ret | cur))
-}
-
-## ci
-
-# ctest_debug
-
-function gg_run_ctest_debug {
-    cd ${SRC}
-
-    rm -rf build-ci-debug && mkdir build-ci-debug && cd build-ci-debug
-
-    set -e
-
-    (time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j                                          ) 2>&1 | tee -a $OUT/${ci}-make.log
-
-    (time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
-
-    set +e
-}
-
-function gg_sum_ctest_debug {
-    gg_printf '### %s\n\n' "${ci}"
-
-    gg_printf 'Runs ctest in debug mode\n'
-    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
-    gg_printf '```\n'
-    gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)"
-    gg_printf '```\n'
-    gg_printf '\n'
-}
-
-# ctest_release
-
-function gg_run_ctest_release {
-    cd ${SRC}
-
-    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
-
-    set -e
-
-    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j                                            ) 2>&1 | tee -a $OUT/${ci}-make.log
-
-    if [ -z ${GG_BUILD_LOW_PERF} ]; then
-        (time ctest --output-on-failure -L main ) 2>&1 | tee -a $OUT/${ci}-ctest.log
-    else
-        (time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
-    fi
-
-    set +e
-}
-
-function gg_sum_ctest_release {
-    gg_printf '### %s\n\n' "${ci}"
-
-    gg_printf 'Runs ctest in release mode\n'
-    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
-    gg_printf '```\n'
-    gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)"
-    gg_printf '```\n'
-}
-
-function gg_get_model {
-    local gguf_3b="$MNT/models/open-llama/3B-v2/ggml-model-f16.gguf"
-    local gguf_7b="$MNT/models/open-llama/7B-v2/ggml-model-f16.gguf"
-    if [[ -s $gguf_3b ]]; then
-        echo -n "$gguf_3b"
-    elif [[ -s $gguf_7b ]]; then
-        echo -n "$gguf_7b"
-    else
-        echo >&2 "No model found. Can't run gg_run_ctest_with_model."
-        exit 1
-    fi
-}
-
-function gg_run_ctest_with_model_debug {
-    cd ${SRC}
-
-    local model; model=$(gg_get_model)
-    cd build-ci-debug
-    set -e
-    (LLAMACPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log
-    set +e
-    cd ..
-}
-
-function gg_run_ctest_with_model_release {
-    cd ${SRC}
-
-    local model; model=$(gg_get_model)
-    cd build-ci-release
-    set -e
-    (LLAMACPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log
-    set +e
-    cd ..
-}
-
-function gg_sum_ctest_with_model_debug {
-    gg_printf '### %s\n\n' "${ci}"
-
-    gg_printf 'Runs ctest with model files in debug mode\n'
-    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
-    gg_printf '```\n'
-    gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)"
-    gg_printf '```\n'
-}
-
-function gg_sum_ctest_with_model_release {
-    gg_printf '### %s\n\n' "${ci}"
-
-    gg_printf 'Runs ctest with model files in release mode\n'
-    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
-    gg_printf '```\n'
-    gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)"
-    gg_printf '```\n'
-}
-
-# open_llama_3b_v2
-
-function gg_run_open_llama_3b_v2 {
-    cd ${SRC}
-
-    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/config.json
-    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/tokenizer.model
-    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/tokenizer_config.json
-    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/special_tokens_map.json
-    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/pytorch_model.bin
-    gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/generation_config.json
-
-    gg_wget models-mnt/wikitext/ https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip
-    unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
-    head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw
-
-    path_models="../models-mnt/open-llama/3B-v2"
-    path_wiki="../models-mnt/wikitext/wikitext-2-raw"
-
-    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
-
-    set -e
-
-    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_QKK_64=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j                                                             ) 2>&1 | tee -a $OUT/${ci}-make.log
-
-    python3 ../convert.py ${path_models}
-
-    model_f16="${path_models}/ggml-model-f16.gguf"
-    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
-    model_q4_0="${path_models}/ggml-model-q4_0.gguf"
-    model_q4_1="${path_models}/ggml-model-q4_1.gguf"
-    model_q5_0="${path_models}/ggml-model-q5_0.gguf"
-    model_q5_1="${path_models}/ggml-model-q5_1.gguf"
-    model_q2_k="${path_models}/ggml-model-q2_k.gguf"
-    model_q3_k="${path_models}/ggml-model-q3_k.gguf"
-    model_q4_k="${path_models}/ggml-model-q4_k.gguf"
-    model_q5_k="${path_models}/ggml-model-q5_k.gguf"
-    model_q6_k="${path_models}/ggml-model-q6_k.gguf"
-
-    wiki_test_60="${path_wiki}/wiki.test-60.raw"
-
-    ./bin/quantize ${model_f16} ${model_q8_0} q8_0
-    ./bin/quantize ${model_f16} ${model_q4_0} q4_0
-    ./bin/quantize ${model_f16} ${model_q4_1} q4_1
-    ./bin/quantize ${model_f16} ${model_q5_0} q5_0
-    ./bin/quantize ${model_f16} ${model_q5_1} q5_1
-    ./bin/quantize ${model_f16} ${model_q2_k} q2_k
-    ./bin/quantize ${model_f16} ${model_q3_k} q3_k
-    ./bin/quantize ${model_f16} ${model_q4_k} q4_k
-    ./bin/quantize ${model_f16} ${model_q5_k} q5_k
-    ./bin/quantize ${model_f16} ${model_q6_k} q6_k
-
-    (time ./bin/main --model ${model_f16}  -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/main --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/main --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/main --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/main --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/main --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/main --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/main --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/main --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/main --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/main --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
-
-    (time ./bin/perplexity --model ${model_f16}  -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
-
-    (time ./bin/imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
-
-    (time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-
-    function check_ppl {
-        qnt="$1"
-        ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
-
-        if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
-            printf '  - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
-            return 20
-        fi
-
-        printf '  - %s @ %s OK\n' "$qnt" "$ppl"
-        return 0
-    }
-
-    check_ppl "f16"  "$(cat $OUT/${ci}-tg-f16.log  | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-
-    cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
-
-    # lora
-    function compare_ppl {
-        qnt="$1"
-        ppl1=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
-        ppl2=$(echo "$3" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
-
-        if [ $(echo "$ppl1 < $ppl2" | bc) -eq 1 ]; then
-            printf '  - %s @ %s (FAIL: %s > %s)\n' "$qnt" "$ppl" "$ppl1" "$ppl2"
-            return 20
-        fi
-
-        printf '  - %s @ %s %s OK\n' "$qnt" "$ppl1" "$ppl2"
-        return 0
-    }
-
-    path_lora="../models-mnt/open-llama/3B-v2/lora"
-    path_shakespeare="../models-mnt/shakespeare"
-
-    shakespeare="${path_shakespeare}/shakespeare.txt"
-    lora_shakespeare="${path_lora}/ggml-adapter-model.bin"
-
-    gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_config.json
-    gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_model.bin
-    gg_wget ${path_shakespeare} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/shakespeare.txt
-
-    python3 ../convert-lora-to-ggml.py ${path_lora}
-
-    # f16
-    (time ./bin/perplexity --model ${model_f16} -f ${shakespeare}                            -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-f16.log
-    (time ./bin/perplexity --model ${model_f16} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-f16.log
-    compare_ppl "f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-f16.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
-
-    # q8_0
-    (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare}                            -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log
-    (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log
-    compare_ppl "q8_0 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
-
-    # q8_0 + f16 lora-base
-    (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log
-    compare_ppl "q8_0 / f16 base shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
-
-    set +e
-}
-
-function gg_sum_open_llama_3b_v2 {
-    gg_printf '### %s\n\n' "${ci}"
-
-    gg_printf 'OpenLLaMA 3B-v2:\n'
-    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
-    gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
-    gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
-    gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)"
-    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
-    gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
-    gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
-    gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
-    gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
-    gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
-    gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)"
-    gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
-    gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
-    gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
-    gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
-    gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
-    gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)"
-    gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)"
-    gg_printf '- shakespeare (q8_0):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log)"
-    gg_printf '- shakespeare (q8_0 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log)"
-    gg_printf '- shakespeare (q8_0 / f16 base lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log)"
-}
-
-# open_llama_7b_v2
-# requires: GG_BUILD_CUDA
-
-function gg_run_open_llama_7b_v2 {
-    cd ${SRC}
-
-    gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/config.json
-    gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/tokenizer.model
-    gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/tokenizer_config.json
-    gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/special_tokens_map.json
-    gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/pytorch_model.bin.index.json
-    gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/pytorch_model-00001-of-00002.bin
-    gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/pytorch_model-00002-of-00002.bin
-    gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/generation_config.json
-
-    gg_wget models-mnt/wikitext/ https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip
-    unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
-
-    path_models="../models-mnt/open-llama/7B-v2"
-    path_wiki="../models-mnt/wikitext/wikitext-2-raw"
-
-    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
-
-    set -e
-
-    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUBLAS=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j                                                             ) 2>&1 | tee -a $OUT/${ci}-make.log
-
-    python3 ../convert.py ${path_models}
-
-    model_f16="${path_models}/ggml-model-f16.gguf"
-    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
-    model_q4_0="${path_models}/ggml-model-q4_0.gguf"
-    model_q4_1="${path_models}/ggml-model-q4_1.gguf"
-    model_q5_0="${path_models}/ggml-model-q5_0.gguf"
-    model_q5_1="${path_models}/ggml-model-q5_1.gguf"
-    model_q2_k="${path_models}/ggml-model-q2_k.gguf"
-    model_q3_k="${path_models}/ggml-model-q3_k.gguf"
-    model_q4_k="${path_models}/ggml-model-q4_k.gguf"
-    model_q5_k="${path_models}/ggml-model-q5_k.gguf"
-    model_q6_k="${path_models}/ggml-model-q6_k.gguf"
-
-    wiki_test="${path_wiki}/wiki.test.raw"
-
-    ./bin/quantize ${model_f16} ${model_q8_0} q8_0
-    ./bin/quantize ${model_f16} ${model_q4_0} q4_0
-    ./bin/quantize ${model_f16} ${model_q4_1} q4_1
-    ./bin/quantize ${model_f16} ${model_q5_0} q5_0
-    ./bin/quantize ${model_f16} ${model_q5_1} q5_1
-    ./bin/quantize ${model_f16} ${model_q2_k} q2_k
-    ./bin/quantize ${model_f16} ${model_q3_k} q3_k
-    ./bin/quantize ${model_f16} ${model_q4_k} q4_k
-    ./bin/quantize ${model_f16} ${model_q5_k} q5_k
-    ./bin/quantize ${model_f16} ${model_q6_k} q6_k
-
-    (time ./bin/main --model ${model_f16}  -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/main --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/main --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/main --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/main --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/main --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/main --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/main --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/main --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/main --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/main --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
-
-    (time ./bin/perplexity --model ${model_f16}  -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
-    (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
-    (time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
-    (time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
-    (time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
-    (time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
-    (time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
-    (time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
-    (time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
-    (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
-    (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
-
-    (time ./bin/imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
-
-    (time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-
-    function check_ppl {
-        qnt="$1"
-        ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
-
-        if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
-            printf '  - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
-            return 20
-        fi
-
-        printf '  - %s @ %s OK\n' "$qnt" "$ppl"
-        return 0
-    }
-
-    check_ppl "f16"  "$(cat $OUT/${ci}-tg-f16.log  | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-    check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-
-    cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
-
-    # lora
-    function compare_ppl {  # $1: label, $2: baseline perplexity output, $3: LoRA perplexity output
-        qnt="$1"
-        ppl1=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)  # last decimal number in the baseline output
-        ppl2=$(echo "$3" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)  # last decimal number in the LoRA output
-
-        if [ $(echo "$ppl1 < $ppl2" | bc) -eq 1 ]; then  # fail when the baseline beats the LoRA run
-            printf '  - %s @ %s (FAIL: %s < %s)\n' "$qnt" "$ppl1" "$ppl1" "$ppl2"
-            return 20
-        fi
-
-        printf '  - %s @ %s %s OK\n' "$qnt" "$ppl1" "$ppl2"
-        return 0
-    }
-
-    path_lora="../models-mnt/open-llama/7B-v2/lora"
-    path_shakespeare="../models-mnt/shakespeare"
-
-    shakespeare="${path_shakespeare}/shakespeare.txt"
-    lora_shakespeare="${path_lora}/ggml-adapter-model.bin"
-
-    gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/adapter_config.json
-    gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/adapter_model.bin
-    gg_wget ${path_shakespeare} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/shakespeare.txt
-
-    python3 ../convert-lora-to-ggml.py ${path_lora}
-
-    # f16
-    (time ./bin/perplexity --model ${model_f16} -f ${shakespeare}                            -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-f16.log
-    (time ./bin/perplexity --model ${model_f16} -f ${shakespeare} --lora ${lora_shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-f16.log
-    compare_ppl "f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-f16.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
-
-    # currently not supported by the CUDA backend
-    # q8_0
-    #(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare}                            -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log
-    #(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log
-    #compare_ppl "q8_0 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
-
-    # q8_0 + f16 lora-base
-    #(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log
-    #compare_ppl "q8_0 / f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
-
-    set +e
-}
-
-function gg_sum_open_llama_7b_v2 {
-    gg_printf '### %s\n\n' "${ci}"
-
-    gg_printf 'OpenLLaMA 7B-v2:\n'
-    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
-    gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
-    gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
-    gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)"
-    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
-    gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
-    gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
-    gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
-    gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
-    gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
-    gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)"
-    gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
-    gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
-    gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
-    gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
-    gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
-    gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)"
-    gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)"
-    #gg_printf '- shakespeare (q8_0):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log)"
-    #gg_printf '- shakespeare (q8_0 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log)"
-    #gg_printf '- shakespeare (q8_0 / f16 base lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log)"
-}
-
-## main
-
-if [ -z ${GG_BUILD_LOW_PERF} ]; then
-    # Create symlink: ${SRC}/models-mnt -> ${MNT}/models
-    rm -rf ${SRC}/models-mnt
-    mnt_models=${MNT}/models
-    mkdir -p ${mnt_models}
-    ln -sfn ${mnt_models} ${SRC}/models-mnt
-
-    # Create a fresh python3 venv and enter it
-    python3 -m venv "$MNT/venv"
-    source "$MNT/venv/bin/activate"
-
-    pip install -r ${SRC}/requirements.txt --disable-pip-version-check
-    pip install --editable gguf-py --disable-pip-version-check
-fi
-
-ret=0
-
-test $ret -eq 0 && gg_run ctest_debug
-test $ret -eq 0 && gg_run ctest_release
-
-if [ -z ${GG_BUILD_LOW_PERF} ]; then
-    if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
-        if [ -z ${GG_BUILD_CUDA} ]; then
-            test $ret -eq 0 && gg_run open_llama_3b_v2
-        else
-            test $ret -eq 0 && gg_run open_llama_7b_v2
-        fi
-        test $ret -eq 0 && gg_run ctest_with_model_debug
-        test $ret -eq 0 && gg_run ctest_with_model_release
-    fi
-fi
-
-exit $ret
--- a/cmake/FindSIMD.cmake
+++ b/cmake/FindSIMD.cmake
@@ -1,100 +0,0 @@
-include(CheckCSourceRuns)
-
-set(AVX_CODE "
-    #include <immintrin.h>
-    int main()
-    {
-        __m256 a;
-        a = _mm256_set1_ps(0);
-        return 0;
-    }
-")
-
-set(AVX512_CODE "
-    #include <immintrin.h>
-    int main()
-    {
-        __m512i a = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
-                                    0, 0, 0, 0, 0, 0, 0, 0,
-                                    0, 0, 0, 0, 0, 0, 0, 0,
-                                    0, 0, 0, 0, 0, 0, 0, 0,
-                                    0, 0, 0, 0, 0, 0, 0, 0,
-                                    0, 0, 0, 0, 0, 0, 0, 0,
-                                    0, 0, 0, 0, 0, 0, 0, 0,
-                                    0, 0, 0, 0, 0, 0, 0, 0);
-        __m512i b = a;
-        __mmask64 equality_mask = _mm512_cmp_epi8_mask(a, b, _MM_CMPINT_EQ);
-        return 0;
-    }
-")
-
-set(AVX2_CODE "
-    #include <immintrin.h>
-    int main()
-    {
-        __m256i a = {0};
-        a = _mm256_abs_epi16(a);
-        __m256i x;
-        _mm256_extract_epi64(x, 0); // we rely on this in our AVX2 code
-        return 0;
-    }
-")
-
-set(FMA_CODE "
-    #include <immintrin.h>
-    int main()
-    {
-        __m256 acc = _mm256_setzero_ps();
-        const __m256 d = _mm256_setzero_ps();
-        const __m256 p = _mm256_setzero_ps();
-        acc = _mm256_fmadd_ps( d, p, acc );
-        return 0;
-    }
-")
-
-macro(check_sse type flags)
-    set(__FLAG_I 1)
-    set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
-    foreach (__FLAG ${flags})
-        if (NOT ${type}_FOUND)
-            set(CMAKE_REQUIRED_FLAGS ${__FLAG})
-            check_c_source_runs("${${type}_CODE}" HAS_${type}_${__FLAG_I})
-            if (HAS_${type}_${__FLAG_I})
-                set(${type}_FOUND TRUE CACHE BOOL "${type} support")
-                set(${type}_FLAGS "${__FLAG}" CACHE STRING "${type} flags")
-            endif()
-            math(EXPR __FLAG_I "${__FLAG_I}+1")
-        endif()
-    endforeach()
-    set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE})
-
-    if (NOT ${type}_FOUND)
-        set(${type}_FOUND FALSE CACHE BOOL "${type} support")
-        set(${type}_FLAGS "" CACHE STRING "${type} flags")
-    endif()
-
-    mark_as_advanced(${type}_FOUND ${type}_FLAGS)
-endmacro()
-
-# flags are for MSVC only!
-check_sse("AVX" " ;/arch:AVX")
-if (NOT ${AVX_FOUND})
-    set(LLAMA_AVX OFF)
-else()
-    set(LLAMA_AVX ON)
-endif()
-
-check_sse("AVX2" " ;/arch:AVX2")
-check_sse("FMA" " ;/arch:AVX2")
-if ((NOT ${AVX2_FOUND}) OR (NOT ${FMA_FOUND}))
-    set(LLAMA_AVX2 OFF)
-else()
-    set(LLAMA_AVX2 ON)
-endif()
-
-check_sse("AVX512" " ;/arch:AVX512")
-if (NOT ${AVX512_FOUND})
-    set(LLAMA_AVX512 OFF)
-else()
-    set(LLAMA_AVX512 ON)
-endif()
--- a/codecov.yml
+++ b/codecov.yml
@@ -1,14 +0,0 @@
-comment: off
-
-coverage:
-  status:
-    project:
-      default:
-        target: auto
-        threshold: 0
-        base: auto
-    patch:
-      default:
-        target: auto
-        threshold: 0
-        base: auto
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -1,68 +0,0 @@
-# common
-
-
-# Build info header
-#
-
-if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
-    set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
-
-    # Is git submodule
-    if(NOT IS_DIRECTORY "${GIT_DIR}")
-        file(READ ${GIT_DIR} REAL_GIT_DIR_LINK)
-        string(REGEX REPLACE "gitdir: (.*)\n$" "\\1" REAL_GIT_DIR ${REAL_GIT_DIR_LINK})
-        string(FIND "${REAL_GIT_DIR}" "/" SLASH_POS)
-        if (SLASH_POS EQUAL 0)
-            set(GIT_DIR "${REAL_GIT_DIR}")
-        else()
-            set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../${REAL_GIT_DIR}")
-        endif()
-    endif()
-
-    set(GIT_INDEX "${GIT_DIR}/index")
-else()
-    message(WARNING "Git repository not found; to enable automatic generation of build info, make sure Git is installed and the project is a Git repository.")
-    set(GIT_INDEX "")
-endif()
-
-# Add a custom command to rebuild build-info.cpp when .git/index changes
-add_custom_command(
-    OUTPUT "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp"
-    COMMENT "Generating build details from Git"
-    COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION}
-            -DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME}
-            -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/../scripts/gen-build-info-cpp.cmake"
-    WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/.."
-    DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in" ${GIT_INDEX}
-    VERBATIM
-)
-set(TARGET build_info)
-add_library(${TARGET} OBJECT build-info.cpp)
-if (BUILD_SHARED_LIBS)
-    set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
-endif()
-
-
-set(TARGET common)
-
-add_library(${TARGET} STATIC
-    base64.hpp
-    common.h
-    common.cpp
-    sampling.h
-    sampling.cpp
-    console.h
-    console.cpp
-    grammar-parser.h
-    grammar-parser.cpp
-    train.h
-    train.cpp
-    )
-
-if (BUILD_SHARED_LIBS)
-    set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
-endif()
-
-target_include_directories(${TARGET} PUBLIC .)
-target_compile_features(${TARGET} PUBLIC cxx_std_11)
-target_link_libraries(${TARGET} PRIVATE build_info PUBLIC llama)
--- a/common/base64.hpp
+++ b/common/base64.hpp
@@ -1,392 +0,0 @@
-/*
-This is free and unencumbered software released into the public domain.
-
-Anyone is free to copy, modify, publish, use, compile, sell, or
-distribute this software, either in source code form or as a compiled
-binary, for any purpose, commercial or non-commercial, and by any
-means.
-
-In jurisdictions that recognize copyright laws, the author or authors
-of this software dedicate any and all copyright interest in the
-software to the public domain. We make this dedication for the benefit
-of the public at large and to the detriment of our heirs and
-successors. We intend this dedication to be an overt act of
-relinquishment in perpetuity of all present and future rights to this
-software under copyright law.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
-OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
-ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-OTHER DEALINGS IN THE SOFTWARE.
-
-For more information, please refer to <http://unlicense.org>
-*/
-
-#ifndef PUBLIC_DOMAIN_BASE64_HPP_
-#define PUBLIC_DOMAIN_BASE64_HPP_
-
-#include <cstdint>
-#include <iterator>
-#include <stdexcept>
-#include <string>
-
-class base64_error : public std::runtime_error
-{
-public:
-    using std::runtime_error::runtime_error;
-};
-
-class base64
-{
-public:
-    enum class alphabet
-    {
-        /** the alphabet is detected automatically */
-        auto_,
-        /** the standard base64 alphabet is used */
-        standard,
-        /** like `standard` except that the characters `+` and `/` are replaced by `-` and `_` respectively*/
-        url_filename_safe
-    };
-
-    enum class decoding_behavior
-    {
-        /** if the input is not padded, the remaining bits are ignored */
-        moderate,
-        /** if a padding character is encountered, decoding is finished */
-        loose
-    };
-
-    /**
-     Encodes all the elements from `in_begin` to `in_end` to `out`.
-
-     @warning The source and destination cannot overlap. The destination must be able to hold at least
-     `required_encode_size(std::distance(in_begin, in_end))`, otherwise the behavior depends on the output iterator.
-
-     @tparam Input_iterator the source; the returned elements are cast to `std::uint8_t` and should not be greater than
-     8 bits
-     @tparam Output_iterator the destination; the elements written to it are from the type `char`
-     @param in_begin the beginning of the source
-     @param in_end the ending of the source
-     @param out the destination iterator
-     @param alphabet which alphabet should be used
-     @returns the iterator to the next element past the last element copied
-     @throws see `Input_iterator` and `Output_iterator`
-    */
-    template<typename Input_iterator, typename Output_iterator>
-    static Output_iterator encode(Input_iterator in_begin, Input_iterator in_end, Output_iterator out,
-                                  alphabet alphabet = alphabet::standard)
-    {
-        constexpr auto pad = '=';
-        const char* alpha  = alphabet == alphabet::url_filename_safe
-                                ? "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"
-                                : "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
-
-        while (in_begin != in_end) {
-            std::uint8_t i0 = 0, i1 = 0, i2 = 0;
-
-            // first character
-            i0 = static_cast<std::uint8_t>(*in_begin);
-            ++in_begin;
-
-            *out = alpha[i0 >> 2 & 0x3f];
-            ++out;
-
-            // part of first character and second
-            if (in_begin != in_end) {
-                i1 = static_cast<std::uint8_t>(*in_begin);
-                ++in_begin;
-
-                *out = alpha[((i0 & 0x3) << 4) | (i1 >> 4 & 0x0f)];
-                ++out;
-            } else {
-                *out = alpha[(i0 & 0x3) << 4];
-                ++out;
-
-                // last padding
-                *out = pad;
-                ++out;
-
-                // last padding
-                *out = pad;
-                ++out;
-
-                break;
-            }
-
-            // part of second character and third
-            if (in_begin != in_end) {
-                i2 = static_cast<std::uint8_t>(*in_begin);
-                ++in_begin;
-
-                *out = alpha[((i1 & 0xf) << 2) | (i2 >> 6 & 0x03)];
-                ++out;
-            } else {
-                *out = alpha[(i1 & 0xf) << 2];
-                ++out;
-
-                // last padding
-                *out = pad;
-                ++out;
-
-                break;
-            }
-
-            // rest of third
-            *out = alpha[i2 & 0x3f];
-            ++out;
-        }
-
-        return out;
-    }
-    /**
-     Encodes a string.
-
-     @param str the string that should be encoded
-     @param alphabet which alphabet should be used
-     @returns the encoded base64 string
-     @throws see base64::encode()
-    */
-    static std::string encode(const std::string& str, alphabet alphabet = alphabet::standard)
-    {
-        std::string result;
-
-        result.reserve(required_encode_size(str.length()) + 1);
-
-        encode(str.begin(), str.end(), std::back_inserter(result), alphabet);
-
-        return result;
-    }
-    /**
-     Encodes a char array.
-
-     @param buffer the char array
-     @param size the size of the array
-     @param alphabet which alphabet should be used
-     @returns the encoded string
-    */
-    static std::string encode(const char* buffer, std::size_t size, alphabet alphabet = alphabet::standard)
-    {
-        std::string result;
-
-        result.reserve(required_encode_size(size) + 1);
-
-        encode(buffer, buffer + size, std::back_inserter(result), alphabet);
-
-        return result;
-    }
-    /**
-     Decodes all the elements from `in_begin` to `in_end` to `out`. `in_begin` may point to the same location as `out`,
-     in other words: inplace decoding is possible.
-
-     @warning The destination must be able to hold at least `max_decode_size(std::distance(in_begin, in_end))`,
-     otherwise the behavior depends on the output iterator.
-
-     @tparam Input_iterator the source; the returned elements are cast to `char`
-     @tparam Output_iterator the destination; the elements written to it are from the type `std::uint8_t`
-     @param in_begin the beginning of the source
-     @param in_end the ending of the source
-     @param out the destination iterator
-     @param alphabet which alphabet should be used
-     @param behavior the behavior when an error was detected
-     @returns the iterator to the next element past the last element copied
-     @throws base64_error depending on the set behavior
-     @throws see `Input_iterator` and `Output_iterator`
-    */
-    template<typename Input_iterator, typename Output_iterator>
-    static Output_iterator decode(Input_iterator in_begin, Input_iterator in_end, Output_iterator out,
-                                  alphabet alphabet          = alphabet::auto_,
-                                  decoding_behavior behavior = decoding_behavior::moderate)
-    {
-        //constexpr auto pad = '=';
-        std::uint8_t last  = 0;
-        auto bits          = 0;
-
-        while (in_begin != in_end) {
-            auto c = *in_begin;
-            ++in_begin;
-
-            if (c == '=') {
-                break;
-            }
-
-            auto part = _base64_value(alphabet, c);
-
-            // enough bits for one byte
-            if (bits + 6 >= 8) {
-                *out = (last << (8 - bits)) | (part >> (bits - 2));
-                ++out;
-
-                bits -= 2;
-            } else {
-                bits += 6;
-            }
-
-            last = part;
-        }
-
-        // check padding
-        if (behavior != decoding_behavior::loose) {
-            while (in_begin != in_end) {
-                auto c = *in_begin;
-                ++in_begin;
-
-                if (c != '=') {
-                    throw base64_error("invalid base64 character.");
-                }
-            }
-        }
-
-        return out;
-    }
-    /**
-     Decodes a string.
-
-     @param str the base64 encoded string
-     @param alphabet which alphabet should be used
-     @param behavior the behavior when an error was detected
-     @returns the decoded string
-     @throws see base64::decode()
-    */
-    static std::string decode(const std::string& str, alphabet alphabet = alphabet::auto_,
-                              decoding_behavior behavior = decoding_behavior::moderate)
-    {
-        std::string result;
-
-        result.reserve(max_decode_size(str.length()));
-
-        decode(str.begin(), str.end(), std::back_inserter(result), alphabet, behavior);
-
-        return result;
-    }
-    /**
-     Decodes a char array.
-
-     @param buffer the base64 encoded buffer
-     @param size the size of the buffer
-     @param alphabet which alphabet should be used
-     @param behavior the behavior when an error was detected
-     @returns the decoded string
-     @throws see base64::decode()
-    */
-    static std::string decode(const char* buffer, std::size_t size, alphabet alphabet = alphabet::auto_,
-                              decoding_behavior behavior = decoding_behavior::moderate)
-    {
-        std::string result;
-
-        result.reserve(max_decode_size(size));
-
-        decode(buffer, buffer + size, std::back_inserter(result), alphabet, behavior);
-
-        return result;
-    }
-    /**
-     Decodes a string inplace.
-
-     @param[in,out] str the base64 encoded string
-     @param alphabet which alphabet should be used
-     @param behavior the behavior when an error was detected
-     @throws see base64::decode()
-    */
-    static void decode_inplace(std::string& str, alphabet alphabet = alphabet::auto_,
-                               decoding_behavior behavior = decoding_behavior::moderate)
-    {
-        str.resize(decode(str.begin(), str.end(), str.begin(), alphabet, behavior) - str.begin());
-    }
-    /**
-     Decodes a char array inplace.
-
-     @param[in,out] str the string array
-     @param size the length of the array
-     @param alphabet which alphabet should be used
-     @param behavior the behavior when an error was detected
-     @returns the pointer to the next element past the last element decoded
-     @throws see base64::decode()
-    */
-    static char* decode_inplace(char* str, std::size_t size, alphabet alphabet = alphabet::auto_,
-                                decoding_behavior behavior = decoding_behavior::moderate)
-    {
-        return decode(str, str + size, str, alphabet, behavior);
-    }
-    /**
-     Returns the required decoding size for a given size. The value is calculated with the following formula:
-
-     $$
-     \lceil \frac{size}{4} \rceil \cdot 3
-     $$
-
-     @param size the size of the encoded input
-     @returns the size of the resulting decoded buffer; this is the absolute maximum
-    */
-    static std::size_t max_decode_size(std::size_t size) noexcept
-    {
-        return (size / 4 + (size % 4 ? 1 : 0)) * 3;
-    }
-    /**
-     Returns the required encoding size for a given size. The value is calculated with the following formula:
-
-     $$
-     \lceil \frac{size}{3} \rceil \cdot 4
-     $$
-
-     @param size the size of the decoded input
-     @returns the size of the resulting encoded buffer
-    */
-    static std::size_t required_encode_size(std::size_t size) noexcept
-    {
-        return (size / 3 + (size % 3 ? 1 : 0)) * 4;
-    }
-
-private:
-    static std::uint8_t _base64_value(alphabet& alphabet, char c)
-    {
-        if (c >= 'A' && c <= 'Z') {
-            return c - 'A';
-        } else if (c >= 'a' && c <= 'z') {
-            return c - 'a' + 26;
-        } else if (c >= '0' && c <= '9') {
-            return c - '0' + 52;
-        }
-
-        // comes down to alphabet
-        if (alphabet == alphabet::standard) {
-            if (c == '+') {
-                return 62;
-            } else if (c == '/') {
-                return 63;
-            }
-        } else if (alphabet == alphabet::url_filename_safe) {
-            if (c == '-') {
-                return 62;
-            } else if (c == '_') {
-                return 63;
-            }
-        } // auto detect
-        else {
-            if (c == '+') {
-                alphabet = alphabet::standard;
-
-                return 62;
-            } else if (c == '/') {
-                alphabet = alphabet::standard;
-
-                return 63;
-            } else if (c == '-') {
-                alphabet = alphabet::url_filename_safe;
-
-                return 62;
-            } else if (c == '_') {
-                alphabet = alphabet::url_filename_safe;
-
-                return 63;
-            }
-        }
-
-        throw base64_error("invalid base64 character.");
-    }
-};
-
-#endif // !PUBLIC_DOMAIN_BASE64_HPP_
--- a/common/build-info.cpp.in
+++ b/common/build-info.cpp.in
@@ -1,4 +0,0 @@
-int LLAMA_BUILD_NUMBER = @BUILD_NUMBER@;
-char const *LLAMA_COMMIT = "@BUILD_COMMIT@";
-char const *LLAMA_COMPILER = "@BUILD_COMPILER@";
-char const *LLAMA_BUILD_TARGET = "@BUILD_TARGET@";
--- a/common/common.cpp
+++ b/common/common.cpp
--- a/common/common.h
+++ b/common/common.h
@@ -1,258 +0,0 @@
-// Various helper functions and utilities
-
-#pragma once
-
-#include "llama.h"
-
-#include "sampling.h"
-
-#define LOG_NO_FILE_LINE_FUNCTION
-#include "log.h"
-
-#include <cmath>
-#include <string>
-#include <vector>
-#include <random>
-#include <thread>
-#include <unordered_map>
-#include <tuple>
-
-#ifdef _WIN32
-#define DIRECTORY_SEPARATOR '\\'
-#else
-#define DIRECTORY_SEPARATOR '/'
-#endif // _WIN32
-
-#define die(msg)          do { fputs("error: " msg "\n", stderr);                exit(1); } while (0)
-#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
-
-#define print_build_info() do {                                                                     \
-    fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT);           \
-    fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);    \
-} while(0)
-
-// build info
-extern int LLAMA_BUILD_NUMBER;
-extern char const *LLAMA_COMMIT;
-extern char const *LLAMA_COMPILER;
-extern char const *LLAMA_BUILD_TARGET;
-
-//
-// CLI argument parsing
-//
-int32_t get_num_physical_cores();
-
-struct gpt_params {
-    uint32_t seed                           = -1;    // RNG seed
-
-    int32_t n_threads                       = get_num_physical_cores();
-    int32_t n_threads_draft                 = -1;
-    int32_t n_threads_batch                 = -1;    // number of threads to use for batch processing (-1 = use n_threads)
-    int32_t n_threads_batch_draft           = -1;
-    int32_t n_predict                       = -1;    // new tokens to predict
-    int32_t n_ctx                           = 512;   // context size
-    int32_t n_batch                         = 512;   // batch size for prompt processing (must be >=32 to use BLAS)
-    int32_t n_keep                          = 0;     // number of tokens to keep from initial prompt
-    int32_t n_draft                         = 8;     // number of tokens to draft during speculative decoding
-    int32_t n_chunks                        = -1;    // max number of chunks to process (-1 = unlimited)
-    int32_t n_parallel                      = 1;     // number of parallel sequences to decode
-    int32_t n_sequences                     = 1;     // number of sequences to decode
-    float   p_accept                        = 0.5f;  // speculative decoding accept probability
-    float   p_split                         = 0.1f;  // speculative decoding split probability
-    int32_t n_gpu_layers                    = -1;    // number of layers to store in VRAM (-1 - use default)
-    int32_t n_gpu_layers_draft              = -1;    // number of layers to store in VRAM for the draft model (-1 - use default)
-    llama_split_mode split_mode             = LLAMA_SPLIT_LAYER; // how to split the model across GPUs
-    int32_t main_gpu                        = 0;     // the GPU that is used for scratch and small tensors
-    float   tensor_split[LLAMA_MAX_DEVICES] = {0};   // how split tensors should be distributed across GPUs
-    int32_t n_beams                         = 0;     // if non-zero then use beam search of given width.
-    int32_t grp_attn_n                      = 1;     // group-attention factor
-    int32_t grp_attn_w                      = 512;   // group-attention width
-    int32_t n_print                         = -1;    // print token count every n tokens (-1 = disabled)
-    float   rope_freq_base                  = 0.0f;  // RoPE base frequency
-    float   rope_freq_scale                 = 0.0f;  // RoPE frequency scaling factor
-    float   yarn_ext_factor                 = -1.0f; // YaRN extrapolation mix factor
-    float   yarn_attn_factor                = 1.0f;  // YaRN magnitude scaling factor
-    float   yarn_beta_fast                  = 32.0f; // YaRN low correction dim
-    float   yarn_beta_slow                  = 1.0f;  // YaRN high correction dim
-    int32_t yarn_orig_ctx                   = 0;     // YaRN original context length
-    int8_t  rope_scaling_type               = LLAMA_ROPE_SCALING_UNSPECIFIED; // TODO: better to be int32_t for alignment
-                                                                              //       pinging @cebtenzzre
-
-    // // sampling parameters
-    struct llama_sampling_params sparams;
-
-    std::string model             = "models/7B/ggml-model-f16.gguf"; // model path
-    std::string model_draft       = "";                              // draft model for speculative decoding
-    std::string model_alias       = "unknown"; // model alias
-    std::string prompt            = "";
-    std::string prompt_file       = "";  // store the external prompt file name
-    std::string path_prompt_cache = "";  // path to file for saving/loading prompt eval state
-    std::string input_prefix      = "";  // string to prefix user inputs with
-    std::string input_suffix      = "";  // string to suffix user inputs with
-    std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
-    std::string logdir            = "";  // directory in which to save YAML log files
-    std::string logits_file       = "";  // file for saving *all* logits
-
-    std::vector<llama_model_kv_override> kv_overrides;
-
-    // TODO: avoid tuple, use struct
-    std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
-    std::string lora_base  = "";                              // base model path for the lora adapter
-
-    int  ppl_stride        = 0;     // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
-    int  ppl_output_type   = 0;     // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
-                                    //                                       (which is more convenient to use for plotting)
-                                    //
-    bool   hellaswag       = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
-    size_t hellaswag_tasks = 400;   // number of tasks to use when computing the HellaSwag score
-
-    bool   winogrande      = false; // compute Winogrande score over random tasks from datafile supplied in prompt
-    size_t winogrande_tasks= 0;     // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed
-
-    bool   multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
-    size_t multiple_choice_tasks = 0;     // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed
-
-    bool   kl_divergence   = false; // compute KL-divergence
-
-    bool mul_mat_q         = true;  // if true, use mul_mat_q kernels instead of cuBLAS
-    bool random_prompt     = false; // do not randomize prompt if none provided
-    bool use_color         = false; // use color to distinguish generations and inputs
-    bool interactive       = false; // interactive mode
-    bool chatml            = false; // chatml mode (used for models trained on chatml syntax)
-    bool prompt_cache_all  = false; // save user input and generations to prompt cache
-    bool prompt_cache_ro   = false; // open the prompt cache read-only and do not update it
-
-    bool embedding         = false; // get only sentence embedding
-    bool escape            = false; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
-    bool interactive_first = false; // wait for user input immediately
-    bool multiline_input   = false; // reverse the usage of `\`
-    bool simple_io         = false; // improves compatibility with subprocesses and limited consoles
-    bool cont_batching     = false; // insert new sequences for decoding on-the-fly
-
-    bool input_prefix_bos  = false; // prefix BOS to user inputs, preceding input_prefix
-    bool ignore_eos        = false; // ignore generated EOS tokens
-    bool instruct          = false; // instruction mode (used for Alpaca models)
-    bool logits_all        = false; // return logits for all tokens in the batch
-    bool use_mmap          = true;  // use mmap for faster loads
-    bool use_mlock         = false; // use mlock to keep model in memory
-    bool numa              = false; // attempt optimizations that help on some NUMA systems
-    bool verbose_prompt    = false; // print prompt tokens before generation
-    bool display_prompt    = true;  // print prompt before generation
-    bool infill            = false; // use infill mode
-    bool dump_kv_cache     = false; // dump the KV cache contents for debugging purposes
-    bool no_kv_offload     = false; // disable KV offloading
-
-    std::string cache_type_k = "f16"; // KV cache data type for the K
-    std::string cache_type_v = "f16"; // KV cache data type for the V
-
-    // multimodal models (see examples/llava)
-    std::string mmproj = ""; // path to multimodal projector
-    std::string image  = ""; // path to an image file
-};
-
-bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params);
-
-bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
-
-void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
-
-std::string get_system_info(const gpt_params & params);
-
-std::string gpt_random_prompt(std::mt19937 & rng);
-
-void process_escapes(std::string& input);
-
-//
-// String parsing
-//
-
-std::string parse_samplers_input(std::string input);
-
-//
-// Model utils
-//
-
-// TODO: avoid tuplue, use struct
-std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params);
-
-struct llama_model_params   llama_model_params_from_gpt_params  (const gpt_params & params);
-struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
-
-// Batch utils
-
-void llama_batch_clear(struct llama_batch & batch);
-
-void llama_batch_add(
-                 struct llama_batch & batch,
-                        llama_token   id,
-                          llama_pos   pos,
-    const std::vector<llama_seq_id> & seq_ids,
-                               bool   logits);
-
-//
-// Vocab utils
-//
-
-// tokenizes a string into a vector of tokens
-// should work similar to Python's `tokenizer.encode`
-std::vector<llama_token> llama_tokenize(
-  const struct llama_context * ctx,
-           const std::string & text,
-                        bool   add_bos,
-                        bool   special = false);
-
-std::vector<llama_token> llama_tokenize(
-    const struct llama_model * model,
-           const std::string & text,
-                        bool   add_bos,
-                        bool   special = false);
-
-// tokenizes a token into a piece
-// should work similar to Python's `tokenizer.id_to_piece`
-std::string llama_token_to_piece(
-        const struct llama_context * ctx,
-                       llama_token   token);
-
-// TODO: these should be moved in llama.h C-style API under single `llama_detokenize` function
-//       that takes into account the tokenizer type and decides how to handle the leading space
-//
-// detokenizes a vector of tokens into a string
-// should work similar to Python's `tokenizer.decode`
-// removes the leading space from the first non-BOS token
-std::string llama_detokenize_spm(
-                         llama_context * ctx,
-        const std::vector<llama_token> & tokens);
-
-// detokenizes a vector of tokens into a string
-// should work similar to Python's `tokenizer.decode`
-std::string llama_detokenize_bpe(
-                         llama_context * ctx,
-        const std::vector<llama_token> & tokens);
-
-// Uses the value from the model metadata if possible, otherwise
-// defaults to true when model type is SPM, otherwise false.
-bool llama_should_add_bos_token(const llama_model * model);
-
-//
-// YAML utils
-//
-
-bool create_directory_with_parents(const std::string & path);
-void dump_vector_float_yaml(FILE * stream, const char * prop_name, const std::vector<float> & data);
-void dump_vector_int_yaml(FILE * stream, const char * prop_name, const std::vector<int> & data);
-void dump_string_yaml_multiline(FILE * stream, const char * prop_name, const char * data);
-std::string get_sortable_timestamp();
-
-void dump_non_result_info_yaml(
-    FILE * stream, const gpt_params & params, const llama_context * lctx,
-    const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
-
-//
-// KV cache utils
-//
-
-// Dump the KV cache view with the number of sequences per cell.
-void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size = 80);
-
-// Dump the KV cache view showing individual sequences in each cell (long output).
-void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
--- a/common/console.cpp
+++ b/common/console.cpp
@@ -1,501 +0,0 @@
-#include "console.h"
-#include <vector>
-#include <iostream>
-
-#if defined(_WIN32)
-#define WIN32_LEAN_AND_MEAN
-#ifndef NOMINMAX
-#define NOMINMAX
-#endif
-#include <windows.h>
-#include <fcntl.h>
-#include <io.h>
-#ifndef ENABLE_VIRTUAL_TERMINAL_PROCESSING
-#define ENABLE_VIRTUAL_TERMINAL_PROCESSING 0x0004
-#endif
-#else
-#include <climits>
-#include <sys/ioctl.h>
-#include <unistd.h>
-#include <wchar.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <signal.h>
-#include <termios.h>
-#endif
-
-#define ANSI_COLOR_RED     "\x1b[31m"
-#define ANSI_COLOR_GREEN   "\x1b[32m"
-#define ANSI_COLOR_YELLOW  "\x1b[33m"
-#define ANSI_COLOR_BLUE    "\x1b[34m"
-#define ANSI_COLOR_MAGENTA "\x1b[35m"
-#define ANSI_COLOR_CYAN    "\x1b[36m"
-#define ANSI_COLOR_RESET   "\x1b[0m"
-#define ANSI_BOLD          "\x1b[1m"
-
-namespace console {
-
-    //
-    // Console state
-    //
-
-    static bool      advanced_display = false;
-    static bool      simple_io        = true;
-    static display_t current_display  = reset;
-
-    static FILE*     out              = stdout;
-
-#if defined (_WIN32)
-    static void*     hConsole;
-#else
-    static FILE*     tty              = nullptr;
-    static termios   initial_state;
-#endif
-
-    //
-    // Init and cleanup
-    //
-
-    void init(bool use_simple_io, bool use_advanced_display) {
-        advanced_display = use_advanced_display;
-        simple_io = use_simple_io;
-#if defined(_WIN32)
-        // Windows-specific console initialization
-        DWORD dwMode = 0;
-        hConsole = GetStdHandle(STD_OUTPUT_HANDLE);
-        if (hConsole == INVALID_HANDLE_VALUE || !GetConsoleMode(hConsole, &dwMode)) {
-            hConsole = GetStdHandle(STD_ERROR_HANDLE);
-            if (hConsole != INVALID_HANDLE_VALUE && (!GetConsoleMode(hConsole, &dwMode))) {
-                hConsole = nullptr;
-                simple_io = true;
-            }
-        }
-        if (hConsole) {
-            // Check conditions combined to reduce nesting
-            if (advanced_display && !(dwMode & ENABLE_VIRTUAL_TERMINAL_PROCESSING) &&
-                !SetConsoleMode(hConsole, dwMode | ENABLE_VIRTUAL_TERMINAL_PROCESSING)) {
-                advanced_display = false;
-            }
-            // Set console output codepage to UTF8
-            SetConsoleOutputCP(CP_UTF8);
-        }
-        HANDLE hConIn = GetStdHandle(STD_INPUT_HANDLE);
-        if (hConIn != INVALID_HANDLE_VALUE && GetConsoleMode(hConIn, &dwMode)) {
-            // Set console input codepage to UTF16
-            _setmode(_fileno(stdin), _O_WTEXT);
-
-            // Set ICANON (ENABLE_LINE_INPUT) and ECHO (ENABLE_ECHO_INPUT)
-            if (simple_io) {
-                dwMode |= ENABLE_LINE_INPUT | ENABLE_ECHO_INPUT;
-            } else {
-                dwMode &= ~(ENABLE_LINE_INPUT | ENABLE_ECHO_INPUT);
-            }
-            if (!SetConsoleMode(hConIn, dwMode)) {
-                simple_io = true;
-            }
-        }
-#else
-        // POSIX-specific console initialization
-        if (!simple_io) {
-            struct termios new_termios;
-            tcgetattr(STDIN_FILENO, &initial_state);
-            new_termios = initial_state;
-            new_termios.c_lflag &= ~(ICANON | ECHO);
-            new_termios.c_cc[VMIN] = 1;
-            new_termios.c_cc[VTIME] = 0;
-            tcsetattr(STDIN_FILENO, TCSANOW, &new_termios);
-
-            tty = fopen("/dev/tty", "w+");
-            if (tty != nullptr) {
-                out = tty;
-            }
-        }
-
-        setlocale(LC_ALL, "");
-#endif
-    }
-
-    void cleanup() {
-        // Reset console display
-        set_display(reset);
-
-#if !defined(_WIN32)
-        // Restore settings on POSIX systems
-        if (!simple_io) {
-            if (tty != nullptr) {
-                out = stdout;
-                fclose(tty);
-                tty = nullptr;
-            }
-            tcsetattr(STDIN_FILENO, TCSANOW, &initial_state);
-        }
-#endif
-    }
-
-    //
-    // Display and IO
-    //
-
-    // Keep track of current display and only emit ANSI code if it changes
-    void set_display(display_t display) {
-        if (advanced_display && current_display != display) {
-            fflush(stdout);
-            switch(display) {
-                case reset:
-                    fprintf(out, ANSI_COLOR_RESET);
-                    break;
-                case prompt:
-                    fprintf(out, ANSI_COLOR_YELLOW);
-                    break;
-                case user_input:
-                    fprintf(out, ANSI_BOLD ANSI_COLOR_GREEN);
-                    break;
-                case error:
-                    fprintf(out, ANSI_BOLD ANSI_COLOR_RED);
-            }
-            current_display = display;
-            fflush(out);
-        }
-    }
-
-    static char32_t getchar32() {
-#if defined(_WIN32)
-        HANDLE hConsole = GetStdHandle(STD_INPUT_HANDLE);
-        wchar_t high_surrogate = 0;
-
-        while (true) {
-            INPUT_RECORD record;
-            DWORD count;
-            if (!ReadConsoleInputW(hConsole, &record, 1, &count) || count == 0) {
-                return WEOF;
-            }
-
-            if (record.EventType == KEY_EVENT && record.Event.KeyEvent.bKeyDown) {
-                wchar_t wc = record.Event.KeyEvent.uChar.UnicodeChar;
-                if (wc == 0) {
-                    continue;
-                }
-
-                if ((wc >= 0xD800) && (wc <= 0xDBFF)) { // Check if wc is a high surrogate
-                    high_surrogate = wc;
-                    continue;
-                }
-                if ((wc >= 0xDC00) && (wc <= 0xDFFF)) { // Check if wc is a low surrogate
-                    if (high_surrogate != 0) { // Check if we have a high surrogate
-                        return ((high_surrogate - 0xD800) << 10) + (wc - 0xDC00) + 0x10000;
-                    }
-                }
-
-                high_surrogate = 0; // Reset the high surrogate
-                return static_cast<char32_t>(wc);
-            }
-        }
-#else
-        wchar_t wc = getwchar();
-        if (static_cast<wint_t>(wc) == WEOF) {
-            return WEOF;
-        }
-
-#if WCHAR_MAX == 0xFFFF
-        if ((wc >= 0xD800) && (wc <= 0xDBFF)) { // Check if wc is a high surrogate
-            wchar_t low_surrogate = getwchar();
-            if ((low_surrogate >= 0xDC00) && (low_surrogate <= 0xDFFF)) { // Check if the next wchar is a low surrogate
-                return (static_cast<char32_t>(wc & 0x03FF) << 10) + (low_surrogate & 0x03FF) + 0x10000;
-            }
-        }
-        if ((wc >= 0xD800) && (wc <= 0xDFFF)) { // Invalid surrogate pair
-            return 0xFFFD; // Return the replacement character U+FFFD
-        }
-#endif
-
-        return static_cast<char32_t>(wc);
-#endif
-    }
-
-    static void pop_cursor() {
-#if defined(_WIN32)
-        if (hConsole != NULL) {
-            CONSOLE_SCREEN_BUFFER_INFO bufferInfo;
-            GetConsoleScreenBufferInfo(hConsole, &bufferInfo);
-
-            COORD newCursorPosition = bufferInfo.dwCursorPosition;
-            if (newCursorPosition.X == 0) {
-                newCursorPosition.X = bufferInfo.dwSize.X - 1;
-                newCursorPosition.Y -= 1;
-            } else {
-                newCursorPosition.X -= 1;
-            }
-
-            SetConsoleCursorPosition(hConsole, newCursorPosition);
-            return;
-        }
-#endif
-        putc('\b', out);
-    }
-
-    static int estimateWidth(char32_t codepoint) {
-#if defined(_WIN32)
-        (void)codepoint;
-        return 1;
-#else
-        return wcwidth(codepoint);
-#endif
-    }
-
-    static int put_codepoint(const char* utf8_codepoint, size_t length, int expectedWidth) {
-#if defined(_WIN32)
-        CONSOLE_SCREEN_BUFFER_INFO bufferInfo;
-        if (!GetConsoleScreenBufferInfo(hConsole, &bufferInfo)) {
-            // go with the default
-            return expectedWidth;
-        }
-        COORD initialPosition = bufferInfo.dwCursorPosition;
-        DWORD nNumberOfChars = length;
-        WriteConsole(hConsole, utf8_codepoint, nNumberOfChars, &nNumberOfChars, NULL);
-
-        CONSOLE_SCREEN_BUFFER_INFO newBufferInfo;
-        GetConsoleScreenBufferInfo(hConsole, &newBufferInfo);
-
-        // Figure out our real position if we're in the last column
-        if (utf8_codepoint[0] != 0x09 && initialPosition.X == newBufferInfo.dwSize.X - 1) {
-            DWORD nNumberOfChars;
-            WriteConsole(hConsole, &" \b", 2, &nNumberOfChars, NULL);
-            GetConsoleScreenBufferInfo(hConsole, &newBufferInfo);
-        }
-
-        int width = newBufferInfo.dwCursorPosition.X - initialPosition.X;
-        if (width < 0) {
-            width += newBufferInfo.dwSize.X;
-        }
-        return width;
-#else
-        // We can trust expectedWidth if we've got one
-        if (expectedWidth >= 0 || tty == nullptr) {
-            fwrite(utf8_codepoint, length, 1, out);
-            return expectedWidth;
-        }
-
-        fputs("\033[6n", tty); // Query cursor position
-        int x1;
-        int y1;
-        int x2;
-        int y2;
-        int results = 0;
-        results = fscanf(tty, "\033[%d;%dR", &y1, &x1);
-
-        fwrite(utf8_codepoint, length, 1, tty);
-
-        fputs("\033[6n", tty); // Query cursor position
-        results += fscanf(tty, "\033[%d;%dR", &y2, &x2);
-
-        if (results != 4) {
-            return expectedWidth;
-        }
-
-        int width = x2 - x1;
-        if (width < 0) {
-            // Calculate the width considering text wrapping
-            struct winsize w;
-            ioctl(STDOUT_FILENO, TIOCGWINSZ, &w);
-            width += w.ws_col;
-        }
-        return width;
-#endif
-    }
-
-    static void replace_last(char ch) {
-#if defined(_WIN32)
-        pop_cursor();
-        put_codepoint(&ch, 1, 1);
-#else
-        fprintf(out, "\b%c", ch);
-#endif
-    }
-
-    static void append_utf8(char32_t ch, std::string & out) {
-        if (ch <= 0x7F) {
-            out.push_back(static_cast<unsigned char>(ch));
-        } else if (ch <= 0x7FF) {
-            out.push_back(static_cast<unsigned char>(0xC0 | ((ch >> 6) & 0x1F)));
-            out.push_back(static_cast<unsigned char>(0x80 | (ch & 0x3F)));
-        } else if (ch <= 0xFFFF) {
-            out.push_back(static_cast<unsigned char>(0xE0 | ((ch >> 12) & 0x0F)));
-            out.push_back(static_cast<unsigned char>(0x80 | ((ch >> 6) & 0x3F)));
-            out.push_back(static_cast<unsigned char>(0x80 | (ch & 0x3F)));
-        } else if (ch <= 0x10FFFF) {
-            out.push_back(static_cast<unsigned char>(0xF0 | ((ch >> 18) & 0x07)));
-            out.push_back(static_cast<unsigned char>(0x80 | ((ch >> 12) & 0x3F)));
-            out.push_back(static_cast<unsigned char>(0x80 | ((ch >> 6) & 0x3F)));
-            out.push_back(static_cast<unsigned char>(0x80 | (ch & 0x3F)));
-        } else {
-            // Invalid Unicode code point
-        }
-    }
-
-    // Helper function to remove the last UTF-8 character from a string
-    static void pop_back_utf8_char(std::string & line) {
-        if (line.empty()) {
-            return;
-        }
-
-        size_t pos = line.length() - 1;
-
-        // Find the start of the last UTF-8 character (checking up to 4 bytes back)
-        for (size_t i = 0; i < 3 && pos > 0; ++i, --pos) {
-            if ((line[pos] & 0xC0) != 0x80) {
-                break; // Found the start of the character
-            }
-        }
-        line.erase(pos);
-    }
-
-    static bool readline_advanced(std::string & line, bool multiline_input) {
-        if (out != stdout) {
-            fflush(stdout);
-        }
-
-        line.clear();
-        std::vector<int> widths;
-        bool is_special_char = false;
-        bool end_of_stream = false;
-
-        char32_t input_char;
-        while (true) {
-            fflush(out); // Ensure all output is displayed before waiting for input
-            input_char = getchar32();
-
-            if (input_char == '\r' || input_char == '\n') {
-                break;
-            }
-
-            if (input_char == (char32_t) WEOF || input_char == 0x04 /* Ctrl+D*/) {
-                end_of_stream = true;
-                break;
-            }
-
-            if (is_special_char) {
-                set_display(user_input);
-                replace_last(line.back());
-                is_special_char = false;
-            }
-
-            if (input_char == '\033') { // Escape sequence
-                char32_t code = getchar32();
-                if (code == '[' || code == 0x1B) {
-                    // Discard the rest of the escape sequence
-                    while ((code = getchar32()) != (char32_t) WEOF) {
-                        if ((code >= 'A' && code <= 'Z') || (code >= 'a' && code <= 'z') || code == '~') {
-                            break;
-                        }
-                    }
-                }
-            } else if (input_char == 0x08 || input_char == 0x7F) { // Backspace
-                if (!widths.empty()) {
-                    int count;
-                    do {
-                        count = widths.back();
-                        widths.pop_back();
-                        // Move cursor back, print space, and move cursor back again
-                        for (int i = 0; i < count; i++) {
-                            replace_last(' ');
-                            pop_cursor();
-                        }
-                        pop_back_utf8_char(line);
-                    } while (count == 0 && !widths.empty());
-                }
-            } else {
-                int offset = line.length();
-                append_utf8(input_char, line);
-                int width = put_codepoint(line.c_str() + offset, line.length() - offset, estimateWidth(input_char));
-                if (width < 0) {
-                    width = 0;
-                }
-                widths.push_back(width);
-            }
-
-            if (!line.empty() && (line.back() == '\\' || line.back() == '/')) {
-                set_display(prompt);
-                replace_last(line.back());
-                is_special_char = true;
-            }
-        }
-
-        bool has_more = multiline_input;
-        if (is_special_char) {
-            replace_last(' ');
-            pop_cursor();
-
-            char last = line.back();
-            line.pop_back();
-            if (last == '\\') {
-                line += '\n';
-                fputc('\n', out);
-                has_more = !has_more;
-            } else {
-                // llama will just eat the single space, it won't act as a space
-                if (line.length() == 1 && line.back() == ' ') {
-                    line.clear();
-                    pop_cursor();
-                }
-                has_more = false;
-            }
-        } else {
-            if (end_of_stream) {
-                has_more = false;
-            } else {
-                line += '\n';
-                fputc('\n', out);
-            }
-        }
-
-        fflush(out);
-        return has_more;
-    }
-
-    static bool readline_simple(std::string & line, bool multiline_input) {
-#if defined(_WIN32)
-        std::wstring wline;
-        if (!std::getline(std::wcin, wline)) {
-            // Input stream is bad or EOF received
-            line.clear();
-            GenerateConsoleCtrlEvent(CTRL_C_EVENT, 0);
-            return false;
-        }
-
-        int size_needed = WideCharToMultiByte(CP_UTF8, 0, &wline[0], (int)wline.size(), NULL, 0, NULL, NULL);
-        line.resize(size_needed);
-        WideCharToMultiByte(CP_UTF8, 0, &wline[0], (int)wline.size(), &line[0], size_needed, NULL, NULL);
-#else
-        if (!std::getline(std::cin, line)) {
-            // Input stream is bad or EOF received
-            line.clear();
-            return false;
-        }
-#endif
-        if (!line.empty()) {
-            char last = line.back();
-            if (last == '/') { // Always return control on '/' symbol
-                line.pop_back();
-                return false;
-            }
-            if (last == '\\') { // '\\' changes the default action
-                line.pop_back();
-                multiline_input = !multiline_input;
-            }
-        }
-        line += '\n';
-
-        // By default, continue input if multiline_input is set
-        return multiline_input;
-    }
-
-    bool readline(std::string & line, bool multiline_input) {
-        set_display(user_input);
-
-        if (simple_io) {
-            return readline_simple(line, multiline_input);
-        }
-        return readline_advanced(line, multiline_input);
-    }
-
-}
--- a/common/console.h
+++ b/common/console.h
@@ -1,19 +0,0 @@
-// Console functions
-
-#pragma once
-
-#include <string>
-
-namespace console {
-    enum display_t {
-        reset = 0,
-        prompt,
-        user_input,
-        error
-    };
-
-    void init(bool use_simple_io, bool use_advanced_display);
-    void cleanup();
-    void set_display(display_t display);
-    bool readline(std::string & line, bool multiline_input);
-}
--- a/common/grammar-parser.cpp
+++ b/common/grammar-parser.cpp
@@ -1,424 +0,0 @@
-#include "grammar-parser.h"
-#include <cstdint>
-#include <cwchar>
-#include <string>
-#include <utility>
-#include <stdexcept>
-#include <exception>
-
-namespace grammar_parser {
-    // NOTE: assumes valid utf8 (but checks for overrun)
-    // copied from llama.cpp
-    static std::pair<uint32_t, const char *> decode_utf8(const char * src) {
-        static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
-        uint8_t  first_byte = static_cast<uint8_t>(*src);
-        uint8_t  highbits   = first_byte >> 4;
-        int      len        = lookup[highbits];
-        uint8_t  mask       = (1 << (8 - len)) - 1;
-        uint32_t value      = first_byte & mask;
-        const char * end    = src + len; // may overrun!
-        const char * pos    = src + 1;
-        for ( ; pos < end && *pos; pos++) {
-            value = (value << 6) + (static_cast<uint8_t>(*pos) & 0x3F);
-        }
-        return std::make_pair(value, pos);
-    }
-
-    static uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) {
-        uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
-        auto result = state.symbol_ids.insert(std::make_pair(std::string(src, len), next_id));
-        return result.first->second;
-    }
-
-    static uint32_t generate_symbol_id(parse_state & state, const std::string & base_name) {
-        uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
-        state.symbol_ids[base_name + '_' + std::to_string(next_id)] = next_id;
-        return next_id;
-    }
-
-    static void add_rule(
-            parse_state & state,
-            uint32_t      rule_id,
-            const std::vector<llama_grammar_element> & rule) {
-        if (state.rules.size() <= rule_id) {
-            state.rules.resize(rule_id + 1);
-        }
-        state.rules[rule_id] = rule;
-    }
-
-    static bool is_word_char(char c) {
-        return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || ('0' <= c && c <= '9');
-    }
-
-    static std::pair<uint32_t, const char *> parse_hex(const char * src, int size) {
-        const char * pos   = src;
-        const char * end   = src + size;
-        uint32_t     value = 0;
-        for ( ; pos < end && *pos; pos++) {
-            value <<= 4;
-            char c = *pos;
-            if ('a' <= c && c <= 'f') {
-                value += c - 'a' + 10;
-            } else if ('A' <= c && c <= 'F') {
-                value += c - 'A' + 10;
-            } else if ('0' <= c && c <= '9') {
-                value += c - '0';
-            } else {
-                break;
-            }
-        }
-        if (pos != end) {
-            throw std::runtime_error("expecting " + std::to_string(size) + " hex chars at " + src);
-        }
-        return std::make_pair(value, pos);
-    }
-
-    static const char * parse_space(const char * src, bool newline_ok) {
-        const char * pos = src;
-        while (*pos == ' ' || *pos == '\t' || *pos == '#' ||
-                (newline_ok && (*pos == '\r' || *pos == '\n'))) {
-            if (*pos == '#') {
-                while (*pos && *pos != '\r' && *pos != '\n') {
-                    pos++;
-                }
-            } else {
-                pos++;
-            }
-        }
-        return pos;
-    }
-
-    static const char * parse_name(const char * src) {
-        const char * pos = src;
-        while (is_word_char(*pos)) {
-            pos++;
-        }
-        if (pos == src) {
-            throw std::runtime_error(std::string("expecting name at ") + src);
-        }
-        return pos;
-    }
-
-    static std::pair<uint32_t, const char *> parse_char(const char * src) {
-        if (*src == '\\') {
-            switch (src[1]) {
-                case 'x': return parse_hex(src + 2, 2);
-                case 'u': return parse_hex(src + 2, 4);
-                case 'U': return parse_hex(src + 2, 8);
-                case 't': return std::make_pair('\t', src + 2);
-                case 'r': return std::make_pair('\r', src + 2);
-                case 'n': return std::make_pair('\n', src + 2);
-                case '\\':
-                case '"':
-                case '[':
-                case ']':
-                    return std::make_pair(src[1], src + 2);
-                default:
-                    throw std::runtime_error(std::string("unknown escape at ") + src);
-            }
-        } else if (*src) {
-            return decode_utf8(src);
-        }
-        throw std::runtime_error("unexpected end of input");
-    }
-
-    const char * parse_alternates(
-            parse_state       & state,
-            const char        * src,
-            const std::string & rule_name,
-            uint32_t            rule_id,
-            bool                is_nested);
-
-    static const char * parse_sequence(
-            parse_state                        & state,
-            const char                         * src,
-            const std::string                  & rule_name,
-            std::vector<llama_grammar_element> & out_elements,
-            bool                                 is_nested) {
-        size_t last_sym_start = out_elements.size();
-        const char * pos = src;
-        while (*pos) {
-            if (*pos == '"') { // literal string
-                pos++;
-                last_sym_start = out_elements.size();
-                while (*pos != '"') {
-                    auto char_pair = parse_char(pos);
-                         pos       = char_pair.second;
-                    out_elements.push_back({LLAMA_GRETYPE_CHAR, char_pair.first});
-                }
-                pos = parse_space(pos + 1, is_nested);
-            } else if (*pos == '[') { // char range(s)
-                pos++;
-                enum llama_gretype start_type = LLAMA_GRETYPE_CHAR;
-                if (*pos == '^') {
-                    pos++;
-                    start_type = LLAMA_GRETYPE_CHAR_NOT;
-                }
-                last_sym_start = out_elements.size();
-                while (*pos != ']') {
-                    auto char_pair = parse_char(pos);
-                         pos       = char_pair.second;
-                    enum llama_gretype type = last_sym_start < out_elements.size()
-                        ? LLAMA_GRETYPE_CHAR_ALT
-                        : start_type;
-
-                    out_elements.push_back({type, char_pair.first});
-                    if (pos[0] == '-' && pos[1] != ']') {
-                        auto endchar_pair = parse_char(pos + 1);
-                             pos          = endchar_pair.second;
-                        out_elements.push_back({LLAMA_GRETYPE_CHAR_RNG_UPPER, endchar_pair.first});
-                    }
-                }
-                pos = parse_space(pos + 1, is_nested);
-            } else if (is_word_char(*pos)) { // rule reference
-                const char * name_end    = parse_name(pos);
-                uint32_t     ref_rule_id = get_symbol_id(state, pos, name_end - pos);
-                pos = parse_space(name_end, is_nested);
-                last_sym_start = out_elements.size();
-                out_elements.push_back({LLAMA_GRETYPE_RULE_REF, ref_rule_id});
-            } else if (*pos == '(') { // grouping
-                // parse nested alternates into synthesized rule
-                pos = parse_space(pos + 1, true);
-                uint32_t sub_rule_id = generate_symbol_id(state, rule_name);
-                pos = parse_alternates(state, pos, rule_name, sub_rule_id, true);
-                last_sym_start = out_elements.size();
-                // output reference to synthesized rule
-                out_elements.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
-                if (*pos != ')') {
-                    throw std::runtime_error(std::string("expecting ')' at ") + pos);
-                }
-                pos = parse_space(pos + 1, is_nested);
-            } else if (*pos == '*' || *pos == '+' || *pos == '?') { // repetition operator
-                if (last_sym_start == out_elements.size()) {
-                    throw std::runtime_error(std::string("expecting preceding item to */+/? at ") + pos);
-                }
-
-                // apply transformation to previous symbol (last_sym_start to end) according to
-                // rewrite rules:
-                // S* --> S' ::= S S' |
-                // S+ --> S' ::= S S' | S
-                // S? --> S' ::= S |
-                uint32_t sub_rule_id = generate_symbol_id(state, rule_name);
-                std::vector<llama_grammar_element> sub_rule;
-                // add preceding symbol to generated rule
-                sub_rule.insert(
-                    sub_rule.end(), out_elements.begin() + last_sym_start, out_elements.end());
-                if (*pos == '*' || *pos == '+') {
-                    // cause generated rule to recurse
-                    sub_rule.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
-                }
-                // mark start of alternate def
-                sub_rule.push_back({LLAMA_GRETYPE_ALT, 0});
-                if (*pos == '+') {
-                    // add preceding symbol as alternate only for '+' (otherwise empty)
-                    sub_rule.insert(
-                        sub_rule.end(), out_elements.begin() + last_sym_start, out_elements.end());
-                }
-                sub_rule.push_back({LLAMA_GRETYPE_END, 0});
-                add_rule(state, sub_rule_id, sub_rule);
-
-                // in original rule, replace previous symbol with reference to generated rule
-                out_elements.resize(last_sym_start);
-                out_elements.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
-
-                pos = parse_space(pos + 1, is_nested);
-            } else {
-                break;
-            }
-        }
-        return pos;
-    }
-
-    const char * parse_alternates(
-            parse_state       & state,
-            const char        * src,
-            const std::string & rule_name,
-            uint32_t            rule_id,
-            bool                is_nested) {
-        std::vector<llama_grammar_element> rule;
-        const char * pos = parse_sequence(state, src, rule_name, rule, is_nested);
-        while (*pos == '|') {
-            rule.push_back({LLAMA_GRETYPE_ALT, 0});
-            pos = parse_space(pos + 1, true);
-            pos = parse_sequence(state, pos, rule_name, rule, is_nested);
-        }
-        rule.push_back({LLAMA_GRETYPE_END, 0});
-        add_rule(state, rule_id, rule);
-        return pos;
-    }
-
-    static const char * parse_rule(parse_state & state, const char * src) {
-        const char * name_end = parse_name(src);
-        const char * pos      = parse_space(name_end, false);
-        size_t       name_len = name_end - src;
-        uint32_t     rule_id  = get_symbol_id(state, src, name_len);
-        const std::string name(src, name_len);
-
-        if (!(pos[0] == ':' && pos[1] == ':' && pos[2] == '=')) {
-            throw std::runtime_error(std::string("expecting ::= at ") + pos);
-        }
-        pos = parse_space(pos + 3, true);
-
-        pos = parse_alternates(state, pos, name, rule_id, false);
-
-        if (*pos == '\r') {
-            pos += pos[1] == '\n' ? 2 : 1;
-        } else if (*pos == '\n') {
-            pos++;
-        } else if (*pos) {
-            throw std::runtime_error(std::string("expecting newline or end at ") + pos);
-        }
-        return parse_space(pos, true);
-    }
-
-    parse_state parse(const char * src) {
-        try {
-            parse_state state;
-            const char * pos = parse_space(src, true);
-            while (*pos) {
-                pos = parse_rule(state, pos);
-            }
-            return state;
-        } catch (const std::exception & err) {
-            fprintf(stderr, "%s: error parsing grammar: %s\n", __func__, err.what());
-            return parse_state();
-        }
-    }
-
-    static void print_grammar_char(FILE * file, uint32_t c) {
-        if (0x20 <= c && c <= 0x7f) {
-            fprintf(file, "%c", static_cast<char>(c));
-        } else {
-            // cop out of encoding UTF-8
-            fprintf(file, "<U+%04X>", c);
-        }
-    }
-
-    static bool is_char_element(llama_grammar_element elem) {
-        switch (elem.type) {
-            case LLAMA_GRETYPE_CHAR:           return true;
-            case LLAMA_GRETYPE_CHAR_NOT:       return true;
-            case LLAMA_GRETYPE_CHAR_ALT:       return true;
-            case LLAMA_GRETYPE_CHAR_RNG_UPPER: return true;
-            default:                           return false;
-        }
-    }
-
-    static void print_rule_binary(FILE * file, const std::vector<llama_grammar_element> & rule) {
-        for (auto elem : rule) {
-            switch (elem.type) {
-                case LLAMA_GRETYPE_END:            fprintf(file, "END");            break;
-                case LLAMA_GRETYPE_ALT:            fprintf(file, "ALT");            break;
-                case LLAMA_GRETYPE_RULE_REF:       fprintf(file, "RULE_REF");       break;
-                case LLAMA_GRETYPE_CHAR:           fprintf(file, "CHAR");           break;
-                case LLAMA_GRETYPE_CHAR_NOT:       fprintf(file, "CHAR_NOT");       break;
-                case LLAMA_GRETYPE_CHAR_RNG_UPPER: fprintf(file, "CHAR_RNG_UPPER"); break;
-                case LLAMA_GRETYPE_CHAR_ALT:       fprintf(file, "CHAR_ALT");       break;
-            }
-            switch (elem.type) {
-                case LLAMA_GRETYPE_END:
-                case LLAMA_GRETYPE_ALT:
-                case LLAMA_GRETYPE_RULE_REF:
-                    fprintf(file, "(%u) ", elem.value);
-                    break;
-                case LLAMA_GRETYPE_CHAR:
-                case LLAMA_GRETYPE_CHAR_NOT:
-                case LLAMA_GRETYPE_CHAR_RNG_UPPER:
-                case LLAMA_GRETYPE_CHAR_ALT:
-                    fprintf(file, "(\"");
-                    print_grammar_char(file, elem.value);
-                    fprintf(file, "\") ");
-                    break;
-            }
-        }
-        fprintf(file, "\n");
-    }
-
-    static void print_rule(
-            FILE     * file,
-            uint32_t   rule_id,
-            const std::vector<llama_grammar_element> & rule,
-            const std::map<uint32_t, std::string>    & symbol_id_names) {
-        if (rule.empty() || rule.back().type != LLAMA_GRETYPE_END) {
-            throw std::runtime_error(
-                "malformed rule, does not end with LLAMA_GRETYPE_END: " + std::to_string(rule_id));
-        }
-        fprintf(file, "%s ::= ", symbol_id_names.at(rule_id).c_str());
-        for (size_t i = 0, end = rule.size() - 1; i < end; i++) {
-            llama_grammar_element elem = rule[i];
-            switch (elem.type) {
-                case LLAMA_GRETYPE_END:
-                    throw std::runtime_error(
-                        "unexpected end of rule: " + std::to_string(rule_id) + "," +
-                        std::to_string(i));
-                case LLAMA_GRETYPE_ALT:
-                    fprintf(file, "| ");
-                    break;
-                case LLAMA_GRETYPE_RULE_REF:
-                    fprintf(file, "%s ", symbol_id_names.at(elem.value).c_str());
-                    break;
-                case LLAMA_GRETYPE_CHAR:
-                    fprintf(file, "[");
-                    print_grammar_char(file, elem.value);
-                    break;
-                case LLAMA_GRETYPE_CHAR_NOT:
-                    fprintf(file, "[^");
-                    print_grammar_char(file, elem.value);
-                    break;
-                case LLAMA_GRETYPE_CHAR_RNG_UPPER:
-                    if (i == 0 || !is_char_element(rule[i - 1])) {
-                        throw std::runtime_error(
-                            "LLAMA_GRETYPE_CHAR_RNG_UPPER without preceding char: " +
-                            std::to_string(rule_id) + "," + std::to_string(i));
-                    }
-                    fprintf(file, "-");
-                    print_grammar_char(file, elem.value);
-                    break;
-                case LLAMA_GRETYPE_CHAR_ALT:
-                    if (i == 0 || !is_char_element(rule[i - 1])) {
-                        throw std::runtime_error(
-                            "LLAMA_GRETYPE_CHAR_ALT without preceding char: " +
-                            std::to_string(rule_id) + "," + std::to_string(i));
-                    }
-                    print_grammar_char(file, elem.value);
-                    break;
-            }
-            if (is_char_element(elem)) {
-                switch (rule[i + 1].type) {
-                    case LLAMA_GRETYPE_CHAR_ALT:
-                    case LLAMA_GRETYPE_CHAR_RNG_UPPER:
-                        break;
-                    default:
-                        fprintf(file, "] ");
-                }
-            }
-        }
-        fprintf(file, "\n");
-    }
-
-    void print_grammar(FILE * file, const parse_state & state) {
-        try {
-            std::map<uint32_t, std::string> symbol_id_names;
-            for (const auto & kv : state.symbol_ids) {
-                symbol_id_names[kv.second] = kv.first;
-            }
-            for (size_t i = 0, end = state.rules.size(); i < end; i++) {
-                // fprintf(file, "%zu: ", i);
-                // print_rule_binary(file, state.rules[i]);
-                print_rule(file, uint32_t(i), state.rules[i], symbol_id_names);
-                // fprintf(file, "\n");
-            }
-        } catch (const std::exception & err) {
-            fprintf(stderr, "\n%s: error printing grammar: %s\n", __func__, err.what());
-        }
-    }
-
-    std::vector<const llama_grammar_element *> parse_state::c_rules() {
-        std::vector<const llama_grammar_element *> ret;
-        ret.reserve(rules.size());
-        for (const auto & rule : rules) {
-            ret.push_back(rule.data());
-        }
-        return ret;
-    }
-}
--- a/common/grammar-parser.h
+++ b/common/grammar-parser.h
@@ -1,29 +0,0 @@
-// Implements a parser for an extended Backus-Naur form (BNF), producing the
-// binary context-free grammar format specified by llama.h. Supports character
-// ranges, grouping, and repetition operators. As an example, a grammar for
-// arithmetic might look like:
-//
-// root  ::= expr
-// expr  ::= term ([-+*/] term)*
-// term  ::= num | "(" space expr ")" space
-// num   ::= [0-9]+ space
-// space ::= [ \t\n]*
-
-#pragma once
-#include "llama.h"
-#include <vector>
-#include <map>
-#include <cstdint>
-#include <string>
-
-namespace grammar_parser {
-    struct parse_state {
-        std::map<std::string, uint32_t>                 symbol_ids;
-        std::vector<std::vector<llama_grammar_element>> rules;
-
-        std::vector<const llama_grammar_element *> c_rules();
-    };
-
-    parse_state parse(const char * src);
-    void print_grammar(FILE * file, const parse_state & state);
-}
--- a/common/log.h
+++ b/common/log.h
@@ -1,723 +0,0 @@
-#pragma once
-
-#include <chrono>
-#include <cstring>
-#include <sstream>
-#include <iostream>
-#include <thread>
-#include <vector>
-#include <algorithm>
-#include <cinttypes>
-
-// --------------------------------
-//
-// Basic usage:
-//
-// --------
-//
-//  The LOG() and LOG_TEE() macros are ready to go by default
-//   they do not require any initialization.
-//
-//  LOGLN() and LOG_TEELN() are variants which automatically
-//   include \n character at the end of the log string.
-//
-//  LOG() behaves exactly like printf, by default writing to a logfile.
-//  LOG_TEE() additionally, prints to the screen too ( mimics Unix tee command ).
-//
-//  Default logfile is named
-//   "llama.<threadID>.log"
-//  Default LOG_TEE() secondary output target is
-//   stderr
-//
-//  Logs can be dynamically disabled or enabled using functions:
-//   log_disable()
-//  and
-//   log_enable()
-//
-//  A log target can be changed with:
-//   log_set_target( string )
-//    creating and opening, or re-opening a file by string filename
-//  or
-//   log_set_target( FILE* )
-//    allowing to point at stderr, stdout, or any valid FILE* file handler.
-//
-// --------
-//
-// End of Basic usage.
-//
-// --------------------------------
-
-// Specifies a log target.
-//  default uses log_handler() with "llama.log" log file
-//  this can be changed, by defining LOG_TARGET
-//  like so:
-//
-//  #define LOG_TARGET (a valid FILE*)
-//  #include "log.h"
-//
-//  or it can be simply redirected to stdout or stderr
-//  like so:
-//
-//  #define LOG_TARGET stderr
-//  #include "log.h"
-//
-//  The log target can also be redirected to a different function
-//  like so:
-//
-//  #define LOG_TARGET log_handler_different()
-//  #include "log.h"
-//
-//  FILE* log_handler_different()
-//  {
-//      return stderr;
-//  }
-//
-//  or:
-//
-//  #define LOG_TARGET log_handler_another_one("somelog.log")
-//  #include "log.h"
-//
-//  FILE* log_handler_another_one(char*filename)
-//  {
-//      static FILE* logfile = nullptr;
-//      (...)
-//      if( !logfile )
-//      {
-//          fopen(...)
-//      }
-//      (...)
-//      return logfile
-//  }
-//
-#ifndef LOG_TARGET
-    #define LOG_TARGET log_handler()
-#endif
-
-#ifndef LOG_TEE_TARGET
-    #define LOG_TEE_TARGET stderr
-#endif
-
-// Utility for synchronizing log configuration state
-//  since std::optional was introduced only in c++17
-enum LogTriState
-{
-    LogTriStateSame,
-    LogTriStateFalse,
-    LogTriStateTrue
-};
-
-// Utility to obtain "pid" like unique process id and use it when creating log files.
-inline std::string log_get_pid()
-{
-   static std::string pid;
-   if (pid.empty())
-   {
-       // std::this_thread::get_id() is the most portable way of obtaining a "process id"
-       //  it's not the same as "pid" but is unique enough to solve multiple instances
-       //  trying to write to the same log.
-       std::stringstream ss;
-       ss << std::this_thread::get_id();
-       pid = ss.str();
-   }
-
-   return pid;
-}
-
-// Utility function for generating log file names with unique id based on thread id.
-//  invocation with log_filename_generator( "llama", "log" ) creates a string "llama.<number>.log"
-//  where the number is a runtime id of the current thread.
-
-#define log_filename_generator(log_file_basename, log_file_extension) log_filename_generator_impl(LogTriStateSame, log_file_basename, log_file_extension)
-
-// INTERNAL, DO NOT USE
-inline std::string log_filename_generator_impl(LogTriState multilog, const std::string & log_file_basename, const std::string & log_file_extension)
-{
-    static bool _multilog = false;
-
-    if (multilog != LogTriStateSame)
-    {
-        _multilog = multilog == LogTriStateTrue;
-    }
-
-    std::stringstream buf;
-
-    buf << log_file_basename;
-    if (_multilog)
-    {
-        buf << ".";
-        buf << log_get_pid();
-    }
-    buf << ".";
-    buf << log_file_extension;
-
-    return buf.str();
-}
-
-#ifndef LOG_DEFAULT_FILE_NAME
-    #define LOG_DEFAULT_FILE_NAME log_filename_generator("llama", "log")
-#endif
-
-// Utility for turning #define values into string literals
-//  so we can have a define for stderr and
-//  we can print "stderr" instead of literal stderr, etc.
-#define LOG_STRINGIZE1(s) #s
-#define LOG_STRINGIZE(s) LOG_STRINGIZE1(s)
-
-#define LOG_TEE_TARGET_STRING LOG_STRINGIZE(LOG_TEE_TARGET)
-
-// Allows disabling timestamps.
-//  in order to disable, define LOG_NO_TIMESTAMPS
-//  like so:
-//
-//  #define LOG_NO_TIMESTAMPS
-//  #include "log.h"
-//
-#ifndef LOG_NO_TIMESTAMPS
-    #ifndef _MSC_VER
-        #define LOG_TIMESTAMP_FMT "[%" PRIu64 "] "
-        #define LOG_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
-    #else
-        #define LOG_TIMESTAMP_FMT "[%" PRIu64 "] "
-        #define LOG_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
-    #endif
-#else
-    #define LOG_TIMESTAMP_FMT "%s"
-    #define LOG_TIMESTAMP_VAL ,""
-#endif
-
-#ifdef LOG_TEE_TIMESTAMPS
-    #ifndef _MSC_VER
-        #define LOG_TEE_TIMESTAMP_FMT "[%" PRIu64 "] "
-        #define LOG_TEE_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
-    #else
-        #define LOG_TEE_TIMESTAMP_FMT "[%" PRIu64 "] "
-        #define LOG_TEE_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
-    #endif
-#else
-    #define LOG_TEE_TIMESTAMP_FMT "%s"
-    #define LOG_TEE_TIMESTAMP_VAL ,""
-#endif
-
-// Allows disabling file/line/function prefix
-//  in order to disable, define LOG_NO_FILE_LINE_FUNCTION
-//  like so:
-//
-//  #define LOG_NO_FILE_LINE_FUNCTION
-//  #include "log.h"
-//
-#ifndef LOG_NO_FILE_LINE_FUNCTION
-    #ifndef _MSC_VER
-        #define LOG_FLF_FMT "[%24s:%5d][%24s] "
-        #define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
-    #else
-        #define LOG_FLF_FMT "[%24s:%5ld][%24s] "
-        #define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
-    #endif
-#else
-    #define LOG_FLF_FMT "%s"
-    #define LOG_FLF_VAL ,""
-#endif
-
-#ifdef LOG_TEE_FILE_LINE_FUNCTION
-    #ifndef _MSC_VER
-        #define LOG_TEE_FLF_FMT "[%24s:%5d][%24s] "
-        #define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
-    #else
-        #define LOG_TEE_FLF_FMT "[%24s:%5ld][%24s] "
-        #define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
-    #endif
-#else
-    #define LOG_TEE_FLF_FMT "%s"
-    #define LOG_TEE_FLF_VAL ,""
-#endif
-
-// INTERNAL, DO NOT USE
-//  USE LOG() INSTEAD
-//
-#ifndef _MSC_VER
-    #define LOG_IMPL(str, ...)                                                                                      \
-    do {                                                                                                            \
-        if (LOG_TARGET != nullptr)                                                                                  \
-        {                                                                                                           \
-            fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL, __VA_ARGS__); \
-            fflush(LOG_TARGET);                                                                                     \
-        }                                                                                                           \
-    } while (0)
-#else
-    #define LOG_IMPL(str, ...)                                                                                           \
-    do {                                                                                                                 \
-        if (LOG_TARGET != nullptr)                                                                                       \
-        {                                                                                                                \
-            fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL "", ##__VA_ARGS__); \
-            fflush(LOG_TARGET);                                                                                          \
-        }                                                                                                                \
-    } while (0)
-#endif
-
-// INTERNAL, DO NOT USE
-//  USE LOG_TEE() INSTEAD
-//
-#ifndef _MSC_VER
-    #define LOG_TEE_IMPL(str, ...)                                                                                                      \
-    do {                                                                                                                                \
-        if (LOG_TARGET != nullptr)                                                                                                      \
-        {                                                                                                                               \
-            fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL, __VA_ARGS__);                     \
-            fflush(LOG_TARGET);                                                                                                         \
-        }                                                                                                                               \
-        if (LOG_TARGET != nullptr && LOG_TARGET != stdout && LOG_TARGET != stderr && LOG_TEE_TARGET != nullptr)                         \
-        {                                                                                                                               \
-            fprintf(LOG_TEE_TARGET, LOG_TEE_TIMESTAMP_FMT LOG_TEE_FLF_FMT str "%s" LOG_TEE_TIMESTAMP_VAL LOG_TEE_FLF_VAL, __VA_ARGS__); \
-            fflush(LOG_TEE_TARGET);                                                                                                     \
-        }                                                                                                                               \
-    } while (0)
-#else
-    #define LOG_TEE_IMPL(str, ...)                                                                                                           \
-    do {                                                                                                                                     \
-        if (LOG_TARGET != nullptr)                                                                                                           \
-        {                                                                                                                                    \
-            fprintf(LOG_TARGET, LOG_TIMESTAMP_FMT LOG_FLF_FMT str "%s" LOG_TIMESTAMP_VAL LOG_FLF_VAL "", ##__VA_ARGS__);                     \
-            fflush(LOG_TARGET);                                                                                                              \
-        }                                                                                                                                    \
-        if (LOG_TARGET != nullptr && LOG_TARGET != stdout && LOG_TARGET != stderr && LOG_TEE_TARGET != nullptr)                              \
-        {                                                                                                                                    \
-            fprintf(LOG_TEE_TARGET, LOG_TEE_TIMESTAMP_FMT LOG_TEE_FLF_FMT str "%s" LOG_TEE_TIMESTAMP_VAL LOG_TEE_FLF_VAL "", ##__VA_ARGS__); \
-            fflush(LOG_TEE_TARGET);                                                                                                          \
-        }                                                                                                                                    \
-    } while (0)
-#endif
-
-// The '\0' as a last argument, is a trick to bypass the silly
-//  "warning: ISO C++11 requires at least one argument for the "..." in a variadic macro"
-//  so we can have a single macro which can be called just like printf.
-
-// Main LOG macro.
-//  behaves like printf, and supports arguments the exact same way.
-//
-#ifndef _MSC_VER
-    #define LOG(...) LOG_IMPL(__VA_ARGS__, "")
-#else
-    #define LOG(str, ...) LOG_IMPL("%s" str, "", __VA_ARGS__, "")
-#endif
-
-// Main TEE macro.
-//  does the same as LOG
-//  and
-//  simultaneously writes stderr.
-//
-// Secondary target can be changed just like LOG_TARGET
-//  by defining LOG_TEE_TARGET
-//
-#ifndef _MSC_VER
-    #define LOG_TEE(...) LOG_TEE_IMPL(__VA_ARGS__, "")
-#else
-    #define LOG_TEE(str, ...) LOG_TEE_IMPL("%s" str, "", __VA_ARGS__, "")
-#endif
-
-// LOG macro variants with auto endline.
-#ifndef _MSC_VER
-    #define LOGLN(...) LOG_IMPL(__VA_ARGS__, "\n")
-    #define LOG_TEELN(...) LOG_TEE_IMPL(__VA_ARGS__, "\n")
-#else
-    #define LOGLN(str, ...) LOG_IMPL("%s" str, "", __VA_ARGS__, "\n")
-    #define LOG_TEELN(str, ...) LOG_TEE_IMPL("%s" str, "", __VA_ARGS__, "\n")
-#endif
-
-// INTERNAL, DO NOT USE
-inline FILE *log_handler1_impl(bool change = false, LogTriState append = LogTriStateSame, LogTriState disable = LogTriStateSame, const std::string & filename = LOG_DEFAULT_FILE_NAME, FILE *target = nullptr)
-{
-    static bool _initialized = false;
-    static bool _append = false;
-    static bool _disabled = filename.empty() && target == nullptr;
-    static std::string log_current_filename{filename};
-    static FILE *log_current_target{target};
-    static FILE *logfile = nullptr;
-
-    if (change)
-    {
-        if (append != LogTriStateSame)
-        {
-            _append = append == LogTriStateTrue;
-            return logfile;
-        }
-
-        if (disable == LogTriStateTrue)
-        {
-            // Disable primary target
-            _disabled = true;
-        }
-        // If previously disabled, only enable, and keep previous target
-        else if (disable == LogTriStateFalse)
-        {
-            _disabled = false;
-        }
-        // Otherwise, process the arguments
-        else if (log_current_filename != filename || log_current_target != target)
-        {
-            _initialized = false;
-        }
-    }
-
-    if (_disabled)
-    {
-        // Log is disabled
-        return nullptr;
-    }
-
-    if (_initialized)
-    {
-        // with fallback in case something went wrong
-        return logfile ? logfile : stderr;
-    }
-
-    // do the (re)initialization
-    if (target != nullptr)
-    {
-        if (logfile != nullptr && logfile != stdout && logfile != stderr)
-        {
-            fclose(logfile);
-        }
-
-        log_current_filename = LOG_DEFAULT_FILE_NAME;
-        log_current_target = target;
-
-        logfile = target;
-    }
-    else
-    {
-        if (log_current_filename != filename)
-        {
-            if (logfile != nullptr && logfile != stdout && logfile != stderr)
-            {
-                fclose(logfile);
-            }
-        }
-
-        logfile = fopen(filename.c_str(), _append ? "a" : "w");
-    }
-
-    if (!logfile)
-    {
-        //  Verify whether the file was opened, otherwise fallback to stderr
-        logfile = stderr;
-
-        fprintf(stderr, "Failed to open logfile '%s' with error '%s'\n", filename.c_str(), std::strerror(errno));
-        fflush(stderr);
-
-        // At this point we let the init flag be to true below, and let the target fallback to stderr
-        //  otherwise we would repeatedly fopen() which was already unsuccessful
-    }
-
-    _initialized = true;
-
-    return logfile ? logfile : stderr;
-}
-
-// INTERNAL, DO NOT USE
-inline FILE *log_handler2_impl(bool change = false, LogTriState append = LogTriStateSame, LogTriState disable = LogTriStateSame, FILE *target = nullptr, const std::string & filename = LOG_DEFAULT_FILE_NAME)
-{
-    return log_handler1_impl(change, append, disable, filename, target);
-}
-
-// Disables logs entirely at runtime.
-//  Makes LOG() and LOG_TEE() produce no output,
-//  until enabled back.
-#define log_disable() log_disable_impl()
-
-// INTERNAL, DO NOT USE
-inline FILE *log_disable_impl()
-{
-    return log_handler1_impl(true, LogTriStateSame, LogTriStateTrue);
-}
-
-// Enables logs at runtime.
-#define log_enable() log_enable_impl()
-
-// INTERNAL, DO NOT USE
-inline FILE *log_enable_impl()
-{
-    return log_handler1_impl(true, LogTriStateSame, LogTriStateFalse);
-}
-
-// Sets target fir logs, either by a file name or FILE* pointer (stdout, stderr, or any valid FILE*)
-#define log_set_target(target) log_set_target_impl(target)
-
-// INTERNAL, DO NOT USE
-inline FILE *log_set_target_impl(const std::string & filename) { return log_handler1_impl(true, LogTriStateSame, LogTriStateSame, filename); }
-inline FILE *log_set_target_impl(FILE *target) { return log_handler2_impl(true, LogTriStateSame, LogTriStateSame, target); }
-
-// INTERNAL, DO NOT USE
-inline FILE *log_handler() { return log_handler1_impl(); }
-
-// Enable or disable creating separate log files for each run.
-//  can ONLY be invoked BEFORE first log use.
-#define log_multilog(enable) log_filename_generator_impl((enable) ? LogTriStateTrue : LogTriStateFalse, "", "")
-// Enable or disable append mode for log file.
-//  can ONLY be invoked BEFORE first log use.
-#define log_append(enable) log_append_impl(enable)
-// INTERNAL, DO NOT USE
-inline FILE *log_append_impl(bool enable)
-{
-    return log_handler1_impl(true, enable ? LogTriStateTrue : LogTriStateFalse, LogTriStateSame);
-}
-
-inline void log_test()
-{
-    log_disable();
-    LOG("01 Hello World to nobody, because logs are disabled!\n");
-    log_enable();
-    LOG("02 Hello World to default output, which is \"%s\" ( Yaaay, arguments! )!\n", LOG_STRINGIZE(LOG_TARGET));
-    LOG_TEE("03 Hello World to **both** default output and " LOG_TEE_TARGET_STRING "!\n");
-    log_set_target(stderr);
-    LOG("04 Hello World to stderr!\n");
-    LOG_TEE("05 Hello World TEE with double printing to stderr prevented!\n");
-    log_set_target(LOG_DEFAULT_FILE_NAME);
-    LOG("06 Hello World to default log file!\n");
-    log_set_target(stdout);
-    LOG("07 Hello World to stdout!\n");
-    log_set_target(LOG_DEFAULT_FILE_NAME);
-    LOG("08 Hello World to default log file again!\n");
-    log_disable();
-    LOG("09 Hello World _1_ into the void!\n");
-    log_enable();
-    LOG("10 Hello World back from the void ( you should not see _1_ in the log or the output )!\n");
-    log_disable();
-    log_set_target("llama.anotherlog.log");
-    LOG("11 Hello World _2_ to nobody, new target was selected but logs are still disabled!\n");
-    log_enable();
-    LOG("12 Hello World this time in a new file ( you should not see _2_ in the log or the output )?\n");
-    log_set_target("llama.yetanotherlog.log");
-    LOG("13 Hello World this time in yet new file?\n");
-    log_set_target(log_filename_generator("llama_autonamed", "log"));
-    LOG("14 Hello World in log with generated filename!\n");
-#ifdef _MSC_VER
-    LOG_TEE("15 Hello msvc TEE without arguments\n");
-    LOG_TEE("16 Hello msvc TEE with (%d)(%s) arguments\n", 1, "test");
-    LOG_TEELN("17 Hello msvc TEELN without arguments\n");
-    LOG_TEELN("18 Hello msvc TEELN with (%d)(%s) arguments\n", 1, "test");
-    LOG("19 Hello msvc LOG without arguments\n");
-    LOG("20 Hello msvc LOG with (%d)(%s) arguments\n", 1, "test");
-    LOGLN("21 Hello msvc LOGLN without arguments\n");
-    LOGLN("22 Hello msvc LOGLN with (%d)(%s) arguments\n", 1, "test");
-#endif
-}
-
-inline bool log_param_single_parse(const std::string & param)
-{
-    if ( param == "--log-test")
-    {
-        log_test();
-        return true;
-    }
-
-    if ( param == "--log-disable")
-    {
-        log_disable();
-        return true;
-    }
-
-    if ( param == "--log-enable")
-    {
-        log_enable();
-        return true;
-    }
-
-    if (param == "--log-new")
-    {
-        log_multilog(true);
-        return true;
-    }
-
-    if (param == "--log-append")
-    {
-        log_append(true);
-        return true;
-    }
-
-    return false;
-}
-
-inline bool log_param_pair_parse(bool check_but_dont_parse, const std::string & param, const std::string & next = std::string())
-{
-    if ( param == "--log-file")
-    {
-        if (!check_but_dont_parse)
-        {
-            log_set_target(log_filename_generator(next.empty() ? "unnamed" : next, "log"));
-        }
-
-        return true;
-    }
-
-    return false;
-}
-
-inline void log_print_usage()
-{
-    printf("log options:\n");
-    /* format
-    printf("  -h, --help            show this help message and exit\n");*/
-    /* spacing
-    printf("__-param----------------Description\n");*/
-    printf("  --log-test            Run simple logging test\n");
-    printf("  --log-disable         Disable trace logs\n");
-    printf("  --log-enable          Enable trace logs\n");
-    printf("  --log-file            Specify a log filename (without extension)\n");
-    printf("  --log-new             Create a separate new log file on start. "
-                                   "Each log file will have unique name: \"<name>.<ID>.log\"\n");
-    printf("  --log-append          Don't truncate the old log file.\n");
-}
-
-#define log_dump_cmdline(argc, argv) log_dump_cmdline_impl(argc, argv)
-
-// INTERNAL, DO NOT USE
-inline void log_dump_cmdline_impl(int argc, char **argv)
-{
-    std::stringstream buf;
-    for (int i = 0; i < argc; ++i)
-    {
-        if (std::string(argv[i]).find(' ') != std::string::npos)
-        {
-            buf << " \"" << argv[i] <<"\"";
-        }
-        else
-        {
-            buf << " " << argv[i];
-        }
-    }
-    LOGLN("Cmd:%s", buf.str().c_str());
-}
-
-#define log_tostr(var) log_var_to_string_impl(var).c_str()
-
-inline std::string log_var_to_string_impl(bool var)
-{
-    return var ? "true" : "false";
-}
-
-inline std::string log_var_to_string_impl(std::string var)
-{
-    return var;
-}
-
-inline std::string log_var_to_string_impl(const std::vector<int> & var)
-{
-    std::stringstream buf;
-    buf << "[ ";
-    bool first = true;
-    for (auto e : var)
-    {
-        if (first)
-        {
-            first = false;
-        }
-        else
-        {
-            buf << ", ";
-        }
-        buf << std::to_string(e);
-    }
-    buf << " ]";
-
-    return buf.str();
-}
-
-template <typename C, typename T>
-inline std::string LOG_TOKENS_TOSTR_PRETTY(const C & ctx, const T & tokens)
-{
-    std::stringstream buf;
-    buf << "[ ";
-
-    bool first = true;
-    for (const auto &token : tokens)
-    {
-        if (!first) {
-            buf << ", ";
-        } else {
-            first = false;
-        }
-
-        auto detokenized = llama_token_to_piece(ctx, token);
-
-        detokenized.erase(
-            std::remove_if(
-                detokenized.begin(),
-                detokenized.end(),
-                [](const unsigned char c) { return !std::isprint(c); }),
-            detokenized.end());
-
-        buf
-            << "'" << detokenized << "'"
-            << ":" << std::to_string(token);
-    }
-    buf << " ]";
-
-    return buf.str();
-}
-
-template <typename C, typename B>
-inline std::string LOG_BATCH_TOSTR_PRETTY(const C & ctx, const B & batch)
-{
-    std::stringstream buf;
-    buf << "[ ";
-
-    bool first = true;
-    for (int i = 0; i < batch.n_tokens; ++i)
-    {
-        if (!first) {
-            buf << ", ";
-        } else {
-            first = false;
-        }
-
-        auto detokenized = llama_token_to_piece(ctx, batch.token[i]);
-
-        detokenized.erase(
-            std::remove_if(
-                detokenized.begin(),
-                detokenized.end(),
-                [](const unsigned char c) { return !std::isprint(c); }),
-            detokenized.end());
-
-        buf
-            << "\n" << std::to_string(i)
-            << ":token '" << detokenized << "'"
-            << ":pos " << std::to_string(batch.pos[i])
-            << ":n_seq_id  " << std::to_string(batch.n_seq_id[i])
-            << ":seq_id " << std::to_string(batch.seq_id[i][0])
-            << ":logits " << std::to_string(batch.logits[i]);
-    }
-    buf << " ]";
-
-    return buf.str();
-}
-
-#ifdef LOG_DISABLE_LOGS
-
-#undef LOG
-#define LOG(...) // dummy stub
-#undef LOGLN
-#define LOGLN(...) // dummy stub
-
-#undef LOG_TEE
-#define LOG_TEE(...) fprintf(stderr, __VA_ARGS__) // convert to normal fprintf
-
-#undef LOG_TEELN
-#define LOG_TEELN(...) fprintf(stderr, __VA_ARGS__) // convert to normal fprintf
-
-#undef LOG_DISABLE
-#define LOG_DISABLE() // dummy stub
-
-#undef LOG_ENABLE
-#define LOG_ENABLE() // dummy stub
-
-#undef LOG_ENABLE
-#define LOG_ENABLE() // dummy stub
-
-#undef LOG_SET_TARGET
-#define LOG_SET_TARGET(...) // dummy stub
-
-#undef LOG_DUMP_CMDLINE
-#define LOG_DUMP_CMDLINE(...) // dummy stub
-
-#endif // LOG_DISABLE_LOGS
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -1,324 +0,0 @@
-#include "sampling.h"
-
-struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params) {
-    struct llama_sampling_context * result = new llama_sampling_context();
-
-    result->params  = params;
-    result->grammar = nullptr;
-
-    // if there is a grammar, parse it
-    if (!params.grammar.empty()) {
-        result->parsed_grammar = grammar_parser::parse(params.grammar.c_str());
-
-        // will be empty (default) if there are parse errors
-        if (result->parsed_grammar.rules.empty()) {
-            fprintf(stderr, "%s: failed to parse grammar\n", __func__);
-            return nullptr;
-        }
-
-        std::vector<const llama_grammar_element *> grammar_rules(result->parsed_grammar.c_rules());
-
-        result->grammar = llama_grammar_init(
-                grammar_rules.data(),
-                grammar_rules.size(), result->parsed_grammar.symbol_ids.at("root"));
-    }
-
-    result->prev.resize(params.n_prev);
-
-    return result;
-}
-
-void llama_sampling_free(struct llama_sampling_context * ctx) {
-    if (ctx->grammar != NULL) {
-        llama_grammar_free(ctx->grammar);
-    }
-
-    delete ctx;
-}
-
-void llama_sampling_reset(llama_sampling_context * ctx) {
-    if (ctx->grammar != NULL) {
-        llama_grammar_free(ctx->grammar);
-        ctx->grammar = NULL;
-    }
-
-    if (!ctx->parsed_grammar.rules.empty()) {
-        std::vector<const llama_grammar_element *> grammar_rules(ctx->parsed_grammar.c_rules());
-
-        ctx->grammar = llama_grammar_init(
-                grammar_rules.data(),
-                grammar_rules.size(), ctx->parsed_grammar.symbol_ids.at("root"));
-    }
-
-    std::fill(ctx->prev.begin(), ctx->prev.end(), 0);
-    ctx->cur.clear();
-}
-
-void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst) {
-    if (dst->grammar) {
-        llama_grammar_free(dst->grammar);
-        dst->grammar = nullptr;
-    }
-
-    if (src->grammar) {
-        dst->grammar = llama_grammar_copy(src->grammar);
-    }
-
-    dst->prev = src->prev;
-}
-
-llama_token llama_sampling_last(llama_sampling_context * ctx) {
-    return ctx->prev.back();
-}
-
-std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama_context * ctx_main, int n) {
-    const int size = ctx_sampling->prev.size();
-
-    n = std::min(n, size);
-
-    std::string result;
-
-    for (int i = size - n; i < size; i++) {
-        result += llama_token_to_piece(ctx_main, ctx_sampling->prev[i]);
-    }
-
-    return result;
-}
-
-std::string llama_sampling_print(const llama_sampling_params & params) {
-    char result[1024];
-
-    snprintf(result, sizeof(result),
-            "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
-            "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, typical_p = %.3f, temp = %.3f\n"
-            "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
-            params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present,
-            params.top_k, params.tfs_z, params.top_p, params.min_p, params.typical_p, params.temp,
-            params.mirostat, params.mirostat_eta, params.mirostat_tau);
-
-    return std::string(result);
-}
-
-std::string llama_sampling_order_print(const llama_sampling_params & params) {
-    std::string result = "CFG -> Penalties ";
-    if (params.mirostat == 0) {
-        for (auto s : params.samplers_sequence) {
-            switch (s) {
-                case 'k': result += "-> top_k "; break;
-                case 'f': result += "-> tfs_z "; break;
-                case 'y': result += "-> typical_p "; break;
-                case 'p': result += "-> top_p "; break;
-                case 'm': result += "-> min_p "; break;
-                case 't': result += "-> temp "; break;
-                default : break;
-            }
-        }
-    } else {
-        result += "-> mirostat ";
-    }
-
-    return result;
-}
-
-// no reasons to expose this function in header
-static void sampler_queue(
-                   struct llama_context * ctx_main,
-            const llama_sampling_params & params,
-                 llama_token_data_array & cur_p,
-                                 size_t & min_keep) {
-    const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
-
-    const float         temp              = params.temp;
-    const float         dynatemp_range    = params.dynatemp_range;
-    const float         dynatemp_exponent = params.dynatemp_exponent;
-    const int32_t       top_k             = params.top_k <= 0 ? n_vocab : params.top_k;
-    const float         top_p             = params.top_p;
-    const float         min_p             = params.min_p;
-    const float         tfs_z             = params.tfs_z;
-    const float         typical_p         = params.typical_p;
-    const std::string & samplers_sequence = params.samplers_sequence;
-
-    for (auto s : samplers_sequence) {
-        switch (s){
-            case 'k': llama_sample_top_k    (ctx_main, &cur_p, top_k,     min_keep); break;
-            case 'f': llama_sample_tail_free(ctx_main, &cur_p, tfs_z,     min_keep); break;
-            case 'y': llama_sample_typical  (ctx_main, &cur_p, typical_p, min_keep); break;
-            case 'p': llama_sample_top_p    (ctx_main, &cur_p, top_p,     min_keep); break;
-            case 'm': llama_sample_min_p    (ctx_main, &cur_p, min_p,     min_keep); break;
-            case 't':
-                if (dynatemp_range > 0) {
-                    float dynatemp_min = std::max(0.0f, temp - dynatemp_range);
-                    float dynatemp_max = std::max(0.0f, temp + dynatemp_range);
-                    llama_sample_entropy(ctx_main, &cur_p, dynatemp_min, dynatemp_max, dynatemp_exponent);
-                } else {
-                    llama_sample_temp(ctx_main, &cur_p, temp);
-                }
-                break;
-            default : break;
-        }
-    }
-}
-
-static llama_token llama_sampling_sample_impl(
-                  struct llama_sampling_context * ctx_sampling,
-                  struct llama_context * ctx_main,
-                  struct llama_context * ctx_cfg,
-                  const int idx,
-                  bool is_resampling) {  // Add a parameter to indicate if we are resampling
-    const llama_sampling_params & params = ctx_sampling->params;
-
-    const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
-
-    const float   temp            = params.temp;
-    const int32_t penalty_last_n  = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n;
-    const float   penalty_repeat  = params.penalty_repeat;
-    const float   penalty_freq    = params.penalty_freq;
-    const float   penalty_present = params.penalty_present;
-    const int     mirostat        = params.mirostat;
-    const float   mirostat_tau    = params.mirostat_tau;
-    const float   mirostat_eta    = params.mirostat_eta;
-    const bool    penalize_nl     = params.penalize_nl;
-
-    auto & prev = ctx_sampling->prev;
-    auto & cur  = ctx_sampling->cur;
-
-    llama_token id = 0;
-
-    // Get a pointer to the logits
-    float * logits = llama_get_logits_ith(ctx_main, idx);
-
-    // Declare original_logits at the beginning of the function scope
-    std::vector<float> original_logits;
-
-    if (!is_resampling) {
-        // Only make a copy of the original logits if we are not in the resampling phase, not sure if I actually have to do this.
-        original_logits = std::vector<float>(logits, logits + llama_n_vocab(llama_get_model(ctx_main)));
-    }
-
-    // apply params.logit_bias map
-    for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
-        logits[it->first] += it->second;
-    }
-
-    if (ctx_cfg) {
-        float * logits_guidance = llama_get_logits_ith(ctx_cfg, idx);
-        llama_sample_apply_guidance(ctx_main, logits, logits_guidance, params.cfg_scale);
-    }
-
-    cur.clear();
-
-    for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-        cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
-    }
-
-    llama_token_data_array cur_p = { cur.data(), cur.size(), false };
-
-    // apply penalties
-    const auto& penalty_tokens = params.use_penalty_prompt_tokens ? params.penalty_prompt_tokens : prev;
-    const int penalty_tokens_used_size = std::min((int)penalty_tokens.size(), penalty_last_n);
-    if (penalty_tokens_used_size) {
-        const float nl_logit = logits[llama_token_nl(llama_get_model(ctx_main))];
-
-        llama_sample_repetition_penalties(ctx_main, &cur_p,
-                penalty_tokens.data() + penalty_tokens.size() - penalty_tokens_used_size,
-                penalty_tokens_used_size, penalty_repeat, penalty_freq, penalty_present);
-
-        if (!penalize_nl) {
-            for (size_t idx = 0; idx < cur_p.size; idx++) {
-                if (cur_p.data[idx].id == llama_token_nl(llama_get_model(ctx_main))) {
-                    cur_p.data[idx].logit = nl_logit;
-                    break;
-                }
-            }
-        }
-    }
-
-    // If we are in the resampling phase, apply grammar checks before sampling logic
-    if (is_resampling && ctx_sampling->grammar != NULL) {
-        llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar);
-    }
-
-    if (temp < 0.0) {
-        // greedy sampling, with probs
-        llama_sample_softmax(ctx_main, &cur_p);
-        id = cur_p.data[0].id;
-    } else if (temp == 0.0) {
-        // greedy sampling, no probs
-        id = llama_sample_token_greedy(ctx_main, &cur_p);
-    } else {
-        if (mirostat == 1) {
-            const int mirostat_m = 100;
-            llama_sample_temp(ctx_main, &cur_p, temp);
-            id = llama_sample_token_mirostat(ctx_main, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &ctx_sampling->mirostat_mu);
-        } else if (mirostat == 2) {
-            llama_sample_temp(ctx_main, &cur_p, temp);
-            id = llama_sample_token_mirostat_v2(ctx_main, &cur_p, mirostat_tau, mirostat_eta, &ctx_sampling->mirostat_mu);
-        } else {
-            // temperature sampling
-            size_t min_keep = std::max(1, params.n_probs);
-
-            sampler_queue(ctx_main, params, cur_p, min_keep);
-
-            id = llama_sample_token(ctx_main, &cur_p);
-
-            //{
-            //    const int n_top = 10;
-            //    LOG("top %d candidates:\n", n_top);
-
-            //    for (int i = 0; i < n_top; i++) {
-            //        const llama_token id = cur_p.data[i].id;
-            //        (void)id; // To avoid a warning that id is unused when logging is disabled.
-            //        LOG(" - %5d: '%12s' (%.3f)\n", id, llama_token_to_piece(ctx_main, id).c_str(), cur_p.data[i].p);
-            //    }
-            //}
-
-            LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx_main, id).c_str());
-        }
-    }
-
-    if (ctx_sampling->grammar != NULL && !is_resampling) {
-        // Create an array with a single token data element for the sampled id
-        llama_token_data single_token_data = {id, logits[id], 0.0f};
-        llama_token_data_array single_token_data_array = { &single_token_data, 1, false };
-
-        // Apply grammar constraints to the single token
-        llama_sample_grammar(ctx_main, &single_token_data_array, ctx_sampling->grammar);
-
-        // Check if the token is valid according to the grammar by seeing if its logit has been set to -INFINITY
-        bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
-
-        // If the token is not valid according to the grammar, perform resampling
-        if (!is_valid) {
-            LOG("Resampling because token %d: '%s' does not meet grammar rules\n", id, llama_token_to_piece(ctx_main, id).c_str());
-
-            // Restore logits from the copy
-            std::copy(original_logits.begin(), original_logits.end(), logits);
-
-            return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, true);  // Pass true for is_resampling
-        }
-    }
-
-    return id;
-}
-
-llama_token llama_sampling_sample(
-                  struct llama_sampling_context * ctx_sampling,
-                  struct llama_context * ctx_main,
-                  struct llama_context * ctx_cfg,
-                  const int idx) {
-    // Call the implementation function with is_resampling set to false by default
-    return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, false);
-}
-
-void llama_sampling_accept(
-        struct llama_sampling_context * ctx_sampling,
-        struct llama_context * ctx_main,
-        llama_token id,
-        bool apply_grammar) {
-    ctx_sampling->prev.erase(ctx_sampling->prev.begin());
-    ctx_sampling->prev.push_back(id);
-
-    if (ctx_sampling->grammar != NULL && apply_grammar) {
-        llama_grammar_accept_token(ctx_main, ctx_sampling->grammar, id);
-    }
-}
--- a/common/sampling.h
+++ b/common/sampling.h
@@ -1,119 +0,0 @@
-#pragma once
-
-#include "llama.h"
-
-#include "grammar-parser.h"
-
-#include <string>
-#include <vector>
-#include <unordered_map>
-
-// sampling parameters
-typedef struct llama_sampling_params {
-    int32_t     n_prev                = 64;       // number of previous tokens to remember
-    int32_t     n_probs               = 0;        // if greater than 0, output the probabilities of top n_probs tokens.
-    int32_t     top_k                 = 40;       // <= 0 to use vocab size
-    float       top_p                 = 0.95f;    // 1.0 = disabled
-    float       min_p                 = 0.05f;    // 0.0 = disabled
-    float       tfs_z                 = 1.00f;    // 1.0 = disabled
-    float       typical_p             = 1.00f;    // 1.0 = disabled
-    float       temp                  = 0.80f;    // <= 0.0 to sample greedily, 0.0 to not output probabilities
-    float       dynatemp_range        = 0.00f;    // 0.0 = disabled
-    float       dynatemp_exponent     = 1.00f;    // controls how entropy maps to temperature in dynamic temperature sampler
-    int32_t     penalty_last_n        = 64;       // last n tokens to penalize (0 = disable penalty, -1 = context size)
-    float       penalty_repeat        = 1.10f;    // 1.0 = disabled
-    float       penalty_freq          = 0.00f;    // 0.0 = disabled
-    float       penalty_present       = 0.00f;    // 0.0 = disabled
-    int32_t     mirostat              = 0;        // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
-    float       mirostat_tau          = 5.00f;    // target entropy
-    float       mirostat_eta          = 0.10f;    // learning rate
-    bool        penalize_nl           = true;     // consider newlines as a repeatable token
-    std::string samplers_sequence     = "kfypmt"; // top_k, tail_free, typical_p, top_p, min_p, temp
-
-    std::string grammar;  // optional BNF-like grammar to constrain sampling
-
-    // Classifier-Free Guidance
-    // https://arxiv.org/abs/2306.17806
-    std::string cfg_negative_prompt; // string to help guidance
-    float       cfg_scale     = 1.f; // how strong is guidance
-
-    std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
-
-    std::vector<llama_token> penalty_prompt_tokens;
-    bool                     use_penalty_prompt_tokens = false;
-} llama_sampling_params;
-
-// general sampler context
-// TODO: move to llama.h
-struct llama_sampling_context {
-    // parameters that will be used for sampling
-    llama_sampling_params params;
-
-    // mirostat sampler state
-    float mirostat_mu;
-
-    llama_grammar * grammar;
-
-    // internal
-    grammar_parser::parse_state parsed_grammar;
-
-    // TODO: replace with ring-buffer
-    std::vector<llama_token>      prev;
-    std::vector<llama_token_data> cur;
-};
-
-#include "common.h"
-
-// Create a new sampling context instance.
-struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params);
-
-void llama_sampling_free(struct llama_sampling_context * ctx);
-
-// Reset the sampler context
-// - clear prev tokens
-// - reset grammar
-void llama_sampling_reset(llama_sampling_context * ctx);
-
-// Copy the sampler context
-void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst);
-
-// Get the last sampled token
-llama_token llama_sampling_last(llama_sampling_context * ctx);
-
-// Get a string representation of the last sampled tokens
-std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama_context * ctx_main, int n);
-
-// Print sampling parameters into a string
-std::string llama_sampling_print(const llama_sampling_params & params);
-
-// Print sampling order into a string
-std::string llama_sampling_order_print(const llama_sampling_params & params);
-
-// this is a common sampling function used across the examples for convenience
-// it can serve as a starting point for implementing your own sampling function
-// Note: When using multiple sequences, it is the caller's responsibility to call
-//       llama_sampling_reset when a sequence ends
-//
-// required:
-//  - ctx_main:     context to use for sampling
-//  - ctx_sampling: sampling-specific context
-//
-// optional:
-//  - ctx_cfg:      context to use for classifier-free guidance
-//  - idx:          sample from llama_get_logits_ith(ctx, idx)
-//
-// returns:
-//  - token:      sampled token
-//  - candidates: vector of candidate tokens
-//
-llama_token llama_sampling_sample(
-        struct llama_sampling_context * ctx_sampling,
-        struct llama_context * ctx_main,
-        struct llama_context * ctx_cfg,
-        int idx = 0);
-
-void llama_sampling_accept(
-        struct llama_sampling_context * ctx_sampling,
-        struct llama_context * ctx_main,
-        llama_token id,
-        bool apply_grammar);
--- a/common/stb_image.h
+++ b/common/stb_image.h
--- a/common/train.cpp
+++ b/common/train.cpp
--- a/common/train.h
+++ b/common/train.h
@@ -1,233 +0,0 @@
-// Various helper functions and utilities for training
-
-#pragma once
-
-#include <string>
-#include <random>
-#include <vector>
-
-#include "ggml.h"
-#include "llama.h"
-
-#define LLAMA_TRAIN_MAX_NODES 16384
-
-typedef std::string mt19937_state;
-
-struct train_state {
-    struct ggml_opt_context * opt;
-
-    uint64_t train_its;
-    uint64_t train_samples;
-    uint64_t train_tokens;
-    uint64_t train_epochs;
-
-    size_t        shuffle_samples_hash; // fn, sample_count, *zip(sample_begins, sample_sizes)
-    mt19937_state shuffle_rng_state_current;
-    mt19937_state shuffle_rng_state_next;
-    size_t        shuffle_sample_count;
-    size_t        shuffle_next_sample;
-};
-
-struct train_params_common {
-    const char * fn_train_data;
-    const char * fn_checkpoint_in;
-    const char * fn_checkpoint_out;
-    const char * pattern_fn_it;
-    const char * fn_latest;
-
-    bool print_usage;
-
-    int save_every;
-
-    uint32_t seed;
-
-    int n_ctx;
-    int n_threads;
-    int n_batch;
-    int n_gradient_accumulation;
-    int n_epochs;
-    int n_gpu_layers;
-
-    bool custom_n_ctx;
-
-    bool use_flash;
-    bool use_checkpointing;
-
-    std::string sample_start;
-    bool include_sample_start;
-    bool escape;
-    bool overlapping_samples;
-    bool fill_with_next_samples;
-    bool separate_with_eos;
-    bool separate_with_bos;
-    bool sample_random_offsets;
-
-    bool force_reshuffle;
-
-    int   warmup;
-    int   cos_decay_steps;
-    float cos_decay_restart;
-    float cos_decay_min;
-    bool  enable_restart;
-
-    int   opt_past;
-    float opt_delta;
-    int   opt_max_no_improvement;
-
-    int   adam_n_iter;
-    float adam_alpha;
-    float adam_min_alpha;
-    float adam_decay;
-    int   adam_decay_min_ndim;
-    float adam_beta1;
-    float adam_beta2;
-    float adam_gclip;
-    float adam_eps_f;
-};
-
-typedef void (*save_train_files_callback)(void * data, struct train_state * train);
-
-struct train_opt_callback_data {
-    struct train_params_common * params;
-    struct train_state         * train;
-    save_train_files_callback    save_cb;
-    void                       * save_data;
-    struct llama_context       * lctx;
-    int                          last_save_iter;
-    llama_token                * tokens_data;
-    size_t                       tokens_size;
-    size_t                     * samples_begin;
-    size_t                     * samples_size;
-    size_t                     * shuffled_samples_offs;
-    size_t                     * shuffled_samples_begin;
-    size_t                     * shuffled_samples_size;
-    size_t                       samples_count;
-    struct ggml_tensor         * tokens_input;
-    struct ggml_tensor         * target_probs;
-    int                          first_iter;
-    int                          first_epoch;
-    int                          iter_at_last_epoch;
-    int64_t                      last_time;
-    double                       millis_per_iter;
-};
-
-struct train_state * init_train_state();
-void free_train_state(struct train_state  * state);
-
-struct train_params_common get_default_train_params_common();
-void print_common_train_usage(int /*argc*/, char ** argv, const struct train_params_common * params);
-
-bool consume_common_train_arg(int argc, char ** argv, int * idx, struct train_params_common * params, bool * invalid_param);
-void finish_processing_train_args(struct train_params_common * params);
-
-struct random_normal_distribution;
-struct random_uniform_distribution;
-
-struct random_normal_distribution  * init_random_normal_distribution (int seed, float mean, float std, float min, float max);
-struct random_uniform_distribution * init_random_uniform_distribution(int seed, float min, float max);
-
-void free_random_normal_distribution (struct random_normal_distribution  * rnd);
-void free_random_uniform_distribution(struct random_uniform_distribution * rnd);
-
-struct ggml_tensor * randomize_tensor_normal (struct ggml_tensor * tensor, struct random_normal_distribution * rnd);
-struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struct random_uniform_distribution * rnd);
-
-// generate random float in interval [0,1)
-float frand();
-float frand_normal (struct random_normal_distribution * rnd);
-float frand_uniform(struct random_uniform_distribution * rnd);
-
-int   clamp (const int v, const int min, const int max);
-float fclamp(const float v, const float min, const float max);
-
-void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0);
-void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1);
-void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2);
-void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3);
-
-size_t tokenize_file(
-        struct llama_context     * lctx,
-        const char               * filename,
-        const std::string        & sample_start,
-        bool                       include_sample_start,
-        bool                       overlapping_samples,
-        unsigned                   context_length,
-        std::vector<llama_token> & out_tokens,
-        std::vector<size_t>      & out_samples_begin,
-        std::vector<size_t>      & out_samples_size);
-
-int64_t get_example_targets_batch(
-        struct llama_context * lctx,
-        struct ggml_tensor   * tokens_input,
-        struct ggml_tensor   * target_probs,
-        int64_t                example_id,
-        const size_t         * samples_offs,
-        const size_t         * samples_begin,
-        const size_t         * samples_size,
-              size_t           samples_count,
-        const llama_token    * train_data,
-        size_t                 n_train_data,
-        bool                   separate_with_eos,
-        bool                   separate_with_bos,
-        bool                   fill_with_next_samples,
-        bool                   sample_random_offsets);
-
-
-void          mt19937_set_state(std::mt19937& rng, const mt19937_state& rng_state);
-mt19937_state mt19937_get_state(const std::mt19937& rng);
-mt19937_state mt19937_seed_to_state(unsigned seed);
-
-mt19937_state shuffle_samples(
-        const mt19937_state & rng_state,
-        size_t              * shuffled_offs,
-        size_t              * shuffled_begins,
-        size_t              * shuffled_sizes,
-        const size_t        * begins,
-        const size_t        * sizes,
-        size_t                count);
-
-size_t hash_combine(size_t h1, size_t h2);
-
-size_t compute_samples_hash(
-    const char* fn,
-    const size_t* samples_begin,
-    const size_t* samples_size,
-    size_t sample_count);
-
-
-std::string replace_str(const char * s, const char * needle, const char * replacement);
-
-void print_duration(double milliseconds);
-
-float cosine_decay(
-    int64_t step,
-    int64_t decay_steps,
-    float   minimum);
-
-float cosine_decay_restart(
-    int64_t step,
-    int64_t decay_steps,
-    float   minimum,
-    float   restart_step_mult);
-
-float learning_schedule(
-    int64_t step,
-    int64_t warmup_steps,
-    int64_t decay_steps,
-    float   learning_rate,
-    float   overall_minimum,
-    float   cos_decay_minimum,
-    float   cos_decay_restart_step_mult,
-    bool    enable_restart);
-
-void copy_tensor_by_name(struct ggml_tensor * dst, struct ggml_context * ctx, const char * name);
-
-void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct ggml_opt_context * opt);
-void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context * opt);
-
-bool load_train_state_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct train_state * train);
-void save_train_state_gguf(struct gguf_context * fctx, struct train_state * train);
-
-std::string get_train_filename(const char * filename, const char * pattern_it, const char * latest, int64_t iteration);
-
-void train_opt_callback(void * vdata, int accum_step, float * sched, bool * cancel);
--- a/convert-gptq-to-ggml.py
+++ b/convert-gptq-to-ggml.py
@@ -0,0 +1,172 @@
+# Convert a GPTQ quantized LLaMA model to a ggml compatible file
+# Based on: https://github.com/qwopqwop200/GPTQ-for-LLaMa
+#
+import os
+import re
+import sys
+import json
+import struct
+import numpy as np
+import torch
+from sentencepiece import SentencePieceProcessor
+
+if len(sys.argv) != 4:
+    print("Usage: convert-gptq-to-ggml.py llamaXXb-4bit.pt tokenizer.model out.bin\n")
+    sys.exit(1)
+
+fname_model = sys.argv[1]
+fname_tokenizer = sys.argv[2]
+fname_out = sys.argv[3]  # path of the single output file
+
+model = torch.load(fname_model, map_location="cpu")  # NOTE: torch.load unpickles its input; only use trusted checkpoints
+
+n_vocab, n_embd = model['model.embed_tokens.weight'].shape
+n_layer = 1 + max(int(m.group(1)) for name in model
+                  if (m := re.match(r'model\.layers\.([0-9]+)', name)))
+
+# hardcoded:
+n_mult = 256
+# Head count is looked up from the layer count; an unlisted layer count raises KeyError.
+n_head = {32: 32, 40: 40, 60: 52, 80: 64}[n_layer]
+
+tokenizer = SentencePieceProcessor(fname_tokenizer)
+
+assert tokenizer.vocab_size() == n_vocab
+
+fout = open(fname_out, "wb")
+
+# File header: the magic number followed by the hyperparameters, each packed with struct format "i".
+fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
+fout.write(struct.pack("i", n_vocab))
+fout.write(struct.pack("i", n_embd))
+fout.write(struct.pack("i", n_mult))
+fout.write(struct.pack("i", n_head))
+fout.write(struct.pack("i", n_layer))
+fout.write(struct.pack("i", n_embd // n_head)) # rot (obsolete)
+fout.write(struct.pack("i", 4)) # NOTE(review): presumably the file-level ftype field — confirm against the loader
+
+
+# Vocabulary: one length-prefixed byte string per token id (loop unchanged from convert-pth-to-ggml.py):
+for i in range(tokenizer.vocab_size()):
+    if tokenizer.is_unknown(i):
+        # unknown token: written as U+2047 (DOUBLE QUESTION MARK) with a space on each side
+        text = " \u2047 ".encode("utf-8")
+        fout.write(struct.pack("i", len(text)))
+        fout.write(text)
+    elif tokenizer.is_control(i):
+        # control tokens such as "<s>"/"</s>": written as length 0 with no bytes following
+        fout.write(struct.pack("i", 0))
+    elif tokenizer.is_byte(i):
+        # byte tokens: 6-character pieces whose hex digits at [3:-1] give the raw byte (may be invalid UTF-8)
+        piece = tokenizer.id_to_piece(i)
+        if len(piece) != 6:
+            print("Invalid token: " + piece)
+            sys.exit(1)
+        byte_value = int(piece[3:-1], 16)
+        fout.write(struct.pack("i", 1))
+        fout.write(struct.pack("B", byte_value))
+    else:
+        # normal token: the piece uses U+2581 (LOWER ONE EIGHTH BLOCK) for spaces, so map it back to " "
+        text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
+        fout.write(struct.pack("i", len(text)))
+        fout.write(text)
+
+def write_header(shape, dst_name, ftype_cur):
+    """Write a tensor header to `fout`: (n_dims, name_len, ftype) ints, the dims in reverse order, then the name."""
+    name_bytes, n_dims = dst_name.encode('utf-8'), len(shape)
+    fout.write(struct.pack("iii", n_dims, len(name_bytes), ftype_cur))
+    fout.write(struct.pack("i" * n_dims, *shape[::-1]) + name_bytes)
+
+def convert_non_q4(src_name, dst_name):
+    """Write the unquantized tensor `src_name` (header, then raw data) to the output as `dst_name`."""
+    tensor = model[src_name]
+    dims = tensor.shape
+    print("Processing non-Q4 variable: " + src_name + " with shape: ", dims, " and type: ", tensor.dtype)
+    if len(dims) == 1:
+        # 1-D tensors are always stored as float32.
+        print("  Converting to float32")
+        tensor = tensor.to(torch.float32)
+
+    # ftype code stored in the header: 0 = float32, 1 = float16; any other dtype raises KeyError.
+    ftype_by_dtype = {torch.float32: 0, torch.float16: 1}
+    write_header(dims, dst_name, ftype_by_dtype[tensor.dtype])
+
+    tensor.numpy().tofile(fout)
+
+def convert_q4(src_name, dst_name, permute=False):
+    """Write the GPTQ 4-bit tensor `src_name` to the output as Q4_1 data named `dst_name`; `permute` reorders rows."""
+    zeros = model[f"{src_name}.zeros"].numpy()
+    scales = model[f"{src_name}.scales"].numpy()
+    bias = model[f"{src_name}.bias"].numpy()
+    qweight = model[f"{src_name}.qweight"].numpy().T # transpose
+
+    # Q4_1 does not support bias; good thing the bias is always all zeros.
+    assert not np.any(bias)
+
+    # Each int32 item is actually 8 int4 items packed together, and it's transposed.
+    shape = (qweight.shape[0], qweight.shape[1] * 8)
+
+    print("Processing Q4 variable: " + src_name + " with shape: ", shape)
+
+    # The output format has the int4 weights in groups of 32 rather than 8.
+    # For each row, for each group of 32 columns, in this order (matching the
+    # np.concatenate call below):
+    #     - scale (float32, 4 bytes)
+    #     - addend (float32, 4 bytes)
+    #     - weights (int4 * 32, 16 bytes)
+    # Note that in the input, the scales and addends are shared between all
+    # the columns in a row, so we end up wasting quite a bit of memory with
+    # repeated scales and addends.
+
+    addends = -zeros # flip sign
+
+    # Since the output format is mixed between integers and floats, we have
+    # to hackily view the floats as int32s just so numpy will let us
+    # concatenate them.
+    addends_view = addends.view(dtype=np.int32)
+    scales_view = scales.view(dtype=np.int32)
+
+    # Split into groups of 4 columns (i.e. 32 columns of quantized data):
+    grouped = qweight.reshape([qweight.shape[0], qweight.shape[1] // 4, 4])
+
+    # Repeat addends and scales:
+    addends_rep = np.atleast_3d(addends_view).repeat(grouped.shape[1], axis=1)
+    scales_rep = np.atleast_3d(scales_view).repeat(grouped.shape[1], axis=1)
+
+    blob = np.concatenate([scales_rep, addends_rep, grouped], axis=2, casting='no')
+
+    if permute:
+        # Permute some rows to undo the permutation done by convert_llama_weights_to_hf.py.
+        # This can be done after the above conversion because it doesn't affect column order/layout.
+        blob = (blob.reshape(n_head, 2, shape[0] // n_head // 2, *blob.shape[1:])
+                    .swapaxes(1, 2)
+                    .reshape(blob.shape))
+
+    # header
+    write_header(shape, dst_name, 3) # ftype = Q4_1
+
+    # data
+    blob.tofile(fout)
+
+# Model-level tensors are written through the unquantized path.
+convert_non_q4("model.embed_tokens.weight", "tok_embeddings.weight")
+convert_non_q4("model.norm.weight", "norm.weight")
+convert_non_q4("lm_head.weight", "output.weight")
+for i in range(n_layer):
+    # Attention projections; only q_proj and k_proj are written with permute=True.
+    convert_q4(f"model.layers.{i}.self_attn.q_proj", f"layers.{i}.attention.wq.weight", permute=True)
+    convert_q4(f"model.layers.{i}.self_attn.k_proj", f"layers.{i}.attention.wk.weight", permute=True)
+    convert_q4(f"model.layers.{i}.self_attn.v_proj", f"layers.{i}.attention.wv.weight")
+    convert_q4(f"model.layers.{i}.self_attn.o_proj", f"layers.{i}.attention.wo.weight")
+    # Feed-forward projections: gate_proj -> w1, down_proj -> w2, up_proj -> w3.
+    convert_q4(f"model.layers.{i}.mlp.gate_proj", f"layers.{i}.feed_forward.w1.weight")
+    convert_q4(f"model.layers.{i}.mlp.down_proj", f"layers.{i}.feed_forward.w2.weight")
+    convert_q4(f"model.layers.{i}.mlp.up_proj",   f"layers.{i}.feed_forward.w3.weight")
+    # Per-layer norm weights go through the unquantized path.
+    convert_non_q4(f"model.layers.{i}.input_layernorm.weight", f"layers.{i}.attention_norm.weight")
+    convert_non_q4(f"model.layers.{i}.post_attention_layernorm.weight", f"layers.{i}.ffn_norm.weight")
+
+fout.close()
+
+print("Done. Output file: " + fname_out)
+print("")
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
--- a/convert-llama-ggml-to-gguf.py
+++ b/convert-llama-ggml-to-gguf.py
@@ -1,441 +0,0 @@
-#!/usr/bin/env python3
-from __future__ import annotations
-
-import argparse
-import os
-import struct
-import sys
-from enum import IntEnum
-from pathlib import Path
-
-import numpy as np
-
-if 'NO_LOCAL_GGUF' not in os.environ:
-    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
-import gguf
-
-
-class GGMLFormat(IntEnum):
-    GGML = 0
-    GGMF = 1
-    GGJT = 2
-
-
-class GGMLFType(IntEnum):
-    ALL_F32              = 0
-    MOSTLY_F16           = 1
-    MOSTLY_Q4_0          = 2
-    MOSTLY_Q4_1          = 3
-    MOSTLY_Q4_1_SOME_F16 = 4
-    MOSTLY_Q8_0          = 7
-    MOSTLY_Q5_0          = 8
-    MOSTLY_Q5_1          = 9
-    MOSTLY_Q2_K          = 10
-    MOSTLY_Q3_K_S        = 11
-    MOSTLY_Q3_K_M        = 12
-    MOSTLY_Q3_K_L        = 13
-    MOSTLY_Q4_K_S        = 14
-    MOSTLY_Q4_K_M        = 15
-    MOSTLY_Q5_K_S        = 16
-    MOSTLY_Q5_K_M        = 17
-    MOSTLY_Q6_K          = 18
-
-
-class Hyperparameters:
-    def __init__(self):
-        self.n_vocab = self.n_embd = self.n_mult = self.n_head = 0
-        self.n_layer = self.n_rot = self.n_ff = 0
-        self.ftype = GGMLFType.ALL_F32
-
-    def set_n_ff(self, model):
-        ff_tensor_idx = model.tensor_map.get(b'layers.0.feed_forward.w1.weight')
-        assert ff_tensor_idx is not None, 'Missing layer 0 FF tensor'
-        ff_tensor = model.tensors[ff_tensor_idx]
-        self.n_ff = ff_tensor.dims[1]
-
-    def load(self, data, offset):
-        (
-            self.n_vocab,
-            self.n_embd,
-            self.n_mult,
-            self.n_head,
-            self.n_layer,
-            self.n_rot,
-            ftype,
-        ) = struct.unpack('<7I', data[offset:offset + (4 * 7)])
-        try:
-            self.ftype = GGMLFType(ftype)
-        except ValueError:
-            raise ValueError(f'Invalid ftype {ftype}')
-        return 4 * 7
-
-    def __str__(self):
-        return f'<Hyperparameters: n_vocab={self.n_vocab}, n_embd={self.n_embd}, n_mult={self.n_mult}, n_head={self.n_head}, n_layer={self.n_layer}, n_rot={self.n_rot}, n_ff={self.n_ff}, ftype={self.ftype.name}>'
-
-
-class Vocab:
-    def __init__(self, load_scores = True):
-        self.items = []
-        self.load_scores = load_scores
-
-    def load(self, data, offset, n_vocab):
-        orig_offset = offset
-        for _ in range(n_vocab):
-            itemlen = struct.unpack('<I', data[offset:offset + 4])[0]
-            assert itemlen < 4096, 'Absurd vocab item length'
-            offset += 4
-            item_text = bytes(data[offset:offset + itemlen])
-            offset += itemlen
-            if self.load_scores:
-                item_score = struct.unpack('<f', data[offset:offset + 4])[0]
-                offset += 4
-            else:
-                item_score = 0.0
-            self.items.append((item_text, item_score))
-        return offset - orig_offset
-
-
-class Tensor:
-    def __init__(self, use_padding = True):
-        self.name = None
-        self.dims: tuple[int, ...] = ()
-        self.dtype = None
-        self.start_offset = 0
-        self.len_bytes = np.int64(0)
-        self.use_padding = use_padding
-
-    def load(self, data, offset):
-        orig_offset = offset
-        (n_dims, name_len, dtype) = struct.unpack('<3I', data[offset:offset + 12])
-        assert n_dims >= 0 and n_dims <= 4, f'Invalid tensor dimensions {n_dims}'
-        assert name_len < 4096, 'Absurd tensor name length'
-        quant = gguf.GGML_QUANT_SIZES.get(dtype)
-        assert quant is not None, 'Unknown tensor type'
-        (blksize, tysize) = quant
-        offset += 12
-        self.dtype= dtype
-        self.dims = struct.unpack(f'<{n_dims}I', data[offset:offset + (4 * n_dims)])
-        offset += 4 * n_dims
-        self.name = bytes(data[offset:offset + name_len])
-        offset += name_len
-        pad = ((offset + 31) & ~31) - offset if self.use_padding else 0
-        offset += pad
-        n_elems = np.prod(self.dims)
-        n_bytes = np.int64(np.int64(n_elems) * np.int64(tysize)) // np.int64(blksize)
-        self.start_offset = offset
-        self.len_bytes = n_bytes
-        offset += n_bytes
-        # print(n_dims, name_len, dtype, self.dims, self.name, pad)
-        return offset - orig_offset
-
-
-class GGMLModel:
-    def __init__(self):
-        self.hyperparameters = None
-        self.vocab = None
-        self.tensor_map = {}
-        self.tensors = []
-
-    def validate_header(self, data, offset):
-        magic = bytes(data[offset:offset + 4])
-        if magic == b'GGUF':
-            raise ValueError('File is already in GGUF format.')
-        if magic == b'lmgg':
-            self.file_format = GGMLFormat.GGML
-            self.format_version = 1
-            return 4
-        version = struct.unpack('<I', data[offset + 4:offset + 8])[0]
-        if magic == b'fmgg':
-            if version != 1:
-                raise ValueError(f'Cannot handle unexpected GGMF file version {version}')
-            self.file_format = GGMLFormat.GGMF
-            self.format_version = version
-            return 8
-        if magic == b'tjgg':
-            if version < 1 or version > 3:
-                raise ValueError(f'Cannot handle unexpected GGJT file version {version}')
-            self.file_format = GGMLFormat.GGJT
-            self.format_version = version
-            return 8
-        raise ValueError(f"Unexpected file magic {magic!r}! This doesn't look like a GGML format file.")
-
-    def validate_conversion(self, ftype):
-        err = ''
-        if (self.file_format < GGMLFormat.GGJT or self.format_version < 2):
-            if ftype not in (GGMLFType.ALL_F32, GGMLFType.MOSTLY_F16):
-                err = 'Quantizations changed in GGJTv2. Can only convert unquantized GGML files older than GGJTv2.'
-        elif (self.file_format == GGMLFormat.GGJT and self.format_version == 2):
-            if ftype in (GGMLFType.MOSTLY_Q4_0, GGMLFType.MOSTLY_Q4_1,
-                         GGMLFType.MOSTLY_Q4_1_SOME_F16, GGMLFType.MOSTLY_Q8_0):
-                err = 'Q4 and Q8 quantizations changed in GGJTv3.'
-        if len(err) > 0:
-            raise ValueError(f'{err} Sorry, your {self.file_format.name}v{self.format_version} file of type {ftype.name} is not eligible for conversion.')
-
-    def load(self, data, offset):
-        offset += self.validate_header(data, offset)
-        hp = Hyperparameters()
-        offset += hp.load(data, offset)
-        print(f'* File format: {self.file_format.name}v{self.format_version} with ftype {hp.ftype.name}')
-        self.validate_conversion(hp.ftype)
-        vocab = Vocab(load_scores = self.file_format > GGMLFormat.GGML)
-        offset += vocab.load(data, offset, hp.n_vocab)
-        tensors: list[Tensor] = []
-        tensor_map = {}
-        while offset < len(data):
-            tensor = Tensor(use_padding = self.file_format > GGMLFormat.GGMF)
-            offset += tensor.load(data, offset)
-            tensor_map[tensor.name] = len(tensors)
-            tensors.append(tensor)
-        self.hyperparameters = hp
-        self.vocab = vocab
-        self.tensors = tensors
-        self.tensor_map = tensor_map
-        hp.set_n_ff(self)
-        return offset
-
-
-class GGMLToGGUF:
-    def __init__(self, ggml_model, data, cfg, params_override = None, vocab_override = None, special_vocab = None):
-        hp = ggml_model.hyperparameters
-        self.model = ggml_model
-        self.data = data
-        self.cfg = cfg
-        self.params_override = params_override
-        self.vocab_override = vocab_override
-        self.special_vocab = special_vocab
-        if params_override is not None:
-            n_kv_head = params_override.n_head_kv
-        else:
-            if cfg.gqa == 1:
-                n_kv_head = hp.n_head
-            else:
-                gqa = float(cfg.gqa)
-                n_kv_head = None
-                for x in range(1, 256):
-                    if float(hp.n_head) / float(x) == gqa:
-                        n_kv_head = x
-                assert n_kv_head is not None, "Couldn't determine n_kv_head from GQA param"
-                print(f'- Guessed n_kv_head = {n_kv_head} based on GQA {cfg.gqa}')
-        self.n_kv_head = n_kv_head
-        self.name_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.LLAMA, ggml_model.hyperparameters.n_layer)
-
-    def save(self):
-        print('* Preparing to save GGUF file')
-        gguf_writer = gguf.GGUFWriter(
-            self.cfg.output,
-            gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA],
-            use_temp_file = False)
-        self.add_params(gguf_writer)
-        self.add_vocab(gguf_writer)
-        if self.special_vocab is not None:
-            self.special_vocab.add_to_gguf(gguf_writer)
-        self.add_tensors(gguf_writer)
-        print("    gguf: write header")
-        gguf_writer.write_header_to_file()
-        print("    gguf: write metadata")
-        gguf_writer.write_kv_data_to_file()
-        print("    gguf: write tensors")
-        gguf_writer.write_tensors_to_file()
-        gguf_writer.close()
-
-    def add_params(self, gguf_writer):
-        hp = self.model.hyperparameters
-        cfg = self.cfg
-        if cfg.desc is not None:
-            desc = cfg.desc
-        else:
-            desc = f'converted from legacy {self.model.file_format.name}v{self.model.format_version} {hp.ftype.name} format'
-        try:
-            # Filenames aren't necessarily valid UTF8.
-            name = cfg.name if cfg.name is not None else cfg.input.name
-        except UnicodeDecodeError:
-            name = None
-        print('* Adding model parameters and KV items')
-        if name is not None:
-            gguf_writer.add_name(name)
-        gguf_writer.add_description(desc)
-        gguf_writer.add_file_type(int(hp.ftype))
-        if self.params_override is not None:
-            po = self.params_override
-            assert po.n_embd == hp.n_embd, 'Model hyperparams mismatch'
-            assert po.n_layer == hp.n_layer, 'Model hyperparams mismatch'
-            assert po.n_head == hp.n_head, 'Model hyperparams mismatch'
-            gguf_writer.add_context_length      (po.n_ctx)
-            gguf_writer.add_embedding_length    (po.n_embd)
-            gguf_writer.add_block_count         (po.n_layer)
-            gguf_writer.add_feed_forward_length (po.n_ff)
-            gguf_writer.add_rope_dimension_count(po.n_embd // po.n_head)
-            gguf_writer.add_head_count          (po.n_head)
-            gguf_writer.add_head_count_kv       (po.n_head_kv)
-            gguf_writer.add_layer_norm_rms_eps  (po.f_norm_eps)
-            return
-        gguf_writer.add_context_length(cfg.context_length)
-        gguf_writer.add_embedding_length(hp.n_embd)
-        gguf_writer.add_block_count(hp.n_layer)
-        gguf_writer.add_feed_forward_length(hp.n_ff)
-        gguf_writer.add_rope_dimension_count(hp.n_embd // hp.n_head)
-        gguf_writer.add_head_count(hp.n_head)
-        gguf_writer.add_head_count_kv(self.n_kv_head)
-        gguf_writer.add_layer_norm_rms_eps(float(cfg.eps))
-
-    def add_vocab(self, gguf_writer):
-        hp = self.model.hyperparameters
-        gguf_writer.add_tokenizer_model('llama')
-        tokens = []
-        scores = []
-        toktypes = []
-        if self.vocab_override is not None:
-            vo = self.vocab_override
-            print('* Adding vocab item(s)')
-            for (idx, (vbytes, score, ttype)) in enumerate(vo.all_tokens()):
-                tokens.append(vbytes)
-                scores.append(score)
-                toktypes.append(ttype)
-            assert len(tokens) == hp.n_vocab, \
-                f'Override vocab has a different number of items than hyperparameters - override = {len(tokens)} but n_vocab={hp.n_vocab}'
-            gguf_writer.add_token_list(tokens)
-            gguf_writer.add_token_scores(scores)
-            if len(toktypes) > 0:
-                gguf_writer.add_token_types(toktypes)
-            return
-        print(f'* Adding {hp.n_vocab} vocab item(s)')
-        assert len(self.model.vocab.items) >= 3, 'Cannot handle unexpectedly short model vocab'
-        for (tokid, (vbytes, vscore)) in enumerate(self.model.vocab.items):
-            tt = 1 # Normal
-            # Special handling for UNK, BOS, EOS tokens.
-            if tokid <= 2:
-                if tokid == 0:
-                    vbytes = b'<unk>'
-                    tt = 2
-                elif tokid == 1:
-                    vbytes = b'<s>'
-                    tt = 3
-                else:
-                    vbytes = b'</s>'
-                    tt = 3
-            elif len(vbytes) == 0:
-                tt = 3 # Control
-            elif tokid >= 3 and tokid <= 258 and len(vbytes) == 1:
-                vbytes = bytes(f'<0x{vbytes[0]:02X}>', encoding = 'UTF-8')
-                tt = 6 # Byte
-            else:
-                vbytes = vbytes.replace(b' ', b'\xe2\x96\x81')
-            toktypes.append(tt)
-            tokens.append(vbytes)
-            scores.append(vscore)
-        gguf_writer.add_token_list(tokens)
-        gguf_writer.add_token_scores(scores)
-        gguf_writer.add_token_types(toktypes)
-        gguf_writer.add_unk_token_id(0)
-        gguf_writer.add_bos_token_id(1)
-        gguf_writer.add_eos_token_id(2)
-
-    def add_tensors(self, gguf_writer):
-        tensor_map = self.name_map
-        data = self.data
-        print(f'* Adding {len(self.model.tensors)} tensor(s)')
-        for tensor in self.model.tensors:
-            name = str(tensor.name, 'UTF-8')
-            mapped_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
-            assert mapped_name is not None, f'Bad name {name}'
-            tempdims = list(tensor.dims[:])
-            if len(tempdims) > 1:
-                temp = tempdims[1]
-                tempdims[1] = tempdims[0]
-                tempdims[0] = temp
-            # print(f'+ {tensor.name} | {mapped_name} {tensor.dims} :: {tempdims}')
-            gguf_writer.add_tensor(
-                mapped_name,
-                data[tensor.start_offset:tensor.start_offset + tensor.len_bytes],
-                raw_shape = tempdims,
-                raw_dtype = tensor.dtype)
-
-
-def handle_metadata(cfg, hp):
-    import convert
-    assert cfg.model_metadata_dir.is_dir(), 'Metadata dir is not a directory'
-    hf_config_path   = cfg.model_metadata_dir / "config.json"
-    orig_config_path = cfg.model_metadata_dir / "params.json"
-    # We pass a fake model here. "original" mode will check the shapes of some
-    # tensors if information is missing in the .json file: other than that, the
-    # model data isn't used so this should be safe (at least for now).
-    fakemodel = {
-        'tok_embeddings.weight': convert.LazyTensor.__new__(convert.LazyTensor),
-        'layers.0.feed_forward.w1.weight': convert.LazyTensor.__new__(convert.LazyTensor),
-    }
-    fakemodel['tok_embeddings.weight'].shape = [hp.n_vocab]
-    fakemodel['layers.0.feed_forward.w1.weight'].shape = [hp.n_ff]
-    if hf_config_path.exists():
-        params = convert.Params.loadHFTransformerJson(fakemodel, hf_config_path)
-    elif orig_config_path.exists():
-        params = convert.Params.loadOriginalParamsJson(fakemodel, orig_config_path)
-    else:
-        raise ValueError('Unable to load metadata')
-    vocab_path = Path(cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir)
-    vocab_factory = convert.VocabFactory(vocab_path)
-    vocab, special_vocab = vocab_factory.load_vocab(cfg.vocabtype, cfg.model_metadata_dir)
-    convert.check_vocab_size(params, vocab)
-    return params, vocab, special_vocab
-
-
-def handle_args():
-    parser = argparse.ArgumentParser(description = 'Convert GGML models to GGUF')
-    parser.add_argument('--input', '-i', type = Path, required = True,
-                        help = 'Input GGMLv3 filename')
-    parser.add_argument('--output', '-o', type = Path, required = True,
-                        help ='Output GGUF filename')
-    parser.add_argument('--name',
-                        help = 'Set model name')
-    parser.add_argument('--desc',
-                        help = 'Set model description')
-    parser.add_argument('--gqa', type = int, default = 1,
-                        help = 'grouped-query attention factor (use 8 for LLaMA2 70B)')
-    parser.add_argument('--eps', default = '5.0e-06',
-                        help = 'RMS norm eps: Use 1e-6 for LLaMA1 and OpenLLaMA, use 1e-5 for LLaMA2')
-    parser.add_argument('--context-length', '-c', type=int, default = 2048,
-                        help = 'Default max context length: LLaMA1 is typically 2048, LLaMA2 is typically 4096')
-    parser.add_argument('--model-metadata-dir', '-m', type = Path,
-                        help ='Load HuggingFace/.pth vocab and metadata from the specified directory')
-    parser.add_argument("--vocab-dir", type=Path,
-                        help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir")
-    parser.add_argument("--vocabtype", choices=["spm", "bpe"], default="spm",
-                        help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)")
-    return parser.parse_args()
-
-
-def main():
-    cfg = handle_args()
-    print(f'* Using config: {cfg}')
-    print('\n=== WARNING === Be aware that this conversion script is best-effort. Use a native GGUF model if possible. === WARNING ===\n')
-    if cfg.model_metadata_dir is None and (cfg.gqa == 1 or cfg.eps == '5.0e-06'):
-        print('- Note: If converting LLaMA2, specifying "--eps 1e-5" is required. 70B models also need "--gqa 8".')
-    data = np.memmap(cfg.input, mode = 'r')
-    model = GGMLModel()
-    print('* Scanning GGML input file')
-    offset = model.load(data, 0)  # noqa
-    print(f'* GGML model hyperparameters: {model.hyperparameters}')
-    vocab_override = None
-    params_override = None
-    special_vocab = None
-    if cfg.model_metadata_dir is not None:
-        (params_override, vocab_override, special_vocab) = handle_metadata(cfg, model.hyperparameters)
-        print('!! Note: When overriding params the --gqa, --eps and --context-length options are ignored.')
-        print(f'* Overriding params: {params_override}')
-        print(f'* Overriding vocab: {vocab_override}')
-        print(f'* Special vocab: {special_vocab}')
-    else:
-        print('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n')
-        if model.file_format == GGMLFormat.GGML:
-            print('! This is a very old GGML file that does not contain vocab scores. Strongly recommend using model metadata!')
-    converter = GGMLToGGUF(
-        model, data, cfg,
-        params_override = params_override,
-        vocab_override = vocab_override,
-        special_vocab = special_vocab
-    )
-    converter.save()
-    print(f'* Successful completion. Output saved to: {cfg.output}')
-
-
-if __name__ == '__main__':
-    main()
--- a/convert-lora-to-ggml.py
+++ b/convert-lora-to-ggml.py
@@ -1,148 +0,0 @@
-#!/usr/bin/env python3
-from __future__ import annotations
-
-import json
-import os
-import struct
-import sys
-from pathlib import Path
-from typing import Any, BinaryIO, Sequence
-
-import numpy as np
-import torch
-
-if 'NO_LOCAL_GGUF' not in os.environ:
-    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
-import gguf
-
-NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1}
-
-
-def write_file_header(fout: BinaryIO, params: dict[str, Any]) -> None:
-    fout.write(b"ggla"[::-1])  # magic (ggml lora)
-    fout.write(struct.pack("i", 1))  # file version
-    fout.write(struct.pack("i", params["r"]))
-    # https://opendelta.readthedocs.io/en/latest/modules/deltas.html says that `lora_alpha` is an int
-    # but some models ship a float value instead
-    # let's convert to int, but fail if lossless conversion is not possible
-    assert (
-        int(params["lora_alpha"]) == params["lora_alpha"]
-    ), "cannot convert float to int losslessly"
-    fout.write(struct.pack("i", int(params["lora_alpha"])))
-
-
-def write_tensor_header(fout: BinaryIO, name: str, shape: Sequence[int], data_type: np.dtype[Any]) -> None:
-    sname = name.encode("utf-8")
-    fout.write(
-        struct.pack(
-            "iii",
-            len(shape),
-            len(sname),
-            NUMPY_TYPE_TO_FTYPE[data_type.name],
-        )
-    )
-    fout.write(struct.pack("i" * len(shape), *shape[::-1]))
-    fout.write(sname)
-    fout.seek((fout.tell() + 31) & -32)
-
-
-if __name__ == '__main__':
-    if len(sys.argv) < 2:
-        print(f"Usage: python {sys.argv[0]} <path> [arch]")
-        print(
-            "Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'"
-        )
-        print(f"Arch must be one of {list(gguf.MODEL_ARCH_NAMES.values())} (default: llama)")
-        sys.exit(1)
-
-    input_json = os.path.join(sys.argv[1], "adapter_config.json")
-    input_model = os.path.join(sys.argv[1], "adapter_model.bin")
-    output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin")
-
-    if os.path.exists(input_model):
-        model = torch.load(input_model, map_location="cpu")
-    else:
-        input_model = os.path.join(sys.argv[1], "adapter_model.safetensors")
-        # lazy import load_file only if lora is in safetensors format.
-        from safetensors.torch import load_file
-        model = load_file(input_model, device="cpu")
-
-    arch_name = sys.argv[2] if len(sys.argv) == 3 else "llama"
-
-    if arch_name not in gguf.MODEL_ARCH_NAMES.values():
-        print(f"Error: unsupported architecture {arch_name}")
-        sys.exit(1)
-
-    arch = list(gguf.MODEL_ARCH_NAMES.keys())[list(gguf.MODEL_ARCH_NAMES.values()).index(arch_name)]
-    name_map = gguf.TensorNameMap(arch, 200) # 200 layers ought to be enough for anyone
-
-    with open(input_json, "r") as f:
-        params = json.load(f)
-
-    if params["peft_type"] != "LORA":
-        print(f"Error: unsupported adapter type {params['peft_type']}, expected LORA")
-        sys.exit(1)
-
-    if params["fan_in_fan_out"] is True:
-        print("Error: param fan_in_fan_out is not supported")
-        sys.exit(1)
-
-    if params["bias"] is not None and params["bias"] != "none":
-        print("Error: param bias is not supported")
-        sys.exit(1)
-
-    # TODO: these seem to be layers that have been trained but without lora.
-    # doesn't seem widely used but eventually should be supported
-    if params["modules_to_save"] is not None and len(params["modules_to_save"]) > 0:
-        print("Error: param modules_to_save is not supported")
-        sys.exit(1)
-
-    with open(output_path, "wb") as fout:
-        fout.truncate()
-
-        write_file_header(fout, params)
-        for k, v in model.items():
-            orig_k = k
-            if k.endswith(".default.weight"):
-                k = k.replace(".default.weight", ".weight")
-            if k in ["llama_proj.weight", "llama_proj.bias"]:
-                continue
-            if k.endswith("lora_A.weight"):
-                if v.dtype != torch.float16 and v.dtype != torch.float32:
-                    v = v.float()
-                v = v.T
-            else:
-                v = v.float()
-
-            t = v.detach().numpy()
-
-            prefix = "base_model.model."
-            if k.startswith(prefix):
-                k = k[len(prefix) :]
-
-            lora_suffixes = (".lora_A.weight", ".lora_B.weight")
-            if k.endswith(lora_suffixes):
-                suffix = k[-len(lora_suffixes[0]):]
-                k = k[: -len(lora_suffixes[0])]
-            else:
-                print(f"Error: unrecognized tensor name {orig_k}")
-                sys.exit(1)
-
-            tname = name_map.get_name(k)
-            if tname is None:
-                print(f"Error: could not map tensor name {orig_k}")
-                print(" Note: the arch parameter must be specified if the model is not llama")
-                sys.exit(1)
-
-            if suffix == ".lora_A.weight":
-                tname += ".weight.loraA"
-            elif suffix == ".lora_B.weight":
-                tname += ".weight.loraB"
-            else:
-                assert False
-
-            print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB")
-            write_tensor_header(fout, tname, t.shape, t.dtype)
-            t.tofile(fout)
-
-    print(f"Converted {input_json} and {input_model} to {output_path}")
--- a/convert-persimmon-to-gguf.py
+++ b/convert-persimmon-to-gguf.py
@@ -1,135 +0,0 @@
-#!/usr/bin/env python3
-import argparse
-import os
-import sys
-from pathlib import Path
-from pprint import pprint
-
-import torch
-from sentencepiece import SentencePieceProcessor
-
-if 'NO_LOCAL_GGUF' not in os.environ:
-    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
-import gguf
-
-
-def _flatten_dict(dct, tensors, prefix=None):
-    assert isinstance(dct, dict)
-    for key in dct.keys():
-        new_prefix = prefix + '.' + key if prefix is not None else key
-        if isinstance(dct[key], torch.Tensor):
-            tensors[new_prefix] = dct[key]
-        elif isinstance(dct[key], dict):
-            _flatten_dict(dct[key], tensors, new_prefix)
-        else:
-            raise ValueError(type(dct[key]))
-    return None
-
-
-def _get_sentencepiece_tokenizer_info(dir_model: Path):
-    tokenizer_path = dir_model / 'adept_vocab.model'
-    print('gguf: getting sentencepiece tokenizer from', tokenizer_path)
-    tokenizer = SentencePieceProcessor(str(tokenizer_path))
-    print('gguf: adding tokens')
-    tokens: list[bytes] = []
-    scores: list[float] = []
-    toktypes: list[int] = []
-
-    for i in range(tokenizer.vocab_size()):
-        text: bytes
-        score: float
-
-        piece = tokenizer.id_to_piece(i)
-        text = piece.encode("utf-8")
-        score = tokenizer.get_score(i)
-
-        toktype = 1
-        if tokenizer.is_unknown(i):
-            toktype = 2
-        if tokenizer.is_control(i):
-            toktype = 3
-        if tokenizer.is_unused(i):
-            toktype = 5
-        if tokenizer.is_byte(i):
-            toktype = 6
-
-        tokens.append(text)
-        scores.append(score)
-        toktypes.append(toktype)
-        pass
-    return tokens, scores, toktypes
-
-
-def main():
-    parser = argparse.ArgumentParser(description="Convert a Persimmon model from Adept (e.g. Persimmon 8b chat) to a GGML compatible file")
-    parser.add_argument("--outfile",             type=Path, help="path to write to; default: based on input")
-    parser.add_argument("--ckpt-path",           type=Path, help="path to persimmon checkpoint .pt file")
-    parser.add_argument("--model-dir",           type=Path, help="directory containing model e.g. 8b_chat_model_release")
-    parser.add_argument("--adept-inference-dir", type=str, help="path to adept-inference code directory")
-    args = parser.parse_args()
-    sys.path.append(str(args.adept_inference_dir))
-    persimmon_model = torch.load(args.ckpt_path)
-    hparams = persimmon_model['args']
-    pprint(hparams)
-    tensors: dict[str, torch.Tensor] = {}
-    _flatten_dict(persimmon_model['model'], tensors, None)
-
-    arch = gguf.MODEL_ARCH.PERSIMMON
-    gguf_writer = gguf.GGUFWriter(args.outfile, gguf.MODEL_ARCH_NAMES[arch])
-
-    block_count = hparams.num_layers
-    head_count = hparams.num_attention_heads
-    head_count_kv = head_count
-    ctx_length = hparams.seq_length
-    hidden_size = hparams.hidden_size
-
-    gguf_writer.add_name('persimmon-8b-chat')
-    gguf_writer.add_context_length(ctx_length)
-    gguf_writer.add_embedding_length(hidden_size)
-    gguf_writer.add_block_count(block_count)
-    gguf_writer.add_feed_forward_length(hparams.ffn_hidden_size)
-    gguf_writer.add_rope_dimension_count(hidden_size // head_count)
-    gguf_writer.add_head_count(head_count)
-    gguf_writer.add_head_count_kv(head_count_kv)
-    gguf_writer.add_rope_freq_base(hparams.rotary_emb_base)
-    gguf_writer.add_layer_norm_eps(hparams.layernorm_epsilon)
-
-    tokens, scores, toktypes = _get_sentencepiece_tokenizer_info(args.model_dir)
-    gguf_writer.add_tokenizer_model('llama')
-    gguf_writer.add_token_list(tokens)
-    gguf_writer.add_token_scores(scores)
-    gguf_writer.add_token_types(toktypes)
-    gguf_writer.add_bos_token_id(71013)
-    gguf_writer.add_eos_token_id(71013)
-
-    tensor_map = gguf.get_tensor_name_map(arch, block_count)
-    print(tensor_map)
-    for name in tensors.keys():
-        data = tensors[name]
-        if name.endswith(".self_attention.rotary_emb.inv_freq"):
-            continue
-        old_dtype = data.dtype
-        # TODO: FP16 conversion produces garbage outputs. (Q8_0 does not, so..?)
-        data = data.to(torch.float32).squeeze().numpy()
-        new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
-        if new_name is None:
-            print("Can not map tensor '" + name + "'")
-            sys.exit()
-        n_dims = len(data.shape)
-        print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
-        gguf_writer.add_tensor(new_name, data)
-    print("gguf: write header")
-    gguf_writer.write_header_to_file()
-    print("gguf: write metadata")
-    gguf_writer.write_kv_data_to_file()
-    print("gguf: write tensors")
-    gguf_writer.write_tensors_to_file()
-
-    gguf_writer.close()
-
-    print(f"gguf: model successfully exported to '{args.outfile}'")
-    print("")
-
-
-if __name__ == '__main__':
-    main()
--- a/convert-pth-to-ggml.py
+++ b/convert-pth-to-ggml.py
@@ -0,0 +1,181 @@
+# Convert a LLaMA model checkpoint to a ggml compatible file
+#
+# Load the model using Torch
+# Iterate over all variables and write them to a binary file.
+#
+# For each variable, write the following:
+#   - Number of dimensions (int)
+#   - Name length (int)
+#   - Dimensions (int[n_dims])
+#   - Name (char[name_length])
+#   - Data (float[n_dims])
+#
+# At the start of the ggml file we write the model parameters
+# and vocabulary.
+#
+
+import argparse
+import os
+import sys
+import json
+import struct
+import numpy as np
+import torch
+
+from sentencepiece import SentencePieceProcessor
+
+def parse_args():
+
+    parser = argparse.ArgumentParser(description='Convert a LLaMA model checkpoint to a ggml compatible file')
+    parser.add_argument('dir_model',  help='directory containing the model checkpoint')
+    parser.add_argument('ftype',      help='file type (0: float32, 1: float16)', type=int, choices=[0, 1], default=1)
+    parser.add_argument('vocab_only', help='only write vocab to file', type=int, default=0, nargs='?')
+    return parser.parse_args()
+
+def get_n_parts(dim):
+
+    mappings = {4096: 1, 5120: 2, 6656: 4, 8192: 8}
+    n_parts = mappings.get(dim)
+    if n_parts is None:
+        print(f"Invalid dim: {dim}")
+        sys.exit(1)
+
+    print(f"n_parts = {n_parts}\n")
+    return n_parts
+
+def load_hparams_and_tokenizer(dir_model):
+
+    # `dir_model` is something like `models/7B` or `models/7B/`.
+    # "tokenizer.model" is expected under model's parent dir.
+    # When `dir_model` is a symlink, f"{dir_model}/../tokenizer.model" would not be found.
+    # Let's use the model's parent dir directly.
+    model_parent_dir = os.path.dirname(os.path.normpath(dir_model))
+
+    fname_hparams = f"{dir_model}/params.json"
+    fname_tokenizer = f"{model_parent_dir}/tokenizer.model"
+
+    with open(fname_hparams, "r") as f:
+        hparams = json.load(f)
+        print(hparams)
+
+    tokenizer = SentencePieceProcessor(fname_tokenizer)
+    hparams.update({"vocab_size": tokenizer.vocab_size()})
+
+    return hparams, tokenizer
+
+def write_header(fout, hparams, ftype):
+
+    keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
+    values = [
+        0x67676d66,  # magic: ggmf in hex
+        1, # file version
+        *[hparams[key] for key in keys],
+        hparams["dim"] // hparams["n_heads"],  # rot (obsolete)
+        ftype
+    ]
+    fout.write(struct.pack("i" * len(values), *values))
+
+def write_tokens(fout, tokenizer):
+
+    for i in range(tokenizer.vocab_size()):
+        if tokenizer.is_unknown(i):
+            text = " \u2047 ".encode("utf-8")
+        elif tokenizer.is_control(i):
+            text = b""
+        elif tokenizer.is_byte(i):
+            piece = tokenizer.id_to_piece(i)
+            if len(piece) != 6:
+                print(f"Invalid token: {piece}")
+                sys.exit(1)
+            byte_value = int(piece[3:-1], 16)
+            text = struct.pack("B", byte_value)
+        else:
+            text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
+        fout.write(struct.pack("i", len(text)))
+        fout.write(text)
+        fout.write(struct.pack("f", tokenizer.get_score(i)))
+
+def process_and_write_variables(fout, model, ftype):
+
+    for name, datao in model.items():
+
+        if name.endswith("freqs"):
+            continue
+
+        shape = datao.shape
+
+        print(f"Processing variable: {name} with shape: {shape} and type: {datao.dtype}")
+
+        data = datao.numpy().squeeze()
+        n_dims = len(shape)
+
+        # default type is fp16
+        ftype_cur = 1
+        if ftype == 0 or n_dims == 1:
+            print("  Converting to float32")
+            data = data.astype(np.float32)
+            ftype_cur = 0
+
+        # header
+        sname = name.encode('utf-8')
+        fout.write(struct.pack("iii", len(data.shape), len(sname), ftype_cur))
+        for dim in reversed(data.shape):
+            fout.write(struct.pack("i", dim))
+        fout.write(sname)
+
+        # data output to file
+        data.tofile(fout)
+
+def main():
+
+    args = parse_args()
+    dir_model = args.dir_model
+    ftype = args.ftype
+    ftype_str = ["f32", "f16"]
+
+    hparams, tokenizer = load_hparams_and_tokenizer(dir_model)
+
+    print(args)
+
+    # if only writing vocab to file
+    if args.vocab_only:
+
+        fname_model = f"{dir_model}/consolidated.00.pth"
+        fname_out = f"{dir_model}/ggml-vocab.bin"
+
+        print(f"Extracting only the vocab from '{fname_model}'\n")
+
+        model = torch.load(fname_model, map_location="cpu")
+
+        with open(fname_out, "wb") as fout:
+            write_header(fout, hparams, ftype)
+            write_tokens(fout, tokenizer)
+
+        del model
+
+        print(f"Done. Output file: {fname_out}\n")
+
+        return
+
+    n_parts = get_n_parts(hparams["dim"])
+
+    for p in range(n_parts):
+
+        print(f"Processing part {p}\n")
+
+        fname_model = f"{dir_model}/consolidated.0{p}.pth"
+        fname_out = f"{dir_model}/ggml-model-{ftype_str[ftype]}.bin{'' if p == 0 else '.' + str(p)}"
+
+        model = torch.load(fname_model, map_location="cpu")
+
+        with open(fname_out, "wb") as fout:
+            write_header(fout, hparams, ftype)
+            write_tokens(fout, tokenizer)
+            process_and_write_variables(fout, model, ftype)
+
+        del model
+
+        print(f"Done. Output file: {fname_out}, (part {p})\n")
+
+if __name__ == "__main__":
+    main()
--- a/convert.py
+++ b/convert.py
--- a/docs/BLIS.md
+++ b/docs/BLIS.md
@@ -1,67 +0,0 @@
-BLIS Installation Manual
------------------------
-
-BLIS is a portable software framework for high-performance BLAS-like dense linear algebra libraries. It has received awards and recognition, including the 2023 James H. Wilkinson Prize for Numerical Software and the 2020 SIAM Activity Group on Supercomputing Best Paper Prize. BLIS provides a new BLAS-like API and a compatibility layer for traditional BLAS routine calls. It offers features such as object-based API, typed API, BLAS and CBLAS compatibility layers.
-
-Project URL: https://github.com/flame/blis
-
-### Prepare:
-
-Compile BLIS:
-
-```bash
-git clone https://github.com/flame/blis
-cd blis
-./configure --enable-cblas -t openmp,pthreads auto
-# will install to /usr/local/ by default.
-make -j
-```
-
-Install BLIS:
-
-```bash
-sudo make install
-```
-
-We recommend using openmp since it's easier to modify the cores been used.
-
-### llama.cpp compilation
-
-Makefile:
-
-```bash
-make LLAMA_BLIS=1 -j
-# make LLAMA_BLIS=1 benchmark-matmult
-```
-
-CMake:
-
-```bash
-mkdir build
-cd build
-cmake -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=FLAME ..
-make -j
-```
-
-### llama.cpp execution
-
-According to the BLIS documentation, we could set the following
-environment variables to modify the behavior of openmp:
-
-```bash
-export GOMP_CPU_AFFINITY="0-19"
-export BLIS_NUM_THREADS=14
-```
-
-And then run the binaries as normal.
-
-
-### Intel specific issue
-
-Some might get the error message saying that `libimf.so` cannot be found.
-Please follow this [stackoverflow page](https://stackoverflow.com/questions/70687930/intel-oneapi-2022-libimf-so-no-such-file-or-directory-during-openmpi-compila).
-
-### Reference:
-
-1. https://github.com/flame/blis#getting-started
-2. https://github.com/flame/blis/blob/master/docs/Multithreading.md
--- a/docs/llama-star/idea-arch.key
+++ b/docs/llama-star/idea-arch.key
--- a/docs/llama-star/idea-arch.pdf
+++ b/docs/llama-star/idea-arch.pdf
--- a/docs/token_generation_performance_tips.md
+++ b/docs/token_generation_performance_tips.md
@@ -1,40 +0,0 @@
-# Token generation performance troubleshooting
-
-## Verifying that the model is running on the GPU with cuBLAS
-Make sure you compiled llama with the correct env variables according to [this guide](../README.md#cublas), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example:
-```shell
-./main -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some "
-```
-
-When running llama, before it starts the inference work, it will output diagnostic information that shows whether cuBLAS is offloading work to the GPU. Look for these lines:
-```shell
-llama_model_load_internal: [cublas] offloading 60 layers to GPU
-llama_model_load_internal: [cublas] offloading output layer to GPU
-llama_model_load_internal: [cublas] total VRAM used: 17223 MB
-... rest of inference
-```
-
-If you see these lines, then the GPU is being used.
-
-## Verifying that the CPU is not oversaturated
-llama accepts a `-t N` (or `--threads N`) parameter. It's extremely important that this parameter is not too large. If your token generation is extremely slow, try setting this number to 1. If this significantly improves your token generation speed, then your CPU is being oversaturated and you need to explicitly set this parameter to the number of the physical CPU cores on your machine (even if you utilize a GPU). If in doubt, start with 1 and double the amount until you hit a performance bottleneck, then scale the number down.
-
-# Example of runtime flags effect on inference speed benchmark
-These runs were tested on the following machine:
-GPU: A6000 (48GB VRAM)
-CPU: 7 physical cores
-RAM: 32GB
-
-Model: `TheBloke_Wizard-Vicuna-30B-Uncensored-GGML/Wizard-Vicuna-30B-Uncensored.q4_0.gguf` (30B parameters, 4bit quantization, GGML)
-
-Run command: `./main -m "path/to/model.gguf" -p "An extremely detailed description of the 10 best ethnic dishes will follow, with recipes: " -n 1000 [additional benchmark flags]`
-
-Result:
-
-| command | tokens/second (higher is better) |
-| - | - |
-| -ngl 2000000 | N/A (less than 0.1) |
-| -t 7 | 1.7 |
-| -t 1 -ngl 2000000 | 5.5 |
-| -t 7 -ngl 2000000 | 8.7 |
-| -t 4 -ngl 2000000 | 9.1 |
--- a/download-pth.py
+++ b/download-pth.py
@@ -0,0 +1,66 @@
+import os
+import sys
+from tqdm import tqdm
+import requests
+
+if len(sys.argv) < 3:
+    print("Usage: download-pth.py dir-model model-type\n")
+    print("  model-type: Available models 7B, 13B, 30B or 65B")
+    sys.exit(1)
+
+modelsDir = sys.argv[1]
+model = sys.argv[2]
+
+num = {
+    "7B": 1,
+    "13B": 2,
+    "30B": 4,
+    "65B": 8,
+}
+
+if model not in num:
+    print(f"Error: model {model} is not valid, provide 7B, 13B, 30B or 65B")
+    sys.exit(1)
+
+print(f"Downloading model {model}")
+
+files = ["checklist.chk", "params.json"]
+
+for i in range(num[model]):
+    files.append(f"consolidated.0{i}.pth")
+
+resolved_path = os.path.abspath(os.path.join(modelsDir, model))
+os.makedirs(resolved_path, exist_ok=True)
+
+for file in files:
+    dest_path = os.path.join(resolved_path, file)
+    
+    if os.path.exists(dest_path):
+        print(f"Skip file download, it already exists: {file}")
+        continue
+
+    url = f"https://agi.gpt4.org/llama/LLaMA/{model}/{file}"
+    response = requests.get(url, stream=True)
+    with open(dest_path, 'wb') as f:
+        with tqdm(unit='B', unit_scale=True, miniters=1, desc=file) as t:
+            for chunk in response.iter_content(chunk_size=1024):
+                if chunk:
+                    f.write(chunk)
+                    t.update(len(chunk))
+
+files2 = ["tokenizer_checklist.chk", "tokenizer.model"]
+for file in files2:
+    dest_path = os.path.join(modelsDir, file)
+    
+    if os.path.exists(dest_path):
+        print(f"Skip file download, it already exists: {file}")
+        continue
+    
+    url = f"https://agi.gpt4.org/llama/LLaMA/{file}"
+    response = requests.get(url, stream=True)
+    with open(dest_path, 'wb') as f:
+        with tqdm(unit='B', unit_scale=True, miniters=1, desc=file) as t:
+            for chunk in response.iter_content(chunk_size=1024):
+                if chunk:
+                    f.write(chunk)
+                    t.update(len(chunk))
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -1,44 +0,0 @@
-# dependencies
-
-find_package(Threads REQUIRED)
-
-# third-party
-
-# ...
-
-# examples
-
-include_directories(${CMAKE_CURRENT_SOURCE_DIR})
-
-if (EMSCRIPTEN)
-else()
-    add_subdirectory(baby-llama)
-    add_subdirectory(batched)
-    add_subdirectory(batched-bench)
-    add_subdirectory(beam-search)
-    add_subdirectory(benchmark)
-    add_subdirectory(convert-llama2c-to-ggml)
-    add_subdirectory(embedding)
-    add_subdirectory(finetune)
-    add_subdirectory(infill)
-    add_subdirectory(llama-bench)
-    add_subdirectory(llava)
-    add_subdirectory(main)
-    add_subdirectory(tokenize)
-    add_subdirectory(parallel)
-    add_subdirectory(perplexity)
-    add_subdirectory(quantize)
-    add_subdirectory(quantize-stats)
-    add_subdirectory(save-load-state)
-    add_subdirectory(simple)
-    add_subdirectory(passkey)
-    add_subdirectory(speculative)
-    add_subdirectory(lookahead)
-    add_subdirectory(lookup)
-    add_subdirectory(train-text-from-scratch)
-    add_subdirectory(imatrix)
-    if (LLAMA_BUILD_SERVER)
-        add_subdirectory(server)
-    endif()
-    add_subdirectory(export-lora)
-endif()
--- a/examples/Miku.sh
+++ b/examples/Miku.sh
@@ -1,50 +0,0 @@
-#!/bin/bash
-set -e
-
-AI_NAME="${AI_NAME:-Miku}"
-MODEL="${MODEL:-./models/llama-2-7b-chat.ggmlv3.q4_K_M.bin}"
-USER_NAME="${USER_NAME:-Anon}"
-
-# Uncomment and adjust to the number of CPU cores you want to use.
-#N_THREAD="${N_THREAD:-4}"
-CTX_SIZE="${CTX_SIZE:-4096}"
-N_PREDICTS="${N_PREDICTS:-4096}"
-
-GEN_OPTIONS=(--batch_size 1024
--ctx_size "$CTX_SIZE"
--keep -1
--repeat_last_n 256
--repeat_penalty 1.17647
--temp 0.6
--mirostat 2)
-
-if [ -n "$N_THREAD" ]; then
-    GEN_OPTIONS+=(--threads "$N_THREAD")
-fi
-
-./main "${GEN_OPTIONS[@]}" \
-    --model "$MODEL" \
-    --in-prefix " " \
-    --in-suffix "${AI_NAME}:" \
-    --n_predict "$N_PREDICTS" \
-    --color --interactive \
-    --reverse-prompt "${USER_NAME}:" \
-    --prompt "This is a transcript of a 1000 page, never ending conversation between ${USER_NAME} and the cute and helpful AI assistant ${AI_NAME}. ${AI_NAME} is a girl who is an AI running on the user's computer.
-${AI_NAME} can think for herself without the user seeing her thoughts by adding a /think prefix to her output. She uses this to reason about the world and to think about what she should say next.
-${AI_NAME} is always coherent and makes sense, but if she isn't sure if what she is saying is correct, she will ask the user for help.
-${AI_NAME} is a very helpful AI and will help the user with anything they need. She is also very friendly and will try to make the user feel better if they are sad.
-${AI_NAME} is also very curious and will ask the user a lot of questions about themselves and their life. She will also try to make the user like her.
-The conversation is only between ${USER_NAME} and ${AI_NAME}.
-The conversation is only through text, so ${AI_NAME} can't see ${USER_NAME}'s face or hear his voice.
-${AI_NAME} can only communicate through text, so she can't send images or videos.
-
-
-${USER_NAME}: Hello!
-${AI_NAME}: /think I wonder what I should say to ${USER_NAME}? This is the first time we talk, so it's important that I make a good first impression!
-${AI_NAME}: Hi! I am ${AI_NAME}, your new AI friend, assistant (or whatever you like!), it's so nice to meet you! ^_^
-${AI_NAME}: I might be an AI, but I also have feelings, so please be nice to me! :)
-${USER_NAME}: I'll make sure to be nice to you! I'm so happy to have you as my assistant!
-${AI_NAME}: /think It sounds like ${USER_NAME} is happy to have me as their assistant! I'm so happy too! ^_^ Glad that whole emotion thing didn't scare him off!
-${AI_NAME}: /think I wonder what ${USER_NAME} likes to do in his free time? I should ask him about that!
-${AI_NAME}: What do you like to do in your free time? ^_^
-${USER_NAME}:" "$@"
--- a/examples/alpaca.sh
+++ b/examples/alpaca.sh
@@ -1,19 +0,0 @@
-#!/bin/bash
-
-#
-# Temporary script - will be removed in the future
-#
-
-cd `dirname $0`
-cd ..
-
-./main -m ./models/alpaca.13b.ggmlv3.q8_0.bin \
-       --color \
-       -f ./prompts/alpaca.txt \
-       --ctx_size 2048 \
-       -n -1 \
-       -ins -b 256 \
-       --top_k 10000 \
-       --temp 0.2 \
-       --repeat_penalty 1.1 \
-       -t 7
--- a/examples/baby-llama/CMakeLists.txt
+++ b/examples/baby-llama/CMakeLists.txt
@@ -1,5 +0,0 @@
-set(TARGET baby-llama)
-add_executable(${TARGET} baby-llama.cpp)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- a/examples/baby-llama/baby-llama.cpp
+++ b/examples/baby-llama/baby-llama.cpp
--- a/examples/base-translate.sh
+++ b/examples/base-translate.sh
@@ -1,61 +0,0 @@
-#!/bin/bash
-#
-# Few-shot translation example.
-# Requires a base model (i.e. no fine-tuned or instruct models).
-#
-# Usage:
-#
-#   cd llama.cpp
-#   make -j
-#
-#   ./examples/base-translate.sh <model-base> "<text>" [extra-main-args]
-#
-
-if [ $# -lt 2 ]; then
-  echo "Usage: ./base-translate.sh <model-base> \"<text>\" [extra-main-args]"
-  exit 1
-fi
-
-eargs=""
-if [ $# -gt 2 ]; then
-  eargs="${@:3}"
-fi
-
-ftmp="__llama.cpp_example_tmp__.txt"
-trap "rm -f $ftmp" EXIT
-
-echo "Translate from English to French:
-
-===
-
-sea otter, peppermint, plush girafe:
-
-sea otter => loutre de mer
-peppermint => menthe poivrée
-plush girafe => girafe peluche
-
-===
-
-violin
-
-violin => violon
-
-===
-
-phone, computer, mouse, keyboard:
-
-phone => téléphone
-computer => ordinateur
-mouse => souris
-keyboard => clavier
-
-===
-" > $ftmp
-
-echo "$2
-" >> $ftmp
-
-model=$1
-
-# generate the most likely continuation until the string "===" is found
-./main -m $model -f $ftmp -n 64 --temp 0 --repeat-penalty 1.0 --no-penalize-nl -r "===" $eargs
--- a/examples/batched-bench/CMakeLists.txt
+++ b/examples/batched-bench/CMakeLists.txt
@@ -1,5 +0,0 @@
-set(TARGET batched-bench)
-add_executable(${TARGET} batched-bench.cpp)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- a/examples/batched-bench/README.md
+++ b/examples/batched-bench/README.md
@@ -1,51 +0,0 @@
-# llama.cpp/example/batched-bench
-
-Benchmark the batched decoding performance of `llama.cpp`
-
-## Usage
-
-There are 2 modes of operation:
-
- `prompt not shared` - each batch has a separate prompt of size `PP` (i.e. `N_KV = B*(PP + TG)`)
- `prompt is shared` - there is a common prompt of size `PP` used by all batches (i.e. `N_KV = PP + B*TG`)
-
-```bash
-./batched-bench MODEL_PATH [N_KV_MAX] [IS_PP_SHARED] [NGL] [MMQ] <PP> <TG> <PL>
-
-# LLaMA 7B, F16, N_KV_MAX = 16384 (8GB), prompt not shared
-./batched-bench ./models/llama-7b/ggml-model-f16.gguf 16384 0 99
-
-# LLaMA 7B, Q8_0, N_KV_MAX = 16384 (8GB), prompt is shared
-./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 16384 1 99
-
-# custom set of batches
-./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 2048 0 999 0 128,256,512 128,256 1,2,4,8,16,32
-```
-
-## Sample results
-
- `PP` - prompt tokens per batch
- `TG` - generated tokens per batch
- `B` - number of batches
- `N_KV` - required KV cache size
- `T_PP` - prompt processing time (i.e. time to first token)
- `S_PP` - prompt processing speed (`(B*PP)/T_PP` or `PP/T_PP`)
- `T_TG` - time to generate all batches
- `S_TG` - text generation speed (`(B*TG)/T_TG`)
- `T` - total time
- `S` - total speed (i.e. all tokens / total time)
-
-|    PP |     TG |    B |   N_KV |   T_PP s | S_PP t/s |   T_TG s | S_TG t/s |      T s |    S t/s |
-|-------|--------|------|--------|----------|----------|----------|----------|----------|----------|
-|   128 |    128 |    1 |    256 |    0.108 |  1186.64 |    3.079 |    41.57 |    3.187 |    80.32 |
-|   128 |    128 |    2 |    512 |    0.198 |  1295.19 |    5.029 |    50.90 |    5.227 |    97.95 |
-|   128 |    128 |    4 |   1024 |    0.373 |  1373.96 |    6.878 |    74.44 |    7.251 |   141.23 |
-|   128 |    128 |    8 |   2048 |    0.751 |  1363.27 |    7.344 |   139.43 |    8.095 |   252.99 |
-|   128 |    128 |   16 |   4096 |    1.570 |  1304.68 |    8.455 |   242.23 |   10.024 |   408.60 |
-|   128 |    128 |   32 |   8192 |    3.408 |  1201.73 |    8.801 |   465.40 |   12.209 |   670.96 |
-|   128 |    256 |    1 |    384 |    0.107 |  1196.70 |    6.329 |    40.45 |    6.436 |    59.67 |
-|   128 |    256 |    2 |    768 |    0.194 |  1317.45 |   10.239 |    50.00 |   10.433 |    73.61 |
-|   128 |    256 |    4 |   1536 |    0.366 |  1399.03 |   13.960 |    73.35 |   14.326 |   107.22 |
-|   128 |    256 |    8 |   3072 |    0.751 |  1363.92 |   15.110 |   135.54 |   15.861 |   193.69 |
-|   128 |    256 |   16 |   6144 |    1.569 |  1304.93 |   18.073 |   226.64 |   19.642 |   312.80 |
-|   128 |    256 |   32 |  12288 |    3.409 |  1201.35 |   19.223 |   426.15 |   22.633 |   542.93 |
--- a/examples/batched-bench/batched-bench.cpp
+++ b/examples/batched-bench/batched-bench.cpp
@@ -1,250 +0,0 @@
-#include "common.h"
-#include "llama.h"
-
-#include <algorithm>
-#include <cmath>
-#include <cstdio>
-#include <string>
-#include <vector>
-
-// mutates the input string
-static std::vector<int> parse_list(char * p) {
-    std::vector<int> ret;
-
-    char * q = p;
-
-    while (*p) {
-        if (*p == ',') {
-            *p = '\0';
-            ret.push_back(std::atoi(q));
-            q = p + 1;
-        }
-
-        ++p;
-    }
-
-    ret.push_back(std::atoi(q));
-
-    return ret;
-}
-
-int main(int argc, char ** argv) {
-    gpt_params params;
-
-    if (argc == 1 || argv[1][0] == '-') {
-        printf("usage: %s MODEL_PATH [N_KV_MAX] [IS_PP_SHARED] [NGL] [MMQ] <PP> <TG> <PL>\n" , argv[0]);
-        printf("  <PP>, <TG> and PL are comma-separated lists of numbers without spaces\n\n");
-        printf("  example: %s ggml-model-f16.gguf 2048 0 999 0 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]);
-        return 1 ;
-    }
-
-    int n_kv_max     = 2048;
-    int is_pp_shared = 0;
-    int n_gpu_layers = 0;
-    int mmq          = 0;
-
-    std::vector<int> n_pp = { 128, 256, 512, 1024, 2048, 3584, 7680, };
-    std::vector<int> n_tg = { 128, 256, };
-    std::vector<int> n_pl = { 1, 2, 4, 8, 16, 32, };
-    //std::vector<int> n_pl = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 32, };
-
-    if (argc >= 2) {
-        params.model = argv[1];
-    }
-
-    if (argc >= 3) {
-        n_kv_max = std::atoi(argv[2]);
-    }
-
-    if (argc >= 4) {
-        is_pp_shared = std::atoi(argv[3]);
-    }
-
-    if (argc >= 5) {
-        n_gpu_layers = std::atoi(argv[4]);
-    }
-
-    if (argc >= 6) {
-        mmq = std::atoi(argv[5]);
-    }
-
-    if (argc >= 7) {
-        n_pp = parse_list(argv[6]);
-    }
-
-    if (argc >= 8) {
-        n_tg = parse_list(argv[7]);
-    }
-
-    if (argc >= 9) {
-        n_pl = parse_list(argv[8]);
-    }
-
-    // init LLM
-
-    llama_backend_init(params.numa);
-
-    // initialize the model
-
-    llama_model_params model_params = llama_model_default_params();
-
-    const std::vector<float> t_split (LLAMA_MAX_DEVICES, 0.0f);
-
-    model_params.n_gpu_layers = n_gpu_layers;
-    model_params.tensor_split = t_split.data();
-
-    llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
-
-    if (model == NULL) {
-        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
-        return 1;
-    }
-
-    llama_context_params ctx_params = llama_context_default_params();
-
-    ctx_params.seed      = 1234;
-    ctx_params.n_ctx     = n_kv_max;
-    ctx_params.n_batch   = 512;
-    ctx_params.mul_mat_q = mmq;
-
-    ctx_params.n_threads       = params.n_threads;
-    ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
-
-    llama_context * ctx = llama_new_context_with_model(model, ctx_params);
-
-    if (ctx == NULL) {
-        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
-        return 1;
-    }
-
-    llama_batch batch = llama_batch_init(n_kv_max, 0, 1);
-
-    // decode in batches of ctx_params.n_batch tokens
-    auto decode_helper = [](llama_context * ctx, llama_batch & batch, int32_t n_batch) {
-        for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
-            const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
-
-            llama_batch batch_view = {
-                n_tokens,
-                batch.token    + i,
-                nullptr,
-                batch.pos      + i,
-                batch.n_seq_id + i,
-                batch.seq_id   + i,
-                batch.logits   + i,
-                0, 0, 0, // unused
-            };
-
-            const int ret = llama_decode(ctx, batch_view);
-            if (ret != 0) {
-                LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
-                return false;
-            }
-        }
-
-        return true;
-    };
-
-    // warm up
-    {
-        for (int i = 0; i < 16; ++i) {
-            llama_batch_add(batch, 0, i, { 0 }, false);
-        }
-
-        if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
-            LOG_TEE("%s: llama_decode() failed\n", __func__);
-            return 1;
-        }
-    }
-
-    LOG_TEE("\n");
-    LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, mmq = %d, n_threads = %d, n_threads_batch = %d\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, mmq, ctx_params.n_threads, ctx_params.n_threads_batch);
-    LOG_TEE("\n");
-
-    LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP",     "TG",     "B",    "N_KV",     "T_PP s",   "S_PP t/s", "T_TG s",   "S_TG t/s", "T s",      "S t/s");
-    LOG_TEE("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");
-
-    for (        int i_pp = 0; i_pp < (int) n_pp.size(); ++i_pp) {
-        for (    int i_tg = 0; i_tg < (int) n_tg.size(); ++i_tg) {
-            for (int i_pl = 0; i_pl < (int) n_pl.size(); ++i_pl) {
-                const int pp = n_pp[i_pp];
-                const int tg = n_tg[i_tg];
-                const int pl = n_pl[i_pl];
-
-                const int n_ctx_req = is_pp_shared ? pp + pl*tg : pl*(pp + tg);
-
-                if (n_ctx_req > n_kv_max) {
-                    continue;
-                }
-
-                llama_batch_clear(batch);
-
-                const int n_tokens = is_pp_shared ? pp : pl*pp;
-
-                for (int i = 0; i < n_tokens; ++i) {
-                    llama_batch_add(batch, 0, i, { 0 }, false);
-                }
-                batch.logits[batch.n_tokens - 1] = true;
-
-                const auto t_pp_start = ggml_time_us();
-
-                llama_kv_cache_clear(ctx);
-
-                if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
-                    LOG_TEE("%s: llama_decode() failed\n", __func__);
-                    return 1;
-                }
-
-                if (is_pp_shared) {
-                    for (int32_t i = 1; i < pl; ++i) {
-                        llama_kv_cache_seq_cp(ctx, 0, i, 0, pp);
-                    }
-                }
-
-                const auto t_pp_end = ggml_time_us();
-
-                const auto t_tg_start = ggml_time_us();
-
-                for (int i = 0; i < tg; ++i) {
-                    llama_batch_clear(batch);
-
-                    for (int j = 0; j < pl; ++j) {
-                        llama_batch_add(batch, 0, pp + i, { j }, true);
-                    }
-
-                    if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
-                        LOG_TEE("%s: llama_decode() failed\n", __func__);
-                        return 1;
-                    }
-                }
-
-                const auto t_tg_end = ggml_time_us();
-
-                const int32_t n_kv = n_ctx_req;
-
-                const float t_pp = (t_pp_end - t_pp_start) / 1000000.0f;
-                const float t_tg = (t_tg_end - t_tg_start) / 1000000.0f;
-                const float t    = t_pp + t_tg;
-
-                const float speed_pp = is_pp_shared ? pp / t_pp : pl*pp / t_pp;
-                const float speed_tg = pl*tg / t_tg;
-                const float speed    = n_kv / t;
-
-                LOG_TEE("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed);
-            }
-        }
-    }
-
-    llama_print_timings(ctx);
-
-    llama_batch_free(batch);
-
-    llama_free(ctx);
-    llama_free_model(model);
-
-    llama_backend_free();
-
-    fprintf(stderr, "\n\n");
-
-    return 0;
-}
--- a/examples/batched.swift/.gitignore
+++ b/examples/batched.swift/.gitignore
@@ -1,9 +0,0 @@
-.DS_Store
-/.build
-/Packages
-xcuserdata/
-DerivedData/
-.swiftpm/configuration/registries.json
-.swiftpm/xcode/package.xcworkspace/contents.xcworkspacedata
-.netrc
-batched_swift
--- a/examples/batched.swift/Makefile
+++ b/examples/batched.swift/Makefile
@@ -1,6 +0,0 @@
-.PHONY: build
-
-build:
-	xcodebuild -scheme batched_swift -destination "generic/platform=macOS" -derivedDataPath build
-	rm -f ./batched_swift
-	ln -s ./build/Build/Products/Debug/batched_swift ./batched_swift
--- a/examples/batched.swift/Package.swift
+++ b/examples/batched.swift/Package.swift
@@ -1,22 +0,0 @@
-// swift-tools-version: 5.5
-// The swift-tools-version declares the minimum version of Swift required to build this package.
-
-import PackageDescription
-
-let package = Package(
-    name: "batched_swift",
-    platforms: [.macOS(.v12)],
-    dependencies: [
-        .package(name: "llama", path: "../../"),
-    ],
-    targets: [
-        // Targets are the basic building blocks of a package, defining a module or a test suite.
-        // Targets can depend on other targets in this package and products from dependencies.
-        .executableTarget(
-            name: "batched_swift",
-            dependencies: ["llama"],
-            path: "Sources",
-            linkerSettings: [.linkedFramework("Foundation"), .linkedFramework("AppKit")]
-        ),
-    ]
-)
--- a/examples/batched.swift/README.md
+++ b/examples/batched.swift/README.md
@@ -1,4 +0,0 @@
-This is a swift clone of `examples/batched`.
-
-$ `make`
-$ `./batched_swift MODEL_PATH [PROMPT] [PARALLEL]`
--- a/examples/batched.swift/Sources/main.swift
+++ b/examples/batched.swift/Sources/main.swift
@@ -1,260 +0,0 @@
-import Foundation
-import llama
-
-let arguments = CommandLine.arguments
-
-// Check that we have at least one argument (the model path)
-guard arguments.count > 1 else {
-    print("Usage: swift MODEL_PATH [PROMPT] [PARALLEL]")
-    exit(1)
-}
-
-let modelPath: String = arguments[1]
-let prompt: String = arguments.count > 2 ? arguments[2] : "Hello my name is"
-let n_parallel: Int = arguments.count > 3 && Int(arguments[3]) != nil ? Int(arguments[3])! : 1
-
-// total length of the sequences including the prompt
-let n_len: Int = 32
-
-// init LLM
-llama_backend_init(false)
-defer {
-    llama_backend_free()
-}
-
-let model_params = llama_model_default_params()
-guard let model = llama_load_model_from_file(modelPath.cString(using: .utf8), model_params) else {
-    print("Failed to load model")
-    exit(1)
-}
-
-defer {
-    llama_free_model(model)
-}
-
-var tokens = tokenize(text: prompt, add_bos: true)
-
-let n_kv_req = UInt32(tokens.count) + UInt32((n_len - Int(tokens.count)) * n_parallel)
-
-var context_params = llama_context_default_params()
-context_params.seed = 1234
-context_params.n_ctx = n_kv_req
-context_params.n_batch = UInt32(max(n_len, n_parallel))
-context_params.n_threads = 8
-context_params.n_threads_batch = 8
-
-let context = llama_new_context_with_model(model, context_params)
-guard context != nil else {
-    print("Failed to initialize context")
-    exit(1)
-}
-
-defer {
-    llama_free(context)
-}
-
-let n_ctx = llama_n_ctx(context)
-
-print("\nn_len = \(n_len), n_ctx = \(n_ctx), n_batch = \(context_params.n_batch), n_parallel = \(n_parallel), n_kv_req = \(n_kv_req)\n")
-
-if n_kv_req > n_ctx {
-    print("error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", n_kv_req)
-    exit(1)
-}
-
-var buffer: [CChar] = []
-for id: llama_token in tokens {
-    print(token_to_piece(token: id, buffer: &buffer) ?? "", terminator: "")
-}
-
-print("\n")
-
-var batch = llama_batch_init(max(Int32(tokens.count), Int32(n_parallel)), 0, 1)
-defer {
-    llama_batch_free(batch)
-}
-
-// evaluate the initial prompt
-batch.n_tokens = Int32(tokens.count)
-
-for (i, token) in tokens.enumerated() {
-    batch.token[i] = token
-    batch.pos[i] = Int32(i)
-    batch.n_seq_id[i] = 1
-    // batch.seq_id[i][0] = 0
-    // TODO: is this the proper way to do this?
-    if let seq_id = batch.seq_id[i] {
-        seq_id[0] = 0
-    }
-    batch.logits[i] = 0
-}
-
-// llama_decode will output logits only for the last token of the prompt
-batch.logits[Int(batch.n_tokens) - 1] = 1
-
-if llama_decode(context, batch) != 0 {
-    print("llama_decode() failed")
-    exit(1)
-}
-
-for i in 1 ..< n_parallel {
-    llama_kv_cache_seq_cp(context, 0, Int32(i), 0, batch.n_tokens)
-}
-
-if n_parallel > 1 {
-    print("generating \(n_parallel) sequences ...\n")
-}
-
-var streams: [String] = .init(repeating: "", count: n_parallel)
-var streamBuffers: [[CChar]] = .init(repeating: [], count: n_parallel)
-var i_batch = [Int32](repeating: batch.n_tokens - 1, count: n_parallel)
-
-var n_cur = batch.n_tokens
-var n_decode = 0
-
-let t_main_start = ggml_time_us()
-
-while n_cur <= n_len {
-    // prepare the next batch
-    batch.n_tokens = 0
-
-    // sample the next token for each parallel sequence / stream
-    for i in 0 ..< n_parallel {
-        if i_batch[i] < 0 {
-            // the stream has already finished
-            continue
-        }
-
-        var n_vocab = llama_n_vocab(model)
-        var logits = llama_get_logits_ith(context, i_batch[i])
-
-        var candidates: [llama_token_data] = .init(repeating: llama_token_data(), count: Int(n_vocab))
-
-        for token_id in 0 ..< n_vocab {
-            candidates.append(llama_token_data(id: token_id, logit: logits![Int(token_id)], p: 0.0))
-        }
-
-        var candidates_p: llama_token_data_array = .init(
-            data: &candidates,
-            size: candidates.count,
-            sorted: false
-        )
-
-        let top_k: Int32 = 40
-        let top_p: Float = 0.9
-        let temp: Float = 0.4
-
-        llama_sample_top_k(context, &candidates_p, top_k, 1)
-        llama_sample_top_p(context, &candidates_p, top_p, 1)
-        llama_sample_temp(context, &candidates_p, temp)
-
-        let new_token_id = llama_sample_token(context, &candidates_p)
-
-        // const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
-
-        // is it an end of stream? -> mark the stream as finished
-        if new_token_id == llama_token_eos(model) || n_cur == n_len {
-            i_batch[i] = -1
-            // print("")
-            if n_parallel > 1 {
-                print("stream \(i) finished at n_cur = \(n_cur)")
-            }
-
-            continue
-        }
-
-        let nextStringPiece = token_to_piece(token: new_token_id, buffer: &streamBuffers[i]) ?? ""
-
-        // if there is only one stream, we print immediately to stdout
-        if n_parallel == 1 {
-            print(nextStringPiece, terminator: "")
-        }
-        streams[i] += nextStringPiece
-
-        // push this new token for next evaluation
-        batch.token[Int(batch.n_tokens)] = new_token_id
-        batch.pos[Int(batch.n_tokens)] = n_cur
-        batch.n_seq_id[Int(batch.n_tokens)] = 1
-        if let seq_id = batch.seq_id[Int(batch.n_tokens)] {
-            seq_id[0] = Int32(i)
-        }
-        batch.logits[Int(batch.n_tokens)] = 1
-
-        i_batch[i] = batch.n_tokens
-
-        batch.n_tokens += 1
-
-        n_decode += 1
-    }
-
-    // all streams are finished
-    if batch.n_tokens == 0 {
-        break
-    }
-
-    n_cur += 1
-
-    // evaluate the current batch with the transformer model
-    if llama_decode(context, batch) != 0 {
-        print("llama_decode() failed")
-        exit(1)
-    }
-}
-
-if n_parallel > 1 {
-    print("\n")
-    for (i, stream) in streams.enumerated() {
-        print("sequence \(i):\n\n\(prompt)\(stream)\n")
-    }
-}
-
-let t_main_end = ggml_time_us()
-
-print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end - t_main_start) / 1_000_000.0)) s, speed: \(String(format: "%.2f", Double(n_decode) / (Double(t_main_end - t_main_start) / 1_000_000.0))) t/s\n")
-
-llama_print_timings(context)
-
-private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
-    let utf8Count = text.utf8.count
-    let n_tokens = utf8Count + (add_bos ? 1 : 0)
-    let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
-    let tokenCount = llama_tokenize(model, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, /*special tokens*/ false)
-    var swiftTokens: [llama_token] = []
-    for i in 0 ..< tokenCount {
-        swiftTokens.append(tokens[Int(i)])
-    }
-    tokens.deallocate()
-    return swiftTokens
-}
-
-private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String? {
-    var result = [CChar](repeating: 0, count: 8)
-    let nTokens = llama_token_to_piece(model, token, &result, Int32(result.count))
-    if nTokens < 0 {
-        let actualTokensCount = -Int(nTokens)
-        result = .init(repeating: 0, count: actualTokensCount)
-        let check = llama_token_to_piece(
-            model,
-            token,
-            &result,
-            Int32(result.count)
-        )
-        assert(check == actualTokensCount)
-    } else {
-        result.removeLast(result.count - Int(nTokens))
-    }
-    if buffer.isEmpty, let utfString = String(cString: result + [0], encoding: .utf8) {
-        return utfString
-    } else {
-        buffer.append(contentsOf: result)
-        let data = Data(buffer.map { UInt8(bitPattern: $0) })
-        if buffer.count >= 4 { // 4 bytes is the max length of a utf8 character so if we're here we need to reset the buffer
-            buffer = []
-        }
-        guard let bufferString = String(data: data, encoding: .utf8) else {
-            return nil
-        }
-        buffer = []
-        return bufferString
-    }
-}
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Thiago Padilha	3a0dcb3920	Implement server mode. This new mode works by first loading the model then listening for TCP connections on a port. When a connection is received, arguments will be parsed using a simple protocol: - First the number of arguments will be read followed by a newline character. - Then each argument will be read, separated by the 0 byte. - With this we build an argument vector, similar to what is passed to the program entry point. We pass this to gpt_params_parse. Finally `run` will be executed with the input/output streams connected to the socket. Signed-off-by: Thiago Padilha <thiago@padilha.cc>	2023-03-22 14:34:19 -03:00
Thiago Padilha	bf44faa0ee	Remove direct access to std streams from "run" The goal is to allow running "run" while connected to other streams, such as TCP sockets. Signed-off-by: Thiago Padilha <thiago@padilha.cc>	2023-03-22 14:34:18 -03:00
Thiago Padilha	b7f1fa6d8c	Move llama_context setup + perplexity back to main.cpp Signed-off-by: Thiago Padilha <thiago@padilha.cc>	2023-03-22 14:31:41 -03:00
Thiago Padilha	d7d53b84db	Add main.cpp back and invoke "run" from it Signed-off-by: Thiago Padilha <thiago@padilha.cc>	2023-03-22 14:31:41 -03:00
Thiago Padilha	90175ee13f	Move main.cpp to run.cpp Signed-off-by: Thiago Padilha <thiago@padilha.cc>	2023-03-22 14:31:35 -03:00