build : enable libstdc++ assertions for debug builds

2026-02-12 14:03:20 +02:00 · 2023-11-30 18:18:24 -05:00
193 changed files with 13741 additions and 35568 deletions
--- a/.devops/full-cuda.Dockerfile
+++ b/.devops/full-cuda.Dockerfile
@@ -14,8 +14,7 @@ ARG CUDA_DOCKER_ARCH=all
 RUN apt-get update && \
    apt-get install -y build-essential python3 python3-pip git

-COPY requirements.txt   requirements.txt
-COPY requirements       requirements
+COPY requirements.txt requirements.txt

 RUN pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt
--- a/.devops/full-rocm.Dockerfile
+++ b/.devops/full-rocm.Dockerfile
@@ -23,8 +23,7 @@ ARG ROCM_DOCKER_ARCH=\
    gfx1101 \
    gfx1102

-COPY requirements.txt   requirements.txt
-COPY requirements       requirements
+COPY requirements.txt requirements.txt

 RUN pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt
--- a/.devops/full.Dockerfile
+++ b/.devops/full.Dockerfile
@@ -5,8 +5,7 @@ FROM ubuntu:$UBUNTU_VERSION as build
 RUN apt-get update && \
    apt-get install -y build-essential python3 python3-pip git

-COPY requirements.txt   requirements.txt
-COPY requirements       requirements
+COPY requirements.txt requirements.txt

 RUN pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt
--- a/.devops/main-rocm.Dockerfile
+++ b/.devops/main-rocm.Dockerfile
@@ -23,8 +23,7 @@ ARG ROCM_DOCKER_ARCH=\
    gfx1101 \
    gfx1102

-COPY requirements.txt   requirements.txt
-COPY requirements       requirements
+COPY requirements.txt requirements.txt

 RUN pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt
--- a/.devops/nix/apps.nix
+++ b/.devops/nix/apps.nix
@@ -1,22 +0,0 @@
-{
-  perSystem =
-    { config, lib, ... }:
-    {
-      apps =
-        let
-          inherit (config.packages) default;
-          binaries = [
-            "llama"
-            "llama-embedding"
-            "llama-server"
-            "quantize"
-            "train-text-from-scratch"
-          ];
-          mkApp = name: {
-            type = "app";
-            program = "${default}/bin/${name}";
-          };
-        in
-        lib.genAttrs binaries mkApp;
-    };
-}
--- a/.devops/nix/devshells.nix
+++ b/.devops/nix/devshells.nix
@@ -1,13 +0,0 @@
-{
-  perSystem =
-    { config, lib, ... }:
-    {
-      devShells =
-        lib.concatMapAttrs
-          (name: package: {
-            ${name} = package.passthru.shell;
-            ${name + "-extra"} = package.passthru.shell-extra;
-          })
-          config.packages;
-    };
-}
--- a/.devops/nix/jetson-support.nix
+++ b/.devops/nix/jetson-support.nix
@@ -1,39 +0,0 @@
-{ inputs, ... }:
-{
-  perSystem =
-    {
-      config,
-      system,
-      lib,
-      pkgsCuda,
-      ...
-    }:
-    {
-      legacyPackages =
-        let
-          caps.llamaPackagesXavier = "7.2";
-          caps.llamaPackagesOrin = "8.7";
-          caps.llamaPackagesTX2 = "6.2";
-          caps.llamaPackagesNano = "5.3";
-
-          pkgsFor =
-            cap:
-            import inputs.nixpkgs {
-              inherit system;
-              config = {
-                cudaSupport = true;
-                cudaCapabilities = [ cap ];
-                cudaEnableForwardCompat = false;
-                inherit (pkgsCuda.config) allowUnfreePredicate;
-              };
-            };
-        in
-        builtins.mapAttrs (name: cap: (pkgsFor cap).callPackage ./scope.nix { }) caps;
-
-      packages = lib.optionalAttrs (system == "aarch64-linux") {
-        jetson-xavier = config.legacyPackages.llamaPackagesXavier.llama-cpp;
-        jetson-orin = config.legacyPackages.llamaPackagesOrin.llama-cpp;
-        jetson-nano = config.legacyPackages.llamaPackagesNano.llama-cpp;
-      };
-    };
-}
--- a/.devops/nix/nixpkgs-instances.nix
+++ b/.devops/nix/nixpkgs-instances.nix
@@ -1,35 +0,0 @@
-{ inputs, ... }:
-{
-  # The _module.args definitions are passed on to modules as arguments. E.g.
-  # the module `{ pkgs ... }: { /* config */ }` implicitly uses
-  # `_module.args.pkgs` (defined in this case by flake-parts).
-  perSystem =
-    { system, ... }:
-    {
-      _module.args = {
-        pkgsCuda = import inputs.nixpkgs {
-          inherit system;
-          # Ensure dependencies use CUDA consistently (e.g. that openmpi, ucc,
-          # and ucx are built with CUDA support)
-          config.cudaSupport = true;
-          config.allowUnfreePredicate =
-            p:
-            builtins.all
-              (
-                license:
-                license.free
-                || builtins.elem license.shortName [
-                  "CUDA EULA"
-                  "cuDNN EULA"
-                ]
-              )
-              (p.meta.licenses or [ p.meta.license ]);
-        };
-        # Ensure dependencies use ROCm consistently
-        pkgsRocm = import inputs.nixpkgs {
-          inherit system;
-          config.rocmSupport = true;
-        };
-      };
-    };
-}
--- a/.devops/nix/package.nix
+++ b/.devops/nix/package.nix
@@ -1,265 +0,0 @@
-{
-  lib,
-  config,
-  stdenv,
-  mkShell,
-  cmake,
-  ninja,
-  pkg-config,
-  git,
-  python3,
-  mpi,
-  openblas, # TODO: Use the generic `blas` so users could switch between alternative implementations
-  cudaPackages,
-  darwin,
-  rocmPackages,
-  clblast,
-  useBlas ? builtins.all (x: !x) [
-    useCuda
-    useMetalKit
-    useOpenCL
-    useRocm
-  ],
-  useCuda ? config.cudaSupport,
-  useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin && !useOpenCL,
-  useMpi ? false, # Increases the runtime closure size by ~700M
-  useOpenCL ? false,
-  useRocm ? config.rocmSupport,
-  llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake
-}@inputs:
-
-let
-  inherit (lib)
-    cmakeBool
-    cmakeFeature
-    optionals
-    strings
-    versionOlder
-    ;
-
-  # It's necessary to consistently use backendStdenv when building with CUDA support,
-  # otherwise we get libstdc++ errors downstream.
-  stdenv = throw "Use effectiveStdenv instead";
-  effectiveStdenv = if useCuda then cudaPackages.backendStdenv else inputs.stdenv;
-
-  suffices =
-    lib.optionals useBlas [ "BLAS" ]
-    ++ lib.optionals useCuda [ "CUDA" ]
-    ++ lib.optionals useMetalKit [ "MetalKit" ]
-    ++ lib.optionals useMpi [ "MPI" ]
-    ++ lib.optionals useOpenCL [ "OpenCL" ]
-    ++ lib.optionals useRocm [ "ROCm" ];
-
-  pnameSuffix =
-    strings.optionalString (suffices != [ ])
-      "-${strings.concatMapStringsSep "-" strings.toLower suffices}";
-  descriptionSuffix =
-    strings.optionalString (suffices != [ ])
-      ", accelerated with ${strings.concatStringsSep ", " suffices}";
-
-  # TODO: package the Python in this repository in a Nix-like way.
-  # It'd be nice to migrate to buildPythonPackage, as well as ensure this repo
-  # is PEP 517-compatible, and ensure the correct .dist-info is generated.
-  # https://peps.python.org/pep-0517/
-  llama-python = python3.withPackages (
-    ps: [
-      ps.numpy
-      ps.sentencepiece
-    ]
-  );
-
-  # TODO(Green-Sky): find a better way to opt-into the heavy ml python runtime
-  llama-python-extra = python3.withPackages (
-    ps: [
-      ps.numpy
-      ps.sentencepiece
-      ps.torchWithoutCuda
-      ps.transformers
-    ]
-  );
-
-  # apple_sdk is supposed to choose sane defaults, no need to handle isAarch64
-  # separately
-  darwinBuildInputs =
-    with darwin.apple_sdk.frameworks;
-    [
-      Accelerate
-      CoreVideo
-      CoreGraphics
-    ]
-    ++ optionals useMetalKit [ MetalKit ];
-
-  cudaBuildInputs = with cudaPackages; [
-    cuda_cccl.dev # <nv/target>
-
-    # A temporary hack for reducing the closure size, remove once cudaPackages
-    # have stopped using lndir: https://github.com/NixOS/nixpkgs/issues/271792
-    cuda_cudart.dev
-    cuda_cudart.lib
-    cuda_cudart.static
-    libcublas.dev
-    libcublas.lib
-    libcublas.static
-  ];
-
-  rocmBuildInputs = with rocmPackages; [
-    clr
-    hipblas
-    rocblas
-  ];
-in
-
-effectiveStdenv.mkDerivation (
-  finalAttrs: {
-    pname = "llama-cpp${pnameSuffix}";
-    version = llamaVersion;
-
-    src = lib.cleanSourceWith {
-      filter =
-        name: type:
-        !(builtins.any (_: _) [
-          (lib.hasSuffix ".nix" name) # Ignore *.nix files when computing outPaths
-          (name == "README.md") # Ignore *.md changes whe computing outPaths
-          (lib.hasPrefix "." name) # Skip hidden files and directories
-        ]);
-      src = lib.cleanSource ../../.;
-    };
-
-    postPatch = ''
-      substituteInPlace ./ggml-metal.m \
-        --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
-
-      # TODO: Package up each Python script or service appropriately.
-      # If we were to migrate to buildPythonPackage and prepare the `pyproject.toml`,
-      # we could make those *.py into setuptools' entrypoints
-      substituteInPlace ./*.py --replace "/usr/bin/env python" "${llama-python}/bin/python"
-    '';
-
-    nativeBuildInputs =
-      [
-        cmake
-        ninja
-        pkg-config
-        git
-      ]
-      ++ optionals useCuda [
-        cudaPackages.cuda_nvcc
-
-        # TODO: Replace with autoAddDriverRunpath
-        # once https://github.com/NixOS/nixpkgs/pull/275241 has been merged
-        cudaPackages.autoAddOpenGLRunpathHook
-      ];
-
-    buildInputs =
-      optionals effectiveStdenv.isDarwin darwinBuildInputs
-      ++ optionals useCuda cudaBuildInputs
-      ++ optionals useMpi [ mpi ]
-      ++ optionals useOpenCL [ clblast ]
-      ++ optionals useRocm rocmBuildInputs;
-
-    cmakeFlags =
-      [
-        (cmakeBool "LLAMA_NATIVE" true)
-        (cmakeBool "LLAMA_BUILD_SERVER" true)
-        (cmakeBool "BUILD_SHARED_LIBS" true)
-        (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
-        (cmakeBool "LLAMA_BLAS" useBlas)
-        (cmakeBool "LLAMA_CLBLAST" useOpenCL)
-        (cmakeBool "LLAMA_CUBLAS" useCuda)
-        (cmakeBool "LLAMA_HIPBLAS" useRocm)
-        (cmakeBool "LLAMA_METAL" useMetalKit)
-        (cmakeBool "LLAMA_MPI" useMpi)
-      ]
-      ++ optionals useCuda [
-        (
-          with cudaPackages.flags;
-          cmakeFeature "CMAKE_CUDA_ARCHITECTURES" (
-            builtins.concatStringsSep ";" (map dropDot cudaCapabilities)
-          )
-        )
-      ]
-      ++ optionals useRocm [
-        (cmakeFeature "CMAKE_C_COMPILER" "hipcc")
-        (cmakeFeature "CMAKE_CXX_COMPILER" "hipcc")
-
-        # Build all targets supported by rocBLAS. When updating search for TARGET_LIST_ROCM
-        # in https://github.com/ROCmSoftwarePlatform/rocBLAS/blob/develop/CMakeLists.txt
-        # and select the line that matches the current nixpkgs version of rocBLAS.
-        # Should likely use `rocmPackages.clr.gpuTargets`.
-        "-DAMDGPU_TARGETS=gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102"
-      ]
-      ++ optionals useMetalKit [ (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1") ]
-      ++ optionals useBlas [ (lib.cmakeFeature "LLAMA_BLAS_VENDOR" "OpenBLAS") ];
-
-    # TODO(SomeoneSerge): It's better to add proper install targets at the CMake level,
-    # if they haven't been added yet.
-    postInstall = ''
-      mv $out/bin/main $out/bin/llama
-      mv $out/bin/server $out/bin/llama-server
-      mkdir -p $out/include
-      cp $src/llama.h $out/include/
-    '';
-
-    # Define the shells here, but don't add in the inputsFrom to avoid recursion.
-    passthru = {
-      inherit
-        useBlas
-        useCuda
-        useMetalKit
-        useMpi
-        useOpenCL
-        useRocm
-        ;
-
-      shell = mkShell {
-        name = "shell-${finalAttrs.finalPackage.name}";
-        description = "contains numpy and sentencepiece";
-        buildInputs = [ llama-python ];
-        inputsFrom = [ finalAttrs.finalPackage ];
-      };
-
-      shell-extra = mkShell {
-        name = "shell-extra-${finalAttrs.finalPackage.name}";
-        description = "contains numpy, sentencepiece, torchWithoutCuda, and transformers";
-        buildInputs = [ llama-python-extra ];
-        inputsFrom = [ finalAttrs.finalPackage ];
-      };
-    };
-
-    meta = {
-      # Configurations we don't want even the CI to evaluate. Results in the
-      # "unsupported platform" messages. This is mostly a no-op, because
-      # cudaPackages would've refused to evaluate anyway.
-      badPlatforms = optionals (useCuda || useOpenCL) lib.platforms.darwin;
-
-      # Configurations that are known to result in build failures. Can be
-      # overridden by importing Nixpkgs with `allowBroken = true`.
-      broken = (useMetalKit && !effectiveStdenv.isDarwin);
-
-      description = "Inference of LLaMA model in pure C/C++${descriptionSuffix}";
-      homepage = "https://github.com/ggerganov/llama.cpp/";
-      license = lib.licenses.mit;
-
-      # Accommodates `nix run` and `lib.getExe`
-      mainProgram = "llama";
-
-      # These people might respond, on the best effort basis, if you ping them
-      # in case of Nix-specific regressions or for reviewing Nix-specific PRs.
-      # Consider adding yourself to this list if you want to ensure this flake
-      # stays maintained and you're willing to invest your time. Do not add
-      # other people without their consent. Consider removing people after
-      # they've been unreachable for long periods of time.
-
-      # Note that lib.maintainers is defined in Nixpkgs, but you may just add
-      # an attrset following the same format as in
-      # https://github.com/NixOS/nixpkgs/blob/f36a80e54da29775c78d7eff0e628c2b4e34d1d7/maintainers/maintainer-list.nix
-      maintainers = with lib.maintainers; [
-        philiptaron
-        SomeoneSerge
-      ];
-
-      # Extend `badPlatforms` instead
-      platforms = lib.platforms.all;
-    };
-  }
-)
--- a/.devops/nix/scope.nix
+++ b/.devops/nix/scope.nix
@@ -1,12 +0,0 @@
-{
-  lib,
-  newScope,
-  llamaVersion ? "0.0.0",
-}:
-
-lib.makeScope newScope (
-  self: {
-    inherit llamaVersion;
-    llama-cpp = self.callPackage ./package.nix { };
-  }
-)
--- a/.editorconfig
+++ b/.editorconfig
@@ -15,14 +15,8 @@ indent_size = 4
 [Makefile]
 indent_style = tab

-[scripts/*.mk]
-indent_style = tab
-
 [prompts/*.txt]
 insert_final_newline = unset

 [examples/server/public/*]
 indent_size = 2
-
-[examples/llama.swiftui/llama.swiftui.xcodeproj/*]
-indent_style = tab
--- a/.github/ISSUE_TEMPLATE/bug.md
+++ b/.github/ISSUE_TEMPLATE/bug.md
@@ -6,4 +6,179 @@ assignees: ''

 ---

-Please include information about your system, the steps to reproduce the bug, and the version of llama.cpp that you are using. If possible, please provide a minimal code example that reproduces the bug.
+# Prerequisites
+
+Please answer the following questions for yourself before submitting an issue.
+
+- [ ] I am running the latest code. Development is very rapid so there are no tagged versions as of now.
+- [ ] I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md).
+- [ ] I [searched using keywords relevant to my issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/filtering-and-searching-issues-and-pull-requests) to make sure that I am creating a new issue that is not already open (or closed).
+- [ ] I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new bug or useful enhancement to share.
+
+# Expected Behavior
+
+Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do.
+
+# Current Behavior
+
+Please provide a detailed written description of what `llama.cpp` did, instead.
+
+# Environment and Context
+
+Please provide detailed information about your computer setup. This is important in case the issue is not reproducible except for under certain specific conditions.
+
+* Physical (or virtual) hardware you are using, e.g. for Linux:
+
+`$ lscpu`
+
+* Operating System, e.g. for Linux:
+
+`$ uname -a`
+
+* SDK version, e.g. for Linux:
+
+```
+$ python3 --version
+$ make --version
+$ g++ --version
+```
+
+# Failure Information (for bugs)
+
+Please help provide information about the failure / bug.
+
+# Steps to Reproduce
+
+Please provide detailed steps for reproducing the issue. We are not sitting in front of your screen, so the more detail the better.
+
+1. step 1
+2. step 2
+3. step 3
+4. etc.
+
+# Failure Logs
+
+Please include any relevant log snippets or files. If it works under one configuration but not under another, please provide logs for both configurations and their corresponding outputs so it is easy to see where behavior changes.
+
+Also, please try to **avoid using screenshots** if at all possible. Instead, copy/paste the console output and use [Github's markdown](https://docs.github.com/en/get-started/writing-on-github/getting-started-with-writing-and-formatting-on-github/basic-writing-and-formatting-syntax) to cleanly format your logs for easy readability.
+
+Example environment info:
+```
+llama.cpp$ git log | head -1
+commit 2af23d30434a677c6416812eea52ccc0af65119c
+
+llama.cpp$ lscpu | egrep "AMD|Flags"
+Vendor ID:                       AuthenticAMD
+Model name:                      AMD Ryzen Threadripper 1950X 16-Core Processor
+Flags:                           fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid amd_dcm aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb hw_pstate ssbd ibpb vmmcall fsgsbase bmi1 avx2 smep bmi2 rdseed adx smap clflushopt sha_ni xsaveopt xsavec xgetbv1 xsaves clzero irperf xsaveerptr arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif overflow_recov succor smca sme sev
+Virtualization:                  AMD-V
+
+llama.cpp$ python3 --version
+Python 3.10.9
+
+llama.cpp$ pip list | egrep "torch|numpy|sentencepiece"
+numpy                         1.24.2
+numpydoc                      1.5.0
+sentencepiece                 0.1.97
+torch                         1.13.1
+torchvision                   0.14.1
+
+llama.cpp$ make --version | head -1
+GNU Make 4.3
+
+$ md5sum ./models/65B/ggml-model-q4_0.bin
+dbdd682cce80e2d6e93cefc7449df487  ./models/65B/ggml-model-q4_0.bin
+```
+
+Example run with the Linux command [perf](https://www.brendangregg.com/perf.html)
+```
+llama.cpp$ perf stat ./main -m ./models/65B/ggml-model-q4_0.bin -t 16 -n 1024 -p "Please close your issue when it has been answered."
+main: seed = 1679149377
+llama_model_load: loading model from './models/65B/ggml-model-q4_0.bin' - please wait ...
+llama_model_load: n_vocab = 32000
+llama_model_load: n_ctx   = 512
+llama_model_load: n_embd  = 8192
+llama_model_load: n_mult  = 256
+llama_model_load: n_head  = 64
+llama_model_load: n_layer = 80
+llama_model_load: n_rot   = 128
+llama_model_load: f16     = 2
+llama_model_load: n_ff    = 22016
+llama_model_load: n_parts = 8
+llama_model_load: ggml ctx size = 41477.73 MB
+llama_model_load: memory_size =  2560.00 MB, n_mem = 40960
+llama_model_load: loading model part 1/8 from './models/65B/ggml-model-q4_0.bin'
+llama_model_load: .......................................................................................... done
+llama_model_load: model size =  4869.09 MB / num tensors = 723
+llama_model_load: loading model part 2/8 from './models/65B/ggml-model-q4_0.bin.1'
+llama_model_load: .......................................................................................... done
+llama_model_load: model size =  4869.09 MB / num tensors = 723
+llama_model_load: loading model part 3/8 from './models/65B/ggml-model-q4_0.bin.2'
+llama_model_load: .......................................................................................... done
+llama_model_load: model size =  4869.09 MB / num tensors = 723
+llama_model_load: loading model part 4/8 from './models/65B/ggml-model-q4_0.bin.3'
+llama_model_load: .......................................................................................... done
+llama_model_load: model size =  4869.09 MB / num tensors = 723
+llama_model_load: loading model part 5/8 from './models/65B/ggml-model-q4_0.bin.4'
+llama_model_load: .......................................................................................... done
+llama_model_load: model size =  4869.09 MB / num tensors = 723
+llama_model_load: loading model part 6/8 from './models/65B/ggml-model-q4_0.bin.5'
+llama_model_load: .......................................................................................... done
+llama_model_load: model size =  4869.09 MB / num tensors = 723
+llama_model_load: loading model part 7/8 from './models/65B/ggml-model-q4_0.bin.6'
+llama_model_load: .......................................................................................... done
+llama_model_load: model size =  4869.09 MB / num tensors = 723
+llama_model_load: loading model part 8/8 from './models/65B/ggml-model-q4_0.bin.7'
+llama_model_load: .......................................................................................... done
+llama_model_load: model size =  4869.09 MB / num tensors = 723
+
+system_info: n_threads = 16 / 32 | AVX = 1 | AVX2 = 1 | AVX512 = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | VSX = 0 |
+
+main: prompt: 'Please close your issue when it has been answered.'
+main: number of tokens in prompt = 11
+     1 -> ''
+ 12148 -> 'Please'
+  3802 -> ' close'
+   596 -> ' your'
+  2228 -> ' issue'
+   746 -> ' when'
+   372 -> ' it'
+   756 -> ' has'
+  1063 -> ' been'
+  7699 -> ' answered'
+ 29889 -> '.'
+
+sampling parameters: temp = 0.800000, top_k = 40, top_p = 0.950000, repeat_last_n = 64, repeat_penalty = 1.300000
+
+
+Please close your issue when it has been answered.
+@duncan-donut: I'm trying to figure out what kind of "support" you need for this script and why, exactly? Is there a question about how the code works that hasn't already been addressed in one or more comments below this ticket, or are we talking something else entirely like some sorta bugfixing job because your server setup is different from mine??
+I can understand if your site needs to be running smoothly and you need help with a fix of sorts but there should really be nothing wrong here that the code itself could not handle. And given that I'm getting reports about how it works perfectly well on some other servers, what exactly are we talking? A detailed report will do wonders in helping us get this resolved for ya quickly so please take your time and describe the issue(s) you see as clearly & concisely as possible!!
+@duncan-donut: I'm not sure if you have access to cPanel but you could try these instructions. It is worth a shot! Let me know how it goes (or what error message, exactly!) when/if ya give that code a go? [end of text]
+
+
+main: mem per token = 71159620 bytes
+main:     load time = 19309.95 ms
+main:   sample time =   168.62 ms
+main:  predict time = 223895.61 ms / 888.47 ms per token
+main:    total time = 246406.42 ms
+
+ Performance counter stats for './main -m ./models/65B/ggml-model-q4_0.bin -t 16 -n 1024 -p Please close your issue when it has been answered.':
+
+        3636882.89 msec task-clock                #   14.677 CPUs utilized
+             13509      context-switches          #    3.714 /sec
+              2436      cpu-migrations            #    0.670 /sec
+          10476679      page-faults               #    2.881 K/sec
+    13133115082869      cycles                    #    3.611 GHz                      (16.77%)
+       29314462753      stalled-cycles-frontend   #    0.22% frontend cycles idle     (16.76%)
+    10294402631459      stalled-cycles-backend    #   78.39% backend cycles idle      (16.74%)
+    23479217109614      instructions              #    1.79  insn per cycle
+                                                  #    0.44  stalled cycles per insn  (16.76%)
+     2353072268027      branches                  #  647.002 M/sec                    (16.77%)
+        1998682780      branch-misses             #    0.08% of all branches          (16.76%)
+
+     247.802177522 seconds time elapsed
+
+    3618.573072000 seconds user
+      18.491698000 seconds sys
+```
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -143,9 +143,6 @@ jobs:
          cd build
          ctest --verbose

-  # TODO: build with LLAMA_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know
-  #       how to debug it.
-  #       ref: https://github.com/ggerganov/llama.cpp/actions/runs/7131777249/job/19420981052#step:5:1124
  macOS-latest-make:
    runs-on: macos-latest

@@ -163,18 +160,14 @@ jobs:
      - name: Build
        id: make_build
        run: |
-          LLAMA_NO_METAL=1 make -j $(sysctl -n hw.logicalcpu)
+          make -j $(sysctl -n hw.logicalcpu)

      - name: Test
        id: make_test
        run: |
-          LLAMA_NO_METAL=1 make tests -j $(sysctl -n hw.logicalcpu)
-          LLAMA_NO_METAL=1 make test  -j $(sysctl -n hw.logicalcpu)
+          make tests -j $(sysctl -n hw.logicalcpu)
+          make test -j $(sysctl -n hw.logicalcpu)

-  # TODO: build with LLAMA_METAL=OFF because test-backend-ops fail on "Apple Paravirtual device" and I don't know
-  #       how to debug it.
-  #       ref: https://github.com/ggerganov/llama.cpp/actions/runs/7132125951/job/19422043567?pr=4359#step:5:6584
-  #       would be great if we fix these
  macOS-latest-cmake:
    runs-on: macos-latest

@@ -195,7 +188,7 @@ jobs:
          sysctl -a
          mkdir build
          cd build
-          cmake -DLLAMA_METAL=OFF ..
+          cmake ..
          cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)

      - name: Test
@@ -515,30 +508,6 @@ jobs:
      - name: Build Xcode project
        run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build

-  android-build:
-    runs-on: ubuntu-latest
-
-    steps:
-      - name: Clone
-        uses: actions/checkout@v3
-
-      - name: Set up JDK
-        uses: actions/setup-java@v3
-        with:
-          java-version: 17
-          distribution: zulu
-
-      - name: Setup Android SDK
-        uses: android-actions/setup-android@v3
-        with:
-          log-accepted-android-sdk-licenses: false
-
-      - name: Build
-        run: |
-          cd examples/llama.android
-
-          # Skip armeabi-v7a for now (https://github.com/llvm/llvm-project/issues/65820).
-          ./gradlew build --no-daemon -Pskip-armeabi-v7a

 #  freeBSD-latest:
 #    runs-on: macos-12
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -52,36 +52,6 @@ jobs:
          username: ${{ github.repository_owner }}
          password: ${{ secrets.GITHUB_TOKEN }}

-      # https://github.com/jlumbroso/free-disk-space/tree/54081f138730dfa15788a46383842cd2f914a1be#example
-      - name: Free Disk Space (Ubuntu)
-        uses: jlumbroso/free-disk-space@main
-        with:
-          # this might remove tools that are actually needed,
-          # if set to "true" but frees about 6 GB
-          tool-cache: false
-
-          # all of these default to true, but feel free to set to
-          # "false" if necessary for your workflow
-          android: true
-          dotnet: true
-          haskell: true
-          large-packages: true
-          docker-images: true
-          swap-storage: true
-
-      - name: Determine tag name
-        id: tag
-        shell: bash
-        run: |
-          BUILD_NUMBER="$(git rev-list --count HEAD)"
-          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
-          if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
-            echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
-          else
-            SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
-            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
-          fi
-
      - name: Build and push Docker image (versioned)
        if: github.event_name == 'push'
        uses: docker/build-push-action@v4
@@ -89,7 +59,7 @@ jobs:
          context: .
          push: true
          platforms: ${{ matrix.config.platforms }}
-          tags: "ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
+          tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
          file: ${{ matrix.config.dockerfile }}

      - name: Build and push Docker image (tagged)
@@ -98,5 +68,5 @@ jobs:
          context: .
          push: ${{ github.event_name == 'push' }}
          platforms: ${{ matrix.config.platforms }}
-          tags: "ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }},ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}"
+          tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}"
          file: ${{ matrix.config.dockerfile }}
--- a/.github/workflows/nix-ci-aarch64.yml
+++ b/.github/workflows/nix-ci-aarch64.yml
@@ -1,55 +0,0 @@
-name: Nix aarch64 builds
-
-on:
-  workflow_dispatch: # allows manual triggering
-  push:
-    branches:
-      - master
-    paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix']
-  pull_request:
-    types: [opened, synchronize, reopened]
-    paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix']
-
-jobs:
-  nix-build-aarch64:
-    if: ${{ vars.CACHIX_NAME != '' }}
-    runs-on: ubuntu-latest
-    steps:
-    - name: Checkout repository
-      uses: actions/checkout@v4
-    - name: Install QEMU
-      # Copy-paste from https://github.com/orgs/community/discussions/8305#discussioncomment-5888654
-      run: |
-        sudo apt-get update
-        sudo apt-get install -y qemu-user-static qemu-system-aarch64
-        sudo usermod -a -G kvm $USER
-    - name: Install Nix
-      uses: DeterminateSystems/nix-installer-action@v9
-      with:
-        github-token: ${{ secrets.GITHUB_TOKEN }}
-        extra-conf: |
-          extra-platforms = aarch64-linux
-          extra-system-features = nixos-test kvm
-          extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org
-          extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
-    - uses: DeterminateSystems/magic-nix-cache-action@v2
-      with:
-        upstream-cache: https://${{ matrix.cachixName }}.cachix.org
-    - name: Set-up cachix to push the results to
-      uses: cachix/cachix-action@v13
-      with:
-        authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
-        name: ${{ vars.CACHIX_NAME }}
-    - name: Show all output paths
-      run: >
-          nix run github:nix-community/nix-eval-jobs
-          -- --gc-roots-dir gcroot
-          --flake
-          ".#packages.aarch64-linux"
-    - name: Build
-      run: >
-          nix run github:Mic92/nix-fast-build
-          -- --skip-cached --no-nom
-          --systems aarch64-linux
-          --flake
-          ".#checks.aarch64-linux"
--- a/.github/workflows/nix-ci.yml
+++ b/.github/workflows/nix-ci.yml
@@ -1,71 +0,0 @@
-name: Nix CI
-
-on:
-  workflow_dispatch: # allows manual triggering
-  push:
-    branches:
-      - master
-    paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix']
-  pull_request:
-    types: [opened, synchronize, reopened]
-    paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix']
-
-jobs:
-  nix-eval:
-    strategy:
-      fail-fast: false
-      matrix:
-        os: [ ubuntu-latest, macos-latest ]
-    runs-on: ${{ matrix.os }}
-    steps:
-    - name: Checkout repository
-      uses: actions/checkout@v4
-    - name: Install Nix
-      uses: DeterminateSystems/nix-installer-action@v9
-      with:
-        github-token: ${{ secrets.GITHUB_TOKEN }}
-        extra-conf: |
-          extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org
-          extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
-    - uses: DeterminateSystems/magic-nix-cache-action@v2
-      with:
-        upstream-cache: https://${{ matrix.cachixName }}.cachix.org
-    - name: List all flake outputs
-      run: nix flake show --all-systems
-    - name: Show all output paths
-      run: >
-          nix run github:nix-community/nix-eval-jobs
-          -- --gc-roots-dir gcroot
-          --flake
-          ".#packages.$(nix eval --raw --impure --expr builtins.currentSystem)"
-  nix-build:
-    if: ${{ vars.CACHIX_NAME != '' }}
-    strategy:
-      fail-fast: false
-      matrix:
-        os: [ ubuntu-latest, macos-latest ]
-    runs-on: ${{ matrix.os }}
-    steps:
-    - name: Checkout repository
-      uses: actions/checkout@v4
-    - name: Install Nix
-      uses: DeterminateSystems/nix-installer-action@v9
-      with:
-        github-token: ${{ secrets.GITHUB_TOKEN }}
-        extra-conf: |
-          extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org
-          extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
-    - uses: DeterminateSystems/magic-nix-cache-action@v2
-      with:
-        upstream-cache: https://${{ matrix.cachixName }}.cachix.org
-    - name: Set-up cachix to push the results to
-      uses: cachix/cachix-action@v13
-      with:
-        authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
-        name: ${{ vars.CACHIX_NAME }}
-    - name: Build
-      run: >
-          nix run github:Mic92/nix-fast-build
-          -- --skip-cached --no-nom
-          --flake
-          ".#checks.$(nix eval --raw --impure --expr builtins.currentSystem)"
--- a/.github/workflows/nix-flake-update.yml
+++ b/.github/workflows/nix-flake-update.yml
@@ -1,22 +0,0 @@
-name: update-flake-lock
-on:
-  workflow_dispatch:
-  schedule:
-    - cron: '0 0 * * 0' # runs weekly on Sunday at 00:00
-
-jobs:
-  lockfile:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - name: Install Nix
-        uses: DeterminateSystems/nix-installer-action@main
-      - name: Update flake.lock
-        uses: DeterminateSystems/update-flake-lock@main
-        with:
-          pr-title: "nix: update flake.lock"
-          pr-labels: |
-            nix
-          pr-reviewers: philiptaron,SomeoneSerge
-          token: ${{ secrets.FLAKE_TOKEN }}
--- a/.github/workflows/nix-publish-flake.yml
+++ b/.github/workflows/nix-publish-flake.yml
@@ -1,36 +0,0 @@
-# Make the flake discoverable on https://flakestry.dev and https://flakehub.com/flakes
-name: "Publish a flake to flakestry & flakehub"
-on:
-    push:
-        tags:
-        - "*"
-    workflow_dispatch:
-        inputs:
-            tag:
-                description: "The existing tag to publish"
-                type: "string"
-                required: true
-jobs:
-    flakestry-publish:
-        runs-on: ubuntu-latest
-        permissions:
-            id-token: "write"
-            contents: "read"
-        steps:
-            - uses: flakestry/flakestry-publish@main
-              with:
-                version: "${{ inputs.tag || github.ref_name }}"
-    flakehub-publish:
-      runs-on: "ubuntu-latest"
-      permissions:
-        id-token: "write"
-        contents: "read"
-      steps:
-        - uses: "actions/checkout@v4"
-          with:
-            ref: "${{ (inputs.tag != null) && format('refs/tags/{0}', inputs.tag) || '' }}"
-        - uses: "DeterminateSystems/nix-installer-action@main"
-        - uses: "DeterminateSystems/flakehub-push@main"
-          with:
-            visibility: "public"
-            tag: "${{ inputs.tag }}"
--- a/.github/workflows/python-check-requirements.yml
+++ b/.github/workflows/python-check-requirements.yml
@@ -1,29 +0,0 @@
-name: Python check requirements.txt
-
-on:
-  push:
-    paths:
-      - 'scripts/check-requirements.sh'
-      - 'convert*.py'
-      - 'requirements.txt'
-      - 'requirements/*.txt'
-  pull_request:
-    paths:
-      - 'scripts/check-requirements.sh'
-      - 'convert*.py'
-      - 'requirements.txt'
-      - 'requirements/*.txt'
-
-jobs:
-  python-check-requirements:
-    runs-on: ubuntu-latest
-    name: check-requirements
-    steps:
-      - name: Check out source repository
-        uses: actions/checkout@v3
-      - name: Set up Python environment
-        uses: actions/setup-python@v4
-        with:
-          python-version: "3.11"
-      - name: Run check-requirements.sh script
-        run:  bash scripts/check-requirements.sh nocleanup
--- a/.gitignore
+++ b/.gitignore
@@ -43,16 +43,13 @@ models-mnt
 /embedding
 /gguf
 /gguf-llama-simple
-/imatrix
 /infill
 /libllama.so
 /llama-bench
 /llava-cli
 /lookahead
-/lookup
 /main
 /metal
-/passkey
 /perplexity
 /q8dot
 /quantize
@@ -104,4 +101,3 @@ poetry.toml
 /tests/test-tokenizer-1-llama
 /tests/test-tokenizer-1-bpe
 /tests/test-rope
-/tests/test-backend-ops
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.14)  # for add_link_options and implicit target directories.
+cmake_minimum_required(VERSION 3.13)  # for add_link_options
 project("llama.cpp" C CXX)

 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
@@ -76,10 +76,6 @@ if (NOT MSVC)
    option(LLAMA_F16C                        "llama: enable F16C"                               ${INS_ENB})
 endif()

-if (WIN32)
-    option(LLAMA_WIN_VER                     "llama: Windows Version"                           0x602)
-endif()
-
 # 3rd party libs
 option(LLAMA_ACCELERATE                      "llama: enable Accelerate framework"               ON)
 option(LLAMA_BLAS                            "llama: use BLAS"                                  OFF)
@@ -95,17 +91,15 @@ set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for
 set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
                                             "llama: max. batch size for using peer access")
 option(LLAMA_HIPBLAS                         "llama: use hipBLAS"                               OFF)
-option(LLAMA_HIP_UMA                         "llama: use HIP unified memory architecture"       OFF)
 option(LLAMA_CLBLAST                         "llama: use CLBlast"                               OFF)
 option(LLAMA_METAL                           "llama: use Metal"                                 ${LLAMA_METAL_DEFAULT})
 option(LLAMA_METAL_NDEBUG                    "llama: disable Metal debugging"                   OFF)
-option(LLAMA_METAL_SHADER_DEBUG              "llama: compile Metal with -fno-fast-math"         OFF)
 option(LLAMA_MPI                             "llama: use MPI"                                   OFF)
 option(LLAMA_QKK_64                          "llama: use super-block size of 64 for k-quants"   OFF)

-option(LLAMA_BUILD_TESTS                     "llama: build tests"    ${LLAMA_STANDALONE})
-option(LLAMA_BUILD_EXAMPLES                  "llama: build examples" ${LLAMA_STANDALONE})
-option(LLAMA_BUILD_SERVER                    "llama: build server example"                      ON)
+option(LLAMA_BUILD_TESTS                "llama: build tests"    ${LLAMA_STANDALONE})
+option(LLAMA_BUILD_EXAMPLES             "llama: build examples" ${LLAMA_STANDALONE})
+option(LLAMA_BUILD_SERVER               "llama: build server example"                           ON)

 # Required for relocatable CMake package
 include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
@@ -159,9 +153,9 @@ if (APPLE AND LLAMA_ACCELERATE)
 endif()

 if (LLAMA_METAL)
-    find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
-    find_library(METAL_FRAMEWORK    Metal      REQUIRED)
-    find_library(METALKIT_FRAMEWORK MetalKit   REQUIRED)
+    find_library(FOUNDATION_LIBRARY         Foundation              REQUIRED)
+    find_library(METAL_FRAMEWORK            Metal                   REQUIRED)
+    find_library(METALKIT_FRAMEWORK         MetalKit                REQUIRED)

    message(STATUS "Metal framework found")
    set(GGML_HEADERS_METAL ggml-metal.h)
@@ -178,35 +172,6 @@ if (LLAMA_METAL)
    # copy ggml-metal.metal to bin directory
    configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY)

-    if (LLAMA_METAL_SHADER_DEBUG)
-        # custom command to do the following:
-        #   xcrun -sdk macosx metal    -fno-fast-math -c ggml-metal.metal -o ggml-metal.air
-        #   xcrun -sdk macosx metallib                   ggml-metal.air   -o default.metallib
-        #
-        # note: this is the only way I found to disable fast-math in Metal. it's ugly, but at least it works
-        #       disabling fast math is needed in order to pass tests/test-backend-ops
-        # note: adding -fno-inline fixes the tests when using MTL_SHADER_VALIDATION=1
-        # note: unfortunately, we have to call it default.metallib instead of ggml.metallib
-        #       ref: https://github.com/ggerganov/whisper.cpp/issues/1720
-        set(XC_FLAGS -fno-fast-math -fno-inline -g)
-        if (LLAMA_QKK_64)
-            set(XC_FLAGS ${XC_FLAGS} -DQK_K=64)
-        endif()
-
-        add_custom_command(
-            OUTPUT ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
-            COMMAND xcrun -sdk macosx metal    ${XC_FLAGS} -c ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air
-            COMMAND xcrun -sdk macosx metallib                ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air   -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
-            DEPENDS ggml-metal.metal
-            COMMENT "Compiling Metal kernels"
-        )
-
-        add_custom_target(
-            ggml-metal ALL
-            DEPENDS ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
-        )
-    endif()
-
    set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS}
        ${FOUNDATION_LIBRARY}
        ${METAL_FRAMEWORK}
@@ -234,11 +199,7 @@ if (LLAMA_BLAS)
            if (${LLAMA_BLAS_VENDOR} MATCHES "Generic")
                pkg_check_modules(DepBLAS REQUIRED blas)
            elseif (${LLAMA_BLAS_VENDOR} MATCHES "OpenBLAS")
-                # As of openblas v0.3.22, the 64-bit is named openblas64.pc
-                pkg_check_modules(DepBLAS openblas64)
-                if (NOT DepBLAS_FOUND)
-                    pkg_check_modules(DepBLAS REQUIRED openblas)
-                endif()
+                pkg_check_modules(DepBLAS REQUIRED openblas)
            elseif (${LLAMA_BLAS_VENDOR} MATCHES "FLAME")
                pkg_check_modules(DepBLAS REQUIRED blis)
            elseif (${LLAMA_BLAS_VENDOR} MATCHES "ATLAS")
@@ -330,18 +291,11 @@ if (LLAMA_CUBLAS)
        add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${LLAMA_CUDA_PEER_MAX_BATCH_SIZE})

        if (LLAMA_STATIC)
-            if (WIN32)
-                # As of 12.3.1 CUDA Tookit for Windows does not offer a static cublas library
-                set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
-            else ()
-                set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
-            endif()
+            set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
        else()
            set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
        endif()

-        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cuda_driver)
-
    if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
        # 52 == lowest CUDA 12 standard
        # 60 == f16 CUDA intrinsics
@@ -418,9 +372,6 @@ if (LLAMA_HIPBLAS)
    if (${hipblas_FOUND} AND ${hip_FOUND})
        message(STATUS "HIP and hipBLAS found")
        add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS)
-        if (LLAMA_HIP_UMA)
-            add_compile_definitions(GGML_HIP_UMA)
-        endif()
        add_library(ggml-rocm OBJECT ggml-cuda.cu ggml-cuda.h)
        if (BUILD_SHARED_LIBS)
            set_target_properties(ggml-rocm PROPERTIES POSITION_INDEPENDENT_CODE ON)
@@ -446,102 +397,57 @@ if (LLAMA_HIPBLAS)
    endif()
 endif()

-function(get_flags CCID CCVER)
-    set(C_FLAGS "")
-    set(CXX_FLAGS "")
-
-    if (CCID MATCHES "Clang")
-        set(C_FLAGS   -Wunreachable-code-break -Wunreachable-code-return)
-        set(CXX_FLAGS -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi)
-
-        if (
-            (CCID STREQUAL "Clang"      AND CCVER VERSION_GREATER_EQUAL 3.8.0) OR
-            (CCID STREQUAL "AppleClang" AND CCVER VERSION_GREATER_EQUAL 7.3.0)
-        )
-            set(C_FLAGS ${C_FLAGS} -Wdouble-promotion)
-        endif()
-    elseif (CCID STREQUAL "GNU")
-        set(C_FLAGS   -Wdouble-promotion)
-        set(CXX_FLAGS -Wno-array-bounds)
-
-        if (CCVER VERSION_GREATER_EQUAL 7.1.0)
-            set(CXX_FLAGS ${CXX_FLAGS} -Wno-format-truncation)
-        endif()
-        if (CCVER VERSION_GREATER_EQUAL 8.1.0)
-            set(CXX_FLAGS ${CXX_FLAGS} -Wextra-semi)
-        endif()
-    endif()
-
-    set(GF_C_FLAGS   ${C_FLAGS}   PARENT_SCOPE)
-    set(GF_CXX_FLAGS ${CXX_FLAGS} PARENT_SCOPE)
-endfunction()
-
 if (LLAMA_ALL_WARNINGS)
    if (NOT MSVC)
-        set(WARNING_FLAGS -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function)
-        set(C_FLAGS       -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes
-                          -Werror=implicit-int -Werror=implicit-function-declaration)
-        set(CXX_FLAGS     -Wmissing-declarations -Wmissing-noreturn)
+        set(warning_flags -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function)
+        set(c_flags -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration)
+        set(cxx_flags -Wmissing-declarations -Wmissing-noreturn)
+        set(host_cxx_flags "")

-        set(C_FLAGS   ${WARNING_FLAGS} ${C_FLAGS})
-        set(CXX_FLAGS ${WARNING_FLAGS} ${CXX_FLAGS})
+        if (CMAKE_C_COMPILER_ID MATCHES "Clang")
+            set(warning_flags ${warning_flags} -Wunreachable-code-break -Wunreachable-code-return)
+            set(host_cxx_flags ${host_cxx_flags} -Wmissing-prototypes -Wextra-semi)

-        get_flags(${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION})
+            if (
+                (CMAKE_C_COMPILER_ID STREQUAL "Clang"      AND CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL 3.8.0) OR
+                (CMAKE_C_COMPILER_ID STREQUAL "AppleClang" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL 7.3.0)
+            )
+                set(c_flags ${c_flags} -Wdouble-promotion)
+            endif()
+        elseif (CMAKE_C_COMPILER_ID STREQUAL "GNU")
+            set(c_flags ${c_flags} -Wdouble-promotion)
+            set(host_cxx_flags ${host_cxx_flags} -Wno-array-bounds)

-        add_compile_options("$<$<COMPILE_LANGUAGE:C>:${C_FLAGS};${GF_C_FLAGS}>"
-                            "$<$<COMPILE_LANGUAGE:CXX>:${CXX_FLAGS};${GF_CXX_FLAGS}>")
+            if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 7.1.0)
+                set(host_cxx_flags ${host_cxx_flags} -Wno-format-truncation)
+            endif()
+            if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 8.1.0)
+                set(host_cxx_flags ${host_cxx_flags} -Wextra-semi)
+            endif()
+        endif()
    else()
        # todo : msvc
-        set(C_FLAGS   "")
-        set(CXX_FLAGS "")
    endif()
+
+    set(c_flags   ${c_flags}   ${warning_flags})
+    set(cxx_flags ${cxx_flags} ${warning_flags})
+    add_compile_options("$<$<COMPILE_LANGUAGE:C>:${c_flags}>"
+                        "$<$<COMPILE_LANGUAGE:CXX>:${cxx_flags}>"
+                        "$<$<COMPILE_LANGUAGE:CXX>:${host_cxx_flags}>")
+
 endif()

-if (LLAMA_CUBLAS)
-    set(CUDA_FLAGS ${CXX_FLAGS} -use_fast_math)
-    if (NOT MSVC)
-        set(CUDA_FLAGS ${CUDA_FLAGS} -Wno-pedantic)
-    endif()
-
-    if (LLAMA_ALL_WARNINGS AND NOT MSVC)
-        set(NVCC_CMD ${CMAKE_CUDA_COMPILER} .c)
-        if (NOT CMAKE_CUDA_HOST_COMPILER STREQUAL "")
-            set(NVCC_CMD ${NVCC_CMD} -ccbin ${CMAKE_CUDA_HOST_COMPILER})
-        endif()
-
-        execute_process(
-            COMMAND ${NVCC_CMD} -Xcompiler --version
-            OUTPUT_VARIABLE CUDA_CCFULLVER
-            ERROR_QUIET
-        )
-
-        if (NOT CUDA_CCFULLVER MATCHES clang)
-            set(CUDA_CCID "GNU")
-            execute_process(
-                COMMAND ${NVCC_CMD} -Xcompiler "-dumpfullversion -dumpversion"
-                OUTPUT_VARIABLE CUDA_CCVER
-                ERROR_QUIET
-            )
-        else()
-            if (CUDA_CCFULLVER MATCHES Apple)
-                set(CUDA_CCID "AppleClang")
-            else()
-                set(CUDA_CCID "Clang")
-            endif()
-            string(REGEX REPLACE "^.* version ([0-9.]*).*$" "\\1" CUDA_CCVER ${CUDA_CCFULLVER})
-        endif()
-
-        message("-- CUDA host compiler is ${CUDA_CCID} ${CUDA_CCVER}")
-
-        get_flags(${CUDA_CCID} ${CUDA_CCVER})
-        list(JOIN GF_CXX_FLAGS " " CUDA_CXX_FLAGS)  # pass host compiler flags as a single argument
-        if (NOT CUDA_CXX_FLAGS STREQUAL "")
-            set(CUDA_FLAGS ${CUDA_FLAGS} -Xcompiler ${CUDA_CXX_FLAGS})
-        endif()
-    endif()
-
-    add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:${CUDA_FLAGS}>")
+if (NOT MSVC)
+    set(cuda_flags -Wno-pedantic)
 endif()
+set(cuda_flags ${cxx_flags} -use_fast_math ${cuda_flags})
+
+list(JOIN host_cxx_flags " " cuda_host_flags)  # pass host compiler flags as a single argument
+if (NOT cuda_host_flags STREQUAL "")
+    set(cuda_flags ${cuda_flags} -Xcompiler ${cuda_host_flags})
+endif()
+
+add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:${cuda_flags}>")

 if (WIN32)
    add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
@@ -565,7 +471,6 @@ endif()
 execute_process(
    COMMAND ${CMAKE_C_COMPILER} ${CMAKE_EXE_LINKER_FLAGS} -Wl,-v
    ERROR_VARIABLE output
-    OUTPUT_QUIET
 )
 if (output MATCHES "dyld-1015\.7")
    add_compile_definitions(HAVE_BUGGY_APPLE_LINKER)
@@ -594,13 +499,6 @@ if (NOT MSVC)
    endif()
 endif()

-function(add_compile_option_cpp ARG)
-    # Adds a compile option to C/C++ only, but not for Cuda.
-    # Use, e.g., for CPU-architecture flags.
-    add_compile_options($<$<COMPILE_LANGUAGE:CXX>:${ARG}>)
-    add_compile_options($<$<COMPILE_LANGUAGE:C>:${ARG}>)
-endfunction()
-
 if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") OR ("${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "arm64"))
    message(STATUS "ARM detected")
    if (MSVC)
@@ -635,7 +533,8 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GE
            include(cmake/FindSIMD.cmake)
        endif ()
        if (LLAMA_AVX512)
-            add_compile_option_cpp(/arch:AVX512)
+            add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX512>)
+            add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX512>)
            # MSVC has no compile-time flags enabling specific
            # AVX512 extensions, neither it defines the
            # macros corresponding to the extensions.
@@ -649,35 +548,37 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GE
                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
            endif()
        elseif (LLAMA_AVX2)
-            add_compile_option_cpp(/arch:AVX2)
+            add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX2>)
+            add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX2>)
        elseif (LLAMA_AVX)
-            add_compile_option_cpp(/arch:AVX)
+            add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX>)
+            add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX>)
        endif()
    else()
        if (LLAMA_NATIVE)
-            add_compile_option_cpp(-march=native)
+            add_compile_options(-march=native)
        endif()
        if (LLAMA_F16C)
-            add_compile_option_cpp(-mf16c)
+            add_compile_options(-mf16c)
        endif()
        if (LLAMA_FMA)
-            add_compile_option_cpp(-mfma)
+            add_compile_options(-mfma)
        endif()
        if (LLAMA_AVX)
-            add_compile_option_cpp(-mavx)
+            add_compile_options(-mavx)
        endif()
        if (LLAMA_AVX2)
-            add_compile_option_cpp(-mavx2)
+            add_compile_options(-mavx2)
        endif()
        if (LLAMA_AVX512)
-            add_compile_option_cpp(-mavx512f)
-            add_compile_option_cpp(-mavx512bw)
+            add_compile_options(-mavx512f)
+            add_compile_options(-mavx512bw)
        endif()
        if (LLAMA_AVX512_VBMI)
-            add_compile_option_cpp(-mavx512vbmi)
+            add_compile_options(-mavx512vbmi)
        endif()
        if (LLAMA_AVX512_VNNI)
-            add_compile_option_cpp(-mavx512vnni)
+            add_compile_options(-mavx512vnni)
        endif()
    endif()
 elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
@@ -692,11 +593,6 @@ else()
    message(STATUS "Unknown architecture")
 endif()

-if (MINGW)
-    # Target Windows 8 for PrefetchVirtualMemory
-    add_compile_definitions(_WIN32_WINNT=${LLAMA_WIN_VER})
-endif()
-
 #
 # POSIX conformance
 #
@@ -766,11 +662,11 @@ add_library(ggml OBJECT
            ggml-backend.h
            ggml-quants.c
            ggml-quants.h
-            ${GGML_SOURCES_CUDA}   ${GGML_HEADERS_CUDA}
+            ${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
            ${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
-            ${GGML_SOURCES_METAL}  ${GGML_HEADERS_METAL}
-            ${GGML_SOURCES_MPI}    ${GGML_HEADERS_MPI}
-            ${GGML_SOURCES_EXTRA}  ${GGML_HEADERS_EXTRA}
+            ${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
+            ${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI}
+            ${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA}
            )

 target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})
--- a/201
+++ b/201
@@ -1,15 +1,14 @@
 # Define the default target now so that it is always the first target
 BUILD_TARGETS = \
-	main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
+	main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
 	simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search  \
-	speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey tests/test-c.o
+	speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead tests/test-c.o

 # Binaries only useful for tests
 TEST_TARGETS = \
 	tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
 	tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama          \
-	tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope      \
-	tests/test-backend-ops
+	tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope

 # Code coverage output files
 COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@@ -26,6 +25,20 @@ ifndef UNAME_M
 UNAME_M := $(shell uname -m)
 endif

+ifeq '' '$(findstring clang,$(shell $(CC) --version))'
+	CC_IS_GCC=1
+	CC_VER := $(shell $(CC) -dumpfullversion -dumpversion | awk -F. '{ printf("%02d%02d%02d", $$1, $$2, $$3) }')
+else
+	CC_IS_CLANG=1
+	ifeq '' '$(findstring Apple,$(shell $(CC) --version))'
+		CC_IS_LLVM_CLANG=1
+	else
+		CC_IS_APPLE_CLANG=1
+	endif
+	CC_VER := $(shell $(CC) --version | sed -n 's/^.* version \([0-9.]*\).*$$/\1/p' \
+				| awk -F. '{ printf("%02d%02d%02d", $$1, $$2, $$3) }')
+endif
+
 # Mac OS + Arm can report x86_64
 # ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
 ifeq ($(UNAME_S),Darwin)
@@ -43,6 +56,10 @@ ifeq ($(UNAME_S),Darwin)
 	endif
 endif

+ifneq '' '$(or $(filter clean,$(MAKECMDGOALS)),$(LLAMA_METAL))'
+BUILD_TARGETS += metal
+endif
+
 default: $(BUILD_TARGETS)

 test: $(TEST_TARGETS)
@@ -61,7 +78,7 @@ test: $(TEST_TARGETS)
 			./$$test_target; \
 		fi; \
 		if [ $$? -ne 0 ]; then \
-			printf 'Test %s FAILED!\n\n' $$test_target; \
+			printf 'Test $$test_target FAILED!\n\n' $$test_target; \
 			failures=$$(( failures + 1 )); \
 		else \
 			printf 'Test %s passed.\n\n' $$test_target; \
@@ -103,12 +120,12 @@ MK_CXXFLAGS = -std=c++11 -fPIC

 # -Ofast tends to produce faster code, but may not be available for some compilers.
 ifdef LLAMA_FAST
-MK_CFLAGS     += -Ofast
-HOST_CXXFLAGS += -Ofast
-MK_NVCCFLAGS  += -O3
+MK_CFLAGS        += -Ofast
+MK_HOST_CXXFLAGS += -Ofast
+MK_CUDA_CXXFLAGS += -O3
 else
-MK_CFLAGS     += -O3
-MK_CXXFLAGS   += -O3
+MK_CFLAGS        += -O3
+MK_CXXFLAGS      += -O3
 endif

 # clock_gettime came in POSIX.1b (1993)
@@ -202,6 +219,30 @@ MK_CFLAGS    += $(WARN_FLAGS) -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmis
 				-Werror=implicit-function-declaration
 MK_CXXFLAGS  += $(WARN_FLAGS) -Wmissing-declarations -Wmissing-noreturn

+ifeq ($(CC_IS_CLANG), 1)
+	# clang options
+	MK_CFLAGS        += -Wunreachable-code-break -Wunreachable-code-return
+	MK_HOST_CXXFLAGS += -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi
+
+	ifneq '' '$(and $(CC_IS_LLVM_CLANG),$(filter 1,$(shell expr $(CC_VER) \>= 030800)))'
+		MK_CFLAGS += -Wdouble-promotion
+	endif
+	ifneq '' '$(and $(CC_IS_APPLE_CLANG),$(filter 1,$(shell expr $(CC_VER) \>= 070300)))'
+		MK_CFLAGS += -Wdouble-promotion
+	endif
+else
+	# gcc options
+	MK_CFLAGS        += -Wdouble-promotion
+	MK_HOST_CXXFLAGS += -Wno-array-bounds
+
+	ifeq ($(shell expr $(CC_VER) \>= 070100), 1)
+		MK_HOST_CXXFLAGS += -Wno-format-truncation
+	endif
+	ifeq ($(shell expr $(CC_VER) \>= 080100), 1)
+		MK_HOST_CXXFLAGS += -Wextra-semi
+	endif
+endif
+
 # this version of Apple ld64 is buggy
 ifneq '' '$(findstring dyld-1015.7,$(shell $(CC) $(LDFLAGS) -Wl,-v 2>&1))'
 	MK_CPPFLAGS += -DHAVE_BUGGY_APPLE_LINKER
@@ -252,8 +293,8 @@ ifndef RISCV

 ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
 	# Use all CPU extensions that are available:
-	MK_CFLAGS     += -march=native -mtune=native
-	HOST_CXXFLAGS += -march=native -mtune=native
+	MK_CFLAGS   += -march=native -mtune=native
+	MK_HOST_CXXFLAGS += -march=native -mtune=native

 	# Usage AVX-only
 	#MK_CFLAGS   += -mfma -mf16c -mavx
@@ -264,31 +305,19 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
 	#MK_CXXFLAGS += -mssse3
 endif

+# The stack is only 16-byte aligned on Windows, so don't let gcc emit aligned moves.
+# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412
+# https://github.com/ggerganov/llama.cpp/issues/2922
 ifneq '' '$(findstring mingw,$(shell $(CC) -dumpmachine))'
-	# The stack is only 16-byte aligned on Windows, so don't let gcc emit aligned moves.
-	# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412
-	# https://github.com/ggerganov/llama.cpp/issues/2922
 	MK_CFLAGS   += -Xassembler -muse-unaligned-vector-move
 	MK_CXXFLAGS += -Xassembler -muse-unaligned-vector-move
-
-	# Target Windows 8 for PrefetchVirtualMemory
-	MK_CPPFLAGS += -D_WIN32_WINNT=0x602
 endif

 ifneq ($(filter aarch64%,$(UNAME_M)),)
 	# Apple M1, M2, etc.
 	# Raspberry Pi 3, 4, Zero 2 (64-bit)
-	# Nvidia Jetson
 	MK_CFLAGS   += -mcpu=native
 	MK_CXXFLAGS += -mcpu=native
-	JETSON_RELEASE_INFO = $(shell jetson_release)
-	ifdef JETSON_RELEASE_INFO
-		ifneq ($(filter TX2%,$(JETSON_RELEASE_INFO)),)
-			JETSON_EOL_MODULE_DETECT = 1
-			CC = aarch64-unknown-linux-gnu-gcc
-			cxx = aarch64-unknown-linux-gnu-g++
-		endif
-	endif
 endif

 ifneq ($(filter armv6%,$(UNAME_M)),)
@@ -362,72 +391,64 @@ ifdef LLAMA_BLIS
 endif # LLAMA_BLIS

 ifdef LLAMA_CUBLAS
-	MK_CPPFLAGS  += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include -I/usr/local/cuda/targets/aarch64-linux/include
-	MK_LDFLAGS   += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib -L/usr/local/cuda/targets/aarch64-linux/lib -L/usr/lib/wsl/lib
+	MK_CPPFLAGS  += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
+	MK_LDFLAGS   += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
 	OBJS         += ggml-cuda.o
-	MK_NVCCFLAGS  = -use_fast_math
-ifndef JETSON_EOL_MODULE_DETECT
-	MK_NVCCFLAGS += --forward-unknown-to-host-compiler
-endif # JETSON_EOL_MODULE_DETECT
-ifdef LLAMA_DEBUG
-	MK_NVCCFLAGS += -lineinfo
-endif # LLAMA_DEBUG
+	NVCCFLAGS = --forward-unknown-to-host-compiler -use_fast_math
 ifdef LLAMA_CUDA_NVCC
 	NVCC = $(LLAMA_CUDA_NVCC)
 else
 	NVCC = nvcc
 endif #LLAMA_CUDA_NVCC
 ifdef CUDA_DOCKER_ARCH
-	MK_NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH)
-else ifndef CUDA_POWER_ARCH
-	MK_NVCCFLAGS += -arch=native
+	NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH)
+else ifdef CUDA_POWER_ARCH
+	NVCCFLAGS +=
+else
+	NVCCFLAGS += -arch=native
 endif # CUDA_DOCKER_ARCH
 ifdef LLAMA_CUDA_FORCE_DMMV
-	MK_NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV
+	NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV
 endif # LLAMA_CUDA_FORCE_DMMV
 ifdef LLAMA_CUDA_FORCE_MMQ
-	MK_NVCCFLAGS += -DGGML_CUDA_FORCE_MMQ
+	NVCCFLAGS += -DGGML_CUDA_FORCE_MMQ
 endif # LLAMA_CUDA_FORCE_MMQ
 ifdef LLAMA_CUDA_DMMV_X
-	MK_NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
+	NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
 else
-	MK_NVCCFLAGS += -DGGML_CUDA_DMMV_X=32
+	NVCCFLAGS += -DGGML_CUDA_DMMV_X=32
 endif # LLAMA_CUDA_DMMV_X
 ifdef LLAMA_CUDA_MMV_Y
-	MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
+	NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
 else ifdef LLAMA_CUDA_DMMV_Y
-	MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_DMMV_Y) # for backwards compatibility
+	NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_DMMV_Y) # for backwards compatibility
 else
-	MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=1
+	NVCCFLAGS += -DGGML_CUDA_MMV_Y=1
 endif # LLAMA_CUDA_MMV_Y
 ifdef LLAMA_CUDA_F16
-	MK_NVCCFLAGS += -DGGML_CUDA_F16
+	NVCCFLAGS += -DGGML_CUDA_F16
 endif # LLAMA_CUDA_F16
 ifdef LLAMA_CUDA_DMMV_F16
-	MK_NVCCFLAGS += -DGGML_CUDA_F16
+	NVCCFLAGS += -DGGML_CUDA_F16
 endif # LLAMA_CUDA_DMMV_F16
 ifdef LLAMA_CUDA_KQUANTS_ITER
-	MK_NVCCFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
+	NVCCFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
 else
-	MK_NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
+	NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
 endif
 ifdef LLAMA_CUDA_PEER_MAX_BATCH_SIZE
-	MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=$(LLAMA_CUDA_PEER_MAX_BATCH_SIZE)
+	NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=$(LLAMA_CUDA_PEER_MAX_BATCH_SIZE)
 else
-	MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128
+	NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128
 endif # LLAMA_CUDA_PEER_MAX_BATCH_SIZE
 #ifdef LLAMA_CUDA_CUBLAS
-#	MK_NVCCFLAGS += -DGGML_CUDA_CUBLAS
+#	NVCCFLAGS += -DGGML_CUDA_CUBLAS
 #endif # LLAMA_CUDA_CUBLAS
 ifdef LLAMA_CUDA_CCBIN
-	MK_NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
+	NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
 endif
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
-ifdef JETSON_EOL_MODULE_DETECT
-	$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
-else
-	$(NVCC) $(BASE_CXXFLAGS) $(NVCCFLAGS) -Wno-pedantic -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
-endif # JETSON_EOL_MODULE_DETECT
+	$(NVCC) $(NVCCFLAGS) -c $< -o $@
 endif # LLAMA_CUBLAS

 ifdef LLAMA_CLBLAST
@@ -449,22 +470,13 @@ ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
 endif # LLAMA_CLBLAST

 ifdef LLAMA_HIPBLAS
-
-	ifeq ($(wildcard /opt/rocm),)
-		ROCM_PATH	?= /usr
-		GPU_TARGETS ?= $(shell $(shell which amdgpu-arch))
-	else
-		ROCM_PATH	?= /opt/rocm
-		GPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
-	endif
-	HIPCC                   ?= $(ROCM_PATH)/bin/hipcc
+	ROCM_PATH	?= /opt/rocm
+	HIPCC	    ?= $(ROCM_PATH)/bin/hipcc
+	GPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
 	LLAMA_CUDA_DMMV_X       ?= 32
 	LLAMA_CUDA_MMV_Y        ?= 1
 	LLAMA_CUDA_KQUANTS_ITER ?= 2
 	MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
-ifdef LLAMA_HIP_UMA
-	MK_CPPFLAGS += -DGGML_HIP_UMA
-endif # LLAMA_HIP_UMA
 	MK_LDFLAGS  += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
 	MK_LDFLAGS	+= -lhipblas -lamdhip64 -lrocblas
 	HIPFLAGS    += $(addprefix --offload-arch=,$(GPU_TARGETS))
@@ -498,22 +510,16 @@ ggml-mpi.o: ggml-mpi.c ggml-mpi.h
 	$(CC) $(CFLAGS) -c $< -o $@
 endif # LLAMA_MPI

-GF_CC := $(CC)
-include scripts/get-flags.mk
-
 # combine build flags with cmdline overrides
-override CFLAGS    := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CFLAGS) $(GF_CFLAGS) $(CFLAGS)
-BASE_CXXFLAGS      := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CXXFLAGS) $(CXXFLAGS)
-override CXXFLAGS  := $(BASE_CXXFLAGS) $(HOST_CXXFLAGS) $(GF_CXXFLAGS)
-override NVCCFLAGS := $(MK_NVCCFLAGS) $(NVCCFLAGS)
-override LDFLAGS   := $(MK_LDFLAGS) $(LDFLAGS)
+override CFLAGS        := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CFLAGS) $(CFLAGS)
+override CXXFLAGS      := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CXXFLAGS) $(CXXFLAGS)
+override CUDA_CXXFLAGS := $(MK_CUDA_CXXFLAGS) $(CUDA_CXXFLAGS)
+override HOST_CXXFLAGS := $(MK_HOST_CXXFLAGS) $(HOST_CXXFLAGS)
+override LDFLAGS       := $(MK_LDFLAGS) $(LDFLAGS)

-# identify CUDA host compiler
-ifdef LLAMA_CUBLAS
-GF_CC := $(NVCC) $(NVCCFLAGS) 2>/dev/null .c -Xcompiler
-include scripts/get-flags.mk
-CUDA_CXXFLAGS := $(GF_CXXFLAGS)
-endif
+# save CXXFLAGS before we add host-only options
+NVCCFLAGS := $(NVCCFLAGS) $(CXXFLAGS) $(CUDA_CXXFLAGS) -Wno-pedantic -Xcompiler "$(HOST_CXXFLAGS)"
+override CXXFLAGS += $(HOST_CXXFLAGS)

 #
 # Print build information
@@ -610,9 +616,6 @@ quantize-stats: examples/quantize-stats/quantize-stats.cpp    build-info.o ggml.
 perplexity: examples/perplexity/perplexity.cpp                ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-imatrix: examples/imatrix/imatrix.cpp                         ggml.o llama.o $(COMMON_DEPS) $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-
 embedding: examples/embedding/embedding.cpp                   ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

@@ -622,7 +625,7 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
 server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2) -Wno-cast-qual

-gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
+gguf: examples/gguf/gguf.cpp ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

 train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
@@ -661,11 +664,10 @@ parallel: examples/parallel/parallel.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-lookup: examples/lookup/lookup.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-
-passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+ifdef LLAMA_METAL
+metal: examples/metal/metal.cpp ggml.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
+endif

 ifeq ($(UNAME_S),Darwin)
 swift: examples/batched.swift
@@ -727,16 +729,16 @@ tests/test-quantize-perf: tests/test-quantize-perf.cpp ggml.o $(OBJS)
 tests/test-sampling: tests/test-sampling.cpp ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
+tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
+tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
+tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
+tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

 tests/test-rope: tests/test-rope.cpp ggml.o $(OBJS)
@@ -744,6 +746,3 @@ tests/test-rope: tests/test-rope.cpp ggml.o $(OBJS)

 tests/test-c.o: tests/test-c.c llama.h
 	$(CC) $(CFLAGS) -c $(filter-out %.h,$^) -o $@
-
-tests/test-backend-ops: tests/test-backend-ops.cpp ggml.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
--- a/Package.swift
+++ b/Package.swift
@@ -2,41 +2,59 @@

 import PackageDescription

+#if arch(arm) || arch(arm64)
+let platforms: [SupportedPlatform]? = [
+    .macOS(.v12),
+    .iOS(.v14),
+    .watchOS(.v4),
+    .tvOS(.v14)
+]
+let exclude: [String] = []
+let resources: [Resource] = [
+    .process("ggml-metal.metal")
+]
+let additionalSources: [String] = ["ggml-metal.m"]
+let additionalSettings: [CSetting] = [
+    .unsafeFlags(["-fno-objc-arc"]),
+    .define("GGML_USE_METAL")
+]
+#else
+let platforms: [SupportedPlatform]? = nil
+let exclude: [String] = ["ggml-metal.metal"]
+let resources: [Resource] = []
+let additionalSources: [String] = []
+let additionalSettings: [CSetting] = []
+#endif
+
 let package = Package(
    name: "llama",
-    platforms: [
-        .macOS(.v12),
-        .iOS(.v14),
-        .watchOS(.v4),
-        .tvOS(.v14)
-    ],
+    platforms: platforms,
    products: [
        .library(name: "llama", targets: ["llama"]),
    ],
-    dependencies: [
-        .package(url: "https://github.com/ggerganov/ggml.git", .branch("release"))
-    ],
    targets: [
        .target(
            name: "llama",
-            dependencies: ["ggml"],
            path: ".",
-            exclude: ["ggml-metal.metal"],
+            exclude: exclude,
            sources: [
+                "ggml.c",
                "llama.cpp",
-            ],
+                "ggml-alloc.c",
+                "ggml-backend.c",
+                "ggml-quants.c",
+            ] + additionalSources,
+            resources: resources,
            publicHeadersPath: "spm-headers",
            cSettings: [
                .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
-                .define("GGML_USE_ACCELERATE"),
-                .unsafeFlags(["-fno-objc-arc"]),
-                .define("GGML_USE_METAL"),
+                .define("GGML_USE_ACCELERATE")
                // NOTE: NEW_LAPACK will required iOS version 16.4+
                // We should consider add this in the future when we drop support for iOS 14
                // (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
                // .define("ACCELERATE_NEW_LAPACK"),
                // .define("ACCELERATE_LAPACK_ILP64")
-            ],
+            ] + additionalSettings,
            linkerSettings: [
                .linkedFramework("Accelerate")
            ]
--- a/README.md
+++ b/README.md
@@ -10,12 +10,9 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++

 ### Hot topics

- New SOTA quantized models, including pure 2-bits: https://huggingface.co/ikawrakow
- Collecting Apple Silicon performance stats:
-  - M-series: https://github.com/ggerganov/llama.cpp/discussions/4167
-  - A-series: https://github.com/ggerganov/llama.cpp/discussions/4508
- Added Mixtral support: https://github.com/ggerganov/llama.cpp/pull/4406
+- Using `llama.cpp` with AWS instances: https://github.com/ggerganov/llama.cpp/discussions/4225
 - Looking for contributions to improve and maintain the `server` example: https://github.com/ggerganov/llama.cpp/issues/4216
+- Collecting Apple Silicon performance stats: https://github.com/ggerganov/llama.cpp/discussions/4167

 ----

@@ -98,20 +95,7 @@ as the main playground for developing new features for the [ggml](https://github
 - [X] [Persimmon 8B](https://github.com/ggerganov/llama.cpp/pull/3410)
 - [X] [MPT](https://github.com/ggerganov/llama.cpp/pull/3417)
 - [X] [Bloom](https://github.com/ggerganov/llama.cpp/pull/3553)
- [x] [Yi models](https://huggingface.co/models?search=01-ai/Yi)
 - [X] [StableLM-3b-4e1t](https://github.com/ggerganov/llama.cpp/pull/3586)
- [x] [Deepseek models](https://huggingface.co/models?search=deepseek-ai/deepseek)
- [x] [Qwen models](https://huggingface.co/models?search=Qwen/Qwen)
- [x] [Mixtral MoE](https://huggingface.co/models?search=mistral-ai/Mixtral)
- [x] [PLaMo-13B](https://github.com/ggerganov/llama.cpp/pull/3557)
- [x] [GPT-2](https://huggingface.co/gpt2)
-
-**Multimodal models:**
-
- [x] [Llava 1.5 models](https://huggingface.co/collections/liuhaotian/llava-15-653aac15d994e992e2677a7e)
- [x] [Bakllava](https://huggingface.co/models?search=SkunkworksAI/Bakllava)
- [x] [Obsidian](https://huggingface.co/NousResearch/Obsidian-3B-V0.5)
- [x] [ShareGPT4V](https://huggingface.co/models?search=Lin-Chen/ShareGPT4V)


 **Bindings:**
@@ -119,7 +103,6 @@ as the main playground for developing new features for the [ggml](https://github
 - Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
 - Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
 - Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp)
- JS/TS (llama.cpp server client): [lgrammel/modelfusion](https://modelfusion.dev/integration/model-provider/llamacpp)
 - Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)
 - Rust: [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
 - C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp)
@@ -127,7 +110,6 @@ as the main playground for developing new features for the [ggml](https://github
 - Clojure: [phronmophobic/llama.clj](https://github.com/phronmophobic/llama.clj)
 - React Native: [mybigday/llama.rn](https://github.com/mybigday/llama.rn)
 - Java: [kherud/java-llama.cpp](https://github.com/kherud/java-llama.cpp)
- Zig: [deins/llama.cpp.zig](https://github.com/Deins/llama.cpp.zig)

 **UI:**

@@ -136,8 +118,6 @@ as the main playground for developing new features for the [ggml](https://github
 - [withcatai/catai](https://github.com/withcatai/catai)
 - [semperai/amica](https://github.com/semperai/amica)
 - [psugihara/FreeChat](https://github.com/psugihara/FreeChat)
- [ptsochantaris/emeltal](https://github.com/ptsochantaris/emeltal)
- [iohub/collama](https://github.com/iohub/coLLaMA)

 ---

@@ -388,37 +368,20 @@ Building the program with BLAS support may lead to some performance improvements

  Check [BLIS.md](docs/BLIS.md) for more information.

- #### Intel oneMKL
-  - Using manual oneAPI installation:
-    By default, `LLAMA_BLAS_VENDOR` is set to `Generic`, so if you already sourced intel environment script and assign `-DLLAMA_BLAS=ON` in cmake, the mkl version of Blas will automatically been selected. Otherwise please install oneAPI and follow the below steps:
-      ```bash
-      mkdir build
-      cd build
-      source /opt/intel/oneapi/setvars.sh # You can skip this step if  in oneapi-runtime docker image, only required for manual installation
-      cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_NATIVE=ON
-      cmake --build . --config Release
-      ```
+- #### Intel MKL

-  - Using oneAPI docker image:
-    If you do not want to source the environment vars and install oneAPI manually, you can also build the code using intel docker container: [oneAPI-runtime](https://hub.docker.com/r/intel/oneapi-runtime)
+  By default, `LLAMA_BLAS_VENDOR` is set to `Generic`, so if you already sourced intel environment script and assign `-DLLAMA_BLAS=ON` in cmake, the mkl version of Blas will automatically been selected. You may also specify it by:

-      ```bash
-      mkdir build
-      cd build
-      cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_NATIVE=ON
-      cmake --build . --config Release
-      ```
-
-  Building through oneAPI compilers will make avx_vnni instruction set available for intel processors that do not support avx512 and avx512_vnni.
-
-  Check [Optimizing and Running LLaMA2 on Intel® CPU](https://www.intel.com/content/www/us/en/content-details/791610/optimizing-and-running-llama2-on-intel-cpu.html) for more information.
+  ```bash
+  mkdir build
+  cd build
+  cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+  cmake --build . --config Release
+  ```

 - #### cuBLAS

  This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
-
-  For Jetson user, if you have Jetson Orin, you can try this: [Offical Support](https://www.jetson-ai-lab.com/tutorial_text-generation.html). If you are using an old model(nano/TX2), need some additional operations before compiling.
-
  - Using `make`:
    ```bash
    make LLAMA_CUBLAS=1
@@ -456,21 +419,14 @@ Building the program with BLAS support may lead to some performance improvements
    ```bash
    make LLAMA_HIPBLAS=1
    ```
-  - Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU):
+  - Using `CMake` for Linux:
    ```bash
-    CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ \
-        cmake -H. -Bbuild -DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
-        && cmake --build build -- -j 16
+    mkdir build
+    cd build
+    CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ cmake .. -DLLAMA_HIPBLAS=ON
+    cmake --build .
    ```
-    On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DLLAMA_HIP_UMA=ON"`.
-    However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).
-
-  - Using `make` (example for target gfx1030, build with 16 CPU threads):
-    ```bash
-    make -j16 LLAMA_HIPBLAS=1 LLAMA_HIP_UMA=1 AMDGPU_TARGETS=gxf1030
-    ```
-
-  - Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU):
+  - Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS):
    ```bash
    set PATH=%HIP_PATH%\bin;%PATH%
    mkdir build
@@ -479,11 +435,10 @@ Building the program with BLAS support may lead to some performance improvements
    cmake --build .
    ```
    Make sure that `AMDGPU_TARGETS` is set to the GPU arch you want to compile for. The above example uses `gfx1100` that corresponds to Radeon RX 7900XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors)
-    Find your gpu version string by matching the most significant version information from `rocminfo | grep gfx | head -1 | awk '{print $2}'` with the list of processors, e.g. `gfx1035` maps to `gfx1030`.


  The environment variable [`HIP_VISIBLE_DEVICES`](https://rocm.docs.amd.com/en/latest/understand/gpu_isolation.html#hip-visible-devices) can be used to specify which GPU(s) will be used.
-  If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3.
+  If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 or 11.0.0 on RDNA3.
  The following compilation options are also available to tweak performance (yes, they refer to CUDA, not HIP, because it uses the same code as the cuBLAS version above):

  | Option                  | Legal values           | Default | Description |
@@ -1014,8 +969,6 @@ docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /
 - There are no strict rules for the code style, but try to follow the patterns in the code (indentation, spaces, etc.). Vertical alignment makes things more readable and easier to batch edit
 - Clean-up any trailing whitespaces, use 4 spaces for indentation, brackets on the same line, `void * ptr`, `int & a`
 - See [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions
- Tensors store data in row-major order. We refer to dimension 0 as columns, 1 as rows, 2 as matrices
- Matrix multiplication is unconventional: [`z = ggml_mul_mat(ctx, x, y)`](https://github.com/ggerganov/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means `zT = x @ yT`

 ### Docs

--- a/awq-py/README.md
+++ b/awq-py/README.md
@@ -1,116 +0,0 @@
-# AWQ: Activation-aware Weight Quantization for LLM - version apply to llamacpp
-[[Paper](https://arxiv.org/abs/2306.00978)][[Original Repo](https://github.com/mit-han-lab/llm-awq)][[Easy-to-use Repo](https://github.com/casper-hansen/AutoAWQ)]
-
-**Supported models:**
-
- [X] LLaMA
- [x] LLaMA 2
- [X] MPT
- [X] Mistral AI v0.1
- [ ] Bloom
- [ ] Mixtral MoE
-
-**TODO:**
- [x] Update version work with both MPT and MPT-AWQ model
- [ ] Add OPT model
- [ ] Add Bloom model
- [ ] Add Mixtral MoE
- [ ] Support w3, w2
-
-
-## Contents
-
- [Install](##Install)
- [Convert](##Convert)
- [Quantize](##Quantize)
- [Test](##Test)
- [Benchmark](##Benchmark)
- [Results](##Results)
-
-## Install
-Install requirements
-```bash
-pip install -r requirements.txt
-```
-Get the pre-computed AWQ search results for multiple model families, including LLaMA, LLaMA2, MPT, OPT
-```bash
-git clone https://huggingface.co/datasets/mit-han-lab/awq-model-zoo awq_cache
-```
-
-## Convert
-Example for llama model
-```bash
-# For llama7b and llama2 models
-python convert.py models/llama-7b/ --awq-path awq_cache/llama-7b-w4-g128.pt --outfile models/llama_7b_fp16.gguf
-# For mistral and mpt models
-python convert-hf-to-gguf.py models/mpt-7b/ --awq-path awq_cache/mpt-7b-w4-g128.pt --outfile models/mpt_7b_fp16.gguf
-```
-
-## Quantize
-```bash
-# We only benchmark and confirm the results on q4_0, q4_1, and q2_k types.
-./quantize models/llama_7b_fp16.gguf models/llama_7b_q4_0.gguf q4_0
-```
-
-## Test
-```bash
-# For all models.
-./build/bin/main -m models/llama_7b_q4_0.gguf -n 128 --prompt "Once upon a time"
-```
-
-## Benchmark
-The perplexity measurements in table above are done against the `wikitext2` test dataset (https://paperswithcode.com/dataset/wikitext-2), with context length of 512.
-```bash
-# For llama and llama2, and mistral models.
-./perplexity -m models/llama_7b_q4_0.gguf -f datasets/wikitext-2-raw/wiki.test.raw
-```
-
-## Results
-Results are run on OpenBLAS (CPU) and CuBLAS (GPU) for fair comparison
-We use three types of llamacpp quantization methods to work with our version, including q4_0, q4_1, and q2_k
-
-### Llama 7B (Build with OpenBLAS)
-
-| Model      | Measure      | F16    | Q4_0   | Q4_1   | Q2_K   |
-|-----------:|--------------|-------:|-------:|-------:|-------:|
-|Llama 7B    | perplexity   | 5.9066 | 6.1214 | 6.0643 | 6.5808 |
-|Llama 7B    | file size    |  12.9G  |   3.5G |   3.9G |   2.7G |
-|Llama 7B    | bits/weight  |   16.0 |    4.5 |    5.0 |    2.6 |
-|AWQ-LLama 7B| perplexity   | 5.9175 | 6.0252 | 5.9987 | 6.3692 |
-|AWQ-LLama 7B| file size    |  12.9G  |   3.5G |   3.9G |   2.7G |
-|AWQ-LLama 7B| bits/weight  |   16.0 |    4.5 |    5.0 |    2.6 |
-
-
-### Llama2 7B (Build with CuBLAS)
-
-| Model       | Measure      | F16    | Q4_0   | Q4_1   | Q2_K   |
-|------------:|--------------|-------:|-------:|-------:|-------:|
-|Llama2 7B    | perplexity   | 5.8664 | 6.0260 | 6.0656 | 6.4496 |
-|Llama2 7B    | file size    |  12.9G  |   3.5G |   3.9G |   2.7G |
-|Llama2 7B    | bits/weight  |   16.0 |    4.5 |    5.0 |    2.6 |
-|AWQ-LLama2 7B| perplexity   | 5.8801 | 6.0054 | 5.9849 | 6.3650 |
-|AWQ-LLama2 7B| file size    |  12.9G  |   3.5G |   3.9G |   2.7G |
-|AWQ-LLama2 7B| bits/weight  |   16.0 |    4.5 |    5.0 |    2.6 |
-
-
-### Mistral 7B v0.1 (Build with CuBLAS)
-
-| Model        | Measure      | F16    | Q4_0   | Q4_1   | Q2_K   |
-|-------------:|--------------|-------:|-------:|-------:|-------:|
-|Mistral 7B    | perplexity   | 5.6931 | 5.8202 | 5.8268 | 6.1645 |
-|Mistral 7B    | file size     |  14.5G |   4.1G |   4.5G |   3.1G |
-|Mistral 7B    | bits/weight  |   16.0 |    4.5 |    5.0 |    2.6 |
-|AWQ-Mistral 7B| perplexity   | 5.6934 | 5.8020 | 5.7691 | 6.0426 |
-|AWQ-Mistral 7B| file size     |  14.5G |   4.1G |   4.5G |   3.1G |
-|AWQ-Mistral 7B| bits/weight  |   16.0 |    4.5 |    5.0 |    2.6 |
-
-### MPT 7B (Build with OpenBLAS)
-
-| Model    | Measure      | F16    | Q4_0   | Q4_1   | Q2_K    |
-|---------:|--------------|-------:|-------:|-------:|--------:|
-|MPT 7B    | perplexity   | 8.4369 | 8.7956 | 8.6265 | 11.4913 |
-|MPT 7B    | file size    |  13.7G  |   3.9G |   4.3G |   2.8G  |
-|MPT 7B    | bits/weight  |   16.0 |    4.5 |    5.0 |    2.6  |
-|AWQ-MPT 7B| perplexity   | 8.4944 | 8.7053 |  8.6750 | 10.2873|
-|AWQ-MPT 7B| file size    |  13.7G  |   3.9G |   4.3G |   2.8G  |
-|AWQ-MPT 7B| bits/weight  |   16.0 |    4.5 |    5.0 |    2.6  |
--- a/awq-py/awq/apply_awq.py
+++ b/awq-py/awq/apply_awq.py
@@ -1,254 +0,0 @@
-"""
-Implements the AWQ for llama.cpp use cases.
-Original paper: https://arxiv.org/abs/2306.00978
-
-This code is based on versions of the AWQ implementation found in the following repositories:
-* https://github.com/mit-han-lab/llm-awq
-* https://github.com/casper-hansen/AutoAWQ
-"""
-
-import os
-import torch
-import torch.nn as nn
-
-from transformers import AutoModelForCausalLM, AutoConfig
-from transformers.models.bloom.modeling_bloom import BloomGelu
-from transformers.models.llama.modeling_llama import LlamaRMSNorm
-from transformers.activations import GELUActivation
-
-
-class ScaledActivation(nn.Module):
-    """
-    ScaledActivation module wraps an existing activation function and applies a
-    scale factor to its output.
-
-    Args:
-        module (nn.Module): The activation function to be scaled.
-        scales (torch.Tensor): A tensor of size (num_features,) containing the initial
-            scale factors for each feature.
-
-    Returns:
-        torch.Tensor: The scaled output of the activation function.
-    """
-
-    def __init__(self, module, scales):
-        super().__init__()
-        self.act = module
-        self.scales = nn.Parameter(scales.data)
-
-    def forward(self, x):
-        return self.act(x) / self.scales.view(1, 1, -1).to(x.device)
-
-
-def set_op_by_name(layer, name, new_module):
-    """
-    Set the new module for given module's name.
-
-    Args:
-        layer (nn.Module): The layer in which to replace the submodule.
-        name (str): The path to the submodule to be replaced, using dot notation
-            to access nested modules.
-        new_module (nn.Module): The new module to replace the existing one.
-    """
-    levels = name.split(".")
-    if len(levels) > 1:
-        mod_ = layer
-        for l_idx in range(len(levels) - 1):
-            if levels[l_idx].isdigit():
-                mod_ = mod_[int(levels[l_idx])]
-            else:
-                mod_ = getattr(mod_, levels[l_idx])
-        setattr(mod_, levels[-1], new_module)
-    else:
-        setattr(layer, name, new_module)
-
-
-def get_op_by_name(module, op_name):
-    """
-    Retrieves a submodule within a given layer based on its name.
-
-    Args:
-        module (nn.Module): The layer containing the submodule to find.
-        op_name (str): The name of the submodule.
-
-    Returns:
-        nn.Module: The requested submodule found within the given layer.
-
-    Raises:
-        ValueError: If the specified submodule cannot be found within the layer.
-    """
-    for name, m in module.named_modules():
-        if name == op_name:
-            return m
-    raise ValueError(f"Cannot find op {op_name} in module {module}")
-
-
-@torch.no_grad()
-def scale_ln_fcs(ln, fcs, scales):
-    """
-    Scales the weights of a LayerNorm and a list of fully-connected layers proportionally.
-
-    Args:
-        ln (nn.LayerNorm): The LayerNorm module to be scaled.
-        fcs (List[nn.Linear]): A list of fully-connected layers to be scaled.
-        scales (torch.Tensor): A 1D tensor of size (num_features,).
-    """
-
-    if not isinstance(fcs, list):
-        fcs = [fcs]
-
-    scales = scales.to(ln.weight.device)
-
-    ln.weight.div_(scales)
-    if hasattr(ln, "bias") and ln.bias is not None:
-        ln.bias.div_(scales)
-
-    for fc in fcs:
-        fc.weight.mul_(scales.view(1, -1))
-
-    for p in ln.parameters():
-        assert torch.isnan(p).sum() == 0
-    for fc in fcs:
-        for p in fc.parameters():
-            assert torch.isnan(p).sum() == 0
-
-
-@torch.no_grad()
-def scale_fc_fc(fc1, fc2, scales):
-    """
-    Scales the weights of two fully-connected layers in a specific pattern.
-
-    Args:
-        fc1 (nn.Linear): The first fully-connected layer to be scaled.
-        fc2 (nn.Linear): The second fully-connected layer to be scaled.
-        scales (torch.Tensor): A 1D tensor of size (num_features,).
-    """
-    assert isinstance(fc1, nn.Linear)
-    assert isinstance(fc2, nn.Linear)
-
-    scales = scales.to(fc1.weight.device)
-
-    fc1.weight[-scales.size(0):].div_(scales.view(-1, 1))
-    if fc1.bias is not None:
-        fc1.bias.div_(scales.view(-1))
-
-    fc2.weight.mul_(scales.view(1, -1))
-
-    for p in fc1.parameters():
-        assert torch.isnan(p).sum() == 0
-    for p in fc2.parameters():
-        assert torch.isnan(p).sum() == 0
-
-
-@torch.no_grad()
-def scale_gelu_fc(gelu, fc, scales):
-    """
-    Scales the weight of a GELU activation and a fully-connected layer proportionally.
-
-    Args:
-        gelu (Union[nn.GELU, BloomGelu, GELUActivation]): The GELU activation module to be scaled.
-        fc (nn.Linear): The fully-connected layer to be scaled.
-        scales (torch.Tensor): A 1D tensor of size (num_features,).
-
-    Raises:
-        TypeError: If the `gelu` module is not of type `nn.GELU`, `BloomGelu`, or `GELUActivation`.
-        TypeError: If the `fc` module is not of type `nn.Linear`.
-    """
-    assert isinstance(gelu, (nn.GELU, BloomGelu, GELUActivation))
-    assert isinstance(fc, nn.Linear)
-
-    fc.weight.mul_(scales.view(1, -1).to(fc.weight.device))
-
-    for p in fc.parameters():
-        assert torch.isnan(p).sum() == 0
-
-
-def apply_scale(module, scales_list, input_feat_dict=None):
-    """
-    Applies different scaling strategies to layers based on their type and hierarchy within a given module.
-
-    Args:
-        module (nn.Module): The module containing the layers to be scaled.
-        scales_list (List[Tuple[str, List[str], torch.Tensor]]): A list of tuples containing:
-            * prev_op_name (str): The name of the preceding operation or module,
-                relative to which the layers to be scaled are located.
-            * layer_names (List[str]): A list of names of the layers to be scaled, relative to the preceding operation.
-            * scales (torch.Tensor): A 1D tensor of size (num_features,) containing the scaling factors for each feature.
-        input_feat_dict (Optional[Dict[str, torch.Tensor]]): A dictionary mapping layer names to their corresponding
-            input features (optional).
-    """
-    for prev_op_name, layer_names, scales in scales_list:
-        prev_op = get_op_by_name(module, prev_op_name)
-        layers = [get_op_by_name(module, name) for name in layer_names]
-
-        prev_op.cuda()
-        for layer in layers:
-            layer.cuda()
-        scales.cuda()
-
-        if isinstance(prev_op, nn.Linear):
-            assert len(layers) == 1
-            scale_fc_fc(prev_op, layers[0], scales)
-        elif isinstance(prev_op, (nn.LayerNorm, LlamaRMSNorm)) or "rmsnorm" in str(prev_op.__class__).lower():
-            scale_ln_fcs(prev_op, layers, scales)
-        elif isinstance(prev_op, (nn.GELU, BloomGelu, GELUActivation)):
-            new_module = ScaledActivation(prev_op, scales)
-            set_op_by_name(module, prev_op_name, new_module)
-            scale_gelu_fc(prev_op, layers[0], scales)
-        else:
-            raise NotImplementedError(f"prev_op {type(prev_op)} not supported yet!")
-
-        # apply the scaling to input feat if given; prepare it for clipping
-        if input_feat_dict is not None:
-            for layer_name in layer_names:
-                inp = input_feat_dict[layer_name]
-                inp.div_(scales.view(1, -1).to(inp.device))
-
-        prev_op.cpu()
-        for layer in layers:
-            layer.cpu()
-        scales.cpu()
-
-
-@torch.no_grad()
-def apply_clip(module, clip_list):
-    """
-    Applies element-wise clipping to the weight of a specific layer within a given module.
-
-    Args:
-        module (nn.Module): The module containing the layer to be clipped.
-        clip_list (List[Tuple[str, torch.Tensor]]): A list of tuples containing:
-            * name (str): The name of the layer to be clipped, relative to the root of the module.
-            * max_val (torch.Tensor): A 1D or 2D tensor defining the upper bound for each element of the layer's weight.
-    """
-    for name, max_val in clip_list:
-        layer = get_op_by_name(module, name)
-        layer.cuda()
-        max_val = max_val.to(layer.weight.device)
-        org_shape = layer.weight.shape
-        layer.weight.data = layer.weight.data.reshape(*max_val.shape[:2], -1)
-        layer.weight.data = torch.clamp(layer.weight.data, -max_val, max_val)
-        layer.weight.data = layer.weight.data.reshape(org_shape)
-        layer.cpu()
-
-
-def add_scale_weights(model_path, scale_path, tmp_path):
-    """
-    Adds pre-computed Activation Weight Quantization (AWQ) results to a model,
-    including scaling factors and clipping bounds.
-
-    Args:
-        model_path (str): Path to the pre-trained model to be equipped with AWQ.
-        scale_path (str): Path to the AWQ scale factors (.pt file).
-        tmp_path (str): Path to the temporary directory where the equipped model will be saved.
-    """
-    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
-    model = AutoModelForCausalLM.from_pretrained(
-        model_path, config=config, trust_remote_code=True
-    )
-    model.eval()
-    awq_results = torch.load(str(scale_path), map_location="cpu")
-    apply_scale(model, awq_results["scale"])
-    apply_clip(model, awq_results["clip"])
-    model.save_pretrained(str(tmp_path))
-    os.system(f"cp {str(model_path)}/tokenizer* {str(tmp_path)}")
--- a/awq-py/requirements.txt
+++ b/awq-py/requirements.txt
@@ -1,2 +0,0 @@
-torch>=2.1.1
-transformers>=4.32.0
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -30,12 +30,6 @@ sd=`dirname $0`
 cd $sd/../
 SRC=`pwd`

-CMAKE_EXTRA=""
-
-if [ ! -z ${GG_BUILD_METAL} ]; then
-    CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_METAL_SHADER_DEBUG=ON"
-fi
-
 ## helpers

 # download a file if it does not exist or if it is outdated
@@ -87,8 +81,8 @@ function gg_run_ctest_debug {

    set -e

-    (time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j                                          ) 2>&1 | tee -a $OUT/${ci}-make.log
+    (time cmake -DCMAKE_BUILD_TYPE=Debug ..     ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time make -j                               ) 2>&1 | tee -a $OUT/${ci}-make.log

    (time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log

@@ -115,8 +109,8 @@ function gg_run_ctest_release {

    set -e

-    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j                                            ) 2>&1 | tee -a $OUT/${ci}-make.log
+    (time cmake -DCMAKE_BUILD_TYPE=Release ..   ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time make -j                               ) 2>&1 | tee -a $OUT/${ci}-make.log

    if [ -z ${GG_BUILD_LOW_PERF} ]; then
        (time ctest --output-on-failure ) 2>&1 | tee -a $OUT/${ci}-ctest.log
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -65,4 +65,4 @@ endif()

 target_include_directories(${TARGET} PUBLIC .)
 target_compile_features(${TARGET} PUBLIC cxx_std_11)
-target_link_libraries(${TARGET} PRIVATE build_info PUBLIC llama)
+target_link_libraries(${TARGET} PRIVATE llama build_info)
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -167,24 +167,6 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
            if (params.n_threads_batch <= 0) {
                params.n_threads_batch = std::thread::hardware_concurrency();
            }
-        } else if (arg == "-td" || arg == "--threads-draft") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.n_threads_draft = std::stoi(argv[i]);
-            if (params.n_threads_draft <= 0) {
-                params.n_threads_draft = std::thread::hardware_concurrency();
-            }
-        } else if (arg == "-tbd" || arg == "--threads-batch-draft") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.n_threads_batch_draft = std::stoi(argv[i]);
-            if (params.n_threads_batch_draft <= 0) {
-                params.n_threads_batch_draft = std::thread::hardware_concurrency();
-            }
        } else if (arg == "-p" || arg == "--prompt") {
            if (++i >= argc) {
                invalid_param = true;
@@ -238,20 +220,6 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                break;
            }
            params.n_ctx = std::stoi(argv[i]);
-        } else if (arg == "--grp-attn-n" || arg == "-gan") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-
-            params.grp_attn_n = std::stoi(argv[i]);
-        } else if (arg == "--grp-attn-w" || arg == "-gaw") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-
-            params.grp_attn_w = std::stoi(argv[i]);
        } else if (arg == "--rope-freq-base") {
            if (++i >= argc) {
                invalid_param = true;
@@ -310,18 +278,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                break;
            }
            params.yarn_beta_slow = std::stof(argv[i]);
-        } else if (arg == "--samplers") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            sparams.samplers_sequence = parse_samplers_input(argv[i]);
-        } else if (arg == "--sampling-seq") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            sparams.samplers_sequence = argv[i];
+        } else if (arg == "--memory-f32") {
+            params.memory_f16 = false;
        } else if (arg == "--top-p") {
            if (++i >= argc) {
                invalid_param = true;
@@ -540,12 +498,6 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
            params.infill = true;
        } else if (arg == "-dkvc" || arg == "--dump-kv-cache") {
            params.dump_kv_cache = true;
-        } else if (arg == "-nkvo" || arg == "--no-kv-offload") {
-            params.no_kv_offload = true;
-        } else if (arg == "-ctk" || arg == "--cache-type-k") {
-            params.cache_type_k = argv[++i];
-        } else if (arg == "-ctv" || arg == "--cache-type-v") {
-            params.cache_type_v = argv[++i];
        } else if (arg == "--multiline-input") {
            params.multiline_input = true;
        } else if (arg == "--simple-io") {
@@ -561,8 +513,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                invalid_param = true;
                break;
            }
+#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
            params.n_gpu_layers = std::stoi(argv[i]);
-#ifndef LLAMA_SUPPORTS_GPU_OFFLOAD
+#else
            fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
            fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
 #endif
@@ -571,8 +524,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                invalid_param = true;
                break;
            }
+#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
            params.n_gpu_layers_draft = std::stoi(argv[i]);
-#ifndef LLAMA_SUPPORTS_GPU_OFFLOAD
+#else
            fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n");
            fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
 #endif
@@ -581,44 +535,25 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                invalid_param = true;
                break;
            }
+#ifdef GGML_USE_CUBLAS
            params.main_gpu = std::stoi(argv[i]);
-#ifndef GGML_USE_CUBLAS
-            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the main GPU has no effect.\n");
-#endif // GGML_USE_CUBLAS
-        } else if (arg == "--split-mode" || arg == "-sm") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            std::string arg_next = argv[i];
-            if (arg_next == "none") {
-                params.split_mode = LLAMA_SPLIT_NONE;
-            } else if (arg_next == "layer") {
-                params.split_mode = LLAMA_SPLIT_LAYER;
-            } else if (arg_next == "row") {
-                params.split_mode = LLAMA_SPLIT_ROW;
-            } else {
-                invalid_param = true;
-                break;
-            }
-#ifndef GGML_USE_CUBLAS
-            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the split mode has no effect.\n");
-#endif // GGML_USE_CUBLAS
+#else
+            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.\n");
+#endif
        } else if (arg == "--tensor-split" || arg == "-ts") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
+#ifdef GGML_USE_CUBLAS
            std::string arg_next = argv[i];

            // split string by , and /
            const std::regex regex{R"([,/]+)"};
            std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
            std::vector<std::string> split_arg{it, {}};
-            if (split_arg.size() >= LLAMA_MAX_DEVICES) {
-                invalid_param = true;
-                break;
-            }
+            GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);
+
            for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) {
                if (i < split_arg.size()) {
                    params.tensor_split[i] = std::stof(split_arg[i]);
@@ -626,8 +561,14 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                    params.tensor_split[i] = 0.0f;
                }
            }
-#ifndef GGML_USE_CUBLAS
-            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting a tensor split has no effect.\n");
+#else
+            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
+#endif // GGML_USE_CUBLAS
+        } else if (arg == "--no-mul-mat-q" || arg == "-nommq") {
+#ifdef GGML_USE_CUBLAS
+            params.mul_mat_q = false;
+#else
+            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n");
 #endif // GGML_USE_CUBLAS
        } else if (arg == "--no-mmap") {
            params.use_mmap = false;
@@ -635,8 +576,6 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
            params.numa = true;
        } else if (arg == "--verbose-prompt") {
            params.verbose_prompt = true;
-        } else if (arg == "--no-display-prompt") {
-            params.display_prompt = false;
        } else if (arg == "-r" || arg == "--reverse-prompt") {
            if (++i >= argc) {
                invalid_param = true;
@@ -661,12 +600,6 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                break;
            }
            params.ppl_stride = std::stoi(argv[i]);
-        } else if (arg == "-ptc" || arg == "--print-token-count") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.n_print = std::stoi(argv[i]);
        } else if (arg == "--ppl-output-type") {
            if (++i >= argc) {
                invalid_param = true;
@@ -707,10 +640,6 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
        } else if (arg == "-h" || arg == "--help") {
            return false;

-        } else if (arg == "--version") {
-            fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
-            fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET);
-            exit(0);
        } else if (arg == "--random-prompt") {
            params.random_prompt = true;
        } else if (arg == "--in-prefix-bos") {
@@ -749,47 +678,6 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                std::istreambuf_iterator<char>(),
                std::back_inserter(sparams.grammar)
            );
-        } else if (arg == "--override-kv") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            char * sep = strchr(argv[i], '=');
-            if (sep == nullptr || sep - argv[i] >= 128) {
-                fprintf(stderr, "error: Malformed KV override: %s\n", argv[i]);
-                invalid_param = true;
-                break;
-            }
-            struct llama_model_kv_override kvo;
-            std::strncpy(kvo.key, argv[i], sep - argv[i]);
-            kvo.key[sep - argv[i]] = 0;
-            sep++;
-            if (strncmp(sep, "int:", 4) == 0) {
-                sep += 4;
-                kvo.tag = LLAMA_KV_OVERRIDE_INT;
-                kvo.int_value = std::atol(sep);
-            } else if (strncmp(sep, "float:", 6) == 0) {
-                sep += 6;
-                kvo.tag = LLAMA_KV_OVERRIDE_FLOAT;
-                kvo.float_value = std::atof(sep);
-            } else if (strncmp(sep, "bool:", 5) == 0) {
-                sep += 5;
-                kvo.tag = LLAMA_KV_OVERRIDE_BOOL;
-                if (std::strcmp(sep, "true") == 0) {
-                    kvo.bool_value = true;
-                } else if (std::strcmp(sep, "false") == 0) {
-                    kvo.bool_value = false;
-                } else {
-                    fprintf(stderr, "error: Invalid boolean value for KV override: %s\n", argv[i]);
-                    invalid_param = true;
-                    break;
-                }
-            } else {
-                fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
-                invalid_param = true;
-                break;
-            }
-            params.kv_overrides.push_back(kvo);
 #ifndef LOG_DISABLE_LOGS
        // Parse args for logging parameters
        } else if ( log_param_single_parse( argv[i] ) ) {
@@ -833,11 +721,6 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
        }
    }

-    if (!params.kv_overrides.empty()) {
-        params.kv_overrides.emplace_back(llama_model_kv_override());
-        params.kv_overrides.back().key[0] = 0;
-    }
-
    return true;
 }

@@ -849,7 +732,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("\n");
    printf("options:\n");
    printf("  -h, --help            show this help message and exit\n");
-    printf("  --version             show version and build info\n");
    printf("  -i, --interactive     run in interactive mode\n");
    printf("  --interactive-first   run in interactive mode and wait for input right away\n");
    printf("  -ins, --instruct      run in instruction mode (use with Alpaca models)\n");
@@ -863,10 +745,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("  -t N, --threads N     number of threads to use during generation (default: %d)\n", params.n_threads);
    printf("  -tb N, --threads-batch N\n");
    printf("                        number of threads to use during batch and prompt processing (default: same as --threads)\n");
-    printf("  -td N, --threads-draft N");
-    printf("                        number of threads to use during generation (default: same as --threads)");
-    printf("  -tbd N, --threads-batch-draft N\n");
-    printf("                        number of threads to use during batch and prompt processing (default: same as --threads-draft)\n");
    printf("  -p PROMPT, --prompt PROMPT\n");
    printf("                        prompt to start generation with (default: empty)\n");
    printf("  -e, --escape          process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n");
@@ -883,8 +761,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("  -n N, --n-predict N   number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
    printf("  -c N, --ctx-size N    size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
    printf("  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
-    printf("  --samplers            samplers that will be used for generation in the order, separated by \';\', for example: \"top_k;tfs;typical;top_p;min_p;temp\"\n");
-    printf("  --sampling-seq        simplified sequence for samplers that will be used (default: %s)\n", sparams.samplers_sequence.c_str());
    printf("  --top-k N             top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k);
    printf("  --top-p N             top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p);
    printf("  --min-p N             min-p sampling (default: %.1f, 0.0 = disabled)\n", (double)sparams.min_p);
@@ -922,6 +798,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("  --yarn-beta-fast N    YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast);
    printf("  --ignore-eos          ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
    printf("  --no-penalize-nl      do not penalize newline token\n");
+    printf("  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
+    printf("                        not recommended: doubles context memory required and no measurable increase in quality\n");
    printf("  --temp N              temperature (default: %.1f)\n", (double)sparams.temp);
    printf("  --logits-all          return logits for all tokens in the batch (default: disabled)\n");
    printf("  --hellaswag           compute HellaSwag score over random tasks from datafile supplied with -f\n");
@@ -950,30 +828,18 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("                        number of layers to store in VRAM\n");
    printf("  -ngld N, --n-gpu-layers-draft N\n");
    printf("                        number of layers to store in VRAM for the draft model\n");
-    printf("  -sm SPLIT_MODE, --split-mode SPLIT_MODE\n");
-    printf("                        how to split the model across multiple GPUs, one of:\n");
-    printf("                          - none: use one GPU only\n");
-    printf("                          - layer (default): split layers and KV across GPUs\n");
-    printf("                          - row: split rows across GPUs\n");
-    printf("  -ts SPLIT, --tensor-split SPLIT\n");
-    printf("                        fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n");
-    printf("  -mg i, --main-gpu i   the GPU to use for the model (with split-mode = none),\n");
-    printf("                        or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu);
+    printf("  -ts SPLIT --tensor-split SPLIT\n");
+    printf("                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
+    printf("  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n");
+#ifdef GGML_USE_CUBLAS
+    printf("  -nommq, --no-mul-mat-q\n");
+    printf("                        use " GGML_CUBLAS_NAME " instead of custom mul_mat_q " GGML_CUDA_NAME " kernels.\n");
+    printf("                        Not recommended since this is both slower and uses more VRAM.\n");
+#endif // GGML_USE_CUBLAS
 #endif
-    printf("  --verbose-prompt      print a verbose prompt before generation (default: %s)\n", params.verbose_prompt ? "true" : "false");
-    printf("  --no-display-prompt   don't print prompt at generation (default: %s)\n", !params.display_prompt ? "true" : "false");
-    printf("  -gan N, --grp-attn-n N\n");
-    printf("                        group-attention factor (default: %d)\n", params.grp_attn_n);
-    printf("  -gaw N, --grp-attn-w N\n");
-    printf("                        group-attention width (default: %.1f)\n", (double)params.grp_attn_w);
+    printf("  --verbose-prompt      print prompt before generation\n");
    printf("  -dkvc, --dump-kv-cache\n");
    printf("                        verbose print of the KV cache\n");
-    printf("  -nkvo, --no-kv-offload\n");
-    printf("                        disable KV offload\n");
-    printf("  -ctk TYPE, --cache-type-k TYPE\n");
-    printf("                        KV cache data type for K (default: %s)\n", params.cache_type_k.c_str());
-    printf("  -ctv TYPE, --cache-type-v TYPE\n");
-    printf("                        KV cache data type for V (default: %s)\n", params.cache_type_v.c_str());
    printf("  --simple-io           use basic IO for better compatibility in subprocesses and limited consoles\n");
    printf("  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");
    printf("  --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)\n");
@@ -981,14 +847,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("  -m FNAME, --model FNAME\n");
    printf("                        model path (default: %s)\n", params.model.c_str());
    printf("  -md FNAME, --model-draft FNAME\n");
-    printf("                        draft model for speculative decoding\n");
+    printf("                        draft model for speculative decoding (default: %s)\n", params.model.c_str());
    printf("  -ld LOGDIR, --logdir LOGDIR\n");
    printf("                        path under which to save YAML logs (no logging if unset)\n");
-    printf("  --override-kv KEY=TYPE:VALUE\n");
-    printf("                        advanced option to override model metadata by key. may be specified multiple times.\n");
-    printf("                        types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
-    printf("  -ptc N, --print-token-count N\n");
-    printf("                        print token count every N tokens (default: %d)\n", params.n_print);
    printf("\n");
 #ifndef LOG_DISABLE_LOGS
    log_print_usage();
@@ -1025,48 +886,6 @@ std::string gpt_random_prompt(std::mt19937 & rng) {
    GGML_UNREACHABLE();
 }

-//
-// String parsing
-//
-
-std::string parse_samplers_input(std::string input) {
-    std::string output = "";
-    // since samplers names are written multiple ways
-    // make it ready for both system names and input names
-    std::unordered_map<std::string, char> samplers_symbols {
-        {"top_k",      'k'},
-        {"top-k",      'k'},
-        {"top_p",      'p'},
-        {"top-p",      'p'},
-        {"nucleus",    'p'},
-        {"typical_p",  'y'},
-        {"typical-p",  'y'},
-        {"typical",    'y'},
-        {"min_p",      'm'},
-        {"min-p",      'm'},
-        {"tfs_z",      'f'},
-        {"tfs-z",      'f'},
-        {"tfs",        'f'},
-        {"temp",       't'},
-        {"temperature",'t'}
-    };
-    // expected format example: "temp;top_k;tfs_z;typical_p;top_p;min_p"
-    size_t separator = input.find(';');
-    while (separator != input.npos) {
-        std::string name = input.substr(0,separator);
-        input = input.substr(separator+1);
-        separator = input.find(';');
-
-        if (samplers_symbols.find(name) != samplers_symbols.end()) {
-            output += samplers_symbols[name];
-        }
-    }
-    if (samplers_symbols.find(input) != samplers_symbols.end()) {
-        output += samplers_symbols[input];
-    }
-    return output;
-}
-
 //
 // Model utils
 //
@@ -1078,46 +897,13 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
        mparams.n_gpu_layers = params.n_gpu_layers;
    }
    mparams.main_gpu        = params.main_gpu;
-    mparams.split_mode      = params.split_mode;
    mparams.tensor_split    = params.tensor_split;
    mparams.use_mmap        = params.use_mmap;
    mparams.use_mlock       = params.use_mlock;
-    if (params.kv_overrides.empty()) {
-        mparams.kv_overrides = NULL;
-    } else {
-        GGML_ASSERT(params.kv_overrides.back().key[0] == 0 && "KV overrides not terminated with empty key");
-        mparams.kv_overrides = params.kv_overrides.data();
-    }

    return mparams;
 }

-static ggml_type kv_cache_type_from_str(const std::string & s) {
-    if (s == "f32") {
-        return GGML_TYPE_F32;
-    }
-    if (s == "f16") {
-        return GGML_TYPE_F16;
-    }
-    if (s == "q8_0") {
-        return GGML_TYPE_Q8_0;
-    }
-    if (s == "q4_0") {
-        return GGML_TYPE_Q4_0;
-    }
-    if (s == "q4_1") {
-        return GGML_TYPE_Q4_1;
-    }
-    if (s == "q5_0") {
-        return GGML_TYPE_Q5_0;
-    }
-    if (s == "q5_1") {
-        return GGML_TYPE_Q5_1;
-    }
-
-    throw std::runtime_error("Invalid cache type: " + s);
-}
-
 struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
    auto cparams = llama_context_default_params();

@@ -1127,6 +913,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
    cparams.n_threads_batch   = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
    cparams.mul_mat_q         = params.mul_mat_q;
    cparams.seed              = params.seed;
+    cparams.f16_kv            = params.memory_f16;
    cparams.logits_all        = params.logits_all;
    cparams.embedding         = params.embedding;
    cparams.rope_scaling_type = params.rope_scaling_type;
@@ -1137,10 +924,6 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
    cparams.yarn_beta_fast    = params.yarn_beta_fast;
    cparams.yarn_beta_slow    = params.yarn_beta_slow;
    cparams.yarn_orig_ctx     = params.yarn_orig_ctx;
-    cparams.offload_kqv       = !params.no_kv_offload;
-
-    cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
-    cparams.type_v = kv_cache_type_from_str(params.cache_type_v);

    return cparams;
 }
@@ -1461,7 +1244,6 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
    fprintf(stream, "build_number: %d\n",        LLAMA_BUILD_NUMBER);
    fprintf(stream, "cpu_has_arm_fma: %s\n",     ggml_cpu_has_arm_fma()     ? "true" : "false");
    fprintf(stream, "cpu_has_avx: %s\n",         ggml_cpu_has_avx()         ? "true" : "false");
-    fprintf(stream, "cpu_has_avx_vnni: %s\n",    ggml_cpu_has_avx_vnni()    ? "true" : "false");
    fprintf(stream, "cpu_has_avx2: %s\n",        ggml_cpu_has_avx2()        ? "true" : "false");
    fprintf(stream, "cpu_has_avx512: %s\n",      ggml_cpu_has_avx512()      ? "true" : "false");
    fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
@@ -1554,6 +1336,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
    }
    fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
    fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
+    fprintf(stream, "memory_f32: %s # default: false\n", !params.memory_f16 ? "true" : "false");
    fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
    fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);
    fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta);
@@ -1607,7 +1390,6 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
    fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
    fprintf(stream, "typical_p: %f # default: 1.0\n", sparams.typical_p);
    fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
-    fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false");
 }

 //
--- a/common/common.h
+++ b/common/common.h
@@ -46,14 +46,12 @@ struct gpt_params {
    uint32_t seed                           = -1;    // RNG seed

    int32_t n_threads                       = get_num_physical_cores();
-    int32_t n_threads_draft                 = -1;
    int32_t n_threads_batch                 = -1;    // number of threads to use for batch processing (-1 = use n_threads)
-    int32_t n_threads_batch_draft           = -1;
    int32_t n_predict                       = -1;    // new tokens to predict
    int32_t n_ctx                           = 512;   // context size
    int32_t n_batch                         = 512;   // batch size for prompt processing (must be >=32 to use BLAS)
    int32_t n_keep                          = 0;     // number of tokens to keep from initial prompt
-    int32_t n_draft                         = 8;     // number of tokens to draft during speculative decoding
+    int32_t n_draft                         = 16;    // number of tokens to draft during speculative decoding
    int32_t n_chunks                        = -1;    // max number of chunks to process (-1 = unlimited)
    int32_t n_parallel                      = 1;     // number of parallel sequences to decode
    int32_t n_sequences                     = 1;     // number of sequences to decode
@@ -61,13 +59,9 @@ struct gpt_params {
    float   p_split                         = 0.1f;  // speculative decoding split probability
    int32_t n_gpu_layers                    = -1;    // number of layers to store in VRAM (-1 - use default)
    int32_t n_gpu_layers_draft              = -1;    // number of layers to store in VRAM for the draft model (-1 - use default)
-    llama_split_mode split_mode             = LLAMA_SPLIT_LAYER; // how to split the model across GPUs
    int32_t main_gpu                        = 0;     // the GPU that is used for scratch and small tensors
    float   tensor_split[LLAMA_MAX_DEVICES] = {0};   // how split tensors should be distributed across GPUs
    int32_t n_beams                         = 0;     // if non-zero then use beam search of given width.
-    int32_t grp_attn_n                      = 1;     // group-attention factor
-    int32_t grp_attn_w                      = 512;   // group-attention width
-    int32_t n_print                         = -1;    // print token count every n tokens (-1 = disabled)
    float   rope_freq_base                  = 0.0f;  // RoPE base frequency
    float   rope_freq_scale                 = 0.0f;  // RoPE frequency scaling factor
    float   yarn_ext_factor                 = -1.0f; // YaRN extrapolation mix factor
@@ -92,8 +86,6 @@ struct gpt_params {
    std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
    std::string logdir            = "";  // directory in which to save YAML log files

-    std::vector<llama_model_kv_override> kv_overrides;
-
    // TODO: avoid tuple, use struct
    std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
    std::string lora_base  = "";                              // base model path for the lora adapter
@@ -106,6 +98,7 @@ struct gpt_params {
    size_t hellaswag_tasks = 400;   // number of tasks to use when computing the HellaSwag score

    bool mul_mat_q         = true;  // if true, use mul_mat_q kernels instead of cuBLAS
+    bool memory_f16        = true;  // use f16 instead of f32 for memory kv
    bool random_prompt     = false; // do not randomize prompt if none provided
    bool use_color         = false; // use color to distinguish generations and inputs
    bool interactive       = false; // interactive mode
@@ -128,17 +121,12 @@ struct gpt_params {
    bool use_mlock         = false; // use mlock to keep model in memory
    bool numa              = false; // attempt optimizations that help on some NUMA systems
    bool verbose_prompt    = false; // print prompt tokens before generation
-    bool display_prompt    = true;  // print prompt before generation
    bool infill            = false; // use infill mode
    bool dump_kv_cache     = false; // dump the KV cache contents for debugging purposes
-    bool no_kv_offload     = false; // disable KV offloading
-
-    std::string cache_type_k = "f16"; // KV cache data type for the K
-    std::string cache_type_v = "f16"; // KV cache data type for the V

    // multimodal models (see examples/llava)
    std::string mmproj = ""; // path to multimodal projector
-    std::string image  = ""; // path to an image file
+    std::string image = ""; // path to an image file
 };

 bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params);
@@ -153,12 +141,6 @@ std::string gpt_random_prompt(std::mt19937 & rng);

 void process_escapes(std::string& input);

-//
-// String parsing
-//
-
-std::string parse_samplers_input(std::string input);
-
 //
 // Model utils
 //
--- a/common/grammar-parser.cpp
+++ b/common/grammar-parser.cpp
@@ -190,7 +190,7 @@ namespace grammar_parser {
                pos = parse_space(pos + 1, is_nested);
            } else if (*pos == '*' || *pos == '+' || *pos == '?') { // repetition operator
                if (last_sym_start == out_elements.size()) {
-                    throw std::runtime_error(std::string("expecting preceding item to */+/? at ") + pos);
+                    throw std::runtime_error(std::string("expecting preceeding item to */+/? at ") + pos);
                }

                // apply transformation to previous symbol (last_sym_start to end) according to
--- a/common/log.h
+++ b/common/log.h
@@ -61,13 +61,13 @@
 //  #define LOG_TARGET stderr
 //  #include "log.h"
 //
-//  The log target can also be redirected to a different function
+//  The log target can also be redirected to a diffrent function
 //  like so:
 //
-//  #define LOG_TARGET log_handler_different()
+//  #define LOG_TARGET log_handler_diffrent()
 //  #include "log.h"
 //
-//  FILE* log_handler_different()
+//  FILE* log_handler_diffrent()
 //  {
 //      return stderr;
 //  }
@@ -421,7 +421,7 @@ inline FILE *log_handler2_impl(bool change = false, LogTriState append = LogTriS

 // Disables logs entirely at runtime.
 //  Makes LOG() and LOG_TEE() produce no output,
-//  until enabled back.
+//  untill enabled back.
 #define log_disable() log_disable_impl()

 // INTERNAL, DO NOT USE
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -99,67 +99,21 @@ std::string llama_sampling_print(const llama_sampling_params & params) {
    return std::string(result);
 }

-std::string llama_sampling_order_print(const llama_sampling_params & params) {
-    std::string result = "CFG -> Penalties ";
-    if (params.mirostat == 0) {
-        for (auto s : params.samplers_sequence) {
-            switch (s) {
-                case 'k': result += "-> top_k "; break;
-                case 'f': result += "-> tfs_z "; break;
-                case 'y': result += "-> typical_p "; break;
-                case 'p': result += "-> top_p "; break;
-                case 'm': result += "-> min_p "; break;
-                case 't': result += "-> temp "; break;
-                default : break;
-            }
-        }
-    } else {
-        result += "-> mirostat ";
-    }
-
-    return result;
-}
-
-// no reasons to expose this function in header
-static void sampler_queue(
-                   struct llama_context * ctx_main,
-            const llama_sampling_params & params,
-                 llama_token_data_array & cur_p,
-                                 size_t & min_keep) {
-    const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
-
-    const float         temp              = params.temp;
-    const int32_t       top_k             = params.top_k <= 0 ? n_vocab : params.top_k;
-    const float         top_p             = params.top_p;
-    const float         min_p             = params.min_p;
-    const float         tfs_z             = params.tfs_z;
-    const float         typical_p         = params.typical_p;
-    const std::string & samplers_sequence = params.samplers_sequence;
-
-    for (auto s : samplers_sequence) {
-        switch (s){
-            case 'k': llama_sample_top_k    (ctx_main, &cur_p, top_k,     min_keep); break;
-            case 'f': llama_sample_tail_free(ctx_main, &cur_p, tfs_z,     min_keep); break;
-            case 'y': llama_sample_typical  (ctx_main, &cur_p, typical_p, min_keep); break;
-            case 'p': llama_sample_top_p    (ctx_main, &cur_p, top_p,     min_keep); break;
-            case 'm': llama_sample_min_p    (ctx_main, &cur_p, min_p,     min_keep); break;
-            case 't': llama_sample_temp     (ctx_main, &cur_p, temp); break;
-            default : break;
-        }
-    }
-}
-
-static llama_token llama_sampling_sample_impl(
+llama_token llama_sampling_sample(
                  struct llama_sampling_context * ctx_sampling,
                  struct llama_context * ctx_main,
                  struct llama_context * ctx_cfg,
-                  const int idx,
-                  bool is_resampling) {  // Add a parameter to indicate if we are resampling
+                  const int idx) {
    const llama_sampling_params & params = ctx_sampling->params;

    const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));

    const float   temp            = params.temp;
+    const int32_t top_k           = params.top_k <= 0 ? n_vocab : params.top_k;
+    const float   top_p           = params.top_p;
+    const float   min_p           = params.min_p;
+    const float   tfs_z           = params.tfs_z;
+    const float   typical_p       = params.typical_p;
    const int32_t penalty_last_n  = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n;
    const float   penalty_repeat  = params.penalty_repeat;
    const float   penalty_freq    = params.penalty_freq;
@@ -174,27 +128,13 @@ static llama_token llama_sampling_sample_impl(

    llama_token id = 0;

-    // Get a pointer to the logits
    float * logits = llama_get_logits_ith(ctx_main, idx);

-    // Declare original_logits at the beginning of the function scope
-    std::vector<float> original_logits;
-
-    if (!is_resampling) {
-        // Only make a copy of the original logits if we are not in the resampling phase, not sure if I actually have to do this.
-        original_logits = std::vector<float>(logits, logits + llama_n_vocab(llama_get_model(ctx_main)));
-    }
-
    // apply params.logit_bias map
    for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
        logits[it->first] += it->second;
    }

-    if (ctx_cfg) {
-        float * logits_guidance = llama_get_logits_ith(ctx_cfg, idx);
-        llama_sample_apply_guidance(ctx_main, logits, logits_guidance, params.cfg_scale);
-    }
-
    cur.clear();

    for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
@@ -203,15 +143,17 @@ static llama_token llama_sampling_sample_impl(

    llama_token_data_array cur_p = { cur.data(), cur.size(), false };

+    if (ctx_cfg) {
+        llama_sample_classifier_free_guidance(ctx_main, &cur_p, ctx_cfg, params.cfg_scale);
+    }
+
    // apply penalties
-    const auto& penalty_tokens = params.use_penalty_prompt_tokens ? params.penalty_prompt_tokens : prev;
-    const int penalty_tokens_used_size = std::min((int)penalty_tokens.size(), penalty_last_n);
-    if (penalty_tokens_used_size) {
+    if (!prev.empty()) {
        const float nl_logit = logits[llama_token_nl(llama_get_model(ctx_main))];

        llama_sample_repetition_penalties(ctx_main, &cur_p,
-                penalty_tokens.data() + penalty_tokens.size() - penalty_tokens_used_size,
-                penalty_tokens_used_size, penalty_repeat, penalty_freq, penalty_present);
+                prev.data() + prev.size() - penalty_last_n,
+                penalty_last_n, penalty_repeat, penalty_freq, penalty_present);

        if (!penalize_nl) {
            for (size_t idx = 0; idx < cur_p.size; idx++) {
@@ -223,8 +165,7 @@ static llama_token llama_sampling_sample_impl(
        }
    }

-    // If we are in the resampling phase, apply grammar checks before sampling logic
-    if (is_resampling && ctx_sampling->grammar != NULL) {
+    if (ctx_sampling->grammar != NULL) {
        llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar);
    }

@@ -247,7 +188,12 @@ static llama_token llama_sampling_sample_impl(
            // temperature sampling
            size_t min_keep = std::max(1, params.n_probs);

-            sampler_queue(ctx_main, params, cur_p, min_keep);
+            llama_sample_top_k    (ctx_main, &cur_p, top_k,     min_keep);
+            llama_sample_tail_free(ctx_main, &cur_p, tfs_z,     min_keep);
+            llama_sample_typical  (ctx_main, &cur_p, typical_p, min_keep);
+            llama_sample_top_p    (ctx_main, &cur_p, top_p,     min_keep);
+            llama_sample_min_p    (ctx_main, &cur_p, min_p,     min_keep);
+            llama_sample_temp     (ctx_main, &cur_p, temp);

            id = llama_sample_token(ctx_main, &cur_p);

@@ -266,40 +212,9 @@ static llama_token llama_sampling_sample_impl(
        }
    }

-    if (ctx_sampling->grammar != NULL && !is_resampling) {
-        // Create an array with a single token data element for the sampled id
-        llama_token_data single_token_data = {id, logits[id], 0.0f};
-        llama_token_data_array single_token_data_array = { &single_token_data, 1, false };
-
-        // Apply grammar constraints to the single token
-        llama_sample_grammar(ctx_main, &single_token_data_array, ctx_sampling->grammar);
-
-        // Check if the token is valid according to the grammar by seeing if its logit has been set to -INFINITY
-        bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
-
-        // If the token is not valid according to the grammar, perform resampling
-        if (!is_valid) {
-            LOG("Resampling because token %d: '%s' does not meet grammar rules\n", id, llama_token_to_piece(ctx_main, id).c_str());
-
-            // Restore logits from the copy
-            std::copy(original_logits.begin(), original_logits.end(), logits);
-
-            return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, true);  // Pass true for is_resampling
-        }
-    }
-
    return id;
 }

-llama_token llama_sampling_sample(
-                  struct llama_sampling_context * ctx_sampling,
-                  struct llama_context * ctx_main,
-                  struct llama_context * ctx_cfg,
-                  const int idx) {
-    // Call the implementation function with is_resampling set to false by default
-    return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, false);
-}
-
 void llama_sampling_accept(
        struct llama_sampling_context * ctx_sampling,
        struct llama_context * ctx_main,
--- a/common/sampling.h
+++ b/common/sampling.h
@@ -10,23 +10,22 @@

 // sampling parameters
 typedef struct llama_sampling_params {
-    int32_t     n_prev                = 64;       // number of previous tokens to remember
-    int32_t     n_probs               = 0;        // if greater than 0, output the probabilities of top n_probs tokens.
-    int32_t     top_k                 = 40;       // <= 0 to use vocab size
-    float       top_p                 = 0.95f;    // 1.0 = disabled
-    float       min_p                 = 0.05f;    // 0.0 = disabled
-    float       tfs_z                 = 1.00f;    // 1.0 = disabled
-    float       typical_p             = 1.00f;    // 1.0 = disabled
-    float       temp                  = 0.80f;    // 1.0 = disabled
-    int32_t     penalty_last_n        = 64;       // last n tokens to penalize (0 = disable penalty, -1 = context size)
-    float       penalty_repeat        = 1.10f;    // 1.0 = disabled
-    float       penalty_freq          = 0.00f;    // 0.0 = disabled
-    float       penalty_present       = 0.00f;    // 0.0 = disabled
-    int32_t     mirostat              = 0;        // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
-    float       mirostat_tau          = 5.00f;    // target entropy
-    float       mirostat_eta          = 0.10f;    // learning rate
-    bool        penalize_nl           = true;     // consider newlines as a repeatable token
-    std::string samplers_sequence     = "kfypmt"; // top_k, tail_free, typical_p, top_p, min_p, temp
+    int32_t n_prev            = 64;    // number of previous tokens to remember
+    int32_t n_probs           = 0;     // if greater than 0, output the probabilities of top n_probs tokens.
+    int32_t top_k             = 40;    // <= 0 to use vocab size
+    float   top_p             = 0.95f; // 1.0 = disabled
+    float   min_p             = 0.05f; // 0.0 = disabled
+    float   tfs_z             = 1.00f; // 1.0 = disabled
+    float   typical_p         = 1.00f; // 1.0 = disabled
+    float   temp              = 0.80f; // 1.0 = disabled
+    int32_t penalty_last_n    = 64;    // last n tokens to penalize (0 = disable penalty, -1 = context size)
+    float   penalty_repeat    = 1.10f; // 1.0 = disabled
+    float   penalty_freq      = 0.00f; // 0.0 = disabled
+    float   penalty_present   = 0.00f; // 0.0 = disabled
+    int32_t mirostat          = 0;     // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
+    float   mirostat_tau      = 5.00f; // target entropy
+    float   mirostat_eta      = 0.10f; // learning rate
+    bool    penalize_nl       = true;  // consider newlines as a repeatable token

    std::string grammar;  // optional BNF-like grammar to constrain sampling

@@ -36,9 +35,6 @@ typedef struct llama_sampling_params {
    float       cfg_scale     = 1.f; // how strong is guidance

    std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
-
-    std::vector<llama_token> penalty_prompt_tokens;
-    bool                     use_penalty_prompt_tokens = false;
 } llama_sampling_params;

 // general sampler context
@@ -84,9 +80,6 @@ std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama
 // Print sampling parameters into a string
 std::string llama_sampling_print(const llama_sampling_params & params);

-// Print sampling order into a string
-std::string llama_sampling_order_print(const llama_sampling_params & params);
-
 // this is a common sampling function used across the examples for convenience
 // it can serve as a starting point for implementing your own sampling function
 // Note: When using multiple sequences, it is the caller's responsibility to call
--- a/common/train.cpp
+++ b/common/train.cpp
@@ -71,7 +71,7 @@ void free_random_uniform_distribution(struct random_uniform_distribution * rnd)

 struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct random_normal_distribution * rnd) {
    float scale = 1.0f; // xavier
-    switch (ggml_n_dims(tensor)) {
+    switch (tensor->n_dims) {
        case 1:
            scale /= sqrtf((float) tensor->ne[0]);
            for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
@@ -119,7 +119,7 @@ struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct
 }

 struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struct random_uniform_distribution * rnd) {
-    switch (ggml_n_dims(tensor)) {
+    switch (tensor->n_dims) {
        case 1:
            for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
                float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0]);
@@ -183,27 +183,25 @@ float fclamp(const float v, const float min, const float max) {
 }

 void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) {
+    GGML_ASSERT(tensor->n_dims == 1);
    GGML_ASSERT(tensor->ne[0] == ne0);
-    GGML_ASSERT(tensor->ne[1] == 1);
-    GGML_ASSERT(tensor->ne[2] == 1);
-    GGML_ASSERT(tensor->ne[3] == 1);
 }

 void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) {
+    GGML_ASSERT(tensor->n_dims == 2);
    GGML_ASSERT(tensor->ne[0] == ne0);
    GGML_ASSERT(tensor->ne[1] == ne1);
-    GGML_ASSERT(tensor->ne[2] == 1);
-    GGML_ASSERT(tensor->ne[3] == 1);
 }

 void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) {
+    GGML_ASSERT(tensor->n_dims == 3);
    GGML_ASSERT(tensor->ne[0] == ne0);
    GGML_ASSERT(tensor->ne[1] == ne1);
    GGML_ASSERT(tensor->ne[2] == ne2);
-    GGML_ASSERT(tensor->ne[3] == 1);
 }

 void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
+    GGML_ASSERT(tensor->n_dims == 4);
    GGML_ASSERT(tensor->ne[0] == ne0);
    GGML_ASSERT(tensor->ne[1] == ne1);
    GGML_ASSERT(tensor->ne[2] == ne2);
@@ -227,8 +225,8 @@ int64_t get_example_targets_batch(
    bool                   sample_random_offsets
 ) {
    GGML_ASSERT(samples_count > 0);
-    GGML_ASSERT(ggml_is_matrix(tokens_input));
-    GGML_ASSERT(ggml_is_3d(target_probs));
+    GGML_ASSERT(tokens_input->n_dims  == 2);
+    GGML_ASSERT(target_probs->n_dims  == 3);
    int64_t n_vocab  = target_probs->ne[0];
    int64_t n_tokens = tokens_input->ne[0];
    int64_t n_batch  = tokens_input->ne[1];
@@ -1107,7 +1105,7 @@ void print_common_train_usage(int /*argc*/, char ** /*argv*/, const struct train
    fprintf(stderr, "  --sample-start STR         Sets the starting point for samples after the specified pattern. If empty use every token position as sample start. (default '%s')\n", params->sample_start.c_str());
    fprintf(stderr, "  --include-sample-start     Include the sample start in the samples. (default off)\n");
    fprintf(stderr, "  --escape                   process sample start escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n");
-    fprintf(stderr, "  --overlapping-samples      Samples may overlap, will include sample-start of second and following samples. When off, samples will end at begin of next sample. (default off)\n");
+    fprintf(stderr, "  --overlapping-samples      Samples my overlap, will include sample-start of second and following samples. When off, samples will end at begin of next sample. (default off)\n");
    fprintf(stderr, "  --fill-with-next-samples   Samples shorter than context length will be followed by the next (shuffled) samples. (default off)\n");
    fprintf(stderr, "  --separate-with-eos        When fill-with-next-samples, insert end-of-sequence token between samples.%s\n", params->separate_with_eos ? " (default)" : "");
    fprintf(stderr, "  --separate-with-bos        When fill-with-next-samples, insert begin-of-sequence token between samples.%s\n", params->separate_with_bos ? " (default)" : "");
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -10,7 +10,7 @@ import re
 import sys
 from enum import IntEnum
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, ContextManager, Iterator, cast, Optional
+from typing import TYPE_CHECKING, Any, ContextManager, Iterator, cast

 import numpy as np
 import torch
@@ -23,15 +23,6 @@ if 'NO_LOCAL_GGUF' not in os.environ:
 import gguf


-# check for any of the given keys in the dictionary and return the value of the first key found
-def get_key_opts(d, keys):
-    for k in keys:
-        if k in d:
-            return d[k]
-    print(f"Could not find any of {keys}")
-    sys.exit()
-
-
 ###### MODEL DEFINITIONS ######

 class SentencePieceTokenTypes(IntEnum):
@@ -55,7 +46,7 @@ class Model:
        self.part_names = self._get_part_names()
        self.hparams = Model.load_hparams(self.dir_model)
        self.model_arch = self._get_model_architecture()
-        self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=False)
+        self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess)

    def set_vocab(self):
        self._set_vocab_gpt2()
@@ -86,18 +77,8 @@ class Model:
            self.gguf_writer.add_embedding_length(n_embd)
        if (n_ff := self.hparams.get("intermediate_size")) is not None:
            self.gguf_writer.add_feed_forward_length(n_ff)
-        if (n_head := self.hparams.get("num_attention_heads")) is not None:
+        if (n_head := self.hparams.get("num_attention_head")) is not None:
            self.gguf_writer.add_head_count(n_head)
-        if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None:
-            self.gguf_writer.add_head_count_kv(n_head_kv)
-
-        if (n_rms_eps := self.hparams.get("rms_norm_eps")) is not None:
-            self.gguf_writer.add_layer_norm_rms_eps(n_rms_eps)
-        if (n_experts := self.hparams.get("num_local_experts")) is not None:
-            self.gguf_writer.add_expert_count(n_experts)
-        if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
-            self.gguf_writer.add_expert_used_count(n_experts_used)
-
        self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True))

    def write_tensors(self):
@@ -187,16 +168,6 @@ class Model:
            return PersimmonModel
        if model_architecture in ("StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"):
            return StableLMModel
-        if model_architecture == "QWenLMHeadModel":
-            return QwenModel
-        if model_architecture == "MixtralForCausalLM":
-            return MixtralModel
-        if model_architecture == "GPT2LMHeadModel":
-            return GPT2Model
-        if model_architecture == "PhiForCausalLM":
-            return Phi2Model
-        if model_architecture == "PlamoForCausalLM":
-            return PlamoModel
        return Model

    def _is_model_safetensors(self) -> bool:
@@ -232,16 +203,6 @@ class Model:
            return gguf.MODEL_ARCH.PERSIMMON
        if arch in ("StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"):
            return gguf.MODEL_ARCH.STABLELM
-        if arch == "QWenLMHeadModel":
-            return gguf.MODEL_ARCH.QWEN
-        if arch == "MixtralForCausalLM":
-            return gguf.MODEL_ARCH.LLAMA
-        if arch == "GPT2LMHeadModel":
-            return gguf.MODEL_ARCH.GPT2
-        if arch == "PhiForCausalLM":
-            return gguf.MODEL_ARCH.PHI2
-        if arch == "PlamoForCausalLM":
-            return gguf.MODEL_ARCH.PLAMO

        raise NotImplementedError(f'Architecture "{arch}" not supported!')

@@ -251,7 +212,7 @@ class Model:
        tokens: list[bytearray] = []
        toktypes: list[int] = []

-        from transformers import AutoTokenizer
+        from transformers import AutoTokenizer  # type: ignore[attr-defined]
        tokenizer = AutoTokenizer.from_pretrained(dir_model)
        vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
        assert max(tokenizer.vocab.values()) < vocab_size
@@ -266,11 +227,10 @@ class Model:
                toktypes.append(gguf.TokenType.USER_DEFINED)
            elif reverse_vocab[i] in added_vocab:
                tokens.append(reverse_vocab[i])
-                if hasattr(tokenizer, "added_tokens_decoder"):
-                    if tokenizer.added_tokens_decoder[i].special:
-                        toktypes.append(gguf.TokenType.CONTROL)
-                    else:
-                        toktypes.append(gguf.TokenType.USER_DEFINED)
+                if tokenizer.added_tokens_decoder[i].special:
+                    toktypes.append(gguf.TokenType.CONTROL)
+                else:
+                    toktypes.append(gguf.TokenType.USER_DEFINED)
            else:
                tokens.append(reverse_vocab[i])
                toktypes.append(gguf.TokenType.NORMAL)
@@ -478,11 +438,7 @@ class MPTModel(Model):
            data = data_torch.squeeze().numpy()

            # map tensor names
-            if "scales" in name:
-                new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias", ".scales"))
-                new_name = new_name.replace("scales", "act.scales")
-            else:
-                new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
+            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
                print(f"Can not map tensor {name!r}")
                sys.exit()
@@ -827,17 +783,10 @@ class PersimmonModel(Model):
        hidden_size = self.hparams["hidden_size"]

        self.gguf_writer.add_name('persimmon-8b-chat')
-        self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
        self.gguf_writer.add_embedding_length(hidden_size)
        self.gguf_writer.add_block_count(block_count)
        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
-
-        # NOTE: not sure about this change - why does the model not have a rope dimension count when it is smaller
-        #       than the head size?
-        #       ref: https://github.com/ggerganov/llama.cpp/pull/4889
-        # self.gguf_writer.add_rope_dimension_count(hidden_size // head_count)
-        self.gguf_writer.add_rope_dimension_count(hidden_size // head_count // 2)
-
+        self.gguf_writer.add_rope_dimension_count(hidden_size // head_count)
        self.gguf_writer.add_head_count(head_count)
        self.gguf_writer.add_head_count_kv(head_count_kv)
        self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"])
@@ -873,7 +822,7 @@ class StableLMModel(Model):
        hparams = self.hparams
        block_count = hparams["num_hidden_layers"]

-        self.gguf_writer.add_name(self.dir_model.name)
+        self.gguf_writer.add_name(dir_model.name)
        self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
        self.gguf_writer.add_block_count(block_count)
@@ -883,313 +832,15 @@ class StableLMModel(Model):
        self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
        self.gguf_writer.add_layer_norm_eps(1e-5)

-
-class MixtralModel(Model):
-    def set_vocab(self):
-        self._set_vocab_sentencepiece()
-
-
-class QwenModel(Model):
-    @staticmethod
-    def token_bytes_to_string(b):
-        from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
-        byte_encoder = bytes_to_unicode()
-        return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')])
-
-    @staticmethod
-    def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: Optional[int] = None) -> list[bytes]:
-        parts = [bytes([b]) for b in token]
-        while True:
-            min_idx = None
-            min_rank = None
-            for i, pair in enumerate(zip(parts[:-1], parts[1:])):
-                rank = mergeable_ranks.get(pair[0] + pair[1])
-                if rank is not None and (min_rank is None or rank < min_rank):
-                    min_idx = i
-                    min_rank = rank
-            if min_rank is None or (max_rank is not None and min_rank >= max_rank):
-                break
-            assert min_idx is not None
-            parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:]
-        return parts
-
-    def set_vocab(self):
-        dir_model = self.dir_model
-        hparams = self.hparams
-        tokens: list[bytearray] = []
-        toktypes: list[int] = []
-
-        from transformers import AutoTokenizer
-        tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
-        vocab_size = hparams["vocab_size"]
-        assert max(tokenizer.get_vocab().values()) < vocab_size
-
-        merges = []
-        vocab = {}
-        mergeable_ranks = tokenizer.mergeable_ranks
-        for token, rank in mergeable_ranks.items():
-            vocab[self.token_bytes_to_string(token)] = rank
-            if len(token) == 1:
-                continue
-            merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
-            assert len(merged) == 2
-            merges.append(' '.join(map(self.token_bytes_to_string, merged)))
-
-        reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in vocab.items()}
-        added_vocab = tokenizer.special_tokens
-
-        for i in range(vocab_size):
-            if i not in reverse_vocab:
-                pad_token = f"[PAD{i}]".encode("utf-8")
-                tokens.append(bytearray(pad_token))
-                toktypes.append(gguf.TokenType.USER_DEFINED)
-            elif reverse_vocab[i] in added_vocab:
-                tokens.append(reverse_vocab[i])
-                toktypes.append(gguf.TokenType.CONTROL)
-            else:
-                tokens.append(reverse_vocab[i])
-                toktypes.append(gguf.TokenType.NORMAL)
-
-        self.gguf_writer.add_tokenizer_model("gpt2")
-        self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_types(toktypes)
-
-        special_vocab = gguf.SpecialVocab(dir_model, load_merges=False)
-        special_vocab.merges = merges
-        special_vocab._set_special_token("bos", tokenizer.special_tokens["<|endoftext|>"])
-        special_vocab._set_special_token("eos", tokenizer.special_tokens["<|endoftext|>"])
-        special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"])
-        special_vocab.add_to_gguf(self.gguf_writer)
-
-    def set_gguf_parameters(self):
-        self.gguf_writer.add_name("Qwen")
-        self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
-        self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"])
-        self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
-        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
-        self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])
-        self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
-        self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
-        self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
-
-    def write_tensors(self):
-        block_count = self.hparams["num_hidden_layers"]
-        model_kv = dict(self.get_tensors())
-        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
-        for name, data_torch in model_kv.items():
-            # we don't need these
-            if name.endswith(".rotary_emb.inv_freq"):
-                continue
-
-            old_dtype = data_torch.dtype
-
-            # convert any unsupported data types to float32
-            if data_torch.dtype not in (torch.float16, torch.float32):
-                data_torch = data_torch.to(torch.float32)
-
-            data = data_torch.squeeze().numpy()
-
-            # map tensor names
-            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
-            if new_name is None:
-                print(f"Can not map tensor {name!r}")
-                sys.exit()
-
-            n_dims = len(data.shape)
-            data_dtype = data.dtype
-
-            # if f32 desired, convert any float16 to float32
-            if self.ftype == 0 and data_dtype == np.float16:
-                data = data.astype(np.float32)
-
-            # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
-            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
-                data = data.astype(np.float32)
-
-            # if f16 desired, convert any float32 2-dim weight tensors to float16
-            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
-                data = data.astype(np.float16)
-
-            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
-            self.gguf_writer.add_tensor(new_name, data)
-
-
-class GPT2Model(Model):
-    def set_gguf_parameters(self):
-        self.gguf_writer.add_name(self.dir_model.name)
-        self.gguf_writer.add_block_count(self.hparams["n_layer"])
-        self.gguf_writer.add_context_length(self.hparams["n_ctx"])
-        self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
-        self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
-        self.gguf_writer.add_head_count(self.hparams["n_head"])
-        self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
-        self.gguf_writer.add_file_type(self.ftype)
-
-    def write_tensors(self):
-        block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
-        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
-
-        for name, data_torch in self.get_tensors():
-            # we don't need these
-            if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq", ".attn.bias")):
-                continue
-
-            if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_proj.weight")):
-                data_torch = data_torch.transpose(1, 0)
-
-            old_dtype = data_torch.dtype
-
-            # convert any unsupported data types to float32
-            if data_torch.dtype not in (torch.float16, torch.float32):
-                data_torch = data_torch.to(torch.float32)
-
-            data = data_torch.squeeze().numpy()
-
-            # map tensor names
-            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
-            if new_name is None:
-                print(f"Can not map tensor {name!r}")
-                sys.exit()
-
-            n_dims = len(data.shape)
-            data_dtype = data.dtype
-
-            # if f32 desired, convert any float16 to float32
-            if self.ftype == 0 and data_dtype == np.float16:
-                data = data.astype(np.float32)
-
-            # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
-            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
-                data = data.astype(np.float32)
-
-            # if f16 desired, convert any float32 2-dim weight tensors to float16
-            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
-                data = data.astype(np.float16)
-
-            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
-
-            self.gguf_writer.add_tensor(new_name, data)
-
-            # note: GPT2 output is tied to (same as) wte in original model
-            if new_name == "token_embd.weight":
-                print(f"output.weight, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
-                self.gguf_writer.add_tensor("output.weight", data)
-
-
-class Phi2Model(Model):
-    def set_gguf_parameters(self):
-        block_count = get_key_opts(self.hparams, ["num_hidden_layers", "n_layer"])
-
-        rot_pct = get_key_opts(self.hparams, ["partial_rotary_factor"])
-        n_embd = get_key_opts(self.hparams, ["hidden_size", "n_embd"])
-        n_head = get_key_opts(self.hparams, ["num_attention_heads", "n_head"])
-
-        self.gguf_writer.add_name("Phi2")
-        self.gguf_writer.add_context_length(get_key_opts(self.hparams, ["n_positions", "max_position_embeddings"]))
-
-        self.gguf_writer.add_embedding_length(n_embd)
-        self.gguf_writer.add_feed_forward_length(4 * n_embd)
-        self.gguf_writer.add_block_count(block_count)
-        self.gguf_writer.add_head_count(n_head)
-        self.gguf_writer.add_head_count_kv(n_head)
-        self.gguf_writer.add_layer_norm_eps(get_key_opts(self.hparams, ["layer_norm_epsilon", "layer_norm_eps"]))
-        self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head)
-        self.gguf_writer.add_file_type(self.ftype)
-        self.gguf_writer.add_add_bos_token(False)
-
-
-class PlamoModel(Model):
-    def set_vocab(self):
-        self._set_vocab_sentencepiece()
-
-    def set_gguf_parameters(self):
-        hparams = self.hparams
-        block_count = hparams["num_hidden_layers"]
-
-        self.gguf_writer.add_name("PLaMo")
-        self.gguf_writer.add_context_length(4096)  # not in config.json
-        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
-        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
-        self.gguf_writer.add_block_count(block_count)
-        self.gguf_writer.add_head_count(hparams["num_attention_heads"])
-        self.gguf_writer.add_head_count_kv(5)  # hparams["num_key_value_heads"]) is wrong
-        self.gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
-
-    def shuffle_attn_q_weight(self, data_torch):
-        assert data_torch.size() == (5120, 5120)
-        data_torch = data_torch.reshape(8, 5, 128, 5120)
-        data_torch = torch.permute(data_torch, (1, 0, 2, 3))
-        data_torch = torch.reshape(data_torch, (5120, 5120))
-        return data_torch
-
-    def shuffle_attn_output_weight(self, data_torch):
-        assert data_torch.size() == (5120, 5120)
-        data_torch = data_torch.reshape(5120, 8, 5, 128)
-        data_torch = torch.permute(data_torch, (0, 2, 1, 3))
-        data_torch = torch.reshape(data_torch, (5120, 5120))
-        return data_torch
-
-    def write_tensors(self):
-        block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers"))
-        tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
-
-        for name, data_torch in self.get_tensors():
-            if "self_attn.rotary_emb.inv_freq" in name:
-                continue
-
-            # map tensor names
-            new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
-            if new_name is None:
-                print(f"Can not map tensor {name!r}")
-                sys.exit()
-
-            # shuffle for broadcasting of gqa in ggml_mul_mat
-            if new_name.endswith("attn_q.weight"):
-                data_torch = self.shuffle_attn_q_weight(data_torch)
-            elif new_name.endswith("attn_output.weight"):
-                data_torch = self.shuffle_attn_output_weight(data_torch)
-
-            old_dtype = data_torch.dtype
-
-            # convert any unsupported data types to float32
-            if data_torch.dtype not in (torch.float16, torch.float32):
-                data_torch = data_torch.to(torch.float32)
-
-            data = data_torch.squeeze().numpy()
-
-            n_dims = len(data.shape)
-            data_dtype = data.dtype
-
-            # if f32 desired, convert any float16 to float32
-            if self.ftype == 0 and data_dtype == np.float16:
-                data = data.astype(np.float32)
-
-            # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
-            if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
-                data = data.astype(np.float32)
-
-            # if f16 desired, convert any float32 2-dim weight tensors to float16
-            if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
-                data = data.astype(np.float16)
-
-            print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
-
-            self.gguf_writer.add_tensor(new_name, data)
-
-
 ###### CONVERSION LOGIC ######


 def parse_args() -> argparse.Namespace:
-    parser = argparse.ArgumentParser(
-        description="Convert a huggingface model to a GGML compatible file")
+    parser = argparse.ArgumentParser(description="Convert a huggingface model to a GGML compatible file")
    parser.add_argument(
        "--vocab-only", action="store_true",
        help="extract only the vocab",
    )
-    parser.add_argument(
-        "--awq-path", type=Path, default=None,
-        help="Path to scale awq cache file")
    parser.add_argument(
        "--outfile", type=Path,
        help="path to write to; default: based on input",
@@ -1207,62 +858,43 @@ def parse_args() -> argparse.Namespace:
    return parser.parse_args()


-def main() -> None:
-    args = parse_args()
+args = parse_args()

-    dir_model = args.model
+dir_model = args.model
+if not dir_model.is_dir():
+    print(f'Error: {args.model} is not a directory', file=sys.stderr)
+    sys.exit(1)

-    if args.awq_path:
-        sys.path.insert(1, str(Path(__file__).parent / 'awq-py'))
-        from awq.apply_awq import add_scale_weights
-        tmp_model_path = args.model / "weighted_model"
-        dir_model = tmp_model_path
-        if tmp_model_path.is_dir():
-            print(f"{tmp_model_path} exists as a weighted model.")
-        else:
-            tmp_model_path.mkdir(parents=True, exist_ok=True)
-            print("Saving new weighted model ...")
-            add_scale_weights(str(args.model), str(args.awq_path), str(tmp_model_path))
-            print(f"Saved weighted model at {tmp_model_path}.")
+ftype_map = {
+    "f32": gguf.GGMLQuantizationType.F32,
+    "f16": gguf.GGMLQuantizationType.F16,
+}

-    if not dir_model.is_dir():
-        print(f'Error: {args.model} is not a directory', file=sys.stderr)
-        sys.exit(1)
+if args.outfile is not None:
+    fname_out = args.outfile
+else:
+    # output in the same directory as the model by default
+    fname_out = dir_model / f'ggml-model-{args.outtype}.gguf'

-    ftype_map = {
-        "f32": gguf.GGMLQuantizationType.F32,
-        "f16": gguf.GGMLQuantizationType.F16,
-    }
+print(f"Loading model: {dir_model.name}")

-    if args.outfile is not None:
-        fname_out = args.outfile
+hparams = Model.load_hparams(dir_model)
+
+with torch.inference_mode():
+    model_class = Model.from_model_architecture(hparams["architectures"][0])
+    model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian)
+
+    print("Set model parameters")
+    model_instance.set_gguf_parameters()
+
+    print("Set model tokenizer")
+    model_instance.set_vocab()
+
+    if args.vocab_only:
+        print(f"Exporting model vocab to '{fname_out}'")
+        model_instance.write_vocab()
    else:
-        # output in the same directory as the model by default
-        fname_out = dir_model / f'ggml-model-{args.outtype}.gguf'
+        print(f"Exporting model to '{fname_out}'")
+        model_instance.write()

-    print(f"Loading model: {dir_model.name}")
-
-    hparams = Model.load_hparams(dir_model)
-
-    with torch.inference_mode():
-        model_class = Model.from_model_architecture(hparams["architectures"][0])
-        model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian)
-
-        print("Set model parameters")
-        model_instance.set_gguf_parameters()
-
-        print("Set model tokenizer")
-        model_instance.set_vocab()
-
-        if args.vocab_only:
-            print(f"Exporting model vocab to '{fname_out}'")
-            model_instance.write_vocab()
-        else:
-            print(f"Exporting model to '{fname_out}'")
-            model_instance.write()
-
-        print(f"Model successfully exported to '{fname_out}'")
-
-
-if __name__ == '__main__':
-    main()
+    print(f"Model successfully exported to '{fname_out}'")
--- a/convert-lora-to-ggml.py
+++ b/convert-lora-to-ggml.py
@@ -3,6 +3,7 @@ from __future__ import annotations

 import json
 import os
+import re
 import struct
 import sys
 from typing import Any, BinaryIO, Sequence
@@ -10,15 +11,43 @@ from typing import Any, BinaryIO, Sequence
 import numpy as np
 import torch

-from pathlib import Path
-if 'NO_LOCAL_GGUF' not in os.environ:
-    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
-import gguf
-
-
 NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1}


+HF_SUBLAYER_TO_GGML = {
+    "self_attn.q_proj": "attn_q",
+    "self_attn.k_proj": "attn_k",
+    "self_attn.v_proj": "attn_v",
+    "self_attn.o_proj": "attn_output",
+    "mlp.gate_proj": "ffn_gate",
+    "mlp.down_proj": "ffn_down",
+    "mlp.up_proj": "ffn_up",
+    "input_layernorm": "attn_norm",
+    "post_attention_layernorm": "ffn_norm",
+}
+
+
+def translate_tensor_name(t: str) -> str:
+    match = re.match(r".*layers\.(\d+)\.(\w+\.\w+)\.lora_(A|B)\.weight", t)
+    if match:
+        nn = match.group(1)
+        sub_layer = match.group(2)
+        lora_type = match.group(3)
+
+        sub_layer_renamed = HF_SUBLAYER_TO_GGML.get(sub_layer)
+        if sub_layer_renamed is None:
+            print(f"Error: unrecognized sub-layer {sub_layer} in tensor {t}")
+            sys.exit(1)
+
+        output_string = (
+            f"blk.{nn}.{HF_SUBLAYER_TO_GGML[sub_layer]}.weight.lora{lora_type}"
+        )
+        return output_string
+    else:
+        print(f"Error: unrecognized tensor {t}")
+        sys.exit(1)
+
+
 def write_file_header(fout: BinaryIO, params: dict[str, Any]) -> None:
    fout.write(b"ggla"[::-1])  # magic (ggml lora)
    fout.write(struct.pack("i", 1))  # file version
@@ -32,7 +61,9 @@ def write_file_header(fout: BinaryIO, params: dict[str, Any]) -> None:
    fout.write(struct.pack("i", int(params["lora_alpha"])))


-def write_tensor_header(fout: BinaryIO, name: str, shape: Sequence[int], data_type: np.dtype[Any]) -> None:
+def write_tensor_header(
+    self, name: str, shape: Sequence[int], data_type: np.dtype[Any]
+) -> None:
    sname = name.encode("utf-8")
    fout.write(
        struct.pack(
@@ -47,96 +78,60 @@ def write_tensor_header(fout: BinaryIO, name: str, shape: Sequence[int], data_ty
    fout.seek((fout.tell() + 31) & -32)


-if __name__ == '__main__':
-    if len(sys.argv) < 2:
-        print(f"Usage: python {sys.argv[0]} <path> [arch]")
-        print(
-            "Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'"
-        )
-        print(f"Arch must be one of {list(gguf.MODEL_ARCH_NAMES.values())} (default: llama)")
-        sys.exit(1)
+if len(sys.argv) != 2:
+    print(f"Usage: python {sys.argv[0]} <path>")
+    print(
+        "Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'"
+    )
+    sys.exit(1)

-    input_json = os.path.join(sys.argv[1], "adapter_config.json")
-    input_model = os.path.join(sys.argv[1], "adapter_model.bin")
-    output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin")
+input_json = os.path.join(sys.argv[1], "adapter_config.json")
+input_model = os.path.join(sys.argv[1], "adapter_model.bin")
+output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin")

-    model = torch.load(input_model, map_location="cpu")
-    arch_name = sys.argv[2] if len(sys.argv) == 3 else "llama"
+model = torch.load(input_model, map_location="cpu")

-    if arch_name not in gguf.MODEL_ARCH_NAMES.values():
-        print(f"Error: unsupported architecture {arch_name}")
-        sys.exit(1)
+with open(input_json, "r") as f:
+    params = json.load(f)

-    arch = list(gguf.MODEL_ARCH_NAMES.keys())[list(gguf.MODEL_ARCH_NAMES.values()).index(arch_name)]
-    name_map = gguf.TensorNameMap(arch, 200) # 200 layers ought to be enough for anyone
+if params["peft_type"] != "LORA":
+    print(f"Error: unsupported adapter type {params['peft_type']}, expected LORA")
+    sys.exit(1)

-    with open(input_json, "r") as f:
-        params = json.load(f)
+if params["fan_in_fan_out"] is True:
+    print("Error: param fan_in_fan_out is not supported")
+    sys.exit(1)

-    if params["peft_type"] != "LORA":
-        print(f"Error: unsupported adapter type {params['peft_type']}, expected LORA")
-        sys.exit(1)
+if params["bias"] is not None and params["bias"] != "none":
+    print("Error: param bias is not supported")
+    sys.exit(1)

-    if params["fan_in_fan_out"] is True:
-        print("Error: param fan_in_fan_out is not supported")
-        sys.exit(1)
+# TODO: these seem to be layers that have been trained but without lora.
+# doesn't seem widely used but eventually should be supported
+if params["modules_to_save"] is not None and len(params["modules_to_save"]) > 0:
+    print("Error: param modules_to_save is not supported")
+    sys.exit(1)

-    if params["bias"] is not None and params["bias"] != "none":
-        print("Error: param bias is not supported")
-        sys.exit(1)
+with open(output_path, "wb") as fout:
+    fout.truncate()

-    # TODO: these seem to be layers that have been trained but without lora.
-    # doesn't seem widely used but eventually should be supported
-    if params["modules_to_save"] is not None and len(params["modules_to_save"]) > 0:
-        print("Error: param modules_to_save is not supported")
-        sys.exit(1)
-
-    with open(output_path, "wb") as fout:
-        fout.truncate()
-
-        write_file_header(fout, params)
-        for k, v in model.items():
-            orig_k = k
-            if k.endswith(".default.weight"):
-                k = k.replace(".default.weight", ".weight")
-            if k in ["llama_proj.weight", "llama_proj.bias"]:
-                continue
-            if k.endswith("lora_A.weight"):
-                if v.dtype != torch.float16 and v.dtype != torch.float32:
-                    v = v.float()
-                v = v.T
-            else:
+    write_file_header(fout, params)
+    for k, v in model.items():
+        if k.endswith(".default.weight"):
+            k = k.replace(".default.weight", ".weight")
+        if k in ["llama_proj.weight", "llama_proj.bias"]:
+            continue
+        if k.endswith("lora_A.weight"):
+            if v.dtype != torch.float16 and v.dtype != torch.float32:
                v = v.float()
+            v = v.T
+        else:
+            v = v.float()

-            t = v.detach().numpy()
+        t = v.detach().numpy()
+        tname = translate_tensor_name(k)
+        print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB")
+        write_tensor_header(fout, tname, t.shape, t.dtype)
+        t.tofile(fout)

-            prefix = "base_model.model."
-            if k.startswith(prefix):
-                k = k[len(prefix) :]
-
-            lora_suffixes = (".lora_A.weight", ".lora_B.weight")
-            if k.endswith(lora_suffixes):
-                suffix = k[-len(lora_suffixes[0]):]
-                k = k[: -len(lora_suffixes[0])]
-            else:
-                print(f"Error: unrecognized tensor name {orig_k}")
-                sys.exit(1)
-
-            tname = name_map.get_name(k)
-            if tname is None:
-                print(f"Error: could not map tensor name {orig_k}")
-                print(" Note: the arch parameter must be specified if the model is not llama")
-                sys.exit(1)
-
-            if suffix == ".lora_A.weight":
-                tname += ".weight.loraA"
-            elif suffix == ".lora_B.weight":
-                tname += ".weight.loraB"
-            else:
-                assert False
-
-            print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB")
-            write_tensor_header(fout, tname, t.shape, t.dtype)
-            t.tofile(fout)
-
-    print(f"Converted {input_json} and {input_model} to {output_path}")
+print(f"Converted {input_json} and {input_model} to {output_path}")
--- a/convert-persimmon-to-gguf.py
+++ b/convert-persimmon-to-gguf.py
@@ -1,4 +1,3 @@
-#!/usr/bin/env python3
 import torch
 import os
 from pprint import pprint
--- a/convert.py
+++ b/convert.py
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -31,12 +31,12 @@ else()
    add_subdirectory(quantize-stats)
    add_subdirectory(save-load-state)
    add_subdirectory(simple)
-    add_subdirectory(passkey)
    add_subdirectory(speculative)
    add_subdirectory(lookahead)
-    add_subdirectory(lookup)
    add_subdirectory(train-text-from-scratch)
-    add_subdirectory(imatrix)
+    if (LLAMA_METAL)
+        add_subdirectory(metal)
+    endif()
    if (LLAMA_BUILD_SERVER)
        add_subdirectory(server)
    endif()
--- a/examples/baby-llama/baby-llama.cpp
+++ b/examples/baby-llama/baby-llama.cpp
@@ -575,7 +575,10 @@ static struct ggml_tensor * forward(

            // KQ_scaled = KQ / sqrt(n_embd/n_head)
            // KQ_scaled shape [n_past + N, N, n_head, 1]
-            struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, 1.0f/sqrtf(float(n_embd)/n_head));
+            struct ggml_tensor * KQ_scaled =
+                ggml_scale(ctx0,
+                        KQ,
+                        ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)));

            // KQ_masked = mask_past(KQ_scaled)
            // KQ_masked shape [n_past + N, N, n_head, 1]
@@ -841,7 +844,10 @@ static struct ggml_tensor * forward_batch(

            // KQ_scaled = KQ / sqrt(n_embd/n_head)
            // KQ_scaled shape [n_past + N, N, n_head, n_batch]
-            struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, 1.0f/sqrtf(float(n_embd)/n_head));
+            struct ggml_tensor * KQ_scaled =
+                ggml_scale(ctx0,
+                        KQ,
+                        ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)));
            assert_shape_4d(KQ_scaled, n_past + N, N, n_head, n_batch);

            // KQ_masked = mask_past(KQ_scaled)
@@ -1125,7 +1131,10 @@ static struct ggml_tensor * forward_lora(

            // KQ_scaled = KQ / sqrt(n_embd/n_head)
            // KQ_scaled shape [n_past + N, N, n_head, 1]
-            struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, 1.0f/sqrtf(float(n_embd)/n_head));
+            struct ggml_tensor * KQ_scaled =
+                ggml_scale(ctx0,
+                        KQ,
+                        ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)));

            // KQ_masked = mask_past(KQ_scaled)
            // KQ_masked shape [n_past + N, N, n_head, 1]
@@ -1249,9 +1258,9 @@ static struct ggml_tensor * forward_lora(
 }

 static void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) {
-    assert(ggml_is_matrix(logits));
-    assert(ggml_is_matrix(probs));
-    assert(ggml_is_vector(best_samples));
+    assert(logits->n_dims == 2);
+    assert(probs->n_dims == 2);
+    assert(best_samples->n_dims == 1);
    assert(logits->ne[1] == best_samples->ne[0]);
    assert(logits->ne[0] == probs->ne[0]);
    assert(logits->ne[1] == probs->ne[1]);
@@ -1283,9 +1292,9 @@ static void sample_softmax_batch(
    struct ggml_context * ctx, struct ggml_tensor * logits, struct ggml_tensor * probs,
    struct ggml_tensor * best_samples
 ) {
-    GGML_ASSERT(ggml_is_matrix(best_samples));
-    GGML_ASSERT(ggml_is_3d(logits));
-    GGML_ASSERT(ggml_is_3d(probs));
+    GGML_ASSERT(best_samples->n_dims == 2);
+    GGML_ASSERT(logits->n_dims == 3);
+    GGML_ASSERT(probs->n_dims == 3);
    int n_tokens = best_samples->ne[0];
    int n_batch  = best_samples->ne[1];
    int n_vocab  = logits->ne[0];
@@ -1325,7 +1334,7 @@ static void print_row(struct ggml_tensor * probs, int i) {
 }

 static void print_matrix(struct ggml_tensor * probs) {
-    assert(ggml_is_matrix(probs));
+    assert(probs->n_dims == 2);
    for (int i = 0; i < probs->ne[1]; ++i) {
        for (int k = 0; k < probs->ne[0]; ++k) {
            float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k);
@@ -1377,8 +1386,8 @@ static void get_example_targets(int example_id, struct ggml_tensor * tokens_inpu
 static void get_example_targets_batch(
    struct ggml_context * ctx, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets
 ) {
-    GGML_ASSERT(ggml_is_matrix(tokens_input));
-    GGML_ASSERT(ggml_is_3d(targets));
+    GGML_ASSERT(tokens_input->n_dims == 2);
+    GGML_ASSERT(     targets->n_dims == 3);
    int n_tokens = tokens_input->ne[0];
    int n_batch  = tokens_input->ne[1];
    GGML_ASSERT(n_tokens == targets->ne[1]);
--- a/examples/base-translate.sh
+++ b/examples/base-translate.sh
@@ -1,61 +0,0 @@
-#!/bin/bash
-#
-# Few-shot translation example.
-# Requires a base model (i.e. no fine-tuned or instruct models).
-#
-# Usage:
-#
-#   cd llama.cpp
-#   make -j
-#
-#   ./examples/base-translate.sh <model-base> "<text>" [extra-main-args]
-#
-
-if [ $# -lt 2 ]; then
-  echo "Usage: ./base-translate.sh <model-base> \"<text>\" [extra-main-args]"
-  exit 1
-fi
-
-eargs=""
-if [ $# -gt 2 ]; then
-  eargs="${@:3}"
-fi
-
-ftmp="__llama.cpp_example_tmp__.txt"
-trap "rm -f $ftmp" EXIT
-
-echo "Translate from English to French:
-
-===
-
-sea otter, peppermint, plush girafe:
-
-sea otter => loutre de mer
-peppermint => menthe poivrée
-plush girafe => girafe peluche
-
-===
-
-violin
-
-violin => violon
-
-===
-
-phone, computer, mouse, keyboard:
-
-phone => téléphone
-computer => ordinateur
-mouse => souris
-keyboard => clavier
-
-===
-" > $ftmp
-
-echo "$2
-" >> $ftmp
-
-model=$1
-
-# generate the most likely continuation until the string "===" is found
-./main -m $model -f $ftmp -n 64 --temp 0 --repeat-penalty 1.0 --no-penalize-nl -r "===" $eargs
--- a/examples/batched-bench/batched-bench.cpp
+++ b/examples/batched-bench/batched-bench.cpp
@@ -88,10 +88,7 @@ int main(int argc, char ** argv) {

    llama_model_params model_params = llama_model_default_params();

-    const std::vector<float> t_split (LLAMA_MAX_DEVICES, 0.0f);
-
    model_params.n_gpu_layers = n_gpu_layers;
-    model_params.tensor_split = t_split.data();

    llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);

@@ -158,7 +155,7 @@ int main(int argc, char ** argv) {
    }

    LOG_TEE("\n");
-    LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, mmq = %d, n_threads = %d, n_threads_batch = %d\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, mmq, ctx_params.n_threads, ctx_params.n_threads_batch);
+    LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, mmq = %d\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, mmq);
    LOG_TEE("\n");

    LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP",     "TG",     "B",    "N_KV",     "T_PP s",   "S_PP t/s", "T_TG s",   "S_TG t/s", "T s",      "S t/s");
--- a/examples/batched.swift/Sources/main.swift
+++ b/examples/batched.swift/Sources/main.swift
@@ -215,10 +215,9 @@ print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end
 llama_print_timings(context)

 private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
-    let utf8Count = text.utf8.count
-    let n_tokens = utf8Count + (add_bos ? 1 : 0)
+    let n_tokens = text.count + (add_bos ? 1 : 0)
    let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
-    let tokenCount = llama_tokenize(model, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, /*special tokens*/ false)
+    let tokenCount = llama_tokenize(model, text, Int32(text.count), tokens, Int32(n_tokens), add_bos, /*special tokens*/ false)
    var swiftTokens: [llama_token] = []
    for i in 0 ..< tokenCount {
        swiftTokens.append(tokens[Int(i)])
@@ -231,15 +230,18 @@ private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String
    var result = [CChar](repeating: 0, count: 8)
    let nTokens = llama_token_to_piece(model, token, &result, Int32(result.count))
    if nTokens < 0 {
-        let actualTokensCount = -Int(nTokens)
-        result = .init(repeating: 0, count: actualTokensCount)
+        if result.count >= -Int(nTokens) {
+            result.removeLast(-Int(nTokens))
+        } else {
+            result.removeAll()
+        }
        let check = llama_token_to_piece(
            model,
            token,
            &result,
            Int32(result.count)
        )
-        assert(check == actualTokensCount)
+        assert(check == nTokens)
    } else {
        result.removeLast(result.count - Int(nTokens))
    }
@@ -257,4 +259,5 @@ private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String
        buffer = []
        return bufferString
    }
+    return nil
 }
--- a/examples/batched/batched.cpp
+++ b/examples/batched/batched.cpp
@@ -69,7 +69,6 @@ int main(int argc, char ** argv) {

    std::vector<llama_token> tokens_list;
    tokens_list = ::llama_tokenize(model, params.prompt, true);
-
    const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size())*n_parallel;

    // initialize the context
--- a/examples/benchmark/benchmark-matmult.cpp
+++ b/examples/benchmark/benchmark-matmult.cpp
@@ -129,13 +129,13 @@ int main(int argc, char ** argv)  {
    const ggml_type qtype = GGML_TYPE_Q4_1;

    size_t ctx_size = 0;
-    ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey);
-    ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey);
-    ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizez);
-    ctx_size += ggml_row_size(qtype,         sizex*sizey);
-    ctx_size += ggml_row_size(qtype,         sizex*sizey);
-    ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS
-    ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS
+    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
+    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
+    ctx_size += sizex*sizez*ggml_type_sizef(GGML_TYPE_F32);
+    ctx_size += sizex*sizey*ggml_type_sizef(qtype);
+    ctx_size += sizex*sizey*ggml_type_sizef(qtype);
+    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
+    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
    ctx_size += 1024*1024*16;

    printf("Allocating Memory of size %zi bytes, %zi MB\n",ctx_size, (ctx_size/1024/1024));
@@ -194,7 +194,7 @@ int main(int argc, char ** argv)  {
    // Set up a the benchmark matrices
    // printf("Creating new tensor q11 & Running quantize\n");
    struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
-    ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements/m11->ne[0], m11->ne[0], hist_cur.data(), nullptr);
+    ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements, hist_cur.data());

    // Set up a the compute graph
    // printf("Creating new tensor q31\n");
@@ -207,7 +207,7 @@ int main(int argc, char ** argv)  {
    // Set up a second graph computation to make sure we override the CPU cache lines
    // printf("Creating new tensor q12 & Running quantize\n");
    struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
-    ggml_quantize_chunk(qtype, (const float *) m12->data, q12->data, 0, nelements/m12->ne[0], m12->ne[0], hist_cur.data(), nullptr);
+    ggml_quantize_chunk(qtype, (const float *) m12->data, q12->data, 0, nelements, hist_cur.data());

    // printf("Creating new tensor q32\n");
    struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2);
--- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
+++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
@@ -427,7 +427,7 @@ static void print_row(struct ggml_tensor * probs, int i) {
 }

 static void print_matrix(struct ggml_tensor * probs) {
-    assert(ggml_is_matrix(probs));
+    assert(probs->n_dims == 2);
    for (int i = 0; i < probs->ne[1]; ++i) {
        for (int k = 0; k < probs->ne[0]; ++k) {
            float p = get_f32_2d(probs, k, i);
@@ -639,7 +639,7 @@ static void load_vocab(const char *filename, Config *config, struct llama_vocab

 static void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * karpathy_weights) {
    int ct;
-    switch (ggml_n_dims(gg_weights)) {
+    switch (gg_weights->n_dims){
        case 1:
            ct = 0;
            for (int i0 = 0; i0 < gg_weights->ne[0]; i0++){
--- a/examples/export-lora/export-lora.cpp
+++ b/examples/export-lora/export-lora.cpp
@@ -245,8 +245,9 @@ static struct lora_data * load_lora(struct lora_info * info) {
    params_ggml.no_alloc   = true;
    result->ctx = ggml_init(params_ggml);

+    uint32_t LLAMA_FILE_MAGIC_LORA = 0x67676C61; // 'ggla'
    uint32_t magic   = file.read_u32();
-    if (magic != LLAMA_FILE_MAGIC_GGLA) {
+    if (magic != LLAMA_FILE_MAGIC_LORA) {
        die_fmt("unexpected lora header file magic in '%s'", info->filename.c_str());
    }
    uint32_t version = file.read_u32();
@@ -308,7 +309,7 @@ static struct ggml_cgraph * build_graph_lora(
 ) {
    struct ggml_tensor * ab = ggml_mul_mat(ctx, lora_a, lora_b);
    if (scaling != 1.0f) {
-        ab = ggml_scale(ctx, ab, scaling);
+        ab = ggml_scale(ctx, ab, ggml_new_f32(ctx, scaling));
    }
    struct ggml_tensor * res = ggml_add_inplace(ctx, tensor, ab);

--- a/examples/finetune/README.md
+++ b/examples/finetune/README.md
@@ -61,7 +61,7 @@ For example to apply 40% of the 'shakespeare' LORA adapter, 80% of the 'bible' L
  --lora lora-open-llama-3b-v2-q8_0-yet-another-one-LATEST.bin
 ```

-The scale numbers don't need to add up to one, and you can also use numbers greater than 1 to further increase the influence of an adapter. But making the values too big will sometimes result in worse output. Play around to find good values.
+The scale numbers don't need to add up to one, and you can also use numbers greater than 1 to further increase the influence of an adapter. But making the values to big will sometimes result in worse output. Play around to find good values.

 Gradient checkpointing reduces the memory requirements by ~50% but increases the runtime.
 If you have enough RAM, you can make finetuning a bit faster by disabling checkpointing with `--no-checkpointing`.
--- a/examples/finetune/finetune.cpp
+++ b/examples/finetune/finetune.cpp
@@ -3,9 +3,15 @@
 #include "llama.h"
 #include "common.h"
 #include "train.h"
+#include <unordered_map>
 #include <vector>
+#include <cassert>
+#include <climits>
 #include <cstring>
+#include <cstdarg>
 #include <ctime>
+#include <random>
+#include <stdexcept>
 #include <algorithm>
 #include <string>

@@ -190,13 +196,13 @@ static const char * LLM_TENSOR_FFN_DOWN      = "blk.%d.ffn_down";
 static const char * LLM_TENSOR_FFN_UP        = "blk.%d.ffn_up";

 static void print_params(struct my_llama_hparams * params) {
-    printf("%s: n_vocab               : %u\n", __func__, params->n_vocab);
-    printf("%s: n_ctx                 : %u\n", __func__, params->n_ctx);
-    printf("%s: n_embd                : %u\n", __func__, params->n_embd);
-    printf("%s: n_ff                  : %u\n", __func__, params->n_ff);
-    printf("%s: n_head                : %u\n", __func__, params->n_head);
-    printf("%s: n_head_kv             : %u\n", __func__, params->n_head_kv);
-    printf("%s: n_layer               : %u\n", __func__, params->n_layer);
+    printf("%s: n_vocab:   %u\n", __func__, params->n_vocab);
+    printf("%s: n_ctx:     %u\n", __func__, params->n_ctx);
+    printf("%s: n_embd:    %u\n", __func__, params->n_embd);
+    printf("%s: n_ff:      %u\n", __func__, params->n_ff);
+    printf("%s: n_head:    %u\n", __func__, params->n_head);
+    printf("%s: n_head_kv: %u\n", __func__, params->n_head_kv);
+    printf("%s: n_layer:   %u\n", __func__, params->n_layer);
    printf("%s: norm_rms_eps          : %f\n", __func__, params->f_norm_rms_eps);
    printf("%s: rope_freq_base        : %f\n", __func__, params->rope_freq_base);
    printf("%s: rope_freq_scale       : %f\n", __func__, params->rope_freq_scale);
@@ -263,7 +269,7 @@ static void load_model_hparams_gguf(struct gguf_context * ctx, struct my_llama_h
    float rope_freq_scale = 1.0f;
    GGUF_GET_KEY(ctx, hparams->f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
    GGUF_GET_KEY(ctx, hparams->rope_freq_base, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
-    GGUF_GET_KEY(ctx, rope_freq_scale,         gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR));
+    GGUF_GET_KEY(ctx, rope_freq_scale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR));
    if (rope_freq_scale != 1.0f) {
        hparams->rope_freq_scale = 1.0f / rope_freq_scale;
    }
@@ -606,7 +612,6 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
    const int n_rot       = hparams.n_embd_head();
    const int n_embd_head = hparams.n_embd_head();
    const int n_embd_gqa  = hparams.n_embd_gqa();
-
    const float rms_norm_eps    = hparams.f_norm_rms_eps;
    const float rope_freq_base  = hparams.rope_freq_base;
    const float rope_freq_scale = hparams.rope_freq_scale;
@@ -675,7 +680,10 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
        checkpoints.push_back(t01);
    }

-    const float kv_scale = 1.0f/sqrtf(float(n_embd)/n_head);
+    struct ggml_tensor * kv_scale = NULL;
+    if (!enable_flash_attn) {
+        kv_scale = ggml_new_f32(ctx, 1.0f/sqrtf(float(n_embd)/n_head));
+    }

    for (int il = 0; il < n_layer; ++il) {
        struct my_llama_layer & layer = model->layers[il];
@@ -773,32 +781,32 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
    // make sure some tensors are not reallocated by inserting new temporary nodes depending on them
    int n_leafs_before = gb->n_leafs;
    int n_nodes_before = gb->n_nodes;
-
+    struct ggml_tensor * one = ggml_new_f32(ctx, 1.0f);
    // output tensors
-    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t35, 1.0f));
-    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36, 1.0f));
+    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t35, one));
+    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36, one));
    // input gradient
-    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36->grad, 1.0f));
+    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36->grad, one));
    GGML_ASSERT(t36->grad->data == NULL && t36->grad->view_src == NULL);
    ggml_allocr_alloc(alloc, t36->grad);
    // KQ_pos
-    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, KQ_pos, 1.0f));
+    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, KQ_pos, one));

    // make sure base model tensors data cannot be used in viewable operations
-    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->tok_embeddings, 1.0f));
-    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->norm, 1.0f));
-    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->output, 1.0f));
+    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->tok_embeddings, one));
+    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->norm, one));
+    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->output, one));
    for (int il = 0; il < n_layer; ++il) {
        struct my_llama_layer & layer = model->layers[il];
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.attention_norm, 1.0f));
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_norm, 1.0f));
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wq, 1.0f));
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wk, 1.0f));
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wv, 1.0f));
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wo, 1.0f));
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w1, 1.0f));
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w2, 1.0f));
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w3, 1.0f));
+        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.attention_norm, one));
+        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_norm, one));
+        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wq, one));
+        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wk, one));
+        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wv, one));
+        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wo, one));
+        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w1, one));
+        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w2, one));
+        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w3, one));
    }

    // allocating checkpoints in one block to reduce memory fragmentation
@@ -1102,7 +1110,7 @@ static void write_tensor(struct llama_file * file, struct ggml_tensor * tensor,
        name = ggml_get_name(tensor);
    }
    uint32_t name_len = strlen(name);
-    uint32_t nd = ggml_n_dims(tensor);
+    uint32_t nd = tensor->n_dims;
    uint32_t ne[4] = { (uint32_t)tensor->ne[0],
                       (uint32_t)tensor->ne[1],
                       (uint32_t)tensor->ne[2],
@@ -1138,8 +1146,9 @@ static void save_as_llama_lora(const char * filename, struct my_llama_lora * lor
        return tn_buf.data();
    };

+    uint32_t LLAMA_FILE_MAGIC_LORA = 0x67676C61; // 'ggla'
    // write_magic
-    file.write_u32(LLAMA_FILE_MAGIC_GGLA);   // magic
+    file.write_u32(LLAMA_FILE_MAGIC_LORA);   // magic
    file.write_u32(1); // version
    // write_hparams
    file.write_u32(lora->hparams.lora_r);
@@ -1611,6 +1620,8 @@ int main(int argc, char ** argv) {
    opt->params.adam.gclip              = params.common.adam_gclip;
    opt->params.adam.eps_f              = params.common.adam_eps_f;

+    ggml_allocr * alloc = NULL;
+
    printf("%s: init model\n", __func__);
    bool existed = load_checkpoint_lora_file(params.common.fn_checkpoint_in, &model, &lora, train);

@@ -1714,9 +1725,10 @@ int main(int argc, char ** argv) {

    // allocate input tensors
    mem_input_data.resize(max_input_size);
-    ggml_allocr_t alloc_inps = ggml_allocr_new(mem_input_data.data(), mem_input_data.size(), tensor_alignment);
-    ggml_allocr_alloc(alloc_inps, tokens_input);
-    ggml_allocr_alloc(alloc_inps, target_probs);
+    alloc = ggml_allocr_new(mem_input_data.data(), mem_input_data.size(), tensor_alignment);
+    ggml_allocr_alloc(alloc, tokens_input);
+    ggml_allocr_alloc(alloc, target_probs);
+    ggml_allocr_free(alloc);

    // context for compute tensors without their data
    const size_t estimated_compute_size_wo_data = (
@@ -1743,7 +1755,7 @@ int main(int argc, char ** argv) {
    // find best evaluation order
    for (unsigned order = 0; order < (unsigned) GGML_CGRAPH_EVAL_ORDER_COUNT; ++order) {
        ctx_compute = ggml_init(ctx_compute_params);
-        ggml_allocr_t alloc = ggml_allocr_new_measure(tensor_alignment);
+        alloc = ggml_allocr_new_measure(tensor_alignment);
        gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
        gf->order = (enum ggml_cgraph_eval_order) order;
        gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
@@ -1776,7 +1788,7 @@ int main(int argc, char ** argv) {
    // allocate compute tensors
    mem_compute_data.resize(max_compute_size);
    ctx_compute = ggml_init(ctx_compute_params);
-    ggml_allocr_t alloc = ggml_allocr_new(mem_compute_data.data(), mem_compute_data.size(), tensor_alignment);
+    alloc = ggml_allocr_new(mem_compute_data.data(), mem_compute_data.size(), tensor_alignment);
    gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
    gf->order = best_order;
    gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
@@ -1792,14 +1804,12 @@ int main(int argc, char ** argv) {
        params.common.use_checkpointing
    );
    ggml_allocr_free(alloc);
-    ggml_allocr_free(alloc_inps);
-

    // tokenize data
    std::vector<llama_token> train_tokens;
    std::vector<size_t> train_samples_begin;
    std::vector<size_t> train_samples_size;
-    printf("%s: tokenize training data from %s\n", __func__, params.common.fn_train_data);
+    printf("%s: tokenize training data\n", __func__);
    tokenize_file(lctx,
            params.common.fn_train_data,
            params.common.sample_start,
--- a/examples/gguf/CMakeLists.txt
+++ b/examples/gguf/CMakeLists.txt
@@ -1,5 +1,5 @@
 set(TARGET gguf)
 add_executable(${TARGET} gguf.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- a/examples/gguf/gguf.cpp
+++ b/examples/gguf/gguf.cpp
@@ -1,4 +1,5 @@
 #include "ggml.h"
+#include "llama.h"

 #include <cstdio>
 #include <cinttypes>
@@ -194,7 +195,7 @@ static bool gguf_ex_read_1(const std::string & fname) {

            struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);

-            printf("%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, ggml_n_dims(cur), cur->name, cur->data);
+            printf("%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, cur->n_dims, cur->name, cur->data);

            // print first 10 elements
            const float * data = (const float *) cur->data;
--- a/examples/imatrix/CMakeLists.txt
+++ b/examples/imatrix/CMakeLists.txt
@@ -1,5 +0,0 @@
-set(TARGET imatrix)
-add_executable(${TARGET} imatrix.cpp)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@@ -1,380 +0,0 @@
-#include "common.h"
-#include "llama.h"
-
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <ctime>
-#include <sstream>
-#include <thread>
-#include <mutex>
-#include <vector>
-#include <fstream>
-#include <unordered_map>
-#include <algorithm>
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
-struct Stats {
-    std::vector<float> values;
-    int ncall = 0;
-};
-
-struct StatParams {
-    std::string ofile = "imatrix.dat";
-    int         n_output_frequency = 10;
-    int         verbosity = 1;
-    bool        collect_output_weight = false;
-};
-
-class IMatrixCollector {
-public:
-    IMatrixCollector() = default;
-    void set_parameters(StatParams&& params) { m_params = std::move(params); }
-    void collect_imatrix(const struct ggml_tensor * src0, const struct ggml_tensor * src1);
-    void save_imatrix() const;
-private:
-    std::unordered_map<std::string, Stats> m_stats;
-    StatParams                             m_params;
-    std::mutex                             m_mutex;
-    int                                    m_last_call = 0;
-};
-
-void IMatrixCollector::collect_imatrix(const struct ggml_tensor * src0, const struct ggml_tensor * src1) {
-    if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return;
-    if (!(strncmp(src0->name, "blk.", 4) == 0 || (m_params.collect_output_weight && strcmp(src0->name, "output.weight") == 0))) return;
-    std::lock_guard<std::mutex> lock(m_mutex);
-    auto& e = m_stats[src0->name];
-    if (e.values.empty()) {
-        e.values.resize(src1->ne[0], 0);
-    }
-    else if (e.values.size() != (size_t)src1->ne[0]) {
-        fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", src0->name, (int)e.values.size(), (int)src1->ne[0]);
-        exit(1); //GGML_ASSERT(false);
-    }
-    ++e.ncall;
-    if (m_params.verbosity > 1) {
-        printf("%s[%d]: %s, %d x %d, %d\n",__func__,m_last_call,src0->name,(int)src1->ne[0],(int)src1->ne[1],(int)src1->type);
-    }
-    for (int row = 0; row < (int)src1->ne[1]; ++row) {
-        const float * x = (const float *)src1->data + row * src1->ne[0];
-        for (int j = 0; j < (int)src1->ne[0]; ++j) {
-            e.values[j] += x[j]*x[j];
-        }
-    }
-    if (e.ncall > m_last_call) {
-        m_last_call = e.ncall;
-        if (m_last_call % m_params.n_output_frequency == 0) {
-            save_imatrix();
-        }
-    }
-}
-
-void IMatrixCollector::save_imatrix() const {
-    const char * fname = m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str();
-    std::ofstream out(fname, std::ios::binary);
-    int n_entries = m_stats.size();
-    out.write((const char*)&n_entries, sizeof(n_entries));
-    for (auto& p : m_stats) {
-        int len = p.first.size();
-        out.write((const char*)&len, sizeof(len));
-        out.write(p.first.c_str(), len);
-        out.write((const char*)&p.second.ncall, sizeof(p.second.ncall));
-        int nval = p.second.values.size();
-        out.write((const char*)&nval, sizeof(nval));
-        if (nval > 0) out.write((const char*)p.second.values.data(), nval*sizeof(float));
-    }
-    if (m_params.verbosity > 0) {
-        fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n",__func__,m_last_call,fname);
-    }
-}
-
-static IMatrixCollector g_collector;
-
-static void ik_collect_imatrix(const struct ggml_tensor * src0, const struct ggml_tensor * src1) {
-    g_collector.collect_imatrix(src0, src1);
-}
-
-
-struct results_log_softmax {
-    double log_softmax;
-    float  logit;
-    float  prob;
-};
-
-static std::vector<float> softmax(const std::vector<float>& logits) {
-    std::vector<float> probs(logits.size());
-    float max_logit = logits[0];
-    for (float v : logits) {
-        max_logit = std::max(max_logit, v);
-    }
-    double sum_exp = 0.0;
-    for (size_t i = 0; i < logits.size(); i++) {
-        // Subtract the maximum logit value from the current logit value for numerical stability
-        const float logit = logits[i] - max_logit;
-        const float exp_logit = expf(logit);
-        sum_exp += exp_logit;
-        probs[i] = exp_logit;
-    }
-    for (size_t i = 0; i < probs.size(); i++) {
-        probs[i] /= sum_exp;
-    }
-    return probs;
-}
-
-static results_log_softmax log_softmax(int n_vocab, const float * logits, int tok) {
-    float max_logit = logits[0];
-    for (int i = 1; i < n_vocab; ++i) {
-        max_logit = std::max(max_logit, logits[i]);
-    }
-    double sum_exp = 0.0;
-    for (int i = 0; i < n_vocab; ++i) {
-        sum_exp += expf(logits[i] - max_logit);
-    }
-    return {logits[tok] - max_logit - log(sum_exp), logits[tok], expf(logits[tok] - max_logit) / (float) sum_exp};
-}
-
-static void process_logits(
-    int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread> & workers,
-    double & nll, double & nll2, float * logit_history, float * prob_history
-) {
-    std::mutex mutex;
-    int counter = 0;
-    auto compute = [&mutex, &counter, &nll, &nll2, logit_history, prob_history, n_vocab, logits, tokens, n_token] () {
-        double local_nll  = 0;
-        double local_nll2 = 0;
-        while (true) {
-            std::unique_lock<std::mutex> lock(mutex);
-            int i = counter++;
-            if (i >= n_token) {
-                nll += local_nll; nll2 += local_nll2;
-                break;
-            }
-            lock.unlock();
-            const results_log_softmax results = log_softmax(n_vocab, logits + i*n_vocab, tokens[i+1]);
-            const double v = -results.log_softmax;
-            local_nll += v;
-            local_nll2 += v*v;
-
-            logit_history[i] = results.logit;
-            prob_history[i]  = results.prob;
-        }
-    };
-    for (auto & w : workers) {
-        w = std::thread(compute);
-    }
-    compute();
-    for (auto & w : workers) {
-        w.join();
-    }
-}
-
-static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
-
-    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
-    const int n_ctx = llama_n_ctx(ctx);
-
-    auto tim1 = std::chrono::high_resolution_clock::now();
-    fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
-
-    std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
-
-    auto tim2 = std::chrono::high_resolution_clock::now();
-    fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
-
-    if (int(tokens.size()) < 2*n_ctx) {
-        fprintf(stderr, "%s: you need at least %d tokens for a context of %d tokens\n",__func__,2*n_ctx,
-                n_ctx);
-        fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size());
-        return false;
-    }
-
-    std::vector<float> logit_history;
-    logit_history.resize(tokens.size());
-
-    std::vector<float> prob_history;
-    prob_history.resize(tokens.size());
-
-    const int n_chunk_max = tokens.size() / n_ctx;
-
-    const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
-    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
-    const int n_batch = params.n_batch;
-
-    int count = 0;
-    double nll = 0.0;
-    double nll2 = 0.0;
-
-    fprintf(stderr, "%s: computing over %d chunks with batch_size %d\n", __func__, n_chunk, n_batch);
-
-    std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
-
-    for (int i = 0; i < n_chunk; ++i) {
-        const int start =     i * n_ctx;
-        const int end   = start + n_ctx;
-
-        const int num_batches = (n_ctx + n_batch - 1) / n_batch;
-
-        std::vector<float> logits;
-
-        const auto t_start = std::chrono::high_resolution_clock::now();
-
-        // clear the KV cache
-        llama_kv_cache_clear(ctx);
-
-        for (int j = 0; j < num_batches; ++j) {
-            const int batch_start = start + j * n_batch;
-            const int batch_size  = std::min(end - batch_start, n_batch);
-
-            // save original token and restore it after eval
-            const auto token_org = tokens[batch_start];
-
-            // add BOS token for the first batch of each chunk
-            if (add_bos && j == 0) {
-                tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
-            }
-
-            if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
-                fprintf(stderr, "%s : failed to eval\n", __func__);
-                return false;
-            }
-
-            // restore the original token in case it was set to BOS
-            tokens[batch_start] = token_org;
-
-            const auto * batch_logits = llama_get_logits(ctx);
-            logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
-        }
-
-        const auto t_end = std::chrono::high_resolution_clock::now();
-
-        if (i == 0) {
-            const float t_total = std::chrono::duration<float>(t_end - t_start).count();
-            fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
-            int total_seconds = (int)(t_total * n_chunk);
-            if (total_seconds >= 60*60) {
-                fprintf(stderr, "%d hours ", total_seconds / (60*60));
-                total_seconds = total_seconds % (60*60);
-            }
-            fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
-        }
-
-        const int first = n_ctx/2;
-        process_logits(n_vocab, logits.data() + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
-                       workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
-        count += n_ctx - first - 1;
-
-        printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
-        fflush(stdout);
-    }
-    printf("\n");
-
-    nll2 /= count;
-    nll /= count;
-    const double ppl = exp(nll);
-    nll2 -= nll * nll;
-    if (nll2 > 0) {
-        nll2 = sqrt(nll2/(count-1));
-        printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
-    } else {
-        printf("Unexpected negative standard deviation of log(prob)\n");
-    }
-
-    return true;
-}
-
-int main(int argc, char ** argv) {
-
-    StatParams sparams;
-    std::vector<char*> args;
-    args.push_back(argv[0]);
-    int iarg = 1;
-    for (; iarg < argc-1; ++iarg) {
-        std::string arg{argv[iarg]};
-        if (arg == "-o" || arg == "--output-file") {
-            sparams.ofile = argv[++iarg];
-        }
-        else if (arg == "-ofreq" || arg == "--output-frequency") {
-            sparams.n_output_frequency = std::stoi(argv[++iarg]);
-        }
-        else if (arg == "-ow" || arg == "--output-weight") {
-            sparams.collect_output_weight = std::stoi(argv[++iarg]);
-        }
-        else if (arg == "--verbosity") {
-            sparams.verbosity = std::stoi(argv[++iarg]);
-        } else {
-            args.push_back(argv[iarg]);
-        }
-    }
-    if (iarg < argc) {
-        args.push_back(argv[iarg]);
-    }
-
-    gpt_params params;
-    params.n_batch = 512;
-    if (!gpt_params_parse(args.size(), args.data(), params)) {
-        return 1;
-    }
-
-    g_collector.set_parameters(std::move(sparams));
-
-    ggml_set_imatrix_collection(ik_collect_imatrix);
-
-    params.logits_all = true;
-    params.n_batch = std::min(params.n_batch, params.n_ctx);
-
-    print_build_info();
-
-    if (params.seed == LLAMA_DEFAULT_SEED) {
-        params.seed = time(NULL);
-    }
-
-    fprintf(stderr, "%s: seed  = %u\n", __func__, params.seed);
-
-    std::mt19937 rng(params.seed);
-    if (params.random_prompt) {
-        params.prompt = gpt_random_prompt(rng);
-    }
-
-    llama_backend_init(params.numa);
-
-    llama_model * model;
-    llama_context * ctx;
-
-    // load the model and apply lora adapter, if any
-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
-    if (model == NULL) {
-        fprintf(stderr, "%s: error: unable to load model\n", __func__);
-        return 1;
-    }
-
-    const int n_ctx_train = llama_n_ctx_train(model);
-    if (params.n_ctx > n_ctx_train) {
-        fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
-                __func__, n_ctx_train, params.n_ctx);
-    }
-
-    // print system information
-    {
-        fprintf(stderr, "\n");
-        fprintf(stderr, "%s\n", get_system_info(params).c_str());
-    }
-
-    bool OK = compute_imatrix(ctx, params);
-    if (!OK) {
-        return 1;
-    }
-
-    g_collector.save_imatrix();
-
-    llama_print_timings(ctx);
-
-    llama_free(ctx);
-    llama_free_model(model);
-
-    llama_backend_free();
-
-    return 0;
-}
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -53,13 +53,6 @@ static std::vector<T> split(const std::string & str, char delim) {
    return values;
 }

-template<typename T, typename F>
-static std::vector<std::string> transform_to_str(const std::vector<T> & values, F f) {
-    std::vector<std::string> str_values;
-    std::transform(values.begin(), values.end(), std::back_inserter(str_values), f);
-    return str_values;
-}
-
 template<typename T>
 static T avg(const std::vector<T> & v) {
    if (v.empty()) {
@@ -128,37 +121,15 @@ static std::string get_gpu_info() {
 // command line params
 enum output_formats {CSV, JSON, MARKDOWN, SQL};

-static const char * output_format_str(output_formats format) {
-    switch (format) {
-        case CSV:      return "csv";
-        case JSON:     return "json";
-        case MARKDOWN: return "md";
-        case SQL:      return "sql";
-        default: GGML_ASSERT(!"invalid output format");
-    }
-}
-
-static const char * split_mode_str(llama_split_mode mode) {
-    switch (mode) {
-        case LLAMA_SPLIT_NONE:  return "none";
-        case LLAMA_SPLIT_LAYER: return "layer";
-        case LLAMA_SPLIT_ROW:   return "row";
-        default: GGML_ASSERT(!"invalid split mode");
-    }
-}
-
 struct cmd_params {
    std::vector<std::string> model;
    std::vector<int> n_prompt;
    std::vector<int> n_gen;
    std::vector<int> n_batch;
-    std::vector<ggml_type> type_k;
-    std::vector<ggml_type> type_v;
+    std::vector<bool> f32_kv;
    std::vector<int> n_threads;
    std::vector<int> n_gpu_layers;
-    std::vector<llama_split_mode> split_mode;
    std::vector<int> main_gpu;
-    std::vector<bool> no_kv_offload;
    std::vector<bool> mul_mat_q;
    std::vector<std::array<float, LLAMA_MAX_DEVICES>> tensor_split;
    int reps;
@@ -171,13 +142,10 @@ static const cmd_params cmd_params_defaults = {
    /* n_prompt      */ {512},
    /* n_gen         */ {128},
    /* n_batch       */ {512},
-    /* type_k        */ {GGML_TYPE_F16},
-    /* type_v        */ {GGML_TYPE_F16},
+    /* f32_kv        */ {false},
    /* n_threads     */ {get_num_physical_cores()},
    /* n_gpu_layers  */ {99},
-    /* split_mode    */ {LLAMA_SPLIT_LAYER},
    /* main_gpu      */ {0},
-    /* no_kv_offload */ {false},
    /* mul_mat_q     */ {true},
    /* tensor_split  */ {{}},
    /* reps          */ 5,
@@ -190,50 +158,24 @@ static void print_usage(int /* argc */, char ** argv) {
    printf("\n");
    printf("options:\n");
    printf("  -h, --help\n");
-    printf("  -m, --model <filename>              (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
-    printf("  -p, --n-prompt <n>                  (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
-    printf("  -n, --n-gen <n>                     (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
-    printf("  -b, --batch-size <n>                (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
-    printf("  -ctk <t>, --cache-type-k <t>        (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
-    printf("  -ctv <t>, --cache-type-v <t>        (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
-    printf("  -t, --threads <n>                   (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
-    printf("  -ngl, --n-gpu-layers <n>            (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
-    printf("  -sm, --split-mode <none|layer|row>  (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
-    printf("  -mg, --main-gpu <i>                 (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
-    printf("  -nkvo, --no-kv-offload <0|1>        (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
-    printf("  -mmq, --mul-mat-q <0|1>             (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str());
-    printf("  -ts, --tensor_split <ts0/ts1/..>    (default: 0)\n");
-    printf("  -r, --repetitions <n>               (default: %d)\n", cmd_params_defaults.reps);
-    printf("  -o, --output <csv|json|md|sql>      (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
-    printf("  -v, --verbose                       (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
+    printf("  -m, --model <filename>            (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
+    printf("  -p, --n-prompt <n>                (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
+    printf("  -n, --n-gen <n>                   (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
+    printf("  -b, --batch-size <n>              (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
+    printf("  --memory-f32 <0|1>                (default: %s)\n", join(cmd_params_defaults.f32_kv, ",").c_str());
+    printf("  -t, --threads <n>                 (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
+    printf("  -ngl, --n-gpu-layers <n>          (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
+    printf("  -mg, --main-gpu <i>               (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
+    printf("  -mmq, --mul-mat-q <0|1>           (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str());
+    printf("  -ts, --tensor_split <ts0/ts1/..>               \n");
+    printf("  -r, --repetitions <n>             (default: %d)\n", cmd_params_defaults.reps);
+    printf("  -o, --output <csv|json|md|sql>    (default: %s)\n", cmd_params_defaults.output_format == CSV ? "csv" : cmd_params_defaults.output_format == JSON ? "json" : cmd_params_defaults.output_format == MARKDOWN ? "md" : "sql");
+    printf("  -v, --verbose                     (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
    printf("\n");
    printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");
+
 }

-static ggml_type ggml_type_from_name(const std::string & s) {
-    if (s == "f16") {
-        return GGML_TYPE_F16;
-    }
-    if (s == "q8_0") {
-        return GGML_TYPE_Q8_0;
-    }
-    if (s == "q4_0") {
-        return GGML_TYPE_Q4_0;
-    }
-    if (s == "q4_1") {
-        return GGML_TYPE_Q4_1;
-    }
-    if (s == "q5_0") {
-        return GGML_TYPE_Q5_0;
-    }
-    if (s == "q5_1") {
-        return GGML_TYPE_Q5_1;
-    }
-
-    return GGML_TYPE_COUNT;
-}
-
-
 static cmd_params parse_cmd_params(int argc, char ** argv) {
    cmd_params params;
    std::string arg;
@@ -282,38 +224,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
            }
            auto p = split<int>(argv[i], split_delim);
            params.n_batch.insert(params.n_batch.end(), p.begin(), p.end());
-        } else if (arg == "-ctk" || arg == "--cache-type-k") {
+        } else if (arg == "--memory-f32") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-            auto p = split<std::string>(argv[i], split_delim);
-            std::vector<ggml_type> types;
-            for (const auto & t : p) {
-                ggml_type gt = ggml_type_from_name(t);
-                if (gt == GGML_TYPE_COUNT) {
-                    invalid_param = true;
-                    break;
-                }
-                types.push_back(gt);
-            }
-            params.type_k.insert(params.type_k.end(), types.begin(), types.end());
-        } else if (arg == "-ctv" || arg == "--cache-type-v") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            auto p = split<std::string>(argv[i], split_delim);
-            std::vector<ggml_type> types;
-            for (const auto & t : p) {
-                ggml_type gt = ggml_type_from_name(t);
-                if (gt == GGML_TYPE_COUNT) {
-                    invalid_param = true;
-                    break;
-                }
-                types.push_back(gt);
-            }
-            params.type_v.insert(params.type_v.end(), types.begin(), types.end());
+            auto p = split<int>(argv[i], split_delim);
+            params.f32_kv.insert(params.f32_kv.end(), p.begin(), p.end());
        } else if (arg == "-t" || arg == "--threads") {
            if (++i >= argc) {
                invalid_param = true;
@@ -328,41 +245,12 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
            }
            auto p = split<int>(argv[i], split_delim);
            params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
-        } else if (arg == "-sm" || arg == "--split-mode") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            auto p = split<std::string>(argv[i], split_delim);
-            std::vector<llama_split_mode> modes;
-            for (const auto & m : p) {
-                llama_split_mode mode;
-                if (m == "none") {
-                    mode = LLAMA_SPLIT_NONE;
-                } else if (m == "layer") {
-                    mode = LLAMA_SPLIT_LAYER;
-                } else if (m == "row") {
-                    mode = LLAMA_SPLIT_ROW;
-                } else {
-                    invalid_param = true;
-                    break;
-                }
-                modes.push_back(mode);
-            }
-            params.split_mode.insert(params.split_mode.end(), modes.begin(), modes.end());
        } else if (arg == "-mg" || arg == "--main-gpu") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.main_gpu = split<int>(argv[i], split_delim);
-        } else if (arg == "-nkvo" || arg == "--no-kv-offload") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            auto p = split<bool>(argv[i], split_delim);
-            params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end());
        } else if (arg == "-mmq" || arg == "--mul-mat-q") {
            if (++i >= argc) {
                invalid_param = true;
@@ -433,12 +321,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
    if (params.n_prompt.empty())     { params.n_prompt = cmd_params_defaults.n_prompt; }
    if (params.n_gen.empty())        { params.n_gen = cmd_params_defaults.n_gen; }
    if (params.n_batch.empty())      { params.n_batch = cmd_params_defaults.n_batch; }
-    if (params.type_k.empty())       { params.type_k = cmd_params_defaults.type_k; }
-    if (params.type_v.empty())       { params.type_v = cmd_params_defaults.type_v; }
+    if (params.f32_kv.empty())       { params.f32_kv = cmd_params_defaults.f32_kv; }
    if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; }
-    if (params.split_mode.empty())   { params.split_mode = cmd_params_defaults.split_mode; }
    if (params.main_gpu.empty())     { params.main_gpu = cmd_params_defaults.main_gpu; }
-    if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; }
    if (params.mul_mat_q.empty())    { params.mul_mat_q = cmd_params_defaults.mul_mat_q; }
    if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; }
    if (params.n_threads.empty())    { params.n_threads = cmd_params_defaults.n_threads; }
@@ -451,13 +336,10 @@ struct cmd_params_instance {
    int n_prompt;
    int n_gen;
    int n_batch;
-    ggml_type type_k;
-    ggml_type type_v;
+    bool f32_kv;
    int n_threads;
    int n_gpu_layers;
-    llama_split_mode split_mode;
    int main_gpu;
-    bool no_kv_offload;
    bool mul_mat_q;
    std::array<float, LLAMA_MAX_DEVICES> tensor_split;

@@ -465,7 +347,6 @@ struct cmd_params_instance {
        llama_model_params mparams = llama_model_default_params();

        mparams.n_gpu_layers = n_gpu_layers;
-        mparams.split_mode = split_mode;
        mparams.main_gpu = main_gpu;
        mparams.tensor_split = tensor_split.data();

@@ -475,7 +356,6 @@ struct cmd_params_instance {
    bool equal_mparams(const cmd_params_instance & other) const {
        return model == other.model &&
               n_gpu_layers == other.n_gpu_layers &&
-               split_mode == other.split_mode &&
               main_gpu == other.main_gpu &&
               tensor_split == other.tensor_split;
    }
@@ -485,29 +365,53 @@ struct cmd_params_instance {

        cparams.n_ctx = n_prompt + n_gen;
        cparams.n_batch = n_batch;
-        cparams.type_k = type_k;
-        cparams.type_v = type_v;
+        cparams.f16_kv = !f32_kv;
        cparams.mul_mat_q = mul_mat_q;
-        cparams.offload_kqv = !no_kv_offload;

        return cparams;
    }
 };

-static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_params & params) {
+static std::vector<cmd_params_instance> get_cmd_params_instances_int(const cmd_params & params, int n_gen, int n_prompt) {
    std::vector<cmd_params_instance> instances;

-    // this ordering minimizes the number of times that each model needs to be reloaded
    for (const auto & m : params.model)
    for (const auto & nl : params.n_gpu_layers)
-    for (const auto & sm : params.split_mode)
    for (const auto & mg : params.main_gpu)
    for (const auto & ts : params.tensor_split)
    for (const auto & nb : params.n_batch)
-    for (const auto & tk : params.type_k)
-    for (const auto & tv : params.type_v)
+    for (const auto & fk : params.f32_kv)
+    for (const auto & mmq : params.mul_mat_q)
+    for (const auto & nt : params.n_threads) {
+        cmd_params_instance instance = {
+            /* .model        = */ m,
+            /* .n_prompt     = */ n_prompt,
+            /* .n_gen        = */ n_gen,
+            /* .n_batch      = */ nb,
+            /* .f32_kv       = */ fk,
+            /* .n_threads    = */ nt,
+            /* .n_gpu_layers = */ nl,
+            /* .main_gpu     = */ mg,
+            /* .mul_mat_q    = */ mmq,
+            /* .tensor_split = */ ts,
+        };
+        instances.push_back(instance);
+    }
+    return instances;
+}
+
+static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_params & params) {
+    std::vector<cmd_params_instance> instances;
+
+#if 1
+    // this ordering minimizes the number of times that each model needs to be reloaded
+    for (const auto & m : params.model)
+    for (const auto & nl : params.n_gpu_layers)
+    for (const auto & mg : params.main_gpu)
+    for (const auto & ts : params.tensor_split)
+    for (const auto & nb : params.n_batch)
+    for (const auto & fk : params.f32_kv)
    for (const auto & mmq : params.mul_mat_q)
-    for (const auto & nkvo : params.no_kv_offload)
    for (const auto & nt : params.n_threads) {
        for (const auto & n_prompt : params.n_prompt) {
            if (n_prompt == 0) {
@@ -518,13 +422,10 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                /* .n_prompt     = */ n_prompt,
                /* .n_gen        = */ 0,
                /* .n_batch      = */ nb,
-                /* .type_k       = */ tk,
-                /* .type_v       = */ tv,
+                /* .f32_kv       = */ fk,
                /* .n_threads    = */ nt,
                /* .n_gpu_layers = */ nl,
-                /* .split_mode   = */ sm,
                /* .main_gpu     = */ mg,
-                /* .no_kv_offload= */ nkvo,
                /* .mul_mat_q    = */ mmq,
                /* .tensor_split = */ ts,
            };
@@ -540,19 +441,34 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                /* .n_prompt     = */ 0,
                /* .n_gen        = */ n_gen,
                /* .n_batch      = */ nb,
-                /* .type_k       = */ tk,
-                /* .type_v       = */ tv,
+                /* .f32_kv       = */ fk,
                /* .n_threads    = */ nt,
                /* .n_gpu_layers = */ nl,
-                /* .split_mode   = */ sm,
                /* .main_gpu     = */ mg,
-                /* .no_kv_offload= */ nkvo,
                /* .mul_mat_q    = */ mmq,
                /* .tensor_split = */ ts,
            };
            instances.push_back(instance);
        }
    }
+#else
+    // this ordering separates the prompt and generation tests
+    for (const auto & n_prompt : params.n_prompt) {
+        if (n_prompt == 0) {
+            continue;
+        }
+        auto instances_prompt = get_cmd_params_instances_int(params, 0, n_prompt);
+        instances.insert(instances.end(), instances_prompt.begin(), instances_prompt.end());
+    }
+
+    for (const auto & n_gen : params.n_gen) {
+        if (n_gen == 0) {
+            continue;
+        }
+        auto instances_gen = get_cmd_params_instances_int(params, n_gen, 0);
+        instances.insert(instances.end(), instances_gen.begin(), instances_gen.end());
+    }
+#endif

    return instances;
 }
@@ -573,12 +489,9 @@ struct test {
    uint64_t model_n_params;
    int n_batch;
    int n_threads;
-    ggml_type type_k;
-    ggml_type type_v;
+    bool f32_kv;
    int n_gpu_layers;
-    llama_split_mode split_mode;
    int main_gpu;
-    bool no_kv_offload;
    bool mul_mat_q;
    std::array<float, LLAMA_MAX_DEVICES> tensor_split;
    int n_prompt;
@@ -595,12 +508,9 @@ struct test {
        model_n_params = llama_model_n_params(lmodel);
        n_batch = inst.n_batch;
        n_threads = inst.n_threads;
-        type_k = inst.type_k;
-        type_v = inst.type_v;
+        f32_kv = inst.f32_kv;
        n_gpu_layers = inst.n_gpu_layers;
-        split_mode = inst.split_mode;
        main_gpu = inst.main_gpu;
-        no_kv_offload = inst.no_kv_offload;
        mul_mat_q = inst.mul_mat_q;
        tensor_split = inst.tensor_split;
        n_prompt = inst.n_prompt;
@@ -661,10 +571,8 @@ struct test {
            "cuda", "opencl", "metal", "gpu_blas", "blas",
            "cpu_info", "gpu_info",
            "model_filename", "model_type", "model_size", "model_n_params",
-            "n_batch", "n_threads", "type_k", "type_v",
-            "n_gpu_layers", "split_mode",
-            "main_gpu", "no_kv_offload",
-            "mul_mat_q", "tensor_split",
+            "n_batch", "n_threads", "f16_kv",
+            "n_gpu_layers", "main_gpu", "mul_mat_q", "tensor_split",
            "n_prompt", "n_gen", "test_time",
            "avg_ns", "stddev_ns",
            "avg_ts", "stddev_ts"
@@ -683,7 +591,7 @@ struct test {
            return INT;
        }
        if (field == "cuda" || field == "opencl" || field == "metal" || field == "gpu_blas" || field == "blas" ||
-            field == "f16_kv" || field == "no_kv_offload" || field == "mul_mat_q") {
+            field == "f16_kv" || field == "mul_mat_q") {
            return BOOL;
        }
        if (field == "avg_ts" || field == "stddev_ts") {
@@ -713,10 +621,8 @@ struct test {
            std::to_string(cuda), std::to_string(opencl), std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas),
            cpu_info, gpu_info,
            model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
-            std::to_string(n_batch), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
-            std::to_string(n_gpu_layers), split_mode_str(split_mode),
-            std::to_string(main_gpu), std::to_string(no_kv_offload),
-            std::to_string(mul_mat_q), tensor_split_str,
+            std::to_string(n_batch), std::to_string(n_threads), std::to_string(!f32_kv),
+            std::to_string(n_gpu_layers), std::to_string(main_gpu), std::to_string(mul_mat_q), tensor_split_str,
            std::to_string(n_prompt), std::to_string(n_gen), test_time,
            std::to_string(avg_ns()), std::to_string(stdev_ns()),
            std::to_string(avg_ts()), std::to_string(stdev_ts())
@@ -871,18 +777,12 @@ struct markdown_printer : public printer {
        if (field == "n_gpu_layers") {
            return "ngl";
        }
-        if (field == "split_mode") {
-            return "sm";
-        }
        if (field == "n_threads") {
            return "threads";
        }
        if (field == "mul_mat_q") {
            return "mmq";
        }
-        if (field == "no_kv_offload") {
-            return "nkvo";
-        }
        if (field == "tensor_split") {
            return "ts";
        }
@@ -905,24 +805,15 @@ struct markdown_printer : public printer {
        if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) {
            fields.push_back("n_batch");
        }
-        if (params.type_k.size() > 1 || params.type_k != cmd_params_defaults.type_k) {
-            fields.push_back("type_k");
-        }
-        if (params.type_v.size() > 1 || params.type_v != cmd_params_defaults.type_v) {
-            fields.push_back("type_v");
+        if (params.f32_kv.size() > 1 || params.f32_kv != cmd_params_defaults.f32_kv) {
+            fields.push_back("f16_kv");
        }
        if (params.main_gpu.size() > 1 || params.main_gpu != cmd_params_defaults.main_gpu) {
            fields.push_back("main_gpu");
        }
-        if (params.split_mode.size() > 1 || params.split_mode != cmd_params_defaults.split_mode) {
-            fields.push_back("split_mode");
-        }
        if (params.mul_mat_q.size() > 1 || params.mul_mat_q != cmd_params_defaults.mul_mat_q) {
            fields.push_back("mul_mat_q");
        }
-        if (params.no_kv_offload.size() > 1 || params.no_kv_offload != cmd_params_defaults.no_kv_offload) {
-            fields.push_back("no_kv_offload");
-        }
        if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) {
            fields.push_back("tensor_split");
        }
--- a/examples/llama.android/.gitignore
+++ b/examples/llama.android/.gitignore
@@ -1,33 +0,0 @@
-# Gradle files
-.gradle/
-build/
-
-# Local configuration file (sdk path, etc)
-local.properties
-
-# Log/OS Files
-*.log
-
-# Android Studio generated files and folders
-captures/
-.externalNativeBuild/
-.cxx/
-*.apk
-output.json
-
-# IntelliJ
-*.iml
-.idea/
-misc.xml
-deploymentTargetDropDown.xml
-render.experimental.xml
-
-# Keystore files
-*.jks
-*.keystore
-
-# Google Services (e.g. APIs or Firebase)
-google-services.json
-
-# Android Profiling
-*.hprof
--- a/examples/llama.android/README.md
+++ b/examples/llama.android/README.md
--- a/examples/llama.android/app/.gitignore
+++ b/examples/llama.android/app/.gitignore
@@ -1 +0,0 @@
-/build
--- a/examples/llama.android/app/build.gradle.kts
+++ b/examples/llama.android/app/build.gradle.kts
@@ -1,91 +0,0 @@
-plugins {
-    id("com.android.application")
-    id("org.jetbrains.kotlin.android")
-}
-
-android {
-    namespace = "com.example.llama"
-    compileSdk = 34
-
-    ndkVersion = "26.1.10909125"
-
-    defaultConfig {
-        applicationId = "com.example.llama"
-        minSdk = 33
-        targetSdk = 34
-        versionCode = 1
-        versionName = "1.0"
-
-        testInstrumentationRunner = "androidx.test.runner.AndroidJUnitRunner"
-        vectorDrawables {
-            useSupportLibrary = true
-        }
-        ndk {
-            // Workaround for https://github.com/llvm/llvm-project/issues/65820
-            // affecting armeabi-v7a. Skip armeabi-v7a when invoked with
-            // -Pskip-armeabi-v7a (e.g., ./gradlew build -Pskip-armeabi-v7a).
-            if (project.hasProperty("skip-armeabi-v7a")) {
-                abiFilters += listOf("arm64-v8a", "x86_64", "x86")
-            }
-        }
-        externalNativeBuild {
-            cmake {
-                cppFlags += listOf()
-                arguments += listOf()
-            }
-        }
-    }
-
-    buildTypes {
-        release {
-            isMinifyEnabled = false
-            proguardFiles(
-                getDefaultProguardFile("proguard-android-optimize.txt"),
-                "proguard-rules.pro"
-            )
-        }
-    }
-    compileOptions {
-        sourceCompatibility = JavaVersion.VERSION_1_8
-        targetCompatibility = JavaVersion.VERSION_1_8
-    }
-    kotlinOptions {
-        jvmTarget = "1.8"
-    }
-    buildFeatures {
-        compose = true
-    }
-    composeOptions {
-        kotlinCompilerExtensionVersion = "1.5.1"
-    }
-    packaging {
-        resources {
-            excludes += "/META-INF/{AL2.0,LGPL2.1}"
-        }
-    }
-    externalNativeBuild {
-        cmake {
-            path = file("src/main/cpp/CMakeLists.txt")
-            version = "3.22.1"
-        }
-    }
-}
-
-dependencies {
-
-    implementation("androidx.core:core-ktx:1.12.0")
-    implementation("androidx.lifecycle:lifecycle-runtime-ktx:2.6.2")
-    implementation("androidx.activity:activity-compose:1.8.2")
-    implementation(platform("androidx.compose:compose-bom:2023.08.00"))
-    implementation("androidx.compose.ui:ui")
-    implementation("androidx.compose.ui:ui-graphics")
-    implementation("androidx.compose.ui:ui-tooling-preview")
-    implementation("androidx.compose.material3:material3")
-    testImplementation("junit:junit:4.13.2")
-    androidTestImplementation("androidx.test.ext:junit:1.1.5")
-    androidTestImplementation("androidx.test.espresso:espresso-core:3.5.1")
-    androidTestImplementation(platform("androidx.compose:compose-bom:2023.08.00"))
-    androidTestImplementation("androidx.compose.ui:ui-test-junit4")
-    debugImplementation("androidx.compose.ui:ui-tooling")
-    debugImplementation("androidx.compose.ui:ui-test-manifest")
-}
--- a/examples/llama.android/app/proguard-rules.pro
+++ b/examples/llama.android/app/proguard-rules.pro
@@ -1,21 +0,0 @@
-# Add project specific ProGuard rules here.
-# You can control the set of applied configuration files using the
-# proguardFiles setting in build.gradle.
-#
-# For more details, see
-#   http://developer.android.com/guide/developing/tools/proguard.html
-
-# If your project uses WebView with JS, uncomment the following
-# and specify the fully qualified class name to the JavaScript interface
-# class:
-#-keepclassmembers class fqcn.of.javascript.interface.for.webview {
-#   public *;
-#}
-
-# Uncomment this to preserve the line number information for
-# debugging stack traces.
-#-keepattributes SourceFile,LineNumberTable
-
-# If you keep the line number information, uncomment this to
-# hide the original source file name.
-#-renamesourcefileattribute SourceFile
--- a/examples/llama.android/app/src/main/AndroidManifest.xml
+++ b/examples/llama.android/app/src/main/AndroidManifest.xml
@@ -1,30 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?>
-<manifest xmlns:android="http://schemas.android.com/apk/res/android"
-    xmlns:tools="http://schemas.android.com/tools">
-
-    <uses-permission android:name="android.permission.INTERNET" />
-
-    <application
-        android:allowBackup="true"
-        android:dataExtractionRules="@xml/data_extraction_rules"
-        android:fullBackupContent="@xml/backup_rules"
-        android:icon="@mipmap/ic_launcher"
-        android:label="@string/app_name"
-        android:roundIcon="@mipmap/ic_launcher_round"
-        android:supportsRtl="true"
-        android:theme="@style/Theme.LlamaAndroid"
-        >
-
-        <activity
-            android:name=".MainActivity"
-            android:exported="true"
-            android:theme="@style/Theme.LlamaAndroid">
-            <intent-filter>
-                <action android:name="android.intent.action.MAIN" />
-
-                <category android:name="android.intent.category.LAUNCHER" />
-            </intent-filter>
-        </activity>
-    </application>
-
-</manifest>
--- a/examples/llama.android/app/src/main/cpp/CMakeLists.txt
+++ b/examples/llama.android/app/src/main/cpp/CMakeLists.txt
@@ -1,50 +0,0 @@
-
-# For more information about using CMake with Android Studio, read the
-# documentation: https://d.android.com/studio/projects/add-native-code.html.
-# For more examples on how to use CMake, see https://github.com/android/ndk-samples.
-
-# Sets the minimum CMake version required for this project.
-cmake_minimum_required(VERSION 3.22.1)
-
-# Declares the project name. The project name can be accessed via ${ PROJECT_NAME},
-# Since this is the top level CMakeLists.txt, the project name is also accessible
-# with ${CMAKE_PROJECT_NAME} (both CMake variables are in-sync within the top level
-# build script scope).
-project("llama-android")
-
-include(FetchContent)
-FetchContent_Declare(
-        llama
-        GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
-        GIT_TAG        master
-)
-
-# Also provides "common"
-FetchContent_MakeAvailable(llama)
-
-# Creates and names a library, sets it as either STATIC
-# or SHARED, and provides the relative paths to its source code.
-# You can define multiple libraries, and CMake builds them for you.
-# Gradle automatically packages shared libraries with your APK.
-#
-# In this top level CMakeLists.txt, ${CMAKE_PROJECT_NAME} is used to define
-# the target library name; in the sub-module's CMakeLists.txt, ${PROJECT_NAME}
-# is preferred for the same purpose.
-#
-# In order to load a library into your app from Java/Kotlin, you must call
-# System.loadLibrary() and pass the name of the library defined here;
-# for GameActivity/NativeActivity derived applications, the same library name must be
-# used in the AndroidManifest.xml file.
-add_library(${CMAKE_PROJECT_NAME} SHARED
-    # List C/C++ source files with relative paths to this CMakeLists.txt.
-    llama-android.cpp)
-
-# Specifies libraries CMake should link to your target library. You
-# can link libraries from various origins, such as libraries defined in this
-# build script, prebuilt third-party libraries, or Android system libraries.
-target_link_libraries(${CMAKE_PROJECT_NAME}
-    # List libraries link to the target library
-    llama
-    common
-    android
-    log)
--- a/examples/llama.android/app/src/main/cpp/llama-android.cpp
+++ b/examples/llama.android/app/src/main/cpp/llama-android.cpp
@@ -1,394 +0,0 @@
-#include <android/log.h>
-#include <jni.h>
-#include <iomanip>
-#include <math.h>
-#include <string>
-#include <unistd.h>
-#include "llama.h"
-#include "common/common.h"
-
-// Write C++ code here.
-//
-// Do not forget to dynamically load the C++ library into your application.
-//
-// For instance,
-//
-// In MainActivity.java:
-//    static {
-//       System.loadLibrary("llama-android");
-//    }
-//
-// Or, in MainActivity.kt:
-//    companion object {
-//      init {
-//         System.loadLibrary("llama-android")
-//      }
-//    }
-
-#define TAG "llama-android.cpp"
-#define LOGi(...) __android_log_print(ANDROID_LOG_INFO, TAG, __VA_ARGS__)
-#define LOGe(...) __android_log_print(ANDROID_LOG_ERROR, TAG, __VA_ARGS__)
-
-jclass la_int_var;
-jmethodID la_int_var_value;
-jmethodID la_int_var_inc;
-
-static void log_callback(ggml_log_level level, const char * fmt, void * data) {
-    if (level == GGML_LOG_LEVEL_ERROR)     __android_log_print(ANDROID_LOG_ERROR, TAG, fmt, data);
-    else if (level == GGML_LOG_LEVEL_INFO) __android_log_print(ANDROID_LOG_INFO, TAG, fmt, data);
-    else if (level == GGML_LOG_LEVEL_WARN) __android_log_print(ANDROID_LOG_WARN, TAG, fmt, data);
-    else __android_log_print(ANDROID_LOG_DEFAULT, TAG, fmt, data);
-}
-
-extern "C"
-JNIEXPORT jlong JNICALL
-Java_com_example_llama_Llm_load_1model(JNIEnv *env, jobject, jstring filename) {
-    llama_model_params model_params = llama_model_default_params();
-
-    auto path_to_model = env->GetStringUTFChars(filename, 0);
-    LOGi("Loading model from %s", path_to_model);
-
-    auto model = llama_load_model_from_file(path_to_model, model_params);
-    env->ReleaseStringUTFChars(filename, path_to_model);
-
-    if (!model) {
-        LOGe("load_model() failed");
-        env->ThrowNew(env->FindClass("java/lang/IllegalStateException"), "load_model() failed");
-        return 0;
-    }
-
-    return reinterpret_cast<jlong>(model);
-}
-
-extern "C"
-JNIEXPORT void JNICALL
-Java_com_example_llama_Llm_free_1model(JNIEnv *, jobject, jlong model) {
-    llama_free_model(reinterpret_cast<llama_model *>(model));
-}
-
-extern "C"
-JNIEXPORT jlong JNICALL
-Java_com_example_llama_Llm_new_1context(JNIEnv *env, jobject, jlong jmodel) {
-    auto model = reinterpret_cast<llama_model *>(jmodel);
-
-    if (!model) {
-        LOGe("new_context(): model cannot be null");
-        env->ThrowNew(env->FindClass("java/lang/IllegalArgumentException"), "Model cannot be null");
-        return 0;
-    }
-
-    int n_threads = std::max(1, std::min(8, (int) sysconf(_SC_NPROCESSORS_ONLN) - 2));
-    LOGi("Using %d threads", n_threads);
-
-    llama_context_params ctx_params = llama_context_default_params();
-    ctx_params.seed  = 1234;
-    ctx_params.n_ctx = 2048;
-    ctx_params.n_threads       = n_threads;
-    ctx_params.n_threads_batch = n_threads;
-
-    llama_context * context = llama_new_context_with_model(model, ctx_params);
-
-    if (!context) {
-        LOGe("llama_new_context_with_model() returned null)");
-        env->ThrowNew(env->FindClass("java/lang/IllegalStateException"),
-                      "llama_new_context_with_model() returned null)");
-        return 0;
-    }
-
-    return reinterpret_cast<jlong>(context);
-}
-
-extern "C"
-JNIEXPORT void JNICALL
-Java_com_example_llama_Llm_free_1context(JNIEnv *, jobject, jlong context) {
-    llama_free(reinterpret_cast<llama_context *>(context));
-}
-
-extern "C"
-JNIEXPORT void JNICALL
-Java_com_example_llama_Llm_backend_1free(JNIEnv *, jobject) {
-    llama_backend_free();
-}
-
-extern "C"
-JNIEXPORT void JNICALL
-Java_com_example_llama_Llm_log_1to_1android(JNIEnv *, jobject) {
-    llama_log_set(log_callback, NULL);
-}
-
-extern "C"
-JNIEXPORT jstring JNICALL
-Java_com_example_llama_Llm_bench_1model(
-        JNIEnv *env,
-        jobject,
-        jlong context_pointer,
-        jlong model_pointer,
-        jlong batch_pointer,
-        jint pp,
-        jint tg,
-        jint pl,
-        jint nr
-        ) {
-    auto pp_avg = 0.0;
-    auto tg_avg = 0.0;
-    auto pp_std = 0.0;
-    auto tg_std = 0.0;
-
-    const auto context = reinterpret_cast<llama_context *>(context_pointer);
-    const auto model = reinterpret_cast<llama_model *>(model_pointer);
-    const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);
-
-    const int n_ctx = llama_n_ctx(context);
-
-    LOGi("n_ctx = %d", n_ctx);
-
-    int i, j;
-    int nri;
-    for (nri = 0; nri < nr; nri++) {
-        LOGi("Benchmark prompt processing (pp)");
-
-        llama_batch_clear(*batch);
-
-        const int n_tokens = pp;
-        for (i = 0; i < n_tokens; i++) {
-            llama_batch_add(*batch, 0, i, { 0 }, false);
-        }
-
-        batch->logits[batch->n_tokens - 1] = true;
-        llama_kv_cache_clear(context);
-
-        const auto t_pp_start = ggml_time_us();
-        if (llama_decode(context, *batch) != 0) {
-            LOGi("llama_decode() failed during prompt processing");
-        }
-        const auto t_pp_end = ggml_time_us();
-
-        // bench text generation
-
-        LOGi("Benchmark text generation (tg)");
-
-        llama_kv_cache_clear(context);
-        const auto t_tg_start = ggml_time_us();
-        for (i = 0; i < tg; i++) {
-
-            llama_batch_clear(*batch);
-            for (j = 0; j < pl; j++) {
-                llama_batch_add(*batch, 0, i, { j }, true);
-            }
-
-            LOGi("llama_decode() text generation: %d", i);
-            if (llama_decode(context, *batch) != 0) {
-                LOGi("llama_decode() failed during text generation");
-            }
-        }
-
-        const auto t_tg_end = ggml_time_us();
-
-        llama_kv_cache_clear(context);
-
-        const auto t_pp = double(t_pp_end - t_pp_start) / 1000000.0;
-        const auto t_tg = double(t_tg_end - t_tg_start) / 1000000.0;
-
-        const auto speed_pp = double(pp) / t_pp;
-        const auto speed_tg = double(pl * tg) / t_tg;
-
-        pp_avg += speed_pp;
-        tg_avg += speed_tg;
-
-        pp_std += speed_pp * speed_pp;
-        tg_std += speed_tg * speed_tg;
-
-        LOGi("pp %f t/s, tg %f t/s", speed_pp, speed_tg);
-    }
-
-    pp_avg /= double(nr);
-    tg_avg /= double(nr);
-
-    if (nr > 1) {
-        pp_std = sqrt(pp_std / double(nr - 1) - pp_avg * pp_avg * double(nr) / double(nr - 1));
-        tg_std = sqrt(tg_std / double(nr - 1) - tg_avg * tg_avg * double(nr) / double(nr - 1));
-    } else {
-        pp_std = 0;
-        tg_std = 0;
-    }
-
-    char model_desc[128];
-    llama_model_desc(model, model_desc, sizeof(model_desc));
-
-    const auto model_size     = double(llama_model_size(model)) / 1024.0 / 1024.0 / 1024.0;
-    const auto model_n_params = double(llama_model_n_params(model)) / 1e9;
-
-    const auto backend    = "(Android)"; // TODO: What should this be?
-
-    std::stringstream result;
-    result << std::setprecision(2);
-    result << "| model | size | params | backend | test | t/s |\n";
-    result << "| --- | --- | --- | --- | --- | --- |\n";
-    result << "| " << model_desc << " | " << model_size << "GiB | " << model_n_params << "B | " << backend << " | pp " << pp << " | " << pp_avg << " ± " << pp_std << " |\n";
-    result << "| " << model_desc << " | " << model_size << "GiB | " << model_n_params << "B | " << backend << " | tg " << tg << " | " << tg_avg << " ± " << tg_std << " |\n";
-
-    return env->NewStringUTF(result.str().c_str());
-}
-
-extern "C"
-JNIEXPORT void JNICALL
-Java_com_example_llama_Llm_free_1batch(JNIEnv *, jobject, jlong batch_pointer) {
-    llama_batch_free(*reinterpret_cast<llama_batch *>(batch_pointer));
-}
-
-extern "C"
-JNIEXPORT jlong JNICALL
-Java_com_example_llama_Llm_new_1batch(JNIEnv *, jobject, jint n_tokens, jint embd, jint n_seq_max) {
-
-    // Source: Copy of llama.cpp:llama_batch_init but heap-allocated.
-
-    llama_batch *batch = new llama_batch {
-        0,
-        nullptr,
-        nullptr,
-        nullptr,
-        nullptr,
-        nullptr,
-        nullptr,
-        0,
-        0,
-        0,
-    };
-
-    if (embd) {
-        batch->embd = (float *) malloc(sizeof(float) * n_tokens * embd);
-    } else {
-        batch->token = (llama_token *) malloc(sizeof(llama_token) * n_tokens);
-    }
-
-    batch->pos      = (llama_pos *)     malloc(sizeof(llama_pos)      * n_tokens);
-    batch->n_seq_id = (int32_t *)       malloc(sizeof(int32_t)        * n_tokens);
-    batch->seq_id   = (llama_seq_id **) malloc(sizeof(llama_seq_id *) * n_tokens);
-    for (int i = 0; i < n_tokens; ++i) {
-        batch->seq_id[i] = (llama_seq_id *) malloc(sizeof(llama_seq_id) * n_seq_max);
-    }
-    batch->logits   = (int8_t *)        malloc(sizeof(int8_t)         * n_tokens);
-
-    return reinterpret_cast<jlong>(batch);
-}
-
-extern "C"
-JNIEXPORT void JNICALL
-Java_com_example_llama_Llm_backend_1init(JNIEnv *, jobject, jboolean numa) {
-    llama_backend_init(numa);
-}
-
-extern "C"
-JNIEXPORT jstring JNICALL
-Java_com_example_llama_Llm_system_1info(JNIEnv *env, jobject) {
-    return env->NewStringUTF(llama_print_system_info());
-}
-
-extern "C"
-JNIEXPORT jint JNICALL
-Java_com_example_llama_Llm_completion_1init(
-        JNIEnv *env,
-        jobject,
-        jlong context_pointer,
-        jlong batch_pointer,
-        jstring jtext,
-        jint n_len
-    ) {
-
-    const auto text = env->GetStringUTFChars(jtext, 0);
-    const auto context = reinterpret_cast<llama_context *>(context_pointer);
-    const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);
-
-    const auto tokens_list = llama_tokenize(context, text, 1);
-
-    auto n_ctx = llama_n_ctx(context);
-    auto n_kv_req = tokens_list.size() + (n_len - tokens_list.size());
-
-    LOGi("n_len = %d, n_ctx = %d, n_kv_req = %d", n_len, n_ctx, n_kv_req);
-
-    if (n_kv_req > n_ctx) {
-        LOGe("error: n_kv_req > n_ctx, the required KV cache size is not big enough");
-    }
-
-    for (auto id : tokens_list) {
-        LOGi("%s", llama_token_to_piece(context, id).c_str());
-    }
-
-    llama_batch_clear(*batch);
-
-    // evaluate the initial prompt
-    for (auto i = 0; i < tokens_list.size(); i++) {
-        llama_batch_add(*batch, tokens_list[i], i, { 0 }, false);
-    }
-
-    // llama_decode will output logits only for the last token of the prompt
-    batch->logits[batch->n_tokens - 1] = true;
-
-    if (llama_decode(context, *batch) != 0) {
-        LOGe("llama_decode() failed");
-    }
-
-    env->ReleaseStringUTFChars(jtext, text);
-
-    return batch->n_tokens;
-}
-
-extern "C"
-JNIEXPORT jstring JNICALL
-Java_com_example_llama_Llm_completion_1loop(
-        JNIEnv * env,
-        jobject,
-        jlong context_pointer,
-        jlong batch_pointer,
-        jint n_len,
-        jobject intvar_ncur
-) {
-    const auto context = reinterpret_cast<llama_context *>(context_pointer);
-    const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);
-    const auto model = llama_get_model(context);
-
-    if (!la_int_var) la_int_var = env->GetObjectClass(intvar_ncur);
-    if (!la_int_var_value) la_int_var_value = env->GetMethodID(la_int_var, "getValue", "()I");
-    if (!la_int_var_inc) la_int_var_inc = env->GetMethodID(la_int_var, "inc", "()V");
-
-    auto n_vocab = llama_n_vocab(model);
-    auto logits = llama_get_logits_ith(context, batch->n_tokens - 1);
-
-    std::vector<llama_token_data> candidates;
-    candidates.reserve(n_vocab);
-
-    for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-        candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
-    }
-
-    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-
-    // sample the most likely token
-    const auto new_token_id = llama_sample_token_greedy(context, &candidates_p);
-
-    const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value);
-    if (new_token_id == llama_token_eos(model) || n_cur == n_len) {
-        return env->NewStringUTF("");
-    }
-
-    auto new_token_chars = llama_token_to_piece(context, new_token_id);
-    LOGi("new_token_chars: `%s`", new_token_chars.c_str());
-    auto new_token = env->NewStringUTF(new_token_chars.c_str());
-
-    llama_batch_clear(*batch);
-    llama_batch_add(*batch, new_token_id, n_cur, { 0 }, true);
-
-    env->CallVoidMethod(intvar_ncur, la_int_var_inc);
-
-    if (llama_decode(context, *batch) != 0) {
-        LOGe("llama_decode() returned null");
-    }
-
-    return new_token;
-}
-
-extern "C"
-JNIEXPORT void JNICALL
-Java_com_example_llama_Llm_kv_1cache_1clear(JNIEnv *, jobject, jlong context) {
-    llama_kv_cache_clear(reinterpret_cast<llama_context *>(context));
-}
--- a/examples/llama.android/app/src/main/java/com/example/llama/Downloadable.kt
+++ b/examples/llama.android/app/src/main/java/com/example/llama/Downloadable.kt
@@ -1,119 +0,0 @@
-package com.example.llama
-
-import android.app.DownloadManager
-import android.net.Uri
-import android.util.Log
-import androidx.compose.material3.Button
-import androidx.compose.material3.Text
-import androidx.compose.runtime.Composable
-import androidx.compose.runtime.getValue
-import androidx.compose.runtime.mutableDoubleStateOf
-import androidx.compose.runtime.mutableStateOf
-import androidx.compose.runtime.remember
-import androidx.compose.runtime.rememberCoroutineScope
-import androidx.compose.runtime.setValue
-import androidx.core.database.getLongOrNull
-import androidx.core.net.toUri
-import kotlinx.coroutines.delay
-import kotlinx.coroutines.launch
-import java.io.File
-
-data class Downloadable(val name: String, val source: Uri, val destination: File) {
-    companion object {
-        @JvmStatic
-        private val tag: String? = this::class.qualifiedName
-
-        sealed interface State
-        data object Ready: State
-        data class Downloading(val id: Long): State
-        data class Downloaded(val downloadable: Downloadable): State
-        data class Error(val message: String): State
-
-        @JvmStatic
-        @Composable
-        fun Button(viewModel: MainViewModel, dm: DownloadManager, item: Downloadable) {
-            var status: State by remember {
-                mutableStateOf(
-                    if (item.destination.exists()) Downloaded(item)
-                    else Ready
-                )
-            }
-            var progress by remember { mutableDoubleStateOf(0.0) }
-
-            val coroutineScope = rememberCoroutineScope()
-
-            suspend fun waitForDownload(result: Downloading, item: Downloadable): State {
-                while (true) {
-                    val cursor = dm.query(DownloadManager.Query().setFilterById(result.id))
-
-                    if (cursor == null) {
-                        Log.e(tag, "dm.query() returned null")
-                        return Error("dm.query() returned null")
-                    }
-
-                    if (!cursor.moveToFirst() || cursor.count < 1) {
-                        cursor.close()
-                        Log.i(tag, "cursor.moveToFirst() returned false or cursor.count < 1, download canceled?")
-                        return Ready
-                    }
-
-                    val pix = cursor.getColumnIndex(DownloadManager.COLUMN_BYTES_DOWNLOADED_SO_FAR)
-                    val tix = cursor.getColumnIndex(DownloadManager.COLUMN_TOTAL_SIZE_BYTES)
-                    val sofar = cursor.getLongOrNull(pix) ?: 0
-                    val total = cursor.getLongOrNull(tix) ?: 1
-                    cursor.close()
-
-                    if (sofar == total) {
-                        return Downloaded(item)
-                    }
-
-                    progress = (sofar * 1.0) / total
-
-                    delay(1000L)
-                }
-            }
-
-            fun onClick() {
-                when (val s = status) {
-                    is Downloaded -> {
-                        viewModel.load(item.destination.path)
-                    }
-
-                    is Downloading -> {
-                        coroutineScope.launch {
-                            status = waitForDownload(s, item)
-                        }
-                    }
-
-                    else -> {
-                        item.destination.delete()
-
-                        val request = DownloadManager.Request(item.source).apply {
-                            setTitle("Downloading model")
-                            setDescription("Downloading model: ${item.name}")
-                            setAllowedNetworkTypes(DownloadManager.Request.NETWORK_WIFI)
-                            setDestinationUri(item.destination.toUri())
-                        }
-
-                        viewModel.log("Saving ${item.name} to ${item.destination.path}")
-                        Log.i(tag, "Saving ${item.name} to ${item.destination.path}")
-
-                        val id = dm.enqueue(request)
-                        status = Downloading(id)
-                        onClick()
-                    }
-                }
-            }
-
-            Button(onClick = { onClick() }, enabled = status !is Downloading) {
-                when (status) {
-                    is Downloading -> Text(text = "Downloading ${(progress * 100).toInt()}%")
-                    is Downloaded -> Text("Load ${item.name}")
-                    is Ready -> Text("Download ${item.name}")
-                    is Error -> Text("Download ${item.name}")
-                }
-            }
-        }
-
-    }
-}
--- a/examples/llama.android/app/src/main/java/com/example/llama/Llm.kt
+++ b/examples/llama.android/app/src/main/java/com/example/llama/Llm.kt
@@ -1,172 +0,0 @@
-package com.example.llama
-
-import android.util.Log
-import kotlinx.coroutines.CoroutineDispatcher
-import kotlinx.coroutines.asCoroutineDispatcher
-import kotlinx.coroutines.flow.Flow
-import kotlinx.coroutines.flow.flow
-import kotlinx.coroutines.flow.flowOn
-import kotlinx.coroutines.withContext
-import java.util.concurrent.Executors
-import kotlin.concurrent.thread
-
-class Llm {
-    private val tag: String? = this::class.simpleName
-
-    private val threadLocalState: ThreadLocal<State> = ThreadLocal.withInitial { State.Idle }
-
-    private val runLoop: CoroutineDispatcher = Executors.newSingleThreadExecutor {
-        thread(start = false, name = "Llm-RunLoop") {
-            Log.d(tag, "Dedicated thread for native code: ${Thread.currentThread().name}")
-
-            // No-op if called more than once.
-            System.loadLibrary("llama-android")
-
-            // Set llama log handler to Android
-            log_to_android()
-            backend_init(false)
-
-            Log.d(tag, system_info())
-
-            it.run()
-        }.apply {
-            uncaughtExceptionHandler = Thread.UncaughtExceptionHandler { _, exception: Throwable ->
-                Log.e(tag, "Unhandled exception", exception)
-            }
-        }
-    }.asCoroutineDispatcher()
-
-    private val nlen: Int = 64
-
-    private external fun log_to_android()
-    private external fun load_model(filename: String): Long
-    private external fun free_model(model: Long)
-    private external fun new_context(model: Long): Long
-    private external fun free_context(context: Long)
-    private external fun backend_init(numa: Boolean)
-    private external fun backend_free()
-    private external fun free_batch(batch: Long)
-    private external fun new_batch(nTokens: Int, embd: Int, nSeqMax: Int): Long
-    private external fun bench_model(
-        context: Long,
-        model: Long,
-        batch: Long,
-        pp: Int,
-        tg: Int,
-        pl: Int,
-        nr: Int
-    ): String
-
-    private external fun system_info(): String
-
-    private external fun completion_init(
-        context: Long,
-        batch: Long,
-        text: String,
-        nLen: Int
-    ): Int
-
-    private external fun completion_loop(
-        context: Long,
-        batch: Long,
-        nLen: Int,
-        ncur: IntVar
-    ): String
-
-    private external fun kv_cache_clear(context: Long)
-
-    suspend fun bench(pp: Int, tg: Int, pl: Int, nr: Int = 1): String {
-        return withContext(runLoop) {
-            when (val state = threadLocalState.get()) {
-                is State.Loaded -> {
-                    Log.d(tag, "bench(): $state")
-                    bench_model(state.context, state.model, state.batch, pp, tg, pl, nr)
-                }
-
-                else -> throw IllegalStateException("No model loaded")
-            }
-        }
-    }
-
-    suspend fun load(pathToModel: String) {
-        withContext(runLoop) {
-            when (threadLocalState.get()) {
-                is State.Idle -> {
-                    val model = load_model(pathToModel)
-                    if (model == 0L)  throw IllegalStateException("load_model() failed")
-
-                    val context = new_context(model)
-                    if (context == 0L) throw IllegalStateException("new_context() failed")
-
-                    val batch = new_batch(512, 0, 1)
-                    if (batch == 0L) throw IllegalStateException("new_batch() failed")
-
-                    Log.i(tag, "Loaded model $pathToModel")
-                    threadLocalState.set(State.Loaded(model, context, batch))
-                }
-                else -> throw IllegalStateException("Model already loaded")
-            }
-        }
-    }
-
-    fun send(message: String): Flow<String> = flow {
-        when (val state = threadLocalState.get()) {
-            is State.Loaded -> {
-                val ncur = IntVar(completion_init(state.context, state.batch, message, nlen))
-                while (ncur.value <= nlen) {
-                    val str = completion_loop(state.context, state.batch, nlen, ncur)
-                    if (str.isEmpty()) {
-                        break
-                    }
-                    emit(str)
-                }
-                kv_cache_clear(state.context)
-            }
-            else -> {}
-        }
-    }.flowOn(runLoop)
-
-    /**
-     * Unloads the model and frees resources.
-     *
-     * This is a no-op if there's no model loaded.
-     */
-    suspend fun unload() {
-        withContext(runLoop) {
-            when (val state = threadLocalState.get()) {
-                is State.Loaded -> {
-                    free_context(state.context)
-                    free_model(state.model)
-                    free_batch(state.batch)
-
-                    threadLocalState.set(State.Idle)
-                }
-                else -> {}
-            }
-        }
-    }
-
-    companion object {
-        private class IntVar(value: Int) {
-            @Volatile
-            var value: Int = value
-                private set
-
-            fun inc() {
-                synchronized(this) {
-                    value += 1
-                }
-            }
-        }
-
-        private sealed interface State {
-            data object Idle: State
-            data class Loaded(val model: Long, val context: Long, val batch: Long): State
-        }
-
-        // Enforce only one instance of Llm.
-        private val _instance: Llm = Llm()
-
-        fun instance(): Llm = _instance
-    }
-}
--- a/examples/llama.android/app/src/main/java/com/example/llama/MainActivity.kt
+++ b/examples/llama.android/app/src/main/java/com/example/llama/MainActivity.kt
@@ -1,154 +0,0 @@
-package com.example.llama
-
-import android.app.ActivityManager
-import android.app.DownloadManager
-import android.content.ClipData
-import android.content.ClipboardManager
-import android.net.Uri
-import android.os.Bundle
-import android.os.StrictMode
-import android.os.StrictMode.VmPolicy
-import android.text.format.Formatter
-import androidx.activity.ComponentActivity
-import androidx.activity.compose.setContent
-import androidx.activity.viewModels
-import androidx.compose.foundation.layout.Box
-import androidx.compose.foundation.layout.Column
-import androidx.compose.foundation.layout.Row
-import androidx.compose.foundation.layout.fillMaxSize
-import androidx.compose.foundation.layout.padding
-import androidx.compose.foundation.lazy.LazyColumn
-import androidx.compose.foundation.lazy.items
-import androidx.compose.foundation.lazy.rememberLazyListState
-import androidx.compose.material3.Button
-import androidx.compose.material3.LocalContentColor
-import androidx.compose.material3.MaterialTheme
-import androidx.compose.material3.OutlinedTextField
-import androidx.compose.material3.Surface
-import androidx.compose.material3.Text
-import androidx.compose.runtime.Composable
-import androidx.compose.ui.Modifier
-import androidx.compose.ui.unit.dp
-import androidx.core.content.getSystemService
-import com.example.llama.ui.theme.LlamaAndroidTheme
-import java.io.File
-
-class MainActivity(
-    activityManager: ActivityManager? = null,
-    downloadManager: DownloadManager? = null,
-    clipboardManager: ClipboardManager? = null,
-): ComponentActivity() {
-    private val tag: String? = this::class.simpleName
-
-    private val activityManager by lazy { activityManager ?: getSystemService<ActivityManager>()!! }
-    private val downloadManager by lazy { downloadManager ?: getSystemService<DownloadManager>()!! }
-    private val clipboardManager by lazy { clipboardManager ?: getSystemService<ClipboardManager>()!! }
-
-    private val viewModel: MainViewModel by viewModels()
-
-    // Get a MemoryInfo object for the device's current memory status.
-    private fun availableMemory(): ActivityManager.MemoryInfo {
-        return ActivityManager.MemoryInfo().also { memoryInfo ->
-            activityManager.getMemoryInfo(memoryInfo)
-        }
-    }
-
-    override fun onCreate(savedInstanceState: Bundle?) {
-        super.onCreate(savedInstanceState)
-
-        StrictMode.setVmPolicy(
-            VmPolicy.Builder(StrictMode.getVmPolicy())
-                .detectLeakedClosableObjects()
-                .build()
-        )
-
-        val free = Formatter.formatFileSize(this, availableMemory().availMem)
-        val total = Formatter.formatFileSize(this, availableMemory().totalMem)
-
-        viewModel.log("Current memory: $free / $total")
-        viewModel.log("Downloads directory: ${getExternalFilesDir(null)}")
-
-        val extFilesDir = getExternalFilesDir(null)
-
-        val models = listOf(
-            Downloadable(
-                "Phi-2 7B (Q4_0, 1.6 GiB)",
-                Uri.parse("https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf?download=true"),
-                File(extFilesDir, "phi-2-q4_0.gguf"),
-            ),
-            Downloadable(
-                "TinyLlama 1.1B (f16, 2.2 GiB)",
-                Uri.parse("https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf?download=true"),
-                File(extFilesDir, "tinyllama-1.1-f16.gguf"),
-            ),
-            Downloadable(
-                "Phi 2 DPO (Q3_K_M, 1.48 GiB)",
-                Uri.parse("https://huggingface.co/TheBloke/phi-2-dpo-GGUF/resolve/main/phi-2-dpo.Q3_K_M.gguf?download=true"),
-                File(extFilesDir, "phi-2-dpo.Q3_K_M.gguf")
-            ),
-        )
-
-        setContent {
-            LlamaAndroidTheme {
-                // A surface container using the 'background' color from the theme
-                Surface(
-                    modifier = Modifier.fillMaxSize(),
-                    color = MaterialTheme.colorScheme.background
-                ) {
-                    MainCompose(
-                        viewModel,
-                        clipboardManager,
-                        downloadManager,
-                        models,
-                    )
-                }
-
-            }
-        }
-    }
-}
-
-@Composable
-fun MainCompose(
-    viewModel: MainViewModel,
-    clipboard: ClipboardManager,
-    dm: DownloadManager,
-    models: List<Downloadable>
-) {
-    Column {
-        val scrollState = rememberLazyListState()
-
-        Box(modifier = Modifier.weight(1f)) {
-            LazyColumn(state = scrollState) {
-                items(viewModel.messages) {
-                    Text(
-                        it,
-                        style = MaterialTheme.typography.bodyLarge.copy(color = LocalContentColor.current),
-                        modifier = Modifier.padding(16.dp)
-                    )
-                }
-            }
-        }
-        OutlinedTextField(
-            value = viewModel.message,
-            onValueChange = { viewModel.updateMessage(it) },
-            label = { Text("Message") },
-        )
-        Row {
-            Button({ viewModel.send() }) { Text("Send") }
-            Button({ viewModel.bench(8, 4, 1) }) { Text("Bench") }
-            Button({ viewModel.clear() }) { Text("Clear") }
-            Button({
-                viewModel.messages.joinToString("\n").let {
-                    clipboard.setPrimaryClip(ClipData.newPlainText("", it))
-                }
-            }) { Text("Copy") }
-        }
-
-        Column {
-            for (model in models) {
-                Downloadable.Button(viewModel, dm, model)
-            }
-        }
-    }
-}
--- a/examples/llama.android/app/src/main/java/com/example/llama/MainViewModel.kt
+++ b/examples/llama.android/app/src/main/java/com/example/llama/MainViewModel.kt
@@ -1,104 +0,0 @@
-package com.example.llama
-
-import android.util.Log
-import androidx.compose.runtime.getValue
-import androidx.compose.runtime.mutableStateOf
-import androidx.compose.runtime.setValue
-import androidx.lifecycle.ViewModel
-import androidx.lifecycle.viewModelScope
-import kotlinx.coroutines.flow.catch
-import kotlinx.coroutines.launch
-
-class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
-    companion object {
-        @JvmStatic
-        private val NanosPerSecond = 1_000_000_000.0
-    }
-
-    private val tag: String? = this::class.simpleName
-
-    var messages by mutableStateOf(listOf("Initializing..."))
-        private set
-
-    var message by mutableStateOf("")
-        private set
-
-    override fun onCleared() {
-        super.onCleared()
-
-        viewModelScope.launch {
-            try {
-                llm.unload()
-            } catch (exc: IllegalStateException) {
-                messages += exc.message!!
-            }
-        }
-    }
-
-    fun send() {
-        val text = message
-        message = ""
-
-        // Add to messages console.
-        messages += text
-        messages += ""
-
-        viewModelScope.launch {
-            llm.send(text)
-                .catch {
-                    Log.e(tag, "send() failed", it)
-                    messages += it.message!!
-                }
-                .collect { messages = messages.dropLast(1) + (messages.last() + it) }
-        }
-    }
-
-    fun bench(pp: Int, tg: Int, pl: Int, nr: Int = 1) {
-        viewModelScope.launch {
-            try {
-                val start = System.nanoTime()
-                val warmupResult = llm.bench(pp, tg, pl, nr)
-                val end = System.nanoTime()
-
-                messages += warmupResult
-
-                val warmup = (end - start).toDouble() / NanosPerSecond
-                messages += "Warm up time: $warmup seconds, please wait..."
-
-                if (warmup > 5.0) {
-                    messages += "Warm up took too long, aborting benchmark"
-                    return@launch
-                }
-
-                messages += llm.bench(512, 128, 1, 3)
-            } catch (exc: IllegalStateException) {
-                Log.e(tag, "bench() failed", exc)
-                messages += exc.message!!
-            }
-        }
-    }
-
-    fun load(pathToModel: String) {
-        viewModelScope.launch {
-            try {
-                llm.load(pathToModel)
-                messages += "Loaded $pathToModel"
-            } catch (exc: IllegalStateException) {
-                Log.e(tag, "load() failed", exc)
-                messages += exc.message!!
-            }
-        }
-    }
-
-    fun updateMessage(newMessage: String) {
-        message = newMessage
-    }
-
-    fun clear() {
-        messages = listOf()
-    }
-
-    fun log(message: String) {
-        messages += message
-    }
-}
--- a/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Color.kt
+++ b/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Color.kt
@@ -1,11 +0,0 @@
-package com.example.llama.ui.theme
-
-import androidx.compose.ui.graphics.Color
-
-val Purple80 = Color(0xFFD0BCFF)
-val PurpleGrey80 = Color(0xFFCCC2DC)
-val Pink80 = Color(0xFFEFB8C8)
-
-val Purple40 = Color(0xFF6650a4)
-val PurpleGrey40 = Color(0xFF625b71)
-val Pink40 = Color(0xFF7D5260)
--- a/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Theme.kt
+++ b/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Theme.kt
@@ -1,70 +0,0 @@
-package com.example.llama.ui.theme
-
-import android.app.Activity
-import android.os.Build
-import androidx.compose.foundation.isSystemInDarkTheme
-import androidx.compose.material3.MaterialTheme
-import androidx.compose.material3.darkColorScheme
-import androidx.compose.material3.dynamicDarkColorScheme
-import androidx.compose.material3.dynamicLightColorScheme
-import androidx.compose.material3.lightColorScheme
-import androidx.compose.runtime.Composable
-import androidx.compose.runtime.SideEffect
-import androidx.compose.ui.graphics.toArgb
-import androidx.compose.ui.platform.LocalContext
-import androidx.compose.ui.platform.LocalView
-import androidx.core.view.WindowCompat
-
-private val DarkColorScheme = darkColorScheme(
-    primary = Purple80,
-    secondary = PurpleGrey80,
-    tertiary = Pink80
-)
-
-private val LightColorScheme = lightColorScheme(
-    primary = Purple40,
-    secondary = PurpleGrey40,
-    tertiary = Pink40
-
-    /* Other default colors to override
-    background = Color(0xFFFFFBFE),
-    surface = Color(0xFFFFFBFE),
-    onPrimary = Color.White,
-    onSecondary = Color.White,
-    onTertiary = Color.White,
-    onBackground = Color(0xFF1C1B1F),
-    onSurface = Color(0xFF1C1B1F),
-    */
-)
-
-@Composable
-fun LlamaAndroidTheme(
-    darkTheme: Boolean = isSystemInDarkTheme(),
-    // Dynamic color is available on Android 12+
-    dynamicColor: Boolean = true,
-    content: @Composable () -> Unit
-) {
-    val colorScheme = when {
-        dynamicColor && Build.VERSION.SDK_INT >= Build.VERSION_CODES.S -> {
-            val context = LocalContext.current
-            if (darkTheme) dynamicDarkColorScheme(context) else dynamicLightColorScheme(context)
-        }
-
-        darkTheme -> DarkColorScheme
-        else -> LightColorScheme
-    }
-    val view = LocalView.current
-    if (!view.isInEditMode) {
-        SideEffect {
-            val window = (view.context as Activity).window
-            window.statusBarColor = colorScheme.primary.toArgb()
-            WindowCompat.getInsetsController(window, view).isAppearanceLightStatusBars = darkTheme
-        }
-    }
-
-    MaterialTheme(
-        colorScheme = colorScheme,
-        typography = Typography,
-        content = content
-    )
-}
--- a/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Type.kt
+++ b/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Type.kt
@@ -1,34 +0,0 @@
-package com.example.llama.ui.theme
-
-import androidx.compose.material3.Typography
-import androidx.compose.ui.text.TextStyle
-import androidx.compose.ui.text.font.FontFamily
-import androidx.compose.ui.text.font.FontWeight
-import androidx.compose.ui.unit.sp
-
-// Set of Material typography styles to start with
-val Typography = Typography(
-    bodyLarge = TextStyle(
-        fontFamily = FontFamily.Default,
-        fontWeight = FontWeight.Normal,
-        fontSize = 16.sp,
-        lineHeight = 24.sp,
-        letterSpacing = 0.5.sp
-    )
-    /* Other default text styles to override
-    titleLarge = TextStyle(
-        fontFamily = FontFamily.Default,
-        fontWeight = FontWeight.Normal,
-        fontSize = 22.sp,
-        lineHeight = 28.sp,
-        letterSpacing = 0.sp
-    ),
-    labelSmall = TextStyle(
-        fontFamily = FontFamily.Default,
-        fontWeight = FontWeight.Medium,
-        fontSize = 11.sp,
-        lineHeight = 16.sp,
-        letterSpacing = 0.5.sp
-    )
-    */
-)
--- a/examples/llama.android/app/src/main/res/drawable/ic_launcher_background.xml
+++ b/examples/llama.android/app/src/main/res/drawable/ic_launcher_background.xml
@@ -1,170 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?>
-<vector xmlns:android="http://schemas.android.com/apk/res/android"
-    android:width="108dp"
-    android:height="108dp"
-    android:viewportWidth="108"
-    android:viewportHeight="108">
-    <path
-        android:fillColor="#3DDC84"
-        android:pathData="M0,0h108v108h-108z" />
-    <path
-        android:fillColor="#00000000"
-        android:pathData="M9,0L9,108"
-        android:strokeWidth="0.8"
-        android:strokeColor="#33FFFFFF" />
-    <path
-        android:fillColor="#00000000"
-        android:pathData="M19,0L19,108"
-        android:strokeWidth="0.8"
-        android:strokeColor="#33FFFFFF" />
-    <path
-        android:fillColor="#00000000"
-        android:pathData="M29,0L29,108"
-        android:strokeWidth="0.8"
-        android:strokeColor="#33FFFFFF" />
-    <path
-        android:fillColor="#00000000"
-        android:pathData="M39,0L39,108"
-        android:strokeWidth="0.8"
-        android:strokeColor="#33FFFFFF" />
-    <path
-        android:fillColor="#00000000"
-        android:pathData="M49,0L49,108"
-        android:strokeWidth="0.8"
-        android:strokeColor="#33FFFFFF" />
-    <path
-        android:fillColor="#00000000"
-        android:pathData="M59,0L59,108"
-        android:strokeWidth="0.8"
-        android:strokeColor="#33FFFFFF" />
-    <path
-        android:fillColor="#00000000"
-        android:pathData="M69,0L69,108"
-        android:strokeWidth="0.8"
-        android:strokeColor="#33FFFFFF" />
-    <path
-        android:fillColor="#00000000"
-        android:pathData="M79,0L79,108"
-        android:strokeWidth="0.8"
-        android:strokeColor="#33FFFFFF" />
-    <path
-        android:fillColor="#00000000"
-        android:pathData="M89,0L89,108"
-        android:strokeWidth="0.8"
-        android:strokeColor="#33FFFFFF" />
-    <path
-        android:fillColor="#00000000"
-        android:pathData="M99,0L99,108"
-        android:strokeWidth="0.8"
-        android:strokeColor="#33FFFFFF" />
-    <path
-        android:fillColor="#00000000"
-        android:pathData="M0,9L108,9"
-        android:strokeWidth="0.8"
-        android:strokeColor="#33FFFFFF" />
-    <path
-        android:fillColor="#00000000"
-        android:pathData="M0,19L108,19"
-        android:strokeWidth="0.8"
-        android:strokeColor="#33FFFFFF" />
-    <path
-        android:fillColor="#00000000"
-        android:pathData="M0,29L108,29"
-        android:strokeWidth="0.8"
-        android:strokeColor="#33FFFFFF" />
-    <path
-        android:fillColor="#00000000"
-        android:pathData="M0,39L108,39"
-        android:strokeWidth="0.8"
-        android:strokeColor="#33FFFFFF" />
-    <path
-        android:fillColor="#00000000"
-        android:pathData="M0,49L108,49"
-        android:strokeWidth="0.8"
-        android:strokeColor="#33FFFFFF" />
-    <path
-        android:fillColor="#00000000"
-        android:pathData="M0,59L108,59"
-        android:strokeWidth="0.8"
-        android:strokeColor="#33FFFFFF" />
-    <path
-        android:fillColor="#00000000"
-        android:pathData="M0,69L108,69"
-        android:strokeWidth="0.8"
-        android:strokeColor="#33FFFFFF" />
-    <path
-        android:fillColor="#00000000"
-        android:pathData="M0,79L108,79"
-        android:strokeWidth="0.8"
-        android:strokeColor="#33FFFFFF" />
-    <path
-        android:fillColor="#00000000"
-        android:pathData="M0,89L108,89"
-        android:strokeWidth="0.8"
-        android:strokeColor="#33FFFFFF" />
-    <path
-        android:fillColor="#00000000"
-        android:pathData="M0,99L108,99"
-        android:strokeWidth="0.8"
-        android:strokeColor="#33FFFFFF" />
-    <path
-        android:fillColor="#00000000"
-        android:pathData="M19,29L89,29"
-        android:strokeWidth="0.8"
-        android:strokeColor="#33FFFFFF" />
-    <path
-        android:fillColor="#00000000"
-        android:pathData="M19,39L89,39"
-        android:strokeWidth="0.8"
-        android:strokeColor="#33FFFFFF" />
-    <path
-        android:fillColor="#00000000"
-        android:pathData="M19,49L89,49"
-        android:strokeWidth="0.8"
-        android:strokeColor="#33FFFFFF" />
-    <path
-        android:fillColor="#00000000"
-        android:pathData="M19,59L89,59"
-        android:strokeWidth="0.8"
-        android:strokeColor="#33FFFFFF" />
-    <path
-        android:fillColor="#00000000"
-        android:pathData="M19,69L89,69"
-        android:strokeWidth="0.8"
-        android:strokeColor="#33FFFFFF" />
-    <path
-        android:fillColor="#00000000"
-        android:pathData="M19,79L89,79"
-        android:strokeWidth="0.8"
-        android:strokeColor="#33FFFFFF" />
-    <path
-        android:fillColor="#00000000"
-        android:pathData="M29,19L29,89"
-        android:strokeWidth="0.8"
-        android:strokeColor="#33FFFFFF" />
-    <path
-        android:fillColor="#00000000"
-        android:pathData="M39,19L39,89"
-        android:strokeWidth="0.8"
-        android:strokeColor="#33FFFFFF" />
-    <path
-        android:fillColor="#00000000"
-        android:pathData="M49,19L49,89"
-        android:strokeWidth="0.8"
-        android:strokeColor="#33FFFFFF" />
-    <path
-        android:fillColor="#00000000"
-        android:pathData="M59,19L59,89"
-        android:strokeWidth="0.8"
-        android:strokeColor="#33FFFFFF" />
-    <path
-        android:fillColor="#00000000"
-        android:pathData="M69,19L69,89"
-        android:strokeWidth="0.8"
-        android:strokeColor="#33FFFFFF" />
-    <path
-        android:fillColor="#00000000"
-        android:pathData="M79,19L79,89"
-        android:strokeWidth="0.8"
-        android:strokeColor="#33FFFFFF" />
-</vector>
--- a/examples/llama.android/app/src/main/res/drawable/ic_launcher_foreground.xml
+++ b/examples/llama.android/app/src/main/res/drawable/ic_launcher_foreground.xml
@@ -1,30 +0,0 @@
-<vector xmlns:android="http://schemas.android.com/apk/res/android"
-    xmlns:aapt="http://schemas.android.com/aapt"
-    android:width="108dp"
-    android:height="108dp"
-    android:viewportWidth="108"
-    android:viewportHeight="108">
-    <path android:pathData="M31,63.928c0,0 6.4,-11 12.1,-13.1c7.2,-2.6 26,-1.4 26,-1.4l38.1,38.1L107,108.928l-32,-1L31,63.928z">
-        <aapt:attr name="android:fillColor">
-            <gradient
-                android:endX="85.84757"
-                android:endY="92.4963"
-                android:startX="42.9492"
-                android:startY="49.59793"
-                android:type="linear">
-                <item
-                    android:color="#44000000"
-                    android:offset="0.0" />
-                <item
-                    android:color="#00000000"
-                    android:offset="1.0" />
-            </gradient>
-        </aapt:attr>
-    </path>
-    <path
-        android:fillColor="#FFFFFF"
-        android:fillType="nonZero"
-        android:pathData="M65.3,45.828l3.8,-6.6c0.2,-0.4 0.1,-0.9 -0.3,-1.1c-0.4,-0.2 -0.9,-0.1 -1.1,0.3l-3.9,6.7c-6.3,-2.8 -13.4,-2.8 -19.7,0l-3.9,-6.7c-0.2,-0.4 -0.7,-0.5 -1.1,-0.3C38.8,38.328 38.7,38.828 38.9,39.228l3.8,6.6C36.2,49.428 31.7,56.028 31,63.928h46C76.3,56.028 71.8,49.428 65.3,45.828zM43.4,57.328c-0.8,0 -1.5,-0.5 -1.8,-1.2c-0.3,-0.7 -0.1,-1.5 0.4,-2.1c0.5,-0.5 1.4,-0.7 2.1,-0.4c0.7,0.3 1.2,1 1.2,1.8C45.3,56.528 44.5,57.328 43.4,57.328L43.4,57.328zM64.6,57.328c-0.8,0 -1.5,-0.5 -1.8,-1.2s-0.1,-1.5 0.4,-2.1c0.5,-0.5 1.4,-0.7 2.1,-0.4c0.7,0.3 1.2,1 1.2,1.8C66.5,56.528 65.6,57.328 64.6,57.328L64.6,57.328z"
-        android:strokeWidth="1"
-        android:strokeColor="#00000000" />
-</vector>
--- a/examples/llama.android/app/src/main/res/mipmap-anydpi/ic_launcher.xml
+++ b/examples/llama.android/app/src/main/res/mipmap-anydpi/ic_launcher.xml
@@ -1,6 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?>
-<adaptive-icon xmlns:android="http://schemas.android.com/apk/res/android">
-    <background android:drawable="@drawable/ic_launcher_background" />
-    <foreground android:drawable="@drawable/ic_launcher_foreground" />
-    <monochrome android:drawable="@drawable/ic_launcher_foreground" />
-</adaptive-icon>
--- a/examples/llama.android/app/src/main/res/mipmap-anydpi/ic_launcher_round.xml
+++ b/examples/llama.android/app/src/main/res/mipmap-anydpi/ic_launcher_round.xml
@@ -1,6 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?>
-<adaptive-icon xmlns:android="http://schemas.android.com/apk/res/android">
-    <background android:drawable="@drawable/ic_launcher_background" />
-    <foreground android:drawable="@drawable/ic_launcher_foreground" />
-    <monochrome android:drawable="@drawable/ic_launcher_foreground" />
-</adaptive-icon>
--- a/examples/llama.android/app/src/main/res/mipmap-hdpi/ic_launcher.webp
+++ b/examples/llama.android/app/src/main/res/mipmap-hdpi/ic_launcher.webp
--- a/examples/llama.android/app/src/main/res/mipmap-hdpi/ic_launcher_round.webp
+++ b/examples/llama.android/app/src/main/res/mipmap-hdpi/ic_launcher_round.webp
--- a/examples/llama.android/app/src/main/res/mipmap-mdpi/ic_launcher.webp
+++ b/examples/llama.android/app/src/main/res/mipmap-mdpi/ic_launcher.webp
--- a/examples/llama.android/app/src/main/res/mipmap-mdpi/ic_launcher_round.webp
+++ b/examples/llama.android/app/src/main/res/mipmap-mdpi/ic_launcher_round.webp
--- a/examples/llama.android/app/src/main/res/mipmap-xhdpi/ic_launcher.webp
+++ b/examples/llama.android/app/src/main/res/mipmap-xhdpi/ic_launcher.webp
--- a/examples/llama.android/app/src/main/res/mipmap-xhdpi/ic_launcher_round.webp
+++ b/examples/llama.android/app/src/main/res/mipmap-xhdpi/ic_launcher_round.webp
--- a/examples/llama.android/app/src/main/res/mipmap-xxhdpi/ic_launcher.webp
+++ b/examples/llama.android/app/src/main/res/mipmap-xxhdpi/ic_launcher.webp
--- a/examples/llama.android/app/src/main/res/mipmap-xxhdpi/ic_launcher_round.webp
+++ b/examples/llama.android/app/src/main/res/mipmap-xxhdpi/ic_launcher_round.webp
--- a/examples/llama.android/app/src/main/res/mipmap-xxxhdpi/ic_launcher.webp
+++ b/examples/llama.android/app/src/main/res/mipmap-xxxhdpi/ic_launcher.webp
--- a/examples/llama.android/app/src/main/res/mipmap-xxxhdpi/ic_launcher_round.webp
+++ b/examples/llama.android/app/src/main/res/mipmap-xxxhdpi/ic_launcher_round.webp
--- a/examples/llama.android/app/src/main/res/values/colors.xml
+++ b/examples/llama.android/app/src/main/res/values/colors.xml
@@ -1,10 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?>
-<resources>
-    <color name="purple_200">#FFBB86FC</color>
-    <color name="purple_500">#FF6200EE</color>
-    <color name="purple_700">#FF3700B3</color>
-    <color name="teal_200">#FF03DAC5</color>
-    <color name="teal_700">#FF018786</color>
-    <color name="black">#FF000000</color>
-    <color name="white">#FFFFFFFF</color>
-</resources>
--- a/examples/llama.android/app/src/main/res/values/strings.xml
+++ b/examples/llama.android/app/src/main/res/values/strings.xml
@@ -1,3 +0,0 @@
-<resources>
-    <string name="app_name">LlamaAndroid</string>
-</resources>
--- a/examples/llama.android/app/src/main/res/values/themes.xml
+++ b/examples/llama.android/app/src/main/res/values/themes.xml
@@ -1,5 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?>
-<resources>
-
-    <style name="Theme.LlamaAndroid" parent="android:Theme.Material.Light.NoActionBar" />
-</resources>
--- a/examples/llama.android/app/src/main/res/xml/backup_rules.xml
+++ b/examples/llama.android/app/src/main/res/xml/backup_rules.xml
@@ -1,13 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?><!--
-   Sample backup rules file; uncomment and customize as necessary.
-   See https://developer.android.com/guide/topics/data/autobackup
-   for details.
-   Note: This file is ignored for devices older that API 31
-   See https://developer.android.com/about/versions/12/backup-restore
-->
-<full-backup-content>
-    <!--
-   <include domain="sharedpref" path="."/>
-   <exclude domain="sharedpref" path="device.xml"/>
-->
-</full-backup-content>
--- a/examples/llama.android/app/src/main/res/xml/data_extraction_rules.xml
+++ b/examples/llama.android/app/src/main/res/xml/data_extraction_rules.xml
@@ -1,19 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?><!--
-   Sample data extraction rules file; uncomment and customize as necessary.
-   See https://developer.android.com/about/versions/12/backup-restore#xml-changes
-   for details.
-->
-<data-extraction-rules>
-    <cloud-backup>
-        <!-- TODO: Use <include> and <exclude> to control what is backed up.
-        <include .../>
-        <exclude .../>
-        -->
-    </cloud-backup>
-    <!--
-    <device-transfer>
-        <include .../>
-        <exclude .../>
-    </device-transfer>
-    -->
-</data-extraction-rules>
--- a/examples/llama.android/build.gradle.kts
+++ b/examples/llama.android/build.gradle.kts
@@ -1,5 +0,0 @@
-// Top-level build file where you can add configuration options common to all sub-projects/modules.
-plugins {
-    id("com.android.application") version "8.2.0" apply false
-    id("org.jetbrains.kotlin.android") version "1.9.0" apply false
-}
--- a/examples/llama.android/gradle.properties
+++ b/examples/llama.android/gradle.properties
@@ -1,23 +0,0 @@
-# Project-wide Gradle settings.
-# IDE (e.g. Android Studio) users:
-# Gradle settings configured through the IDE *will override*
-# any settings specified in this file.
-# For more details on how to configure your build environment visit
-# http://www.gradle.org/docs/current/userguide/build_environment.html
-# Specifies the JVM arguments used for the daemon process.
-# The setting is particularly useful for tweaking memory settings.
-org.gradle.jvmargs=-Xmx2048m -Dfile.encoding=UTF-8
-# When configured, Gradle will run in incubating parallel mode.
-# This option should only be used with decoupled projects. More details, visit
-# http://www.gradle.org/docs/current/userguide/multi_project_builds.html#sec:decoupled_projects
-# org.gradle.parallel=true
-# AndroidX package structure to make it clearer which packages are bundled with the
-# Android operating system, and which are packaged with your app's APK
-# https://developer.android.com/topic/libraries/support-library/androidx-rn
-android.useAndroidX=true
-# Kotlin code style for this project: "official" or "obsolete":
-kotlin.code.style=official
-# Enables namespacing of each library's R class so that its R class includes only the
-# resources declared in the library itself and none from the library's dependencies,
-# thereby reducing the size of the R class for that library
-android.nonTransitiveRClass=true
--- a/examples/llama.android/gradle/wrapper/gradle-wrapper.jar
+++ b/examples/llama.android/gradle/wrapper/gradle-wrapper.jar
--- a/examples/llama.android/gradle/wrapper/gradle-wrapper.properties
+++ b/examples/llama.android/gradle/wrapper/gradle-wrapper.properties
@@ -1,6 +0,0 @@
-#Thu Dec 21 14:31:09 AEDT 2023
-distributionBase=GRADLE_USER_HOME
-distributionPath=wrapper/dists
-distributionUrl=https\://services.gradle.org/distributions/gradle-8.2-bin.zip
-zipStoreBase=GRADLE_USER_HOME
-zipStorePath=wrapper/dists
--- a/examples/llama.android/gradlew
+++ b/examples/llama.android/gradlew
@@ -1,185 +0,0 @@
-#!/usr/bin/env sh
-
-#
-# Copyright 2015 the original author or authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      https://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-##############################################################################
-##
-##  Gradle start up script for UN*X
-##
-##############################################################################
-
-# Attempt to set APP_HOME
-# Resolve links: $0 may be a link
-PRG="$0"
-# Need this for relative symlinks.
-while [ -h "$PRG" ] ; do
-    ls=`ls -ld "$PRG"`
-    link=`expr "$ls" : '.*-> \(.*\)$'`
-    if expr "$link" : '/.*' > /dev/null; then
-        PRG="$link"
-    else
-        PRG=`dirname "$PRG"`"/$link"
-    fi
-done
-SAVED="`pwd`"
-cd "`dirname \"$PRG\"`/" >/dev/null
-APP_HOME="`pwd -P`"
-cd "$SAVED" >/dev/null
-
-APP_NAME="Gradle"
-APP_BASE_NAME=`basename "$0"`
-
-# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
-DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"'
-
-# Use the maximum available, or set MAX_FD != -1 to use that value.
-MAX_FD="maximum"
-
-warn () {
-    echo "$*"
-}
-
-die () {
-    echo
-    echo "$*"
-    echo
-    exit 1
-}
-
-# OS specific support (must be 'true' or 'false').
-cygwin=false
-msys=false
-darwin=false
-nonstop=false
-case "`uname`" in
-  CYGWIN* )
-    cygwin=true
-    ;;
-  Darwin* )
-    darwin=true
-    ;;
-  MINGW* )
-    msys=true
-    ;;
-  NONSTOP* )
-    nonstop=true
-    ;;
-esac
-
-CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
-
-
-# Determine the Java command to use to start the JVM.
-if [ -n "$JAVA_HOME" ] ; then
-    if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
-        # IBM's JDK on AIX uses strange locations for the executables
-        JAVACMD="$JAVA_HOME/jre/sh/java"
-    else
-        JAVACMD="$JAVA_HOME/bin/java"
-    fi
-    if [ ! -x "$JAVACMD" ] ; then
-        die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME
-
-Please set the JAVA_HOME variable in your environment to match the
-location of your Java installation."
-    fi
-else
-    JAVACMD="java"
-    which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
-
-Please set the JAVA_HOME variable in your environment to match the
-location of your Java installation."
-fi
-
-# Increase the maximum file descriptors if we can.
-if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then
-    MAX_FD_LIMIT=`ulimit -H -n`
-    if [ $? -eq 0 ] ; then
-        if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
-            MAX_FD="$MAX_FD_LIMIT"
-        fi
-        ulimit -n $MAX_FD
-        if [ $? -ne 0 ] ; then
-            warn "Could not set maximum file descriptor limit: $MAX_FD"
-        fi
-    else
-        warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT"
-    fi
-fi
-
-# For Darwin, add options to specify how the application appears in the dock
-if $darwin; then
-    GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\""
-fi
-
-# For Cygwin or MSYS, switch paths to Windows format before running java
-if [ "$cygwin" = "true" -o "$msys" = "true" ] ; then
-    APP_HOME=`cygpath --path --mixed "$APP_HOME"`
-    CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
-
-    JAVACMD=`cygpath --unix "$JAVACMD"`
-
-    # We build the pattern for arguments to be converted via cygpath
-    ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
-    SEP=""
-    for dir in $ROOTDIRSRAW ; do
-        ROOTDIRS="$ROOTDIRS$SEP$dir"
-        SEP="|"
-    done
-    OURCYGPATTERN="(^($ROOTDIRS))"
-    # Add a user-defined pattern to the cygpath arguments
-    if [ "$GRADLE_CYGPATTERN" != "" ] ; then
-        OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)"
-    fi
-    # Now convert the arguments - kludge to limit ourselves to /bin/sh
-    i=0
-    for arg in "$@" ; do
-        CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -`
-        CHECK2=`echo "$arg"|egrep -c "^-"`                                 ### Determine if an option
-
-        if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then                    ### Added a condition
-            eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"`
-        else
-            eval `echo args$i`="\"$arg\""
-        fi
-        i=`expr $i + 1`
-    done
-    case $i in
-        0) set -- ;;
-        1) set -- "$args0" ;;
-        2) set -- "$args0" "$args1" ;;
-        3) set -- "$args0" "$args1" "$args2" ;;
-        4) set -- "$args0" "$args1" "$args2" "$args3" ;;
-        5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;;
-        6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;;
-        7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;;
-        8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;;
-        9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;;
-    esac
-fi
-
-# Escape application args
-save () {
-    for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done
-    echo " "
-}
-APP_ARGS=`save "$@"`
-
-# Collect all arguments for the java command, following the shell quoting and substitution rules
-eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS"
-
-exec "$JAVACMD" "$@"
--- a/examples/llama.android/settings.gradle.kts
+++ b/examples/llama.android/settings.gradle.kts
@@ -1,17 +0,0 @@
-pluginManagement {
-    repositories {
-        google()
-        mavenCentral()
-        gradlePluginPortal()
-    }
-}
-dependencyResolutionManagement {
-    repositoriesMode.set(RepositoriesMode.FAIL_ON_PROJECT_REPOS)
-    repositories {
-        google()
-        mavenCentral()
-    }
-}
-
-rootProject.name = "LlamaAndroid"
-include(":app")
--- a/examples/llama.swiftui/.gitignore
+++ b/examples/llama.swiftui/.gitignore
@@ -1,2 +1 @@
 xcuserdata
-xcshareddata
--- a/examples/llama.swiftui/README.md
+++ b/examples/llama.swiftui/README.md
@@ -1,12 +1,7 @@
-# llama.cpp/examples/llama.swiftui
+# llama.swiftui

-Local inference of llama.cpp on an iPhone. This is a sample app that can be used as a starting
-point for more advanced projects.
-
-For usage instructions and performance stats, check the following discussion: https://github.com/ggerganov/llama.cpp/discussions/4508
-
-![image](https://github.com/ggerganov/llama.cpp/assets/1991296/2b40284f-8421-47a2-b634-74eece09a299)
-
-Video demonstration:
+Local inference of llama.cpp on an iPhone.
+So far I only tested with starcoder 1B model, but it can most likely handle 7B models as well.

 https://github.com/bachittle/llama.cpp/assets/39804642/e290827a-4edb-4093-9642-2a5e399ec545
+
--- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
+++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
@@ -1,38 +1,19 @@
 import Foundation
-import llama
+
+// import llama

 enum LlamaError: Error {
    case couldNotInitializeContext
 }

-func llama_batch_clear(_ batch: inout llama_batch) {
-    batch.n_tokens = 0
-}
-
-func llama_batch_add(_ batch: inout llama_batch, _ id: llama_token, _ pos: llama_pos, _ seq_ids: [llama_seq_id], _ logits: Bool) {
-    batch.token   [Int(batch.n_tokens)] = id
-    batch.pos     [Int(batch.n_tokens)] = pos
-    batch.n_seq_id[Int(batch.n_tokens)] = Int32(seq_ids.count)
-    for i in 0..<seq_ids.count {
-        batch.seq_id[Int(batch.n_tokens)]![Int(i)] = seq_ids[i]
-    }
-    batch.logits  [Int(batch.n_tokens)] = logits ? 1 : 0
-
-    batch.n_tokens += 1
-}
-
 actor LlamaContext {
    private var model: OpaquePointer
    private var context: OpaquePointer
    private var batch: llama_batch
    private var tokens_list: [llama_token]

-    /// This variable is used to store temporarily invalid cchars
-    private var temporary_invalid_cchars: [CChar]
-
-    var n_len: Int32 = 64
+    var n_len: Int32 = 512
    var n_cur: Int32 = 0
-
    var n_decode: Int32 = 0

    init(model: OpaquePointer, context: OpaquePointer) {
@@ -40,38 +21,28 @@ actor LlamaContext {
        self.context = context
        self.tokens_list = []
        self.batch = llama_batch_init(512, 0, 1)
-        self.temporary_invalid_cchars = []
    }

    deinit {
-        llama_batch_free(batch)
        llama_free(context)
        llama_free_model(model)
        llama_backend_free()
    }

-    static func create_context(path: String) throws -> LlamaContext {
+    static func createContext(path: String) throws -> LlamaContext {
        llama_backend_init(false)
-        var model_params = llama_model_default_params()
+        let model_params = llama_model_default_params()

-#if targetEnvironment(simulator)
-        model_params.n_gpu_layers = 0
-        print("Running on simulator, force use n_gpu_layers = 0")
-#endif
        let model = llama_load_model_from_file(path, model_params)
        guard let model else {
            print("Could not load model at \(path)")
            throw LlamaError.couldNotInitializeContext
        }
-
-        let n_threads = max(1, min(8, ProcessInfo.processInfo.processorCount - 2))
-        print("Using \(n_threads) threads")
-
        var ctx_params = llama_context_default_params()
-        ctx_params.seed  = 1234
+        ctx_params.seed = 1234
        ctx_params.n_ctx = 2048
-        ctx_params.n_threads       = UInt32(n_threads)
-        ctx_params.n_threads_batch = UInt32(n_threads)
+        ctx_params.n_threads = 8
+        ctx_params.n_threads_batch = 8

        let context = llama_new_context_with_model(model, ctx_params)
        guard let context else {
@@ -82,26 +53,6 @@ actor LlamaContext {
        return LlamaContext(model: model, context: context)
    }

-    func model_info() -> String {
-        let result = UnsafeMutablePointer<Int8>.allocate(capacity: 256)
-        result.initialize(repeating: Int8(0), count: 256)
-        defer {
-            result.deallocate()
-        }
-
-        // TODO: this is probably very stupid way to get the string from C
-
-        let nChars = llama_model_desc(model, result, 256)
-        let bufferPointer = UnsafeBufferPointer(start: result, count: Int(nChars))
-
-        var SwiftString = ""
-        for char in bufferPointer {
-            SwiftString.append(Character(UnicodeScalar(UInt8(char))))
-        }
-
-        return SwiftString
-    }
-
    func get_n_tokens() -> Int32 {
        return batch.n_tokens;
    }
@@ -110,7 +61,6 @@ actor LlamaContext {
        print("attempting to complete \"\(text)\"")

        tokens_list = tokenize(text: text, add_bos: true)
-        temporary_invalid_cchars = []

        let n_ctx = llama_n_ctx(context)
        let n_kv_req = tokens_list.count + (Int(n_len) - tokens_list.count)
@@ -122,14 +72,19 @@ actor LlamaContext {
        }

        for id in tokens_list {
-            print(String(cString: token_to_piece(token: id) + [0]))
+            print(token_to_piece(token: id))
        }

-        llama_batch_clear(&batch)
+        // batch = llama_batch_init(512, 0) // done in init()
+        batch.n_tokens = Int32(tokens_list.count)

-        for i1 in 0..<tokens_list.count {
+        for i1 in 0..<batch.n_tokens {
            let i = Int(i1)
-            llama_batch_add(&batch, tokens_list[i], Int32(i), [0], false)
+            batch.token[i] = tokens_list[i]
+            batch.pos[i] = i1
+            batch.n_seq_id[Int(i)] = 1
+            batch.seq_id[Int(i)]![0] = 0
+            batch.logits[i] = 0
        }
        batch.logits[Int(batch.n_tokens) - 1] = 1 // true

@@ -158,35 +113,27 @@ actor LlamaContext {
            new_token_id = llama_sample_token_greedy(context, &candidates_p)
        }

-        if new_token_id == llama_token_eos(model) || n_cur == n_len {
+        if new_token_id == llama_token_eos(context) || n_cur == n_len {
            print("\n")
-            let new_token_str = String(cString: temporary_invalid_cchars + [0])
-            temporary_invalid_cchars.removeAll()
-            return new_token_str
+            return ""
        }

-        let new_token_cchars = token_to_piece(token: new_token_id)
-        temporary_invalid_cchars.append(contentsOf: new_token_cchars)
-        let new_token_str: String
-        if let string = String(validatingUTF8: temporary_invalid_cchars + [0]) {
-            temporary_invalid_cchars.removeAll()
-            new_token_str = string
-        } else if (0 ..< temporary_invalid_cchars.count).contains(where: {$0 != 0 && String(validatingUTF8: Array(temporary_invalid_cchars.suffix($0)) + [0]) != nil}) {
-            // in this case, at least the suffix of the temporary_invalid_cchars can be interpreted as UTF8 string
-            let string = String(cString: temporary_invalid_cchars + [0])
-            temporary_invalid_cchars.removeAll()
-            new_token_str = string
-        } else {
-            new_token_str = ""
-        }
+        let new_token_str = token_to_piece(token: new_token_id)
        print(new_token_str)
        // tokens_list.append(new_token_id)

-        llama_batch_clear(&batch)
-        llama_batch_add(&batch, new_token_id, n_cur, [0], true)
+        batch.n_tokens = 0
+
+        batch.token[Int(batch.n_tokens)] = new_token_id
+        batch.pos[Int(batch.n_tokens)] = n_cur
+        batch.n_seq_id[Int(batch.n_tokens)] = 1
+        batch.seq_id[Int(batch.n_tokens)]![0] = 0
+        batch.logits[Int(batch.n_tokens)] = 1 // true
+        batch.n_tokens += 1

        n_decode += 1
-        n_cur    += 1
+
+        n_cur += 1

        if llama_decode(context, batch) != 0 {
            print("failed to evaluate llama!")
@@ -195,113 +142,14 @@ actor LlamaContext {
        return new_token_str
    }

-    func bench(pp: Int, tg: Int, pl: Int, nr: Int = 1) -> String {
-        var pp_avg: Double = 0
-        var tg_avg: Double = 0
-
-        var pp_std: Double = 0
-        var tg_std: Double = 0
-
-        for _ in 0..<nr {
-            // bench prompt processing
-
-            llama_batch_clear(&batch)
-
-            let n_tokens = pp
-
-            for i in 0..<n_tokens {
-                llama_batch_add(&batch, 0, Int32(i), [0], false)
-            }
-            batch.logits[Int(batch.n_tokens) - 1] = 1 // true
-
-            llama_kv_cache_clear(context)
-
-            let t_pp_start = ggml_time_us()
-
-            if llama_decode(context, batch) != 0 {
-                print("llama_decode() failed during prompt")
-            }
-
-            let t_pp_end = ggml_time_us()
-
-            // bench text generation
-
-            llama_kv_cache_clear(context)
-
-            let t_tg_start = ggml_time_us()
-
-            for i in 0..<tg {
-                llama_batch_clear(&batch)
-
-                for j in 0..<pl {
-                    llama_batch_add(&batch, 0, Int32(i), [Int32(j)], true)
-                }
-
-                if llama_decode(context, batch) != 0 {
-                    print("llama_decode() failed during text generation")
-                }
-            }
-
-            let t_tg_end = ggml_time_us()
-
-            llama_kv_cache_clear(context)
-
-            let t_pp = Double(t_pp_end - t_pp_start) / 1000000.0
-            let t_tg = Double(t_tg_end - t_tg_start) / 1000000.0
-
-            let speed_pp = Double(pp)    / t_pp
-            let speed_tg = Double(pl*tg) / t_tg
-
-            pp_avg += speed_pp
-            tg_avg += speed_tg
-
-            pp_std += speed_pp * speed_pp
-            tg_std += speed_tg * speed_tg
-
-            print("pp \(speed_pp) t/s, tg \(speed_tg) t/s")
-        }
-
-        pp_avg /= Double(nr)
-        tg_avg /= Double(nr)
-
-        if nr > 1 {
-            pp_std = sqrt(pp_std / Double(nr - 1) - pp_avg * pp_avg * Double(nr) / Double(nr - 1))
-            tg_std = sqrt(tg_std / Double(nr - 1) - tg_avg * tg_avg * Double(nr) / Double(nr - 1))
-        } else {
-            pp_std = 0
-            tg_std = 0
-        }
-
-        let model_desc     = model_info();
-        let model_size     = String(format: "%.2f GiB", Double(llama_model_size(model)) / 1024.0 / 1024.0 / 1024.0);
-        let model_n_params = String(format: "%.2f B", Double(llama_model_n_params(model)) / 1e9);
-        let backend        = "Metal";
-        let pp_avg_str     = String(format: "%.2f", pp_avg);
-        let tg_avg_str     = String(format: "%.2f", tg_avg);
-        let pp_std_str     = String(format: "%.2f", pp_std);
-        let tg_std_str     = String(format: "%.2f", tg_std);
-
-        var result = ""
-
-        result += String("| model | size | params | backend | test | t/s |\n")
-        result += String("| --- | --- | --- | --- | --- | --- |\n")
-        result += String("| \(model_desc) | \(model_size) | \(model_n_params) | \(backend) | pp \(pp) | \(pp_avg_str) ± \(pp_std_str) |\n")
-        result += String("| \(model_desc) | \(model_size) | \(model_n_params) | \(backend) | tg \(tg) | \(tg_avg_str) ± \(tg_std_str) |\n")
-
-        return result;
-    }
-
    func clear() {
        tokens_list.removeAll()
-        temporary_invalid_cchars.removeAll()
-        llama_kv_cache_clear(context)
    }

    private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
-        let utf8Count = text.utf8.count
-        let n_tokens = utf8Count + (add_bos ? 1 : 0) + 1
+        let n_tokens = text.count + (add_bos ? 1 : 0)
        let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
-        let tokenCount = llama_tokenize(model, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, false)
+        let tokenCount = llama_tokenize(model, text, Int32(text.count), tokens, Int32(n_tokens), add_bos, false)

        var swiftTokens: [llama_token] = []
        for i in 0..<tokenCount {
@@ -313,27 +161,16 @@ actor LlamaContext {
        return swiftTokens
    }

-    /// - note: The result does not contain null-terminator
-    private func token_to_piece(token: llama_token) -> [CChar] {
+    private func token_to_piece(token: llama_token) -> String {
        let result = UnsafeMutablePointer<Int8>.allocate(capacity: 8)
        result.initialize(repeating: Int8(0), count: 8)
-        defer {
-            result.deallocate()
-        }
-        let nTokens = llama_token_to_piece(model, token, result, 8)

-        if nTokens < 0 {
-            let newResult = UnsafeMutablePointer<Int8>.allocate(capacity: Int(-nTokens))
-            newResult.initialize(repeating: Int8(0), count: Int(-nTokens))
-            defer {
-                newResult.deallocate()
-            }
-            let nNewTokens = llama_token_to_piece(model, token, newResult, -nTokens)
-            let bufferPointer = UnsafeBufferPointer(start: newResult, count: Int(nNewTokens))
-            return Array(bufferPointer)
-        } else {
-            let bufferPointer = UnsafeBufferPointer(start: result, count: Int(nTokens))
-            return Array(bufferPointer)
-        }
+        let _ = llama_token_to_piece(model, token, result, 8)
+
+        let resultStr = String(cString: result)
+
+        result.deallocate()
+
+        return resultStr
    }
 }
--- a/examples/llama.swiftui/llama.cpp.swift/bridging-header.h
+++ b/examples/llama.swiftui/llama.cpp.swift/bridging-header.h
@@ -0,0 +1,5 @@
+//
+//  Use this file to import your target's public headers that you would like to expose to Swift.
+//
+
+#import "llama.h"
--- a/Show More
+++ b/Show More