fix issues

Merge remote-tracking branch 'origin/master' into fix-tensor-split-zero
cuda : fix disabling device with --tensor-split 1,0
2026-02-05 13:53:23 +02:00 · 2023-11-05 13:20:22 +01:00 · 2023-11-05 12:42:43 +01:00 · 2023-11-05 00:56:32 -04:00
189 changed files with 15263 additions and 32256 deletions
--- a/.devops/full-cuda.Dockerfile
+++ b/.devops/full-cuda.Dockerfile
@@ -14,8 +14,7 @@ ARG CUDA_DOCKER_ARCH=all
 RUN apt-get update && \
    apt-get install -y build-essential python3 python3-pip git

-COPY requirements.txt   requirements.txt
-COPY requirements       requirements
+COPY requirements.txt requirements.txt

 RUN pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt
--- a/.devops/full-rocm.Dockerfile
+++ b/.devops/full-rocm.Dockerfile
@@ -23,8 +23,7 @@ ARG ROCM_DOCKER_ARCH=\
    gfx1101 \
    gfx1102

-COPY requirements.txt   requirements.txt
-COPY requirements       requirements
+COPY requirements.txt requirements.txt

 RUN pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt
--- a/.devops/full.Dockerfile
+++ b/.devops/full.Dockerfile
@@ -5,8 +5,7 @@ FROM ubuntu:$UBUNTU_VERSION as build
 RUN apt-get update && \
    apt-get install -y build-essential python3 python3-pip git

-COPY requirements.txt   requirements.txt
-COPY requirements       requirements
+COPY requirements.txt requirements.txt

 RUN pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt
--- a/.devops/main-rocm.Dockerfile
+++ b/.devops/main-rocm.Dockerfile
@@ -23,8 +23,7 @@ ARG ROCM_DOCKER_ARCH=\
    gfx1101 \
    gfx1102

-COPY requirements.txt   requirements.txt
-COPY requirements       requirements
+COPY requirements.txt requirements.txt

 RUN pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt
--- a/.devops/nix/apps.nix
+++ b/.devops/nix/apps.nix
@@ -1,22 +0,0 @@
-{
-  perSystem =
-    { config, lib, ... }:
-    {
-      apps =
-        let
-          inherit (config.packages) default;
-          binaries = [
-            "llama"
-            "llama-embedding"
-            "llama-server"
-            "quantize"
-            "train-text-from-scratch"
-          ];
-          mkApp = name: {
-            type = "app";
-            program = "${default}/bin/${name}";
-          };
-        in
-        lib.genAttrs binaries mkApp;
-    };
-}
--- a/.devops/nix/devshells.nix
+++ b/.devops/nix/devshells.nix
@@ -1,13 +0,0 @@
-{
-  perSystem =
-    { config, lib, ... }:
-    {
-      devShells =
-        lib.concatMapAttrs
-          (name: package: {
-            ${name} = package.passthru.shell;
-            ${name + "-extra"} = package.passthru.shell-extra;
-          })
-          config.packages;
-    };
-}
--- a/.devops/nix/jetson-support.nix
+++ b/.devops/nix/jetson-support.nix
@@ -1,39 +0,0 @@
-{ inputs, ... }:
-{
-  perSystem =
-    {
-      config,
-      system,
-      lib,
-      pkgsCuda,
-      ...
-    }:
-    {
-      legacyPackages =
-        let
-          caps.llamaPackagesXavier = "7.2";
-          caps.llamaPackagesOrin = "8.7";
-          caps.llamaPackagesTX2 = "6.2";
-          caps.llamaPackagesNano = "5.3";
-
-          pkgsFor =
-            cap:
-            import inputs.nixpkgs {
-              inherit system;
-              config = {
-                cudaSupport = true;
-                cudaCapabilities = [ cap ];
-                cudaEnableForwardCompat = false;
-                inherit (pkgsCuda.config) allowUnfreePredicate;
-              };
-            };
-        in
-        builtins.mapAttrs (name: cap: (pkgsFor cap).callPackage ./scope.nix { }) caps;
-
-      packages = lib.optionalAttrs (system == "aarch64-linux") {
-        jetson-xavier = config.legacyPackages.llamaPackagesXavier.llama-cpp;
-        jetson-orin = config.legacyPackages.llamaPackagesOrin.llama-cpp;
-        jetson-nano = config.legacyPackages.llamaPackagesNano.llama-cpp;
-      };
-    };
-}
--- a/.devops/nix/nixpkgs-instances.nix
+++ b/.devops/nix/nixpkgs-instances.nix
@@ -1,35 +0,0 @@
-{ inputs, ... }:
-{
-  # The _module.args definitions are passed on to modules as arguments. E.g.
-  # the module `{ pkgs ... }: { /* config */ }` implicitly uses
-  # `_module.args.pkgs` (defined in this case by flake-parts).
-  perSystem =
-    { system, ... }:
-    {
-      _module.args = {
-        pkgsCuda = import inputs.nixpkgs {
-          inherit system;
-          # Ensure dependencies use CUDA consistently (e.g. that openmpi, ucc,
-          # and ucx are built with CUDA support)
-          config.cudaSupport = true;
-          config.allowUnfreePredicate =
-            p:
-            builtins.all
-              (
-                license:
-                license.free
-                || builtins.elem license.shortName [
-                  "CUDA EULA"
-                  "cuDNN EULA"
-                ]
-              )
-              (p.meta.licenses or [ p.meta.license ]);
-        };
-        # Ensure dependencies use ROCm consistently
-        pkgsRocm = import inputs.nixpkgs {
-          inherit system;
-          config.rocmSupport = true;
-        };
-      };
-    };
-}
--- a/.devops/nix/package.nix
+++ b/.devops/nix/package.nix
@@ -1,265 +0,0 @@
-{
-  lib,
-  config,
-  stdenv,
-  mkShell,
-  cmake,
-  ninja,
-  pkg-config,
-  git,
-  python3,
-  mpi,
-  openblas, # TODO: Use the generic `blas` so users could switch between alternative implementations
-  cudaPackages,
-  darwin,
-  rocmPackages,
-  clblast,
-  useBlas ? builtins.all (x: !x) [
-    useCuda
-    useMetalKit
-    useOpenCL
-    useRocm
-  ],
-  useCuda ? config.cudaSupport,
-  useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin && !useOpenCL,
-  useMpi ? false, # Increases the runtime closure size by ~700M
-  useOpenCL ? false,
-  useRocm ? config.rocmSupport,
-  llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake
-}@inputs:
-
-let
-  inherit (lib)
-    cmakeBool
-    cmakeFeature
-    optionals
-    strings
-    versionOlder
-    ;
-
-  # It's necessary to consistently use backendStdenv when building with CUDA support,
-  # otherwise we get libstdc++ errors downstream.
-  stdenv = throw "Use effectiveStdenv instead";
-  effectiveStdenv = if useCuda then cudaPackages.backendStdenv else inputs.stdenv;
-
-  suffices =
-    lib.optionals useBlas [ "BLAS" ]
-    ++ lib.optionals useCuda [ "CUDA" ]
-    ++ lib.optionals useMetalKit [ "MetalKit" ]
-    ++ lib.optionals useMpi [ "MPI" ]
-    ++ lib.optionals useOpenCL [ "OpenCL" ]
-    ++ lib.optionals useRocm [ "ROCm" ];
-
-  pnameSuffix =
-    strings.optionalString (suffices != [ ])
-      "-${strings.concatMapStringsSep "-" strings.toLower suffices}";
-  descriptionSuffix =
-    strings.optionalString (suffices != [ ])
-      ", accelerated with ${strings.concatStringsSep ", " suffices}";
-
-  # TODO: package the Python in this repository in a Nix-like way.
-  # It'd be nice to migrate to buildPythonPackage, as well as ensure this repo
-  # is PEP 517-compatible, and ensure the correct .dist-info is generated.
-  # https://peps.python.org/pep-0517/
-  llama-python = python3.withPackages (
-    ps: [
-      ps.numpy
-      ps.sentencepiece
-    ]
-  );
-
-  # TODO(Green-Sky): find a better way to opt-into the heavy ml python runtime
-  llama-python-extra = python3.withPackages (
-    ps: [
-      ps.numpy
-      ps.sentencepiece
-      ps.torchWithoutCuda
-      ps.transformers
-    ]
-  );
-
-  # apple_sdk is supposed to choose sane defaults, no need to handle isAarch64
-  # separately
-  darwinBuildInputs =
-    with darwin.apple_sdk.frameworks;
-    [
-      Accelerate
-      CoreVideo
-      CoreGraphics
-    ]
-    ++ optionals useMetalKit [ MetalKit ];
-
-  cudaBuildInputs = with cudaPackages; [
-    cuda_cccl.dev # <nv/target>
-
-    # A temporary hack for reducing the closure size, remove once cudaPackages
-    # have stopped using lndir: https://github.com/NixOS/nixpkgs/issues/271792
-    cuda_cudart.dev
-    cuda_cudart.lib
-    cuda_cudart.static
-    libcublas.dev
-    libcublas.lib
-    libcublas.static
-  ];
-
-  rocmBuildInputs = with rocmPackages; [
-    clr
-    hipblas
-    rocblas
-  ];
-in
-
-effectiveStdenv.mkDerivation (
-  finalAttrs: {
-    pname = "llama-cpp${pnameSuffix}";
-    version = llamaVersion;
-
-    src = lib.cleanSourceWith {
-      filter =
-        name: type:
-        !(builtins.any (_: _) [
-          (lib.hasSuffix ".nix" name) # Ignore *.nix files when computing outPaths
-          (name == "README.md") # Ignore *.md changes whe computing outPaths
-          (lib.hasPrefix "." name) # Skip hidden files and directories
-        ]);
-      src = lib.cleanSource ../../.;
-    };
-
-    postPatch = ''
-      substituteInPlace ./ggml-metal.m \
-        --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
-
-      # TODO: Package up each Python script or service appropriately.
-      # If we were to migrate to buildPythonPackage and prepare the `pyproject.toml`,
-      # we could make those *.py into setuptools' entrypoints
-      substituteInPlace ./*.py --replace "/usr/bin/env python" "${llama-python}/bin/python"
-    '';
-
-    nativeBuildInputs =
-      [
-        cmake
-        ninja
-        pkg-config
-        git
-      ]
-      ++ optionals useCuda [
-        cudaPackages.cuda_nvcc
-
-        # TODO: Replace with autoAddDriverRunpath
-        # once https://github.com/NixOS/nixpkgs/pull/275241 has been merged
-        cudaPackages.autoAddOpenGLRunpathHook
-      ];
-
-    buildInputs =
-      optionals effectiveStdenv.isDarwin darwinBuildInputs
-      ++ optionals useCuda cudaBuildInputs
-      ++ optionals useMpi [ mpi ]
-      ++ optionals useOpenCL [ clblast ]
-      ++ optionals useRocm rocmBuildInputs;
-
-    cmakeFlags =
-      [
-        (cmakeBool "LLAMA_NATIVE" true)
-        (cmakeBool "LLAMA_BUILD_SERVER" true)
-        (cmakeBool "BUILD_SHARED_LIBS" true)
-        (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
-        (cmakeBool "LLAMA_BLAS" useBlas)
-        (cmakeBool "LLAMA_CLBLAST" useOpenCL)
-        (cmakeBool "LLAMA_CUBLAS" useCuda)
-        (cmakeBool "LLAMA_HIPBLAS" useRocm)
-        (cmakeBool "LLAMA_METAL" useMetalKit)
-        (cmakeBool "LLAMA_MPI" useMpi)
-      ]
-      ++ optionals useCuda [
-        (
-          with cudaPackages.flags;
-          cmakeFeature "CMAKE_CUDA_ARCHITECTURES" (
-            builtins.concatStringsSep ";" (map dropDot cudaCapabilities)
-          )
-        )
-      ]
-      ++ optionals useRocm [
-        (cmakeFeature "CMAKE_C_COMPILER" "hipcc")
-        (cmakeFeature "CMAKE_CXX_COMPILER" "hipcc")
-
-        # Build all targets supported by rocBLAS. When updating search for TARGET_LIST_ROCM
-        # in https://github.com/ROCmSoftwarePlatform/rocBLAS/blob/develop/CMakeLists.txt
-        # and select the line that matches the current nixpkgs version of rocBLAS.
-        # Should likely use `rocmPackages.clr.gpuTargets`.
-        "-DAMDGPU_TARGETS=gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102"
-      ]
-      ++ optionals useMetalKit [ (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1") ]
-      ++ optionals useBlas [ (lib.cmakeFeature "LLAMA_BLAS_VENDOR" "OpenBLAS") ];
-
-    # TODO(SomeoneSerge): It's better to add proper install targets at the CMake level,
-    # if they haven't been added yet.
-    postInstall = ''
-      mv $out/bin/main $out/bin/llama
-      mv $out/bin/server $out/bin/llama-server
-      mkdir -p $out/include
-      cp $src/llama.h $out/include/
-    '';
-
-    # Define the shells here, but don't add in the inputsFrom to avoid recursion.
-    passthru = {
-      inherit
-        useBlas
-        useCuda
-        useMetalKit
-        useMpi
-        useOpenCL
-        useRocm
-        ;
-
-      shell = mkShell {
-        name = "shell-${finalAttrs.finalPackage.name}";
-        description = "contains numpy and sentencepiece";
-        buildInputs = [ llama-python ];
-        inputsFrom = [ finalAttrs.finalPackage ];
-      };
-
-      shell-extra = mkShell {
-        name = "shell-extra-${finalAttrs.finalPackage.name}";
-        description = "contains numpy, sentencepiece, torchWithoutCuda, and transformers";
-        buildInputs = [ llama-python-extra ];
-        inputsFrom = [ finalAttrs.finalPackage ];
-      };
-    };
-
-    meta = {
-      # Configurations we don't want even the CI to evaluate. Results in the
-      # "unsupported platform" messages. This is mostly a no-op, because
-      # cudaPackages would've refused to evaluate anyway.
-      badPlatforms = optionals (useCuda || useOpenCL) lib.platforms.darwin;
-
-      # Configurations that are known to result in build failures. Can be
-      # overridden by importing Nixpkgs with `allowBroken = true`.
-      broken = (useMetalKit && !effectiveStdenv.isDarwin);
-
-      description = "Inference of LLaMA model in pure C/C++${descriptionSuffix}";
-      homepage = "https://github.com/ggerganov/llama.cpp/";
-      license = lib.licenses.mit;
-
-      # Accommodates `nix run` and `lib.getExe`
-      mainProgram = "llama";
-
-      # These people might respond, on the best effort basis, if you ping them
-      # in case of Nix-specific regressions or for reviewing Nix-specific PRs.
-      # Consider adding yourself to this list if you want to ensure this flake
-      # stays maintained and you're willing to invest your time. Do not add
-      # other people without their consent. Consider removing people after
-      # they've been unreachable for long periods of time.
-
-      # Note that lib.maintainers is defined in Nixpkgs, but you may just add
-      # an attrset following the same format as in
-      # https://github.com/NixOS/nixpkgs/blob/f36a80e54da29775c78d7eff0e628c2b4e34d1d7/maintainers/maintainer-list.nix
-      maintainers = with lib.maintainers; [
-        philiptaron
-        SomeoneSerge
-      ];
-
-      # Extend `badPlatforms` instead
-      platforms = lib.platforms.all;
-    };
-  }
-)
--- a/.devops/nix/scope.nix
+++ b/.devops/nix/scope.nix
@@ -1,12 +0,0 @@
-{
-  lib,
-  newScope,
-  llamaVersion ? "0.0.0",
-}:
-
-lib.makeScope newScope (
-  self: {
-    inherit llamaVersion;
-    llama-cpp = self.callPackage ./package.nix { };
-  }
-)
--- a/.devops/tools.sh
+++ b/.devops/tools.sh
@@ -13,8 +13,6 @@ elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
    ./quantize "$@"
 elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
    ./main "$@"
-elif [[ "$arg1" == '--finetune' || "$arg1" == '-f' ]]; then
-    ./finetune "$@"
 elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
    echo "Converting PTH to GGML..."
    for i in `ls $1/$2/ggml-model-f16.bin*`; do
@@ -36,8 +34,6 @@ else
    echo "              ex: --outtype f16 \"/models/7B/\" "
    echo "  --quantize (-q): Optimize with quantization process ggml"
    echo "              ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2"
-    echo "  --finetune (-f): Run finetune command to create a lora finetune of the model"
-    echo "              See documentation for finetune for command-line parameters"
    echo "  --all-in-one (-a): Execute --convert & --quantize"
    echo "              ex: \"/models/\" 7B"
    echo "  --server (-s): Run a model on the server"
--- a/.editorconfig
+++ b/.editorconfig
@@ -15,14 +15,8 @@ indent_size = 4
 [Makefile]
 indent_style = tab

-[scripts/*.mk]
-indent_style = tab
-
 [prompts/*.txt]
 insert_final_newline = unset

 [examples/server/public/*]
 indent_size = 2
-
-[examples/llama.swiftui/llama.swiftui.xcodeproj/*]
-indent_style = tab
--- a/.github/ISSUE_TEMPLATE/bug.md
+++ b/.github/ISSUE_TEMPLATE/bug.md
@@ -6,4 +6,179 @@ assignees: ''

 ---

-Please include information about your system, the steps to reproduce the bug, and the version of llama.cpp that you are using. If possible, please provide a minimal code example that reproduces the bug.
+# Prerequisites
+
+Please answer the following questions for yourself before submitting an issue.
+
+- [ ] I am running the latest code. Development is very rapid so there are no tagged versions as of now.
+- [ ] I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md).
+- [ ] I [searched using keywords relevant to my issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/filtering-and-searching-issues-and-pull-requests) to make sure that I am creating a new issue that is not already open (or closed).
+- [ ] I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new bug or useful enhancement to share.
+
+# Expected Behavior
+
+Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do.
+
+# Current Behavior
+
+Please provide a detailed written description of what `llama.cpp` did, instead.
+
+# Environment and Context
+
+Please provide detailed information about your computer setup. This is important in case the issue is not reproducible except for under certain specific conditions.
+
+* Physical (or virtual) hardware you are using, e.g. for Linux:
+
+`$ lscpu`
+
+* Operating System, e.g. for Linux:
+
+`$ uname -a`
+
+* SDK version, e.g. for Linux:
+
+```
+$ python3 --version
+$ make --version
+$ g++ --version
+```
+
+# Failure Information (for bugs)
+
+Please help provide information about the failure / bug.
+
+# Steps to Reproduce
+
+Please provide detailed steps for reproducing the issue. We are not sitting in front of your screen, so the more detail the better.
+
+1. step 1
+2. step 2
+3. step 3
+4. etc.
+
+# Failure Logs
+
+Please include any relevant log snippets or files. If it works under one configuration but not under another, please provide logs for both configurations and their corresponding outputs so it is easy to see where behavior changes.
+
+Also, please try to **avoid using screenshots** if at all possible. Instead, copy/paste the console output and use [Github's markdown](https://docs.github.com/en/get-started/writing-on-github/getting-started-with-writing-and-formatting-on-github/basic-writing-and-formatting-syntax) to cleanly format your logs for easy readability.
+
+Example environment info:
+```
+llama.cpp$ git log | head -1
+commit 2af23d30434a677c6416812eea52ccc0af65119c
+
+llama.cpp$ lscpu | egrep "AMD|Flags"
+Vendor ID:                       AuthenticAMD
+Model name:                      AMD Ryzen Threadripper 1950X 16-Core Processor
+Flags:                           fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid amd_dcm aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb hw_pstate ssbd ibpb vmmcall fsgsbase bmi1 avx2 smep bmi2 rdseed adx smap clflushopt sha_ni xsaveopt xsavec xgetbv1 xsaves clzero irperf xsaveerptr arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif overflow_recov succor smca sme sev
+Virtualization:                  AMD-V
+
+llama.cpp$ python3 --version
+Python 3.10.9
+
+llama.cpp$ pip list | egrep "torch|numpy|sentencepiece"
+numpy                         1.24.2
+numpydoc                      1.5.0
+sentencepiece                 0.1.97
+torch                         1.13.1
+torchvision                   0.14.1
+
+llama.cpp$ make --version | head -1
+GNU Make 4.3
+
+$ md5sum ./models/65B/ggml-model-q4_0.bin
+dbdd682cce80e2d6e93cefc7449df487  ./models/65B/ggml-model-q4_0.bin
+```
+
+Example run with the Linux command [perf](https://www.brendangregg.com/perf.html)
+```
+llama.cpp$ perf stat ./main -m ./models/65B/ggml-model-q4_0.bin -t 16 -n 1024 -p "Please close your issue when it has been answered."
+main: seed = 1679149377
+llama_model_load: loading model from './models/65B/ggml-model-q4_0.bin' - please wait ...
+llama_model_load: n_vocab = 32000
+llama_model_load: n_ctx   = 512
+llama_model_load: n_embd  = 8192
+llama_model_load: n_mult  = 256
+llama_model_load: n_head  = 64
+llama_model_load: n_layer = 80
+llama_model_load: n_rot   = 128
+llama_model_load: f16     = 2
+llama_model_load: n_ff    = 22016
+llama_model_load: n_parts = 8
+llama_model_load: ggml ctx size = 41477.73 MB
+llama_model_load: memory_size =  2560.00 MB, n_mem = 40960
+llama_model_load: loading model part 1/8 from './models/65B/ggml-model-q4_0.bin'
+llama_model_load: .......................................................................................... done
+llama_model_load: model size =  4869.09 MB / num tensors = 723
+llama_model_load: loading model part 2/8 from './models/65B/ggml-model-q4_0.bin.1'
+llama_model_load: .......................................................................................... done
+llama_model_load: model size =  4869.09 MB / num tensors = 723
+llama_model_load: loading model part 3/8 from './models/65B/ggml-model-q4_0.bin.2'
+llama_model_load: .......................................................................................... done
+llama_model_load: model size =  4869.09 MB / num tensors = 723
+llama_model_load: loading model part 4/8 from './models/65B/ggml-model-q4_0.bin.3'
+llama_model_load: .......................................................................................... done
+llama_model_load: model size =  4869.09 MB / num tensors = 723
+llama_model_load: loading model part 5/8 from './models/65B/ggml-model-q4_0.bin.4'
+llama_model_load: .......................................................................................... done
+llama_model_load: model size =  4869.09 MB / num tensors = 723
+llama_model_load: loading model part 6/8 from './models/65B/ggml-model-q4_0.bin.5'
+llama_model_load: .......................................................................................... done
+llama_model_load: model size =  4869.09 MB / num tensors = 723
+llama_model_load: loading model part 7/8 from './models/65B/ggml-model-q4_0.bin.6'
+llama_model_load: .......................................................................................... done
+llama_model_load: model size =  4869.09 MB / num tensors = 723
+llama_model_load: loading model part 8/8 from './models/65B/ggml-model-q4_0.bin.7'
+llama_model_load: .......................................................................................... done
+llama_model_load: model size =  4869.09 MB / num tensors = 723
+
+system_info: n_threads = 16 / 32 | AVX = 1 | AVX2 = 1 | AVX512 = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | VSX = 0 |
+
+main: prompt: 'Please close your issue when it has been answered.'
+main: number of tokens in prompt = 11
+     1 -> ''
+ 12148 -> 'Please'
+  3802 -> ' close'
+   596 -> ' your'
+  2228 -> ' issue'
+   746 -> ' when'
+   372 -> ' it'
+   756 -> ' has'
+  1063 -> ' been'
+  7699 -> ' answered'
+ 29889 -> '.'
+
+sampling parameters: temp = 0.800000, top_k = 40, top_p = 0.950000, repeat_last_n = 64, repeat_penalty = 1.300000
+
+
+Please close your issue when it has been answered.
+@duncan-donut: I'm trying to figure out what kind of "support" you need for this script and why, exactly? Is there a question about how the code works that hasn't already been addressed in one or more comments below this ticket, or are we talking something else entirely like some sorta bugfixing job because your server setup is different from mine??
+I can understand if your site needs to be running smoothly and you need help with a fix of sorts but there should really be nothing wrong here that the code itself could not handle. And given that I'm getting reports about how it works perfectly well on some other servers, what exactly are we talking? A detailed report will do wonders in helping us get this resolved for ya quickly so please take your time and describe the issue(s) you see as clearly & concisely as possible!!
+@duncan-donut: I'm not sure if you have access to cPanel but you could try these instructions. It is worth a shot! Let me know how it goes (or what error message, exactly!) when/if ya give that code a go? [end of text]
+
+
+main: mem per token = 71159620 bytes
+main:     load time = 19309.95 ms
+main:   sample time =   168.62 ms
+main:  predict time = 223895.61 ms / 888.47 ms per token
+main:    total time = 246406.42 ms
+
+ Performance counter stats for './main -m ./models/65B/ggml-model-q4_0.bin -t 16 -n 1024 -p Please close your issue when it has been answered.':
+
+        3636882.89 msec task-clock                #   14.677 CPUs utilized
+             13509      context-switches          #    3.714 /sec
+              2436      cpu-migrations            #    0.670 /sec
+          10476679      page-faults               #    2.881 K/sec
+    13133115082869      cycles                    #    3.611 GHz                      (16.77%)
+       29314462753      stalled-cycles-frontend   #    0.22% frontend cycles idle     (16.76%)
+    10294402631459      stalled-cycles-backend    #   78.39% backend cycles idle      (16.74%)
+    23479217109614      instructions              #    1.79  insn per cycle
+                                                  #    0.44  stalled cycles per insn  (16.76%)
+     2353072268027      branches                  #  647.002 M/sec                    (16.77%)
+        1998682780      branch-misses             #    0.08% of all branches          (16.76%)
+
+     247.802177522 seconds time elapsed
+
+    3618.573072000 seconds user
+      18.491698000 seconds sys
+```
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -143,9 +143,6 @@ jobs:
          cd build
          ctest --verbose

-  # TODO: build with LLAMA_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know
-  #       how to debug it.
-  #       ref: https://github.com/ggerganov/llama.cpp/actions/runs/7131777249/job/19420981052#step:5:1124
  macOS-latest-make:
    runs-on: macos-latest

@@ -163,18 +160,14 @@ jobs:
      - name: Build
        id: make_build
        run: |
-          LLAMA_NO_METAL=1 make -j $(sysctl -n hw.logicalcpu)
+          make -j $(sysctl -n hw.logicalcpu)

      - name: Test
        id: make_test
        run: |
-          LLAMA_NO_METAL=1 make tests -j $(sysctl -n hw.logicalcpu)
-          LLAMA_NO_METAL=1 make test  -j $(sysctl -n hw.logicalcpu)
+          make tests -j $(sysctl -n hw.logicalcpu)
+          make test -j $(sysctl -n hw.logicalcpu)

-  # TODO: build with LLAMA_METAL=OFF because test-backend-ops fail on "Apple Paravirtual device" and I don't know
-  #       how to debug it.
-  #       ref: https://github.com/ggerganov/llama.cpp/actions/runs/7132125951/job/19422043567?pr=4359#step:5:6584
-  #       would be great if we fix these
  macOS-latest-cmake:
    runs-on: macos-latest

@@ -195,7 +188,7 @@ jobs:
          sysctl -a
          mkdir build
          cd build
-          cmake -DLLAMA_METAL=OFF ..
+          cmake ..
          cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)

      - name: Test
@@ -505,16 +498,6 @@ jobs:
          path: |
            cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip

-  ios-xcode-build:
-    runs-on: macos-latest
-
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v3
-
-      - name: Build Xcode project
-        run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build
-
 #  freeBSD-latest:
 #    runs-on: macos-12
 #    steps:
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -52,36 +52,6 @@ jobs:
          username: ${{ github.repository_owner }}
          password: ${{ secrets.GITHUB_TOKEN }}

-      # https://github.com/jlumbroso/free-disk-space/tree/54081f138730dfa15788a46383842cd2f914a1be#example
-      - name: Free Disk Space (Ubuntu)
-        uses: jlumbroso/free-disk-space@main
-        with:
-          # this might remove tools that are actually needed,
-          # if set to "true" but frees about 6 GB
-          tool-cache: false
-
-          # all of these default to true, but feel free to set to
-          # "false" if necessary for your workflow
-          android: true
-          dotnet: true
-          haskell: true
-          large-packages: true
-          docker-images: true
-          swap-storage: true
-
-      - name: Determine tag name
-        id: tag
-        shell: bash
-        run: |
-          BUILD_NUMBER="$(git rev-list --count HEAD)"
-          SHORT_HASH="$(git rev-parse --short=7 HEAD)"
-          if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
-            echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
-          else
-            SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
-            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
-          fi
-
      - name: Build and push Docker image (versioned)
        if: github.event_name == 'push'
        uses: docker/build-push-action@v4
@@ -89,7 +59,7 @@ jobs:
          context: .
          push: true
          platforms: ${{ matrix.config.platforms }}
-          tags: "ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
+          tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
          file: ${{ matrix.config.dockerfile }}

      - name: Build and push Docker image (tagged)
@@ -98,5 +68,5 @@ jobs:
          context: .
          push: ${{ github.event_name == 'push' }}
          platforms: ${{ matrix.config.platforms }}
-          tags: "ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }},ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}"
+          tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}"
          file: ${{ matrix.config.dockerfile }}
--- a/.github/workflows/nix-ci.yml
+++ b/.github/workflows/nix-ci.yml
@@ -1,112 +0,0 @@
-name: Nix CI
-
-on:
-  workflow_dispatch: # allows manual triggering
-  push:
-    branches:
-      - master
-    paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix']
-  pull_request:
-    types: [opened, synchronize, reopened]
-    paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix']
-
-jobs:
-  nix-eval:
-    strategy:
-      fail-fast: false
-      matrix:
-        os: [ ubuntu-latest, macos-latest ]
-    runs-on: ${{ matrix.os }}
-    steps:
-    - name: Checkout repository
-      uses: actions/checkout@v4
-    - name: Install Nix
-      uses: DeterminateSystems/nix-installer-action@v9
-      with:
-        github-token: ${{ secrets.GITHUB_TOKEN }}
-        extra-conf: |
-          extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org
-          extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
-    - uses: DeterminateSystems/magic-nix-cache-action@v2
-      with:
-        upstream-cache: https://${{ matrix.cachixName }}.cachix.org
-    - name: List all flake outputs
-      run: nix flake show --all-systems
-    - name: Show all output paths
-      run: >
-          nix run github:nix-community/nix-eval-jobs
-          -- --gc-roots-dir gcroot
-          --flake
-          ".#packages.$(nix eval --raw --impure --expr builtins.currentSystem)"
-  nix-build:
-    if: ${{ vars.CACHIX_NAME != '' }}
-    strategy:
-      fail-fast: false
-      matrix:
-        os: [ ubuntu-latest, macos-latest ]
-    runs-on: ${{ matrix.os }}
-    steps:
-    - name: Checkout repository
-      uses: actions/checkout@v4
-    - name: Install Nix
-      uses: DeterminateSystems/nix-installer-action@v9
-      with:
-        github-token: ${{ secrets.GITHUB_TOKEN }}
-        extra-conf: |
-          extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org
-          extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
-    - uses: DeterminateSystems/magic-nix-cache-action@v2
-      with:
-        upstream-cache: https://${{ matrix.cachixName }}.cachix.org
-    - name: Set-up cachix to push the results to
-      uses: cachix/cachix-action@v13
-      with:
-        authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
-        name: ${{ vars.CACHIX_NAME }}
-    - name: Build
-      run: >
-          nix run github:Mic92/nix-fast-build
-          -- --skip-cached --no-nom
-          --flake
-          ".#checks.$(nix eval --raw --impure --expr builtins.currentSystem)"
-  nix-build-aarch64:
-    if: ${{ vars.CACHIX_NAME != '' }}
-    runs-on: ubuntu-latest
-    steps:
-    - name: Checkout repository
-      uses: actions/checkout@v4
-    - name: Install QEMU
-      # Copy-paste from https://github.com/orgs/community/discussions/8305#discussioncomment-5888654
-      run: |
-        sudo apt-get install -y qemu-user-static qemu-system-aarch64
-        sudo usermod -a -G kvm $USER
-    - name: Install Nix
-      uses: DeterminateSystems/nix-installer-action@v9
-      with:
-        github-token: ${{ secrets.GITHUB_TOKEN }}
-        extra-conf: |
-          extra-platforms = aarch64-linux
-          extra-system-features = nixos-test kvm
-          extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org
-          extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
-    - uses: DeterminateSystems/magic-nix-cache-action@v2
-      with:
-        upstream-cache: https://${{ matrix.cachixName }}.cachix.org
-    - name: Set-up cachix to push the results to
-      uses: cachix/cachix-action@v13
-      with:
-        authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
-        name: ${{ vars.CACHIX_NAME }}
-    - name: Show all output paths
-      run: >
-          nix run github:nix-community/nix-eval-jobs
-          -- --gc-roots-dir gcroot
-          --flake
-          ".#packages.aarch64-linux"
-    - name: Build
-      run: >
-          nix run github:Mic92/nix-fast-build
-          -- --skip-cached --no-nom
-          --systems aarch64-linux
-          --flake
-          ".#checks.aarch64-linux"
--- a/.github/workflows/nix-flake-update.yml
+++ b/.github/workflows/nix-flake-update.yml
@@ -1,22 +0,0 @@
-name: update-flake-lock
-on:
-  workflow_dispatch:
-  schedule:
-    - cron: '0 0 * * 0' # runs weekly on Sunday at 00:00
-
-jobs:
-  lockfile:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-      - name: Install Nix
-        uses: DeterminateSystems/nix-installer-action@main
-      - name: Update flake.lock
-        uses: DeterminateSystems/update-flake-lock@main
-        with:
-          pr-title: "nix: update flake.lock"
-          pr-labels: |
-            nix
-          pr-reviewers: philiptaron,SomeoneSerge
-          token: ${{ secrets.GITHUB_TOKEN }}
--- a/.github/workflows/nix-publish-flake.yml
+++ b/.github/workflows/nix-publish-flake.yml
@@ -1,36 +0,0 @@
-# Make the flake discoverable on https://flakestry.dev and https://flakehub.com/flakes
-name: "Publish a flake to flakestry & flakehub"
-on:
-    push:
-        tags:
-        - "*"
-    workflow_dispatch:
-        inputs:
-            tag:
-                description: "The existing tag to publish"
-                type: "string"
-                required: true
-jobs:
-    flakestry-publish:
-        runs-on: ubuntu-latest
-        permissions:
-            id-token: "write"
-            contents: "read"
-        steps:
-            - uses: flakestry/flakestry-publish@main
-              with:
-                version: "${{ inputs.tag || github.ref_name }}"
-    flakehub-publish:
-      runs-on: "ubuntu-latest"
-      permissions:
-        id-token: "write"
-        contents: "read"
-      steps:
-        - uses: "actions/checkout@v4"
-          with:
-            ref: "${{ (inputs.tag != null) && format('refs/tags/{0}', inputs.tag) || '' }}"
-        - uses: "DeterminateSystems/nix-installer-action@main"
-        - uses: "DeterminateSystems/flakehub-push@main"
-          with:
-            visibility: "public"
-            tag: "${{ inputs.tag }}"
--- a/.github/workflows/python-check-requirements.yml
+++ b/.github/workflows/python-check-requirements.yml
@@ -1,29 +0,0 @@
-name: Python check requirements.txt
-
-on:
-  push:
-    paths:
-      - 'scripts/check-requirements.sh'
-      - 'convert*.py'
-      - 'requirements.txt'
-      - 'requirements/*.txt'
-  pull_request:
-    paths:
-      - 'scripts/check-requirements.sh'
-      - 'convert*.py'
-      - 'requirements.txt'
-      - 'requirements/*.txt'
-
-jobs:
-  python-check-requirements:
-    runs-on: ubuntu-latest
-    name: check-requirements
-    steps:
-      - name: Check out source repository
-        uses: actions/checkout@v3
-      - name: Set up Python environment
-        uses: actions/setup-python@v4
-        with:
-          python-version: "3.11"
-      - name: Run check-requirements.sh script
-        run:  bash scripts/check-requirements.sh nocleanup
--- a/.github/workflows/python-lint.yml
+++ b/.github/workflows/python-lint.yml
@@ -1,20 +0,0 @@
-name: flake8 Lint
-
-on: [push, pull_request]
-
-jobs:
-  flake8-lint:
-    runs-on: ubuntu-latest
-    name: Lint
-    steps:
-      - name: Check out source repository
-        uses: actions/checkout@v3
-      - name: Set up Python environment
-        uses: actions/setup-python@v4
-        with:
-          python-version: "3.11"
-      - name: flake8 Lint
-        uses: py-actions/flake8@v2
-        with:
-            ignore: "E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704"
-            exclude: "examples/*,examples/*/**,*/**/__init__.py"
--- a/.gitignore
+++ b/.gitignore
@@ -46,12 +46,9 @@ models-mnt
 /infill
 /libllama.so
 /llama-bench
-/llava-cli
-/lookahead
-/lookup
+/llava
 /main
 /metal
-/passkey
 /perplexity
 /q8dot
 /quantize
@@ -67,7 +64,6 @@ models-mnt
 /speculative
 /parallel
 /train-text-from-scratch
-/tokenize
 /vdot
 /common/build-info.cpp
 arm_neon.h
@@ -90,17 +86,15 @@ poetry.lock
 poetry.toml

 # Test binaries
-/tests/test-grammar-parser
-/tests/test-llama-grammar
-/tests/test-double-float
-/tests/test-grad0
-/tests/test-opt
-/tests/test-quantize-fns
-/tests/test-quantize-perf
-/tests/test-sampling
-/tests/test-tokenizer-0-llama
-/tests/test-tokenizer-0-falcon
-/tests/test-tokenizer-1-llama
-/tests/test-tokenizer-1-bpe
-/tests/test-rope
-/tests/test-backend-ops
+tests/test-grammar-parser
+tests/test-llama-grammar
+tests/test-double-float
+tests/test-grad0
+tests/test-opt
+tests/test-quantize-fns
+tests/test-quantize-perf
+tests/test-sampling
+tests/test-tokenizer-0-llama
+tests/test-tokenizer-0-falcon
+tests/test-tokenizer-1-llama
+tests/test-tokenizer-1-bpe
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -43,7 +43,6 @@ else()
 endif()

 # general
-option(BUILD_SHARED_LIBS                "build shared libraries"                                OFF)
 option(LLAMA_STATIC                     "llama: static link libraries"                          OFF)
 option(LLAMA_NATIVE                     "llama: enable -march=native flag"                      ON)
 option(LLAMA_LTO                        "llama: enable link time optimization"                  OFF)
@@ -91,20 +90,15 @@ set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for
 set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
                                             "llama: max. batch size for using peer access")
 option(LLAMA_HIPBLAS                         "llama: use hipBLAS"                               OFF)
-option(LLAMA_HIP_UMA                         "llama: use HIP unified memory architecture"       OFF)
 option(LLAMA_CLBLAST                         "llama: use CLBlast"                               OFF)
 option(LLAMA_METAL                           "llama: use Metal"                                 ${LLAMA_METAL_DEFAULT})
 option(LLAMA_METAL_NDEBUG                    "llama: disable Metal debugging"                   OFF)
-option(LLAMA_METAL_SHADER_DEBUG              "llama: compile Metal with -fno-fast-math"         OFF)
 option(LLAMA_MPI                             "llama: use MPI"                                   OFF)
 option(LLAMA_QKK_64                          "llama: use super-block size of 64 for k-quants"   OFF)

-option(LLAMA_BUILD_TESTS                     "llama: build tests"    ${LLAMA_STANDALONE})
-option(LLAMA_BUILD_EXAMPLES                  "llama: build examples" ${LLAMA_STANDALONE})
-option(LLAMA_BUILD_SERVER                    "llama: build server example"                      ON)
-
-# Required for relocatable CMake package
-include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
+option(LLAMA_BUILD_TESTS                "llama: build tests"    ${LLAMA_STANDALONE})
+option(LLAMA_BUILD_EXAMPLES             "llama: build examples" ${LLAMA_STANDALONE})
+option(LLAMA_BUILD_SERVER               "llama: build server example"                           ON)

 #
 # Compile flags
@@ -118,11 +112,6 @@ set(THREADS_PREFER_PTHREAD_FLAG ON)
 find_package(Threads REQUIRED)
 include(CheckCXXCompilerFlag)

-# enable libstdc++ assertions for debug builds
-if (CMAKE_SYSTEM_NAME MATCHES "Linux")
-    add_compile_definitions($<$<CONFIG:Debug>:_GLIBCXX_ASSERTIONS>)
-endif()
-
 if (NOT MSVC)
    if (LLAMA_SANITIZE_THREAD)
        add_compile_options(-fsanitize=thread)
@@ -155,9 +144,9 @@ if (APPLE AND LLAMA_ACCELERATE)
 endif()

 if (LLAMA_METAL)
-    find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
-    find_library(METAL_FRAMEWORK    Metal      REQUIRED)
-    find_library(METALKIT_FRAMEWORK MetalKit   REQUIRED)
+    find_library(FOUNDATION_LIBRARY         Foundation              REQUIRED)
+    find_library(METAL_FRAMEWORK            Metal                   REQUIRED)
+    find_library(METALKIT_FRAMEWORK         MetalKit                REQUIRED)

    message(STATUS "Metal framework found")
    set(GGML_HEADERS_METAL ggml-metal.h)
@@ -172,36 +161,7 @@ if (LLAMA_METAL)
    #add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/")

    # copy ggml-metal.metal to bin directory
-    configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY)
-
-    if (LLAMA_METAL_SHADER_DEBUG)
-        # custom command to do the following:
-        #   xcrun -sdk macosx metal    -fno-fast-math -c ggml-metal.metal -o ggml-metal.air
-        #   xcrun -sdk macosx metallib                   ggml-metal.air   -o default.metallib
-        #
-        # note: this is the only way I found to disable fast-math in Metal. it's ugly, but at least it works
-        #       disabling fast math is needed in order to pass tests/test-backend-ops
-        # note: adding -fno-inline fixes the tests when using MTL_SHADER_VALIDATION=1
-        # note: unfortunately, we have to call it default.metallib instead of ggml.metallib
-        #       ref: https://github.com/ggerganov/whisper.cpp/issues/1720
-        set(XC_FLAGS -fno-fast-math -fno-inline -g)
-        if (LLAMA_QKK_64)
-            set(XC_FLAGS ${XC_FLAGS} -DQK_K=64)
-        endif()
-
-        add_custom_command(
-            OUTPUT ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
-            COMMAND xcrun -sdk macosx metal    ${XC_FLAGS} -c ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air
-            COMMAND xcrun -sdk macosx metallib                ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air   -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
-            DEPENDS ggml-metal.metal
-            COMMENT "Compiling Metal kernels"
-        )
-
-        add_custom_target(
-            ggml-metal ALL
-            DEPENDS ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
-        )
-    endif()
+    configure_file(ggml-metal.metal bin/ggml-metal.metal COPYONLY)

    set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS}
        ${FOUNDATION_LIBRARY}
@@ -230,11 +190,7 @@ if (LLAMA_BLAS)
            if (${LLAMA_BLAS_VENDOR} MATCHES "Generic")
                pkg_check_modules(DepBLAS REQUIRED blas)
            elseif (${LLAMA_BLAS_VENDOR} MATCHES "OpenBLAS")
-                # As of openblas v0.3.22, the 64-bit is named openblas64.pc
-                pkg_check_modules(DepBLAS openblas64)
-                if (NOT DepBLAS_FOUND)
-                    pkg_check_modules(DepBLAS REQUIRED openblas)
-                endif()
+                pkg_check_modules(DepBLAS REQUIRED openblas)
            elseif (${LLAMA_BLAS_VENDOR} MATCHES "FLAME")
                pkg_check_modules(DepBLAS REQUIRED blis)
            elseif (${LLAMA_BLAS_VENDOR} MATCHES "ATLAS")
@@ -326,18 +282,11 @@ if (LLAMA_CUBLAS)
        add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${LLAMA_CUDA_PEER_MAX_BATCH_SIZE})

        if (LLAMA_STATIC)
-            if (WIN32)
-                # As of 12.3.1 CUDA Tookit for Windows does not offer a static cublas library
-                set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
-            else ()
-                set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
-            endif()
+            set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
        else()
            set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
        endif()

-        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cuda_driver)
-
    if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
        # 52 == lowest CUDA 12 standard
        # 60 == f16 CUDA intrinsics
@@ -414,9 +363,6 @@ if (LLAMA_HIPBLAS)
    if (${hipblas_FOUND} AND ${hip_FOUND})
        message(STATUS "HIP and hipBLAS found")
        add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS)
-        if (LLAMA_HIP_UMA)
-            add_compile_definitions(GGML_HIP_UMA)
-        endif()
        add_library(ggml-rocm OBJECT ggml-cuda.cu ggml-cuda.h)
        if (BUILD_SHARED_LIBS)
            set_target_properties(ggml-rocm PROPERTIES POSITION_INDEPENDENT_CODE ON)
@@ -442,102 +388,57 @@ if (LLAMA_HIPBLAS)
    endif()
 endif()

-function(get_flags CCID CCVER)
-    set(C_FLAGS "")
-    set(CXX_FLAGS "")
-
-    if (CCID MATCHES "Clang")
-        set(C_FLAGS   -Wunreachable-code-break -Wunreachable-code-return)
-        set(CXX_FLAGS -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi)
-
-        if (
-            (CCID STREQUAL "Clang"      AND CCVER VERSION_GREATER_EQUAL 3.8.0) OR
-            (CCID STREQUAL "AppleClang" AND CCVER VERSION_GREATER_EQUAL 7.3.0)
-        )
-            set(C_FLAGS ${C_FLAGS} -Wdouble-promotion)
-        endif()
-    elseif (CCID STREQUAL "GNU")
-        set(C_FLAGS   -Wdouble-promotion)
-        set(CXX_FLAGS -Wno-array-bounds)
-
-        if (CCVER VERSION_GREATER_EQUAL 7.1.0)
-            set(CXX_FLAGS ${CXX_FLAGS} -Wno-format-truncation)
-        endif()
-        if (CCVER VERSION_GREATER_EQUAL 8.1.0)
-            set(CXX_FLAGS ${CXX_FLAGS} -Wextra-semi)
-        endif()
-    endif()
-
-    set(GF_C_FLAGS   ${C_FLAGS}   PARENT_SCOPE)
-    set(GF_CXX_FLAGS ${CXX_FLAGS} PARENT_SCOPE)
-endfunction()
-
 if (LLAMA_ALL_WARNINGS)
    if (NOT MSVC)
-        set(WARNING_FLAGS -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function)
-        set(C_FLAGS       -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes
-                          -Werror=implicit-int -Werror=implicit-function-declaration)
-        set(CXX_FLAGS     -Wmissing-declarations -Wmissing-noreturn)
+        set(warning_flags -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function)
+        set(c_flags -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration)
+        set(cxx_flags -Wmissing-declarations -Wmissing-noreturn)
+        set(host_cxx_flags "")

-        set(C_FLAGS   ${WARNING_FLAGS} ${C_FLAGS})
-        set(CXX_FLAGS ${WARNING_FLAGS} ${CXX_FLAGS})
+        if (CMAKE_C_COMPILER_ID MATCHES "Clang")
+            set(warning_flags ${warning_flags} -Wunreachable-code-break -Wunreachable-code-return)
+            set(host_cxx_flags ${host_cxx_flags} -Wmissing-prototypes -Wextra-semi)

-        get_flags(${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION})
+            if (
+                (CMAKE_C_COMPILER_ID STREQUAL "Clang"      AND CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL 3.8.0) OR
+                (CMAKE_C_COMPILER_ID STREQUAL "AppleClang" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL 7.3.0)
+            )
+                set(c_flags ${c_flags} -Wdouble-promotion)
+            endif()
+        elseif (CMAKE_C_COMPILER_ID STREQUAL "GNU")
+            set(c_flags ${c_flags} -Wdouble-promotion)
+            set(host_cxx_flags ${host_cxx_flags} -Wno-array-bounds)

-        add_compile_options("$<$<COMPILE_LANGUAGE:C>:${C_FLAGS};${GF_C_FLAGS}>"
-                            "$<$<COMPILE_LANGUAGE:CXX>:${CXX_FLAGS};${GF_CXX_FLAGS}>")
+            if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 7.1.0)
+                set(host_cxx_flags ${host_cxx_flags} -Wno-format-truncation)
+            endif()
+            if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 8.1.0)
+                set(host_cxx_flags ${host_cxx_flags} -Wextra-semi)
+            endif()
+        endif()
    else()
        # todo : msvc
-        set(C_FLAGS   "")
-        set(CXX_FLAGS "")
    endif()
+
+    set(c_flags   ${c_flags}   ${warning_flags})
+    set(cxx_flags ${cxx_flags} ${warning_flags})
+    add_compile_options("$<$<COMPILE_LANGUAGE:C>:${c_flags}>"
+                        "$<$<COMPILE_LANGUAGE:CXX>:${cxx_flags}>"
+                        "$<$<COMPILE_LANGUAGE:CXX>:${host_cxx_flags}>")
+
 endif()

-if (LLAMA_CUBLAS)
-    set(CUDA_FLAGS ${CXX_FLAGS} -use_fast_math)
-    if (NOT MSVC)
-        set(CUDA_FLAGS ${CUDA_FLAGS} -Wno-pedantic)
-    endif()
-
-    if (LLAMA_ALL_WARNINGS AND NOT MSVC)
-        set(NVCC_CMD ${CMAKE_CUDA_COMPILER} .c)
-        if (NOT CMAKE_CUDA_HOST_COMPILER STREQUAL "")
-            set(NVCC_CMD ${NVCC_CMD} -ccbin ${CMAKE_CUDA_HOST_COMPILER})
-        endif()
-
-        execute_process(
-            COMMAND ${NVCC_CMD} -Xcompiler --version
-            OUTPUT_VARIABLE CUDA_CCFULLVER
-            ERROR_QUIET
-        )
-
-        if (NOT CUDA_CCFULLVER MATCHES clang)
-            set(CUDA_CCID "GNU")
-            execute_process(
-                COMMAND ${NVCC_CMD} -Xcompiler "-dumpfullversion -dumpversion"
-                OUTPUT_VARIABLE CUDA_CCVER
-                ERROR_QUIET
-            )
-        else()
-            if (CUDA_CCFULLVER MATCHES Apple)
-                set(CUDA_CCID "AppleClang")
-            else()
-                set(CUDA_CCID "Clang")
-            endif()
-            string(REGEX REPLACE "^.* version ([0-9.]*).*$" "\\1" CUDA_CCVER ${CUDA_CCFULLVER})
-        endif()
-
-        message("-- CUDA host compiler is ${CUDA_CCID} ${CUDA_CCVER}")
-
-        get_flags(${CUDA_CCID} ${CUDA_CCVER})
-        list(JOIN GF_CXX_FLAGS " " CUDA_CXX_FLAGS)  # pass host compiler flags as a single argument
-        if (NOT CUDA_CXX_FLAGS STREQUAL "")
-            set(CUDA_FLAGS ${CUDA_FLAGS} -Xcompiler ${CUDA_CXX_FLAGS})
-        endif()
-    endif()
-
-    add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:${CUDA_FLAGS}>")
+if (NOT MSVC)
+    set(cuda_flags -Wno-pedantic)
 endif()
+set(cuda_flags ${cxx_flags} -use_fast_math ${cuda_flags})
+
+list(JOIN host_cxx_flags " " cuda_host_flags)  # pass host compiler flags as a single argument
+if (NOT cuda_host_flags STREQUAL "")
+    set(cuda_flags ${cuda_flags} -Xcompiler ${cuda_host_flags})
+endif()
+
+add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:${cuda_flags}>")

 if (WIN32)
    add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
@@ -557,16 +458,6 @@ if (LLAMA_LTO)
    endif()
 endif()

-# this version of Apple ld64 is buggy
-execute_process(
-    COMMAND ${CMAKE_C_COMPILER} ${CMAKE_EXE_LINKER_FLAGS} -Wl,-v
-    ERROR_VARIABLE output
-    OUTPUT_QUIET
-)
-if (output MATCHES "dyld-1015\.7")
-    add_compile_definitions(HAVE_BUGGY_APPLE_LINKER)
-endif()
-
 # Architecture specific
 # TODO: probably these flags need to be tweaked on some architectures
 #       feel free to update the Makefile for your architecture and send a pull request or issue
@@ -674,21 +565,12 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GE
    endif()
 elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
    message(STATUS "PowerPC detected")
-    if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
-        add_compile_options(-mcpu=powerpc64le)
-    else()
-        add_compile_options(-mcpu=native -mtune=native)
-        #TODO: Add  targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
-    endif()
+    add_compile_options(-mcpu=native -mtune=native)
+    #TODO: Add  targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
 else()
    message(STATUS "Unknown architecture")
 endif()

-if (MINGW)
-    # Target Windows 8 for PrefetchVirtualMemory
-    add_compile_definitions(_WIN32_WINNT=0x602)
-endif()
-
 #
 # POSIX conformance
 #
@@ -758,11 +640,11 @@ add_library(ggml OBJECT
            ggml-backend.h
            ggml-quants.c
            ggml-quants.h
-            ${GGML_SOURCES_CUDA}   ${GGML_HEADERS_CUDA}
+            ${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
            ${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
-            ${GGML_SOURCES_METAL}  ${GGML_HEADERS_METAL}
-            ${GGML_SOURCES_MPI}    ${GGML_HEADERS_MPI}
-            ${GGML_SOURCES_EXTRA}  ${GGML_HEADERS_EXTRA}
+            ${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
+            ${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI}
+            ${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA}
            )

 target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})
--- a/232
+++ b/232
@@ -1,15 +1,14 @@
 # Define the default target now so that it is always the first target
 BUILD_TARGETS = \
 	main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
-	simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search  \
-	speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey tests/test-c.o
+	simple batched batched-bench save-load-state server gguf llama-bench llava baby-llama beam-search  \
+	speculative infill benchmark-matmult parallel finetune export-lora tests/test-c.o

 # Binaries only useful for tests
 TEST_TARGETS = \
 	tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
 	tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama          \
-	tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope      \
-	tests/test-backend-ops
+	tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe

 # Code coverage output files
 COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@@ -26,6 +25,20 @@ ifndef UNAME_M
 UNAME_M := $(shell uname -m)
 endif

+ifeq '' '$(findstring clang,$(shell $(CC) --version))'
+	CC_IS_GCC=1
+	CC_VER := $(shell $(CC) -dumpfullversion -dumpversion | awk -F. '{ printf("%02d%02d%02d", $$1, $$2, $$3) }')
+else
+	CC_IS_CLANG=1
+	ifeq '' '$(findstring Apple LLVM,$(shell $(CC) --version))'
+		CC_IS_LLVM_CLANG=1
+	else
+		CC_IS_APPLE_CLANG=1
+	endif
+	CC_VER := $(shell $(CC) --version | sed -n 's/^.* version \([0-9.]*\).*$$/\1/p' \
+				| awk -F. '{ printf("%02d%02d%02d", $$1, $$2, $$3) }')
+endif
+
 # Mac OS + Arm can report x86_64
 # ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
 ifeq ($(UNAME_S),Darwin)
@@ -65,7 +78,7 @@ test: $(TEST_TARGETS)
 			./$$test_target; \
 		fi; \
 		if [ $$? -ne 0 ]; then \
-			printf 'Test %s FAILED!\n\n' $$test_target; \
+			printf 'Test $$test_target FAILED!\n\n' $$test_target; \
 			failures=$$(( failures + 1 )); \
 		else \
 			printf 'Test %s passed.\n\n' $$test_target; \
@@ -107,12 +120,12 @@ MK_CXXFLAGS = -std=c++11 -fPIC

 # -Ofast tends to produce faster code, but may not be available for some compilers.
 ifdef LLAMA_FAST
-MK_CFLAGS     += -Ofast
-HOST_CXXFLAGS += -Ofast
-MK_NVCCFLAGS  += -O3
+MK_CFLAGS        += -Ofast
+MK_HOST_CXXFLAGS += -Ofast
+MK_CUDA_CXXFLAGS += -O3
 else
-MK_CFLAGS     += -O3
-MK_CXXFLAGS   += -O3
+MK_CFLAGS        += -O3
+MK_CXXFLAGS      += -O3
 endif

 # clock_gettime came in POSIX.1b (1993)
@@ -161,10 +174,6 @@ ifdef LLAMA_DEBUG
 	MK_CFLAGS   += -O0 -g
 	MK_CXXFLAGS += -O0 -g
 	MK_LDFLAGS  += -g
-
-	ifeq ($(UNAME_S),Linux)
-		MK_CXXFLAGS += -Wp,-D_GLIBCXX_ASSERTIONS
-	endif
 else
 	MK_CPPFLAGS += -DNDEBUG
 endif
@@ -206,9 +215,28 @@ MK_CFLAGS    += $(WARN_FLAGS) -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmis
 				-Werror=implicit-function-declaration
 MK_CXXFLAGS  += $(WARN_FLAGS) -Wmissing-declarations -Wmissing-noreturn

-# this version of Apple ld64 is buggy
-ifneq '' '$(findstring dyld-1015.7,$(shell $(CC) $(LDFLAGS) -Wl,-v 2>&1))'
-	MK_CPPFLAGS += -DHAVE_BUGGY_APPLE_LINKER
+ifeq ($(CC_IS_CLANG), 1)
+	# clang options
+	MK_CFLAGS        += -Wunreachable-code-break -Wunreachable-code-return
+	MK_HOST_CXXFLAGS += -Wunreachable-code-break -Wunreachable-code-return -Wmissing-prototypes -Wextra-semi
+
+	ifneq '' '$(and $(CC_IS_LLVM_CLANG),$(filter 1,$(shell expr $(CC_VER) \>= 030800)))'
+		MK_CFLAGS += -Wdouble-promotion
+	endif
+	ifneq '' '$(and $(CC_IS_APPLE_CLANG),$(filter 1,$(shell expr $(CC_VER) \>= 070300)))'
+		MK_CFLAGS += -Wdouble-promotion
+	endif
+else
+	# gcc options
+	MK_CFLAGS        += -Wdouble-promotion
+	MK_HOST_CXXFLAGS += -Wno-array-bounds
+
+	ifeq ($(shell expr $(CC_VER) \>= 070100), 1)
+		MK_HOST_CXXFLAGS += -Wno-format-truncation
+	endif
+	ifeq ($(shell expr $(CC_VER) \>= 080100), 1)
+		MK_HOST_CXXFLAGS += -Wextra-semi
+	endif
 endif

 # OS specific
@@ -256,8 +284,8 @@ ifndef RISCV

 ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
 	# Use all CPU extensions that are available:
-	MK_CFLAGS     += -march=native -mtune=native
-	HOST_CXXFLAGS += -march=native -mtune=native
+	MK_CFLAGS   += -march=native -mtune=native
+	MK_HOST_CXXFLAGS += -march=native -mtune=native

 	# Usage AVX-only
 	#MK_CFLAGS   += -mfma -mf16c -mavx
@@ -268,31 +296,19 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
 	#MK_CXXFLAGS += -mssse3
 endif

+# The stack is only 16-byte aligned on Windows, so don't let gcc emit aligned moves.
+# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412
+# https://github.com/ggerganov/llama.cpp/issues/2922
 ifneq '' '$(findstring mingw,$(shell $(CC) -dumpmachine))'
-	# The stack is only 16-byte aligned on Windows, so don't let gcc emit aligned moves.
-	# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412
-	# https://github.com/ggerganov/llama.cpp/issues/2922
 	MK_CFLAGS   += -Xassembler -muse-unaligned-vector-move
 	MK_CXXFLAGS += -Xassembler -muse-unaligned-vector-move
-
-	# Target Windows 8 for PrefetchVirtualMemory
-	MK_CPPFLAGS += -D_WIN32_WINNT=0x602
 endif

 ifneq ($(filter aarch64%,$(UNAME_M)),)
 	# Apple M1, M2, etc.
 	# Raspberry Pi 3, 4, Zero 2 (64-bit)
-	# Nvidia Jetson
 	MK_CFLAGS   += -mcpu=native
 	MK_CXXFLAGS += -mcpu=native
-	JETSON_RELEASE_INFO = $(shell jetson_release)
-	ifdef JETSON_RELEASE_INFO
-		ifneq ($(filter TX2%,$(JETSON_RELEASE_INFO)),)
-			JETSON_EOL_MODULE_DETECT = 1
-			CC = aarch64-unknown-linux-gnu-gcc
-			cxx = aarch64-unknown-linux-gnu-g++
-		endif
-	endif
 endif

 ifneq ($(filter armv6%,$(UNAME_M)),)
@@ -321,12 +337,6 @@ ifneq ($(filter ppc64%,$(UNAME_M)),)
 	endif
 endif

-ifneq ($(filter ppc64le%,$(UNAME_M)),)
-	MK_CFLAGS   += -mcpu=powerpc64le
-	MK_CXXFLAGS += -mcpu=powerpc64le
-	CUDA_POWER_ARCH = 1
-endif
-
 else
 	MK_CFLAGS   += -march=rv64gcv -mabi=lp64d
 	MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
@@ -366,72 +376,62 @@ ifdef LLAMA_BLIS
 endif # LLAMA_BLIS

 ifdef LLAMA_CUBLAS
-	MK_CPPFLAGS  += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include -I/usr/local/cuda/targets/aarch64-linux/include
-	MK_LDFLAGS   += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib -L/usr/local/cuda/targets/aarch64-linux/lib -L/usr/lib/wsl/lib
+	MK_CPPFLAGS  += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
+	MK_LDFLAGS   += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
 	OBJS         += ggml-cuda.o
-	MK_NVCCFLAGS  = -use_fast_math
-ifndef JETSON_EOL_MODULE_DETECT
-	MK_NVCCFLAGS += --forward-unknown-to-host-compiler
-endif # JETSON_EOL_MODULE_DETECT
-ifdef LLAMA_DEBUG
-	MK_NVCCFLAGS += -lineinfo
-endif # LLAMA_DEBUG
+	NVCCFLAGS = --forward-unknown-to-host-compiler -use_fast_math
 ifdef LLAMA_CUDA_NVCC
 	NVCC = $(LLAMA_CUDA_NVCC)
 else
 	NVCC = nvcc
 endif #LLAMA_CUDA_NVCC
 ifdef CUDA_DOCKER_ARCH
-	MK_NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH)
-else ifndef CUDA_POWER_ARCH
-	MK_NVCCFLAGS += -arch=native
+	NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH)
+else
+	NVCCFLAGS += -arch=native
 endif # CUDA_DOCKER_ARCH
 ifdef LLAMA_CUDA_FORCE_DMMV
-	MK_NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV
+	NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV
 endif # LLAMA_CUDA_FORCE_DMMV
 ifdef LLAMA_CUDA_FORCE_MMQ
-	MK_NVCCFLAGS += -DGGML_CUDA_FORCE_MMQ
+	NVCCFLAGS += -DGGML_CUDA_FORCE_MMQ
 endif # LLAMA_CUDA_FORCE_MMQ
 ifdef LLAMA_CUDA_DMMV_X
-	MK_NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
+	NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
 else
-	MK_NVCCFLAGS += -DGGML_CUDA_DMMV_X=32
+	NVCCFLAGS += -DGGML_CUDA_DMMV_X=32
 endif # LLAMA_CUDA_DMMV_X
 ifdef LLAMA_CUDA_MMV_Y
-	MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
+	NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
 else ifdef LLAMA_CUDA_DMMV_Y
-	MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_DMMV_Y) # for backwards compatibility
+	NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_DMMV_Y) # for backwards compatibility
 else
-	MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=1
+	NVCCFLAGS += -DGGML_CUDA_MMV_Y=1
 endif # LLAMA_CUDA_MMV_Y
 ifdef LLAMA_CUDA_F16
-	MK_NVCCFLAGS += -DGGML_CUDA_F16
+	NVCCFLAGS += -DGGML_CUDA_F16
 endif # LLAMA_CUDA_F16
 ifdef LLAMA_CUDA_DMMV_F16
-	MK_NVCCFLAGS += -DGGML_CUDA_F16
+	NVCCFLAGS += -DGGML_CUDA_F16
 endif # LLAMA_CUDA_DMMV_F16
 ifdef LLAMA_CUDA_KQUANTS_ITER
-	MK_NVCCFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
+	NVCCFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
 else
-	MK_NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
+	NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
 endif
 ifdef LLAMA_CUDA_PEER_MAX_BATCH_SIZE
-	MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=$(LLAMA_CUDA_PEER_MAX_BATCH_SIZE)
+	NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=$(LLAMA_CUDA_PEER_MAX_BATCH_SIZE)
 else
-	MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128
+	NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128
 endif # LLAMA_CUDA_PEER_MAX_BATCH_SIZE
 #ifdef LLAMA_CUDA_CUBLAS
-#	MK_NVCCFLAGS += -DGGML_CUDA_CUBLAS
+#	NVCCFLAGS += -DGGML_CUDA_CUBLAS
 #endif # LLAMA_CUDA_CUBLAS
 ifdef LLAMA_CUDA_CCBIN
-	MK_NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
+	NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
 endif
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
-ifdef JETSON_EOL_MODULE_DETECT
-	$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
-else
-	$(NVCC) $(BASE_CXXFLAGS) $(NVCCFLAGS) -Wno-pedantic -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
-endif # JETSON_EOL_MODULE_DETECT
+	$(NVCC) $(NVCCFLAGS) -c $< -o $@
 endif # LLAMA_CUBLAS

 ifdef LLAMA_CLBLAST
@@ -453,22 +453,13 @@ ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
 endif # LLAMA_CLBLAST

 ifdef LLAMA_HIPBLAS
-
-	ifeq ($(wildcard /opt/rocm),)
-		ROCM_PATH	?= /usr
-		GPU_TARGETS ?= $(shell $(shell which amdgpu-arch))
-	else
-		ROCM_PATH	?= /opt/rocm
-		GPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
-	endif
-	HIPCC                   ?= $(ROCM_PATH)/bin/hipcc
+	ROCM_PATH	?= /opt/rocm
+	HIPCC	    ?= $(ROCM_PATH)/bin/hipcc
+	GPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
 	LLAMA_CUDA_DMMV_X       ?= 32
 	LLAMA_CUDA_MMV_Y        ?= 1
 	LLAMA_CUDA_KQUANTS_ITER ?= 2
 	MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
-ifdef LLAMA_HIP_UMA
-	MK_CPPFLAGS += -DGGML_HIP_UMA
-endif # LLAMA_HIP_UMA
 	MK_LDFLAGS  += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
 	MK_LDFLAGS	+= -lhipblas -lamdhip64 -lrocblas
 	HIPFLAGS    += $(addprefix --offload-arch=,$(GPU_TARGETS))
@@ -502,22 +493,16 @@ ggml-mpi.o: ggml-mpi.c ggml-mpi.h
 	$(CC) $(CFLAGS) -c $< -o $@
 endif # LLAMA_MPI

-GF_CC := $(CC)
-include scripts/get-flags.mk
-
 # combine build flags with cmdline overrides
-override CFLAGS    := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CFLAGS) $(GF_CFLAGS) $(CFLAGS)
-BASE_CXXFLAGS      := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CXXFLAGS) $(CXXFLAGS)
-override CXXFLAGS  := $(BASE_CXXFLAGS) $(HOST_CXXFLAGS) $(GF_CXXFLAGS)
-override NVCCFLAGS := $(MK_NVCCFLAGS) $(NVCCFLAGS)
-override LDFLAGS   := $(MK_LDFLAGS) $(LDFLAGS)
+override CFLAGS        := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CFLAGS) $(CFLAGS)
+override CXXFLAGS      := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CXXFLAGS) $(CXXFLAGS)
+override CUDA_CXXFLAGS := $(MK_CUDA_CXXFLAGS) $(CUDA_CXXFLAGS)
+override HOST_CXXFLAGS := $(MK_HOST_CXXFLAGS) $(HOST_CXXFLAGS)
+override LDFLAGS       := $(MK_LDFLAGS) $(LDFLAGS)

-# identify CUDA host compiler
-ifdef LLAMA_CUBLAS
-GF_CC := $(NVCC) $(NVCCFLAGS) 2>/dev/null .c -Xcompiler
-include scripts/get-flags.mk
-CUDA_CXXFLAGS := $(GF_CXXFLAGS)
-endif
+# save CXXFLAGS before we add host-only options
+NVCCFLAGS := $(NVCCFLAGS) $(CXXFLAGS) $(CUDA_CXXFLAGS) -Wno-pedantic -Xcompiler "$(HOST_CXXFLAGS)"
+override CXXFLAGS += $(HOST_CXXFLAGS)

 #
 # Print build information
@@ -596,9 +581,6 @@ infill: examples/infill/infill.cpp                            ggml.o llama.o $(C
 simple: examples/simple/simple.cpp                            ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tokenize: examples/tokenize/tokenize.cpp                      ggml.o llama.o $(COMMON_DEPS) $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-
 batched: examples/batched/batched.cpp                         ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

@@ -623,7 +605,7 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
 server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2) -Wno-cast-qual

-gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
+gguf: examples/gguf/gguf.cpp ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

 train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
@@ -635,10 +617,7 @@ convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggm
 llama-bench: examples/llama-bench/llama-bench.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-libllava.a: examples/llava/llava.cpp examples/llava/llava.h examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h common/base64.hpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
-	$(CXX) $(CXXFLAGS) -static -fPIC -c $< -o $@ -Wno-cast-qual
-
-llava-cli: examples/llava/llava-cli.cpp examples/llava/clip.h examples/llava/clip.cpp examples/llava/llava.h examples/llava/llava.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+llava: examples/llava/llava.cpp examples/llava/llava-utils.h examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -Wno-cast-qual

 baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
@@ -650,7 +629,7 @@ beam-search: examples/beam-search/beam-search.cpp ggml.o llama.o $(COMMON_DEPS)
 finetune: examples/finetune/finetune.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-export-lora: examples/export-lora/export-lora.cpp ggml.o common/common.h $(OBJS)
+export-lora: examples/export-lora/export-lora.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

 speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
@@ -659,15 +638,6 @@ speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS)
 parallel: examples/parallel/parallel.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-
-lookup: examples/lookup/lookup.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-
-passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-
 ifdef LLAMA_METAL
 metal: examples/metal/metal.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
@@ -709,47 +679,41 @@ vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
 q8dot: pocs/vdot/q8dot.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)

-tests/test-llama-grammar: tests/test-llama-grammar.cpp ggml.o grammar-parser.o $(OBJS)
+tests/test-llama-grammar: tests/test-llama-grammar.cpp ggml.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-grammar-parser: tests/test-grammar-parser.cpp ggml.o llama.o grammar-parser.o $(OBJS)
+tests/test-grammar-parser: tests/test-grammar-parser.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-double-float: tests/test-double-float.cpp ggml.o $(OBJS)
+tests/test-double-float: tests/test-double-float.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-grad0: tests/test-grad0.cpp ggml.o $(OBJS)
+tests/test-grad0: tests/test-grad0.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-opt: tests/test-opt.cpp ggml.o $(OBJS)
+tests/test-opt: tests/test-opt.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-quantize-fns: tests/test-quantize-fns.cpp ggml.o $(OBJS)
+tests/test-quantize-fns: tests/test-quantize-fns.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-quantize-perf: tests/test-quantize-perf.cpp ggml.o $(OBJS)
+tests/test-quantize-perf: tests/test-quantize-perf.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-sampling: tests/test-sampling.cpp ggml.o llama.o $(OBJS)
+tests/test-sampling: tests/test-sampling.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
+tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
+tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
+tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-
-tests/test-rope: tests/test-rope.cpp ggml.o $(OBJS)
+tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

 tests/test-c.o: tests/test-c.c llama.h
 	$(CC) $(CFLAGS) -c $(filter-out %.h,$^) -o $@
-
-tests/test-backend-ops: tests/test-backend-ops.cpp ggml.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
--- a/Package.swift
+++ b/Package.swift
@@ -2,41 +2,59 @@

 import PackageDescription

+#if arch(arm) || arch(arm64)
+let platforms: [SupportedPlatform]? = [
+    .macOS(.v12),
+    .iOS(.v14),
+    .watchOS(.v4),
+    .tvOS(.v14)
+]
+let exclude: [String] = []
+let resources: [Resource] = [
+    .process("ggml-metal.metal")
+]
+let additionalSources: [String] = ["ggml-metal.m"]
+let additionalSettings: [CSetting] = [
+    .unsafeFlags(["-fno-objc-arc"]),
+    .define("GGML_USE_METAL")
+]
+#else
+let platforms: [SupportedPlatform]? = nil
+let exclude: [String] = ["ggml-metal.metal"]
+let resources: [Resource] = []
+let additionalSources: [String] = []
+let additionalSettings: [CSetting] = []
+#endif
+
 let package = Package(
    name: "llama",
-    platforms: [
-        .macOS(.v12),
-        .iOS(.v14),
-        .watchOS(.v4),
-        .tvOS(.v14)
-    ],
+    platforms: platforms,
    products: [
        .library(name: "llama", targets: ["llama"]),
    ],
-    dependencies: [
-        .package(url: "https://github.com/ggerganov/ggml.git", .branch("master"))
-    ],
    targets: [
        .target(
            name: "llama",
-            dependencies: ["ggml"],
            path: ".",
-            exclude: [],
+            exclude: exclude,
            sources: [
+                "ggml.c",
                "llama.cpp",
-            ],
+                "ggml-alloc.c",
+                "ggml-backend.c",
+                "ggml-quants.c",
+            ] + additionalSources,
+            resources: resources,
            publicHeadersPath: "spm-headers",
            cSettings: [
                .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
-                .define("GGML_USE_ACCELERATE"),
-                .unsafeFlags(["-fno-objc-arc"]),
-                .define("GGML_USE_METAL"),
+                .define("GGML_USE_ACCELERATE")
                // NOTE: NEW_LAPACK will required iOS version 16.4+
                // We should consider add this in the future when we drop support for iOS 14
                // (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
                // .define("ACCELERATE_NEW_LAPACK"),
                // .define("ACCELERATE_LAPACK_ILP64")
-            ],
+            ] + additionalSettings,
            linkerSettings: [
                .linkedFramework("Accelerate")
            ]
--- a/README.md
+++ b/README.md
@@ -10,11 +10,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++

 ### Hot topics

- Collecting Apple Silicon performance stats:
-  - M-series: https://github.com/ggerganov/llama.cpp/discussions/4167
-  - A-series: https://github.com/ggerganov/llama.cpp/discussions/4508
- Added Mixtral support: https://github.com/ggerganov/llama.cpp/pull/4406
- Looking for contributions to improve and maintain the `server` example: https://github.com/ggerganov/llama.cpp/issues/4216
+- ⚠️ **Upcoming change that might break functionality. Help with testing is needed:** https://github.com/ggerganov/llama.cpp/pull/3912

 ----

@@ -97,20 +93,6 @@ as the main playground for developing new features for the [ggml](https://github
 - [X] [Persimmon 8B](https://github.com/ggerganov/llama.cpp/pull/3410)
 - [X] [MPT](https://github.com/ggerganov/llama.cpp/pull/3417)
 - [X] [Bloom](https://github.com/ggerganov/llama.cpp/pull/3553)
- [x] [Yi models](https://huggingface.co/models?search=01-ai/Yi)
- [X] [StableLM-3b-4e1t](https://github.com/ggerganov/llama.cpp/pull/3586)
- [x] [Deepseek models](https://huggingface.co/models?search=deepseek-ai/deepseek)
- [x] [Qwen models](https://huggingface.co/models?search=Qwen/Qwen)
- [x] [Mixtral MoE](https://huggingface.co/models?search=mistral-ai/Mixtral)
- [x] [PLaMo-13B](https://github.com/ggerganov/llama.cpp/pull/3557)
- [x] [GPT-2](https://huggingface.co/gpt2)
-
-**Multimodal models:**
-
- [x] [Llava 1.5 models](https://huggingface.co/collections/liuhaotian/llava-15-653aac15d994e992e2677a7e)
- [x] [Bakllava](https://huggingface.co/models?search=SkunkworksAI/Bakllava)
- [x] [Obsidian](https://huggingface.co/NousResearch/Obsidian-3B-V0.5)
- [x] [ShareGPT4V](https://huggingface.co/models?search=Lin-Chen/ShareGPT4V)


 **Bindings:**
@@ -125,16 +107,12 @@ as the main playground for developing new features for the [ggml](https://github
 - Clojure: [phronmophobic/llama.clj](https://github.com/phronmophobic/llama.clj)
 - React Native: [mybigday/llama.rn](https://github.com/mybigday/llama.rn)
 - Java: [kherud/java-llama.cpp](https://github.com/kherud/java-llama.cpp)
- Zig: [deins/llama.cpp.zig](https://github.com/Deins/llama.cpp.zig)

 **UI:**

 - [nat/openplayground](https://github.com/nat/openplayground)
 - [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui)
 - [withcatai/catai](https://github.com/withcatai/catai)
- [semperai/amica](https://github.com/semperai/amica)
- [psugihara/FreeChat](https://github.com/psugihara/FreeChat)
- [ptsochantaris/emeltal](https://github.com/ptsochantaris/emeltal)

 ---

@@ -341,7 +319,7 @@ mpirun -hostfile hostfile -n 3 ./main -m ./models/7B/ggml-model-q4_0.gguf -n 128

 ### BLAS Build

-Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Support with CPU-only BLAS implementations doesn't affect the normal generation performance. We may see generation performance improvements with GPU-involved BLAS implementations, e.g. cuBLAS, hipBLAS and CLBlast. There are currently several different BLAS implementations available for build and use:
+Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). BLAS doesn't affect the normal generation performance. There are currently three different implementations of it:

 - #### Accelerate Framework:

@@ -385,37 +363,20 @@ Building the program with BLAS support may lead to some performance improvements

  Check [BLIS.md](docs/BLIS.md) for more information.

- #### Intel oneMKL
-  - Using manual oneAPI installation:
-    By default, `LLAMA_BLAS_VENDOR` is set to `Generic`, so if you already sourced intel environment script and assign `-DLLAMA_BLAS=ON` in cmake, the mkl version of Blas will automatically been selected. Otherwise please install oneAPI and follow the below steps:
-      ```bash
-      mkdir build
-      cd build
-      source /opt/intel/oneapi/setvars.sh # You can skip this step if  in oneapi-runtime docker image, only required for manual installation
-      cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_NATIVE=ON
-      cmake --build . --config Release
-      ```
+- #### Intel MKL

-  - Using oneAPI docker image:
-    If you do not want to source the environment vars and install oneAPI manually, you can also build the code using intel docker container: [oneAPI-runtime](https://hub.docker.com/r/intel/oneapi-runtime)
+  By default, `LLAMA_BLAS_VENDOR` is set to `Generic`, so if you already sourced intel environment script and assign `-DLLAMA_BLAS=ON` in cmake, the mkl version of Blas will automatically been selected. You may also specify it by:

-      ```bash
-      mkdir build
-      cd build
-      cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_NATIVE=ON
-      cmake --build . --config Release
-      ```
-
-  Building through oneAPI compilers will make avx_vnni instruction set available for intel processors that do not support avx512 and avx512_vnni.
-
-  Check [Optimizing and Running LLaMA2 on Intel® CPU](https://www.intel.com/content/www/us/en/content-details/791610/optimizing-and-running-llama2-on-intel-cpu.html) for more information.
+  ```bash
+  mkdir build
+  cd build
+  cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+  cmake --build . --config Release
+  ```

 - #### cuBLAS

  This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
-
-  For Jetson user, if you have Jetson Orin, you can try this: [Offical Support](https://www.jetson-ai-lab.com/tutorial_text-generation.html). If you are using an old model(nano/TX2), need some additional operations before compiling.
-
  - Using `make`:
    ```bash
    make LLAMA_CUBLAS=1
@@ -448,39 +409,22 @@ Building the program with BLAS support may lead to some performance improvements
  This provides BLAS acceleration on HIP-supported AMD GPUs.
  Make sure to have ROCm installed.
  You can download it from your Linux distro's package manager or from here: [ROCm Quick Start (Linux)](https://rocm.docs.amd.com/en/latest/deploy/linux/quick_start.html).
+  Windows support is coming soon...

  - Using `make`:
    ```bash
    make LLAMA_HIPBLAS=1
    ```
-  - Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU):
+  - Using `CMake`:
    ```bash
-    CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ \
-        cmake -H. -Bbuild -DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
-        && cmake --build build -- -j 16
-    ```
-    On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DLLAMA_HIP_UMA=ON"`.
-    However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).
-
-  - Using `make` (example for target gfx1030, build with 16 CPU threads):
-    ```bash
-    make -j16 LLAMA_HIPBLAS=1 LLAMA_HIP_UMA=1 AMDGPU_TARGETS=gxf1030
-    ```
-
-  - Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU):
-    ```bash
-    set PATH=%HIP_PATH%\bin;%PATH%
    mkdir build
    cd build
-    cmake -G Ninja -DAMDGPU_TARGETS=gfx1100 -DLLAMA_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ ..
+    CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ cmake .. -DLLAMA_HIPBLAS=ON
    cmake --build .
    ```
-    Make sure that `AMDGPU_TARGETS` is set to the GPU arch you want to compile for. The above example uses `gfx1100` that corresponds to Radeon RX 7900XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors)
-    Find your gpu version string by matching the most significant version information from `rocminfo | grep gfx | head -1 | awk '{print $2}'` with the list of processors, e.g. `gfx1035` maps to `gfx1030`.
-

  The environment variable [`HIP_VISIBLE_DEVICES`](https://rocm.docs.amd.com/en/latest/understand/gpu_isolation.html#hip-visible-devices) can be used to specify which GPU(s) will be used.
-  If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3.
+  If your GPU is not officialy supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 or 11.0.0 on RDNA3.
  The following compilation options are also available to tweak performance (yes, they refer to CUDA, not HIP, because it uses the same code as the cuBLAS version above):

  | Option                  | Legal values           | Default | Description |
@@ -938,7 +882,7 @@ Additionally, there the following images, similar to the above:
 - `ghcr.io/ggerganov/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
 - `ghcr.io/ggerganov/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)

-The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](.github/workflows/docker.yml). If you need different settings (for example, a different CUDA or ROCm library, you'll need to build the images locally for now).
+The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](.devops/) and the Gitlab Action defined in [.github/workflows/docker.yml](.github/workflows/docker.yml). If you need different settings (for example, a different CUDA or ROCm library, you'll need to build the images locally for now).

 #### Usage

@@ -1011,8 +955,6 @@ docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /
 - There are no strict rules for the code style, but try to follow the patterns in the code (indentation, spaces, etc.). Vertical alignment makes things more readable and easier to batch edit
 - Clean-up any trailing whitespaces, use 4 spaces for indentation, brackets on the same line, `void * ptr`, `int & a`
 - See [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions
- Tensors store data in row-major order. We refer to dimension 0 as columns, 1 as rows, 2 as matrices
- Matrix multiplication is unconventional: [`z = ggml_mul_mat(ctx, x, y)`](https://github.com/ggerganov/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means `zT = x @ yT`

 ### Docs

--- a/awq-py/README.md
+++ b/awq-py/README.md
@@ -1,116 +0,0 @@
-# AWQ: Activation-aware Weight Quantization for LLM - version apply to llamacpp
-[[Paper](https://arxiv.org/abs/2306.00978)][[Original Repo](https://github.com/mit-han-lab/llm-awq)][[Easy-to-use Repo](https://github.com/casper-hansen/AutoAWQ)]
-
-**Supported models:**
-
- [X] LLaMA
- [x] LLaMA 2
- [X] MPT
- [X] Mistral AI v0.1
- [ ] Bloom
- [ ] Mixtral MoE
-
-**TODO:**
- [x] Update version work with both MPT and MPT-AWQ model
- [ ] Add OPT model
- [ ] Add Bloom model
- [ ] Add Mixtral MoE
- [ ] Support w3, w2
-
-
-## Contents
-
- [Install](##Install)
- [Convert](##Convert)
- [Quantize](##Quantize)
- [Test](##Test)
- [Benchmark](##Benchmark)
- [Results](##Results)
-
-## Install
-Install requirements
-```bash
-pip install -r requirements.txt
-```
-Get the pre-computed AWQ search results for multiple model families, including LLaMA, LLaMA2, MPT, OPT
-```bash
-git clone https://huggingface.co/datasets/mit-han-lab/awq-model-zoo awq_cache
-```
-
-## Convert
-Example for llama model
-```bash
-# For llama7b and llama2 models
-python convert.py models/llama-7b/ --awq-path awq_cache/llama-7b-w4-g128.pt --outfile models/llama_7b_fp16.gguf
-# For mistral and mpt models
-python convert-hf-to-gguf.py models/mpt-7b/ --awq-path awq_cache/llama-7b-w4-g128.pt --outfile models/mpt_7b_fp16.gguf
-```
-
-## Quantize
-```bash
-# We only benchmark and confirm the results on q4_0, q4_1, and q2_k types.
-./quantize models/llama_7b_fp16.gguf models/llama_7b_q4_0.gguf q4_0
-```
-
-## Test
-```bash
-# For all models.
-./build/bin/main -m models/llama_7b_q4_0.gguf -n 128 --prompt "Once upon a time"
-```
-
-## Benchmark
-The perplexity measurements in table above are done against the `wikitext2` test dataset (https://paperswithcode.com/dataset/wikitext-2), with context length of 512.
-```bash
-# For llama and llama2, and mistral models.
-./perplexity -m models/llama_7b_q4_0.gguf -f datasets/wikitext-2-raw/wiki.test.raw
-```
-
-## Results
-Results are run on OpenBLAS (CPU) and CuBLAS (GPU) for fair comparison
-We use three types of llamacpp quantization methods to work with our version, including q4_0, q4_1, and q2_k
-
-### Llama 7B (Build with OpenBLAS)
-
-| Model      | Measure      | F16    | Q4_0   | Q4_1   | Q2_K   |
-|-----------:|--------------|-------:|-------:|-------:|-------:|
-|Llama 7B    | perplexity   | 5.9066 | 6.1214 | 6.0643 | 6.5808 |
-|Llama 7B    | file size    |  12.9G  |   3.5G |   3.9G |   2.7G |
-|Llama 7B    | bits/weight  |   16.0 |    4.5 |    5.0 |    2.6 |
-|AWQ-LLama 7B| perplexity   | 5.9175 | 6.0252 | 5.9987 | 6.3692 |
-|AWQ-LLama 7B| file size    |  12.9G  |   3.5G |   3.9G |   2.7G |
-|AWQ-LLama 7B| bits/weight  |   16.0 |    4.5 |    5.0 |    2.6 |
-
-
-### Llama2 7B (Build with CuBLAS)
-
-| Model       | Measure      | F16    | Q4_0   | Q4_1   | Q2_K   |
-|------------:|--------------|-------:|-------:|-------:|-------:|
-|Llama2 7B    | perplexity   | 5.8664 | 6.0260 | 6.0656 | 6.4496 |
-|Llama2 7B    | file size    |  12.9G  |   3.5G |   3.9G |   2.7G |
-|Llama2 7B    | bits/weight  |   16.0 |    4.5 |    5.0 |    2.6 |
-|AWQ-LLama2 7B| perplexity   | 5.8801 | 6.0054 | 5.9849 | 6.3650 |
-|AWQ-LLama2 7B| file size    |  12.9G  |   3.5G |   3.9G |   2.7G |
-|AWQ-LLama2 7B| bits/weight  |   16.0 |    4.5 |    5.0 |    2.6 |
-
-
-### Mistral 7B v0.1 (Build with CuBLAS)
-
-| Model        | Measure      | F16    | Q4_0   | Q4_1   | Q2_K   |
-|-------------:|--------------|-------:|-------:|-------:|-------:|
-|Mistral 7B    | perplexity   | 5.6931 | 5.8202 | 5.8268 | 6.1645 |
-|Mistral 7B    | file size     |  14.5G |   4.1G |   4.5G |   3.1G |
-|Mistral 7B    | bits/weight  |   16.0 |    4.5 |    5.0 |    2.6 |
-|AWQ-Mistral 7B| perplexity   | 5.6934 | 5.8020 | 5.7691 | 6.0426 |
-|AWQ-Mistral 7B| file size     |  14.5G |   4.1G |   4.5G |   3.1G |
-|AWQ-Mistral 7B| bits/weight  |   16.0 |    4.5 |    5.0 |    2.6 |
-
-### MPT 7B (Build with OpenBLAS)
-
-| Model    | Measure      | F16    | Q4_0   | Q4_1   | Q2_K    |
-|---------:|--------------|-------:|-------:|-------:|--------:|
-|MPT 7B    | perplexity   | 8.4369 | 8.7956 | 8.6265 | 11.4913 |
-|MPT 7B    | file size    |  13.7G  |   3.9G |   4.3G |   2.8G  |
-|MPT 7B    | bits/weight  |   16.0 |    4.5 |    5.0 |    2.6  |
-|AWQ-MPT 7B| perplexity   | 8.4944 | 8.7053 |  8.6750 | 10.2873|
-|AWQ-MPT 7B| file size    |  13.7G  |   3.9G |   4.3G |   2.8G  |
-|AWQ-MPT 7B| bits/weight  |   16.0 |    4.5 |    5.0 |    2.6  |
--- a/awq-py/awq/apply_awq.py
+++ b/awq-py/awq/apply_awq.py
@@ -1,254 +0,0 @@
-"""
-Implements the AWQ for llama.cpp use cases.
-Original paper: https://arxiv.org/abs/2306.00978
-
-This code is based on versions of the AWQ implementation found in the following repositories:
-* https://github.com/mit-han-lab/llm-awq
-* https://github.com/casper-hansen/AutoAWQ
-"""
-
-import os
-import torch
-import torch.nn as nn
-
-from transformers import AutoModelForCausalLM, AutoConfig
-from transformers.models.bloom.modeling_bloom import BloomGelu
-from transformers.models.llama.modeling_llama import LlamaRMSNorm
-from transformers.activations import GELUActivation
-
-
-class ScaledActivation(nn.Module):
-    """
-    ScaledActivation module wraps an existing activation function and applies a
-    scale factor to its output.
-
-    Args:
-        module (nn.Module): The activation function to be scaled.
-        scales (torch.Tensor): A tensor of size (num_features,) containing the initial
-            scale factors for each feature.
-
-    Returns:
-        torch.Tensor: The scaled output of the activation function.
-    """
-
-    def __init__(self, module, scales):
-        super().__init__()
-        self.act = module
-        self.scales = nn.Parameter(scales.data)
-
-    def forward(self, x):
-        return self.act(x) / self.scales.view(1, 1, -1).to(x.device)
-
-
-def set_op_by_name(layer, name, new_module):
-    """
-    Set the new module for given module's name.
-
-    Args:
-        layer (nn.Module): The layer in which to replace the submodule.
-        name (str): The path to the submodule to be replaced, using dot notation
-            to access nested modules.
-        new_module (nn.Module): The new module to replace the existing one.
-    """
-    levels = name.split(".")
-    if len(levels) > 1:
-        mod_ = layer
-        for l_idx in range(len(levels) - 1):
-            if levels[l_idx].isdigit():
-                mod_ = mod_[int(levels[l_idx])]
-            else:
-                mod_ = getattr(mod_, levels[l_idx])
-        setattr(mod_, levels[-1], new_module)
-    else:
-        setattr(layer, name, new_module)
-
-
-def get_op_by_name(module, op_name):
-    """
-    Retrieves a submodule within a given layer based on its name.
-
-    Args:
-        module (nn.Module): The layer containing the submodule to find.
-        op_name (str): The name of the submodule.
-
-    Returns:
-        nn.Module: The requested submodule found within the given layer.
-
-    Raises:
-        ValueError: If the specified submodule cannot be found within the layer.
-    """
-    for name, m in module.named_modules():
-        if name == op_name:
-            return m
-    raise ValueError(f"Cannot find op {op_name} in module {module}")
-
-
-@torch.no_grad()
-def scale_ln_fcs(ln, fcs, scales):
-    """
-    Scales the weights of a LayerNorm and a list of fully-connected layers proportionally.
-
-    Args:
-        ln (nn.LayerNorm): The LayerNorm module to be scaled.
-        fcs (List[nn.Linear]): A list of fully-connected layers to be scaled.
-        scales (torch.Tensor): A 1D tensor of size (num_features,).
-    """
-
-    if not isinstance(fcs, list):
-        fcs = [fcs]
-
-    scales = scales.to(ln.weight.device)
-
-    ln.weight.div_(scales)
-    if hasattr(ln, "bias") and ln.bias is not None:
-        ln.bias.div_(scales)
-
-    for fc in fcs:
-        fc.weight.mul_(scales.view(1, -1))
-
-    for p in ln.parameters():
-        assert torch.isnan(p).sum() == 0
-    for fc in fcs:
-        for p in fc.parameters():
-            assert torch.isnan(p).sum() == 0
-
-
-@torch.no_grad()
-def scale_fc_fc(fc1, fc2, scales):
-    """
-    Scales the weights of two fully-connected layers in a specific pattern.
-
-    Args:
-        fc1 (nn.Linear): The first fully-connected layer to be scaled.
-        fc2 (nn.Linear): The second fully-connected layer to be scaled.
-        scales (torch.Tensor): A 1D tensor of size (num_features,).
-    """
-    assert isinstance(fc1, nn.Linear)
-    assert isinstance(fc2, nn.Linear)
-
-    scales = scales.to(fc1.weight.device)
-
-    fc1.weight[-scales.size(0):].div_(scales.view(-1, 1))
-    if fc1.bias is not None:
-        fc1.bias.div_(scales.view(-1))
-
-    fc2.weight.mul_(scales.view(1, -1))
-
-    for p in fc1.parameters():
-        assert torch.isnan(p).sum() == 0
-    for p in fc2.parameters():
-        assert torch.isnan(p).sum() == 0
-
-
-@torch.no_grad()
-def scale_gelu_fc(gelu, fc, scales):
-    """
-    Scales the weight of a GELU activation and a fully-connected layer proportionally.
-
-    Args:
-        gelu (Union[nn.GELU, BloomGelu, GELUActivation]): The GELU activation module to be scaled.
-        fc (nn.Linear): The fully-connected layer to be scaled.
-        scales (torch.Tensor): A 1D tensor of size (num_features,).
-
-    Raises:
-        TypeError: If the `gelu` module is not of type `nn.GELU`, `BloomGelu`, or `GELUActivation`.
-        TypeError: If the `fc` module is not of type `nn.Linear`.
-    """
-    assert isinstance(gelu, (nn.GELU, BloomGelu, GELUActivation))
-    assert isinstance(fc, nn.Linear)
-
-    fc.weight.mul_(scales.view(1, -1).to(fc.weight.device))
-
-    for p in fc.parameters():
-        assert torch.isnan(p).sum() == 0
-
-
-def apply_scale(module, scales_list, input_feat_dict=None):
-    """
-    Applies different scaling strategies to layers based on their type and hierarchy within a given module.
-
-    Args:
-        module (nn.Module): The module containing the layers to be scaled.
-        scales_list (List[Tuple[str, List[str], torch.Tensor]]): A list of tuples containing:
-            * prev_op_name (str): The name of the preceding operation or module,
-                relative to which the layers to be scaled are located.
-            * layer_names (List[str]): A list of names of the layers to be scaled, relative to the preceding operation.
-            * scales (torch.Tensor): A 1D tensor of size (num_features,) containing the scaling factors for each feature.
-        input_feat_dict (Optional[Dict[str, torch.Tensor]]): A dictionary mapping layer names to their corresponding
-            input features (optional).
-    """
-    for prev_op_name, layer_names, scales in scales_list:
-        prev_op = get_op_by_name(module, prev_op_name)
-        layers = [get_op_by_name(module, name) for name in layer_names]
-
-        prev_op.cuda()
-        for layer in layers:
-            layer.cuda()
-        scales.cuda()
-
-        if isinstance(prev_op, nn.Linear):
-            assert len(layers) == 1
-            scale_fc_fc(prev_op, layers[0], scales)
-        elif isinstance(prev_op, (nn.LayerNorm, LlamaRMSNorm)) or "rmsnorm" in str(prev_op.__class__).lower():
-            scale_ln_fcs(prev_op, layers, scales)
-        elif isinstance(prev_op, (nn.GELU, BloomGelu, GELUActivation)):
-            new_module = ScaledActivation(prev_op, scales)
-            set_op_by_name(module, prev_op_name, new_module)
-            scale_gelu_fc(prev_op, layers[0], scales)
-        else:
-            raise NotImplementedError(f"prev_op {type(prev_op)} not supported yet!")
-
-        # apply the scaling to input feat if given; prepare it for clipping
-        if input_feat_dict is not None:
-            for layer_name in layer_names:
-                inp = input_feat_dict[layer_name]
-                inp.div_(scales.view(1, -1).to(inp.device))
-
-        prev_op.cpu()
-        for layer in layers:
-            layer.cpu()
-        scales.cpu()
-
-
-@torch.no_grad()
-def apply_clip(module, clip_list):
-    """
-    Applies element-wise clipping to the weight of a specific layer within a given module.
-
-    Args:
-        module (nn.Module): The module containing the layer to be clipped.
-        clip_list (List[Tuple[str, torch.Tensor]]): A list of tuples containing:
-            * name (str): The name of the layer to be clipped, relative to the root of the module.
-            * max_val (torch.Tensor): A 1D or 2D tensor defining the upper bound for each element of the layer's weight.
-    """
-    for name, max_val in clip_list:
-        layer = get_op_by_name(module, name)
-        layer.cuda()
-        max_val = max_val.to(layer.weight.device)
-        org_shape = layer.weight.shape
-        layer.weight.data = layer.weight.data.reshape(*max_val.shape[:2], -1)
-        layer.weight.data = torch.clamp(layer.weight.data, -max_val, max_val)
-        layer.weight.data = layer.weight.data.reshape(org_shape)
-        layer.cpu()
-
-
-def add_scale_weights(model_path, scale_path, tmp_path):
-    """
-    Adds pre-computed Activation Weight Quantization (AWQ) results to a model,
-    including scaling factors and clipping bounds.
-
-    Args:
-        model_path (str): Path to the pre-trained model to be equipped with AWQ.
-        scale_path (str): Path to the AWQ scale factors (.pt file).
-        tmp_path (str): Path to the temporary directory where the equipped model will be saved.
-    """
-    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
-    model = AutoModelForCausalLM.from_pretrained(
-        model_path, config=config, trust_remote_code=True
-    )
-    model.eval()
-    awq_results = torch.load(str(scale_path), map_location="cpu")
-    apply_scale(model, awq_results["scale"])
-    apply_clip(model, awq_results["clip"])
-    model.save_pretrained(str(tmp_path))
-    os.system(f"cp {str(model_path)}/tokenizer* {str(tmp_path)}")
--- a/awq-py/requirements.txt
+++ b/awq-py/requirements.txt
@@ -1,2 +0,0 @@
-torch>=2.1.1
-transformers>=4.32.0
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -30,12 +30,6 @@ sd=`dirname $0`
 cd $sd/../
 SRC=`pwd`

-CMAKE_EXTRA=""
-
-if [ ! -z ${GG_BUILD_METAL} ]; then
-    CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_METAL_SHADER_DEBUG=ON"
-fi
-
 ## helpers

 # download a file if it does not exist or if it is outdated
@@ -87,8 +81,8 @@ function gg_run_ctest_debug {

    set -e

-    (time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j                                          ) 2>&1 | tee -a $OUT/${ci}-make.log
+    (time cmake -DCMAKE_BUILD_TYPE=Debug ..     ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time make -j                               ) 2>&1 | tee -a $OUT/${ci}-make.log

    (time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log

@@ -115,8 +109,8 @@ function gg_run_ctest_release {

    set -e

-    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j                                            ) 2>&1 | tee -a $OUT/${ci}-make.log
+    (time cmake -DCMAKE_BUILD_TYPE=Release ..   ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time make -j                               ) 2>&1 | tee -a $OUT/${ci}-make.log

    if [ -z ${GG_BUILD_LOW_PERF} ]; then
        (time ctest --output-on-failure ) 2>&1 | tee -a $OUT/${ci}-ctest.log
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -11,12 +11,7 @@ if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/../.git")
    if(NOT IS_DIRECTORY "${GIT_DIR}")
        file(READ ${GIT_DIR} REAL_GIT_DIR_LINK)
        string(REGEX REPLACE "gitdir: (.*)\n$" "\\1" REAL_GIT_DIR ${REAL_GIT_DIR_LINK})
-        string(FIND "${REAL_GIT_DIR}" "/" SLASH_POS)
-        if (SLASH_POS EQUAL 0)
-            set(GIT_DIR "${REAL_GIT_DIR}")
-        else()
-            set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../${REAL_GIT_DIR}")
-        endif()
+        set(GIT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../${REAL_GIT_DIR}")
    endif()

    set(GIT_INDEX "${GIT_DIR}/index")
@@ -31,7 +26,7 @@ add_custom_command(
    COMMENT "Generating build details from Git"
    COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION}
            -DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME}
-            -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/../scripts/gen-build-info-cpp.cmake"
+            -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/../scripts/build-info.cmake"
    WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/.."
    DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in" ${GIT_INDEX}
    VERBATIM
@@ -46,7 +41,6 @@ endif()
 set(TARGET common)

 add_library(${TARGET} STATIC
-    base64.hpp
    common.h
    common.cpp
    sampling.h
@@ -65,4 +59,4 @@ endif()

 target_include_directories(${TARGET} PUBLIC .)
 target_compile_features(${TARGET} PUBLIC cxx_std_11)
-target_link_libraries(${TARGET} PRIVATE build_info PUBLIC llama)
+target_link_libraries(${TARGET} PRIVATE llama build_info)
--- a/common/base64.hpp
+++ b/common/base64.hpp
@@ -1,392 +0,0 @@
-/*
-This is free and unencumbered software released into the public domain.
-
-Anyone is free to copy, modify, publish, use, compile, sell, or
-distribute this software, either in source code form or as a compiled
-binary, for any purpose, commercial or non-commercial, and by any
-means.
-
-In jurisdictions that recognize copyright laws, the author or authors
-of this software dedicate any and all copyright interest in the
-software to the public domain. We make this dedication for the benefit
-of the public at large and to the detriment of our heirs and
-successors. We intend this dedication to be an overt act of
-relinquishment in perpetuity of all present and future rights to this
-software under copyright law.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
-OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
-ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-OTHER DEALINGS IN THE SOFTWARE.
-
-For more information, please refer to <http://unlicense.org>
-*/
-
-#ifndef PUBLIC_DOMAIN_BASE64_HPP_
-#define PUBLIC_DOMAIN_BASE64_HPP_
-
-#include <cstdint>
-#include <iterator>
-#include <stdexcept>
-#include <string>
-
-class base64_error : public std::runtime_error
-{
-public:
-    using std::runtime_error::runtime_error;
-};
-
-class base64
-{
-public:
-    enum class alphabet
-    {
-        /** the alphabet is detected automatically */
-        auto_,
-        /** the standard base64 alphabet is used */
-        standard,
-        /** like `standard` except that the characters `+` and `/` are replaced by `-` and `_` respectively*/
-        url_filename_safe
-    };
-
-    enum class decoding_behavior
-    {
-        /** if the input is not padded, the remaining bits are ignored */
-        moderate,
-        /** if a padding character is encounter decoding is finished */
-        loose
-    };
-
-    /**
-     Encodes all the elements from `in_begin` to `in_end` to `out`.
-
-     @warning The source and destination cannot overlap. The destination must be able to hold at least
-     `required_encode_size(std::distance(in_begin, in_end))`, otherwise the behavior depends on the output iterator.
-
-     @tparam Input_iterator the source; the returned elements are cast to `std::uint8_t` and should not be greater than
-     8 bits
-     @tparam Output_iterator the destination; the elements written to it are from the type `char`
-     @param in_begin the beginning of the source
-     @param in_end the ending of the source
-     @param out the destination iterator
-     @param alphabet which alphabet should be used
-     @returns the iterator to the next element past the last element copied
-     @throws see `Input_iterator` and `Output_iterator`
-    */
-    template<typename Input_iterator, typename Output_iterator>
-    static Output_iterator encode(Input_iterator in_begin, Input_iterator in_end, Output_iterator out,
-                                  alphabet alphabet = alphabet::standard)
-    {
-        constexpr auto pad = '=';
-        const char* alpha  = alphabet == alphabet::url_filename_safe
-                                ? "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"
-                                : "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
-
-        while (in_begin != in_end) {
-            std::uint8_t i0 = 0, i1 = 0, i2 = 0;
-
-            // first character
-            i0 = static_cast<std::uint8_t>(*in_begin);
-            ++in_begin;
-
-            *out = alpha[i0 >> 2 & 0x3f];
-            ++out;
-
-            // part of first character and second
-            if (in_begin != in_end) {
-                i1 = static_cast<std::uint8_t>(*in_begin);
-                ++in_begin;
-
-                *out = alpha[((i0 & 0x3) << 4) | (i1 >> 4 & 0x0f)];
-                ++out;
-            } else {
-                *out = alpha[(i0 & 0x3) << 4];
-                ++out;
-
-                // last padding
-                *out = pad;
-                ++out;
-
-                // last padding
-                *out = pad;
-                ++out;
-
-                break;
-            }
-
-            // part of second character and third
-            if (in_begin != in_end) {
-                i2 = static_cast<std::uint8_t>(*in_begin);
-                ++in_begin;
-
-                *out = alpha[((i1 & 0xf) << 2) | (i2 >> 6 & 0x03)];
-                ++out;
-            } else {
-                *out = alpha[(i1 & 0xf) << 2];
-                ++out;
-
-                // last padding
-                *out = pad;
-                ++out;
-
-                break;
-            }
-
-            // rest of third
-            *out = alpha[i2 & 0x3f];
-            ++out;
-        }
-
-        return out;
-    }
-    /**
-     Encodes a string.
-
-     @param str the string that should be encoded
-     @param alphabet which alphabet should be used
-     @returns the encoded base64 string
-     @throws see base64::encode()
-    */
-    static std::string encode(const std::string& str, alphabet alphabet = alphabet::standard)
-    {
-        std::string result;
-
-        result.reserve(required_encode_size(str.length()) + 1);
-
-        encode(str.begin(), str.end(), std::back_inserter(result), alphabet);
-
-        return result;
-    }
-    /**
-     Encodes a char array.
-
-     @param buffer the char array
-     @param size the size of the array
-     @param alphabet which alphabet should be used
-     @returns the encoded string
-    */
-    static std::string encode(const char* buffer, std::size_t size, alphabet alphabet = alphabet::standard)
-    {
-        std::string result;
-
-        result.reserve(required_encode_size(size) + 1);
-
-        encode(buffer, buffer + size, std::back_inserter(result), alphabet);
-
-        return result;
-    }
-    /**
-     Decodes all the elements from `in_begin` to `in_end` to `out`. `in_begin` may point to the same location as `out`,
-     in other words: inplace decoding is possible.
-
-     @warning The destination must be able to hold at least `required_decode_size(std::distance(in_begin, in_end))`,
-     otherwise the behavior depends on the output iterator.
-
-     @tparam Input_iterator the source; the returned elements are cast to `char`
-     @tparam Output_iterator the destination; the elements written to it are from the type `std::uint8_t`
-     @param in_begin the beginning of the source
-     @param in_end the ending of the source
-     @param out the destination iterator
-     @param alphabet which alphabet should be used
-     @param behavior the behavior when an error was detected
-     @returns the iterator to the next element past the last element copied
-     @throws base64_error depending on the set behavior
-     @throws see `Input_iterator` and `Output_iterator`
-    */
-    template<typename Input_iterator, typename Output_iterator>
-    static Output_iterator decode(Input_iterator in_begin, Input_iterator in_end, Output_iterator out,
-                                  alphabet alphabet          = alphabet::auto_,
-                                  decoding_behavior behavior = decoding_behavior::moderate)
-    {
-        //constexpr auto pad = '=';
-        std::uint8_t last  = 0;
-        auto bits          = 0;
-
-        while (in_begin != in_end) {
-            auto c = *in_begin;
-            ++in_begin;
-
-            if (c == '=') {
-                break;
-            }
-
-            auto part = _base64_value(alphabet, c);
-
-            // enough bits for one byte
-            if (bits + 6 >= 8) {
-                *out = (last << (8 - bits)) | (part >> (bits - 2));
-                ++out;
-
-                bits -= 2;
-            } else {
-                bits += 6;
-            }
-
-            last = part;
-        }
-
-        // check padding
-        if (behavior != decoding_behavior::loose) {
-            while (in_begin != in_end) {
-                auto c = *in_begin;
-                ++in_begin;
-
-                if (c != '=') {
-                    throw base64_error("invalid base64 character.");
-                }
-            }
-        }
-
-        return out;
-    }
-    /**
-     Decodes a string.
-
-     @param str the base64 encoded string
-     @param alphabet which alphabet should be used
-     @param behavior the behavior when an error was detected
-     @returns the decoded string
-     @throws see base64::decode()
-    */
-    static std::string decode(const std::string& str, alphabet alphabet = alphabet::auto_,
-                              decoding_behavior behavior = decoding_behavior::moderate)
-    {
-        std::string result;
-
-        result.reserve(max_decode_size(str.length()));
-
-        decode(str.begin(), str.end(), std::back_inserter(result), alphabet, behavior);
-
-        return result;
-    }
-    /**
-     Decodes a string.
-
-     @param buffer the base64 encoded buffer
-     @param size the size of the buffer
-     @param alphabet which alphabet should be used
-     @param behavior the behavior when an error was detected
-     @returns the decoded string
-     @throws see base64::decode()
-    */
-    static std::string decode(const char* buffer, std::size_t size, alphabet alphabet = alphabet::auto_,
-                              decoding_behavior behavior = decoding_behavior::moderate)
-    {
-        std::string result;
-
-        result.reserve(max_decode_size(size));
-
-        decode(buffer, buffer + size, std::back_inserter(result), alphabet, behavior);
-
-        return result;
-    }
-    /**
-     Decodes a string inplace.
-
-     @param[in,out] str the base64 encoded string
-     @param alphabet which alphabet should be used
-     @param behavior the behavior when an error was detected
-     @throws base64::decode_inplace()
-    */
-    static void decode_inplace(std::string& str, alphabet alphabet = alphabet::auto_,
-                               decoding_behavior behavior = decoding_behavior::moderate)
-    {
-        str.resize(decode(str.begin(), str.end(), str.begin(), alphabet, behavior) - str.begin());
-    }
-    /**
-     Decodes a char array inplace.
-
-     @param[in,out] str the string array
-     @param size the length of the array
-     @param alphabet which alphabet should be used
-     @param behavior the behavior when an error was detected
-     @returns the pointer to the next element past the last element decoded
-     @throws base64::decode_inplace()
-    */
-    static char* decode_inplace(char* str, std::size_t size, alphabet alphabet = alphabet::auto_,
-                                decoding_behavior behavior = decoding_behavior::moderate)
-    {
-        return decode(str, str + size, str, alphabet, behavior);
-    }
-    /**
-     Returns the required decoding size for a given size. The value is calculated with the following formula:
-
-     $$
-     \lceil \frac{size}{4} \rceil \cdot 3
-     $$
-
-     @param size the size of the encoded input
-     @returns the size of the resulting decoded buffer; this the absolute maximum
-    */
-    static std::size_t max_decode_size(std::size_t size) noexcept
-    {
-        return (size / 4 + (size % 4 ? 1 : 0)) * 3;
-    }
-    /**
-     Returns the required encoding size for a given size. The value is calculated with the following formula:
-
-     $$
-     \lceil \frac{size}{3} \rceil \cdot 4
-     $$
-
-     @param size the size of the decoded input
-     @returns the size of the resulting encoded buffer
-    */
-    static std::size_t required_encode_size(std::size_t size) noexcept
-    {
-        return (size / 3 + (size % 3 ? 1 : 0)) * 4;
-    }
-
-private:
-    static std::uint8_t _base64_value(alphabet& alphabet, char c)
-    {
-        if (c >= 'A' && c <= 'Z') {
-            return c - 'A';
-        } else if (c >= 'a' && c <= 'z') {
-            return c - 'a' + 26;
-        } else if (c >= '0' && c <= '9') {
-            return c - '0' + 52;
-        }
-
-        // comes down to alphabet
-        if (alphabet == alphabet::standard) {
-            if (c == '+') {
-                return 62;
-            } else if (c == '/') {
-                return 63;
-            }
-        } else if (alphabet == alphabet::url_filename_safe) {
-            if (c == '-') {
-                return 62;
-            } else if (c == '_') {
-                return 63;
-            }
-        } // auto detect
-        else {
-            if (c == '+') {
-                alphabet = alphabet::standard;
-
-                return 62;
-            } else if (c == '/') {
-                alphabet = alphabet::standard;
-
-                return 63;
-            } else if (c == '-') {
-                alphabet = alphabet::url_filename_safe;
-
-                return 62;
-            } else if (c == '_') {
-                alphabet = alphabet::url_filename_safe;
-
-                return 63;
-            }
-        }
-
-        throw base64_error("invalid base64 character.");
-    }
-};
-
-#endif // !PUBLIC_DOMAIN_BASE64_HPP_
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -12,7 +12,6 @@
 #include <regex>
 #include <sstream>
 #include <string>
-#include <unordered_map>
 #include <unordered_set>
 #include <vector>
 #include <cinttypes>
@@ -91,19 +90,6 @@ void process_escapes(std::string& input) {
                case '\'': input[output_idx++] = '\''; break;
                case '\"': input[output_idx++] = '\"'; break;
                case '\\': input[output_idx++] = '\\'; break;
-                case 'x':
-                    // Handle \x12, etc
-                    if (input_idx + 2 < input_len) {
-                        const char x[3] = { input[input_idx + 1], input[input_idx + 2], 0 };
-                        char *err_p = nullptr;
-                        const long val = std::strtol(x, &err_p, 16);
-                        if (err_p == x + 2) {
-                            input_idx += 2;
-                            input[output_idx++] = char(val);
-                            break;
-                        }
-                    }
-                    // fall through
                default:   input[output_idx++] = '\\';
                           input[output_idx++] = input[input_idx]; break;
            }
@@ -278,18 +264,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                break;
            }
            params.yarn_beta_slow = std::stof(argv[i]);
-        } else if (arg == "--samplers") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            sparams.samplers_sequence = parse_samplers_input(argv[i]);
-        } else if (arg == "--sampling-seq") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            sparams.samplers_sequence = argv[i];
+        } else if (arg == "--memory-f32") {
+            params.memory_f16 = false;
        } else if (arg == "--top-p") {
            if (++i >= argc) {
                invalid_param = true;
@@ -502,18 +478,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
            params.interactive_first = true;
        } else if (arg == "-ins" || arg == "--instruct") {
            params.instruct = true;
-        } else if (arg == "-cml" || arg == "--chatml") {
-            params.chatml = true;
        } else if (arg == "--infill") {
            params.infill = true;
-        } else if (arg == "-dkvc" || arg == "--dump-kv-cache") {
-            params.dump_kv_cache = true;
-        } else if (arg == "-nkvo" || arg == "--no-kv-offload") {
-            params.no_kv_offload = true;
-        } else if (arg == "-ctk" || arg == "--cache-type-k") {
-            params.cache_type_k = argv[++i];
-        } else if (arg == "-ctv" || arg == "--cache-type-v") {
-            params.cache_type_v = argv[++i];
        } else if (arg == "--multiline-input") {
            params.multiline_input = true;
        } else if (arg == "--simple-io") {
@@ -656,10 +622,6 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
        } else if (arg == "-h" || arg == "--help") {
            return false;

-        } else if (arg == "--version") {
-            fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
-            fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET);
-            exit(0);
        } else if (arg == "--random-prompt") {
            params.random_prompt = true;
        } else if (arg == "--in-prefix-bos") {
@@ -698,47 +660,6 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                std::istreambuf_iterator<char>(),
                std::back_inserter(sparams.grammar)
            );
-        } else if (arg == "--override-kv") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            char * sep = strchr(argv[i], '=');
-            if (sep == nullptr || sep - argv[i] >= 128) {
-                fprintf(stderr, "error: Malformed KV override: %s\n", argv[i]);
-                invalid_param = true;
-                break;
-            }
-            struct llama_model_kv_override kvo;
-            std::strncpy(kvo.key, argv[i], sep - argv[i]);
-            kvo.key[sep - argv[i]] = 0;
-            sep++;
-            if (strncmp(sep, "int:", 4) == 0) {
-                sep += 4;
-                kvo.tag = LLAMA_KV_OVERRIDE_INT;
-                kvo.int_value = std::atol(sep);
-            } else if (strncmp(sep, "float:", 6) == 0) {
-                sep += 6;
-                kvo.tag = LLAMA_KV_OVERRIDE_FLOAT;
-                kvo.float_value = std::atof(sep);
-            } else if (strncmp(sep, "bool:", 5) == 0) {
-                sep += 5;
-                kvo.tag = LLAMA_KV_OVERRIDE_BOOL;
-                if (std::strcmp(sep, "true") == 0) {
-                    kvo.bool_value = true;
-                } else if (std::strcmp(sep, "false") == 0) {
-                    kvo.bool_value = false;
-                } else {
-                    fprintf(stderr, "error: Invalid boolean value for KV override: %s\n", argv[i]);
-                    invalid_param = true;
-                    break;
-                }
-            } else {
-                fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
-                invalid_param = true;
-                break;
-            }
-            params.kv_overrides.push_back(kvo);
 #ifndef LOG_DISABLE_LOGS
        // Parse args for logging parameters
        } else if ( log_param_single_parse( argv[i] ) ) {
@@ -782,11 +703,6 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
        }
    }

-    if (!params.kv_overrides.empty()) {
-        params.kv_overrides.emplace_back(llama_model_kv_override());
-        params.kv_overrides.back().key[0] = 0;
-    }
-
    return true;
 }

@@ -798,11 +714,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("\n");
    printf("options:\n");
    printf("  -h, --help            show this help message and exit\n");
-    printf("      --version         show version and build info\n");
    printf("  -i, --interactive     run in interactive mode\n");
    printf("  --interactive-first   run in interactive mode and wait for input right away\n");
    printf("  -ins, --instruct      run in instruction mode (use with Alpaca models)\n");
-    printf("  -cml, --chatml        run in chatml mode (use with ChatML-compatible models)\n");
    printf("  --multiline-input     allows you to write or paste multiple lines without ending each in '\\'\n");
    printf("  -r PROMPT, --reverse-prompt PROMPT\n");
    printf("                        halt generation at PROMPT, return control in interactive mode\n");
@@ -828,8 +742,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("  -n N, --n-predict N   number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
    printf("  -c N, --ctx-size N    size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
    printf("  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
-    printf("  --samplers            samplers that will be used for generation in the order, separated by \';\', for example: \"top_k;tfs;typical;top_p;min_p;temp\"\n");
-    printf("  --sampling-seq        simplified sequence for samplers that will be used (default: %s)\n", sparams.samplers_sequence.c_str());
    printf("  --top-k N             top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k);
    printf("  --top-p N             top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p);
    printf("  --min-p N             min-p sampling (default: %.1f, 0.0 = disabled)\n", (double)sparams.min_p);
@@ -867,6 +779,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("  --yarn-beta-fast N    YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast);
    printf("  --ignore-eos          ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
    printf("  --no-penalize-nl      do not penalize newline token\n");
+    printf("  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
+    printf("                        not recommended: doubles context memory required and no measurable increase in quality\n");
    printf("  --temp N              temperature (default: %.1f)\n", (double)sparams.temp);
    printf("  --logits-all          return logits for all tokens in the batch (default: disabled)\n");
    printf("  --hellaswag           compute HellaSwag score over random tasks from datafile supplied with -f\n");
@@ -905,14 +819,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
 #endif // GGML_USE_CUBLAS
 #endif
    printf("  --verbose-prompt      print prompt before generation\n");
-    printf("  -dkvc, --dump-kv-cache\n");
-    printf("                        verbose print of the KV cache\n");
-    printf("  -nkvo, --no-kv-offload\n");
-    printf("                        disable KV offload\n");
-    printf("  -ctk TYPE, --cache-type-k TYPE\n");
-    printf("                        KV cache data type for K (default: %s)\n", params.cache_type_k.c_str());
-    printf("  -ctv TYPE, --cache-type-v TYPE\n");
-    printf("                        KV cache data type for V (default: %s)\n", params.cache_type_v.c_str());
    printf("  --simple-io           use basic IO for better compatibility in subprocesses and limited consoles\n");
    printf("  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");
    printf("  --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)\n");
@@ -920,12 +826,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("  -m FNAME, --model FNAME\n");
    printf("                        model path (default: %s)\n", params.model.c_str());
    printf("  -md FNAME, --model-draft FNAME\n");
-    printf("                        draft model for speculative decoding\n");
+    printf("                        draft model for speculative decoding (default: %s)\n", params.model.c_str());
    printf("  -ld LOGDIR, --logdir LOGDIR\n");
    printf("                        path under which to save YAML logs (no logging if unset)\n");
-    printf("  --override-kv KEY=TYPE:VALUE\n");
-    printf("                        advanced option to override model metadata by key. may be specified multiple times.\n");
-    printf("                        types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
    printf("\n");
 #ifndef LOG_DISABLE_LOGS
    log_print_usage();
@@ -962,48 +865,6 @@ std::string gpt_random_prompt(std::mt19937 & rng) {
    GGML_UNREACHABLE();
 }

-//
-// String parsing
-//
-
-std::string parse_samplers_input(std::string input) {
-    std::string output = "";
-    // since samplers names are written multiple ways
-    // make it ready for both system names and input names
-    std::unordered_map<std::string, char> samplers_symbols {
-        {"top_k",      'k'},
-        {"top-k",      'k'},
-        {"top_p",      'p'},
-        {"top-p",      'p'},
-        {"nucleus",    'p'},
-        {"typical_p",  'y'},
-        {"typical-p",  'y'},
-        {"typical",    'y'},
-        {"min_p",      'm'},
-        {"min-p",      'm'},
-        {"tfs_z",      'f'},
-        {"tfs-z",      'f'},
-        {"tfs",        'f'},
-        {"temp",       't'},
-        {"temperature",'t'}
-    };
-    // expected format example: "temp;top_k;tfs_z;typical_p;top_p;min_p"
-    size_t separator = input.find(';');
-    while (separator != input.npos) {
-        std::string name = input.substr(0,separator);
-        input = input.substr(separator+1);
-        separator = input.find(';');
-
-        if (samplers_symbols.find(name) != samplers_symbols.end()) {
-            output += samplers_symbols[name];
-        }
-    }
-    if (samplers_symbols.find(input) != samplers_symbols.end()) {
-        output += samplers_symbols[input];
-    }
-    return output;
-}
-
 //
 // Model utils
 //
@@ -1018,39 +879,10 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
    mparams.tensor_split    = params.tensor_split;
    mparams.use_mmap        = params.use_mmap;
    mparams.use_mlock       = params.use_mlock;
-    if (params.kv_overrides.empty()) {
-        mparams.kv_overrides = NULL;
-    } else {
-        GGML_ASSERT(params.kv_overrides.back().key[0] == 0 && "KV overrides not terminated with empty key");
-        mparams.kv_overrides = params.kv_overrides.data();
-    }

    return mparams;
 }

-static ggml_type kv_cache_type_from_str(const std::string & s) {
-    if (s == "f16") {
-        return GGML_TYPE_F16;
-    }
-    if (s == "q8_0") {
-        return GGML_TYPE_Q8_0;
-    }
-    if (s == "q4_0") {
-        return GGML_TYPE_Q4_0;
-    }
-    if (s == "q4_1") {
-        return GGML_TYPE_Q4_1;
-    }
-    if (s == "q5_0") {
-        return GGML_TYPE_Q5_0;
-    }
-    if (s == "q5_1") {
-        return GGML_TYPE_Q5_1;
-    }
-
-    throw std::runtime_error("Invalid cache type: " + s);
-}
-
 struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
    auto cparams = llama_context_default_params();

@@ -1060,6 +892,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
    cparams.n_threads_batch   = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
    cparams.mul_mat_q         = params.mul_mat_q;
    cparams.seed              = params.seed;
+    cparams.f16_kv            = params.memory_f16;
    cparams.logits_all        = params.logits_all;
    cparams.embedding         = params.embedding;
    cparams.rope_scaling_type = params.rope_scaling_type;
@@ -1070,10 +903,6 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
    cparams.yarn_beta_fast    = params.yarn_beta_fast;
    cparams.yarn_beta_slow    = params.yarn_beta_slow;
    cparams.yarn_orig_ctx     = params.yarn_orig_ctx;
-    cparams.offload_kqv       = !params.no_kv_offload;
-
-    cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
-    cparams.type_v = kv_cache_type_from_str(params.cache_type_v);

    return cparams;
 }
@@ -1089,7 +918,7 @@ void llama_batch_add(
    const std::vector<llama_seq_id> & seq_ids,
                               bool   logits) {
    batch.token   [batch.n_tokens] = id;
-    batch.pos     [batch.n_tokens] = pos;
+    batch.pos     [batch.n_tokens] = pos,
    batch.n_seq_id[batch.n_tokens] = seq_ids.size();
    for (size_t i = 0; i < seq_ids.size(); ++i) {
        batch.seq_id[batch.n_tokens][i] = seq_ids[i];
@@ -1230,12 +1059,6 @@ std::string llama_detokenize_bpe(llama_context * ctx, const std::vector<llama_to
    return result;
 }

-bool llama_should_add_bos_token(const llama_model * model) {
-    const int add_bos = llama_add_bos_token(model);
-
-    return add_bos != -1 ? bool(add_bos) : (llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM);
-}
-
 //
 // YAML utils
 //
@@ -1352,7 +1175,6 @@ void dump_string_yaml_multiline(FILE * stream, const char * prop_name, const cha
    if (!data_str.empty() && (std::isspace(data_str[0]) || std::isspace(data_str.back()))) {
        data_str = std::regex_replace(data_str, std::regex("\n"), "\\n");
        data_str = std::regex_replace(data_str, std::regex("\""), "\\\"");
-        data_str = std::regex_replace(data_str, std::regex(R"(\\[^n"])"), R"(\$&)");
        data_str = "\"" + data_str + "\"";
        fprintf(stream, "%s: %s\n", prop_name, data_str.c_str());
        return;
@@ -1394,7 +1216,6 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
    fprintf(stream, "build_number: %d\n",        LLAMA_BUILD_NUMBER);
    fprintf(stream, "cpu_has_arm_fma: %s\n",     ggml_cpu_has_arm_fma()     ? "true" : "false");
    fprintf(stream, "cpu_has_avx: %s\n",         ggml_cpu_has_avx()         ? "true" : "false");
-    fprintf(stream, "cpu_has_avx_vnni: %s\n",    ggml_cpu_has_avx_vnni()    ? "true" : "false");
    fprintf(stream, "cpu_has_avx2: %s\n",        ggml_cpu_has_avx2()        ? "true" : "false");
    fprintf(stream, "cpu_has_avx512: %s\n",      ggml_cpu_has_avx512()      ? "true" : "false");
    fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
@@ -1487,6 +1308,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
    }
    fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
    fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
+    fprintf(stream, "memory_f32: %s # default: false\n", !params.memory_f16 ? "true" : "false");
    fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
    fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);
    fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta);
@@ -1541,77 +1363,3 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
    fprintf(stream, "typical_p: %f # default: 1.0\n", sparams.typical_p);
    fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
 }
-
-//
-// KV cache utils
-//
-
-void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size) {
-    static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+";
-
-    printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d",
-        view.n_cells, view.n_max_seq, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
-
-    llama_kv_cache_view_cell * c_curr = view.cells;
-    llama_seq_id * cs_curr = view.cells_sequences;
-
-    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_max_seq) {
-        if (i % row_size == 0) {
-            printf("\n%5d: ", i);
-        }
-        int seq_count = 0;
-        for (int j = 0; j < view.n_max_seq; j++) {
-            if (cs_curr[j] >= 0) { seq_count++; }
-        }
-        putchar(slot_chars[std::min(sizeof(slot_chars) - 2, size_t(seq_count))]);
-    }
-
-    printf("\n=== Done dumping\n");
-}
-
-void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) {
-    static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
-
-    printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n",
-        view.n_cells, view.n_max_seq, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
-
-    std::unordered_map<llama_seq_id, size_t> seqs;
-    llama_kv_cache_view_cell * c_curr = view.cells;
-    llama_seq_id * cs_curr = view.cells_sequences;
-
-    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_max_seq) {
-        for (int j = 0; j < view.n_max_seq; j++) {
-            if (cs_curr[j] < 0) { continue; }
-            if (seqs.find(cs_curr[j]) == seqs.end()) {
-                if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
-                seqs[cs_curr[j]] = seqs.size();
-            }
-        }
-        if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
-    }
-
-    printf("=== Sequence legend: ");
-    for (const auto & it : seqs) {
-        printf("%zu=%d, ", it.second, it.first);
-    }
-    printf("'+'=other sequence ids");
-
-    c_curr = view.cells;
-    cs_curr = view.cells_sequences;
-    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_max_seq) {
-        if (i % row_size == 0) {
-            printf("\n%5d: ", i);
-        }
-        for (int j = 0; j < view.n_max_seq; j++) {
-            if (cs_curr[j] >= 0) {
-                const auto & it = seqs.find(cs_curr[j]);
-                putchar(it != seqs.end() ? int(slot_chars[it->second]) : '+');
-            } else {
-                putchar('.');
-            }
-        }
-        putchar(' ');
-    }
-
-    printf("\n=== Done dumping\n");
-}
--- a/common/common.h
+++ b/common/common.h
@@ -51,7 +51,7 @@ struct gpt_params {
    int32_t n_ctx                           = 512;   // context size
    int32_t n_batch                         = 512;   // batch size for prompt processing (must be >=32 to use BLAS)
    int32_t n_keep                          = 0;     // number of tokens to keep from initial prompt
-    int32_t n_draft                         = 8;     // number of tokens to draft during speculative decoding
+    int32_t n_draft                         = 16;    // number of tokens to draft during speculative decoding
    int32_t n_chunks                        = -1;    // max number of chunks to process (-1 = unlimited)
    int32_t n_parallel                      = 1;     // number of parallel sequences to decode
    int32_t n_sequences                     = 1;     // number of sequences to decode
@@ -86,8 +86,6 @@ struct gpt_params {
    std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
    std::string logdir            = "";  // directory in which to save YAML log files

-    std::vector<llama_model_kv_override> kv_overrides;
-
    // TODO: avoid tuple, use struct
    std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
    std::string lora_base  = "";                              // base model path for the lora adapter
@@ -100,10 +98,10 @@ struct gpt_params {
    size_t hellaswag_tasks = 400;   // number of tasks to use when computing the HellaSwag score

    bool mul_mat_q         = true;  // if true, use mul_mat_q kernels instead of cuBLAS
+    bool memory_f16        = true;  // use f16 instead of f32 for memory kv
    bool random_prompt     = false; // do not randomize prompt if none provided
    bool use_color         = false; // use color to distinguish generations and inputs
    bool interactive       = false; // interactive mode
-    bool chatml            = false; // chatml mode (used for models trained on chatml syntax)
    bool prompt_cache_all  = false; // save user input and generations to prompt cache
    bool prompt_cache_ro   = false; // open the prompt cache read-only and do not update it

@@ -123,15 +121,10 @@ struct gpt_params {
    bool numa              = false; // attempt optimizations that help on some NUMA systems
    bool verbose_prompt    = false; // print prompt tokens before generation
    bool infill            = false; // use infill mode
-    bool dump_kv_cache     = false; // dump the KV cache contents for debugging purposes
-    bool no_kv_offload     = false; // disable KV offloading
-
-    std::string cache_type_k = "f16"; // KV cache data type for the K
-    std::string cache_type_v = "f16"; // KV cache data type for the V

    // multimodal models (see examples/llava)
    std::string mmproj = ""; // path to multimodal projector
-    std::string image  = ""; // path to an image file
+    std::string image = ""; // path to an image file
 };

 bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params);
@@ -146,12 +139,6 @@ std::string gpt_random_prompt(std::mt19937 & rng);

 void process_escapes(std::string& input);

-//
-// String parsing
-//
-
-std::string parse_samplers_input(std::string input);
-
 //
 // Model utils
 //
@@ -213,10 +200,6 @@ std::string llama_detokenize_bpe(
                         llama_context * ctx,
        const std::vector<llama_token> & tokens);

-// Uses the value from the model metadata if possible, otherwise
-// defaults to true when model type is SPM, otherwise false.
-bool llama_should_add_bos_token(const llama_model * model);
-
 //
 // YAML utils
 //
@@ -230,14 +213,3 @@ std::string get_sortable_timestamp();
 void dump_non_result_info_yaml(
    FILE * stream, const gpt_params & params, const llama_context * lctx,
    const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
-
-//
-// KV cache utils
-//
-
-// Dump the KV cache view with the number of sequences per cell.
-void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size = 80);
-
-// Dump the KV cache view showing individual sequences in each cell (long output).
-void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
-
--- a/common/grammar-parser.cpp
+++ b/common/grammar-parser.cpp
@@ -190,7 +190,7 @@ namespace grammar_parser {
                pos = parse_space(pos + 1, is_nested);
            } else if (*pos == '*' || *pos == '+' || *pos == '?') { // repetition operator
                if (last_sym_start == out_elements.size()) {
-                    throw std::runtime_error(std::string("expecting preceding item to */+/? at ") + pos);
+                    throw std::runtime_error(std::string("expecting preceeding item to */+/? at ") + pos);
                }

                // apply transformation to previous symbol (last_sym_start to end) according to
--- a/common/log.h
+++ b/common/log.h
@@ -61,13 +61,13 @@
 //  #define LOG_TARGET stderr
 //  #include "log.h"
 //
-//  The log target can also be redirected to a different function
+//  The log target can also be redirected to a diffrent function
 //  like so:
 //
-//  #define LOG_TARGET log_handler_different()
+//  #define LOG_TARGET log_handler_diffrent()
 //  #include "log.h"
 //
-//  FILE* log_handler_different()
+//  FILE* log_handler_diffrent()
 //  {
 //      return stderr;
 //  }
@@ -421,7 +421,7 @@ inline FILE *log_handler2_impl(bool change = false, LogTriState append = LogTriS

 // Disables logs entirely at runtime.
 //  Makes LOG() and LOG_TEE() produce no output,
-//  until enabled back.
+//  untill enabled back.
 #define log_disable() log_disable_impl()

 // INTERNAL, DO NOT USE
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -99,67 +99,21 @@ std::string llama_sampling_print(const llama_sampling_params & params) {
    return std::string(result);
 }

-std::string llama_sampling_order_print(const llama_sampling_params & params) {
-    std::string result = "CFG -> Penalties ";
-    if (params.mirostat == 0) {
-        for (auto s : params.samplers_sequence) {
-            switch (s) {
-                case 'k': result += "-> top_k "; break;
-                case 'f': result += "-> tfs_z "; break;
-                case 'y': result += "-> typical_p "; break;
-                case 'p': result += "-> top_p "; break;
-                case 'm': result += "-> min_p "; break;
-                case 't': result += "-> temp "; break;
-                default : break;
-            }
-        }
-    } else {
-        result += "-> mirostat ";
-    }
-
-    return result;
-}
-
-// no reasons to expose this function in header
-static void sampler_queue(
-                   struct llama_context * ctx_main,
-            const llama_sampling_params & params,
-                 llama_token_data_array & cur_p,
-                                 size_t & min_keep) {
-    const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
-
-    const float         temp              = params.temp;
-    const int32_t       top_k             = params.top_k <= 0 ? n_vocab : params.top_k;
-    const float         top_p             = params.top_p;
-    const float         min_p             = params.min_p;
-    const float         tfs_z             = params.tfs_z;
-    const float         typical_p         = params.typical_p;
-    const std::string & samplers_sequence = params.samplers_sequence;
-
-    for (auto s : samplers_sequence) {
-        switch (s){
-            case 'k': llama_sample_top_k    (ctx_main, &cur_p, top_k,     min_keep); break;
-            case 'f': llama_sample_tail_free(ctx_main, &cur_p, tfs_z,     min_keep); break;
-            case 'y': llama_sample_typical  (ctx_main, &cur_p, typical_p, min_keep); break;
-            case 'p': llama_sample_top_p    (ctx_main, &cur_p, top_p,     min_keep); break;
-            case 'm': llama_sample_min_p    (ctx_main, &cur_p, min_p,     min_keep); break;
-            case 't': llama_sample_temp     (ctx_main, &cur_p, temp); break;
-            default : break;
-        }
-    }
-}
-
-static llama_token llama_sampling_sample_impl(
+llama_token llama_sampling_sample(
                  struct llama_sampling_context * ctx_sampling,
                  struct llama_context * ctx_main,
                  struct llama_context * ctx_cfg,
-                  const int idx,
-                  bool is_resampling) {  // Add a parameter to indicate if we are resampling
+                  const int idx) {
    const llama_sampling_params & params = ctx_sampling->params;

    const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));

    const float   temp            = params.temp;
+    const int32_t top_k           = params.top_k <= 0 ? n_vocab : params.top_k;
+    const float   top_p           = params.top_p;
+    const float   min_p           = params.min_p;
+    const float   tfs_z           = params.tfs_z;
+    const float   typical_p       = params.typical_p;
    const int32_t penalty_last_n  = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n;
    const float   penalty_repeat  = params.penalty_repeat;
    const float   penalty_freq    = params.penalty_freq;
@@ -174,17 +128,8 @@ static llama_token llama_sampling_sample_impl(

    llama_token id = 0;

-    // Get a pointer to the logits
    float * logits = llama_get_logits_ith(ctx_main, idx);

-    // Declare original_logits at the beginning of the function scope
-    std::vector<float> original_logits;
-
-    if (!is_resampling) {
-        // Only make a copy of the original logits if we are not in the resampling phase, not sure if I actually have to do this.
-        original_logits = std::vector<float>(logits, logits + llama_n_vocab(llama_get_model(ctx_main)));
-    }
-
    // apply params.logit_bias map
    for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
        logits[it->first] += it->second;
@@ -203,14 +148,12 @@ static llama_token llama_sampling_sample_impl(
    }

    // apply penalties
-    const auto& penalty_tokens = params.use_penalty_prompt_tokens ? params.penalty_prompt_tokens : prev;
-    const int penalty_tokens_used_size = std::min((int)penalty_tokens.size(), penalty_last_n);
-    if (penalty_tokens_used_size) {
+    if (!prev.empty()) {
        const float nl_logit = logits[llama_token_nl(llama_get_model(ctx_main))];

        llama_sample_repetition_penalties(ctx_main, &cur_p,
-                penalty_tokens.data() + penalty_tokens.size() - penalty_tokens_used_size,
-                penalty_tokens_used_size, penalty_repeat, penalty_freq, penalty_present);
+                prev.data() + prev.size() - penalty_last_n,
+                penalty_last_n, penalty_repeat, penalty_freq, penalty_present);

        if (!penalize_nl) {
            for (size_t idx = 0; idx < cur_p.size; idx++) {
@@ -222,8 +165,7 @@ static llama_token llama_sampling_sample_impl(
        }
    }

-    // If we are in the resampling phase, apply grammar checks before sampling logic
-    if (is_resampling && ctx_sampling->grammar != NULL) {
+    if (ctx_sampling->grammar != NULL) {
        llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar);
    }

@@ -246,7 +188,12 @@ static llama_token llama_sampling_sample_impl(
            // temperature sampling
            size_t min_keep = std::max(1, params.n_probs);

-            sampler_queue(ctx_main, params, cur_p, min_keep);
+            llama_sample_top_k    (ctx_main, &cur_p, top_k,     min_keep);
+            llama_sample_tail_free(ctx_main, &cur_p, tfs_z,     min_keep);
+            llama_sample_typical  (ctx_main, &cur_p, typical_p, min_keep);
+            llama_sample_top_p    (ctx_main, &cur_p, top_p,     min_keep);
+            llama_sample_min_p    (ctx_main, &cur_p, min_p,     min_keep);
+            llama_sample_temp     (ctx_main, &cur_p, temp);

            id = llama_sample_token(ctx_main, &cur_p);

@@ -265,40 +212,9 @@ static llama_token llama_sampling_sample_impl(
        }
    }

-    if (ctx_sampling->grammar != NULL && !is_resampling) {
-        // Create an array with a single token data element for the sampled id
-        llama_token_data single_token_data = {id, logits[id], 0.0f};
-        llama_token_data_array single_token_data_array = { &single_token_data, 1, false };
-
-        // Apply grammar constraints to the single token
-        llama_sample_grammar(ctx_main, &single_token_data_array, ctx_sampling->grammar);
-
-        // Check if the token is valid according to the grammar by seeing if its logit has been set to -INFINITY
-        bool is_valid = single_token_data_array.data[0].logit != -INFINITY;
-
-        // If the token is not valid according to the grammar, perform resampling
-        if (!is_valid) {
-            LOG("Resampling because token %d: '%s' does not meet grammar rules\n", id, llama_token_to_piece(ctx_main, id).c_str());
-
-            // Restore logits from the copy
-            std::copy(original_logits.begin(), original_logits.end(), logits);
-
-            return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, true);  // Pass true for is_resampling
-        }
-    }
-
    return id;
 }

-llama_token llama_sampling_sample(
-                  struct llama_sampling_context * ctx_sampling,
-                  struct llama_context * ctx_main,
-                  struct llama_context * ctx_cfg,
-                  const int idx) {
-    // Call the implementation function with is_resampling set to false by default
-    return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, false);
-}
-
 void llama_sampling_accept(
        struct llama_sampling_context * ctx_sampling,
        struct llama_context * ctx_main,
--- a/common/sampling.h
+++ b/common/sampling.h
@@ -10,23 +10,22 @@

 // sampling parameters
 typedef struct llama_sampling_params {
-    int32_t     n_prev                = 64;       // number of previous tokens to remember
-    int32_t     n_probs               = 0;        // if greater than 0, output the probabilities of top n_probs tokens.
-    int32_t     top_k                 = 40;       // <= 0 to use vocab size
-    float       top_p                 = 0.95f;    // 1.0 = disabled
-    float       min_p                 = 0.05f;    // 0.0 = disabled
-    float       tfs_z                 = 1.00f;    // 1.0 = disabled
-    float       typical_p             = 1.00f;    // 1.0 = disabled
-    float       temp                  = 0.80f;    // 1.0 = disabled
-    int32_t     penalty_last_n        = 64;       // last n tokens to penalize (0 = disable penalty, -1 = context size)
-    float       penalty_repeat        = 1.10f;    // 1.0 = disabled
-    float       penalty_freq          = 0.00f;    // 0.0 = disabled
-    float       penalty_present       = 0.00f;    // 0.0 = disabled
-    int32_t     mirostat              = 0;        // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
-    float       mirostat_tau          = 5.00f;    // target entropy
-    float       mirostat_eta          = 0.10f;    // learning rate
-    bool        penalize_nl           = true;     // consider newlines as a repeatable token
-    std::string samplers_sequence     = "kfypmt"; // top_k, tail_free, typical_p, top_p, min_p, temp
+    int32_t n_prev            = 64;    // number of previous tokens to remember
+    int32_t n_probs           = 0;     // if greater than 0, output the probabilities of top n_probs tokens.
+    int32_t top_k             = 40;    // <= 0 to use vocab size
+    float   top_p             = 0.95f; // 1.0 = disabled
+    float   min_p             = 0.05f; // 0.0 = disabled
+    float   tfs_z             = 1.00f; // 1.0 = disabled
+    float   typical_p         = 1.00f; // 1.0 = disabled
+    float   temp              = 0.80f; // 1.0 = disabled
+    int32_t penalty_last_n    = 64;    // last n tokens to penalize (0 = disable penalty, -1 = context size)
+    float   penalty_repeat    = 1.10f; // 1.0 = disabled
+    float   penalty_freq      = 0.00f; // 0.0 = disabled
+    float   penalty_present   = 0.00f; // 0.0 = disabled
+    int32_t mirostat          = 0;     // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
+    float   mirostat_tau      = 5.00f; // target entropy
+    float   mirostat_eta      = 0.10f; // learning rate
+    bool    penalize_nl       = true;  // consider newlines as a repeatable token

    std::string grammar;  // optional BNF-like grammar to constrain sampling

@@ -36,9 +35,6 @@ typedef struct llama_sampling_params {
    float       cfg_scale     = 1.f; // how strong is guidance

    std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
-
-    std::vector<llama_token> penalty_prompt_tokens;
-    bool                     use_penalty_prompt_tokens = false;
 } llama_sampling_params;

 // general sampler context
@@ -84,9 +80,6 @@ std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama
 // Print sampling parameters into a string
 std::string llama_sampling_print(const llama_sampling_params & params);

-// Print sampling order into a string
-std::string llama_sampling_order_print(const llama_sampling_params & params);
-
 // this is a common sampling function used across the examples for convenience
 // it can serve as a starting point for implementing your own sampling function
 // Note: When using multiple sequences, it is the caller's responsibility to call
--- a/common/train.cpp
+++ b/common/train.cpp
@@ -32,7 +32,6 @@ struct train_state  * init_train_state() {
    state->opt = new struct ggml_opt_context;
    state->opt->ctx = NULL;
    state->opt->params = ggml_opt_default_params(GGML_OPT_ADAM);
-    state->opt->params.graph_size = LLAMA_TRAIN_MAX_NODES;
    state->opt->loss_after = 0.0f;

    return state;
@@ -71,7 +70,7 @@ void free_random_uniform_distribution(struct random_uniform_distribution * rnd)

 struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct random_normal_distribution * rnd) {
    float scale = 1.0f; // xavier
-    switch (ggml_n_dims(tensor)) {
+    switch (tensor->n_dims) {
        case 1:
            scale /= sqrtf((float) tensor->ne[0]);
            for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
@@ -119,7 +118,7 @@ struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct
 }

 struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struct random_uniform_distribution * rnd) {
-    switch (ggml_n_dims(tensor)) {
+    switch (tensor->n_dims) {
        case 1:
            for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
                float * dst = (float *) ((char *) tensor->data + i0*tensor->nb[0]);
@@ -183,27 +182,25 @@ float fclamp(const float v, const float min, const float max) {
 }

 void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) {
+    GGML_ASSERT(tensor->n_dims == 1);
    GGML_ASSERT(tensor->ne[0] == ne0);
-    GGML_ASSERT(tensor->ne[1] == 1);
-    GGML_ASSERT(tensor->ne[2] == 1);
-    GGML_ASSERT(tensor->ne[3] == 1);
 }

 void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) {
+    GGML_ASSERT(tensor->n_dims == 2);
    GGML_ASSERT(tensor->ne[0] == ne0);
    GGML_ASSERT(tensor->ne[1] == ne1);
-    GGML_ASSERT(tensor->ne[2] == 1);
-    GGML_ASSERT(tensor->ne[3] == 1);
 }

 void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) {
+    GGML_ASSERT(tensor->n_dims == 3);
    GGML_ASSERT(tensor->ne[0] == ne0);
    GGML_ASSERT(tensor->ne[1] == ne1);
    GGML_ASSERT(tensor->ne[2] == ne2);
-    GGML_ASSERT(tensor->ne[3] == 1);
 }

 void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
+    GGML_ASSERT(tensor->n_dims == 4);
    GGML_ASSERT(tensor->ne[0] == ne0);
    GGML_ASSERT(tensor->ne[1] == ne1);
    GGML_ASSERT(tensor->ne[2] == ne2);
@@ -227,8 +224,8 @@ int64_t get_example_targets_batch(
    bool                   sample_random_offsets
 ) {
    GGML_ASSERT(samples_count > 0);
-    GGML_ASSERT(ggml_is_matrix(tokens_input));
-    GGML_ASSERT(ggml_is_3d(target_probs));
+    GGML_ASSERT(tokens_input->n_dims  == 2);
+    GGML_ASSERT(target_probs->n_dims  == 3);
    int64_t n_vocab  = target_probs->ne[0];
    int64_t n_tokens = tokens_input->ne[0];
    int64_t n_batch  = tokens_input->ne[1];
@@ -1107,7 +1104,7 @@ void print_common_train_usage(int /*argc*/, char ** /*argv*/, const struct train
    fprintf(stderr, "  --sample-start STR         Sets the starting point for samples after the specified pattern. If empty use every token position as sample start. (default '%s')\n", params->sample_start.c_str());
    fprintf(stderr, "  --include-sample-start     Include the sample start in the samples. (default off)\n");
    fprintf(stderr, "  --escape                   process sample start escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n");
-    fprintf(stderr, "  --overlapping-samples      Samples may overlap, will include sample-start of second and following samples. When off, samples will end at begin of next sample. (default off)\n");
+    fprintf(stderr, "  --overlapping-samples      Samples my overlap, will include sample-start of second and following samples. When off, samples will end at begin of next sample. (default off)\n");
    fprintf(stderr, "  --fill-with-next-samples   Samples shorter than context length will be followed by the next (shuffled) samples. (default off)\n");
    fprintf(stderr, "  --separate-with-eos        When fill-with-next-samples, insert end-of-sequence token between samples.%s\n", params->separate_with_eos ? " (default)" : "");
    fprintf(stderr, "  --separate-with-bos        When fill-with-next-samples, insert begin-of-sequence token between samples.%s\n", params->separate_with_bos ? " (default)" : "");
@@ -1138,7 +1135,6 @@ void print_common_train_usage(int /*argc*/, char ** /*argv*/, const struct train
    fprintf(stderr, "  --adam-beta2 N             AdamW beta2 in interval [0,1). How much to smooth the second moment of gradients. (default %f)\n", params->adam_beta2);
    fprintf(stderr, "  --adam-gclip N             AdamW gradient clipping. Disabled when zero. (default %f)\n", params->adam_gclip);
    fprintf(stderr, "  --adam-epsf N              AdamW epsilon for convergence test. Disabled when <= zero. (default %f)\n", params->adam_eps_f);
-    fprintf(stderr, "  -ngl N, --n-gpu-layers N   Number of model layers to offload to GPU (default %d)", params->n_gpu_layers);
    fprintf(stderr, "\n");
 }

@@ -1358,17 +1354,6 @@ bool consume_common_train_arg(
            return true;
        }
        params->adam_gclip = std::stof(argv[i]);
-    } else if (arg == "-ngl" || arg == "--n-gpu-layers") {
-            if (++i >= argc) {
-                *invalid_param = true;
-                return true;
-            }
-#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
-            params->n_gpu_layers = std::stoi(argv[i]);
-#else
-            fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
-            fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
-#endif
    } else if (arg == "-h" || arg == "--help") {
        params->print_usage = true;
        return true;
--- a/common/train.h
+++ b/common/train.h
@@ -9,8 +9,6 @@
 #include "ggml.h"
 #include "llama.h"

-#define LLAMA_TRAIN_MAX_NODES 16384
-
 typedef std::string mt19937_state;

 struct train_state {
--- a/convert-baichuan-hf-to-gguf.py
+++ b/convert-baichuan-hf-to-gguf.py
@@ -0,0 +1,317 @@
+#!/usr/bin/env python3
+# HF baichuan --> gguf conversion
+
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import struct
+import sys
+from pathlib import Path
+from typing import TYPE_CHECKING, Any
+import itertools
+import numpy as np
+import torch
+from sentencepiece import SentencePieceProcessor  # type: ignore[import]
+
+if 'NO_LOCAL_GGUF' not in os.environ:
+    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
+import gguf
+
+
+if TYPE_CHECKING:
+    from typing import TypeAlias
+
+NDArray: TypeAlias = 'np.ndarray[Any, Any]'
+
+# reverse HF permute back to original pth layout
+
+
+def reverse_hf_permute(weights: NDArray, n_head: int, n_kv_head: int | None = None) -> NDArray:
+    if n_kv_head is not None and n_head != n_kv_head:
+        n_head //= n_kv_head
+
+    return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
+            .swapaxes(1, 2)
+            .reshape(weights.shape))
+
+def reverse_hf_permute_part(weights: NDArray, n_part: int, n_head: int, n_head_kv: int| None = None) -> NDArray:
+        r = weights.shape[0] // 3
+        return (reverse_hf_permute(weights[r * n_part : r * n_part + r, ...], n_head, n_head_kv))
+
+def reverse_hf_part(weights: NDArray, n_part: int) -> NDArray:
+        r = weights.shape[0] // 3
+        return weights[r * n_part : r * n_part + r, ...]
+
+def count_model_parts(dir_model: str) -> int:
+    num_parts = 0
+
+    for filename in os.listdir(dir_model):
+        if filename.startswith("pytorch_model-"):
+            num_parts += 1
+
+    if num_parts > 0:
+        print("gguf: found " + str(num_parts) + " model parts")
+
+    return num_parts
+
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="Convert a HuggingFace LLaMA model to a GGML compatible file")
+    parser.add_argument(
+        "--vocab-only", action="store_true",
+        help="extract only the vocab",
+    )
+    parser.add_argument(
+        "--outfile", type=Path,
+        help="path to write to; default: based on input",
+    )
+    parser.add_argument(
+        "model", type=Path,
+        help="directory containing model file, or model file itself (*.bin)",
+    )
+    parser.add_argument(
+        "ftype", type=int, choices=[0, 1], default=1, nargs='?',
+        help="output format - use 0 for float32, 1 for float16",
+    )
+    parser.add_argument("--bigendian",   action="store_true",    help="model is executed on big endian machine")
+    return parser.parse_args()
+
+args = parse_args()
+
+dir_model = args.model
+ftype = args.ftype
+if not dir_model.is_dir():
+    print(f'Error: {args.model} is not a directory', file = sys.stderr)
+    sys.exit(1)
+
+endianess = gguf.GGUFEndian.LITTLE
+if args.bigendian:
+    endianess = gguf.GGUFEndian.BIG
+endianess_str = "Big Endian" if args.bigendian else "Little Endian"
+print(f"gguf: Conversion Endianess {endianess}")
+# possible tensor data types
+#   ftype == 0 -> float32
+#   ftype == 1 -> float16
+
+# map from ftype to string
+ftype_str = ["f32", "f16"]
+
+if args.outfile is not None:
+    fname_out = args.outfile
+else:
+    # output in the same directory as the model by default
+    fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
+
+print("gguf: loading model "+dir_model.name)
+
+with open(dir_model / "config.json", "r", encoding="utf-8") as f:
+    hparams = json.load(f)
+print("hello print: ",hparams["architectures"][0])
+if hparams["architectures"][0] != "BaichuanForCausalLM" and hparams["architectures"][0] != "BaiChuanForCausalLM":
+    print("Model architecture not supported: " + hparams["architectures"][0])
+
+    sys.exit()
+
+# get number of model parts
+num_parts = count_model_parts(dir_model)
+print(f"num_parts:{num_parts}\n")
+ARCH=gguf.MODEL_ARCH.BAICHUAN
+gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
+
+print("gguf: get model metadata")
+
+block_count = hparams["num_hidden_layers"]
+head_count = hparams["num_attention_heads"]
+
+if "num_key_value_heads" in hparams:
+    head_count_kv = hparams["num_key_value_heads"]
+else:
+    head_count_kv = head_count
+
+if "_name_or_path" in hparams:
+    hf_repo = hparams["_name_or_path"]
+else:
+    hf_repo = ""
+
+if "max_sequence_length" in hparams:
+    ctx_length = hparams["max_sequence_length"]
+elif "max_position_embeddings" in hparams:
+    ctx_length = hparams["max_position_embeddings"]
+elif "model_max_length" in hparams:
+    ctx_length = hparams["model_max_length"]
+else:
+    print("gguf: can not find ctx length parameter.")
+
+    sys.exit()
+
+
+gguf_writer.add_name(dir_model.name)
+gguf_writer.add_source_hf_repo(hf_repo)
+gguf_writer.add_tensor_data_layout("Meta AI original pth")
+gguf_writer.add_context_length(ctx_length)
+gguf_writer.add_embedding_length(hparams["hidden_size"])
+gguf_writer.add_block_count(block_count)
+gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
+gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
+gguf_writer.add_head_count(head_count)
+gguf_writer.add_head_count_kv(head_count_kv)
+gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
+
+if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]:
+    if "type" in hparams["rope_scaling"]:
+        if hparams["rope_scaling"]["type"] == "linear":
+            gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+            gguf_writer.add_rope_scaling_factor(hparams["rope_scaling"]["factor"])
+
+
+# TOKENIZATION
+
+print("gguf: get tokenizer metadata")
+
+tokens: list[bytes] = []
+scores: list[float] = []
+toktypes: list[int] = []
+
+tokenizer_model_file = dir_model / 'tokenizer.model'
+if not tokenizer_model_file.is_file():
+    print(f'Error: Missing {tokenizer_model_file}', file = sys.stderr)
+    sys.exit(1)
+
+# vocab type sentencepiece
+print("gguf: get sentencepiece tokenizer vocab, scores and token types")
+
+tokenizer = SentencePieceProcessor(str(tokenizer_model_file))
+vocab_size = hparams.get('vocab_size')
+if vocab_size is None:
+    vocab_size = tokenizer.vocab_size()
+
+for i in range(vocab_size):
+    text: bytes
+    score: float
+
+    piece = tokenizer.id_to_piece(i)
+    text = piece.encode("utf-8")
+    score = tokenizer.get_score(i)
+
+    toktype = 1  # defualt to normal token type
+    if tokenizer.is_unknown(i):
+        toktype = 2
+    if tokenizer.is_control(i):
+        toktype = 3
+
+    # toktype = 4 is user-defined = tokens from added_tokens.json
+
+    if tokenizer.is_unused(i):
+        toktype = 5
+    if tokenizer.is_byte(i):
+        toktype = 6
+
+    tokens.append(text)
+    scores.append(score)
+    toktypes.append(toktype)
+
+added_tokens_file = dir_model / 'added_tokens.json'
+if added_tokens_file.is_file():
+    with open(added_tokens_file, "r", encoding="utf-8") as f:
+        addtokens_json = json.load(f)
+
+        print("gguf: get added tokens")
+
+        for key in addtokens_json:
+            tokens.append( key.encode("utf-8") )
+            scores.append(-1000.0)
+            toktypes.append(4) # user-defined token type
+
+
+gguf_writer.add_tokenizer_model("llama")
+gguf_writer.add_token_list(tokens)
+gguf_writer.add_token_scores(scores)
+gguf_writer.add_token_types(toktypes)
+
+special_vocab = gguf.SpecialVocab(dir_model, n_vocab = len(tokens))
+special_vocab.add_to_gguf(gguf_writer)
+
+# TENSORS
+
+tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
+
+# tensor info
+print("gguf: get tensor metadata")
+
+if num_parts == 0:
+    part_names = iter(("pytorch_model.bin",))
+else:
+    part_names = (
+        f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
+    )
+
+
+for part_name in part_names:
+    if args.vocab_only:
+        break
+    print("gguf: loading model part '" + part_name + "'")
+    model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
+
+    tmp=model_part
+    for i in range(block_count):
+        if f"model.layers.{i}.self_attn.W_pack.weight" in model_part:
+            print(f"Unpacking and permuting layer {i}")
+            tmp[f"model.layers.{i}.self_attn.q_proj.weight"]=reverse_hf_permute_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],0,head_count,head_count)
+            tmp[f"model.layers.{i}.self_attn.k_proj.weight"]=reverse_hf_permute_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],1,head_count,head_count_kv)
+            tmp[f"model.layers.{i}.self_attn.v_proj.weight"]=reverse_hf_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],2)
+            del tmp[f"model.layers.{i}.self_attn.W_pack.weight"]
+
+    for name in model_part.keys():
+        data = model_part[name]
+        # we don't need these
+        if name.endswith(".rotary_emb.inv_freq"):
+            continue
+
+        old_dtype = data.dtype
+
+        # convert any unsupported data types to float32
+        if data.dtype != torch.float16 and data.dtype != torch.float32:
+            data = data.to(torch.float32)
+
+        data = data.squeeze().numpy()
+
+        # map tensor names
+        new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
+        if new_name is None:
+            print("Can not map tensor '" + name + "'")
+            sys.exit()
+
+        n_dims = len(data.shape)
+        data_dtype = data.dtype
+
+        # if f32 desired, convert any float16 to float32
+        if ftype == 0 and data_dtype == np.float16:
+            data = data.astype(np.float32)
+
+        # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
+        if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+            data = data.astype(np.float32)
+
+        # if f16 desired, convert any float32 2-dim weight tensors to float16
+        if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+            data = data.astype(np.float16)
+
+        print(name + " -> " +  new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
+        gguf_writer.add_tensor(new_name, data)
+
+
+print("gguf: write header")
+gguf_writer.write_header_to_file()
+print("gguf: write metadata")
+gguf_writer.write_kv_data_to_file()
+if not args.vocab_only:
+    print("gguf: write tensors")
+    gguf_writer.write_tensors_to_file()
+
+gguf_writer.close()
+
+print(f"gguf: model successfully exported to '{fname_out}'")
+print("")
--- a/convert-bloom-hf-to-gguf.py
+++ b/convert-bloom-hf-to-gguf.py
@@ -0,0 +1,247 @@
+#!/usr/bin/env python3
+# HF bloom --> gguf conversion
+
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import re
+import struct
+import sys
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+import torch
+from transformers import AutoTokenizer  # type: ignore[import]
+
+if 'NO_LOCAL_GGUF' not in os.environ:
+    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
+import gguf
+
+
+def count_model_parts(dir_model: Path) -> int:
+    num_parts = 0
+    for filename in os.listdir(dir_model):
+        if filename.startswith("pytorch_model-"):
+            num_parts += 1
+
+    if num_parts > 0:
+        print("gguf: found " + str(num_parts) + " model parts")
+    return num_parts
+
+
+# Supported Models:
+#   https://huggingface.co/bigscience/bloom-1b7
+#   https://huggingface.co/bigscience/bloom-3b
+#   https://huggingface.co/bigscience/bloom-7b1
+#   https://huggingface.co/Langboat/bloom-1b4-zh
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="Convert a Bloom model to a GGML compatible file")
+    parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
+    parser.add_argument("--outfile",    type=Path,           help="path to write to; default: based on input")
+    parser.add_argument("model",        type=Path,           help="directory containing model file, or model file itself (*.bin)")
+    parser.add_argument("ftype",        type=int,            help="output format - use 0 for float32, 1 for float16", choices=[0, 1], default = 1)
+    return parser.parse_args()
+
+args = parse_args()
+
+dir_model = args.model
+ftype = args.ftype
+if not dir_model.is_dir():
+    print(f'Error: {args.model} is not a directory', file = sys.stderr)
+    sys.exit(1)
+
+# possible tensor data types
+#   ftype == 0 -> float32
+#   ftype == 1 -> float16
+
+# map from ftype to string
+ftype_str = ["f32", "f16"]
+
+if args.outfile is not None:
+    fname_out = args.outfile
+else:
+    # output in the same directory as the model by default
+    fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
+
+print("gguf: loading model "+dir_model.name)
+
+with open(dir_model / "config.json", "r", encoding="utf-8") as f:
+    hparams = json.load(f)
+
+if hparams["architectures"][0] != "BloomForCausalLM":
+    print("Model architecture not supported: " + hparams["architectures"][0])
+    sys.exit(1)
+
+# get number of model parts
+num_parts = count_model_parts(dir_model)
+
+ARCH=gguf.MODEL_ARCH.BLOOM
+gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
+
+print("gguf: get model metadata")
+
+block_count = hparams["n_layer"]
+
+gguf_writer.add_name("Bloom")
+n_embed = hparams.get("hidden_size", hparams.get("n_embed"))
+n_head = hparams.get("n_head", hparams.get("num_attention_heads"))
+gguf_writer.add_context_length(hparams.get("seq_length", n_embed))
+gguf_writer.add_embedding_length(n_embed)
+gguf_writer.add_feed_forward_length(4 * n_embed)
+gguf_writer.add_block_count(block_count)
+gguf_writer.add_head_count(n_head)
+gguf_writer.add_head_count_kv(n_head)
+gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
+gguf_writer.add_file_type(ftype)
+
+# TOKENIZATION
+
+print("gguf: get tokenizer metadata")
+
+tokens: list[bytearray] = []
+scores: list[float] = []
+toktypes: list[int] = []
+
+# gpt2 tokenizer
+gguf_writer.add_tokenizer_model("gpt2")
+
+print("gguf: get gpt2 tokenizer vocab")
+
+# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
+tokenizer = AutoTokenizer.from_pretrained(dir_model)
+
+# The number of tokens in tokenizer.json can differ from the expected vocab size.
+# This causes downstream issues with mismatched tensor sizes when running the inference
+vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
+assert max(tokenizer.vocab.values()) < vocab_size
+
+added_vocab = tokenizer.get_added_vocab()
+reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
+
+for i in range(vocab_size):
+    if i not in reverse_vocab:
+        tokens.append(f"[PAD{i}]")
+        toktypes.append(gguf.TokenType.USER_DEFINED)
+    elif reverse_vocab[i] in added_vocab:
+        tokens.append(reverse_vocab[i])
+        if tokenizer.added_tokens_decoder[i].special:
+            toktypes.append(gguf.TokenType.CONTROL)
+        else:
+            toktypes.append(gguf.TokenType.USER_DEFINED)
+    else:
+        tokens.append(reverse_vocab[i])
+        toktypes.append(gguf.TokenType.NORMAL)
+
+gguf_writer.add_token_list(tokens)
+gguf_writer.add_token_types(toktypes)
+
+special_vocab = gguf.SpecialVocab(dir_model, load_merges=True, n_vocab = len(tokens))
+special_vocab.add_to_gguf(gguf_writer)
+
+# TENSORS
+
+tensor_map = gguf.get_tensor_name_map(ARCH, block_count)
+
+# params for qkv transform
+n_head_kv = hparams.get("n_head_kv", n_head)
+head_dim = n_embed // n_head
+
+# tensor info
+print("gguf: get tensor metadata")
+
+if num_parts == 0:
+    part_names = iter(("pytorch_model.bin",))
+else:
+    part_names = (
+        f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
+    )
+
+for part_name in part_names:
+    if args.vocab_only:
+        break
+    print("gguf: loading model part '" + part_name + "'")
+    model_part = torch.load(dir_model / part_name, map_location="cpu")
+
+    has_lm_head = True
+    if "lm_head.weight" not in model_part.keys() and "output.weight" not in model_part.keys():
+        has_lm_head = False
+
+    for original_name in model_part.keys():
+        data = model_part[original_name]
+        name = re.sub(r'transformer\.', '', original_name)
+
+        old_dtype = data.dtype
+
+        # convert any unsupported data types to float32
+        if data.dtype != torch.float16 and data.dtype != torch.float32:
+            data = data.to(torch.float32)
+
+        data = data.squeeze().numpy()
+
+        if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name):
+            # Map bloom-style qkv_linear to gpt-style qkv_linear
+            # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252  # noqa
+            # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312  # noqa
+            qkv_weights = data.reshape((n_head, 3, n_embed // n_head, n_embed))
+            data = np.concatenate(
+                (qkv_weights[:, 0, :, :].reshape((-1, n_embed)),
+                 qkv_weights[:, 1, :, :].reshape((-1, n_embed)),
+                 qkv_weights[:, 2, :, :].reshape((-1, n_embed))),
+                axis=0
+            )
+            print("re-format attention.linear_qkv.weight")
+        elif re.match(r"h\.\d+\.self_attention\.query_key_value\.bias", name):
+            qkv_bias = data.reshape((n_head, 3, n_embed // n_head))
+            data = np.concatenate(
+                (qkv_bias[:, 0, :].reshape((n_embed,)),
+                 qkv_bias[:, 1, :].reshape((n_embed,)),
+                 qkv_bias[:, 2, :].reshape((n_embed,))),
+                axis=0
+            )
+            print("re-format attention.linear_qkv.bias")
+
+        # map tensor names
+        new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
+        if new_name is None:
+            print("Can not map tensor '" + name + "'")
+            sys.exit()
+
+        n_dims = len(data.shape)
+        data_dtype = data.dtype
+
+        # if f32 desired, convert any float16 to float32
+        if ftype == 0 and data_dtype == np.float16:
+            data = data.astype(np.float32)
+
+        # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
+        if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+            data = data.astype(np.float32)
+
+        # if f16 desired, convert any float32 2-dim weight tensors to float16
+        if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+            data = data.astype(np.float16)
+
+        print(name, "=>", new_name + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype))
+
+        gguf_writer.add_tensor(new_name, data)
+
+        if not has_lm_head and name == "word_embeddings.weight":
+            gguf_writer.add_tensor("output.weight", data)
+            print(name, "=>", "output.weight" + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype))  # noqa
+
+
+print("gguf: write header")
+gguf_writer.write_header_to_file()
+print("gguf: write metadata")
+gguf_writer.write_kv_data_to_file()
+if not args.vocab_only:
+    print("gguf: write tensors")
+    gguf_writer.write_tensors_to_file()
+
+gguf_writer.close()
+
+print(f"gguf: model successfully exported to '{fname_out}'")
+print("")
--- a/convert-falcon-hf-to-gguf.py
+++ b/convert-falcon-hf-to-gguf.py
@@ -0,0 +1,253 @@
+#!/usr/bin/env python3
+# HF falcon--> gguf conversion
+
+from __future__ import annotations
+
+import argparse
+import contextlib
+import json
+import os
+import struct
+import sys
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+import torch
+from transformers import AutoTokenizer  # type: ignore[import]
+
+if 'NO_LOCAL_GGUF' not in os.environ:
+    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
+import gguf
+
+
+def count_model_parts(dir_model: Path, prefix: str) -> int:
+    num_parts = 0
+    for filename in os.listdir(dir_model):
+        if filename.startswith(prefix):
+            num_parts += 1
+
+    if num_parts > 0:
+        print("gguf: found " + str(num_parts) + " model parts")
+    return num_parts
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="Convert a Falcon model to a GGML compatible file")
+    parser.add_argument(
+        "--vocab-only", action="store_true",
+        help="extract only the vocab",
+    )
+    parser.add_argument(
+        "--outfile", type=Path,
+        help="path to write to; default: based on input",
+    )
+    parser.add_argument(
+        "model", type=Path,
+        help="directory containing model file, or model file itself (*.bin)",
+    )
+    parser.add_argument(
+        "ftype", type=int, choices=[0, 1], default=1, nargs='?',
+        help="output format - use 0 for float32, 1 for float16",
+    )
+    return parser.parse_args()
+
+args = parse_args()
+
+dir_model = args.model
+ftype = args.ftype
+if not dir_model.is_dir():
+    print(f'Error: {args.model} is not a directory', file = sys.stderr)
+    sys.exit(1)
+
+# possible tensor data types
+#   ftype == 0 -> float32
+#   ftype == 1 -> float16
+
+# map from ftype to string
+ftype_str = ["f32", "f16"]
+
+if args.outfile is not None:
+    fname_out = args.outfile
+else:
+    # output in the same directory as the model by default
+    fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
+
+print("gguf: loading model "+dir_model.name)
+
+with open(dir_model / "config.json", "r", encoding="utf-8") as f:
+    hparams = json.load(f)
+
+if hparams["architectures"][0] not in ("RWForCausalLM", "FalconForCausalLM"):
+    print("Model architecture not supported: " + hparams["architectures"][0])
+
+    sys.exit(1)
+
+# get number of model parts
+num_parts = count_model_parts(dir_model, "model-00")
+if num_parts:
+    is_safetensors = True
+    from safetensors import safe_open
+else:
+    is_safetensors = False
+    num_parts = count_model_parts(dir_model, "pytorch_model-")
+
+ARCH=gguf.MODEL_ARCH.FALCON
+gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
+
+print("gguf: get model metadata")
+
+block_count = hparams.get("num_hidden_layers")
+if block_count is None:
+    block_count = hparams["n_layer"]  # old name
+
+n_head = hparams.get("num_attention_heads")
+if n_head is None:
+    n_head = hparams["n_head"]  # old name
+
+n_head_kv = hparams.get("num_kv_heads")
+if n_head_kv is None:
+    n_head_kv = hparams.get("n_head_kv", 1)  # old name
+
+gguf_writer.add_name("Falcon")
+gguf_writer.add_context_length(2048) # not in config.json
+gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform
+gguf_writer.add_embedding_length(hparams["hidden_size"])
+gguf_writer.add_feed_forward_length(4 * hparams["hidden_size"])
+gguf_writer.add_block_count(block_count)
+gguf_writer.add_head_count(n_head)
+gguf_writer.add_head_count_kv(n_head_kv)
+gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
+gguf_writer.add_file_type(ftype)
+
+# TOKENIZATION
+
+print("gguf: get tokenizer metadata")
+
+tokens: list[bytearray] = []
+scores: list[float] = []
+toktypes: list[int] = []
+
+# gpt2 tokenizer
+gguf_writer.add_tokenizer_model("gpt2")
+
+print("gguf: get gpt2 tokenizer vocab")
+
+# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
+tokenizer = AutoTokenizer.from_pretrained(dir_model)
+
+# The number of tokens in tokenizer.json can differ from the expected vocab size.
+# This causes downstream issues with mismatched tensor sizes when running the inference
+vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
+assert max(tokenizer.vocab.values()) < vocab_size
+
+reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
+
+for i in range(vocab_size):
+    tokens.append(reverse_vocab[i])
+    scores.append(0.0) # dummy
+    toktypes.append(gguf.TokenType.NORMAL)
+
+gguf_writer.add_token_list(tokens)
+gguf_writer.add_token_scores(scores)
+gguf_writer.add_token_types(toktypes)
+
+special_vocab = gguf.SpecialVocab(dir_model, load_merges = True, n_vocab = len(tokens))
+special_vocab.add_to_gguf(gguf_writer)
+
+# TENSORS
+
+tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
+
+head_dim = hparams["hidden_size"] // n_head
+
+# tensor info
+print("gguf: get tensor metadata")
+
+if num_parts == 0:
+    part_names = iter(("pytorch_model.bin",))
+elif is_safetensors:
+    part_names = (
+        f"model-{n:05}-of-{num_parts:05}.safetensors" for n in range(1, num_parts + 1)
+    )
+else:
+    part_names = (
+        f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
+    )
+
+for part_name in part_names:
+    if args.vocab_only:
+        break
+    print("gguf: loading model part '" + part_name + "'")
+    if is_safetensors:
+        ctx = safe_open(dir_model / part_name, framework="pt", device="cpu")
+    else:
+        ctx = contextlib.nullcontext(torch.load(dir_model / part_name, map_location="cpu"))
+
+    with ctx as model_part:
+        for name in model_part.keys():
+            data = model_part.get_tensor(name) if is_safetensors else model_part[name]
+
+            old_dtype = data.dtype
+
+            # convert any unsupported data types to float32
+            if data.dtype != torch.float16 and data.dtype != torch.float32:
+                data = data.to(torch.float32)
+
+            # QKV tensor transform
+            # The original query_key_value tensor contains n_head_kv "kv groups",
+            # each consisting of n_head/n_head_kv query weights followed by one key
+            # and one value weight (shared by all query heads in the kv group).
+            # This layout makes it a big pain to work with in GGML.
+            # So we rearrange them here,, so that we have n_head query weights
+            # followed by n_head_kv key weights followed by n_head_kv value weights,
+            # in contiguous fashion.
+            # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py
+
+            if "query_key_value" in name:
+                qkv = data.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head)
+                q = qkv[:, :-2 ].reshape(n_head * head_dim, head_dim * n_head)
+                k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head)
+                v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head)
+                data = torch.cat((q,k,v)).reshape_as(data)
+
+            data = data.squeeze().numpy()
+
+            # map tensor names
+            new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
+            if new_name is None:
+                print("Can not map tensor '" + name + "'")
+                sys.exit()
+
+            n_dims = len(data.shape)
+            data_dtype = data.dtype
+
+            # if f32 desired, convert any float16 to float32
+            if ftype == 0 and data_dtype == np.float16:
+                data = data.astype(np.float32)
+
+            # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
+            if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+                data = data.astype(np.float32)
+
+            # if f16 desired, convert any float32 2-dim weight tensors to float16
+            if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+                data = data.astype(np.float16)
+
+            print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
+
+            gguf_writer.add_tensor(new_name, data)
+
+
+print("gguf: write header")
+gguf_writer.write_header_to_file()
+print("gguf: write metadata")
+gguf_writer.write_kv_data_to_file()
+if not args.vocab_only:
+    print("gguf: write tensors")
+    gguf_writer.write_tensors_to_file()
+
+gguf_writer.close()
+
+print(f"gguf: model successfully exported to '{fname_out}'")
+print("")
--- a/convert-gptneox-hf-to-gguf.py
+++ b/convert-gptneox-hf-to-gguf.py
@@ -0,0 +1,221 @@
+#!/usr/bin/env python3
+# HF gptneox--> gguf conversion
+
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import struct
+import sys
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+import torch
+from transformers import AutoTokenizer  # type: ignore[import]
+
+if 'NO_LOCAL_GGUF' not in os.environ:
+    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
+import gguf
+
+
+def count_model_parts(dir_model: Path) -> int:
+    num_parts = 0
+    for filename in os.listdir(dir_model):
+        if filename.startswith("pytorch_model-"):
+            num_parts += 1
+
+    if num_parts > 0:
+        print("gguf: found " + str(num_parts) + " model parts")
+    return num_parts
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="Convert a GPT-NeoX model to a GGML compatible file")
+    parser.add_argument(
+        "--vocab-only", action="store_true",
+        help="extract only the vocab",
+    )
+    parser.add_argument(
+        "--outfile", type=Path,
+        help="path to write to; default: based on input",
+    )
+    parser.add_argument(
+        "model", type=Path,
+        help="directory containing model file, or model file itself (*.bin)",
+    )
+    parser.add_argument(
+        "ftype", type=int, choices=[0, 1], default=1, nargs='?',
+        help="output format - use 0 for float32, 1 for float16",
+    )
+    return parser.parse_args()
+
+args = parse_args()
+
+dir_model = args.model
+ftype = args.ftype
+if not dir_model.is_dir():
+    print(f'Error: {args.model} is not a directory', file = sys.stderr)
+    sys.exit(1)
+
+# possible tensor data types
+#   ftype == 0 -> float32
+#   ftype == 1 -> float16
+
+# map from ftype to string
+ftype_str = ["f32", "f16"]
+
+if args.outfile is not None:
+    fname_out = args.outfile
+else:
+    # output in the same directory as the model by default
+    fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
+
+print("gguf: loading model "+dir_model.name)
+
+with open(dir_model / "config.json", "r", encoding="utf-8") as f:
+    hparams = json.load(f)
+
+if hparams["architectures"][0] != "GPTNeoXForCausalLM":
+    print("Model architecture not supported: " + hparams["architectures"][0])
+
+    sys.exit()
+
+# get number of model parts
+num_parts = count_model_parts(dir_model)
+
+ARCH=gguf.MODEL_ARCH.GPTNEOX
+gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
+
+print("gguf: get model metadata")
+
+block_count = hparams["num_hidden_layers"]
+
+gguf_writer.add_name(dir_model.name)
+gguf_writer.add_context_length(hparams["max_position_embeddings"])
+gguf_writer.add_embedding_length(hparams["hidden_size"])
+gguf_writer.add_block_count(block_count)
+gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
+gguf_writer.add_rope_dimension_count(int(hparams["rotary_pct"]*(hparams["hidden_size"]//hparams["num_attention_heads"])))
+gguf_writer.add_head_count(hparams["num_attention_heads"])
+gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
+gguf_writer.add_layer_norm_eps(hparams["layer_norm_eps"])
+
+# TOKENIZATION
+
+print("gguf: get tokenizer metadata")
+
+tokens: list[bytearray] = []
+scores: list[float] = []
+toktypes: list[int] = []
+
+# gpt2 tokenizer
+gguf_writer.add_tokenizer_model("gpt2")
+
+print("gguf: get gpt2 tokenizer vocab")
+
+# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
+tokenizer = AutoTokenizer.from_pretrained(dir_model)
+
+# The number of tokens in tokenizer.json can differ from the expected vocab size.
+# This causes downstream issues with mismatched tensor sizes when running the inference
+vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
+assert max(tokenizer.vocab.values()) < vocab_size
+
+added_vocab = tokenizer.get_added_vocab()
+reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
+
+for i in range(vocab_size):
+    if i not in reverse_vocab:
+        tokens.append(f"[PAD{i}]")
+        toktypes.append(gguf.TokenType.USER_DEFINED)
+    elif reverse_vocab[i] in added_vocab:
+        tokens.append(reverse_vocab[i])
+        if tokenizer.added_tokens_decoder[i].special:
+            toktypes.append(gguf.TokenType.CONTROL)
+        else:
+            toktypes.append(gguf.TokenType.USER_DEFINED)
+    else:
+        tokens.append(reverse_vocab[i])
+        toktypes.append(gguf.TokenType.NORMAL)
+
+gguf_writer.add_token_list(tokens)
+gguf_writer.add_token_types(toktypes)
+
+special_vocab = gguf.SpecialVocab(dir_model, load_merges = True, n_vocab = len(tokens))
+special_vocab.add_to_gguf(gguf_writer)
+
+# TENSORS
+
+tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
+
+# tensor info
+print("gguf: get tensor metadata")
+
+if num_parts == 0:
+    part_names = iter(("pytorch_model.bin",))
+else:
+    part_names = (
+        f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
+    )
+
+for part_name in part_names:
+    if args.vocab_only:
+        break
+    print("gguf: loading model part '" + part_name + "'")
+    model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
+
+    for name in model_part.keys():
+        data = model_part[name]
+
+        # we don't need these
+        if name.endswith(".attention.masked_bias") or name.endswith(".attention.bias") or name.endswith(".attention.rotary_emb.inv_freq"):
+            continue
+
+        old_dtype = data.dtype
+
+        # convert any unsupported data types to float32
+        if data.dtype != torch.float16 and data.dtype != torch.float32:
+            data = data.to(torch.float32)
+
+        data = data.squeeze().numpy()
+
+        # map tensor names
+        new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
+        if new_name is None:
+            print("Can not map tensor '" + name + "'")
+            sys.exit()
+
+        n_dims = len(data.shape)
+        data_dtype = data.dtype
+
+        # if f32 desired, convert any float16 to float32
+        if ftype == 0 and data_dtype == np.float16:
+            data = data.astype(np.float32)
+
+        # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
+        if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+            data = data.astype(np.float32)
+
+        # if f16 desired, convert any float32 2-dim weight tensors to float16
+        if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+            data = data.astype(np.float16)
+
+        print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
+
+        gguf_writer.add_tensor(new_name, data)
+
+
+print("gguf: write header")
+gguf_writer.write_header_to_file()
+print("gguf: write metadata")
+gguf_writer.write_kv_data_to_file()
+if not args.vocab_only:
+    print("gguf: write tensors")
+    gguf_writer.write_tensors_to_file()
+
+gguf_writer.close()
+
+print(f"gguf: model successfully exported to '{fname_out}'")
+print("")
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
--- a/convert-llama-ggml-to-gguf.py
+++ b/convert-llama-ggml-to-gguf.py
@@ -2,6 +2,7 @@
 from __future__ import annotations

 import argparse
+import math
 import struct
 import sys
 from enum import IntEnum
@@ -11,16 +12,34 @@ import numpy as np

 import os
 if 'NO_LOCAL_GGUF' not in os.environ:
-    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
+    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
 import gguf

+# Note: Does not support GGML_QKK_64
+QK_K = 256
+# Items here are (block size, type size)
+GGML_QUANT_SIZES = {
+    gguf.GGMLQuantizationType.F32  : (1, 4),
+    gguf.GGMLQuantizationType.F16  : (1, 2),
+    gguf.GGMLQuantizationType.Q4_0 : (32, 2 + 16),
+    gguf.GGMLQuantizationType.Q4_1 : (32, 2 + 2 + 16),
+    gguf.GGMLQuantizationType.Q5_0 : (32, 2 + 4 + 16),
+    gguf.GGMLQuantizationType.Q5_1 : (32, 2 + 2 + 4 + 16),
+    gguf.GGMLQuantizationType.Q8_0 : (32, 2 + 32),
+    gguf.GGMLQuantizationType.Q8_1 : (32, 4 + 4 + 32),
+    gguf.GGMLQuantizationType.Q2_K : (256, 2 + 2 + QK_K // 16 + QK_K // 4),
+    gguf.GGMLQuantizationType.Q3_K : (256, 2 + QK_K // 4 + QK_K // 8 + 12),
+    gguf.GGMLQuantizationType.Q4_K : (256, 2 + 2 + QK_K // 2 + 12),
+    gguf.GGMLQuantizationType.Q5_K : (256, 2 + 2 + QK_K // 2 + QK_K // 8 + 12),
+    gguf.GGMLQuantizationType.Q6_K : (256, 2 + QK_K // 2 + QK_K // 4 + QK_K // 16),
+    gguf.GGMLQuantizationType.Q8_K : (256, 4 + QK_K + QK_K // 8),
+}

 class GGMLFormat(IntEnum):
    GGML = 0
    GGMF = 1
    GGJT = 2

-
 class GGMLFType(IntEnum):
    ALL_F32              = 0
    MOSTLY_F16           = 1
@@ -40,7 +59,6 @@ class GGMLFType(IntEnum):
    MOSTLY_Q5_K_M        = 17
    MOSTLY_Q6_K          = 18

-
 class Hyperparameters:
    def __init__(self):
        self.n_vocab = self.n_embd = self.n_mult = self.n_head = 0
@@ -72,7 +90,6 @@ class Hyperparameters:
    def __str__(self):
        return f'<Hyperparameters: n_vocab={self.n_vocab}, n_embd={self.n_embd}, n_mult={self.n_mult}, n_head={self.n_head}, n_layer={self.n_layer}, n_rot={self.n_rot}, n_ff={self.n_ff}, ftype={self.ftype.name}>'

-
 class Vocab:
    def __init__(self, load_scores = True):
        self.items = []
@@ -94,7 +111,6 @@ class Vocab:
            self.items.append((item_text, item_score))
        return offset - orig_offset

-
 class Tensor:
    def __init__(self, use_padding = True):
        self.name = None
@@ -109,7 +125,7 @@ class Tensor:
        (n_dims, name_len, dtype) = struct.unpack('<3I', data[offset:offset + 12])
        assert n_dims >= 0 and n_dims <= 4, f'Invalid tensor dimensions {n_dims}'
        assert name_len < 4096, 'Absurd tensor name length'
-        quant = gguf.GGML_QUANT_SIZES.get(dtype)
+        quant = GGML_QUANT_SIZES.get(dtype)
        assert quant is not None, 'Unknown tensor type'
        (blksize, tysize) = quant
        offset += 12
@@ -128,7 +144,6 @@ class Tensor:
        # print(n_dims, name_len, dtype, self.dims, self.name, pad)
        return offset - orig_offset

-
 class GGMLModel:
    def __init__(self):
        self.hyperparameters = None
@@ -165,8 +180,8 @@ class GGMLModel:
            if ftype not in (GGMLFType.ALL_F32, GGMLFType.MOSTLY_F16):
                err = 'Quantizations changed in GGJTv2. Can only convert unquantized GGML files older than GGJTv2.'
        elif (self.file_format == GGMLFormat.GGJT and self.format_version == 2):
-            if ftype in (GGMLFType.MOSTLY_Q4_0, GGMLFType.MOSTLY_Q4_1,
-                         GGMLFType.MOSTLY_Q4_1_SOME_F16, GGMLFType.MOSTLY_Q8_0):
+            if ftype in ( GGMLFType.MOSTLY_Q4_0, GGMLFType.MOSTLY_Q4_1,
+                          GGMLFType.MOSTLY_Q4_1_SOME_F16, GGMLFType.MOSTLY_Q8_0):
                err = 'Q4 and Q8 quantizations changed in GGJTv3.'
        if len(err) > 0:
            raise ValueError(f'{err} Sorry, your {self.file_format.name}v{self.format_version} file of type {ftype.name} is not eligible for conversion.')
@@ -193,7 +208,6 @@ class GGMLModel:
        hp.set_n_ff(self)
        return offset

-
 class GGMLToGGUF:
    def __init__(self, ggml_model, data, cfg, params_override = None, vocab_override = None, special_vocab = None):
        hp = ggml_model.hyperparameters
@@ -224,7 +238,7 @@ class GGMLToGGUF:
        gguf_writer = gguf.GGUFWriter(
            self.cfg.output,
            gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA],
-            use_temp_file = False)
+            use_temp_file = False )
        self.add_params(gguf_writer)
        self.add_vocab(gguf_writer)
        if self.special_vocab is not None:
@@ -348,8 +362,7 @@ class GGMLToGGUF:
                mapped_name,
                data[tensor.start_offset:tensor.start_offset + tensor.len_bytes],
                raw_shape = tempdims,
-                raw_dtype = tensor.dtype)
-
+                raw_dtype = tensor.dtype )

 def handle_metadata(cfg, hp):
    import convert
@@ -373,40 +386,38 @@ def handle_metadata(cfg, hp):
        raise ValueError('Unable to load metadata')
    vocab = convert.load_vocab(
        cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir,
-        cfg.vocabtype)
+        cfg.vocabtype )
    # FIXME: Respect cfg.vocab_dir?
    svocab = gguf.SpecialVocab(cfg.model_metadata_dir,
-                               load_merges = cfg.vocabtype == 'bpe',
-                               n_vocab = vocab.vocab_size)
+        load_merges = cfg.vocabtype == 'bpe',
+        n_vocab = vocab.vocab_size)
    convert.check_vocab_size(params, vocab)
    return (params, vocab, svocab)

-
 def handle_args():
    parser = argparse.ArgumentParser(description = 'Convert GGML models to GGUF')
    parser.add_argument('--input', '-i', type = Path, required = True,
-                        help = 'Input GGMLv3 filename')
+        help = 'Input GGMLv3 filename')
    parser.add_argument('--output', '-o', type = Path, required = True,
-                        help ='Output GGUF filename')
+        help ='Output GGUF filename')
    parser.add_argument('--name',
-                        help = 'Set model name')
+        help = 'Set model name')
    parser.add_argument('--desc',
-                        help = 'Set model description')
+        help = 'Set model description')
    parser.add_argument('--gqa', type = int, default = 1,
-                        help = 'grouped-query attention factor (use 8 for LLaMA2 70B)')
+        help = 'grouped-query attention factor (use 8 for LLaMA2 70B)')
    parser.add_argument('--eps', default = '5.0e-06',
-                        help = 'RMS norm eps: Use 1e-6 for LLaMA1 and OpenLLaMA, use 1e-5 for LLaMA2')
+        help = 'RMS norm eps: Use 1e-6 for LLaMA1 and OpenLLaMA, use 1e-5 for LLaMA2')
    parser.add_argument('--context-length', '-c', type=int, default = 2048,
-                        help = 'Default max context length: LLaMA1 is typically 2048, LLaMA2 is typically 4096')
+        help = 'Default max context length: LLaMA1 is typically 2048, LLaMA2 is typically 4096')
    parser.add_argument('--model-metadata-dir', '-m', type = Path,
-                        help ='Load HuggingFace/.pth vocab and metadata from the specified directory')
+        help ='Load HuggingFace/.pth vocab and metadata from the specified directory')
    parser.add_argument("--vocab-dir", type=Path,
-                        help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir")
+        help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir")
    parser.add_argument("--vocabtype", choices=["spm", "bpe"], default="spm",
-                        help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)")
+        help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)")
    return parser.parse_args()

-
 def main():
    cfg = handle_args()
    print(f'* Using config: {cfg}')
@@ -416,7 +427,7 @@ def main():
    data = np.memmap(cfg.input, mode = 'r')
    model = GGMLModel()
    print('* Scanning GGML input file')
-    offset = model.load(data, 0)  # noqa
+    offset = model.load(data, 0)
    print(f'* GGML model hyperparameters: {model.hyperparameters}')
    vocab_override = None
    params_override = None
@@ -431,15 +442,12 @@ def main():
        print('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n')
        if model.file_format == GGMLFormat.GGML:
            print('! This is a very old GGML file that does not contain vocab scores. Strongly recommend using model metadata!')
-    converter = GGMLToGGUF(
-        model, data, cfg,
+    converter = GGMLToGGUF(model, data, cfg,
        params_override = params_override,
        vocab_override = vocab_override,
-        special_vocab = special_vocab
-    )
+        special_vocab = special_vocab )
    converter.save()
    print(f'* Successful completion. Output saved to: {cfg.output}')

-
 if __name__ == '__main__':
    main()
--- a/convert-lora-to-ggml.py
+++ b/convert-lora-to-ggml.py
@@ -3,6 +3,7 @@ from __future__ import annotations

 import json
 import os
+import re
 import struct
 import sys
 from typing import Any, BinaryIO, Sequence
@@ -10,15 +11,43 @@ from typing import Any, BinaryIO, Sequence
 import numpy as np
 import torch

-from pathlib import Path
-if 'NO_LOCAL_GGUF' not in os.environ:
-    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
-import gguf
-
-
 NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1}


+HF_SUBLAYER_TO_GGML = {
+    "self_attn.q_proj": "attn_q",
+    "self_attn.k_proj": "attn_k",
+    "self_attn.v_proj": "attn_v",
+    "self_attn.o_proj": "attn_output",
+    "mlp.gate_proj": "ffn_gate",
+    "mlp.down_proj": "ffn_down",
+    "mlp.up_proj": "ffn_up",
+    "input_layernorm": "attn_norm",
+    "post_attention_layernorm": "ffn_norm",
+}
+
+
+def translate_tensor_name(t: str) -> str:
+    match = re.match(r".*layers\.(\d+)\.(\w+\.\w+)\.lora_(A|B)\.weight", t)
+    if match:
+        nn = match.group(1)
+        sub_layer = match.group(2)
+        lora_type = match.group(3)
+
+        sub_layer_renamed = HF_SUBLAYER_TO_GGML.get(sub_layer)
+        if sub_layer_renamed is None:
+            print(f"Error: unrecognized sub-layer {sub_layer} in tensor {t}")
+            sys.exit(1)
+
+        output_string = (
+            f"blk.{nn}.{HF_SUBLAYER_TO_GGML[sub_layer]}.weight.lora{lora_type}"
+        )
+        return output_string
+    else:
+        print(f"Error: unrecognized tensor {t}")
+        sys.exit(1)
+
+
 def write_file_header(fout: BinaryIO, params: dict[str, Any]) -> None:
    fout.write(b"ggla"[::-1])  # magic (ggml lora)
    fout.write(struct.pack("i", 1))  # file version
@@ -32,7 +61,9 @@ def write_file_header(fout: BinaryIO, params: dict[str, Any]) -> None:
    fout.write(struct.pack("i", int(params["lora_alpha"])))


-def write_tensor_header(fout: BinaryIO, name: str, shape: Sequence[int], data_type: np.dtype[Any]) -> None:
+def write_tensor_header(
+    self, name: str, shape: Sequence[int], data_type: np.dtype[Any]
+) -> None:
    sname = name.encode("utf-8")
    fout.write(
        struct.pack(
@@ -47,96 +78,60 @@ def write_tensor_header(fout: BinaryIO, name: str, shape: Sequence[int], data_ty
    fout.seek((fout.tell() + 31) & -32)


-if __name__ == '__main__':
-    if len(sys.argv) < 2:
-        print(f"Usage: python {sys.argv[0]} <path> [arch]")
-        print(
-            "Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'"
-        )
-        print(f"Arch must be one of {list(gguf.MODEL_ARCH_NAMES.values())} (default: llama)")
-        sys.exit(1)
+if len(sys.argv) != 2:
+    print(f"Usage: python {sys.argv[0]} <path>")
+    print(
+        "Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'"
+    )
+    sys.exit(1)

-    input_json = os.path.join(sys.argv[1], "adapter_config.json")
-    input_model = os.path.join(sys.argv[1], "adapter_model.bin")
-    output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin")
+input_json = os.path.join(sys.argv[1], "adapter_config.json")
+input_model = os.path.join(sys.argv[1], "adapter_model.bin")
+output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin")

-    model = torch.load(input_model, map_location="cpu")
-    arch_name = sys.argv[2] if len(sys.argv) == 3 else "llama"
+model = torch.load(input_model, map_location="cpu")

-    if arch_name not in gguf.MODEL_ARCH_NAMES.values():
-        print(f"Error: unsupported architecture {arch_name}")
-        sys.exit(1)
+with open(input_json, "r") as f:
+    params = json.load(f)

-    arch = list(gguf.MODEL_ARCH_NAMES.keys())[list(gguf.MODEL_ARCH_NAMES.values()).index(arch_name)]
-    name_map = gguf.TensorNameMap(arch, 200) # 200 layers ought to be enough for anyone
+if params["peft_type"] != "LORA":
+    print(f"Error: unsupported adapter type {params['peft_type']}, expected LORA")
+    sys.exit(1)

-    with open(input_json, "r") as f:
-        params = json.load(f)
+if params["fan_in_fan_out"] is True:
+    print("Error: param fan_in_fan_out is not supported")
+    sys.exit(1)

-    if params["peft_type"] != "LORA":
-        print(f"Error: unsupported adapter type {params['peft_type']}, expected LORA")
-        sys.exit(1)
+if params["bias"] is not None and params["bias"] != "none":
+    print("Error: param bias is not supported")
+    sys.exit(1)

-    if params["fan_in_fan_out"] is True:
-        print("Error: param fan_in_fan_out is not supported")
-        sys.exit(1)
+# TODO: these seem to be layers that have been trained but without lora.
+# doesn't seem widely used but eventually should be supported
+if params["modules_to_save"] is not None and len(params["modules_to_save"]) > 0:
+    print("Error: param modules_to_save is not supported")
+    sys.exit(1)

-    if params["bias"] is not None and params["bias"] != "none":
-        print("Error: param bias is not supported")
-        sys.exit(1)
+with open(output_path, "wb") as fout:
+    fout.truncate()

-    # TODO: these seem to be layers that have been trained but without lora.
-    # doesn't seem widely used but eventually should be supported
-    if params["modules_to_save"] is not None and len(params["modules_to_save"]) > 0:
-        print("Error: param modules_to_save is not supported")
-        sys.exit(1)
-
-    with open(output_path, "wb") as fout:
-        fout.truncate()
-
-        write_file_header(fout, params)
-        for k, v in model.items():
-            orig_k = k
-            if k.endswith(".default.weight"):
-                k = k.replace(".default.weight", ".weight")
-            if k in ["llama_proj.weight", "llama_proj.bias"]:
-                continue
-            if k.endswith("lora_A.weight"):
-                if v.dtype != torch.float16 and v.dtype != torch.float32:
-                    v = v.float()
-                v = v.T
-            else:
+    write_file_header(fout, params)
+    for k, v in model.items():
+        if k.endswith(".default.weight"):
+            k = k.replace(".default.weight", ".weight")
+        if k in ["llama_proj.weight", "llama_proj.bias"]:
+            continue
+        if k.endswith("lora_A.weight"):
+            if v.dtype != torch.float16 and v.dtype != torch.float32:
                v = v.float()
+            v = v.T
+        else:
+            v = v.float()

-            t = v.detach().numpy()
+        t = v.detach().numpy()
+        tname = translate_tensor_name(k)
+        print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB")
+        write_tensor_header(fout, tname, t.shape, t.dtype)
+        t.tofile(fout)

-            prefix = "base_model.model."
-            if k.startswith(prefix):
-                k = k[len(prefix) :]
-
-            lora_suffixes = (".lora_A.weight", ".lora_B.weight")
-            if k.endswith(lora_suffixes):
-                suffix = k[-len(lora_suffixes[0]):]
-                k = k[: -len(lora_suffixes[0])]
-            else:
-                print(f"Error: unrecognized tensor name {orig_k}")
-                sys.exit(1)
-
-            tname = name_map.get_name(k)
-            if tname is None:
-                print(f"Error: could not map tensor name {orig_k}")
-                print(" Note: the arch parameter must be specified if the model is not llama")
-                sys.exit(1)
-
-            if suffix == ".lora_A.weight":
-                tname += ".weight.loraA"
-            elif suffix == ".lora_B.weight":
-                tname += ".weight.loraB"
-            else:
-                assert False
-
-            print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB")
-            write_tensor_header(fout, tname, t.shape, t.dtype)
-            t.tofile(fout)
-
-    print(f"Converted {input_json} and {input_model} to {output_path}")
+print(f"Converted {input_json} and {input_model} to {output_path}")
--- a/convert-mpt-hf-to-gguf.py
+++ b/convert-mpt-hf-to-gguf.py
@@ -0,0 +1,227 @@
+#!/usr/bin/env python3
+# HF mpt--> gguf conversion
+
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import struct
+import sys
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+import torch
+from transformers import AutoTokenizer  # type: ignore[import]
+
+if 'NO_LOCAL_GGUF' not in os.environ:
+    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
+import gguf
+
+
+def count_model_parts(dir_model: Path) -> int:
+    num_parts = 0
+    for filename in os.listdir(dir_model):
+        if filename.startswith("pytorch_model-"):
+            num_parts += 1
+
+    if num_parts > 0:
+        print("gguf: found " + str(num_parts) + " model parts")
+    return num_parts
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="Convert an MPT model to a GGML compatible file")
+    parser.add_argument(
+        "--vocab-only", action="store_true",
+        help="extract only the vocab",
+    )
+    parser.add_argument(
+        "--outfile", type=Path,
+        help="path to write to; default: based on input",
+    )
+    parser.add_argument(
+        "model", type=Path,
+        help="directory containing model file, or model file itself (*.bin)",
+    )
+    parser.add_argument(
+        "ftype", type=int, choices=[0, 1], default=1, nargs='?',
+        help="output format - use 0 for float32, 1 for float16",
+    )
+    return parser.parse_args()
+
+args = parse_args()
+
+dir_model = args.model
+ftype = args.ftype
+if not dir_model.is_dir():
+    print(f'Error: {args.model} is not a directory', file = sys.stderr)
+    sys.exit(1)
+
+# possible tensor data types
+#   ftype == 0 -> float32
+#   ftype == 1 -> float16
+
+# map from ftype to string
+ftype_str = ["f32", "f16"]
+
+if args.outfile is not None:
+    fname_out = args.outfile
+else:
+    # output in the same directory as the model by default
+    fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
+
+print("gguf: loading model "+dir_model.name)
+
+with open(dir_model / "config.json", "r", encoding="utf-8") as f:
+    hparams = json.load(f)
+
+if hparams["architectures"][0] != "MPTForCausalLM":
+    print("Model architecture not supported: " + hparams["architectures"][0])
+
+    sys.exit()
+
+# get number of model parts
+num_parts = count_model_parts(dir_model)
+
+ARCH=gguf.MODEL_ARCH.MPT
+gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
+
+print("gguf: get model metadata")
+
+block_count = hparams["n_layers"]
+
+gguf_writer.add_name(dir_model.name)
+gguf_writer.add_context_length(hparams["max_seq_len"])
+gguf_writer.add_embedding_length(hparams["d_model"])
+gguf_writer.add_block_count(block_count)
+gguf_writer.add_feed_forward_length(4 * hparams["d_model"])
+gguf_writer.add_head_count(hparams["n_heads"])
+if kv_n_heads := hparams["attn_config"].get("kv_n_heads"):
+    gguf_writer.add_head_count_kv(kv_n_heads)
+gguf_writer.add_layer_norm_eps(1e-05)
+if hparams["attn_config"]["clip_qkv"] is not None:
+    gguf_writer.add_clamp_kqv(hparams["attn_config"]["clip_qkv"])
+gguf_writer.add_max_alibi_bias(hparams["attn_config"]["alibi_bias_max"])
+
+# TOKENIZATION
+
+print("gguf: get tokenizer metadata")
+
+tokens: list[bytearray] = []
+scores: list[float] = []
+toktypes: list[int] = []
+
+# gpt2 tokenizer
+gguf_writer.add_tokenizer_model("gpt2")
+
+print("gguf: get gpt2 tokenizer vocab")
+
+# MPT token embedding tensors have dimension 50432 (hparams["vocab_size"]), but
+# there are only 50254 (len(tokenizer.vocab)) tokens in the vocab, presumably to
+# accomodate some "reserved" tokens; this is causing problems down the line in
+# llama.cpp, so we pad the vocab with dummy tokens:
+
+vocab_size = hparams["vocab_size"]
+
+# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
+tokenizer = AutoTokenizer.from_pretrained(dir_model)
+
+added_vocab = tokenizer.get_added_vocab()
+reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
+
+for i in range(vocab_size):
+    if i not in reverse_vocab:
+        tokens.append(f"[PAD{i}]")
+        toktypes.append(gguf.TokenType.USER_DEFINED)
+    elif reverse_vocab[i] in added_vocab:
+        tokens.append(reverse_vocab[i])
+        if tokenizer.added_tokens_decoder[i].special:
+            toktypes.append(gguf.TokenType.CONTROL)
+        else:
+            toktypes.append(gguf.TokenType.USER_DEFINED)
+    else:
+        tokens.append(reverse_vocab[i])
+        toktypes.append(gguf.TokenType.NORMAL)
+
+gguf_writer.add_token_list(tokens)
+gguf_writer.add_token_types(toktypes)
+
+special_vocab = gguf.SpecialVocab(dir_model, load_merges = True, n_vocab = len(tokens))
+special_vocab.add_to_gguf(gguf_writer)
+
+# TENSORS
+
+tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
+
+# tensor info
+print("gguf: get tensor metadata")
+
+if num_parts == 0:
+    part_names = iter(("pytorch_model.bin",))
+else:
+    part_names = (
+        f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
+    )
+
+for part_name in part_names:
+    if args.vocab_only:
+        break
+    print("gguf: loading model part '" + part_name + "'")
+    model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
+
+    for name in model_part.keys():
+        data = model_part[name]
+
+        old_dtype = data.dtype
+
+        # convert any unsupported data types to float32
+        if data.dtype != torch.float16 and data.dtype != torch.float32:
+            data = data.to(torch.float32)
+
+        data = data.squeeze().numpy()
+
+        # map tensor names
+        new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
+        if new_name is None:
+            print("Cannot map tensor '" + name + "'")
+            continue # for the sake of compatibility with some old published models, don't quit
+            sys.exit()
+
+        n_dims = len(data.shape)
+        data_dtype = data.dtype
+
+        # if f32 desired, convert any float16 to float32
+        if ftype == 0 and data_dtype == np.float16:
+            data = data.astype(np.float32)
+
+        # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
+        if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+            data = data.astype(np.float32)
+
+        # if f16 desired, convert any float32 2-dim weight tensors to float16
+        if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+            data = data.astype(np.float16)
+
+        print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
+
+        gguf_writer.add_tensor(new_name, data)
+
+        # note: MPT output is tied to (same as) wte in original model;
+        # for easier implementation in llama.cpp it's duplicated in GGUF, though :/
+        if new_name == "token_embd.weight":
+            gguf_writer.add_tensor("output.weight", data)
+
+print("gguf: write header")
+gguf_writer.write_header_to_file()
+print("gguf: write metadata")
+gguf_writer.write_kv_data_to_file()
+if not args.vocab_only:
+    print("gguf: write tensors")
+    gguf_writer.write_tensors_to_file()
+
+gguf_writer.close()
+
+print(f"gguf: model successfully exported to '{fname_out}'")
+print("")
--- a/convert-persimmon-to-gguf.py
+++ b/convert-persimmon-to-gguf.py
@@ -1,4 +1,3 @@
-#!/usr/bin/env python3
 import torch
 import os
 from pprint import pprint
@@ -7,10 +6,9 @@ import argparse
 from pathlib import Path
 from sentencepiece import SentencePieceProcessor
 if 'NO_LOCAL_GGUF' not in os.environ:
-    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
+    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
 import gguf

-
 def _flatten_dict(dct, tensors, prefix=None):
    assert isinstance(dct, dict)
    for key in dct.keys():
@@ -23,7 +21,6 @@ def _flatten_dict(dct, tensors, prefix=None):
            raise ValueError(type(dct[key]))
    return None

-
 def _get_sentencepiece_tokenizer_info(dir_model: Path):
    tokenizer_path = dir_model / 'adept_vocab.model'
    print('gguf: getting sentencepiece tokenizer from', tokenizer_path)
@@ -57,7 +54,6 @@ def _get_sentencepiece_tokenizer_info(dir_model: Path):
        pass
    return tokens, scores, toktypes

-
 def main():
    parser = argparse.ArgumentParser(description="Convert a Persimmon model from Adept (e.g. Persimmon 8b chat) to a GGML compatible file")
    parser.add_argument("--outfile",             type=Path, help="path to write to; default: based on input")
@@ -129,5 +125,6 @@ def main():
    print("")


+
 if __name__ == '__main__':
    main()
--- a/convert-refact-hf-to-gguf.py
+++ b/convert-refact-hf-to-gguf.py
@@ -0,0 +1,272 @@
+#!/usr/bin/env python3
+# HF refact--> gguf conversion
+
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import sys
+from pathlib import Path
+
+import numpy as np
+import torch
+from transformers import AutoTokenizer  # type: ignore[import]
+
+if "NO_LOCAL_GGUF" not in os.environ:
+    sys.path.insert(1, str(Path(__file__).parent / "gguf-py" / "gguf"))
+import gguf
+
+def count_model_parts(dir_model: Path) -> int:
+    num_parts = 0
+    for filename in os.listdir(dir_model):
+        if filename.startswith("pytorch_model-"):
+            num_parts += 1
+
+    if num_parts > 0:
+        print("gguf: found " + str(num_parts) + " model parts")
+    return num_parts
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        description="Convert a Refact model to a GGML compatible file"
+    )
+    parser.add_argument(
+        "--vocab-only",
+        action="store_true",
+        help="extract only the vocab",
+    )
+    parser.add_argument(
+        "--outfile",
+        type=Path,
+        help="path to write to; default: based on input",
+    )
+    parser.add_argument(
+        "model",
+        type=Path,
+        help="directory containing model file, or model file itself (*.bin)",
+    )
+    parser.add_argument(
+        "ftype",
+        type=int,
+        choices=[0, 1],
+        default=1,
+        nargs="?",
+        help="output format - use 0 for float32, 1 for float16",
+    )
+    return parser.parse_args()
+
+
+args = parse_args()
+
+dir_model = args.model
+ftype = args.ftype
+if not dir_model.is_dir():
+    print(f"Error: {args.model} is not a directory", file=sys.stderr)
+    sys.exit(1)
+
+# possible tensor data types
+#   ftype == 0 -> float32
+#   ftype == 1 -> float16
+
+# map from ftype to string
+ftype_str = ["f32", "f16"]
+
+if args.outfile is not None:
+    fname_out = args.outfile
+else:
+    # output in the same directory as the model by default
+    fname_out = dir_model / f"ggml-model-{ftype_str[ftype]}.gguf"
+
+print("gguf: loading model " + dir_model.name)
+
+with open(dir_model / "config.json", "r", encoding="utf-8") as f:
+    hparams = json.load(f)
+
+if hparams["architectures"][0] != "GPTRefactForCausalLM":
+    print("Model architecture not supported: " + hparams["architectures"][0])
+
+    sys.exit(1)
+
+# get number of model parts
+num_parts = count_model_parts(dir_model)
+
+ARCH = gguf.MODEL_ARCH.REFACT
+gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
+
+print("gguf: get model metadata")
+
+# Get refact feed forward dimension
+hidden_dim = hparams["n_embd"]
+inner_dim = 4 * hidden_dim
+hidden_dim = int(2 * inner_dim / 3)
+multiple_of = 256
+ff_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
+
+block_count = hparams["n_layer"]
+
+gguf_writer.add_name("Refact")
+# refact uses Alibi. So this is from config.json which might be used by training.
+gguf_writer.add_context_length(hparams["n_positions"])
+gguf_writer.add_embedding_length(hparams["n_embd"])
+
+gguf_writer.add_feed_forward_length(ff_dim)
+gguf_writer.add_block_count(block_count)
+gguf_writer.add_head_count(hparams["n_head"])
+gguf_writer.add_head_count_kv(1)
+gguf_writer.add_layer_norm_rms_eps(hparams["layer_norm_epsilon"])
+gguf_writer.add_file_type(ftype)
+
+# TOKENIZATION
+
+print("gguf: get tokenizer metadata")
+
+tokens: list[bytearray] = []
+scores: list[float] = []
+toktypes: list[int] = []
+
+# gpt2 tokenizer
+gguf_writer.add_tokenizer_model("gpt2")
+
+print("gguf: get gpt2 tokenizer vocab")
+
+# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
+tokenizer = AutoTokenizer.from_pretrained(dir_model)
+
+# The number of tokens in tokenizer.json can differ from the expected vocab size.
+# This causes downstream issues with mismatched tensor sizes when running the inference
+vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
+assert max(tokenizer.vocab.values()) < vocab_size
+
+added_vocab = tokenizer.get_added_vocab()
+reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
+
+for i in range(vocab_size):
+    if i not in reverse_vocab:
+        tokens.append(f"[PAD{i}]")
+        toktypes.append(gguf.TokenType.USER_DEFINED)
+    elif reverse_vocab[i] in added_vocab:
+        tokens.append(reverse_vocab[i])
+        if tokenizer.added_tokens_decoder[i].special:
+            toktypes.append(gguf.TokenType.CONTROL)
+        else:
+            toktypes.append(gguf.TokenType.USER_DEFINED)
+    else:
+        tokens.append(reverse_vocab[i])
+        toktypes.append(gguf.TokenType.NORMAL)
+
+gguf_writer.add_token_list(tokens)
+gguf_writer.add_token_types(toktypes)
+
+special_vocab = gguf.SpecialVocab(dir_model, load_merges=True, n_vocab = len(tokens))
+special_vocab.add_to_gguf(gguf_writer)
+
+# TENSORS
+
+tensor_map = gguf.get_tensor_name_map(ARCH, block_count)
+
+# params for qkv transform
+n_head = hparams["n_head"]
+n_head_kv = 1
+
+head_dim = hparams["n_embd"] // n_head
+
+# tensor info
+print("gguf: get tensor metadata")
+
+if num_parts == 0:
+    part_names = iter(("pytorch_model.bin",))
+else:
+    part_names = (
+        f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
+    )
+for part_name in part_names:
+    if args.vocab_only:
+        break
+    print("gguf: loading model part '" + part_name + "'")
+    model_part = torch.load(dir_model / part_name, map_location="cpu")
+
+    for i in range(block_count):
+        if f"transformer.h.{i}.attn.kv.weight" in model_part:
+            data = model_part[f"transformer.h.{i}.attn.kv.weight"]
+            model_part[f"model.layers.{i}.self_attn.k_proj.weight"] = data[
+                : n_head_kv * head_dim
+            ]
+            model_part[f"model.layers.{i}.self_attn.v_proj.weight"] = data[
+                n_head_kv * head_dim :
+            ]
+            del model_part[f"transformer.h.{i}.attn.kv.weight"]
+        if f"transformer.h.{i}.attn.q.weight" in model_part:
+            model_part[f"model.layers.{i}.self_attn.q_proj.weight"] = model_part[
+                f"transformer.h.{i}.attn.q.weight"
+            ]
+            del model_part[f"transformer.h.{i}.attn.q.weight"]
+        if f"transformer.h.{i}.mlp.gate_up_proj.weight" in model_part:
+            data = model_part[f"transformer.h.{i}.mlp.gate_up_proj.weight"]
+            model_part[f"model.layers.{i}.mlp.gate_proj.weight"] = data[:ff_dim]
+            model_part[f"model.layers.{i}.mlp.up_proj.weight"] = data[ff_dim:]
+            del model_part[f"transformer.h.{i}.mlp.gate_up_proj.weight"]
+
+    for name in model_part.keys():
+        data = model_part[name]
+
+        old_dtype = data.dtype
+
+        # convert any unsupported data types to float32
+        if data.dtype != torch.float16 and data.dtype != torch.float32:
+            data = data.to(torch.float32)
+
+        data = data.squeeze().numpy()
+
+        # map tensor names
+        new_name = tensor_map.get_name(name, try_suffixes=(".weight",))
+        if new_name is None:
+            print("Can not map tensor '" + name + "'")
+            sys.exit()
+
+        n_dims = len(data.shape)
+        data_dtype = data.dtype
+
+        # if f32 desired, convert any float16 to float32
+        if ftype == 0 and data_dtype == np.float16:
+            data = data.astype(np.float32)
+
+        # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
+        if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+            data = data.astype(np.float32)
+
+        # if f16 desired, convert any float32 2-dim weight tensors to float16
+        if (
+            ftype == 1
+            and data_dtype == np.float32
+            and name.endswith(".weight")
+            and n_dims == 2
+        ):
+            data = data.astype(np.float16)
+
+        print(
+            new_name
+            + ", n_dims = "
+            + str(n_dims)
+            + ", "
+            + str(old_dtype)
+            + " --> "
+            + str(data.dtype)
+        )
+
+        gguf_writer.add_tensor(new_name, data)
+
+
+print("gguf: write header")
+gguf_writer.write_header_to_file()
+print("gguf: write metadata")
+gguf_writer.write_kv_data_to_file()
+if not args.vocab_only:
+    print("gguf: write tensors")
+    gguf_writer.write_tensors_to_file()
+
+gguf_writer.close()
+
+print(f"gguf: model successfully exported to '{fname_out}'")
+print("")
--- a/convert-starcoder-hf-to-gguf.py
+++ b/convert-starcoder-hf-to-gguf.py
@@ -0,0 +1,210 @@
+#!/usr/bin/env python3
+# HF starcoder --> gguf conversion
+
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import struct
+import sys
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+import torch
+from transformers import AutoTokenizer  # type: ignore[import]
+
+if 'NO_LOCAL_GGUF' not in os.environ:
+    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
+import gguf
+
+
+def count_model_parts(dir_model: Path) -> int:
+    num_parts = 0
+    for filename in os.listdir(dir_model):
+        if filename.startswith("pytorch_model-"):
+            num_parts += 1
+
+    if num_parts > 0:
+        print("gguf: found " + str(num_parts) + " model parts")
+    return num_parts
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="Convert a StarCoder model to a GGML compatible file")
+    parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
+    parser.add_argument("--outfile",    type=Path,           help="path to write to; default: based on input")
+    parser.add_argument("model",        type=Path,           help="directory containing model file, or model file itself (*.bin)")
+    parser.add_argument("ftype",        type=int,            help="output format - use 0 for float32, 1 for float16", choices=[0, 1], default = 1)
+    return parser.parse_args()
+
+args = parse_args()
+
+dir_model = args.model
+ftype = args.ftype
+if not dir_model.is_dir():
+    print(f'Error: {args.model} is not a directory', file = sys.stderr)
+    sys.exit(1)
+
+# possible tensor data types
+#   ftype == 0 -> float32
+#   ftype == 1 -> float16
+
+# map from ftype to string
+ftype_str = ["f32", "f16"]
+
+if args.outfile is not None:
+    fname_out = args.outfile
+else:
+    # output in the same directory as the model by default
+    fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
+
+print("gguf: loading model "+dir_model.name)
+
+with open(dir_model / "config.json", "r", encoding="utf-8") as f:
+    hparams = json.load(f)
+
+if hparams["architectures"][0] != "GPTBigCodeForCausalLM":
+    print("Model architecture not supported: " + hparams["architectures"][0])
+
+    sys.exit(1)
+
+# get number of model parts
+num_parts = count_model_parts(dir_model)
+
+ARCH=gguf.MODEL_ARCH.STARCODER
+gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
+
+print("gguf: get model metadata")
+
+block_count = hparams["n_layer"]
+
+gguf_writer.add_name("StarCoder")
+gguf_writer.add_context_length(hparams["n_positions"])
+gguf_writer.add_embedding_length(hparams["n_embd"])
+gguf_writer.add_feed_forward_length(4 * hparams["n_embd"])
+gguf_writer.add_block_count(block_count)
+gguf_writer.add_head_count(hparams["n_head"])
+gguf_writer.add_head_count_kv(1)
+gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
+gguf_writer.add_file_type(ftype)
+
+# TOKENIZATION
+
+print("gguf: get tokenizer metadata")
+
+tokens: list[bytearray] = []
+scores: list[float] = []
+toktypes: list[int] = []
+
+# gpt2 tokenizer
+gguf_writer.add_tokenizer_model("gpt2")
+
+print("gguf: get gpt2 tokenizer vocab")
+
+# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
+tokenizer = AutoTokenizer.from_pretrained(dir_model)
+
+# The number of tokens in tokenizer.json can differ from the expected vocab size.
+# This causes downstream issues with mismatched tensor sizes when running the inference
+vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
+assert max(tokenizer.vocab.values()) < vocab_size
+
+added_vocab = tokenizer.get_added_vocab()
+reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
+
+for i in range(vocab_size):
+    if i not in reverse_vocab:
+        tokens.append(f"[PAD{i}]")
+        toktypes.append(gguf.TokenType.USER_DEFINED)
+    elif reverse_vocab[i] in added_vocab:
+        tokens.append(reverse_vocab[i])
+        if tokenizer.added_tokens_decoder[i].special:
+            toktypes.append(gguf.TokenType.CONTROL)
+        else:
+            toktypes.append(gguf.TokenType.USER_DEFINED)
+    else:
+        tokens.append(reverse_vocab[i])
+        toktypes.append(gguf.TokenType.NORMAL)
+
+gguf_writer.add_token_list(tokens)
+gguf_writer.add_token_types(toktypes)
+special_vocab = gguf.SpecialVocab(dir_model, load_merges = True, n_vocab = len(tokens))
+special_vocab.add_to_gguf(gguf_writer)
+
+# TENSORS
+
+tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
+
+# params for qkv transform
+n_head    = hparams["n_head"]
+n_head_kv = hparams["n_head_kv"] if "n_head_kv" in hparams else 1
+
+head_dim = hparams["n_embd"] // n_head
+
+# tensor info
+print("gguf: get tensor metadata")
+
+if num_parts == 0:
+    part_names = iter(("pytorch_model.bin",))
+else:
+    part_names = (
+        f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
+    )
+
+for part_name in part_names:
+    if args.vocab_only:
+        break
+    print("gguf: loading model part '" + part_name + "'")
+    model_part = torch.load(dir_model / part_name, map_location="cpu")
+
+    for name in model_part.keys():
+        data = model_part[name]
+
+        old_dtype = data.dtype
+
+        # convert any unsupported data types to float32
+        if data.dtype != torch.float16 and data.dtype != torch.float32:
+            data = data.to(torch.float32)
+
+        data = data.squeeze().numpy()
+
+        # map tensor names
+        new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
+        if new_name is None:
+            print("Can not map tensor '" + name + "'")
+            sys.exit()
+
+        n_dims = len(data.shape)
+        data_dtype = data.dtype
+
+        # if f32 desired, convert any float16 to float32
+        if ftype == 0 and data_dtype == np.float16:
+            data = data.astype(np.float32)
+
+        # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
+        if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+            data = data.astype(np.float32)
+
+        # if f16 desired, convert any float32 2-dim weight tensors to float16
+        if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+            data = data.astype(np.float16)
+
+        print(name, "=>", new_name + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype))
+
+        gguf_writer.add_tensor(new_name, data)
+
+
+print("gguf: write header")
+gguf_writer.write_header_to_file()
+print("gguf: write metadata")
+gguf_writer.write_kv_data_to_file()
+if not args.vocab_only:
+    print("gguf: write tensors")
+    gguf_writer.write_tensors_to_file()
+
+gguf_writer.close()
+
+print(f"gguf: model successfully exported to '{fname_out}'")
+print("")
--- a/convert.py
+++ b/convert.py
@@ -3,14 +3,15 @@ from __future__ import annotations

 import argparse
 import concurrent.futures
+import copy
 import enum
 import faulthandler
 import functools
+import io
 import itertools
 import json
 import math
 import mmap
-import os
 import pickle
 import re
 import signal
@@ -19,17 +20,17 @@ import sys
 import time
 import zipfile
 from abc import ABCMeta, abstractmethod
-from collections import OrderedDict
 from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
 from dataclasses import dataclass
 from pathlib import Path
-from typing import IO, TYPE_CHECKING, Any, Callable, Iterable, Literal, Optional, TypeVar, cast
+from typing import IO, TYPE_CHECKING, Any, Callable, Generator, Iterable, Literal, Sequence, TypeVar

 import numpy as np
-from sentencepiece import SentencePieceProcessor
+from sentencepiece import SentencePieceProcessor  # type: ignore[import]

+import os
 if 'NO_LOCAL_GGUF' not in os.environ:
-    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
+    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
 import gguf

 if TYPE_CHECKING:
@@ -43,12 +44,10 @@ NDArray: TypeAlias = 'np.ndarray[Any, Any]'
 ARCH = gguf.MODEL_ARCH.LLAMA

 DEFAULT_CONCURRENCY = 8
-
 #
 # data types
 #

-
@dataclass(frozen=True)
 class DataType:
    name: str
@@ -58,17 +57,14 @@ class DataType:
    def elements_to_bytes(self, n_elements: int) -> int:
        return n_elements * self.dtype.itemsize

-
@dataclass(frozen=True)
 class UnquantizedDataType(DataType):
    pass

-
-DT_F16  = UnquantizedDataType('F16',  dtype = np.dtype(np.float16), valid_conversions = ['F32', 'Q8_0'])
-DT_F32  = UnquantizedDataType('F32',  dtype = np.dtype(np.float32), valid_conversions = ['F16', 'Q8_0'])
-DT_I32  = UnquantizedDataType('I32',  dtype = np.dtype(np.int16),   valid_conversions = [])
-DT_BF16 = UnquantizedDataType('BF16', dtype = np.dtype(np.uint16),  valid_conversions = ['F32', 'F16', 'Q8_0'])
-
+DT_F16  = UnquantizedDataType('F16', dtype = np.dtype(np.float16), valid_conversions = ['F32', 'Q8_0'])
+DT_F32  = UnquantizedDataType('F32', dtype = np.dtype(np.float32), valid_conversions = ['F16', 'Q8_0'])
+DT_I32  = UnquantizedDataType('I32', dtype = np.dtype(np.int16), valid_conversions = [])
+DT_BF16 = UnquantizedDataType('BF16', dtype = np.dtype(np.uint16), valid_conversions = ['F32', 'F16', 'Q8_0'])

@dataclass(frozen=True)
 class QuantizedDataType(DataType):
@@ -83,7 +79,6 @@ class QuantizedDataType(DataType):
        assert n_elements % self.block_size == 0, f'Invalid number of elements {n_elements} for {self.name} with block size {self.block_size}'
        return self.quantized_dtype.itemsize * (n_elements // self.block_size)

-
@dataclass(frozen=True)
 class Q8_0QuantizedDataType(QuantizedDataType):
    # Mini Q8_0 quantization in Python!
@@ -93,7 +88,6 @@ class Q8_0QuantizedDataType(QuantizedDataType):
        n_blocks = arr.size // self.block_size
        blocks = arr.reshape((n_blocks, self.block_size))
        # Much faster implementation of block quantization contributed by @Cebtenzzre
-
        def quantize_blocks_q8_0(blocks: NDArray) -> Iterable[tuple[Any, Any]]:
            d = abs(blocks).max(axis = 1) / np.float32(127)
            with np.errstate(divide = 'ignore'):
@@ -102,11 +96,10 @@ class Q8_0QuantizedDataType(QuantizedDataType):
            yield from zip(d, qs)
        return np.fromiter(quantize_blocks_q8_0(blocks), count = n_blocks, dtype = self.quantized_dtype)

-
 DT_Q8_0 = Q8_0QuantizedDataType('Q8_0',
-                                dtype = np.dtype(np.float32), valid_conversions = [],
-                                ggml_type = gguf.GGMLQuantizationType.Q8_0, block_size = 32,
-                                quantized_dtype = np.dtype([('d', '<f2'), ('qs', 'i1', (32,))]))
+    dtype = np.dtype(np.float32), valid_conversions = [],
+    ggml_type = gguf.GGMLQuantizationType.Q8_0, block_size = 32,
+    quantized_dtype = np.dtype([('d', '<f2'), ('qs', 'i1', (32,))]))

 # Quantized types skipped here because they may also map to np.float32
 NUMPY_TYPE_TO_DATA_TYPE: dict[np.dtype[Any], DataType] = {}
@@ -125,8 +118,6 @@ SAFETENSORS_DATA_TYPES: dict[str, DataType] = {
 # TODO: match this with `llama_ftype`
 # TODO: rename to LLAMAFileType
 # TODO: move to `gguf.py`
-
-
 class GGMLFileType(enum.IntEnum):
    AllF32     = 0
    MostlyF16  = 1  # except 1d tensors
@@ -139,7 +130,6 @@ class GGMLFileType(enum.IntEnum):
        # 1D tensors are always F32.
        return dt if len(tensor.shape) > 1 else DT_F32

-
 GGML_FILE_TYPE_TO_DATA_TYPE: dict[GGMLFileType, DataType] = {
    GGMLFileType.AllF32    : DT_F32,
    GGMLFileType.MostlyF16 : DT_F16,
@@ -150,19 +140,16 @@ GGML_FILE_TYPE_TO_DATA_TYPE: dict[GGMLFileType, DataType] = {
 # hparams loading
 #

-
@dataclass
 class Params:
-    n_vocab:        int
-    n_embd:         int
-    n_layer:        int
-    n_ctx:          int
-    n_ff:           int
-    n_head:         int
-    n_head_kv:      int
-    n_experts:      int | None = None
-    n_experts_used: int | None = None
-    f_norm_eps:     float | None = None
+    n_vocab:    int
+    n_embd:     int
+    n_layer:    int
+    n_ctx:      int
+    n_ff:       int
+    n_head:     int
+    n_head_kv:  int
+    f_norm_eps: float

    rope_scaling_type: gguf.RopeScalingType | None = None
    f_rope_freq_base: float | None = None
@@ -182,11 +169,11 @@ class Params:

        # try transformer naming first
        if "model.layers.0.self_attn.q_proj.weight" in model:
-            n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model)
+            n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model)
        elif "model.layers.0.self_attn.W_pack.weight" in model:   # next: try baichuan naming
-            n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.W_pack.weight" not in model)
+            n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.W_pack.weight" not in model)
        else:
-            n_layer = next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)
+            n_layer=next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)

        if n_layer < 1:
            raise Exception("failed to guess 'n_layer'. This model is unknown or unsupported.\n"
@@ -237,13 +224,6 @@ class Params:
            raise Exception("failed to guess 'n_ctx'. This model is unknown or unsupported.\n"
                            "Suggestion: provide 'config.json' of the model in the same directory containing model files.")

-        n_experts      = None
-        n_experts_used = None
-
-        if "num_local_experts" in config:
-            n_experts = config["num_local_experts"]
-            n_experts_used = config["num_experts_per_tok"]
-
        return Params(
            n_vocab           = config["vocab_size"],
            n_embd            = config["hidden_size"],
@@ -252,8 +232,6 @@ class Params:
            n_ff              = config["intermediate_size"],
            n_head            = (n_head := config["num_attention_heads"]),
            n_head_kv         = config.get("num_key_value_heads", n_head),
-            n_experts         = n_experts,
-            n_experts_used    = n_experts_used,
            f_norm_eps        = config["rms_norm_eps"],
            f_rope_freq_base  = config.get("rope_theta"),
            rope_scaling_type = rope_scaling_type,
@@ -268,15 +246,8 @@ class Params:
    def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params:
        config = json.load(open(config_path))

-        n_experts      = None
-        n_experts_used = None
-        f_rope_freq_base = None
-
        # hack to determine LLaMA v1 vs v2 vs CodeLlama
-        if config.get("moe"):
-            # Mixtral
-            n_ctx = 32768
-        elif config.get("rope_theta") == 1000000:
+        if config.get("rope_theta") == 1000000:
            # CodeLlama
            n_ctx = 16384
        elif config["norm_eps"] == 1e-05:
@@ -286,27 +257,16 @@ class Params:
            # LLaMA v1
            n_ctx = 2048

-        if "layers.0.feed_forward.w1.weight" in model:
-            n_ff = model["layers.0.feed_forward.w1.weight"].shape[0]
-
-        if config.get("moe"):
-            n_ff = model["layers.0.feed_forward.experts.0.w1.weight"].shape[0]
-            n_experts      = config["moe"]["num_experts"]
-            n_experts_used = config["moe"]["num_experts_per_tok"]
-            f_rope_freq_base = 1e6
-
        return Params(
-            n_vocab          = model["tok_embeddings.weight"].shape[0],
+            n_vocab          = config.get("vocab_size", model["tok_embeddings.weight"].shape[0]),
            n_embd           = config["dim"],
            n_layer          = config["n_layers"],
            n_ctx            = n_ctx,
-            n_ff             = n_ff,
+            n_ff             = model["layers.0.feed_forward.w1.weight"].shape[0],
            n_head           = (n_head := config["n_heads"]),
            n_head_kv        = config.get("n_kv_heads", n_head),
-            n_experts        = n_experts,
-            n_experts_used   = n_experts_used,
            f_norm_eps       = config["norm_eps"],
-            f_rope_freq_base = config.get("rope_theta", f_rope_freq_base),
+            f_rope_freq_base = config.get("rope_theta"),
        )

    @staticmethod
@@ -328,154 +288,140 @@ class Params:
        return params


-class VocabLoader:
-    def __init__(self, params: Params, fname_tokenizer: Path) -> None:
-        try:
-            from transformers import AutoTokenizer
-        except ImportError as e:
-            raise ImportError(
-                "To use VocabLoader, please install the `transformers` package. "
-                "You can install it with `pip install transformers`."
-            ) from e
+#
+# vocab
+#

-        try:
-            self.tokenizer = AutoTokenizer.from_pretrained(str(fname_tokenizer), trust_remote_code=True)
-        except ValueError:
-            self.tokenizer = AutoTokenizer.from_pretrained(str(fname_tokenizer), use_fast=False, trust_remote_code=True)
-
-        self.added_tokens_dict: OrderedDict[str, int] = OrderedDict()
-
-        for tok, tokidx in sorted(self.tokenizer.get_added_vocab().items(), key=lambda x: x[1]):
-            if tokidx >= params.n_vocab or tokidx < self.tokenizer.vocab_size:
-                continue
-
-            self.added_tokens_dict[tok] = tokidx
-
-        self.unk_token_id: int = self.tokenizer.unk_token_id
-        self.specials: dict[str, int] = {
-            tok: self.tokenizer.get_vocab()[tok]
-            for tok in self.tokenizer.all_special_tokens
-        }
-        self.special_ids: set[int] = set(self.tokenizer.all_special_ids)
-        self.reverse_vocab = {id: encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items()}
-        self.vocab_size_base: int = self.tokenizer.vocab_size
-        self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_dict)
-        self.fname_tokenizer: Path = fname_tokenizer
-
-        vocab_file = "tokenizer.model"
-        path_candidate = find_vocab_file_path(self.fname_tokenizer, vocab_file)
-        if path_candidate is not None:
-            self.spm = SentencePieceProcessor(str(path_candidate))
-            print(self.spm.vocab_size(), self.vocab_size_base)
+class BpeVocab:
+    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
+        self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read())
+        added_tokens: dict[str, int]
+        if fname_added_tokens is not None:
+            # FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
+            added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
        else:
-            self.spm = None
+            # Fall back to trying to find the added tokens in tokenizer.json
+            tokenizer_json_file = fname_tokenizer.parent / 'tokenizer.json'
+            if not tokenizer_json_file.is_file():
+                added_tokens = {}
+            else:
+                tokenizer_json = json.load(open(tokenizer_json_file, encoding="utf-8"))
+                added_tokens = dict(
+                    (item['content'], item['id'])
+                    for item in tokenizer_json.get('added_tokens', [])
+                    # Added tokens here can be duplicates of the main vocabulary.
+                    if item['content'] not in self.bpe_tokenizer )

-    def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
-        added_tokens_ids = set(self.added_tokens_dict.values())
+        vocab_size: int = len(self.bpe_tokenizer)
+        expected_ids    = list(range(vocab_size, vocab_size + len(added_tokens)))
+        actual_ids      = sorted(added_tokens.values())
+        if expected_ids != actual_ids:
+            expected_end_id = vocab_size + len(actual_ids) - 1
+            raise Exception(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range {vocab_size} - {expected_end_id}; got {actual_ids}")

-        for i in range(self.vocab_size_base):
-            if i in added_tokens_ids:
-                continue
+        items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
+        self.added_tokens_list    = [text for (text, idx) in items]
+        self.vocab_size_base: int = vocab_size
+        self.vocab_size: int      = self.vocab_size_base + len(self.added_tokens_list)
+        self.fname_tokenizer      = fname_tokenizer
+        self.fname_added_tokens   = fname_added_tokens

-            text = self.reverse_vocab[i].encode("utf-8")
-            yield text, self.get_token_score(i), self.get_token_type(i)
+    def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        tokenizer = self.bpe_tokenizer
+        from transformers.models.gpt2 import tokenization_gpt2  # type: ignore[import]
+        reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.items()}

-    def get_token_type(self, token_id: int) -> gguf.TokenType:
-        toktype = gguf.TokenType.NORMAL
-
-        if self.spm is not None and token_id < self.spm.vocab_size():
-            if self.spm.is_unknown(token_id):
-                toktype = gguf.TokenType.UNKNOWN
-            if self.spm.is_control(token_id):
-                toktype = gguf.TokenType.CONTROL
-            if self.spm.is_unused(token_id):
-                toktype = gguf.TokenType.UNUSED
-            if self.spm.is_byte(token_id):
-                toktype = gguf.TokenType.BYTE
-        else:
-            token = self.reverse_vocab[token_id]
-            if token_id == self.unk_token_id:
-                toktype = gguf.TokenType.UNKNOWN
-            elif token_id in self.special_ids:
-                toktype = gguf.TokenType.CONTROL
-            elif len(token) == 6 and token.startswith("<0x") and token.endswith(">"):
-                toktype = gguf.TokenType.BYTE
-
-        return toktype
-
-    def get_token_score(self, token_id: int) -> float:
-        if self.spm is not None and token_id < self.spm.vocab_size():
-            return cast(float, self.spm.get_score(token_id))
-        return 0.0
+        for i, _ in enumerate(tokenizer):
+            yield reverse_vocab[i], 0.0, gguf.TokenType.NORMAL

    def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
-
-        for text in self.added_tokens_dict:
-            if text in self.specials:
-
-                toktype = self.get_token_type(self.specials[text])
-                score = self.get_token_score(self.specials[text])
-
-            else:
-                toktype = gguf.TokenType.USER_DEFINED
-                score = -1000.0
-
-            yield text.encode("utf-8"), score, toktype
-
-    def has_newline_token(self) -> bool:
-        return '<0x0A>' in self.tokenizer.vocab or '\n' in self.tokenizer.vocab
+        for text in self.added_tokens_list:
+            score = -1000.0
+            yield text.encode("utf-8"), score, gguf.TokenType.CONTROL

    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
-        yield from self.hf_tokens()
+        yield from self.bpe_tokens()
        yield from self.added_tokens()

-    def get_vocab_type(self) -> str:
-        path_candidates = []
-        vocab_file = "tokenizer.model"
-        path_candidates.append(vocab_file)
-        path_candidate = find_vocab_file_path(self.fname_tokenizer, vocab_file)
-        if path_candidate is not None:
-            return "llama"
+    def __repr__(self) -> str:
+        return f"<BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"

-        vocab_file = "vocab.json"
-        path_candidates.append(vocab_file)
-        path_candidate = find_vocab_file_path(self.fname_tokenizer, vocab_file)
-        if path_candidate is not None:
-            return "gpt2"

-        vocab_file = "tokenizer.json"
-        path_candidates.append(vocab_file)
-        path_candidate = find_vocab_file_path(self.fname_tokenizer, vocab_file)
-        if path_candidate:
-            if not self.has_newline_token():
-                return "gpt2"
-            return "llama"
+class SentencePieceVocab:
+    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
+        self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
+        added_tokens: dict[str, int]
+        if fname_added_tokens is not None:
+            added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
+        else:
+            added_tokens = {}

-        raise FileNotFoundError(
-            f"Could not find {path_candidates} in {self.fname_tokenizer} or its parent; "
-            "if it's in another directory, pass the directory as --vocab-dir"
-        )
+        vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
+
+        new_tokens       = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
+        expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
+        actual_new_ids   = sorted(new_tokens.keys())
+
+        if expected_new_ids != actual_new_ids:
+            raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")
+
+        # Token pieces that were added to the base vocabulary.
+        self.added_tokens_list  = [new_tokens[id] for id in actual_new_ids]
+        self.vocab_size_base    = vocab_size
+        self.vocab_size         = self.vocab_size_base + len(self.added_tokens_list)
+        self.fname_tokenizer    = fname_tokenizer
+        self.fname_added_tokens = fname_added_tokens
+
+    def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        tokenizer = self.sentencepiece_tokenizer
+        for i in range(tokenizer.vocab_size()):
+            piece = tokenizer.id_to_piece(i)
+            text: bytes = piece.encode("utf-8")
+            score: float = tokenizer.get_score(i)
+
+            toktype = gguf.TokenType.NORMAL
+            if tokenizer.is_unknown(i):
+                toktype = gguf.TokenType.UNKNOWN
+            if tokenizer.is_control(i):
+                toktype = gguf.TokenType.CONTROL
+
+            # NOTE: I think added_tokens are user defined.
+            # ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
+            # if tokenizer.is_user_defined(i): toktype = gguf.TokenType.USER_DEFINED
+
+            if tokenizer.is_unused(i):
+                toktype = gguf.TokenType.UNUSED
+            if tokenizer.is_byte(i):
+                toktype = gguf.TokenType.BYTE
+
+            yield text, score, toktype
+
+    def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        for text in self.added_tokens_list:
+            score = -1000.0
+            yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED
+
+    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        yield from self.sentencepiece_tokens()
+        yield from self.added_tokens()

    def __repr__(self) -> str:
-        return f"<VocabLoader with {self.vocab_size_base} base tokens and {len(self.added_tokens_dict)} added tokens>"
-
-
-Vocab: TypeAlias = 'VocabLoader'
+        return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"

+Vocab: TypeAlias = 'BpeVocab | SentencePieceVocab'

 #
 # data loading
 # TODO: reuse (probably move to gguf.py?)
 #

-
 def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray:
-    # print( "permute debug " + str(weights.shape[0]) + " x " + str(weights.shape[1]) + " nhead " + str(n_head) + " nheadkv " + str(n_kv_head) )
+    #print( "permute debug " + str(weights.shape[0]) + " x " + str(weights.shape[1]) + " nhead " + str(n_head) + " nheadkv " + str(n_kv_head) )
    if n_head_kv is not None and n_head != n_head_kv:
        n_head = n_head_kv
    return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
-            .swapaxes(1, 2)
-            .reshape(weights.shape))
+                .swapaxes(1, 2)
+                .reshape(weights.shape))


 class Tensor(metaclass=ABCMeta):
@@ -556,7 +502,7 @@ class LazyTensor:
        ret = self._load()
        # Should be okay if it maps to the same numpy type?
        assert ret.data_type == self.data_type or (self.data_type.dtype == ret.data_type.dtype), \
-            (self.data_type, ret.data_type, self.description)
+                (self.data_type, ret.data_type, self.description)
        return ret

    def astype(self, data_type: DataType) -> LazyTensor:
@@ -629,7 +575,7 @@ def merge_multifile_models(models_plus: list[ModelPlus]) -> ModelPlus:

    if any("model.embed_tokens.weight" in mp.model for mp in models_plus):
        # Transformers models put different tensors in different files, but
-        # don't split individual tensors between files.
+        # don't split indivdual tensors between files.
        model: LazyModel = {}
        for mp in models_plus:
            model.update(mp.model)
@@ -644,7 +590,6 @@ def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_head_kv: int) -> LazyTe
        return lazy_tensor.load().permute(n_head, n_head_kv)
    return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description)

-
 def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int, n_head_kv: int) -> LazyTensor:
    def load() -> Tensor:
        return lazy_tensor.load().permute_part(n_part, n_head, n_head_kv)
@@ -652,7 +597,6 @@ def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int, n_head_
    s[0] = s[0] // 3
    return LazyTensor(load, s, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description)

-
 def part_lazy(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor:
    def load() -> Tensor:
        return lazy_tensor.load().part(n_part)
@@ -722,7 +666,7 @@ class LazyUnpickler(pickle.Unpickler):
        return func(*args)

    CLASSES: dict[tuple[str, str], Any] = {
-        # getattr used here as a workaround for mypy not being smart enough to determine
+        # getattr used here as a workaround for mypy not being smart enough to detrmine
        # the staticmethods have a __func__ attribute.
        ('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'),
        ('torch._utils', '_rebuild_tensor_v2'): getattr(lazy_rebuild_tensor_v2, '__func__'),
@@ -748,7 +692,6 @@ def lazy_load_torch_file(outer_fp: IO[bytes], path: Path) -> ModelPlus:
                              data_base_path=pickle_paths[0][:-4],
                              zip_file=zf)
    model = unpickler.load()
-    if 'model' in model: model = model['model']
    as_dict = dict(model.items())
    return ModelPlus(model=as_dict, paths=[path], format='torch', vocab=None)

@@ -802,7 +745,6 @@ def lazy_load_file(path: Path) -> ModelPlus:
 In = TypeVar('In')
 Out = TypeVar('Out')

-
 def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], concurrency: int, max_workers: int | None = None, use_processpool_executor: bool = False) -> Iterable[Out]:
    '''Parallel map, but with backpressure.  If the caller doesn't call `next`
    fast enough, this will stop calling `func` at some point rather than
@@ -837,33 +779,25 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc
                    break
            yield result

-
-def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> None:
+def check_vocab_size(params: Params, vocab: Vocab) -> None:
    if params.n_vocab != vocab.vocab_size:
-        if params.n_vocab == vocab.vocab_size:
+        assert isinstance(vocab, BpeVocab) or isinstance(vocab, SentencePieceVocab)
+        if params.n_vocab == vocab.vocab_size_base:
            print("Ignoring added_tokens.json since model matches vocab size without it.")
-            vocab.added_tokens_dict = OrderedDict()
-            vocab.vocab_size = vocab.vocab_size
-            return
-
-        if pad_vocab and params.n_vocab > vocab.vocab_size:
-            pad_count = params.n_vocab - vocab.vocab_size
-            print(f'Padding vocab with {pad_count} token(s) - <dummy00001> through <dummy{pad_count:05}>')
-            for i in range(1, (params.n_vocab - vocab.vocab_size) + 1):
-                vocab.added_tokens_dict[f'<dummy{i:05}>'] = -1
-            vocab.vocab_size = params.n_vocab
+            vocab.added_tokens_list = []
+            vocab.vocab_size = vocab.vocab_size_base
            return
        msg = f"Vocab size mismatch (model has {params.n_vocab}, but {vocab.fname_tokenizer}"
+        if vocab.fname_added_tokens is not None:
+            msg += f" combined with {vocab.fname_added_tokens}"
        msg += f" has {vocab.vocab_size})."
-        if vocab.vocab_size < params.n_vocab < vocab.vocab_size + 20:
+        if vocab.vocab_size < params.n_vocab < vocab.vocab_size + 20 and vocab.fname_added_tokens is None:
            msg += f"  Most likely you are missing added_tokens.json (should be in {vocab.fname_tokenizer.parent})."
-        if vocab.vocab_size < params.n_vocab:
-            msg += " Possibly try using the --padvocab option."
        raise Exception(msg)


 class OutputFile:
-    def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None:
+    def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian=gguf.GGUFEndian.LITTLE) -> None:
        self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)

    def add_meta_arch(self, params: Params) -> None:
@@ -883,17 +817,7 @@ class OutputFile:
        self.gguf.add_rope_dimension_count(params.n_embd // params.n_head)
        self.gguf.add_head_count          (params.n_head)
        self.gguf.add_head_count_kv       (params.n_head_kv)
-
-        if params.n_experts:
-            self.gguf.add_expert_count(params.n_experts)
-
-        if params.n_experts_used:
-            self.gguf.add_expert_used_count(params.n_experts_used)
-
-        if params.f_norm_eps:
-            self.gguf.add_layer_norm_rms_eps(params.f_norm_eps)
-        else:
-            raise ValueError('f_norm_eps is None')
+        self.gguf.add_layer_norm_rms_eps  (params.f_norm_eps)

        if params.f_rope_freq_base is not None:
            self.gguf.add_rope_freq_base(params.f_rope_freq_base)
@@ -922,8 +846,12 @@ class OutputFile:
            scores.append(score)
            toktypes.append(toktype)

-        vocab_type = vocab.get_vocab_type()
-        self.gguf.add_tokenizer_model(vocab_type)
+        if isinstance(vocab, SentencePieceVocab):
+            self.gguf.add_tokenizer_model("llama")
+        elif isinstance(vocab, BpeVocab):
+            self.gguf.add_tokenizer_model("gpt2")
+        else:
+            raise ValueError(f'Unknown vocab type: Not BpeVocab or SentencePieceVocab')
        self.gguf.add_token_list(tokens)
        self.gguf.add_token_scores(scores)
        self.gguf.add_token_types(toktypes)
@@ -949,12 +877,8 @@ class OutputFile:
        self.gguf.close()

    @staticmethod
-    def write_vocab_only(
-        fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
-        endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
-        pad_vocab: bool = False,
-    ) -> None:
-        check_vocab_size(params, vocab, pad_vocab = pad_vocab)
+    def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, endianess:gguf.GGUFEndian=gguf.GGUFEndian.LITTLE) -> None:
+        check_vocab_size(params, vocab)

        of = OutputFile(fname_out, endianess=endianess)

@@ -981,13 +905,8 @@ class OutputFile:
        return dt.quantize(arr)

    @staticmethod
-    def write_all(
-        fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab,
-        concurrency: int = DEFAULT_CONCURRENCY,
-        endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
-        pad_vocab: bool = False,
-    ) -> None:
-        check_vocab_size(params, vocab, pad_vocab = pad_vocab)
+    def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY, endianess=gguf.GGUFEndian.LITTLE) -> None:
+        check_vocab_size(params, vocab)

        of = OutputFile(fname_out, endianess=endianess)

@@ -1020,9 +939,8 @@ class OutputFile:

        of.close()

-
 def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileType:
-    wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0) + ".weight"].data_type
+    wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0)+".weight"].data_type

    if output_type_str == "f32" or (output_type_str is None and wq_type == DT_F32):
        return GGMLFileType.AllF32
@@ -1035,12 +953,10 @@ def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileT

    raise Exception(f"Unexpected combination of types: {name_to_type}")

-
 def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel:
    return {name: tensor.astype(output_type.type_for_tensor(name, tensor))
            for (name, tensor) in model.items()}

-
 def convert_model_names(model: LazyModel, params: Params) -> LazyModel:
    tmap = gguf.TensorNameMap(ARCH, params.n_layer)
    should_skip: set[gguf.MODEL_TENSOR] = set(gguf.MODEL_TENSOR_SKIP.get(ARCH, []))
@@ -1053,7 +969,7 @@ def convert_model_names(model: LazyModel, params: Params) -> LazyModel:
            print(f"Permuting layer {i}")
            tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head, params.n_head)
            tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head, params.n_head_kv)
-            # tmp[f"model.layers.{i}.self_attn.v_proj.weight"] =              model[f"model.layers.{i}.self_attn.v_proj.weight"]
+           #tmp[f"model.layers.{i}.self_attn.v_proj.weight"] =              model[f"model.layers.{i}.self_attn.v_proj.weight"]
        elif f"model.layers.{i}.self_attn.W_pack.weight" in model:
            print(f"Unpacking and permuting layer {i}")
            tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head, params.n_head)
@@ -1078,7 +994,6 @@ def convert_model_names(model: LazyModel, params: Params) -> LazyModel:

    return out

-
 def nth_multifile_path(path: Path, n: int) -> Path | None:
    '''Given any path belonging to a multi-file model (e.g. foo.bin.1), return
    the nth path in the model.
@@ -1123,8 +1038,7 @@ def load_some_model(path: Path) -> ModelPlus:
    # Be extra-friendly and accept either a file or a directory:
    if path.is_dir():
        # Check if it's a set of safetensors files first
-        globs = ["model-00001-of-*.safetensors", "model.safetensors"]
-        files = [file for glob in globs for file in path.glob(glob)]
+        files = list(path.glob("model-00001-of-*.safetensors"))
        if not files:
            # Try the PyTorch patterns too, with lower priority
            globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin"]
@@ -1145,17 +1059,35 @@ def load_some_model(path: Path) -> ModelPlus:
    return model_plus


-def find_vocab_file_path(path: Path, vocab_file: str) -> Optional[Path]:
-    path2 = path / vocab_file
-    # Use `.parent` instead of /.. to handle the symlink case better.
-    path3 = path.parent / vocab_file
+def load_vocab(path: Path, vocabtype: str | None) -> Vocab:
+    # Be extra-friendly and accept either a file or a directory.  Also, if it's
+    # a directory, it might be the model directory, and tokenizer.model might
+    # be in the parent of that.
+    if path.is_dir():
+        vocab_file = "tokenizer.model"
+        if vocabtype == 'bpe':
+            vocab_file = "vocab.json"
+        path2 = path / vocab_file
+        # Use `.parent` instead of /.. to handle the symlink case better.
+        path3 = path.parent / vocab_file
+        if path2.exists():
+            path = path2
+        elif path3.exists():
+            path = path3
+        else:
+            raise FileNotFoundError(
+                f"Could not find {vocab_file} in {path} or its parent; "
+                "if it's in another directory, pass the directory as --vocab-dir")

-    if path2.exists():
-        return path2
-    if path3.exists():
-        return path3
+    print(f"Loading vocab file '{path}', type '{vocabtype}'")

-    return None
+    added_tokens_path = path.parent / "added_tokens.json"
+    if vocabtype == "bpe":
+        return BpeVocab(path, added_tokens_path if added_tokens_path.exists() else None)
+    elif vocabtype == "spm":
+        return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None)
+    else:
+        raise ValueError(f"Unsupported vocabulary type {vocabtype}")


 def default_outfile(model_paths: list[Path], file_type: GGMLFileType) -> Path:
@@ -1182,38 +1114,20 @@ def do_dump_model(model_plus: ModelPlus) -> None:


 def main(args_in: list[str] | None = None) -> None:
-    output_choices = ["f32", "f16"]
-    if np.uint32(1) == np.uint32(1).newbyteorder("<"):
-        # We currently only support Q8_0 output on little endian systems.
-        output_choices.append("q8_0")
    parser = argparse.ArgumentParser(description="Convert a LLaMa model to a GGML compatible file")
-    parser.add_argument("--awq-path",    type=Path,              help="Path to scale awq cache file", default=None)
    parser.add_argument("--dump",        action="store_true",    help="don't convert, just show what's in the model")
    parser.add_argument("--dump-single", action="store_true",    help="don't convert, just show what's in a single model file")
    parser.add_argument("--vocab-only",  action="store_true",    help="extract only the vocab")
-    parser.add_argument("--outtype",     choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)")
+    parser.add_argument("--outtype",     choices=["f32", "f16", "q8_0"], help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)")
    parser.add_argument("--vocab-dir",   type=Path,              help="directory containing tokenizer.model, if separate from model file")
    parser.add_argument("--outfile",     type=Path,              help="path to write to; default: based on input")
    parser.add_argument("model",         type=Path,              help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
+    parser.add_argument("--vocabtype",   choices=["spm", "bpe"], help="vocab format (default: spm)", default="spm")
    parser.add_argument("--ctx",         type=int,               help="model training context (default: based on input)")
    parser.add_argument("--concurrency", type=int,               help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default = DEFAULT_CONCURRENCY)
    parser.add_argument("--bigendian",   action="store_true",    help="model is executed on big endian machine")
-    parser.add_argument("--padvocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides")

    args = parser.parse_args(args_in)
-    if args.awq_path:
-        sys.path.insert(1, str(Path(__file__).parent / 'awq-py'))
-        from awq.apply_awq import add_scale_weights
-        tmp_model_path = args.model / "weighted_model"
-        if tmp_model_path.is_dir():
-            print(f"{tmp_model_path} exists as a weighted model.")
-        else:
-            tmp_model_path.mkdir(parents=True, exist_ok=True)
-            print("Saving new weighted model ...")
-            add_scale_weights(str(args.model), str(args.awq_path), str(tmp_model_path))
-            print(f"Saved weighted model at {tmp_model_path}.")
-        args.model = tmp_model_path
-
    if args.dump_single:
        model_plus = lazy_load_file(args.model)
        do_dump_model(model_plus)
@@ -1254,13 +1168,12 @@ def main(args_in: list[str] | None = None) -> None:
        if not args.outfile:
            raise ValueError("need --outfile if using --vocab-only")
        # FIXME: Try to respect vocab_dir somehow?
-        vocab = VocabLoader(params, args.vocab_dir or args.model)
+        vocab = load_vocab(args.vocab_dir or args.model, args.vocabtype)
        special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent,
-                                          load_merges = True,
-                                          n_vocab = vocab.vocab_size)
+            load_merges = args.vocabtype == 'bpe',
+            n_vocab = vocab.vocab_size)
        outfile = args.outfile
-        OutputFile.write_vocab_only(outfile, params, vocab, special_vocab,
-                                    endianess = endianess, pad_vocab = args.padvocab)
+        OutputFile.write_vocab_only(outfile, params, vocab, special_vocab)
        print(f"Wrote {outfile}")
        return

@@ -1268,15 +1181,12 @@ def main(args_in: list[str] | None = None) -> None:
        vocab = model_plus.vocab
    else:
        vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
-        vocab = VocabLoader(params, vocab_dir)
-
+        vocab = load_vocab(vocab_dir, args.vocabtype)
    # FIXME: Try to respect vocab_dir somehow?
-    print(f"Vocab info: {vocab}")
    special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent,
-                                      load_merges = True,
-                                      n_vocab = vocab.vocab_size)
+        load_merges = args.vocabtype == 'bpe',
+        n_vocab = vocab.vocab_size)

-    print(f"Special vocab info: {special_vocab}")
    model   = model_plus.model
    model   = convert_model_names(model, params)
    ftype   = pick_output_type(model, args.outtype)
@@ -1286,8 +1196,7 @@ def main(args_in: list[str] | None = None) -> None:
    params.ftype = ftype
    print(f"Writing {outfile}, format {ftype}")

-    OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab,
-                         concurrency = args.concurrency, endianess = endianess, pad_vocab = args.padvocab)
+    OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, concurrency = args.concurrency, endianess=endianess)
    print(f"Wrote {outfile}")


--- a/docs/llama-star/idea-arch.key
+++ b/docs/llama-star/idea-arch.key
--- a/docs/llama-star/idea-arch.pdf
+++ b/docs/llama-star/idea-arch.pdf
--- a/docs/token_generation_performance_tips.md
+++ b/docs/token_generation_performance_tips.md
@@ -17,7 +17,7 @@ llama_model_load_internal: [cublas] total VRAM used: 17223 MB
 If you see these lines, then the GPU is being used.

 ## Verifying that the CPU is not oversaturated
-llama accepts a `-t N` (or `--threads N`) parameter. It's extremely important that this parameter is not too large. If your token generation is extremely slow, try setting this number to 1. If this significantly improves your token generation speed, then your CPU is being oversaturated and you need to explicitly set this parameter to the number of the physical CPU cores on your machine (even if you utilize a GPU). If in doubt, start with 1 and double the amount until you hit a performance bottleneck, then scale the number down.
+llama accepts a `-t N` (or `--threads N`) parameter. It's extremely important that this parameter is not too large. If your token generation is extremely slow, try setting this number to 1. If this significantly improves your token generation speed, then your CPU is being oversaturated and you need to explicitly set this parameter to the number of the physicial CPU cores on your machine (even if you utilize a GPU). If in doubt, start with 1 and double the amount until you hit a performance bottleneck, then scale the number down.

 # Example of runtime flags effect on inference speed benchmark
 These runs were tested on the following machine:
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -24,17 +24,13 @@ else()
    add_subdirectory(llama-bench)
    add_subdirectory(llava)
    add_subdirectory(main)
-    add_subdirectory(tokenize)
    add_subdirectory(parallel)
    add_subdirectory(perplexity)
    add_subdirectory(quantize)
    add_subdirectory(quantize-stats)
    add_subdirectory(save-load-state)
    add_subdirectory(simple)
-    add_subdirectory(passkey)
    add_subdirectory(speculative)
-    add_subdirectory(lookahead)
-    add_subdirectory(lookup)
    add_subdirectory(train-text-from-scratch)
    if (LLAMA_METAL)
        add_subdirectory(metal)
--- a/examples/baby-llama/baby-llama.cpp
+++ b/examples/baby-llama/baby-llama.cpp
@@ -575,7 +575,10 @@ static struct ggml_tensor * forward(

            // KQ_scaled = KQ / sqrt(n_embd/n_head)
            // KQ_scaled shape [n_past + N, N, n_head, 1]
-            struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, 1.0f/sqrtf(float(n_embd)/n_head));
+            struct ggml_tensor * KQ_scaled =
+                ggml_scale(ctx0,
+                        KQ,
+                        ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)));

            // KQ_masked = mask_past(KQ_scaled)
            // KQ_masked shape [n_past + N, N, n_head, 1]
@@ -841,7 +844,10 @@ static struct ggml_tensor * forward_batch(

            // KQ_scaled = KQ / sqrt(n_embd/n_head)
            // KQ_scaled shape [n_past + N, N, n_head, n_batch]
-            struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, 1.0f/sqrtf(float(n_embd)/n_head));
+            struct ggml_tensor * KQ_scaled =
+                ggml_scale(ctx0,
+                        KQ,
+                        ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)));
            assert_shape_4d(KQ_scaled, n_past + N, N, n_head, n_batch);

            // KQ_masked = mask_past(KQ_scaled)
@@ -1125,7 +1131,10 @@ static struct ggml_tensor * forward_lora(

            // KQ_scaled = KQ / sqrt(n_embd/n_head)
            // KQ_scaled shape [n_past + N, N, n_head, 1]
-            struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, 1.0f/sqrtf(float(n_embd)/n_head));
+            struct ggml_tensor * KQ_scaled =
+                ggml_scale(ctx0,
+                        KQ,
+                        ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)));

            // KQ_masked = mask_past(KQ_scaled)
            // KQ_masked shape [n_past + N, N, n_head, 1]
@@ -1249,9 +1258,9 @@ static struct ggml_tensor * forward_lora(
 }

 static void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) {
-    assert(ggml_is_matrix(logits));
-    assert(ggml_is_matrix(probs));
-    assert(ggml_is_vector(best_samples));
+    assert(logits->n_dims == 2);
+    assert(probs->n_dims == 2);
+    assert(best_samples->n_dims == 1);
    assert(logits->ne[1] == best_samples->ne[0]);
    assert(logits->ne[0] == probs->ne[0]);
    assert(logits->ne[1] == probs->ne[1]);
@@ -1283,9 +1292,9 @@ static void sample_softmax_batch(
    struct ggml_context * ctx, struct ggml_tensor * logits, struct ggml_tensor * probs,
    struct ggml_tensor * best_samples
 ) {
-    GGML_ASSERT(ggml_is_matrix(best_samples));
-    GGML_ASSERT(ggml_is_3d(logits));
-    GGML_ASSERT(ggml_is_3d(probs));
+    GGML_ASSERT(best_samples->n_dims == 2);
+    GGML_ASSERT(logits->n_dims == 3);
+    GGML_ASSERT(probs->n_dims == 3);
    int n_tokens = best_samples->ne[0];
    int n_batch  = best_samples->ne[1];
    int n_vocab  = logits->ne[0];
@@ -1325,7 +1334,7 @@ static void print_row(struct ggml_tensor * probs, int i) {
 }

 static void print_matrix(struct ggml_tensor * probs) {
-    assert(ggml_is_matrix(probs));
+    assert(probs->n_dims == 2);
    for (int i = 0; i < probs->ne[1]; ++i) {
        for (int k = 0; k < probs->ne[0]; ++k) {
            float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k);
@@ -1377,8 +1386,8 @@ static void get_example_targets(int example_id, struct ggml_tensor * tokens_inpu
 static void get_example_targets_batch(
    struct ggml_context * ctx, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets
 ) {
-    GGML_ASSERT(ggml_is_matrix(tokens_input));
-    GGML_ASSERT(ggml_is_3d(targets));
+    GGML_ASSERT(tokens_input->n_dims == 2);
+    GGML_ASSERT(     targets->n_dims == 3);
    int n_tokens = tokens_input->ne[0];
    int n_batch  = tokens_input->ne[1];
    GGML_ASSERT(n_tokens == targets->ne[1]);
--- a/examples/base-translate.sh
+++ b/examples/base-translate.sh
@@ -1,61 +0,0 @@
-#!/bin/bash
-#
-# Few-shot translation example.
-# Requires a base model (i.e. no fine-tuned or instruct models).
-#
-# Usage:
-#
-#   cd llama.cpp
-#   make -j
-#
-#   ./examples/base-translate.sh <model-base> "<text>" [extra-main-args]
-#
-
-if [ $# -lt 2 ]; then
-  echo "Usage: ./base-translate.sh <model-base> \"<text>\" [extra-main-args]"
-  exit 1
-fi
-
-eargs=""
-if [ $# -gt 2 ]; then
-  eargs="${@:3}"
-fi
-
-ftmp="__llama.cpp_example_tmp__.txt"
-trap "rm -f $ftmp" EXIT
-
-echo "Translate from English to French:
-
-===
-
-sea otter, peppermint, plush girafe:
-
-sea otter => loutre de mer
-peppermint => menthe poivrée
-plush girafe => girafe peluche
-
-===
-
-violin
-
-violin => violon
-
-===
-
-phone, computer, mouse, keyboard:
-
-phone => téléphone
-computer => ordinateur
-mouse => souris
-keyboard => clavier
-
-===
-" > $ftmp
-
-echo "$2
-" >> $ftmp
-
-model=$1
-
-# generate the most likely continuation until the string "===" is found
-./main -m $model -f $ftmp -n 64 --temp 0 --repeat-penalty 1.0 --no-penalize-nl -r "===" $eargs
--- a/examples/batched-bench/batched-bench.cpp
+++ b/examples/batched-bench/batched-bench.cpp
@@ -155,7 +155,7 @@ int main(int argc, char ** argv) {
    }

    LOG_TEE("\n");
-    LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, mmq = %d, n_threads = %d, n_threads_batch = %d\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, mmq, ctx_params.n_threads, ctx_params.n_threads_batch);
+    LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, mmq = %d\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, mmq);
    LOG_TEE("\n");

    LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP",     "TG",     "B",    "N_KV",     "T_PP s",   "S_PP t/s", "T_TG s",   "S_TG t/s", "T s",      "S t/s");
--- a/examples/batched.swift/README.md
+++ b/examples/batched.swift/README.md
@@ -1,4 +1,4 @@
 This is a swift clone of `examples/batched`.

 $ `make`
-$ `./batched_swift MODEL_PATH [PROMPT] [PARALLEL]`
+$ `./swift MODEL_PATH [PROMPT] [PARALLEL]`
--- a/examples/batched.swift/Sources/main.swift
+++ b/examples/batched.swift/Sources/main.swift
@@ -153,7 +153,7 @@ while n_cur <= n_len {
        // const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);

        // is it an end of stream? -> mark the stream as finished
-        if new_token_id == llama_token_eos(model) || n_cur == n_len {
+        if new_token_id == llama_token_eos(context) || n_cur == n_len {
            i_batch[i] = -1
            // print("")
            if n_parallel > 1 {
@@ -215,10 +215,9 @@ print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end
 llama_print_timings(context)

 private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
-    let utf8Count = text.utf8.count
-    let n_tokens = utf8Count + (add_bos ? 1 : 0)
+    let n_tokens = text.count + (add_bos ? 1 : 0)
    let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
-    let tokenCount = llama_tokenize(model, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, /*special tokens*/ false)
+    let tokenCount = llama_tokenize(model, text, Int32(text.count), tokens, Int32(n_tokens), add_bos, /*special tokens*/ false)
    var swiftTokens: [llama_token] = []
    for i in 0 ..< tokenCount {
        swiftTokens.append(tokens[Int(i)])
@@ -231,15 +230,18 @@ private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String
    var result = [CChar](repeating: 0, count: 8)
    let nTokens = llama_token_to_piece(model, token, &result, Int32(result.count))
    if nTokens < 0 {
-        let actualTokensCount = -Int(nTokens)
-        result = .init(repeating: 0, count: actualTokensCount)
+        if result.count >= -Int(nTokens) {
+            result.removeLast(-Int(nTokens))
+        } else {
+            result.removeAll()
+        }
        let check = llama_token_to_piece(
            model,
            token,
            &result,
            Int32(result.count)
        )
-        assert(check == actualTokensCount)
+        assert(check == nTokens)
    } else {
        result.removeLast(result.count - Int(nTokens))
    }
@@ -257,4 +259,5 @@ private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String
        buffer = []
        return bufferString
    }
+    return nil
 }
--- a/examples/batched/batched.cpp
+++ b/examples/batched/batched.cpp
@@ -69,7 +69,6 @@ int main(int argc, char ** argv) {

    std::vector<llama_token> tokens_list;
    tokens_list = ::llama_tokenize(model, params.prompt, true);
-
    const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size())*n_parallel;

    // initialize the context
--- a/examples/benchmark/benchmark-matmult.cpp
+++ b/examples/benchmark/benchmark-matmult.cpp
@@ -129,13 +129,13 @@ int main(int argc, char ** argv)  {
    const ggml_type qtype = GGML_TYPE_Q4_1;

    size_t ctx_size = 0;
-    ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey);
-    ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey);
-    ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizez);
-    ctx_size += ggml_row_size(qtype,         sizex*sizey);
-    ctx_size += ggml_row_size(qtype,         sizex*sizey);
-    ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS
-    ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS
+    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
+    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
+    ctx_size += sizex*sizez*ggml_type_sizef(GGML_TYPE_F32);
+    ctx_size += sizex*sizey*ggml_type_sizef(qtype);
+    ctx_size += sizex*sizey*ggml_type_sizef(qtype);
+    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
+    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
    ctx_size += 1024*1024*16;

    printf("Allocating Memory of size %zi bytes, %zi MB\n",ctx_size, (ctx_size/1024/1024));
@@ -171,8 +171,7 @@ int main(int argc, char ** argv)  {
    struct ggml_tensor * m11xm2 = ggml_mul_mat(ctx, m11, m2);

    // printf("Creating compute graph\n");
-    struct ggml_cgraph * gf = ggml_new_graph(ctx);
-    ggml_build_forward_expand(gf, m11xm2);
+    struct ggml_cgraph gf = ggml_build_forward(m11xm2);

    printf("n_threads=%i\n", benchmark_params.n_threads);

@@ -181,9 +180,9 @@ int main(int argc, char ** argv)  {

    std::vector<uint8_t> work_buffer;

-    ggml_graph_compute_helper(work_buffer, gf, benchmark_params.n_threads);
+    ggml_graph_compute_helper(work_buffer, &gf, benchmark_params.n_threads);

-    TENSOR_DUMP(gf->nodes[0]);
+    TENSOR_DUMP(gf.nodes[0]);

    printf("\n------ Test 2 - Matrix Mult via %s code\n", ggml_type_name(qtype));

@@ -201,8 +200,7 @@ int main(int argc, char ** argv)  {
    struct ggml_tensor * q31 = ggml_mul_mat(ctx, q11, m2);

    // printf("Creating compute graph\n");
-    struct ggml_cgraph * gf31 = ggml_new_graph(ctx);
-    ggml_build_forward_expand(gf31, q31);
+    struct ggml_cgraph gf31 = ggml_build_forward(q31);

    // Set up a second graph computation to make sure we override the CPU cache lines
    // printf("Creating new tensor q12 & Running quantize\n");
@@ -213,8 +211,7 @@ int main(int argc, char ** argv)  {
    struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2);

    //printf("Creating compute graph\n");
-    struct ggml_cgraph * gf32 = ggml_new_graph(ctx);
-    ggml_build_forward_expand(gf32, q32);
+    struct ggml_cgraph gf32 = ggml_build_forward(q32);
    printf("n_threads=%i\n", benchmark_params.n_threads);

    const int dimx = sizex;
@@ -226,7 +223,7 @@ int main(int argc, char ** argv)  {


    // Let's use the F32 result from above as a reference for the quantized multiplication
-    float sum_of_F32_reference = tensor_sum_elements(gf->nodes[0]);
+    float sum_of_F32_reference = tensor_sum_elements(gf.nodes[0]);

    printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n");
    printf("=====================================================================================\n");
@@ -236,7 +233,7 @@ int main(int argc, char ** argv)  {

        long long int start = ggml_time_us();
        //printf("Running ggml_graph_compute\n");
-        ggml_graph_compute_helper(work_buffer, gf31, benchmark_params.n_threads);
+        ggml_graph_compute_helper(work_buffer, &gf31, benchmark_params.n_threads);

        long long int stop = ggml_time_us();
        long long int usec = stop-start;
@@ -254,7 +251,7 @@ int main(int argc, char ** argv)  {

        // Check that the matrix multiplication result is in the right ballpark
        // We cannot use the exact value from the F32 multiplication because the quantizuation will be slightly different
-        float sum_of_Q4_result = tensor_sum_elements(gf31->nodes[0]);
+        float sum_of_Q4_result = tensor_sum_elements(gf31.nodes[0]);
        float delta = std::abs(sum_of_Q4_result - sum_of_F32_reference);
        float allowed_delta = (sum_of_F32_reference) / 1000 / 1000; //  Let's accept an epsilon of 10^-6

@@ -269,7 +266,7 @@ int main(int argc, char ** argv)  {
        }

        // Running a different graph computation to make sure we override the CPU cache lines
-        ggml_graph_compute_helper(work_buffer, gf32, benchmark_params.n_threads);
+        ggml_graph_compute_helper(work_buffer, &gf32, benchmark_params.n_threads);
    }
    printf("\n");
    printf("Average%78.2f\n",gflops_sum/((double)benchmark_params.n_iterations));
--- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
+++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
@@ -427,7 +427,7 @@ static void print_row(struct ggml_tensor * probs, int i) {
 }

 static void print_matrix(struct ggml_tensor * probs) {
-    assert(ggml_is_matrix(probs));
+    assert(probs->n_dims == 2);
    for (int i = 0; i < probs->ne[1]; ++i) {
        for (int k = 0; k < probs->ne[0]; ++k) {
            float p = get_f32_2d(probs, k, i);
@@ -639,7 +639,7 @@ static void load_vocab(const char *filename, Config *config, struct llama_vocab

 static void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * karpathy_weights) {
    int ct;
-    switch (ggml_n_dims(gg_weights)) {
+    switch (gg_weights->n_dims){
        case 1:
            ct = 0;
            for (int i0 = 0; i0 < gg_weights->ne[0]; i0++){
--- a/examples/export-lora/export-lora.cpp
+++ b/examples/export-lora/export-lora.cpp
@@ -240,7 +240,7 @@ static struct lora_data * load_lora(struct lora_info * info) {
    }

    struct ggml_init_params params_ggml;
-    params_ggml.mem_size   = ggml_tensor_overhead() * GGML_DEFAULT_GRAPH_SIZE;
+    params_ggml.mem_size   = ggml_tensor_overhead() * GGML_MAX_NODES;
    params_ggml.mem_buffer = NULL;
    params_ggml.no_alloc   = true;
    result->ctx = ggml_init(params_ggml);
@@ -309,7 +309,7 @@ static struct ggml_cgraph * build_graph_lora(
 ) {
    struct ggml_tensor * ab = ggml_mul_mat(ctx, lora_a, lora_b);
    if (scaling != 1.0f) {
-        ab = ggml_scale(ctx, ab, scaling);
+        ab = ggml_scale(ctx, ab, ggml_new_f32(ctx, scaling));
    }
    struct ggml_tensor * res = ggml_add_inplace(ctx, tensor, ab);

@@ -334,7 +334,7 @@ static bool apply_lora(struct ggml_tensor * tensor, struct lora_data * lora, int
    float scaling = lora->info.scale * (float)lora->lora_alpha / (float)lora->lora_r;

    struct ggml_init_params params;
-    params.mem_size   = GGML_OBJECT_SIZE + ggml_graph_overhead() + ggml_tensor_overhead()*4 + GGML_MEM_ALIGN*5;
+    params.mem_size   = GGML_OBJECT_SIZE + GGML_GRAPH_SIZE + ggml_tensor_overhead()*4 + GGML_MEM_ALIGN*5;
    params.mem_buffer = NULL;
    params.no_alloc   = true;
    struct ggml_context * ctx = NULL;
--- a/examples/finetune/README.md
+++ b/examples/finetune/README.md
@@ -21,7 +21,7 @@ wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/s
 ./bin/main -m open-llama-3b-v2-q8_0.gguf --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
 ```

-**Only llama based models are supported!** The output files will be saved every N iterations (config with `--save-every N`).
+Finetune output files will be saved every N iterations (config with `--save-every N`).
 The pattern 'ITERATION' in the output filenames will be replaced with the iteration number and with 'LATEST' for the latest output.
 So in above example after 10 iterations these files will be written:
 - chk-lora-open-llama-3b-v2-q8_0-shakespeare-10.gguf
@@ -61,7 +61,7 @@ For example to apply 40% of the 'shakespeare' LORA adapter, 80% of the 'bible' L
  --lora lora-open-llama-3b-v2-q8_0-yet-another-one-LATEST.bin
 ```

-The scale numbers don't need to add up to one, and you can also use numbers greater than 1 to further increase the influence of an adapter. But making the values too big will sometimes result in worse output. Play around to find good values.
+The scale numbers don't need to add up to one, and you can also use numbers greater than 1 to further increase the influence of an adapter. But making the values to big will sometimes result in worse output. Play around to find good values.

 Gradient checkpointing reduces the memory requirements by ~50% but increases the runtime.
 If you have enough RAM, you can make finetuning a bit faster by disabling checkpointing with `--no-checkpointing`.
--- a/examples/finetune/convert-finetune-checkpoint-to-gguf.py
+++ b/examples/finetune/convert-finetune-checkpoint-to-gguf.py
@@ -3,7 +3,9 @@

 import argparse
 import gguf
+import os
 import struct
+import sys
 import numpy as np
 from pathlib import Path

--- a/examples/finetune/finetune.cpp
+++ b/examples/finetune/finetune.cpp
@@ -3,9 +3,15 @@
 #include "llama.h"
 #include "common.h"
 #include "train.h"
+#include <unordered_map>
 #include <vector>
+#include <cassert>
+#include <climits>
 #include <cstring>
+#include <cstdarg>
 #include <ctime>
+#include <random>
+#include <stdexcept>
 #include <algorithm>
 #include <string>

@@ -190,13 +196,13 @@ static const char * LLM_TENSOR_FFN_DOWN      = "blk.%d.ffn_down";
 static const char * LLM_TENSOR_FFN_UP        = "blk.%d.ffn_up";

 static void print_params(struct my_llama_hparams * params) {
-    printf("%s: n_vocab               : %u\n", __func__, params->n_vocab);
-    printf("%s: n_ctx                 : %u\n", __func__, params->n_ctx);
-    printf("%s: n_embd                : %u\n", __func__, params->n_embd);
-    printf("%s: n_ff                  : %u\n", __func__, params->n_ff);
-    printf("%s: n_head                : %u\n", __func__, params->n_head);
-    printf("%s: n_head_kv             : %u\n", __func__, params->n_head_kv);
-    printf("%s: n_layer               : %u\n", __func__, params->n_layer);
+    printf("%s: n_vocab:   %u\n", __func__, params->n_vocab);
+    printf("%s: n_ctx:     %u\n", __func__, params->n_ctx);
+    printf("%s: n_embd:    %u\n", __func__, params->n_embd);
+    printf("%s: n_ff:      %u\n", __func__, params->n_ff);
+    printf("%s: n_head:    %u\n", __func__, params->n_head);
+    printf("%s: n_head_kv: %u\n", __func__, params->n_head_kv);
+    printf("%s: n_layer:   %u\n", __func__, params->n_layer);
    printf("%s: norm_rms_eps          : %f\n", __func__, params->f_norm_rms_eps);
    printf("%s: rope_freq_base        : %f\n", __func__, params->rope_freq_base);
    printf("%s: rope_freq_scale       : %f\n", __func__, params->rope_freq_scale);
@@ -263,7 +269,7 @@ static void load_model_hparams_gguf(struct gguf_context * ctx, struct my_llama_h
    float rope_freq_scale = 1.0f;
    GGUF_GET_KEY(ctx, hparams->f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
    GGUF_GET_KEY(ctx, hparams->rope_freq_base, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
-    GGUF_GET_KEY(ctx, rope_freq_scale,         gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR));
+    GGUF_GET_KEY(ctx, rope_freq_scale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR));
    if (rope_freq_scale != 1.0f) {
        hparams->rope_freq_scale = 1.0f / rope_freq_scale;
    }
@@ -542,35 +548,35 @@ static void randomize_lora(struct my_llama_lora * lora, int seed, float mean, fl
    struct random_normal_distribution * rnd = init_random_normal_distribution(seed, mean, std, min, max);

    randomize_tensor_normal(lora->tok_embeddings_a, rnd);
-    ggml_set_zero(lora->tok_embeddings_b);
+    randomize_tensor_normal(lora->tok_embeddings_b, rnd);
    randomize_tensor_normal(lora->norm_a,           rnd);
-    ggml_set_zero(lora->norm_b);
+    randomize_tensor_normal(lora->norm_b,           rnd);
    randomize_tensor_normal(lora->output_a,         rnd);
-    ggml_set_zero(lora->output_b);
+    randomize_tensor_normal(lora->output_b,         rnd);

    for (uint32_t i = 0; i < n_layer; ++i) {
        auto & layer = lora->layers[i];
        randomize_tensor_normal(layer.attention_norm_a, rnd);
-        ggml_set_zero(layer.attention_norm_b);
+        randomize_tensor_normal(layer.attention_norm_b, rnd);

        randomize_tensor_normal(layer.wq_a, rnd);
-        ggml_set_zero(layer.wq_b);
+        randomize_tensor_normal(layer.wq_b, rnd);
        randomize_tensor_normal(layer.wk_a, rnd);
-        ggml_set_zero(layer.wk_b);
+        randomize_tensor_normal(layer.wk_b, rnd);
        randomize_tensor_normal(layer.wv_a, rnd);
-        ggml_set_zero(layer.wv_b);
+        randomize_tensor_normal(layer.wv_b, rnd);
        randomize_tensor_normal(layer.wo_a, rnd);
-        ggml_set_zero(layer.wo_b);
+        randomize_tensor_normal(layer.wo_b, rnd);

        randomize_tensor_normal(layer.ffn_norm_a, rnd);
-        ggml_set_zero(layer.ffn_norm_b);
+        randomize_tensor_normal(layer.ffn_norm_b, rnd);

        randomize_tensor_normal(layer.w1_a, rnd);
-        ggml_set_zero(layer.w1_b);
+        randomize_tensor_normal(layer.w1_b, rnd);
        randomize_tensor_normal(layer.w2_a, rnd);
-        ggml_set_zero(layer.w2_b);
+        randomize_tensor_normal(layer.w2_b, rnd);
        randomize_tensor_normal(layer.w3_a, rnd);
-        ggml_set_zero(layer.w3_b);
+        randomize_tensor_normal(layer.w3_b, rnd);
    }

    free_random_normal_distribution(rnd);
@@ -606,7 +612,6 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
    const int n_rot       = hparams.n_embd_head();
    const int n_embd_head = hparams.n_embd_head();
    const int n_embd_gqa  = hparams.n_embd_gqa();
-
    const float rms_norm_eps    = hparams.f_norm_rms_eps;
    const float rope_freq_base  = hparams.rope_freq_base;
    const float rope_freq_scale = hparams.rope_freq_scale;
@@ -638,7 +643,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(

        return ggml_rope_custom(ctx,
            t, KQ_pos, n_rot, rope_mode, n_ctx, 0,
-            rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f
+            rope_freq_base, rope_freq_scale, 0.0f, 0.0f, 0.0f, 0.0f
        );
    };

@@ -675,7 +680,10 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
        checkpoints.push_back(t01);
    }

-    const float kv_scale = 1.0f/sqrtf(float(n_embd)/n_head);
+    struct ggml_tensor * kv_scale = NULL;
+    if (!enable_flash_attn) {
+        kv_scale = ggml_new_f32(ctx, 1.0f/sqrtf(float(n_embd)/n_head));
+    }

    for (int il = 0; il < n_layer; ++il) {
        struct my_llama_layer & layer = model->layers[il];
@@ -764,7 +772,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
    if (enable_checkpointing) {
        ggml_build_backward_gradient_checkpointing(ctx, gf, gb, gb_tmp, checkpoints.data(), (int) checkpoints.size());
    } else {
-        ggml_graph_cpy(gf, gb);
+        *gb = *gf;
        ggml_build_backward_expand(ctx, gf, gb, true);
    }

@@ -773,32 +781,32 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
    // make sure some tensors are not reallocated by inserting new temporary nodes depending on them
    int n_leafs_before = gb->n_leafs;
    int n_nodes_before = gb->n_nodes;
-
+    struct ggml_tensor * one = ggml_new_f32(ctx, 1.0f);
    // output tensors
-    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t35, 1.0f));
-    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36, 1.0f));
+    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t35, one));
+    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36, one));
    // input gradient
-    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36->grad, 1.0f));
+    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, t36->grad, one));
    GGML_ASSERT(t36->grad->data == NULL && t36->grad->view_src == NULL);
    ggml_allocr_alloc(alloc, t36->grad);
    // KQ_pos
-    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, KQ_pos, 1.0f));
+    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, KQ_pos, one));

    // make sure base model tensors data cannot be used in viewable operations
-    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->tok_embeddings, 1.0f));
-    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->norm, 1.0f));
-    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->output, 1.0f));
+    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->tok_embeddings, one));
+    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->norm, one));
+    ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, model->output, one));
    for (int il = 0; il < n_layer; ++il) {
        struct my_llama_layer & layer = model->layers[il];
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.attention_norm, 1.0f));
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_norm, 1.0f));
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wq, 1.0f));
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wk, 1.0f));
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wv, 1.0f));
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wo, 1.0f));
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w1, 1.0f));
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w2, 1.0f));
-        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w3, 1.0f));
+        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.attention_norm, one));
+        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.ffn_norm, one));
+        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wq, one));
+        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wk, one));
+        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wv, one));
+        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.wo, one));
+        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w1, one));
+        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w2, one));
+        ggml_build_forward_expand(gb, ggml_scale_inplace(ctx, layer.w3, one));
    }

    // allocating checkpoints in one block to reduce memory fragmentation
@@ -1102,7 +1110,7 @@ static void write_tensor(struct llama_file * file, struct ggml_tensor * tensor,
        name = ggml_get_name(tensor);
    }
    uint32_t name_len = strlen(name);
-    uint32_t nd = ggml_n_dims(tensor);
+    uint32_t nd = tensor->n_dims;
    uint32_t ne[4] = { (uint32_t)tensor->ne[0],
                       (uint32_t)tensor->ne[1],
                       (uint32_t)tensor->ne[2],
@@ -1452,6 +1460,17 @@ static bool train_params_parse(int argc, char ** argv, struct train_params * par
            }
            params->n_rank_w3 = std::stoi(argv[i]);
            params->custom_n_rank_w3 = true;
+        } else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
+            params->common.n_gpu_layers = std::stoi(argv[i]);
+#else
+            fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
+            fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
+#endif
        } else {
            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
            train_print_usage(argc, argv, &default_params);
@@ -1596,7 +1615,6 @@ int main(int argc, char ** argv) {
    opt->params = ggml_opt_default_params(GGML_OPT_ADAM);
    opt->params.print_forward_graph     = false;
    opt->params.print_backward_graph    = false;
-    opt->params.graph_size              = LLAMA_TRAIN_MAX_NODES;
    opt->params.n_threads               = params.common.n_threads;
    opt->params.past                    = params.common.opt_past;
    opt->params.delta                   = params.common.opt_delta;
@@ -1612,6 +1630,8 @@ int main(int argc, char ** argv) {
    opt->params.adam.gclip              = params.common.adam_gclip;
    opt->params.adam.eps_f              = params.common.adam_eps_f;

+    ggml_allocr * alloc = NULL;
+
    printf("%s: init model\n", __func__);
    bool existed = load_checkpoint_lora_file(params.common.fn_checkpoint_in, &model, &lora, train);

@@ -1715,14 +1735,17 @@ int main(int argc, char ** argv) {

    // allocate input tensors
    mem_input_data.resize(max_input_size);
-    ggml_allocr_t alloc_inps = ggml_allocr_new(mem_input_data.data(), mem_input_data.size(), tensor_alignment);
-    ggml_allocr_alloc(alloc_inps, tokens_input);
-    ggml_allocr_alloc(alloc_inps, target_probs);
+    alloc = ggml_allocr_new(mem_input_data.data(), mem_input_data.size(), tensor_alignment);
+    ggml_allocr_alloc(alloc, tokens_input);
+    ggml_allocr_alloc(alloc, target_probs);
+    ggml_allocr_free(alloc);

    // context for compute tensors without their data
-    const size_t estimated_compute_size_wo_data = (
-            2*LLAMA_TRAIN_MAX_NODES*ggml_tensor_overhead() +
-            (params.common.use_checkpointing ? 3 : 2)*(GGML_OBJECT_SIZE+ggml_graph_overhead_custom(LLAMA_TRAIN_MAX_NODES, true))
+    size_t estimated_compute_size_wo_data = (
+        ggml_tensor_overhead()*GGML_MAX_NODES*2
+      + (GGML_OBJECT_SIZE+GGML_GRAPH_SIZE)*(
+            params.common.use_checkpointing ? 3 : 2
+        )
    );
    struct ggml_init_params ctx_compute_params = {
        estimated_compute_size_wo_data, // mem_size
@@ -1744,12 +1767,12 @@ int main(int argc, char ** argv) {
    // find best evaluation order
    for (unsigned order = 0; order < (unsigned) GGML_CGRAPH_EVAL_ORDER_COUNT; ++order) {
        ctx_compute = ggml_init(ctx_compute_params);
-        ggml_allocr_t alloc = ggml_allocr_new_measure(tensor_alignment);
-        gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
+        alloc = ggml_allocr_new_measure(tensor_alignment);
+        gf = ggml_new_graph(ctx_compute);
        gf->order = (enum ggml_cgraph_eval_order) order;
-        gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
+        gb = ggml_new_graph(ctx_compute);
        gb_tmp = params.common.use_checkpointing
-            ? ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true)
+            ? ggml_new_graph(ctx_compute)
            : NULL;
        loss = llama_build_lora_finetune_graphs(
            &model, &lora, alloc, ctx_compute,
@@ -1777,12 +1800,12 @@ int main(int argc, char ** argv) {
    // allocate compute tensors
    mem_compute_data.resize(max_compute_size);
    ctx_compute = ggml_init(ctx_compute_params);
-    ggml_allocr_t alloc = ggml_allocr_new(mem_compute_data.data(), mem_compute_data.size(), tensor_alignment);
-    gf = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
+    alloc = ggml_allocr_new(mem_compute_data.data(), mem_compute_data.size(), tensor_alignment);
+    gf = ggml_new_graph(ctx_compute);
    gf->order = best_order;
-    gb = ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true);
+    gb = ggml_new_graph(ctx_compute);
    gb_tmp = params.common.use_checkpointing
-        ? ggml_new_graph_custom(ctx_compute, LLAMA_TRAIN_MAX_NODES, true)
+        ? ggml_new_graph(ctx_compute)
        : NULL;
    loss = llama_build_lora_finetune_graphs(
        &model, &lora, alloc, ctx_compute,
@@ -1793,8 +1816,6 @@ int main(int argc, char ** argv) {
        params.common.use_checkpointing
    );
    ggml_allocr_free(alloc);
-    ggml_allocr_free(alloc_inps);
-

    // tokenize data
    std::vector<llama_token> train_tokens;
--- a/examples/gguf/CMakeLists.txt
+++ b/examples/gguf/CMakeLists.txt
@@ -1,5 +1,5 @@
 set(TARGET gguf)
 add_executable(${TARGET} gguf.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- a/examples/gguf/gguf.cpp
+++ b/examples/gguf/gguf.cpp
@@ -1,4 +1,5 @@
 #include "ggml.h"
+#include "llama.h"

 #include <cstdio>
 #include <cinttypes>
@@ -194,7 +195,7 @@ static bool gguf_ex_read_1(const std::string & fname) {

            struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);

-            printf("%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, ggml_n_dims(cur), cur->name, cur->data);
+            printf("%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, cur->n_dims, cur->name, cur->data);

            // print first 10 elements
            const float * data = (const float *) cur->data;
--- a/examples/infill/infill.cpp
+++ b/examples/infill/infill.cpp
@@ -146,13 +146,6 @@ int main(int argc, char ** argv) {

        return 0;
    }
-    if (params.chatml) {
-        printf("\n************\n");
-        printf("%s: please use the 'main' tool for chatml mode\n", __func__);
-        printf("************\n\n");
-
-        return 0;
-    }
    if (!params.antiprompt.empty()) {
        printf("\n************\n");
        printf("%s: please use the 'main' tool for antiprompt mode\n", __func__);
@@ -237,7 +230,7 @@ int main(int argc, char ** argv) {
        LOG_TEE("\n");
        LOG_TEE("%s\n", get_system_info(params).c_str());
    }
-    const bool add_bos = llama_should_add_bos_token(model);
+    const bool add_bos = llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM;
    LOG("add_bos: %d\n", add_bos);

    bool suff_rm_leading_spc = params.escape;
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -53,13 +53,6 @@ static std::vector<T> split(const std::string & str, char delim) {
    return values;
 }

-template<typename T, typename F>
-static std::vector<std::string> transform_to_str(const std::vector<T> & values, F f) {
-    std::vector<std::string> str_values;
-    std::transform(values.begin(), values.end(), std::back_inserter(str_values), f);
-    return str_values;
-}
-
 template<typename T>
 static T avg(const std::vector<T> & v) {
    if (v.empty()) {
@@ -133,8 +126,7 @@ struct cmd_params {
    std::vector<int> n_prompt;
    std::vector<int> n_gen;
    std::vector<int> n_batch;
-    std::vector<ggml_type> type_k;
-    std::vector<ggml_type> type_v;
+    std::vector<bool> f32_kv;
    std::vector<int> n_threads;
    std::vector<int> n_gpu_layers;
    std::vector<int> main_gpu;
@@ -150,8 +142,7 @@ static const cmd_params cmd_params_defaults = {
    /* n_prompt      */ {512},
    /* n_gen         */ {128},
    /* n_batch       */ {512},
-    /* type_k        */ {GGML_TYPE_F16},
-    /* type_v        */ {GGML_TYPE_F16},
+    /* f32_kv        */ {false},
    /* n_threads     */ {get_num_physical_cores()},
    /* n_gpu_layers  */ {99},
    /* main_gpu      */ {0},
@@ -171,8 +162,7 @@ static void print_usage(int /* argc */, char ** argv) {
    printf("  -p, --n-prompt <n>                (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
    printf("  -n, --n-gen <n>                   (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
    printf("  -b, --batch-size <n>              (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
-    printf("  -ctk <t>, --cache-type-k <t>      (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
-    printf("  -ctv <t>, --cache-type-v <t>      (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
+    printf("  --memory-f32 <0|1>                (default: %s)\n", join(cmd_params_defaults.f32_kv, ",").c_str());
    printf("  -t, --threads <n>                 (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
    printf("  -ngl, --n-gpu-layers <n>          (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
    printf("  -mg, --main-gpu <i>               (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
@@ -183,32 +173,9 @@ static void print_usage(int /* argc */, char ** argv) {
    printf("  -v, --verbose                     (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
    printf("\n");
    printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");
+
 }

-static ggml_type ggml_type_from_name(const std::string & s) {
-    if (s == "f16") {
-        return GGML_TYPE_F16;
-    }
-    if (s == "q8_0") {
-        return GGML_TYPE_Q8_0;
-    }
-    if (s == "q4_0") {
-        return GGML_TYPE_Q4_0;
-    }
-    if (s == "q4_1") {
-        return GGML_TYPE_Q4_1;
-    }
-    if (s == "q5_0") {
-        return GGML_TYPE_Q5_0;
-    }
-    if (s == "q5_1") {
-        return GGML_TYPE_Q5_1;
-    }
-
-    return GGML_TYPE_COUNT;
-}
-
-
 static cmd_params parse_cmd_params(int argc, char ** argv) {
    cmd_params params;
    std::string arg;
@@ -257,38 +224,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
            }
            auto p = split<int>(argv[i], split_delim);
            params.n_batch.insert(params.n_batch.end(), p.begin(), p.end());
-        } else if (arg == "-ctk" || arg == "--cache-type-k") {
+        } else if (arg == "--memory-f32") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-            auto p = split<std::string>(argv[i], split_delim);
-            std::vector<ggml_type> types;
-            for (const auto & t : p) {
-                ggml_type gt = ggml_type_from_name(t);
-                if (gt == GGML_TYPE_COUNT) {
-                    invalid_param = true;
-                    break;
-                }
-                types.push_back(gt);
-            }
-            params.type_k.insert(params.type_k.end(), types.begin(), types.end());
-        } else if (arg == "-ctv" || arg == "--cache-type-v") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            auto p = split<std::string>(argv[i], split_delim);
-            std::vector<ggml_type> types;
-            for (const auto & t : p) {
-                ggml_type gt = ggml_type_from_name(t);
-                if (gt == GGML_TYPE_COUNT) {
-                    invalid_param = true;
-                    break;
-                }
-                types.push_back(gt);
-            }
-            params.type_v.insert(params.type_v.end(), types.begin(), types.end());
+            auto p = split<int>(argv[i], split_delim);
+            params.f32_kv.insert(params.f32_kv.end(), p.begin(), p.end());
        } else if (arg == "-t" || arg == "--threads") {
            if (++i >= argc) {
                invalid_param = true;
@@ -379,8 +321,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
    if (params.n_prompt.empty())     { params.n_prompt = cmd_params_defaults.n_prompt; }
    if (params.n_gen.empty())        { params.n_gen = cmd_params_defaults.n_gen; }
    if (params.n_batch.empty())      { params.n_batch = cmd_params_defaults.n_batch; }
-    if (params.type_k.empty())       { params.type_k = cmd_params_defaults.type_k; }
-    if (params.type_v.empty())       { params.type_v = cmd_params_defaults.type_v; }
+    if (params.f32_kv.empty())       { params.f32_kv = cmd_params_defaults.f32_kv; }
    if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; }
    if (params.main_gpu.empty())     { params.main_gpu = cmd_params_defaults.main_gpu; }
    if (params.mul_mat_q.empty())    { params.mul_mat_q = cmd_params_defaults.mul_mat_q; }
@@ -395,8 +336,7 @@ struct cmd_params_instance {
    int n_prompt;
    int n_gen;
    int n_batch;
-    ggml_type type_k;
-    ggml_type type_v;
+    bool f32_kv;
    int n_threads;
    int n_gpu_layers;
    int main_gpu;
@@ -425,8 +365,7 @@ struct cmd_params_instance {

        cparams.n_ctx = n_prompt + n_gen;
        cparams.n_batch = n_batch;
-        cparams.type_k = type_k;
-        cparams.type_v = type_v;
+        cparams.f16_kv = !f32_kv;
        cparams.mul_mat_q = mul_mat_q;

        return cparams;
@@ -441,8 +380,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances_int(const cmd_p
    for (const auto & mg : params.main_gpu)
    for (const auto & ts : params.tensor_split)
    for (const auto & nb : params.n_batch)
-    for (const auto & tk : params.type_k)
-    for (const auto & tv : params.type_v)
+    for (const auto & fk : params.f32_kv)
    for (const auto & mmq : params.mul_mat_q)
    for (const auto & nt : params.n_threads) {
        cmd_params_instance instance = {
@@ -450,8 +388,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances_int(const cmd_p
            /* .n_prompt     = */ n_prompt,
            /* .n_gen        = */ n_gen,
            /* .n_batch      = */ nb,
-            /* .type_k       = */ tk,
-            /* .type_v       = */ tv,
+            /* .f32_kv       = */ fk,
            /* .n_threads    = */ nt,
            /* .n_gpu_layers = */ nl,
            /* .main_gpu     = */ mg,
@@ -473,8 +410,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
    for (const auto & mg : params.main_gpu)
    for (const auto & ts : params.tensor_split)
    for (const auto & nb : params.n_batch)
-    for (const auto & tk : params.type_k)
-    for (const auto & tv : params.type_v)
+    for (const auto & fk : params.f32_kv)
    for (const auto & mmq : params.mul_mat_q)
    for (const auto & nt : params.n_threads) {
        for (const auto & n_prompt : params.n_prompt) {
@@ -486,8 +422,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                /* .n_prompt     = */ n_prompt,
                /* .n_gen        = */ 0,
                /* .n_batch      = */ nb,
-                /* .type_k       = */ tk,
-                /* .type_v       = */ tv,
+                /* .f32_kv       = */ fk,
                /* .n_threads    = */ nt,
                /* .n_gpu_layers = */ nl,
                /* .main_gpu     = */ mg,
@@ -506,8 +441,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                /* .n_prompt     = */ 0,
                /* .n_gen        = */ n_gen,
                /* .n_batch      = */ nb,
-                /* .type_k       = */ tk,
-                /* .type_v       = */ tv,
+                /* .f32_kv       = */ fk,
                /* .n_threads    = */ nt,
                /* .n_gpu_layers = */ nl,
                /* .main_gpu     = */ mg,
@@ -555,8 +489,7 @@ struct test {
    uint64_t model_n_params;
    int n_batch;
    int n_threads;
-    ggml_type type_k;
-    ggml_type type_v;
+    bool f32_kv;
    int n_gpu_layers;
    int main_gpu;
    bool mul_mat_q;
@@ -575,8 +508,7 @@ struct test {
        model_n_params = llama_model_n_params(lmodel);
        n_batch = inst.n_batch;
        n_threads = inst.n_threads;
-        type_k = inst.type_k;
-        type_v = inst.type_v;
+        f32_kv = inst.f32_kv;
        n_gpu_layers = inst.n_gpu_layers;
        main_gpu = inst.main_gpu;
        mul_mat_q = inst.mul_mat_q;
@@ -639,7 +571,7 @@ struct test {
            "cuda", "opencl", "metal", "gpu_blas", "blas",
            "cpu_info", "gpu_info",
            "model_filename", "model_type", "model_size", "model_n_params",
-            "n_batch", "n_threads", "type_k", "type_v",
+            "n_batch", "n_threads", "f16_kv",
            "n_gpu_layers", "main_gpu", "mul_mat_q", "tensor_split",
            "n_prompt", "n_gen", "test_time",
            "avg_ns", "stddev_ns",
@@ -689,7 +621,7 @@ struct test {
            std::to_string(cuda), std::to_string(opencl), std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas),
            cpu_info, gpu_info,
            model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
-            std::to_string(n_batch), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
+            std::to_string(n_batch), std::to_string(n_threads), std::to_string(!f32_kv),
            std::to_string(n_gpu_layers), std::to_string(main_gpu), std::to_string(mul_mat_q), tensor_split_str,
            std::to_string(n_prompt), std::to_string(n_gen), test_time,
            std::to_string(avg_ns()), std::to_string(stdev_ns()),
@@ -873,11 +805,8 @@ struct markdown_printer : public printer {
        if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) {
            fields.push_back("n_batch");
        }
-        if (params.type_k.size() > 1 || params.type_k != cmd_params_defaults.type_k) {
-            fields.push_back("type_k");
-        }
-        if (params.type_v.size() > 1 || params.type_v != cmd_params_defaults.type_v) {
-            fields.push_back("type_v");
+        if (params.f32_kv.size() > 1 || params.f32_kv != cmd_params_defaults.f32_kv) {
+            fields.push_back("f16_kv");
        }
        if (params.main_gpu.size() > 1 || params.main_gpu != cmd_params_defaults.main_gpu) {
            fields.push_back("main_gpu");
--- a/examples/llama.swiftui/.gitignore
+++ b/examples/llama.swiftui/.gitignore
@@ -1,2 +0,0 @@
-xcuserdata
-xcshareddata
--- a/examples/llama.swiftui/README.md
+++ b/examples/llama.swiftui/README.md
@@ -1,7 +0,0 @@
-# llama.swiftui
-
-Local inference of llama.cpp on an iPhone.
-So far I only tested with starcoder 1B model, but it can most likely handle 7B models as well.
-
-https://github.com/bachittle/llama.cpp/assets/39804642/e290827a-4edb-4093-9642-2a5e399ec545
-
--- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
+++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
@@ -1,339 +0,0 @@
-import Foundation
-import llama
-
-enum LlamaError: Error {
-    case couldNotInitializeContext
-}
-
-func llama_batch_clear(_ batch: inout llama_batch) {
-    batch.n_tokens = 0
-}
-
-func llama_batch_add(_ batch: inout llama_batch, _ id: llama_token, _ pos: llama_pos, _ seq_ids: [llama_seq_id], _ logits: Bool) {
-    batch.token   [Int(batch.n_tokens)] = id
-    batch.pos     [Int(batch.n_tokens)] = pos
-    batch.n_seq_id[Int(batch.n_tokens)] = Int32(seq_ids.count)
-    for i in 0..<seq_ids.count {
-        batch.seq_id[Int(batch.n_tokens)]![Int(i)] = seq_ids[i]
-    }
-    batch.logits  [Int(batch.n_tokens)] = logits ? 1 : 0
-
-    batch.n_tokens += 1
-}
-
-actor LlamaContext {
-    private var model: OpaquePointer
-    private var context: OpaquePointer
-    private var batch: llama_batch
-    private var tokens_list: [llama_token]
-
-    /// This variable is used to store temporarily invalid cchars
-    private var temporary_invalid_cchars: [CChar]
-
-    var n_len: Int32 = 64
-    var n_cur: Int32 = 0
-
-    var n_decode: Int32 = 0
-
-    init(model: OpaquePointer, context: OpaquePointer) {
-        self.model = model
-        self.context = context
-        self.tokens_list = []
-        self.batch = llama_batch_init(512, 0, 1)
-        self.temporary_invalid_cchars = []
-    }
-
-    deinit {
-        llama_batch_free(batch)
-        llama_free(context)
-        llama_free_model(model)
-        llama_backend_free()
-    }
-
-    static func create_context(path: String) throws -> LlamaContext {
-        llama_backend_init(false)
-        var model_params = llama_model_default_params()
-
-#if targetEnvironment(simulator)
-        model_params.n_gpu_layers = 0
-        print("Running on simulator, force use n_gpu_layers = 0")
-#endif
-        let model = llama_load_model_from_file(path, model_params)
-        guard let model else {
-            print("Could not load model at \(path)")
-            throw LlamaError.couldNotInitializeContext
-        }
-
-        let n_threads = max(1, min(8, ProcessInfo.processInfo.processorCount - 2))
-        print("Using \(n_threads) threads")
-
-        var ctx_params = llama_context_default_params()
-        ctx_params.seed  = 1234
-        ctx_params.n_ctx = 2048
-        ctx_params.n_threads       = UInt32(n_threads)
-        ctx_params.n_threads_batch = UInt32(n_threads)
-
-        let context = llama_new_context_with_model(model, ctx_params)
-        guard let context else {
-            print("Could not load context!")
-            throw LlamaError.couldNotInitializeContext
-        }
-
-        return LlamaContext(model: model, context: context)
-    }
-
-    func model_info() -> String {
-        let result = UnsafeMutablePointer<Int8>.allocate(capacity: 256)
-        result.initialize(repeating: Int8(0), count: 256)
-        defer {
-            result.deallocate()
-        }
-
-        // TODO: this is probably very stupid way to get the string from C
-
-        let nChars = llama_model_desc(model, result, 256)
-        let bufferPointer = UnsafeBufferPointer(start: result, count: Int(nChars))
-
-        var SwiftString = ""
-        for char in bufferPointer {
-            SwiftString.append(Character(UnicodeScalar(UInt8(char))))
-        }
-
-        return SwiftString
-    }
-
-    func get_n_tokens() -> Int32 {
-        return batch.n_tokens;
-    }
-
-    func completion_init(text: String) {
-        print("attempting to complete \"\(text)\"")
-
-        tokens_list = tokenize(text: text, add_bos: true)
-        temporary_invalid_cchars = []
-
-        let n_ctx = llama_n_ctx(context)
-        let n_kv_req = tokens_list.count + (Int(n_len) - tokens_list.count)
-
-        print("\n n_len = \(n_len), n_ctx = \(n_ctx), n_kv_req = \(n_kv_req)")
-
-        if n_kv_req > n_ctx {
-            print("error: n_kv_req > n_ctx, the required KV cache size is not big enough")
-        }
-
-        for id in tokens_list {
-            print(String(cString: token_to_piece(token: id) + [0]))
-        }
-
-        llama_batch_clear(&batch)
-
-        for i1 in 0..<tokens_list.count {
-            let i = Int(i1)
-            llama_batch_add(&batch, tokens_list[i], Int32(i), [0], false)
-        }
-        batch.logits[Int(batch.n_tokens) - 1] = 1 // true
-
-        if llama_decode(context, batch) != 0 {
-            print("llama_decode() failed")
-        }
-
-        n_cur = batch.n_tokens
-    }
-
-    func completion_loop() -> String {
-        var new_token_id: llama_token = 0
-
-        let n_vocab = llama_n_vocab(model)
-        let logits = llama_get_logits_ith(context, batch.n_tokens - 1)
-
-        var candidates = Array<llama_token_data>()
-        candidates.reserveCapacity(Int(n_vocab))
-
-        for token_id in 0..<n_vocab {
-            candidates.append(llama_token_data(id: token_id, logit: logits![Int(token_id)], p: 0.0))
-        }
-        candidates.withUnsafeMutableBufferPointer() { buffer in
-            var candidates_p = llama_token_data_array(data: buffer.baseAddress, size: buffer.count, sorted: false)
-
-            new_token_id = llama_sample_token_greedy(context, &candidates_p)
-        }
-
-        if new_token_id == llama_token_eos(model) || n_cur == n_len {
-            print("\n")
-            let new_token_str = String(cString: temporary_invalid_cchars + [0])
-            temporary_invalid_cchars.removeAll()
-            return new_token_str
-        }
-
-        let new_token_cchars = token_to_piece(token: new_token_id)
-        temporary_invalid_cchars.append(contentsOf: new_token_cchars)
-        let new_token_str: String
-        if let string = String(validatingUTF8: temporary_invalid_cchars + [0]) {
-            temporary_invalid_cchars.removeAll()
-            new_token_str = string
-        } else if (0 ..< temporary_invalid_cchars.count).contains(where: {$0 != 0 && String(validatingUTF8: Array(temporary_invalid_cchars.suffix($0)) + [0]) != nil}) {
-            // in this case, at least the suffix of the temporary_invalid_cchars can be interpreted as UTF8 string
-            let string = String(cString: temporary_invalid_cchars + [0])
-            temporary_invalid_cchars.removeAll()
-            new_token_str = string
-        } else {
-            new_token_str = ""
-        }
-        print(new_token_str)
-        // tokens_list.append(new_token_id)
-
-        llama_batch_clear(&batch)
-        llama_batch_add(&batch, new_token_id, n_cur, [0], true)
-
-        n_decode += 1
-        n_cur    += 1
-
-        if llama_decode(context, batch) != 0 {
-            print("failed to evaluate llama!")
-        }
-
-        return new_token_str
-    }
-
-    func bench(pp: Int, tg: Int, pl: Int, nr: Int = 1) -> String {
-        var pp_avg: Double = 0
-        var tg_avg: Double = 0
-
-        var pp_std: Double = 0
-        var tg_std: Double = 0
-
-        for _ in 0..<nr {
-            // bench prompt processing
-
-            llama_batch_clear(&batch)
-
-            let n_tokens = pp
-
-            for i in 0..<n_tokens {
-                llama_batch_add(&batch, 0, Int32(i), [0], false)
-            }
-            batch.logits[Int(batch.n_tokens) - 1] = 1 // true
-
-            llama_kv_cache_clear(context)
-
-            let t_pp_start = ggml_time_us()
-
-            if llama_decode(context, batch) != 0 {
-                print("llama_decode() failed during prompt")
-            }
-
-            let t_pp_end = ggml_time_us()
-
-            // bench text generation
-
-            llama_kv_cache_clear(context)
-
-            let t_tg_start = ggml_time_us()
-
-            for i in 0..<tg {
-                llama_batch_clear(&batch)
-
-                for j in 0..<pl {
-                    llama_batch_add(&batch, 0, Int32(i), [Int32(j)], true)
-                }
-
-                if llama_decode(context, batch) != 0 {
-                    print("llama_decode() failed during text generation")
-                }
-            }
-
-            let t_tg_end = ggml_time_us()
-
-            llama_kv_cache_clear(context)
-
-            let t_pp = Double(t_pp_end - t_pp_start) / 1000000.0
-            let t_tg = Double(t_tg_end - t_tg_start) / 1000000.0
-
-            let speed_pp = Double(pp)    / t_pp
-            let speed_tg = Double(pl*tg) / t_tg
-
-            pp_avg += speed_pp
-            tg_avg += speed_tg
-
-            pp_std += speed_pp * speed_pp
-            tg_std += speed_tg * speed_tg
-
-            print("pp \(speed_pp) t/s, tg \(speed_tg) t/s")
-        }
-
-        pp_avg /= Double(nr)
-        tg_avg /= Double(nr)
-
-        if nr > 1 {
-            pp_std = sqrt(pp_std / Double(nr - 1) - pp_avg * pp_avg * Double(nr) / Double(nr - 1))
-            tg_std = sqrt(tg_std / Double(nr - 1) - tg_avg * tg_avg * Double(nr) / Double(nr - 1))
-        } else {
-            pp_std = 0
-            tg_std = 0
-        }
-
-        let model_desc     = model_info();
-        let model_size     = String(format: "%.2f GiB", Double(llama_model_size(model)) / 1024.0 / 1024.0 / 1024.0);
-        let model_n_params = String(format: "%.2f B", Double(llama_model_n_params(model)) / 1e9);
-        let backend        = "Metal";
-        let pp_avg_str     = String(format: "%.2f", pp_avg);
-        let tg_avg_str     = String(format: "%.2f", tg_avg);
-        let pp_std_str     = String(format: "%.2f", pp_std);
-        let tg_std_str     = String(format: "%.2f", tg_std);
-
-        var result = ""
-
-        result += String("| model | size | params | backend | test | t/s |\n")
-        result += String("| --- | --- | --- | --- | --- | --- |\n")
-        result += String("| \(model_desc) | \(model_size) | \(model_n_params) | \(backend) | pp \(pp) | \(pp_avg_str) ± \(pp_std_str) |\n")
-        result += String("| \(model_desc) | \(model_size) | \(model_n_params) | \(backend) | tg \(tg) | \(tg_avg_str) ± \(tg_std_str) |\n")
-
-        return result;
-    }
-
-    func clear() {
-        tokens_list.removeAll()
-        temporary_invalid_cchars.removeAll()
-        llama_kv_cache_clear(context)
-    }
-
-    private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
-        let utf8Count = text.utf8.count
-        let n_tokens = utf8Count + (add_bos ? 1 : 0) + 1
-        let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
-        let tokenCount = llama_tokenize(model, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, false)
-
-        var swiftTokens: [llama_token] = []
-        for i in 0..<tokenCount {
-            swiftTokens.append(tokens[Int(i)])
-        }
-
-        tokens.deallocate()
-
-        return swiftTokens
-    }
-
-    /// - note: The result does not contain null-terminator
-    private func token_to_piece(token: llama_token) -> [CChar] {
-        let result = UnsafeMutablePointer<Int8>.allocate(capacity: 8)
-        result.initialize(repeating: Int8(0), count: 8)
-        defer {
-            result.deallocate()
-        }
-        let nTokens = llama_token_to_piece(model, token, result, 8)
-
-        if nTokens < 0 {
-            let newResult = UnsafeMutablePointer<Int8>.allocate(capacity: Int(-nTokens))
-            newResult.initialize(repeating: Int8(0), count: Int(-nTokens))
-            defer {
-                newResult.deallocate()
-            }
-            let nNewTokens = llama_token_to_piece(model, token, newResult, -nTokens)
-            let bufferPointer = UnsafeBufferPointer(start: newResult, count: Int(nNewTokens))
-            return Array(bufferPointer)
-        } else {
-            let bufferPointer = UnsafeBufferPointer(start: result, count: Int(nTokens))
-            return Array(bufferPointer)
-        }
-    }
-}
--- a/examples/llama.swiftui/llama.swiftui.xcodeproj/project.pbxproj
+++ b/examples/llama.swiftui/llama.swiftui.xcodeproj/project.pbxproj
@@ -1,435 +0,0 @@
-// !$*UTF8*$!
-{
-	archiveVersion = 1;
-	classes = {
-	};
-	objectVersion = 56;
-	objects = {
-
-/* Begin PBXBuildFile section */
-		549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 549479CA2AC9E16000E0F78B /* Metal.framework */; };
-		7FA3D2B32B2EA2F600543F92 /* DownloadButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = 7FA3D2B22B2EA2F600543F92 /* DownloadButton.swift */; };
-		8A1C83772AC328BD0096AF73 /* llama_swiftuiApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */; };
-		8A1C83792AC328BD0096AF73 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A1C83782AC328BD0096AF73 /* ContentView.swift */; };
-		8A1C837B2AC328BE0096AF73 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 8A1C837A2AC328BE0096AF73 /* Assets.xcassets */; };
-		8A39BE0A2AC7601100BFEB40 /* Accelerate.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 8A39BE092AC7601000BFEB40 /* Accelerate.framework */; };
-		8A3F84242AC4C891005E2EE8 /* models in Resources */ = {isa = PBXBuildFile; fileRef = 8A3F84232AC4C891005E2EE8 /* models */; };
-		8A907F332AC7138A006146EA /* LibLlama.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A907F322AC7134E006146EA /* LibLlama.swift */; };
-		8A9F7C4D2AC332EE008AE1EA /* LlamaState.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */; };
-		DF810E132B4A5BA200301144 /* llama in Frameworks */ = {isa = PBXBuildFile; productRef = DF810E122B4A5BA200301144 /* llama */; };
-		F1FE20E22B465ECA00B45541 /* LoadCustomButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = F1FE20E12B465EC900B45541 /* LoadCustomButton.swift */; };
-/* End PBXBuildFile section */
-
-/* Begin PBXFileReference section */
-		549479CA2AC9E16000E0F78B /* Metal.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Metal.framework; path = System/Library/Frameworks/Metal.framework; sourceTree = SDKROOT; };
-		7FA3D2B22B2EA2F600543F92 /* DownloadButton.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = DownloadButton.swift; sourceTree = "<group>"; };
-		8A1C83732AC328BD0096AF73 /* llama.swiftui.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = llama.swiftui.app; sourceTree = BUILT_PRODUCTS_DIR; };
-		8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = llama_swiftuiApp.swift; sourceTree = "<group>"; };
-		8A1C83782AC328BD0096AF73 /* ContentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ContentView.swift; sourceTree = "<group>"; };
-		8A1C837A2AC328BE0096AF73 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = "<group>"; };
-		8A39BE092AC7601000BFEB40 /* Accelerate.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Accelerate.framework; path = System/Library/Frameworks/Accelerate.framework; sourceTree = SDKROOT; };
-		8A3F84232AC4C891005E2EE8 /* models */ = {isa = PBXFileReference; lastKnownFileType = folder; name = models; path = llama.swiftui/Resources/models; sourceTree = "<group>"; };
-		8A907F322AC7134E006146EA /* LibLlama.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LibLlama.swift; sourceTree = "<group>"; };
-		8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LlamaState.swift; sourceTree = "<group>"; };
-		DF2D2FE72B4A59BE00FCB72D /* llama.cpp */ = {isa = PBXFileReference; lastKnownFileType = wrapper; name = llama.cpp; path = ../..; sourceTree = "<group>"; };
-		F1FE20E12B465EC900B45541 /* LoadCustomButton.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LoadCustomButton.swift; sourceTree = "<group>"; };
-/* End PBXFileReference section */
-
-/* Begin PBXFrameworksBuildPhase section */
-		8A1C83702AC328BD0096AF73 /* Frameworks */ = {
-			isa = PBXFrameworksBuildPhase;
-			buildActionMask = 2147483647;
-			files = (
-				DF810E132B4A5BA200301144 /* llama in Frameworks */,
-				549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */,
-				8A39BE0A2AC7601100BFEB40 /* Accelerate.framework in Frameworks */,
-			);
-			runOnlyForDeploymentPostprocessing = 0;
-		};
-/* End PBXFrameworksBuildPhase section */
-
-/* Begin PBXGroup section */
-		8A1C836A2AC328BD0096AF73 = {
-			isa = PBXGroup;
-			children = (
-				DF2D2FE72B4A59BE00FCB72D /* llama.cpp */,
-				8A907F312AC7134E006146EA /* llama.cpp.swift */,
-				8A3F84232AC4C891005E2EE8 /* models */,
-				8A1C83752AC328BD0096AF73 /* llama.swiftui */,
-				8A1C83742AC328BD0096AF73 /* Products */,
-				8A39BE082AC7601000BFEB40 /* Frameworks */,
-			);
-			sourceTree = "<group>";
-		};
-		8A1C83742AC328BD0096AF73 /* Products */ = {
-			isa = PBXGroup;
-			children = (
-				8A1C83732AC328BD0096AF73 /* llama.swiftui.app */,
-			);
-			name = Products;
-			sourceTree = "<group>";
-		};
-		8A1C83752AC328BD0096AF73 /* llama.swiftui */ = {
-			isa = PBXGroup;
-			children = (
-				8A3F84102AC4BD85005E2EE8 /* Resources */,
-				8A9F7C4B2AC332DC008AE1EA /* Models */,
-				8A9F7C4A2AC332BF008AE1EA /* UI */,
-				8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */,
-				8A1C837A2AC328BE0096AF73 /* Assets.xcassets */,
-			);
-			path = llama.swiftui;
-			sourceTree = "<group>";
-		};
-		8A39BE082AC7601000BFEB40 /* Frameworks */ = {
-			isa = PBXGroup;
-			children = (
-				549479CA2AC9E16000E0F78B /* Metal.framework */,
-				8A39BE092AC7601000BFEB40 /* Accelerate.framework */,
-			);
-			name = Frameworks;
-			sourceTree = "<group>";
-		};
-		8A3F84102AC4BD85005E2EE8 /* Resources */ = {
-			isa = PBXGroup;
-			children = (
-				8A3F84112AC4BD8C005E2EE8 /* models */,
-			);
-			path = Resources;
-			sourceTree = "<group>";
-		};
-		8A3F84112AC4BD8C005E2EE8 /* models */ = {
-			isa = PBXGroup;
-			children = (
-			);
-			path = models;
-			sourceTree = "<group>";
-		};
-		8A907F312AC7134E006146EA /* llama.cpp.swift */ = {
-			isa = PBXGroup;
-			children = (
-				8A907F322AC7134E006146EA /* LibLlama.swift */,
-			);
-			path = llama.cpp.swift;
-			sourceTree = "<group>";
-		};
-		8A9F7C4A2AC332BF008AE1EA /* UI */ = {
-			isa = PBXGroup;
-			children = (
-				7FA3D2B22B2EA2F600543F92 /* DownloadButton.swift */,
-				8A1C83782AC328BD0096AF73 /* ContentView.swift */,
-				F1FE20E12B465EC900B45541 /* LoadCustomButton.swift */,
-			);
-			path = UI;
-			sourceTree = "<group>";
-		};
-		8A9F7C4B2AC332DC008AE1EA /* Models */ = {
-			isa = PBXGroup;
-			children = (
-				8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */,
-			);
-			path = Models;
-			sourceTree = "<group>";
-		};
-/* End PBXGroup section */
-
-/* Begin PBXNativeTarget section */
-		8A1C83722AC328BD0096AF73 /* llama.swiftui */ = {
-			isa = PBXNativeTarget;
-			buildConfigurationList = 8A1C83812AC328BE0096AF73 /* Build configuration list for PBXNativeTarget "llama.swiftui" */;
-			buildPhases = (
-				8A1C836F2AC328BD0096AF73 /* Sources */,
-				8A1C83702AC328BD0096AF73 /* Frameworks */,
-				8A1C83712AC328BD0096AF73 /* Resources */,
-			);
-			buildRules = (
-			);
-			dependencies = (
-			);
-			name = llama.swiftui;
-			packageProductDependencies = (
-				DF810E122B4A5BA200301144 /* llama */,
-			);
-			productName = llama.swiftui;
-			productReference = 8A1C83732AC328BD0096AF73 /* llama.swiftui.app */;
-			productType = "com.apple.product-type.application";
-		};
-/* End PBXNativeTarget section */
-
-/* Begin PBXProject section */
-		8A1C836B2AC328BD0096AF73 /* Project object */ = {
-			isa = PBXProject;
-			attributes = {
-				BuildIndependentTargetsInParallel = 1;
-				LastSwiftUpdateCheck = 1500;
-				LastUpgradeCheck = 1500;
-				TargetAttributes = {
-					8A1C83722AC328BD0096AF73 = {
-						CreatedOnToolsVersion = 15.0;
-						LastSwiftMigration = 1500;
-					};
-				};
-			};
-			buildConfigurationList = 8A1C836E2AC328BD0096AF73 /* Build configuration list for PBXProject "llama.swiftui" */;
-			compatibilityVersion = "Xcode 14.0";
-			developmentRegion = en;
-			hasScannedForEncodings = 0;
-			knownRegions = (
-				en,
-				Base,
-			);
-			mainGroup = 8A1C836A2AC328BD0096AF73;
-			packageReferences = (
-			);
-			productRefGroup = 8A1C83742AC328BD0096AF73 /* Products */;
-			projectDirPath = "";
-			projectRoot = "";
-			targets = (
-				8A1C83722AC328BD0096AF73 /* llama.swiftui */,
-			);
-		};
-/* End PBXProject section */
-
-/* Begin PBXResourcesBuildPhase section */
-		8A1C83712AC328BD0096AF73 /* Resources */ = {
-			isa = PBXResourcesBuildPhase;
-			buildActionMask = 2147483647;
-			files = (
-				8A3F84242AC4C891005E2EE8 /* models in Resources */,
-				8A1C837B2AC328BE0096AF73 /* Assets.xcassets in Resources */,
-			);
-			runOnlyForDeploymentPostprocessing = 0;
-		};
-/* End PBXResourcesBuildPhase section */
-
-/* Begin PBXSourcesBuildPhase section */
-		8A1C836F2AC328BD0096AF73 /* Sources */ = {
-			isa = PBXSourcesBuildPhase;
-			buildActionMask = 2147483647;
-			files = (
-				F1FE20E22B465ECA00B45541 /* LoadCustomButton.swift in Sources */,
-				8A907F332AC7138A006146EA /* LibLlama.swift in Sources */,
-				8A9F7C4D2AC332EE008AE1EA /* LlamaState.swift in Sources */,
-				8A1C83792AC328BD0096AF73 /* ContentView.swift in Sources */,
-				8A1C83772AC328BD0096AF73 /* llama_swiftuiApp.swift in Sources */,
-				7FA3D2B32B2EA2F600543F92 /* DownloadButton.swift in Sources */,
-			);
-			runOnlyForDeploymentPostprocessing = 0;
-		};
-/* End PBXSourcesBuildPhase section */
-
-/* Begin XCBuildConfiguration section */
-		8A1C837F2AC328BE0096AF73 /* Debug */ = {
-			isa = XCBuildConfiguration;
-			buildSettings = {
-				ALWAYS_SEARCH_USER_PATHS = NO;
-				ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES;
-				CLANG_ANALYZER_NONNULL = YES;
-				CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
-				CLANG_CXX_LANGUAGE_STANDARD = "gnu++20";
-				CLANG_ENABLE_MODULES = YES;
-				CLANG_ENABLE_OBJC_ARC = YES;
-				CLANG_ENABLE_OBJC_WEAK = YES;
-				CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;
-				CLANG_WARN_BOOL_CONVERSION = YES;
-				CLANG_WARN_COMMA = YES;
-				CLANG_WARN_CONSTANT_CONVERSION = YES;
-				CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES;
-				CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
-				CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
-				CLANG_WARN_EMPTY_BODY = YES;
-				CLANG_WARN_ENUM_CONVERSION = YES;
-				CLANG_WARN_INFINITE_RECURSION = YES;
-				CLANG_WARN_INT_CONVERSION = YES;
-				CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;
-				CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES;
-				CLANG_WARN_OBJC_LITERAL_CONVERSION = YES;
-				CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
-				CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES;
-				CLANG_WARN_RANGE_LOOP_ANALYSIS = YES;
-				CLANG_WARN_STRICT_PROTOTYPES = YES;
-				CLANG_WARN_SUSPICIOUS_MOVE = YES;
-				CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE;
-				CLANG_WARN_UNREACHABLE_CODE = YES;
-				CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
-				COPY_PHASE_STRIP = NO;
-				DEBUG_INFORMATION_FORMAT = dwarf;
-				ENABLE_STRICT_OBJC_MSGSEND = YES;
-				ENABLE_TESTABILITY = YES;
-				ENABLE_USER_SCRIPT_SANDBOXING = YES;
-				GCC_C_LANGUAGE_STANDARD = gnu17;
-				GCC_DYNAMIC_NO_PIC = NO;
-				GCC_NO_COMMON_BLOCKS = YES;
-				GCC_OPTIMIZATION_LEVEL = 0;
-				GCC_PREPROCESSOR_DEFINITIONS = (
-					"DEBUG=1",
-					"$(inherited)",
-				);
-				GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
-				GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
-				GCC_WARN_UNDECLARED_SELECTOR = YES;
-				GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
-				GCC_WARN_UNUSED_FUNCTION = YES;
-				GCC_WARN_UNUSED_VARIABLE = YES;
-				IPHONEOS_DEPLOYMENT_TARGET = 17.0;
-				LOCALIZATION_PREFERS_STRING_CATALOGS = YES;
-				MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE;
-				MTL_FAST_MATH = YES;
-				ONLY_ACTIVE_ARCH = YES;
-				SDKROOT = iphoneos;
-				SWIFT_ACTIVE_COMPILATION_CONDITIONS = "DEBUG $(inherited)";
-				SWIFT_OPTIMIZATION_LEVEL = "-Onone";
-			};
-			name = Debug;
-		};
-		8A1C83802AC328BE0096AF73 /* Release */ = {
-			isa = XCBuildConfiguration;
-			buildSettings = {
-				ALWAYS_SEARCH_USER_PATHS = NO;
-				ASSETCATALOG_COMPILER_GENERATE_SWIFT_ASSET_SYMBOL_EXTENSIONS = YES;
-				CLANG_ANALYZER_NONNULL = YES;
-				CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
-				CLANG_CXX_LANGUAGE_STANDARD = "gnu++20";
-				CLANG_ENABLE_MODULES = YES;
-				CLANG_ENABLE_OBJC_ARC = YES;
-				CLANG_ENABLE_OBJC_WEAK = YES;
-				CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;
-				CLANG_WARN_BOOL_CONVERSION = YES;
-				CLANG_WARN_COMMA = YES;
-				CLANG_WARN_CONSTANT_CONVERSION = YES;
-				CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES;
-				CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
-				CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
-				CLANG_WARN_EMPTY_BODY = YES;
-				CLANG_WARN_ENUM_CONVERSION = YES;
-				CLANG_WARN_INFINITE_RECURSION = YES;
-				CLANG_WARN_INT_CONVERSION = YES;
-				CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;
-				CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES;
-				CLANG_WARN_OBJC_LITERAL_CONVERSION = YES;
-				CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
-				CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES;
-				CLANG_WARN_RANGE_LOOP_ANALYSIS = YES;
-				CLANG_WARN_STRICT_PROTOTYPES = YES;
-				CLANG_WARN_SUSPICIOUS_MOVE = YES;
-				CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE;
-				CLANG_WARN_UNREACHABLE_CODE = YES;
-				CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
-				COPY_PHASE_STRIP = NO;
-				DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym";
-				ENABLE_NS_ASSERTIONS = NO;
-				ENABLE_STRICT_OBJC_MSGSEND = YES;
-				ENABLE_USER_SCRIPT_SANDBOXING = YES;
-				GCC_C_LANGUAGE_STANDARD = gnu17;
-				GCC_NO_COMMON_BLOCKS = YES;
-				GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
-				GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
-				GCC_WARN_UNDECLARED_SELECTOR = YES;
-				GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
-				GCC_WARN_UNUSED_FUNCTION = YES;
-				GCC_WARN_UNUSED_VARIABLE = YES;
-				IPHONEOS_DEPLOYMENT_TARGET = 17.0;
-				LOCALIZATION_PREFERS_STRING_CATALOGS = YES;
-				MTL_ENABLE_DEBUG_INFO = NO;
-				MTL_FAST_MATH = YES;
-				SDKROOT = iphoneos;
-				SWIFT_COMPILATION_MODE = wholemodule;
-				VALIDATE_PRODUCT = YES;
-			};
-			name = Release;
-		};
-		8A1C83822AC328BE0096AF73 /* Debug */ = {
-			isa = XCBuildConfiguration;
-			buildSettings = {
-				ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
-				CLANG_ENABLE_MODULES = YES;
-				CODE_SIGN_STYLE = Automatic;
-				CURRENT_PROJECT_VERSION = 1;
-				DEVELOPMENT_TEAM = STLSG3FG8Q;
-				ENABLE_PREVIEWS = YES;
-				GENERATE_INFOPLIST_FILE = YES;
-				INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES;
-				INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES;
-				INFOPLIST_KEY_UILaunchScreen_Generation = YES;
-				INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
-				INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
-				IPHONEOS_DEPLOYMENT_TARGET = 16.0;
-				LD_RUNPATH_SEARCH_PATHS = (
-					"$(inherited)",
-					"@executable_path/Frameworks",
-				);
-				MARKETING_VERSION = 1.0;
-				PRODUCT_BUNDLE_IDENTIFIER = "com.bachittle.llama-swift";
-				PRODUCT_NAME = "$(TARGET_NAME)";
-				SUPPORTED_PLATFORMS = "iphoneos iphonesimulator xros xrsimulator";
-				SUPPORTS_XR_DESIGNED_FOR_IPHONE_IPAD = NO;
-				SWIFT_EMIT_LOC_STRINGS = YES;
-				SWIFT_OPTIMIZATION_LEVEL = "-Onone";
-				SWIFT_VERSION = 5.0;
-				TARGETED_DEVICE_FAMILY = "1,2,7";
-			};
-			name = Debug;
-		};
-		8A1C83832AC328BE0096AF73 /* Release */ = {
-			isa = XCBuildConfiguration;
-			buildSettings = {
-				ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
-				CLANG_ENABLE_MODULES = YES;
-				CODE_SIGN_STYLE = Automatic;
-				CURRENT_PROJECT_VERSION = 1;
-				DEVELOPMENT_TEAM = STLSG3FG8Q;
-				ENABLE_PREVIEWS = YES;
-				GENERATE_INFOPLIST_FILE = YES;
-				INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES;
-				INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES;
-				INFOPLIST_KEY_UILaunchScreen_Generation = YES;
-				INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
-				INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
-				IPHONEOS_DEPLOYMENT_TARGET = 16.0;
-				LD_RUNPATH_SEARCH_PATHS = (
-					"$(inherited)",
-					"@executable_path/Frameworks",
-				);
-				MARKETING_VERSION = 1.0;
-				PRODUCT_BUNDLE_IDENTIFIER = "com.bachittle.llama-swift";
-				PRODUCT_NAME = "$(TARGET_NAME)";
-				SUPPORTED_PLATFORMS = "iphoneos iphonesimulator xros xrsimulator";
-				SUPPORTS_XR_DESIGNED_FOR_IPHONE_IPAD = NO;
-				SWIFT_EMIT_LOC_STRINGS = YES;
-				SWIFT_VERSION = 5.0;
-				TARGETED_DEVICE_FAMILY = "1,2,7";
-			};
-			name = Release;
-		};
-/* End XCBuildConfiguration section */
-
-/* Begin XCConfigurationList section */
-		8A1C836E2AC328BD0096AF73 /* Build configuration list for PBXProject "llama.swiftui" */ = {
-			isa = XCConfigurationList;
-			buildConfigurations = (
-				8A1C837F2AC328BE0096AF73 /* Debug */,
-				8A1C83802AC328BE0096AF73 /* Release */,
-			);
-			defaultConfigurationIsVisible = 0;
-			defaultConfigurationName = Release;
-		};
-		8A1C83812AC328BE0096AF73 /* Build configuration list for PBXNativeTarget "llama.swiftui" */ = {
-			isa = XCConfigurationList;
-			buildConfigurations = (
-				8A1C83822AC328BE0096AF73 /* Debug */,
-				8A1C83832AC328BE0096AF73 /* Release */,
-			);
-			defaultConfigurationIsVisible = 0;
-			defaultConfigurationName = Release;
-		};
-/* End XCConfigurationList section */
-
-/* Begin XCSwiftPackageProductDependency section */
-		DF810E122B4A5BA200301144 /* llama */ = {
-			isa = XCSwiftPackageProductDependency;
-			productName = llama;
-		};
-/* End XCSwiftPackageProductDependency section */
-	};
-	rootObject = 8A1C836B2AC328BD0096AF73 /* Project object */;
-}
--- a/examples/llama.swiftui/llama.swiftui.xcodeproj/project.xcworkspace/contents.xcworkspacedata
+++ b/examples/llama.swiftui/llama.swiftui.xcodeproj/project.xcworkspace/contents.xcworkspacedata
@@ -1,7 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<Workspace
-   version = "1.0">
-   <FileRef
-      location = "self:">
-   </FileRef>
-</Workspace>
--- a/examples/llama.swiftui/llama.swiftui.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist
+++ b/examples/llama.swiftui/llama.swiftui.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist
@@ -1,8 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
-<plist version="1.0">
-<dict>
-    <key>IDEDidComputeMac32BitWarning</key>
-    <true/>
-</dict>
-</plist>
--- a/examples/llama.swiftui/llama.swiftui/Assets.xcassets/AppIcon.appiconset/Contents.json
+++ b/examples/llama.swiftui/llama.swiftui/Assets.xcassets/AppIcon.appiconset/Contents.json
@@ -1,13 +0,0 @@
-{
-  "images" : [
-    {
-      "idiom" : "universal",
-      "platform" : "ios",
-      "size" : "1024x1024"
-    }
-  ],
-  "info" : {
-    "author" : "xcode",
-    "version" : 1
-  }
-}
--- a/examples/llama.swiftui/llama.swiftui/Assets.xcassets/Contents.json
+++ b/examples/llama.swiftui/llama.swiftui/Assets.xcassets/Contents.json
@@ -1,6 +0,0 @@
-{
-  "info" : {
-    "author" : "xcode",
-    "version" : 1
-  }
-}
--- a/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift
+++ b/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift
@@ -1,100 +0,0 @@
-import Foundation
-
-@MainActor
-class LlamaState: ObservableObject {
-    @Published var messageLog = ""
-    @Published var cacheCleared = false
-    let NS_PER_S = 1_000_000_000.0
-
-    private var llamaContext: LlamaContext?
-    private var defaultModelUrl: URL? {
-        Bundle.main.url(forResource: "ggml-model", withExtension: "gguf", subdirectory: "models")
-        // Bundle.main.url(forResource: "llama-2-7b-chat", withExtension: "Q2_K.gguf", subdirectory: "models")
-    }
-
-    init() {
-        do {
-            try loadModel(modelUrl: defaultModelUrl)
-        } catch {
-            messageLog += "Error!\n"
-        }
-    }
-
-    func loadModel(modelUrl: URL?) throws {
-        if let modelUrl {
-            messageLog += "Loading model...\n"
-            llamaContext = try LlamaContext.create_context(path: modelUrl.path())
-            messageLog += "Loaded model \(modelUrl.lastPathComponent)\n"
-        } else {
-            messageLog += "Load a model from the list below\n"
-        }
-    }
-
-    func complete(text: String) async {
-        guard let llamaContext else {
-            return
-        }
-
-        let t_start = DispatchTime.now().uptimeNanoseconds
-        await llamaContext.completion_init(text: text)
-        let t_heat_end = DispatchTime.now().uptimeNanoseconds
-        let t_heat = Double(t_heat_end - t_start) / NS_PER_S
-
-        messageLog += "\(text)"
-
-        while await llamaContext.n_cur < llamaContext.n_len {
-            let result = await llamaContext.completion_loop()
-            messageLog += "\(result)"
-        }
-
-        let t_end = DispatchTime.now().uptimeNanoseconds
-        let t_generation = Double(t_end - t_heat_end) / NS_PER_S
-        let tokens_per_second = Double(await llamaContext.n_len) / t_generation
-
-        await llamaContext.clear()
-        messageLog += """
-            \n
-            Done
-            Heat up took \(t_heat)s
-            Generated \(tokens_per_second) t/s\n
-            """
-    }
-
-    func bench() async {
-        guard let llamaContext else {
-            return
-        }
-
-        messageLog += "\n"
-        messageLog += "Running benchmark...\n"
-        messageLog += "Model info: "
-        messageLog += await llamaContext.model_info() + "\n"
-
-        let t_start = DispatchTime.now().uptimeNanoseconds
-        let _ = await llamaContext.bench(pp: 8, tg: 4, pl: 1) // heat up
-        let t_end = DispatchTime.now().uptimeNanoseconds
-
-        let t_heat = Double(t_end - t_start) / NS_PER_S
-        messageLog += "Heat up time: \(t_heat) seconds, please wait...\n"
-
-        // if more than 5 seconds, then we're probably running on a slow device
-        if t_heat > 5.0 {
-            messageLog += "Heat up time is too long, aborting benchmark\n"
-            return
-        }
-
-        let result = await llamaContext.bench(pp: 512, tg: 128, pl: 1, nr: 3)
-
-        messageLog += "\(result)"
-        messageLog += "\n"
-    }
-
-    func clear() async {
-        guard let llamaContext else {
-            return
-        }
-
-        await llamaContext.clear()
-        messageLog = ""
-    }
-}
--- a/examples/llama.swiftui/llama.swiftui/Resources/models/.gitignore
+++ b/examples/llama.swiftui/llama.swiftui/Resources/models/.gitignore
--- a/examples/llama.swiftui/llama.swiftui/UI/ContentView.swift
+++ b/examples/llama.swiftui/llama.swiftui/UI/ContentView.swift
@@ -1,138 +0,0 @@
-import SwiftUI
-
-struct ContentView: View {
-    @StateObject var llamaState = LlamaState()
-
-    @State private var multiLineText = ""
-
-    private static func cleanupModelCaches() {
-        // Delete all models (*.gguf)
-        let fileManager = FileManager.default
-        let documentsUrl =  FileManager.default.urls(for: .documentDirectory, in: .userDomainMask)[0]
-        do {
-            let fileURLs = try fileManager.contentsOfDirectory(at: documentsUrl, includingPropertiesForKeys: nil)
-            for fileURL in fileURLs {
-                if fileURL.pathExtension == "gguf" {
-                    try fileManager.removeItem(at: fileURL)
-                }
-            }
-        } catch {
-            print("Error while enumerating files \(documentsUrl.path): \(error.localizedDescription)")
-        }
-    }
-
-    var body: some View {
-        VStack {
-            ScrollView(.vertical, showsIndicators: true) {
-                Text(llamaState.messageLog)
-                .font(.system(size: 12))
-                .frame(maxWidth: .infinity, alignment: .leading)
-                .padding()
-                .onTapGesture {
-                    UIApplication.shared.sendAction(#selector(UIResponder.resignFirstResponder), to: nil, from: nil, for: nil)
-                }
-            }
-
-            TextEditor(text: $multiLineText)
-                .frame(height: 80)
-                .padding()
-                .border(Color.gray, width: 0.5)
-
-            HStack {
-                Button("Send") {
-                    sendText()
-                }
-
-                Button("Bench") {
-                    bench()
-                }
-
-                Button("Clear") {
-                    clear()
-                }
-
-                Button("Copy") {
-                    UIPasteboard.general.string = llamaState.messageLog
-                }
-            }.buttonStyle(.bordered)
-
-            VStack(alignment: .leading) {
-                DownloadButton(
-                    llamaState: llamaState,
-                    modelName: "TinyLlama-1.1B (Q4_0, 0.6 GiB)",
-                    modelUrl: "https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/resolve/main/tinyllama-1.1b-1t-openorca.Q4_0.gguf?download=true",
-                    filename: "tinyllama-1.1b-1t-openorca.Q4_0.gguf"
-                )
-
-                DownloadButton(
-                    llamaState: llamaState,
-                    modelName: "TinyLlama-1.1B (Q8_0, 1.1 GiB)",
-                    modelUrl: "https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/resolve/main/tinyllama-1.1b-1t-openorca.Q8_0.gguf?download=true",
-                    filename: "tinyllama-1.1b-1t-openorca.Q8_0.gguf"
-                )
-
-                DownloadButton(
-                    llamaState: llamaState,
-                    modelName: "TinyLlama-1.1B (F16, 2.2 GiB)",
-                    modelUrl: "https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf?download=true",
-                    filename: "tinyllama-1.1b-f16.gguf"
-                )
-
-                DownloadButton(
-                    llamaState: llamaState,
-                    modelName: "Phi-2.7B (Q4_0, 1.6 GiB)",
-                    modelUrl: "https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf?download=true",
-                    filename: "phi-2-q4_0.gguf"
-                )
-
-                DownloadButton(
-                    llamaState: llamaState,
-                    modelName: "Phi-2.7B (Q8_0, 2.8 GiB)",
-                    modelUrl: "https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q8_0.gguf?download=true",
-                    filename: "phi-2-q8_0.gguf"
-                )
-
-                DownloadButton(
-                    llamaState: llamaState,
-                    modelName: "Mistral-7B-v0.1 (Q4_0, 3.8 GiB)",
-                    modelUrl: "https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF/resolve/main/mistral-7b-v0.1.Q4_0.gguf?download=true",
-                    filename: "mistral-7b-v0.1.Q4_0.gguf"
-                )
-
-                Button("Clear downloaded models") {
-                    ContentView.cleanupModelCaches()
-                    llamaState.cacheCleared = true
-                }
-
-                LoadCustomButton(llamaState: llamaState)
-            }
-            .padding(.top, 4)
-            .font(.system(size: 12))
-            .frame(maxWidth: .infinity, alignment: .leading)
-        }
-        .padding()
-    }
-
-    func sendText() {
-        Task {
-            await llamaState.complete(text: multiLineText)
-            multiLineText = ""
-        }
-    }
-
-    func bench() {
-        Task {
-            await llamaState.bench()
-        }
-    }
-
-    func clear() {
-        Task {
-            await llamaState.clear()
-        }
-    }
-}
-
-//#Preview {
-//    ContentView()
-//}
--- a/examples/llama.swiftui/llama.swiftui/UI/DownloadButton.swift
+++ b/examples/llama.swiftui/llama.swiftui/UI/DownloadButton.swift
@@ -1,122 +0,0 @@
-import SwiftUI
-
-struct DownloadButton: View {
-    @ObservedObject private var llamaState: LlamaState
-    private var modelName: String
-    private var modelUrl: String
-    private var filename: String
-
-    @State private var status: String
-
-    @State private var downloadTask: URLSessionDownloadTask?
-    @State private var progress = 0.0
-    @State private var observation: NSKeyValueObservation?
-
-    private static func getFileURL(filename: String) -> URL {
-        FileManager.default.urls(for: .documentDirectory, in: .userDomainMask)[0].appendingPathComponent(filename)
-    }
-
-    private func checkFileExistenceAndUpdateStatus() {
-    }
-
-    init(llamaState: LlamaState, modelName: String, modelUrl: String, filename: String) {
-        self.llamaState = llamaState
-        self.modelName = modelName
-        self.modelUrl = modelUrl
-        self.filename = filename
-
-        let fileURL = DownloadButton.getFileURL(filename: filename)
-        status = FileManager.default.fileExists(atPath: fileURL.path) ? "downloaded" : "download"
-    }
-
-    private func download() {
-        status = "downloading"
-        print("Downloading model \(modelName) from \(modelUrl)")
-        guard let url = URL(string: modelUrl) else { return }
-        let fileURL = DownloadButton.getFileURL(filename: filename)
-
-        downloadTask = URLSession.shared.downloadTask(with: url) { temporaryURL, response, error in
-            if let error = error {
-                print("Error: \(error.localizedDescription)")
-                return
-            }
-
-            guard let response = response as? HTTPURLResponse, (200...299).contains(response.statusCode) else {
-                print("Server error!")
-                return
-            }
-
-            do {
-                if let temporaryURL = temporaryURL {
-                    try FileManager.default.copyItem(at: temporaryURL, to: fileURL)
-                    print("Writing to \(filename) completed")
-
-                    llamaState.cacheCleared = false
-
-                    status = "downloaded"
-                }
-            } catch let err {
-                print("Error: \(err.localizedDescription)")
-            }
-        }
-
-        observation = downloadTask?.progress.observe(\.fractionCompleted) { progress, _ in
-            self.progress = progress.fractionCompleted
-        }
-
-        downloadTask?.resume()
-    }
-
-    var body: some View {
-        VStack {
-            if status == "download" {
-                Button(action: download) {
-                    Text("Download " + modelName)
-                }
-            } else if status == "downloading" {
-                Button(action: {
-                    downloadTask?.cancel()
-                    status = "download"
-                }) {
-                    Text("\(modelName) (Downloading \(Int(progress * 100))%)")
-                }
-            } else if status == "downloaded" {
-                Button(action: {
-                    let fileURL = DownloadButton.getFileURL(filename: filename)
-                    if !FileManager.default.fileExists(atPath: fileURL.path) {
-                        download()
-                        return
-                    }
-                    do {
-                        try llamaState.loadModel(modelUrl: fileURL)
-                    } catch let err {
-                        print("Error: \(err.localizedDescription)")
-                    }
-                }) {
-                    Text("Load \(modelName)")
-                }
-            } else {
-                Text("Unknown status")
-            }
-        }
-        .onDisappear() {
-            downloadTask?.cancel()
-        }
-        .onChange(of: llamaState.cacheCleared) { newValue in
-            if newValue {
-                downloadTask?.cancel()
-                let fileURL = DownloadButton.getFileURL(filename: filename)
-                status = FileManager.default.fileExists(atPath: fileURL.path) ? "downloaded" : "download"
-            }
-        }
-    }
-}
-
-// #Preview {
-//    DownloadButton(
-//        llamaState: LlamaState(),
-//        modelName: "TheBloke / TinyLlama-1.1B-1T-OpenOrca-GGUF (Q4_0)",
-//        modelUrl: "https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/resolve/main/tinyllama-1.1b-1t-openorca.Q4_0.gguf?download=true",
-//        filename: "tinyllama-1.1b-1t-openorca.Q4_0.gguf"
-//    )
-// }
--- a/examples/llama.swiftui/llama.swiftui/UI/LoadCustomButton.swift
+++ b/examples/llama.swiftui/llama.swiftui/UI/LoadCustomButton.swift
@@ -1,44 +0,0 @@
-import SwiftUI
-import UniformTypeIdentifiers
-
-struct LoadCustomButton: View {
-    @ObservedObject private var llamaState: LlamaState
-    @State private var showFileImporter = false
-
-    init(llamaState: LlamaState) {
-        self.llamaState = llamaState
-    }
-
-    var body: some View {
-        VStack {
-            Button(action: {
-                showFileImporter = true
-            }) {
-                Text("Load Custom Model")
-            }
-        }
-        .fileImporter(
-            isPresented: $showFileImporter,
-            allowedContentTypes: [UTType(filenameExtension: "gguf", conformingTo: .data)!],
-            allowsMultipleSelection: false
-        ) { result in
-            switch result {
-            case .success(let files):
-                files.forEach { file in
-                    let gotAccess = file.startAccessingSecurityScopedResource()
-                    if !gotAccess { return }
-
-                    do {
-                        try llamaState.loadModel(modelUrl: file.absoluteURL)
-                    } catch let err {
-                        print("Error: \(err.localizedDescription)")
-                    }
-
-                    file.stopAccessingSecurityScopedResource()
-                }
-            case .failure(let error):
-                print(error)
-            }
-        }
-    }
-}
--- a/examples/llama.swiftui/llama.swiftui/llama_swiftuiApp.swift
+++ b/examples/llama.swiftui/llama.swiftui/llama_swiftuiApp.swift
@@ -1,10 +0,0 @@
-import SwiftUI
-
-@main
-struct llama_swiftuiApp: App {
-    var body: some Scene {
-        WindowGroup {
-            ContentView()
-        }
-    }
-}
--- a/examples/llava/CMakeLists.txt
+++ b/examples/llava/CMakeLists.txt
@@ -1,37 +1,14 @@
-add_library(llava OBJECT
-            llava.cpp
-            llava.h
-            clip.cpp
-            clip.h
-            )
-
-target_link_libraries(llava PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
-
-target_include_directories(llava PUBLIC .)
-target_include_directories(llava PUBLIC ../..)
-target_include_directories(llava PUBLIC ../../common)
-
-target_compile_features(llava PRIVATE cxx_std_11)
-
-add_library(llava_static STATIC $<TARGET_OBJECTS:llava>)
-if (BUILD_SHARED_LIBS)
-    set_target_properties(llava PROPERTIES POSITION_INDEPENDENT_CODE ON)
-    target_compile_definitions(llava PRIVATE LLAMA_SHARED LLAMA_BUILD)
-    add_library(llava_shared SHARED $<TARGET_OBJECTS:llava>)
-    target_link_libraries(llava_shared PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
-    install(TARGETS llava_shared LIBRARY)
-endif()
-
+set(TARGET clip)
+add_library(${TARGET} clip.cpp clip.h)
+install(TARGETS ${TARGET} LIBRARY)
+target_link_libraries(${TARGET} PRIVATE common ggml ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if (NOT MSVC)
-    target_compile_options(llava PRIVATE -Wno-cast-qual) # stb_image.h
+    target_compile_options(${TARGET} PRIVATE -Wno-cast-qual) # stb_image.h
 endif()

-if(TARGET BUILD_INFO)
-    add_dependencies(llava BUILD_INFO)
-endif()
-
-set(TARGET llava-cli)
-add_executable(llava-cli llava-cli.cpp)
-install(TARGETS llava-cli RUNTIME)
-target_link_libraries(llava-cli PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(llava PRIVATE cxx_std_11)
+set(TARGET llava)
+add_executable(${TARGET} llava.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama clip ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- a/examples/llava/README.md
+++ b/examples/llava/README.md
@@ -9,12 +9,12 @@ models are available.
 After API is confirmed, more models will be supported / uploaded.

 ## Usage
-Build with cmake or run `make llava-cli` to build it.
+Build with cmake or run `make llava` to build it.

-After building, run: `./llava-cli` to see the usage. For example:
+After building, run: `./llava` to see the usage. For example:

 ```sh
-./llava-cli -m llava-v1.5-7b/ggml-model-q5_k.gguf --mmproj llava-v1.5-7b/mmproj-model-f16.gguf --image path/to/an/image.jpg
+./llava -m llava-v1.5-7b/ggml-model-q5_k.gguf --mmproj llava-v1.5-7b/mmproj-model-f16.gguf --image path/to/an/image.jpg
 ```

 **note**: A lower temperature like 0.1 is recommended for better quality. add `--temp 0.1` to the command to do so.
@@ -51,6 +51,7 @@ Now both the LLaMA part and the image encoder is in the `llava-v1.5-7b` director

 ## TODO

+- [ ] Support server mode.
 - [ ] Support non-CPU backend for the image encoding part.
 - [ ] Support different sampling methods.
 - [ ] Support more model variants.
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -16,19 +16,12 @@
 #include "clip.h"
 #include "ggml.h"
 #include "ggml-alloc.h"
-#include "ggml-backend.h"
-
-#ifdef GGML_USE_CUBLAS
-#include "ggml-cuda.h"
-#endif
-
-#ifdef GGML_USE_METAL
-#include "ggml-metal.h"
-#endif

 #define STB_IMAGE_IMPLEMENTATION
 #include "stb_image.h"

+#define CLIP_DEBUG
+
 static std::string format(const char * fmt, ...) {
    va_list ap;
    va_list ap2;
@@ -146,27 +139,6 @@ static std::string get_ftype(int ftype) {
    }
 }

-//
-// image data
-//
-
-// RGB uint8 image
-struct clip_image_u8 {
-    int nx;
-    int ny;
-
-    std::vector<uint8_t> buf;
-};
-
-// RGB float32 image (NHWC)
-// Memory layout: RGBRGBRGB...
-struct clip_image_f32 {
-    int nx;
-    int ny;
-
-    std::vector<float> buf;
-};
-
 //
 // clip layers
 //
@@ -224,31 +196,39 @@ struct clip_vision_model {
    struct ggml_tensor * mm_2_b;
 };

+// Replacement for std::vector<uint8_t> that doesn't require zero-initialization.
+struct clip_buffer {
+    uint8_t * data = NULL;
+    size_t size = 0;
+
+    void resize(size_t size) {
+        delete[] data;
+        data = new uint8_t[size];
+        this->size = size;
+    }
+
+    ~clip_buffer() { delete[] data; }
+};
+
 struct clip_ctx {
-    bool has_text_encoder    = false;
-    bool has_vision_encoder  = false;
+    bool has_text_encoder = false;
+    bool has_vision_encoder = false;
    bool has_llava_projector = false;
-
    struct clip_vision_model vision_model;
-
    float image_mean[3];
    float image_std[3];
    bool use_gelu = false;
    int32_t ftype = 1;
-
+    struct ggml_context * ctx;
    struct gguf_context * ctx_gguf;
-    struct ggml_context * ctx_data;
-
-    std::vector<uint8_t> buf_compute_meta;

    // memory buffers to evaluate the model
-    ggml_backend_buffer_t params_buffer = NULL;
-    ggml_backend_buffer_t compute_buffer = NULL;
-    ggml_backend_t backend = NULL;
-    ggml_allocr * compute_alloc = NULL;
+    clip_buffer buf_compute;
+    clip_buffer buf_alloc;
+    ggml_allocr * alloc = NULL;
 };

-static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs) {
+static ggml_cgraph * clip_image_build_graph(const clip_ctx * ctx, const clip_image_f32_batch * imgs) {
    if (!ctx->has_vision_encoder) {
        printf("This gguf file seems to have no vision encoder\n");
        return nullptr;
@@ -269,24 +249,28 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
    //const int projection_dim = hparams.projection_dim;
    const float eps = hparams.eps;
    int batch_size = imgs->size;
-    if (ctx->has_llava_projector) {
+    if(ctx->has_llava_projector) {
        GGML_ASSERT(batch_size == 1);
    }

+    const auto & buf_compute = ctx->buf_compute;
+
    struct ggml_init_params params = {
-        /*.mem_size   =*/ ctx->buf_compute_meta.size(),
-        /*.mem_buffer =*/ ctx->buf_compute_meta.data(),
-        /*.no_alloc   =*/ true,
+        /*.mem_size =*/ buf_compute.size,
+        /*.mem_buffer =*/ buf_compute.data,
+        /*.no_alloc =*/ false,
    };

+    params.no_alloc = true;
+
    struct ggml_context * ctx0 = ggml_init(params);
    struct ggml_cgraph * gf = ggml_new_graph(ctx0);

    struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size, image_size, 3, batch_size);
-    ggml_allocr_alloc(ctx->compute_alloc, inp_raw);
+    ggml_allocr_alloc(ctx->alloc, inp_raw);

-    if (!ggml_allocr_is_measure(ctx->compute_alloc)) {
-        float * data = (float *)malloc(ggml_nbytes(inp_raw));
+    if (!ggml_allocr_is_measure(ctx->alloc)) {
+        float * data = (float *)ggml_get_data(inp_raw);

        for (size_t i = 0; i < imgs->size; i++) {
            const int nx = imgs->data[i].nx;
@@ -299,14 +283,12 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
                for (int k = 0; k < 3; k++) {
                    for (int y = 0; y < ny; y++) {
                        for (int x = 0; x < nx; x++) {
-                            data[(b * 3 * n) + k * n + y * nx + x] = imgs->data[b].buf[3 * (y * nx + x) + k];
+                            data[(b * 3 * n) + k * n + y * nx + x] = imgs->data[b].data[3 * (y * nx + x) + k];
                        }
                    }
                }
            }
        }
-        ggml_backend_tensor_set(inp_raw, data, 0, ggml_nbytes(inp_raw));
-        free(data);
    }

    struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
@@ -316,39 +298,42 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32

    // concat class_embeddings and patch_embeddings
    struct ggml_tensor * embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
-    ggml_allocr_alloc(ctx->compute_alloc, embeddings);
-    if (!ggml_allocr_is_measure(ctx->compute_alloc)) {
-        void* zero_mem = malloc(ggml_nbytes(embeddings));
-        memset(zero_mem, 0, ggml_nbytes(embeddings));
-        ggml_backend_tensor_set(embeddings, zero_mem, 0, ggml_nbytes(embeddings));
-        free(zero_mem);
+    ggml_allocr_alloc(ctx->alloc, embeddings);
+    if (!ggml_allocr_is_measure(ctx->alloc)) {
+        ggml_set_zero(embeddings);
    }

-    embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
-            embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
+    struct ggml_tensor * temp = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, 1, batch_size);
+    ggml_allocr_alloc(ctx->alloc, temp);

-    embeddings = ggml_acc(ctx0, embeddings, inp,
-            embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
+    embeddings = ggml_acc(ctx0, embeddings, ggml_repeat(ctx0, model.class_embedding, temp), embeddings->nb[1],
+                          embeddings->nb[2], embeddings->nb[3], 0);
+    embeddings =
+        ggml_acc(ctx0, embeddings, inp, embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);

    struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
-    ggml_allocr_alloc(ctx->compute_alloc, positions);
-    if (!ggml_allocr_is_measure(ctx->compute_alloc)) {
-        int* positions_data = (int*)malloc(ggml_nbytes(positions));
+    ggml_allocr_alloc(ctx->alloc, positions);
+    if (!ggml_allocr_is_measure(ctx->alloc)) {
        for (int i = 0; i < num_positions; i++) {
-            positions_data[i] = i;
+            ggml_set_i32_1d(positions, i, i);
        }
-        ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
-        free(positions_data);
    }

    embeddings =
-        ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions));
+        ggml_add(ctx0, embeddings, ggml_repeat(ctx0, ggml_get_rows(ctx0, model.position_embeddings, positions), embeddings));

    // pre-layernorm
    {
        embeddings = ggml_norm(ctx0, embeddings, eps);

-        embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.pre_ln_w), model.pre_ln_b);
+        embeddings = ggml_add(ctx0, ggml_mul(ctx0, ggml_repeat(ctx0, model.pre_ln_w, embeddings), embeddings),
+                              ggml_repeat(ctx0, model.pre_ln_b, embeddings));
+    }
+
+    struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+    ggml_allocr_alloc(ctx->alloc, KQ_scale);
+    if (!ggml_allocr_is_measure(ctx->alloc)) {
+        ggml_set_f32(KQ_scale, 1.0f / sqrt((float)d_head));
    }

    // loop over layers
@@ -361,30 +346,30 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
        {
            cur = ggml_norm(ctx0, cur, eps);

-            cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_1_w),
-                           model.layers[il].ln_1_b);
+            cur = ggml_add(ctx0, ggml_mul(ctx0, ggml_repeat(ctx0, model.layers[il].ln_1_w, cur), cur),
+                           ggml_repeat(ctx0, model.layers[il].ln_1_b, cur));
        }

        // self-attention
        {

            struct ggml_tensor * Q =
-                ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].q_w, cur), model.layers[il].q_b);
+                ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].q_b, cur), ggml_mul_mat(ctx0, model.layers[il].q_w, cur));

-            Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));
+            Q = ggml_scale_inplace(ctx0, Q, KQ_scale);
            Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_positions, batch_size);
            Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3));
            Q = ggml_reshape_3d(ctx0, Q, d_head, num_positions, n_head * batch_size);

            struct ggml_tensor * K =
-                ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].k_w, cur), model.layers[il].k_b);
+                ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].k_b, cur), ggml_mul_mat(ctx0, model.layers[il].k_w, cur));

            K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size);
            K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3));
            K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size);

            struct ggml_tensor * V =
-                ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].v_w, cur), model.layers[il].v_b);
+                ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].v_b, cur), ggml_mul_mat(ctx0, model.layers[il].v_w, cur));

            V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size);
            V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3));
@@ -400,7 +385,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
        }

        // attention output
-        cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].o_w, cur), model.layers[il].o_b);
+        cur = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].o_b, cur), ggml_mul_mat(ctx0, model.layers[il].o_w, cur));

        // re-add the layer input, e.g., residual
        cur = ggml_add(ctx0, cur, embeddings);
@@ -411,11 +396,12 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
        {
            cur = ggml_norm(ctx0, cur, eps);

-            cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_2_w), model.layers[il].ln_2_b);
+            cur = ggml_add(ctx0, ggml_mul(ctx0, ggml_repeat(ctx0, model.layers[il].ln_2_w, cur), cur),
+                           ggml_repeat(ctx0, model.layers[il].ln_2_b, cur));
        }

        cur = ggml_mul_mat(ctx0, model.layers[il].ff_i_w, cur);
-        cur = ggml_add(ctx0, cur, model.layers[il].ff_i_b);
+        cur = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].ff_i_b, cur), cur);

        if (ctx->use_gelu) {
            cur = ggml_gelu_inplace(ctx0, cur);
@@ -424,7 +410,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
        }

        cur = ggml_mul_mat(ctx0, model.layers[il].ff_o_w, cur);
-        cur = ggml_add(ctx0, cur, model.layers[il].ff_o_b);
+        cur = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].ff_o_b, cur), cur);

        // residual 2
        cur = ggml_add(ctx0, embeddings, cur);
@@ -437,26 +423,23 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
        embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);

        struct ggml_tensor * patches = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_patches);
-        ggml_allocr_alloc(ctx->compute_alloc, patches);
-        if (!ggml_allocr_is_measure(ctx->compute_alloc)) {
-            int* patches_data = (int*)malloc(ggml_nbytes(patches));
-            for (int i = 0; i < num_patches; i++) {
-                patches_data[i] = i + 1;
+        ggml_allocr_alloc(ctx->alloc, patches);
+        if (!ggml_allocr_is_measure(ctx->alloc)) {
+            for (int i = 0; i < num_patches; ++i) {
+                ggml_set_i32_1d(patches, i, i+1);
            }
-            ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
-            free(patches_data);
        }

        embeddings = ggml_get_rows(ctx0, embeddings, patches);

        // mm projection 0
        embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
-        embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
+        embeddings = ggml_add(ctx0, ggml_repeat(ctx0, model.mm_0_b, embeddings), embeddings);

        embeddings = ggml_gelu(ctx0, embeddings);

        embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
-        embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
+        embeddings = ggml_add(ctx0, ggml_repeat(ctx0, model.mm_2_b, embeddings), embeddings);
    }

    // build the graph
@@ -469,6 +452,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32

 // read and create ggml_context containing the tensors and their data
 struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
+
    struct ggml_context * meta = NULL;

    struct gguf_init_params params = {
@@ -501,7 +485,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
        printf("%s: ftype:        %s\n", __func__, ftype_str.c_str());
        printf("\n");
    }
-    const int n_tensors = gguf_get_n_tensors(ctx);
+
    // kv
    if (verbosity >= 3) {
        const int n_kv = gguf_get_n_kv(ctx);
@@ -515,38 +499,27 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
    }

    // data
-    size_t buffer_size = 0;
+    size_t ctx_size = 0;
    {
+        const int n_tensors = gguf_get_n_tensors(ctx);
+
        for (int i = 0; i < n_tensors; ++i) {
            const char * name = gguf_get_tensor_name(ctx, i);
            const size_t offset = gguf_get_tensor_offset(ctx, i);
+
            struct ggml_tensor * cur = ggml_get_tensor(meta, name);
+            ctx_size += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE;
            size_t tensor_size = ggml_nbytes(cur);
-            buffer_size += tensor_size;
+            size_t padded_size = ggml_nbytes_pad(cur);
+            ctx_size += padded_size;
            if (verbosity >= 3) {
-                printf("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu\n", __func__, i,
-                       ggml_n_dims(cur), cur->name, tensor_size, offset);
+                printf("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, padded_size=%zu, offset=%zu\n", __func__, i,
+                       cur->n_dims, cur->name, tensor_size, padded_size, offset);
            }
        }
    }

-    buffer_size += n_tensors * 128 /* CLIP PADDING */;
-
    clip_ctx * new_clip = new clip_ctx;
-#ifdef GGML_USE_CUBLAS
-    new_clip->backend = ggml_backend_cuda_init(0);
-    printf("%s: CLIP using CUDA backend\n", __func__);
-#endif
-
-#ifdef GGML_USE_METAL
-    new_clip->backend = ggml_backend_metal_init();
-    printf("%s: CLIP using Metal backend\n", __func__);
-#endif
-
-    if (!new_clip->backend) {
-        new_clip->backend = ggml_backend_cpu_init();
-        printf("%s: CLIP using CPU backend\n", __func__);
-    }

    // model size and capabilities
    {
@@ -572,24 +545,21 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
            printf("%s: text_encoder:   %d\n", __func__, new_clip->has_text_encoder);
            printf("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
            printf("%s: llava_projector:  %d\n", __func__, new_clip->has_llava_projector);
-            printf("%s: model size:     %.2f MB\n", __func__, buffer_size / 1024.0 / 1024.0);
+            printf("%s: model size:     %.2f MB\n", __func__, (ctx_size / 1024.0 / 1024.0));
            printf("%s: metadata size:  %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
        }
    }

-    printf("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, buffer_size / (1024.0 * 1024.0), n_tensors);
-
    // load tensors
    {
-        std::vector<uint8_t> read_buf;
        struct ggml_init_params params = {
-            /*.mem_size =*/ (n_tensors + 1) * ggml_tensor_overhead(),
+            /*.mem_size =*/ ctx_size,
            /*.mem_buffer =*/ NULL,
-            /*.no_alloc =*/ true,
+            /*.no_alloc =*/ false,
        };

-        new_clip->ctx_data = ggml_init(params);
-        if (!new_clip->ctx_data) {
+        new_clip->ctx = ggml_init(params);
+        if (!new_clip->ctx) {
            fprintf(stderr, "%s: ggml_init() failed\n", __func__);
            clip_free(new_clip);
            return nullptr;
@@ -602,21 +572,13 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
            return nullptr;
        }

-        // add tensors to context
+        const int n_tensors = gguf_get_n_tensors(ctx);
        for (int i = 0; i < n_tensors; ++i) {
            const char * name = gguf_get_tensor_name(ctx, i);
            struct ggml_tensor * t = ggml_get_tensor(meta, name);
-            struct ggml_tensor * cur = ggml_dup_tensor(new_clip->ctx_data, t);
+            struct ggml_tensor * cur = ggml_dup_tensor(new_clip->ctx, t);
            ggml_set_name(cur, name);
-        }

-        // alloc memory and offload data
-        new_clip->params_buffer = ggml_backend_alloc_buffer(new_clip->backend, buffer_size);
-        ggml_allocr* alloc = ggml_allocr_new_from_buffer(new_clip->params_buffer);
-        for (int i = 0; i < n_tensors; ++i) {
-            const char * name = gguf_get_tensor_name(ctx, i);
-            struct ggml_tensor * cur = ggml_get_tensor(new_clip->ctx_data, name);
-            ggml_allocr_alloc(alloc, cur);
            const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i);
            fin.seekg(offset, std::ios::beg);
            if (!fin) {
@@ -624,18 +586,10 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
                clip_free(new_clip);
                return nullptr;
            }
-            int num_bytes = ggml_nbytes(cur);
-            if (ggml_backend_buffer_is_host(new_clip->params_buffer)) {
-                // for the CPU and Metal backend, we can read directly into the tensor
-                fin.read(reinterpret_cast<char *>(cur->data), num_bytes);
-            } else {
-                // read into a temporary buffer first, then copy to device memory
-                read_buf.resize(num_bytes);
-                fin.read(reinterpret_cast<char *>(read_buf.data()), num_bytes);
-                ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes);
-            }
+
+            fin.read(reinterpret_cast<char *>(cur->data), ggml_nbytes(t));
        }
-        ggml_allocr_free(alloc);
+
        fin.close();
    }

@@ -644,20 +598,20 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
        // load vision model
        auto & vision_model = new_clip->vision_model;
        auto & hparams = vision_model.hparams;
-        hparams.hidden_size    = get_u32(ctx, format(KEY_N_EMBD, "vision"));
-        hparams.n_head         = get_u32(ctx, format(KEY_N_HEAD, "vision"));
+        hparams.hidden_size = get_u32(ctx, format(KEY_N_EMBD, "vision"));
+        hparams.n_head = get_u32(ctx, format(KEY_N_HEAD, "vision"));
        hparams.n_intermediate = get_u32(ctx, format(KEY_N_FF, "vision"));
-        hparams.n_layer        = get_u32(ctx, format(KEY_N_BLOCK, "vision"));
-        hparams.image_size     = get_u32(ctx, KEY_IMAGE_SIZE);
-        hparams.patch_size     = get_u32(ctx, KEY_PATCH_SIZE);
+        hparams.n_layer = get_u32(ctx, format(KEY_N_BLOCK, "vision"));
+        hparams.image_size = get_u32(ctx, KEY_IMAGE_SIZE);
+        hparams.patch_size = get_u32(ctx, KEY_PATCH_SIZE);
        hparams.projection_dim = get_u32(ctx, format(KEY_PROJ_DIM, "vision"));
-        hparams.eps            = get_f32(ctx, format(KEY_LAYER_NORM_EPS, "vision"));
+        hparams.eps = get_f32(ctx, format(KEY_LAYER_NORM_EPS, "vision"));

        int idx_mean = get_key_idx(ctx, KEY_IMAGE_MEAN);
-        int idx_std  = get_key_idx(ctx, KEY_IMAGE_STD);
+        int idx_std = get_key_idx(ctx, KEY_IMAGE_STD);
        for (int i = 0; i < 3; ++i) {
            new_clip->image_mean[i] = *((const float *)gguf_get_arr_data(ctx, idx_mean));
-            new_clip->image_std[i]  = *((const float *)gguf_get_arr_data(ctx, idx_std));
+            new_clip->image_std[i] = *((const float *)gguf_get_arr_data(ctx, idx_std));
        }

        if (verbosity >= 2) {
@@ -671,35 +625,35 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
            printf("v_n_layer          %d\n", hparams.n_layer);
        }

-        vision_model.patch_embeddings    = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
-        vision_model.class_embedding     = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD);
-        vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
-        vision_model.pre_ln_w            = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
-        vision_model.pre_ln_b            = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
-        vision_model.mm_0_w              = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight"));
-        vision_model.mm_0_b              = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias"));
-        vision_model.mm_2_w              = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
-        vision_model.mm_2_b              = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
+        vision_model.patch_embeddings = get_tensor(new_clip->ctx, TN_PATCH_EMBD);
+        vision_model.class_embedding = get_tensor(new_clip->ctx, TN_CLASS_EMBD);
+        vision_model.position_embeddings = get_tensor(new_clip->ctx, format(TN_POS_EMBD, "v"));
+        vision_model.pre_ln_w = get_tensor(new_clip->ctx, format(TN_LN_PRE, "v", "weight"));
+        vision_model.pre_ln_b = get_tensor(new_clip->ctx, format(TN_LN_PRE, "v", "bias"));
+        vision_model.mm_0_w = get_tensor(new_clip->ctx, format(TN_LLAVA_PROJ, 0, "weight"));
+        vision_model.mm_0_b = get_tensor(new_clip->ctx, format(TN_LLAVA_PROJ, 0, "bias"));
+        vision_model.mm_2_w = get_tensor(new_clip->ctx, format(TN_LLAVA_PROJ, 2, "weight"));
+        vision_model.mm_2_b = get_tensor(new_clip->ctx, format(TN_LLAVA_PROJ, 2, "bias"));

        vision_model.layers.resize(hparams.n_layer);
        for (int il = 0; il < hparams.n_layer; ++il) {
            auto & layer = vision_model.layers[il];
-            layer.k_w    = get_tensor(new_clip->ctx_data, format(TN_ATTN_K,      "v", il, "weight"));
-            layer.q_w    = get_tensor(new_clip->ctx_data, format(TN_ATTN_Q,      "v", il, "weight"));
-            layer.v_w    = get_tensor(new_clip->ctx_data, format(TN_ATTN_V,      "v", il, "weight"));
-            layer.o_w    = get_tensor(new_clip->ctx_data, format(TN_ATTN_OUTPUT, "v", il, "weight"));
-            layer.ln_1_w = get_tensor(new_clip->ctx_data, format(TN_LN_1,        "v", il, "weight"));
-            layer.ln_2_w = get_tensor(new_clip->ctx_data, format(TN_LN_2,        "v", il, "weight"));
-            layer.ff_i_w = get_tensor(new_clip->ctx_data, format(TN_FFN_DOWN,    "v", il, "weight"));
-            layer.ff_o_w = get_tensor(new_clip->ctx_data, format(TN_FFN_UP,      "v", il, "weight"));
-            layer.k_b    = get_tensor(new_clip->ctx_data, format(TN_ATTN_K,      "v", il, "bias"));
-            layer.q_b    = get_tensor(new_clip->ctx_data, format(TN_ATTN_Q,      "v", il, "bias"));
-            layer.v_b    = get_tensor(new_clip->ctx_data, format(TN_ATTN_V,      "v", il, "bias"));
-            layer.o_b    = get_tensor(new_clip->ctx_data, format(TN_ATTN_OUTPUT, "v", il, "bias"));
-            layer.ln_1_b = get_tensor(new_clip->ctx_data, format(TN_LN_1,        "v", il, "bias"));
-            layer.ln_2_b = get_tensor(new_clip->ctx_data, format(TN_LN_2,        "v", il, "bias"));
-            layer.ff_i_b = get_tensor(new_clip->ctx_data, format(TN_FFN_DOWN,    "v", il, "bias"));
-            layer.ff_o_b = get_tensor(new_clip->ctx_data, format(TN_FFN_UP,      "v", il, "bias"));
+            layer.k_w = get_tensor(new_clip->ctx, format(TN_ATTN_K, "v", il, "weight"));
+            layer.q_w = get_tensor(new_clip->ctx, format(TN_ATTN_Q, "v", il, "weight"));
+            layer.v_w = get_tensor(new_clip->ctx, format(TN_ATTN_V, "v", il, "weight"));
+            layer.o_w = get_tensor(new_clip->ctx, format(TN_ATTN_OUTPUT, "v", il, "weight"));
+            layer.ln_1_w = get_tensor(new_clip->ctx, format(TN_LN_1, "v", il, "weight"));
+            layer.ln_2_w = get_tensor(new_clip->ctx, format(TN_LN_2, "v", il, "weight"));
+            layer.ff_i_w = get_tensor(new_clip->ctx, format(TN_FFN_DOWN, "v", il, "weight"));
+            layer.ff_o_w = get_tensor(new_clip->ctx, format(TN_FFN_UP, "v", il, "weight"));
+            layer.k_b = get_tensor(new_clip->ctx, format(TN_ATTN_K, "v", il, "bias"));
+            layer.q_b = get_tensor(new_clip->ctx, format(TN_ATTN_Q, "v", il, "bias"));
+            layer.v_b = get_tensor(new_clip->ctx, format(TN_ATTN_V, "v", il, "bias"));
+            layer.o_b = get_tensor(new_clip->ctx, format(TN_ATTN_OUTPUT, "v", il, "bias"));
+            layer.ln_1_b = get_tensor(new_clip->ctx, format(TN_LN_1, "v", il, "bias"));
+            layer.ln_2_b = get_tensor(new_clip->ctx, format(TN_LN_2, "v", il, "bias"));
+            layer.ff_i_b = get_tensor(new_clip->ctx, format(TN_FFN_DOWN, "v", il, "bias"));
+            layer.ff_o_b = get_tensor(new_clip->ctx, format(TN_FFN_UP, "v", il, "bias"));
        }
    }

@@ -707,69 +661,51 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {

    new_clip->ctx_gguf = ctx;

-    // measure mem requirement and allocate
+// measure mem requirement and allocate
    {
-        new_clip->buf_compute_meta.resize(GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead());
-        new_clip->compute_alloc = ggml_allocr_new_measure_from_backend(new_clip->backend);
+        static const size_t tensor_alignment = 32;
+        new_clip->buf_compute.resize(ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead());
+        new_clip->alloc = ggml_allocr_new_measure(tensor_alignment);
        clip_image_f32_batch batch;
        batch.size = 1;
        ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch);
-        size_t compute_memory_buffer_size = ggml_allocr_alloc_graph(new_clip->compute_alloc, gf);
-        ggml_allocr_free(new_clip->compute_alloc);
-        new_clip->compute_buffer = ggml_backend_alloc_buffer(new_clip->backend, compute_memory_buffer_size);
-        new_clip->compute_alloc = ggml_allocr_new_from_buffer(new_clip->compute_buffer);
+        size_t alloc_size = ggml_allocr_alloc_graph(new_clip->alloc, gf) + tensor_alignment;
+        ggml_allocr_free(new_clip->alloc);
+        new_clip->buf_alloc.resize(alloc_size);
+        new_clip->alloc = ggml_allocr_new(new_clip->buf_alloc.data, new_clip->buf_alloc.size, tensor_alignment);

-        printf("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
+        printf("%s: total allocated memory: %.2f MB\n", __func__, (new_clip->buf_compute.size + alloc_size)/1024.0/1024.0);
    }

    return new_clip;
 }

-struct clip_image_u8 * clip_image_u8_init() {
-    return new clip_image_u8();
-}
+clip_image_u8 * make_clip_image_u8() { return new clip_image_u8(); }

-struct clip_image_f32 * clip_image_f32_init() {
-    return new clip_image_f32();
-}
-
-void clip_image_u8_free (struct clip_image_u8  * img) { delete img; }
-void clip_image_f32_free(struct clip_image_f32 * img) { delete img; }
-
-static void build_clip_img_from_data(const stbi_uc * data, int nx, int ny, clip_image_u8 * img) {
-    img->nx = nx;
-    img->ny = ny;
-    img->buf.resize(3 * nx * ny);
-    memcpy(img->buf.data(), data, img->buf.size());
-}
+clip_image_f32 * make_clip_image_f32() { return new clip_image_f32(); }

 bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
    int nx, ny, nc;
-    auto * data = stbi_load(fname, &nx, &ny, &nc, 3);
+    auto data = stbi_load(fname, &nx, &ny, &nc, 3);
    if (!data) {
-        fprintf(stderr, "%s: failed to load image '%s'\n", __func__, fname);
+        fprintf(stderr, "%s: failed to load '%s'\n", __func__, fname);
        return false;
    }
-    build_clip_img_from_data(data, nx, ny, img);
-    stbi_image_free(data);
-    return true;
-}

-bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img) {
-    int nx, ny, nc;
-    auto * data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3);
-    if (!data) {
-        fprintf(stderr, "%s: failed to decode image bytes\n", __func__);
-        return false;
-    }
-    build_clip_img_from_data(data, nx, ny, img);
+    img->nx = nx;
+    img->ny = ny;
+    img->size = nx * ny * 3;
+    img->data = new uint8_t[img->size]();
+    memcpy(img->data, data, img->size);
+
    stbi_image_free(data);
+
    return true;
 }

 // normalize: x = (x - mean) / std
 // TODO: implement bicubic interpolation instead of linear.
-bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32 * res, const bool pad2square) {
+bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32 * res, const bool pad2square) {
    if (!ctx->has_vision_encoder) {
        printf("This gguf file seems to have no vision encoder\n");
        return false;
@@ -778,45 +714,47 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
    // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)
    // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156

-    clip_image_u8 * temp = clip_image_u8_init(); // we will keep the input image data here temporarily
+    clip_image_u8 temp; // we will keep the input image data here temporarily
    if (pad2square && img->nx != img->ny) {
        int longer_side = std::max(img->nx, img->ny);
-        temp->nx = longer_side;
-        temp->ny = longer_side;
-        temp->buf.resize(3 * longer_side * longer_side);
-        const uint8_t bc[3] = {122, 116, 104}; // background color in RGB from LLaVA
+        temp.nx = longer_side;
+        temp.ny = longer_side;
+        temp.size = 3 * longer_side * longer_side;
+        temp.data = new uint8_t[temp.size]();
+        uint8_t bc[3] = {122, 116, 104}; // bakground color in RGB from LLaVA

        // fill with background color
-        for (size_t i = 0; i < temp->buf.size(); i++) {
-            temp->buf[i] = bc[i % 3];
+        for (size_t i = 0; i < temp.size; i++) {
+            temp.data[i] = bc[i % 3];
        }

        // copy from the input image
        for (int y = 0; y < img->ny; y++) {
            for (int x = 0; x < img->nx; x++) {
                const int i = 3 * (y * img->nx + x);
-                const int j = 3 * (y * temp->nx + x);
-                temp->buf[j]   = img->buf[i];
-                temp->buf[j+1] = img->buf[i+1];
-                temp->buf[j+2] = img->buf[i+2];
+                const int j = 3 * (y * temp.nx + x);
+                temp.data[j] = img->data[i];
+                temp.data[j+1] = img->data[i+1];
+                temp.data[j+2] = img->data[i+2];
            }
        }
    } else {
-        temp->nx = img->nx;
-        temp->ny = img->ny;
-        temp->buf.resize(img->buf.size());
-        memcpy(temp->buf.data(), img->buf.data(), temp->buf.size());
+        temp.nx   = img->nx;
+        temp.ny   = img->ny;
+        temp.size = img->size;
+        temp.data = img->data;
    }

-    const int nx = temp->nx;
-    const int ny = temp->ny;
+    const int nx = temp.nx;
+    const int ny = temp.ny;

    const int nx2 = ctx->vision_model.hparams.image_size;
    const int ny2 = ctx->vision_model.hparams.image_size;

    res->nx = nx2;
    res->ny = ny2;
-    res->buf.resize(3 * nx2 * ny2);
+    res->size = 3 * nx2 * ny2;
+    res->data = new float[res->size]();

    const float scale = std::max(nx, ny) / (float)ctx->vision_model.hparams.image_size;

@@ -847,10 +785,10 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
                const int j10 = 3 * (y1 * nx + x0) + c;
                const int j11 = 3 * (y1 * nx + x1) + c;

-                const float v00 = temp->buf[j00];
-                const float v01 = temp->buf[j01];
-                const float v10 = temp->buf[j10];
-                const float v11 = temp->buf[j11];
+                const float v00 = temp.data[j00];
+                const float v01 = temp.data[j01];
+                const float v10 = temp.data[j10];
+                const float v11 = temp.data[j11];

                const float v0 = v00 * (1.0f - dx) + v01 * dx;
                const float v1 = v10 * (1.0f - dx) + v11 * dx;
@@ -861,23 +799,21 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli

                const int i = 3 * (y * nx3 + x) + c;

-                res->buf[i] = ((float(v2) / 255.0f) - m3[c]) / s3[c];
+                res->data[i] = ((float(v2) / 255.0f) - m3[c]) / s3[c];
            }
        }
    }
-    clip_image_u8_free(temp);

    return true;
 }

 void clip_free(clip_ctx * ctx) {
-    ggml_free(ctx->ctx_data);
+    ggml_free(ctx->ctx);
    gguf_free(ctx->ctx_gguf);
-
    delete ctx;
 }

-bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
+bool clip_image_encode(const clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
    if (!ctx->has_vision_encoder) {
        printf("This gguf file seems to have no vision encoder\n");
        return false;
@@ -889,7 +825,8 @@ bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f3
    return clip_image_batch_encode(ctx, n_threads, &imgs, vec);
 }

-bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) {
+bool clip_image_batch_encode(const clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) {
+
    if (!ctx->has_vision_encoder) {
        printf("This gguf file seems to have no vision encoder\n");
        return false;
@@ -901,29 +838,29 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
    }

    // reset alloc buffer to clean the memory from previous invocations
-    ggml_allocr_reset(ctx->compute_alloc);
+    ggml_allocr_reset(ctx->alloc);

    // build the inference graph
    ggml_cgraph * gf = clip_image_build_graph(ctx, imgs);
-    ggml_allocr_alloc_graph(ctx->compute_alloc, gf);
+    ggml_allocr_alloc_graph(ctx->alloc, gf);

-    if (ggml_backend_is_cpu(ctx->backend)) {
-        ggml_backend_cpu_set_n_threads(ctx->backend, n_threads);
+    struct ggml_cplan plan = ggml_graph_plan(gf, n_threads);
+    if (plan.work_size > 0) {
+        plan.work_data = (uint8_t *)malloc(plan.work_size);
    }

-#ifdef GGML_USE_METAL
-    if (ggml_backend_is_metal(ctx->backend)) {
-        ggml_backend_metal_set_n_cb(ctx->backend, n_threads);
-    }
-#endif
-
-    ggml_backend_graph_compute(ctx->backend, gf);
+    ggml_graph_compute(gf, &plan);

    // the last node is the embedding tensor
-    struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 1];
+struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 1];

    // copy the embeddings to the location passed by the user
-    ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
+    memcpy(vec, ggml_get_data_f32(embeddings), ggml_nbytes(embeddings));
+
+    if (plan.work_size > 0) {
+        free(plan.work_data);
+    }
+
    return true;
 }

@@ -932,32 +869,31 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
    ggml_type type = GGML_TYPE_Q4_1;

    switch (itype) {
-        case 2:
-            type = GGML_TYPE_Q4_0;
-            break;
-        case 3:
-            type = GGML_TYPE_Q4_1;
-            break;
-        case 6:
-            type = GGML_TYPE_Q5_0;
-            break;
-        case 7:
-            type = GGML_TYPE_Q5_1;
-            break;
-        case 8:
-            type = GGML_TYPE_Q8_0;
-            break;
-        default:
-            fprintf(stderr, "%s: invalid quantization type %d\n", __func__, itype);
-            return false;
+    case 2:
+        type = GGML_TYPE_Q4_0;
+        break;
+    case 3:
+        type = GGML_TYPE_Q4_1;
+        break;
+    case 6:
+        type = GGML_TYPE_Q5_0;
+        break;
+    case 7:
+        type = GGML_TYPE_Q5_1;
+        break;
+    case 8:
+        type = GGML_TYPE_Q8_0;
+        break;
+    default:
+        fprintf(stderr, "%s: invalid quantization type %d\n", __func__, itype);
+        return false;
    };

-    auto * ctx_clip = clip_model_load(fname_inp, 2);
-
+    auto ctx_clip = clip_model_load(fname_inp, 2);
    const auto & ctx_src = ctx_clip->ctx_gguf;
-    const auto & ctx_data = ctx_clip->ctx_data;
+    const auto & ctx_data = ctx_clip->ctx;

-    auto * ctx_out = gguf_init_empty();
+    auto ctx_out = gguf_init_empty();
    gguf_set_kv(ctx_out, ctx_src);
    gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
    gguf_set_val_u32(ctx_out, "general.file_type", itype);
@@ -1006,7 +942,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
        }

        // quantize only 2D tensors
-        quantize &= (ggml_n_dims(cur) == 2);
+        quantize &= (cur->n_dims == 2);

        if (quantize) {
            new_type = type;
@@ -1079,7 +1015,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
            fout.put(0);
        }

-        printf("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize,
+        printf("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), cur->n_dims, quantize,
               orig_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
    }

@@ -1095,8 +1031,8 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
    gguf_free(ctx_out);

    {
-        printf("%s: original  size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0);
-        printf("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0);
+        printf("%s: original size  = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0);
+        printf("%s: quantized size  = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0);

        int64_t sum_all = 0;
        for (size_t i = 0; i < hist_all.size(); ++i) {
@@ -1113,16 +1049,16 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
    return true;
 }

-int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
+int clip_n_mmproj_embd(struct clip_ctx * ctx) {
    return ctx->vision_model.mm_2_b->ne[0];
 }

-int clip_n_patches(const struct clip_ctx * ctx) {
+int clip_n_patches(struct clip_ctx * ctx) {
    auto & params = ctx->vision_model.hparams;

    return (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
 }

-size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
+size_t clip_embd_nbytes(struct clip_ctx * ctx) {
    return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx) * sizeof(float);
 }
--- a/examples/llava/clip.h
+++ b/examples/llava/clip.h
@@ -1,22 +1,7 @@
 #ifndef CLIP_H
 #define CLIP_H

-#include <stddef.h>
-#include <stdint.h>
-
-#ifdef LLAMA_SHARED
-#    if defined(_WIN32) && !defined(__MINGW32__)
-#        ifdef LLAMA_BUILD
-#            define CLIP_API __declspec(dllexport)
-#        else
-#            define CLIP_API __declspec(dllimport)
-#        endif
-#    else
-#        define CLIP_API __attribute__ ((visibility ("default")))
-#    endif
-#else
-#    define CLIP_API
-#endif
+#include "ggml.h"

 struct clip_ctx;

@@ -35,14 +20,30 @@ struct clip_vision_hparams {
    float eps;
 };

-CLIP_API struct clip_ctx * clip_model_load(const char * fname, int verbosity);
+struct clip_ctx * clip_model_load(const char * fname, const int verbosity);

-CLIP_API void clip_free(struct clip_ctx * ctx);
+void clip_free(struct clip_ctx * ctx);

-CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx);
+size_t clip_embd_nbytes(struct clip_ctx * ctx);
+int clip_n_patches(struct clip_ctx * ctx);
+int clip_n_mmproj_embd(struct clip_ctx * ctx);

-CLIP_API int clip_n_patches    (const struct clip_ctx * ctx);
-CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx);
+// RGB uint8 image
+struct clip_image_u8 {
+    int nx;
+    int ny;
+    uint8_t * data;
+    size_t size;
+};
+
+// RGB float32 image (NHWC)
+// Memory layout: RGBRGBRGB...
+struct clip_image_f32 {
+    int nx;
+    int ny;
+    float * data;
+    size_t size;
+};

 struct clip_image_u8_batch {
    struct clip_image_u8 * data;
@@ -54,22 +55,16 @@ struct clip_image_f32_batch {
    size_t size;
 };

-CLIP_API struct clip_image_u8  * clip_image_u8_init ();
-CLIP_API struct clip_image_f32 * clip_image_f32_init();
+struct clip_image_u8 * make_clip_image_u8();
+struct clip_image_f32 * make_clip_image_f32();
+bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);
+bool clip_image_preprocess(const struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32 * res, const bool pad2square);
+bool clip_image_encode(const struct clip_ctx * ctx, const int n_threads, struct clip_image_f32 * img, float * vec);

-CLIP_API void clip_image_u8_free (struct clip_image_u8 * img);
-CLIP_API void clip_image_f32_free(struct clip_image_f32 * img);
+bool clip_image_batch_encode(const struct clip_ctx * ctx, const int n_threads, const struct clip_image_f32_batch * imgs,
+                             float * vec);

-CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);
-
-/** interpret bytes as an image file with length bytes_length, and use the result to populate img */
-CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img);
-
-CLIP_API bool clip_image_preprocess  (struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32 * res, bool pad2square);
-CLIP_API bool clip_image_encode      (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec);
-CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec);
-
-CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype);
+bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype);

 #ifdef __cplusplus
 }
--- a/examples/llava/convert-image-encoder-to-gguf.py
+++ b/examples/llava/convert-image-encoder-to-gguf.py
@@ -5,7 +5,7 @@ import json
 import torch
 import numpy as np
 from gguf import *
-from transformers import CLIPModel, CLIPProcessor, CLIPVisionModel
+from transformers import CLIPModel, CLIPProcessor

 TEXT = "clip.text"
 VISION = "clip.vision"
@@ -51,7 +51,7 @@ def bytes_to_unicode():
    The reversible bpe codes work on unicode strings.
    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
-    This is a significant percentage of your normal, say, 32K bpe vocab.
+    This is a signficant percentage of your normal, say, 32K bpe vocab.
    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
    And avoids mapping to whitespace/control characters the bpe code barfs on.
    """
@@ -78,19 +78,11 @@ ap.add_argument("--text-only", action="store_true", required=False,
                help="Save a text-only model. It can't be used to encode images")
 ap.add_argument("--vision-only", action="store_true", required=False,
                help="Save a vision-only model. It can't be used to encode texts")
-ap.add_argument("--clip_model_is_vision", action="store_true", required=False,
-                help="The clip model is a pure vision model (ShareGPT4V vision extract for example)")
 ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.")
 ap.add_argument("--image-mean", nargs=3, type=float, required=False, help="Override image mean values")
 ap.add_argument("--image-std", nargs=3, type=float, required=False, help="Override image std values")
 ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
-# Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711
-default_image_mean = [0.48145466, 0.4578275, 0.40821073]
-default_image_std = [0.26862954, 0.26130258, 0.27577711]
-ap.add_argument('--image_mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None)
-ap.add_argument('--image_std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None)

-# with proper
 args = ap.parse_args()


@@ -104,22 +96,15 @@ if args.use_f32:
 # output in the same directory as the model if output_dir is None
 dir_model = args.model_dir

-if args.clip_model_is_vision:
-    vocab = None
-    tokens = None
-else:
-    with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f:
-        vocab = json.load(f)
-        tokens = [key for key in vocab]
+
+with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f:
+    vocab = json.load(f)
+    tokens = [key for key in vocab]

 with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
    config = json.load(f)
-    if args.clip_model_is_vision:
-        v_hparams = config
-        t_hparams = None
-    else:
-        v_hparams = config["vision_config"]
-        t_hparams = config["text_config"]
+    v_hparams = config["vision_config"]
+    t_hparams = config["text_config"]

 # possible data types
 #   ftype == 0 -> float32
@@ -132,12 +117,9 @@ ftype = 1
 if args.use_f32:
    ftype = 0

-if args.clip_model_is_vision:
-    model = CLIPVisionModel.from_pretrained(dir_model)
-    processor = None
-else:
-    model = CLIPModel.from_pretrained(dir_model)
-    processor = CLIPProcessor.from_pretrained(dir_model)
+
+model = CLIPModel.from_pretrained(dir_model)
+processor = CLIPProcessor.from_pretrained(dir_model)

 fname_middle = None
 has_text_encoder = True
@@ -146,13 +128,13 @@ has_llava_projector = False
 if args.text_only:
    fname_middle = "text-"
    has_vision_encoder = False
+elif args.vision_only:
+    fname_middle = "vision-"
+    has_text_encoder = False
 elif args.llava_projector is not None:
    fname_middle = "mmproj-"
    has_text_encoder = False
    has_llava_projector = True
-elif args.vision_only:
-    fname_middle = "vision-"
-    has_text_encoder = False
 else:
    fname_middle = ""

@@ -200,12 +182,8 @@ if has_vision_encoder:
    block_count = v_hparams["num_hidden_layers"] - 1 if has_llava_projector else v_hparams["num_hidden_layers"]
    fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), block_count)

-    if processor is not None:
-        image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean
-        image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std
-    else:
-        image_mean = args.image_mean if args.image_mean is not None else default_image_mean
-        image_std = args.image_std if args.image_std is not None else default_image_std
+    image_mean = processor.image_processor.image_mean if args.image_mean is None else args.image_mean
+    image_std = processor.image_processor.image_std if args.image_std is None else args.image_std
    fout.add_array("clip.vision.image_mean", image_mean)
    fout.add_array("clip.vision.image_std", image_std)

--- a/examples/llava/llava-cli.cpp
+++ b/examples/llava/llava-cli.cpp
@@ -1,255 +0,0 @@
-#include "ggml.h"
-#include "common.h"
-#include "clip.h"
-#include "llava.h"
-#include "llama.h"
-
-#include "base64.hpp"
-
-#include <cstdio>
-#include <cstdlib>
-#include <vector>
-
-static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
-    int N = (int) tokens.size();
-    for (int i = 0; i < N; i += n_batch) {
-        int n_eval = (int) tokens.size() - i;
-        if (n_eval > n_batch) {
-            n_eval = n_batch;
-        }
-        if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
-            fprintf(stderr, "%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
-            return false;
-        }
-        *n_past += n_eval;
-    }
-    return true;
-}
-
-static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) {
-    std::vector<llama_token> tokens;
-    tokens.push_back(id);
-    return eval_tokens(ctx_llama, tokens, 1, n_past);
-}
-
-static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){
-    std::string              str2     = str;
-    std::vector<llama_token> embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos);
-    eval_tokens(ctx_llama, embd_inp, n_batch, n_past);
-    return true;
-}
-
-static const char * sample(struct llama_sampling_context * ctx_sampling,
-                           struct llama_context * ctx_llama,
-                           int * n_past) {
-    const llama_token id = llama_sampling_sample(ctx_sampling, ctx_llama, NULL);
-    llama_sampling_accept(ctx_sampling, ctx_llama, id, true);
-    static std::string ret;
-    if (id == llama_token_eos(llama_get_model(ctx_llama))) {
-        ret = "</s>";
-    } else {
-        ret = llama_token_to_piece(ctx_llama, id);
-    }
-    eval_id(ctx_llama, id, n_past);
-    return ret.c_str();
-}
-
-static const char* IMG_BASE64_TAG_BEGIN = "<img src=\"data:image/jpeg;base64,";
-static const char* IMG_BASE64_TAG_END = "\">";
-
-static void find_image_tag_in_prompt(const std::string& prompt, size_t& begin_out, size_t& end_out) {
-    begin_out = prompt.find(IMG_BASE64_TAG_BEGIN);
-    end_out = prompt.find(IMG_BASE64_TAG_END, (begin_out == std::string::npos) ? 0UL : begin_out);
-}
-
-static bool prompt_contains_image(const std::string& prompt) {
-    size_t begin, end;
-    find_image_tag_in_prompt(prompt, begin, end);
-    return (begin != std::string::npos);
-}
-
-// replaces the base64 image tag in the prompt with `replacement`
-static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip_ctx * ctx_clip, int n_threads, const std::string& prompt) {
-    size_t img_base64_str_start, img_base64_str_end;
-    find_image_tag_in_prompt(prompt, img_base64_str_start, img_base64_str_end);
-    if (img_base64_str_start == std::string::npos || img_base64_str_end == std::string::npos) {
-        fprintf(stderr, "%s: invalid base64 image tag. must be %s<base64 byte string>%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END);
-        return NULL;
-    }
-
-    auto base64_bytes_start = img_base64_str_start + strlen(IMG_BASE64_TAG_BEGIN);
-    auto base64_bytes_count = img_base64_str_end - base64_bytes_start;
-    auto base64_str = prompt.substr(base64_bytes_start, base64_bytes_count );
-
-    auto required_bytes = base64::required_encode_size(base64_str.size());
-    auto img_bytes = std::vector<unsigned char>(required_bytes);
-    base64::decode(base64_str.begin(), base64_str.end(), img_bytes.begin());
-
-    auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, img_bytes.data(), img_bytes.size());
-    if (!embed) {
-        fprintf(stderr, "%s: could not load image from base64 string.\n", __func__);
-        return NULL;
-    }
-
-    return embed;
-}
-
-static std::string remove_image_from_prompt(const std::string& prompt, const char * replacement = "") {
-    size_t begin, end;
-    find_image_tag_in_prompt(prompt, begin, end);
-    if (begin == std::string::npos || end == std::string::npos) {
-        return prompt;
-    }
-    auto pre = prompt.substr(0, begin);
-    auto post = prompt.substr(end + strlen(IMG_BASE64_TAG_END));
-    return pre + replacement + post;
-}
-
-struct llava_context {
-    struct clip_ctx * ctx_clip = NULL;
-    struct llama_context * ctx_llama = NULL;
-    struct llama_model * model = NULL;
-};
-
-static void show_additional_info(int /*argc*/, char ** argv) {
-    fprintf(stderr, "\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
-    fprintf(stderr, "  note: a lower temperature value like 0.1 is recommended for better quality.\n");
-}
-
-static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_params * params) {
-
-    // load and preprocess the image
-    llava_image_embed * embed = NULL;
-    auto prompt = params->prompt;
-    if (prompt_contains_image(prompt)) {
-        if (!params->image.empty()) {
-            fprintf(stderr, "using base64 encoded image instead of command line image path\n");
-        }
-        embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->n_threads, prompt);
-        if (!embed) {
-            fprintf(stderr, "%s: can't load image from prompt\n", __func__);
-            return NULL;
-        }
-        params->prompt = remove_image_from_prompt(prompt);
-    } else {
-        embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->n_threads, params->image.c_str());
-        if (!embed) {
-            fprintf(stderr, "%s: is %s really an image file?\n", __func__, params->image.c_str());
-            return NULL;
-        }
-    }
-
-    return embed;
-}
-
-static void process_prompt(struct llava_context * ctx_llava, struct llava_image_embed * image_embed, gpt_params * params, const std::string & prompt) {
-    int n_past = 0;
-
-    const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;
-    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx_llava->ctx_llama));
-
-    // llava chat format is "<system_prompt>\nUSER:<image_embeddings>\n<textual_prompt>\nASSISTANT:"
-    eval_string(ctx_llava->ctx_llama, "A chat between a curious human and an artificial intelligence assistant.  The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:", params->n_batch, &n_past, add_bos);
-    llava_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past);
-    eval_string(ctx_llava->ctx_llama, (prompt + "\nASSISTANT:").c_str(), params->n_batch, &n_past, false);
-
-    // generate the response
-
-    fprintf(stderr, "\n");
-
-    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams);
-
-    for (int i = 0; i < max_tgt_len; i++) {
-        const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past);
-        if (strcmp(tmp, "</s>") == 0) break;
-
-        printf("%s", tmp);
-        fflush(stdout);
-    }
-
-    llama_sampling_free(ctx_sampling);
-    printf("\n");
-}
-
-
-static struct llava_context * llava_init(gpt_params * params) {
-    const char * clip_path = params->mmproj.c_str();
-
-    auto prompt = params->prompt;
-    if (prompt.empty()) {
-        prompt = "describe the image in detail.";
-    }
-
-    auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
-
-    llama_backend_init(params->numa);
-
-    llama_model_params model_params = llama_model_params_from_gpt_params(*params);
-
-    llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
-    if (model == NULL) {
-        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
-        return NULL;
-    }
-
-    llama_context_params ctx_params = llama_context_params_from_gpt_params(*params);
-    ctx_params.n_ctx           = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings
-
-    llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
-
-    if (ctx_llama == NULL) {
-        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
-        return NULL;
-    }
-
-    auto ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
-
-    ctx_llava->ctx_llama = ctx_llama;
-    ctx_llava->ctx_clip = ctx_clip;
-    ctx_llava->model = model;
-    return ctx_llava;
-}
-
-static void llava_free(struct llava_context * ctx_llava) {
-    if (ctx_llava->ctx_clip) {
-        clip_free(ctx_llava->ctx_clip);
-        ctx_llava->ctx_clip = NULL;
-    }
-
-    llama_free(ctx_llava->ctx_llama);
-    llama_free_model(ctx_llava->model);
-    llama_backend_free();
-}
-
-int main(int argc, char ** argv) {
-    ggml_time_init();
-
-    gpt_params params;
-
-    if (!gpt_params_parse(argc, argv, params)) {
-        show_additional_info(argc, argv);
-        return 1;
-    }
-    if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
-        gpt_print_usage(argc, argv, params);
-        show_additional_info(argc, argv);
-        return 1;
-    }
-
-    auto ctx_llava = llava_init(&params);
-    if (ctx_llava == NULL) {
-        fprintf(stderr, "%s: error: failed to init llava\n", __func__);
-        return 1;
-    }
-
-    auto image_embed = load_image(ctx_llava, &params);
-
-    // process the prompt
-    process_prompt(ctx_llava, image_embed, &params, params.prompt);
-
-    llama_print_timings(ctx_llava->ctx_llama);
-
-    llava_image_embed_free(image_embed);
-    llava_free(ctx_llava);
-    return 0;
-}
--- a/examples/llava/llava-utils.h
+++ b/examples/llava/llava-utils.h
@@ -0,0 +1,147 @@
+#pragma once
+
+// this one and clip lib will be eventually merged to a single lib, let's keep it this way for now
+
+#include "common.h"
+#include "llama.h"
+
+#include <cstdio>
+#include <cstdlib>
+#include <vector>
+
+inline bool eval_image_embd(llama_context * ctx_llama, float * embd, int N, int n_batch, int * n_past) {
+    int n_embd  = llama_n_embd(llama_get_model(ctx_llama));
+
+    for (int i = 0; i < N; i += n_batch) {
+        int n_eval = N - i;
+        if (n_eval > n_batch) {
+            n_eval = n_batch;
+        }
+        llama_batch batch = {int32_t(n_eval), nullptr, (embd+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
+        if (llama_decode(ctx_llama, batch)) {
+            fprintf(stderr, "%s : failed to eval\n", __func__);
+            return false;
+        }
+        *n_past += n_eval;
+    }
+    return true;
+}
+
+inline bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
+    int N = (int) tokens.size();
+    for (int i = 0; i < N; i += n_batch) {
+        int n_eval = (int) tokens.size() - i;
+        if (n_eval > n_batch) {
+            n_eval = n_batch;
+        }
+        if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
+            fprintf(stderr, "%s : failed to eval\n", __func__);
+            return false;
+        }
+        *n_past += n_eval;
+    }
+    return true;
+}
+
+inline bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) {
+    std::vector<llama_token> tokens;
+    tokens.push_back(id);
+    return eval_tokens(ctx_llama, tokens, 1, n_past);
+}
+
+inline bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){
+    std::string              str2     = str;
+    std::vector<llama_token> embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos);
+    eval_tokens(ctx_llama, embd_inp, n_batch, n_past);
+    return true;
+}
+
+// TODO: use common/sampling.h
+inline llama_token sample_id(llama_context * ctx_llama, gpt_params & params) {
+    auto & sparams = params.sparams;
+
+    // out of user input, sample next token
+    const float   temp      = sparams.temp;
+    const int32_t top_k     = sparams.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx_llama)) : sparams.top_k;
+    const float   top_p     = sparams.top_p;
+    const float   tfs_z     = sparams.tfs_z;
+    const float   typical_p = sparams.typical_p;
+    // const int32_t repeat_last_n   = sparams.repeat_last_n < 0 ? n_ctx : sparams.repeat_last_n;
+    // const float   repeat_penalty  = sparams.repeat_penalty;
+    // const float   alpha_presence  = sparams.presence_penalty;
+    // const float   alpha_frequency = sparams.frequency_penalty;
+    const int     mirostat     = sparams.mirostat;
+    const float   mirostat_tau = sparams.mirostat_tau;
+    const float   mirostat_eta = sparams.mirostat_eta;
+    // const bool    penalize_nl     = sparams.penalize_nl;
+
+    llama_token id = 0;
+    {
+        auto logits  = llama_get_logits(ctx_llama);
+        auto n_vocab = llama_n_vocab(llama_get_model(ctx_llama));
+
+        // Apply params.logit_bias map
+        for (auto it = sparams.logit_bias.begin(); it != sparams.logit_bias.end(); it++) {
+            logits[it->first] += it->second;
+        }
+
+        std::vector<llama_token_data> candidates;
+        candidates.reserve(n_vocab);
+        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+            candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+        }
+
+        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+
+        // TODO: Apply penalties
+        // float nl_logit = logits[llama_token_nl(ctx)];
+        // auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
+        // llama_sample_repetition_penalty(ctx, &candidates_p,
+        //      last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
+        //      last_n_repeat, repeat_penalty);
+        // llama_sample_frequency_and_presence_penalties(ctx, &candidates_p,
+        // last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
+        // last_n_repeat, alpha_frequency, alpha_presence);
+        // if (!penalize_nl) {
+        //     logits[llama_token_nl(ctx)] = nl_logit;
+        // }
+
+        if (temp <= 0) {
+              // Greedy sampling
+            id = llama_sample_token_greedy(ctx_llama, &candidates_p);
+        } else {
+            if (mirostat == 1) {
+                static float mirostat_mu = 2.0f * mirostat_tau;
+                const  int mirostat_m    = 100;
+                llama_sample_temp(ctx_llama, &candidates_p, temp);
+                id = llama_sample_token_mirostat(ctx_llama, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
+            } else if (mirostat == 2) {
+                static float mirostat_mu = 2.0f * mirostat_tau;
+                llama_sample_temp(ctx_llama, &candidates_p, temp);
+                id = llama_sample_token_mirostat_v2(ctx_llama, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
+            } else {
+                  // Temperature sampling
+                llama_sample_top_k(ctx_llama, &candidates_p, top_k, 1);
+                llama_sample_tail_free(ctx_llama, &candidates_p, tfs_z, 1);
+                llama_sample_typical(ctx_llama, &candidates_p, typical_p, 1);
+                llama_sample_top_p(ctx_llama, &candidates_p, top_p, 1);
+                llama_sample_temp(ctx_llama, &candidates_p, temp);
+                id = llama_sample_token(ctx_llama, &candidates_p);
+            }
+        }
+    }
+
+    return id;
+}
+
+inline const char * sample(struct llama_context * ctx_llama, gpt_params & params, int * n_past) {
+    int id = sample_id(ctx_llama, params);
+    static std::string ret;
+    if (id == llama_token_eos(llama_get_model(ctx_llama))) {
+        ret = "</s>";
+    } else {
+        ret = llama_token_to_piece(ctx_llama, id);
+    }
+    eval_id(ctx_llama, id, n_past);
+    return ret.c_str();
+}
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -1,163 +1,164 @@
 #include "clip.h"
+#include "llava-utils.h"
 #include "common.h"
 #include "llama.h"
-#include "llava.h"

 #include <cstdio>
 #include <cstdlib>
 #include <vector>

-#include "base64.hpp"
-
-static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos) {
-    clip_image_f32 * img_res = clip_image_f32_init();
-    if (!clip_image_preprocess(ctx_clip, img, img_res, /*pad2square =*/ true)) {
-        fprintf(stderr, "%s: unable to preprocess image\n", __func__);
-        clip_image_f32_free(img_res);
-        return false;
-    }
-
-    *n_img_pos = clip_n_patches(ctx_clip);
-
-    const int64_t t_img_enc_start_us = ggml_time_us();
-    bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd);
-    clip_image_f32_free(img_res);
-    if (!encoded) {
-        fprintf(stderr, "Unable to encode image\n");
-
-        return false;
-    }
-
-    const int64_t t_img_enc_end_us = ggml_time_us();
-    float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0;
-
-    printf("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos);
-
-    return true;
+static void show_additional_info(int /*argc*/, char ** argv) {
+    printf("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
+    printf("  note: a lower temperature value like 0.1 is recommended for better quality.\n");
 }

-bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip) {
-        // make sure that the correct mmproj was used, i.e., compare apples to apples
-    int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama));
-    auto n_image_embd = clip_n_mmproj_embd(ctx_clip);
-    if (n_image_embd != n_llama_embd) {
-        printf("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd);
-        return false;
-    }
-    return true;
-}
+int main(int argc, char ** argv) {
+    ggml_time_init();
+
+    gpt_params params;
+
+    if (!gpt_params_parse(argc, argv, params)) {
+        show_additional_info(argc, argv);
+        return 1;
+    }
+
+    if (params.mmproj.empty() || params.image.empty()) {
+        gpt_print_usage(argc, argv, params);
+        show_additional_info(argc, argv);
+        return 1;
+    }
+
+    const char * clip_path = params.mmproj.c_str();
+    const char * img_path = params.image.c_str();
+
+    if (params.prompt.empty()) {
+        params.prompt = "describe the image in detail.";
+    }
+
+    auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
+
+    // load and preprocess the image
+    clip_image_u8 img;
+    clip_image_f32 img_res;
+
+    if (!clip_image_load_from_file(img_path, &img)) {
+        fprintf(stderr, "%s: is %s really an image file?\n", __func__, img_path);
+
+        clip_free(ctx_clip);
+        return 1;
+    }
+
+    if (!clip_image_preprocess(ctx_clip, &img, &img_res, /*pad2square =*/ true)) {
+        fprintf(stderr, "%s: unable to preprocess %s\n", __func__, img_path);
+
+        clip_free(ctx_clip);
+        return 1;
+    }
+
+    int n_img_pos  = clip_n_patches(ctx_clip);
+    int n_img_embd = clip_n_mmproj_embd(ctx_clip);

-static bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) {
    float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip));
+
    if (!image_embd) {
        fprintf(stderr, "Unable to allocate memory for image embeddings\n");
+
+        return 1;
+    }
+
+    const int64_t t_img_enc_start_us = ggml_time_us();
+    if (!clip_image_encode(ctx_clip, params.n_threads, &img_res, image_embd)) {
+        fprintf(stderr, "Unable to encode image\n");
+
+        return 1;
+    }
+    const int64_t t_img_enc_end_us = ggml_time_us();
+
+    // we get the embeddings, free up the memory required for CLIP
+    clip_free(ctx_clip);
+
+    llama_backend_init(params.numa);
+
+    llama_model_params model_params              = llama_model_default_params();
+                       model_params.n_gpu_layers = params.n_gpu_layers;
+                       model_params.main_gpu     = params.main_gpu;
+                       model_params.tensor_split = params.tensor_split;
+                       model_params.use_mmap     = params.use_mmap;
+                       model_params.use_mlock    = params.use_mlock;
+
+    llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
+    if (model == NULL) {
+        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
+        return 1;
+    }
+
+    llama_context_params ctx_params = llama_context_default_params();
+
+    ctx_params.n_ctx           = params.n_ctx < 2048 ? 2048 : params.n_ctx; // we need a longer context size to process image embeddings
+    ctx_params.n_threads       = params.n_threads;
+    ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
+    ctx_params.seed            = params.seed;
+
+    llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
+
+    if (ctx_llama == NULL) {
+        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
+        return 1;
+    }
+
+    // make sure that the correct mmproj was used, i.e., compare apples to apples
+    const int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama));
+
+    if (n_img_embd != n_llama_embd) {
+        printf("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_img_embd, n_llama_embd);
+
+        llama_free(ctx_llama);
+        llama_free_model(model);
+        llama_backend_free();
        free(image_embd);
-        return false;
+
+        return 1;
    }

-    int n_img_pos;
-    if (!encode_image_with_clip(ctx_clip, n_threads, img, image_embd, &n_img_pos)) {
-        fprintf(stderr, "%s: cannot encode image, aborting\n", __func__);
-        free(image_embd);
-        return false;
-    }
-    *image_embd_out = image_embd;
-    *n_img_pos_out = n_img_pos;
+    // process the prompt
+    // llava chat format is "<system_prompt>USER: <image_embeddings>\n<textual_prompt>\nASSISTANT:"

-    return true;
-}
-
-bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed, int n_batch, int * n_past) {
-    int n_embd  = llama_n_embd(llama_get_model(ctx_llama));
-
-    for (int i = 0; i < image_embed->n_image_pos; i += n_batch) {
-        int n_eval = image_embed->n_image_pos - i;
-        if (n_eval > n_batch) {
-            n_eval = n_batch;
-        }
-        llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
-        if (llama_decode(ctx_llama, batch)) {
-            fprintf(stderr, "%s : failed to eval\n", __func__);
-            return false;
-        }
-        *n_past += n_eval;
-    }
-    return true;
-}
-
-LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length) {
-    clip_image_u8 * img = clip_image_u8_init();
-    if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img)) {
-        clip_image_u8_free(img);
-        fprintf(stderr, "%s: can't load image from bytes, is it a valid image?", __func__);
-        return NULL;
-    }
-
-    float* image_embed = NULL;
-    int n_image_pos = 0;
-    bool image_embed_result = llava_image_embed_make_with_clip_img(ctx_clip, n_threads, img, &image_embed, &n_image_pos);
-    if (!image_embed_result) {
-        clip_image_u8_free(img);
-        fprintf(stderr, "%s: coulnd't embed the image\n", __func__);
-        return NULL;
-    }
-
-    clip_image_u8_free(img);
-    auto result = (llava_image_embed*)malloc(sizeof(llava_image_embed));
-    result->embed = image_embed;
-    result->n_image_pos = n_image_pos;
-    return result;
-}
-
-static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long *sizeOut) {
-    auto file = fopen(path, "rb");
-    if (file == NULL) {
-        fprintf(stderr, "%s: can't read file %s\n", __func__, path);
-        return false;
-    }
-
-    fseek(file, 0, SEEK_END);
-    auto fileSize = ftell(file);
-    fseek(file, 0, SEEK_SET);
-
-    auto buffer = (unsigned char *)malloc(fileSize); // Allocate memory to hold the file data
-    if (buffer == NULL) {
-        fprintf(stderr, "%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path);
-        perror("Memory allocation error");
-        fclose(file);
-        return false;
-    }
-    errno = 0;
-    size_t ret = fread(buffer, 1, fileSize, file); // Read the file into the buffer
-    if (ferror(file)) {
-        die_fmt("read error: %s", strerror(errno));
-    }
-    if (ret != (size_t) fileSize) {
-        die("unexpectedly reached end of file");
-    }
-    fclose(file); // Close the file
-
-    *bytesOut = buffer;
-    *sizeOut = fileSize;
-    return true;
-}
-
-LLAVA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path) {
-    unsigned char* image_bytes;
-    long image_bytes_length;
-    auto loaded = load_file_to_bytes(image_path, &image_bytes, &image_bytes_length);
-    if (!loaded) {
-        fprintf(stderr, "%s: failed to load %s\n", __func__, image_path);
-        return NULL;
-    }
-
-    auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, image_bytes, image_bytes_length);
-    free(image_bytes);
-
-    return embed;
-}
-
-LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed) {
-    free(embed->embed);
-    free(embed);
+    int n_past = 0;
+
+    const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
+
+    eval_string(ctx_llama, "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:", params.n_batch, &n_past, true);
+    eval_image_embd(ctx_llama, image_embd, n_img_pos, params.n_batch, &n_past);
+    eval_string(ctx_llama, (params.prompt + "\nASSISTANT:").c_str(), params.n_batch, &n_past, false);
+
+    // generate the response
+
+    printf("\n");
+    printf("prompt: '%s'\n", params.prompt.c_str());
+    printf("\n");
+
+    for (int i = 0; i < max_tgt_len; i++) {
+        const char * tmp = sample(ctx_llama, params, &n_past);
+        if (strcmp(tmp, "</s>") == 0) break;
+
+        printf("%s", tmp);
+        fflush(stdout);
+    }
+
+    printf("\n");
+
+    {
+        const float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0;
+
+        printf("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / n_img_pos);
+    }
+
+    llama_print_timings(ctx_llama);
+
+    llama_free(ctx_llama);
+    llama_free_model(model);
+    llama_backend_free();
+    free(image_embd);
+
+    return 0;
 }
--- a/examples/llava/llava.h
+++ b/examples/llava/llava.h
@@ -1,50 +0,0 @@
-#ifndef LLAVA_H
-#define LLAVA_H
-
-#include "ggml.h"
-
-
-#ifdef LLAMA_SHARED
-#    if defined(_WIN32) && !defined(__MINGW32__)
-#        ifdef LLAMA_BUILD
-#            define LLAVA_API __declspec(dllexport)
-#        else
-#            define LLAVA_API __declspec(dllimport)
-#        endif
-#    else
-#        define LLAVA_API __attribute__ ((visibility ("default")))
-#    endif
-#else
-#    define LLAVA_API
-#endif
-
-struct clip_ctx;
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct llava_image_embed {
-    float * embed;
-    int n_image_pos;
-};
-
-/** sanity check for clip <-> llava embed size match */
-LLAVA_API bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip);
-
-/** build an image embed from image file bytes */
-LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length);
-/** build an image embed from a path to an image filename */
-LLAVA_API struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx * ctx_clip, int n_threads, const char * image_path);
-LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed);
-/** free an embedding made with llava_image_embed_make_* */
-
-/** write the image represented by embed into the llama context with batch size n_batch, starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed. */
-LLAVA_API bool llava_eval_image_embed(struct llama_context * ctx_llama, const struct llava_image_embed * embed, int n_batch, int * n_past);
-
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
--- a/examples/lookahead/CMakeLists.txt
+++ b/examples/lookahead/CMakeLists.txt
@@ -1,5 +0,0 @@
-set(TARGET lookahead)
-add_executable(${TARGET} lookahead.cpp)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- a/examples/lookahead/README.md
+++ b/examples/lookahead/README.md
@@ -1,7 +0,0 @@
-# llama.cpp/examples/lookahead
-
-Demonstration of lookahead decoding technique:
-
-https://lmsys.org/blog/2023-11-21-lookahead-decoding/
-
-More info: https://github.com/ggerganov/llama.cpp/pull/4207
--- a/examples/lookahead/lookahead.cpp
+++ b/examples/lookahead/lookahead.cpp
@@ -1,487 +0,0 @@
-#include "common.h"
-#include "llama.h"
-
-#include <cmath>
-#include <cstdio>
-#include <string>
-#include <vector>
-
-struct ngram_data {
-    bool active = false;
-
-    llama_seq_id seq_id = -1;
-
-    std::vector<int> i_batch;
-
-    std::vector<llama_token> tokens;
-};
-
-// n-gram container
-struct ngram_container {
-    ngram_container(int n_vocab, int N, int G) {
-        cnt.resize(n_vocab);
-        head.resize(n_vocab);
-        tokens.resize(n_vocab * G * (N - 1));
-    }
-
-    int n_total = 0;
-
-    std::vector<int> cnt;
-    std::vector<int> head;
-
-    // [n_vocab][G][N - 1]
-    // for each token of the vocab, keep a ring-buffer of capacity G of n-grams of size N - 1
-    std::vector<llama_token> tokens;
-};
-
-int main(int argc, char ** argv) {
-    gpt_params params;
-
-    if (gpt_params_parse(argc, argv, params) == false) {
-        return 1;
-    }
-
-    const int W = 15; // lookahead window
-    const int N = 5;  // n-gram size
-    const int G = 15; // max verification n-grams
-
-    const bool dump_kv_cache = params.dump_kv_cache;
-
-#ifndef LOG_DISABLE_LOGS
-    log_set_target(log_filename_generator("lookahead", "log"));
-    LOG_TEE("Log start\n");
-    log_dump_cmdline(argc, argv);
-#endif // LOG_DISABLE_LOGS
-
-    // init llama.cpp
-    llama_backend_init(params.numa);
-
-    llama_model * model = NULL;
-    llama_context * ctx = NULL;
-
-    // load the target model
-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
-
-    // Tokenize the prompt
-    const bool add_bos = llama_should_add_bos_token(model);
-    LOG("add_bos tgt: %d\n", add_bos);
-
-    std::vector<llama_token> inp;
-    std::vector<llama_token> all;
-
-    inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
-    all = inp;
-
-    const int max_context_size     = llama_n_ctx(ctx);
-    const int max_tokens_list_size = max_context_size - 4;
-
-    if ((int) inp.size() > max_tokens_list_size) {
-        fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
-        return 1;
-    }
-
-    fprintf(stderr, "\n\n");
-
-    for (auto id : inp) {
-        fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
-    }
-
-    fflush(stderr);
-
-    const int n_input = inp.size();
-
-    const auto t_enc_start = ggml_time_us();
-
-    // eval the prompt
-    llama_decode(ctx, llama_batch_get_one( inp.data(), n_input - 1, 0,           0));
-    llama_decode(ctx, llama_batch_get_one(&inp.back(),           1, n_input - 1, 0));
-
-    for (int s = 1; s < W + G + 1; ++s) {
-        llama_kv_cache_seq_cp(ctx, 0, s, -1, -1);
-    }
-
-    const auto t_enc_end = ggml_time_us();
-
-    int n_predict = 0;
-    int n_accept  = 0;
-
-    int n_past = inp.size();
-
-    llama_token id = 0;
-
-    // used to determine end of generation
-    bool has_eos = false;
-
-    // for each decoded batch, we have at most W + G + 1 distinct sequences:
-    // seq_id == 0           : the current input token
-    // seq_id [1, W]         : tokens from the past N - 1 Jacobi iterations
-    // seq_id [W + 1, W + G] : verification n-grams
-    llama_batch batch = llama_batch_init(params.n_ctx, 0, W + G + 1);
-
-    // target model sampling context
-    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams);
-
-    // verification n-grams
-    std::vector<ngram_data> ngrams_cur(G);
-
-    // tokens for the past N - 1 Jacobi iterations
-    std::vector<llama_token> tokens_j_prev(W);
-    std::vector<std::vector<llama_token>> tokens_j(N - 1);
-    for (int j = 0; j < N - 1; j++) {
-        tokens_j[j].resize(W);
-
-        for (int i = 0; i < W; i++) {
-            // there are different ways to init these tokens
-            if (0) {
-                // initialize randomly from the prompt tokens
-                tokens_j[j][i] = all[1 + rand() % (all.size() - 1)];
-            } else {
-                // initialize with a sequence of increasing numbers
-                tokens_j[j][i] = 100 + i;
-            }
-        }
-    }
-
-    std::vector<llama_seq_id> seq_id_look;
-
-    // the input token belongs both to all sequences
-    std::vector<llama_seq_id> seq_id_all(W + G + 1);
-    for (int i = 0; i < W + G + 1; i++) {
-        seq_id_all[i] = i;
-    }
-
-    // here we keep adding new n-grams as we go
-    ngram_container ngrams_observed(llama_n_vocab(model), N, G);
-
-    // debug
-    struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, W + G + 1);
-
-    const auto t_dec_start = ggml_time_us();
-
-    // sample first token
-    {
-        id = llama_sampling_sample(ctx_sampling, ctx, NULL, 0);
-
-        llama_sampling_accept(ctx_sampling, ctx, id, true);
-
-        {
-            const std::string token_str = llama_token_to_piece(ctx, id);
-
-            printf("%s", token_str.c_str());
-            fflush(stdout);
-        }
-    }
-
-    while (true) {
-        // debug
-        if (dump_kv_cache) {
-            llama_kv_cache_view_update(ctx, &kvc_view);
-            dump_kv_cache_view_seqs(kvc_view, 40);
-        }
-
-        // build the mask from https://lmsys.org/blog/2023-11-21-lookahead-decoding/
-        //
-        // Example for W = 5, N = 4, G = 2:
-        // (I = input, L = lookahead, V = verification)
-        //
-        // Batch:  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
-        // T:        -2 -2 -2 -2 -1 -1 -1 -1 -1  0  0  0  0  0  0
-        // Info:   I  L  L  L  L  L  L  L  L  L  L  L  L  L  L  V  V  V  V  V  V
-        // Pos:    0  1  2  3  4  1  2  3  4  5  2  3  4  5  6  1  2  3  1  2  3   (+ n_past)
-        // Logits: 1  0  0  0  0  0  0  0  0  0  1  1  1  1  1  1  1  1  1  1  1
-        // ---------------------------------------------------------------------
-        // Seq:    0
-        //         1              1              1
-        //         2  2              2              2
-        //         3  3  3              3              3
-        //         4  4  4  4              4              4
-        //         5  5  5  5  5              5              5
-        //         6                                            6  6  6
-        //         7                                                     7  7  7
-        // ---------------------------------------------------------------------
-        //                                       |  |  |  |  |  |  |  |  |  |  |
-        //                                       V  V  V  V  V  |  |  |  |  |  |
-        //                                         j_tokens     |  |  |  |  |  |
-        //                                                      V  V  V  V  V  V
-        //                                                             id
-        {
-            llama_batch_clear(batch);
-
-            // current token - first token of the first level
-            llama_batch_add(batch, id, n_past, seq_id_all, true);
-
-            // verification n-grams - queue this before the lookahead tokens for less KV cache fragmentation
-            {
-                const int g_cur = ngrams_observed.cnt[id];
-
-                ngrams_cur.resize(g_cur);
-                for (int g = 0; g < g_cur; g++) {
-                    ngrams_cur[g].active = true;
-                    ngrams_cur[g].tokens.resize(N);
-                    ngrams_cur[g].i_batch.resize(N);
-                    ngrams_cur[g].seq_id = W + 1 + g;
-                    ngrams_cur[g].i_batch[0] = 0;
-                    ngrams_cur[g].tokens [0] = id;
-                }
-
-                for (int j = 0; j < N - 1; j++) {
-                    for (int g = 0; g < g_cur; g++) {
-                        const int idx = id*(N - 1)*G + g*(N - 1);
-
-                        const llama_token t = ngrams_observed.tokens[idx + j];
-
-                        ngrams_cur[g].tokens [j + 1] = t;
-                        ngrams_cur[g].i_batch[j + 1] = batch.n_tokens;
-
-                        llama_batch_add(batch, t, n_past + j + 1, { W + 1 + g }, true);
-                    }
-                }
-            }
-
-            // fill the remaining W - 1 tokens for the first level
-            for (int i = 1; i < W; i++) {
-                seq_id_look.resize(W - i);
-                for (int j = 0; j < W - i; j++) {
-                    seq_id_look[j] = i + j + 1;
-                }
-
-                llama_batch_add(batch, tokens_j[0][i], n_past + i, seq_id_look, false);
-            }
-
-            // fill the rest of the levels
-            for (int j = 1; j < N - 1; j++) {
-                for (int i = 0; i < W; i++) {
-                    llama_batch_add(batch, tokens_j[j][i], n_past + j + i, { i + 1 }, j == N - 2);
-                }
-            }
-        }
-
-        if (llama_decode(ctx, batch) != 0) {
-            fprintf(stderr, "\n\n%s: error: llama_decode failed - increase KV cache size\n", __func__);
-            return 1;
-        }
-
-        int seq_id_best = 0;
-
-        for (int v = 0; v < N; ++v) {
-            int i_batch = 0;
-
-            // if no active ngrams are left, it means the sampled token does not pass the verification
-            if (v > 0) {
-                for (int g = 0; g < (int) ngrams_cur.size(); g++) {
-                    if (ngrams_cur[g].active) {
-                        i_batch = ngrams_cur[g].i_batch[v];
-                        seq_id_best = ngrams_cur[g].seq_id;
-
-                        ++n_accept;
-                        break;
-                    }
-                }
-
-                // no more matches -> create a new batch
-                if (i_batch == 0) {
-                    break;
-                }
-            }
-
-            // sample the next token
-            id = llama_sampling_sample(ctx_sampling, ctx, NULL, i_batch);
-
-            llama_sampling_accept(ctx_sampling, ctx, id, true);
-
-            // print
-            {
-                const std::string token_str = llama_token_to_piece(ctx, id);
-
-                if (v == 0) {
-                    printf("%s", token_str.c_str());
-                } else {
-                    // print light cyan
-                    printf("\033[0;96m%s\033[0m", token_str.c_str());
-                }
-                fflush(stdout);
-
-                if (id == llama_token_eos(model)) {
-                    has_eos = true;
-                }
-
-                all.push_back(id);
-            }
-
-            ++n_predict;
-            ++n_past;
-
-            if ((params.n_predict >= 0 && n_predict > params.n_predict) || has_eos) {
-                break;
-            }
-
-            // verify across active n-grams
-            for (int g = 0; g < (int) ngrams_cur.size(); g++) {
-                if (ngrams_cur[g].active) {
-                    if (v == N - 1) {
-                        ngrams_cur[g].active = false;
-                    } else {
-                        if (id != ngrams_cur[g].tokens[v + 1]) {
-                            ngrams_cur[g].active = false;
-                        }
-                    }
-                }
-            }
-
-            // print known n-grams starting with token id (debug)
-            if (0 && v == 0) {
-                if (ngrams_observed.cnt[id] > 0) {
-                    printf("\n - %d n-grams starting with '%s'\n", ngrams_observed.cnt[id], llama_token_to_piece(ctx, id).c_str());
-                }
-
-                for (int i = 0; i < ngrams_observed.cnt[id]; i++) {
-                    printf("   - ngram %2d: ", i);
-
-                    const int idx = id*(N - 1)*G + i*(N - 1);
-
-                    for (int j = 0; j < N - 1; j++) {
-                        const std::string token_str = llama_token_to_piece(ctx, ngrams_observed.tokens[idx + j]);
-
-                        printf("%s", token_str.c_str());
-                    }
-
-                    printf("\n");
-                }
-            }
-
-            // update lookahead tokens
-            {
-                for (int i = 0; i < W; i++) {
-                    tokens_j_prev[i] = tokens_j[0][i];
-                }
-
-                for (int j = 0; j < N - 2; j++) {
-                    tokens_j[j] = tokens_j[j + 1];
-                }
-
-                if (v == 0) {
-                    // sample from the last level
-                    for (int i = 0; i < W; i++) {
-                        tokens_j[N - 2][i] = llama_sampling_sample(ctx_sampling, ctx, NULL, ngrams_cur.size()*(N-1) + W*(N - 2) + i);
-                    }
-                } else {
-                    for (int i = 0; i < W; i++) {
-                        // there are different ways to init these tokens
-                        if (0) {
-                            // random init
-                            tokens_j[N - 2][i] = all[1 + rand() % (all.size() - 1)];
-                        } else {
-                            // init from the previous level
-                            tokens_j[N - 2][i] = tokens_j[0][i];
-                        }
-                    }
-                }
-            }
-
-            // update observed ngrams
-            if (v == 0) {
-                // the first token of the n-gram is determined by the index in the container so it is not stored
-                std::vector<llama_token> ngram(N - 1);
-
-                // n-gram generation
-                // ref: https://github.com/hao-ai-lab/LookaheadDecoding/issues/14#issuecomment-1826198518
-                for (int f = 0; f < W; ++f) {
-                    const int ft = tokens_j_prev[f]; // first token of the n-gram
-
-                    for (int j = 0; j < N - 1; ++j) {
-                        ngram[j] = tokens_j[j][f];
-                    }
-
-                    // filter-out repeating n-grams
-                    {
-                        bool is_unique = true;
-
-                        for (int k = 0; k < ngrams_observed.cnt[ft]; ++k) {
-                            const int idx = ft*(N - 1)*G + k*(N - 1);
-
-                            bool is_match = true;
-                            for (int j = 0; j < N - 1; ++j) {
-                                if (ngrams_observed.tokens[idx + j] != ngram[j]) {
-                                    is_match = false;
-                                    break;
-                                }
-                            }
-
-                            if (is_match) {
-                                is_unique = false;
-                                break;
-                            }
-                        }
-
-                        if (!is_unique) {
-                            continue;
-                        }
-                    }
-
-                    const int head = ngrams_observed.head[ft];
-                    const int idx  = ft*(N - 1)*G + head*(N - 1);
-
-                    for (int i = 0; i < N - 1; i++) {
-                        ngrams_observed.tokens[idx + i] = ngram[i];
-                    }
-
-                    ngrams_observed.cnt[ft]  = std::min(G, ngrams_observed.cnt[ft] + 1);
-                    ngrams_observed.head[ft] = (head + 1) % G;
-
-                    ngrams_observed.n_total++;
-                }
-            }
-        }
-
-        if ((params.n_predict >= 0 && n_predict > params.n_predict) || has_eos) {
-            break;
-        }
-
-        // KV cache management
-        // if no verification token matched, we simply remove all cells from this batch -> no fragmentation
-        llama_kv_cache_seq_rm(ctx, -1, n_past, -1);
-
-        if (seq_id_best != 0) {
-            // if a verification token matched, we keep the best sequence and remove the rest
-            // this leads to some KV cache fragmentation
-            llama_kv_cache_seq_keep(ctx, seq_id_best);
-            llama_kv_cache_seq_cp  (ctx, seq_id_best, 0, -1, -1);
-            llama_kv_cache_seq_rm  (ctx, seq_id_best,    -1, -1);
-
-            for (int s = 1; s < W + G + 1; ++s) {
-                llama_kv_cache_seq_cp(ctx, 0, s, -1, -1);
-            }
-        }
-    }
-
-    auto t_dec_end = ggml_time_us();
-
-    LOG_TEE("\n\n");
-
-    LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input,   (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
-    LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict  / ((t_dec_end - t_dec_start) / 1e6f));
-
-    LOG_TEE("\n");
-    LOG_TEE("W = %2d\n", W);
-    LOG_TEE("N = %2d\n", N);
-    LOG_TEE("G = %2d\n", G);
-    LOG_TEE("\n");
-    LOG_TEE("n_predict = %d\n", n_predict);
-    LOG_TEE("n_accept  = %d\n", n_accept);
-
-    llama_print_timings(ctx);
-
-    llama_kv_cache_view_free(&kvc_view);
-    llama_sampling_free(ctx_sampling);
-
-    llama_batch_free(batch);
-
-    llama_free(ctx);
-    llama_free_model(model);
-
-    llama_backend_free();
-
-    fprintf(stderr, "\n\n");
-
-    return 0;
-}
--- a/examples/lookup/CMakeLists.txt
+++ b/examples/lookup/CMakeLists.txt
@@ -1,5 +0,0 @@
-set(TARGET lookup)
-add_executable(${TARGET} lookup.cpp)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- a/examples/lookup/README.md
+++ b/examples/lookup/README.md
@@ -1,13 +0,0 @@
-# llama.cpp/examples/lookup
-
-Demonstration of Prompt Lookup Decoding
-
-https://github.com/apoorvumang/prompt-lookup-decoding
-
-The key parameters for lookup decoding are `ngram_min`, `ngram_max` and `n_draft`. The first two determine the size of the ngrams to search for in the prompt for a match. The latter specifies how many subsequent tokens to draft if a match is found.
-
-More info:
-
-https://github.com/ggerganov/llama.cpp/pull/4484
-https://github.com/ggerganov/llama.cpp/issues/4226
-
--- a/examples/lookup/lookup.cpp
+++ b/examples/lookup/lookup.cpp
@@ -1,230 +0,0 @@
-#include "common.h"
-#include "llama.h"
-
-#include <cmath>
-#include <cstdio>
-#include <string>
-#include <vector>
-
-int main(int argc, char ** argv){
-    gpt_params params;
-
-    if (!gpt_params_parse(argc, argv, params)) {
-        return 1;
-    }
-
-    // max/min n-grams size to search for in prompt
-    const int ngram_max = 4;
-    const int ngram_min = 1;
-
-    // length of the candidate / draft sequence, if match is found
-    const int n_draft = params.n_draft;
-
-    const bool dump_kv_cache = params.dump_kv_cache;
-
-#ifndef LOG_DISABLE_LOGS
-    log_set_target(log_filename_generator("lookup", "log"));
-    LOG_TEE("Log start\n");
-    log_dump_cmdline(argc, argv);
-#endif // LOG_DISABLE_LOGS
-
-    // init llama.cpp
-    llama_backend_init(params.numa);
-
-    llama_model * model = NULL;
-    llama_context * ctx = NULL;
-
-    // load the model
-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
-
-    // tokenize the prompt
-    const bool add_bos = llama_should_add_bos_token(model);
-    LOG("add_bos tgt: %d\n", add_bos);
-
-    std::vector<llama_token> inp;
-    inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
-
-    const int max_context_size     = llama_n_ctx(ctx);
-    const int max_tokens_list_size = max_context_size - 4;
-
-    if ((int) inp.size() > max_tokens_list_size) {
-        fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
-        return 1;
-    }
-
-    fprintf(stderr, "\n\n");
-
-    for (auto id : inp) {
-        fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
-    }
-
-    fflush(stderr);
-
-    const int n_input = inp.size();
-
-    const auto t_enc_start = ggml_time_us();
-
-    llama_decode(ctx, llama_batch_get_one( inp.data(), n_input - 1, 0,           0));
-    llama_decode(ctx, llama_batch_get_one(&inp.back(),           1, n_input - 1, 0));
-
-    const auto t_enc_end = ggml_time_us();
-
-    int n_predict = 0;
-    int n_drafted = 0;
-    int n_accept  = 0;
-
-    int n_past = inp.size();
-
-    bool has_eos = false;
-
-    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams);
-
-    std::vector<llama_token> draft;
-
-    llama_batch batch_tgt = llama_batch_init(params.n_ctx, 0, 1);
-
-    // debug
-    struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, 1);
-
-    const auto t_dec_start = ggml_time_us();
-
-    while (true) {
-        // debug
-        if (dump_kv_cache) {
-            llama_kv_cache_view_update(ctx, &kvc_view);
-            dump_kv_cache_view_seqs(kvc_view, 40);
-        }
-
-        // print current draft sequence
-        LOG("drafted %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, draft).c_str());
-
-        int i_dft = 0;
-        while (true) {
-            // sample from the target model
-            llama_token id = llama_sampling_sample(ctx_sampling, ctx, NULL, i_dft);
-
-            llama_sampling_accept(ctx_sampling, ctx, id, true);
-
-            const std::string token_str = llama_token_to_piece(ctx, id);
-
-            if (!params.use_color) {
-                printf("%s", token_str.c_str());
-            }
-
-            if (id == llama_token_eos(model)) {
-                has_eos = true;
-            }
-
-            ++n_predict;
-
-            // check if the target token matches the draft
-            if (i_dft < (int) draft.size() && id == draft[i_dft]) {
-                LOG("the sampled target token matches the %dth drafted token (%d, '%s') - accepted\n", i_dft, id, token_str.c_str());
-                ++n_accept;
-                ++n_past;
-                ++i_dft;
-                inp.push_back(id);
-
-                if (params.use_color) {
-                    // color accepted draft token
-                    printf("\033[34m%s\033[0m", token_str.c_str());
-                    fflush(stdout);
-                }
-                continue;
-            }
-
-            if (params.use_color) {
-                printf("%s", token_str.c_str());
-            }
-            fflush(stdout);
-
-
-            LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str());
-
-            draft.clear();
-            draft.push_back(id);
-            inp.push_back(id);
-            break;
-        }
-
-        if ((params.n_predict > 0 && n_predict > params.n_predict) || has_eos) {
-            break;
-        }
-
-        // KV cache management
-        // clean the cache of draft tokens that weren't accepted
-        llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
-
-        llama_batch_clear(batch_tgt);
-        llama_batch_add(batch_tgt, draft[0], n_past, { 0 }, true);
-
-        // generate n_pred tokens through prompt lookup
-        auto prompt_lookup = [&]() -> void {
-            int inp_size = inp.size();
-            for (int ngram_size = ngram_max ; ngram_size > ngram_min; --ngram_size){
-                const llama_token * ngram = &inp[inp_size - ngram_size];
-
-                for (int i = 0; i <= (int) inp_size - (ngram_size * 2); ++i) {
-                    bool match = true;
-                    for (int j = 0; j < ngram_size; ++j) {
-                        if (inp[i + j] != ngram[j]) {
-                            match = false;
-                            break;
-                        }
-                    }
-
-                    if (match) {
-                        const int startIdx = i + ngram_size;
-                        const int endIdx = startIdx + n_draft;
-                        if (endIdx < inp_size) {
-                            for (int j = startIdx; j < endIdx; ++j) {
-                                LOG(" - draft candidate %d: %d\n", j, inp[j]);
-                                draft.push_back(inp[j]);
-                                llama_batch_add(batch_tgt, inp[j], n_past + (j - startIdx) + 1, { 0 }, true);
-                                ++n_drafted;
-                            }
-                            return;
-                        }
-                    }
-                }
-            }
-            return;
-        };
-
-        prompt_lookup();
-
-        llama_decode(ctx, batch_tgt);
-        ++n_past;
-
-        draft.erase(draft.begin());
-    }
-
-    auto t_dec_end = ggml_time_us();
-
-    LOG_TEE("\n\n");
-
-    LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input,   (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
-    LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict  / ((t_dec_end - t_dec_start) / 1e6f));
-
-    LOG_TEE("\n");
-    LOG_TEE("n_draft   = %d\n", n_draft);
-    LOG_TEE("n_predict = %d\n", n_predict);
-    LOG_TEE("n_drafted = %d\n", n_drafted);
-    LOG_TEE("n_accept  = %d\n", n_accept);
-    LOG_TEE("accept    = %.3f%%\n", 100.0f * n_accept / n_drafted);
-
-    LOG_TEE("\ntarget:\n");
-    llama_print_timings(ctx);
-
-    llama_sampling_free(ctx_sampling);
-    llama_batch_free(batch_tgt);
-
-    llama_free(ctx);
-    llama_free_model(model);
-
-    llama_backend_free();
-
-    fprintf(stderr, "\n\n");
-
-    return 0;
-}
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
slaren	47d604fa2d	fix issues	2023-11-05 13:20:22 +01:00
slaren	73c0010e18	Merge remote-tracking branch 'origin/master' into fix-tensor-split-zero	2023-11-05 12:42:43 +01:00
Jared Van Bortel	05c51f96fe	cuda : fix disabling device with --tensor-split 1,0	2023-11-05 00:56:32 -04:00