wip : no ic 8 step

wip
2026-04-30 16:47:31 +03:00 · 2024-01-24 13:25:34 +02:00 · 2024-01-24 11:18:24 +02:00 · 2024-01-24 10:16:05 +02:00 · 2024-01-24 00:01:54 +02:00 · 2024-01-23 22:42:43 +02:00
41 changed files with 1903 additions and 2723 deletions
--- a/.devops/main-intel.Dockerfile
+++ b/.devops/main-intel.Dockerfile
@@ -1,26 +0,0 @@
-ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
-ARG UBUNTU_VERSION=22.04
-
-FROM intel/hpckit:$ONEAPI_VERSION as build
-
-RUN apt-get update && \
-    apt-get install -y git
-
-WORKDIR /app
-
-COPY . .
-
-# for some reasons, "-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DLLAMA_NATIVE=ON" give worse performance
-RUN mkdir build && \
-    cd build && \
-    cmake .. -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx && \
-    cmake --build . --config Release --target main server
-
-FROM ubuntu:$UBUNTU_VERSION as runtime
-
-COPY --from=build /app/build/bin/main /main
-COPY --from=build /app/build/bin/server /server
-
-ENV LC_ALL=C.utf8
-
-ENTRYPOINT [ "/main" ]
--- a/.devops/nix/nixpkgs-instances.nix
+++ b/.devops/nix/nixpkgs-instances.nix
@@ -7,18 +7,6 @@
    { system, ... }:
    {
      _module.args = {
-        # Note: bringing up https://zimbatm.com/notes/1000-instances-of-nixpkgs
-        # again, the below creates several nixpkgs instances which the
-        # flake-centric CLI will be forced to evaluate e.g. on `nix flake show`.
-        #
-        # This is currently "slow" and "expensive", on a certain scale.
-        # This also isn't "right" in that this hinders dependency injection at
-        # the level of flake inputs. This might get removed in the foreseeable
-        # future.
-        #
-        # Note that you can use these expressions without Nix
-        # (`pkgs.callPackage ./devops/nix/scope.nix { }` is the entry point).
-
        pkgsCuda = import inputs.nixpkgs {
          inherit system;
          # Ensure dependencies use CUDA consistently (e.g. that openmpi, ucc,
--- a/.devops/nix/package.nix
+++ b/.devops/nix/package.nix
@@ -73,7 +73,6 @@ let
    ps: [
      ps.numpy
      ps.sentencepiece
-      ps.tiktoken
      ps.torchWithoutCuda
      ps.transformers
    ]
@@ -115,22 +114,14 @@ effectiveStdenv.mkDerivation (
    pname = "llama-cpp${pnameSuffix}";
    version = llamaVersion;

-    # Note: none of the files discarded here are visible in the sandbox or
-    # affect the output hash. This also means they can be modified without
-    # triggering a rebuild.
    src = lib.cleanSourceWith {
      filter =
        name: type:
-        let
-          noneOf = builtins.all (x: !x);
-          baseName = baseNameOf name;
-        in
-        noneOf [
+        !(builtins.any (_: _) [
          (lib.hasSuffix ".nix" name) # Ignore *.nix files when computing outPaths
-          (lib.hasSuffix ".md" name) # Ignore *.md changes whe computing outPaths
-          (lib.hasPrefix "." baseName) # Skip hidden files and directories
-          (baseName == "flake.lock")
-        ];
+          (name == "README.md") # Ignore *.md changes whe computing outPaths
+          (lib.hasPrefix "." name) # Skip hidden files and directories
+        ]);
      src = lib.cleanSource ../../.;
    };

@@ -168,7 +159,7 @@ effectiveStdenv.mkDerivation (

    cmakeFlags =
      [
-        (cmakeBool "LLAMA_NATIVE" false)
+        (cmakeBool "LLAMA_NATIVE" true)
        (cmakeBool "LLAMA_BUILD_SERVER" true)
        (cmakeBool "BUILD_SHARED_LIBS" true)
        (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
--- a/.devops/nix/scope.nix
+++ b/.devops/nix/scope.nix
@@ -4,10 +4,6 @@
  llamaVersion ? "0.0.0",
 }:

-# We're using `makeScope` instead of just writing out an attrset
-# because it allows users to apply overlays later using `overrideScope'`.
-# Cf. https://noogle.dev/f/lib/makeScope
-
 lib.makeScope newScope (
  self: {
    inherit llamaVersion;
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -295,7 +295,7 @@ jobs:
      OPENBLAS_VERSION: 0.3.23
      OPENCL_VERSION: 2023.04.17
      CLBLAST_VERSION: 1.6.0
-      SDE_VERSION: 9.33.0-2024-01-07
+      SDE_VERSION: 9.21.1-2023-04-24

    strategy:
      matrix:
@@ -400,7 +400,7 @@ jobs:
        id: cmake_test_sde
        if: ${{ matrix.build == 'avx512' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation
        run: |
-          curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/813591/sde-external-${env:SDE_VERSION}-win.tar.xz"
+          curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/777395/sde-external-${env:SDE_VERSION}-win.tar.xz"
          # for some weird reason windows tar doesn't like sde tar.xz
          7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar.xz
          7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -35,7 +35,6 @@ jobs:
          - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
          - { tag: "light-rocm", dockerfile: ".devops/main-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
          - { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-          - { tag: "light-intel", dockerfile: ".devops/main-intel.Dockerfile", platforms: "linux/amd64" }
    steps:
      - name: Check out the repo
        uses: actions/checkout@v3
--- a/.github/workflows/nix-ci-aarch64.yml
+++ b/.github/workflows/nix-ci-aarch64.yml
@@ -2,20 +2,13 @@ name: Nix aarch64 builds

 on:
  workflow_dispatch: # allows manual triggering
-  schedule:
-    # Rebuild daily rather than on every push because QEMU is expensive (e.g.
-    # 1.5h instead of minutes with the cold cache).
-    #
-    # randint(0, 59), randint(0, 23)
-    - cron: '26 12 * * *'
-  # But also rebuild if we touched any of the Nix expressions:
  push:
    branches:
      - master
-    paths: ['**/*.nix', 'flake.lock']
+    paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix']
  pull_request:
    types: [opened, synchronize, reopened]
-    paths: ['**/*.nix', 'flake.lock']
+    paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix']

 jobs:
  nix-build-aarch64:
--- a/.github/workflows/nix-ci.yml
+++ b/.github/workflows/nix-ci.yml
@@ -5,8 +5,10 @@ on:
  push:
    branches:
      - master
+    paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix']
  pull_request:
    types: [opened, synchronize, reopened]
+    paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix']

 jobs:
  nix-eval:
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -47,7 +47,6 @@ option(BUILD_SHARED_LIBS                "build shared libraries"
 option(LLAMA_STATIC                     "llama: static link libraries"                          OFF)
 option(LLAMA_NATIVE                     "llama: enable -march=native flag"                      ON)
 option(LLAMA_LTO                        "llama: enable link time optimization"                  OFF)
-option(LLAMA_CCACHE                     "llama: use ccache if available"                        ON)

 # debug
 option(LLAMA_ALL_WARNINGS               "llama: enable all compiler warnings"                   ON)
@@ -108,13 +107,6 @@ option(LLAMA_BUILD_TESTS                     "llama: build tests"    ${LLAMA_STA
 option(LLAMA_BUILD_EXAMPLES                  "llama: build examples" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_SERVER                    "llama: build server example"                      ON)

-
-# add perf arguments
-option(LLAMA_PERF                            "llama: enable perf"                               OFF)
-if (LLAMA_PERF)
-    add_definitions(-DGGML_PERF)
-endif()
-
 # Required for relocatable CMake package
 include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)

@@ -478,11 +470,6 @@ function(get_flags CCID CCVER)
        if (CCVER VERSION_GREATER_EQUAL 8.1.0)
            set(CXX_FLAGS ${CXX_FLAGS} -Wextra-semi)
        endif()
-    elseif (CCID MATCHES "Intel")
-        # enable max optimization level when using Intel compiler
-        set(C_FLAGS   -ipo -O3 -static -fp-model=fast -flto -fno-stack-protector)
-        set(CXX_FLAGS -ipo -O3 -static -fp-model=fast -flto -fno-stack-protector)
-        add_link_options(-fuse-ld=lld -static-intel)
    endif()

    set(GF_C_FLAGS   ${C_FLAGS}   PARENT_SCOPE)
@@ -574,17 +561,6 @@ if (LLAMA_LTO)
    endif()
 endif()

-if (LLAMA_CCACHE)
-    find_program(LLAMA_CCACHE_FOUND ccache)
-    if (LLAMA_CCACHE_FOUND)
-        set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ccache)
-        set(ENV{CCACHE_SLOPPINESS} time_macros)
-        message(STATUS "Using ccache")
-    else()
-        message(STATUS "Warning: ccache not found - consider installing it or use LLAMA_CCACHE=OFF")
-    endif ()
-endif()
-
 # this version of Apple ld64 is buggy
 execute_process(
    COMMAND ${CMAKE_C_COMPILER} ${CMAKE_EXE_LINKER_FLAGS} -Wl,-v
--- a/README.md
+++ b/README.md
@@ -128,7 +128,6 @@ as the main playground for developing new features for the [ggml](https://github
 - React Native: [mybigday/llama.rn](https://github.com/mybigday/llama.rn)
 - Java: [kherud/java-llama.cpp](https://github.com/kherud/java-llama.cpp)
 - Zig: [deins/llama.cpp.zig](https://github.com/Deins/llama.cpp.zig)
- Flutter/Dart: [netdur/llama_cpp_dart](https://github.com/netdur/llama_cpp_dart)

 **UI:**

--- a/common/common.cpp
+++ b/common/common.cpp
@@ -203,23 +203,6 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
            params.prompt_cache_all = true;
        } else if (arg == "--prompt-cache-ro") {
            params.prompt_cache_ro = true;
-        } else if (arg == "-bf" || arg == "--binary-file") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            std::ifstream file(argv[i], std::ios::binary);
-            if (!file) {
-                fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
-                invalid_param = true;
-                break;
-            }
-            // store the external file name in params
-            params.prompt_file = argv[i];
-            std::ostringstream ss;
-            ss << file.rdbuf();
-            params.prompt = ss.str();
-            fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), argv[i]);
        } else if (arg == "-f" || arg == "--file") {
            if (++i >= argc) {
                invalid_param = true;
@@ -670,12 +653,6 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
            if (params.logdir.back() != DIRECTORY_SEPARATOR) {
                params.logdir += DIRECTORY_SEPARATOR;
            }
-        } else if (arg == "--save-all-logits" || arg == "--kl-divergence-base") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.logits_file = argv[i];
        } else if (arg == "--perplexity" || arg == "--all-logits") {
            params.logits_all = true;
        } else if (arg == "--ppl-stride") {
@@ -712,16 +689,6 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                break;
            }
            params.winogrande_tasks = std::stoi(argv[i]);
-        } else if (arg == "--multiple-choice") {
-            params.multiple_choice = true;
-        } else if (arg == "--multiple-choice-tasks") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.multiple_choice_tasks = std::stoi(argv[i]);
-        } else if (arg == "--kl-divergence") {
-            params.kl_divergence = true;
        } else if (arg == "--ignore-eos") {
            params.ignore_eos = true;
        } else if (arg == "--no-penalize-nl") {
@@ -921,8 +888,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("  --in-suffix STRING    string to suffix after user inputs with (default: empty)\n");
    printf("  -f FNAME, --file FNAME\n");
    printf("                        prompt file to start generation.\n");
-    printf("  -bf FNAME, --binary-file FNAME\n");
-    printf("                        binary file containing multiple choice tasks.\n");
    printf("  -n N, --n-predict N   number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
    printf("  -c N, --ctx-size N    size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
    printf("  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
@@ -971,9 +936,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("  --hellaswag-tasks N   number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
    printf("  --winogrande          compute Winogrande score over random tasks from datafile supplied with -f\n");
    printf("  --winogrande-tasks N  number of tasks to use when computing the Winogrande score (default: %zu)\n", params.winogrande_tasks);
-    printf("  --multiple-choice     compute multiple choice score over random tasks from datafile supplied with -f\n");
-    printf("  --multiple-choice-tasks N number of tasks to use when computing the multiple choice score (default: %zu)\n", params.winogrande_tasks);
-    printf("  --kl-divergence       computes KL-divergence to logits provided via --kl-divergence-base");
    printf("  --keep N              number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
    printf("  --draft N             number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft);
    printf("  --chunks N            max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
--- a/common/common.h
+++ b/common/common.h
@@ -91,7 +91,6 @@ struct gpt_params {
    std::string input_suffix      = "";  // string to suffix user inputs with
    std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
    std::string logdir            = "";  // directory in which to save YAML log files
-    std::string logits_file       = "";  // file for saving *all* logits

    std::vector<llama_model_kv_override> kv_overrides;

@@ -109,11 +108,6 @@ struct gpt_params {
    bool   winogrande      = false; // compute Winogrande score over random tasks from datafile supplied in prompt
    size_t winogrande_tasks= 0;     // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed

-    bool   multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
-    size_t multiple_choice_tasks = 0;     // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed
-
-    bool   kl_divergence   = false; // compute KL-divergence
-
    bool mul_mat_q         = true;  // if true, use mul_mat_q kernels instead of cuBLAS
    bool random_prompt     = false; // do not randomize prompt if none provided
    bool use_color         = false; // use color to distinguish generations and inputs
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -10,7 +10,7 @@ import re
 import sys
 from enum import IntEnum
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, ContextManager, Iterator, cast
+from typing import TYPE_CHECKING, Any, ContextManager, Iterator, cast, Optional

 import numpy as np
 import torch
@@ -289,58 +289,6 @@ class Model:
        special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
        special_vocab.add_to_gguf(self.gguf_writer)

-    def _set_vocab_qwen(self):
-        dir_model = self.dir_model
-        hparams = self.hparams
-        tokens: list[bytearray] = []
-        toktypes: list[int] = []
-
-        from transformers import AutoTokenizer
-        tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
-        vocab_size = hparams["vocab_size"]
-        assert max(tokenizer.get_vocab().values()) < vocab_size
-
-        merges = []
-        vocab = {}
-        mergeable_ranks = tokenizer.mergeable_ranks
-        for token, rank in mergeable_ranks.items():
-            vocab[QwenModel.token_bytes_to_string(token)] = rank
-            if len(token) == 1:
-                continue
-            merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
-            assert len(merged) == 2
-            merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))
-
-        # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined
-        added_vocab = tokenizer.special_tokens
-        reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in (vocab | added_vocab).items()}
-
-        for i in range(vocab_size):
-            if i not in reverse_vocab:
-                pad_token = f"[PAD{i}]".encode("utf-8")
-                tokens.append(bytearray(pad_token))
-                toktypes.append(gguf.TokenType.USER_DEFINED)
-            elif reverse_vocab[i] in added_vocab:
-                tokens.append(reverse_vocab[i])
-                toktypes.append(gguf.TokenType.CONTROL)
-            else:
-                tokens.append(reverse_vocab[i])
-                toktypes.append(gguf.TokenType.NORMAL)
-
-        self.gguf_writer.add_tokenizer_model("gpt2")
-        self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_types(toktypes)
-
-        special_vocab = gguf.SpecialVocab(dir_model, load_merges=False)
-        special_vocab.merges = merges
-        # only add special tokens when they were not already loaded from config.json
-        if len(special_vocab.special_token_ids) == 0:
-            special_vocab._set_special_token("bos", tokenizer.special_tokens["<|endoftext|>"])
-            special_vocab._set_special_token("eos", tokenizer.special_tokens["<|endoftext|>"])
-        # this one is usually not in config.json anyway
-        special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"])
-        special_vocab.add_to_gguf(self.gguf_writer)
-
    def _set_vocab_sentencepiece(self):
        from sentencepiece import SentencePieceProcessor

@@ -539,8 +487,7 @@ class MPTModel(Model):
            # map tensor names
            if "scales" in name:
                new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias", ".scales"))
-                if new_name is not None:
-                    new_name = new_name.replace("scales", "act.scales")
+                new_name = new_name.replace("scales", "act.scales")
            else:
                new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
            if new_name is None:
@@ -929,13 +876,6 @@ class PersimmonModel(Model):


 class StableLMModel(Model):
-    def set_vocab(self):
-        if (self.dir_model / "tokenizer.json").is_file():
-            self._set_vocab_gpt2()
-        else:
-            # StableLM 2 1.6B uses a vocab in a similar format to Qwen's vocab
-            self._set_vocab_qwen()
-
    def set_gguf_parameters(self):
        hparams = self.hparams
        block_count = hparams["num_hidden_layers"]
@@ -964,7 +904,7 @@ class QwenModel(Model):
        return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')])

    @staticmethod
-    def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]:
+    def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: Optional[int] = None) -> list[bytes]:
        parts = [bytes([b]) for b in token]
        while True:
            min_idx = None
@@ -981,7 +921,52 @@ class QwenModel(Model):
        return parts

    def set_vocab(self):
-        self._set_vocab_qwen()
+        dir_model = self.dir_model
+        hparams = self.hparams
+        tokens: list[bytearray] = []
+        toktypes: list[int] = []
+
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
+        vocab_size = hparams["vocab_size"]
+        assert max(tokenizer.get_vocab().values()) < vocab_size
+
+        merges = []
+        vocab = {}
+        mergeable_ranks = tokenizer.mergeable_ranks
+        for token, rank in mergeable_ranks.items():
+            vocab[self.token_bytes_to_string(token)] = rank
+            if len(token) == 1:
+                continue
+            merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
+            assert len(merged) == 2
+            merges.append(' '.join(map(self.token_bytes_to_string, merged)))
+
+        reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in vocab.items()}
+        added_vocab = tokenizer.special_tokens
+
+        for i in range(vocab_size):
+            if i not in reverse_vocab:
+                pad_token = f"[PAD{i}]".encode("utf-8")
+                tokens.append(bytearray(pad_token))
+                toktypes.append(gguf.TokenType.USER_DEFINED)
+            elif reverse_vocab[i] in added_vocab:
+                tokens.append(reverse_vocab[i])
+                toktypes.append(gguf.TokenType.CONTROL)
+            else:
+                tokens.append(reverse_vocab[i])
+                toktypes.append(gguf.TokenType.NORMAL)
+
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(dir_model, load_merges=False)
+        special_vocab.merges = merges
+        special_vocab._set_special_token("bos", tokenizer.special_tokens["<|endoftext|>"])
+        special_vocab._set_special_token("eos", tokenizer.special_tokens["<|endoftext|>"])
+        special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"])
+        special_vocab.add_to_gguf(self.gguf_writer)

    def set_gguf_parameters(self):
        self.gguf_writer.add_name("Qwen")
@@ -1300,7 +1285,7 @@ def main() -> None:

    if args.awq_path:
        sys.path.insert(1, str(Path(__file__).parent / 'awq-py'))
-        from awq.apply_awq import add_scale_weights  # type: ignore[import-not-found]
+        from awq.apply_awq import add_scale_weights
        tmp_model_path = args.model / "weighted_model"
        dir_model = tmp_model_path
        if tmp_model_path.is_dir():
--- a/convert-llama-ggml-to-gguf.py
+++ b/convert-llama-ggml-to-gguf.py
@@ -2,7 +2,6 @@
 from __future__ import annotations

 import argparse
-import os
 import struct
 import sys
 from enum import IntEnum
@@ -10,6 +9,7 @@ from pathlib import Path

 import numpy as np

+import os
 if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
 import gguf
@@ -371,11 +371,15 @@ def handle_metadata(cfg, hp):
        params = convert.Params.loadOriginalParamsJson(fakemodel, orig_config_path)
    else:
        raise ValueError('Unable to load metadata')
-    vocab_path = Path(cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir)
-    vocab_factory = convert.VocabFactory(vocab_path)
-    vocab, special_vocab = vocab_factory.load_vocab(cfg.vocabtype, cfg.model_metadata_dir)
+    vocab = convert.load_vocab(
+        cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir,
+        cfg.vocabtype)
+    # FIXME: Respect cfg.vocab_dir?
+    svocab = gguf.SpecialVocab(cfg.model_metadata_dir,
+                               load_merges = cfg.vocabtype == 'bpe',
+                               n_vocab = vocab.vocab_size)
    convert.check_vocab_size(params, vocab)
-    return params, vocab, special_vocab
+    return (params, vocab, svocab)


 def handle_args():
--- a/convert-lora-to-ggml.py
+++ b/convert-lora-to-ggml.py
@@ -5,16 +5,17 @@ import json
 import os
 import struct
 import sys
-from pathlib import Path
 from typing import Any, BinaryIO, Sequence

 import numpy as np
 import torch

+from pathlib import Path
 if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
 import gguf

+
 NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1}


@@ -59,14 +60,7 @@ if __name__ == '__main__':
    input_model = os.path.join(sys.argv[1], "adapter_model.bin")
    output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin")

-    if os.path.exists(input_model):
-        model = torch.load(input_model, map_location="cpu")
-    else:
-        input_model = os.path.join(sys.argv[1], "adapter_model.safetensors")
-        # lazy import load_file only if lora is in safetensors format.
-        from safetensors.torch import load_file
-        model = load_file(input_model, device="cpu")
-
+    model = torch.load(input_model, map_location="cpu")
    arch_name = sys.argv[2] if len(sys.argv) == 3 else "llama"

    if arch_name not in gguf.MODEL_ARCH_NAMES.values():
--- a/convert-persimmon-to-gguf.py
+++ b/convert-persimmon-to-gguf.py
@@ -1,13 +1,11 @@
 #!/usr/bin/env python3
-import argparse
-import os
-import sys
-from pathlib import Path
-from pprint import pprint
-
 import torch
+import os
+from pprint import pprint
+import sys
+import argparse
+from pathlib import Path
 from sentencepiece import SentencePieceProcessor
-
 if 'NO_LOCAL_GGUF' not in os.environ:
    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
 import gguf
@@ -71,7 +69,7 @@ def main():
    persimmon_model = torch.load(args.ckpt_path)
    hparams = persimmon_model['args']
    pprint(hparams)
-    tensors: dict[str, torch.Tensor] = {}
+    tensors = {}
    _flatten_dict(persimmon_model['model'], tensors, None)

    arch = gguf.MODEL_ARCH.PERSIMMON
--- a/convert.py
+++ b/convert.py
@@ -17,28 +17,58 @@ import signal
 import struct
 import sys
 import time
+import warnings
 import zipfile
 from abc import ABCMeta, abstractmethod
+from argparse import ArgumentParser
 from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
 from dataclasses import dataclass
 from pathlib import Path
-from typing import IO, TYPE_CHECKING, Any, Callable, Iterable, Literal, TypeVar
+from typing import (
+    IO,
+    TYPE_CHECKING,
+    Any,
+    Callable,
+    Iterable,
+    Literal,
+    Optional,
+    Tuple,
+    TypeVar,
+)

 import numpy as np
 from sentencepiece import SentencePieceProcessor

-if 'NO_LOCAL_GGUF' not in os.environ:
-    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
-import gguf
+try:
+    from transformers import AutoTokenizer
+except ModuleNotFoundError as e:
+    warnings.warn(f"Could not import AutoTokenizer from transformers: {e}")

-if TYPE_CHECKING:
-    from typing import TypeAlias
+# If NO_LOCAL_GGUF is not set, try to import gguf from the local gguf-py directory
+if "NO_LOCAL_GGUF" not in os.environ:
+    # Use absolute path to the gguf-py directory
+    gguf_py_dir = str(Path(__file__).resolve().parent / "gguf-py")
+    print(gguf_py_dir)  # NOTE: Remove this once path is verified after changes are completed
+    if gguf_py_dir not in sys.path:
+        sys.path.insert(1, gguf_py_dir)

-if hasattr(faulthandler, 'register') and hasattr(signal, 'SIGUSR1'):
+# Import gguf module
+try:
+    import gguf
+except ModuleNotFoundError as e:
+    print(f"Could not import gguf: {e}")
+    sys.exit(1)
+
+if TYPE_CHECKING:  # NOTE: This isn't necessary.
+    from typing import TypeAlias  # This can technically be omitted.
+
+if hasattr(faulthandler, "register") and hasattr(signal, "SIGUSR1"):
    faulthandler.register(signal.SIGUSR1)

-NDArray: TypeAlias = 'np.ndarray[Any, Any]'
+# NOTE: n-dimensional arrays should be directly referenced
+NDArray: TypeAlias = "np.ndarray[Any, Any]"

+# Why is this here? LLAMA and GPT are technically the only compatible ARCHs.
 ARCH = gguf.MODEL_ARCH.LLAMA

 DEFAULT_CONCURRENCY = 8
@@ -48,6 +78,7 @@ DEFAULT_CONCURRENCY = 8
 #


+# TODO: Clean up and refactor data types
@dataclass(frozen=True)
 class DataType:
    name: str
@@ -152,65 +183,85 @@ GGML_FILE_TYPE_TO_DATA_TYPE: dict[GGMLFileType, DataType] = {

@dataclass
 class Params:
-    n_vocab:        int
-    n_embd:         int
-    n_layer:        int
-    n_ctx:          int
-    n_ff:           int
-    n_head:         int
-    n_head_kv:      int
-    n_experts:      int | None = None
-    n_experts_used: int | None = None
-    f_norm_eps:     float | None = None
+    n_vocab: int
+    n_embd: int
+    n_layer: int
+    n_ctx: int
+    n_ff: int
+    n_head: int
+    n_head_kv: int
+    f_norm_eps: Optional[float] = None
+    n_experts: Optional[int] = None
+    n_experts_used: Optional[int] = None

-    rope_scaling_type: gguf.RopeScalingType | None = None
-    f_rope_freq_base: float | None = None
-    f_rope_scale: float | None = None
-    n_orig_ctx: int | None = None
-    rope_finetuned: bool | None = None
+    rope_scaling_type: Optional[gguf.RopeScalingType] = None
+    f_rope_freq_base: Optional[float] = None
+    f_rope_scale: Optional[float] = None
+    n_orig_ctx: Optional[int] = None
+    rope_finetuned: Optional[bool] = None

-    ftype: GGMLFileType | None = None
+    ftype: Optional[GGMLFileType] = None

    # path to the directory containing the model files
-    path_model: Path | None = None
+    path_model: Optional[Path] = None

    @staticmethod
-    def guessed(model: LazyModel) -> Params:
+    def guessed(model: LazyModel) -> "Params":
        # try transformer naming first
-        n_vocab, n_embd = model["model.embed_tokens.weight"].shape if "model.embed_tokens.weight" in model else model["tok_embeddings.weight"].shape
+        n_vocab, n_embd = (
+            model["model.embed_tokens.weight"].shape
+            if "model.embed_tokens.weight" in model
+            else model["tok_embeddings.weight"].shape
+        )

        # try transformer naming first
        if "model.layers.0.self_attn.q_proj.weight" in model:
-            n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model)
-        elif "model.layers.0.self_attn.W_pack.weight" in model:   # next: try baichuan naming
-            n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.W_pack.weight" not in model)
+            n_layer = next(
+                i
+                for i in itertools.count()
+                if f"model.layers.{i}.self_attn.q_proj.weight" not in model
+            )
+        elif (
+            "model.layers.0.self_attn.W_pack.weight" in model
+        ):  # next: try baichuan naming
+            n_layer = next(
+                i
+                for i in itertools.count()
+                if f"model.layers.{i}.self_attn.W_pack.weight" not in model
+            )
        else:
-            n_layer = next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)
+            n_layer = next(
+                i
+                for i in itertools.count()
+                if f"layers.{i}.attention.wq.weight" not in model
+            )

        if n_layer < 1:
-            raise Exception("failed to guess 'n_layer'. This model is unknown or unsupported.\n"
-                            "Suggestion: provide 'config.json' of the model in the same directory containing model files.")
+            raise Exception(
+                "failed to guess 'n_layer'. This model is unknown or unsupported.\n"
+                "Suggestion: provide 'config.json' of the model in the same directory containing model files."
+            )

-        n_head = n_embd // 128 # guessed
-        n_mult = 256           # guessed
+        n_head = n_embd // 128  # guessed
+        n_mult = 256  # guessed

        # TODO: verify this
        n_ff = int(2 * (4 * n_embd) / 3)
        n_ff = n_mult * ((n_ff + n_mult - 1) // n_mult)

        return Params(
-            n_vocab    = n_vocab,
-            n_embd     = n_embd,
-            n_layer    = n_layer,
-            n_ctx      = -1,
-            n_ff       = n_ff,
-            n_head     = n_head,
-            n_head_kv  = n_head,
-            f_norm_eps = 1e-5,
+            n_vocab=n_vocab,
+            n_embd=n_embd,
+            n_layer=n_layer,
+            n_ctx=-1,
+            n_ff=n_ff,
+            n_head=n_head,
+            n_head_kv=n_head,
+            f_norm_eps=1e-5,
        )

    @staticmethod
-    def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params:
+    def load_transformers_config(model: LazyModel, config_path: Path) -> "Params":
        config = json.load(open(config_path))

        rope_scaling_type = f_rope_scale = n_orig_ctx = rope_finetuned = None
@@ -223,20 +274,22 @@ class Params:
                rope_scaling_type = gguf.RopeScalingType.LINEAR
            elif typ == "yarn":
                rope_scaling_type = gguf.RopeScalingType.YARN
-                n_orig_ctx = rope_scaling['original_max_position_embeddings']
-                rope_finetuned = rope_scaling['finetuned']
+                n_orig_ctx = rope_scaling["original_max_position_embeddings"]
+                rope_finetuned = rope_scaling["finetuned"]
            else:
-                raise NotImplementedError(f'Unknown rope scaling type: {typ}')
+                raise NotImplementedError(f"Unknown rope scaling type: {typ}")

        if "max_sequence_length" in config:
            n_ctx = config["max_sequence_length"]
        elif "max_position_embeddings" in config:
            n_ctx = config["max_position_embeddings"]
        else:
-            raise Exception("failed to guess 'n_ctx'. This model is unknown or unsupported.\n"
-                            "Suggestion: provide 'config.json' of the model in the same directory containing model files.")
+            raise Exception(
+                "failed to guess 'n_ctx'. This model is unknown or unsupported.\n"
+                "Suggestion: provide 'config.json' of the model in the same directory containing model files."
+            )

-        n_experts      = None
+        n_experts = None
        n_experts_used = None

        if "num_local_experts" in config:
@@ -244,30 +297,30 @@ class Params:
            n_experts_used = config["num_experts_per_tok"]

        return Params(
-            n_vocab           = config["vocab_size"],
-            n_embd            = config["hidden_size"],
-            n_layer           = config["num_hidden_layers"],
-            n_ctx             = n_ctx,
-            n_ff              = config["intermediate_size"],
-            n_head            = (n_head := config["num_attention_heads"]),
-            n_head_kv         = config.get("num_key_value_heads", n_head),
-            n_experts         = n_experts,
-            n_experts_used    = n_experts_used,
-            f_norm_eps        = config["rms_norm_eps"],
-            f_rope_freq_base  = config.get("rope_theta"),
-            rope_scaling_type = rope_scaling_type,
-            f_rope_scale      = f_rope_scale,
-            n_orig_ctx        = n_orig_ctx,
-            rope_finetuned    = rope_finetuned,
+            n_vocab=config["vocab_size"],
+            n_embd=config["hidden_size"],
+            n_layer=config["num_hidden_layers"],
+            n_ctx=n_ctx,
+            n_ff=config["intermediate_size"],
+            n_head=(n_head := config["num_attention_heads"]),
+            n_head_kv=config.get("num_key_value_heads", n_head),
+            n_experts=n_experts,
+            n_experts_used=n_experts_used,
+            f_norm_eps=config["rms_norm_eps"],
+            f_rope_freq_base=config.get("rope_theta"),
+            rope_scaling_type=rope_scaling_type,
+            f_rope_scale=f_rope_scale,
+            n_orig_ctx=n_orig_ctx,
+            rope_finetuned=rope_finetuned,
        )

    # LLaMA v2 70B params.json
    # {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1}
    @staticmethod
-    def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params:
+    def load_torch_params(model: LazyModel, config_path: Path) -> "Params":
        config = json.load(open(config_path))

-        n_experts      = None
+        n_experts = None
        n_experts_used = None
        f_rope_freq_base = None

@@ -290,50 +343,50 @@ class Params:

        if config.get("moe"):
            n_ff = model["layers.0.feed_forward.experts.0.w1.weight"].shape[0]
-            n_experts      = config["moe"]["num_experts"]
+            n_experts = config["moe"]["num_experts"]
            n_experts_used = config["moe"]["num_experts_per_tok"]
            f_rope_freq_base = 1e6

        return Params(
-            n_vocab          = model["tok_embeddings.weight"].shape[0],
-            n_embd           = config["dim"],
-            n_layer          = config["n_layers"],
-            n_ctx            = n_ctx,
-            n_ff             = n_ff,
-            n_head           = (n_head := config["n_heads"]),
-            n_head_kv        = config.get("n_kv_heads", n_head),
-            n_experts        = n_experts,
-            n_experts_used   = n_experts_used,
-            f_norm_eps       = config["norm_eps"],
-            f_rope_freq_base = config.get("rope_theta", f_rope_freq_base),
+            n_vocab=model["tok_embeddings.weight"].shape[0],
+            n_embd=config["dim"],
+            n_layer=config["n_layers"],
+            n_ctx=n_ctx,
+            n_ff=n_ff,
+            n_head=(n_head := config["n_heads"]),
+            n_head_kv=config.get("n_kv_heads", n_head),
+            n_experts=n_experts,
+            n_experts_used=n_experts_used,
+            f_norm_eps=config["norm_eps"],
+            f_rope_freq_base=config.get("rope_theta", f_rope_freq_base),
        )

    @staticmethod
-    def load(model_plus: ModelPlus) -> Params:
-        hf_config_path   = model_plus.paths[0].parent / "config.json"
+    def load(model_plus: ModelPlus) -> "Params":
+        hf_config_path = model_plus.paths[0].parent / "config.json"
        orig_config_path = model_plus.paths[0].parent / "params.json"

        if hf_config_path.exists():
-            params = Params.loadHFTransformerJson(model_plus.model, hf_config_path)
+            params = Params.load_transformers_config(model_plus.model, hf_config_path)
        elif orig_config_path.exists():
-            params = Params.loadOriginalParamsJson(model_plus.model, orig_config_path)
-        elif model_plus.format != 'none':
+            params = Params.load_torch_params(model_plus.model, orig_config_path)
+        elif model_plus.format != "none":
            params = Params.guessed(model_plus.model)
        else:
-            raise ValueError('Cannot guess params when model format is none')
+            raise ValueError("Cannot guess params when model format is none")

        params.path_model = model_plus.paths[0].parent

        return params


-#
-# vocab
-#
-
-class BpeVocab:
-    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
-        self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read())
+class BpeVocab:  # GPT
+    def __init__(
+        self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]
+    ) -> None:
+        self.bpe_tokenizer = json.loads(
+            open(str(fname_tokenizer), encoding="utf-8").read()
+        )
        self.vocab = self.bpe_tokenizer["model"]["vocab"]
        added_tokens: dict[str, int]
        if fname_added_tokens is not None:
@@ -341,31 +394,34 @@ class BpeVocab:
            added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
        else:
            # Fall back to trying to find the added tokens in tokenizer.json
-            tokenizer_json_file = fname_tokenizer.parent / 'tokenizer.json'
+            tokenizer_json_file = fname_tokenizer.parent / "tokenizer.json"
            if not tokenizer_json_file.is_file():
                added_tokens = {}
            else:
                tokenizer_json = json.load(open(tokenizer_json_file, encoding="utf-8"))
                added_tokens = dict(
-                    (item['content'], item['id'])
-                    for item in tokenizer_json.get('added_tokens', [])
+                    (item["content"], item["id"])
+                    for item in tokenizer_json.get("added_tokens", [])
                    # Added tokens here can be duplicates of the main vocabulary.
-                    if item['content'] not in self.bpe_tokenizer)
+                    if item["content"] not in self.bpe_tokenizer
+                )

        vocab_size: int = len(self.vocab)
-        expected_ids    = list(range(vocab_size, vocab_size + len(added_tokens)))
-        actual_ids      = sorted(added_tokens.values())
+        expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
+        actual_ids = sorted(added_tokens.values())
        if expected_ids != actual_ids:
            expected_end_id = vocab_size + len(actual_ids) - 1
-            raise Exception(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range {vocab_size} - {expected_end_id}; got {actual_ids}")
+            raise Exception(
+                f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range {vocab_size} - {expected_end_id}; got {actual_ids}"
+            )

        items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
-        self.added_tokens_dict    = added_tokens
-        self.added_tokens_list    = [text for (text, idx) in items]
+        self.added_tokens_dict = added_tokens
+        self.added_tokens_list = [text for (text, idx) in items]
        self.vocab_size_base: int = vocab_size
-        self.vocab_size: int      = self.vocab_size_base + len(self.added_tokens_list)
-        self.fname_tokenizer      = fname_tokenizer
-        self.fname_added_tokens   = fname_added_tokens
+        self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
+        self.fname_tokenizer = fname_tokenizer
+        self.fname_added_tokens = fname_added_tokens

    def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        reverse_vocab = {id: encoded_tok for encoded_tok, id in self.vocab.items()}
@@ -386,8 +442,10 @@ class BpeVocab:
        return f"<BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"


-class SentencePieceVocab:
-    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
+class SentencePieceVocab:  # LlaMa
+    def __init__(
+        self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]
+    ) -> None:
        self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
        added_tokens: dict[str, int]
        if fname_added_tokens is not None:
@@ -397,19 +455,23 @@ class SentencePieceVocab:

        vocab_size: int = self.sentencepiece_tokenizer.vocab_size()

-        new_tokens       = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
+        new_tokens = {
+            id: piece for piece, id in added_tokens.items() if id >= vocab_size
+        }
        expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
-        actual_new_ids   = sorted(new_tokens.keys())
+        actual_new_ids = sorted(new_tokens.keys())

        if expected_new_ids != actual_new_ids:
-            raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")
+            raise ValueError(
+                f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}"
+            )

        # Token pieces that were added to the base vocabulary.
        self.added_tokens_dict = added_tokens
-        self.added_tokens_list  = [new_tokens[id] for id in actual_new_ids]
-        self.vocab_size_base    = vocab_size
-        self.vocab_size         = self.vocab_size_base + len(self.added_tokens_list)
-        self.fname_tokenizer    = fname_tokenizer
+        self.added_tokens_list = [new_tokens[id] for id in actual_new_ids]
+        self.vocab_size_base = vocab_size
+        self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
+        self.fname_tokenizer = fname_tokenizer
        self.fname_added_tokens = fname_added_tokens

    def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
@@ -450,15 +512,11 @@ class SentencePieceVocab:


 class HfVocab:
-    def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None = None) -> None:
-        try:
-            from transformers import AutoTokenizer
-        except ImportError as e:
-            raise ImportError(
-                "To use HfVocab, please install the `transformers` package. "
-                "You can install it with `pip install transformers`."
-            ) from e
-
+    def __init__(
+        self,
+        fname_tokenizer: Path,
+        fname_added_tokens: Optional[Path] = None,
+    ) -> None:
        print("fname_tokenizer:", fname_tokenizer)
        # Allow the tokenizer to default to slow or fast versions.
        # Explicitly set tokenizer to use local paths.
@@ -471,7 +529,7 @@ class HfVocab:
        # Initialize lists and dictionaries for added tokens
        self.added_tokens_list = []
        self.added_tokens_dict = dict()
-        self.added_tokens_ids  = set()
+        self.added_tokens_ids = set()

        # Process added tokens
        for tok, tokidx in sorted(
@@ -492,12 +550,12 @@ class HfVocab:

        # Set vocabulary sizes
        self.vocab_size_base = self.tokenizer.vocab_size
-        self.vocab_size      = self.vocab_size_base + len(self.added_tokens_list)
+        self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)

-        self.fname_tokenizer    = fname_tokenizer
+        self.fname_tokenizer = fname_tokenizer
        self.fname_added_tokens = fname_added_tokens

-    def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+    def hf_tokens(self) -> Iterable[Tuple[bytes, float, gguf.TokenType]]:
        reverse_vocab = {
            id: encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items()
        }
@@ -515,9 +573,11 @@ class HfVocab:
                token_id, self.special_ids  # Reuse already stored special IDs
            )

-    def get_token_type(self, token_id: int, special_ids: set[int]) -> gguf.TokenType:
+    def get_token_type(self, token_id: int, special_ids: set) -> gguf.TokenType:
        # Determine token type based on whether it's a special token
-        return gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL
+        return (
+            gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL
+        )

    def get_token_score(self, token_id: int) -> float:
        # Placeholder for actual logic to determine the token's score
@@ -529,6 +589,7 @@ class HfVocab:
            if text in self.specials:
                toktype = self.get_token_type(self.specials[text], self.special_ids)
                score = self.get_token_score(self.specials[text])
+
            else:
                toktype = gguf.TokenType.USER_DEFINED
                score = -1000.0
@@ -722,7 +783,7 @@ def merge_multifile_models(models_plus: list[ModelPlus]) -> ModelPlus:
    else:
        model = merge_sharded([mp.model for mp in models_plus])

-    return ModelPlus(model, paths, format, vocab)  # pytype: disable=wrong-arg-types
+    return ModelPlus(model, paths, format, vocab)


 def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_head_kv: int) -> LazyTensor:
@@ -810,13 +871,17 @@ class LazyUnpickler(pickle.Unpickler):
    CLASSES: dict[tuple[str, str], Any] = {
        # getattr used here as a workaround for mypy not being smart enough to determine
        # the staticmethods have a __func__ attribute.
-        ('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'),
-        ('torch._utils', '_rebuild_tensor_v2'): getattr(lazy_rebuild_tensor_v2, '__func__'),
-        ('torch', 'BFloat16Storage'): LazyStorageKind(DT_BF16),
-        ('torch', 'HalfStorage'): LazyStorageKind(DT_F16),
-        ('torch', 'FloatStorage'): LazyStorageKind(DT_F32),
-        ('torch', 'IntStorage'): LazyStorageKind(DT_I32),
-        ('torch', 'Tensor'): LazyTensor,
+        ("torch._tensor", "_rebuild_from_type_v2"): getattr(
+            rebuild_from_type_v2, "__func__"
+        ),
+        ("torch._utils", "_rebuild_tensor_v2"): getattr(
+            lazy_rebuild_tensor_v2, "__func__"
+        ),
+        ("torch", "BFloat16Storage"): LazyStorageKind(DT_BF16),
+        ("torch", "HalfStorage"): LazyStorageKind(DT_F16),
+        ("torch", "FloatStorage"): LazyStorageKind(DT_F32),
+        ("torch", "IntStorage"): LazyStorageKind(DT_I32),
+        ("torch", "Tensor"): LazyTensor,
    }

    def find_class(self, module: str, name: str) -> Any:
@@ -903,7 +968,7 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc
        executor_class = ProcessPoolExecutor
    else:
        executor_class = ThreadPoolExecutor
-    with executor_class(max_workers=max_workers) as executor:
+    with executor_class(max_workers = max_workers) as executor:
        futures: list[concurrent.futures.Future[Out]] = []
        done = False
        for _ in range(concurrency):
@@ -957,8 +1022,12 @@ def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> N


 class OutputFile:
-    def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None:
-        self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
+    def __init__(
+        self, fname_out: Path, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE
+    ) -> None:
+        self.gguf = gguf.GGUFWriter(
+            fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess
+        )

    def add_meta_arch(self, params: Params) -> None:
        name = "LLaMA"
@@ -967,16 +1036,21 @@ class OutputFile:
        if params.n_ctx == 4096:
            name = "LLaMA v2"
        elif params.path_model is not None:
-            name = str(params.path_model.parent).split('/')[-1]
+            name = str(params.path_model.parent).split("/")[-1]

-        self.gguf.add_name                (name)
-        self.gguf.add_context_length      (params.n_ctx)
-        self.gguf.add_embedding_length    (params.n_embd)
-        self.gguf.add_block_count         (params.n_layer)
-        self.gguf.add_feed_forward_length (params.n_ff)
+        self.gguf.add_name(name)
+        self.gguf.add_context_length(params.n_ctx)
+        self.gguf.add_embedding_length(params.n_embd)
+        self.gguf.add_block_count(params.n_layer)
+        self.gguf.add_feed_forward_length(params.n_ff)
        self.gguf.add_rope_dimension_count(params.n_embd // params.n_head)
-        self.gguf.add_head_count          (params.n_head)
-        self.gguf.add_head_count_kv       (params.n_head_kv)
+        self.gguf.add_head_count(params.n_head)
+        self.gguf.add_head_count_kv(params.n_head_kv)
+
+        if params.f_norm_eps is None:
+            raise ValueError("f_norm_eps is None")
+
+        self.gguf.add_layer_norm_rms_eps(params.f_norm_eps)

        if params.n_experts:
            self.gguf.add_expert_count(params.n_experts)
@@ -984,11 +1058,6 @@ class OutputFile:
        if params.n_experts_used:
            self.gguf.add_expert_used_count(params.n_experts_used)

-        if params.f_norm_eps:
-            self.gguf.add_layer_norm_rms_eps(params.f_norm_eps)
-        else:
-            raise ValueError('f_norm_eps is None')
-
        if params.f_rope_freq_base is not None:
            self.gguf.add_rope_freq_base(params.f_rope_freq_base)

@@ -1020,7 +1089,7 @@ class OutputFile:

        return tokenizer_model

-    def extract_vocabulary_from_model(self, vocab: Vocab) -> tuple[list[bytes], list[float], list[gguf.TokenType]]:
+    def extract_vocabulary_from_model(self, vocab: Vocab) -> Tuple[list, list, list]:
        tokens = []
        scores = []
        toktypes = []
@@ -1055,10 +1124,14 @@ class OutputFile:

    def add_tensor_info(self, name: str, tensor: LazyTensor) -> None:
        n_elements = int(np.prod(tensor.shape))
-        raw_dtype = getattr(tensor.data_type, 'ggml_type', None)
-        data_type = getattr(tensor.data_type, 'quantized_type', None) or tensor.data_type.dtype
+        raw_dtype = getattr(tensor.data_type, "ggml_type", None)
+        data_type = (
+            getattr(tensor.data_type, "quantized_type", None) or tensor.data_type.dtype
+        )
        data_nbytes = tensor.data_type.elements_to_bytes(n_elements)
-        self.gguf.add_tensor_info(name, tensor.shape, data_type, data_nbytes, raw_dtype=raw_dtype)
+        self.gguf.add_tensor_info(
+            name, tensor.shape, data_type, data_nbytes, raw_dtype=raw_dtype
+        )

    def write_meta(self) -> None:
        self.gguf.write_header_to_file()
@@ -1072,10 +1145,14 @@ class OutputFile:

    @staticmethod
    def write_vocab_only(
-        fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
-        endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False,
+        fname_out: Path,
+        params: Params,
+        vocab: Vocab,
+        svocab: gguf.SpecialVocab,
+        endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
+        pad_vocab: bool = False,
    ) -> None:
-        check_vocab_size(params, vocab, pad_vocab = pad_vocab)
+        check_vocab_size(params, vocab, pad_vocab=pad_vocab)

        of = OutputFile(fname_out, endianess=endianess)

@@ -1103,8 +1180,14 @@ class OutputFile:

    @staticmethod
    def write_all(
-        fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab,
-        concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
+        fname_out: Path,
+        ftype: GGMLFileType,
+        params: Params,
+        model: LazyModel,
+        vocab: Vocab,
+        svocab: gguf.SpecialVocab,
+        concurrency: int = DEFAULT_CONCURRENCY,
+        endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
        pad_vocab: bool = False,
    ) -> None:
        check_vocab_size(params, vocab, pad_vocab=pad_vocab)
@@ -1124,19 +1207,26 @@ class OutputFile:
        of.write_tensor_info()

        # tensor data
-        ndarrays_inner = bounded_parallel_map(OutputFile.do_item, model.items(), concurrency = concurrency)
+        ndarrays_inner = bounded_parallel_map(
+            OutputFile.do_item, model.items(), concurrency=concurrency
+        )
        if ftype == GGMLFileType.MostlyQ8_0:
            ndarrays = bounded_parallel_map(
-                OutputFile.maybe_do_quantize, ndarrays_inner, concurrency=concurrency, max_workers=concurrency,
+                OutputFile.maybe_do_quantize,
+                ndarrays_inner,
+                concurrency=concurrency,
+                max_workers=concurrency,
                use_processpool_executor=True,
            )
        else:
            ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner)

        start = time.time()
-        for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)):
+        for i, ((name, lazy_tensor), ndarray) in enumerate(
+            zip(model.items(), ndarrays)
+        ):
            elapsed = time.time() - start
-            size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
+            size = " x ".join(f"{dim:6d}" for dim in lazy_tensor.shape)
            padi = len(str(len(model)))
            print(
                f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}"
@@ -1273,7 +1363,7 @@ def load_some_model(path: Path) -> ModelPlus:
 class VocabFactory:
    def __init__(self, path: Path):
        self.path = path
-        self.files: dict[str, Path | None] = {
+        self.files = {
            "tokenizer.model": None,
            "vocab.json": None,
            "tokenizer.json": None,
@@ -1290,18 +1380,24 @@ class VocabFactory:
                self.files[file] = parent_file_path
        print(f"Found vocab files: {self.files}")

-    def _select_file(self, vocabtype: str | None) -> Path:
+    def _select_file(self, vocabtype: Optional[str]) -> Path:
        if vocabtype in ["spm", "bpe"]:
            for file_key in self.files.keys():
-                if (file := self.files[file_key]) is not None:
-                    return file
+                if self.files[file_key]:
+                    return self.files[file_key]
            raise FileNotFoundError(f"{vocabtype} vocab not found.")
-        if vocabtype == "hfft":
+        elif vocabtype == "hfft":
            # For Hugging Face Fast Tokenizer, return the directory path instead of a specific file
            return self.path
-        raise ValueError(f"Unsupported vocabulary type {vocabtype}")
+        else:
+            raise ValueError(f"Unsupported vocabulary type {vocabtype}")

-    def _create_special_vocab(self, vocab: Vocab, vocabtype: str, model_parent_path: Path) -> gguf.SpecialVocab:
+    def _create_special_vocab(
+        self,
+        vocab: Vocab,
+        vocabtype: str,
+        model_parent_path: Path,
+    ) -> gguf.SpecialVocab:
        load_merges = vocabtype == "bpe"
        n_vocab = vocab.vocab_size if hasattr(vocab, "vocab_size") else None
        return gguf.SpecialVocab(
@@ -1311,12 +1407,13 @@ class VocabFactory:
            n_vocab=n_vocab,
        )

-    def load_vocab(self, vocabtype: str, model_parent_path: Path) -> tuple[Vocab, gguf.SpecialVocab]:
+    def load_vocab(
+        self, vocabtype: str, model_parent_path: Path
+    ) -> Tuple[Vocab, gguf.SpecialVocab]:
        path = self._select_file(vocabtype)
        print(f"Loading vocab file '{path}', type '{vocabtype}'")

        added_tokens_path = path.parent / "added_tokens.json"
-        vocab: Vocab
        if vocabtype == "bpe":
            vocab = BpeVocab(
                path, added_tokens_path if added_tokens_path.exists() else None
@@ -1331,7 +1428,6 @@ class VocabFactory:
            )
        else:
            raise ValueError(f"Unsupported vocabulary type {vocabtype}")
-        # FIXME: Respect --vocab-dir?
        special_vocab = self._create_special_vocab(
            vocab,
            vocabtype,
@@ -1340,17 +1436,18 @@ class VocabFactory:
        return vocab, special_vocab


-def default_outfile(model_paths: list[Path], file_type: GGMLFileType) -> Path:
+def default_output_file(model_paths: list[Path], file_type: GGMLFileType) -> Path:
    namestr = {
-        GGMLFileType.AllF32:    "f32",
+        GGMLFileType.AllF32: "f32",
        GGMLFileType.MostlyF16: "f16",
-        GGMLFileType.MostlyQ8_0:"q8_0",
+        GGMLFileType.MostlyQ8_0: "q8_0",
    }[file_type]
    ret = model_paths[0].parent / f"ggml-model-{namestr}.gguf"
    if ret in model_paths:
        sys.stderr.write(
            f"Error: Default output path ({ret}) would overwrite the input. "
-            "Please explicitly specify a path using --outfile.\n")
+            "Please explicitly specify a path using --outfile.\n"
+        )
        sys.exit(1)
    return ret

@@ -1360,34 +1457,111 @@ def do_dump_model(model_plus: ModelPlus) -> None:
    print(f"model_plus.format = {model_plus.format!r}")
    print(f"model_plus.vocab = {model_plus.vocab!r}")
    for name, lazy_tensor in model_plus.model.items():
-        print(f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}")
+        print(
+            f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}"
+        )


-def main(args_in: list[str] | None = None) -> None:
+def get_argument_parser() -> ArgumentParser:
    output_choices = ["f32", "f16"]
    if np.uint32(1) == np.uint32(1).newbyteorder("<"):
        # We currently only support Q8_0 output on little endian systems.
        output_choices.append("q8_0")
-    vocab_types = ["spm", "bpe", "hfft"]
-    parser = argparse.ArgumentParser(description="Convert a LLaMa model to a GGML compatible file")
-    parser.add_argument("--awq-path",    type=Path,              help="Path to scale awq cache file", default=None)
-    parser.add_argument("--dump",        action="store_true",    help="don't convert, just show what's in the model")
-    parser.add_argument("--dump-single", action="store_true",    help="don't convert, just show what's in a single model file")
-    parser.add_argument("--vocab-only",  action="store_true",    help="extract only the vocab")
-    parser.add_argument("--outtype",     choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)")
-    parser.add_argument("--vocab-dir",   type=Path,              help="directory containing tokenizer.model, if separate from model file")
-    parser.add_argument("--vocab-type",  choices=vocab_types,    help="The vocabulary format used to define the tokenizer model (default: spm)", default="spm")
-    parser.add_argument("--outfile",     type=Path,              help="path to write to; default: based on input")
-    parser.add_argument("model",         type=Path,              help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
-    parser.add_argument("--ctx",         type=int,               help="model training context (default: based on input)")
-    parser.add_argument("--concurrency", type=int,               help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default=DEFAULT_CONCURRENCY)
-    parser.add_argument("--big-endian",  action="store_true",    help="model is executed on big endian machine")
-    parser.add_argument("--pad-vocab",   action="store_true",    help="add pad tokens when model vocab expects more than tokenizer metadata provides")

-    args = parser.parse_args(args_in)
+    parser = argparse.ArgumentParser(
+        description="Convert a LLaMa model to a GGML compatible file"
+    )
+
+    parser.add_argument(
+        "model",
+        type=Path,
+        help="Directory containing the model file or the model file itself (*.pth, *.pt, *.bin)",
+    )
+
+    parser.add_argument(
+        "--awq-path",
+        type=Path,
+        help="Path to the Activation-aware Weight Quantization cache file",
+        default=None,
+    )
+
+    parser.add_argument(
+        "--dump",
+        action="store_true",
+        help="Display the model content without converting it",
+    )
+
+    parser.add_argument(
+        "--dump-single",
+        action="store_true",
+        help="Display the content of a single model file without conversion",
+    )
+
+    parser.add_argument(
+        "--vocab-only",
+        action="store_true",
+        help="Extract and output only the vocabulary",
+    )
+
+    parser.add_argument(
+        "--outtype",
+        choices=output_choices,
+        help="Output format - note: q8_0 may be very slow (default: f16 or f32 based on input)",
+    )
+
+    parser.add_argument(
+        "--vocab-dir",
+        type=Path,
+        help="Directory containing the tokenizer.model, if separate from the model file",
+    )
+
+    parser.add_argument(
+        "--vocab-type",
+        choices=["spm", "bpe", "hfft"],  # hfft: Hugging Face Fast Tokenizer
+        default="spm",
+        help="The vocabulary format used to define the tokenizer model (default: spm)",
+    )
+
+    parser.add_argument(
+        "--pad-vocab",
+        action="store_true",
+        help="Add padding tokens when the model's vocabulary size exceeds the tokenizer metadata",
+    )
+
+    parser.add_argument(
+        "--outfile",
+        type=Path,
+        help="Specify the path for the output file (default is based on input)",
+    )
+
+    parser.add_argument(
+        "--ctx", type=int, help="Model training context (default is based on input)"
+    )
+
+    parser.add_argument(
+        "--concurrency",
+        type=int,
+        help=f"Concurrency used for conversion (default: {DEFAULT_CONCURRENCY})",
+        default=DEFAULT_CONCURRENCY,
+    )
+
+    parser.add_argument(
+        "--big-endian",
+        action="store_true",
+        help="Indicate that the model is executed on a big-endian machine",
+    )
+
+    return parser
+
+
+def main(argv: Optional[list[str]] = None) -> None:
+    parser = get_argument_parser()
+    args = parser.parse_args(argv)
+
    if args.awq_path:
-        sys.path.insert(1, str(Path(__file__).parent / 'awq-py'))
-        from awq.apply_awq import add_scale_weights  # type: ignore[import-not-found]
+        sys.path.insert(1, str(Path(__file__).resolve().parent / "awq-py"))
+        from awq.apply_awq import add_scale_weights
+
        tmp_model_path = args.model / "weighted_model"
        if tmp_model_path.is_dir():
            print(f"{tmp_model_path} exists as a weighted model.")
@@ -1406,11 +1580,14 @@ def main(args_in: list[str] | None = None) -> None:
    if not args.vocab_only:
        model_plus = load_some_model(args.model)
    else:
-        model_plus = ModelPlus(model = {}, paths = [args.model / 'dummy'], format = 'none', vocab = None)
+        model_plus = ModelPlus(
+            model={}, paths=[args.model / "dummy"], format="none", vocab=None
+        )

    if args.dump:
        do_dump_model(model_plus)
        return
+
    endianess = gguf.GGUFEndian.LITTLE
    if args.big_endian:
        endianess = gguf.GGUFEndian.BIG
@@ -1418,10 +1595,12 @@ def main(args_in: list[str] | None = None) -> None:
    params = Params.load(model_plus)
    if params.n_ctx == -1:
        if args.ctx is None:
-            raise Exception("The model doesn't have a context size, and you didn't specify one with --ctx\n"
-                            "Please specify one with --ctx:\n"
-                            " - LLaMA v1: --ctx 2048\n"
-                            " - LLaMA v2: --ctx 4096\n")
+            raise Exception(
+                "The model doesn't have a context size, and you didn't specify one with --ctx\n"
+                "Please specify one with --ctx:\n"
+                " - LLaMA v1: --ctx 2048\n"
+                " - LLaMA v2: --ctx 4096\n"
+            )
        params.n_ctx = args.ctx

    if args.outtype:
@@ -1442,30 +1621,42 @@ def main(args_in: list[str] | None = None) -> None:
        if not args.outfile:
            raise ValueError("need --outfile if using --vocab-only")
        outfile = args.outfile
-        OutputFile.write_vocab_only(outfile, params, vocab, special_vocab,
-                                    endianess=endianess, pad_vocab=args.pad_vocab)
+        OutputFile.write_vocab_only(
+            outfile,
+            params,
+            vocab,
+            special_vocab,
+            endianess=endianess,
+            pad_vocab=args.pad_vocab,
+        )
        print(f"Wrote {outfile}")
        return

    if model_plus.vocab is not None and args.vocab_dir is None:
        vocab = model_plus.vocab

-    print(f"Vocab info: {vocab}")
-    print(f"Special vocab info: {special_vocab}")
-
-    model   = model_plus.model
-    model   = convert_model_names(model, params)
-    ftype   = pick_output_type(model, args.outtype)
-    model   = convert_to_output_type(model, ftype)
-    outfile = args.outfile or default_outfile(model_plus.paths, ftype)
+    model = model_plus.model
+    model = convert_model_names(model, params)
+    ftype = pick_output_type(model, args.outtype)
+    model = convert_to_output_type(model, ftype)
+    outfile = args.outfile or default_output_file(model_plus.paths, ftype)

    params.ftype = ftype
    print(f"Writing {outfile}, format {ftype}")

-    OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab,
-                         concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab)
+    OutputFile.write_all(
+        outfile,
+        ftype,
+        params,
+        model,
+        vocab,
+        special_vocab,
+        concurrency=args.concurrency,
+        endianess=endianess,
+        pad_vocab=args.pad_vocab,
+    )
    print(f"Wrote {outfile}")


-if __name__ == '__main__':
-    main()
+if __name__ == "__main__":
+    main(sys.argv[1:])  # Exclude the first element (script name) from sys.argv
--- a/examples/finetune/finetune.cpp
+++ b/examples/finetune/finetune.cpp
@@ -1800,8 +1800,6 @@ int main(int argc, char ** argv) {
    std::vector<size_t> train_samples_begin;
    std::vector<size_t> train_samples_size;
    printf("%s: tokenize training data from %s\n", __func__, params.common.fn_train_data);
-    printf("%s: sample-start: %s\n", __func__, params.common.sample_start.c_str());
-    printf("%s: include-sample-start: %s\n", __func__, params.common.include_sample_start ? "true" : "false");
    tokenize_file(lctx,
            params.common.fn_train_data,
            params.common.sample_start,
--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@@ -26,7 +26,6 @@ struct StatParams {
    std::string ofile = "imatrix.dat";
    int         n_output_frequency = 10;
    int         verbosity = 1;
-    int         keep_every = 0;
    bool        collect_output_weight = false;
 };

@@ -43,9 +42,6 @@ private:
    int                                    m_last_call = 0;
    std::vector<float>                     m_src1_data;
    std::vector<int>                       m_ids; // the expert ids from ggml_mul_mat_id
-                                                  //
-    void save_imatrix(const char * file_name) const;
-    void keep_imatrix(int ncall) const;
 };

 bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
@@ -121,9 +117,6 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
                if (m_last_call % m_params.n_output_frequency == 0) {
                    save_imatrix();
                }
-                if (m_params.keep_every > 0 && m_last_call%m_params.keep_every == 0) {
-                    keep_imatrix(m_last_call);
-                }
            }
        }
    } else {
@@ -150,9 +143,6 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
            if (m_last_call % m_params.n_output_frequency == 0) {
                save_imatrix();
            }
-            if (m_params.keep_every > 0 && m_last_call%m_params.keep_every == 0) {
-                keep_imatrix(m_last_call);
-            }
        }
    }

@@ -160,18 +150,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
 }

 void IMatrixCollector::save_imatrix() const {
-    save_imatrix(m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str());
-}
-
-void IMatrixCollector::keep_imatrix(int ncall) const {
-    auto file_name = m_params.ofile;
-    if (file_name.empty()) file_name = "imatrix.dat";
-    file_name += ".at_";
-    file_name += std::to_string(ncall);
-    save_imatrix(file_name.c_str());
-}
-
-void IMatrixCollector::save_imatrix(const char * fname) const {
+    const char * fname = m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str();
    std::ofstream out(fname, std::ios::binary);
    int n_entries = m_stats.size();
    out.write((const char*)&n_entries, sizeof(n_entries));
@@ -269,7 +248,7 @@ static void process_logits(
    }
 }

-static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool compute_ppl) {
+static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {

    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
    const int n_ctx = llama_n_ctx(ctx);
@@ -290,12 +269,10 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
    }

    std::vector<float> logit_history;
-    std::vector<float> prob_history;
+    logit_history.resize(tokens.size());

-    if (compute_ppl) {
-        logit_history.resize(tokens.size());
-        prob_history.resize(tokens.size());
-    }
+    std::vector<float> prob_history;
+    prob_history.resize(tokens.size());

    const int n_chunk_max = tokens.size() / n_ctx;

@@ -311,17 +288,12 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool

    std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);

-    const int num_batches = (n_ctx + n_batch - 1) / n_batch;
-
-    std::vector<float> logits;
-    if (compute_ppl && num_batches > 1) {
-        logits.reserve((size_t)n_ctx * n_vocab);
-    }
-
    for (int i = 0; i < n_chunk; ++i) {
        const int start =     i * n_ctx;
        const int end   = start + n_ctx;

+        const int num_batches = (n_ctx + n_batch - 1) / n_batch;
+
        std::vector<float> logits;

        const auto t_start = std::chrono::high_resolution_clock::now();
@@ -349,10 +321,8 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
            // restore the original token in case it was set to BOS
            tokens[batch_start] = token_org;

-            if (compute_ppl && num_batches > 1) {
-                const auto * batch_logits = llama_get_logits(ctx);
-                logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
-            }
+            const auto * batch_logits = llama_get_logits(ctx);
+            logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
        }

        const auto t_end = std::chrono::high_resolution_clock::now();
@@ -368,32 +338,25 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
            fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
        }

-        if (compute_ppl) {
-            const int first = n_ctx/2;
-            const auto all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
-            process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
-                    workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
-            count += n_ctx - first - 1;
+        const int first = n_ctx/2;
+        process_logits(n_vocab, logits.data() + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
+                       workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
+        count += n_ctx - first - 1;

-            printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
-            fflush(stdout);
-
-            logits.clear();
-        }
+        printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
+        fflush(stdout);
    }
    printf("\n");

-    if (compute_ppl) {
-        nll2 /= count;
-        nll /= count;
-        const double ppl = exp(nll);
-        nll2 -= nll * nll;
-        if (nll2 > 0) {
-            nll2 = sqrt(nll2/(count-1));
-            printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
-        } else {
-            printf("Unexpected negative standard deviation of log(prob)\n");
-        }
+    nll2 /= count;
+    nll /= count;
+    const double ppl = exp(nll);
+    nll2 -= nll * nll;
+    if (nll2 > 0) {
+        nll2 = sqrt(nll2/(count-1));
+        printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
+    } else {
+        printf("Unexpected negative standard deviation of log(prob)\n");
    }

    return true;
@@ -402,7 +365,6 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
 int main(int argc, char ** argv) {

    StatParams sparams;
-    bool compute_ppl = true;
    std::vector<char*> args;
    args.push_back(argv[0]);
    int iarg = 1;
@@ -419,21 +381,12 @@ int main(int argc, char ** argv) {
        }
        else if (arg == "--verbosity") {
            sparams.verbosity = std::stoi(argv[++iarg]);
-        } else if (arg == "--no-ppl") {
-            compute_ppl = false;
-        } else if (arg == "--keep-imatrix") {
-            sparams.keep_every = std::stoi(argv[++iarg]);
        } else {
            args.push_back(argv[iarg]);
        }
    }
    if (iarg < argc) {
-        std::string arg{argv[iarg]};
-        if (arg == "--no-ppl") {
-            compute_ppl = false;
-        } else {
-            args.push_back(argv[iarg]);
-        }
+        args.push_back(argv[iarg]);
    }

    gpt_params params;
@@ -495,7 +448,7 @@ int main(int argc, char ** argv) {
        fprintf(stderr, "%s\n", get_system_info(params).c_str());
    }

-    bool OK = compute_imatrix(ctx, params, compute_ppl);
+    bool OK = compute_imatrix(ctx, params);
    if (!OK) {
        return 1;
    }
--- a/examples/llama.vim
+++ b/examples/llama.vim
@@ -6,7 +6,7 @@
 " Similarly, you could add an insert mode keybind with
 " inoremap <C-B> <Cmd>call llama#doLlamaGen()<CR>
 "
-" g:llama_api_url, g:llama_api_key and g:llama_overrides can be configured in your .vimrc
+" g:llama_api_url and g:llama_overrides can be configured in your .vimrc
 " let g:llama_api_url = "192.168.1.10:8080"
 " llama_overrides can also be set through buffer/window scopes. For instance
 " autocmd filetype python let b:llama_overrides = {"temp": 0.2}
@@ -82,9 +82,6 @@ func llama#doLlamaGen()
   endif
   let l:querydata.prompt = join(l:buflines, "\n")
   let l:curlcommand = copy(s:curlcommand)
-   if exists("g:llama_api_key")
-       call extend(l:curlcommand, ['--header', 'Authorization: Bearer ' .. g:llama_api_key])
-   endif
   let l:curlcommand[2] = json_encode(l:querydata)
   let b:job = job_start(l:curlcommand, {"callback": function("s:callbackHandler", [l:cbuffer])})
 endfunction
--- a/examples/llava/MobileVLM-README.md
+++ b/examples/llava/MobileVLM-README.md
@@ -1,131 +0,0 @@
-# MobileVLM
-
-Currently this implementation supports [MobileVLM-v1.7](https://huggingface.co/mtgv/MobileVLM-1.7B) variants.
-
-for more information, please go to [Meituan-AutoML/MobileVLM](https://github.com/Meituan-AutoML/MobileVLM)
-
-The implementation is based on llava, and is compatible with llava and mobileVLM. The usage is basically same as llava.
-
-## Usage
-Build with cmake or run `make llava-cli` to build it.
-
-After building, run: `./llava-cli` to see the usage. For example:
-
-```sh
-./llava-cli -m MobileVLM-1.7B/ggml-model-q4_k.gguf \
-    --mmproj MobileVLM-1.7B/mmproj-model-f16.gguf \
-    --image path/to/an/image.jpg \
-    -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWho is the author of this book? Answer the question using a single word or phrase. ASSISTANT:"
-```
-
-## Model conversion
-
- Clone `mobileVLM-1.7B` and `clip-vit-large-patch14-336` locally:
-
-```sh
-git clone https://huggingface.co/mtgv/MobileVLM-1.7B
-
-git clone https://huggingface.co/openai/clip-vit-large-patch14-336
-```
-
-2. Use `llava-surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents:
-
-```sh
-python ./examples/llava/llava-surgery.py -m path/to/MobileVLM-1.7B
-```
-
-3. Use `convert-image-encoder-to-gguf.py` with `--projector-type ldp` to convert the LLaVA image encoder to GGUF:
-
-```sh
-python ./examples/llava/convert-image-encoder-to-gguf \
-    -m path/to/clip-vit-large-patch14-336 \
-    --llava-projector path/to/MobileVLM-1.7B/llava.projector \
-    --output-dir path/to/MobileVLM-1.7B \
-    --projector-type ldp
-```
-
-4. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF:
-
-```sh
-python ./convert.py path/to/MobileVLM-1.7B
-```
-
-5. Use `quantize` to convert LLaMA part's DataType from `fp16` to `q4_k`
-```sh
-./quantize path/to/MobileVLM-1.7B/ggml-model-f16.gguf path/to/MobileVLM-1.7B/ggml-model-q4_k.gguf q4_k_s
-```
-
-Now both the LLaMA part and the image encoder is in the `MobileVLM-1.7B` directory.
-
-## Android compile and run
-### compile
-refer to `examples/llava/android/build_64.sh`
-```sh
-mkdir examples/llava/android/build_64
-cd examples/llava/android/build_64
-../build_64.sh
-```
-### run on Android
-refer to `android/adb_run.sh`, modify resources' `name` and `path`
-
-## some result on Android with `Snapdragon 888` chip
-### case 1
-**input**
-```sh
-/data/local/tmp/llava-cli \
-    -m /data/local/tmp/ggml-model-q4_k.gguf \
-    --mmproj /data/local/tmp/mmproj-model-f16.gguf \
-    -t 4 \
-    --image /data/local/tmp/demo.jpg \
-    -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWho is the author of this book? \nAnswer the question using a single word or phrase. ASSISTANT:"
-```
-**output**
-```sh
-encode_image_with_clip: image encoded in 21148.71 ms by CLIP (  146.87 ms per image patch)
- Susan Wise Bauer
-llama_print_timings:        load time =   23574.72 ms
-llama_print_timings:      sample time =       1.24 ms /     6 runs   (    0.21 ms per token,  4850.44 tokens per second)
-llama_print_timings: prompt eval time =   12460.15 ms /   246 tokens (   50.65 ms per token,    19.74 tokens per second)
-llama_print_timings:        eval time =     424.86 ms /     6 runs   (   70.81 ms per token,    14.12 tokens per second)
-llama_print_timings:       total time =   34731.93 ms
-```
-### case 2
-**input**
-```sh
-/data/local/tmp/llava-cli \
-    -m /data/local/tmp/ggml-model-q4_k.gguf \
-    --mmproj /data/local/tmp/mmproj-model-f16.gguf \
-    -t 4 \
-    --image /data/local/tmp/cat.jpeg \
-    -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat is in the image? ASSISTANT:"
-```
-
-**output**
-```sh
-encode_image_with_clip: image encoded in 21149.51 ms by CLIP (  146.87 ms per image patch)
- The image depicts a cat sitting in the grass near some tall green plants.
-llama_print_timings:        load time =   23257.32 ms
-llama_print_timings:      sample time =       5.25 ms /    18 runs   (    0.29 ms per token,  3430.53 tokens per second)
-llama_print_timings: prompt eval time =   11900.73 ms /   232 tokens (   51.30 ms per token,    19.49 tokens per second)
-llama_print_timings:        eval time =    1279.03 ms /    18 runs   (   71.06 ms per token,    14.07 tokens per second)
-llama_print_timings:       total time =   34570.79 ms
-```
-
-## Minor shortcomings
-The `n_patch` of output in `ldp` is 1/4 of the input. In order to implement quickly, we uniformly modified `clip_n_patches` function to a quarter. when counting the time consumption, the calculated time will be 4 times bigger than the real cost.
-
-## TODO
-
- [ ] Support non-CPU backend for the new operators, such as `depthwise`, `hardswish`, `hardsigmoid`
- [ ] Optimize LDP projector performance
-
-      - Optimize the structure definition to avoid unnecessary memory rearrangements, to reduce the use of `ggml_permute_cpy`;
-      - Optimize operator implementation (ARM CPU/NVIDIA GPU): such as depthwise conv, hardswish, hardsigmoid, etc.
- [ ] run MobileVLM on `Jetson Orin`
- [ ] Support more model variants, such as `MobileVLM-3B`.
-
-
-## contributor
-```sh
-zhangjidong05, yangyang260, huyiming03, chenxiaotao03
-```
--- a/examples/llava/android/adb_run.sh
+++ b/examples/llava/android/adb_run.sh
@@ -1,53 +0,0 @@
-#!/bin/bash
-
-model_dir="/Users/cxt/model/llm/mobileVLM/MobileVLM-1.7B_processed"
-projector_name="mmproj-model-f16.gguf"
-llama_name="ggml-model-q4_k.gguf"
-img_dir="/Users/cxt/model/llm"
-img_name="demo.jpg"
-prompt="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWho is the author of this book? \nAnswer the question using a single word or phrase. ASSISTANT:"
-# img_name="cat.jpeg"
-# prompt="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat is in the image? ASSISTANT:"
-
-program_dir="build_64/bin"
-binName="llava-cli"
-n_threads=4
-
-
-deviceDir="/data/local/tmp"
-saveDir="output"
-if [ ! -d ${saveDir} ]; then
-    mkdir ${saveDir}
-fi
-
-
-function android_run() {
-    # # copy resource into device
-    # adb push ${model_dir}/${projector_name} ${deviceDir}/${projector_name}
-    # adb push ${model_dir}/${llama_name} ${deviceDir}/${llama_name}
-    adb push ${img_dir}/${img_name} ${deviceDir}/${img_name}
-    # copy program into device
-    adb push ${program_dir}/${binName} ${deviceDir}/${binName}
-    adb shell "chmod 0777 ${deviceDir}/${binName}"
-
-    # run
-    adb shell "echo cd ${deviceDir} ${deviceDir}/${binName} \
-                                                 -m ${deviceDir}/${llama_name} \
-                                                 --mmproj ${deviceDir}/${projector_name} \
-                                                 -t ${n_threads} \
-                                                 --image ${deviceDir}/${img_name} \
-                                                 -p \"${prompt}\" \
-                                                 > ${deviceDir}/${modelName}_${projector_name}_${n_threads}_${img_name}.txt"
-    adb shell "cd ${deviceDir}; pwd; ${deviceDir}/${binName} \
-                                                 -m ${deviceDir}/${llama_name} \
-                                                 --mmproj ${deviceDir}/${projector_name} \
-                                                 -t ${n_threads} \
-                                                 --image ${deviceDir}/${img_name} \
-                                                 -p \"${prompt}\" \
-                                                 >> ${deviceDir}/${modelName}_${projector_name}_${n_threads}_${img_name}.txt 2>&1"
-    adb pull ${deviceDir}/${modelName}_${projector_name}_${n_threads}_${img_name}.txt ${saveDir}
-}
-
-android_run
-
-echo "android_run is Done!"
--- a/examples/llava/android/build_64.sh
+++ b/examples/llava/android/build_64.sh
@@ -1,8 +0,0 @@
-#!/bin/bash
-cmake ../../../../ \
-DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
-DCMAKE_BUILD_TYPE=Release \
-DANDROID_ABI="arm64-v8a" \
-DANDROID_PLATFORM=android-23 $1
-
-make -j4
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -2,6 +2,17 @@
 // so there might be still unnecessary artifacts hanging around
 // I'll gradually clean and extend it

+#include <cassert>
+#include <cmath>
+#include <cstdlib>
+#include <cstring>
+#include <fstream>
+#include <iostream>
+#include <map>
+#include <regex>
+#include <stdexcept>
+#include <vector>
+
 #include "clip.h"
 #include "ggml.h"
 #include "ggml-alloc.h"
@@ -18,19 +29,6 @@
 #define STB_IMAGE_IMPLEMENTATION
 #include "stb_image.h"

-#include <cassert>
-#include <cmath>
-#include <cstdlib>
-#include <cstring>
-#include <fstream>
-#include <iostream>
-#include <map>
-#include <regex>
-#include <stdexcept>
-#include <vector>
-#include <sstream>
-#include <cinttypes>
-
 static std::string format(const char * fmt, ...) {
    va_list ap;
    va_list ap2;
@@ -69,7 +67,6 @@ static std::string format(const char * fmt, ...) {
 #define KEY_PATCH_SIZE "clip.vision.patch_size"
 #define KEY_IMAGE_MEAN "clip.vision.image_mean"
 #define KEY_IMAGE_STD "clip.vision.image_std"
-#define KEY_PROJ_TYPE "clip.projector_type"

 //
 // tensor name constants
@@ -92,21 +89,6 @@ static std::string format(const char * fmt, ...) {
 #define TN_TEXT_PROJ "text_projection.weight"
 #define TN_VIS_PROJ "visual_projection.weight"
 #define TN_LLAVA_PROJ "mm.%d.%s"
-#define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s"
-#define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
-
-
-enum projector_type {
-    PROJECTOR_TYPE_MLP,
-    PROJECTOR_TYPE_LDP,
-    PROJECTOR_TYPE_UNKNOWN,
-};
-
-static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
-    { PROJECTOR_TYPE_MLP,           "mlp"     },
-    { PROJECTOR_TYPE_LDP,          "ldp"    },
-};
-

 //
 // utilities to get data from a gguf file
@@ -147,91 +129,6 @@ static std::string get_ftype(int ftype) {
    return ggml_type_name(static_cast<ggml_type>(ftype));
 }

-static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) {
-    switch (type) {
-        case GGUF_TYPE_UINT8:   return std::to_string(((const uint8_t  *)data)[i]);
-        case GGUF_TYPE_INT8:    return std::to_string(((const int8_t   *)data)[i]);
-        case GGUF_TYPE_UINT16:  return std::to_string(((const uint16_t *)data)[i]);
-        case GGUF_TYPE_INT16:   return std::to_string(((const int16_t  *)data)[i]);
-        case GGUF_TYPE_UINT32:  return std::to_string(((const uint32_t *)data)[i]);
-        case GGUF_TYPE_INT32:   return std::to_string(((const int32_t  *)data)[i]);
-        case GGUF_TYPE_UINT64:  return std::to_string(((const uint64_t *)data)[i]);
-        case GGUF_TYPE_INT64:   return std::to_string(((const int64_t  *)data)[i]);
-        case GGUF_TYPE_FLOAT32: return std::to_string(((const float    *)data)[i]);
-        case GGUF_TYPE_FLOAT64: return std::to_string(((const double   *)data)[i]);
-        case GGUF_TYPE_BOOL:    return ((const bool *)data)[i] ? "true" : "false";
-        default:                return format("unknown type %d", type);
-    }
-}
-
-
-static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
-    std::string result;
-    for (size_t pos = 0; ; pos += search.length()) {
-        auto new_pos = s.find(search, pos);
-        if (new_pos == std::string::npos) {
-            result += s.substr(pos, s.size() - pos);
-            break;
-        }
-        result += s.substr(pos, new_pos - pos) + replace;
-        pos = new_pos;
-    }
-    s = std::move(result);
-}
-
-static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
-    const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
-
-    switch (type) {
-        case GGUF_TYPE_STRING:
-            return gguf_get_val_str(ctx_gguf, i);
-        case GGUF_TYPE_ARRAY:
-            {
-                const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i);
-                int arr_n = gguf_get_arr_n(ctx_gguf, i);
-                const void * data = gguf_get_arr_data(ctx_gguf, i);
-                std::stringstream ss;
-                ss << "[";
-                for (int j = 0; j < arr_n; j++) {
-                    if (arr_type == GGUF_TYPE_STRING) {
-                        std::string val = gguf_get_arr_str(ctx_gguf, i, j);
-                        // escape quotes
-                        replace_all(val, "\\", "\\\\");
-                        replace_all(val, "\"", "\\\"");
-                        ss << '"' << val << '"';
-                    } else if (arr_type == GGUF_TYPE_ARRAY) {
-                        ss << "???";
-                    } else {
-                        ss << gguf_data_to_str(arr_type, data, j);
-                    }
-                    if (j < arr_n - 1) {
-                        ss << ", ";
-                    }
-                }
-                ss << "]";
-                return ss.str();
-            }
-        default:
-            return gguf_data_to_str(type, gguf_get_val_data(ctx_gguf, i), 0);
-    }
-}
-
-static void print_tensor_info(const ggml_tensor* tensor, const char* prefix = "") {
-    size_t tensor_size = ggml_nbytes(tensor);
-    printf("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n",
-            prefix, ggml_n_dims(tensor), tensor->name, tensor_size,
-            tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], ggml_type_name(tensor->type));
-}
-
-static projector_type clip_projector_type_from_string(const std::string & name) {
-    for (const auto & kv : PROJECTOR_TYPE_NAMES) { // NOLINT
-        if (kv.second == name) {
-            return kv.first;
-        }
-    }
-    return PROJECTOR_TYPE_UNKNOWN;
-}
-
 //
 // image data
 //
@@ -308,32 +205,6 @@ struct clip_vision_model {
    struct ggml_tensor * mm_0_b;
    struct ggml_tensor * mm_2_w;
    struct ggml_tensor * mm_2_b;
-
-    // MobileVLM projection
-    struct ggml_tensor * mm_model_mlp_1_w;
-    struct ggml_tensor * mm_model_mlp_1_b;
-    struct ggml_tensor * mm_model_mlp_3_w;
-    struct ggml_tensor * mm_model_mlp_3_b;
-    struct ggml_tensor * mm_model_block_1_block_0_0_w;
-    struct ggml_tensor * mm_model_block_1_block_0_1_w;
-    struct ggml_tensor * mm_model_block_1_block_0_1_b;
-    struct ggml_tensor * mm_model_block_1_block_1_fc1_w;
-    struct ggml_tensor * mm_model_block_1_block_1_fc1_b;
-    struct ggml_tensor * mm_model_block_1_block_1_fc2_w;
-    struct ggml_tensor * mm_model_block_1_block_1_fc2_b;
-    struct ggml_tensor * mm_model_block_1_block_2_0_w;
-    struct ggml_tensor * mm_model_block_1_block_2_1_w;
-    struct ggml_tensor * mm_model_block_1_block_2_1_b;
-    struct ggml_tensor * mm_model_block_2_block_0_0_w;
-    struct ggml_tensor * mm_model_block_2_block_0_1_w;
-    struct ggml_tensor * mm_model_block_2_block_0_1_b;
-    struct ggml_tensor * mm_model_block_2_block_1_fc1_w;
-    struct ggml_tensor * mm_model_block_2_block_1_fc1_b;
-    struct ggml_tensor * mm_model_block_2_block_1_fc2_w;
-    struct ggml_tensor * mm_model_block_2_block_1_fc2_b;
-    struct ggml_tensor * mm_model_block_2_block_2_0_w;
-    struct ggml_tensor * mm_model_block_2_block_2_1_w;
-    struct ggml_tensor * mm_model_block_2_block_2_1_b;
 };

 struct clip_ctx {
@@ -342,7 +213,6 @@ struct clip_ctx {
    bool has_llava_projector = false;

    struct clip_vision_model vision_model;
-    projector_type proj_type = PROJECTOR_TYPE_MLP;

    float image_mean[3];
    float image_std[3];
@@ -560,135 +430,16 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
            free(patches_data);
        }

-        // shape [1, 576, 1024]
-        // ne is whcn, ne = [1024, 576, 1, 1]
        embeddings = ggml_get_rows(ctx0, embeddings, patches);

-        // print_tensor_info(embeddings, "embeddings");
+        // mm projection 0
+        embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
+        embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);

-        // llava projector
-        if (ctx->proj_type == PROJECTOR_TYPE_MLP) {
-            embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
-            embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
+        embeddings = ggml_gelu(ctx0, embeddings);

-            embeddings = ggml_gelu(ctx0, embeddings);
-
-            embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
-            embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
-        }
-        else if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
-            // MobileVLM projector
-            int n_patch = 24;
-            struct ggml_tensor * mlp_1 = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, embeddings);
-            mlp_1 = ggml_add(ctx0, mlp_1, model.mm_model_mlp_1_b);
-            mlp_1 = ggml_gelu(ctx0, mlp_1);
-            struct ggml_tensor * mlp_3 = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, mlp_1);
-            mlp_3 = ggml_add(ctx0, mlp_3, model.mm_model_mlp_3_b);
-            // mlp_3 shape = [1, 576, 2048], ne = [2048, 576, 1, 1]
-
-            // block 1
-            struct ggml_tensor * block_1 = nullptr;
-            {
-                // transpose from [1, 576, 2048] --> [1, 2048, 576] --> [1, 2048, 24, 24]
-                mlp_3 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_3, 1, 0, 2, 3));
-                mlp_3 = ggml_reshape_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]);
-                // stride = 1, padding = 1, bias is nullptr
-                block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1);
-
-                // layer norm
-                // // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
-                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3));
-                // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
-                block_1 = ggml_norm(ctx0, block_1, eps);
-                block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_0_1_w), model.mm_model_block_1_block_0_1_b);
-                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
-
-                // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
-                // hardswish
-                struct ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1);
-
-                block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0);
-                // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
-                // pointwise conv
-                block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]);
-                block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc1_w, block_1);
-                block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc1_b);
-                block_1 = ggml_relu(ctx0, block_1);
-                block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc2_w, block_1);
-                block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc2_b);
-                block_1 = ggml_hardsigmoid(ctx0, block_1);
-                // block_1_hw shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1], block_1 shape = [1, 2048], ne = [2048, 1, 1, 1]
-                block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]);
-                block_1 = ggml_mul(ctx0, block_1_hw, block_1);
-
-                int w = block_1->ne[0], h = block_1->ne[1];
-                block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]);
-                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));
-
-                // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1]
-                block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_2_0_w, block_1);
-                block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]);
-
-                // block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
-                block_1 = ggml_norm(ctx0, block_1, eps);
-                block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_2_1_w), model.mm_model_block_1_block_2_1_b);
-                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
-                // block1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
-                // residual
-                block_1 = ggml_add(ctx0, mlp_3, block_1);
-            }
-
-            // block_2
-            {
-                // stride = 2
-                block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_2_block_0_0_w, block_1, 2, 2, 1, 1, 1, 1);
-
-                // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
-                // layer norm
-                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3));
-                // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1]
-                block_1 = ggml_norm(ctx0, block_1, eps);
-                block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_0_1_w), model.mm_model_block_2_block_0_1_b);
-                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
-                // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
-                // hardswish
-                struct ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1);
-
-                // not sure the parameters is right for globalAvgPooling
-                block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0);
-                // block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
-                // pointwise conv
-                block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]);
-                block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc1_w, block_1);
-                block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc1_b);
-                block_1 = ggml_relu(ctx0, block_1);
-                block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc2_w, block_1);
-                block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc2_b);
-                block_1 = ggml_hardsigmoid(ctx0, block_1);
-
-                // block_1_hw shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1], block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
-                block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]);
-                block_1 = ggml_mul(ctx0, block_1_hw, block_1);
-
-                int w = block_1->ne[0], h = block_1->ne[1];
-                block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]);
-                block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));
-                // block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1]
-                block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_2_0_w, block_1);
-                block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]);
-
-
-                // block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1]
-                block_1 = ggml_norm(ctx0, block_1, eps);
-                block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_2_1_w), model.mm_model_block_2_block_2_1_b);
-                block_1 = ggml_reshape_3d(ctx0, block_1, block_1->ne[0], block_1->ne[1] * block_1->ne[2], block_1->ne[3]);
-                // block_1 shape = [1, 144, 2048], ne = [2048, 144, 1]
-            }
-            embeddings = block_1;
-        }
-        else {
-            GGML_ASSERT(false);
-        }
+        embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
+        embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
    }

    // build the graph
@@ -734,47 +485,16 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
        printf("\n");
    }
    const int n_tensors = gguf_get_n_tensors(ctx);
-
    // kv
-    const int n_kv = gguf_get_n_kv(ctx);
-    printf("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n",
-        __func__, n_kv, n_tensors, fname);
-    {
-        std::map<enum ggml_type, uint32_t> n_type;
+    if (verbosity >= 3) {
+        const int n_kv = gguf_get_n_kv(ctx);

-        for (int i = 0; i < n_tensors; i++) {
-            enum ggml_type type = gguf_get_tensor_type(ctx, i);
+        for (int i = 0; i < n_kv; ++i) {
+            const char * key = gguf_get_key(ctx, i);

-            n_type[type]++;
-        }
-
-        printf("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
-        for (int i = 0; i < n_kv; i++) {
-            const char * name           = gguf_get_key(ctx, i);
-            const enum gguf_type type   = gguf_get_kv_type(ctx, i);
-            const std::string type_name =
-                type == GGUF_TYPE_ARRAY
-                ? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(ctx, i)), gguf_get_arr_n(ctx, i))
-                : gguf_type_name(type);
-
-            std::string value          = gguf_kv_to_str(ctx, i);
-            const size_t MAX_VALUE_LEN = 40;
-            if (value.size() > MAX_VALUE_LEN) {
-                value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str());
-            }
-            replace_all(value, "\n", "\\n");
-
-            printf("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
-        }
-
-        // print type counts
-        for (auto & kv : n_type) {
-            if (kv.second == 0) {
-                continue;
-            }
-
-            printf("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
+            printf("%s: kv[%d]: key = %s\n", __func__, i, key);
        }
+        printf("\n");
    }

    // data
@@ -783,13 +503,12 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
        for (int i = 0; i < n_tensors; ++i) {
            const char * name = gguf_get_tensor_name(ctx, i);
            const size_t offset = gguf_get_tensor_offset(ctx, i);
-            enum ggml_type type = gguf_get_tensor_type(ctx, i);
            struct ggml_tensor * cur = ggml_get_tensor(meta, name);
            size_t tensor_size = ggml_nbytes(cur);
            buffer_size += tensor_size;
            if (verbosity >= 3) {
-                printf("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
-                       __func__, i, ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_type_name(type));
+                printf("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu\n", __func__, i,
+                       ggml_n_dims(cur), cur->name, tensor_size, offset);
            }
        }
    }
@@ -798,18 +517,6 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {

    clip_ctx * new_clip = new clip_ctx;

-    // update projector type
-    {
-        int idx = gguf_find_key(ctx, KEY_PROJ_TYPE);
-        if (idx != -1) {
-            const std::string proj_type = gguf_get_val_str(ctx, idx);
-            new_clip->proj_type = clip_projector_type_from_string(proj_type);
-        }
-        else {
-            new_clip->proj_type = PROJECTOR_TYPE_MLP;
-        }
-    }
-
 #ifdef GGML_USE_CUBLAS
    new_clip->backend = ggml_backend_cuda_init(0);
    printf("%s: CLIP using CUDA backend\n", __func__);
@@ -954,45 +661,10 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
        vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
        vision_model.pre_ln_w            = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
        vision_model.pre_ln_b            = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
-
-        // LLaVA projection
-        if (new_clip->proj_type == PROJECTOR_TYPE_MLP) {
-            vision_model.mm_0_w              = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight"));
-            vision_model.mm_0_b              = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias"));
-            vision_model.mm_2_w              = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
-            vision_model.mm_2_b              = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
-        }
-        else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) {
-            // MobileVLM projection
-            vision_model.mm_model_mlp_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "weight"));
-            vision_model.mm_model_mlp_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "bias"));
-            vision_model.mm_model_mlp_3_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 3, "weight"));
-            vision_model.mm_model_mlp_3_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 3, "bias"));
-            vision_model.mm_model_block_1_block_0_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "0.weight"));
-            vision_model.mm_model_block_1_block_0_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.weight"));
-            vision_model.mm_model_block_1_block_0_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.bias"));
-            vision_model.mm_model_block_1_block_1_fc1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.weight"));
-            vision_model.mm_model_block_1_block_1_fc1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.bias"));
-            vision_model.mm_model_block_1_block_1_fc2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.weight"));
-            vision_model.mm_model_block_1_block_1_fc2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.bias"));
-            vision_model.mm_model_block_1_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "0.weight"));
-            vision_model.mm_model_block_1_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.weight"));
-            vision_model.mm_model_block_1_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.bias"));
-            vision_model.mm_model_block_2_block_0_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "0.weight"));
-            vision_model.mm_model_block_2_block_0_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.weight"));
-            vision_model.mm_model_block_2_block_0_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.bias"));
-            vision_model.mm_model_block_2_block_1_fc1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.weight"));
-            vision_model.mm_model_block_2_block_1_fc1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.bias"));
-            vision_model.mm_model_block_2_block_1_fc2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.weight"));
-            vision_model.mm_model_block_2_block_1_fc2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.bias"));
-            vision_model.mm_model_block_2_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "0.weight"));
-            vision_model.mm_model_block_2_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.weight"));
-            vision_model.mm_model_block_2_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.bias"));
-        }
-        else {
-            std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type];
-            throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
-        }
+        vision_model.mm_0_w              = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight"));
+        vision_model.mm_0_b              = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias"));
+        vision_model.mm_2_w              = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
+        vision_model.mm_2_b              = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));

        vision_model.layers.resize(hparams.n_layer);
        for (int il = 0; il < hparams.n_layer; ++il) {
@@ -1428,25 +1100,13 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
 }

 int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
-    if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
-        return ctx->vision_model.mm_model_block_1_block_2_1_b->ne[0];
-    }
-    else if (ctx->proj_type == PROJECTOR_TYPE_MLP) {
-        return ctx->vision_model.mm_2_b->ne[0];
-    }
-    else {
-        std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type];
-        throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
-    }
+    return ctx->vision_model.mm_2_b->ne[0];
 }

 int clip_n_patches(const struct clip_ctx * ctx) {
    auto & params = ctx->vision_model.hparams;
-    int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
-    if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
-        n_patches /= 4;
-    }
-    return n_patches;
+
+    return (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
 }

 size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
--- a/examples/llava/convert-image-encoder-to-gguf.py
+++ b/examples/llava/convert-image-encoder-to-gguf.py
@@ -81,7 +81,6 @@ ap.add_argument("--vision-only", action="store_true", required=False,
 ap.add_argument("--clip_model_is_vision", action="store_true", required=False,
                help="The clip model is a pure vision model (ShareGPT4V vision extract for example)")
 ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.")
-ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp", choices=["mlp", "ldp"], default="mlp")
 ap.add_argument("--image-mean", nargs=3, type=float, required=False, help="Override image mean values")
 ap.add_argument("--image-std", nargs=3, type=float, required=False, help="Override image std values")
 ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
@@ -175,8 +174,6 @@ elif args.vision_only and not has_llava_projector:
    fout.add_description("vision-only CLIP model")
 elif has_llava_projector:
    fout.add_description("image encoder for LLaVA")
-    # add projector type
-    fout.add_string("clip.projector_type", args.projector_type)
 else:
    fout.add_description("two-tower CLIP model")

@@ -221,8 +218,7 @@ if has_llava_projector:
    projector = torch.load(args.llava_projector)
    for name, data in projector.items():
        name = get_tensor_name(name)
-        # pw and dw conv ndim==4
-        if data.ndim == 2 or data.ndim == 4:
+        if data.ndim == 2:
            data = data.squeeze().numpy().astype(np.float16)
        else:
            data = data.squeeze().numpy().astype(np.float32)
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -112,43 +112,6 @@ static results_log_softmax log_softmax(int n_vocab, const float * logits, int to
    return {logits[tok] - max_logit - log(sum_exp), logits[tok], expf(logits[tok] - max_logit) / (float) sum_exp};
 }

-static inline int nearest_int(float fval) {
-    //assert(fval <= 4194303.f);
-    float val = fval + 12582912.f;
-    int i; memcpy(&i, &val, sizeof(int));
-    return (i & 0x007fffff) - 0x00400000;
-}
-
-static double log_softmax(int n_vocab, const float * logits, uint16_t * log_prob, int tok) {
-    float max_logit = logits[0];
-    float min_logit = logits[0];
-    for (int i = 1; i < n_vocab; ++i) {
-        max_logit = std::max(max_logit, logits[i]);
-        min_logit = std::min(min_logit, logits[i]);
-    }
-    min_logit = std::max(min_logit, max_logit - 16);
-    double sum_exp = 0.0;
-    for (int i = 0; i < n_vocab; ++i) {
-        sum_exp += expf(logits[i] - max_logit);
-    }
-    const float log_sum_exp = log(sum_exp);
-    const float min_log_prob = min_logit - max_logit - log_sum_exp;
-    const float scale = (max_logit - min_logit)/65535.f;
-    float * d = (float *)log_prob;
-    d[0] = scale;
-    d[1] = min_log_prob;
-    log_prob += 4;
-    if (scale) {
-        const float inv_scale = 1/scale;
-        for (int i = 0; i < n_vocab; ++i) {
-            log_prob[i] = logits[i] > min_logit ? nearest_int(inv_scale*(logits[i] - min_logit)) : 0;
-        }
-    } else {
-        std::memset(log_prob, 0, n_vocab*sizeof(uint16_t));
-    }
-    return max_logit + log_sum_exp - logits[tok];
-}
-
 static void process_logits(
    int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread> & workers,
    double & nll, double & nll2, float * logit_history, float * prob_history
@@ -184,130 +147,6 @@ static void process_logits(
    }
 }

-static void process_logits(std::ostream& out, int n_vocab, const float * logits, const int * tokens, int n_token,
-        std::vector<std::thread> & workers, std::vector<uint16_t> & log_probs, double & nll, double & nll2) {
-    std::mutex mutex;
-    const int nv = 2*((n_vocab + 1)/2) + 4;
-    int counter = 0;
-    auto compute = [&mutex, &counter, &log_probs, &nll, &nll2, n_vocab, logits, tokens, n_token, nv] () {
-        double local_nll  = 0;
-        double local_nll2 = 0;
-        while (true) {
-            std::unique_lock<std::mutex> lock(mutex);
-            int i = counter++;
-            if (i >= n_token) {
-                nll += local_nll; nll2 += local_nll2;
-                break;
-            }
-            lock.unlock();
-            const double v = log_softmax(n_vocab, logits + i*n_vocab, log_probs.data() + i*nv, tokens[i+1]);
-            local_nll += v;
-            local_nll2 += v*v;
-        }
-    };
-    for (auto & w : workers) {
-        w = std::thread(compute);
-    }
-    compute();
-    for (auto & w : workers) {
-        w.join();
-    }
-    out.write((const char *)log_probs.data(), n_token*nv*sizeof(uint16_t));
-}
-
-struct kl_divergence_result {
-    double sum_nll  = 0;
-    double sum_nll2 = 0;
-    double sum_kld  = 0;
-    double sum_kld2 = 0;
-    double sum_nll_diff  = 0;
-    double sum_nll_diff2 = 0;
-    size_t n_same_top = 0;
-    size_t count = 0;
-};
-
-static double log_softmax(int n_vocab, const float * logits, const uint16_t * base_log_prob, int tok, kl_divergence_result & kld) {
-    float max_logit = logits[0];
-    int imax = 0;
-    for (int i = 1; i < n_vocab; ++i) {
-        if (logits[i] > max_logit) {
-            max_logit = logits[i];
-            imax = i;
-        }
-    }
-    double sum_exp = 0.0;
-    for (int i = 0; i < n_vocab; ++i) {
-        sum_exp += expf(logits[i] - max_logit);
-    }
-    const float log_sum_exp = log(sum_exp);
-    const float * d = (const float *)base_log_prob;
-    const float scale = d[0];
-    const float min_log_prob = d[1];
-    base_log_prob += 4;
-    float nll = max_logit + log_sum_exp - logits[tok];
-    kld.sum_nll  += nll;
-    kld.sum_nll2 += nll*nll;
-    nll += (scale*base_log_prob[tok] + min_log_prob);
-    kld.sum_nll_diff  += nll;
-    kld.sum_nll_diff2 += nll*nll;
-    max_logit += log_sum_exp;
-    double sum = 0;
-    int imax_base = -1;
-    float p_log_base_max = 0;
-    for (int i = 0; i < n_vocab; ++i) {
-        const float p_log_base = scale*base_log_prob[i] + min_log_prob;
-        if (i == 0 || p_log_base > p_log_base_max) {
-            p_log_base_max = p_log_base;
-            imax_base = i;
-        }
-        if (p_log_base > -16.f) {
-            const float p_base = expf(p_log_base);
-            sum += p_base * (p_log_base - logits[i] + max_logit);
-        }
-    }
-    kld.sum_kld  += sum;
-    kld.sum_kld2 += sum*sum;
-    ++kld.count;
-    if (imax == imax_base) ++kld.n_same_top;
-    return sum;
-}
-
-static void process_logits(int n_vocab, const float * logits, const int * tokens, int n_token,
-        std::vector<std::thread> & workers, const std::vector<uint16_t> & base_log_probs, kl_divergence_result & kld,
-        float * kld_values) {
-    std::mutex mutex;
-    const int nv = 2*((n_vocab + 1)/2) + 4;
-    int counter = 0;
-    auto compute = [&mutex, &counter, &base_log_probs, &kld, n_vocab, logits, tokens, n_token, nv, kld_values] () {
-        kl_divergence_result local_kld;
-        while (true) {
-            std::unique_lock<std::mutex> lock(mutex);
-            int i = counter++;
-            if (i >= n_token) {
-                kld.sum_nll  += local_kld.sum_nll;
-                kld.sum_nll2 += local_kld.sum_nll2;
-                kld.sum_kld  += local_kld.sum_kld;
-                kld.sum_kld2 += local_kld.sum_kld2;
-                kld.sum_nll_diff  += local_kld.sum_nll_diff;
-                kld.sum_nll_diff2 += local_kld.sum_nll_diff2;
-                kld.n_same_top += local_kld.n_same_top;
-                kld.count += local_kld.count;
-                break;
-            }
-            lock.unlock();
-            double v = log_softmax(n_vocab, logits + i*n_vocab, base_log_probs.data() + i*nv, tokens[i+1], local_kld);
-            kld_values[i] = (float)v;
-        }
-    };
-    for (auto & w : workers) {
-        w = std::thread(compute);
-    }
-    compute();
-    for (auto & w : workers) {
-        w.join();
-    }
-}
-
 static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params) {
    // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
    // Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
@@ -455,18 +294,6 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
    const int n_ctx = llama_n_ctx(ctx);

-    std::ofstream logits_stream;
-    if (!params.logits_file.empty()) {
-        logits_stream.open(params.logits_file.c_str());
-        if (!logits_stream.is_open()) {
-            fprintf(stderr, "%s: failed to open %s for writing\n", __func__, params.logits_file.c_str());
-            return {};
-        }
-        fprintf(stderr, "%s: saving all logits to %s\n", __func__, params.logits_file.c_str());
-        logits_stream.write("_logits_", 8);
-        logits_stream.write((const char *)&n_ctx, sizeof(n_ctx));
-    }
-
    auto tim1 = std::chrono::high_resolution_clock::now();
    fprintf(stderr, "%s: tokenizing the input ..\n", __func__);

@@ -509,15 +336,6 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par

    std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);

-    std::vector<uint16_t> log_probs;
-    if (!params.logits_file.empty()) {
-        logits_stream.write((const char *)&n_vocab, sizeof(n_vocab));
-        logits_stream.write((const char *)&n_chunk, sizeof(n_chunk));
-        logits_stream.write((const char *)tokens.data(), n_chunk*n_ctx*sizeof(tokens[0]));
-        const int nv = 2*((n_vocab + 1)/2) + 4;
-        log_probs.resize(n_ctx * nv);
-    }
-
    for (int i = 0; i < n_chunk; ++i) {
        const int start =     i * n_ctx;
        const int end   = start + n_ctx;
@@ -580,13 +398,8 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
        // process the entire prompt.
        const int first = n_ctx/2;
        const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
-        if (!params.logits_file.empty()) {
-            process_logits(logits_stream, n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
-                    workers, log_probs, nll, nll2);
-        } else {
-            process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
-                    workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
-        }
+        process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
+                       workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
        count += n_ctx - first - 1;

        // perplexity is e^(average negative log-likelihood)
@@ -645,24 +458,23 @@ static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector<
    return true;
 }

-#define K_TOKEN_CHUNK 4
-
 static void compute_logprobs(const float * batch_logits, int n_vocab, std::vector<std::thread>& workers,
        const std::vector<std::pair<size_t, llama_token>>& eval_pairs, std::vector<float>& eval_results) {
+    constexpr int k_token_chunk = 4;
    if (eval_results.size() != eval_pairs.size()) {
        eval_results.resize(eval_pairs.size());
    }
    if (eval_pairs.empty()) return;

-    size_t max_threads = std::min((eval_pairs.size() + K_TOKEN_CHUNK - 1)/K_TOKEN_CHUNK, workers.size());
+    size_t max_threads = std::min((eval_pairs.size() + k_token_chunk - 1)/k_token_chunk, workers.size());

    std::atomic<int> counter(0);
    auto compute = [&counter, &eval_pairs, &eval_results, batch_logits, n_vocab] () {
-        float local_logprobs[K_TOKEN_CHUNK];
+        float local_logprobs[k_token_chunk];
        while (true) {
-            size_t first = counter.fetch_add(K_TOKEN_CHUNK, std::memory_order_relaxed);
+            size_t first = counter.fetch_add(k_token_chunk, std::memory_order_relaxed);
            if (first >= eval_results.size()) break;
-            size_t last = std::min(first + K_TOKEN_CHUNK, eval_results.size());
+            size_t last = std::min(first + k_token_chunk, eval_results.size());
            for (size_t i = first; i < last; ++i) {
                auto logits = batch_logits + eval_pairs[i].first * n_vocab;
                float max_logit = logits[0];
@@ -685,6 +497,7 @@ static void compute_logprobs(const float * batch_logits, int n_vocab, std::vecto
    for (size_t it = 0; it < max_threads; ++it) {
        workers[it].join();
    }
+
 }

 static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
@@ -727,14 +540,14 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
    // This is needed as usual for LLaMA models
    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));

-    // The tasks should be randomized so the score stabilizes quickly.
-    bool randomize_tasks = true;
-
    // Number of tasks to use when computing the score
    if (params.hellaswag_tasks < hs_task_count) {
        hs_task_count = params.hellaswag_tasks;
    }

+    // The tasks should be randomized so the score stabilizes quickly.
+    bool randomize_tasks = true;
+
    // The random seed should not impact the final result if the computation is done over enough tasks, so kept hardcoded for now
    std::mt19937 rng(1);

@@ -1218,566 +1031,6 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
    printf("Final Winogrande score(%d tasks): %.4lf +/- %.4lf\n", n_done, 100*p, sigma);
 }

-static bool deserialize_string(std::istream & in, std::string & str) {
-    uint32_t size;
-    if (!in.read((char *)&size, sizeof(size)).fail()) {
-        str.resize(size);
-        if (!in.read((char *)&str[0], size).fail()) return true;
-    }
-    return false;
-}
-
-struct multiple_choice_answers {
-    std::vector<std::string> answers;
-    std::vector<int>         labels;
-    bool deserialize(std::istream& in) {
-        uint32_t n;
-        in.read((char *)&n, sizeof(n));
-        if (in.fail() || n > 100) return false; // 100 as max. number of answers should be good enough for any practical purpose
-        answers.resize(n);
-        labels.resize(n);
-        for (auto& a : answers) {
-            if (!deserialize_string(in, a)) return false;
-        }
-        in.read((char *)labels.data(), n*sizeof(int));
-        return !in.fail();
-    }
-};
-
-struct multiple_choice_task {
-    std::string question;         // the question (or context that needs to be continued)
-    multiple_choice_answers mc1;  // possible answers (continuations) with a single correct answer
-    multiple_choice_answers mc2;  // possible answers (continuations) with multiple correct answers - not handled yet
-    bool deserialize(std::istream& in) {
-        if (!deserialize_string(in, question)) return false;
-        return mc1.deserialize(in) && mc2.deserialize(in);
-    }
-
-    // For evaluation
-    size_t i_batch;         // starting index in the llama_batch
-    size_t common_prefix;   // max number of initial tokens that are the same in all sentences
-    size_t required_tokens; // needed number of tokens to evaluate all answers
-    std::vector<std::vector<llama_token>> seq_tokens;
-    std::vector<float> log_probs;
-};
-
-static bool multiple_choice_prepare_one_task(llama_context * ctx, bool add_bos, multiple_choice_task& task, bool log_error) {
-    if (task.question.empty() || task.mc1.answers.empty()) {
-        if (log_error) {
-            printf("%s: found bad task with empty question and/or answers\n", __func__);
-        }
-        return false;
-    }
-    task.seq_tokens.reserve(task.mc1.answers.size());
-    for (auto& answer : task.mc1.answers) {
-        if (answer.empty()) {
-            if (log_error) {
-                printf("%s: found empty answer\n", __func__);
-            }
-            return false;
-        }
-        task.seq_tokens.emplace_back(::llama_tokenize(ctx, task.question + " " + answer, add_bos));
-    }
-    auto min_len = task.seq_tokens.front().size();
-    for (auto& seq : task.seq_tokens) {
-        min_len = std::min(min_len, seq.size());
-    }
-    task.common_prefix = 0;
-    for (size_t k = 0; k < min_len; ++k) {
-        auto token = task.seq_tokens[0][k];
-        bool all_same = true;
-        for (size_t i = 1; i < task.seq_tokens.size(); ++i) {
-            if (task.seq_tokens[i][k] != token) {
-                all_same = false;
-                break;
-            }
-        }
-        if (!all_same) {
-            break;
-        }
-        ++task.common_prefix;
-    }
-    task.required_tokens = task.common_prefix;
-    for (auto& seq : task.seq_tokens) {
-        task.required_tokens += seq.size() - task.common_prefix;
-    }
-    return true;
-}
-
-//
-// Calculates score for multiple choice tasks with single correct answer from prompt.
-// Commonly used LLM evaluation metrics of this type are
-//   * ARC
-//   * HellaSwag
-//   * MMLU
-//   * TruthfulQA
-//
-// Validation datasets for these 4 tests can be found at
-//     https://huggingface.co/datasets/ikawrakow/validation-datasets-for-llama.cpp
-// The data for these datasets was extracted from
-//     git@hf.co:datasets/allenai/ai2_arc
-//     https://github.com/rowanz/hellaswag/blob/master/data/hellaswag_val.jsonl
-//     git@hf.co:datasets/Stevross/mmlu
-//     https://huggingface.co/datasets/truthful_qa
-//
-static void multiple_choice_score(llama_context * ctx, const gpt_params & params) {
-
-    std::istringstream strstream(params.prompt);
-    uint32_t n_task;
-    strstream.read((char *)&n_task, sizeof(n_task));
-    if (strstream.fail() || n_task == 0) {
-        printf("%s: no tasks\n", __func__);
-        return;
-    }
-    printf("%s: there are %u tasks in prompt\n", __func__, n_task);
-    std::vector<uint32_t> task_pos(n_task);
-    strstream.read((char *)task_pos.data(), task_pos.size()*sizeof(uint32_t));
-    if (strstream.fail()) {
-        printf("%s: failed to raad task positions from prompt\n", __func__);
-        return;
-    }
-
-    std::vector<multiple_choice_task> tasks;
-    if (params.multiple_choice_tasks == 0 || params.multiple_choice_tasks >= (size_t)n_task) {
-        // Use all tasks
-        tasks.resize(n_task);
-        printf("%s: reading tasks", __func__);
-        int n_dot = n_task/100;
-        int i = 0;
-        for (auto& task : tasks) {
-            ++i;
-            if (!task.deserialize(strstream)) {
-                printf("%s: failed to read task %d of %u\n", __func__, i, n_task);
-                return;
-            }
-            if (i%n_dot == 0) printf(".");
-        }
-        printf("done\n");
-    }
-    else {
-        printf("%s: selecting %zu random tasks from %u tasks available\n", __func__, params.multiple_choice_tasks, n_task);
-        std::mt19937 rng(1);
-        std::vector<int> aux(n_task);
-        for (uint32_t i = 0; i < n_task; ++i) aux[i] = i;
-        float scale = 1.f/(1.f + (float)std::mt19937::max());
-        tasks.resize(params.multiple_choice_tasks);
-        for (auto& task : tasks) {
-            int j = (int)(scale * rng() * aux.size());
-            int idx = aux[j];
-            aux[j] = aux.back();
-            aux.pop_back();
-            strstream.seekg(task_pos[idx], std::ios::beg);
-            if (!task.deserialize(strstream)) {
-                printf("%s: failed to read task %d at position %u\n", __func__, idx, task_pos[idx]);
-                return;
-            }
-        }
-        n_task = params.multiple_choice_tasks;
-    }
-
-    // This is needed as usual for LLaMA models
-    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
-
-    printf("%s: preparing task data", __func__);
-    fflush(stdout);
-    if (n_task > 500) {
-        printf("...");
-        fflush(stdout);
-        std::atomic<int> counter(0);
-        std::atomic<int> n_bad(0);
-        auto prepare = [&counter, &n_bad, &tasks, ctx, add_bos] () {
-            int num_tasks = tasks.size();
-            int n_bad_local = 0;
-            while (true) {
-                int first = counter.fetch_add(K_TOKEN_CHUNK);
-                if (first >= num_tasks) {
-                    if (n_bad_local > 0) n_bad += n_bad_local;
-                    break;
-                }
-                int last = std::min(first + K_TOKEN_CHUNK, num_tasks);
-                for (int i = first; i < last; ++i) {
-                    if (!multiple_choice_prepare_one_task(ctx, add_bos, tasks[i], false)) ++n_bad_local;
-                }
-            }
-        };
-        size_t max_thread = std::thread::hardware_concurrency();
-        max_thread = std::min(max_thread, (tasks.size() + K_TOKEN_CHUNK - 1)/K_TOKEN_CHUNK);
-        std::vector<std::thread> workers(max_thread-1);
-        for (auto& w : workers) w = std::thread(prepare);
-        prepare();
-        for (auto& w : workers) w.join();
-        printf("done\n");
-        fflush(stdout);
-        int nbad = n_bad;
-        if (nbad > 0) {
-            printf("%s: found %d malformed tasks\n", __func__, nbad);
-            return;
-        }
-    } else {
-        int n_dot = n_task/100;
-        int i_task = 0;
-        for (auto& task : tasks) {
-            ++i_task;
-            if (!multiple_choice_prepare_one_task(ctx, add_bos, task, true)) {
-                return;
-            }
-            if (i_task%n_dot == 0) {
-                printf(".");
-                fflush(stdout);
-            }
-        }
-        printf("done\n");
-    }
-
-    printf("%s : calculating TruthfulQA score over %zu tasks.\n", __func__, tasks.size());
-
-    printf("\ntask\tacc_norm\n");
-
-    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
-    const int n_ctx   = llama_n_ctx(ctx);
-    const int n_batch = params.n_batch;
-
-    const int max_tasks_per_batch = 32;
-    const int max_seq = 4*max_tasks_per_batch;
-
-    llama_batch batch = llama_batch_init(n_ctx, 0, max_seq);
-
-    std::vector<float> tok_logits(n_vocab);
-    std::vector<float> batch_logits(n_vocab*n_ctx);
-
-    std::vector<std::pair<size_t, llama_token>> eval_pairs;
-    std::vector<float> eval_results;
-    std::vector<std::thread> workers(std::thread::hardware_concurrency());
-    std::vector<int> batch_indeces;
-
-    int n_done = 0;
-    int n_correct = 0;
-    int n_tot_answers = 0;
-
-    for (size_t i0 = 0; i0 < tasks.size(); i0++) {
-        int n_cur = 0;
-
-        size_t i1 = i0;
-        size_t i_batch = 0; // this tells us where in `llama_batch` we are currently
-
-        llama_batch_clear(batch);
-
-        // batch as much tasks as possible into the available context
-        // each task has 4 unique seuqnce ids - one for each ending
-        // the common prefix is shared among the 4 sequences to save tokens
-        // we extract logits only from the last common token and from all ending tokens of each sequence
-        int s0 = 0;
-        while (n_cur + (int) tasks[i1].required_tokens <= n_ctx) {
-            auto& cur_task = tasks[i1];
-
-            int num_answers = cur_task.seq_tokens.size();
-            if (s0 + num_answers > max_seq) {
-                break;
-            }
-
-            if (int(batch_indeces.size()) != num_answers) {
-                batch_indeces.resize(num_answers);
-            }
-            for (int s = 0; s < num_answers; ++s) batch_indeces[s] = s0 + s;
-
-            for (size_t i = 0; i < cur_task.common_prefix; ++i) {
-                //llama_batch_add(batch, cur_task.seq_tokens[0][i], i, { s0 + 0, s0 + 1, s0 + 2, s0 + 3}, false);
-                llama_batch_add(batch, cur_task.seq_tokens[0][i], i, batch_indeces, false);
-            }
-            batch.logits[batch.n_tokens - 1] = true; // we need logits for the last token of the common prefix
-
-            for (int s = 0; s < int(cur_task.seq_tokens.size()); ++s) {
-                for (size_t i = cur_task.common_prefix; i < cur_task.seq_tokens[s].size(); ++i) {
-                    llama_batch_add(batch, cur_task.seq_tokens[s][i], i, { s0 + s }, true);
-                }
-            }
-
-            s0 += num_answers;
-
-            cur_task.i_batch = i_batch;
-            i_batch += cur_task.required_tokens;
-
-            n_cur += cur_task.required_tokens;
-            if (++i1 == tasks.size()) {
-                break;
-            }
-        }
-
-        if (i0 == i1) {
-            fprintf(stderr, "%s : task %zu does not fit in the context window\n", __func__, i0);
-            return;
-        }
-
-        llama_kv_cache_clear(ctx);
-
-        // decode all tasks [i0, i1)
-        if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
-            fprintf(stderr, "%s: llama_decode() failed\n", __func__);
-            return;
-        }
-
-        // Compute log-probs in parallel
-        // First we collect all tasks
-        eval_pairs.clear();
-        for (size_t i = i0; i < i1; ++i) {
-            auto& cur_task = tasks[i];
-            size_t li = cur_task.common_prefix;
-            for (int s = 0; s < int(cur_task.seq_tokens.size()); ++s) {
-                for (size_t j = cur_task.common_prefix; j < cur_task.seq_tokens[s].size() - 1; j++) {
-                    eval_pairs.push_back(std::make_pair(cur_task.i_batch + li++, cur_task.seq_tokens[s][j + 1]));
-                }
-                ++li;
-            }
-        }
-        // Then we do the actual calculation
-        compute_logprobs(batch_logits.data(), n_vocab, workers, eval_pairs, eval_results);
-
-        size_t ir = 0;
-
-        // compute the logprobs for each ending of the decoded tasks
-        for (size_t i = i0; i < i1; ++i) {
-            auto & cur_task = tasks[i];
-            //printf("==== Evaluating <%s> with correct answer ", cur_task.question.c_str());
-            //for (int j = 0; j < int(cur_task.mc1.labels.size()); ++j) {
-            //    if (cur_task.mc1.labels[j] == 1) {
-            //        printf("%d", j+1);
-            //    }
-            //}
-            //printf("\n    common_prefix: %zu\n", cur_task.common_prefix);
-
-            std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*(cur_task.i_batch + cur_task.common_prefix - 1), n_vocab*sizeof(float));
-
-            const auto first_probs = softmax(tok_logits);
-
-            cur_task.log_probs.resize(cur_task.seq_tokens.size());
-            for (int s = 0; s < int(cur_task.seq_tokens.size()); ++s) {
-                size_t count = 1;
-                float  log_prob  = std::log(first_probs[cur_task.seq_tokens[s][cur_task.common_prefix]]);
-                for (size_t j = cur_task.common_prefix; j < cur_task.seq_tokens[s].size() - 1; j++) {
-                    //printf("        %zu  %g\n", ir, eval_results[ir]);
-                    ++count;
-                    log_prob += eval_results[ir++];
-                }
-                cur_task.log_probs[s] = log_prob / count;
-                //printf("        Final: %g\n", log_prob / count);
-                //printf("    <%s> : %g\n", cur_task.mc1.answers[s].c_str(), log_prob/count);
-            }
-
-            // Find the ending with maximum logprob
-            size_t logprob_max_idx = 0;
-            float  logprob_max_val = cur_task.log_probs[0];
-            for (size_t s = 1; s < cur_task.log_probs.size(); s++) {
-                if (cur_task.log_probs[s] > logprob_max_val) {
-                    logprob_max_val = cur_task.log_probs[s];
-                    logprob_max_idx = s;
-                }
-            }
-
-            n_tot_answers += cur_task.log_probs.size();
-            if (cur_task.mc1.labels[logprob_max_idx] == 1) {
-                ++n_correct;
-            }
-            ++n_done;
-
-            // Print the accumulated accuracy mean x 100
-            printf("%d\t%.8lf\n", n_done, 100.*n_correct/n_done);
-            fflush(stdout);
-        }
-
-        i0 = i1 - 1;
-    }
-
-    llama_batch_free(batch);
-
-    if (n_done < 100) return;
-
-    float p = 1.f*n_correct/n_done;
-    float sigma = sqrt(p*(1-p)/(n_done-1));
-    printf("\n Final result: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma);
-    p = 1.f*n_done/n_tot_answers;
-    sigma = sqrt(p*(1-p)/(n_done-1));
-    printf("Random chance: %.4f +/- %.4f\n", 100.f*p, 100.f*sigma);
-
-    printf("\n");
-}
-
-static void kl_divergence(llama_context * ctx, const gpt_params & params) {
-    if (params.logits_file.empty()) {
-        fprintf(stderr, "%s: you must provide a name of a file containing the log probabilities of the base model\n", __func__);
-        return;
-    }
-    std::ifstream in(params.logits_file.c_str(), std::ios::binary);
-    if (!in) {
-        fprintf(stderr, "%s: failed to open %s\n", __func__, params.logits_file.c_str());
-        return;
-    }
-    {
-        char check[9]; check[8] = 0;
-        in.read(check, 8);
-        if (in.fail() || strncmp("_logits_", check, 8) != 0) {
-            fprintf(stderr, "%s: %s does not look like a file containing log-probabilities\n", __func__, params.logits_file.c_str());
-            return;
-        }
-    }
-
-    uint32_t n_ctx;
-    in.read((char *)&n_ctx, sizeof(n_ctx));
-    if (n_ctx > llama_n_ctx(ctx)) {
-        fprintf(stderr, "%s: %s has been computed with %d, while the current context is %d. Increase it with -c and retry\n",
-                __func__, params.logits_file.c_str(), n_ctx, params.n_ctx);
-    }
-
-    int n_vocab, n_chunk;
-    in.read((char *)&n_vocab, sizeof(n_vocab));
-    in.read((char *)&n_chunk, sizeof(n_chunk));
-    if (in.fail()) {
-        fprintf(stderr, "%s: failed reading n_vocab, n_chunk from %s\n", __func__, params.logits_file.c_str());
-        return;
-    }
-    if (n_vocab != llama_n_vocab(llama_get_model(ctx))) {
-        fprintf(stderr, "%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_n_vocab(llama_get_model(ctx)));
-    }
-
-    std::vector<llama_token> tokens(n_ctx * n_chunk);
-    if (in.read((char *)tokens.data(), tokens.size()*sizeof(tokens[0])).fail()) {
-        fprintf(stderr, "%s: failed reading evaluation tokens from %s\n", __func__, params.logits_file.c_str());
-        return;
-    }
-
-    const int n_batch = params.n_batch;
-    const int num_batches = (n_ctx + n_batch - 1)/n_batch;
-    const int nv = 2*((n_vocab + 1)/2) + 4;
-    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
-
-    std::vector<uint16_t> log_probs_uint16(size_t(n_ctx - 1 - n_ctx/2) * nv);
-    std::vector<float> kld_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk);
-    std::vector<float> logits;
-    if (num_batches > 1) {
-        logits.reserve(n_ctx * n_vocab);
-    }
-
-    std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
-
-    auto mean_and_uncertainty = [] (double sum, double sum2, size_t count) {
-        if (count < 1) {
-            return std::make_pair(0., 0.);
-        }
-        double f = sum/count;
-        double df = sum2/count - f*f;
-        df = df > 0 && count > 10 ? sqrt(df/(count-1)) : 0.;
-        return std::make_pair(f, df);
-    };
-
-    kl_divergence_result kld;
-    auto kld_ptr = kld_values.data();
-
-    for (int i = 0; i < n_chunk; ++i) {
-        const int start =     i * n_ctx;
-        const int end   = start + n_ctx;
-
-        const auto t_start = std::chrono::high_resolution_clock::now();
-
-        if (in.read((char *)log_probs_uint16.data(), log_probs_uint16.size()*sizeof(uint16_t)).fail()) {
-            fprintf(stderr, "%s: failed reading log-probs for chunk %d\n", __func__, i);
-            return;
-        }
-
-        // clear the KV cache
-        llama_kv_cache_clear(ctx);
-
-        for (int j = 0; j < num_batches; ++j) {
-            const int batch_start = start + j * n_batch;
-            const int batch_size  = std::min(end - batch_start, n_batch);
-
-            // save original token and restore it after eval
-            const auto token_org = tokens[batch_start];
-
-            // add BOS token for the first batch of each chunk
-            if (add_bos && j == 0) {
-                tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
-            }
-
-            if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
-                fprintf(stderr, "%s : failed to eval\n", __func__);
-                return;
-            }
-
-            // restore the original token in case it was set to BOS
-            tokens[batch_start] = token_org;
-
-            if (num_batches > 1) {
-                const auto * batch_logits = llama_get_logits(ctx);
-                logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
-            }
-        }
-
-        const auto t_end = std::chrono::high_resolution_clock::now();
-
-        if (i == 0) {
-            const float t_total = std::chrono::duration<float>(t_end - t_start).count();
-            fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
-            int total_seconds = (int)(t_total * n_chunk);
-            if (total_seconds >= 60*60) {
-                fprintf(stderr, "%d hours ", total_seconds / (60*60));
-                total_seconds = total_seconds % (60*60);
-            }
-            fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
-
-            printf("\nchunk        PPL          ln(PPL(Q)/PPL(base))          KL-Divergence           Same top\n");
-        }
-
-        const int first = n_ctx/2;
-        const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
-        process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
-                workers, log_probs_uint16, kld, kld_ptr);
-        kld_ptr += n_ctx - 1 - first;
-
-        auto ppl           = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count);
-        auto log_ppl_ratio = mean_and_uncertainty(kld.sum_nll_diff, kld.sum_nll_diff2, kld.count);
-        auto kl_div        = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count);
-        auto p_top = 1.*kld.n_same_top/kld.count;
-        auto d_p_top = sqrt(p_top*(1 - p_top)/(kld.count - 1));
-
-        printf("%4d    %10.4lf    %10.5lf ± %10.5f    %10.5f ± %10.5lf    %.5f ± %.5f\n", i+1, exp(ppl.first),
-                log_ppl_ratio.first, log_ppl_ratio.second, kl_div.first, kl_div.second,
-                p_top, d_p_top);
-
-        fflush(stdout);
-
-        logits.clear();
-    }
-    printf("\n");
-
-    if (kld.count < 100) return; // we do not wish to do statistics on so few values
-
-    std::sort(kld_values.begin(), kld_values.end());
-
-    printf("===== KL-divergence statistics\n");
-    auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count);
-    printf("Average: %10.6f ±%10.6lf\n", kl_div.first, kl_div.second);
-    auto kld_median = kld_values.size()%2 == 0 ? 0.5f*(kld_values[kld_values.size()/2] + kld_values[kld_values.size()/2-1])
-                                               : kld_values[kld_values.size()/2];
-    printf("Median : %10.6f\n", kld_median);
-
-    auto percentile = [&kld_values] (float fraction) {
-        if (fraction <= 0) return kld_values.front();
-        if (fraction >= 1) return kld_values.back();
-        float p = fraction*(kld_values.size() - 1);
-        size_t ip = size_t(p); p -= ip;
-        return (1 - p)*kld_values[ip] + p*kld_values[std::min(ip+1, kld_values.size()-1)];
-    };
-
-    printf("Maximum: %10.6f\n", kld_values.back());
-    printf("KLD_99 : %10.6f\n", percentile(0.99f));
-    printf("KLD_95 : %10.6f\n", percentile(0.95f));
-    printf("KLD_90 : %10.6f\n", percentile(0.90f));
-
-    printf("Minimum: %10.6f\n", kld_values.front());
-    printf("KLD_01 : %10.6f\n", percentile(0.01f));
-    printf("KLD_05 : %10.6f\n", percentile(0.05f));
-    printf("KLD_10 : %10.6f\n", percentile(0.10f));
-
-}

 int main(int argc, char ** argv) {
    gpt_params params;
@@ -1838,10 +1091,6 @@ int main(int argc, char ** argv) {
        hellaswag_score(ctx, params);
    } else if (params.winogrande) {
        winogrande_score(ctx, params);
-    } else if (params.multiple_choice) {
-        multiple_choice_score(ctx, params);
-    } else if (params.kl_divergence) {
-        kl_divergence(ctx, params);
    } else {
        results = perplexity(ctx, params);
    }
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -26,7 +26,6 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
    { "Q2_K",   LLAMA_FTYPE_MOSTLY_Q2_K,   " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
    { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", },
    { "Q3_K",   LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
-    { "Q3_K_XS",LLAMA_FTYPE_MOSTLY_Q3_K_XS,"3-bit extra small quantization"   , },
    { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", },
    { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.07G, +0.2496 ppl @ LLaMA-v1-7B", },
    { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 3.35G, +0.1764 ppl @ LLaMA-v1-7B", },
--- a/flake.lock
+++ b/flake.lock
@@ -20,11 +20,11 @@
    },
    "nixpkgs": {
      "locked": {
-        "lastModified": 1705677747,
-        "narHash": "sha256-eyM3okYtMgYDgmYukoUzrmuoY4xl4FUujnsv/P6I/zI=",
+        "lastModified": 1705133751,
+        "narHash": "sha256-rCIsyE80jgiOU78gCWN3A0wE0tR2GI5nH6MlS+HaaSQ=",
        "owner": "NixOS",
        "repo": "nixpkgs",
-        "rev": "bbe7d8f876fbbe7c959c90ba2ae2852220573261",
+        "rev": "9b19f5e77dd906cb52dade0b7bd280339d2a1f3d",
        "type": "github"
      },
      "original": {
--- a/flake.nix
+++ b/flake.nix
@@ -1,17 +1,3 @@
-# The flake interface to llama.cpp's Nix expressions. The flake is used as a
-# more discoverable entry-point, as well as a way to pin the dependencies and
-# expose default outputs, including the outputs built by the CI.
-
-# For more serious applications involving some kind of customization  you may
-# want to consider consuming the overlay, or instantiating `llamaPackages`
-# directly:
-#
-# ```nix
-# pkgs.callPackage ${llama-cpp-root}/.devops/nix/scope.nix { }`
-# ```
-
-# Cf. https://jade.fyi/blog/flakes-arent-real/ for a more detailed exposition
-# of the relation between Nix and the Nix Flakes.
 {
  description = "Port of Facebook's LLaMA model in C/C++";

--- a/ggml-alloc.c
+++ b/ggml-alloc.c
@@ -109,8 +109,8 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
        if (block->size >= size) {
            best_fit_block = alloc->n_free_blocks - 1;
        } else {
-            fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, largest block available %zu)\n",
-                    __func__, tensor->name, size, max_avail);
+            fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
+                    __func__, size, max_avail);
            GGML_ASSERT(!"not enough space in the buffer");
            return;
        }
--- a/ggml-backend.c
+++ b/ggml-backend.c
@@ -1191,24 +1191,6 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
                ggml_tallocr_t src_allocr = node_allocr(src);
                GGML_ASSERT(src_allocr != NULL); // all inputs should be assigned by now
                if (src_allocr != node_allocr) {
-                    // create a copy of the input in the split's backend
-                    size_t id = hash_id(src);
-                    if (sched->node_copies[id][cur_backend_id] == NULL) {
-                        ggml_backend_t backend = get_allocr_backend(sched, cur_allocr);
-                        struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
-                        ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name);
-
-                        sched->node_copies[id][cur_backend_id] = tensor_copy;
-                        node_allocr(tensor_copy) = cur_allocr;
-                        SET_CAUSE(tensor_copy, "4.cpy");
-
-                        int n_inputs = sched->splits[cur_split].n_inputs++;
-                        GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
-                        sched->splits[cur_split].inputs[n_inputs] = src;
-                    }
-                    node->src[j] = sched->node_copies[id][cur_backend_id];
-
-#if 0
                    // check if the input is already in the split
                    bool found = false;
                    for (int k = 0; k < sched->splits[cur_split].n_inputs; k++) {
@@ -1224,7 +1206,19 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
                        GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
                        sched->splits[cur_split].inputs[n_inputs] = src;
                    }
-#endif
+
+                    // create a copy of the input in the split's backend
+                    size_t id = hash_id(src);
+                    if (sched->node_copies[id][cur_backend_id] == NULL) {
+                        ggml_backend_t backend = get_allocr_backend(sched, cur_allocr);
+                        struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
+                        ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name);
+
+                        sched->node_copies[id][cur_backend_id] = tensor_copy;
+                        node_allocr(tensor_copy) = cur_allocr;
+                        SET_CAUSE(tensor_copy, "4.cpy");
+                    }
+                    node->src[j] = sched->node_copies[id][cur_backend_id];
                }
            }
        }
@@ -1339,7 +1333,7 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
        uint64_t compute_start_us = ggml_time_us();
        if (!sched->callback_eval) {
            ggml_backend_graph_compute(split_backend, &split->graph);
-            //ggml_backend_synchronize(split_backend); // necessary to measure compute time
+          //ggml_backend_synchronize(split_backend); // necessary to measure compute time
        } else {
            // similar to ggml_backend_compare_graph_backend
            for (int j0 = 0; j0 < split->graph.n_nodes; j0++) {
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -13,10 +13,6 @@
 #include <map>
 #include <array>

-// stringize macro for converting __CUDA_ARCH_LIST__ (list of integers) to string
-#define STRINGIZE_IMPL(...) #__VA_ARGS__
-#define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)
-
 #if defined(GGML_USE_HIPBLAS)
 #include <hip/hip_runtime.h>
 #include <hipblas/hipblas.h>
@@ -588,28 +584,13 @@ static cuda_device_capabilities g_device_caps[GGML_CUDA_MAX_DEVICES] = { {0, 0,
 static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};

 [[noreturn]]
-static __device__ void no_device_code(
-    const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {
-
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-    printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n",
-           file_name, line, function_name, arch);
-    (void) arch_list;
-#else
-    printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n",
-           file_name, line, function_name, arch, arch_list);
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+static __device__ void bad_arch() {
+    printf("ERROR: ggml-cuda was compiled without support for the current GPU architecture.\n");
    __trap();

-    (void) no_device_code; // suppress unused function warning
+    (void) bad_arch; // suppress unused function warning
 }

-#ifdef __CUDA_ARCH__
-#define NO_DEVICE_CODE no_device_code(__FILE__, __LINE__, __FUNCTION__, __CUDA_ARCH__, STRINGIZE(__CUDA_ARCH_LIST__))
-#else
-#define NO_DEVICE_CODE GGML_ASSERT(false && "NO_DEVICE_CODE not valid in host code.")
-#endif // __CUDA_ARCH__
-
 static __device__ __forceinline__ float warp_reduce_sum(float x) {
 #pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
@@ -636,7 +617,7 @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
    return a;
 #else
    (void) a;
-    NO_DEVICE_CODE;
+    bad_arch();
 #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
 }

@@ -657,7 +638,7 @@ static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
    return x;
 #else
    (void) x;
-    NO_DEVICE_CODE;
+    bad_arch();
 #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
 }

@@ -2440,7 +2421,7 @@ static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, h
    }
 #else
    (void) vx; (void) y; (void) k;
-    NO_DEVICE_CODE;
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_PASCAL
 }

@@ -2471,7 +2452,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_imp
    // second part effectively subtracts 8 from each quant value
    return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
 #else
-    NO_DEVICE_CODE;
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

@@ -2508,7 +2489,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_imp
    // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
    return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
 #else
-    NO_DEVICE_CODE;
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

@@ -2543,7 +2524,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_imp
    // second part effectively subtracts 16 from each quant value
    return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y);
 #else
-    NO_DEVICE_CODE;
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

@@ -2588,7 +2569,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp
    return sumi*d5d8 + m5s8 / (QI5_1 / vdr);

 #else
-    NO_DEVICE_CODE;
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

@@ -2609,7 +2590,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_q8_1_imp

    return d8_0*d8_1 * sumi;
 #else
-    NO_DEVICE_CODE;
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

@@ -2639,7 +2620,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp
    // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
    return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
 #else
-    NO_DEVICE_CODE;
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

@@ -2674,7 +2655,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq(

    return dm2f.x*sumf_d - dm2f.y*sumf_m;
 #else
-    NO_DEVICE_CODE;
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

@@ -2711,7 +2692,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(

    return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m);
 #else
-    NO_DEVICE_CODE;
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

@@ -2751,7 +2732,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq(

    return d3 * sumf;
 #else
-    NO_DEVICE_CODE;
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

@@ -2776,7 +2757,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq(

    return d3*d8 * sumi;
 #else
-    NO_DEVICE_CODE;
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

@@ -2809,7 +2790,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
    return dm4f.x*sumf_d - dm4f.y*sumf_m;

 #else
-    NO_DEVICE_CODE;
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

@@ -2842,7 +2823,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
    return dm4f.x*sumf_d - dm4f.y*sumf_m;

 #else
-    NO_DEVICE_CODE;
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

@@ -2882,7 +2863,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq(
    return dm5f.x*sumf_d - dm5f.y*sumf_m;

 #else
-    NO_DEVICE_CODE;
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

@@ -2915,7 +2896,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq(
    return dm4f.x*sumf_d - dm4f.y*sumf_m;

 #else
-    NO_DEVICE_CODE;
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

@@ -2945,7 +2926,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq(

    return d*sumf;
 #else
-    NO_DEVICE_CODE;
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

@@ -2976,7 +2957,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
    return d6 * sumf_d;

 #else
-    NO_DEVICE_CODE;
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }

@@ -3842,7 +3823,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
    return dall * sumf_d - dmin * sumf_m;

 #else
-    NO_DEVICE_CODE;
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A

 #endif
@@ -4025,7 +4006,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
    return d * sumf_d;

 #else
-    NO_DEVICE_CODE;
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A

 #endif
@@ -4520,7 +4501,7 @@ template <bool need_check> static __global__ void
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
    (void) vec_dot_q4_0_q8_1_mul_mat;
-    NO_DEVICE_CODE;
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }

@@ -4589,7 +4570,7 @@ template <bool need_check> static __global__ void
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
    (void) vec_dot_q4_1_q8_1_mul_mat;
-    NO_DEVICE_CODE;
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }

@@ -4656,7 +4637,7 @@ template <bool need_check> static __global__ void
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
    (void) vec_dot_q5_0_q8_1_mul_mat;
-    NO_DEVICE_CODE;
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }

@@ -4723,7 +4704,7 @@ mul_mat_q5_1(
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
    (void) vec_dot_q5_1_q8_1_mul_mat;
-    NO_DEVICE_CODE;
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }

@@ -4790,7 +4771,7 @@ template <bool need_check> static __global__ void
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
    (void) vec_dot_q8_0_q8_1_mul_mat;
-    NO_DEVICE_CODE;
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }

@@ -4857,7 +4838,7 @@ mul_mat_q2_K(
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
    (void) vec_dot_q2_K_q8_1_mul_mat;
-    NO_DEVICE_CODE;
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }

@@ -4926,7 +4907,7 @@ template <bool need_check> static __global__ void
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
    (void) vec_dot_q3_K_q8_1_mul_mat;
-    NO_DEVICE_CODE;
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }

@@ -4995,7 +4976,7 @@ template <bool need_check> static __global__ void
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
    (void) vec_dot_q4_K_q8_1_mul_mat;
-    NO_DEVICE_CODE;
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }

@@ -5062,7 +5043,7 @@ mul_mat_q5_K(
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
    (void) vec_dot_q5_K_q8_1_mul_mat;
-    NO_DEVICE_CODE;
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }

@@ -5131,7 +5112,7 @@ template <bool need_check> static __global__ void
        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
    (void) vec_dot_q6_K_q8_1_mul_mat;
-    NO_DEVICE_CODE;
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }

@@ -5854,7 +5835,7 @@ static __global__ void soft_max_f16(const float * x, const float * y, float * ds
    }
 #else
    (void) x; (void) y; (void) dst; (void) ncols_par; (void) nrows_y; (void) scale;
-    NO_DEVICE_CODE;
+    bad_arch();
 #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
 }

--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -147,6 +147,9 @@ enum ggml_metal_kernel_type {
    GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,
    GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC,
    GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H64,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H80,
+    GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128,
    GGML_METAL_KERNEL_TYPE_CPY_F32_F16,
    GGML_METAL_KERNEL_TYPE_CPY_F32_F32,
    GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0,
@@ -277,6 +280,10 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
            NSURL * libURL = [NSURL fileURLWithPath:libPath];
            GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [libPath UTF8String]);
            ctx->library = [ctx->device newLibraryWithURL:libURL error:&error];
+            if (error) {
+                GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
+                return NULL;
+            }
        } else {
            GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);

@@ -315,13 +322,12 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
                //[options setFastMathEnabled:false];

                ctx->library = [ctx->device newLibraryWithSource:src options:options error:&error];
+                if (error) {
+                    GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
+                    return NULL;
+                }
            }
        }
-
-        if (error) {
-            GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
-            return NULL;
-        }
    }

    // print MTL GPU family:
@@ -395,6 +401,9 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
            struct ggml_metal_kernel * kernel = &ctx->kernels[e]; \
            kernel->function = [ctx->library newFunctionWithName:@"kernel_"#name]; \
            kernel->pipeline = [ctx->device newComputePipelineStateWithFunction:kernel->function error:&error]; \
+            GGML_METAL_LOG_INFO("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) kernel->pipeline, \
+                    (int) kernel->pipeline.maxTotalThreadsPerThreadgroup, \
+                    (int) kernel->pipeline.threadExecutionWidth); \
            if (error) { \
                GGML_METAL_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
                return NULL; \
@@ -405,124 +414,127 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {

        // simd_sum and simd_max requires MTLGPUFamilyApple7

-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD,                       add,                    true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_ROW,                   add_row,                true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL,                       mul,                    true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_ROW,                   mul_row,                true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIV,                       div,                    true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIV_ROW,                   div_row,                true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SCALE,                     scale,                  true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SCALE_4,                   scale_4,                true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TANH,                      tanh,                   true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RELU,                      relu,                   true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU,                      gelu,                   true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_QUICK,                gelu_quick,             true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SILU,                      silu,                   true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX,                  soft_max,               ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_4,                soft_max_4,             ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF,             diag_mask_inf,          true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF_8,           diag_mask_inf_8,        true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_F32,              get_rows_f32,           true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_F16,              get_rows_f16,           true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_0,             get_rows_q4_0,          true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_1,             get_rows_q4_1,          true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_0,             get_rows_q5_0,          true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_1,             get_rows_q5_1,          true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q8_0,             get_rows_q8_0,          true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q2_K,             get_rows_q2_K,          true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q3_K,             get_rows_q3_K,          true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_K,             get_rows_q4_K,          true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_K,             get_rows_q5_K,          true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q6_K,             get_rows_q6_K,          true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS,          get_rows_iq2_xxs,       true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS,           get_rows_iq2_xs,        true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_I32,              get_rows_i32,           true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RMS_NORM,                  rms_norm,               ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GROUP_NORM,                group_norm,             ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_NORM,                      norm,                   true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32,            mul_mv_f32_f32,         ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F16,            mul_mv_f16_f16,         ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32,            mul_mv_f16_f32,         ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_1ROW,       mul_mv_f16_f32_1row,    ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_L4,         mul_mv_f16_f32_l4,      ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_0_F32,           mul_mv_q4_0_f32,        ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_1_F32,           mul_mv_q4_1_f32,        ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_0_F32,           mul_mv_q5_0_f32,        ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_1_F32,           mul_mv_q5_1_f32,        ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q8_0_F32,           mul_mv_q8_0_f32,        ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q2_K_F32,           mul_mv_q2_K_f32,        ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q3_K_F32,           mul_mv_q3_K_f32,        ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_K_F32,           mul_mv_q4_K_f32,        ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_K_F32,           mul_mv_q5_K_f32,        ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q6_K_F32,           mul_mv_q6_K_f32,        ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32,        mul_mv_iq2_xxs_f32,     ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32,         mul_mv_iq2_xs_f32,      ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32,         mul_mv_id_f32_f32,      ctx->support_simdgroup_reduction);
-      //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F16,         mul_mv_id_f16_f16,      ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32,         mul_mv_id_f16_f32,      ctx->support_simdgroup_reduction);
-      //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32_1ROW,    mul_mv_id_f16_f32_1row, ctx->support_simdgroup_reduction);
-      //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32_L4,      mul_mv_id_f16_f32_l4,   ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_0_F32,        mul_mv_id_q4_0_f32,     ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_1_F32,        mul_mv_id_q4_1_f32,     ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_0_F32,        mul_mv_id_q5_0_f32,     ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_1_F32,        mul_mv_id_q5_1_f32,     ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q8_0_F32,        mul_mv_id_q8_0_f32,     ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q2_K_F32,        mul_mv_id_q2_K_f32,     ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q3_K_F32,        mul_mv_id_q3_K_f32,     ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_K_F32,        mul_mv_id_q4_K_f32,     ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_K_F32,        mul_mv_id_q5_K_f32,     ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q6_K_F32,        mul_mv_id_q6_K_f32,     ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XXS_F32,     mul_mv_id_iq2_xxs_f32,  ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32,      mul_mv_id_iq2_xs_f32,   ctx->support_simdgroup_reduction);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32,            mul_mm_f32_f32,         ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32,            mul_mm_f16_f32,         ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32,           mul_mm_q4_0_f32,        ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_1_F32,           mul_mm_q4_1_f32,        ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_0_F32,           mul_mm_q5_0_f32,        ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_1_F32,           mul_mm_q5_1_f32,        ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q8_0_F32,           mul_mm_q8_0_f32,        ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q2_K_F32,           mul_mm_q2_K_f32,        ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q3_K_F32,           mul_mm_q3_K_f32,        ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_K_F32,           mul_mm_q4_K_f32,        ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_K_F32,           mul_mm_q5_K_f32,        ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q6_K_F32,           mul_mm_q6_K_f32,        ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32,        mul_mm_iq2_xxs_f32,     ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32,         mul_mm_iq2_xs_f32,      ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32,         mul_mm_id_f32_f32,      ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32,         mul_mm_id_f16_f32,      ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32,        mul_mm_id_q4_0_f32,     ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F32,        mul_mm_id_q4_1_f32,     ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F32,        mul_mm_id_q5_0_f32,     ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F32,        mul_mm_id_q5_1_f32,     ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F32,        mul_mm_id_q8_0_f32,     ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F32,        mul_mm_id_q2_K_f32,     ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F32,        mul_mm_id_q3_K_f32,     ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F32,        mul_mm_id_q4_K_f32,     ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_K_F32,        mul_mm_id_q5_K_f32,     ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F32,        mul_mm_id_q6_K_f32,     ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32,     mul_mm_id_iq2_xxs_f32,  ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32,      mul_mm_id_iq2_xs_f32,   ctx->support_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_F32,                  rope_f32,               true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_F16,                  rope_f16,               true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ALIBI_F32,                 alibi_f32,              true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F16,                im2col_f16,             true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32,               upscale_f32,            true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32,                   pad_f32,                true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,       argsort_f32_i32_asc,    true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC,      argsort_f32_i32_desc,   true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32,            leaky_relu_f32,         true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F16,               cpy_f32_f16,            true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F32,               cpy_f32_f32,            true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0,              cpy_f32_q8_0,           true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_0,              cpy_f32_q4_0,           true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_1,              cpy_f32_q4_1,           true);
-      //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_0,              cpy_f32_q5_0,           true);
-      //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_1,              cpy_f32_q5_1,           true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F16_F16,               cpy_f16_f16,            true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F16_F32,               cpy_f16_f32,            true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CONCAT,                    concat,                 true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SQR,                       sqr,                    true);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUM_ROWS,                  sum_rows,               true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD,                       add,                     true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_ROW,                   add_row,                 true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL,                       mul,                     true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_ROW,                   mul_row,                 true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIV,                       div,                     true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIV_ROW,                   div_row,                 true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SCALE,                     scale,                   true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SCALE_4,                   scale_4,                 true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TANH,                      tanh,                    true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RELU,                      relu,                    true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU,                      gelu,                    true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_QUICK,                gelu_quick,              true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SILU,                      silu,                    true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX,                  soft_max,                ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_4,                soft_max_4,              ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF,             diag_mask_inf,           true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF_8,           diag_mask_inf_8,         true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_F32,              get_rows_f32,            true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_F16,              get_rows_f16,            true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_0,             get_rows_q4_0,           true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_1,             get_rows_q4_1,           true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_0,             get_rows_q5_0,           true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_1,             get_rows_q5_1,           true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q8_0,             get_rows_q8_0,           true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q2_K,             get_rows_q2_K,           true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q3_K,             get_rows_q3_K,           true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_K,             get_rows_q4_K,           true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_K,             get_rows_q5_K,           true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q6_K,             get_rows_q6_K,           true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS,          get_rows_iq2_xxs,        true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS,           get_rows_iq2_xs,         true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_I32,              get_rows_i32,            true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RMS_NORM,                  rms_norm,                ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GROUP_NORM,                group_norm,              ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_NORM,                      norm,                    true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32,            mul_mv_f32_f32,          ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F16,            mul_mv_f16_f16,          ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32,            mul_mv_f16_f32,          ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_1ROW,       mul_mv_f16_f32_1row,     ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_L4,         mul_mv_f16_f32_l4,       ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_0_F32,           mul_mv_q4_0_f32,         ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_1_F32,           mul_mv_q4_1_f32,         ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_0_F32,           mul_mv_q5_0_f32,         ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_1_F32,           mul_mv_q5_1_f32,         ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q8_0_F32,           mul_mv_q8_0_f32,         ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q2_K_F32,           mul_mv_q2_K_f32,         ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q3_K_F32,           mul_mv_q3_K_f32,         ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_K_F32,           mul_mv_q4_K_f32,         ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_K_F32,           mul_mv_q5_K_f32,         ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q6_K_F32,           mul_mv_q6_K_f32,         ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32,        mul_mv_iq2_xxs_f32,      ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32,         mul_mv_iq2_xs_f32,       ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32,         mul_mv_id_f32_f32,       ctx->support_simdgroup_reduction);
+      //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F16,         mul_mv_id_f16_f16,       ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32,         mul_mv_id_f16_f32,       ctx->support_simdgroup_reduction);
+      //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32_1ROW,    mul_mv_id_f16_f32_1row,  ctx->support_simdgroup_reduction);
+      //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32_L4,      mul_mv_id_f16_f32_l4,    ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_0_F32,        mul_mv_id_q4_0_f32,      ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_1_F32,        mul_mv_id_q4_1_f32,      ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_0_F32,        mul_mv_id_q5_0_f32,      ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_1_F32,        mul_mv_id_q5_1_f32,      ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q8_0_F32,        mul_mv_id_q8_0_f32,      ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q2_K_F32,        mul_mv_id_q2_K_f32,      ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q3_K_F32,        mul_mv_id_q3_K_f32,      ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_K_F32,        mul_mv_id_q4_K_f32,      ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_K_F32,        mul_mv_id_q5_K_f32,      ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q6_K_F32,        mul_mv_id_q6_K_f32,      ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XXS_F32,     mul_mv_id_iq2_xxs_f32,   ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32,      mul_mv_id_iq2_xs_f32,    ctx->support_simdgroup_reduction);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32,            mul_mm_f32_f32,          ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32,            mul_mm_f16_f32,          ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32,           mul_mm_q4_0_f32,         ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_1_F32,           mul_mm_q4_1_f32,         ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_0_F32,           mul_mm_q5_0_f32,         ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_1_F32,           mul_mm_q5_1_f32,         ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q8_0_F32,           mul_mm_q8_0_f32,         ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q2_K_F32,           mul_mm_q2_K_f32,         ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q3_K_F32,           mul_mm_q3_K_f32,         ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_K_F32,           mul_mm_q4_K_f32,         ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_K_F32,           mul_mm_q5_K_f32,         ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q6_K_F32,           mul_mm_q6_K_f32,         ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32,        mul_mm_iq2_xxs_f32,      ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32,         mul_mm_iq2_xs_f32,       ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32,         mul_mm_id_f32_f32,       ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32,         mul_mm_id_f16_f32,       ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32,        mul_mm_id_q4_0_f32,      ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F32,        mul_mm_id_q4_1_f32,      ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F32,        mul_mm_id_q5_0_f32,      ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F32,        mul_mm_id_q5_1_f32,      ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F32,        mul_mm_id_q8_0_f32,      ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F32,        mul_mm_id_q2_K_f32,      ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F32,        mul_mm_id_q3_K_f32,      ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F32,        mul_mm_id_q4_K_f32,      ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_K_F32,        mul_mm_id_q5_K_f32,      ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F32,        mul_mm_id_q6_K_f32,      ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32,     mul_mm_id_iq2_xxs_f32,   ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32,      mul_mm_id_iq2_xs_f32,    ctx->support_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_F32,                  rope_f32,                true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_F16,                  rope_f16,                true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ALIBI_F32,                 alibi_f32,               true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F16,                im2col_f16,              true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32,               upscale_f32,             true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32,                   pad_f32,                 true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC,       argsort_f32_i32_asc,     true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC,      argsort_f32_i32_desc,    true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32,            leaky_relu_f32,          true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H64,    flash_attn_ext_f16_h64,  true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H80,    flash_attn_ext_f16_h80,  true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128,   flash_attn_ext_f16_h128, true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F16,               cpy_f32_f16,             true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F32,               cpy_f32_f32,             true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0,              cpy_f32_q8_0,            true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_0,              cpy_f32_q4_0,            true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_1,              cpy_f32_q4_1,            true);
+      //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_0,              cpy_f32_q5_0,            true);
+      //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_1,              cpy_f32_q5_1,            true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F16_F16,               cpy_f16_f16,             true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F16_F32,               cpy_f16_f32,             true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CONCAT,                    concat,                  true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SQR,                       sqr,                     true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUM_ROWS,                  sum_rows,                true);
    }

    return ctx;
@@ -665,11 +677,11 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
        case GGML_OP_PAD:
        case GGML_OP_ARGSORT:
        case GGML_OP_LEAKY_RELU:
+        case GGML_OP_FLASH_ATTN_EXT:
            return true;
        case GGML_OP_MUL_MAT:
        case GGML_OP_MUL_MAT_ID:
-            return ctx->support_simdgroup_reduction &&
-                (op->src[0]->type != GGML_TYPE_F32 || op->src[1]->type == GGML_TYPE_F32);
+            return ctx->support_simdgroup_reduction;
        case GGML_OP_CPY:
        case GGML_OP_DUP:
        case GGML_OP_CONT:
@@ -2162,6 +2174,97 @@ static bool ggml_metal_graph_compute(

                        [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                    } break;
+                case GGML_OP_FLASH_ATTN_EXT:
+                    {
+                        GGML_ASSERT(ne00 % 4 == 0);
+                        GGML_ASSERT(src0->type == GGML_TYPE_F16);
+
+                        struct ggml_tensor * src2 = gf->nodes[i]->src[2];
+                        struct ggml_tensor * src3 = gf->nodes[i]->src[3];
+
+                        GGML_ASSERT(ggml_are_same_shape(src1, src2));
+                        GGML_ASSERT(src3);
+
+                        size_t offs_src2 = 0;
+                        size_t offs_src3 = 0;
+
+                        GGML_ASSERT(src2);
+                        id<MTLBuffer> id_src2 = ggml_metal_get_buffer(ctx, src2, &offs_src2);
+
+                        id<MTLBuffer> id_src3 = src3 ? ggml_metal_get_buffer(ctx, src3, &offs_src3) : nil;
+
+                        const int64_t  ne30 = src3 ? src3->ne[0] : 0; GGML_UNUSED(ne30);
+                        const int64_t  ne31 = src3 ? src3->ne[1] : 0;
+                        const int64_t  ne32 = src3 ? src3->ne[2] : 0; GGML_UNUSED(ne32);
+                        const int64_t  ne33 = src3 ? src3->ne[3] : 0; GGML_UNUSED(ne33);
+
+                        const uint64_t nb30 = src3 ? src3->nb[0] : 0; GGML_UNUSED(nb30);
+                        const uint64_t nb31 = src3 ? src3->nb[1] : 0;
+                        const uint64_t nb32 = src3 ? src3->nb[2] : 0; GGML_UNUSED(nb32);
+                        const uint64_t nb33 = src3 ? src3->nb[3] : 0; GGML_UNUSED(nb33);
+
+                        const enum ggml_type src2t = src2 ? src2->type : GGML_TYPE_COUNT; GGML_UNUSED(src2t);
+
+                        float scale;
+                        memcpy(&scale, dst->op_params, sizeof(float));
+
+                        id<MTLComputePipelineState> pipeline = nil;
+
+                        switch (ne00) {
+                            case 64:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H64 ].pipeline; break;
+                            case 80:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H80 ].pipeline; break;
+                            case 128: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128].pipeline; break;
+                            default:
+                                {
+                                    GGML_METAL_LOG_ERROR("unsupported size: %lld\n", ne00);
+                                    GGML_METAL_LOG_ERROR("add template specialization for this size\n");
+                                    GGML_ASSERT(false && "add template specialization for this size");
+                                }
+                        }
+
+                        // TODO: extend if necessary
+                        [encoder setComputePipelineState:pipeline];
+                        [encoder setBuffer:id_src0 offset:offs_src0        atIndex:0];
+                        [encoder setBuffer:id_src1 offset:offs_src1        atIndex:1];
+                        [encoder setBuffer:id_src2 offset:offs_src2        atIndex:2];
+                        [encoder setBuffer:id_src3 offset:offs_src3        atIndex:3];
+                        [encoder setBuffer:id_dst  offset:offs_dst         atIndex:4];
+                        [encoder setBytes:&ne00    length:sizeof( int64_t) atIndex:5];
+                        [encoder setBytes:&ne01    length:sizeof( int64_t) atIndex:6];
+                        [encoder setBytes:&ne02    length:sizeof( int64_t) atIndex:7];
+                        [encoder setBytes:&ne03    length:sizeof( int64_t) atIndex:8];
+                        [encoder setBytes:&nb00    length:sizeof(uint64_t) atIndex:9];
+                        [encoder setBytes:&nb01    length:sizeof(uint64_t) atIndex:10];
+                        [encoder setBytes:&nb02    length:sizeof(uint64_t) atIndex:11];
+                        [encoder setBytes:&nb03    length:sizeof(uint64_t) atIndex:12];
+                        [encoder setBytes:&ne10    length:sizeof( int64_t) atIndex:13];
+                        [encoder setBytes:&ne11    length:sizeof( int64_t) atIndex:14];
+                        [encoder setBytes:&ne12    length:sizeof( int64_t) atIndex:15];
+                        [encoder setBytes:&ne13    length:sizeof( int64_t) atIndex:16];
+                        [encoder setBytes:&nb10    length:sizeof(uint64_t) atIndex:17];
+                        [encoder setBytes:&nb11    length:sizeof(uint64_t) atIndex:18];
+                        [encoder setBytes:&nb12    length:sizeof(uint64_t) atIndex:19];
+                        [encoder setBytes:&nb13    length:sizeof(uint64_t) atIndex:20];
+                        [encoder setBytes:&ne31    length:sizeof( int64_t) atIndex:21];
+                        [encoder setBytes:&nb31    length:sizeof(uint64_t) atIndex:22];
+                        [encoder setBytes:&ne0     length:sizeof( int64_t) atIndex:23];
+                        [encoder setBytes:&ne1     length:sizeof( int64_t) atIndex:24];
+                        [encoder setBytes:&ne2     length:sizeof( int64_t) atIndex:25];
+                        [encoder setBytes:&ne3     length:sizeof( int64_t) atIndex:26];
+                        [encoder setBytes:&scale   length:sizeof(   float) atIndex:27];
+
+                        const int64_t nsg   = 8; // simdgroups per threadgroup (a.k.a. warps)
+                        const int64_t nhptg = 4; // heads   per threadgroup !! sync with kernel template arguments !!
+                        const int64_t nqptg = 2; // queries per threadgroup !! sync with kernel template arguments !!
+
+                      //const size_t smem = nqptg*(nhptg*ne00 + nsg*(nhptg*ne00 + 256))*(sizeof(float)/2);
+                        const size_t smem = nqptg*(nhptg*ne00 + nsg*(32))*(sizeof(float)/2);
+
+                        GGML_ASSERT(smem <= ctx->device.maxThreadgroupMemoryLength);
+                        [encoder setThreadgroupMemoryLength:smem atIndex:0];
+
+                        [encoder dispatchThreadgroups:MTLSizeMake((ne01 + nqptg - 1)/nqptg, (ne02 + nhptg - 1)/(nhptg), ne03) threadsPerThreadgroup:MTLSizeMake(32, nsg, 1)];
+                    } break;
                case GGML_OP_DUP:
                case GGML_OP_CPY:
                case GGML_OP_CONT:
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
@@ -1959,6 +1959,348 @@ kernel void kernel_leaky_relu_f32(
    dst[tpig] = src0[tpig] > 0.0f ? src0[tpig] : src0[tpig] * slope;
 }

+typedef void (flash_attn_ext_f16_t)(
+        device const  char * q,
+        device const  char * k,
+        device const  char * v,
+        device const  char * mask,
+        device       float * dst,
+        constant   int64_t & ne00,
+        constant   int64_t & ne01,
+        constant   int64_t & ne02,
+        constant   int64_t & ne03,
+        constant  uint64_t & nb00,
+        constant  uint64_t & nb01,
+        constant  uint64_t & nb02,
+        constant  uint64_t & nb03,
+        constant   int64_t & ne10,
+        constant   int64_t & ne11,
+        constant   int64_t & ne12,
+        constant   int64_t & ne13,
+        constant  uint64_t & nb10,
+        constant  uint64_t & nb11,
+        constant  uint64_t & nb12,
+        constant  uint64_t & nb13,
+        constant   int64_t & ne31,
+        constant  uint64_t & nb31,
+        constant   int64_t & ne0,
+        constant   int64_t & ne1,
+        constant   int64_t & ne2,
+        constant   int64_t & ne3,
+        constant     float & scale,
+        threadgroup   half * shared,
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint3 tpitg[[thread_position_in_threadgroup]],
+        uint3   ntg[[threads_per_threadgroup]],
+        uint  tiisg[[thread_index_in_simdgroup]],
+        uint  sgitg[[simdgroup_index_in_threadgroup]]);
+
+template<int64_t D, int64_t H, int64_t Q> // head size, heads per threadgroup, queries per threadgroup
+kernel void kernel_flash_attn_ext_f16(
+        device const  char * q,
+        device const  char * k,
+        device const  char * v,
+        device const  char * mask,
+        device       float * dst,
+        constant   int64_t & ne00,
+        constant   int64_t & ne01,
+        constant   int64_t & ne02,
+        constant   int64_t & ne03,
+        constant  uint64_t & nb00,
+        constant  uint64_t & nb01,
+        constant  uint64_t & nb02,
+        constant  uint64_t & nb03,
+        constant   int64_t & ne10,
+        constant   int64_t & ne11,
+        constant   int64_t & ne12,
+        constant   int64_t & ne13,
+        constant  uint64_t & nb10,
+        constant  uint64_t & nb11,
+        constant  uint64_t & nb12,
+        constant  uint64_t & nb13,
+        constant   int64_t & ne31,
+        constant  uint64_t & nb31,
+        constant   int64_t & ne0,
+        constant   int64_t & ne1,
+        constant   int64_t & ne2,
+        constant   int64_t & ne3,
+        constant     float & scale,
+        threadgroup   half * shared [[threadgroup(0)]],
+        uint3 tgpig[[threadgroup_position_in_grid]],
+        uint3 tpitg[[thread_position_in_threadgroup]],
+        uint3   ntg[[threads_per_threadgroup]],
+        uint  tiisg[[thread_index_in_simdgroup]],
+        uint  sgitg[[simdgroup_index_in_threadgroup]]) {
+    const uint nsg = ntg.y;         // number of simdgroups
+    const uint tph = N_SIMDWIDTH/H; // threads per head
+
+    const int64_t iq3 = tgpig[2];
+    const int64_t iq2 = tgpig[1]*H + tiisg/tph;
+    const int64_t iq1 = tgpig[0]*Q;
+
+    if (iq2 >= ne02) {
+        return;
+    }
+
+    // assume K and V are same shape
+    const int64_t ne22 = ne12;
+    const int64_t ne23 = ne13;
+
+    const uint64_t nb21 = nb11;
+    const uint64_t nb22 = nb12;
+    const uint64_t nb23 = nb13;
+
+    // broadcast
+    const int64_t rk2 = ne02/ne12;
+    const int64_t rk3 = ne03/ne13;
+
+    const int64_t rv2 = ne02/ne22;
+    const int64_t rv3 = ne03/ne23;
+
+    // k indices
+    const int64_t ik2 = iq2 / rk2;
+    const int64_t ik3 = iq3 / rk3;
+
+    // v indices
+    const int64_t iv2 = iq2 / rv2;
+    const int64_t iv3 = iq3 / rv3;
+
+    const int64_t ir = iq3*ne02*ne01 + iq2*ne01 + iq1;
+
+    device const float * mp[Q];
+    for (int64_t j = 0; j < Q; ++j) {
+        if (iq1 + j < ne01) {
+            mp[j] = (device const float *) (mask + ((ir + j)%ne31)*nb31);
+        } else {
+            mp[j] = nullptr;
+        }
+    }
+
+    const int64_t D4 = D/4;
+
+    const int64_t T  = (H*D + nsg*(32)); // shared memory size per query in half
+    const int64_t T4 = T/4;               // shared memory size per query in half4
+
+    threadgroup half4 * pq4 = (threadgroup half4 *) (shared +              0*H*D);
+    threadgroup half  * ss  = (threadgroup half  *) (shared + sgitg*(32) + 1*H*D);
+    threadgroup half4 * ss4 = (threadgroup half4 *) (shared + sgitg*(32) + 1*H*D);
+
+    const uint tiih  = tiisg%tph; // thread index in head
+    const uint hiisg = tiisg/tph; // head index in simdgroup
+
+    half4 ps4[Q][D4/tph];
+
+    // load H heads from Q to shared memory
+    for (int64_t i = 0; i < D4/tph; ++i) {
+        for (int64_t j = sgitg; j < Q; j += nsg) {
+            if (iq1 + j < ne01) {
+                pq4[j*T4 + hiisg*D4 + tph*i + tiih] = ((device const half4 *) ((device const char *) q + ((iq1 + j)*nb01 + iq2*nb02 + iq3*nb03)))[tph*i + tiih];
+            } else {
+                pq4[j*T4 + hiisg*D4 + tph*i + tiih] = 0.0h;
+            }
+        }
+
+        for (int64_t j = 0; j < Q; ++j) {
+            //ps4[j*T4 + hiisg*D4 + tph*i + tiih] = 0.0h;
+            ps4[j][i] = 0.0h;
+        }
+    }
+
+    for (int64_t j = 0; j < Q; ++j) {
+        ss[j*T + hiisg*tph + tiih] = 0.0h;
+        ss[j*T + hiisg*tph + tiih] = 0.0h;
+    }
+
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+
+    half S = { 0.0h };
+    half M = { -INFINITY };
+
+    for (int64_t ic = sgitg; ic < ne11; ic += nsg) {
+        half mv[Q];
+
+        bool skip = true;
+        for (int64_t j = 0; j < Q; ++j) {
+            mv[j] = mp[j][ic];
+            skip = skip && (mv[j] == -INFINITY);
+        }
+        if (skip) {
+            continue;
+        }
+
+        device const half4 * pk4 = (device const half4 *) ((device char *) k + (ic*nb11 + ik2*nb12 + ik3*nb13));
+
+        half s[Q] = { 0.0h };
+        half4 pk4v[D4/tph];
+
+        for (int64_t i = 0; i < D4/tph; ++i) {
+            pk4v[i] = pk4[tph*i + tiih];
+        }
+
+        for (int64_t j = 0; j < Q; ++j) {
+            for (int64_t i = 0; i < D4/tph; ++i) {
+                s[j] += dot(pq4[j*T4 + hiisg*D4 + tph*i + tiih], pk4v[i]);
+            }
+        }
+
+        for (int64_t j = 0; j < Q; ++j) {
+            ss[j*T + hiisg*tph + tiih] = s[j];
+        }
+
+        simdgroup_barrier(mem_flags::mem_none);
+
+        if (tiih < Q) {
+            const int64_t j = tiih;
+
+            half4 s4 = 0.0h;
+
+            for (int64_t i = 0; i < tph/4; ++i) {
+                s4 += ss4[j*T4 + hiisg*tph/4 + i];
+            }
+
+            half s = (s4.x + s4.y + s4.z + s4.w)*scale + mp[j][ic];
+
+            const half m = M;
+
+            M = max(M, s);
+
+            const half ms = m == -INFINITY ? 0.0h : exp(m - M);
+            const half vs = s == -INFINITY ? 0.0h : exp(s - M);
+
+            S = S*ms + vs;
+
+            ss[j*T + 2*hiisg + 0] = ms;
+            ss[j*T + 2*hiisg + 1] = vs;
+        }
+
+        simdgroup_barrier(mem_flags::mem_none);
+
+        device const half4 * pv4 = (device const half4 *) ((device char *) v + (ic*nb21 + iv2*nb22 + iv3*nb23));
+
+        for (int64_t i = 0; i < D4/tph; ++i) {
+            for (int64_t j = 0; j < Q; ++j) {
+
+                const half ms = ss[j*T + 2*hiisg + 0];
+                const half vs = ss[j*T + 2*hiisg + 1];
+
+                ps4[j][i] = ps4[j][i]*ms + pv4[tph*i + tiih]*vs;
+            }
+        }
+    }
+
+    if (tiih < Q) {
+        const int64_t j = tiih;
+
+        ss[j*T + 2*hiisg + 0] = S;
+        ss[j*T + 2*hiisg + 1] = M;
+    }
+
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+
+    // reduce the warps
+    //if (sgitg == 0) {
+    //    for (int64_t j = 0; j < Q; ++j) {
+    //        for (int64_t sg = 1; sg < nsg; ++sg) {
+
+    //            const half S0 = ss[j*T +            2*hiisg + 0];
+    //            const half S1 = ss[j*T + sg*(256) + 2*hiisg + 0];
+
+    //            const half M0 = ss[j*T +            2*hiisg + 1];
+    //            const half M1 = ss[j*T + sg*(256) + 2*hiisg + 1];
+
+    //            M = max(M0, M1);
+
+    //            const half ms0 = exp(M0 - M);
+    //            const half ms1 = exp(M1 - M);
+
+    //            S = S0*ms0 + S1*ms1;
+
+    //            if (tiih == 0) {
+    //                ss[j*T + 2*hiisg + 0] = S;
+    //                ss[j*T + 2*hiisg + 1] = M;
+    //            }
+
+    //            for (int64_t i = 0; i < D4/tph; ++i) {
+    //                ps4[j*T4 + hiisg*D4 + tph*i + tiih] = ps4[j*T4 + hiisg*D4 + tph*i + tiih]*ms0 + ps4[j*T4 + sg*(256)/4 + hiisg*D4 + tph*i + tiih]*ms1;
+    //            }
+    //        }
+
+    //        for (int64_t i = 0; i < D4/tph; ++i) {
+    //            ps4[j*T4 + hiisg*D4 + tph*i + tiih] = ps4[j*T4 + hiisg*D4 + tph*i + tiih]/S;
+    //        }
+    //    }
+    //}
+
+    for (int64_t sg = 1; sg < nsg; ++sg) {
+        if (sgitg == sg) {
+            // store heads to shared memory - reuse pq4
+            for (int64_t j = 0; j < Q; ++j) {
+                for (int64_t i = 0; i < D4/tph; ++i) {
+                    pq4[j*T4 + hiisg*D4 + tph*i + tiih] = ps4[j][i];
+                }
+            }
+        }
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        if (sgitg == 0) {
+            for (int64_t j = 0; j < Q; ++j) {
+                const half S0 = ss[j*T +           2*hiisg + 0];
+                const half S1 = ss[j*T + sg*(32) + 2*hiisg + 0];
+
+                const half M0 = ss[j*T +           2*hiisg + 1];
+                const half M1 = ss[j*T + sg*(32) + 2*hiisg + 1];
+
+                M = max(M0, M1);
+
+                const half ms0 = exp(M0 - M);
+                const half ms1 = exp(M1 - M);
+
+                S = S0*ms0 + S1*ms1;
+
+                if (tiih == 0) {
+                    ss[j*T + 2*hiisg + 0] = S;
+                    ss[j*T + 2*hiisg + 1] = M;
+                }
+
+                for (int64_t i = 0; i < D4/tph; ++i) {
+                    //ps4[j*T4 + hiisg*D4 + tph*i + tiih] = ps4[j*T4 + hiisg*D4 + tph*i + tiih]*ms0 + ps4[j*T4 + sg*(32)/4 + hiisg*D4 + tph*i + tiih]*ms1;
+                    ps4[j][i] = ps4[j][i]*ms0 + pq4[j*T4 + hiisg*D4 + tph*i + tiih]*ms1;
+                }
+            }
+        }
+
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+    }
+
+    if (sgitg == 0) {
+        for (int64_t j = 0; j < Q; ++j) {
+            S = ss[j*T + 2*hiisg + 0];
+            for (int64_t i = 0; i < D4/tph; ++i) {
+                //ps4[j*T4 + hiisg*D4 + tph*i + tiih] = ps4[j*T4 + hiisg*D4 + tph*i + tiih]/S;
+                ps4[j][i] = ps4[j][i]/S;
+            }
+        }
+    }
+
+    simdgroup_barrier(mem_flags::mem_threadgroup);
+
+    device float4 * dst4 = (device float4 *) dst;
+
+    if (sgitg == 0) {
+        for (int64_t j = 0; j < Q && iq1 + j < ne01; ++j) {
+            for (int64_t i = 0; i < D4/tph; ++i) {
+                //dst4[(iq3*ne2*ne1 + iq2 + (iq1 + j)*ne1)*D4 + tph*i + tiih] = (float4) ps4[j*T4 + hiisg*D4 + tph*i + tiih];
+                dst4[(iq3*ne2*ne1 + iq2 + (iq1 + j)*ne1)*D4 + tph*i + tiih] = (float4) ps4[j][i];
+            }
+        }
+    }
+}
+
+template [[host_name("kernel_flash_attn_ext_f16_h64" )]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<64,  4, 2>;
+template [[host_name("kernel_flash_attn_ext_f16_h80" )]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<80,  4, 2>;
+template [[host_name("kernel_flash_attn_ext_f16_h128")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<128, 4, 2>;
+
 kernel void kernel_cpy_f16_f16(
        device  const half * src0,
        device        half * dst,
--- a/ggml.c
+++ b/ggml.c
@@ -817,7 +817,7 @@ do {                                                              \

 #if defined(__F16C__)
 // the  _mm256_cvt intrinsics require F16C
-#define GGML_F32Cx8_LOAD(x)     _mm256_cvtph_ps(_mm_loadu_si128((__m128i *)(x)))
+#define GGML_F32Cx8_LOAD(x)     _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(x)))
 #define GGML_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0))
 #else
 static inline __m256 __avx_f32cx8_load(ggml_fp16_t *x) {
@@ -1323,6 +1323,37 @@ inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float
 #endif
 }

+inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, const ggml_fp16_t * restrict x, const float v) {
+#if defined(GGML_SIMD)
+    const int np = (n & ~(GGML_F16_STEP - 1));
+
+    GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
+
+    GGML_F16_VEC ax[GGML_F16_ARR];
+    GGML_F16_VEC ay[GGML_F16_ARR];
+
+    for (int i = 0; i < np; i += GGML_F16_STEP) {
+        for (int j = 0; j < GGML_F16_ARR; j++) {
+            ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
+            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
+            ay[j] = GGML_F16_VEC_FMA(ay[j], ax[j], vx);
+
+            GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
+        }
+    }
+
+    // leftovers
+    for (int i = np; i < n; ++i) {
+        y[i] += GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(x[i])*v);
+    }
+#else
+    // scalar
+    for (int i = 0; i < n; ++i) {
+        y[i] += GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(x[i])*v);
+    }
+#endif
+}
+
 // xs and vs are byte strides of x and v
 inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int vs, float * restrict y, const float * restrict xv, const float * restrict vv) {

@@ -1407,6 +1438,35 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float   v) {
 #endif
 }

+inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float v) {
+#if defined(GGML_SIMD)
+    const int np = (n & ~(GGML_F16_STEP - 1));
+
+    GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
+
+    GGML_F16_VEC ay[GGML_F16_ARR];
+
+    for (int i = 0; i < np; i += GGML_F16_STEP) {
+        for (int j = 0; j < GGML_F16_ARR; j++) {
+            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
+            ay[j] = GGML_F16_VEC_MUL(ay[j], vx);
+
+            GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
+        }
+    }
+
+    // leftovers
+    for (int i = np; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i])*v);
+    }
+#else
+    // scalar
+    for (int i = 0; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i])*v);
+    }
+#endif
+}
+
 inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, x, x); *s = sqrtf(*s);   }
 inline static void ggml_vec_sqr_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i];   }
 inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); }
@@ -1418,9 +1478,6 @@ inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) {
 inline static void ggml_vec_elu_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
 inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
 inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
-// TODO: optimize performance
-inline static void ggml_vec_hardswish_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
-inline static void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }

 static const float GELU_COEF_A     = 0.044715f;
 static const float GELU_QUICK_COEF = -1.702f;
@@ -1653,6 +1710,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
    "LEAKY_RELU",

    "FLASH_ATTN",
+    "FLASH_ATTN_EXT",
    "FLASH_FF",
    "FLASH_ATTN_BACK",
    "WIN_PART",
@@ -1677,7 +1735,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
    "CROSS_ENTROPY_LOSS_BACK",
 };

-static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72");
+static_assert(GGML_OP_COUNT == 73, "GGML_OP_COUNT != 73");

 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
    "none",
@@ -1739,6 +1797,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
    "leaky_relu(x)",

    "flash_attn(x)",
+    "flash_attn_ext(x)",
    "flash_ff(x)",
    "flash_attn_back(x)",
    "win_part(x)",
@@ -1763,7 +1822,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
    "cross_entropy_loss_back(x,y)",
 };

-static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72");
+static_assert(GGML_OP_COUNT == 73, "GGML_OP_COUNT != 73");

 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");

@@ -1779,11 +1838,9 @@ static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
    "GELU",
    "GELU_QUICK",
    "SILU",
-    "HARDSWISH",
-    "HARDSIGMOID",
 };

-static_assert(GGML_UNARY_OP_COUNT == 12, "GGML_UNARY_OP_COUNT != 12");
+static_assert(GGML_UNARY_OP_COUNT == 10, "GGML_UNARY_OP_COUNT != 10");


 static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
@@ -3950,20 +4007,6 @@ struct ggml_tensor * ggml_silu_back(
    return result;
 }

-// ggml hardswish
-struct ggml_tensor * ggml_hardswish(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_unary(ctx, a, GGML_UNARY_OP_HARDSWISH);
-}
-
-// ggml hardsigmoid
-struct ggml_tensor * ggml_hardsigmoid(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a) {
-    return ggml_unary(ctx, a, GGML_UNARY_OP_HARDSIGMOID);
-}
-
 // ggml_norm

 static struct ggml_tensor * ggml_norm_impl(
@@ -5363,31 +5406,6 @@ GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
    return result;
 }

-// ggml_conv_depthwise
-struct ggml_tensor * ggml_conv_depthwise_2d(
-    struct ggml_context * ctx,
-    struct ggml_tensor * a,
-    struct ggml_tensor * b,
-    int                  s0,
-    int                  s1,
-    int                  p0,
-    int                  p1,
-    int                  d0,
-    int                  d1) {
-    struct ggml_tensor * new_a = ggml_reshape_4d(ctx, a, a->ne[0], a->ne[1], 1, a->ne[2] * a->ne[3]);
-    struct ggml_tensor * im2col = ggml_im2col(ctx, new_a,
-                                        ggml_reshape_4d(ctx, b, b->ne[0], b->ne[1], 1, b->ne[2] * b->ne[3]),
-                                        s0, s1, p0, p1, d0, d1, true); // [N * IC, OH, OW, KH * KW]
-
-    struct ggml_tensor * result =
-        ggml_mul_mat(ctx,
-                ggml_reshape_4d(ctx, new_a, (new_a->ne[0] * new_a->ne[1]), new_a->ne[2],  new_a->ne[3], 1),                       // [OC，1, KH, KW] => [1, OC, 1, KH * KW]
-                ggml_reshape_4d(ctx, im2col, im2col->ne[0], im2col->ne[2] * im2col->ne[1], b->ne[2], b->ne[3])); // [N * IC, OH, OW, KH * KW] => [N, IC, OH * OW, KH * KW]
-
-    result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], b->ne[2], b->ne[3]); // [N, OC, OH, OW]
-
-    return result;
-}
 // ggml_conv_2d

 // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
@@ -5722,6 +5740,47 @@ struct ggml_tensor * ggml_flash_attn(
    return result;
 }

+// ggml_flash_attn_ext
+
+struct ggml_tensor * ggml_flash_attn_ext(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * q,
+        struct ggml_tensor  * k,
+        struct ggml_tensor  * v,
+        struct ggml_tensor  * mask,
+        float                 scale) {
+    GGML_ASSERT(ggml_can_mul_mat(k, q));
+    // TODO: check if vT can be multiplied by (k*qT)
+    if (mask) {
+        GGML_ASSERT(ggml_is_contiguous(mask));
+        GGML_ASSERT(mask->ne[2] == 1);
+        GGML_ASSERT(mask->ne[3] == 1);
+        //GGML_ASSERT(ggml_can_repeat_rows(mask, qk));
+    }
+
+    bool is_node = false;
+
+    if (q->grad || k->grad || v->grad) {
+        is_node = true;
+    }
+
+    // permute(0, 2, 1, 3)
+    int64_t ne[4] = { q->ne[0], q->ne[2], q->ne[1], q->ne[3] };
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, ne);
+
+    float params[] = { scale };
+    ggml_set_op_params(result, params, sizeof(params));
+
+    result->op   = GGML_OP_FLASH_ATTN_EXT;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = q;
+    result->src[1] = k;
+    result->src[2] = v;
+    result->src[3] = mask;
+
+    return result;
+}
+
 // ggml_flash_ff

 struct ggml_tensor * ggml_flash_ff(
@@ -7808,9 +7867,6 @@ static void ggml_compute_forward_acc_f32(
    bool   inplace = (bool) ((int32_t *) dst->op_params)[4];

    if (!inplace && (params->type == GGML_TASK_INIT)) {
-        if (params->ith != 0) {
-            return;
-        }
        // memcpy needs to be synchronized across threads to avoid race conditions.
        // => do it in INIT phase
        memcpy(
@@ -9380,87 +9436,6 @@ static void ggml_compute_forward_silu_back(
    }
 }

-
-static void ggml_compute_forward_hardswish_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    assert(params->ith == 0);
-    assert(ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    const int n  = ggml_nrows(src0);
-    const int nc = src0->ne[0];
-
-    assert(dst->nb[0]  == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
-    for (int i = 0; i < n; i++) {
-        ggml_vec_hardswish_f32(nc,
-                (float *) ((char *) dst->data  + i*( dst->nb[1])),
-                (float *) ((char *) src0->data + i*(src0->nb[1])));
-    }
-}
-static void ggml_compute_forward_hardswish(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_hardswish_f32(params, src0, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-static void ggml_compute_forward_hardsigmoid_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    assert(params->ith == 0);
-    assert(ggml_are_same_shape(src0, dst));
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    const int n  = ggml_nrows(src0);
-    const int nc = src0->ne[0];
-
-    assert(dst->nb[0]  == sizeof(float));
-    assert(src0->nb[0] == sizeof(float));
-
-    for (int i = 0; i < n; i++) {
-        ggml_vec_hardsigmoid_f32(nc,
-                (float *) ((char *) dst->data  + i*( dst->nb[1])),
-                (float *) ((char *) src0->data + i*(src0->nb[1])));
-    }
-}
-
-static void ggml_compute_forward_hardsigmoid(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        struct ggml_tensor * dst) {
-    switch (src0->type) {
-        case GGML_TYPE_F32:
-            {
-                ggml_compute_forward_hardsigmoid_f32(params, src0, dst);
-            } break;
-        default:
-            {
-                GGML_ASSERT(false);
-            } break;
-    }
-}
-
-
 // ggml_compute_forward_norm

 static void ggml_compute_forward_norm_f32(
@@ -9953,30 +9928,11 @@ static void ggml_compute_forward_mul_mat(

 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
    if (ggml_compute_forward_mul_mat_use_blas(dst)) {
-        const int64_t ne_plane      = ne01*ne00;
-        const int64_t desired_wsize = ne13*ne12*ne_plane*sizeof(float);
-        UNUSED(desired_wsize);
+        if (params->ith != 0) {
+            return;
+        }

        if (params->type == GGML_TASK_INIT) {
-            if (type != GGML_TYPE_F32) {
-                assert(params->wsize >= desired_wsize);
-                // parallelize by src0 rows
-                for (int64_t i13 = 0; i13 < ne13; i13++) {
-                    for (int64_t i12 = 0; i12 < ne12; i12++) {
-                        // broadcast src0 into src1 across 2nd,3rd dimension
-                        const int64_t i03 = i13/r3;
-                        const int64_t i02 = i12/r2;
-
-                        const void           *       x        = (char *)  src0->data    + i02*nb02          + i03*nb03;
-                              float          * const wdata    = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
-                              ggml_to_float_t  const to_float = type_traits[type].to_float;
-
-                        for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
-                            to_float((const char *) x + i01*nb01, wdata + i01*ne00, ne00);
-                        }
-                    }
-                }
-            }
            return;
        }

@@ -9984,14 +9940,9 @@ static void ggml_compute_forward_mul_mat(
            return;
        }

-        // perform sgemm, parallelization controlled by blas lib
-        if (ith != 0) {
-            return;
-        }
-
-        //const int64_t tgemm0 = ggml_perf_time_us();
        for (int64_t i13 = 0; i13 < ne13; i13++) {
            for (int64_t i12 = 0; i12 < ne12; i12++) {
+                // broadcast src0 into src1 across 2nd,3rd dimension
                const int64_t i03 = i13/r3;
                const int64_t i02 = i12/r2;

@@ -10000,7 +9951,17 @@ static void ggml_compute_forward_mul_mat(
                      float * d = (float *) ((char *)  dst->data + i12*nb2  + i13*nb3);

                if (type != GGML_TYPE_F32) {
-                    x = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
+                            float * const wdata    = params->wdata;
+                    ggml_to_float_t const to_float = type_traits[type].to_float;
+
+                    size_t id = 0;
+                    for (int64_t i01 = 0; i01 < ne01; ++i01) {
+                        to_float((const char *) x + i01*nb01, wdata + id, ne00);
+                        id += ne00;
+                    }
+
+                    assert(id*sizeof(float) <= params->wsize);
+                    x = wdata;
                }

                cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
@@ -10010,7 +9971,6 @@ static void ggml_compute_forward_mul_mat(
                         0.0f,    d, ne01);
            }
        }
-        //printf("cblas_sgemm = %.3f ms, %lld flops\n", (ggml_perf_time_us() - tgemm0)/1000.0, ne13*ne12*ne1*ne01*ne10*2);

        //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);

@@ -10019,9 +9979,6 @@ static void ggml_compute_forward_mul_mat(
 #endif

    if (params->type == GGML_TASK_INIT) {
-        if (ith != 0) {
-            return;
-        }
        if (src1->type != vec_dot_type) {
            char * wdata = params->wdata;
            const size_t row_size = ggml_row_size(vec_dot_type, ne10);
@@ -10186,9 +10143,6 @@ static void ggml_compute_forward_mul_mat_id(
    #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)]

   if (params->type == GGML_TASK_INIT) {
-        if (ith != 0) {
-            return;
-        }
        char * wdata = params->wdata;
        if (src1->type != vec_dot_type) {
            const size_t row_size = ggml_row_size(vec_dot_type, ne10);
@@ -10374,9 +10328,6 @@ static void ggml_compute_forward_out_prod_f32(
            return;
        }
 #endif
-        if (ith != 0) {
-            return;
-        }
        ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
        return;
    }
@@ -10560,9 +10511,6 @@ static void ggml_compute_forward_out_prod_q_f32(
    // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)

    if (params->type == GGML_TASK_INIT) {
-        if (ith != 0) {
-            return;
-        }
        ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
        return;
    }
@@ -10747,9 +10695,6 @@ static void ggml_compute_forward_set_f32(
    bool   inplace = (bool) ((int32_t *) dst->op_params)[4];

    if (!inplace && (params->type == GGML_TASK_INIT)) {
-        if (params->ith != 0) {
-            return;
-        }
        // memcpy needs to be synchronized across threads to avoid race conditions.
        // => do it in INIT phase
        memcpy(
@@ -11074,9 +11019,6 @@ static void ggml_compute_forward_get_rows_back_f32_f16(
    // ggml_compute_forward_dup_same_cont(params, opt0, dst);

    if (params->type == GGML_TASK_INIT) {
-        if (params->ith != 0) {
-            return;
-        }
        memset(dst->data, 0, ggml_nbytes(dst));
    }

@@ -11111,9 +11053,6 @@ static void ggml_compute_forward_get_rows_back_f32(
    // ggml_compute_forward_dup_same_cont(params, opt0, dst);

    if (params->type == GGML_TASK_INIT) {
-        if (params->ith != 0) {
-            return;
-        }
        memset(dst->data, 0, ggml_nbytes(dst));
    }

@@ -11251,9 +11190,6 @@ static void ggml_compute_forward_diag_mask_f32(
    GGML_ASSERT(n_past >= 0);

    if (!inplace && (params->type == GGML_TASK_INIT)) {
-        if (ith != 0) {
-            return;
-        }
        // memcpy needs to be synchronized across threads to avoid race conditions.
        // => do it in INIT phase
        GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
@@ -12224,9 +12160,6 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32(
    GGML_ASSERT(nb10 == sizeof(float));

    if (params->type == GGML_TASK_INIT) {
-        if (ith != 0) {
-            return;
-        }
        memset(params->wdata, 0, params->wsize);

        // permute kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
@@ -12321,9 +12254,6 @@ static void ggml_compute_forward_conv_transpose_1d_f32(
    GGML_ASSERT(nb10 == sizeof(float));

    if (params->type == GGML_TASK_INIT) {
-        if (ith != 0) {
-            return;
-        }
        memset(params->wdata, 0, params->wsize);

        // prepare kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
@@ -12522,7 +12452,6 @@ static void ggml_compute_forward_im2col(
    }
 }

-
 // ggml_compute_forward_conv_transpose_2d

 static void ggml_compute_forward_conv_transpose_2d(
@@ -12548,9 +12477,6 @@ static void ggml_compute_forward_conv_transpose_2d(
    GGML_ASSERT(nb10 == sizeof(float));

    if (params->type == GGML_TASK_INIT) {
-        if (ith != 0) {
-            return;
-        }
        memset(params->wdata, 0, params->wsize);

        // permute kernel data (src0) from (Kw x Kh x Cout x Cin) to (Cin x Kw x Kh x Cout)
@@ -13389,6 +13315,191 @@ static void ggml_compute_forward_flash_attn(
    }
 }

+// ggml_compute_forward_flash_attn_ext
+
+static void ggml_compute_forward_flash_attn_ext_f16(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * q,
+        const struct ggml_tensor * k,
+        const struct ggml_tensor * v,
+        const struct ggml_tensor * mask,
+        struct ggml_tensor * dst) {
+    int64_t t0 = ggml_perf_time_us();
+    UNUSED(t0);
+
+    GGML_TENSOR_LOCALS(int64_t, neq, q,   ne)
+    GGML_TENSOR_LOCALS(size_t,  nbq, q,   nb)
+    GGML_TENSOR_LOCALS(int64_t, nek, k,   ne)
+    GGML_TENSOR_LOCALS(size_t,  nbk, k,   nb)
+    GGML_TENSOR_LOCALS(int64_t, nev, v,   ne)
+    GGML_TENSOR_LOCALS(size_t,  nbv, v,   nb)
+    GGML_TENSOR_LOCALS(int64_t, ne,  dst, ne)
+    GGML_TENSOR_LOCALS(size_t,  nb,  dst, nb)
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int64_t D = neq0;
+    const int64_t N = neq1;
+    const int64_t P = nek1 - N;
+
+    GGML_ASSERT(ne0 == D);
+    GGML_ASSERT(ne2 == N);
+    GGML_ASSERT(P >= 0);
+
+    GGML_ASSERT(nbq0 == sizeof(ggml_fp16_t));
+    GGML_ASSERT(nbk0 == sizeof(ggml_fp16_t));
+    GGML_ASSERT(nbv0 == sizeof(ggml_fp16_t));
+
+    GGML_ASSERT(neq0 == D);
+    GGML_ASSERT(nek0 == D);
+    GGML_ASSERT(nev0 == D);
+
+    GGML_ASSERT(neq1 == N);
+    GGML_ASSERT(nek1 == N + P);
+    GGML_ASSERT(nev0 == D);
+
+    // dst cannot be transposed or permuted
+    GGML_ASSERT(nb0 == sizeof(float));
+    GGML_ASSERT(nb0 <= nb1);
+    GGML_ASSERT(nb1 <= nb2);
+    GGML_ASSERT(nb2 <= nb3);
+
+    // broadcast factors
+    const int64_t rk2 = neq2/nek2;
+    const int64_t rk3 = neq3/nek3;
+
+    const int64_t rv2 = neq2/nev2;
+    const int64_t rv3 = neq3/nev3;
+
+    if (params->type == GGML_TASK_INIT) {
+        return;
+    }
+
+    if (params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    // parallelize by q rows using ggml_vec_dot_f32
+
+    // total rows in q
+    const int nr = neq1*neq2*neq3;
+
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    float scale = 1.0f;
+    memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
+
+    //printf("P=%d N=%d D=%d ir0=%d ir1=%d scale = %f\n", P, N, D, ir0, ir1, scale);
+
+    // loop over n_batch and n_head
+    for (int ir = ir0; ir < ir1; ++ir) {
+        // q indices
+        const int iq3 = ir/(neq2*neq1);
+        const int iq2 = (ir - iq3*neq2*neq1)/neq1;
+        const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1);
+
+        float S = 0.0f;
+        float M = -INFINITY;
+
+        float       * V32 = (float       *) params->wdata + ith*(2*D + CACHE_LINE_SIZE_F32);
+        ggml_fp16_t * V16 = (ggml_fp16_t *) (V32 + D);
+
+        memset(V16, 0, D*sizeof(ggml_fp16_t));
+
+        const float * mp = mask ? (float *)((char *) mask->data + (ir%mask->ne[1])*mask->nb[1]) : NULL;
+
+        // k indices
+        const int ik3 = iq3 / rk3;
+        const int ik2 = iq2 / rk2;
+
+        // v indices
+        const int iv3 = iq3 / rv3;
+        const int iv2 = iq2 / rv2;
+
+        // online softmax / attention
+        // loop over n_kv and n_head_kv
+        // ref: https://arxiv.org/pdf/2112.05682.pdf
+        for (int64_t ic = 0; ic < nek1; ++ic) {
+            const float mv = mp ? mp[ic] : 0.0f;
+            if (mv == -INFINITY) {
+                continue;
+            }
+
+            float s;
+
+            ggml_vec_dot_f16(D,
+                    &s,
+                    (ggml_fp16_t *) ((char *) k->data + ( ic*nbk1 + ik2*nbk2 + ik3*nbk3)),
+                    (ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)));
+
+            s = s*scale + mv;
+
+            const float Mold = M;
+
+            float ms = 1.0f;
+            float vs = 1.0f;
+
+            if (s > M) {
+                M = s;
+                ms = expf(Mold - M);
+
+                // V = V*expf(Mold - M)
+                ggml_vec_scale_f16(D, V16, ms);
+            } else {
+                vs = expf(s - M);
+            }
+
+            const ggml_fp16_t * v16 = (const ggml_fp16_t *) ((char *) v->data + (ic*nbv1 + iv2*nbv2 + iv3*nbv3));
+
+            // V += v*expf(s - M)
+            ggml_vec_mad_f16(D, V16, v16, vs);
+
+            S = S*ms + vs;
+        }
+
+        // V /= S
+        for (int64_t d = 0; d < D; ++d) {
+            V32[d] = GGML_FP16_TO_FP32(V16[d])/S;
+        }
+
+        // dst indices
+        const int i1 = iq1;
+        const int i2 = iq2;
+        const int i3 = iq3;
+
+        // original
+        //memcpy((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3), V, nev0*sizeof(float));
+
+        // permute(0, 2, 1, 3)
+        memcpy((char *) dst->data + (i3*ne2*ne1 + i2 + i1*ne1)*nb1, V32, nb1);
+    }
+}
+
+static void ggml_compute_forward_flash_attn_ext(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * q,
+        const struct ggml_tensor * k,
+        const struct ggml_tensor * v,
+        const struct ggml_tensor * mask,
+        struct ggml_tensor * dst) {
+    switch (q->type) {
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_flash_attn_ext_f16(params, q, k, v, mask, dst);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
 // ggml_compute_forward_flash_ff

 static void ggml_compute_forward_flash_ff_f16(
@@ -14094,14 +14205,6 @@ static void ggml_compute_forward_unary(
            {
                ggml_compute_forward_silu(params, src0, dst);
            } break;
-        case GGML_UNARY_OP_HARDSWISH:
-            {
-                ggml_compute_forward_hardswish(params, src0, dst);
-            } break;
-        case GGML_UNARY_OP_HARDSIGMOID:
-            {
-                ggml_compute_forward_hardsigmoid(params, src0, dst);
-            } break;
        default:
            {
                GGML_ASSERT(false);
@@ -14165,9 +14268,6 @@ static void ggml_compute_forward_add_rel_pos_f32(

    const bool inplace = (bool) ((int32_t *) dst->op_params)[0];
    if (!inplace && params->type == GGML_TASK_INIT) {
-        if (params->ith != 0) {
-            return;
-        }
        memcpy((char *) dst->data, (char *) src0->data, ggml_nbytes(dst));
        return;
    }
@@ -14905,6 +15005,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
                const bool masked = t != 0;
                ggml_compute_forward_flash_attn(params, tensor->src[0], tensor->src[1], tensor->src[2], masked, tensor);
            } break;
+        case GGML_OP_FLASH_ATTN_EXT:
+            {
+                ggml_compute_forward_flash_attn_ext(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], tensor);
+            } break;
        case GGML_OP_FLASH_FF:
            {
                ggml_compute_forward_flash_ff(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], tensor->src[4], tensor);
@@ -15901,6 +16005,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                GGML_ASSERT(false); // TODO: not implemented
            } break;
        case GGML_OP_FLASH_ATTN:
+        case GGML_OP_FLASH_ATTN_EXT:
            {
                struct ggml_tensor * flash_grad = NULL;
                if (src0->grad || src1->grad || tensor->src[2]->grad) {
@@ -16461,9 +16566,8 @@ struct ggml_compute_state_shared {
    const int n_threads;

    // synchronization primitives
-    atomic_int n_active;  // num active threads
-    atomic_int node_n;    // active graph node
-    atomic_int node_task; // active graph node task phase
+    atomic_int n_active; // num active threads
+    atomic_int node_n;   // active graph node

    bool (*abort_callback)(void * data); // abort ggml_graph_compute when true
    void * abort_callback_data;
@@ -16519,8 +16623,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
                case GGML_UNARY_OP_TANH:
                case GGML_UNARY_OP_ELU:
                case GGML_UNARY_OP_RELU:
-                case GGML_UNARY_OP_HARDSWISH: // to opt for multiple threads
-                case GGML_UNARY_OP_HARDSIGMOID: // to opt for multiple threads
                    {
                        n_tasks = 1;
                    } break;
@@ -16629,6 +16731,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
                n_tasks = n_threads;
            } break;
        case GGML_OP_FLASH_ATTN:
+        case GGML_OP_FLASH_ATTN_EXT:
            {
                n_tasks = n_threads;
            } break;
@@ -16711,34 +16814,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
    return n_tasks;
 }

-static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_compute_state * state, const bool do_yield) {
-    // wait for other threads to finish
-    const int last_node_n = * node_n;
-
-    while (true) {
-        if (do_yield) {
-            sched_yield();
-        }
-
-        * node_n = atomic_load(&state->shared->node_n);
-        if (* node_n != last_node_n) break;
-    }
-}
-
-static void ggml_graph_compute_thread_sync_task(int * task_phase, struct ggml_compute_state * state, const bool do_yield) {
-    // wait for other threads to finish
-    const int last_task_phase = * task_phase;
-
-    while (true) {
-        if (do_yield) {
-            sched_yield();
-        }
-
-        * task_phase = atomic_load(&state->shared->node_task);
-        if (* task_phase != last_task_phase) break;
-    }
-}
-
 static thread_ret_t ggml_graph_compute_thread(void * data) {
    struct ggml_compute_state * state = (struct ggml_compute_state *) data;

@@ -16749,8 +16824,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {

    set_numa_thread_affinity(state->ith, n_threads);

-    int node_n     = -1;
-    int task_phase = GGML_TASK_FINALIZE;
+    int node_n = -1;

    while (true) {
        if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
@@ -16782,6 +16856,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
            // distribute new work or execute it direct if 1T
            while (++node_n < cgraph->n_nodes) {
                GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes);
+
                struct ggml_tensor * node = cgraph->nodes[node_n];
                const int n_tasks = ggml_get_n_tasks(node, n_threads);

@@ -16790,13 +16865,13 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {

                params.nth = n_tasks;

-                if (n_tasks == 1) {
-                    /* INIT */
-                    if (GGML_OP_HAS_INIT[node->op]) {
-                        params.type = GGML_TASK_INIT;
-                        ggml_compute_forward(&params, node);
-                    }
+                /* INIT */
+                if (GGML_OP_HAS_INIT[node->op]) {
+                    params.type = GGML_TASK_INIT;
+                    ggml_compute_forward(&params, node);
+                }

+                if (n_tasks == 1) {
                    // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1,
                    // they do something more efficient than spinning (?)
                    params.type = GGML_TASK_COMPUTE;
@@ -16817,24 +16892,38 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
                }
            }

-            task_phase = GGML_TASK_INIT;
-            atomic_store(&state->shared->n_active,  n_threads);
-            atomic_store(&state->shared->node_n,    node_n);
-            atomic_store(&state->shared->node_task, task_phase);
+            atomic_store(&state->shared->n_active, n_threads);
+            atomic_store(&state->shared->node_n,   node_n);
        } else {
-            ggml_graph_compute_thread_sync_node(&node_n,     state, false);
-            ggml_graph_compute_thread_sync_task(&task_phase, state, false);
+            // wait for other threads to finish
+            const int last = node_n;
+
+            const bool do_yield = last < 0 || cgraph->nodes[last]->op == GGML_OP_MUL_MAT;
+
+            while (true) {
+                // TODO: this sched_yield can have significant impact on the performance - either positive or negative
+                //       depending on the workload and the operating system.
+                //       since it is not clear what is the best approach, it should potentially become user-configurable
+                //       ref: https://github.com/ggerganov/ggml/issues/291
+                // UPD:  adding the do_yield flag seems to resolve the issue universally
+                if (do_yield) {
+                    sched_yield();
+                }
+
+                node_n = atomic_load(&state->shared->node_n);
+                if (node_n != last) break;
+            };
        }

        // check if we should stop
        if (node_n >= cgraph->n_nodes) break;

-        /* INIT & COMPUTE */
+        /* COMPUTE */
        struct ggml_tensor * node = cgraph->nodes[node_n];
        const int n_tasks = ggml_get_n_tasks(node, n_threads);

        struct ggml_compute_params params = {
-            /*.type  =*/ GGML_TASK_INIT,
+            /*.type  =*/ GGML_TASK_COMPUTE,
            /*.ith   =*/ state->ith,
            /*.nth   =*/ n_tasks,
            /*.wsize =*/ cplan->work_size,
@@ -16842,39 +16931,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
        };

        if (state->ith < n_tasks) {
-            if (GGML_OP_HAS_INIT[node->op]) {
-                ggml_compute_forward(&params, node);
-            }
-        }
-
-        if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
-            task_phase = GGML_TASK_COMPUTE;
-            atomic_store(&state->shared->n_active,  n_threads);
-            atomic_store(&state->shared->node_task, task_phase);
-        }
-        else {
-            // TODO: this sched_yield can have significant impact on the performance - either positive or negative
-            //       depending on the workload and the operating system.
-            //       since it is not clear what is the best approach, it should potentially become user-configurable
-            //       ref: https://github.com/ggerganov/ggml/issues/291
-            // UPD:  adding the do_yield flag seems to resolve the issue universally
-            const bool do_yield = node_n < 0 || cgraph->nodes[node_n]->op == GGML_OP_MUL_MAT;
-            ggml_graph_compute_thread_sync_task(&task_phase, state, do_yield);
-        }
-
-        if (state->ith < n_tasks) {
-            params.type = GGML_TASK_COMPUTE;
            ggml_compute_forward(&params, node);
        }
-
-        if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
-            task_phase = GGML_TASK_FINALIZE;
-            atomic_store(&state->shared->n_active,  n_threads);
-            atomic_store(&state->shared->node_task, task_phase);
-        }
-        else {
-            ggml_graph_compute_thread_sync_task(&task_phase, state, false);
-        }
    }

    return GGML_EXIT_SUCCESS;
@@ -16931,11 +16989,8 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
                    if (ggml_compute_forward_mul_mat_use_blas(node)) {
                        if (node->src[0]->type != GGML_TYPE_F32) {
-                            // here we need memory for fully dequantized matrix from src0
-                            // take into account that src0 can be broadcasted into src1[2,3]
-                            cur = ggml_type_size(GGML_TYPE_F32)
-                                * node->src[0]->ne[0]*node->src[0]->ne[1]
-                                * node->src[1]->ne[2]*node->src[1]->ne[3];
+                            // here we need memory just for single 2D matrix from src0
+                            cur = ggml_type_size(GGML_TYPE_F32)*(node->src[0]->ne[0]*node->src[0]->ne[1]);
                        }
                    } else
 #endif
@@ -17019,6 +17074,12 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
                        cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2
                    }
                } break;
+            case GGML_OP_FLASH_ATTN_EXT:
+                {
+                    const int64_t ne00 = node->src[0]->ne[0]; // D
+
+                    cur = 2*sizeof(float)*ne00*n_tasks; // 2x head size
+                } break;
            case GGML_OP_FLASH_FF:
                {
                    if (node->src[1]->type == GGML_TYPE_F32) {
@@ -17089,7 +17150,6 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
        /*.n_threads               =*/ n_threads,
        /*.n_active                =*/ n_threads,
        /*.node_n                  =*/ -1,
-        /*.node_task               =*/ GGML_TASK_FINALIZE,
        /*.abort_callback          =*/ NULL,
        /*.abort_callback_data     =*/ NULL,
    };
--- a/ggml.h
+++ b/ggml.h
@@ -452,6 +452,7 @@ extern "C" {
        GGML_OP_LEAKY_RELU,

        GGML_OP_FLASH_ATTN,
+        GGML_OP_FLASH_ATTN_EXT,
        GGML_OP_FLASH_FF,
        GGML_OP_FLASH_ATTN_BACK,
        GGML_OP_WIN_PART,
@@ -489,8 +490,6 @@ extern "C" {
        GGML_UNARY_OP_GELU,
        GGML_UNARY_OP_GELU_QUICK,
        GGML_UNARY_OP_SILU,
-        GGML_UNARY_OP_HARDSWISH,
-        GGML_UNARY_OP_HARDSIGMOID,

        GGML_UNARY_OP_COUNT,
    };
@@ -1034,16 +1033,6 @@ extern "C" {
            struct ggml_tensor  * a,
            struct ggml_tensor  * b);

-    // hardswish(x) = x * relu6(x + 3) / 6
-    GGML_API struct ggml_tensor * ggml_hardswish(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    // hardsigmoid(x) = relu6(x + 3) / 6
-    GGML_API struct ggml_tensor * ggml_hardsigmoid(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
    // normalize along rows
    GGML_API struct ggml_tensor * ggml_norm(
            struct ggml_context * ctx,
@@ -1495,17 +1484,6 @@ extern "C" {
            int                  d1,
            bool                 is_2D);

-    GGML_API struct ggml_tensor * ggml_conv_depthwise_2d(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
-            int                  s0,
-            int                  s1,
-            int                  p0,
-            int                  p1,
-            int                  d0,
-            int                  d1);
-
    GGML_API struct ggml_tensor * ggml_conv_1d(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
@@ -1642,6 +1620,19 @@ extern "C" {
            struct ggml_tensor  * v,
            bool                  masked);

+    // q:    [n_embd, n_batch, n_head,    1]
+    // k:    [n_embd, n_kv,    n_head_kv, 1]
+    // v:    [n_embd, n_kv,    n_head_kv, 1] !! not transposed !!
+    // mask: [n_kv,   n_batch, 1,         1]
+    // res:  [n_embd, n_head,  n_batch,   1] !! permuted !!
+    GGML_API struct ggml_tensor * ggml_flash_attn_ext(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * q,
+            struct ggml_tensor  * k,
+            struct ggml_tensor  * v,
+            struct ggml_tensor  * mask,
+            float                 scale);
+
    GGML_API struct ggml_tensor * ggml_flash_attn_back(
           struct ggml_context * ctx,
           struct ggml_tensor  * q,
--- a/llama.cpp
+++ b/llama.cpp
--- a/llama.h
+++ b/llama.h
@@ -107,7 +107,6 @@ extern "C" {
        LLAMA_FTYPE_MOSTLY_IQ2_XXS       = 19, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_IQ2_XS        = 20, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_Q2_K_S        = 21, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q3_K_XS       = 22, // except 1d tensors

        LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
    };
--- a/mypy.ini
+++ b/mypy.ini
@@ -4,4 +4,3 @@ allow_untyped_calls = true
 allow_untyped_defs = true
 allow_incomplete_defs = true
 disable_error_code = import-untyped
-warn_return_any = false
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -1384,6 +1384,36 @@ struct test_leaky_relu : public test_case {
    }
 };

+// GGML_OP_FLASH_ATTN_EXT
+struct test_flash_attn_ext : public test_case {
+    const ggml_type typeq;
+    const int64_t hs; // head size
+    const int64_t nh; // num heads
+    const int64_t kv; // kv size
+    const int64_t nb; // batch size
+
+    std::string vars() override {
+        return VARS_TO_STR5(typeq, hs, nh, kv, nb);
+    }
+
+    double max_nmse_err() override {
+        return 5e-5;
+    }
+
+    test_flash_attn_ext(ggml_type typeq = GGML_TYPE_F16,
+            int64_t hs = 128, int64_t nh = 32, int64_t kv = 96, int64_t nb = 8)
+        : typeq(typeq), hs(hs), nh(nh), kv(kv), nb(nb) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * q = ggml_new_tensor_4d(ctx, typeq, hs, nb, nh, 1);
+        ggml_tensor * k = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, hs, kv, nh, 1);
+        ggml_tensor * v = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, hs, kv, nh, 1);
+        ggml_tensor * mask = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, kv, nb, 1, 1);
+        ggml_tensor * out = ggml_flash_attn_ext(ctx, q, k, v, mask, 1.0f/sqrtf(hs));
+        return out;
+    }
+};
+
 // Mixtral MOE
 struct test_moe : public test_case {
    const int n_experts;
@@ -1650,6 +1680,10 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
    test_cases.emplace_back(new test_pad());
    test_cases.emplace_back(new test_leaky_relu());

+    test_cases.emplace_back(new test_flash_attn_ext(GGML_TYPE_F16, 128, 32, 256, 8));
+    test_cases.emplace_back(new test_flash_attn_ext(GGML_TYPE_F16, 128, 32, 256, 7));
+    test_cases.emplace_back(new test_flash_attn_ext(GGML_TYPE_F16, 128, 32, 256, 1));
+
 #if !defined(__SANITIZE_THREAD__)
    // FIXME: these tests use too much memory with thread sanitizer
    test_cases.emplace_back(new test_moe(8, 2, 1, 4096, 8*1024));
--- a/unicode.h
+++ b/unicode.h
@@ -2,9 +2,8 @@

 #include <cassert>
 #include <stdexcept>
-#include <string>
-#include <unordered_map>
 #include <vector>
+#include <unordered_map>

 static const std::vector<std::pair<uint32_t, uint32_t>> digit_ranges = {
 {0x30, 0x39}, {0xB2, 0xB3}, {0xB9, 0xB9}, {0x660, 0x669}, {0x6F0, 0x6F9}, {0x7C0, 0x7C9}, {0x966, 0x96F}, {0x9E6, 0x9EF}, {0xA66, 0xA6F}, {0xAE6, 0xAEF}, {0xB66, 0xB6F}, {0xBE6, 0xBEF}, {0xC66, 0xC6F},
Author	SHA1	Message	Date
Georgi Gerganov	da23b56f25	wip : no ic 8 step	2024-01-24 13:25:34 +02:00
Georgi Gerganov	af3eda9c77	wip	2024-01-24 11:18:24 +02:00
Georgi Gerganov	5cbdba693d	wip	2024-01-24 10:16:05 +02:00
Georgi Gerganov	035c4f01e6	wip	2024-01-24 00:01:54 +02:00
Georgi Gerganov	06c2d0d117	wip	2024-01-23 22:42:43 +02:00
Georgi Gerganov	17720fad66	metal : parallel reduce across heads	2024-01-21 23:01:46 +02:00
Georgi Gerganov	77d08f3272	metal : parallelize across KV size	2024-01-21 22:26:45 +02:00
Georgi Gerganov	a4b6341c7b	wip : template for rows per warp	2024-01-21 19:06:30 +02:00
Georgi Gerganov	f31955f5d1	wip : 4 rows per simd group	2024-01-21 18:01:28 +02:00
Georgi Gerganov	8cde449b8b	wip : 8 rows per simd group	2024-01-21 17:37:24 +02:00
Georgi Gerganov	b97325800a	metal : specialize for head size	2024-01-21 12:01:55 +02:00
Georgi Gerganov	52ae085750	metal : reduce branches	2024-01-21 11:59:09 +02:00
Georgi Gerganov	528da7515e	metal : f16 precision	2024-01-21 11:13:24 +02:00
Georgi Gerganov	1173f49c3b	metal : initial implementation	2024-01-21 10:15:02 +02:00
Georgi Gerganov	a9681febd6	ggml : online attention (CPU)	2024-01-20 16:45:41 +02:00
Georgi Gerganov	c3cdfffa88	Merge branch 'master' into gg/flash-attn	2024-01-20 10:12:07 +02:00
Georgi Gerganov	fa7ebcca99	ggml : fix GQA support in ggml_flash_attn_ext	2024-01-19 20:06:26 +02:00
Georgi Gerganov	a1c004ef2e	ggml : add ggml_flash_attn_ext API	2024-01-18 18:55:48 +02:00