ci : enable git lfs for build.yml

Revert "tmp : dummy change to trigger ci"
This reverts commit 97e40df5d6.
2026-02-12 14:03:20 +02:00 · 2024-05-08 10:53:02 +03:00 · 2024-05-08 10:42:25 +03:00 · 2024-05-08 10:42:11 +03:00 · 2024-05-08 10:30:25 +03:00 · 2024-05-08 10:24:53 +03:00
95 changed files with 37295 additions and 49436 deletions
--- a/.gitattributes
+++ b/.gitattributes
@@ -0,0 +1 @@
+models/ggml-vocab-*.gguf filter=lfs diff=lfs merge=lfs -text
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -33,6 +33,7 @@ jobs:
        id: checkout
        uses: actions/checkout@v4
        with:
+          lfs: true
          fetch-depth: 0

      - name: Dependencies
@@ -91,6 +92,7 @@ jobs:
        id: checkout
        uses: actions/checkout@v4
        with:
+          lfs: true
          fetch-depth: 0

      - name: Dependencies
@@ -153,6 +155,8 @@ jobs:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4
+        with:
+          lfs: true

      - name: Dependencies
        id: depends
@@ -188,6 +192,8 @@ jobs:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4
+        with:
+          lfs: true

      - name: Dependencies
        id: depends
@@ -211,6 +217,7 @@ jobs:
        id: checkout
        uses: actions/checkout@v4
        with:
+          lfs: true
          fetch-depth: 0

      - name: Dependencies
@@ -285,6 +292,8 @@ jobs:
 #      - name: Clone
 #        id: checkout
 #        uses: actions/checkout@v4
+#        with:
+#          lfs: true
 #
 #      - name: Dependencies
 #        id: depends
@@ -319,6 +328,8 @@ jobs:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4
+        with:
+          lfs: true

      - name: Dependencies
        id: depends
@@ -347,6 +358,8 @@ jobs:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4
+        with:
+          lfs: true

      - name: Dependencies
        id: depends
@@ -369,6 +382,8 @@ jobs:

    steps:
      - uses: actions/checkout@v2
+        with:
+          lfs: true

      - name: add oneAPI to apt
        shell: bash
@@ -393,6 +408,8 @@ jobs:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4
+        with:
+          lfs: true

      - name: Build
        id: cmake_build
@@ -410,6 +427,8 @@ jobs:

    steps:
      - uses: actions/checkout@v2
+        with:
+          lfs: true

      - name: add oneAPI to apt
        shell: bash
@@ -434,6 +453,8 @@ jobs:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4
+        with:
+          lfs: true

      - name: Build
        id: cmake_build
@@ -454,6 +475,8 @@ jobs:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4
+        with:
+          lfs: true

      - name: Dependencies
        id: depends
@@ -485,6 +508,8 @@ jobs:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4
+        with:
+          lfs: true

      - name: Dependencies
        id: depends
@@ -514,6 +539,8 @@ jobs:
      - name: Clone
        id: checkout
        uses: actions/checkout@v1
+        with:
+          lfs: true

      - name: Dependencies
        id: depends
@@ -543,6 +570,8 @@ jobs:
      - name: Clone
        id: checkout
        uses: actions/checkout@v1
+        with:
+          lfs: true

      - name: Dependencies
        id: depends
@@ -576,6 +605,8 @@ jobs:
      - name: Clone
        id: checkout
        uses: actions/checkout@v1
+        with:
+          lfs: true

      - name: Dependencies
        id: depends
@@ -606,6 +637,8 @@ jobs:
    steps:
      - name: Clone
        uses: actions/checkout@v4
+        with:
+          lfs: true

      - name: Setup ${{ matrix.sys }}
        uses: msys2/setup-msys2@v2
@@ -687,6 +720,7 @@ jobs:
        id: checkout
        uses: actions/checkout@v4
        with:
+          lfs: true
          fetch-depth: 0

      - name: Clone Kompute submodule
@@ -833,6 +867,7 @@ jobs:
        id: checkout
        uses: actions/checkout@v4
        with:
+          lfs: true
          fetch-depth: 0

      - uses: Jimver/cuda-toolkit@v0.2.11
@@ -906,6 +941,7 @@ jobs:
        id: checkout
        uses: actions/checkout@v4
        with:
+          lfs: true
          fetch-depth: 0

      - name: Install
@@ -947,6 +983,8 @@ jobs:
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
+        with:
+          lfs: true

      - name: Build Xcode project
        run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build
@@ -957,6 +995,8 @@ jobs:
    steps:
      - name: Clone
        uses: actions/checkout@v4
+        with:
+          lfs: true

      - name: Set up JDK
        uses: actions/setup-java@v3
@@ -979,7 +1019,9 @@ jobs:
 #    runs-on: macos-12
 #    steps:
 #    - name: Clone
-#      uses: actions/checkout@v4
+#      uses: actions/checkout@#v4
+#        with:
+#          lfs: true
 #
 #    - name: Build
 #      uses: cross-platform-actions/action@v0.19.0
@@ -1012,6 +1054,7 @@ jobs:
        id: checkout
        uses: actions/checkout@v4
        with:
+          lfs: true
          fetch-depth: 0

      - name: Determine tag name
@@ -1077,6 +1120,8 @@ jobs:
 #    steps:
 #      - name: Clone
 #        uses: actions/checkout@v4
+#        with:
+#          lfs: true
 #
 #      - name: Dependencies
 #        run: |
@@ -1101,6 +1146,8 @@ jobs:
 #    steps:
 #      - name: Clone
 #        uses: actions/checkout@v4
+#        with:
+#          lfs: true
 #
 #      - name: Dependencies
 #        run: |
@@ -1125,6 +1172,8 @@ jobs:
 #    steps:
 #      - name: Clone
 #        uses: actions/checkout@v4
+#        with:
+#          lfs: true
 #
 #      - name: Dependencies
 #        run: |
@@ -1155,6 +1204,8 @@ jobs:
 #    steps:
 #      - name: Clone
 #        uses: actions/checkout@v4
+#        with:
+#          lfs: true
 #
 #      - name: Add msbuild to PATH
 #        uses: microsoft/setup-msbuild@v1
@@ -1194,6 +1245,8 @@ jobs:
 #    steps:
 #      - name: Clone
 #        uses: actions/checkout@v4
+#        with:
+#          lfs: true
 #
 #      - name: Add msbuild to PATH
 #        uses: microsoft/setup-msbuild@v1
@@ -1240,6 +1293,8 @@ jobs:
 #    steps:
 #      - name: Clone
 #        uses: actions/checkout@v4
+#        with:
+#          lfs: true
 #
 #      - name: Dependencies
 #        run: |
--- a/.github/workflows/code-coverage.yml
+++ b/.github/workflows/code-coverage.yml
@@ -13,14 +13,16 @@ jobs:
  run:
    runs-on: ubuntu-20.04
    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-
      - name: Dependencies
        run: |
          sudo apt-get update
          sudo apt-get install build-essential gcc-8 lcov

+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          lfs: true
+
      - name: Build
        run: CC=gcc-8 make -j LLAMA_CODE_COVERAGE=1 tests

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -405,7 +405,6 @@ if (LLAMA_CUDA)
        list(APPEND GGML_SOURCES_CUDA "ggml-cuda.cu")

        add_compile_definitions(GGML_USE_CUDA)
-        add_compile_definitions(GGML_CUDA_USE_GRAPHS)
        if (LLAMA_CUDA_FORCE_DMMV)
            add_compile_definitions(GGML_CUDA_FORCE_DMMV)
        endif()
@@ -431,7 +430,7 @@ if (LLAMA_CUDA)

        if (LLAMA_STATIC)
            if (WIN32)
-                # As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library
+                # As of 12.3.1 CUDA Tookit for Windows does not offer a static cublas library
                set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
            else ()
                set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
--- a/2
+++ b/2
@@ -433,7 +433,7 @@ ifdef LLAMA_CUDA
 	else
 		CUDA_PATH ?= /usr/local/cuda
 	endif
-	MK_CPPFLAGS  += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include -DGGML_CUDA_USE_GRAPHS
+	MK_CPPFLAGS  += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
 	MK_LDFLAGS   += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
 	OBJS         += ggml-cuda.o
 	OBJS         += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@

 ![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png)

-[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT) [![Server](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml/badge.svg?branch=master&event=schedule)](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml)
+[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)

 [Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)

@@ -176,7 +176,6 @@ Unless otherwise noted these projects are open-source with permissive licensing:
 - [nat/openplayground](https://github.com/nat/openplayground)
 - [Faraday](https://faraday.dev/) (proprietary)
 - [LMStudio](https://lmstudio.ai/) (proprietary)
- [Layla](https://play.google.com/store/apps/details?id=com.laylalite) (proprietary)
 - [LocalAI](https://github.com/mudler/LocalAI) (MIT)
 - [LostRuins/koboldcpp](https://github.com/LostRuins/koboldcpp) (AGPL)
 - [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile)
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1,6 +1,4 @@
 #include "common.h"
-// Change JSON_ASSERT from assert() to GGML_ASSERT:
-#define JSON_ASSERT GGML_ASSERT
 #include "json.hpp"
 #include "json-schema-to-grammar.h"
 #include "llama.h"
@@ -901,10 +899,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
        params.interactive = true;
        return true;
    }
-    if (arg == "--interactive-specials") {
-        params.interactive_specials = true;
-        return true;
-    }
    if (arg == "--embedding") {
        params.embedding = true;
        return true;
@@ -917,10 +911,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
        params.instruct = true;
        return true;
    }
-    if (arg == "-cnv" || arg == "--conversation") {
-        params.conversation = true;
-        return true;
-    }
    if (arg == "-cml" || arg == "--chatml") {
        params.chatml = true;
        return true;
@@ -1371,12 +1361,14 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
        if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
            std::replace(arg.begin(), arg.end(), '_', '-');
        }
+
        if (!gpt_params_find_arg(argc, argv, arg, params, i, invalid_param)) {
            throw std::invalid_argument("error: unknown argument: " + arg);
        }
-        if (invalid_param) {
-            throw std::invalid_argument("error: invalid parameter for argument: " + arg);
-        }
+    }
+
+    if (invalid_param) {
+        throw std::invalid_argument("error: invalid parameter for argument: " + arg);
    }

    if (params.prompt_cache_all &&
@@ -1424,9 +1416,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("  -h, --help            show this help message and exit\n");
    printf("  --version             show version and build info\n");
    printf("  -i, --interactive     run in interactive mode\n");
-    printf("  --interactive-specials allow special tokens in user text, in interactive mode\n");
    printf("  --interactive-first   run in interactive mode and wait for input right away\n");
-    printf("  -cnv, --conversation  run in conversation mode (does not print special tokens and suffix/prefix)\n");
    printf("  -ins, --instruct      run in instruction mode (use with Alpaca models)\n");
    printf("  -cml, --chatml        run in chatml mode (use with ChatML-compatible models)\n");
    printf("  --multiline-input     allows you to write or paste multiple lines without ending each in '\\'\n");
@@ -1974,18 +1964,18 @@ static bool llama_download_file(const std::string & url, const std::string & pat
            try {
                metadata_in >> metadata;
                fprintf(stderr, "%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
-                if (metadata.contains("url") && metadata.at("url").is_string()) {
-                    auto previous_url = metadata.at("url").get<std::string>();
+                if (metadata.contains("url") && metadata["url"].is_string()) {
+                    auto previous_url = metadata["url"].get<std::string>();
                    if (previous_url != url) {
                        fprintf(stderr, "%s: Model URL mismatch: %s != %s\n", __func__, url.c_str(), previous_url.c_str());
                        return false;
                    }
                }
-                if (metadata.contains("etag") && metadata.at("etag").is_string()) {
-                    etag = metadata.at("etag");
+                if (metadata.contains("etag") && metadata["etag"].is_string()) {
+                    etag = metadata["etag"];
                }
-                if (metadata.contains("lastModified") && metadata.at("lastModified").is_string()) {
-                    last_modified = metadata.at("lastModified");
+                if (metadata.contains("lastModified") && metadata["lastModified"].is_string()) {
+                    last_modified = metadata["lastModified"];
                }
            } catch (const nlohmann::json::exception & e) {
                fprintf(stderr, "%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
@@ -2655,7 +2645,6 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
    dump_string_yaml_multiline(stream, "in_suffix", params.input_prefix.c_str());
    fprintf(stream, "instruct: %s # default: false\n", params.instruct ? "true" : "false");
    fprintf(stream, "interactive: %s # default: false\n", params.interactive ? "true" : "false");
-    fprintf(stream, "interactive_specials: %s # default: false\n", params.interactive_specials ? "true" : "false");
    fprintf(stream, "interactive_first: %s # default: false\n", params.interactive_first ? "true" : "false");
    fprintf(stream, "keep: %d # default: 0\n", params.n_keep);
    fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());
--- a/common/common.h
+++ b/common/common.h
@@ -140,8 +140,6 @@ struct gpt_params {
    bool random_prompt     = false; // do not randomize prompt if none provided
    bool use_color         = false; // use color to distinguish generations and inputs
    bool interactive       = false; // interactive mode
-    bool interactive_specials = false; // whether to allow special tokens from user, during interactive mode
-    bool conversation      = false; // conversation mode (does not print special tokens and suffix/prefix)
    bool chatml            = false; // chatml mode (used for models trained on chatml syntax)
    bool prompt_cache_all  = false; // save user input and generations to prompt cache
    bool prompt_cache_ro   = false; // open the prompt cache read-only and do not update it
--- a/common/grammar-parser.cpp
+++ b/common/grammar-parser.cpp
@@ -142,9 +142,6 @@ namespace grammar_parser {
                pos++;
                last_sym_start = out_elements.size();
                while (*pos != '"') {
-                    if (!*pos) {
-                        throw std::runtime_error("unexpected end of input");
-                    }
                    auto char_pair = parse_char(pos);
                         pos       = char_pair.second;
                    out_elements.push_back({LLAMA_GRETYPE_CHAR, char_pair.first});
@@ -159,9 +156,6 @@ namespace grammar_parser {
                }
                last_sym_start = out_elements.size();
                while (*pos != ']') {
-                    if (!*pos) {
-                        throw std::runtime_error("unexpected end of input");
-                    }
                    auto char_pair = parse_char(pos);
                         pos       = char_pair.second;
                    enum llama_gretype type = last_sym_start < out_elements.size()
@@ -170,9 +164,6 @@ namespace grammar_parser {

                    out_elements.push_back({type, char_pair.first});
                    if (pos[0] == '-' && pos[1] != ']') {
-                        if (!pos[1]) {
-                            throw std::runtime_error("unexpected end of input");
-                        }
                        auto endchar_pair = parse_char(pos + 1);
                             pos          = endchar_pair.second;
                        out_elements.push_back({LLAMA_GRETYPE_CHAR_RNG_UPPER, endchar_pair.first});
--- a/common/json-schema-to-grammar.h
+++ b/common/json-schema-to-grammar.h
@@ -1,8 +1,4 @@
 #pragma once
-
-#include "ggml.h"
-// Change JSON_ASSERT from assert() to GGML_ASSERT:
-#define JSON_ASSERT GGML_ASSERT
 #include "json.hpp"

 std::string json_schema_to_grammar(const nlohmann::ordered_json& schema);
--- a/convert-hf-to-gguf-update.py
+++ b/convert-hf-to-gguf-update.py
@@ -49,10 +49,6 @@ chktxt = '\n \n\n \n\n\n \t \t\t \t\n  \n   \n    \n     \n🚀 (normal) 😶‍

 if len(sys.argv) == 2:
    token = sys.argv[1]
-    if not token.startswith("hf_"):
-        logger.info("Huggingface token seems invalid")
-        logger.info("Usage: python convert-hf-to-gguf-update.py <huggingface_token>")
-        sys.exit(1)
 else:
    logger.info("Usage: python convert-hf-to-gguf-update.py <huggingface_token>")
    sys.exit(1)
@@ -71,9 +67,7 @@ models = [
    {"name": "gpt-2",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
    {"name": "refact",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
    {"name": "command-r",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", },
-    {"name": "qwen2",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
    {"name": "olmo",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", },
-    {"name": "dbrx",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", },
 ]

 # make directory "models/tokenizers" if it doesn't exist
@@ -157,8 +151,6 @@ for model in models:
    # print the "pre_tokenizer" content from the tokenizer.json
    with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f:
        cfg = json.load(f)
-        normalizer = cfg["normalizer"]
-        logger.info("normalizer: " + json.dumps(normalizer, indent=4))
        pre_tokenizer = cfg["pre_tokenizer"]
        logger.info("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))

@@ -261,7 +253,6 @@ tests = [
    "3333333",
    "33333333",
    "333333333",
-    # "Cửa Việt", # llama-bpe fails on this
    chktxt,
 ]

--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
--- a/convert.py
+++ b/convert.py
@@ -284,7 +284,6 @@ class Params:
        n_experts      = None
        n_experts_used = None
        f_rope_freq_base = None
-        n_ff = None

        # hack to determine LLaMA v1 vs v2 vs CodeLlama
        if config.get("moe"):
@@ -309,8 +308,6 @@ class Params:
            n_experts_used = config["moe"]["num_experts_per_tok"]
            f_rope_freq_base = 1e6

-        assert n_ff is not None
-
        return Params(
            n_vocab          = model["tok_embeddings.weight"].shape[0],
            n_embd           = config["dim"],
@@ -465,8 +462,7 @@ class SentencePieceVocab(Vocab):
            # not found in alternate location either
            raise FileNotFoundError('Cannot find tokenizer.model')

-        self.sentencepiece_tokenizer = SentencePieceProcessor()
-        self.sentencepiece_tokenizer.LoadFromFile(str(fname_tokenizer))
+        self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
        vocab_size = self.sentencepiece_tokenizer.vocab_size()

        new_tokens       = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
@@ -486,23 +482,23 @@ class SentencePieceVocab(Vocab):
    def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        tokenizer = self.sentencepiece_tokenizer
        for i in range(tokenizer.vocab_size()):
-            piece = tokenizer.IdToPiece(i)
+            piece = tokenizer.id_to_piece(i)
            text         = piece.encode("utf-8")
-            score: float = tokenizer.GetScore(i)
+            score: float = tokenizer.get_score(i)

            toktype = gguf.TokenType.NORMAL
-            if tokenizer.IsUnknown(i):
+            if tokenizer.is_unknown(i):
                toktype = gguf.TokenType.UNKNOWN
-            if tokenizer.IsControl(i):
+            if tokenizer.is_control(i):
                toktype = gguf.TokenType.CONTROL

            # NOTE: I think added_tokens are user defined.
            # ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
            # if tokenizer.is_user_defined(i): toktype = gguf.TokenType.USER_DEFINED

-            if tokenizer.IsUnused(i):
+            if tokenizer.is_unused(i):
                toktype = gguf.TokenType.UNUSED
-            if tokenizer.IsByte(i):
+            if tokenizer.is_byte(i):
                toktype = gguf.TokenType.BYTE

            yield text, score, toktype
@@ -910,7 +906,7 @@ class LazyUnpickler(pickle.Unpickler):
    def rebuild_from_type_v2(func, new_type, args, state):
        return func(*args)

-    CLASSES: dict[tuple[str, str], type[LazyTensor] | LazyStorageKind] = {
+    CLASSES = {
        # getattr used here as a workaround for mypy not being smart enough to determine
        # the staticmethods have a __func__ attribute.
        ('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'),
@@ -1512,27 +1508,25 @@ def main(args_in: list[str] | None = None) -> None:
    if args.big_endian:
        endianess = gguf.GGUFEndian.BIG

-    params = None
-    if args.pad_vocab or not args.vocab_only:
-        params = Params.load(model_plus)
-        if params.n_ctx == -1:
-            if args.ctx is None:
-                msg = """\
-                    The model doesn't have a context size, and you didn't specify one with --ctx
-                    Please specify one with --ctx:
-                     - LLaMA v1: --ctx 2048
-                     - LLaMA v2: --ctx 4096"""
-                parser.error(textwrap.dedent(msg))
-            params.n_ctx = args.ctx
+    params = Params.load(model_plus)
+    if params.n_ctx == -1:
+        if args.ctx is None:
+            msg = """\
+                The model doesn't have a context size, and you didn't specify one with --ctx
+                Please specify one with --ctx:
+                 - LLaMA v1: --ctx 2048
+                 - LLaMA v2: --ctx 4096"""
+            parser.error(textwrap.dedent(msg))
+        params.n_ctx = args.ctx

-        if args.outtype:
-            params.ftype = {
-                "f32": GGMLFileType.AllF32,
-                "f16": GGMLFileType.MostlyF16,
-                "q8_0": GGMLFileType.MostlyQ8_0,
-            }[args.outtype]
+    if args.outtype:
+        params.ftype = {
+            "f32": GGMLFileType.AllF32,
+            "f16": GGMLFileType.MostlyF16,
+            "q8_0": GGMLFileType.MostlyQ8_0,
+        }[args.outtype]

-        logger.info(f"params = {params}")
+    logger.info(f"params = {params}")

    model_parent_path = model_plus.paths[0].parent
    vocab_path = Path(args.vocab_dir or args.model or model_parent_path)
@@ -1545,17 +1539,6 @@ def main(args_in: list[str] | None = None) -> None:
        if not args.outfile:
            raise ValueError("need --outfile if using --vocab-only")
        outfile = args.outfile
-        if params is None:
-            params = Params(
-                n_vocab    = vocab.vocab_size,
-                n_embd     = 1,
-                n_layer    = 1,
-                n_ctx      = 1,
-                n_ff       = 1,
-                n_head     = 1,
-                n_head_kv  = 1,
-                f_norm_eps = 1e-5,
-            )
        OutputFile.write_vocab_only(outfile, params, vocab, special_vocab,
                                    endianess=endianess, pad_vocab=args.pad_vocab)
        logger.info(f"Wrote {outfile}")
--- a/examples/convert-llama2c-to-ggml/README.md
+++ b/examples/convert-llama2c-to-ggml/README.md
@@ -2,7 +2,7 @@

 This example reads weights from project [llama2.c](https://github.com/karpathy/llama2.c) and saves them in ggml compatible format. The vocab that is available in `models/ggml-vocab.bin` is used by default.

-To convert the model first download the models from the [llama2.c](https://github.com/karpathy/llama2.c) repository:
+To convert the model first download the models from the [llma2.c](https://github.com/karpathy/llama2.c) repository:

 `$ make -j`

--- a/examples/eval-callback/eval-callback.cpp
+++ b/examples/eval-callback/eval-callback.cpp
@@ -52,15 +52,15 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
                    size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
                    float v;
                    if (type == GGML_TYPE_F16) {
-                        v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]);
+                        v = ggml_fp16_to_fp32(*(ggml_fp16_t *) data + i);
                    } else if (type == GGML_TYPE_F32) {
-                        v = *(float *) &data[i];
+                        v = *(float *) data + i;
                    } else if (type == GGML_TYPE_I32) {
-                        v = (float) *(int32_t *) &data[i];
+                        v = (float) *(int32_t *) data + i;
                    } else if (type == GGML_TYPE_I16) {
-                        v = (float) *(int16_t *) &data[i];
+                        v = (float) *(int16_t *) data + i;
                    } else if (type == GGML_TYPE_I8) {
-                        v = (float) *(int8_t *) &data[i];
+                        v = (float) *(int8_t *) data + i;
                    } else {
                        GGML_ASSERT(false);
                    }
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -573,13 +573,13 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
    struct ggml_tensor * embeddings = inp;
    if (ctx->has_class_embedding) {
        embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
-        ggml_set_name(embeddings, "embeddings");
-        ggml_set_input(embeddings);
        embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
                embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
        embeddings = ggml_acc(ctx0, embeddings, inp,
                embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
    }
+    ggml_set_name(embeddings, "embeddings");
+    ggml_set_input(embeddings);


    struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
@@ -1846,7 +1846,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
    const int image_size    = hparams.image_size;
    const int patch_size    = hparams.patch_size;
    const int num_patches   = ((image_size / patch_size) * (image_size / patch_size));
-    const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
+    const int num_positions = num_patches + 1;

    {
        struct ggml_tensor * inp_raw = ggml_graph_get_tensor(gf, "inp_raw");
@@ -1874,14 +1874,12 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
    }

    {
-        if (ctx->has_class_embedding) {
-            struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings");
+        struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings");

-            void* zero_mem = malloc(ggml_nbytes(embeddings));
-            memset(zero_mem, 0, ggml_nbytes(embeddings));
-            ggml_backend_tensor_set(embeddings, zero_mem, 0, ggml_nbytes(embeddings));
-            free(zero_mem);
-        }
+        void* zero_mem = malloc(ggml_nbytes(embeddings));
+        memset(zero_mem, 0, ggml_nbytes(embeddings));
+        ggml_backend_tensor_set(embeddings, zero_mem, 0, ggml_nbytes(embeddings));
+        free(zero_mem);
    }

    {
--- a/examples/llava/llava-cli.cpp
+++ b/examples/llava/llava-cli.cpp
@@ -189,11 +189,6 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
    LOG_TEE("\n");

    struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams);
-    if (!ctx_sampling) {
-        fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
-        exit(1);
-    }
-
    std::string response = "";
    for (int i = 0; i < max_tgt_len; i++) {
        const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past);
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -362,9 +362,6 @@ int main(int argc, char ** argv) {
        params.interactive_first = true;
        params.antiprompt.emplace_back("<|im_start|>user\n");
    }
-    else if (params.conversation) {
-        params.interactive_first = true;
-    }

    // enable interactive mode if interactive start is specified
    if (params.interactive_first) {
@@ -523,10 +520,6 @@ int main(int argc, char ** argv) {
    }

    struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
-    if (!ctx_sampling) {
-        fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__);
-        exit(1);
-    }

    while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
        // predict
@@ -740,7 +733,7 @@ int main(int argc, char ** argv) {
        // display text
        if (input_echo && display) {
            for (auto id : embd) {
-                const std::string token_str = llama_token_to_piece(ctx, id, !params.conversation);
+                const std::string token_str = llama_token_to_piece(ctx, id);
                printf("%s", token_str.c_str());

                if (embd.size() > 1) {
@@ -823,7 +816,7 @@ int main(int argc, char ** argv) {
            if (n_past > 0 && is_interacting) {
                LOG("waiting for user input\n");

-                if (params.conversation || params.instruct || params.chatml) {
+                if (params.instruct || params.chatml) {
                    printf("\n> ");
                }

@@ -833,7 +826,7 @@ int main(int argc, char ** argv) {
                }

                std::string buffer;
-                if (!params.input_prefix.empty() && !params.conversation) {
+                if (!params.input_prefix.empty()) {
                    LOG("appending input prefix: '%s'\n", params.input_prefix.c_str());
                    printf("%s", params.input_prefix.c_str());
                }
@@ -857,7 +850,7 @@ int main(int argc, char ** argv) {
                // Entering a empty line lets the user pass control back
                if (buffer.length() > 1) {
                    // append input suffix if any
-                    if (!params.input_suffix.empty() && !params.conversation) {
+                    if (!params.input_suffix.empty()) {
                        LOG("appending input suffix: '%s'\n", params.input_suffix.c_str());
                        printf("%s", params.input_suffix.c_str());
                    }
@@ -883,7 +876,7 @@ int main(int argc, char ** argv) {
                    }

                    const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
-                    const auto line_inp = ::llama_tokenize(ctx, buffer,              false, params.interactive_specials);
+                    const auto line_inp = ::llama_tokenize(ctx, buffer,              false, false);
                    const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);

                    LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -331,7 +331,7 @@ Notice that each `probs` is an array of length `n_probs`.

    `content`: Set the text to tokenize.

-    `add_special`: Boolean indicating if special tokens, i.e. `BOS`, should be inserted.  Default: `false`
+    Note that a special `BOS` token is never inserted.

 - **POST** `/detokenize`: Convert tokens to text.

--- a/examples/server/public/favicon.ico
+++ b/examples/server/public/favicon.ico
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -12,8 +12,6 @@
 // increase max payload length to allow use of larger context size
 #define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576
 #include "httplib.h"
-// Change JSON_ASSERT from assert() to GGML_ASSERT:
-#define JSON_ASSERT GGML_ASSERT
 #include "json.hpp"

 // auto generated files (update with ./deps.sh)
@@ -861,7 +859,7 @@ struct server_context {
        slot.sparams.min_keep          = json_value(data, "min_keep",          default_sparams.min_keep);

        // process "json_schema" and "grammar"
-        if (data.contains("json_schema") && !data.at("json_schema").is_null() && data.contains("grammar") && !data.at("grammar").is_null()) {
+        if (data.contains("json_schema") && !data["json_schema"].is_null() && data.contains("grammar") && !data["grammar"].is_null()) {
            send_error(task, "Either \"json_schema\" or \"grammar\" can be specified, but not both", ERROR_TYPE_INVALID_REQUEST);
            return false;
        } else if (data.contains("json_schema") && !data.contains("grammar")) {
@@ -1514,7 +1512,7 @@ struct server_context {
        // add subtasks
        for (int i = 0; i < prompt_count; i++) {
            json subtask_data = multiprompt_task.data;
-            subtask_data["prompt"] = subtask_data.at("prompt")[i];
+            subtask_data["prompt"] = subtask_data["prompt"][i];

            // subtasks inherit everything else (infill mode, embedding mode, etc.)
            request_completion(subtask_ids[i], id_multi, subtask_data, multiprompt_task.infill, multiprompt_task.embedding);
@@ -1534,7 +1532,7 @@ struct server_context {
                    }

                    if (task.data.contains("system_prompt")) {
-                        system_prompt_set(task.data.at("system_prompt"));
+                        system_prompt_set(task.data["system_prompt"]);

                        for (server_slot & slot : slots) {
                            slot.n_past    = 0;
@@ -1646,7 +1644,7 @@ struct server_context {
                } break;
            case SERVER_TASK_TYPE_SLOT_SAVE:
                {
-                    int id_slot = task.data.at("id_slot");
+                    int id_slot = task.data["id_slot"];
                    server_slot * slot = get_slot(id_slot);
                    if (slot == nullptr) {
                        send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
@@ -1656,8 +1654,8 @@ struct server_context {
                    const size_t token_count = slot->cache_tokens.size();
                    const int64_t t_start = ggml_time_us();

-                    std::string filename = task.data.at("filename");
-                    std::string filepath = task.data.at("filepath");
+                    std::string filename = task.data["filename"];
+                    std::string filepath = task.data["filepath"];

                    const size_t nwrite = llama_state_seq_save_file(ctx, filepath.c_str(), slot->id + 1, slot->cache_tokens.data(), token_count);

@@ -1681,7 +1679,7 @@ struct server_context {
                } break;
            case SERVER_TASK_TYPE_SLOT_RESTORE:
                {
-                    int id_slot = task.data.at("id_slot");
+                    int id_slot = task.data["id_slot"];
                    server_slot * slot = get_slot(id_slot);
                    if (slot == nullptr) {
                        send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
@@ -1690,8 +1688,8 @@ struct server_context {

                    const int64_t t_start = ggml_time_us();

-                    std::string filename = task.data.at("filename");
-                    std::string filepath = task.data.at("filepath");
+                    std::string filename = task.data["filename"];
+                    std::string filepath = task.data["filepath"];

                    slot->cache_tokens.resize(slot->n_ctx);
                    size_t token_count = 0;
@@ -1723,7 +1721,7 @@ struct server_context {
                } break;
            case SERVER_TASK_TYPE_SLOT_ERASE:
                {
-                    int id_slot = task.data.at("id_slot");
+                    int id_slot = task.data["id_slot"];
                    server_slot * slot = get_slot(id_slot);
                    if (slot == nullptr) {
                        send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
@@ -3138,8 +3136,8 @@ int main(int argc, char ** argv) {
                    server_task_result result = ctx_server.queue_results.recv(task.id);
                    ctx_server.queue_results.remove_waiting_task_id(task.id);

-                    const int n_idle_slots       = result.data.at("idle");
-                    const int n_processing_slots = result.data.at("processing");
+                    const int n_idle_slots       = result.data["idle"];
+                    const int n_processing_slots = result.data["processing"];

                    json health = {
                        {"status",           "ok"},
@@ -3149,7 +3147,7 @@ int main(int argc, char ** argv) {

                    res.status = 200; // HTTP OK
                    if (sparams.slots_endpoint && req.has_param("include_slots")) {
-                        health["slots"] = result.data.at("slots");
+                        health["slots"] = result.data["slots"];
                    }

                    if (n_idle_slots == 0) {
@@ -3193,7 +3191,7 @@ int main(int argc, char ** argv) {
        server_task_result result = ctx_server.queue_results.recv(task.id);
        ctx_server.queue_results.remove_waiting_task_id(task.id);

-        res.set_content(result.data.at("slots").dump(), "application/json");
+        res.set_content(result.data["slots"].dump(), "application/json");
        res.status = 200; // HTTP OK
    };

@@ -3220,32 +3218,32 @@ int main(int argc, char ** argv) {

        json data = result.data;

-        const uint64_t n_prompt_tokens_processed = data.at("n_prompt_tokens_processed");
-        const uint64_t t_prompt_processing       = data.at("t_prompt_processing");
+        const uint64_t n_prompt_tokens_processed = data["n_prompt_tokens_processed"];
+        const uint64_t t_prompt_processing       = data["t_prompt_processing"];

-        const uint64_t n_tokens_predicted  = data.at("n_tokens_predicted");
-        const uint64_t t_tokens_generation = data.at("t_tokens_generation");
+        const uint64_t n_tokens_predicted  = data["n_tokens_predicted"];
+        const uint64_t t_tokens_generation = data["t_tokens_generation"];

-        const int32_t kv_cache_used_cells = data.at("kv_cache_used_cells");
+        const int32_t kv_cache_used_cells = data["kv_cache_used_cells"];

        // metrics definition: https://prometheus.io/docs/practices/naming/#metric-names
        json all_metrics_def = json {
            {"counter", {{
                    {"name",  "prompt_tokens_total"},
                    {"help",  "Number of prompt tokens processed."},
-                    {"value",  (uint64_t) data.at("n_prompt_tokens_processed_total")}
+                    {"value",  (uint64_t) data["n_prompt_tokens_processed_total"]}
            }, {
                    {"name",  "prompt_seconds_total"},
                    {"help",  "Prompt process time"},
-                    {"value",  (uint64_t) data.at("t_prompt_processing_total") / 1.e3}
+                    {"value",  (uint64_t) data["t_prompt_processing_total"] / 1.e3}
            }, {
                    {"name",  "tokens_predicted_total"},
                    {"help",  "Number of generation tokens processed."},
-                    {"value",  (uint64_t) data.at("n_tokens_predicted_total")}
+                    {"value",  (uint64_t) data["n_tokens_predicted_total"]}
            }, {
                    {"name",  "tokens_predicted_seconds_total"},
                    {"help",  "Predict process time"},
-                    {"value",  (uint64_t) data.at("t_tokens_generation_total") / 1.e3}
+                    {"value",  (uint64_t) data["t_tokens_generation_total"] / 1.e3}
            }}},
            {"gauge", {{
                    {"name",  "prompt_tokens_seconds"},
@@ -3262,15 +3260,15 @@ int main(int argc, char ** argv) {
            },{
                    {"name",  "kv_cache_tokens"},
                    {"help",  "KV-cache tokens."},
-                    {"value",  (uint64_t) data.at("kv_cache_tokens_count")}
+                    {"value",  (uint64_t) data["kv_cache_tokens_count"]}
            },{
                    {"name",  "requests_processing"},
                    {"help",  "Number of request processing."},
-                    {"value",  (uint64_t) data.at("processing")}
+                    {"value",  (uint64_t) data["processing"]}
            },{
                    {"name",  "requests_deferred"},
                    {"help",  "Number of request deferred."},
-                    {"value",  (uint64_t) data.at("deferred")}
+                    {"value",  (uint64_t) data["deferred"]}
            }}}
        };

@@ -3281,8 +3279,8 @@ int main(int argc, char ** argv) {
            const auto & metrics_def = el.value();

            for (const auto & metric_def : metrics_def) {
-                const std::string name = metric_def.at("name");
-                const std::string help = metric_def.at("help");
+                const std::string name = metric_def["name"];
+                const std::string help = metric_def["help"];

                auto value = json_value(metric_def, "value", 0.);
                prometheus << "# HELP llamacpp:" << name << " " << help  << "\n"
@@ -3291,7 +3289,7 @@ int main(int argc, char ** argv) {
            }
        }

-        const int64_t t_start = data.at("t_start");
+        const int64_t t_start = data["t_start"];
        res.set_header("Process-Start-Time-Unix", std::to_string(t_start));

        res.set_content(prometheus.str(), "text/plain; version=0.0.4");
@@ -3300,7 +3298,7 @@ int main(int argc, char ** argv) {

    const auto handle_slots_save = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) {
        json request_data = json::parse(req.body);
-        std::string filename = request_data.at("filename");
+        std::string filename = request_data["filename"];
        if (!validate_file_name(filename)) {
            res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST));
            return;
@@ -3330,7 +3328,7 @@ int main(int argc, char ** argv) {

    const auto handle_slots_restore = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) {
        json request_data = json::parse(req.body);
-        std::string filename = request_data.at("filename");
+        std::string filename = request_data["filename"];
        if (!validate_file_name(filename)) {
            res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST));
            return;
@@ -3649,8 +3647,7 @@ int main(int argc, char ** argv) {

        std::vector<llama_token> tokens;
        if (body.count("content") != 0) {
-            const bool add_special = json_value(body, "add_special", false);
-            tokens = ctx_server.tokenize(body.at("content"), add_special);
+            tokens = ctx_server.tokenize(body["content"], false);
        }
        const json data = format_tokenizer_response(tokens);
        return res.set_content(data.dump(), "application/json; charset=utf-8");
@@ -3662,7 +3659,7 @@ int main(int argc, char ** argv) {

        std::string content;
        if (body.count("tokens") != 0) {
-            const std::vector<llama_token> tokens = body.at("tokens");
+            const std::vector<llama_token> tokens = body["tokens"];
            content = tokens_to_str(ctx_server.ctx, tokens.cbegin(), tokens.cend());
        }

@@ -3685,10 +3682,10 @@ int main(int argc, char ** argv) {
        json prompt;
        if (body.count("input") != 0) {
            is_openai = true;
-            prompt = body.at("input");
+            prompt = body["input"];
        } else if (body.count("content") != 0) {
            // with "content", we only support single prompt
-            prompt = std::vector<std::string>{body.at("content")};
+            prompt = std::vector<std::string>{body["content"]};
        } else {
            res_error(res, format_error_response("\"input\" or \"content\" must be provided", ERROR_TYPE_INVALID_REQUEST));
            return;
@@ -3707,7 +3704,7 @@ int main(int argc, char ** argv) {
            if (!result.error) {
                if (result.data.count("results")) {
                    // result for multi-task
-                    responses = result.data.at("results");
+                    responses = result.data["results"];
                } else {
                    // result for single task
                    responses = std::vector<json>{result.data};
--- a/examples/server/tests/features/server.feature
+++ b/examples/server/tests/features/server.feature
@@ -7,7 +7,6 @@ Feature: llama.cpp server
    And   a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
    And   a model file test-model.gguf
    And   a model alias tinyllama-2
-    And   BOS token is 1
    And   42 as server seed
      # KV Cache corresponds to the total amount of tokens
      # that can be stored across all independent sequences: #4130
@@ -92,18 +91,7 @@ Feature: llama.cpp server
    """
    What is the capital of France ?
    """
-    Then tokens can be detokenized
-    And  tokens do not begin with BOS
-
-  Scenario: Tokenize w/ BOS
-    Given adding special tokens
-    When  tokenizing:
-    """
-    What is the capital of Germany?
-    """
-    Then  tokens begin with BOS
-    Given first token is removed
-    Then  tokens can be detokenized
+    Then tokens can be detokenize

  Scenario: Models available
    Given available models
--- a/examples/server/tests/features/steps/steps.py
+++ b/examples/server/tests/features/steps/steps.py
@@ -376,11 +376,6 @@ def step_seed(context, seed):
        context.seed.append(seed)


-@step('BOS token is {bos:d}')
-def step_bos_token(context, bos):
-    context.bos = bos
-
-
@step('a prefix prompt')
 def step_prompt_prefix(context):
    context.prompt_prefix = context_text(context)
@@ -661,29 +656,21 @@ async def all_embeddings_are_generated(context):
        assert_embeddings(context.tasks_result.pop().pop())


-@step('adding special tokens')
-def step_tokenize_set_add_special(context):
-    context.tokenize_add_special = True
-
-
@step('tokenizing')
@async_run_until_complete
 async def step_tokenize(context):
    context.tokenized_text = context_text(context)
    async with aiohttp.ClientSession() as session:
-        tokenize_args = {
-            "content": context.tokenized_text,
-        }
-        if getattr(context, 'tokenize_add_special', None) is not None:
-            tokenize_args['add_special'] = context.tokenize_add_special
        async with session.post(f'{context.base_url}/tokenize',
-                                json=tokenize_args) as response:
+                                json={
+                                    "content": context.tokenized_text,
+                                }) as response:
            assert response.status == 200
            tokenize_json = await response.json()
            context.tokens = tokenize_json['tokens']


-@step('tokens can be detokenized')
+@step('tokens can be detokenize')
@async_run_until_complete
 async def step_detokenize(context):
    assert len(context.tokens) > 0
@@ -698,21 +685,6 @@ async def step_detokenize(context):
            assert context.tokenized_text == detokenize_json['content'].strip()


-@step('tokens begin with BOS')
-def step_strings_for_tokenization(context):
-    assert context.tokens[0] == context.bos
-
-
-@step('tokens do not begin with BOS')
-def step_strings_for_tokenization(context):
-    assert context.tokens[0] != context.bos
-
-
-@step('first token is removed')
-def step_strings_for_tokenization(context):
-    context.tokens = context.tokens[1:]
-
-
@step('an OPTIONS request is sent from {origin}')
@async_run_until_complete
 async def step_options_request(context, origin):
@@ -939,7 +911,7 @@ async def oai_chat_completions(user_prompt,
                    while event_received:
                        event_received = False
                        async for line_in_bytes in response.content:
-                            line = line_in_bytes.decode('utf-8')
+                            line = line_in_bytes.decode('utf8')
                            line = line.rstrip('\n').rstrip('\r')
                            if line == '':
                                continue
--- a/examples/server/themes/README.md
+++ b/examples/server/themes/README.md
@@ -1,5 +0,0 @@
-# LLaMA.cpp Server Wild Theme
-
-Simple themes directory of sample "public" directories. To try any of these add --path to your run like `server --path=wild`.
-
-![image](wild/wild.png)
--- a/examples/server/themes/buttons-top/README.md
+++ b/examples/server/themes/buttons-top/README.md
@@ -1,7 +0,0 @@
-# LLaMA.cpp Server Buttons Top Theme
-
-Simple tweaks to the UI. Chat buttons at the top of the page instead of bottom so you can hit Stop instead of chasing it down the page.
-
-To use simply run server with `--path=themes/buttons_top`
-
-![image](buttons_top.png)
--- a/examples/server/themes/buttons-top/buttons_top.png
+++ b/examples/server/themes/buttons-top/buttons_top.png
--- a/examples/server/themes/buttons-top/favicon.ico
+++ b/examples/server/themes/buttons-top/favicon.ico
--- a/examples/server/themes/buttons-top/index.html
+++ b/examples/server/themes/buttons-top/index.html
--- a/examples/server/themes/wild/README.md
+++ b/examples/server/themes/wild/README.md
@@ -1,5 +0,0 @@
-# LLaMA.cpp Server Wild Theme
-
-Simple tweaks to the UI. To use simply run server with `--path=themes/wild`
-
-![image](wild.png)
--- a/examples/server/themes/wild/favicon.ico
+++ b/examples/server/themes/wild/favicon.ico
--- a/examples/server/themes/wild/index.html
+++ b/examples/server/themes/wild/index.html
--- a/examples/server/themes/wild/llama_cpp.png
+++ b/examples/server/themes/wild/llama_cpp.png
--- a/examples/server/themes/wild/llamapattern.png
+++ b/examples/server/themes/wild/llamapattern.png
--- a/examples/server/themes/wild/wild.png
+++ b/examples/server/themes/wild/wild.png
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -3,8 +3,6 @@
 #include "llama.h"
 #include "common.h"

-// Change JSON_ASSERT from assert() to GGML_ASSERT:
-#define JSON_ASSERT GGML_ASSERT
 #include "json.hpp"

 #include <string>
@@ -51,18 +49,18 @@ extern bool server_log_json;
 #define LOG_WARNING(MSG, ...) server_log("WARN", __func__, __LINE__, MSG, __VA_ARGS__)
 #define LOG_INFO(   MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)

-static inline void server_log(const char * level, const char * function, int line, const char * message, const json & extra);
+static inline void server_log(const char *level, const char *function, int line, const char *message, const nlohmann::ordered_json &extra);

 template <typename T>
-static T json_value(const json & body, const std::string & key, const T & default_value) {
+static T json_value(const json &body, const std::string &key, const T &default_value) {
    // Fallback null to default value
-    if (body.contains(key) && !body.at(key).is_null()) {
+    if (body.contains(key) && !body.at(key).is_null()){
        try {
-            return body.at(key);
-        } catch (NLOHMANN_JSON_NAMESPACE::detail::type_error const &) {
-            std::stringstream ss;
-            ss << "Wrong type supplied for parameter '" << key << "'. Expected '" << json(default_value).type_name() << "', using default value.";
-            LOG_WARNING(ss.str().c_str(), body);
+            return body.value(key, default_value);
+        }
+        catch (nlohmann::json_abi_v3_11_3::detail::type_error const&){
+            std::string message = "Wrong type supplied for parameter '" + key + "'. Expected '" + typeid(default_value).name() + "', using default value.";
+            server_log("WARN", __func__, __LINE__, message.c_str(), body);
            return default_value;
        }
    } else {
@@ -70,16 +68,16 @@ static T json_value(const json & body, const std::string & key, const T & defaul
    }
 }

-static inline void server_log(const char * level, const char * function, int line, const char * message, const json & extra) {
+static inline void server_log(const char *level, const char *function, int line, const char *message, const nlohmann::ordered_json &extra) {
    std::stringstream ss_tid;
    ss_tid << std::this_thread::get_id();
-    json log = json{
+    json log = nlohmann::ordered_json{
        {"tid",       ss_tid.str()},
        {"timestamp", time(nullptr)},
    };

    if (server_log_json) {
-        log.merge_patch({
+        log.merge_patch( {
            {"level",    level},
            {"function", function},
            {"line",     line},
@@ -100,7 +98,7 @@ static inline void server_log(const char * level, const char * function, int lin
        }
        std::stringstream ss;
        ss << buf << " |";
-        for (const auto & el : log.items())
+        for (const auto& el : log.items())
        {
            const std::string value = el.value().dump(-1, ' ', false, json::error_handler_t::replace);
            ss << " " << el.key() << "=" << value;
@@ -375,11 +373,11 @@ static json oaicompat_completion_params_parse(
    llama_params["top_p"]             = json_value(body,   "top_p",             1.0);

    // Apply chat template to the list of messages
-    llama_params["prompt"] = format_chat(model, chat_template, body.at("messages"));
+    llama_params["prompt"] = format_chat(model, chat_template, body["messages"]);

    // Handle "stop" field
-    if (body.contains("stop") && body.at("stop").is_string()) {
-        llama_params["stop"] = json::array({body.at("stop").get<std::string>()});
+    if (body.contains("stop") && body["stop"].is_string()) {
+        llama_params["stop"] = json::array({body["stop"].get<std::string>()});
    } else {
        llama_params["stop"] = json_value(body, "stop", json::array());
    }
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -1647,7 +1647,7 @@ static void ggml_cuda_op_mul_mat(
    }
 }

-static void ggml_cuda_mul_mat_vec_p021(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cuda_mul_mat_vec_p021(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
    GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
    GGML_ASSERT(ggml_backend_buffer_is_cuda(src0->buffer));
    GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
@@ -1670,7 +1670,7 @@ static void ggml_cuda_mul_mat_vec_p021(ggml_backend_cuda_context & ctx, const gg
    ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
 }

-static void ggml_cuda_mul_mat_vec_nc(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cuda_mul_mat_vec_nc(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
    GGML_ASSERT(!ggml_is_transposed(src0));
    GGML_ASSERT(!ggml_is_transposed(src1));
    GGML_ASSERT(!ggml_is_permuted(src0));
@@ -2410,304 +2410,32 @@ GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
    GGML_UNUSED(backend);
 }

-static void set_ggml_graph_node_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
-    graph_node_properties->node_address = node->data;
-    graph_node_properties->node_op = node->op;
-    for (int i = 0; i < GGML_MAX_DIMS; i++) {
-        graph_node_properties->ne[i] = node->ne[i];
-        graph_node_properties->nb[i] = node->nb[i];
-    }
-    for (int i = 0; i < GGML_MAX_SRC; i++) {
-        graph_node_properties->src_address[i] = node->src[i] ? node->src[i]->data : nullptr;
-    }
-}
-
-static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
-    if (node->data != graph_node_properties->node_address &&
-          node->op != GGML_OP_CPY &&
-          node->op != GGML_OP_VIEW) {
-        return false;
-    }
-
-    if (node->op != graph_node_properties->node_op) {
-        return false;
-    }
-
-    for (int i = 0; i < GGML_MAX_DIMS; i++) {
-        if (node->ne[i] != graph_node_properties->ne[i]) {
-            return false;
-        }
-        if (node->nb[i] != graph_node_properties->nb[i]) {
-            return false;
-        }
-    }
-
-    for (int i = 0; i < GGML_MAX_SRC; i++) {
-        if (node->src[i] &&
-            node->src[i]->data != graph_node_properties->src_address[i] &&
-            node->op != GGML_OP_CPY &&
-            node->op != GGML_OP_VIEW
-        ) {
-            return false;
-        }
-    }
-    return true;
-}
-
 GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
    ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;

    ggml_cuda_set_device(cuda_ctx->device);

-#ifdef USE_CUDA_GRAPH
-    static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr);
+    for (int i = 0; i < cgraph->n_nodes; i++) {
+        ggml_tensor * node = cgraph->nodes[i];

-    // Objects required for CUDA Graph
-    if (cuda_ctx->cuda_graph == nullptr) {
-        cuda_ctx->cuda_graph.reset(new ggml_cuda_graph());
-    }
-
-    bool use_cuda_graph = true;
-    bool cuda_graph_update_required = false;
-    // pointer to CUDA cpy kernel, which is required to identify
-    // kernel parameters which need updated in the graph for each token
-    void * ggml_cuda_cpy_fn_ptr = nullptr;
-
-    if (cuda_ctx->cuda_graph->graph == nullptr) {
-        if (ggml_cuda_info().devices[cuda_ctx->device].cc < CC_AMPERE) {
-            cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true;
-#ifndef NDEBUG
-            fprintf(stderr, "%s: disabling CUDA graphs due to GPU architecture\n", __func__);
-#endif
+        if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
+            continue;
        }
-    }
-
-    // Disable CUDA graphs in presence of env var, old GPU, use-case which is changing too rapidly,
-    // or previous graph capture failure.
-    // Also disable for multi-gpu for now. TO DO investigate
-    if (disable_cuda_graphs_due_to_env
-        || cuda_ctx->cuda_graph->disable_due_to_gpu_arch
-        || cuda_ctx->cuda_graph->disable_due_to_too_many_updates
-        || cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture) {
-        use_cuda_graph = false;
-    }
-
-    if (use_cuda_graph) {
-        if (cuda_ctx->cuda_graph->instance == nullptr) {
-            cuda_graph_update_required = true;
-        }
-
-        // Check if the graph size has changed
-        if (cuda_ctx->cuda_graph->ggml_graph_properties.size() != (size_t)cgraph->n_nodes) {
-            cuda_graph_update_required = true;
-            cuda_ctx->cuda_graph->ggml_graph_properties.resize(cgraph->n_nodes);
-        }
-
-        // Loop over nodes in GGML graph to determine if CUDA graph update is required
-        // and store properties to allow this comparison for the next token
-        for (int i = 0; i < cgraph->n_nodes; i++) {
-            bool has_matching_properties = true;
-            if (!cuda_graph_update_required) {
-                has_matching_properties = ggml_graph_node_has_matching_properties(cgraph->nodes[i], &cuda_ctx->cuda_graph->ggml_graph_properties[i]);
-            }
-            if (!has_matching_properties) {
-                cuda_graph_update_required = true;
-            }
-            set_ggml_graph_node_properties(cgraph->nodes[i], &cuda_ctx->cuda_graph->ggml_graph_properties[i]);
-        }
-
-        // Loop over nodes in GGML graph to obtain info needed for CUDA graph
-        cuda_ctx->cuda_graph->updated_kernel_arg.clear();
-        for (int i = 0; i < cgraph->n_nodes; i++) {
-            ggml_tensor * node = cgraph->nodes[i];
-
-            if (node->src[0] && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) {
-                use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture
-#ifndef NDEBUG
-                fprintf(stderr, "%s: disabling CUDA graphs due to split buffer\n", __func__);
-#endif
-            }
-
-            if (node->op == GGML_OP_MUL_MAT_ID) {
-                use_cuda_graph = false; // This node type is not supported by CUDA graph capture
-#ifndef NDEBUG
-                fprintf(stderr, "%s: disabling CUDA graphs due to mul_mat_id\n", __func__);
-#endif
-            }
-
-            if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1) {
-                // disable CUDA graphs for batch size > 1 for now.
-                // Changes in batch size or context size can cause changes to the grid size of some kernels.
-                use_cuda_graph = false;
-#ifndef NDEBUG
-                fprintf(stderr, "%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
-#endif
-            }
-
-            if (node->op == GGML_OP_CPY) {
-                // store the copy op parameter which changes with each token.
-                cuda_ctx->cuda_graph->updated_kernel_arg.push_back((char **) &(node->src[1]->data));
-                if (ggml_cuda_cpy_fn_ptr == nullptr) {
-                    // store a pointer to the copy op CUDA kernel to identify it later
-                    ggml_cuda_cpy_fn_ptr = ggml_cuda_cpy_fn(node->src[0], node->src[1]);
-                }
-            }
-
-            if (!use_cuda_graph) {
-                break;
-            }
-        }
-
-        // Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates.
-        if (cuda_graph_update_required) {
-            cuda_ctx->cuda_graph->number_consecutive_updates++;
-        } else {
-            cuda_ctx->cuda_graph->number_consecutive_updates = 0;
-        }
-
-        if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) {
-            cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true;
-#ifndef NDEBUG
-            fprintf(stderr, "%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
-#endif
-        }
-    }
-
-    if (use_cuda_graph && cuda_graph_update_required) { // Start CUDA graph capture
-        CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed));
-    }
-
-#else
-    bool use_cuda_graph = false;
-    bool cuda_graph_update_required = false;
-#endif // USE_CUDA_GRAPH
-
-    bool graph_evaluated_or_captured = false;
-
-    while (!graph_evaluated_or_captured) {
-        // Only perform the graph execution if CUDA graphs are not enabled, or we are capturing the graph.
-        // With the use of CUDA graphs, the execution will be performed by the graph launch.
-        if (!use_cuda_graph || cuda_graph_update_required) {
-            for (int i = 0; i < cgraph->n_nodes; i++) {
-                ggml_tensor * node = cgraph->nodes[i];
-
-                if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
-                    continue;
-                }

 #ifndef NDEBUG
-                assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
-                for (int j = 0; j < GGML_MAX_SRC; j++) {
-                    if (node->src[j] != nullptr) {
-                        assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || ggml_backend_buffer_is_cuda_split(node->src[j]->buffer));
-                    }
-                }
+        assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
+        for (int j = 0; j < GGML_MAX_SRC; j++) {
+            if (node->src[j] != nullptr) {
+                assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || ggml_backend_buffer_is_cuda_split(node->src[j]->buffer));
+            }
+        }
 #endif

-                bool ok = ggml_cuda_compute_forward(*cuda_ctx, node);
-                if (!ok) {
-                    fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
-                }
-                GGML_ASSERT(ok);
-            }
+        bool ok = ggml_cuda_compute_forward(*cuda_ctx, node);
+        if (!ok) {
+            fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
        }
-
-#ifdef USE_CUDA_GRAPH
-        if (use_cuda_graph && cuda_graph_update_required) { // End CUDA graph capture
-            if (cuda_ctx->cuda_graph->graph != nullptr) {
-                CUDA_CHECK(cudaGraphDestroy(cuda_ctx->cuda_graph->graph));
-                cuda_ctx->cuda_graph->graph = nullptr;
-            }
-            CUDA_CHECK(cudaStreamEndCapture(cuda_ctx->stream(), &cuda_ctx->cuda_graph->graph));
-
-#if 0
-            if (disable_cuda_graphs_due_to_failed_capture) {
-                use_cuda_graph = false;
-                cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture = true;
-#ifndef NDEBUG
-                fprintf(stderr, "%s: disabling CUDA graphs due to failed graph capture\n", __func__);
-#endif
-            } else {
-                graph_evaluated_or_captured = true; // CUDA graph has been captured
-            }
-#endif
-            graph_evaluated_or_captured = true; // CUDA graph has been captured
-        } else {
-            graph_evaluated_or_captured = true; // ggml graph has been directly evaluated
-        }
-    }
-
-    if (use_cuda_graph) {
-        if (cuda_ctx->cuda_graph->instance == nullptr) { // Create executable graph from captured graph.
-            CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0));
-        }
-
-        // Perform update to graph (if required for this token), and change copy parameter (required for every token)
-
-        if (cuda_graph_update_required) {
-            // Extract nodes from graph
-            if (cuda_ctx->cuda_graph->num_nodes == 0) {
-                // First call with null argument gets number of nodes in graph
-                CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, nullptr, &cuda_ctx->cuda_graph->num_nodes));
-            }
-            // Subsequent call with non-null argument gets nodes
-            cuda_ctx->cuda_graph->nodes.resize(cuda_ctx->cuda_graph->num_nodes);
-            cuda_ctx->cuda_graph->params.resize(cuda_ctx->cuda_graph->num_nodes);
-            if (cuda_ctx->cuda_graph->num_nodes > 0) {
-                CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, cuda_ctx->cuda_graph->nodes.data(), &cuda_ctx->cuda_graph->num_nodes));
-
-                // Loop over nodes, and extract kernel parameters from each node
-                for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) {
-                    cudaGraphNodeType node_type;
-                    CUDA_CHECK(cudaGraphNodeGetType(cuda_ctx->cuda_graph->nodes[i], &node_type));
-                    if (node_type == cudaGraphNodeTypeKernel) {
-                        cudaError_t stat = cudaGraphKernelNodeGetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i]); // Get params using runtime
-                        if (stat == cudaErrorInvalidDeviceFunction) {
-                            // Fails due to incorrect handling by CUDA runtime of CUDA BLAS node.
-                            // We don't need to update blas nodes, so clear error and move on.
-                            cudaGetLastError();
-                        } else {
-                            GGML_ASSERT(stat == cudaSuccess);
-                        }
-                    }
-                }
-            }
-        }
-
-        // One of the arguments to the copy kernel is updated for each token, hence we need to
-        // replace that argument with the updated value in the CUDA graph
-        if (!cuda_graph_update_required) { // on update steps, the live parameters will already be captured
-            int k = 0;
-            for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) {
-                if (cuda_ctx->cuda_graph->params[i].func == ggml_cuda_cpy_fn_ptr) {
-                    char ** updated_kernel_arg_ptr = cuda_ctx->cuda_graph->updated_kernel_arg.at(k++);
-                    cuda_ctx->cuda_graph->params[i].kernelParams[1] = updated_kernel_arg_ptr;
-                    CUDA_CHECK(cudaGraphKernelNodeSetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i]));
-                }
-            }
-        }
-
-        // Update graph executable
-        cudaGraphExecUpdateResultInfo result_info;
-        cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info);
-        if (stat == cudaErrorGraphExecUpdateFailure) {
-#ifndef NDEBUG
-            fprintf(stderr, "%s: CUDA graph update failed\n", __func__);
-#endif
-            // The pre-existing graph exec cannot be updated due to violated constraints
-            // so instead clear error and re-instantiate
-            cudaGetLastError();
-            CUDA_CHECK(cudaGraphExecDestroy(cuda_ctx->cuda_graph->instance));
-            cuda_ctx->cuda_graph->instance = nullptr;
-            CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0));
-        } else {
-            GGML_ASSERT(stat == cudaSuccess);
-        }
-        // Launch graph
-        CUDA_CHECK(cudaGraphLaunch(cuda_ctx->cuda_graph->instance, cuda_ctx->stream()));
-#else
-        graph_evaluated_or_captured = true;
-#endif // USE_CUDA_GRAPH
+        GGML_ASSERT(ok);
    }

    return GGML_STATUS_SUCCESS;
--- a/ggml-cuda/clamp.cu
+++ b/ggml-cuda/clamp.cu
@@ -31,4 +31,5 @@ void ggml_cuda_op_clamp(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    memcpy(&max, (float *) dst->op_params + 1, sizeof(float));

    clamp_f32_cuda(src0_d, dst_d, min, max, ggml_nelements(src0), stream);
+    CUDA_CHECK(cudaGetLastError());
 }
--- a/ggml-cuda/common.cuh
+++ b/ggml-cuda/common.cuh
@@ -19,7 +19,6 @@
 #include <cassert>
 #include <cfloat>
 #include <string>
-#include <vector>

 #if defined(GGML_USE_HIPBLAS)
 #include <hip/hip_runtime.h>
@@ -234,6 +233,122 @@ typedef float dfloat; // dequantize float
 typedef float2 dfloat2;
 #endif //GGML_CUDA_F16

+[[noreturn]]
+static __device__ void no_device_code(
+    const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {
+
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+    printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n",
+           file_name, line, function_name, arch);
+    GGML_UNUSED(arch_list);
+#else
+    printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n",
+           file_name, line, function_name, arch, arch_list);
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+    __trap();
+
+    GGML_UNUSED(no_device_code); // suppress unused function warning
+}
+
+#ifdef __CUDA_ARCH__
+#define NO_DEVICE_CODE no_device_code(__FILE__, __LINE__, __FUNCTION__, __CUDA_ARCH__, STRINGIZE(__CUDA_ARCH_LIST__))
+#else
+#define NO_DEVICE_CODE //GGML_ASSERT(false && "NO_DEVICE_CODE not valid in host code.")
+#endif // __CUDA_ARCH__
+
+static __device__ __forceinline__ float warp_reduce_sum(float x) {
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        x += __shfl_xor_sync(0xffffffff, x, mask, 32);
+    }
+    return x;
+}
+
+static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
+        a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
+    }
+    return a;
+}
+
+static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
+#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
+#pragma unroll
+   for (int mask = 16; mask > 0; mask >>= 1) {
+       a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
+   }
+   return a;
+#else
+   GGML_UNUSED(a);
+   NO_DEVICE_CODE;
+#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
+}
+
+static __device__ __forceinline__ float warp_reduce_max(float x) {
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
+    }
+    return x;
+}
+
+static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b) {
+#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
+
+#if CUDART_VERSION >= CUDART_HMAX
+    return __hmax(a, b);
+#else
+    return __half2float(a) > __half2float(b) ? a : b;
+#endif // CUDART_VERSION >= CUDART_HMAX
+
+#else
+    GGML_UNUSED(a);
+    GGML_UNUSED(b);
+    NO_DEVICE_CODE;
+#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
+}
+static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const half2 b) {
+#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
+
+#if CUDART_VERSION >= CUDART_HMAX
+    return __hmax2(a, b);
+#else
+    half2 ret;
+    reinterpret_cast<half&>(ret.x) =  __low2float(a) >  __low2float(b) ?  __low2half(a) :  __low2half(b);
+    reinterpret_cast<half&>(ret.y) = __high2float(a) > __high2float(b) ? __high2half(a) : __high2half(b);
+    return ret;
+#endif // CUDART_VERSION >= CUDART_HMAX
+
+#else
+    GGML_UNUSED(a);
+    GGML_UNUSED(b);
+    NO_DEVICE_CODE;
+#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
+}
+
+static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
+#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
+#pragma unroll
+   for (int mask = 16; mask > 0; mask >>= 1) {
+       x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
+   }
+   return x;
+#else
+   GGML_UNUSED(x);
+   NO_DEVICE_CODE;
+#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
+}
+
+#if CUDART_VERSION < CUDART_HMASK
+static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half2 b) {
+    const uint32_t mask_low  = 0x0000FFFF * (float( __low2half(a)) > float( __low2half(b)));
+    const uint32_t mask_high = 0xFFFF0000 * (float(__high2half(a)) > float(__high2half(b)));
+    return mask_low | mask_high;
+}
+#endif // CUDART_VERSION < 12000
+
 #if defined(GGML_USE_HIPBLAS)
 #define __CUDA_ARCH__ 1300

@@ -317,143 +432,11 @@ static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
 }
 #endif // defined(GGML_USE_HIPBLAS)

-#define FP16_AVAILABLE (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
+#define FP16_AVAILABLE     defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) ? \
+    defined(RDNA1) || defined(RDNA2) || defined(RDNA3) : __CUDA_ARCH__ >= CC_PASCAL

 #define FP16_MMA_AVAILABLE !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA

-static bool fp16_mma_available(const int cc) {
-    return cc < CC_OFFSET_AMD && cc >= CC_VOLTA;
-}
-
-[[noreturn]]
-static __device__ void no_device_code(
-    const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {
-
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-    printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n",
-           file_name, line, function_name, arch);
-    GGML_UNUSED(arch_list);
-#else
-    printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n",
-           file_name, line, function_name, arch, arch_list);
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-    __trap();
-
-    GGML_UNUSED(no_device_code); // suppress unused function warning
-}
-
-#ifdef __CUDA_ARCH__
-#define NO_DEVICE_CODE no_device_code(__FILE__, __LINE__, __FUNCTION__, __CUDA_ARCH__, STRINGIZE(__CUDA_ARCH_LIST__))
-#else
-#define NO_DEVICE_CODE //GGML_ASSERT(false && "NO_DEVICE_CODE not valid in host code.")
-#endif // __CUDA_ARCH__
-
-static __device__ __forceinline__ float warp_reduce_sum(float x) {
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        x += __shfl_xor_sync(0xffffffff, x, mask, 32);
-    }
-    return x;
-}
-
-static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
-        a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
-    }
-    return a;
-}
-
-static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
-#if FP16_AVAILABLE
-
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        const half2 a_other = __shfl_xor_sync(0xffffffff, a, mask, 32);
-        reinterpret_cast<half&>(a.x) +=  __low2half(a_other);
-        reinterpret_cast<half&>(a.y) += __high2half(a_other);
-    }
-    return a;
-#else
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
-    }
-    return a;
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-
-#else
-    NO_DEVICE_CODE;
-    return a;
-#endif // FP16_AVAILABLE
-}
-
-static __device__ __forceinline__ float warp_reduce_max(float x) {
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
-    }
-    return x;
-}
-
-static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b) {
-#if FP16_AVAILABLE
-
-#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
-    return __float2half(fmaxf(__half2float(a), __half2float(b)));
-#else
-    return __hmax(a, b);
-#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
-
-#else
-   NO_DEVICE_CODE;
-   GGML_UNUSED(b);
-   return a;
-#endif // FP16_AVAILABLE
-}
-
-static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const half2 b) {
-#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
-
-#if CUDART_VERSION >= CUDART_HMAX
-    return __hmax2(a, b);
-#else
-    half2 ret;
-    reinterpret_cast<half&>(ret.x) = __float2half(fmaxf( __low2float(a),  __low2float(b)));
-    reinterpret_cast<half&>(ret.y) = __float2half(fmaxf(__high2float(a), __high2float(b)));
-    return ret;
-#endif // CUDART_VERSION >= CUDART_HMAX
-
-#else
-    GGML_UNUSED(a);
-    GGML_UNUSED(b);
-    NO_DEVICE_CODE;
-#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
-}
-
-static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
-#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
-#pragma unroll
-   for (int mask = 16; mask > 0; mask >>= 1) {
-       x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
-   }
-   return x;
-#else
-   GGML_UNUSED(x);
-   NO_DEVICE_CODE;
-#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
-}
-
-#if CUDART_VERSION < CUDART_HMASK
-static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half2 b) {
-    const uint32_t mask_low  = 0x0000FFFF * (float( __low2half(a)) > float( __low2half(b)));
-    const uint32_t mask_high = 0xFFFF0000 * (float(__high2half(a)) > float(__high2half(b)));
-    return mask_low | mask_high;
-}
-#endif // CUDART_VERSION < 12000
-
 // TODO: move to ggml-common.h
 static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};

@@ -543,43 +526,6 @@ struct ggml_tensor_extra_gpu {
    cudaEvent_t events[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS]; // events for synchronizing multiple GPUs
 };

-
-#if (CUDART_VERSION >= 12000) && defined(GGML_CUDA_USE_GRAPHS)
-#define USE_CUDA_GRAPH
-#endif
-
-struct ggml_graph_node_properties {
-    void * node_address;
-    ggml_op node_op;
-    int64_t ne[GGML_MAX_DIMS];
-    size_t nb[GGML_MAX_DIMS];
-    void * src_address[GGML_MAX_SRC];
-};
-
-struct ggml_cuda_graph {
-#ifdef USE_CUDA_GRAPH
-    ~ggml_cuda_graph() {
-        if (instance != nullptr) {
-            CUDA_CHECK(cudaGraphExecDestroy(instance));
-        }
-        if (graph != nullptr) {
-            CUDA_CHECK(cudaGraphDestroy(graph));
-        }
-    }
-    cudaGraph_t graph = nullptr;
-    cudaGraphExec_t instance = nullptr;
-    size_t num_nodes = 0;
-    std::vector<cudaGraphNode_t> nodes;
-    std::vector<cudaKernelNodeParams> params;
-    bool disable_due_to_gpu_arch = false;
-    bool disable_due_to_too_many_updates = false;
-    bool disable_due_to_failed_graph_capture = false;
-    int number_consecutive_updates = 0;
-    std::vector<ggml_graph_node_properties> ggml_graph_properties;
-    std::vector<char **> updated_kernel_arg;
-#endif
-};
-
 struct ggml_backend_cuda_context {
    int device;
    std::string name;
@@ -588,8 +534,6 @@ struct ggml_backend_cuda_context {
    cudaStream_t streams[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { { nullptr } };
    cublasHandle_t cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};

-    std::unique_ptr<ggml_cuda_graph> cuda_graph;
-
    explicit ggml_backend_cuda_context(int device) :
        device(device),
        name(GGML_CUDA_NAME + std::to_string(device)) {
--- a/ggml-cuda/convert.cu
+++ b/ggml-cuda/convert.cu
@@ -727,6 +727,7 @@ static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict_
 }

 to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
+    int id;
    switch (type) {
        case GGML_TYPE_Q4_0:
            return dequantize_row_q4_0_cuda;
@@ -737,7 +738,8 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
        case GGML_TYPE_Q5_1:
            return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
        case GGML_TYPE_Q8_0:
-            if (ggml_cuda_info().devices[ggml_cuda_get_device()].cc >= CC_PASCAL) {
+            CUDA_CHECK(cudaGetDevice(&id));
+            if (ggml_cuda_info().devices[id].cc >= CC_PASCAL) {
                return dequantize_block_q8_0_f16_cuda;
            }
            return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
--- a/ggml-cuda/cpy.cu
+++ b/ggml-cuda/cpy.cu
@@ -459,32 +459,3 @@ void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    ggml_cuda_cpy(ctx, src0, dst);
 }
-
-void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
-    if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
-            return (void*) cpy_f32_f16<cpy_1_f32_f32>;
-    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
-            return (void*) cpy_f32_f16<cpy_1_f32_f16>;
-    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
-            return (void*) cpy_f32_q<cpy_blck_f32_q8_0, QK8_0>;
-    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
-            return (void*) cpy_f32_q<cpy_blck_f32_q4_0, QK4_0>;
-    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
-            return (void*) cpy_f32_q<cpy_blck_f32_q4_1, QK4_1>;
-    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_0) {
-            return (void*) cpy_f32_q<cpy_blck_f32_q5_0, QK5_0>;
-    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_IQ4_NL) {
-            return (void*) cpy_f32_q<cpy_blck_f32_iq4_nl, QK4_NL>;
-    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_1) {
-            return (void*) cpy_f32_q<cpy_blck_f32_q5_1, QK5_1>;
-    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
-            return (void*) cpy_f32_f16<cpy_1_f32_f16>;
-    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
-            return (void*) cpy_f32_f16<cpy_1_f16_f32>;
-    } else {
-        fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
-                ggml_type_name(src0->type), ggml_type_name(src1->type));
-        GGML_ASSERT(false);
-    }
-}
-
--- a/ggml-cuda/cpy.cuh
+++ b/ggml-cuda/cpy.cuh
@@ -5,5 +5,3 @@
 void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1);

 void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
-
-void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1);
--- a/ggml-cuda/fattn.cu
+++ b/ggml-cuda/fattn.cu
@@ -11,10 +11,8 @@
 #define HALF_MAX_HALF         __float2half(65504.0f/2) // Use neg. of this instead of -INFINITY to initialize KQ max vals to avoid NaN upon subtraction.
 #define SOFTMAX_FTZ_THRESHOLD -20.0f                   // Softmax exp. of values smaller than this are flushed to zero to avoid NaNs.

-template<int D, int ncols, int parallel_blocks> // D == head size
-#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
-__launch_bounds__(D, 1)
-#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
+template<int D, int parallel_blocks> // D == head size
+__launch_bounds__(((D + WARP_SIZE - 1) / WARP_SIZE)*WARP_SIZE, 1)
 static __global__ void flash_attn_vec_ext_f16(
        const char * __restrict__ Q,
        const char * __restrict__ K,
@@ -46,77 +44,55 @@ static __global__ void flash_attn_vec_ext_f16(
 #if FP16_AVAILABLE
    //In this kernel Q, K, V are matrices while i, j, k are matrix indices.

-    const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on.
-    const int ip  =  blockIdx.x % parallel_blocks; // Index in group of blocks running for the same column in parallel.
+    const int ic = blockIdx.x / parallel_blocks; // Index of the Q/QKV column to work on.
+    const int ip = blockIdx.x % parallel_blocks; // Index in group of blocks running for the same column in parallel.

    const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
-    const float2 * Q_f2  = (const float2 *) (Q    + nb02* blockIdx.y              + nb01*ic0);
+    const float2 * Q_f2  = (const float2 *) (Q    + nb02* blockIdx.y              + nb01*ic);
    const half2  * K_h2  = (const half2  *) (K    + nb12*(blockIdx.y / gqa_ratio));
    const half   * V_h   = (const half   *) (V    + nb12*(blockIdx.y / gqa_ratio)); // K and V have same shape
-    const half   * maskh = (const half   *)  mask + ne11*ic0;
+    const half   * maskh = (const half   *)  mask + ne11*ic;

    const int stride_KV  = nb11 / sizeof(half);
    const int stride_KV2 = nb11 / sizeof(half2);

-    static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64.");
-    constexpr int nwarps = D / WARP_SIZE;
+    constexpr int nwarps = (D + WARP_SIZE - 1) / WARP_SIZE;
    const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
-    __builtin_assume(tid < D);
+    __builtin_assume(tid < nwarps*WARP_SIZE);

-    __shared__ half KQ[ncols*D];
-#pragma unroll
-    for (int j = 0; j < ncols; ++j) {
-        KQ[j*D + tid] = -HALF_MAX_HALF;
-    }
+    __shared__ half KQ[nwarps*WARP_SIZE];
+    KQ[tid] = -INFINITY;
    half2 * KQ2 = (half2 *) KQ;

-    half kqmax[ncols];
-#pragma unroll
-    for (int j = 0; j < ncols; ++j) {
-        kqmax[j] = -HALF_MAX_HALF;
-    }
-    half kqsum[ncols] = {0.0f};
+    half kqmax = -HALF_MAX_HALF;
+    half kqsum = 0.0f;

-    __shared__ half kqmax_shared[ncols][WARP_SIZE];
-    __shared__ half kqsum_shared[ncols][WARP_SIZE];
-#pragma unroll
-    for (int j = 0; j < ncols; ++j) {
-        if (threadIdx.y == 0) {
-            kqmax_shared[j][threadIdx.x] = -HALF_MAX_HALF;
-            kqsum_shared[j][threadIdx.x] = 0.0f;
-        }
+    __shared__ half kqmax_shared[WARP_SIZE];
+    __shared__ half kqsum_shared[WARP_SIZE];
+    if (threadIdx.y == 0) {
+        kqmax_shared[threadIdx.x] = -HALF_MAX_HALF;
+        kqsum_shared[threadIdx.x] = 0.0f;
    }
    __syncthreads();

    // Convert Q to half2 and store in registers:
-    half2 Q_h2[ncols][D/(2*WARP_SIZE)];
+    half2 Q_h2[(D/2 + WARP_SIZE - 1) / WARP_SIZE];
 #pragma unroll
-    for (int j = 0; j < ncols; ++j) {
-#pragma unroll
-        for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
-            const int i = i0 + threadIdx.x;
-
-            const float2 tmp = Q_f2[j*(nb01/sizeof(float2)) + i];
-            Q_h2[j][i0/WARP_SIZE] = make_half2(scale, scale) * make_half2(tmp.x, tmp.y);
+    for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
+        const int i = i0 + threadIdx.x;
+        if (i0 + WARP_SIZE > D/2 && i >= D/2) {
+            break;
        }
+
+        Q_h2[i0/WARP_SIZE] = make_half2(scale, scale) * make_half2(Q_f2[i].x, Q_f2[i].y);
    }

-    half2 VKQ[ncols] = {{0.0f, 0.0f}};
+    half2 VKQ = make_half2(0.0f, 0.0f); // Each thread calculates a single VKQ value.

-    const int k_start = parallel_blocks == 1 ? 0 : ip*D;
+    const int k_start  = parallel_blocks == 1 ? 0 : ip*D;
    for (int k_VKQ_0 = k_start; k_VKQ_0 < ne11; k_VKQ_0 += parallel_blocks*D) {
        // Calculate KQ tile and keep track of new maximum KQ values:
-
-        // For unknown reasons using a half array of size 1 for kqmax_new causes a performance regression,
-        // see https://github.com/ggerganov/llama.cpp/pull/7061 .
-        // Therefore this variable is defined twice but only used once (so that the compiler can optimize out the unused variable).
-        half kqmax_new = kqmax[0];
-        half kqmax_new_arr[ncols];
-#pragma unroll
-        for (int j = 0; j < ncols; ++j) {
-            kqmax_new_arr[j] = kqmax[j];
-        }
-
+        half kqmax_new = kqmax;
 #pragma unroll
        for (int i_KQ_0 = 0; i_KQ_0 < D; i_KQ_0 += nwarps) {
            const int i_KQ = i_KQ_0 + threadIdx.y;
@@ -125,112 +101,89 @@ static __global__ void flash_attn_vec_ext_f16(
                break;
            }

-            half2 sum2[ncols] = {{0.0f, 0.0f}};
+            half2 sum2 = make_half2(0.0f, 0.0f);
 #pragma unroll
            for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += WARP_SIZE) {
                const int k_KQ = k_KQ_0 + threadIdx.x;
+                if (k_KQ_0 + WARP_SIZE > D/2 && k_KQ >= D/2) {
+                    break;
+                }

                const half2 K_ik = K_h2[(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ];
-#pragma unroll
-                for (int j = 0; j < ncols; ++j) {
-                    sum2[j] += K_ik * Q_h2[j][k_KQ_0/WARP_SIZE];
-                }
+                sum2 += K_ik * Q_h2[k_KQ_0/WARP_SIZE];
            }

-#pragma unroll
-            for (int j = 0; j < ncols; ++j) {
-                sum2[j] = warp_reduce_sum(sum2[j]);
-                half sum = __low2half(sum2[j]) + __high2half(sum2[j]);
-                sum += mask ? maskh[j*ne11 + k_VKQ_0 + i_KQ] : __float2half(0.0f);
-
-                if (ncols == 1) {
-                    kqmax_new        = ggml_cuda_hmax(kqmax_new,        sum);
-                } else {
-                    kqmax_new_arr[j] = ggml_cuda_hmax(kqmax_new_arr[j], sum);
-                }
-
-                if (threadIdx.x == 0) {
-                    KQ[j*D + i_KQ] = sum;
-                }
-            }
-        }
-
-#pragma unroll
-        for (int j = 0; j < ncols; ++j) {
-            half kqmax_new_j = ncols == 1 ? kqmax_new : kqmax_new_arr[j];
-
-            kqmax_new_j = warp_reduce_max(kqmax_new_j);
+            sum2 = warp_reduce_sum(sum2);
+            half sum = __low2half(sum2) + __high2half(sum2);
+            sum += mask ? maskh[k_VKQ_0 + i_KQ] : __float2half(0.0f);
+            kqmax_new = ggml_cuda_hmax(kqmax_new, sum);
            if (threadIdx.x == 0) {
-                kqmax_shared[j][threadIdx.y] = kqmax_new_j;
+                KQ[i_KQ] = sum;
            }
        }

-        __syncthreads();
-
-#pragma unroll
-        for (int j = 0; j < ncols; ++j) {
-            half kqmax_new_j = kqmax_shared[j][threadIdx.x];
-            kqmax_new_j = warp_reduce_max(kqmax_new_j);
-
-            const half KQ_max_scale = hexp(kqmax[j] - kqmax_new_j);
-            kqmax[j] = kqmax_new_j;
-
-            const half val = hexp(KQ[j*D + tid] - kqmax[j]);
-            kqsum[j] = kqsum[j]*KQ_max_scale + val;
-            KQ[j*D + tid] = val;
-
-            VKQ[j] *= __half2half2(KQ_max_scale);
-        }
-
-        __syncthreads();
-
-#pragma unroll
-        for (int k0 = 0; k0 < D; k0 += 2) {
-            if (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + k0 >= ne11) {
-                break;
-            }
-
-            half2 V_k;
-            reinterpret_cast<half&>(V_k.x) = V_h[(k_VKQ_0 + k0 + 0)*stride_KV + tid];
-            reinterpret_cast<half&>(V_k.y) = V_h[(k_VKQ_0 + k0 + 1)*stride_KV + tid];
-#pragma unroll
-            for (int j = 0; j < ncols; ++j) {
-                VKQ[j] += V_k*KQ2[j*(D/2) + k0/2];
-            }
-        }
-
-        __syncthreads();
-    }
-
-#pragma unroll
-    for (int j = 0; j < ncols; ++j) {
-        kqsum[j] = warp_reduce_sum(kqsum[j]);
+        kqmax_new = warp_reduce_max(kqmax_new);
        if (threadIdx.x == 0) {
-            kqsum_shared[j][threadIdx.y] = kqsum[j];
+            kqmax_shared[threadIdx.y] = kqmax_new;
        }
+        __syncthreads();
+        kqmax_new = kqmax_shared[threadIdx.x];
+        kqmax_new = warp_reduce_max(kqmax_new);
+
+        const half KQ_max_scale = hexp(kqmax - kqmax_new);
+        kqmax = kqmax_new;
+
+        const half val = hexp(KQ[tid] - kqmax);
+        kqsum = kqsum*KQ_max_scale + val;
+        KQ[tid] = val;
+
+        VKQ *= __half2half2(KQ_max_scale);
+
+        __syncthreads();
+
+        if (tid < D) {
+#pragma unroll
+            for (int k0 = 0; k0 < D; k0 += 2) {
+                if (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + k0 >= ne11) {
+                    break;
+                }
+
+                half2 V_k;
+                reinterpret_cast<half&>(V_k.x) = V_h[(k_VKQ_0 + k0 + 0)*stride_KV + tid];
+                reinterpret_cast<half&>(V_k.y) = V_h[(k_VKQ_0 + k0 + 1)*stride_KV + tid];
+                VKQ += V_k*KQ2[k0/2];
+            }
+        }
+
+        __syncthreads();
    }

+    if (tid >= D) {
+        kqsum = 0.0f;
+    }
+
+    kqsum = warp_reduce_sum(kqsum);
+    if (threadIdx.x == 0) {
+        kqsum_shared[threadIdx.y] = kqsum;
+    }
    __syncthreads();
+    kqsum = kqsum_shared[threadIdx.x];
+    kqsum = warp_reduce_sum(kqsum);

-#pragma unroll
-    for (int j_VKQ = 0; j_VKQ < ncols; ++j_VKQ) {
-        kqsum[j_VKQ] = kqsum_shared[j_VKQ][threadIdx.x];
-        kqsum[j_VKQ] = warp_reduce_sum(kqsum[j_VKQ]);
-
-        half dst_val = (__low2half(VKQ[j_VKQ]) + __high2half(VKQ[j_VKQ]));
-        if (parallel_blocks == 1) {
-            dst_val /= kqsum[j_VKQ];
-        }
-        const int j_dst = (ic0 + j_VKQ)*parallel_blocks + ip;
-        dst[j_dst*D*gridDim.y + D*blockIdx.y + tid] = dst_val;
+    if (tid >= D) {
+        return;
    }

-    if (parallel_blocks != 1 && tid != 0) {
-#pragma unroll
-        for (int j = 0; j < ncols; ++j) {
-            dst_meta[(ic0 + j)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[j], kqsum[j]);
-        }
+    half dst_val = (__low2half(VKQ) + __high2half(VKQ));
+    if (parallel_blocks == 1) {
+        dst_val /= kqsum;
    }
+    dst[D*gridDim.y*blockIdx.x + D*blockIdx.y + tid] = dst_val;
+
+    if (parallel_blocks == 1 || tid != 0) {
+        return;
+    }
+    dst_meta[ic*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax, kqsum);
 #else
   NO_DEVICE_CODE;
 #endif // FP16_AVAILABLE
@@ -238,9 +191,7 @@ static __global__ void flash_attn_vec_ext_f16(

 // D == head size, VKQ_stride == num VKQ rows calculated in parallel:
 template<int D, int ncols, int nwarps, int VKQ_stride, int parallel_blocks, typename KQ_acc_t>
-#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
 __launch_bounds__(nwarps*WARP_SIZE, 1)
-#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
 static __global__ void flash_attn_ext_f16(
        const char * __restrict__ Q,
        const char * __restrict__ K,
@@ -622,9 +573,7 @@ static __global__ void flash_attn_ext_f16(
 }

 template<int D, int parallel_blocks> // D == head size
-#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
 __launch_bounds__(D, 1)
-#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
 static __global__ void flash_attn_combine_results(
        const float  * __restrict__ VKQ_parts,
        const float2 * __restrict__ VKQ_meta,
@@ -693,7 +642,7 @@ static_assert(get_VKQ_stride( 80, 1, 16) ==  16, "Test failed.");
 static_assert(get_VKQ_stride( 80, 2, 16) ==  16, "Test failed.");
 static_assert(get_VKQ_stride( 80, 4, 16) ==  16, "Test failed.");

-template <int D, int cols_per_block, int parallel_blocks> void launch_fattn_vec_f16(
+template <int D, int parallel_blocks> void launch_fattn_vec_f16(
        const ggml_tensor * Q, const ggml_tensor * K, const ggml_tensor * V, ggml_tensor * KQV, const ggml_tensor * mask,
        ggml_cuda_pool & pool, cudaStream_t main_stream
 ) {
@@ -707,13 +656,13 @@ template <int D, int cols_per_block, int parallel_blocks> void launch_fattn_vec_

    constexpr int  nwarps = (D + WARP_SIZE - 1) / WARP_SIZE;
    const     dim3 block_dim(WARP_SIZE, nwarps, 1);
-    const     dim3 blocks_num(parallel_blocks*((Q->ne[1] + cols_per_block - 1) / cols_per_block), Q->ne[2], Q->ne[3]);
+    const     dim3 blocks_num(parallel_blocks*Q->ne[1], Q->ne[2], Q->ne[3]);
    const     int  shmem = 0;

    float scale;
    memcpy(&scale, KQV->op_params, sizeof(float));

-    flash_attn_vec_ext_f16<D, cols_per_block, parallel_blocks>
+    flash_attn_vec_ext_f16<D, parallel_blocks>
        <<<blocks_num, block_dim, shmem, main_stream>>> (
                (const char *) Q->data,
                (const char *) K->data,
@@ -834,99 +783,10 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst

    ggml_cuda_set_device(ctx.device);

-    const int cc  = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
    const int nsm = ggml_cuda_info().devices[ggml_cuda_get_device()].nsm;

    const int32_t precision = KQV->op_params[1];

-    if (!fp16_mma_available(cc)) {
-        GGML_ASSERT(precision == GGML_PREC_DEFAULT);
-        GGML_ASSERT(Q->ne[0] == 64 || Q->ne[0] == 128 && "FlashAttention without tensor cores only supports head sizes 64 and 128.");
-
-        if (Q->ne[1] == 1) {
-            constexpr int cols_per_block = 1;
-            constexpr int parallel_blocks = 4;
-            switch (Q->ne[0]) {
-                case 64:
-                    launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
-                    break;
-                case 128:
-                    launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
-                    break;
-                default:
-                    GGML_ASSERT(false);
-                    break;
-            }
-            return;
-        }
-
-        if (Q->ne[1] == 2) {
-            constexpr int cols_per_block = 2;
-            constexpr int parallel_blocks = 4;
-            switch (Q->ne[0]) {
-                case 64:
-                    launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
-                    break;
-                case 128:
-                    launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
-                    break;
-                default:
-                    GGML_ASSERT(false);
-                    break;
-            }
-            return;
-        }
-
-        if (Q->ne[1] <= 4) {
-            constexpr int cols_per_block = 4;
-            constexpr int parallel_blocks = 4;
-            switch (Q->ne[0]) {
-                case 64:
-                    launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
-                    break;
-                case 128:
-                    launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
-                    break;
-                default:
-                    GGML_ASSERT(false);
-                    break;
-            }
-            return;
-        }
-
-        if (Q->ne[1] <= 8) {
-            constexpr int cols_per_block = 8;
-            constexpr int parallel_blocks = 4;
-            switch (Q->ne[0]) {
-                case 64:
-                    launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
-                    break;
-                case 128:
-                    launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
-                    break;
-                default:
-                    GGML_ASSERT(false);
-                    break;
-            }
-            return;
-        }
-
-        constexpr int cols_per_block = 8;
-        constexpr int parallel_blocks = 1;
-        switch (Q->ne[0]) {
-            case 64:
-                launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
-                break;
-            case 128:
-                launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
-                break;
-            default:
-                GGML_ASSERT(false);
-                break;
-        }
-        return;
-    }
-
    if (precision != GGML_PREC_DEFAULT) {
        if (Q->ne[1] <= 32 || Q->ne[0] > 128) {
            constexpr int cols_per_block = 16;
@@ -985,17 +845,16 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
    }

    if (Q->ne[1] == 1 && Q->ne[0] % (2*WARP_SIZE) == 0) {
-        constexpr int cols_per_block = 1;
        constexpr int parallel_blocks = 4;
        switch (Q->ne[0]) {
            case 64:
-                launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
+                launch_fattn_vec_f16< 64, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
                break;
            case 128:
-                launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
+                launch_fattn_vec_f16<128, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
                break;
            case 256:
-                launch_fattn_vec_f16<256, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
+                launch_fattn_vec_f16<256, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
                break;
            default:
                GGML_ASSERT(false);
--- a/ggml-cuda/mmq.cu
+++ b/ggml-cuda/mmq.cu
@@ -1735,7 +1735,8 @@ static void ggml_mul_mat_q4_0_q8_1_cuda(
    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

-    int id = ggml_cuda_get_device();
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
    const int compute_capability = ggml_cuda_info().devices[id].cc;

    int mmq_x, mmq_y, nwarps;
@@ -1779,7 +1780,8 @@ static void ggml_mul_mat_q4_1_q8_1_cuda(
    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

-    int id = ggml_cuda_get_device();
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
    const int compute_capability = ggml_cuda_info().devices[id].cc;

    int mmq_x, mmq_y, nwarps;
@@ -1823,7 +1825,8 @@ static void ggml_mul_mat_q5_0_q8_1_cuda(
    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

-    int id = ggml_cuda_get_device();
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
    const int compute_capability = ggml_cuda_info().devices[id].cc;

    int mmq_x, mmq_y, nwarps;
@@ -1867,7 +1870,8 @@ static void ggml_mul_mat_q5_1_q8_1_cuda(
    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

-    int id = ggml_cuda_get_device();
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
    const int compute_capability = ggml_cuda_info().devices[id].cc;

    int mmq_x, mmq_y, nwarps;
@@ -1911,7 +1915,8 @@ static void ggml_mul_mat_q8_0_q8_1_cuda(
    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

-    int id = ggml_cuda_get_device();
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
    const int compute_capability = ggml_cuda_info().devices[id].cc;

    int mmq_x, mmq_y, nwarps;
@@ -1955,7 +1960,8 @@ static void ggml_mul_mat_q2_K_q8_1_cuda(
    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

-    int id = ggml_cuda_get_device();
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
    const int compute_capability = ggml_cuda_info().devices[id].cc;

    int mmq_x, mmq_y, nwarps;
@@ -2001,7 +2007,8 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(

 #if QK_K == 256

-    int id = ggml_cuda_get_device();
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
    const int compute_capability = ggml_cuda_info().devices[id].cc;

    int mmq_x, mmq_y, nwarps;
@@ -2046,7 +2053,8 @@ static void ggml_mul_mat_q4_K_q8_1_cuda(
    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

-    int id = ggml_cuda_get_device();
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
    const int compute_capability = ggml_cuda_info().devices[id].cc;

    int mmq_x, mmq_y, nwarps;
@@ -2090,7 +2098,8 @@ static void ggml_mul_mat_q5_K_q8_1_cuda(
    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

-    int id = ggml_cuda_get_device();
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
    const int compute_capability = ggml_cuda_info().devices[id].cc;

    int mmq_x, mmq_y, nwarps;
@@ -2134,7 +2143,8 @@ static void ggml_mul_mat_q6_K_q8_1_cuda(
    const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
    const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {

-    int id = ggml_cuda_get_device();
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
    const int compute_capability = ggml_cuda_info().devices[id].cc;

    int mmq_x, mmq_y, nwarps;
--- a/ggml-cuda/mmvq.cu
+++ b/ggml-cuda/mmvq.cu
@@ -89,7 +89,8 @@ static void mul_mat_vec_q_cuda(
    GGML_ASSERT(ncols_x % qk == 0);
    GGML_ASSERT(ncols_y <= MMVQ_MAX_BATCH_SIZE);

-    int id = ggml_cuda_get_device();
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));

    int64_t nwarps = 1;
    int64_t rows_per_cuda_block = 1;
@@ -327,7 +328,8 @@ void ggml_cuda_op_mul_mat_vec_q(

    const int64_t ne0 = dst->ne[0];

-    int id = ggml_cuda_get_device();
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));

    // the main device has a larger memory buffer to hold the results from all GPUs
    // nrows_dst == nrows of the matrix that the kernel writes into
--- a/ggml-cuda/scale.cu
+++ b/ggml-cuda/scale.cu
@@ -28,4 +28,5 @@ void ggml_cuda_op_scale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    memcpy(&scale, dst->op_params, sizeof(float));

    scale_f32_cuda(src0_d, dst_d, scale, ggml_nelements(src0), stream);
+    CUDA_CHECK(cudaGetLastError());
 }
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -265,20 +265,11 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){

 static void * ggml_metal_host_malloc(size_t n) {
    void * data = NULL;
-
-#if TARGET_OS_OSX
-    kern_return_t err = vm_allocate((vm_map_t) mach_task_self(), (void *) &data, n, VM_FLAGS_ANYWHERE);
-    if (err != KERN_SUCCESS) {
-        GGML_METAL_LOG_ERROR("%s: error: vm_allocate failed\n", __func__);
-        return NULL;
-    }
-#else
    const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n);
    if (result != 0) {
        GGML_METAL_LOG_ERROR("%s: error: posix_memalign failed\n", __func__);
        return NULL;
    }
-#endif

    return data;
 }
@@ -2849,11 +2840,7 @@ GGML_CALL static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_
    ggml_backend_metal_free_device();

    if (ctx->owned) {
-#if TARGET_OS_OSX
-        vm_deallocate((vm_map_t)mach_task_self(), (vm_address_t)ctx->all_data, ctx->all_size);
-#else
        free(ctx->all_data);
-#endif
    }

    free(ctx);
@@ -2957,16 +2944,14 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buff
    ctx->owned = true;
    ctx->n_buffers = 1;

-    if (ctx->all_data != NULL) {
-        ctx->buffers[0].data = ctx->all_data;
-        ctx->buffers[0].size = size;
-        ctx->buffers[0].metal = [device newBufferWithBytesNoCopy:ctx->all_data
-                        length:size_aligned
-                        options:MTLResourceStorageModeShared
-                        deallocator:nil];
-    }
+    ctx->buffers[0].data = ctx->all_data;
+    ctx->buffers[0].size = size;
+    ctx->buffers[0].metal = [device newBufferWithBytesNoCopy:ctx->all_data
+                    length:size_aligned
+                    options:MTLResourceStorageModeShared
+                    deallocator:nil];

-    if (ctx->all_data == NULL || ctx->buffers[0].metal == nil) {
+    if (ctx->buffers[0].metal == nil) {
        GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
        free(ctx);
        ggml_backend_metal_free_device();
--- a/ggml-opencl.cpp
+++ b/ggml-opencl.cpp
@@ -2119,7 +2119,6 @@ static size_t ggml_backend_opencl_buffer_type_get_alignment(ggml_backend_buffer_
    if (alignment == (cl_uint)-1) {
        ggml_cl_init();
        clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint), &alignment, NULL);
-        alignment /= 8; // bits to bytes
    }
    return alignment;

--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -8330,26 +8330,24 @@ static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict_
    const int blocks_per_row = ncols / qk;
    const int blocks_per_warp = vdr * WARP_SIZE / qi;

-    const int qi_vdr = (qi / vdr); // N_threads processing 1 qk block
-
-    // partial sum for each thread
+// partial sum for each thread
    float tmp = 0.0f;

    const block_q_t  * x = (const block_q_t  *) vx;
    const block_q8_1 * y = (const block_q8_1 *) vy;

-    for (int i = item_ct1.get_local_id(2) / qi_vdr; i < blocks_per_row;
+    for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
         i += blocks_per_warp) {
-      const int ibx = row * blocks_per_row + i; // x block index
+        const int ibx = row*blocks_per_row + i; // x block index

-      const int iby = i * (qk / QK8_1); // y block index that aligns with ibx
+        const int iby = i * (qk/QK8_1); // y block index that aligns with ibx

-      const int iqs =
-          vdr *
-          (item_ct1.get_local_id(2) -
-           i * qi_vdr); // x block quant index when casting the quants to int
+        const int iqs =
+            vdr *
+            (item_ct1.get_local_id(2) %
+             (qi / vdr)); // x block quant index when casting the quants to int

-      tmp += vec_dot_q_sycl(&x[ibx], &y[iby], iqs);
+        tmp += vec_dot_q_sycl(&x[ibx], &y[iby], iqs);
    }

    // sum up partial sums and write back result
--- a/ggml-vulkan-shaders.hpp
+++ b/ggml-vulkan-shaders.hpp
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
--- a/ggml_vk_generate_shaders.py
+++ b/ggml_vk_generate_shaders.py
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -860,7 +860,7 @@ class GGUFValueType(IntEnum):
 # Note: Does not support GGML_QKK_64
 QK_K = 256
 # Items here are (block size, type size)
-GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = {
+GGML_QUANT_SIZES = {
    GGMLQuantizationType.F32:     (1, 4),
    GGMLQuantizationType.F16:     (1, 2),
    GGMLQuantizationType.Q4_0:    (32, 2 + 16),
--- a/gguf-py/gguf/gguf_reader.py
+++ b/gguf-py/gguf/gguf_reader.py
@@ -65,7 +65,7 @@ class ReaderTensor(NamedTuple):

 class GGUFReader:
    # I - same as host, S - swapped
-    byte_order: Literal['I'] | Literal['S'] = 'I'
+    byte_order: Literal['I' | 'S'] = 'I'
    alignment: int = GGUF_DEFAULT_ALIGNMENT

    # Note: Internal helper, API may change.
@@ -83,7 +83,7 @@ class GGUFReader:
        GGUFValueType.BOOL:    np.bool_,
    }

-    def __init__(self, path: os.PathLike[str] | str, mode: Literal['r'] | Literal['r+'] | Literal['c'] = 'r'):
+    def __init__(self, path: os.PathLike[str] | str, mode: Literal['r' | 'r+' | 'c'] = 'r'):
        self.data = np.memmap(path, mode = mode)
        offs = 0
        if self._get(offs, np.uint32, override_order = '<')[0] != GGUF_MAGIC:
@@ -128,7 +128,7 @@ class GGUFReader:
        return self.tensors[idx]

    def _get(
-        self, offset: int, dtype: npt.DTypeLike, count: int = 1, override_order: None | Literal['I'] | Literal['S'] | Literal['<'] = None,
+        self, offset: int, dtype: npt.DTypeLike, count: int = 1, override_order: None | Literal['I' | 'S' | '<'] = None,
    ) -> npt.NDArray[Any]:
        count = int(count)
        itemsize = int(np.empty([], dtype = dtype).itemsize)
@@ -250,7 +250,7 @@ class GGUFReader:
                raise ValueError(f'Found duplicated tensor with name {tensor_name}')
            tensor_names.add(tensor_name)
            ggml_type = GGMLQuantizationType(raw_dtype[0])
-            n_elems = int(np.prod(dims))
+            n_elems = np.prod(dims)
            block_size, type_size = GGML_QUANT_SIZES[ggml_type]
            n_bytes = n_elems * type_size // block_size
            data_offs = int(start_offs + offset_tensor[0])
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -7,7 +7,7 @@ import struct
 import tempfile
 from enum import Enum, auto
 from io import BufferedWriter
-from typing import IO, Any, Callable, Sequence, Mapping
+from typing import IO, Any, Sequence, Mapping
 from string import ascii_letters, digits

 import numpy as np
@@ -28,47 +28,6 @@ from .constants import (
 logger = logging.getLogger(__name__)


-class LazyTensor:
-    data: Callable[[], np.ndarray[Any, Any]]
-    # to avoid too deep recursion
-    functions: list[Callable[[np.ndarray[Any, Any]], np.ndarray[Any, Any]]]
-    dtype: np.dtype[Any]
-    shape: tuple[int, ...]
-
-    def __init__(self, data: Callable[[], np.ndarray[Any, Any]], *, dtype: type, shape: tuple[int, ...]):
-        self.data = data
-        self.functions = []
-        self.dtype = np.dtype(dtype)
-        self.shape = shape
-
-    def astype(self, dtype: type, **kwargs) -> LazyTensor:
-        self.functions.append(lambda n: n.astype(dtype, **kwargs))
-        self.dtype = np.dtype(dtype)
-        return self
-
-    @property
-    def nbytes(self) -> int:
-        size = 1
-        for n in self.shape:
-            size *= n
-        return size * self.dtype.itemsize
-
-    def tofile(self, *args, **kwargs) -> None:
-        data = self.data()
-        for f in self.functions:
-            data = f(data)
-        assert data.shape == self.shape
-        assert data.dtype == self.dtype
-        assert data.nbytes == self.nbytes
-        self.functions = []
-        self.data = lambda: data
-        data.tofile(*args, **kwargs)
-
-    def byteswap(self, *args, **kwargs) -> LazyTensor:
-        self.functions.append(lambda n: n.byteswap(*args, **kwargs))
-        return self
-
-
 class WriterState(Enum):
    EMPTY   = auto()
    HEADER  = auto()
@@ -79,7 +38,7 @@ class WriterState(Enum):
 class GGUFWriter:
    fout: BufferedWriter
    temp_file: tempfile.SpooledTemporaryFile[bytes] | None
-    tensors: list[np.ndarray[Any, Any] | LazyTensor]
+    tensors: list[np.ndarray[Any, Any]]
    _simple_value_packing = {
        GGUFValueType.UINT8:   "B",
        GGUFValueType.INT8:    "b",
@@ -217,7 +176,7 @@ class GGUFWriter:
        if pack_fmt is not None:
            self.kv_data += self._pack(pack_fmt, val, skip_pack_prefix = vtype == GGUFValueType.BOOL)
        elif vtype == GGUFValueType.STRING:
-            encoded_val = val.encode("utf-8") if isinstance(val, str) else val
+            encoded_val = val.encode("utf8") if isinstance(val, str) else val
            self.kv_data += self._pack("Q", len(encoded_val))
            self.kv_data += encoded_val
        elif vtype == GGUFValueType.ARRAY and isinstance(val, Sequence) and val:
@@ -246,7 +205,7 @@ class GGUFWriter:
            raise ValueError(f'Duplicated tensor name {name}')
        self.ti_names.add(name)

-        encoded_name = name.encode("utf-8")
+        encoded_name = name.encode("utf8")
        self.ti_data += self._pack("Q", len(encoded_name))
        self.ti_data += encoded_name
        n_dims = len(tensor_shape)
@@ -278,7 +237,7 @@ class GGUFWriter:
        self.ti_data_count += 1

    def add_tensor(
-        self, name: str, tensor: np.ndarray[Any, Any] | LazyTensor, raw_shape: Sequence[int] | None = None,
+        self, name: str, tensor: np.ndarray[Any, Any], raw_shape: Sequence[int] | None = None,
        raw_dtype: GGMLQuantizationType | None = None,
    ) -> None:
        if self.endianess == GGUFEndian.BIG:
@@ -303,7 +262,7 @@ class GGUFWriter:
        if pad != 0:
            fp.write(bytes([0] * pad))

-    def write_tensor_data(self, tensor: np.ndarray[Any, Any] | LazyTensor) -> None:
+    def write_tensor_data(self, tensor: np.ndarray[Any, Any]) -> None:
        if self.state is not WriterState.TI_DATA:
            raise ValueError(f'Expected output file to contain tensor info, got {self.state}')

@@ -313,33 +272,15 @@ class GGUFWriter:
        tensor.tofile(self.fout)
        self.write_padding(self.fout, tensor.nbytes)

-    def write_tensors_to_file(self, *, progress: bool = False) -> None:
+    def write_tensors_to_file(self) -> None:
        self.write_ti_data_to_file()

        self.write_padding(self.fout, self.fout.tell())

        if self.temp_file is None:
-            self.tensors.reverse()  # to pop from the "beginning" in constant time
-
-            if progress:
-                from tqdm import tqdm
-
-                total_bytes = sum(t.nbytes for t in self.tensors)
-
-                bar = tqdm(desc="Writing", total=total_bytes, unit="byte", unit_scale=True)
-
-                while True:
-                    try:
-                        tensor = self.tensors.pop()
-                    except IndexError:
-                        break
-                    tensor.tofile(self.fout)
-                    bar.update(tensor.nbytes)
-                    self.write_padding(self.fout, tensor.nbytes)
-                return
            while True:
                try:
-                    tensor = self.tensors.pop()
+                    tensor = self.tensors.pop(0)
                except IndexError:
                    break
                tensor.tofile(self.fout)
@@ -538,7 +479,7 @@ class GGUFWriter:
        self.add_bool(Keys.Tokenizer.ADD_PREFIX, value)

    def add_chat_template(self, value: str | Sequence[Mapping[str, str]]) -> None:
-        if not isinstance(value, str):
+        if isinstance(value, list):
            template_default = None
            template_names = set()

--- a/gguf-py/gguf/vocab.py
+++ b/gguf-py/gguf/vocab.py
@@ -4,7 +4,7 @@ import logging
 import json
 import os
 from pathlib import Path
-from typing import Any, Callable, Sequence, Mapping, Iterable
+from typing import Any, Callable

 from .gguf_writer import GGUFWriter

@@ -15,11 +15,11 @@ class SpecialVocab:
    merges: list[str]
    add_special_token: dict[str, bool]
    special_token_ids: dict[str, int]
-    chat_template: str | Sequence[Mapping[str, str]] | None
+    chat_template: str | None

    def __init__(
        self, path: str | os.PathLike[str], load_merges: bool = False,
-        special_token_types: Iterable[str] | None = None,
+        special_token_types: tuple[str, ...] | None = None,
        n_vocab: int | None = None,
    ):
        self.special_token_ids = {}
--- a/gguf-py/pyproject.toml
+++ b/gguf-py/pyproject.toml
@@ -21,7 +21,6 @@ classifiers = [
 [tool.poetry.dependencies]
 python = ">=3.8"
 numpy = ">=1.17"
-tqdm = ">=4.27"

 [tool.poetry.dev-dependencies]
 pytest = "^5.2"
--- a/gguf-py/scripts/gguf-dump.py
+++ b/gguf-py/scripts/gguf-dump.py
@@ -47,7 +47,7 @@ def dump_metadata(reader: GGUFReader, args: argparse.Namespace) -> None:
        if len(field.types) == 1:
            curr_type = field.types[0]
            if curr_type == GGUFValueType.STRING:
-                log_message += ' = {0}'.format(repr(str(bytes(field.parts[-1]), encoding='utf-8')[:60]))
+                log_message += ' = {0}'.format(repr(str(bytes(field.parts[-1]), encoding='utf8')[:60]))
            elif field.types[0] in reader.gguf_scalar_to_np:
                log_message += ' = {0}'.format(field.parts[-1][0])
        print(log_message)  # noqa: NP100
--- a/gguf-py/scripts/gguf-new-metadata.py
+++ b/gguf-py/scripts/gguf-new-metadata.py
@@ -7,8 +7,7 @@ import json
 from pathlib import Path

 import numpy as np
-from tqdm import tqdm
-from typing import Any, Sequence, NamedTuple
+from typing import Any, Mapping, Sequence

 # Necessary to load the local gguf package
 if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent / 'gguf-py').exists():
@@ -19,12 +18,6 @@ import gguf
 logger = logging.getLogger("gguf-new-metadata")


-class MetadataDetails(NamedTuple):
-    type: gguf.GGUFValueType
-    value: Any
-    description: str = ''
-
-
 def get_byteorder(reader: gguf.GGUFReader) -> gguf.GGUFEndian:
    if np.uint32(1) == np.uint32(1).newbyteorder("<"):
        # Host is little endian
@@ -41,7 +34,7 @@ def get_byteorder(reader: gguf.GGUFReader) -> gguf.GGUFEndian:
        return host_endian


-def decode_field(field: gguf.ReaderField | None) -> Any:
+def decode_field(field: gguf.ReaderField) -> Any:
    if field and field.types:
        main_type = field.types[0]

@@ -49,11 +42,11 @@ def decode_field(field: gguf.ReaderField | None) -> Any:
            sub_type = field.types[-1]

            if sub_type == gguf.GGUFValueType.STRING:
-                return [str(bytes(field.parts[idx]), encoding='utf-8') for idx in field.data]
+                return [str(bytes(field.parts[idx]), encoding='utf8') for idx in field.data]
            else:
                return [pv for idx in field.data for pv in field.parts[idx].tolist()]
        if main_type == gguf.GGUFValueType.STRING:
-            return str(bytes(field.parts[-1]), encoding='utf-8')
+            return str(bytes(field.parts[-1]), encoding='utf8')
        else:
            return field.parts[-1][0]

@@ -66,16 +59,7 @@ def get_field_data(reader: gguf.GGUFReader, key: str) -> Any:
    return decode_field(field)


-def find_token(token_list: Sequence[int], token: str) -> Sequence[int]:
-    token_ids = [index for index, value in enumerate(token_list) if value == token]
-
-    if len(token_ids) == 0:
-        raise LookupError(f'Unable to find "{token}" in token list!')
-
-    return token_ids
-
-
-def copy_with_new_metadata(reader: gguf.GGUFReader, writer: gguf.GGUFWriter, new_metadata: dict[str, MetadataDetails], remove_metadata: Sequence[str]) -> None:
+def copy_with_new_metadata(reader: gguf.GGUFReader, writer: gguf.GGUFWriter, new_metadata: Mapping[str, str], remove_metadata: Sequence[str]) -> None:
    for field in reader.fields.values():
        # Suppress virtual fields and fields written by GGUFWriter
        if field.name == gguf.Keys.General.ARCHITECTURE or field.name.startswith('GGUF.'):
@@ -91,64 +75,54 @@ def copy_with_new_metadata(reader: gguf.GGUFReader, writer: gguf.GGUFWriter, new
            logger.debug(f'Removing {field.name}')
            continue

-        old_val = MetadataDetails(field.types[0], decode_field(field))
+        old_val = decode_field(field)
        val = new_metadata.get(field.name, old_val)

        if field.name in new_metadata:
-            logger.debug(f'Modifying {field.name}: "{old_val.value}" -> "{val.value}" {val.description}')
+            logger.debug(f'Modifying {field.name}: "{old_val}" -> "{val}"')
            del new_metadata[field.name]
-        elif val.value is not None:
+        elif val is not None:
            logger.debug(f'Copying {field.name}')

-        if val.value is not None:
+        if val is not None:
            writer.add_key(field.name)
-            writer.add_val(val.value, val.type)
+            writer.add_val(val, field.types[0])

    if gguf.Keys.Tokenizer.CHAT_TEMPLATE in new_metadata:
        logger.debug('Adding chat template(s)')
-        writer.add_chat_template(new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE].value)
+        writer.add_chat_template(new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE])
        del new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE]

+    # TODO: Support other types than string?
    for key, val in new_metadata.items():
-        logger.debug(f'Adding {key}: "{val.value}" {val.description}')
+        logger.debug(f'Adding {key}: {val}')
        writer.add_key(key)
-        writer.add_val(val.value, val.type)
-
-    total_bytes = 0
+        writer.add_val(val, gguf.GGUFValueType.STRING)

    for tensor in reader.tensors:
-        total_bytes += tensor.n_bytes
        # Dimensions are written in reverse order, so flip them first
-        shape = np.flipud(tensor.shape).tolist()
+        shape = np.flipud(tensor.shape)
        writer.add_tensor_info(tensor.name, shape, tensor.data.dtype, tensor.data.nbytes, tensor.tensor_type)

-    bar = tqdm(desc="Writing", total=total_bytes, unit="byte", unit_scale=True)
-
    writer.write_header_to_file()
    writer.write_kv_data_to_file()
    writer.write_ti_data_to_file()

    for tensor in reader.tensors:
        writer.write_tensor_data(tensor.data)
-        bar.update(tensor.n_bytes)

    writer.close()


 def main() -> None:
-    tokenizer_metadata = (getattr(gguf.Keys.Tokenizer, n) for n in gguf.Keys.Tokenizer.__dict__.keys() if not n.startswith('_'))
-    token_names = dict((n.split('.')[-1][:-len('_token_id')], n) for n in tokenizer_metadata if n.endswith('_token_id'))
-
    parser = argparse.ArgumentParser(description="Make a copy of a GGUF file with new metadata")
    parser.add_argument("input",                                       type=Path, help="GGUF format model input filename")
    parser.add_argument("output",                                      type=Path, help="GGUF format model output filename")
-    parser.add_argument("--general-name",                              type=str,  help="The models general.name", metavar='"name"')
-    parser.add_argument("--general-description",                       type=str,  help="The models general.description", metavar='"Description ..."')
-    parser.add_argument("--chat-template",                             type=str,  help="Chat template string (or JSON string containing templates)", metavar='"{% ... %} ..."')
-    parser.add_argument("--chat-template-config",                      type=Path, help="Config file containing chat template(s)", metavar='tokenizer_config.json')
-    parser.add_argument("--remove-metadata",      action="append",     type=str,  help="Remove metadata (by key name) from output model", metavar='general.url')
-    parser.add_argument("--special-token",        action="append",     type=str,  help="Special token by value", nargs=2, metavar=(' | '.join(token_names.keys()), '"<token>"'))
-    parser.add_argument("--special-token-by-id",  action="append",     type=str,  help="Special token by id", nargs=2, metavar=(' | '.join(token_names.keys()), '0'))
+    parser.add_argument("--general-name",                              type=str,  help="The models general.name")
+    parser.add_argument("--general-description",                       type=str,  help="The models general.description")
+    parser.add_argument("--chat-template",                             type=str,  help="Chat template string (or JSON string containing templates)")
+    parser.add_argument("--chat-template-config",                      type=Path, help="Config file (tokenizer_config.json) containing chat template(s)")
+    parser.add_argument("--remove-metadata",      action="append",     type=str,  help="Remove metadata (by key name) from output model")
    parser.add_argument("--force",                action="store_true",            help="Bypass warnings without confirmation")
    parser.add_argument("--verbose",              action="store_true",            help="Increase output verbosity")
    args = parser.parse_args(None if len(sys.argv) > 2 else ["--help"])
@@ -159,20 +133,20 @@ def main() -> None:
    remove_metadata = args.remove_metadata or []

    if args.general_name:
-        new_metadata[gguf.Keys.General.NAME] = MetadataDetails(gguf.GGUFValueType.STRING, args.general_name)
+        new_metadata[gguf.Keys.General.NAME] = args.general_name

    if args.general_description:
-        new_metadata[gguf.Keys.General.DESCRIPTION] = MetadataDetails(gguf.GGUFValueType.STRING, args.general_description)
+        new_metadata[gguf.Keys.General.DESCRIPTION] = args.general_description

    if args.chat_template:
-        new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE] = MetadataDetails(gguf.GGUFValueType.STRING, json.loads(args.chat_template) if args.chat_template.startswith('[') else args.chat_template)
+        new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE] = json.loads(args.chat_template) if args.chat_template.startswith('[') else args.chat_template

    if args.chat_template_config:
        with open(args.chat_template_config, 'r') as fp:
            config = json.load(fp)
            template = config.get('chat_template')
            if template:
-                new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE] = MetadataDetails(gguf.GGUFValueType.STRING, template)
+                new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE] = template

    if remove_metadata:
        logger.warning('*** Warning *** Warning *** Warning **')
@@ -192,32 +166,6 @@ def main() -> None:
    arch = get_field_data(reader, gguf.Keys.General.ARCHITECTURE)
    endianess = get_byteorder(reader)

-    token_list = get_field_data(reader, gguf.Keys.Tokenizer.LIST) or []
-
-    for name, token in args.special_token or []:
-        if name not in token_names:
-            logger.warning(f'Unknown special token "{name}", ignoring...')
-        else:
-            ids = find_token(token_list, token)
-            new_metadata[token_names[name]] = MetadataDetails(gguf.GGUFValueType.UINT32, ids[0], f'= {token}')
-
-            if len(ids) > 1:
-                logger.warning(f'Multiple "{token}" tokens found, choosing ID {ids[0]}, use --special-token-by-id if you want another:')
-                logger.warning(', '.join(str(i) for i in ids))
-
-    for name, id_string in args.special_token_by_id or []:
-        if name not in token_names:
-            logger.warning(f'Unknown special token "{name}", ignoring...')
-        elif not id_string.isdecimal():
-            raise LookupError(f'Token ID "{id_string}" is not a valid ID!')
-        else:
-            id_int = int(id_string)
-
-            if id_int >= 0 and id_int < len(token_list):
-                new_metadata[token_names[name]] = MetadataDetails(gguf.GGUFValueType.UINT32, id_int, f'= {token_list[id_int]}')
-            else:
-                raise LookupError(f'Token ID {id_int} is not within token list!')
-
    if os.path.isfile(args.output) and not args.force:
        logger.warning('*** Warning *** Warning *** Warning **')
        logger.warning(f'* The "{args.output}" GGUF file already exists, it will be overwritten!')
--- a/llama.cpp
+++ b/llama.cpp
@@ -3860,7 +3860,7 @@ static void llm_load_hparams(
                    switch (hparams.n_layer) {
                        case 22: model.type = e_model::MODEL_1B; break;
                        case 26: model.type = e_model::MODEL_3B; break;
-                        case 32: model.type = hparams.n_vocab < 40000 ? e_model::MODEL_7B : e_model::MODEL_8B; break;
+                        case 32: model.type = hparams.n_head == hparams.n_head_kv ? e_model::MODEL_7B : e_model::MODEL_8B; break; // LLaMa 8B v3 uses GQA
                        case 40: model.type = e_model::MODEL_13B; break;
                        case 48: model.type = e_model::MODEL_34B; break;
                        case 60: model.type = e_model::MODEL_30B; break;
@@ -4391,15 +4391,9 @@ static void llm_load_vocab(
            } else if (
                tokenizer_pre == "command-r") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_COMMAND_R;
-            } else if (
-                tokenizer_pre == "qwen2") {
-                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
            } else if (
                tokenizer_pre == "olmo") {
                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_OLMO;
-            } else if (
-                tokenizer_pre == "dbrx") {
-                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DBRX;
            } else {
                throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
            }
@@ -12206,7 +12200,6 @@ struct llm_tokenizer_bpe {
            case LLAMA_VOCAB_TYPE_BPE:
                switch (vocab.type_pre) {
                    case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
-                    case LLAMA_VOCAB_PRE_TYPE_DBRX:
                        word_collection = unicode_regex_split(text, {
                            // original regex from tokenizer.json
                            //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
@@ -12266,13 +12259,6 @@ struct llm_tokenizer_bpe {
                            "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                        });
                        break;
-                    case LLAMA_VOCAB_PRE_TYPE_QWEN2:
-                        word_collection = unicode_regex_split(text, {
-                            // original regex from tokenizer.json
-                            // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
-                            "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
-                        });
-                        break;
                    default:
                        // default regex for BPE tokenization pre-processing
                        word_collection = unicode_regex_split(text, {
@@ -12488,7 +12474,7 @@ struct llm_tokenizer_wpm {
                continue;
            }
            code = unicode_tolower(code);
-            if (type == CODEPOINT_TYPE_SEPARATOR) {
+            if (type == CODEPOINT_TYPE_WHITESPACE) {
                code = ' ';
            }
            std::string s = unicode_cpt_to_utf8(code);
@@ -15519,6 +15505,13 @@ struct llama_context * llama_new_context_with_model(
        cparams.flash_attn = false;
    }

+#ifdef GGML_USE_HIPBLAS
+    if (cparams.flash_attn) {
+        LLAMA_LOG_WARN("%s: flash_attn is not yet compatible with HIPBLAS builds - forcing off\n", __func__);
+        cparams.flash_attn = false;
+    }
+#endif
+
    if (params.seed == LLAMA_DEFAULT_SEED) {
        params.seed = time(NULL);
    }
@@ -17872,7 +17865,7 @@ struct llama_timings llama_get_timings(struct llama_context * ctx) {
        /*.t_eval_ms   =*/ 1e-3 * ctx->t_eval_us,

        /*.n_sample =*/ std::max(1, ctx->n_sample),
-        /*.n_p_eval =*/ std::max(0, ctx->n_p_eval),
+        /*.n_p_eval =*/ std::max(1, ctx->n_p_eval),
        /*.n_eval   =*/ std::max(1, ctx->n_eval),
    };

--- a/llama.h
+++ b/llama.h
@@ -81,9 +81,7 @@ extern "C" {
        LLAMA_VOCAB_PRE_TYPE_GPT2           = 7,
        LLAMA_VOCAB_PRE_TYPE_REFACT         = 8,
        LLAMA_VOCAB_PRE_TYPE_COMMAND_R      = 9,
-        LLAMA_VOCAB_PRE_TYPE_QWEN2          = 10,
-        LLAMA_VOCAB_PRE_TYPE_OLMO           = 11,
-        LLAMA_VOCAB_PRE_TYPE_DBRX           = 12,
+        LLAMA_VOCAB_PRE_TYPE_OLMO           = 10,
    };

    // note: these values should be synchronized with ggml_rope
--- a/models/ggml-vocab-aquila.gguf
+++ b/models/ggml-vocab-aquila.gguf
--- a/models/ggml-vocab-baichuan.gguf
+++ b/models/ggml-vocab-baichuan.gguf
--- a/models/ggml-vocab-bert-bge.gguf
+++ b/models/ggml-vocab-bert-bge.gguf
--- a/models/ggml-vocab-command-r.gguf
+++ b/models/ggml-vocab-command-r.gguf
--- a/models/ggml-vocab-deepseek-coder.gguf
+++ b/models/ggml-vocab-deepseek-coder.gguf
--- a/models/ggml-vocab-deepseek-llm.gguf
+++ b/models/ggml-vocab-deepseek-llm.gguf
--- a/models/ggml-vocab-falcon.gguf
+++ b/models/ggml-vocab-falcon.gguf
--- a/models/ggml-vocab-gpt-2.gguf
+++ b/models/ggml-vocab-gpt-2.gguf
--- a/models/ggml-vocab-gpt-neox.gguf
+++ b/models/ggml-vocab-gpt-neox.gguf
--- a/models/ggml-vocab-gpt2.gguf
+++ b/models/ggml-vocab-gpt2.gguf
--- a/models/ggml-vocab-llama-bpe.gguf
+++ b/models/ggml-vocab-llama-bpe.gguf
--- a/models/ggml-vocab-llama-spm.gguf
+++ b/models/ggml-vocab-llama-spm.gguf
--- a/models/ggml-vocab-mpt.gguf
+++ b/models/ggml-vocab-mpt.gguf
--- a/models/ggml-vocab-phi-3.gguf
+++ b/models/ggml-vocab-phi-3.gguf
--- a/models/ggml-vocab-qwen2.gguf
+++ b/models/ggml-vocab-qwen2.gguf
--- a/models/ggml-vocab-qwen2.gguf.inp
+++ b/models/ggml-vocab-qwen2.gguf.inp
@@ -1,106 +0,0 @@
-ied 4 ½ months
-__ggml_vocab_test__
-Führer
-__ggml_vocab_test__
-
-__ggml_vocab_test__
- 
-__ggml_vocab_test__
-  
-__ggml_vocab_test__
-   
-__ggml_vocab_test__
-	
-__ggml_vocab_test__
-
-
-__ggml_vocab_test__
-
-
-
-__ggml_vocab_test__
-
-
-
-
-__ggml_vocab_test__
-	
-
-__ggml_vocab_test__
-Hello world
-__ggml_vocab_test__
- Hello world
-__ggml_vocab_test__
-Hello World
-__ggml_vocab_test__
- Hello World
-__ggml_vocab_test__
- Hello World!
-__ggml_vocab_test__
-Hello, world!
-__ggml_vocab_test__
- Hello, world!
-__ggml_vocab_test__
- this is 🦙.cpp
-__ggml_vocab_test__
-w048 7tuijk dsdfhu
-__ggml_vocab_test__
-нещо на Български
-__ggml_vocab_test__
-កាន់តែពិសេសអាចខលចេញ
-__ggml_vocab_test__
-🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
-__ggml_vocab_test__
-Hello
-__ggml_vocab_test__
- Hello
-__ggml_vocab_test__
-  Hello
-__ggml_vocab_test__
-   Hello
-__ggml_vocab_test__
-    Hello
-__ggml_vocab_test__
-    Hello
-    Hello
-__ggml_vocab_test__
- (
-__ggml_vocab_test__
-
- =
-__ggml_vocab_test__
-' era
-__ggml_vocab_test__
-Hello, y'all! How are you 😁 ?我想在apple工作1314151天～
-__ggml_vocab_test__
-3
-__ggml_vocab_test__
-33
-__ggml_vocab_test__
-333
-__ggml_vocab_test__
-3333
-__ggml_vocab_test__
-33333
-__ggml_vocab_test__
-333333
-__ggml_vocab_test__
-3333333
-__ggml_vocab_test__
-33333333
-__ggml_vocab_test__
-333333333
-__ggml_vocab_test__
-
- 
-
- 
-
-
- 	 		 	
-  
-   
-    
-     
-🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
-__ggml_vocab_test__
--- a/models/ggml-vocab-qwen2.gguf.out
+++ b/models/ggml-vocab-qwen2.gguf.out
@@ -1,43 +0,0 @@
- 1122 220 19 220 26062 3951
- 37 50753 261
-
- 220
- 256
- 262
- 197
- 198
- 271
- 1406
- 1572
- 9707 1879
- 21927 1879
- 9707 4337
- 21927 4337
- 21927 4337 0
- 9707 11 1879 0
- 21927 11 1879 0
- 419 374 11162 99 247 13 10821
- 86 15 19 23 220 22 83 1963 41808 11472 2940 16739
- 78762 14144 1456 13073 63471 33594 3038 133178 79012
- 146394 97529 241 44258 233 146568 44258 224 147603 20879 115 146280 44258 223 146280 147272 97529 227 147805 148301 147270 44258 223 146848
- 145836 320 8252 8 26525 114 378 235 149921 30543 320 35673 99066 97534 8 25521 227 320 3243 42365 429 702 1181 1828 3950 8
- 9707
- 21927
- 220 21927
- 256 21927
- 262 21927
- 262 21927 198 262 21927
- 320
- 198 284
- 6 11385
- 9707 11 379 64848 0 2585 525 498 26525 223 937 104100 18493 22377 99257 16 18 16 19 16 20 16 35727 21216
- 18
- 18 18
- 18 18 18
- 18 18 18 18
- 18 18 18 18 18
- 18 18 18 18 18 18
- 18 18 18 18 18 18 18
- 18 18 18 18 18 18 18 18
- 18 18 18 18 18 18 18 18 18
- 198 4710 14731 65497 7847 1572 2303 78672 10947 145836 320 8252 8 26525 114 378 235 149921 30543 320 35673 99066 97534 8 25521 227 11162 99 247 149955 220 18 220 18 18 220 18 18 18 220 18 18 18 18 220 18 18 18 18 18 220 18 18 18 18 18 18 220 18 18 18 18 18 18 18 220 18 18 18 18 18 18 18 18 220 18 13 18 220 18 496 18 220 18 1112 18 220 146394 97529 241 44258 233 146568 44258 224 147603 20879 115 146280 44258 223 146280 147272 97529 227 144534 937 104100 18493 22377 99257 16 18 16 19 16 20 16 35727 21216 55460 53237 18658 14144 1456 13073 63471 33594 3038 133178 79012 3355 4605 4605 13874 13874 73594 3014 3014 28149 17085 2928 26610 7646 358 3003 1012 364 83 813 566 594 1052 11 364 787 498 2704 30 364 44 537 2704 358 3278 1281 432 11 364 35 498 1075 1045 15243 30 1205 6 42612 264 63866 43
--- a/models/ggml-vocab-refact.gguf
+++ b/models/ggml-vocab-refact.gguf
--- a/models/ggml-vocab-stablelm.gguf
+++ b/models/ggml-vocab-stablelm.gguf
--- a/models/ggml-vocab-starcoder.gguf
+++ b/models/ggml-vocab-starcoder.gguf
--- a/pyrightconfig.json
+++ b/pyrightconfig.json
@@ -1,3 +0,0 @@
-{
-  "extraPaths": ["gguf-py"],
-}
--- a/requirements/requirements-convert-hf-to-gguf-update.txt
+++ b/requirements/requirements-convert-hf-to-gguf-update.txt
@@ -1,2 +1,3 @@
 -r ./requirements-convert.txt
 torch~=2.1.1
+einops~=0.7.0
--- a/requirements/requirements-convert-hf-to-gguf.txt
+++ b/requirements/requirements-convert-hf-to-gguf.txt
@@ -1,2 +1,3 @@
 -r ./requirements-convert.txt
 torch~=2.1.1
+einops~=0.7.0
--- a/requirements/requirements-convert.txt
+++ b/requirements/requirements-convert.txt
@@ -1,5 +1,5 @@
 numpy~=1.24.4
-sentencepiece~=0.2.0
+sentencepiece~=0.1.98
 transformers>=4.40.1,<5.0.0
 gguf>=0.1.0
 protobuf>=4.21.0,<5.0.0
--- a/scripts/compare-llama-bench.py
+++ b/scripts/compare-llama-bench.py
@@ -93,14 +93,11 @@ help_s = (
    "specified values are averaged WITHOUT weighing by the --repetitions parameter of llama-bench."
 )
 parser.add_argument("-s", "--show", help=help_s)
-parser.add_argument("--verbose", action="store_true", help="increase output verbosity")

 known_args, unknown_args = parser.parse_known_args()

-logging.basicConfig(level=logging.DEBUG if known_args.verbose else logging.INFO)
-
 if unknown_args:
-    logger.error(f"Received unknown args: {unknown_args}.\n")
+    logger.error(f"Received unknown args: {unknown_args}.")
    parser.print_help()
    sys.exit(1)

@@ -113,7 +110,7 @@ if input_file is None:
        input_file = sqlite_files[0]

 if input_file is None:
-    logger.error("Cannot find a suitable input file, please provide one.\n")
+    logger.error("Cannot find a suitable input file, please provide one.")
    parser.print_help()
    sys.exit(1)

@@ -205,12 +202,12 @@ elif repo is not None:
    hexsha8_baseline = find_parent_in_data(repo.heads.master.commit)

    if hexsha8_baseline is None:
-        logger.error("No baseline was provided and did not find data for any master branch commits.\n")
+        logger.error("No baseline was provided and did not find data for any master branch commits.")
        parser.print_help()
        sys.exit(1)
 else:
    logger.error("No baseline was provided and the current working directory "
-                 "is not part of a git repository from which a baseline could be inferred.\n")
+                 "is not part of a git repository from which a baseline could be inferred.")
    parser.print_help()
    sys.exit(1)

@@ -241,7 +238,7 @@ elif repo is not None:
            break

    if hexsha8_compare is None:
-        logger.error("No compare target was provided and did not find data for any non-master commits.\n")
+        logger.error("No compare target was provided and did not find data for any non-master commits.")
        parser.print_help()
        sys.exit(1)
 else:
@@ -364,7 +361,7 @@ if "gpu_info" in show:
 headers  = [PRETTY_NAMES[p] for p in show]
 headers += ["Test", f"t/s {name_baseline}", f"t/s {name_compare}", "Speedup"]

-print(tabulate( # noqa: NP100
+logger.info(tabulate(
    table,
    headers=headers,
    floatfmt=".2f",
--- a/scripts/gen-unicode-data.py
+++ b/scripts/gen-unicode-data.py
@@ -1,14 +1,31 @@
 import regex


+def cpt_to_utf8_str(cpt):
+    if cpt <= 0xFF:
+        return bytes([cpt, 0, 0, 0])
+    elif cpt <= 0xFFFF:
+        return bytes([cpt & 0xFF, cpt >> 8, 0, 0])
+    elif cpt <= 0xFFFFFF:
+        return bytes([cpt & 0xFF, (cpt >> 8) & 0xFF, (cpt >> 16) & 0xFF, 0])
+    else:
+        return bytes([cpt & 0xFF, (cpt >> 8) & 0xFF, (cpt >> 16) & 0xFF, cpt >> 24])
+
+
+def is_match(codepoint, regex_expr):
+    try:
+        res = regex.match(regex_expr, cpt_to_utf8_str(codepoint).decode('utf-32'))
+        return res is not None
+    except Exception:
+        return False
+
+
 def get_matches(regex_expr):
-    regex_expr_compiled = regex.compile(regex_expr)
    unicode_ranges = []
    current_range = None

    for codepoint in range(0x110000):
-        char = chr(codepoint)
-        if regex_expr_compiled.match(char):
+        if is_match(codepoint, regex_expr):
            if current_range is None:
                current_range = [codepoint, codepoint]
            else:
@@ -23,42 +40,27 @@ def get_matches(regex_expr):
    return unicode_ranges


-def print_cat(mode, cat, ranges):
-    if mode == "range":
-        print("const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_{} = {{".format(cat)) # noqa: NP100
-    if mode == "map":
-        print("const std::map<uint32_t, uint32_t> unicode_map_{} = {{".format(cat)) # noqa: NP100
-    for i, values in enumerate(ranges):
-        end = ",\n" if (i % 4 == 3 or i + 1 == len(ranges)) else ", "
-        values = ["0x%08X" % value for value in values]
-        print("{" + ", ".join(values) + "}", end=end) # noqa: NP100
+def print_cat(cat, ranges):
+    print("const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_{} = {{".format(cat)) # noqa: NP100
+    cnt = 0
+    for start, end in ranges:
+        if cnt % 4 != 0:
+            print(" ", end="") # noqa: NP100
+        print("{{0x{:08X}, 0x{:08X}}},".format(start, end), end="") # noqa: NP100
+        if cnt % 4 == 3:
+            print("") # noqa: NP100
+        cnt += 1
+
+    if cnt % 4 != 0:
+        print("") # noqa: NP100
    print("};") # noqa: NP100
    print("") # noqa: NP100


-print_cat("range", "number",      get_matches(r'\p{N}'))
-print_cat("range", "letter",      get_matches(r'\p{L}'))
-print_cat("range", "separator",   get_matches(r'\p{Z}'))
-print_cat("range", "accent_mark", get_matches(r'\p{M}'))
-print_cat("range", "punctuation", get_matches(r'\p{P}'))
-print_cat("range", "symbol",      get_matches(r'\p{S}'))
-print_cat("range", "control",     get_matches(r'\p{C}'))
-
-print_cat("range", "whitespace",  get_matches(r'\s'))
-
-
-map_lowercase = []
-map_uppercase = []
-for codepoint in range(0x110000):
-    char = chr(codepoint)
-    lower = ord(char.lower()[0])
-    upper = ord(char.upper()[0])
-    if codepoint != lower:
-        map_lowercase.append((codepoint, lower))
-    if codepoint != upper:
-        map_uppercase.append((codepoint, upper))
-print_cat("map", "lowercase", map_lowercase)
-print_cat("map", "uppercase", map_uppercase)
-
-
-# TODO: generate unicode_map_nfd
+print_cat("number",      get_matches(r'\p{N}'))
+print_cat("letter",      get_matches(r'\p{L}'))
+print_cat("whitespace",  get_matches(r'\p{Z}'))
+print_cat("accent_mark", get_matches(r'\p{M}'))
+print_cat("punctuation", get_matches(r'\p{P}'))
+print_cat("symbol",      get_matches(r'\p{S}'))
+print_cat("control",     get_matches(r'\p{C}'))
--- a/sgemm.cpp
+++ b/sgemm.cpp
@@ -1,3 +1,6 @@
+// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
+// vi: set et ft=c++ ts=4 sts=4 sw=4 fenc=utf-8 :vi
+//
 // Copyright 2024 Mozilla Foundation
 //
 // Permission is hereby granted, free of charge, to any person obtaining
@@ -582,15 +585,15 @@ class tinyBLAS_Q0_ARM {
 };
 #endif // __ARM_FEATURE_DOTPROD

-#if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
+#if defined(__AVX2__) || defined(__AVX512F__)
 template <typename TA, typename TB, typename TC>
-class tinyBLAS_Q0_AVX {
+class tinyBLAS_Q0_AVX2 {
  public:
-    tinyBLAS_Q0_AVX(int64_t k,
-                    const TA *A, int64_t lda,
-                    const TB *B, int64_t ldb,
-                    TC *C, int64_t ldc,
-                    int ith, int nth)
+    tinyBLAS_Q0_AVX2(int64_t k,
+                     const TA *A, int64_t lda,
+                     const TB *B, int64_t ldb,
+                     TC *C, int64_t ldc,
+                     int ith, int nth)
        : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
    }

@@ -725,34 +728,14 @@ class tinyBLAS_Q0_AVX {
            __m256 Cv[RN][RM] = {};
            for (int64_t l = 0; l < k; ++l)
                for (int64_t j = 0; j < RN; ++j)
-                    for (int64_t i = 0; i < RM; ++i) {
-#if defined(__AVX2__)
-                        __m256 udTmp = updot(_mm256_sign_epi8(load(A + lda * (ii + i) + l),
-                                                              load(A + lda * (ii + i) + l)),
-                                             _mm256_sign_epi8(load(B + ldb * (jj + j) + l),
-                                                              load(A + lda * (ii + i) + l)));
-#else
-                        __m128i ali0 = load0(A + lda * (ii + i) + l);
-                        __m128i ali1 = load1(A + lda * (ii + i) + l);
-                        __m128i blj0 = load0(B + ldb * (jj + j) + l);
-                        __m128i blj1 = load1(B + ldb * (jj + j) + l);
-
-                        __m128i sepAA0 = _mm_sign_epi8(ali0, ali0);
-                        __m128i sepAA1 = _mm_sign_epi8(ali1, ali1);
-                        __m128i sepBA0 = _mm_sign_epi8(blj0, ali0);
-                        __m128i sepBA1 = _mm_sign_epi8(blj1, ali1);
-
-                        // updot
-                        const __m128i oneFill = _mm_set1_epi16(1);
-                        __m128i mad0 = _mm_maddubs_epi16(sepAA0, sepBA0);
-                        __m128i mad1 = _mm_maddubs_epi16(sepAA1, sepBA1);
-                        __m256 udTmp = _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_madd_epi16(oneFill, mad1), _mm_madd_epi16(oneFill, mad0)));
-#endif
+                    for (int64_t i = 0; i < RM; ++i)
                        Cv[j][i] = madd(_mm256_set1_ps(unhalf(A[lda * (ii + i) + l].d) *
                                                       unhalf(B[ldb * (jj + j) + l].d)),
-                                                       udTmp,
-                                                       Cv[j][i]);
-                    }
+                                        updot(_mm256_sign_epi8(load(A + lda * (ii + i) + l),
+                                                               load(A + lda * (ii + i) + l)),
+                                              _mm256_sign_epi8(load(B + ldb * (jj + j) + l),
+                                                               load(A + lda * (ii + i) + l))),
+                                        Cv[j][i]);
            for (int64_t j = 0; j < RN; ++j)
                for (int64_t i = 0; i < RM; ++i)
                    C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]);
@@ -763,28 +746,10 @@ class tinyBLAS_Q0_AVX {
        return _mm256_loadu_si256((const __m256i *)b->qs);
    }

-    inline __m128i load0(const block_q8_0 *b) {
-        return _mm_loadu_si128((const __m128i *)b->qs);
-    }
-
-    inline __m128i load1(const block_q8_0 *b) {
-        return _mm_loadu_si128(((const __m128i *)b->qs) + 1);
-    }
-
    inline __m256i load(const block_q4_0 *b) {
        return _mm256_sub_epi8(denibble(b->qs), _mm256_set1_epi8(8));
    }

-    inline __m128i load0(const block_q4_0 *b) {
-        const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs));
-        return _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), x), _mm_set1_epi8(8));
-    }
-
-    inline __m128i load1(const block_q4_0 *b) {
-        const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs));
-        return _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(x, 4)), _mm_set1_epi8(8));
-    }
-
    inline __m256 updot(__m256i u, __m256i s) {
        __m256i res;
 #if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
@@ -812,7 +777,7 @@ class tinyBLAS_Q0_AVX {
    const int ith;
    const int nth;
 };
-#endif // __AVX__
+#endif // __AVX2__

 } // namespace

@@ -963,8 +928,8 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
    case GGML_TYPE_Q8_0: {
        if (Btype != GGML_TYPE_Q8_0)
           return false;
-#if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
-        tinyBLAS_Q0_AVX<block_q8_0, block_q8_0, float> tb{
+#if defined(__AVX2__) || defined(__AVX512F__)
+        tinyBLAS_Q0_AVX2<block_q8_0, block_q8_0, float> tb{
            k, (const block_q8_0 *)A, lda,
            (const block_q8_0 *)B, ldb,
            (float *)C, ldc,
@@ -987,8 +952,8 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
    case GGML_TYPE_Q4_0: {
        if (Btype != GGML_TYPE_Q8_0)
            return false;
-#if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
-        tinyBLAS_Q0_AVX<block_q4_0, block_q8_0, float> tb{
+#if defined(__AVX2__) || defined(__AVX512F__)
+        tinyBLAS_Q0_AVX2<block_q4_0, block_q8_0, float> tb{
            k, (const block_q4_0 *)A, lda,
            (const block_q8_0 *)B, ldb,
            (float *)C, ldc,
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -84,7 +84,6 @@ llama_test(test-tokenizer-0 NAME test-tokenizer-0-starcoder         ARGS ${CMAKE
 llama_test(test-tokenizer-0 NAME test-tokenizer-0-gpt-2             ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-2.gguf)
 llama_test(test-tokenizer-0 NAME test-tokenizer-0-refact            ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
 llama_test(test-tokenizer-0 NAME test-tokenizer-0-command-r         ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-command-r.gguf)
-llama_test(test-tokenizer-0 NAME test-tokenizer-0-qwen2             ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-qwen2.gguf)

 # build test-tokenizer-1-bpe target once and add many tests
 add_executable(test-tokenizer-1-bpe test-tokenizer-1-bpe.cpp)
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -2175,11 +2175,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
    test_cases.emplace_back(new test_timestep_embedding());
    test_cases.emplace_back(new test_leaky_relu());

-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-    for (int hs : { 64, 128, }) { // other head sizes not implemented
-#else
    for (int hs : { 64, 80, 128, 256, }) {
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
        for (int nh : { 32, }) {
            for (int kv : { 512, 1024, }) {
                for (int nb : { 1, 2, 4, 8, }) {
--- a/tests/test-json-schema-to-grammar.cpp
+++ b/tests/test-json-schema-to-grammar.cpp
@@ -2,7 +2,6 @@
 #undef NDEBUG
 #endif

-#include <cassert>
 #include <fstream>
 #include <sstream>
 #include <regex>
--- a/tests/test-tokenizer-random.py
+++ b/tests/test-tokenizer-random.py
@@ -1,295 +0,0 @@
-# Test libllama tokenizer == AutoTokenizer.
-# Brute force random tokens/text generation.
-#
-# Sample usage:
-#
-#   python3 tests/test-tokenizer-random.py ./models/ggml-vocab-llama-bpe.gguf ./models/tokenizers/llama-bpe
-#
-
-import time
-import logging
-import argparse
-import subprocess
-import random
-
-from typing import Iterator
-
-import cffi
-from transformers import AutoTokenizer, PreTrainedTokenizerBase
-
-logger = logging.getLogger("test-tokenizer-random-bpe")
-
-
-class LibLlama:
-
-    DEFAULT_PATH_LLAMA_H = "./llama.h"
-    DEFAULT_PATH_LIBLLAMA = "./build/libllama.so"  # CMakeLists.txt: BUILD_SHARED_LIBS ON
-
-    def __init__(self, path_llama_h: str = None, path_libllama: str = None):
-        path_llama_h = path_llama_h or self.DEFAULT_PATH_LLAMA_H
-        path_libllama = path_libllama or self.DEFAULT_PATH_LIBLLAMA
-        (self.ffi, self.lib) = self._load_libllama_cffi(path_llama_h, path_libllama)
-        self.lib.llama_backend_init()
-
-    def _load_libllama_cffi(self, path_llama_h: str, path_libllama: str):
-        cmd = ["gcc", "-E", "-P", "-D__restrict=", "-D__attribute__(x)=", "-D__asm__(x)=", path_llama_h]
-        res = subprocess.run(cmd, stdout=subprocess.PIPE)
-        assert (res.returncode == 0)
-        source = res.stdout.decode()
-        ffi = cffi.FFI()
-        if True:  # workarounds for pycparser
-            source = "typedef struct { } __builtin_va_list;" + "\n" + source
-            source = source.replace("sizeof (int)",    str(ffi.sizeof("int")))
-            source = source.replace("sizeof (void *)", str(ffi.sizeof("void*")))
-            source = source.replace("sizeof (size_t)", str(ffi.sizeof("size_t")))
-            source = source.replace("sizeof(int32_t)", str(ffi.sizeof("int32_t")))
-        ffi.cdef(source, override=True)
-        lib = ffi.dlopen(path_libllama)
-        return (ffi, lib)
-
-    def model_default_params(self, **kwargs):
-        mparams = self.lib.llama_model_default_params()
-        for k, v in kwargs.items():
-            setattr(mparams, k, v)
-        return mparams
-
-    def context_default_params(self, **kwargs):
-        cparams = self.lib.llama_context_default_params()
-        for k, v in kwargs.items():
-            setattr(cparams, k, v)
-        return cparams
-
-
-class LibLlamaModel:
-
-    def __init__(self, libllama: LibLlama, path_model: str, mparams={}, cparams={}):
-        self.lib = libllama.lib
-        self.ffi = libllama.ffi
-        if isinstance(mparams, dict):
-            mparams = libllama.model_default_params(**mparams)
-        self.model = self.lib.llama_load_model_from_file(path_model.encode(), mparams)
-        if not self.model:
-            raise RuntimeError("error: failed to load model '%s'" % path_model)
-        if isinstance(cparams, dict):
-            cparams = libllama.context_default_params(**cparams)
-        self.ctx = self.lib.llama_new_context_with_model(self.model, cparams)
-        if not self.ctx:
-            raise RuntimeError("error: failed to create context for model '%s'" % path_model)
-        n_tokens_max = self.lib.llama_n_ctx(self.ctx)
-        self.token_ids = self.ffi.new("llama_token[]", n_tokens_max)
-
-    def free(self):
-        if self.ctx:
-            self.lib.llama_free(self.ctx)
-        if self.model:
-            self.lib.llama_free_model(self.model)
-        self.ctx = None
-        self.model = None
-        self.lib = None
-
-    def tokenize(self, text: str, n_tokens_max: int = 0, add_special: bool = False, parse_special: bool = False) -> list[int]:
-        n_tokens_max = n_tokens_max if n_tokens_max > 0 else len(self.token_ids)
-        text = text.encode("utf-8")
-        num = self.lib.llama_tokenize(self.model, text, len(text), self.token_ids, n_tokens_max, add_special, parse_special)
-        if num < 0:
-            return []
-        return list(self.token_ids[0:num])
-
-
-def generator_custom_text() -> Iterator[str]:
-    """General tests"""
-    yield from [
-        "",
-        " ",
-        "  ",
-        "   ",
-        "\t",
-        "\n",
-        "\n\n",
-        "\n\n\n",
-        "\t\n",
-        "Hello world",
-        " Hello world",
-        "Hello World",
-        " Hello World",
-        " Hello World!",
-        "Hello, world!",
-        " Hello, world!",
-        " this is 🦙.cpp",
-        "w048 7tuijk dsdfhu",
-        "нещо на Български",
-        "កាន់តែពិសេសអាចខលចេញ",
-        "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
-        "Hello",
-        " Hello",
-        "  Hello",
-        "   Hello",
-        "    Hello",
-        "    Hello\n    Hello",
-        " (",
-        "\n =",
-        "' era",
-        "Hello, y'all! How are you 😁 ?我想在apple工作1314151天～",
-        "3",
-        "33",
-        "333",
-        "3333",
-        "33333",
-        "333333",
-        "3333333",
-        "33333333",
-        "333333333",
-    ]
-
-
-def generator_custom_text_edge_cases() -> Iterator[str]:
-    """Edge cases found while debugging"""
-    yield from [
-        '\x1f-a',   # unicode_ranges_control, {0x00001C, 0x00001F}
-        '¼-a',      # unicode_ranges_digit, 0x00BC
-        '½-a',      # unicode_ranges_digit, 0x00BD
-        '¾-a',      # unicode_ranges_digit, 0x00BE
-        'a 〇b',    # unicode_ranges_digit, 0x3007
-        'Ⅵ-a',     # unicode_ranges_digit, {0x00002150, 0x0000218F} // Number Forms
-        '\uFEFF//', # unicode_ranges_control, 0xFEFF (BOM)
-        '<s>a'      # TODO: Phi-3 fail
-    ]
-
-
-def generator_random_chars(iterations = 100) -> Iterator[str]:
-    """Brute force random text with simple characters"""
-
-    WHITESPACES = list(" " * 20 + "\n" * 5 + "\r\n" * 5 + "\t" * 5)
-    CHARS = list(set("""
-        ABCDEFGHIJKLMNOPQRSTUVWXYZ
-        abcdefghijklmnopqrstuvwxyz
-        ÁÉÍÓÚÀÈÌÒÙÂÊÎÔÛÄËÏÖÜ
-        áéíóúàèìòùâêîôûäëïöü
-        .-,*/-+ª!"·$%&/()=?¿[]{}<>\\|@#~½¬~;:_
-    """))
-
-    rand = random.Random()
-    for m in range(iterations):
-        rand.seed(m)
-        text = []
-        num_words = rand.randint(300, 400)
-        for i in range(num_words):
-            k = rand.randint(1, 7)
-            word = rand.choices(CHARS, k=k)
-            space = rand.choice(WHITESPACES)
-            text.append("".join(word) + space)
-        yield "".join(text)
-
-
-def generator_random_vocab_chars(tokenizer: PreTrainedTokenizerBase, iterations = 100) -> Iterator[str]:
-    """Brute force random text with vocab characters"""
-
-    vocab_ids = list(tokenizer.vocab.values())
-    vocab_text = tokenizer.decode(vocab_ids, skip_special_tokens=True)
-    vocab_chars = list(set(vocab_text))
-    del vocab_ids, vocab_text
-
-    rand = random.Random()
-    for m in range(iterations):
-        rand.seed(m)
-        text = rand.choices(vocab_chars, k=1024)
-        yield "".join(text)
-
-
-def generator_random_vocab_tokens(tokenizer: PreTrainedTokenizerBase, iterations = 100) -> Iterator[str]:
-    """Brute force random text from vocab tokens"""
-
-    space_id = tokenizer.encode(" ", add_special_tokens=False)[0]
-    vocab_ids = list(tokenizer.vocab.values())
-    vocab_ids = list(sorted(vocab_ids + vocab_ids))
-    for i in range(1, len(vocab_ids), 2):
-        vocab_ids[i] = space_id
-    vocab_tokens = tokenizer.decode(vocab_ids, skip_special_tokens=True)
-    vocab_tokens = vocab_tokens.split(" ")
-    del vocab_ids
-
-    yield from vocab_tokens
-
-    rand = random.Random()
-    for m in range(iterations):
-        rand.seed(m)
-        text = []
-        num_words = rand.randint(300, 400)
-        for i in range(num_words):
-            k = rand.randint(1, 3)
-            tokens = rand.choices(vocab_tokens, k=k)
-            tokens = [t.strip(" \n\r\t") for t in tokens]
-            sep = rand.choice("     \n\r\t")
-            text.append("".join(tokens) + sep)
-        yield "".join(text)
-
-
-def generator_random_bytes(iterations = 100) -> Iterator[str]:
-    """Brute force random bytes"""
-
-    WHITESPACES = list(" " * 20 + "\n" * 5 + "\r\n" * 5 + "\t" * 5)
-
-    rand = random.Random()
-    for m in range(iterations):
-        rand.seed(m)
-        text = []
-        num_words = rand.randint(300, 400)
-        for i in range(num_words):
-            k = rand.randint(1, 8)
-            word = [chr(r) for r in rand.randbytes(k) if r]
-            word.append(rand.choice(WHITESPACES))
-            text.append("".join(word))
-        yield "".join(text)
-
-
-def test_compare_tokenizer(model: LibLlamaModel, tokenizer: PreTrainedTokenizerBase, generator: Iterator[str]):
-
-    def find_first_mismatch(ids1: list[int], ids2: list[int]):
-        for i, (a,b) in enumerate(zip(ids1, ids2)):
-            if a != b:
-                return i
-        if len(ids1) == len(ids2):
-            return -1
-        return min(len(ids1), len(ids2))
-
-    t0 = time.perf_counter()
-    logger.info("%s: %s" % (generator.__name__, "ini"))
-    for text in generator:
-        ids1 = model.tokenize(text, add_special=False, parse_special=False)
-        ids2 = tokenizer.encode(text, add_special_tokens=False)
-        if ids1 != ids2:
-            i = find_first_mismatch(ids1, ids2)
-            ids1 = list(ids1)[max(0, i - 2) : i + 2 + 1]
-            ids2 = list(ids2)[max(0, i - 2) : i + 2 + 1]
-            text2 = tokenizer.decode(ids2, skip_special_tokens=True)
-            assert (text2 in text)
-            logger.info(" Text:     " + repr(text2))
-            logger.info(" TokenIDs: " + str(ids1))
-            logger.info(" Expected: " + str(ids2))
-            raise Exception()
-    t1 = time.perf_counter()
-    logger.info("%s: end, time: %.3f secs" % (generator.__name__, t1 - t0))
-
-
-if __name__ == "__main__":
-
-    parser = argparse.ArgumentParser()
-    parser.add_argument("vocab_file", help="path to vocab 'gguf' file")
-    parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file")
-    parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
-    args = parser.parse_args()
-
-    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
-
-    model = LibLlamaModel(LibLlama(), args.vocab_file, mparams=dict(vocab_only=True), cparams=dict(n_ctx=2048))
-
-    tokenizer = AutoTokenizer.from_pretrained(args.dir_tokenizer)
-
-    test_compare_tokenizer(model, tokenizer, generator_custom_text())
-    test_compare_tokenizer(model, tokenizer, generator_custom_text_edge_cases())
-    test_compare_tokenizer(model, tokenizer, generator_random_chars(10_000))
-    test_compare_tokenizer(model, tokenizer, generator_random_vocab_chars(tokenizer, 10_000))
-    test_compare_tokenizer(model, tokenizer, generator_random_vocab_tokens(tokenizer, 10_000))
-    # test_compare_tokenizer(model, tokenizer, generator_random_bytes(10_000)) # FAIL
-
-    model.free()
--- a/unicode-data.cpp
+++ b/unicode-data.cpp
--- a/unicode-data.h
+++ b/unicode-data.h
@@ -7,7 +7,6 @@

 extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_number;
 extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_letter;
-extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_separator;
 extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_whitespace;
 extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_accent_mark;
 extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_punctuation;
--- a/unicode.cpp
+++ b/unicode.cpp
@@ -9,7 +9,6 @@
 #include <stdexcept>
 #include <string>
 #include <unordered_map>
-#include <unordered_set>
 #include <utility>
 #include <vector>
 #include <locale>
@@ -112,27 +111,27 @@ static uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset)
 static std::unordered_map<uint32_t, int> unicode_cpt_type_map() {
    std::unordered_map<uint32_t, int> cpt_types;
    for (auto p : unicode_ranges_number) {
-        for (auto i = p.first; i <= p.second; ++i) {
+        for (auto i = p.first; i <= p.second; ++ i) {
            cpt_types[i] = CODEPOINT_TYPE_NUMBER;
        }
    }
    for (auto p : unicode_ranges_letter) {
-        for (auto i = p.first; i <= p.second; ++i) {
+        for (auto i = p.first; i <= p.second; ++ i) {
            cpt_types[i] = CODEPOINT_TYPE_LETTER;
        }
    }
-    for (auto p : unicode_ranges_separator) {
-        for (auto i = p.first; i <= p.second; ++i) {
-            cpt_types[i] = CODEPOINT_TYPE_SEPARATOR;
+    for (auto p : unicode_ranges_whitespace) {
+        for (auto i = p.first; i <= p.second; ++ i) {
+            cpt_types[i] = CODEPOINT_TYPE_WHITESPACE;
        }
    }
    for (auto p : unicode_ranges_accent_mark) {
-        for (auto i = p.first; i <= p.second; ++i) {
+        for (auto i = p.first; i <= p.second; ++ i) {
            cpt_types[i] = CODEPOINT_TYPE_ACCENT_MARK;
        }
    }
    for (auto p : unicode_ranges_punctuation) {
-        for (auto i = p.first; i <= p.second; ++i) {
+        for (auto i = p.first; i <= p.second; ++ i) {
            cpt_types[i] = CODEPOINT_TYPE_PUNCTUATION;
        }
    }
@@ -142,7 +141,7 @@ static std::unordered_map<uint32_t, int> unicode_cpt_type_map() {
        }
    }
    for (auto p : unicode_ranges_control) {
-        for (auto i = p.first; i <= p.second; ++i) {
+        for (auto i = p.first; i <= p.second; ++ i) {
            cpt_types[i] = CODEPOINT_TYPE_CONTROL;
        }
    }
@@ -225,256 +224,138 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
    std::vector<size_t> bpe_offsets; // store the offset of each word
    bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size

-    const auto cpts = unicode_cpts_from_utf8(text);
-
    size_t start = 0;
-    for (auto offset : offsets) {
-        const size_t offset_ini = start;
-        const size_t offset_end = start + offset;
-        assert(offset_end <= cpts.size());
-        start = offset_end;
-
-        auto _get_cpt = [&] (const size_t pos) -> char32_t {
-            return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : 0;
-        };
-
-        auto _get_cpt_type = [&] (const size_t pos) -> int {
-            return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_type(cpts[pos]) : CODEPOINT_TYPE_UNIDENTIFIED;
-        };
-
-        size_t _prev_end = offset_ini;
-        auto _add_token = [&] (const size_t end) -> size_t {
-            assert(_prev_end <= end && end <= offset_end);
-            size_t len = end - _prev_end;
-            if (len > 0) {
-                bpe_offsets.push_back(len);
-            }
-            _prev_end = end;
-            //if (len > 0) {
-            //    std::string s = "";
-            //    for(size_t p = end-len; p < end; p++)
-            //        s += unicode_cpt_to_utf8(cpts[p]);
-            //    printf(">>> '%s'\n", s.c_str());
-            //}
-            return len;
-        };
-
-        for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) {
-            const char32_t cpt = _get_cpt(pos);
-            const int cpt_type = _get_cpt_type(pos);
-
-            // regex: 's|'t|'re|'ve|'m|'ll|'d
-            if (cpt == '\'' && pos+1 < offset_end) {
-                char32_t cpt_next = _get_cpt(pos+1);
-                if (cpt_next == 's' || cpt_next == 't' || cpt_next == 'm' || cpt_next == 'd') {
-                    pos += _add_token(pos+2);
-                    continue;
-                }
-                if (pos+2 < offset_end) {
-                    char32_t cpt_next_next = _get_cpt(pos+2);
-                    if ((cpt_next == 'r' && cpt_next_next == 'e') ||
-                        (cpt_next == 'v' && cpt_next_next == 'e') ||
-                        (cpt_next == 'l' && cpt_next_next == 'l')) {
-                        pos += _add_token(pos+3);
-                        continue;
-                    }
-                }
-            }
-
-            char32_t cpt2 = (cpt == ' ' ? _get_cpt(pos+1) : cpt);
-            int cpt2_type = (cpt == ' ' ? _get_cpt_type(pos+1) : cpt_type);
-            // regex: <space>?\p{L}+
-            if (cpt2_type == CODEPOINT_TYPE_LETTER) {
-                pos += (cpt == ' ');
-                while (cpt2_type == CODEPOINT_TYPE_LETTER) {
-                    cpt2_type = _get_cpt_type(++pos);
-                }
-                _add_token(pos);
-                continue;
-            }
-            // regex: <space>?\p{N}+
-            if (cpt2_type == CODEPOINT_TYPE_NUMBER) {
-                pos += (cpt == ' ');
-                while (cpt2_type == CODEPOINT_TYPE_NUMBER) {
-                    cpt2_type = _get_cpt_type(++pos);
-                }
-                _add_token(pos);
-                continue;
-            }
-            // regex: <space>?[^\s\p{L}\p{N}]+
-            if (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
-                pos += (cpt == ' ');
-                while (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
-                    cpt2_type = _get_cpt_type(++pos);
-                    cpt2 = _get_cpt(pos);
-                }
-                _add_token(pos);
-                continue;
-            }
-
-            size_t num_whitespaces = 0;
-            while (unicode_cpt_is_whitespace(_get_cpt(pos+num_whitespaces))) {
-                num_whitespaces++;
-            }
-
-            // regex: \s+(?!\S)
-            if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != 0) {
-                pos += num_whitespaces - 1;
-                _add_token(pos);
-                continue;
-            }
-
-            // regex: \s+
-            if (num_whitespaces > 0) {
-                pos += num_whitespaces;
-                _add_token(pos);
-                continue;
-            }
-
-            // no matches
-            _add_token(++pos);
-        }
-    }
-
-    return bpe_offsets;
-}
-
-// LLAMA3 system regex: "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"
-static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string & text, const std::vector<size_t> & offsets) {
-    std::vector<size_t> bpe_offsets; // store the offset of each word
-    bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size

    const auto cpts = unicode_cpts_from_utf8(text);

-    size_t start = 0;
    for (auto offset : offsets) {
-        const size_t offset_ini = start;
-        const size_t offset_end = start + offset;
-        assert(offset_end <= cpts.size());
-        start = offset_end;
+        std::string token;

-        auto _get_cpt = [&] (const size_t pos) -> char32_t {
-            return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : 0;
-        };
+        bool collecting_numeric = false;
+        bool collecting_letter = false;
+        bool collecting_special = false;
+        bool collecting_whitespace_lookahead = false;
+        bool collecting = false;

-        auto _get_cpt_type = [&] (const size_t pos) -> int {
-            return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_type(cpts[pos]) : CODEPOINT_TYPE_UNIDENTIFIED;
-        };
+        std::vector<std::string> text_utf;
+        text_utf.reserve(offset);

-        size_t _prev_end = offset_ini;
-        auto _add_token = [&] (const size_t end) -> size_t {
-            assert(_prev_end <= end && end <= offset_end);
-            size_t len = end - _prev_end;
-            if (len > 0) {
-                bpe_offsets.push_back(len);
-            }
-            _prev_end = end;
-            //if (len > 0) {
-            //    std::string s = "";
-            //    for(size_t p = end-len; p < end; p++)
-            //        s += unicode_cpt_to_utf8(cpts[p]);
-            //    printf(">>> '%s'\n", s.c_str());
-            //}
-            return len;
-        };
-
-        for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) {
-            const char32_t cpt = _get_cpt(pos);
-            const int cpt_type = _get_cpt_type(pos);
-
-            // regex: (?i:'s|'t|'re|'ve|'m|'ll|'d) // case insensitive
-            if (cpt == '\'' && pos+1 < offset_end) {
-                char32_t cpt_next = unicode_tolower(_get_cpt(pos+1));
-                if (cpt_next == 's' || cpt_next == 't' || cpt_next == 'm' || cpt_next == 'd') {
-                    pos += _add_token(pos+2);
-                    continue;
-                }
-                if (pos+2 < offset_end) {
-                    char32_t cpt_next_next = unicode_tolower(_get_cpt(pos+2));
-                    if ((cpt_next == 'r' && cpt_next_next == 'e') ||
-                        (cpt_next == 'v' && cpt_next_next == 'e') ||
-                        (cpt_next == 'l' && cpt_next_next == 'l')) {
-                        pos += _add_token(pos+3);
-                        continue;
-                    }
-                }
-            }
-
-            // regex: [^\r\n\p{L}\p{N}]?\p{L}+  //####FIXME: the first \p{L} is correct?
-            if (cpt != '\r' && cpt != '\n' && /*cpt_type != CODEPOINT_TYPE_LETTER &&*/ cpt_type != CODEPOINT_TYPE_NUMBER) {
-                if (cpt_type == CODEPOINT_TYPE_LETTER || _get_cpt_type(pos+1) == CODEPOINT_TYPE_LETTER) {  // one or more letters
-                    pos++;
-                    while (_get_cpt_type(pos) == CODEPOINT_TYPE_LETTER) {
-                        pos++;
-                    }
-                    _add_token(pos);
-                    continue;
-                }
-            }
-
-            // regex: \p{N}{1,3}
-            if (cpt_type == CODEPOINT_TYPE_NUMBER) {
-                size_t ini = pos;
-                while (_get_cpt_type(pos) == CODEPOINT_TYPE_NUMBER) {
-                    if (++pos - ini >= 3 ) {
-                        _add_token(pos);
-                        ini = pos;
-                    }
-                }
-                _add_token(pos);
-                continue;
-            }
-
-            // regex: <space>?[^\s\p{L}\p{N}]+[\r\n]*
-            char32_t cpt2 = (cpt == ' ' ? _get_cpt(pos+1) : cpt);
-            int cpt2_type = (cpt == ' ' ? _get_cpt_type(pos+1) : cpt_type);
-            if (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
-                pos += (cpt == ' ');
-                while (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
-                    cpt2_type = _get_cpt_type(++pos);
-                    cpt2 = _get_cpt(pos);
-                }
-                while (cpt2 == '\r' || cpt2 == '\n') {
-                    cpt2 = _get_cpt(++pos);
-                }
-                _add_token(pos);
-                continue;
-            }
-
-            size_t num_whitespaces = 0;
-            size_t last_end_r_or_n = 0;
-            while (unicode_cpt_is_whitespace(_get_cpt(pos+num_whitespaces))) {
-                char32_t cpt2 = _get_cpt(pos+num_whitespaces);
-                if (cpt2 == '\r' || cpt2 == '\n') {
-                    last_end_r_or_n = pos + num_whitespaces + 1;
-                }
-                num_whitespaces++;
-            }
-
-            // regex: \s*[\r\n]+
-            if (last_end_r_or_n > 0) {
-                pos = last_end_r_or_n;
-                _add_token(pos);
-                continue;
-            }
-
-            // regex: \s+(?!\S)
-            if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != 0) {
-                pos += num_whitespaces - 1;
-                _add_token(pos);
-                continue;
-            }
-
-            // regex: \s+
-            if (num_whitespaces > 0) {
-                pos += num_whitespaces;
-                _add_token(pos);
-                continue;
-            }
-
-            // no matches
-            _add_token(++pos);
+        for (size_t i = start; i < start + offset; ++i) {
+            text_utf.emplace_back(unicode_cpt_to_utf8(cpts[i]));
        }
+
+        for (int i = 0; i < (int)text_utf.size(); i++) {
+            const std::string & utf_char = text_utf[i];
+            bool split_condition = false;
+            int bytes_remain = text_utf.size() - i;
+
+            // forward backward lookups
+            const std::string & utf_char_next      = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : "";
+            const std::string & utf_char_next_next = (i + 2 < (int)text_utf.size()) ? text_utf[i + 2] : "";
+
+            // handling contractions
+            if (!split_condition && bytes_remain >= 2) {
+                // 's|'t|'m|'d
+                if (utf_char == "\'" && (utf_char_next == "s" || utf_char_next == "t" || utf_char_next == "m" || utf_char_next == "d")) {
+                    split_condition = true;
+                }
+                if (split_condition) {
+                    if (token.size()) {
+                        bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size());
+                    }
+                    token = utf_char + utf_char_next;
+                    bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size());
+                    token = "";
+                    i++;
+                    continue;
+                }
+            }
+            if (!split_condition && bytes_remain >= 3) {
+                // 're|'ve|'ll
+                if (utf_char == "\'" && (
+                    (utf_char_next == "r" && utf_char_next_next == "e") ||
+                    (utf_char_next == "v" && utf_char_next_next == "e") ||
+                    (utf_char_next == "l" && utf_char_next_next == "l"))
+                    ) {
+                    split_condition = true;
+                }
+                if (split_condition) {
+                    // current token + next token can be defined
+                    if (token.size()) {
+                        bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size());
+                    }
+                    token =  utf_char;
+                    token += utf_char_next;
+                    token += utf_char_next_next;
+
+                    bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size());
+                    token = "";
+                    i += 2;
+                    continue;
+                }
+            }
+
+            if (!split_condition && !collecting) {
+                if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || (token.empty() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER)) {
+                    collecting_letter = true;
+                    collecting = true;
+                }
+                else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_NUMBER || (token.empty() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_NUMBER)) {
+                    collecting_numeric = true;
+                    collecting = true;
+                }
+                else if (
+                    ((unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_NUMBER) && (unicode_cpt_type(utf_char) != CODEPOINT_TYPE_WHITESPACE)) ||
+                    (token.empty() && utf_char == " " && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_NUMBER && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE)
+                    ) {
+                    collecting_special = true;
+                    collecting = true;
+                }
+                else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_WHITESPACE) {
+                    collecting_whitespace_lookahead = true;
+                    collecting = true;
+                }
+                else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE) {
+                    split_condition = true;
+                }
+            }
+            else if (!split_condition && collecting) {
+                if (collecting_letter && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER) {
+                    split_condition = true;
+                }
+                else if (collecting_numeric && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_NUMBER) {
+                    split_condition = true;
+                }
+                else if (collecting_special && (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_NUMBER || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
+                    split_condition = true;
+                }
+                else if (collecting_whitespace_lookahead && (unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_NUMBER)) {
+                    split_condition = true;
+                }
+            }
+
+            if (utf_char_next == "") {
+                split_condition = true; // final
+                token += utf_char;
+            }
+
+            if (split_condition) {
+                if (token.size()) {
+                    bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size());
+                }
+                token = utf_char;
+                collecting = false;
+                collecting_letter = false;
+                collecting_numeric = false;
+                collecting_special = false;
+                collecting_whitespace_lookahead = false;
+            }
+            else {
+                token += utf_char;
+            }
+        }
+
+        start += offset;
    }

    return bpe_offsets;
@@ -543,14 +424,14 @@ static std::vector<size_t> unicode_regex_split_stl(const std::string & text, con
 static std::vector<size_t> unicode_regex_split_custom(const std::string & text, const std::string & regex_expr, const std::vector<size_t> & offsets) {
    std::vector<size_t> bpe_offsets;

-    if (regex_expr == "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)") {
-        bpe_offsets = unicode_regex_split_custom_gpt2(text, offsets);
-    } else if (
-            regex_expr == "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" ||
-            regex_expr == "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+") {
-
-        bpe_offsets = unicode_regex_split_custom_llama3(text, offsets);
-    }
+    (void)(text);
+    (void)(regex_expr);
+    (void)(offsets);
+    // TODO: this implementation is actually wrong, uncomment and run:
+    //       make -j && ./bin/test-tokenizer-0 ../models/ggml-vocab-gpt-2.gguf
+    //if (regex_expr == "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)") {
+    //    bpe_offsets = unicode_regex_split_custom_gpt2(text, offsets);
+    //}

    return bpe_offsets;
 }
@@ -625,19 +506,6 @@ int unicode_cpt_type(const std::string & utf8) {
    return unicode_cpt_type(unicode_cpt_from_utf8(utf8, offset));
 }

-bool unicode_cpt_is_whitespace(uint32_t cp) {
-    static const std::unordered_set<uint32_t> is_whitespace = [] {
-        std::unordered_set<uint32_t> is_whitespace;
-        for (auto p : unicode_ranges_whitespace) {
-            for (auto i = p.first; i <= p.second; ++i) {
-                is_whitespace.insert(i);
-            }
-        }
-        return is_whitespace;
-    }();
-    return (bool)is_whitespace.count(cp);
-}
-
 std::string unicode_byte_to_utf8(uint8_t byte) {
    static std::unordered_map<uint8_t, std::string> map = unicode_byte_to_utf8_map();
    return map.at(byte);
--- a/unicode.h
+++ b/unicode.h
@@ -7,7 +7,7 @@
 #define CODEPOINT_TYPE_UNIDENTIFIED 0
 #define CODEPOINT_TYPE_NUMBER       1
 #define CODEPOINT_TYPE_LETTER       2
-#define CODEPOINT_TYPE_SEPARATOR    3
+#define CODEPOINT_TYPE_WHITESPACE   3
 #define CODEPOINT_TYPE_ACCENT_MARK  4
 #define CODEPOINT_TYPE_PUNCTUATION  5
 #define CODEPOINT_TYPE_SYMBOL       6
@@ -21,8 +21,6 @@ std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & c
 int unicode_cpt_type(uint32_t cp);
 int unicode_cpt_type(const std::string & utf8);

-bool unicode_cpt_is_whitespace(uint32_t cp);
-
 std::string unicode_byte_to_utf8(uint8_t byte);
 uint8_t unicode_utf8_to_byte(const std::string & utf8);
Author	SHA1	Message	Date
Georgi Gerganov	0fc560fe96	ci : enable git lfs for build.yml	2024-05-08 10:53:02 +03:00
Georgi Gerganov	db5c2ad30e	Revert "tmp : dummy change to trigger ci" This reverts commit `97e40df5d6`.	2024-05-08 10:42:25 +03:00
Georgi Gerganov	97e40df5d6	tmp : dummy change to trigger ci	2024-05-08 10:42:11 +03:00
Georgi Gerganov	837f426f19	ci : try lfs true	2024-05-08 10:30:25 +03:00
Georgi Gerganov	9d13776f34	ci : deps before checkout	2024-05-08 10:24:53 +03:00
Georgi Gerganov	2c7ff2c7ae	ci : add git-lfs ggml-ci	2024-05-08 10:18:47 +03:00
Georgi Gerganov	0dc0e9aa42	models : convert vocab files to LFS ggml-ci	2024-05-08 09:54:38 +03:00
				`@@ -0,0 +1 @@`
				`models/ggml-vocab-*.gguf filter=lfs diff=lfs merge=lfs -text`