metal : add poc for normalized Q4_0 and Q4_1

Merge branch 'master' into norm-quants
convert.py : use dir name to name the llama
2026-04-23 16:37:33 +03:00 · 2023-08-30 18:47:16 +03:00 · 2023-08-30 17:50:58 +03:00 · 2023-08-30 13:29:40 +03:00 · 2023-08-30 12:53:24 +03:00 · 2023-08-30 12:47:40 +03:00
25 changed files with 1131 additions and 901 deletions
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -41,6 +41,12 @@ jobs:
        run: |
          CC=gcc-8 make

+      - name: Test
+        id: make_test
+        run: |
+          CC=gcc-8 make tests
+          make test
+
  ubuntu-latest-cmake:
    runs-on: ubuntu-latest

@@ -157,6 +163,12 @@ jobs:
        run: |
          make

+      - name: Test
+        id: make_test
+        run: |
+          make tests
+          make test
+
  macOS-latest-cmake:
    runs-on: macos-latest

--- a/.github/workflows/gguf-publish.yml
+++ b/.github/workflows/gguf-publish.yml
@@ -0,0 +1,43 @@
+# This workflow will upload a Python Package using Twine when a GGUF release is created
+# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
+
+# See `gguf-py/README.md` for how to make a release.
+
+# This workflow uses actions that are not certified by GitHub.
+# They are provided by a third-party and are governed by
+# separate terms of service, privacy policy, and support
+# documentation.
+
+name: Upload Python Package
+
+on:
+  workflow_dispatch:
+  push:
+    # Pattern matched against refs/tags
+    tags:
+      - 'gguf-v*'           # Push events to every version tag
+
+
+jobs:
+  deploy:
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v2
+    - name: Set up Python
+      uses: actions/setup-python@v2
+      with:
+        python-version: '3.9.x'
+    - name: Install dependencies
+      run: |
+        cd gguf-py
+        python -m pip install poetry
+        poetry install
+
+    - name: Build package
+      run: cd gguf-py && poetry build  # each step starts in the repo root; the earlier `cd` does not carry over
+    - name: Publish package
+      uses: pypa/gh-action-pypi-publish@release/v1
+      with:
+        password: ${{ secrets.PYPI_API_TOKEN }}
+        packages-dir: gguf-py/dist  # poetry writes the artifacts here, not to the action's default ./dist
--- a/.gitignore
+++ b/.gitignore
@@ -42,6 +42,9 @@ models-mnt
 /gguf-llama-simple
 /libllama.so
 /llama-bench
+/baby-llama
+/beam-search
+/save-load-state
 build-info.h
 arm_neon.h
 compile_commands.json
--- a/Makefile
+++ b/Makefile
@@ -1,11 +1,28 @@
 # Define the default target now so that it is always the first target
-BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple save-load-state server embd-input-test gguf llama-bench baby-llama beam_search tests/test-c.o
+BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple save-load-state server embd-input-test gguf llama-bench baby-llama beam-search tests/test-c.o

 # Binaries only useful for tests
 TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1

 default: $(BUILD_TARGETS)

+test:  # runs every test binary; stops with a non-zero status at the first failure
+	@echo "Running tests..."
+	@for test_target in $(TEST_TARGETS); do \
+		if [ "$$test_target" = "tests/test-tokenizer-0-llama" ]; then \
+			./$$test_target $(CURDIR)/models/ggml-vocab-llama.gguf || exit 1; \
+		elif [ "$$test_target" = "tests/test-tokenizer-0-falcon" ]; then \
+			continue; \
+		elif [ "$$test_target" = "tests/test-tokenizer-1" ]; then \
+			continue; \
+		else \
+			./$$test_target || exit 1; \
+		fi; \
+	done
+	@echo "All tests have been run."
+
+all: $(BUILD_TARGETS) $(TEST_TARGETS)
+
 ifndef UNAME_S
 UNAME_S := $(shell uname -s)
 endif
@@ -429,7 +446,7 @@ llama-bench: examples/llama-bench/llama-bench.cpp build-info.h ggml.o llama.o co
 baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-beam_search: examples/beam_search/beam_search.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+beam-search: examples/beam-search/beam-search.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

 ifneq '' '$(or $(filter clean,$(MAKECMDGOALS)),$(LLAMA_METAL))'
--- a/README.md
+++ b/README.md
@@ -107,7 +107,7 @@ as the main playground for developing new features for the [ggml](https://github

 - Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
 - Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
- Node.js: [hlhr202/llama-node](https://github.com/hlhr202/llama-node)
+- Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp), [hlhr202/llama-node](https://github.com/hlhr202/llama-node)
 - Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)
 - Rust: [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
 - C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp)
--- a/convert-falcon-hf-to-gguf.py
+++ b/convert-falcon-hf-to-gguf.py
@@ -8,6 +8,7 @@ import struct
 import json
 import numpy as np
 import torch
+import argparse

 from typing import Any, List
 from pathlib import Path
@@ -32,11 +33,10 @@ def bytes_to_unicode():
            bs.append(b)
            cs.append(2**8+n)
            n += 1
-    cs = [chr(n) for n in cs]
-    return dict(zip(bs, cs))
+    return dict(zip(bs, (chr(n) for n in cs)))


-def count_model_parts(dir_model: str) -> int:
+def count_model_parts(dir_model: Path) -> int:
    num_parts = 0
    for filename in os.listdir(dir_model):
        if filename.startswith("pytorch_model-"):
@@ -47,17 +47,22 @@ def count_model_parts(dir_model: str) -> int:
    return num_parts


-if len(sys.argv) < 3:
-    print(f"Usage: python {sys.argv[0]} dir-model ftype\n")
-    print("  ftype == 0 -> float32")
-    print("  ftype == 1 -> float16")
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="Convert a Falcon model to a GGML compatible file")
+    parser.add_argument("--vocab-only",  action="store_true",    help="extract only the vocab")
+    parser.add_argument("--outfile",     type=Path,              help="path to write to; default: based on input")
+    parser.add_argument("model",         type=Path,              help="directory containing model file, or model file itself (*.bin)")
+    parser.add_argument("ftype",     type=int, choices=[0, 1],   help="output format - use 0 for float32, 1 for float16", default = 1, nargs = "?")  # nargs="?" makes the positional optional so the default is actually used
+    return parser.parse_args()
+
+args = parse_args()
+
+dir_model = args.model
+ftype = args.ftype
+if not dir_model.is_dir():
+    print(f'Error: {args.model} is not a directory', file = sys.stderr)
    sys.exit(1)

-
-# output in the same directory as the model
-dir_model = sys.argv[1]
-last_dir = os.path.basename(os.path.normpath(dir_model))
-
 # possible tensor data types
 #   ftype == 0 -> float32
 #   ftype == 1 -> float16
@@ -65,25 +70,21 @@ last_dir = os.path.basename(os.path.normpath(dir_model))
 # map from ftype to string
 ftype_str = ["f32", "f16"]

-ftype = 1
-if len(sys.argv) > 2:
-    ftype = int(sys.argv[2])
-    if ftype < 0 or ftype > 1:
-        print("Invalid ftype: " + str(ftype))
+if args.outfile is not None:
+    fname_out = args.outfile
+else:
+    # output in the same directory as the model by default
+    fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'

-        sys.exit(1)
+print("gguf: loading model "+dir_model.name)

-fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".gguf"
-
-print("gguf: loading model "+last_dir)
-
-with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
+with open(dir_model / "config.json", "r", encoding="utf-8") as f:
    hparams = json.load(f)

 if hparams["architectures"][0] != "RWForCausalLM":
    print("Model architecture not supported: " + hparams["architectures"][0])

-    sys.exit()
+    sys.exit(1)

 # get number of model parts
 num_parts = count_model_parts(dir_model)
@@ -113,77 +114,58 @@ gguf_writer.add_file_type(ftype)

 print("gguf: get tokenizer metadata")

-tokens: List[str] = []
+tokens: List[bytearray] = []
 scores: List[float] = []
 toktypes: List[int] = []
-merges: List[str] = []

+tokenizer_json_file = dir_model / 'tokenizer.json'
+if not tokenizer_json_file.is_file():
+    print(f'Error: Missing {tokenizer_json_file}', file = sys.stderr)
+    sys.exit(1)

-if Path(dir_model + "/tokenizer.json").is_file():
-    # gpt2 tokenizer
-    gguf_writer.add_tokenizer_model("gpt2")
+# gpt2 tokenizer
+gguf_writer.add_tokenizer_model("gpt2")

-    print("gguf: get gpt2 tokenizer merges")
+with open(tokenizer_json_file, "r", encoding="utf-8") as f:
+    tokenizer_json = json.load(f)

-    with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f:
-        tokenizer_json = json.load(f)
-    merges = tokenizer_json["model"]["merges"]
+print("gguf: get gpt2 tokenizer vocab")

-    gguf_writer.add_token_merges(merges)
+vocab_size = len(tokenizer_json["model"]["vocab"])

-    print("gguf: get gpt2 tokenizer vocab")
+# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
+tokenizer = AutoTokenizer.from_pretrained(dir_model)

-    vocab_size = len(tokenizer_json["model"]["vocab"])
+reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
+byte_encoder = bytes_to_unicode()
+byte_decoder = {v: k for k, v in byte_encoder.items()}

-    # ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
-    tokenizer = AutoTokenizer.from_pretrained(dir_model)
+for i in range(vocab_size):
+    if i in reverse_vocab:
+        try:
+            text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
+        except KeyError:
+            text = bytearray()
+            for c in reverse_vocab[i]:
+                if c in byte_decoder:  # byte_decoder is keyed by characters, so look up c itself, not ord(c)
+                    text.append(byte_decoder[c])
+                else:  # character outside the byte-level alphabet: keep its UTF-8 bytes
+                    text.extend(c.encode('utf-8'))
+    else:
+        print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
+        pad_token = f"[PAD{i}]".encode("utf8")
+        text = bytearray(pad_token)

-    reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
-    byte_encoder = bytes_to_unicode()
-    byte_decoder = {v: k for k, v in byte_encoder.items()}
+    tokens.append(text)
+    scores.append(0.0)                      # dummy
+    toktypes.append(gguf.TokenType.NORMAL)  # dummy

-    for i in range(vocab_size):
-        if i in reverse_vocab:
-            try:
-                text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
-            except KeyError:
-                text = bytearray()
-                for c in reverse_vocab[i]:
-                    if ord(c) < 256:  # single byte character
-                        text.append(byte_decoder[ord(c)])
-                    else:  # multibyte special token character
-                        text.extend(c.encode('utf-8'))
-        else:
-            print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
-            pad_token = f"[PAD{i}]".encode("utf8")
-            text = bytearray(pad_token)
-
-        tokens.append(text)
-        scores.append(0.0)                      # dymmy
-        toktypes.append(gguf.TokenType.NORMAL)  # dummy
-
-    gguf_writer.add_token_list(tokens)
-    gguf_writer.add_token_scores(scores)
-    gguf_writer.add_token_types(toktypes)
-
-print("gguf: get special token ids")
-# Look for special tokens in config.json
-
-if "bos_token_id" in hparams and hparams["bos_token_id"] != None:
-    gguf_writer.add_bos_token_id(hparams["bos_token_id"])
-
-if "eos_token_id" in hparams and hparams["eos_token_id"] != None:
-    gguf_writer.add_eos_token_id(hparams["eos_token_id"])
-
-if "unk_token_id" in hparams and hparams["unk_token_id"] != None:
-    gguf_writer.add_unk_token_id(hparams["unk_token_id"])
-
-if "sep_token_id" in hparams and hparams["sep_token_id"] != None:
-    gguf_writer.add_sep_token_id(hparams["sep_token_id"])
-
-if "pad_token_id" in hparams and hparams["pad_token_id"] != None:
-    gguf_writer.add_pad_token_id(hparams["pad_token_id"])
+gguf_writer.add_token_list(tokens)
+gguf_writer.add_token_scores(scores)
+gguf_writer.add_token_types(toktypes)

+special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
+special_vocab.add_to_gguf(gguf_writer)

 # TENSORS

@@ -199,15 +181,17 @@ head_dim = hparams["hidden_size"] // n_head
 print("gguf: get tensor metadata")

 if num_parts == 0:
-    part_names = ("pytorch_model.bin",)
+    part_names = iter(("pytorch_model.bin",))
 else:
    part_names = (
        f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
    )

 for part_name in part_names:
+    if args.vocab_only:
+        break
    print("gguf: loading model part '" + part_name + "'")
-    model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
+    model_part = torch.load(dir_model / part_name, map_location="cpu")

    for name in model_part.keys():
        data = model_part[name]
@@ -238,11 +222,8 @@ for part_name in part_names:
        data = data.squeeze().numpy()

        # map tensor names
-        if name.endswith(".weight") and name[:-7] in tensor_map:
-            name = tensor_map[name[:-7]] + ".weight"
-        elif name.endswith(".bias") and name[:-5] in tensor_map:
-            name = tensor_map[name[:-5]] + ".bias"
-        else:
+        new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
+        if new_name is None:
            print("Can not map tensor '" + name + "'")
            sys.exit()

@@ -261,19 +242,20 @@ for part_name in part_names:
        if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
            data = data.astype(np.float16)

-        print(name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
+        print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))

-        gguf_writer.add_tensor(name, data)
+        gguf_writer.add_tensor(new_name, data)


 print("gguf: write header")
 gguf_writer.write_header_to_file()
 print("gguf: write metadata")
 gguf_writer.write_kv_data_to_file()
-print("gguf: write tensors")
-gguf_writer.write_tensors_to_file()
+if not args.vocab_only:
+    print("gguf: write tensors")
+    gguf_writer.write_tensors_to_file()

 gguf_writer.close()

-print("gguf: model successfully exported to '" + fname_out + "'")
+print(f"gguf: model successfully exported to '{fname_out}'")
 print("")
--- a/convert-gptneox-hf-to-gguf.py
+++ b/convert-gptneox-hf-to-gguf.py
@@ -8,6 +8,7 @@ import struct
 import json
 import numpy as np
 import torch
+import argparse

 from typing import Any, List
 from pathlib import Path
@@ -34,11 +35,10 @@ def bytes_to_unicode():
            bs.append(b)
            cs.append(2**8+n)
            n += 1
-    cs = [chr(n) for n in cs]
-    return dict(zip(bs, cs))
+    return dict(zip(bs, (chr(n) for n in cs)))


-def count_model_parts(dir_model: str) -> int:
+def count_model_parts(dir_model: Path) -> int:
    num_parts = 0
    for filename in os.listdir(dir_model):
        if filename.startswith("pytorch_model-"):
@@ -49,17 +49,22 @@ def count_model_parts(dir_model: str) -> int:
    return num_parts


-if len(sys.argv) < 3:
-    print(f"Usage: python {sys.argv[0]} dir-model ftype\n")
-    print("  ftype == 0 -> float32")
-    print("  ftype == 1 -> float16")
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="Convert a GPT-NeoX model to a GGML compatible file")
+    parser.add_argument("--vocab-only",  action="store_true",    help="extract only the vocab")
+    parser.add_argument("--outfile",     type=Path,              help="path to write to; default: based on input")
+    parser.add_argument("model",         type=Path,              help="directory containing model file, or model file itself (*.bin)")
+    parser.add_argument("ftype",     type=int, choices=[0, 1],   help="output format - use 0 for float32, 1 for float16", default = 1, nargs = "?")  # nargs="?" makes the positional optional so the default is actually used
+    return parser.parse_args()
+
+args = parse_args()
+
+dir_model = args.model
+ftype = args.ftype
+if not dir_model.is_dir():
+    print(f'Error: {args.model} is not a directory', file = sys.stderr)
    sys.exit(1)

-
-# output in the same directory as the model
-dir_model = sys.argv[1]
-last_dir = os.path.basename(os.path.normpath(dir_model))
-
 # possible tensor data types
 #   ftype == 0 -> float32
 #   ftype == 1 -> float16
@@ -67,19 +72,15 @@ last_dir = os.path.basename(os.path.normpath(dir_model))
 # map from ftype to string
 ftype_str = ["f32", "f16"]

-ftype = 1
-if len(sys.argv) > 2:
-    ftype = int(sys.argv[2])
-    if ftype < 0 or ftype > 1:
-        print("Invalid ftype: " + str(ftype))
+if args.outfile is not None:
+    fname_out = args.outfile
+else:
+    # output in the same directory as the model by default
+    fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'

-        sys.exit(1)
+print("gguf: loading model "+dir_model.name)

-fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".gguf"
-
-print("gguf: loading model "+last_dir)
-
-with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
+with open(dir_model / "config.json", "r", encoding="utf-8") as f:
    hparams = json.load(f)

 if hparams["architectures"][0] != "GPTNeoXForCausalLM":
@@ -97,7 +98,7 @@ print("gguf: get model metadata")

 block_count = hparams["num_hidden_layers"]

-gguf_writer.add_name(last_dir)
+gguf_writer.add_name(dir_model.name)
 gguf_writer.add_context_length(hparams["max_position_embeddings"])
 gguf_writer.add_embedding_length(hparams["hidden_size"])
 gguf_writer.add_block_count(block_count)
@@ -111,86 +112,52 @@ gguf_writer.add_layer_norm_eps(hparams["layer_norm_eps"])

 print("gguf: get tokenizer metadata")

-tokens: List[str] = []
-merges: List[str] = []
+tokens: List[bytearray] = []

+tokenizer_json_file = dir_model / 'tokenizer.json'
+if not tokenizer_json_file.is_file():
+    print(f'Error: Missing {tokenizer_json_file}', file = sys.stderr)
+    sys.exit(1)

-if Path(dir_model + "/tokenizer.json").is_file():
-    # gpt2 tokenizer
-    gguf_writer.add_tokenizer_model("gpt2")
+# gpt2 tokenizer
+gguf_writer.add_tokenizer_model("gpt2")

-    print("gguf: get gpt2 tokenizer merges")
+with open(tokenizer_json_file, "r", encoding="utf-8") as f:
+    tokenizer_json = json.load(f)

-    with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f:
-        tokenizer_json = json.load(f)
-    merges = tokenizer_json["model"]["merges"]
+print("gguf: get gpt2 tokenizer vocab")

-    gguf_writer.add_token_merges(merges)
+vocab_size = len(tokenizer_json["model"]["vocab"])

-    print("gguf: get gpt2 tokenizer vocab")
+# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
+tokenizer = AutoTokenizer.from_pretrained(dir_model)

-    vocab_size = len(tokenizer_json["model"]["vocab"])
+reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
+byte_encoder = bytes_to_unicode()
+byte_decoder = {v: k for k, v in byte_encoder.items()}

-    # ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
-    tokenizer = AutoTokenizer.from_pretrained(dir_model)
+for i in range(vocab_size):
+    if i in reverse_vocab:
+        try:
+            text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
+        except KeyError:
+            text = bytearray()
+            for c in reverse_vocab[i]:
+                if c in byte_decoder:  # byte_decoder is keyed by characters, so look up c itself, not ord(c)
+                    text.append(byte_decoder[c])
+                else:  # character outside the byte-level alphabet: keep its UTF-8 bytes
+                    text.extend(c.encode('utf-8'))
+    else:
+        print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
+        pad_token = f"[PAD{i}]".encode("utf8")
+        text = bytearray(pad_token)

-    reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
-    byte_encoder = bytes_to_unicode()
-    byte_decoder = {v: k for k, v in byte_encoder.items()}
+    tokens.append(text)

-    for i in range(vocab_size):
-        if i in reverse_vocab:
-            try:
-                text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
-            except KeyError:
-                text = bytearray()
-                for c in reverse_vocab[i]:
-                    if ord(c) < 256:  # single byte character
-                        text.append(byte_decoder[ord(c)])
-                    else:  # multibyte special token character
-                        text.extend(c.encode('utf-8'))
-        else:
-            print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
-            pad_token = f"[PAD{i}]".encode("utf8")
-            text = bytearray(pad_token)
-
-        tokens.append(text)
-
-    gguf_writer.add_token_list(tokens)
-
-    if "added_tokens" in tokenizer_json and Path(dir_model + "/tokenizer_config.json").is_file():
-        print("gguf: get special token ids")
-
-        with open(dir_model + "/tokenizer_config.json", "r", encoding="utf-8") as f:
-            tokenizer_config = json.load(f)
-
-        # find special token ids
-
-        if "bos_token" in tokenizer_config:
-            for key in tokenizer_json["added_tokens"]:
-                if key["content"] == tokenizer_config["bos_token"]:
-                    gguf_writer.add_bos_token_id(key["id"])
-
-        if "eos_token" in tokenizer_config:
-            for key in tokenizer_json["added_tokens"]:
-                if key["content"] == tokenizer_config["eos_token"]:
-                    gguf_writer.add_eos_token_id(key["id"])
-
-        if "unk_token" in tokenizer_config:
-            for key in tokenizer_json["added_tokens"]:
-                if key["content"] == tokenizer_config["unk_token"]:
-                    gguf_writer.add_unk_token_id(key["id"])
-
-        if "sep_token" in tokenizer_config:
-            for key in tokenizer_json["added_tokens"]:
-                if key["content"] == tokenizer_config["sep_token"]:
-                    gguf_writer.add_sep_token_id(key["id"])
-
-        if "pad_token" in tokenizer_config:
-            for key in tokenizer_json["added_tokens"]:
-                if key["content"] == tokenizer_config["pad_token"]:
-                    gguf_writer.add_pad_token_id(key["id"])
+gguf_writer.add_token_list(tokens)

+special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
+special_vocab.add_to_gguf(gguf_writer)

 # TENSORS

@@ -200,13 +167,15 @@ tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
 print("gguf: get tensor metadata")

 if num_parts == 0:
-    part_names = ("pytorch_model.bin",)
+    part_names = iter(("pytorch_model.bin",))
 else:
    part_names = (
        f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
    )

 for part_name in part_names:
+    if args.vocab_only:
+        break
    print("gguf: loading model part '" + part_name + "'")
    model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")

@@ -226,11 +195,8 @@ for part_name in part_names:
        data = data.squeeze().numpy()

        # map tensor names
-        if name.endswith(".weight") and name[:-7] in tensor_map:
-            name = tensor_map[name[:-7]] + ".weight"
-        elif name.endswith(".bias") and name[:-5] in tensor_map:
-            name = tensor_map[name[:-5]] + ".bias"
-        else:
+        new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
+        if new_name is None:
            print("Can not map tensor '" + name + "'")
            sys.exit()

@@ -249,19 +215,20 @@ for part_name in part_names:
        if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
            data = data.astype(np.float16)

-        print(name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
+        print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))

-        gguf_writer.add_tensor(name, data)
+        gguf_writer.add_tensor(new_name, data)


 print("gguf: write header")
 gguf_writer.write_header_to_file()
 print("gguf: write metadata")
 gguf_writer.write_kv_data_to_file()
-print("gguf: write tensors")
-gguf_writer.write_tensors_to_file()
+if not args.vocab_only:
+    print("gguf: write tensors")
+    gguf_writer.write_tensors_to_file()

 gguf_writer.close()

-print("gguf: model successfully exported to '" + fname_out + "'")
+print(f"gguf: model successfully exported to '{fname_out}'")
 print("")
--- a/convert-llama-7b-pth-to-gguf.py
+++ b/convert-llama-7b-pth-to-gguf.py
@@ -10,8 +10,9 @@ import struct
 import json
 import numpy as np
 import torch
+import argparse

-from typing import Any, List
+from typing import Any, List, TypeAlias
 from pathlib import Path
 from sentencepiece import SentencePieceProcessor

@@ -20,7 +21,7 @@ from sentencepiece import SentencePieceProcessor
 NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]'


-def count_model_parts(dir_model: str) -> int:
+def count_model_parts(dir_model: Path) -> int:
    num_parts = 0
    for filename in os.listdir(dir_model):
        if filename.startswith("consolidated."):
@@ -31,19 +32,22 @@ def count_model_parts(dir_model: str) -> int:
    return num_parts


-if len(sys.argv) < 3:
-    print(f"Usage: python {sys.argv[0]} dir-model ftype\n")
-    print("  ftype == 0 -> float32")
-    print("  ftype == 1 -> float16")
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="Convert a PyTorch 7B LLaMA model to a GGML compatible file")
+    parser.add_argument("--vocab-only",  action="store_true",    help="extract only the vocab")
+    parser.add_argument("--outfile",     type=Path,              help="path to write to; default: based on input")
+    parser.add_argument("model",         type=Path,              help="directory containing model file, or model file itself (*.bin)")
+    parser.add_argument("ftype",     type=int, choices=[0, 1],   help="output format - use 0 for float32, 1 for float16", default = 1, nargs = "?")  # nargs="?" makes the positional optional so the default is actually used
+    return parser.parse_args()

+args = parse_args()
+
+dir_model = args.model
+ftype = args.ftype
+if not dir_model.is_dir():
+    print(f'Error: {args.model} is not a directory', file = sys.stderr)
    sys.exit(1)

-
-# output in the same directory as the model
-dir_model = sys.argv[1]
-last_dir = os.path.basename(os.path.normpath(dir_model))
-
-
 # possible tensor data types
 #   ftype == 0 -> float32
 #   ftype == 1 -> float16
@@ -51,19 +55,15 @@ last_dir = os.path.basename(os.path.normpath(dir_model))
 # map from ftype to string
 ftype_str = ["f32", "f16"]

-ftype = 1
-if len(sys.argv) > 2:
-    ftype = int(sys.argv[2])
-    if ftype < 0 or ftype > 1:
-        print("Invalid ftype: " + str(ftype))
+if args.outfile is not None:
+    fname_out = args.outfile
+else:
+    # output in the same directory as the model by default
+    fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'

-        sys.exit(1)
+print("gguf: loading model "+dir_model.name)

-fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".gguf"
-
-print("gguf: loading model "+last_dir)
-
-with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
+with open(dir_model / "config.json", "r", encoding="utf-8") as f:
    hparams = json.load(f)

 if hparams["architectures"][0] != "LlamaForCausalLM":
@@ -107,7 +107,7 @@ else:
    sys.exit()


-gguf_writer.add_name(last_dir)
+gguf_writer.add_name(dir_model.name)
 gguf_writer.add_source_hf_repo(hf_repo)
 gguf_writer.add_tensor_data_layout("Meta AI original pth")
 gguf_writer.add_context_length(ctx_length)
@@ -133,109 +133,60 @@ tokens: List[bytes] = []
 scores: List[float] = []
 toktypes: List[int] = []

-if Path(dir_model + "/tokenizer.model").is_file():
-    # vocab type sentencepiece
-    print("gguf: get sentencepiece tokenizer vocab and scores")
+tokenizer_model_file = dir_model / 'tokenizer.model'
+if not tokenizer_model_file.is_file():
+    print(f'Error: Missing {tokenizer_model_file}', file = sys.stderr)
+    sys.exit(1)

-    tokenizer = SentencePieceProcessor(dir_model + "/tokenizer.model")
+# vocab type sentencepiece
+print("gguf: get sentencepiece tokenizer vocab and scores")

-    for i in range(tokenizer.vocab_size()):
-        text: bytes
-        score: float
+tokenizer = SentencePieceProcessor(str(tokenizer_model_file))

-        piece = tokenizer.id_to_piece(i)
-        text = piece.encode("utf-8")
-        score = tokenizer.get_score(i)
+for i in range(tokenizer.vocab_size()):
+    text: bytes
+    score: float

-        toktype = 1  # defualt to normal token type
-        if tokenizer.is_unknown(i):
-            toktype = 2
-        if tokenizer.is_control(i):
-            toktype = 3
+    piece = tokenizer.id_to_piece(i)
+    text = piece.encode("utf-8")
+    score = tokenizer.get_score(i)

-        # toktype = 4 is user-defined = tokens from added_tokens.json
+    toktype = 1  # default to normal token type
+    if tokenizer.is_unknown(i):
+        toktype = 2
+    if tokenizer.is_control(i):
+        toktype = 3

-        if tokenizer.is_unused(i):
-            toktype = 5
-        if tokenizer.is_byte(i):
-            toktype = 6
+    # toktype = 4 is user-defined = tokens from added_tokens.json

-        tokens.append(text)
-        scores.append(score)
-        toktypes.append(toktype)
+    if tokenizer.is_unused(i):
+        toktype = 5
+    if tokenizer.is_byte(i):
+        toktype = 6

-    if Path(dir_model + "/added_tokens.json").is_file():
-        with open(dir_model + "/added_tokens.json", "r", encoding="utf-8") as f:
-            addtokens_json = json.load(f)
+    tokens.append(text)
+    scores.append(score)
+    toktypes.append(toktype)

-            print("gguf: get added tokens")
+added_tokens_file = dir_model / 'added_tokens.json'
+if added_tokens_file.is_file():
+    with open(added_tokens_file, "r", encoding="utf-8") as f:
+        addtokens_json = json.load(f)

-            for key in addtokens_json:
-                tokens.append( key.encode("utf-8") )
-                scores.append(-1000.0)
-                toktypes.append(4) # user-defined token type
+        print("gguf: get added tokens")

-    gguf_writer.add_tokenizer_model("llama")
-    gguf_writer.add_token_list(tokens)
-    gguf_writer.add_token_scores(scores)
-    gguf_writer.add_token_types(toktypes)
+        for key in addtokens_json:
+            tokens.append( key.encode("utf-8") )
+            scores.append(-1000.0)
+            toktypes.append(4) # user-defined token type

+gguf_writer.add_tokenizer_model("llama")
+gguf_writer.add_token_list(tokens)
+gguf_writer.add_token_scores(scores)
+gguf_writer.add_token_types(toktypes)

-print("gguf: get special token ids")
-
-if Path(dir_model + "/tokenizer.json").is_file():
-    # Look for special tokens in tokenizer.json if it exists
-
-    with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f:
-        tokenizer = json.load(f)
-
-    if "added_tokens" in tokenizer and Path(dir_model + "/tokenizer_config.json").is_file():
-
-        with open(dir_model + "/tokenizer_config.json", "r", encoding="utf-8") as f:
-            tokenizer_config = json.load(f)
-
-        if "bos_token" in tokenizer_config and tokenizer_config["bos_token"] != None:
-            for key in tokenizer["added_tokens"]:
-                if key["content"] == tokenizer_config["bos_token"]["content"]:
-                    gguf_writer.add_bos_token_id(key["id"])
-
-        if "eos_token" in tokenizer_config and tokenizer_config["eos_token"] != None:
-            for key in tokenizer["added_tokens"]:
-                if key["content"] == tokenizer_config["eos_token"]["content"]:
-                    gguf_writer.add_eos_token_id(key["id"])
-
-        if "unk_token" in tokenizer_config and tokenizer_config["unk_token"] != None:
-            for key in tokenizer["added_tokens"]:
-                if key["content"] == tokenizer_config["unk_token"]["content"]:
-                    gguf_writer.add_unk_token_id(key["id"])
-
-        if "sep_token" in tokenizer_config and tokenizer_config["sep_token"] != None:
-            for key in tokenizer["added_tokens"]:
-                if key["content"] == tokenizer_config["sep_token"]["content"]:
-                    gguf_writer.add_sep_token_id(key["id"])
-
-        if "pad_token" in tokenizer_config and tokenizer_config["pad_token"] != None:
-            for key in tokenizer["added_tokens"]:
-                if key["content"] == tokenizer_config["pad_token"]["content"]:
-                    gguf_writer.add_pad_token_id(key["id"])
-else:
-    # If no tokenizer.json: Look for special tokens in config.json
-
-    if "bos_token_id" in hparams and hparams["bos_token_id"] != None:
-        gguf_writer.add_bos_token_id(hparams["bos_token_id"])
-
-    if "eos_token_id" in hparams and hparams["eos_token_id"] != None:
-        gguf_writer.add_eos_token_id(hparams["eos_token_id"])
-
-    if "unk_token_id" in hparams and hparams["unk_token_id"] != None:
-        gguf_writer.add_unk_token_id(hparams["unk_token_id"])
-
-    if "sep_token_id" in hparams and hparams["sep_token_id"] != None:
-        gguf_writer.add_sep_token_id(hparams["sep_token_id"])
-
-    if "pad_token_id" in hparams and hparams["pad_token_id"] != None:
-        gguf_writer.add_pad_token_id(hparams["pad_token_id"])
-
+special_vocab = gguf.SpecialVocab(dir_model)
+special_vocab.add_to_gguf(gguf_writer)

 # TENSORS

@@ -247,6 +198,8 @@ print("gguf: get tensor metadata")
 part_names = (f"consolidated.{n:02}.pth" for n in range(0, num_parts))

 for part_name in part_names:
+    if args.vocab_only:
+        break
    print("gguf: loading model part '" + part_name + "'")
    model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")

@@ -266,11 +219,8 @@ for part_name in part_names:
        data = data.squeeze().numpy()

        # map tensor names
-        if name.endswith(".weight") and name[:-7] in tensor_map:
-            name = tensor_map[name[:-7]] + ".weight"
-        elif name.endswith(".bias") and name[:-5] in tensor_map:
-            name = tensor_map[name[:-5]] + ".bias"
-        else:
+        new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
+        if new_name is None:
            print("Can not map tensor '" + name + "'")
            sys.exit()

@@ -289,20 +239,20 @@ for part_name in part_names:
        if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
            data = data.astype(np.float16)

-        print(name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
+        print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))

-        gguf_writer.add_tensor(name, data)
+        gguf_writer.add_tensor(new_name, data)


 print("gguf: write header")
 gguf_writer.write_header_to_file()
 print("gguf: write metadata")
 gguf_writer.write_kv_data_to_file()
-print("gguf: write tensors")
-gguf_writer.write_tensors_to_file()
+if not args.vocab_only:
+    print("gguf: write tensors")
+    gguf_writer.write_tensors_to_file()

 gguf_writer.close()

-
-print("gguf: model successfully exported to '" + fname_out + "'")
+print(f"gguf: model successfully exported to '{fname_out}'")
 print("")
--- a/convert-llama-ggmlv3-to-gguf.py
+++ b/convert-llama-ggmlv3-to-gguf.py
@@ -75,7 +75,7 @@ class Tensor:
        self.dims = ()
        self.dtype = None
        self.start_offset = 0
-        self.len_bytes = 0
+        self.len_bytes = np.int64(0)

    def load(self, data, offset):
        orig_offset = offset
@@ -134,13 +134,14 @@ class GGMLV3Model:
        return offset

 class GGMLToGGUF:
-    def __init__(self, ggml_model, data, cfg, params_override = None, vocab_override = None):
+    def __init__(self, ggml_model, data, cfg, params_override = None, vocab_override = None, special_vocab = None):
        hp = ggml_model.hyperparameters
        self.model = ggml_model
        self.data = data
        self.cfg = cfg
        self.params_override = params_override
        self.vocab_override = vocab_override
+        self.special_vocab = special_vocab
        if params_override is not None:
            n_kv_head = params_override.n_head_kv
        else:
@@ -162,6 +163,8 @@ class GGMLToGGUF:
        gguf_writer = gguf.GGUFWriter(self.cfg.output, gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], use_temp_file = False)
        self.add_params(gguf_writer)
        self.add_vocab(gguf_writer)
+        if self.special_vocab is not None:
+            self.special_vocab.add_to_gguf(gguf_writer)
        self.add_tensors(gguf_writer)
        print("    gguf: write header")
        gguf_writer.write_header_to_file()
@@ -259,20 +262,13 @@ class GGMLToGGUF:
        gguf_writer.add_eos_token_id(2)

    def add_tensors(self, gguf_writer):
-        nm = self.name_map
+        tensor_map = self.name_map
        data = self.data
        print(f'* Adding {len(self.model.tensors)} tensor(s)')
        for tensor in self.model.tensors:
            name = str(tensor.name, 'UTF-8')
-            if name.endswith('.weight'):
-                name = name[:-7]
-                suffix = '.weight'
-            elif name.endswith('.bias'):
-                name = name[:-5]
-                suffix = '.bias'
-            mapped_name = nm.get(name)
+            mapped_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
            assert mapped_name is not None, f'Bad name {name}'
-            mapped_name += suffix
            tempdims = list(tensor.dims[:])
            if len(tempdims) > 1:
                temp = tempdims[1]
@@ -302,8 +298,10 @@ def handle_metadata(cfg, hp):
    else:
        raise ValueError('Unable to load metadata')
    vocab = convert.load_vocab(cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir, cfg.vocabtype)
+    # FIXME: Respect cfg.vocab_dir?
+    svocab = gguf.SpecialVocab(cfg.model_metadata_dir)
    convert.check_vocab_size(params, vocab)
-    return (params, vocab)
+    return (params, vocab, svocab)

 def handle_args():
    parser = argparse.ArgumentParser(description = 'Convert GGMLv3 models to GGUF')
@@ -330,14 +328,16 @@ def main():
    print(f'* GGML model hyperparameters: {model.hyperparameters}')
    vocab_override = None
    params_override = None
+    special_vocab = None
    if cfg.model_metadata_dir is not None:
-        (params_override, vocab_override) = handle_metadata(cfg, model.hyperparameters)
+        (params_override, vocab_override, special_vocab) = handle_metadata(cfg, model.hyperparameters)
        print('!! Note: When overriding params the --gqa, --eps and --context-length options are ignored.')
        print(f'* Overriding params: {params_override}')
        print(f'* Overriding vocab: {vocab_override}')
+        print(f'* Special vocab: {special_vocab}')
    else:
        print('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n')
-    converter = GGMLToGGUF(model, data, cfg, params_override = params_override, vocab_override = vocab_override)
+    converter = GGMLToGGUF(model, data, cfg, params_override = params_override, vocab_override = vocab_override, special_vocab = special_vocab)
    converter.save()
    print(f'* Successful completion. Output saved to: {cfg.output}')

--- a/convert-llama-hf-to-gguf.py
+++ b/convert-llama-hf-to-gguf.py
@@ -8,8 +8,9 @@ import struct
 import json
 import numpy as np
 import torch
+import argparse

-from typing import Any, List, Optional
+from typing import Any, List, Optional, TypeAlias
 from pathlib import Path
 from sentencepiece import SentencePieceProcessor

@@ -43,40 +44,38 @@ def count_model_parts(dir_model: str) -> int:
    return num_parts


-if len(sys.argv) < 3:
-    print(f"Usage: python {sys.argv[0]} dir-model ftype\n")
-    print("  ftype == 0 -> float32")
-    print("  ftype == 1 -> float16")
+def parse_args() -> argparse.Namespace:  # CLI: required model directory, optional ftype (0 = float32, 1 = float16)
+    parser = argparse.ArgumentParser(description="Convert a HuggingFace LLaMA model to a GGML compatible file")
+    parser.add_argument("--vocab-only",  action="store_true",    help="extract only the vocab")
+    parser.add_argument("--outfile",     type=Path,              help="path to write to; default: based on input")
+    parser.add_argument("model",         type=Path,              help="directory containing the model files")
+    parser.add_argument("ftype",     type=int, choices=[0, 1],   help="output format - use 0 for float32, 1 for float16", default = 1, nargs = '?')  # nargs='?' makes the positional optional so the default applies
+    return parser.parse_args()

+args = parse_args()
+
+dir_model = args.model
+ftype = args.ftype
+if not dir_model.is_dir():
+    print(f'Error: {args.model} is not a directory', file = sys.stderr)
    sys.exit(1)

-
-# output in the same directory as the model
-dir_model = sys.argv[1]
-last_dir = os.path.basename(os.path.normpath(dir_model))
-
-
 # possible tensor data types
 #   ftype == 0 -> float32
 #   ftype == 1 -> float16

-
 # map from ftype to string
 ftype_str = ["f32", "f16"]

-ftype = 1
-if len(sys.argv) > 2:
-    ftype = int(sys.argv[2])
-    if ftype < 0 or ftype > 1:
-        print("Invalid ftype: " + str(ftype))
+if args.outfile is not None:
+    fname_out = args.outfile
+else:
+    # output in the same directory as the model by default
+    fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'

-        sys.exit(1)
+print("gguf: loading model "+dir_model.name)

-fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".gguf"
-
-print("gguf: loading model "+last_dir)
-
-with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
+with open(dir_model / "config.json", "r", encoding="utf-8") as f:
    hparams = json.load(f)

 if hparams["architectures"][0] != "LlamaForCausalLM":
@@ -115,7 +114,7 @@ else:
    sys.exit()


-gguf_writer.add_name(last_dir)
+gguf_writer.add_name(dir_model.name)
 gguf_writer.add_source_hf_repo(hf_repo)
 gguf_writer.add_tensor_data_layout("Meta AI original pth")
 gguf_writer.add_context_length(ctx_length)
@@ -141,110 +140,61 @@ tokens: List[bytes] = []
 scores: List[float] = []
 toktypes: List[int] = []

-if Path(dir_model + "/tokenizer.model").is_file():
-    # vocab type sentencepiece
-    print("gguf: get sentencepiece tokenizer vocab, scores and token types")
+tokenizer_model_file = dir_model / 'tokenizer.model'
+if not tokenizer_model_file.is_file():
+    print(f'Error: Missing {tokenizer_model_file}', file = sys.stderr)
+    sys.exit(1)

-    tokenizer = SentencePieceProcessor(dir_model + "/tokenizer.model")
+# vocab type sentencepiece
+print("gguf: get sentencepiece tokenizer vocab, scores and token types")

-    for i in range(tokenizer.vocab_size()):
-        text: bytes
-        score: float
+tokenizer = SentencePieceProcessor(str(tokenizer_model_file))

-        piece = tokenizer.id_to_piece(i)
-        text = piece.encode("utf-8")
-        score = tokenizer.get_score(i)
+for i in range(tokenizer.vocab_size()):
+    text: bytes
+    score: float

-        toktype = 1  # defualt to normal token type
-        if tokenizer.is_unknown(i):
-            toktype = 2
-        if tokenizer.is_control(i):
-            toktype = 3
+    piece = tokenizer.id_to_piece(i)
+    text = piece.encode("utf-8")
+    score = tokenizer.get_score(i)

-        # toktype = 4 is user-defined = tokens from added_tokens.json
+    toktype = 1  # default to normal token type
+    if tokenizer.is_unknown(i):
+        toktype = 2
+    if tokenizer.is_control(i):
+        toktype = 3

-        if tokenizer.is_unused(i):
-            toktype = 5
-        if tokenizer.is_byte(i):
-            toktype = 6
+    # toktype = 4 is user-defined = tokens from added_tokens.json

-        tokens.append(text)
-        scores.append(score)
-        toktypes.append(toktype)
+    if tokenizer.is_unused(i):
+        toktype = 5
+    if tokenizer.is_byte(i):
+        toktype = 6

-    if Path(dir_model + "/added_tokens.json").is_file():
-        with open(dir_model + "/added_tokens.json", "r", encoding="utf-8") as f:
-            addtokens_json = json.load(f)
+    tokens.append(text)
+    scores.append(score)
+    toktypes.append(toktype)

-            print("gguf: get added tokens")
+added_tokens_file = dir_model / 'added_tokens.json'
+if added_tokens_file.is_file():
+    with open(added_tokens_file, "r", encoding="utf-8") as f:
+        addtokens_json = json.load(f)

-            for key in addtokens_json:
-                tokens.append( key.encode("utf-8") )
-                scores.append(-1000.0)
-                toktypes.append(4) # user-defined token type
+        print("gguf: get added tokens")
+
+        for key in addtokens_json:
+            tokens.append( key.encode("utf-8") )
+            scores.append(-1000.0)
+            toktypes.append(4) # user-defined token type


-    gguf_writer.add_tokenizer_model("llama")
-    gguf_writer.add_token_list(tokens)
-    gguf_writer.add_token_scores(scores)
-    gguf_writer.add_token_types(toktypes)
-
-
-print("gguf: get special token ids")
-
-if Path(dir_model + "/tokenizer.json").is_file():
-    # Look for special tokens in tokenizer.json if it exists
-
-    with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f:
-        tokenizer = json.load(f)
-
-    if "added_tokens" in tokenizer and Path(dir_model + "/tokenizer_config.json").is_file():
-
-        with open(dir_model + "/tokenizer_config.json", "r", encoding="utf-8") as f:
-            tokenizer_config = json.load(f)
-
-        if "bos_token" in tokenizer_config and tokenizer_config["bos_token"] != None:
-            for key in tokenizer["added_tokens"]:
-                if key["content"] == tokenizer_config["bos_token"]["content"]:
-                    gguf_writer.add_bos_token_id(key["id"])
-
-        if "eos_token" in tokenizer_config and tokenizer_config["eos_token"] != None:
-            for key in tokenizer["added_tokens"]:
-                if key["content"] == tokenizer_config["eos_token"]["content"]:
-                    gguf_writer.add_eos_token_id(key["id"])
-
-        if "unk_token" in tokenizer_config and tokenizer_config["unk_token"] != None:
-            for key in tokenizer["added_tokens"]:
-                if key["content"] == tokenizer_config["unk_token"]["content"]:
-                    gguf_writer.add_unk_token_id(key["id"])
-
-        if "sep_token" in tokenizer_config and tokenizer_config["sep_token"] != None:
-            for key in tokenizer["added_tokens"]:
-                if key["content"] == tokenizer_config["sep_token"]["content"]:
-                    gguf_writer.add_sep_token_id(key["id"])
-
-        if "pad_token" in tokenizer_config and tokenizer_config["pad_token"] != None:
-            for key in tokenizer["added_tokens"]:
-                if key["content"] == tokenizer_config["pad_token"]["content"]:
-                    gguf_writer.add_pad_token_id(key["id"])
-else:
-    # If no tokenizer.json: Look for special tokens in config.json
-
-    if "bos_token_id" in hparams and hparams["bos_token_id"] != None:
-        gguf_writer.add_bos_token_id(hparams["bos_token_id"])
-
-    if "eos_token_id" in hparams and hparams["eos_token_id"] != None:
-        gguf_writer.add_eos_token_id(hparams["eos_token_id"])
-
-    if "unk_token_id" in hparams and hparams["unk_token_id"] != None:
-        gguf_writer.add_unk_token_id(hparams["unk_token_id"])
-
-    if "sep_token_id" in hparams and hparams["sep_token_id"] != None:
-        gguf_writer.add_sep_token_id(hparams["sep_token_id"])
-
-    if "pad_token_id" in hparams and hparams["pad_token_id"] != None:
-        gguf_writer.add_pad_token_id(hparams["pad_token_id"])
+gguf_writer.add_tokenizer_model("llama")
+gguf_writer.add_token_list(tokens)
+gguf_writer.add_token_scores(scores)
+gguf_writer.add_token_types(toktypes)

+special_vocab = gguf.SpecialVocab(dir_model)
+special_vocab.add_to_gguf(gguf_writer)

 # TENSORS

@@ -254,13 +204,15 @@ tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
 print("gguf: get tensor metadata")

 if num_parts == 0:
-    part_names = ("pytorch_model.bin",)
+    part_names = iter(("pytorch_model.bin",))
 else:
    part_names = (
        f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
    )

 for part_name in part_names:
+    if args.vocab_only:
+        break
    print("gguf: loading model part '" + part_name + "'")
    model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")

@@ -286,11 +238,8 @@ for part_name in part_names:
            data = reverse_hf_permute(data, head_count, head_count_kv)

        # map tensor names
-        if name.endswith(".weight") and name[:-7] in tensor_map:
-            name = tensor_map[name[:-7]] + ".weight"
-        elif name.endswith(".bias") and name[:-5] in tensor_map:
-            name = tensor_map[name[:-5]] + ".bias"
-        else:
+        new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
+        if new_name is None:
            print("Can not map tensor '" + name + "'")
            sys.exit()

@@ -309,20 +258,20 @@ for part_name in part_names:
        if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
            data = data.astype(np.float16)

-        print(name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
+        print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))

-        gguf_writer.add_tensor(name, data)
+        gguf_writer.add_tensor(new_name, data)


 print("gguf: write header")
 gguf_writer.write_header_to_file()
 print("gguf: write metadata")
 gguf_writer.write_kv_data_to_file()
-print("gguf: write tensors")
-gguf_writer.write_tensors_to_file()
+if not args.vocab_only:
+    print("gguf: write tensors")
+    gguf_writer.write_tensors_to_file()

 gguf_writer.close()

-
-print("gguf: model successfully exported to '" + fname_out + "'")
+print(f"gguf: model successfully exported to '{fname_out}'")
 print("")
--- a/convert-lora-to-ggml.py
+++ b/convert-lora-to-ggml.py
@@ -4,7 +4,7 @@ import os
 import re
 import struct
 import sys
-from typing import Any, Dict, Sequence, TextIO
+from typing import Any, Dict, Sequence, BinaryIO

 import numpy as np
 import torch
@@ -46,7 +46,7 @@ def translate_tensor_name(t: str) -> str:
        sys.exit(1)


-def write_file_header(fout: TextIO, params: Dict[str, Any]) -> None:
+def write_file_header(fout: BinaryIO, params: Dict[str, Any]) -> None:
    fout.write(b"ggla"[::-1])  # magic (ggml lora)
    fout.write(struct.pack("i", 1))  # file version
    fout.write(struct.pack("i", params["r"]))
@@ -60,7 +60,7 @@ def write_file_header(fout: TextIO, params: Dict[str, Any]) -> None:


 def write_tensor_header(
-    self, name: str, shape: Sequence[int], data_type: np.dtype
+    self, name: str, shape: Sequence[int], data_type: np.dtype[Any]
 ) -> None:
    sname = name.encode("utf-8")
    fout.write(
--- a/convert.py
+++ b/convert.py
@@ -25,7 +25,7 @@ import numpy as np
 from abc import ABCMeta, abstractmethod
 from dataclasses import dataclass
 from pathlib import Path
-from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Generator, Iterable, List, Literal, Optional, Sequence, Set, Tuple, TypeVar, Union)
+from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Generator, Iterable, List, Literal, Optional, Sequence, Set, Tuple, Type, TypeVar, Union)
 from sentencepiece import SentencePieceProcessor  # type: ignore

 if TYPE_CHECKING:
@@ -299,8 +299,10 @@ class Params:
            params = Params.loadHFTransformerJson(model_plus.model, hf_config_path)
        elif orig_config_path.exists():
            params = Params.loadOriginalParamsJson(model_plus.model, orig_config_path)
-        else:
+        elif model_plus.format != 'none':
            params = Params.guessed(model_plus.model)
+        else:
+            raise ValueError('Cannot guess params when model format is none')

        params.path_model = model_plus.paths[0].parent

@@ -353,7 +355,7 @@ class BpeVocab:
        yield from self.added_tokens()

    def __repr__(self) -> str:
-        return f"BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
+        return f"<BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"


 class SentencePieceVocab:
@@ -416,7 +418,6 @@ class SentencePieceVocab:

 Vocab = Union[BpeVocab, SentencePieceVocab]

-
 #
 # data loading
 # TODO: reuse (probably move to gguf.py?)
@@ -439,14 +440,14 @@ class Tensor(metaclass=ABCMeta):
    @abstractmethod
    def permute(self, n_head: int, n_head_kv: int) -> 'Tensor': ...
    @abstractmethod
-    def permute_part(self, n_part: int, n_head: int) -> 'UnquantizedTensor': ...
+    def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> 'UnquantizedTensor': ...
    @abstractmethod
    def part(self, n_part: int) -> 'UnquantizedTensor': ...
    @abstractmethod
    def to_ggml(self) -> 'GGMLCompatibleTensor': ...


-def bf16_to_fp32(bf16_arr: np.ndarray) -> np.ndarray:
+def bf16_to_fp32(bf16_arr: np.ndarray[Any, np.dtype[np.uint16]]) -> NDArray:
    assert bf16_arr.dtype == np.uint16, f"Input array should be of dtype uint16, but got {bf16_arr.dtype}"
    fp32_arr = bf16_arr.astype(np.uint32) << 16
    return fp32_arr.view(np.float32)
@@ -467,9 +468,9 @@ class UnquantizedTensor(Tensor):
    def to_ggml(self) -> 'UnquantizedTensor':
        return self

-    def permute_part(self, n_part: int, n_head: int) -> 'UnquantizedTensor':
+    def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> 'UnquantizedTensor':
        r = self.ndarray.shape[0] // 3
-        return UnquantizedTensor(permute(self.ndarray[r * n_part : r * n_part + r, ...], n_head, n_head))
+        return UnquantizedTensor(permute(self.ndarray[r * n_part : r * n_part + r, ...], n_head, n_head_kv))

    def part(self, n_part: int) -> 'UnquantizedTensor':
        r = self.ndarray.shape[0] // 3
@@ -531,7 +532,7 @@ LazyModel = Dict[str, LazyTensor]
 class ModelPlus:
    model: LazyModel
    paths: List[Path]  # Where this was read from.
-    format: Literal['ggml', 'torch', 'safetensors']
+    format: Literal['ggml', 'torch', 'safetensors', 'none']
    vocab: Optional[Vocab]  # For GGML models (which have vocab built in), the vocab.


@@ -597,12 +598,12 @@ def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_head_kv: int) -> LazyTe
        return lazy_tensor.load().permute(n_head, n_head_kv)
    return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description)

-def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int) -> LazyTensor:
+def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int, n_head_kv: int) -> LazyTensor:
    def load() -> Tensor:
-        return lazy_tensor.load().permute_part(n_part, n_head)
+        return lazy_tensor.load().permute_part(n_part, n_head, n_head_kv)
    s = lazy_tensor.shape.copy()
    s[0] = s[0] // 3
-    return LazyTensor(load, s, lazy_tensor.data_type, f'permute({n_head}) ' + lazy_tensor.description)
+    return LazyTensor(load, s, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description)

 def part_lazy(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor:
    def load() -> Tensor:
@@ -657,7 +658,7 @@ class LazyUnpickler(pickle.Unpickler):
        description = f'storage data_type={data_type} path-in-zip={filename} path={self.zip_file.filename}'
        return LazyStorage(load=load, kind=pid[1], description=description)

-    # @staticmethod
+    @staticmethod
    def lazy_rebuild_tensor_v2(storage: Any, storage_offset: Any, size: Any, stride: Any,
                               # pyright: ignore[reportSelfClsParameterName]
                               requires_grad: Any, backward_hooks: Any, metadata: Any = None) -> LazyTensor:
@@ -669,13 +670,15 @@ class LazyUnpickler(pickle.Unpickler):
        description = f'pickled storage_offset={storage_offset} in {storage.description}'
        return LazyTensor(load, list(size), storage.kind.data_type, description)

-    # @staticmethod
+    @staticmethod
    def rebuild_from_type_v2(func, new_type, args, state):
        return func(*args)

-    CLASSES: Dict[Any, Any] = {
-        ('torch._tensor', '_rebuild_from_type_v2'): rebuild_from_type_v2,
-        ('torch._utils', '_rebuild_tensor_v2'): lazy_rebuild_tensor_v2,
+    CLASSES: Dict[Tuple[str, str], Any] = {
+        # getattr used here as a workaround for mypy not being smart enough to determine
+        # that the staticmethods have a __func__ attribute.
+        ('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'),
+        ('torch._utils', '_rebuild_tensor_v2'): getattr(lazy_rebuild_tensor_v2, '__func__'),
        ('torch', 'BFloat16Storage'): LazyStorageKind(DT_BF16),
        ('torch', 'HalfStorage'): LazyStorageKind(DT_F16),
        ('torch', 'FloatStorage'): LazyStorageKind(DT_F32),
@@ -751,7 +754,7 @@ def lazy_load_file(path: Path) -> ModelPlus:
 In = TypeVar('In')
 Out = TypeVar('Out')

-def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], concurrency: int, max_workers: Optional[int] = None, factory: Callable = ThreadPoolExecutor) -> Iterable[Out]:
+def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], concurrency: int, max_workers: Optional[int] = None, use_processpool_executor: bool = False) -> Iterable[Out]:
    '''Parallel map, but with backpressure.  If the caller doesn't call `next`
    fast enough, this will stop calling `func` at some point rather than
    letting results pile up in memory.  Specifically, there is a max of one
@@ -760,7 +763,12 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc
        yield from map(func, iterable)
        # Not reached.
    iterable = iter(iterable)
-    with factory(max_workers = max_workers) as executor:
+    executor_class: Union[Type[ThreadPoolExecutor], Type[ProcessPoolExecutor]]
+    if use_processpool_executor:
+        executor_class = ProcessPoolExecutor
+    else:
+        executor_class = ThreadPoolExecutor
+    with executor_class(max_workers = max_workers) as executor:
        futures: List[concurrent.futures.Future[Out]] = []
        done = False
        for _ in range(concurrency):
@@ -803,10 +811,12 @@ class OutputFile:

    def add_meta_arch(self, params: Params) -> None:
        name = "LLaMA"
+
+        # TODO: better logic to determine model name
        if (params.n_ctx == 4096):
            name = "LLaMA v2"
-            if params.path_model:
-                name = str(params.path_model.parent).split('/')[-1]
+        elif params.path_model:
+            name = str(params.path_model.parent).split('/')[-1]

        self.gguf.add_name                (name)
        self.gguf.add_context_length      (params.n_ctx)
@@ -831,18 +841,25 @@ class OutputFile:
        tokens = []
        scores = []
        toktypes = []
-        # NOTE: `all_tokens` returns the the base vocabulary and added tokens
-        # TODO: add special tokens?
+        # NOTE: `all_tokens` returns the base vocabulary and added tokens
        for text, score, toktype in vocab.all_tokens():
            tokens.append(text)
            scores.append(score)
            toktypes.append(toktype)

-        self.gguf.add_tokenizer_model("llama")
+        if isinstance(vocab, SentencePieceVocab):
+            self.gguf.add_tokenizer_model("llama")
+        elif isinstance(vocab, BpeVocab):
+            self.gguf.add_tokenizer_model("gpt2")
+        else:
+            raise ValueError(f'Unknown vocab type: Not BpeVocab or SentencePieceVocab')
        self.gguf.add_token_list(tokens)
        self.gguf.add_token_scores(scores)
        self.gguf.add_token_types(toktypes)

+    def add_meta_special_vocab(self, svocab: gguf.SpecialVocab) -> None:
+        svocab.add_to_gguf(self.gguf)
+
    def add_tensor_info(self, name: str, tensor: LazyTensor) -> None:
        n_elements = int(np.prod(tensor.shape))
        raw_dtype = getattr(tensor.data_type, 'ggml_type', None)
@@ -861,7 +878,7 @@ class OutputFile:
        self.gguf.close()

    @staticmethod
-    def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab) -> None:
+    def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab) -> None:
        check_vocab_size(params, vocab)

        of = OutputFile(fname_out)
@@ -869,6 +886,8 @@ class OutputFile:
        # meta data
        of.add_meta_arch(params)
        of.add_meta_vocab(vocab)
+        of.add_meta_special_vocab(svocab)
+
        of.write_meta()

        of.close()
@@ -887,7 +906,7 @@ class OutputFile:
        return dt.quantize(arr)

    @staticmethod
-    def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, concurrency: int = DEFAULT_CONCURRENCY) -> None:
+    def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY) -> None:
        check_vocab_size(params, vocab)

        of = OutputFile(fname_out)
@@ -895,6 +914,7 @@ class OutputFile:
        # meta data
        of.add_meta_arch(params)
        of.add_meta_vocab(vocab)
+        of.add_meta_special_vocab(svocab)

        # tensor info
        for name, lazy_tensor in model.items():
@@ -906,7 +926,7 @@ class OutputFile:
        # tensor data
        ndarrays_inner = bounded_parallel_map(OutputFile.do_item, model.items(), concurrency = concurrency)
        if ftype == GGMLFileType.MostlyQ8_0:
-            ndarrays = bounded_parallel_map(OutputFile.maybe_do_quantize, ndarrays_inner, concurrency = concurrency, max_workers = concurrency, factory = ProcessPoolExecutor)
+            ndarrays = bounded_parallel_map(OutputFile.maybe_do_quantize, ndarrays_inner, concurrency = concurrency, max_workers = concurrency, use_processpool_executor = True)
        else:
            ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner)

@@ -939,7 +959,8 @@ def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyM
            for (name, tensor) in model.items()}

 def convert_model_names(model: LazyModel, params: Params) -> LazyModel:
-    tmap = gguf.get_tensor_name_map(ARCH, params.n_layer)
+    tmap = gguf.TensorNameMap(ARCH, params.n_layer)
+    should_skip: Set[gguf.MODEL_TENSOR] = set(gguf.MODEL_TENSOR_SKIP.get(ARCH, []))

    tmp = model

@@ -952,8 +973,8 @@ def convert_model_names(model: LazyModel, params: Params) -> LazyModel:
           #tmp[f"model.layers.{i}.self_attn.v_proj.weight"] =              model[f"model.layers.{i}.self_attn.v_proj.weight"]
        elif f"model.layers.{i}.self_attn.W_pack.weight" in model:
            print(f"Unpacking and permuting layer {i}")
-            tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head)
-            tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 1, params.n_head)
+            tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head, params.n_head)
+            tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 1, params.n_head, params.n_head_kv)
            tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = part_lazy        (model[f"model.layers.{i}.self_attn.W_pack.weight"], 2)
            del tmp[f"model.layers.{i}.self_attn.W_pack.weight"]
        else:
@@ -961,23 +982,16 @@ def convert_model_names(model: LazyModel, params: Params) -> LazyModel:

    out: LazyModel = {}
    for name, lazy_tensor in model.items():
-        name_new = name
-
-        if name in tmap:
-            name_new = tmap[name]
-        elif name.endswith(".weight") and name[:-7] in tmap:
-            name_new = tmap[name[:-7]] + ".weight"
-        elif name.endswith(".bias") and name[:-5] in tmap:
-            name_new = tmap[name[:-5]] + ".bias"
-        else:
+        tensor_type, name_new = tmap.get_type_and_name(name, try_suffixes = (".weight", ".bias")) or (None, None)
+        if name_new is None:
            raise Exception(f"Unexpected tensor name: {name}")

-        if gguf.should_skip_tensor_TMP(ARCH, params.n_layer, name_new):
+        if tensor_type in should_skip:
            print(f"skipping tensor {name_new}")
            continue
-        else:
-            print(f"{name:48s} -> {name_new:40s} | {lazy_tensor.data_type.name:6s} | {lazy_tensor.shape}")
-            out[name_new] = lazy_tensor
+
+        print(f"{name:48s} -> {name_new:40s} | {lazy_tensor.data_type.name:6s} | {lazy_tensor.shape}")
+        out[name_new] = lazy_tensor

    return out

@@ -1117,8 +1131,16 @@ def main(args_in: Optional[List[str]] = None) -> None:
    if args.dump_single:
        model_plus = lazy_load_file(args.model)
        do_dump_model(model_plus)
+        return

-    model_plus = load_some_model(args.model)
+    if not args.vocab_only:
+        model_plus = load_some_model(args.model)
+    else:
+        model_plus = ModelPlus(model = {}, paths = [args.model / 'dummy'], format = 'none', vocab = None)
+
+    if args.dump:
+        do_dump_model(model_plus)
+        return

    params = Params.load(model_plus)
    if params.n_ctx == -1:
@@ -1140,33 +1162,34 @@ def main(args_in: Optional[List[str]] = None) -> None:

    vocab: Vocab
    if args.vocab_only:
-        vocab = load_vocab(args.vocab_dir or args.model, args.vocabtype)
        assert args.outfile, "need --outfile if using --vocab-only"
+        # FIXME: Try to respect vocab_dir somehow?
+        vocab = load_vocab(args.vocab_dir or args.model, args.vocabtype)
+        special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, load_merges = args.vocabtype == 'bpe')
        outfile = args.outfile
-        OutputFile.write_vocab_only(outfile, params, vocab)
+        OutputFile.write_vocab_only(outfile, params, vocab, special_vocab)
        print(f"Wrote {outfile}")
+        return
+
+    if model_plus.vocab is not None and args.vocab_dir is None:
+        vocab = model_plus.vocab
    else:
-        if args.dump:
-            do_dump_model(model_plus)
-            return
+        vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
+        vocab = load_vocab(vocab_dir, args.vocabtype)
+    # FIXME: Try to respect vocab_dir somehow?
+    special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, load_merges = args.vocabtype == 'bpe')

-        if model_plus.vocab is not None and args.vocab_dir is None:
-            vocab = model_plus.vocab
-        else:
-            vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
-            vocab = load_vocab(vocab_dir, args.vocabtype)
+    model   = model_plus.model
+    model   = convert_model_names(model, params)
+    ftype   = pick_output_type(model, args.outtype)
+    model   = convert_to_output_type(model, ftype)
+    outfile = args.outfile or default_outfile(model_plus.paths, ftype)

-        model   = model_plus.model
-        model   = convert_model_names(model, params)
-        ftype   = pick_output_type(model, args.outtype)
-        model   = convert_to_output_type(model, ftype)
-        outfile = args.outfile or default_outfile(model_plus.paths, ftype)
+    params.ftype = ftype
+    print(f"Writing {outfile}, format {ftype}")

-        params.ftype = ftype
-        print(f"Writing {outfile}, format {ftype}")
-
-        OutputFile.write_all(outfile, ftype, params, model, vocab, concurrency = args.concurrency)
-        print(f"Wrote {outfile}")
+    OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, concurrency = args.concurrency)
+    print(f"Wrote {outfile}")


 if __name__ == '__main__':
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -25,7 +25,7 @@ else()
    add_subdirectory(simple)
    add_subdirectory(embd-input)
    add_subdirectory(llama-bench)
-    add_subdirectory(beam_search)
+    add_subdirectory(beam-search)
    if (LLAMA_METAL)
        add_subdirectory(metal)
    endif()
--- a/examples/beam-search/CMakeLists.txt
+++ b/examples/beam-search/CMakeLists.txt
@@ -1,5 +1,5 @@
-set(TARGET beam_search)
-add_executable(${TARGET} beam_search.cpp)
+set(TARGET beam-search)
+add_executable(${TARGET} beam-search.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- a/examples/beam-search/beam-search.cpp
+++ b/examples/beam-search/beam-search.cpp
--- a/examples/llm.vim
+++ b/examples/llm.vim
@@ -8,7 +8,7 @@ function! Llm()
  let buffer_content = join(getline(1, '$'), "\n")

  " Create the JSON payload
-  let json_payload = {"temp":0.72,"top_k":100,"top_p":0.73,"repeat_penalty":1.100000023841858,"n_predict":10,"stream": v:false}
+  let json_payload = {"temp":0.72,"top_k":100,"top_p":0.73,"repeat_penalty":1.100000023841858,"n_predict":256,"stop": ["\n\n\n"],"stream": v:false}
  let json_payload.prompt = buffer_content

  " Define the curl command
@@ -25,3 +25,4 @@ function! Llm()
 endfunction

 command! Llm call Llm()
+noremap <F2> :Llm<CR>
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -204,23 +204,31 @@ typedef void (*ggml_cuda_op_t)(
 // QR = QK / number of values before dequantization
 // QI = number of 32 bit integers before dequantization

+#define Q4_0DM   (1.0f/8.0f)
+#define Q4_0D(x) (((x)*Q4_0DM) / 127.0f)
+
 #define QK4_0 32
 #define QR4_0 2
 #define QI4_0 (QK4_0 / (4 * QR4_0))
 typedef struct {
-    half    d;              // delta
+    int8_t  d;              // delta
    uint8_t qs[QK4_0 / 2];  // nibbles / quants
 } block_q4_0;
-static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding");
+static_assert(sizeof(block_q4_0) == sizeof(int8_t) + QK4_0 / 2, "wrong q4_0 block size/padding");
+
+#define Q4_1DM   (2.0f/15.0f)
+#define Q4_1MM   (2.0f      )
+#define Q4_1D(x) (        (((x) &  0xFF)*Q4_1DM) / 255.0f)
+#define Q4_1M(x) (-1.0f + (((x) >>    8)*Q4_1MM) / 255.0f)

 #define QK4_1 32
 #define QR4_1 2
 #define QI4_1 (QK4_1 / (4 * QR4_1))
 typedef struct {
-    half2   dm;             // dm.x = delta, dm.y = min
-    uint8_t qs[QK4_1 / 2];  // nibbles / quants
+    uint16_t dm;             // 8-bit delta + 8-bit min (can be adjusted easily)
+    uint8_t  qs[QK4_1 / 2];  // nibbles / quants
 } block_q4_1;
-static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
+static_assert(sizeof(block_q4_1) == sizeof(uint16_t) + QK4_1 / 2, "wrong q4_1 block size/padding");

 #define QK5_0 32
 #define QR5_0 2
@@ -232,15 +240,20 @@ typedef struct {
 } block_q5_0;
 static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding");

+#define Q5_1DM   (2.0f/31.0f)
+#define Q5_1MM   (2.0f      )
+#define Q5_1D(x) (        (((x) &  0x0F)*Q5_1DM) / 15.0f)
+#define Q5_1M(x) (-1.0f + (((x) >>    4)*Q5_1MM) / 15.0f)
+
 #define QK5_1 32
 #define QR5_1 2
 #define QI5_1 (QK5_1 / (4 * QR5_1))
 typedef struct {
-    half2 dm;               // dm.x = delta, dm.y = min
+    uint8_t dm;             // 4-bit delta + 4-bit min
    uint8_t qh[4];          // 5-th bit of quants
    uint8_t qs[QK5_1 / 2];  // nibbles / quants
 } block_q5_1;
-static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
+static_assert(sizeof(block_q5_1) == sizeof(uint8_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");

 #define QK8_0 32
 #define QR8_0 1
@@ -506,7 +519,7 @@ static __global__ void rms_norm_f32(const float * x, float * dst, const int ncol
 static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const int ib, const int iqs, dfloat2 & v){
    const block_q4_0 * x = (const block_q4_0 *) vx;

-    const dfloat d = x[ib].d;
+    const dfloat d = Q4_0D(x[ib].d);

    const int vui = x[ib].qs[iqs];

@@ -525,8 +538,8 @@ static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const in
 static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
    const block_q4_1 * x = (const block_q4_1 *) vx;

-    const dfloat d = __low2half(x[ib].dm);
-    const dfloat m = __high2half(x[ib].dm);
+    const dfloat d = Q4_1D(x[ib].dm);
+    const dfloat m = Q4_1M(x[ib].dm);

    const int vui = x[ib].qs[iqs];

@@ -568,8 +581,8 @@ static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const in
 static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
    const block_q5_1 * x = (const block_q5_1 *) vx;

-    const dfloat d = __low2half(x[ib].dm);
-    const dfloat m = __high2half(x[ib].dm);
+    const dfloat d = Q5_1D(x[ib].dm);
+    const dfloat m = Q5_1M(x[ib].dm);

    uint32_t qh;
    memcpy(&qh, x[ib].qh, sizeof(qh));
@@ -2041,7 +2054,7 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_0);
    }

-    return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMVQ>(v, u, bq4_0->d, bq8_1->ds);
+    return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMVQ>(v, u, Q4_0D(bq4_0->d), bq8_1->ds);
 }

 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
@@ -2135,7 +2148,12 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_1);
    }

-    return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMVQ>(v, u, bq4_1->dm, bq8_1->ds);
+    const float d = Q4_1D(bq4_1->dm);
+    const float m = Q4_1M(bq4_1->dm);
+
+    const float2 dm = {d, m};
+
+    return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMVQ>(v, u, dm, bq8_1->ds);
 }

 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
@@ -2341,7 +2359,12 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_1);
    }

-    return vec_dot_q5_1_q8_1_impl<VDR_Q5_1_Q8_1_MMVQ>(vl, vh, u, bq5_1->dm, bq8_1->ds);
+    const float d = Q5_1D(bq5_1->dm); // delta: decoded from the low 4 bits of the packed dm byte
+    const float m = Q5_1M(bq5_1->dm); // min:   decoded from the high 4 bits of the packed dm byte
+
+    const float2 dm = {d, m};
+
+    return vec_dot_q5_1_q8_1_impl<VDR_Q5_1_Q8_1_MMVQ>(vl, vh, u, dm, bq8_1->ds);
 }

 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -697,6 +697,9 @@ void ggml_metal_graph_compute(
                        } break;
                    case GGML_OP_MUL:
                        {
+                            GGML_ASSERT(ne00 % 4 == 0);
+                            const int64_t nb = ne00/4;
+
                            if (ggml_nelements(src1) == ne10) {
                                // src1 is a row
                                [encoder setComputePipelineState:ctx->pipeline_mul_row];
@@ -706,9 +709,9 @@ void ggml_metal_graph_compute(
                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                            [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
-                            [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
+                            [encoder setBytes:&nb     length:sizeof(nb) atIndex:3];

-                            const int64_t n = ggml_nelements(dst);
+                            const int64_t n = ggml_nelements(dst)/4;

                            [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                        } break;
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
@@ -4,17 +4,22 @@ using namespace metal;

 #define MAX(x, y) ((x) > (y) ? (x) : (y))

+#define Q4_0DM   (1.0f/8.0f)
+#define Q4_0D(x) (((x)*Q4_0DM) / 127.0f)
 #define QK4_0 32
 #define QR4_0 2
 typedef struct {
-    half    d;             // delta
+    int8_t  d;             // delta
    uint8_t qs[QK4_0 / 2]; // nibbles / quants
 } block_q4_0;

+#define Q4_1DM   (2.0f/15.0f)
+#define Q4_1MM   (2.0f      )
+#define Q4_1D(x) (        (((x) &  0xFF)*Q4_1DM) / 255.0f)
+#define Q4_1M(x) (-1.0f + (((x) >>    8)*Q4_1MM) / 255.0f)
 #define QK4_1 32
 typedef struct {
-    half d;          // delta
-    half m;          // min
+    uint16_t dm;            // packed: low byte = 8-bit delta (Q4_1D), high byte = 8-bit min (Q4_1M)
     uint8_t qs[QK4_1 / 2];  // nibbles / quants
 } block_q4_1;

@@ -44,9 +49,9 @@ kernel void kernel_add_row(
 }

 kernel void kernel_mul(
-        device const float * src0,
-        device const float * src1,
-        device       float * dst,
+        device const float4 * src0,
+        device const float4 * src1,
+        device       float4 * dst,
        uint tpig[[thread_position_in_grid]]) {
    dst[tpig] = src0[tpig] * src1[tpig];
 }
@@ -54,12 +59,12 @@ kernel void kernel_mul(
 // assumption: src1 is a row
 // broadcast src1 into src0
 kernel void kernel_mul_row(
-        device const float * src0,
-        device const float * src1,
-        device       float * dst,
-        constant   int64_t & ne00,
+        device const float4 * src0,
+        device const float4 * src1,
+        device       float4 * dst,
+        constant    int64_t & nb,
        uint tpig[[thread_position_in_grid]]) {
-    dst[tpig] = src0[tpig] * src1[tpig % ne00];
+    dst[tpig] = src0[tpig] * src1[tpig % nb];
 }

 kernel void kernel_scale(
@@ -314,14 +319,18 @@ kernel void kernel_rms_norm(
 // we assume that the yl's have been multiplied with the appropriate scale factor
 // that corresponds to the missing bit shifts (1, 1/16, 1/256, 1/4096)
 inline float block_q_n_dot_y(device const block_q4_0 * qb_curr, float sumy, thread float * yl, int il) {
-    float d = qb_curr->d;
+    float d = Q4_0D(qb_curr->d);
    float2 acc = 0.f;
-    device const uint16_t * qs = ((device const uint16_t *)qb_curr + 1 + il/2);
+    device const uint8_t * qs = ((device const uint8_t *)qb_curr->qs + il);
+    uint16_t qs16;
    for (int i = 0; i < 8; i+=2) {
-        acc[0] += yl[i + 0] * (qs[i / 2] & 0x000F)
-                + yl[i + 1] * (qs[i / 2] & 0x0F00);
-        acc[1] += yl[i + 8] * (qs[i / 2] & 0x00F0)
-                + yl[i + 9] * (qs[i / 2] & 0xF000);
+        qs16 = qs[i+1];
+        qs16 <<= 8;
+        qs16 |= qs[i];
+        acc[0] += yl[i + 0] * (qs16 & 0x000F)
+                + yl[i + 1] * (qs16 & 0x0F00);
+        acc[1] += yl[i + 8] * (qs16 & 0x00F0)
+                + yl[i + 9] * (qs16 & 0xF000);
    }
    return d * (sumy * -8.f + acc[0] + acc[1]);
 }
@@ -331,9 +340,9 @@ inline float block_q_n_dot_y(device const block_q4_0 * qb_curr, float sumy, thre
 // we assume that the yl's have been multiplied with the appropriate scale factor
 // that corresponds to the missing bit shifts (1, 1/16, 1/256, 1/4096)
 inline float block_q_n_dot_y(device const block_q4_1 * qb_curr, float sumy, thread float * yl, int il) {
-    float d = qb_curr->d;
-    float m = qb_curr->m;
-    device const uint16_t * qs = ((device const uint16_t *)qb_curr + 2 + il/2);
+    float d = Q4_1D(qb_curr->dm);
+    float m = Q4_1M(qb_curr->dm);
+    device const uint16_t * qs = ((device const uint16_t *)qb_curr + 1 + il/2);
    float2 acc = 0.f;
    for (int i = 0; i < 8; i+=2) {
        acc[0] += yl[i + 0] * (qs[i / 2] & 0x000F)
@@ -1686,23 +1695,27 @@ void dequantize_f16(device const half4x4 * src, short il, thread type4x4 & reg)

 template <typename type4x4>
 void dequantize_q4_0(device const block_q4_0 *xb, short il, thread type4x4 & reg) {
-    device const uint16_t * qs = ((device const uint16_t *)xb + 1);
-    const half d = il ? (xb->d / 16.h) : xb->d;
+    device const uint8_t * qs = ((device const uint8_t *)xb->qs);
+    const half d = il ? (Q4_0D(xb->d) / 16.h) : Q4_0D(xb->d);
    const half m = il ? ( -8.h * 16.h) : -8.h;
    const ushort mask0 = il ? 0x00F0 : 0x000F;
    const ushort mask1 = il ? 0xF000 : 0x0F00;

+    uint16_t qs16;
    for (int i=0;i<8;i++) {
-        reg[i/2][2*(i%2)]   = (((qs[i] & mask0)     ) + m) * d;
-        reg[i/2][2*(i%2)+1] = (((qs[i] & mask1) >> 8) + m) * d;
+        qs16 = qs[2*i+1];
+        qs16 <<= 8;
+        qs16 |= qs[2*i];
+        reg[i/2][2*(i%2)]   = (((qs16 & mask0)     ) + m) * d;
+        reg[i/2][2*(i%2)+1] = (((qs16 & mask1) >> 8) + m) * d;
    }
 }

 template <typename type4x4>
 void dequantize_q4_1(device const block_q4_1 *xb, short il, thread type4x4 & reg) {
-    device const uint16_t * qs = ((device const uint16_t *)xb + 2);
-    const half d = il ? (xb->d / 16.h) : xb->d;
-    const half m = xb->m;
+    device const uint16_t * qs = ((device const uint16_t *)xb + 1);
+    const half d = il ? (Q4_1D(xb->dm) / 16.h) : Q4_1D(xb->dm);
+    const half m = Q4_1M(xb->dm);
    const ushort mask0 = il ? 0x00F0 : 0x000F;
    const ushort mask1 = il ? 0xF000 : 0x0F00;

--- a/ggml.c
+++ b/ggml.c
@@ -887,20 +887,28 @@ inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
 #endif
 #endif

+// we know the values are in the [-1 .. 1] range, so abs(d) cannot be more than 1/8 when using 4 bits
+#define Q4_0DM   (1.0f/8.0f)
+#define Q4_0D(x) (((x)*Q4_0DM) / 127.0f)
+
 #define QK4_0 32
 typedef struct {
-    ggml_fp16_t d;          // delta
+    int8_t  d;              // delta
    uint8_t qs[QK4_0 / 2];  // nibbles / quants
 } block_q4_0;
-static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding");
+static_assert(sizeof(block_q4_0) == sizeof(int8_t) + QK4_0 / 2, "wrong q4_0 block size/padding");
+
+#define Q4_1DM   (2.0f/15.0f)
+#define Q4_1MM   (2.0f      )
+#define Q4_1D(x) (        (((x) &  0xFF)*Q4_1DM) / 255.0f)
+#define Q4_1M(x) (-1.0f + (((x) >>    8)*Q4_1MM) / 255.0f)

 #define QK4_1 32
 typedef struct {
-    ggml_fp16_t d;          // delta
-    ggml_fp16_t m;          // min
-    uint8_t qs[QK4_1 / 2];  // nibbles / quants
+    uint16_t dm;             // 8-bit delta + 8-bit min (can be adjusted easily)
+    uint8_t  qs[QK4_1 / 2];  // nibbles / quants
 } block_q4_1;
-static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_fp16_t) + QK4_1 / 2, "wrong q4_1 block size/padding");
+static_assert(sizeof(block_q4_1) == sizeof(uint16_t) + QK4_1 / 2, "wrong q4_1 block size/padding");

 #define QK5_0 32
 typedef struct {
@@ -910,14 +918,21 @@ typedef struct {
 } block_q5_0;
 static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding");

+// we know the values are in the [-1 .. 1] range, so:
+//  - d is unsigned 4-bit that represents maximum value of 2.0/31 when using 5 bits
+//  - m is unsigned 4-bit that represents offset from -1.0 which cannot be more than 2.0
+#define Q5_1DM   (2.0f/31.0f)
+#define Q5_1MM   (2.0f      )
+#define Q5_1D(x) (        (((x) &  0x0F)*Q5_1DM) / 15.0f)
+#define Q5_1M(x) (-1.0f + (((x) >>    4)*Q5_1MM) / 15.0f)
+
 #define QK5_1 32
 typedef struct {
-    ggml_fp16_t d;         // delta
-    ggml_fp16_t m;         // min
+    uint8_t dm;            // 4-bit delta + 4-bit min (can be adjusted easily)
    uint8_t qh[4];         // 5-th bit of quants
    uint8_t qs[QK5_1 / 2]; // nibbles / quants
 } block_q5_1;
-static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
+static_assert(sizeof(block_q5_1) == sizeof(uint8_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");

 #define QK8_0 32
 typedef struct {
@@ -954,10 +969,13 @@ static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * r
            }
        }

-        const float d  = max / -8;
-        const float id = d ? 1.0f/d : 0.0f;
+        float d = max / -8;

-        y[i].d = GGML_FP32_TO_FP16(d);
+        y[i].d = (int8_t)(ceilf((127.0f * d) / Q4_0DM));
+
+        d = Q4_0D(y[i].d);
+
+        const float id = d ? 1.0f/d : 0.0f;

        for (int j = 0; j < qk/2; ++j) {
            const float x0 = x[i*qk + 0    + j]*id;
@@ -994,11 +1012,17 @@ static void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * r
            if (v > max) max = v;
        }

-        const float d  = (max - min) / ((1 << 4) - 1);
-        const float id = d ? 1.0f/d : 0.0f;
+        y[i].dm = (uint16_t)(floorf((255.0f * (min + 1.0f)) / Q4_1MM)) << 8;

-        y[i].d = GGML_FP32_TO_FP16(d);
-        y[i].m = GGML_FP32_TO_FP16(min);
+        min = Q4_1M(y[i].dm);
+
+        float d = (max - min) / ((1 << 4) - 1);
+
+        y[i].dm |= (uint16_t)(ceilf((255.0f * d) / Q4_1DM));
+
+        d = Q4_1D(y[i].dm);
+
+        const float id = d ? 1.0f/d : 0.0f;

        for (int j = 0; j < qk/2; ++j) {
            const float x0 = (x[i*qk + 0    + j] - min)*id;
@@ -1083,11 +1107,17 @@ static void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * r
            if (v > max) max = v;
        }

-        const float d  = (max - min) / ((1 << 5) - 1);
-        const float id = d ? 1.0f/d : 0.0f;
+        y[i].dm = (uint8_t)(floorf((15.0f * (min + 1.0f)) / Q5_1MM)) << 4;

-        y[i].d = GGML_FP32_TO_FP16(d);
-        y[i].m = GGML_FP32_TO_FP16(min);
+        min = Q5_1M(y[i].dm);
+
+        float d = (max - min) / ((1 << 5) - 1);
+
+        y[i].dm |= (uint8_t)(ceilf((15.0f * d) / Q5_1DM));
+
+        d = Q5_1D(y[i].dm);
+
+        const float id = d ? 1.0f/d : 0.0f;

        uint32_t qh = 0;

@@ -1525,7 +1555,7 @@ static void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict
    const int nb = k / qk;

    for (int i = 0; i < nb; i++) {
-        const float d = GGML_FP16_TO_FP32(x[i].d);
+        const float d = Q4_0D(x[i].d);

        for (int j = 0; j < qk/2; ++j) {
            const int x0 = (x[i].qs[j] & 0x0F) - 8;
@@ -1545,8 +1575,8 @@ static void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict
    const int nb = k / qk;

    for (int i = 0; i < nb; i++) {
-        const float d = GGML_FP16_TO_FP32(x[i].d);
-        const float m = GGML_FP16_TO_FP32(x[i].m);
+        const float d = Q4_1D(x[i].dm);
+        const float m = Q4_1M(x[i].dm);

        for (int j = 0; j < qk/2; ++j) {
            const int x0 = (x[i].qs[j] & 0x0F);
@@ -1592,8 +1622,8 @@ static void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict
    const int nb = k / qk;

    for (int i = 0; i < nb; i++) {
-        const float d = GGML_FP16_TO_FP32(x[i].d);
-        const float m = GGML_FP16_TO_FP32(x[i].m);
+        const float d = Q5_1D(x[i].dm);
+        const float m = Q5_1M(x[i].dm);

        uint32_t qh;
        memcpy(&qh, x[i].qh, sizeof(qh));
@@ -2476,8 +2506,8 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
        const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h);
        const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h);

-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
+        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), Q4_0D(x0->d)*GGML_FP16_TO_FP32(y0->d));
+        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), Q4_0D(x1->d)*GGML_FP16_TO_FP32(y1->d));
 #else
        const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0l));
        const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0l));
@@ -2494,8 +2524,8 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
        const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
        const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));

-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
+        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), Q4_0D(x0->d)*GGML_FP16_TO_FP32(y0->d));
+        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), Q4_0D(x1->d)*GGML_FP16_TO_FP32(y1->d));
 #endif
    }

@@ -2507,7 +2537,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
    // Main loop
    for (int i = 0; i < nb; ++i) {
        /* Compute combined scale for the block */
-        const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) );
+        const __m256 d = _mm256_set1_ps( Q4_0D(x[i].d) * GGML_FP16_TO_FP32(y[i].d) );

        __m256i bx = bytes_from_nibbles_32(x[i].qs);

@@ -2531,7 +2561,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
    // Main loop
    for (int i = 0; i < nb; ++i) {
        // Compute combined scale for the block
-        const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) );
+        const __m256 d = _mm256_set1_ps( Q4_0D(x[i].d) * GGML_FP16_TO_FP32(y[i].d) );

        const __m128i lowMask = _mm_set1_epi8(0xF);
        const __m128i off = _mm_set1_epi8(8);
@@ -2573,7 +2603,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
        _mm_prefetch(&y[0] + sizeof(block_q8_0), _MM_HINT_T0);

        // Compute combined scale for the block 0 and 1
-        const __m128 d_0_1 = _mm_set1_ps( GGML_FP16_TO_FP32(x[0].d) * GGML_FP16_TO_FP32(y[0].d) );
+        const __m128 d_0_1 = _mm_set1_ps( Q4_0D(x[0].d) * GGML_FP16_TO_FP32(y[0].d) );

        const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[0].qs);

@@ -2591,7 +2621,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
        _mm_prefetch(&y[1] + sizeof(block_q8_0), _MM_HINT_T0);

        // Compute combined scale for the block 2 and 3
-        const __m128 d_2_3 = _mm_set1_ps( GGML_FP16_TO_FP32(x[1].d) * GGML_FP16_TO_FP32(y[1].d) );
+        const __m128 d_2_3 = _mm_set1_ps( Q4_0D(x[1].d) * GGML_FP16_TO_FP32(y[1].d) );

        const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[1].qs);

@@ -2625,7 +2655,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
        _mm_prefetch(&y[i] + sizeof(block_q8_0), _MM_HINT_T0);

        // Compute combined scale for the block 0 and 1
-        const __m128 d_0_1 = _mm_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(y[i].d) );
+        const __m128 d_0_1 = _mm_set1_ps( Q4_0D(x[i].d) * GGML_FP16_TO_FP32(y[i].d) );

        const __m128i tmp_0_1 = _mm_loadu_si128((const __m128i *)x[i].qs);

@@ -2643,7 +2673,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
        _mm_prefetch(&y[i] + 2 * sizeof(block_q8_0), _MM_HINT_T0);

        // Compute combined scale for the block 2 and 3
-        const __m128 d_2_3 = _mm_set1_ps( GGML_FP16_TO_FP32(x[i + 1].d) * GGML_FP16_TO_FP32(y[i + 1].d) );
+        const __m128 d_2_3 = _mm_set1_ps( Q4_0D(x[i + 1].d) * GGML_FP16_TO_FP32(y[i + 1].d) );

        const __m128i tmp_2_3 = _mm_loadu_si128((const __m128i *)x[i + 1].qs);

@@ -2691,7 +2721,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
            sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]);
        }

-        sumf += sumi*GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d);
+        sumf += sumi*Q4_0D(x[i].d)*GGML_FP16_TO_FP32(y[i].d);
    }

    *s = sumf;
@@ -2721,7 +2751,7 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
        const block_q8_1 * restrict y0 = &y[i + 0];
        const block_q8_1 * restrict y1 = &y[i + 1];

-        summs += GGML_FP16_TO_FP32(x0->m) * y0->s + GGML_FP16_TO_FP32(x1->m) * y1->s;
+        summs += Q4_1M(x0->dm) * y0->s + Q4_1M(x1->dm) * y1->s;

        const uint8x16_t m4b = vdupq_n_u8(0x0F);

@@ -2745,8 +2775,8 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
        const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h);
        const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h);

-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*y0->d);
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*y1->d);
+        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), Q4_1D(x0->dm)*y0->d);
+        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), Q4_1D(x1->dm)*y1->d);
 #else
        const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0l), vget_low_s8 (v1_0l));
        const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0l), vget_high_s8(v1_0l));
@@ -2763,8 +2793,8 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
        const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
        const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));

-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*y0->d);
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*y1->d);
+        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), Q4_1D(x0->dm)*y0->d);
+        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), Q4_1D(x1->dm)*y1->d);
 #endif
    }

@@ -2777,10 +2807,10 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *

    // Main loop
    for (int i = 0; i < nb; ++i) {
-        const float d0 = GGML_FP16_TO_FP32(x[i].d);
+        const float d0 = Q4_1D(x[i].dm);
        const float d1 = y[i].d;

-        summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s;
+        summs += Q4_1M(x[i].dm) * y[i].s;

        const __m256 d0v = _mm256_set1_ps( d0 );
        const __m256 d1v = _mm256_set1_ps( d1 );
@@ -2817,7 +2847,7 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
            sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]);
        }

-        sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
+        sumf += (Q4_1D(x[i].dm)*y[i].d)*sumi + Q4_1M(x[i].dm)*y[i].s;
    }

    *s = sumf;
@@ -3096,8 +3126,8 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *

        const uint8x16_t m4b = vdupq_n_u8(0x0F);

-        summs0 += GGML_FP16_TO_FP32(x0->m) * y0->s;
-        summs1 += GGML_FP16_TO_FP32(x1->m) * y1->s;
+        summs0 += Q5_1M(x0->dm) * y0->s;
+        summs1 += Q5_1M(x1->dm) * y1->s;

        // extract the 5th bit via lookup table ((b) << 4)
        memcpy(&qh0, x0->qh, sizeof(qh0));
@@ -3142,10 +3172,10 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
 #if defined(__ARM_FEATURE_DOTPROD)
        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
                        vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
-                        vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*y0->d);
+                        vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), Q5_1D(x0->dm)*y0->d);
        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
                        vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
-                        vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*y1->d);
+                        vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), Q5_1D(x1->dm)*y1->d);
 #else
        const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lf), vget_low_s8 (v1_0l));
        const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lf), vget_high_s8(v1_0l));
@@ -3162,8 +3192,8 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
        const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
        const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));

-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*y0->d);
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*y1->d);
+        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), Q5_1D(x0->dm)*y0->d);
+        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), Q5_1D(x1->dm)*y1->d);
 #endif
    }

@@ -3181,7 +3211,7 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
        const block_q5_1 * restrict x0 = &x[i];
        const block_q8_1 * restrict y0 = &y[i];

-        summs += GGML_FP16_TO_FP32(x0->m) * y0->s;
+        summs += Q5_1M(x0->dm) * y0->s;

        const v128_t m4b = wasm_i8x16_splat(0x0F);

@@ -3228,7 +3258,7 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
                                           wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
                            wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
                                           wasm_i32x4_dot_i16x8(v0hfh, v1hh)))),
-                    wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * y0->d)));
+                    wasm_f32x4_splat(Q5_1D(x0->dm) * y0->d)));
    }

    *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
@@ -3241,9 +3271,9 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *

    // Main loop
    for (int i = 0; i < nb; i++) {
-        const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d));
+        const __m256 dx = _mm256_set1_ps(Q5_1D(x[i].dm));

-        summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s;
+        summs += Q5_1M(x[i].dm) * y[i].s;

        __m256i bx = bytes_from_nibbles_32(x[i].qs);
        __m256i bxhi = bytes_from_bits_32(x[i].qh);
@@ -3268,9 +3298,9 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *

    // Main loop
    for (int i = 0; i < nb; i++) {
-        const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d));
+        const __m256 dx = _mm256_set1_ps(Q5_1D(x[i].dm));

-        summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s;
+        summs += Q5_1M(x[i].dm) * y[i].s;

        __m256i bx = bytes_from_nibbles_32(x[i].qs);
        const __m256i bxhi = bytes_from_bits_32(x[i].qh);
@@ -3313,7 +3343,7 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
            sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]);
        }

-        sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
+        sumf += (Q5_1D(x[i].dm)*y[i].d)*sumi + Q5_1M(x[i].dm)*y[i].s;
    }

    *s = sumf;
@@ -5491,7 +5521,7 @@ struct ggml_tensor * ggml_sum_rows(
    }

    int64_t ne[4] = {1,1,1,1};
-    for (int i=1; i<a->n_dims; ++i) {
+    for (int i = 1; i < a->n_dims; ++i) {
        ne[i] = a->ne[i];
    }

@@ -9316,6 +9346,13 @@ static void ggml_compute_forward_mul_f32(

    const int64_t nr = ggml_nrows(src0);

+    // rows per thread, rounded up; kept 64-bit because nr is int64_t
+    const int64_t dr = (nr + nth - 1)/nth;
+
+    // half-open row range [ir0, ir1) handled by this thread
+    const int64_t ir0 = dr*ith;
+    const int64_t ir1 = MIN(ir0 + dr, nr);
+
    GGML_TENSOR_BINARY_OP_LOCALS;

    GGML_ASSERT( nb0 == sizeof(float));
@@ -9323,7 +9360,7 @@ static void ggml_compute_forward_mul_f32(
    GGML_ASSERT(ne00 == ne10);

    if (nb10 == sizeof(float)) {
-        for (int64_t ir = ith; ir < nr; ir += nth) {
+        for (int64_t ir = ir0; ir < ir1; ++ir) {
            // src0 and dst are same shape => same indices
            const int64_t i03 = ir/(ne02*ne01);
            const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
@@ -9337,19 +9374,11 @@ static void ggml_compute_forward_mul_f32(
            float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
            float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);

-#ifdef GGML_USE_ACCELERATE
-            UNUSED(ggml_vec_mul_f32);
-
-            vDSP_vmul( src0_ptr, 1, src1_ptr, 1, dst_ptr,  1, ne00);
-#else
            ggml_vec_mul_f32(ne00, dst_ptr, src0_ptr, src1_ptr);
-#endif
-                // }
-            // }
        }
    } else {
        // src1 is not contiguous
-        for (int64_t ir = ith; ir < nr; ir += nth) {
+        for (int64_t ir = ir0; ir < ir1; ++ir) {
            // src0 and dst are same shape => same indices
            // src1 is broadcastable across src0 and dst in i1, i2, i3
            const int64_t i03 = ir/(ne02*ne01);
--- a/gguf-py/README.md
+++ b/gguf-py/README.md
@@ -27,8 +27,25 @@ In this case, upgrade Pip to the latest:
 pip install --upgrade pip
 ```

-## Publishing
-To publish the package, you need to have `twine` and `build` installed:
+## Automatic publishing with CI
+
+There's a GitHub workflow to make a release automatically upon creation of tags in a specified format.
+
+1. Bump the version in `pyproject.toml`.
+2. Create a tag named `gguf-vx.x.x` where `x.x.x` is the semantic version number.
+
+```sh
+git tag -a gguf-v1.0.0 -m "Version 1.0 release"
+```
+
+3. Push the tags.
+
+```sh
+git push origin --tags
+```
+
+## Manual publishing
+If you want to publish the package manually for any reason, you need to have `twine` and `build` installed:

 ```sh
 pip install build twine
@@ -36,7 +53,7 @@ pip install build twine

 Then, folow these steps to release a new version:

-1. Update the version in `pyproject.toml`.
+1. Bump the version in `pyproject.toml`.
 2. Build the package:

 ```sh
--- a/gguf-py/gguf/gguf.py
+++ b/gguf-py/gguf/gguf.py
@@ -4,9 +4,13 @@ import sys
 import struct
 import tempfile
 import numpy as np
+import json
+import os
+from pathlib import Path

 from enum import IntEnum, auto
-from typing import Any, IO, List, Optional
+from io import BufferedWriter
+from typing import Any, BinaryIO, Callable, IO, Dict, List, Optional, Sequence, Tuple, Union

 #
 # constants
@@ -71,35 +75,35 @@ KEY_TOKENIZER_RWKV       = "tokenizer.rwkv.world"


 class MODEL_ARCH(IntEnum):
-    LLAMA   = auto()
-    FALCON  = auto()
-    GPT2    = auto()
-    GPTJ    = auto()
-    GPTNEOX = auto()
-    MPT     = auto()
+    LLAMA  : int = auto()
+    FALCON : int = auto()
+    GPT2   : int = auto()
+    GPTJ   : int = auto()
+    GPTNEOX: int = auto()
+    MPT    : int = auto()


 class MODEL_TENSOR(IntEnum):
-    TOKEN_EMBD    = auto()
-    POS_EMBD      = auto()
-    OUTPUT        = auto()
-    OUTPUT_NORM   = auto()
-    ROPE_FREQS    = auto()
-    ATTN_Q        = auto()
-    ATTN_K        = auto()
-    ATTN_V        = auto()
-    ATTN_QKV      = auto()
-    ATTN_OUT      = auto()
-    ATTN_NORM     = auto()
-    ATTN_NORM_2   = auto()
-    ATTN_ROT_EMBD = auto()
-    FFN_GATE      = auto()
-    FFN_DOWN      = auto()
-    FFN_UP        = auto()
-    FFN_NORM      = auto()
+    TOKEN_EMBD   : int = auto()
+    POS_EMBD     : int = auto()
+    OUTPUT       : int = auto()
+    OUTPUT_NORM  : int = auto()
+    ROPE_FREQS   : int = auto()
+    ATTN_Q       : int = auto()
+    ATTN_K       : int = auto()
+    ATTN_V       : int = auto()
+    ATTN_QKV     : int = auto()
+    ATTN_OUT     : int = auto()
+    ATTN_NORM    : int = auto()
+    ATTN_NORM_2  : int = auto()
+    ATTN_ROT_EMBD: int = auto()
+    FFN_GATE     : int = auto()
+    FFN_DOWN     : int = auto()
+    FFN_UP       : int = auto()
+    FFN_NORM     : int = auto()


-MODEL_ARCH_NAMES = {
+MODEL_ARCH_NAMES: Dict[MODEL_ARCH, str] = {
    MODEL_ARCH.LLAMA:   "llama",
    MODEL_ARCH.FALCON:  "falcon",
    MODEL_ARCH.GPT2:    "gpt2",
@@ -108,7 +112,7 @@ MODEL_ARCH_NAMES = {
    MODEL_ARCH.MPT:     "mpt",
 }

-MODEL_TENSOR_NAMES = {
+MODEL_TENSOR_NAMES: Dict[MODEL_ARCH, Dict[MODEL_TENSOR, str]] = {
    MODEL_ARCH.LLAMA: {
        MODEL_TENSOR.TOKEN_EMBD:    "token_embd",
        MODEL_TENSOR.OUTPUT_NORM:   "output_norm",
@@ -154,7 +158,7 @@ MODEL_TENSOR_NAMES = {
 }

 # tensors that will not be serialized
-MODEL_TENSOR_SKIP = {
+MODEL_TENSOR_SKIP: Dict[MODEL_ARCH, List[MODEL_TENSOR]] = {
    MODEL_ARCH.LLAMA: [
        MODEL_TENSOR.ROPE_FREQS,
        MODEL_TENSOR.ATTN_ROT_EMBD,
@@ -162,167 +166,198 @@ MODEL_TENSOR_SKIP = {
 }


-# TODO: the following helper functions should be removed
-#       instead, get_tensor_name_map should return tuples of (name, MODEL_TENSOR)
-#       however, my Python is very bad, and I couldn't figure out how to do this, hence these functions
-# REMOVE
-def should_skip_tensor_TMP(arch: MODEL_ARCH, n_blocks: int, name: str) -> bool:
-    for skip in MODEL_TENSOR_SKIP.get(arch, []):
-        for i in range(n_blocks):
-            if name == MODEL_TENSOR_NAMES[arch][skip].format(bid=i):
-                return True
+class TensorNameMap:
+    mappings_cfg: Dict[MODEL_TENSOR, Tuple[str, ...]] = {
+        # Token embeddings
+        MODEL_TENSOR.TOKEN_EMBD: (
+            "gpt_neox.embed_in",           # gptneox
+            "transformer.wte",             # gpt2 mpt
+            "transformer.word_embeddings", # falcon
+            "model.embed_tokens",          # llama-hf
+            "tok_embeddings",              # llama-pth
+        ),

-    return False
+        # Position embeddings
+        MODEL_TENSOR.POS_EMBD: (
+            "transformer.wpe", # gpt2
+        ),

+        # Output
+        MODEL_TENSOR.OUTPUT: (
+            "embed_out", # gptneox
+            "lm_head",   # gpt2 mpt falcon llama-hf
+            "output",    # llama-pth
+        ),

-def get_tensor_name_map(arch: MODEL_ARCH, n_blocks: int) -> dict:
-    tensor_map = {}
+        MODEL_TENSOR.OUTPUT_NORM: ( # Output norm
+            "gpt_neox.final_layer_norm", # gptneox
+            "transformer.ln_f",          # gpt2 falcon
+            "transformer.norm_f",        # mpt
+            "model.norm",                # llama-hf
+            "norm",                      # llama-pth
+        ),

-    # Token embeddings
-    mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.TOKEN_EMBD, None)
+        # Rope frequencies
+        MODEL_TENSOR.ROPE_FREQS: (
+            "rope.freqs", # llama-pth
+        ),
+    }

-    tensor_map["gpt_neox.embed_in"]           = mapped_to  # gptneox
-    tensor_map["transformer.wte"]             = mapped_to  # gpt2 mpt
-    tensor_map["transformer.word_embeddings"] = mapped_to  # falcon
-    tensor_map["model.embed_tokens"]          = mapped_to  # llama-hf
-    tensor_map["tok_embeddings"]              = mapped_to  # llama-pth
-
-    # Position embeddings
-    mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.POS_EMBD, None)
-
-    tensor_map["transformer.wpe"] = mapped_to  # gpt2
-
-    # Output
-    mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.OUTPUT, None)
-
-    tensor_map["embed_out"] = mapped_to  # gptneox
-    tensor_map["lm_head"]   = mapped_to  # gpt2 mpt falcon llama-hf
-    tensor_map["output"]    = mapped_to  # llama-pth
-
-    # Output norm
-    mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.OUTPUT_NORM, None)
-
-    tensor_map["gpt_neox.final_layer_norm"] = mapped_to  # gptneox
-    tensor_map["transformer.ln_f"]          = mapped_to  # gpt2 falcon
-    tensor_map["transformer.norm_f"]        = mapped_to  # mpt
-    tensor_map["model.norm"]                = mapped_to  # llama-hf
-    tensor_map["norm"]                      = mapped_to  # llama-pth
-
-    # Rope frequencies
-    mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ROPE_FREQS, None)
-
-    tensor_map["rope.freqs"] = mapped_to  # llama-pth
-
-    # Attention and feed-forward blocks
-    for i in range(0, n_blocks):
+    block_mappings_cfg: Dict[MODEL_TENSOR, Tuple[str, ...]] = {
        # Attention norm
-        # TODO: is there are simpler way to write these 2 lines in Python?
-        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_NORM, None)
-        mapped_to = mapped_to.format(bid=i) if mapped_to else None
-
-        tensor_map["gpt_neox.layers."+str(i)+".input_layernorm"] = mapped_to  # gptneox
-        tensor_map["transformer.h."+str(i)+".ln_1"]              = mapped_to  # gpt2
-        tensor_map["transformer.blocks."+str(i)+".norm_1"]       = mapped_to  # mpt
-        tensor_map["transformer.h."+str(i)+".input_layernorm"]   = mapped_to  # falcon7b
-        tensor_map["transformer.h."+str(i)+".ln_mlp"]            = mapped_to  # falcon40b
-        tensor_map["model.layers."+str(i)+".input_layernorm"]    = mapped_to  # llama-hf
-        tensor_map["layers."+str(i)+".attention_norm"]           = mapped_to  # llama-pth
+        MODEL_TENSOR.ATTN_NORM: (
+            "gpt_neox.layers.{bid}.input_layernorm", # gptneox
+            "transformer.h.{bid}.ln_1",              # gpt2
+            "transformer.blocks.{bid}.norm_1",       # mpt
+            "transformer.h.{bid}.input_layernorm",   # falcon7b
+            "transformer.h.{bid}.ln_mlp",            # falcon40b
+            "model.layers.{bid}.input_layernorm",    # llama-hf
+            "layers.{bid}.attention_norm",           # llama-pth
+        ),

        # Attention norm 2
-        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_NORM_2, None)
-        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
-
-        tensor_map["transformer.h."+str(i)+".ln_attn"] = mapped_to  # falcon40b
+        MODEL_TENSOR.ATTN_NORM_2: (
+            "transformer.h.{bid}.ln_attn", # falcon40b
+        ),

        # Attention query-key-value
-        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_QKV, None)
-        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
-
-        tensor_map["gpt_neox.layers."+str(i)+".attention.query_key_value"]    = mapped_to  # gptneox
-        tensor_map["transformer.h."+str(i)+".attn.c_attn"]                    = mapped_to  # gpt2
-        tensor_map["transformer.blocks."+str(i)+".attn.Wqkv"]                 = mapped_to  # mpt
-        tensor_map["transformer.h."+str(i)+".self_attention.query_key_value"] = mapped_to  # falcon
+        MODEL_TENSOR.ATTN_QKV: (
+            "gpt_neox.layers.{bid}.attention.query_key_value",    # gptneox
+            "transformer.h.{bid}.attn.c_attn",                    # gpt2
+            "transformer.blocks.{bid}.attn.Wqkv",                 # mpt
+            "transformer.h.{bid}.self_attention.query_key_value", # falcon
+        ),

        # Attention query
-        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_Q, None)
-        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
-
-        tensor_map["model.layers."+str(i)+".self_attn.q_proj"] = mapped_to  # llama-hf
-        tensor_map["layers."+str(i)+".attention.wq"]           = mapped_to  # llama-pth
+        MODEL_TENSOR.ATTN_Q: (
+            "model.layers.{bid}.self_attn.q_proj", # llama-hf
+            "layers.{bid}.attention.wq",           # llama-pth
+        ),

        # Attention key
-        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_K, None)
-        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
-
-        tensor_map["model.layers."+str(i)+".self_attn.k_proj"] = mapped_to  # llama-hf
-        tensor_map["layers."+str(i)+".attention.wk"]           = mapped_to  # llama-pth
+        MODEL_TENSOR.ATTN_K: (
+            "model.layers.{bid}.self_attn.k_proj", # llama-hf
+            "layers.{bid}.attention.wk",           # llama-pth
+        ),

        # Attention value
-        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_V, None)
-        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
-
-        tensor_map["model.layers."+str(i)+".self_attn.v_proj"] = mapped_to  # llama-hf
-        tensor_map["layers."+str(i)+".attention.wv"]           = mapped_to  # llama-pth
+        MODEL_TENSOR.ATTN_V: (
+            "model.layers.{bid}.self_attn.v_proj", # llama-hf
+            "layers.{bid}.attention.wv",           # llama-pth
+        ),

        # Attention output
-        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_OUT, None)
-        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
-
-        tensor_map["gpt_neox.layers."+str(i)+".attention.dense"]    = mapped_to  # gptneox
-        tensor_map["transformer.h."+str(i)+".attn.c_proj"]          = mapped_to  # gpt2
-        tensor_map["transformer.blocks."+str(i)+".attn.out_proj"]   = mapped_to  # mpt
-        tensor_map["transformer.h."+str(i)+".self_attention.dense"] = mapped_to  # falcon
-        tensor_map["model.layers."+str(i)+".self_attn.o_proj"]      = mapped_to  # llama-hf
-        tensor_map["layers."+str(i)+".attention.wo"]                = mapped_to  # llama-pth
+        MODEL_TENSOR.ATTN_OUT: (
+            "gpt_neox.layers.{bid}.attention.dense",    # gptneox
+            "transformer.h.{bid}.attn.c_proj",          # gpt2
+            "transformer.blocks.{bid}.attn.out_proj",   # mpt
+            "transformer.h.{bid}.self_attention.dense", # falcon
+            "model.layers.{bid}.self_attn.o_proj",      # llama-hf
+            "layers.{bid}.attention.wo",                # llama-pth
+        ),

        # Rotary embeddings
-        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_ROT_EMBD, None)
-        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
-
-        tensor_map["model.layers."+str(i)+".self_attn.rotary_emb.inv_freq"]  = mapped_to  # llama-hf
-        tensor_map["layers."+str(i)+".attention.inner_attention.rope.freqs"] = mapped_to  # llama-pth
+        MODEL_TENSOR.ATTN_ROT_EMBD: (
+            "model.layers.{bid}.self_attn.rotary_emb.inv_freq",  # llama-hf
+            "layers.{bid}.attention.inner_attention.rope.freqs", # llama-pth
+        ),

        # Feed-forward norm
-        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.FFN_NORM, None)
-        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
-
-        tensor_map["gpt_neox.layers."+str(i)+".post_attention_layernorm"] = mapped_to  # gptneox
-        tensor_map["transformer.h."+str(i)+".ln_2"]                       = mapped_to  # gpt2
-        tensor_map["transformer.blocks."+str(i)+".norm_2"]                = mapped_to  # mpt
-        tensor_map["model.layers."+str(i)+".post_attention_layernorm"]    = mapped_to  # llama-hf
-        tensor_map["layers."+str(i)+".ffn_norm"]                          = mapped_to  # llama-pth
+        MODEL_TENSOR.FFN_NORM: (
+            "gpt_neox.layers.{bid}.post_attention_layernorm", # gptneox
+            "transformer.h.{bid}.ln_2",                       # gpt2
+            "transformer.blocks.{bid}.norm_2",                # mpt
+            "model.layers.{bid}.post_attention_layernorm",    # llama-hf
+            "layers.{bid}.ffn_norm",                          # llama-pth
+        ),

        # Feed-forward up
-        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.FFN_UP, None)
-        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
-
-        tensor_map["gpt_neox.layers."+str(i)+".mlp.dense_h_to_4h"] = mapped_to  # gptneox
-        tensor_map["transformer.h."+str(i)+".mlp.c_fc"]            = mapped_to  # gpt2
-        tensor_map["transformer.blocks."+str(i)+".ffn.up_proj"]    = mapped_to  # mpt
-        tensor_map["transformer.h."+str(i)+".mlp.dense_h_to_4h"]   = mapped_to  # falcon
-        tensor_map["model.layers."+str(i)+".mlp.up_proj"]          = mapped_to  # llama-hf
-        tensor_map["layers."+str(i)+".feed_forward.w3"]            = mapped_to  # llama-pth
+        MODEL_TENSOR.FFN_UP: (
+            "gpt_neox.layers.{bid}.mlp.dense_h_to_4h", # gptneox
+            "transformer.h.{bid}.mlp.c_fc",            # gpt2
+            "transformer.blocks.{bid}.ffn.up_proj",    # mpt
+            "transformer.h.{bid}.mlp.dense_h_to_4h",   # falcon
+            "model.layers.{bid}.mlp.up_proj",          # llama-hf
+            "layers.{bid}.feed_forward.w3",            # llama-pth
+        ),

        # Feed-forward gate
-        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.FFN_GATE, None)
-        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
-
-        tensor_map["model.layers."+str(i)+".mlp.gate_proj"] = mapped_to  # llama-hf
-        tensor_map["layers."+str(i)+".feed_forward.w1"]     = mapped_to  # llama-pth
+        MODEL_TENSOR.FFN_GATE: (
+            "model.layers.{bid}.mlp.gate_proj", # llama-hf
+            "layers.{bid}.feed_forward.w1",     # llama-pth
+        ),

        # Feed-forward down
-        mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.FFN_DOWN, None)
-        mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None
+        MODEL_TENSOR.FFN_DOWN: (
+            "gpt_neox.layers.{bid}.mlp.dense_4h_to_h", # gptneox
+            "transformer.h.{bid}.mlp.c_proj",          # gpt2
+            "transformer.blocks.{bid}.ffn.down_proj",  # mpt
+            "transformer.h.{bid}.mlp.dense_4h_to_h",   # falcon
+            "model.layers.{bid}.mlp.down_proj",        # llama-hf
+            "layers.{bid}.feed_forward.w2",            # llama-pth
+        ),
+    }

-        tensor_map["gpt_neox.layers."+str(i)+".mlp.dense_4h_to_h"] = mapped_to  # gptneox
-        tensor_map["transformer.h."+str(i)+".mlp.c_proj"]          = mapped_to  # gpt2
-        tensor_map["transformer.blocks."+str(i)+".ffn.down_proj"]  = mapped_to  # mpt
-        tensor_map["transformer.h."+str(i)+".mlp.dense_4h_to_h"]   = mapped_to  # falcon
-        tensor_map["model.layers."+str(i)+".mlp.down_proj"]        = mapped_to  # llama-hf
-        tensor_map["layers."+str(i)+".feed_forward.w2"]            = mapped_to  # llama-pth
+    mapping: Dict[str, Tuple[MODEL_TENSOR, str]]

-    return tensor_map
+    tensor_names: Dict[MODEL_TENSOR, str]

+    def __init__(self, arch: MODEL_ARCH, n_blocks: int):
+        mapping = self.mapping = {}
+        tensor_names = self.tensor_names = MODEL_TENSOR_NAMES[arch]
+        for tensor, keys in self.mappings_cfg.items():
+            tensor_name = tensor_names.get(tensor)
+            if tensor_name is None:
+                continue
+            for key in keys:
+                mapping[key] = (tensor, tensor_name)
+        for bid in range(n_blocks):
+            for tensor, keys in self.block_mappings_cfg.items():
+                tensor_name = tensor_names.get(tensor)
+                if tensor_name is None:
+                    continue
+                tensor_name = tensor_name.format(bid = bid)
+                for key in keys:
+                    key = key.format(bid = bid)
+                    mapping[key] = (tensor, tensor_name)
+
+    def get_type_and_name(self, key: str, try_suffixes: Sequence[str]) -> Optional[Tuple[MODEL_TENSOR, str]]:
+        result = self.mapping.get(key)
+        if result is not None:
+            return result
+        for suffix in try_suffixes:
+            if key.endswith(suffix):
+                result = self.mapping.get(key[:-len(suffix)])
+                if result is not None:
+                    return (result[0], result[1] + suffix)
+        return None
+
+    def get_name(self, key: str, try_suffixes: Sequence[str]) -> Optional[str]:
+        result = self.get_type_and_name(key, try_suffixes = try_suffixes)
+        if result is None:
+            return None
+        return result[1]
+
+    def get_type(self, key: str, try_suffixes: Sequence[str]) -> Optional[MODEL_TENSOR]:
+        result = self.get_type_and_name(key, try_suffixes = try_suffixes)
+        if result is None:
+            return None
+        return result[0]
+
+    def __getitem__(self, key: str) -> str:
+        try:
+            return self.mapping[key][1] # mapping values are (MODEL_TENSOR, name); return the name
+        except KeyError:
+            raise KeyError(key) from None # drop the chained inner KeyError from the traceback
+
+    def __contains__(self, key: str) -> bool:
+        return key in self.mapping
+
+    def __repr__(self) -> str:
+        return repr(self.mapping)
+
+def get_tensor_name_map(arch: MODEL_ARCH, n_blocks: int) -> TensorNameMap:
+    return TensorNameMap(arch, n_blocks)

 class TokenType(IntEnum):
    NORMAL       = 1
@@ -388,15 +423,21 @@ class GGUFValueType(IntEnum):


 class GGUFWriter:
-    def __init__(self, path: str, arch: str, use_temp_file = True):
+    fout: BufferedWriter
+    arch: str
+    offset_tensor = 0
+    data_alignment = GGUF_DEFAULT_ALIGNMENT
+    kv_data = b""
+    kv_data_count = 0
+    ti_data = b""
+    ti_data_count = 0
+    use_temp_file: bool
+    temp_file: Optional[tempfile.SpooledTemporaryFile[bytes]] = None
+    tensors: List[Tuple[np.ndarray[Any, Any], int]]
+
+    def __init__(self, path: Union[os.PathLike[str], str], arch: str, use_temp_file = True):
        self.fout = open(path, "wb")
        self.arch = arch
-        self.offset_tensor = 0
-        self.data_alignment = GGUF_DEFAULT_ALIGNMENT
-        self.kv_data = b""
-        self.kv_data_count = 0
-        self.ti_data = b""
-        self.ti_data_count = 0
        self.add_architecture()
        self.use_temp_file = use_temp_file
        self.tensors = []
@@ -470,14 +511,27 @@ class GGUFWriter:
        self.add_key(key)
        self.add_val(val, GGUFValueType.STRING)

-    def add_array(self, key: str, val: list):
-        if not isinstance(val, list):
-            raise ValueError("Value must be a list for array type")
+    def add_array(self, key: str, val: Sequence[Any]):
+        if not isinstance(val, Sequence):
+            raise ValueError("Value must be a sequence for array type")

        self.add_key(key)
        self.add_val(val, GGUFValueType.ARRAY)

-    def add_val(self: str, val: Any, vtype: GGUFValueType = None, add_vtype: bool = True):
+    _simple_value_packing = {
+        GGUFValueType.UINT8:   "<B",
+        GGUFValueType.INT8:    "<b",
+        GGUFValueType.UINT16:  "<H",
+        GGUFValueType.INT16:   "<h",
+        GGUFValueType.UINT32:  "<I",
+        GGUFValueType.INT32:   "<i",
+        GGUFValueType.FLOAT32: "<f",
+        GGUFValueType.UINT64:  "<Q",
+        GGUFValueType.INT64:   "<q",
+        GGUFValueType.FLOAT64: "<d",
+        GGUFValueType.BOOL:    "?" ,
+    }
+    def add_val(self, val: Any, vtype: Optional[GGUFValueType] = None, add_vtype: bool = True):
        if vtype is None:
            vtype = GGUFValueType.get_type(val)

@@ -485,47 +539,29 @@ class GGUFWriter:
            self.kv_data += struct.pack("<I", vtype)
            self.kv_data_count += 1

-        if vtype == GGUFValueType.UINT8:
-            self.kv_data += struct.pack("<B", val)
-        elif vtype == GGUFValueType.INT8:
-            self.kv_data += struct.pack("<b", val)
-        elif vtype == GGUFValueType.UINT16:
-            self.kv_data += struct.pack("<H", val)
-        elif vtype == GGUFValueType.INT16:
-            self.kv_data += struct.pack("<h", val)
-        elif vtype == GGUFValueType.UINT32:
-            self.kv_data += struct.pack("<I", val)
-        elif vtype == GGUFValueType.INT32:
-            self.kv_data += struct.pack("<i", val)
-        elif vtype == GGUFValueType.FLOAT32:
-            self.kv_data += struct.pack("<f", val)
-        elif vtype == GGUFValueType.UINT64:
-            self.kv_data += struct.pack("<Q", val)
-        elif vtype == GGUFValueType.INT64:
-            self.kv_data += struct.pack("<q", val)
-        elif vtype == GGUFValueType.FLOAT64:
-            self.kv_data += struct.pack("<d", val)
-        elif vtype == GGUFValueType.BOOL:
-            self.kv_data += struct.pack("?", val)
+        pack_fmt = self._simple_value_packing.get(vtype)
+        if pack_fmt is not None:
+            self.kv_data += struct.pack(pack_fmt, val)
        elif vtype == GGUFValueType.STRING:
            encoded_val = val.encode("utf8") if isinstance(val, str) else val
            self.kv_data += struct.pack("<Q", len(encoded_val))
            self.kv_data += encoded_val
-        elif vtype == GGUFValueType.ARRAY:
-            ltype = set([GGUFValueType.get_type(item) for item in val])
-            assert len(ltype) == 1, "All items in a GGUF array should be of the same type"
-            self.kv_data += struct.pack("<I", list(ltype)[0])
+        elif vtype == GGUFValueType.ARRAY and isinstance(val, Sequence) and len(val) > 0:
+            ltype = GGUFValueType.get_type(val[0])
+            if not all(GGUFValueType.get_type(i) is ltype for i in val[1:]):
+                raise ValueError("All items in a GGUF array should be of the same type")
+            self.kv_data += struct.pack("<I", ltype)
            self.kv_data += struct.pack("<Q", len(val))
            for item in val:
                self.add_val(item, add_vtype=False)
        else:
-            raise ValueError("Invalid GGUF metadata value type")
+            raise ValueError("Invalid GGUF metadata value type or value")

    @staticmethod
    def ggml_pad(x: int, n: int) -> int:
        return ((x + n - 1) // n) * n

-    def add_tensor_info(self, name: str, tensor_shape: np.ndarray, tensor_dtype: np.dtype, tensor_nbytes: int, raw_dtype: Optional[GGMLQuantizationType] = None):
+    def add_tensor_info(self, name: str, tensor_shape: Sequence[int], tensor_dtype: Union[np.dtype[np.float16], np.dtype[np.float32]], tensor_nbytes: int, raw_dtype: Optional[GGMLQuantizationType] = None):
        assert raw_dtype is not None or tensor_dtype in (np.float32, np.float16), "Only F32 and F16 tensors are supported for now"

        encoded_name = name.encode("utf8")
@@ -544,16 +580,18 @@ class GGUFWriter:
        self.offset_tensor += GGUFWriter.ggml_pad(tensor_nbytes, self.data_alignment)
        self.ti_data_count += 1

-    def add_tensor(self, name: str, tensor: np.ndarray, raw_shape: Optional[np.ndarray] = None, raw_dtype: Optional[GGMLQuantizationType] = None):
-        if self.use_temp_file and not hasattr(self, "temp_file"):
-            self.temp_file = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256*1024*1024)
-            self.temp_file.seek(0)
+    def add_tensor(self, name: str, tensor: np.ndarray[Any, Any], raw_shape: Optional[Sequence[int]] = None, raw_dtype: Optional[GGMLQuantizationType] = None):
+        if self.use_temp_file and self.temp_file is None:
+            fp = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256*1024*1024)
+            fp.seek(0)
+            self.temp_file = fp

-        self.add_tensor_info(name, raw_shape if raw_shape is not None else tensor.shape, tensor.dtype, tensor.nbytes, raw_dtype = raw_dtype)
+        shape: Sequence[int] = raw_shape if raw_shape is not None else tensor.shape
+        self.add_tensor_info(name, shape, tensor.dtype, tensor.nbytes, raw_dtype = raw_dtype)

        pad = GGUFWriter.ggml_pad(tensor.nbytes, self.data_alignment) - tensor.nbytes

-        if not self.use_temp_file:
+        if self.temp_file is None:
            self.tensors.append((tensor, pad))
            return

@@ -562,25 +600,22 @@ class GGUFWriter:
        if pad != 0:
            self.temp_file.write(bytes([0] * pad))

-    def write_tensor_data(self, tensor: np.ndarray):
-        pad = GGUFWriter.ggml_pad(self.fout.tell(), self.data_alignment) - self.fout.tell()
+    def write_padding(self, fp: BinaryIO, n: int, align: Optional[int] = None):
+        pad = GGUFWriter.ggml_pad(n, align if align is not None else self.data_alignment) - n
        if pad != 0:
-            self.fout.write(bytes([0] * pad))
+            fp.write(bytes([0] * pad))

+    def write_tensor_data(self, tensor: np.ndarray[Any, Any]):
+        self.write_padding(self.fout, self.fout.tell())
        tensor.tofile(self.fout)
-
-        pad = GGUFWriter.ggml_pad(tensor.nbytes, self.data_alignment) - tensor.nbytes
-        if pad != 0:
-            self.fout.write(bytes([0] * pad))
+        self.write_padding(self.fout, tensor.nbytes)

    def write_tensors_to_file(self):
        self.write_ti_data_to_file()

-        pad = GGUFWriter.ggml_pad(self.fout.tell(), self.data_alignment) - self.fout.tell()
-        if pad != 0:
-            self.fout.write(bytes([0] * pad))
+        self.write_padding(self.fout, self.fout.tell())

-        if not self.use_temp_file:
+        if self.temp_file is None:
            for (currtensor, currpad) in self.tensors:
                currtensor.tofile(self.fout)
                if currpad != 0:
@@ -654,10 +689,6 @@ class GGUFWriter:
        self.add_bool(
            KEY_USE_PARALLEL_RESIDUAL.format(arch=self.arch), use)

-    def add_tensor_data_layout(self, layout: str):
-        self.add_string(
-            KEY_TENSOR_DATA_LAYOUT.format(arch=self.arch), layout)
-
    def add_head_count(self, count: int):
        self.add_uint32(
            KEY_ATTENTION_HEAD_COUNT.format(arch=self.arch), count)
@@ -695,16 +726,16 @@ class GGUFWriter:
    def add_tokenizer_model(self, model: str):
        self.add_string(KEY_TOKENIZER_MODEL, model)

-    def add_token_list(self, tokens: List):
+    def add_token_list(self, tokens: Union[Sequence[str], Sequence[bytes], Sequence[bytearray]]):
        self.add_array(KEY_TOKENIZER_LIST, tokens)

-    def add_token_merges(self, merges: List):
+    def add_token_merges(self, merges: Union[Sequence[str], Sequence[bytes], Sequence[bytearray]]):
        self.add_array(KEY_TOKENIZER_MERGES, merges)

-    def add_token_types(self, types: List[int]):
+    def add_token_types(self, types: Union[Sequence[TokenType], Sequence[int]]):
        self.add_array(KEY_TOKENIZER_TOKEN_TYPE, types)

-    def add_token_scores(self, scores: List[float]):
+    def add_token_scores(self, scores: Sequence[float]):
        self.add_array(KEY_TOKENIZER_SCORES, scores)

    def add_bos_token_id(self, id: int):
@@ -723,6 +754,113 @@ class GGUFWriter:
        self.add_uint32(KEY_TOKENIZER_PAD_ID, id)


+class SpecialVocab:
+    """Special token ids and (optionally) BPE merges read from a model directory.
+
+    `tokenizer.json` plus `tokenizer_config.json` are consulted first; `config.json`
+    is only read when `tokenizer.json` does not exist.
+    """
+    load_merges: bool = False
+    merges: List[str] = []
+    special_token_types: Tuple[str, ...] = tuple(('bos', 'eos', 'unk', 'sep', 'pad'))
+    special_token_ids: Dict[str, int] = {}
+
+    def __init__(self, path: Path, load_merges: bool = False, special_token_types: Optional[Tuple[str, ...]] = None):
+        """Load special vocabulary data from the directory `path`.
+
+        load_merges: also read the merges list from `tokenizer.json`.
+        special_token_types: token type names to look up; defaults to the class-level tuple.
+        """
+        # The class-level `merges` / `special_token_ids` defaults are shared mutable objects,
+        # so every instance gets containers of its own.
+        self.special_token_ids = {}
+        self.merges = []
+        self.load_merges = load_merges
+        if special_token_types is not None:
+            self.special_token_types = special_token_types
+        self.load(path)
+
+    def load(self, path: Path):
+        """Populate this object from `path`, preferring `tokenizer.json` over `config.json`."""
+        if not self.try_load_from_tokenizer_json(path):
+            self.try_load_from_config_json(path)
+
+    def try_load_from_tokenizer_json(self, path: Path) -> bool:
+        """Read merges and special token ids from `tokenizer.json` / `tokenizer_config.json`.
+
+        Returns False only when `tokenizer.json` is missing; True otherwise, even if no
+        special token id could be resolved.
+        """
+        tokenizer_file = path / 'tokenizer.json'
+        if not tokenizer_file.is_file():
+            return False
+        with open(tokenizer_file, 'r', encoding = 'utf-8') as f:
+            tokenizer = json.load(f)
+        if self.load_merges:
+            merges = tokenizer.get('model', {}).get('merges')
+            if isinstance(merges, list) and len(merges) > 0 and isinstance(merges[0], str):
+                self.merges = merges
+        tokenizer_config_file = path / 'tokenizer_config.json'
+        added_tokens = tokenizer.get('added_tokens')
+        # A missing or malformed (non-list) `added_tokens` means there is nothing to map ids from.
+        if not isinstance(added_tokens, list) or not tokenizer_config_file.is_file():
+            return True
+        with open(tokenizer_config_file, 'r', encoding = 'utf-8') as f:
+            tokenizer_config = json.load(f)
+        for typ in self.special_token_types:
+            entry = tokenizer_config.get(f'{typ}_token')
+            # The token may be given as a plain string or as a dict with a 'content' field.
+            if isinstance(entry, str):
+                tc_content = entry
+            elif isinstance(entry, dict):
+                entry_content = entry.get('content')
+                if not isinstance(entry_content, str):
+                    continue
+                tc_content = entry_content
+            else:
+                continue
+            # Only the first added token with matching content is considered.
+            first_match = next(
+                (atok for atok in added_tokens if isinstance(atok, dict) and atok.get('content') == tc_content),
+                None)
+            if first_match is not None and isinstance(first_match.get('id'), int):
+                self.special_token_ids[typ] = first_match['id']
+        return True
+
+    def try_load_from_config_json(self, path: Path) -> bool:
+        """Read `<type>_token_id` entries from `config.json`; returns False if the file is missing."""
+        config_file = path / 'config.json'
+        if not config_file.is_file():
+            return False
+        with open(config_file, 'r', encoding = 'utf-8') as f:
+            config = json.load(f)
+        for typ in self.special_token_types:
+            maybe_token_id = config.get(f'{typ}_token_id')
+            if isinstance(maybe_token_id, int):
+                self.special_token_ids[typ] = maybe_token_id
+        return True
+
+    def add_to_gguf(self, gw: GGUFWriter):
+        """Write the loaded merges and special token ids to `gw`.
+
+        Each special token type is dispatched to `gw.add_<type>_token_id`; types without
+        such a method are reported and skipped.
+        """
+        if len(self.merges) > 0:
+            print(f'gguf: Adding {len(self.merges)} merge(s).')
+            gw.add_token_merges(self.merges)
+        for typ, tokid in self.special_token_ids.items():
+            handler: Optional[Callable[[int], None]] = getattr(gw, f'add_{typ}_token_id', None)
+            if handler is None:
+                print(f'gguf: WARNING: No handler for special token type {typ} with id {tokid} - skipping')
+                continue
+            print(f'gguf: Setting special token type {typ} to {tokid}')
+            handler(tokid)
+
+    def __repr__(self):
+        return f'<SpecialVocab with {len(self.merges)} merges and special tokens {self.special_token_ids if self.special_token_ids else "unset"}>'
+
+
 # Example usage:
 if __name__ == "__main__":
    # Example usage with a file
--- a/gguf-py/gguf/py.typed
+++ b/gguf-py/gguf/py.typed
--- a/gguf-py/pyproject.toml
+++ b/gguf-py/pyproject.toml
@@ -5,6 +5,7 @@ description = "Write ML models in GGUF for GGML"
 authors = ["GGML <ggml@ggml.ai>"]
 packages = [
    {include = "gguf"},
+    {include = "gguf/py.typed"},
 ]
 readme = "README.md"
 homepage = "https://ggml.ai"
--- a/llama.cpp
+++ b/llama.cpp
@@ -901,6 +901,11 @@ struct llama_layer {
    struct ggml_tensor * wo;
    struct ggml_tensor * wqkv;

+    struct ggml_tensor * wq_a;
+    struct ggml_tensor * wk_a;
+    struct ggml_tensor * wv_a;
+    struct ggml_tensor * wo_a;
+
    // normalization
    struct ggml_tensor * ffn_norm;

@@ -908,6 +913,10 @@ struct llama_layer {
    struct ggml_tensor * w1; // ffn_gate
    struct ggml_tensor * w2; // ffn_down
    struct ggml_tensor * w3; // ffn_up
+
+    struct ggml_tensor * w1_a;
+    struct ggml_tensor * w2_a;
+    struct ggml_tensor * w3_a;
 };

 struct llama_kv_cache {
@@ -1927,17 +1936,29 @@ static void llm_load_tensors(
                        layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, backend_split);
                        layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd},     backend_split);

+                        layer.wq_a = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q,   "weight.a", i), {n_embd},     backend);
+                        layer.wk_a = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K,   "weight.a", i), {n_embd_gqa}, backend);
+                        layer.wv_a = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V,   "weight.a", i), {n_embd_gqa}, backend);
+                        layer.wo_a = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight.a", i), {n_embd},     backend);
+
                        layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);

                        layer.w1 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd,   n_ff}, backend_split);
                        layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, backend_split);
                        layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, backend_split);

+                        layer.w1_a = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight.a", i), {  n_ff}, backend);
+                        layer.w2_a = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight.a", i), {n_embd}, backend);
+                        layer.w3_a = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP,   "weight.a", i), {  n_ff}, backend);
+
                        if (backend == GGML_BACKEND_GPU) {
                            vram_weights +=
-                                ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk)       +
-                                ggml_nbytes(layer.wv)        + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
-                                ggml_nbytes(layer.w1)        + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
+                                ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq)   + ggml_nbytes(layer.wk)       +
+                                ggml_nbytes(layer.wv)        + ggml_nbytes(layer.wo)   + ggml_nbytes(layer.ffn_norm) +
+                                ggml_nbytes(layer.w1)        + ggml_nbytes(layer.w2)   + ggml_nbytes(layer.w3)       +
+                                ggml_nbytes(layer.wq_a)      + ggml_nbytes(layer.wk_a) + ggml_nbytes(layer.wv_a)     +
+                                ggml_nbytes(layer.wo_a)      + ggml_nbytes(layer.w1_a) + ggml_nbytes(layer.w2_a)     +
+                                ggml_nbytes(layer.w3_a);
                        }
                    }
                } break;
@@ -2159,6 +2180,34 @@ static bool llama_model_load(
    return true;
 }

+// computes: Z = (t @ cur) * a  (ggml_mul_mat followed by ggml_mul; offload_func is called on each of the two results)
+// a is a vector with size equal to the rows of t. each element is the scaling factor used to normalize t's rows
+// the ggml_mul() is broadcast row-wise to restore the normalization
+struct ggml_tensor * ggml_mul_mat_ex(
+        struct ggml_context * ctx0,
+        struct ggml_tensor * t,
+        struct ggml_tensor * a,
+        //struct ggml_tensor * b, // disabled: only used by the reference code after the return below
+        struct ggml_tensor * cur,
+        offload_func_t offload_func) {
+    cur = ggml_mul_mat(ctx0, t, cur);
+    offload_func(cur);
+
+    cur = ggml_mul(ctx0, cur, a);
+    offload_func(cur);
+
+    return cur;
+
+    // NOTE: everything below the return is disabled reference code for a variant that also takes
+    //       the tensor `b` (see the commented-out parameter above); it is never executed:
+    //struct ggml_tensor * tmp = ggml_mul_mat(ctx0, t, cur);
+    //tmp = ggml_mul(ctx0, tmp, a);
+    //cur = ggml_add(ctx0, tmp,
+    //        ggml_mul(ctx0, ggml_repeat(ctx0, ggml_sum_rows(ctx0, cur), tmp), b)
+    //        );
+    //return cur;
+}
+
 static struct ggml_cgraph * llm_build_llama(
         llama_context & lctx,
     const llama_token * tokens,
@@ -2292,12 +2341,10 @@ static struct ggml_cgraph * llm_build_llama(
        // self-attention
        {
            // compute Q and K and RoPE them
-            struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
-            offload_func_kq(tmpk);
+            struct ggml_tensor * tmpk = ggml_mul_mat_ex(ctx0, model.layers[il].wk, model.layers[il].wk_a, cur, offload_func_kq);
            ggml_set_name(tmpk, "tmpk");

-            struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
-            offload_func_kq(tmpq);
+            struct ggml_tensor * tmpq = ggml_mul_mat_ex(ctx0, model.layers[il].wq, model.layers[il].wq_a, cur, offload_func_kq);
            ggml_set_name(tmpq, "tmpq");

            struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
@@ -2312,8 +2359,7 @@ static struct ggml_cgraph * llm_build_llama(
            {
                // compute the transposed [N, n_embd] V matrix

-                struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
-                offload_func_v(tmpv);
+                struct ggml_tensor * tmpv = ggml_mul_mat_ex(ctx0, model.layers[il].wv, model.layers[il].wv_a, cur, offload_func_v);
                ggml_set_name(tmpv, "tmpv");

                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, N));
@@ -2404,10 +2450,7 @@ static struct ggml_cgraph * llm_build_llama(
            ggml_set_name(cur, "KQV_merged_contiguous");

            // projection (no bias)
-            cur = ggml_mul_mat(ctx0,
-                    model.layers[il].wo,
-                    cur);
-            offload_func(cur);
+            cur = ggml_mul_mat_ex(ctx0, model.layers[il].wo, model.layers[il].wo_a, cur, offload_func);
            ggml_set_name(cur, "result_wo");
        }

@@ -2429,16 +2472,10 @@ static struct ggml_cgraph * llm_build_llama(
                ggml_set_name(cur, "ffn_norm");
            }

-            struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
-                    model.layers[il].w3,
-                    cur);
-            offload_func(tmp);
+            struct ggml_tensor * tmp = ggml_mul_mat_ex(ctx0, model.layers[il].w3, model.layers[il].w3_a, cur, offload_func);
            ggml_set_name(tmp, "result_w3");

-            cur = ggml_mul_mat(ctx0,
-                    model.layers[il].w1,
-                    cur);
-            offload_func(cur);
+            cur = ggml_mul_mat_ex(ctx0, model.layers[il].w1, model.layers[il].w1_a, cur, offload_func);
            ggml_set_name(cur, "result_w1");

            // SILU activation
@@ -2450,10 +2487,7 @@ static struct ggml_cgraph * llm_build_llama(
            offload_func(cur);
            ggml_set_name(cur, "silu_x_result_w3");

-            cur = ggml_mul_mat(ctx0,
-                    model.layers[il].w2,
-                    cur);
-            offload_func(cur);
+            cur = ggml_mul_mat_ex(ctx0, model.layers[il].w2, model.layers[il].w2_a, cur, offload_func);
            ggml_set_name(cur, "result_w2");
        }

@@ -4731,6 +4765,26 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
    // populate the original tensors so we get an initial meta data
    for (int i = 0; i < ml->n_tensors; ++i) {
        struct ggml_tensor * meta = ml->get_tensor_meta(i);
+
+        // write the tensor info for the extra row normalization factors ("<name>.a")
+        {
+            struct ggml_tensor meta_a = *meta;
+            // meta_a starts as a copy of the weight's metadata and is turned into a 1-D F32 tensor below
+            const auto tn = LLM_TN(ml->get_arch());
+            // NOTE(review): the ".a" data is only written for Q4_0/Q4_1/Q5_1 targets later in this function - confirm no other type reaches here
+            std::string name = ggml_get_name(&meta_a);
+            // only 2-D tensors other than the output and token embedding weights get a "<name>.a" entry
+            if (meta->n_dims == 2 && name != tn(LLM_TENSOR_OUTPUT, "weight") && name != tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
+                meta_a.ne[0] = meta_a.ne[1];
+                meta_a.n_dims = 1;
+                meta_a.type = GGML_TYPE_F32;
+                ggml_set_name(&meta_a, (name + ".a").c_str());
+                gguf_add_tensor(ctx_out, &meta_a);
+
+                LLAMA_LOG_INFO("%s: added tensor %s\n", __func__, ggml_get_name(&meta_a));
+            }
+        }
+
        gguf_add_tensor(ctx_out, meta);
    }

@@ -4781,7 +4835,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
            // TODO: avoid hardcoded tensor names - use the TN_* constants
            const auto tn = LLM_TN(ml->get_arch());

-            if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
+            if (name == tn(LLM_TENSOR_OUTPUT, "weight") || name == tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
                int nx = tensor->ne[0];
                if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
                    new_type = GGML_TYPE_Q8_0;
@@ -4889,8 +4943,42 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                f32_data = (float *) f32_conv_buf.data();
            }

+            // TODO: this is temporary since we only implemented Q4_0, Q4_1 and Q5_1 as PoC
+            if (new_type == GGML_TYPE_Q4_0 || new_type == GGML_TYPE_Q4_1 || new_type == GGML_TYPE_Q5_1) {
+                // divide every row of the f32 data by its max absolute value; the nr factors become the "<name>.a" tensor
+                const uint32_t nr = tensor->ne[1];
+                const uint32_t n  = tensor->ne[0];
+                std::vector<float> va(nr);
+
+                // normalize to -1..1 per rows
+                for (uint32_t r = 0; r < nr; ++r) {
+                    // widen before multiplying so the row offset cannot wrap around in 32 bits
+                    float * p = f32_data + (size_t) r * n;
+                    float amax = 0.0f;
+                    for (size_t i = 0; i < n; ++i) {
+                        amax = std::max(amax, std::abs(p[i]));
+                    }
+
+                    // an all-zero row has amax == 0: leave it untouched, 0/0 would turn it into NaNs
+                    if (amax > 0.0f) {
+                        for (size_t i = 0; i < n; ++i) {
+                            p[i] = p[i] / amax;
+                        }
+                    }
+
+                    va[r] = amax;
+                }
+
+                new_data = (uint8_t *) va.data();
+                new_size = nr * sizeof(float);
+                gguf_set_tensor_data(ctx_out, (name + ".a").c_str(), new_data, new_size);
+
+                // write tensor data + padding
+                fout.write((const char *) new_data, new_size);
+                zeros(fout, GGML_PAD(new_size, align) - new_size);
+            }
+
            LLAMA_LOG_INFO("quantizing to %s .. ", ggml_type_name(new_type));
-            fflush(stdout);

            work.resize(nelements * 4); // upper bound on size
            new_data = work.data();
Author	SHA1	Message	Date
Georgi Gerganov	b4e70822f6	metal : add poc for normalized Q4_0 and Q4_1	2023-08-30 18:47:16 +03:00
Georgi Gerganov	9ffe54ed10	Merge branch 'master' into norm-quants	2023-08-30 17:50:58 +03:00
Georgi Gerganov	b532a69b2f	convert.py : use dir name to name the llama	2023-08-30 13:29:40 +03:00
Georgi Gerganov	c90d135eb4	examples : fix underscore in beam-search + .gitignore (close #2900 )	2023-08-30 12:53:24 +03:00
M. Yusuf Sarıgöz	0d1c706181	gguf : add workflow for Pypi publishing (#2896 ) * gguf : add workflow for Pypi publishing * gguf : add workflow for Pypi publishing * fix trailing whitespace	2023-08-30 12:47:40 +03:00
alonfaraj	9509294420	make : add test and update CI (#2897 ) * build ci: run make test * makefile: - add all - add test * enable tests/test-tokenizer-0-llama * fix path to model * remove gcc-8 from macos build test * Update Makefile * Update Makefile	2023-08-30 12:42:51 +03:00
Gilad S	35092fb547	docs : add `node-llama-cpp` to `README.md` (#2885 )	2023-08-30 11:40:12 +03:00
Kerfuffle	dc07dc492e	convert : various script cleanups/fixes + merges and special token handling (#2842 ) * convert: Fix permute calls and method/func definitions * Cleanups for gguf-py * Minor types cleanups. * Initial implementation of handling merges and special tokens * convert: Handle special tokens and merges in vocab only mode convert: Vocab only mode no longer requires loading model tensors * gguf: Refactor tensor name mapping * convert: Fix type hint for special_token_types in SpecialVocab * Use common special vocab handling in various conversion scripts * First pass at implementing suggested changes * Second pass * gguf: SpecialVocab: Fix issue with special token content not in a dict gguf: SpecialVocab: Allow skipping handling of merges * convert-falcon-hf-to-gguf: Support --vocab-only option, bail out if no tokenizer.json * convert-gptneox-hf-to-gguf and convert: Only handle merges for BPE tokenizer * gguf: SpecialVocab: Actually set load_merges in object * Uniform args parsing and vocab only mode for convert examples * convert.py: Set gpt2 as tokenizer model when using BPE * Squish last type warning in gguf.py - yay!	2023-08-30 11:25:50 +03:00
chaihahaha	ad9ddcff6e	llm.vim : stop generation at multiple linebreaks, bind to <F2> (#2879 )	2023-08-30 09:50:55 +03:00
Iwan Kawrakow	dead8f4b5b	Fix misaligned memory access in Q4_1 kernel	2023-07-28 17:27:01 +03:00
Iwan Kawrakow	72af25998c	Fix misaligned memory access in Q4_1 kernel	2023-07-28 17:13:22 +03:00
Georgi Gerganov	e5d23f2e7e	ggml : fix ARM build + speed-up ggml_mul	2023-07-28 16:31:59 +03:00
Georgi Gerganov	a4d1eb72c6	ggml : add q4_1 normalized quants	2023-07-28 15:20:34 +03:00
Georgi Gerganov	675425563c	ggml : poc for normalizing weights for better quantization	2023-07-28 10:44:38 +03:00