Mirror of https://github.com/ggerganov/llama.cpp.git (synced 2026-02-26 14:23:22 +02:00)

Compare commits: 34 commits, master-8cd...master-69b
| SHA1 |
|---|
| 69b740289f |
| f266259ad9 |
| 47f61aaa5f |
| 3173a62eb9 |
| 489537e6cf |
| 2d3481c721 |
| 74f5899df4 |
| 2f7c8e014e |
| 0ad964631f |
| e95b6554b4 |
| aa485cee33 |
| c12b14b77f |
| 106faaf297 |
| c85e03d12e |
| 489093548c |
| 93265e988a |
| c56b715269 |
| f4d277ae17 |
| c9a59b70a5 |
| a32f7acc9f |
| 43ffdefb74 |
| 1623a6e9b4 |
| c14e0d2f23 |
| 723dac55fa |
| 0f07cacb05 |
| c5d70f5c9e |
| be87b6ed20 |
| 0e07e6a839 |
| a3a2a0eda8 |
| d990e3fffc |
| 9190e8eac8 |
| c85980acd0 |
| 6232f2d7fd |
| 6c248707f5 |
Dockerfile

@@ -5,9 +5,10 @@ FROM ubuntu:$UBUNTU_VERSION as build

RUN apt-get update && \
    apt-get install -y build-essential python3 python3-pip

COPY requirements.txt requirements.txt

RUN pip install --upgrade pip setuptools wheel \
    && pip install numpy requests sentencepiece tqdm \
    && pip install torch --index-url https://download.pytorch.org/whl/cpu
    && pip install -r requirements.txt

WORKDIR /app
.gitignore (vendored, 1 change)

@@ -23,6 +23,7 @@ models/*

/result
/perplexity
/embedding
/benchmark-q4_0-matmult
/Pipfile

arm_neon.h
CMakeLists.txt

@@ -55,6 +55,8 @@ option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer"

option(LLAMA_AVX          "llama: enable AVX"         ON)
option(LLAMA_AVX2         "llama: enable AVX2"        ON)
option(LLAMA_AVX512       "llama: enable AVX512"      OFF)
option(LLAMA_AVX512_VBMI  "llama: enable AVX512-VBMI" OFF)
option(LLAMA_AVX512_VNNI  "llama: enable AVX512-VNNI" OFF)
option(LLAMA_FMA          "llama: enable FMA"         ON)
# in MSVC F16C is implied with AVX2/AVX512
if (NOT MSVC)

@@ -120,6 +122,21 @@ if (LLAMA_OPENBLAS)
        add_compile_definitions(GGML_USE_OPENBLAS)
        add_link_options(${BLAS_LIBRARIES})
        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} openblas)

        # find header file
        set(OPENBLAS_INCLUDE_SEARCH_PATHS
            /usr/include
            /usr/include/openblas
            /usr/include/openblas-base
            /usr/local/include
            /usr/local/include/openblas
            /usr/local/include/openblas-base
            /opt/OpenBLAS/include
            $ENV{OpenBLAS_HOME}
            $ENV{OpenBLAS_HOME}/include
        )
        find_path(OPENBLAS_INC NAMES cblas.h PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS})
        add_compile_options(-I${OPENBLAS_INC})
    else()
        message(WARNING "OpenBLAS not found")
    endif()

@@ -205,6 +222,16 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$")
    if (MSVC)
        if (LLAMA_AVX512)
            add_compile_options(/arch:AVX512)
            # MSVC has no compile-time flags enabling specific
            # AVX512 extensions, nor does it define the
            # macros corresponding to the extensions.
            # Do it manually.
            if (LLAMA_AVX512_VBMI)
                add_compile_definitions(__AVX512VBMI__)
            endif()
            if (LLAMA_AVX512_VNNI)
                add_compile_definitions(__AVX512VNNI__)
            endif()
        elseif (LLAMA_AVX2)
            add_compile_options(/arch:AVX2)
        elseif (LLAMA_AVX)

@@ -225,9 +252,13 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$")
        endif()
        if (LLAMA_AVX512)
            add_compile_options(-mavx512f)
            # add_compile_options(-mavx512cd)
            # add_compile_options(-mavx512dq)
            # add_compile_options(-mavx512bw)
            add_compile_options(-mavx512bw)
        endif()
        if (LLAMA_AVX512_VBMI)
            add_compile_options(-mavx512vbmi)
        endif()
        if (LLAMA_AVX512_VNNI)
            add_compile_options(-mavx512vnni)
        endif()
    endif()
else()

@@ -253,7 +284,6 @@ endif()
add_library(llama
            llama.cpp
            llama.h
            llama_internal.h
            llama_util.h)

target_include_directories(llama PUBLIC .)
Makefile (26 changes)

@@ -133,51 +133,51 @@ $(info I CC:       $(CCV))
$(info I CXX:      $(CXXV))
$(info )

default: main quantize perplexity embedding
default: main quantize quantize-stats perplexity embedding

#
# Build library
#

ggml.o: ggml.c ggml.h
	$(CC)  $(CFLAGS)   -c ggml.c -o ggml.o
	$(CC)  $(CFLAGS)   -c $< -o $@

llama.o: llama.cpp llama.h llama_util.h llama_internal.h
	$(CXX) $(CXXFLAGS) -c llama.cpp -o llama.o
llama.o: llama.cpp ggml.h llama.h llama_util.h
	$(CXX) $(CXXFLAGS) -c $< -o $@

common.o: examples/common.cpp examples/common.h
	$(CXX) $(CXXFLAGS) -c examples/common.cpp -o common.o
	$(CXX) $(CXXFLAGS) -c $< -o $@

clean:
	rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-q4_0-matmult

main: examples/main/main.cpp ggml.o llama.o common.o
	$(CXX) $(CXXFLAGS) examples/main/main.cpp ggml.o llama.o common.o -o main $(LDFLAGS)
	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
	@echo
	@echo '====  Run ./main -h for help.  ===='
	@echo

quantize: examples/quantize/quantize.cpp ggml.o llama.o
	$(CXX) $(CXXFLAGS) examples/quantize/quantize.cpp ggml.o llama.o -o quantize $(LDFLAGS)
	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)

quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o
	$(CXX) $(CXXFLAGS) examples/quantize-stats/quantize-stats.cpp ggml.o llama.o -o quantize-stats $(LDFLAGS)
	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)

perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o
	$(CXX) $(CXXFLAGS) examples/perplexity/perplexity.cpp ggml.o llama.o common.o -o perplexity $(LDFLAGS)
	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)

embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o
	$(CXX) $(CXXFLAGS) examples/embedding/embedding.cpp ggml.o llama.o common.o -o embedding $(LDFLAGS)
	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)

libllama.so: llama.o ggml.o
	$(CXX) $(CXXFLAGS) -shared -fPIC -o libllama.so llama.o ggml.o $(LDFLAGS)
	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)

#
# Tests
#

benchmark: ggml.o
	$(CXX) $(CXXFLAGS) examples/benchmark/benchmark-q4_0-matmult.c ggml.o -o benchmark-q4_0-matmult $(LDFLAGS)
benchmark: examples/benchmark/benchmark-q4_0-matmult.c ggml.o
	$(CXX) $(CXXFLAGS) $^ -o benchmark-q4_0-matmult $(LDFLAGS)
	./benchmark-q4_0-matmult

.PHONY: tests
README.md

@@ -192,10 +192,10 @@ ls ./models
65B 30B 13B 7B tokenizer_checklist.chk tokenizer.model

# install Python dependencies
python3 -m pip install torch numpy sentencepiece
python3 -m pip install -r requirements.txt

# convert the 7B model to ggml FP16 format
python3 convert-pth-to-ggml.py models/7B/ 1
python3 convert.py models/7B/

# quantize the model to 4-bits (using method 2 = q4_0)
./quantize ./models/7B/ggml-model-f16.bin ./models/7B/ggml-model-q4_0.bin 2
convert-ggml-to-pth.py (deleted)

@@ -1,299 +0,0 @@

# Author: github.com/ductai199x
import argparse
import os
import struct

import numpy as np
import torch
from numba import njit
from tqdm.auto import tqdm


def read_header(fin):
    values = struct.unpack("i" * 9, fin.read(4 * 9))
    _, _, vocab_size, dim, multiple_of, n_heads, n_layers, rot, ftype = values
    return {
        "vocab_size": vocab_size,
        "dim": dim,
        "multiple_of": multiple_of,
        "n_heads": n_heads,
        "n_layers": n_layers,
    }, ftype


def read_tokens(fin, vocab_size):
    tokens = []
    for _ in range(vocab_size):
        text_len = struct.unpack("i", fin.read(4))[0]
        text_bytes = fin.read(text_len)
        try:
            text = text_bytes.decode()
        except UnicodeDecodeError:
            text = text_bytes.decode(errors="replace")
        score = struct.unpack("f", fin.read(4))[0]
        tokens.append((text, score))
    return tokens


@njit
def dequantize_weights_numba(fin_data, n_rows, n_cols):
    qk = 32
    nb = n_cols // qk
    bs = 4 + (qk // 2)

    weights = np.zeros((n_rows, n_cols), dtype=np.float32)
    data_pos = 0

    for row in range(n_rows):
        for block in range(nb):
            d = np.frombuffer(fin_data[data_pos : data_pos + 4], dtype=np.float32)[0]
            data_pos += 4
            packed_values = fin_data[data_pos : data_pos + (qk // 2)]
            data_pos += qk // 2

            for i in range(qk // 2):
                packed_value = packed_values[i]
                v0 = np.float32((packed_value & 0b00001111) - 8) * d
                v1 = np.float32((packed_value >> 4) - 8) * d

                weights[row, block * qk + 2 * i] = v0
                weights[row, block * qk + 2 * i + 1] = v1

    return weights


def dequantize_weights(fin, n_rows, n_cols):
    qk = 32
    nb = n_cols // qk
    data_size = n_rows * n_cols // 2 + n_rows * nb * 4
    fin_data = fin.read(data_size)
    return dequantize_weights_numba(fin_data, n_rows, n_cols)
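The Q4_0 layout unpacked by dequantize_weights_numba above (one float32 scale d, then 16 bytes holding 32 nibbles, 20 bytes per block) can be exercised in isolation. A minimal round-trip sketch, not part of the repository; the random input and the simple max/7 scale are illustrative assumptions:

import struct
import numpy as np

qk = 32  # values per Q4_0 block
x = np.random.randn(qk).astype(np.float32)

# quantize: scale so the largest magnitude maps to +/-7, then bias nibbles by 8
d = float(np.abs(x).max()) / 7.0
q = np.clip(np.round(x / d).astype(np.int32), -8, 7) + 8
packed = bytes((q[2 * i] | (q[2 * i + 1] << 4)) for i in range(qk // 2))
block = struct.pack("f", d) + packed  # 4 + 16 bytes, matching bs = 4 + qk // 2

# dequantize: mirrors the inner loop of dequantize_weights_numba
d2 = struct.unpack("f", block[:4])[0]
out = np.empty(qk, dtype=np.float32)
for i, b in enumerate(block[4:]):
    out[2 * i] = ((b & 0x0F) - 8) * d2
    out[2 * i + 1] = ((b >> 4) - 8) * d2

assert np.max(np.abs(out - x)) <= d2 / 2 + 1e-6  # error bounded by half a quantization step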
def read_variables(fin):
    model = {}
    pbar = tqdm(total=os.path.getsize(fin.name), unit="B", unit_scale=True, desc="Reading variables")
    while True:
        start_pos = fin.tell()
        try:
            n_dims, name_length, ftype_cur = struct.unpack("iii", fin.read(4 * 3))
        except struct.error:
            break

        shape = tuple(struct.unpack("i" * n_dims, fin.read(4 * n_dims)))
        shape = shape[::-1]
        name = fin.read(name_length).decode()

        # ensure tensor data is aligned
        tensor_data_offset = fin.tell()
        tensor_data_offset = (tensor_data_offset + 31) & -32
        fin.seek(tensor_data_offset)

        if ftype_cur == 2:
            # 4-bit quantized weights
            dtype = np.uint8
            data = dequantize_weights(fin, shape[0], shape[1])
            data = data.reshape(shape)
        elif ftype_cur == 0:
            dtype = np.float32
            data_size = np.prod(shape)
            data = np.fromfile(fin, dtype=dtype, count=data_size).reshape(shape)
        elif ftype_cur == 1:
            dtype = np.float16
            data_size = np.prod(shape)
            data = np.fromfile(fin, dtype=dtype, count=data_size).reshape(shape)

        model[name] = torch.tensor(data, dtype=torch.float32 if dtype == np.float32 else torch.float16)

        pbar.update(fin.tell() - start_pos)

    return model


def convert_to_hf_format(model, hparams):
    # This works for llama 7B, need to test with other models
    n_layers = hparams["n_layers"]
    n_heads = hparams["n_heads"]
    dim = hparams["dim"]
    dims_per_head = dim // n_heads
    base = 10000.0
    inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head))

    # permute for sliced rotary
    def permute(w):
        return w.view(n_heads, dim // n_heads // 2, 2, dim).transpose(1, 2).reshape(dim, dim)

    state_dict = {}
    for layer_i in range(n_layers):
        state_dict.update(
            {
                f"model.layers.{layer_i}.self_attn.q_proj.weight": permute(
                    model[f"layers.{layer_i}.attention.wq.weight"]
                ),
                f"model.layers.{layer_i}.self_attn.k_proj.weight": permute(
                    model[f"layers.{layer_i}.attention.wk.weight"]
                ),
                f"model.layers.{layer_i}.self_attn.v_proj.weight": model[
                    f"layers.{layer_i}.attention.wv.weight"
                ],
                f"model.layers.{layer_i}.self_attn.o_proj.weight": model[
                    f"layers.{layer_i}.attention.wo.weight"
                ],
                f"model.layers.{layer_i}.mlp.gate_proj.weight": model[
                    f"layers.{layer_i}.feed_forward.w1.weight"
                ],
                f"model.layers.{layer_i}.mlp.down_proj.weight": model[
                    f"layers.{layer_i}.feed_forward.w2.weight"
                ],
                f"model.layers.{layer_i}.mlp.up_proj.weight": model[
                    f"layers.{layer_i}.feed_forward.w3.weight"
                ],
                f"model.layers.{layer_i}.input_layernorm.weight": model[
                    f"layers.{layer_i}.attention_norm.weight"
                ],
                f"model.layers.{layer_i}.post_attention_layernorm.weight": model[
                    f"layers.{layer_i}.ffn_norm.weight"
                ],
            }
        )
        state_dict[f"model.layers.{layer_i}.self_attn.rotary_emb.inv_freq"] = inv_freq
    state_dict.update(
        {
            "model.embed_tokens.weight": model["tok_embeddings.weight"],
            "model.norm.weight": model["norm.weight"],
            "lm_head.weight": model["output.weight"],
        }
    )

    return state_dict


def chat(model, hparams, llama_dir):
    from transformers import (GenerationConfig, LlamaForCausalLM,
                              LlamaTokenizer, StoppingCriteria,
                              StoppingCriteriaList)
    from transformers.models.llama.configuration_llama import LlamaConfig

    class StoppingCriteriaSub(StoppingCriteria):
        def __init__(self):
            super().__init__()

        def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, stops=[]):
            print(tokenizer.decode(input_ids[0]), end="", flush=True)
            if input_ids[0][-1] == 13:
                return True

            return False

    config = LlamaConfig(
        vocab_size=hparams["vocab_size"],
        dim=hparams["dim"],
        num_hidden_layers=hparams["n_layers"],
        num_attention_heads=hparams["n_heads"],
    )

    llama = LlamaForCausalLM(config=config)
    llama.load_state_dict(state_dict=model, strict=True)
    tokenizer = LlamaTokenizer.from_pretrained(llama_dir)

    device = torch.device("cpu")
    llama = llama.to(device)

    ctx = """You are AI.
This is a dialog, where User interacts with AI. AI is helpful, kind, obedient, honest, respectful, direct, concise, should try to protect User's privacy, and knows its own limits. Also, AI must answer User and AI cannot stop the conversation by itself.
User: Hello, AI.
AI: Hello! How can I assist you today?
"""
    print(ctx.rstrip("\n"))
    while True:
        print("-" * 60)
        prompt = input("User: ")
        if ctx != "":
            ctx = f"{ctx}User: {prompt}\n"
        else:
            ctx = f"{prompt}\nAI:"

        ctx = (ctx[-1920:]) if len(ctx) >= 2048 else ctx

        print("-" * 60)
        if len(ctx.strip()) > 0:
            input_ids = tokenizer(ctx, return_tensors="pt")["input_ids"].to(device)
            generation_config = GenerationConfig(
                temperature=0.8,
                top_p=0.95,
                top_k=50,
                repetition_penalty=1.1764,
            )
            with torch.no_grad():
                generation_output = llama.generate(
                    input_ids=input_ids,
                    generation_config=generation_config,
                    return_dict_in_generate=True,
                    output_scores=True,
                    max_length=2048,
                    do_sample=True,
                    stopping_criteria=StoppingCriteriaList([StoppingCriteriaSub()]),
                )
            s = generation_output.sequences[0]
            decoded = tokenizer.decode(s)
            ctx = f"{decoded}\n"


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--input_dir", "-i", type=str, required=True, help="The input directory containing the ggml files."
    )
    parser.add_argument(
        "--prefix",
        "-p",
        type=str,
        required=True,
        help="The prefix of the ggml files (ggml-model-f16 or ggml-model-q4_0).",
    )
    parser.add_argument(
        "--hf",
        action="store_true",
        help="Whether to save the model in the Hugging Face format. (default: False)",
    )
    parser.add_argument(
        "--chat", "-c", action="store_true", help="Whether to open a chat with the model. (default: False)"
    )
    args = parser.parse_args()

    llama_dir = os.path.abspath(f"{args.input_dir}/../")

    ggml_files = sorted(
        [f"{args.input_dir}/{f}" for f in os.listdir(args.input_dir) if f.startswith(args.prefix)]
    )

    fin = open(ggml_files[0], "rb")
    hparams, ftype = read_header(fin)
    tokens = read_tokens(fin, hparams["vocab_size"])
    model = read_variables(fin)

    for f in tqdm(ggml_files[1:]):
        fin = open(f, "rb")
        read_header(fin)
        read_tokens(fin, hparams["vocab_size"])
        model.update(read_variables(fin))

    if args.hf:
        model = convert_to_hf_format(model, hparams)

    pth_ckpt = {
        "state_dict": model,
        "hparams": hparams,
        "tokens": tokens,
    }

    torch.save(pth_ckpt, f"{args.input_dir}/{args.prefix}-to-torch.pth")

    if args.chat:
        if not args.hf:
            model = convert_to_hf_format(model, hparams)
        chat(model, hparams, llama_dir)


if __name__ == "__main__":
    main()
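A quick sanity check on the permute helper in convert_to_hf_format above: the view/transpose/reshape only reorders rows within each attention head, which is why it maps cleanly between the Meta and Hugging Face weight layouts. A standalone sketch with toy sizes, not from the repository:

import torch

n_heads, dim = 2, 8  # toy sizes for illustration
w = torch.arange(dim * dim, dtype=torch.float32).reshape(dim, dim)

permuted = w.view(n_heads, dim // n_heads // 2, 2, dim).transpose(1, 2).reshape(dim, dim)

# same set of rows, different order: a pure row permutation
assert {tuple(r.tolist()) for r in permuted} == {tuple(r.tolist()) for r in w}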
convert-gpt4all-to-ggml.py (deleted)

@@ -1,107 +0,0 @@

#!/usr/bin/env python3

#
# TODO: deduplicate GPT4All with convert-unversioned-ggml-to-ggml.py
#

# Original by https://github.com/eiz
# https://github.com/ggerganov/llama.cpp/issues/324#issuecomment-1476227818
import argparse
import glob
import os
import struct
import sys
from sentencepiece import SentencePieceProcessor

HPARAMS = keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]

def parse_args():
    parser = argparse.ArgumentParser(description='Upgrade a GPT4All model to the current format')
    parser.add_argument('gpt4all_model', help='path to gpt4all-lora-quantized.bin')
    parser.add_argument('tokenizer_model', help='path to LLaMA tokenizer.model file')
    return parser.parse_args()

def read_header(f_in):
    struct_fmt = "i" * (3 + len(HPARAMS))
    struct_size = struct.calcsize(struct_fmt)
    buf = f_in.read(struct_size)
    return struct.unpack(struct_fmt, buf)

def write_header(f_out, header):
    (magic, vocab_size, dim, multiple_of, n_heads, n_layers, rot, ftype) = header

    if magic != 0x67676d6c:
        raise Exception('Invalid file magic. Must be an old style ggml file.')

    values = [
        0x67676d66,  # magic: ggmf in hex
        1,  # file version
        vocab_size,
        dim,
        multiple_of,
        n_heads,
        n_layers,
        rot,
        ftype
    ]
    f_out.write(struct.pack("i" * len(values), *values))

def write_tokens(fout, tokenizer):
    for i in range(tokenizer.vocab_size()):
        if tokenizer.is_unknown(i):
            text = " \u2047 ".encode()
        elif tokenizer.is_control(i):
            text = b""
        elif tokenizer.is_byte(i):
            piece = tokenizer.id_to_piece(i)
            if len(piece) != 6:
                print(f"Invalid token: {piece}")
                sys.exit(1)
            byte_value = int(piece[3:-1], 16)
            text = struct.pack("B", byte_value)
        else:
            text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode()
        fout.write(struct.pack("i", len(text)))
        fout.write(text)
        fout.write(struct.pack("f", tokenizer.get_score(i)))

    # TODO: GPT4All - add extra <pad> token
    text = "<pad>".encode()
    fout.write(struct.pack("i", len(text)))
    fout.write(text)
    fout.write(struct.pack("f", 0.0))

def read_tokens(f_in, tokenizer):
    for i in range(tokenizer.vocab_size()):
        len_b = f_in.read(4)
        (length,) = struct.unpack("i", len_b)
        f_in.read(length)

def copy_all_data(f_out, f_in):
    while True:
        buf = f_in.read(1024 * 1024)
        if not buf:
            break
        f_out.write(buf)

def convert_one_file(path_in, tokenizer):
    path_tmp = f"{path_in}.tmp"
    path_orig = f"{path_in}.orig"
    print(f"converting {path_in}")
    with open(path_in, "rb") as f_in, open(path_tmp, "wb") as f_out:
        write_header(f_out, read_header(f_in))
        read_tokens(f_in, tokenizer)
        write_tokens(f_out, tokenizer)
        copy_all_data(f_out, f_in)
    os.rename(path_in, path_orig)
    os.rename(path_tmp, path_in)

def main():
    args = parse_args()

    tokenizer = SentencePieceProcessor(args.tokenizer_model)

    convert_one_file(args.gpt4all_model, tokenizer)

if __name__ == "__main__":
    main()
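The is_byte branch in write_tokens relies on SentencePiece byte pieces having the fixed 6-character form "<0xXX>", so piece[3:-1] is exactly the two hex digits. A standalone illustration:

import struct

piece = "<0x0A>"  # SentencePiece byte piece for the newline byte
assert len(piece) == 6
byte_value = int(piece[3:-1], 16)  # "0A" -> 10
text = struct.pack("B", byte_value)  # b'\n'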
convert-gptq-to-ggml.py (deleted)

@@ -1,172 +0,0 @@

# Convert a GPTQ quantized LLaMA model to a ggml compatible file
# Based on: https://github.com/qwopqwop200/GPTQ-for-LLaMa
#
import os
import re
import sys
import json
import struct
import numpy as np
import torch
from sentencepiece import SentencePieceProcessor

if len(sys.argv) != 4:
    print("Usage: convert-gptq-to-ggml.py llamaXXb-4bit.pt tokenizer.model out.bin\n")
    sys.exit(1)

fname_model = sys.argv[1]
fname_tokenizer = sys.argv[2]
dir_out = sys.argv[3]

model = torch.load(fname_model, map_location="cpu")

n_vocab, n_embd = model['model.embed_tokens.weight'].shape
n_layer = 1 + max(int(m.group(1)) for name in model
                  if (m := re.match(r'model\.layers\.([0-9]+)', name)))

# hardcoded:
n_mult = 256
n_head = {32: 32, 40: 40, 60: 52, 80: 64}[n_layer]

tokenizer = SentencePieceProcessor(fname_tokenizer)

assert tokenizer.vocab_size() == n_vocab

fname_out = sys.argv[3]

fout = open(fname_out, "wb")

fout.write(struct.pack("i", 0x67676d66))  # magic: ggmf in hex
fout.write(struct.pack("i", 1))  # file version
fout.write(struct.pack("i", n_vocab))
fout.write(struct.pack("i", n_embd))
fout.write(struct.pack("i", n_mult))
fout.write(struct.pack("i", n_head))
fout.write(struct.pack("i", n_layer))
fout.write(struct.pack("i", n_embd // n_head))  # rot (obsolete)
fout.write(struct.pack("i", 4))


# This loop unchanged from convert-pth-to-ggml.py:
for i in range(tokenizer.vocab_size()):
    if tokenizer.is_unknown(i):
        text = " \u2047 ".encode()
    elif tokenizer.is_control(i):
        text = b""
    elif tokenizer.is_byte(i):
        piece = tokenizer.id_to_piece(i)
        if len(piece) != 6:
            print(f"Invalid token: {piece}")
            sys.exit(1)
        byte_value = int(piece[3:-1], 16)
        text = struct.pack("B", byte_value)
    else:
        text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode()
    fout.write(struct.pack("i", len(text)))
    fout.write(text)
    fout.write(struct.pack("f", tokenizer.get_score(i)))

def write_header(shape, dst_name, ftype_cur):
    sname = dst_name.encode()
    fout.write(struct.pack("iii", len(shape), len(sname), ftype_cur))
    fout.write(struct.pack("i" * len(shape), *shape[::-1]))
    fout.write(sname)

    # ensure tensor data is aligned
    tensor_data_offset = fout.tell()
    tensor_data_offset = (tensor_data_offset + 31) & -32
    fout.seek(tensor_data_offset)

def convert_non_q4(src_name, dst_name):
    v = model[src_name]
    shape = v.shape
    print(f"Processing non-Q4 variable: {src_name} with shape: {shape} and type: {v.dtype}")
    if len(shape) == 1:
        print("  Converting to float32")
        v = v.to(torch.float32)

    ftype_cur = {torch.float16: 1, torch.float32: 0}[v.dtype]

    # header
    write_header(shape, dst_name, ftype_cur)

    # data
    v.numpy().tofile(fout)

def convert_q4(src_name, dst_name, permute=False):
    zeros = model[f"{src_name}.zeros"].numpy()
    scales = model[f"{src_name}.scales"].numpy()
    bias = model[f"{src_name}.bias"].numpy()
    qweight = model[f"{src_name}.qweight"].numpy().T  # transpose

    # Q4_1 does not support bias; good thing the bias is always all zeros.
    assert not np.any(bias)

    # Each int32 item is actually 8 int4 items packed together, and it's transposed.
    shape = (qweight.shape[0], qweight.shape[1] * 8)

    print(f"Processing Q4 variable: {src_name} with shape: {shape}")

    # The output format has the int4 weights in groups of 32 rather than 8.
    # It looks like this:
    # For each row:
    #   For each group of 32 columns:
    #     - addend (float32, 4 bytes)
    #     - scale (float32, 4 bytes)
    #     - weights (int4 * 32, 16 bytes)
    # Note that in the input, the scales and addends are shared between all
    # the columns in a row, so we end up wasting quite a bit of memory with
    # repeated scales and addends.

    addends = -zeros  # flip sign

    # Since the output format is mixed between integers and floats, we have
    # to hackily view the floats as int32s just so numpy will let us
    # concatenate them.
    addends_view = addends.view(dtype=np.int32)
    scales_view = scales.view(dtype=np.int32)

    # Split into groups of 4 columns (i.e. 32 columns of quantized data):
    grouped = qweight.reshape([qweight.shape[0], qweight.shape[1] // 4, 4])

    # Repeat addends and scales:
    addends_rep = np.atleast_3d(addends_view).repeat(grouped.shape[1], axis=1)
    scales_rep = np.atleast_3d(scales_view).repeat(grouped.shape[1], axis=1)

    blob = np.concatenate([scales_rep, addends_rep, grouped], axis=2, casting='no')

    if permute:
        # Permute some rows to undo the permutation done by convert_llama_weights_to_hf.py.
        # This can be done after the above conversion because it doesn't affect column order/layout.
        blob = (blob.reshape(n_head, 2, shape[0] // n_head // 2, *blob.shape[1:])
                    .swapaxes(1, 2)
                    .reshape(blob.shape))

    # header
    write_header(shape, dst_name, 3)  # ftype = Q4_1

    # data
    blob.tofile(fout)

convert_non_q4("model.embed_tokens.weight", "tok_embeddings.weight")
convert_non_q4("model.norm.weight", "norm.weight")
convert_non_q4("lm_head.weight", "output.weight")

for i in range(n_layer):
    convert_q4(f"model.layers.{i}.self_attn.q_proj", f"layers.{i}.attention.wq.weight", permute=True)
    convert_q4(f"model.layers.{i}.self_attn.k_proj", f"layers.{i}.attention.wk.weight", permute=True)
    convert_q4(f"model.layers.{i}.self_attn.v_proj", f"layers.{i}.attention.wv.weight")
    convert_q4(f"model.layers.{i}.self_attn.o_proj", f"layers.{i}.attention.wo.weight")

    convert_q4(f"model.layers.{i}.mlp.gate_proj", f"layers.{i}.feed_forward.w1.weight")
    convert_q4(f"model.layers.{i}.mlp.down_proj", f"layers.{i}.feed_forward.w2.weight")
    convert_q4(f"model.layers.{i}.mlp.up_proj", f"layers.{i}.feed_forward.w3.weight")

    convert_non_q4(f"model.layers.{i}.input_layernorm.weight", f"layers.{i}.attention_norm.weight")
    convert_non_q4(f"model.layers.{i}.post_attention_layernorm.weight", f"layers.{i}.ffn_norm.weight")


fout.close()

print(f"Done. Output file: {fname_out}")
print()
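To make the layout comment in convert_q4 concrete: each group of 32 quantized columns costs 4 (addend) + 4 (scale) + 16 (packed int4) = 24 bytes, i.e. 6 bits per weight. A back-of-the-envelope check (standalone; the 4096x4096 shape is just an example):

n_rows, n_cols = 4096, 4096  # example tensor shape
groups_per_row = n_cols // 32
bytes_per_group = 4 + 4 + 32 // 2  # addend + scale + packed weights
q4_1_bytes = n_rows * groups_per_row * bytes_per_group
f16_bytes = n_rows * n_cols * 2
print(q4_1_bytes / f16_bytes)  # 0.375: Q4_1 is 37.5% the size of f16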
convert-pth-to-ggml.py

@@ -1,274 +1,11 @@

# Convert a LLaMA model checkpoint to a ggjt compatible file
#
# Load the model using Torch
# Iterate over all variables and write them to a binary file.
#
# For each variable, write the following:
#   - Number of dimensions (int)
#   - Name length (int)
#   - Dimensions (int[n_dims])
#   - Name (char[name_length])
#   - Data (float[n_dims])
#
# At the start of the ggml file we write the model parameters
# and vocabulary.
#
# Compatibility stub

import argparse
import os
import sys
import json
import struct
import numpy as np
import torch

from sentencepiece import SentencePieceProcessor
import convert

QK = 32

GGML_TYPE_Q4_0 = 0
GGML_TYPE_Q4_1 = 1
GGML_TYPE_I8 = 2
GGML_TYPE_I16 = 3
GGML_TYPE_I32 = 4
GGML_TYPE_F16 = 5
GGML_TYPE_F32 = 6

WTYPES = {
    0: GGML_TYPE_F32,
    1: GGML_TYPE_F16,
    2: GGML_TYPE_Q4_0,
    3: GGML_TYPE_Q4_1,
}

GGML_BLCK_SIZE = {
    GGML_TYPE_Q4_0: QK,
    GGML_TYPE_Q4_1: QK,
    GGML_TYPE_I8: 1,
    GGML_TYPE_I16: 1,
    GGML_TYPE_I32: 1,
    GGML_TYPE_F16: 1,
    GGML_TYPE_F32: 1,
}

GGML_TYPE_SIZE = {
    GGML_TYPE_Q4_0: 4 + QK//2,
    GGML_TYPE_Q4_1: 4*2 + QK//2,
    GGML_TYPE_I8: 1,
    GGML_TYPE_I16: 2,
    GGML_TYPE_I32: 4,
    GGML_TYPE_F16: 2,
    GGML_TYPE_F32: 4,
}

def ggml_nelements(shape):
    r = 1
    for i in shape:
        r *= i
    return r

def ggml_nbytes(shape, ftype):
    x = ggml_nelements(shape)
    t = WTYPES[ftype]
    x *= GGML_TYPE_SIZE[t]
    x //= GGML_BLCK_SIZE[t]
    return x
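These tables mirror ggml's block geometry, so the script can compute exact on-disk tensor sizes without linking against ggml. For instance, using the helpers above (a standalone check; the shape is just an example):

shape = (4096, 4096)
for ftype in (0, 1, 2, 3):  # f32, f16, q4_0, q4_1
    print(ftype, ggml_nbytes(shape, ftype))
# q4_0: 16777216 elements / 32 per block * 20 bytes per block = 10485760 bytes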
def parse_args():
    parser = argparse.ArgumentParser(description='Convert a LLaMA model checkpoint to a ggml compatible file')
    parser.add_argument('dir_model', help='directory containing the model checkpoint')
    parser.add_argument('ftype', help='file type (0: float32, 1: float16)', type=int, choices=[0, 1], default=1)
    parser.add_argument('vocab_only', help='only write vocab to file', type=int, default=0, nargs='?')
    return parser.parse_args()

def get_n_parts(dim):
    mappings = {4096: 1, 5120: 2, 6656: 4, 8192: 8}
    n_parts = mappings.get(dim)
    if n_parts is None:
        print(f"Invalid dim: {dim}")
        sys.exit(1)

    print(f"n_parts = {n_parts}\n")
    return n_parts

def load_hparams_and_tokenizer(dir_model):
    # `dir_model` is something like `models/7B` or `models/7B/`.
    # "tokenizer.model" is expected under model's parent dir.
    # When `dir_model` is a symlink, f"{dir_model}/../tokenizer.model" would not be found.
    # Let's use the model's parent dir directly.
    model_parent_dir = os.path.dirname(os.path.normpath(dir_model))
    fname_hparams = f"{dir_model}/params.json"
    fname_tokenizer = f"{model_parent_dir}/tokenizer.model"
    with open(fname_hparams, "r") as f:
        hparams = json.load(f)
        print(hparams)
    tokenizer = SentencePieceProcessor(fname_tokenizer)
    hparams.update({"vocab_size": tokenizer.vocab_size()})
    return hparams, tokenizer

def write_header(fout, hparams, ftype):
    keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
    values = [
        0x67676a74,  # magic: ggjt in hex
        1,  # file version
        *[hparams[key] for key in keys],
        hparams["dim"] // hparams["n_heads"],  # rot (obsolete)
        ftype
    ]
    fout.write(struct.pack("i" * len(values), *values))
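The resulting ggjt header is nine native-endian int32 values, so reading one back is symmetric to write_header above. A small sketch (standalone; 'ggml-model-f16.bin' is a placeholder path):

import struct

with open("ggml-model-f16.bin", "rb") as f:
    (magic, version, vocab_size, dim, multiple_of,
     n_heads, n_layers, rot, ftype) = struct.unpack("9i", f.read(36))
assert magic == 0x67676a74  # 'ggjt'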
def write_tokens(fout, tokenizer):
    for i in range(tokenizer.vocab_size()):
        if tokenizer.is_unknown(i):
            text = " \u2047 ".encode()
        elif tokenizer.is_control(i):
            text = b""
        elif tokenizer.is_byte(i):
            piece = tokenizer.id_to_piece(i)
            if len(piece) != 6:
                print(f"Invalid token: {piece}")
                sys.exit(1)
            byte_value = int(piece[3:-1], 16)
            text = struct.pack("B", byte_value)
        else:
            text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode()
        fout.write(struct.pack("i", len(text)))
        fout.write(text)
        fout.write(struct.pack("f", tokenizer.get_score(i)))

def process_and_write_variables(fout, model, ftype, part_id, n_parts):
    for name, datao in model.items():
        if name.endswith("freqs"):
            continue

        # remove dimensions with a single element
        data = datao.numpy().squeeze()
        partshape = data.shape
        n_dims = len(data.shape)
        assert n_dims in (1, 2)

        print(f"Processing variable: {name} with shape: {partshape} and type: {datao.dtype}")

        # coerce single-dimensional tensors from float16 to float32
        ftype_cur = 1
        if ftype == 0 or n_dims == 1:
            print("  Converting to float32")
            data = data.astype(np.float32)
            ftype_cur = 0
        blck_size = GGML_BLCK_SIZE[WTYPES[ftype_cur]]
        type_size = GGML_TYPE_SIZE[WTYPES[ftype_cur]]

        # determine dimension along which multipart tensor is sharded
        #
        # split_dim 0 regex:
        #   - output.*
        #   - layers.*.attention.wq.weight
        #   - layers.*.attention.wk.weight
        #   - layers.*.attention.wv.weight
        #   - layers.*.feed_forward.w1.weight
        #   - layers.*.feed_forward.w3.weight
        #
        # split_dim 1 regex:
        #   - tok_embeddings.*
        #   - layers.*.attention.wo.weight
        #   - layers.*.feed_forward.w2.weight
        #
        if n_dims > 1:
            split_dim = 1
            if "tok_embeddings" in name:
                split_dim = 1
            elif "layers" in name:
                if "attention.wo.weight" in name:
                    split_dim = 1
                elif "feed_forward.w2.weight" in name:
                    split_dim = 1
                else:
                    split_dim = 0
            elif "output" in name:
                split_dim = 0

        # output tensor header
        fullshape = list(partshape)
        if n_dims > 1:
            fullshape[split_dim] *= n_parts
        sname = name.encode()
        fout.write(struct.pack("iii", n_dims, len(sname), ftype_cur))
        for dim in reversed(fullshape):
            fout.write(struct.pack("i", dim))
        fout.write(sname)

        # ensure tensor data is aligned
        tensor_data_offset = fout.tell()
        while tensor_data_offset % QK != 0:
            fout.write(struct.pack("B", 0))
            tensor_data_offset += 1

        # output unified mappable tensor data
        if n_dims == 1 or n_parts == 1:
            # copy tensor which we thankfully received in one piece
            if part_id == 0:
                data.tofile(fout)
        elif split_dim == 0:
            # reassemble multifile tensor containing some of the rows
            rows_per_chunk = partshape[0]
            current_row = part_id * rows_per_chunk
            bytes_per_row = fullshape[1] // blck_size * type_size
            offset = current_row * bytes_per_row
            fout.seek(tensor_data_offset + offset)
            data.tofile(fout)
        elif split_dim == 1:
            # reassemble multifile tensor containing some of the cols
            cols_per_chunk = partshape[1]
            current_col = part_id * cols_per_chunk
            bytes_per_row = fullshape[1] // blck_size * type_size
            offset_current_col = current_col // blck_size * type_size
            for row in range(partshape[0]):
                offset_row = row * bytes_per_row
                offset = offset_row + offset_current_col
                fout.seek(tensor_data_offset + offset)
                data[row].tofile(fout)

        # advance file position to next tensor
        fout.seek(tensor_data_offset + ggml_nbytes(fullshape, ftype_cur))

def main():
    args = parse_args()
    dir_model = args.dir_model
    ftype = args.ftype
    ftype_str = ["f32", "f16"]
    hparams, tokenizer = load_hparams_and_tokenizer(dir_model)

    print(args)

    # if only writing vocab to file
    if args.vocab_only:
        fname_model = f"{dir_model}/consolidated.00.pth"
        fname_out = f"{dir_model}/ggml-vocab.bin"
        print(f"Extracting only the vocab from '{fname_model}'\n")
        with open(fname_out, "wb") as fout:
            write_header(fout, hparams, ftype)
            write_tokens(fout, tokenizer)
        print(f"Done. Output file: {fname_out}\n")
        return

    n_parts = get_n_parts(hparams["dim"])
    fname_out = f"{dir_model}/ggml-model-{ftype_str[ftype]}.bin"

    # we output a single file for ggml
    with open(fname_out, "wb") as fout:
        write_header(fout, hparams, ftype)
        write_tokens(fout, tokenizer)
        offset_of_tensors = fout.tell()
        # the tensors we load could be split across multiple files
        for part_id in range(n_parts):
            fout.seek(offset_of_tensors)
            print(f"Processing part {part_id+1} of {n_parts}\n")
            fname_model = f"{dir_model}/consolidated.0{part_id}.pth"
            model = torch.load(fname_model, map_location="cpu")
            process_and_write_variables(fout, model, ftype, part_id, n_parts)
            del model

    print(f"Done. Output file: {fname_out}\n")

if __name__ == "__main__":
    main()

parser = argparse.ArgumentParser(description='Convert a LLaMA model checkpoint to a ggml compatible file')
parser.add_argument('dir_model', help='directory containing the model checkpoint')
parser.add_argument('ftype', help='file type (0: float32, 1: float16)', type=int, choices=[0, 1], default=1)
args = parser.parse_args()
convert.main(['--outtype', 'f16' if args.ftype == 1 else 'f32', '--', args.dir_model])
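The split_dim == 1 branch above interleaves each part's column block into the full row using plain byte arithmetic. The same offsets, traced with toy numbers (a standalone sketch; f32 is assumed, so blck_size = 1 and type_size = 4):

blck_size, type_size = 1, 4  # f32
n_parts = 2
partshape = (2, 4)   # rows x cols per part
fullshape = (2, 8)

bytes_per_row = fullshape[1] // blck_size * type_size  # 32
for part_id in range(n_parts):
    current_col = part_id * partshape[1]
    offset_current_col = current_col // blck_size * type_size
    for row in range(partshape[0]):
        offset = row * bytes_per_row + offset_current_col
        print(part_id, row, offset)  # part 0 rows land at 0 and 32; part 1 at 16 and 48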
convert-unversioned-ggml-to-ggml.py (deleted)

@@ -1,100 +0,0 @@

#!/usr/bin/env python3
# Original by https://github.com/eiz
# https://github.com/ggerganov/llama.cpp/issues/324#issuecomment-1476227818
import argparse
import glob
import os
import struct
import sys
from sentencepiece import SentencePieceProcessor

HPARAMS = keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]

def parse_args():
    parser = argparse.ArgumentParser(description='Upgrade old ggml model files to the current format')
    parser.add_argument('dir_model', help='directory containing ggml .bin files')
    parser.add_argument('tokenizer_model', help='path to LLaMA tokenizer.model file')
    return parser.parse_args()

def read_header(f_in):
    struct_fmt = "i" * (3 + len(HPARAMS))
    struct_size = struct.calcsize(struct_fmt)
    buf = f_in.read(struct_size)
    return struct.unpack(struct_fmt, buf)

def write_header(f_out, header):
    (magic, vocab_size, dim, multiple_of, n_heads, n_layers, rot, ftype) = header

    if magic != 0x67676d6c:
        raise Exception('Invalid file magic. Must be an old style ggml file.')

    values = [
        0x67676d66,  # magic: ggmf in hex
        1,  # file version
        vocab_size,
        dim,
        multiple_of,
        n_heads,
        n_layers,
        rot,
        ftype
    ]
    f_out.write(struct.pack("i" * len(values), *values))

def write_tokens(fout, tokenizer):
    for i in range(tokenizer.vocab_size()):
        if tokenizer.is_unknown(i):
            text = " \u2047 ".encode()
        elif tokenizer.is_control(i):
            text = b""
        elif tokenizer.is_byte(i):
            piece = tokenizer.id_to_piece(i)
            if len(piece) != 6:
                print(f"Invalid token: {piece}")
                sys.exit(1)
            byte_value = int(piece[3:-1], 16)
            text = struct.pack("B", byte_value)
        else:
            text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode()
        fout.write(struct.pack("i", len(text)))
        fout.write(text)
        fout.write(struct.pack("f", tokenizer.get_score(i)))

def read_tokens(f_in, tokenizer):
    for i in range(tokenizer.vocab_size()):
        len_b = f_in.read(4)
        (length,) = struct.unpack("i", len_b)
        f_in.read(length)

def copy_all_data(f_out, f_in):
    while True:
        buf = f_in.read(1024 * 1024)
        if not buf:
            break
        f_out.write(buf)

def convert_one_file(path_in, tokenizer):
    path_tmp = f"{path_in}.tmp"
    path_orig = f"{path_in}.orig"
    print(f"converting {path_in}")
    with open(path_in, "rb") as f_in, open(path_tmp, "wb") as f_out:
        write_header(f_out, read_header(f_in))
        read_tokens(f_in, tokenizer)
        write_tokens(f_out, tokenizer)
        copy_all_data(f_out, f_in)
    os.rename(path_in, path_orig)
    os.rename(path_tmp, path_in)

def main():
    args = parse_args()
    files = []
    files.extend(glob.glob(f"{args.dir_model}/*.bin"))
    files.extend(glob.glob(f"{args.dir_model}/*.bin.*"))

    tokenizer = SentencePieceProcessor(args.tokenizer_model)

    for file in files:
        convert_one_file(file, tokenizer)

if __name__ == "__main__":
    main()
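The three magic constants used across these converters spell their tag when the int32 is viewed big-endian, which is what the "... in hex" comments refer to. A standalone check:

import struct

for magic in (0x67676d6c, 0x67676d66, 0x67676a74):
    print(hex(magic), struct.pack(">I", magic))  # b'ggml', b'ggmf', b'ggjt'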
convert.py (new file, 1148 lines)
File diff suppressed because it is too large.
examples/benchmark/benchmark-q4_0-matmult.c

@@ -24,7 +24,7 @@

float tensor_sum_elements(struct ggml_tensor * tensor) {
    float sum = 0;
    if (tensor->type==6) {
    if (tensor->type==GGML_TYPE_F32) {
        for (int j = 0; j < tensor->ne[1]; j++) {
            for (int k = 0; k < tensor->ne[0]; k++) {
                sum += ((float *) tensor->data)[j*tensor->ne[0]+k];
examples/common.cpp

@@ -7,12 +7,6 @@

#include <iterator>
#include <algorithm>

#if defined(_MSC_VER) || defined(__MINGW32__)
#include <malloc.h> // using malloc.h with MSC/MINGW
#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
#include <alloca.h>
#endif

#if defined (_WIN32)
#include <fcntl.h>
#include <io.h>
@@ -1,6 +1,8 @@

#include "common.h"
#include "llama.h"

#include <ctime>

int main(int argc, char ** argv) {
    gpt_params params;
    params.model = "models/llama-7B/ggml-model.bin";
@@ -11,6 +11,7 @@

#include <cmath>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <fstream>
#include <iostream>
#include <string>
examples/perplexity/perplexity.cpp

@@ -2,6 +2,7 @@

#include "llama.h"

#include <cmath>
#include <ctime>

std::vector<float> softmax(const std::vector<float>& logits) {
    std::vector<float> probs(logits.size());

@@ -27,20 +28,27 @@ void perplexity(llama_context * ctx, const gpt_params & params) {

    int count = 0;
    int seq_count = tokens.size() / params.n_ctx;
    int n_vocab = llama_n_vocab(ctx);

    double nll = 0.0;

    fprintf(stderr, "%s : calculating perplexity over %d chunks\n", __func__, seq_count);
    fprintf(stderr, "%s : calculating perplexity over %d chunks, batch_size=%d\n", __func__, seq_count, params.n_batch);

    for (int i = 0; i < seq_count; ++i) {
        int start = i * params.n_ctx;
        int end = start + params.n_ctx - 1; // TODO: this is not optimal, e.g. it makes the batch 511 instead of 512
                                            // it is better to always be power of 2 for better performance
        std::vector<llama_token> embd(tokens.begin() + start, tokens.begin() + end);
        int end = start + params.n_ctx;

        std::vector<float> logits;
        int num_batches = (params.n_ctx + params.n_batch - 1) / params.n_batch;
        auto start_t = std::chrono::high_resolution_clock::now();
        if (llama_eval(ctx, embd.data(), embd.size(), 0, params.n_threads)) {
            fprintf(stderr, "%s : failed to eval\n", __func__);
            return;
        for (int j = 0; j < num_batches; ++j) {
            int batch_start = start + j * params.n_batch;
            int batch_size = std::min(end - batch_start, params.n_batch);
            if (llama_eval(ctx, tokens.data() + batch_start, batch_size, j * params.n_batch, params.n_threads)) {
                fprintf(stderr, "%s : failed to eval\n", __func__);
                return;
            }
            auto batch_logits = llama_get_logits(ctx);
            logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
        }
        auto end_t = std::chrono::high_resolution_clock::now();
        if (i == 0) {

@@ -59,15 +67,12 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
        // Example, we have a context window of 512, we will compute perplexity for each of the
        // last 256 tokens.  Then, we split the input up into context window size chunks to
        // process the entire prompt.

        auto logits = llama_get_logits(ctx);
        for (int j = params.n_ctx / 2; j < params.n_ctx - 1; ++j) {
        for (int j = std::min(512, params.n_ctx / 2); j < params.n_ctx - 1; ++j) {
            // Calculate probability of next token, given the previous ones.
            int n_vocab = llama_n_vocab(ctx);
            std::vector<float> tok_logits(
                logits + j * n_vocab,
                logits + (j + 1) * n_vocab);
            const float prob = softmax(tok_logits)[tokens[start + j + 1]];
                logits.begin() + j * n_vocab,
                logits.begin() + (j + 1) * n_vocab);
            float prob = softmax(tok_logits)[tokens[start + j + 1]];
            nll += -std::log(prob);
            ++count;
        }

@@ -82,11 +87,13 @@ int main(int argc, char ** argv) {
    gpt_params params;
    params.model = "models/llama-7B/ggml-model.bin";

    params.n_batch = 512;
    if (gpt_params_parse(argc, argv, params) == false) {
        return 1;
    }

    params.perplexity = true;
    params.n_batch = std::min(params.n_batch, params.n_ctx);

    if (params.n_ctx > 2048) {
        fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
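For reference, the quantity the loop above accumulates is the negative log-likelihood of each true next token, and the reported perplexity is exp(nll / count), i.e. the geometric-mean inverse probability. A toy computation (standalone; the probabilities are made up):

import math

probs = [0.25, 0.5, 0.125, 0.5]  # model's probability for each true next token
nll = sum(-math.log(p) for p in probs)
ppl = math.exp(nll / len(probs))
print(ppl)  # ~3.36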
examples/quantize-stats/quantize-stats.cpp

@@ -1,6 +1,7 @@

#include "ggml.h"

#define LLAMA_API_INTERNAL
#include "llama.h"
#include "llama_internal.h"

#include <algorithm>
#include <cassert>

@@ -15,9 +16,6 @@
#include <unordered_map>
#include <vector>

static const char * type_strs[] = { "q4_0", "q4_1", "i8", "i16", "i32", "f16", "f32" };
static_assert(sizeof(type_strs) == GGML_TYPE_COUNT * sizeof(char *), "Incomplete type list");

struct quantize_stats_params {
    std::string model = "models/7B/ggml-model-f16.bin";
    bool verbose = false;

@@ -223,7 +221,7 @@ int main(int argc, char ** argv) {
                break;
            }
            int j;
            for (j = 0; j < GGML_TYPE_COUNT && strcmp(argv[i], type_strs[j]) != 0; j++) {
            for (j = 0; j < GGML_TYPE_COUNT && strcmp(argv[i], ggml_type_name((ggml_type) j)) != 0; j++) {
                // find match
            }
            if (j < GGML_TYPE_COUNT) {

@@ -278,7 +276,7 @@ int main(int argc, char ** argv) {
                continue;
            }
            if (params.verbose) {
                printf("%s: type %s, size %" PRId64 "\n", kv_tensor.first.c_str(), type_strs[kv_tensor.second->type], ggml_nelements(kv_tensor.second));
                printf("%s: type %s, size %" PRId64 "\n", kv_tensor.first.c_str(), ggml_type_name(kv_tensor.second->type), ggml_nelements(kv_tensor.second));
            }
            if (kv_tensor.second->type == GGML_TYPE_F16) {
                is_f16 = true;

@@ -303,13 +301,14 @@ int main(int argc, char ** argv) {

    // loop through quantization types
    for (int i = 0; i < GGML_TYPE_COUNT; i++) {
        const ggml_type type = (ggml_type) i;
        if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
            continue;
        }
        quantize_fns_t qfns = ggml_internal_get_quantize_fn(i);
        if (qfns.quantize_row_q && qfns.dequantize_row_q) {
            if (params.verbose) {
                printf("testing %s ...\n", type_strs[i]);
                printf("testing %s ...\n", ggml_type_name(type));
            }

            error_stats global_stats {};

@@ -321,7 +320,7 @@ int main(int argc, char ** argv) {
            if (params.verbose) {
                printf("  %s ...\n", kv_tensor.first.c_str());
            }
            std::string layer_name { type_strs[i] };
            std::string layer_name { ggml_type_name(type) };
            layer_name += "::" + kv_tensor.first;
            test_roundtrip_on_layer(
                layer_name,

@@ -336,7 +335,7 @@ int main(int argc, char ** argv) {
            );
        }

        print_error_stats(type_strs[i], global_stats, params.print_histogram);
        print_error_stats(ggml_type_name(type), global_stats, params.print_histogram);
    }
}
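The core measurement in quantize-stats is just the round-trip error of dequantize(quantize(x)) against x, aggregated per type. A crude standalone stand-in for the Q4_0 case (16 levels, per-block scale; not the repository's exact rounding):

import numpy as np

rng = np.random.default_rng(0)
x = rng.normal(size=4096).astype(np.float32)

qk = 32
xb = x.reshape(-1, qk)
d = np.abs(xb).max(axis=1, keepdims=True) / 7.0
q = np.clip(np.round(xb / d), -8, 7)
y = (q * d).reshape(-1)

print(np.sqrt(np.mean((x - y) ** 2)))  # RMSE, the kind of statistic the tool reports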
flake.nix

@@ -10,7 +10,6 @@
        inherit system;
      };
      llama-python = pkgs.python310.withPackages (ps: with ps; [
        torch
        numpy
        sentencepiece
      ]);
ggml.h (35 changes)

@@ -177,11 +177,12 @@ extern "C" {

#include <stddef.h>
#include <stdbool.h>

#define GGML_MAX_DIMS     4
#define GGML_MAX_NODES    4096
#define GGML_MAX_PARAMS   16
#define GGML_MAX_CONTEXTS 64
#define GGML_MAX_OPT      4
#define GGML_MAX_DIMS          4
#define GGML_MAX_NODES         4096
#define GGML_MAX_PARAMS        16
#define GGML_MAX_CONTEXTS      64
#define GGML_MAX_OPT           4
#define GGML_DEFAULT_N_THREADS 4

#ifdef __ARM_NEON
// we use the built-in 16-bit float type

@@ -203,6 +204,7 @@ enum ggml_type {
    GGML_TYPE_F16 = 1,
    GGML_TYPE_Q4_0 = 2,
    GGML_TYPE_Q4_1 = 3,
    GGML_TYPE_Q8_0 = 4,
    GGML_TYPE_I8,
    GGML_TYPE_I16,
    GGML_TYPE_I32,

@@ -252,6 +254,9 @@ enum ggml_op {
    GGML_OP_FLASH_ATTN,
    GGML_OP_FLASH_FF,

    GGML_OP_MAP_UNARY,
    GGML_OP_MAP_BINARY,

    GGML_OP_COUNT,
};

@@ -350,6 +355,8 @@ int    ggml_blck_size (enum ggml_type type);
size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
float  ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float

const char * ggml_type_name(enum ggml_type type);

size_t ggml_element_size(const struct ggml_tensor * tensor);

struct ggml_context * ggml_init(struct ggml_init_params params);

@@ -651,6 +658,21 @@ struct ggml_tensor * ggml_flash_ff(
        struct ggml_tensor * c0,
        struct ggml_tensor * c1);

// Mapping operations
typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);

struct ggml_tensor * ggml_map_unary_f32(
        struct ggml_context       * ctx,
        struct ggml_tensor        * a,
        const ggml_unary_op_f32_t   fun);

struct ggml_tensor * ggml_map_binary_f32(
        struct ggml_context        * ctx,
        struct ggml_tensor         * a,
        struct ggml_tensor         * b,
        const ggml_binary_op_f32_t   fun);

//
// automatic differentiation
//

@@ -786,6 +808,8 @@ size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t *
int ggml_cpu_has_avx(void);
int ggml_cpu_has_avx2(void);
int ggml_cpu_has_avx512(void);
int ggml_cpu_has_avx512_vbmi(void);
int ggml_cpu_has_avx512_vnni(void);
int ggml_cpu_has_fma(void);
int ggml_cpu_has_neon(void);
int ggml_cpu_has_arm_fma(void);

@@ -815,6 +839,7 @@ typedef struct {
    dequantize_row_q_t dequantize_row_q;
    quantize_row_q_t   quantize_row_q;
    quantize_row_q_t   quantize_row_q_reference;
    quantize_row_q_t   quantize_row_q_dot;
    vec_dot_q_t        vec_dot_q;
} quantize_fns_t;
llama.cpp (56 changes)

@@ -5,11 +5,11 @@

#include "llama_util.h"
#include "llama.h"
#include "llama_internal.h"

#include "ggml.h"

#include <array>
#include <ctime>
#include <cinttypes>
#include <fstream>
#include <random>

@@ -262,22 +262,12 @@ static size_t checked_div(size_t a, size_t b) {
}

static std::string llama_format_tensor_shape(const std::vector<uint32_t> & ne) {
    std::string ret = "[" + std::to_string(ne.at(0));
    char buf[256];
    snprintf(buf, sizeof(buf), "%5u", ne.at(0));
    for (size_t i = 1; i < ne.size(); i++) {
        ret += " x " + std::to_string(ne.at(i));
    }
    ret += "]";
    return ret;
}

static const char * llama_format_type(enum ggml_type type) {
    switch (type) {
        case GGML_TYPE_F32: return "f32";
        case GGML_TYPE_F16: return "f16";
        case GGML_TYPE_Q4_0: return "q4_0";
        case GGML_TYPE_Q4_1: return "q4_1";
        default: LLAMA_ASSERT(false);
        snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), " x %5u", ne.at(i));
    }
    return buf;
}

static size_t llama_calc_tensor_size(const std::vector<uint32_t> & ne, enum ggml_type type) {

@@ -952,8 +942,8 @@ static void llama_model_load_internal(
    ml->ggml_ctx = ctx;

    model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab});
    model.norm = ml->get_tensor("norm.weight", {n_embd});
    model.output = ml->get_tensor("output.weight", {n_embd, n_vocab});
    model.norm           = ml->get_tensor("norm.weight",           {n_embd});
    model.output         = ml->get_tensor("output.weight",         {n_embd, n_vocab});

    model.layers.resize(n_layer);
    for (uint32_t i = 0; i < n_layer; ++i) {

@@ -1580,10 +1570,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
        tensor.data = read_data.addr;
        model_loader->load_data_for(tensor);

        printf("[%zu/%zu] %36s - %s, type = %6s, ",
        printf("[%4zu/%4zu] %36s - %16s, type = %6s, ",
               ++idx, model_loader->tensors_map.tensors.size(),
               tensor.name.c_str(), llama_format_tensor_shape(tensor.ne).c_str(),
               llama_format_type(tensor.type));
               ggml_type_name(tensor.type));

        // This used to be a regex, but <regex> has an extreme cost to compile times.
        bool quantize = tensor.name.rfind("weight") == tensor.name.size() - 6; // ends with 'weight'?

@@ -1616,7 +1606,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                f32_data[i] = ggml_fp16_to_fp32(f16_data[i]);
            }
        } else {
            throw format("type %s unsupported for integer quantization", llama_format_type(tensor.type));
            throw format("type %s unsupported for integer quantization", ggml_type_name(tensor.type));
        }

        printf("quantizing .. ");

@@ -1925,18 +1915,20 @@ const char * llama_print_system_info(void) {
    static std::string s;

    s  = "";
    s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
    s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
    s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
    s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
    s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
    s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
    s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
    s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
    s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
    s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
    s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
    s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
    s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
    s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
    s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
    s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | ";
    s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | ";
    s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
    s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
    s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
    s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
    s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
    s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
    s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
    s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
    s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";

    return s.c_str();
}
llama.h (11 changes)

@@ -179,4 +179,15 @@ extern "C" {
}
#endif

// Internal API to be implemented by llama.cpp and used by tests/benchmarks only
#ifdef LLAMA_API_INTERNAL

#include <vector>
#include <string>
struct ggml_tensor;

std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);

#endif

#endif // LLAMA_H
@@ -1,12 +0,0 @@
// Internal header to be included by llama.cpp and tests/benchmarks only.

#ifndef LLAMA_INTERNAL_H
#define LLAMA_INTERNAL_H

#include <vector>
#include <string>
struct ggml_tensor;

std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);

#endif // LLAMA_INTERNAL_H
@@ -43,8 +43,12 @@
} while (0)

#ifdef __GNUC__
#ifdef __MINGW32__
__attribute__((format(gnu_printf, 1, 2)))
#else
__attribute__((format(printf, 1, 2)))
#endif
#endif
static std::string format(const char * fmt, ...) {
    va_list ap, ap2;
    va_start(ap, fmt);
@@ -57,7 +61,7 @@ static std::string format(const char * fmt, ...) {
    va_end(ap2);
    va_end(ap);
    return std::string(buf.data(), size);
};
}

struct llama_file {
    // use FILE * so we don't have to re-open the file to mmap
@@ -172,7 +176,6 @@ struct llama_mmap {
        flags |= MAP_POPULATE;
#endif
        addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
        close(fd);
        if (addr == MAP_FAILED) {
            throw format("mmap failed: %s", strerror(errno));
        }
@@ -1,311 +0,0 @@
# Migrate ggml file(s) with ggmf magic to ggml file with ggjt magic
#
# We caused a breaking change to the file format on 2023-03-30 in:
# https://github.com/ggerganov/llama.cpp/pull/613
#
# (1) If you still have the Meta LLaMA .pth files, then close this
#     file now; you can just run `convert-pth-to-ggml.py` again to
#     migrate to the new format. The tool is easier to use too. It
#     isn't necessary anymore to manage split output files because
#     the new format always combines things into a single file.
#
# (2) If you deleted the Meta LLaMA .pth files to save disk space,
#     then this tool is intended to help you. Please check out the
#     instructions below.
#
# USAGE
#
#     python migrate-ggml-2023-03-30-pr613.py INPUT OUTPUT
#
# PREREQUISITES
#
#     pip install numpy
#     cd llama.cpp
#     make -j4
#
# EXAMPLE (7B MODEL)
#
#     # you can replace all the 'f16' with 'q4_0' if you're using quantized weights
#     python migrate-ggml-2023-03-30-pr613.py models/7B/ggml-model-f16.bin models/7B/ggml-model-f16-ggjt.bin
#
#     # check that it works
#     ./main -m models/7B/ggml-model-f16-ggjt.bin -p 'Question: Do you love me?'
#
#     # you can delete the old files
#     rm -f models/7B/ggml-model-f16.bin
#     mv models/7B/ggml-model-f16-ggjt.bin models/7B/ggml-model-f16.bin
#
# EXAMPLE (13B MODEL)
#
#     # you can replace all the 'f16' with 'q4_0' if you're using quantized weights
#     python migrate-ggml-2023-03-30-pr613.py models/13B/ggml-model-f16.bin models/13B/ggml-model-f16-ggjt.bin
#
#     # check that it works
#     ./main -m models/13B/ggml-model-f16-ggjt.bin -p 'Question: Do you love me?'
#
#     # you can delete the old files
#     rm -f models/13B/ggml-model-f16.bin*
#     mv models/13B/ggml-model-f16-ggjt.bin models/13B/ggml-model-f16.bin
#

import argparse
import os
import sys
import json
import struct
import numpy as np

QK = 32

GGML_TYPE_Q4_0 = 0
GGML_TYPE_Q4_1 = 1
GGML_TYPE_I8 = 2
GGML_TYPE_I16 = 3
GGML_TYPE_I32 = 4
GGML_TYPE_F16 = 5
GGML_TYPE_F32 = 6

WTYPE_NAMES = {
    0: "F32",
    1: "F16",
    2: "Q4_0",
    3: "Q4_1",
}

WTYPES = {
    0: GGML_TYPE_F32,
    1: GGML_TYPE_F16,
    2: GGML_TYPE_Q4_0,
    3: GGML_TYPE_Q4_1,
}

GGML_BLCK_SIZE = {
    GGML_TYPE_Q4_0: QK,
    GGML_TYPE_Q4_1: QK,
    GGML_TYPE_I8: 1,
    GGML_TYPE_I16: 1,
    GGML_TYPE_I32: 1,
    GGML_TYPE_F16: 1,
    GGML_TYPE_F32: 1,
}

GGML_TYPE_SIZE = {
    GGML_TYPE_Q4_0: 4 + QK//2,
    GGML_TYPE_Q4_1: 4*2 + QK//2,
    GGML_TYPE_I8: 1,
    GGML_TYPE_I16: 2,
    GGML_TYPE_I32: 4,
    GGML_TYPE_F16: 2,
    GGML_TYPE_F32: 4,
}
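
# Illustrative check of the sizes above: a Q4_0 block packs QK = 32 weights into
# one f32 scale (4 bytes) plus QK//2 = 16 bytes of packed 4-bit values, 20 bytes
# total; Q4_1 carries two f32 fields (scale and minimum), 24 bytes per block.
assert GGML_TYPE_SIZE[GGML_TYPE_Q4_0] == 20 and GGML_TYPE_SIZE[GGML_TYPE_Q4_1] == 24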

HPARAMS = [
    'magic',   # int32
    'version', # int32
    'n_vocab', # int32
    'n_embd',  # int32
    'n_mult',  # int32
    'n_head',  # int32
    'n_layer', # int32
    'n_rot',   # int32
    'f16',     # int32
]

def read_hparams(fin):
    struct_fmt = "i" * len(HPARAMS)
    struct_size = struct.calcsize(struct_fmt)
    buf = fin.read(struct_size)
    ints = struct.unpack(struct_fmt, buf)
    hparams = dict(zip(HPARAMS, ints))
    return hparams

def write_hparams(fout, hparams):
    struct_fmt = "i" * len(HPARAMS)
    struct_size = struct.calcsize(struct_fmt)
    ints = [hparams[h] for h in HPARAMS]
    fout.write(struct.pack(struct_fmt, *ints))
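
# The header is nine consecutive native int32 fields, so the hparams block is a
# fixed 36 bytes at the start of the file.
assert struct.calcsize("i" * len(HPARAMS)) == 36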

def read_tokens(fin, hparams):
    tokens = []
    for i in range(hparams['n_vocab']):
        len_b = fin.read(4)
        (length,) = struct.unpack("i", len_b)
        word = fin.read(length)
        score_b = fin.read(4)
        (score,) = struct.unpack("f", score_b)
        tokens.append((word, score))
    return tokens

def write_tokens(fout, tokens):
    for word, score in tokens:
        fout.write(struct.pack("i", len(word)))
        fout.write(word)
        fout.write(struct.pack("f", score))

def ggml_nelements(shape):
    r = 1
    for i in shape:
        r *= i
    return r

def ggml_nbytes(shape, ftype):
    x = ggml_nelements(shape)
    t = WTYPES[ftype]
    x *= GGML_TYPE_SIZE[t]
    x //= GGML_BLCK_SIZE[t]
    return x
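
# Worked example: a 4096 x 4096 Q4_0 tensor (ftype 2) has 16,777,216 elements;
# at 20 bytes per 32-element block that is 16777216 // 32 * 20 = 10,485,760 bytes.
assert ggml_nbytes([4096, 4096], 2) == 10485760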

def copy_tensors(fin, fout, part_id, n_parts):
    while True:

        b = fin.read(4)
        if not b: break
        (n_dims,) = struct.unpack("i", b)
        b = fin.read(4)
        (length,) = struct.unpack("i", b)
        b = fin.read(4)
        (ftype,) = struct.unpack("i", b)

        assert n_dims in (1, 2)

        partshape = list(range(n_dims))
        for i in range(n_dims):
            b = fin.read(4)
            partshape[i] = struct.unpack("i", b)[0]
        partshape = list(reversed(partshape))

        name = fin.read(length)
        data = fin.read(ggml_nbytes(partshape, ftype))

        blck_size = GGML_BLCK_SIZE[WTYPES[ftype]]
        type_size = GGML_TYPE_SIZE[WTYPES[ftype]]

        print(f"Processing tensor {name} with shape: {partshape} and type: {WTYPE_NAMES[ftype]}")

        # determine dimension along which multipart tensor is sharded
        #
        # split_dim 0 regex:
        #   - output.*
        #   - layers.*.attention.wq.weight
        #   - layers.*.attention.wk.weight
        #   - layers.*.attention.wv.weight
        #   - layers.*.feed_forward.w1.weight
        #   - layers.*.feed_forward.w3.weight
        #
        # split_dim 1 regex:
        #   - tok_embeddings.*
        #   - layers.*.attention.wo.weight
        #   - layers.*.feed_forward.w2.weight
        #
        if n_dims > 1:
            split_dim = 1
            if b"tok_embeddings" in name:
                split_dim = 1
            elif b"layers" in name:
                if b"attention.wo.weight" in name:
                    split_dim = 1
                elif b"feed_forward.w2.weight" in name:
                    split_dim = 1
                else:
                    split_dim = 0
            elif b"output" in name:
                split_dim = 0

        # output tensor header
        fullshape = list(partshape)
        if n_dims > 1:
            fullshape[split_dim] *= n_parts
        fout.write(struct.pack("iii", n_dims, len(name), ftype))
        for dim in reversed(fullshape):
            fout.write(struct.pack("i", dim))
        fout.write(name)

        # ensure tensor data is aligned
        tensor_data_offset = fout.tell()
        while tensor_data_offset % QK != 0:
            fout.write(struct.pack("B", 0))
            tensor_data_offset += 1

        # output unified mappable tensor data
        if n_dims == 1 or n_parts == 1:
            # copy tensor which we thankfully received in one piece
            if part_id == 0:
                fout.write(data)
        elif split_dim == 0:
            # reassemble multifile tensor containing some of the rows
            rows_per_chunk = partshape[0]
            current_row = part_id * rows_per_chunk
            bytes_per_row = fullshape[1] // blck_size * type_size
            offset = current_row * bytes_per_row
            fout.seek(tensor_data_offset + offset)
            fout.write(data)
        elif split_dim == 1:
            # reassemble multifile tensor containing some of the cols
            cols_per_chunk = partshape[1]
            current_col = part_id * cols_per_chunk
            bpr = partshape[1] // blck_size * type_size
            bytes_per_row = fullshape[1] // blck_size * type_size
            offset_current_col = current_col // blck_size * type_size
            for row in range(partshape[0]):
                offset_row = row * bytes_per_row
                offset = offset_row + offset_current_col
                fout.seek(tensor_data_offset + offset)
                fout.write(data[row * bpr:row * bpr + bpr])

        # advance file position to next tensor
        fout.seek(tensor_data_offset + ggml_nbytes(fullshape, ftype))
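
# A worked pass through the split_dim == 1 branch above: with n_parts == 2 and an
# f16 tensor of fullshape [2, 8], each part holds partshape [2, 4], so
# bpr = 4 * 2 = 8 bytes, bytes_per_row = 8 * 2 = 16 bytes, and part 1's slice of
# every row lands offset_current_col = 4 * 2 = 8 bytes into the destination row.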

def parse_args():
    parser = argparse.ArgumentParser(description='Migrate from GGML to new GGJT file format')
    parser.add_argument('fin_path', help='your old ggml file (leave out the .1 .2 etc.)')
    parser.add_argument('fout_path', help='your new ggjt file name')
    return parser.parse_args()

def main():
    args = parse_args()
    assert args.fin_path
    assert args.fout_path
    assert args.fin_path != args.fout_path

    with open(args.fin_path, "rb") as fin:
        hparams = read_hparams(fin)
        tokens = read_tokens(fin, hparams)

    if hparams['magic'] == 0x67676a74: # ggjt
        print(f"{args.fin_path}: input ggml has already been converted to 'ggjt' magic\n")
        sys.exit(1)

    if hparams['magic'] != 0x67676d66: # ggmf
        print(f"{args.fin_path}: input ggml file doesn't have expected 'ggmf' magic: {hparams['magic']:#x}\n")
        sys.exit(1)

    hparams['magic'] = 0x67676a74 # ggjt
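    # The magic constants are ASCII tags: struct.pack(">I", 0x67676d66) yields
    # b"ggmf" and struct.pack(">I", 0x67676a74) yields b"ggjt".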

    # count number of multipart files by convention
    n_parts = 1
    while True:
        if os.path.exists(f"{args.fin_path}.{n_parts}"):
            n_parts += 1
        else:
            break
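    # e.g. a two-part model stored as ggml-model-f16.bin plus ggml-model-f16.bin.1
    # leaves this loop with n_parts == 2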

    # we output a single file for ggml
    with open(args.fout_path, "wb") as fout:
        write_hparams(fout, hparams)
        write_tokens(fout, tokens)
        offset_of_tensors = fout.tell()
        # the tensors we load could be split across multiple files
        for part_id in range(n_parts):
            fout.seek(offset_of_tensors)
            print(f"Processing part {part_id+1} of {n_parts}\n")
            fin_path = args.fin_path
            if part_id > 0:
                fin_path += f".{part_id}"
            with open(fin_path, "rb") as fin:
                read_tokens(fin, read_hparams(fin))
                copy_tensors(fin, fout, part_id, n_parts)

    print(f"Done. Output file: {args.fout_path}\n")

if __name__ == "__main__":
    main()
2
requirements.txt
Normal file
@@ -0,0 +1,2 @@
numpy==1.24
sentencepiece==0.1.98