Compare commits

..

24 Commits

Author SHA1 Message Date
Slaren
0d054e292e Show error message when -f fails 2023-04-01 16:08:40 +02:00
Stephan Walter
3525899277 Enable -std= for cmake builds, fix warnings (#598) 2023-03-31 19:19:16 +00:00
slaren
1d08882afa Optimize AVX2 ggml_vec_dot_q4_0 (#642) 2023-03-31 15:55:52 +00:00
perserk
02c5b27e91 Add AVX acceleration (#617)
* ggml : add AVX quantize_row_q4_0()

* ggml : add AVX ggml_vec_dot_q4_0()

* ggml : refactor AVX part of ggml_vec_dot_q4_0()

https://github.com/ggerganov/llama.cpp/pull/617#issuecomment-1489985645
2023-03-31 13:55:44 +02:00
Pavol Rusnak
cbef542879 py : cleanup the code
- use f-strings where possible
- drop first param of encode/decode functions since "utf-8" is the default
2023-03-31 10:32:01 +02:00
Pavol Rusnak
9733104be5 drop quantize.py (now that models are using a single file) 2023-03-31 01:07:32 +02:00
Georgi Gerganov
3df890aef4 readme : update supported models 2023-03-30 22:31:54 +03:00
Justine Tunney
ee0c40dd6d Introduce GGML migration tool for new file format
If you deleted your old Meta LLaMA .pth files, then the
migrate-ggml-2023-03-30-pr613.py script will allow you to convert your
old ggml files into the new mmap()'able format.

See #613
2023-03-30 12:28:25 -07:00
Justine Tunney
6f23ba5ee2 Ensure --mlock works properly with mmap() support 2023-03-30 12:28:25 -07:00
Justine Tunney
78ca9838ee Make loading weights 10-100x faster
This is a breaking change that's going to give you three benefits:

1. Your inference commands should load 100x faster
2. You may be able to safely load models 2x larger
3. You can run many concurrent inference processes

This was accomplished by changing the file format so we can mmap()
weights directly into memory without having to read() or copy them
thereby ensuring the kernel can make its file cache pages directly
accessible to our inference processes; and secondly, that the file
cache pages are much less likely to get evicted (which would force
loads to hit disk) because they're no longer competing with memory
pages that were needlessly created by gigabytes of standard i/o.

The new file format supports single-file models like LLaMA 7b, and
it also supports multi-file models like LLaMA 13B. Our Python tool
now merges the foo.1, foo.2, etc. files back into a single file so
that the C++ code which maps it doesn't need to reshape data every
time. That's made llama.cpp so much simpler. Much of its load code
has now been deleted.

Furthermore, this change ensures that tensors are aligned properly
on a 32-byte boundary. That opens the door to seeing if we can get
additional performance gains on some microprocessors, by using ops
that require memory alignment.

Lastly note that both POSIX and the Windows platform are supported

Fixes #91
2023-03-30 12:28:25 -07:00
Slaren
a017390358 Initial windows support (untested) 2023-03-30 12:28:25 -07:00
Slaren
ac184d5147 Always initialize mm_addr and mm_length in llama_model 2023-03-30 12:28:25 -07:00
Slaren
276e5b7811 Unmap the file in llama_free 2023-03-30 12:28:25 -07:00
Slaren
d68c5dc435 Make mmap_file static 2023-03-30 12:28:25 -07:00
Slaren
64bde3ffd4 Fix ggml_init_params in quantize 2023-03-30 12:28:25 -07:00
Slaren
c03ae8dca1 Add mmap support for model files 2023-03-30 12:28:25 -07:00
Stephan Walter
3bcc129ba8 cmake : properly invoke CTest (#629) 2023-03-30 20:56:59 +03:00
Casey Primozic
a4755cf288 Remove unused variable (#607)
* It seems some new warning were added recently that exposed this.  I wrote the code that included this unused variable originally and it is indeed not needed.
2023-03-30 17:53:35 +00:00
david raistrick
1f0414feec make : fix darwin f16c flags check (#615)
...there was no check.  ported upstream from https://github.com/zanussbaum/gpt4all.cpp/pull/2 (I dont see any clean path for upstream patches)
2023-03-30 20:34:45 +03:00
Georgi Gerganov
77efdf5a50 ggml : fix NEON signs (close #620, #622) 2023-03-30 20:27:32 +03:00
slaren
ed3c680bcd Fix GGML_F32Cx8_STORE in AVX without F16C path (#619) 2023-03-30 11:16:30 +02:00
anzz1
9cbc404ba6 ci : re-enable AVX512 testing (Windows-MSVC) (#584)
* CI: Re-enable AVX512 testing (Windows-MSVC)

Now with 100% less base64 encoding

* plain __cpuid is enough here
2023-03-29 23:44:39 +03:00
Georgi Gerganov
b51c717d5c ggml : init time on first ggml_init() call 2023-03-29 22:15:34 +03:00
Georgi Gerganov
0ba76c1e73 llama : fix compile warnings when reading the vocab 2023-03-29 22:13:12 +03:00
19 changed files with 948 additions and 514 deletions

View File

@@ -176,7 +176,13 @@ jobs:
if: ${{ matrix.build == 'avx512' }}
continue-on-error: true
run: |
echo "TODO: check avx512f"
cd build
$vcdir = $(vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath)
$msvc = $(join-path $vcdir $('VC\Tools\MSVC\'+$(gc -raw $(join-path $vcdir 'VC\Auxiliary\Build\Microsoft.VCToolsVersion.default.txt')).Trim()))
$cl = $(join-path $msvc 'bin\Hostx64\x64\cl.exe')
echo 'int main(void){unsigned int a[4];__cpuid(a,7);return !(a[1]&65536);}' >> avx512f.c
& $cl /O2 /GS- /kernel avx512f.c /link /nodefaultlib /entry:main
.\avx512f.exe && echo "AVX512F: YES" && ( echo HAS_AVX512F=1 >> $env:GITHUB_ENV ) || echo "AVX512F: NO"
- name: Test
id: cmake_test

1
.gitignore vendored
View File

@@ -22,6 +22,7 @@ models/*
/result
/perplexity
/embedding
/Pipfile
arm_neon.h
compile_commands.json

View File

@@ -68,7 +68,9 @@ option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
# Compile flags
#
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED true)
set(CMAKE_C_STANDARD 11)
set(CMAKE_C_STANDARD_REQUIRED true)
set(THREADS_PREFER_PTHREAD_FLAG ON)
find_package(Threads REQUIRED)
@@ -253,7 +255,7 @@ endif()
#
if (LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
enable_testing()
include(CTest)
add_subdirectory(tests)
endif ()

View File

@@ -71,7 +71,10 @@ endif
# feel free to update the Makefile for your architecture and send a pull request or issue
ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))
ifeq ($(UNAME_S),Darwin)
CFLAGS += -mf16c
F16C_M := $(shell sysctl machdep.cpu.features)
ifneq (,$(findstring F16C,$(F16C_M)))
CFLAGS += -mf16c
endif
AVX1_M := $(shell sysctl machdep.cpu.features)
ifneq (,$(findstring FMA,$(AVX1_M)))
CFLAGS += -mfma

View File

@@ -37,9 +37,11 @@ Supported platforms:
Supported models:
- [X] LLaMA
- [X] LLaMA 🦙
- [X] [Alpaca](https://github.com/ggerganov/llama.cpp#instruction-mode-with-alpaca)
- [X] [GPT4All](https://github.com/ggerganov/llama.cpp#using-gpt4all)
- [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca)
- [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne)
---
@@ -153,8 +155,8 @@ python3 -m pip install torch numpy sentencepiece
# convert the 7B model to ggml FP16 format
python3 convert-pth-to-ggml.py models/7B/ 1
# quantize the model to 4-bits
python3 quantize.py 7B
# quantize the model to 4-bits (using method 2 = q4_0)
./quantize ./models/7B/ggml-model-f16.bin ./models/7B/ggml-model-q4_0.bin 2
# run the inference
./main -m ./models/7B/ggml-model-q4_0.bin -n 128

View File

@@ -27,9 +27,9 @@ def read_tokens(fin, vocab_size):
text_len = struct.unpack("i", fin.read(4))[0]
text_bytes = fin.read(text_len)
try:
text = text_bytes.decode("utf-8")
text = text_bytes.decode()
except UnicodeDecodeError:
text = text_bytes.decode("utf-8", "replace")
text = text_bytes.decode(errors="replace")
score = struct.unpack("f", fin.read(4))[0]
tokens.append((text, score))
return tokens
@@ -82,7 +82,12 @@ def read_variables(fin):
shape = tuple(struct.unpack("i" * n_dims, fin.read(4 * n_dims)))
shape = shape[::-1]
name = fin.read(name_length).decode("utf-8")
name = fin.read(name_length).decode()
# ensure tensor data is aligned
tensor_data_offset = fin.tell()
tensor_data_offset = (tensor_data_offset + 31) & -32
fin.seek(tensor_data_offset)
if ftype_cur == 2:
# 4-bit quantized weights
@@ -194,7 +199,7 @@ def chat(model, hparams, llama_dir):
device = torch.device("cpu")
llama = llama.to(device)
ctx = """You are AI.
ctx = """You are AI.
This is a dialog, where User interacts with AI. AI is helpful, kind, obedient, honest, respectful, direct, concise, should try to protect User's privacy, and knows its own limits. Also, AI must answer User and AI cannot stop the conversation by itself.
User: Hello, AI.
AI: Hello! How can I assist you today?
@@ -202,11 +207,11 @@ AI: Hello! How can I assist you today?
print(ctx.rstrip("\n"))
while True:
print("-" * 60)
prompt = input(f"User: ")
prompt = input("User: ")
if ctx != "":
ctx = ctx + "User: " + prompt + "\n"
ctx = f"{ctx}User: {prompt}\n"
else:
ctx = prompt + "\nAI:"
ctx = f"{prompt}\nAI:"
ctx = (ctx[-1920:]) if len(ctx) >= 2048 else ctx
@@ -231,7 +236,7 @@ AI: Hello! How can I assist you today?
)
s = generation_output.sequences[0]
decoded = tokenizer.decode(s)
ctx = decoded + "\n"
ctx = f"{decoded}\n"
def main():

View File

@@ -49,7 +49,7 @@ def write_header(f_out, header):
def write_tokens(fout, tokenizer):
for i in range(tokenizer.vocab_size()):
if tokenizer.is_unknown(i):
text = " \u2047 ".encode("utf-8")
text = " \u2047 ".encode()
elif tokenizer.is_control(i):
text = b""
elif tokenizer.is_byte(i):
@@ -60,13 +60,13 @@ def write_tokens(fout, tokenizer):
byte_value = int(piece[3:-1], 16)
text = struct.pack("B", byte_value)
else:
text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode()
fout.write(struct.pack("i", len(text)))
fout.write(text)
fout.write(struct.pack("f", tokenizer.get_score(i)))
# TODO: GPT4All - add extra <pad> token
text = "<pad>".encode("utf-8")
text = "<pad>".encode()
fout.write(struct.pack("i", len(text)))
fout.write(text)
fout.write(struct.pack("f", 0.0))

View File

@@ -50,7 +50,7 @@ fout.write(struct.pack("i", 4))
# This loop unchanged from convert-pth-to-ggml.py:
for i in range(tokenizer.vocab_size()):
if tokenizer.is_unknown(i):
text = " \u2047 ".encode("utf-8")
text = " \u2047 ".encode()
elif tokenizer.is_control(i):
text = b""
elif tokenizer.is_byte(i):
@@ -61,21 +61,26 @@ for i in range(tokenizer.vocab_size()):
byte_value = int(piece[3:-1], 16)
text = struct.pack("B", byte_value)
else:
text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode()
fout.write(struct.pack("i", len(text)))
fout.write(text)
fout.write(struct.pack("f", tokenizer.get_score(i)))
def write_header(shape, dst_name, ftype_cur):
sname = dst_name.encode('utf-8')
sname = dst_name.encode()
fout.write(struct.pack("iii", len(shape), len(sname), ftype_cur))
fout.write(struct.pack("i" * len(shape), *shape[::-1]))
fout.write(sname)
# ensure tensor data is aligned
tensor_data_offset = fout.tell()
tensor_data_offset = (tensor_data_offset + 31) & -32
fout.seek(tensor_data_offset)
def convert_non_q4(src_name, dst_name):
v = model[src_name]
shape = v.shape
print("Processing non-Q4 variable: " + src_name + " with shape: ", shape, " and type: ", v.dtype)
print(f"Processing non-Q4 variable: {src_name} with shape: {shape} and type: {v.dtype}")
if len(shape) == 1:
print(" Converting to float32")
v = v.to(torch.float32)
@@ -100,7 +105,7 @@ def convert_q4(src_name, dst_name, permute=False):
# Each int32 item is actually 8 int4 items packed together, and it's transposed.
shape = (qweight.shape[0], qweight.shape[1] * 8)
print("Processing Q4 variable: " + src_name + " with shape: ", shape)
print(f"Processing Q4 variable: {src_name} with shape: {shape}")
# The output format has the int4 weights in groups of 32 rather than 8.
# It looks like this:
@@ -163,5 +168,5 @@ for i in range(n_layer):
fout.close()
print("Done. Output file: " + fname_out)
print("")
print(f"Done. Output file: {fname_out}")
print()

View File

@@ -1,4 +1,4 @@
# Convert a LLaMA model checkpoint to a ggml compatible file
# Convert a LLaMA model checkpoint to a ggjt compatible file
#
# Load the model using Torch
# Iterate over all variables and write them to a binary file.
@@ -24,8 +24,57 @@ import torch
from sentencepiece import SentencePieceProcessor
def parse_args():
QK = 32
GGML_TYPE_Q4_0 = 0
GGML_TYPE_Q4_1 = 1
GGML_TYPE_I8 = 2
GGML_TYPE_I16 = 3
GGML_TYPE_I32 = 4
GGML_TYPE_F16 = 5
GGML_TYPE_F32 = 6
WTYPES = {
0: GGML_TYPE_F32,
1: GGML_TYPE_F16,
2: GGML_TYPE_Q4_0,
3: GGML_TYPE_Q4_1,
}
GGML_BLCK_SIZE = {
GGML_TYPE_Q4_0: QK,
GGML_TYPE_Q4_1: QK,
GGML_TYPE_I8: 1,
GGML_TYPE_I16: 1,
GGML_TYPE_I32: 1,
GGML_TYPE_F16: 1,
GGML_TYPE_F32: 1,
}
GGML_TYPE_SIZE = {
GGML_TYPE_Q4_0: 4 + QK//2,
GGML_TYPE_Q4_1: 4*2 + QK//2,
GGML_TYPE_I8: 1,
GGML_TYPE_I16: 2,
GGML_TYPE_I32: 4,
GGML_TYPE_F16: 2,
GGML_TYPE_F32: 4,
}
def ggml_nelements(shape):
r = 1
for i in shape:
r *= i
return r
def ggml_nbytes(shape, ftype):
x = ggml_nelements(shape)
t = WTYPES[ftype]
x *= GGML_TYPE_SIZE[t]
x //= GGML_BLCK_SIZE[t]
return x
def parse_args():
parser = argparse.ArgumentParser(description='Convert a LLaMA model checkpoint to a ggml compatible file')
parser.add_argument('dir_model', help='directory containing the model checkpoint')
parser.add_argument('ftype', help='file type (0: float32, 1: float16)', type=int, choices=[0, 1], default=1)
@@ -33,7 +82,6 @@ def parse_args():
return parser.parse_args()
def get_n_parts(dim):
mappings = {4096: 1, 5120: 2, 6656: 4, 8192: 8}
n_parts = mappings.get(dim)
if n_parts is None:
@@ -44,30 +92,24 @@ def get_n_parts(dim):
return n_parts
def load_hparams_and_tokenizer(dir_model):
# `dir_model` is something like `models/7B` or `models/7B/`.
# "tokenizer.model" is expected under model's parent dir.
# When `dir_model` is a symlink, f"{dir_model}/../tokenizer.model" would not be found.
# Let's use the model's parent dir directly.
model_parent_dir = os.path.dirname(os.path.normpath(dir_model))
fname_hparams = f"{dir_model}/params.json"
fname_tokenizer = f"{model_parent_dir}/tokenizer.model"
with open(fname_hparams, "r") as f:
hparams = json.load(f)
print(hparams)
tokenizer = SentencePieceProcessor(fname_tokenizer)
hparams.update({"vocab_size": tokenizer.vocab_size()})
return hparams, tokenizer
def write_header(fout, hparams, ftype):
keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
values = [
0x67676d66, # magic: ggmf in hex
0x67676a74, # magic: ggjt in hex
1, # file version
*[hparams[key] for key in keys],
hparams["dim"] // hparams["n_heads"], # rot (obsolete)
@@ -76,10 +118,9 @@ def write_header(fout, hparams, ftype):
fout.write(struct.pack("i" * len(values), *values))
def write_tokens(fout, tokenizer):
for i in range(tokenizer.vocab_size()):
if tokenizer.is_unknown(i):
text = " \u2047 ".encode("utf-8")
text = " \u2047 ".encode()
elif tokenizer.is_control(i):
text = b""
elif tokenizer.is_byte(i):
@@ -90,90 +131,144 @@ def write_tokens(fout, tokenizer):
byte_value = int(piece[3:-1], 16)
text = struct.pack("B", byte_value)
else:
text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode()
fout.write(struct.pack("i", len(text)))
fout.write(text)
fout.write(struct.pack("f", tokenizer.get_score(i)))
def process_and_write_variables(fout, model, ftype):
def process_and_write_variables(fout, model, ftype, part_id, n_parts):
for name, datao in model.items():
if name.endswith("freqs"):
continue
shape = datao.shape
print(f"Processing variable: {name} with shape: {shape} and type: {datao.dtype}")
# remove dimensions with a single element
data = datao.numpy().squeeze()
n_dims = len(shape)
partshape = data.shape
n_dims = len(data.shape)
assert n_dims in (1, 2)
# default type is fp16
print(f"Processing variable: {name} with shape: {partshape} and type: {datao.dtype}")
# coerce single-dimensional tensors from float16 to float32
ftype_cur = 1
if ftype == 0 or n_dims == 1:
print(" Converting to float32")
data = data.astype(np.float32)
ftype_cur = 0
blck_size = GGML_BLCK_SIZE[WTYPES[ftype_cur]]
type_size = GGML_TYPE_SIZE[WTYPES[ftype_cur]]
# header
sname = name.encode('utf-8')
fout.write(struct.pack("iii", len(data.shape), len(sname), ftype_cur))
for dim in reversed(data.shape):
# determine dimension along which multipart tensor is sharded
#
# split_dim 0 regex:
# - output.*
# - layers.*.attention.wq.weight
# - layers.*.attention.wk.weight
# - layers.*.attention.wv.weight
# - layers.*.feed_forward.w1.weight
# - layers.*.feed_forward.w3.weight
#
# split_dim 1 regex:
# - tok_embeddings.*
# - layers.*.attention.wo.weight
# - layers.*.feed_forward.w2.weight
#
if n_dims > 1:
split_dim = 1
if "tok_embeddings" in name:
split_dim = 1
elif "layers" in name:
if "attention.wo.weight" in name:
split_dim = 1
elif "feed_forward.w2.weight" in name:
split_dim = 1
else:
split_dim = 0
elif "output" in name:
split_dim = 0
# output tensor header
fullshape = list(partshape)
if n_dims > 1:
fullshape[split_dim] *= n_parts
sname = name.encode()
fout.write(struct.pack("iii", n_dims, len(sname), ftype_cur))
for dim in reversed(fullshape):
fout.write(struct.pack("i", dim))
fout.write(sname)
# data output to file
data.tofile(fout)
# ensure tensor data is aligned
tensor_data_offset = fout.tell()
while tensor_data_offset % QK != 0:
fout.write(struct.pack("B", 0))
tensor_data_offset += 1
# output unified mappable tensor data
if n_dims == 1 or n_parts == 1:
# copy tensor which we thankfully received in one piece
if part_id == 0:
data.tofile(fout)
elif split_dim == 0:
# reassemble multifile tensor containing some of the rows
rows_per_chunk = partshape[0]
current_row = part_id * rows_per_chunk
bytes_per_row = fullshape[1] // blck_size * type_size
offset = current_row * bytes_per_row
fout.seek(tensor_data_offset + offset)
data.tofile(fout)
elif split_dim == 1:
# reassemble multifile tensor containing some of the cols
cols_per_chunk = partshape[1]
current_col = part_id * cols_per_chunk
bytes_per_row = fullshape[1] // blck_size * type_size
offset_current_col = current_col // blck_size * type_size
for row in range(partshape[0]):
offset_row = row * bytes_per_row
offset = offset_row + offset_current_col
fout.seek(tensor_data_offset + offset)
data[row].tofile(fout)
# advance file position to next tensor
fout.seek(tensor_data_offset + ggml_nbytes(fullshape, ftype_cur))
def main():
args = parse_args()
dir_model = args.dir_model
ftype = args.ftype
ftype_str = ["f32", "f16"]
hparams, tokenizer = load_hparams_and_tokenizer(dir_model)
print(args)
# if only writing vocab to file
if args.vocab_only:
fname_model = f"{dir_model}/consolidated.00.pth"
fname_out = f"{dir_model}/ggml-vocab.bin"
print(f"Extracting only the vocab from '{fname_model}'\n")
with open(fname_out, "wb") as fout:
write_header(fout, hparams, ftype)
write_tokens(fout, tokenizer)
print(f"Done. Output file: {fname_out}\n")
return
n_parts = get_n_parts(hparams["dim"])
fname_out = f"{dir_model}/ggml-model-{ftype_str[ftype]}.bin"
for p in range(n_parts):
# we output a single file for ggml
with open(fname_out, "wb") as fout:
write_header(fout, hparams, ftype)
write_tokens(fout, tokenizer)
offset_of_tensors = fout.tell()
# the tensors we load could be split across multiple files
for part_id in range(n_parts):
fout.seek(offset_of_tensors)
print(f"Processing part {part_id+1} of {n_parts}\n")
fname_model = f"{dir_model}/consolidated.0{part_id}.pth"
model = torch.load(fname_model, map_location="cpu")
process_and_write_variables(fout, model, ftype, part_id, n_parts)
del model
print(f"Processing part {p+1} of {n_parts}\n")
fname_model = f"{dir_model}/consolidated.0{p}.pth"
fname_out = f"{dir_model}/ggml-model-{ftype_str[ftype]}.bin{'' if p == 0 else '.' + str(p)}"
model = torch.load(fname_model, map_location="cpu")
with open(fname_out, "wb") as fout:
write_header(fout, hparams, ftype)
write_tokens(fout, tokenizer)
process_and_write_variables(fout, model, ftype)
del model
print(f"Done. Output file: {fname_out}, (part {p})\n")
print(f"Done. Output file: {fname_out}\n")
if __name__ == "__main__":
main()

View File

@@ -44,7 +44,7 @@ def write_header(f_out, header):
def write_tokens(fout, tokenizer):
for i in range(tokenizer.vocab_size()):
if tokenizer.is_unknown(i):
text = " \u2047 ".encode("utf-8")
text = " \u2047 ".encode()
elif tokenizer.is_control(i):
text = b""
elif tokenizer.is_byte(i):
@@ -55,7 +55,7 @@ def write_tokens(fout, tokenizer):
byte_value = int(piece[3:-1], 16)
text = struct.pack("B", byte_value)
else:
text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode()
fout.write(struct.pack("i", len(text)))
fout.write(text)
fout.write(struct.pack("f", tokenizer.get_score(i)))

View File

@@ -66,6 +66,11 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
break;
}
std::ifstream file(argv[i]);
if (!file) {
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
invalid_param = true;
break;
}
std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
if (params.prompt.back() == '\n') {
params.prompt.pop_back();

View File

@@ -19,7 +19,7 @@ int main(int argc, char ** argv) {
// needed to initialize f16 tables
{
struct ggml_init_params params = { 0, NULL };
struct ggml_init_params params = { 0, NULL, false };
struct ggml_context * ctx = ggml_init(params);
ggml_free(ctx);
}

250
ggml.c
View File

@@ -461,6 +461,39 @@ static inline __m128i packNibbles( __m256i bytes )
__m128i r1 = _mm256_extracti128_si256( bytes, 1 );
return _mm_packus_epi16( r0, r1 );
}
#elif __AVX__
static inline __m128i bytesFromNibbles( const uint8_t* rsi )
{
// Load 8 bytes from memory
__m128i tmp = _mm_loadu_si64( ( const __m128i* )rsi );
// Expand bytes into uint16_t values
__m128i bytes = _mm_cvtepu8_epi16( tmp );
// Unpack values into individual bytes
const __m128i lowMask = _mm_set1_epi8( 0xF );
__m128i high = _mm_andnot_si128( lowMask, bytes );
__m128i low = _mm_and_si128( lowMask, bytes );
high = _mm_slli_epi16( high, 4 );
bytes = _mm_or_si128( low, high );
return bytes;
}
static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 )
{
// Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
const __m128i lowByte = _mm_set1_epi16( 0xFF );
__m128i high = _mm_andnot_si128( lowByte, bytes1 );
__m128i low = _mm_and_si128( lowByte, bytes1 );
high = _mm_srli_epi16( high, 4 );
bytes1 = _mm_or_si128( low, high );
high = _mm_andnot_si128( lowByte, bytes2 );
low = _mm_and_si128( lowByte, bytes2 );
high = _mm_srli_epi16( high, 4 );
bytes2 = _mm_or_si128( low, high );
return _mm_packus_epi16( bytes1, bytes2);
}
#endif
// method 5
@@ -509,8 +542,8 @@ static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * r
const uint8_t vi0 = (int8_t)roundf(v0) + 8;
const uint8_t vi1 = (int8_t)roundf(v1) + 8;
assert(vi0 >= 0 && vi0 < 16);
assert(vi1 >= 0 && vi1 < 16);
assert(vi0 < 16);
assert(vi1 < 16);
pp[l/2] = vi0 | (vi1 << 4);
}
@@ -660,6 +693,80 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
__m128i res = packNibbles( i0 );
_mm_storeu_si128( ( __m128i* )y[i].qs, res );
}
#elif defined(__AVX__)
for (int i = 0; i < nb; i++) {
// Load elements into 4 AVX vectors
__m256 v0 = _mm256_loadu_ps( x );
__m256 v1 = _mm256_loadu_ps( x + 8 );
__m256 v2 = _mm256_loadu_ps( x + 16 );
__m256 v3 = _mm256_loadu_ps( x + 24 );
x += 32;
// Compute max(abs(e)) for the block
const __m256 signBit = _mm256_set1_ps( -0.0f );
__m256 maxAbs = _mm256_andnot_ps( signBit, v0 );
maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) );
maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) );
maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) );
__m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) );
max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) );
max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) );
const float maxScalar = _mm_cvtss_f32( max4 );
// Quantize these floats
const float d = maxScalar / 7.0f;
y[i].d = d;
const float id = ( maxScalar != 0.0f ) ? 7.0f / maxScalar : 0.0f;
const __m256 mul = _mm256_set1_ps( id );
// Apply the multiplier
v0 = _mm256_mul_ps( v0, mul );
v1 = _mm256_mul_ps( v1, mul );
v2 = _mm256_mul_ps( v2, mul );
v3 = _mm256_mul_ps( v3, mul );
// Round to nearest integer
v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST );
v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST );
v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST );
v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST );
// Convert floats to integers
__m256i i0 = _mm256_cvtps_epi32( v0 );
__m256i i1 = _mm256_cvtps_epi32( v1 );
__m256i i2 = _mm256_cvtps_epi32( v2 );
__m256i i3 = _mm256_cvtps_epi32( v3 );
// Since we don't have in AVX some necessary functions,
// we split the registers in half and call AVX2 analogs from SSE
__m128i ni0 = _mm256_castsi256_si128( i0 );
__m128i ni1 = _mm256_extractf128_si256( i0, 1);
__m128i ni2 = _mm256_castsi256_si128( i1 );
__m128i ni3 = _mm256_extractf128_si256( i1, 1);
__m128i ni4 = _mm256_castsi256_si128( i2 );
__m128i ni5 = _mm256_extractf128_si256( i2, 1);
__m128i ni6 = _mm256_castsi256_si128( i3 );
__m128i ni7 = _mm256_extractf128_si256( i3, 1);
// Convert int32 to int16
ni0 = _mm_packs_epi32( ni0, ni1 );
ni2 = _mm_packs_epi32( ni2, ni3 );
ni4 = _mm_packs_epi32( ni4, ni5 );
ni6 = _mm_packs_epi32( ni6, ni7 );
// Convert int16 to int8
ni0 = _mm_packs_epi16( ni0, ni2 );
ni4 = _mm_packs_epi16( ni4, ni6 );
// Apply offset to translate the range from [ -7 .. +7 ] into [ +1 .. +15 ]
const __m128i off = _mm_set1_epi8( 8);
ni0 = _mm_add_epi8( ni0, off );
ni4 = _mm_add_epi8( ni4, off );
// Compress the vector into 4 bit/value, and store
__m128i res = packNibbles( ni0, ni4 );
_mm_storeu_si128( ( __m128i* )y[i].qs, res );
}
#elif defined(__wasm_simd128__)
for (int i = 0; i < nb; i++) {
float amax = 0.0f; // absolute max
@@ -730,8 +837,8 @@ static void quantize_row_q4_1_reference(const float * restrict x, void * restric
const uint8_t vi0 = roundf(v0);
const uint8_t vi1 = roundf(v1);
assert(vi0 >= 0 && vi0 < 16);
assert(vi1 >= 0 && vi1 < 16);
assert(vi0 < 16);
assert(vi1 < 16);
pp[l/2] = vi0 | (vi1 << 4);
}
@@ -1038,8 +1145,8 @@ static void dequantize_row_q4_1(const void * restrict vx, float * restrict y, in
const uint8x16_t vq = vcombine_u8(vx_0, vx_1);
// convert to 2x uint16x8_t
const uint16x8_t vi_0 = vmovl_s8(vget_low_u8 (vq));
const uint16x8_t vi_1 = vmovl_s8(vget_high_u8(vq));
const uint16x8_t vi_0 = vmovl_u8(vget_low_u8 (vq));
const uint16x8_t vi_1 = vmovl_u8(vget_high_u8(vq));
// convert to 4x float32x4_t
const float32x4_t vf_0 = vcvtq_f32_u32(vmovl_u16(vget_low_u16 (vi_0)));
@@ -1297,7 +1404,7 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
_mm256_storeu_ps(arr, y);
for (int i = 0; i < 8; i++)
x[i] = GGML_FP16_TO_FP32(arr[i]);
x[i] = GGML_FP32_TO_FP16(arr[i]);
}
#define GGML_F32Cx8_LOAD(x) __avx_f32cx8_load(x)
#define GGML_F32Cx8_STORE(x, y) __avx_f32cx8_store(x, y)
@@ -1726,7 +1833,7 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
const block_q4_0 * restrict x = vx;
const block_q4_0 * restrict y = vy;
ggml_float sumf = 0.0;
float sumf = 0.0;
#if defined(__ARM_NEON)
float sum0 = 0.0f;
@@ -1821,7 +1928,7 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
#endif
}
sumf = (ggml_float)(sum0 + sum1);
sumf = sum0 + sum1;
#elif defined(__AVX512F__)
// Initialize accumulator with zeros
__m512 acc0 = _mm512_setzero_ps();
@@ -1829,7 +1936,6 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
const int superblock_size = 8;
const int superblock_count = nb / superblock_size;
const int remainder = nb % superblock_size;
for (int superblock_ix = 0; superblock_ix < superblock_count; superblock_ix += 1) {
int i = superblock_ix * superblock_size;
@@ -1856,6 +1962,10 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
__m256 acc = _mm256_setzero_ps();
// Main loop
// TODO: figure a way to do this in a portable way
#ifdef __GNUC__
#pragma GCC unroll 16
#endif
for (int i = 0; i < nb; ++i) {
// Compute combined scale for the block
const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) );
@@ -1869,20 +1979,21 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
bx = _mm256_sub_epi8( bx, off );
by = _mm256_sub_epi8( by, off );
// Sign-extend first 16 signed bytes into int16_t
__m256i x16 = _mm256_cvtepi8_epi16( _mm256_castsi256_si128( bx ) );
__m256i y16 = _mm256_cvtepi8_epi16( _mm256_castsi256_si128( by ) );
// Compute products of int16_t integers, add pairwise
__m256i i32 = _mm256_madd_epi16( x16, y16 );
// Get absolute values of x vectors
const __m256i ax = _mm256_sign_epi8(bx, bx);
// Sign-extend last 16 signed bytes into int16_t vectors
x16 = _mm256_cvtepi8_epi16( _mm256_extracti128_si256( bx, 1 ) );
y16 = _mm256_cvtepi8_epi16( _mm256_extracti128_si256( by, 1 ) );
// Accumulate products of int16_t integers
i32 = _mm256_add_epi32( i32, _mm256_madd_epi16( x16, y16 ) );
// Sign the values of the y vectors
const __m256i sy = _mm256_sign_epi8(by, bx);
// Perform multiplication and create 16-bit values
const __m256i dot = _mm256_maddubs_epi16(ax, sy);
const __m256i ones = _mm256_set1_epi16(1);
const __m256i i32 = _mm256_madd_epi16(ones, dot);
// Convert int32_t to float
__m256 p = _mm256_cvtepi32_ps( i32 );
const __m256 p = _mm256_cvtepi32_ps( i32 );
// Apply the scale, and accumulate
acc = _mm256_fmadd_ps( d, p, acc );
}
@@ -1893,6 +2004,52 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
res = _mm_add_ps( res, _mm_movehl_ps( res, res ) );
res = _mm_add_ss( res, _mm_movehdup_ps( res ) );
sumf = _mm_cvtss_f32( res );
#elif defined(__AVX__)
// Initialize accumulator with zeros
__m256 acc = _mm256_setzero_ps();
// Main loop
for (int i = 0; i < nb; ++i) {
// Compute combined scale for the block
const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) );
__m128i i32[2];
for (int j = 0; j < 2; ++j) {
// Load 8 bytes, and unpack 4 bit fields into bytes, making 16 bytes
__m128i bx = bytesFromNibbles( x[i].qs + 8*j );
__m128i by = bytesFromNibbles( y[i].qs + 8*j );
// Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval.
const __m128i off = _mm_set1_epi8( 8 );
bx = _mm_sub_epi8( bx, off );
by = _mm_sub_epi8( by, off );
// Get absolute values of x vectors
const __m128i ax = _mm_sign_epi8(bx, bx);
// Sign the values of the y vectors
const __m128i sy = _mm_sign_epi8(by, bx);
// Perform multiplication and create 16-bit values
const __m128i dot = _mm_maddubs_epi16(ax, sy);
const __m128i ones = _mm_set1_epi16(1);
i32[j] = _mm_madd_epi16(ones, dot);
}
// Convert int32_t to float
__m256 p = _mm256_cvtepi32_ps( _mm256_set_m128i( i32[0], i32[1] ));
// Apply the scale, and accumulate
acc = _mm256_add_ps(_mm256_mul_ps( d, p ), acc);
}
// Return horizontal sum of the acc vector
__m128 res = _mm256_extractf128_ps( acc, 1 );
res = _mm_add_ps( res, _mm256_castps256_ps128( acc ) );
res = _mm_add_ps( res, _mm_movehl_ps( res, res ) );
res = _mm_add_ss( res, _mm_movehdup_ps( res ) );
sumf = _mm_cvtss_f32( res );
#elif defined(__wasm_simd128__)
// wasm simd
@@ -2530,8 +2687,9 @@ struct ggml_context {
void * mem_buffer;
bool mem_buffer_owned;
bool mem_buffer_mlocked;
bool no_alloc;
int n_objects;
int n_objects;
struct ggml_object * objects_begin;
struct ggml_object * objects_end;
@@ -2748,6 +2906,9 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
static bool is_first_call = true;
if (is_first_call) {
// initialize time system (required on Windows)
ggml_time_init();
// initialize GELU, SILU and EXP F32 tables
{
const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
@@ -2813,6 +2974,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
/*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : malloc(params.mem_size),
/*.mem_buffer_owned =*/ params.mem_buffer ? false : true,
/*.mem_buffer_mlocked =*/ false,
/*.no_alloc =*/ params.no_alloc,
/*.n_objects =*/ 0,
/*.objects_begin =*/ NULL,
/*.objects_end =*/ NULL,
@@ -2880,36 +3042,47 @@ size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch)
return result;
}
#ifdef __APPLE__
#define MLOCK_SUGGESTION \
"Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
"decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l).\n"
#else
#define MLOCK_SUGGESTION \
"Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n"
#endif
bool ggml_mlock_supported(void) {
return GGML_MLOCK_SUPPORT;
}
bool ggml_mlock(
struct ggml_context * ctx,
const void *opt_extra_addr,
size_t opt_extra_len,
char **err_p) {
// TODO: Use SetProcessWorkingSetSize() + VirtualLock() on WIN32
#if GGML_MLOCK_SUPPORT
#ifdef __APPLE__
#define MLOCK_SUGGESTION "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or\n" \
"decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l)."
#else
#define MLOCK_SUGGESTION "Try increasing RLIMIT_MLOCK (ulimit -l)."
#endif
bool ggml_mlock(struct ggml_context * ctx, char ** err_p) {
if (ctx->mem_buffer_mlocked) {
return true;
}
if (mlock(ctx->mem_buffer, ctx->mem_size)) {
int ret = asprintf(err_p, "failed to mlock %zu-byte buffer: %s\n" MLOCK_SUGGESTION,
ctx->mem_size, strerror(errno));
GGML_ASSERT(ret >= 0);
if (mlock(ctx->mem_buffer, ctx->mem_size) ||
(opt_extra_len &&
mlock(opt_extra_addr, opt_extra_len))) {
if ((*err_p = malloc(1024))) {
snprintf(*err_p, 1024,
"failed to mlock %zu-byte buffer: %s\n" MLOCK_SUGGESTION,
ctx->mem_size + opt_extra_len,
strerror(errno));
}
return false;
}
ctx->mem_buffer_mlocked = true;
return true;
}
#else // GGML_MLOCK_SUPPORT
bool ggml_mlock(struct ggml_context * ctx, char ** err_p) {
*err_p = strdup("can't mlock because it's not supported on this system");
return false;
}
#endif // GGML_MLOCK_SUPPORT
}
////////////////////////////////////////////////////////////////////////////////
@@ -2928,7 +3101,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
size_t size_needed = 0;
if (data == NULL) {
if (data == NULL && !ctx->no_alloc) {
size_needed += GGML_TYPE_SIZE[type]*(ne[0]/GGML_BLCK_SIZE[type]);
for (int i = 1; i < n_dims; i++) {
size_needed *= ne[i];
@@ -3012,7 +3185,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
/*.perf_runs =*/ 0,
/*.perf_cycles =*/ 0,
/*.perf_time_us =*/ 0,
/*.data =*/ data == NULL ? (void *)(result + 1) : data,
/*.data =*/ (data == NULL && !ctx->no_alloc) ? (void *)(result + 1) : data,
/*.pad =*/ { 0 },
};
@@ -10275,6 +10448,7 @@ enum ggml_opt_result ggml_opt(
struct ggml_init_params params_ctx = {
.mem_size = 16*1024*1024,
.mem_buffer = NULL,
.no_alloc = false,
};
ctx = ggml_init(params_ctx);

7
ggml.h
View File

@@ -316,6 +316,7 @@ struct ggml_init_params {
// memory pool
size_t mem_size; // bytes
void * mem_buffer; // if NULL, memory will be allocated internally
bool no_alloc; // don't allocate memory for the tensor data
};
void ggml_time_init(void); // call this once at the beginning of the program
@@ -344,7 +345,11 @@ size_t ggml_used_mem(const struct ggml_context * ctx);
size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
bool ggml_mlock_supported(void);
bool ggml_mlock(struct ggml_context * ctx, char ** err_p);
bool ggml_mlock(
struct ggml_context * ctx,
const void *opt_extra_addr,
size_t opt_extra_len,
char **err_p);
struct ggml_tensor * ggml_new_tensor(
struct ggml_context * ctx,

477
llama.cpp
View File

@@ -12,6 +12,19 @@
#include <cassert>
#include <cstring>
#if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES)
#define WIN32_LEAN_AND_MEAN
#include <Windows.h>
#else
#include <sys/types.h>
#include <sys/mman.h>
#include <unistd.h>
#include <fcntl.h>
#endif
#define Min(X, Y) ((Y) > (X) ? (X) : (Y))
#define Max(X, Y) ((Y) < (X) ? (X) : (Y))
#define LLAMA_USE_SCRATCH
#define LLAMA_MAX_SCRATCH_BUFFERS 16
@@ -142,6 +155,10 @@ struct llama_model {
// the model memory buffer
std::vector<uint8_t> buf;
// model memory mapped file
void * mm_addr = NULL;
uint64_t mm_length = 0;
// tensors
int n_loaded;
std::unordered_map<std::string, struct ggml_tensor *> tensors;
@@ -165,6 +182,7 @@ struct llama_context {
int64_t t_load_us = 0;
int64_t t_start_us = 0;
bool has_evaluated_once = false;
int64_t t_sample_us = 0;
int64_t t_eval_us = 0;
@@ -206,7 +224,7 @@ struct llama_context {
}
if (buf_last >= 0) {
buf_max_size[buf_last] = std::max(buf_max_size[buf_last], last_size);
buf_max_size[buf_last] = Max(buf_max_size[buf_last], last_size);
}
buf_last = i;
@@ -246,6 +264,7 @@ static bool kv_cache_init(
struct ggml_init_params params;
params.mem_size = cache.buf.size();
params.mem_buffer = cache.buf.data();
params.no_alloc = false;
cache.ctx = ggml_init(params);
@@ -288,6 +307,58 @@ struct llama_context_params llama_context_default_params() {
// model loading
//
static void *mmap_file(const char *fname, uint64_t *mm_length) {
#if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES)
HANDLE hFile = CreateFileA(fname,
GENERIC_READ,
FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
NULL,
OPEN_EXISTING,
FILE_ATTRIBUTE_NORMAL | FILE_ATTRIBUTE_NOT_CONTENT_INDEXED,
NULL);
if (hFile == INVALID_HANDLE_VALUE) return 0;
LARGE_INTEGER fileSize;
fileSize.QuadPart = -1;
GetFileSizeEx(hFile, &fileSize);
int64_t length = fileSize.QuadPart;
HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
CloseHandle(hFile);
if (!hMapping) return 0;
void *addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
CloseHandle(hMapping);
if (!addr) return 0;
#else
int fd = open(fname, O_RDONLY);
if (fd == -1) return 0;
int64_t length = lseek(fd, 0, SEEK_END);
void *addr = mmap(NULL, length, PROT_READ, MAP_SHARED, fd, 0);
close(fd);
if (addr == MAP_FAILED) return 0;
#endif
*mm_length = length;
return addr;
}
static void munmap_file(void * addr, size_t length) {
#if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES)
UnmapViewOfFile(addr);
#else
munmap(addr, length);
#endif
}
static bool report_bad_magic(const char *path, uint32_t got, uint32_t want) {
fprintf(stderr,
"%s: invalid model file (bad magic [got %#x want %#x])\n"
"\tyou most likely need to regenerate your ggml files\n"
"\tthe benefit is you'll get 10-100x faster load times\n"
"\tsee https://github.com/ggerganov/llama.cpp/issues/91\n"
"\tuse convert-pth-to-ggml.py to regenerate from original pth\n"
"\tuse migrate-ggml-2023-03-30-pr613.py if you deleted originals\n",
path, got, want);
return false;
}
static bool llama_model_load(
const std::string & fname,
llama_context & lctx,
@@ -299,22 +370,24 @@ static bool llama_model_load(
void *progress_callback_user_data) {
fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
const int64_t t_start_us = ggml_time_us();
lctx.t_start_us = t_start_us;
std::vector<char> f_buf(1024*1024);
lctx.t_start_us = ggml_time_us();
auto & model = lctx.model;
auto & vocab = lctx.vocab;
auto fin = std::ifstream(fname, std::ios::binary);
fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
if (!fin) {
fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
return false;
}
std::vector<char> f_buf(1024*1024);
fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
fin.seekg(0, fin.end);
const size_t file_size = fin.tellg();
fin.seekg(0);
// verify magic
{
uint32_t magic;
@@ -325,8 +398,7 @@ static bool llama_model_load(
return false;
}
if (magic != LLAMA_FILE_MAGIC) {
fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
return false;
return report_bad_magic(fname.c_str(), magic, LLAMA_FILE_MAGIC);
}
uint32_t format_version;
@@ -449,43 +521,24 @@ static bool llama_model_load(
}
}
// map model into memory
char *mm_addr = NULL;
model.mm_addr = mmap_file(fname.c_str(), &model.mm_length);
if (model.mm_addr == NULL) {
fprintf(stderr, "%s: failed to mmap '%s'\n", __func__, fname.c_str());
return false;
}
mm_addr = (char *)model.mm_addr;
fprintf(stderr, "%s: ggml map size = %6.2f MB\n", __func__, model.mm_length/(1024.0*1024.0));
auto & ctx = model.ctx;
size_t ctx_size = 0;
{
const auto & hparams = model.hparams;
const int n_embd = hparams.n_embd;
const auto &hparams = model.hparams;
const int n_layer = hparams.n_layer;
const int n_ctx = hparams.n_ctx;
const int n_vocab = hparams.n_vocab;
ctx_size += n_embd*n_vocab*ggml_type_sizef(vtype); // tok_embeddings
ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // norm
ctx_size += n_embd*n_vocab*ggml_type_sizef(vtype); // output
ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // attention_norm
ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wq
ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wk
ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wv
ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wo
ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ffn_norm
ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w1
ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w2
ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w3
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_k
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_v
ctx_size += (5 + 10*n_layer)*256; // object overhead
fprintf(stderr, "%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
fprintf(stderr, "%s: ggml ctx size = %6.2f KB\n", __func__, ctx_size/1024.0);
}
// print memory requirements
@@ -495,6 +548,7 @@ static bool llama_model_load(
// this is the total memory required to run the inference
const size_t mem_required =
ctx_size +
model.mm_length +
MEM_REQ_SCRATCH0.at(model.type) +
MEM_REQ_SCRATCH1.at(model.type) +
MEM_REQ_EVAL.at (model.type);
@@ -514,6 +568,7 @@ static bool llama_model_load(
struct ggml_init_params params = {
/*.mem_size =*/ lctx.model.buf.size(),
/*.mem_buffer =*/ lctx.model.buf.data(),
/*.no_alloc =*/ true,
};
model.ctx = ggml_init(params);
@@ -576,234 +631,106 @@ static bool llama_model_load(
}
}
const size_t file_offset = fin.tellg();
fin.close();
std::vector<uint8_t> tmp;
if (progress_callback) {
progress_callback(0.0, progress_callback_user_data);
}
for (int i = 0; i < n_parts; ++i) {
const int part_id = i;
//const int part_id = n_parts - i - 1;
fprintf(stderr, "%s: loading tensors from '%s'\n", __func__, fname.c_str());
std::string fname_part = fname;
if (i > 0) {
fname_part += "." + std::to_string(i);
}
// load weights
{
size_t total_size = 0;
model.n_loaded = 0;
fprintf(stderr, "%s: loading model part %d/%d from '%s'\n", __func__, i+1, n_parts, fname_part.c_str());
while (true) {
int32_t n_dims;
int32_t length;
int32_t ftype;
fin = std::ifstream(fname_part, std::ios::binary);
fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
fin.read(reinterpret_cast<char *>(&length), sizeof(length));
fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
fin.seekg(0, fin.end);
const size_t file_size = fin.tellg();
fin.seekg(file_offset);
// load weights
{
size_t total_size = 0;
model.n_loaded = 0;
fprintf(stderr, "%s: ", __func__);
while (true) {
int32_t n_dims;
int32_t length;
int32_t ftype;
fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
fin.read(reinterpret_cast<char *>(&length), sizeof(length));
fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
if (fin.eof()) {
break;
}
int32_t nelements = 1;
int32_t ne[2] = { 1, 1 };
for (int i = 0; i < n_dims; ++i) {
fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
nelements *= ne[i];
}
std::string name(length, 0);
fin.read(&name[0], length);
if (model.tensors.find(name.data()) == model.tensors.end()) {
fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
return false;
}
// split_type = 0: split by columns
// split_type = 1: split by rows
int split_type = 0;
// split_type = 0:
// regex:
// - tok_embeddings.*
// - layers.*.attention.wo.weight
// - layers.*.feed_forward.w2.weight
// split_type = 1:
// regex:
// - output.*
// - layers.*.attention.wq.weight
// - layers.*.attention.wk.weight
// - layers.*.attention.wv.weight
// - layers.*.feed_forward.w1.weight
// - layers.*.feed_forward.w3.weight
if (name.find("tok_embeddings") != std::string::npos) {
split_type = 0;
} else if (name.find("layers") != std::string::npos) {
if (name.find("attention.wo.weight") != std::string::npos) {
split_type = 0;
} else if (name.find("feed_forward.w2.weight") != std::string::npos) {
split_type = 0;
} else {
split_type = 1;
}
} else if (name.find("output") != std::string::npos) {
split_type = 1;
}
auto tensor = model.tensors[name.data()];
if (n_dims == 1) {
if (ggml_nelements(tensor) != nelements) {
fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
return false;
}
} else {
if (ggml_nelements(tensor)/n_parts != nelements) {
fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
return false;
}
}
if (n_dims == 1) {
if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
__func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
return false;
}
} else {
if (split_type == 0) {
if (tensor->ne[0]/n_parts != ne[0] || tensor->ne[1] != ne[1]) {
fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
__func__, name.data(), tensor->ne[0]/n_parts, tensor->ne[1], ne[0], ne[1]);
return false;
}
} else {
if (tensor->ne[0] != ne[0] || tensor->ne[1]/n_parts != ne[1]) {
fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
__func__, name.data(), tensor->ne[0], tensor->ne[1]/n_parts, ne[0], ne[1]);
return false;
}
}
}
if (0) {
static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
fprintf(stderr, "%24s - [%5d, %5d], type = %6s, split = %d\n", name.data(), ne[0], ne[1], ftype_str[ftype], split_type);
}
size_t bpe = 0;
switch (ftype) {
case 0: bpe = ggml_type_size(GGML_TYPE_F32); break;
case 1: bpe = ggml_type_size(GGML_TYPE_F16); break;
case 2: bpe = ggml_type_size(GGML_TYPE_Q4_0); assert(ne[0] % 64 == 0); break;
case 3: bpe = ggml_type_size(GGML_TYPE_Q4_1); assert(ne[0] % 64 == 0); break;
default:
{
fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype);
return false;
}
};
if (n_dims == 1 || n_parts == 1) {
if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
__func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
return false;
}
if (part_id == 0) {
fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
} else {
fin.seekg(ggml_nbytes(tensor), std::ios::cur);
}
total_size += ggml_nbytes(tensor);
} else {
if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)/n_parts) {
fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
__func__, name.data(), ggml_nbytes(tensor)/n_parts, nelements*bpe);
return false;
}
if (split_type == 0) {
const int np0 = ne[0];
const size_t row_size = (tensor->ne[0]/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type);
assert(row_size == tensor->nb[1]);
for (int i1 = 0; i1 < ne[1]; ++i1) {
const size_t offset_row = i1*row_size;
const size_t offset = offset_row + ((part_id*np0)/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type);
fin.read(reinterpret_cast<char *>(tensor->data) + offset, row_size/n_parts);
}
} else {
const int np1 = ne[1];
const size_t row_size = (tensor->ne[0]/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type);
for (int i1 = 0; i1 < ne[1]; ++i1) {
const size_t offset_row = (i1 + part_id*np1)*row_size;
fin.read(reinterpret_cast<char *>(tensor->data) + offset_row, row_size);
}
}
total_size += ggml_nbytes(tensor)/n_parts;
}
//fprintf(stderr, "%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
model.n_loaded++;
// progress
if (progress_callback) {
float current_file_progress = float(size_t(fin.tellg()) - file_offset) / float(file_size - file_offset);
float current_progress = (float(i) + current_file_progress) / float(n_parts);
progress_callback(current_progress, progress_callback_user_data);
}
if (model.n_loaded % 8 == 0) {
fprintf(stderr, ".");
fflush(stderr);
}
if (fin.eof()) {
break;
}
fprintf(stderr, " done\n");
int32_t nelements = 1;
int32_t ne[2] = { 1, 1 };
for (int i = 0; i < n_dims; ++i) {
fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
nelements *= ne[i];
}
fprintf(stderr, "%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, model.n_loaded);
if (model.n_loaded == 0) {
fprintf(stderr, "%s: WARN no tensors loaded from model file - assuming empty model for testing\n", __func__);
} else if (model.n_loaded != (int) model.tensors.size()) {
fprintf(stderr, "%s: ERROR not all tensors loaded from model file - expected %zu, got %d\n", __func__, model.tensors.size(), model.n_loaded);
std::string name(length, 0);
fin.read(&name[0], length);
if (model.tensors.find(name.data()) == model.tensors.end()) {
fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
return false;
}
auto tensor = model.tensors[name.data()];
if (ggml_nelements(tensor) != nelements) {
fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
return false;
}
if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
__func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
return false;
}
if (0) {
static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
fprintf(stderr, "%24s - [%5d, %5d], type = %6s\n", name.data(), ne[0], ne[1], ftype_str[ftype]);
}
switch (ftype) {
case 0: // f32
case 1: // f16
break;
case 2: // q4_0
case 3: // q4_1
assert(ne[0] % 64 == 0);
break;
default:
fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype);
return false;
};
// load the tensor data into memory without copying or reading it
size_t offset = fin.tellg();
size_t tensor_data_size = ggml_nbytes(tensor);
offset = (offset + 31) & -32;
tensor->data = mm_addr + offset;
fin.seekg(offset + tensor_data_size);
total_size += tensor_data_size;
model.n_loaded++;
// progress
if (progress_callback) {
double current_progress = size_t(fin.tellg()) / double(file_size);
progress_callback(current_progress, progress_callback_user_data);
}
}
fin.close();
fprintf(stderr, "%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, model.n_loaded);
if (model.n_loaded == 0) {
fprintf(stderr, "%s: WARN no tensors loaded from model file - assuming empty model for testing\n", __func__);
} else if (model.n_loaded != (int) model.tensors.size()) {
fprintf(stderr, "%s: ERROR not all tensors loaded from model file - expected %zu, got %d\n", __func__, model.tensors.size(), model.n_loaded);
return false;
}
}
lctx.t_load_us = ggml_time_us() - t_start_us;
// loading time will be recalculate after the first eval, so
// we take page faults deferred by mmap() into consideration
lctx.t_load_us = ggml_time_us() - lctx.t_start_us;
if (progress_callback) {
progress_callback(1.0, progress_callback_user_data);
@@ -849,6 +776,7 @@ static bool llama_eval_internal(
struct ggml_init_params params = {
/*.mem_size =*/ buf_compute.size(),
/*.mem_buffer =*/ buf_compute.data(),
/*.no_alloc =*/ false,
};
struct ggml_context * ctx0 = ggml_init(params);
@@ -1126,7 +1054,7 @@ struct llama_tokenizer {
size_t offs = 0;
while (offs < text.size()) {
llama_sp_symbol sym;
size_t char_len = std::min(text.size() - offs, utf8_len(text[offs]));
size_t char_len = Min(text.size() - offs, utf8_len(text[offs]));
sym.text = text.c_str() + offs;
sym.n = char_len;
offs += char_len;
@@ -1291,7 +1219,7 @@ static llama_vocab::id llama_sample_top_p_top_k(
float maxl = -std::numeric_limits<float>::infinity();
for (const auto & kv : logits_id) {
maxl = std::max(maxl, kv.first);
maxl = Max(maxl, kv.first);
}
// compute probs for the top k tokens
@@ -1385,8 +1313,7 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
return false;
}
if (magic != LLAMA_FILE_MAGIC) {
fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname_inp.c_str());
return false;
return report_bad_magic(fname_inp.c_str(), magic, LLAMA_FILE_MAGIC);
}
fout.write((char *) &magic, sizeof(magic));
@@ -1444,7 +1371,7 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
return false;
}
std::string word;
std::vector<char> word(32);
vocab.id_to_token.resize(n_vocab);
for (int i = 0; i < n_vocab; i++) {
uint32_t len;
@@ -1452,17 +1379,17 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
fout.write((char *) &len, sizeof(len));
word.resize(len);
finp.read ((char *) word.data(), len);
fout.write((char *) word.data(), len);
finp.read ((char *) &word[0], len);
fout.write((char *) &word[0], len);
float score;
finp.read ((char *) &score, sizeof(score));
fout.write((char *) &score, sizeof(score));
vocab.token_to_id[word] = i;
vocab.token_to_id[word.data()] = i;
auto &tok_score = vocab.id_to_token[i];
tok_score.tok = word;
tok_score.tok = word.data();
tok_score.score = score;
}
}
@@ -1503,6 +1430,13 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
std::string name(length, 0);
finp.read (&name[0], length);
{
// ensure tensor data is aligned
uint64_t offset = finp.tellg();
offset = (offset + 31) & -32;
finp.seekg(offset);
}
{
static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
printf("%48s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ftype_str[ftype]);
@@ -1558,6 +1492,13 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
}
fout.write(&name[0], length);
{
// ensure tensor data is aligned
uint64_t offset = fout.tellp();
offset = (offset + 31) & -32;
fout.seekp(offset);
}
if (quantize) {
printf("quantizing .. ");
work.resize(nelements); // for quantization
@@ -1655,7 +1596,10 @@ struct llama_context * llama_init_from_file(
if (params.use_mlock) {
char *err;
if (!ggml_mlock(ctx->model.ctx, &err)) {
if (!ggml_mlock(ctx->model.ctx,
ctx->model.mm_addr,
ctx->model.mm_length,
&err)) {
fprintf(stderr, "%s\n", err);
free(err);
llama_free(ctx);
@@ -1705,6 +1649,10 @@ void llama_free(struct llama_context * ctx) {
ggml_free(ctx->model.ctx);
}
if (ctx->model.mm_addr) {
munmap_file(ctx->model.mm_addr, ctx->model.mm_length);
}
delete ctx;
}
@@ -1730,7 +1678,11 @@ int llama_eval(
fprintf(stderr, "%s: failed to eval\n", __func__);
return 1;
}
// get a more accurate load time, upon first eval
if (!ctx->has_evaluated_once) {
ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
ctx->has_evaluated_once = true;
}
return 0;
}
@@ -1823,9 +1775,9 @@ llama_token llama_sample_top_p_top_k(
void llama_print_timings(struct llama_context * ctx) {
const int64_t t_end_us = ggml_time_us();
const int32_t n_sample = std::max(1, ctx->n_sample);
const int32_t n_eval = std::max(1, ctx->n_eval);
const int32_t n_p_eval = std::max(1, ctx->n_p_eval);
const int32_t n_sample = Max(1, ctx->n_sample);
const int32_t n_eval = Max(1, ctx->n_eval);
const int32_t n_p_eval = Max(1, ctx->n_p_eval);
fprintf(stderr, "\n");
fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
@@ -1837,7 +1789,6 @@ void llama_print_timings(struct llama_context * ctx) {
void llama_reset_timings(struct llama_context * ctx) {
ctx->t_start_us = ggml_time_us();
ctx->t_sample_us = ctx->n_sample = 0;
ctx->t_eval_us = ctx->n_eval = 0;
ctx->t_p_eval_us = ctx->n_p_eval = 0;

View File

@@ -20,7 +20,7 @@
#endif
#define LLAMA_FILE_VERSION 1
#define LLAMA_FILE_MAGIC 0x67676d66 // 'ggmf' in hex
#define LLAMA_FILE_MAGIC 0x67676a74 // 'ggjt' in hex
#define LLAMA_FILE_MAGIC_UNVERSIONED 0x67676d6c // pre-versioned files
#ifdef __cplusplus

View File

@@ -0,0 +1,311 @@
# Migrate ggml file(s) with ggmf magic to ggml file with ggjt magic
#
# We caused a breaking change to the file format on 2023-03-30 in:
# https://github.com/ggerganov/llama.cpp/pull/613
#
# (1) If you still have the Meta LLaMA .pth files, then close this
# file now; you can just run `convert-pth-to-ggml.py` again to
# migrate to the new format. The tool is easier to use too. It
# isn't necessary anymore to manage split output files because
# the new format always combines things into a single file.
#
# (2) If you deleted the Meta LLaMA .pth files due to save on disk
# space, then this tool is intended to help you. Please check
# out the instructions below.
#
# USAGE
#
# python migrate-ggml-2023-03-30-pr613.py INPUT OUTPUT
#
# PREREQUISITES
#
# pip install numpy
# cd llama.cpp
# make -j4
#
# EXAMPLE (7B MODEL)
#
# # you can replace all the 'f16' with 'q4_0' if you're using quantized weights
# python migrate-ggml-2023-03-30-pr613.py models/7B/ggml-model-f16.bin models/7B/ggml-model-f16-ggjt.bin
#
# # check that it works
# ./main -m models/7B/ggml-model-f16-ggjt.bin -p 'Question: Do you love me?'
#
# # you can delete the old files
# rm -f models/7B/ggml-model-f16.bin
# mv models/7B/ggml-model-f16-ggjt.bin models/7B/ggml-model-f16.bin
#
# EXAMPLE (13B MODEL)
#
# # you can replace all the 'f16' with 'q4_0' if you're using quantized weights
# python migrate-ggml-2023-03-30-pr613.py models/13B/ggml-model-f16.bin models/13B/ggml-model-f16-ggjt.bin
#
# # check that it works
# ./main -m models/13B/ggml-model-f16-ggjt.bin -p 'Question: Do you love me?'
#
# # you can delete the old files
# rm -f models/13B/ggml-model-f16.bin*
# mv models/13B/ggml-model-f16-ggjt.bin models/13B/ggml-model-f16.bin
#
import argparse
import os
import sys
import json
import struct
import numpy as np
QK = 32
GGML_TYPE_Q4_0 = 0
GGML_TYPE_Q4_1 = 1
GGML_TYPE_I8 = 2
GGML_TYPE_I16 = 3
GGML_TYPE_I32 = 4
GGML_TYPE_F16 = 5
GGML_TYPE_F32 = 6
WTYPE_NAMES = {
0: "F32",
1: "F16",
2: "Q4_0",
3: "Q4_1",
}
WTYPES = {
0: GGML_TYPE_F32,
1: GGML_TYPE_F16,
2: GGML_TYPE_Q4_0,
3: GGML_TYPE_Q4_1,
}
GGML_BLCK_SIZE = {
GGML_TYPE_Q4_0: QK,
GGML_TYPE_Q4_1: QK,
GGML_TYPE_I8: 1,
GGML_TYPE_I16: 1,
GGML_TYPE_I32: 1,
GGML_TYPE_F16: 1,
GGML_TYPE_F32: 1,
}
GGML_TYPE_SIZE = {
GGML_TYPE_Q4_0: 4 + QK//2,
GGML_TYPE_Q4_1: 4*2 + QK//2,
GGML_TYPE_I8: 1,
GGML_TYPE_I16: 2,
GGML_TYPE_I32: 4,
GGML_TYPE_F16: 2,
GGML_TYPE_F32: 4,
}
HPARAMS = [
'magic', # int32
'version', # int32
'n_vocab', # int32
'n_embd', # int32
'n_mult', # int32
'n_head', # int32
'n_layer', # int32
'n_rot', # int32
'f16', # int32
]
def read_hparams(fin):
struct_fmt = "i" * len(HPARAMS)
struct_size = struct.calcsize(struct_fmt)
buf = fin.read(struct_size)
ints = struct.unpack(struct_fmt, buf)
hparams = dict(zip(HPARAMS, ints))
return hparams
def write_hparams(fout, hparams):
struct_fmt = "i" * len(HPARAMS)
struct_size = struct.calcsize(struct_fmt)
ints = [hparams[h] for h in HPARAMS]
fout.write(struct.pack(struct_fmt, *ints))
def read_tokens(fin, hparams):
tokens = []
for i in range(hparams['n_vocab']):
len_b = fin.read(4)
(length,) = struct.unpack("i", len_b)
word = fin.read(length)
score_b = fin.read(4)
(score,) = struct.unpack("f", score_b)
tokens.append((word, score))
return tokens
def write_tokens(fout, tokens):
for word, score in tokens:
fout.write(struct.pack("i", len(word)))
fout.write(word)
fout.write(struct.pack("f", score))
def ggml_nelements(shape):
r = 1
for i in shape:
r *= i
return r
def ggml_nbytes(shape, ftype):
x = ggml_nelements(shape)
t = WTYPES[ftype]
x *= GGML_TYPE_SIZE[t]
x //= GGML_BLCK_SIZE[t]
return x
def copy_tensors(fin, fout, part_id, n_parts):
while True:
b = fin.read(4)
if not b: break
(n_dims,) = struct.unpack("i", b)
b = fin.read(4)
(length,) = struct.unpack("i", b)
b = fin.read(4)
(ftype,) = struct.unpack("i", b)
assert n_dims in (1, 2)
partshape = list(range(n_dims))
for i in range(n_dims):
b = fin.read(4)
partshape[i] = struct.unpack("i", b)[0]
partshape = list(reversed(partshape))
name = fin.read(length)
data = fin.read(ggml_nbytes(partshape, ftype))
blck_size = GGML_BLCK_SIZE[WTYPES[ftype]]
type_size = GGML_TYPE_SIZE[WTYPES[ftype]]
print(f"Processing tensor {name} with shape: {partshape} and type: {WTYPE_NAMES[ftype]}")
# determine dimension along which multipart tensor is sharded
#
# split_dim 0 regex:
# - output.*
# - layers.*.attention.wq.weight
# - layers.*.attention.wk.weight
# - layers.*.attention.wv.weight
# - layers.*.feed_forward.w1.weight
# - layers.*.feed_forward.w3.weight
#
# split_dim 1 regex:
# - tok_embeddings.*
# - layers.*.attention.wo.weight
# - layers.*.feed_forward.w2.weight
#
if n_dims > 1:
split_dim = 1
if b"tok_embeddings" in name:
split_dim = 1
elif b"layers" in name:
if b"attention.wo.weight" in name:
split_dim = 1
elif b"feed_forward.w2.weight" in name:
split_dim = 1
else:
split_dim = 0
elif b"output" in name:
split_dim = 0
# output tensor header
fullshape = list(partshape)
if n_dims > 1:
fullshape[split_dim] *= n_parts
fout.write(struct.pack("iii", n_dims, len(name), ftype))
for dim in reversed(fullshape):
fout.write(struct.pack("i", dim))
fout.write(name)
# ensure tensor data is aligned
tensor_data_offset = fout.tell()
while tensor_data_offset % QK != 0:
fout.write(struct.pack("B", 0))
tensor_data_offset += 1
# output unified mappable tensor data
if n_dims == 1 or n_parts == 1:
# copy tensor which we thankfully received in one piece
if part_id == 0:
fout.write(data)
elif split_dim == 0:
# reassemble multifile tensor containing some of the rows
rows_per_chunk = partshape[0]
current_row = part_id * rows_per_chunk
bytes_per_row = fullshape[1] // blck_size * type_size
offset = current_row * bytes_per_row
fout.seek(tensor_data_offset + offset)
fout.write(data)
elif split_dim == 1:
# reassemble multifile tensor containing some of the cols
cols_per_chunk = partshape[1]
current_col = part_id * cols_per_chunk
bpr = partshape[1] // blck_size * type_size
bytes_per_row = fullshape[1] // blck_size * type_size
offset_current_col = current_col // blck_size * type_size
for row in range(partshape[0]):
offset_row = row * bytes_per_row
offset = offset_row + offset_current_col
fout.seek(tensor_data_offset + offset)
fout.write(data[row * bpr:row * bpr + bpr])
# advance file position to next tensor
fout.seek(tensor_data_offset + ggml_nbytes(fullshape, ftype))
def parse_args():
parser = argparse.ArgumentParser(description='Migrate from GGML to new GGJT file format')
parser.add_argument('fin_path', help='your old ggml file (leave out the .1 .2 etc.)')
parser.add_argument('fout_path', help='your new ggjt file name')
return parser.parse_args()
def main():
args = parse_args()
assert args.fin_path
assert args.fout_path
assert args.fin_path != args.fout_path
with open(args.fin_path, "rb") as fin:
hparams = read_hparams(fin)
tokens = read_tokens(fin, hparams)
if hparams['magic'] == 0x67676a74: # ggjt
print(f"{args.fin_path}: input ggml has already been converted to 'ggjt' magic\n")
sys.exit(1)
if hparams['magic'] != 0x67676d66: # ggmf
print(f"{args.fin_path}: input ggml file doesn't have expected 'ggmf' magic: {hparams['magic']:#x}\n")
sys.exit(1)
hparams['magic'] = 0x67676a74 # ggjt
# count number of multipart files by convention
n_parts = 1
while True:
if os.path.exists(f"{args.fin_path}.{n_parts}"):
n_parts += 1
else:
break
# we output a single file for ggml
with open(args.fout_path, "wb") as fout:
write_hparams(fout, hparams)
write_tokens(fout, tokens)
offset_of_tensors = fout.tell()
# the tensors we load could be split across multiple files
for part_id in range(n_parts):
fout.seek(offset_of_tensors)
print(f"Processing part {part_id+1} of {n_parts}\n")
fin_path = args.fin_path
if part_id > 0:
fin_path += f".{part_id}"
with open(fin_path, "rb") as fin:
read_tokens(fin, read_hparams(fin))
copy_tensors(fin, fout, part_id, n_parts)
print(f"Done. Output file: {args.fout_path}\n")
if __name__ == "__main__":
main()

Binary file not shown.

View File

@@ -1,131 +0,0 @@
#!/usr/bin/env python3
"""Script to execute the "quantize" script on a given set of models."""
import subprocess
import argparse
import glob
import sys
import os
def main():
"""Update the quantize binary name depending on the platform and parse
the command line arguments and execute the script.
"""
if "linux" in sys.platform or "darwin" in sys.platform:
quantize_script_binary = "quantize"
elif "win32" in sys.platform or "cygwin" in sys.platform:
quantize_script_binary = "quantize.exe"
else:
print("WARNING: Unknown platform. Assuming a UNIX-like OS.\n")
quantize_script_binary = "quantize"
parser = argparse.ArgumentParser(
prog='python3 quantize.py',
description='This script quantizes the given models by applying the '
f'"{quantize_script_binary}" script on them.'
)
parser.add_argument(
'models', nargs='+', choices=('7B', '13B', '30B', '65B'),
help='The models to quantize.'
)
parser.add_argument(
'-r', '--remove-16', action='store_true', dest='remove_f16',
help='Remove the f16 model after quantizing it.'
)
parser.add_argument(
'-m', '--models-path', dest='models_path',
default=os.path.join(os.getcwd(), "models"),
help='Specify the directory where the models are located.'
)
parser.add_argument(
'-q', '--quantize-script-path', dest='quantize_script_path',
default=os.path.join(os.getcwd(), quantize_script_binary),
help='Specify the path to the "quantize" script.'
)
# TODO: Revise this code
# parser.add_argument(
# '-t', '--threads', dest='threads', type='int',
# default=os.cpu_count(),
# help='Specify the number of threads to use to quantize many models at '
# 'once. Defaults to os.cpu_count().'
# )
args = parser.parse_args()
args.models_path = os.path.abspath(args.models_path)
if not os.path.isfile(args.quantize_script_path):
print(
f'The "{quantize_script_binary}" script was not found in the '
"current location.\nIf you want to use it from another location, "
"set the --quantize-script-path argument from the command line."
)
sys.exit(1)
for model in args.models:
# The model is separated in various parts
# (ggml-model-f16.bin, ggml-model-f16.bin.0, ggml-model-f16.bin.1...)
f16_model_path_base = os.path.join(
args.models_path, model, "ggml-model-f16.bin"
)
if not os.path.isfile(f16_model_path_base):
print(f'The file %s was not found' % f16_model_path_base)
sys.exit(1)
f16_model_parts_paths = map(
lambda filename: os.path.join(f16_model_path_base, filename),
glob.glob(f"{f16_model_path_base}*")
)
for f16_model_part_path in f16_model_parts_paths:
if not os.path.isfile(f16_model_part_path):
print(
f"The f16 model {os.path.basename(f16_model_part_path)} "
f"was not found in {args.models_path}{os.path.sep}{model}"
". If you want to use it from another location, set the "
"--models-path argument from the command line."
)
sys.exit(1)
__run_quantize_script(
args.quantize_script_path, f16_model_part_path
)
if args.remove_f16:
os.remove(f16_model_part_path)
# This was extracted to a top-level function for parallelization, if
# implemented. See https://github.com/ggerganov/llama.cpp/pull/222/commits/f8db3d6cd91bf1a1342db9d29e3092bc12dd783c#r1140496406
def __run_quantize_script(script_path, f16_model_part_path):
"""Run the quantize script specifying the path to it and the path to the
f16 model to quantize.
"""
new_quantized_model_path = f16_model_part_path.replace("f16", "q4_0")
subprocess.run(
[script_path, f16_model_part_path, new_quantized_model_path, "2"],
check=True
)
if __name__ == "__main__":
try:
main()
except subprocess.CalledProcessError:
print("\nAn error ocurred while trying to quantize the models.")
sys.exit(1)
except KeyboardInterrupt:
sys.exit(0)
else:
print("\nSuccesfully quantized all models.")