mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-02-26 14:23:22 +02:00
Compare commits
41 Commits
master-0e0
...
master-666
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
6667401238 | ||
|
|
77a73403ca | ||
|
|
50a8a2af97 | ||
|
|
4caebf6d40 | ||
|
|
dcdd65e296 | ||
|
|
5ecff35151 | ||
|
|
7faa7460f0 | ||
|
|
5af8e32238 | ||
|
|
42747220b4 | ||
|
|
e9298af389 | ||
|
|
4ad73137a1 | ||
|
|
315a95a4d3 | ||
|
|
efd05648c8 | ||
|
|
eb17a026fd | ||
|
|
69b740289f | ||
|
|
f266259ad9 | ||
|
|
47f61aaa5f | ||
|
|
3173a62eb9 | ||
|
|
489537e6cf | ||
|
|
2d3481c721 | ||
|
|
74f5899df4 | ||
|
|
2f7c8e014e | ||
|
|
0ad964631f | ||
|
|
e95b6554b4 | ||
|
|
aa485cee33 | ||
|
|
c12b14b77f | ||
|
|
106faaf297 | ||
|
|
c85e03d12e | ||
|
|
489093548c | ||
|
|
93265e988a | ||
|
|
c56b715269 | ||
|
|
f4d277ae17 | ||
|
|
c9a59b70a5 | ||
|
|
a32f7acc9f | ||
|
|
43ffdefb74 | ||
|
|
1623a6e9b4 | ||
|
|
c14e0d2f23 | ||
|
|
723dac55fa | ||
|
|
0f07cacb05 | ||
|
|
c5d70f5c9e | ||
|
|
be87b6ed20 |
@@ -5,9 +5,10 @@ FROM ubuntu:$UBUNTU_VERSION as build
|
||||
RUN apt-get update && \
|
||||
apt-get install -y build-essential python3 python3-pip
|
||||
|
||||
COPY requirements.txt requirements.txt
|
||||
|
||||
RUN pip install --upgrade pip setuptools wheel \
|
||||
&& pip install numpy requests sentencepiece tqdm \
|
||||
&& pip install torch --index-url https://download.pytorch.org/whl/cpu
|
||||
&& pip install -r requirements.txt
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
|
||||
14
.github/workflows/build.yml
vendored
14
.github/workflows/build.yml
vendored
@@ -8,6 +8,8 @@ on:
|
||||
required: true
|
||||
type: boolean
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.c', '**/*.cpp']
|
||||
pull_request:
|
||||
types: [opened, synchronize, edited, reopened, review_requested, ready_for_review]
|
||||
@@ -18,6 +20,8 @@ env:
|
||||
|
||||
jobs:
|
||||
ubuntu-latest-make:
|
||||
if: github.event.pull_request.draft == false
|
||||
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
@@ -37,6 +41,8 @@ jobs:
|
||||
make
|
||||
|
||||
ubuntu-latest-cmake:
|
||||
if: github.event.pull_request.draft == false
|
||||
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
@@ -65,6 +71,8 @@ jobs:
|
||||
ctest --verbose
|
||||
|
||||
ubuntu-latest-cmake-sanitizer:
|
||||
if: github.event.pull_request.draft == false
|
||||
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
continue-on-error: true
|
||||
@@ -101,6 +109,8 @@ jobs:
|
||||
ctest --verbose
|
||||
|
||||
macOS-latest-make:
|
||||
if: github.event.pull_request.draft == false
|
||||
|
||||
runs-on: macos-latest
|
||||
|
||||
steps:
|
||||
@@ -119,6 +129,8 @@ jobs:
|
||||
make
|
||||
|
||||
macOS-latest-cmake:
|
||||
if: github.event.pull_request.draft == false
|
||||
|
||||
runs-on: macOS-latest
|
||||
|
||||
steps:
|
||||
@@ -146,6 +158,8 @@ jobs:
|
||||
ctest --verbose
|
||||
|
||||
windows-latest-cmake:
|
||||
if: github.event.pull_request.draft == false
|
||||
|
||||
runs-on: windows-latest
|
||||
|
||||
strategy:
|
||||
|
||||
2
.github/workflows/docker.yml
vendored
2
.github/workflows/docker.yml
vendored
@@ -18,6 +18,8 @@ on:
|
||||
jobs:
|
||||
push_to_registry:
|
||||
name: Push Docker image to Docker Hub
|
||||
if: github.event.pull_request.draft == false
|
||||
|
||||
runs-on: ubuntu-latest
|
||||
env:
|
||||
COMMIT_SHA: ${{ github.sha }}
|
||||
|
||||
1
.gitignore
vendored
1
.gitignore
vendored
@@ -24,6 +24,7 @@ models/*
|
||||
/perplexity
|
||||
/embedding
|
||||
/benchmark-q4_0-matmult
|
||||
/vdot
|
||||
/Pipfile
|
||||
|
||||
arm_neon.h
|
||||
|
||||
@@ -55,6 +55,8 @@ option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer"
|
||||
option(LLAMA_AVX "llama: enable AVX" ON)
|
||||
option(LLAMA_AVX2 "llama: enable AVX2" ON)
|
||||
option(LLAMA_AVX512 "llama: enable AVX512" OFF)
|
||||
option(LLAMA_AVX512_VBMI "llama: enable AVX512-VBMI" OFF)
|
||||
option(LLAMA_AVX512_VNNI "llama: enable AVX512-VNNI" OFF)
|
||||
option(LLAMA_FMA "llama: enable FMA" ON)
|
||||
# in MSVC F16C is implied with AVX2/AVX512
|
||||
if (NOT MSVC)
|
||||
@@ -120,6 +122,21 @@ if (LLAMA_OPENBLAS)
|
||||
add_compile_definitions(GGML_USE_OPENBLAS)
|
||||
add_link_options(${BLAS_LIBRARIES})
|
||||
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} openblas)
|
||||
|
||||
# find header file
|
||||
set(OPENBLAS_INCLUDE_SEARCH_PATHS
|
||||
/usr/include
|
||||
/usr/include/openblas
|
||||
/usr/include/openblas-base
|
||||
/usr/local/include
|
||||
/usr/local/include/openblas
|
||||
/usr/local/include/openblas-base
|
||||
/opt/OpenBLAS/include
|
||||
$ENV{OpenBLAS_HOME}
|
||||
$ENV{OpenBLAS_HOME}/include
|
||||
)
|
||||
find_path(OPENBLAS_INC NAMES cblas.h PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS})
|
||||
add_compile_options(-I${OPENBLAS_INC})
|
||||
else()
|
||||
message(WARNING "OpenBLAS not found")
|
||||
endif()
|
||||
@@ -205,6 +222,16 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$")
|
||||
if (MSVC)
|
||||
if (LLAMA_AVX512)
|
||||
add_compile_options(/arch:AVX512)
|
||||
# MSVC has no compile-time flags enabling specific
|
||||
# AVX512 extensions, neither it defines the
|
||||
# macros corresponding to the extensions.
|
||||
# Do it manually.
|
||||
if (LLAMA_AVX512_VBMI)
|
||||
add_compile_definitions(__AVX512VBMI__)
|
||||
endif()
|
||||
if (LLAMA_AVX512_VNNI)
|
||||
add_compile_definitions(__AVX512VNNI__)
|
||||
endif()
|
||||
elseif (LLAMA_AVX2)
|
||||
add_compile_options(/arch:AVX2)
|
||||
elseif (LLAMA_AVX)
|
||||
@@ -225,9 +252,13 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$")
|
||||
endif()
|
||||
if (LLAMA_AVX512)
|
||||
add_compile_options(-mavx512f)
|
||||
# add_compile_options(-mavx512cd)
|
||||
# add_compile_options(-mavx512dq)
|
||||
# add_compile_options(-mavx512bw)
|
||||
add_compile_options(-mavx512bw)
|
||||
endif()
|
||||
if (LLAMA_AVX512_VBMI)
|
||||
add_compile_options(-mavx512vbmi)
|
||||
endif()
|
||||
if (LLAMA_AVX512_VNNI)
|
||||
add_compile_options(-mavx512vnni)
|
||||
endif()
|
||||
endif()
|
||||
else()
|
||||
@@ -274,4 +305,5 @@ endif ()
|
||||
|
||||
if (LLAMA_BUILD_EXAMPLES)
|
||||
add_subdirectory(examples)
|
||||
add_subdirectory(pocs)
|
||||
endif()
|
||||
|
||||
29
Makefile
29
Makefile
@@ -133,51 +133,54 @@ $(info I CC: $(CCV))
|
||||
$(info I CXX: $(CXXV))
|
||||
$(info )
|
||||
|
||||
default: main quantize perplexity embedding
|
||||
default: main quantize quantize-stats perplexity embedding vdot
|
||||
|
||||
#
|
||||
# Build library
|
||||
#
|
||||
|
||||
ggml.o: ggml.c ggml.h
|
||||
$(CC) $(CFLAGS) -c ggml.c -o ggml.o
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
|
||||
llama.o: llama.cpp llama.h llama_util.h
|
||||
$(CXX) $(CXXFLAGS) -c llama.cpp -o llama.o
|
||||
llama.o: llama.cpp ggml.h llama.h llama_util.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
common.o: examples/common.cpp examples/common.h
|
||||
$(CXX) $(CXXFLAGS) -c examples/common.cpp -o common.o
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
clean:
|
||||
rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-q4_0-matmult
|
||||
|
||||
main: examples/main/main.cpp ggml.o llama.o common.o
|
||||
$(CXX) $(CXXFLAGS) examples/main/main.cpp ggml.o llama.o common.o -o main $(LDFLAGS)
|
||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||
@echo
|
||||
@echo '==== Run ./main -h for help. ===='
|
||||
@echo
|
||||
|
||||
quantize: examples/quantize/quantize.cpp ggml.o llama.o
|
||||
$(CXX) $(CXXFLAGS) examples/quantize/quantize.cpp ggml.o llama.o -o quantize $(LDFLAGS)
|
||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||
|
||||
quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o
|
||||
$(CXX) $(CXXFLAGS) examples/quantize-stats/quantize-stats.cpp ggml.o llama.o -o quantize-stats $(LDFLAGS)
|
||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||
|
||||
perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o
|
||||
$(CXX) $(CXXFLAGS) examples/perplexity/perplexity.cpp ggml.o llama.o common.o -o perplexity $(LDFLAGS)
|
||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||
|
||||
embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o
|
||||
$(CXX) $(CXXFLAGS) examples/embedding/embedding.cpp ggml.o llama.o common.o -o embedding $(LDFLAGS)
|
||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||
|
||||
vdot: pocs/vdot/vdot.cpp ggml.o
|
||||
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
|
||||
|
||||
libllama.so: llama.o ggml.o
|
||||
$(CXX) $(CXXFLAGS) -shared -fPIC -o libllama.so llama.o ggml.o $(LDFLAGS)
|
||||
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
|
||||
|
||||
#
|
||||
# Tests
|
||||
#
|
||||
|
||||
benchmark: ggml.o
|
||||
$(CXX) $(CXXFLAGS) examples/benchmark/benchmark-q4_0-matmult.c ggml.o -o benchmark-q4_0-matmult $(LDFLAGS)
|
||||
benchmark: examples/benchmark/benchmark-q4_0-matmult.c ggml.o
|
||||
$(CXX) $(CXXFLAGS) $^ -o benchmark-q4_0-matmult $(LDFLAGS)
|
||||
./benchmark-q4_0-matmult
|
||||
|
||||
.PHONY: tests
|
||||
|
||||
@@ -9,6 +9,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
|
||||
|
||||
**Hot topics:**
|
||||
|
||||
- [Added LoRA support](https://github.com/ggerganov/llama.cpp/pull/820)
|
||||
- [Add GPU support to ggml](https://github.com/ggerganov/llama.cpp/discussions/915)
|
||||
- [Roadmap Apr 2023](https://github.com/ggerganov/llama.cpp/discussions/784)
|
||||
|
||||
@@ -50,6 +51,7 @@ New features will probably be added mostly through community contributions.
|
||||
- Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
|
||||
- Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
|
||||
- Node.js: [hlhr202/llama-node](https://github.com/hlhr202/llama-node)
|
||||
- Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)
|
||||
|
||||
**UI:**
|
||||
|
||||
@@ -192,10 +194,10 @@ ls ./models
|
||||
65B 30B 13B 7B tokenizer_checklist.chk tokenizer.model
|
||||
|
||||
# install Python dependencies
|
||||
python3 -m pip install torch numpy sentencepiece
|
||||
python3 -m pip install -r requirements.txt
|
||||
|
||||
# convert the 7B model to ggml FP16 format
|
||||
python3 convert-pth-to-ggml.py models/7B/ 1
|
||||
python3 convert.py models/7B/
|
||||
|
||||
# quantize the model to 4-bits (using method 2 = q4_0)
|
||||
./quantize ./models/7B/ggml-model-f16.bin ./models/7B/ggml-model-q4_0.bin 2
|
||||
|
||||
@@ -1,299 +0,0 @@
|
||||
# Author: github.com/ductai199x
|
||||
import argparse
|
||||
import os
|
||||
import struct
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from numba import njit
|
||||
from tqdm.auto import tqdm
|
||||
|
||||
|
||||
def read_header(fin):
|
||||
values = struct.unpack("i" * 9, fin.read(4 * 9))
|
||||
_, _, vocab_size, dim, multiple_of, n_heads, n_layers, rot, ftype = values
|
||||
return {
|
||||
"vocab_size": vocab_size,
|
||||
"dim": dim,
|
||||
"multiple_of": multiple_of,
|
||||
"n_heads": n_heads,
|
||||
"n_layers": n_layers,
|
||||
}, ftype
|
||||
|
||||
|
||||
def read_tokens(fin, vocab_size):
|
||||
tokens = []
|
||||
for _ in range(vocab_size):
|
||||
text_len = struct.unpack("i", fin.read(4))[0]
|
||||
text_bytes = fin.read(text_len)
|
||||
try:
|
||||
text = text_bytes.decode()
|
||||
except UnicodeDecodeError:
|
||||
text = text_bytes.decode(errors="replace")
|
||||
score = struct.unpack("f", fin.read(4))[0]
|
||||
tokens.append((text, score))
|
||||
return tokens
|
||||
|
||||
|
||||
@njit
|
||||
def dequantize_weights_numba(fin_data, n_rows, n_cols):
|
||||
qk = 32
|
||||
nb = n_cols // qk
|
||||
bs = 4 + (qk // 2)
|
||||
|
||||
weights = np.zeros((n_rows, n_cols), dtype=np.float32)
|
||||
data_pos = 0
|
||||
|
||||
for row in range(n_rows):
|
||||
for block in range(nb):
|
||||
d = np.frombuffer(fin_data[data_pos : data_pos + 4], dtype=np.float32)[0]
|
||||
data_pos += 4
|
||||
packed_values = fin_data[data_pos : data_pos + (qk // 2)]
|
||||
data_pos += qk // 2
|
||||
|
||||
for i in range(qk // 2):
|
||||
packed_value = packed_values[i]
|
||||
v0 = np.float32((packed_value & 0b00001111) - 8) * d
|
||||
v1 = np.float32((packed_value >> 4) - 8) * d
|
||||
|
||||
weights[row, block * qk + 2 * i] = v0
|
||||
weights[row, block * qk + 2 * i + 1] = v1
|
||||
|
||||
return weights
|
||||
|
||||
|
||||
def dequantize_weights(fin, n_rows, n_cols):
|
||||
qk = 32
|
||||
nb = n_cols // qk
|
||||
data_size = n_rows * n_cols // 2 + n_rows * nb * 4
|
||||
fin_data = fin.read(data_size)
|
||||
return dequantize_weights_numba(fin_data, n_rows, n_cols)
|
||||
|
||||
|
||||
def read_variables(fin):
|
||||
model = {}
|
||||
pbar = tqdm(total=os.path.getsize(fin.name), unit="B", unit_scale=True, desc="Reading variables")
|
||||
while True:
|
||||
start_pos = fin.tell()
|
||||
try:
|
||||
n_dims, name_length, ftype_cur = struct.unpack("iii", fin.read(4 * 3))
|
||||
except struct.error:
|
||||
break
|
||||
|
||||
shape = tuple(struct.unpack("i" * n_dims, fin.read(4 * n_dims)))
|
||||
shape = shape[::-1]
|
||||
name = fin.read(name_length).decode()
|
||||
|
||||
# ensure tensor data is aligned
|
||||
tensor_data_offset = fin.tell()
|
||||
tensor_data_offset = (tensor_data_offset + 31) & -32
|
||||
fin.seek(tensor_data_offset)
|
||||
|
||||
if ftype_cur == 2:
|
||||
# 4-bit quantized weights
|
||||
dtype = np.uint8
|
||||
data = dequantize_weights(fin, shape[0], shape[1])
|
||||
data = data.reshape(shape)
|
||||
elif ftype_cur == 0:
|
||||
dtype = np.float32
|
||||
data_size = np.prod(shape)
|
||||
data = np.fromfile(fin, dtype=dtype, count=data_size).reshape(shape)
|
||||
elif ftype_cur == 1:
|
||||
dtype = np.float16
|
||||
data_size = np.prod(shape)
|
||||
data = np.fromfile(fin, dtype=dtype, count=data_size).reshape(shape)
|
||||
|
||||
model[name] = torch.tensor(data, dtype=torch.float32 if dtype == np.float32 else torch.float16)
|
||||
|
||||
pbar.update(fin.tell() - start_pos)
|
||||
|
||||
return model
|
||||
|
||||
|
||||
def convert_to_hf_format(model, hparams):
|
||||
# This works for llama 7B, need to test with other models
|
||||
n_layers = hparams["n_layers"]
|
||||
n_heads = hparams["n_heads"]
|
||||
dim = hparams["dim"]
|
||||
dims_per_head = dim // n_heads
|
||||
base = 10000.0
|
||||
inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head))
|
||||
|
||||
# permute for sliced rotary
|
||||
def permute(w):
|
||||
return w.view(n_heads, dim // n_heads // 2, 2, dim).transpose(1, 2).reshape(dim, dim)
|
||||
|
||||
state_dict = {}
|
||||
for layer_i in range(n_layers):
|
||||
state_dict.update(
|
||||
{
|
||||
f"model.layers.{layer_i}.self_attn.q_proj.weight": permute(
|
||||
model[f"layers.{layer_i}.attention.wq.weight"]
|
||||
),
|
||||
f"model.layers.{layer_i}.self_attn.k_proj.weight": permute(
|
||||
model[f"layers.{layer_i}.attention.wk.weight"]
|
||||
),
|
||||
f"model.layers.{layer_i}.self_attn.v_proj.weight": model[
|
||||
f"layers.{layer_i}.attention.wv.weight"
|
||||
],
|
||||
f"model.layers.{layer_i}.self_attn.o_proj.weight": model[
|
||||
f"layers.{layer_i}.attention.wo.weight"
|
||||
],
|
||||
f"model.layers.{layer_i}.mlp.gate_proj.weight": model[
|
||||
f"layers.{layer_i}.feed_forward.w1.weight"
|
||||
],
|
||||
f"model.layers.{layer_i}.mlp.down_proj.weight": model[
|
||||
f"layers.{layer_i}.feed_forward.w2.weight"
|
||||
],
|
||||
f"model.layers.{layer_i}.mlp.up_proj.weight": model[
|
||||
f"layers.{layer_i}.feed_forward.w3.weight"
|
||||
],
|
||||
f"model.layers.{layer_i}.input_layernorm.weight": model[
|
||||
f"layers.{layer_i}.attention_norm.weight"
|
||||
],
|
||||
f"model.layers.{layer_i}.post_attention_layernorm.weight": model[
|
||||
f"layers.{layer_i}.ffn_norm.weight"
|
||||
],
|
||||
}
|
||||
)
|
||||
state_dict[f"model.layers.{layer_i}.self_attn.rotary_emb.inv_freq"] = inv_freq
|
||||
state_dict.update(
|
||||
{
|
||||
"model.embed_tokens.weight": model["tok_embeddings.weight"],
|
||||
"model.norm.weight": model["norm.weight"],
|
||||
"lm_head.weight": model["output.weight"],
|
||||
}
|
||||
)
|
||||
|
||||
return state_dict
|
||||
|
||||
|
||||
def chat(model, hparams, llama_dir):
|
||||
from transformers import (GenerationConfig, LlamaForCausalLM,
|
||||
LlamaTokenizer, StoppingCriteria,
|
||||
StoppingCriteriaList)
|
||||
from transformers.models.llama.configuration_llama import LlamaConfig
|
||||
|
||||
class StoppingCriteriaSub(StoppingCriteria):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, stops=[]):
|
||||
print(tokenizer.decode(input_ids[0]), end="", flush=True)
|
||||
if input_ids[0][-1] == 13:
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
config = LlamaConfig(
|
||||
vocab_size=hparams["vocab_size"],
|
||||
dim=hparams["dim"],
|
||||
num_hidden_layers=hparams["n_layers"],
|
||||
num_attention_heads=hparams["n_heads"],
|
||||
)
|
||||
|
||||
llama = LlamaForCausalLM(config=config)
|
||||
llama.load_state_dict(state_dict=model, strict=True)
|
||||
tokenizer = LlamaTokenizer.from_pretrained(llama_dir)
|
||||
|
||||
device = torch.device("cpu")
|
||||
llama = llama.to(device)
|
||||
|
||||
ctx = """You are AI.
|
||||
This is a dialog, where User interacts with AI. AI is helpful, kind, obedient, honest, respectful, direct, concise, should try to protect User's privacy, and knows its own limits. Also, AI must answer User and AI cannot stop the conversation by itself.
|
||||
User: Hello, AI.
|
||||
AI: Hello! How can I assist you today?
|
||||
"""
|
||||
print(ctx.rstrip("\n"))
|
||||
while True:
|
||||
print("-" * 60)
|
||||
prompt = input("User: ")
|
||||
if ctx != "":
|
||||
ctx = f"{ctx}User: {prompt}\n"
|
||||
else:
|
||||
ctx = f"{prompt}\nAI:"
|
||||
|
||||
ctx = (ctx[-1920:]) if len(ctx) >= 2048 else ctx
|
||||
|
||||
print("-" * 60)
|
||||
if len(ctx.strip()) > 0:
|
||||
input_ids = tokenizer(ctx, return_tensors="pt")["input_ids"].to(device)
|
||||
generation_config = GenerationConfig(
|
||||
temperature=0.8,
|
||||
top_p=0.95,
|
||||
top_k=50,
|
||||
repetition_penalty=1.1764,
|
||||
)
|
||||
with torch.no_grad():
|
||||
generation_output = llama.generate(
|
||||
input_ids=input_ids,
|
||||
generation_config=generation_config,
|
||||
return_dict_in_generate=True,
|
||||
output_scores=True,
|
||||
max_length=2048,
|
||||
do_sample=True,
|
||||
stopping_criteria=StoppingCriteriaList([StoppingCriteriaSub()]),
|
||||
)
|
||||
s = generation_output.sequences[0]
|
||||
decoded = tokenizer.decode(s)
|
||||
ctx = f"{decoded}\n"
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
"--input_dir", "-i", type=str, required=True, help="The input directory containing the ggml files."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--prefix",
|
||||
"-p",
|
||||
type=str,
|
||||
required=True,
|
||||
help="The prefix of the ggml files (ggml-model-f16 or ggml-model-q4_0).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--hf",
|
||||
action="store_true",
|
||||
help="Whether to save the model in the Hugging Face format. (default: False)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--chat", "-c", action="store_true", help="Whether to open a chat with the model. (default: False)"
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
llama_dir = os.path.abspath(f"{args.input_dir}/../")
|
||||
|
||||
ggml_files = sorted(
|
||||
[f"{args.input_dir}/{f}" for f in os.listdir(args.input_dir) if f.startswith(args.prefix)]
|
||||
)
|
||||
|
||||
fin = open(ggml_files[0], "rb")
|
||||
hparams, ftype = read_header(fin)
|
||||
tokens = read_tokens(fin, hparams["vocab_size"])
|
||||
model = read_variables(fin)
|
||||
|
||||
for f in tqdm(ggml_files[1:]):
|
||||
fin = open(f, "rb")
|
||||
read_header(fin)
|
||||
read_tokens(fin, hparams["vocab_size"])
|
||||
model.update(read_variables(fin))
|
||||
|
||||
if args.hf:
|
||||
model = convert_to_hf_format(model, hparams)
|
||||
|
||||
pth_ckpt = {
|
||||
"state_dict": model,
|
||||
"hparams": hparams,
|
||||
"tokens": tokens,
|
||||
}
|
||||
|
||||
torch.save(pth_ckpt, f"{args.input_dir}/{args.prefix}-to-torch.pth")
|
||||
|
||||
if args.chat:
|
||||
if not args.hf:
|
||||
model = convert_to_hf_format(model, hparams)
|
||||
chat(model, hparams, llama_dir)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -1,107 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
#
|
||||
# TODO: deduplicate GPT4All with convert-unversioned-ggml-to-ggml.py
|
||||
#
|
||||
|
||||
# Original by https://github.com/eiz
|
||||
# https://github.com/ggerganov/llama.cpp/issues/324#issuecomment-1476227818
|
||||
import argparse
|
||||
import glob
|
||||
import os
|
||||
import struct
|
||||
import sys
|
||||
from sentencepiece import SentencePieceProcessor
|
||||
|
||||
HPARAMS = keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
|
||||
|
||||
def parse_args():
|
||||
parser = argparse.ArgumentParser(description='Upgrade a GPT4All model to the current format')
|
||||
parser.add_argument('gpt4all_model', help='path to gpt4all-lora-quantized.bin')
|
||||
parser.add_argument('tokenizer_model', help='path to LLaMA tokenizer.model file')
|
||||
return parser.parse_args()
|
||||
|
||||
def read_header(f_in):
|
||||
struct_fmt = "i" * (3 + len(HPARAMS))
|
||||
struct_size = struct.calcsize(struct_fmt)
|
||||
buf = f_in.read(struct_size)
|
||||
return struct.unpack(struct_fmt, buf)
|
||||
|
||||
def write_header(f_out, header):
|
||||
(magic, vocab_size, dim, multiple_of, n_heads, n_layers, rot, ftype) = header
|
||||
|
||||
if magic != 0x67676d6c:
|
||||
raise Exception('Invalid file magic. Must be an old style ggml file.')
|
||||
|
||||
values = [
|
||||
0x67676d66, # magic: ggml in hex
|
||||
1, # file version
|
||||
vocab_size,
|
||||
dim,
|
||||
multiple_of,
|
||||
n_heads,
|
||||
n_layers,
|
||||
rot,
|
||||
ftype
|
||||
]
|
||||
f_out.write(struct.pack("i" * len(values), *values))
|
||||
|
||||
def write_tokens(fout, tokenizer):
|
||||
for i in range(tokenizer.vocab_size()):
|
||||
if tokenizer.is_unknown(i):
|
||||
text = " \u2047 ".encode()
|
||||
elif tokenizer.is_control(i):
|
||||
text = b""
|
||||
elif tokenizer.is_byte(i):
|
||||
piece = tokenizer.id_to_piece(i)
|
||||
if len(piece) != 6:
|
||||
print(f"Invalid token: {piece}")
|
||||
sys.exit(1)
|
||||
byte_value = int(piece[3:-1], 16)
|
||||
text = struct.pack("B", byte_value)
|
||||
else:
|
||||
text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode()
|
||||
fout.write(struct.pack("i", len(text)))
|
||||
fout.write(text)
|
||||
fout.write(struct.pack("f", tokenizer.get_score(i)))
|
||||
|
||||
# TODO: GPT4All - add extra <pad> token
|
||||
text = "<pad>".encode()
|
||||
fout.write(struct.pack("i", len(text)))
|
||||
fout.write(text)
|
||||
fout.write(struct.pack("f", 0.0))
|
||||
|
||||
def read_tokens(f_in, tokenizer):
|
||||
for i in range(tokenizer.vocab_size()):
|
||||
len_b = f_in.read(4)
|
||||
(length,) = struct.unpack("i", len_b)
|
||||
f_in.read(length)
|
||||
|
||||
def copy_all_data(f_out, f_in):
|
||||
while True:
|
||||
buf = f_in.read(1024 * 1024)
|
||||
if not buf:
|
||||
break
|
||||
f_out.write(buf)
|
||||
|
||||
def convert_one_file(path_in, tokenizer):
|
||||
path_tmp = f"{path_in}.tmp"
|
||||
path_orig= f"{path_in}.orig"
|
||||
print(f"converting {path_in}")
|
||||
with open(path_in, "rb") as f_in, open(path_tmp, "wb") as f_out:
|
||||
write_header(f_out, read_header(f_in))
|
||||
read_tokens(f_in, tokenizer)
|
||||
write_tokens(f_out, tokenizer)
|
||||
copy_all_data(f_out, f_in)
|
||||
os.rename(path_in, path_orig)
|
||||
os.rename(path_tmp, path_in)
|
||||
|
||||
def main():
|
||||
args = parse_args()
|
||||
|
||||
tokenizer = SentencePieceProcessor(args.tokenizer_model)
|
||||
|
||||
convert_one_file(args.gpt4all_model, tokenizer)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -1,172 +0,0 @@
|
||||
# Convert a GPTQ quantized LLaMA model to a ggml compatible file
|
||||
# Based on: https://github.com/qwopqwop200/GPTQ-for-LLaMa
|
||||
#
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import json
|
||||
import struct
|
||||
import numpy as np
|
||||
import torch
|
||||
from sentencepiece import SentencePieceProcessor
|
||||
|
||||
if len(sys.argv) != 4:
|
||||
print("Usage: convert-gptq-to-ggml.py llamaXXb-4bit.pt tokenizer.model out.bin\n")
|
||||
sys.exit(1)
|
||||
|
||||
fname_model = sys.argv[1]
|
||||
fname_tokenizer = sys.argv[2]
|
||||
dir_out = sys.argv[3]
|
||||
|
||||
model = torch.load(fname_model, map_location="cpu")
|
||||
|
||||
n_vocab, n_embd = model['model.embed_tokens.weight'].shape
|
||||
n_layer = 1 + max(int(m.group(1)) for name in model
|
||||
if (m := re.match(r'model\.layers\.([0-9]+)', name)))
|
||||
|
||||
# hardcoded:
|
||||
n_mult = 256
|
||||
n_head = {32: 32, 40: 40, 60: 52, 80: 64}[n_layer]
|
||||
|
||||
tokenizer = SentencePieceProcessor(fname_tokenizer)
|
||||
|
||||
assert tokenizer.vocab_size() == n_vocab
|
||||
|
||||
fname_out = sys.argv[3]
|
||||
|
||||
fout = open(fname_out, "wb")
|
||||
|
||||
fout.write(struct.pack("i", 0x67676d66)) # magic: ggmf in hex
|
||||
fout.write(struct.pack("i", 1)) # file version
|
||||
fout.write(struct.pack("i", n_vocab))
|
||||
fout.write(struct.pack("i", n_embd))
|
||||
fout.write(struct.pack("i", n_mult))
|
||||
fout.write(struct.pack("i", n_head))
|
||||
fout.write(struct.pack("i", n_layer))
|
||||
fout.write(struct.pack("i", n_embd // n_head)) # rot (obsolete)
|
||||
fout.write(struct.pack("i", 4))
|
||||
|
||||
|
||||
# This loop unchanged from convert-pth-to-ggml.py:
|
||||
for i in range(tokenizer.vocab_size()):
|
||||
if tokenizer.is_unknown(i):
|
||||
text = " \u2047 ".encode()
|
||||
elif tokenizer.is_control(i):
|
||||
text = b""
|
||||
elif tokenizer.is_byte(i):
|
||||
piece = tokenizer.id_to_piece(i)
|
||||
if len(piece) != 6:
|
||||
print(f"Invalid token: {piece}")
|
||||
sys.exit(1)
|
||||
byte_value = int(piece[3:-1], 16)
|
||||
text = struct.pack("B", byte_value)
|
||||
else:
|
||||
text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode()
|
||||
fout.write(struct.pack("i", len(text)))
|
||||
fout.write(text)
|
||||
fout.write(struct.pack("f", tokenizer.get_score(i)))
|
||||
|
||||
def write_header(shape, dst_name, ftype_cur):
|
||||
sname = dst_name.encode()
|
||||
fout.write(struct.pack("iii", len(shape), len(sname), ftype_cur))
|
||||
fout.write(struct.pack("i" * len(shape), *shape[::-1]))
|
||||
fout.write(sname)
|
||||
|
||||
# ensure tensor data is aligned
|
||||
tensor_data_offset = fout.tell()
|
||||
tensor_data_offset = (tensor_data_offset + 31) & -32
|
||||
fout.seek(tensor_data_offset)
|
||||
|
||||
def convert_non_q4(src_name, dst_name):
|
||||
v = model[src_name]
|
||||
shape = v.shape
|
||||
print(f"Processing non-Q4 variable: {src_name} with shape: {shape} and type: {v.dtype}")
|
||||
if len(shape) == 1:
|
||||
print(" Converting to float32")
|
||||
v = v.to(torch.float32)
|
||||
|
||||
ftype_cur = {torch.float16: 1, torch.float32: 0}[v.dtype]
|
||||
|
||||
# header
|
||||
write_header(shape, dst_name, ftype_cur)
|
||||
|
||||
# data
|
||||
v.numpy().tofile(fout)
|
||||
|
||||
def convert_q4(src_name, dst_name, permute=False):
|
||||
zeros = model[f"{src_name}.zeros"].numpy()
|
||||
scales = model[f"{src_name}.scales"].numpy()
|
||||
bias = model[f"{src_name}.bias"].numpy()
|
||||
qweight = model[f"{src_name}.qweight"].numpy().T # transpose
|
||||
|
||||
# Q4_1 does not support bias; good thing the bias is always all zeros.
|
||||
assert not np.any(bias)
|
||||
|
||||
# Each int32 item is actually 8 int4 items packed together, and it's transposed.
|
||||
shape = (qweight.shape[0], qweight.shape[1] * 8)
|
||||
|
||||
print(f"Processing Q4 variable: {src_name} with shape: {shape}")
|
||||
|
||||
# The output format has the int4 weights in groups of 32 rather than 8.
|
||||
# It looks like this:
|
||||
# For each row:
|
||||
# For each group of 32 columns:
|
||||
# - addend (float32, 4 bytes)
|
||||
# - scale (float32, 4 bytes)
|
||||
# - weights (int4 * 32, 16 bytes)
|
||||
# Note that in the input, the scales and addends are shared between all
|
||||
# the columns in a row, so we end up wasting quite a bit of memory with
|
||||
# repeated scales and addends.
|
||||
|
||||
addends = -zeros # flip sign
|
||||
|
||||
# Since the output format is mixed between integers and floats, we have
|
||||
# to hackily view the floats as int32s just so numpy will let us
|
||||
# concatenate them.
|
||||
addends_view = addends.view(dtype=np.int32)
|
||||
scales_view = scales.view(dtype=np.int32)
|
||||
|
||||
# Split into groups of 4 columns (i.e. 32 columns of quantized data):
|
||||
grouped = qweight.reshape([qweight.shape[0], qweight.shape[1] // 4, 4])
|
||||
|
||||
# Repeat addends and scales:
|
||||
addends_rep = np.atleast_3d(addends_view).repeat(grouped.shape[1], axis=1)
|
||||
scales_rep = np.atleast_3d(scales_view).repeat(grouped.shape[1], axis=1)
|
||||
|
||||
blob = np.concatenate([scales_rep, addends_rep, grouped], axis=2, casting='no')
|
||||
|
||||
if permute:
|
||||
# Permute some rows to undo the permutation done by convert_llama_weights_to_hf.py.
|
||||
# This can be done after the above conversion because it doesn't affect column order/layout.
|
||||
blob = (blob.reshape(n_head, 2, shape[0] // n_head // 2, *blob.shape[1:])
|
||||
.swapaxes(1, 2)
|
||||
.reshape(blob.shape))
|
||||
|
||||
# header
|
||||
write_header(shape, dst_name, 3) # ftype = Q4_1
|
||||
|
||||
# data
|
||||
blob.tofile(fout)
|
||||
|
||||
convert_non_q4("model.embed_tokens.weight", "tok_embeddings.weight")
|
||||
convert_non_q4("model.norm.weight", "norm.weight")
|
||||
convert_non_q4("lm_head.weight", "output.weight")
|
||||
|
||||
for i in range(n_layer):
|
||||
convert_q4(f"model.layers.{i}.self_attn.q_proj", f"layers.{i}.attention.wq.weight", permute=True)
|
||||
convert_q4(f"model.layers.{i}.self_attn.k_proj", f"layers.{i}.attention.wk.weight", permute=True)
|
||||
convert_q4(f"model.layers.{i}.self_attn.v_proj", f"layers.{i}.attention.wv.weight")
|
||||
convert_q4(f"model.layers.{i}.self_attn.o_proj", f"layers.{i}.attention.wo.weight")
|
||||
|
||||
convert_q4(f"model.layers.{i}.mlp.gate_proj", f"layers.{i}.feed_forward.w1.weight")
|
||||
convert_q4(f"model.layers.{i}.mlp.down_proj", f"layers.{i}.feed_forward.w2.weight")
|
||||
convert_q4(f"model.layers.{i}.mlp.up_proj", f"layers.{i}.feed_forward.w3.weight")
|
||||
|
||||
convert_non_q4(f"model.layers.{i}.input_layernorm.weight", f"layers.{i}.attention_norm.weight")
|
||||
convert_non_q4(f"model.layers.{i}.post_attention_layernorm.weight", f"layers.{i}.ffn_norm.weight")
|
||||
|
||||
|
||||
fout.close()
|
||||
|
||||
print(f"Done. Output file: {fname_out}")
|
||||
print()
|
||||
124
convert-lora-to-ggml.py
Normal file
124
convert-lora-to-ggml.py
Normal file
@@ -0,0 +1,124 @@
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import struct
|
||||
import sys
|
||||
from typing import Any, Dict, Sequence, TextIO
|
||||
|
||||
import torch
|
||||
|
||||
from convert import DATA_TYPE_TO_FTYPE, NUMPY_TYPE_TO_DATA_TYPE, DataType
|
||||
|
||||
HF_SUBLAYER_TO_GGML = {
|
||||
"self_attn.q_proj": "attention.wq",
|
||||
"self_attn.k_proj": "attention.wk",
|
||||
"self_attn.v_proj": "attention.wv",
|
||||
"self_attn.o_proj": "attention.wo",
|
||||
"mlp.gate_proj": "feed_forward.w1",
|
||||
"mlp.down_proj": "feed_forward.w2",
|
||||
"mlp.up_proj": "feed_forward.w3",
|
||||
"input_layernorm": "attention_norm",
|
||||
"post_attention_layernorm": "ffn_norm",
|
||||
# "norm": "norm",
|
||||
# "embed_tokens": "tok_embeddings",
|
||||
# "lm_head": "output",
|
||||
}
|
||||
|
||||
|
||||
def translate_tensor_name(t: str) -> str:
|
||||
match = re.match(r".*layers\.(\d+)\.(\w+\.\w+)\.lora_(A|B)\.weight", t)
|
||||
if match:
|
||||
nn = match.group(1)
|
||||
sub_layer = match.group(2)
|
||||
lora_type = match.group(3)
|
||||
|
||||
sub_layer_renamed = HF_SUBLAYER_TO_GGML.get(sub_layer)
|
||||
if sub_layer_renamed is None:
|
||||
print(f"Error: unrecognized sub-layer {sub_layer} in tensor {t}")
|
||||
sys.exit(1)
|
||||
|
||||
output_string = (
|
||||
f"layers.{nn}.{HF_SUBLAYER_TO_GGML[sub_layer]}.weight.lora{lora_type}"
|
||||
)
|
||||
return output_string
|
||||
else:
|
||||
print(f"Error: unrecognized tensor {t}")
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
def write_file_header(fout: TextIO, params: Dict[str, Any]) -> None:
|
||||
fout.write(b"ggla"[::-1]) # magic (ggml lora)
|
||||
fout.write(struct.pack("i", 1)) # file version
|
||||
fout.write(struct.pack("ii", params["r"], params["lora_alpha"]))
|
||||
|
||||
|
||||
def write_tensor_header(
|
||||
self, name: str, shape: Sequence[int], data_type: DataType
|
||||
) -> None:
|
||||
sname = name.encode("utf-8")
|
||||
fout.write(
|
||||
struct.pack(
|
||||
"iii",
|
||||
len(shape),
|
||||
len(sname),
|
||||
DATA_TYPE_TO_FTYPE[NUMPY_TYPE_TO_DATA_TYPE[data_type]],
|
||||
)
|
||||
)
|
||||
fout.write(struct.pack("i" * len(shape), *shape[::-1]))
|
||||
fout.write(sname)
|
||||
fout.seek((fout.tell() + 31) & -32)
|
||||
|
||||
|
||||
if len(sys.argv) != 2:
|
||||
print(f"Usage: python {sys.argv[0]} <path>")
|
||||
print(
|
||||
"Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'"
|
||||
)
|
||||
sys.exit(1)
|
||||
|
||||
input_json = os.path.join(sys.argv[1], "adapter_config.json")
|
||||
input_model = os.path.join(sys.argv[1], "adapter_model.bin")
|
||||
output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin")
|
||||
|
||||
model = torch.load(input_model, map_location="cpu")
|
||||
|
||||
with open(input_json, "r") as f:
|
||||
params = json.load(f)
|
||||
|
||||
if params["peft_type"] != "LORA":
|
||||
print(f"Error: unsupported adapter type {params['peft_type']}, expected LORA")
|
||||
sys.exit(1)
|
||||
|
||||
if params["fan_in_fan_out"] == True:
|
||||
print("Error: param fan_in_fan_out is not supported")
|
||||
sys.exit(1)
|
||||
|
||||
if params["bias"] is not None and params["bias"] != "none":
|
||||
print("Error: param bias is not supported")
|
||||
sys.exit(1)
|
||||
|
||||
# TODO: these seem to be layers that have been trained but without lora.
|
||||
# doesn't seem widely used but eventually should be supported
|
||||
if params["modules_to_save"] is not None and len(params["modules_to_save"]) > 0:
|
||||
print("Error: param modules_to_save is not supported")
|
||||
sys.exit(1)
|
||||
|
||||
with open(output_path, "wb") as fout:
|
||||
fout.truncate()
|
||||
|
||||
write_file_header(fout, params)
|
||||
for k, v in model.items():
|
||||
if k.endswith("lora_A.weight"):
|
||||
if v.dtype != torch.float16 and v.dtype != torch.float32:
|
||||
v = v.float()
|
||||
v = v.T
|
||||
else:
|
||||
v = v.float()
|
||||
|
||||
t = v.numpy()
|
||||
tname = translate_tensor_name(k)
|
||||
print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB")
|
||||
write_tensor_header(fout, tname, t.shape, t.dtype)
|
||||
t.tofile(fout)
|
||||
|
||||
print(f"Converted {input_json} and {input_model} to {output_path}")
|
||||
@@ -1,274 +1,11 @@
|
||||
# Convert a LLaMA model checkpoint to a ggjt compatible file
|
||||
#
|
||||
# Load the model using Torch
|
||||
# Iterate over all variables and write them to a binary file.
|
||||
#
|
||||
# For each variable, write the following:
|
||||
# - Number of dimensions (int)
|
||||
# - Name length (int)
|
||||
# - Dimensions (int[n_dims])
|
||||
# - Name (char[name_length])
|
||||
# - Data (float[n_dims])
|
||||
#
|
||||
# At the start of the ggml file we write the model parameters
|
||||
# and vocabulary.
|
||||
#
|
||||
# Compatibility stub
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import struct
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
from sentencepiece import SentencePieceProcessor
|
||||
import convert
|
||||
|
||||
QK = 32
|
||||
|
||||
GGML_TYPE_Q4_0 = 0
|
||||
GGML_TYPE_Q4_1 = 1
|
||||
GGML_TYPE_I8 = 2
|
||||
GGML_TYPE_I16 = 3
|
||||
GGML_TYPE_I32 = 4
|
||||
GGML_TYPE_F16 = 5
|
||||
GGML_TYPE_F32 = 6
|
||||
|
||||
WTYPES = {
|
||||
0: GGML_TYPE_F32,
|
||||
1: GGML_TYPE_F16,
|
||||
2: GGML_TYPE_Q4_0,
|
||||
3: GGML_TYPE_Q4_1,
|
||||
}
|
||||
|
||||
GGML_BLCK_SIZE = {
|
||||
GGML_TYPE_Q4_0: QK,
|
||||
GGML_TYPE_Q4_1: QK,
|
||||
GGML_TYPE_I8: 1,
|
||||
GGML_TYPE_I16: 1,
|
||||
GGML_TYPE_I32: 1,
|
||||
GGML_TYPE_F16: 1,
|
||||
GGML_TYPE_F32: 1,
|
||||
}
|
||||
|
||||
GGML_TYPE_SIZE = {
|
||||
GGML_TYPE_Q4_0: 4 + QK//2,
|
||||
GGML_TYPE_Q4_1: 4*2 + QK//2,
|
||||
GGML_TYPE_I8: 1,
|
||||
GGML_TYPE_I16: 2,
|
||||
GGML_TYPE_I32: 4,
|
||||
GGML_TYPE_F16: 2,
|
||||
GGML_TYPE_F32: 4,
|
||||
}
|
||||
|
||||
def ggml_nelements(shape):
|
||||
r = 1
|
||||
for i in shape:
|
||||
r *= i
|
||||
return r
|
||||
|
||||
def ggml_nbytes(shape, ftype):
|
||||
x = ggml_nelements(shape)
|
||||
t = WTYPES[ftype]
|
||||
x *= GGML_TYPE_SIZE[t]
|
||||
x //= GGML_BLCK_SIZE[t]
|
||||
return x
|
||||
|
||||
def parse_args():
|
||||
parser = argparse.ArgumentParser(description='Convert a LLaMA model checkpoint to a ggml compatible file')
|
||||
parser.add_argument('dir_model', help='directory containing the model checkpoint')
|
||||
parser.add_argument('ftype', help='file type (0: float32, 1: float16)', type=int, choices=[0, 1], default=1)
|
||||
parser.add_argument('vocab_only', help='only write vocab to file', type=int, default=0, nargs='?')
|
||||
return parser.parse_args()
|
||||
|
||||
def get_n_parts(dim):
|
||||
mappings = {4096: 1, 5120: 2, 6656: 4, 8192: 8}
|
||||
n_parts = mappings.get(dim)
|
||||
if n_parts is None:
|
||||
print(f"Invalid dim: {dim}")
|
||||
sys.exit(1)
|
||||
|
||||
print(f"n_parts = {n_parts}\n")
|
||||
return n_parts
|
||||
|
||||
def load_hparams_and_tokenizer(dir_model):
|
||||
# `dir_model` is something like `models/7B` or `models/7B/`.
|
||||
# "tokenizer.model" is expected under model's parent dir.
|
||||
# When `dir_model` is a symlink, f"{dir_model}/../tokenizer.model" would not be found.
|
||||
# Let's use the model's parent dir directly.
|
||||
model_parent_dir = os.path.dirname(os.path.normpath(dir_model))
|
||||
fname_hparams = f"{dir_model}/params.json"
|
||||
fname_tokenizer = f"{model_parent_dir}/tokenizer.model"
|
||||
with open(fname_hparams, "r") as f:
|
||||
hparams = json.load(f)
|
||||
print(hparams)
|
||||
tokenizer = SentencePieceProcessor(fname_tokenizer)
|
||||
hparams.update({"vocab_size": tokenizer.vocab_size()})
|
||||
return hparams, tokenizer
|
||||
|
||||
def write_header(fout, hparams, ftype):
|
||||
keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
|
||||
values = [
|
||||
0x67676a74, # magic: ggjt in hex
|
||||
1, # file version
|
||||
*[hparams[key] for key in keys],
|
||||
hparams["dim"] // hparams["n_heads"], # rot (obsolete)
|
||||
ftype
|
||||
]
|
||||
fout.write(struct.pack("i" * len(values), *values))
|
||||
|
||||
def write_tokens(fout, tokenizer):
|
||||
for i in range(tokenizer.vocab_size()):
|
||||
if tokenizer.is_unknown(i):
|
||||
text = " \u2047 ".encode()
|
||||
elif tokenizer.is_control(i):
|
||||
text = b""
|
||||
elif tokenizer.is_byte(i):
|
||||
piece = tokenizer.id_to_piece(i)
|
||||
if len(piece) != 6:
|
||||
print(f"Invalid token: {piece}")
|
||||
sys.exit(1)
|
||||
byte_value = int(piece[3:-1], 16)
|
||||
text = struct.pack("B", byte_value)
|
||||
else:
|
||||
text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode()
|
||||
fout.write(struct.pack("i", len(text)))
|
||||
fout.write(text)
|
||||
fout.write(struct.pack("f", tokenizer.get_score(i)))
|
||||
|
||||
def process_and_write_variables(fout, model, ftype, part_id, n_parts):
|
||||
for name, datao in model.items():
|
||||
if name.endswith("freqs"):
|
||||
continue
|
||||
|
||||
# remove dimensions with a single element
|
||||
data = datao.numpy().squeeze()
|
||||
partshape = data.shape
|
||||
n_dims = len(data.shape)
|
||||
assert n_dims in (1, 2)
|
||||
|
||||
print(f"Processing variable: {name} with shape: {partshape} and type: {datao.dtype}")
|
||||
|
||||
# coerce single-dimensional tensors from float16 to float32
|
||||
ftype_cur = 1
|
||||
if ftype == 0 or n_dims == 1:
|
||||
print(" Converting to float32")
|
||||
data = data.astype(np.float32)
|
||||
ftype_cur = 0
|
||||
blck_size = GGML_BLCK_SIZE[WTYPES[ftype_cur]]
|
||||
type_size = GGML_TYPE_SIZE[WTYPES[ftype_cur]]
|
||||
|
||||
# determine dimension along which multipart tensor is sharded
|
||||
#
|
||||
# split_dim 0 regex:
|
||||
# - output.*
|
||||
# - layers.*.attention.wq.weight
|
||||
# - layers.*.attention.wk.weight
|
||||
# - layers.*.attention.wv.weight
|
||||
# - layers.*.feed_forward.w1.weight
|
||||
# - layers.*.feed_forward.w3.weight
|
||||
#
|
||||
# split_dim 1 regex:
|
||||
# - tok_embeddings.*
|
||||
# - layers.*.attention.wo.weight
|
||||
# - layers.*.feed_forward.w2.weight
|
||||
#
|
||||
if n_dims > 1:
|
||||
split_dim = 1
|
||||
if "tok_embeddings" in name:
|
||||
split_dim = 1
|
||||
elif "layers" in name:
|
||||
if "attention.wo.weight" in name:
|
||||
split_dim = 1
|
||||
elif "feed_forward.w2.weight" in name:
|
||||
split_dim = 1
|
||||
else:
|
||||
split_dim = 0
|
||||
elif "output" in name:
|
||||
split_dim = 0
|
||||
|
||||
# output tensor header
|
||||
fullshape = list(partshape)
|
||||
if n_dims > 1:
|
||||
fullshape[split_dim] *= n_parts
|
||||
sname = name.encode()
|
||||
fout.write(struct.pack("iii", n_dims, len(sname), ftype_cur))
|
||||
for dim in reversed(fullshape):
|
||||
fout.write(struct.pack("i", dim))
|
||||
fout.write(sname)
|
||||
|
||||
# ensure tensor data is aligned
|
||||
tensor_data_offset = fout.tell()
|
||||
while tensor_data_offset % QK != 0:
|
||||
fout.write(struct.pack("B", 0))
|
||||
tensor_data_offset += 1
|
||||
|
||||
# output unified mappable tensor data
|
||||
if n_dims == 1 or n_parts == 1:
|
||||
# copy tensor which we thankfully received in one piece
|
||||
if part_id == 0:
|
||||
data.tofile(fout)
|
||||
elif split_dim == 0:
|
||||
# reassemble multifile tensor containing some of the rows
|
||||
rows_per_chunk = partshape[0]
|
||||
current_row = part_id * rows_per_chunk
|
||||
bytes_per_row = fullshape[1] // blck_size * type_size
|
||||
offset = current_row * bytes_per_row
|
||||
fout.seek(tensor_data_offset + offset)
|
||||
data.tofile(fout)
|
||||
elif split_dim == 1:
|
||||
# reassemble multifile tensor containing some of the cols
|
||||
cols_per_chunk = partshape[1]
|
||||
current_col = part_id * cols_per_chunk
|
||||
bytes_per_row = fullshape[1] // blck_size * type_size
|
||||
offset_current_col = current_col // blck_size * type_size
|
||||
for row in range(partshape[0]):
|
||||
offset_row = row * bytes_per_row
|
||||
offset = offset_row + offset_current_col
|
||||
fout.seek(tensor_data_offset + offset)
|
||||
data[row].tofile(fout)
|
||||
|
||||
# advance file position to next tensor
|
||||
fout.seek(tensor_data_offset + ggml_nbytes(fullshape, ftype_cur))
|
||||
|
||||
def main():
|
||||
args = parse_args()
|
||||
dir_model = args.dir_model
|
||||
ftype = args.ftype
|
||||
ftype_str = ["f32", "f16"]
|
||||
hparams, tokenizer = load_hparams_and_tokenizer(dir_model)
|
||||
|
||||
print(args)
|
||||
|
||||
# if only writing vocab to file
|
||||
if args.vocab_only:
|
||||
fname_model = f"{dir_model}/consolidated.00.pth"
|
||||
fname_out = f"{dir_model}/ggml-vocab.bin"
|
||||
print(f"Extracting only the vocab from '{fname_model}'\n")
|
||||
with open(fname_out, "wb") as fout:
|
||||
write_header(fout, hparams, ftype)
|
||||
write_tokens(fout, tokenizer)
|
||||
print(f"Done. Output file: {fname_out}\n")
|
||||
return
|
||||
|
||||
n_parts = get_n_parts(hparams["dim"])
|
||||
fname_out = f"{dir_model}/ggml-model-{ftype_str[ftype]}.bin"
|
||||
|
||||
# we output a single file for ggml
|
||||
with open(fname_out, "wb") as fout:
|
||||
write_header(fout, hparams, ftype)
|
||||
write_tokens(fout, tokenizer)
|
||||
offset_of_tensors = fout.tell()
|
||||
# the tensors we load could be split across multiple files
|
||||
for part_id in range(n_parts):
|
||||
fout.seek(offset_of_tensors)
|
||||
print(f"Processing part {part_id+1} of {n_parts}\n")
|
||||
fname_model = f"{dir_model}/consolidated.0{part_id}.pth"
|
||||
model = torch.load(fname_model, map_location="cpu")
|
||||
process_and_write_variables(fout, model, ftype, part_id, n_parts)
|
||||
del model
|
||||
|
||||
print(f"Done. Output file: {fname_out}\n")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
parser = argparse.ArgumentParser(description='Convert a LLaMA model checkpoint to a ggml compatible file')
|
||||
parser.add_argument('dir_model', help='directory containing the model checkpoint')
|
||||
parser.add_argument('ftype', help='file type (0: float32, 1: float16)', type=int, choices=[0, 1], default=1)
|
||||
args = parser.parse_args()
|
||||
convert.main(['--outtype', 'f16' if args.ftype == 1 else 'f32', '--', args.dir_model])
|
||||
|
||||
@@ -1,100 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
# Original by https://github.com/eiz
|
||||
# https://github.com/ggerganov/llama.cpp/issues/324#issuecomment-1476227818
|
||||
import argparse
|
||||
import glob
|
||||
import os
|
||||
import struct
|
||||
import sys
|
||||
from sentencepiece import SentencePieceProcessor
|
||||
|
||||
HPARAMS = keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
|
||||
|
||||
def parse_args():
|
||||
parser = argparse.ArgumentParser(description='Upgrade old ggml model files to the current format')
|
||||
parser.add_argument('dir_model', help='directory containing ggml .bin files')
|
||||
parser.add_argument('tokenizer_model', help='path to LLaMA tokenizer.model file')
|
||||
return parser.parse_args()
|
||||
|
||||
def read_header(f_in):
|
||||
struct_fmt = "i" * (3 + len(HPARAMS))
|
||||
struct_size = struct.calcsize(struct_fmt)
|
||||
buf = f_in.read(struct_size)
|
||||
return struct.unpack(struct_fmt, buf)
|
||||
|
||||
def write_header(f_out, header):
|
||||
(magic, vocab_size, dim, multiple_of, n_heads, n_layers, rot, ftype) = header
|
||||
|
||||
if magic != 0x67676d6c:
|
||||
raise Exception('Invalid file magic. Must be an old style ggml file.')
|
||||
|
||||
values = [
|
||||
0x67676d66, # magic: ggml in hex
|
||||
1, # file version
|
||||
vocab_size,
|
||||
dim,
|
||||
multiple_of,
|
||||
n_heads,
|
||||
n_layers,
|
||||
rot,
|
||||
ftype
|
||||
]
|
||||
f_out.write(struct.pack("i" * len(values), *values))
|
||||
|
||||
def write_tokens(fout, tokenizer):
|
||||
for i in range(tokenizer.vocab_size()):
|
||||
if tokenizer.is_unknown(i):
|
||||
text = " \u2047 ".encode()
|
||||
elif tokenizer.is_control(i):
|
||||
text = b""
|
||||
elif tokenizer.is_byte(i):
|
||||
piece = tokenizer.id_to_piece(i)
|
||||
if len(piece) != 6:
|
||||
print(f"Invalid token: {piece}")
|
||||
sys.exit(1)
|
||||
byte_value = int(piece[3:-1], 16)
|
||||
text = struct.pack("B", byte_value)
|
||||
else:
|
||||
text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode()
|
||||
fout.write(struct.pack("i", len(text)))
|
||||
fout.write(text)
|
||||
fout.write(struct.pack("f", tokenizer.get_score(i)))
|
||||
|
||||
def read_tokens(f_in, tokenizer):
|
||||
for i in range(tokenizer.vocab_size()):
|
||||
len_b = f_in.read(4)
|
||||
(length,) = struct.unpack("i", len_b)
|
||||
f_in.read(length)
|
||||
|
||||
def copy_all_data(f_out, f_in):
|
||||
while True:
|
||||
buf = f_in.read(1024 * 1024)
|
||||
if not buf:
|
||||
break
|
||||
f_out.write(buf)
|
||||
|
||||
def convert_one_file(path_in, tokenizer):
|
||||
path_tmp = f"{path_in}.tmp"
|
||||
path_orig= f"{path_in}.orig"
|
||||
print(f"converting {path_in}")
|
||||
with open(path_in, "rb") as f_in, open(path_tmp, "wb") as f_out:
|
||||
write_header(f_out, read_header(f_in))
|
||||
read_tokens(f_in, tokenizer)
|
||||
write_tokens(f_out, tokenizer)
|
||||
copy_all_data(f_out, f_in)
|
||||
os.rename(path_in, path_orig)
|
||||
os.rename(path_tmp, path_in)
|
||||
|
||||
def main():
|
||||
args = parse_args()
|
||||
files = []
|
||||
files.extend(glob.glob(f"{args.dir_model}/*.bin"))
|
||||
files.extend(glob.glob(f"{args.dir_model}/*.bin.*"))
|
||||
|
||||
tokenizer = SentencePieceProcessor(args.tokenizer_model)
|
||||
|
||||
for file in files:
|
||||
convert_one_file(file, tokenizer)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
1149
convert.py
Normal file
1149
convert.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -24,7 +24,7 @@
|
||||
|
||||
float tensor_sum_elements(struct ggml_tensor * tensor) {
|
||||
float sum = 0;
|
||||
if (tensor->type==6) {
|
||||
if (tensor->type==GGML_TYPE_F32) {
|
||||
for (int j = 0; j < tensor->ne[1]; j++) {
|
||||
for (int k = 0; k < tensor->ne[0]; k++) {
|
||||
sum += ((float *) tensor->data)[j*tensor->ne[0]+k];
|
||||
|
||||
@@ -139,6 +139,19 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
break;
|
||||
}
|
||||
params.model = argv[i];
|
||||
} else if (arg == "--lora") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.lora_adapter = argv[i];
|
||||
params.use_mmap = false;
|
||||
} else if (arg == "--lora-base") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.lora_base = argv[i];
|
||||
} else if (arg == "-i" || arg == "--interactive") {
|
||||
params.interactive = true;
|
||||
} else if (arg == "--embedding") {
|
||||
@@ -242,6 +255,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||
}
|
||||
fprintf(stderr, " --mtest compute maximum memory usage\n");
|
||||
fprintf(stderr, " --verbose-prompt print prompt before generation\n");
|
||||
fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
|
||||
fprintf(stderr, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
|
||||
fprintf(stderr, " -m FNAME, --model FNAME\n");
|
||||
fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
|
||||
fprintf(stderr, "\n");
|
||||
|
||||
@@ -31,11 +31,12 @@ struct gpt_params {
|
||||
|
||||
std::string model = "models/lamma-7B/ggml-model.bin"; // model path
|
||||
std::string prompt = "";
|
||||
std::string input_prefix = ""; // string to prefix user inputs with
|
||||
|
||||
|
||||
std::string input_prefix = ""; // string to prefix user inputs with
|
||||
std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
|
||||
|
||||
std::string lora_adapter = ""; // lora adapter path
|
||||
std::string lora_base = ""; // base model path for the lora adapter
|
||||
|
||||
bool memory_f16 = true; // use f16 instead of f32 for memory kv
|
||||
bool random_prompt = false; // do not randomize prompt if none provided
|
||||
bool use_color = false; // use color to distinguish generations and inputs
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include <ctime>
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
gpt_params params;
|
||||
params.model = "models/llama-7B/ggml-model.bin";
|
||||
|
||||
@@ -11,6 +11,7 @@
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <ctime>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
@@ -113,6 +114,17 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
}
|
||||
|
||||
if (!params.lora_adapter.empty()) {
|
||||
int err = llama_apply_lora_from_file(ctx,
|
||||
params.lora_adapter.c_str(),
|
||||
params.lora_base.empty() ? NULL : params.lora_base.c_str(),
|
||||
params.n_threads);
|
||||
if (err != 0) {
|
||||
fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
// print system information
|
||||
{
|
||||
fprintf(stderr, "\n");
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
#include "llama.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <ctime>
|
||||
|
||||
std::vector<float> softmax(const std::vector<float>& logits) {
|
||||
std::vector<float> probs(logits.size());
|
||||
@@ -27,20 +28,27 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
|
||||
|
||||
int count = 0;
|
||||
int seq_count = tokens.size() / params.n_ctx;
|
||||
int n_vocab = llama_n_vocab(ctx);
|
||||
|
||||
double nll = 0.0;
|
||||
|
||||
fprintf(stderr, "%s : calculating perplexity over %d chunks\n", __func__, seq_count);
|
||||
fprintf(stderr, "%s : calculating perplexity over %d chunks, batch_size=%d\n", __func__, seq_count, params.n_batch);
|
||||
|
||||
for (int i = 0; i < seq_count; ++i) {
|
||||
int start = i * params.n_ctx;
|
||||
int end = start + params.n_ctx - 1; // TODO: this is not optimal, e.g. it makes the batch 511 instead of 512
|
||||
// it is better to always be power of 2 for better performance
|
||||
std::vector<llama_token> embd(tokens.begin() + start, tokens.begin() + end);
|
||||
int end = start + params.n_ctx;
|
||||
|
||||
std::vector<float> logits;
|
||||
int num_batches = (params.n_ctx + params.n_batch - 1) / params.n_batch;
|
||||
auto start_t = std::chrono::high_resolution_clock::now();
|
||||
if (llama_eval(ctx, embd.data(), embd.size(), 0, params.n_threads)) {
|
||||
fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||
return;
|
||||
for (int j = 0; j < num_batches; ++j) {
|
||||
int batch_start = start + j * params.n_batch;
|
||||
int batch_size = std::min(end - batch_start, params.n_batch);
|
||||
if (llama_eval(ctx, tokens.data() + batch_start, batch_size, j * params.n_batch, params.n_threads)) {
|
||||
fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||
return;
|
||||
}
|
||||
auto batch_logits = llama_get_logits(ctx);
|
||||
logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
|
||||
}
|
||||
auto end_t = std::chrono::high_resolution_clock::now();
|
||||
if (i == 0) {
|
||||
@@ -59,15 +67,12 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
|
||||
// Example, we have a context window of 512, we will compute perplexity for each of the
|
||||
// last 256 tokens. Then, we split the input up into context window size chunks to
|
||||
// process the entire prompt.
|
||||
|
||||
auto logits = llama_get_logits(ctx);
|
||||
for (int j = params.n_ctx / 2; j < params.n_ctx - 1; ++j) {
|
||||
for (int j = std::min(512, params.n_ctx / 2); j < params.n_ctx - 1; ++j) {
|
||||
// Calculate probability of next token, given the previous ones.
|
||||
int n_vocab = llama_n_vocab(ctx);
|
||||
std::vector<float> tok_logits(
|
||||
logits + j * n_vocab,
|
||||
logits + (j + 1) * n_vocab);
|
||||
const float prob = softmax(tok_logits)[tokens[start + j + 1]];
|
||||
logits.begin() + j * n_vocab,
|
||||
logits.begin() + (j + 1) * n_vocab);
|
||||
float prob = softmax(tok_logits)[tokens[start + j + 1]];
|
||||
nll += -std::log(prob);
|
||||
++count;
|
||||
}
|
||||
@@ -82,11 +87,13 @@ int main(int argc, char ** argv) {
|
||||
gpt_params params;
|
||||
params.model = "models/llama-7B/ggml-model.bin";
|
||||
|
||||
params.n_batch = 512;
|
||||
if (gpt_params_parse(argc, argv, params) == false) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
params.perplexity = true;
|
||||
params.n_batch = std::min(params.n_batch, params.n_ctx);
|
||||
|
||||
if (params.n_ctx > 2048) {
|
||||
fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
|
||||
@@ -127,6 +134,17 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
}
|
||||
|
||||
if (!params.lora_adapter.empty()) {
|
||||
int err = llama_apply_lora_from_file(ctx,
|
||||
params.lora_adapter.c_str(),
|
||||
params.lora_base.empty() ? NULL : params.lora_base.c_str(),
|
||||
params.n_threads);
|
||||
if (err != 0) {
|
||||
fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
// print system information
|
||||
{
|
||||
fprintf(stderr, "\n");
|
||||
|
||||
@@ -16,9 +16,6 @@
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
static const char * type_strs[] = { "q4_0", "q4_1", "i8", "i16", "i32", "f16", "f32" };
|
||||
static_assert(sizeof(type_strs) == GGML_TYPE_COUNT * sizeof(char *), "Incomplete type list");
|
||||
|
||||
struct quantize_stats_params {
|
||||
std::string model = "models/7B/ggml-model-f16.bin";
|
||||
bool verbose = false;
|
||||
@@ -224,7 +221,7 @@ int main(int argc, char ** argv) {
|
||||
break;
|
||||
}
|
||||
int j;
|
||||
for (j = 0; j < GGML_TYPE_COUNT && strcmp(argv[i], type_strs[j]) != 0; j++) {
|
||||
for (j = 0; j < GGML_TYPE_COUNT && strcmp(argv[i], ggml_type_name((ggml_type) j)) != 0; j++) {
|
||||
// find match
|
||||
}
|
||||
if (j < GGML_TYPE_COUNT) {
|
||||
@@ -279,7 +276,7 @@ int main(int argc, char ** argv) {
|
||||
continue;
|
||||
}
|
||||
if (params.verbose) {
|
||||
printf("%s: type %s, size %" PRId64 "\n", kv_tensor.first.c_str(), type_strs[kv_tensor.second->type], ggml_nelements(kv_tensor.second));
|
||||
printf("%s: type %s, size %" PRId64 "\n", kv_tensor.first.c_str(), ggml_type_name(kv_tensor.second->type), ggml_nelements(kv_tensor.second));
|
||||
}
|
||||
if (kv_tensor.second->type == GGML_TYPE_F16) {
|
||||
is_f16 = true;
|
||||
@@ -304,13 +301,14 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// loop throught quantization types
|
||||
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
|
||||
const ggml_type type = (ggml_type) i;
|
||||
if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
|
||||
continue;
|
||||
}
|
||||
quantize_fns_t qfns = ggml_internal_get_quantize_fn(i);
|
||||
if (qfns.quantize_row_q && qfns.dequantize_row_q) {
|
||||
if (params.verbose) {
|
||||
printf("testing %s ...\n", type_strs[i]);
|
||||
printf("testing %s ...\n", ggml_type_name(type));
|
||||
}
|
||||
|
||||
error_stats global_stats {};
|
||||
@@ -322,7 +320,7 @@ int main(int argc, char ** argv) {
|
||||
if (params.verbose) {
|
||||
printf(" %s ...\n", kv_tensor.first.c_str());
|
||||
}
|
||||
std::string layer_name { type_strs[i] };
|
||||
std::string layer_name { ggml_type_name(type) };
|
||||
layer_name += "::" + kv_tensor.first;
|
||||
test_roundtrip_on_layer(
|
||||
layer_name,
|
||||
@@ -337,7 +335,7 @@ int main(int argc, char ** argv) {
|
||||
);
|
||||
}
|
||||
|
||||
print_error_stats(type_strs[i], global_stats, params.print_histogram);
|
||||
print_error_stats(ggml_type_name(type), global_stats, params.print_histogram);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -14,6 +14,7 @@ int main(int argc, char ** argv) {
|
||||
fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
|
||||
fprintf(stderr, " type = %d - q4_0\n", LLAMA_FTYPE_MOSTLY_Q4_0);
|
||||
fprintf(stderr, " type = %d - q4_1\n", LLAMA_FTYPE_MOSTLY_Q4_1);
|
||||
fprintf(stderr, " type = %d - q4_2\n", LLAMA_FTYPE_MOSTLY_Q4_2);
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
||||
@@ -10,7 +10,6 @@
|
||||
inherit system;
|
||||
};
|
||||
llama-python = pkgs.python310.withPackages (ps: with ps; [
|
||||
torch
|
||||
numpy
|
||||
sentencepiece
|
||||
]);
|
||||
|
||||
32
ggml.h
32
ggml.h
@@ -204,6 +204,8 @@ enum ggml_type {
|
||||
GGML_TYPE_F16 = 1,
|
||||
GGML_TYPE_Q4_0 = 2,
|
||||
GGML_TYPE_Q4_1 = 3,
|
||||
GGML_TYPE_Q4_2 = 4,
|
||||
GGML_TYPE_Q8_0 = 5,
|
||||
GGML_TYPE_I8,
|
||||
GGML_TYPE_I16,
|
||||
GGML_TYPE_I32,
|
||||
@@ -253,6 +255,9 @@ enum ggml_op {
|
||||
GGML_OP_FLASH_ATTN,
|
||||
GGML_OP_FLASH_FF,
|
||||
|
||||
GGML_OP_MAP_UNARY,
|
||||
GGML_OP_MAP_BINARY,
|
||||
|
||||
GGML_OP_COUNT,
|
||||
};
|
||||
|
||||
@@ -351,6 +356,8 @@ int ggml_blck_size (enum ggml_type type);
|
||||
size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
|
||||
float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
|
||||
|
||||
const char * ggml_type_name(enum ggml_type type);
|
||||
|
||||
size_t ggml_element_size(const struct ggml_tensor * tensor);
|
||||
|
||||
struct ggml_context * ggml_init(struct ggml_init_params params);
|
||||
@@ -424,6 +431,12 @@ struct ggml_tensor * ggml_add(
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b);
|
||||
|
||||
|
||||
struct ggml_tensor * ggml_add_inplace(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b);
|
||||
|
||||
struct ggml_tensor * ggml_sub(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
@@ -652,6 +665,21 @@ struct ggml_tensor * ggml_flash_ff(
|
||||
struct ggml_tensor * c0,
|
||||
struct ggml_tensor * c1);
|
||||
|
||||
// Mapping operations
|
||||
typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
|
||||
typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
|
||||
|
||||
struct ggml_tensor * ggml_map_unary_f32(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
const ggml_unary_op_f32_t fun);
|
||||
|
||||
struct ggml_tensor * ggml_map_binary_f32(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b,
|
||||
const ggml_binary_op_f32_t fun);
|
||||
|
||||
//
|
||||
// automatic differentiation
|
||||
//
|
||||
@@ -779,6 +807,7 @@ enum ggml_opt_result ggml_opt(
|
||||
|
||||
size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
|
||||
//
|
||||
// system info
|
||||
@@ -787,6 +816,8 @@ size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t *
|
||||
int ggml_cpu_has_avx(void);
|
||||
int ggml_cpu_has_avx2(void);
|
||||
int ggml_cpu_has_avx512(void);
|
||||
int ggml_cpu_has_avx512_vbmi(void);
|
||||
int ggml_cpu_has_avx512_vnni(void);
|
||||
int ggml_cpu_has_fma(void);
|
||||
int ggml_cpu_has_neon(void);
|
||||
int ggml_cpu_has_arm_fma(void);
|
||||
@@ -816,6 +847,7 @@ typedef struct {
|
||||
dequantize_row_q_t dequantize_row_q;
|
||||
quantize_row_q_t quantize_row_q;
|
||||
quantize_row_q_t quantize_row_q_reference;
|
||||
quantize_row_q_t quantize_row_q_dot;
|
||||
vec_dot_q_t vec_dot_q;
|
||||
} quantize_fns_t;
|
||||
|
||||
|
||||
386
llama.cpp
386
llama.cpp
@@ -1,6 +1,8 @@
|
||||
// Defines fileno on msys:
|
||||
#ifndef _GNU_SOURCE
|
||||
#define _GNU_SOURCE
|
||||
#include <cstdint>
|
||||
#include <cstdio>
|
||||
#endif
|
||||
|
||||
#include "llama_util.h"
|
||||
@@ -9,6 +11,7 @@
|
||||
#include "ggml.h"
|
||||
|
||||
#include <array>
|
||||
#include <ctime>
|
||||
#include <cinttypes>
|
||||
#include <fstream>
|
||||
#include <random>
|
||||
@@ -41,35 +44,51 @@ static const size_t MB = 1024*1024;
|
||||
// TODO: dynamically determine these sizes
|
||||
// needs modifications in ggml
|
||||
|
||||
static const std::map<e_model, size_t> MEM_REQ_SCRATCH0 = {
|
||||
{ MODEL_7B, 512ull*MB },
|
||||
{ MODEL_13B, 512ull*MB },
|
||||
{ MODEL_30B, 512ull*MB },
|
||||
{ MODEL_65B, 512ull*MB },
|
||||
};
|
||||
static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
|
||||
{
|
||||
static std::map<e_model, size_t> _MEM_REQ_SCRATCH0 = {
|
||||
{ MODEL_7B, 512ull * MB },
|
||||
{ MODEL_13B, 512ull * MB },
|
||||
{ MODEL_30B, 512ull * MB },
|
||||
{ MODEL_65B, 512ull * MB },
|
||||
};
|
||||
return _MEM_REQ_SCRATCH0;
|
||||
}
|
||||
|
||||
static const std::map<e_model, size_t> MEM_REQ_SCRATCH1 = {
|
||||
{ MODEL_7B, 512ull*MB },
|
||||
{ MODEL_13B, 512ull*MB },
|
||||
{ MODEL_30B, 512ull*MB },
|
||||
{ MODEL_65B, 512ull*MB },
|
||||
static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
|
||||
{
|
||||
static std::map<e_model, size_t> _MEM_REQ_SCRATCH1 = {
|
||||
{ MODEL_7B, 512ull * MB },
|
||||
{ MODEL_13B, 512ull * MB },
|
||||
{ MODEL_30B, 512ull * MB },
|
||||
{ MODEL_65B, 512ull * MB },
|
||||
};
|
||||
return _MEM_REQ_SCRATCH1;
|
||||
};
|
||||
|
||||
// 2*n_embd*n_ctx*n_layer*sizeof(float16)
|
||||
static const std::map<e_model, size_t> MEM_REQ_KV_SELF = {
|
||||
{ MODEL_7B, 1026ull*MB },
|
||||
{ MODEL_13B, 1608ull*MB },
|
||||
{ MODEL_30B, 3124ull*MB },
|
||||
{ MODEL_65B, 5120ull*MB },
|
||||
static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
|
||||
{
|
||||
static std::map<e_model, size_t> _MEM_REQ_KV_SELF = {
|
||||
{ MODEL_7B, 1026ull * MB },
|
||||
{ MODEL_13B, 1608ull * MB },
|
||||
{ MODEL_30B, 3124ull * MB },
|
||||
{ MODEL_65B, 5120ull * MB },
|
||||
};
|
||||
return _MEM_REQ_KV_SELF;
|
||||
};
|
||||
|
||||
// this is mostly needed for temporary mul_mat buffers to dequantize the data
|
||||
// not actually needed if BLAS is disabled
|
||||
static const std::map<e_model, size_t> MEM_REQ_EVAL = {
|
||||
{ MODEL_7B, 768ull*MB },
|
||||
{ MODEL_13B, 1024ull*MB },
|
||||
{ MODEL_30B, 1280ull*MB },
|
||||
{ MODEL_65B, 1536ull*MB },
|
||||
static const std::map<e_model, size_t> & MEM_REQ_EVAL()
|
||||
{
|
||||
static std::map<e_model, size_t> _MEM_REQ_EVAL = {
|
||||
{ MODEL_7B, 768ull * MB },
|
||||
{ MODEL_13B, 1024ull * MB },
|
||||
{ MODEL_30B, 1280ull * MB },
|
||||
{ MODEL_65B, 1536ull * MB },
|
||||
};
|
||||
return _MEM_REQ_EVAL;
|
||||
};
|
||||
|
||||
// default hparams (LLaMA 7B)
|
||||
@@ -261,22 +280,12 @@ static size_t checked_div(size_t a, size_t b) {
|
||||
}
|
||||
|
||||
static std::string llama_format_tensor_shape(const std::vector<uint32_t> & ne) {
|
||||
std::string ret = "[" + std::to_string(ne.at(0));
|
||||
char buf[256];
|
||||
snprintf(buf, sizeof(buf), "%5u", ne.at(0));
|
||||
for (size_t i = 1; i < ne.size(); i++) {
|
||||
ret += " x " + std::to_string(ne.at(i));
|
||||
}
|
||||
ret += "]";
|
||||
return ret;
|
||||
}
|
||||
|
||||
static const char * llama_format_type(enum ggml_type type) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_F32: return "f32";
|
||||
case GGML_TYPE_F16: return "f16";
|
||||
case GGML_TYPE_Q4_0: return "q4_0";
|
||||
case GGML_TYPE_Q4_1: return "q4_1";
|
||||
default: LLAMA_ASSERT(false);
|
||||
snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), " x %5u", ne.at(i));
|
||||
}
|
||||
return buf;
|
||||
}
|
||||
|
||||
static size_t llama_calc_tensor_size(const std::vector<uint32_t> & ne, enum ggml_type type) {
|
||||
@@ -469,6 +478,7 @@ struct llama_file_loader {
|
||||
case GGML_TYPE_F16:
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q4_1:
|
||||
case GGML_TYPE_Q4_2:
|
||||
break;
|
||||
default: {
|
||||
throw format("unrecognized tensor type %u\n", shard.type);
|
||||
@@ -541,6 +551,7 @@ struct llama_file_saver {
|
||||
case GGML_TYPE_F16:
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q4_1:
|
||||
case GGML_TYPE_Q4_2:
|
||||
break;
|
||||
default: LLAMA_ASSERT(false);
|
||||
}
|
||||
@@ -626,6 +637,7 @@ struct llama_model_loader {
|
||||
throw format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
|
||||
name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
|
||||
}
|
||||
|
||||
return get_tensor_for(lt);
|
||||
}
|
||||
|
||||
@@ -828,6 +840,7 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1";
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
|
||||
return "mostly Q4_1, some F16";
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_2: return "mostly Q4_2";
|
||||
default: return "unknown, may not work";
|
||||
}
|
||||
}
|
||||
@@ -908,13 +921,13 @@ static void llama_model_load_internal(
|
||||
const size_t mem_required =
|
||||
ctx_size +
|
||||
mmapped_size +
|
||||
MEM_REQ_SCRATCH0.at(model.type) +
|
||||
MEM_REQ_SCRATCH1.at(model.type) +
|
||||
MEM_REQ_EVAL.at (model.type);
|
||||
MEM_REQ_SCRATCH0().at(model.type) +
|
||||
MEM_REQ_SCRATCH1().at(model.type) +
|
||||
MEM_REQ_EVAL().at(model.type);
|
||||
|
||||
// this is the memory required by one llama_state
|
||||
const size_t mem_required_state =
|
||||
scale*MEM_REQ_KV_SELF.at(model.type);
|
||||
scale*MEM_REQ_KV_SELF().at(model.type);
|
||||
|
||||
fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
|
||||
mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
|
||||
@@ -951,8 +964,8 @@ static void llama_model_load_internal(
|
||||
ml->ggml_ctx = ctx;
|
||||
|
||||
model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab});
|
||||
model.norm = ml->get_tensor("norm.weight", {n_embd});
|
||||
model.output = ml->get_tensor("output.weight", {n_embd, n_vocab});
|
||||
model.norm = ml->get_tensor("norm.weight", {n_embd});
|
||||
model.output = ml->get_tensor("output.weight", {n_embd, n_vocab});
|
||||
|
||||
model.layers.resize(n_layer);
|
||||
for (uint32_t i = 0; i < n_layer; ++i) {
|
||||
@@ -1561,6 +1574,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
switch (ftype) {
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
|
||||
case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break;
|
||||
default: throw format("invalid output file type %d\n", ftype);
|
||||
};
|
||||
|
||||
@@ -1579,10 +1593,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
tensor.data = read_data.addr;
|
||||
model_loader->load_data_for(tensor);
|
||||
|
||||
printf("[%zu/%zu] %36s - %s, type = %6s, ",
|
||||
printf("[%4zu/%4zu] %36s - %16s, type = %6s, ",
|
||||
++idx, model_loader->tensors_map.tensors.size(),
|
||||
tensor.name.c_str(), llama_format_tensor_shape(tensor.ne).c_str(),
|
||||
llama_format_type(tensor.type));
|
||||
ggml_type_name(tensor.type));
|
||||
|
||||
// This used to be a regex, but <regex> has an extreme cost to compile times.
|
||||
bool quantize = tensor.name.rfind("weight") == tensor.name.size() - 6; // ends with 'weight'?
|
||||
@@ -1615,7 +1629,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
f32_data[i] = ggml_fp16_to_fp32(f16_data[i]);
|
||||
}
|
||||
} else {
|
||||
throw format("type %s unsupported for integer quantization", llama_format_type(tensor.type));
|
||||
throw format("type %s unsupported for integer quantization", ggml_type_name(tensor.type));
|
||||
}
|
||||
|
||||
printf("quantizing .. ");
|
||||
@@ -1634,6 +1648,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
{
|
||||
new_size = ggml_quantize_q4_1(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
|
||||
} break;
|
||||
case GGML_TYPE_Q4_2:
|
||||
{
|
||||
new_size = ggml_quantize_q4_2(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
|
||||
} break;
|
||||
default:
|
||||
LLAMA_ASSERT(false);
|
||||
}
|
||||
@@ -1741,10 +1759,10 @@ struct llama_context * llama_init_from_file(
|
||||
ctx->embedding.resize(hparams.n_embd);
|
||||
}
|
||||
|
||||
ctx->buf_compute.resize(MEM_REQ_EVAL.at(ctx->model.type));
|
||||
ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type));
|
||||
|
||||
ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0.at(ctx->model.type));
|
||||
ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1.at(ctx->model.type));
|
||||
ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type));
|
||||
ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
|
||||
}
|
||||
|
||||
return ctx;
|
||||
@@ -1767,6 +1785,254 @@ int llama_model_quantize(
|
||||
}
|
||||
}
|
||||
|
||||
int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
|
||||
fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
|
||||
|
||||
auto & model = ctx->model;
|
||||
|
||||
const int64_t t_start_lora_us = ggml_time_us();
|
||||
|
||||
auto fin = std::ifstream(path_lora, std::ios::binary);
|
||||
if (!fin) {
|
||||
fprintf(stderr, "%s: failed to open '%s'\n", __func__, path_lora);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// verify magic and version
|
||||
{
|
||||
uint32_t magic;
|
||||
fin.read((char *) &magic, sizeof(magic));
|
||||
if (magic != 'ggla') {
|
||||
fprintf(stderr, "%s: bad file magic\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
uint32_t format_version;
|
||||
fin.read((char *) &format_version, sizeof(format_version));
|
||||
|
||||
if (format_version != 1) {
|
||||
fprintf(stderr, "%s: unsupported file version\n", __func__ );
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
int32_t lora_r;
|
||||
int32_t lora_alpha;
|
||||
fin.read((char *) &lora_r, sizeof(lora_r));
|
||||
fin.read((char *) &lora_alpha, sizeof(lora_alpha));
|
||||
float scaling = (float)lora_alpha / (float)lora_r;
|
||||
|
||||
fprintf(stderr, "%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
|
||||
|
||||
|
||||
// create a temporary ggml context to store the lora tensors
|
||||
// todo: calculate size from biggest possible tensor
|
||||
std::vector<uint8_t> lora_buf(1024ull * 1024ull * 1024ull);
|
||||
struct ggml_init_params params;
|
||||
params.mem_size = lora_buf.size();
|
||||
params.mem_buffer = lora_buf.data();
|
||||
params.no_alloc = false;
|
||||
|
||||
ggml_context * lora_ctx = ggml_init(params);
|
||||
std::unordered_map<std::string, struct ggml_tensor *> lora_tensors;
|
||||
|
||||
// create a name -> tensor map of the model to accelerate lookups
|
||||
std::unordered_map<std::string, struct ggml_tensor*> model_tensors;
|
||||
for (auto & kv: model.tensors_by_name) {
|
||||
model_tensors.insert(kv);
|
||||
}
|
||||
|
||||
|
||||
// load base model
|
||||
std::unique_ptr<llama_model_loader> model_loader;
|
||||
ggml_context * base_ctx = NULL;
|
||||
llama_buffer base_buf;
|
||||
if (path_base_model) {
|
||||
fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model);
|
||||
model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*vocab_only*/ false));
|
||||
|
||||
size_t ctx_size, mmapped_size;
|
||||
model_loader->calc_sizes(&ctx_size, &mmapped_size);
|
||||
base_buf.resize(ctx_size);
|
||||
|
||||
ggml_init_params base_params;
|
||||
base_params.mem_size = base_buf.size;
|
||||
base_params.mem_buffer = base_buf.addr;
|
||||
base_params.no_alloc = model_loader->use_mmap;
|
||||
|
||||
base_ctx = ggml_init(base_params);
|
||||
|
||||
model_loader->ggml_ctx = base_ctx;
|
||||
|
||||
// maybe this should in llama_model_loader
|
||||
if (model_loader->use_mmap) {
|
||||
model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ false));
|
||||
}
|
||||
}
|
||||
|
||||
// read tensors and apply
|
||||
bool warned = false;
|
||||
int n_tensors = 0;
|
||||
while (true) {
|
||||
int32_t n_dims;
|
||||
int32_t length;
|
||||
int32_t ftype;
|
||||
|
||||
fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
|
||||
fin.read(reinterpret_cast<char *>(&length), sizeof(length));
|
||||
fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
|
||||
if (fin.eof()) {
|
||||
break;
|
||||
}
|
||||
|
||||
int32_t ne[2] = { 1, 1 };
|
||||
for (int i = 0; i < n_dims; ++i) {
|
||||
fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
|
||||
}
|
||||
|
||||
std::string name(length, 0);
|
||||
fin.read(&name[0], length);
|
||||
|
||||
// check for lora suffix and get the type of tensor
|
||||
const std::string lora_suffix = ".lora";
|
||||
size_t pos = name.rfind(lora_suffix);
|
||||
if (pos == std::string::npos) {
|
||||
fprintf(stderr, "%s: error: '%s' is not a lora tensor\n", __func__, name.c_str());
|
||||
return 1;
|
||||
}
|
||||
|
||||
std::string lora_type = name.substr(pos + lora_suffix.length());
|
||||
std::string base_name = name;
|
||||
base_name.erase(pos);
|
||||
// fprintf(stderr, "%s: %s => %s (lora type %s) ", __func__, name.c_str(),base_name.c_str(), lora_type.c_str());
|
||||
|
||||
if (model_tensors.find(base_name.data()) == model_tensors.end()) {
|
||||
fprintf(stderr, "%s: unknown tensor '%s' in lora adapter\n", __func__, name.data());
|
||||
return 1;
|
||||
}
|
||||
|
||||
// create ggml tensor
|
||||
ggml_type wtype;
|
||||
switch (ftype) {
|
||||
case 0: wtype = GGML_TYPE_F32; break;
|
||||
case 1: wtype = GGML_TYPE_F16; break;
|
||||
default:
|
||||
{
|
||||
fprintf(stderr, "%s: invalid tensor data type '%d'\n",
|
||||
__func__, ftype);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
ggml_tensor* lora_tensor;
|
||||
if (n_dims == 2) {
|
||||
lora_tensor = ggml_new_tensor_2d(lora_ctx, wtype, ne[0], ne[1]);
|
||||
}
|
||||
else {
|
||||
fprintf(stderr, "%s: unsupported tensor dimension %d\n", __func__, n_dims);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// load tensor data
|
||||
size_t offset = fin.tellg();
|
||||
size_t tensor_data_size = ggml_nbytes(lora_tensor);
|
||||
offset = (offset + 31) & -32;
|
||||
fin.seekg(offset);
|
||||
fin.read((char*)lora_tensor->data, tensor_data_size);
|
||||
|
||||
lora_tensors[name] = lora_tensor;
|
||||
|
||||
// check if we have both A and B tensors and apply
|
||||
if (lora_tensors.find(base_name + ".loraA") != lora_tensors.end() &&
|
||||
lora_tensors.find(base_name + ".loraB") != lora_tensors.end()) {
|
||||
|
||||
ggml_tensor * dest_t = model_tensors[base_name];
|
||||
ggml_tensor * base_t;
|
||||
if (model_loader) {
|
||||
// load from base model
|
||||
if (model_loader->tensors_map.name_to_idx.find(base_name) == model_loader->tensors_map.name_to_idx.end()) {
|
||||
fprintf(stderr, "%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
|
||||
return 1;
|
||||
}
|
||||
size_t idx = model_loader->tensors_map.name_to_idx[base_name];
|
||||
llama_load_tensor & lt = model_loader->tensors_map.tensors[idx];
|
||||
base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] });
|
||||
lt.data = (uint8_t *) lt.ggml_tensor->data;
|
||||
model_loader->load_data_for(lt);
|
||||
lt.ggml_tensor->data = lt.data;
|
||||
}
|
||||
else {
|
||||
base_t = dest_t;
|
||||
}
|
||||
|
||||
if (base_t->type == GGML_TYPE_Q4_0 || base_t->type == GGML_TYPE_Q4_1 || base_t->type == GGML_TYPE_Q4_2) {
|
||||
if (!warned) {
|
||||
fprintf(stderr, "%s: warning: using a lora adapter with a quantized model may result in poor quality, "
|
||||
"use a f16 or f32 base model with --lora-base\n", __func__);
|
||||
warned = true;
|
||||
}
|
||||
}
|
||||
|
||||
ggml_tensor * loraA = lora_tensors[base_name + ".loraA"];
|
||||
ggml_tensor * loraB = lora_tensors[base_name + ".loraB"];
|
||||
|
||||
if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
|
||||
fprintf(stderr, "%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
|
||||
" are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// w = w + BA*s
|
||||
ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
|
||||
|
||||
if (scaling != 1.0f) {
|
||||
ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling);
|
||||
BA = ggml_scale(lora_ctx, BA, scale_tensor);
|
||||
}
|
||||
|
||||
ggml_tensor * r;
|
||||
if (base_t == dest_t) {
|
||||
r = ggml_add_inplace(lora_ctx, dest_t, BA);
|
||||
}
|
||||
else {
|
||||
r = ggml_add(lora_ctx, base_t, BA);
|
||||
r = ggml_cpy(lora_ctx, r, dest_t);
|
||||
}
|
||||
|
||||
struct ggml_cgraph gf = ggml_build_forward(r);
|
||||
gf.n_threads = n_threads;
|
||||
ggml_graph_compute(lora_ctx, &gf);
|
||||
|
||||
// we won't need these tensors again, reset the context to save memory
|
||||
ggml_free(lora_ctx);
|
||||
lora_ctx = ggml_init(params);
|
||||
lora_tensors.clear();
|
||||
|
||||
n_tensors++;
|
||||
if (n_tensors % 4 == 0)
|
||||
fprintf(stderr, ".");
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: this should be in a destructor, it will leak on failure
|
||||
ggml_free(lora_ctx);
|
||||
if (base_ctx) {
|
||||
ggml_free(base_ctx);
|
||||
}
|
||||
|
||||
const int64_t t_lora_us = ggml_time_us() - t_start_lora_us;
|
||||
fprintf(stderr, " done (%.2f ms)\n", t_lora_us / 1000.0);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
|
||||
try {
|
||||
return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads);
|
||||
} catch (const std::string & err) {
|
||||
fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.c_str());
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
// Returns the KV cache that will contain the context for the
|
||||
// ongoing prediction with the model.
|
||||
const uint8_t * llama_get_kv_cache(struct llama_context * ctx) {
|
||||
@@ -1924,18 +2190,20 @@ const char * llama_print_system_info(void) {
|
||||
static std::string s;
|
||||
|
||||
s = "";
|
||||
s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
|
||||
s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
|
||||
s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
|
||||
s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
|
||||
s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
|
||||
s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
|
||||
s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
|
||||
s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
|
||||
s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
|
||||
s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
|
||||
s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
|
||||
s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
|
||||
s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
|
||||
s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
|
||||
s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
|
||||
s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | ";
|
||||
s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | ";
|
||||
s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
|
||||
s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
|
||||
s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
|
||||
s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
|
||||
s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
|
||||
s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
|
||||
s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
|
||||
s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
|
||||
s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
|
||||
|
||||
return s.c_str();
|
||||
}
|
||||
|
||||
13
llama.h
13
llama.h
@@ -72,6 +72,7 @@ extern "C" {
|
||||
LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
|
||||
LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
|
||||
};
|
||||
|
||||
LLAMA_API struct llama_context_params llama_context_default_params();
|
||||
@@ -96,6 +97,18 @@ extern "C" {
|
||||
const char * fname_out,
|
||||
enum llama_ftype ftype);
|
||||
|
||||
// Apply a LoRA adapter to a loaded model
|
||||
// path_base_model is the path to a higher quality model to use as a base for
|
||||
// the layers modified by the adapter. Can be NULL to use the current loaded model.
|
||||
// The model needs to be reloaded before applying a new adapter, otherwise the adapter
|
||||
// will be applied on top of the previous one
|
||||
// Returns 0 on success
|
||||
LLAMA_API int llama_apply_lora_from_file(
|
||||
struct llama_context * ctx,
|
||||
const char * path_lora,
|
||||
const char * path_base_model,
|
||||
int n_threads);
|
||||
|
||||
// Returns the KV cache that will contain the context for the
|
||||
// ongoing prediction with the model.
|
||||
LLAMA_API const uint8_t * llama_get_kv_cache(struct llama_context * ctx);
|
||||
|
||||
38
llama_util.h
38
llama_util.h
@@ -43,8 +43,12 @@
|
||||
} while (0)
|
||||
|
||||
#ifdef __GNUC__
|
||||
#ifdef __MINGW32__
|
||||
__attribute__((format(gnu_printf, 1, 2)))
|
||||
#else
|
||||
__attribute__((format(printf, 1, 2)))
|
||||
#endif
|
||||
#endif
|
||||
static std::string format(const char * fmt, ...) {
|
||||
va_list ap, ap2;
|
||||
va_start(ap, fmt);
|
||||
@@ -57,7 +61,7 @@ static std::string format(const char * fmt, ...) {
|
||||
va_end(ap2);
|
||||
va_end(ap);
|
||||
return std::string(buf.data(), size);
|
||||
};
|
||||
}
|
||||
|
||||
struct llama_file {
|
||||
// use FILE * so we don't have to re-open the file to mmap
|
||||
@@ -164,7 +168,7 @@ struct llama_mmap {
|
||||
#ifdef _POSIX_MAPPED_FILES
|
||||
static constexpr bool SUPPORTED = true;
|
||||
|
||||
llama_mmap(struct llama_file * file) {
|
||||
llama_mmap(struct llama_file * file, bool prefetch = true) {
|
||||
size = file->size;
|
||||
int fd = fileno(file->fp);
|
||||
int flags = MAP_SHARED;
|
||||
@@ -172,15 +176,16 @@ struct llama_mmap {
|
||||
flags |= MAP_POPULATE;
|
||||
#endif
|
||||
addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
|
||||
close(fd);
|
||||
if (addr == MAP_FAILED) {
|
||||
throw format("mmap failed: %s", strerror(errno));
|
||||
}
|
||||
|
||||
// Advise the kernel to preload the mapped memory
|
||||
if (madvise(addr, file->size, MADV_WILLNEED)) {
|
||||
fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
|
||||
strerror(errno));
|
||||
if (prefetch) {
|
||||
// Advise the kernel to preload the mapped memory
|
||||
if (madvise(addr, file->size, MADV_WILLNEED)) {
|
||||
fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
|
||||
strerror(errno));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -190,14 +195,13 @@ struct llama_mmap {
|
||||
#elif defined(_WIN32)
|
||||
static constexpr bool SUPPORTED = true;
|
||||
|
||||
llama_mmap(struct llama_file * file) {
|
||||
llama_mmap(struct llama_file * file, bool prefetch = true) {
|
||||
size = file->size;
|
||||
|
||||
HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));
|
||||
|
||||
HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
|
||||
DWORD error = GetLastError();
|
||||
CloseHandle(hFile);
|
||||
|
||||
if (hMapping == NULL) {
|
||||
throw format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str());
|
||||
@@ -212,13 +216,15 @@ struct llama_mmap {
|
||||
}
|
||||
|
||||
#if _WIN32_WINNT >= _WIN32_WINNT_WIN8
|
||||
// Advise the kernel to preload the mapped memory
|
||||
WIN32_MEMORY_RANGE_ENTRY range;
|
||||
range.VirtualAddress = addr;
|
||||
range.NumberOfBytes = (SIZE_T)size;
|
||||
if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
|
||||
fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
|
||||
llama_format_win_err(GetLastError()).c_str());
|
||||
if (prefetch) {
|
||||
// Advise the kernel to preload the mapped memory
|
||||
WIN32_MEMORY_RANGE_ENTRY range;
|
||||
range.VirtualAddress = addr;
|
||||
range.NumberOfBytes = (SIZE_T)size;
|
||||
if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
|
||||
fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
|
||||
llama_format_win_err(GetLastError()).c_str());
|
||||
}
|
||||
}
|
||||
#else
|
||||
#pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
|
||||
|
||||
@@ -1,311 +0,0 @@
|
||||
# Migrate ggml file(s) with ggmf magic to ggml file with ggjt magic
|
||||
#
|
||||
# We caused a breaking change to the file format on 2023-03-30 in:
|
||||
# https://github.com/ggerganov/llama.cpp/pull/613
|
||||
#
|
||||
# (1) If you still have the Meta LLaMA .pth files, then close this
|
||||
# file now; you can just run `convert-pth-to-ggml.py` again to
|
||||
# migrate to the new format. The tool is easier to use too. It
|
||||
# isn't necessary anymore to manage split output files because
|
||||
# the new format always combines things into a single file.
|
||||
#
|
||||
# (2) If you deleted the Meta LLaMA .pth files due to save on disk
|
||||
# space, then this tool is intended to help you. Please check
|
||||
# out the instructions below.
|
||||
#
|
||||
# USAGE
|
||||
#
|
||||
# python migrate-ggml-2023-03-30-pr613.py INPUT OUTPUT
|
||||
#
|
||||
# PREREQUISITES
|
||||
#
|
||||
# pip install numpy
|
||||
# cd llama.cpp
|
||||
# make -j4
|
||||
#
|
||||
# EXAMPLE (7B MODEL)
|
||||
#
|
||||
# # you can replace all the 'f16' with 'q4_0' if you're using quantized weights
|
||||
# python migrate-ggml-2023-03-30-pr613.py models/7B/ggml-model-f16.bin models/7B/ggml-model-f16-ggjt.bin
|
||||
#
|
||||
# # check that it works
|
||||
# ./main -m models/7B/ggml-model-f16-ggjt.bin -p 'Question: Do you love me?'
|
||||
#
|
||||
# # you can delete the old files
|
||||
# rm -f models/7B/ggml-model-f16.bin
|
||||
# mv models/7B/ggml-model-f16-ggjt.bin models/7B/ggml-model-f16.bin
|
||||
#
|
||||
# EXAMPLE (13B MODEL)
|
||||
#
|
||||
# # you can replace all the 'f16' with 'q4_0' if you're using quantized weights
|
||||
# python migrate-ggml-2023-03-30-pr613.py models/13B/ggml-model-f16.bin models/13B/ggml-model-f16-ggjt.bin
|
||||
#
|
||||
# # check that it works
|
||||
# ./main -m models/13B/ggml-model-f16-ggjt.bin -p 'Question: Do you love me?'
|
||||
#
|
||||
# # you can delete the old files
|
||||
# rm -f models/13B/ggml-model-f16.bin*
|
||||
# mv models/13B/ggml-model-f16-ggjt.bin models/13B/ggml-model-f16.bin
|
||||
#
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import struct
|
||||
import numpy as np
|
||||
|
||||
QK = 32
|
||||
|
||||
GGML_TYPE_Q4_0 = 0
|
||||
GGML_TYPE_Q4_1 = 1
|
||||
GGML_TYPE_I8 = 2
|
||||
GGML_TYPE_I16 = 3
|
||||
GGML_TYPE_I32 = 4
|
||||
GGML_TYPE_F16 = 5
|
||||
GGML_TYPE_F32 = 6
|
||||
|
||||
WTYPE_NAMES = {
|
||||
0: "F32",
|
||||
1: "F16",
|
||||
2: "Q4_0",
|
||||
3: "Q4_1",
|
||||
}
|
||||
|
||||
WTYPES = {
|
||||
0: GGML_TYPE_F32,
|
||||
1: GGML_TYPE_F16,
|
||||
2: GGML_TYPE_Q4_0,
|
||||
3: GGML_TYPE_Q4_1,
|
||||
}
|
||||
|
||||
GGML_BLCK_SIZE = {
|
||||
GGML_TYPE_Q4_0: QK,
|
||||
GGML_TYPE_Q4_1: QK,
|
||||
GGML_TYPE_I8: 1,
|
||||
GGML_TYPE_I16: 1,
|
||||
GGML_TYPE_I32: 1,
|
||||
GGML_TYPE_F16: 1,
|
||||
GGML_TYPE_F32: 1,
|
||||
}
|
||||
|
||||
GGML_TYPE_SIZE = {
|
||||
GGML_TYPE_Q4_0: 4 + QK//2,
|
||||
GGML_TYPE_Q4_1: 4*2 + QK//2,
|
||||
GGML_TYPE_I8: 1,
|
||||
GGML_TYPE_I16: 2,
|
||||
GGML_TYPE_I32: 4,
|
||||
GGML_TYPE_F16: 2,
|
||||
GGML_TYPE_F32: 4,
|
||||
}
|
||||
|
||||
HPARAMS = [
|
||||
'magic', # int32
|
||||
'version', # int32
|
||||
'n_vocab', # int32
|
||||
'n_embd', # int32
|
||||
'n_mult', # int32
|
||||
'n_head', # int32
|
||||
'n_layer', # int32
|
||||
'n_rot', # int32
|
||||
'f16', # int32
|
||||
]
|
||||
|
||||
def read_hparams(fin):
|
||||
struct_fmt = "i" * len(HPARAMS)
|
||||
struct_size = struct.calcsize(struct_fmt)
|
||||
buf = fin.read(struct_size)
|
||||
ints = struct.unpack(struct_fmt, buf)
|
||||
hparams = dict(zip(HPARAMS, ints))
|
||||
return hparams
|
||||
|
||||
def write_hparams(fout, hparams):
|
||||
struct_fmt = "i" * len(HPARAMS)
|
||||
struct_size = struct.calcsize(struct_fmt)
|
||||
ints = [hparams[h] for h in HPARAMS]
|
||||
fout.write(struct.pack(struct_fmt, *ints))
|
||||
|
||||
def read_tokens(fin, hparams):
|
||||
tokens = []
|
||||
for i in range(hparams['n_vocab']):
|
||||
len_b = fin.read(4)
|
||||
(length,) = struct.unpack("i", len_b)
|
||||
word = fin.read(length)
|
||||
score_b = fin.read(4)
|
||||
(score,) = struct.unpack("f", score_b)
|
||||
tokens.append((word, score))
|
||||
return tokens
|
||||
|
||||
def write_tokens(fout, tokens):
|
||||
for word, score in tokens:
|
||||
fout.write(struct.pack("i", len(word)))
|
||||
fout.write(word)
|
||||
fout.write(struct.pack("f", score))
|
||||
|
||||
def ggml_nelements(shape):
|
||||
r = 1
|
||||
for i in shape:
|
||||
r *= i
|
||||
return r
|
||||
|
||||
def ggml_nbytes(shape, ftype):
|
||||
x = ggml_nelements(shape)
|
||||
t = WTYPES[ftype]
|
||||
x *= GGML_TYPE_SIZE[t]
|
||||
x //= GGML_BLCK_SIZE[t]
|
||||
return x
|
||||
|
||||
def copy_tensors(fin, fout, part_id, n_parts):
|
||||
while True:
|
||||
|
||||
b = fin.read(4)
|
||||
if not b: break
|
||||
(n_dims,) = struct.unpack("i", b)
|
||||
b = fin.read(4)
|
||||
(length,) = struct.unpack("i", b)
|
||||
b = fin.read(4)
|
||||
(ftype,) = struct.unpack("i", b)
|
||||
|
||||
assert n_dims in (1, 2)
|
||||
|
||||
partshape = list(range(n_dims))
|
||||
for i in range(n_dims):
|
||||
b = fin.read(4)
|
||||
partshape[i] = struct.unpack("i", b)[0]
|
||||
partshape = list(reversed(partshape))
|
||||
|
||||
name = fin.read(length)
|
||||
data = fin.read(ggml_nbytes(partshape, ftype))
|
||||
|
||||
blck_size = GGML_BLCK_SIZE[WTYPES[ftype]]
|
||||
type_size = GGML_TYPE_SIZE[WTYPES[ftype]]
|
||||
|
||||
print(f"Processing tensor {name} with shape: {partshape} and type: {WTYPE_NAMES[ftype]}")
|
||||
|
||||
# determine dimension along which multipart tensor is sharded
|
||||
#
|
||||
# split_dim 0 regex:
|
||||
# - output.*
|
||||
# - layers.*.attention.wq.weight
|
||||
# - layers.*.attention.wk.weight
|
||||
# - layers.*.attention.wv.weight
|
||||
# - layers.*.feed_forward.w1.weight
|
||||
# - layers.*.feed_forward.w3.weight
|
||||
#
|
||||
# split_dim 1 regex:
|
||||
# - tok_embeddings.*
|
||||
# - layers.*.attention.wo.weight
|
||||
# - layers.*.feed_forward.w2.weight
|
||||
#
|
||||
if n_dims > 1:
|
||||
split_dim = 1
|
||||
if b"tok_embeddings" in name:
|
||||
split_dim = 1
|
||||
elif b"layers" in name:
|
||||
if b"attention.wo.weight" in name:
|
||||
split_dim = 1
|
||||
elif b"feed_forward.w2.weight" in name:
|
||||
split_dim = 1
|
||||
else:
|
||||
split_dim = 0
|
||||
elif b"output" in name:
|
||||
split_dim = 0
|
||||
|
||||
# output tensor header
|
||||
fullshape = list(partshape)
|
||||
if n_dims > 1:
|
||||
fullshape[split_dim] *= n_parts
|
||||
fout.write(struct.pack("iii", n_dims, len(name), ftype))
|
||||
for dim in reversed(fullshape):
|
||||
fout.write(struct.pack("i", dim))
|
||||
fout.write(name)
|
||||
|
||||
# ensure tensor data is aligned
|
||||
tensor_data_offset = fout.tell()
|
||||
while tensor_data_offset % QK != 0:
|
||||
fout.write(struct.pack("B", 0))
|
||||
tensor_data_offset += 1
|
||||
|
||||
# output unified mappable tensor data
|
||||
if n_dims == 1 or n_parts == 1:
|
||||
# copy tensor which we thankfully received in one piece
|
||||
if part_id == 0:
|
||||
fout.write(data)
|
||||
elif split_dim == 0:
|
||||
# reassemble multifile tensor containing some of the rows
|
||||
rows_per_chunk = partshape[0]
|
||||
current_row = part_id * rows_per_chunk
|
||||
bytes_per_row = fullshape[1] // blck_size * type_size
|
||||
offset = current_row * bytes_per_row
|
||||
fout.seek(tensor_data_offset + offset)
|
||||
fout.write(data)
|
||||
elif split_dim == 1:
|
||||
# reassemble multifile tensor containing some of the cols
|
||||
cols_per_chunk = partshape[1]
|
||||
current_col = part_id * cols_per_chunk
|
||||
bpr = partshape[1] // blck_size * type_size
|
||||
bytes_per_row = fullshape[1] // blck_size * type_size
|
||||
offset_current_col = current_col // blck_size * type_size
|
||||
for row in range(partshape[0]):
|
||||
offset_row = row * bytes_per_row
|
||||
offset = offset_row + offset_current_col
|
||||
fout.seek(tensor_data_offset + offset)
|
||||
fout.write(data[row * bpr:row * bpr + bpr])
|
||||
|
||||
# advance file position to next tensor
|
||||
fout.seek(tensor_data_offset + ggml_nbytes(fullshape, ftype))
|
||||
|
||||
def parse_args():
|
||||
parser = argparse.ArgumentParser(description='Migrate from GGML to new GGJT file format')
|
||||
parser.add_argument('fin_path', help='your old ggml file (leave out the .1 .2 etc.)')
|
||||
parser.add_argument('fout_path', help='your new ggjt file name')
|
||||
return parser.parse_args()
|
||||
|
||||
def main():
|
||||
args = parse_args()
|
||||
assert args.fin_path
|
||||
assert args.fout_path
|
||||
assert args.fin_path != args.fout_path
|
||||
|
||||
with open(args.fin_path, "rb") as fin:
|
||||
hparams = read_hparams(fin)
|
||||
tokens = read_tokens(fin, hparams)
|
||||
|
||||
if hparams['magic'] == 0x67676a74: # ggjt
|
||||
print(f"{args.fin_path}: input ggml has already been converted to 'ggjt' magic\n")
|
||||
sys.exit(1)
|
||||
|
||||
if hparams['magic'] != 0x67676d66: # ggmf
|
||||
print(f"{args.fin_path}: input ggml file doesn't have expected 'ggmf' magic: {hparams['magic']:#x}\n")
|
||||
sys.exit(1)
|
||||
|
||||
hparams['magic'] = 0x67676a74 # ggjt
|
||||
|
||||
# count number of multipart files by convention
|
||||
n_parts = 1
|
||||
while True:
|
||||
if os.path.exists(f"{args.fin_path}.{n_parts}"):
|
||||
n_parts += 1
|
||||
else:
|
||||
break
|
||||
|
||||
# we output a single file for ggml
|
||||
with open(args.fout_path, "wb") as fout:
|
||||
write_hparams(fout, hparams)
|
||||
write_tokens(fout, tokens)
|
||||
offset_of_tensors = fout.tell()
|
||||
# the tensors we load could be split across multiple files
|
||||
for part_id in range(n_parts):
|
||||
fout.seek(offset_of_tensors)
|
||||
print(f"Processing part {part_id+1} of {n_parts}\n")
|
||||
fin_path = args.fin_path
|
||||
if part_id > 0:
|
||||
fin_path += f".{part_id}"
|
||||
with open(fin_path, "rb") as fin:
|
||||
read_tokens(fin, read_hparams(fin))
|
||||
copy_tensors(fin, fout, part_id, n_parts)
|
||||
|
||||
print(f"Done. Output file: {args.fout_path}\n")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
12
pocs/CMakeLists.txt
Normal file
12
pocs/CMakeLists.txt
Normal file
@@ -0,0 +1,12 @@
|
||||
# dependencies
|
||||
|
||||
find_package(Threads REQUIRED)
|
||||
|
||||
# third-party
|
||||
|
||||
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
|
||||
|
||||
if (EMSCRIPTEN)
|
||||
else()
|
||||
add_subdirectory(vdot)
|
||||
endif()
|
||||
4
pocs/vdot/CMakeLists.txt
Normal file
4
pocs/vdot/CMakeLists.txt
Normal file
@@ -0,0 +1,4 @@
|
||||
set(TARGET vdot)
|
||||
add_executable(${TARGET} vdot.cpp)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
305
pocs/vdot/vdot.cpp
Normal file
305
pocs/vdot/vdot.cpp
Normal file
@@ -0,0 +1,305 @@
|
||||
#include <cstdio>
|
||||
#include <vector>
|
||||
#include <random>
|
||||
#include <chrono>
|
||||
#include <cstdlib>
|
||||
#include <cmath>
|
||||
#include <cassert>
|
||||
#include <cstring>
|
||||
#include <array>
|
||||
|
||||
#include <ggml.h>
|
||||
|
||||
constexpr int kVecSize = 1 << 18;
|
||||
|
||||
float drawFromGaussianPdf(std::mt19937& rndm) {
|
||||
constexpr double kScale = 1./(1. + std::mt19937::max());
|
||||
constexpr double kTwoPiTimesScale = 6.28318530717958647692*kScale;
|
||||
static float lastX;
|
||||
static bool haveX = false;
|
||||
if (haveX) { haveX = false; return lastX; }
|
||||
auto r = sqrt(-2*log(1 - kScale*rndm()));
|
||||
auto phi = kTwoPiTimesScale * rndm();
|
||||
lastX = r*sin(phi);
|
||||
haveX = true;
|
||||
return r*cos(phi);
|
||||
}
|
||||
void fillRandomGaussianFloats(std::vector<float>& values, std::mt19937& rndm, float mean = 0) {
|
||||
for (auto& v : values) v = mean + drawFromGaussianPdf(rndm);
|
||||
}
|
||||
|
||||
// Copy-pasted from ggml.c
|
||||
#define QK4_0 32
|
||||
typedef struct {
|
||||
float d; // delta
|
||||
uint8_t qs[QK4_0 / 2]; // nibbles / quants
|
||||
} block_q4_0;
|
||||
static_assert(sizeof(block_q4_0) == sizeof(float) + QK4_0 / 2, "wrong q4_0 block size/padding");
|
||||
|
||||
#define QK4_1 32
|
||||
typedef struct {
|
||||
float d; // delta
|
||||
float m; // min
|
||||
uint8_t qs[QK4_1 / 2]; // nibbles / quants
|
||||
} block_q4_1;
|
||||
static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
|
||||
|
||||
// Copy-pasted from ggml.c
|
||||
#define QK8_0 32
|
||||
typedef struct {
|
||||
float d; // delta
|
||||
int8_t qs[QK8_0]; // quants
|
||||
} block_q8_0;
|
||||
static_assert(sizeof(block_q8_0) == sizeof(float) + QK8_0, "wrong q8_0 block size/padding");
|
||||
|
||||
// "Scalar" dot product between the quantized vector x and float vector y
|
||||
inline double dot(int n, const block_q4_0* x, const float* y) {
|
||||
const static float kValues[16] = {-8.f, -7.f, -6.f, -5.f, -4.f, -3.f, -2.f, -1.f, 0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f};
|
||||
constexpr uint32_t kMask1 = 0x0f0f0f0f;
|
||||
uint32_t u1, u2;
|
||||
auto q1 = (const uint8_t*)&u1;
|
||||
auto q2 = (const uint8_t*)&u2;
|
||||
double sum = 0;
|
||||
for (int i=0; i<n; ++i) {
|
||||
float d = x->d;
|
||||
auto u = (const uint32_t*)x->qs;
|
||||
float s = 0;
|
||||
for (int k=0; k<4; ++k) {
|
||||
u1 = u[k] & kMask1;
|
||||
u2 = (u[k] >> 4) & kMask1;
|
||||
s += y[0]*kValues[q1[0]] + y[1]*kValues[q2[0]] +
|
||||
y[2]*kValues[q1[1]] + y[3]*kValues[q2[1]] +
|
||||
y[4]*kValues[q1[2]] + y[5]*kValues[q2[2]] +
|
||||
y[6]*kValues[q1[3]] + y[7]*kValues[q2[3]];
|
||||
y += 8;
|
||||
}
|
||||
sum += s*d;
|
||||
++x;
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
// Alternative version of the above. Faster on my Mac (~45 us vs ~55 us per dot product),
|
||||
// but about the same on X86_64 (Ryzen 7950X CPU).
|
||||
inline double dot3(int n, const block_q4_0* x, const float* y) {
|
||||
const static std::pair<float,float> kValues[256] = {
|
||||
{-8.f, -8.f}, {-7.f, -8.f}, {-6.f, -8.f}, {-5.f, -8.f}, {-4.f, -8.f}, {-3.f, -8.f}, {-2.f, -8.f}, {-1.f, -8.f},
|
||||
{ 0.f, -8.f}, { 1.f, -8.f}, { 2.f, -8.f}, { 3.f, -8.f}, { 4.f, -8.f}, { 5.f, -8.f}, { 6.f, -8.f}, { 7.f, -8.f},
|
||||
{-8.f, -7.f}, {-7.f, -7.f}, {-6.f, -7.f}, {-5.f, -7.f}, {-4.f, -7.f}, {-3.f, -7.f}, {-2.f, -7.f}, {-1.f, -7.f},
|
||||
{ 0.f, -7.f}, { 1.f, -7.f}, { 2.f, -7.f}, { 3.f, -7.f}, { 4.f, -7.f}, { 5.f, -7.f}, { 6.f, -7.f}, { 7.f, -7.f},
|
||||
{-8.f, -6.f}, {-7.f, -6.f}, {-6.f, -6.f}, {-5.f, -6.f}, {-4.f, -6.f}, {-3.f, -6.f}, {-2.f, -6.f}, {-1.f, -6.f},
|
||||
{ 0.f, -6.f}, { 1.f, -6.f}, { 2.f, -6.f}, { 3.f, -6.f}, { 4.f, -6.f}, { 5.f, -6.f}, { 6.f, -6.f}, { 7.f, -6.f},
|
||||
{-8.f, -5.f}, {-7.f, -5.f}, {-6.f, -5.f}, {-5.f, -5.f}, {-4.f, -5.f}, {-3.f, -5.f}, {-2.f, -5.f}, {-1.f, -5.f},
|
||||
{ 0.f, -5.f}, { 1.f, -5.f}, { 2.f, -5.f}, { 3.f, -5.f}, { 4.f, -5.f}, { 5.f, -5.f}, { 6.f, -5.f}, { 7.f, -5.f},
|
||||
{-8.f, -4.f}, {-7.f, -4.f}, {-6.f, -4.f}, {-5.f, -4.f}, {-4.f, -4.f}, {-3.f, -4.f}, {-2.f, -4.f}, {-1.f, -4.f},
|
||||
{ 0.f, -4.f}, { 1.f, -4.f}, { 2.f, -4.f}, { 3.f, -4.f}, { 4.f, -4.f}, { 5.f, -4.f}, { 6.f, -4.f}, { 7.f, -4.f},
|
||||
{-8.f, -3.f}, {-7.f, -3.f}, {-6.f, -3.f}, {-5.f, -3.f}, {-4.f, -3.f}, {-3.f, -3.f}, {-2.f, -3.f}, {-1.f, -3.f},
|
||||
{ 0.f, -3.f}, { 1.f, -3.f}, { 2.f, -3.f}, { 3.f, -3.f}, { 4.f, -3.f}, { 5.f, -3.f}, { 6.f, -3.f}, { 7.f, -3.f},
|
||||
{-8.f, -2.f}, {-7.f, -2.f}, {-6.f, -2.f}, {-5.f, -2.f}, {-4.f, -2.f}, {-3.f, -2.f}, {-2.f, -2.f}, {-1.f, -2.f},
|
||||
{ 0.f, -2.f}, { 1.f, -2.f}, { 2.f, -2.f}, { 3.f, -2.f}, { 4.f, -2.f}, { 5.f, -2.f}, { 6.f, -2.f}, { 7.f, -2.f},
|
||||
{-8.f, -1.f}, {-7.f, -1.f}, {-6.f, -1.f}, {-5.f, -1.f}, {-4.f, -1.f}, {-3.f, -1.f}, {-2.f, -1.f}, {-1.f, -1.f},
|
||||
{ 0.f, -1.f}, { 1.f, -1.f}, { 2.f, -1.f}, { 3.f, -1.f}, { 4.f, -1.f}, { 5.f, -1.f}, { 6.f, -1.f}, { 7.f, -1.f},
|
||||
{-8.f, 0.f}, {-7.f, 0.f}, {-6.f, 0.f}, {-5.f, 0.f}, {-4.f, 0.f}, {-3.f, 0.f}, {-2.f, 0.f}, {-1.f, 0.f},
|
||||
{ 0.f, 0.f}, { 1.f, 0.f}, { 2.f, 0.f}, { 3.f, 0.f}, { 4.f, 0.f}, { 5.f, 0.f}, { 6.f, 0.f}, { 7.f, 0.f},
|
||||
{-8.f, 1.f}, {-7.f, 1.f}, {-6.f, 1.f}, {-5.f, 1.f}, {-4.f, 1.f}, {-3.f, 1.f}, {-2.f, 1.f}, {-1.f, 1.f},
|
||||
{ 0.f, 1.f}, { 1.f, 1.f}, { 2.f, 1.f}, { 3.f, 1.f}, { 4.f, 1.f}, { 5.f, 1.f}, { 6.f, 1.f}, { 7.f, 1.f},
|
||||
{-8.f, 2.f}, {-7.f, 2.f}, {-6.f, 2.f}, {-5.f, 2.f}, {-4.f, 2.f}, {-3.f, 2.f}, {-2.f, 2.f}, {-1.f, 2.f},
|
||||
{ 0.f, 2.f}, { 1.f, 2.f}, { 2.f, 2.f}, { 3.f, 2.f}, { 4.f, 2.f}, { 5.f, 2.f}, { 6.f, 2.f}, { 7.f, 2.f},
|
||||
{-8.f, 3.f}, {-7.f, 3.f}, {-6.f, 3.f}, {-5.f, 3.f}, {-4.f, 3.f}, {-3.f, 3.f}, {-2.f, 3.f}, {-1.f, 3.f},
|
||||
{ 0.f, 3.f}, { 1.f, 3.f}, { 2.f, 3.f}, { 3.f, 3.f}, { 4.f, 3.f}, { 5.f, 3.f}, { 6.f, 3.f}, { 7.f, 3.f},
|
||||
{-8.f, 4.f}, {-7.f, 4.f}, {-6.f, 4.f}, {-5.f, 4.f}, {-4.f, 4.f}, {-3.f, 4.f}, {-2.f, 4.f}, {-1.f, 4.f},
|
||||
{ 0.f, 4.f}, { 1.f, 4.f}, { 2.f, 4.f}, { 3.f, 4.f}, { 4.f, 4.f}, { 5.f, 4.f}, { 6.f, 4.f}, { 7.f, 4.f},
|
||||
{-8.f, 5.f}, {-7.f, 5.f}, {-6.f, 5.f}, {-5.f, 5.f}, {-4.f, 5.f}, {-3.f, 5.f}, {-2.f, 5.f}, {-1.f, 5.f},
|
||||
{ 0.f, 5.f}, { 1.f, 5.f}, { 2.f, 5.f}, { 3.f, 5.f}, { 4.f, 5.f}, { 5.f, 5.f}, { 6.f, 5.f}, { 7.f, 5.f},
|
||||
{-8.f, 6.f}, {-7.f, 6.f}, {-6.f, 6.f}, {-5.f, 6.f}, {-4.f, 6.f}, {-3.f, 6.f}, {-2.f, 6.f}, {-1.f, 6.f},
|
||||
{ 0.f, 6.f}, { 1.f, 6.f}, { 2.f, 6.f}, { 3.f, 6.f}, { 4.f, 6.f}, { 5.f, 6.f}, { 6.f, 6.f}, { 7.f, 6.f},
|
||||
{-8.f, 7.f}, {-7.f, 7.f}, {-6.f, 7.f}, {-5.f, 7.f}, {-4.f, 7.f}, {-3.f, 7.f}, {-2.f, 7.f}, {-1.f, 7.f},
|
||||
{ 0.f, 7.f}, { 1.f, 7.f}, { 2.f, 7.f}, { 3.f, 7.f}, { 4.f, 7.f}, { 5.f, 7.f}, { 6.f, 7.f}, { 7.f, 7.f}
|
||||
};
|
||||
double sum = 0;
|
||||
for (int i=0; i<n; ++i) {
|
||||
float d = x->d;
|
||||
auto q = x->qs;
|
||||
float s = 0;
|
||||
for (int k=0; k<4; ++k) {
|
||||
s += y[0]*kValues[q[0]].first + y[1]*kValues[q[0]].second +
|
||||
y[2]*kValues[q[1]].first + y[3]*kValues[q[1]].second +
|
||||
y[4]*kValues[q[2]].first + y[5]*kValues[q[2]].second +
|
||||
y[6]*kValues[q[3]].first + y[7]*kValues[q[3]].second;
|
||||
y += 8; q += 4;
|
||||
}
|
||||
sum += s*d;
|
||||
++x;
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
inline double dot41(int n, const block_q4_1* x, const float* y) {
|
||||
const static float kValues[16] = {0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f, 12.f, 13.f, 14.f, 15.f};
|
||||
constexpr uint32_t kMask1 = 0x0f0f0f0f;
|
||||
uint32_t u1, u2;
|
||||
auto q1 = (const uint8_t*)&u1;
|
||||
auto q2 = (const uint8_t*)&u2;
|
||||
double sum = 0;
|
||||
for (int i=0; i<n; ++i) {
|
||||
auto u = (const uint32_t*)x->qs;
|
||||
float s = 0, s1 = 0;
|
||||
for (int k=0; k<4; ++k) {
|
||||
u1 = u[k] & kMask1;
|
||||
u2 = (u[k] >> 4) & kMask1;
|
||||
s += y[0]*kValues[q1[0]] + y[1]*kValues[q2[0]] +
|
||||
y[2]*kValues[q1[1]] + y[3]*kValues[q2[1]] +
|
||||
y[4]*kValues[q1[2]] + y[5]*kValues[q2[2]] +
|
||||
y[6]*kValues[q1[3]] + y[7]*kValues[q2[3]];
|
||||
s1 += y[0] + y[1] + y[2] + y[3] + y[4] + y[5] + y[6] + y[7];
|
||||
y += 8;
|
||||
}
|
||||
sum += s*x->d + s1*x->m;
|
||||
++x;
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
// Copy-pasted from ggml.c
|
||||
static void quantize_row_q8_0_reference(const float *x, block_q8_0 *y, int k) {
|
||||
assert(k % QK8_0 == 0);
|
||||
const int nb = k / QK8_0;
|
||||
|
||||
for (int i = 0; i < nb; i++) {
|
||||
float amax = 0.0f; // absolute max
|
||||
|
||||
for (int l = 0; l < QK8_0; l++) {
|
||||
const float v = x[i*QK8_0 + l];
|
||||
amax = std::max(amax, fabsf(v));
|
||||
}
|
||||
|
||||
const float d = amax / ((1 << 7) - 1);
|
||||
const float id = d ? 1.0f/d : 0.0f;
|
||||
|
||||
y[i].d = d;
|
||||
|
||||
for (int l = 0; l < QK8_0; ++l) {
|
||||
const float v = x[i*QK8_0 + l]*id;
|
||||
y[i].qs[l] = roundf(v);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Copy-pasted from ggml.c
|
||||
static void dot_q4_q8(const int n, float* s, const void* vx, const void* vy) {
|
||||
const int nb = n / QK8_0;
|
||||
const block_q4_0* x = (const block_q4_0*)vx;
|
||||
const block_q8_0* y = (const block_q8_0*)vy;
|
||||
float sumf = 0;
|
||||
for (int i = 0; i < nb; i++) {
|
||||
const float d0 = x[i].d;
|
||||
const float d1 = y[i].d;
|
||||
|
||||
const uint8_t * p0 = x[i].qs;
|
||||
const int8_t * p1 = y[i].qs;
|
||||
|
||||
int sumi = 0;
|
||||
for (int j = 0; j < QK8_0/2; j++) {
|
||||
const uint8_t v0 = p0[j];
|
||||
|
||||
const int i0 = (int8_t) (v0 & 0xf) - 8;
|
||||
const int i1 = (int8_t) (v0 >> 4) - 8;
|
||||
|
||||
const int i2 = p1[2*j + 0];
|
||||
const int i3 = p1[2*j + 1];
|
||||
|
||||
sumi += i0*i2 + i1*i3;
|
||||
}
|
||||
sumf += d0*d1*sumi;
|
||||
}
|
||||
*s = sumf;
|
||||
}
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
|
||||
int nloop = argc > 1 ? atoi(argv[1]) : 10;
|
||||
bool scalar = argc > 2 ? atoi(argv[2]) : false;
|
||||
bool useQ4_1 = argc > 3 ? atoi(argv[3]) : false;
|
||||
|
||||
if (scalar && useQ4_1) {
|
||||
printf("It is not possible to use Q4_1 quantization and scalar implementations\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
std::mt19937 rndm(1234);
|
||||
|
||||
std::vector<float> x1(kVecSize), y1(kVecSize);
|
||||
int n4 = useQ4_1 ? kVecSize / QK4_1 : kVecSize / QK4_0; n4 = 64*((n4 + 63)/64);
|
||||
int n8 = kVecSize / QK8_0; n8 = 64*((n8 + 63)/64);
|
||||
|
||||
auto funcs = useQ4_1 ? ggml_internal_get_quantize_fn(GGML_TYPE_Q4_1) : ggml_internal_get_quantize_fn(GGML_TYPE_Q4_0);
|
||||
|
||||
std::vector<block_q4_0> q40;
|
||||
std::vector<block_q4_1> q41;
|
||||
if (useQ4_1) q41.resize(n4);
|
||||
else q40.resize(n4);
|
||||
std::vector<block_q8_0> q8(n8);
|
||||
std::vector<int64_t> H(16, 0);
|
||||
double sumt = 0, sumt2 = 0, maxt = 0;
|
||||
double sumqt = 0, sumqt2 = 0, maxqt = 0;
|
||||
double sum = 0, sumq = 0, exactSum = 0;
|
||||
for (int iloop=0; iloop<nloop; ++iloop) {
|
||||
|
||||
// Fill vector x with random numbers
|
||||
fillRandomGaussianFloats(x1, rndm);
|
||||
|
||||
// Fill vector y with random numbers
|
||||
fillRandomGaussianFloats(y1, rndm);
|
||||
|
||||
// Compute the exact dot product
|
||||
for (int k=0; k<kVecSize; ++k) exactSum += x1[k]*y1[k];
|
||||
|
||||
// quantize x.
|
||||
// Note, we do not include this in the timing as in practical application
|
||||
// we already have the quantized model weights.
|
||||
if (useQ4_1) {
|
||||
funcs.quantize_row_q(x1.data(), q41.data(), kVecSize);
|
||||
} else {
|
||||
funcs.quantize_row_q(x1.data(), q40.data(), kVecSize);
|
||||
}
|
||||
|
||||
// Now measure time the dot product needs using the "scalar" version above
|
||||
auto t1 = std::chrono::high_resolution_clock::now();
|
||||
if (useQ4_1) sum += dot41(kVecSize / QK4_1, q41.data(), y1.data());
|
||||
else sum += dot(kVecSize / QK4_0, q40.data(), y1.data());
|
||||
auto t2 = std::chrono::high_resolution_clock::now();
|
||||
auto t = 1e-3*std::chrono::duration_cast<std::chrono::nanoseconds>(t2-t1).count();
|
||||
sumt += t; sumt2 += t*t; maxt = std::max(maxt, t);
|
||||
|
||||
// And now measure the time needed to quantize y and perform the dot product with the quantized y
|
||||
t1 = std::chrono::high_resolution_clock::now();
|
||||
float result;
|
||||
if (scalar) {
|
||||
quantize_row_q8_0_reference(y1.data(), q8.data(), kVecSize);
|
||||
dot_q4_q8(kVecSize, &result, q40.data(), q8.data());
|
||||
}
|
||||
else {
|
||||
funcs.quantize_row_q_dot(y1.data(), q8.data(), kVecSize);
|
||||
if (useQ4_1) funcs.vec_dot_q(kVecSize, &result, q41.data(), q8.data());
|
||||
else funcs.vec_dot_q(kVecSize, &result, q40.data(), q8.data());
|
||||
}
|
||||
sumq += result;
|
||||
t2 = std::chrono::high_resolution_clock::now();
|
||||
t = 1e-3*std::chrono::duration_cast<std::chrono::nanoseconds>(t2-t1).count();
|
||||
sumqt += t; sumqt2 += t*t; maxqt = std::max(maxqt, t);
|
||||
|
||||
}
|
||||
|
||||
// Report the time (and the average of the dot products so the compiler does not come up with the idea
|
||||
// of optimizing away the function calls after figuring that the result is not used).
|
||||
sum /= nloop; sumq /= nloop;
|
||||
exactSum /= nloop;
|
||||
printf("Exact result: <dot> = %g\n",exactSum);
|
||||
printf("<dot> = %g, %g\n",sum,sumq);
|
||||
sumt /= nloop; sumt2 /= nloop; sumt2 -= sumt*sumt;
|
||||
if (sumt2 > 0) sumt2 = sqrt(sumt2);
|
||||
printf("time = %g +/- %g us. maxt = %g us\n",sumt,sumt2,maxt);
|
||||
sumqt /= nloop; sumqt2 /= nloop; sumqt2 -= sumqt*sumqt;
|
||||
if (sumqt2 > 0) sumqt2 = sqrt(sumqt2);
|
||||
printf("timeq = %g +/- %g us. maxt = %g us\n",sumqt,sumqt2,maxqt);
|
||||
return 0;
|
||||
}
|
||||
2
requirements.txt
Normal file
2
requirements.txt
Normal file
@@ -0,0 +1,2 @@
|
||||
numpy==1.24
|
||||
sentencepiece==0.1.98
|
||||
@@ -5,13 +5,17 @@
|
||||
#include <map>
|
||||
#include <vector>
|
||||
|
||||
static const std::map<std::string, std::vector<llama_token>> k_tests = {
|
||||
{ "Hello World", { 1, 10994, 2787, }, },
|
||||
{ " Hello World", { 1, 15043, 2787, }, },
|
||||
{ " Hello World!", { 1, 15043, 2787, 29991, }, },
|
||||
{ " this is 🦙.cpp", { 1, 445, 338, 29871, 243, 162, 169, 156, 29889, 8223, }, },
|
||||
{ "w048 7tuijk dsdfhu", { 1, 29893, 29900, 29946, 29947, 29871, 29955, 9161, 13535, 18031, 2176, 6905, }, },
|
||||
{ "нещо на Български", { 1, 821, 4851, 665, 1386, 29713, 1305, }, },
|
||||
static const std::map<std::string, std::vector<llama_token>> & k_tests()
|
||||
{
|
||||
static std::map<std::string, std::vector<llama_token>> _k_tests = {
|
||||
{ "Hello World", { 1, 10994, 2787, }, },
|
||||
{ " Hello World", { 1, 15043, 2787, }, },
|
||||
{ " Hello World!", { 1, 15043, 2787, 29991, }, },
|
||||
{ " this is 🦙.cpp", { 1, 445, 338, 29871, 243, 162, 169, 156, 29889, 8223, }, },
|
||||
{ "w048 7tuijk dsdfhu", { 1, 29893, 29900, 29946, 29947, 29871, 29955, 9161, 13535, 18031, 2176, 6905, }, },
|
||||
{ "нещо на Български", { 1, 821, 4851, 665, 1386, 29713, 1305, }, },
|
||||
};
|
||||
return _k_tests;
|
||||
};
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
@@ -47,7 +51,7 @@ int main(int argc, char **argv) {
|
||||
return 2;
|
||||
}
|
||||
|
||||
for (const auto & test_kv : k_tests) {
|
||||
for (const auto & test_kv : k_tests()) {
|
||||
std::vector<llama_token> res(test_kv.first.size());
|
||||
const int n = llama_tokenize(ctx, test_kv.first.c_str(), res.data(), res.size(), true);
|
||||
res.resize(n);
|
||||
|
||||
Reference in New Issue
Block a user