mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-02-26 14:23:22 +02:00
Compare commits
32 Commits
master-a6b
...
master-77e
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
77efdf5a50 | ||
|
|
ed3c680bcd | ||
|
|
9cbc404ba6 | ||
|
|
b51c717d5c | ||
|
|
0ba76c1e73 | ||
|
|
cea1c85948 | ||
|
|
f202ada131 | ||
|
|
3b44d30d9b | ||
|
|
61cbfff5c9 | ||
|
|
d9ad104440 | ||
|
|
b467702b87 | ||
|
|
516d88e75c | ||
|
|
53635c081c | ||
|
|
41318d708e | ||
|
|
a6956b25a1 | ||
|
|
83df5639eb | ||
|
|
a5c42c4b13 | ||
|
|
5a5f8b1501 | ||
|
|
f1217055ea | ||
|
|
7f4c5c6651 | ||
|
|
2a98bc18ea | ||
|
|
d0aaff571c | ||
|
|
d0330fd783 | ||
|
|
99c5b27654 | ||
|
|
692ce3164e | ||
|
|
96f9c0506f | ||
|
|
d502bc7c9d | ||
|
|
436e561931 | ||
|
|
20e1e84884 | ||
|
|
c1f885067c | ||
|
|
e0670260fb | ||
|
|
28ba975aea |
20
.github/workflows/build.yml
vendored
20
.github/workflows/build.yml
vendored
@@ -8,10 +8,10 @@ on:
|
||||
required: true
|
||||
type: boolean
|
||||
push:
|
||||
paths: ['.github/workflows/**', 'CMakeLists.txt', 'Makefile', '**.h', '*.c', '**.cpp']
|
||||
paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.c', '**/*.cpp']
|
||||
pull_request:
|
||||
types: [opened, synchronize, edited, reopened, review_requested, ready_for_review]
|
||||
paths: ['CMakeLists.txt', 'Makefile', '**.h', '*.c', '**.cpp']
|
||||
paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.c', '**/*.cpp']
|
||||
|
||||
env:
|
||||
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
|
||||
@@ -62,7 +62,7 @@ jobs:
|
||||
id: cmake_test
|
||||
run: |
|
||||
cd build
|
||||
ctest --output-on-failure
|
||||
ctest --verbose
|
||||
|
||||
ubuntu-latest-cmake-sanitizer:
|
||||
runs-on: ubuntu-latest
|
||||
@@ -98,7 +98,7 @@ jobs:
|
||||
id: cmake_test
|
||||
run: |
|
||||
cd build
|
||||
ctest --output-on-failure
|
||||
ctest --verbose
|
||||
|
||||
macOS-latest-make:
|
||||
runs-on: macos-latest
|
||||
@@ -143,7 +143,7 @@ jobs:
|
||||
id: cmake_test
|
||||
run: |
|
||||
cd build
|
||||
ctest --output-on-failure
|
||||
ctest --verbose
|
||||
|
||||
windows-latest-cmake:
|
||||
runs-on: windows-latest
|
||||
@@ -177,15 +177,19 @@ jobs:
|
||||
continue-on-error: true
|
||||
run: |
|
||||
cd build
|
||||
Set-Content -Path .\avx512f.exe -Value ([Convert]::FromBase64String('TVqQAAMAAAAEAAAA//8AALgAAAAAAAAAQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAyAAAAA4fug4AtAnNIbgBTM0hVGhpcyBwcm9ncmFtIGNhbm5vdCBiZSBydW4gaW4gRE9TIG1vZGUuDQ0KJAAAAAAAAAClmfXY4fibi+H4m4vh+JuL4fiai+P4m4si98aL4vibi7Xbq4vg+JuLUmljaOH4m4sAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABQRQAATAEBAGo6H2QAAAAAAAAAAOAADwELAQYAAAIAAAAAAAAAAAAADBAAAAAQAAAAIAAAAABAAAAQAAAAAgAABAAAAAAAAAAEAAAAAAAAAAAgAAAAAgAAAAAAAAMAAAAAABAAABAAAAAAEAAAEAAAAAAAABAAAAAAAAAAAAAAAFQQAAAoAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAEAAADAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAC50ZXh0AAAAsgAAAAAQAAAAAgAAAAIAAAAAAAAAAAAAAAAAACAAAGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACUEAAAiBAAAAAAAABVi+xRUVNTuAcAAAAPosHrEGaD4wGJXfxbg0X8MI1F+GoAUI1F/GoBUGr1/xUAEEAAUP8VBBBAAItF/FuDwND32BvAQMnDzMx8EAAAAAAAAAAAAACkEAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAlBAAAIgQAAAAAAAApANXcml0ZUZpbGUAuQFHZXRTdGRIYW5kbGUAAEtFUk5FTDMyLmRsbAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA==')) -AsByteStream
|
||||
.\avx512f.exe && echo " AVX512F: YES" && ( echo HAS_AVX512F=1 >> $env:GITHUB_ENV ) || echo " AVX512F: NO"
|
||||
$vcdir = $(vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath)
|
||||
$msvc = $(join-path $vcdir $('VC\Tools\MSVC\'+$(gc -raw $(join-path $vcdir 'VC\Auxiliary\Build\Microsoft.VCToolsVersion.default.txt')).Trim()))
|
||||
$cl = $(join-path $msvc 'bin\Hostx64\x64\cl.exe')
|
||||
echo 'int main(void){unsigned int a[4];__cpuid(a,7);return !(a[1]&65536);}' >> avx512f.c
|
||||
& $cl /O2 /GS- /kernel avx512f.c /link /nodefaultlib /entry:main
|
||||
.\avx512f.exe && echo "AVX512F: YES" && ( echo HAS_AVX512F=1 >> $env:GITHUB_ENV ) || echo "AVX512F: NO"
|
||||
|
||||
- name: Test
|
||||
id: cmake_test
|
||||
if: ${{ matrix.build != 'avx512' || env.HAS_AVX512F == '1' }} # Test AVX-512 only when possible
|
||||
run: |
|
||||
cd build
|
||||
ctest -C Release --output-on-failure
|
||||
ctest -C Release --verbose
|
||||
|
||||
- name: Get commit hash
|
||||
id: commit
|
||||
|
||||
6
.gitignore
vendored
6
.gitignore
vendored
@@ -5,6 +5,7 @@
|
||||
.vscode/
|
||||
.DS_Store
|
||||
|
||||
.build/
|
||||
build/
|
||||
build-em/
|
||||
build-debug/
|
||||
@@ -20,9 +21,14 @@ models/*
|
||||
/quantize
|
||||
/result
|
||||
/perplexity
|
||||
/embedding
|
||||
|
||||
arm_neon.h
|
||||
compile_commands.json
|
||||
|
||||
.envrc
|
||||
.direnv/
|
||||
|
||||
.venv
|
||||
__pycache__
|
||||
.swiftpm
|
||||
|
||||
@@ -124,8 +124,9 @@ if (LLAMA_ALL_WARNINGS)
|
||||
-Wall
|
||||
-Wextra
|
||||
-Wpedantic
|
||||
-Wshadow
|
||||
-Wcast-qual
|
||||
-Wdouble-promotion
|
||||
-Wshadow
|
||||
-Wstrict-prototypes
|
||||
-Wpointer-arith
|
||||
-Wno-unused-function
|
||||
@@ -135,6 +136,7 @@ if (LLAMA_ALL_WARNINGS)
|
||||
-Wextra
|
||||
-Wpedantic
|
||||
-Wcast-qual
|
||||
-Wno-unused-function
|
||||
)
|
||||
else()
|
||||
# todo : msvc
|
||||
|
||||
4
Makefile
4
Makefile
@@ -35,6 +35,10 @@ CFLAGS = -I. -O3 -DNDEBUG -std=c11 -fPIC
|
||||
CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
|
||||
LDFLAGS =
|
||||
|
||||
# warnings
|
||||
CFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith -Wno-unused-function
|
||||
CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function
|
||||
|
||||
# OS specific
|
||||
# TODO: support Windows
|
||||
ifeq ($(UNAME_S),Linux)
|
||||
|
||||
20
Package.swift
Normal file
20
Package.swift
Normal file
@@ -0,0 +1,20 @@
|
||||
// swift-tools-version:5.3
|
||||
|
||||
import PackageDescription
|
||||
|
||||
// Swift Package Manager manifest: exposes one library product, "llama",
// backed by a single target of the same name.
let package = Package(
|
||||
name: "llama",
|
||||
products: [
|
||||
.library(name: "llama", targets: ["llama"]),
|
||||
],
|
||||
targets: [
|
||||
// The target is rooted at the directory containing this manifest (path ".")
// and compiles only the two listed source files.
.target(
|
||||
name: "llama",
|
||||
path: ".",
|
||||
sources: ["ggml.c", "llama.cpp"],
|
||||
// Public headers are taken from the "spm-headers" directory.
publicHeadersPath: "spm-headers",
|
||||
// Silences the 64-to-32-bit shortening warning for the C sources.
// NOTE(review): SwiftPM restricts packages that use unsafeFlags when they are
// consumed as versioned dependencies - confirm this is acceptable.
cSettings: [.unsafeFlags(["-Wno-shorten-64-to-32"])]
|
||||
),
|
||||
],
|
||||
// C++ sources are built as C++11.
cxxLanguageStandard: .cxx11
|
||||
)
|
||||
23
README.md
23
README.md
@@ -10,9 +10,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
|
||||
**Hot topics:**
|
||||
|
||||
- [Roadmap (short-term)](https://github.com/ggerganov/llama.cpp/discussions/457)
|
||||
- New C-style API is now available: https://github.com/ggerganov/llama.cpp/pull/370
|
||||
- Cache input prompts for faster initialization: https://github.com/ggerganov/llama.cpp/issues/64
|
||||
- Create a `llama.cpp` logo: https://github.com/ggerganov/llama.cpp/issues/105
|
||||
- Support for [GPT4All](https://github.com/ggerganov/llama.cpp#using-gpt4all)
|
||||
|
||||
## Description
|
||||
|
||||
@@ -37,6 +35,12 @@ Supported platforms:
|
||||
- [X] Windows (via CMake)
|
||||
- [X] Docker
|
||||
|
||||
Supported models:
|
||||
|
||||
- [X] LLaMA
|
||||
- [X] [Alpaca](https://github.com/ggerganov/llama.cpp#instruction-mode-with-alpaca)
|
||||
- [X] [GPT4All](https://github.com/ggerganov/llama.cpp#using-gpt4all)
|
||||
|
||||
---
|
||||
|
||||
Here is a typical run using LLaMA-7B:
|
||||
@@ -222,6 +226,19 @@ cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach.
|
||||
>
|
||||
```
|
||||
|
||||
### Using [GPT4All](https://github.com/nomic-ai/gpt4all)
|
||||
|
||||
- Obtain the `gpt4all-lora-quantized.bin` model
|
||||
- It is distributed in the old `ggml` format which is now obsolete
|
||||
- You have to convert it to the new format using [./convert-gpt4all-to-ggml.py](./convert-gpt4all-to-ggml.py):
|
||||
|
||||
```bash
|
||||
python3 convert-gpt4all-to-ggml.py models/gpt4all-7B/gpt4all-lora-quantized.bin ./models/tokenizer.model
|
||||
```
|
||||
|
||||
- You can now use the newly generated `gpt4all-lora-quantized.bin` model in exactly the same way as all other models
|
||||
- The original model is saved in the same folder with a suffix `.orig`
|
||||
|
||||
### Obtaining and verifying the Facebook LLaMA original model and Stanford Alpaca model data
|
||||
|
||||
- **Under no circumstances share IPFS, magnet links, or any other links to model downloads anywhere in this repository, including in issues, discussions or pull requests. They will be immediately deleted.**
|
||||
|
||||
294
convert-ggml-to-pth.py
Normal file
294
convert-ggml-to-pth.py
Normal file
@@ -0,0 +1,294 @@
|
||||
# Author: github.com/ductai199x
|
||||
import argparse
|
||||
import os
|
||||
import struct
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from numba import njit
|
||||
from tqdm.auto import tqdm
|
||||
|
||||
|
||||
def read_header(fin):
    """Read the nine-int32 file header from ``fin``.

    Returns a ``(hparams, ftype)`` tuple. ``hparams`` holds ``vocab_size``,
    ``dim``, ``multiple_of``, ``n_heads`` and ``n_layers``. The first two
    integers and the eighth one (``rot``) are read but not returned.
    """
    n_fields = 9
    fields = struct.unpack(f"{n_fields}i", fin.read(4 * n_fields))

    hparam_names = ("vocab_size", "dim", "multiple_of", "n_heads", "n_layers")
    # fields[0:2] are skipped, fields[2:7] are the hyper-parameters,
    # fields[7] is `rot` (unused here) and fields[8] is the file type.
    hparams = dict(zip(hparam_names, fields[2:7]))
    return hparams, fields[8]
|
||||
|
||||
|
||||
def read_tokens(fin, vocab_size):
    """Read ``vocab_size`` vocabulary entries from ``fin``.

    Each entry is an int32 byte length, that many bytes of text, and a
    float32 score. Returns a list of ``(text, score)`` tuples. Bytes that are
    not valid UTF-8 are decoded with replacement characters.
    """

    def read_entry():
        (n_bytes,) = struct.unpack("i", fin.read(4))
        raw = fin.read(n_bytes)
        # "replace" decodes valid UTF-8 unchanged and substitutes U+FFFD otherwise.
        text = raw.decode("utf-8", "replace")
        (score,) = struct.unpack("f", fin.read(4))
        return text, score

    return [read_entry() for _ in range(vocab_size)]
|
||||
|
||||
|
||||
@njit
def dequantize_weights_numba(fin_data, n_rows, n_cols):
    """Expand packed 4-bit blocks into an ``(n_rows, n_cols)`` float32 array.

    ``fin_data`` is consumed sequentially. Every block of 32 values is stored
    as a float32 scale followed by 16 bytes, each byte holding two 4-bit
    values (low nibble first). A value is ``(nibble - 8) * scale``.
    Columns beyond ``(n_cols // 32) * 32`` are left at zero.
    """
    qk = 32  # values per block
    half = qk // 2  # packed bytes per block
    blocks_per_row = n_cols // qk

    out = np.zeros((n_rows, n_cols), dtype=np.float32)
    pos = 0

    for r in range(n_rows):
        for b in range(blocks_per_row):
            scale = np.frombuffer(fin_data[pos : pos + 4], dtype=np.float32)[0]
            pos += 4
            nibbles = fin_data[pos : pos + half]
            pos += half

            col = b * qk
            for j in range(half):
                byte = nibbles[j]
                # low nibble -> even column, high nibble -> odd column
                out[r, col + 2 * j] = np.float32((byte & 0b00001111) - 8) * scale
                out[r, col + 2 * j + 1] = np.float32((byte >> 4) - 8) * scale

    return out
|
||||
|
||||
|
||||
def dequantize_weights(fin, n_rows, n_cols):
    """Read one 4-bit quantized tensor from ``fin`` and return it dequantized.

    The number of bytes read is half a byte per value plus one 4-byte scale
    per block of 32 columns in every row.
    """
    blocks_per_row = n_cols // 32
    nibble_bytes = n_rows * n_cols // 2
    scale_bytes = n_rows * blocks_per_row * 4
    raw = fin.read(nibble_bytes + scale_bytes)
    return dequantize_weights_numba(raw, n_rows, n_cols)
|
||||
|
||||
|
||||
def read_variables(fin):
    """Read every tensor record from ``fin`` until end of file.

    Each record is three int32 values (number of dimensions, name length,
    tensor file type), the dimensions as int32 values (reversed after
    reading), the UTF-8 name, and the tensor data. File type 0 is float32,
    1 is float16 and 2 is 4-bit quantized.

    Returns a dict mapping tensor name to a ``torch`` tensor. float32 data
    stays float32; float16 and dequantized 4-bit data are stored as float16.

    Raises:
        ValueError: if a record has an unknown file type. Previously this
            either failed with an unbound local or silently stored the
            previous tensor's data under the new name.
    """
    model = {}
    # The context manager closes the progress bar even if parsing fails.
    with tqdm(
        total=os.path.getsize(fin.name), unit="B", unit_scale=True, desc="Reading variables"
    ) as pbar:
        while True:
            start_pos = fin.tell()
            try:
                n_dims, name_length, ftype_cur = struct.unpack("iii", fin.read(4 * 3))
            except struct.error:
                # Fewer than 12 bytes left: no further records.
                break

            shape = tuple(struct.unpack("i" * n_dims, fin.read(4 * n_dims)))
            shape = shape[::-1]
            name = fin.read(name_length).decode("utf-8")

            if ftype_cur == 2:
                # 4-bit quantized weights
                data = dequantize_weights(fin, shape[0], shape[1])
                data = data.reshape(shape)
                torch_dtype = torch.float16
            elif ftype_cur == 0:
                data_size = np.prod(shape)
                data = np.fromfile(fin, dtype=np.float32, count=data_size).reshape(shape)
                torch_dtype = torch.float32
            elif ftype_cur == 1:
                data_size = np.prod(shape)
                data = np.fromfile(fin, dtype=np.float16, count=data_size).reshape(shape)
                torch_dtype = torch.float16
            else:
                raise ValueError(f"Unsupported tensor file type {ftype_cur} for '{name}'")

            model[name] = torch.tensor(data, dtype=torch_dtype)

            pbar.update(fin.tell() - start_pos)

    return model
|
||||
|
||||
|
||||
def convert_to_hf_format(model, hparams):
    """Rename the tensors in ``model`` to Hugging Face style keys.

    The query and key projection weights are additionally permuted, and a
    rotary ``inv_freq`` tensor is added for every layer. Returns a new dict;
    ``model`` itself is not modified.
    """
    # This works for llama 7B, need to test with other models
    n_layers = hparams["n_layers"]
    n_heads = hparams["n_heads"]
    dim = hparams["dim"]

    head_dim = dim // n_heads
    exponents = torch.arange(0, head_dim, 2).float() / head_dim
    inv_freq = 1.0 / (10000.0 ** exponents)

    # permute for sliced rotary
    def permute(w):
        return w.view(n_heads, dim // n_heads // 2, 2, dim).transpose(1, 2).reshape(dim, dim)

    # (target suffix, source suffix) pairs copied without modification
    copied_as_is = (
        ("self_attn.v_proj", "attention.wv"),
        ("self_attn.o_proj", "attention.wo"),
        ("mlp.gate_proj", "feed_forward.w1"),
        ("mlp.down_proj", "feed_forward.w2"),
        ("mlp.up_proj", "feed_forward.w3"),
        ("input_layernorm", "attention_norm"),
        ("post_attention_layernorm", "ffn_norm"),
    )

    state_dict = {}
    for layer_i in range(n_layers):
        src = f"layers.{layer_i}"
        dst = f"model.layers.{layer_i}"

        state_dict[f"{dst}.self_attn.q_proj.weight"] = permute(model[f"{src}.attention.wq.weight"])
        state_dict[f"{dst}.self_attn.k_proj.weight"] = permute(model[f"{src}.attention.wk.weight"])
        for dst_suffix, src_suffix in copied_as_is:
            state_dict[f"{dst}.{dst_suffix}.weight"] = model[f"{src}.{src_suffix}.weight"]
        state_dict[f"{dst}.self_attn.rotary_emb.inv_freq"] = inv_freq

    state_dict["model.embed_tokens.weight"] = model["tok_embeddings.weight"]
    state_dict["model.norm.weight"] = model["norm.weight"]
    state_dict["lm_head.weight"] = model["output.weight"]

    return state_dict
|
||||
|
||||
|
||||
def chat(model, hparams, llama_dir):
    """Run an interactive console chat using the converted weights.

    Args:
        model: state dict in Hugging Face key layout (see
            ``convert_to_hf_format``); loaded with ``strict=True``.
        hparams: dict with ``vocab_size``, ``dim``, ``n_layers``, ``n_heads``
            and, optionally, ``multiple_of`` (defaults to 256).
        llama_dir: directory passed to ``LlamaTokenizer.from_pretrained``.

    The function loops forever reading user input; it does not return.
    """
    from transformers import (GenerationConfig, LlamaForCausalLM,
                              LlamaTokenizer, StoppingCriteria,
                              StoppingCriteriaList)
    from transformers.models.llama.configuration_llama import LlamaConfig

    class StoppingCriteriaSub(StoppingCriteria):
        def __init__(self):
            super().__init__()

        # `stops` is unused; a None default avoids a shared mutable default.
        def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, stops=None):
            print(tokenizer.decode(input_ids[0]), end="", flush=True)
            # NOTE(review): token id 13 is presumably the newline token - confirm.
            if input_ids[0][-1] == 13:
                return True

            return False

    dim = hparams["dim"]
    multiple_of = hparams.get("multiple_of", 256)
    # Feed-forward width: 2/3 of 4*dim, rounded up to a multiple of `multiple_of`.
    ffn_dim = int(2 * (4 * dim) / 3)
    ffn_dim = multiple_of * ((ffn_dim + multiple_of - 1) // multiple_of)

    # LlamaConfig names the model width `hidden_size`; passing `dim` left
    # `hidden_size` and `intermediate_size` at their defaults, so only
    # checkpoints whose shapes match those defaults could be loaded.
    config = LlamaConfig(
        vocab_size=hparams["vocab_size"],
        hidden_size=dim,
        intermediate_size=ffn_dim,
        num_hidden_layers=hparams["n_layers"],
        num_attention_heads=hparams["n_heads"],
    )

    llama = LlamaForCausalLM(config=config)
    llama.load_state_dict(state_dict=model, strict=True)
    tokenizer = LlamaTokenizer.from_pretrained(llama_dir)

    device = torch.device("cpu")
    llama = llama.to(device)

    ctx = """You are AI.
This is a dialog, where User interacts with AI. AI is helpful, kind, obedient, honest, respectful, direct, concise, should try to protect User's privacy, and knows its own limits. Also, AI must answer User and AI cannot stop the conversation by itself.
User: Hello, AI.
AI: Hello! How can I assist you today?
"""
    print(ctx.rstrip("\n"))
    while True:
        print("-" * 60)
        prompt = input("User: ")
        if ctx != "":
            ctx = ctx + "User: " + prompt + "\n"
        else:
            ctx = prompt + "\nAI:"

        # Keep only the last 1920 characters once the transcript reaches 2048.
        ctx = (ctx[-1920:]) if len(ctx) >= 2048 else ctx

        print("-" * 60)
        if len(ctx.strip()) > 0:
            input_ids = tokenizer(ctx, return_tensors="pt")["input_ids"].to(device)
            generation_config = GenerationConfig(
                temperature=0.8,
                top_p=0.95,
                top_k=50,
                repetition_penalty=1.1764,
            )
            with torch.no_grad():
                generation_output = llama.generate(
                    input_ids=input_ids,
                    generation_config=generation_config,
                    return_dict_in_generate=True,
                    output_scores=True,
                    max_length=2048,
                    do_sample=True,
                    stopping_criteria=StoppingCriteriaList([StoppingCriteriaSub()]),
                )
            s = generation_output.sequences[0]
            decoded = tokenizer.decode(s)
            # The decoded sequence becomes the transcript for the next turn.
            ctx = decoded + "\n"
|
||||
|
||||
|
||||
def main():
    """Convert ggml model files into a single PyTorch checkpoint.

    Reads every file in ``--input_dir`` whose name starts with ``--prefix``,
    merges their tensors, optionally renames them to the Hugging Face layout
    (``--hf``), saves ``<prefix>-to-torch.pth`` in the input directory and
    optionally starts a chat session (``--chat``).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--input_dir", "-i", type=str, required=True, help="The input directory containing the ggml files."
    )
    parser.add_argument(
        "--prefix",
        "-p",
        type=str,
        required=True,
        help="The prefix of the ggml files (ggml-model-f16 or ggml-model-q4_0).",
    )
    parser.add_argument(
        "--hf",
        action="store_true",
        help="Whether to save the model in the huggingface format. (default: False)",
    )
    parser.add_argument(
        "--chat", "-c", action="store_true", help="Whether to open a chat with the model. (default: False)"
    )
    args = parser.parse_args()

    llama_dir = os.path.abspath(f"{args.input_dir}/../")

    # The checkpoint written below lives in the input directory and starts
    # with the prefix, so it must be excluded or a second run would try to
    # parse it as a ggml part.
    out_name = f"{args.prefix}-to-torch.pth"
    ggml_files = sorted(
        f"{args.input_dir}/{f}"
        for f in os.listdir(args.input_dir)
        if f.startswith(args.prefix) and f != out_name
    )
    if not ggml_files:
        parser.error(f"no files starting with '{args.prefix}' found in '{args.input_dir}'")

    # The first file supplies the hyper-parameters and vocabulary.
    with open(ggml_files[0], "rb") as fin:
        hparams, _ = read_header(fin)
        tokens = read_tokens(fin, hparams["vocab_size"])
        model = read_variables(fin)

    # Remaining parts: skip their header and vocabulary, merge their tensors.
    for f in tqdm(ggml_files[1:]):
        with open(f, "rb") as fin:
            read_header(fin)
            read_tokens(fin, hparams["vocab_size"])
            model.update(read_variables(fin))

    if args.hf:
        model = convert_to_hf_format(model, hparams)

    pth_ckpt = {
        "state_dict": model,
        "hparams": hparams,
        "tokens": tokens,
    }

    torch.save(pth_ckpt, f"{args.input_dir}/{out_name}")

    if args.chat:
        if not args.hf:
            model = convert_to_hf_format(model, hparams)
        chat(model, hparams, llama_dir)


if __name__ == "__main__":
    main()
|
||||
107
convert-gpt4all-to-ggml.py
Normal file
107
convert-gpt4all-to-ggml.py
Normal file
@@ -0,0 +1,107 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
#
|
||||
# TODO: deduplicate GPT4All with convert-unversioned-ggml-to-ggml.py
|
||||
#
|
||||
|
||||
# Original by https://github.com/eiz
|
||||
# https://github.com/ggerganov/llama.cpp/issues/324#issuecomment-1476227818
|
||||
import argparse
|
||||
import glob
|
||||
import os
|
||||
import struct
|
||||
import sys
|
||||
from sentencepiece import SentencePieceProcessor
|
||||
|
||||
HPARAMS = keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
|
||||
|
||||
def parse_args():
    """Parse the two positional command-line arguments of this script."""
    parser = argparse.ArgumentParser(description='Upgrade a GPT4All model to the current format')
    positionals = (
        ('gpt4all_model', 'path to gpt4all-lora-quantized.bin'),
        ('tokenizer_model', 'path to LLaMA tokenizer.model file'),
    )
    for dest, help_text in positionals:
        parser.add_argument(dest, help=help_text)
    return parser.parse_args()
|
||||
|
||||
def read_header(f_in):
    """Read the old-format header: ``3 + len(HPARAMS)`` int32 values, returned as a tuple."""
    layout = struct.Struct("i" * (3 + len(HPARAMS)))
    return layout.unpack(f_in.read(layout.size))
|
||||
|
||||
def write_header(f_out, header):
    """Write a new-format header built from an old-format ``header`` tuple.

    ``header`` must hold eight values, the first being the old magic number.
    The output is the new magic, file version 1, then the remaining seven
    values unchanged, all as int32.

    Raises:
        Exception: if the magic number is not the old-format one.
    """
    magic, vocab_size, dim, multiple_of, n_heads, n_layers, rot, ftype = header

    old_magic = 0x67676d6c  # ASCII "ggml"
    new_magic = 0x67676d66  # ASCII "ggmf"
    file_version = 1

    if magic != old_magic:
        raise Exception('Invalid file magic. Must be an old style ggml file.')

    fields = (
        new_magic,
        file_version,
        vocab_size,
        dim,
        multiple_of,
        n_heads,
        n_layers,
        rot,
        ftype,
    )
    f_out.write(struct.pack(f"{len(fields)}i", *fields))
|
||||
|
||||
def write_tokens(fout, tokenizer):
    """Write the tokenizer vocabulary, then one extra ``<pad>`` entry.

    Every entry is an int32 byte length, the token bytes and a float32 score.
    Unknown tokens are written as " \u2047 ", control tokens as empty, byte
    tokens as their single byte, and all others with U+2581 replaced by a
    space. The trailing ``<pad>`` entry has score 0.0. Exits with status 1 if
    a byte token's piece is not six characters long.
    """

    def emit(payload, score):
        fout.write(struct.pack("i", len(payload)))
        fout.write(payload)
        fout.write(struct.pack("f", score))

    def token_bytes(token_id):
        if tokenizer.is_unknown(token_id):
            return " \u2047 ".encode("utf-8")
        if tokenizer.is_control(token_id):
            return b""
        if tokenizer.is_byte(token_id):
            piece = tokenizer.id_to_piece(token_id)
            if len(piece) != 6:
                print(f"Invalid token: {piece}")
                sys.exit(1)
            # The piece has the form "<0xNN>"; keep only the two hex digits.
            return struct.pack("B", int(piece[3:-1], 16))
        return tokenizer.id_to_piece(token_id).replace("\u2581", " ").encode("utf-8")

    for token_id in range(tokenizer.vocab_size()):
        payload = token_bytes(token_id)
        emit(payload, tokenizer.get_score(token_id))

    # TODO: GPT4All - add extra <pad> token
    emit("<pad>".encode("utf-8"), 0.0)
|
||||
|
||||
def read_tokens(f_in, tokenizer):
    """Skip the vocabulary stored in ``f_in``.

    For each of ``tokenizer.vocab_size()`` entries an int32 length and that
    many bytes are read and discarded; no score is read.
    """
    remaining = tokenizer.vocab_size()
    while remaining > 0:
        (n_bytes,) = struct.unpack("i", f_in.read(4))
        f_in.read(n_bytes)
        remaining -= 1
|
||||
|
||||
def copy_all_data(f_out, f_in):
    """Copy everything left in ``f_in`` to ``f_out`` in chunks of 1 MiB."""
    chunk_size = 1024 * 1024
    chunk = f_in.read(chunk_size)
    while chunk:
        f_out.write(chunk)
        chunk = f_in.read(chunk_size)
|
||||
|
||||
def convert_one_file(path_in, tokenizer):
    """Convert ``path_in`` to the new format in place.

    The converted file is written to ``<path_in>.tmp``; afterwards the input
    is renamed to ``<path_in>.orig`` and the converted file takes its name.
    """
    staging = f"{path_in}.tmp"
    backup = f"{path_in}.orig"
    print(f"converting {path_in}")

    with open(path_in, "rb") as src:
        with open(staging, "wb") as dst:
            header = read_header(src)
            write_header(dst, header)
            # Skip the old vocabulary, then write it afresh from the tokenizer.
            read_tokens(src, tokenizer)
            write_tokens(dst, tokenizer)
            # Everything after the vocabulary is copied unchanged.
            copy_all_data(dst, src)

    os.rename(path_in, backup)
    os.rename(staging, path_in)
|
||||
|
||||
def main():
    """Entry point: load the tokenizer and convert the given GPT4All model file."""
    cli = parse_args()
    sp_tokenizer = SentencePieceProcessor(cli.tokenizer_model)
    convert_one_file(cli.gpt4all_model, sp_tokenizer)


if __name__ == "__main__":
    main()
|
||||
@@ -145,13 +145,11 @@ def main():
|
||||
|
||||
print(f"Extracting only the vocab from '{fname_model}'\n")
|
||||
|
||||
model = torch.load(fname_model, map_location="cpu")
|
||||
|
||||
with open(fname_out, "wb") as fout:
|
||||
write_header(fout, hparams, ftype)
|
||||
write_tokens(fout, tokenizer)
|
||||
|
||||
del model
|
||||
|
||||
print(f"Done. Output file: {fname_out}\n")
|
||||
|
||||
|
||||
100
convert-unversioned-ggml-to-ggml.py
Normal file
100
convert-unversioned-ggml-to-ggml.py
Normal file
@@ -0,0 +1,100 @@
|
||||
#!/usr/bin/env python3
|
||||
# Original by https://github.com/eiz
|
||||
# https://github.com/ggerganov/llama.cpp/issues/324#issuecomment-1476227818
|
||||
import argparse
|
||||
import glob
|
||||
import os
|
||||
import struct
|
||||
import sys
|
||||
from sentencepiece import SentencePieceProcessor
|
||||
|
||||
HPARAMS = keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
|
||||
|
||||
def parse_args():
    """Parse the two positional command-line arguments of this script."""
    parser = argparse.ArgumentParser(description='Upgrade old ggml model files to the current format')
    positionals = (
        ('dir_model', 'directory containing ggml .bin files'),
        ('tokenizer_model', 'path to LLaMA tokenizer.model file'),
    )
    for dest, help_text in positionals:
        parser.add_argument(dest, help=help_text)
    return parser.parse_args()
|
||||
|
||||
def read_header(f_in):
    """Read the old-format header: ``3 + len(HPARAMS)`` int32 values, returned as a tuple."""
    layout = struct.Struct("i" * (3 + len(HPARAMS)))
    return layout.unpack(f_in.read(layout.size))
|
||||
|
||||
def write_header(f_out, header):
    """Write a new-format header built from an old-format ``header`` tuple.

    ``header`` must hold eight values, the first being the old magic number.
    The output is the new magic, file version 1, then the remaining seven
    values unchanged, all as int32.

    Raises:
        Exception: if the magic number is not the old-format one.
    """
    magic, vocab_size, dim, multiple_of, n_heads, n_layers, rot, ftype = header

    old_magic = 0x67676d6c  # ASCII "ggml"
    new_magic = 0x67676d66  # ASCII "ggmf"
    file_version = 1

    if magic != old_magic:
        raise Exception('Invalid file magic. Must be an old style ggml file.')

    fields = (
        new_magic,
        file_version,
        vocab_size,
        dim,
        multiple_of,
        n_heads,
        n_layers,
        rot,
        ftype,
    )
    f_out.write(struct.pack(f"{len(fields)}i", *fields))
|
||||
|
||||
def write_tokens(fout, tokenizer):
    """Write the tokenizer vocabulary to ``fout``.

    Every entry is an int32 byte length, the token bytes and a float32 score.
    Unknown tokens are written as " \u2047 ", control tokens as empty, byte
    tokens as their single byte, and all others with U+2581 replaced by a
    space. Exits with status 1 if a byte token's piece is not six characters
    long.
    """

    def token_bytes(token_id):
        if tokenizer.is_unknown(token_id):
            return " \u2047 ".encode("utf-8")
        if tokenizer.is_control(token_id):
            return b""
        if tokenizer.is_byte(token_id):
            piece = tokenizer.id_to_piece(token_id)
            if len(piece) != 6:
                print(f"Invalid token: {piece}")
                sys.exit(1)
            # The piece has the form "<0xNN>"; keep only the two hex digits.
            return struct.pack("B", int(piece[3:-1], 16))
        return tokenizer.id_to_piece(token_id).replace("\u2581", " ").encode("utf-8")

    for token_id in range(tokenizer.vocab_size()):
        payload = token_bytes(token_id)
        fout.write(struct.pack("i", len(payload)))
        fout.write(payload)
        fout.write(struct.pack("f", tokenizer.get_score(token_id)))
|
||||
|
||||
def read_tokens(f_in, tokenizer):
    """Skip the vocabulary stored in ``f_in``.

    For each of ``tokenizer.vocab_size()`` entries an int32 length and that
    many bytes are read and discarded; no score is read.
    """
    remaining = tokenizer.vocab_size()
    while remaining > 0:
        (n_bytes,) = struct.unpack("i", f_in.read(4))
        f_in.read(n_bytes)
        remaining -= 1
|
||||
|
||||
def copy_all_data(f_out, f_in):
    """Copy everything left in ``f_in`` to ``f_out`` in chunks of 1 MiB."""
    chunk_size = 1024 * 1024
    chunk = f_in.read(chunk_size)
    while chunk:
        f_out.write(chunk)
        chunk = f_in.read(chunk_size)
|
||||
|
||||
def convert_one_file(path_in, tokenizer):
    """Convert ``path_in`` to the new format in place.

    The converted file is written to ``<path_in>.tmp``; afterwards the input
    is renamed to ``<path_in>.orig`` and the converted file takes its name.
    """
    staging = f"{path_in}.tmp"
    backup = f"{path_in}.orig"
    print(f"converting {path_in}")

    with open(path_in, "rb") as src:
        with open(staging, "wb") as dst:
            header = read_header(src)
            write_header(dst, header)
            # Skip the old vocabulary, then write it afresh from the tokenizer.
            read_tokens(src, tokenizer)
            write_tokens(dst, tokenizer)
            # Everything after the vocabulary is copied unchanged.
            copy_all_data(dst, src)

    os.rename(path_in, backup)
    os.rename(staging, path_in)
|
||||
|
||||
def main():
    """Entry point: convert every ``*.bin`` and ``*.bin.*`` file in the model directory."""
    cli = parse_args()

    # Collected before any conversion starts, in the same order as the patterns.
    patterns = ("*.bin", "*.bin.*")
    targets = [path for pattern in patterns for path in glob.glob(f"{cli.dir_model}/{pattern}")]

    sp_tokenizer = SentencePieceProcessor(cli.tokenizer_model)

    for path in targets:
        convert_one_file(path, sp_tokenizer)


if __name__ == "__main__":
    main()
|
||||
57
examples/chat-13B.bat
Normal file
57
examples/chat-13B.bat
Normal file
@@ -0,0 +1,57 @@
|
||||
@setlocal disabledelayedexpansion enableextensions
@echo off

rem Interactive chat launcher. Every setting below can be overridden by
rem defining the corresponding environment variable before calling this script.
rem Usage: chat-13B.bat [path to main.exe]

rem Work from the parent directory of the folder containing this script.
cd /d "%~dp0.."
if not "%errorlevel%"=="0" (
    echo Unable to change directory.
    pause
    exit /b 1
)

if not defined MODEL set "MODEL=models\13B\ggml-model-q4_0.bin"
if not defined USER_NAME set "USER_NAME=User"
if not defined AI_NAME set "AI_NAME=ChatLLaMa"
rem Adjust to the number of CPU cores you want to use.
rem if not defined N_THREAD set "N_THREAD=8"
rem Number of tokens to predict (made it larger than default because we want a long interaction)
if not defined N_PREDICTS set "N_PREDICTS=2048"
if not defined GEN_OPTIONS set "GEN_OPTIONS=--ctx_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647"

rem Default main script paths
set "DEFAULT_MAIN_SCRIPT_PATHS=main.exe build\bin\main.exe"

rem Get main script path from command line arguments
set "MAIN_SCRIPT_PATH=%~1"

rem If the main script path was not specified, try the default paths
rem (the last existing candidate wins)
if not defined MAIN_SCRIPT_PATH (
    for %%i in (%DEFAULT_MAIN_SCRIPT_PATHS%) do (
        if exist "%%i" set "MAIN_SCRIPT_PATH=%%i"
    )
)

rem If the main script path was not found, tell the user how to specify it
if not defined MAIN_SCRIPT_PATH (
    echo The main script could not be found. Please provide the path to the main script as 1st argument to this script, or place the main script in one of the default locations:
    echo %DEFAULT_MAIN_SCRIPT_PATHS%
    pause
    exit /b 1
)

rem Default context, feel free to edit it
set "PROMPT_TEXT=Text transcript of a never ending dialog, where %USER_NAME% interacts with an AI assistant named %AI_NAME%. %AI_NAME% is helpful, kind, honest, friendly, good at writing and never fails to answer %USER_NAME%'s requests immediately and with details and precision. There are no annotations like (30 seconds passed...) or (to himself), just what %USER_NAME% and %AI_NAME% say aloud to each other. The dialog lasts for years, the entirety of it is shared below. It's 10000 pages long. The transcript only includes text, it does not include markup like HTML and Markdown."

rem Set a temporary variable if N_THREAD is set
if defined N_THREAD (
    set "_N_THREAD=--threads %N_THREAD%"
) else (
    set "_N_THREAD="
)

rem Run the script
rem (the command was previously prefixed with "echo", which only printed the
rem command line instead of starting the program)
"%MAIN_SCRIPT_PATH%" %GEN_OPTIONS% %_N_THREAD% ^
  --model "%MODEL%" ^
  --n_predict %N_PREDICTS% ^
  --color --interactive ^
  --reverse-prompt "%USER_NAME%:" ^
  --prompt "%PROMPT_TEXT%"
|
||||
@@ -215,13 +215,13 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||
fprintf(stderr, " prompt file to start generation.\n");
|
||||
fprintf(stderr, " -n N, --n_predict N number of tokens to predict (default: %d, -1 = infinity)\n", params.n_predict);
|
||||
fprintf(stderr, " --top_k N top-k sampling (default: %d)\n", params.top_k);
|
||||
fprintf(stderr, " --top_p N top-p sampling (default: %.1f)\n", params.top_p);
|
||||
fprintf(stderr, " --top_p N top-p sampling (default: %.1f)\n", (double)params.top_p);
|
||||
fprintf(stderr, " --repeat_last_n N last n tokens to consider for penalize (default: %d)\n", params.repeat_last_n);
|
||||
fprintf(stderr, " --repeat_penalty N penalize repeat sequence of tokens (default: %.1f)\n", params.repeat_penalty);
|
||||
fprintf(stderr, " --repeat_penalty N penalize repeat sequence of tokens (default: %.1f)\n", (double)params.repeat_penalty);
|
||||
fprintf(stderr, " -c N, --ctx_size N size of the prompt context (default: %d)\n", params.n_ctx);
|
||||
fprintf(stderr, " --ignore-eos ignore end of stream token and continue generating\n");
|
||||
fprintf(stderr, " --memory_f32 use f32 instead of f16 for memory key+value\n");
|
||||
fprintf(stderr, " --temp N temperature (default: %.1f)\n", params.temp);
|
||||
fprintf(stderr, " --temp N temperature (default: %.1f)\n", (double)params.temp);
|
||||
fprintf(stderr, " --n_parts N number of model parts (default: -1 = determine from dimensions)\n");
|
||||
fprintf(stderr, " -b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch);
|
||||
fprintf(stderr, " --perplexity compute perplexity over the prompt\n");
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
set(TARGET embedding)
|
||||
add_executable(${TARGET} embedding.cpp)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ggml ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
set(TARGET main)
|
||||
add_executable(${TARGET} main.cpp)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ggml ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
|
||||
@@ -209,7 +209,8 @@ int main(int argc, char ** argv) {
|
||||
fprintf(stderr, "Input prefix: '%s'\n", params.input_prefix.c_str());
|
||||
}
|
||||
}
|
||||
fprintf(stderr, "sampling: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n", params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty);
|
||||
fprintf(stderr, "sampling: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n",
|
||||
params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty);
|
||||
fprintf(stderr, "generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
|
||||
fprintf(stderr, "\n\n");
|
||||
|
||||
@@ -274,10 +275,10 @@ int main(int argc, char ** argv) {
|
||||
|
||||
if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
|
||||
// out of user input, sample next token
|
||||
const float top_k = params.top_k;
|
||||
const float top_p = params.top_p;
|
||||
const float temp = params.temp;
|
||||
const float repeat_penalty = params.repeat_penalty;
|
||||
const int32_t top_k = params.top_k;
|
||||
const float top_p = params.top_p;
|
||||
const float temp = params.temp;
|
||||
const float repeat_penalty = params.repeat_penalty;
|
||||
|
||||
llama_token id = 0;
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
set(TARGET perplexity)
|
||||
add_executable(${TARGET} perplexity.cpp)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ggml ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
|
||||
@@ -1,15 +1,17 @@
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
|
||||
std::vector<double> softmax(const std::vector<float>& logits) {
|
||||
std::vector<double> probs(logits.size());
|
||||
#include <cmath>
|
||||
|
||||
std::vector<float> softmax(const std::vector<float>& logits) {
|
||||
std::vector<float> probs(logits.size());
|
||||
float max_logit = logits[0];
|
||||
for (float v : logits) max_logit = std::max(max_logit, v);
|
||||
double sum_exp = 0.0;
|
||||
for (size_t i = 0; i < logits.size(); i++) {
|
||||
// Subtract the maximum logit value from the current logit value for numerical stability
|
||||
float logit = logits[i] - max_logit;
|
||||
double exp_logit = std::exp(logit);
|
||||
const float logit = logits[i] - max_logit;
|
||||
const float exp_logit = expf(logit);
|
||||
sum_exp += exp_logit;
|
||||
probs[i] = exp_logit;
|
||||
}
|
||||
@@ -24,14 +26,16 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
|
||||
auto tokens = ::llama_tokenize(ctx, params.prompt, true);
|
||||
|
||||
int count = 0;
|
||||
double nll = 0.0;
|
||||
int seq_count = tokens.size() / params.n_ctx;
|
||||
|
||||
double nll = 0.0;
|
||||
|
||||
fprintf(stderr, "%s : calculating perplexity over %d chunks\n", __func__, seq_count);
|
||||
|
||||
for (int i = 0; i < seq_count; ++i) {
|
||||
int start = i * params.n_ctx;
|
||||
int end = start + params.n_ctx - 1;
|
||||
int end = start + params.n_ctx - 1; // TODO: this is not optimal, e.g. it makes the batch 511 instead of 512
|
||||
// it is better to always be power of 2 for better performance
|
||||
std::vector<llama_token> embd(tokens.begin() + start, tokens.begin() + end);
|
||||
auto start_t = std::chrono::high_resolution_clock::now();
|
||||
if (llama_eval(ctx, embd.data(), embd.size(), 0, params.n_threads)) {
|
||||
@@ -40,7 +44,7 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
|
||||
}
|
||||
auto end_t = std::chrono::high_resolution_clock::now();
|
||||
if (i == 0) {
|
||||
double seconds = std::chrono::duration<double>(end_t - start_t).count();
|
||||
const float seconds = std::chrono::duration<float>(end_t - start_t).count();
|
||||
printf("%.2f seconds per pass - ETA %.2f hours\n", seconds, (seconds * seq_count) / (60.0*60.0));
|
||||
}
|
||||
// We get the logits for all the tokens in the context window (params.n_ctx)
|
||||
@@ -63,7 +67,7 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
|
||||
std::vector<float> tok_logits(
|
||||
logits + j * n_vocab,
|
||||
logits + (j + 1) * n_vocab);
|
||||
double prob = softmax(tok_logits)[tokens[start + j + 1]];
|
||||
const float prob = softmax(tok_logits)[tokens[start + j + 1]];
|
||||
nll += -std::log(prob);
|
||||
++count;
|
||||
}
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
set(TARGET quantize)
|
||||
add_executable(${TARGET} quantize.cpp)
|
||||
target_link_libraries(${TARGET} PRIVATE llama ggml ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
|
||||
@@ -4,8 +4,6 @@
|
||||
#include <cstdio>
|
||||
#include <string>
|
||||
|
||||
const int QK = 32;
|
||||
|
||||
// usage:
|
||||
// ./llama-quantize models/llama/ggml-model.bin models/llama/ggml-model-quant.bin type
|
||||
//
|
||||
@@ -39,7 +37,7 @@ int main(int argc, char ** argv) {
|
||||
{
|
||||
const int64_t t_start_us = ggml_time_us();
|
||||
|
||||
if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), itype, QK)) {
|
||||
if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), itype)) {
|
||||
fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
|
||||
return 1;
|
||||
}
|
||||
@@ -52,8 +50,8 @@ int main(int argc, char ** argv) {
|
||||
const int64_t t_main_end_us = ggml_time_us();
|
||||
|
||||
printf("\n");
|
||||
printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0f);
|
||||
printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
|
||||
printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0);
|
||||
printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0);
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
||||
17
examples/reason-act.sh
Executable file
17
examples/reason-act.sh
Executable file
@@ -0,0 +1,17 @@
|
||||
|
||||
#!/bin/bash

# Start ./main interactively with the reason-act prompt.
# Usage: reason-act.sh [-m MODEL_PATH]

# Run from the parent of this script's directory; abort if either cd fails so
# ./main and ./prompts are never resolved against the wrong directory.
cd "$(dirname "$0")" || exit 1
cd .. || exit 1

# get -m model parameter otherwise defer to default
# (kept in an array so a model path containing spaces stays one argument)
MODEL_ARGS=()
if [ "$1" = "-m" ]; then
    MODEL_ARGS=(-m "$2")
fi

./main "${MODEL_ARGS[@]}" --color \
    -f ./prompts/reason-act.txt \
    -i --interactive-first \
    --top_k 10000 --temp 0.2 --repeat_penalty 1 -t 7 -c 2048 \
    -r "Question:" -r "Observation:" --in-prefix " " \
    -n -1
|
||||
4
ggml.h
4
ggml.h
@@ -748,8 +748,8 @@ enum ggml_opt_result ggml_opt(
|
||||
// quantization
|
||||
//
|
||||
|
||||
size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int qk, int64_t * hist);
|
||||
size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int qk, int64_t * hist);
|
||||
size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
|
||||
//
|
||||
// system info
|
||||
|
||||
73
llama.cpp
73
llama.cpp
@@ -320,7 +320,7 @@ static bool llama_model_load(
|
||||
uint32_t magic;
|
||||
fin.read((char *) &magic, sizeof(magic));
|
||||
if (magic == LLAMA_FILE_MAGIC_UNVERSIONED) {
|
||||
fprintf(stderr, "%s: invalid model file '%s' (too old, regenerate your model files!)\n",
|
||||
fprintf(stderr, "%s: invalid model file '%s' (too old, regenerate your model files or convert them with convert-unversioned-ggml-to-ggml.py!)\n",
|
||||
__func__, fname.c_str());
|
||||
return false;
|
||||
}
|
||||
@@ -779,8 +779,8 @@ static bool llama_model_load(
|
||||
|
||||
// progress
|
||||
if (progress_callback) {
|
||||
double current_file_progress = double(size_t(fin.tellg()) - file_offset) / double(file_size - file_offset);
|
||||
double current_progress = (double(i) + current_file_progress) / double(n_parts);
|
||||
float current_file_progress = float(size_t(fin.tellg()) - file_offset) / float(file_size - file_offset);
|
||||
float current_progress = (float(i) + current_file_progress) / float(n_parts);
|
||||
progress_callback(current_progress, progress_callback_user_data);
|
||||
}
|
||||
if (model.n_loaded % 8 == 0) {
|
||||
@@ -856,7 +856,7 @@ static bool llama_eval_internal(
|
||||
// for big prompts, if BLAS is enabled, it is better to use only one thread
|
||||
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
|
||||
ggml_cgraph gf = {};
|
||||
gf.n_threads = N > 255 && ggml_cpu_has_blas() ? 1 : n_threads;
|
||||
gf.n_threads = N >= 32 && ggml_cpu_has_blas() ? 1 : n_threads;
|
||||
|
||||
struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
|
||||
memcpy(embd->data, tokens, N*ggml_element_size(embd));
|
||||
@@ -922,7 +922,7 @@ static bool llama_eval_internal(
|
||||
struct ggml_tensor * KQ_scaled =
|
||||
ggml_scale(ctx0,
|
||||
KQ,
|
||||
ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head)));
|
||||
ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)));
|
||||
|
||||
// KQ_masked = mask_past(KQ_scaled)
|
||||
struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
|
||||
@@ -1240,12 +1240,12 @@ static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, co
|
||||
// sampling
|
||||
//
|
||||
|
||||
static void sample_top_k(std::vector<std::pair<double, llama_vocab::id>> & logits_id, int top_k) {
|
||||
static void sample_top_k(std::vector<std::pair<float, llama_vocab::id>> & logits_id, int top_k) {
|
||||
// find the top k tokens
|
||||
std::partial_sort(
|
||||
logits_id.begin(),
|
||||
logits_id.begin() + top_k, logits_id.end(),
|
||||
[](const std::pair<double, llama_vocab::id> & a, const std::pair<double, llama_vocab::id> & b) {
|
||||
[](const std::pair<float, llama_vocab::id> & a, const std::pair<float, llama_vocab::id> & b) {
|
||||
return a.first > b.first;
|
||||
});
|
||||
|
||||
@@ -1256,9 +1256,9 @@ static llama_vocab::id llama_sample_top_p_top_k(
|
||||
llama_context & lctx,
|
||||
const std::vector<llama_vocab::id> & last_n_tokens,
|
||||
int top_k,
|
||||
double top_p,
|
||||
double temp,
|
||||
double repeat_penalty) {
|
||||
float top_p,
|
||||
float temp,
|
||||
float repeat_penalty) {
|
||||
auto & rng = lctx.rng;
|
||||
|
||||
const int n_logits = lctx.model.hparams.n_vocab;
|
||||
@@ -1266,17 +1266,17 @@ static llama_vocab::id llama_sample_top_p_top_k(
|
||||
const auto & logits = lctx.logits;
|
||||
const auto * plogits = logits.data() + logits.size() - n_logits;
|
||||
|
||||
std::vector<std::pair<double, llama_vocab::id>> logits_id;
|
||||
std::vector<std::pair<float, llama_vocab::id>> logits_id;
|
||||
logits_id.reserve(n_logits);
|
||||
|
||||
{
|
||||
const double scale = 1.0/temp;
|
||||
const float scale = 1.0f/temp;
|
||||
for (int i = 0; i < n_logits; ++i) {
|
||||
// repetition penalty from ctrl paper (https://arxiv.org/abs/1909.05858)
|
||||
// credit https://github.com/facebookresearch/llama/compare/main...shawwn:llama:main
|
||||
if (std::find(last_n_tokens.begin(), last_n_tokens.end(), i) != last_n_tokens.end()) {
|
||||
// if score < 0 then repetition penalty has to multiplied to reduce the previous token probability
|
||||
if (plogits[i] < 0.0) {
|
||||
if (plogits[i] < 0.0f) {
|
||||
logits_id.push_back(std::make_pair(plogits[i]*scale*repeat_penalty, i));
|
||||
} else {
|
||||
logits_id.push_back(std::make_pair(plogits[i]*scale/repeat_penalty, i));
|
||||
@@ -1289,18 +1289,18 @@ static llama_vocab::id llama_sample_top_p_top_k(
|
||||
|
||||
sample_top_k(logits_id, top_k);
|
||||
|
||||
double maxl = -std::numeric_limits<double>::infinity();
|
||||
float maxl = -std::numeric_limits<float>::infinity();
|
||||
for (const auto & kv : logits_id) {
|
||||
maxl = std::max(maxl, kv.first);
|
||||
}
|
||||
|
||||
// compute probs for the top k tokens
|
||||
std::vector<double> probs;
|
||||
std::vector<float> probs;
|
||||
probs.reserve(logits_id.size());
|
||||
|
||||
double sum = 0.0;
|
||||
for (const auto & kv : logits_id) {
|
||||
double p = exp(kv.first - maxl);
|
||||
const float p = expf(kv.first - maxl);
|
||||
probs.push_back(p);
|
||||
sum += p;
|
||||
}
|
||||
@@ -1310,8 +1310,8 @@ static llama_vocab::id llama_sample_top_p_top_k(
|
||||
p /= sum;
|
||||
}
|
||||
|
||||
if (top_p < 1.0f) {
|
||||
double cumsum = 0.0f;
|
||||
if (top_p < 1.0) {
|
||||
double cumsum = 0.0;
|
||||
for (int i = 0; i < (int) probs.size(); i++) {
|
||||
cumsum += probs[i];
|
||||
if (cumsum >= top_p) {
|
||||
@@ -1345,7 +1345,7 @@ static llama_vocab::id llama_sample_top_p_top_k(
|
||||
//
|
||||
|
||||
// TODO: reuse code from the llama_model_load() somehow
|
||||
bool llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, int itype, int qk) {
|
||||
static bool llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, int itype) {
|
||||
ggml_type type = GGML_TYPE_Q4_1;
|
||||
|
||||
switch (itype) {
|
||||
@@ -1444,7 +1444,7 @@ bool llama_model_quantize_internal(const std::string & fname_inp, const std::str
|
||||
return false;
|
||||
}
|
||||
|
||||
std::string word;
|
||||
std::vector<char> word(32);
|
||||
vocab.id_to_token.resize(n_vocab);
|
||||
for (int i = 0; i < n_vocab; i++) {
|
||||
uint32_t len;
|
||||
@@ -1459,10 +1459,10 @@ bool llama_model_quantize_internal(const std::string & fname_inp, const std::str
|
||||
finp.read ((char *) &score, sizeof(score));
|
||||
fout.write((char *) &score, sizeof(score));
|
||||
|
||||
vocab.token_to_id[word] = i;
|
||||
vocab.token_to_id[word.data()] = i;
|
||||
|
||||
auto &tok_score = vocab.id_to_token[i];
|
||||
tok_score.tok = word;
|
||||
tok_score.tok = word.data();
|
||||
tok_score.score = score;
|
||||
}
|
||||
}
|
||||
@@ -1568,11 +1568,11 @@ bool llama_model_quantize_internal(const std::string & fname_inp, const std::str
|
||||
switch (type) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
{
|
||||
cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], qk, hist_cur.data());
|
||||
cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
|
||||
} break;
|
||||
case GGML_TYPE_Q4_1:
|
||||
{
|
||||
cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], qk, hist_cur.data());
|
||||
cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
@@ -1590,7 +1590,7 @@ bool llama_model_quantize_internal(const std::string & fname_inp, const std::str
|
||||
}
|
||||
|
||||
for (int i = 0; i < (int) hist_cur.size(); ++i) {
|
||||
printf("%5.3f ", hist_cur[i] / (float)nelements);
|
||||
printf("%5.3f ", hist_cur[i] / float(nelements));
|
||||
}
|
||||
printf("\n");
|
||||
} else {
|
||||
@@ -1613,7 +1613,7 @@ bool llama_model_quantize_internal(const std::string & fname_inp, const std::str
|
||||
|
||||
printf("%s: hist: ", __func__);
|
||||
for (int i = 0; i < (int) hist_all.size(); ++i) {
|
||||
printf("%5.3f ", hist_all[i] / (float)sum_all);
|
||||
printf("%5.3f ", hist_all[i] / float(sum_all));
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
@@ -1711,9 +1711,8 @@ void llama_free(struct llama_context * ctx) {
|
||||
int llama_model_quantize(
|
||||
const char * fname_inp,
|
||||
const char * fname_out,
|
||||
int itype,
|
||||
int qk) {
|
||||
if (!llama_model_quantize_internal(fname_inp, fname_out, itype, qk)) {
|
||||
int itype) {
|
||||
if (!llama_model_quantize_internal(fname_inp, fname_out, itype)) {
|
||||
fprintf(stderr, "%s: failed to quantize\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
@@ -1796,9 +1795,9 @@ llama_token llama_sample_top_p_top_k(
|
||||
const llama_token * last_n_tokens_data,
|
||||
int last_n_tokens_size,
|
||||
int top_k,
|
||||
double top_p,
|
||||
double temp,
|
||||
double repeat_penalty) {
|
||||
float top_p,
|
||||
float temp,
|
||||
float repeat_penalty) {
|
||||
const int64_t t_start_sample_us = ggml_time_us();
|
||||
|
||||
llama_token result = 0;
|
||||
@@ -1829,11 +1828,11 @@ void llama_print_timings(struct llama_context * ctx) {
|
||||
const int32_t n_p_eval = std::max(1, ctx->n_p_eval);
|
||||
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0f);
|
||||
fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f * ctx->t_sample_us, n_sample, 1e-3f * ctx->t_sample_us / n_sample);
|
||||
fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token)\n", __func__, 1e-3f * ctx->t_p_eval_us, n_p_eval, 1e-3f * ctx->t_p_eval_us / n_p_eval);
|
||||
fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f * ctx->t_eval_us, n_eval, 1e-3f * ctx->t_eval_us / n_eval);
|
||||
fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0f);
|
||||
fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
|
||||
fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample);
|
||||
fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_p_eval_us, n_p_eval, 1e-3 * ctx->t_p_eval_us / n_p_eval);
|
||||
fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval);
|
||||
fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0);
|
||||
}
|
||||
|
||||
void llama_reset_timings(struct llama_context * ctx) {
|
||||
|
||||
13
llama.h
13
llama.h
@@ -6,7 +6,7 @@
|
||||
#include <stdbool.h>
|
||||
|
||||
#ifdef LLAMA_SHARED
|
||||
# ifdef _WIN32
|
||||
# if defined(_WIN32) && !defined(__MINGW32__)
|
||||
# ifdef LLAMA_BUILD
|
||||
# define LLAMA_API __declspec(dllexport)
|
||||
# else
|
||||
@@ -45,7 +45,7 @@ extern "C" {
|
||||
|
||||
} llama_token_data;
|
||||
|
||||
typedef void (*llama_progress_callback)(double progress, void *ctx);
|
||||
typedef void (*llama_progress_callback)(float progress, void *ctx);
|
||||
|
||||
struct llama_context_params {
|
||||
int n_ctx; // text context
|
||||
@@ -81,8 +81,7 @@ extern "C" {
|
||||
LLAMA_API int llama_model_quantize(
|
||||
const char * fname_inp,
|
||||
const char * fname_out,
|
||||
int itype,
|
||||
int qk);
|
||||
int itype);
|
||||
|
||||
// Run the llama inference to obtain the logits and probabilities for the next token.
|
||||
// tokens + n_tokens is the provided batch of new tokens to process
|
||||
@@ -135,9 +134,9 @@ extern "C" {
|
||||
const llama_token * last_n_tokens_data,
|
||||
int last_n_tokens_size,
|
||||
int top_k,
|
||||
double top_p,
|
||||
double temp,
|
||||
double repeat_penalty);
|
||||
float top_p,
|
||||
float temp,
|
||||
float repeat_penalty);
|
||||
|
||||
// Performance information
|
||||
LLAMA_API void llama_print_timings(struct llama_context * ctx);
|
||||
|
||||
18
prompts/reason-act.txt
Normal file
18
prompts/reason-act.txt
Normal file
@@ -0,0 +1,18 @@
|
||||
You run in a loop of Thought, Action, Observation.
|
||||
At the end of the loop either Answer or restate your Thought and Action.
|
||||
Use Thought to describe your thoughts about the question you have been asked.
|
||||
Use Action to run one of these actions available to you:
|
||||
- calculate[python math expression]
|
||||
Observation will be the result of running those actions
|
||||
|
||||
|
||||
Question: What is 4 * 7 / 3?
|
||||
Thought: Do I need to use an action? Yes, I use calculate to do math
|
||||
Action: calculate[4 * 7 / 3]
|
||||
Observation: 9.3333333333
|
||||
Thought: Do I need to use an action? No, I have the result
|
||||
Answer: The calculate tool says it is 9.3333333333
|
||||
Question: What is the capital of France?
|
||||
Thought: Do I need to use an action? No, I know the answer
|
||||
Answer: Paris is the capital of France
|
||||
Question:
|
||||
@@ -74,6 +74,10 @@ def main():
|
||||
args.models_path, model, "ggml-model-f16.bin"
|
||||
)
|
||||
|
||||
if not os.path.isfile(f16_model_path_base):
|
||||
print(f'The file %s was not found' % f16_model_path_base)
|
||||
sys.exit(1)
|
||||
|
||||
f16_model_parts_paths = map(
|
||||
lambda filename: os.path.join(f16_model_path_base, filename),
|
||||
glob.glob(f"{f16_model_path_base}*")
|
||||
|
||||
1
spm-headers/llama.h
Symbolic link
1
spm-headers/llama.h
Symbolic link
@@ -0,0 +1 @@
|
||||
../llama.h
|
||||
@@ -5,5 +5,6 @@ function(llama_add_test source)
|
||||
add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${ARGN})
|
||||
endfunction()
|
||||
|
||||
# llama_add_test(test-double-float.c) # SLOW
|
||||
llama_add_test(test-quantize.c)
|
||||
llama_add_test(test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin)
|
||||
|
||||
53
tests/test-double-float.c
Normal file
53
tests/test-double-float.c
Normal file
@@ -0,0 +1,53 @@
|
||||
// These tests may take a long time!
|
||||
// They are to prove that conversion from double to float of various functions in ggml.c doesn't affect the result.
|
||||
// This is done by checking all finite (non-NaN, non-infinite) floats.
|
||||
|
||||
#undef NDEBUG
|
||||
#include <assert.h>
|
||||
#include <immintrin.h>
|
||||
#include <math.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#pragma GCC diagnostic push
|
||||
#pragma GCC diagnostic ignored "-Wdouble-promotion"
|
||||
|
||||
// ggml.c::quantize_row_q4_0_reference
|
||||
inline static uint8_t round_orig(float v0) { return ((int8_t) (round(v0))) + 8; }
|
||||
|
||||
// ggml.c::ggml_silu_f32
|
||||
inline static float silu_orig(float x) {
|
||||
return x/(1.0 + exp(-x));
|
||||
}
|
||||
|
||||
#pragma GCC diagnostic pop
|
||||
|
||||
// ggml.c::quantize_row_q4_0_reference
|
||||
inline static uint8_t round_float(float v0) { return (int8_t)roundf(v0) + 8; }
|
||||
|
||||
// ggml.c::ggml_silu_f32
|
||||
inline static float silu_float(float x) {
|
||||
return x/(1.0f + expf(-x));
|
||||
}
|
||||
|
||||
// Compares the double-promoting helpers (*_orig) with the float-only ones
// (*_float); any disagreement trips an assert. Returns 0 when every check passes.
int main(void) {
    // Reinterpret the 32-bit pattern through a union: this is well-defined in C,
    // whereas a direct *(float *)&x cast violates the strict aliasing rule.
    union {
        uint32_t u;
        float    f;
    } bits;

    // Visit every 32-bit pattern, counting down from UINT32_MAX to 0 inclusive.
    uint32_t x = UINT32_MAX;
    do {
        bits.u = x;
        const float f = bits.f;
        // Only finite values are compared; NaN and infinities are skipped.
        assert(!isfinite(f) || (round_orig(f) == round_float(f)));
    } while (x--);

#ifdef __F16C__
    // GELU and SILU implementations are used with a FP16 lookup table.
    // The original and float-only results are not equal for all inputs after converting to FP16.
    // GELU is an approximation anyway (tanh), not tested here.
    // For SILU, verify that the results are at least the closest floating point numbers, if the FP16 values don't match.
    for (x = 0; x <= UINT16_MAX; x++) {
        float f = _cvtsh_ss(x);
        const float so = silu_orig(f);
        const float sf = silu_float(f);
        assert(   (_cvtss_sh(so, 0) == _cvtss_sh(sf, 0))
               || (nextafterf(so, sf) == sf)
               || (nextafterf(sf, so) == so));
    }
#endif

    return 0;
}
|
||||
@@ -13,7 +13,7 @@ int main(void) {
|
||||
src[i] = (float)(i + 1);
|
||||
}
|
||||
|
||||
size_t size = ggml_quantize_q4_0(src, dst, QK, QK, QK, hist);
|
||||
size_t size = ggml_quantize_q4_0(src, dst, QK, QK, hist);
|
||||
assert(size == 20);
|
||||
float max_result = ((float *)dst)[0];
|
||||
float max_expected = src[31] / ((1 << 3) - 1);
|
||||
@@ -24,7 +24,7 @@ int main(void) {
|
||||
assert(q4_result == q4_expected);
|
||||
}
|
||||
|
||||
size = ggml_quantize_q4_1(src, dst, QK, QK, QK, hist);
|
||||
size = ggml_quantize_q4_1(src, dst, QK, QK, hist);
|
||||
assert(size == 24);
|
||||
float delta_result = ((float *)dst)[0];
|
||||
float delta_expected = (src[31] - src[0]) / ((1 << 4) - 1);
|
||||
|
||||
@@ -77,5 +77,7 @@ int main(int argc, char **argv) {
|
||||
}
|
||||
}
|
||||
|
||||
llama_free(ctx);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user