mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-02-05 13:53:23 +02:00
Compare commits
17 Commits
b1336
...
fix-refact
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
acead654d2 | ||
|
|
94e502dfb7 | ||
|
|
7d8b24932f | ||
|
|
0f8df395ce | ||
|
|
b0ec5218c3 | ||
|
|
63d3b06a43 | ||
|
|
a16e89cec8 | ||
|
|
4d03833211 | ||
|
|
c47066d833 | ||
|
|
f1782c68de | ||
|
|
c26765a0a1 | ||
|
|
42833bc7a8 | ||
|
|
bdbe11719d | ||
|
|
0e797c2fc5 | ||
|
|
3a716b4dae | ||
|
|
1faaae8c2b | ||
|
|
cb13d73a72 |
4
.github/workflows/build.yml
vendored
4
.github/workflows/build.yml
vendored
@@ -10,10 +10,10 @@ on:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift']
|
||||
paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m']
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift']
|
||||
paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m']
|
||||
|
||||
env:
|
||||
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
|
||||
|
||||
3
.github/workflows/gguf-publish.yml
vendored
3
.github/workflows/gguf-publish.yml
vendored
@@ -36,8 +36,9 @@ jobs:
|
||||
poetry install
|
||||
|
||||
- name: Build package
|
||||
run: poetry build
|
||||
run: cd gguf-py && poetry build
|
||||
- name: Publish package
|
||||
uses: pypa/gh-action-pypi-publish@release/v1
|
||||
with:
|
||||
password: ${{ secrets.PYPI_API_TOKEN }}
|
||||
packages-dir: gguf-py/dist
|
||||
|
||||
1
.gitignore
vendored
1
.gitignore
vendored
@@ -10,6 +10,7 @@
|
||||
*.gcno
|
||||
*.gcda
|
||||
*.dot
|
||||
*.metallib
|
||||
.DS_Store
|
||||
.build/
|
||||
.cache/
|
||||
|
||||
@@ -10,15 +10,18 @@ let platforms: [SupportedPlatform]? = [
|
||||
.tvOS(.v14)
|
||||
]
|
||||
let exclude: [String] = []
|
||||
let additionalSources: [String] = ["ggml-metal.m", "ggml-metal.metal"]
|
||||
let resources: [Resource] = [
|
||||
.process("ggml-metal.metal")
|
||||
]
|
||||
let additionalSources: [String] = ["ggml-metal.m"]
|
||||
let additionalSettings: [CSetting] = [
|
||||
.unsafeFlags(["-fno-objc-arc"]),
|
||||
.define("GGML_SWIFT"),
|
||||
.define("GGML_USE_METAL")
|
||||
]
|
||||
#else
|
||||
let platforms: [SupportedPlatform]? = nil
|
||||
let exclude: [String] = ["ggml-metal.metal"]
|
||||
let resources: [Resource] = []
|
||||
let additionalSources: [String] = []
|
||||
let additionalSettings: [CSetting] = []
|
||||
#endif
|
||||
@@ -40,6 +43,7 @@ let package = Package(
|
||||
"ggml-alloc.c",
|
||||
"k_quants.c",
|
||||
] + additionalSources,
|
||||
resources: resources,
|
||||
publicHeadersPath: "spm-headers",
|
||||
cSettings: [
|
||||
.unsafeFlags(["-Wno-shorten-64-to-32"]),
|
||||
|
||||
27
README.md
27
README.md
@@ -95,6 +95,7 @@ as the main playground for developing new features for the [ggml](https://github
|
||||
- [X] [Aquila-7B](https://huggingface.co/BAAI/Aquila-7B) / [AquilaChat-7B](https://huggingface.co/BAAI/AquilaChat-7B)
|
||||
- [X] [Starcoder models](https://github.com/ggerganov/llama.cpp/pull/3187)
|
||||
- [X] [Mistral AI v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1)
|
||||
- [X] [Refact](https://huggingface.co/smallcloudai/Refact-1_6B-fim)
|
||||
|
||||
**Bindings:**
|
||||
|
||||
@@ -377,7 +378,7 @@ Building the program with BLAS support may lead to some performance improvements
|
||||
|
||||
- #### cuBLAS
|
||||
|
||||
This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
|
||||
This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
|
||||
- Using `make`:
|
||||
```bash
|
||||
make LLAMA_CUBLAS=1
|
||||
@@ -613,6 +614,18 @@ For more information, see [https://huggingface.co/docs/transformers/perplexity](
|
||||
The perplexity measurements in table above are done against the `wikitext2` test dataset (https://paperswithcode.com/dataset/wikitext-2), with context length of 512.
|
||||
The time per token is measured on a MacBook M1 Pro 32GB RAM using 4 and 8 threads.
|
||||
|
||||
#### How to run
|
||||
|
||||
1. Download/extract: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
|
||||
2. Run `./perplexity -m models/7B/ggml-model-q4_0.gguf -f wiki.test.raw`
|
||||
3. Output:
|
||||
```
|
||||
perplexity : calculating perplexity over 655 chunks
|
||||
24.43 seconds per pass - ETA 4.45 hours
|
||||
[1]4.5970,[2]5.1807,[3]6.0382,...
|
||||
```
|
||||
And after 4.45 hours, you will have the final perplexity.
|
||||
|
||||
### Interactive mode
|
||||
|
||||
If you want a more ChatGPT-like experience, you can run in interactive mode by passing `-i` as a parameter.
|
||||
@@ -775,18 +788,6 @@ If your issue is with model generation quality, then please at least scan the fo
|
||||
- [Aligning language models to follow instructions](https://openai.com/research/instruction-following)
|
||||
- [Training language models to follow instructions with human feedback](https://arxiv.org/abs/2203.02155)
|
||||
|
||||
#### How to run
|
||||
|
||||
1. Download/extract: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
|
||||
2. Run `./perplexity -m models/7B/ggml-model-q4_0.gguf -f wiki.test.raw`
|
||||
3. Output:
|
||||
```
|
||||
perplexity : calculating perplexity over 655 chunks
|
||||
24.43 seconds per pass - ETA 4.45 hours
|
||||
[1]4.5970,[2]5.1807,[3]6.0382,...
|
||||
```
|
||||
And after 4.45 hours, you will have the final perplexity.
|
||||
|
||||
### Android
|
||||
|
||||
#### Building the Project using Android NDK
|
||||
|
||||
@@ -111,12 +111,14 @@ pub fn build(b: *std.build.Builder) !void {
|
||||
const common = make.obj("common", "common/common.cpp");
|
||||
const console = make.obj("common", "common/console.cpp");
|
||||
const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp");
|
||||
const train = make.obj("train", "common/train.cpp");
|
||||
|
||||
_ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, llama, common, console, grammar_parser });
|
||||
_ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, llama, common });
|
||||
_ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, llama, common });
|
||||
_ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, llama, common });
|
||||
_ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, llama, common });
|
||||
_ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, llama, common, train });
|
||||
_ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, llama, common, train });
|
||||
|
||||
const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, llama, common, grammar_parser });
|
||||
if (server.target.isWindows()) {
|
||||
|
||||
@@ -170,7 +170,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
// store the external file name in params
|
||||
params.prompt_file = argv[i];
|
||||
std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
|
||||
if (params.prompt.back() == '\n') {
|
||||
if (!params.prompt.empty() && params.prompt.back() == '\n') {
|
||||
params.prompt.pop_back();
|
||||
}
|
||||
} else if (arg == "-n" || arg == "--n-predict") {
|
||||
@@ -295,7 +295,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
break;
|
||||
}
|
||||
std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.cfg_negative_prompt));
|
||||
if (params.cfg_negative_prompt.back() == '\n') {
|
||||
if (!params.cfg_negative_prompt.empty() && params.cfg_negative_prompt.back() == '\n') {
|
||||
params.cfg_negative_prompt.pop_back();
|
||||
}
|
||||
} else if (arg == "--cfg-scale") {
|
||||
|
||||
130
convert-persimmon-to-gguf.py
Normal file
130
convert-persimmon-to-gguf.py
Normal file
@@ -0,0 +1,130 @@
|
||||
import torch
|
||||
import os
|
||||
from pprint import pprint
|
||||
import sys
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
from sentencepiece import SentencePieceProcessor
|
||||
if 'NO_LOCAL_GGUF' not in os.environ:
|
||||
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
|
||||
import gguf
|
||||
|
||||
def _flatten_dict(dct, tensors, prefix=None):
|
||||
assert isinstance(dct, dict)
|
||||
for key in dct.keys():
|
||||
new_prefix = prefix + '.' + key if prefix is not None else key
|
||||
if isinstance(dct[key], torch.Tensor):
|
||||
tensors[new_prefix] = dct[key]
|
||||
elif isinstance(dct[key], dict):
|
||||
_flatten_dict(dct[key], tensors, new_prefix)
|
||||
else:
|
||||
raise ValueError(type(dct[key]))
|
||||
return None
|
||||
|
||||
def _get_sentencepiece_tokenizer_info(dir_model: Path):
|
||||
tokenizer_path = dir_model / 'adept_vocab.model'
|
||||
print('gguf: getting sentencepiece tokenizer from', tokenizer_path)
|
||||
tokenizer = SentencePieceProcessor(str(tokenizer_path))
|
||||
print('gguf: adding tokens')
|
||||
tokens: list[bytes] = []
|
||||
scores: list[float] = []
|
||||
toktypes: list[int] = []
|
||||
|
||||
for i in range(tokenizer.vocab_size()):
|
||||
text: bytes
|
||||
score: float
|
||||
|
||||
piece = tokenizer.id_to_piece(i)
|
||||
text = piece.encode("utf-8")
|
||||
score = tokenizer.get_score(i)
|
||||
|
||||
toktype = 1
|
||||
if tokenizer.is_unknown(i):
|
||||
toktype = 2
|
||||
if tokenizer.is_control(i):
|
||||
toktype = 3
|
||||
if tokenizer.is_unused(i):
|
||||
toktype = 5
|
||||
if tokenizer.is_byte(i):
|
||||
toktype = 6
|
||||
|
||||
tokens.append(text)
|
||||
scores.append(score)
|
||||
toktypes.append(toktype)
|
||||
pass
|
||||
return tokens, scores, toktypes
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Convert a Persimmon model from Adept (e.g. Persimmon 8b chat) to a GGML compatible file")
|
||||
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
|
||||
parser.add_argument("--ckpt-path", type=Path, help="path to persimmon checkpoint .pt file")
|
||||
parser.add_argument("--model-dir", type=Path, help="directory containing model e.g. 8b_chat_model_release")
|
||||
parser.add_argument("--adept-inference-dir", type=str, help="path to adept-inference code directory")
|
||||
args = parser.parse_args()
|
||||
sys.path.append(str(args.adept_inference_dir))
|
||||
persimmon_model = torch.load(args.ckpt_path)
|
||||
hparams = persimmon_model['args']
|
||||
pprint(hparams)
|
||||
tensors = {}
|
||||
_flatten_dict(persimmon_model['model'], tensors, None)
|
||||
|
||||
arch = gguf.MODEL_ARCH.PERSIMMON
|
||||
gguf_writer = gguf.GGUFWriter(args.outfile, gguf.MODEL_ARCH_NAMES[arch])
|
||||
|
||||
block_count = hparams.num_layers
|
||||
head_count = hparams.num_attention_heads
|
||||
head_count_kv = head_count
|
||||
ctx_length = hparams.seq_length
|
||||
hidden_size = hparams.hidden_size
|
||||
|
||||
gguf_writer.add_name('persimmon-8b-chat')
|
||||
gguf_writer.add_context_length(ctx_length)
|
||||
gguf_writer.add_embedding_length(hidden_size)
|
||||
gguf_writer.add_block_count(block_count)
|
||||
gguf_writer.add_feed_forward_length(hparams.ffn_hidden_size)
|
||||
gguf_writer.add_rope_dimension_count(hidden_size // head_count)
|
||||
gguf_writer.add_head_count(head_count)
|
||||
gguf_writer.add_head_count_kv(head_count_kv)
|
||||
gguf_writer.add_rope_freq_base(hparams.rotary_emb_base)
|
||||
gguf_writer.add_layer_norm_eps(hparams.layernorm_epsilon)
|
||||
|
||||
tokens, scores, toktypes = _get_sentencepiece_tokenizer_info(args.model_dir)
|
||||
gguf_writer.add_tokenizer_model('llama')
|
||||
gguf_writer.add_token_list(tokens)
|
||||
gguf_writer.add_token_scores(scores)
|
||||
gguf_writer.add_token_types(toktypes)
|
||||
gguf_writer.add_bos_token_id(71013)
|
||||
gguf_writer.add_eos_token_id(71013)
|
||||
|
||||
tensor_map = gguf.get_tensor_name_map(arch, block_count)
|
||||
print(tensor_map)
|
||||
for name in tensors.keys():
|
||||
data = tensors[name]
|
||||
if name.endswith(".self_attention.rotary_emb.inv_freq"):
|
||||
continue
|
||||
old_dtype = data.dtype
|
||||
# TODO: FP16 conversion produces garbage outputs. (Q8_0 does not, so..?)
|
||||
data = data.to(torch.float32).squeeze().numpy()
|
||||
new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
|
||||
if new_name is None:
|
||||
print("Can not map tensor '" + name + "'")
|
||||
sys.exit()
|
||||
n_dims = len(data.shape)
|
||||
print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
|
||||
gguf_writer.add_tensor(new_name, data)
|
||||
print("gguf: write header")
|
||||
gguf_writer.write_header_to_file()
|
||||
print("gguf: write metadata")
|
||||
gguf_writer.write_kv_data_to_file()
|
||||
print("gguf: write tensors")
|
||||
gguf_writer.write_tensors_to_file()
|
||||
|
||||
gguf_writer.close()
|
||||
|
||||
print(f"gguf: model successfully exported to '{args.outfile}'")
|
||||
print("")
|
||||
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
@@ -17,33 +17,6 @@ if "NO_LOCAL_GGUF" not in os.environ:
|
||||
sys.path.insert(1, str(Path(__file__).parent / "gguf-py" / "gguf"))
|
||||
import gguf
|
||||
|
||||
|
||||
def bytes_to_unicode():
|
||||
# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
|
||||
"""
|
||||
Returns list of utf-8 byte and a corresponding list of unicode strings.
|
||||
The reversible bpe codes work on unicode strings.
|
||||
This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
|
||||
When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
|
||||
This is a significant percentage of your normal, say, 32K bpe vocab.
|
||||
To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
|
||||
And avoids mapping to whitespace/control characters the bpe code barfs on.
|
||||
"""
|
||||
bs = (
|
||||
list(range(ord("!"), ord("~") + 1))
|
||||
+ list(range(ord("¡"), ord("¬") + 1))
|
||||
+ list(range(ord("®"), ord("ÿ") + 1))
|
||||
)
|
||||
cs = bs[:]
|
||||
n = 0
|
||||
for b in range(2**8):
|
||||
if b not in bs:
|
||||
bs.append(b)
|
||||
cs.append(2**8 + n)
|
||||
n += 1
|
||||
return dict(zip(bs, (chr(n) for n in cs)))
|
||||
|
||||
|
||||
def count_model_parts(dir_model: Path) -> int:
|
||||
num_parts = 0
|
||||
for filename in os.listdir(dir_model):
|
||||
@@ -153,53 +126,25 @@ tokens: list[bytearray] = []
|
||||
scores: list[float] = []
|
||||
toktypes: list[int] = []
|
||||
|
||||
tokenizer_json_file = dir_model / "tokenizer.json"
|
||||
if not tokenizer_json_file.is_file():
|
||||
print(f"Error: Missing {tokenizer_json_file}", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
# gpt2 tokenizer
|
||||
gguf_writer.add_tokenizer_model("gpt2")
|
||||
|
||||
with open(tokenizer_json_file, "r", encoding="utf-8") as f:
|
||||
tokenizer_json = json.load(f)
|
||||
|
||||
print("gguf: get gpt2 tokenizer vocab")
|
||||
|
||||
# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
|
||||
tokenizer = AutoTokenizer.from_pretrained(dir_model)
|
||||
|
||||
# The number of tokens in tokenizer.json can differ from the expected vocab size.
|
||||
# This causes downstream issues with mismatched tensor sizes when running the inference
|
||||
vocab_size = (
|
||||
hparams["vocab_size"]
|
||||
if "vocab_size" in hparams
|
||||
else len(tokenizer_json["model"]["vocab"])
|
||||
)
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
|
||||
vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
|
||||
assert max(tokenizer.vocab.values()) < vocab_size
|
||||
|
||||
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
|
||||
byte_encoder = bytes_to_unicode()
|
||||
byte_decoder = {v: k for k, v in byte_encoder.items()}
|
||||
|
||||
for i in range(vocab_size):
|
||||
if i in reverse_vocab:
|
||||
text = reverse_vocab[i]
|
||||
try:
|
||||
text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
|
||||
except KeyError:
|
||||
text = bytearray()
|
||||
for c in reverse_vocab[i]:
|
||||
if ord(c) < 256: # single byte character
|
||||
text.append(byte_decoder[ord(c)])
|
||||
else: # multibyte special token character
|
||||
text.extend(c.encode("utf-8"))
|
||||
else:
|
||||
print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
|
||||
pad_token = f"[PAD{i}]".encode("utf8")
|
||||
text = bytearray(pad_token)
|
||||
|
||||
tokens.append(text)
|
||||
scores.append(0.0) # dymmy
|
||||
toktypes.append(gguf.TokenType.NORMAL) # dummy
|
||||
tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
|
||||
scores.append(0.0) # dummy
|
||||
toktypes.append(gguf.TokenType.NORMAL)
|
||||
|
||||
gguf_writer.add_token_list(tokens)
|
||||
gguf_writer.add_token_scores(scores)
|
||||
|
||||
@@ -167,7 +167,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// the max batch size is as large as the context to handle cases where we get very long input prompt from multiple
|
||||
// users. regardless of the size, the main loop will chunk the batch into a maximum of params.n_batch tokens at a time
|
||||
llama_batch batch = llama_batch_init(params.n_ctx, 0);
|
||||
llama_batch batch = llama_batch_init(n_ctx, 0);
|
||||
|
||||
int32_t n_total_prompt = 0;
|
||||
int32_t n_total_gen = 0;
|
||||
|
||||
@@ -114,9 +114,9 @@ node index.js
|
||||
|
||||
`top_k`: Limit the next token selection to the K most probable tokens (default: 40).
|
||||
|
||||
`top_p`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.9).
|
||||
`top_p`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.95).
|
||||
|
||||
`n_predict`: Set the number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. (default: 128, -1 = infinity).
|
||||
`n_predict`: Set the number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. (default: -1, -1 = infinity).
|
||||
|
||||
`n_keep`: Specify the number of tokens from the initial prompt to retain when the model resets its internal context.
|
||||
By default, this value is set to 0 (meaning no tokens are kept). Use `-1` to retain all tokens from the initial prompt.
|
||||
@@ -156,6 +156,8 @@ node index.js
|
||||
|
||||
`logit_bias`: Modify the likelihood of a token appearing in the generated text completion. For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token 'Hello', or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood. Setting the value to false, `"logit_bias": [[15043,false]]` ensures that the token `Hello` is never produced (default: []).
|
||||
|
||||
`n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token (default: 0)
|
||||
|
||||
- **POST** `/tokenize`: Tokenize a given text.
|
||||
|
||||
*Options:*
|
||||
|
||||
339
ggml-metal.m
339
ggml-metal.m
@@ -81,18 +81,18 @@ struct ggml_metal_context {
|
||||
GGML_METAL_DECL_KERNEL(get_rows_q6_K);
|
||||
GGML_METAL_DECL_KERNEL(rms_norm);
|
||||
GGML_METAL_DECL_KERNEL(norm);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_f32_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_f16_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_1row);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_l4);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_q4_1_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_q8_0_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_q2_K_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_q3_K_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_q4_K_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_q5_K_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mat_q6_K_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mv_f32_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mv_f16_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mv_f16_f32_1row);
|
||||
GGML_METAL_DECL_KERNEL(mul_mv_f16_f32_l4);
|
||||
GGML_METAL_DECL_KERNEL(mul_mv_q4_0_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mv_q4_1_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mv_q8_0_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mv_q2_K_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mv_q3_K_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mv_q4_K_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mv_q5_K_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mv_q6_K_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mm_f32_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mm_f16_f32);
|
||||
GGML_METAL_DECL_KERNEL(mul_mm_q4_0_f32);
|
||||
@@ -109,6 +109,8 @@ struct ggml_metal_context {
|
||||
GGML_METAL_DECL_KERNEL(cpy_f32_f16);
|
||||
GGML_METAL_DECL_KERNEL(cpy_f32_f32);
|
||||
GGML_METAL_DECL_KERNEL(cpy_f16_f16);
|
||||
GGML_METAL_DECL_KERNEL(concat);
|
||||
GGML_METAL_DECL_KERNEL(sqr);
|
||||
|
||||
#undef GGML_METAL_DECL_KERNEL
|
||||
};
|
||||
@@ -183,56 +185,44 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
|
||||
ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);
|
||||
|
||||
#ifdef GGML_SWIFT
|
||||
// load the default.metallib file
|
||||
// load library
|
||||
{
|
||||
NSError * error = nil;
|
||||
|
||||
NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
|
||||
NSString * llamaBundlePath = [bundle pathForResource:@"llama_llama" ofType:@"bundle"];
|
||||
NSBundle * llamaBundle = [NSBundle bundleWithPath:llamaBundlePath];
|
||||
NSString * libPath = [llamaBundle pathForResource:@"default" ofType:@"metallib"];
|
||||
NSURL * libURL = [NSURL fileURLWithPath:libPath];
|
||||
|
||||
// Load the metallib file into a Metal library
|
||||
ctx->library = [ctx->device newLibraryWithURL:libURL error:&error];
|
||||
|
||||
if (error) {
|
||||
GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
NSBundle * bundle = nil;
|
||||
#ifdef SWIFT_PACKAGE
|
||||
bundle = SWIFTPM_MODULE_BUNDLE;
|
||||
#else
|
||||
UNUSED(msl_library_source);
|
||||
|
||||
// read the source from "ggml-metal.metal" into a string and use newLibraryWithSource
|
||||
{
|
||||
bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
|
||||
#endif
|
||||
NSError * error = nil;
|
||||
NSString * libPath = [bundle pathForResource:@"default" ofType:@"metallib"];
|
||||
if (libPath != nil) {
|
||||
NSURL * libURL = [NSURL fileURLWithPath:libPath];
|
||||
GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [libPath UTF8String]);
|
||||
ctx->library = [ctx->device newLibraryWithURL:libURL error:&error];
|
||||
} else {
|
||||
GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);
|
||||
|
||||
//NSString * path = [[NSBundle mainBundle] pathForResource:@"../../examples/metal/metal" ofType:@"metal"];
|
||||
NSBundle * bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
|
||||
NSString * path = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
|
||||
GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [path UTF8String]);
|
||||
|
||||
NSString * src = [NSString stringWithContentsOfFile:path encoding:NSUTF8StringEncoding error:&error];
|
||||
if (error) {
|
||||
GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
|
||||
return NULL;
|
||||
}
|
||||
NSString * sourcePath = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
|
||||
GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [sourcePath UTF8String]);
|
||||
NSString * src = [NSString stringWithContentsOfFile:sourcePath encoding:NSUTF8StringEncoding error:&error];
|
||||
if (error) {
|
||||
GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
MTLCompileOptions* options = nil;
|
||||
#ifdef GGML_QKK_64
|
||||
MTLCompileOptions* options = [MTLCompileOptions new];
|
||||
options.preprocessorMacros = @{ @"QK_K" : @(64) };
|
||||
ctx->library = [ctx->device newLibraryWithSource:src options:options error:&error];
|
||||
#else
|
||||
ctx->library = [ctx->device newLibraryWithSource:src options:nil error:&error];
|
||||
options = [MTLCompileOptions new];
|
||||
options.preprocessorMacros = @{ @"QK_K" : @(64) };
|
||||
#endif
|
||||
ctx->library = [ctx->device newLibraryWithSource:src options:options error:&error];
|
||||
}
|
||||
|
||||
if (error) {
|
||||
GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
// load kernels
|
||||
{
|
||||
@@ -272,40 +262,57 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
GGML_METAL_ADD_KERNEL(get_rows_q6_K);
|
||||
GGML_METAL_ADD_KERNEL(rms_norm);
|
||||
GGML_METAL_ADD_KERNEL(norm);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_f32_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_f16_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_1row);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_l4);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_q4_1_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_q8_0_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_q2_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_q3_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_q4_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_q5_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mat_q6_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_f32_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_f16_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_q4_0_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_q8_0_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_q4_1_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_q2_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_q3_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_q4_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_q5_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_q6_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mv_f32_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mv_f16_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mv_f16_f32_1row);
|
||||
GGML_METAL_ADD_KERNEL(mul_mv_f16_f32_l4);
|
||||
GGML_METAL_ADD_KERNEL(mul_mv_q4_0_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mv_q4_1_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mv_q8_0_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mv_q2_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mv_q3_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mv_q4_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mv_q5_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mv_q6_K_f32);
|
||||
if ([ctx->device supportsFamily:MTLGPUFamilyApple7]) {
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_f32_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_f16_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_q4_0_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_q8_0_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_q4_1_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_q2_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_q3_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_q4_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_q5_K_f32);
|
||||
GGML_METAL_ADD_KERNEL(mul_mm_q6_K_f32);
|
||||
}
|
||||
GGML_METAL_ADD_KERNEL(rope_f32);
|
||||
GGML_METAL_ADD_KERNEL(rope_f16);
|
||||
GGML_METAL_ADD_KERNEL(alibi_f32);
|
||||
GGML_METAL_ADD_KERNEL(cpy_f32_f16);
|
||||
GGML_METAL_ADD_KERNEL(cpy_f32_f32);
|
||||
GGML_METAL_ADD_KERNEL(cpy_f16_f16);
|
||||
GGML_METAL_ADD_KERNEL(concat);
|
||||
GGML_METAL_ADD_KERNEL(sqr);
|
||||
|
||||
#undef GGML_METAL_ADD_KERNEL
|
||||
}
|
||||
|
||||
GGML_METAL_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
|
||||
#if TARGET_OS_OSX
|
||||
// print MTL GPU family:
|
||||
GGML_METAL_LOG_INFO("%s: GPU name: %s\n", __func__, [[ctx->device name] UTF8String]);
|
||||
|
||||
// determine max supported GPU family
|
||||
// https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf
|
||||
// https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
|
||||
for (int i = MTLGPUFamilyApple1 + 20; i >= MTLGPUFamilyApple1; --i) {
|
||||
if ([ctx->device supportsFamily:i]) {
|
||||
GGML_METAL_LOG_INFO("%s: GPU family: MTLGPUFamilyApple%d (%d)\n", __func__, i - MTLGPUFamilyApple1 + 1, i);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
GGML_METAL_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
|
||||
GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
|
||||
if (ctx->device.maxTransferRate != 0) {
|
||||
GGML_METAL_LOG_INFO("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
|
||||
@@ -347,34 +354,38 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
|
||||
GGML_METAL_DEL_KERNEL(get_rows_q6_K);
|
||||
GGML_METAL_DEL_KERNEL(rms_norm);
|
||||
GGML_METAL_DEL_KERNEL(norm);
|
||||
GGML_METAL_DEL_KERNEL(mul_mat_f32_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mat_f16_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_1row);
|
||||
GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_l4);
|
||||
GGML_METAL_DEL_KERNEL(mul_mat_q4_0_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mat_q4_1_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mat_q8_0_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mat_q2_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mat_q3_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mat_q4_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mat_q5_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mat_q6_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_f32_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_f16_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_q4_0_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_q8_0_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_q4_1_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_q2_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_q3_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_q4_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_q5_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_q6_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mv_f32_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mv_f16_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mv_f16_f32_1row);
|
||||
GGML_METAL_DEL_KERNEL(mul_mv_f16_f32_l4);
|
||||
GGML_METAL_DEL_KERNEL(mul_mv_q4_0_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mv_q4_1_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mv_q8_0_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mv_q2_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mv_q3_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mv_q4_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mv_q5_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mv_q6_K_f32);
|
||||
if ([ctx->device supportsFamily:MTLGPUFamilyApple7]) {
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_f32_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_f16_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_q4_0_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_q8_0_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_q4_1_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_q2_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_q3_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_q4_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_q5_K_f32);
|
||||
GGML_METAL_DEL_KERNEL(mul_mm_q6_K_f32);
|
||||
}
|
||||
GGML_METAL_DEL_KERNEL(rope_f32);
|
||||
GGML_METAL_DEL_KERNEL(rope_f16);
|
||||
GGML_METAL_DEL_KERNEL(alibi_f32);
|
||||
GGML_METAL_DEL_KERNEL(cpy_f32_f16);
|
||||
GGML_METAL_DEL_KERNEL(cpy_f32_f32);
|
||||
GGML_METAL_DEL_KERNEL(cpy_f16_f16);
|
||||
GGML_METAL_DEL_KERNEL(concat);
|
||||
GGML_METAL_DEL_KERNEL(sqr);
|
||||
|
||||
#undef GGML_METAL_DEL_KERNEL
|
||||
|
||||
@@ -431,7 +442,7 @@ static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, stru
|
||||
for (int i = 0; i < ctx->n_buffers; ++i) {
|
||||
const int64_t ioffs = (int64_t) t->data - (int64_t) ctx->buffers[i].data;
|
||||
|
||||
//metal_printf("ioffs = %10ld, tsize = %10ld, sum = %10ld, ctx->buffers[%d].size = %10ld, name = %s\n", ioffs, tsize, ioffs + tsize, i, ctx->buffers[i].size, ctx->buffers[i].name);
|
||||
//GGML_METAL_LOG_INFO("ioffs = %10ld, tsize = %10ld, sum = %10ld, ctx->buffers[%d].size = %10ld, name = %s\n", ioffs, tsize, ioffs + tsize, i, ctx->buffers[i].size, ctx->buffers[i].name);
|
||||
if (ioffs >= 0 && ioffs + tsize <= (int64_t) ctx->buffers[i].size) {
|
||||
*offs = (size_t) ioffs;
|
||||
|
||||
@@ -766,6 +777,44 @@ void ggml_metal_graph_compute(
|
||||
{
|
||||
// noop
|
||||
} break;
|
||||
case GGML_OP_CONCAT:
|
||||
{
|
||||
const int64_t nb = ne00;
|
||||
|
||||
[encoder setComputePipelineState:ctx->pipeline_concat];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:2];
|
||||
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
|
||||
[encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
|
||||
[encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
|
||||
[encoder setBytes:&ne03 length:sizeof(ne03) atIndex:6];
|
||||
[encoder setBytes:&nb00 length:sizeof(nb00) atIndex:7];
|
||||
[encoder setBytes:&nb01 length:sizeof(nb01) atIndex:8];
|
||||
[encoder setBytes:&nb02 length:sizeof(nb02) atIndex:9];
|
||||
[encoder setBytes:&nb03 length:sizeof(nb03) atIndex:10];
|
||||
[encoder setBytes:&ne10 length:sizeof(ne10) atIndex:11];
|
||||
[encoder setBytes:&ne11 length:sizeof(ne11) atIndex:12];
|
||||
[encoder setBytes:&ne12 length:sizeof(ne12) atIndex:13];
|
||||
[encoder setBytes:&ne13 length:sizeof(ne13) atIndex:14];
|
||||
[encoder setBytes:&nb10 length:sizeof(nb10) atIndex:15];
|
||||
[encoder setBytes:&nb11 length:sizeof(nb11) atIndex:16];
|
||||
[encoder setBytes:&nb12 length:sizeof(nb12) atIndex:17];
|
||||
[encoder setBytes:&nb13 length:sizeof(nb13) atIndex:18];
|
||||
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:19];
|
||||
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:20];
|
||||
[encoder setBytes:&ne2 length:sizeof(ne2) atIndex:21];
|
||||
[encoder setBytes:&ne3 length:sizeof(ne3) atIndex:22];
|
||||
[encoder setBytes:&nb0 length:sizeof(nb0) atIndex:23];
|
||||
[encoder setBytes:&nb1 length:sizeof(nb1) atIndex:24];
|
||||
[encoder setBytes:&nb2 length:sizeof(nb2) atIndex:25];
|
||||
[encoder setBytes:&nb3 length:sizeof(nb3) atIndex:26];
|
||||
[encoder setBytes:&nb length:sizeof(nb) atIndex:27];
|
||||
|
||||
const int nth = MIN(1024, ne0);
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
||||
} break;
|
||||
case GGML_OP_ADD:
|
||||
{
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
@@ -861,9 +910,10 @@ void ggml_metal_graph_compute(
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
[encoder setBytes:&scale length:sizeof(scale) atIndex:2];
|
||||
|
||||
const int64_t n = ggml_nelements(dst)/4;
|
||||
const int64_t n = ggml_nelements(dst);
|
||||
GGML_ASSERT(n % 4 == 0);
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
} break;
|
||||
case GGML_OP_UNARY:
|
||||
switch (ggml_get_unary_op(gf->nodes[i])) {
|
||||
@@ -873,9 +923,10 @@ void ggml_metal_graph_compute(
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
|
||||
const int64_t n = ggml_nelements(dst)/4;
|
||||
const int64_t n = ggml_nelements(dst);
|
||||
GGML_ASSERT(n % 4 == 0);
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
} break;
|
||||
case GGML_UNARY_OP_RELU:
|
||||
{
|
||||
@@ -893,9 +944,10 @@ void ggml_metal_graph_compute(
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
|
||||
const int64_t n = ggml_nelements(dst)/4;
|
||||
const int64_t n = ggml_nelements(dst);
|
||||
GGML_ASSERT(n % 4 == 0);
|
||||
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
@@ -903,6 +955,17 @@ void ggml_metal_graph_compute(
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
} break;
|
||||
case GGML_OP_SQR:
|
||||
{
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
|
||||
[encoder setComputePipelineState:ctx->pipeline_sqr];
|
||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||
|
||||
const int64_t n = ggml_nelements(dst);
|
||||
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||
} break;
|
||||
case GGML_OP_SOFT_MAX:
|
||||
{
|
||||
const int nth = MIN(32, ne00);
|
||||
@@ -944,21 +1007,46 @@ void ggml_metal_graph_compute(
|
||||
} break;
|
||||
case GGML_OP_MUL_MAT:
|
||||
{
|
||||
// TODO: needs to be updated after PR: https://github.com/ggerganov/ggml/pull/224
|
||||
|
||||
GGML_ASSERT(ne00 == ne10);
|
||||
// GGML_ASSERT(ne02 == ne12); // Should be checked on individual data types until broadcast is implemented everywhere
|
||||
uint gqa = ne12/ne02;
|
||||
GGML_ASSERT(ne03 == ne13);
|
||||
|
||||
const uint gqa = ne12/ne02;
|
||||
|
||||
// find the break-even point where the matrix-matrix kernel becomes more efficient compared
|
||||
// to the matrix-vector kernel
|
||||
int ne11_mm_min = 1;
|
||||
|
||||
#if 0
|
||||
// the numbers below are measured on M2 Ultra for 7B and 13B models
|
||||
// these numbers do not translate to other devices or model sizes
|
||||
// TODO: need to find a better approach
|
||||
if ([ctx->device.name isEqualToString:@"Apple M2 Ultra"]) {
|
||||
switch (src0t) {
|
||||
case GGML_TYPE_F16: ne11_mm_min = 2; break;
|
||||
case GGML_TYPE_Q8_0: ne11_mm_min = 7; break;
|
||||
case GGML_TYPE_Q2_K: ne11_mm_min = 15; break;
|
||||
case GGML_TYPE_Q3_K: ne11_mm_min = 7; break;
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q4_1: ne11_mm_min = 15; break;
|
||||
case GGML_TYPE_Q4_K: ne11_mm_min = 11; break;
|
||||
case GGML_TYPE_Q5_0: // not tested yet
|
||||
case GGML_TYPE_Q5_1: ne11_mm_min = 13; break; // not tested yet
|
||||
case GGML_TYPE_Q5_K: ne11_mm_min = 7; break;
|
||||
case GGML_TYPE_Q6_K: ne11_mm_min = 7; break;
|
||||
default: ne11_mm_min = 1; break;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
// for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
|
||||
// AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
|
||||
if (!ggml_is_transposed(src0) &&
|
||||
if ([ctx->device supportsFamily:MTLGPUFamilyApple7] &&
|
||||
!ggml_is_transposed(src0) &&
|
||||
!ggml_is_transposed(src1) &&
|
||||
src1t == GGML_TYPE_F32 &&
|
||||
[ctx->device supportsFamily:MTLGPUFamilyApple7] &&
|
||||
ne00%32 == 0 &&
|
||||
ne11 > 2) {
|
||||
ne00 % 32 == 0 &&
|
||||
ne11 > ne11_mm_min) {
|
||||
//printf("matrix: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_mul_mm_f32_f32]; break;
|
||||
case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_mul_mm_f16_f32]; break;
|
||||
@@ -987,17 +1075,18 @@ void ggml_metal_graph_compute(
|
||||
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:12];
|
||||
[encoder setBytes:&gqa length:sizeof(gqa) atIndex:13];
|
||||
[encoder setThreadgroupMemoryLength:8192 atIndex:0];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake( (ne11+31)/32, (ne01+63) / 64, ne12) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
|
||||
[encoder dispatchThreadgroups:MTLSizeMake( (ne11 + 31)/32, (ne01 + 63)/64, ne12) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
|
||||
} else {
|
||||
int nth0 = 32;
|
||||
int nth1 = 1;
|
||||
int nrows = 1;
|
||||
//printf("vector: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
|
||||
|
||||
// use custom matrix x vector kernel
|
||||
switch (src0t) {
|
||||
case GGML_TYPE_F32:
|
||||
{
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mat_f32_f32];
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mv_f32_f32];
|
||||
nrows = 4;
|
||||
} break;
|
||||
case GGML_TYPE_F16:
|
||||
@@ -1005,12 +1094,12 @@ void ggml_metal_graph_compute(
|
||||
nth0 = 32;
|
||||
nth1 = 1;
|
||||
if (ne11 * ne12 < 4) {
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_1row];
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f32_1row];
|
||||
} else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) {
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_l4];
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f32_l4];
|
||||
nrows = ne11;
|
||||
} else {
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f32];
|
||||
nrows = 4;
|
||||
}
|
||||
} break;
|
||||
@@ -1021,7 +1110,7 @@ void ggml_metal_graph_compute(
|
||||
|
||||
nth0 = 8;
|
||||
nth1 = 8;
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_0_f32];
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mv_q4_0_f32];
|
||||
} break;
|
||||
case GGML_TYPE_Q4_1:
|
||||
{
|
||||
@@ -1030,7 +1119,7 @@ void ggml_metal_graph_compute(
|
||||
|
||||
nth0 = 8;
|
||||
nth1 = 8;
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_1_f32];
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mv_q4_1_f32];
|
||||
} break;
|
||||
case GGML_TYPE_Q8_0:
|
||||
{
|
||||
@@ -1039,7 +1128,7 @@ void ggml_metal_graph_compute(
|
||||
|
||||
nth0 = 8;
|
||||
nth1 = 8;
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q8_0_f32];
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mv_q8_0_f32];
|
||||
} break;
|
||||
case GGML_TYPE_Q2_K:
|
||||
{
|
||||
@@ -1048,7 +1137,7 @@ void ggml_metal_graph_compute(
|
||||
|
||||
nth0 = 2;
|
||||
nth1 = 32;
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q2_K_f32];
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mv_q2_K_f32];
|
||||
} break;
|
||||
case GGML_TYPE_Q3_K:
|
||||
{
|
||||
@@ -1057,7 +1146,7 @@ void ggml_metal_graph_compute(
|
||||
|
||||
nth0 = 2;
|
||||
nth1 = 32;
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q3_K_f32];
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mv_q3_K_f32];
|
||||
} break;
|
||||
case GGML_TYPE_Q4_K:
|
||||
{
|
||||
@@ -1066,7 +1155,7 @@ void ggml_metal_graph_compute(
|
||||
|
||||
nth0 = 4; //1;
|
||||
nth1 = 8; //32;
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_K_f32];
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mv_q4_K_f32];
|
||||
} break;
|
||||
case GGML_TYPE_Q5_K:
|
||||
{
|
||||
@@ -1075,7 +1164,7 @@ void ggml_metal_graph_compute(
|
||||
|
||||
nth0 = 2;
|
||||
nth1 = 32;
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q5_K_f32];
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mv_q5_K_f32];
|
||||
} break;
|
||||
case GGML_TYPE_Q6_K:
|
||||
{
|
||||
@@ -1084,7 +1173,7 @@ void ggml_metal_graph_compute(
|
||||
|
||||
nth0 = 2;
|
||||
nth1 = 32;
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q6_K_f32];
|
||||
[encoder setComputePipelineState:ctx->pipeline_mul_mv_q6_K_f32];
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
@@ -1113,7 +1202,7 @@ void ggml_metal_graph_compute(
|
||||
[encoder setBytes:&gqa length:sizeof(gqa) atIndex:17];
|
||||
|
||||
if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || src0t == GGML_TYPE_Q8_0 ||
|
||||
src0t == GGML_TYPE_Q2_K) {// || src0t == GGML_TYPE_Q4_K) {
|
||||
src0t == GGML_TYPE_Q2_K) { // || src0t == GGML_TYPE_Q4_K) {
|
||||
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
||||
}
|
||||
else if (src0t == GGML_TYPE_Q4_K) {
|
||||
@@ -1166,6 +1255,8 @@ void ggml_metal_graph_compute(
|
||||
} break;
|
||||
case GGML_OP_RMS_NORM:
|
||||
{
|
||||
GGML_ASSERT(ne00 % 4 == 0);
|
||||
|
||||
float eps;
|
||||
memcpy(&eps, dst->op_params, sizeof(float));
|
||||
|
||||
|
||||
173
ggml-metal.metal
173
ggml-metal.metal
@@ -13,8 +13,8 @@ typedef struct {
|
||||
|
||||
#define QK4_1 32
|
||||
typedef struct {
|
||||
half d; // delta
|
||||
half m; // min
|
||||
half d; // delta
|
||||
half m; // min
|
||||
uint8_t qs[QK4_1 / 2]; // nibbles / quants
|
||||
} block_q4_1;
|
||||
|
||||
@@ -132,6 +132,13 @@ kernel void kernel_relu(
|
||||
dst[tpig] = max(0.0f, src0[tpig]);
|
||||
}
|
||||
|
||||
kernel void kernel_sqr(
|
||||
device const float * src0,
|
||||
device float * dst,
|
||||
uint tpig[[thread_position_in_grid]]) {
|
||||
dst[tpig] = src0[tpig] * src0[tpig];
|
||||
}
|
||||
|
||||
constant float GELU_COEF_A = 0.044715f;
|
||||
constant float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
|
||||
|
||||
@@ -338,10 +345,11 @@ kernel void kernel_rms_norm(
|
||||
uint sgitg[[simdgroup_index_in_threadgroup]],
|
||||
uint tiisg[[thread_index_in_simdgroup]],
|
||||
uint ntg[[threads_per_threadgroup]]) {
|
||||
device const float4 * x = (device const float4 *) ((device const char *) src0 + tgpig*nb01);
|
||||
device const float * x_scalar = (device const float *) x;
|
||||
float4 sumf=0;
|
||||
float all_sum=0;
|
||||
device const float4 * x = (device const float4 *) ((device const char *) src0 + tgpig*nb01);
|
||||
device const float * x_scalar = (device const float *) x;
|
||||
|
||||
float4 sumf = 0;
|
||||
float all_sum = 0;
|
||||
|
||||
// parallel sum
|
||||
for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
|
||||
@@ -354,6 +362,7 @@ kernel void kernel_rms_norm(
|
||||
}
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
// broadcast, simd group number is ntg / 32
|
||||
for (uint i = ntg / 32 / 2; i > 0; i /= 2) {
|
||||
if (tpitg < i) {
|
||||
@@ -361,7 +370,9 @@ kernel void kernel_rms_norm(
|
||||
}
|
||||
}
|
||||
if (tpitg == 0) {
|
||||
for (int i = 4 * (ne00 / 4); i < ne00; i++) {sum[0] += x_scalar[i];}
|
||||
for (int i = 4 * (ne00 / 4); i < ne00; i++) {
|
||||
sum[0] += x_scalar[i];
|
||||
}
|
||||
sum[0] /= ne00;
|
||||
}
|
||||
|
||||
@@ -376,7 +387,9 @@ kernel void kernel_rms_norm(
|
||||
y[i00] = x[i00] * scale;
|
||||
}
|
||||
if (tpitg == 0) {
|
||||
for (int i00 = 4 * (ne00 / 4); i00 < ne00; i00++) {y_scalar[i00] = x_scalar[i00] * scale;}
|
||||
for (int i00 = 4 * (ne00 / 4); i00 < ne00; i00++) {
|
||||
y_scalar[i00] = x_scalar[i00] * scale;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -416,8 +429,8 @@ inline float block_q_n_dot_y(device const block_q4_1 * qb_curr, float sumy, thre
|
||||
}
|
||||
|
||||
// putting them in the kernel cause a significant performance penalty
|
||||
#define N_DST 4 // each SIMD group works on 4 rows
|
||||
#define N_SIMDGROUP 2 // number of SIMD groups in a thread group
|
||||
#define N_DST 4 // each SIMD group works on 4 rows
|
||||
#define N_SIMDGROUP 2 // number of SIMD groups in a thread group
|
||||
#define N_SIMDWIDTH 32 // assuming SIMD group size is 32
|
||||
//Note: This is a template, but strictly speaking it only applies to
|
||||
// quantizations where the block size is 32. It also does not
|
||||
@@ -428,18 +441,23 @@ void mul_vec_q_n_f32(device const void * src0, device const float * src1, device
|
||||
int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne10, int64_t ne12, int64_t ne0, int64_t ne1, uint gqa,
|
||||
uint3 tgpig, uint tiisg, uint sgitg) {
|
||||
const int nb = ne00/QK4_0;
|
||||
|
||||
const int r0 = tgpig.x;
|
||||
const int r1 = tgpig.y;
|
||||
const int im = tgpig.z;
|
||||
|
||||
const int first_row = (r0 * nsg + sgitg) * nr;
|
||||
|
||||
const uint offset0 = first_row * nb + im/gqa*(nb*ne0);
|
||||
|
||||
device const block_q_type * x = (device const block_q_type *) src0 + offset0;
|
||||
device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1;
|
||||
float yl[16]; // src1 vector cache
|
||||
float sumf[nr]={0.f};
|
||||
|
||||
const int ix = tiisg/2;
|
||||
const int il = 8*(tiisg%2);
|
||||
float yl[16]; // src1 vector cache
|
||||
float sumf[nr] = {0.f};
|
||||
|
||||
const int ix = (tiisg/2);
|
||||
const int il = (tiisg%2)*8;
|
||||
|
||||
device const float * yb = y + ix * QK4_0 + il;
|
||||
|
||||
@@ -450,6 +468,7 @@ void mul_vec_q_n_f32(device const void * src0, device const float * src1, device
|
||||
sumy += yb[i] + yb[i+1];
|
||||
yl[i+0] = yb[i+ 0];
|
||||
yl[i+1] = yb[i+ 1]/256.f;
|
||||
|
||||
sumy += yb[i+16] + yb[i+17];
|
||||
yl[i+8] = yb[i+16]/16.f;
|
||||
yl[i+9] = yb[i+17]/4096.f;
|
||||
@@ -465,12 +484,12 @@ void mul_vec_q_n_f32(device const void * src0, device const float * src1, device
|
||||
for (int row = 0; row < nr; ++row) {
|
||||
const float tot = simd_sum(sumf[row]);
|
||||
if (tiisg == 0 && first_row + row < ne01) {
|
||||
dst[r1*ne0 + im*ne0*ne1 + first_row + row] = tot;
|
||||
dst[im*ne0*ne1 + r1*ne0 + first_row + row] = tot;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_mul_mat_q4_0_f32(
|
||||
kernel void kernel_mul_mv_q4_0_f32(
|
||||
device const void * src0,
|
||||
device const float * src1,
|
||||
device float * dst,
|
||||
@@ -483,12 +502,12 @@ kernel void kernel_mul_mat_q4_0_f32(
|
||||
constant int64_t & ne1[[buffer(16)]],
|
||||
constant uint & gqa[[buffer(17)]],
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint tiisg[[thread_index_in_simdgroup]],
|
||||
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
||||
uint tiisg[[thread_index_in_simdgroup]],
|
||||
uint sgitg[[simdgroup_index_in_threadgroup]]) {
|
||||
mul_vec_q_n_f32<block_q4_0, N_DST, N_SIMDGROUP, N_SIMDWIDTH>(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,gqa,tgpig,tiisg,sgitg);
|
||||
}
|
||||
|
||||
kernel void kernel_mul_mat_q4_1_f32(
|
||||
kernel void kernel_mul_mv_q4_1_f32(
|
||||
device const void * src0,
|
||||
device const float * src1,
|
||||
device float * dst,
|
||||
@@ -508,7 +527,7 @@ kernel void kernel_mul_mat_q4_1_f32(
|
||||
|
||||
#define NB_Q8_0 8
|
||||
|
||||
kernel void kernel_mul_mat_q8_0_f32(
|
||||
kernel void kernel_mul_mv_q8_0_f32(
|
||||
device const void * src0,
|
||||
device const float * src1,
|
||||
device float * dst,
|
||||
@@ -572,7 +591,7 @@ kernel void kernel_mul_mat_q8_0_f32(
|
||||
|
||||
#define N_F32_F32 4
|
||||
|
||||
kernel void kernel_mul_mat_f32_f32(
|
||||
kernel void kernel_mul_mv_f32_f32(
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
device float * dst,
|
||||
@@ -643,7 +662,7 @@ kernel void kernel_mul_mat_f32_f32(
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_mul_mat_f16_f32_1row(
|
||||
kernel void kernel_mul_mv_f16_f32_1row(
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
device float * dst,
|
||||
@@ -662,7 +681,7 @@ kernel void kernel_mul_mat_f16_f32_1row(
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint tiisg[[thread_index_in_simdgroup]]) {
|
||||
uint tiisg[[thread_index_in_simdgroup]]) {
|
||||
|
||||
const int64_t r0 = tgpig.x;
|
||||
const int64_t r1 = tgpig.y;
|
||||
@@ -697,7 +716,7 @@ kernel void kernel_mul_mat_f16_f32_1row(
|
||||
|
||||
#define N_F16_F32 4
|
||||
|
||||
kernel void kernel_mul_mat_f16_f32(
|
||||
kernel void kernel_mul_mv_f16_f32(
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
device float * dst,
|
||||
@@ -769,7 +788,7 @@ kernel void kernel_mul_mat_f16_f32(
|
||||
}
|
||||
|
||||
// Assumes row size (ne00) is a multiple of 4
|
||||
kernel void kernel_mul_mat_f16_f32_l4(
|
||||
kernel void kernel_mul_mv_f16_f32_l4(
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
device float * dst,
|
||||
@@ -1098,6 +1117,62 @@ kernel void kernel_cpy_f32_f32(
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_concat(
|
||||
device const char * src0,
|
||||
device const char * src1,
|
||||
device char * dst,
|
||||
constant int64_t & ne00,
|
||||
constant int64_t & ne01,
|
||||
constant int64_t & ne02,
|
||||
constant int64_t & ne03,
|
||||
constant uint64_t & nb00,
|
||||
constant uint64_t & nb01,
|
||||
constant uint64_t & nb02,
|
||||
constant uint64_t & nb03,
|
||||
constant int64_t & ne10,
|
||||
constant int64_t & ne11,
|
||||
constant int64_t & ne12,
|
||||
constant int64_t & ne13,
|
||||
constant uint64_t & nb10,
|
||||
constant uint64_t & nb11,
|
||||
constant uint64_t & nb12,
|
||||
constant uint64_t & nb13,
|
||||
constant int64_t & ne0,
|
||||
constant int64_t & ne1,
|
||||
constant int64_t & ne2,
|
||||
constant int64_t & ne3,
|
||||
constant uint64_t & nb0,
|
||||
constant uint64_t & nb1,
|
||||
constant uint64_t & nb2,
|
||||
constant uint64_t & nb3,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
uint3 ntg[[threads_per_threadgroup]]) {
|
||||
|
||||
const int64_t i03 = tgpig.z;
|
||||
const int64_t i02 = tgpig.y;
|
||||
const int64_t i01 = tgpig.x;
|
||||
|
||||
const int64_t i13 = i03 % ne13;
|
||||
const int64_t i12 = i02 % ne12;
|
||||
const int64_t i11 = i01 % ne11;
|
||||
|
||||
device const char * src0_ptr = src0 + i03 * nb03 + i02 * nb02 + i01 * nb01 + tpitg.x*nb00;
|
||||
device const char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11 + tpitg.x*nb10;
|
||||
device char * dst_ptr = dst + i03*nb3 + i02*nb2 + i01*nb1 + tpitg.x*nb0;
|
||||
|
||||
for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
|
||||
if (i02 < ne02) {
|
||||
((device float *)dst_ptr)[0] = ((device float *)src0_ptr)[0];
|
||||
src0_ptr += ntg.x*nb00;
|
||||
} else {
|
||||
((device float *)dst_ptr)[0] = ((device float *)src1_ptr)[0];
|
||||
src1_ptr += ntg.x*nb10;
|
||||
}
|
||||
dst_ptr += ntg.x*nb0;
|
||||
}
|
||||
}
|
||||
|
||||
//============================================ k-quants ======================================================
|
||||
|
||||
#ifndef QK_K
|
||||
@@ -1190,7 +1265,7 @@ static inline uchar4 get_scale_min_k4(int j, device const uint8_t * q) {
|
||||
|
||||
//====================================== dot products =========================
|
||||
|
||||
kernel void kernel_mul_mat_q2_K_f32(
|
||||
kernel void kernel_mul_mv_q2_K_f32(
|
||||
device const void * src0,
|
||||
device const float * src1,
|
||||
device float * dst,
|
||||
@@ -1334,7 +1409,7 @@ kernel void kernel_mul_mat_q2_K_f32(
|
||||
}
|
||||
|
||||
#if QK_K == 256
|
||||
kernel void kernel_mul_mat_q3_K_f32(
|
||||
kernel void kernel_mul_mv_q3_K_f32(
|
||||
device const void * src0,
|
||||
device const float * src1,
|
||||
device float * dst,
|
||||
@@ -1486,7 +1561,7 @@ kernel void kernel_mul_mat_q3_K_f32(
|
||||
}
|
||||
}
|
||||
#else
|
||||
kernel void kernel_mul_mat_q3_K_f32(
|
||||
kernel void kernel_mul_mv_q3_K_f32(
|
||||
device const void * src0,
|
||||
device const float * src1,
|
||||
device float * dst,
|
||||
@@ -1557,7 +1632,7 @@ kernel void kernel_mul_mat_q3_K_f32(
|
||||
#endif
|
||||
|
||||
#if QK_K == 256
|
||||
kernel void kernel_mul_mat_q4_K_f32(
|
||||
kernel void kernel_mul_mv_q4_K_f32(
|
||||
device const void * src0,
|
||||
device const float * src1,
|
||||
device float * dst,
|
||||
@@ -1663,7 +1738,7 @@ kernel void kernel_mul_mat_q4_K_f32(
|
||||
}
|
||||
}
|
||||
#else
|
||||
kernel void kernel_mul_mat_q4_K_f32(
|
||||
kernel void kernel_mul_mv_q4_K_f32(
|
||||
device const void * src0,
|
||||
device const float * src1,
|
||||
device float * dst,
|
||||
@@ -1752,7 +1827,7 @@ kernel void kernel_mul_mat_q4_K_f32(
|
||||
}
|
||||
#endif
|
||||
|
||||
kernel void kernel_mul_mat_q5_K_f32(
|
||||
kernel void kernel_mul_mv_q5_K_f32(
|
||||
device const void * src0,
|
||||
device const float * src1,
|
||||
device float * dst,
|
||||
@@ -1925,7 +2000,7 @@ kernel void kernel_mul_mat_q5_K_f32(
|
||||
|
||||
}
|
||||
|
||||
kernel void kernel_mul_mat_q6_K_f32(
|
||||
kernel void kernel_mul_mv_q6_K_f32(
|
||||
device const void * src0,
|
||||
device const float * src1,
|
||||
device float * dst,
|
||||
@@ -2263,7 +2338,7 @@ kernel void kernel_get_rows(
|
||||
}
|
||||
|
||||
#define BLOCK_SIZE_M 64 // 8 simdgroup matrices from matrix A
|
||||
#define BLOCK_SIZE_N 32 // 4 simdgroup matrices from matrix A
|
||||
#define BLOCK_SIZE_N 32 // 4 simdgroup matrices from matrix B
|
||||
#define BLOCK_SIZE_K 32
|
||||
#define THREAD_MAT_M 4 // each thread take 4 simdgroup matrices from matrix A
|
||||
#define THREAD_MAT_N 2 // each thread take 2 simdgroup matrices from matrix B
|
||||
@@ -2300,9 +2375,11 @@ kernel void kernel_mul_mm(device const uchar * src0,
|
||||
const uint r0 = tgpig.y;
|
||||
const uint r1 = tgpig.x;
|
||||
const uint im = tgpig.z;
|
||||
|
||||
// if this block is of 64x32 shape or smaller
|
||||
short n_rows = (ne0 - r0 * BLOCK_SIZE_M < BLOCK_SIZE_M) ? (ne0 - r0 * BLOCK_SIZE_M) : BLOCK_SIZE_M;
|
||||
short n_cols = (ne1 - r1 * BLOCK_SIZE_N < BLOCK_SIZE_N) ? (ne1 - r1 * BLOCK_SIZE_N) : BLOCK_SIZE_N;
|
||||
|
||||
// a thread shouldn't load data outside of the matrix
|
||||
short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1;
|
||||
short thread_col = ((short)tiitg/THREAD_PER_COL) < n_cols ? ((short)tiitg/THREAD_PER_COL) : n_cols - 1;
|
||||
@@ -2326,26 +2403,30 @@ kernel void kernel_mul_mm(device const uchar * src0,
|
||||
+ nb10 * (BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL)));
|
||||
|
||||
for (int loop_k = 0; loop_k < ne00; loop_k += BLOCK_SIZE_K) {
|
||||
//load data and store to threadgroup memory
|
||||
// load data and store to threadgroup memory
|
||||
half4x4 temp_a;
|
||||
dequantize_func(x, il, temp_a);
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
#pragma unroll(16)
|
||||
for (int i = 0; i < 16; i++) {
|
||||
*(sa + SG_MAT_SIZE * ((tiitg / THREAD_PER_ROW / 8) \
|
||||
+ 16 * (tiitg % THREAD_PER_ROW) + 8 * (i / 8)) \
|
||||
+ (tiitg / THREAD_PER_ROW) % 8 + (i & 7) * 8) = temp_a[i/4][i%4];
|
||||
+ (tiitg % THREAD_PER_ROW) * 16 + (i / 8) * 8) \
|
||||
+ (tiitg / THREAD_PER_ROW) % 8 + (i & 7) * 8) = temp_a[i/4][i%4];
|
||||
}
|
||||
*(threadgroup float2x4 *)(sb + (tiitg % THREAD_PER_COL) * 8 * 32 + 8 * (tiitg / THREAD_PER_COL)) \
|
||||
= *((device float2x4 *)y);
|
||||
|
||||
*(threadgroup float2x4 *)(sb + (tiitg % THREAD_PER_COL) * 8 * 32 + 8 * (tiitg / THREAD_PER_COL)) = *((device float2x4 *)y);
|
||||
|
||||
il = (il + 2 < nl) ? il + 2 : il % 2;
|
||||
x = (il < 2) ? x + (2+nl-1)/nl : x;
|
||||
y += BLOCK_SIZE_K;
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
//load matrices from threadgroup memory and conduct outer products
|
||||
|
||||
// load matrices from threadgroup memory and conduct outer products
|
||||
threadgroup half * lsma = (sa + THREAD_MAT_M * SG_MAT_SIZE * (sgitg % 2));
|
||||
threadgroup float * lsmb = (sb + THREAD_MAT_N * SG_MAT_SIZE * (sgitg / 2));
|
||||
|
||||
#pragma unroll(4)
|
||||
for (int ik = 0; ik < BLOCK_SIZE_K / 8; ik++) {
|
||||
#pragma unroll(4)
|
||||
@@ -2360,6 +2441,7 @@ kernel void kernel_mul_mm(device const uchar * src0,
|
||||
|
||||
lsma += BLOCK_SIZE_M / SG_MAT_ROW * SG_MAT_SIZE;
|
||||
lsmb += BLOCK_SIZE_N / SG_MAT_ROW * SG_MAT_SIZE;
|
||||
|
||||
#pragma unroll(8)
|
||||
for (int i = 0; i < 8; i++){
|
||||
simdgroup_multiply_accumulate(c_res[i], mb[i/4], ma[i%4], c_res[i]);
|
||||
@@ -2368,25 +2450,26 @@ kernel void kernel_mul_mm(device const uchar * src0,
|
||||
}
|
||||
|
||||
if ((r0 + 1) * BLOCK_SIZE_M <= ne0 && (r1 + 1) * BLOCK_SIZE_N <= ne1) {
|
||||
device float *C = dst + BLOCK_SIZE_M * r0 + 32 * (sgitg&1) \
|
||||
+ (BLOCK_SIZE_N * r1 + 16 * (sgitg>>1)) * ne0 + im*ne1*ne0;
|
||||
device float * C = dst + (BLOCK_SIZE_M * r0 + 32 * (sgitg & 1)) \
|
||||
+ (BLOCK_SIZE_N * r1 + 16 * (sgitg >> 1)) * ne0 + im*ne1*ne0;
|
||||
for (int i = 0; i < 8; i++) {
|
||||
simdgroup_store(c_res[i], C + 8 * (i%4) + 8 * ne0 * (i/4), ne0);
|
||||
}
|
||||
} else {
|
||||
// block is smaller than 64x32, we should avoid writing data outside of the matrix
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
threadgroup float *temp_str = ((threadgroup float *)shared_memory) \
|
||||
threadgroup float * temp_str = ((threadgroup float *)shared_memory) \
|
||||
+ 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M;
|
||||
for (int i = 0; i < 8; i++) {
|
||||
simdgroup_store(c_res[i], temp_str + 8 * (i%4) + 8 * BLOCK_SIZE_M * (i/4), BLOCK_SIZE_M);
|
||||
}
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
device float *C = dst + BLOCK_SIZE_M * r0 + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0;
|
||||
if (sgitg==0) {
|
||||
|
||||
device float * C = dst + (BLOCK_SIZE_M * r0) + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0;
|
||||
if (sgitg == 0) {
|
||||
for (int i = 0; i < n_rows; i++) {
|
||||
for (int j = tiitg; j< n_cols; j += BLOCK_SIZE_N) {
|
||||
for (int j = tiitg; j < n_cols; j += BLOCK_SIZE_N) {
|
||||
*(C + i + j * ne0) = *(temp_str + i + j * BLOCK_SIZE_M);
|
||||
}
|
||||
}
|
||||
|
||||
27
ggml.c
27
ggml.c
@@ -11256,7 +11256,7 @@ static void ggml_compute_forward_silu_f32(
|
||||
|
||||
#ifndef NDEBUG
|
||||
for (int k = 0; k < nc; k++) {
|
||||
const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];
|
||||
const float x = ((float *) ((char *) dst->data + i1*(dst->nb[1])))[k];
|
||||
UNUSED(x);
|
||||
assert(!isnan(x));
|
||||
assert(!isinf(x));
|
||||
@@ -13089,17 +13089,17 @@ static void ggml_compute_forward_alibi_f32(
|
||||
|
||||
assert(n_past >= 0);
|
||||
|
||||
const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
|
||||
const int ne1 = src0->ne[1]; // seq_len_without_past
|
||||
const int ne2 = src0->ne[2]; // n_head -> this is k
|
||||
//const int ne3 = src0->ne[3]; // 1 -> bsz
|
||||
const int64_t ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
|
||||
const int64_t ne1 = src0->ne[1]; // seq_len_without_past
|
||||
const int64_t ne2 = src0->ne[2]; // n_head -> this is k
|
||||
//const int64_t ne3 = src0->ne[3]; // 1 -> bsz
|
||||
|
||||
const int n = ggml_nrows(src0);
|
||||
const int ne2_ne3 = n/ne1; // ne2*ne3
|
||||
const int64_t n = ggml_nrows(src0);
|
||||
const int64_t ne2_ne3 = n/ne1; // ne2*ne3
|
||||
|
||||
const int nb0 = src0->nb[0];
|
||||
const int nb1 = src0->nb[1];
|
||||
const int nb2 = src0->nb[2];
|
||||
const size_t nb0 = src0->nb[0];
|
||||
const size_t nb1 = src0->nb[1];
|
||||
const size_t nb2 = src0->nb[2];
|
||||
//const int nb3 = src0->nb[3];
|
||||
|
||||
GGML_ASSERT(nb0 == sizeof(float));
|
||||
@@ -13111,9 +13111,9 @@ static void ggml_compute_forward_alibi_f32(
|
||||
const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
|
||||
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
|
||||
|
||||
for (int i = 0; i < ne0; i++) {
|
||||
for (int j = 0; j < ne1; j++) {
|
||||
for (int k = 0; k < ne2_ne3; k++) {
|
||||
for (int64_t i = 0; i < ne0; i++) {
|
||||
for (int64_t j = 0; j < ne1; j++) {
|
||||
for (int64_t k = 0; k < ne2_ne3; k++) {
|
||||
float * const src = (float *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
|
||||
float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2);
|
||||
|
||||
@@ -13128,7 +13128,6 @@ static void ggml_compute_forward_alibi_f32(
|
||||
}
|
||||
|
||||
pdst[0] = i * m_k + src[0];
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -69,4 +69,3 @@ python -m twine upload dist/*
|
||||
## TODO
|
||||
- [ ] Add tests
|
||||
- [ ] Include conversion scripts as command line entry points in this package.
|
||||
- Add CI workflow for releasing the package.
|
||||
|
||||
@@ -85,6 +85,7 @@ class MODEL_ARCH(IntEnum):
|
||||
GPTNEOX : int = auto()
|
||||
MPT : int = auto()
|
||||
STARCODER : int = auto()
|
||||
PERSIMMON : int = auto()
|
||||
REFACT : int = auto()
|
||||
BERT : int = auto()
|
||||
|
||||
@@ -108,6 +109,8 @@ class MODEL_TENSOR(IntEnum):
|
||||
FFN_DOWN : int = auto()
|
||||
FFN_UP : int = auto()
|
||||
FFN_NORM : int = auto()
|
||||
ATTN_Q_NORM : int = auto()
|
||||
ATTN_K_NORM : int = auto()
|
||||
|
||||
|
||||
MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||
@@ -119,6 +122,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||
MODEL_ARCH.GPTNEOX: "gptneox",
|
||||
MODEL_ARCH.MPT: "mpt",
|
||||
MODEL_ARCH.STARCODER: "starcoder",
|
||||
MODEL_ARCH.PERSIMMON: "persimmon",
|
||||
MODEL_ARCH.REFACT: "refact",
|
||||
MODEL_ARCH.BERT: "bert",
|
||||
}
|
||||
@@ -130,7 +134,6 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
||||
MODEL_TENSOR.OUTPUT_NORM: "output_norm",
|
||||
MODEL_TENSOR.OUTPUT: "output",
|
||||
MODEL_TENSOR.ROPE_FREQS: "rope_freqs",
|
||||
|
||||
MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
|
||||
MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2",
|
||||
MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv",
|
||||
@@ -139,6 +142,8 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
||||
MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v",
|
||||
MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
|
||||
MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
|
||||
MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm",
|
||||
MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm",
|
||||
MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
|
||||
MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
|
||||
MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
|
||||
@@ -249,6 +254,20 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
],
|
||||
MODEL_ARCH.PERSIMMON: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.ATTN_QKV,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.FFN_NORM,
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
MODEL_TENSOR.ATTN_Q_NORM,
|
||||
MODEL_TENSOR.ATTN_K_NORM,
|
||||
MODEL_TENSOR.ATTN_ROT_EMBD,
|
||||
],
|
||||
MODEL_ARCH.REFACT: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
@@ -279,6 +298,9 @@ MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_TENSOR.ROPE_FREQS,
|
||||
MODEL_TENSOR.ATTN_ROT_EMBD,
|
||||
],
|
||||
MODEL_ARCH.PERSIMMON: [
|
||||
MODEL_TENSOR.ROPE_FREQS,
|
||||
]
|
||||
}
|
||||
|
||||
|
||||
@@ -286,12 +308,13 @@ class TensorNameMap:
|
||||
mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
|
||||
# Token embeddings
|
||||
MODEL_TENSOR.TOKEN_EMBD: (
|
||||
"gpt_neox.embed_in", # gptneox
|
||||
"transformer.wte", # gpt2 gpt-j mpt refact
|
||||
"transformer.word_embeddings", # falcon
|
||||
"model.embed_tokens", # llama-hf
|
||||
"tok_embeddings", # llama-pth
|
||||
"embeddings.word_embeddings", # bert
|
||||
"gpt_neox.embed_in", # gptneox
|
||||
"transformer.wte", # gpt2 gpt-j mpt refact
|
||||
"transformer.word_embeddings", # falcon
|
||||
"model.embed_tokens", # llama-hf
|
||||
"tok_embeddings", # llama-pth
|
||||
"embeddings.word_embeddings", # bert
|
||||
"language_model.embedding.word_embeddings", # persimmon
|
||||
),
|
||||
|
||||
# Token type embeddings
|
||||
@@ -307,20 +330,22 @@ class TensorNameMap:
|
||||
|
||||
# Output
|
||||
MODEL_TENSOR.OUTPUT: (
|
||||
"embed_out", # gptneox
|
||||
"lm_head", # gpt2 gpt-j mpt falcon llama-hf baichuan
|
||||
"output", # llama-pth
|
||||
"embed_out", # gptneox
|
||||
"lm_head", # gpt2 mpt falcon llama-hf baichuan
|
||||
"output", # llama-pth
|
||||
"word_embeddings_for_head", # persimmon
|
||||
),
|
||||
|
||||
# Output norm
|
||||
MODEL_TENSOR.OUTPUT_NORM: (
|
||||
"gpt_neox.final_layer_norm", # gptneox
|
||||
"transformer.ln_f", # gpt2 gpt-j falcon
|
||||
"model.norm", # llama-hf baichuan
|
||||
"norm", # llama-pth
|
||||
"embeddings.LayerNorm", # bert
|
||||
"transformer.norm_f", # mpt
|
||||
"ln_f", # refact
|
||||
"gpt_neox.final_layer_norm", # gptneox
|
||||
"transformer.ln_f", # gpt2 gpt-j falcon
|
||||
"model.norm", # llama-hf baichuan
|
||||
"norm", # llama-pth
|
||||
"embeddings.LayerNorm", # bert
|
||||
"transformer.norm_f", # mpt
|
||||
"ln_f", # refact
|
||||
"language_model.encoder.final_layernorm", # persimmon
|
||||
),
|
||||
|
||||
# Rope frequencies
|
||||
@@ -332,14 +357,15 @@ class TensorNameMap:
|
||||
block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
|
||||
# Attention norm
|
||||
MODEL_TENSOR.ATTN_NORM: (
|
||||
"gpt_neox.layers.{bid}.input_layernorm", # gptneox
|
||||
"transformer.h.{bid}.ln_1", # gpt2 gpt-j refact
|
||||
"transformer.blocks.{bid}.norm_1", # mpt
|
||||
"transformer.h.{bid}.input_layernorm", # falcon7b
|
||||
"transformer.h.{bid}.ln_mlp", # falcon40b
|
||||
"model.layers.{bid}.input_layernorm", # llama-hf
|
||||
"layers.{bid}.attention_norm", # llama-pth
|
||||
"encoder.layer.{bid}.attention.output.LayerNorm", # bert
|
||||
"gpt_neox.layers.{bid}.input_layernorm", # gptneox
|
||||
"transformer.h.{bid}.ln_1", # gpt2 gpt-j refact
|
||||
"transformer.blocks.{bid}.norm_1", # mpt
|
||||
"transformer.h.{bid}.input_layernorm", # falcon7b
|
||||
"transformer.h.{bid}.ln_mlp", # falcon40b
|
||||
"model.layers.{bid}.input_layernorm", # llama-hf
|
||||
"layers.{bid}.attention_norm", # llama-pth
|
||||
"encoder.layer.{bid}.attention.output.LayerNorm", # bert
|
||||
"language_model.encoder.layers.{bid}.input_layernorm", # persimmon
|
||||
),
|
||||
|
||||
# Attention norm 2
|
||||
@@ -349,10 +375,11 @@ class TensorNameMap:
|
||||
|
||||
# Attention query-key-value
|
||||
MODEL_TENSOR.ATTN_QKV: (
|
||||
"gpt_neox.layers.{bid}.attention.query_key_value", # gptneox
|
||||
"transformer.h.{bid}.attn.c_attn", # gpt2
|
||||
"transformer.blocks.{bid}.attn.Wqkv", # mpt
|
||||
"transformer.h.{bid}.self_attention.query_key_value", # falcon
|
||||
"gpt_neox.layers.{bid}.attention.query_key_value", # gptneox
|
||||
"transformer.h.{bid}.attn.c_attn", # gpt2
|
||||
"transformer.blocks.{bid}.attn.Wqkv", # mpt
|
||||
"transformer.h.{bid}.self_attention.query_key_value", # falcon
|
||||
"language_model.encoder.layers.{bid}.self_attention.query_key_value", # persimmon
|
||||
),
|
||||
|
||||
# Attention query
|
||||
@@ -381,14 +408,15 @@ class TensorNameMap:
|
||||
|
||||
# Attention output
|
||||
MODEL_TENSOR.ATTN_OUT: (
|
||||
"gpt_neox.layers.{bid}.attention.dense", # gptneox
|
||||
"transformer.h.{bid}.attn.c_proj", # gpt2 refact
|
||||
"transformer.blocks.{bid}.attn.out_proj", # mpt
|
||||
"transformer.h.{bid}.self_attention.dense", # falcon
|
||||
"model.layers.{bid}.self_attn.o_proj", # llama-hf
|
||||
"layers.{bid}.attention.wo", # llama-pth
|
||||
"encoder.layer.{bid}.attention.output.dense", # bert
|
||||
"transformer.h.{bid}.attn.out_proj", # gpt-j
|
||||
"gpt_neox.layers.{bid}.attention.dense", # gptneox
|
||||
"transformer.h.{bid}.attn.c_proj", # gpt2 refact
|
||||
"transformer.blocks.{bid}.attn.out_proj", # mpt
|
||||
"transformer.h.{bid}.self_attention.dense", # falcon
|
||||
"model.layers.{bid}.self_attn.o_proj", # llama-hf
|
||||
"layers.{bid}.attention.wo", # llama-pth
|
||||
"encoder.layer.{bid}.attention.output.dense", # bert
|
||||
"transformer.h.{bid}.attn.out_proj", # gpt-j
|
||||
"language_model.encoder.layers.{bid}.self_attention.dense" # persimmon
|
||||
),
|
||||
|
||||
# Rotary embeddings
|
||||
@@ -399,24 +427,26 @@ class TensorNameMap:
|
||||
|
||||
# Feed-forward norm
|
||||
MODEL_TENSOR.FFN_NORM: (
|
||||
"gpt_neox.layers.{bid}.post_attention_layernorm", # gptneox
|
||||
"transformer.h.{bid}.ln_2", # gpt2 refact
|
||||
"transformer.blocks.{bid}.norm_2", # mpt
|
||||
"model.layers.{bid}.post_attention_layernorm", # llama-hf
|
||||
"layers.{bid}.ffn_norm", # llama-pth
|
||||
"encoder.layer.{bid}.output.LayerNorm", # bert
|
||||
"gpt_neox.layers.{bid}.post_attention_layernorm", # gptneox
|
||||
"transformer.h.{bid}.ln_2", # gpt2 refact
|
||||
"transformer.blocks.{bid}.norm_2", # mpt
|
||||
"model.layers.{bid}.post_attention_layernorm", # llama-hf
|
||||
"layers.{bid}.ffn_norm", # llama-pth
|
||||
"encoder.layer.{bid}.output.LayerNorm", # bert
|
||||
"language_model.encoder.layers.{bid}.post_attention_layernorm", # persimmon
|
||||
),
|
||||
|
||||
# Feed-forward up
|
||||
MODEL_TENSOR.FFN_UP: (
|
||||
"gpt_neox.layers.{bid}.mlp.dense_h_to_4h", # gptneox
|
||||
"transformer.h.{bid}.mlp.c_fc", # gpt2
|
||||
"transformer.blocks.{bid}.ffn.up_proj", # mpt
|
||||
"transformer.h.{bid}.mlp.dense_h_to_4h", # falcon
|
||||
"model.layers.{bid}.mlp.up_proj", # llama-hf refact
|
||||
"layers.{bid}.feed_forward.w3", # llama-pth
|
||||
"encoder.layer.{bid}.intermediate.dense", # bert
|
||||
"transformer.h.{bid}.mlp.fc_in", # gpt-j
|
||||
"gpt_neox.layers.{bid}.mlp.dense_h_to_4h", # gptneox
|
||||
"transformer.h.{bid}.mlp.c_fc", # gpt2
|
||||
"transformer.blocks.{bid}.ffn.up_proj", # mpt
|
||||
"transformer.h.{bid}.mlp.dense_h_to_4h", # falcon
|
||||
"model.layers.{bid}.mlp.up_proj", # llama-hf refact
|
||||
"layers.{bid}.feed_forward.w3", # llama-pth
|
||||
"encoder.layer.{bid}.intermediate.dense", # bert
|
||||
"transformer.h.{bid}.mlp.fc_in", # gpt-j
|
||||
"language_model.encoder.layers.{bid}.mlp.dense_h_to_4h", # persimmon
|
||||
),
|
||||
|
||||
# Feed-forward gate
|
||||
@@ -427,15 +457,28 @@ class TensorNameMap:
|
||||
|
||||
# Feed-forward down
|
||||
MODEL_TENSOR.FFN_DOWN: (
|
||||
"gpt_neox.layers.{bid}.mlp.dense_4h_to_h", # gptneox
|
||||
"transformer.h.{bid}.mlp.c_proj", # gpt2 refact
|
||||
"transformer.blocks.{bid}.ffn.down_proj", # mpt
|
||||
"transformer.h.{bid}.mlp.dense_4h_to_h", # falcon
|
||||
"model.layers.{bid}.mlp.down_proj", # llama-hf
|
||||
"layers.{bid}.feed_forward.w2", # llama-pth
|
||||
"encoder.layer.{bid}.output.dense", # bert
|
||||
"transformer.h.{bid}.mlp.fc_out", # gpt-j
|
||||
"gpt_neox.layers.{bid}.mlp.dense_4h_to_h", # gptneox
|
||||
"transformer.h.{bid}.mlp.c_proj", # gpt2 refact
|
||||
"transformer.blocks.{bid}.ffn.down_proj", # mpt
|
||||
"transformer.h.{bid}.mlp.dense_4h_to_h", # falcon
|
||||
"model.layers.{bid}.mlp.down_proj", # llama-hf
|
||||
"layers.{bid}.feed_forward.w2", # llama-pth
|
||||
"encoder.layer.{bid}.output.dense", # bert
|
||||
"transformer.h.{bid}.mlp.fc_out", # gpt-j
|
||||
"language_model.encoder.layers.{bid}.mlp.dense_4h_to_h", # persimmon
|
||||
),
|
||||
|
||||
MODEL_TENSOR.ATTN_Q_NORM: (
|
||||
"language_model.encoder.layers.{bid}.self_attention.q_layernorm",
|
||||
),
|
||||
|
||||
MODEL_TENSOR.ATTN_K_NORM: (
|
||||
"language_model.encoder.layers.{bid}.self_attention.k_layernorm",
|
||||
),
|
||||
|
||||
MODEL_TENSOR.ROPE_FREQS: (
|
||||
"language_model.encoder.layers.{bid}.self_attention.rotary_emb.inv_freq", # persimmon
|
||||
)
|
||||
}
|
||||
|
||||
mapping: dict[str, tuple[MODEL_TENSOR, str]]
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[tool.poetry]
|
||||
name = "gguf"
|
||||
version = "0.4.0"
|
||||
version = "0.4.4"
|
||||
description = "Write ML models in GGUF for GGML"
|
||||
authors = ["GGML <ggml@ggml.ai>"]
|
||||
packages = [
|
||||
|
||||
535
llama.cpp
535
llama.cpp
@@ -186,6 +186,7 @@ enum llm_arch {
|
||||
LLM_ARCH_GPTNEOX,
|
||||
LLM_ARCH_MPT,
|
||||
LLM_ARCH_STARCODER,
|
||||
LLM_ARCH_PERSIMMON,
|
||||
LLM_ARCH_REFACT,
|
||||
LLM_ARCH_UNKNOWN,
|
||||
};
|
||||
@@ -199,6 +200,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
|
||||
{ LLM_ARCH_MPT, "mpt" },
|
||||
{ LLM_ARCH_BAICHUAN, "baichuan" },
|
||||
{ LLM_ARCH_STARCODER, "starcoder" },
|
||||
{ LLM_ARCH_PERSIMMON, "persimmon" },
|
||||
{ LLM_ARCH_REFACT, "refact" },
|
||||
};
|
||||
|
||||
@@ -318,6 +320,8 @@ enum llm_tensor {
|
||||
LLM_TENSOR_FFN_DOWN,
|
||||
LLM_TENSOR_FFN_UP,
|
||||
LLM_TENSOR_FFN_NORM,
|
||||
LLM_TENSOR_ATTN_Q_NORM,
|
||||
LLM_TENSOR_ATTN_K_NORM,
|
||||
};
|
||||
|
||||
static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
|
||||
@@ -399,6 +403,23 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_PERSIMMON,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd"},
|
||||
{ LLM_TENSOR_OUTPUT_NORM, "output_norm"},
|
||||
{ LLM_TENSOR_OUTPUT, "output"},
|
||||
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm"},
|
||||
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv"},
|
||||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output"},
|
||||
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm"},
|
||||
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm"},
|
||||
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm"},
|
||||
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down"},
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up"},
|
||||
{ LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd"},
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_MPT,
|
||||
{
|
||||
@@ -959,6 +980,7 @@ enum e_model {
|
||||
MODEL_1B,
|
||||
MODEL_3B,
|
||||
MODEL_7B,
|
||||
MODEL_8B,
|
||||
MODEL_13B,
|
||||
MODEL_15B,
|
||||
MODEL_30B,
|
||||
@@ -1041,6 +1063,10 @@ struct llama_layer {
|
||||
struct ggml_tensor * attn_norm_b;
|
||||
struct ggml_tensor * attn_norm_2;
|
||||
struct ggml_tensor * attn_norm_2_b;
|
||||
struct ggml_tensor * attn_q_norm;
|
||||
struct ggml_tensor * attn_q_norm_b;
|
||||
struct ggml_tensor * attn_k_norm;
|
||||
struct ggml_tensor * attn_k_norm_b;
|
||||
|
||||
// attention
|
||||
struct ggml_tensor * wq;
|
||||
@@ -1299,7 +1325,11 @@ static bool llama_kv_cache_init(
|
||||
cache.cells.clear();
|
||||
cache.cells.resize(n_ctx);
|
||||
|
||||
// TODO: this should be:
|
||||
// cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*ggml_tensor_overhead());
|
||||
// change it and test that it works
|
||||
cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
|
||||
memset(cache.buf.data, 0, cache.buf.size);
|
||||
|
||||
struct ggml_init_params params;
|
||||
params.mem_size = cache.buf.size;
|
||||
@@ -1901,6 +1931,7 @@ static const char * llama_model_type_name(e_model type) {
|
||||
case MODEL_1B: return "1B";
|
||||
case MODEL_3B: return "3B";
|
||||
case MODEL_7B: return "7B";
|
||||
case MODEL_8B: return "8B";
|
||||
case MODEL_13B: return "13B";
|
||||
case MODEL_15B: return "15B";
|
||||
case MODEL_30B: return "30B";
|
||||
@@ -2013,6 +2044,14 @@ static void llm_load_hparams(
|
||||
default: model.type = e_model::MODEL_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_PERSIMMON:
|
||||
{
|
||||
GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
|
||||
switch (hparams.n_layer) {
|
||||
case 36: model.type = e_model::MODEL_8B; break;
|
||||
default: model.type = e_model::MODEL_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_REFACT:
|
||||
{
|
||||
GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
|
||||
@@ -2549,6 +2588,67 @@ static void llm_load_tensors(
|
||||
}
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_PERSIMMON:
|
||||
{
|
||||
model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
|
||||
|
||||
{
|
||||
ggml_backend backend_norm;
|
||||
ggml_backend backend_output;
|
||||
|
||||
if (n_gpu_layers > int(n_layer)) {
|
||||
// norm is not performance relevant on its own but keeping it in VRAM reduces data copying
|
||||
// on Windows however this is detrimental unless everything is on the GPU
|
||||
#ifndef _WIN32
|
||||
backend_norm = LLAMA_BACKEND_OFFLOAD;
|
||||
#else
|
||||
backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
|
||||
#endif // _WIN32
|
||||
|
||||
backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
|
||||
} else {
|
||||
backend_norm = GGML_BACKEND_CPU;
|
||||
backend_output = GGML_BACKEND_CPU;
|
||||
}
|
||||
|
||||
model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
|
||||
model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
|
||||
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
|
||||
|
||||
if (backend_norm == GGML_BACKEND_GPU) {
|
||||
vram_weights += ggml_nbytes(model.output_norm);
|
||||
vram_weights += ggml_nbytes(model.output_norm_b);
|
||||
}
|
||||
if (backend_output == GGML_BACKEND_GPU_SPLIT) {
|
||||
vram_weights += ggml_nbytes(model.output);
|
||||
}
|
||||
}
|
||||
|
||||
const uint32_t n_ff = hparams.n_ff;
|
||||
const int i_gpu_start = n_layer - n_gpu_layers;
|
||||
model.layers.resize(n_layer);
|
||||
for (uint32_t i = 0; i < n_layer; ++i) {
|
||||
const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
|
||||
const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT;
|
||||
auto & layer = model.layers[i];
|
||||
layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
|
||||
layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
|
||||
layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
|
||||
layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split);
|
||||
layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
|
||||
layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split);
|
||||
layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
|
||||
layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split);
|
||||
layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
|
||||
layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split);
|
||||
layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
|
||||
layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
|
||||
layer.attn_q_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {64}, backend);
|
||||
layer.attn_q_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {64}, backend);
|
||||
layer.attn_k_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {64}, backend);
|
||||
layer.attn_k_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64}, backend);
|
||||
}
|
||||
} break;
|
||||
default:
|
||||
throw std::runtime_error("unknown architecture");
|
||||
}
|
||||
@@ -2658,8 +2758,8 @@ static bool llama_model_load(
|
||||
}
|
||||
|
||||
static struct ggml_cgraph * llm_build_llama(
|
||||
llama_context & lctx,
|
||||
const llama_batch & batch) {
|
||||
llama_context & lctx,
|
||||
const llama_batch & batch) {
|
||||
const auto & model = lctx.model;
|
||||
const auto & hparams = model.hparams;
|
||||
const auto & cparams = lctx.cparams;
|
||||
@@ -2697,11 +2797,9 @@ static struct ggml_cgraph * llm_build_llama(
|
||||
struct ggml_init_params params = {
|
||||
/*.mem_size =*/ buf_compute.size,
|
||||
/*.mem_buffer =*/ buf_compute.data,
|
||||
/*.no_alloc =*/ false,
|
||||
/*.no_alloc =*/ true,
|
||||
};
|
||||
|
||||
params.no_alloc = true;
|
||||
|
||||
struct ggml_context * ctx0 = ggml_init(params);
|
||||
|
||||
ggml_cgraph * gf = ggml_new_graph(ctx0);
|
||||
@@ -3085,11 +3183,9 @@ static struct ggml_cgraph * llm_build_baichaun(
|
||||
struct ggml_init_params params = {
|
||||
/*.mem_size =*/ buf_compute.size,
|
||||
/*.mem_buffer =*/ buf_compute.data,
|
||||
/*.no_alloc =*/ false,
|
||||
/*.no_alloc =*/ true,
|
||||
};
|
||||
|
||||
params.no_alloc = true;
|
||||
|
||||
struct ggml_context * ctx0 = ggml_init(params);
|
||||
|
||||
ggml_cgraph * gf = ggml_new_graph(ctx0);
|
||||
@@ -3486,11 +3582,9 @@ static struct ggml_cgraph * llm_build_refact(
|
||||
struct ggml_init_params params = {
|
||||
/*.mem_size =*/ buf_compute.size,
|
||||
/*.mem_buffer =*/ buf_compute.data,
|
||||
/*.no_alloc =*/ false,
|
||||
/*.no_alloc =*/ true,
|
||||
};
|
||||
|
||||
params.no_alloc = true;
|
||||
|
||||
struct ggml_context * ctx0 = ggml_init(params);
|
||||
|
||||
ggml_cgraph * gf = ggml_new_graph(ctx0);
|
||||
@@ -3840,11 +3934,9 @@ static struct ggml_cgraph * llm_build_falcon(
|
||||
struct ggml_init_params params = {
|
||||
/*.mem_size =*/ buf_compute.size,
|
||||
/*.mem_buffer =*/ buf_compute.data,
|
||||
/*.no_alloc =*/ false,
|
||||
/*.no_alloc =*/ true,
|
||||
};
|
||||
|
||||
params.no_alloc = true;
|
||||
|
||||
struct ggml_context * ctx0 = ggml_init(params);
|
||||
|
||||
ggml_cgraph * gf = ggml_new_graph(ctx0);
|
||||
@@ -4200,11 +4292,9 @@ static struct ggml_cgraph * llm_build_starcoder(
|
||||
struct ggml_init_params params = {
|
||||
/*.mem_size =*/ buf_compute.size,
|
||||
/*.mem_buffer =*/ buf_compute.data,
|
||||
/*.no_alloc =*/ false,
|
||||
/*.no_alloc =*/ true,
|
||||
};
|
||||
|
||||
params.no_alloc = true;
|
||||
|
||||
struct ggml_context * ctx0 = ggml_init(params);
|
||||
|
||||
ggml_cgraph * gf = ggml_new_graph(ctx0);
|
||||
@@ -4415,6 +4505,404 @@ static struct ggml_cgraph * llm_build_starcoder(
|
||||
return gf;
|
||||
}
|
||||
|
||||
|
||||
static struct ggml_cgraph * llm_build_persimmon(
|
||||
llama_context & lctx,
|
||||
const llama_batch & batch) {
|
||||
const auto & model = lctx.model;
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
const auto & kv_self = lctx.kv_self;
|
||||
|
||||
GGML_ASSERT(!!kv_self.ctx);
|
||||
|
||||
const auto & cparams = lctx.cparams;
|
||||
const int64_t n_embd = hparams.n_embd;
|
||||
const int64_t n_layer = hparams.n_layer;
|
||||
const int64_t n_ctx = cparams.n_ctx;
|
||||
const int64_t n_head_kv = hparams.n_head_kv;
|
||||
const int64_t n_head = hparams.n_head;
|
||||
const int64_t n_embd_head = hparams.n_embd_head();
|
||||
const int64_t n_embd_gqa = hparams.n_embd_gqa();
|
||||
const size_t n_rot = n_embd_head / 2;
|
||||
|
||||
const float freq_base = cparams.rope_freq_base;
|
||||
const float freq_scale = cparams.rope_freq_scale;
|
||||
const float norm_eps = hparams.f_norm_eps;
|
||||
|
||||
const int n_gpu_layers = model.n_gpu_layers;
|
||||
|
||||
|
||||
const int32_t n_tokens = batch.n_tokens;
|
||||
const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
|
||||
const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
|
||||
|
||||
const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift;
|
||||
|
||||
auto & buf_compute = lctx.buf_compute;
|
||||
struct ggml_init_params params = {
|
||||
/*.mem_size =*/ buf_compute.size,
|
||||
/*.mem_buffer =*/ buf_compute.data,
|
||||
/*.no_alloc =*/ true,
|
||||
};
|
||||
|
||||
struct ggml_context * ctx0 = ggml_init(params);
|
||||
|
||||
ggml_cgraph * gf = ggml_new_graph(ctx0);
|
||||
|
||||
struct ggml_tensor * cur;
|
||||
struct ggml_tensor * inpL;
|
||||
|
||||
if (batch.token) {
|
||||
struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
||||
|
||||
ggml_allocr_alloc(lctx.alloc, inp_tokens);
|
||||
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
||||
memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens));
|
||||
}
|
||||
ggml_set_name(inp_tokens, "inp_tokens");
|
||||
inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
|
||||
} else {
|
||||
inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
|
||||
ggml_allocr_alloc(lctx.alloc, inpL);
|
||||
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
||||
memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL));
|
||||
}
|
||||
}
|
||||
const int i_gpu_start = n_layer - n_gpu_layers;
|
||||
(void) i_gpu_start;
|
||||
offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
|
||||
offload_func_t offload_func_kq = llama_nop;
|
||||
offload_func_t offload_func_v = llama_nop;
|
||||
// KQ_scale
|
||||
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
|
||||
ggml_allocr_alloc(lctx.alloc, KQ_scale);
|
||||
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
||||
ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd_head)));
|
||||
}
|
||||
ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
|
||||
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
||||
offload_func_kq(KQ_mask);
|
||||
ggml_set_name(KQ_mask, "KQ_mask");
|
||||
ggml_allocr_alloc(lctx.alloc, KQ_mask);
|
||||
|
||||
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
||||
float * data = (float *) KQ_mask->data;
|
||||
memset(data, 0, ggml_nbytes(KQ_mask));
|
||||
for (int h = 0; h < 1; ++h) {
|
||||
for (int j = 0; j < n_tokens; ++j) {
|
||||
const llama_pos pos = batch.pos[j];
|
||||
const llama_seq_id seq_id = batch.seq_id[j];
|
||||
for (int i = 0; i < n_kv; ++i) {
|
||||
if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
|
||||
data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
||||
offload_func_kq(KQ_pos);
|
||||
ggml_set_name(KQ_pos, "KQ_pos");
|
||||
ggml_allocr_alloc(lctx.alloc, KQ_pos);
|
||||
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
||||
int * data = (int *) KQ_pos->data;
|
||||
for (int i = 0; i < n_tokens; ++i) {
|
||||
data[i] = batch.pos[i];
|
||||
}
|
||||
}
|
||||
if (do_rope_shift) {
|
||||
struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
|
||||
offload_func_kq(K_shift);
|
||||
ggml_set_name(K_shift, "K_shift");
|
||||
ggml_allocr_alloc(lctx.alloc, K_shift);
|
||||
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
||||
int * data = (int *) K_shift->data;
|
||||
for (int i = 0; i < n_ctx; ++i) {
|
||||
data[i] = kv_self.cells[i].delta;
|
||||
}
|
||||
}
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
struct ggml_tensor * tmp =
|
||||
// we rotate only the first n_rot dimensions.
|
||||
ggml_rope_custom_inplace(ctx0,
|
||||
ggml_view_3d(ctx0, kv_self.k,
|
||||
n_rot, n_head, n_ctx,
|
||||
ggml_element_size(kv_self.k)*n_embd_gqa,
|
||||
ggml_element_size(kv_self.k)*n_embd_head,
|
||||
ggml_element_size(kv_self.k)*(n_embd_head*n_ctx*il)
|
||||
),
|
||||
K_shift, n_rot, 2, 0, freq_base, freq_scale);
|
||||
offload_func_kq(tmp);
|
||||
ggml_build_forward_expand(gf, tmp);
|
||||
}
|
||||
}
|
||||
for (int il=0; il < n_layer; ++il) {
|
||||
struct ggml_tensor * residual = inpL;
|
||||
offload_func_t offload_func = llama_nop;
|
||||
{
|
||||
cur = ggml_norm(ctx0, inpL, norm_eps);
|
||||
offload_func(cur);
|
||||
cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm);
|
||||
offload_func(cur);
|
||||
cur = ggml_add(ctx0, cur, model.layers[il].attn_norm_b);
|
||||
offload_func(cur);
|
||||
ggml_format_name(cur, "input_layernorm_%d", il);
|
||||
}
|
||||
// self attention
|
||||
{
|
||||
cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
|
||||
offload_func_kq(cur);
|
||||
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
||||
offload_func_kq(cur);
|
||||
|
||||
// split qkv
|
||||
GGML_ASSERT(n_head_kv == n_head);
|
||||
ggml_set_name(cur, format("qkv_%d", il).c_str());
|
||||
struct ggml_tensor * tmpqkv = ggml_reshape_4d(ctx0, cur, n_embd_head, 3, n_head, n_tokens);
|
||||
offload_func_kq(tmpqkv);
|
||||
struct ggml_tensor * tmpqkv_perm = ggml_cont(ctx0, ggml_permute(ctx0, tmpqkv, 0, 3, 1, 2));
|
||||
offload_func_kq(tmpqkv_perm);
|
||||
ggml_format_name(tmpqkv_perm, "tmpqkv_perm_%d", il);
|
||||
struct ggml_tensor * tmpq = ggml_view_3d(
|
||||
ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
|
||||
ggml_element_size(tmpqkv_perm) * n_embd_head,
|
||||
ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
|
||||
0
|
||||
);
|
||||
offload_func_kq(tmpq);
|
||||
struct ggml_tensor * tmpk = ggml_view_3d(
|
||||
ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
|
||||
ggml_element_size(tmpqkv_perm) * n_embd_head,
|
||||
ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
|
||||
ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens
|
||||
);
|
||||
offload_func_kq(tmpk);
|
||||
// Q/K Layernorm
|
||||
tmpq = ggml_norm(ctx0, tmpq, norm_eps);
|
||||
offload_func_kq(tmpq);
|
||||
tmpq = ggml_mul(ctx0, tmpq, model.layers[il].attn_q_norm);
|
||||
offload_func_kq(tmpq);
|
||||
tmpq = ggml_add(ctx0, tmpq, model.layers[il].attn_q_norm_b);
|
||||
offload_func_kq(tmpq);
|
||||
|
||||
tmpk = ggml_norm(ctx0, tmpk, norm_eps);
|
||||
offload_func_v(tmpk);
|
||||
tmpk = ggml_mul(ctx0, tmpk, model.layers[il].attn_k_norm);
|
||||
offload_func_v(tmpk);
|
||||
tmpk = ggml_add(ctx0, tmpk, model.layers[il].attn_k_norm_b);
|
||||
offload_func_v(tmpk);
|
||||
|
||||
// RoPE the first n_rot of q/k, pass the other half, and concat.
|
||||
struct ggml_tensor * qrot = ggml_view_3d(
|
||||
ctx0, tmpq, n_rot, n_head, n_tokens,
|
||||
ggml_element_size(tmpq) * n_embd_head,
|
||||
ggml_element_size(tmpq) * n_embd_head * n_head,
|
||||
0
|
||||
);
|
||||
offload_func_kq(qrot);
|
||||
ggml_format_name(qrot, "qrot_%d", il);
|
||||
struct ggml_tensor * krot = ggml_view_3d(
|
||||
ctx0, tmpk, n_rot, n_head, n_tokens,
|
||||
ggml_element_size(tmpk) * n_embd_head,
|
||||
ggml_element_size(tmpk) * n_embd_head * n_head,
|
||||
0
|
||||
);
|
||||
offload_func_kq(krot);
|
||||
ggml_format_name(krot, "krot_%d", il);
|
||||
|
||||
// get the second half of tmpq, e.g tmpq[n_rot:, :, :]
|
||||
struct ggml_tensor * qpass = ggml_view_3d(
|
||||
ctx0, tmpq, n_rot, n_head, n_tokens,
|
||||
ggml_element_size(tmpq) * n_embd_head,
|
||||
ggml_element_size(tmpq) * n_embd_head * n_head,
|
||||
ggml_element_size(tmpq) * n_rot
|
||||
);
|
||||
offload_func_kq(qpass);
|
||||
ggml_format_name(qpass, "qpass_%d", il);
|
||||
struct ggml_tensor * kpass = ggml_view_3d(
|
||||
ctx0, tmpk, n_rot, n_head, n_tokens,
|
||||
ggml_element_size(tmpk) * n_embd_head,
|
||||
ggml_element_size(tmpk) * n_embd_head * n_head,
|
||||
ggml_element_size(tmpk) * n_rot
|
||||
);
|
||||
offload_func_kq(kpass);
|
||||
ggml_format_name(kpass, "kpass_%d", il);
|
||||
|
||||
struct ggml_tensor * qrotated = ggml_rope_custom(
|
||||
ctx0, qrot, KQ_pos, n_rot, 2, 0, freq_base, freq_scale
|
||||
);
|
||||
offload_func_kq(qrotated);
|
||||
struct ggml_tensor * krotated = ggml_rope_custom(
|
||||
ctx0, krot, KQ_pos, n_rot, 2, 0, freq_base, freq_scale
|
||||
);
|
||||
offload_func_kq(krotated);
|
||||
// ggml currently only supports concatenation on dim=2
|
||||
// so we need to permute qrot, qpass, concat, then permute back.
|
||||
qrotated = ggml_cont(ctx0, ggml_permute(ctx0, qrotated, 2, 1, 0, 3));
|
||||
offload_func_kq(qrotated);
|
||||
krotated = ggml_cont(ctx0, ggml_permute(ctx0, krotated, 2, 1, 0, 3));
|
||||
offload_func_kq(krotated);
|
||||
|
||||
qpass = ggml_cont(ctx0, ggml_permute(ctx0, qpass, 2, 1, 0, 3));
|
||||
offload_func_kq(qpass);
|
||||
kpass = ggml_cont(ctx0, ggml_permute(ctx0, kpass, 2, 1, 0, 3));
|
||||
offload_func_kq(kpass);
|
||||
|
||||
struct ggml_tensor * Qcur = ggml_concat(ctx0, qrotated, qpass);
|
||||
offload_func_kq(Qcur);
|
||||
struct ggml_tensor * Kcur = ggml_concat(ctx0, krotated, kpass);
|
||||
offload_func_kq(Kcur);
|
||||
|
||||
struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 1, 2, 0, 3));
|
||||
offload_func_kq(Q);
|
||||
|
||||
Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3));
|
||||
offload_func_kq(Kcur);
|
||||
{
|
||||
struct ggml_tensor * tmpv = ggml_view_3d(
|
||||
ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
|
||||
ggml_element_size(tmpqkv_perm) * n_embd_head,
|
||||
ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
|
||||
ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens * 2
|
||||
);
|
||||
offload_func_v(tmpv);
|
||||
// store K, V in cache
|
||||
struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens));
|
||||
offload_func_v(Vcur);
|
||||
ggml_set_name(Vcur, "Vcur");
|
||||
|
||||
struct ggml_tensor * k = ggml_view_1d(
|
||||
ctx0, kv_self.k, n_tokens*n_embd_gqa,
|
||||
(ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)
|
||||
);
|
||||
offload_func_kq(k);
|
||||
ggml_set_name(k, "k");
|
||||
|
||||
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
|
||||
( n_ctx)*ggml_element_size(kv_self.v),
|
||||
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
|
||||
offload_func_v(v);
|
||||
ggml_set_name(v, "v");
|
||||
|
||||
// important: storing RoPE-ed version of K in the KV cache!
|
||||
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
|
||||
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
|
||||
}
|
||||
struct ggml_tensor * K = ggml_view_3d(ctx0, kv_self.k,
|
||||
n_embd_head, n_kv, n_head_kv,
|
||||
ggml_element_size(kv_self.k)*n_embd_gqa,
|
||||
ggml_element_size(kv_self.k)*n_embd_head,
|
||||
ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
|
||||
|
||||
offload_func_kq(K);
|
||||
ggml_format_name(K, "K_%d", il);
|
||||
|
||||
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
|
||||
offload_func_kq(KQ);
|
||||
ggml_set_name(KQ, "KQ");
|
||||
|
||||
struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
|
||||
offload_func_kq(KQ_scaled);
|
||||
ggml_set_name(KQ_scaled, "KQ_scaled");
|
||||
|
||||
struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
|
||||
offload_func_kq(KQ_masked);
|
||||
ggml_set_name(KQ_masked, "KQ_masked");
|
||||
|
||||
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
|
||||
offload_func_kq(KQ_soft_max);
|
||||
ggml_set_name(KQ_soft_max, "KQ_soft_max");
|
||||
|
||||
struct ggml_tensor * V =
|
||||
ggml_view_3d(ctx0, kv_self.v,
|
||||
n_kv, n_embd_head, n_head_kv,
|
||||
ggml_element_size(kv_self.v)*n_ctx,
|
||||
ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
|
||||
ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
|
||||
offload_func_v(V);
|
||||
ggml_set_name(V, "V");
|
||||
|
||||
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
|
||||
offload_func_v(KQV);
|
||||
ggml_set_name(KQV, "KQV");
|
||||
|
||||
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
|
||||
offload_func_v(KQV_merged);
|
||||
ggml_set_name(KQV_merged, "KQV_merged");
|
||||
|
||||
cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
|
||||
offload_func_v(cur);
|
||||
ggml_set_name(cur, "KQV_merged_contiguous");
|
||||
|
||||
cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur);
|
||||
offload_func(cur);
|
||||
cur = ggml_add(ctx0, cur, model.layers[il].bo);
|
||||
offload_func(cur);
|
||||
ggml_set_name(cur, "result_wo");
|
||||
}
|
||||
|
||||
struct ggml_tensor * inpFF = ggml_add(ctx0, residual, cur);
|
||||
offload_func(inpFF);
|
||||
ggml_set_name(inpFF, "inpFF");
|
||||
{
|
||||
// MLP
|
||||
{
|
||||
// Norm
|
||||
cur = ggml_norm(ctx0, inpFF, norm_eps);
|
||||
offload_func(cur);
|
||||
cur = ggml_add(ctx0,
|
||||
ggml_mul(ctx0, cur, model.layers[il].ffn_norm),
|
||||
model.layers[il].ffn_norm_b
|
||||
);
|
||||
ggml_set_name(cur, "ffn_norm");
|
||||
offload_func(cur);
|
||||
}
|
||||
cur = ggml_mul_mat(ctx0, model.layers[il].w3, cur);
|
||||
offload_func(cur);
|
||||
|
||||
cur = ggml_add(ctx0, cur, model.layers[il].b3);
|
||||
offload_func(cur);
|
||||
ggml_set_name(cur, "result_ffn_up");
|
||||
|
||||
cur = ggml_sqr(ctx0, ggml_relu(ctx0, cur));
|
||||
ggml_set_name(cur, "result_ffn_act");
|
||||
offload_func(cur);
|
||||
offload_func(cur->src[0]);
|
||||
|
||||
cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur);
|
||||
offload_func(cur);
|
||||
cur = ggml_add(ctx0,
|
||||
cur,
|
||||
model.layers[il].b2);
|
||||
offload_func(cur);
|
||||
ggml_set_name(cur, "outFF");
|
||||
}
|
||||
cur = ggml_add(ctx0, cur, inpFF);
|
||||
offload_func(cur);
|
||||
ggml_set_name(cur, "inpFF_+_outFF");
|
||||
inpL = cur;
|
||||
}
|
||||
cur = inpL;
|
||||
{
|
||||
cur = ggml_norm(ctx0, cur, norm_eps);
|
||||
offload_func_nr(cur);
|
||||
cur = ggml_mul(ctx0, cur, model.output_norm);
|
||||
offload_func_nr(cur);
|
||||
|
||||
cur = ggml_add(ctx0, cur, model.output_norm_b);
|
||||
// offload_func_nr(cur);
|
||||
|
||||
ggml_set_name(cur, "result_norm");
|
||||
}
|
||||
cur = ggml_mul_mat(ctx0, model.output, cur);
|
||||
ggml_set_name(cur, "result_output");
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
ggml_free(ctx0);
|
||||
return gf;
|
||||
}
|
||||
|
||||
static struct ggml_cgraph * llama_build_graph(
|
||||
llama_context & lctx,
|
||||
const llama_batch & batch) {
|
||||
@@ -4439,6 +4927,10 @@ static struct ggml_cgraph * llama_build_graph(
|
||||
{
|
||||
result = llm_build_starcoder(lctx, batch);
|
||||
} break;
|
||||
case LLM_ARCH_PERSIMMON:
|
||||
{
|
||||
result = llm_build_persimmon(lctx, batch);
|
||||
} break;
|
||||
case LLM_ARCH_REFACT:
|
||||
{
|
||||
result = llm_build_refact(lctx, batch);
|
||||
@@ -6706,6 +7198,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
}
|
||||
|
||||
std::ofstream fout(fname_out, std::ios::binary);
|
||||
fout.exceptions(std::ofstream::failbit); // fail fast on write errors
|
||||
|
||||
const size_t meta_size = gguf_get_meta_size(ctx_out);
|
||||
|
||||
@@ -8200,7 +8693,9 @@ int llama_token_to_piece(const struct llama_model * model, llama_token token, ch
|
||||
buf[0] = llama_token_to_byte(model->vocab, token);
|
||||
return 1;
|
||||
} else {
|
||||
GGML_ASSERT(false);
|
||||
// TODO: for now we accept all unsupported token types,
|
||||
// suppressing them like CONTROL tokens.
|
||||
// GGML_ASSERT(false);
|
||||
}
|
||||
break;
|
||||
}
|
||||
@@ -8216,7 +8711,9 @@ int llama_token_to_piece(const struct llama_model * model, llama_token token, ch
|
||||
} else if (llama_is_control_token(model->vocab, token)) {
|
||||
;
|
||||
} else {
|
||||
GGML_ASSERT(false);
|
||||
// TODO: for now we accept all unsupported token types,
|
||||
// suppressing them like CONTROL tokens.
|
||||
// GGML_ASSERT(false);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -1,3 +1,3 @@
|
||||
numpy==1.24
|
||||
numpy==1.24.4
|
||||
sentencepiece==0.1.98
|
||||
gguf>=0.1.0
|
||||
|
||||
Reference in New Issue
Block a user