mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-02-19 14:13:22 +02:00
Compare commits
21 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c47cf414ef | ||
|
|
b5f4ae09c3 | ||
|
|
dfbfdd60f9 | ||
|
|
15961ec04d | ||
|
|
a56d09a440 | ||
|
|
d84c48505f | ||
|
|
877b4d0c62 | ||
|
|
12247f4c69 | ||
|
|
4e9a7f7f7f | ||
|
|
3020327f6c | ||
|
|
46acb36767 | ||
|
|
131b058409 | ||
|
|
753e36f650 | ||
|
|
7ce2c77f88 | ||
|
|
aab606a11f | ||
|
|
b0bc9f4a9d | ||
|
|
4755afd1cb | ||
|
|
6e0438da3c | ||
|
|
727107707a | ||
|
|
69ff61397d | ||
|
|
044ec4b2a5 |
22
.github/workflows/close-issue.yml
vendored
Normal file
22
.github/workflows/close-issue.yml
vendored
Normal file
@@ -0,0 +1,22 @@
|
||||
name: Close inactive issues
|
||||
on:
|
||||
schedule:
|
||||
- cron: "42 0 * * *"
|
||||
|
||||
jobs:
|
||||
close-issues:
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
issues: write
|
||||
pull-requests: write
|
||||
steps:
|
||||
- uses: actions/stale@v5
|
||||
with:
|
||||
days-before-issue-stale: 30
|
||||
days-before-issue-close: 14
|
||||
stale-issue-label: "stale"
|
||||
stale-issue-message: "This issue is stale because it has been open for 30 days with no activity."
|
||||
close-issue-message: "This issue was closed because it has been inactive for 14 days since being marked as stale."
|
||||
days-before-pr-stale: -1
|
||||
days-before-pr-close: -1
|
||||
repo-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
2
Makefile
2
Makefile
@@ -553,7 +553,7 @@ endif
|
||||
endif # LLAMA_METAL
|
||||
|
||||
ifdef LLAMA_METAL
|
||||
ggml-metal.o: ggml-metal.m ggml-metal.h
|
||||
ggml-metal.o: ggml-metal.m ggml-metal.h ggml.h
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
|
||||
ifdef LLAMA_METAL_EMBED_LIBRARY
|
||||
|
||||
@@ -112,6 +112,7 @@ Typically finetunes of the base models below are supported as well.
|
||||
- [x] [CodeShell](https://github.com/WisdomShell/codeshell)
|
||||
- [x] [Gemma](https://ai.google.dev/gemma)
|
||||
- [x] [Mamba](https://github.com/state-spaces/mamba)
|
||||
- [x] [Command-R](https://huggingface.co/CohereForAI/c4ai-command-r-v01)
|
||||
|
||||
**Multimodal models:**
|
||||
|
||||
@@ -133,6 +134,7 @@ Typically finetunes of the base models below are supported as well.
|
||||
- Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp)
|
||||
- JS/TS (llama.cpp server client): [lgrammel/modelfusion](https://modelfusion.dev/integration/model-provider/llamacpp)
|
||||
- JavaScript/Wasm (works in browser): [tangledgroup/llama-cpp-wasm](https://github.com/tangledgroup/llama-cpp-wasm)
|
||||
- Typescript/Wasm (nicer API, available on npm): [ngxson/wllama](https://github.com/ngxson/wllama)
|
||||
- Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)
|
||||
- Rust (nicer API): [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
|
||||
- Rust (more direct bindings): [utilityai/llama-cpp-rs](https://github.com/utilityai/llama-cpp-rs)
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -37,10 +37,13 @@ extern char const *LLAMA_COMMIT;
|
||||
extern char const *LLAMA_COMPILER;
|
||||
extern char const *LLAMA_BUILD_TARGET;
|
||||
|
||||
struct llama_control_vector_load_info;
|
||||
|
||||
int32_t get_num_physical_cores();
|
||||
|
||||
//
|
||||
// CLI argument parsing
|
||||
//
|
||||
int32_t get_num_physical_cores();
|
||||
|
||||
struct gpt_params {
|
||||
uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
|
||||
@@ -103,6 +106,11 @@ struct gpt_params {
|
||||
std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
|
||||
std::string lora_base = ""; // base model path for the lora adapter
|
||||
|
||||
std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale
|
||||
|
||||
int32_t control_vector_layer_start = -1; // layer range for control vector
|
||||
int32_t control_vector_layer_end = -1; // layer range for control vector
|
||||
|
||||
int ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
|
||||
int ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
|
||||
// (which is more convenient to use for plotting)
|
||||
@@ -269,3 +277,24 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40
|
||||
void llama_embd_normalize(const float * inp, float * out, int n);
|
||||
|
||||
float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n);
|
||||
|
||||
//
|
||||
// Control vector utils
|
||||
//
|
||||
|
||||
struct llama_control_vector_data {
|
||||
int n_embd;
|
||||
|
||||
// stores data for layers [1, n_layer] where n_layer = data.size() / n_embd
|
||||
std::vector<float> data;
|
||||
};
|
||||
|
||||
struct llama_control_vector_load_info {
|
||||
float strength;
|
||||
|
||||
std::string fname;
|
||||
};
|
||||
|
||||
// Load control vectors, scale each by strength, and add them together.
|
||||
// On error, returns {-1, empty}
|
||||
llama_control_vector_data llama_control_vector_load(const std::vector<llama_control_vector_load_info> & load_infos);
|
||||
|
||||
@@ -1965,6 +1965,23 @@ class MambaModel(Model):
|
||||
self.gguf_writer.add_tensor(new_name, data)
|
||||
|
||||
|
||||
@Model.register("CohereForCausalLM")
|
||||
class CommandR2Model(Model):
|
||||
model_arch = gguf.MODEL_ARCH.COMMAND_R
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
# max_position_embeddings = 8192 in config.json but model was actually
|
||||
# trained on 128k context length
|
||||
self.hparams["max_position_embeddings"] = self.hparams["model_max_length"]
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
self.gguf_writer.add_logit_scale(self.hparams["logit_scale"])
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
|
||||
|
||||
|
||||
###### CONVERSION LOGIC ######
|
||||
|
||||
|
||||
|
||||
126
convert.py
126
convert.py
@@ -332,6 +332,9 @@ class Params:
|
||||
#
|
||||
|
||||
class BpeVocab:
|
||||
tokenizer_model = "gpt2"
|
||||
name = "bpe"
|
||||
|
||||
def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
|
||||
self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read())
|
||||
if isinstance(self.bpe_tokenizer.get('model'), dict):
|
||||
@@ -390,6 +393,9 @@ class BpeVocab:
|
||||
|
||||
|
||||
class SentencePieceVocab:
|
||||
tokenizer_model = "llama"
|
||||
name = "spm"
|
||||
|
||||
def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
|
||||
self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
|
||||
added_tokens: dict[str, int]
|
||||
@@ -453,6 +459,9 @@ class SentencePieceVocab:
|
||||
|
||||
|
||||
class HfVocab:
|
||||
tokenizer_model = "llama"
|
||||
name = "hfft"
|
||||
|
||||
def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None = None) -> None:
|
||||
try:
|
||||
from transformers import AutoTokenizer
|
||||
@@ -553,7 +562,15 @@ class HfVocab:
|
||||
return f"<HfVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
|
||||
|
||||
|
||||
Vocab: TypeAlias = "BpeVocab | SentencePieceVocab | HfVocab"
|
||||
class NoVocab:
|
||||
tokenizer_model = "no_vocab"
|
||||
name = "no_vocab"
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return "<NoVocab for a model without integrated vocabulary>"
|
||||
|
||||
|
||||
Vocab: TypeAlias = "BpeVocab | SentencePieceVocab | HfVocab | NoVocab"
|
||||
|
||||
|
||||
#
|
||||
@@ -935,8 +952,10 @@ def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> N
|
||||
# Handle special case where the model's vocab size is not set
|
||||
if params.n_vocab == -1:
|
||||
raise ValueError(
|
||||
f"The model's vocab size is set to -1 in params.json. Please update it manually. Maybe {vocab.vocab_size}?"
|
||||
f"The model's vocab size is set to -1 in params.json. Please update it manually.{f' Maybe {vocab.vocab_size}?' if hasattr(vocab, 'vocab_size') else ''}"
|
||||
)
|
||||
if isinstance(vocab, NoVocab):
|
||||
return # model has no vocab
|
||||
|
||||
# Check for a vocab size mismatch
|
||||
if params.n_vocab == vocab.vocab_size:
|
||||
@@ -977,6 +996,7 @@ class OutputFile:
|
||||
name = str(params.path_model.parent).split('/')[-1]
|
||||
|
||||
self.gguf.add_name (name)
|
||||
self.gguf.add_vocab_size (params.n_vocab)
|
||||
self.gguf.add_context_length (params.n_ctx)
|
||||
self.gguf.add_embedding_length (params.n_embd)
|
||||
self.gguf.add_block_count (params.n_layer)
|
||||
@@ -1013,21 +1033,9 @@ class OutputFile:
|
||||
if params.ftype is not None:
|
||||
self.gguf.add_file_type(params.ftype)
|
||||
|
||||
def handle_tokenizer_model(self, vocab: Vocab) -> str:
|
||||
# Map the vocab types to the supported tokenizer models
|
||||
tokenizer_model = {
|
||||
SentencePieceVocab: "llama",
|
||||
HfVocab: "llama",
|
||||
BpeVocab: "gpt2",
|
||||
}.get(type(vocab))
|
||||
|
||||
# Block if vocab type is not predefined
|
||||
if tokenizer_model is None:
|
||||
raise ValueError("Unknown vocab type: Not supported")
|
||||
|
||||
return tokenizer_model
|
||||
|
||||
def extract_vocabulary_from_model(self, vocab: Vocab) -> tuple[list[bytes], list[float], list[gguf.TokenType]]:
|
||||
assert not isinstance(vocab, NoVocab)
|
||||
|
||||
tokens = []
|
||||
scores = []
|
||||
toktypes = []
|
||||
@@ -1043,11 +1051,8 @@ class OutputFile:
|
||||
return tokens, scores, toktypes
|
||||
|
||||
def add_meta_vocab(self, vocab: Vocab) -> None:
|
||||
# Handle the tokenizer model
|
||||
tokenizer_model = self.handle_tokenizer_model(vocab)
|
||||
|
||||
# Ensure that tokenizer_model is added to the GGUF model
|
||||
self.gguf.add_tokenizer_model(tokenizer_model)
|
||||
self.gguf.add_tokenizer_model(vocab.tokenizer_model)
|
||||
|
||||
# Extract model vocabulary for model conversion
|
||||
tokens, scores, toktypes = self.extract_vocabulary_from_model(vocab)
|
||||
@@ -1074,6 +1079,26 @@ class OutputFile:
|
||||
def write_tensor_info(self) -> None:
|
||||
self.gguf.write_ti_data_to_file()
|
||||
|
||||
def write_tensor_data(self, ftype: GGMLFileType, model: LazyModel, concurrency: int) -> None:
|
||||
ndarrays_inner = bounded_parallel_map(OutputFile.do_item, model.items(), concurrency=concurrency)
|
||||
if ftype == GGMLFileType.MostlyQ8_0:
|
||||
ndarrays = bounded_parallel_map(
|
||||
OutputFile.maybe_do_quantize, ndarrays_inner, concurrency=concurrency, max_workers=concurrency,
|
||||
use_processpool_executor=True,
|
||||
)
|
||||
else:
|
||||
ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner)
|
||||
|
||||
start = time.time()
|
||||
for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)):
|
||||
elapsed = time.time() - start
|
||||
size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
|
||||
padi = len(str(len(model)))
|
||||
print(
|
||||
f"[{i + 1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}"
|
||||
)
|
||||
self.gguf.write_tensor_data(ndarray)
|
||||
|
||||
def close(self) -> None:
|
||||
self.gguf.close()
|
||||
|
||||
@@ -1082,7 +1107,7 @@ class OutputFile:
|
||||
fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
|
||||
endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False,
|
||||
) -> None:
|
||||
check_vocab_size(params, vocab, pad_vocab = pad_vocab)
|
||||
check_vocab_size(params, vocab, pad_vocab=pad_vocab)
|
||||
|
||||
of = OutputFile(fname_out, endianess=endianess)
|
||||
|
||||
@@ -1120,8 +1145,11 @@ class OutputFile:
|
||||
|
||||
# meta data
|
||||
of.add_meta_arch(params)
|
||||
of.add_meta_vocab(vocab)
|
||||
of.add_meta_special_vocab(svocab)
|
||||
if isinstance(vocab, NoVocab):
|
||||
of.gguf.add_tokenizer_model(vocab.tokenizer_model)
|
||||
else:
|
||||
of.add_meta_vocab(vocab)
|
||||
of.add_meta_special_vocab(svocab)
|
||||
|
||||
# tensor info
|
||||
for name, lazy_tensor in model.items():
|
||||
@@ -1131,24 +1159,7 @@ class OutputFile:
|
||||
of.write_tensor_info()
|
||||
|
||||
# tensor data
|
||||
ndarrays_inner = bounded_parallel_map(OutputFile.do_item, model.items(), concurrency = concurrency)
|
||||
if ftype == GGMLFileType.MostlyQ8_0:
|
||||
ndarrays = bounded_parallel_map(
|
||||
OutputFile.maybe_do_quantize, ndarrays_inner, concurrency=concurrency, max_workers=concurrency,
|
||||
use_processpool_executor=True,
|
||||
)
|
||||
else:
|
||||
ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner)
|
||||
|
||||
start = time.time()
|
||||
for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)):
|
||||
elapsed = time.time() - start
|
||||
size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
|
||||
padi = len(str(len(model)))
|
||||
print(
|
||||
f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}"
|
||||
)
|
||||
of.gguf.write_tensor_data(ndarray)
|
||||
of.write_tensor_data(ftype, model, concurrency)
|
||||
|
||||
of.close()
|
||||
|
||||
@@ -1309,8 +1320,8 @@ class VocabFactory:
|
||||
return vtype, path
|
||||
raise FileNotFoundError(f"Could not find any of {[self._FILES[vt] for vt in vocab_types]}")
|
||||
|
||||
def _create_special_vocab(self, vocab: Vocab, vocabtype: str, model_parent_path: Path) -> gguf.SpecialVocab:
|
||||
load_merges = vocabtype == "bpe"
|
||||
def _create_special_vocab(self, vocab: Vocab, model_parent_path: Path) -> gguf.SpecialVocab:
|
||||
load_merges = vocab.name == "bpe"
|
||||
n_vocab = vocab.vocab_size if hasattr(vocab, "vocab_size") else None
|
||||
return gguf.SpecialVocab(
|
||||
model_parent_path,
|
||||
@@ -1319,30 +1330,34 @@ class VocabFactory:
|
||||
n_vocab=n_vocab,
|
||||
)
|
||||
|
||||
def load_vocab(self, vocab_types: list[str], model_parent_path: Path) -> tuple[Vocab, gguf.SpecialVocab]:
|
||||
def _create_vocab_by_path(self, vocab_types: list[str]) -> Vocab:
|
||||
vocab_type, path = self._select_file(vocab_types)
|
||||
print(f"Loading vocab file {path!r}, type {vocab_type!r}")
|
||||
|
||||
added_tokens_path = path.parent / "added_tokens.json"
|
||||
vocab: Vocab
|
||||
if vocab_type == "bpe":
|
||||
vocab = BpeVocab(
|
||||
return BpeVocab(
|
||||
path, added_tokens_path if added_tokens_path.exists() else None
|
||||
)
|
||||
elif vocab_type == "spm":
|
||||
vocab = SentencePieceVocab(
|
||||
if vocab_type == "spm":
|
||||
return SentencePieceVocab(
|
||||
path, added_tokens_path if added_tokens_path.exists() else None
|
||||
)
|
||||
elif vocab_type == "hfft":
|
||||
vocab = HfVocab(
|
||||
if vocab_type == "hfft":
|
||||
return HfVocab(
|
||||
path.parent, added_tokens_path if added_tokens_path.exists() else None
|
||||
)
|
||||
raise ValueError(vocab_type)
|
||||
|
||||
def load_vocab(self, vocab_types: list[str], model_parent_path: Path) -> tuple[Vocab, gguf.SpecialVocab]:
|
||||
vocab: Vocab
|
||||
if len(vocab_types) == 1 and "no_vocab" in vocab_types:
|
||||
vocab = NoVocab()
|
||||
else:
|
||||
raise ValueError(vocab_type)
|
||||
vocab = self._create_vocab_by_path(vocab_types)
|
||||
# FIXME: Respect --vocab-dir?
|
||||
special_vocab = self._create_special_vocab(
|
||||
vocab,
|
||||
vocab_type,
|
||||
model_parent_path,
|
||||
)
|
||||
return vocab, special_vocab
|
||||
@@ -1380,6 +1395,7 @@ def main(args_in: list[str] | None = None) -> None:
|
||||
parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model")
|
||||
parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file")
|
||||
parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
|
||||
parser.add_argument("--no-vocab", action="store_true", help="store model without the vocab")
|
||||
parser.add_argument("--outtype", choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)")
|
||||
parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file")
|
||||
parser.add_argument("--vocab-type", help="vocab types to try in order, choose from 'spm', 'bpe', 'hfft' (default: spm,hfft)", default="spm,hfft")
|
||||
@@ -1392,6 +1408,10 @@ def main(args_in: list[str] | None = None) -> None:
|
||||
parser.add_argument("--skip-unknown", action="store_true", help="skip unknown tensor names instead of failing")
|
||||
|
||||
args = parser.parse_args(args_in)
|
||||
if args.no_vocab:
|
||||
if args.vocab_only:
|
||||
raise ValueError("no need to specify --vocab-only if using --no-vocab")
|
||||
args.vocab_type = "no_vocab"
|
||||
|
||||
if args.dump_single:
|
||||
model_plus = lazy_load_file(args.model)
|
||||
@@ -1442,7 +1462,7 @@ def main(args_in: list[str] | None = None) -> None:
|
||||
print(f"Wrote {outfile}")
|
||||
return
|
||||
|
||||
if model_plus.vocab is not None and args.vocab_dir is None:
|
||||
if model_plus.vocab is not None and args.vocab_dir is None and not args.no_vocab:
|
||||
vocab = model_plus.vocab
|
||||
|
||||
print(f"Vocab info: {vocab}")
|
||||
|
||||
@@ -112,13 +112,20 @@ int main(int argc, char ** argv) {
|
||||
// tokenize the prompts and trim
|
||||
std::vector<std::vector<int32_t>> inputs;
|
||||
for (const auto & prompt : prompts) {
|
||||
auto inp = ::llama_tokenize(ctx, prompt, true);
|
||||
auto inp = ::llama_tokenize(ctx, prompt, true, false);
|
||||
if (inp.size() > n_batch) {
|
||||
inp.resize(n_batch);
|
||||
}
|
||||
inputs.push_back(inp);
|
||||
}
|
||||
|
||||
// add eos if not present
|
||||
for (auto & inp : inputs) {
|
||||
if (inp.empty() || inp.back() != llama_token_eos(model)) {
|
||||
inp.push_back(llama_token_eos(model));
|
||||
}
|
||||
}
|
||||
|
||||
// tokenization stats
|
||||
if (params.verbose_prompt) {
|
||||
for (int i = 0; i < (int) inputs.size(); i++) {
|
||||
@@ -172,7 +179,7 @@ int main(int argc, char ** argv) {
|
||||
for (int j = 0; j < n_prompts; j++) {
|
||||
fprintf(stdout, "embedding %d: ", j);
|
||||
for (int i = 0; i < std::min(16, n_embd); i++) {
|
||||
fprintf(stdout, "%f ", emb[j * n_embd + i]);
|
||||
fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
|
||||
}
|
||||
fprintf(stdout, "\n");
|
||||
}
|
||||
|
||||
@@ -211,6 +211,7 @@ static bool gguf_ex_read_1(const std::string & fname) {
|
||||
for (int j = 0; j < ggml_nelements(cur); ++j) {
|
||||
if (data[j] != 100 + i) {
|
||||
fprintf(stderr, "%s: tensor[%d]: data[%d] = %f\n", __func__, i, j, data[j]);
|
||||
gguf_free(ctx);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
62
examples/gritlm/README.md
Normal file
62
examples/gritlm/README.md
Normal file
@@ -0,0 +1,62 @@
|
||||
## Generative Representational Instruction Tuning (GRIT) Example
|
||||
[gritlm] a model which can generate embeddings as well as "normal" text
|
||||
generation depending on the instructions in the prompt.
|
||||
|
||||
* Paper: https://arxiv.org/pdf/2402.09906.pdf
|
||||
|
||||
### Retrieval-Augmented Generation (RAG) use case
|
||||
One use case for `gritlm` is to use it with RAG. If we recall how RAG works is
|
||||
that we take documents that we want to use as context, to ground the large
|
||||
language model (LLM), and we create token embeddings for them. We then store
|
||||
these token embeddings in a vector database.
|
||||
|
||||
When we perform a query, prompt the LLM, we will first create token embeddings
|
||||
for the query and then search the vector database to retrieve the most
|
||||
similar vectors, and return those documents so they can be passed to the LLM as
|
||||
context. Then the query and the context will be passed to the LLM which will
|
||||
have to _again_ create token embeddings for the query. But because gritlm is used
|
||||
the first query can be cached and the second query tokenization generation does
|
||||
not have to be performed at all.
|
||||
|
||||
### Running the example
|
||||
Download a Grit model:
|
||||
```console
|
||||
$ scripts/hf.sh --repo cohesionet/GritLM-7B_gguf --file gritlm-7b_q4_1.gguf
|
||||
```
|
||||
|
||||
Run the example using the downloaded model:
|
||||
```console
|
||||
$ ./gritlm -m gritlm-7b_q4_1.gguf
|
||||
|
||||
Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "A purely peer-to-peer version of electronic cash w" is: 0.605
|
||||
Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "All text-based language problems can be reduced to" is: 0.103
|
||||
Cosine similarity between "Generative Representational Instruction Tuning" and "A purely peer-to-peer version of electronic cash w" is: 0.112
|
||||
Cosine similarity between "Generative Representational Instruction Tuning" and "All text-based language problems can be reduced to" is: 0.547
|
||||
|
||||
Oh, brave adventurer, who dared to climb
|
||||
The lofty peak of Mt. Fuji in the night,
|
||||
When shadows lurk and ghosts do roam,
|
||||
And darkness reigns, a fearsome sight.
|
||||
|
||||
Thou didst set out, with heart aglow,
|
||||
To conquer this mountain, so high,
|
||||
And reach the summit, where the stars do glow,
|
||||
And the moon shines bright, up in the sky.
|
||||
|
||||
Through the mist and fog, thou didst press on,
|
||||
With steadfast courage, and a steadfast will,
|
||||
Through the darkness, thou didst not be gone,
|
||||
But didst climb on, with a steadfast skill.
|
||||
|
||||
At last, thou didst reach the summit's crest,
|
||||
And gazed upon the world below,
|
||||
And saw the beauty of the night's best,
|
||||
And felt the peace, that only nature knows.
|
||||
|
||||
Oh, brave adventurer, who dared to climb
|
||||
The lofty peak of Mt. Fuji in the night,
|
||||
Thou art a hero, in the eyes of all,
|
||||
For thou didst conquer this mountain, so bright.
|
||||
```
|
||||
|
||||
[gritlm]: https://github.com/ContextualAI/gritlm
|
||||
@@ -8,6 +8,7 @@
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <ctime>
|
||||
#include <cstdlib>
|
||||
#include <iterator>
|
||||
#include <map>
|
||||
#include <numeric>
|
||||
@@ -103,6 +104,7 @@ static std::string get_cpu_info() {
|
||||
}
|
||||
}
|
||||
}
|
||||
fclose(f);
|
||||
}
|
||||
#endif
|
||||
// TODO: other platforms
|
||||
@@ -1122,15 +1124,19 @@ struct sql_printer : public printer {
|
||||
static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_batch, int n_threads) {
|
||||
llama_set_n_threads(ctx, n_threads, n_threads);
|
||||
|
||||
//std::vector<llama_token> tokens(n_prompt, llama_token_bos(llama_get_model(ctx)));
|
||||
//llama_decode(ctx, llama_batch_get_one(tokens.data(), n_prompt, n_past, 0));
|
||||
//GGML_UNUSED(n_batch);
|
||||
const llama_model * model = llama_get_model(ctx);
|
||||
const int32_t n_vocab = llama_n_vocab(model);
|
||||
|
||||
std::vector<llama_token> tokens(n_batch);
|
||||
|
||||
std::vector<llama_token> tokens(n_batch, llama_token_bos(llama_get_model(ctx)));
|
||||
int n_processed = 0;
|
||||
|
||||
while (n_processed < n_prompt) {
|
||||
int n_tokens = std::min(n_prompt - n_processed, n_batch);
|
||||
tokens[0] = n_processed == 0 && llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab;
|
||||
for (int i = 1; i < n_tokens; i++) {
|
||||
tokens[i] = std::rand() % n_vocab;
|
||||
}
|
||||
llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens, n_past + n_processed, 0));
|
||||
n_processed += n_tokens;
|
||||
}
|
||||
@@ -1141,11 +1147,15 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_bat
|
||||
static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads) {
|
||||
llama_set_n_threads(ctx, n_threads, n_threads);
|
||||
|
||||
llama_token token = llama_token_bos(llama_get_model(ctx));
|
||||
const llama_model * model = llama_get_model(ctx);
|
||||
const int32_t n_vocab = llama_n_vocab(model);
|
||||
|
||||
llama_token token = llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab;
|
||||
|
||||
for (int i = 0; i < n_gen; i++) {
|
||||
llama_decode(ctx, llama_batch_get_one(&token, 1, n_past + i, 0));
|
||||
llama_synchronize(ctx);
|
||||
token = std::rand() % n_vocab;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -995,6 +995,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
if (!new_clip->ctx_data) {
|
||||
fprintf(stderr, "%s: ggml_init() failed\n", __func__);
|
||||
clip_free(new_clip);
|
||||
gguf_free(ctx);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
@@ -1002,6 +1003,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
if (!fin) {
|
||||
printf("cannot open model file for loading tensors\n");
|
||||
clip_free(new_clip);
|
||||
gguf_free(ctx);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
@@ -1023,6 +1025,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
||||
if (!fin) {
|
||||
printf("%s: failed to seek for tensor %s\n", __func__, name);
|
||||
clip_free(new_clip);
|
||||
gguf_free(ctx);
|
||||
return nullptr;
|
||||
}
|
||||
int num_bytes = ggml_nbytes(cur);
|
||||
@@ -1232,16 +1235,16 @@ struct clip_image_f32 * clip_image_f32_init() {
|
||||
|
||||
void clip_image_u8_free(struct clip_image_u8 * img) { delete img; }
|
||||
void clip_image_f32_free(struct clip_image_f32 * img) { delete img; }
|
||||
void clip_image_u8_batch_free(struct clip_image_u8_batch & batch) {
|
||||
if (batch.size > 0) {
|
||||
delete[] batch.data;
|
||||
batch.size = 0;
|
||||
void clip_image_u8_batch_free(struct clip_image_u8_batch * batch) {
|
||||
if (batch->size > 0) {
|
||||
delete[] batch->data;
|
||||
batch->size = 0;
|
||||
}
|
||||
}
|
||||
void clip_image_f32_batch_free(struct clip_image_f32_batch & batch) {
|
||||
if (batch.size > 0) {
|
||||
delete[] batch.data;
|
||||
batch.size = 0;
|
||||
void clip_image_f32_batch_free(struct clip_image_f32_batch * batch) {
|
||||
if (batch->size > 0) {
|
||||
delete[] batch->data;
|
||||
batch->size = 0;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1494,7 +1497,7 @@ static std::vector<clip_image_u8*> divide_to_patches_u8(const clip_image_u8 & im
|
||||
|
||||
// returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector
|
||||
// res_imgs memory is being allocated here, previous allocations will be freed if found
|
||||
bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch & res_imgs) {
|
||||
bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch * res_imgs) {
|
||||
bool pad_to_square = true;
|
||||
if (!ctx->has_vision_encoder) {
|
||||
printf("This gguf file seems to have no vision encoder\n");
|
||||
@@ -1506,11 +1509,11 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
|
||||
pad_to_square = false;
|
||||
}
|
||||
// free the previous res_imgs if any set
|
||||
if (res_imgs.size > 0) {
|
||||
if (res_imgs->size > 0) {
|
||||
clip_image_f32_batch_free(res_imgs);
|
||||
}
|
||||
res_imgs.data = nullptr;
|
||||
res_imgs.size = 0;
|
||||
res_imgs->data = nullptr;
|
||||
res_imgs->size = 0;
|
||||
|
||||
// the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)
|
||||
// see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156
|
||||
@@ -1565,11 +1568,11 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
|
||||
bicubic_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square
|
||||
patches.insert(patches.begin(), image_original_resize);
|
||||
// clip_image_f32_batch_init(patches.size());
|
||||
res_imgs.size = patches.size();
|
||||
res_imgs.data = new clip_image_f32[res_imgs.size];
|
||||
res_imgs->size = patches.size();
|
||||
res_imgs->data = new clip_image_f32[res_imgs->size];
|
||||
int num=0;
|
||||
for (auto& patch : patches) {
|
||||
normalize_image_u8_to_f32(patch, &res_imgs.data[num], ctx->image_mean, ctx->image_std);
|
||||
normalize_image_u8_to_f32(patch, &res_imgs->data[num], ctx->image_mean, ctx->image_std);
|
||||
num++;
|
||||
}
|
||||
|
||||
@@ -1657,9 +1660,9 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
|
||||
// }
|
||||
// res_imgs.push_back(res);
|
||||
|
||||
res_imgs.size = 1;
|
||||
res_imgs.data = new clip_image_f32[res_imgs.size];
|
||||
res_imgs.data[0] = *res;
|
||||
res_imgs->size = 1;
|
||||
res_imgs->data = new clip_image_f32[res_imgs->size];
|
||||
res_imgs->data[0] = *res;
|
||||
clip_image_f32_free(res);
|
||||
|
||||
return true;
|
||||
@@ -1908,6 +1911,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
|
||||
break;
|
||||
default:
|
||||
printf("Please use an input file in f32 or f16\n");
|
||||
gguf_free(ctx_out);
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
@@ -60,8 +60,8 @@ CLIP_API struct clip_image_f32 * clip_image_f32_init();
|
||||
|
||||
CLIP_API void clip_image_u8_free (struct clip_image_u8 * img);
|
||||
CLIP_API void clip_image_f32_free(struct clip_image_f32 * img);
|
||||
CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch & batch);
|
||||
CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch & batch);
|
||||
CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch * batch);
|
||||
CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch * batch);
|
||||
|
||||
CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);
|
||||
|
||||
@@ -69,7 +69,7 @@ CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8
|
||||
CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img);
|
||||
|
||||
/** preprocess img and store the result in res_imgs, pad_to_square may be overriden to false depending on model configuration */
|
||||
CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch & res_imgs );
|
||||
CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs );
|
||||
|
||||
CLIP_API struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);
|
||||
|
||||
|
||||
@@ -223,7 +223,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
|
||||
clip_image_f32_batch img_res_v;
|
||||
img_res_v.size = 0;
|
||||
img_res_v.data = nullptr;
|
||||
if (!clip_image_preprocess(ctx_clip, img, img_res_v)) {
|
||||
if (!clip_image_preprocess(ctx_clip, img, &img_res_v)) {
|
||||
fprintf(stderr, "%s: unable to preprocess image\n", __func__);
|
||||
delete[] img_res_v.data;
|
||||
return false;
|
||||
|
||||
@@ -29,9 +29,9 @@ struct llava_image_embed {
|
||||
};
|
||||
|
||||
/** sanity check for clip <-> llava embed size match */
|
||||
LLAVA_API bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip);
|
||||
LLAVA_API bool llava_validate_embed_size(const struct llama_context * ctx_llama, const struct clip_ctx * ctx_clip);
|
||||
|
||||
LLAVA_API bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out);
|
||||
LLAVA_API bool llava_image_embed_make_with_clip_img(struct clip_ctx * ctx_clip, int n_threads, const struct clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out);
|
||||
|
||||
/** build an image embed from image file bytes */
|
||||
LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length);
|
||||
|
||||
@@ -13,8 +13,11 @@ source /opt/intel/oneapi/setvars.sh
|
||||
#for FP32
|
||||
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
|
||||
|
||||
#build example/main only
|
||||
#build example/main
|
||||
#cmake --build . --config Release --target main
|
||||
|
||||
#build example/llama-bench
|
||||
#cmake --build . --config Release --target llama-bench
|
||||
|
||||
#build all binary
|
||||
cmake --build . --config Release -v
|
||||
|
||||
@@ -9,18 +9,28 @@ source /opt/intel/oneapi/setvars.sh
|
||||
|
||||
if [ $# -gt 0 ]; then
|
||||
GGML_SYCL_DEVICE=$1
|
||||
GGML_SYCL_SINGLE_GPU=1
|
||||
else
|
||||
GGML_SYCL_DEVICE=0
|
||||
fi
|
||||
echo "use $GGML_SYCL_DEVICE as main GPU"
|
||||
|
||||
#export GGML_SYCL_DEBUG=1
|
||||
|
||||
|
||||
#ZES_ENABLE_SYSMAN=1, Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory. Recommended to use when --split-mode = layer.
|
||||
|
||||
#use all GPUs with same max compute units
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
|
||||
if [ $GGML_SYCL_SINGLE_GPU -eq 1 ]; then
|
||||
echo "use $GGML_SYCL_DEVICE as main GPU"
|
||||
#use signle GPU only
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none
|
||||
else
|
||||
#use multiple GPUs with same max compute units
|
||||
ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
|
||||
fi
|
||||
|
||||
#use main GPU only
|
||||
#ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none
|
||||
|
||||
#use multiple GPUs with same max compute units
|
||||
#ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
|
||||
|
||||
|
||||
@@ -711,6 +711,7 @@ static bool load_checkpoint_file(const char * filename, struct my_llama_model *
|
||||
|
||||
load_checkpoint_gguf(fctx, f_ggml_ctx, model, train);
|
||||
|
||||
gguf_free(fctx);
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
@@ -11541,6 +11541,7 @@ static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_ev
|
||||
if (ggml_backend_is_cuda(event->backend)) {
|
||||
CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[cuda_ctx->device][0], (cudaEvent_t)event->context, 0));
|
||||
} else {
|
||||
#if 0
|
||||
// untested
|
||||
auto wait_fn = [](void * user_data) {
|
||||
ggml_backend_event_t event = (ggml_backend_event_t)user_data;
|
||||
@@ -11548,6 +11549,8 @@ static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_ev
|
||||
};
|
||||
|
||||
CUDA_CHECK(cudaLaunchHostFunc(g_cudaStreams[cuda_ctx->device][0], wait_fn, event));
|
||||
#endif
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
343
ggml-sycl.cpp
343
ggml-sycl.cpp
@@ -16,6 +16,7 @@
|
||||
#include <cinttypes>
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <cstdlib>
|
||||
#include <float.h>
|
||||
#include <limits>
|
||||
#include <stdint.h>
|
||||
@@ -24,10 +25,9 @@
|
||||
#include <cmath>
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include <regex>
|
||||
|
||||
#include <sycl/sycl.hpp>
|
||||
#include <sycl/half_type.hpp>
|
||||
@@ -82,6 +82,30 @@ Following definition copied from DPCT head files, which are used by ggml-sycl.cp
|
||||
#define __dpct_noinline__ __attribute__((noinline))
|
||||
#endif
|
||||
|
||||
|
||||
std::string get_device_type_name(const sycl::device &Device) {
|
||||
auto DeviceType = Device.get_info<sycl::info::device::device_type>();
|
||||
switch (DeviceType) {
|
||||
case sycl::info::device_type::cpu:
|
||||
return "cpu";
|
||||
case sycl::info::device_type::gpu:
|
||||
return "gpu";
|
||||
case sycl::info::device_type::host:
|
||||
return "host";
|
||||
case sycl::info::device_type::accelerator:
|
||||
return "acc";
|
||||
default:
|
||||
return "unknown";
|
||||
}
|
||||
}
|
||||
|
||||
std::string get_device_backend_and_type(const sycl::device &device) {
|
||||
std::stringstream device_type;
|
||||
sycl::backend backend = device.get_backend();
|
||||
device_type << backend << ":" << get_device_type_name(device);
|
||||
return device_type.str();
|
||||
}
|
||||
|
||||
namespace dpct
|
||||
{
|
||||
typedef sycl::queue *queue_ptr;
|
||||
@@ -942,17 +966,65 @@ namespace dpct
|
||||
|
||||
private:
|
||||
mutable std::recursive_mutex m_mutex;
|
||||
static bool compare_dev(sycl::device &device1, sycl::device &device2)
|
||||
{
|
||||
dpct::device_info prop1;
|
||||
dpct::get_device_info(prop1, device1);
|
||||
dpct::device_info prop2;
|
||||
dpct::get_device_info(prop2, device2);
|
||||
return prop1.get_max_compute_units() > prop2.get_max_compute_units();
|
||||
}
|
||||
static int convert_backend_index(std::string & backend) {
|
||||
if (backend == "ext_oneapi_level_zero:gpu") return 0;
|
||||
if (backend == "opencl:gpu") return 1;
|
||||
if (backend == "opencl:cpu") return 2;
|
||||
if (backend == "opencl:acc") return 3;
|
||||
printf("convert_backend_index: can't handle backend=%s\n", backend.c_str());
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
static bool compare_backend(std::string &backend1, std::string &backend2) {
|
||||
return convert_backend_index(backend1) < convert_backend_index(backend2);
|
||||
}
|
||||
dev_mgr()
|
||||
{
|
||||
sycl::device default_device =
|
||||
sycl::device(sycl::default_selector_v);
|
||||
_devs.push_back(std::make_shared<device_ext>(default_device));
|
||||
|
||||
std::vector<sycl::device> sycl_all_devs =
|
||||
sycl::device::get_devices(sycl::info::device_type::all);
|
||||
std::vector<sycl::device> sycl_all_devs;
|
||||
// Collect other devices except for the default device.
|
||||
if (default_device.is_cpu())
|
||||
_cpu_device = 0;
|
||||
|
||||
auto Platforms = sycl::platform::get_platforms();
|
||||
// Keep track of the number of devices per backend
|
||||
std::map<sycl::backend, size_t> DeviceNums;
|
||||
std::map<std::string, std::vector<sycl::device>> backend_devices;
|
||||
|
||||
while (!Platforms.empty()) {
|
||||
auto Platform = Platforms.back();
|
||||
Platforms.pop_back();
|
||||
auto devices = Platform.get_devices();
|
||||
std::string backend_type = get_device_backend_and_type(devices[0]);
|
||||
for (const auto &device : devices) {
|
||||
backend_devices[backend_type].push_back(device);
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<std::string> keys;
|
||||
for(auto it = backend_devices.begin(); it != backend_devices.end(); ++it) {
|
||||
keys.push_back(it->first);
|
||||
}
|
||||
std::sort(keys.begin(), keys.end(), compare_backend);
|
||||
|
||||
for (auto &key : keys) {
|
||||
std::vector<sycl::device> devs = backend_devices[key];
|
||||
std::sort(devs.begin(), devs.end(), compare_dev);
|
||||
for (const auto &dev : devs) {
|
||||
sycl_all_devs.push_back(dev);
|
||||
}
|
||||
}
|
||||
|
||||
for (auto &dev : sycl_all_devs)
|
||||
{
|
||||
if (dev == default_device)
|
||||
@@ -3202,6 +3274,11 @@ static int g_work_group_size = 0;
|
||||
#define GGML_SYCL_MMV_Y 1
|
||||
#endif
|
||||
|
||||
enum ggml_sycl_backend_gpu_mode {
|
||||
SYCL_UNSET_GPU_MODE = -1,
|
||||
SYCL_SINGLE_GPU_MODE = 0,
|
||||
SYCL_MUL_GPU_MODE
|
||||
};
|
||||
|
||||
static_assert(sizeof(sycl::half) == sizeof(ggml_fp16_t), "wrong fp16 size");
|
||||
|
||||
@@ -3401,12 +3478,31 @@ class sycl_gpu_mgr {
|
||||
int work_group_size = 0;
|
||||
std::string gpus_list = "";
|
||||
|
||||
/*
|
||||
Use all GPUs with same top max compute units
|
||||
*/
|
||||
sycl_gpu_mgr() {
|
||||
detect_sycl_gpu_list_with_max_cu();
|
||||
get_allow_gpus();
|
||||
create_context_with_gpus();
|
||||
}
|
||||
|
||||
/*
|
||||
Only use the assigned GPU
|
||||
*/
|
||||
sycl_gpu_mgr(int main_gpu_id) {
|
||||
sycl::device device = dpct::dev_mgr::instance().get_device(main_gpu_id);
|
||||
dpct::device_info prop;
|
||||
dpct::get_device_info(prop, device);
|
||||
gpus.push_back(main_gpu_id);
|
||||
devices.push_back(device);
|
||||
work_group_size = prop.get_max_work_group_size();
|
||||
max_compute_units = prop.get_max_compute_units();
|
||||
|
||||
get_allow_gpus();
|
||||
create_context_with_gpus();
|
||||
}
|
||||
|
||||
void create_context_with_gpus() {
|
||||
sycl::context ctx = sycl::context(devices);
|
||||
assert(gpus.size() > 0);
|
||||
@@ -3422,7 +3518,7 @@ class sycl_gpu_mgr {
|
||||
gpus_list += std::to_string(gpus[i]);
|
||||
gpus_list += ",";
|
||||
}
|
||||
if (gpus_list.length() > 2) {
|
||||
if (gpus_list.length() > 1) {
|
||||
gpus_list.pop_back();
|
||||
}
|
||||
}
|
||||
@@ -3451,7 +3547,7 @@ class sycl_gpu_mgr {
|
||||
dpct::device_info prop;
|
||||
dpct::get_device_info(prop, device);
|
||||
if (max_compute_units == prop.get_max_compute_units() &&
|
||||
prop.get_major_version() == 1) {
|
||||
is_ext_oneapi_device(device)) {
|
||||
gpus.push_back(id);
|
||||
devices.push_back(device);
|
||||
work_group_size = prop.get_max_work_group_size();
|
||||
@@ -3471,8 +3567,8 @@ class sycl_gpu_mgr {
|
||||
if (gpus[i] == id)
|
||||
return i;
|
||||
}
|
||||
assert(false);
|
||||
return -1;
|
||||
printf("miss to get device index by id=%d\n", id);
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
|
||||
int get_next_index(int id) {
|
||||
@@ -3481,8 +3577,16 @@ class sycl_gpu_mgr {
|
||||
if (gpus[i] == id)
|
||||
return i;
|
||||
}
|
||||
assert(false);
|
||||
return -1;
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
|
||||
bool is_ext_oneapi_device(const sycl::device &dev) {
|
||||
sycl::backend dev_backend = dev.get_backend();
|
||||
if (dev_backend == sycl::backend::ext_oneapi_level_zero ||
|
||||
dev_backend == sycl::backend::ext_oneapi_cuda ||
|
||||
dev_backend == sycl::backend::ext_oneapi_hip)
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
@@ -3491,11 +3595,14 @@ static int g_device_count = -1;
|
||||
static int g_all_sycl_device_count = -1;
|
||||
static int g_main_device = -1;
|
||||
static int g_main_device_id = -1;
|
||||
static bool g_ggml_backend_sycl_buffer_type_initialized = false;
|
||||
|
||||
static std::array<float, GGML_SYCL_MAX_DEVICES> g_default_tensor_split = {};
|
||||
|
||||
static float g_tensor_split[GGML_SYCL_MAX_DEVICES] = {0};
|
||||
|
||||
static ggml_sycl_backend_gpu_mode g_ggml_sycl_backend_gpu_mode = SYCL_UNSET_GPU_MODE;
|
||||
|
||||
struct sycl_device_capabilities {
|
||||
int cc; // compute capability
|
||||
bool vmm; // virtual memory support
|
||||
@@ -12999,17 +13106,20 @@ bool ggml_sycl_loaded(void) {
|
||||
return g_sycl_loaded;
|
||||
}
|
||||
|
||||
void print_device_detail(int id) {
|
||||
void print_device_detail(int id, sycl::device &device, std::string device_type) {
|
||||
|
||||
dpct::device_info prop;
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(
|
||||
dpct::get_device_info(prop, dpct::dev_mgr::instance().get_device(id))));
|
||||
sycl::device cur_device = dpct::dev_mgr::instance().get_device(id);
|
||||
dpct::get_device_info(prop, device)));
|
||||
|
||||
std::string version;
|
||||
version += std::to_string(prop.get_major_version());
|
||||
version += ".";
|
||||
version += std::to_string(prop.get_minor_version());
|
||||
|
||||
fprintf(stderr, "|%2d|%45s|%18s|%17d|%14d|%13d|%15lu|\n", id,
|
||||
device_type = std::regex_replace(device_type, std::regex("ext_oneapi_"), "");
|
||||
|
||||
fprintf(stderr, "|%2d|%18s|%45s|%10s|%11d|%8d|%7d|%15lu|\n", id, device_type.c_str(),
|
||||
prop.get_name(), version.c_str(), prop.get_max_compute_units(),
|
||||
prop.get_max_work_group_size(), prop.get_max_sub_group_size(),
|
||||
prop.get_global_mem_size());
|
||||
@@ -13017,19 +13127,35 @@ void print_device_detail(int id) {
|
||||
|
||||
void ggml_backend_sycl_print_sycl_devices() {
|
||||
int device_count = dpct::dev_mgr::instance().device_count();
|
||||
std::map<std::string, size_t> DeviceNums;
|
||||
fprintf(stderr, "found %d SYCL devices:\n", device_count);
|
||||
fprintf(stderr, "|ID| Name |compute capability|Max compute units|Max work group|Max sub group|Global mem size|\n");
|
||||
fprintf(stderr, "|--|---------------------------------------------|------------------|-----------------|--------------|-------------|---------------|\n");
|
||||
fprintf(stderr, "| | | |Compute |Max compute|Max work|Max sub| |\n");
|
||||
fprintf(stderr, "|ID| Device Type| Name|capability|units |group |group |Global mem size|\n");
|
||||
fprintf(stderr, "|--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------|\n");
|
||||
for (int id = 0; id < device_count; ++id) {
|
||||
print_device_detail(id);
|
||||
sycl::device device = dpct::dev_mgr::instance().get_device(id);
|
||||
sycl::backend backend = device.get_backend();
|
||||
std::string backend_type = get_device_backend_and_type(device);
|
||||
int type_id=DeviceNums[backend_type]++;
|
||||
std::stringstream device_type;
|
||||
device_type << "[" << backend_type << ":" << std::to_string(type_id) << "]";
|
||||
print_device_detail(id, device, device_type.str());
|
||||
}
|
||||
}
|
||||
|
||||
void print_gpu_device_list() {
|
||||
fprintf(stderr, "detect %d SYCL GPUs: [%s] with Max compute units:%d\n",
|
||||
g_sycl_gpu_mgr->get_gpu_count(),
|
||||
g_sycl_gpu_mgr->gpus_list.c_str(),
|
||||
g_sycl_gpu_mgr->max_compute_units);
|
||||
GGML_ASSERT(g_sycl_gpu_mgr);
|
||||
|
||||
char* hint=NULL;
|
||||
if (g_ggml_sycl_backend_gpu_mode == SYCL_SINGLE_GPU_MODE) {
|
||||
hint = "use %d SYCL GPUs: [%s] with Max compute units:%d\n";
|
||||
} else {
|
||||
hint = "detect %d SYCL GPUs: [%s] with top Max compute units:%d\n";
|
||||
}
|
||||
fprintf(stderr, hint,
|
||||
g_sycl_gpu_mgr->get_gpu_count(),
|
||||
g_sycl_gpu_mgr->gpus_list.c_str(),
|
||||
g_sycl_gpu_mgr->max_compute_units);
|
||||
}
|
||||
|
||||
int get_sycl_env(const char *env_name, int default_val) {
|
||||
@@ -13065,23 +13191,6 @@ void ggml_init_sycl() try {
|
||||
#else
|
||||
fprintf(stderr, "%s: GGML_SYCL_F16: no\n", __func__);
|
||||
#endif
|
||||
if (CHECK_TRY_ERROR(g_all_sycl_device_count =
|
||||
dpct::dev_mgr::instance().device_count()) != 0) {
|
||||
initialized = true;
|
||||
g_sycl_loaded = false;
|
||||
return;
|
||||
}
|
||||
GGML_ASSERT(g_all_sycl_device_count <= GGML_SYCL_MAX_DEVICES);
|
||||
ggml_backend_sycl_print_sycl_devices();
|
||||
|
||||
if (!g_sycl_gpu_mgr) g_sycl_gpu_mgr = new sycl_gpu_mgr();
|
||||
|
||||
g_device_count = g_sycl_gpu_mgr->get_gpu_count();
|
||||
g_work_group_size = g_sycl_gpu_mgr->work_group_size;
|
||||
|
||||
print_gpu_device_list();
|
||||
|
||||
int64_t total_vram = 0;
|
||||
|
||||
/* NOT REMOVE, keep it for next optimize for XMX.
|
||||
#if defined(SYCL_USE_XMX)
|
||||
@@ -13090,49 +13199,15 @@ void ggml_init_sycl() try {
|
||||
fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__);
|
||||
#endif
|
||||
*/
|
||||
for (int id = 0; id < GGML_SYCL_MAX_DEVICES; ++id) {
|
||||
g_device_caps[id].vmm = 0;
|
||||
g_device_caps[id].device_id = -1;
|
||||
g_device_caps[id].cc = 0;
|
||||
g_tensor_split[id] = 0;
|
||||
g_default_tensor_split[id] = 0;
|
||||
|
||||
if (CHECK_TRY_ERROR(g_all_sycl_device_count =
|
||||
dpct::dev_mgr::instance().device_count()) != 0) {
|
||||
initialized = true;
|
||||
g_sycl_loaded = false;
|
||||
return;
|
||||
}
|
||||
|
||||
for (int i = 0; i < g_device_count; ++i) {
|
||||
int device_id = g_sycl_gpu_mgr->gpus[i];
|
||||
g_device_caps[i].vmm = 0;
|
||||
|
||||
dpct::device_info prop;
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
|
||||
prop, dpct::dev_mgr::instance().get_device(device_id))));
|
||||
|
||||
g_default_tensor_split[i] = total_vram;
|
||||
total_vram += prop.get_global_mem_size();
|
||||
|
||||
g_device_caps[i].cc =
|
||||
100 * prop.get_major_version() + 10 * prop.get_minor_version();
|
||||
}
|
||||
|
||||
for (int i = 0; i < g_device_count; ++i) {
|
||||
g_default_tensor_split[i] /= total_vram;
|
||||
}
|
||||
|
||||
for (int i = 0; i < g_device_count; ++i) {
|
||||
SYCL_CHECK(ggml_sycl_set_device(i));
|
||||
|
||||
// create sycl streams
|
||||
for (int is = 0; is < MAX_STREAMS; ++is) {
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(
|
||||
g_syclStreams[i][is] =
|
||||
dpct::get_current_device().create_queue(
|
||||
g_sycl_gpu_mgr->get_co_ctx(), dpct::get_current_device())));
|
||||
}
|
||||
|
||||
const dpct::queue_ptr stream = g_syclStreams[i][0];
|
||||
// create sycl handle
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(g_sycl_handles[i] = stream));
|
||||
}
|
||||
|
||||
GGML_ASSERT(g_all_sycl_device_count <= GGML_SYCL_MAX_DEVICES);
|
||||
ggml_backend_sycl_print_sycl_devices();
|
||||
initialized = true;
|
||||
g_sycl_loaded = true;
|
||||
}
|
||||
@@ -13143,6 +13218,63 @@ catch (sycl::exception const &exc) {
|
||||
std::exit(1);
|
||||
}
|
||||
|
||||
void ggml_init_by_gpus(int device_count) try {
|
||||
g_device_count = device_count;
|
||||
g_work_group_size = g_sycl_gpu_mgr->work_group_size;
|
||||
|
||||
int64_t total_vram = 0;
|
||||
|
||||
print_gpu_device_list();
|
||||
|
||||
for (int id = 0; id < GGML_SYCL_MAX_DEVICES; ++id) {
|
||||
g_device_caps[id].vmm = 0;
|
||||
g_device_caps[id].device_id = -1;
|
||||
g_device_caps[id].cc = 0;
|
||||
g_tensor_split[id] = 0;
|
||||
g_default_tensor_split[id] = 0;
|
||||
}
|
||||
|
||||
for (int i = 0; i < g_device_count; ++i) {
|
||||
int device_id = g_sycl_gpu_mgr->gpus[i];
|
||||
g_device_caps[i].vmm = 0;
|
||||
|
||||
dpct::device_info prop;
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
|
||||
prop, dpct::dev_mgr::instance().get_device(device_id))));
|
||||
|
||||
g_default_tensor_split[i] = total_vram;
|
||||
total_vram += prop.get_global_mem_size();
|
||||
|
||||
g_device_caps[i].cc =
|
||||
100 * prop.get_major_version() + 10 * prop.get_minor_version();
|
||||
}
|
||||
|
||||
for (int i = 0; i < g_device_count; ++i) {
|
||||
g_default_tensor_split[i] /= total_vram;
|
||||
}
|
||||
|
||||
for (int i = 0; i < g_device_count; ++i) {
|
||||
SYCL_CHECK(ggml_sycl_set_device(i));
|
||||
|
||||
// create sycl streams
|
||||
for (int is = 0; is < MAX_STREAMS; ++is) {
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(
|
||||
g_syclStreams[i][is] =
|
||||
dpct::get_current_device().create_queue(
|
||||
g_sycl_gpu_mgr->get_co_ctx(), dpct::get_current_device())));
|
||||
}
|
||||
|
||||
const dpct::queue_ptr stream = g_syclStreams[i][0];
|
||||
// create sycl handle
|
||||
SYCL_CHECK(CHECK_TRY_ERROR(g_sycl_handles[i] = stream));
|
||||
}
|
||||
}
|
||||
catch (sycl::exception const &exc) {
|
||||
std::cerr << exc.what() << "Exception caught at file:" << __FILE__
|
||||
<< ", line:" << __LINE__ << std::endl;
|
||||
std::exit(1);
|
||||
}
|
||||
|
||||
void *ggml_sycl_host_malloc(size_t size) try {
|
||||
if (getenv("GGML_SYCL_NO_PINNED") != nullptr) {
|
||||
return nullptr;
|
||||
@@ -16542,22 +16674,24 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = {
|
||||
/* .is_host = */ nullptr,
|
||||
};
|
||||
|
||||
ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) {
|
||||
ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device_index) {
|
||||
if (device_index>=g_device_count or device_index<0) {
|
||||
printf("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n",
|
||||
device_index, g_device_count-1);
|
||||
GGML_ASSERT(device_index<g_device_count);
|
||||
}
|
||||
static struct ggml_backend_buffer_type ggml_backend_sycl_buffer_types[GGML_SYCL_MAX_DEVICES];
|
||||
|
||||
static bool ggml_backend_sycl_buffer_type_initialized = false;
|
||||
|
||||
if (!ggml_backend_sycl_buffer_type_initialized) {
|
||||
if (!g_ggml_backend_sycl_buffer_type_initialized) {
|
||||
for (int i = 0; i < g_device_count; i++) {
|
||||
ggml_backend_sycl_buffer_types[i] = {
|
||||
/* .iface = */ ggml_backend_sycl_buffer_type_interface,
|
||||
/* .context = */ new ggml_backend_sycl_buffer_type_context{i, GGML_SYCL_NAME + std::to_string(g_sycl_gpu_mgr->gpus[i])},
|
||||
};
|
||||
}
|
||||
ggml_backend_sycl_buffer_type_initialized = true;
|
||||
g_ggml_backend_sycl_buffer_type_initialized = true;
|
||||
}
|
||||
|
||||
return &ggml_backend_sycl_buffer_types[device];
|
||||
return &ggml_backend_sycl_buffer_types[device_index];
|
||||
}
|
||||
|
||||
// sycl split buffer type
|
||||
@@ -17310,11 +17444,42 @@ GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id) {
|
||||
return g_sycl_gpu_mgr->get_index(device_id);
|
||||
}
|
||||
|
||||
GGML_API GGML_CALL int ggml_backend_sycl_get_device_id(int device_index) {
|
||||
return g_sycl_gpu_mgr->gpus[device_index];
|
||||
}
|
||||
|
||||
GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id) {
|
||||
GGML_ASSERT(main_gpu_id<g_all_sycl_device_count);
|
||||
fprintf(stderr, "ggml_backend_sycl_set_single_device: use single device: [%d]\n", main_gpu_id);
|
||||
if (g_sycl_gpu_mgr) {
|
||||
delete g_sycl_gpu_mgr;
|
||||
}
|
||||
g_sycl_gpu_mgr = new sycl_gpu_mgr(main_gpu_id);
|
||||
g_ggml_sycl_backend_gpu_mode = SYCL_SINGLE_GPU_MODE;
|
||||
ggml_init_by_gpus(g_sycl_gpu_mgr->get_gpu_count());
|
||||
g_ggml_backend_sycl_buffer_type_initialized = false;
|
||||
}
|
||||
|
||||
GGML_API GGML_CALL void ggml_backend_sycl_set_mul_device_mode() {
|
||||
if (g_ggml_sycl_backend_gpu_mode == SYCL_MUL_GPU_MODE) {
|
||||
return;
|
||||
}
|
||||
|
||||
fprintf(stderr, "ggml_backend_sycl_set_mul_device_mode: true\n");
|
||||
|
||||
if (g_sycl_gpu_mgr) {
|
||||
delete g_sycl_gpu_mgr;
|
||||
}
|
||||
g_sycl_gpu_mgr = new sycl_gpu_mgr();
|
||||
g_ggml_sycl_backend_gpu_mode = SYCL_MUL_GPU_MODE;
|
||||
ggml_init_by_gpus(g_sycl_gpu_mgr->get_gpu_count());
|
||||
g_ggml_backend_sycl_buffer_type_initialized = false;
|
||||
}
|
||||
|
||||
extern "C" int ggml_backend_sycl_reg_devices();
|
||||
|
||||
int ggml_backend_sycl_reg_devices() {
|
||||
if (!g_sycl_gpu_mgr) g_sycl_gpu_mgr = new sycl_gpu_mgr();
|
||||
g_device_count = g_sycl_gpu_mgr->get_gpu_count();
|
||||
ggml_backend_sycl_set_mul_device_mode();
|
||||
assert(g_device_count>0);
|
||||
for (int i = 0; i < g_device_count; i++) {
|
||||
int id = g_sycl_gpu_mgr->gpus[i];
|
||||
|
||||
@@ -29,6 +29,11 @@ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_typ
|
||||
GGML_API GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);
|
||||
GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id);
|
||||
|
||||
// TODO: these are temporary
|
||||
// ref: https://github.com/ggerganov/llama.cpp/pull/6022#issuecomment-1992615670
|
||||
GGML_API GGML_CALL int ggml_backend_sycl_get_device_id(int device_index);
|
||||
GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id);
|
||||
GGML_API GGML_CALL void ggml_backend_sycl_set_mul_device_mode();
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
112
ggml.c
112
ggml.c
@@ -470,6 +470,19 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||
.type_size = sizeof(int32_t),
|
||||
.is_quantized = false,
|
||||
},
|
||||
[GGML_TYPE_I64] = {
|
||||
.type_name = "i64",
|
||||
.blck_size = 1,
|
||||
.type_size = sizeof(int64_t),
|
||||
.is_quantized = false,
|
||||
},
|
||||
[GGML_TYPE_F64] = {
|
||||
.type_name = "f64",
|
||||
.blck_size = 1,
|
||||
.type_size = sizeof(double),
|
||||
.is_quantized = false,
|
||||
.nrows = 1,
|
||||
},
|
||||
[GGML_TYPE_F32] = {
|
||||
.type_name = "f32",
|
||||
.blck_size = 1,
|
||||
@@ -918,6 +931,101 @@ inline static float vaddvq_f32(float32x4_t v) {
|
||||
#define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE
|
||||
#endif
|
||||
|
||||
#elif defined(__AVX512F__)
|
||||
|
||||
#define GGML_SIMD
|
||||
|
||||
// F32 AVX512
|
||||
|
||||
#define GGML_F32_STEP 64
|
||||
#define GGML_F32_EPR 16
|
||||
|
||||
#define GGML_F32x16 __m512
|
||||
#define GGML_F32x16_ZERO _mm512_setzero_ps()
|
||||
#define GGML_F32x16_SET1(x) _mm512_set1_ps(x)
|
||||
#define GGML_F32x16_LOAD _mm512_loadu_ps
|
||||
#define GGML_F32x16_STORE _mm512_storeu_ps
|
||||
// _mm512_fmadd_ps is defined in AVX512F so no guard is required
|
||||
#define GGML_F32x16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
|
||||
#define GGML_F32x16_ADD _mm512_add_ps
|
||||
#define GGML_F32x16_MUL _mm512_mul_ps
|
||||
#define GGML_F32x16_REDUCE(res, x) \
|
||||
do { \
|
||||
int offset = GGML_F32_ARR >> 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
x[i] = _mm512_add_ps(x[i], x[offset+i]); \
|
||||
} \
|
||||
offset >>= 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
x[i] = _mm512_add_ps(x[i], x[offset+i]); \
|
||||
} \
|
||||
offset >>= 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
x[i] = _mm512_add_ps(x[i], x[offset+i]); \
|
||||
} \
|
||||
res = _mm512_reduce_add_ps(x[0]); \
|
||||
} while (0)
|
||||
|
||||
// TODO: is this optimal ?
|
||||
|
||||
#define GGML_F32_VEC GGML_F32x16
|
||||
#define GGML_F32_VEC_ZERO GGML_F32x16_ZERO
|
||||
#define GGML_F32_VEC_SET1 GGML_F32x16_SET1
|
||||
#define GGML_F32_VEC_LOAD GGML_F32x16_LOAD
|
||||
#define GGML_F32_VEC_STORE GGML_F32x16_STORE
|
||||
#define GGML_F32_VEC_FMA GGML_F32x16_FMA
|
||||
#define GGML_F32_VEC_ADD GGML_F32x16_ADD
|
||||
#define GGML_F32_VEC_MUL GGML_F32x16_MUL
|
||||
#define GGML_F32_VEC_REDUCE GGML_F32x16_REDUCE
|
||||
|
||||
// F16 AVX512
|
||||
|
||||
// F16 AVX
|
||||
|
||||
#define GGML_F16_STEP 64
|
||||
#define GGML_F16_EPR 16
|
||||
|
||||
// AVX512 has FP16 extension (AVX512_FP16) but I don't have it on my machine so I use FP32 instead
|
||||
|
||||
#define GGML_F32Cx16 __m512
|
||||
#define GGML_F32Cx16_ZERO _mm512_setzero_ps()
|
||||
#define GGML_F32Cx16_SET1(x) _mm512_set1_ps(x)
|
||||
|
||||
// unlike _mm256_cvt intrinsics that require F16C, _mm512_cvt is defined in AVX512F
|
||||
// so F16C guard isn't required
|
||||
#define GGML_F32Cx16_LOAD(x) _mm512_cvtph_ps(_mm256_loadu_si256((__m256i *)(x)))
|
||||
#define GGML_F32Cx16_STORE(x, y) _mm256_storeu_si256((__m256i *)(x), _mm512_cvtps_ph(y, 0))
|
||||
|
||||
#define GGML_F32Cx16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
|
||||
#define GGML_F32Cx16_ADD _mm512_add_ps
|
||||
#define GGML_F32Cx16_MUL _mm512_mul_ps
|
||||
#define GGML_F32Cx16_REDUCE(res, x) \
|
||||
do { \
|
||||
int offset = GGML_F32_ARR >> 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
x[i] = _mm512_add_ps(x[i], x[offset+i]); \
|
||||
} \
|
||||
offset >>= 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
x[i] = _mm512_add_ps(x[i], x[offset+i]); \
|
||||
} \
|
||||
offset >>= 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
x[i] = _mm512_add_ps(x[i], x[offset+i]); \
|
||||
} \
|
||||
res = _mm512_reduce_add_ps(x[0]); \
|
||||
} while (0)
|
||||
|
||||
#define GGML_F16_VEC GGML_F32Cx16
|
||||
#define GGML_F16_VEC_ZERO GGML_F32Cx16_ZERO
|
||||
#define GGML_F16_VEC_SET1 GGML_F32Cx16_SET1
|
||||
#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx16_LOAD(p)
|
||||
#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx16_STORE(p, r[i])
|
||||
#define GGML_F16_VEC_FMA GGML_F32Cx16_FMA
|
||||
#define GGML_F16_VEC_ADD GGML_F32Cx16_ADD
|
||||
#define GGML_F16_VEC_MUL GGML_F32Cx16_MUL
|
||||
#define GGML_F16_VEC_REDUCE GGML_F32Cx16_REDUCE
|
||||
|
||||
#elif defined(__AVX__)
|
||||
|
||||
#define GGML_SIMD
|
||||
@@ -12418,6 +12526,8 @@ static void ggml_compute_forward_alibi(
|
||||
case GGML_TYPE_I8:
|
||||
case GGML_TYPE_I16:
|
||||
case GGML_TYPE_I32:
|
||||
case GGML_TYPE_I64:
|
||||
case GGML_TYPE_F64:
|
||||
case GGML_TYPE_COUNT:
|
||||
{
|
||||
GGML_ASSERT(false);
|
||||
@@ -12504,6 +12614,8 @@ static void ggml_compute_forward_clamp(
|
||||
case GGML_TYPE_I8:
|
||||
case GGML_TYPE_I16:
|
||||
case GGML_TYPE_I32:
|
||||
case GGML_TYPE_I64:
|
||||
case GGML_TYPE_F64:
|
||||
case GGML_TYPE_COUNT:
|
||||
{
|
||||
GGML_ASSERT(false);
|
||||
|
||||
2
ggml.h
2
ggml.h
@@ -366,6 +366,8 @@ extern "C" {
|
||||
GGML_TYPE_I8 = 24,
|
||||
GGML_TYPE_I16 = 25,
|
||||
GGML_TYPE_I32 = 26,
|
||||
GGML_TYPE_I64 = 27,
|
||||
GGML_TYPE_F64 = 28,
|
||||
GGML_TYPE_COUNT,
|
||||
};
|
||||
|
||||
|
||||
@@ -32,6 +32,7 @@ class Keys:
|
||||
FILE_TYPE = "general.file_type"
|
||||
|
||||
class LLM:
|
||||
VOCAB_SIZE = "{arch}.vocab_size"
|
||||
CONTEXT_LENGTH = "{arch}.context_length"
|
||||
EMBEDDING_LENGTH = "{arch}.embedding_length"
|
||||
BLOCK_COUNT = "{arch}.block_count"
|
||||
@@ -41,6 +42,7 @@ class Keys:
|
||||
EXPERT_COUNT = "{arch}.expert_count"
|
||||
EXPERT_USED_COUNT = "{arch}.expert_used_count"
|
||||
POOLING_TYPE = "{arch}.pooling_type"
|
||||
LOGIT_SCALE = "{arch}.logit_scale"
|
||||
|
||||
class Attention:
|
||||
HEAD_COUNT = "{arch}.attention.head_count"
|
||||
@@ -120,6 +122,7 @@ class MODEL_ARCH(IntEnum):
|
||||
GEMMA = auto()
|
||||
STARCODER2 = auto()
|
||||
MAMBA = auto()
|
||||
COMMAND_R = auto()
|
||||
|
||||
|
||||
class MODEL_TENSOR(IntEnum):
|
||||
@@ -186,6 +189,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||
MODEL_ARCH.GEMMA: "gemma",
|
||||
MODEL_ARCH.STARCODER2: "starcoder2",
|
||||
MODEL_ARCH.MAMBA: "mamba",
|
||||
MODEL_ARCH.COMMAND_R: "command-r",
|
||||
}
|
||||
|
||||
TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
||||
@@ -578,6 +582,18 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
MODEL_TENSOR.SSM_D,
|
||||
MODEL_TENSOR.SSM_OUT,
|
||||
],
|
||||
MODEL_ARCH.COMMAND_R: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.ATTN_Q,
|
||||
MODEL_TENSOR.ATTN_K,
|
||||
MODEL_TENSOR.ATTN_V,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.FFN_GATE,
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
],
|
||||
# TODO
|
||||
}
|
||||
|
||||
@@ -664,6 +680,8 @@ class GGMLQuantizationType(IntEnum):
|
||||
I8 = 24
|
||||
I16 = 25
|
||||
I32 = 26
|
||||
I64 = 27
|
||||
F64 = 28
|
||||
|
||||
|
||||
class GGUFEndian(IntEnum):
|
||||
@@ -733,6 +751,8 @@ GGML_QUANT_SIZES = {
|
||||
GGMLQuantizationType.I8: (1, 1),
|
||||
GGMLQuantizationType.I16: (1, 2),
|
||||
GGMLQuantizationType.I32: (1, 4),
|
||||
GGMLQuantizationType.I64: (1, 8),
|
||||
GGMLQuantizationType.F64: (1, 8),
|
||||
}
|
||||
|
||||
|
||||
@@ -752,6 +772,7 @@ KEY_GENERAL_SOURCE_HF_REPO = Keys.General.SOURCE_HF_REPO
|
||||
KEY_GENERAL_FILE_TYPE = Keys.General.FILE_TYPE
|
||||
|
||||
# LLM
|
||||
KEY_VOCAB_SIZE = Keys.LLM.VOCAB_SIZE
|
||||
KEY_CONTEXT_LENGTH = Keys.LLM.CONTEXT_LENGTH
|
||||
KEY_EMBEDDING_LENGTH = Keys.LLM.EMBEDDING_LENGTH
|
||||
KEY_BLOCK_COUNT = Keys.LLM.BLOCK_COUNT
|
||||
|
||||
@@ -242,12 +242,15 @@ class GGUFReader:
|
||||
n_bytes = n_elems * type_size // block_size
|
||||
data_offs = int(start_offs + offset_tensor[0])
|
||||
item_type: npt.DTypeLike
|
||||
if ggml_type == GGMLQuantizationType.F32:
|
||||
item_count = n_elems
|
||||
item_type = np.float32
|
||||
elif ggml_type == GGMLQuantizationType.F16:
|
||||
if ggml_type == GGMLQuantizationType.F16:
|
||||
item_count = n_elems
|
||||
item_type = np.float16
|
||||
elif ggml_type == GGMLQuantizationType.F32:
|
||||
item_count = n_elems
|
||||
item_type = np.float32
|
||||
elif ggml_type == GGMLQuantizationType.F64:
|
||||
item_count = n_elems
|
||||
item_type = np.float64
|
||||
elif ggml_type == GGMLQuantizationType.I8:
|
||||
item_count = n_elems
|
||||
item_type = np.int8
|
||||
@@ -257,6 +260,9 @@ class GGUFReader:
|
||||
elif ggml_type == GGMLQuantizationType.I32:
|
||||
item_count = n_elems
|
||||
item_type = np.int32
|
||||
elif ggml_type == GGMLQuantizationType.I64:
|
||||
item_count = n_elems
|
||||
item_type = np.int64
|
||||
else:
|
||||
item_count = n_bytes
|
||||
item_type = np.uint8
|
||||
|
||||
@@ -204,18 +204,22 @@ class GGUFWriter:
|
||||
for i in range(n_dims):
|
||||
self.ti_data += self._pack("Q", tensor_shape[n_dims - 1 - i])
|
||||
if raw_dtype is None:
|
||||
if tensor_dtype == np.float32:
|
||||
dtype = GGMLQuantizationType.F32
|
||||
elif tensor_dtype == np.float16:
|
||||
if tensor_dtype == np.float16:
|
||||
dtype = GGMLQuantizationType.F16
|
||||
elif tensor_dtype == np.float32:
|
||||
dtype = GGMLQuantizationType.F32
|
||||
elif tensor_dtype == np.float64:
|
||||
dtype = GGMLQuantizationType.F64
|
||||
elif tensor_dtype == np.int8:
|
||||
dtype = GGMLQuantizationType.I8
|
||||
elif tensor_dtype == np.int16:
|
||||
dtype = GGMLQuantizationType.I16
|
||||
elif tensor_dtype == np.int32:
|
||||
dtype = GGMLQuantizationType.I32
|
||||
elif tensor_dtype == np.int64:
|
||||
dtype = GGMLQuantizationType.I64
|
||||
else:
|
||||
raise ValueError("Only F32, F16, I8, I16, I32 tensors are supported for now")
|
||||
raise ValueError("Only F16, F32, F64, I8, I16, I32, I64 tensors are supported for now")
|
||||
else:
|
||||
dtype = raw_dtype
|
||||
self.ti_data += self._pack("I", dtype)
|
||||
@@ -321,6 +325,9 @@ class GGUFWriter:
|
||||
self.data_alignment = alignment
|
||||
self.add_uint32(Keys.General.ALIGNMENT, alignment)
|
||||
|
||||
def add_vocab_size(self, size: int) -> None:
|
||||
self.add_uint32(Keys.LLM.VOCAB_SIZE.format(arch=self.arch), size)
|
||||
|
||||
def add_context_length(self, length: int) -> None:
|
||||
self.add_uint32(Keys.LLM.CONTEXT_LENGTH.format(arch=self.arch), length)
|
||||
|
||||
@@ -354,6 +361,9 @@ class GGUFWriter:
|
||||
def add_clamp_kqv(self, value: float) -> None:
|
||||
self.add_float32(Keys.Attention.CLAMP_KQV.format(arch=self.arch), value)
|
||||
|
||||
def add_logit_scale(self, value: float) -> None:
|
||||
self.add_float32(Keys.LLM.LOGIT_SCALE.format(arch=self.arch), value)
|
||||
|
||||
def add_expert_count(self, count: int) -> None:
|
||||
self.add_uint32(Keys.LLM.EXPERT_COUNT.format(arch=self.arch), count)
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[tool.poetry]
|
||||
name = "gguf"
|
||||
version = "0.7.0"
|
||||
version = "0.8.0"
|
||||
description = "Read and write ML models in GGUF for GGML"
|
||||
authors = ["GGML <ggml@ggml.ai>"]
|
||||
packages = [
|
||||
|
||||
451
llama.cpp
451
llama.cpp
@@ -214,6 +214,7 @@ enum llm_arch {
|
||||
LLM_ARCH_GEMMA,
|
||||
LLM_ARCH_STARCODER2,
|
||||
LLM_ARCH_MAMBA,
|
||||
LLM_ARCH_COMMAND_R,
|
||||
LLM_ARCH_UNKNOWN,
|
||||
};
|
||||
|
||||
@@ -243,6 +244,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
{ LLM_ARCH_GEMMA, "gemma" },
|
||||
{ LLM_ARCH_STARCODER2, "starcoder2" },
|
||||
{ LLM_ARCH_MAMBA, "mamba" },
|
||||
{ LLM_ARCH_COMMAND_R, "command-r" },
|
||||
{ LLM_ARCH_UNKNOWN, "(unknown)" },
|
||||
};
|
||||
|
||||
@@ -258,6 +260,7 @@ enum llm_kv {
|
||||
LLM_KV_GENERAL_SOURCE_URL,
|
||||
LLM_KV_GENERAL_SOURCE_HF_REPO,
|
||||
|
||||
LLM_KV_VOCAB_SIZE,
|
||||
LLM_KV_CONTEXT_LENGTH,
|
||||
LLM_KV_EMBEDDING_LENGTH,
|
||||
LLM_KV_BLOCK_COUNT,
|
||||
@@ -267,6 +270,7 @@ enum llm_kv {
|
||||
LLM_KV_EXPERT_COUNT,
|
||||
LLM_KV_EXPERT_USED_COUNT,
|
||||
LLM_KV_POOLING_TYPE,
|
||||
LLM_KV_LOGIT_SCALE,
|
||||
|
||||
LLM_KV_ATTENTION_HEAD_COUNT,
|
||||
LLM_KV_ATTENTION_HEAD_COUNT_KV,
|
||||
@@ -321,6 +325,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||
{ LLM_KV_GENERAL_SOURCE_URL, "general.source.url" },
|
||||
{ LLM_KV_GENERAL_SOURCE_HF_REPO, "general.source.huggingface.repository" },
|
||||
|
||||
{ LLM_KV_VOCAB_SIZE, "%s.vocab_size" },
|
||||
{ LLM_KV_CONTEXT_LENGTH, "%s.context_length" },
|
||||
{ LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" },
|
||||
{ LLM_KV_BLOCK_COUNT, "%s.block_count" },
|
||||
@@ -330,6 +335,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
|
||||
{ LLM_KV_EXPERT_COUNT, "%s.expert_count" },
|
||||
{ LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
|
||||
{ LLM_KV_POOLING_TYPE , "%s.pooling_type" },
|
||||
{ LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
|
||||
|
||||
{ LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
|
||||
{ LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
|
||||
@@ -836,6 +842,21 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
|
||||
{ LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_COMMAND_R,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
||||
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
||||
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
||||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
||||
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_UNKNOWN,
|
||||
{
|
||||
@@ -1595,6 +1616,7 @@ enum e_model {
|
||||
MODEL_20B,
|
||||
MODEL_30B,
|
||||
MODEL_34B,
|
||||
MODEL_35B,
|
||||
MODEL_40B,
|
||||
MODEL_65B,
|
||||
MODEL_70B,
|
||||
@@ -1641,6 +1663,7 @@ struct llama_hparams {
|
||||
|
||||
float f_clamp_kqv = 0.0f;
|
||||
float f_max_alibi_bias = 0.0f;
|
||||
float f_logit_scale = 0.0f;
|
||||
|
||||
bool causal_attn = true;
|
||||
bool need_kq_pos = false;
|
||||
@@ -1871,6 +1894,31 @@ struct llama_kv_cache {
|
||||
}
|
||||
};
|
||||
|
||||
struct llama_control_vector {
|
||||
std::vector<struct ggml_tensor *> tensors; // per layer
|
||||
std::vector<struct ggml_context *> ctxs;
|
||||
std::vector<ggml_backend_buffer_t> bufs;
|
||||
|
||||
int32_t layer_start = -1;
|
||||
int32_t layer_end = -1;
|
||||
|
||||
ggml_tensor * tensor_for(int il) const {
|
||||
if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) {
|
||||
return nullptr;
|
||||
}
|
||||
return tensors[il];
|
||||
}
|
||||
|
||||
~llama_control_vector() {
|
||||
for (struct ggml_context * ctx : ctxs) {
|
||||
ggml_free(ctx);
|
||||
}
|
||||
for (ggml_backend_buffer_t buf : bufs) {
|
||||
ggml_backend_buffer_free(buf);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
struct llama_vocab {
|
||||
using id = int32_t;
|
||||
using token = std::string;
|
||||
@@ -2085,6 +2133,9 @@ struct llama_context {
|
||||
struct ggml_tensor * inp_s_mask; // F32 [1, kv_size]
|
||||
struct ggml_tensor * inp_s_seq; // I32 [kv_size, n_batch]
|
||||
|
||||
// control vectors
|
||||
struct llama_control_vector cvec;
|
||||
|
||||
#ifdef GGML_USE_MPI
|
||||
ggml_mpi_context * ctx_mpi = NULL;
|
||||
#endif
|
||||
@@ -3229,6 +3280,7 @@ static const char * llama_model_type_name(e_model type) {
|
||||
case MODEL_20B: return "20B";
|
||||
case MODEL_30B: return "30B";
|
||||
case MODEL_34B: return "34B";
|
||||
case MODEL_35B: return "35B";
|
||||
case MODEL_40B: return "40B";
|
||||
case MODEL_65B: return "65B";
|
||||
case MODEL_70B: return "70B";
|
||||
@@ -3242,10 +3294,11 @@ static const char * llama_model_type_name(e_model type) {
|
||||
|
||||
static const char * llama_model_vocab_type_name(enum llama_vocab_type type){
|
||||
switch (type) {
|
||||
case LLAMA_VOCAB_TYPE_SPM: return "SPM";
|
||||
case LLAMA_VOCAB_TYPE_BPE: return "BPE";
|
||||
case LLAMA_VOCAB_TYPE_WPM: return "WPM";
|
||||
default: return "unknown";
|
||||
case LLAMA_VOCAB_TYPE_NONE: return "no vocab";
|
||||
case LLAMA_VOCAB_TYPE_SPM: return "SPM";
|
||||
case LLAMA_VOCAB_TYPE_BPE: return "BPE";
|
||||
case LLAMA_VOCAB_TYPE_WPM: return "WPM";
|
||||
default: return "unknown";
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3277,14 +3330,14 @@ static void llm_load_hparams(
|
||||
ml.get_key(LLM_KV_GENERAL_NAME, model.name, false);
|
||||
|
||||
// get hparams kv
|
||||
ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab);
|
||||
ml.get_key (LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
|
||||
ml.get_key (LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
|
||||
ml.get_key (LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff);
|
||||
ml.get_key (LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head);
|
||||
ml.get_key (LLM_KV_BLOCK_COUNT, hparams.n_layer);
|
||||
ml.get_key (LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
|
||||
ml.get_key (LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
|
||||
ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab);
|
||||
ml.get_key(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
|
||||
ml.get_key(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
|
||||
ml.get_key(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff);
|
||||
ml.get_key(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head);
|
||||
ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);
|
||||
ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false);
|
||||
ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false);
|
||||
|
||||
GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS);
|
||||
GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert);
|
||||
@@ -3620,6 +3673,15 @@ static void llm_load_hparams(
|
||||
default: model.type = e_model::MODEL_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_COMMAND_R:
|
||||
{
|
||||
ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||
switch (hparams.n_layer) {
|
||||
case 40: model.type = e_model::MODEL_35B; break;
|
||||
default: model.type = e_model::MODEL_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
default: (void)0;
|
||||
}
|
||||
|
||||
@@ -3645,30 +3707,25 @@ static void llm_load_vocab(
|
||||
|
||||
const auto kv = LLM_KV(model.arch);
|
||||
|
||||
const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
|
||||
if (token_idx == -1) {
|
||||
throw std::runtime_error("cannot find tokenizer vocab in model file\n");
|
||||
}
|
||||
|
||||
const float * scores = nullptr;
|
||||
const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str());
|
||||
if (score_idx != -1) {
|
||||
scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
|
||||
}
|
||||
|
||||
const int * toktypes = nullptr;
|
||||
const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
|
||||
if (toktype_idx != -1) {
|
||||
toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
|
||||
}
|
||||
|
||||
// determine vocab type
|
||||
{
|
||||
std::string tokenizer_name;
|
||||
|
||||
ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_name);
|
||||
|
||||
if (tokenizer_name == "llama") {
|
||||
if (tokenizer_name == "no_vocab") {
|
||||
vocab.type = LLAMA_VOCAB_TYPE_NONE;
|
||||
|
||||
// default special tokens
|
||||
vocab.special_bos_id = -1;
|
||||
vocab.special_eos_id = -1;
|
||||
vocab.special_unk_id = -1;
|
||||
vocab.special_sep_id = -1;
|
||||
vocab.special_pad_id = -1;
|
||||
vocab.linefeed_id = -1;
|
||||
|
||||
return;
|
||||
} else if (tokenizer_name == "llama") {
|
||||
vocab.type = LLAMA_VOCAB_TYPE_SPM;
|
||||
|
||||
// default special tokens
|
||||
@@ -3734,6 +3791,23 @@ static void llm_load_vocab(
|
||||
}
|
||||
}
|
||||
|
||||
const int token_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_LIST).c_str());
|
||||
if (token_idx == -1) {
|
||||
throw std::runtime_error("cannot find tokenizer vocab in model file\n");
|
||||
}
|
||||
|
||||
const float * scores = nullptr;
|
||||
const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str());
|
||||
if (score_idx != -1) {
|
||||
scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
|
||||
}
|
||||
|
||||
const int * toktypes = nullptr;
|
||||
const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
|
||||
if (toktype_idx != -1) {
|
||||
toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
|
||||
}
|
||||
|
||||
const uint32_t n_vocab = gguf_get_arr_n(ctx, token_idx);
|
||||
|
||||
vocab.id_to_token.resize(n_vocab);
|
||||
@@ -3929,6 +4003,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
|
||||
LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
|
||||
LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
|
||||
LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
|
||||
LLAMA_LOG_INFO("%s: f_logit_scale = %.1e\n", __func__, hparams.f_logit_scale);
|
||||
LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
|
||||
LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
|
||||
LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
|
||||
@@ -4903,6 +4978,37 @@ static bool llm_load_tensors(
|
||||
layer.ssm_out = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd});
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_COMMAND_R:
|
||||
{
|
||||
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
||||
|
||||
// output
|
||||
{
|
||||
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
||||
// init output from the input tok embed
|
||||
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
||||
ml.n_created--; // artificial tensor
|
||||
ml.size_data += ggml_nbytes(model.output);
|
||||
}
|
||||
|
||||
for (int i = 0; i < n_layer; ++i) {
|
||||
ggml_context * ctx_layer = ctx_for_layer(i);
|
||||
ggml_context * ctx_split = ctx_for_layer_split(i);
|
||||
|
||||
auto & layer = model.layers[i];
|
||||
|
||||
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
||||
|
||||
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
|
||||
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
|
||||
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
|
||||
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
||||
|
||||
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
||||
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
||||
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
||||
}
|
||||
} break;
|
||||
default:
|
||||
throw std::runtime_error("unknown architecture");
|
||||
}
|
||||
@@ -5023,7 +5129,8 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
|
||||
|
||||
llm_load_print_meta(ml, model);
|
||||
|
||||
if (model.hparams.n_vocab != model.vocab.id_to_token.size()) {
|
||||
if (model.vocab.type != LLAMA_VOCAB_TYPE_NONE &&
|
||||
model.hparams.n_vocab != model.vocab.id_to_token.size()) {
|
||||
throw std::runtime_error("vocab size mismatch");
|
||||
}
|
||||
|
||||
@@ -5048,6 +5155,16 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_SYCL
|
||||
if (params.split_mode == LLAMA_SPLIT_MODE_NONE) {
|
||||
ggml_backend_sycl_set_single_device_mode(params.main_gpu);
|
||||
//SYCL use device index (0, 1, 2) directly, uer input device id, then convert to device index.
|
||||
params.main_gpu = ggml_backend_sycl_get_device_index(params.main_gpu);
|
||||
} else {
|
||||
ggml_backend_sycl_set_mul_device_mode();
|
||||
}
|
||||
#endif
|
||||
|
||||
if (!llm_load_tensors(
|
||||
ml, model, params.n_gpu_layers, params.split_mode, params.main_gpu, params.tensor_split, params.use_mlock,
|
||||
params.progress_callback, params.progress_callback_user_data
|
||||
@@ -5842,6 +5959,12 @@ struct llm_build_context {
|
||||
}
|
||||
|
||||
cur = ggml_add(ctx0, cur, ffn_inp);
|
||||
cb(cur, "ffn_out", il);
|
||||
|
||||
ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
|
||||
if (layer_dir != nullptr) {
|
||||
cur = ggml_add(ctx0, cur, layer_dir);
|
||||
}
|
||||
cb(cur, "l_out", il);
|
||||
|
||||
// input for next layer
|
||||
@@ -5877,7 +6000,7 @@ struct llm_build_context {
|
||||
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
||||
|
||||
// inp_pos - contains the positions
|
||||
struct ggml_tensor * inp_pos = build_inp_pos();
|
||||
struct ggml_tensor * inp_pos = model.type == MODEL_7B ? build_inp_pos() : nullptr;
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
||||
@@ -5927,7 +6050,6 @@ struct llm_build_context {
|
||||
cb(Qcur, "Qcur", il);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
||||
|
||||
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
||||
model.layers[il].wo, NULL,
|
||||
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
||||
@@ -8289,6 +8411,121 @@ struct llm_build_context {
|
||||
|
||||
return gf;
|
||||
}
|
||||
|
||||
struct ggml_cgraph * build_command_r() {
|
||||
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
const float f_logit_scale = hparams.f_logit_scale;
|
||||
|
||||
struct ggml_tensor * cur;
|
||||
struct ggml_tensor * inpL;
|
||||
|
||||
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
||||
|
||||
// inp_pos - contains the positions
|
||||
struct ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
|
||||
// norm
|
||||
cur = llm_build_norm(ctx0, inpL, hparams,
|
||||
model.layers[il].attn_norm, NULL,
|
||||
LLM_NORM, cb, il);
|
||||
cb(cur, "attn_norm", il);
|
||||
struct ggml_tensor * ffn_inp = cur;
|
||||
|
||||
// self-attention
|
||||
{
|
||||
// compute Q and K and RoPE them
|
||||
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
||||
cb(Qcur, "Qcur", il);
|
||||
if (model.layers[il].bq) {
|
||||
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
|
||||
cb(Qcur, "Qcur", il);
|
||||
}
|
||||
|
||||
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
|
||||
cb(Kcur, "Kcur", il);
|
||||
if (model.layers[il].bk) {
|
||||
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
|
||||
cb(Kcur, "Kcur", il);
|
||||
}
|
||||
|
||||
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
||||
cb(Vcur, "Vcur", il);
|
||||
if (model.layers[il].bv) {
|
||||
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
|
||||
cb(Vcur, "Vcur", il);
|
||||
}
|
||||
|
||||
Qcur = ggml_rope_custom(
|
||||
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
||||
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
||||
Kcur = ggml_rope_custom(
|
||||
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
||||
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
||||
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
||||
model.layers[il].wo, model.layers[il].bo,
|
||||
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
||||
}
|
||||
|
||||
struct ggml_tensor * attn_out = cur;
|
||||
|
||||
// feed-forward network
|
||||
{
|
||||
cur = llm_build_ffn(ctx0, ffn_inp,
|
||||
model.layers[il].ffn_up, NULL,
|
||||
model.layers[il].ffn_gate, NULL,
|
||||
model.layers[il].ffn_down, NULL,
|
||||
NULL,
|
||||
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
}
|
||||
|
||||
// add together residual + FFN + self-attention
|
||||
cur = ggml_add(ctx0, cur, inpL);
|
||||
cur = ggml_add(ctx0, cur, attn_out);
|
||||
cb(cur, "l_out", il);
|
||||
|
||||
// input for next layer
|
||||
inpL = cur;
|
||||
}
|
||||
|
||||
cur = inpL;
|
||||
|
||||
cur = llm_build_norm(ctx0, cur, hparams,
|
||||
model.output_norm, NULL,
|
||||
LLM_NORM, cb, -1);
|
||||
cb(cur, "result_norm", -1);
|
||||
|
||||
// lm_head
|
||||
cur = ggml_mul_mat(ctx0, model.output, cur);
|
||||
|
||||
if (f_logit_scale) {
|
||||
cur = ggml_scale(ctx0, cur, f_logit_scale);
|
||||
}
|
||||
|
||||
cb(cur, "result_output", -1);
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
|
||||
return gf;
|
||||
|
||||
}
|
||||
};
|
||||
|
||||
static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
|
||||
@@ -8471,6 +8708,10 @@ static struct ggml_cgraph * llama_build_graph(
|
||||
{
|
||||
result = llm.build_mamba();
|
||||
} break;
|
||||
case LLM_ARCH_COMMAND_R:
|
||||
{
|
||||
result = llm.build_command_r();
|
||||
} break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
@@ -9361,26 +9602,32 @@ static enum llama_vocab_type llama_vocab_get_type(const llama_vocab & vocab) {
|
||||
}
|
||||
|
||||
static bool llama_is_normal_token(const llama_vocab & vocab, llama_token id) {
|
||||
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
|
||||
return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_NORMAL;
|
||||
}
|
||||
|
||||
static bool llama_is_unknown_token(const llama_vocab & vocab, llama_token id) {
|
||||
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
|
||||
return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_UNKNOWN;
|
||||
}
|
||||
|
||||
static bool llama_is_control_token(const llama_vocab & vocab, llama_token id) {
|
||||
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
|
||||
return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_CONTROL;
|
||||
}
|
||||
|
||||
static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
|
||||
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
|
||||
return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_BYTE;
|
||||
}
|
||||
|
||||
static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id) {
|
||||
GGML_ASSERT(vocab.type != LLAMA_VOCAB_TYPE_NONE);
|
||||
return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_USER_DEFINED;
|
||||
}
|
||||
|
||||
static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
|
||||
GGML_ASSERT(llama_vocab_get_type(vocab) != LLAMA_VOCAB_TYPE_NONE);
|
||||
GGML_ASSERT(llama_is_byte_token(vocab, id));
|
||||
const auto& token_data = vocab.id_to_token.at(id);
|
||||
switch (llama_vocab_get_type(vocab)) {
|
||||
@@ -9401,6 +9648,7 @@ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
|
||||
}
|
||||
|
||||
static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
|
||||
GGML_ASSERT(llama_vocab_get_type(vocab) != LLAMA_VOCAB_TYPE_NONE);
|
||||
static const char * hex = "0123456789ABCDEF";
|
||||
switch (llama_vocab_get_type(vocab)) {
|
||||
case LLAMA_VOCAB_TYPE_SPM: {
|
||||
@@ -10232,6 +10480,8 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
||||
}
|
||||
}
|
||||
} break;
|
||||
case LLAMA_VOCAB_TYPE_NONE:
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
|
||||
return output;
|
||||
@@ -11952,7 +12202,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
|
||||
return new_type;
|
||||
}
|
||||
|
||||
static int32_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int chunk_size, int nrows, int n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
|
||||
static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int chunk_size, int nrows, int n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
|
||||
std::mutex mutex;
|
||||
int counter = 0;
|
||||
size_t new_size = 0;
|
||||
@@ -12896,23 +13146,22 @@ struct llama_context * llama_new_context_with_model(
|
||||
if (model->n_gpu_layers > 0) {
|
||||
// with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
|
||||
if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
|
||||
int main_gpu_index = ggml_backend_sycl_get_device_index(model->main_gpu);
|
||||
ggml_backend_t backend = ggml_backend_sycl_init(main_gpu_index);
|
||||
ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
|
||||
if (backend == nullptr) {
|
||||
LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d)backend\n", __func__, model->main_gpu, main_gpu_index);
|
||||
int main_gpu_id = ggml_backend_sycl_get_device_id(model->main_gpu);
|
||||
LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, main_gpu_id, model->main_gpu);
|
||||
llama_free(ctx);
|
||||
return nullptr;
|
||||
}
|
||||
ctx->backends.push_back(backend);
|
||||
} else {
|
||||
// LLAMA_SPLIT_LAYER requires a backend for each GPU
|
||||
int id_list[GGML_SYCL_MAX_DEVICES];
|
||||
ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);
|
||||
for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
|
||||
int device_id = id_list[i];
|
||||
ggml_backend_t backend = ggml_backend_sycl_init(i);
|
||||
if (backend == nullptr) {
|
||||
LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d)backend\n", __func__, device_id, i);
|
||||
int id_list[GGML_SYCL_MAX_DEVICES];
|
||||
ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);
|
||||
LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, id_list[i], i);
|
||||
llama_free(ctx);
|
||||
return nullptr;
|
||||
}
|
||||
@@ -13113,6 +13362,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
|
||||
case LLM_ARCH_ORION:
|
||||
case LLM_ARCH_INTERNLM2:
|
||||
case LLM_ARCH_MINICPM:
|
||||
case LLM_ARCH_COMMAND_R:
|
||||
return LLAMA_ROPE_TYPE_NORM;
|
||||
|
||||
// the pairs of head values are offset by n_rot/2
|
||||
@@ -13138,7 +13388,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
|
||||
}
|
||||
|
||||
int32_t llama_n_vocab(const struct llama_model * model) {
|
||||
return model->vocab.id_to_token.size();
|
||||
return model->hparams.n_vocab;
|
||||
}
|
||||
|
||||
int32_t llama_n_ctx_train(const struct llama_model * model) {
|
||||
@@ -13149,6 +13399,10 @@ int32_t llama_n_embd(const struct llama_model * model) {
|
||||
return model->hparams.n_embd;
|
||||
}
|
||||
|
||||
int32_t llama_n_layer(const struct llama_model * model) {
|
||||
return model->hparams.n_layer;
|
||||
}
|
||||
|
||||
float llama_rope_freq_scale_train(const struct llama_model * model) {
|
||||
return model->hparams.rope_freq_scale_train;
|
||||
}
|
||||
@@ -13248,6 +13502,96 @@ int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const
|
||||
}
|
||||
}
|
||||
|
||||
static bool llama_control_vector_init(struct llama_control_vector & cvec, const llama_model & model) {
|
||||
GGML_ASSERT(cvec.tensors.empty());
|
||||
GGML_ASSERT(cvec.ctxs.empty());
|
||||
GGML_ASSERT(cvec.bufs.empty());
|
||||
|
||||
// count layer buffer types
|
||||
std::map<ggml_backend_buffer_type_t, int> buft_layer_count;
|
||||
for (int64_t i = 0; i < model.hparams.n_layer; i++) {
|
||||
buft_layer_count[model.buft_layer[i].buft]++;
|
||||
}
|
||||
|
||||
// allocate contexts
|
||||
std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
|
||||
for (auto & it : buft_layer_count) {
|
||||
int n_layers = it.second;
|
||||
struct ggml_init_params params = {
|
||||
/*.mem_size =*/ n_layers * ggml_tensor_overhead(),
|
||||
/*.mem_buffer =*/ NULL,
|
||||
/*.no_alloc =*/ true,
|
||||
};
|
||||
ggml_context * ctx = ggml_init(params);
|
||||
if (!ctx) {
|
||||
LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
ctx_map[it.first] = ctx;
|
||||
}
|
||||
|
||||
// make tensors
|
||||
cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0
|
||||
for (size_t il = 1; il < model.hparams.n_layer; il++) {
|
||||
struct ggml_context * ctx = ctx_map.at(model.buft_layer[il].buft);
|
||||
ggml_tensor * tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd);
|
||||
cvec.tensors.push_back(tensor);
|
||||
}
|
||||
|
||||
// allocate tensors / buffers and zero
|
||||
for (auto it : ctx_map) {
|
||||
ggml_backend_buffer_type_t buft = it.first;
|
||||
ggml_context * ctx = it.second;
|
||||
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
|
||||
if (!buf) {
|
||||
LLAMA_LOG_ERROR("%s: failed to allocate buffer for control vector\n", __func__);
|
||||
return false;
|
||||
}
|
||||
ggml_backend_buffer_clear(buf, 0);
|
||||
cvec.ctxs.push_back(ctx);
|
||||
cvec.bufs.push_back(buf);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
int32_t llama_control_vector_apply(struct llama_context * lctx, const float * data, size_t len, int32_t n_embd, int32_t il_start, int32_t il_end) {
|
||||
const llama_model & model = lctx->model;
|
||||
llama_control_vector & cvec = lctx->cvec;
|
||||
|
||||
if (data == nullptr) {
|
||||
// disable the current control vector (but leave allocated for later)
|
||||
cvec.layer_start = -1;
|
||||
cvec.layer_end = -1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (n_embd != (int) model.hparams.n_embd) {
|
||||
LLAMA_LOG_ERROR("%s: control vector n_embd does not match model\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (cvec.tensors.empty()) {
|
||||
if (!llama_control_vector_init(cvec, model)) {
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
cvec.layer_start = il_start;
|
||||
cvec.layer_end = il_end;
|
||||
|
||||
for (size_t il = 1; il < model.hparams.n_layer; il++) {
|
||||
assert(cvec.tensors[il] != nullptr);
|
||||
|
||||
const size_t off = n_embd * (il - 1); // buffer doesn't have data for layer 0, since it's never present
|
||||
if (off + n_embd <= len) {
|
||||
ggml_backend_tensor_set(cvec.tensors[il], data + off, 0, n_embd * ggml_element_size(cvec.tensors[il]));
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_seq_max) {
|
||||
struct llama_kv_cache_view result = {
|
||||
/*.n_cells = */ 0,
|
||||
@@ -13962,14 +14306,17 @@ float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id
|
||||
}
|
||||
|
||||
const char * llama_token_get_text(const struct llama_model * model, llama_token token) {
|
||||
GGML_ASSERT(model->vocab.type != LLAMA_VOCAB_TYPE_NONE);
|
||||
return model->vocab.id_to_token[token].text.c_str();
|
||||
}
|
||||
|
||||
float llama_token_get_score(const struct llama_model * model, llama_token token) {
|
||||
GGML_ASSERT(model->vocab.type != LLAMA_VOCAB_TYPE_NONE);
|
||||
return model->vocab.id_to_token[token].score;
|
||||
}
|
||||
|
||||
llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token) {
|
||||
GGML_ASSERT(model->vocab.type != LLAMA_VOCAB_TYPE_NONE);
|
||||
return model->vocab.id_to_token[token].type;
|
||||
}
|
||||
|
||||
@@ -14214,6 +14561,26 @@ static int32_t llama_chat_apply_template_internal(
|
||||
if (add_ass) {
|
||||
ss << "<start_of_turn>model\n";
|
||||
}
|
||||
} else if (tmpl == "orion" || tmpl.find("'\\n\\nAssistant: ' + eos_token") != std::string::npos) {
|
||||
// OrionStarAI/Orion-14B-Chat
|
||||
std::string system_prompt = "";
|
||||
for (auto message : chat) {
|
||||
std::string role(message->role);
|
||||
if (role == "system") {
|
||||
// there is no system message support, we will merge it with user prompt
|
||||
system_prompt = message->content;
|
||||
continue;
|
||||
} else if (role == "user") {
|
||||
ss << "Human: ";
|
||||
if (!system_prompt.empty()) {
|
||||
ss << system_prompt << "\n\n";
|
||||
system_prompt = "";
|
||||
}
|
||||
ss << message->content << "\n\nAssistant: </s>";
|
||||
} else {
|
||||
ss << message->content << "</s>";
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// template not supported
|
||||
return -1;
|
||||
|
||||
30
llama.h
30
llama.h
@@ -59,9 +59,10 @@ extern "C" {
|
||||
typedef int32_t llama_seq_id;
|
||||
|
||||
enum llama_vocab_type {
|
||||
LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece
|
||||
LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding
|
||||
LLAMA_VOCAB_TYPE_WPM = 2, // WordPiece
|
||||
LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
|
||||
LLAMA_VOCAB_TYPE_SPM = 1, // SentencePiece
|
||||
LLAMA_VOCAB_TYPE_BPE = 2, // Byte Pair Encoding
|
||||
LLAMA_VOCAB_TYPE_WPM = 3, // WordPiece
|
||||
};
|
||||
|
||||
// note: these values should be synchronized with ggml_rope
|
||||
@@ -387,6 +388,7 @@ extern "C" {
|
||||
LLAMA_API int32_t llama_n_vocab (const struct llama_model * model);
|
||||
LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model);
|
||||
LLAMA_API int32_t llama_n_embd (const struct llama_model * model);
|
||||
LLAMA_API int32_t llama_n_layer (const struct llama_model * model);
|
||||
|
||||
// Get the model's RoPE frequency scaling factor
|
||||
LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model);
|
||||
@@ -434,10 +436,24 @@ extern "C" {
|
||||
// Returns 0 on success
|
||||
LLAMA_API int32_t llama_model_apply_lora_from_file(
|
||||
const struct llama_model * model,
|
||||
const char * path_lora,
|
||||
float scale,
|
||||
const char * path_base_model,
|
||||
int32_t n_threads);
|
||||
const char * path_lora,
|
||||
float scale,
|
||||
const char * path_base_model,
|
||||
int32_t n_threads);
|
||||
|
||||
// Apply a loaded control vector to a llama_context, or if data is NULL, clear
|
||||
// the currently loaded vector.
|
||||
// n_embd should be the size of a single layer's control, and data should point
|
||||
// to an n_embd x n_layers buffer starting from layer 1.
|
||||
// il_start and il_end are the layer range the vector should apply to (both inclusive)
|
||||
// See llama_control_vector_load in common to load a control vector.
|
||||
LLAMA_API int32_t llama_control_vector_apply(
|
||||
struct llama_context * lctx,
|
||||
const float * data,
|
||||
size_t len,
|
||||
int32_t n_embd,
|
||||
int32_t il_start,
|
||||
int32_t il_end);
|
||||
|
||||
//
|
||||
// KV cache
|
||||
|
||||
@@ -31,6 +31,8 @@ int main(void) {
|
||||
"{% for message in messages %}{{bos_token + message['role'] + '\\n' + message['content'] + eos_token + '\\n'}}{% endfor %}{% if add_generation_prompt %}{{ bos_token + 'assistant\\n' }}{% endif %}",
|
||||
// google/gemma-7b-it
|
||||
"{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '<start_of_turn>' + role + '\\n' + message['content'] | trim + '<end_of_turn>\\n' }}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model\\n'}}{% endif %}",
|
||||
// OrionStarAI/Orion-14B-Chat
|
||||
"{% for message in messages %}{% if loop.first %}{{ bos_token }}{% endif %}{% if message['role'] == 'user' %}{{ 'Human: ' + message['content'] + '\\n\\nAssistant: ' + eos_token }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}{% endif %}{% endfor %}",
|
||||
};
|
||||
std::vector<std::string> expected_output = {
|
||||
// teknium/OpenHermes-2.5-Mistral-7B
|
||||
@@ -45,6 +47,8 @@ int main(void) {
|
||||
"system\nYou are a helpful assistant</s>\n<s>user\nHello</s>\n<s>assistant\nHi there</s>\n<s>user\nWho are you</s>\n<s>assistant\n I am an assistant </s>\n<s>user\nAnother question</s>\n<s>assistant\n",
|
||||
// google/gemma-7b-it
|
||||
"<start_of_turn>user\nYou are a helpful assistant\n\nHello<end_of_turn>\n<start_of_turn>model\nHi there<end_of_turn>\n<start_of_turn>user\nWho are you<end_of_turn>\n<start_of_turn>model\nI am an assistant<end_of_turn>\n<start_of_turn>user\nAnother question<end_of_turn>\n<start_of_turn>model\n",
|
||||
// OrionStarAI/Orion-14B-Chat
|
||||
"Human: You are a helpful assistant\n\nHello\n\nAssistant: </s>Hi there</s>Human: Who are you\n\nAssistant: </s> I am an assistant </s>Human: Another question\n\nAssistant: </s>",
|
||||
};
|
||||
std::vector<char> formatted_chat(1024);
|
||||
int32_t res;
|
||||
|
||||
Reference in New Issue
Block a user