Mirror of https://github.com/ggerganov/llama.cpp.git (synced 2026-02-05 13:53:23 +02:00)
Compare commits
15 Commits
| SHA1 |
|---|
| aa719c2f88 |
| 4cdd0bb453 |
| b5bd037832 |
| dfcd53f7ec |
| 4ea00794b8 |
| 02a6a82ae7 |
| c498fc82fe |
| e7a5130a20 |
| bee378e098 |
| 5fb557653b |
| 4ae88d07d0 |
| e789095502 |
| f2a789e334 |
| 3a59971967 |
| 63b54c81a6 |
@@ -2,10 +2,10 @@ ARG GCC_VERSION=15.2.0
 ARG UBUNTU_VERSION=24.04

 ### Build Llama.cpp stage
-FROM --platform=linux/s390x gcc:${GCC_VERSION} AS build
+FROM gcc:${GCC_VERSION} AS build

-RUN --mount=type=cache,target=/var/cache/apt \
-    --mount=type=cache,target=/var/lib/apt/lists \
+RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
+    --mount=type=cache,target=/var/lib/apt/lists,sharing=locked \
     apt update -y && \
     apt upgrade -y && \
     apt install -y --no-install-recommends \
@@ -40,7 +40,7 @@ COPY requirements /opt/llama.cpp/gguf-py/requirements


 ### Collect all llama.cpp binaries, libraries and distro libraries
-FROM --platform=linux/s390x scratch AS collector
+FROM scratch AS collector

 # Copy llama.cpp binaries and libraries
 COPY --from=build /opt/llama.cpp/bin /llama.cpp/bin
@@ -49,13 +49,14 @@ COPY --from=build /opt/llama.cpp/gguf-py /llama.cpp/gguf-py


 ### Base image
-FROM --platform=linux/s390x ubuntu:${UBUNTU_VERSION} AS base
+FROM ubuntu:${UBUNTU_VERSION} AS base

-RUN --mount=type=cache,target=/var/cache/apt \
-    --mount=type=cache,target=/var/lib/apt/lists \
+RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
+    --mount=type=cache,target=/var/lib/apt/lists,sharing=locked \
     apt update -y && \
     apt install -y --no-install-recommends \
     # WARNING: Do not use libopenblas-openmp-dev. libopenblas-dev is faster.
     # See: https://github.com/ggml-org/llama.cpp/pull/15915#issuecomment-3317166506
         curl libgomp1 libopenblas-dev && \
     apt autoremove -y && \
     apt clean -y && \
@@ -68,13 +69,13 @@ COPY --from=collector /llama.cpp/lib /usr/lib/s390x-linux-gnu


 ### Full
-FROM --platform=linux/s390x base AS full
+FROM base AS full

 ENV PATH="/root/.cargo/bin:${PATH}"
 WORKDIR /app

-RUN --mount=type=cache,target=/var/cache/apt \
-    --mount=type=cache,target=/var/lib/apt/lists \
+RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
+    --mount=type=cache,target=/var/lib/apt/lists,sharing=locked \
     apt update -y && \
     apt install -y \
         git cmake libjpeg-dev \
@@ -97,7 +98,7 @@ ENTRYPOINT [ "/app/tools.sh" ]


 ### CLI Only
-FROM --platform=linux/s390x base AS light
+FROM base AS light

 WORKDIR /llama.cpp/bin

@@ -108,7 +109,7 @@ ENTRYPOINT [ "/llama.cpp/bin/llama-cli" ]


 ### Server
-FROM --platform=linux/s390x base AS server
+FROM base AS server

 ENV LLAMA_ARG_HOST=0.0.0.0
.github/workflows/build.yml (89 changes, vendored)
@@ -1251,56 +1251,129 @@ jobs:
   # TODO: simplify the following workflows using a matrix
   # TODO: run lighter CI on PRs and the full CI only on master (if needed)
   ggml-ci-x64-cpu-low-perf:
-    runs-on: [self-hosted, Linux, X64, CPU, low-perf]
+    runs-on: ubuntu-22.04

     steps:
       - name: Clone
         id: checkout
         uses: actions/checkout@v4

       - name: ccache
         uses: ggml-org/ccache-action@v1.2.16
         with:
           key: ggml-ci-x64-cpu-low-perf
           evict-old-files: 1d

       - name: Dependencies
         id: depends
         run: |
           sudo apt-get update
           sudo apt-get install build-essential libcurl4-openssl-dev

       - name: Test
         id: ggml-ci
         run: |
-          bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
+          LLAMA_ARG_THREADS=$(nproc) GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt

   ggml-ci-arm64-cpu-low-perf:
-    runs-on: [self-hosted, Linux, ARM64, CPU, low-perf]
+    runs-on: ubuntu-22.04-arm

     steps:
       - name: Clone
         id: checkout
         uses: actions/checkout@v4

       - name: ccache
         uses: ggml-org/ccache-action@v1.2.16
         with:
           key: ggml-ci-arm64-cpu-low-perf
           evict-old-files: 1d

       - name: Dependencies
         id: depends
         run: |
           sudo apt-get update
           sudo apt-get install build-essential libcurl4-openssl-dev

       - name: Test
         id: ggml-ci
         run: |
-          bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
+          LLAMA_ARG_THREADS=$(nproc) GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt

   ggml-ci-x64-cpu-high-perf:
-    runs-on: [self-hosted, Linux, X64, CPU, high-perf]
+    runs-on: ubuntu-22.04

     steps:
       - name: Clone
         id: checkout
         uses: actions/checkout@v4

       - name: ccache
         uses: ggml-org/ccache-action@v1.2.16
         with:
           key: ggml-ci-x64-cpu-high-perf
           evict-old-files: 1d

       - name: Dependencies
         id: depends
         run: |
           sudo apt-get update
           sudo apt-get install build-essential libcurl4-openssl-dev

       - name: Test
         id: ggml-ci
         run: |
-          bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
+          LLAMA_ARG_THREADS=$(nproc) bash ./ci/run.sh ./tmp/results ./tmp/mnt

   ggml-ci-arm64-cpu-high-perf:
-    runs-on: [self-hosted, Linux, ARM64, CPU, high-perf]
+    runs-on: ubuntu-22.04-arm

     steps:
       - name: Clone
         id: checkout
         uses: actions/checkout@v4

       - name: ccache
         uses: ggml-org/ccache-action@v1.2.16
         with:
           key: ggml-ci-arm64-cpu-high-perf
           evict-old-files: 1d

       - name: Dependencies
         id: depends
         run: |
           sudo apt-get update
           sudo apt-get install build-essential libcurl4-openssl-dev

       - name: Test
         id: ggml-ci
         run: |
-          GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
+          LLAMA_ARG_THREADS=$(nproc) GG_BUILD_NO_SVE=1 GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
+
+  ggml-ci-arm64-cpu-high-perf-sve:
+    runs-on: ubuntu-22.04-arm
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: ccache
+        uses: ggml-org/ccache-action@v1.2.16
+        with:
+          key: ggml-ci-arm64-cpu-high-perf-sve
+          evict-old-files: 1d
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential libcurl4-openssl-dev
+
+      - name: Test
+        id: ggml-ci
+        run: |
+          LLAMA_ARG_THREADS=$(nproc) GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt

   ggml-ci-x64-nvidia-cuda:
     runs-on: [self-hosted, Linux, X64, NVIDIA]
@@ -63,7 +63,7 @@
 /ggml/src/ggml-quants.* @ggerganov
 /ggml/src/ggml-threading.* @ggerganov @slaren
 /ggml/src/ggml-vulkan/ @0cc4m
-/ggml/src/ggml-zdnn/ @taronaeo
+/ggml/src/ggml-zdnn/ @taronaeo @Andreas-Krebbel @AlekseiNikiforovIBM
 /ggml/src/ggml.c @ggerganov @slaren
 /ggml/src/ggml.cpp @ggerganov @slaren
 /ggml/src/gguf.cpp @JohannesGaessler @Green-Sky
@@ -25,7 +25,7 @@ The project differentiates between 3 levels of contributors:
 - Squash-merge PRs
 - Use the following format for the squashed commit title: `<module> : <commit title> (#<issue_number>)`. For example: `utils : fix typo in utils.py (#1234)`
 - Optionally pick a `<module>` from here: https://github.com/ggml-org/llama.cpp/wiki/Modules
-- Let other maintainers, merge their own PRs
+- Let other maintainers merge their own PRs
 - When merging a PR, make sure you have a good understanding of the changes
 - Be mindful of maintenance: most of the work going into a feature happens after the PR is merged. If the PR author is not committed to contribute long-term, someone else needs to take responsibility (you)
ci/run.sh (27 changes)
@@ -109,6 +109,11 @@ if [ ! -z ${GG_BUILD_MUSA} ]; then
     MUSA_ARCH=${MUSA_ARCH:-21}
     CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_MUSA=ON -DMUSA_ARCHITECTURES=${MUSA_ARCH}"
 fi

+if [ ! -z ${GG_BUILD_NO_SVE} ]; then
+    # arm 9 and newer enables sve by default, adjust these flags depending on the cpu used
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv8.5-a+fp16+i8mm"
+fi
+
 ## helpers

 # download a file if it does not exist or if it is outdated
@@ -345,16 +350,16 @@ function gg_run_qwen3_0_6b {

     wiki_test="${path_wiki}/wiki.test.raw"

-    ./bin/llama-quantize ${model_bf16} ${model_q8_0} q8_0
-    ./bin/llama-quantize ${model_bf16} ${model_q4_0} q4_0
-    ./bin/llama-quantize ${model_bf16} ${model_q4_1} q4_1
-    ./bin/llama-quantize ${model_bf16} ${model_q5_0} q5_0
-    ./bin/llama-quantize ${model_bf16} ${model_q5_1} q5_1
-    ./bin/llama-quantize ${model_bf16} ${model_q2_k} q2_k
-    ./bin/llama-quantize ${model_bf16} ${model_q3_k} q3_k
-    ./bin/llama-quantize ${model_bf16} ${model_q4_k} q4_k
-    ./bin/llama-quantize ${model_bf16} ${model_q5_k} q5_k
-    ./bin/llama-quantize ${model_bf16} ${model_q6_k} q6_k
+    ./bin/llama-quantize ${model_bf16} ${model_q8_0} q8_0 $(nproc)
+    ./bin/llama-quantize ${model_bf16} ${model_q4_0} q4_0 $(nproc)
+    ./bin/llama-quantize ${model_bf16} ${model_q4_1} q4_1 $(nproc)
+    ./bin/llama-quantize ${model_bf16} ${model_q5_0} q5_0 $(nproc)
+    ./bin/llama-quantize ${model_bf16} ${model_q5_1} q5_1 $(nproc)
+    ./bin/llama-quantize ${model_bf16} ${model_q2_k} q2_k $(nproc)
+    ./bin/llama-quantize ${model_bf16} ${model_q3_k} q3_k $(nproc)
+    ./bin/llama-quantize ${model_bf16} ${model_q4_k} q4_k $(nproc)
+    ./bin/llama-quantize ${model_bf16} ${model_q5_k} q5_k $(nproc)
+    ./bin/llama-quantize ${model_bf16} ${model_q6_k} q6_k $(nproc)

     (time ./bin/llama-cli -no-cnv --model ${model_f16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
     (time ./bin/llama-cli -no-cnv --model ${model_bf16} -ngl 99 -c 1024 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-bf16.log
@@ -427,7 +432,7 @@ function gg_run_qwen3_0_6b {
 function gg_sum_qwen3_0_6b {
     gg_printf '### %s\n\n' "${ci}"

-    gg_printf 'Pythia 2.8B:\n'
+    gg_printf 'Qwen3 0.6B:\n'
     gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
     gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
     gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
@@ -961,15 +961,13 @@ struct common_init_result common_init_from_params(common_params & params) {

         bool has_eos = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
         bool has_sep = llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
+        bool has_rerank_prompt = llama_model_chat_template(model, "rerank") != NULL;

-        if (!has_eos && !has_sep) {
-            LOG_WRN("%s: warning: vocab does not have an EOS token or SEP token, reranking will not work\n", __func__);
+        if (!has_eos && !has_sep && !has_rerank_prompt) {
+            LOG_WRN("%s: warning: vocab does not have an EOS token, SEP token, or rerank prompt. Reranking will not work\n", __func__);
             ok = false;
         } else if (!has_eos) {
             LOG_WRN("%s: warning: vocab does not have an EOS token, using SEP token as fallback\n", __func__);
-        } else if (!has_sep) {
-            LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
-            ok = false;
         }

         if (!ok) {
@@ -332,6 +332,7 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
     }
     if (ctx) {
         llama_perf_context_print(ctx);
+        llama_memory_breakdown_print(ctx);
     }
 }
@@ -3717,11 +3717,29 @@ class Qwen2MoeModel(TextModel):
 class Qwen3Model(Qwen2Model):
     model_arch = gguf.MODEL_ARCH.QWEN3

+    # extra logic for rerank models
+    is_rerank: bool = False
+    is_tied_embeddings: bool = False
+    token_false_id: int | None = None
+    token_true_id: int | None = None
+
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)

         # track for intern-s1-mini
         hparams = ModelBase.load_hparams(self.dir_model, is_mistral_format=False)
         self.origin_hf_arch = hparams.get('architectures', [None])[0]

+        # a bit hacky, but currently the only way to detect if this is a rerank model
+        # ref: https://huggingface.co/Qwen/Qwen3-Reranker-0.6B
+        readme_path = self.dir_model / "README.md"
+        readme_text = ""
+        if readme_path.exists():
+            with readme_path.open("r", encoding="utf-8") as f:
+                readme_text = f.read()
+        if "# Qwen3-Reranker" in readme_text:
+            self._find_rerank_config()
+
     def set_vocab(self):
         # deal with intern-s1-mini
         if self.origin_hf_arch == 'InternS1ForConditionalGeneration':
@@ -3730,6 +3748,53 @@ class Qwen3Model(Qwen2Model):

         super().set_vocab()

+    def _find_rerank_config(self):
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
+
+        self.is_rerank = True
+        self.is_tied_embeddings = self.hparams.get("tie_word_embeddings", False)
+        self.token_false_id = tokenizer.convert_tokens_to_ids("no")
+        self.token_true_id = tokenizer.convert_tokens_to_ids("yes")
+        self.sep_token_id = tokenizer.convert_tokens_to_ids("|")
+
+        assert self.token_false_id is not None and self.token_true_id is not None
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        if self.is_rerank:
+            self.gguf_writer.add_pooling_type(gguf.PoolingType.RANK)
+            self.gguf_writer.add_classifier_output_labels(["yes", "no"])
+            self.gguf_writer.add_chat_template([{
+                "name": "rerank",
+                "template": "<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be \"yes\" or \"no\".<|im_end|>\n"
+                            "<|im_start|>user\n<Instruct>: Given a web search query, retrieve relevant passages that answer the query\n<Query>: {query}\n<Document>: {document}<|im_end|>\n"
+                            "<|im_start|>assistant\n<think>\n\n</think>\n\n"
+            }])
+
+    def _get_cls_out_tensor(self, data_torch: Tensor) -> Tensor:
+        # extract "yes" and "no" tokens from the output lm_head tensor
+        false_row = data_torch[self.token_false_id]
+        true_row = data_torch[self.token_true_id]
+        return torch.stack([true_row, false_row], dim=0)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if self.is_rerank:
+            is_tied_head = self.is_tied_embeddings and "embed_tokens" in name
+            is_real_head = not self.is_tied_embeddings and "lm_head" in name
+            if is_tied_head or is_real_head:
+                cls_out_head = (
+                    gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.CLS_OUT] + ".weight",
+                    self._get_cls_out_tensor(data_torch),
+                )
+                if is_tied_head:
+                    embed = (self.map_tensor_name(name), data_torch)
+                    return [cls_out_head, embed]
+                if is_real_head:
+                    return [cls_out_head]
+
+        return super().modify_tensors(data_torch, name, bid)
+

 @ModelBase.register("Qwen3MoeForCausalLM")
 class Qwen3MoeModel(Qwen2MoeModel):
@@ -95,8 +95,13 @@ int main(int argc, char ** argv) {
         params.n_batch = params.n_ctx;
     }

-    // For non-causal models, batch size must be equal to ubatch size
-    params.n_ubatch = params.n_batch;
+    // for non-causal models, batch size must be equal to ubatch size
+    if (params.attention_type != LLAMA_ATTENTION_TYPE_CAUSAL) {
+        params.n_ubatch = params.n_batch;
+    }
+
+    // get max number of sequences per batch
+    const int n_seq_max = llama_max_parallel_sequences();

     llama_backend_init();
     llama_numa_init(params.numa);
@@ -144,6 +149,7 @@ int main(int argc, char ** argv) {
     // get added sep and eos token, if any
     const std::string added_sep_token = llama_vocab_get_add_sep(vocab) ? llama_vocab_get_text(vocab, llama_vocab_sep(vocab)) : "";
     const std::string added_eos_token = llama_vocab_get_add_eos(vocab) ? llama_vocab_get_text(vocab, llama_vocab_eos(vocab)) : "";
+    const char * rerank_prompt = llama_model_chat_template(model, "rerank");

     // tokenize the prompts and trim
     std::vector<std::vector<int32_t>> inputs;
@@ -153,21 +159,28 @@ int main(int argc, char ** argv) {
         // split classification pairs and insert expected separator tokens
         if (pooling_type == LLAMA_POOLING_TYPE_RANK && prompt.find(params.cls_sep) != std::string::npos) {
             std::vector<std::string> pairs = split_lines(prompt, params.cls_sep);
-            std::string final_prompt;
-
-            for (size_t i = 0; i < pairs.size(); i++) {
-                final_prompt += pairs[i];
-                if (i != pairs.size() - 1) {
-                    if (!added_eos_token.empty()) {
-                        final_prompt += added_eos_token;
-                    }
-                    if (!added_sep_token.empty()) {
-                        final_prompt += added_sep_token;
-                    }
-                }
-            }
-
-            inp = common_tokenize(ctx, final_prompt, true, true);
+            if (rerank_prompt != nullptr) {
+                const std::string query = pairs[0];
+                const std::string doc = pairs[1];
+                std::string final_prompt = rerank_prompt;
+                string_replace_all(final_prompt, "{query}" , query);
+                string_replace_all(final_prompt, "{document}", doc );
+                inp = common_tokenize(vocab, final_prompt, true, true);
+            } else {
+                std::string final_prompt;
+                for (size_t i = 0; i < pairs.size(); i++) {
+                    final_prompt += pairs[i];
+                    if (i != pairs.size() - 1) {
+                        if (!added_eos_token.empty()) {
+                            final_prompt += added_eos_token;
+                        }
+                        if (!added_sep_token.empty()) {
+                            final_prompt += added_sep_token;
+                        }
+                    }
+                }
+                inp = common_tokenize(ctx, final_prompt, true, true);
+            }
         } else {
             inp = common_tokenize(ctx, prompt, true, true);
         }
@@ -229,7 +242,7 @@ int main(int argc, char ** argv) {
         const uint64_t n_toks = inp.size();

         // encode if at capacity
-        if (batch.n_tokens + n_toks > n_batch) {
+        if (batch.n_tokens + n_toks > n_batch || s >= n_seq_max) {
             float * out = emb + e * n_embd;
             batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
             e += pooling_type == LLAMA_POOLING_TYPE_NONE ? batch.n_tokens : s;
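For reference, a minimal standalone sketch of the placeholder substitution performed above. The `replace_all` helper here is a local stand-in for llama.cpp's `string_replace_all`, and the template text is an excerpt of the "rerank" chat template added by the conversion script; the query/document strings are made-up sample data.

```cpp
#include <iostream>
#include <string>

// local stand-in for common's string_replace_all()
static void replace_all(std::string & s, const std::string & from, const std::string & to) {
    for (size_t pos = 0; (pos = s.find(from, pos)) != std::string::npos; pos += to.size()) {
        s.replace(pos, from.size(), to);
    }
}

int main() {
    std::string prompt =
        "<|im_start|>user\n<Instruct>: Given a web search query, retrieve relevant passages that answer the query\n"
        "<Query>: {query}\n<Document>: {document}<|im_end|>\n";

    replace_all(prompt, "{query}",    "what is a panda?");
    replace_all(prompt, "{document}", "The giant panda is a bear species endemic to China.");

    std::cout << prompt; // placeholders filled, ready to be tokenized (common_tokenize in the real tool)
    return 0;
}
```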
@@ -48,7 +48,7 @@ def main():
         print(f"Error: Model file not found: {model_path}")
         sys.exit(1)

-    model_name = os.path.splitext(os.path.basename(model_path))[0]
+    model_name = os.path.basename(model_path)
     data_dir = Path("data")

     pytorch_file = data_dir / f"pytorch-{model_name}.bin"
@@ -67,7 +67,7 @@ def main():
     parser.add_argument('-m', '--model-path', required=True, help='Path to the model directory')
     args = parser.parse_args()

-    model_name = os.path.splitext(os.path.basename(args.model_path))[0]
+    model_name = os.path.basename(args.model_path)
     data_dir = Path("data")

     pytorch_file = data_dir / f"pytorch-{model_name}.bin"
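To illustrate why the extra `splitext` call was dropped: for a model directory whose name contains a dot, stripping the "extension" mangles the name. A rough C++ analogue of the two behaviours (`std::filesystem::path::stem` acts like Python's `splitext`-then-index-0, `filename` like `basename`; the sample path is hypothetical):

```cpp
#include <filesystem>
#include <iostream>

int main() {
    const std::filesystem::path model_path = "models/Qwen3-Reranker-0.6B"; // hypothetical model directory

    std::cout << model_path.stem().string()     << "\n"; // "Qwen3-Reranker-0"   - ".6B" is treated as an extension and stripped
    std::cout << model_path.filename().string() << "\n"; // "Qwen3-Reranker-0.6B" - full directory name preserved
    return 0;
}
```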
@@ -314,7 +314,8 @@ extern "C" {
     GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
     GGML_API int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched);

-    GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
+    GGML_API ggml_backend_buffer_type_t ggml_backend_sched_get_buffer_type(ggml_backend_sched_t sched, ggml_backend_t backend);
+    GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);

     GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
     GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
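A small usage sketch of the declarations above, assuming an already-initialized scheduler (`sched`); it only combines calls that appear in this header, printing the compute buffer type next to the buffer size for each backend:

```cpp
#include <cstdio>
#include "ggml-backend.h"

// assumes an initialized ggml_backend_sched_t; prints the buffer type and size per backend
static void print_sched_buffers(ggml_backend_sched_t sched) {
    const int n_backends = ggml_backend_sched_get_n_backends(sched);
    for (int i = 0; i < n_backends; i++) {
        ggml_backend_t backend = ggml_backend_sched_get_backend(sched, i);

        ggml_backend_buffer_type_t buft = ggml_backend_sched_get_buffer_type(sched, backend);
        size_t size                     = ggml_backend_sched_get_buffer_size(sched, backend);

        printf("%s: compute buffer (%s) = %.2f MiB\n",
               ggml_backend_name(backend), ggml_backend_buft_name(buft), size / 1024.0 / 1024.0);
    }
}
```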
@@ -23,7 +23,7 @@ static bool ggml_is_view(const struct ggml_tensor * t) {
 }

 // ops that return true for this function must not use restrict pointers for their backend implementations
-static bool ggml_op_can_inplace(enum ggml_op op) {
+bool ggml_op_can_inplace(enum ggml_op op) {
     switch (op) {
         case GGML_OP_SCALE:
         case GGML_OP_DIAG_MASK_ZERO:
@@ -95,39 +95,104 @@ enum ggml_status ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_te

 // dynamic tensor allocator

+#define GGML_VBUFFER_MAX_CHUNKS 16
+
+// relative memory address within an allocation that can be split into multiple buffers (chunks)
+struct buffer_address {
+    int chunk;     // index of a backend buffer
+    size_t offset; // local memory offset within the buffer
+};
+
+static const struct buffer_address GGML_BUFFER_ADDRESS_INVALID = { -1, SIZE_MAX };
+
+static bool ggml_buffer_address_less(struct buffer_address a, struct buffer_address b) {
+    return a.chunk != b.chunk ? a.chunk < b.chunk : a.offset < b.offset;
+}
+
 struct free_block {
     size_t offset;
     size_t size;
 };

+struct tallocr_chunk {
+    struct free_block free_blocks[MAX_FREE_BLOCKS];
+    int n_free_blocks;
+    size_t max_size;
+};
+
 struct ggml_dyn_tallocr {
     size_t alignment;
-    int n_free_blocks;
-    struct free_block free_blocks[MAX_FREE_BLOCKS];
-    size_t max_size;
+    size_t max_chunk_size;
+    struct tallocr_chunk * chunks[GGML_VBUFFER_MAX_CHUNKS];
+    int n_chunks;

 #ifdef GGML_ALLOCATOR_DEBUG
     struct {
         const struct ggml_tensor * tensor;
-        size_t offset;
+        struct buffer_address addr;
     } allocated_tensors[1024];
 #endif
 };

+static void ggml_dyn_tallocr_insert_block(struct tallocr_chunk * chunk, size_t offset, size_t size) {
+    GGML_ASSERT(chunk->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");
+    // insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster)
+    int insert_pos = 0;
+    while (insert_pos < chunk->n_free_blocks && chunk->free_blocks[insert_pos].offset < offset) {
+        insert_pos++;
+    }
+    // shift all blocks from insert_pos onward to make room for the new block
+    for (int i = chunk->n_free_blocks; i > insert_pos; i--) {
+        chunk->free_blocks[i] = chunk->free_blocks[i-1];
+    }
+    // insert the new block
+    chunk->free_blocks[insert_pos].offset = offset;
+    chunk->free_blocks[insert_pos].size = size;
+    chunk->n_free_blocks++;
+}
+
+static void ggml_dyn_tallocr_remove_block(struct tallocr_chunk * chunk, int idx) {
+    // shift all elements after idx by 1 to the left, overwriting the element at idx
+    for (int i = idx; i < chunk->n_free_blocks; i++) {
+        chunk->free_blocks[i] = chunk->free_blocks[i+1];
+    }
+    chunk->n_free_blocks--;
+}
+
+static int ggml_dyn_tallocr_new_chunk(struct ggml_dyn_tallocr * alloc, size_t min_size) {
+    if (alloc->n_chunks >= GGML_VBUFFER_MAX_CHUNKS) {
+        return -1;
+    }
+    struct tallocr_chunk * chunk = calloc(1, sizeof(struct tallocr_chunk));
+    chunk->n_free_blocks = 1;
+    chunk->free_blocks[0].offset = 0;
+    // available space in a chunk is limited to max_chunk_size, but can be higher if:
+    // 1. a single tensor exceeds the maximum, and cannot fit any other way
+    // 2. we are running out of chunks
+    // backends will either manage to allocate the larger size, or report an error.
+    chunk->free_blocks[0].size = MAX(min_size, alloc->max_chunk_size);
+    if (alloc->n_chunks == GGML_VBUFFER_MAX_CHUNKS - 1) {
+        chunk->free_blocks[0].size = SIZE_MAX/2;
+    }
+    alloc->chunks[alloc->n_chunks] = chunk;
+    alloc->n_chunks++;
+    return alloc->n_chunks - 1;
+}
+
 #ifdef GGML_ALLOCATOR_DEBUG
-static void add_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) {
+static void add_allocated_tensor(struct ggml_dyn_tallocr * alloc, struct buffer_address addr, const struct ggml_tensor * tensor) {
     for (int i = 0; i < 1024; i++) {
         if (alloc->allocated_tensors[i].tensor == NULL) {
             alloc->allocated_tensors[i].tensor = tensor;
-            alloc->allocated_tensors[i].offset = offset;
+            alloc->allocated_tensors[i].addr = addr;
             return;
         }
     }
     GGML_ABORT("out of allocated_tensors");
 }
-static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) {
+static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, struct buffer_address addr, const struct ggml_tensor * tensor) {
     for (int i = 0; i < 1024; i++) {
-        if (alloc->allocated_tensors[i].offset == offset) {
+        if (alloc->allocated_tensors[i].addr.chunk == addr.chunk && alloc->allocated_tensors[i].addr.offset == addr.offset) {
             alloc->allocated_tensors[i].tensor = NULL;
             return;
         }
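To make the chunking idea concrete, here is a small self-contained C++ toy (not the ggml code) that hands out (chunk, offset) addresses the way `buffer_address` is used above: bump-allocate inside the current chunk and open a new chunk once a request no longer fits the per-chunk limit. The capacities and request sizes are made up, and the toy ignores free-block reuse and the oversized-tensor special case.

```cpp
#include <cstdio>
#include <vector>

// toy analogue of struct buffer_address: a chunk index plus an offset inside that chunk
struct toy_addr {
    int    chunk;
    size_t offset;
};

struct toy_allocator {
    size_t max_chunk_size;          // analogue of ggml_dyn_tallocr::max_chunk_size
    std::vector<size_t> chunk_used; // bytes bump-allocated in each chunk so far

    toy_addr alloc(size_t size) {
        // reuse the last chunk if the request still fits, otherwise open a new one
        if (chunk_used.empty() || chunk_used.back() + size > max_chunk_size) {
            chunk_used.push_back(0);
        }
        toy_addr addr = { (int) chunk_used.size() - 1, chunk_used.back() };
        chunk_used.back() += size;
        return addr;
    }
};

int main() {
    toy_allocator alloc = { /*max_chunk_size =*/ 1024, {} };

    for (size_t size : { 600, 600, 200, 900 }) {
        toy_addr a = alloc.alloc(size);
        printf("size %4zu -> chunk %d, offset %4zu\n", size, a.chunk, a.offset);
    }
    // 600 -> chunk 0, offset 0; 600 -> chunk 1, offset 0 (does not fit in chunk 0);
    // 200 -> chunk 1, offset 600; 900 -> chunk 2, offset 0
    return 0;
}
```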
@@ -136,76 +201,94 @@ static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offs
 }
 #endif

-static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t size, const struct ggml_tensor * tensor) {
+static struct buffer_address ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t size, const struct ggml_tensor * tensor) {
     size = aligned_offset(NULL, size, alloc->alignment);

     AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);

+    int best_fit_chunk = -1;
+    int best_fit_block = -1;
     size_t max_avail = 0;

-    // find the best fitting free block besides the last block
-    int best_fit_block = -1;
-    size_t best_fit_size = SIZE_MAX;
-    for (int i = 0; i < alloc->n_free_blocks - 1; i++) {
-        struct free_block * block = &alloc->free_blocks[i];
-        max_avail = MAX(max_avail, block->size);
-        if (block->size >= size && block->size <= best_fit_size) {
-            best_fit_block = i;
-            best_fit_size = block->size;
+    // find the best fitting free block besides the last block, within any chunk
+    for (int c = 0; c < alloc->n_chunks; ++c) {
+        struct tallocr_chunk * chunk = alloc->chunks[c];
+        size_t best_fit_size = SIZE_MAX;
+        for (int i = 0; i < chunk->n_free_blocks - 1; i++) {
+            struct free_block * block = &chunk->free_blocks[i];
+            max_avail = MAX(max_avail, block->size);
+            if (block->size >= size && block->size <= best_fit_size) {
+                best_fit_chunk = c;
+                best_fit_block = i;
+                best_fit_size = block->size;
+            }
         }
     }

     if (best_fit_block == -1) {
-        // the last block is our last resort
-        struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1];
-        max_avail = MAX(max_avail, block->size);
-        if (block->size >= size) {
-            best_fit_block = alloc->n_free_blocks - 1;
-        } else {
-            // this should never happen
-            GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
-                __func__, size, max_avail);
-            GGML_ABORT("not enough space in the buffer");
-        }
-    }
-
-    struct free_block * block = &alloc->free_blocks[best_fit_block];
-    size_t offset = block->offset;
-    block->offset = offset + size;
-    block->size -= size;
-    if (block->size == 0) {
-        // remove block if empty
-        alloc->n_free_blocks--;
-        for (int j = best_fit_block; j < alloc->n_free_blocks; j++) {
-            alloc->free_blocks[j] = alloc->free_blocks[j+1];
-        }
-    }
-
-    AT_PRINTF("block %d, offset %zu\n", best_fit_block, offset);
+        // no suitable block found, try the last block (this will grow a chunks size)
+        for (int c = 0; c < alloc->n_chunks; ++c) {
+            struct tallocr_chunk * chunk = alloc->chunks[c];
+            if (chunk->n_free_blocks > 0) {
+                struct free_block * block = &chunk->free_blocks[chunk->n_free_blocks - 1];
+                max_avail = MAX(max_avail, block->size);
+                if (block->size >= size) {
+                    best_fit_chunk = c;
+                    best_fit_block = chunk->n_free_blocks - 1;
+                    break;
+                }
+            }
+        }
+    }
+
+    if (best_fit_block == -1) {
+        // none of the existing chunks have enough space left
+        best_fit_chunk = ggml_dyn_tallocr_new_chunk(alloc, size);
+        best_fit_block = 0;
+    }
+    if (best_fit_chunk == -1) {
+        // since the last chunk always has virtually endless memory, this should never happen
+        GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
+            __func__, size, max_avail);
+        GGML_ABORT("graph allocation: failed to reserve memory");
+    }
+
+    struct tallocr_chunk * chunk = alloc->chunks[best_fit_chunk];
+    struct free_block * block = &chunk->free_blocks[best_fit_block];
+    struct buffer_address addr = {.chunk = best_fit_chunk, .offset = block->offset };
+    block->offset += size;
+    block->size -= size;
+    if (block->size == 0) {
+        // remove block if empty
+        ggml_dyn_tallocr_remove_block(chunk, best_fit_block);
+    }
+
+    AT_PRINTF("block %d, offset %zu, chunk %d\n", best_fit_block, addr.offset, addr.chunk);

 #ifdef GGML_ALLOCATOR_DEBUG
-    add_allocated_tensor(alloc, offset, tensor);
-    size_t cur_max = offset + size;
-    if (cur_max > alloc->max_size) {
-        // sort allocated_tensors by offset
+    add_allocated_tensor(alloc, addr, tensor);
+    size_t cur_max = addr.offset + size;
+    if (cur_max > alloc->max_size[addr.chunk]) {
+        // sort allocated_tensors by chunk/offset
         for (int i = 0; i < 1024; i++) {
             for (int j = i + 1; j < 1024; j++) {
-                if (alloc->allocated_tensors[i].offset > alloc->allocated_tensors[j].offset) {
+                if (ggml_buffer_address_less(alloc->allocated_tensors[j].addr, alloc->allocated_tensors[i].addr)) {
                     const struct ggml_tensor * tmp_tensor = alloc->allocated_tensors[i].tensor;
-                    size_t tmp_offset = alloc->allocated_tensors[i].offset;
+                    struct buffer_address tmp_addr = alloc->allocated_tensors[i].addr;
                     alloc->allocated_tensors[i].tensor = alloc->allocated_tensors[j].tensor;
-                    alloc->allocated_tensors[i].offset = alloc->allocated_tensors[j].offset;
+                    alloc->allocated_tensors[i].addr = alloc->allocated_tensors[j].addr;
                     alloc->allocated_tensors[j].tensor = tmp_tensor;
-                    alloc->allocated_tensors[j].offset = tmp_offset;
+                    alloc->allocated_tensors[j].addr = tmp_addr;
                 }
             }
         }
-        GGML_LOG_DEBUG("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
+        GGML_LOG_DEBUG("max_size[%d] = %.2f MB: tensors: ", addr.chunk, cur_max / 1024.0 / 1024.0);
         for (int i = 0; i < 1024; i++) {
             if (alloc->allocated_tensors[i].tensor) {
-                GGML_LOG_DEBUG("%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
-                    alloc->allocated_tensors[i].offset,
-                    alloc->allocated_tensors[i].offset + ggml_nbytes(alloc->allocated_tensors[i].tensor),
+                GGML_LOG_DEBUG("%s [%d: %zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
+                    alloc->allocated_tensors[i].addr.chunk,
+                    alloc->allocated_tensors[i].addr.offset,
+                    alloc->allocated_tensors[i].addr.offset + ggml_nbytes(alloc->allocated_tensors[i].tensor),
                     ggml_nbytes(alloc->allocated_tensors[i].tensor) / 1024.0 / 1024.0);
             }
         }
@@ -213,78 +296,69 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
     }
 #endif

-    alloc->max_size = MAX(alloc->max_size, offset + size);
+    chunk->max_size = MAX(chunk->max_size, addr.offset + size);

-    return offset;
+    return addr;

     GGML_UNUSED(tensor);
 }

 // this is a very naive implementation, but for our case the number of free blocks should be very small
-static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, size_t size, const struct ggml_tensor * tensor) {
+static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, struct buffer_address addr, size_t size, const struct ggml_tensor * tensor) {
     size = aligned_offset(NULL, size, alloc->alignment);

-    AT_PRINTF("%s: freeing %s at %zu (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, offset, size, alloc->n_free_blocks);
+    AT_PRINTF("%s: freeing %s at {chunk=%d, offset=%zu} (%zu bytes) - n_free_blocks = %d\n",
+        __func__, tensor->name, addr.chunk, addr.offset, size, alloc->chunks[addr.chunk]->n_free_blocks);

 #ifdef GGML_ALLOCATOR_DEBUG
-    remove_allocated_tensor(alloc, offset, tensor);
+    remove_allocated_tensor(alloc, addr, tensor);
 #endif

+    struct tallocr_chunk * chunk = alloc->chunks[addr.chunk];
+
     // see if we can merge with an existing block
-    for (int i = 0; i < alloc->n_free_blocks; i++) {
-        struct free_block * block = &alloc->free_blocks[i];
+    for (int i = 0; i < chunk->n_free_blocks; i++) {
+        struct free_block * block = &chunk->free_blocks[i];
         // check if ptr is at the end of the block
-        if (block->offset + block->size == offset) {
+        if (block->offset + block->size == addr.offset) {
             block->size += size;
             // check if we can merge with the next block
-            if (i < alloc->n_free_blocks - 1 && block->offset + block->size == alloc->free_blocks[i+1].offset) {
-                block->size += alloc->free_blocks[i+1].size;
-                alloc->n_free_blocks--;
-                for (int j = i+1; j < alloc->n_free_blocks; j++) {
-                    alloc->free_blocks[j] = alloc->free_blocks[j+1];
+            if (i < chunk->n_free_blocks - 1) {
+                struct free_block * next = &chunk->free_blocks[i+1];
+                if (block->offset + block->size == next->offset) {
+                    block->size += next->size;
+                    ggml_dyn_tallocr_remove_block(chunk, i+1);
                 }
             }
             return;
         }
         // check if ptr is at the beginning of the block
-        if (offset + size == block->offset) {
-            block->offset = offset;
+        if (addr.offset + size == block->offset) {
+            block->offset = addr.offset;
             block->size += size;
             // check if we can merge with the previous block
-            if (i > 0 && alloc->free_blocks[i-1].offset + alloc->free_blocks[i-1].size == block->offset) {
-                alloc->free_blocks[i-1].size += block->size;
-                alloc->n_free_blocks--;
-                for (int j = i; j < alloc->n_free_blocks; j++) {
-                    alloc->free_blocks[j] = alloc->free_blocks[j+1];
+            if (i > 0) {
+                struct free_block * prev = &chunk->free_blocks[i-1];
+                if (prev->offset + prev->size == block->offset) {
+                    prev->size += block->size;
+                    ggml_dyn_tallocr_remove_block(chunk, i);
                 }
             }
             return;
         }
     }
     // otherwise, add a new block
-    GGML_ASSERT(alloc->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");
-    // insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster)
-    int insert_pos = 0;
-    while (insert_pos < alloc->n_free_blocks && alloc->free_blocks[insert_pos].offset < offset) {
-        insert_pos++;
-    }
-    // shift all blocks from insert_pos onward to make room for the new block
-    for (int i = alloc->n_free_blocks; i > insert_pos; i--) {
-        alloc->free_blocks[i] = alloc->free_blocks[i-1];
-    }
-    // insert the new block
-    alloc->free_blocks[insert_pos].offset = offset;
-    alloc->free_blocks[insert_pos].size = size;
-    alloc->n_free_blocks++;
+    ggml_dyn_tallocr_insert_block(chunk, addr.offset, size);

     GGML_UNUSED(tensor);
 }

 static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) {
-    alloc->n_free_blocks = 1;
-    alloc->free_blocks[0].offset = 0;
-    alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows
-    alloc->max_size = 0;
+    for (int i = 0; i < GGML_VBUFFER_MAX_CHUNKS; i++) {
+        free(alloc->chunks[i]);
+        alloc->chunks[i] = NULL;
+    }
+    alloc->n_chunks = 0;

 #ifdef GGML_ALLOCATOR_DEBUG
     for (int i = 0; i < 1024; i++) {
@@ -293,14 +367,14 @@ static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) {
 #endif
 }

-static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment) {
+static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment, size_t max_buffer_size) {
     struct ggml_dyn_tallocr * alloc = (struct ggml_dyn_tallocr *)malloc(sizeof(struct ggml_dyn_tallocr));

     *alloc = (struct ggml_dyn_tallocr) {
-        /*.alignment = */ alignment,
-        /*.n_free_blocks = */ 0,
-        /*.free_blocks = */ {{0}},
-        /*.max_size = */ 0,
+        /*.alignment = */ alignment,
+        /*.max_chunk_size = */ MIN(max_buffer_size, SIZE_MAX/2), // clamp to avoid overflows
+        /*.chunks = */ {NULL},
+        /*.n_chunks = */ 0,
 #ifdef GGML_ALLOCATOR_DEBUG
         /*.allocated_tensors = */ {{0}},
 #endif
@@ -312,11 +386,79 @@ static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment) {
 }

 static void ggml_dyn_tallocr_free(struct ggml_dyn_tallocr * alloc) {
+    for (int i = 0; i < alloc->n_chunks; ++i) {
+        free(alloc->chunks[i]);
+    }
     free(alloc);
 }

 static size_t ggml_dyn_tallocr_max_size(struct ggml_dyn_tallocr * alloc) {
-    return alloc->max_size;
+    size_t max_size = 0;
+    for (int i = 0; i < alloc->n_chunks; i++) {
+        max_size += alloc->chunks[i]->max_size;
+    }
+    return max_size;
 }

+
+// virtual buffer with contiguous memory range, split into multiple backend buffers (chunks)
+
+struct vbuffer {
+    ggml_backend_buffer_t chunks[GGML_VBUFFER_MAX_CHUNKS];
+};
+
+static void ggml_vbuffer_free(struct vbuffer * buf) {
+    if (buf == NULL) {
+        return;
+    }
+    for (int i = 0; i < GGML_VBUFFER_MAX_CHUNKS; ++i) {
+        ggml_backend_buffer_free(buf->chunks[i]);
+    }
+    free(buf);
+}
+
+static int ggml_vbuffer_n_chunks(struct vbuffer * buf) {
+    int n = 0;
+    while (n < GGML_VBUFFER_MAX_CHUNKS && buf->chunks[n]) n++;
+    return n;
+}
+
+static size_t ggml_vbuffer_size(struct vbuffer * buf) {
+    size_t size = 0;
+    for (int i = 0; i < GGML_VBUFFER_MAX_CHUNKS && buf->chunks[i]; ++i) {
+        size += ggml_backend_buffer_get_size(buf->chunks[i]);
+    }
+    return size;
+}
+
+static struct vbuffer * ggml_vbuffer_alloc(ggml_backend_buffer_type_t buft, const struct ggml_dyn_tallocr * talloc, enum ggml_backend_buffer_usage usage) {
+    struct vbuffer * buf = (struct vbuffer *)calloc(1, sizeof(struct vbuffer));
+    if (buf == NULL) {
+        return NULL;
+    }
+
+    for (int n = 0; n < talloc->n_chunks; n++) {
+        size_t chunk_size = talloc->chunks[n]->max_size;
+        buf->chunks[n] = ggml_backend_buft_alloc_buffer(buft, chunk_size);
+        if (buf->chunks[n] == NULL) {
+            ggml_vbuffer_free(buf);
+            return NULL;
+        }
+        ggml_backend_buffer_set_usage(buf->chunks[n], usage);
+    }
+    return buf;
+}
+
+static void ggml_vbuffer_tensor_alloc(struct vbuffer * buf, struct ggml_tensor * tensor, struct buffer_address buf_addr) {
+    void * base = ggml_backend_buffer_get_base(buf->chunks[buf_addr.chunk]);
+    void * addr = (char *)base + buf_addr.offset;
+    ggml_backend_tensor_alloc(buf->chunks[buf_addr.chunk], tensor, addr);
+}
+
+static void ggml_vbuffer_reset(struct vbuffer * buf) {
+    for (int i = 0; i < GGML_VBUFFER_MAX_CHUNKS && buf->chunks[i]; ++i) {
+        ggml_backend_buffer_reset(buf->chunks[i]);
+    }
+}
@@ -328,13 +470,13 @@ struct hash_node {
     int n_children;
     int n_views;
     int buffer_id;
-    size_t offset; // offset within the buffer
+    struct buffer_address addr;
     bool allocated;
 };

 struct tensor_alloc {
     int buffer_id;
-    size_t offset;
+    struct buffer_address addr;
     size_t size_max; // 0 = pre-allocated, unused, or view
 };

@@ -349,7 +491,7 @@ struct node_alloc {

 struct ggml_gallocr {
     ggml_backend_buffer_type_t * bufts; // [n_buffers]
-    ggml_backend_buffer_t * buffers; // [n_buffers]
+    struct vbuffer ** buffers; // [n_buffers]
     struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers]
     int n_buffers;

@@ -370,7 +512,7 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
     galloc->bufts = calloc(n_bufs, sizeof(ggml_backend_buffer_type_t));
     GGML_ASSERT(galloc->bufts != NULL);

-    galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t));
+    galloc->buffers = calloc(n_bufs, sizeof(struct vbuffer *));
     GGML_ASSERT(galloc->buffers != NULL);

     galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
@@ -390,7 +532,8 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs

         if (galloc->buf_tallocs[i] == NULL) {
             size_t alignment = ggml_backend_buft_get_alignment(bufts[i]);
-            galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment);
+            size_t max_size = ggml_backend_buft_get_max_size(bufts[i]);
+            galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment, max_size);
         }
     }
     galloc->n_buffers = n_bufs;
@@ -418,7 +561,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
             }
         }
         if (!freed) {
-            ggml_backend_buffer_free(galloc->buffers[i]);
+            ggml_vbuffer_free(galloc->buffers[i]);
         }
     }
     if (galloc->buf_tallocs != NULL) {
@@ -467,7 +610,7 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor

     if (!ggml_gallocr_is_allocated(galloc, node) && !ggml_is_view(node)) {
         hn->allocated = true;
-        assert(hn->offset == 0);
+        assert(hn->addr.offset == 0);

         // try to reuse a parent's buffer (inplace)
         if (ggml_op_can_inplace(node->op)) {
@@ -501,9 +644,9 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
                     struct hash_node * view_src_hn = ggml_gallocr_hash_get(galloc, view_src);
                     if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
                         AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
-                        assert(view_src_hn->offset == p_hn->offset);
+                        assert(view_src_hn->addr.chunk == p_hn->addr.chunk && view_src_hn->addr.offset == p_hn->addr.offset);
                         hn->buffer_id = p_hn->buffer_id;
-                        hn->offset = p_hn->offset;
+                        hn->addr = p_hn->addr;
                         p_hn->allocated = false; // avoid freeing the parent
                         view_src_hn->allocated = false;
                         return;
@@ -511,7 +654,7 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
                 } else {
                     AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
                     hn->buffer_id = p_hn->buffer_id;
-                    hn->offset = p_hn->offset;
+                    hn->addr = p_hn->addr;
                     p_hn->allocated = false; // avoid freeing the parent
                     return;
                 }
@@ -522,9 +665,8 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor
         struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
         ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
         size_t size = ggml_backend_buft_get_alloc_size(buft, node);
-        size_t offset = ggml_dyn_tallocr_alloc(alloc, size, node);
         hn->buffer_id = buffer_id;
-        hn->offset = offset;
+        hn->addr = ggml_dyn_tallocr_alloc(alloc, size, node);
     }
 }

@@ -536,12 +678,11 @@ static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * n
     }

     struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
-    size_t offset = hn->offset;
     int buffer_id = hn->buffer_id;
     struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
     ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
     size_t size = ggml_backend_buft_get_alloc_size(buft, node);
-    ggml_dyn_tallocr_free_tensor(alloc, offset, size, node);
+    ggml_dyn_tallocr_free_tensor(alloc, hn->addr, size, node);
     hn->allocated = false;
 }

@@ -692,24 +833,24 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
         struct node_alloc * node_alloc = &galloc->node_allocs[i];
         if (node->view_src || node->data) {
             node_alloc->dst.buffer_id = -1;
-            node_alloc->dst.offset = SIZE_MAX;
+            node_alloc->dst.addr = GGML_BUFFER_ADDRESS_INVALID;
             node_alloc->dst.size_max = 0;
         } else {
             struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
             node_alloc->dst.buffer_id = hn->buffer_id;
-            node_alloc->dst.offset = hn->offset;
+            node_alloc->dst.addr = hn->addr;
             node_alloc->dst.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
         }
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             if (!src || src->view_src || src->data) {
                 node_alloc->src[j].buffer_id = -1;
-                node_alloc->src[j].offset = SIZE_MAX;
+                node_alloc->src[j].addr = GGML_BUFFER_ADDRESS_INVALID;
                 node_alloc->src[j].size_max = 0;
             } else {
                 struct hash_node * hn = ggml_gallocr_hash_get(galloc, src);
                 node_alloc->src[j].buffer_id = hn->buffer_id;
-                node_alloc->src[j].offset = hn->offset;
+                node_alloc->src[j].addr = hn->addr;
                 node_alloc->src[j].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], src);
             }
         }
@@ -725,11 +866,11 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
         struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
         if (leaf->view_src || leaf->data) {
             galloc->leaf_allocs[i].leaf.buffer_id = -1;
-            galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;
+            galloc->leaf_allocs[i].leaf.addr = GGML_BUFFER_ADDRESS_INVALID;
             galloc->leaf_allocs[i].leaf.size_max = 0;
         } else {
             galloc->leaf_allocs[i].leaf.buffer_id = hn->buffer_id;
-            galloc->leaf_allocs[i].leaf.offset = hn->offset;
+            galloc->leaf_allocs[i].leaf.addr = hn->addr;
             galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
         }
     }
@@ -744,7 +885,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
             }
         }

-        size_t cur_size = galloc->buffers[i] ? ggml_backend_buffer_get_size(galloc->buffers[i]) : 0;
+        size_t cur_size = galloc->buffers[i] ? ggml_vbuffer_size(galloc->buffers[i]) : 0;
         size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]);

         // even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views
@@ -753,13 +894,12 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
             GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
 #endif

-            ggml_backend_buffer_free(galloc->buffers[i]);
-            galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
+            ggml_vbuffer_free(galloc->buffers[i]);
+            galloc->buffers[i] = ggml_vbuffer_alloc(galloc->bufts[i], galloc->buf_tallocs[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
             if (galloc->buffers[i] == NULL) {
                 GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
                 return false;
             }
-            ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
         }
     }

@@ -772,11 +912,11 @@ bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {

 static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, struct tensor_alloc * tensor_alloc) {
     int buffer_id = tensor_alloc->buffer_id;
-    assert(tensor->data || tensor->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);
+    assert(tensor->data || tensor->view_src || ggml_backend_buft_get_alloc_size(galloc->bufts[buffer_id], tensor) <= tensor_alloc->size_max);

     if (tensor->view_src != NULL) {
         if (tensor->buffer == NULL) {
-            assert(tensor_alloc->offset == SIZE_MAX);
+            assert(tensor_alloc->addr.offset == SIZE_MAX);
             if (tensor->view_src->buffer == NULL) {
                 // this tensor was allocated without ggml-backend
                 return;
@@ -785,11 +925,9 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
         }
     } else {
         if (tensor->data == NULL) {
-            assert(tensor_alloc->offset != SIZE_MAX);
-            assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);
-            void * base = ggml_backend_buffer_get_base(galloc->buffers[buffer_id]);
-            void * addr = (char *)base + tensor_alloc->offset;
-            ggml_backend_tensor_alloc(galloc->buffers[buffer_id], tensor, addr);
+            assert(tensor_alloc->addr.offset != SIZE_MAX);
+            assert(ggml_backend_buft_get_alloc_size(galloc->bufts[buffer_id], tensor) <= tensor_alloc->size_max);
+            ggml_vbuffer_tensor_alloc(galloc->buffers[buffer_id], tensor, tensor_alloc->addr);
         } else {
             if (tensor->buffer == NULL) {
                 // this tensor was allocated without ggml-backend
@@ -874,7 +1012,7 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
     // reset buffers
     for (int i = 0; i < galloc->n_buffers; i++) {
         if (galloc->buffers[i] != NULL) {
-            ggml_backend_buffer_reset(galloc->buffers[i]);
+            ggml_vbuffer_reset(galloc->buffers[i]);
         }
     }

@@ -917,7 +1055,7 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
         }
     }

-    return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
+    return ggml_vbuffer_size(galloc->buffers[buffer_id]);
 }

 // utils
@@ -1793,6 +1793,14 @@ ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i)
     return sched->backends[i];
 }

+ggml_backend_buffer_type_t ggml_backend_sched_get_buffer_type(ggml_backend_sched_t sched, ggml_backend_t backend) {
+    GGML_ASSERT(sched);
+    int backend_index = ggml_backend_sched_backend_id(sched, backend);
+    GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
+
+    return sched->bufts[backend_index];
+}
+
 size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
     GGML_ASSERT(sched);
     int backend_index = ggml_backend_sched_backend_id(sched, backend);
@@ -105,6 +105,18 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
|
||||
|
||||
return ((v4f32)res)[0];
|
||||
}
|
||||
|
||||
// multiply int8_t, add results pairwise twice
|
||||
static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) {
|
||||
// Get absolute values of x vectors
|
||||
const __m128i ax = __lsx_vsigncov_b(x, x);
|
||||
// Sign the values of the y vectors
|
||||
const __m128i sy = __lsx_vsigncov_b(x, y);
|
||||
// Perform multiplication and create 16-bit values
|
||||
const __m128i dot = lsx_maddubs_h(ax, sy);
|
||||
const __m128i ones = __lsx_vreplgr2vr_h(1);
|
||||
return lsx_madd_h(ones, dot);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(__loongarch_asx)
|
||||
@@ -323,18 +335,6 @@ static inline __m256i lasx_xvandi_b_bit(__m256i a, const unsigned int b) {
|
||||
}
|
||||
}
|
||||
|
||||
// multiply int8_t, add results pairwise twice
|
||||
static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) {
|
||||
// Get absolute values of x vectors
|
||||
const __m128i ax = __lsx_vsigncov_b(x, x);
|
||||
// Sign the values of the y vectors
|
||||
const __m128i sy = __lsx_vsigncov_b(x, y);
|
||||
// Perform multiplication and create 16-bit values
|
||||
const __m128i dot = lsx_maddubs_h(ax, sy);
|
||||
const __m128i ones = __lsx_vreplgr2vr_h(1);
|
||||
return lsx_madd_h(ones, dot);
|
||||
}
|
||||
|
||||
// horizontally add 8 floats
|
||||
static inline float hsum_float_8(const __m256 x) {
|
||||
__m128 res = lasx_extractf128(x, 1);
|
||||
|
||||
@@ -998,9 +998,9 @@ static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
#define GGML_F32_EPR 4

#define GGML_F32x4          __m128
#define GGML_F32x4_ZERO     __lsx_vldi(0)
#define GGML_F32x4_SET1(x)  __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
#define GGML_F32x4_LOAD(x)  __lsx_vld((x), 0)
#define GGML_F32x4_ZERO     (__m128)__lsx_vldi(0)
#define GGML_F32x4_SET1(x)  (__m128)__lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
#define GGML_F32x4_LOAD(x)  (__m128)__lsx_vld((x), 0)
#define GGML_F32x4_STORE(x, y) __lsx_vst(y, x, 0)
#define GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a)
#define GGML_F32x4_ADD      __lsx_vfadd_s
@@ -1022,7 +1022,7 @@ static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
    __m128i tmp = __lsx_vsrli_d((__m128i) x[0], 32); \
    tmp = (__m128i) __lsx_vfadd_s((__m128) tmp, x[0]); \
    tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
    const __m128 t0 = __lsx_vshuf4i_w(tmp, 0x88); \
    const __m128 t0 = (__m128)__lsx_vshuf4i_w(tmp, 0x88); \
    tmp = __lsx_vsrli_d((__m128i) t0, 32); \
    tmp = (__m128i) __lsx_vfadd_s((__m128) tmp, t0); \
    tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
@@ -1052,7 +1052,7 @@ static inline __m128 __lsx_f16x4_load(const ggml_fp16_t * x) {
    tmp[2] = GGML_CPU_FP16_TO_FP32(x[2]);
    tmp[3] = GGML_CPU_FP16_TO_FP32(x[3]);

    return __lsx_vld(tmp, 0);
    return (__m128)__lsx_vld(tmp, 0);
}

static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
@@ -1067,9 +1067,9 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
}

#define GGML_F32Cx4          __m128
#define GGML_F32Cx4_ZERO     __lsx_vldi(0)
#define GGML_F32Cx4_SET1(x)  __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
#define GGML_F32Cx4_LOAD(x)  __lsx_f16x4_load(x)
#define GGML_F32Cx4_ZERO     (__m128)__lsx_vldi(0)
#define GGML_F32Cx4_SET1(x)  (__m128)__lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
#define GGML_F32Cx4_LOAD(x)  (__m128)__lsx_f16x4_load(x)
#define GGML_F32Cx4_STORE(x, y) __lsx_f16x4_store(x, y)
#define GGML_F32Cx4_FMA      GGML_F32x4_FMA
#define GGML_F32Cx4_ADD      __lsx_vfadd_s
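The casts above only reconcile __m128/__m128i types for the compiler; the macro semantics are unchanged. As a reminder of how these GGML_F32x4 macros are consumed elsewhere in ggml (a scalar sketch, assuming the usual step/accumulate/reduce pattern):

#include <cstddef>

// Scalar analogue of the GGML_F32x4 accumulation pattern: four partial sums
// updated with fused multiply-add, then reduced horizontally at the end.
static float vec_dot_f32_ref(const float * x, const float * y, size_t n) {
    float acc[4] = {0.0f, 0.0f, 0.0f, 0.0f};       // GGML_F32x4 accumulators
    size_t i = 0;
    for (; i + 4 <= n; i += 4) {
        for (int k = 0; k < 4; ++k) {
            acc[k] += x[i + k]*y[i + k];            // GGML_F32x4_FMA
        }
    }
    float sum = acc[0] + acc[1] + acc[2] + acc[3];  // GGML_F32x4_REDUCE
    for (; i < n; ++i) {
        sum += x[i]*y[i];                           // scalar tail
    }
    return sum;
}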

@@ -342,6 +342,10 @@ struct ggml_cgraph {
// if you need the gradients, get them from the original graph
struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1);

// ggml-alloc.c: true if the operation can reuse memory from its sources
GGML_API bool ggml_op_can_inplace(enum ggml_op op);


// Memory allocation

GGML_API void * ggml_aligned_malloc(size_t size);

@@ -256,8 +256,6 @@ static std::vector<int> ggml_metal_graph_optimize_reorder(const std::vector<node

    // perform reorders only across these types of ops
    // can be expanded when needed
    // IMPORTANT: do not add ops such as GGML_OP_CPY or GGML_OP_SET_ROWS
    //            the dependencies from such ops are not always represented in the graph
    const auto & h_safe = [](ggml_op op) {
        switch (op) {
            case GGML_OP_MUL_MAT:
@@ -273,6 +271,8 @@ static std::vector<int> ggml_metal_graph_optimize_reorder(const std::vector<node
            case GGML_OP_GLU:
            case GGML_OP_SCALE:
            case GGML_OP_GET_ROWS:
            case GGML_OP_CPY:
            case GGML_OP_SET_ROWS:
                return true;
            default:
                return ggml_op_is_empty(op);
@@ -383,6 +383,7 @@ void ggml_graph_optimize(ggml_cgraph * gf) {
        // fuse only ops that start with these operations
        // can be expanded when needed
        if (node.op() == GGML_OP_ADD ||
            node.op() == GGML_OP_NORM ||
            node.op() == GGML_OP_RMS_NORM) {
            ops[0] = node.op();

@@ -392,6 +393,7 @@ void ggml_graph_optimize(ggml_cgraph * gf) {
            // can be expanded when needed
            if (gf->nodes[f]->op != GGML_OP_ADD &&
                gf->nodes[f]->op != GGML_OP_MUL &&
                gf->nodes[f]->op != GGML_OP_NORM &&
                gf->nodes[f]->op != GGML_OP_RMS_NORM) {
                break;
            }

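The fusion window above is a pattern match on consecutive graph nodes. A sketch of the chain it targets, using the public ggml graph accessors (simplified; the real code also validates shapes, types and contiguity):

#include "ggml.h"

// True if nodes i, i+1, i+2 form a NORM/RMS_NORM -> MUL -> ADD chain in which
// each op consumes the previous node's result - the shape targeted above.
static bool is_norm_mul_add_chain(struct ggml_cgraph * gf, int i) {
    if (i + 2 >= ggml_graph_n_nodes(gf)) {
        return false;
    }
    const struct ggml_tensor * n0 = ggml_graph_node(gf, i + 0);
    const struct ggml_tensor * n1 = ggml_graph_node(gf, i + 1);
    const struct ggml_tensor * n2 = ggml_graph_node(gf, i + 2);

    const bool starts_with_norm = n0->op == GGML_OP_NORM || n0->op == GGML_OP_RMS_NORM;

    return starts_with_norm &&
           n1->op == GGML_OP_MUL && n1->src[0] == n0 &&
           n2->op == GGML_OP_ADD && n2->src[0] == n1;
}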
@@ -1090,36 +1090,6 @@ ggml_metal_pipeline_t ggml_metal_library_get_pipeline_bin(
|
||||
return res;
|
||||
}
|
||||
|
||||
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_rms_norm(ggml_metal_library_t lib, const ggml_tensor * op, int32_t n_fuse) {
|
||||
assert(op->op == GGML_OP_RMS_NORM);
|
||||
|
||||
GGML_ASSERT(op->src[0]->ne[0] % 4 == 0);
|
||||
GGML_ASSERT(ggml_is_contiguous_rows(op->src[0]));
|
||||
|
||||
char base[256];
|
||||
char name[256];
|
||||
|
||||
switch (n_fuse) {
|
||||
case 1: snprintf(base, 256, "kernel_rms_norm_f32"); break;
|
||||
case 2: snprintf(base, 256, "kernel_rms_norm_mul_f32"); break;
|
||||
case 3: snprintf(base, 256, "kernel_rms_norm_mul_add_f32"); break;
|
||||
default: GGML_ABORT("fatal error");
|
||||
}
|
||||
|
||||
snprintf(name, 256, "%s", base);
|
||||
|
||||
ggml_metal_pipeline_t res = ggml_metal_library_get_pipeline(lib, name);
|
||||
if (res) {
|
||||
return res;
|
||||
}
|
||||
|
||||
res = ggml_metal_library_compile_pipeline(lib, base, name, nullptr);
|
||||
|
||||
ggml_metal_pipeline_set_smem(res, 32*sizeof(float));
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_l2_norm(ggml_metal_library_t lib, const ggml_tensor * op) {
|
||||
assert(op->op == GGML_OP_L2_NORM);
|
||||
|
||||
@@ -1167,16 +1137,37 @@ ggml_metal_pipeline_t ggml_metal_library_get_pipeline_group_norm(ggml_metal_libr
|
||||
return res;
|
||||
}
|
||||
|
||||
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_norm(ggml_metal_library_t lib, const ggml_tensor * op) {
|
||||
assert(op->op == GGML_OP_NORM);
|
||||
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_norm(ggml_metal_library_t lib, const ggml_tensor * op, int n_fuse) {
|
||||
assert(op->op == GGML_OP_NORM || op->op == GGML_OP_RMS_NORM);
|
||||
|
||||
GGML_ASSERT(op->src[0]->ne[0] % 4 == 0);
|
||||
GGML_ASSERT(ggml_is_contiguous_1(op->src[0]));
|
||||
GGML_ASSERT(ggml_is_contiguous_rows(op->src[0]));
|
||||
|
||||
char base[256];
|
||||
char name[256];
|
||||
|
||||
snprintf(base, 256, "kernel_norm_f32");
|
||||
const char * suffix = "";
|
||||
if (op->ne[0] % 4 == 0) {
|
||||
suffix = "_4";
|
||||
}
|
||||
|
||||
switch (op->op) {
|
||||
case GGML_OP_NORM:
|
||||
switch (n_fuse) {
|
||||
case 1: snprintf(base, 256, "kernel_norm_f32%s", suffix); break;
|
||||
case 2: snprintf(base, 256, "kernel_norm_mul_f32%s", suffix); break;
|
||||
case 3: snprintf(base, 256, "kernel_norm_mul_add_f32%s", suffix); break;
|
||||
default: GGML_ABORT("fatal error");
|
||||
} break;
|
||||
case GGML_OP_RMS_NORM:
|
||||
switch (n_fuse) {
|
||||
case 1: snprintf(base, 256, "kernel_rms_norm_f32%s", suffix); break;
|
||||
case 2: snprintf(base, 256, "kernel_rms_norm_mul_f32%s", suffix); break;
|
||||
case 3: snprintf(base, 256, "kernel_rms_norm_mul_add_f32%s", suffix); break;
|
||||
default: GGML_ABORT("fatal error");
|
||||
} break;
|
||||
default: GGML_ABORT("fatal error");
|
||||
}
|
||||
|
||||
snprintf(name, 256, "%s", base);
|
||||
|
||||
ggml_metal_pipeline_t res = ggml_metal_library_get_pipeline(lib, name);
|
||||
@@ -1237,7 +1228,7 @@ ggml_metal_pipeline_t ggml_metal_library_get_pipeline_im2col(ggml_metal_library_
|
||||
char base[256];
|
||||
char name[256];
|
||||
|
||||
snprintf(base, 256, "kernel_im2col_ext_%s", ggml_type_name(op->type));
|
||||
snprintf(base, 256, "kernel_im2col_%s", ggml_type_name(op->type));
|
||||
snprintf(name, 256, "%s", base);
|
||||
|
||||
ggml_metal_pipeline_t res = ggml_metal_library_get_pipeline(lib, name);
|
||||
|
||||
@@ -123,10 +123,9 @@ ggml_metal_pipeline_t ggml_metal_library_get_pipeline_mul_mv_id (ggml_me
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_argmax            (ggml_metal_library_t lib, const struct ggml_tensor * op);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_argsort           (ggml_metal_library_t lib, const struct ggml_tensor * op);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_bin               (ggml_metal_library_t lib, enum ggml_op op, int32_t n_fuse, bool row);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_rms_norm          (ggml_metal_library_t lib, const struct ggml_tensor * op, int32_t n_fuse);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_l2_norm           (ggml_metal_library_t lib, const struct ggml_tensor * op);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_group_norm        (ggml_metal_library_t lib, const struct ggml_tensor * op);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_norm              (ggml_metal_library_t lib, const struct ggml_tensor * op);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_norm              (ggml_metal_library_t lib, const struct ggml_tensor * op, int32_t n_fuse);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_rope              (ggml_metal_library_t lib, const struct ggml_tensor * op);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_im2col            (ggml_metal_library_t lib, const struct ggml_tensor * op);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_conv_transpose_1d (ggml_metal_library_t lib, const struct ggml_tensor * op);

@@ -661,13 +661,13 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
        case GGML_OP_SOFT_MAX:
        case GGML_OP_GROUP_NORM:
            return has_simdgroup_reduction && ggml_is_contiguous_rows(op->src[0]);
        case GGML_OP_RMS_NORM:
        case GGML_OP_L2_NORM:
            return has_simdgroup_reduction && (op->ne[0] % 4 == 0 && ggml_is_contiguous_1(op->src[0]));
        case GGML_OP_ARGMAX:
            return has_simdgroup_reduction;
        case GGML_OP_NORM:
            return has_simdgroup_reduction && (op->ne[0] % 4 == 0 && ggml_is_contiguous_1(op->src[0]));
        case GGML_OP_RMS_NORM:
            return has_simdgroup_reduction && (ggml_is_contiguous_rows(op->src[0]));
        case GGML_OP_ROPE:
            return true;
        case GGML_OP_IM2COL:

@@ -428,16 +428,11 @@ typedef struct {
    uint64_t nb1;
} ggml_metal_kargs_mul_mv_id;

// NORM
// RMS_NORM
typedef struct {
    int32_t  ne00;
    int32_t  ne00_4;
    uint64_t nb01;
    float    eps;
} ggml_metal_kargs_norm;

typedef struct {
    int32_t  ne00;
    int32_t  ne00_4;
    int32_t  ne00_t;
    uint64_t nb1;
    uint64_t nb2;
    uint64_t nb3;
@@ -448,7 +443,7 @@ typedef struct {
    uint64_t nbf1[3];
    uint64_t nbf2[3];
    uint64_t nbf3[3];
} ggml_metal_kargs_rms_norm;
} ggml_metal_kargs_norm;

typedef struct {
    int32_t ne00;

@@ -266,10 +266,6 @@ static int ggml_metal_op_encode_impl(ggml_metal_op_t ctx, int idx) {
            {
                n_fuse = ggml_metal_op_set_rows(ctx, idx);
            } break;
        case GGML_OP_RMS_NORM:
            {
                n_fuse = ggml_metal_op_rms_norm(ctx, idx);
            } break;
        case GGML_OP_L2_NORM:
            {
                n_fuse = ggml_metal_op_l2_norm(ctx, idx);
@@ -279,6 +275,7 @@ static int ggml_metal_op_encode_impl(ggml_metal_op_t ctx, int idx) {
                n_fuse = ggml_metal_op_group_norm(ctx, idx);
            } break;
        case GGML_OP_NORM:
        case GGML_OP_RMS_NORM:
            {
                n_fuse = ggml_metal_op_norm(ctx, idx);
            } break;
@@ -2346,146 +2343,6 @@ int ggml_metal_op_bin(ggml_metal_op_t ctx, int idx) {
|
||||
return n_fuse;
|
||||
}
|
||||
|
||||
int ggml_metal_op_rms_norm(ggml_metal_op_t ctx, int idx) {
|
||||
ggml_cgraph * gf = ctx->gf;
|
||||
ggml_tensor * op = ggml_graph_node(gf, idx);
|
||||
|
||||
ggml_metal_library_t lib = ctx->lib;
|
||||
ggml_metal_encoder_t enc = ctx->enc;
|
||||
|
||||
const int idx_end = ctx->idx_end;
|
||||
|
||||
const bool use_fusion = ctx->use_fusion;
|
||||
|
||||
const int debug_fusion = ctx->debug_fusion;
|
||||
|
||||
ggml_tensor ** ops = ggml_graph_nodes(gf) + idx;
|
||||
|
||||
GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
|
||||
GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
|
||||
GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
|
||||
GGML_TENSOR_LOCALS(uint32_t, nb, op, nb);
|
||||
|
||||
float eps;
|
||||
memcpy(&eps, op->op_params, sizeof(float));
|
||||
|
||||
ggml_metal_buffer_id bid_src0 = ggml_metal_get_buffer_id(op->src[0]);
|
||||
ggml_metal_buffer_id bid_dst = ggml_metal_get_buffer_id(op);
|
||||
|
||||
ggml_metal_kargs_rms_norm args = {
|
||||
/*.ne00 =*/ ne00,
|
||||
/*.ne00_4 =*/ ne00/4,
|
||||
/*.nb1 =*/ nb1,
|
||||
/*.nb2 =*/ nb2,
|
||||
/*.nb3 =*/ nb3,
|
||||
/*.eps =*/ eps,
|
||||
/*.nef1 =*/ { ne01 },
|
||||
/*.nef2 =*/ { ne02 },
|
||||
/*.nef3 =*/ { ne03 },
|
||||
/*.nbf1 =*/ { nb01 },
|
||||
/*.nbf2 =*/ { nb02 },
|
||||
/*.nbf3 =*/ { nb03 },
|
||||
};
|
||||
|
||||
ggml_op fops[8];
|
||||
|
||||
int n_fuse = 1;
|
||||
|
||||
ggml_metal_buffer_id bid_fuse[2] = { bid_src0, bid_src0 };
|
||||
|
||||
// d[0] = rms_norm(a)
|
||||
// d[1] = mul(d[0], b)
|
||||
// d[2] = add(d[1], c)
|
||||
if (use_fusion) {
|
||||
fops[0] = GGML_OP_RMS_NORM;
|
||||
fops[1] = GGML_OP_MUL;
|
||||
fops[2] = GGML_OP_ADD;
|
||||
|
||||
for (n_fuse = 0; n_fuse <= 1 && idx + n_fuse + 1 < idx_end; ++n_fuse) {
|
||||
if (!ggml_can_fuse(gf, idx + n_fuse, fops + n_fuse, 2)) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (ops[n_fuse] != ops[n_fuse + 1]->src[0]) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (ops[n_fuse + 1]->src[1]->ne[0] != op->ne[0]) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (!ggml_is_contiguous_rows(ops[n_fuse + 1]->src[1])) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (ops[n_fuse + 1]->type != GGML_TYPE_F32) {
|
||||
break;
|
||||
}
|
||||
|
||||
//ctx->fuse_cnt[ops[n_fuse + 1]->op]++;
|
||||
|
||||
bid_fuse[n_fuse] = ggml_metal_get_buffer_id(ops[n_fuse + 1]->src[1]);
|
||||
|
||||
args.nef1[n_fuse + 1] = ops[n_fuse + 1]->src[1]->ne[1];
|
||||
args.nef2[n_fuse + 1] = ops[n_fuse + 1]->src[1]->ne[2];
|
||||
args.nef3[n_fuse + 1] = ops[n_fuse + 1]->src[1]->ne[3];
|
||||
|
||||
args.nbf1[n_fuse + 1] = ops[n_fuse + 1]->src[1]->nb[1];
|
||||
args.nbf2[n_fuse + 1] = ops[n_fuse + 1]->src[1]->nb[2];
|
||||
args.nbf3[n_fuse + 1] = ops[n_fuse + 1]->src[1]->nb[3];
|
||||
}
|
||||
|
||||
++n_fuse;
|
||||
|
||||
if (debug_fusion > 1 && n_fuse > 1) {
|
||||
if (n_fuse == 2) {
|
||||
GGML_LOG_DEBUG("%s: fuse: RMS_NORM + MUL\n", __func__);
|
||||
}
|
||||
if (n_fuse == 3) {
|
||||
GGML_LOG_DEBUG("%s: fuse: RMS_NORM + MUL + ADD\n", __func__);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (n_fuse > 1) {
|
||||
bid_dst = ggml_metal_get_buffer_id(ops[n_fuse - 1]);
|
||||
|
||||
for (int i = 1; i < n_fuse; ++i) {
|
||||
if (!ggml_metal_op_concurrency_check(ctx, ops[i])) {
|
||||
ggml_metal_op_concurrency_reset(ctx);
|
||||
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_rms_norm(lib, op, n_fuse);
|
||||
|
||||
int nth = 32; // SIMD width
|
||||
|
||||
while (nth < ne00/4 && nth < ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) {
|
||||
nth *= 2;
|
||||
}
|
||||
|
||||
nth = std::min(nth, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
|
||||
nth = std::min(nth, ne00/4);
|
||||
|
||||
const size_t smem = ggml_metal_pipeline_get_smem(pipeline);
|
||||
|
||||
ggml_metal_encoder_set_pipeline(enc, pipeline);
|
||||
ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
|
||||
ggml_metal_encoder_set_buffer (enc, bid_src0, 1);
|
||||
ggml_metal_encoder_set_buffer (enc, bid_fuse[0], 2);
|
||||
ggml_metal_encoder_set_buffer (enc, bid_fuse[1], 3);
|
||||
ggml_metal_encoder_set_buffer (enc, bid_dst, 4);
|
||||
|
||||
ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
|
||||
|
||||
ggml_metal_encoder_dispatch_threadgroups(enc, ne01, ne02, ne03, nth, 1, 1);
|
||||
|
||||
return n_fuse;
|
||||
}
|
||||
|
||||
int ggml_metal_op_l2_norm(ggml_metal_op_t ctx, int idx) {
|
||||
ggml_cgraph * gf = ctx->gf;
|
||||
ggml_tensor * op = ggml_graph_node(gf, idx);
|
||||
@@ -2594,6 +2451,14 @@ int ggml_metal_op_norm(ggml_metal_op_t ctx, int idx) {
|
||||
ggml_metal_library_t lib = ctx->lib;
|
||||
ggml_metal_encoder_t enc = ctx->enc;
|
||||
|
||||
const int idx_end = ctx->idx_end;
|
||||
|
||||
const bool use_fusion = ctx->use_fusion;
|
||||
|
||||
const int debug_fusion = ctx->debug_fusion;
|
||||
|
||||
ggml_tensor ** ops = ggml_graph_nodes(gf) + idx;
|
||||
|
||||
GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
|
||||
GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
|
||||
GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
|
||||
@@ -2602,37 +2467,121 @@ int ggml_metal_op_norm(ggml_metal_op_t ctx, int idx) {
|
||||
float eps;
|
||||
memcpy(&eps, op->op_params, sizeof(float));
|
||||
|
||||
ggml_metal_buffer_id bid_src0 = ggml_metal_get_buffer_id(op->src[0]);
|
||||
ggml_metal_buffer_id bid_dst = ggml_metal_get_buffer_id(op);
|
||||
|
||||
ggml_metal_kargs_norm args = {
|
||||
/*.ne00 =*/ ne00,
|
||||
/*.ne00_4 =*/ ne00/4,
|
||||
/*.nb01 =*/ nb01,
|
||||
/*.ne00_t =*/ ne00 % 4 == 0 ? ne00/4 : ne00,
|
||||
/*.nb1 =*/ nb1,
|
||||
/*.nb2 =*/ nb2,
|
||||
/*.nb3 =*/ nb3,
|
||||
/*.eps =*/ eps,
|
||||
/*.nef1 =*/ { ne01 },
|
||||
/*.nef2 =*/ { ne02 },
|
||||
/*.nef3 =*/ { ne03 },
|
||||
/*.nbf1 =*/ { nb01 },
|
||||
/*.nbf2 =*/ { nb02 },
|
||||
/*.nbf3 =*/ { nb03 },
|
||||
};
|
||||
|
||||
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_norm(lib, op);
|
||||
ggml_op fops[8];
|
||||
|
||||
int n_fuse = 1;
|
||||
|
||||
ggml_metal_buffer_id bid_fuse[2] = { bid_src0, bid_src0 };
|
||||
|
||||
// d[0] = norm(a)
|
||||
// d[1] = mul(d[0], b)
|
||||
// d[2] = add(d[1], c)
|
||||
if (use_fusion) {
|
||||
fops[0] = op->op;
|
||||
fops[1] = GGML_OP_MUL;
|
||||
fops[2] = GGML_OP_ADD;
|
||||
|
||||
for (n_fuse = 0; n_fuse <= 1 && idx + n_fuse + 1 < idx_end; ++n_fuse) {
|
||||
if (!ggml_can_fuse(gf, idx + n_fuse, fops + n_fuse, 2)) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (ops[n_fuse] != ops[n_fuse + 1]->src[0]) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (ops[n_fuse + 1]->src[1]->ne[0] != op->ne[0]) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (!ggml_is_contiguous_rows(ops[n_fuse + 1]->src[1])) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (ops[n_fuse + 1]->type != GGML_TYPE_F32) {
|
||||
break;
|
||||
}
|
||||
|
||||
//ctx->fuse_cnt[ops[n_fuse + 1]->op]++;
|
||||
|
||||
bid_fuse[n_fuse] = ggml_metal_get_buffer_id(ops[n_fuse + 1]->src[1]);
|
||||
|
||||
args.nef1[n_fuse + 1] = ops[n_fuse + 1]->src[1]->ne[1];
|
||||
args.nef2[n_fuse + 1] = ops[n_fuse + 1]->src[1]->ne[2];
|
||||
args.nef3[n_fuse + 1] = ops[n_fuse + 1]->src[1]->ne[3];
|
||||
|
||||
args.nbf1[n_fuse + 1] = ops[n_fuse + 1]->src[1]->nb[1];
|
||||
args.nbf2[n_fuse + 1] = ops[n_fuse + 1]->src[1]->nb[2];
|
||||
args.nbf3[n_fuse + 1] = ops[n_fuse + 1]->src[1]->nb[3];
|
||||
}
|
||||
|
||||
++n_fuse;
|
||||
|
||||
if (debug_fusion > 1 && n_fuse > 1) {
|
||||
if (n_fuse == 2) {
|
||||
GGML_LOG_DEBUG("%s: fuse: %s + MUL\n", __func__, ggml_op_name(op->op));
|
||||
}
|
||||
if (n_fuse == 3) {
|
||||
GGML_LOG_DEBUG("%s: fuse: %s + MUL + ADD\n", __func__, ggml_op_name(op->op));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (n_fuse > 1) {
|
||||
bid_dst = ggml_metal_get_buffer_id(ops[n_fuse - 1]);
|
||||
|
||||
for (int i = 1; i < n_fuse; ++i) {
|
||||
if (!ggml_metal_op_concurrency_check(ctx, ops[i])) {
|
||||
ggml_metal_op_concurrency_reset(ctx);
|
||||
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_norm(lib, op, n_fuse);
|
||||
|
||||
int nth = 32; // SIMD width
|
||||
while (nth < ne00/4 && nth < ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) {
|
||||
|
||||
while (nth < args.ne00_t && nth < ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) {
|
||||
nth *= 2;
|
||||
}
|
||||
|
||||
nth = std::min(nth, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
|
||||
nth = std::min(nth, ne00/4);
|
||||
nth = std::min(nth, args.ne00_t);
|
||||
|
||||
const size_t smem = ggml_metal_pipeline_get_smem(pipeline);
|
||||
|
||||
const int64_t nrows = ggml_nrows(op->src[0]);
|
||||
|
||||
ggml_metal_encoder_set_pipeline(enc, pipeline);
|
||||
ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
|
||||
ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
|
||||
ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 2);
|
||||
ggml_metal_encoder_set_buffer (enc, bid_src0, 1);
|
||||
ggml_metal_encoder_set_buffer (enc, bid_fuse[0], 2);
|
||||
ggml_metal_encoder_set_buffer (enc, bid_fuse[1], 3);
|
||||
ggml_metal_encoder_set_buffer (enc, bid_dst, 4);
|
||||
|
||||
ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
|
||||
|
||||
ggml_metal_encoder_dispatch_threadgroups(enc, nrows, 1, 1, nth, 1, 1);
|
||||
ggml_metal_encoder_dispatch_threadgroups(enc, ne01, ne02, ne03, nth, 1, 1);
|
||||
|
||||
return 1;
|
||||
return n_fuse;
|
||||
}
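The threadgroup size for these row-reduction kernels is picked by the loop above; extracted into a small helper for clarity (a sketch of the same logic, assuming a SIMD width of 32):

#include <algorithm>

// Start at one simdgroup and double while each thread still has more than one
// element to process, then clamp to the pipeline limit and to the row length.
static int pick_threadgroup_size(int ne00_t, int max_threads_per_threadgroup) {
    int nth = 32; // SIMD width
    while (nth < ne00_t && nth < max_threads_per_threadgroup) {
        nth *= 2;
    }
    nth = std::min(nth, max_threads_per_threadgroup);
    nth = std::min(nth, ne00_t);
    return nth;
}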
|
||||
|
||||
int ggml_metal_op_rope(ggml_metal_op_t ctx, int idx) {
|
||||
@@ -2768,7 +2717,6 @@ int ggml_metal_op_im2col(ggml_metal_op_t ctx, int idx) {
|
||||
const uint64_t ofs0 = op->src[1]->nb[is_2D ? 3 : 2] / 4;
|
||||
const uint64_t ofs1 = op->src[1]->nb[is_2D ? 2 : 1] / 4;
|
||||
|
||||
|
||||
ggml_metal_kargs_im2col args = {
|
||||
/*.ofs0 =*/ ofs0,
|
||||
/*.ofs1 =*/ ofs1,
|
||||
@@ -2789,15 +2737,16 @@ int ggml_metal_op_im2col(ggml_metal_op_t ctx, int idx) {
|
||||
|
||||
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_im2col(lib, op);
|
||||
|
||||
const uint64_t n_threads = std::min(ggml_metal_pipeline_max_theads_per_threadgroup(pipeline), N);
|
||||
const int64_t quotient = N / n_threads + (N % n_threads > 0 ? 1 : 0);
|
||||
GGML_ASSERT(KH*KW <= ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
|
||||
|
||||
const uint64_t ntptg0 = std::min(ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)/(KH*KW), N);
|
||||
|
||||
ggml_metal_encoder_set_pipeline(enc, pipeline);
|
||||
ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
|
||||
ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), 1);
|
||||
ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 2);
|
||||
|
||||
ggml_metal_encoder_dispatch_threadgroups(enc, quotient * CHW, OH, OW, n_threads, 1, 1);
|
||||
ggml_metal_encoder_dispatch_threadgroups(enc, IC, OH, OW, ntptg0, KH, KW);
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -60,7 +60,6 @@ int ggml_metal_op_mul_mat_id (ggml_metal_op_t ctx, int idx);
int ggml_metal_op_add_id         (ggml_metal_op_t ctx, int idx);
int ggml_metal_op_flash_attn_ext (ggml_metal_op_t ctx, int idx);
int ggml_metal_op_bin            (ggml_metal_op_t ctx, int idx);
int ggml_metal_op_rms_norm       (ggml_metal_op_t ctx, int idx);
int ggml_metal_op_l2_norm        (ggml_metal_op_t ctx, int idx);
int ggml_metal_op_group_norm     (ggml_metal_op_t ctx, int idx);
int ggml_metal_op_norm           (ggml_metal_op_t ctx, int idx);

@@ -66,6 +66,10 @@ static inline float e8m0_to_fp32(uint8_t x) {
    return as_type<float>(bits);
}

static inline float dot(float x, float y) {
    return x*y;
}

// NOTE: this is not dequantizing - we are simply fitting the template
template <typename type4x4>
void dequantize_f32(device const float4x4 * src, short il, thread type4x4 & reg) {
@@ -2493,30 +2497,43 @@ kernel void kernel_argmax_f32(
|
||||
dst_i32[tgpig] = arg_val;
|
||||
}
|
||||
|
||||
kernel void kernel_norm_f32(
|
||||
// F == 1 : norm (no fuse)
|
||||
// F == 2 : norm + mul
|
||||
// F == 3 : norm + mul + add
|
||||
template <typename T, short F>
|
||||
kernel void kernel_norm_fuse_impl(
|
||||
constant ggml_metal_kargs_norm & args,
|
||||
device const char * src0,
|
||||
device const char * src1_0,
|
||||
device const char * src1_1,
|
||||
device char * dst,
|
||||
threadgroup float * shmem_f32 [[threadgroup(0)]],
|
||||
uint tgpig[[threadgroup_position_in_grid]],
|
||||
ushort tpitg[[thread_position_in_threadgroup]],
|
||||
ushort sgitg[[simdgroup_index_in_threadgroup]],
|
||||
ushort tiisg[[thread_index_in_simdgroup]],
|
||||
ushort ntg[[threads_per_threadgroup]]) {
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
ushort3 tpitg[[thread_position_in_threadgroup]],
|
||||
ushort sgitg[[simdgroup_index_in_threadgroup]],
|
||||
ushort tiisg[[thread_index_in_simdgroup]],
|
||||
ushort3 ntg[[threads_per_threadgroup]]) {
|
||||
if (sgitg == 0) {
|
||||
shmem_f32[tiisg] = 0.0f;
|
||||
}
|
||||
|
||||
device const float4 * x = (device const float4 *) (src0 + tgpig*args.nb01);
|
||||
const int i01 = tgpig.x;
|
||||
const int i02 = tgpig.y;
|
||||
const int i03 = tgpig.z;
|
||||
|
||||
float4 sumf4(0.0f);
|
||||
device const T * x = (device const T *) (src0 + i03*args.nbf3[0] + i02*args.nbf2[0] + i01*args.nbf1[0]);
|
||||
|
||||
device const T * f0 = (device const T *) (src1_0 + (i03%args.nef3[1])*args.nbf3[1] + (i02%args.nef2[1])*args.nbf2[1] + (i01%args.nef1[1])*args.nbf1[1]);
|
||||
device const T * f1 = (device const T *) (src1_1 + (i03%args.nef3[2])*args.nbf3[2] + (i02%args.nef2[2])*args.nbf2[2] + (i01%args.nef1[2])*args.nbf1[2]);
|
||||
|
||||
T sumft(0.0f);
|
||||
|
||||
float sumf = 0.0f;
|
||||
|
||||
for (int i00 = tpitg; i00 < args.ne00_4; i00 += ntg) {
|
||||
sumf4 += x[i00];
|
||||
for (int i00 = tpitg.x; i00 < args.ne00_t; i00 += ntg.x) {
|
||||
sumft += x[i00];
|
||||
}
|
||||
sumf = sumf4[0] + sumf4[1] + sumf4[2] + sumf4[3];
|
||||
sumf = dot(sumft, T(1.0f));
|
||||
sumf = simd_sum(sumf);
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
@@ -2532,10 +2549,10 @@ kernel void kernel_norm_f32(
|
||||
|
||||
const float mean = sumf/args.ne00;
|
||||
|
||||
device float4 * y = (device float4 *) dst + tgpig*args.ne00_4;
|
||||
device T * y = (device T *) (dst + i03*args.nb3 + i02*args.nb2 + i01*args.nb1);
|
||||
|
||||
sumf = 0.0f;
|
||||
for (int i00 = tpitg; i00 < args.ne00_4; i00 += ntg) {
|
||||
for (int i00 = tpitg.x; i00 < args.ne00_t; i00 += ntg.x) {
|
||||
y[i00] = x[i00] - mean;
|
||||
sumf += dot(y[i00], y[i00]);
|
||||
}
|
||||
@@ -2555,17 +2572,35 @@ kernel void kernel_norm_f32(
|
||||
const float variance = sumf/args.ne00;
|
||||
|
||||
const float scale = 1.0f/sqrt(variance + args.eps);
|
||||
for (int i00 = tpitg; i00 < args.ne00_4; i00 += ntg) {
|
||||
y[i00] = y[i00] * scale;
|
||||
for (int i00 = tpitg.x; i00 < args.ne00_t; i00 += ntg.x) {
|
||||
if (F == 1) {
|
||||
y[i00] = (y[i00]*scale);
|
||||
}
|
||||
if (F == 2) {
|
||||
y[i00] = (y[i00]*scale)*f0[i00];
|
||||
}
|
||||
if (F == 3) {
|
||||
y[i00] = (y[i00]*scale)*f0[i00] + f1[i00];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
typedef decltype(kernel_norm_fuse_impl<float4, 1>) kernel_norm_fuse_t;
|
||||
|
||||
template [[host_name("kernel_norm_f32")]] kernel kernel_norm_fuse_t kernel_norm_fuse_impl<float, 1>;
|
||||
template [[host_name("kernel_norm_mul_f32")]] kernel kernel_norm_fuse_t kernel_norm_fuse_impl<float, 2>;
|
||||
template [[host_name("kernel_norm_mul_add_f32")]] kernel kernel_norm_fuse_t kernel_norm_fuse_impl<float, 3>;
|
||||
|
||||
template [[host_name("kernel_norm_f32_4")]] kernel kernel_norm_fuse_t kernel_norm_fuse_impl<float4, 1>;
|
||||
template [[host_name("kernel_norm_mul_f32_4")]] kernel kernel_norm_fuse_t kernel_norm_fuse_impl<float4, 2>;
|
||||
template [[host_name("kernel_norm_mul_add_f32_4")]] kernel kernel_norm_fuse_t kernel_norm_fuse_impl<float4, 3>;
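The F template parameter selects how much of the NORM -> MUL -> ADD chain the kernel absorbs. A scalar reference of the three fused variants (a sketch; w and b stand for the MUL and ADD operands):

#include <cmath>
#include <cstddef>

// F == 1: y = norm(x);  F == 2: y = norm(x)*w;  F == 3: y = norm(x)*w + b
static void norm_fuse_ref(const float * x, const float * w, const float * b,
                          float * y, size_t n, float eps, int F) {
    float mean = 0.0f;
    for (size_t i = 0; i < n; ++i) {
        mean += x[i];
    }
    mean /= (float) n;

    float var = 0.0f;
    for (size_t i = 0; i < n; ++i) {
        y[i]  = x[i] - mean;
        var  += y[i]*y[i];
    }
    var /= (float) n;

    const float scale = 1.0f/std::sqrt(var + eps);
    for (size_t i = 0; i < n; ++i) {
        y[i] *= scale;
        if (F >= 2) { y[i] *= w[i]; }
        if (F == 3) { y[i] += b[i]; }
    }
}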
|
||||
|
||||
// F == 1 : rms_norm (no fuse)
|
||||
// F == 2 : rms_norm + mul
|
||||
// F == 3 : rms_norm + mul + add
|
||||
template <short F>
|
||||
template <typename T, short F>
|
||||
kernel void kernel_rms_norm_fuse_impl(
|
||||
constant ggml_metal_kargs_rms_norm & args,
|
||||
constant ggml_metal_kargs_norm & args,
|
||||
device const char * src0,
|
||||
device const char * src1_0,
|
||||
device const char * src1_1,
|
||||
@@ -2584,15 +2619,15 @@ kernel void kernel_rms_norm_fuse_impl(
|
||||
const int i02 = tgpig.y;
|
||||
const int i03 = tgpig.z;
|
||||
|
||||
device const float4 * x = (device const float4 *) (src0 + i03*args.nbf3[0] + i02*args.nbf2[0] + i01*args.nbf1[0]);
|
||||
device const T * x = (device const T *) (src0 + i03*args.nbf3[0] + i02*args.nbf2[0] + i01*args.nbf1[0]);
|
||||
|
||||
device const float4 * f0 = (device const float4 *) (src1_0 + (i03%args.nef3[1])*args.nbf3[1] + (i02%args.nef2[1])*args.nbf2[1] + (i01%args.nef1[1])*args.nbf1[1]);
|
||||
device const float4 * f1 = (device const float4 *) (src1_1 + (i03%args.nef3[2])*args.nbf3[2] + (i02%args.nef2[2])*args.nbf2[2] + (i01%args.nef1[2])*args.nbf1[2]);
|
||||
device const T * f0 = (device const T *) (src1_0 + (i03%args.nef3[1])*args.nbf3[1] + (i02%args.nef2[1])*args.nbf2[1] + (i01%args.nef1[1])*args.nbf1[1]);
|
||||
device const T * f1 = (device const T *) (src1_1 + (i03%args.nef3[2])*args.nbf3[2] + (i02%args.nef2[2])*args.nbf2[2] + (i01%args.nef1[2])*args.nbf1[2]);
|
||||
|
||||
float sumf = 0.0f;
|
||||
|
||||
// parallel sum
|
||||
for (int i00 = tpitg.x; i00 < args.ne00_4; i00 += ntg.x) {
|
||||
for (int i00 = tpitg.x; i00 < args.ne00_t; i00 += ntg.x) {
|
||||
sumf += dot(x[i00], x[i00]);
|
||||
}
|
||||
sumf = simd_sum(sumf);
|
||||
@@ -2611,8 +2646,8 @@ kernel void kernel_rms_norm_fuse_impl(
|
||||
const float mean = sumf/args.ne00;
|
||||
const float scale = 1.0f/sqrt(mean + args.eps);
|
||||
|
||||
device float4 * y = (device float4 *) (dst + i03*args.nb3 + i02*args.nb2 + i01*args.nb1);
|
||||
for (int i00 = tpitg.x; i00 < args.ne00_4; i00 += ntg.x) {
|
||||
device T * y = (device T *) (dst + i03*args.nb3 + i02*args.nb2 + i01*args.nb1);
|
||||
for (int i00 = tpitg.x; i00 < args.ne00_t; i00 += ntg.x) {
|
||||
if (F == 1) {
|
||||
y[i00] = (x[i00]*scale);
|
||||
}
|
||||
@@ -2625,11 +2660,15 @@ kernel void kernel_rms_norm_fuse_impl(
|
||||
}
|
||||
}
|
||||
|
||||
typedef decltype(kernel_rms_norm_fuse_impl<1>) kernel_rms_norm_fuse_t;
|
||||
typedef decltype(kernel_rms_norm_fuse_impl<float4, 1>) kernel_rms_norm_fuse_t;
|
||||
|
||||
template [[host_name("kernel_rms_norm_f32")]] kernel kernel_rms_norm_fuse_t kernel_rms_norm_fuse_impl<1>;
|
||||
template [[host_name("kernel_rms_norm_mul_f32")]] kernel kernel_rms_norm_fuse_t kernel_rms_norm_fuse_impl<2>;
|
||||
template [[host_name("kernel_rms_norm_mul_add_f32")]] kernel kernel_rms_norm_fuse_t kernel_rms_norm_fuse_impl<3>;
|
||||
template [[host_name("kernel_rms_norm_f32")]] kernel kernel_rms_norm_fuse_t kernel_rms_norm_fuse_impl<float, 1>;
|
||||
template [[host_name("kernel_rms_norm_mul_f32")]] kernel kernel_rms_norm_fuse_t kernel_rms_norm_fuse_impl<float, 2>;
|
||||
template [[host_name("kernel_rms_norm_mul_add_f32")]] kernel kernel_rms_norm_fuse_t kernel_rms_norm_fuse_impl<float, 3>;
|
||||
|
||||
template [[host_name("kernel_rms_norm_f32_4")]] kernel kernel_rms_norm_fuse_t kernel_rms_norm_fuse_impl<float4, 1>;
|
||||
template [[host_name("kernel_rms_norm_mul_f32_4")]] kernel kernel_rms_norm_fuse_t kernel_rms_norm_fuse_impl<float4, 2>;
|
||||
template [[host_name("kernel_rms_norm_mul_add_f32_4")]] kernel kernel_rms_norm_fuse_t kernel_rms_norm_fuse_impl<float4, 3>;
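Same idea for the RMS variants, except no mean is subtracted; the row is only rescaled by its root-mean-square (scalar sketch):

#include <cmath>
#include <cstddef>

// F == 1: y = rms_norm(x);  F == 2: ...*w;  F == 3: ...*w + b
static void rms_norm_fuse_ref(const float * x, const float * w, const float * b,
                              float * y, size_t n, float eps, int F) {
    float ss = 0.0f;
    for (size_t i = 0; i < n; ++i) {
        ss += x[i]*x[i];
    }

    const float scale = 1.0f/std::sqrt(ss/(float) n + eps);
    for (size_t i = 0; i < n; ++i) {
        y[i] = x[i]*scale;
        if (F >= 2) { y[i] *= w[i]; }
        if (F == 3) { y[i] += b[i]; }
    }
}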
|
||||
|
||||
kernel void kernel_l2_norm_f32(
|
||||
constant ggml_metal_kargs_l2_norm & args,
|
||||
@@ -3987,60 +4026,7 @@ template [[host_name("kernel_rope_multi_f16")]] kernel kernel_rope_multi_t kerne
|
||||
template [[host_name("kernel_rope_vision_f32")]] kernel kernel_rope_vision_t kernel_rope_vision<float>;
|
||||
template [[host_name("kernel_rope_vision_f16")]] kernel kernel_rope_vision_t kernel_rope_vision<half>;
|
||||
|
||||
// TODO: obolete -- remove
|
||||
//typedef void (im2col_t)(
|
||||
// constant ggml_metal_kargs_im2col & args,
|
||||
// device const float * x,
|
||||
// device char * dst,
|
||||
// uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
// uint3 tgpg[[threadgroups_per_grid]],
|
||||
// uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
// uint3 ntg[[threads_per_threadgroup]]);
|
||||
//
|
||||
//template <typename T>
|
||||
//kernel void kernel_im2col(
|
||||
// constant ggml_metal_kargs_im2col & args,
|
||||
// device const float * x,
|
||||
// device char * dst,
|
||||
// uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
// uint3 tgpg[[threadgroups_per_grid]],
|
||||
// uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
// uint3 ntg[[threads_per_threadgroup]]) {
|
||||
//// const int64_t IC = tgpg[0];
|
||||
// const int64_t OH = tgpg[1];
|
||||
// const int64_t OW = tgpg[2];
|
||||
//
|
||||
//// const int64_t N = ntg[0];
|
||||
// const int64_t KH = ntg[1];
|
||||
// const int64_t KW = ntg[2];
|
||||
//
|
||||
// const int64_t in = tpitg[0];
|
||||
// const int64_t ikh = tpitg[1];
|
||||
// const int64_t ikw = tpitg[2];
|
||||
//
|
||||
// const int64_t iic = tgpig[0];
|
||||
// const int64_t ioh = tgpig[1];
|
||||
// const int64_t iow = tgpig[2];
|
||||
//
|
||||
// const int64_t iiw = iow*args.s0 + ikw*args.d0 - args.p0;
|
||||
// const int64_t iih = ioh*args.s1 + ikh*args.d1 - args.p1;
|
||||
//
|
||||
// const int64_t offset_dst = (in*OH*OW + ioh*OW + iow)*args.CHW + (iic*(KH*KW) + ikh*KW + ikw);
|
||||
//
|
||||
// device T * pdst = (device T *) (dst);
|
||||
//
|
||||
// if (iih < 0 || iih >= args.IH || iiw < 0 || iiw >= args.IW) {
|
||||
// pdst[offset_dst] = 0.0f;
|
||||
// } else {
|
||||
// const int64_t offset_src = in*args.ofs0 + iic*args.ofs1 + iih*args.IW + iiw;
|
||||
// pdst[offset_dst] = x[offset_src];
|
||||
// }
|
||||
//}
|
||||
//
|
||||
//template [[host_name("kernel_im2col_f32")]] kernel im2col_t kernel_im2col<float>;
|
||||
//template [[host_name("kernel_im2col_f16")]] kernel im2col_t kernel_im2col<half>;
|
||||
|
||||
typedef void (im2col_ext_t)(
|
||||
typedef void (im2col_t)(
|
||||
constant ggml_metal_kargs_im2col & args,
|
||||
device const float * x,
|
||||
device char * dst,
|
||||
@@ -4050,48 +4036,113 @@ typedef void (im2col_ext_t)(
|
||||
uint3 ntg[[threads_per_threadgroup]]);
|
||||
|
||||
template <typename T>
|
||||
kernel void kernel_im2col_ext(
|
||||
kernel void kernel_im2col(
|
||||
constant ggml_metal_kargs_im2col & args,
|
||||
device const float * x,
|
||||
device char * dst,
|
||||
uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
uint3 tgpg[[threadgroups_per_grid]], // tgpg[0] = D x IC x KH x KW, CHW = IC x KH x KW
|
||||
uint3 tgpg[[threadgroups_per_grid]],
|
||||
uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
uint3 ntg[[threads_per_threadgroup]]) { // [M, 1, 1]
|
||||
const int64_t KHW = (int64_t)args.KHW;
|
||||
uint3 ntg[[threads_per_threadgroup]]) {
|
||||
// const int64_t IC = tgpg[0];
|
||||
const int64_t OH = tgpg[1];
|
||||
const int64_t OW = tgpg[2];
|
||||
|
||||
const int64_t d = tgpig[0] / args.CHW;
|
||||
const int64_t chw = tgpig[0] % args.CHW;
|
||||
const int64_t tgpig_0 = chw / KHW; // 0 ~ (IC - 1)
|
||||
const int64_t HW = tgpig[0] % KHW;
|
||||
const int64_t KH = ntg[1];
|
||||
const int64_t KW = ntg[2];
|
||||
|
||||
const int64_t tpitg_0 = (d * ntg[0]) + tpitg[0];
|
||||
if (tpitg_0 >= args.N) {
|
||||
return;
|
||||
}
|
||||
int64_t in = tpitg[0];
|
||||
const int64_t ikh = tpitg[1];
|
||||
const int64_t ikw = tpitg[2];
|
||||
|
||||
const int64_t tpitg_1 = HW / args.KW;
|
||||
const int64_t tpitg_2 = HW % args.KW;
|
||||
const int64_t iic = tgpig[0];
|
||||
const int64_t ioh = tgpig[1];
|
||||
const int64_t iow = tgpig[2];
|
||||
|
||||
const int64_t iiw = tgpig[2] * args.s0 + tpitg_2 * args.d0 - args.p0;
|
||||
const int64_t iih = tgpig[1] * args.s1 + tpitg_1 * args.d1 - args.p1;
|
||||
const int64_t iiw = iow*args.s0 + ikw*args.d0 - args.p0;
|
||||
const int64_t iih = ioh*args.s1 + ikh*args.d1 - args.p1;
|
||||
|
||||
const int64_t offset_dst =
|
||||
(tpitg_0 * tgpg[1] * tgpg[2] + tgpig[1] * tgpg[2] + tgpig[2]) * args.CHW +
|
||||
(tgpig_0 * KHW + tpitg_1 * args.KW + tpitg_2);
|
||||
int64_t offset_dst = (in*OH*OW + ioh*OW + iow)*args.CHW + (iic*(KH*KW) + ikh*KW + ikw);
|
||||
|
||||
device T * pdst = (device T *) (dst);
|
||||
|
||||
if (iih < 0 || iih >= args.IH || iiw < 0 || iiw >= args.IW) {
|
||||
pdst[offset_dst] = 0.0f;
|
||||
while (in < args.N) {
|
||||
pdst[offset_dst] = 0.0f;
|
||||
offset_dst += ntg[0]*args.CHW*OH*OW;
|
||||
|
||||
in += ntg[0];
|
||||
}
|
||||
} else {
|
||||
const int64_t offset_src = tpitg_0 * args.ofs0 + tgpig_0 * args.ofs1;
|
||||
pdst[offset_dst] = x[offset_src + iih * args.IW + iiw];
|
||||
int64_t offset_src = in*args.ofs0 + iic*args.ofs1 + iih*args.IW + iiw;
|
||||
|
||||
while (in < args.N) {
|
||||
pdst[offset_dst] = x[offset_src];
|
||||
|
||||
offset_dst += ntg[0]*args.CHW*OH*OW;
|
||||
offset_src += ntg[0]*args.ofs0;
|
||||
|
||||
in += ntg[0];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template [[host_name("kernel_im2col_ext_f32")]] kernel im2col_ext_t kernel_im2col_ext<float>;
|
||||
template [[host_name("kernel_im2col_ext_f16")]] kernel im2col_ext_t kernel_im2col_ext<half>;
|
||||
template [[host_name("kernel_im2col_f32")]] kernel im2col_t kernel_im2col<float>;
|
||||
template [[host_name("kernel_im2col_f16")]] kernel im2col_t kernel_im2col<half>;
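For comparison, a CPU reference of the destination indexing the new kernel uses: dst is laid out as [N, OH, OW, IC*KH*KW] and out-of-bounds taps are zero-filled. This sketch assumes a contiguous [N, IC, IH, IW] source; the Metal kernel addresses the source through the ofs0/ofs1 strides instead.

#include <cstdint>

static void im2col_ref(const float * src, float * dst,
                       int64_t N, int64_t IC, int64_t IH, int64_t IW,
                       int64_t KH, int64_t KW, int64_t OH, int64_t OW,
                       int s0, int s1, int p0, int p1, int d0, int d1) {
    const int64_t CHW = IC*KH*KW;
    for (int64_t in = 0; in < N; ++in) {
        for (int64_t ioh = 0; ioh < OH; ++ioh) {
            for (int64_t iow = 0; iow < OW; ++iow) {
                for (int64_t iic = 0; iic < IC; ++iic) {
                    for (int64_t ikh = 0; ikh < KH; ++ikh) {
                        for (int64_t ikw = 0; ikw < KW; ++ikw) {
                            const int64_t iih = ioh*s1 + ikh*d1 - p1;
                            const int64_t iiw = iow*s0 + ikw*d0 - p0;

                            // same offset_dst formula as in the kernel above
                            const int64_t offset_dst = (in*OH*OW + ioh*OW + iow)*CHW + (iic*KH*KW + ikh*KW + ikw);

                            const bool oob = iih < 0 || iih >= IH || iiw < 0 || iiw >= IW;
                            dst[offset_dst] = oob ? 0.0f : src[((in*IC + iic)*IH + iih)*IW + iiw];
                        }
                    }
                }
            }
        }
    }
}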
|
||||
|
||||
// TODO: obolete -- remove
|
||||
//typedef void (im2col_ext_t)(
|
||||
// constant ggml_metal_kargs_im2col & args,
|
||||
// device const float * x,
|
||||
// device char * dst,
|
||||
// uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
// uint3 tgpg[[threadgroups_per_grid]],
|
||||
// uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
// uint3 ntg[[threads_per_threadgroup]]);
|
||||
//
|
||||
//template <typename T>
|
||||
//kernel void kernel_im2col_ext(
|
||||
// constant ggml_metal_kargs_im2col & args,
|
||||
// device const float * x,
|
||||
// device char * dst,
|
||||
// uint3 tgpig[[threadgroup_position_in_grid]],
|
||||
// uint3 tgpg[[threadgroups_per_grid]], // tgpg[0] = D x IC x KH x KW, CHW = IC x KH x KW
|
||||
// uint3 tpitg[[thread_position_in_threadgroup]],
|
||||
// uint3 ntg[[threads_per_threadgroup]]) { // [M, 1, 1]
|
||||
// const int64_t KHW = (int64_t)args.KHW;
|
||||
//
|
||||
// const int64_t d = tgpig[0] / args.CHW;
|
||||
// const int64_t chw = tgpig[0] % args.CHW;
|
||||
// const int64_t tgpig_0 = chw / KHW; // 0 ~ (IC - 1)
|
||||
// const int64_t HW = tgpig[0] % KHW;
|
||||
//
|
||||
// const int64_t tpitg_0 = (d * ntg[0]) + tpitg[0];
|
||||
// if (tpitg_0 >= args.N) {
|
||||
// return;
|
||||
// }
|
||||
//
|
||||
// const int64_t tpitg_1 = HW / args.KW;
|
||||
// const int64_t tpitg_2 = HW % args.KW;
|
||||
//
|
||||
// const int64_t iiw = tgpig[2] * args.s0 + tpitg_2 * args.d0 - args.p0;
|
||||
// const int64_t iih = tgpig[1] * args.s1 + tpitg_1 * args.d1 - args.p1;
|
||||
//
|
||||
// const int64_t offset_dst =
|
||||
// (tpitg_0 * tgpg[1] * tgpg[2] + tgpig[1] * tgpg[2] + tgpig[2]) * args.CHW +
|
||||
// (tgpig_0 * KHW + tpitg_1 * args.KW + tpitg_2);
|
||||
//
|
||||
// device T * pdst = (device T *) (dst);
|
||||
//
|
||||
// if (iih < 0 || iih >= args.IH || iiw < 0 || iiw >= args.IW) {
|
||||
// pdst[offset_dst] = 0.0f;
|
||||
// } else {
|
||||
// const int64_t offset_src = tpitg_0 * args.ofs0 + tgpig_0 * args.ofs1;
|
||||
// pdst[offset_dst] = x[offset_src + iih * args.IW + iiw];
|
||||
// }
|
||||
//}
|
||||
//
|
||||
//template [[host_name("kernel_im2col_ext_f32")]] kernel im2col_ext_t kernel_im2col_ext<float>;
|
||||
//template [[host_name("kernel_im2col_ext_f16")]] kernel im2col_ext_t kernel_im2col_ext<half>;
|
||||
|
||||
typedef void (conv_transpose_1d_t)(
|
||||
constant ggml_metal_kargs_conv_transpose_1d & args,
|
||||
|
||||
@@ -31,6 +31,12 @@
#include <filesystem>
#include <algorithm>

static const char * RPC_DEBUG = std::getenv("GGML_RPC_DEBUG");

#define LOG_DBG(...) \
    do { if (RPC_DEBUG) GGML_LOG_DEBUG(__VA_ARGS__); } while (0)


namespace fs = std::filesystem;

static constexpr size_t MAX_CHUNK_SIZE = 1024ull * 1024ull * 1024ull; // 1 GiB
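The new RPC debug logs are runtime-gated: they compile in unconditionally but print only when GGML_RPC_DEBUG is set in the environment. A minimal standalone reproduction of the pattern:

#include <cstdio>
#include <cstdlib>

// getenv() runs once at static-init time, so a disabled LOG_DBG costs a single
// pointer test per call site.
static const char * RPC_DEBUG = std::getenv("GGML_RPC_DEBUG");

#define LOG_DBG(...) \
    do { if (RPC_DEBUG) fprintf(stderr, __VA_ARGS__); } while (0)

int main() {
    LOG_DBG("only printed when GGML_RPC_DEBUG is set\n");
    return 0;
}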
|
||||
@@ -47,7 +53,7 @@ struct socket_t {
|
||||
sockfd_t fd;
|
||||
socket_t(sockfd_t fd) : fd(fd) {}
|
||||
~socket_t() {
|
||||
GGML_PRINT_DEBUG("[%s] closing socket %d\n", __func__, this->fd);
|
||||
LOG_DBG("[%s] closing socket %d\n", __func__, this->fd);
|
||||
#ifdef _WIN32
|
||||
closesocket(this->fd);
|
||||
#else
|
||||
@@ -265,14 +271,14 @@ static std::shared_ptr<socket_t> socket_connect(const char * host, int port) {
|
||||
return nullptr;
|
||||
}
|
||||
if (!set_no_delay(sockfd)) {
|
||||
fprintf(stderr, "Failed to set TCP_NODELAY\n");
|
||||
GGML_LOG_ERROR("Failed to set TCP_NODELAY\n");
|
||||
return nullptr;
|
||||
}
|
||||
addr.sin_family = AF_INET;
|
||||
addr.sin_port = htons(port);
|
||||
struct hostent * server = gethostbyname(host);
|
||||
if (server == NULL) {
|
||||
fprintf(stderr, "Cannot resolve host '%s'\n", host);
|
||||
GGML_LOG_ERROR("Cannot resolve host '%s'\n", host);
|
||||
return nullptr;
|
||||
}
|
||||
memcpy(&addr.sin_addr.s_addr, server->h_addr, server->h_length);
|
||||
@@ -289,7 +295,7 @@ static std::shared_ptr<socket_t> socket_accept(sockfd_t srv_sockfd) {
|
||||
return nullptr;
|
||||
}
|
||||
if (!set_no_delay(client_socket_fd)) {
|
||||
fprintf(stderr, "Failed to set TCP_NODELAY\n");
|
||||
GGML_LOG_ERROR("Failed to set TCP_NODELAY\n");
|
||||
return nullptr;
|
||||
}
|
||||
return client_socket;
|
||||
@@ -302,11 +308,11 @@ static std::shared_ptr<socket_t> create_server_socket(const char * host, int por
|
||||
return nullptr;
|
||||
}
|
||||
if (!set_reuse_addr(sockfd)) {
|
||||
fprintf(stderr, "Failed to set SO_REUSEADDR\n");
|
||||
GGML_LOG_ERROR("Failed to set SO_REUSEADDR\n");
|
||||
return nullptr;
|
||||
}
|
||||
if (inet_addr(host) == INADDR_NONE) {
|
||||
fprintf(stderr, "Invalid host address: %s\n", host);
|
||||
GGML_LOG_ERROR("Invalid host address: %s\n", host);
|
||||
return nullptr;
|
||||
}
|
||||
struct sockaddr_in serv_addr;
|
||||
@@ -349,7 +355,7 @@ static bool recv_data(sockfd_t sockfd, void * data, size_t size) {
|
||||
return false;
|
||||
}
|
||||
if (n == 0) {
|
||||
GGML_LOG_ERROR("recv returned 0 (peer closed?)\n");
|
||||
LOG_DBG("recv returned 0 (peer closed?)\n");
|
||||
return false;
|
||||
}
|
||||
bytes_recv += (size_t)n;
|
||||
@@ -383,7 +389,7 @@ static bool recv_msg(sockfd_t sockfd, std::vector<uint8_t> & input) {
|
||||
try {
|
||||
input.resize(size);
|
||||
} catch (const std::bad_alloc & e) {
|
||||
fprintf(stderr, "Failed to allocate input buffer of size %" PRIu64 "\n", size);
|
||||
GGML_LOG_ERROR("Failed to allocate input buffer of size %" PRIu64 "\n", size);
|
||||
return false;
|
||||
}
|
||||
return recv_data(sockfd, input.data(), size);
|
||||
@@ -443,11 +449,11 @@ static bool check_server_version(const std::shared_ptr<socket_t> & sock) {
|
||||
bool status = send_rpc_cmd(sock, RPC_CMD_HELLO, nullptr, 0, &response, sizeof(response));
|
||||
RPC_STATUS_ASSERT(status);
|
||||
if (response.major != RPC_PROTO_MAJOR_VERSION || response.minor > RPC_PROTO_MINOR_VERSION) {
|
||||
fprintf(stderr, "RPC server version mismatch: %d.%d.%d\n", response.major, response.minor, response.patch);
|
||||
GGML_LOG_ERROR("RPC server version mismatch: %d.%d.%d\n", response.major, response.minor, response.patch);
|
||||
return false;
|
||||
}
|
||||
if (response.minor != RPC_PROTO_MINOR_VERSION || response.patch != RPC_PROTO_PATCH_VERSION) {
|
||||
fprintf(stderr, "WARNING: RPC server version mismatch: %d.%d.%d\n", response.major, response.minor, response.patch);
|
||||
GGML_LOG_INFO("WARNING: RPC server version mismatch: %d.%d.%d\n", response.major, response.minor, response.patch);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
@@ -488,7 +494,7 @@ static std::shared_ptr<socket_t> get_socket(const std::string & endpoint) {
|
||||
if (!check_server_version(sock)) {
|
||||
return nullptr;
|
||||
}
|
||||
GGML_PRINT_DEBUG("[%s] connected to %s, sockfd=%d\n", __func__, endpoint.c_str(), sock->fd);
|
||||
LOG_DBG("[%s] connected to %s, sockfd=%d\n", __func__, endpoint.c_str(), sock->fd);
|
||||
sockets[endpoint] = sock;
|
||||
return sock;
|
||||
}
|
||||
@@ -809,7 +815,7 @@ ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint) {
|
||||
}
|
||||
auto sock = get_socket(endpoint);
|
||||
if (sock == nullptr) {
|
||||
fprintf(stderr, "Failed to connect to %s\n", endpoint);
|
||||
GGML_LOG_ERROR("Failed to connect to %s\n", endpoint);
|
||||
return nullptr;
|
||||
}
|
||||
size_t alignment = get_alignment(sock);
|
||||
@@ -909,7 +915,7 @@ void rpc_server::hello(rpc_msg_hello_rsp & response) {
|
||||
response.major = RPC_PROTO_MAJOR_VERSION;
|
||||
response.minor = RPC_PROTO_MINOR_VERSION;
|
||||
response.patch = RPC_PROTO_PATCH_VERSION;
|
||||
GGML_PRINT_DEBUG("[%s] version: %d.%d.%d\n", __func__, response.major, response.minor, response.patch);
|
||||
LOG_DBG("[%s] version: %d.%d.%d\n", __func__, response.major, response.minor, response.patch);
|
||||
}
|
||||
|
||||
bool rpc_server::get_alloc_size(const rpc_msg_get_alloc_size_req & request, rpc_msg_get_alloc_size_rsp & response) {
|
||||
@@ -929,7 +935,7 @@ bool rpc_server::get_alloc_size(const rpc_msg_get_alloc_size_req & request, rpc_
|
||||
GGML_LOG_ERROR("Null tensor pointer passed to server get_alloc_size function.\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
LOG_DBG("[%s] buffer: %p, data: %p\n", __func__, (void*)tensor->buffer, tensor->data);
|
||||
if (tensor->buffer == nullptr) {
|
||||
//No buffer allocated.
|
||||
buft = ggml_backend_get_default_buffer_type(backend);
|
||||
@@ -937,7 +943,7 @@ bool rpc_server::get_alloc_size(const rpc_msg_get_alloc_size_req & request, rpc_
|
||||
buft = tensor->buffer->buft;
|
||||
}
|
||||
|
||||
response.alloc_size = ggml_backend_buft_get_alloc_size(buft,tensor);
|
||||
response.alloc_size = ggml_backend_buft_get_alloc_size(buft, tensor);
|
||||
|
||||
return true;
|
||||
}
|
||||
@@ -950,29 +956,29 @@ void rpc_server::alloc_buffer(const rpc_msg_alloc_buffer_req & request, rpc_msg_
|
||||
if (buffer != nullptr) {
|
||||
response.remote_ptr = reinterpret_cast<uint64_t>(buffer);
|
||||
response.remote_size = buffer->size;
|
||||
GGML_PRINT_DEBUG("[%s] size: %" PRIu64 " -> remote_ptr: %" PRIx64 ", remote_size: %" PRIu64 "\n", __func__, request.size, response.remote_ptr, response.remote_size);
|
||||
LOG_DBG("[%s] size: %" PRIu64 " -> remote_ptr: %" PRIx64 ", remote_size: %" PRIu64 "\n", __func__, request.size, response.remote_ptr, response.remote_size);
|
||||
buffers.insert(buffer);
|
||||
} else {
|
||||
GGML_LOG_ERROR("[%s] size: %" PRIu64 " -> failed\n", __func__, request.size);
|
||||
LOG_DBG("[%s] size: %" PRIu64 " -> failed\n", __func__, request.size);
|
||||
}
|
||||
}
|
||||
|
||||
void rpc_server::get_alignment(rpc_msg_get_alignment_rsp & response) {
|
||||
ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend);
|
||||
size_t alignment = ggml_backend_buft_get_alignment(buft);
|
||||
GGML_PRINT_DEBUG("[%s] alignment: %lu\n", __func__, alignment);
|
||||
LOG_DBG("[%s] alignment: %lu\n", __func__, alignment);
|
||||
response.alignment = alignment;
|
||||
}
|
||||
|
||||
void rpc_server::get_max_size(rpc_msg_get_max_size_rsp & response) {
|
||||
ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend);
|
||||
size_t max_size = ggml_backend_buft_get_max_size(buft);
|
||||
GGML_PRINT_DEBUG("[%s] max_size: %lu\n", __func__, max_size);
|
||||
LOG_DBG("[%s] max_size: %lu\n", __func__, max_size);
|
||||
response.max_size = max_size;
|
||||
}
|
||||
|
||||
bool rpc_server::buffer_get_base(const rpc_msg_buffer_get_base_req & request, rpc_msg_buffer_get_base_rsp & response) {
|
||||
GGML_PRINT_DEBUG("[%s] remote_ptr: %" PRIx64 "\n", __func__, request.remote_ptr);
|
||||
LOG_DBG("[%s] remote_ptr: %" PRIx64 "\n", __func__, request.remote_ptr);
|
||||
ggml_backend_buffer_t buffer = reinterpret_cast<ggml_backend_buffer_t>(request.remote_ptr);
|
||||
if (buffers.find(buffer) == buffers.end()) {
|
||||
GGML_LOG_ERROR("[%s] buffer not found\n", __func__);
|
||||
@@ -984,7 +990,7 @@ bool rpc_server::buffer_get_base(const rpc_msg_buffer_get_base_req & request, rp
|
||||
}
|
||||
|
||||
bool rpc_server::free_buffer(const rpc_msg_free_buffer_req & request) {
|
||||
GGML_PRINT_DEBUG("[%s] remote_ptr: %" PRIx64 "\n", __func__, request.remote_ptr);
|
||||
LOG_DBG("[%s] remote_ptr: %" PRIx64 "\n", __func__, request.remote_ptr);
|
||||
ggml_backend_buffer_t buffer = reinterpret_cast<ggml_backend_buffer_t>(request.remote_ptr);
|
||||
if (buffers.find(buffer) == buffers.end()) {
|
||||
GGML_LOG_ERROR("[%s] buffer not found\n", __func__);
|
||||
@@ -996,7 +1002,7 @@ bool rpc_server::free_buffer(const rpc_msg_free_buffer_req & request) {
|
||||
}
|
||||
|
||||
bool rpc_server::buffer_clear(const rpc_msg_buffer_clear_req & request) {
|
||||
GGML_PRINT_DEBUG("[%s] remote_ptr: %" PRIx64 ", value: %u\n", __func__, request.remote_ptr, request.value);
|
||||
LOG_DBG("[%s] remote_ptr: %" PRIx64 ", value: %u\n", __func__, request.remote_ptr, request.value);
|
||||
ggml_backend_buffer_t buffer = reinterpret_cast<ggml_backend_buffer_t>(request.remote_ptr);
|
||||
if (buffers.find(buffer) == buffers.end()) {
|
||||
GGML_LOG_ERROR("[%s] buffer not found\n", __func__);
|
||||
@@ -1073,7 +1079,7 @@ bool rpc_server::set_tensor(const std::vector<uint8_t> & input) {
|
||||
GGML_LOG_ERROR("[%s] error deserializing tensor\n", __func__);
|
||||
return false;
|
||||
}
|
||||
GGML_PRINT_DEBUG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %zu\n", __func__, (void*)tensor->buffer, tensor->data, offset, size);
|
||||
LOG_DBG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %zu\n", __func__, (void*)tensor->buffer, tensor->data, offset, size);
|
||||
|
||||
// sanitize tensor->data
|
||||
{
|
||||
@@ -1096,7 +1102,7 @@ bool rpc_server::set_tensor(const std::vector<uint8_t> & input) {
|
||||
fs::path cache_file = fs::path(cache_dir) / hash_str;
|
||||
std::ofstream ofs(cache_file, std::ios::binary);
|
||||
ofs.write((const char *)data, size);
|
||||
printf("[%s] saved to '%s'\n", __func__, cache_file.c_str());
|
||||
GGML_LOG_INFO("[%s] saved to '%s'\n", __func__, cache_file.c_str());
|
||||
}
|
||||
ggml_backend_tensor_set(tensor, data, offset, size);
|
||||
return true;
|
||||
@@ -1142,8 +1148,8 @@ bool rpc_server::set_tensor_hash(const rpc_msg_set_tensor_hash_req & request, rp
|
||||
GGML_LOG_ERROR("[%s] error deserializing tensor\n", __func__);
|
||||
return false;
|
||||
}
|
||||
GGML_PRINT_DEBUG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %zu, hash: %" PRIx64 "\n",
|
||||
__func__, (void*)tensor->buffer, tensor->data, request.offset, size, request.hash);
|
||||
LOG_DBG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %zu, hash: %" PRIx64 "\n",
|
||||
__func__, (void*)tensor->buffer, tensor->data, request.offset, size, request.hash);
|
||||
|
||||
// sanitize tensor->data
|
||||
{
|
||||
@@ -1177,7 +1183,7 @@ bool rpc_server::init_tensor(const rpc_msg_init_tensor_req & request) {
|
||||
GGML_LOG_ERROR("Null tensor pointer passed to server init_tensor function.\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
LOG_DBG("[%s] buffer: %p, data: %p\n", __func__, (void*)tensor->buffer, tensor->data);
|
||||
// Call the backend's buffer_init_tensor function
|
||||
ggml_backend_buffer_t buffer = tensor->buffer;
|
||||
if (buffer && buffer->iface.init_tensor) {
|
||||
@@ -1210,7 +1216,7 @@ bool rpc_server::get_tensor(const rpc_msg_get_tensor_req & request, std::vector<
|
||||
GGML_LOG_ERROR("[%s] error deserializing tensor\n", __func__);
|
||||
return false;
|
||||
}
|
||||
GGML_PRINT_DEBUG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %" PRIu64 "\n", __func__, (void*)tensor->buffer, tensor->data, request.offset, request.size);
|
||||
LOG_DBG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %" PRIu64 "\n", __func__, (void*)tensor->buffer, tensor->data, request.offset, request.size);
|
||||
|
||||
// sanitize tensor->data
|
||||
{
|
||||
@@ -1254,7 +1260,7 @@ bool rpc_server::copy_tensor(const rpc_msg_copy_tensor_req & request, rpc_msg_co
|
||||
uint64_t dst_buf_sz = (uint64_t) ggml_backend_buffer_get_size(dst->buffer);
|
||||
|
||||
if (dst_data + src_size > dst_base + dst_buf_sz) {
|
||||
GGML_PRINT_DEBUG("[%s] out-of-bounds write in rpc_server::copy_tensor:\n"
|
||||
GGML_LOG_ERROR("[%s] out-of-bounds write in rpc_server::copy_tensor:\n"
|
||||
" write range : [0x%" PRIx64 ", 0x%" PRIx64 "]\n"
|
||||
" buffer base: [0x%" PRIx64 ", 0x%" PRIx64 "]\n",
|
||||
__func__,
|
||||
@@ -1265,8 +1271,8 @@ bool rpc_server::copy_tensor(const rpc_msg_copy_tensor_req & request, rpc_msg_co
|
||||
return false;
|
||||
}
|
||||
|
||||
GGML_PRINT_DEBUG("[%s] src->buffer: %p, dst->buffer: %p\n",
|
||||
__func__, (void*) src->buffer, (void*) dst->buffer);
|
||||
LOG_DBG("[%s] src->buffer: %p, dst->buffer: %p\n",
|
||||
__func__, (void*) src->buffer, (void*) dst->buffer);
|
||||
|
||||
response.result = ggml_backend_buffer_copy_tensor(src, dst);
|
||||
return true;
|
||||
@@ -1342,7 +1348,7 @@ bool rpc_server::graph_compute(const std::vector<uint8_t> & input, rpc_msg_graph
|
||||
return false;
|
||||
}
|
||||
const rpc_tensor * tensors = (const rpc_tensor *)(input.data() + sizeof(n_nodes) + n_nodes*sizeof(uint64_t) + sizeof(n_tensors));
|
||||
GGML_PRINT_DEBUG("[%s] n_nodes: %u, n_tensors: %u\n", __func__, n_nodes, n_tensors);
|
||||
LOG_DBG("[%s] n_nodes: %u, n_tensors: %u\n", __func__, n_nodes, n_tensors);
|
||||
|
||||
size_t buf_size = ggml_tensor_overhead()*(n_nodes + n_tensors) + ggml_graph_overhead_custom(n_nodes, false);
|
||||
|
||||
@@ -1394,7 +1400,7 @@ static void rpc_serve_client(ggml_backend_t backend, const char * cache_dir,
|
||||
}
|
||||
// the first command sent by the client must be HELLO
|
||||
if (cmd != RPC_CMD_HELLO) {
|
||||
fprintf(stderr, "Expected HELLO command, update client\n");
|
||||
GGML_LOG_ERROR("Expected HELLO command, update client\n");
|
||||
return;
|
||||
}
|
||||
if (!recv_msg(sockfd, nullptr, 0)) {
|
||||
@@ -1411,7 +1417,7 @@ static void rpc_serve_client(ggml_backend_t backend, const char * cache_dir,
|
||||
}
|
||||
if (cmd >= RPC_CMD_COUNT) {
|
||||
// fail fast if the command is invalid
|
||||
fprintf(stderr, "Unknown command: %d\n", cmd);
|
||||
GGML_LOG_ERROR("Unknown command: %d\n", cmd);
|
||||
break;
|
||||
}
|
||||
switch (cmd) {
|
||||
@@ -1599,7 +1605,7 @@ static void rpc_serve_client(ggml_backend_t backend, const char * cache_dir,
|
||||
break;
|
||||
}
|
||||
default: {
|
||||
fprintf(stderr, "Unknown command: %d\n", cmd);
|
||||
GGML_LOG_ERROR("Unknown command: %d\n", cmd);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1329,24 +1329,25 @@ extern "C" {
    //
    // Performance utils
    //
    // NOTE: Used by llama.cpp examples, avoid using in third-party apps. Instead, do your own performance measurements.
    // NOTE: Used by llama.cpp examples/tools, avoid using in third-party apps. Instead, do your own performance measurements.
    //

    struct llama_perf_context_data {
        double t_start_ms;
        double t_load_ms;
        double t_p_eval_ms;
        double t_eval_ms;
        // ms == milliseconds
        double t_start_ms;  // absolute start time
        double t_load_ms;   // time needed for loading the model
        double t_p_eval_ms; // time needed for processing the prompt
        double t_eval_ms;   // time needed for generating tokens

        int32_t n_p_eval;
        int32_t n_eval;
        int32_t n_reused; // number of times a ggml compute graph had been reused
        int32_t n_p_eval; // number of prompt tokens
        int32_t n_eval;   // number of generated tokens
        int32_t n_reused; // number of times a ggml compute graph had been reused
    };

    struct llama_perf_sampler_data {
        double t_sample_ms;
        double t_sample_ms; // time needed for sampling in ms

        int32_t n_sample;
        int32_t n_sample; // number of sampled tokens
    };

    LLAMA_API struct llama_perf_context_data llama_perf_context(const struct llama_context * ctx);
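The documented fields make it straightforward to derive throughput from the counters, for example (a usage sketch, assuming ctx is a valid llama_context):

#include "llama.h"
#include <cstdio>

static void print_speed(const struct llama_context * ctx) {
    const struct llama_perf_context_data d = llama_perf_context(ctx);

    if (d.t_p_eval_ms > 0.0) {
        printf("prompt processing: %.2f tok/s\n", 1e3*d.n_p_eval/d.t_p_eval_ms);
    }
    if (d.t_eval_ms > 0.0) {
        printf("generation:        %.2f tok/s\n", 1e3*d.n_eval/d.t_eval_ms);
    }
}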
@@ -1358,6 +1359,9 @@ extern "C" {
    LLAMA_API void llama_perf_sampler_print(const struct llama_sampler * chain);
    LLAMA_API void llama_perf_sampler_reset(      struct llama_sampler * chain);

    // print a breakdown of per-device memory use via LLAMA_LOG:
    LLAMA_API void llama_memory_breakdown_print(const struct llama_context * ctx);

    //
    // training
    //

@@ -721,6 +721,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
        { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
        { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
        { LLM_TENSOR_OUTPUT,          "output" },
        { LLM_TENSOR_CLS_OUT,         "cls.output" },
        { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
        { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
        { LLM_TENSOR_ATTN_Q_NORM,     "blk.%d.attn_q_norm" },

@@ -2027,6 +2027,21 @@ void llama_context::perf_reset() {
    n_reused = 0;
}

std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> llama_context::memory_breakdown() const {
    std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> ret;
    for (const auto & buft_size : model.memory_breakdown()) {
        ret[buft_size.first].model += buft_size.second;
    }
    for (const auto & buft_size : memory->memory_breakdown()) {
        ret[buft_size.first].context += buft_size.second;
    }
    for (const auto & backend_ptr : backends) {
        ggml_backend_t backend = backend_ptr.get();
        ret[ggml_backend_sched_get_buffer_type(sched.get(), backend)].compute += ggml_backend_sched_get_buffer_size(sched.get(), backend);
    }
    return ret;
}
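This per-buffer-type map is what the new llama_memory_breakdown_print() consumes: model, context and compute bytes are accumulated per device (plus host) and printed as a table. Typical use from application code (a sketch):

#include "llama.h"

// Call after the context is fully set up (e.g. after the first decode) so the
// compute buffers are allocated and show up in the breakdown.
static void report_memory(const struct llama_context * ctx) {
    llama_memory_breakdown_print(ctx); // one row per device, plus Host, via LLAMA_LOG
}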

//
// training
//

@@ -2765,6 +2780,142 @@ void llama_perf_context_reset(llama_context * ctx) {
|
||||
ctx->perf_reset();
|
||||
}
|
||||
|
||||
void llama_memory_breakdown_print(const struct llama_context * ctx) {
|
||||
const std::vector<ggml_backend_dev_t> & devices = ctx->get_model().devices;
|
||||
|
||||
std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> memory_breakdown = ctx->memory_breakdown();
|
||||
|
||||
std::vector<std::array<std::string, 9>> table_data;
|
||||
table_data.reserve(devices.size());
|
||||
const std::string template_header = "%s: | %s | %s %s %s %s %s %s %s |\n";
|
||||
const std::string template_gpu = "%s: | %s | %s = %s + (%s = %s + %s + %s) + %s |\n";
|
||||
const std::string template_other = "%s: | %s | %s %s %s = %s + %s + %s %s |\n";
|
||||
|
||||
table_data.push_back({template_header, "memory breakdown [MiB]", "total", "free", "self", "model", "context", "compute", "unaccounted"});
|
||||
|
||||
constexpr size_t MiB = 1024 * 1024;
|
||||
const std::vector<std::string> desc_prefixes_strip = {"NVIDIA ", "GeForce ", "Tesla ", "AMD ", "Radeon ", "Instinct "};
|
||||
|
||||
// track seen buffer types to avoid double counting:
|
||||
std::set<ggml_backend_buffer_type_t> seen_buffer_types;
|
||||
|
||||
// accumulative memory breakdown for each device and for host:
|
||||
std::vector<llama_memory_breakdown_data> mb_dev(devices.size());
|
||||
llama_memory_breakdown_data mb_host;
|
||||
|
||||
for (const auto & buft_mb : memory_breakdown) {
|
||||
ggml_backend_buffer_type_t buft = buft_mb.first;
|
||||
const llama_memory_breakdown_data & mb = buft_mb.second;
|
||||
if (ggml_backend_buft_is_host(buft)) {
|
||||
mb_host.model += mb.model;
|
||||
mb_host.context += mb.context;
|
||||
mb_host.compute += mb.compute;
|
||||
seen_buffer_types.insert(buft);
|
||||
continue;
|
||||
}
|
||||
ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
|
||||
if (dev) {
|
||||
int i_dev = -1;
|
||||
for (size_t i = 0; i < devices.size(); i++) {
|
||||
if (devices[i] == dev) {
|
||||
i_dev = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (i_dev != -1) {
|
||||
mb_dev[i_dev].model += mb.model;
|
||||
mb_dev[i_dev].context += mb.context;
|
||||
mb_dev[i_dev].compute += mb.compute;
|
||||
seen_buffer_types.insert(buft);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// print memory breakdown for each device:
|
||||
for (size_t i = 0; i < devices.size(); i++) {
|
||||
ggml_backend_dev_t dev = devices[i];
|
||||
llama_memory_breakdown_data mb = mb_dev[i];
|
||||
|
||||
const std::string name = ggml_backend_dev_name(dev);
|
||||
std::string desc = ggml_backend_dev_description(dev);
|
||||
for (const std::string & prefix : desc_prefixes_strip) {
|
||||
if (desc.length() >= prefix.length() && desc.substr(0, prefix.length()) == prefix) {
|
||||
desc = desc.substr(prefix.length());
|
||||
}
|
||||
}
|
||||
|
||||
size_t free, total;
|
||||
ggml_backend_dev_memory(dev, &free, &total);
|
||||
|
||||
const size_t self = mb.model + mb.context + mb.compute;
|
||||
const size_t unaccounted = total - self - free;
|
||||
|
||||
table_data.push_back({
|
||||
template_gpu,
|
||||
" - " + name + " (" + desc + ")",
|
||||
std::to_string(total / MiB),
|
||||
std::to_string(free / MiB),
|
||||
std::to_string(self / MiB),
|
||||
std::to_string(mb.model / MiB),
|
||||
std::to_string(mb.context / MiB),
|
||||
std::to_string(mb.compute / MiB),
|
||||
std::to_string(unaccounted / MiB)});
|
||||
}
|
||||
|
||||
// print memory breakdown for host:
|
||||
{
|
||||
const size_t self = mb_host.model + mb_host.context + mb_host.compute;
|
||||
table_data.push_back({
|
||||
template_other,
|
||||
" - Host",
|
||||
"", // total
|
||||
"", // free
|
||||
std::to_string(self / MiB),
|
||||
std::to_string(mb_host.model / MiB),
|
||||
std::to_string(mb_host.context / MiB),
|
||||
std::to_string(mb_host.compute / MiB),
|
||||
""}); // unaccounted
|
||||
}
|
||||
|
||||
// print memory breakdown for all remaining buffer types:
|
||||
for (const auto & buft_mb : memory_breakdown) {
|
||||
ggml_backend_buffer_type_t buft = buft_mb.first;
|
||||
const llama_memory_breakdown_data & mb = buft_mb.second;
|
||||
if (seen_buffer_types.count(buft) == 1) {
|
||||
continue;
|
||||
}
|
||||
const std::string name = ggml_backend_buft_name(buft);
|
||||
const size_t self = mb.model + mb.context + mb.compute;
|
||||
table_data.push_back({
|
||||
template_other,
|
||||
" - " + name,
|
||||
"", // total
|
||||
"", // free
|
||||
std::to_string(self / MiB),
|
||||
std::to_string(mb.model / MiB),
|
||||
std::to_string(mb.context / MiB),
|
||||
std::to_string(mb.compute / MiB),
|
||||
""}); // unaccounted
|
||||
seen_buffer_types.insert(buft);
|
||||
}
|
||||
|
||||
for (size_t j = 1; j < table_data[0].size(); j++) {
|
||||
size_t max_len = 0;
|
||||
for (const auto & td : table_data) {
|
||||
max_len = std::max(max_len, td[j].length());
|
||||
}
|
||||
for (auto & td : table_data) {
|
||||
td[j].insert(j == 1 ? td[j].length() : 0, max_len - td[j].length(), ' ');
|
||||
}
|
||||
}
|
||||
for (const auto & td : table_data) {
|
||||
LLAMA_LOG_INFO(td[0].c_str(),
|
||||
__func__, td[1].c_str(), td[2].c_str(), td[3].c_str(), td[4].c_str(), td[5].c_str(),
|
||||
td[6].c_str(), td[7].c_str(), td[8].c_str());
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// training
|
||||
//
|
||||
|
||||
@@ -17,9 +17,17 @@ class llama_batch_allocr;
class llama_io_read_i;
class llama_io_write_i;

// "memory" as in abstract memory for the context
struct llama_memory_i;
struct llama_memory_context_i;

// "memory" as in physical memory for a buffer type, in bytes
struct llama_memory_breakdown_data {
    size_t model   = 0; // memory allocated for the model
    size_t context = 0; // memory allocated for the context
    size_t compute = 0; // memory allocated for temporary compute buffers
};

struct llama_context {
    // init scheduler and compute buffers, reserve worst-case graphs
    llama_context(

@@ -144,6 +152,8 @@ struct llama_context {
    llama_perf_context_data perf_get_data() const;
    void perf_reset();

    std::map<ggml_backend_buffer_type_t, llama_memory_breakdown_data> memory_breakdown() const;

    //
    // training
    //

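(Not part of the diff: a small worked sketch of how the three counters above are combined by the printer later in this changeset; the helper names are hypothetical, and `total`/`free` stand for the values reported by ggml_backend_dev_memory() for the device that owns the buffer type.)

#include <cstddef>
// assumes llama_memory_breakdown_data as defined in the hunk above

static size_t breakdown_self(const llama_memory_breakdown_data & mb) {
    // everything this llama_context accounts for on the device
    return mb.model + mb.context + mb.compute;
}

static size_t breakdown_unaccounted(const llama_memory_breakdown_data & mb, size_t total, size_t free) {
    // device memory used by something else (other contexts, other processes, ...)
    return total - breakdown_self(mb) - free;
}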
@@ -204,7 +204,10 @@ void llm_graph_input_cls::set_input(const llama_ubatch * ubatch) {
|
||||
std::vector<int> target_pos(n_seqs_unq, -1);
|
||||
std::vector<int> target_row(n_seqs_unq, -1);
|
||||
|
||||
bool last = cparams.pooling_type == LLAMA_POOLING_TYPE_LAST;
|
||||
const bool last = (
|
||||
cparams.pooling_type == LLAMA_POOLING_TYPE_LAST ||
|
||||
(cparams.pooling_type == LLAMA_POOLING_TYPE_RANK && arch == LLM_ARCH_QWEN3) // qwen3 reranking & embedding models use last token
|
||||
);
|
||||
|
||||
for (int i = 0; i < n_tokens; ++i) {
|
||||
const llama_pos pos = ubatch->pos[i];
|
||||
@@ -1177,7 +1180,7 @@ ggml_tensor * llm_graph_context::build_inp_mean() const {
|
||||
}
|
||||
|
||||
ggml_tensor * llm_graph_context::build_inp_cls() const {
|
||||
auto inp = std::make_unique<llm_graph_input_cls>(cparams);
|
||||
auto inp = std::make_unique<llm_graph_input_cls>(cparams, arch);
|
||||
|
||||
auto & cur = inp->cls;
|
||||
|
||||
@@ -1877,34 +1880,32 @@ void llm_graph_context::build_pooling(
|
||||
case LLAMA_POOLING_TYPE_RANK:
|
||||
{
|
||||
ggml_tensor * inp_cls = build_inp_cls();
|
||||
inp = ggml_get_rows(ctx0, inp, inp_cls);
|
||||
cur = ggml_get_rows(ctx0, inp, inp_cls);
|
||||
|
||||
// classification head
|
||||
// https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/roberta/modeling_roberta.py#L1566
|
||||
if (cls) {
|
||||
// classification head
|
||||
// https://github.com/huggingface/transformers/blob/5af7d41e49bbfc8319f462eb45253dcb3863dfb7/src/transformers/models/roberta/modeling_roberta.py#L1566
|
||||
cur = ggml_mul_mat(ctx0, cls, inp);
|
||||
cur = ggml_mul_mat(ctx0, cls, cur);
|
||||
if (cls_b) {
|
||||
cur = ggml_add(ctx0, cur, cls_b);
|
||||
}
|
||||
cur = ggml_tanh(ctx0, cur);
|
||||
}
|
||||
|
||||
// some models don't have `cls_out`, for example: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
|
||||
// https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/blob/cb5347e43979c3084a890e3f99491952603ae1b7/modeling_bert.py#L884-L896
|
||||
if (cls_out) {
|
||||
cur = ggml_mul_mat(ctx0, cls_out, cur);
|
||||
if (cls_out_b) {
|
||||
cur = ggml_add(ctx0, cur, cls_out_b);
|
||||
}
|
||||
}
|
||||
} else if (cls_out) {
|
||||
// Single layer classification head (direct projection)
|
||||
// https://github.com/huggingface/transformers/blob/f4fc42216cd56ab6b68270bf80d811614d8d59e4/src/transformers/models/bert/modeling_bert.py#L1476
|
||||
cur = ggml_mul_mat(ctx0, cls_out, inp);
|
||||
// some models don't have `cls_out`, for example: https://huggingface.co/jinaai/jina-reranker-v1-tiny-en
|
||||
// https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/blob/cb5347e43979c3084a890e3f99491952603ae1b7/modeling_bert.py#L884-L896
|
||||
// Single layer classification head (direct projection)
|
||||
// https://github.com/huggingface/transformers/blob/f4fc42216cd56ab6b68270bf80d811614d8d59e4/src/transformers/models/bert/modeling_bert.py#L1476
|
||||
if (cls_out) {
|
||||
cur = ggml_mul_mat(ctx0, cls_out, cur);
|
||||
if (cls_out_b) {
|
||||
cur = ggml_add(ctx0, cur, cls_out_b);
|
||||
}
|
||||
} else {
|
||||
GGML_ABORT("RANK pooling requires either cls+cls_b or cls_out+cls_out_b");
|
||||
}
|
||||
|
||||
// softmax for qwen3 reranker
|
||||
if (arch == LLM_ARCH_QWEN3) {
|
||||
cur = ggml_soft_max(ctx0, cur);
|
||||
}
|
||||
} break;
|
||||
default:
|
||||
|
||||
@@ -206,7 +206,7 @@ public:
|
||||
|
||||
class llm_graph_input_cls : public llm_graph_input_i {
|
||||
public:
|
||||
llm_graph_input_cls(const llama_cparams & cparams) : cparams(cparams) {}
|
||||
llm_graph_input_cls(const llama_cparams & cparams, const llm_arch arch) : cparams(cparams), arch(arch) {}
|
||||
virtual ~llm_graph_input_cls() = default;
|
||||
|
||||
void set_input(const llama_ubatch * ubatch) override;
|
||||
@@ -214,6 +214,7 @@ public:
|
||||
ggml_tensor * cls; // I32 [n_batch]
|
||||
|
||||
const llama_cparams cparams;
|
||||
const llm_arch arch;
|
||||
};
|
||||
|
||||
class llm_graph_input_rs : public llm_graph_input_i {
|
||||
|
||||
@@ -113,6 +113,14 @@ llama_pos llama_kv_cache_iswa::seq_pos_max(llama_seq_id seq_id) const {
    return kv_swa->seq_pos_max(seq_id);
}

std::map<ggml_backend_buffer_type_t, size_t> llama_kv_cache_iswa::memory_breakdown() const {
    std::map<ggml_backend_buffer_type_t, size_t> mb = kv_base->memory_breakdown();
    for (const auto & buft_size : kv_swa->memory_breakdown()) {
        mb[buft_size.first] += buft_size.second;
    }
    return mb;
}

llama_memory_context_ptr llama_kv_cache_iswa::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
    GGML_UNUSED(embd_all);

@@ -56,6 +56,8 @@ public:
|
||||
llama_pos seq_pos_min(llama_seq_id seq_id) const override;
|
||||
llama_pos seq_pos_max(llama_seq_id seq_id) const override;
|
||||
|
||||
std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
|
||||
|
||||
// state write/load
|
||||
|
||||
void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
|
||||
|
||||
@@ -473,6 +473,14 @@ llama_pos llama_kv_cache::seq_pos_max(llama_seq_id seq_id) const {
    return cells.seq_pos_max(seq_id);
}

std::map<ggml_backend_buffer_type_t, size_t> llama_kv_cache::memory_breakdown() const {
    std::map<ggml_backend_buffer_type_t, size_t> ret;
    for (const ggml_backend_buffer_ptr & buf_ptr : bufs) {
        ret[ggml_backend_buffer_get_type(buf_ptr.get())] += ggml_backend_buffer_get_size(buf_ptr.get());
    }
    return ret;
}

llama_memory_context_ptr llama_kv_cache::init_batch(
        llama_batch_allocr & balloc,
        uint32_t n_ubatch,

@@ -121,6 +121,8 @@ public:
|
||||
llama_pos seq_pos_min(llama_seq_id seq_id) const override;
|
||||
llama_pos seq_pos_max(llama_seq_id seq_id) const override;
|
||||
|
||||
std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
|
||||
|
||||
// state write/load
|
||||
|
||||
void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
|
||||
|
||||
@@ -166,6 +166,14 @@ llama_pos llama_memory_hybrid::seq_pos_max(llama_seq_id seq_id) const {
    return std::min(mem_attn->seq_pos_max(seq_id), mem_recr->seq_pos_max(seq_id));
}

std::map<ggml_backend_buffer_type_t, size_t> llama_memory_hybrid::memory_breakdown() const {
    std::map<ggml_backend_buffer_type_t, size_t> mb = mem_attn->memory_breakdown();
    for (const auto & buft_size : mem_recr->memory_breakdown()) {
        mb[buft_size.first] += buft_size.second;
    }
    return mb;
}

void llama_memory_hybrid::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
    GGML_UNUSED(flags);

@@ -68,6 +68,8 @@ public:
|
||||
llama_pos seq_pos_min(llama_seq_id seq_id) const override;
|
||||
llama_pos seq_pos_max(llama_seq_id seq_id) const override;
|
||||
|
||||
std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
|
||||
|
||||
// state write/load
|
||||
|
||||
void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1, llama_state_seq_flags flags = 0) const override;
|
||||
|
||||
@@ -359,6 +359,14 @@ llama_pos llama_memory_recurrent::seq_pos_max(llama_seq_id seq_id) const {
|
||||
return result;
|
||||
}
|
||||
|
||||
std::map<ggml_backend_buffer_type_t, size_t> llama_memory_recurrent::memory_breakdown() const {
|
||||
std::map<ggml_backend_buffer_type_t, size_t> ret;
|
||||
for (const ggml_backend_buffer_ptr & buf_ptr : bufs) {
|
||||
ret[ggml_backend_buffer_get_type(buf_ptr.get())] += ggml_backend_buffer_get_size(buf_ptr.get());
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
llama_memory_context_ptr llama_memory_recurrent::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
|
||||
do {
|
||||
balloc.split_reset();
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
#include "llama-graph.h"
|
||||
#include "llama-memory.h"
|
||||
|
||||
#include <map>
|
||||
#include <set>
|
||||
#include <vector>
|
||||
|
||||
@@ -50,6 +51,8 @@ public:
|
||||
llama_pos seq_pos_min(llama_seq_id seq_id) const override;
|
||||
llama_pos seq_pos_max(llama_seq_id seq_id) const override;
|
||||
|
||||
std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const override;
|
||||
|
||||
bool prepare(const std::vector<llama_ubatch> & ubatches);
|
||||
|
||||
// find a contiguous slot of memory cells and emplace the ubatch there
|
||||
|
||||
@@ -2,6 +2,7 @@

#include "llama.h"

#include <map>
#include <memory>
#include <functional>

@@ -108,6 +109,8 @@ struct llama_memory_i {
    virtual llama_pos seq_pos_min(llama_seq_id seq_id) const = 0;
    virtual llama_pos seq_pos_max(llama_seq_id seq_id) const = 0;

    virtual std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const = 0;

    //
    // state write/read
    //

@@ -66,6 +66,7 @@ const char * llm_type_name(llm_type type) {
|
||||
case LLM_TYPE_1_7B: return "1.7B";
|
||||
case LLM_TYPE_1_8B: return "1.8B";
|
||||
case LLM_TYPE_2B: return "2B";
|
||||
case LLM_TYPE_2_6B: return "2.6B";
|
||||
case LLM_TYPE_2_8B: return "2.8B";
|
||||
case LLM_TYPE_2_9B: return "2.9B";
|
||||
case LLM_TYPE_3B: return "3B";
|
||||
@@ -1977,10 +1978,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||
for (uint32_t il = 0; il < hparams.n_layer; ++il) {
|
||||
hparams.recurrent_layer_arr[il] = hparams.n_head_kv(il) == 0;
|
||||
}
|
||||
switch (hparams.n_embd) {
|
||||
case 1024: type = LLM_TYPE_350M; break;
|
||||
case 1536: type = LLM_TYPE_700M; break;
|
||||
case 2048: type = LLM_TYPE_1_2B; break;
|
||||
switch (hparams.n_ff()) {
|
||||
case 4608: type = LLM_TYPE_350M; break;
|
||||
case 6912: type = LLM_TYPE_700M; break;
|
||||
case 8192: type = LLM_TYPE_1_2B; break;
|
||||
case 10752: type = LLM_TYPE_2_6B; break;
|
||||
default: type = LLM_TYPE_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
@@ -3165,6 +3167,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
|
||||
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
|
||||
}
|
||||
|
||||
// output rerank head
|
||||
cls_out = create_tensor(tn(LLM_TENSOR_CLS_OUT, "weight"), {n_embd, hparams.n_cls_out}, TENSOR_NOT_REQUIRED);
|
||||
|
||||
for (int i = 0; i < n_layer; ++i) {
|
||||
auto & layer = layers[i];
|
||||
|
||||
@@ -6003,6 +6008,14 @@ size_t llama_model::n_devices() const {
    return devices.size();
}

std::map<ggml_backend_buffer_type_t, size_t> llama_model::memory_breakdown() const {
    std::map<ggml_backend_buffer_type_t, size_t> ret;
    for (const ggml_backend_buffer_ptr & buf_ptr : pimpl->bufs) {
        ret[ggml_backend_buffer_get_type(buf_ptr.get())] += ggml_backend_buffer_get_size(buf_ptr.get());
    }
    return ret;
}

uint64_t llama_model::n_elements() const {
    return pimpl->n_elements;
}

@@ -7,6 +7,7 @@
|
||||
#include "llama-memory.h"
|
||||
#include "llama-vocab.h"
|
||||
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
@@ -58,6 +59,7 @@ enum llm_type {
|
||||
LLM_TYPE_1_7B,
|
||||
LLM_TYPE_1_8B,
|
||||
LLM_TYPE_2B,
|
||||
LLM_TYPE_2_6B,
|
||||
LLM_TYPE_2_8B,
|
||||
LLM_TYPE_2_9B,
|
||||
LLM_TYPE_3B,
|
||||
@@ -452,10 +454,12 @@ struct llama_model {
|
||||
|
||||
std::string desc() const;
|
||||
|
||||
size_t size() const;
|
||||
size_t size() const; // file size
|
||||
size_t n_tensors() const;
|
||||
size_t n_devices() const;
|
||||
|
||||
std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const;
|
||||
|
||||
// total number of parameters in the model
|
||||
uint64_t n_elements() const;
|
||||
|
||||
|
||||
@@ -219,3 +219,6 @@ target_link_libraries(${LLAMA_TEST_NAME} PRIVATE mtmd)
get_filename_component(TEST_TARGET test-c.c NAME_WE)
add_executable(${TEST_TARGET} test-c.c)
target_link_libraries(${TEST_TARGET} PRIVATE llama)

llama_build_and_test(test-alloc.cpp)
target_include_directories(test-alloc PRIVATE ${PROJECT_SOURCE_DIR}/ggml/src)

572 tests/test-alloc.cpp Normal file
@@ -0,0 +1,572 @@
|
||||
#include <ggml-alloc.h>
|
||||
#include <ggml-backend-impl.h>
|
||||
#include <ggml-cpp.h>
|
||||
#include <ggml-impl.h>
|
||||
#include <ggml.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <exception>
|
||||
#include <memory>
|
||||
#include <vector>
|
||||
|
||||
//
|
||||
// dummy backend with configurable max_buffer_size, tracks allocations
|
||||
|
||||
uint8_t * const alloc_base = (uint8_t *) 16;
|
||||
|
||||
struct dummy_backend_context {
|
||||
size_t max_buffer_size = 64;
|
||||
size_t alignment = 8;
|
||||
|
||||
ggml_backend_buffer_i buffer_interface;
|
||||
std::vector<ggml_backend_buffer_t> buffers;
|
||||
|
||||
size_t allocated_total() const {
|
||||
size_t n = 0;
|
||||
for (ggml_backend_buffer_t buf : buffers) {
|
||||
n += ggml_backend_buffer_get_size(buf);
|
||||
}
|
||||
return n;
|
||||
}
|
||||
};
|
||||
|
||||
// ggml_backend_buffer_type interface
|
||||
|
||||
static const char * dummy_backend_buffer_type_get_name(ggml_backend_buffer_type_t) {
|
||||
return "dummy_buffer_type";
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_t dummy_backend_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||
dummy_backend_context * ctx = (dummy_backend_context *) buft->context;
|
||||
ggml_backend_buffer_t & buffer = ctx->buffers.emplace_back();
|
||||
buffer = ggml_backend_buffer_init(buft, ctx->buffer_interface, ctx, size);
|
||||
return buffer;
|
||||
}
|
||||
|
||||
static size_t dummy_backend_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
||||
dummy_backend_context * ctx = (dummy_backend_context *) buft->context;
|
||||
return ctx->alignment;
|
||||
}
|
||||
|
||||
static size_t dummy_backend_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
|
||||
dummy_backend_context * ctx = (dummy_backend_context *) buft->context;
|
||||
return ctx->max_buffer_size;
|
||||
}
|
||||
|
||||
static bool dummy_backend_buffer_type_is_host(ggml_backend_buffer_type_t) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// ggml_backend_buffer interface
|
||||
|
||||
static void dummy_backend_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
dummy_backend_context * ctx = (dummy_backend_context *) buffer->context;
|
||||
|
||||
auto i = std::find(ctx->buffers.begin(), ctx->buffers.end(), buffer);
|
||||
GGML_ASSERT(i != ctx->buffers.end());
|
||||
ctx->buffers.erase(i);
|
||||
}
|
||||
|
||||
static void * dummy_backend_buffer_get_base(ggml_backend_buffer_t) {
|
||||
return alloc_base;
|
||||
}
|
||||
|
||||
static ggml_status dummy_backend_buffer_init_tensor(ggml_backend_buffer_t, ggml_tensor *) {
|
||||
return GGML_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
static void dummy_backend_buffer_memset_tensor(ggml_backend_buffer_t, ggml_tensor *, uint8_t, size_t, size_t) {}
|
||||
|
||||
static void dummy_backend_buffer_set_tensor(ggml_backend_buffer_t, ggml_tensor *, const void *, size_t, size_t) {}
|
||||
|
||||
static void dummy_backend_buffer_get_tensor(ggml_backend_buffer_t, const ggml_tensor *, void *, size_t, size_t) {}
|
||||
|
||||
static void dummy_backend_buffer_clear(ggml_backend_buffer_t, uint8_t) {}
|
||||
|
||||
// dummy_backend (not really a full backend, just provides what gallocr needs)
|
||||
|
||||
struct dummy_backend {
|
||||
std::unique_ptr<dummy_backend_context> context;
|
||||
ggml_backend_buffer_type buffer_type;
|
||||
};
|
||||
|
||||
static dummy_backend dummy_backend_init(size_t max_buffer_size, size_t alignment = 8) {
|
||||
dummy_backend b{};
|
||||
b.context = std::make_unique<dummy_backend_context>();
|
||||
b.context->alignment = alignment;
|
||||
b.context->max_buffer_size = max_buffer_size;
|
||||
|
||||
b.context->buffer_interface.free_buffer = dummy_backend_buffer_free_buffer;
|
||||
b.context->buffer_interface.get_base = dummy_backend_buffer_get_base;
|
||||
b.context->buffer_interface.init_tensor = dummy_backend_buffer_init_tensor;
|
||||
b.context->buffer_interface.memset_tensor = dummy_backend_buffer_memset_tensor;
|
||||
b.context->buffer_interface.set_tensor = dummy_backend_buffer_set_tensor;
|
||||
b.context->buffer_interface.get_tensor = dummy_backend_buffer_get_tensor;
|
||||
b.context->buffer_interface.clear = dummy_backend_buffer_clear;
|
||||
|
||||
b.buffer_type.context = b.context.get();
|
||||
b.buffer_type.iface.get_name = dummy_backend_buffer_type_get_name;
|
||||
b.buffer_type.iface.alloc_buffer = dummy_backend_buffer_type_alloc_buffer;
|
||||
b.buffer_type.iface.get_alignment = dummy_backend_buffer_type_get_alignment;
|
||||
b.buffer_type.iface.get_max_size = dummy_backend_buffer_type_get_max_size;
|
||||
b.buffer_type.iface.is_host = dummy_backend_buffer_type_is_host;
|
||||
return b;
|
||||
}
|
||||
|
||||
//
|
||||
// test utilities
|
||||
|
||||
struct test_context_with_graph {
|
||||
ggml_context * ctx;
|
||||
ggml_cgraph * graph;
|
||||
ggml_context_ptr ctx_ptr;
|
||||
};
|
||||
|
||||
static test_context_with_graph make_context() {
|
||||
ggml_init_params params{};
|
||||
params.mem_size = 48 * ggml_tensor_overhead() + ggml_graph_overhead();
|
||||
params.no_alloc = true;
|
||||
|
||||
ggml_context * ctx = ggml_init(params);
|
||||
ggml_context_ptr ctx_ptr = ggml_context_ptr(ctx);
|
||||
ggml_cgraph * graph = ggml_new_graph(ctx);
|
||||
return { ctx, graph, std::move(ctx_ptr) };
|
||||
}
|
||||
|
||||
static ggml_tensor * make_input_1d(ggml_context * ctx, int64_t n_elements) {
|
||||
ggml_tensor * t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
|
||||
ggml_set_input(t);
|
||||
return t;
|
||||
}
|
||||
|
||||
static ggml_tensor * make_input_with_size(ggml_context * ctx, size_t size_bytes) {
|
||||
GGML_ASSERT(size_bytes % 4 == 0);
|
||||
return make_input_1d(ctx, size_bytes / 4);
|
||||
}
|
||||
|
||||
static void assign_names(ggml_context * ctx, const char * prefix = "x") {
|
||||
int i = 0;
|
||||
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t; t = ggml_get_next_tensor(ctx, t)) {
|
||||
ggml_format_name(t, "%s%d", prefix, i++);
|
||||
}
|
||||
}
|
||||
|
||||
static int get_leaf_id(ggml_cgraph * graph, const char * tensor_name) {
|
||||
for (int i = 0; i < graph->n_leafs; ++i) {
|
||||
if (strncmp(graph->leafs[i]->name, tensor_name, GGML_MAX_NAME) == 0) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
fprintf(stderr, "leaf not found: %s\n", tensor_name);
|
||||
return -1;
|
||||
}
|
||||
|
||||
static int get_node_id(ggml_cgraph * graph, const char * tensor_name) {
|
||||
for (int i = 0; i < graph->n_nodes; ++i) {
|
||||
if (strncmp(graph->nodes[i]->name, tensor_name, GGML_MAX_NAME) == 0) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
fprintf(stderr, "node not found: %s", tensor_name);
|
||||
return -1;
|
||||
}
|
||||
|
||||
static ggml_gallocr_ptr allocate_graph(ggml_cgraph * graph, ggml_tensor * out, ggml_backend_buffer_type_t buft) {
|
||||
ggml_set_output(out);
|
||||
ggml_build_forward_expand(graph, out);
|
||||
|
||||
ggml_gallocr_ptr galloc = ggml_gallocr_ptr(ggml_gallocr_new(buft));
|
||||
bool result = ggml_gallocr_alloc_graph(galloc.get(), graph);
|
||||
GGML_ASSERT(result);
|
||||
return galloc;
|
||||
}
|
||||
|
||||
//
|
||||
// correctness checks for result allocations
|
||||
|
||||
static void check_all_allocated(ggml_cgraph * graph) {
|
||||
for (int i = 0; i < ggml_graph_n_nodes(graph); ++i) {
|
||||
ggml_tensor * t = ggml_graph_node(graph, i);
|
||||
GGML_ASSERT(t->buffer != nullptr);
|
||||
GGML_ASSERT(t->data != nullptr);
|
||||
}
|
||||
}
|
||||
|
||||
static void check_max_size(ggml_context * ctx) {
|
||||
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t; t = ggml_get_next_tensor(ctx, t)) {
|
||||
auto buft = ggml_backend_buffer_get_type(t->buffer);
|
||||
size_t max_size = ggml_backend_buft_get_max_size(buft);
|
||||
size_t offset = (char *) t->data - (char *) ggml_backend_buffer_get_base(t->buffer);
|
||||
GGML_ASSERT(t->data >= ggml_backend_buffer_get_base(t->buffer));
|
||||
GGML_ASSERT((size_t) offset + ggml_nbytes(t) <= max_size);
|
||||
}
|
||||
}
|
||||
|
||||
static bool can_reuse_memory(ggml_cgraph * graph, int current_i, ggml_tensor * current, ggml_tensor * other) {
|
||||
if (other->flags & GGML_TENSOR_FLAG_OUTPUT) {
|
||||
return false;
|
||||
}
|
||||
// Check if `other` is still "alive", ie. an input to any node after the `current` op
|
||||
for (int i = current_i; i < ggml_graph_n_nodes(graph); ++i) {
|
||||
ggml_tensor * t = ggml_graph_node(graph, i);
|
||||
for (int s = 0; s < GGML_MAX_SRC; s++) {
|
||||
if (t == current && ggml_op_can_inplace(t->op)) {
|
||||
continue;
|
||||
}
|
||||
if (t->src[s] == other) {
|
||||
return false;
|
||||
}
|
||||
if (t->src[s] && t->src[s]->view_src == other) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool memory_overlap(ggml_tensor * a, ggml_tensor * b) {
|
||||
if (a->buffer != b->buffer) {
|
||||
return false;
|
||||
}
|
||||
int64_t a0 = (int64_t) a->data;
|
||||
int64_t a1 = a0 + ggml_nbytes(a);
|
||||
int64_t b0 = (int64_t) b->data;
|
||||
int64_t b1 = b0 + ggml_nbytes(b);
|
||||
return a1 > b0 && b1 > a0;
|
||||
}
|
||||
|
||||
static ggml_tensor * get_view_source(ggml_tensor * t) {
|
||||
while (t->view_src) {
|
||||
t = t->view_src;
|
||||
}
|
||||
return t;
|
||||
}
|
||||
|
||||
static void check_no_overlap(ggml_cgraph * graph) {
|
||||
for (int i = 0; i < ggml_graph_n_nodes(graph); ++i) {
|
||||
for (int j = 0; j < i; ++j) {
|
||||
ggml_tensor * t = ggml_graph_node(graph, i);
|
||||
ggml_tensor * o = ggml_graph_node(graph, j);
|
||||
GGML_ASSERT(t != o);
|
||||
|
||||
if (get_view_source(t) == get_view_source(o)) {
|
||||
continue;
|
||||
}
|
||||
if (memory_overlap(t, o)) {
|
||||
GGML_ASSERT(can_reuse_memory(graph, i, t, o));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// test cases
|
||||
|
||||
// Scenario where the first backend buffer is completely exhausted and there are further
|
||||
// tensors which require a second buffer
|
||||
static void test_max_size_too_many_tensors() {
|
||||
dummy_backend backend = dummy_backend_init(16);
|
||||
auto [ctx, graph, ctx_ptr] = make_context();
|
||||
|
||||
ggml_tensor * x[7];
|
||||
x[0] = make_input_with_size(ctx, 8);
|
||||
x[1] = make_input_with_size(ctx, 8);
|
||||
x[2] = make_input_with_size(ctx, 8);
|
||||
x[3] = ggml_mul(ctx, x[0], x[1]);
|
||||
x[4] = ggml_add(ctx, x[1], x[2]);
|
||||
x[5] = ggml_add(ctx, x[3], x[0]);
|
||||
x[6] = ggml_add(ctx, x[4], x[5]);
|
||||
assign_names(ctx);
|
||||
|
||||
ggml_gallocr_ptr galloc = allocate_graph(graph, x[6], &backend.buffer_type);
|
||||
check_all_allocated(graph);
|
||||
check_no_overlap(graph);
|
||||
check_max_size(ctx);
|
||||
GGML_ASSERT(backend.context->allocated_total() <= 16 + 16);
|
||||
}
|
||||
|
||||
// Scenario where there is some space left in the first buffer, but not enough to accomodate
|
||||
// a larger tensor, so a second buffer is required
|
||||
static void test_max_size_tensor_too_large() {
|
||||
dummy_backend backend = dummy_backend_init(32);
|
||||
auto [ctx, graph, ctx_ptr] = make_context();
|
||||
|
||||
ggml_tensor * x[3];
|
||||
x[0] = make_input_with_size(ctx, 16); // chunk 0, [0 , 16)
|
||||
x[1] = make_input_with_size(ctx, 8); // chunk 0, [16, 24)
|
||||
x[2] = ggml_concat(ctx, x[0], x[1], 0); // chunk 1, [0 , 24)
|
||||
assign_names(ctx);
|
||||
|
||||
ggml_gallocr_ptr galloc = allocate_graph(graph, x[2], &backend.buffer_type);
|
||||
check_all_allocated(graph);
|
||||
check_no_overlap(graph);
|
||||
check_max_size(ctx);
|
||||
GGML_ASSERT(backend.context->allocated_total() <= 32 + 24);
|
||||
}
|
||||
|
||||
// Scenario where a single tensor exceeds the max buffer size - in this case the allocator
|
||||
// should try to create a bigger buffer anyway, and wait for the backend to throw an error.
|
||||
// Backends may report an artificially lower max size in some cases for compatibility reasons.
|
||||
static void test_tensor_larger_than_max_size() {
|
||||
dummy_backend backend = dummy_backend_init(16);
|
||||
auto [ctx, graph, ctx_ptr] = make_context();
|
||||
|
||||
ggml_tensor * x[2];
|
||||
x[0] = make_input_with_size(ctx, 24);
|
||||
x[1] = ggml_scale(ctx, x[0], 2.0f);
|
||||
assign_names(ctx);
|
||||
|
||||
ggml_gallocr_ptr galloc = allocate_graph(graph, x[1], &backend.buffer_type);
|
||||
check_all_allocated(graph);
|
||||
check_no_overlap(graph);
|
||||
GGML_ASSERT(backend.context->allocated_total() == 24);
|
||||
}
|
||||
|
||||
// This test assumes a max of 16 buffer chunks, and tries to allocate tensors that would
|
||||
// require more. Expectation is that the last buffer should grow to fit everything,
|
||||
// leaving it to the backend to error out if it can't allocate that much.
|
||||
static void test_not_enough_chunks() {
|
||||
const int max_chunks = 16;
|
||||
const int max_size = 8;
|
||||
|
||||
dummy_backend backend = dummy_backend_init(max_size);
|
||||
auto [ctx, graph, ctx_ptr] = make_context();
|
||||
|
||||
ggml_tensor * x[max_chunks + 1];
|
||||
for (int i = 0; i < max_chunks + 1; ++i) {
|
||||
x[i] = make_input_with_size(ctx, max_size);
|
||||
}
|
||||
ggml_tensor * acc = x[0];
|
||||
for (int i = 0; i < max_chunks; ++i) {
|
||||
acc = ggml_add(ctx, acc, x[i + 1]);
|
||||
}
|
||||
assign_names(ctx);
|
||||
|
||||
ggml_gallocr_ptr galloc = allocate_graph(graph, acc, &backend.buffer_type);
|
||||
check_all_allocated(graph);
|
||||
check_no_overlap(graph);
|
||||
GGML_ASSERT(backend.context->allocated_total() > max_chunks * max_size);
|
||||
}
|
||||
|
||||
// Fill up leftover unallocated space of a chunk after allocating a large tensor that
|
||||
// requires a new chunk.
|
||||
static void test_fill_leftover_space() {
|
||||
dummy_backend backend = dummy_backend_init(16);
|
||||
auto [ctx, graph, ctx_ptr] = make_context();
|
||||
|
||||
ggml_tensor * x[4];
|
||||
x[0] = make_input_with_size(ctx, 8);
|
||||
x[1] = ggml_pad(ctx, x[0], 2, 0, 0, 0);
|
||||
x[3] = ggml_mean(ctx, x[1]);
|
||||
assign_names(ctx);
|
||||
|
||||
ggml_gallocr_ptr galloc = allocate_graph(graph, x[3], &backend.buffer_type);
|
||||
check_all_allocated(graph);
|
||||
check_no_overlap(graph);
|
||||
check_max_size(ctx);
|
||||
GGML_ASSERT(backend.context->allocated_total() <= 12 + 16);
|
||||
}
|
||||
|
||||
// Check that views don't require any extra memory
|
||||
static void test_view_inplace() {
|
||||
dummy_backend backend = dummy_backend_init(32);
|
||||
auto [ctx, graph, ctx_ptr] = make_context();
|
||||
|
||||
ggml_tensor * x[6];
|
||||
x[0] = make_input_1d(ctx, 4); // chunk 0, [0, 16)
|
||||
x[1] = ggml_reshape_2d(ctx, x[0], 2, 2); // view of x0
|
||||
x[2] = ggml_permute(ctx, x[1], 1, 0, 2, 3); // view of x0
|
||||
x[3] = ggml_view_1d(ctx, x[2], 2, 4); // view of x0
|
||||
x[4] = make_input_1d(ctx, 2); // chunk 0, [16, 24)
|
||||
x[5] = ggml_add(ctx, x[3], x[4]); // reuse (inplace add)
|
||||
assign_names(ctx);
|
||||
|
||||
ggml_gallocr_ptr galloc = allocate_graph(graph, x[5], &backend.buffer_type);
|
||||
check_all_allocated(graph);
|
||||
check_no_overlap(graph);
|
||||
check_max_size(ctx);
|
||||
GGML_ASSERT(backend.context->allocated_total() <= 24);
|
||||
}
|
||||
|
||||
static void test_reuse_and_free() {
|
||||
dummy_backend backend = dummy_backend_init(40);
|
||||
auto [ctx, graph, ctx_ptr] = make_context();
|
||||
|
||||
ggml_tensor * x[9];
|
||||
x[0] = make_input_with_size(ctx, 24);
|
||||
x[1] = make_input_with_size(ctx, 8);
|
||||
x[2] = make_input_with_size(ctx, 8);
|
||||
x[3] = ggml_add(ctx, x[1], x[2]); // reuse, free x2
|
||||
x[4] = ggml_pad(ctx, x[0], 2, 0, 0, 0); // alloc new buffer, free x0
|
||||
x[5] = ggml_scale(ctx, x[4], 2.0f); // alloc from free block
|
||||
x[6] = ggml_add(ctx, x[4], x[5]); // reuse, free x5
|
||||
x[7] = ggml_view_1d(ctx, x[6], 2, 8); // view
|
||||
x[8] = ggml_add(ctx, x[3], x[7]); // reuse
|
||||
assign_names(ctx);
|
||||
|
||||
ggml_gallocr_ptr galloc = allocate_graph(graph, x[8], &backend.buffer_type);
|
||||
check_all_allocated(graph);
|
||||
check_no_overlap(graph);
|
||||
check_max_size(ctx);
|
||||
GGML_ASSERT(backend.context->allocated_total() <= 40 + 32 + 32);
|
||||
}
|
||||
|
||||
static void test_merge_free_block(size_t max_buffer_size) {
|
||||
dummy_backend backend = dummy_backend_init(max_buffer_size);
|
||||
auto [ctx, graph, ctx_ptr] = make_context();
|
||||
|
||||
ggml_tensor * x[9];
|
||||
x[0] = make_input_with_size(ctx, 16);
|
||||
x[1] = make_input_with_size(ctx, 16);
|
||||
x[2] = make_input_with_size(ctx, 16);
|
||||
x[3] = ggml_mean(ctx, x[0]);
|
||||
x[4] = ggml_mean(ctx, x[1]);
|
||||
x[5] = ggml_pad(ctx, x[2], 2, 0, 0, 0);
|
||||
x[6] = ggml_add(ctx, x[3], x[4]);
|
||||
x[7] = ggml_pad(ctx, x[6], 5, 0, 0, 0);
|
||||
x[8] = ggml_add(ctx, x[5], x[7]);
|
||||
assign_names(ctx);
|
||||
|
||||
ggml_gallocr_ptr galloc = allocate_graph(graph, x[8], &backend.buffer_type);
|
||||
check_all_allocated(graph);
|
||||
check_no_overlap(graph);
|
||||
check_max_size(ctx);
|
||||
GGML_ASSERT(backend.context->allocated_total() <= 32 + 32 + 24);
|
||||
}
|
||||
|
||||
// Check that previously allocated but freed memory is preferred over allocating
|
||||
// additional memory, even if the remaining space in a chunk would match tensor size better
|
||||
static void test_prefer_already_allocated_memory() {
|
||||
dummy_backend backend = dummy_backend_init(32, /*align*/ 4);
|
||||
auto [ctx, graph, ctx_ptr] = make_context();
|
||||
|
||||
ggml_tensor * x[3];
|
||||
x[0] = make_input_with_size(ctx, 24); // [24b][8b unused]
|
||||
x[1] = ggml_mean(ctx, x[0]); // [24b free][4b][4b unused]
|
||||
x[2] = ggml_mean(ctx, x[1]); // should be allocated in the 24b block
|
||||
assign_names(ctx);
|
||||
|
||||
ggml_gallocr_ptr galloc = allocate_graph(graph, x[2], &backend.buffer_type);
|
||||
check_all_allocated(graph);
|
||||
check_no_overlap(graph);
|
||||
GGML_ASSERT(backend.context->allocated_total() <= 28);
|
||||
}
|
||||
|
||||
// test for allocating on multiple devices with some tensors in the graph
|
||||
// allocated externally (not by gallocr).
|
||||
static void test_multiple_buffer_types() {
|
||||
dummy_backend backend_a = dummy_backend_init(32);
|
||||
dummy_backend backend_b = dummy_backend_init(SIZE_MAX);
|
||||
|
||||
auto [ctx_a, _a, ctx_a_ptr] = make_context();
|
||||
auto [ctx_b, _b, ctx_b_ptr] = make_context();
|
||||
auto [ctx, graph, ctx_ptr] = make_context();
|
||||
|
||||
ggml_tensor * a[2];
|
||||
a[0] = make_input_with_size(ctx_a, 16);
|
||||
a[1] = make_input_with_size(ctx_a, 16);
|
||||
assign_names(ctx_a, "a");
|
||||
|
||||
ggml_tensor * b[2];
|
||||
b[0] = make_input_with_size(ctx_b, 24);
|
||||
b[1] = make_input_with_size(ctx_b, 4);
|
||||
assign_names(ctx_b, "b");
|
||||
|
||||
ggml_tensor * x[9];
|
||||
x[0] = make_input_with_size(ctx, 16);
|
||||
x[1] = ggml_mul(ctx, x[0], a[0]);
|
||||
x[2] = ggml_pad(ctx, x[1], 2, 0, 0, 0);
|
||||
x[3] = ggml_mul(ctx, x[2], b[0]);
|
||||
x[4] = ggml_mean(ctx, x[3]);
|
||||
x[5] = ggml_add(ctx, x[4], b[1]);
|
||||
x[6] = ggml_pad(ctx, x[5], 3, 0, 0, 0);
|
||||
x[7] = ggml_add(ctx, x[6], a[1]);
|
||||
x[8] = ggml_scale(ctx, x[7], 2.0f);
|
||||
assign_names(ctx, "x");
|
||||
|
||||
ggml_backend_buffer_ptr buf_a(ggml_backend_alloc_ctx_tensors_from_buft(ctx_a, &backend_a.buffer_type));
|
||||
ggml_backend_buffer_ptr buf_b(ggml_backend_alloc_ctx_tensors_from_buft(ctx_b, &backend_b.buffer_type));
|
||||
ggml_backend_buffer_type_t bufts[2] = { &backend_a.buffer_type, &backend_b.buffer_type };
|
||||
|
||||
// assign buffer types manually to avoid extra complexity from backend scheduler
|
||||
ggml_set_output(x[8]);
|
||||
ggml_build_forward_expand(graph, x[8]);
|
||||
|
||||
GGML_ASSERT(graph->n_leafs == 5);
|
||||
int leaf_buffer_ids[5];
|
||||
leaf_buffer_ids[get_leaf_id(graph, "a0")] = 0;
|
||||
leaf_buffer_ids[get_leaf_id(graph, "a1")] = 0;
|
||||
leaf_buffer_ids[get_leaf_id(graph, "b0")] = 1;
|
||||
leaf_buffer_ids[get_leaf_id(graph, "b1")] = 1;
|
||||
leaf_buffer_ids[get_leaf_id(graph, "x0")] = 0;
|
||||
|
||||
GGML_ASSERT(graph->n_nodes == 8);
|
||||
int node_buffer_ids[8];
|
||||
node_buffer_ids[get_node_id(graph, "x1")] = 0;
|
||||
node_buffer_ids[get_node_id(graph, "x2")] = 0;
|
||||
node_buffer_ids[get_node_id(graph, "x3")] = 1;
|
||||
node_buffer_ids[get_node_id(graph, "x4")] = 1;
|
||||
node_buffer_ids[get_node_id(graph, "x5")] = 1;
|
||||
node_buffer_ids[get_node_id(graph, "x6")] = 1;
|
||||
node_buffer_ids[get_node_id(graph, "x7")] = 0;
|
||||
node_buffer_ids[get_node_id(graph, "x8")] = 0;
|
||||
|
||||
ggml_gallocr_ptr galloc(ggml_gallocr_new_n(bufts, 2));
|
||||
ggml_gallocr_reserve_n(galloc.get(), graph, node_buffer_ids, leaf_buffer_ids);
|
||||
ggml_gallocr_alloc_graph(galloc.get(), graph);
|
||||
|
||||
check_all_allocated(graph);
|
||||
check_no_overlap(graph);
|
||||
check_max_size(ctx);
|
||||
GGML_ASSERT(backend_a.context->allocated_total() <= 32 + 32 + 24);
|
||||
GGML_ASSERT(backend_b.context->allocated_total() <= 32 + 24);
|
||||
}
|
||||
|
||||
static void test_buffer_size_zero() {
|
||||
dummy_backend backend_a = dummy_backend_init(SIZE_MAX);
|
||||
dummy_backend backend_b = dummy_backend_init(SIZE_MAX);
|
||||
auto [ctx, graph, ctx_ptr] = make_context();
|
||||
|
||||
ggml_tensor * x[2];
|
||||
x[0] = make_input_with_size(ctx, 16);
|
||||
x[1] = ggml_scale(ctx, x[0], 2.0f);
|
||||
|
||||
ggml_set_output(x[1]);
|
||||
ggml_build_forward_expand(graph, x[1]);
|
||||
|
||||
int leaf_buffer_ids[1] = { 0 };
|
||||
int node_buffer_ids[1] = { 0 };
|
||||
|
||||
ggml_backend_buffer_type_t bufts[2] = { &backend_a.buffer_type, &backend_b.buffer_type };
|
||||
ggml_gallocr_ptr galloc = ggml_gallocr_ptr(ggml_gallocr_new_n(bufts, 2));
|
||||
bool res1 = ggml_gallocr_reserve_n(galloc.get(), graph, node_buffer_ids, leaf_buffer_ids);
|
||||
bool res2 = ggml_gallocr_alloc_graph(galloc.get(), graph);
|
||||
GGML_ASSERT(res1 && res2);
|
||||
|
||||
check_all_allocated(graph);
|
||||
GGML_ASSERT(backend_a.context->allocated_total() == 16);
|
||||
GGML_ASSERT(backend_b.context->allocated_total() == 0);
|
||||
}
|
||||
|
||||
static void run(const char * name, void (*f)()) {
|
||||
printf("%s ", name);
|
||||
fflush(stdout);
|
||||
f();
|
||||
printf("PASSED\n");
|
||||
}
|
||||
|
||||
int main() {
|
||||
run("test_max_size_too_many_tensors", test_max_size_too_many_tensors);
|
||||
run("test_max_size_tensor_too_large", test_max_size_tensor_too_large);
|
||||
run("test_tensor_larger_than_max_size", test_tensor_larger_than_max_size);
|
||||
run("test_not_enough_chunks", test_not_enough_chunks);
|
||||
run("test_fill_leftover_space", test_fill_leftover_space);
|
||||
run("test_view_inplace", test_view_inplace);
|
||||
run("test_reuse_and_free", test_reuse_and_free);
|
||||
run("test_merge_free_block(32)", []() { test_merge_free_block(32); });
|
||||
run("test_merge_free_block(SIZE_MAX)", []() { test_merge_free_block(SIZE_MAX); });
|
||||
run("test_prefer_already_allocated_memory", test_prefer_already_allocated_memory);
|
||||
run("test_multiple_buffer_types", test_multiple_buffer_types);
|
||||
run("test_buffer_size_zero", test_buffer_size_zero);
|
||||
return 0;
|
||||
}
|
||||
@@ -6117,7 +6117,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
||||
test_cases.emplace_back(new test_l2_norm (GGML_TYPE_F32, {64, 5, 4, 3}, eps));
|
||||
}
|
||||
for (float eps : {0.0f, 1e-6f, 1e-4f, 1e-1f, 1.0f}) {
|
||||
test_cases.emplace_back(new test_rms_norm_mul_add(GGML_TYPE_F32, {64, 5, 4, 3}, eps));
|
||||
test_cases.emplace_back(new test_rms_norm_mul_add(GGML_TYPE_F32, {64, 5, 4, 3}, eps, false));
|
||||
test_cases.emplace_back(new test_rms_norm_mul_add(GGML_TYPE_F32, {64, 5, 4, 3}, eps, true));
|
||||
test_cases.emplace_back(new test_norm_mul_add(GGML_TYPE_F32, {64, 5, 4, 3}, eps, false));
|
||||
test_cases.emplace_back(new test_norm_mul_add(GGML_TYPE_F32, {64, 5, 4, 3}, eps, true));
|
||||
|
||||
@@ -260,14 +260,7 @@ int main(int argc, char * argv[]) {
|
||||
|
||||
int64_t iterations = params.iterations;
|
||||
|
||||
|
||||
// Initialize GGML, ensures float conversion tables are initialized
|
||||
struct ggml_init_params ggml_params = {
|
||||
/* .mem_size = */ 1*1024,
|
||||
/* .mem_buffer = */ NULL,
|
||||
/* .no_alloc = */ true,
|
||||
};
|
||||
struct ggml_context * ctx = ggml_init(ggml_params);
|
||||
ggml_cpu_init();
|
||||
|
||||
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
|
||||
ggml_type type = (ggml_type) i;
|
||||
@@ -359,7 +352,5 @@ int main(int argc, char * argv[]) {
|
||||
}
|
||||
}
|
||||
|
||||
ggml_free(ctx);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -2060,6 +2060,7 @@ int main(int argc, char ** argv) {

    LOG("\n");
    llama_perf_context_print(ctx);
    llama_memory_breakdown_print(ctx);

    llama_backend_free();

@@ -5093,21 +5093,15 @@ int main(int argc, char ** argv) {
        return;
    }

    std::vector<server_tokens> tokenized_queries = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, query, /* add_special */ false, true);
    if (tokenized_queries.size() != 1) {
        res_error(res, format_error_response("\"query\" must contain only a single prompt", ERROR_TYPE_INVALID_REQUEST));
    }

    // create and queue the task
    json responses = json::array();
    bool error = false;
    std::unordered_set<int> task_ids;
    {
        std::vector<server_task> tasks;
        auto tokenized_docs = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, documents, /* add_special */ false, true);
        tasks.reserve(tokenized_docs.size());
        for (size_t i = 0; i < tokenized_docs.size(); i++) {
            auto tmp = format_rerank(ctx_server.vocab, tokenized_queries[0], tokenized_docs[i]);
        tasks.reserve(documents.size());
        for (size_t i = 0; i < documents.size(); i++) {
            auto tmp = format_rerank(ctx_server.model, ctx_server.vocab, ctx_server.mctx, query, documents[i]);
            server_task task = server_task(SERVER_TASK_TYPE_RERANK);
            task.id = ctx_server.queue_tasks.get_new_id();
            task.index = i;

@@ -1368,34 +1368,6 @@ static std::string fnv_hash(const uint8_t * data, size_t len) {
|
||||
return std::to_string(hash);
|
||||
}
|
||||
|
||||
|
||||
// format rerank task: [BOS]query[EOS][SEP]doc[EOS].
|
||||
static server_tokens format_rerank(const struct llama_vocab * vocab, server_tokens & query, server_tokens & doc) {
|
||||
server_tokens result = {};
|
||||
|
||||
// Get EOS token - use SEP token as fallback if EOS is not available
|
||||
llama_token eos_token = llama_vocab_eos(vocab);
|
||||
if (eos_token == LLAMA_TOKEN_NULL) {
|
||||
eos_token = llama_vocab_sep(vocab);
|
||||
}
|
||||
if (llama_vocab_get_add_bos(vocab)) {
|
||||
result.push_back(llama_vocab_bos(vocab));
|
||||
}
|
||||
result.push_back(query);
|
||||
if (llama_vocab_get_add_eos(vocab)) {
|
||||
result.push_back(eos_token);
|
||||
}
|
||||
if (llama_vocab_get_add_sep(vocab)) {
|
||||
result.push_back(llama_vocab_sep(vocab));
|
||||
}
|
||||
result.push_back(doc);
|
||||
if (llama_vocab_get_add_eos(vocab)) {
|
||||
result.push_back(eos_token);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
static server_tokens process_mtmd_prompt(mtmd_context * mctx, std::string prompt, std::vector<raw_buffer> files) {
|
||||
mtmd::bitmaps bitmaps;
|
||||
for (auto & file : files) {
|
||||
@@ -1501,3 +1473,43 @@ static std::vector<server_tokens> tokenize_input_prompts(const llama_vocab * voc
    }
    return result;
}

// format rerank task: [BOS]query[EOS][SEP]doc[EOS].
static server_tokens format_rerank(const struct llama_model * model, const struct llama_vocab * vocab, mtmd_context * mctx, const std::string & query, const std::string & doc) {
    server_tokens result = {};

    const char * rerank_prompt = llama_model_chat_template(model, "rerank");

    if (rerank_prompt != nullptr) {
        std::string prompt = rerank_prompt;
        string_replace_all(prompt, "{query}"   , query);
        string_replace_all(prompt, "{document}", doc  );
        server_tokens tokens = tokenize_input_subprompt(vocab, mctx, prompt, false, true);
        result.push_back(tokens);
    } else {
        // Get EOS token - use SEP token as fallback if EOS is not available
        server_tokens query_tokens = tokenize_input_subprompt(vocab, mctx, query, false, false);
        server_tokens doc_tokens   = tokenize_input_subprompt(vocab, mctx, doc,   false, false);
        llama_token eos_token = llama_vocab_eos(vocab);
        if (eos_token == LLAMA_TOKEN_NULL) {
            eos_token = llama_vocab_sep(vocab);
        }

        if (llama_vocab_get_add_bos(vocab)) {
            result.push_back(llama_vocab_bos(vocab));
        }
        result.push_back(query_tokens);
        if (llama_vocab_get_add_eos(vocab)) {
            result.push_back(eos_token);
        }
        if (llama_vocab_get_add_sep(vocab)) {
            result.push_back(llama_vocab_sep(vocab));
        }
        result.push_back(doc_tokens);
        if (llama_vocab_get_add_eos(vocab)) {
            result.push_back(eos_token);
        }
    }

    return result;
}

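(Not part of the diff: a self-contained sketch of the template path above, assuming a model whose GGUF metadata ships a "rerank" chat template containing {query} and {document} placeholders; a plain std::string replacement is used here instead of the server's string_replace_all helper, and apply_rerank_template is a hypothetical name.)

#include <string>

// substitute the query and document into a rerank prompt template
static std::string apply_rerank_template(std::string tmpl, const std::string & query, const std::string & doc) {
    auto replace_all = [](std::string & s, const std::string & from, const std::string & to) {
        for (size_t pos = 0; (pos = s.find(from, pos)) != std::string::npos; pos += to.size()) {
            s.replace(pos, from.size(), to);
        }
    };
    replace_all(tmpl, "{query}",    query);
    replace_all(tmpl, "{document}", doc);
    return tmpl; // the server then tokenizes this string as a single rerank prompt
}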