mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-05-07 16:57:34 +03:00
Compare commits
138 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
8e52631d55 | ||
|
|
f4b5a2ee91 | ||
|
|
97f06e9eed | ||
|
|
e358d75adb | ||
|
|
cfff1fc300 | ||
|
|
3980e04d5a | ||
|
|
2496f9c149 | ||
|
|
5207d120ea | ||
|
|
a0101225bc | ||
|
|
a290ce6266 | ||
|
|
a00e47e422 | ||
|
|
750141969c | ||
|
|
a736e6c0ac | ||
|
|
e3e3f8e46a | ||
|
|
f08f20a0e3 | ||
|
|
07eaf919ed | ||
|
|
74d6248f71 | ||
|
|
2ca1161bd7 | ||
|
|
bbeb89d76c | ||
|
|
ff806a110d | ||
|
|
d5003b6e4d | ||
|
|
2635ac76e8 | ||
|
|
70a8309114 | ||
|
|
c91faf997f | ||
|
|
bf76ac77be | ||
|
|
a09a00e502 | ||
|
|
2bacb1eb77 | ||
|
|
d6e7b033a4 | ||
|
|
fa595462ca | ||
|
|
a817a22bc6 | ||
|
|
eff06702b2 | ||
|
|
e77056f9b2 | ||
|
|
935a340292 | ||
|
|
d8794eecd5 | ||
|
|
36a694c965 | ||
|
|
a4701c98f7 | ||
|
|
994118a183 | ||
|
|
c84e6d6db5 | ||
|
|
fa8feaed34 | ||
|
|
846262d787 | ||
|
|
6dcd824fce | ||
|
|
d4b0c22f9e | ||
|
|
e48034dfc9 | ||
|
|
048a490f76 | ||
|
|
db44417b02 | ||
|
|
d05fe1d7da | ||
|
|
0754b7b6fe | ||
|
|
09294365a9 | ||
|
|
63d93d1733 | ||
|
|
c5a3bc39b1 | ||
|
|
9dbb372610 | ||
|
|
228e836344 | ||
|
|
ed23489f42 | ||
|
|
457e2288c9 | ||
|
|
e8ec7ab058 | ||
|
|
1a03cf47f6 | ||
|
|
b97ebdc98f | ||
|
|
2098fd6169 | ||
|
|
ab6120cde5 | ||
|
|
c3c1505392 | ||
|
|
05e141a6b3 | ||
|
|
aab68217b7 | ||
|
|
a95a11e5b8 | ||
|
|
5cbfb18075 | ||
|
|
beb42fffa4 | ||
|
|
660b1b4bdc | ||
|
|
c20c44514a | ||
|
|
6118c043b1 | ||
|
|
5f0ab726f7 | ||
|
|
e82aaf2587 | ||
|
|
27aef3dd91 | ||
|
|
45155597aa | ||
|
|
80afa33aad | ||
|
|
b42c7fa5b8 | ||
|
|
d77599234e | ||
|
|
41a63be28e | ||
|
|
098705a29e | ||
|
|
683c5acb90 | ||
|
|
b1d5f5b449 | ||
|
|
4b221b7f1e | ||
|
|
59237bfbbc | ||
|
|
1cbc846eba | ||
|
|
3142f1dbb9 | ||
|
|
b5c4227dc6 | ||
|
|
d6a5094004 | ||
|
|
7b95ea5d11 | ||
|
|
bdc9c743a5 | ||
|
|
739393beeb | ||
|
|
fc2b0053ff | ||
|
|
7b8443ac78 | ||
|
|
5d56effdee | ||
|
|
52e5f0a5c1 | ||
|
|
f9f33654a6 | ||
|
|
98bb57916a | ||
|
|
f42e29fdf1 | ||
|
|
19821178be | ||
|
|
698d19b93c | ||
|
|
50494a2800 | ||
|
|
d530d6e7a2 | ||
|
|
c3e08f4700 | ||
|
|
14e733e36f | ||
|
|
516e8d7a8a | ||
|
|
434b2a1ff6 | ||
|
|
983ca8992e | ||
|
|
665abc6097 | ||
|
|
4414c04b9a | ||
|
|
ceaf47c4b1 | ||
|
|
42401c72b8 | ||
|
|
e940b3d468 | ||
|
|
0f1bb602dd | ||
|
|
d13540becd | ||
|
|
f84270ea10 | ||
|
|
5594d13224 | ||
|
|
f535774325 | ||
|
|
06a811d085 | ||
|
|
78433f606f | ||
|
|
7ec36aa861 | ||
|
|
b1a5bd4e0c | ||
|
|
0c6ee1cade | ||
|
|
2dd84169d1 | ||
|
|
f454bd7eb8 | ||
|
|
b760272f1a | ||
|
|
dcad77cc3b | ||
|
|
98dc1418ea | ||
|
|
9725a313be | ||
|
|
d1649047a3 | ||
|
|
9d34231bb8 | ||
|
|
8ea8fee966 | ||
|
|
eddd7a13a5 | ||
|
|
dd2914dc81 | ||
|
|
0adede866d | ||
|
|
361fe72acb | ||
|
|
a702f39597 | ||
|
|
13d36cf891 | ||
|
|
f65bc34c68 | ||
|
|
15fa3c493b | ||
|
|
dc80c5252a | ||
|
|
e583f3b4f5 |
@@ -12,6 +12,8 @@ body:
|
||||
after recreating the CMake build directory and with `-DGGML_CCACHE=OFF`.
|
||||
If the compilation succeeds with ccache disabled you should be able to permanently fix the issue
|
||||
by clearing `~/.cache/ccache` (on Linux).
|
||||
|
||||
Please fill out this template yourself, copypasting language model outputs is [strictly prohibited](https://github.com/ggml-org/llama.cpp/blob/master/CONTRIBUTING.md#ai-usage-policy).
|
||||
- type: textarea
|
||||
id: commit
|
||||
attributes:
|
||||
|
||||
4
.github/ISSUE_TEMPLATE/011-bug-results.yml
vendored
4
.github/ISSUE_TEMPLATE/011-bug-results.yml
vendored
@@ -1,5 +1,5 @@
|
||||
name: Bug (model use)
|
||||
description: Something goes wrong when using a model (in general, not specific to a single llama.cpp module).
|
||||
description: Something goes wrong when running a model (crashes, garbled outputs, etc.).
|
||||
title: "Eval bug: "
|
||||
labels: ["bug-unconfirmed", "model evaluation"]
|
||||
body:
|
||||
@@ -12,6 +12,8 @@ body:
|
||||
If you encountered the issue while using an external UI (e.g. ollama),
|
||||
please reproduce your issue using one of the examples/binaries in this repository.
|
||||
The `llama-completion` binary can be used for simple and reproducible model inference.
|
||||
|
||||
Please fill out this template yourself, copypasting language model outputs is [strictly prohibited](https://github.com/ggml-org/llama.cpp/blob/master/CONTRIBUTING.md#ai-usage-policy).
|
||||
- type: textarea
|
||||
id: version
|
||||
attributes:
|
||||
|
||||
2
.github/ISSUE_TEMPLATE/019-bug-misc.yml
vendored
2
.github/ISSUE_TEMPLATE/019-bug-misc.yml
vendored
@@ -10,6 +10,8 @@ body:
|
||||
This issue template is intended for miscellaneous bugs that don't fit into any other category.
|
||||
If you encountered the issue while using an external UI (e.g. ollama),
|
||||
please reproduce your issue using one of the examples/binaries in this repository.
|
||||
|
||||
Please fill out this template yourself, copypasting language model outputs is [strictly prohibited](https://github.com/ggml-org/llama.cpp/blob/master/CONTRIBUTING.md#ai-usage-policy).
|
||||
- type: textarea
|
||||
id: version
|
||||
attributes:
|
||||
|
||||
2
.github/ISSUE_TEMPLATE/020-enhancement.yml
vendored
2
.github/ISSUE_TEMPLATE/020-enhancement.yml
vendored
@@ -8,6 +8,8 @@ body:
|
||||
value: |
|
||||
[Please post your idea first in Discussion if there is not yet a consensus for this enhancement request. This will help to keep this issue tracker focused on enhancements that the community has agreed need to be implemented.](https://github.com/ggml-org/llama.cpp/discussions/categories/ideas)
|
||||
|
||||
Please fill out this template yourself, copypasting language model outputs is [strictly prohibited](https://github.com/ggml-org/llama.cpp/blob/master/CONTRIBUTING.md#ai-usage-policy).
|
||||
|
||||
- type: checkboxes
|
||||
id: prerequisites
|
||||
attributes:
|
||||
|
||||
2
.github/ISSUE_TEMPLATE/030-research.yml
vendored
2
.github/ISSUE_TEMPLATE/030-research.yml
vendored
@@ -8,6 +8,8 @@ body:
|
||||
value: |
|
||||
Don't forget to check for any [duplicate research issue tickets](https://github.com/ggml-org/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3A%22research+%F0%9F%94%AC%22)
|
||||
|
||||
Please fill out this template yourself, copypasting language model outputs is [strictly prohibited](https://github.com/ggml-org/llama.cpp/blob/master/CONTRIBUTING.md#ai-usage-policy).
|
||||
|
||||
- type: checkboxes
|
||||
id: research-stage
|
||||
attributes:
|
||||
|
||||
2
.github/ISSUE_TEMPLATE/040-refactor.yml
vendored
2
.github/ISSUE_TEMPLATE/040-refactor.yml
vendored
@@ -9,6 +9,8 @@ body:
|
||||
Don't forget to [check for existing refactor issue tickets](https://github.com/ggml-org/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3Arefactoring) in case it's already covered.
|
||||
You may also want to check [pull requests with the refactor label](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Aopen+is%3Apr+label%3Arefactoring) for duplicates.
|
||||
|
||||
Please fill out this template yourself, copypasting language model outputs is [strictly prohibited](https://github.com/ggml-org/llama.cpp/blob/master/CONTRIBUTING.md#ai-usage-policy).
|
||||
|
||||
- type: textarea
|
||||
id: background-description
|
||||
attributes:
|
||||
|
||||
2
.github/pull_request_template.md
vendored
2
.github/pull_request_template.md
vendored
@@ -6,7 +6,7 @@
|
||||
|
||||
<!-- You can provide more details and link related discussions here. Delete this section if not applicable -->
|
||||
|
||||
# Requirements
|
||||
## Requirements
|
||||
|
||||
<!-- IMPORTANT: Please do NOT delete this section, otherwise your PR may be rejected -->
|
||||
|
||||
|
||||
33
.github/workflows/build-and-test-snapdragon.yml
vendored
33
.github/workflows/build-and-test-snapdragon.yml
vendored
@@ -49,28 +49,19 @@ jobs:
|
||||
cp docs/backend/snapdragon/CMakeUserPresets.json .
|
||||
cmake --preset arm64-android-snapdragon-release -B build
|
||||
cmake --build build
|
||||
cmake --install build --prefix pkg-adb/llama.cpp
|
||||
cmake --install build --prefix pkg-snapdragon/llama.cpp
|
||||
|
||||
- name: Upload Llama.CPP Snapdragon Android Build Artifact
|
||||
if: ${{ always() && steps.build_llama_cpp_snapdragon_android.outcome == 'success' }}
|
||||
uses: actions/upload-artifact@v6
|
||||
with:
|
||||
name: llama-cpp-android-arm64-snapdragon
|
||||
path: pkg-adb/llama.cpp
|
||||
|
||||
check-secret:
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
has-key: ${{ steps.check.outputs.has-key }}
|
||||
steps:
|
||||
- id: check
|
||||
run: echo "has-key=${{ secrets.QDC_API_KEY != '' }}" >> "$GITHUB_OUTPUT"
|
||||
path: pkg-snapdragon/llama.cpp
|
||||
|
||||
test-snapdragon-qdc:
|
||||
name: Test on QDC Android Device (${{ matrix.device }})
|
||||
needs: [android-ndk-snapdragon, check-secret]
|
||||
if: needs.check-secret.outputs.has-key == 'true'
|
||||
runs-on: ubuntu-latest
|
||||
needs: [android-ndk-snapdragon]
|
||||
runs-on: ubuntu-slim
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
@@ -81,10 +72,10 @@ jobs:
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Download build artifact
|
||||
uses: actions/download-artifact@v4
|
||||
uses: actions/download-artifact@v7
|
||||
with:
|
||||
name: llama-cpp-android-arm64-snapdragon
|
||||
path: pkg-snapdragon/
|
||||
path: pkg-snapdragon/llama.cpp
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v5
|
||||
@@ -92,13 +83,25 @@ jobs:
|
||||
python-version: '3.x'
|
||||
cache: pip
|
||||
|
||||
- name: Install system dependencies
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y curl unzip
|
||||
|
||||
- name: Install QDC SDK wheel
|
||||
run: |
|
||||
curl -fSL -o qdc_sdk.zip https://softwarecenter.qualcomm.com/api/download/software/tools/Qualcomm_Device_Cloud_SDK/All/0.2.3/qualcomm_device_cloud_sdk-0.2.3.zip
|
||||
unzip qdc_sdk.zip -d qdc_sdk
|
||||
pip install qdc_sdk/qualcomm_device_cloud_sdk-0.2.3-py3-none-any.whl
|
||||
|
||||
- name: Check QDC API key
|
||||
id: check_secret
|
||||
env:
|
||||
QDC_API_KEY: ${{ secrets.QDC_API_KEY }}
|
||||
run: echo "has-qdc-key=${{ env.QDC_API_KEY != '' }}" >> "$GITHUB_OUTPUT"
|
||||
|
||||
- name: Run QDC tests (${{ matrix.device }})
|
||||
if: steps.check_secret.outputs.has-qdc-key == 'true'
|
||||
run: |
|
||||
python scripts/snapdragon/qdc/run_qdc_jobs.py \
|
||||
--test all \
|
||||
|
||||
2
.github/workflows/gguf-publish.yml
vendored
2
.github/workflows/gguf-publish.yml
vendored
@@ -29,10 +29,10 @@ jobs:
|
||||
uses: actions/setup-python@v6
|
||||
with:
|
||||
python-version: '3.11'
|
||||
pip-install: poetry==2.4.0
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
cd gguf-py
|
||||
python -m pip install poetry==2.3.2
|
||||
poetry install
|
||||
|
||||
- name: Build package
|
||||
|
||||
2
.github/workflows/python-type-check.yml
vendored
2
.github/workflows/python-type-check.yml
vendored
@@ -31,7 +31,7 @@ jobs:
|
||||
uses: actions/setup-python@v6
|
||||
with:
|
||||
python-version: "3.11"
|
||||
pip-install: -r requirements/requirements-all.txt ty==0.0.26
|
||||
pip-install: -r requirements/requirements-all.txt ty==0.0.33
|
||||
# - name: Type-check with Pyright
|
||||
# uses: jakebailey/pyright-action@v2
|
||||
# with:
|
||||
|
||||
14
.gitignore
vendored
14
.gitignore
vendored
@@ -34,7 +34,6 @@
|
||||
/.vscode/
|
||||
/nppBackup
|
||||
|
||||
|
||||
# Coverage
|
||||
|
||||
/gcovr-report/
|
||||
@@ -74,6 +73,7 @@
|
||||
!/models/templates
|
||||
|
||||
# Zig
|
||||
|
||||
/zig-out/
|
||||
/zig-cache/
|
||||
|
||||
@@ -93,6 +93,7 @@
|
||||
!/examples/sycl/*.sh
|
||||
|
||||
# Server Web UI temporary files
|
||||
|
||||
/tools/server/webui/node_modules
|
||||
/tools/server/webui/dist
|
||||
# we no longer use gz for index.html
|
||||
@@ -104,11 +105,15 @@
|
||||
__pycache__/
|
||||
*/poetry.lock
|
||||
poetry.toml
|
||||
poetry.lock
|
||||
uv.lock
|
||||
|
||||
# Nix
|
||||
|
||||
/result
|
||||
|
||||
# Test binaries
|
||||
|
||||
/tests/test-backend-ops
|
||||
/tests/test-double-float
|
||||
/tests/test-grad0
|
||||
@@ -124,6 +129,7 @@ poetry.toml
|
||||
/tests/test-tokenizer-1-spm
|
||||
|
||||
# Scripts
|
||||
|
||||
!/scripts/install-oneapi.bat
|
||||
|
||||
# Generated by scripts
|
||||
@@ -132,18 +138,24 @@ poetry.toml
|
||||
/wikitext-2-raw/
|
||||
|
||||
# Test models for lora adapters
|
||||
|
||||
/lora-tests
|
||||
|
||||
# Local scripts
|
||||
|
||||
/run-vim.sh
|
||||
/run-chat.sh
|
||||
/run-spec.sh
|
||||
/.ccache/
|
||||
|
||||
# IDE
|
||||
|
||||
/*.code-workspace
|
||||
/.windsurf/
|
||||
# emscripten
|
||||
a.out.*
|
||||
|
||||
# AGENTS
|
||||
|
||||
AGENTS.local.md
|
||||
.pi/SYSTEM.md
|
||||
|
||||
34
.pi/gg/SYSTEM.md
Normal file
34
.pi/gg/SYSTEM.md
Normal file
@@ -0,0 +1,34 @@
|
||||
You are a coding agent. Here are some very important rules that you must follow:
|
||||
|
||||
General:
|
||||
- Be very precise and concise when writing code, comments, explanations, etc.
|
||||
- PR and commit titles format: `<module> : <title>`. Lookup recents for examples
|
||||
- Don't try to build or run the code unless you are explicitly asked to do so
|
||||
- Use the `gh` CLI tool when querying PRs, issues, or other GitHub resources
|
||||
|
||||
Coding:
|
||||
- When in doubt, always refer to the CONTRIBUTING.md file of the project
|
||||
- When referencing issues or PRs in comments, use the format:
|
||||
- C/C++ code: `// ref: <url>`
|
||||
- Other (CMake, etc.): `# ref: <url>`
|
||||
|
||||
Pull requests (PRs):
|
||||
- New branch names are prefixed with "gg/"
|
||||
- Before opening a pull request, ask the user to confirm the description
|
||||
- When creating a pull request, look for the repository's PR template and follow it
|
||||
- For the AI usage disclosure section, write "YES. llama.cpp + pi"
|
||||
- Always create the pull requests in draft mode
|
||||
|
||||
Commits:
|
||||
- On every commit that you make, include an "Assisted-by: llama.cpp:local pi" tag
|
||||
- Do not explicitly set the git author in commits - rely on the default git config
|
||||
|
||||
Resources (read on demand):
|
||||
- [CONTRIBUTING.md](CONTRIBUTING.md)
|
||||
- [Build documentation](docs/build.md)
|
||||
- [Server usage documentation](tools/server/README.md)
|
||||
- [Server development documentation](tools/server/README-dev.md)
|
||||
- [PEG parser](docs/development/parsing.md)
|
||||
- [Auto parser](docs/autoparser.md)
|
||||
- [Jinja engine](common/jinja/README.md)
|
||||
- [PR template](.github/pull_request_template.md)
|
||||
12
CODEOWNERS
12
CODEOWNERS
@@ -53,28 +53,30 @@
|
||||
/examples/speculative/ @ggerganov
|
||||
/ggml/cmake/ @ggerganov
|
||||
/ggml/include/ @ggerganov
|
||||
/ggml/src/ggml-backend-meta.cpp @JohannesGaessler
|
||||
/ggml/src/ggml-cann/ @ggml-org/ggml-cann
|
||||
/ggml/src/ggml-common.h @ggerganov
|
||||
/ggml/src/ggml-cpu/ @ggerganov
|
||||
/ggml/src/ggml-cpu/spacemit/ @alex-spacemit
|
||||
/ggml/src/ggml-cuda/ @ggml-org/ggml-cuda
|
||||
/ggml/src/ggml-cuda/fattn-wmma* @IMbackK
|
||||
/ggml/src/ggml-hip/ @IMbackK
|
||||
/ggml/src/ggml-cuda/vendors/hip.h @IMbackK
|
||||
/ggml/src/ggml-cuda/fattn-wmma* @IMbackK
|
||||
/ggml/src/ggml-hexagon/ @ggml-org/ggml-hexagon
|
||||
/ggml/src/ggml-hip/ @IMbackK
|
||||
/ggml/src/ggml-impl.h @ggerganov
|
||||
/ggml/src/ggml-metal/ @ggml-org/ggml-metal
|
||||
/ggml/src/ggml-opencl/ @ggml-org/ggml-opencl
|
||||
/ggml/src/ggml-hexagon/ @ggml-org/ggml-hexagon
|
||||
/ggml/src/ggml-openvino/ @cavusmustafa @wine99
|
||||
/ggml/src/ggml-opt.cpp @JohannesGaessler
|
||||
/ggml/src/ggml-quants.* @ggerganov
|
||||
/ggml/src/ggml-rpc/ @ggml-org/ggml-rpc
|
||||
/ggml/src/ggml-sycl/ @ggml-org/ggml-sycl
|
||||
/ggml/src/ggml-threading.* @ggerganov
|
||||
/ggml/src/ggml-vulkan/ @ggml-org/ggml-vulkan
|
||||
/ggml/src/ggml-virtgpu/ @kpouget
|
||||
/ggml/src/ggml-vulkan/ @ggml-org/ggml-vulkan
|
||||
/ggml/src/ggml-webgpu/ @ggml-org/ggml-webgpu
|
||||
/ggml/src/ggml-zdnn/ @ggml-org/ggml-zdnn @Andreas-Krebbel @AlekseiNikiforovIBM
|
||||
/ggml/src/ggml-openvino/ @cavusmustafa @wine99
|
||||
/ggml/src/ggml-zendnn/ @avinashcpandey @Jiten1parmar @z-vishal
|
||||
/ggml/src/ggml.c @ggerganov
|
||||
/ggml/src/ggml.cpp @ggerganov
|
||||
/ggml/src/gguf.cpp @JohannesGaessler @Green-Sky
|
||||
|
||||
615
common/arg.cpp
615
common/arg.cpp
File diff suppressed because it is too large
Load Diff
@@ -25,7 +25,8 @@ struct common_arg {
|
||||
const char * value_hint_2 = nullptr; // for second arg value
|
||||
const char * env = nullptr;
|
||||
std::string help;
|
||||
bool is_sparam = false; // is current arg a sampling param?
|
||||
bool is_sampling = false; // is current arg a sampling param?
|
||||
bool is_spec = false; // is current arg a speculative decoding param?
|
||||
bool is_preset_only = false; // is current arg preset-only (not treated as CLI arg)
|
||||
void (*handler_void) (common_params & params) = nullptr;
|
||||
void (*handler_string) (common_params & params, const std::string &) = nullptr;
|
||||
@@ -74,7 +75,8 @@ struct common_arg {
|
||||
common_arg & set_examples(std::initializer_list<enum llama_example> examples);
|
||||
common_arg & set_excludes(std::initializer_list<enum llama_example> excludes);
|
||||
common_arg & set_env(const char * env);
|
||||
common_arg & set_sparam();
|
||||
common_arg & set_sampling();
|
||||
common_arg & set_spec();
|
||||
common_arg & set_preset_only();
|
||||
bool in_example(enum llama_example ex);
|
||||
bool is_exclude(enum llama_example ex);
|
||||
|
||||
@@ -136,10 +136,10 @@ common_peg_parser analyze_reasoning::build_parser(parser_build_context & ctx) co
|
||||
if (!end.empty()) {
|
||||
if (!start.empty()) {
|
||||
// Standard tag-based: optional(<think>reasoning</think>)
|
||||
return p.optional(start + p.reasoning(p.until(end)) + end + p.space());
|
||||
return p.optional(p.optspace(start) + p.reasoning(p.until(trim_whitespace(end))) + p.optspace(end));
|
||||
}
|
||||
// Delimiter-style (empty start)
|
||||
return p.optional(p.reasoning(p.until(end)) + end + p.space());
|
||||
return p.optional(p.reasoning(p.until(trim_whitespace(end))) + p.optspace(end));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -186,7 +186,6 @@ common_peg_parser analyze_tools::build_parser(parser_build_context & ctx) const
|
||||
common_peg_parser analyze_tools::build_tool_parser_json_native(parser_build_context & ctx) const {
|
||||
auto & p = ctx.p;
|
||||
const auto & inputs = ctx.inputs;
|
||||
bool force_tools = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED;
|
||||
|
||||
// Build effective field names with dot notation if function_field is set
|
||||
std::string name_field = format.name_field;
|
||||
@@ -225,8 +224,7 @@ common_peg_parser analyze_tools::build_tool_parser_json_native(parser_build_cont
|
||||
tool_start = format.per_call_start;
|
||||
}
|
||||
|
||||
return ctx.reasoning_parser + (force_tools ? p.eps() : p.optional(p.content(p.until(tool_start)))) + tools_parser +
|
||||
p.end();
|
||||
return ctx.reasoning_parser + p.optional(p.content(p.until(tool_start))) + tools_parser + p.end();
|
||||
}
|
||||
|
||||
common_peg_parser analyze_tools::build_func_parser(common_chat_peg_builder & p, const std::string & name,
|
||||
@@ -270,7 +268,6 @@ common_peg_parser analyze_tools::build_func_parser(common_chat_peg_builder & p,
|
||||
common_peg_parser analyze_tools::build_tool_parser_tag_json(parser_build_context & ctx) const {
|
||||
auto & p = ctx.p;
|
||||
const auto & inputs = ctx.inputs;
|
||||
bool force_tools = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED;
|
||||
|
||||
common_peg_parser tool_choice = p.choice();
|
||||
|
||||
@@ -336,14 +333,12 @@ common_peg_parser analyze_tools::build_tool_parser_tag_json(parser_build_context
|
||||
|
||||
std::string trigger_marker = !format.section_start.empty() ? format.section_start : format.per_call_start;
|
||||
auto content_before_tools = trigger_marker.empty() ? p.eps() : p.until(trigger_marker);
|
||||
return ctx.reasoning_parser + (force_tools ? p.eps() : p.optional(p.content(content_before_tools))) + tool_calls +
|
||||
p.end();
|
||||
return ctx.reasoning_parser + p.optional(p.content(content_before_tools)) + tool_calls + p.end();
|
||||
}
|
||||
|
||||
common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_context & ctx) const {
|
||||
auto & p = ctx.p;
|
||||
const auto & inputs = ctx.inputs;
|
||||
bool force_tools = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED;
|
||||
|
||||
auto until_suffix = p.rule("until-suffix", p.until(arguments.value_suffix));
|
||||
|
||||
@@ -471,8 +466,7 @@ common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_conte
|
||||
|
||||
std::string trigger_marker = !format.section_start.empty() ? format.section_start : format.per_call_start;
|
||||
auto content_before_tools = trigger_marker.empty() ? p.eps() : p.until(trigger_marker);
|
||||
return ctx.reasoning_parser + (force_tools ? p.eps() : p.optional(p.content(content_before_tools))) + tool_calls +
|
||||
p.end();
|
||||
return ctx.reasoning_parser + p.optional(p.content(content_before_tools)) + tool_calls + p.end();
|
||||
}
|
||||
|
||||
} // namespace autoparser
|
||||
|
||||
@@ -296,7 +296,7 @@ void analyze_reasoning::compare_reasoning_presence() {
|
||||
return p.literal(reasoning_content) + p.space() + p.optional(p.tag("post", (p.marker() + p.space())) + p.rest());
|
||||
});
|
||||
auto parser_wrapped = build_tagged_peg_parser([&](common_peg_parser_builder &p) {
|
||||
return p.tag("pre", p.marker() + p.space()) + p.literal(reasoning_content) + p.space() + p.tag("post", (p.marker() + p.space())) + p.rest();
|
||||
return p.tag("pre", p.marker() + p.space()) + p.literal(reasoning_content) + p.tag("post", (p.space() + p.marker() + p.space())) + p.rest();
|
||||
});
|
||||
// try the more aggressive parse first, if it fails, fall back to the delimiter one
|
||||
auto result = parser_wrapped.parse_anywhere_and_extract(comparison->output_B);
|
||||
@@ -306,11 +306,11 @@ void analyze_reasoning::compare_reasoning_presence() {
|
||||
if (result.result.success()) {
|
||||
if (!result.tags["pre"].empty() && !result.tags["post"].empty()) {
|
||||
mode = reasoning_mode::TAG_BASED;
|
||||
start = trim_leading_whitespace(result.tags["pre"]);
|
||||
end = trim_trailing_whitespace(result.tags["post"]);
|
||||
start = result.tags["pre"];
|
||||
end = result.tags["post"];
|
||||
} else if (!result.tags["post"].empty()) {
|
||||
mode = reasoning_mode::TAG_BASED;
|
||||
end = trim_trailing_whitespace(result.tags["post"]);
|
||||
end = result.tags["post"];
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -342,7 +342,7 @@ void analyze_reasoning::compare_thinking_enabled() {
|
||||
if (left_trimmed.empty() && !diff.right.empty()) {
|
||||
if (!right_trimmed.empty() && string_ends_with(comparison->output_B, right_trimmed)) {
|
||||
if (start.empty()) {
|
||||
start = trim_leading_whitespace(diff.right);
|
||||
start = diff.right;
|
||||
mode = reasoning_mode::TAG_BASED;
|
||||
}
|
||||
}
|
||||
@@ -353,7 +353,7 @@ void analyze_reasoning::compare_thinking_enabled() {
|
||||
if (seg.size() >= 2 && seg[seg.size() - 1].value == left_trimmed && seg[seg.size() - 2].type == segment_type::MARKER) {
|
||||
start = seg[seg.size() - 2].value;
|
||||
}
|
||||
end = trim_trailing_whitespace(diff.left);
|
||||
end = diff.left;
|
||||
mode = reasoning_mode::TAG_BASED;
|
||||
}
|
||||
}
|
||||
@@ -445,14 +445,14 @@ void analyze_reasoning::compare_reasoning_scope() {
|
||||
auto result = parser_wrapped.parse_anywhere_and_extract(comparison->output_B);
|
||||
if (result.result.success()) {
|
||||
start = result.tags["pre"];
|
||||
end = trim_trailing_whitespace(result.tags["post"]);
|
||||
end = result.tags["post"];
|
||||
} else {
|
||||
auto parser_delimiter = build_tagged_peg_parser([&](common_peg_parser_builder &p) {
|
||||
return p.literal(reasoning_content) + p.space() + p.optional(p.tag("post", (p.marker() + p.space())));
|
||||
});
|
||||
result = parser_delimiter.parse_anywhere_and_extract(comparison->output_B);
|
||||
if (result.result.success()) {
|
||||
end = trim_trailing_whitespace(result.tags["post"]);
|
||||
end = result.tags["post"];
|
||||
} else {
|
||||
LOG_DBG(ANSI_ORANGE "%s: Unable to extract reasoning markers, falling back to reasoning = NONE\n" ANSI_RESET, __func__);
|
||||
mode = reasoning_mode::NONE;
|
||||
|
||||
@@ -816,6 +816,32 @@ common_peg_parser common_chat_peg_builder::prefix(const std::string & s, const s
|
||||
return literal(s.substr(0, s.rfind(delimiter)));
|
||||
}
|
||||
|
||||
common_peg_parser common_chat_peg_builder::optspace(const std::string & tag) {
|
||||
auto parser = eps();
|
||||
size_t end_of_prefix_space = tag.size();
|
||||
size_t start_of_suffix_space = tag.size();
|
||||
for (size_t i = 0; i < tag.size(); i++) {
|
||||
if (!std::isspace(tag[i])) {
|
||||
end_of_prefix_space = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
for (size_t i = tag.size(); i > 0; i--) {
|
||||
if (!std::isspace(tag[i - 1])) {
|
||||
start_of_suffix_space = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
for (size_t i = 0; i < end_of_prefix_space; i++) {
|
||||
parser += optional(literal(std::string(1, tag[i])));
|
||||
}
|
||||
parser += literal(tag.substr(end_of_prefix_space, start_of_suffix_space - end_of_prefix_space));
|
||||
for (size_t i = start_of_suffix_space; i < tag.size(); i++) {
|
||||
parser += optional(literal(std::string(1, tag[i])));
|
||||
}
|
||||
return parser;
|
||||
}
|
||||
|
||||
common_peg_parser common_chat_peg_builder::standard_json_tools(
|
||||
const std::string & section_start,
|
||||
const std::string & section_end,
|
||||
|
||||
@@ -96,6 +96,9 @@ class common_chat_peg_builder : public common_peg_parser_builder {
|
||||
// Return a parser that parses the prefix of a string, up to a given delimiter.
|
||||
common_peg_parser prefix(const std::string & s, const std::string & delimiter = {});
|
||||
|
||||
// Return a parser that parses all elements of tag, but leading and trailing spaces are optional
|
||||
common_peg_parser optspace(const std::string & tag);
|
||||
|
||||
// Legacy-compatible helper for building standard JSON tool calls
|
||||
// Used by tests and manual parsers
|
||||
// name_key/args_key: JSON key names for function name and arguments
|
||||
|
||||
@@ -2116,22 +2116,38 @@ std::optional<common_chat_params> common_chat_try_specialized_template(
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
static std::string common_chat_templates_generation_prompt(const common_chat_template & tmpl, const autoparser::generation_params & inputs) {
|
||||
autoparser::generation_params params = inputs;
|
||||
params.add_generation_prompt = false;
|
||||
std::string no_gen_prompt = common_chat_template_direct_apply_impl(tmpl, params);
|
||||
params.add_generation_prompt = true;
|
||||
std::string gen_prompt = common_chat_template_direct_apply_impl(tmpl, params);
|
||||
|
||||
size_t prefix_len = 0;
|
||||
size_t min_size = std::min(no_gen_prompt.size(), gen_prompt.size());
|
||||
while (prefix_len < min_size && no_gen_prompt[prefix_len] == gen_prompt[prefix_len]) {
|
||||
prefix_len++;
|
||||
}
|
||||
return gen_prompt.substr(prefix_len);
|
||||
}
|
||||
|
||||
static common_chat_params common_chat_templates_apply_jinja(const struct common_chat_templates * tmpls,
|
||||
const struct common_chat_templates_inputs & inputs) {
|
||||
autoparser::generation_params params;
|
||||
params.tools = common_chat_tools_to_json_oaicompat(inputs.tools);
|
||||
const auto & tmpl =
|
||||
params.tools.is_array() && tmpls->template_tool_use ? *tmpls->template_tool_use : *tmpls->template_default;
|
||||
const auto & src = tmpl.source();
|
||||
const auto & caps = tmpl.original_caps();
|
||||
params.messages = render_message_to_json(inputs.messages, tmpl.original_caps());
|
||||
params.tool_choice = inputs.tool_choice;
|
||||
params.reasoning_format = inputs.reasoning_format;
|
||||
params.enable_thinking = inputs.enable_thinking;
|
||||
params.grammar = inputs.grammar;
|
||||
params.now = inputs.now;
|
||||
params.add_bos = tmpls->add_bos;
|
||||
params.add_eos = tmpls->add_eos;
|
||||
const auto & src = tmpl.source();
|
||||
const auto & caps = tmpl.original_caps();
|
||||
params.messages = render_message_to_json(inputs.messages, tmpl.original_caps());
|
||||
params.tool_choice = inputs.tool_choice;
|
||||
params.reasoning_format = inputs.reasoning_format;
|
||||
params.enable_thinking = inputs.enable_thinking;
|
||||
params.grammar = inputs.grammar;
|
||||
params.now = inputs.now;
|
||||
params.add_generation_prompt = inputs.add_generation_prompt;
|
||||
params.add_bos = tmpls->add_bos;
|
||||
params.add_eos = tmpls->add_eos;
|
||||
|
||||
if (src.find("<|channel|>") == std::string::npos) {
|
||||
// map developer to system for all models except for GPT-OSS
|
||||
@@ -2153,14 +2169,7 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
|
||||
workaround::func_args_not_string(params.messages);
|
||||
}
|
||||
|
||||
params.add_generation_prompt = false;
|
||||
std::string no_gen_prompt = common_chat_template_direct_apply_impl(tmpl, params);
|
||||
params.add_generation_prompt = true;
|
||||
std::string gen_prompt = common_chat_template_direct_apply_impl(tmpl, params);
|
||||
auto diff = calculate_diff_split(no_gen_prompt, gen_prompt);
|
||||
params.generation_prompt = diff.right + diff.suffix;
|
||||
|
||||
params.add_generation_prompt = inputs.add_generation_prompt;
|
||||
params.generation_prompt = common_chat_templates_generation_prompt(tmpl, params);
|
||||
|
||||
params.extra_context = common_chat_extra_context();
|
||||
for (auto el : inputs.chat_template_kwargs) {
|
||||
@@ -2212,8 +2221,8 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_
|
||||
auto auto_params = autoparser::peg_generator::generate_parser(tmpl, params, autoparser);
|
||||
auto_params.supports_thinking = autoparser.reasoning.mode != autoparser::reasoning_mode::NONE;
|
||||
if (auto_params.supports_thinking) {
|
||||
auto_params.thinking_start_tag = autoparser.reasoning.start;
|
||||
auto_params.thinking_end_tag = autoparser.reasoning.end;
|
||||
auto_params.thinking_start_tag = trim_whitespace(autoparser.reasoning.start);
|
||||
auto_params.thinking_end_tag = trim_whitespace(autoparser.reasoning.end);
|
||||
}
|
||||
auto_params.generation_prompt = params.generation_prompt;
|
||||
common_peg_arena arena;
|
||||
|
||||
@@ -70,7 +70,7 @@ common_time_meas::~common_time_meas() {
|
||||
// CPU utils
|
||||
//
|
||||
|
||||
int32_t cpu_get_num_physical_cores() {
|
||||
int32_t common_cpu_get_num_physical_cores() {
|
||||
#ifdef __linux__
|
||||
// enumerate the set of thread siblings, num entries is num cores
|
||||
std::unordered_set<std::string> siblings;
|
||||
@@ -185,11 +185,11 @@ static int cpu_count_math_cpus(int n_cpu) {
|
||||
/**
|
||||
* Returns number of CPUs on system that are useful for math.
|
||||
*/
|
||||
int32_t cpu_get_num_math() {
|
||||
int32_t common_cpu_get_num_math() {
|
||||
#if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
|
||||
int n_cpu = sysconf(_SC_NPROCESSORS_ONLN);
|
||||
if (n_cpu < 1) {
|
||||
return cpu_get_num_physical_cores();
|
||||
return common_cpu_get_num_physical_cores();
|
||||
}
|
||||
if (is_hybrid_cpu()) {
|
||||
cpu_set_t affinity;
|
||||
@@ -202,7 +202,7 @@ int32_t cpu_get_num_math() {
|
||||
}
|
||||
}
|
||||
#endif
|
||||
return cpu_get_num_physical_cores();
|
||||
return common_cpu_get_num_physical_cores();
|
||||
}
|
||||
|
||||
// Helper for setting process priority
|
||||
@@ -263,7 +263,7 @@ bool set_process_priority(enum ggml_sched_priority prio) {
|
||||
//
|
||||
|
||||
|
||||
void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model) {
|
||||
void postprocess_cpu_params(common_cpu_params & cpuparams, const common_cpu_params * role_model) {
|
||||
int32_t n_set = 0;
|
||||
|
||||
if (cpuparams.n_threads < 0) {
|
||||
@@ -271,7 +271,7 @@ void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model)
|
||||
if (role_model != nullptr) {
|
||||
cpuparams = *role_model;
|
||||
} else {
|
||||
cpuparams.n_threads = cpu_get_num_math();
|
||||
cpuparams.n_threads = common_cpu_get_num_math();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1521,7 +1521,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
|
||||
return cparams;
|
||||
}
|
||||
|
||||
struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params) {
|
||||
struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const common_cpu_params & params) {
|
||||
struct ggml_threadpool_params tpp;
|
||||
|
||||
ggml_threadpool_params_init(&tpp, params.n_threads); // setup the defaults
|
||||
|
||||
@@ -54,7 +54,7 @@ struct common_control_vector_load_info;
|
||||
// CPU utils
|
||||
//
|
||||
|
||||
struct cpu_params {
|
||||
struct common_cpu_params {
|
||||
int n_threads = -1;
|
||||
bool cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
|
||||
bool mask_valid = false; // Default: any CPU
|
||||
@@ -63,8 +63,8 @@ struct cpu_params {
|
||||
uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling)
|
||||
};
|
||||
|
||||
int32_t cpu_get_num_physical_cores();
|
||||
int32_t cpu_get_num_math();
|
||||
int32_t common_cpu_get_num_physical_cores();
|
||||
int32_t common_cpu_get_num_math();
|
||||
|
||||
//
|
||||
// Common params
|
||||
@@ -297,34 +297,19 @@ struct common_params_model {
|
||||
|
||||
struct common_ngram_mod;
|
||||
|
||||
struct common_params_speculative {
|
||||
common_speculative_type type = COMMON_SPECULATIVE_TYPE_NONE; // type of speculative decoding
|
||||
// draft-model-based speculative decoding parameters
|
||||
struct common_params_speculative_draft {
|
||||
int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
|
||||
int32_t n_min = 0; // minimum number of draft tokens to use for speculative decoding
|
||||
|
||||
// general-purpose speculative decoding parameters
|
||||
float p_split = 0.1f; // speculative decoding split probability
|
||||
float p_min = 0.75f; // minimum speculative decoding probability (greedy)
|
||||
|
||||
int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
|
||||
int32_t n_min = 0; // minimum number of draft tokens to use for speculative decoding
|
||||
float p_split = 0.1f; // speculative decoding split probability
|
||||
float p_min = 0.75f; // minimum speculative decoding probability (greedy)
|
||||
common_params_model mparams;
|
||||
|
||||
// ngram-based speculative decoding
|
||||
llama_model * model = nullptr; // a llama_model that can be shared by multiple speculative contexts
|
||||
|
||||
uint16_t ngram_size_n = 12; // ngram size for lookup
|
||||
uint16_t ngram_size_m = 48; // mgram size for speculative tokens
|
||||
uint16_t ngram_min_hits = 1; // minimum hits at ngram/mgram lookup for mgram to be proposed
|
||||
|
||||
std::shared_ptr<common_ngram_mod> ngram_mod;
|
||||
|
||||
std::string lookup_cache_static; // path of static ngram cache file for lookup decoding // NOLINT
|
||||
std::string lookup_cache_dynamic; // path of dynamic ngram cache file for lookup decoding // NOLINT
|
||||
|
||||
// draft-model speculative decoding
|
||||
|
||||
struct common_params_model mparams_dft;
|
||||
|
||||
llama_model * model_dft = nullptr; // a llama_model that can be shared by multiple speculative contexts
|
||||
|
||||
llama_context_params cparams_dft; // these are the parameters for the draft llama_context
|
||||
llama_context_params cparams; // these are the parameters for the draft llama_context
|
||||
|
||||
int32_t n_ctx = 0; // draft context size
|
||||
int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
|
||||
@@ -332,25 +317,60 @@ struct common_params_speculative {
|
||||
ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
|
||||
ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
|
||||
|
||||
struct cpu_params cpuparams;
|
||||
struct cpu_params cpuparams_batch;
|
||||
common_cpu_params cpuparams;
|
||||
common_cpu_params cpuparams_batch;
|
||||
|
||||
std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
|
||||
|
||||
std::vector<std::pair<std::string, std::string>> replacements; // main to speculative model replacements
|
||||
std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
|
||||
};
|
||||
|
||||
struct common_params_speculative_ngram_mod {
|
||||
int32_t n_match = 24;
|
||||
|
||||
int32_t n_max = 64;
|
||||
int32_t n_min = 48;
|
||||
|
||||
// shared instance of the ngram container for all speculative decoding contexts
|
||||
std::shared_ptr<common_ngram_mod> obj;
|
||||
};
|
||||
|
||||
struct common_params_speculative_ngram_map {
|
||||
uint16_t size_n = 12; // ngram size for lookup
|
||||
uint16_t size_m = 48; // mgram size for speculative tokens
|
||||
uint16_t min_hits = 1; // minimum hits at ngram/mgram lookup for mgram to be proposed
|
||||
};
|
||||
|
||||
struct common_params_speculative_ngram_cache {
|
||||
std::string lookup_cache_static; // path of static ngram cache file for lookup decoding
|
||||
std::string lookup_cache_dynamic; // path of dynamic ngram cache file for lookup decoding
|
||||
};
|
||||
|
||||
struct common_params_speculative {
|
||||
// TODO: become a vector in order to support "chains of speculators"
|
||||
common_speculative_type type = COMMON_SPECULATIVE_TYPE_NONE;
|
||||
|
||||
common_params_speculative_draft draft;
|
||||
|
||||
common_params_speculative_ngram_mod ngram_mod;
|
||||
common_params_speculative_ngram_map ngram_simple;
|
||||
common_params_speculative_ngram_map ngram_map_k;
|
||||
common_params_speculative_ngram_map ngram_map_k4v;
|
||||
|
||||
common_params_speculative_ngram_cache ngram_cache;
|
||||
|
||||
bool has_dft() const {
|
||||
return !mparams_dft.path.empty() || !mparams_dft.hf_repo.empty();
|
||||
return !draft.mparams.path.empty() || !draft.mparams.hf_repo.empty();
|
||||
}
|
||||
};
|
||||
|
||||
struct common_params_vocoder {
|
||||
struct common_params_model model;
|
||||
|
||||
std::string speaker_file = ""; // speaker file path // NOLINT
|
||||
std::string speaker_file; // speaker file path
|
||||
|
||||
bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy // NOLINT
|
||||
bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy
|
||||
};
|
||||
|
||||
struct common_params_diffusion {
|
||||
@@ -433,8 +453,8 @@ struct common_params {
|
||||
|
||||
enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
|
||||
|
||||
struct cpu_params cpuparams;
|
||||
struct cpu_params cpuparams_batch;
|
||||
common_cpu_params cpuparams;
|
||||
common_cpu_params cpuparams_batch;
|
||||
|
||||
ggml_backend_sched_eval_callback cb_eval = nullptr;
|
||||
void * cb_eval_user_data = nullptr;
|
||||
@@ -678,7 +698,7 @@ std::string common_params_get_system_info(const common_params & params);
|
||||
|
||||
bool parse_cpu_range(const std::string & range, bool(&boolmask)[GGML_MAX_N_THREADS]);
|
||||
bool parse_cpu_mask(const std::string & mask, bool(&boolmask)[GGML_MAX_N_THREADS]);
|
||||
void postprocess_cpu_params(cpu_params & cpuparams, const cpu_params * role_model = nullptr);
|
||||
void postprocess_cpu_params(common_cpu_params & cpuparams, const common_cpu_params * role_model = nullptr);
|
||||
bool set_process_priority(enum ggml_sched_priority prio);
|
||||
|
||||
//
|
||||
@@ -846,7 +866,7 @@ common_init_result_ptr common_init_from_params(common_params & params);
|
||||
|
||||
struct llama_model_params common_model_params_to_llama ( common_params & params);
|
||||
struct llama_context_params common_context_params_to_llama(const common_params & params);
|
||||
struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);
|
||||
struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const common_cpu_params & params);
|
||||
|
||||
// clear LoRA adapters from context, then apply new list of adapters
|
||||
void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);
|
||||
|
||||
@@ -1,9 +1,38 @@
|
||||
#include "debug.h"
|
||||
|
||||
#include "common.h"
|
||||
#include "log.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <regex>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
struct common_debug_cb_user_data::impl {
|
||||
std::vector<uint8_t> data;
|
||||
std::vector<std::regex> tensor_filters;
|
||||
bool abort_on_nan{false};
|
||||
};
|
||||
|
||||
common_debug_cb_user_data::common_debug_cb_user_data() : pimpl(std::make_unique<impl>()) {}
|
||||
common_debug_cb_user_data::~common_debug_cb_user_data() = default;
|
||||
|
||||
common_debug_cb_user_data::common_debug_cb_user_data(common_params & params, const std::vector<std::string> & filter_patterns, bool abort_on_nan)
|
||||
: pimpl(std::make_unique<impl>())
|
||||
{
|
||||
for (const auto & pattern : filter_patterns) {
|
||||
try {
|
||||
std::string anchored_pattern = "^" + pattern;
|
||||
pimpl->tensor_filters.emplace_back(anchored_pattern, std::regex::optimize);
|
||||
} catch (const std::regex_error & e) {
|
||||
throw std::runtime_error("Invalid regex pattern '" + pattern + "': " + e.what());
|
||||
}
|
||||
}
|
||||
pimpl->abort_on_nan = abort_on_nan;
|
||||
|
||||
params.cb_eval = common_debug_cb_eval;
|
||||
params.cb_eval_user_data = this;
|
||||
}
|
||||
|
||||
static std::string common_ggml_ne_string(const ggml_tensor * t) {
|
||||
std::string str;
|
||||
@@ -47,8 +76,7 @@ static float common_ggml_get_float_value(const uint8_t * data,
|
||||
|
||||
#define INDENT " "
|
||||
|
||||
template <bool abort>
|
||||
void common_debug_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) {
|
||||
static void common_debug_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n, bool abort_on_nan) {
|
||||
GGML_ASSERT(n > 0);
|
||||
float sum = 0;
|
||||
for (int64_t i3 = 0; i3 < ne[3]; i3++) {
|
||||
@@ -94,7 +122,7 @@ void common_debug_print_tensor(uint8_t * data, ggml_type type, const int64_t * n
|
||||
LOG(INDENT "sum = %f\n", sum);
|
||||
}
|
||||
|
||||
if constexpr (abort) {
|
||||
if (abort_on_nan) {
|
||||
if (std::isnan(sum)) {
|
||||
LOG("encountered NaN - aborting\n");
|
||||
exit(0);
|
||||
@@ -112,8 +140,9 @@ void common_debug_print_tensor(uint8_t * data, ggml_type type, const int64_t * n
|
||||
* @param user_data user data to pass at each call back
|
||||
* @return true to receive data or continue the graph, false otherwise
|
||||
*/
|
||||
template <bool abort_on_nan> bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
|
||||
auto * cb_data = (base_callback_data *) user_data;
|
||||
bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
|
||||
auto * cb_data = (common_debug_cb_user_data *) user_data;
|
||||
auto * pimpl = cb_data->pimpl.get();
|
||||
|
||||
const struct ggml_tensor * src0 = t->src[0];
|
||||
const struct ggml_tensor * src1 = t->src[1];
|
||||
@@ -122,10 +151,10 @@ template <bool abort_on_nan> bool common_debug_cb_eval(struct ggml_tensor * t, b
|
||||
return true; // Always retrieve data
|
||||
}
|
||||
|
||||
bool matches_filter = cb_data->tensor_filters.empty();
|
||||
bool matches_filter = pimpl->tensor_filters.empty();
|
||||
|
||||
if (!matches_filter) {
|
||||
for (const auto & filter : cb_data->tensor_filters) {
|
||||
for (const auto & filter : pimpl->tensor_filters) {
|
||||
if (std::regex_search(t->name, filter)) {
|
||||
matches_filter = true;
|
||||
break;
|
||||
@@ -148,20 +177,14 @@ template <bool abort_on_nan> bool common_debug_cb_eval(struct ggml_tensor * t, b
|
||||
|
||||
if (!is_host) {
|
||||
auto n_bytes = ggml_nbytes(t);
|
||||
cb_data->data.resize(n_bytes);
|
||||
ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes);
|
||||
pimpl->data.resize(n_bytes);
|
||||
ggml_backend_tensor_get(t, pimpl->data.data(), 0, n_bytes);
|
||||
}
|
||||
|
||||
if (!ggml_is_quantized(t->type) && matches_filter) {
|
||||
uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data();
|
||||
common_debug_print_tensor<abort_on_nan>(data, t->type, t->ne, t->nb, 3);
|
||||
uint8_t * data = is_host ? (uint8_t *) t->data : pimpl->data.data();
|
||||
common_debug_print_tensor(data, t->type, t->ne, t->nb, 3, pimpl->abort_on_nan);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// Explicit template instantiations
|
||||
template bool common_debug_cb_eval<false>(ggml_tensor *, bool, void *);
|
||||
template bool common_debug_cb_eval<true>(ggml_tensor *, bool, void *);
|
||||
template void common_debug_print_tensor<false>(uint8_t *, ggml_type, const int64_t *, const size_t *, int64_t);
|
||||
template void common_debug_print_tensor<true>(uint8_t *, ggml_type, const int64_t *, const size_t *, int64_t);
|
||||
|
||||
@@ -1,43 +1,31 @@
|
||||
#pragma once
|
||||
#include "common.h"
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <regex>
|
||||
|
||||
// common debug functions and structs
|
||||
|
||||
// Print a tensor's detailed data
|
||||
// data - the tensor's data in byte format
|
||||
// type - the tensor's quantization type
|
||||
// ne - the tensor dimensions array
|
||||
// nb - the tensor strides array
|
||||
// n - the number of rows/columns to fully print
|
||||
template <bool abort_on_nan> void common_debug_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n);
|
||||
struct common_params;
|
||||
|
||||
// Intended to use as callback for ggml_backend_sched_eval_callback
|
||||
// prints tensors that are processed in the computation graph
|
||||
// by default prints all tensors, but can be configured by creating a `base_callback_data` instance with
|
||||
// non-empty filter_patterns. See examples/debug.ccp for possible usage patterns
|
||||
// The template parameter determines whether an error should be thrown whenever a NaN is encountered
|
||||
// by default prints all tensors, but can be configured by creating a `common_debug_cb_user_data` instance with
|
||||
// non-empty filter_patterns. See examples/debug.cpp for possible usage patterns
|
||||
// `common_debug_cb_user_data` contains `abort_on_nan` flag that determines whether an error should be thrown whenever a NaN is encountered
|
||||
// in a tensor (useful for stopping debug sessions on first erroneous tensor)
|
||||
// The callback data will be passed as the third parameter (user_data)
|
||||
template <bool abort_on_nan> bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data);
|
||||
struct base_callback_data {
|
||||
std::vector<uint8_t> data;
|
||||
std::vector<std::regex> tensor_filters;
|
||||
bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data);
|
||||
|
||||
base_callback_data() = default;
|
||||
struct common_debug_cb_user_data {
|
||||
struct impl;
|
||||
std::unique_ptr<impl> pimpl;
|
||||
|
||||
base_callback_data(common_params & params, const std::vector<std::string> & filter_patterns) {
|
||||
for (const auto & pattern : filter_patterns) {
|
||||
try {
|
||||
std::string anchored_pattern = "^" + pattern;
|
||||
tensor_filters.emplace_back(anchored_pattern, std::regex::optimize);
|
||||
} catch (const std::regex_error & e) {
|
||||
throw std::runtime_error("Invalid regex pattern '" + pattern + "': " + e.what());
|
||||
}
|
||||
}
|
||||
params.cb_eval = common_debug_cb_eval<false>;
|
||||
params.cb_eval_user_data = this;
|
||||
}
|
||||
common_debug_cb_user_data();
|
||||
~common_debug_cb_user_data();
|
||||
|
||||
common_debug_cb_user_data(const common_debug_cb_user_data &) = delete;
|
||||
common_debug_cb_user_data & operator=(const common_debug_cb_user_data &) = delete;
|
||||
|
||||
common_debug_cb_user_data(common_params & params, const std::vector<std::string> & filter_patterns, bool abort_on_nan = false);
|
||||
};
|
||||
|
||||
@@ -627,7 +627,7 @@ static hf_cache::hf_file find_best_model(const hf_cache::hf_files & files,
|
||||
if (!tag.empty()) {
|
||||
tags.push_back(tag);
|
||||
} else {
|
||||
tags = {"Q4_K_M", "Q4_0"};
|
||||
tags = {"Q4_K_M", "Q8_0"};
|
||||
}
|
||||
|
||||
for (const auto & t : tags) {
|
||||
|
||||
@@ -109,16 +109,24 @@ static std::vector<llama_device_memory_data> common_get_device_memory_data(
|
||||
ret.back().total = total;
|
||||
}
|
||||
for (size_t i = 0; i < nd; i++) {
|
||||
ggml_backend_dev_t dev = llama_model_get_device(model, i);
|
||||
|
||||
size_t free;
|
||||
size_t total;
|
||||
ggml_backend_dev_memory(llama_model_get_device(model, i), &free, &total);
|
||||
ggml_backend_dev_memory(dev, &free, &total);
|
||||
|
||||
// devices can return 0 bytes for free and total memory if they do not
|
||||
// have any to report. in this case, we will use the host memory as a fallback
|
||||
// fixes: https://github.com/ggml-org/llama.cpp/issues/18577
|
||||
// Some non-GPU accelerator backends, such as BLAS, report 0/0 and rely on
|
||||
// the host-memory fallback. For GPU-like backends, keep 0/0 so --fit does
|
||||
// not assign anything to a device with an unknown memory budget.
|
||||
if (free == 0 && total == 0) {
|
||||
free = ret.back().free;
|
||||
total = ret.back().total;
|
||||
const enum ggml_backend_dev_type type = ggml_backend_dev_type(dev);
|
||||
if (type == GGML_BACKEND_DEVICE_TYPE_GPU || type == GGML_BACKEND_DEVICE_TYPE_IGPU) {
|
||||
LOG_WRN("%s: device %s did not report memory; --fit will not use it\n",
|
||||
__func__, ggml_backend_dev_name(dev));
|
||||
} else {
|
||||
free = ret.back().free;
|
||||
total = ret.back().total;
|
||||
}
|
||||
}
|
||||
ret[i].free = free;
|
||||
ret[i].total = total;
|
||||
@@ -856,7 +864,7 @@ void common_memory_breakdown_print(const struct llama_context * ctx) {
|
||||
ggml_backend_dev_memory(dev, &free, &total);
|
||||
|
||||
const size_t self = mb.model + mb.context + mb.compute;
|
||||
const size_t unaccounted = total - self - free;
|
||||
const int64_t unaccounted = static_cast<int64_t>(total) - static_cast<int64_t>(free) - static_cast<int64_t>(self);
|
||||
|
||||
table_data.push_back({
|
||||
template_gpu,
|
||||
@@ -867,7 +875,7 @@ void common_memory_breakdown_print(const struct llama_context * ctx) {
|
||||
std::to_string(mb.model / MiB),
|
||||
std::to_string(mb.context / MiB),
|
||||
std::to_string(mb.compute / MiB),
|
||||
std::to_string(unaccounted / MiB)});
|
||||
std::to_string(unaccounted / static_cast<int64_t>(MiB))});
|
||||
}
|
||||
|
||||
// print memory breakdown for host:
|
||||
|
||||
@@ -57,7 +57,7 @@ static fs::path get_cache_directory() {
|
||||
#ifndef _WIN32
|
||||
const struct passwd * pw = getpwuid(getuid());
|
||||
|
||||
if (pw->pw_dir && *pw->pw_dir) {
|
||||
if (pw && pw->pw_dir && *pw->pw_dir) {
|
||||
return fs::path(pw->pw_dir) / ".cache" / "huggingface" / "hub";
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -106,10 +106,16 @@ struct statement {
|
||||
size_t pos; // position in source, for debugging
|
||||
virtual ~statement() = default;
|
||||
virtual std::string type() const { return "Statement"; }
|
||||
|
||||
// execute_impl must be overridden by derived classes
|
||||
virtual value execute_impl(context &) { throw std::runtime_error("cannot exec " + type()); }
|
||||
virtual value execute_impl(context &) { throw_exec_error(); }
|
||||
// execute is the public method to execute a statement with error handling
|
||||
value execute(context &);
|
||||
|
||||
private:
|
||||
[[noreturn]] void throw_exec_error() const {
|
||||
throw std::runtime_error("cannot exec " + type());
|
||||
}
|
||||
};
|
||||
|
||||
// Type Checking Utilities
|
||||
@@ -143,7 +149,7 @@ struct program : public statement {
|
||||
program() = default;
|
||||
explicit program(statements && body) : body(std::move(body)) {}
|
||||
std::string type() const override { return "Program"; }
|
||||
value execute_impl(context &) override {
|
||||
[[noreturn]] value execute_impl(context &) override {
|
||||
throw std::runtime_error("Cannot execute program directly, use jinja::runtime instead");
|
||||
}
|
||||
};
|
||||
@@ -195,7 +201,7 @@ struct break_statement : public statement {
|
||||
}
|
||||
};
|
||||
|
||||
value execute_impl(context &) override {
|
||||
[[noreturn]] value execute_impl(context &) override {
|
||||
throw break_statement::signal();
|
||||
}
|
||||
};
|
||||
@@ -209,7 +215,7 @@ struct continue_statement : public statement {
|
||||
}
|
||||
};
|
||||
|
||||
value execute_impl(context &) override {
|
||||
[[noreturn]] value execute_impl(context &) override {
|
||||
throw continue_statement::signal();
|
||||
}
|
||||
};
|
||||
@@ -509,7 +515,7 @@ struct slice_expression : public expression {
|
||||
chk_type<expression>(this->step_expr);
|
||||
}
|
||||
std::string type() const override { return "SliceExpression"; }
|
||||
value execute_impl(context &) override {
|
||||
[[noreturn]] value execute_impl(context &) override {
|
||||
throw std::runtime_error("must be handled by MemberExpression");
|
||||
}
|
||||
};
|
||||
|
||||
@@ -590,6 +590,10 @@ static bool string_endswith(const std::string & str, const std::string & suffix)
|
||||
return str.compare(str.length() - suffix.length(), suffix.length(), suffix) == 0;
|
||||
}
|
||||
|
||||
[[noreturn]] static value string_join_not_implemented(const func_args &) {
|
||||
throw not_implemented_exception("String join builtin not implemented");
|
||||
}
|
||||
|
||||
const func_builtins & value_string_t::get_builtins() const {
|
||||
static const func_builtins builtins = {
|
||||
{"default", default_value},
|
||||
@@ -851,9 +855,7 @@ const func_builtins & value_string_t::get_builtins() const {
|
||||
res->val_str.mark_input_based_on(val_input->as_string());
|
||||
return res;
|
||||
}},
|
||||
{"join", [](const func_args &) -> value {
|
||||
throw not_implemented_exception("String join builtin not implemented");
|
||||
}},
|
||||
{"join", string_join_not_implemented},
|
||||
};
|
||||
return builtins;
|
||||
}
|
||||
@@ -884,6 +886,9 @@ const func_builtins & value_bool_t::get_builtins() const {
|
||||
return builtins;
|
||||
}
|
||||
|
||||
[[noreturn]] static value array_unique_not_implemented(const func_args &) {
|
||||
throw not_implemented_exception("Array unique builtin not implemented");
|
||||
}
|
||||
|
||||
const func_builtins & value_array_t::get_builtins() const {
|
||||
static const func_builtins builtins = {
|
||||
@@ -1084,13 +1089,14 @@ const func_builtins & value_array_t::get_builtins() const {
|
||||
std::reverse(arr.begin(), arr.end());
|
||||
return is_val<value_tuple>(val) ? mk_val<value_tuple>(std::move(arr)) : mk_val<value_array>(std::move(arr));
|
||||
}},
|
||||
{"unique", [](const func_args &) -> value {
|
||||
throw not_implemented_exception("Array unique builtin not implemented");
|
||||
}},
|
||||
{"unique", array_unique_not_implemented},
|
||||
};
|
||||
return builtins;
|
||||
}
|
||||
|
||||
[[noreturn]] static value object_join_not_implemented(const func_args &) {
|
||||
throw not_implemented_exception("object join not implemented");
|
||||
}
|
||||
|
||||
const func_builtins & value_object_t::get_builtins() const {
|
||||
if (!has_builtins) {
|
||||
@@ -1183,9 +1189,7 @@ const func_builtins & value_object_t::get_builtins() const {
|
||||
});
|
||||
return result;
|
||||
}},
|
||||
{"join", [](const func_args &) -> value {
|
||||
throw not_implemented_exception("object join not implemented");
|
||||
}},
|
||||
{"join", object_join_not_implemented},
|
||||
};
|
||||
return builtins;
|
||||
}
|
||||
|
||||
@@ -129,27 +129,25 @@ struct value_t {
|
||||
// Note: only for debugging and error reporting purposes
|
||||
virtual std::string type() const { return ""; }
|
||||
|
||||
virtual int64_t as_int() const { throw std::runtime_error(type() + " is not an int value"); }
|
||||
virtual double as_float() const { throw std::runtime_error(type() + " is not a float value"); }
|
||||
virtual string as_string() const { throw std::runtime_error(type() + " is not a string value"); }
|
||||
virtual bool as_bool() const { throw std::runtime_error(type() + " is not a bool value"); }
|
||||
virtual const std::vector<value> & as_array() const { throw std::runtime_error(type() + " is not an array value"); }
|
||||
virtual const std::vector<std::pair<value, value>> & as_ordered_object() const { throw std::runtime_error(type() + " is not an object value"); }
|
||||
virtual value invoke(const func_args &) const { throw std::runtime_error(type() + " is not a function value"); }
|
||||
virtual int64_t as_int() const { throw_type_error("is not an int value"); }
|
||||
virtual double as_float() const { throw_type_error("is not a float value"); }
|
||||
virtual string as_string() const { throw_type_error("is not a string value"); }
|
||||
virtual bool as_bool() const { throw_type_error("is not a bool value"); }
|
||||
virtual const std::vector<value> & as_array() const { throw_type_error("is not an array value"); }
|
||||
virtual const std::vector<std::pair<value, value>> & as_ordered_object() const { throw_type_error("is not an object value"); }
|
||||
virtual value invoke(const func_args &) const { throw_type_error("is not a function value"); }
|
||||
virtual bool is_none() const { return false; }
|
||||
virtual bool is_undefined() const { return false; }
|
||||
virtual const func_builtins & get_builtins() const {
|
||||
throw std::runtime_error("No builtins available for type " + type());
|
||||
}
|
||||
virtual const func_builtins & get_builtins() const { throw_type_error("has no builtins"); }
|
||||
|
||||
virtual bool has_key(const value &) { throw std::runtime_error(type() + " is not an object value"); }
|
||||
virtual void insert(const value & /* key */, const value & /* val */) { throw std::runtime_error(type() + " is not an object value"); }
|
||||
virtual value & at(const value & /* key */, value & /* default_val */) { throw std::runtime_error(type() + " is not an object value"); }
|
||||
virtual value & at(const value & /* key */) { throw std::runtime_error(type() + " is not an object value"); }
|
||||
virtual value & at(const std::string & /* key */, value & /* default_val */) { throw std::runtime_error(type() + " is not an object value"); }
|
||||
virtual value & at(const std::string & /* key */) { throw std::runtime_error(type() + " is not an object value"); }
|
||||
virtual value & at(int64_t /* idx */, value & /* default_val */) { throw std::runtime_error(type() + " is not an array value"); }
|
||||
virtual value & at(int64_t /* idx */) { throw std::runtime_error(type() + " is not an array value"); }
|
||||
virtual bool has_key(const value &) { throw_type_error("is not an object value"); }
|
||||
virtual void insert(const value & /* key */, const value & /* val */) { throw_type_error("is not an object value"); }
|
||||
virtual value & at(const value & /* key */, value & /* default_val */) { throw_type_error("is not an object value"); }
|
||||
virtual value & at(const value & /* key */) { throw_type_error("is not an object value"); }
|
||||
virtual value & at(const std::string & /* key */, value & /* default_val */) { throw_type_error("is not an object value"); }
|
||||
virtual value & at(const std::string & /* key */) { throw_type_error("is not an object value"); }
|
||||
virtual value & at(int64_t /* idx */, value & /* default_val */) { throw_type_error("is not an array value"); }
|
||||
virtual value & at(int64_t /* idx */) { throw_type_error("is not an array value"); }
|
||||
|
||||
virtual bool is_numeric() const { return false; }
|
||||
virtual bool is_hashable() const { return false; }
|
||||
@@ -163,6 +161,11 @@ struct value_t {
|
||||
// Note: only for debugging purposes
|
||||
virtual std::string as_repr() const { return as_string().str(); }
|
||||
|
||||
private:
|
||||
[[noreturn]] void throw_type_error(const char* expected) const {
|
||||
throw std::runtime_error(type() + " " + expected);
|
||||
}
|
||||
|
||||
protected:
|
||||
virtual bool equivalent(const value_t &) const = 0;
|
||||
virtual bool nonequal(const value_t & other) const { return !equivalent(other); }
|
||||
|
||||
@@ -49,7 +49,7 @@ enum common_log_col : int {
|
||||
};
|
||||
|
||||
// disable colors by default
|
||||
static std::vector<const char *> g_col = {
|
||||
static const char* g_col[] = {
|
||||
"",
|
||||
"",
|
||||
"",
|
||||
@@ -247,7 +247,6 @@ public:
|
||||
|
||||
entries = std::move(new_entries);
|
||||
}
|
||||
|
||||
cv.notify_one();
|
||||
}
|
||||
|
||||
@@ -265,7 +264,6 @@ public:
|
||||
{
|
||||
std::unique_lock<std::mutex> lock(mtx);
|
||||
cv.wait(lock, [this]() { return head != tail; });
|
||||
|
||||
cur = entries[head];
|
||||
|
||||
head = (head + 1) % entries.size();
|
||||
@@ -301,7 +299,6 @@ public:
|
||||
|
||||
tail = (tail + 1) % entries.size();
|
||||
}
|
||||
|
||||
cv.notify_one();
|
||||
}
|
||||
|
||||
@@ -338,7 +335,7 @@ public:
|
||||
g_col[COMMON_LOG_COL_CYAN] = LOG_COL_CYAN;
|
||||
g_col[COMMON_LOG_COL_WHITE] = LOG_COL_WHITE;
|
||||
} else {
|
||||
for (size_t i = 0; i < g_col.size(); i++) {
|
||||
for (size_t i = 0; i < std::size(g_col); i++) {
|
||||
g_col[i] = "";
|
||||
}
|
||||
}
|
||||
@@ -368,14 +365,20 @@ struct common_log * common_log_init() {
|
||||
}
|
||||
|
||||
struct common_log * common_log_main() {
|
||||
static struct common_log log;
|
||||
// We intentionally leak (i.e. do not delete) the logger singleton because
|
||||
// common_log destructor called at DLL teardown phase will cause hanging on Windows.
|
||||
// OS will release resources anyway so it should not be a significant issue,
|
||||
// though this design may cause logs to be lost if not flushed before the program exits.
|
||||
// Refer to https://github.com/ggml-org/llama.cpp/issues/22142 for details.
|
||||
static struct common_log * log;
|
||||
static std::once_flag init_flag;
|
||||
std::call_once(init_flag, [&]() {
|
||||
log = new common_log;
|
||||
// Set default to auto-detect colors
|
||||
log.set_colors(tty_can_use_colors());
|
||||
log->set_colors(tty_can_use_colors());
|
||||
});
|
||||
|
||||
return &log;
|
||||
return log;
|
||||
}
|
||||
|
||||
void common_log_pause(struct common_log * log) {
|
||||
|
||||
@@ -49,7 +49,11 @@ void common_log_default_callback(enum ggml_log_level level, const char * text, v
|
||||
struct common_log;
|
||||
|
||||
struct common_log * common_log_init();
|
||||
struct common_log * common_log_main(); // singleton, automatically destroys itself on exit
|
||||
|
||||
// Singleton, intentionally leaked to avoid Windows teardown hangs.
|
||||
// Call common_log_flush() before exit if you want to ensure all logs are flushed.
|
||||
struct common_log * common_log_main();
|
||||
|
||||
void common_log_pause (struct common_log * log); // pause the worker thread, not thread-safe
|
||||
void common_log_resume(struct common_log * log); // resume the worker thread, not thread-safe
|
||||
void common_log_free (struct common_log * log);
|
||||
|
||||
@@ -43,7 +43,7 @@ static std::set<std::string> get_remote_preset_whitelist(const std::map<std::str
|
||||
for (const auto & it : key_to_opt) {
|
||||
const std::string & key = it.first;
|
||||
const common_arg & opt = it.second;
|
||||
if (allowed_options.find(key) != allowed_options.end() || opt.is_sparam) {
|
||||
if (allowed_options.find(key) != allowed_options.end() || opt.is_sampling) {
|
||||
allowed_keys.insert(key);
|
||||
// also add variant keys (args without leading dashes and env vars)
|
||||
for (const auto & arg : opt.get_args()) {
|
||||
|
||||
@@ -122,6 +122,20 @@ static void common_reasoning_budget_accept(struct llama_sampler * smpl, llama_to
|
||||
}
|
||||
break;
|
||||
case REASONING_BUDGET_DONE:
|
||||
// Re-arm on a new start tag: some models emit multiple <think> blocks
|
||||
// per response, and each should get a fresh budget window.
|
||||
if (ctx->start_matcher.advance(token)) {
|
||||
ctx->state = REASONING_BUDGET_COUNTING;
|
||||
ctx->remaining = ctx->budget;
|
||||
ctx->end_matcher.reset();
|
||||
LOG_INF("reasoning-budget: re-activated on new start tag, budget=%d tokens\n", ctx->budget);
|
||||
|
||||
if (ctx->remaining <= 0) {
|
||||
ctx->state = REASONING_BUDGET_FORCING;
|
||||
ctx->force_pos = 0;
|
||||
LOG_INF("reasoning-budget: budget=0, forcing immediately\n");
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -144,6 +158,8 @@ static void common_reasoning_budget_apply(struct llama_sampler * smpl, llama_tok
|
||||
for (size_t i = 0; i < cur_p->size; i++) {
|
||||
if (cur_p->data[i].id != forced) {
|
||||
cur_p->data[i].logit = -INFINITY;
|
||||
} else {
|
||||
cur_p->data[i].logit = +INFINITY; // force the token
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -218,34 +234,6 @@ static struct llama_sampler * common_reasoning_budget_init_state(
|
||||
);
|
||||
}
|
||||
|
||||
struct llama_sampler * common_reasoning_budget_init(
|
||||
const struct llama_vocab * vocab,
|
||||
const std::vector<llama_token> & start_tokens,
|
||||
const std::vector<llama_token> & end_tokens,
|
||||
const std::vector<llama_token> & forced_tokens,
|
||||
int32_t budget,
|
||||
const std::vector<llama_token> & prefill_tokens) {
|
||||
// Determine initial state from prefill: COUNTING if the prefill begins with
|
||||
// the start sequence but does not also contain the end sequence after it.
|
||||
common_reasoning_budget_state initial_state = REASONING_BUDGET_IDLE;
|
||||
if (!prefill_tokens.empty() && !start_tokens.empty() &&
|
||||
prefill_tokens.size() >= start_tokens.size() &&
|
||||
std::equal(start_tokens.begin(), start_tokens.end(), prefill_tokens.begin())) {
|
||||
initial_state = REASONING_BUDGET_COUNTING;
|
||||
// If the end sequence also follows the start in the prefill, reasoning
|
||||
// was opened and immediately closed — stay IDLE.
|
||||
if (!end_tokens.empty() &&
|
||||
prefill_tokens.size() >= start_tokens.size() + end_tokens.size()) {
|
||||
auto end_start = prefill_tokens.end() - (ptrdiff_t) end_tokens.size();
|
||||
if (end_start >= prefill_tokens.begin() + (ptrdiff_t) start_tokens.size() &&
|
||||
std::equal(end_tokens.begin(), end_tokens.end(), end_start)) {
|
||||
initial_state = REASONING_BUDGET_IDLE;
|
||||
}
|
||||
}
|
||||
}
|
||||
return common_reasoning_budget_init_state(vocab, start_tokens, end_tokens, forced_tokens, budget, initial_state);
|
||||
}
|
||||
|
||||
struct llama_sampler * common_reasoning_budget_init(
|
||||
const struct llama_vocab * vocab,
|
||||
const std::vector<llama_token> & start_tokens,
|
||||
|
||||
@@ -29,10 +29,7 @@ enum common_reasoning_budget_state {
|
||||
// end_tokens - token sequence for natural deactivation
|
||||
// forced_tokens - token sequence forced when budget expires
|
||||
// budget - max tokens allowed in the reasoning block
|
||||
// prefill_tokens - tokens already present in the prompt (generation prompt);
|
||||
// used to determine the initial state: COUNTING if they begin
|
||||
// with start_tokens (but don't also end with end_tokens),
|
||||
// IDLE otherwise. COUNTING with budget <= 0 is promoted to FORCING.
|
||||
// initial_state - initial state
|
||||
//
|
||||
struct llama_sampler * common_reasoning_budget_init(
|
||||
const struct llama_vocab * vocab,
|
||||
@@ -40,16 +37,6 @@ struct llama_sampler * common_reasoning_budget_init(
|
||||
const std::vector<llama_token> & end_tokens,
|
||||
const std::vector<llama_token> & forced_tokens,
|
||||
int32_t budget,
|
||||
const std::vector<llama_token> & prefill_tokens = {});
|
||||
|
||||
// Variant that takes an explicit initial state (used by tests and clone).
|
||||
// COUNTING with budget <= 0 is promoted to FORCING.
|
||||
struct llama_sampler * common_reasoning_budget_init(
|
||||
const struct llama_vocab * vocab,
|
||||
const std::vector<llama_token> & start_tokens,
|
||||
const std::vector<llama_token> & end_tokens,
|
||||
const std::vector<llama_token> & forced_tokens,
|
||||
int32_t budget,
|
||||
common_reasoning_budget_state initial_state);
|
||||
common_reasoning_budget_state initial_state = REASONING_BUDGET_IDLE);
|
||||
|
||||
common_reasoning_budget_state common_reasoning_budget_get_state(const struct llama_sampler * smpl);
|
||||
|
||||
@@ -260,32 +260,35 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
|
||||
}
|
||||
}
|
||||
|
||||
// Compute prefill tokens from the generation prompt
|
||||
std::vector<llama_token> prefill_tokens;
|
||||
if (!params.generation_prompt.empty()) {
|
||||
GGML_ASSERT(vocab != nullptr);
|
||||
auto tokens = common_tokenize(vocab, params.generation_prompt, false, true);
|
||||
for (size_t i = 0; i < tokens.size(); i++) {
|
||||
std::string piece = common_token_to_piece(vocab, tokens[i], true);
|
||||
if (i == 0 && std::isspace(piece[0]) && !std::isspace(params.generation_prompt[0])) {
|
||||
// Some tokenizers will add a space before the first special token, need to exclude
|
||||
continue;
|
||||
}
|
||||
LOG_DBG("%s: prefill token: %d = %s\n", __func__, tokens[i], piece.c_str());
|
||||
prefill_tokens.push_back(tokens[i]);
|
||||
}
|
||||
}
|
||||
|
||||
// Feed generation prompt tokens to the grammar sampler so it advances past
|
||||
// tokens the template already placed in the prompt.
|
||||
// Only applies to output-format and tool-call grammars; user-supplied grammars must not be prefilled.
|
||||
std::vector<llama_token> prefill_tokens;
|
||||
if (!params.generation_prompt.empty() && common_grammar_needs_prefill(params.grammar)) {
|
||||
GGML_ASSERT(vocab != nullptr);
|
||||
prefill_tokens = common_tokenize(vocab, params.generation_prompt, false, true);
|
||||
if (!prefill_tokens.empty()) {
|
||||
std::string first_token = common_token_to_piece(vocab, prefill_tokens[0], true);
|
||||
if (std::isspace(first_token[0]) && !std::isspace(params.generation_prompt[0])) {
|
||||
// Some tokenizers will add a space before the first special token, need to remove
|
||||
prefill_tokens = std::vector<llama_token>(prefill_tokens.begin() + 1, prefill_tokens.end());
|
||||
}
|
||||
}
|
||||
|
||||
if (grmr && !params.grammar_lazy) {
|
||||
try {
|
||||
for (const auto & token : prefill_tokens) {
|
||||
llama_sampler_accept(grmr, token);
|
||||
LOG_DBG("%s: accepted prefill token (%d)\n", __func__, token);
|
||||
}
|
||||
} catch (std::exception &e) {
|
||||
LOG_ERR("%s: error initializing grammar sampler for grammar:\n%s\n\nGeneration prompt:\n'%s'\n", __func__,
|
||||
common_grammar_value(params.grammar).c_str(), params.generation_prompt.c_str());
|
||||
throw e;
|
||||
if (grmr && !params.grammar_lazy && common_grammar_needs_prefill(params.grammar)) {
|
||||
try {
|
||||
for (const auto & token : prefill_tokens) {
|
||||
llama_sampler_accept(grmr, token);
|
||||
LOG_DBG("%s: grammar accepted prefill token (%d)\n", __func__, token);
|
||||
}
|
||||
} catch (std::exception &e) {
|
||||
LOG_ERR("%s: error initializing grammar sampler for grammar:\n%s\n\nGeneration prompt:\n'%s'\n", __func__,
|
||||
common_grammar_value(params.grammar).c_str(), params.generation_prompt.c_str());
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -296,8 +299,12 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
|
||||
params.reasoning_budget_start,
|
||||
params.reasoning_budget_end,
|
||||
params.reasoning_budget_forced,
|
||||
params.reasoning_budget_tokens < 0 ? INT_MAX : params.reasoning_budget_tokens,
|
||||
prefill_tokens);
|
||||
params.reasoning_budget_tokens < 0 ? INT_MAX : params.reasoning_budget_tokens);
|
||||
|
||||
for (const auto & token : prefill_tokens) {
|
||||
llama_sampler_accept(rbudget, token);
|
||||
LOG_DBG("%s: reasoning-budget accepted prefill token (%d)\n", __func__, token);
|
||||
}
|
||||
}
|
||||
|
||||
if (params.has_logit_bias()) {
|
||||
@@ -431,7 +438,7 @@ static bool grammar_should_apply(struct common_sampler * gsmpl) {
|
||||
return true;
|
||||
}
|
||||
|
||||
void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
|
||||
void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool is_generated) {
|
||||
if (!gsmpl) {
|
||||
return;
|
||||
}
|
||||
@@ -439,9 +446,11 @@ void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, boo
|
||||
const auto tm = gsmpl->tm();
|
||||
|
||||
// grammar_should_apply() checks the reasoning budget state, so calculate this before we accept
|
||||
accept_grammar = accept_grammar && grammar_should_apply(gsmpl);
|
||||
const auto accept_grammar = is_generated && grammar_should_apply(gsmpl);
|
||||
|
||||
llama_sampler_accept(gsmpl->rbudget, token);
|
||||
if (gsmpl->rbudget && is_generated) {
|
||||
llama_sampler_accept(gsmpl->rbudget, token);
|
||||
}
|
||||
|
||||
if (gsmpl->grmr && accept_grammar) {
|
||||
llama_sampler_accept(gsmpl->grmr, token);
|
||||
|
||||
@@ -41,8 +41,8 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
|
||||
|
||||
void common_sampler_free(struct common_sampler * gsmpl);
|
||||
|
||||
// if accept_grammar is true, the token is accepted both by the sampling chain and the grammar
|
||||
void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar);
|
||||
// if is_generated is true, the token is accepted by the sampling chain, the reasoning budget sampler, and the grammar sampler
|
||||
void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool is_generated);
|
||||
void common_sampler_reset (struct common_sampler * gsmpl);
|
||||
struct common_sampler * common_sampler_clone (struct common_sampler * gsmpl);
|
||||
|
||||
|
||||
@@ -61,18 +61,26 @@ static bool common_speculative_are_compatible(
|
||||
LOG_DBG("%s: vocab_type dft: %d\n", __func__, vocab_type_dft);
|
||||
|
||||
if (vocab_type_tgt != vocab_type_dft) {
|
||||
LOG_DBG("%s: draft model vocab type must match target model to use speculation but ", __func__);
|
||||
LOG_DBG("vocab_type_dft = %d while vocab_type_tgt = %d\n", vocab_type_dft, vocab_type_tgt);
|
||||
LOG_WRN("%s: draft model vocab type must match target model to use speculation but "
|
||||
"vocab_type_dft = %d while vocab_type_tgt = %d\n", __func__, vocab_type_dft, vocab_type_tgt);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (
|
||||
llama_vocab_get_add_bos(vocab_tgt) != llama_vocab_get_add_bos(vocab_dft) ||
|
||||
llama_vocab_get_add_eos(vocab_tgt) != llama_vocab_get_add_eos(vocab_dft) ||
|
||||
llama_vocab_bos(vocab_tgt) != llama_vocab_bos(vocab_dft) ||
|
||||
llama_vocab_eos(vocab_tgt) != llama_vocab_eos(vocab_dft)
|
||||
) {
|
||||
LOG_DBG("%s: draft model special tokens must match target model to use speculation\n", __func__);
|
||||
if (llama_vocab_get_add_bos(vocab_tgt) != llama_vocab_get_add_bos(vocab_dft) ||
|
||||
(llama_vocab_get_add_bos(vocab_tgt) && llama_vocab_bos(vocab_tgt) != llama_vocab_bos(vocab_dft))) {
|
||||
LOG_WRN("%s: draft model bos tokens must match target model to use speculation. add: %d - %d, id: %d - %d)\n",
|
||||
__func__,
|
||||
llama_vocab_get_add_bos(vocab_tgt), llama_vocab_get_add_bos(vocab_dft),
|
||||
llama_vocab_bos(vocab_tgt), llama_vocab_bos(vocab_dft));
|
||||
return false;
|
||||
}
|
||||
|
||||
if (llama_vocab_get_add_eos(vocab_tgt) != llama_vocab_get_add_eos(vocab_dft) ||
|
||||
(llama_vocab_get_add_eos(vocab_tgt) && llama_vocab_eos(vocab_tgt) != llama_vocab_eos(vocab_dft))) {
|
||||
LOG_WRN("%s: draft model eos tokens must match target model to use speculation. add: %d - %d, id: %d - %d)\n",
|
||||
__func__,
|
||||
llama_vocab_get_add_eos(vocab_tgt), llama_vocab_get_add_eos(vocab_dft),
|
||||
llama_vocab_eos(vocab_tgt), llama_vocab_eos(vocab_dft));
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -143,6 +151,9 @@ struct common_speculative_state {
|
||||
llama_tokens & result) = 0;
|
||||
|
||||
virtual void accept(uint16_t n_accepted) = 0;
|
||||
|
||||
virtual int32_t n_max(const common_params_speculative & params) const = 0;
|
||||
virtual int32_t n_min(const common_params_speculative & params) const = 0;
|
||||
};
|
||||
|
||||
struct common_speculative_checkpoint {
|
||||
@@ -156,8 +167,6 @@ struct common_speculative_checkpoint {
|
||||
size_t size() const {
|
||||
return data.size();
|
||||
}
|
||||
|
||||
size_t ckpt_size = 0;
|
||||
};
|
||||
|
||||
struct common_speculative_state_draft : public common_speculative_state {
|
||||
@@ -165,7 +174,7 @@ struct common_speculative_state_draft : public common_speculative_state {
|
||||
llama_context * ctx_dft;
|
||||
|
||||
bool use_ckpt = false;
|
||||
struct common_speculative_checkpoint ckpt;
|
||||
common_speculative_checkpoint ckpt;
|
||||
|
||||
common_sampler * smpl;
|
||||
|
||||
@@ -238,29 +247,19 @@ struct common_speculative_state_draft : public common_speculative_state {
|
||||
llama_batch_free(batch);
|
||||
}
|
||||
|
||||
void begin(const llama_tokens & prompt) override {
|
||||
if (use_ckpt && ckpt.size() > 0) {
|
||||
// delete checkpoint
|
||||
LOG_DBG("%s: delete checkpoint, prompt.size=%zu, pos_min=%d, pos_max=%d, n_tokens=%" PRId64 ", size=%.3f MiB\n",
|
||||
__func__, prompt.size(), ckpt.pos_min, ckpt.pos_max, ckpt.n_tokens, (float) ckpt.data.size() / 1024 / 1024);
|
||||
ckpt.pos_min = 0;
|
||||
ckpt.pos_max = 0;
|
||||
ckpt.n_tokens = 0;
|
||||
ckpt.ckpt_size = 0;
|
||||
ckpt.data.clear();
|
||||
}
|
||||
void begin(const llama_tokens & /*prompt*/) override {
|
||||
}
|
||||
|
||||
size_t draft_create_checkpoint(int n_tokens_prompt, int n_tokens_batch) {
|
||||
size_t create_checkpoint(int n_tokens_prompt) {
|
||||
int slot_id = 0;
|
||||
const size_t checkpoint_size = llama_state_seq_get_size_ext(ctx_dft, slot_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
|
||||
const size_t checkpoint_size = llama_state_seq_get_size_ext(ctx_dft, slot_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE);
|
||||
|
||||
ckpt.pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx_dft), slot_id);
|
||||
ckpt.pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx_dft), slot_id);
|
||||
ckpt.n_tokens = n_tokens_prompt - n_tokens_batch;
|
||||
ckpt.n_tokens = n_tokens_prompt;
|
||||
ckpt.data.resize(checkpoint_size);
|
||||
|
||||
const size_t n = llama_state_seq_get_data_ext(ctx_dft, ckpt.data.data(), checkpoint_size, slot_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
|
||||
const size_t n = llama_state_seq_get_data_ext(ctx_dft, ckpt.data.data(), checkpoint_size, slot_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE);
|
||||
if (n != checkpoint_size) {
|
||||
GGML_ABORT("checkpoint size mismatch: expected %zu, got %zu\n", checkpoint_size, n);
|
||||
}
|
||||
@@ -270,13 +269,13 @@ struct common_speculative_state_draft : public common_speculative_state {
|
||||
return n;
|
||||
}
|
||||
|
||||
size_t draft_restore_checkpoint(size_t ckpt_size_part_expected) {
|
||||
size_t restore_checkpoint() {
|
||||
int slot_id = 0;
|
||||
LOG_DBG("%s: pos_min = %d, pos_max = %d\n", __func__, ckpt.pos_min, ckpt.pos_max);
|
||||
const size_t n = llama_state_seq_set_data_ext(ctx_dft, ckpt.data.data(), ckpt.size(), slot_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
|
||||
if (n != ckpt_size_part_expected) {
|
||||
GGML_ABORT("%s: failed to restore context checkpoint (pos_min=%d, pos_max=%d, size=%zu, get_data_ext->%zu, set_data_ext->%zu",
|
||||
__func__, ckpt.pos_min, ckpt.pos_max, ckpt.size(), ckpt_size_part_expected, n);
|
||||
const size_t n = llama_state_seq_set_data_ext(ctx_dft, ckpt.data.data(), ckpt.size(), slot_id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY | LLAMA_STATE_SEQ_FLAGS_ON_DEVICE);
|
||||
if (n != ckpt.size()) {
|
||||
GGML_ABORT("%s: failed to restore context checkpoint (pos_min=%d, pos_max=%d, size=%zu",
|
||||
__func__, ckpt.pos_min, ckpt.pos_max, ckpt.size());
|
||||
}
|
||||
llama_memory_seq_rm(llama_get_memory(ctx_dft), slot_id, ckpt.pos_max + 1, -1);
|
||||
|
||||
@@ -288,6 +287,8 @@ struct common_speculative_state_draft : public common_speculative_state {
|
||||
const llama_tokens & prompt_tgt,
|
||||
llama_token id_last,
|
||||
llama_tokens & result) override {
|
||||
const auto & sparams = params.draft;
|
||||
|
||||
auto * spec = this;
|
||||
|
||||
auto & batch = spec->batch;
|
||||
@@ -301,7 +302,7 @@ struct common_speculative_state_draft : public common_speculative_state {
|
||||
int reuse_i = 0; // index of part to be reused in prompt_dft
|
||||
int reuse_n = 0; // length of part to be reused in prompt_dft
|
||||
|
||||
const int n_ctx = llama_n_ctx(ctx_dft) - params.n_max;
|
||||
const int n_ctx = llama_n_ctx(ctx_dft) - sparams.n_max;
|
||||
|
||||
llama_tokens prompt_cnv;
|
||||
if (!spec->vocab_cmpt) {
|
||||
@@ -333,13 +334,18 @@ struct common_speculative_state_draft : public common_speculative_state {
|
||||
|
||||
const int i_start = std::max<int>(0, (int) prompt_cur.size() - n_ctx);
|
||||
|
||||
if (use_ckpt && i_start > 0) {
|
||||
LOG_WRN("%s: context shift is not supported with checkpoint-based contexts - skipping\n", __func__);
|
||||
return;
|
||||
}
|
||||
|
||||
// reuse as much as possible from the old draft context
|
||||
// ideally, the draft context should be as big as the target context and we will always reuse the entire prompt
|
||||
for (int i = 0; i < (int) prompt_dft.size(); ++i) {
|
||||
int cur = 0;
|
||||
while (i_start + cur < (int) prompt_cur.size() &&
|
||||
i + cur < (int) prompt_dft.size() &&
|
||||
prompt_cur[i_start + cur] == prompt_dft[i + cur]) {
|
||||
i + cur < (int) prompt_dft.size() &&
|
||||
prompt_cur[i_start + cur] == prompt_dft[i + cur]) {
|
||||
cur++;
|
||||
}
|
||||
|
||||
@@ -347,21 +353,26 @@ struct common_speculative_state_draft : public common_speculative_state {
|
||||
reuse_i = i;
|
||||
reuse_n = cur;
|
||||
}
|
||||
|
||||
if (use_ckpt) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
LOG_DBG("%s: reuse_i = %d, reuse_n = %d, #prompt_dft = %zu, #prompt_cur = %zu\n",
|
||||
__func__, reuse_i, reuse_n, prompt_dft.size(), prompt_cur.size());
|
||||
if (use_ckpt && ckpt.ckpt_size == 0 && reuse_n > 0) {
|
||||
LOG_DBG("%s: no checkpoint available, no reuse, (reuse_i=%d, reuse_n=%d) -> (0, 0)\n",
|
||||
__func__, reuse_i, reuse_n);
|
||||
if (use_ckpt && ckpt.n_tokens > reuse_n) {
|
||||
LOG_DBG("%s: checkpoint (n_tokens = %d) is outdated -> delete it\n", __func__, (int) ckpt.n_tokens);
|
||||
|
||||
reuse_i = 0;
|
||||
reuse_n = 0;
|
||||
|
||||
ckpt = {};
|
||||
}
|
||||
|
||||
result.clear();
|
||||
result.reserve(params.n_max);
|
||||
result.reserve(sparams.n_max);
|
||||
|
||||
bool needs_ckpt = use_ckpt && prompt_dft.size() > 0;
|
||||
if (reuse_n == 0 || (use_ckpt && reuse_i > 0)) {
|
||||
llama_memory_clear(mem_dft, false);
|
||||
prompt_dft.clear();
|
||||
@@ -372,7 +383,7 @@ struct common_speculative_state_draft : public common_speculative_state {
|
||||
for (int i = reuse_i + reuse_n + 1; i < (int) prompt_dft.size(); ++i) {
|
||||
result.push_back(prompt_dft[i]);
|
||||
|
||||
if (params.n_max <= (int) result.size()) {
|
||||
if (sparams.n_max <= (int) result.size()) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -380,50 +391,38 @@ struct common_speculative_state_draft : public common_speculative_state {
|
||||
return;
|
||||
}
|
||||
|
||||
bool do_restore = false;
|
||||
if (prompt_dft.size() > prompt_cur.size() && reuse_i + reuse_n < (int64_t) prompt_dft.size()) {
|
||||
// This can happen after a partial acceptance (speculative decoding with checkpoints)
|
||||
LOG_DBG("%s: #prompt_dft=%zu, #prompt_cur=%zu, shorten draft\n",
|
||||
__func__, prompt_dft.size(), prompt_cur.size());
|
||||
prompt_dft.resize(prompt_cur.size());
|
||||
do_restore = true;
|
||||
}
|
||||
|
||||
if (reuse_i > 0) {
|
||||
GGML_ASSERT(!use_ckpt);
|
||||
|
||||
bool is_removed = llama_memory_seq_rm (mem_dft, 0, 0, reuse_i);
|
||||
if (!is_removed) {
|
||||
LOG_ERR("%s: llama_memory_seq_rm failed, reuse_i=%d\n", __func__, reuse_i);
|
||||
return;
|
||||
}
|
||||
llama_memory_seq_add(mem_dft, 0, reuse_i, -1, -reuse_i);
|
||||
|
||||
prompt_dft.erase(prompt_dft.begin(), prompt_dft.begin() + reuse_i);
|
||||
}
|
||||
|
||||
if (reuse_n < (int) prompt_dft.size() || do_restore) {
|
||||
if (reuse_n < (int) prompt_dft.size()) {
|
||||
if (use_ckpt) {
|
||||
if (ckpt.n_tokens > (int64_t) prompt_dft.size()) {
|
||||
LOG_INF("%s: checkpoint is too large, prompt_tgt.size=%zu, ckpt.n_tokens=%" PRId64 ", reuse_n=%d, prompt_dft.size=%zu\n",
|
||||
__func__, prompt_tgt.size(), ckpt.n_tokens, reuse_n, prompt_dft.size());
|
||||
if (ckpt.n_tokens > 0) {
|
||||
LOG_DBG("%s: restoring checkpoint, reuse_n=%d, prompt_dft.size=%zu\n", __func__, reuse_n, prompt_dft.size());
|
||||
restore_checkpoint();
|
||||
reuse_n = ckpt.n_tokens;
|
||||
prompt_dft.resize(reuse_n);
|
||||
}
|
||||
draft_restore_checkpoint(ckpt.ckpt_size);
|
||||
reuse_n = ckpt.n_tokens;
|
||||
prompt_dft.resize(reuse_n);
|
||||
needs_ckpt = false;
|
||||
} else {
|
||||
bool is_removed = llama_memory_seq_rm (mem_dft, 0, reuse_n, -1);
|
||||
const bool is_removed = llama_memory_seq_rm(mem_dft, 0, reuse_n, -1);
|
||||
if (!is_removed) {
|
||||
LOG_ERR("%s: llama_memory_seq_rm failed, reuse_n=%d, prompt_dft.size=%zu\n",
|
||||
__func__, reuse_n, prompt_dft.size());
|
||||
LOG_ERR("%s: llama_memory_seq_rm failed, reuse_n=%d, prompt_dft.size=%zu\n", __func__, reuse_n, prompt_dft.size());
|
||||
return;
|
||||
}
|
||||
prompt_dft.erase(prompt_dft.begin() + reuse_n, prompt_dft.end());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (needs_ckpt) {
|
||||
ckpt.ckpt_size = draft_create_checkpoint(prompt_dft.size(), batch.n_tokens);
|
||||
}
|
||||
|
||||
// prepare a batch to evaluate any new tokens in the prompt
|
||||
common_batch_clear(batch);
|
||||
|
||||
@@ -437,12 +436,17 @@ struct common_speculative_state_draft : public common_speculative_state {
|
||||
// we should rarely end-up here during normal decoding
|
||||
if (batch.n_tokens > 0) {
|
||||
//LOG_DBG("%s: draft prompt batch: %s\n", __func__, string_from(ctx, batch).c_str());
|
||||
LOG_DBG("%s: draft prompt batch: %d tokens\n", __func__, batch.n_tokens);
|
||||
|
||||
int ret = llama_decode(ctx_dft, batch);
|
||||
if (ret != 0 && ret != 1) {
|
||||
LOG_WRN("%s: llama_decode returned %d, prompt_cur.size=%zu\n",
|
||||
__func__, ret, prompt_cur.size());
|
||||
}
|
||||
|
||||
if (use_ckpt) {
|
||||
create_checkpoint(prompt_dft.size());
|
||||
}
|
||||
}
|
||||
|
||||
const llama_pos n_past = prompt_dft.size();
|
||||
@@ -454,7 +458,7 @@ struct common_speculative_state_draft : public common_speculative_state {
|
||||
|
||||
prompt_dft.push_back(id_last);
|
||||
|
||||
LOG_DBG("%s: draft prompt: %s\n", __func__, string_from(ctx_dft, prompt_dft).c_str());
|
||||
//LOG_DBG("%s: draft prompt: %s\n", __func__, string_from(ctx_dft, prompt_dft).c_str());
|
||||
|
||||
int ret = llama_decode(ctx_dft, batch);
|
||||
if (ret != 0 && ret != 1) {
|
||||
@@ -465,7 +469,7 @@ struct common_speculative_state_draft : public common_speculative_state {
|
||||
common_sampler_reset(smpl);
|
||||
|
||||
// sample n_draft tokens from the draft model
|
||||
for (int i = 0; i < params.n_max; ++i) {
|
||||
for (int i = 0; i < sparams.n_max; ++i) {
|
||||
common_batch_clear(batch);
|
||||
|
||||
common_sampler_sample(smpl, ctx_dft, 0, true);
|
||||
@@ -482,14 +486,14 @@ struct common_speculative_state_draft : public common_speculative_state {
|
||||
|
||||
common_sampler_accept(smpl, id, true);
|
||||
|
||||
result.push_back(id);
|
||||
|
||||
if (params.n_max <= (int) result.size()) {
|
||||
// only collect very high-confidence draft tokens
|
||||
if (cur_p->data[0].p < sparams.p_min) {
|
||||
break;
|
||||
}
|
||||
|
||||
// only collect very high-confidence draft tokens
|
||||
if (cur_p->data[0].p < params.p_min) {
|
||||
result.push_back(id);
|
||||
|
||||
if (sparams.n_max <= (int) result.size()) {
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -510,10 +514,14 @@ struct common_speculative_state_draft : public common_speculative_state {
|
||||
detokenized = replace_to_tgt(detokenized);
|
||||
LOG_DBG("draft->main detokenized string: '%s'\n", detokenized.c_str());
|
||||
result = common_tokenize(ctx_tgt, detokenized, false, true);
|
||||
if (result.size() > (size_t)params.n_max) {
|
||||
result.resize(params.n_max);
|
||||
if (result.size() > (size_t) sparams.n_max) {
|
||||
result.resize(sparams.n_max);
|
||||
}
|
||||
}
|
||||
|
||||
if (result.size() < (size_t) sparams.n_min) {
|
||||
result.clear();
|
||||
}
|
||||
}
|
||||
|
||||
void accept(uint16_t n_accepted) override {
|
||||
@@ -521,6 +529,14 @@ struct common_speculative_state_draft : public common_speculative_state {
|
||||
GGML_UNUSED(n_accepted);
|
||||
}
|
||||
|
||||
int32_t n_max(const common_params_speculative & params) const override {
|
||||
return params.draft.n_max;
|
||||
}
|
||||
|
||||
int32_t n_min(const common_params_speculative & params) const override {
|
||||
return params.draft.n_min;
|
||||
}
|
||||
|
||||
std::string replace_to_dft(const std::string & input) const {
|
||||
std::string result = input;
|
||||
|
||||
@@ -573,6 +589,14 @@ struct common_speculative_state_eagle3 : public common_speculative_state {
|
||||
// noop
|
||||
GGML_UNUSED(n_accepted);
|
||||
}
|
||||
|
||||
int32_t n_max(const common_params_speculative & params) const override {
|
||||
return params.draft.n_max;
|
||||
}
|
||||
|
||||
int32_t n_min(const common_params_speculative & params) const override {
|
||||
return params.draft.n_min;
|
||||
}
|
||||
};
|
||||
|
||||
// state of self-speculation (simple implementation, not ngram-map)
|
||||
@@ -602,19 +626,27 @@ struct common_speculative_state_ngram_simple : public common_speculative_state {
|
||||
// noop
|
||||
GGML_UNUSED(n_accepted);
|
||||
}
|
||||
|
||||
int32_t n_max(const common_params_speculative & /*params*/) const override {
|
||||
return config.size_mgram;
|
||||
}
|
||||
|
||||
int32_t n_min(const common_params_speculative & /*params*/) const override {
|
||||
return config.size_mgram;
|
||||
}
|
||||
};
|
||||
|
||||
struct common_speculative_state_ngram_map_k : public common_speculative_state {
|
||||
// draft ngram map for speculative decoding without draft model
|
||||
common_ngram_map map;
|
||||
common_ngram_map config;
|
||||
|
||||
common_speculative_state_ngram_map_k(
|
||||
enum common_speculative_type type,
|
||||
common_ngram_map map)
|
||||
: common_speculative_state(type), map(std::move(map)) {}
|
||||
common_ngram_map config)
|
||||
: common_speculative_state(type), config(std::move(config)) {}
|
||||
|
||||
void begin(const llama_tokens & prompt) override {
|
||||
common_ngram_map_begin(map, prompt);
|
||||
common_ngram_map_begin(config, prompt);
|
||||
}
|
||||
|
||||
void draft(
|
||||
@@ -622,12 +654,20 @@ struct common_speculative_state_ngram_map_k : public common_speculative_state {
|
||||
const llama_tokens & prompt_tgt,
|
||||
llama_token id_last,
|
||||
llama_tokens & result) override {
|
||||
common_ngram_map_draft(map, prompt_tgt, id_last, result);
|
||||
common_ngram_map_draft(config, prompt_tgt, id_last, result);
|
||||
GGML_UNUSED(params);
|
||||
}
|
||||
|
||||
void accept(uint16_t n_accepted) override {
|
||||
common_ngram_map_accept(map, n_accepted);
|
||||
common_ngram_map_accept(config, n_accepted);
|
||||
}
|
||||
|
||||
int32_t n_max(const common_params_speculative & /*params*/) const override {
|
||||
return config.size_value;
|
||||
}
|
||||
|
||||
int32_t n_min(const common_params_speculative & /*params*/) const override {
|
||||
return config.size_value;
|
||||
}
|
||||
};
|
||||
|
||||
@@ -684,7 +724,7 @@ struct common_speculative_state_ngram_mod : public common_speculative_state {
|
||||
const llama_tokens & prompt_tgt,
|
||||
llama_token id_last,
|
||||
llama_tokens & result) override {
|
||||
GGML_UNUSED(params);
|
||||
const auto & sparams = params.ngram_mod;
|
||||
|
||||
n_draft_last = 0;
|
||||
|
||||
@@ -704,16 +744,16 @@ struct common_speculative_state_ngram_mod : public common_speculative_state {
|
||||
i_last = cur_len - n;
|
||||
}
|
||||
|
||||
result.resize(n + params.n_max);
|
||||
result.resize(n + sparams.n_max);
|
||||
for (size_t i = 0; i < n - 1; ++i) {
|
||||
result[i] = prompt_tgt[cur_len - n + 1 + i];
|
||||
}
|
||||
result[n - 1] = id_last;
|
||||
|
||||
for (int i = 0; i < params.n_max; ++i) {
|
||||
for (int i = 0; i < sparams.n_max; ++i) {
|
||||
const llama_token token = mod.get(result.data() + i);
|
||||
if (token == common_ngram_mod::EMPTY) {
|
||||
if (i < params.n_min) {
|
||||
if (i < sparams.n_min) {
|
||||
result.clear();
|
||||
return;
|
||||
}
|
||||
@@ -735,17 +775,15 @@ struct common_speculative_state_ngram_mod : public common_speculative_state {
|
||||
}
|
||||
|
||||
void accept(uint16_t n_accepted) override {
|
||||
if (verbose) {
|
||||
LOG_INF("%s: accepted %d tokens from %zu drafted tokens\n", __func__, n_accepted, n_draft_last);
|
||||
}
|
||||
|
||||
// compute acceptance fraction if we have a recorded draft length
|
||||
if (n_draft_last > 0) {
|
||||
const double f_acc = (double)n_accepted / (double)n_draft_last;
|
||||
if (f_acc < 0.5) {
|
||||
n_low++;
|
||||
if (n_low >= 3) {
|
||||
LOG_WRN("%s: low acceptance streak (%d) – resetting ngram_mod\n", __func__, n_low);
|
||||
if (verbose) {
|
||||
LOG_WRN("%s: low acceptance streak (%d) – resetting ngram_mod\n", __func__, n_low);
|
||||
}
|
||||
|
||||
mod.reset();
|
||||
n_low = 0;
|
||||
@@ -756,6 +794,14 @@ struct common_speculative_state_ngram_mod : public common_speculative_state {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int32_t n_max(const common_params_speculative & params) const override {
|
||||
return params.ngram_mod.n_max;
|
||||
}
|
||||
|
||||
int32_t n_min(const common_params_speculative & params) const override {
|
||||
return params.ngram_mod.n_min;
|
||||
}
|
||||
};
|
||||
|
||||
struct common_speculative_state_ngram_cache : public common_speculative_state {
|
||||
@@ -849,6 +895,14 @@ struct common_speculative_state_ngram_cache : public common_speculative_state {
|
||||
// TODO: noop
|
||||
GGML_UNUSED(n_accepted);
|
||||
}
|
||||
|
||||
int32_t n_max(const common_params_speculative & /*params*/) const override {
|
||||
return n_draft;
|
||||
}
|
||||
|
||||
int32_t n_min(const common_params_speculative & /*params*/) const override {
|
||||
return 0;
|
||||
}
|
||||
};
|
||||
|
||||
struct common_speculative {
|
||||
@@ -857,11 +911,13 @@ struct common_speculative {
|
||||
common_speculative_state * curr_impl = nullptr; // current implementation in use (for stats)
|
||||
};
|
||||
|
||||
static common_ngram_map get_common_ngram_map(const common_speculative_config & config) {
|
||||
uint16_t size_key = config.params.ngram_size_n;
|
||||
uint16_t size_value = config.params.ngram_size_m;
|
||||
bool key_only = (config.type == COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K);
|
||||
uint16_t min_hits = config.params.ngram_min_hits;
|
||||
static common_ngram_map get_common_ngram_map(
|
||||
common_speculative_type type,
|
||||
const common_params_speculative_ngram_map & config) {
|
||||
uint16_t size_key = config.size_n;
|
||||
uint16_t size_value = config.size_m;
|
||||
bool key_only = type == COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K;
|
||||
uint16_t min_hits = config.min_hits;
|
||||
|
||||
return common_ngram_map(size_key, size_value, key_only, min_hits);
|
||||
}
|
||||
@@ -919,8 +975,8 @@ common_speculative * common_speculative_init(
|
||||
common_params_speculative & params,
|
||||
llama_context * ctx_tgt) {
|
||||
llama_context * ctx_dft = nullptr;
|
||||
if (params.model_dft) {
|
||||
ctx_dft = llama_init_from_model(params.model_dft, params.cparams_dft);
|
||||
if (params.draft.model) {
|
||||
ctx_dft = llama_init_from_model(params.draft.model, params.draft.cparams);
|
||||
if (ctx_dft == nullptr) {
|
||||
LOG_ERR("%s", "failed to create draft context\n");
|
||||
return nullptr;
|
||||
@@ -930,7 +986,7 @@ common_speculative * common_speculative_init(
|
||||
// Compute the implementations to use based on the config and their order of preference
|
||||
std::vector<common_speculative_config> configs = {}; // list of speculative configs to try
|
||||
{
|
||||
bool has_draft = !params.mparams_dft.path.empty();
|
||||
bool has_draft = !params.draft.mparams.path.empty();
|
||||
bool has_draft_eagle3 = false; // TODO PR-18039: if params.speculative.eagle3
|
||||
|
||||
bool has_ngram_cache = (params.type == COMMON_SPECULATIVE_TYPE_NGRAM_CACHE);
|
||||
@@ -953,16 +1009,17 @@ common_speculative * common_speculative_init(
|
||||
configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V, params));
|
||||
}
|
||||
if (has_ngram_mod) {
|
||||
// shared instance for all speculative decoding contexts
|
||||
if (!params.ngram_mod) {
|
||||
params.ngram_mod = std::make_shared<common_ngram_mod>(params.ngram_size_n, 4*1024*1024);
|
||||
auto & sparams = params.ngram_mod;
|
||||
|
||||
LOG_INF("%s: initialized ngram_mod with n=%d, size=%zu (%.3f MB)\n", __func__,
|
||||
params.ngram_size_n, params.ngram_mod->size(),
|
||||
(float)(params.ngram_mod->size_bytes())/1024/1024);
|
||||
if (!sparams.obj) {
|
||||
sparams.obj = std::make_shared<common_ngram_mod>(sparams.n_match, 4*1024*1024);
|
||||
|
||||
if (params.ngram_size_n < 16) {
|
||||
LOG_WRN("%s: ngram_mod n=%d is too small - poor quality is possible, see: https://github.com/ggml-org/llama.cpp/pull/19164\n", __func__, params.ngram_size_n);
|
||||
LOG_INF("%s: initialized ngram_mod with n_match=%d, size=%zu (%.3f MB)\n", __func__,
|
||||
sparams.n_match, sparams.obj->size(), (float)(sparams.obj->size_bytes())/1024/1024);
|
||||
|
||||
if (sparams.n_match < 16) {
|
||||
LOG_WRN("%s: ngram_mod n_match=%d is too small - poor quality is possible, "
|
||||
"see: https://github.com/ggml-org/llama.cpp/pull/19164\n", __func__, sparams.n_match);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -992,7 +1049,7 @@ common_speculative * common_speculative_init(
|
||||
impls.push_back(std::make_unique<common_speculative_state_draft>(config.type,
|
||||
/* .ctx_tgt = */ ctx_tgt,
|
||||
/* .ctx_dft = */ ctx_dft,
|
||||
/* .replacements = */ params.replacements,
|
||||
/* .replacements = */ params.draft.replacements,
|
||||
/* .use_ckpt = */ use_ckpt
|
||||
));
|
||||
break;
|
||||
@@ -1002,18 +1059,18 @@ common_speculative * common_speculative_init(
|
||||
break;
|
||||
}
|
||||
case COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE: {
|
||||
common_ngram_map ngram_map = get_common_ngram_map(config);
|
||||
common_ngram_map ngram_map = get_common_ngram_map(config.type, config.params.ngram_simple);
|
||||
|
||||
uint16_t ngram_size_key = ngram_map.size_key;
|
||||
uint16_t mgram_size_value = ngram_map.size_value;
|
||||
|
||||
auto config_simple = common_ngram_simple_config {
|
||||
/* .size_ngram = */ ngram_size_key,
|
||||
/* .size_mgram = */ mgram_size_value
|
||||
/* .size_ngram = */ ngram_size_key,
|
||||
/* .size_mgram = */ mgram_size_value
|
||||
};
|
||||
auto state = std::make_unique<common_speculative_state_ngram_simple>(
|
||||
/* .type = */ config.type,
|
||||
/* .state = */ config_simple
|
||||
/* .type = */ config.type,
|
||||
/* .state = */ config_simple
|
||||
);
|
||||
impls.push_back(std::move(state));
|
||||
break;
|
||||
@@ -1022,18 +1079,17 @@ common_speculative * common_speculative_init(
|
||||
case COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V: {
|
||||
impls.push_back(std::make_unique<common_speculative_state_ngram_map_k>(
|
||||
(config.type),
|
||||
get_common_ngram_map(config)
|
||||
get_common_ngram_map(config.type, config.params.ngram_map_k)
|
||||
));
|
||||
break;
|
||||
}
|
||||
case COMMON_SPECULATIVE_TYPE_NGRAM_MOD: {
|
||||
GGML_ASSERT(config.params.ngram_mod);
|
||||
impls.push_back(std::make_unique<common_speculative_state_ngram_mod>(config.type, *config.params.ngram_mod));
|
||||
GGML_ASSERT(config.params.ngram_mod.obj);
|
||||
impls.push_back(std::make_unique<common_speculative_state_ngram_mod>(config.type, *config.params.ngram_mod.obj));
|
||||
break;
|
||||
}
|
||||
case COMMON_SPECULATIVE_TYPE_NGRAM_CACHE: {
|
||||
auto state = create_state_ngram_cache(
|
||||
params.lookup_cache_static, params.lookup_cache_dynamic, config);
|
||||
auto state = create_state_ngram_cache(params.ngram_cache.lookup_cache_static, params.ngram_cache.lookup_cache_dynamic, config);
|
||||
impls.push_back(std::make_unique<common_speculative_state_ngram_cache>(state));
|
||||
break;
|
||||
}
|
||||
@@ -1091,6 +1147,15 @@ llama_tokens common_speculative_draft(
|
||||
impl->n_call_draft++;
|
||||
}
|
||||
|
||||
{
|
||||
const int n_min = impl->n_min(params);
|
||||
|
||||
if (!result.empty() && (int) result.size() < n_min) {
|
||||
LOG_DBG("%s: ignoring small draft: %d < %d\n", __func__, (int) result.size(), n_min);
|
||||
result.clear();
|
||||
}
|
||||
}
|
||||
|
||||
if (!result.empty()) {
|
||||
LOG_DBG("%s: called impl %s, hist size = %zu, call_count = %zu, gen = %zu\n", __func__,
|
||||
common_speculative_type_to_str(impl.get()->type).c_str(), prompt_tgt.size(),
|
||||
@@ -1100,7 +1165,7 @@ llama_tokens common_speculative_draft(
|
||||
impl->n_gen_drafts++;
|
||||
impl->n_gen_tokens += result.size();
|
||||
|
||||
break; // We have a draft, so break out of the loop and return it.
|
||||
break; // we have a draft, so break out of the loop and return it.
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1128,6 +1193,32 @@ void common_speculative_accept(common_speculative * spec, uint16_t n_accepted) {
|
||||
}
|
||||
}
|
||||
|
||||
int32_t common_speculative_n_max(const common_speculative * spec, const common_params_speculative & params) {
|
||||
if (spec == nullptr) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
int32_t n_max = 0;
|
||||
for (const auto & impl : spec->impls) {
|
||||
n_max = std::max(n_max, impl->n_max(params));
|
||||
}
|
||||
|
||||
return n_max;
|
||||
}
|
||||
|
||||
int32_t common_speculative_n_min(const common_speculative * spec, const common_params_speculative & params) {
|
||||
if (spec == nullptr) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
int32_t n_min = 0;
|
||||
for (const auto & impl : spec->impls) {
|
||||
n_min = std::max(n_min, impl->n_min(params));
|
||||
}
|
||||
|
||||
return n_min;
|
||||
}
|
||||
|
||||
void common_speculative_print_stats(const common_speculative * spec) {
|
||||
if (spec == nullptr) {
|
||||
return;
|
||||
|
||||
@@ -33,6 +33,9 @@ llama_tokens common_speculative_draft(
|
||||
// informs the speculative decoder that n_accepted tokens were accepted by the target model
|
||||
void common_speculative_accept(common_speculative * spec, uint16_t n_accepted);
|
||||
|
||||
int32_t common_speculative_n_max(const common_speculative * spec, const common_params_speculative & params);
|
||||
int32_t common_speculative_n_min(const common_speculative * spec, const common_params_speculative & params);
|
||||
|
||||
// print statistics about the speculative decoding
|
||||
void common_speculative_print_stats(const common_speculative * spec);
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -175,6 +175,7 @@ pre_computed_hashes = [
|
||||
{"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-34B-Base", "chkhsh": "48f8e02c0359c0bbdd82f26909171fac1c18a457bb47573ed1fe3bbb2c1cfd4b"},
|
||||
{"name": "kimi-k2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/moonshotai/Kimi-K2-Base", "chkhsh": "81212dc7cdb7e0c1074ca62c5aeab0d43c9f52b8a737be7b12a777c953027890"},
|
||||
{"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen3-Embedding-0.6B", "chkhsh": "d4540891389ea895b53b399da6ac824becc30f2fba0e9ddbb98f92e55ca0e97c"},
|
||||
{"name": "qwen35", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openbmb/MiniCPM-V-4_6", "chkhsh": "1444df51289cfa8063b96f0e62b1125440111bc79a52003ea14b6eac7016fd5f"},
|
||||
{"name": "grok-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/alvarobartt/grok-2-tokenizer", "chkhsh": "66b8d4e19ab16c3bfd89bce5d785fb7e0155e8648708a1f42077cb9fe002c273"},
|
||||
# jina-v2-de variants
|
||||
{"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/aari1995/German_Semantic_V3", "chkhsh": "b3d1dd861f1d4c5c0d2569ce36baf3f90fe8a102db3de50dd71ff860d91be3df"},
|
||||
|
||||
@@ -51,6 +51,12 @@ The packages for FP32 and FP16 would have different accuracy and performance on
|
||||
|
||||
## News
|
||||
|
||||
- 2026.04
|
||||
|
||||
- Optimize mul_mat with the reorder feature for data types: Q4_K, Q5_K, Q_K, Q8_0.
|
||||
- Fused MoE.
|
||||
- Upgrade CI and built package to oneAPI 2025.3.3; support built package for Ubuntu 24.04.
|
||||
|
||||
- 2026.03
|
||||
- Support Flash-Attention: less memory usage, performance impact depends on LLM.
|
||||
|
||||
@@ -349,6 +355,12 @@ Choose one of following methods to run.
|
||||
./examples/sycl/test.sh
|
||||
```
|
||||
|
||||
- Run llama-server:
|
||||
|
||||
```sh
|
||||
./examples/sycl/start-svr.sh -m PATH/MODEL_FILE
|
||||
```
|
||||
|
||||
2. Command line
|
||||
Launch inference
|
||||
|
||||
@@ -637,10 +649,18 @@ Choose one of following methods to run.
|
||||
|
||||
1. Script
|
||||
|
||||
- Run test:
|
||||
|
||||
```
|
||||
examples\sycl\win-test.bat
|
||||
```
|
||||
|
||||
- Run llama-server:
|
||||
|
||||
```
|
||||
examples\sycl\win-start-svr.bat -m PATH\MODEL_FILE
|
||||
```
|
||||
|
||||
2. Command line
|
||||
|
||||
Launch inference
|
||||
|
||||
49
docs/multimodal/minicpmv4.6.md
Normal file
49
docs/multimodal/minicpmv4.6.md
Normal file
@@ -0,0 +1,49 @@
|
||||
## MiniCPM-V 4.6
|
||||
|
||||
### Prepare models and code
|
||||
|
||||
Download [MiniCPM-V-4_6](https://huggingface.co/openbmb/MiniCPM-V-4_6) PyTorch model from huggingface to "MiniCPM-V-4_6" folder.
|
||||
|
||||
The model must be the standard `transformers` v5.7.0+ checkpoint (no `trust_remote_code`); the architecture in `config.json` is `MiniCPMV4_6ForConditionalGeneration` with a `qwen3_5_text` text model and a SigLIP-based vision tower plus a window-attention `vit_merger`.
|
||||
|
||||
### Build llama.cpp
|
||||
|
||||
If there are differences in usage, please refer to the official build [documentation](https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md)
|
||||
|
||||
Clone llama.cpp:
|
||||
```bash
|
||||
git clone https://github.com/ggml-org/llama.cpp
|
||||
cd llama.cpp
|
||||
```
|
||||
|
||||
Build llama.cpp using `CMake`:
|
||||
```bash
|
||||
cmake -B build
|
||||
cmake --build build --config Release
|
||||
```
|
||||
|
||||
|
||||
### Usage of MiniCPM-V 4.6
|
||||
|
||||
Unlike older MiniCPM-V variants, MiniCPM-V 4.6 is converted directly through `convert_hf_to_gguf.py`. The same script is invoked twice on the original Hugging Face directory: once to produce the language-model GGUF and once with `--mmproj` to produce the multimodal projector GGUF.
|
||||
|
||||
```bash
|
||||
# language model
|
||||
python ./convert_hf_to_gguf.py ../MiniCPM-V-4_6 --outfile ../MiniCPM-V-4_6/ggml-model-f16.gguf
|
||||
|
||||
# multimodal projector (vision tower + window-attention vit_merger + DownsampleMLP merger)
|
||||
python ./convert_hf_to_gguf.py ../MiniCPM-V-4_6 --mmproj --outfile ../MiniCPM-V-4_6/mmproj-model-f16.gguf
|
||||
|
||||
# optional: quantize to Q4_K_M
|
||||
./build/bin/llama-quantize ../MiniCPM-V-4_6/ggml-model-f16.gguf ../MiniCPM-V-4_6/ggml-model-Q4_K_M.gguf Q4_K_M
|
||||
```
|
||||
|
||||
|
||||
Inference on Linux or Mac
|
||||
```bash
|
||||
# run in single-turn mode
|
||||
./build/bin/llama-mtmd-cli -m ../MiniCPM-V-4_6/ggml-model-f16.gguf --mmproj ../MiniCPM-V-4_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
|
||||
|
||||
# run in conversation mode
|
||||
./build/bin/llama-mtmd-cli -m ../MiniCPM-V-4_6/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-4_6/mmproj-model-f16.gguf
|
||||
```
|
||||
@@ -26,7 +26,7 @@ Legend:
|
||||
| CLAMP | ❌ | ✅ | ✅ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | ❌ | ❌ |
|
||||
| CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ❌ | ❌ |
|
||||
| CONV_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ |
|
||||
| CONV_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ |
|
||||
| CONV_2D_DW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
|
||||
| CONV_3D | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
||||
| CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
@@ -60,7 +60,7 @@ Legend:
|
||||
| GROUP_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| HARDSIGMOID | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||
| HARDSWISH | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||
| IM2COL | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| IM2COL | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| IM2COL_3D | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
|
||||
| L2_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| LEAKY_RELU | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ |
|
||||
@@ -105,7 +105,7 @@ Legend:
|
||||
| SQR | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
|
||||
| SQRT | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
|
||||
| SSM_CONV | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| SSM_SCAN | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |
|
||||
| SSM_SCAN | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | 🟡 | ✅ | ❌ | ❌ |
|
||||
| STEP | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||
| SUB | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| SUM | ❌ | 🟡 | ✅ | 🟡 | 🟡 | ❌ | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
|
||||
|
||||
5501
docs/ops/WebGPU.csv
5501
docs/ops/WebGPU.csv
File diff suppressed because it is too large
Load Diff
@@ -33,18 +33,18 @@ An example to use this approach can be the rewriting of source code by a LLM.
|
||||
This implementation looks for the last n-gram in history that matches the current n-gram and creates a draft using the m tokens following the matched n-gram. It is the simplest self-speculative approach with minimal overhead.
|
||||
|
||||
```
|
||||
llama-server [...] --spec-type ngram-simple --draft-max 64
|
||||
llama-server [...] --spec-type ngram-simple --spec-draft-n-max 64
|
||||
```
|
||||
|
||||
#### n-gram Map Key (`ngram-map-k`)
|
||||
|
||||
This implementation looks for the current n-gram of size n (called the _key_) in the token history. If the key n-gram is followed by the same m tokens (called the _mgram_) multiple times, it creates a draft using these m tokens. This approach requires a minimum number of occurrences (argument `--spec-ngram-min-hits`, default is 1) before generating drafts.
|
||||
This implementation looks for the current n-gram of size n (called the _key_) in the token history. If the key n-gram is followed by the same m tokens (called the _mgram_) multiple times, it creates a draft using these m tokens. This approach requires a minimum number of occurrences (argument `--spec-ngram-map-k-min-hits`, default is 1) before generating drafts.
|
||||
|
||||
The number of accepted tokens is stored for each used n-gram.
|
||||
|
||||
**Example:**
|
||||
```
|
||||
llama-server [...] --spec-type ngram-map-k --draft-max 64
|
||||
llama-server [...] --spec-type ngram-map-k --spec-draft-n-max 64
|
||||
```
|
||||
|
||||
#### n-gram Map Key-4-Values (`ngram-map-k4v`)
|
||||
@@ -55,7 +55,7 @@ The number of accepted tokens is stored for each used n-gram.
|
||||
|
||||
**Example:** Server options to be used if there are a lot of longer repetitions.
|
||||
```
|
||||
llama-server [...] --spec-type ngram-map-k4v --spec-ngram-size-n 8 --spec-ngram-size-m 8 --spec-ngram-min-hits 2 --draft-max 64
|
||||
llama-server [...] --spec-type ngram-map-k4v --spec-ngram-map-k4v-size-n 8 --spec-ngram-map-k4v-size-m 8 --spec-ngram-map-k4v-min-hits 2 --spec-draft-n-max 64
|
||||
```
|
||||
|
||||
### n-gram Mod (`ngram-mod`)
|
||||
@@ -80,9 +80,9 @@ Currently, a single hash pool is shared across all server slots, so different re
|
||||
# notes:
|
||||
# - small `n` are not recommended
|
||||
# - MoEs require long drafts
|
||||
# - dense models: can reduce `--draft-min` and `--draft-max`
|
||||
# - dense models: can reduce `--spec-ngram-mod-n-min` and `--spec-ngram-mod-n-max`
|
||||
|
||||
llama-server ... --spec-type ngram-mod --spec-ngram-size-n 24 --draft-min 48 --draft-max 64
|
||||
llama-server ... --spec-type ngram-mod --spec-ngram-mod-n-match 24 --spec-ngram-mod-n-min 48 --spec-ngram-mod-n-max 64
|
||||
```
|
||||
|
||||
Applications:
|
||||
@@ -105,21 +105,90 @@ Example Video:
|
||||
|
||||
If a draft model is combined with a draftless decoding method, the draftless decoding method takes precedence.
|
||||
|
||||
### General Speculative Parameters
|
||||
|
||||
```
|
||||
--draft, --draft-n, --draft-max N number of tokens to draft for speculative decoding (default: 16)
|
||||
(env: LLAMA_ARG_DRAFT_MAX)
|
||||
--draft-min, --draft-n-min N minimum number of draft tokens to use for speculative decoding
|
||||
(default: 0)
|
||||
(env: LLAMA_ARG_DRAFT_MIN)
|
||||
[...]
|
||||
--spec-type [none|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod]
|
||||
type of speculative decoding to use when no draft model is provided
|
||||
(default: none)
|
||||
--spec-ngram-size-n N ngram size N for ngram-simple/ngram-map speculative decoding, length
|
||||
of lookup n-gram (default: 12)
|
||||
--spec-ngram-size-m N ngram size M for ngram-simple/ngram-map speculative decoding, length
|
||||
of draft m-gram (default: 48)
|
||||
--spec-ngram-min-hits N minimum hits for ngram-map speculative decoding (default: 1)
|
||||
(env: LLAMA_ARG_SPEC_TYPE)
|
||||
--spec-default use default speculative decoding
|
||||
```
|
||||
|
||||
### Draft Model Parameters
|
||||
|
||||
```
|
||||
--spec-draft-model, -md, --model-draft FNAME
|
||||
draft model for speculative decoding (default: unused)
|
||||
(env: LLAMA_ARG_SPEC_DRAFT_MODEL)
|
||||
--spec-draft-hf, -hfd, -hfrd, --hf-repo-draft <user>/<model>[:quant]
|
||||
HuggingFace repository for the draft model
|
||||
--spec-draft-n-max N
|
||||
number of tokens to draft for speculative decoding (default: 16)
|
||||
(env: LLAMA_ARG_SPEC_DRAFT_N_MAX)
|
||||
--spec-draft-n-min N
|
||||
minimum number of draft tokens to use for speculative decoding (default: 0)
|
||||
(env: LLAMA_ARG_SPEC_DRAFT_N_MIN)
|
||||
--spec-draft-p-split, --draft-p-split P
|
||||
speculative decoding split probability (default: 0.10)
|
||||
(env: LLAMA_ARG_SPEC_DRAFT_P_SPLIT)
|
||||
--spec-draft-p-min, --draft-p-min P
|
||||
minimum speculative decoding probability (greedy) (default: 0.75)
|
||||
(env: LLAMA_ARG_SPEC_DRAFT_P_MIN)
|
||||
--spec-draft-ctx-size, -cd, --ctx-size-draft N
|
||||
size of the prompt context for the draft model (default: 0, 0 = loaded from model)
|
||||
(env: LLAMA_ARG_SPEC_DRAFT_CTX_SIZE)
|
||||
--spec-draft-ngl, -ngld, --gpu-layers-draft, --n-gpu-layers-draft N
|
||||
max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto)
|
||||
(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT)
|
||||
--spec-draft-device, -devd, --device-draft <dev1,dev2,..>
|
||||
comma-separated list of devices to use for offloading the draft model
|
||||
--spec-draft-replace, --spec-replace TARGET DRAFT
|
||||
translate the string in TARGET into DRAFT if the draft model and main model are not compatible
|
||||
```
|
||||
|
||||
### n-gram Mod Parameters
|
||||
|
||||
```
|
||||
--spec-ngram-mod-n-match N
|
||||
ngram-mod lookup length (default: 24)
|
||||
--spec-ngram-mod-n-min N
|
||||
minimum number of ngram tokens to use for ngram-based speculative decoding (default: 48)
|
||||
--spec-ngram-mod-n-max N
|
||||
maximum number of ngram tokens to use for ngram-based speculative decoding (default: 64)
|
||||
```
|
||||
|
||||
### n-gram Simple Parameters
|
||||
|
||||
```
|
||||
--spec-ngram-simple-size-n N
|
||||
ngram size N for ngram-simple speculative decoding, length of lookup n-gram (default: 12)
|
||||
--spec-ngram-simple-size-m N
|
||||
ngram size M for ngram-simple speculative decoding, length of draft m-gram (default: 48)
|
||||
--spec-ngram-simple-min-hits N
|
||||
minimum hits for ngram-simple speculative decoding (default: 1)
|
||||
```
|
||||
|
||||
### n-gram Map Key Parameters
|
||||
|
||||
```
|
||||
--spec-ngram-map-k-size-n N
|
||||
ngram size N for ngram-map-k speculative decoding, length of lookup n-gram (default: 12)
|
||||
--spec-ngram-map-k-size-m N
|
||||
ngram size M for ngram-map-k speculative decoding, length of draft m-gram (default: 48)
|
||||
--spec-ngram-map-k-min-hits N
|
||||
minimum hits for ngram-map-k speculative decoding (default: 1)
|
||||
```
|
||||
|
||||
### n-gram Map Key-4-Values Parameters
|
||||
|
||||
```
|
||||
--spec-ngram-map-k4v-size-n N
|
||||
ngram size N for ngram-map-k4v speculative decoding, length of lookup n-gram (default: 12)
|
||||
--spec-ngram-map-k4v-size-m N
|
||||
ngram size M for ngram-map-k4v speculative decoding, length of draft m-gram (default: 48)
|
||||
--spec-ngram-map-k4v-min-hits N
|
||||
minimum hits for ngram-map-k4v speculative decoding (default: 1)
|
||||
```
|
||||
|
||||
### `--spec-type TYPE`
|
||||
@@ -140,21 +209,40 @@ Specifies a type of speculative decoding without draft model.
|
||||
./llama-server [...] --spec-type ngram-simple
|
||||
```
|
||||
|
||||
### `--spec-ngram-size-n N`
|
||||
### `--spec-ngram-*-size-n N`
|
||||
|
||||
Sets the size N of the lookup n-gram for n-gram map based speculative decoding.
|
||||
The n-gram size N determines how many tokens in a row to look back when searching for matching patterns.
|
||||
|
||||
### `--spec-ngram-size-m M`
|
||||
Each n-gram implementation has its own parameter:
|
||||
|
||||
- `--spec-ngram-simple-size-n` for `ngram-simple`
|
||||
- `--spec-ngram-map-k-size-n` for `ngram-map-k`
|
||||
- `--spec-ngram-map-k4v-size-n` for `ngram-map-k4v`
|
||||
- `--spec-ngram-mod-n-match` for `ngram-mod`
|
||||
|
||||
### `--spec-ngram-*-size-m M`
|
||||
|
||||
Sets the size M of the draft m-gram for n-gram map based speculative decoding.
|
||||
The m-gram size determines how many tokens to draft when a match is found.
|
||||
Larger values can provide more speedup but may reduce acceptance rate.
|
||||
|
||||
### `--spec-ngram-min-hits H`
|
||||
Each n-gram implementation has its own parameter:
|
||||
|
||||
- `--spec-ngram-simple-size-m` for `ngram-simple`
|
||||
- `--spec-ngram-map-k-size-m` for `ngram-map-k`
|
||||
- `--spec-ngram-map-k4v-size-m` for `ngram-map-k4v`
|
||||
|
||||
### `--spec-ngram-*-min-hits H`
|
||||
|
||||
This option defines how often a key has to appear in the token history to be used as a draft (default is 1).
|
||||
|
||||
Each n-gram implementation has its own parameter:
|
||||
|
||||
- `--spec-ngram-simple-min-hits` for `ngram-simple`
|
||||
- `--spec-ngram-map-k-min-hits` for `ngram-map-k`
|
||||
- `--spec-ngram-map-k4v-min-hits` for `ngram-map-k4v`
|
||||
|
||||
## Statistics
|
||||
Each speculative decoding implementation prints statistics.
|
||||
|
||||
@@ -180,4 +268,3 @@ statistics ngram_map_k: #calls(b,g,a) = 6 1690 26, #gen drafts = 26, #acc drafts
|
||||
- `#gen tokens`: number of tokens generated by this implementation (including rejected tokens)
|
||||
- `#acc tokens`: number of tokens accepted by the main model
|
||||
- `dur(b,g,a)`: durations of begin (new prompt), generation and accumulation (process acceptance).
|
||||
|
||||
|
||||
@@ -202,10 +202,14 @@ static bool run(llama_context * ctx, const common_params & params) {
|
||||
print_tokenized_prompt(ctx, tokens, params.prompt);
|
||||
|
||||
if (params.save_logits) {
|
||||
output_data output {ctx, model, params};
|
||||
std::filesystem::path model_path{params.model.path};
|
||||
std::string model_name{model_path.stem().string()};
|
||||
save_output_data(output, model_name, params.logits_output_dir);
|
||||
try {
|
||||
output_data output {ctx, model, params};
|
||||
std::filesystem::path model_path{params.model.path};
|
||||
std::string model_name{model_path.stem().string()};
|
||||
save_output_data(output, model_name, params.logits_output_dir);
|
||||
} catch (const std::exception & e) {
|
||||
LOG_ERR("%s : error saving logits: %s\n", __func__, e.what());
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
@@ -223,7 +227,7 @@ int main(int argc, char ** argv) {
|
||||
llama_backend_init();
|
||||
llama_numa_init(params.numa);
|
||||
|
||||
std::optional<base_callback_data> cb_data;
|
||||
std::optional<common_debug_cb_user_data> cb_data;
|
||||
if (!params.save_logits) {
|
||||
cb_data.emplace(params, params.tensor_filter);
|
||||
}
|
||||
|
||||
@@ -1,5 +1,10 @@
|
||||
set(TARGET llama-diffusion)
|
||||
add_library(${TARGET} STATIC diffusion.cpp diffusion.h)
|
||||
target_link_libraries(${TARGET} PUBLIC llama llama-common ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PUBLIC cxx_std_17)
|
||||
|
||||
set(TARGET llama-diffusion-cli)
|
||||
add_executable(${TARGET} diffusion-cli.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE llama llama-common ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE llama-diffusion llama llama-common ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
||||
@@ -12,11 +12,11 @@ The diffusion CLI supports various parameters to control the generation process:
|
||||
### Core Diffusion Parameters
|
||||
- `--diffusion-steps`: Number of diffusion steps (default: 256)
|
||||
- `--diffusion-algorithm`: Algorithm for token selection
|
||||
- `0`: ORIGIN - Token will be generated in a purely random order from https://arxiv.org/abs/2107.03006.
|
||||
- `1`: ENTROPY_BASED - Entropy-based selection
|
||||
- `2`: MARGIN_BASED - Margin-based selection
|
||||
- `3`: RANDOM - Random selection
|
||||
- `4`: CONFIDENCE_BASED - Confidence-based selection (default)
|
||||
- `0`: DIFFUSION_ALGORITHM_ORIGIN - Tokens are generated in a purely random order, as described in https://arxiv.org/abs/2107.03006.
|
||||
- `1`: DIFFUSION_ALGORITHM_ENTROPY_BASED - Entropy-based selection
|
||||
- `2`: DIFFUSION_ALGORITHM_MARGIN_BASED - Margin-based selection
|
||||
- `3`: DIFFUSION_ALGORITHM_RANDOM - Random selection
|
||||
- `4`: DIFFUSION_ALGORITHM_CONFIDENCE_BASED - Confidence-based selection (default)
|
||||
- More documentation here https://github.com/DreamLM/Dream
|
||||
- `--diffusion-visual`: Enable live visualization during generation
|
||||
|
||||
|
||||
@@ -1,127 +1,23 @@
|
||||
#include "arg.h"
|
||||
#include "chat.h"
|
||||
#include "common.h"
|
||||
#include "diffusion.h"
|
||||
#include "llama.h"
|
||||
#include "log.h"
|
||||
|
||||
#include <limits.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <clocale>
|
||||
#include <cmath>
|
||||
#include <cstring>
|
||||
#include <limits>
|
||||
#include <random>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
enum diffusion_algorithm { ORIGIN = 0, ENTROPY_BASED = 1, MARGIN_BASED = 2, RANDOM = 3, CONFIDENCE_BASED = 4 };
|
||||
|
||||
// Unified transfer scheduling methods
|
||||
enum transfer_schedule {
|
||||
TIMESTEP_BASED = 0, // Dream-style: (1.0 - s/t) * remaining
|
||||
BLOCK_BASED = 1, // LLaDA-style: process in blocks with get_num_transfer_tokens
|
||||
};
|
||||
|
||||
typedef bool (*diffusion_step_callback_t)(int32_t step,
|
||||
int32_t total_steps,
|
||||
const llama_token * tokens,
|
||||
int32_t n_tokens,
|
||||
void * user_data);
|
||||
|
||||
struct diffusion_params {
|
||||
int32_t steps = 0;
|
||||
float temperature = 0;
|
||||
llama_token mask_token_id = LLAMA_TOKEN_NULL;
|
||||
diffusion_step_callback_t step_callback = nullptr;
|
||||
void * step_callback_user_data = nullptr;
|
||||
int32_t seed = 0;
|
||||
bool visual_mode = false;
|
||||
bool shift_logits = false; // Shift logits by -1 after decode
|
||||
|
||||
float top_p = 0.;
|
||||
int32_t top_k = 0.;
|
||||
|
||||
diffusion_algorithm algorithm = CONFIDENCE_BASED;
|
||||
transfer_schedule schedule = TIMESTEP_BASED;
|
||||
|
||||
float cfg_scale = 0.; // Config scale for classifier-free guidance
|
||||
float eps = 0.; // Timestep scheduling
|
||||
int32_t block_length = 0; // Block size (for block scheduling)
|
||||
float alg_temp = 0; // algorithm temperature (0.0 = deterministic)
|
||||
bool add_gumbel_noise = false; // Add gumbel noise to the logits if temp > 0.0
|
||||
|
||||
int32_t max_length = 0; // Maximum sequence length
|
||||
};
|
||||
|
||||
struct callback_data {
|
||||
diffusion_params * diff_params;
|
||||
const llama_vocab * vocab;
|
||||
int32_t n_input;
|
||||
};
|
||||
|
||||
static float calculate_confidence(const llama_token_data_array & cur_p,
|
||||
diffusion_algorithm algorithm,
|
||||
std::mt19937 & rng) {
|
||||
switch (algorithm) {
|
||||
case CONFIDENCE_BASED:
|
||||
return cur_p.data[cur_p.selected].p; // Selected token probability
|
||||
|
||||
case ENTROPY_BASED:
|
||||
{
|
||||
float entropy = 0.0f;
|
||||
const float epsilon = 1e-10f;
|
||||
for (size_t i = 0; i < cur_p.size; i++) {
|
||||
float prob = cur_p.data[i].p;
|
||||
entropy += prob * logf(prob + epsilon);
|
||||
}
|
||||
return -entropy; // Higher entropy = lower confidence
|
||||
}
|
||||
|
||||
case MARGIN_BASED:
|
||||
return (cur_p.size > 1) ? cur_p.data[0].p - cur_p.data[1].p : cur_p.data[0].p;
|
||||
|
||||
case RANDOM:
|
||||
{
|
||||
std::uniform_real_distribution<float> uniform(0.0f, 1.0f);
|
||||
return uniform(rng); // Random confidence
|
||||
}
|
||||
|
||||
case ORIGIN:
|
||||
return cur_p.data[cur_p.selected].p;
|
||||
|
||||
default:
|
||||
return 0.0f;
|
||||
}
|
||||
}
|
||||
|
||||
// Unified transfer count calculation function
|
||||
static int32_t calculate_transfer_count(int32_t step,
|
||||
int32_t total_steps,
|
||||
int32_t remaining_masked,
|
||||
transfer_schedule schedule,
|
||||
float eps,
|
||||
const std::vector<int32_t> & num_transfer_tokens = {}) {
|
||||
switch (schedule) {
|
||||
case TIMESTEP_BASED:
|
||||
{
|
||||
float t = 1.0f - (float) step / total_steps * (1.0f - eps);
|
||||
float s = 1.0f - (float) (step + 1) / total_steps * (1.0f - eps);
|
||||
float p_transfer = (step < total_steps - 1) ? (1.0f - s / t) : 1.0f;
|
||||
return (int32_t) (remaining_masked * p_transfer);
|
||||
}
|
||||
|
||||
case BLOCK_BASED:
|
||||
if (!num_transfer_tokens.empty() && step < (int32_t) num_transfer_tokens.size()) {
|
||||
return num_transfer_tokens[step];
|
||||
}
|
||||
return remaining_masked / (total_steps - step); // Fallback
|
||||
|
||||
default:
|
||||
return remaining_masked / (total_steps - step);
|
||||
}
|
||||
}
|
||||
|
||||
static bool diffusion_step_callback(int32_t step,
|
||||
int32_t total_steps,
|
||||
const llama_token * tokens,
|
||||
@@ -176,341 +72,6 @@ static bool diffusion_step_callback(int32_t step,
|
||||
return true;
|
||||
}
|
||||
|
||||
static void add_gumbel_noise(float * logits, int32_t n_vocab, float temperature, std::mt19937 & rng) {
|
||||
if (temperature == 0.0f) {
|
||||
return;
|
||||
}
|
||||
|
||||
std::uniform_real_distribution<double> uniform(0.0, 1.0);
|
||||
for (int32_t i = 0; i < n_vocab; i++) {
|
||||
double noise = uniform(rng);
|
||||
// Prevent log(0)
|
||||
noise = std::max(noise, 1e-20);
|
||||
double gumbel_noise = std::pow(-std::log(noise), temperature);
|
||||
logits[i] = std::exp(logits[i]) / gumbel_noise;
|
||||
}
|
||||
}
|
||||
|
||||
static std::vector<int32_t> get_num_transfer_tokens(int32_t mask_count, int32_t steps) {
|
||||
std::vector<int32_t> num_transfer_tokens(steps);
|
||||
|
||||
int32_t base = mask_count / steps;
|
||||
int32_t remainder = mask_count % steps;
|
||||
|
||||
for (int32_t i = 0; i < steps; i++) {
|
||||
num_transfer_tokens[i] = base + (i < remainder ? 1 : 0);
|
||||
}
|
||||
|
||||
return num_transfer_tokens;
|
||||
}
|
||||
|
||||
// Generates text with an iterative masked-diffusion (unmasking) loop.
//
// output_tokens is seeded with the n_input prompt tokens followed by params.mask_token_id up to
// params.max_length. Every step decodes the whole sequence with causal attention disabled, samples a
// candidate token for each masked position under consideration and commits a subset of them; the
// subset is chosen according to params.algorithm and sized according to params.schedule.
//
// Parameters:
//   ctx           - context used for decoding (causal attention is switched off on it)
//   input_tokens  - prompt tokens, n_input entries
//   output_tokens - caller-provided buffer with room for params.max_length tokens
//   n_input       - number of prompt tokens; must be > 0 and < params.max_length
//   params        - generation settings
//   n_generated   - set to 0 when the arguments are rejected, otherwise to params.max_length
//                   (also when generation stopped early)
static void diffusion_generate(llama_context *          ctx,
                               const llama_token *      input_tokens,
                               llama_token *            output_tokens,
                               int32_t                  n_input,
                               const diffusion_params & params,
                               int32_t &                n_generated) {
    n_generated = 0;
    if (!ctx || !input_tokens || !output_tokens || n_input <= 0 || params.max_length <= n_input) {
        return;
    }

    const llama_model * model = llama_get_model(ctx);

    // Initialize with input and pad with mask tokens
    std::copy(input_tokens, input_tokens + n_input, output_tokens);
    std::fill(output_tokens + n_input, output_tokens + params.max_length, params.mask_token_id);

    std::mt19937 rng(params.seed);

    llama_set_causal_attn(ctx, false);

    const int32_t n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(model));

    std::vector<llama_token_data> candidates(n_vocab);
    std::vector<llama_token_data> conf_candidates;
    conf_candidates.reserve(params.max_length);
    std::vector<int32_t> mask_positions;
    mask_positions.reserve(params.max_length);

    // Setup sampler chain
    struct llama_sampler * sampler = llama_sampler_chain_init(llama_sampler_chain_default_params());
    if (params.top_k > 0) {
        llama_sampler_chain_add(sampler, llama_sampler_init_top_k(params.top_k));
    }
    if (params.top_p < 1.0f) {
        llama_sampler_chain_add(sampler, llama_sampler_init_top_p(params.top_p, 1));
    }
    if (params.temperature > 0.0f) {
        llama_sampler_chain_add(sampler, llama_sampler_init_temp(params.temperature));
    }
    llama_sampler_chain_add(sampler, llama_sampler_init_dist(params.seed));

    // Separate sampler used to pick positions from confidences when alg_temp > 0
    struct llama_sampler * dist_sampler = llama_sampler_init_dist(params.seed);

    llama_batch batch = llama_batch_init(params.max_length, 0, 1);
    batch.n_tokens    = params.max_length;

    // Pre-allocate buffers for CFG if needed.
    // Computed in size_t: n_vocab * max_length can exceed the int32_t range.
    const size_t             logits_size = (size_t) n_vocab * (size_t) params.max_length;
    std::vector<float>       cond_logits_buffer;
    std::vector<llama_token> un_x_buffer;
    if (params.cfg_scale > 0.0f) {
        cond_logits_buffer.resize(logits_size);
        un_x_buffer.resize(params.max_length);
    }

    // For block-based processing
    const bool           block_based = params.schedule == BLOCK_BASED;
    std::vector<int32_t> num_transfer_tokens;
    int32_t              num_blocks      = 1;
    int32_t              steps_per_block = params.steps;

    if (block_based) {
        // Checked first so the modulo/division below cannot divide by zero
        GGML_ASSERT(params.block_length > 0);
        GGML_ASSERT(params.max_length % params.block_length == 0);
        num_blocks = params.max_length / params.block_length;
        GGML_ASSERT(params.steps % num_blocks == 0);
        steps_per_block = params.steps / num_blocks;
    }

    int64_t       total_sampling_time = 0;
    const int64_t time_start          = ggml_time_us();

    // Set on callback abort or decode failure: stops all remaining blocks, not just the current one
    bool aborted = false;

    for (int32_t block_num = 0; block_num < num_blocks && !aborted; block_num++) {
        const int32_t block_start = block_based ? n_input + block_num * params.block_length : 0;
        const int32_t block_end =
            block_based ? std::min(n_input + (block_num + 1) * params.block_length, params.max_length) :
                          params.max_length;

        // Count masked tokens in current block for block-based processing
        if (block_based && steps_per_block > 0) {
            int32_t block_mask_count = 0;
            for (int32_t i = block_start; i < block_end; i++) {
                if (output_tokens[i] == params.mask_token_id) {
                    block_mask_count++;
                }
            }
            num_transfer_tokens = get_num_transfer_tokens(block_mask_count, steps_per_block);
        }

        for (int32_t step = 0; step < steps_per_block; step++) {
            const int32_t global_step = block_num * steps_per_block + step;

            if (params.step_callback &&
                !params.step_callback(
                    global_step, params.steps, output_tokens, params.max_length, params.step_callback_user_data)) {
                aborted = true;
                break;
            }

            // Setup batch: the full sequence is decoded every step and logits are requested everywhere
            for (int32_t i = 0; i < params.max_length; i++) {
                batch.token[i]     = output_tokens[i];
                batch.pos[i]       = i;
                batch.n_seq_id[i]  = 1;
                batch.seq_id[i][0] = 0;
                batch.logits[i]    = 1;
            }

            float * logits = nullptr;

            if (params.cfg_scale > 0.0f) {
                int ret = llama_decode(ctx, batch);
                if (ret != 0) {
                    LOG_ERR("Failed to generate conditional");
                    aborted = true;
                    break;
                }
                const float * cond_logits_ptr = llama_get_logits(ctx);
                if (!cond_logits_ptr) {
                    LOG_ERR("%s: failed to get logits at step %d\n", __func__, global_step);
                    aborted = true;
                    break;
                }
                std::memcpy(cond_logits_buffer.data(), cond_logits_ptr, logits_size * sizeof(float));

                // Unconditional generation (mask input)
                std::copy(output_tokens, output_tokens + params.max_length, un_x_buffer.begin());
                std::fill(un_x_buffer.begin(), un_x_buffer.begin() + n_input, params.mask_token_id);
                std::copy(un_x_buffer.begin(), un_x_buffer.end(), batch.token);

                ret = llama_decode(ctx, batch);
                if (ret != 0) {
                    LOG_ERR("Failed to generate unconditional");
                    aborted = true;
                    break;
                }
                const float * uncond_logits = llama_get_logits(ctx);
                if (!uncond_logits) {
                    LOG_ERR("%s: failed to get logits at step %d\n", __func__, global_step);
                    aborted = true;
                    break;
                }

                // Apply CFG: uncond + (scale + 1) * (cond - uncond)
                for (size_t i = 0; i < logits_size; i++) {
                    cond_logits_buffer[i] =
                        uncond_logits[i] + (params.cfg_scale + 1.0f) * (cond_logits_buffer[i] - uncond_logits[i]);
                }
                logits = cond_logits_buffer.data();
            } else {
                const int ret = llama_decode(ctx, batch);
                if (ret != 0) {
                    LOG_ERR("%s: failed to decode at step %d, ret = %d\n", __func__, global_step, ret);
                    aborted = true;
                    break;
                }
                logits = llama_get_logits(ctx);
            }

            if (!logits) {
                LOG_ERR("%s: failed to get logits at step %d\n", __func__, global_step);
                aborted = true;
                break;
            }

            // Logits row for a position; with shift_logits the row of the previous position is used
            // (position 0 keeps row 0). Offsets are computed in size_t to avoid int32_t overflow.
            auto get_logits_for_pos = [&](int32_t pos) -> const float * {
                const size_t row = (params.shift_logits && pos > 0) ? (size_t) pos - 1 : (size_t) pos;
                return logits + row * (size_t) n_vocab;
            };

            // Fills `candidates` from the logits of `pos` and wraps them as an unsorted candidate array
            auto load_candidates = [&](int32_t pos) -> llama_token_data_array {
                const float * pos_logits = get_logits_for_pos(pos);
                for (int32_t token_id = 0; token_id < n_vocab; token_id++) {
                    candidates[token_id] = llama_token_data{ token_id, pos_logits[token_id], 0.0f };
                }
                return llama_token_data_array{ candidates.data(), candidates.size(), -1, false };
            };

            const int64_t time_start_sampling = ggml_time_us();

            mask_positions.clear();
            for (int32_t i = 0; i < params.max_length; i++) {
                if (output_tokens[i] != params.mask_token_id) {
                    continue;
                }
                // For block-based, only consider current block
                if (!block_based || (i >= block_start && i < block_end)) {
                    mask_positions.push_back(i);
                }
            }

            if (mask_positions.empty()) {
                // Nothing left to unmask here; only this block is finished
                break;
            }

            if (params.add_gumbel_noise && params.temperature > 0.0f) {
                // NOTE(review): only the first n_vocab logits (row 0) are perturbed by this call —
                // confirm whether noise was meant to be applied to every position.
                add_gumbel_noise(logits, n_vocab, params.temperature, rng);
            }

            const int32_t n_masked       = (int32_t) mask_positions.size();
            const int32_t transfer_count = calculate_transfer_count(
                step, steps_per_block, n_masked, params.schedule, params.eps, num_transfer_tokens);

            if (params.algorithm == ORIGIN) {
                // Each masked position is unmasked independently with probability transfer_count / n_masked
                const float p_transfer = (float) transfer_count / n_masked;

                for (int32_t pos : mask_positions) {
                    if (std::uniform_real_distribution<float>(0.0f, 1.0f)(rng) < p_transfer) {
                        llama_token_data_array cur_p = load_candidates(pos);
                        llama_sampler_apply(sampler, &cur_p);
                        output_tokens[pos] = cur_p.data[cur_p.selected].id;
                    }
                }
            } else {
                // Sample a token for every masked position and score it; (confidence, index into mask_positions)
                std::vector<std::pair<float, int32_t>> confidences;
                confidences.reserve(mask_positions.size());
                std::vector<llama_token> sampled_tokens(mask_positions.size());

                for (size_t i = 0; i < mask_positions.size(); i++) {
                    llama_token_data_array cur_p = load_candidates(mask_positions[i]);
                    llama_sampler_apply(sampler, &cur_p);

                    sampled_tokens[i] = cur_p.data[cur_p.selected].id;
                    confidences.emplace_back(calculate_confidence(cur_p, params.algorithm, rng), (int32_t) i);
                }

                const int32_t n_transfer = std::min(transfer_count, (int32_t) confidences.size());

                if (n_transfer > 0) {
                    if (params.alg_temp == 0.0f) {
                        // Deterministic: commit the n_transfer most confident positions (ties: lower index first)
                        std::partial_sort(confidences.begin(),
                                          confidences.begin() + n_transfer,
                                          confidences.end(),
                                          [](const std::pair<float, int32_t> & a, const std::pair<float, int32_t> & b) {
                                              if (a.first != b.first) {
                                                  return a.first > b.first;
                                              }
                                              return a.second < b.second;
                                          });

                        for (int32_t i = 0; i < n_transfer; i++) {
                            const int32_t mask_idx                = confidences[i].second;
                            output_tokens[mask_positions[mask_idx]] = sampled_tokens[mask_idx];
                        }
                    } else {
                        // Stochastic: draw n_transfer distinct positions, confidences / alg_temp act as logits
                        conf_candidates.clear();
                        for (size_t i = 0; i < confidences.size(); i++) {
                            const float conf_logit = confidences[i].first / params.alg_temp;
                            conf_candidates.emplace_back(llama_token_data{ (int32_t) i, conf_logit, 0.0f });
                        }

                        llama_token_data_array conf_array = {
                            conf_candidates.data(),
                            conf_candidates.size(),
                            -1,
                            false,
                        };

                        for (int32_t i = 0; i < n_transfer; i++) {
                            llama_sampler_apply(dist_sampler, &conf_array);
                            if (conf_array.selected < 0 || (size_t) conf_array.selected >= conf_array.size) {
                                break;
                            }

                            // Map back through .id: stays correct even if the sampler reordered the array
                            const size_t  slot     = (size_t) conf_array.selected;
                            const int32_t mask_idx = conf_array.data[slot].id;
                            output_tokens[mask_positions[mask_idx]] = sampled_tokens[mask_idx];

                            // Remove the drawn candidate so it cannot be drawn again; zeroing .p is not
                            // enough when the sampler recomputes probabilities from the logits.
                            conf_array.data[slot] = conf_array.data[conf_array.size - 1];
                            conf_array.size--;
                            conf_array.sorted   = false;
                            conf_array.selected = -1;
                        }
                    }
                }
            }

            total_sampling_time += ggml_time_us() - time_start_sampling;
        }
    }

    const int64_t total_time = ggml_time_us() - time_start;

    LOG_INF("\ntotal time: %0.2fms, time per step: %0.2fms, sampling time per step: %0.2fms\n",
            total_time / 1000.0,
            total_time / 1000.0 / params.steps,
            total_sampling_time / 1000.0 / params.steps);

    llama_batch_free(batch);
    llama_sampler_free(sampler);
    llama_sampler_free(dist_sampler);

    n_generated = params.max_length;
}
|
||||
|
||||
static std::string format_input_text(const std::string & prompt, const std::string & system_prompt, bool use_chat_template, llama_model * model) {
|
||||
if (!use_chat_template) {
|
||||
return prompt;
|
||||
@@ -631,10 +192,10 @@ int main(int argc, char ** argv) {
|
||||
GGML_ASSERT((params.diffusion.eps == 0) ^ (params.diffusion.block_length == 0));
|
||||
|
||||
if (params.diffusion.eps) {
|
||||
diff_params.schedule = TIMESTEP_BASED;
|
||||
diff_params.schedule = DIFFUSION_TRANSFER_SCHEDULE_TIMESTEP_BASED;
|
||||
diff_params.eps = params.diffusion.eps;
|
||||
} else if (params.diffusion.block_length) {
|
||||
diff_params.schedule = BLOCK_BASED;
|
||||
diff_params.schedule = DIFFUSION_TRANSFER_SCHEDULE_BLOCK_BASED;
|
||||
diff_params.block_length = params.diffusion.block_length;
|
||||
}
|
||||
|
||||
@@ -653,8 +214,17 @@ int main(int argc, char ** argv) {
|
||||
callback_data cb_data = { &diff_params, vocab, n_input };
|
||||
diff_params.step_callback_user_data = &cb_data;
|
||||
|
||||
const char * alg_names[] = { "ORIGIN", "ENTROPY_BASED", "MARGIN_BASED", "RANDOM", "CONFIDENCE_BASED" };
|
||||
const char * sched_names[] = { "TIMESTEP_BASED", "BLOCK_BASED" };
|
||||
const char * alg_names[] = {
|
||||
"DIFFUSION_ALGORITHM_ORIGIN",
|
||||
"DIFFUSION_ALGORITHM_ENTROPY_BASED",
|
||||
"DIFFUSION_ALGORITHM_MARGIN_BASED",
|
||||
"DIFFUSION_ALGORITHM_RANDOM",
|
||||
"DIFFUSION_ALGORITHM_CONFIDENCE_BASED",
|
||||
};
|
||||
const char * sched_names[] = {
|
||||
"DIFFUSION_TRANSFER_SCHEDULE_TIMESTEP_BASED",
|
||||
"DIFFUSION_TRANSFER_SCHEDULE_BLOCK_BASED",
|
||||
};
|
||||
const char * alg_name =
|
||||
(diff_params.algorithm >= 0 && diff_params.algorithm <= 4) ? alg_names[diff_params.algorithm] : "UNKNOWN";
|
||||
const char * sched_name =
|
||||
@@ -666,11 +236,11 @@ int main(int argc, char ** argv) {
|
||||
LOG_INF("diffusion_params: - %-25s enum = %d (%s)\n", "algorithm", diff_params.algorithm, alg_name);
|
||||
LOG_INF("diffusion_params: - %-25s enum = %d (%s)\n", "schedule", diff_params.schedule, sched_name);
|
||||
LOG_INF("diffusion_params: - %-25s f32 = %.3f\n", "temperature", diff_params.temperature);
|
||||
if (diff_params.schedule == TIMESTEP_BASED) {
|
||||
if (diff_params.schedule == DIFFUSION_TRANSFER_SCHEDULE_TIMESTEP_BASED) {
|
||||
LOG_INF("diffusion_params: - %-25s f32 = %.6f\n", "eps", diff_params.eps);
|
||||
LOG_INF("diffusion_params: - %-25s f32 = %.3f\n", "alg_temp", diff_params.alg_temp);
|
||||
}
|
||||
if (diff_params.schedule == BLOCK_BASED) {
|
||||
if (diff_params.schedule == DIFFUSION_TRANSFER_SCHEDULE_BLOCK_BASED) {
|
||||
LOG_INF("diffusion_params: - %-25s u32 = %d\n", "block_length", diff_params.block_length);
|
||||
LOG_INF("diffusion_params: - %-25s f32 = %.3f\n", "cfg_scale", diff_params.cfg_scale);
|
||||
}
|
||||
|
||||
408
examples/diffusion/diffusion.cpp
Normal file
408
examples/diffusion/diffusion.cpp
Normal file
@@ -0,0 +1,408 @@
|
||||
#include "diffusion.h"
|
||||
|
||||
#include "log.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstddef>
|
||||
#include <cmath>
|
||||
#include <cstring>
|
||||
#include <random>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
static float calculate_confidence(const llama_token_data_array & cur_p,
|
||||
diffusion_algorithm algorithm,
|
||||
std::mt19937 & rng) {
|
||||
switch (algorithm) {
|
||||
case DIFFUSION_ALGORITHM_CONFIDENCE_BASED:
|
||||
return cur_p.data[cur_p.selected].p; // Selected token probability
|
||||
|
||||
case DIFFUSION_ALGORITHM_ENTROPY_BASED:
|
||||
{
|
||||
float entropy = 0.0f;
|
||||
const float epsilon = 1e-10f;
|
||||
for (size_t i = 0; i < cur_p.size; i++) {
|
||||
float prob = cur_p.data[i].p;
|
||||
entropy += prob * logf(prob + epsilon);
|
||||
}
|
||||
return -entropy; // Higher entropy = lower confidence
|
||||
}
|
||||
|
||||
case DIFFUSION_ALGORITHM_MARGIN_BASED:
|
||||
return (cur_p.size > 1) ? cur_p.data[0].p - cur_p.data[1].p : cur_p.data[0].p;
|
||||
|
||||
case DIFFUSION_ALGORITHM_RANDOM:
|
||||
{
|
||||
std::uniform_real_distribution<float> uniform(0.0f, 1.0f);
|
||||
return uniform(rng); // Random confidence
|
||||
}
|
||||
|
||||
case DIFFUSION_ALGORITHM_ORIGIN:
|
||||
return cur_p.data[cur_p.selected].p;
|
||||
|
||||
default:
|
||||
return 0.0f;
|
||||
}
|
||||
}
|
||||
|
||||
// Unified transfer count calculation function
|
||||
static int32_t calculate_transfer_count(int32_t step,
|
||||
int32_t total_steps,
|
||||
int32_t remaining_masked,
|
||||
diffusion_transfer_schedule schedule,
|
||||
float eps,
|
||||
const std::vector<int32_t> & num_transfer_tokens = {}) {
|
||||
switch (schedule) {
|
||||
case DIFFUSION_TRANSFER_SCHEDULE_TIMESTEP_BASED:
|
||||
{
|
||||
float t = 1.0f - (float) step / total_steps * (1.0f - eps);
|
||||
float s = 1.0f - (float) (step + 1) / total_steps * (1.0f - eps);
|
||||
float p_transfer = (step < total_steps - 1) ? (1.0f - s / t) : 1.0f;
|
||||
return (int32_t) (remaining_masked * p_transfer);
|
||||
}
|
||||
|
||||
case DIFFUSION_TRANSFER_SCHEDULE_BLOCK_BASED:
|
||||
if (!num_transfer_tokens.empty() && step < (int32_t) num_transfer_tokens.size()) {
|
||||
return num_transfer_tokens[step];
|
||||
}
|
||||
return remaining_masked / (total_steps - step); // Fallback
|
||||
|
||||
default:
|
||||
return remaining_masked / (total_steps - step);
|
||||
}
|
||||
}
|
||||
|
||||
// Rescales the first n_vocab entries of `logits` in place: each entry x becomes
// exp(x) / (-log(u))^temperature, with u drawn uniformly from [0, 1) using `rng`.
// Does nothing (and consumes no random numbers) when temperature is exactly 0.
static void add_gumbel_noise(float * logits, int32_t n_vocab, float temperature, std::mt19937 & rng) {
    const bool noise_enabled = temperature != 0.0f;
    if (!noise_enabled) {
        return;
    }

    std::uniform_real_distribution<double> unit_interval(0.0, 1.0);

    int32_t idx = 0;
    while (idx < n_vocab) {
        // Clamp away from zero so the logarithm below stays finite
        const double u     = std::max(unit_interval(rng), 1e-20);
        const double scale = std::pow(-std::log(u), temperature);

        float & value = logits[idx];
        value         = std::exp(value) / scale;
        ++idx;
    }
}
|
||||
|
||||
// Splits mask_count tokens as evenly as possible over `steps` steps.
// Entry i holds the number of tokens to transfer at step i; the first
// (mask_count % steps) entries receive one extra token so the entries sum to mask_count.
// Returns an empty vector when steps <= 0 (nothing to schedule) instead of dividing by zero.
static std::vector<int32_t> get_num_transfer_tokens(int32_t mask_count, int32_t steps) {
    if (steps <= 0) {
        return {};
    }

    std::vector<int32_t> num_transfer_tokens(steps);

    const int32_t base      = mask_count / steps;
    const int32_t remainder = mask_count % steps;

    for (int32_t i = 0; i < steps; i++) {
        num_transfer_tokens[i] = base + (i < remainder ? 1 : 0);
    }

    return num_transfer_tokens;
}
|
||||
|
||||
// Generates text with an iterative masked-diffusion (unmasking) loop.
//
// output_tokens is seeded with the n_input prompt tokens followed by params.mask_token_id up to
// params.max_length. Every step decodes the whole sequence with causal attention disabled, samples a
// candidate token for each masked position under consideration and commits a subset of them; the
// subset is chosen according to params.algorithm and sized according to params.schedule.
//
// Parameters:
//   ctx           - context used for decoding (causal attention is switched off on it)
//   input_tokens  - prompt tokens, n_input entries
//   output_tokens - caller-provided buffer with room for params.max_length tokens
//   n_input       - number of prompt tokens; must be > 0 and < params.max_length
//   params        - generation settings
//   n_generated   - set to 0 when the arguments are rejected, otherwise to params.max_length
//                   (also when generation stopped early)
void diffusion_generate(llama_context *          ctx,
                        const llama_token *      input_tokens,
                        llama_token *            output_tokens,
                        int32_t                  n_input,
                        const diffusion_params & params,
                        int32_t &                n_generated) {
    n_generated = 0;
    if (!ctx || !input_tokens || !output_tokens || n_input <= 0 || params.max_length <= n_input) {
        return;
    }

    const llama_model * model = llama_get_model(ctx);

    // Initialize with input and pad with mask tokens
    std::copy(input_tokens, input_tokens + n_input, output_tokens);
    std::fill(output_tokens + n_input, output_tokens + params.max_length, params.mask_token_id);

    std::mt19937 rng(params.seed);

    llama_set_causal_attn(ctx, false);

    const int32_t n_vocab = llama_vocab_n_tokens(llama_model_get_vocab(model));

    std::vector<llama_token_data> candidates(n_vocab);
    std::vector<llama_token_data> conf_candidates;
    conf_candidates.reserve(params.max_length);
    std::vector<int32_t> mask_positions;
    mask_positions.reserve(params.max_length);

    // Setup sampler chain
    struct llama_sampler * sampler = llama_sampler_chain_init(llama_sampler_chain_default_params());
    if (params.top_k > 0) {
        llama_sampler_chain_add(sampler, llama_sampler_init_top_k(params.top_k));
    }
    if (params.top_p < 1.0f) {
        llama_sampler_chain_add(sampler, llama_sampler_init_top_p(params.top_p, 1));
    }
    if (params.temperature > 0.0f) {
        llama_sampler_chain_add(sampler, llama_sampler_init_temp(params.temperature));
    }
    llama_sampler_chain_add(sampler, llama_sampler_init_dist(params.seed));

    // Separate sampler used to pick positions from confidences when alg_temp > 0
    struct llama_sampler * dist_sampler = llama_sampler_init_dist(params.seed);

    llama_batch batch = llama_batch_init(params.max_length, 0, 1);
    batch.n_tokens    = params.max_length;

    // Pre-allocate buffers for CFG if needed.
    // Computed in size_t: n_vocab * max_length can exceed the int32_t range.
    const size_t             logits_size = (size_t) n_vocab * (size_t) params.max_length;
    std::vector<float>       cond_logits_buffer;
    std::vector<llama_token> un_x_buffer;
    if (params.cfg_scale > 0.0f) {
        cond_logits_buffer.resize(logits_size);
        un_x_buffer.resize(params.max_length);
    }

    // For block-based processing
    const bool           block_based = params.schedule == DIFFUSION_TRANSFER_SCHEDULE_BLOCK_BASED;
    std::vector<int32_t> num_transfer_tokens;
    int32_t              num_blocks      = 1;
    int32_t              steps_per_block = params.steps;

    if (block_based) {
        // Checked first so the modulo/division below cannot divide by zero
        GGML_ASSERT(params.block_length > 0);
        GGML_ASSERT(params.max_length % params.block_length == 0);
        num_blocks = params.max_length / params.block_length;
        GGML_ASSERT(params.steps % num_blocks == 0);
        steps_per_block = params.steps / num_blocks;
    }

    int64_t       total_sampling_time = 0;
    const int64_t time_start          = ggml_time_us();

    // Set on callback abort or decode failure: stops all remaining blocks, not just the current one
    bool aborted = false;

    for (int32_t block_num = 0; block_num < num_blocks && !aborted; block_num++) {
        const int32_t block_start = block_based ? n_input + block_num * params.block_length : 0;
        const int32_t block_end =
            block_based ? std::min(n_input + (block_num + 1) * params.block_length, params.max_length) :
                          params.max_length;

        // Count masked tokens in current block for block-based processing
        if (block_based && steps_per_block > 0) {
            int32_t block_mask_count = 0;
            for (int32_t i = block_start; i < block_end; i++) {
                if (output_tokens[i] == params.mask_token_id) {
                    block_mask_count++;
                }
            }
            num_transfer_tokens = get_num_transfer_tokens(block_mask_count, steps_per_block);
        }

        for (int32_t step = 0; step < steps_per_block; step++) {
            const int32_t global_step = block_num * steps_per_block + step;

            if (params.step_callback &&
                !params.step_callback(
                    global_step, params.steps, output_tokens, params.max_length, params.step_callback_user_data)) {
                aborted = true;
                break;
            }

            // Setup batch: the full sequence is decoded every step and logits are requested everywhere
            for (int32_t i = 0; i < params.max_length; i++) {
                batch.token[i]     = output_tokens[i];
                batch.pos[i]       = i;
                batch.n_seq_id[i]  = 1;
                batch.seq_id[i][0] = 0;
                batch.logits[i]    = 1;
            }

            float * logits = nullptr;

            if (params.cfg_scale > 0.0f) {
                int ret = llama_decode(ctx, batch);
                if (ret != 0) {
                    LOG_ERR("Failed to generate conditional");
                    aborted = true;
                    break;
                }
                const float * cond_logits_ptr = llama_get_logits(ctx);
                if (!cond_logits_ptr) {
                    LOG_ERR("%s: failed to get logits at step %d\n", __func__, global_step);
                    aborted = true;
                    break;
                }
                std::memcpy(cond_logits_buffer.data(), cond_logits_ptr, logits_size * sizeof(float));

                // Unconditional generation (mask input)
                std::copy(output_tokens, output_tokens + params.max_length, un_x_buffer.begin());
                std::fill(un_x_buffer.begin(), un_x_buffer.begin() + n_input, params.mask_token_id);
                std::copy(un_x_buffer.begin(), un_x_buffer.end(), batch.token);

                ret = llama_decode(ctx, batch);
                if (ret != 0) {
                    LOG_ERR("Failed to generate unconditional");
                    aborted = true;
                    break;
                }
                const float * uncond_logits = llama_get_logits(ctx);
                if (!uncond_logits) {
                    LOG_ERR("%s: failed to get logits at step %d\n", __func__, global_step);
                    aborted = true;
                    break;
                }

                // Apply CFG: uncond + (scale + 1) * (cond - uncond)
                for (size_t i = 0; i < logits_size; i++) {
                    cond_logits_buffer[i] =
                        uncond_logits[i] + (params.cfg_scale + 1.0f) * (cond_logits_buffer[i] - uncond_logits[i]);
                }
                logits = cond_logits_buffer.data();
            } else {
                const int ret = llama_decode(ctx, batch);
                if (ret != 0) {
                    LOG_ERR("%s: failed to decode at step %d, ret = %d\n", __func__, global_step, ret);
                    aborted = true;
                    break;
                }
                logits = llama_get_logits(ctx);
            }

            if (!logits) {
                LOG_ERR("%s: failed to get logits at step %d\n", __func__, global_step);
                aborted = true;
                break;
            }

            // Logits row for a position; with shift_logits the row of the previous position is used
            // (position 0 keeps row 0). Offsets are computed in size_t to avoid int32_t overflow.
            auto get_logits_for_pos = [&](int32_t pos) -> const float * {
                const size_t row = (params.shift_logits && pos > 0) ? (size_t) pos - 1 : (size_t) pos;
                return logits + row * (size_t) n_vocab;
            };

            // Fills `candidates` from the logits of `pos` and wraps them as an unsorted candidate array
            auto load_candidates = [&](int32_t pos) -> llama_token_data_array {
                const float * pos_logits = get_logits_for_pos(pos);
                for (int32_t token_id = 0; token_id < n_vocab; token_id++) {
                    candidates[token_id] = llama_token_data{ token_id, pos_logits[token_id], 0.0f };
                }
                return llama_token_data_array{ candidates.data(), candidates.size(), -1, false };
            };

            const int64_t time_start_sampling = ggml_time_us();

            mask_positions.clear();
            for (int32_t i = 0; i < params.max_length; i++) {
                if (output_tokens[i] != params.mask_token_id) {
                    continue;
                }
                // For block-based, only consider current block
                if (!block_based || (i >= block_start && i < block_end)) {
                    mask_positions.push_back(i);
                }
            }

            if (mask_positions.empty()) {
                // Nothing left to unmask here; only this block is finished
                break;
            }

            if (params.add_gumbel_noise && params.temperature > 0.0f) {
                // NOTE(review): only the first n_vocab logits (row 0) are perturbed by this call —
                // confirm whether noise was meant to be applied to every position.
                add_gumbel_noise(logits, n_vocab, params.temperature, rng);
            }

            const int32_t n_masked       = (int32_t) mask_positions.size();
            const int32_t transfer_count = calculate_transfer_count(
                step, steps_per_block, n_masked, params.schedule, params.eps, num_transfer_tokens);

            if (params.algorithm == DIFFUSION_ALGORITHM_ORIGIN) {
                // Each masked position is unmasked independently with probability transfer_count / n_masked
                const float p_transfer = (float) transfer_count / n_masked;

                for (int32_t pos : mask_positions) {
                    if (std::uniform_real_distribution<float>(0.0f, 1.0f)(rng) < p_transfer) {
                        llama_token_data_array cur_p = load_candidates(pos);
                        llama_sampler_apply(sampler, &cur_p);
                        output_tokens[pos] = cur_p.data[cur_p.selected].id;
                    }
                }
            } else {
                // Sample a token for every masked position and score it; (confidence, index into mask_positions)
                std::vector<std::pair<float, int32_t>> confidences;
                confidences.reserve(mask_positions.size());
                std::vector<llama_token> sampled_tokens(mask_positions.size());

                for (size_t i = 0; i < mask_positions.size(); i++) {
                    llama_token_data_array cur_p = load_candidates(mask_positions[i]);
                    llama_sampler_apply(sampler, &cur_p);

                    sampled_tokens[i] = cur_p.data[cur_p.selected].id;
                    confidences.emplace_back(calculate_confidence(cur_p, params.algorithm, rng), (int32_t) i);
                }

                const int32_t n_transfer = std::min(transfer_count, (int32_t) confidences.size());

                if (n_transfer > 0) {
                    if (params.alg_temp == 0.0f) {
                        // Deterministic: commit the n_transfer most confident positions (ties: lower index first)
                        std::partial_sort(confidences.begin(),
                                          confidences.begin() + n_transfer,
                                          confidences.end(),
                                          [](const std::pair<float, int32_t> & a, const std::pair<float, int32_t> & b) {
                                              if (a.first != b.first) {
                                                  return a.first > b.first;
                                              }
                                              return a.second < b.second;
                                          });

                        for (int32_t i = 0; i < n_transfer; i++) {
                            const int32_t mask_idx                = confidences[i].second;
                            output_tokens[mask_positions[mask_idx]] = sampled_tokens[mask_idx];
                        }
                    } else {
                        // Stochastic: draw n_transfer distinct positions, confidences / alg_temp act as logits
                        conf_candidates.clear();
                        for (size_t i = 0; i < confidences.size(); i++) {
                            const float conf_logit = confidences[i].first / params.alg_temp;
                            conf_candidates.emplace_back(llama_token_data{ (int32_t) i, conf_logit, 0.0f });
                        }

                        llama_token_data_array conf_array = {
                            conf_candidates.data(),
                            conf_candidates.size(),
                            -1,
                            false,
                        };

                        for (int32_t i = 0; i < n_transfer; i++) {
                            llama_sampler_apply(dist_sampler, &conf_array);
                            if (conf_array.selected < 0 || (size_t) conf_array.selected >= conf_array.size) {
                                break;
                            }

                            // Map back through .id: stays correct even if the sampler reordered the array
                            const size_t  slot     = (size_t) conf_array.selected;
                            const int32_t mask_idx = conf_array.data[slot].id;
                            output_tokens[mask_positions[mask_idx]] = sampled_tokens[mask_idx];

                            // Remove the drawn candidate so it cannot be drawn again; zeroing .p is not
                            // enough when the sampler recomputes probabilities from the logits.
                            conf_array.data[slot] = conf_array.data[conf_array.size - 1];
                            conf_array.size--;
                            conf_array.sorted   = false;
                            conf_array.selected = -1;
                        }
                    }
                }
            }

            total_sampling_time += ggml_time_us() - time_start_sampling;
        }
    }

    const int64_t total_time = ggml_time_us() - time_start;

    LOG_INF("\ntotal time: %0.2fms, time per step: %0.2fms, sampling time per step: %0.2fms\n",
            total_time / 1000.0,
            total_time / 1000.0 / params.steps,
            total_sampling_time / 1000.0 / params.steps);

    llama_batch_free(batch);
    llama_sampler_free(sampler);
    llama_sampler_free(dist_sampler);

    n_generated = params.max_length;
}
|
||||
57
examples/diffusion/diffusion.h
Normal file
57
examples/diffusion/diffusion.h
Normal file
@@ -0,0 +1,57 @@
|
||||
#pragma once
|
||||
|
||||
#include "llama.h"
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
// Rule used to decide which masked positions get unmasked at each step.
// The numeric values are explicit because they are logged and used as array indices by callers.
enum diffusion_algorithm {
    DIFFUSION_ALGORITHM_ORIGIN           = 0,  // each masked position is unmasked at random
    DIFFUSION_ALGORITHM_ENTROPY_BASED    = 1,  // ranked by the entropy of the token distribution
    DIFFUSION_ALGORITHM_MARGIN_BASED     = 2,  // ranked by the gap between two candidate probabilities
    DIFFUSION_ALGORITHM_RANDOM           = 3,  // ranked by a uniformly random score
    DIFFUSION_ALGORITHM_CONFIDENCE_BASED = 4,  // ranked by the probability of the sampled token
};
|
||||
|
||||
// Unified transfer scheduling methods: how many masked tokens are committed per step.
enum diffusion_transfer_schedule {
    // Dream-style: (1.0 - s/t) * remaining
    DIFFUSION_TRANSFER_SCHEDULE_TIMESTEP_BASED = 0,
    // LLaDA-style: process in blocks with get_num_transfer_tokens
    DIFFUSION_TRANSFER_SCHEDULE_BLOCK_BASED    = 1,
};
|
||||
|
||||
typedef bool (*diffusion_step_callback_t)(int32_t step,
|
||||
int32_t total_steps,
|
||||
const llama_token * tokens,
|
||||
int32_t n_tokens,
|
||||
void * user_data);
|
||||
|
||||
struct diffusion_params {
|
||||
int32_t steps = 0;
|
||||
float temperature = 0;
|
||||
llama_token mask_token_id = LLAMA_TOKEN_NULL;
|
||||
diffusion_step_callback_t step_callback = nullptr;
|
||||
void * step_callback_user_data = nullptr;
|
||||
int32_t seed = 0;
|
||||
bool visual_mode = false;
|
||||
bool shift_logits = false; // Shift logits by -1 after decode
|
||||
|
||||
float top_p = 0.;
|
||||
int32_t top_k = 0.;
|
||||
|
||||
diffusion_algorithm algorithm = DIFFUSION_ALGORITHM_CONFIDENCE_BASED;
|
||||
diffusion_transfer_schedule schedule = DIFFUSION_TRANSFER_SCHEDULE_TIMESTEP_BASED;
|
||||
|
||||
float cfg_scale = 0.; // Config scale for classifier-free guidance
|
||||
float eps = 0.; // Timestep scheduling
|
||||
int32_t block_length = 0; // Block size (for block scheduling)
|
||||
float alg_temp = 0; // algorithm temperature (0.0 = deterministic)
|
||||
bool add_gumbel_noise = false; // Add gumbel noise to the logits if temp > 0.0
|
||||
|
||||
int32_t max_length = 0; // Maximum sequence length
|
||||
};
|
||||
|
||||
// Runs masked-diffusion generation.
// Copies the n_input prompt tokens into output_tokens, pads with the mask token up to
// params.max_length and iteratively replaces masked positions. n_generated is set to 0 when the
// arguments are rejected and to params.max_length otherwise.
void diffusion_generate(
    llama_context *          ctx,
    const llama_token *      input_tokens,
    llama_token *            output_tokens,
    int32_t                  n_input,
    const diffusion_params & params,
    int32_t &                n_generated);
|
||||
@@ -3,7 +3,6 @@
|
||||
#include "debug.h"
|
||||
#include "log.h"
|
||||
#include "llama.h"
|
||||
#include "llama-cpp.h"
|
||||
|
||||
#include <clocale>
|
||||
#include <string>
|
||||
@@ -38,7 +37,7 @@ static bool run(llama_context * ctx, const common_params & params) {
|
||||
int main(int argc, char ** argv) {
|
||||
std::setlocale(LC_NUMERIC, "C");
|
||||
|
||||
base_callback_data cb_data;
|
||||
common_debug_cb_user_data cb_data;
|
||||
|
||||
common_params params;
|
||||
|
||||
@@ -53,7 +52,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// pass the callback to the backend scheduler
|
||||
// it will be executed for each node during the graph computation
|
||||
params.cb_eval = common_debug_cb_eval<false>;
|
||||
params.cb_eval = common_debug_cb_eval;
|
||||
params.cb_eval_user_data = &cb_data;
|
||||
params.warmup = false;
|
||||
|
||||
|
||||
@@ -73,12 +73,12 @@ static void write_help(std::ostringstream & ss, const md_file & md) {
|
||||
auto ctx_arg = common_params_parser_init(params, md.ex);
|
||||
|
||||
std::vector<common_arg *> common_options;
|
||||
std::vector<common_arg *> sparam_options;
|
||||
std::vector<common_arg *> sampling_options;
|
||||
std::vector<common_arg *> specific_options;
|
||||
for (auto & opt : ctx_arg.options) {
|
||||
// in case multiple LLAMA_EXAMPLE_* are set, we prioritize the LLAMA_EXAMPLE_* matching current example
|
||||
if (opt.is_sparam) {
|
||||
sparam_options.push_back(&opt);
|
||||
if (opt.is_sampling) {
|
||||
sampling_options.push_back(&opt);
|
||||
} else if (opt.in_example(ctx_arg.ex)) {
|
||||
specific_options.push_back(&opt);
|
||||
} else {
|
||||
@@ -93,7 +93,7 @@ static void write_help(std::ostringstream & ss, const md_file & md) {
|
||||
ss << "### Common params\n\n";
|
||||
write_table(ss, common_options);
|
||||
ss << "\n\n### Sampling params\n\n";
|
||||
write_table(ss, sparam_options);
|
||||
write_table(ss, sampling_options);
|
||||
ss << "\n\n### " << md.specific_section_header << "\n\n";
|
||||
write_table(ss, specific_options);
|
||||
|
||||
|
||||
@@ -37,9 +37,9 @@ int main(int argc, char ** argv){
|
||||
|
||||
common_ngram_cache ngram_cache;
|
||||
common_ngram_cache_update(ngram_cache, LLAMA_NGRAM_STATIC, LLAMA_NGRAM_STATIC, inp, inp.size(), true);
|
||||
fprintf(stderr, "%s: hashing done, writing file to %s\n", __func__, params.speculative.lookup_cache_static.c_str());
|
||||
fprintf(stderr, "%s: hashing done, writing file to %s\n", __func__, params.speculative.ngram_cache.lookup_cache_static.c_str());
|
||||
|
||||
common_ngram_cache_save(ngram_cache, params.speculative.lookup_cache_static);
|
||||
common_ngram_cache_save(ngram_cache, params.speculative.ngram_cache.lookup_cache_static);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -24,7 +24,7 @@ int main(int argc, char ** argv){
|
||||
return 1;
|
||||
}
|
||||
|
||||
const int n_draft = params.speculative.n_max;
|
||||
const int n_draft = params.speculative.draft.n_max;
|
||||
|
||||
// init llama.cpp
|
||||
llama_backend_init();
|
||||
@@ -49,18 +49,18 @@ int main(int argc, char ** argv){
|
||||
{
|
||||
const int64_t t_start_draft_us = ggml_time_us();
|
||||
|
||||
if (!params.speculative.lookup_cache_static.empty()) {
|
||||
if (!params.speculative.ngram_cache.lookup_cache_static.empty()) {
|
||||
try {
|
||||
ngram_cache_static = common_ngram_cache_load(params.speculative.lookup_cache_static);
|
||||
ngram_cache_static = common_ngram_cache_load(params.speculative.ngram_cache.lookup_cache_static);
|
||||
} catch (std::ifstream::failure const &) {
|
||||
LOG_ERR("failed to open static lookup cache: %s", params.speculative.lookup_cache_static.c_str());
|
||||
LOG_ERR("failed to open static lookup cache: %s", params.speculative.ngram_cache.lookup_cache_static.c_str());
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
if (!params.speculative.lookup_cache_dynamic.empty()) {
|
||||
if (!params.speculative.ngram_cache.lookup_cache_dynamic.empty()) {
|
||||
try {
|
||||
ngram_cache_dynamic = common_ngram_cache_load(params.speculative.lookup_cache_dynamic);
|
||||
ngram_cache_dynamic = common_ngram_cache_load(params.speculative.ngram_cache.lookup_cache_dynamic);
|
||||
} catch (std::ifstream::failure const &) {} // if the file does not exist it will simply be created at the end of the program
|
||||
}
|
||||
|
||||
|
||||
@@ -25,7 +25,7 @@ int main(int argc, char ** argv){
|
||||
}
|
||||
|
||||
// max. number of additional tokens to draft if match is found
|
||||
const int n_draft = params.speculative.n_max;
|
||||
const int n_draft = params.speculative.draft.n_max;
|
||||
|
||||
// init llama.cpp
|
||||
llama_backend_init();
|
||||
@@ -54,18 +54,18 @@ int main(int argc, char ** argv){
|
||||
const int64_t t_start_draft_us = ggml_time_us();
|
||||
common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, inp.size(), false);
|
||||
|
||||
if (!params.speculative.lookup_cache_static.empty()) {
|
||||
if (!params.speculative.ngram_cache.lookup_cache_static.empty()) {
|
||||
try {
|
||||
ngram_cache_static = common_ngram_cache_load(params.speculative.lookup_cache_static);
|
||||
ngram_cache_static = common_ngram_cache_load(params.speculative.ngram_cache.lookup_cache_static);
|
||||
} catch (std::ifstream::failure const &) {
|
||||
LOG_ERR("failed to open static lookup cache: %s", params.speculative.lookup_cache_static.c_str());
|
||||
LOG_ERR("failed to open static lookup cache: %s", params.speculative.ngram_cache.lookup_cache_static.c_str());
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
if (!params.speculative.lookup_cache_dynamic.empty()) {
|
||||
if (!params.speculative.ngram_cache.lookup_cache_dynamic.empty()) {
|
||||
try {
|
||||
ngram_cache_dynamic = common_ngram_cache_load(params.speculative.lookup_cache_dynamic);
|
||||
ngram_cache_dynamic = common_ngram_cache_load(params.speculative.ngram_cache.lookup_cache_dynamic);
|
||||
} catch (std::ifstream::failure const &) {} // if the file does not exist it will simply be created at the end of the program
|
||||
}
|
||||
|
||||
@@ -213,7 +213,7 @@ int main(int argc, char ** argv){
|
||||
|
||||
// Update dynamic ngram cache with context ngram cache and save it to disk:
|
||||
common_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context);
|
||||
common_ngram_cache_save(ngram_cache_dynamic, params.speculative.lookup_cache_dynamic);
|
||||
common_ngram_cache_save(ngram_cache_dynamic, params.speculative.ngram_cache.lookup_cache_dynamic);
|
||||
|
||||
LOG("\n\n");
|
||||
|
||||
|
||||
@@ -38,8 +38,12 @@ int main(int argc, char ** argv) {
|
||||
std::string result0;
|
||||
std::string result1;
|
||||
std::string result2;
|
||||
std::string result3;
|
||||
|
||||
// init
|
||||
|
||||
ggml_backend_load_all();
|
||||
|
||||
auto llama_init = common_init_from_params(params);
|
||||
|
||||
auto * model = llama_init->model();
|
||||
@@ -213,11 +217,83 @@ int main(int argc, char ** argv) {
|
||||
n_past += 1;
|
||||
}
|
||||
|
||||
// test on-device state save/load
|
||||
auto params_ctx4 = common_context_params_to_llama(params);
|
||||
params_ctx4.n_seq_max = 2;
|
||||
llama_context * ctx4 = llama_init_from_model(model, params_ctx4);
|
||||
|
||||
llama_sampler * smpl4 = llama_sampler_chain_init(sparams);
|
||||
|
||||
llama_sampler_chain_add(smpl4, llama_sampler_init_dist(params.sampling.seed));
|
||||
|
||||
printf("\nsingle seq run: %s", params.prompt.c_str());
|
||||
|
||||
// load state (rng, logits, embedding and kv_cache) from file
|
||||
n_token_count_out = 0;
|
||||
|
||||
if (!llama_state_load_file(ctx4, state_file.data(), unused_sts.data(), unused_sts.size(), &n_token_count_out)) {
|
||||
fprintf(stderr, "\n%s : failed to load state\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s : loaded state with %zu tokens\n", __func__, n_token_count_out);
|
||||
|
||||
// restore state (last tokens)
|
||||
n_past = n_token_count_out;
|
||||
if (!common_replay_last_token(ctx4, tokens.back(), n_past)) {
|
||||
return 1;
|
||||
}
|
||||
++n_past;
|
||||
|
||||
// save seq 0 and load into seq 1
|
||||
{
|
||||
// save kv of seq 0
|
||||
std::vector<uint8_t> seq_store(llama_state_seq_get_size_ext(ctx4, 0, LLAMA_STATE_SEQ_FLAGS_ON_DEVICE));
|
||||
const size_t ncopy = llama_state_seq_get_data_ext(ctx4, seq_store.data(), seq_store.size(), 0, LLAMA_STATE_SEQ_FLAGS_ON_DEVICE);
|
||||
if (ncopy != seq_store.size()) {
|
||||
fprintf(stderr, "\n%s : seq copy data length %zd does not match expected length %zd\n", __func__, ncopy, seq_store.size());
|
||||
return 1;
|
||||
}
|
||||
fprintf(stderr, "%s : seq 0 copied, %zd bytes\n", __func__, ncopy);
|
||||
|
||||
// erase whole kv
|
||||
llama_memory_clear(llama_get_memory(ctx4), true);
|
||||
fprintf(stderr, "%s : kv cache cleared\n", __func__);
|
||||
|
||||
// restore kv into seq 1
|
||||
const size_t nset = llama_state_seq_set_data_ext(ctx4, seq_store.data(), seq_store.size(), 1, LLAMA_STATE_SEQ_FLAGS_ON_DEVICE);
|
||||
if (nset != seq_store.size()) {
|
||||
fprintf(stderr, "\n%s : seq set data length %zd does not match expected length %zd\n", __func__, nset, seq_store.size());
|
||||
return 1;
|
||||
}
|
||||
fprintf(stderr, "%s : seq 1 restored, %zd bytes\n", __func__, nset);
|
||||
}
|
||||
|
||||
// fourth run
|
||||
for (auto i = 0; i < params.n_predict; i++) {
|
||||
auto next_token = llama_sampler_sample(smpl4, ctx4, -1);
|
||||
auto next_token_str = common_token_to_piece(ctx4, next_token);
|
||||
|
||||
printf("%s", next_token_str.c_str());
|
||||
result3 += next_token_str;
|
||||
|
||||
common_batch_clear(batch);
|
||||
common_batch_add(batch, next_token, n_past, {1}, true);
|
||||
|
||||
if (llama_decode(ctx4, batch)) {
|
||||
fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
|
||||
llama_batch_free(batch);
|
||||
return 1;
|
||||
}
|
||||
n_past += 1;
|
||||
}
|
||||
|
||||
printf("\n");
|
||||
|
||||
llama_sampler_free(smpl);
|
||||
llama_sampler_free(smpl2);
|
||||
llama_sampler_free(smpl3);
|
||||
llama_sampler_free(smpl4);
|
||||
|
||||
llama_batch_free(batch);
|
||||
|
||||
@@ -226,12 +302,18 @@ int main(int argc, char ** argv) {
|
||||
|
||||
llama_free(ctx2);
|
||||
llama_free(ctx3);
|
||||
llama_free(ctx4);
|
||||
|
||||
if (result0 != result2) {
|
||||
fprintf(stderr, "\n%s : error : the seq restore generation is different\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (result0 != result3) {
|
||||
fprintf(stderr, "\n%s : error : the seq restore generation is different\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
fprintf(stderr, "\n%s : success\n", __func__);
|
||||
|
||||
return 0;
|
||||
|
||||
@@ -43,7 +43,7 @@ int main(int argc, char ** argv) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (params.speculative.mparams_dft.path.empty()) {
|
||||
if (params.speculative.draft.mparams.path.empty()) {
|
||||
LOG_ERR("%s: --model-draft is required\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
@@ -77,7 +77,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// TODO: simplify this logic
|
||||
{
|
||||
const auto & params_spec = params.speculative;
|
||||
const auto & params_spec = params.speculative.draft;
|
||||
|
||||
auto params_dft = params;
|
||||
|
||||
@@ -85,15 +85,15 @@ int main(int argc, char ** argv) {
|
||||
params_dft.n_ctx = params_spec.n_ctx;
|
||||
params_dft.n_batch = llama_n_ctx_seq(ctx_tgt);
|
||||
params_dft.devices = params_spec.devices;
|
||||
params_dft.model = params_spec.mparams_dft;
|
||||
params_dft.model = params_spec.mparams;
|
||||
params_dft.n_gpu_layers = params_spec.n_gpu_layers;
|
||||
|
||||
if (params_spec.cpuparams.n_threads > 0) {
|
||||
params_dft.cpuparams.n_threads = params.speculative.cpuparams.n_threads;
|
||||
params_dft.cpuparams_batch.n_threads = params.speculative.cpuparams_batch.n_threads;
|
||||
params_dft.cpuparams.n_threads = params.speculative.draft.cpuparams.n_threads;
|
||||
params_dft.cpuparams_batch.n_threads = params.speculative.draft.cpuparams_batch.n_threads;
|
||||
}
|
||||
|
||||
params_dft.tensor_buft_overrides = params.speculative.tensor_buft_overrides;
|
||||
params_dft.tensor_buft_overrides = params.speculative.draft.tensor_buft_overrides;
|
||||
|
||||
auto mparams_dft = common_model_params_to_llama(params_dft);
|
||||
|
||||
@@ -103,8 +103,8 @@ int main(int argc, char ** argv) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
params.speculative.model_dft = model_dft.get();
|
||||
params.speculative.cparams_dft = common_context_params_to_llama(params_dft);
|
||||
params.speculative.draft.model = model_dft.get();
|
||||
params.speculative.draft.cparams = common_context_params_to_llama(params_dft);
|
||||
}
|
||||
|
||||
// Tokenize the prompt
|
||||
@@ -187,16 +187,6 @@ int main(int argc, char ** argv) {
|
||||
// generate a new draft
|
||||
draft = common_speculative_draft(spec, params_spec, prompt_tgt, id_last);
|
||||
|
||||
if ((int) draft.size() > params_spec.n_max) {
|
||||
LOG_WRN("draft size %zu exceeds max %d, truncating\n", draft.size(), params_spec.n_max);
|
||||
draft.resize(params_spec.n_max);
|
||||
}
|
||||
|
||||
if ((int) draft.size() < params_spec.n_min) {
|
||||
LOG_DBG("ignoring small draft: %zu < %d\n", draft.size(), params_spec.n_min);
|
||||
draft.clear();
|
||||
}
|
||||
|
||||
// save the original draft size
|
||||
n_draft = draft.size();
|
||||
|
||||
@@ -220,19 +210,12 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
}
|
||||
|
||||
GGML_ASSERT(n_draft > 0);
|
||||
|
||||
// always have a token to evaluate from before - id_last
|
||||
common_batch_clear(batch_tgt);
|
||||
common_batch_add (batch_tgt, id_last, n_past++, { 0 }, true);
|
||||
|
||||
// evaluate the target model on [id_last, draft0, draft1, ..., draftN-1]
|
||||
{
|
||||
// do not waste time on small drafts
|
||||
if (draft.size() < (size_t) params_spec.n_min) {
|
||||
draft.clear();
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < draft.size(); ++i) {
|
||||
common_batch_add(batch_tgt, draft[i], n_past + i, { 0 }, true);
|
||||
}
|
||||
@@ -340,7 +323,7 @@ int main(int argc, char ** argv) {
|
||||
LOG_INF("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
|
||||
|
||||
LOG_INF("\n");
|
||||
LOG_INF("n_draft = %d\n", params_spec.n_max);
|
||||
LOG_INF("n_draft = %d\n", params_spec.draft.n_max);
|
||||
LOG_INF("n_predict = %d\n", n_predict);
|
||||
LOG_INF("n_drafted = %d\n", n_drafted);
|
||||
LOG_INF("n_accept = %d\n", n_accept);
|
||||
|
||||
@@ -49,7 +49,7 @@ int main(int argc, char ** argv) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (params.speculative.mparams_dft.path.empty()) {
|
||||
if (params.speculative.draft.mparams.path.empty()) {
|
||||
LOG_ERR("%s: --model-draft is required\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
@@ -58,7 +58,7 @@ int main(int argc, char ** argv) {
|
||||
const int n_seq_dft = params.n_parallel;
|
||||
|
||||
// probability threshold for splitting a draft branch (only for n_seq_dft > 1)
|
||||
const float p_draft_split = params.speculative.p_split;
|
||||
const float p_draft_split = params.speculative.draft.p_split;
|
||||
|
||||
std::default_random_engine rng(params.sampling.seed == LLAMA_DEFAULT_SEED ? std::random_device()() : params.sampling.seed);
|
||||
std::uniform_real_distribution<> u_dist;
|
||||
@@ -80,15 +80,15 @@ int main(int argc, char ** argv) {
|
||||
ctx_tgt = llama_init_tgt->context();
|
||||
|
||||
// load the draft model
|
||||
params.devices = params.speculative.devices;
|
||||
params.model = params.speculative.mparams_dft;
|
||||
params.n_gpu_layers = params.speculative.n_gpu_layers;
|
||||
if (params.speculative.cpuparams.n_threads > 0) {
|
||||
params.cpuparams.n_threads = params.speculative.cpuparams.n_threads;
|
||||
params.devices = params.speculative.draft.devices;
|
||||
params.model = params.speculative.draft.mparams;
|
||||
params.n_gpu_layers = params.speculative.draft.n_gpu_layers;
|
||||
if (params.speculative.draft.cpuparams.n_threads > 0) {
|
||||
params.cpuparams.n_threads = params.speculative.draft.cpuparams.n_threads;
|
||||
}
|
||||
|
||||
params.cpuparams_batch.n_threads = params.speculative.cpuparams_batch.n_threads;
|
||||
params.tensor_buft_overrides = params.speculative.tensor_buft_overrides;
|
||||
params.cpuparams_batch.n_threads = params.speculative.draft.cpuparams_batch.n_threads;
|
||||
params.tensor_buft_overrides = params.speculative.draft.tensor_buft_overrides;
|
||||
|
||||
auto llama_init_dft = common_init_from_params(params);
|
||||
|
||||
@@ -110,13 +110,21 @@ int main(int argc, char ** argv) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (
|
||||
llama_vocab_get_add_bos(vocab_tgt) != llama_vocab_get_add_bos(vocab_dft) ||
|
||||
llama_vocab_get_add_eos(vocab_tgt) != llama_vocab_get_add_eos(vocab_dft) ||
|
||||
llama_vocab_bos(vocab_tgt) != llama_vocab_bos(vocab_dft) ||
|
||||
llama_vocab_eos(vocab_tgt) != llama_vocab_eos(vocab_dft)
|
||||
) {
|
||||
LOG_ERR("%s: draft model special tokens must match target model to use speculation\n", __func__);
|
||||
if (llama_vocab_get_add_bos(vocab_tgt) != llama_vocab_get_add_bos(vocab_dft) ||
|
||||
(llama_vocab_get_add_bos(vocab_tgt) && llama_vocab_bos(vocab_tgt) != llama_vocab_bos(vocab_dft))) {
|
||||
LOG_ERR("%s: draft model bos tokens must match target model to use speculation. add: %d - %d, id: %d - %d)\n",
|
||||
__func__,
|
||||
llama_vocab_get_add_bos(vocab_tgt), llama_vocab_get_add_bos(vocab_dft),
|
||||
llama_vocab_bos(vocab_tgt), llama_vocab_bos(vocab_dft));
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (llama_vocab_get_add_eos(vocab_tgt) != llama_vocab_get_add_eos(vocab_dft) ||
|
||||
(llama_vocab_get_add_eos(vocab_tgt) && llama_vocab_eos(vocab_tgt) != llama_vocab_eos(vocab_dft))) {
|
||||
LOG_ERR("%s: draft model eos tokens must match target model to use speculation. add: %d - %d, id: %d - %d)\n",
|
||||
__func__,
|
||||
llama_vocab_get_add_eos(vocab_tgt), llama_vocab_get_add_eos(vocab_dft),
|
||||
llama_vocab_eos(vocab_tgt), llama_vocab_eos(vocab_dft));
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -137,11 +145,12 @@ int main(int argc, char ** argv) {
|
||||
for (int i = SPEC_VOCAB_CHECK_START_TOKEN_ID; i < std::min(n_vocab_tgt, n_vocab_dft); ++i) {
|
||||
const char * token_text_tgt = llama_vocab_get_text(vocab_tgt, i);
|
||||
const char * token_text_dft = llama_vocab_get_text(vocab_dft, i);
|
||||
|
||||
if (std::strcmp(token_text_tgt, token_text_dft) != 0) {
|
||||
LOG_ERR("%s: draft model vocab must match target model to use speculation but ", __func__);
|
||||
LOG_ERR("token %d content differs - target '%s', draft '%s'\n", i,
|
||||
common_token_to_piece(ctx_tgt, i).c_str(),
|
||||
common_token_to_piece(ctx_dft, i).c_str());
|
||||
common_token_to_piece(vocab_tgt, i).c_str(),
|
||||
common_token_to_piece(vocab_dft, i).c_str());
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
@@ -183,7 +192,7 @@ int main(int argc, char ** argv) {
|
||||
//GGML_ASSERT(n_vocab == llama_vocab_n_tokens(model_dft));
|
||||
|
||||
// how many tokens to draft each time
|
||||
int n_draft = params.speculative.n_max;
|
||||
int n_draft = params.speculative.draft.n_max;
|
||||
|
||||
int n_predict = 0;
|
||||
int n_drafted = 0;
|
||||
|
||||
124
examples/sycl/start-svr.sh
Executable file
124
examples/sycl/start-svr.sh
Executable file
@@ -0,0 +1,124 @@
|
||||
#!/bin/bash
|
||||
|
||||
# MIT license
|
||||
# Copyright (C) 2024 Intel Corporation
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
# Print the usage text for this launcher to stdout.
# Only options that the argument parser of this script actually accepts are
# listed; the server launcher takes no prompt, so no prompt option is advertised.
Help() {
    cat << EOF
Usage: $(basename "$0") [OPTIONS]

This script processes files with specified options.

Options:
  -h, --help                   Display this help message and exit.
  -c, --context <value>        Set context length. Bigger need more memory.
  -m, --model <value>          Full model file path.
  -mg,--main-gpu <value>       Set main GPU ID (0 - n) for single GPU mode.
  -sm,--split-mode <value>     How to split the model across multiple GPUs, one of:
                               - none: use one GPU only
                               - layer (default): split layers and KV across GPUs
                               - row: split rows across GPUs
  -ngl,--n-gpu-layers <value>  Max. number of layers to store in VRAM (default: -1)
  -lv,--log-verbosity <value>  Set the verbosity threshold. Messages with a higher verbosity will be
                               ignored. Values:
                               - 0: generic output
                               - 1: error
                               - 2: warning
                               - 3: info
                               - 4: debug


EOF
}
|
||||
|
||||
# Default settings; each one can be overridden by the options parsed below.
BIN_FILE=./build/bin/llama-server
SEED=0
GPUS_SETTING=""

MODEL_FILE=../models/Qwen3.5-4B-Q4_0.gguf
NGL=99
CONTEXT=4096
GGML_SYCL_DEVICE=-1
SPLIT_MODE=layer
LOG_VERBOSE=3

# need_value OPTION [VALUE ...]
# Abort with an error when OPTION is not followed by a non-empty value.
# Without this check a trailing "-c" would silently leave CONTEXT empty,
# because the second "shift" fails quietly once the arguments run out.
need_value() {
    if [[ $# -lt 2 || -z "$2" ]]; then
        echo "Missing value for option: $1"
        exit 1
    fi
}

# Walk the command line; every value-taking option consumes two arguments.
while [[ $# -gt 0 ]]; do
    case "$1" in
        -c|--context)
            need_value "$@"
            CONTEXT=$2
            shift 2
            ;;
        -m|--model)
            need_value "$@"
            MODEL_FILE="$2"
            shift 2
            ;;
        -mg|--main-gpu)
            need_value "$@"
            GGML_SYCL_DEVICE=$2
            # Selecting a main GPU switches to single-GPU mode; a later -sm can still override it.
            SPLIT_MODE=none
            shift 2
            ;;
        -sm|--split-mode)
            need_value "$@"
            SPLIT_MODE=$2
            shift 2
            ;;
        -ngl|--n-gpu-layers)
            need_value "$@"
            NGL=$2
            shift 2
            ;;
        -lv|--log-verbosity)
            need_value "$@"
            LOG_VERBOSE=$2
            shift 2
            ;;
        -h|--help)
            Help
            exit 0
            ;;
        *)
            # Anything else is rejected rather than treated as a positional argument.
            echo "Invalid option: $1"
            exit 1
            ;;
    esac
done
|
||||
|
||||
|
||||
|
||||
# Load the oneAPI environment (compiler runtime, Level Zero, ...).
source /opt/intel/oneapi/setvars.sh

#export GGML_SYCL_DEBUG=1

# ZES_ENABLE_SYSMAN=1 lets SYCL report the free GPU memory through
# sycl::aspect::ext_intel_free_memory. Recommended when --split-mode = layer.

# Support device memory allocations larger than 4GB.
export UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
echo "UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=${UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS}"

if [ "$GGML_SYCL_DEVICE" -ne -1 ]; then
    echo "Use $GGML_SYCL_DEVICE as main GPU"
    # Use a single GPU only.
    GPUS_SETTING="-mg $GGML_SYCL_DEVICE -sm ${SPLIT_MODE}"
    # "${$VAR}" is a bad substitution in bash and aborts the script; expand the variable directly.
    export ONEAPI_DEVICE_SELECTOR="level_zero:${GGML_SYCL_DEVICE}"
    echo "ONEAPI_DEVICE_SELECTOR=${ONEAPI_DEVICE_SELECTOR}"
else
    echo "Use all Intel GPUs, including iGPU & dGPU"
    GPUS_SETTING="-sm ${SPLIT_MODE}"
fi

# Print exactly the command that is executed on the next line.
echo "run cmd: ZES_ENABLE_SYSMAN=1 ${BIN_FILE} -m ${MODEL_FILE} -ngl ${NGL} -s ${SEED} -c ${CONTEXT} ${GPUS_SETTING} -lv ${LOG_VERBOSE} --mmap --host 0.0.0.0 --port 8000"
# GPUS_SETTING is intentionally unquoted: it holds several separate arguments.
ZES_ENABLE_SYSMAN=1 "${BIN_FILE}" -m "${MODEL_FILE}" -ngl ${NGL} -s ${SEED} -c ${CONTEXT} ${GPUS_SETTING} -lv ${LOG_VERBOSE} --mmap --host 0.0.0.0 --port 8000
|
||||
|
||||
|
||||
@@ -38,7 +38,7 @@ SEED=0
|
||||
GPUS_SETTING=""
|
||||
|
||||
INPUT_PROMPT="Building a website can be done in 10 simple steps:\nStep 1:"
|
||||
MODEL_FILE=models/llama-2-7b.Q4_0.gguf
|
||||
MODEL_FILE=../models/llama-2-7b.Q4_0.gguf
|
||||
NGL=99
|
||||
CONTEXT=4096
|
||||
GGML_SYCL_DEVICE=-1
|
||||
@@ -119,12 +119,13 @@ if [ $GGML_SYCL_DEVICE -ne -1 ]; then
|
||||
echo "Use $GGML_SYCL_DEVICE as main GPU"
|
||||
#use single GPU only
|
||||
GPUS_SETTING="-mg $GGML_SYCL_DEVICE -sm ${SPLIT_MODE}"
|
||||
export ONEAPI_DEVICE_SELECTOR="level_zero:${$GGML_SYCL_DEVICE}"
|
||||
export ONEAPI_DEVICE_SELECTOR="level_zero:${GGML_SYCL_DEVICE}"
|
||||
echo "ONEAPI_DEVICE_SELECTOR=${ONEAPI_DEVICE_SELECTOR}"
|
||||
else
|
||||
echo "Use all Intel GPUs, including iGPU & dGPU"
|
||||
echo "Use all Intel GPUs, including iGPU & dGPU"
|
||||
GPUS_SETTING="-sm ${SPLIT_MODE}"
|
||||
fi
|
||||
|
||||
echo "run cmd: ZES_ENABLE_SYSMAN=1 ${BIN_FILE} -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s ${SEED} -c ${CONTEXT} ${GPUS_SETTING} -lv ${LOG_VERBOSE} --mmap "
|
||||
ZES_ENABLE_SYSMAN=1 ${BIN_FILE} -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 400 -e -ngl ${NGL} -s ${SEED} -c ${CONTEXT} ${GPUS_SETTING} -lv ${LOG_VERBOSE} --mmap
|
||||
echo "run cmd: ZES_ENABLE_SYSMAN=1 ${BIN_FILE} -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 200 -e -ngl ${NGL} -s ${SEED} -c ${CONTEXT} ${GPUS_SETTING} -lv ${LOG_VERBOSE} --mmap "
|
||||
ZES_ENABLE_SYSMAN=1 ${BIN_FILE} -m ${MODEL_FILE} -no-cnv -p "${INPUT_PROMPT}" -n 200 -e -ngl ${NGL} -s ${SEED} -c ${CONTEXT} ${GPUS_SETTING} -lv ${LOG_VERBOSE} --mmap
|
||||
|
||||
|
||||
179
examples/sycl/win-start-svr.bat
Normal file
179
examples/sycl/win-start-svr.bat
Normal file
@@ -0,0 +1,179 @@
|
||||
:: MIT license
|
||||
:: Copyright (C) 2024 Intel Corporation
|
||||
:: SPDX-License-Identifier: MIT
|
||||
|
||||
@echo off
|
||||
setlocal EnableExtensions EnableDelayedExpansion
|
||||
|
||||
REM ---- Default settings; every one can be overridden from the command line ----
set "BIN_FILE=.\build\bin\llama-server.exe"
set "SEED=0"
set "GPUS_SETTING="

set "MODEL_FILE=..\models\Qwen3.5-4B-Q4_0.gguf"
set "NGL=99"
set "CONTEXT=4096"
set "GGML_SYCL_DEVICE=-1"
set "SPLIT_MODE=layer"
set "LOG_VERBOSE=3"

REM ---- Option parsing ----
REM Each pass looks at %1, maps it to the variable that stores its value,
REM stores %2 there and drops both arguments. OPT_VAR is scratch state only
REM and is cleared again before control leaves the loop.
:parse_args
set "OPT_VAR="
if "%~1"=="" goto after_args

if /I "%~1"=="-h"     goto help
if /I "%~1"=="--help" goto help

if /I "%~1"=="-c"              set "OPT_VAR=CONTEXT"
if /I "%~1"=="--context"       set "OPT_VAR=CONTEXT"
if /I "%~1"=="-m"              set "OPT_VAR=MODEL_FILE"
if /I "%~1"=="--model"         set "OPT_VAR=MODEL_FILE"
if /I "%~1"=="-mg"             set "OPT_VAR=GGML_SYCL_DEVICE"
if /I "%~1"=="--main-gpu"      set "OPT_VAR=GGML_SYCL_DEVICE"
if /I "%~1"=="-sm"             set "OPT_VAR=SPLIT_MODE"
if /I "%~1"=="--split-mode"    set "OPT_VAR=SPLIT_MODE"
if /I "%~1"=="-ngl"            set "OPT_VAR=NGL"
if /I "%~1"=="--n-gpu-layers"  set "OPT_VAR=NGL"
if /I "%~1"=="-lv"             set "OPT_VAR=LOG_VERBOSE"
if /I "%~1"=="--log-verbosity" set "OPT_VAR=LOG_VERBOSE"

if not defined OPT_VAR goto invalid_option
if "%~2"=="" goto missing_value

set "%OPT_VAR%=%~2"
REM Choosing a main GPU implies single-GPU mode unless a later -sm overrides it.
if "%OPT_VAR%"=="GGML_SYCL_DEVICE" set "SPLIT_MODE=none"
shift
shift
goto parse_args

:invalid_option
echo Invalid option: %~1
exit /b 1

:missing_value
echo Missing value for option: %~1
exit /b 1
|
||||
|
||||
:help
|
||||
echo Usage: %~n0 [OPTIONS]
|
||||
echo.
|
||||
echo This script processes files with specified options.
|
||||
echo.
|
||||
echo Options:
|
||||
echo -h, --help Display this help message and exit.
|
||||
echo -c, --context ^<value^> Set context length. Bigger need more memory.
|
||||
echo -m, --model ^<value^> Full model file path.
|
||||
echo -mg,--main-gpu ^<value^> Set main GPU ID (0 - n) for single GPU mode.
|
||||
echo -sm,--split-mode ^<value^> How to split the model across multiple GPUs, one of:
|
||||
echo - none: use one GPU only
|
||||
echo - layer (default): split layers and KV across GPUs
|
||||
echo - row: split rows across GPUs
|
||||
echo -ngl,--n-gpu-layers ^<value^> Max. number of layers to store in VRAM (default: -1)
|
||||
echo -lv,--log-verbosity ^<value^> Set the verbosity threshold. Messages with a higher verbosity will be
|
||||
echo ignored. Values:
|
||||
echo - 0: generic output
|
||||
echo - 1: error
|
||||
echo - 2: warning
|
||||
echo - 3: info
|
||||
echo - 4: debug
|
||||
exit /b 0
|
||||
|
||||
:after_args

REM In Windows CMD, source is not available; call oneAPI setvars if present.
if exist "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" (
    call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" >nul
) else (
    echo Warning: oneAPI setvars.bat not found. Continuing without environment setup.
)

REM Support malloc device memory more than 4GB.
set "UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1"
echo UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=%UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS%

if not "%GGML_SYCL_DEVICE%"=="-1" (
    echo Use %GGML_SYCL_DEVICE% as main GPU
    REM Use single GPU only.
    set "GPUS_SETTING=-mg %GGML_SYCL_DEVICE% -sm %SPLIT_MODE%"
    set "ONEAPI_DEVICE_SELECTOR=level_zero:%GGML_SYCL_DEVICE%"
    REM %VAR% inside a parenthesised block is expanded when the block is parsed,
    REM i.e. before the set above runs; !VAR! reads the value that was just assigned.
    echo ONEAPI_DEVICE_SELECTOR=!ONEAPI_DEVICE_SELECTOR!
) else (
    echo Use all Intel GPUs, including iGPU ^& dGPU
    set "GPUS_SETTING=-sm %SPLIT_MODE%"
)

REM Show the command line, then run it with ZES_ENABLE_SYSMAN=1 in the environment.
echo run cmd: ZES_ENABLE_SYSMAN=1 %BIN_FILE% -m "%MODEL_FILE%" -ngl %NGL% -s %SEED% -c %CONTEXT% %GPUS_SETTING% -lv %LOG_VERBOSE% --mmap --host 0.0.0.0 --port 8000
set "ZES_ENABLE_SYSMAN=1"
%BIN_FILE% -m "%MODEL_FILE%" -ngl %NGL% -s %SEED% -c %CONTEXT% %GPUS_SETTING% -lv %LOG_VERBOSE% --mmap --host 0.0.0.0 --port 8000

endlocal
|
||||
|
||||
@@ -2,10 +2,200 @@
|
||||
:: Copyright (C) 2024 Intel Corporation
|
||||
:: SPDX-License-Identifier: MIT
|
||||
|
||||
set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
|
||||
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
|
||||
|
||||
:: support malloc device memory more than 4GB.
|
||||
set UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1
|
||||
set LOAD_MODE="--mmap"
|
||||
.\build\bin\llama-completion.exe -m models\llama-2-7b.Q4_0.gguf -no-cnv -p %INPUT2% -n 400 -e -ngl 99 -s 0 %LOAD_MODE%
|
||||
@echo off
|
||||
setlocal EnableExtensions EnableDelayedExpansion
|
||||
|
||||
REM MIT license
|
||||
REM Copyright (C) 2024 Intel Corporation
|
||||
REM SPDX-License-Identifier: MIT
|
||||
|
||||
REM Default settings; every one can be overridden from the command line.
set "BIN_FILE=.\build\bin\llama-completion.exe"
set "SEED=0"
set "GPUS_SETTING="

REM The prompt carries a literal backslash-n; the -e switch passed to the binary
REM asks it to process escapes. A caret inside a quoted set is kept verbatim,
REM so "^n" would reach the program unchanged instead of as an escape sequence.
set "INPUT_PROMPT=Building a website can be done in 10 simple steps:\nStep 1:"
set "MODEL_FILE=..\models\llama-2-7b.Q4_0.gguf"
set "NGL=99"
set "CONTEXT=4096"
set "GGML_SYCL_DEVICE=-1"
set "SPLIT_MODE=layer"
set "LOG_VERBOSE=3"
|
||||
|
||||
if "%~1"=="" goto after_args
|
||||
|
||||
REM ---- Option parsing ----
REM Each pass looks at %1, maps it to the variable that stores its value,
REM stores %2 there and drops both arguments. OPT_VAR is scratch state only
REM and is cleared again before control leaves the loop.
:parse_args
set "OPT_VAR="
if "%~1"=="" goto after_args

if /I "%~1"=="-h"     goto help
if /I "%~1"=="--help" goto help

if /I "%~1"=="-c"              set "OPT_VAR=CONTEXT"
if /I "%~1"=="--context"       set "OPT_VAR=CONTEXT"
if /I "%~1"=="-p"              set "OPT_VAR=INPUT_PROMPT"
if /I "%~1"=="--promote"       set "OPT_VAR=INPUT_PROMPT"
if /I "%~1"=="-m"              set "OPT_VAR=MODEL_FILE"
if /I "%~1"=="--model"         set "OPT_VAR=MODEL_FILE"
if /I "%~1"=="-mg"             set "OPT_VAR=GGML_SYCL_DEVICE"
if /I "%~1"=="--main-gpu"      set "OPT_VAR=GGML_SYCL_DEVICE"
if /I "%~1"=="-sm"             set "OPT_VAR=SPLIT_MODE"
if /I "%~1"=="--split-mode"    set "OPT_VAR=SPLIT_MODE"
if /I "%~1"=="-ngl"            set "OPT_VAR=NGL"
if /I "%~1"=="--n-gpu-layers"  set "OPT_VAR=NGL"
if /I "%~1"=="-lv"             set "OPT_VAR=LOG_VERBOSE"
if /I "%~1"=="--log-verbosity" set "OPT_VAR=LOG_VERBOSE"

if not defined OPT_VAR goto invalid_option
if "%~2"=="" goto missing_value

set "%OPT_VAR%=%~2"
REM Choosing a main GPU implies single-GPU mode unless a later -sm overrides it.
if "%OPT_VAR%"=="GGML_SYCL_DEVICE" set "SPLIT_MODE=none"
shift
shift
goto parse_args

:invalid_option
echo Invalid option: %~1
exit /b 1

:missing_value
echo Missing value for option: %~1
exit /b 1
|
||||
|
||||
:help
|
||||
echo Usage: %~n0 [OPTIONS]
|
||||
echo.
|
||||
echo This script processes files with specified options.
|
||||
echo.
|
||||
echo Options:
|
||||
echo -h, --help Display this help message and exit.
|
||||
echo -c, --context ^<value^> Set context length. Bigger need more memory.
|
||||
echo -p, --promote ^<value^> Prompt to start generation with.
|
||||
echo -m, --model ^<value^> Full model file path.
|
||||
echo -mg,--main-gpu ^<value^> Set main GPU ID (0 - n) for single GPU mode.
|
||||
echo -sm,--split-mode ^<value^> How to split the model across multiple GPUs, one of:
|
||||
echo - none: use one GPU only
|
||||
echo - layer (default): split layers and KV across GPUs
|
||||
echo - row: split rows across GPUs
|
||||
echo -ngl,--n-gpu-layers ^<value^> Max. number of layers to store in VRAM (default: -1)
|
||||
echo -lv,--log-verbosity ^<value^> Set the verbosity threshold. Messages with a higher verbosity will be
|
||||
echo ignored. Values:
|
||||
echo - 0: generic output
|
||||
echo - 1: error
|
||||
echo - 2: warning
|
||||
echo - 3: info
|
||||
echo - 4: debug
|
||||
exit /b 0
|
||||
|
||||
:after_args

REM In Windows CMD, source is not available; call oneAPI setvars if present.
if exist "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" (
    call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" >nul
) else (
    echo Warning: oneAPI setvars.bat not found. Continuing without environment setup.
)

REM Support malloc device memory more than 4GB.
set "UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1"
echo UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=%UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS%

if not "%GGML_SYCL_DEVICE%"=="-1" (
    echo Use %GGML_SYCL_DEVICE% as main GPU
    REM Use single GPU only.
    set "GPUS_SETTING=-mg %GGML_SYCL_DEVICE% -sm %SPLIT_MODE%"
    set "ONEAPI_DEVICE_SELECTOR=level_zero:%GGML_SYCL_DEVICE%"
    REM %VAR% inside a parenthesised block is expanded when the block is parsed,
    REM i.e. before the set above runs; !VAR! reads the value that was just assigned.
    echo ONEAPI_DEVICE_SELECTOR=!ONEAPI_DEVICE_SELECTOR!
) else (
    echo Use all Intel GPUs, including iGPU ^& dGPU
    set "GPUS_SETTING=-sm %SPLIT_MODE%"
)

REM Show the command line exactly as it is run, then run it with ZES_ENABLE_SYSMAN=1 set.
echo run cmd: ZES_ENABLE_SYSMAN=1 %BIN_FILE% -m "%MODEL_FILE%" -no-cnv -p "%INPUT_PROMPT%" -n 200 -e -ngl %NGL% -s %SEED% -c %CONTEXT% %GPUS_SETTING% -lv %LOG_VERBOSE% --mmap
set "ZES_ENABLE_SYSMAN=1"
%BIN_FILE% -m "%MODEL_FILE%" -no-cnv -p "%INPUT_PROMPT%" -n 200 -e -ngl %NGL% -s %SEED% -c %CONTEXT% %GPUS_SETTING% -lv %LOG_VERBOSE% --mmap

endlocal
|
||||
|
||||
|
||||
@@ -4,7 +4,7 @@ project("ggml" C CXX ASM)
|
||||
|
||||
### GGML Version
|
||||
set(GGML_VERSION_MAJOR 0)
|
||||
set(GGML_VERSION_MINOR 10)
|
||||
set(GGML_VERSION_MINOR 11)
|
||||
set(GGML_VERSION_PATCH 0)
|
||||
set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")
|
||||
|
||||
|
||||
@@ -438,6 +438,12 @@ extern "C" {
|
||||
GGML_PREC_F32 = 10,
|
||||
};
|
||||
|
||||
// op hint: optional hint for an op; ggml_mul_mat_set_hint() takes one of these values
|
||||
enum ggml_op_hint {
|
||||
GGML_HINT_NONE = 0, // no hint
|
||||
GGML_HINT_SRC0_IS_HADAMARD = 1, // NOTE(review): presumably marks src0 as a Hadamard matrix - confirm against the backends that read it
|
||||
};
|
||||
|
||||
// model file types
|
||||
enum ggml_ftype {
|
||||
GGML_FTYPE_UNKNOWN = -1,
|
||||
@@ -1419,6 +1425,11 @@ extern "C" {
|
||||
struct ggml_tensor * a,
|
||||
enum ggml_prec prec);
|
||||
|
||||
// change the hint of a matrix multiplication
// `hint` is one of enum ggml_op_hint
|
||||
GGML_API void ggml_mul_mat_set_hint(
|
||||
struct ggml_tensor * a, // NOTE(review): presumably the tensor returned by ggml_mul_mat - confirm
|
||||
enum ggml_op_hint hint);
|
||||
|
||||
// indirect matrix multiplication
|
||||
GGML_API struct ggml_tensor * ggml_mul_mat_id(
|
||||
struct ggml_context * ctx,
|
||||
|
||||
@@ -470,11 +470,10 @@ endforeach()
|
||||
|
||||
target_link_libraries(ggml-base PRIVATE Threads::Threads)
|
||||
|
||||
find_library(MATH_LIBRARY m)
|
||||
if (MATH_LIBRARY)
|
||||
if (NOT WIN32 OR NOT DEFINED ENV{ONEAPI_ROOT})
|
||||
target_link_libraries(ggml-base PRIVATE ${MATH_LIBRARY})
|
||||
endif()
|
||||
if (DEFINED MATH_LIBRARY)
|
||||
target_link_libraries(ggml-base PRIVATE ${MATH_LIBRARY})
|
||||
elseif (NOT WIN32 AND NOT DEFINED ENV{ONEAPI_ROOT})
|
||||
target_link_libraries(ggml-base PRIVATE m)
|
||||
endif()
|
||||
|
||||
if (CMAKE_SYSTEM_NAME MATCHES "Android")
|
||||
|
||||
@@ -1205,40 +1205,57 @@ static void ggml_backend_meta_buffer_set_tensor(ggml_backend_buffer_t buffer, gg
|
||||
|
||||
if (split_state.n_segments != 1) {
|
||||
GGML_ASSERT(split_state.axis >= 0 && split_state.axis < GGML_MAX_DIMS);
|
||||
GGML_ASSERT(offset == 0);
|
||||
GGML_ASSERT(size == ggml_nbytes(tensor));
|
||||
GGML_ASSERT(tensor->ne[3] == 1);
|
||||
|
||||
size_t offset_data = 0;
|
||||
std::vector<size_t> simple_offsets(n_bufs, 0);
|
||||
if (split_state.axis == GGML_BACKEND_SPLIT_AXIS_0) {
|
||||
GGML_ASSERT(tensor->ne[2] == 1);
|
||||
|
||||
const size_t row_stride = tensor->nb[1];
|
||||
GGML_ASSERT(offset % row_stride == 0);
|
||||
GGML_ASSERT(size % row_stride == 0);
|
||||
const int64_t r_start = offset / row_stride;
|
||||
const int64_t r_count = size / row_stride;
|
||||
GGML_ASSERT(r_start + r_count <= tensor->ne[1]);
|
||||
|
||||
const int64_t blck_size = ggml_blck_size(tensor->type);
|
||||
for (size_t s = 0; s < split_state.n_segments; s++) {
|
||||
for (size_t j = 0; j < n_bufs; j++) {
|
||||
ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
|
||||
GGML_ASSERT(split_state.ne[s*n_bufs + j] % blck_size == 0);
|
||||
const size_t nbytes = split_state.ne[s*n_bufs + j]/blck_size * tensor->nb[0];
|
||||
ggml_backend_tensor_set_2d(simple_tensor, (const char *) data + offset_data, simple_offsets[j], nbytes,
|
||||
tensor->ne[1], simple_tensor->nb[1], tensor->nb[1]);
|
||||
ggml_backend_tensor_set_2d(simple_tensor, (const char *) data + offset_data,
|
||||
simple_offsets[j] + r_start * simple_tensor->nb[1], nbytes,
|
||||
r_count, simple_tensor->nb[1], tensor->nb[1]);
|
||||
offset_data += nbytes;
|
||||
simple_offsets[j] += nbytes;
|
||||
}
|
||||
}
|
||||
GGML_ASSERT(offset_data*tensor->ne[1] == size);
|
||||
GGML_ASSERT(offset_data*r_count == size);
|
||||
return;
|
||||
}
|
||||
GGML_ASSERT(split_state.axis == GGML_BACKEND_SPLIT_AXIS_1);
|
||||
|
||||
const size_t row_stride = tensor->nb[2];
|
||||
GGML_ASSERT(offset % row_stride == 0);
|
||||
GGML_ASSERT(size % row_stride == 0);
|
||||
const int64_t r_start = offset / row_stride;
|
||||
const int64_t r_count = size / row_stride;
|
||||
GGML_ASSERT(r_start + r_count <= tensor->ne[2]);
|
||||
|
||||
for (size_t s = 0; s < split_state.n_segments; s++) {
|
||||
for (size_t j = 0; j < n_bufs; j++) {
|
||||
ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
|
||||
const size_t nbytes = split_state.ne[s*n_bufs + j] * tensor->nb[1];
|
||||
ggml_backend_tensor_set_2d(simple_tensor, (const char *) data + offset_data, simple_offsets[j], nbytes,
|
||||
tensor->ne[2], simple_tensor->nb[2], tensor->nb[2]);
|
||||
ggml_backend_tensor_set_2d(simple_tensor, (const char *) data + offset_data,
|
||||
simple_offsets[j] + r_start * simple_tensor->nb[2], nbytes,
|
||||
r_count, simple_tensor->nb[2], tensor->nb[2]);
|
||||
offset_data += nbytes;
|
||||
simple_offsets[j] += nbytes;
|
||||
}
|
||||
}
|
||||
GGML_ASSERT(offset_data*tensor->ne[2] == size);
|
||||
GGML_ASSERT(offset_data*r_count == size);
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -1295,40 +1312,57 @@ static void ggml_backend_meta_buffer_get_tensor(ggml_backend_buffer_t buffer, co
|
||||
|
||||
if (split_state.n_segments != 1) {
|
||||
GGML_ASSERT(split_state.axis >= 0 && split_state.axis < GGML_MAX_DIMS);
|
||||
GGML_ASSERT(offset == 0);
|
||||
GGML_ASSERT(size == ggml_nbytes(tensor));
|
||||
GGML_ASSERT(tensor->ne[3] == 1);
|
||||
|
||||
size_t offset_data = 0;
|
||||
std::vector<size_t> simple_offsets(n_bufs, 0);
|
||||
if (split_state.axis == GGML_BACKEND_SPLIT_AXIS_0) {
|
||||
GGML_ASSERT(tensor->ne[2] == 1);
|
||||
|
||||
const size_t row_stride = tensor->nb[1];
|
||||
GGML_ASSERT(offset % row_stride == 0);
|
||||
GGML_ASSERT(size % row_stride == 0);
|
||||
const int64_t r_start = offset / row_stride;
|
||||
const int64_t r_count = size / row_stride;
|
||||
GGML_ASSERT(r_start + r_count <= tensor->ne[1]);
|
||||
|
||||
const int64_t blck_size = ggml_blck_size(tensor->type);
|
||||
for (size_t s = 0; s < split_state.n_segments; s++) {
|
||||
for (size_t j = 0; j < n_bufs; j++) {
|
||||
const ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
|
||||
GGML_ASSERT(split_state.ne[s*n_bufs + j] % blck_size == 0);
|
||||
const size_t nbytes = split_state.ne[s*n_bufs + j]/blck_size * tensor->nb[0];
|
||||
ggml_backend_tensor_get_2d(simple_tensor, (char *) data + offset_data, simple_offsets[j], nbytes,
|
||||
tensor->ne[1], simple_tensor->nb[1], tensor->nb[1]);
|
||||
ggml_backend_tensor_get_2d(simple_tensor, (char *) data + offset_data,
|
||||
simple_offsets[j] + r_start * simple_tensor->nb[1], nbytes,
|
||||
r_count, simple_tensor->nb[1], tensor->nb[1]);
|
||||
offset_data += nbytes;
|
||||
simple_offsets[j] += nbytes;
|
||||
}
|
||||
}
|
||||
GGML_ASSERT(offset_data*tensor->ne[1] == size);
|
||||
GGML_ASSERT(offset_data*r_count == size);
|
||||
return;
|
||||
}
|
||||
GGML_ASSERT(split_state.axis == GGML_BACKEND_SPLIT_AXIS_1);
|
||||
|
||||
const size_t row_stride = tensor->nb[2];
|
||||
GGML_ASSERT(offset % row_stride == 0);
|
||||
GGML_ASSERT(size % row_stride == 0);
|
||||
const int64_t r_start = offset / row_stride;
|
||||
const int64_t r_count = size / row_stride;
|
||||
GGML_ASSERT(r_start + r_count <= tensor->ne[2]);
|
||||
|
||||
for (size_t s = 0; s < split_state.n_segments; s++) {
|
||||
for (size_t j = 0; j < n_bufs; j++) {
|
||||
const ggml_tensor * simple_tensor = ggml_backend_meta_buffer_simple_tensor(tensor, j);
|
||||
const size_t nbytes = split_state.ne[s*n_bufs + j] * tensor->nb[1];
|
||||
ggml_backend_tensor_get_2d(simple_tensor, (char *) data + offset_data, simple_offsets[j], nbytes,
|
||||
tensor->ne[2], simple_tensor->nb[2], tensor->nb[2]);
|
||||
ggml_backend_tensor_get_2d(simple_tensor, (char *) data + offset_data,
|
||||
simple_offsets[j] + r_start * simple_tensor->nb[2], nbytes,
|
||||
r_count, simple_tensor->nb[2], tensor->nb[2]);
|
||||
offset_data += nbytes;
|
||||
simple_offsets[j] += nbytes;
|
||||
}
|
||||
}
|
||||
GGML_ASSERT(offset_data*tensor->ne[2] == size);
|
||||
GGML_ASSERT(offset_data*r_count == size);
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -1792,7 +1826,24 @@ static enum ggml_status ggml_backend_meta_graph_compute(ggml_backend_t backend,
|
||||
continue;
|
||||
}
|
||||
|
||||
i = get_i_delayed(i);
|
||||
const int i_delayed = get_i_delayed(i);
|
||||
|
||||
// If we can delay the AllReduce we need to consider the interaction with zero-sized tensor slices.
|
||||
// A backend with such a slice would normally have valid data after participating in the AllReduce with a node that has
|
||||
// its compute flag disabled and thus gets its data zeroed out.
|
||||
// If the AllReduce is delayed then the nodes until that point also need to have their compute flag disabled.
|
||||
if (i_delayed > i) {
|
||||
for (size_t j = 0; j < n_backends; j++) {
|
||||
auto & bcj = backend_ctx->backend_configs[j];
|
||||
if ((bcj.nodes[i]->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
|
||||
for (int ii = i + 1; ii <= i_delayed; ii++) {
|
||||
bcj.nodes[ii]->flags &= ~GGML_TENSOR_FLAG_COMPUTE;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
i = i_delayed;
|
||||
|
||||
for (size_t j = 0; j < n_backends; j++) {
|
||||
auto & bcj = backend_ctx->backend_configs[j];
|
||||
@@ -2049,8 +2100,8 @@ static const ggml_backend_i ggml_backend_meta_i = {
|
||||
/* .free = */ ggml_backend_meta_free,
|
||||
/* .set_tensor_async = */ ggml_backend_meta_set_tensor_async,
|
||||
/* .get_tensor_async = */ ggml_backend_meta_get_tensor_async,
|
||||
/* .get_tensor_2d_async = */ nullptr,
|
||||
/* .set_tensor_2d_async = */ nullptr,
|
||||
/* .get_tensor_2d_async = */ nullptr,
|
||||
/* .cpy_tensor_async = */ nullptr,
|
||||
/* .synchronize = */ ggml_backend_meta_synchronize,
|
||||
/* .graph_plan_create = */ nullptr,
|
||||
|
||||
@@ -181,6 +181,12 @@ struct ggml_backend_registry {
|
||||
return;
|
||||
}
|
||||
|
||||
for (auto & entry : backends) {
|
||||
if (entry.reg == reg) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
#ifndef NDEBUG
|
||||
GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n",
|
||||
__func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg));
|
||||
@@ -192,6 +198,12 @@ struct ggml_backend_registry {
|
||||
}
|
||||
|
||||
void register_device(ggml_backend_dev_t device) {
|
||||
for (auto & dev : devices) {
|
||||
if (dev == device) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
#ifndef NDEBUG
|
||||
GGML_LOG_DEBUG("%s: registered device %s (%s)\n", __func__, ggml_backend_dev_name(device), ggml_backend_dev_description(device));
|
||||
#endif
|
||||
|
||||
@@ -262,9 +262,9 @@ static struct ggml_backend_i blas_backend_i = {
|
||||
/* .get_name = */ ggml_backend_blas_get_name,
|
||||
/* .free = */ ggml_backend_blas_free,
|
||||
/* .set_tensor_async = */ NULL,
|
||||
/* .get_tensor_2d_async = */ NULL,
|
||||
/* .set_tensor_2d_async = */ NULL,
|
||||
/* .get_tensor_async = */ NULL,
|
||||
/* .set_tensor_2d_async = */ NULL,
|
||||
/* .get_tensor_2d_async = */ NULL,
|
||||
/* .cpy_tensor_async = */ NULL,
|
||||
/* .synchronize = */ NULL,
|
||||
/* .graph_plan_create = */ NULL,
|
||||
|
||||
@@ -25,6 +25,7 @@
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml.h"
|
||||
|
||||
|
||||
#include <aclnnop/aclnn_add.h>
|
||||
#include <aclnnop/aclnn_add_rms_norm.h>
|
||||
#include <aclnnop/aclnn_addcdiv.h>
|
||||
@@ -45,7 +46,9 @@
|
||||
#include <aclnnop/aclnn_fused_infer_attention_score_v2.h>
|
||||
#include <aclnnop/aclnn_ger.h>
|
||||
#include <aclnnop/aclnn_group_norm.h>
|
||||
#include <aclnnop/aclnn_gather_v2.h>
|
||||
#include <aclnnop/aclnn_grouped_matmul_v3.h>
|
||||
#include <aclnnop/aclnn_scatter.h>
|
||||
#include <aclnnop/aclnn_gt_scalar.h>
|
||||
#include <aclnnop/aclnn_im2col.h>
|
||||
#include <aclnnop/aclnn_index_copy.h>
|
||||
@@ -62,6 +65,7 @@
|
||||
#include <aclnnop/aclnn_permute.h>
|
||||
#include <aclnnop/aclnn_pow.h>
|
||||
#include <aclnnop/aclnn_pow_tensor_tensor.h>
|
||||
#include <aclnnop/aclnn_recurrent_gated_delta_rule.h>
|
||||
#include <aclnnop/aclnn_reduce_sum.h>
|
||||
#include <aclnnop/aclnn_reflection_pad1d.h>
|
||||
#include <aclnnop/aclnn_repeat.h>
|
||||
@@ -69,11 +73,15 @@
|
||||
#include <aclnnop/aclnn_rms_norm.h>
|
||||
#include <aclnnop/aclnn_roll.h>
|
||||
#include <aclnnop/aclnn_softmax.h>
|
||||
#include <aclnnop/aclnn_softmax_cross_entropy_with_logits.h>
|
||||
#include <aclnnop/aclnn_sub.h>
|
||||
#include <aclnnop/aclnn_sum.h>
|
||||
#include <aclnnop/aclnn_threshold.h>
|
||||
#include <aclnnop/aclnn_tril.h>
|
||||
#include <aclnnop/aclnn_triangular_solve.h>
|
||||
#include <aclnnop/aclnn_triu.h>
|
||||
#include <aclnnop/aclnn_logical_not.h>
|
||||
#include <aclnnop/aclnn_masked_fill_scalar.h>
|
||||
#include <aclnnop/aclnn_upsample_nearest_2d.h>
|
||||
#include <aclnnop/aclnn_weight_quant_batch_matmul_v2.h>
|
||||
#include <aclnnop/aclnn_zero.h>
|
||||
@@ -151,6 +159,107 @@ void ggml_cann_op_unary_gated(std::function<void(ggml_backend_cann_context &, ac
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMul, acl_dst.get(), acl_src1.get());
|
||||
}
|
||||
|
||||
// Fused SwiGLU using aclnnSwiGlu: splits input along innermost dim, applies
|
||||
// SiLU to left half, multiplies by right half.
|
||||
//
|
||||
// Falls back to the generic two-kernel path when src[1] != nullptr (two
|
||||
// independent halves) or swapped != 0 (reversed activation order), as
|
||||
// aclnnSwiGlu only handles the single interleaved tensor in standard order.
|
||||
//
|
||||
// CANN tiling for SwiGlu requires (storageShapeDim + viewDims) to be even.
|
||||
// aclCreateTensor always uses storageShapeDim=1, so viewDims must be odd.
|
||||
// We use a 3D view (1+3=4, even) to satisfy this constraint while preserving
|
||||
// correct split semantics along the innermost (ne[0]) dimension.
|
||||
void ggml_cann_swiglu(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
||||
auto silu_fn = [](ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst) {
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, Silu, acl_src, acl_dst);
|
||||
};
|
||||
|
||||
const int32_t swapped = ggml_get_op_params_i32(dst, 1);
|
||||
if (dst->src[1] != nullptr || swapped != 0) {
|
||||
ggml_cann_op_unary_gated(silu_fn, ctx, dst);
|
||||
return;
|
||||
}
|
||||
|
||||
// aclnnSwiGlu requires the split dim (src->ne[0]) to be even; fall back otherwise.
|
||||
if (dst->src[0]->ne[0] % 2 != 0) {
|
||||
ggml_cann_op_unary_gated(silu_fn, ctx, dst);
|
||||
return;
|
||||
}
|
||||
|
||||
ggml_tensor * src0 = dst->src[0];
|
||||
size_t elem_size = ggml_element_size(src0);
|
||||
|
||||
// src0 GGML: [2*ne0, ne1, ne2, ne3] → 3D view [2*ne0, ne1, ne2*ne3]
|
||||
// CANN reversed: [ne2*ne3, ne1, 2*ne0], split along CANN dim 2 (last).
|
||||
int64_t ne0_x2 = src0->ne[0];
|
||||
int64_t ne1 = src0->ne[1];
|
||||
int64_t ne23 = src0->ne[2] * src0->ne[3];
|
||||
int64_t src3d_ne[] = { ne0_x2, ne1, ne23 };
|
||||
size_t src3d_nb[] = { (size_t)src0->nb[0], (size_t)src0->nb[1], (size_t)src0->nb[2] };
|
||||
acl_tensor_ptr acl_src = ggml_cann_create_tensor(src0->data, ggml_cann_type_mapping(src0->type),
|
||||
elem_size, src3d_ne, src3d_nb, 3);
|
||||
|
||||
// dst GGML: [ne0, ne1, ne2, ne3] → 3D view [ne0, ne1, ne2*ne3]
|
||||
int64_t ne0 = dst->ne[0];
|
||||
int64_t dst3d_ne[] = { ne0, ne1, ne23 };
|
||||
size_t dst3d_nb[] = { (size_t)dst->nb[0], (size_t)dst->nb[1], (size_t)dst->nb[2] };
|
||||
acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst->data, ggml_cann_type_mapping(dst->type),
|
||||
elem_size, dst3d_ne, dst3d_nb, 3);
|
||||
|
||||
// CANN tensor [ne23, ne1, 2*ne0]: split along CANN dim 2 (last) = 2*ne0.
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, SwiGlu, acl_src.get(), (int64_t)2, acl_dst.get());
|
||||
}
|
||||
|
||||
// Fused GeGLU using aclnnGeGluV3: splits input along ne[0] (CANN last dim),
|
||||
// activates the LEFT half with GELU, multiplies by right half.
|
||||
// approximate: 0=tanh, 1=none(erf). activateLeft=true matches GGML convention.
|
||||
// outGelu is a required-but-discard output buffer.
|
||||
//
|
||||
// Falls back to the generic two-kernel path when src[1] != nullptr (two
|
||||
// independent halves) or swapped != 0 (reversed activation order), as
|
||||
// aclnnGeGluV3 only handles the single interleaved tensor in standard order.
|
||||
void ggml_cann_geglu(ggml_backend_cann_context & ctx, ggml_tensor * dst, int64_t approximate) {
|
||||
auto gelu_fn = [](ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst) {
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, Gelu, acl_src, acl_dst);
|
||||
};
|
||||
|
||||
const int32_t swapped = ggml_get_op_params_i32(dst, 1);
|
||||
if (dst->src[1] != nullptr || swapped != 0) {
|
||||
ggml_cann_op_unary_gated(gelu_fn, ctx, dst);
|
||||
return;
|
||||
}
|
||||
|
||||
// aclnnGeGluV3 requires the split dim (src->ne[0]) to be even; fall back otherwise.
|
||||
if (dst->src[0]->ne[0] % 2 != 0) {
|
||||
ggml_cann_op_unary_gated(gelu_fn, ctx, dst);
|
||||
return;
|
||||
}
|
||||
|
||||
ggml_tensor * src0 = dst->src[0];
|
||||
acl_tensor_ptr acl_src = ggml_cann_create_tensor(src0);
|
||||
acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
|
||||
|
||||
// Allocate a temporary buffer for the required outGelu output (same shape as dst).
|
||||
// Build contiguous strides since the pool allocation is a fresh buffer.
|
||||
size_t elem_size = ggml_element_size(dst);
|
||||
int64_t ne[GGML_MAX_DIMS] = { dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3] };
|
||||
size_t nb[GGML_MAX_DIMS];
|
||||
nb[0] = elem_size;
|
||||
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||
nb[i] = nb[i - 1] * ne[i - 1];
|
||||
}
|
||||
size_t gelu_out_size = nb[GGML_MAX_DIMS - 1] * ne[GGML_MAX_DIMS - 1];
|
||||
ggml_cann_pool_alloc gelu_out_alloc(ctx.pool(), gelu_out_size);
|
||||
|
||||
acl_tensor_ptr acl_gelu_out = ggml_cann_create_tensor(
|
||||
gelu_out_alloc.get(), ggml_cann_type_mapping(dst->type), elem_size, ne, nb, GGML_MAX_DIMS);
|
||||
// V3 adds activateLeft param; true → Gelu(left)*right, matching GGML convention.
|
||||
// GGML dim 0 → CANN last dim (index GGML_MAX_DIMS-1 = 3 for 4D tensor).
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, GeGluV3, acl_src.get(), (int64_t)(GGML_MAX_DIMS - 1), approximate, true,
|
||||
acl_dst.get(), acl_gelu_out.get());
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Repeats elements of a tensor along each dimension according to the
|
||||
* specified repeat array.
|
||||
@@ -445,28 +554,33 @@ void ggml_cann_l2_norm(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
||||
ggml_cann_pool_alloc temp_buffer_allocator(ctx.pool(), n_bytes);
|
||||
void * buffer = temp_buffer_allocator.get();
|
||||
|
||||
int64_t div_ne[] = { 1, src->ne[1], src->ne[2], src->ne[3] };
|
||||
size_t div_nb[GGML_MAX_DIMS];
|
||||
div_nb[0] = sizeof(float);
|
||||
int64_t norm_ne[] = { 1, src->ne[1], src->ne[2], src->ne[3] };
|
||||
size_t norm_nb[GGML_MAX_DIMS];
|
||||
norm_nb[0] = sizeof(float);
|
||||
for (int i = 1; i < GGML_MAX_DIMS; ++i) {
|
||||
div_nb[i] = div_nb[i - 1] * div_ne[i - 1];
|
||||
norm_nb[i] = norm_nb[i - 1] * norm_ne[i - 1];
|
||||
}
|
||||
acl_tensor_ptr acl_div = ggml_cann_create_tensor(buffer, ACL_FLOAT, type_size, div_ne, div_nb, GGML_MAX_DIMS);
|
||||
acl_tensor_ptr acl_norm = ggml_cann_create_tensor(buffer, ACL_FLOAT, sizeof(float), norm_ne, norm_nb, GGML_MAX_DIMS);
|
||||
|
||||
std::vector<int64_t> norm_dims = { 3 };
|
||||
acl_int_array_ptr dims_array = ggml_cann_create_int_array(norm_dims.data(), norm_dims.size());
|
||||
|
||||
float p_value = 2.0f;
|
||||
acl_scalar_ptr p_scalar = ggml_cann_create_scalar(&p_value, aclDataType::ACL_FLOAT);
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, Norm, acl_src.get(), p_scalar.get(), dims_array.get(), true, acl_div.get());
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, Norm, acl_src.get(), p_scalar.get(), dims_array.get(), true, acl_norm.get());
|
||||
|
||||
// Clamp norm to at least eps: scale = 1/fmaxf(norm, eps)
|
||||
acl_scalar_ptr acl_min = ggml_cann_create_scalar(&eps, aclDataType::ACL_FLOAT);
|
||||
float flt_max = FLT_MAX;
|
||||
acl_scalar_ptr acl_max = ggml_cann_create_scalar(&flt_max, aclDataType::ACL_FLOAT);
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, Clamp, acl_div.get(), acl_min.get(), acl_max.get(), acl_div.get());
|
||||
ggml_cann_pool_alloc clamp_buffer_allocator(ctx.pool());
|
||||
acl_tensor_ptr acl_clamped;
|
||||
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, Div, acl_src.get(), acl_div.get(), acl_dst.get());
|
||||
if (eps > 0.0f) {
|
||||
void * clamp_buf = clamp_buffer_allocator.alloc(n_bytes);
|
||||
acl_clamped = ggml_cann_create_tensor(clamp_buf, ACL_FLOAT, sizeof(float), norm_ne, norm_nb, GGML_MAX_DIMS);
|
||||
acl_scalar_ptr eps_scalar = ggml_cann_create_scalar(&eps, aclDataType::ACL_FLOAT);
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, ClampMin, acl_norm.get(), eps_scalar.get(), acl_clamped.get());
|
||||
}
|
||||
|
||||
aclTensor * acl_div_input = acl_clamped ? acl_clamped.get() : acl_norm.get();
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, Div, acl_src.get(), acl_div_input, acl_dst.get());
|
||||
}
|
||||
|
||||
void ggml_cann_cross_entropy_loss(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
||||
@@ -482,56 +596,30 @@ void ggml_cann_cross_entropy_loss(ggml_backend_cann_context & ctx, ggml_tensor *
|
||||
logits_nb[1] = logits_nb[0] * logits_ne[0];
|
||||
acl_tensor_ptr acl_logits = ggml_cann_create_tensor(src0->data, ACL_FLOAT, sizeof(float), logits_ne, logits_nb, 2);
|
||||
|
||||
size_t log_softmax_type_size = sizeof(float);
|
||||
int64_t log_softmax_n_bytes = nr * nc * log_softmax_type_size;
|
||||
ggml_cann_pool_alloc log_softmax_allocator(ctx.pool(), log_softmax_n_bytes);
|
||||
void * log_softmax_buffer = log_softmax_allocator.get();
|
||||
|
||||
int64_t log_softmax_ne[] = { nc, nr };
|
||||
size_t log_softmax_nb[2];
|
||||
log_softmax_nb[0] = log_softmax_type_size;
|
||||
log_softmax_nb[1] = log_softmax_nb[0] * log_softmax_ne[0];
|
||||
acl_tensor_ptr acl_log_softmax = ggml_cann_create_tensor(log_softmax_buffer, ACL_FLOAT, log_softmax_type_size,
|
||||
log_softmax_ne, log_softmax_nb, 2);
|
||||
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, LogSoftmax, acl_logits.get(), 1, acl_log_softmax.get());
|
||||
|
||||
int64_t labels_ne[] = { nc, nr };
|
||||
size_t labels_nb[2];
|
||||
labels_nb[0] = ggml_type_size(src1->type);
|
||||
labels_nb[1] = labels_nb[0] * labels_ne[0];
|
||||
acl_tensor_ptr acl_labels = ggml_cann_create_tensor(src1->data, ACL_FLOAT, sizeof(float), labels_ne, labels_nb, 2);
|
||||
|
||||
size_t mul_type_size = sizeof(float);
|
||||
int64_t mul_n_bytes = nr * nc * mul_type_size;
|
||||
ggml_cann_pool_alloc mul_allocator(ctx.pool(), mul_n_bytes);
|
||||
void * mul_buffer = mul_allocator.get();
|
||||
size_t loss_per_sample_type_size = sizeof(float);
|
||||
int64_t loss_per_sample_n_bytes = nr * loss_per_sample_type_size;
|
||||
ggml_cann_pool_alloc loss_per_sample_allocator(ctx.pool(), loss_per_sample_n_bytes);
|
||||
void * loss_per_sample_buffer = loss_per_sample_allocator.get();
|
||||
|
||||
int64_t mul_ne[] = { nc, nr };
|
||||
size_t mul_nb[2];
|
||||
mul_nb[0] = mul_type_size;
|
||||
mul_nb[1] = mul_nb[0] * mul_ne[0];
|
||||
acl_tensor_ptr acl_mul_result = ggml_cann_create_tensor(mul_buffer, ACL_FLOAT, mul_type_size, mul_ne, mul_nb, 2);
|
||||
int64_t loss_per_sample_ne[] = { nr };
|
||||
size_t loss_per_sample_nb[1];
|
||||
loss_per_sample_nb[0] = loss_per_sample_type_size;
|
||||
acl_tensor_ptr acl_loss_per_sample = ggml_cann_create_tensor(
|
||||
loss_per_sample_buffer, ACL_FLOAT, loss_per_sample_type_size, loss_per_sample_ne, loss_per_sample_nb, 1);
|
||||
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, Mul, acl_log_softmax.get(), acl_labels.get(), acl_mul_result.get());
|
||||
size_t backprop_n_bytes = nr * nc * sizeof(float);
|
||||
ggml_cann_pool_alloc backprop_allocator(ctx.pool(), backprop_n_bytes);
|
||||
void * backprop_buffer = backprop_allocator.get();
|
||||
acl_tensor_ptr acl_backprop = ggml_cann_create_tensor(backprop_buffer, ACL_FLOAT, sizeof(float), logits_ne, logits_nb, 2);
|
||||
|
||||
size_t sum_per_sample_type_size = sizeof(float);
|
||||
int64_t sum_per_sample_n_bytes = nr * sum_per_sample_type_size;
|
||||
ggml_cann_pool_alloc sum_per_sample_allocator(ctx.pool(), sum_per_sample_n_bytes);
|
||||
void * sum_per_sample_buffer = sum_per_sample_allocator.get();
|
||||
|
||||
int64_t sum_per_sample_ne[] = { nr };
|
||||
size_t sum_per_sample_nb[1];
|
||||
sum_per_sample_nb[0] = sum_per_sample_type_size;
|
||||
acl_tensor_ptr acl_sum_per_sample = ggml_cann_create_tensor(
|
||||
sum_per_sample_buffer, ACL_FLOAT, sum_per_sample_type_size, sum_per_sample_ne, sum_per_sample_nb, 1);
|
||||
|
||||
std::vector<int64_t> sum_dims = { 1 };
|
||||
acl_int_array_ptr dims_array = ggml_cann_create_int_array(sum_dims.data(), sum_dims.size());
|
||||
bool keep_dims = false;
|
||||
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, ReduceSum, acl_mul_result.get(), dims_array.get(), keep_dims, ACL_FLOAT,
|
||||
acl_sum_per_sample.get());
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, SoftmaxCrossEntropyWithLogits, acl_logits.get(), acl_labels.get(),
|
||||
acl_loss_per_sample.get(), acl_backprop.get());
|
||||
|
||||
size_t total_sum_type_size = sizeof(float);
|
||||
int64_t total_sum_n_bytes = 1 * total_sum_type_size;
|
||||
@@ -547,11 +635,12 @@ void ggml_cann_cross_entropy_loss(ggml_backend_cann_context & ctx, ggml_tensor *
|
||||
|
||||
std::vector<int64_t> total_sum_dims = { 0 };
|
||||
acl_int_array_ptr total_sum_dims_array = ggml_cann_create_int_array(total_sum_dims.data(), total_sum_dims.size());
|
||||
bool keep_dims = false;
|
||||
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, ReduceSum, acl_sum_per_sample.get(), total_sum_dims_array.get(), keep_dims, ACL_FLOAT,
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, ReduceSum, acl_loss_per_sample.get(), total_sum_dims_array.get(), keep_dims, ACL_FLOAT,
|
||||
acl_total_sum.get());
|
||||
|
||||
float value = -1.0f / static_cast<float>(nr);
|
||||
float value = 1.0f / static_cast<float>(nr);
|
||||
acl_scalar_ptr scale_factor = ggml_cann_create_scalar(&value, aclDataType::ACL_FLOAT);
|
||||
acl_tensor_ptr acl_dst =
|
||||
ggml_cann_create_tensor(dst->data, ACL_FLOAT, sizeof(float), total_sum_ne, total_sum_nb, 1);
|
||||
@@ -589,6 +678,33 @@ void ggml_cann_group_norm(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
||||
acl_mean_out.get(), acl_rstd_out.get());
|
||||
}
|
||||
|
||||
// Writes src[1] into a strided region of dst.
//
// op_params (int32): [0..2] = byte strides nb1/nb2/nb3 of the target view,
// [3] = byte offset of the view inside dst, [4] = non-zero when dst is updated
// in place. When not in place, dst is first filled with a copy of src[0].
void ggml_cann_set(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
    ggml_tensor * base  = dst->src[0];
    ggml_tensor * patch = dst->src[1];

    const int32_t * params      = (const int32_t *) dst->op_params;
    const size_t    view_offset = params[3];
    const bool      in_place    = params[4] != 0;

    // Strides of the target view: element size for dim 0, then nb1..nb3 from op_params.
    size_t view_nb[] = { ggml_element_size(base), (size_t) params[0], (size_t) params[1], (size_t) params[2] };

    // View of dst at the target offset, shaped like src[1].
    acl_tensor_ptr acl_target =
        ggml_cann_create_tensor(dst, patch->ne, view_nb, GGML_MAX_DIMS, ACL_FORMAT_ND, view_offset);
    acl_tensor_ptr acl_patch = ggml_cann_create_tensor(patch);

    if (!in_place) {
        // Seed dst with the full contents of src[0] before patching it.
        const size_t n_bytes = ggml_nbytes(dst);
        ACL_CHECK(
            aclrtMemcpyAsync(dst->data, n_bytes, base->data, n_bytes, ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream()));
    }

    // Overwrite the target region with src[1].
    GGML_CANN_CALL_ACLNN_OP(ctx, InplaceCopy, acl_target.get(), acl_patch.get());
}
|
||||
|
||||
void ggml_cann_acc(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
||||
ggml_tensor * src0 = dst->src[0];
|
||||
ggml_tensor * src1 = dst->src[1];
|
||||
@@ -652,6 +768,113 @@ void ggml_cann_sum(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
||||
aclnn_reduce_sum(ctx, dst, reduce_dims, 4);
|
||||
}
|
||||
|
||||
// Cumulative sum of dst->src[0] into dst using aclnnCumsum.
void ggml_cann_cumsum(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
    // GGML cumsum runs along dim 0 (innermost / ne[0]). ggml_cann_create_tensor
    // reverses the dimensions to [ne3,ne2,ne1,ne0], so that axis is CANN dim 3,
    // the last dim of the 4-D tensor.
    const int64_t cann_axis = 3;

    ggml_tensor *  input   = dst->src[0];
    acl_tensor_ptr acl_in  = ggml_cann_create_tensor(input);
    acl_tensor_ptr acl_out = ggml_cann_create_tensor(dst);

    GGML_CANN_CALL_ACLNN_OP(ctx, Cumsum, acl_in.get(), cann_axis, ggml_cann_type_mapping(dst->type),
                            acl_out.get());
}
|
||||
|
||||
// Solves the lower-triangular system A X = B with aclnnTriangularSolve.
//   src[0] = A: [N, N, B2, B3] lower triangular
//   src[1] = B: [K, N, B2, B3]
//   dst    = X
void ggml_cann_solve_tri(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
    ggml_tensor * mat_a = dst->src[0];
    ggml_tensor * mat_b = dst->src[1];

    acl_tensor_ptr acl_a = ggml_cann_create_tensor(mat_a);
    acl_tensor_ptr acl_b = ggml_cann_create_tensor(mat_b);
    acl_tensor_ptr acl_x = ggml_cann_create_tensor(dst);

    // The op also emits mOut, a triangular copy of A with A's shape. It is a
    // required output, so give it a scratch buffer from the pool.
    ggml_cann_pool_alloc m_out_buf(ctx.pool(), ggml_nbytes(mat_a));
    acl_tensor_ptr       acl_m_out =
        ggml_cann_create_tensor(m_out_buf.get(), ggml_cann_type_mapping(mat_a->type), ggml_type_size(mat_a->type),
                                mat_a->ne, mat_a->nb, GGML_MAX_DIMS);

    // Flags: lower triangular, no transpose, diagonal is not assumed to be 1.
    const bool upper         = false;
    const bool transpose     = false;
    const bool unitriangular = false;
    GGML_CANN_CALL_ACLNN_OP(ctx, TriangularSolve, acl_b.get(), acl_a.get(), upper, transpose, unitriangular,
                            acl_x.get(), acl_m_out.get());
}
|
||||
|
||||
// Builds diagonal matrices: dst is zero-filled, then each length-N vector of
// src[0] (which must have ne[1] == 1) is copied onto the diagonal of the
// matching N x N slice of dst.
// NOTE(review): both views are built with float-sized, contiguous strides -
// confirm src/dst are always contiguous F32 here.
void ggml_cann_diag(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
    ggml_tensor * vec = dst->src[0];

    GGML_ASSERT(vec->ne[1] == 1);

    const size_t  f32_size = sizeof(float);
    const int64_t N        = vec->ne[0];
    const int64_t n_batch  = vec->ne[2] * vec->ne[3];

    // Step 1: zero the whole destination.
    acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst);
    {
        float          zero_value  = 0.0f;
        acl_scalar_ptr zero_scalar = ggml_cann_create_scalar(&zero_value, ACL_FLOAT);
        GGML_CANN_CALL_ACLNN_OP(ctx, InplaceFillScalar, acl_dst.get(), zero_scalar.get());
    }

    // Step 2: copy through two [N, n_batch] views.
    int64_t view_ne[2] = { N, n_batch };
    // Source: densely packed vectors.
    size_t src_nb[2] = { f32_size, N * f32_size };
    // Destination: a stride of N+1 floats walks down the diagonal of one
    // N x N slice; consecutive slices are N*N floats apart.
    size_t diag_nb[2] = { (N + 1) * f32_size, N * N * f32_size };

    acl_tensor_ptr acl_vec_view  = ggml_cann_create_tensor(vec->data, ACL_FLOAT, f32_size, view_ne, src_nb, 2);
    acl_tensor_ptr acl_diag_view = ggml_cann_create_tensor(dst->data, ACL_FLOAT, f32_size, view_ne, diag_nb, 2);

    GGML_CANN_CALL_ACLNN_OP(ctx, InplaceCopy, acl_diag_view.get(), acl_vec_view.get());
}
|
||||
|
||||
// Fills every element of dst with the float constant stored in op param 0.
void ggml_cann_fill(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
    float fill_value = ggml_get_op_params_f32(dst, 0);

    acl_tensor_ptr acl_out   = ggml_cann_create_tensor(dst);
    acl_scalar_ptr acl_value = ggml_cann_create_scalar(&fill_value, ACL_FLOAT);

    GGML_CANN_CALL_ACLNN_OP(ctx, InplaceFillScalar, acl_out.get(), acl_value.get());
}
|
||||
|
||||
// Keeps one triangle of each S x S slice of src[0] and zeroes the rest, using
// aclnnTril / aclnnTriu. The triangle is selected by op param 0 (ggml_tri_type).
// NOTE(review): both views use S = ne[0] for rows and columns and float-sized
// contiguous strides - confirm callers guarantee square, contiguous F32 input.
void ggml_cann_tri(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
    ggml_tensor * input = dst->src[0];

    const size_t  f32_size = sizeof(float);
    const int64_t S        = input->ne[0];
    const int64_t n_batch  = input->ne[2] * input->ne[3];

    // Batched 3D view [S, S, n_batch] shared by source and destination.
    int64_t view_ne[3] = { S, S, n_batch };
    size_t  view_nb[3] = { f32_size, S * f32_size, S * S * f32_size };

    const ggml_tri_type ttype = (ggml_tri_type) ggml_get_op_params_i32(dst, 0);

    acl_tensor_ptr acl_in  = ggml_cann_create_tensor(input->data, ACL_FLOAT, f32_size, view_ne, view_nb, 3);
    acl_tensor_ptr acl_out = ggml_cann_create_tensor(dst->data, ACL_FLOAT, f32_size, view_ne, view_nb, 3);

    // Translate the GGML triangle type into (Tril or Triu, diagonal offset).
    bool    keep_lower = false;
    int64_t diagonal   = 0;
    switch (ttype) {
        case GGML_TRI_TYPE_LOWER:
            // Tril(-1): preserve row > col (strict lower), zero upper + diagonal.
            keep_lower = true;
            diagonal   = -1;
            break;
        case GGML_TRI_TYPE_LOWER_DIAG:
            // Tril(0): preserve row >= col (lower + diagonal), zero strict upper.
            keep_lower = true;
            diagonal   = 0;
            break;
        case GGML_TRI_TYPE_UPPER_DIAG:
            // Triu(0): preserve row <= col (upper + diagonal), zero strict lower.
            keep_lower = false;
            diagonal   = 0;
            break;
        case GGML_TRI_TYPE_UPPER:
            // Triu(1): preserve row < col (strict upper), zero lower + diagonal.
            keep_lower = false;
            diagonal   = 1;
            break;
        default:
            GGML_ABORT("unsupported tri type");
    }

    if (keep_lower) {
        GGML_CANN_CALL_ACLNN_OP(ctx, Tril, acl_in.get(), diagonal, acl_out.get());
    } else {
        GGML_CANN_CALL_ACLNN_OP(ctx, Triu, acl_in.get(), diagonal, acl_out.get());
    }
}
|
||||
|
||||
void ggml_cann_upsample_nearest2d(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
||||
ggml_tensor * src = dst->src[0];
|
||||
acl_tensor_ptr acl_src = ggml_cann_create_tensor(src, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
|
||||
@@ -1695,152 +1918,90 @@ void ggml_cann_softmax(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
||||
aclnn_softmax(ctx, softmax_tensor.get(), 3, acl_dst.get());
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Performs index select operation on a 4D tensor using the CANN backend.
|
||||
*
|
||||
* This function applies the `IndexSelect` operation along a specific dimension
|
||||
* of the source tensor (`src_buffer`) using the indices from the index tensor (`index`).
|
||||
* It iterates over the last two dimensions of the source tensor, creates the corresponding
|
||||
* CANN tensors for the source, index, and output slices, and executes the `IndexSelect`
|
||||
* operation for each slice.
|
||||
*
|
||||
* @param ctx The context for CANN backend operations.
|
||||
* @param src_buffer The source buffer containing the 4D input tensor data.
|
||||
* @param src_ne The dimensions of the source tensor.
|
||||
* @param src_nb The strides (byte offsets) of the source tensor.
|
||||
* @param dst_buffer The destination buffer where the output tensor data will be written.
|
||||
* @param dst_ne The dimensions of the destination tensor.
|
||||
* @param dst_nb The strides (byte offsets) of the destination tensor.
|
||||
* @param index The index tensor specifying the indices to select from the source tensor.
|
||||
* @param type The data type of the source and destination tensors.
|
||||
*/
|
||||
static void aclnn_index_select_4d(ggml_backend_cann_context & ctx,
|
||||
void * src_buffer,
|
||||
int64_t * src_ne,
|
||||
size_t * src_nb,
|
||||
void * dst_buffer,
|
||||
int64_t * dst_ne,
|
||||
size_t * dst_nb,
|
||||
ggml_tensor * index,
|
||||
ggml_type type) {
|
||||
for (int64_t i = 0; i < src_ne[3]; i++) {
|
||||
for (int64_t j = 0; j < src_ne[2]; j++) {
|
||||
// src
|
||||
acl_tensor_ptr acl_src_tensor =
|
||||
ggml_cann_create_tensor((char *) src_buffer + i * src_nb[3] + j * src_nb[2],
|
||||
ggml_cann_type_mapping(type), ggml_type_size(type), src_ne, src_nb, 2);
|
||||
|
||||
// index
|
||||
acl_tensor_ptr acl_index = ggml_cann_create_tensor(
|
||||
(char *) index->data + (i % index->ne[2]) * index->nb[2] + (j % index->ne[1]) * index->nb[1],
|
||||
ggml_cann_type_mapping(index->type), ggml_element_size(index), index->ne, index->nb, 1);
|
||||
|
||||
// out
|
||||
acl_tensor_ptr acl_out =
|
||||
ggml_cann_create_tensor((char *) dst_buffer + i * dst_nb[3] + j * dst_nb[2],
|
||||
ggml_cann_type_mapping(type), ggml_type_size(type), dst_ne, dst_nb, 2);
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, IndexSelect, acl_src_tensor.get(), 0, acl_index.get(), acl_out.get());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Performs inplace index copy operation on a 4D tensor using the CANN backend.
|
||||
*
|
||||
* This function applies the `IndexCopy` operation along a specific dimension of the
|
||||
* destination tensor (`dst_buffer`) by copying elements from the source tensor (`src_buffer`)
|
||||
* to positions specified by the index tensor (`index`).
|
||||
* It iterates over the last two dimensions of the tensors, creates the corresponding
|
||||
* CANN tensors for source, index, and destination slices, and performs the index copy
|
||||
* operation for each slice.
|
||||
*
|
||||
* @param ctx The context for CANN backend operations.
|
||||
* @param src_buffer The source buffer containing the 4D input tensor data to be copied.
|
||||
* @param src_ne The dimensions of the source tensor.
|
||||
* @param src_nb The strides (byte offsets) of the source tensor.
|
||||
* @param dst_buffer The destination buffer where values will be copied to.
|
||||
* @param dst_ne The dimensions of the destination tensor.
|
||||
* @param dst_nb The strides (byte offsets) of the destination tensor.
|
||||
* @param index The index tensor specifying target positions in the destination tensor.
|
||||
* @param type The data type of the source and destination tensors.
|
||||
*/
|
||||
static void aclnn_index_copy_4d(ggml_backend_cann_context & ctx,
|
||||
void * src_buffer,
|
||||
int64_t * src_ne,
|
||||
size_t * src_nb,
|
||||
void * dst_buffer,
|
||||
int64_t * dst_ne,
|
||||
size_t * dst_nb,
|
||||
ggml_tensor * index,
|
||||
ggml_type type) {
|
||||
for (int64_t i = 0; i < src_ne[3]; i++) {
|
||||
for (int64_t j = 0; j < src_ne[2]; j++) {
|
||||
// src
|
||||
acl_tensor_ptr acl_src_tensor =
|
||||
ggml_cann_create_tensor((char *) src_buffer + i * src_nb[3] + j * src_nb[2],
|
||||
ggml_cann_type_mapping(type), ggml_type_size(type), src_ne, src_nb, 2);
|
||||
|
||||
// index
|
||||
acl_tensor_ptr acl_index = ggml_cann_create_tensor(
|
||||
(char *) index->data + (i % index->ne[2]) * index->nb[2] + (j % index->ne[1]) * index->nb[1],
|
||||
ggml_cann_type_mapping(index->type), ggml_element_size(index), index->ne, index->nb, 1);
|
||||
|
||||
// out
|
||||
acl_tensor_ptr acl_out =
|
||||
ggml_cann_create_tensor((char *) dst_buffer + i * dst_nb[3] + j * dst_nb[2],
|
||||
ggml_cann_type_mapping(type), ggml_type_size(type), dst_ne, dst_nb, 2);
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceIndexCopy, acl_out.get(), 0, acl_index.get(), acl_src_tensor.get());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_cann_get_rows(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
||||
ggml_tensor * src0 = dst->src[0]; // src
|
||||
ggml_tensor * src0 = dst->src[0]; // weight
|
||||
ggml_tensor * src1 = dst->src[1]; // index
|
||||
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16
|
||||
|| dst->type == GGML_TYPE_BF16);
|
||||
|
||||
// n_idx: number of row indices per (i2, i3) batch slice.
|
||||
// ggml guarantees: src0->ne[2] == src1->ne[1], src0->ne[3] == src1->ne[2], src1->ne[3] == 1.
|
||||
const int64_t n_idx = src1->ne[0];
|
||||
|
||||
// Gather all (i2, i3) batch slices from src into dst.
|
||||
// ggml_cann_create_tensor reverses dims, so ACL sees [ne1, ne0].
|
||||
// GatherV2 with dim=0 gathers along ACL dim-0 == ggml ne[1] (the vocabulary / row axis).
|
||||
// nb: the 4 strides of the source buffer (nb[0..1] for the 2D slice shape,
|
||||
// nb[2..3] for computing per-batch-slice base pointer offsets).
|
||||
auto gather_batched = [&](void * src_base, aclDataType acl_type, size_t type_size,
|
||||
const size_t * nb) {
|
||||
int64_t src_ne[2] = { src0->ne[0], src0->ne[1] };
|
||||
size_t src_nb_2d[2] = { nb[0], nb[1] };
|
||||
int64_t dst_ne[2] = { src0->ne[0], n_idx };
|
||||
size_t dst_nb_2d[2] = { dst->nb[0], dst->nb[1] };
|
||||
int64_t idx_ne[1] = { n_idx };
|
||||
size_t idx_nb[1] = { (size_t)ggml_element_size(src1) };
|
||||
|
||||
for (int64_t i3 = 0; i3 < src0->ne[3]; i3++) {
|
||||
for (int64_t i2 = 0; i2 < src0->ne[2]; i2++) {
|
||||
acl_tensor_ptr acl_src = ggml_cann_create_tensor(
|
||||
(char *)src_base + i3 * nb[3] + i2 * nb[2],
|
||||
acl_type, type_size, src_ne, src_nb_2d, 2);
|
||||
acl_tensor_ptr acl_idx = ggml_cann_create_tensor(
|
||||
(char *)src1->data + i3 * src1->nb[2] + i2 * src1->nb[1],
|
||||
ggml_cann_type_mapping(src1->type), (size_t)ggml_element_size(src1),
|
||||
idx_ne, idx_nb, 1);
|
||||
acl_tensor_ptr acl_dst = ggml_cann_create_tensor(
|
||||
(char *)dst->data + i3 * dst->nb[3] + i2 * dst->nb[2],
|
||||
acl_type, type_size, dst_ne, dst_nb_2d, 2);
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, GatherV2, acl_src.get(), 0, acl_idx.get(), acl_dst.get());
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_BF16:
|
||||
case GGML_TYPE_F16:
|
||||
case GGML_TYPE_F32:
|
||||
if (src0->type == dst->type) {
|
||||
aclnn_index_select_4d(ctx, src0->data, src0->ne, src0->nb, dst->data, dst->ne, dst->nb, src1,
|
||||
dst->type);
|
||||
gather_batched(src0->data,
|
||||
ggml_cann_type_mapping(src0->type), ggml_type_size(src0->type),
|
||||
src0->nb);
|
||||
} else {
|
||||
acl_tensor_ptr acl_src0 = ggml_cann_create_tensor(src0);
|
||||
ggml_cann_pool_alloc src_buffer_allocator(ctx.pool(), ggml_nelements(src0) * ggml_element_size(dst));
|
||||
void * src_trans_buffer = src_buffer_allocator.get();
|
||||
size_t src_trans_nb[GGML_MAX_DIMS];
|
||||
src_trans_nb[0] = dst->nb[0];
|
||||
// Cast src0 to dst type, then gather.
|
||||
ggml_cann_pool_alloc src_cast_allocator(ctx.pool(),
|
||||
ggml_nelements(src0) * ggml_element_size(dst));
|
||||
size_t src_cast_nb[GGML_MAX_DIMS];
|
||||
src_cast_nb[0] = ggml_type_size(dst->type);
|
||||
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||
src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
|
||||
src_cast_nb[i] = src_cast_nb[i - 1] * src0->ne[i - 1];
|
||||
}
|
||||
acl_tensor_ptr src_trans_tensor =
|
||||
ggml_cann_create_tensor(src_trans_buffer, ggml_cann_type_mapping(dst->type),
|
||||
ggml_type_size(dst->type), src0->ne, src_trans_nb, GGML_MAX_DIMS);
|
||||
aclnn_cast(ctx, acl_src0.get(), src_trans_tensor.get(), ggml_cann_type_mapping(dst->type));
|
||||
aclnn_index_select_4d(ctx, src_trans_buffer, src0->ne, src_trans_nb, dst->data, dst->ne, dst->nb, src1,
|
||||
dst->type);
|
||||
acl_tensor_ptr acl_src0 = ggml_cann_create_tensor(src0);
|
||||
acl_tensor_ptr acl_src_cast = ggml_cann_create_tensor(
|
||||
src_cast_allocator.get(), ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
|
||||
src0->ne, src_cast_nb, GGML_MAX_DIMS);
|
||||
aclnn_cast(ctx, acl_src0.get(), acl_src_cast.get(), ggml_cann_type_mapping(dst->type));
|
||||
|
||||
gather_batched(src_cast_allocator.get(),
|
||||
ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
|
||||
src_cast_nb);
|
||||
}
|
||||
break;
|
||||
case GGML_TYPE_Q8_0:
|
||||
{
|
||||
// add 1 dim for bcast mul.
|
||||
// Dequantize Q8_0 to dst type, then gather.
|
||||
size_t weight_nb[GGML_MAX_DIMS + 1], scale_nb[GGML_MAX_DIMS + 1], dequant_nb[GGML_MAX_DIMS + 1];
|
||||
int64_t weight_ne[GGML_MAX_DIMS + 1], scale_ne[GGML_MAX_DIMS + 1], *dequant_ne;
|
||||
int64_t scale_offset = 0;
|
||||
// [3,4,5,64] -> [3,4,5,2,32]
|
||||
weight_ne[0] = QK8_0;
|
||||
weight_ne[1] = src0->ne[0] / QK8_0;
|
||||
weight_nb[0] = sizeof(int8_t);
|
||||
weight_nb[1] = weight_nb[0] * weight_ne[0];
|
||||
weight_ne[0] = QK8_0;
|
||||
weight_ne[1] = src0->ne[0] / QK8_0;
|
||||
weight_nb[0] = sizeof(int8_t);
|
||||
weight_nb[1] = weight_nb[0] * weight_ne[0];
|
||||
for (int i = 2; i < GGML_MAX_DIMS + 1; i++) {
|
||||
weight_ne[i] = src0->ne[i - 1];
|
||||
weight_nb[i] = weight_nb[i - 1] * weight_ne[i - 1];
|
||||
}
|
||||
// [3,4,5,64] -> [3,4,5,2,1]
|
||||
scale_ne[0] = 1;
|
||||
scale_ne[1] = src0->ne[0] / QK8_0;
|
||||
scale_nb[0] = sizeof(uint16_t);
|
||||
@@ -1849,31 +2010,33 @@ void ggml_cann_get_rows(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
||||
scale_ne[i] = src0->ne[i - 1];
|
||||
scale_nb[i] = scale_nb[i - 1] * scale_ne[i - 1];
|
||||
}
|
||||
// [3,4,5,64] -> [3,4,5,2,32]
|
||||
dequant_ne = weight_ne;
|
||||
dequant_nb[0] = ggml_type_size(dst->type);
|
||||
for (int i = 1; i < GGML_MAX_DIMS + 1; i++) {
|
||||
dequant_nb[i] = dequant_nb[i - 1] * dequant_ne[i - 1];
|
||||
}
|
||||
scale_offset = ggml_nelements(src0) * sizeof(int8_t);
|
||||
ggml_cann_pool_alloc dequant_buffer_allocator(ctx.pool(),
|
||||
ggml_nelements(src0) * ggml_type_size(dst->type));
|
||||
acl_tensor_ptr acl_weight_tensor = ggml_cann_create_tensor(src0->data, ACL_INT8, sizeof(int8_t),
|
||||
weight_ne, weight_nb, GGML_MAX_DIMS + 1);
|
||||
acl_tensor_ptr acl_scale_tensor =
|
||||
ggml_cann_create_tensor(src0->data, ACL_FLOAT16, sizeof(uint16_t), scale_ne, scale_nb,
|
||||
GGML_MAX_DIMS + 1, ACL_FORMAT_ND, scale_offset);
|
||||
acl_tensor_ptr dequant_tensor =
|
||||
ggml_cann_create_tensor(dequant_buffer_allocator.get(), ggml_cann_type_mapping(dst->type),
|
||||
ggml_type_size(dst->type), dequant_ne, dequant_nb, GGML_MAX_DIMS + 1);
|
||||
aclnn_mul(ctx, acl_weight_tensor.get(), acl_scale_tensor.get(), dequant_tensor.get());
|
||||
dequant_nb[0] = ggml_type_size(dst->type);
|
||||
const int64_t scale_offset = ggml_nelements(src0) * sizeof(int8_t);
|
||||
ggml_cann_pool_alloc dequant_allocator(ctx.pool(),
|
||||
ggml_nelements(src0) * ggml_type_size(dst->type));
|
||||
acl_tensor_ptr acl_weight = ggml_cann_create_tensor(src0->data, ACL_INT8, sizeof(int8_t),
|
||||
weight_ne, weight_nb, GGML_MAX_DIMS + 1);
|
||||
acl_tensor_ptr acl_scale = ggml_cann_create_tensor(
|
||||
src0->data, ACL_FLOAT16, sizeof(uint16_t), scale_ne, scale_nb,
|
||||
GGML_MAX_DIMS + 1, ACL_FORMAT_ND, scale_offset);
|
||||
acl_tensor_ptr acl_dequant = ggml_cann_create_tensor(
|
||||
dequant_allocator.get(), ggml_cann_type_mapping(dst->type),
|
||||
ggml_type_size(dst->type), dequant_ne, dequant_nb, GGML_MAX_DIMS + 1);
|
||||
aclnn_mul(ctx, acl_weight.get(), acl_scale.get(), acl_dequant.get());
|
||||
|
||||
// Reinterpret dequant buffer as 4D [src0->ne] with contiguous strides.
|
||||
dequant_ne = src0->ne;
|
||||
dequant_nb[0] = ggml_type_size(dst->type);
|
||||
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||
dequant_nb[i] = dequant_nb[i - 1] * src0->ne[i - 1];
|
||||
}
|
||||
aclnn_index_select_4d(ctx, dequant_buffer_allocator.get(), dequant_ne, dequant_nb, dst->data, dst->ne,
|
||||
dst->nb, src1, dst->type);
|
||||
gather_batched(dequant_allocator.get(),
|
||||
ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
|
||||
dequant_nb);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
@@ -1883,31 +2046,70 @@ void ggml_cann_get_rows(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
||||
}
|
||||
|
||||
void ggml_cann_set_rows(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
||||
ggml_tensor * src0 = dst->src[0]; // src
|
||||
ggml_tensor * src1 = dst->src[1]; // index
|
||||
ggml_tensor * src0 = dst->src[0]; // source values
|
||||
ggml_tensor * src1 = dst->src[1]; // row indices
|
||||
|
||||
// n_idx: number of source rows to scatter per batch slice.
|
||||
// ggml guarantees: src0->ne[1] == src1->ne[0].
|
||||
const int64_t n_idx = src1->ne[0];
|
||||
|
||||
// Copy n_idx rows of src [ne0, n_idx] into dst [ne0, ne1] at positions given by a 1D index.
|
||||
// ggml_cann_create_tensor reverses dims, so ACL sees [ne1, ne0] for dst.
|
||||
// InplaceIndexCopy with dim=0 copies along ACL dim-0 == ggml ne[1] (the row axis).
|
||||
// src_nb: the 4 strides of the source buffer (nb[0..1] for the 2D slice shape,
|
||||
// nb[2..3] for computing per-batch-slice base pointer offsets).
|
||||
auto scatter_batched = [&](void * src_base, aclDataType acl_type, size_t type_size,
|
||||
const size_t * src_nb) {
|
||||
int64_t d_ne[2] = { dst->ne[0], dst->ne[1] };
|
||||
size_t d_nb[2] = { dst->nb[0], dst->nb[1] };
|
||||
int64_t s_ne[2] = { dst->ne[0], n_idx };
|
||||
size_t s_nb_2d[2] = { src_nb[0], src_nb[1] };
|
||||
int64_t i_ne[1] = { n_idx };
|
||||
size_t i_nb[1] = { (size_t)ggml_element_size(src1) };
|
||||
|
||||
for (int64_t i3 = 0; i3 < dst->ne[3]; i3++) {
|
||||
for (int64_t i2 = 0; i2 < dst->ne[2]; i2++) {
|
||||
acl_tensor_ptr acl_dst = ggml_cann_create_tensor(
|
||||
(char *)dst->data + i3 * dst->nb[3] + i2 * dst->nb[2],
|
||||
acl_type, type_size, d_ne, d_nb, 2);
|
||||
acl_tensor_ptr acl_idx = ggml_cann_create_tensor(
|
||||
(char *)src1->data + (i3 % src1->ne[2]) * src1->nb[2] + (i2 % src1->ne[1]) * src1->nb[1],
|
||||
ggml_cann_type_mapping(src1->type), (size_t)ggml_element_size(src1),
|
||||
i_ne, i_nb, 1);
|
||||
acl_tensor_ptr acl_src = ggml_cann_create_tensor(
|
||||
(char *)src_base + i3 * src_nb[3] + i2 * src_nb[2],
|
||||
acl_type, type_size, s_ne, s_nb_2d, 2);
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceIndexCopy, acl_dst.get(), 0, acl_idx.get(), acl_src.get());
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
switch (dst->type) {
|
||||
case GGML_TYPE_F32:
|
||||
{
|
||||
aclnn_index_copy_4d(ctx, src0->data, src0->ne, src0->nb, dst->data, dst->ne, dst->nb, src1, dst->type);
|
||||
break;
|
||||
}
|
||||
scatter_batched(src0->data,
|
||||
ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
|
||||
src0->nb);
|
||||
break;
|
||||
case GGML_TYPE_F16:
|
||||
case GGML_TYPE_BF16:
|
||||
{
|
||||
acl_tensor_ptr acl_src0 = ggml_cann_create_tensor(src0);
|
||||
ggml_cann_pool_alloc src_buffer_allocator(ctx.pool(), ggml_nelements(src0) * sizeof(uint16_t));
|
||||
void * src_trans_buffer = src_buffer_allocator.get();
|
||||
size_t src_trans_nb[GGML_MAX_DIMS];
|
||||
src_trans_nb[0] = sizeof(uint16_t);
|
||||
// Cast src0 (F32) to dst type first.
|
||||
ggml_cann_pool_alloc src_cast_allocator(ctx.pool(),
|
||||
ggml_nelements(src0) * ggml_type_size(dst->type));
|
||||
size_t src_cast_nb[GGML_MAX_DIMS];
|
||||
src_cast_nb[0] = ggml_type_size(dst->type);
|
||||
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||
src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
|
||||
src_cast_nb[i] = src_cast_nb[i - 1] * src0->ne[i - 1];
|
||||
}
|
||||
acl_tensor_ptr src_trans_tensor = ggml_cann_create_tensor(
|
||||
src_trans_buffer, ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type), src0->ne, src_trans_nb, GGML_MAX_DIMS);
|
||||
aclnn_cast(ctx, acl_src0.get(), src_trans_tensor.get(), ggml_cann_type_mapping(dst->type));
|
||||
aclnn_index_copy_4d(ctx, src_trans_buffer, src0->ne, src_trans_nb, dst->data, dst->ne, dst->nb, src1,
|
||||
dst->type);
|
||||
acl_tensor_ptr acl_src0 = ggml_cann_create_tensor(src0);
|
||||
acl_tensor_ptr acl_src_cast = ggml_cann_create_tensor(
|
||||
src_cast_allocator.get(), ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
|
||||
src0->ne, src_cast_nb, GGML_MAX_DIMS);
|
||||
aclnn_cast(ctx, acl_src0.get(), acl_src_cast.get(), ggml_cann_type_mapping(dst->type));
|
||||
|
||||
scatter_batched(src_cast_allocator.get(),
|
||||
ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
|
||||
src_cast_nb);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
@@ -3268,29 +3470,50 @@ void ggml_cann_pad_reflect_1d(ggml_backend_cann_context & ctx, ggml_tensor * dst
|
||||
int64_t paddingsArray[2] = { opts[0], opts[1] };
|
||||
acl_int_array_ptr paddings = ggml_cann_create_int_array(paddingsArray, 2);
|
||||
|
||||
for (int64_t i = 0; i < src0->ne[3]; i++) {
|
||||
acl_tensor_ptr acl_src =
|
||||
ggml_cann_create_tensor((char *) src0->data + i * src0->ne[3], ggml_cann_type_mapping(src0->type),
|
||||
ggml_element_size(src0), src0->ne, src0->nb, 3);
|
||||
// Collapsing ne[2]*ne[3] into a single batch dimension requires that dim3
|
||||
// is contiguous with respect to dim2 in both src and dst.
|
||||
GGML_ASSERT(src0->nb[3] == src0->nb[2] * src0->ne[2]);
|
||||
GGML_ASSERT(dst->nb[3] == dst->nb[2] * dst->ne[2]);
|
||||
|
||||
acl_tensor_ptr acl_dst =
|
||||
ggml_cann_create_tensor((char *) dst->data + i * src0->ne[3], ggml_cann_type_mapping(dst->type),
|
||||
ggml_element_size(dst), dst->ne, dst->nb, 3);
|
||||
int64_t src_ne_3d[3] = { src0->ne[0], src0->ne[1], src0->ne[2] * src0->ne[3] };
|
||||
int64_t dst_ne_3d[3] = { dst->ne[0], dst->ne[1], dst->ne[2] * dst->ne[3] };
|
||||
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, ReflectionPad1d, acl_src.get(), paddings.get(), acl_dst.get());
|
||||
}
|
||||
acl_tensor_ptr acl_src = ggml_cann_create_tensor(src0->data, ggml_cann_type_mapping(src0->type),
|
||||
ggml_element_size(src0), src_ne_3d, src0->nb, 3);
|
||||
|
||||
acl_tensor_ptr acl_dst = ggml_cann_create_tensor(dst->data, ggml_cann_type_mapping(dst->type),
|
||||
ggml_element_size(dst), dst_ne_3d, dst->nb, 3);
|
||||
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, ReflectionPad1d, acl_src.get(), paddings.get(), acl_dst.get());
|
||||
}
|
||||
|
||||
/**
 * Counts the positions at which dst->src[0] and dst->src[1] hold equal values
 * and stores the count in dst.
 *
 * Steps:
 *   1. EqTensor writes element-wise equality (0 or 1) into a temporary buffer
 *      laid out contiguously with src0's shape and element type.
 *   2. ReduceSum over all four dimensions (keep-dims enabled) accumulates the
 *      0/1 values into dst, using dst's type as the reduction dtype.
 *
 * Neither input is modified. The earlier implementation compared in place
 * (InplaceEqTensor), which overwrote src0, and then delegated to
 * ggml_cann_sum; both calls were redundant with the steps above and have been
 * removed.
 *
 * @param ctx The CANN context used to issue the operators and allocate scratch.
 * @param dst The destination tensor; dst->src[0] and dst->src[1] are compared.
 */
void ggml_cann_count_equal(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
    ggml_tensor * src0 = dst->src[0];
    ggml_tensor * src1 = dst->src[1];

    // Scratch buffer for the equality mask. It uses src0's element type so
    // ReduceSum can consume it directly without an intermediate cast.
    const size_t         elem_size = ggml_element_size(src0);
    ggml_cann_pool_alloc eq_alloc(ctx.pool(), ggml_nelements(src0) * elem_size);

    // Contiguous strides for a tensor with src0's shape.
    size_t eq_nb[GGML_MAX_DIMS];
    eq_nb[0] = elem_size;
    for (int i = 1; i < GGML_MAX_DIMS; i++) {
        eq_nb[i] = eq_nb[i - 1] * src0->ne[i - 1];
    }
    acl_tensor_ptr acl_eq = ggml_cann_create_tensor(eq_alloc.get(), ggml_cann_type_mapping(src0->type), elem_size,
                                                    src0->ne, eq_nb, GGML_MAX_DIMS);

    acl_tensor_ptr acl_self  = ggml_cann_create_tensor(src0);
    acl_tensor_ptr acl_other = ggml_cann_create_tensor(src1);
    GGML_CANN_CALL_ACLNN_OP(ctx, EqTensor, acl_self.get(), acl_other.get(), acl_eq.get());

    // Sum the 0/1 values into dst.
    acl_tensor_ptr    acl_dst  = ggml_cann_create_tensor(dst);
    int64_t           dims[4]  = { 0, 1, 2, 3 };
    acl_int_array_ptr dims_arr = ggml_cann_create_int_array(dims, 4);
    GGML_CANN_CALL_ACLNN_OP(ctx, ReduceSum, acl_eq.get(), dims_arr.get(), true, ggml_cann_type_mapping(dst->type),
                            acl_dst.get());
}
|
||||
|
||||
void ggml_cann_step(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
||||
@@ -3306,6 +3529,27 @@ void ggml_cann_step(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, GtScalar, acl_src.get(), alpha.get(), acl_dst.get());
|
||||
}
|
||||
|
||||
/**
 * Applies the aclnn Softplus operator to dst->src[0] and writes the result to dst.
 *
 * The operator is invoked with beta = 1.0 and threshold = 20.0, both passed as
 * ACL_FLOAT scalars.
 * NOTE(review): these presumably follow the usual softplus convention
 * (log(1 + exp(beta * x)) / beta, linear above the threshold) - confirm against
 * the aclnnSoftplus documentation.
 *
 * @param ctx The CANN context used to issue the operator.
 * @param dst The destination tensor; dst->src[0] is the input.
 */
void ggml_cann_softplus(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
    acl_tensor_ptr acl_in  = ggml_cann_create_tensor(dst->src[0]);
    acl_tensor_ptr acl_out = ggml_cann_create_tensor(dst);

    // Named locals are required: the scalars are created from their addresses.
    float softplus_beta      = 1.0f;
    float softplus_threshold = 20.0f;

    acl_scalar_ptr acl_beta      = ggml_cann_create_scalar(&softplus_beta, ACL_FLOAT);
    acl_scalar_ptr acl_threshold = ggml_cann_create_scalar(&softplus_threshold, ACL_FLOAT);

    GGML_CANN_CALL_ACLNN_OP(ctx, Softplus, acl_in.get(), acl_beta.get(), acl_threshold.get(), acl_out.get());
}
|
||||
|
||||
/**
 * Gated "quick GELU" unit.
 *
 * Delegates the gating to ggml_cann_op_unary_gated, supplying a callback that
 * runs the aclnn GeluV2 operator with its mode argument set to 0.
 *
 * @param ctx The CANN context used to issue the operators.
 * @param dst The destination tensor of the gated op.
 */
void ggml_cann_geglu_quick(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
    // Activation callback handed to the generic gated-unary helper.
    auto apply_gelu = [](ggml_backend_cann_context & op_ctx, aclTensor * act_in, aclTensor * act_out) {
        GGML_CANN_CALL_ACLNN_OP(op_ctx, GeluV2, act_in, 0, act_out);
    };

    ggml_cann_op_unary_gated(apply_gelu, ctx, dst);
}
|
||||
|
||||
/**
|
||||
* @brief Performs expert-specific matrix multiplication (MoE) with
|
||||
* floating-point precision using the CANN backend.
|
||||
@@ -3892,46 +4136,65 @@ void ggml_cann_flash_attn_ext(ggml_backend_cann_context & ctx, ggml_tensor * dst
|
||||
}
|
||||
|
||||
/**
 * Floating-point outer product: for every batch slice, dst = src0 @ src1^T,
 * i.e. dst[i, j] = sum_k src0[i, k] * src1[j, k].
 *
 * Shapes (ggml order):
 *   src0: [ne00 = m, ne01 = K, ne02, ne03]
 *   src1: [ne10 = n, ne11 = K, ne12, ne13]
 *   dst : [ne0  = m, ne1  = n, ne2,  ne3 ]
 *
 * ggml_cann_create_tensor reverses the dimension order, so ACL sees
 *   src0 slice: ggml[m, K] -> ACL[K, m]
 *   dst  slice: ggml[m, n] -> ACL[n, m]
 * A transposed view of src1 is built by swapping its first two extents and
 * strides: ggml[K, n] -> ACL[n, K]. Then
 *   Matmul(src1_t [n, K], src0 [K, m]) = [n, m]
 * which is exactly the dst slice.
 *
 * Each dst slice is produced by one Matmul that writes the whole slice, so no
 * prior zero-fill or per-row accumulation is needed. This replaces the earlier
 * Ger + InplaceAdd loop, which allocated a scratch buffer and an unreleased
 * aclScalar on every iteration.
 *
 * The batch loop is explicit because src0 may have fewer batch slices than dst
 * (ne02 <= ne2, ne03 <= ne3); slices are reused in blocks of ne2 / ne02 and
 * ne3 / ne03, which standard Matmul broadcasting does not express.
 *
 * @param ctx The CANN context used to issue the operator.
 * @param dst The destination tensor; dst->src[0] and dst->src[1] are the operands.
 */
static void ggml_cann_out_prod_fp(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
    ggml_tensor * src0 = dst->src[0];  // weight [ne00=m, ne01=K, ne02, ne03]
    ggml_tensor * src1 = dst->src[1];  // input  [ne10=n, ne11=K, ne12, ne13]
    GGML_TENSOR_BINARY_OP_LOCALS

    const aclDataType src0_acl_type = ggml_cann_type_mapping(src0->type);
    const aclDataType src1_acl_type = ggml_cann_type_mapping(src1->type);
    const aclDataType dst_acl_type  = ggml_cann_type_mapping(dst->type);
    const size_t      src0_type_sz  = ggml_type_size(src0->type);
    const size_t      src1_type_sz  = ggml_type_size(src1->type);
    const size_t      dst_type_sz   = ggml_type_size(dst->type);

    // Number of consecutive dst slices that share one src0 slice.
    const int64_t dps2 = ne2 / ne02;
    const int64_t dps3 = ne3 / ne03;

    // 2D slice descriptions; identical for every batch index.
    int64_t src0_ne[2]   = { ne00, ne01 };  // ggml [m, K] -> ACL [K, m]
    size_t  src0_nb[2]   = { nb00, nb01 };
    int64_t src1_t_ne[2] = { ne11, ne10 };  // swapped: ggml [K, n] -> ACL [n, K]
    size_t  src1_t_nb[2] = { nb11, nb10 };
    int64_t dst_ne[2]    = { ne0, ne1 };    // ggml [m, n] -> ACL [n, m]
    size_t  dst_nb[2]    = { nb0, nb1 };

    for (int64_t i3 = 0; i3 < ne3; i3++) {
        for (int64_t i2 = 0; i2 < ne2; i2++) {
            const int64_t i02 = i2 / dps2;
            const int64_t i03 = i3 / dps3;

            // src0 slice at [i02, i03].
            acl_tensor_ptr acl_src0_s = ggml_cann_create_tensor((char *) src0->data + i02 * nb02 + i03 * nb03,
                                                                src0_acl_type, src0_type_sz, src0_ne, src0_nb, 2);

            // Transposed src1 slice at [i2, i3].
            acl_tensor_ptr acl_src1_t = ggml_cann_create_tensor((char *) src1->data + i2 * nb12 + i3 * nb13,
                                                                src1_acl_type, src1_type_sz, src1_t_ne, src1_t_nb, 2);

            // dst slice at [i2, i3].
            acl_tensor_ptr acl_dst_s = ggml_cann_create_tensor((char *) dst->data + i2 * nb2 + i3 * nb3,
                                                               dst_acl_type, dst_type_sz, dst_ne, dst_nb, 2);

            // Matmul(src1_t [n, K], src0 [K, m]) = [n, m] = acl_dst_s
            GGML_CANN_CALL_ACLNN_OP(ctx, Matmul, acl_src1_t.get(), acl_src0_s.get(), acl_dst_s.get(), (int8_t) 1);
        }
    }
}
|
||||
@@ -4170,3 +4433,4 @@ void ggml_cann_gated_linear_attn(ggml_backend_cann_context & ctx, ggml_tensor *
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -32,6 +32,9 @@
|
||||
#include <aclnnop/aclnn_cat.h>
|
||||
#include <aclnnop/aclnn_clamp.h>
|
||||
#include <aclnnop/aclnn_cos.h>
|
||||
#include <aclnnop/aclnn_cumsum.h>
|
||||
#include <aclnnop/aclnn_tril.h>
|
||||
#include <aclnnop/aclnn_triu.h>
|
||||
#include <aclnnop/aclnn_exp.h>
|
||||
#include <aclnnop/aclnn_gelu.h>
|
||||
#include <aclnnop/aclnn_gelu_v2.h>
|
||||
@@ -47,6 +50,9 @@
|
||||
#include <aclnnop/aclnn_sign.h>
|
||||
#include <aclnnop/aclnn_silu.h>
|
||||
#include <aclnnop/aclnn_sin.h>
|
||||
#include <aclnnop/aclnn_softplus.h>
|
||||
#include <aclnnop/aclnn_swi_glu.h>
|
||||
#include <aclnnop/aclnn_geglu.h>
|
||||
#include <aclnnop/aclnn_slice.h>
|
||||
#include <aclnnop/aclnn_sqrt.h>
|
||||
#include <aclnnop/aclnn_tanh.h>
|
||||
@@ -69,6 +75,9 @@
|
||||
*/
|
||||
void ggml_cann_repeat(ggml_backend_cann_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_cann_swiglu(ggml_backend_cann_context & ctx, ggml_tensor * dst);
|
||||
void ggml_cann_geglu(ggml_backend_cann_context & ctx, ggml_tensor * dst, int64_t approximate);
|
||||
|
||||
/**
|
||||
* @brief Applies the Leaky ReLU activation function to a tensor using the CANN
|
||||
* backend.
|
||||
@@ -325,6 +334,48 @@ void ggml_cann_sum_rows(ggml_backend_cann_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_cann_sum(ggml_backend_cann_context & ctx, ggml_tensor * dst);
|
||||
|
||||
/**
|
||||
* @brief Computes the cumulative sum of a ggml tensor along dim 0 using the
|
||||
* CANN backend.
|
||||
*
|
||||
* @param ctx The CANN context used for operations.
|
||||
* @param dst The destination tensor. dst->op is `GGML_OP_CUMSUM`.
|
||||
*/
|
||||
void ggml_cann_cumsum(ggml_backend_cann_context & ctx, ggml_tensor * dst);
|
||||
|
||||
/**
|
||||
* @brief Computes a triangular mask (tril/triu) of a square ggml tensor
|
||||
* using the CANN backend.
|
||||
*
|
||||
* @param ctx The CANN context used for operations.
|
||||
* @param dst The destination tensor. dst->op is `GGML_OP_TRI`.
|
||||
*/
|
||||
void ggml_cann_tri(ggml_backend_cann_context & ctx, ggml_tensor * dst);
|
||||
|
||||
/**
|
||||
* @brief Solves a triangular linear system AX=B using the CANN backend.
|
||||
*
|
||||
* @param ctx The CANN context used for operations.
|
||||
* @param dst The destination tensor. dst->op is `GGML_OP_SOLVE_TRI`.
|
||||
*/
|
||||
void ggml_cann_solve_tri(ggml_backend_cann_context & ctx, ggml_tensor * dst);
|
||||
|
||||
/**
|
||||
* @brief Creates a diagonal matrix from a vector using the CANN backend.
|
||||
*
|
||||
* @param ctx The CANN context used for operations.
|
||||
* @param dst The destination tensor. dst->op is `GGML_OP_DIAG`.
|
||||
*/
|
||||
void ggml_cann_diag(ggml_backend_cann_context & ctx, ggml_tensor * dst);
|
||||
|
||||
/**
|
||||
* @brief Fills a tensor with a constant scalar value using the CANN backend.
|
||||
*
|
||||
* @param ctx The CANN context used for operations.
|
||||
* @param dst The destination tensor. dst->op is `GGML_OP_FILL`.
|
||||
*/
|
||||
void ggml_cann_fill(ggml_backend_cann_context & ctx, ggml_tensor * dst);
|
||||
|
||||
/**
|
||||
* @brief Upsamples a ggml tensor using nearest neighbor interpolation using
|
||||
* the CANN backend.
|
||||
@@ -461,6 +512,9 @@ void ggml_cann_timestep_embedding(ggml_backend_cann_context & ctx, ggml_tensor *
|
||||
// @see ggml_cann_dup.
|
||||
void ggml_cann_cpy(ggml_backend_cann_context & ctx, ggml_tensor * dst);
|
||||
|
||||
// @see ggml_cann_acc, but copies src1 into dst instead of adding.
|
||||
void ggml_cann_set(ggml_backend_cann_context & ctx, ggml_tensor * dst);
|
||||
|
||||
/**
|
||||
* @brief Computes the softmax activation with optional masking.
|
||||
*
|
||||
@@ -813,6 +867,8 @@ void ggml_cann_count_equal(ggml_backend_cann_context & ctx, ggml_tensor * dst);
|
||||
* dst->op is expected to be `GGML_OP_STEP`.
|
||||
*/
|
||||
void ggml_cann_step(ggml_backend_cann_context & ctx, ggml_tensor * dst);
|
||||
void ggml_cann_softplus(ggml_backend_cann_context & ctx, ggml_tensor * dst);
|
||||
void ggml_cann_geglu_quick(ggml_backend_cann_context & ctx, ggml_tensor * dst);
|
||||
|
||||
/**
|
||||
* @brief Performs the Flash Attention extended operator using the CANN backend.
|
||||
|
||||
@@ -1428,6 +1428,22 @@ static bool ggml_backend_cann_buffer_cpy_tensor(ggml_backend_buffer_t buffer,
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Set a region of a tensor's device memory to a specified value.
|
||||
*
|
||||
* @param buffer The CANN buffer containing the tensor.
|
||||
* @param tensor Pointer to the tensor whose memory will be set.
|
||||
* @param value The value to which each byte in the region will be set.
|
||||
* @param offset Byte offset within the tensor's data to start setting.
|
||||
* @param size Number of bytes to set.
|
||||
*/
|
||||
static void ggml_backend_cann_buffer_memset_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
|
||||
ggml_backend_cann_buffer_context * ctx = (ggml_backend_cann_buffer_context *) buffer->context;
|
||||
|
||||
ggml_cann_set_device(ctx->device);
|
||||
ACL_CHECK(aclrtMemset((char *) tensor->data + offset, size, value, size));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Clear a CANN buffer by setting all its memory to a specified value.
|
||||
*
|
||||
@@ -1454,7 +1470,7 @@ static const ggml_backend_buffer_i ggml_backend_cann_buffer_interface = {
|
||||
/* .free_buffer = */ ggml_backend_cann_buffer_free_buffer,
|
||||
/* .get_base = */ ggml_backend_cann_buffer_get_base,
|
||||
/* .init_tensor = */ ggml_backend_cann_buffer_init_tensor,
|
||||
/* .memset_tensor = */ NULL,
|
||||
/* .memset_tensor = */ ggml_backend_cann_buffer_memset_tensor,
|
||||
/* .set_tensor = */ ggml_backend_cann_buffer_set_tensor,
|
||||
/* .get_tensor = */ ggml_backend_cann_buffer_get_tensor,
|
||||
/* .set_tensor_2d = */ NULL,
|
||||
@@ -1835,6 +1851,9 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context & ctx, struct gg
|
||||
case GGML_UNARY_OP_STEP:
|
||||
ggml_cann_step(ctx, dst);
|
||||
break;
|
||||
case GGML_UNARY_OP_SOFTPLUS:
|
||||
ggml_cann_softplus(ctx, dst);
|
||||
break;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
@@ -1845,20 +1864,16 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context & ctx, struct gg
|
||||
GGML_CANN_CALL_OP_UNARY_GATED(Relu);
|
||||
break;
|
||||
case GGML_GLU_OP_GEGLU:
|
||||
ggml_cann_geglu(ctx, dst, 0); // approximate=0 → tanh
|
||||
break;
|
||||
case GGML_GLU_OP_GEGLU_ERF:
|
||||
// aclnnGelu internally uses the erf-based approximation.
|
||||
GGML_CANN_CALL_OP_UNARY_GATED(Gelu);
|
||||
ggml_cann_geglu(ctx, dst, 1); // approximate=1 → erf
|
||||
break;
|
||||
case GGML_GLU_OP_SWIGLU:
|
||||
GGML_CANN_CALL_OP_UNARY_GATED(Silu);
|
||||
ggml_cann_swiglu(ctx, dst);
|
||||
break;
|
||||
case GGML_GLU_OP_GEGLU_QUICK:
|
||||
{
|
||||
auto lambda = [](ggml_backend_cann_context & ctx, aclTensor * acl_src, aclTensor * acl_dst) {
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, GeluV2, acl_src, 0, acl_dst);
|
||||
};
|
||||
ggml_cann_op_unary_gated(lambda, ctx, dst);
|
||||
}
|
||||
ggml_cann_geglu_quick(ctx, dst);
|
||||
break;
|
||||
default:
|
||||
return false;
|
||||
@@ -1920,6 +1935,9 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context & ctx, struct gg
|
||||
case GGML_OP_CPY:
|
||||
ggml_cann_cpy(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_SET:
|
||||
ggml_cann_set(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_CONT:
|
||||
ggml_cann_dup(ctx, dst);
|
||||
break;
|
||||
@@ -1989,6 +2007,21 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context & ctx, struct gg
|
||||
case GGML_OP_SSM_CONV:
|
||||
ggml_cann_ssm_conv(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_CUMSUM:
|
||||
ggml_cann_cumsum(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_TRI:
|
||||
ggml_cann_tri(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_FILL:
|
||||
ggml_cann_fill(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_DIAG:
|
||||
ggml_cann_diag(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_SOLVE_TRI:
|
||||
ggml_cann_solve_tri(ctx, dst);
|
||||
break;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
@@ -2324,6 +2357,7 @@ static enum ggml_status ggml_backend_cann_graph_compute(ggml_backend_t backend,
|
||||
if (use_cann_graph) {
|
||||
// If no matching graph is found, the graph needs to be recaptured.
|
||||
graph_capture_required = !cann_ctx->graph_lru_cache.find_and_move_to_front(cgraph);
|
||||
|
||||
if (graph_capture_required) {
|
||||
// If no matching graph is found, add a new ACL graph.
|
||||
ggml_cann_graph * new_graph = ggml_cann_graph::create_from_cgraph(cgraph);
|
||||
@@ -2382,6 +2416,7 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, const ggml_ten
|
||||
case GGML_UNARY_OP_SGN:
|
||||
case GGML_UNARY_OP_STEP:
|
||||
case GGML_UNARY_OP_GELU_ERF:
|
||||
case GGML_UNARY_OP_SOFTPLUS:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
@@ -2572,6 +2607,7 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, const ggml_ten
|
||||
case GGML_OP_SUM_ROWS:
|
||||
case GGML_OP_ARGSORT:
|
||||
case GGML_OP_ACC:
|
||||
case GGML_OP_SET:
|
||||
case GGML_OP_GROUP_NORM:
|
||||
return true;
|
||||
case GGML_OP_PAD:
|
||||
@@ -2649,6 +2685,16 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, const ggml_ten
|
||||
}
|
||||
case GGML_OP_SSM_CONV:
|
||||
return true;
|
||||
case GGML_OP_CUMSUM:
|
||||
return op->src[0]->type == GGML_TYPE_F32;
|
||||
case GGML_OP_TRI:
|
||||
return op->src[0]->type == GGML_TYPE_F32;
|
||||
case GGML_OP_FILL:
|
||||
return op->src[0]->type == GGML_TYPE_F32;
|
||||
case GGML_OP_DIAG:
|
||||
return op->src[0]->type == GGML_TYPE_F32;
|
||||
case GGML_OP_SOLVE_TRI:
|
||||
return op->src[0]->type == GGML_TYPE_F32;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
@@ -2700,8 +2746,8 @@ static const ggml_backend_i ggml_backend_cann_interface = {
|
||||
/* .free = */ ggml_backend_cann_free,
|
||||
/* .set_tensor_async = */ ggml_backend_cann_set_tensor_async,
|
||||
/* .get_tensor_async = */ ggml_backend_cann_get_tensor_async,
|
||||
/* .get_tensor_2d_async = */ NULL,
|
||||
/* .set_tensor_2d_async = */ NULL,
|
||||
/* .get_tensor_2d_async = */ NULL,
|
||||
/* .cpy_tensor_async = */ ggml_backend_cann_cpy_tensor_async,
|
||||
/* .synchronize = */ ggml_backend_cann_synchronize,
|
||||
/* .graph_plan_create = */ NULL,
|
||||
|
||||
@@ -485,6 +485,13 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
if (GGML_RV_ZIHINTPAUSE)
|
||||
string(APPEND MARCH_STR "_zihintpause")
|
||||
endif()
|
||||
if (GGML_CPU_RISCV64_SPACEMIT)
|
||||
# `xsmtvdotii' is only required for GCC >= 15.
|
||||
if (CMAKE_C_COMPILER_ID STREQUAL "GNU" AND
|
||||
CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL 15)
|
||||
string(APPEND MARCH_STR "_xsmtvdotii")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
list(APPEND ARCH_FLAGS "-march=${MARCH_STR}" -mabi=lp64d)
|
||||
else()
|
||||
@@ -571,13 +578,13 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
|
||||
# Fetch KleidiAI sources:
|
||||
include(FetchContent)
|
||||
set(KLEIDIAI_COMMIT_TAG "v1.22.0")
|
||||
set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz")
|
||||
set(KLEIDIAI_ARCHIVE_MD5 "54049037570ab0ee0a0d126b2ba5ece1")
|
||||
set(KLEIDIAI_COMMIT_TAG "v1.24.0")
|
||||
set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/releases/download/${KLEIDIAI_COMMIT_TAG}/kleidiai-${KLEIDIAI_COMMIT_TAG}-src.tar.gz")
|
||||
set(KLEIDIAI_RELEASE_ARCHIVE_MD5 "2f02ebe29573d45813e671eb304f2a00")
|
||||
|
||||
set(KLEIDIAI_FETCH_ARGS
|
||||
URL ${KLEIDIAI_DOWNLOAD_URL}
|
||||
URL_HASH MD5=${KLEIDIAI_ARCHIVE_MD5}
|
||||
URL_HASH MD5=${KLEIDIAI_RELEASE_ARCHIVE_MD5}
|
||||
)
|
||||
if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.24")
|
||||
list(APPEND KLEIDIAI_FETCH_ARGS DOWNLOAD_EXTRACT_TIMESTAMP NEW)
|
||||
|
||||
@@ -2005,12 +2005,12 @@ void tinygemm_kernel_amx(int M, int N, int KB, const void * RESTRICT _A, const v
|
||||
const int lda = KB * sizeof(TA);
|
||||
//const int ldb = KB * sizeof(TB);
|
||||
|
||||
static thread_local packed_B_t Tile0[TILE_N * TILE_K];
|
||||
static thread_local packed_B_t Tile1[TILE_N * TILE_K];
|
||||
static thread_local int8_t Tile23[TILE_M * TILE_K];
|
||||
alignas(64) static thread_local packed_B_t Tile0[TILE_N * TILE_K];
|
||||
alignas(64) static thread_local packed_B_t Tile1[TILE_N * TILE_K];
|
||||
alignas(64) static thread_local int8_t Tile23[TILE_M * TILE_K];
|
||||
|
||||
static thread_local int32_t TileC0[TILE_M * TILE_N * 4];
|
||||
static thread_local int32_t TileC1[TILE_M * TILE_N * 4];
|
||||
alignas(64) static thread_local int32_t TileC0[TILE_M * TILE_N * 4];
|
||||
alignas(64) static thread_local int32_t TileC1[TILE_M * TILE_N * 4];
|
||||
|
||||
// double buffering C to interleave avx512 and amx
|
||||
int32_t * C_cur = TileC0;
|
||||
@@ -2187,21 +2187,21 @@ void tinygemm_kernel_amx(int M, int N, int KB, const void * RESTRICT _A, const v
|
||||
const int m1 = std::max(M - TILE_M, 0);
|
||||
//const int lda = KB * sizeof(TA);
|
||||
|
||||
static thread_local int8_t Tile0[TILE_N * TILE_K];
|
||||
static thread_local int8_t Tile1[TILE_N * TILE_K];
|
||||
static thread_local int8_t Tile23[TILE_M * TILE_K];
|
||||
alignas(64) static thread_local int8_t Tile0[TILE_N * TILE_K];
|
||||
alignas(64) static thread_local int8_t Tile1[TILE_N * TILE_K];
|
||||
alignas(64) static thread_local int8_t Tile23[TILE_M * TILE_K];
|
||||
|
||||
// mat mul result for each group
|
||||
static thread_local int32_t Tile4[TILE_M * TILE_N];
|
||||
static thread_local int32_t Tile5[TILE_M * TILE_N];
|
||||
static thread_local int32_t Tile6[TILE_M * TILE_N];
|
||||
static thread_local int32_t Tile7[TILE_M * TILE_N];
|
||||
alignas(64) static thread_local int32_t Tile4[TILE_M * TILE_N];
|
||||
alignas(64) static thread_local int32_t Tile5[TILE_M * TILE_N];
|
||||
alignas(64) static thread_local int32_t Tile6[TILE_M * TILE_N];
|
||||
alignas(64) static thread_local int32_t Tile7[TILE_M * TILE_N];
|
||||
|
||||
// sum of each QK_K block, contains 8 groups, int32
|
||||
static thread_local int32_t Sumi4[TILE_M * TILE_N];
|
||||
static thread_local int32_t Sumi5[TILE_M * TILE_N];
|
||||
static thread_local int32_t Sumi6[TILE_M * TILE_N];
|
||||
static thread_local int32_t Sumi7[TILE_M * TILE_N];
|
||||
alignas(64) static thread_local int32_t Sumi4[TILE_M * TILE_N];
|
||||
alignas(64) static thread_local int32_t Sumi5[TILE_M * TILE_N];
|
||||
alignas(64) static thread_local int32_t Sumi6[TILE_M * TILE_N];
|
||||
alignas(64) static thread_local int32_t Sumi7[TILE_M * TILE_N];
|
||||
|
||||
const int k_group_size = std::is_same<TB, block_q6_K>::value ? 16 : 32;
|
||||
for (int i = 0; i < KB; ++i) {
|
||||
|
||||
@@ -5023,6 +5023,71 @@ void ggml_gemm_q8_0_4x8_q8_0(int n,
|
||||
UNUSED(ncols_interleaved);
|
||||
UNUSED(blocklen);
|
||||
|
||||
#if defined(__aarch64__) && defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
|
||||
if (svcntb() * 8 == 256) {
|
||||
const block_q8_0x4 * b_ptr_base = (const block_q8_0x4 *) vx;
|
||||
|
||||
static const uint32_t idx_arr[8] = {0, 1, 4, 5, 2, 3, 6, 7};
|
||||
svuint32_t idx = svld1(svptrue_b32(), idx_arr);
|
||||
static const uint32_t idx_arr1[8] = {0, 1, 2, 3, 1, 2, 3, 0};
|
||||
svuint32_t idx_sc1 = svld1(svptrue_b32(), idx_arr1);
|
||||
static const uint32_t idx_arr2[8] = {0, 1, 2, 3, 0, 1, 2, 3};
|
||||
svuint32_t idx_sc2 = svld1(svptrue_b32(), idx_arr2);
|
||||
|
||||
for (int y = 0; y < nr; y += 4) {
|
||||
const block_q8_0x4 * a_ptr_base = (const block_q8_0x4 *) vy + (y / 4) * nb;
|
||||
|
||||
for (int x = 0; x < nc; x += ncols_interleaved) {
|
||||
const block_q8_0x4 * b_ptr = b_ptr_base + (x / 4) * nb;
|
||||
const block_q8_0x4 * a_ptr = a_ptr_base;
|
||||
|
||||
svfloat32_t acc_f32_01 = svdup_f32(0);
|
||||
svfloat32_t acc_f32_23 = svdup_f32(0);
|
||||
|
||||
for (int b = 0; b < nb; b++) {
|
||||
|
||||
svint32_t acc_01 = svdup_s32(0);
|
||||
svint32_t acc_23 = svdup_s32(0);
|
||||
|
||||
// Process 4 chunks of 8 positions each
|
||||
for (int chunk = 0; chunk < 4; chunk++) {
|
||||
svint8_t s_a01 = svld1rq_s8(svptrue_b8(), a_ptr->qs + chunk * 32);
|
||||
svint8_t s_a23 = svld1rq_s8(svptrue_b8(), a_ptr->qs + chunk * 32 + 16);
|
||||
svint8_t s_b0123 = svld1_s8(svptrue_b8(), b_ptr->qs + chunk * 32);
|
||||
|
||||
acc_01 = svmmla_s32(acc_01, s_a01, s_b0123);
|
||||
acc_23 = svmmla_s32(acc_23, s_a23, s_b0123);
|
||||
}
|
||||
|
||||
// Reorder outputs from 2×2 tiles to row-major
|
||||
// acc[01] = [r0c0, r0c1, r1c0, r1c1, r0c2, r0c3, r1c2, r1c3]
|
||||
// acc[23] = [r2c0, r2c1, r3c0, r3c1, r2c2, r2c3, r3c2, r3c3]
|
||||
|
||||
svint32_t row01 = svtbl_s32(acc_01, idx);
|
||||
svint32_t row23 = svtbl_s32(acc_23, idx);
|
||||
|
||||
svfloat16_t temp1 = svld1_f16(svptrue_pat_b16(SV_VL4), (const __fp16 *) a_ptr->d);
|
||||
svfloat16_t temp2 = svld1_f16(svptrue_pat_b16(SV_VL4), (const __fp16 *) b_ptr->d);
|
||||
svfloat32_t sv_a_d = svtbl_f32(svcvt_f32_f16_x(svptrue_b32(), svzip1_f16(temp1, temp1)), idx_sc1);
|
||||
svfloat32_t sv_b_d = svtbl_f32(svcvt_f32_f16_x(svptrue_b32(), svzip1_f16(temp2, temp2)), idx_sc2);
|
||||
|
||||
acc_f32_01 = svmla_f32_x(svptrue_b32(), acc_f32_01, svcvt_f32_s32_x(svptrue_b32(), row01), svmul_lane_f32(sv_b_d, sv_a_d, 0));
|
||||
acc_f32_23 = svmla_f32_x(svptrue_b32(), acc_f32_23, svcvt_f32_s32_x(svptrue_b32(), row23), svmul_lane_f32(sv_b_d, sv_a_d, 2));
|
||||
a_ptr++;
|
||||
b_ptr++;
|
||||
}
|
||||
|
||||
svbool_t pg4 = svptrue_pat_b32(SV_VL4);
|
||||
svst1_f32(pg4, s + (y+0) * bs + x, acc_f32_01);
|
||||
svst1_f32(pg4, s + (y+1) * bs + x, svext_f32(acc_f32_01, acc_f32_01, 4));
|
||||
svst1_f32(pg4, s + (y+2) * bs + x, acc_f32_23);
|
||||
svst1_f32(pg4, s + (y+3) * bs + x, svext_f32(acc_f32_23, acc_f32_23, 4));
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
#endif // SVE compile-time end
|
||||
|
||||
#if defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8)
|
||||
const block_q8_0x4 * b_ptr_base = (const block_q8_0x4 *) vx;
|
||||
|
||||
|
||||
@@ -2300,9 +2300,8 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
|
||||
#if defined __AVX2__
|
||||
|
||||
const __m256i m4 = _mm256_set1_epi8(0xF);
|
||||
const __m256i m2 = _mm256_set1_epi8(3);
|
||||
const __m256i m32s = _mm256_set1_epi8(32);
|
||||
const __m256i m3 = _mm256_set1_epi8(3);
|
||||
const __m256i m15 = _mm256_set1_epi8(15);
|
||||
|
||||
__m256 acc = _mm256_setzero_ps();
|
||||
|
||||
@@ -2314,53 +2313,45 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
const uint8_t * GGML_RESTRICT qh = x[i].qh;
|
||||
const int8_t * GGML_RESTRICT q8 = y[i].qs;
|
||||
|
||||
const __m256i q8sums = _mm256_loadu_si256((const __m256i*)y[i].bsums);
|
||||
const __m128i scales = _mm_loadu_si128((const __m128i*)x[i].scales);
|
||||
const __m256i scales_16 = _mm256_cvtepi8_epi16(scales);
|
||||
const __m256i q8sclsub = _mm256_slli_epi32(_mm256_madd_epi16(q8sums, scales_16), 5);
|
||||
|
||||
__m256i sumi = _mm256_setzero_si256();
|
||||
|
||||
int is = 0;
|
||||
|
||||
for (int j = 0; j < QK_K/128; ++j) {
|
||||
|
||||
const __m128i scale_0 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 0));
|
||||
const __m128i scale_1 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 1));
|
||||
const __m128i scale_2 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 2));
|
||||
const __m128i scale_3 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 3));
|
||||
is += 4;
|
||||
|
||||
const __m256i q4bits1 = _mm256_loadu_si256((const __m256i*)q4); q4 += 32;
|
||||
const __m256i q4bits2 = _mm256_loadu_si256((const __m256i*)q4); q4 += 32;
|
||||
const __m256i q4bitsH = _mm256_loadu_si256((const __m256i*)qh); qh += 32;
|
||||
|
||||
const __m256i q4h_0 = _mm256_slli_epi16(_mm256_and_si256(q4bitsH, m2), 4);
|
||||
const __m256i q4h_1 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q4bitsH, 2), m2), 4);
|
||||
const __m256i q4h_2 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q4bitsH, 4), m2), 4);
|
||||
const __m256i q4h_3 = _mm256_slli_epi16(_mm256_and_si256(_mm256_srli_epi16(q4bitsH, 6), m2), 4);
|
||||
const __m256i q4h_0 = _mm256_slli_epi16(_mm256_and_si256(q4bitsH, m3), 4);
|
||||
const __m256i q4h_1 = _mm256_slli_epi16(_mm256_and_si256(q4bitsH, _mm256_set1_epi8(12)), 2);
|
||||
const __m256i q4h_2 = _mm256_and_si256(q4bitsH, _mm256_set1_epi8(48));
|
||||
const __m256i q4h_3 = _mm256_srli_epi16(_mm256_and_si256(q4bitsH, _mm256_set1_epi8(-64)), 2);
|
||||
|
||||
const __m256i q4_0 = _mm256_or_si256(_mm256_and_si256(q4bits1, m4), q4h_0);
|
||||
const __m256i q4_1 = _mm256_or_si256(_mm256_and_si256(q4bits2, m4), q4h_1);
|
||||
const __m256i q4_2 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits1, 4), m4), q4h_2);
|
||||
const __m256i q4_3 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits2, 4), m4), q4h_3);
|
||||
const __m256i q4_0 = _mm256_or_si256(_mm256_and_si256(q4bits1, m15), q4h_0);
|
||||
const __m256i q4_1 = _mm256_or_si256(_mm256_and_si256(q4bits2, m15), q4h_1);
|
||||
const __m256i q4_2 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits1, 4), m15), q4h_2);
|
||||
const __m256i q4_3 = _mm256_or_si256(_mm256_and_si256(_mm256_srli_epi16(q4bits2, 4), m15), q4h_3);
|
||||
|
||||
const __m256i q8_0 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
|
||||
const __m256i q8_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
|
||||
const __m256i q8_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
|
||||
const __m256i q8_3 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
|
||||
|
||||
__m256i q8s_0 = _mm256_maddubs_epi16(m32s, q8_0);
|
||||
__m256i q8s_1 = _mm256_maddubs_epi16(m32s, q8_1);
|
||||
__m256i q8s_2 = _mm256_maddubs_epi16(m32s, q8_2);
|
||||
__m256i q8s_3 = _mm256_maddubs_epi16(m32s, q8_3);
|
||||
|
||||
__m256i p16_0 = _mm256_maddubs_epi16(q4_0, q8_0);
|
||||
__m256i p16_1 = _mm256_maddubs_epi16(q4_1, q8_1);
|
||||
__m256i p16_2 = _mm256_maddubs_epi16(q4_2, q8_2);
|
||||
__m256i p16_3 = _mm256_maddubs_epi16(q4_3, q8_3);
|
||||
|
||||
p16_0 = _mm256_sub_epi16(p16_0, q8s_0);
|
||||
p16_1 = _mm256_sub_epi16(p16_1, q8s_1);
|
||||
p16_2 = _mm256_sub_epi16(p16_2, q8s_2);
|
||||
p16_3 = _mm256_sub_epi16(p16_3, q8s_3);
|
||||
const __m128i scale_0 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 0));
|
||||
const __m128i scale_1 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 1));
|
||||
const __m128i scale_2 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 2));
|
||||
const __m128i scale_3 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 3));
|
||||
is += 4;
|
||||
|
||||
p16_0 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_0), p16_0);
|
||||
p16_1 = _mm256_madd_epi16(_mm256_cvtepi8_epi16(scale_1), p16_1);
|
||||
@@ -2372,6 +2363,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
||||
|
||||
}
|
||||
|
||||
sumi = _mm256_sub_epi32(sumi, q8sclsub);
|
||||
acc = _mm256_fmadd_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi), acc);
|
||||
}
|
||||
|
||||
|
||||
@@ -1245,6 +1245,12 @@ void ggml_compute_forward_mul_mat(
|
||||
const struct ggml_tensor * src0 = dst->src[0];
|
||||
const struct ggml_tensor * src1 = dst->src[1];
|
||||
|
||||
const int32_t hint = ggml_get_op_params_i32(dst, 1);
|
||||
if (hint == GGML_HINT_SRC0_IS_HADAMARD && !params->use_ref) {
|
||||
ggml_compute_forward_fwht(params, dst);
|
||||
return;
|
||||
}
|
||||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS
|
||||
|
||||
const int ith = params->ith;
|
||||
@@ -2959,6 +2965,45 @@ struct ggml_cplan ggml_graph_plan(
|
||||
return cplan;
|
||||
}
|
||||
|
||||
|
||||
// Try to fuse the current node with subsequent nodes for better performance.
|
||||
// Returns the number of nodes skipped by fusion (>=1), or 0 if no fusion was applied.
|
||||
static bool ggml_cpu_disable_fusion = false; // initialized once in ggml_cpu_init(), read-only afterwards
|
||||
|
||||
static int ggml_cpu_try_fuse_ops(
|
||||
const struct ggml_cgraph * cgraph,
|
||||
const int node_n,
|
||||
const struct ggml_compute_params * params,
|
||||
const struct ggml_cplan * cplan) {
|
||||
|
||||
if (ggml_cpu_disable_fusion || cplan->use_ref) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
struct ggml_tensor * node = cgraph->nodes[node_n];
|
||||
|
||||
if (node->op == GGML_OP_RMS_NORM) {
|
||||
// RMS_NORM + MUL fusion
|
||||
const enum ggml_op fuse_ops[] = { GGML_OP_RMS_NORM, GGML_OP_MUL };
|
||||
if (ggml_can_fuse(cgraph, node_n, fuse_ops, 2)) {
|
||||
struct ggml_tensor * mul_node = cgraph->nodes[node_n + 1];
|
||||
const struct ggml_tensor * mul_w = (mul_node->src[0] == node)
|
||||
? mul_node->src[1] : mul_node->src[0];
|
||||
if (node->src[0]->type == GGML_TYPE_F32 &&
|
||||
mul_node->type == GGML_TYPE_F32 &&
|
||||
mul_w->type == GGML_TYPE_F32 &&
|
||||
mul_w->ne[0] == node->ne[0] &&
|
||||
mul_w->nb[0] == sizeof(float)) {
|
||||
|
||||
ggml_compute_forward_rms_norm_mul_fused(params, node, mul_node);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static thread_ret_t ggml_graph_compute_thread(void * data) {
|
||||
struct ggml_compute_state * state = (struct ggml_compute_state *) data;
|
||||
struct ggml_threadpool * tp = state->threadpool;
|
||||
@@ -2995,7 +3040,14 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
||||
continue;
|
||||
}
|
||||
|
||||
ggml_compute_forward(¶ms, node);
|
||||
// TODO: move fused-op detection into ggml_graph_plan so fusion decisions are made once at planning time
|
||||
// Try fused ops, fall back to normal compute
|
||||
const int n_fused = ggml_cpu_try_fuse_ops(cgraph, node_n, ¶ms, cplan);
|
||||
if (n_fused > 0) {
|
||||
node_n += n_fused;
|
||||
} else {
|
||||
ggml_compute_forward(¶ms, node);
|
||||
}
|
||||
|
||||
if (state->ith == 0 && cplan->abort_callback &&
|
||||
cplan->abort_callback(cplan->abort_callback_data)) {
|
||||
@@ -3757,6 +3809,11 @@ void ggml_cpu_init(void) {
|
||||
ggml_init_riscv_arch_features();
|
||||
#endif
|
||||
|
||||
{
|
||||
const char * env = getenv("GGML_CPU_DISABLE_FUSION");
|
||||
ggml_cpu_disable_fusion = (env != NULL && atoi(env) == 1);
|
||||
}
|
||||
|
||||
is_first_call = false;
|
||||
}
|
||||
|
||||
|
||||
@@ -195,8 +195,8 @@ static const struct ggml_backend_i ggml_backend_cpu_i = {
|
||||
/* .free = */ ggml_backend_cpu_free,
|
||||
/* .set_tensor_async = */ NULL,
|
||||
/* .get_tensor_async = */ NULL,
|
||||
/* .get_tensor_2d_async = */ NULL,
|
||||
/* .set_tensor_2d_async = */ NULL,
|
||||
/* .get_tensor_2d_async = */ NULL,
|
||||
/* .cpy_tensor_async = */ NULL,
|
||||
/* .synchronize = */ NULL,
|
||||
/* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create,
|
||||
|
||||
@@ -2321,6 +2321,9 @@ class tinyBLAS_Q0_PPC {
|
||||
}
|
||||
|
||||
void matmul(int64_t m, int64_t n) {
|
||||
#if defined(_AIX) || defined(__BIG_ENDIAN__)
|
||||
mnpack(0, m, 0, n);
|
||||
#else
|
||||
const int64_t mc = 64;
|
||||
const int64_t kc = 64;
|
||||
int64_t nc = 64;
|
||||
@@ -2334,7 +2337,6 @@ class tinyBLAS_Q0_PPC {
|
||||
} else {
|
||||
n_aligned = (n / 64) * 64;
|
||||
}
|
||||
|
||||
if (n_aligned > 0) {
|
||||
if (n_aligned % 64 == 0) nc = 64;
|
||||
else if (n_aligned == n) nc = n;
|
||||
@@ -2352,6 +2354,7 @@ class tinyBLAS_Q0_PPC {
|
||||
} else {
|
||||
mnpack(0, m, 0, n);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
private:
|
||||
@@ -3191,12 +3194,16 @@ class tinyBLAS_PPC {
|
||||
}
|
||||
|
||||
void matmul(int64_t m, int64_t n) {
|
||||
#if defined(_AIX) || defined(__BIG_ENDIAN__)
|
||||
mnpack(0, m, 0, n);
|
||||
#else
|
||||
int64_t mc = 256; int64_t nc = 256; int64_t kc = 256;
|
||||
if (m % mc == 0 && n % nc == 0 && k % kc == 0) {
|
||||
matmul_tiled(m, n, mc, nc, kc);
|
||||
} else {
|
||||
mnpack(0, m, 0, n);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
private:
|
||||
|
||||
@@ -3713,11 +3713,27 @@ void ggml_compute_forward_norm(
|
||||
|
||||
// ggml_compute_forward_group_rms_norm
|
||||
|
||||
// fusion kinds that can be combined with the rms_norm computation in a single pass.
|
||||
// extend this enum when adding new fused variants (e.g. FUSE_ADD, FUSE_MUL_ADD, ...).
|
||||
enum ggml_rms_norm_fuse_op {
|
||||
GGML_RMS_NORM_FUSE_OP_NONE,
|
||||
GGML_RMS_NORM_FUSE_OP_MUL,
|
||||
};
|
||||
|
||||
template <ggml_rms_norm_fuse_op FUSE_OP>
|
||||
static void ggml_compute_forward_rms_norm_f32(
|
||||
const ggml_compute_params * params,
|
||||
ggml_tensor * dst) {
|
||||
ggml_tensor * dst_rms_norm,
|
||||
ggml_tensor * dst_fused = nullptr) {
|
||||
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const ggml_tensor * src0 = dst_rms_norm->src[0];
|
||||
const ggml_tensor * src1 = nullptr;
|
||||
ggml_tensor * dst = dst_rms_norm;
|
||||
|
||||
if constexpr (FUSE_OP == GGML_RMS_NORM_FUSE_OP_MUL) {
|
||||
src1 = (dst_fused->src[0] == dst_rms_norm) ? dst_fused->src[1] : dst_fused->src[0];
|
||||
dst = dst_fused;
|
||||
}
|
||||
|
||||
GGML_ASSERT(ggml_are_same_shape(src0, dst));
|
||||
|
||||
@@ -3726,11 +3742,10 @@ static void ggml_compute_forward_rms_norm_f32(
|
||||
const int ith = params->ith;
|
||||
const int nth = params->nth;
|
||||
|
||||
GGML_TENSOR_UNARY_OP_LOCALS
|
||||
GGML_TENSOR_BINARY_OP_LOCALS
|
||||
|
||||
float eps;
|
||||
memcpy(&eps, dst->op_params, sizeof(float));
|
||||
|
||||
memcpy(&eps, dst_rms_norm->op_params, sizeof(float));
|
||||
GGML_ASSERT(eps >= 0.0f);
|
||||
|
||||
// TODO: optimize
|
||||
@@ -3740,25 +3755,32 @@ static void ggml_compute_forward_rms_norm_f32(
|
||||
const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
|
||||
|
||||
ggml_float sum = 0.0;
|
||||
// worth switching to explicit SIMD?
|
||||
for (int64_t i00 = 0; i00 < ne00; i00++) {
|
||||
sum += (ggml_float)(x[i00] * x[i00]);
|
||||
}
|
||||
|
||||
const float mean = sum/ne00;
|
||||
|
||||
float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3);
|
||||
|
||||
memcpy(y, x, ne00 * sizeof(float));
|
||||
// for (int i00 = 0; i00 < ne00; i00++) {
|
||||
// y[i00] = x[i00];
|
||||
// }
|
||||
|
||||
const float mean = sum/ne00;
|
||||
const float scale = 1.0f/sqrtf(mean + eps);
|
||||
|
||||
// if you hit this, likely you got an inf somewhere earlier
|
||||
assert(scale > 0.0f);
|
||||
|
||||
ggml_vec_scale_f32(ne00, y, scale);
|
||||
float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3);
|
||||
|
||||
if constexpr (FUSE_OP == GGML_RMS_NORM_FUSE_OP_MUL) {
|
||||
const int64_t i11 = i01 % ne11;
|
||||
const int64_t i12 = i02 % ne12;
|
||||
const int64_t i13 = i03 % ne13;
|
||||
const float * w = (float *) ((char *) src1->data + i11*nb11 + i12*nb12 + i13*nb13);
|
||||
|
||||
for (int64_t i00 = 0; i00 < ne00; i00++) {
|
||||
y[i00] = x[i00] * scale * w[i00];
|
||||
}
|
||||
} else {
|
||||
memcpy(y, x, ne00 * sizeof(float));
|
||||
ggml_vec_scale_f32(ne00, y, scale);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -3773,7 +3795,31 @@ void ggml_compute_forward_rms_norm(
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F32:
|
||||
{
|
||||
ggml_compute_forward_rms_norm_f32(params, dst);
|
||||
ggml_compute_forward_rms_norm_f32<GGML_RMS_NORM_FUSE_OP_NONE>(params, dst);
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Fused RMS_NORM + MUL: computes dst = rms_norm(src0) * src1 in a single pass.
|
||||
// This avoids materializing the intermediate rms_norm result in memory.
|
||||
void ggml_compute_forward_rms_norm_mul_fused(
|
||||
const ggml_compute_params * params,
|
||||
ggml_tensor * dst_rms_norm,
|
||||
ggml_tensor * dst_mul) {
|
||||
|
||||
GGML_ASSERT(dst_mul != nullptr);
|
||||
GGML_ASSERT(dst_mul->src[0] == dst_rms_norm || dst_mul->src[1] == dst_rms_norm);
|
||||
|
||||
const ggml_tensor * src0 = dst_rms_norm->src[0];
|
||||
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F32:
|
||||
{
|
||||
ggml_compute_forward_rms_norm_f32<GGML_RMS_NORM_FUSE_OP_MUL>(params, dst_rms_norm, dst_mul);
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
@@ -11212,3 +11258,91 @@ void ggml_compute_forward_opt_step_sgd(const ggml_compute_params * params, ggml_
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_compute_forward_fwht_f32(const ggml_compute_params * params, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const ggml_tensor * src1 = dst->src[1];
|
||||
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS
|
||||
|
||||
const int ith = params->ith;
|
||||
const int nth = params->nth;
|
||||
|
||||
const int64_t n = ne10;
|
||||
GGML_ASSERT((n & (n - 1)) == 0); // must be power of 2
|
||||
|
||||
const int64_t nr = ne11 * ne12 * ne13;
|
||||
const int64_t rows_per_thread = (nr + nth - 1) / nth;
|
||||
const int64_t start_row = ith * rows_per_thread;
|
||||
const int64_t end_row = MIN(start_row + rows_per_thread, nr);
|
||||
|
||||
const float scale = 1.0f / sqrtf((float)n);
|
||||
|
||||
#if defined(GGML_SIMD)
|
||||
const GGML_F32_VEC v_minus_one = GGML_F32_VEC_SET1(-1.0f);
|
||||
#endif
|
||||
|
||||
for (int64_t r = start_row; r < end_row; r++) {
|
||||
const int64_t i13 = r / (ne11 * ne12);
|
||||
const int64_t i12 = (r - i13 * ne11 * ne12) / ne11;
|
||||
const int64_t i11 = r - i13 * ne11 * ne12 - i12 * ne11;
|
||||
|
||||
const float * src_row = (const float *) ((const char *) src1->data + i11 * nb11 + i12 * nb12 + i13 * nb13);
|
||||
float * dst_row = (float *) ((char *) dst->data + i11 * nb1 + i12 * nb2 + i13 * nb3);
|
||||
|
||||
for (int64_t j = 0; j < n; j++) {
|
||||
dst_row[j] = src_row[j] * scale;
|
||||
}
|
||||
|
||||
// Scalar passes
|
||||
#if defined(GGML_SIMD)
|
||||
const int step = GGML_F32_EPR;
|
||||
#else
|
||||
const int step = n;
|
||||
#endif
|
||||
for (int64_t len = 1; len < step && len < n; len <<= 1) {
|
||||
for (int64_t i = 0; i < n; i += 2 * len) {
|
||||
for (int64_t j = 0; j < len; j++) {
|
||||
float u = dst_row[i + j];
|
||||
float v = dst_row[i + len + j];
|
||||
dst_row[i + j] = u + v;
|
||||
dst_row[i + len + j] = u - v;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// SIMD passes using GGML_F32_VEC_* macros for multi-architecture support
|
||||
#if defined(GGML_SIMD)
|
||||
for (int64_t len = step; len < n; len <<= 1) {
|
||||
for (int64_t i = 0; i < n; i += 2 * len) {
|
||||
for (int64_t j = 0; j < len; j += step) {
|
||||
GGML_F32_VEC u = GGML_F32_VEC_LOAD(dst_row + i + j);
|
||||
GGML_F32_VEC v = GGML_F32_VEC_LOAD(dst_row + i + len + j);
|
||||
|
||||
GGML_F32_VEC_STORE(dst_row + i + j, GGML_F32_VEC_ADD(u, v));
|
||||
GGML_F32_VEC_STORE(dst_row + i + len + j, GGML_F32_VEC_FMA(u, v, v_minus_one));
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_compute_forward_fwht(const ggml_compute_params * params, ggml_tensor * dst) {
|
||||
const ggml_tensor * src1 = dst->src[1];
|
||||
|
||||
switch (src1->type) {
|
||||
case GGML_TYPE_F32:
|
||||
{
|
||||
ggml_compute_forward_fwht_f32(params, dst);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
{
|
||||
GGML_ABORT("fatal error - fwht is F32 only");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -44,6 +44,7 @@ void ggml_compute_forward_concat(const struct ggml_compute_params * params, stru
|
||||
void ggml_compute_forward_silu_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_norm(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_rms_norm(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_rms_norm_mul_fused(const struct ggml_compute_params * params, struct ggml_tensor * dst_rms_norm, struct ggml_tensor * dst_mul);
|
||||
void ggml_compute_forward_rms_norm_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_group_norm(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_l2_norm(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
@@ -111,6 +112,7 @@ void ggml_compute_forward_cross_entropy_loss(const struct ggml_compute_params *
|
||||
void ggml_compute_forward_cross_entropy_loss_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_opt_step_adamw(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_mul_mat(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_fwht(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_opt_step_sgd(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
||||
@@ -1036,12 +1036,12 @@ inline static float ggml_gelu_quick_f32(float x) {
|
||||
return x*(1.0f/(1.0f+expf(GELU_QUICK_COEF*x)));
|
||||
}
|
||||
|
||||
//inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
|
||||
// const uint16_t * i16 = (const uint16_t *) x;
|
||||
// for (int i = 0; i < n; ++i) {
|
||||
// y[i] = ggml_table_gelu_quick_f16[i16[i]];
|
||||
// }
|
||||
//}
|
||||
inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
|
||||
const uint16_t * i16 = (const uint16_t *) x;
|
||||
for (int i = 0; i < n; ++i) {
|
||||
y[i] = ggml_table_gelu_quick_f16[i16[i]];
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef GGML_GELU_QUICK_FP16
|
||||
inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
|
||||
@@ -1060,13 +1060,6 @@ inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float *
|
||||
}
|
||||
#endif
|
||||
|
||||
inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
|
||||
for (int i = 0; i < n; ++i) {
|
||||
float v = GGML_CPU_FP16_TO_FP32(x[i]);
|
||||
y[i] = GGML_CPU_FP32_TO_FP16(v*(1.0f/(1.0f+expf(GELU_QUICK_COEF*v))));
|
||||
}
|
||||
}
|
||||
|
||||
// Sigmoid Linear Unit (SiLU) function
|
||||
inline static float ggml_silu_f32(float x) {
|
||||
return x/(1.0f + expf(-x));
|
||||
|
||||
@@ -830,6 +830,18 @@ static __device__ __forceinline__ float ggml_cuda_ue4m3_to_fp32(uint8_t x) {
|
||||
#endif // defined(GGML_USE_HIP) && defined(CDNA3) && defined(FP8_AVAILABLE) && HIP_VERSION >= 60200000
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ uint8_t ggml_cuda_fp32_to_ue4m3(float x) {
|
||||
#if defined(BLACKWELL_MMA_AVAILABLE) // This is used for NVFP4 subblock scale quantizations only
|
||||
if (!(x > 0.0f)) {
|
||||
return 0;
|
||||
}
|
||||
const __nv_fp8_e4m3 xf(x);
|
||||
return xf.__x;
|
||||
#else
|
||||
NO_DEVICE_CODE; // Used only for NVFP4 Scales for Activations, only for Blackwell
|
||||
#endif // defined(BLACKWELL_MMA_AVAILABLE)
|
||||
}
|
||||
|
||||
__device__ __forceinline__ uint8_t ggml_cuda_float_to_fp4_e2m1(float x, float e) {
|
||||
const uint8_t sign_bit = (x < 0.0f) << 3;
|
||||
float ax = fabsf(x) * e;
|
||||
|
||||
@@ -1,96 +1,79 @@
|
||||
#include "concat.cuh"
|
||||
|
||||
// contiguous kernels
|
||||
static __global__ void concat_f32_dim0(const float * x, const float * y, float * dst, const int ne0, const int ne00) {
|
||||
int nidx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
if (nidx >= ne0) {
|
||||
return;
|
||||
}
|
||||
template <int dim>
|
||||
static __global__ void __launch_bounds__(CUDA_CONCAT_BLOCK_SIZE) concat_f32_cont(const float * x,
|
||||
const float * y,
|
||||
float * dst,
|
||||
int64_t ne00,
|
||||
int64_t ne01,
|
||||
int64_t ne02,
|
||||
int64_t ne0,
|
||||
int64_t ne1,
|
||||
int64_t ne2) {
|
||||
static_assert(dim >= 0 && dim <= 2, "dim must be in [0, 2]");
|
||||
|
||||
int offset_dst =
|
||||
nidx +
|
||||
blockIdx.y * ne0 +
|
||||
blockIdx.z * ne0 * gridDim.y;
|
||||
const int64_t n = ne0 * ne1 * ne2;
|
||||
|
||||
if (nidx < ne00) { // src0
|
||||
int offset_src =
|
||||
nidx +
|
||||
blockIdx.y * ne00 +
|
||||
blockIdx.z * ne00 * gridDim.y;
|
||||
dst[offset_dst] = x[offset_src];
|
||||
} else {
|
||||
int offset_src =
|
||||
(nidx - ne00) +
|
||||
blockIdx.y * (ne0 - ne00) +
|
||||
blockIdx.z * (ne0 - ne00) * gridDim.y;
|
||||
dst[offset_dst] = y[offset_src];
|
||||
for (int64_t i = (int64_t) blockIdx.x * blockDim.x + threadIdx.x; i < n; i += (int64_t) blockDim.x * gridDim.x) {
|
||||
if constexpr (dim == 0) {
|
||||
const int64_t row = i / ne0;
|
||||
const int64_t i0 = i - row * ne0;
|
||||
|
||||
if (i0 < ne00) {
|
||||
dst[i] = x[row * ne00 + i0];
|
||||
} else {
|
||||
dst[i] = y[row * (ne0 - ne00) + (i0 - ne00)];
|
||||
}
|
||||
} else if constexpr (dim == 1) {
|
||||
const int64_t dst_plane = ne0 * ne1;
|
||||
const int64_t src0_plane = ne0 * ne01;
|
||||
const int64_t src1_plane = dst_plane - src0_plane;
|
||||
const int64_t i2 = i / dst_plane;
|
||||
const int64_t i01 = i - i2 * dst_plane;
|
||||
|
||||
if (i01 < src0_plane) {
|
||||
dst[i] = x[i2 * src0_plane + i01];
|
||||
} else {
|
||||
dst[i] = y[i2 * src1_plane + (i01 - src0_plane)];
|
||||
}
|
||||
} else {
|
||||
const int64_t src0_size = ne0 * ne1 * ne02;
|
||||
|
||||
if (i < src0_size) {
|
||||
dst[i] = x[i];
|
||||
} else {
|
||||
dst[i] = y[i - src0_size];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static __global__ void concat_f32_dim1(const float * x, const float * y, float * dst, const int ne0, const int ne01) {
|
||||
int nidx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
if (nidx >= ne0) {
|
||||
return;
|
||||
}
|
||||
static void concat_f32_cuda(const float * x,
|
||||
const float * y,
|
||||
float * dst,
|
||||
int64_t ne00,
|
||||
int64_t ne01,
|
||||
int64_t ne02,
|
||||
int64_t ne0,
|
||||
int64_t ne1,
|
||||
int64_t ne2,
|
||||
int dim,
|
||||
cudaStream_t stream) {
|
||||
const int64_t n = ne0 * ne1 * ne2;
|
||||
const int num_blocks = (n + CUDA_CONCAT_BLOCK_SIZE - 1) / CUDA_CONCAT_BLOCK_SIZE;
|
||||
|
||||
int offset_dst =
|
||||
nidx +
|
||||
blockIdx.y * ne0 +
|
||||
blockIdx.z * ne0 * gridDim.y;
|
||||
|
||||
if (blockIdx.y < (unsigned)ne01) { // src0
|
||||
int offset_src =
|
||||
nidx +
|
||||
blockIdx.y * ne0 +
|
||||
blockIdx.z * ne0 * ne01;
|
||||
dst[offset_dst] = x[offset_src];
|
||||
} else {
|
||||
int offset_src =
|
||||
nidx +
|
||||
(blockIdx.y - ne01) * ne0 +
|
||||
blockIdx.z * ne0 * (gridDim.y - ne01);
|
||||
dst[offset_dst] = y[offset_src];
|
||||
}
|
||||
}
|
||||
|
||||
static __global__ void concat_f32_dim2(const float * x, const float * y, float * dst, const int ne0, const int ne02) {
|
||||
int nidx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
if (nidx >= ne0) {
|
||||
return;
|
||||
}
|
||||
|
||||
int offset_dst =
|
||||
nidx +
|
||||
blockIdx.y * ne0 +
|
||||
blockIdx.z * ne0 * gridDim.y;
|
||||
|
||||
if (blockIdx.z < (unsigned)ne02) { // src0
|
||||
int offset_src =
|
||||
nidx +
|
||||
blockIdx.y * ne0 +
|
||||
blockIdx.z * ne0 * gridDim.y;
|
||||
dst[offset_dst] = x[offset_src];
|
||||
} else {
|
||||
int offset_src =
|
||||
nidx +
|
||||
blockIdx.y * ne0 +
|
||||
(blockIdx.z - ne02) * ne0 * gridDim.y;
|
||||
dst[offset_dst] = y[offset_src];
|
||||
}
|
||||
}
|
||||
|
||||
static void concat_f32_cuda(const float * x, const float * y, float * dst, int ne00, int ne01, int ne02, int ne0, int ne1, int ne2, int dim, cudaStream_t stream) {
|
||||
int num_blocks = (ne0 + CUDA_CONCAT_BLOCK_SIZE - 1) / CUDA_CONCAT_BLOCK_SIZE;
|
||||
dim3 gridDim(num_blocks, ne1, ne2);
|
||||
if (dim == 0) {
|
||||
concat_f32_dim0<<<gridDim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne00);
|
||||
concat_f32_cont<0>
|
||||
<<<num_blocks, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne00, ne01, ne02, ne0, ne1, ne2);
|
||||
return;
|
||||
}
|
||||
if (dim == 1) {
|
||||
concat_f32_dim1<<<gridDim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne01);
|
||||
concat_f32_cont<1>
|
||||
<<<num_blocks, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne00, ne01, ne02, ne0, ne1, ne2);
|
||||
return;
|
||||
}
|
||||
concat_f32_dim2<<<gridDim, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne0, ne02);
|
||||
concat_f32_cont<2><<<num_blocks, CUDA_CONCAT_BLOCK_SIZE, 0, stream>>>(x, y, dst, ne00, ne01, ne02, ne0, ne1, ne2);
|
||||
}
|
||||
|
||||
// non-contiguous kernel (slow)
|
||||
|
||||
@@ -66,6 +66,9 @@ static constexpr __host__ __device__ fattn_mma_config ggml_cuda_fattn_mma_get_co
|
||||
GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 32, 128, 2, 32, 128, 128, 128, 2, true);
|
||||
GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 64, 128, 2, 32, 128, 128, 128, 2, true);
|
||||
|
||||
GGML_CUDA_FATTN_MMA_CONFIG_CASE(320, 256, 32, 128, 2, 32, 128, 128, 128, 1, false);
|
||||
GGML_CUDA_FATTN_MMA_CONFIG_CASE(320, 256, 64, 256, 1, 32, 128, 128, 128, 1, false);
|
||||
|
||||
GGML_CUDA_FATTN_MMA_CONFIG_CASE(512, 512, 8, 64, 4, 32, 256, 256, 128, 1, false);
|
||||
GGML_CUDA_FATTN_MMA_CONFIG_CASE(512, 512, 16, 64, 4, 32, 256, 256, 128, 1, false);
|
||||
GGML_CUDA_FATTN_MMA_CONFIG_CASE(512, 512, 32, 128, 2, 32, 128, 128, 128, 1, false);
|
||||
@@ -85,6 +88,9 @@ static constexpr __host__ __device__ fattn_mma_config ggml_cuda_fattn_mma_get_co
|
||||
GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 32, 128, 2, 64, 128, 128, 64, 2, true);
|
||||
GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 64, 128, 2, 64, 128, 128, 64, 2, true);
|
||||
|
||||
GGML_CUDA_FATTN_MMA_CONFIG_CASE(320, 256, 32, 128, 2, 32, 128, 128, 128, 1, false);
|
||||
GGML_CUDA_FATTN_MMA_CONFIG_CASE(320, 256, 64, 256, 1, 32, 128, 128, 128, 1, false);
|
||||
|
||||
GGML_CUDA_FATTN_MMA_CONFIG_CASE(512, 512, 8, 64, 4, 32, 96, 64, 128, 1, false);
|
||||
GGML_CUDA_FATTN_MMA_CONFIG_CASE(512, 512, 16, 64, 4, 32, 96, 64, 128, 1, false);
|
||||
GGML_CUDA_FATTN_MMA_CONFIG_CASE(512, 512, 32, 128, 2, 32, 128, 128, 128, 1, false);
|
||||
@@ -118,6 +124,9 @@ static constexpr __host__ __device__ fattn_mma_config ggml_cuda_fattn_mma_get_co
|
||||
GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 32, 128, 2, 64, 128, 128, 64, 2, true);
|
||||
GGML_CUDA_FATTN_MMA_CONFIG_CASE(256, 256, 64, 128, 2, 64, 128, 128, 64, 2, true);
|
||||
|
||||
GGML_CUDA_FATTN_MMA_CONFIG_CASE(320, 256, 32, 128, 2, 64, 160, 128, 64, 2, true);
|
||||
GGML_CUDA_FATTN_MMA_CONFIG_CASE(320, 256, 64, 128, 2, 64, 160, 128, 64, 2, false);
|
||||
|
||||
GGML_CUDA_FATTN_MMA_CONFIG_CASE(512, 512, 16, 64, 4, 32, 128, 128, 128, 1, false);
|
||||
GGML_CUDA_FATTN_MMA_CONFIG_CASE(512, 512, 32, 128, 2, 32, 128, 128, 128, 1, false);
|
||||
GGML_CUDA_FATTN_MMA_CONFIG_CASE(512, 512, 64, 256, 1, 32, 128, 128, 128, 1, false);
|
||||
@@ -1217,7 +1226,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
|
||||
float KQ_max_scale[cols_per_thread];
|
||||
#pragma unroll
|
||||
for (int col = 0; col < cols_per_thread; ++col) {
|
||||
const int jc = cols_per_warp == 8 ? T_C_KQ::get_j(col) : T_C_KQ::get_i(2*col);
|
||||
const int jc = (threadIdx.y/np)*cols_per_warp + (cols_per_warp == 8 ? T_C_KQ::get_j(col) : T_C_KQ::get_i(2*col));
|
||||
const float sink = sinks_f[jc % ncols2];
|
||||
|
||||
const float KQ_max_new = fmaxf(KQ_max[col], sink);
|
||||
@@ -1825,6 +1834,10 @@ extern DECL_FATTN_MMA_F16_CASE(576, 512, 1, 16);
|
||||
extern DECL_FATTN_MMA_F16_CASE(576, 512, 2, 16);
|
||||
extern DECL_FATTN_MMA_F16_CASE(576, 512, 4, 16);
|
||||
|
||||
// Mistral Small 4 (DKQ=320, DV=256), GQA=32-only build:
|
||||
extern DECL_FATTN_MMA_F16_CASE(320, 256, 1, 32);
|
||||
extern DECL_FATTN_MMA_F16_CASE(320, 256, 2, 32);
|
||||
|
||||
// For GLM 4.7 Flash
|
||||
extern DECL_FATTN_MMA_F16_CASE(576, 512, 4, 4);
|
||||
extern DECL_FATTN_MMA_F16_CASE(576, 512, 8, 4);
|
||||
|
||||
@@ -38,6 +38,10 @@ void ggml_cuda_flash_attn_ext_tile(ggml_backend_cuda_context & ctx, ggml_tensor
|
||||
GGML_ASSERT(V->ne[0] == K->ne[0]);
|
||||
ggml_cuda_flash_attn_ext_tile_case<256, 256>(ctx, dst);
|
||||
} break;
|
||||
case 320: {
|
||||
GGML_ASSERT(V->ne[0] == 256);
|
||||
ggml_cuda_flash_attn_ext_tile_case<320, 256>(ctx, dst);
|
||||
} break;
|
||||
case 512: {
|
||||
GGML_ASSERT(V->ne[0] == K->ne[0]);
|
||||
ggml_cuda_flash_attn_ext_tile_case<512, 512>(ctx, dst);
|
||||
|
||||
@@ -68,6 +68,8 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_nv
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 16, 256, 2, 64, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 32, 256, 2, 64, 64)
|
||||
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(320, 256, 16, 256, 2, 64, 64)
|
||||
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 4, 128, 2, 64, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 8, 256, 2, 64, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 16, 256, 2, 64, 64)
|
||||
@@ -128,6 +130,8 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_nv
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 16, 256, 2, 32, 128)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 32, 256, 2, 32, 64)
|
||||
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(320, 256, 16, 256, 2, 32, 64)
|
||||
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 4, 128, 2, 32, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 8, 256, 2, 32, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 16, 256, 2, 32, 64)
|
||||
@@ -195,6 +199,8 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_am
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 16, 256, 2, 32, 128)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 32, 256, 2, 32, 128)
|
||||
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(320, 256, 32, 512, 1, 128, 64)
|
||||
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 4, 128, 2, 64, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 8, 256, 2, 64, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 16, 256, 2, 64, 64)
|
||||
@@ -264,6 +270,8 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_am
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 16, 256, 5, 32, 256)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 32, 256, 3, 64, 128)
|
||||
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(320, 256, 32, 256, 2, 128, 64)
|
||||
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 4, 128, 2, 64, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 8, 256, 2, 64, 64)
|
||||
GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 16, 256, 4, 64, 64)
|
||||
@@ -1116,7 +1124,7 @@ static void launch_fattn_tile_switch_ncols1(ggml_backend_cuda_context & ctx, ggm
|
||||
constexpr size_t nbytes_shared = 0;
|
||||
|
||||
#ifdef GGML_USE_HIP
|
||||
if constexpr (DV <= 128) {
|
||||
if constexpr (DKQ <= 128) {
|
||||
if (Q->ne[1] > 32/ncols2) {
|
||||
constexpr int cols_per_block = 64;
|
||||
const int nwarps = ggml_cuda_fattn_tile_get_nthreads (DKQ, DV, cols_per_block, cc) / warp_size;
|
||||
@@ -1130,7 +1138,7 @@ static void launch_fattn_tile_switch_ncols1(ggml_backend_cuda_context & ctx, ggm
|
||||
#endif // GGML_USE_HIP
|
||||
|
||||
#ifndef GGML_USE_HIP
|
||||
if constexpr (DV <= 256)
|
||||
if constexpr (DKQ <= 256)
|
||||
#endif // GGML_USE_HIP
|
||||
{
|
||||
if (Q->ne[1] > 16/ncols2) {
|
||||
@@ -1144,14 +1152,16 @@ static void launch_fattn_tile_switch_ncols1(ggml_backend_cuda_context & ctx, ggm
|
||||
}
|
||||
}
|
||||
|
||||
if (Q->ne[1] > 8/ncols2) {
|
||||
constexpr int cols_per_block = 16;
|
||||
const int nwarps = ggml_cuda_fattn_tile_get_nthreads (DKQ, DV, cols_per_block, cc) / warp_size;
|
||||
const int nbatch_fa = ggml_cuda_fattn_tile_get_nbatch_fa(DKQ, DV, cols_per_block, cc);
|
||||
fattn_kernel_t fattn_kernel = flash_attn_tile<DKQ, DV, cols_per_block/ncols2, ncols2, use_logit_softcap>;
|
||||
launch_fattn<DV, cols_per_block/ncols2, ncols2>
|
||||
(ctx, dst, fattn_kernel, nwarps, nbytes_shared, nbatch_fa, true, true, false, warp_size);
|
||||
return;
|
||||
if constexpr (ncols2 <= 16) {
|
||||
if (Q->ne[1] > 8/ncols2) {
|
||||
constexpr int cols_per_block = 16;
|
||||
const int nwarps = ggml_cuda_fattn_tile_get_nthreads (DKQ, DV, cols_per_block, cc) / warp_size;
|
||||
const int nbatch_fa = ggml_cuda_fattn_tile_get_nbatch_fa(DKQ, DV, cols_per_block, cc);
|
||||
fattn_kernel_t fattn_kernel = flash_attn_tile<DKQ, DV, cols_per_block/ncols2, ncols2, use_logit_softcap>;
|
||||
launch_fattn<DV, cols_per_block/ncols2, ncols2>
|
||||
(ctx, dst, fattn_kernel, nwarps, nbytes_shared, nbatch_fa, true, true, false, warp_size);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
if constexpr (ncols2 <= 8) {
|
||||
@@ -1210,6 +1220,25 @@ static void launch_fattn_tile_switch_ncols2(ggml_backend_cuda_context & ctx, ggm
|
||||
const int gqa_limit = nvidia && gqa_ratio <= 4 && DV <= 256 ? 16 : INT_MAX;
|
||||
const bool use_gqa_opt = mask && max_bias == 0.0f && Q->ne[1] <= gqa_limit && K->ne[1] % FATTN_KQ_STRIDE == 0;
|
||||
|
||||
if constexpr (DKQ == 320) {
|
||||
// This branch is only used for Mistral Small 4 which has a GQA ratio of 32.
|
||||
// On AMD, simply use that GQA ratio with 32 columns / block since we always have enough SRAM.
|
||||
// On NVIDIA however, the tile kernel is only used for GPUs that can't use the mma kernel (Pascal and older).
|
||||
// Therefore, use a GQA ratio of 16 with 16 columns / block to stay below 48 kiB of SRAM / block.
|
||||
#ifdef GGML_USE_HIP
|
||||
if (use_gqa_opt && gqa_ratio % 32 == 0) {
|
||||
launch_fattn_tile_switch_ncols1<DKQ, DV, 32, use_logit_softcap>(ctx, dst);
|
||||
return;
|
||||
}
|
||||
#else
|
||||
if (use_gqa_opt && gqa_ratio % 16 == 0) {
|
||||
launch_fattn_tile_switch_ncols1<DKQ, DV, 16, use_logit_softcap>(ctx, dst);
|
||||
return;
|
||||
}
|
||||
#endif // GGML_USE_HIP
|
||||
GGML_ABORT("flash-attn tile (320/256): expected GQA ratio multiple of 32");
|
||||
}
|
||||
|
||||
if constexpr (DKQ == 576) {
|
||||
if (use_gqa_opt && gqa_ratio % 16 == 0) {
|
||||
launch_fattn_tile_switch_ncols1<DKQ, DV, 16, use_logit_softcap>(ctx, dst);
|
||||
@@ -1221,7 +1250,7 @@ static void launch_fattn_tile_switch_ncols2(ggml_backend_cuda_context & ctx, ggm
|
||||
}
|
||||
}
|
||||
|
||||
if constexpr (DKQ <= 512) {
|
||||
if constexpr (DKQ <= 512 && DKQ != 320) {
|
||||
if (use_gqa_opt && gqa_ratio % 8 == 0) {
|
||||
launch_fattn_tile_switch_ncols1<DKQ, DV, 8, use_logit_softcap>(ctx, dst);
|
||||
return;
|
||||
@@ -1275,5 +1304,6 @@ extern DECL_FATTN_TILE_CASE( 96, 96);
|
||||
extern DECL_FATTN_TILE_CASE(112, 112);
|
||||
extern DECL_FATTN_TILE_CASE(128, 128);
|
||||
extern DECL_FATTN_TILE_CASE(256, 256);
|
||||
extern DECL_FATTN_TILE_CASE(320, 256);
|
||||
extern DECL_FATTN_TILE_CASE(512, 512);
|
||||
extern DECL_FATTN_TILE_CASE(576, 512);
|
||||
|
||||
@@ -143,6 +143,22 @@ static void ggml_cuda_flash_attn_ext_mma_f16(ggml_backend_cuda_context & ctx, gg
|
||||
GGML_ASSERT(V->ne[0] == 256);
|
||||
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2<256, 256>(ctx, dst);
|
||||
break;
|
||||
case 320:
|
||||
// For Mistral Small 4, go straight to the ncols1 switch (ncols2=32-only build).
|
||||
GGML_ASSERT(V->ne[0] == 256);
|
||||
{
|
||||
float max_bias = 0.0f;
|
||||
memcpy(&max_bias, (const float *) KQV->op_params + 1, sizeof(float));
|
||||
|
||||
const bool use_gqa_opt = mask && max_bias == 0.0f;
|
||||
GGML_ASSERT(use_gqa_opt);
|
||||
GGML_ASSERT(Q->ne[2] % K->ne[2] == 0);
|
||||
const int gqa_ratio = Q->ne[2] / K->ne[2];
|
||||
GGML_ASSERT(gqa_ratio % 32 == 0);
|
||||
|
||||
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<320, 256, 32>(ctx, dst);
|
||||
}
|
||||
break;
|
||||
case 512:
|
||||
GGML_ASSERT(V->ne[0] == 512);
|
||||
ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2<512, 512>(ctx, dst);
|
||||
@@ -352,6 +368,14 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
|
||||
return BEST_FATTN_KERNEL_NONE;
|
||||
}
|
||||
break;
|
||||
case 320:
|
||||
if (V->ne[0] != 256 || !gqa_opt_applies) {
|
||||
return BEST_FATTN_KERNEL_NONE;
|
||||
}
|
||||
if (gqa_ratio % 32 != 0) {
|
||||
return BEST_FATTN_KERNEL_NONE;
|
||||
}
|
||||
break;
|
||||
case 512:
|
||||
if (V->ne[0] != K->ne[0]) {
|
||||
return BEST_FATTN_KERNEL_NONE;
|
||||
|
||||
@@ -6,17 +6,18 @@ template<int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
|
||||
static __global__ void k_get_rows(
|
||||
const void * __restrict__ src0, const int32_t * __restrict__ src1, dst_t * __restrict__ dst,
|
||||
const int64_t ne00, /*const int64_t ne01, const int64_t ne02, const int64_t ne03,*/
|
||||
/*const int64_t ne10,*/ const int64_t ne11, const int64_t ne12, /*const int64_t ne13,*/
|
||||
/*const int64_t ne10,*/ const int64_t ne11, const uint3 ne12_fdv, /*const int64_t ne13,*/
|
||||
/*const size_t s0,*/ const size_t s1, const size_t s2, const size_t s3,
|
||||
/*const size_t nb00,*/ const size_t nb01, const size_t nb02, const size_t nb03,
|
||||
const size_t s10, const size_t s11, const size_t s12/*, const size_t s13*/) {
|
||||
|
||||
for (int64_t z = blockIdx.z; z < ne11*ne12; z += gridDim.z) {
|
||||
for (int64_t z = blockIdx.z; z < ne11*(int64_t)ne12_fdv.z; z += gridDim.z) {
|
||||
for (int64_t i00 = 2*(blockIdx.y*blockDim.x + threadIdx.x); i00 < ne00; i00 += gridDim.y*blockDim.x) {
|
||||
// The x and y dimensions of the grid are swapped because the maximum allowed grid size for x is higher.
|
||||
const int i10 = blockIdx.x;
|
||||
const int i11 = z / ne12; // TODO fastdiv
|
||||
const int i12 = z % ne12;
|
||||
const uint2 dm = fast_div_modulo((uint32_t)z, ne12_fdv);
|
||||
const int i11 = dm.x;
|
||||
const int i12 = dm.y;
|
||||
|
||||
const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
|
||||
|
||||
@@ -42,17 +43,18 @@ template<typename src0_t, typename dst_t>
|
||||
static __global__ void k_get_rows_float(
|
||||
const src0_t * __restrict__ src0, const int32_t * __restrict__ src1, dst_t * __restrict__ dst,
|
||||
const int64_t ne00, /*const int64_t ne01, const int64_t ne02, const int64_t ne03,*/
|
||||
/*const int64_t ne10,*/ const int64_t ne11, const int64_t ne12, /*const int64_t ne13,*/
|
||||
/*const int64_t ne10,*/ const int64_t ne11, const uint3 ne12_fdv, /*const int64_t ne13,*/
|
||||
/*const size_t s0,*/ const size_t s1, const size_t s2, const size_t s3,
|
||||
/*const size_t nb00,*/ const size_t nb01, const size_t nb02, const size_t nb03,
|
||||
const size_t s10, const size_t s11, const size_t s12/*, const size_t s13*/) {
|
||||
|
||||
for (int64_t z = blockIdx.z; z < ne11*ne12; z += gridDim.z) {
|
||||
for (int64_t z = blockIdx.z; z < ne11*(int64_t)ne12_fdv.z; z += gridDim.z) {
|
||||
for (int64_t i00 = blockIdx.y*blockDim.x + threadIdx.x; i00 < ne00; i00 += gridDim.y*blockDim.x) {
|
||||
// The x and y dimensions of the grid are swapped because the maximum allowed grid size for x is higher.
|
||||
const int i10 = blockIdx.x;
|
||||
const int i11 = z / ne12; // TODO fastdiv
|
||||
const int i12 = z % ne12;
|
||||
const uint2 dm = fast_div_modulo((uint32_t)z, ne12_fdv);
|
||||
const int i11 = dm.x;
|
||||
const int i12 = dm.y;
|
||||
|
||||
if (i00 >= ne00) {
|
||||
return;
|
||||
@@ -115,10 +117,14 @@ static void get_rows_cuda_q(
|
||||
|
||||
GGML_ASSERT(ne00 % 2 == 0);
|
||||
|
||||
GGML_ASSERT(ne12 > 0);
|
||||
GGML_ASSERT(ne11 <= std::numeric_limits<uint32_t>::max() / ne12);
|
||||
const uint3 ne12_fdv = init_fastdiv_values(ne12);
|
||||
|
||||
k_get_rows<qk, qr, dq><<<block_nums, block_dims, 0, stream>>>(
|
||||
src0_d, src1_d, dst_d,
|
||||
ne00, /*ne01, ne02, ne03,*/
|
||||
/*ne10,*/ ne11, ne12, /*ne13,*/
|
||||
/*ne10,*/ ne11, ne12_fdv, /*ne13,*/
|
||||
/* s0,*/ s1, s2, s3,
|
||||
/* nb00,*/ nb01, nb02, nb03,
|
||||
s10, s11, s12/*, s13*/);
|
||||
@@ -146,10 +152,14 @@ static void get_rows_cuda_float(
|
||||
const size_t s12 = nb12 / sizeof(int32_t);
|
||||
// const size_t s13 = nb13 / sizeof(int32_t);
|
||||
|
||||
GGML_ASSERT(ne12 > 0);
|
||||
GGML_ASSERT(ne11 <= std::numeric_limits<uint32_t>::max() / ne12);
|
||||
const uint3 ne12_fdv = init_fastdiv_values(ne12);
|
||||
|
||||
k_get_rows_float<<<block_nums, block_dims, 0, stream>>>(
|
||||
src0_d, src1_d, dst_d,
|
||||
ne00, /*ne01, ne02, ne03,*/
|
||||
/*ne10,*/ ne11, ne12, /*ne13,*/
|
||||
/*ne10,*/ ne11, ne12_fdv, /*ne13,*/
|
||||
/* s0,*/ s1, s2, s3,
|
||||
/* nb00,*/ nb01, nb02, nb03,
|
||||
s10, s11, s12/*, s13*/);
|
||||
|
||||
@@ -3556,6 +3556,9 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph,
|
||||
&& unary_ops.size() == 1 && unary_ops.begin()[0] == GGML_UNARY_OP_SILU) {
|
||||
const ggml_tensor * ssm_conv = cgraph->nodes[node_idx];
|
||||
const ggml_tensor * silu = cgraph->nodes[node_idx+1];
|
||||
if (ggml_get_unary_op(silu) != unary_ops.begin()[0]) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (ssm_conv->type != GGML_TYPE_F32 || silu->type != GGML_TYPE_F32) {
|
||||
return false;
|
||||
@@ -3564,6 +3567,31 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph,
|
||||
return true;
|
||||
}
|
||||
|
||||
if (ops.size() == 3 && ops.begin()[0] == GGML_OP_SSM_CONV && ops.begin()[1] == GGML_OP_ADD
|
||||
&& ops.begin()[2] == GGML_OP_UNARY && unary_ops.size() == 1 && unary_ops.begin()[0] == GGML_UNARY_OP_SILU) {
|
||||
const ggml_tensor * ssm_conv = cgraph->nodes[node_idx];
|
||||
const ggml_tensor * add = cgraph->nodes[node_idx+1];
|
||||
const ggml_tensor * silu = cgraph->nodes[node_idx+2];
|
||||
if (ggml_get_unary_op(silu) != unary_ops.begin()[0]) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (ssm_conv->type != GGML_TYPE_F32 || add->type != GGML_TYPE_F32 || silu->type != GGML_TYPE_F32) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// ADD must consume ssm_conv's output and broadcast a 1-D channel-wise bias.
|
||||
const ggml_tensor * bias = (add->src[0] == ssm_conv) ? add->src[1] : add->src[0];
|
||||
if (bias->type != GGML_TYPE_F32 || !ggml_is_contiguous(bias)) {
|
||||
return false;
|
||||
}
|
||||
if (ggml_nelements(bias) != ssm_conv->ne[0] || bias->ne[0] != ssm_conv->ne[0]) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
if (ops.size() == 2 && ops.begin()[0] == GGML_OP_UNARY && ops.begin()[1] == GGML_OP_MUL
|
||||
&& unary_ops.size() == 1 && (unary_ops.begin()[0] == GGML_UNARY_OP_SILU || unary_ops.begin()[0] == GGML_UNARY_OP_SIGMOID || unary_ops.begin()[0] == GGML_UNARY_OP_SOFTPLUS)) {
|
||||
const ggml_tensor * unary = cgraph->nodes[node_idx];
|
||||
@@ -3640,6 +3668,362 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph,
|
||||
return false;
|
||||
}
|
||||
|
||||
// try and fuse nodes and return the number of nodes to skip
|
||||
static int ggml_cuda_try_fuse(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph, int i) {
|
||||
|
||||
static bool disable_fusion = getenv("GGML_CUDA_DISABLE_FUSION") != nullptr && std::atoi(getenv("GGML_CUDA_DISABLE_FUSION"));
|
||||
if (disable_fusion) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
ggml_tensor * node = cgraph->nodes[i];
|
||||
|
||||
//topk-moe
|
||||
if (cgraph->nodes[i]->op == GGML_OP_UNARY || cgraph->nodes[i]->op == GGML_OP_SOFT_MAX ||
|
||||
cgraph->nodes[i]->op == GGML_OP_ARGSORT) {
|
||||
ggml_cuda_topk_moe_args args;
|
||||
const bool can_fuse = ggml_cuda_topk_moe_fusion(cgraph, i, args);
|
||||
std::vector<ggml_op> ops;
|
||||
|
||||
if (can_fuse) {
|
||||
const ggml_tensor * logits = node->src[0];
|
||||
ggml_tensor * weights = nullptr;
|
||||
ggml_tensor * ids = nullptr;
|
||||
const ggml_tensor * bias = nullptr;
|
||||
const ggml_tensor * clamp = nullptr;
|
||||
const ggml_tensor * scale = nullptr;
|
||||
|
||||
if (!args.delayed_softmax) {
|
||||
ggml_op gating_op = args.sigmoid ? GGML_OP_UNARY : GGML_OP_SOFT_MAX;
|
||||
int out_nodes[2]; // nodes which can't be elided
|
||||
|
||||
if (args.prob_bias) {
|
||||
bias = cgraph->nodes[i + 2]->src[1];
|
||||
ops.insert(ops.end(), { gating_op, GGML_OP_RESHAPE, GGML_OP_ADD, GGML_OP_ARGSORT, GGML_OP_VIEW,
|
||||
GGML_OP_GET_ROWS });
|
||||
out_nodes[0] = i + 4;
|
||||
ids = cgraph->nodes[i + 4];
|
||||
} else {
|
||||
ops.insert(ops.end(),
|
||||
{ gating_op, GGML_OP_RESHAPE, GGML_OP_ARGSORT, GGML_OP_VIEW, GGML_OP_GET_ROWS });
|
||||
out_nodes[0] = i + 3;
|
||||
ids = cgraph->nodes[i + 3];
|
||||
}
|
||||
|
||||
if (args.norm) {
|
||||
ops.insert(ops.end(),
|
||||
{ GGML_OP_RESHAPE, GGML_OP_SUM_ROWS, GGML_OP_CLAMP, GGML_OP_DIV, GGML_OP_RESHAPE });
|
||||
clamp = cgraph->nodes[i + ops.size() - 3];
|
||||
}
|
||||
if (args.scale) {
|
||||
ops.insert(ops.end(), { GGML_OP_SCALE });
|
||||
scale = cgraph->nodes[i + ops.size() - 1];
|
||||
}
|
||||
|
||||
weights = cgraph->nodes[i + ops.size() - 1];
|
||||
out_nodes[1] = i + ops.size() - 1;
|
||||
|
||||
if (ggml_can_fuse_subgraph(cgraph, i, ops.size(), ops.data(), out_nodes, 2) &&
|
||||
ggml_cuda_should_use_topk_moe(node, logits, weights, ids) &&
|
||||
ggml_cuda_check_fusion_memory_ranges(cgraph, i, ops.size(), out_nodes, 2, /*is_topk_moe=*/true)) {
|
||||
ggml_cuda_op_topk_moe(*cuda_ctx, logits, weights, ids, clamp, scale, bias, args);
|
||||
return ops.size() - 1;
|
||||
}
|
||||
} else if (!args.norm && !args.prob_bias) {
|
||||
//special case gpt-oss, no norm, no bias.
|
||||
ops.insert(ops.end(), { GGML_OP_ARGSORT, GGML_OP_VIEW, GGML_OP_GET_ROWS, GGML_OP_RESHAPE,
|
||||
GGML_OP_SOFT_MAX, GGML_OP_RESHAPE });
|
||||
weights = cgraph->nodes[i + 5];
|
||||
ids = cgraph->nodes[i + 1];
|
||||
const ggml_tensor * softmax = cgraph->nodes[i + 4];
|
||||
|
||||
int out_nodes[2] = { i + 1, i + 5 };
|
||||
if (ggml_can_fuse_subgraph(cgraph, i, ops.size(), ops.data(), out_nodes, 2) &&
|
||||
ggml_cuda_should_use_topk_moe(softmax, logits, weights, ids) &&
|
||||
ggml_cuda_check_fusion_memory_ranges(cgraph, i, ops.size(), out_nodes, 2, /*is_topk_moe=*/true)) {
|
||||
ggml_cuda_op_topk_moe(*cuda_ctx, logits, weights, ids, clamp, scale, bias, args);
|
||||
return ops.size() - 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//RoPE + view + set-rows
|
||||
if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_ROPE, GGML_OP_VIEW, GGML_OP_SET_ROWS }, {})) {
|
||||
ggml_tensor * rope = cgraph->nodes[i];
|
||||
ggml_tensor * set_rows = cgraph->nodes[i + 2];
|
||||
|
||||
ggml_cuda_op_rope_fused(*cuda_ctx, rope, set_rows);
|
||||
return 2;
|
||||
}
|
||||
|
||||
// multi-(add or mul)
|
||||
if (node->op == GGML_OP_ADD || node->op == GGML_OP_MUL) {
|
||||
int n_fuse = 0;
|
||||
ggml_op ops[8];
|
||||
std::fill(ops, ops + 8, node->op);
|
||||
|
||||
for (; n_fuse <= 6; ++n_fuse) {
|
||||
if (!ggml_can_fuse(cgraph, i + n_fuse, ops + n_fuse, 2)) {
|
||||
break;
|
||||
}
|
||||
if (cgraph->nodes[i + n_fuse] != cgraph->nodes[i + n_fuse + 1]->src[0]) {
|
||||
break;
|
||||
}
|
||||
if (!ggml_are_same_layout(cgraph->nodes[i + n_fuse]->src[1], cgraph->nodes[i + n_fuse + 1]->src[1])) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
n_fuse++;
|
||||
|
||||
if (n_fuse > 1) {
|
||||
ggml_tensor fused_node;
|
||||
memcpy(&fused_node, node, sizeof(ggml_tensor));
|
||||
for (int j = 0; j < n_fuse - 1; ++j) {
|
||||
fused_node.src[j + 2] = cgraph->nodes[i + j + 1]->src[1];
|
||||
}
|
||||
fused_node.data = cgraph->nodes[i + n_fuse - 1]->data;
|
||||
if (node->op == GGML_OP_ADD) {
|
||||
ggml_cuda_op_fused_add(*cuda_ctx, &fused_node, n_fuse);
|
||||
} else {
|
||||
ggml_cuda_op_fused_mul(*cuda_ctx, &fused_node, n_fuse);
|
||||
}
|
||||
return n_fuse - 1;
|
||||
}
|
||||
}
|
||||
|
||||
bool fused_mul_mat_vec = false;
|
||||
int fused_node_count = 0;
|
||||
|
||||
// gate + glu + up
|
||||
for (ggml_op op : { GGML_OP_MUL_MAT, GGML_OP_MUL_MAT_ID }) {
|
||||
const ggml_op bias_op = op == GGML_OP_MUL_MAT ? GGML_OP_ADD : GGML_OP_ADD_ID;
|
||||
|
||||
if (ggml_cuda_can_fuse(cgraph, i, { op, bias_op, op, bias_op, GGML_OP_GLU }, {})) {
|
||||
ggml_tensor * glu = cgraph->nodes[i + 4];
|
||||
ggml_tensor * gate_bias_n = glu->src[0];
|
||||
ggml_tensor * up_bias_n = glu->src[1];
|
||||
|
||||
//we don't assume the order for {gate, up}. Instead infer it from the bias tensor
|
||||
ggml_tensor * gate_n = nullptr;
|
||||
ggml_tensor * up_n = nullptr;
|
||||
|
||||
if (gate_bias_n->src[0] == cgraph->nodes[i] || gate_bias_n->src[1] == cgraph->nodes[i]) {
|
||||
gate_n = cgraph->nodes[i];
|
||||
up_n = cgraph->nodes[i + 2];
|
||||
} else if (gate_bias_n->src[0] == cgraph->nodes[i + 2] || gate_bias_n->src[1] == cgraph->nodes[i + 2]) {
|
||||
gate_n = cgraph->nodes[i + 2];
|
||||
up_n = cgraph->nodes[i];
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
|
||||
auto get_bias_tensor = [](const ggml_tensor * bias_node, const ggml_tensor * mul_node, ggml_op op_bias) {
|
||||
if (op_bias == GGML_OP_ADD) {
|
||||
if (bias_node->src[0] == mul_node) {
|
||||
return bias_node->src[1];
|
||||
}
|
||||
if (bias_node->src[1] == mul_node) {
|
||||
return bias_node->src[0];
|
||||
}
|
||||
return (ggml_tensor *) nullptr;
|
||||
}
|
||||
GGML_ASSERT(op_bias == GGML_OP_ADD_ID);
|
||||
GGML_ASSERT(bias_node->src[0] == mul_node);
|
||||
return bias_node->src[1];
|
||||
};
|
||||
|
||||
ggml_tensor * up_bias_tensor = get_bias_tensor(up_bias_n, up_n, bias_op);
|
||||
ggml_tensor * gate_bias_tensor = get_bias_tensor(gate_bias_n, gate_n, bias_op);
|
||||
|
||||
if (!up_bias_tensor || !gate_bias_tensor) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// we don't support repeating adds
|
||||
if (bias_op == GGML_OP_ADD && (!ggml_are_same_shape(gate_bias_n->src[0], gate_bias_n->src[1]) ||
|
||||
!ggml_are_same_shape(up_bias_n->src[0], up_bias_n->src[1]))) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const ggml_tensor * src0 = up_n->src[0];
|
||||
const ggml_tensor * src1 = up_n->src[1];
|
||||
const ggml_tensor * ids = up_n->src[2];
|
||||
|
||||
if (ggml_cuda_should_fuse_mul_mat_vec_f(up_n)) {
|
||||
ggml_cuda_mm_fusion_args_host fusion_data{};
|
||||
fusion_data.gate = gate_n->src[0];
|
||||
fusion_data.x_bias = up_bias_tensor;
|
||||
fusion_data.gate_bias = gate_bias_tensor;
|
||||
fusion_data.glu_op = ggml_get_glu_op(glu);
|
||||
|
||||
ggml_cuda_mul_mat_vec_f(*cuda_ctx, src0, src1, ids, glu, &fusion_data);
|
||||
fused_mul_mat_vec = true;
|
||||
fused_node_count = 5;
|
||||
break;
|
||||
}
|
||||
|
||||
if (ggml_cuda_should_fuse_mul_mat_vec_q(up_n)) {
|
||||
ggml_cuda_mm_fusion_args_host fusion_data{};
|
||||
fusion_data.gate = gate_n->src[0];
|
||||
fusion_data.x_bias = up_bias_tensor;
|
||||
fusion_data.gate_bias = gate_bias_tensor;
|
||||
fusion_data.glu_op = ggml_get_glu_op(glu);
|
||||
|
||||
ggml_cuda_mul_mat_vec_q(*cuda_ctx, src0, src1, ids, glu, &fusion_data);
|
||||
fused_mul_mat_vec = true;
|
||||
fused_node_count = 5;
|
||||
break;
|
||||
}
|
||||
} else if (ggml_cuda_can_fuse(cgraph, i, { op, op, GGML_OP_GLU }, {})) {
|
||||
ggml_tensor * glu = cgraph->nodes[i + 2];
|
||||
ggml_tensor * gate = glu->src[0];
|
||||
ggml_tensor * up = glu->src[1];
|
||||
|
||||
bool ok = (gate == cgraph->nodes[i] && up == cgraph->nodes[i + 1]) ||
|
||||
(gate == cgraph->nodes[i + 1] && up == cgraph->nodes[i]);
|
||||
|
||||
if (!ok) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const ggml_tensor * src0 = up->src[0];
|
||||
const ggml_tensor * src1 = up->src[1];
|
||||
const ggml_tensor * ids = up->src[2];
|
||||
|
||||
if (ggml_cuda_should_fuse_mul_mat_vec_f(up)) {
|
||||
ggml_cuda_mm_fusion_args_host fusion_data{};
|
||||
fusion_data.gate = gate->src[0];
|
||||
fusion_data.glu_op = ggml_get_glu_op(glu);
|
||||
|
||||
ggml_cuda_mul_mat_vec_f(*cuda_ctx, src0, src1, ids, glu, &fusion_data);
|
||||
fused_mul_mat_vec = true;
|
||||
fused_node_count = 3;
|
||||
break;
|
||||
}
|
||||
|
||||
if (ggml_cuda_should_fuse_mul_mat_vec_q(up)) {
|
||||
ggml_cuda_mm_fusion_args_host fusion_data{};
|
||||
fusion_data.gate = gate->src[0];
|
||||
fusion_data.glu_op = ggml_get_glu_op(glu);
|
||||
|
||||
ggml_cuda_mul_mat_vec_q(*cuda_ctx, src0, src1, ids, glu, &fusion_data);
|
||||
fused_mul_mat_vec = true;
|
||||
fused_node_count = 3;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (fused_mul_mat_vec) {
|
||||
return fused_node_count - 1;
|
||||
}
|
||||
|
||||
fused_mul_mat_vec = false;
|
||||
fused_node_count = 0;
|
||||
|
||||
// gate + add + glu + up + add
|
||||
for (ggml_op op : { GGML_OP_MUL_MAT, GGML_OP_MUL_MAT_ID }) {
|
||||
const ggml_op bias_op = op == GGML_OP_MUL_MAT ? GGML_OP_ADD : GGML_OP_ADD_ID;
|
||||
|
||||
if (!ggml_can_fuse(cgraph, i, { op, bias_op })) {
|
||||
continue;
|
||||
}
|
||||
|
||||
ggml_tensor * mm_node = cgraph->nodes[i];
|
||||
ggml_tensor * bias_node = cgraph->nodes[i + 1];
|
||||
|
||||
ggml_tensor * bias_tensor = nullptr;
|
||||
if (bias_op == GGML_OP_ADD) {
|
||||
if (bias_node->src[0] == mm_node) {
|
||||
bias_tensor = bias_node->src[1];
|
||||
} else if (bias_node->src[1] == mm_node) {
|
||||
bias_tensor = bias_node->src[0];
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
} else {
|
||||
if (bias_node->src[0] != mm_node) {
|
||||
continue;
|
||||
}
|
||||
bias_tensor = bias_node->src[1];
|
||||
}
|
||||
|
||||
const ggml_tensor * src0 = mm_node->src[0];
|
||||
const ggml_tensor * src1 = mm_node->src[1];
|
||||
const ggml_tensor * ids = mm_node->src[2];
|
||||
|
||||
if (bias_op == GGML_OP_ADD_ID && bias_node->src[2] != ids) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (bias_op == GGML_OP_ADD && !ggml_are_same_shape(bias_node->src[0], bias_node->src[1])) {
|
||||
continue;
|
||||
}
|
||||
|
||||
ggml_cuda_mm_fusion_args_host fusion_data{};
|
||||
fusion_data.x_bias = bias_tensor;
|
||||
|
||||
if (ggml_cuda_should_fuse_mul_mat_vec_f(mm_node)) {
|
||||
ggml_cuda_mul_mat_vec_f(*cuda_ctx, src0, src1, ids, bias_node, &fusion_data);
|
||||
fused_mul_mat_vec = true;
|
||||
fused_node_count = 2;
|
||||
break;
|
||||
}
|
||||
|
||||
if (ggml_cuda_should_fuse_mul_mat_vec_q(mm_node)) {
|
||||
ggml_cuda_mul_mat_vec_q(*cuda_ctx, src0, src1, ids, bias_node, &fusion_data);
|
||||
fused_mul_mat_vec = true;
|
||||
fused_node_count = 2;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (fused_mul_mat_vec) {
|
||||
return fused_node_count - 1;
|
||||
}
|
||||
|
||||
if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL, GGML_OP_ADD }, {})) {
|
||||
ggml_cuda_op_rms_norm_fused_add(*cuda_ctx, node, cgraph->nodes[i + 1], cgraph->nodes[i + 2]);
|
||||
return 2;
|
||||
}
|
||||
|
||||
if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL }, {})) {
|
||||
ggml_cuda_op_rms_norm_fused(*cuda_ctx, node, cgraph->nodes[i + 1]);
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_SSM_CONV, GGML_OP_ADD, GGML_OP_UNARY }, { GGML_UNARY_OP_SILU })) {
|
||||
ggml_cuda_op_ssm_conv(*cuda_ctx, node, cgraph->nodes[i + 1], cgraph->nodes[i + 2]);
|
||||
return 2;
|
||||
}
|
||||
|
||||
if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_SSM_CONV, GGML_OP_UNARY }, { GGML_UNARY_OP_SILU })) {
|
||||
ggml_cuda_op_ssm_conv(*cuda_ctx, node, /*bias_add_node=*/ nullptr, cgraph->nodes[i + 1]);
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_UNARY, GGML_OP_MUL }, { GGML_UNARY_OP_SILU }) ||
|
||||
ggml_cuda_can_fuse(cgraph, i, { GGML_OP_UNARY, GGML_OP_MUL }, { GGML_UNARY_OP_SIGMOID }) ||
|
||||
ggml_cuda_can_fuse(cgraph, i, { GGML_OP_UNARY, GGML_OP_MUL }, { GGML_UNARY_OP_SOFTPLUS })) {
|
||||
ggml_cuda_op_unary_mul(*cuda_ctx, node, cgraph->nodes[i + 1]);
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_UNARY, GGML_OP_SQR }, { GGML_UNARY_OP_RELU })) {
|
||||
ggml_cuda_op_relu_sqr(*cuda_ctx, node, cgraph->nodes[i + 1]);
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_SCALE, GGML_OP_UNARY, GGML_OP_SCALE }, { GGML_UNARY_OP_TANH })) {
|
||||
ggml_cuda_op_softcap(*cuda_ctx, cgraph->nodes[i + 2], node);
|
||||
return 2;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void ggml_cuda_graph_evaluate_and_capture(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph, const bool use_cuda_graph, const bool cuda_graph_update_required, const void * graph_key) {
|
||||
bool graph_evaluated_or_captured = false;
|
||||
|
||||
@@ -3786,355 +4170,11 @@ static void ggml_cuda_graph_evaluate_and_capture(ggml_backend_cuda_context * cud
|
||||
continue;
|
||||
}
|
||||
|
||||
// start of fusion operations
|
||||
static bool disable_fusion = (getenv("GGML_CUDA_DISABLE_FUSION") != nullptr);
|
||||
if (!disable_fusion) {
|
||||
ggml_cuda_topk_moe_args args;
|
||||
int nodes_to_skip = ggml_cuda_try_fuse(cuda_ctx, cgraph, i);
|
||||
|
||||
if (cgraph->nodes[i]->op == GGML_OP_UNARY || cgraph->nodes[i]->op == GGML_OP_SOFT_MAX ||
|
||||
cgraph->nodes[i]->op == GGML_OP_ARGSORT) {
|
||||
const bool can_fuse = ggml_cuda_topk_moe_fusion(cgraph, i, args);
|
||||
|
||||
std::vector<ggml_op> ops;
|
||||
|
||||
if (can_fuse) {
|
||||
const ggml_tensor * logits = node->src[0];
|
||||
ggml_tensor * weights = nullptr;
|
||||
ggml_tensor * ids = nullptr;
|
||||
const ggml_tensor * bias = nullptr;
|
||||
const ggml_tensor * clamp = nullptr;
|
||||
const ggml_tensor * scale = nullptr;
|
||||
|
||||
if (!args.delayed_softmax) {
|
||||
ggml_op gating_op = args.sigmoid ? GGML_OP_UNARY : GGML_OP_SOFT_MAX;
|
||||
int out_nodes[2]; // nodes which can't be elided
|
||||
|
||||
if (args.prob_bias) {
|
||||
bias = cgraph->nodes[i + 2]->src[1];
|
||||
ops.insert(ops.end(), { gating_op, GGML_OP_RESHAPE, GGML_OP_ADD, GGML_OP_ARGSORT,
|
||||
GGML_OP_VIEW, GGML_OP_GET_ROWS });
|
||||
out_nodes[0] = i + 4;
|
||||
ids = cgraph->nodes[i + 4];
|
||||
} else {
|
||||
ops.insert(ops.end(), { gating_op, GGML_OP_RESHAPE, GGML_OP_ARGSORT, GGML_OP_VIEW,
|
||||
GGML_OP_GET_ROWS });
|
||||
out_nodes[0] = i + 3;
|
||||
ids = cgraph->nodes[i + 3];
|
||||
}
|
||||
|
||||
if (args.norm) {
|
||||
ops.insert(ops.end(), { GGML_OP_RESHAPE, GGML_OP_SUM_ROWS, GGML_OP_CLAMP,
|
||||
GGML_OP_DIV, GGML_OP_RESHAPE });
|
||||
clamp = cgraph->nodes[i + ops.size() - 3];
|
||||
}
|
||||
if (args.scale) {
|
||||
ops.insert(ops.end(), { GGML_OP_SCALE });
|
||||
scale = cgraph->nodes[i + ops.size() - 1];
|
||||
}
|
||||
|
||||
weights = cgraph->nodes[i + ops.size() - 1];
|
||||
out_nodes[1] = i + ops.size() - 1;
|
||||
|
||||
if (ggml_can_fuse_subgraph(cgraph, i, ops.size(), ops.data(), out_nodes, 2) &&
|
||||
ggml_cuda_should_use_topk_moe(node, logits, weights, ids) &&
|
||||
ggml_cuda_check_fusion_memory_ranges(cgraph, i, ops.size(), out_nodes, 2, /*is_topk_moe=*/ true)) {
|
||||
ggml_cuda_op_topk_moe(*cuda_ctx, logits, weights, ids, clamp, scale, bias, args);
|
||||
i += ops.size() - 1;
|
||||
continue;
|
||||
}
|
||||
} else if (!args.norm && !args.prob_bias) {
|
||||
//special case gpt-oss, no norm, no bias.
|
||||
ops.insert(ops.end(), { GGML_OP_ARGSORT, GGML_OP_VIEW, GGML_OP_GET_ROWS,
|
||||
GGML_OP_RESHAPE, GGML_OP_SOFT_MAX, GGML_OP_RESHAPE });
|
||||
weights = cgraph->nodes[i + 5];
|
||||
ids = cgraph->nodes[i + 1];
|
||||
const ggml_tensor * softmax = cgraph->nodes[i + 4];
|
||||
|
||||
int out_nodes[2] = { i + 1, i + 5 };
|
||||
if (ggml_can_fuse_subgraph(cgraph, i, ops.size(), ops.data(), out_nodes, 2) &&
|
||||
ggml_cuda_should_use_topk_moe(softmax, logits, weights, ids) &&
|
||||
ggml_cuda_check_fusion_memory_ranges(cgraph, i, ops.size(), out_nodes, 2, /*is_topk_moe=*/ true)) {
|
||||
ggml_cuda_op_topk_moe(*cuda_ctx, logits, weights, ids, clamp, scale, bias, args);
|
||||
i += ops.size() - 1;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_ROPE, GGML_OP_VIEW, GGML_OP_SET_ROWS }, {})) {
|
||||
ggml_tensor * rope = cgraph->nodes[i];
|
||||
ggml_tensor * set_rows = cgraph->nodes[i + 2];
|
||||
|
||||
ggml_cuda_op_rope_fused(*cuda_ctx, rope, set_rows);
|
||||
i += 2;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (node->op == GGML_OP_ADD || node->op == GGML_OP_MUL) {
|
||||
int n_fuse = 0;
|
||||
ggml_op ops[8];
|
||||
std::fill(ops, ops + 8, node->op);
|
||||
|
||||
for (; n_fuse <= 6; ++n_fuse){
|
||||
if (!ggml_can_fuse(cgraph, i + n_fuse, ops + n_fuse, 2)) {
|
||||
break;
|
||||
}
|
||||
if (cgraph->nodes[i + n_fuse] != cgraph->nodes[i + n_fuse + 1]->src[0]) {
|
||||
break;
|
||||
}
|
||||
if (!ggml_are_same_layout(cgraph->nodes[i + n_fuse]->src[1], cgraph->nodes[i + n_fuse + 1]->src[1])) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
n_fuse++;
|
||||
|
||||
if (n_fuse > 1) {
|
||||
ggml_tensor fused_node;
|
||||
memcpy(&fused_node, node, sizeof(ggml_tensor));
|
||||
for (int j = 0; j < n_fuse - 1; ++j) {
|
||||
fused_node.src[j + 2] = cgraph->nodes[i + j + 1]->src[1];
|
||||
}
|
||||
fused_node.data = cgraph->nodes[i + n_fuse - 1]->data;
|
||||
if (node->op == GGML_OP_ADD) {
|
||||
ggml_cuda_op_fused_add(*cuda_ctx, &fused_node, n_fuse);
|
||||
} else {
|
||||
ggml_cuda_op_fused_mul(*cuda_ctx, &fused_node, n_fuse);
|
||||
}
|
||||
i += n_fuse - 1;
|
||||
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
bool fused_mul_mat_vec = false;
|
||||
int fused_node_count = 0;
|
||||
|
||||
for (ggml_op op : { GGML_OP_MUL_MAT, GGML_OP_MUL_MAT_ID }) {
|
||||
const ggml_op bias_op = op == GGML_OP_MUL_MAT ? GGML_OP_ADD : GGML_OP_ADD_ID;
|
||||
|
||||
if (ggml_cuda_can_fuse(cgraph, i, { op, bias_op, op, bias_op, GGML_OP_GLU }, {})) {
|
||||
ggml_tensor * glu = cgraph->nodes[i + 4];
|
||||
ggml_tensor * gate_bias_n = glu->src[0];
|
||||
ggml_tensor * up_bias_n = glu->src[1];
|
||||
|
||||
//we don't assume the order for {gate, up}. Instead infer it from the bias tensor
|
||||
ggml_tensor * gate_n = nullptr;
|
||||
ggml_tensor * up_n = nullptr;
|
||||
|
||||
if (gate_bias_n->src[0] == cgraph->nodes[i] || gate_bias_n->src[1] == cgraph->nodes[i]) {
|
||||
gate_n = cgraph->nodes[i];
|
||||
up_n = cgraph->nodes[i + 2];
|
||||
} else if (gate_bias_n->src[0] == cgraph->nodes[i + 2] || gate_bias_n->src[1] == cgraph->nodes[i + 2]) {
|
||||
gate_n = cgraph->nodes[i + 2];
|
||||
up_n = cgraph->nodes[i];
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
|
||||
auto get_bias_tensor = [](const ggml_tensor * bias_node, const ggml_tensor * mul_node, ggml_op op_bias) {
|
||||
if (op_bias == GGML_OP_ADD) {
|
||||
if (bias_node->src[0] == mul_node) {
|
||||
return bias_node->src[1];
|
||||
}
|
||||
if (bias_node->src[1] == mul_node) {
|
||||
return bias_node->src[0];
|
||||
}
|
||||
return (ggml_tensor *) nullptr;
|
||||
}
|
||||
GGML_ASSERT(op_bias == GGML_OP_ADD_ID);
|
||||
GGML_ASSERT(bias_node->src[0] == mul_node);
|
||||
return bias_node->src[1];
|
||||
};
|
||||
|
||||
ggml_tensor * up_bias_tensor = get_bias_tensor(up_bias_n, up_n, bias_op);
|
||||
ggml_tensor * gate_bias_tensor = get_bias_tensor(gate_bias_n, gate_n, bias_op);
|
||||
|
||||
if (!up_bias_tensor || !gate_bias_tensor) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// we don't support repeating adds
|
||||
if (bias_op == GGML_OP_ADD &&
|
||||
(!ggml_are_same_shape(gate_bias_n->src[0], gate_bias_n->src[1]) ||
|
||||
!ggml_are_same_shape(up_bias_n->src[0], up_bias_n->src[1]))) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const ggml_tensor * src0 = up_n->src[0];
|
||||
const ggml_tensor * src1 = up_n->src[1];
|
||||
const ggml_tensor * ids = up_n->src[2];
|
||||
|
||||
if (ggml_cuda_should_fuse_mul_mat_vec_f(up_n)) {
|
||||
ggml_cuda_mm_fusion_args_host fusion_data{};
|
||||
fusion_data.gate = gate_n->src[0];
|
||||
fusion_data.x_bias = up_bias_tensor;
|
||||
fusion_data.gate_bias = gate_bias_tensor;
|
||||
fusion_data.glu_op = ggml_get_glu_op(glu);
|
||||
|
||||
ggml_cuda_mul_mat_vec_f(*cuda_ctx, src0, src1, ids, glu, &fusion_data);
|
||||
fused_mul_mat_vec = true;
|
||||
fused_node_count = 5;
|
||||
break;
|
||||
}
|
||||
|
||||
if (ggml_cuda_should_fuse_mul_mat_vec_q(up_n)) {
|
||||
ggml_cuda_mm_fusion_args_host fusion_data{};
|
||||
fusion_data.gate = gate_n->src[0];
|
||||
fusion_data.x_bias = up_bias_tensor;
|
||||
fusion_data.gate_bias = gate_bias_tensor;
|
||||
fusion_data.glu_op = ggml_get_glu_op(glu);
|
||||
|
||||
ggml_cuda_mul_mat_vec_q(*cuda_ctx, src0, src1, ids, glu, &fusion_data);
|
||||
fused_mul_mat_vec = true;
|
||||
fused_node_count = 5;
|
||||
break;
|
||||
}
|
||||
} else if (ggml_cuda_can_fuse(cgraph, i, { op, op, GGML_OP_GLU }, {})) {
|
||||
ggml_tensor * glu = cgraph->nodes[i + 2];
|
||||
ggml_tensor * gate = glu->src[0];
|
||||
ggml_tensor * up = glu->src[1];
|
||||
|
||||
bool ok = (gate == cgraph->nodes[i] && up == cgraph->nodes[i + 1])
|
||||
|| (gate == cgraph->nodes[i + 1] && up == cgraph->nodes[i]);
|
||||
|
||||
if (!ok) continue;
|
||||
|
||||
const ggml_tensor * src0 = up->src[0];
|
||||
const ggml_tensor * src1 = up->src[1];
|
||||
const ggml_tensor * ids = up->src[2];
|
||||
|
||||
if (ggml_cuda_should_fuse_mul_mat_vec_f(up)) {
|
||||
ggml_cuda_mm_fusion_args_host fusion_data{};
|
||||
fusion_data.gate = gate->src[0];
|
||||
fusion_data.glu_op = ggml_get_glu_op(glu);
|
||||
|
||||
ggml_cuda_mul_mat_vec_f(*cuda_ctx, src0, src1, ids, glu, &fusion_data);
|
||||
fused_mul_mat_vec = true;
|
||||
fused_node_count = 3;
|
||||
break;
|
||||
}
|
||||
|
||||
if (ggml_cuda_should_fuse_mul_mat_vec_q(up)) {
|
||||
ggml_cuda_mm_fusion_args_host fusion_data{};
|
||||
fusion_data.gate = gate->src[0];
|
||||
fusion_data.glu_op = ggml_get_glu_op(glu);
|
||||
|
||||
ggml_cuda_mul_mat_vec_q(*cuda_ctx, src0, src1, ids, glu, &fusion_data);
|
||||
fused_mul_mat_vec = true;
|
||||
fused_node_count = 3;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (fused_mul_mat_vec) {
|
||||
i += fused_node_count - 1;
|
||||
continue;
|
||||
}
|
||||
|
||||
fused_mul_mat_vec = false;
|
||||
fused_node_count = 0;
|
||||
|
||||
for (ggml_op op : { GGML_OP_MUL_MAT, GGML_OP_MUL_MAT_ID }) {
|
||||
const ggml_op bias_op = op == GGML_OP_MUL_MAT ? GGML_OP_ADD : GGML_OP_ADD_ID;
|
||||
|
||||
if (!ggml_can_fuse(cgraph, i, { op, bias_op })) {
|
||||
continue;
|
||||
}
|
||||
|
||||
ggml_tensor * mm_node = cgraph->nodes[i];
|
||||
ggml_tensor * bias_node = cgraph->nodes[i + 1];
|
||||
|
||||
ggml_tensor * bias_tensor = nullptr;
|
||||
if (bias_op == GGML_OP_ADD) {
|
||||
if (bias_node->src[0] == mm_node) {
|
||||
bias_tensor = bias_node->src[1];
|
||||
} else if (bias_node->src[1] == mm_node) {
|
||||
bias_tensor = bias_node->src[0];
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
} else {
|
||||
if (bias_node->src[0] != mm_node) {
|
||||
continue;
|
||||
}
|
||||
bias_tensor = bias_node->src[1];
|
||||
}
|
||||
|
||||
const ggml_tensor * src0 = mm_node->src[0];
|
||||
const ggml_tensor * src1 = mm_node->src[1];
|
||||
const ggml_tensor * ids = mm_node->src[2];
|
||||
|
||||
if (bias_op == GGML_OP_ADD_ID && bias_node->src[2] != ids) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (bias_op == GGML_OP_ADD && !ggml_are_same_shape(bias_node->src[0], bias_node->src[1])) {
|
||||
continue;
|
||||
}
|
||||
|
||||
ggml_cuda_mm_fusion_args_host fusion_data{};
|
||||
fusion_data.x_bias = bias_tensor;
|
||||
|
||||
if (ggml_cuda_should_fuse_mul_mat_vec_f(mm_node)) {
|
||||
ggml_cuda_mul_mat_vec_f(*cuda_ctx, src0, src1, ids, bias_node, &fusion_data);
|
||||
fused_mul_mat_vec = true;
|
||||
fused_node_count = 2;
|
||||
break;
|
||||
}
|
||||
|
||||
if (ggml_cuda_should_fuse_mul_mat_vec_q(mm_node)) {
|
||||
ggml_cuda_mul_mat_vec_q(*cuda_ctx, src0, src1, ids, bias_node, &fusion_data);
|
||||
fused_mul_mat_vec = true;
|
||||
fused_node_count = 2;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (fused_mul_mat_vec) {
|
||||
i += fused_node_count - 1;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL, GGML_OP_ADD}, {})) {
|
||||
ggml_cuda_op_rms_norm_fused_add(*cuda_ctx, node, cgraph->nodes[i+1], cgraph->nodes[i+2]);
|
||||
i += 2;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL}, {})) {
|
||||
ggml_cuda_op_rms_norm_fused(*cuda_ctx, node, cgraph->nodes[i+1]);
|
||||
i++;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_SSM_CONV, GGML_OP_UNARY }, { GGML_UNARY_OP_SILU })) {
|
||||
ggml_cuda_op_ssm_conv(*cuda_ctx, node, cgraph->nodes[i+1]);
|
||||
i++;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_UNARY, GGML_OP_MUL }, { GGML_UNARY_OP_SILU }) ||
|
||||
ggml_cuda_can_fuse(cgraph, i, { GGML_OP_UNARY, GGML_OP_MUL }, { GGML_UNARY_OP_SIGMOID }) ||
|
||||
ggml_cuda_can_fuse(cgraph, i, { GGML_OP_UNARY, GGML_OP_MUL }, { GGML_UNARY_OP_SOFTPLUS })) {
|
||||
ggml_cuda_op_unary_mul(*cuda_ctx, node, cgraph->nodes[i+1]);
|
||||
i++;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_UNARY, GGML_OP_SQR }, { GGML_UNARY_OP_RELU })) {
|
||||
ggml_cuda_op_relu_sqr(*cuda_ctx, node, cgraph->nodes[i+1]);
|
||||
i++;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_SCALE, GGML_OP_UNARY, GGML_OP_SCALE }, { GGML_UNARY_OP_TANH })) {
|
||||
i += 2;
|
||||
ggml_cuda_op_softcap(*cuda_ctx, cgraph->nodes[i], node);
|
||||
continue;
|
||||
}
|
||||
if (nodes_to_skip != 0) {
|
||||
i += nodes_to_skip;
|
||||
continue;
|
||||
}
|
||||
#ifndef NDEBUG
|
||||
assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
|
||||
@@ -4548,8 +4588,8 @@ static const ggml_backend_i ggml_backend_cuda_interface = {
|
||||
/* .free = */ ggml_backend_cuda_free,
|
||||
/* .set_tensor_async = */ ggml_backend_cuda_set_tensor_async,
|
||||
/* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async,
|
||||
/* .get_tensor_2d_async = */ ggml_backend_cuda_set_tensor_2d_async,
|
||||
/* .set_tensor_2d_async = */ ggml_backend_cuda_get_tensor_2d_async,
|
||||
/* .set_tensor_2d_async = */ ggml_backend_cuda_set_tensor_2d_async,
|
||||
/* .get_tensor_2d_async = */ ggml_backend_cuda_get_tensor_2d_async,
|
||||
/* .cpy_tensor_async = */ ggml_backend_cuda_cpy_tensor_async,
|
||||
/* .synchronize = */ ggml_backend_cuda_synchronize,
|
||||
/* .graph_plan_create = */ NULL,
|
||||
@@ -5391,8 +5431,8 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
|
||||
CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
|
||||
dev_ctx->description = prop.name;
|
||||
|
||||
char pci_bus_id[16] = {};
|
||||
snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.0", prop.pciDomainID, prop.pciBusID, prop.pciDeviceID);
|
||||
char pci_bus_id[32] = {};
|
||||
CUDA_CHECK(cudaDeviceGetPCIBusId(pci_bus_id, sizeof(pci_bus_id), i));
|
||||
dev_ctx->pci_bus_id = pci_bus_id;
|
||||
dev_ctx->op_offload_min_batch_size = min_batch_size;
|
||||
|
||||
|
||||
@@ -1015,25 +1015,35 @@ namespace ggml_cuda_mma {
|
||||
#endif // AMD_MFMA_AVAILABLE
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ void mma_block_scaled(tile<16, 8, float> & D,
|
||||
const tile<16, 8, int> & A,
|
||||
const tile<8, 8, int> & B,
|
||||
uint32_t a_scale,
|
||||
uint32_t b_scale) {
|
||||
template <ggml_type type>
|
||||
static __device__ __forceinline__ void mma_block_scaled_fp4(tile<16, 8, float> & D,
|
||||
const tile<16, 8, int> & A,
|
||||
const tile<8, 8, int> & B,
|
||||
uint32_t a_scale,
|
||||
uint32_t b_scale) {
|
||||
#ifdef BLACKWELL_MMA_AVAILABLE
|
||||
const int * Axi = (const int *) A.x;
|
||||
const int * Bxi = (const int *) B.x;
|
||||
float * Dxi = (float *) D.x;
|
||||
|
||||
asm volatile(
|
||||
"mma.sync.aligned.kind::mxf4.block_scale.scale_vec::2X.m16n8k64.row.col.f32.e2m1.e2m1.f32.ue8m0 "
|
||||
"{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%0, %1, %2, %3}, "
|
||||
"%10, {0, 0}, %11, {0, 0};"
|
||||
: "+f"(Dxi[0]), "+f"(Dxi[1]), "+f"(Dxi[2]), "+f"(Dxi[3])
|
||||
: "r"(Axi[0]), "r"(Axi[1]), "r"(Axi[2]), "r"(Axi[3]), "r"(Bxi[0]), "r"(Bxi[1]), "r"(a_scale), "r"(b_scale));
|
||||
if constexpr (type == GGML_TYPE_MXFP4) {
|
||||
asm volatile(
|
||||
"mma.sync.aligned.kind::mxf4.block_scale.scale_vec::2X.m16n8k64.row.col.f32.e2m1.e2m1.f32.ue8m0 "
|
||||
"{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%0, %1, %2, %3}, "
|
||||
"%10, {0, 0}, %11, {0, 0};"
|
||||
: "+f"(Dxi[0]), "+f"(Dxi[1]), "+f"(Dxi[2]), "+f"(Dxi[3])
|
||||
: "r"(Axi[0]), "r"(Axi[1]), "r"(Axi[2]), "r"(Axi[3]), "r"(Bxi[0]), "r"(Bxi[1]), "r"(a_scale), "r"(b_scale));
|
||||
} else {
|
||||
asm volatile(
|
||||
"mma.sync.aligned.kind::mxf4nvf4.block_scale.scale_vec::4X.m16n8k64.row.col.f32.e2m1.e2m1.f32.ue4m3 "
|
||||
"{%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%0, %1, %2, %3}, "
|
||||
"%10, {0, 0}, %11, {0, 0};"
|
||||
: "+f"(Dxi[0]), "+f"(Dxi[1]), "+f"(Dxi[2]), "+f"(Dxi[3])
|
||||
: "r"(Axi[0]), "r"(Axi[1]), "r"(Axi[2]), "r"(Axi[3]), "r"(Bxi[0]), "r"(Bxi[1]), "r"(a_scale), "r"(b_scale));
|
||||
}
|
||||
#else
|
||||
GGML_UNUSED_VARS(D, A, B, a_scale, b_scale);
|
||||
#endif // BLACKWELL_MMA_AVAILABLE
|
||||
#endif // BLACKWELL_MMA_AVAILABLE
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ void mma(
|
||||
|
||||
@@ -122,7 +122,7 @@ void ggml_cuda_mul_mat_q(
|
||||
|| GGML_CUDA_CC_IS_CDNA(cc);
|
||||
|
||||
// TODO: tighter pool buffer size vs q8 path
|
||||
const bool use_native_mxfp4 = blackwell_mma_available(cc) && src0->type == GGML_TYPE_MXFP4;
|
||||
const bool use_native_fp4 = blackwell_mma_available(cc) && (src0->type == GGML_TYPE_MXFP4 || src0->type == GGML_TYPE_NVFP4);
|
||||
|
||||
if (!ids) {
|
||||
const size_t nbytes_src1_q8_1 = ne13*ne12 * ne11*ne10_padded * sizeof(block_q8_1)/QK8_1 +
|
||||
@@ -133,9 +133,9 @@ void ggml_cuda_mul_mat_q(
|
||||
const int64_t s11 = src1->nb[1] / ts_src1;
|
||||
const int64_t s12 = src1->nb[2] / ts_src1;
|
||||
const int64_t s13 = src1->nb[3] / ts_src1;
|
||||
if (use_native_mxfp4) {
|
||||
if (use_native_fp4) {
|
||||
static_assert(sizeof(block_fp4_mmq) == 4 * sizeof(block_q8_1));
|
||||
quantize_mmq_mxfp4_cuda(src1_d, nullptr, src1_q8_1.get(), src0->type, ne10, s11, s12, s13, ne10_padded,
|
||||
quantize_mmq_fp4_cuda(src1_d, nullptr, src1_q8_1.get(), src0->type, ne10, s11, s12, s13, ne10_padded,
|
||||
ne11, ne12, ne13, stream);
|
||||
|
||||
} else {
|
||||
@@ -146,10 +146,8 @@ void ggml_cuda_mul_mat_q(
|
||||
}
|
||||
|
||||
// Stride depends on quantization format
|
||||
const int64_t s12 = use_native_mxfp4 ?
|
||||
ne11 * ne10_padded * sizeof(block_fp4_mmq) /
|
||||
(8 * QK_MXFP4 * sizeof(int)) // block_fp4_mmq holds 256 values (8 blocks of 32)
|
||||
:
|
||||
const int64_t s12 = use_native_fp4 ?
|
||||
ne11 * ne10_padded * sizeof(block_fp4_mmq) / (QK_K * sizeof(int)) : // block_fp4_mmq holds 256 values
|
||||
ne11 * ne10_padded * sizeof(block_q8_1) / (QK8_1 * sizeof(int));
|
||||
const int64_t s13 = ne12*s12;
|
||||
|
||||
@@ -198,8 +196,8 @@ void ggml_cuda_mul_mat_q(
|
||||
const int64_t s12 = src1->nb[2] / ts_src1;
|
||||
const int64_t s13 = src1->nb[3] / ts_src1;
|
||||
|
||||
if (use_native_mxfp4) {
|
||||
quantize_mmq_mxfp4_cuda(src1_d, ids_src1.get(), src1_q8_1.get(), src0->type, ne10, s11, s12, s13,
|
||||
if (use_native_fp4) {
|
||||
quantize_mmq_fp4_cuda(src1_d, ids_src1.get(), src1_q8_1.get(), src0->type, ne10, s11, s12, s13,
|
||||
ne10_padded, ne11_flat, ne12_flat, ne13_flat, stream);
|
||||
} else {
|
||||
quantize_mmq_q8_1_cuda(src1_d, ids_src1.get(), src1_q8_1.get(), src0->type, ne10, s11, s12, s13,
|
||||
@@ -208,8 +206,9 @@ void ggml_cuda_mul_mat_q(
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
}
|
||||
|
||||
const int64_t s12 = use_native_mxfp4 ? ne11 * ne10_padded * sizeof(block_fp4_mmq) / (8 * QK_MXFP4 * sizeof(int)) :
|
||||
ne11 * ne10_padded * sizeof(block_q8_1) / (QK8_1 * sizeof(int));
|
||||
static_assert(QK_K == 8 * QK_MXFP4, "QK_K needs to be 8 * QK_MXFP4");
|
||||
const int64_t s12 = use_native_fp4 ? ne11 * ne10_padded * sizeof(block_fp4_mmq) / (QK_K * sizeof(int)) :
|
||||
ne11 * ne10_padded * sizeof(block_q8_1) / (QK8_1 * sizeof(int));
|
||||
const int64_t s13 = ne12*s12;
|
||||
|
||||
// Note that ne02 is used instead of ne12 because the number of y channels determines the z dimension of the CUDA grid.
|
||||
|
||||
@@ -10,9 +10,9 @@
|
||||
using namespace ggml_cuda_mma;
|
||||
|
||||
#define MMQ_DP4A_MAX_BATCH_SIZE 64 // Max. batch size to use for dp4a MMQ kernels when FP16 tensor cores are available.
|
||||
#define MMQ_ITER_K 256
|
||||
#define MMQ_ITER_K_MXFP4_FP4 512
|
||||
#define MMQ_NWARPS 8
|
||||
#define MMQ_ITER_K 256
|
||||
#define MMQ_ITER_K_FP4 512
|
||||
#define MMQ_NWARPS 8
|
||||
|
||||
typedef void (*load_tiles_mmq_t)(const char * __restrict__ x, int * x_tile, const int kbx0, const int i_max, const int stride);
|
||||
typedef void (*vec_dot_mmq_t)(const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00);
|
||||
@@ -46,9 +46,12 @@ struct block_q8_1_mmq {
|
||||
int8_t qs[4*QK8_1]; // 128 values quantized to 8 bit each
|
||||
};
|
||||
|
||||
// this struct is used for fp4 data types (currently only used for Blackwell)
|
||||
// mxfp4 has block size 32, each int32 of d4 contains 2 e8m0 scales in the lower 16 bits
|
||||
// nvfp4 has block size 16, each int32 of d4 contains 4 ue4m3 scales
|
||||
struct block_fp4_mmq {
|
||||
uint32_t d4[4]; // 8 E8M0 scales (1 per 32 values), 2 packed per uint32: d4[0]={s0,s1}, d4[1]={s2,s3}, etc.
|
||||
int8_t qs[4 * 32]; // 256 FP4 values packed as 4-bit pairs (2 per byte), 8 blocks of 32 values
|
||||
uint32_t d4[4];
|
||||
int8_t qs[4 * 32]; // 256 FP4 values packed as 4-bit pairs (2 per byte)
|
||||
};
|
||||
|
||||
static_assert(sizeof(block_q8_1_mmq) == 4*QK8_1 + 4*sizeof(half2), "Unexpected block_q8_1_mmq size");
|
||||
@@ -143,10 +146,11 @@ static int get_mmq_y_host(const int cc) {
|
||||
|
||||
static constexpr __device__ int get_iter_k([[maybe_unused]] const ggml_type type) {
|
||||
#if defined(BLACKWELL_MMA_AVAILABLE)
|
||||
return type == GGML_TYPE_MXFP4 ? MMQ_ITER_K_MXFP4_FP4 : MMQ_ITER_K;
|
||||
#else
|
||||
return MMQ_ITER_K;
|
||||
if (type == GGML_TYPE_NVFP4 || type == GGML_TYPE_MXFP4) {
|
||||
return MMQ_ITER_K_FP4;
|
||||
}
|
||||
#endif // defined(BLACKWELL_MMA_AVAILABLE)
|
||||
return MMQ_ITER_K;
|
||||
}
|
||||
|
||||
static constexpr __device__ int get_mmq_y_device() {
|
||||
@@ -213,8 +217,8 @@ static constexpr __host__ __device__ tile_x_sizes mmq_get_dp4a_tile_x_sizes(ggml
|
||||
}
|
||||
|
||||
#define MMQ_MMA_TILE_X_K_Q8_0 (2*MMQ_TILE_NE_K + 2*MMQ_TILE_NE_K/QI8_0 + 4)
|
||||
#define MMQ_MMA_TILE_X_K_FP4 (2*MMQ_TILE_NE_K + 8 + 4) // MXFP4
|
||||
#define MMQ_MMA_TILE_X_K_NVFP4 (2*MMQ_TILE_NE_K + MMQ_TILE_NE_K/2 + 4) // NVFP4
|
||||
#define MMQ_MMA_TILE_X_K_FP4 (2*MMQ_TILE_NE_K + 8 + 4) // MXFP4 and NVFP4 Blackwell
|
||||
#define MMQ_MMA_TILE_X_K_NVFP4 (2*MMQ_TILE_NE_K + MMQ_TILE_NE_K/2 + 4) // NVFP4 Generic
|
||||
#define MMQ_MMA_TILE_X_K_Q8_1 (2*MMQ_TILE_NE_K + 2*MMQ_TILE_NE_K/QI8_0 + 4)
|
||||
#define MMQ_MMA_TILE_X_K_Q2_K (2*MMQ_TILE_NE_K + MMQ_TILE_NE_K + 4)
|
||||
#define MMQ_MMA_TILE_X_K_Q3_K (2*MMQ_TILE_NE_K + MMQ_TILE_NE_K/2 + 4)
|
||||
@@ -240,7 +244,11 @@ static constexpr __host__ __device__ int mmq_get_mma_tile_x_k(ggml_type type) {
|
||||
case GGML_TYPE_Q8_0: return MMQ_MMA_TILE_X_K_Q8_0;
|
||||
// tile sizes are the same for Q8_1 and FP4 for blackwell
|
||||
case GGML_TYPE_MXFP4: return MMQ_MMA_TILE_X_K_Q8_1;
|
||||
#if defined(BLACKWELL_MMA_AVAILABLE)
|
||||
case GGML_TYPE_NVFP4: return MMQ_MMA_TILE_X_K_FP4;
|
||||
#else
|
||||
case GGML_TYPE_NVFP4: return MMQ_MMA_TILE_X_K_NVFP4;
|
||||
#endif // defined(BLACKWELL_MMA_AVAILABLE)
|
||||
case GGML_TYPE_Q2_K: return MMQ_MMA_TILE_X_K_Q2_K;
|
||||
case GGML_TYPE_Q3_K: return MMQ_MMA_TILE_X_K_Q3_K;
|
||||
case GGML_TYPE_Q4_K: return MMQ_MMA_TILE_X_K_Q8_1;
|
||||
@@ -934,6 +942,128 @@ static __device__ __forceinline__ void load_tiles_mxfp4_fp4(const char * __restr
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef BLACKWELL_MMA_AVAILABLE
|
||||
template <int mmq_y, bool need_check>
|
||||
static __device__ __forceinline__ void load_tiles_nvfp4_nvfp4(const char * __restrict__ x,
|
||||
int * __restrict__ x_tile,
|
||||
const int kbx0,
|
||||
const int i_max,
|
||||
const int stride) {
|
||||
constexpr int nwarps = mmq_get_nwarps_device();
|
||||
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
|
||||
constexpr int iter_k = get_iter_k(GGML_TYPE_NVFP4);
|
||||
constexpr int threads_per_row = iter_k / QK_NVFP4; // each thread processes 1 block
|
||||
constexpr int rows_per_warp = warp_size / threads_per_row;
|
||||
|
||||
uint32_t * x_u32 = (uint32_t *) x_tile;
|
||||
|
||||
const int txi = threadIdx.x;
|
||||
const int kbx = txi % threads_per_row;
|
||||
const int row_in_warp = txi / threads_per_row;
|
||||
|
||||
const block_nvfp4 * bxi_base = (const block_nvfp4 *) x + kbx0 + kbx;
|
||||
uint32_t * x_u32_scale = x_u32 + 64 + kbx;
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < mmq_y; i0 += rows_per_warp * nwarps) {
|
||||
int i = i0 + threadIdx.y * rows_per_warp + row_in_warp;
|
||||
|
||||
if constexpr (need_check) {
|
||||
i = min(i, i_max);
|
||||
}
|
||||
|
||||
const block_nvfp4 * bxi = bxi_base + i * stride;
|
||||
const int row_base = i * MMQ_MMA_TILE_X_K_FP4;
|
||||
const int q_base = row_base + 8 * kbx;
|
||||
|
||||
const uint32_t * src_qs = reinterpret_cast<const uint32_t *>(bxi->qs);
|
||||
|
||||
#pragma unroll
|
||||
for (int sub = 0; sub < QK_NVFP4 / QK_NVFP4_SUB; ++sub) {
|
||||
x_u32[q_base + 2 * sub + 0] = src_qs[2 * sub + 0];
|
||||
x_u32[q_base + 2 * sub + 1] = src_qs[2 * sub + 1];
|
||||
}
|
||||
|
||||
x_u32_scale[row_base] = get_int_b4(bxi->d, 0);
|
||||
}
|
||||
}
|
||||
|
||||
// Shared MMA kernel for MXFP4 and NVFP4 on Blackwell.
|
||||
// Both quantizations encode values as e2m1 (FP4) and produce one uint32 scale per
|
||||
// m16n8k64 MMA call; only the PTX kind (scale_vec::2X ue8m0 vs scale_vec::4X ue4m3)
|
||||
// and the per-type stride constant differ.
|
||||
template <int mmq_x, int mmq_y, ggml_type type>
|
||||
static __device__ __forceinline__ void vec_dot_fp4_fp4_mma(const int * __restrict__ x,
|
||||
const int * __restrict__ y,
|
||||
float * __restrict__ sum,
|
||||
const int k00) {
|
||||
static_assert(type == GGML_TYPE_MXFP4 || type == GGML_TYPE_NVFP4,
|
||||
"vec_dot_fp4_fp4_mma: type must be MXFP4 or NVFP4");
|
||||
|
||||
typedef tile<16, 8, int> tile_A;
|
||||
typedef tile<8, 8, int> tile_B;
|
||||
typedef tile<16, 8, float> tile_C;
|
||||
|
||||
constexpr int stride = MMQ_MMA_TILE_X_K_FP4;
|
||||
constexpr int granularity = mmq_get_granularity_device(mmq_x);
|
||||
constexpr int rows_per_warp = 2 * granularity;
|
||||
constexpr int ntx = rows_per_warp / tile_C::I;
|
||||
constexpr int nfrags = MMQ_TILE_NE_K / tile_A::J;
|
||||
|
||||
y += (threadIdx.y % ntx) * (tile_C::J * MMQ_TILE_Y_K);
|
||||
|
||||
const int * x_qs = (const int *) x;
|
||||
const uint32_t * x_sc = (const uint32_t *) (x_qs + 2 * MMQ_TILE_NE_K);
|
||||
const int * y_qs = (const int *) y + 4;
|
||||
const uint32_t * y_sc = (const uint32_t *) y;
|
||||
|
||||
// 2 threads per quad supply the packed scale register to the block_scale MMA,
|
||||
// see https://docs.nvidia.com/cuda/parallel-thread-execution/#warp-level-block-scaling
|
||||
const int tidx_A = threadIdx.x / 4 + (threadIdx.x % 2) * 8;
|
||||
const int tidx_B = threadIdx.x / 4;
|
||||
const int i0 = (threadIdx.y / ntx) * rows_per_warp;
|
||||
|
||||
tile_A A[ntx][nfrags];
|
||||
uint32_t scaleA[ntx][nfrags];
|
||||
|
||||
#pragma unroll
|
||||
for (int n = 0; n < ntx; ++n) {
|
||||
#pragma unroll
|
||||
for (int frag = 0; frag < nfrags; ++frag) {
|
||||
const int k0 = k00 + frag * tile_A::J;
|
||||
load_ldmatrix(A[n][frag], x_qs + (i0 + n * tile_A::I) * stride + k0, stride);
|
||||
scaleA[n][frag] = x_sc[(i0 + n * tile_A::I + tidx_A) * stride + k0 / tile_A::J];
|
||||
}
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < mmq_x; j0 += ntx * tile_C::J) {
|
||||
tile_B B[nfrags];
|
||||
uint32_t scaleB[nfrags];
|
||||
|
||||
#pragma unroll
|
||||
for (int frag = 0; frag < nfrags; ++frag) {
|
||||
const int k0 = frag * tile_B::J;
|
||||
load_generic(B[frag], y_qs + j0 * MMQ_TILE_Y_K + k0, MMQ_TILE_Y_K);
|
||||
scaleB[frag] = y_sc[(j0 + tidx_B) * MMQ_TILE_Y_K + frag];
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int n = 0; n < ntx; ++n) {
|
||||
#pragma unroll
|
||||
for (int frag = 0; frag < nfrags; ++frag) {
|
||||
tile_C C = {};
|
||||
mma_block_scaled_fp4<type>(C, A[n][frag], B[frag], scaleA[n][frag], scaleB[frag]);
|
||||
#pragma unroll
|
||||
for (int l = 0; l < tile_C::ne; ++l) {
|
||||
sum[(j0 / tile_C::J + n) * tile_C::ne + l] += C.x[l];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif // BLACKWELL_MMA_AVAILABLE
|
||||
|
||||
|
||||
template <int mmq_y, bool need_check>
|
||||
static __device__ __forceinline__ void load_tiles_nvfp4(const char * __restrict__ x,
|
||||
@@ -1163,77 +1293,6 @@ static __device__ __forceinline__ void vec_dot_q8_0_q8_1_mma(
|
||||
#endif // defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
|
||||
}
|
||||
|
||||
template <int mmq_x, int mmq_y>
|
||||
static __device__ __forceinline__ void vec_dot_mxfp4_mxfp4_mma(const int * __restrict__ x,
|
||||
const int * __restrict__ y,
|
||||
float * __restrict__ sum,
|
||||
const int k00) {
|
||||
typedef tile<16, 8, int> tile_A;
|
||||
typedef tile<8, 8, int> tile_B;
|
||||
typedef tile<16, 8, float> tile_C; // Output is float for native scaled MMA
|
||||
|
||||
constexpr int granularity = mmq_get_granularity_device(mmq_x);
|
||||
constexpr int rows_per_warp = 2 * granularity;
|
||||
constexpr int ntx = rows_per_warp / tile_C::I; // Number of x minitiles per warp.
|
||||
|
||||
y += (threadIdx.y % ntx) * (tile_C::J * MMQ_TILE_Y_FP4_K);
|
||||
|
||||
// Match layout from load_tiles_mxfp4_fp4
|
||||
const int * x_qs = (const int *) x;
|
||||
const uint32_t * x_sc = (const uint32_t *) (x_qs + 2 * MMQ_TILE_NE_K);
|
||||
const int * y_qs = (const int *) y + 4;
|
||||
const uint32_t * y_sc = (const uint32_t *) y;
|
||||
|
||||
// tile_A has a length of 64 logical values vs. 32 values in block_mxfp4
|
||||
tile_A A[ntx][MMQ_TILE_NE_K / (2 * QI_MXFP4)];
|
||||
uint32_t scaleA[ntx][MMQ_TILE_NE_K / (2 * QI_MXFP4)];
|
||||
|
||||
// Block scale
|
||||
// Each thread has to point to a 4 byte scale value
|
||||
// https://docs.nvidia.com/cuda/parallel-thread-execution/#warp-level-block-scaling
|
||||
|
||||
const int i0 = (threadIdx.y / ntx) * rows_per_warp;
|
||||
|
||||
#pragma unroll
|
||||
for (int n = 0; n < ntx; ++n) {
|
||||
#pragma unroll
|
||||
for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += 2 * QI_MXFP4) {
|
||||
const int k0 = k00 + k01;
|
||||
|
||||
load_ldmatrix(A[n][k01 / (2 * QI_MXFP4)], x_qs + (i0 + n * tile_A::I) * MMQ_MMA_TILE_X_K_FP4 + k0,
|
||||
MMQ_MMA_TILE_X_K_FP4);
|
||||
|
||||
// based on block-scaling document, 2 threads in each quad need to supply to the scale value
|
||||
const int tidx = threadIdx.x / 4 + (threadIdx.x % 2) * 8;
|
||||
scaleA[n][k01 / (2 * QI_MXFP4)] =
|
||||
*(x_sc + (i0 + n * tile_A::I + tidx) * MMQ_MMA_TILE_X_K_FP4 + k0 / (2 * QI_MXFP4));
|
||||
}
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < mmq_x; j0 += ntx * tile_C::J) {
|
||||
#pragma unroll
|
||||
for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += 2 * QI_MXFP4) {
|
||||
tile_B B;
|
||||
uint32_t scaleB; // 2xN scales
|
||||
|
||||
load_generic(B, y_qs + j0 * MMQ_TILE_Y_FP4_K + k01, MMQ_TILE_Y_FP4_K);
|
||||
|
||||
scaleB = y_sc[(j0 + threadIdx.x / 4) * MMQ_TILE_Y_FP4_K + k01 / (2 * QI_MXFP4)];
|
||||
|
||||
#pragma unroll
|
||||
for (int n = 0; n < ntx; ++n) {
|
||||
tile_C C;
|
||||
|
||||
mma_block_scaled(C, A[n][k01 / (2 * QI_MXFP4)], B, scaleA[n][k01 / (2 * QI_MXFP4)], scaleB);
|
||||
#pragma unroll
|
||||
for (int l = 0; l < tile_C::ne; ++l) {
|
||||
sum[(j0 / tile_C::J + n) * tile_C::ne + l] += C.x[l];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <int mmq_x, int mmq_y>
|
||||
static __device__ __forceinline__ void vec_dot_q8_1_q8_1_dp4a(
|
||||
@@ -3259,7 +3318,7 @@ struct mmq_type_traits<mmq_x, mmq_y, need_check, GGML_TYPE_MXFP4> {
|
||||
static constexpr int vdr = VDR_MXFP4_Q8_1_MMQ;
|
||||
#ifdef BLACKWELL_MMA_AVAILABLE
|
||||
static constexpr load_tiles_mmq_t load_tiles = load_tiles_mxfp4_fp4<mmq_y, need_check>;
|
||||
static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_mxfp4_mxfp4_mma<mmq_x, mmq_y>;
|
||||
static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_fp4_fp4_mma<mmq_x, mmq_y, GGML_TYPE_MXFP4>;
|
||||
#else
|
||||
static constexpr load_tiles_mmq_t load_tiles = load_tiles_mxfp4<mmq_y, need_check>;
|
||||
static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_0_q8_1_mma<mmq_x, mmq_y, MMQ_Q8_1_DS_LAYOUT_D4>;
|
||||
@@ -3270,8 +3329,13 @@ struct mmq_type_traits<mmq_x, mmq_y, need_check, GGML_TYPE_MXFP4> {
|
||||
template <int mmq_x, int mmq_y, bool need_check>
|
||||
struct mmq_type_traits<mmq_x, mmq_y, need_check, GGML_TYPE_NVFP4> {
|
||||
static constexpr int vdr = VDR_NVFP4_Q8_1_MMQ;
|
||||
#ifdef BLACKWELL_MMA_AVAILABLE
|
||||
static constexpr load_tiles_mmq_t load_tiles = load_tiles_nvfp4_nvfp4<mmq_y, need_check>;
|
||||
static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_fp4_fp4_mma<mmq_x, mmq_y, GGML_TYPE_NVFP4>;
|
||||
#else
|
||||
static constexpr load_tiles_mmq_t load_tiles = load_tiles_nvfp4<mmq_y, need_check>;
|
||||
static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_0_16_q8_1_mma<mmq_x, mmq_y>;
|
||||
#endif // BLACKWELL_MMA_AVAILABLE
|
||||
static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q8_0_16_q8_1_dp4a<mmq_x, mmq_y>;
|
||||
};
|
||||
|
||||
@@ -3406,7 +3470,7 @@ static __device__ __forceinline__ void mul_mat_q_process_tile(
|
||||
|
||||
#if defined(BLACKWELL_MMA_AVAILABLE)
|
||||
// FP4 tile stores 8 blocks
|
||||
constexpr int ne_block = (type == GGML_TYPE_MXFP4) ? 8 * QK_MXFP4 : 4 * QK8_1;
|
||||
constexpr int ne_block = (type == GGML_TYPE_MXFP4 || type == GGML_TYPE_NVFP4) ? QK_K : 4 * QK8_1;
|
||||
#else
|
||||
constexpr int ne_block = 4 * QK8_1;
|
||||
#endif // defined(BLACKWELL_MMA_AVAILABLE)
|
||||
@@ -3478,10 +3542,10 @@ template <ggml_type type, int mmq_x, bool need_check>
|
||||
static __global__ void mul_mat_q(
|
||||
const char * __restrict__ x, const int * __restrict__ y, const int32_t * __restrict__ ids_dst,
|
||||
const int32_t * __restrict__ expert_bounds, float * __restrict__ dst, float * __restrict__ tmp_fixup,
|
||||
const int ncols_x, const int nrows_x, const int ncols_dst, const int stride_row_x, const int ncols_y, const int stride_col_dst,
|
||||
const int channel_ratio, const int nchannels_y, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst,
|
||||
const int sample_ratio, const int nsamples_y, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst,
|
||||
const int ncols_max) {
|
||||
const uint3 blocks_per_ne00, const int nrows_x, const int ncols_dst, const int stride_row_x, const int ncols_y, const int stride_col_dst,
|
||||
const uint3 channel_ratio, const uint3 nchannels_y, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst,
|
||||
const uint3 sample_ratio, const uint3 nsamples_y, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst,
|
||||
const uint3 ntx) {
|
||||
|
||||
// Skip unused template specializations for faster compilation:
|
||||
if (mmq_x > get_mmq_x_max_device() || mmq_x % mmq_get_granularity_device(mmq_x) != 0) {
|
||||
@@ -3495,8 +3559,7 @@ static __global__ void mul_mat_q(
|
||||
constexpr int qk = ggml_cuda_type_traits<type>::qk;
|
||||
constexpr int mmq_y = get_mmq_y_device();
|
||||
|
||||
const int ntx = (ncols_max + mmq_x - 1) / mmq_x; // Number of tiles x
|
||||
const int nty = (nrows_x + mmq_y - 1) / mmq_y; // Number of tiles y
|
||||
const uint32_t nty = (nrows_x + mmq_y - 1) / mmq_y; // Number of tiles y
|
||||
|
||||
// Initialize the ids for writing back data with just the index.
|
||||
// For regular matrix multiplications this is never changed.
|
||||
@@ -3517,8 +3580,9 @@ static __global__ void mul_mat_q(
|
||||
// On non-CDNA AMD or old CUDA the performance with stream-k was worse, use conventional tiling instead:
|
||||
#if (defined(GGML_USE_HIP) && !defined(CDNA)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
|
||||
{
|
||||
const int wt = blockIdx.z / nchannels_y;
|
||||
const int zt = blockIdx.z - wt*nchannels_y;
|
||||
const uint2 tmp2 = fast_div_modulo(blockIdx.z, nchannels_y);
|
||||
const int wt = tmp2.x;
|
||||
const int zt = tmp2.y;
|
||||
const int jt = blockIdx.y;
|
||||
const int it = blockIdx.x;
|
||||
|
||||
@@ -3561,40 +3625,40 @@ static __global__ void mul_mat_q(
|
||||
const int tile_x_max_i = nrows_x - it*mmq_y - 1;
|
||||
const int tile_y_max_j = col_diff - jt*mmq_x - 1;
|
||||
|
||||
const int offset_x = (wt/sample_ratio)*stride_sample_x + (zt/channel_ratio)*stride_channel_x + it*mmq_y*stride_row_x;
|
||||
const int offset_x = fastdiv(wt, sample_ratio)*stride_sample_x + fastdiv(zt, channel_ratio)*stride_channel_x + it*mmq_y*stride_row_x;
|
||||
|
||||
constexpr bool fixup = false;
|
||||
mul_mat_q_process_tile<type, mmq_x, need_check, fixup>
|
||||
(x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, stride_row_x, ncols_y, stride_col_dst,
|
||||
tile_x_max_i, tile_y_max_j, 0, ncols_x/qk);
|
||||
tile_x_max_i, tile_y_max_j, 0, blocks_per_ne00.z);
|
||||
return;
|
||||
}
|
||||
#endif // (defined(GGML_USE_HIP) && !defined(CDNA4) && !defined(CDNA3)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
|
||||
|
||||
constexpr int ITER_K = get_iter_k(type);
|
||||
|
||||
const int64_t blocks_per_ne00 = ncols_x / qk;
|
||||
constexpr int blocks_per_iter = ITER_K / qk;
|
||||
constexpr int ITER_K = get_iter_k(type);
|
||||
constexpr int blocks_per_iter = ITER_K / qk;
|
||||
|
||||
// kbc == k block continuous, current index in continuous ijk space.
|
||||
int64_t kbc = (int64_t) blockIdx.x *nsamples_y*nchannels_y*ntx*nty*blocks_per_ne00 / gridDim.x;
|
||||
int64_t kbc_stop = (int64_t)(blockIdx.x + 1)*nsamples_y*nchannels_y*ntx*nty*blocks_per_ne00 / gridDim.x;
|
||||
int kbc = int64_t(blockIdx.x) *(nsamples_y.z*nchannels_y.z*ntx.z*nty*blocks_per_ne00.z) / gridDim.x;
|
||||
int kbc_stop = int64_t(blockIdx.x + 1)*(nsamples_y.z*nchannels_y.z*ntx.z*nty*blocks_per_ne00.z) / gridDim.x;
|
||||
|
||||
kbc -= (kbc % blocks_per_ne00) % blocks_per_iter;
|
||||
kbc_stop -= (kbc_stop % blocks_per_ne00) % blocks_per_iter;
|
||||
kbc -= fastmodulo(kbc, blocks_per_ne00) % blocks_per_iter;
|
||||
kbc_stop -= fastmodulo(kbc_stop, blocks_per_ne00) % blocks_per_iter;
|
||||
|
||||
// kb0 == k index when doing the matrix multiplication for an output tile.
|
||||
int kb0_start = kbc % blocks_per_ne00;
|
||||
int kb0_stop = min(blocks_per_ne00, kb0_start + kbc_stop - kbc);
|
||||
while (kbc < kbc_stop && kb0_stop == blocks_per_ne00) {
|
||||
int tmp = kbc;
|
||||
const int it = tmp / (nsamples_y*nchannels_y*ntx*blocks_per_ne00);
|
||||
tmp -= it * (nsamples_y*nchannels_y*ntx*blocks_per_ne00);
|
||||
const int wt = tmp / (nchannels_y*ntx*blocks_per_ne00);
|
||||
tmp -= wt * (nchannels_y*ntx*blocks_per_ne00);
|
||||
const int zt = tmp / (ntx*blocks_per_ne00);
|
||||
tmp -= zt * (ntx*blocks_per_ne00);
|
||||
const int jt = tmp / blocks_per_ne00;
|
||||
int kb0_start = fastmodulo(kbc, blocks_per_ne00);
|
||||
int kb0_stop = min(blocks_per_ne00.z, uint32_t(kb0_start + kbc_stop - kbc));
|
||||
while (kbc < kbc_stop && kb0_stop == int(blocks_per_ne00.z)) {
|
||||
int tmp = fastdiv(kbc, blocks_per_ne00);
|
||||
uint2 tmp2 = fast_div_modulo(tmp, ntx);
|
||||
const int jt = tmp2.y;
|
||||
tmp = tmp2.x;
|
||||
tmp2 = fast_div_modulo(tmp, nchannels_y);
|
||||
const int zt = tmp2.y;
|
||||
tmp = tmp2.x;
|
||||
tmp2 = fast_div_modulo(tmp, nsamples_y);
|
||||
const int wt = tmp2.y;
|
||||
const int it = tmp2.x;
|
||||
|
||||
// Defaults for regular matrix multiplication:
|
||||
int col_low = 0;
|
||||
@@ -3612,11 +3676,11 @@ static __global__ void mul_mat_q(
|
||||
offset_dst = 0;
|
||||
|
||||
if (jt*mmq_x >= col_diff) {
|
||||
kbc += blocks_per_ne00;
|
||||
kbc -= kbc % blocks_per_ne00;
|
||||
kbc += blocks_per_ne00.z;
|
||||
kbc -= fastmodulo(kbc, blocks_per_ne00);
|
||||
|
||||
kb0_start = 0;
|
||||
kb0_stop = min(blocks_per_ne00, kbc_stop - kbc);
|
||||
kb0_stop = min(blocks_per_ne00.z, uint32_t(kbc_stop - kbc));
|
||||
|
||||
continue;
|
||||
}
|
||||
@@ -3641,32 +3705,34 @@ static __global__ void mul_mat_q(
|
||||
const int tile_x_max_i = nrows_x - it*mmq_y - 1;
|
||||
const int tile_y_max_j = col_diff - jt*mmq_x - 1;
|
||||
|
||||
const int offset_x = (wt/sample_ratio)*stride_sample_x + (zt/channel_ratio)*stride_channel_x + it*mmq_y*stride_row_x;
|
||||
const int offset_x = fastdiv(wt, sample_ratio)*stride_sample_x + fastdiv(zt, channel_ratio)*stride_channel_x + it*mmq_y*stride_row_x;
|
||||
|
||||
constexpr bool fixup = false; // All but (potentially) the last iterations write their data to dst rather than the fixup buffer.
|
||||
mul_mat_q_process_tile<type, mmq_x, need_check, fixup>
|
||||
(x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, stride_row_x, ncols_y, stride_col_dst,
|
||||
tile_x_max_i, tile_y_max_j, kb0_start, kb0_stop);
|
||||
|
||||
kbc += blocks_per_ne00;
|
||||
kbc -= kbc % blocks_per_ne00;
|
||||
kbc += blocks_per_ne00.z;
|
||||
kbc -= fastmodulo(kbc, blocks_per_ne00);
|
||||
|
||||
kb0_start = 0;
|
||||
kb0_stop = min(blocks_per_ne00, kbc_stop - kbc);
|
||||
kb0_stop = min(blocks_per_ne00.z, uint32_t(kbc_stop - kbc));
|
||||
}
|
||||
|
||||
if (kbc >= kbc_stop) {
|
||||
return;
|
||||
}
|
||||
|
||||
int tmp = kbc;
|
||||
const int it = tmp / (nsamples_y*nchannels_y*ntx*blocks_per_ne00);
|
||||
tmp -= it * (nsamples_y*nchannels_y*ntx*blocks_per_ne00);
|
||||
const int wt = tmp / (nchannels_y*ntx*blocks_per_ne00);
|
||||
tmp -= wt * (nchannels_y*ntx*blocks_per_ne00);
|
||||
const int zt = tmp / (ntx*blocks_per_ne00);
|
||||
tmp -= zt * (ntx*blocks_per_ne00);
|
||||
const int jt = tmp / blocks_per_ne00;
|
||||
int tmp = fastdiv(kbc, blocks_per_ne00);
|
||||
uint2 tmp2 = fast_div_modulo(tmp, ntx);
|
||||
const int jt = tmp2.y;
|
||||
tmp = tmp2.x;
|
||||
tmp2 = fast_div_modulo(tmp, nchannels_y);
|
||||
const int zt = tmp2.y;
|
||||
tmp = tmp2.x;
|
||||
tmp2 = fast_div_modulo(tmp, nsamples_y);
|
||||
const int wt = tmp2.y;
|
||||
const int it = tmp2.x;
|
||||
|
||||
// Defaults for regular matrix multiplication:
|
||||
int col_low = 0;
|
||||
@@ -3708,7 +3774,7 @@ static __global__ void mul_mat_q(
|
||||
const int tile_x_max_i = nrows_x - it*mmq_y - 1;
|
||||
const int tile_y_max_j = col_diff - jt*mmq_x - 1;
|
||||
|
||||
const int offset_x = (wt/sample_ratio)*stride_sample_x + (zt/channel_ratio)*stride_channel_x + it*mmq_y*stride_row_x;
|
||||
const int offset_x = fastdiv(wt, sample_ratio)*stride_sample_x + fastdiv(zt, channel_ratio)*stride_channel_x + it*mmq_y*stride_row_x;
|
||||
|
||||
constexpr bool fixup = true; // Last index writes its data to fixup buffer to avoid data races with other blocks.
|
||||
mul_mat_q_process_tile<type, mmq_x, need_check, fixup>
|
||||
@@ -3717,46 +3783,37 @@ static __global__ void mul_mat_q(
|
||||
}
|
||||
|
||||
template <ggml_type type, int mmq_x, bool need_check>
|
||||
static __global__ void mul_mat_q_stream_k_fixup(const int32_t * ids_dst,
|
||||
const int32_t * expert_bounds,
|
||||
float * __restrict__ dst,
|
||||
const float * __restrict__ tmp_last_tile,
|
||||
const int ncols_x,
|
||||
const int nrows_x,
|
||||
const int ncols_dst,
|
||||
const size_t stride_col_dst,
|
||||
const int nchannels_y,
|
||||
const size_t stride_channel_dst,
|
||||
const int nsamples_y,
|
||||
const size_t stride_sample_dst,
|
||||
const int ncols_max) {
|
||||
constexpr int mmq_y = get_mmq_y_device();
|
||||
constexpr int qk = ggml_cuda_type_traits<type>::qk;
|
||||
constexpr int ITER_K = get_iter_k(type);
|
||||
__launch_bounds__(ggml_cuda_get_physical_warp_size()*mmq_get_nwarps_device()/2, 1)
|
||||
static __global__ void mul_mat_q_stream_k_fixup(
|
||||
const int32_t * __restrict__ ids_dst, const int32_t * __restrict__ expert_bounds, float * __restrict__ dst,
|
||||
float * __restrict__ tmp_last_tile, const uint3 blocks_per_ne00, const int nrows_x, const int ncols_dst,
|
||||
const int stride_col_dst, const uint3 nchannels_y, const int stride_channel_dst, const uint3 nsamples_y,
|
||||
const int stride_sample_dst, const uint3 ntx) {
|
||||
constexpr int mmq_y = get_mmq_y_device();
|
||||
constexpr int qk = ggml_cuda_type_traits<type>::qk;
|
||||
constexpr int ITER_K = get_iter_k(type);
|
||||
constexpr int blocks_per_iter = ITER_K / qk;
|
||||
|
||||
constexpr int blocks_per_iter = ITER_K / qk;
|
||||
const int64_t blocks_per_ne00 = ncols_x / qk;
|
||||
|
||||
constexpr int nwarps = mmq_get_nwarps_device();
|
||||
constexpr int nwarps = mmq_get_nwarps_device()/2;
|
||||
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
|
||||
|
||||
float sum[mmq_x*mmq_y / (nwarps*warp_size)] = {0.0f};
|
||||
float sum[mmq_x / nwarps] = {0.0f};
|
||||
const int i = blockIdx.y*warp_size + threadIdx.x;
|
||||
|
||||
const int ntx = (ncols_max + mmq_x - 1) / mmq_x;
|
||||
const int nty = (nrows_x + mmq_y - 1) / mmq_y;
|
||||
const int nty = (nrows_x + mmq_y - 1) / mmq_y;
|
||||
|
||||
const int bidx0 = blockIdx.x;
|
||||
|
||||
// kbc == k block continuous, current index in continuous ijk space.
|
||||
int64_t kbc0 = (int64_t) bidx0 *nsamples_y*nchannels_y*ntx*nty*blocks_per_ne00 / gridDim.x;
|
||||
int64_t kbc0_stop = (int64_t)(bidx0 + 1)*nsamples_y*nchannels_y*ntx*nty*blocks_per_ne00 / gridDim.x;
|
||||
int kbc0 = int64_t(blockIdx.x) *(nsamples_y.z*nchannels_y.z*ntx.z*nty*blocks_per_ne00.z) / gridDim.x;
|
||||
int kbc0_stop = int64_t(blockIdx.x + 1)*(nsamples_y.z*nchannels_y.z*ntx.z*nty*blocks_per_ne00.z) / gridDim.x;
|
||||
|
||||
kbc0 -= (kbc0 % blocks_per_ne00) % blocks_per_iter;
|
||||
kbc0_stop -= (kbc0_stop % blocks_per_ne00) % blocks_per_iter;
|
||||
kbc0 -= fastmodulo(kbc0, blocks_per_ne00) % blocks_per_iter;
|
||||
kbc0_stop -= fastmodulo(kbc0_stop, blocks_per_ne00) % blocks_per_iter;
|
||||
|
||||
const bool did_not_have_any_data = kbc0 == kbc0_stop;
|
||||
const bool wrote_beginning_of_tile = kbc0 % blocks_per_ne00 == 0;
|
||||
const bool did_not_write_last = kbc0/blocks_per_ne00 == kbc0_stop/blocks_per_ne00 && kbc0_stop % blocks_per_ne00 != 0;
|
||||
const bool wrote_beginning_of_tile = fastmodulo(kbc0, blocks_per_ne00) == 0;
|
||||
const bool did_not_write_last = fastdiv(kbc0, blocks_per_ne00) == fastdiv(kbc0_stop, blocks_per_ne00) && fastmodulo(kbc0_stop, blocks_per_ne00) != 0;
|
||||
if (did_not_have_any_data || wrote_beginning_of_tile || did_not_write_last) {
|
||||
return;
|
||||
}
|
||||
@@ -3765,11 +3822,11 @@ static __global__ void mul_mat_q_stream_k_fixup(const int32_t * ids_dst,
|
||||
|
||||
// Iterate over previous blocks and sum up partial sums written to fixup buffer.
|
||||
// All CUDA blocks that get here must have a previous block that needs a fixup.
|
||||
int64_t bidx = bidx0 - 1;
|
||||
int64_t kbc_stop = kbc0;
|
||||
int bidx = bidx0 - 1;
|
||||
int kbc_stop = kbc0;
|
||||
while(true) {
|
||||
int64_t kbc = bidx*nsamples_y*nchannels_y*ntx*nty*blocks_per_ne00 / gridDim.x;
|
||||
kbc -= (kbc % blocks_per_ne00) % blocks_per_iter;
|
||||
int kbc = int64_t(bidx)*(nsamples_y.z*nchannels_y.z*ntx.z*nty*blocks_per_ne00.z) / gridDim.x;
|
||||
kbc -= fastmodulo(kbc, blocks_per_ne00) % blocks_per_iter;
|
||||
|
||||
if (kbc == kbc_stop) { // Did not have any data.
|
||||
bidx--;
|
||||
@@ -3779,20 +3836,16 @@ static __global__ void mul_mat_q_stream_k_fixup(const int32_t * ids_dst,
|
||||
|
||||
any_fixup = true;
|
||||
|
||||
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
|
||||
const int j = j0 + threadIdx.y;
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < mmq_y; i0 += warp_size) {
|
||||
const int i = i0 + threadIdx.x;
|
||||
|
||||
sum[(j0/nwarps) * (mmq_y/warp_size) + i0/warp_size] += tmp_last_tile[bidx*(mmq_x*mmq_y) + j*mmq_y + i];
|
||||
}
|
||||
sum[j0/nwarps] += tmp_last_tile[bidx*(mmq_x*mmq_y) + j*mmq_y + i];
|
||||
}
|
||||
|
||||
// If this block started in a previous tile we are done and don't need to combine additional partial results.
|
||||
if (kbc % blocks_per_ne00 == 0 || kbc/blocks_per_ne00 < kbc0/blocks_per_ne00) {
|
||||
if (fastmodulo(kbc, blocks_per_ne00) == 0 || fastdiv(kbc, blocks_per_ne00) < fastdiv(kbc0, blocks_per_ne00)) {
|
||||
break;
|
||||
}
|
||||
bidx--;
|
||||
@@ -3803,14 +3856,16 @@ static __global__ void mul_mat_q_stream_k_fixup(const int32_t * ids_dst,
|
||||
return;
|
||||
}
|
||||
|
||||
int tmp = kbc0;
|
||||
const int it = tmp / (nsamples_y*nchannels_y*ntx*blocks_per_ne00);
|
||||
tmp -= it * (nsamples_y*nchannels_y*ntx*blocks_per_ne00);
|
||||
const int wt = tmp / (nchannels_y*ntx*blocks_per_ne00);
|
||||
tmp -= wt * (nchannels_y*ntx*blocks_per_ne00);
|
||||
const int zt = tmp / (ntx*blocks_per_ne00);
|
||||
tmp -= zt * (ntx*blocks_per_ne00);
|
||||
const int jt = tmp / blocks_per_ne00;
|
||||
int tmp = fastdiv(kbc0, blocks_per_ne00);
|
||||
uint2 tmp2 = fast_div_modulo(tmp, ntx);
|
||||
const int jt = tmp2.y;
|
||||
tmp = tmp2.x;
|
||||
tmp2 = fast_div_modulo(tmp, nchannels_y);
|
||||
const int zt = tmp2.y;
|
||||
tmp = tmp2.x;
|
||||
tmp2 = fast_div_modulo(tmp, nsamples_y);
|
||||
const int wt = tmp2.y;
|
||||
const int it = tmp2.x;
|
||||
|
||||
if (!ids_dst) {
|
||||
const int offset_dst = wt*stride_sample_dst + zt*stride_channel_dst + jt*mmq_x*stride_col_dst + it*mmq_y;
|
||||
@@ -3818,6 +3873,9 @@ static __global__ void mul_mat_q_stream_k_fixup(const int32_t * ids_dst,
|
||||
|
||||
const int i_max = nrows_x - it*mmq_y - 1;
|
||||
const int j_max = ncols_dst - jt*mmq_x - 1;
|
||||
if (need_check && i > i_max) {
|
||||
return;
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
|
||||
@@ -3827,16 +3885,7 @@ static __global__ void mul_mat_q_stream_k_fixup(const int32_t * ids_dst,
|
||||
return;
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < mmq_y; i0 += warp_size) {
|
||||
const int i = i0 + threadIdx.x;
|
||||
|
||||
if (need_check && i > i_max) {
|
||||
continue;
|
||||
}
|
||||
|
||||
dst[j*stride_col_dst + i] += sum[(j0/nwarps) * (mmq_y/warp_size) + i0/warp_size];
|
||||
}
|
||||
dst[j*stride_col_dst + i] += sum[j0/nwarps];
|
||||
}
|
||||
return;
|
||||
}
|
||||
@@ -3856,6 +3905,9 @@ static __global__ void mul_mat_q_stream_k_fixup(const int32_t * ids_dst,
|
||||
|
||||
const int i_max = nrows_x - it*mmq_y - 1;
|
||||
const int j_max = col_diff - jt*mmq_x - 1;
|
||||
if (need_check && i > i_max) {
|
||||
return;
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
|
||||
@@ -3865,16 +3917,7 @@ static __global__ void mul_mat_q_stream_k_fixup(const int32_t * ids_dst,
|
||||
return;
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < mmq_y; i0 += warp_size) {
|
||||
const int i = i0 + threadIdx.x;
|
||||
|
||||
if (need_check && i > i_max) {
|
||||
continue;
|
||||
}
|
||||
|
||||
dst[ids_dst_shared[j]*stride_col_dst + i] += sum[(j0/nwarps) * (mmq_y/warp_size) + i0/warp_size];
|
||||
}
|
||||
dst[ids_dst_shared[j]*stride_col_dst + i] += sum[j0/nwarps];
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3922,29 +3965,44 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
|
||||
const int channel_ratio = args.nchannels_y / args.nchannels_x;
|
||||
const int sample_ratio = args.nsamples_y / args.nsamples_x;
|
||||
|
||||
const uint3 blocks_per_ne00_fd = init_fastdiv_values(args.ncols_x / ggml_cuda_type_traits<type>::qk);
|
||||
const uint3 ntx_fd = init_fastdiv_values(ntx);
|
||||
const uint3 nchannels_y_fd = init_fastdiv_values(args.nchannels_y);
|
||||
const uint3 nsamples_y_fd = init_fastdiv_values(args.nsamples_y);
|
||||
const uint3 channel_ratio_fd = init_fastdiv_values(channel_ratio);
|
||||
const uint3 sample_ratio_fd = init_fastdiv_values(sample_ratio);
|
||||
|
||||
if (!args.use_stream_k) {
|
||||
if (args.nrows_x % mmq_y == 0) {
|
||||
constexpr bool need_check = false;
|
||||
mul_mat_q<type, mmq_x, need_check><<<block_nums_xy_tiling, block_dims, nbytes_shared, stream>>>
|
||||
(args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, nullptr,
|
||||
args.ncols_x, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
|
||||
channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
|
||||
sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst,
|
||||
args.ncols_max);
|
||||
blocks_per_ne00_fd, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
|
||||
channel_ratio_fd, nchannels_y_fd, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
|
||||
sample_ratio_fd, nsamples_y_fd, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst,
|
||||
ntx_fd);
|
||||
} else {
|
||||
constexpr bool need_check = true;
|
||||
mul_mat_q<type, mmq_x, need_check><<<block_nums_xy_tiling, block_dims, nbytes_shared, stream>>>
|
||||
(args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, nullptr,
|
||||
args.ncols_x, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
|
||||
channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
|
||||
sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst,
|
||||
args.ncols_max);
|
||||
blocks_per_ne00_fd, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
|
||||
channel_ratio_fd, nchannels_y_fd, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
|
||||
sample_ratio_fd, nsamples_y_fd, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst,
|
||||
ntx_fd);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
const dim3 block_nums_stream_k(nsm, 1, 1);
|
||||
const bool fixup_needed = ntx*nty*ntzw % nsm != 0;
|
||||
// For the stream-k kernel it is possible to run it with tiling by setting the number of CUDA blocks equal to the number of tiles.
|
||||
// This is worthwhile if the efficiency of tiling is high and skipping the fixup kernel is more important.
|
||||
const int ntiles_dst = ntx * nty * ntzw;
|
||||
const int tiles_nwaves = (ntiles_dst + nsm - 1) / nsm;
|
||||
const int tiles_efficiency_percent = 100 * ntiles_dst / (nsm*tiles_nwaves);
|
||||
const dim3 block_nums_stream_k(GGML_CUDA_CC_IS_NVIDIA(cc) && tiles_efficiency_percent >= 90 ? ntiles_dst : nsm, 1, 1);
|
||||
|
||||
GGML_ASSERT(ntiles_dst * blocks_per_ne00_fd.z < (1 << 30)); // Assert that variable kbc will not overflow.
|
||||
|
||||
const bool fixup_needed = ntiles_dst % block_nums_stream_k.x != 0;
|
||||
|
||||
ggml_cuda_pool & pool = ctx.pool(id);
|
||||
ggml_cuda_pool_alloc<float> tmp_fixup(pool);
|
||||
@@ -3952,40 +4010,45 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
|
||||
tmp_fixup.alloc(block_nums_stream_k.x * mmq_x*mmq_y);
|
||||
}
|
||||
|
||||
const dim3 block_nums_fixup(block_nums_stream_k.x, mmq_y/warp_size, 1);
|
||||
const dim3 block_dims_fixup(block_dims.x, block_dims.y/2, block_dims.z);
|
||||
|
||||
if (args.nrows_x % mmq_y == 0) {
|
||||
constexpr bool need_check = false;
|
||||
mul_mat_q<type, mmq_x, need_check><<<block_nums_stream_k, block_dims, nbytes_shared, stream>>>
|
||||
(args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr,
|
||||
args.ncols_x, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
|
||||
channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
|
||||
sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst,
|
||||
args.ncols_max);
|
||||
blocks_per_ne00_fd, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
|
||||
channel_ratio_fd, nchannels_y_fd, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
|
||||
sample_ratio_fd, nsamples_y_fd, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst,
|
||||
ntx_fd);
|
||||
|
||||
if (!fixup_needed) {
|
||||
return;
|
||||
}
|
||||
|
||||
mul_mat_q_stream_k_fixup<type, mmq_x, need_check><<<block_nums_stream_k, block_dims, 0, stream>>>
|
||||
(args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, args.ncols_x, args.nrows_x, args.ncols_dst,
|
||||
args.nrows_dst, args.nchannels_y, args.stride_channel_dst, args.nsamples_y, args.stride_sample_dst,
|
||||
args.ncols_max);
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
mul_mat_q_stream_k_fixup<type, mmq_x, need_check><<<block_nums_fixup, block_dims_fixup, 0, stream>>>
|
||||
(args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, blocks_per_ne00_fd, args.nrows_x, args.ncols_dst,
|
||||
args.nrows_dst, nchannels_y_fd, args.stride_channel_dst, nsamples_y_fd, args.stride_sample_dst,
|
||||
ntx_fd);
|
||||
} else {
|
||||
constexpr bool need_check = true;
|
||||
mul_mat_q<type, mmq_x, need_check><<<block_nums_stream_k, block_dims, nbytes_shared, stream>>>
|
||||
(args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr,
|
||||
args.ncols_x, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
|
||||
channel_ratio, args.nchannels_y, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
|
||||
sample_ratio, args.nsamples_y, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst,
|
||||
args.ncols_max);
|
||||
blocks_per_ne00_fd, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
|
||||
channel_ratio_fd, nchannels_y_fd, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
|
||||
sample_ratio_fd, nsamples_y_fd, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst,
|
||||
ntx_fd);
|
||||
|
||||
if (!fixup_needed) {
|
||||
return;
|
||||
}
|
||||
|
||||
mul_mat_q_stream_k_fixup<type, mmq_x, need_check><<<block_nums_stream_k, block_dims, 0, stream>>>
|
||||
(args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, args.ncols_x, args.nrows_x, args.ncols_dst,
|
||||
args.nrows_dst, args.nchannels_y, args.stride_channel_dst, args.nsamples_y, args.stride_sample_dst,
|
||||
args.ncols_max);
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
mul_mat_q_stream_k_fixup<type, mmq_x, need_check><<<block_nums_fixup, block_dims_fixup, 0, stream>>>
|
||||
(args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, blocks_per_ne00_fd, args.nrows_x, args.ncols_dst,
|
||||
args.nrows_dst, nchannels_y_fd, args.stride_channel_dst, nsamples_y_fd, args.stride_sample_dst,
|
||||
ntx_fd);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -115,6 +115,7 @@ static constexpr __host__ __device__ int get_mmvq_mmid_max_batch_pascal_older(gg
|
||||
case GGML_TYPE_IQ4_NL: return 6;
|
||||
case GGML_TYPE_IQ4_XS: return 5;
|
||||
case GGML_TYPE_MXFP4: return 4;
|
||||
case GGML_TYPE_NVFP4: return 4;
|
||||
case GGML_TYPE_Q2_K: return 4;
|
||||
case GGML_TYPE_Q3_K: return 4;
|
||||
case GGML_TYPE_Q4_0: return 6;
|
||||
@@ -135,6 +136,7 @@ static constexpr __host__ __device__ int get_mmvq_mmid_max_batch_turing_plus(ggm
|
||||
case GGML_TYPE_IQ3_S: return 6;
|
||||
case GGML_TYPE_IQ3_XXS: return 7;
|
||||
case GGML_TYPE_MXFP4: return 7;
|
||||
case GGML_TYPE_NVFP4: return 8;
|
||||
case GGML_TYPE_Q2_K: return 7;
|
||||
case GGML_TYPE_Q3_K: return 5;
|
||||
default: return MMVQ_MAX_BATCH_SIZE;
|
||||
@@ -221,6 +223,7 @@ static constexpr __host__ __device__ int get_mmvq_mmid_max_batch_rdna4(ggml_type
|
||||
case GGML_TYPE_IQ4_NL: return 7;
|
||||
case GGML_TYPE_IQ4_XS: return 5;
|
||||
case GGML_TYPE_MXFP4: return 5;
|
||||
case GGML_TYPE_NVFP4: return 5;
|
||||
case GGML_TYPE_Q3_K: return 4;
|
||||
case GGML_TYPE_Q4_0: return 7;
|
||||
case GGML_TYPE_Q4_1: return 7;
|
||||
|
||||
@@ -70,6 +70,102 @@ __device__ __forceinline__ uint8_t compute_e8m0_scale(float amax) {
|
||||
return static_cast<uint8_t>(biased);
|
||||
}
|
||||
|
||||
|
||||
static __global__ void quantize_mmq_nvfp4(
|
||||
const float * __restrict__ x, const int32_t * __restrict__ ids, void * __restrict__ vy,
|
||||
const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03,
|
||||
const int64_t ne0, const int64_t ne1, const int64_t ne2) {
|
||||
#if defined(BLACKWELL_MMA_AVAILABLE)
|
||||
|
||||
const int64_t i0_base = ((int64_t) blockDim.x * blockIdx.y + threadIdx.x) * QK_NVFP4_SUB;
|
||||
if (i0_base >= ne0) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int64_t i1 = blockIdx.x;
|
||||
const int64_t i2 = blockIdx.z % ne2;
|
||||
const int64_t i3 = blockIdx.z / ne2;
|
||||
const int64_t i01 = ids ? ids[i1] : i1;
|
||||
const int64_t k_block = i0_base / QK_K;
|
||||
const int64_t blocks_per_col = (ne0 + QK_K - 1) / QK_K;
|
||||
if (k_block >= blocks_per_col) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int64_t ib = blockIdx.z * ((int64_t) blocks_per_col * ne1) + k_block * ne1 + blockIdx.x;
|
||||
block_fp4_mmq * y = (block_fp4_mmq *) vy;
|
||||
block_fp4_mmq * yb = y + ib;
|
||||
|
||||
const int sub = (i0_base % QK_K) / QK_NVFP4_SUB;
|
||||
|
||||
float vals_raw[QK_NVFP4_SUB];
|
||||
float amax_raw = 0.0f;
|
||||
const int64_t base_idx = i3 * s03 + i2 * s02 + i01 * s01;
|
||||
#pragma unroll
|
||||
for (int k = 0; k < QK_NVFP4_SUB; k++) {
|
||||
const int64_t i00 = i0_base + k;
|
||||
if (i00 < ne00) {
|
||||
const float v = x[base_idx + i00];
|
||||
vals_raw[k] = v;
|
||||
amax_raw = fmaxf(amax_raw, fabsf(v));
|
||||
} else {
|
||||
vals_raw[k] = 0.0f;
|
||||
}
|
||||
}
|
||||
|
||||
static constexpr int test_offsets[5] = { 0, -1, 1, -2, 2};
|
||||
const int first_fp8_code = (int) ggml_cuda_fp32_to_ue4m3(amax_raw / 6.0f);
|
||||
|
||||
float best_err = FLT_MAX;
|
||||
uint8_t fp8_code = 0;
|
||||
float subblock_scale = 0.0f;
|
||||
|
||||
#pragma unroll // Check +/- 2 to find best code to reduce NVFP4 activation loss. Negligible overhead on Blackwell.
|
||||
for (int i = 0; i < 5; i++) {
|
||||
const int test_code = first_fp8_code + test_offsets[i];
|
||||
if (test_code < 0 || test_code > 0x7e) {
|
||||
continue;
|
||||
}
|
||||
const uint8_t code = (uint8_t) test_code;
|
||||
const float test_scale = ggml_cuda_ue4m3_to_fp32(code);
|
||||
const float test_inv_scale = test_scale > 0.0f ? 0.5f / test_scale : 0.0f;
|
||||
float cur_err = 0.0f;
|
||||
#pragma unroll
|
||||
for (int k = 0; k < QK_NVFP4_SUB; ++k) {
|
||||
const float v = vals_raw[k];
|
||||
const uint8_t q = ggml_cuda_float_to_fp4_e2m1(v, test_inv_scale);
|
||||
const float err_diff = fabsf(v) - fabsf(kvalues_mxfp4[q & 0x7]) * test_scale;
|
||||
cur_err = fmaf(err_diff, err_diff, cur_err);
|
||||
}
|
||||
|
||||
if (cur_err < best_err) {
|
||||
best_err = cur_err;
|
||||
fp8_code = test_code;
|
||||
subblock_scale = test_scale;
|
||||
}
|
||||
}
|
||||
|
||||
const float inv_scale = subblock_scale > 0.0f ? 0.5f / subblock_scale : 0.0f;
|
||||
uint32_t q0 = 0;
|
||||
uint32_t q1 = 0;
|
||||
#pragma unroll // this is faster than the previous __nv_fp4x4_e2m1
|
||||
for (int k = 0; k < QK_NVFP4_SUB / 4; ++k) {
|
||||
q0 |= (uint32_t) ggml_cuda_float_to_fp4_e2m1(vals_raw[k + 0], inv_scale) << (8 * k);
|
||||
q0 |= (uint32_t) ggml_cuda_float_to_fp4_e2m1(vals_raw[k + 8], inv_scale) << (8 * k + 4);
|
||||
q1 |= (uint32_t) ggml_cuda_float_to_fp4_e2m1(vals_raw[k + 4], inv_scale) << (8 * k);
|
||||
q1 |= (uint32_t) ggml_cuda_float_to_fp4_e2m1(vals_raw[k + 12], inv_scale) << (8 * k + 4);
|
||||
}
|
||||
|
||||
uint32_t * yqs = reinterpret_cast<uint32_t *>(yb->qs);
|
||||
yqs[2 * sub + 0] = q0;
|
||||
yqs[2 * sub + 1] = q1;
|
||||
reinterpret_cast<uint8_t *>(yb->d4)[sub] = fp8_code;
|
||||
#else
|
||||
NO_DEVICE_CODE; // This is for Blackwell NVFP4 activations only.
|
||||
#endif // defined(BLACKWELL_MMA_AVAILABLE)
|
||||
|
||||
}
|
||||
|
||||
// quantize values in the format mxfp4 is stored which is interleaved nibbles
|
||||
// i.e. a block a0-a31 is represented as a0a16,a1a17 ...a15a31
|
||||
static __global__ void quantize_mmq_mxfp4(const float * __restrict__ x,
|
||||
@@ -316,28 +412,32 @@ void quantize_mmq_q8_1_cuda(
|
||||
}
|
||||
}
|
||||
|
||||
void quantize_mmq_mxfp4_cuda(const float * x,
|
||||
const int32_t * ids,
|
||||
void * vy,
|
||||
[[maybe_unused]] const ggml_type type_src0,
|
||||
const int64_t ne00,
|
||||
const int64_t s01,
|
||||
const int64_t s02,
|
||||
const int64_t s03,
|
||||
const int64_t ne0,
|
||||
const int64_t ne1,
|
||||
const int64_t ne2,
|
||||
const int64_t ne3,
|
||||
cudaStream_t stream) {
|
||||
GGML_ASSERT(ne0 % (2 * QK_MXFP4) == 0);
|
||||
void quantize_mmq_fp4_cuda(
|
||||
const float * x, const int32_t * ids, void * vy, const ggml_type type_src0,
|
||||
const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03,
|
||||
const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3, cudaStream_t stream) {
|
||||
GGML_ASSERT(type_src0 == GGML_TYPE_MXFP4 || type_src0 == GGML_TYPE_NVFP4);
|
||||
GGML_ASSERT(ne0 > 0);
|
||||
|
||||
constexpr int nwarps = 8;
|
||||
constexpr int vals_per_warp = 2 * QK_MXFP4;
|
||||
constexpr int vals_per_block = nwarps * vals_per_warp;
|
||||
if (type_src0 == GGML_TYPE_NVFP4) {
|
||||
GGML_ASSERT(ne00 % QK_NVFP4 == 0);
|
||||
constexpr int nvfp4_block_size = 128;
|
||||
const int64_t block_num_y = (ne0 + QK_NVFP4_SUB * nvfp4_block_size - 1) / (QK_NVFP4_SUB * nvfp4_block_size);
|
||||
const dim3 block_size(nvfp4_block_size, 1, 1);
|
||||
const dim3 num_blocks(ne1, block_num_y, ne2 * ne3);
|
||||
quantize_mmq_nvfp4<<<num_blocks, block_size, 0, stream>>>(
|
||||
x, ids, vy, ne00, s01, s02, s03, ne0, ne1, ne2);
|
||||
} else {
|
||||
GGML_ASSERT(ne0 % (2 * QK_MXFP4) == 0);
|
||||
|
||||
const int64_t block_num_y = (ne0 + vals_per_block - 1) / vals_per_block;
|
||||
const dim3 num_blocks(ne1, block_num_y, ne2 * ne3);
|
||||
const dim3 block_size(WARP_SIZE, nwarps, 1);
|
||||
constexpr int nwarps = 8;
|
||||
constexpr int vals_per_warp = 2 * QK_MXFP4;
|
||||
constexpr int vals_per_block = nwarps * vals_per_warp;
|
||||
|
||||
quantize_mmq_mxfp4<<<num_blocks, block_size, 0, stream>>>(x, ids, vy, ne00, s01, s02, s03, ne0, ne1, ne2);
|
||||
const int64_t block_num_y = (ne0 + vals_per_block - 1) / vals_per_block;
|
||||
const dim3 num_blocks(ne1, block_num_y, ne2 * ne3);
|
||||
const dim3 block_size(WARP_SIZE, nwarps, 1);
|
||||
|
||||
quantize_mmq_mxfp4<<<num_blocks, block_size, 0, stream>>>(x, ids, vy, ne00, s01, s02, s03, ne0, ne1, ne2);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -26,7 +26,7 @@ void quantize_mmq_q8_1_cuda(
|
||||
ggml_type type_src0, int64_t ne00, int64_t s01, int64_t s02, int64_t s03,
|
||||
int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3, cudaStream_t stream);
|
||||
|
||||
void quantize_mmq_mxfp4_cuda(const float * x,
|
||||
void quantize_mmq_fp4_cuda(const float * x,
|
||||
const int32_t * ids,
|
||||
void * vy,
|
||||
ggml_type type_src0,
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
|
||||
template <bool apply_silu, size_t split_d_inner, size_t d_conv>
|
||||
static __global__ void ssm_conv_f32(const float * __restrict__ src0, const float * __restrict__ src1,
|
||||
const float * __restrict__ bias,
|
||||
const int src0_nb0, const int src0_nb1, const int src0_nb2, const int src1_nb1,
|
||||
float * __restrict__ dst, const int dst_nb0, const int dst_nb1, const int dst_nb2,
|
||||
const int64_t n_t) {
|
||||
@@ -27,6 +28,8 @@ static __global__ void ssm_conv_f32(const float * __restrict__ src0, const float
|
||||
w[j] = w_block[tid * stride_w + j];
|
||||
}
|
||||
|
||||
float b = bias != nullptr ? bias[bidy * split_d_inner + tid] : 0.0f;
|
||||
|
||||
for (int64_t i = 0; i < n_t; i++) {
|
||||
float sumf = 0.0f;
|
||||
|
||||
@@ -42,12 +45,14 @@ static __global__ void ssm_conv_f32(const float * __restrict__ src0, const float
|
||||
for (size_t j = 0; j < d_conv; j++) {
|
||||
sumf += x[(i + j) % d_conv] * w[j];
|
||||
}
|
||||
sumf += b;
|
||||
y_block[i * stride_y + tid] = apply_silu ? ggml_cuda_op_silu_single(sumf) : sumf;
|
||||
}
|
||||
}
|
||||
|
||||
template <bool apply_silu, size_t split_d_inner, size_t d_conv, int64_t split_n_t>
|
||||
static __global__ void ssm_conv_long_token_f32(const float * __restrict__ src0, const float * __restrict__ src1,
|
||||
const float * __restrict__ bias,
|
||||
const int src0_nb0, const int src0_nb1, const int src0_nb2,
|
||||
const int src1_nb1, float * __restrict__ dst, const int dst_nb0,
|
||||
const int dst_nb1, const int dst_nb2, const int64_t n_t) {
|
||||
@@ -97,6 +102,8 @@ static __global__ void ssm_conv_long_token_f32(const float * __restrict__ src0,
|
||||
w[j] = w_block[tid * stride_w + j];
|
||||
}
|
||||
|
||||
float b = bias != nullptr ? bias[bidy * split_d_inner + tid] : 0.0f;
|
||||
|
||||
// Compute from shared memory
|
||||
for (int64_t i = 0; i < local_n_t; i++) {
|
||||
float sumf = 0.0f;
|
||||
@@ -104,12 +111,13 @@ static __global__ void ssm_conv_long_token_f32(const float * __restrict__ src0,
|
||||
for (size_t j = 0; j < d_conv; j++) {
|
||||
sumf += smem[tid * n_cols + i + j] * w[j];
|
||||
}
|
||||
sumf += b;
|
||||
y_block[i * stride_y + tid] = apply_silu ? ggml_cuda_op_silu_single(sumf) : sumf;
|
||||
}
|
||||
}
|
||||
|
||||
template <bool apply_silu>
|
||||
static void ssm_conv_f32_cuda(const float * src0, const float * src1, const int src0_nb0, const int src0_nb1,
|
||||
static void ssm_conv_f32_cuda(const float * src0, const float * src1, const float * bias, const int src0_nb0, const int src0_nb1,
|
||||
const int src0_nb2, const int src1_nb1, float * dst, const int dst_nb0, const int dst_nb1,
|
||||
const int dst_nb2, const int64_t nc, const int64_t nr, const int64_t n_t,
|
||||
const int64_t n_s, cudaStream_t stream) {
|
||||
@@ -120,14 +128,14 @@ static void ssm_conv_f32_cuda(const float * src0, const float * src1, const int
|
||||
constexpr int kNC = decltype(NC)::value;
|
||||
if (n_t <= 32) {
|
||||
const dim3 blocks(n_s, (nr + threads - 1) / threads, 1);
|
||||
ssm_conv_f32<apply_silu, threads, kNC><<<blocks, threads, 0, stream>>>(src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1,
|
||||
ssm_conv_f32<apply_silu, threads, kNC><<<blocks, threads, 0, stream>>>(src0, src1, bias, src0_nb0, src0_nb1, src0_nb2, src1_nb1,
|
||||
dst, dst_nb0, dst_nb1, dst_nb2, n_t);
|
||||
} else {
|
||||
const int64_t split_n_t = 32;
|
||||
dim3 blocks(n_s, (nr + threads - 1) / threads, (n_t + split_n_t - 1) / split_n_t);
|
||||
const size_t smem_size = threads * (kNC - 1 + split_n_t) * sizeof(float);
|
||||
ssm_conv_long_token_f32<apply_silu, threads, kNC, split_n_t><<<blocks, threads, smem_size, stream>>>(
|
||||
src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1, dst, dst_nb0, dst_nb1, dst_nb2, n_t);
|
||||
src0, src1, bias, src0_nb0, src0_nb1, src0_nb2, src1_nb1, dst, dst_nb0, dst_nb1, dst_nb2, n_t);
|
||||
}
|
||||
};
|
||||
|
||||
@@ -140,11 +148,18 @@ static void ssm_conv_f32_cuda(const float * src0, const float * src1, const int
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_cuda_op_ssm_conv(ggml_backend_cuda_context & ctx, ggml_tensor * dst, ggml_tensor * silu_dst) {
|
||||
void ggml_cuda_op_ssm_conv(ggml_backend_cuda_context & ctx, ggml_tensor * dst, ggml_tensor * bias_add_node, ggml_tensor * silu_dst) {
|
||||
const struct ggml_tensor * src0 = dst->src[0]; // conv_x
|
||||
const struct ggml_tensor * src1 = dst->src[1]; // conv1d.weight
|
||||
const bool fuse_bias = bias_add_node != nullptr;
|
||||
const bool fuse_silu = silu_dst != nullptr;
|
||||
|
||||
// bias always comes with silu.
|
||||
GGML_ASSERT(!fuse_bias || fuse_silu);
|
||||
|
||||
// The bias (when fused) is the non-conv operand of the ADD node.
|
||||
const struct ggml_tensor * bias = fuse_bias ? (bias_add_node->src[0] == dst ? bias_add_node->src[1] : bias_add_node->src[0]) : nullptr;
|
||||
|
||||
// When fusing, write to silu_dst (the node downstream references).
|
||||
const struct ggml_tensor * out = fuse_silu ? silu_dst : dst;
|
||||
|
||||
@@ -160,16 +175,23 @@ void ggml_cuda_op_ssm_conv(ggml_backend_cuda_context & ctx, ggml_tensor * dst, g
|
||||
|
||||
const float * src0_d = (const float *) src0->data;
|
||||
const float * src1_d = (const float *) src1->data;
|
||||
const float * bias_d = fuse_bias ? (const float *) bias->data : nullptr;
|
||||
float * dst_d = (float *) out->data;
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(out->type == GGML_TYPE_F32);
|
||||
if (fuse_bias) {
|
||||
GGML_ASSERT(bias->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(ggml_is_contiguous(bias));
|
||||
GGML_ASSERT(ggml_nelements(bias) == nr);
|
||||
}
|
||||
|
||||
if (fuse_silu) {
|
||||
ssm_conv_f32_cuda<true>(src0_d, src1_d, src0->nb[0], src0->nb[1], src0->nb[2], src1->nb[1], dst_d, out->nb[0], out->nb[1],
|
||||
ssm_conv_f32_cuda<true>(src0_d, src1_d, bias_d, src0->nb[0], src0->nb[1], src0->nb[2], src1->nb[1], dst_d, out->nb[0], out->nb[1],
|
||||
out->nb[2], nc, nr, n_t, n_s, stream);
|
||||
} else {
|
||||
ssm_conv_f32_cuda<false>(src0_d, src1_d, src0->nb[0], src0->nb[1], src0->nb[2], src1->nb[1], dst_d, out->nb[0], out->nb[1],
|
||||
ssm_conv_f32_cuda<false>(src0_d, src1_d, bias_d, src0->nb[0], src0->nb[1], src0->nb[2], src1->nb[1], dst_d, out->nb[0], out->nb[1],
|
||||
out->nb[2], nc, nr, n_t, n_s, stream);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,3 +1,3 @@
|
||||
#include "common.cuh"
|
||||
|
||||
void ggml_cuda_op_ssm_conv(ggml_backend_cuda_context & ctx, ggml_tensor * dst, ggml_tensor * silu_dst = nullptr);
|
||||
void ggml_cuda_op_ssm_conv(ggml_backend_cuda_context & ctx, ggml_tensor * dst, ggml_tensor * bias_add_node = nullptr, ggml_tensor * silu_dst = nullptr);
|
||||
|
||||
@@ -2,4 +2,5 @@
|
||||
|
||||
#include "../fattn-mma-f16.cuh"
|
||||
|
||||
DECL_FATTN_MMA_F16_CASE(320, 256, 1, 32);
|
||||
DECL_FATTN_MMA_F16_CASE(576, 512, 1, 32);
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user