Compare commits

...

147 Commits
b6350 ... b6497

Author SHA1 Message Date
David Ribeiro Alves
cd08fc3ecc common : Fix corrupted memory error on json grammar initialization (#16038)
Initalizing RESERVED_NAME in is_reserved_name() is not thread
safe and leads to corrupted memory when used from multiple threads
as can be seen in the asan trace below. This fixes the initialization
to make it thread-safe.

    #0 0x000100abd018 in std::__1::pair<std::__1::__hash_iterator<std::__1::__hash_node<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>, void*>*>, bool> std::__1::__hash_table<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>, std::__1::hash<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>>, std::__1::equal_to<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>>, std::__1::allocator<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>>>::__emplace_unique_key_args<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>> const&>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>> const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>> const&) __hash_table:1565
    #1 0x000100ab0320 in SchemaConverter::visit(nlohmann::json_abi_v3_12_0::basic_json<nlohmann::json_abi_v3_12_0::ordered_map, std::__1::vector, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>, bool, long long, unsigned long long, double, std::__1::allocator, nlohmann::json_abi_v3_12_0::adl_serializer, std::__1::vector<unsigned char, std::__1::allocator<unsigned char>>, void> const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>> const&) json-schema-to-grammar.cpp:802
    #2 0x000100aafc48 in std::__1::__function::__func<build_grammar(std::__1::function<void (common_grammar_builder const&)> const&, common_grammar_options const&)::$_2, std::__1::allocator<build_grammar(std::__1::function<void (common_grammar_builder const&)> const&, common_grammar_options const&)::$_2>, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>> (std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>> const&, nlohmann::json_abi_v3_12_0::basic_json<nlohmann::json_abi_v3_12_0::ordered_map, std::__1::vector, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>, bool, long long, unsigned long long, double, std::__1::allocator, nlohmann::json_abi_v3_12_0::adl_serializer, std::__1::vector<unsigned char, std::__1::allocator<unsigned char>>, void> const&)>::operator()(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>> const&, nlohmann::json_abi_v3_12_0::basic_json<nlohmann::json_abi_v3_12_0::ordered_map, std::__1::vector, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>, bool, long long, unsigned long long, double, std::__1::allocator, nlohmann::json_abi_v3_12_0::adl_serializer, std::__1::vector<unsigned char, std::__1::allocator<unsigned char>>, void> const&) function.h:319
    #3 0x000100a2c938 in std::__1::__function::__func<common_chat_params_init_llama_3_x(minja::chat_template const&, templates_params const&, bool)::$_0::operator()(common_grammar_builder const&) const::'lambda'(nlohmann::json_abi_v3_12_0::basic_json<nlohmann::json_abi_v3_12_0::ordered_map, std::__1::vector, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>, bool, long long, unsigned long long, double, std::__1::allocator, nlohmann::json_abi_v3_12_0::adl_serializer, std::__1::vector<unsigned char, std::__1::allocator<unsigned char>>, void> const&), std::__1::allocator<common_chat_params_init_llama_3_x(minja::chat_template const&, templates_params const&, bool)::$_0::operator()(common_grammar_builder const&) const::'lambda'(nlohmann::json_abi_v3_12_0::basic_json<nlohmann::json_abi_v3_12_0::ordered_map, std::__1::vector, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>, bool, long long, unsigned long long, double, std::__1::allocator, nlohmann::json_abi_v3_12_0::adl_serializer, std::__1::vector<unsigned char, std::__1::allocator<unsigned char>>, void> const&)>, void (nlohmann::json_abi_v3_12_0::basic_json<nlohmann::json_abi_v3_12_0::ordered_map, std::__1::vector, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>, bool, long long, unsigned long long, double, std::__1::allocator, nlohmann::json_abi_v3_12_0::adl_serializer, std::__1::vector<unsigned char, std::__1::allocator<unsigned char>>, void> const&)>::operator()(nlohmann::json_abi_v3_12_0::basic_json<nlohmann::json_abi_v3_12_0::ordered_map, std::__1::vector, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>, bool, long long, unsigned long long, double, std::__1::allocator, nlohmann::json_abi_v3_12_0::adl_serializer, std::__1::vector<unsigned char, std::__1::allocator<unsigned char>>, void> const&) function.h:319
    #4 0x000100a139f8 in foreach_function(nlohmann::json_abi_v3_12_0::basic_json<nlohmann::json_abi_v3_12_0::ordered_map, std::__1::vector, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>, bool, long long, unsigned long long, double, std::__1::allocator, nlohmann::json_abi_v3_12_0::adl_serializer, std::__1::vector<unsigned char, std::__1::allocator<unsigned char>>, void> const&, std::__1::function<void (nlohmann::json_abi_v3_12_0::basic_json<nlohmann::json_abi_v3_12_0::ordered_map, std::__1::vector, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>, bool, long long, unsigned long long, double, std::__1::allocator, nlohmann::json_abi_v3_12_0::adl_serializer, std::__1::vector<unsigned char, std::__1::allocator<unsigned char>>, void> const&)> const&) chat.cpp:762
    #5 0x000100a2a7f4 in std::__1::__function::__func<common_chat_params_init_llama_3_x(minja::chat_template const&, templates_params const&, bool)::$_0, std::__1::allocator<common_chat_params_init_llama_3_x(minja::chat_template const&, templates_params const&, bool)::$_0>, void (common_grammar_builder const&)>::operator()(common_grammar_builder const&) function.h:319
    #6 0x000100aa98f4 in build_grammar(std::__1::function<void (common_grammar_builder const&)> const&, common_grammar_options const&) json-schema-to-grammar.cpp:982
    #7 0x0001009c9314 in common_chat_params_init_llama_3_x(minja::chat_template const&, templates_params const&, bool) chat.cpp:1110
    #8 0x0001009b8afc in common_chat_templates_apply_jinja(common_chat_templates const*, common_chat_templates_inputs const&) chat.cpp:1992
    #9 0x0001009b533c in common_chat_templates_apply(common_chat_templates const*, common_chat_templates_inputs const&) chat.cpp:2074
    #10 0x000100810120 in llamacpp_apply_chat_template+0x724 (predict_oai-98384e17fb94e863:arm64+0x100090120)
    ...

==45482==Register values:
 x[0] = 0x00006020004147f8   x[1] = 0x00006080000013c8   x[2] = 0x0000000000000000   x[3] = 0x0000604006289738
 x[4] = 0x0000000000000002   x[5] = 0x0000000000000001   x[6] = 0x04034000004b4000   x[7] = 0x0000000000000001
 x[8] = 0xbebebebebebebebe   x[9] = 0x17d7d7d7d7d7d7d7  x[10] = 0x00000c04000828ff  x[11] = 0x0000000000000001
x[12] = 0x000000002018d383  x[13] = 0x0000000000000000  x[14] = 0xfa0000000000fafa  x[15] = 0x000010700001ffff
x[16] = 0x000000019dc012c0  x[17] = 0x00000001021284f8  x[18] = 0x0000000000000000  x[19] = 0x00000001700acdc0
x[20] = 0x0000000000000002  x[21] = 0x000000002018d384  x[22] = 0x16dd16fd2e731151  x[23] = 0x0000007000020000
x[24] = 0x0000000100c69c08  x[25] = 0x0000000100c69c20  x[26] = 0x00006080000013c7  x[27] = 0x0000000100c69c00
x[28] = 0x00000001700acd60     fp = 0x00000001700aceb0     lr = 0x0000000100abce30     sp = 0x00000001700acd60
AddressSanitizer can not provide additional info.
SUMMARY: AddressSanitizer: SEGV __hash_table:1565 in std::__1::pair<std::__1::__hash_iterator<std::__1::__hash_node<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>, void*>*>, bool> std::__1::__hash_table<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>, std::__1::hash<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>>, std::__1::equal_to<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>>, std::__1::allocator<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>>>::__emplace_unique_key_args<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>> const&>(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>> const&, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>> const&)
Thread T5 created by T0 here:
    #0 0x0001020b99d4 in pthread_create+0x5c (libclang_rt.asan_osx_dynamic.dylib:arm64e+0x359d4)
    #1 0x000100873910 in std::sys::pal::unix::thread::Thread::new::h77254fdd87a28e05+0x118 (predict_oai-98384e17fb94e863:arm64+0x1000f3910)
    #2 0x0001007c7a1c in test::run_test::haeb3c2bcd5ed6cf6+0x76c (predict_oai-98384e17fb94e863:arm64+0x100047a1c)
    #3 0x0001007aedb0 in test::console::run_tests_console::he9d142d704f3a986+0x149c (predict_oai-98384e17fb94e863:arm64+0x10002edb0)
    #4 0x0001007c5758 in test::test_main::hf86a5e20735245b9+0x118 (predict_oai-98384e17fb94e863:arm64+0x100045758)
    #5 0x0001007c5da0 in test::test_main_static::h61ee9c8fd30abca0+0x54 (predict_oai-98384e17fb94e863:arm64+0x100045da0)
    ...

==45482==ABORTING
2025-09-17 11:08:02 +03:00
Eve
cb5bb6cc05 vulkan: automatically remove unsupported devices (#15976)
* remove unsupported vulkan devices

* make this happen during selection instead

* pass by reference
2025-09-17 09:35:37 +02:00
Daniel Bevenius
a91d035b90 ci : revert back to macos-13 for macOS-latest-cmake-x64 (#16040)
This commit reverts the change of the runs-on parameter for the
macOS-latest-cmake-x64 job back to macos-13 that was make in
Commit 51abc96bdc ("ci : update
macos-latest* jobs to use macos-latest (#15938)").

The motivation for this is that using macos-latest will cause an ARM
based runner to be used, and not an x64 based runner.

Refs: https://github.com/ggml-org/llama.cpp/pull/15938#issuecomment-3300805127
2025-09-17 09:34:09 +02:00
Jie Fu (傅杰)
745cbcf2fe llama-quant : fix the verification of attention layers for encoder-decoder models (#16023)
Signed-off-by: Jie Fu <jiefu@tencent.com>
2025-09-17 09:30:55 +02:00
Jie Fu (傅杰)
1cbd80f8cf examples : support encoder-decoder models in the simple example (#16002)
Signed-off-by: Jie Fu <jiefu@tencent.com>
2025-09-17 10:29:00 +03:00
Shane A
85286f3548 model : add OLMo3 support (#16015)
* Add HF to gguf conversion logic for Olmo3

* Add Olmo3 implementation

* Update rope comment

* Fix indentation

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Apply suggestion from @CISC

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
2025-09-17 09:01:58 +02:00
Chenguang Li
d5fabe3682 CANN: Optimize ggml_cann_set_device (#15935)
* CANN: Fix ggml_cann_set_device to avoid redundant device switches

- Added a check to skip aclrtSetDevice if the current device is already set.
- Prevents unnecessary context switches while keeping thread/device consistency.

* CANN: add device default id
2025-09-17 14:33:08 +08:00
jacekpoplawski
8ff206097c llama-bench: add --n-cpu-moe support (#15952)
* llama-bench: add --n-cpu-moe support

Support --n-cpu-moe in llama-bench the same way it is supported by
llama-server.
2025-09-16 16:17:08 +02:00
Daniel Bevenius
77475530b8 ci : use macos-latest for arm64 webgpu build (#16029)
This commit updates the runs-on field for the macOS arm64 webgpu build
job to use macos-latest instead of just latest.

The motivation for this is that this job can wait for a runner to pick
up the job for a very long time, sometimes over 7 hours. This is an
attempt to see if this change can help reduce the wait time.

Refs: https://github.com/ggml-org/llama.cpp/actions/runs/17754163447/job/50454257570?pr=16004
2025-09-16 15:27:52 +02:00
Daniel Bevenius
3913f8730e ggml : fix padding in timestep embedding kernels (#15932)
* ggml : remove adding extra dim timestep embedding

This commit updates the ggml_timestep_embedding function to no longer
add an extra dimension when the specified dimension is odd.

The motivation for this change is that this introduces an unnecessary
dimension when the dimension is odd, which caused an issue in the
kernels which were not expecting this extra dimension and it resulted in
uninitialized memory for the second to last dimension.

* ggml-cuda : fix padding in timestep embedding kernel

This commit removes the zeroing out of the last dimension now that we
are not adding the extra padding dimension.

* ggml-metal : fix padding in timestep embedding kernel

This commit fixes the zero padding for odd dimensions in
the timestep embedding kernel

* ggml-opencl : fix padding in timestep embedding kernel

This commit fixes the zero padding for odd dimensions in
the timestep embedding kernel.

* ggml-sycl : fix padding in timestep embedding kernel

This commit fixes the zero padding for odd dimensions in
the timestep embedding kernel.

* ggml-vulkan : fix padding in timestep embedding kernel

This commit fixes the zero padding for odd dimensions in
the timestep embedding kernel.

* ggml-cpu : fix padding in timestep embedding function

This commit removes the zeroing out of the last dimension now that we
are not adding the extra padding dimension.
2025-09-16 15:25:57 +02:00
Daniel Bevenius
76888d202e ci : upload xcframework artifact from ios-xcode-build job (#16010)
This commit updates the github workflows build.yml file to include steps
for uploading and downloading the xcframework artifact. The
macos-latest-swift job now depends on the ios-xcode-build job and
downloads the xcframework artifact produced by it.

The motivation for this changes is that it takes a long time to build
the xcframework and we are currently doing this twice in the workflow.
With this change, we only build it once and reuse the artifact.
2025-09-16 13:41:38 +02:00
Bowen Han
f1fbffb5c0 fix: apply clang-format to CUDA macros (#16017)
clang-format previously broke long CUDA macros (e.g. __launch_bounds__) into
unreadable line breaks inside template declarations, such as:

  template<int D, int ncols, int nwarps, int VKQ_stride,
           typename KQ_acc_t, bool use_logit_softcap>
      __launch_bounds__(nwarps*ggml_cuda_get_physical_warp_size(), 1)

This change adjusts formatting rules so that CUDA macros remain consistent
and aligned with the surrounding template syntax.
2025-09-16 08:59:19 +02:00
Daniel Bevenius
51abc96bdc ci : update macos-latest* jobs to use macos-latest (#15938)
* ci : update macos-latest* jobs to use macos-latest

This commit updates the jobs that are named macos-latest* to use the
macos-latest label instead explicit versions.

The motivation for this is that there is currently a mixuture of
versions in this workflow and there are jobs that are failing because
they require a newer version.

Refs: https://github.com/ggml-org/llama.cpp/actions/runs/17644792595/job/50140010907#step:5:1759

* ci : add xcodebuild -downloadPlatform iOS command
2025-09-16 05:57:16 +02:00
Yuri Khrustalev
07808ebb07 cmake : Do not install tools on iOS targets (#15903) 2025-09-16 09:54:44 +07:00
Aman Gupta
6d758839ff Add LLaDA-7b-MoE diffusion model (#16003) 2025-09-16 10:38:28 +08:00
Jake Karnes
3d4053f77f CUDA: fix im2col_3d to respect non-contiguous inputs (views) (#15956)
* fix im2col_3d to respect non-contiguous inputs (views)

The CUDA 3D im2col kernel computed source addresses assuming compact layout (products of dims), ignoring nb[] strides. 

This patch switches im2col_3d source indexing to use true strides derived from src1->nb[] (in elements), mirroring the approach used in the 2D CUDA im2col path. Destination indexing is unchanged.

* use ggml_element_size() for src strides

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
2025-09-16 00:28:31 +02:00
Diego Devesa
dc381aa9a6 docker : enable rocWMMA in ROCm images, add gfx1151 (#15997) 2025-09-15 23:38:52 +02:00
Diego Devesa
10d197409b releases : switch to rocWMMA develop branch, add gfx1151 (#15992)
* releases : switch to rocWMMA develop branch, add gfx1151

* remove unused variable ROCM_VERSION
2025-09-15 23:38:42 +02:00
yael-works
b907255f4b SYCL: Add COUNT_EQUAL operator support (#15991)
* SYCL: Add COUNT_EQUAL operator support (rebased on master)

* SYCL: remove duplicate op_count_equal definition

* tests: remove test_count_equal_typed and use test_count_equal for all cases

* tests: keep only I32 case for COUNT_EQUAL as suggested

* tests: keep only I32 case for COUNT_EQUAL as requested
2025-09-15 18:51:35 +02:00
Nikolay Popov
28c39da7c6 llama-run: Fix model download on Windows (#15988)
* llama-run: Fix model download on Windows
 * fix SSL error (SSL peer certificate or SSH remote key was not OK)
 * fix program crash on std::filesystem::rename

* llama-run: create a separate method to utilize RAII

* llama-run: handle rename exception
2025-09-15 11:08:30 +01:00
Aman Gupta
106220562a CUDA: some micro-optimizations in mmf.cuh for mul_mat_id (#15926) 2025-09-15 17:35:11 +08:00
ddh0
a68f31edd7 fix KLD percentile output (#15999)
In `llama-perplexity`, when using `--kl-divergence`, the KL divergence statistics output mistakenly displays the 99th percentile twice. This change fixes that and correctly displays the 90th percentile as originally intended (presumably).
2025-09-15 09:54:57 +02:00
Sigbjørn Skjæret
b8e09f08b9 model : add grok-2 support (#15539)
* add grok-2 support

* type fix

* type fix

* type fix

* "fix" vocab for invalid sequences

* fix expert tensor mapping and spaces in vocab

* add chat template

* fix norm tensor mapping

* rename layer_out_norm to ffn_post_norm

* ensure ffn_post_norm is mapped

* fix experts merging

* remove erroneous FFN_GATE entry

* concatenate split tensors and add more metadata

* process all expert layers and try cat instead of hstack

* add support for community BPE vocab

* fix expert feed forward length and ffn_down concat

* commit this too

* add ffn_up/gate/down, unsure if sequence is right

* add ffn_gate/down/up to tensor names

* correct residual moe (still not working)

* mess--

* fix embedding scale being applied twice

* add built in chat template

* change beta fast for grok if default value

* remove spm vocab in favor of community bpe vocab

* change attention temp length metadata type to integer

* update attention temp length metadata

* remove comment

* replace M_SQRT2 with std::sqrt(2)

* add yarn metadata, move defaults to hparams
2025-09-14 23:00:59 +02:00
Sigbjørn Skjæret
6c019cb04e server : only attempt to enable thinking if using jinja (#15967) 2025-09-14 21:17:04 +02:00
Georgi Gerganov
9dcd200d57 metal : remove memory pools (#15966)
* metal : remove mem pool usage

ggml-ci

* metal : remove mem pool implementation

ggml-ci

* metal : take into account the actual allocated memory of the tensor

ggml-ci

* cont : use ggml_backend_buft_get_alloc_size

ggml-ci

* cont : improve, comments

ggml-ci

* cont : add functions for the extra tensor sizes

* metal : add comments

ggml-ci

* metal : implement .get_alloc_size for the rest of the buffer types

ggml-ci

* metal : remove ggml_metal_heap

ggml-ci
2025-09-14 22:02:32 +03:00
Adam
0fa154e350 rocm.Dockerfile: added gfx1200,gfx1201 architectures to support AMD Radeon RX 9000 series (#15994)
* rocm.Dockerfile: added gfx1200,gfx1201 architectures to support  AMD Radeon RX 9000 series

https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.4.1/reference/system-requirements.html#rdna-os
states the Radeon RX 9000 series is supported support from Ubuntu 24.04.2, and the dockerfile is using 24.04 which is ROCm 6.4.

This fixed the `ROCm error: invalid device function` I was getting when trying to use the rocm container.
2025-09-14 20:43:54 +02:00
Ruben Ortlam
261e6a20ff Vulkan: Clean up mul_mm shader (#15987)
* vulkan: move mul_mm dequantization steps into a separate file and functions

* improve mul_mm vector load code

* fix debug mode issues and warnings
2025-09-14 16:56:28 +02:00
lcy
a0e13dcbe5 build: fix the build failures of Windows HIP release job (#15984)
* build: fix the cache keys for Windows HIP release job

Update the cache keys to include the HIP SDK version, preventing the
use of outdated ROCm installation caches.

* build: sync changes from release.yml to build.yml

- Update HIP SDK version to 25.Q3 and ROCm version to 6.4.2
- Update the cache keys to reflect the new versions

* build: remove Windows HIP release for gfx1151
since the current stable rocWMMA does not support gfx1151.
2025-09-14 07:20:35 -07:00
Georgi Gerganov
a14bd35014 metal : fix kernel requirements (#15983)
* metal : fix kernel requirements

ggml-ci

* cont : fix supports_op

* cont : fix supports_op for ARGMAX
2025-09-14 15:33:22 +03:00
Radoslav Gerganov
918b26f197 rpc : fix regression when --device is used (#15981)
Fix regression introduced with commit 50f4281a6
2025-09-14 12:28:18 +03:00
Diego Devesa
9ecb884346 releases : update ROCM, add gfx1200, gfx1201, gfx1151 (#15972)
* releases : update ROCM, add gfx1200, gfx1201, gfx1151

* releases : set target to 13.3 for macos-x64

* add hipblaslt.dll to release

* add hipblaslt/library to release
2025-09-14 02:21:59 -07:00
Radoslav Gerganov
d1c6f11f47 doc : update documentation for --tensor-split (#15980)
* doc : update documentation for --tensor-split

* Update tools/main/README.md

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Update tools/main/README.md

Co-authored-by: Diego Devesa <slarengh@gmail.com>

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
Co-authored-by: Diego Devesa <slarengh@gmail.com>
2025-09-14 12:10:07 +03:00
Aaron Teo
6380d6a3e7 ggml-zdnn: rm user mapped buffers (#15965)
* ggml-zdnn: rm user mapped buffers

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: rm dead code

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-zdnn: attempt to fix missing extra data buffer free

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

---------

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
2025-09-14 13:37:03 +08:00
Jeff Bolz
aa0c461efe vulkan: fix failing dequant shaders (#15862)
* vulkan: fix failing dequant shaders

* add missing const
2025-09-13 17:29:43 +02:00
Jeff Bolz
b9c9c9f789 vulkan: initialize vulkan-hpp to allow using extension function pointers (#15705)
Use this to query register count for shader compiles on NVIDIA. Currently
this is only for performance debug, but it could eventually be used in some
heuristics like split_k.
2025-09-13 17:23:30 +02:00
Diego Devesa
50f4281a6f llama : allow using iGPUs with --device (#15951)
* llama : allow using iGPUs with --device

* mtmd : allow iGPU

* rpc-server : allow iGPU
2025-09-13 16:49:49 +02:00
Georgi Gerganov
55758b00ca metal : refactor kernel loading (#15964)
* metal : refactor bin kernels loading

ggml-ci

* metal : refactor rms kernel loading

ggml-ci

* ci : try to add memory leaks check

ggml-ci

* ci : try to enable memory leak detection for Mac

* cont : seems to be working
2025-09-13 16:24:22 +03:00
Georgi Gerganov
f161463a54 metal : allow ops to run concurrently (#15929)
* metal : run graphs ops concurrently

ggml-ci

* cont : add flags for debugging and disabling concurrency

ggml-ci

* cont : refactor and handle fusing

ggml-ci

* cont : simplify - no need to use GPU address

ggml-ci

* cont : prepare mem ranges for reuse + add ggml-metal-common.cpp

ggml-ci

* cont : avoid redundant keywords in cpp [no ci]

* metal : reorder graph for better concurrency

ggml-ci

* metal : fix race on mem pool buffers

ggml-ci

* cont : add env GGML_METAL_GRAPH_OPTIMIZE_DISABLE

ggml-ci

* cont : refactor, optimize, add comments

ggml-ci

* cont : refactor ggml-metal.m

ggml-ci

* minor : update logs [no ci]
2025-09-13 13:54:28 +03:00
Georgi Gerganov
84d7b2fca1 metal : fix memory leaks (#15962)
ggml-ci
2025-09-13 12:45:04 +03:00
Aaron Teo
40be51152d ggml-zdnn: fix #15414, activate FP16 and BF16 acceleration and incorrect zTensor free (#15839) 2025-09-13 02:39:52 +08:00
Eric Curtin
4bf5549269 Add docker protocol support for llama-server model loading (#15790)
To pull and run models via: llama-server -dr gemma3
Add some validators and sanitizers for Docker Model urls and metadata

Signed-off-by: Eric Curtin <eric.curtin@docker.com>
2025-09-12 16:31:50 +01:00
Haiyue Wang
f4e664f838 context : remove redundant explicit casting to the same type (#15948)
The function 'output_reserve' return type is 'uint32_t', so need to add
explicit casting.
2025-09-12 18:16:32 +03:00
Georgi Gerganov
f088b6a84f server : adjust prompt similarity thold + add logs (#15913)
ggml-ci
2025-09-12 17:02:55 +03:00
Ruben Ortlam
304ac5693d Vulkan iGPU device selection overhaul and PCI ID API support (#15947)
* vulkan: implement ggml igpu device type, implement pci id support

* fix compiler warning

* prevent printf overflow warning
2025-09-12 13:24:21 +02:00
Mathieu Baudier
6c88ad8fa7 vulkan: Make device memory check more portable (#15939) 2025-09-12 09:06:20 +02:00
Neo Zhang Jianyu
704d90c987 Revert "sycl: add usage of enqueue_functions extension (#14244)" (#15910)
* Revert "sycl: add usage of enqueue_functions extension (#14244)"

This reverts commit 8308f98c7f.

* fix missed revert code, format the code
2025-09-12 09:15:12 +08:00
Diego Devesa
360d6533db ggml-backend : add GGML_BACKEND_DEVICE_TYPE_IGPU device type (#15797)
* ggml-backend : add GGML_BACKEND_DEVICE_TYPE_IGPU device type

ggml-backend : add device id to device props

llama : only use iGPU devices if there are no GPU devices

llama : do not use multiple devices from different backends with the same device id
2025-09-11 22:47:38 +02:00
Johannes Gäßler
0e6ff0046f CUDA: larger SRAM reads for tile FA, AMD FP16 dot (#15927)
* CUDA: larger SRAM reads for tile FA, AMD FP16 dot

* fix logic for availability of v_dot2_f32_f16
2025-09-11 21:19:58 +02:00
ddh0
df082f5630 nitpick : correct MB to MiB (#15934)
MB was incorrectly used for 1024 x 1024 bytes instead of MiB
2025-09-11 19:12:34 +02:00
Daniel Bevenius
24a6734daf ggml-cpu : add check for ARM MATMUL_INT8/i8mm support (#15922)
This commit adds a check for GGML_MACHINE_SUPPORTS_i8mm when enabling
MATMUL_INT8 features, ensuring that i8mm intrinsics are only used when
the target hardware actually supports them.

The motivation for this is to fix ggml CI build failures where the
feature detection correctly identifies that i8mm is not supported,
adding the +noi8mm flag, but MATMUL_INT8 preprocessor definitions are
still enabled, causing the compiler to attempt to use vmmlaq_s32
intrinsics without i8mm support.

Refs: https://github.com/ggml-org/ggml/actions/runs/17525174120/job/49909199499
2025-09-11 14:39:12 +01:00
Charles Xu
2b3efea9a4 kleidiai: fix GGML_ASSERT(*cur_backend_id != -1) failed (#15614)
* kleidiai: fix GGML_ASSERT(*cur_backend_id != -1) failed

* removes the Whisper-specific check for GET_ROWS support
2025-09-11 12:45:40 +02:00
hipudding
c0389dba43 CANN: Disable acl_graph for prefill stage (#15933)
Since the prefill length is not fixed, graphs constructed for the
prefill stage cannot be reused. For this reason, ACL graph
execution is disabled by default during prefill.
2025-09-11 15:59:37 +08:00
Oliver Simons
00681dfc16 CUDA: Add fastdiv to k_bin_bcast*, giving 1-3% E2E performance (#15872)
* Add fastdiv and fastmodulo to k_bin_bcast kernel

* Address review comments

* `prod_` instead of `prod` suffix

* Add test case for `k_bin_bcast_unravel` in CUDA backend
2025-09-10 22:04:03 +02:00
Jie Fu (傅杰)
4f658855fa llama : support T5 models with unequal number of encoder-decoder layers (#15909)
* Extend the support of T5 models with different encoder-decoder layers

Signed-off-by: Jie Fu <jiefu@tencent.com>

* Update convert_hf_to_gguf.py

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update gguf-py/gguf/constants.py

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update gguf-py/gguf/gguf_writer.py

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update src/llama-arch.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update src/llama-arch.h

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update src/llama-model.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update src/llama-model.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update src/llama-model.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update src/llama-model.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update src/llama-hparams.h

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update src/llama-model.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update src/llama-model.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update src/llama-model.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update src/llama-model.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update src/llama-model.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update src/llama-model.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update src/llama-model.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update src/llama-model.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update src/llama-model.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update src/llama-model.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update src/llama-model.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update src/llama-model.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update src/llama-model.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update src/llama-model.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Rename n_dec_layer --> dec_n_layer

Signed-off-by: Jie Fu <jiefu@tencent.com>

* Adapt to cases when dec_n_layer > n_layer

Signed-off-by: Jie Fu <jiefu@tencent.com>

---------

Signed-off-by: Jie Fu <jiefu@tencent.com>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
2025-09-10 20:51:51 +02:00
Sigbjørn Skjæret
6ab397e12b graph : support non-contiguous Q in build_attn_mha (#15908)
* support non-contiguous Q in build_attn_mha

* Update src/llama-graph.cpp

ggml-ci

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2025-09-10 19:08:59 +02:00
Daniel Bevenius
9de447d94e ggml-cpu : fix padding in ggml_timestep_embedding (#15917)
This commit fixes the zero padding for odd dimensions in
ggml_compute_forward_timestep_embedding_f32.
The motivation for this is that currently if an odd dimension is used,
the padding check incorrectly uses the dimension value for indexing.
For example, with dim=15:

Elements 0-6 are set to cosine values
Elements 7-13 are set to sine values
Element 14 is left uninitialized (contains garbage)
Element 15 is correctly set to zero

This fix changes embed_data[dim] to embed_data[2 * half] so that
element 14 (the first unused element) is properly set to zero as well
as the last element.

Resolves: https://github.com/ggml-org/ggml/issues/1324
2025-09-10 17:31:40 +02:00
Georgi Gerganov
0f0a3c2851 metal : make the backend async (#15906)
* metal : make the backend async

ggml-ci

* cont : add comments, extend op offload, clean up

ggml-ci

* metal : fix batch size for MUL_MAT_ID

* metal : remove deprecated ggml_backend_metal_buffer_from_ptr

* metal : create only metal buffers, no wrapping of host memory

ggml-ci

* metal : restore .alloc_buffer for buffer_from_ptr_type

ggml-ci

* metal : remove broken implementation of GGML_OP_SET

ggml-ci

* metal : clean-up loose ends, ready for tests

ggml-ci

* metal : support both private and shared buffers

ggml-ci

* metal : enable private buffers + add global device queue

* metal : disable host buffer to prevent races

ggml-ci

* metal : avoid extra copy during set_tensor

ggml-ci

* metal : use separate buffer types for shread and private Metal buffers

ggml-ci

* metal : simplify synchronization logic

ggml-ci

* metal : fix build

ggml-ci

* metal : do not implement cpy_tensor

ggml-ci

* metal : separate implementations for shared and private buffers

ggml-ci
2025-09-10 17:52:35 +03:00
Daniel Bevenius
33daece86b ci : add caching for ROCm installation in release workflow (#15924)
This commit applies the same caching to the release workflow which
currently exists for the main CI workflow that was introduced in Commit
ff02caf9ee ("ci : cache ROCm installation
in windows-latest-cmake-hip (#15887)").
2025-09-10 15:39:57 +02:00
Daniel Bevenius
e7b6d83b52 tests : filter out no-ops from coverage report (#15900)
* tests : filter out no-ops from coverage report

This commit is a follow-up commit for #15745 to address the feedback on
how no-op operations should be filtered out from the coverage report.

The feedback regarding the UNARY and GLU sub-operations not being
handled I not exactly sure what should be done. They are included in the
coverage, for example ABS, ELU, EXP, GELU, GEGLU, GEGLU_ERF etc are in
the list of covered operations:
```console
$ ./build/bin/test-backend-ops --show-coverage
Operations covered by tests (89):
  ✓ ABS
  ✓ ACC
  ✓ ADD
  ✓ ADD1
  ✓ ADD_ID
  ✓ ARANGE
  ✓ ARGMAX
  ✓ ARGSORT
  ✓ CLAMP
  ✓ CONCAT
  ✓ CONV_2D
  ✓ CONV_2D_DW
  ✓ CONV_3D
  ✓ CONV_TRANSPOSE_1D
  ✓ CONV_TRANSPOSE_2D
  ✓ COS
  ✓ COUNT_EQUAL
  ✓ CPY
  ✓ CROSS_ENTROPY_LOSS
  ✓ CROSS_ENTROPY_LOSS_BACK
  ✓ DIAG_MASK_INF
  ✓ DIV
  ✓ DUP
  ✓ ELU
  ✓ EXP
  ✓ FLASH_ATTN_EXT
  ✓ GATED_LINEAR_ATTN
  ✓ GEGLU
  ✓ GEGLU_ERF
  ✓ GEGLU_QUICK
  ✓ GELU
  ✓ GELU_ERF
  ✓ GELU_QUICK
  ✓ GET_ROWS
  ✓ GET_ROWS_BACK
  ✓ GROUP_NORM
  ✓ HARDSIGMOID
  ✓ HARDSWISH
  ✓ IM2COL
  ✓ IM2COL_3D
  ✓ L2_NORM
  ✓ LEAKY_RELU
  ✓ LOG
  ✓ MEAN
  ✓ MUL
  ✓ MUL_MAT
  ✓ MUL_MAT_ID
  ✓ NEG
  ✓ NORM
  ✓ OPT_STEP_ADAMW
  ✓ OPT_STEP_SGD
  ✓ OUT_PROD
  ✓ PAD
  ✓ PAD_REFLECT_1D
  ✓ POOL_2D
  ✓ REGLU
  ✓ RELU
  ✓ REPEAT
  ✓ REPEAT_BACK
  ✓ RMS_NORM
  ✓ RMS_NORM_BACK
  ✓ ROLL
  ✓ ROPE
  ✓ ROPE_BACK
  ✓ RWKV_WKV6
  ✓ RWKV_WKV7
  ✓ SCALE
  ✓ SET
  ✓ SET_ROWS
  ✓ SGN
  ✓ SIGMOID
  ✓ SILU
  ✓ SILU_BACK
  ✓ SIN
  ✓ SOFT_MAX
  ✓ SOFT_MAX_BACK
  ✓ SQR
  ✓ SQRT
  ✓ SSM_CONV
  ✓ SSM_SCAN
  ✓ STEP
  ✓ SUB
  ✓ SUM
  ✓ SUM_ROWS
  ✓ SWIGLU
  ✓ SWIGLU_OAI
  ✓ TANH
  ✓ TIMESTEP_EMBEDDING
  ✓ UPSCALE

Operations without tests (14):
  ✗ ADD_REL_POS
  ✗ CUSTOM
  ✗ DIAG
  ✗ DIAG_MASK_ZERO
  ✗ FLASH_ATTN_BACK
  ✗ GET_REL_POS
  ✗ IM2COL_BACK
  ✗ MAP_CUSTOM1
  ✗ MAP_CUSTOM2
  ✗ MAP_CUSTOM3
  ✗ POOL_1D
  ✗ POOL_2D_BACK
  ✗ WIN_PART
  ✗ WIN_UNPART

Coverage Summary:
  Total operations: 103
  Tested operations: 89
  Untested operations: 14
  Coverage: 86.4%
```

Refs: https://github.com/ggml-org/llama.cpp/pull/15745

* use of ggml_op enum values instead of strcmp
2025-09-10 14:17:09 +02:00
j-k
2cfef4d117 media : add transparent icon svg and png [no ci] (#15891) 2025-09-10 14:51:28 +03:00
Jesse
09e72a037c gitignore : Ignore vim swap files in tests (#15901) 2025-09-10 14:28:47 +03:00
Chenguang Li
10d8b2b6b0 CANN: Add ROPE sin/cos cache for reuse (#15912)
* CANN: Add ROPE sin/cos cache for reuse

Introduce sin/cos caching mechanism in ROPE to avoid redundant
computation across layers. The cache is built on the first layer
per device and reused by subsequent layers if parameters match.

- Added sin_cache / cos_cache pointers and position_length tracking
- Introduced cache validity flags and properties:
  (ext_factor, theta_scale, freq_scale, attn_factor, is_neox)
- Accelerates ROPE by eliminating repeated sin/cos generation

This change reduces overhead in multi-layer scenarios while
preserving correctness by verifying parameter consistency.

Co-authored-by: hipudding <huafengchun@gmail.com>

* fix typo

Signed-off-by: noemotiovon <757486878@qq.com>

---------

Signed-off-by: noemotiovon <757486878@qq.com>
Co-authored-by: hipudding <huafengchun@gmail.com>
2025-09-10 18:42:00 +08:00
Chenguang Li
28b5f190ef CANN: implement LRU cache for ACL graphs (#15814)
* CANN: implement LRU cache for ACL graphs in CANN backend

- Introduce ggml_cann_graph_lru_cache to store multiple ggml_cann_graph objects.
- Graphs are loaded on demand and evicted using LRU policy when capacity is exceeded.
- Updated push, move_to_front, and clear methods to manage cached graphs efficiently.
- Ensures reuse of graphs, reducing graph reconstruction overhead in CANN backend.

* fix typo

* The LRU cache capacity can be configured via an env variable

Signed-off-by: noemotiovon <757486878@qq.com>

* refactory acl graph

* refactory && fix review comments

Signed-off-by: noemotiovon <757486878@qq.com>

---------

Signed-off-by: noemotiovon <757486878@qq.com>
2025-09-10 15:29:12 +08:00
Daniel Bevenius
86587da03b llama : check returned fn ptrs from ggml_backend_reg_get_proc_address (#15893)
This commit adds check for two function pointers returned from
ggml_backend_reg_get_proc_address.

The motivation for this is that the function pointer could be nullptr if
the get proc address function changes in the future. This is also
consistent with all the other calls to ggml_backend_reg_get_proc_address
in the code base.
2025-09-10 05:33:58 +02:00
Daniel Bevenius
ff02caf9ee ci : cache ROCm installation in windows-latest-cmake-hip (#15887)
This commit adds caching of the ROCm installation for the windows-latest-cmake-hip job. 

The motivation for this is that the installation can sometimes hang and/or not complete properly leaving an invalid installation which later fails the build. By caching the installation hopefully we can keep a good installation available in the cache and avoid the installation step.

Refs: https://github.com/ggml-org/llama.cpp/pull/15365
2025-09-10 05:23:19 +02:00
Ruben Ortlam
ae355f6f71 vulkan: throw the oom error instead of no memory type found (#15905) 2025-09-09 22:26:03 +02:00
Jeff Bolz
4f63cd705c vulkan: Fix OOB accesses in soft_max_back (#15861) 2025-09-09 14:41:15 +02:00
Johannes Gäßler
17bc5a815f HIP: use v_dot2_f32_f16 instruction for FA (#15884) 2025-09-09 14:04:43 +02:00
lksj92hs
ed54e32558 Workaround for subgroup arithmetic failing on MoltenVK with AMD GPUs (issue 15846) (#15886) 2025-09-09 14:01:15 +02:00
Aman Gupta
a972faebed CUDA: Add mul_mat_id support for the mmf kernel (#15767)
* CUDA: Add mul_mat_id support the mmf

Add support for mul_mat_id for bs < 16

* Review: use warp_size, fix should_use_mmf condition

* Launch one block per expert, stride along n_expert_used

* templatize mul_mat_id

* Pad shmem to 16 bytes, add helper function mul_mat_f_switch_ids

* Reduce compile times by dividing mmf into f16, bf16 and f32 variants

* Divide mmf by ncols_dst

* Add missing files

* Fix MUSA/HIP builds
2025-09-09 14:38:02 +08:00
Johannes Gäßler
550cf726e1 CUDA: fix GET_ROWS for large tensors (#15882) 2025-09-09 08:11:01 +02:00
Georgi Gerganov
c252ce67c4 contrib : add notes about merging PRs (#15881)
* contrib : add notes about merging PRs

* Update CONTRIBUTING.md

Co-authored-by: Diego Devesa <slarengh@gmail.com>

* Update CONTRIBUTING.md

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

---------

Co-authored-by: Diego Devesa <slarengh@gmail.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
2025-09-09 08:42:10 +03:00
Daniel Bevenius
70cd37dbbe requirements : update transformers/torch for Embedding Gemma (#15828)
* requirements : update transformers/torch for Embedding Gemma

This commit updates the requirements to support converting
Embedding Gemma 300m models.

The motivation for this change is that during development I had a local
copy of the transformers package which is what I used for converting
the models. This was a mistake on my part and I should have also updated
my transformers version to the official release.

I had checked the requirements/requirements-convert_legacy_llama.txt
file and noted that the version was >=4.45.1,<5.0.0 and came to the
conculusion that no updated would be needed, this assumed that
Embedding Gemma would be in a transformers release at the time
Commit fb15d649ed ("llama : add support
for EmbeddingGemma 300m (#15798)) was merged. So anyone wanting to
convert themselves would be able to do so. However, Embedding Gemma is
a preview release and this commit updates the requirements to use this
preview release.

* resolve additional python dependencies

* fix pyright errors in tokenizer test and remove unused import
2025-09-09 06:06:52 +02:00
Piotr Wilkin (ilintar)
acc1b008cf model-conversion : add extra debugging support for model conversion (#15877)
* feat: Extra debugging support for model conversion - added BF16 support for llama-callback-eval and support for dumping intermediate steps in run-org-model.py
2025-09-09 06:05:55 +02:00
Aldehir Rojas
7057faf64b json : support enum values within allOf (#15830) 2025-09-08 16:14:32 -05:00
j-k
fe1c92cd7b media : add llama1 icon (#15878)
Add svg and png based off llama1-icon.svg
2025-09-08 21:57:01 +03:00
Jeff Bolz
e68aa10d8f vulkan: sort graph to allow more parallel execution (#15850)
* vulkan: sort graph to allow more parallel execution

Add a backend proc to allow the backend to modify the graph. The
vulkan implementation looks at which nodes depend on each other
and greedily reorders them to group together nodes that don't
depend on each other. It only reorders the nodes, doesn't change
the contents of any of them.

With #15489, this reduces the number of synchronizations needed.

* call optimize_graph per-split
2025-09-09 02:10:07 +08:00
Aman Gupta
0a16bf52e6 CUDA: generate_cu_files.py - add missing mxfp4 (#15880) 2025-09-09 01:23:46 +08:00
Jesse
88021565f0 chat : Deepseek V3.1 reasoning and tool calling support (OpenAI Style) (#15533)
* Add DeepSeek V3.1 thinking mode support

- Added COMMON_CHAT_FORMAT_DEEPSEEK_V3_1 enum value
- Created common_chat_params_init_deepseek_v3_1() function (currently uses R1 implementation)
- Created common_chat_parse_deepseek_v3_1() function that handles V3.1 thinking format:
  - Extracts reasoning content before '</think>' tag into reasoning_content
  - Extracts regular content after '</think>' tag into content
  - No opening '<think>' tag in V3.1 format
- Added detection logic for V3.1 templates based on pattern: 'message['prefix'] is defined and message['prefix'] and thinking'
- Added V3.1 case to parsing switch statement

This addresses the issue where V3.1 outputs reasoning content followed by '</think>' and then regular content without the opening '<think>' tag.

* Another attempt by V3.1 non-thinking

* Fix test, but it's not asserting anything.

* Ignore vim swap files in tests dir

* Update the test

* Try using try_find_literal instead of regex

* passing test

* Revert "Try using try_find_literal instead of regex"

This reverts commit c50d887ec2.

* Remove unnecessary change

* Remove comment

* Add code to handle non-thinking mode.

* Try to set message['prefix'] when thinking is enabled.

* This fixes reasoning, but breaks normal content. We need state in the
chat parser.

* DeepSeek V3.1 thinking is now the default. Disable with `--reasoning-budget 0`.

* Simplify (DeepSeek V3.1 reasoning)

* Fix sign inversion bug

* Add some tool calling code (not working).

* Tool calls working in non-reasoning mode.

* Attempt a unit test for tool call parsing.

* Passing test

* Add tests for both happy path and broken fenced DeepSeek V3.1 tool call variants.

* Passing DeepSeek V3.1 tool call tests, but model is not working.

* Revert assistance response prefill change. Not my monkeys.

* Add fenced_thinking unit test variant. Passes, but thinking tool calling
still isn't working for some reason.

* Tests pass in reasoning mode. Also e2e tool test passes.

* Make a copy of the parse_json_tool_calls function for deepseek-v3.1 so
as to not accidentally introduce regressions.

* Fix thinking_forced_open logic. tool calling broken. Need to add another
test case.

* That's what I get for cargo culting a newline.

* Add multi tool call test for deepseek v3.1 non-reasoning

* Move test, remove .gitignore change

* Place deepseek-v3.1 reasoning test directly into existing reasoning
function per CISC's request.

* Address whitespace CI failure.

* Merge two assert_equals per CISC's request.

* Add DeepSeek-V3.1 tests to tests/test-chat.cpp per CISC's request.

* Merge deepseek V3.1 and regular parse_json_tool_calls() function
behaviors by adding optional update_cursor argument.

* Update tests/test-chat-parser.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update tests/test-chat-parser.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update tests/test-chat-parser.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update tests/test-chat-parser.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update tests/test-chat-parser.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update tests/test-chat-parser.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update tests/test-chat-parser.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update tests/test-chat-parser.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update tests/test-chat-parser.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* DeepSeek V3.1 fix reasoning_format none

* Strip grammar down to strictly what we expect based on model card. Throw
out parts we cargo culted from R1 that don't make sense.

* Update tests/test-chat-parser.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* DeepSeek V3.1 - Add edge case where thinking is forced open, there is
tool calling in the reasoning content, but then the model just stops the
output without closing the </think> tag, so it's not a partial. In this
case, use the tool call in the reasoning content.

* DeepSeek V3.1 - simplify update_cursor

* Update common/chat.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update common/chat.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update common/chat.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Fix indent

---------

Co-authored-by: openhands <openhands@all-hands.dev>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
2025-09-08 16:59:48 +02:00
Xuan-Son Nguyen
56920f5665 server : bring back timings_per_token (#15879) 2025-09-08 16:50:05 +02:00
Georgi Gerganov
b0d52998b9 cuda : fix supports_op condition for get_rows when number of blocks is too large (#15868)
* cuda : fix supports_op condition for get_rows when src1->ne2 > 1

ggml-ci

* ggml : add comment about ggml_get_rows

ggml-ci

* cuda : add FIXME [no ci]

* cuda : update support condition

ggml-ci
2025-09-08 13:56:51 +03:00
Georgi Gerganov
f28d4f4ac9 metal : refactor + optimize (#15857)
* metal : refactor

ggml-ci

* cont : refactor FA-vec kernel

* cont : print metal library load time

* minor : warn to debug + bettern kernel names

ggml-ci

* metal : optimize mul_mv q8_0

ggml-ci

* metal : simplify FA pipeline creation functions

ggml-ci

* metal : improve naming consistency

* metal : safer function constants offsets

ggml-ci

* metal : comments

ggml-ci
2025-09-08 13:34:56 +03:00
Xuan-Son Nguyen
9fcb29f22f ggml: allow casting between f32 and i32 (#15783)
* ggml: allow casting between f32 and i32

* fix cuda

* add vulkan

* fix CPU non-cont

* add non-cont test case

* add note

* extend test number range

* correct note

* add cont version for vulkan
2025-09-08 12:33:01 +02:00
Sigbjørn Skjæret
5ef22d281d CUDA: non-contiguous src0 not supported for PAD (#15869) 2025-09-08 12:55:44 +03:00
Daniel Bevenius
233d773d02 convert : force setting sliding_window from original config (#15867)
* convert : force setting sliding_window from original config

This commit modifies the set_gguf_parameters method for EmbeddingGemma
so that it reads the sliding_window parameter from the original model
config.json and uses that value.

The motivation for this change is that the Gemma3TextConfig
constructor adjusts the sliding_window value, which can lead to
inconsistencies when converting models as we expects this value to
match the original model's configuration.

Refs: bb45d3631e/src/transformers/models/gemma3/configuration_gemma3.py (L230)

* fix flake8 error

* add link to huggingface PR
2025-09-08 09:44:34 +02:00
Georgi Gerganov
a885dcff11 batched-bench : fix llama_synchronize usage during prompt processing (#15835)
ggml-ci
2025-09-08 10:27:07 +03:00
Georgi Gerganov
663027fd54 context : fix n_outputs during reserve (#15858)
ggml-ci
2025-09-08 10:26:36 +03:00
Georgi Gerganov
cf0e3ba150 model : avoid ggml_cont_3d for fused QKV weights (#15662)
* model : avoid ggml_cont_3d for fused QKV weights

ggml-ci

* kv-cache : make cpy_k and cpy_v implementation more readable

ggml-ci

* cont : add comments

ggml-ci

* cont : minor fix [no ci]

* cont : one more fix

* cont : clarity

ggml-ci

* kv-cache : require contiguous heads of k_cur and v_cur

ggml-ci
2025-09-08 10:25:33 +03:00
Jeff Bolz
d413dca003 tests: large sizes for get_rows (#15687) 2025-09-07 23:23:41 -05:00
Chenguang Li
85ca66a746 CANN: Stream sync between devices for acl_graph (#15809)
* CANN: Switch to stream synchronization

Switch to stream synchronization because events are not effective.

Co-authored-by: hipudding <huafengchun@gmail.com>

* CANN: add Comments

---------

Co-authored-by: hipudding <huafengchun@gmail.com>
2025-09-08 10:03:29 +08:00
Jeff Bolz
3976dfbe00 vulkan: support im2col_3d (#15795) 2025-09-07 13:50:26 -05:00
Aaron Teo
d36e61c580 ggml-cpu: clean up s390x SIMD (#15855)
* ggml-cpu: clean up s390x simd

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
(cherry picked from commit 0da4b6aa07)
Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

* ggml-cpu: fix hsum data types

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>

---------

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
2025-09-08 02:18:28 +08:00
Jeff Bolz
c97b5e5854 vulkan: Support pad_ext (#15794) 2025-09-07 19:00:49 +02:00
Jeff Bolz
267e99867f vulkan: Use larger loads in scalar/coopmat1 matmul (#15729)
I think glslang will translate an access like x[i][1].z to
OpAccessChain ... x, i, 1, 2
OpLoad float16_t ...

rather than loading all of x[i] in a single OpLoad. Change the
code to explicitly load the vector/matrix.
2025-09-07 18:53:07 +02:00
Daniel Bevenius
3b15924d71 ggml WebGPU: remove userdata from request adapter callback (#15527)
* ggml WebGPU: remove userdata from request adapter callback

This commit removes the `userdata` parameter from the WebGPU request
adapter callback in `ggml-webgpu.cpp`. Instead, the lambda function
captures the `webgpu_context` directly.

The motivation for this change is to simplify the code and improve
readability.

* inline the callback lambda into the RequestAdapter call

This commit removes the callback lambda variable and inlines it directly
into the RequestAdapter call.
2025-09-07 11:19:45 +03:00
Johannes Gäßler
79bc429262 CUDA: faster tile FA (Pascal/AMD), headsize 256 (#15769) 2025-09-07 00:26:28 +02:00
Charles Xu
c4df49a42d kleidiai: generalize compute_forward_kv_cache to compute_forward_fp16 (#15817) 2025-09-06 22:08:43 +08:00
Xuan-Son Nguyen
3c3635d2f2 server : speed up tests (#15836)
* server : speed up tests

* clean up

* restore timeout_seconds in some places

* flake8

* explicit offline
2025-09-06 14:45:24 +02:00
Xuan-Son Nguyen
61bdfd5298 server : implement prompt processing progress report in stream mode (#15827)
* server : implement `return_progress`

* add timings.cache_n

* add progress.time_ms

* add test

* fix test for chat/completions

* readme: add docs on timings

* use ggml_time_us

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2025-09-06 13:35:04 +02:00
Johannes Gäßler
01806e7771 ggml-cpu: document use of "free" memory [no ci] (#15834) 2025-09-06 13:28:44 +02:00
Aaron Teo
186415d595 ggml-cpu: drop support for nnpa intrinsics (#15821) 2025-09-06 11:27:28 +08:00
Gabe Goodhart
fd621880f3 aLoRA Support (#15327)
* feat: Add python-side constants and conversion for adapter.lora.invocation_string

Branch: gabe-l-hart/alora-support

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat: Add c++ side constants for adapter.lora.invocation_string

Branch: gabe-l-hart/alora-support

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat: Parse invocation string for adapters from GGUF

Branch: gabe-l-hart/alora-support

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(python): Update conversion to alora_invocation_tokens

This is the preferred method in PEFT which is the source of ground truth

https://github.com/huggingface/peft/pull/2609/files#diff-13380145401d203d5935c5189dd09879f990b81aa63e8e3aaff8ce9110333f0e

Branch: gabe-l-hart/alora-support

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix(cpp): Update to alora_invocation_tokens on c++ side

Branch: gabe-l-hart/alora-support

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat: Add C APIs to get alora invocation token array from lora

Branch: gabe-l-hart/alora-support

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat: Initial implementation of alora cache logic in server

This does not yet do the part to identify the invocation tokens and only
apply the lora adapter afterwards, but it does seem to produce correct
results if the invocation tokens are the beginning of the uncached input.

Branch: gabe-l-hart/alora-support

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat: Identify alora invocation sequences

This currently limits to a single enabled alora per slot. Multiple aloras
with different invocation sequences would be possible, but it would require
a more complex integration of the adapter toggling and is not really a well
studied case for alora since it's unclear if one alora can reuse cache from
previous prefill computed with a different alora.

Branch: gabe-l-hart/alora-support

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat: Only reuse cache for tokens before the alora invocation start

This is a bit of an edge case, but theoretically a user could try the same
query with the alora disabled (just using the base model), then retry with
the alora. The cached tokens from the first pass should be invalid.

Branch: gabe-l-hart/alora-support

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat: Handle un-cached tokens that come before the alora activation

The solution is to only fill up to the token before the invocation start in
the batch if there are any tokens to be prefilled between those pulled from
cache and the invocation start. When this is detected, the alora is
temporarily disabled with a scale of 0.0, then immediately re-enabled after
it has been initialized for the internal graph. Since the batch does not
complete the prompt tokens, the remaining prompt tokens are handled in the
next task, pulling all of the non-alora tokens from cache and proceeding
with prefill for the alora tokens.

Branch: gabe-l-hart/alora-support

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix: Use || instead of 'or'

Too much python 🤦

Branch: gabe-l-hart/alora-support

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix: Fix off-by-one for limiting cached tokens to before alora start

This was the cause of the inconsistent results from the dummy test script
with and without the turn that runs the prompt without the adapter before
running it with the adapter.

Branch: gabe-l-hart/alora-support

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix: Support backwards-compatibility for "invocation_string" in adapter_config.json

While this has been replaced in the PEFT PR in favor of
alora_invocation_tokens, the existing adapters in the ibm-granite org on HF
use "invocation_string," so this will enable backwards compatibility and
enable testing now (before PEFT PR changes have percolated everywhere).

Branch: gabe-l-hart/alora-support

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix: Remove duplicate logging

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* feat: Report alora_invocation_string and alora_invocation_tokens from /lora-adapters

Branch: gabe-l-hart/alora-support

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

---------

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
2025-09-05 17:32:39 -06:00
Sigbjørn Skjæret
4281c7b315 ci : exempt correct research label (#15825) 2025-09-06 01:21:15 +02:00
Gabe Goodhart
5fac79cbc7 Thinking model disabled assistant prefill (#15404)
* feat: Set enable_thinking IFF not disabled and supported

Branch: gabe-l-hart/thinking-model-disabled-agent-prefill

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix: Fix inverted logic condition for prefill error

Branch: gabe-l-hart/thinking-model-disabled-agent-prefill

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix: Always parse the enable_thinking kwarg to overwrite the default value

From what I can tell, this started as a Qwen3-specific keyword, but from
the use in `chat.cpp` translates this inputs.enable_thinking to the right
thinking kwarg for the given model, this is now more of a standardized
kwarg, so it should always override the default value when sent as part of
the chat_template_kwargs field in the API.

Branch: gabe-l-hart/thinking-model-disabled-agent-prefill

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix: Don't limit tempalte expansion check to jinja

With the use_jinja check, non-jinja models would enable thinking and always
fail assistant prefill

Branch: gabe-l-hart/thinking-model-disabled-agent-prefill

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat: Add the error text to json type errors in json_value

Branch: gabe-l-hart/thinking-model-disabled-agent-prefill

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat: Explicitly reject string values for "enable_thinking"

There are too many possible "truthy" / "falsy" strings and too many
ambiguous strings that don't have a clear truthy/falsy value, so the
simplest thing to do here is to reject the request. Ideally, this would be
a 422 (Unprocessable Entity), but right now it's coming back as a 500.

Branch: gabe-l-hart/thinking-model-disabled-agent-prefill

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* refactor: Move logic for detecting template enable_thinking support to common

Branch: gabe-l-hart/thinking-model-disabled-agent-prefill

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix: Use raw pointer for common chat template function

Branch: gabe-l-hart/thinking-model-disabled-agent-prefill

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

---------

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
2025-09-05 14:31:24 -06:00
Eric Curtin
408ff524b4 Implement --log-colors with always/never/auto (#15792)
With auto by default

Signed-off-by: Eric Curtin <ericcurtin17@gmail.com>
2025-09-05 19:43:59 +01:00
Johannes Gäßler
5143fa895e CUDA: fastdiv, launch bounds for mmvq + q8_1 quant (#15802)
* CUDA: fastdiv, launch bounds for mmvq + q8_1 quant
2025-09-05 16:07:02 +02:00
Daniel Bevenius
3a550b5ca4 tests : add --list-ops and --show-coverage options (#15745)
This commit adds two new command-line options to the
test-backend-ops.cpp that allow users to list all available GGML
operations and to show test coverage of these operations.

The motivation for this is that it can be useful to quickly see which
operations are currently covered by tests and which are not. Also it
migth be useful when using the `support` mode.
2025-09-05 13:49:21 +01:00
Erik Scholz
a81283820a gguf: gguf_writer refactor (#15691)
* gguf: split gguf writer into base and buf impl
* gguf: templated gguf write out
* gguf: file based writer (avoid writing everything to memory first!)
* examples(llama2c): fix log not being the same level and compiler nits
2025-09-05 11:34:28 +02:00
Georgi Gerganov
c610b6c11b kv-cache : fix SWA checks + disable cacheless iSWA (#15811)
ggml-ci
2025-09-05 10:39:22 +03:00
Daniel Bevenius
5d6688de08 model-conversion : add --embeddings flag to modelcard.template [no ci] (#15801)
This commit updates the modelcard.template file used in the model
conversion scripts for embedding models to include the llama-server
--embeddings flag in the recommended command to run the model.

The motivation for this change was that when using the model-conversion
"tool" to upload the EmbeddingGemma models to Hugging Face this flag was
missing and the embedding endpoint was there for not available when
copy&pasting the command.
2025-09-05 04:36:23 +02:00
ExtReMLapin
4fd1242bef chat : fixed crash when Hermes 2 <tool_call> had a newline before it (#15639)
Co-authored-by: CNE Pierre FICHEPOIL <pierre-1.fichepoil@gendarmerie.interieur.gouv.fr>
2025-09-05 01:24:08 +02:00
Piotr Wilkin (ilintar)
b2426e469e chat : nemotron thinking & toolcalling support (#15676)
* feat: nemotron thinking & toolcalling support

* Trailing whitespaces

* Corrected template for Nemotron

* Template and parser fixes

* Final template and grammar changes

* Whitespace

* Always do lazy grammar processing since </think> tag will always be there.

* Allow extra content after toolcall

* Whitespace

* New tests: thinking + tools, tools + content, thinking + tools + content (new!)

* Whitespace

* Remove cURL test script
2025-09-05 01:22:22 +02:00
Piotr Wilkin (ilintar)
9e2b1e83c6 scripts : add Jinja tester PySide6 simple app (#15756)
* feat: add Jinja tester PySide6 simple app

* Linter fixes

* Pylint fixes

* Whitespace

* Add commandline support; add formatter; add extensions

* Remove testing actions

* Silence flake8 warnings for commandline mode

* Apply suggestions from code review

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Fix trailing whitespace/newline logic

* Update scripts/jinja/jinja-tester.py

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update scripts/jinja/jinja-tester.py

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
2025-09-05 01:05:12 +02:00
Daniel Bevenius
fb15d649ed llama : add support for EmbeddingGemma 300m (#15798)
This commit add support for the EmbeddingGemma 300m. This model supports
sliding window attention (SWA) and a new swq_type is introduced to
support symmetric SWA masking.

This commit also extracts the code from the function
llama_is_masked_swa in llama-impl.h, so that the logic can be shared
by both llm_graph_input_attn_no_cache::set_input and
llama_kv_cache::set_input_kq_mask.

With this commit the EmbeddingGemma 300m model can be converted to
to GGUF and used with llama.cpp.

Once the model has been uploaded to HuggingFace it can be used like
this:
```console
./build/bin/llama-cli -hf ggml-org/embeddinggemma-300m-GGUF:Q8_0
```
2025-09-04 18:10:29 +02:00
Gabe Goodhart
856ed0947f metal : Add template specialization for mul_mm_id w/ ne20 == 10 (#15799)
Branch: GGMLMetalNE20

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
2025-09-04 18:53:22 +03:00
Daniel Bevenius
d1e2adba65 llama : set n_outputs to 1 to avoid 0 outputs mean-pooling (#15791)
* llama : set n_outputs to 1 to avoid 0 outputs mean-pooling

This commit modifies the llama_context constructor to set n_outputs to
1.

The motivation for this is that when using pooling, and specifically
mean pooling, for embeddings having n_outputs set to 0 can lead to the
following error:
```console
$ build/bin/llama-embedding -m models/nomic-embed-text-1.5-Q4_K_M.gguf \
   --pooling mean -p "Hello, how are you?"
...
llama_context:        CPU  output buffer size =     0.12 MiB
/home/danbev/work/ai/llama.cpp/ggml/src/ggml.c:3023: GGML_ASSERT(ggml_can_mul_mat(a, b)) failed
0x0000743c96d107e3 in __GI___wait4 (pid=292978, stat_loc=0x0, options=0, usage=0x0) at ../sysdeps/unix/sysv/linux/wait4.c:30
warning: 30	../sysdeps/unix/sysv/linux/wait4.c: No such file or directory
30	in ../sysdeps/unix/sysv/linux/wait4.c
196	        waitpid(child_pid, NULL, 0);
230	        ggml_print_backtrace();
3023	    GGML_ASSERT(ggml_can_mul_mat(a, b));
1823	                cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, inp)), inp_mean);
18983	    llm->build_pooling(cls, cls_b, cls_out, cls_out_b);
1399	    auto * gf = model.build_graph(gparams);
292	            auto * gf = graph_reserve(1, n_seqs, n_outputs, mctx.get(), true);
2329	        auto * ctx = new llama_context(*model, params);
913	    llama_context * lctx = llama_init_from_model(model, cparams);
105	    common_init_result llama_init = common_init_from_params(params);
[Inferior 1 (process 292976) detached]
Aborted (core dumped)
```

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* add comment about not reserving graphs with zero outputs

* add assert in graph_reserve to ensure n_outputs >= 1

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2025-09-04 15:40:44 +02:00
Chenguang Li
c1c354e44c CANN: Refactor ND to NZ workspace to be per-device (#15763)
* CANN:Refactor ND to NZ workspace to be per-device in Ascend backend

- Replaced the previous single global ND→NZ workspace with a per-device
  cache using unordered_map keyed by device ID.
- Functions `release_nz_workspace`, `relloc_nz_workspace`, and
  `get_nz_workspace` now manage workspace independently for each device,
  preventing memory conflicts in multi-device / pipeline parallel scenarios.
- This change fixes potential precision issues caused by workspace
  overwrites when multiple devices perform ND→NZ conversions concurrently.

Co-authored-by: hipudding <huafengchun@gmail.com>

* refactor

Signed-off-by: noemotiovon <757486878@qq.com>

* rename

Signed-off-by: noemotiovon <757486878@qq.com>

* fix review comments

Signed-off-by: noemotiovon <757486878@qq.com>

---------

Signed-off-by: noemotiovon <757486878@qq.com>
Co-authored-by: hipudding <huafengchun@gmail.com>
2025-09-04 20:20:14 +08:00
Xuan-Son Nguyen
a68d914426 server: add exceed_context_size_error type (#15780)
* server: add exceed_context_size_error type

* change error code to 400
2025-09-04 11:50:23 +02:00
Eric Curtin
badb80cadb Document the new max GPU layers default in help (#15771)
This is a key change, just letting users know.

Signed-off-by: Eric Curtin <ericcurtin17@gmail.com>
2025-09-04 10:49:44 +01:00
leejet
0a1b3982cd ggml: add ops for WAN video model (cuda && cpu) (#15669)
* add conv3d support

* add ggml_pad_ext for cpu & cuda backend

* cuda/cpu: add im2col_3d support

* cuda: make im2col a little faster

* fix cuda pad/scale/im2col3d

* make im2col_3d faster

* gguf: support loading tensors which n_dims > GGML_MAX_DIMS

* fix cuda get_rows

* avoid ggml_conv_3d conflict

* correct GGML_OP_COUNT assertion

* avoid build failure

* avoid build failure on MacOS

* cuda: remove unnecessary MIN define

* fix cpu im2col_3d

* adjust the code style

* cuda: use simpler loop in get_rows

* add test_im2col_3d to test-backend-ops

* test-backend-ops.cpp: remove trailing whitespace

* cpu: im2col_3d support non continuous src

Co-authored-by: Jeff Bolz <jbolz@nvidia.com>

* fix test_im2col_3d

* remove unused variables

* cuda: get_rows: dfloat2 -> float2

* add test_pad_ext to test-backend-ops.cpp

* add gguf_init_from_file_ext impl

* Revert "gguf: support loading tensors which n_dims > GGML_MAX_DIMS"

This reverts commit d8377a0a37.

* Revert "add gguf_init_from_file_ext impl"

This reverts commit d9f1d13208.

* update ggml_backend_vk_device_supports_op

* fix ggml_backend_vk_device_supports_op

* update other backend supports op for ggml_pad_ext

* metal/opencl/sycl/vulkan: fix GGML_OP_PAD check in supports_op

---------

Co-authored-by: Jeff Bolz <jbolz@nvidia.com>
2025-09-04 10:38:49 +02:00
hipudding
5421f63ab0 CANN: Fix precision issue on 310I DUO multi-devices (#15784) 2025-09-04 15:12:30 +08:00
rmatif
820bc98531 opencl: add hs=40 to FA (#15758) 2025-09-03 23:30:28 -07:00
Chenguang Li
239b60e898 CANN: fix acl_rstd allocation size in ggml_cann_rms_norm (#15760)
Fixes #15330

Adjust the allocation size of acl_rstd. The parameter `dims` is set to 3 according to the CANN documentation.

Co-authored-by: Yuchuan <yuchuan-cao@users.noreply.github.com>
2025-09-04 11:03:02 +08:00
Ruben Ortlam
dff7551bfd vulkan: fix mmv subgroup16 selection (#15775) 2025-09-03 21:55:10 +01:00
Jeff Bolz
0fce7a1248 vulkan: don't use std::string in load_shaders, to improve compile time (#15724)
* vulkan: don't use std::string in load_shaders, to improve compile time

* keep the string version for those calls that use it
2025-09-03 20:33:15 +02:00
Daniel Bevenius
8227695d7a vulkan : update ggml_vk_instance_validation_ext_available (#15666)
* vulkan : update ggml_vk_instance_validation_ext_available

This commit updates ggml_vk_instance_validation_ext_available() to
check for VK_EXT_validation_features instead of
VK_KHR_portability_enumeration.

Based on how the returned boolean is used later in the code (to enable
both the validation layer and the VK_EXT_validation_features extension),
it appears the function may have been intended to check for the
validation layer features extension.

* remove try/catch

This was a left over from a previous iteration where I was explicitly
quering for a specific validation layer first, which would throw.

* update warning message about validation layers
2025-09-03 20:24:50 +02:00
Shin-myoung-serp
0014fb4add ggml vulkan: add hardsigmoid and hardswish operations (#15762) 2025-09-03 20:22:55 +02:00
Oliver Simons
661ae31c9c CUDA: Optimize rms_norm_f32 kernel and its fused variants, giving 1-6% perf E2E (#15715)
* Add fastdiv, use it in modulo and use modulo in rms_norm_f32

Fastdiv is much faster way to do integer division, which was identified
as bottleneck in rms_norm_f32

* Support more `block_size` values in `rms_norm_f32`

This makes us more flexible in selecting the optimal threads w.r.t
paralellizing across a col vs. launch-overheads of threads and mio
throttles

* Update ggml/src/ggml-cuda/common.cuh

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Replace modulo with fastmodulo in `rms_norm_f32`

* Use `BinPackArguments=true` for formating function calls

Will file a separate PR to adjust .clang-format file

* Update ggml/src/ggml-cuda/common.cuh

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Use uint3 for both `fastdiv` and `fastmodulo`

The compiler seems to reliably optimize away the unused .z component in
the fastdiv use-case, see https://godbolt.org/z/rx8KPrKr3

* More constrained type declarations

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Rename fastdiv and fastmodulo variables to shared variable name

As suggest by JohannesGaessler, this increases clarity of the intended
use

* Pack fastdiv/fastmodulo constants into uint2/uint3 objects

By packing constants to be used together into a struct, we are less
likely to make errors.

* Rename function parameter of fastmodulo

`modulo_consts` is more fitting/descriptive

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
2025-09-03 19:59:16 +02:00
Daniel Bevenius
407c23786d model-conversion : fix pyright errors (#15770)
This commit addresses type errors reported by pyright in the model
conversion scripts.
2025-09-03 18:28:36 +02:00
Georgi Gerganov
cdedb70a99 sampling : optimize dist sampler (#15704)
ggml-ci
2025-09-03 18:16:26 +03:00
Daniel Bevenius
2c8dac72eb llama : fix incorrect model type for Gemma 270M (#15764)
This commit fixes the model type for the Gemma 270M model in
llama_model.cpp which should be LLM_TYPE_270M. I incorrectly added this
previously as LLM_TYPE_537M which was wrong.

The motivation for this is that it causes the model to not be identified
properly when using tools like llama-bench. For example:
```console
$ ./build/bin/llama-bench -m models/gemma-3-270m-Q8_0.gguf
| model                          |       size | ...
| ------------------------------ | ---------: | ...
| gemma3 ?B Q8_0                 | 271.81 MiB | ...
| gemma3 ?B Q8_0                 | 271.81 MiB | ...
```

With the changes in this commit the output will be:
```console
$ ./build/bin/llama-bench -m models/gemma-3-270m-Q8_0.gguf
| model                          |       size | ...
| ------------------------------ | ---------: | ...
| gemma3 270M Q8_0               | 271.81 MiB | ...
| gemma3 270M Q8_0               | 271.81 MiB | ...
```
2025-09-03 13:35:49 +02:00
Daniel Bevenius
40a751ea9a model-conversion : remove hardcoded /bin/bash shebangs [no ci] (#15765)
* model-conversion : remove hardcoded /bin/bash shebangs [no ci]

This commit updates the bash scripts to use env instead of using
hardcoded /bin/bash in the shebang line.

The motivation for this is that some systems may have bash installed
in a different location, and using /usr/bin/env bash ensures that
the script will use the first bash interpreter found in the user's
PATH, making the scripts more portable across different environments.

* model-conversion : rename script to .py [no ci]

This commit renames run-casual-gen-embeddings-org.sh to
run-casual-gen-embeddings-org.py to reflect its Python nature.
2025-09-03 12:50:47 +02:00
hipudding
5eae934883 CANN: Add RoPE contiguous check for 310I DUP device (#15735) 2025-09-03 16:46:01 +08:00
xctan
05c0380f2a ggml-cpu : optimize RVV kernels (#15720)
* ggml-cpu : optimize rvv ggml_vec_dot_f32

* ggml-cpu : optimize 128-bit rvv ggml_vec_dot_q4_K_q8_K

* ggml-cpu : fix riscv arch flags

* ggml-cpu : add more rvv ops

* ggml-cpu : optimize rvv ggml_vec_dot_q4_K_q8_K

* ggml-cpu : optimize rvv ggml_vec_dot_q6_K_q8_K

* ggml-cpu : minor rvv adjustments

* ggml-cpu : fix riscv include
2025-09-03 16:16:21 +08:00
Daniel Bevenius
8c3fdf44ec model-conversion : add missing curl script [no ci] (#15761)
This commit adds a curl script to the model-conversion examples
which is currently missing. This script is required for the running the
embedding server targets to test llama-server embeddings functionality.
2025-09-03 09:48:35 +02:00
hipudding
f6da8cb86a CANN: Mask unsupported TRANSPOSE_1D operator (#15733)
CANN currently does not support kernels larger than 255.
This change disables such cases.
2025-09-03 14:08:22 +08:00
Chenguang Li
8a2234ea0c CANN: Fix type float_t to float (#15736)
Signed-off-by: noemotiovon <757486878@qq.com>
2025-09-03 10:43:53 +08:00
SnA1lGo
3de008208b fix: resolve unsigned int initialization warning for n_dims/size in gguf.cpp (#15754) 2025-09-02 21:27:30 +02:00
Oliver Simons
69db8a52e6 chore: Update .clang-format to use BinPackArguments=true (#15744)
This seems to correspond with what we want to do, see
[here](https://github.com/ggml-org/llama.cpp/pull/15715#discussion_r2315613796)
and [clang-format docs](https://clang.llvm.org/docs/ClangFormatStyleOptions.html#binpackarguments)
2025-09-03 01:40:37 +08:00
Johannes Gäßler
c466abe158 llama: -fa 1/0/-1 aliases for -fa on/off/auto (#15746) 2025-09-02 18:17:26 +02:00
Ruben Ortlam
0a2a3841e8 vulkan: fix shaders gen when no integer dot is available (#15740) 2025-09-02 16:02:26 +02:00
hipudding
9961d244f2 CANN: Resolve soft_max precision issue (#15730)
Previously, the slope tensor was set to fp16 to improve efficiency.
While this worked correctly in FA, it caused precision issues in soft_max.
This change applies different data types for different operators
to balance both accuracy and performance.
2025-09-02 17:12:37 +08:00
Jeff Bolz
25f1045f07 vulkan: Fix macro parameter order for f32 matmul shaders (#15716) 2025-09-02 14:37:01 +08:00
rmatif
97669e4073 opencl: add attn sinks support for FA kernels (#15706) 2025-09-01 23:26:53 -07:00
Chenguang Li
2f853687b3 CANN: Support eager execution mode under ACL graph compilation (#15712)
* [CANN] Support eager execution mode under ACL graph compilation

Add support for running operators in eager mode while ACL graph
compilation is enabled. This allows bypassing graph execution
and directly submitting ops, which is useful for debugging and
reducing graph build overhead in certain scenarios.

Signed-off-by: noemotiovon <757486878@qq.com>

* fix typo

Signed-off-by: noemotiovon <757486878@qq.com>

* rename to acl_graph_mode

Signed-off-by: noemotiovon <757486878@qq.com>

---------

Signed-off-by: noemotiovon <757486878@qq.com>
2025-09-02 14:07:48 +08:00
hipudding
ef2af57ddf CANN: Support ext_factor in rope (#15710) 2025-09-02 14:05:23 +08:00
Johannes Gäßler
5d804a4938 ggml-backend: raise GGML_MAX_SPLIT_INPUTS (#15722) 2025-09-01 16:14:55 -07:00
245 changed files with 21207 additions and 9720 deletions

View File

@@ -22,7 +22,14 @@ AllowShortIfStatementsOnASingleLine: Never
AllowShortLambdasOnASingleLine: Inline
AllowShortLoopsOnASingleLine: false
AlwaysBreakBeforeMultilineStrings: true
BinPackArguments: false
# Treat CUDA keywords/attributes as "attribute macros" and avoid breaking lines inside them
AttributeMacros:
- __host__
- __device__
- __global__
- __forceinline__
- __launch_bounds__
BinPackArguments: true
BinPackParameters: false # OnePerLine
BitFieldColonSpacing: Both
BreakBeforeBraces: Custom # Attach

View File

@@ -4,7 +4,7 @@ ARG UBUNTU_VERSION=24.04
ARG ROCM_VERSION=6.4
ARG AMDGPU_VERSION=6.4
# Target the CUDA build image
# Target the ROCm build image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
### Build image
@@ -15,16 +15,13 @@ FROM ${BASE_ROCM_DEV_CONTAINER} AS build
# This is mostly tied to rocBLAS supported archs.
# gfx803, gfx900, gfx1032, gfx1101, gfx1102,not officialy supported
# gfx906 is deprecated
#check https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.2.4/reference/system-requirements.html
#check https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.4.1/reference/system-requirements.html
ARG ROCM_DOCKER_ARCH='gfx803,gfx900,gfx906,gfx908,gfx90a,gfx942,gfx1010,gfx1030,gfx1032,gfx1100,gfx1101,gfx1102'
#ARG ROCM_DOCKER_ARCH=gfx1100
ARG ROCM_DOCKER_ARCH='gfx803;gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1010;gfx1030;gfx1032;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201;gfx1151'
#ARG ROCM_DOCKER_ARCH='gfx1151'
# Set nvcc architectured
# Set ROCm architectures
ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
# Enable ROCm
# ENV CC=/opt/rocm/llvm/bin/clang
# ENV CXX=/opt/rocm/llvm/bin/clang++
RUN apt-get update \
&& apt-get install -y \
@@ -39,8 +36,16 @@ WORKDIR /app
COPY . .
RUN git clone https://github.com/rocm/rocwmma --branch develop --depth 1
RUN HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=$ROCM_DOCKER_ARCH -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DCMAKE_BUILD_TYPE=Release -DLLAMA_BUILD_TESTS=OFF \
cmake -S . -B build \
-DGGML_HIP=ON \
-DGGML_HIP_ROCWMMA_FATTN=ON \
-DCMAKE_HIP_FLAGS="-I$(pwd)/rocwmma/library/include/" \
-DAMDGPU_TARGETS="$ROCM_DOCKER_ARCH" \
-DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON \
-DCMAKE_BUILD_TYPE=Release -DLLAMA_BUILD_TESTS=OFF \
&& cmake --build build --config Release -j$(nproc)
RUN mkdir -p /app/lib \

View File

@@ -56,7 +56,7 @@ env:
jobs:
macOS-latest-cmake-arm64:
runs-on: macos-14
runs-on: macos-latest
steps:
- name: Clone
@@ -88,6 +88,7 @@ jobs:
-DGGML_METAL_SHADER_DEBUG=ON \
-DGGML_RPC=ON
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
leaks -atExit -- ./build/bin/test-thread-safety -hf ggml-org/gemma-3-270m-qat-GGUF -ngl 99 -p "$(printf 'hello %.0s' {1..128})" -n 16 -c 512 -ub 32 -np 2 -t 2 -lv 1
- name: Test
id: cmake_test
@@ -126,7 +127,8 @@ jobs:
-DCMAKE_BUILD_RPATH="@loader_path" \
-DLLAMA_FATAL_WARNINGS=ON \
-DGGML_METAL=OFF \
-DGGML_RPC=ON
-DGGML_RPC=ON \
-DCMAKE_OSX_DEPLOYMENT_TARGET=13.3
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
- name: Test
@@ -136,7 +138,7 @@ jobs:
ctest -L main --verbose --timeout 900
macOS-latest-cmake-arm64-webgpu:
runs-on: macos-14
runs-on: macos-latest
steps:
- name: Clone
@@ -709,6 +711,7 @@ jobs:
macOS-latest-swift:
runs-on: macos-latest
needs: ios-xcode-build
strategy:
matrix:
@@ -725,6 +728,12 @@ jobs:
key: macOS-latest-swift
evict-old-files: 1d
- name: Download xcframework artifact
uses: actions/download-artifact@v4
with:
name: llama-xcframework
path: build-apple/llama.xcframework/
- name: Dependencies
id: depends
continue-on-error: true
@@ -746,11 +755,6 @@ jobs:
-DCMAKE_OSX_ARCHITECTURES="arm64;x86_64"
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
- name: xcodebuild for swift package
id: xcodebuild
run: |
./build-xcframework.sh
windows-msys2:
runs-on: windows-2025
@@ -1050,9 +1054,13 @@ jobs:
run: examples/sycl/win-build-sycl.bat
windows-latest-cmake-hip:
if: ${{ github.event.inputs.create_release != 'true' }}
runs-on: windows-2022
env:
# The ROCm version must correspond to the version used in the HIP SDK.
ROCM_VERSION: "6.4.2"
HIPSDK_INSTALLER_VERSION: "25.Q3"
steps:
- name: Clone
id: checkout
@@ -1061,23 +1069,46 @@ jobs:
- name: Clone rocWMMA repository
id: clone_rocwmma
run: |
git clone https://github.com/rocm/rocwmma --branch rocm-6.2.4 --depth 1
git clone https://github.com/rocm/rocwmma --branch rocm-${{ env.ROCM_VERSION }} --depth 1
- name: Install
- name: Cache ROCm Installation
id: cache-rocm
uses: actions/cache@v4
with:
path: C:\Program Files\AMD\ROCm
key: rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }}
- name: Install ROCm
if: steps.cache-rocm.outputs.cache-hit != 'true'
id: depends
run: |
$ErrorActionPreference = "Stop"
write-host "Downloading AMD HIP SDK Installer"
Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
write-host "Installing AMD HIP SDK"
$proc = Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -PassThru
$proc.WaitForExit(600000)
$completed = $proc.WaitForExit(600000)
if (-not $completed) {
Write-Error "ROCm installation timed out after 10 minutes. Killing the process"
$proc.Kill()
exit 1
}
if ($proc.ExitCode -ne 0) {
Write-Error "ROCm installation failed with exit code $($proc.ExitCode)"
exit 1
}
write-host "Completed AMD HIP SDK installation"
- name: Verify ROCm
id: verify
run: |
& 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
# Find and test ROCm installation
$clangPath = Get-ChildItem 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | Select-Object -First 1
if (-not $clangPath) {
Write-Error "ROCm installation not found"
exit 1
}
& $clangPath.FullName --version
- name: Install ccache
uses: ggml-org/ccache-action@v1.2.16
@@ -1141,8 +1172,17 @@ jobs:
run: |
./build-xcframework.sh
- name: Upload xcframework artifact
uses: actions/upload-artifact@v4
with:
name: llama-xcframework
path: build-apple/llama.xcframework/
retention-days: 1
- name: Build Xcode project
run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' FRAMEWORK_FOLDER_PATH=./build-ios build
run: |
xcodebuild -downloadPlatform iOS
xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' FRAMEWORK_FOLDER_PATH=./build-ios build
android-build:
runs-on: ubuntu-latest

View File

@@ -17,7 +17,7 @@ jobs:
steps:
- uses: actions/stale@v5
with:
exempt-issue-labels: "refactoring,help wanted,good first issue,research,bug,roadmap"
exempt-issue-labels: "refactoring,help wanted,good first issue,research 🔬,bug,roadmap"
days-before-issue-stale: 30
days-before-issue-close: 14
stale-issue-label: "stale"

View File

@@ -108,7 +108,8 @@ jobs:
-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
-DLLAMA_FATAL_WARNINGS=ON \
-DGGML_METAL=OFF \
-DGGML_RPC=ON
-DGGML_RPC=ON \
-DCMAKE_OSX_DEPLOYMENT_TARGET=13.3
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
- name: Determine tag name
@@ -528,11 +529,14 @@ jobs:
windows-hip:
runs-on: windows-2022
env:
HIPSDK_INSTALLER_VERSION: "25.Q3"
strategy:
matrix:
include:
- name: "radeon"
gpu_targets: "gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032"
gpu_targets: "gfx1151;gfx1200;gfx1201;gfx1100;gfx1101;gfx1102;gfx1030;gfx1031;gfx1032"
steps:
- name: Clone
@@ -542,29 +546,52 @@ jobs:
- name: Clone rocWMMA repository
id: clone_rocwmma
run: |
git clone https://github.com/rocm/rocwmma --branch rocm-6.2.4 --depth 1
git clone https://github.com/rocm/rocwmma --branch develop --depth 1
- name: Cache ROCm Installation
id: cache-rocm
uses: actions/cache@v4
with:
path: C:\Program Files\AMD\ROCm
key: rocm-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ runner.os }}
- name: ccache
uses: ggml-org/ccache-action@v1.2.16
with:
key: windows-latest-cmake-hip-${{ matrix.name }}-x64
key: windows-latest-cmake-hip-${{ env.HIPSDK_INSTALLER_VERSION }}-${{ matrix.name }}-x64
evict-old-files: 1d
- name: Install
- name: Install ROCm
if: steps.cache-rocm.outputs.cache-hit != 'true'
id: depends
run: |
$ErrorActionPreference = "Stop"
write-host "Downloading AMD HIP SDK Installer"
Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-24.Q3-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
write-host "Installing AMD HIP SDK"
$proc = Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -PassThru
$proc.WaitForExit(600000)
$completed = $proc.WaitForExit(600000)
if (-not $completed) {
Write-Error "ROCm installation timed out after 10 minutes. Killing the process"
$proc.Kill()
exit 1
}
if ($proc.ExitCode -ne 0) {
Write-Error "ROCm installation failed with exit code $($proc.ExitCode)"
exit 1
}
write-host "Completed AMD HIP SDK installation"
- name: Verify ROCm
id: verify
run: |
& 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
# Find and test ROCm installation
$clangPath = Get-ChildItem 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | Select-Object -First 1
if (-not $clangPath) {
Write-Error "ROCm installation not found"
exit 1
}
& $clangPath.FullName --version
- name: Build
id: cmake_build
@@ -585,9 +612,12 @@ jobs:
-DLLAMA_CURL=OFF
cmake --build build --target ggml-hip -j ${env:NUMBER_OF_PROCESSORS}
md "build\bin\rocblas\library\"
md "build\bin\hipblaslt\library"
cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"
cp "${env:HIP_PATH}\bin\hipblaslt.dll" "build\bin\"
cp "${env:HIP_PATH}\bin\rocblas.dll" "build\bin\"
cp "${env:HIP_PATH}\bin\rocblas\library\*" "build\bin\rocblas\library\"
cp "${env:HIP_PATH}\bin\hipblaslt\library\*" "build\bin\hipblaslt\library\"
- name: Pack artifacts
id: pack_artifacts

View File

@@ -58,6 +58,12 @@ if (MSVC)
add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/bigobj>")
endif()
if (CMAKE_SYSTEM_NAME STREQUAL "iOS")
set(LLAMA_TOOLS_INSTALL_DEFAULT OFF)
else()
set(LLAMA_TOOLS_INSTALL_DEFAULT ${LLAMA_STANDALONE})
endif()
#
# option list
#
@@ -82,6 +88,7 @@ option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_TOOLS "llama: build tools" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE})
option(LLAMA_TOOLS_INSTALL "llama: install tools" ${LLAMA_TOOLS_INSTALL_DEFAULT})
# 3rd party libs
option(LLAMA_CURL "llama: use libcurl to download model from an URL" ON)

View File

@@ -16,6 +16,9 @@
- Use the following format for the squashed commit title: `<module> : <commit title> (#<issue_number>)`. For example: `utils : fix typo in utils.py (#1234)`
- Optionally pick a `<module>` from here: https://github.com/ggml-org/llama.cpp/wiki/Modules
- Consider adding yourself to [CODEOWNERS](CODEOWNERS)
- Let authors, who are also collaborators, merge their own PRs
- When merging a PR by a contributor, make sure you have a good understanding of the changes
- Be mindful of maintenance: most of the work going into a feature happens after the PR is merged. If the PR author is not committed to contribute long-term, someone else needs to take responsibility (you)
# Coding guidelines

View File

@@ -270,7 +270,9 @@ function gg_run_ctest_with_model_debug {
local model; model=$(gg_get_model)
cd build-ci-debug
set -e
(LLAMACPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log
set +e
cd ..
}
@@ -281,7 +283,15 @@ function gg_run_ctest_with_model_release {
local model; model=$(gg_get_model)
cd build-ci-release
set -e
(LLAMACPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log
# test memory leaks
#if [[ ! -z ${GG_BUILD_METAL} ]]; then
# # TODO: this hangs for some reason ...
# (time leaks -quiet -atExit -- ./bin/test-thread-safety -m $model --parallel 2 -t 2 -p "hello") 2>&1 | tee -a $OUT/${ci}-leaks.log
#fi
set +e
cd ..
}
@@ -860,10 +870,7 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
fi
ret=0
if [ -z ${GG_BUILD_SYCL} ]; then
# SYCL build breaks with debug build flags
test $ret -eq 0 && gg_run ctest_debug
fi
test $ret -eq 0 && gg_run ctest_debug
test $ret -eq 0 && gg_run ctest_release
if [ -z ${GG_BUILD_LOW_PERF} ]; then
@@ -871,9 +878,7 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
test $ret -eq 0 && gg_run rerank_tiny
if [ -z ${GG_BUILD_CLOUD} ] || [ ${GG_BUILD_EXTRA_TESTS_0} ]; then
if [ -z ${GG_BUILD_SYCL} ]; then
test $ret -eq 0 && gg_run test_scripts_debug
fi
test $ret -eq 0 && gg_run test_scripts_debug
test $ret -eq 0 && gg_run test_scripts_release
fi
@@ -884,9 +889,7 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
test $ret -eq 0 && gg_run pythia_2_8b
#test $ret -eq 0 && gg_run open_llama_7b_v2
fi
if [ -z ${GG_BUILD_SYCL} ]; then
test $ret -eq 0 && gg_run ctest_with_model_debug
fi
test $ret -eq 0 && gg_run ctest_with_model_debug
test $ret -eq 0 && gg_run ctest_with_model_release
fi
fi

View File

@@ -745,6 +745,124 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string &
#endif // LLAMA_USE_CURL
//
// Docker registry functions
//
static std::string common_docker_get_token(const std::string & repo) {
std::string url = "https://auth.docker.io/token?service=registry.docker.io&scope=repository:" + repo + ":pull";
common_remote_params params;
auto res = common_remote_get_content(url, params);
if (res.first != 200) {
throw std::runtime_error("Failed to get Docker registry token, HTTP code: " + std::to_string(res.first));
}
std::string response_str(res.second.begin(), res.second.end());
nlohmann::ordered_json response = nlohmann::ordered_json::parse(response_str);
if (!response.contains("token")) {
throw std::runtime_error("Docker registry token response missing 'token' field");
}
return response["token"].get<std::string>();
}
static std::string common_docker_resolve_model(const std::string & docker) {
// Parse ai/smollm2:135M-Q4_K_M
size_t colon_pos = docker.find(':');
std::string repo, tag;
if (colon_pos != std::string::npos) {
repo = docker.substr(0, colon_pos);
tag = docker.substr(colon_pos + 1);
} else {
repo = docker;
tag = "latest";
}
// ai/ is the default
size_t slash_pos = docker.find('/');
if (slash_pos == std::string::npos) {
repo.insert(0, "ai/");
}
LOG_INF("%s: Downloading Docker Model: %s:%s\n", __func__, repo.c_str(), tag.c_str());
try {
// --- helper: digest validation ---
auto validate_oci_digest = [](const std::string & digest) -> std::string {
// Expected: algo:hex ; start with sha256 (64 hex chars)
// You can extend this map if supporting other algorithms in future.
static const std::regex re("^sha256:([a-fA-F0-9]{64})$");
std::smatch m;
if (!std::regex_match(digest, m, re)) {
throw std::runtime_error("Invalid OCI digest format received in manifest: " + digest);
}
// normalize hex to lowercase
std::string normalized = digest;
std::transform(normalized.begin()+7, normalized.end(), normalized.begin()+7, [](unsigned char c){
return std::tolower(c);
});
return normalized;
};
std::string token = common_docker_get_token(repo); // Get authentication token
// Get manifest
const std::string url_prefix = "https://registry-1.docker.io/v2/" + repo;
std::string manifest_url = url_prefix + "/manifests/" + tag;
common_remote_params manifest_params;
manifest_params.headers.push_back("Authorization: Bearer " + token);
manifest_params.headers.push_back(
"Accept: application/vnd.docker.distribution.manifest.v2+json,application/vnd.oci.image.manifest.v1+json");
auto manifest_res = common_remote_get_content(manifest_url, manifest_params);
if (manifest_res.first != 200) {
throw std::runtime_error("Failed to get Docker manifest, HTTP code: " + std::to_string(manifest_res.first));
}
std::string manifest_str(manifest_res.second.begin(), manifest_res.second.end());
nlohmann::ordered_json manifest = nlohmann::ordered_json::parse(manifest_str);
std::string gguf_digest; // Find the GGUF layer
if (manifest.contains("layers")) {
for (const auto & layer : manifest["layers"]) {
if (layer.contains("mediaType")) {
std::string media_type = layer["mediaType"].get<std::string>();
if (media_type == "application/vnd.docker.ai.gguf.v3" ||
media_type.find("gguf") != std::string::npos) {
gguf_digest = layer["digest"].get<std::string>();
break;
}
}
}
}
if (gguf_digest.empty()) {
throw std::runtime_error("No GGUF layer found in Docker manifest");
}
// Validate & normalize digest
gguf_digest = validate_oci_digest(gguf_digest);
LOG_DBG("%s: Using validated digest: %s\n", __func__, gguf_digest.c_str());
// Prepare local filename
std::string model_filename = repo;
std::replace(model_filename.begin(), model_filename.end(), '/', '_');
model_filename += "_" + tag + ".gguf";
std::string local_path = fs_get_cache_file(model_filename);
const std::string blob_url = url_prefix + "/blobs/" + gguf_digest;
if (!common_download_file_single(blob_url, local_path, token, false)) {
throw std::runtime_error("Failed to download Docker Model");
}
LOG_INF("%s: Downloaded Docker Model to: %s\n", __func__, local_path.c_str());
return local_path;
} catch (const std::exception & e) {
LOG_ERR("%s: Docker Model download failed: %s\n", __func__, e.what());
throw;
}
}
//
// utils
//
@@ -795,7 +913,9 @@ static handle_model_result common_params_handle_model(
handle_model_result result;
// handle pre-fill default model path and url based on hf_repo and hf_file
{
if (!model.hf_repo.empty()) {
if (!model.docker_repo.empty()) { // Handle Docker URLs by resolving them to local paths
model.path = common_docker_resolve_model(model.docker_repo);
} else if (!model.hf_repo.empty()) {
// short-hand to avoid specifying --hf-file -> default it to --model
if (model.hf_file.empty()) {
if (model.path.empty()) {
@@ -1184,7 +1304,7 @@ static std::vector<ggml_backend_dev_t> parse_device_list(const std::string & val
} else {
for (const auto & device : dev_names) {
auto * dev = ggml_backend_dev_by_name(device.c_str());
if (!dev || ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_GPU) {
if (!dev || ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
throw std::invalid_argument(string_format("invalid device: %s", device.c_str()));
}
devices.push_back(dev);
@@ -1194,7 +1314,7 @@ static std::vector<ggml_backend_dev_t> parse_device_list(const std::string & val
return devices;
}
static void add_rpc_devices(std::string servers) {
static void add_rpc_devices(const std::string & servers) {
auto rpc_servers = string_split<std::string>(servers, ',');
if (rpc_servers.empty()) {
throw std::invalid_argument("no RPC servers specified");
@@ -1263,6 +1383,18 @@ static std::string list_builtin_chat_templates() {
return msg.str();
}
static bool is_truthy(const std::string & value) {
return value == "on" || value == "enabled" || value == "1";
}
static bool is_falsey(const std::string & value) {
return value == "off" || value == "disabled" || value == "0";
}
static bool is_autoy(const std::string & value) {
return value == "auto" || value == "-1";
}
common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
// load dynamic backends
ggml_backend_load_all();
@@ -1544,21 +1676,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.n_chunks = value;
}
).set_examples({LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY, LLAMA_EXAMPLE_RETRIEVAL}));
add_opt(common_arg(
{"-fa", "--flash-attn"}, "FA",
string_format("set Flash Attention use ('on', 'off', or 'auto', default: '%s')", llama_flash_attn_type_name(params.flash_attn_type)),
[](common_params & params, const std::string & value) {
if (value == "on" || value == "enabled") {
params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_ENABLED;
} else if (value == "off" || value == "disabled") {
params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED;
} else if (value == "auto") {
params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO;
} else {
throw std::runtime_error(string_format("error: unkown value for --flash-attn: '%s'\n", value.c_str()));
}
}
).set_env("LLAMA_ARG_FLASH_ATTN"));
add_opt(common_arg({ "-fa", "--flash-attn" }, "[on|off|auto]",
string_format("set Flash Attention use ('on', 'off', or 'auto', default: '%s')",
llama_flash_attn_type_name(params.flash_attn_type)),
[](common_params & params, const std::string & value) {
if (is_truthy(value)) {
params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_ENABLED;
} else if (is_falsey(value)) {
params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED;
} else if (is_autoy(value)) {
params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO;
} else {
throw std::runtime_error(
string_format("error: unkown value for --flash-attn: '%s'\n", value.c_str()));
}
}).set_env("LLAMA_ARG_FLASH_ATTN"));
add_opt(common_arg(
{"-p", "--prompt"}, "PROMPT",
"prompt to start generation with; for system message, use -sys",
@@ -1572,7 +1704,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
params.system_prompt = value;
}
).set_examples({LLAMA_EXAMPLE_MAIN}));
).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_DIFFUSION}));
add_opt(common_arg(
{"--no-perf"},
string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
@@ -2384,24 +2516,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
{"--list-devices"},
"print list of available devices and exit",
[](common_params &) {
std::vector<ggml_backend_dev_t> rpc_devices;
std::vector<ggml_backend_dev_t> all_devices;
std::vector<ggml_backend_dev_t> devices;
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
auto * dev = ggml_backend_dev_get(i);
if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
if (ggml_backend_reg_name(reg) == std::string("RPC")) {
rpc_devices.push_back(dev);
} else {
all_devices.push_back(dev);
}
if (ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_CPU) {
devices.push_back(dev);
}
}
// insert RPC devices in front
all_devices.insert(all_devices.begin(), rpc_devices.begin(), rpc_devices.end());
printf("Available devices:\n");
for (size_t i = 0; i < all_devices.size(); ++i) {
auto * dev = all_devices[i];
for (auto * dev : devices) {
size_t free, total;
ggml_backend_dev_memory(dev, &free, &total);
printf(" %s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024);
@@ -2425,7 +2548,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
{"--cpu-moe", "-cmoe"},
"keep all Mixture of Experts (MoE) weights in the CPU",
[](common_params & params) {
params.tensor_buft_overrides.push_back({"\\.ffn_(up|down|gate)_exps", ggml_backend_cpu_buffer_type()});
params.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());
}
).set_env("LLAMA_ARG_CPU_MOE"));
add_opt(common_arg(
@@ -2438,7 +2561,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
for (int i = 0; i < value; ++i) {
// keep strings alive and avoid leaking memory by storing them in a static vector
static std::list<std::string> buft_overrides;
buft_overrides.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i));
buft_overrides.push_back(llm_ffn_exps_block_regex(i));
params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), ggml_backend_cpu_buffer_type()});
}
}
@@ -2447,7 +2570,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
{"--cpu-moe-draft", "-cmoed"},
"keep all Mixture of Experts (MoE) weights in the CPU for the draft model",
[](common_params & params) {
params.speculative.tensor_buft_overrides.push_back({"\\.ffn_(up|down|gate)_exps", ggml_backend_cpu_buffer_type()});
params.speculative.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override());
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CPU_MOE_DRAFT"));
add_opt(common_arg(
@@ -2459,14 +2582,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
for (int i = 0; i < value; ++i) {
static std::list<std::string> buft_overrides_draft;
buft_overrides_draft.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i));
buft_overrides_draft.push_back(llm_ffn_exps_block_regex(i));
params.speculative.tensor_buft_overrides.push_back({buft_overrides_draft.back().c_str(), ggml_backend_cpu_buffer_type()});
}
}
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_CPU_MOE_DRAFT"));
add_opt(common_arg(
{"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
"number of layers to store in VRAM",
string_format("max. number of layers to store in VRAM (default: %d)", params.n_gpu_layers),
[](common_params & params, int value) {
params.n_gpu_layers = value;
if (!llama_supports_gpu_offload()) {
@@ -2624,6 +2747,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.model.url = value;
}
).set_env("LLAMA_ARG_MODEL_URL"));
add_opt(common_arg(
{ "-dr", "--docker-repo" }, "[<repo>/]<model>[:quant]",
"Docker Hub model repository. repo is optional, default to ai/. quant is optional, default to :latest.\n"
"example: gemma3\n"
"(default: unused)",
[](common_params & params, const std::string & value) {
params.model.docker_repo = value;
}
).set_env("LLAMA_ARG_DOCKER_REPO"));
add_opt(common_arg(
{"-hf", "-hfr", "--hf-repo"}, "<user>/<model>[:quant]",
"Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n"
@@ -3134,13 +3266,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
common_log_set_file(common_log_main(), value.c_str());
}
));
add_opt(common_arg(
{"--log-colors"},
"Enable colored logging",
[](common_params &) {
common_log_set_colors(common_log_main(), true);
}
).set_env("LLAMA_LOG_COLORS"));
add_opt(common_arg({ "--log-colors" }, "[on|off|auto]",
"Set colored logging ('on', 'off', or 'auto', default: 'auto')\n"
"'auto' enables colors when output is to a terminal",
[](common_params &, const std::string & value) {
if (is_truthy(value)) {
common_log_set_colors(common_log_main(), LOG_COLORS_ENABLED);
} else if (is_falsey(value)) {
common_log_set_colors(common_log_main(), LOG_COLORS_DISABLED);
} else if (is_autoy(value)) {
common_log_set_colors(common_log_main(), LOG_COLORS_AUTO);
} else {
throw std::invalid_argument(
string_format("error: unkown value for --log-colors: '%s'\n", value.c_str()));
}
}).set_env("LLAMA_LOG_COLORS"));
add_opt(common_arg(
{"-v", "--verbose", "--log-verbose"},
"Set verbosity level to infinity (i.e. log all messages, useful for debugging)",

View File

@@ -163,6 +163,19 @@ common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::strin
throw std::runtime_error("Invalid tool_choice: " + tool_choice);
}
bool common_chat_templates_support_enable_thinking(const common_chat_templates * chat_templates) {
common_chat_templates_inputs dummy_inputs;
common_chat_msg msg;
msg.role = "user";
msg.content = "test";
dummy_inputs.messages = {msg};
dummy_inputs.enable_thinking = false;
const auto rendered_no_thinking = common_chat_templates_apply(chat_templates, dummy_inputs);
dummy_inputs.enable_thinking = true;
const auto rendered_with_thinking = common_chat_templates_apply(chat_templates, dummy_inputs);
return rendered_no_thinking.prompt != rendered_with_thinking.prompt;
}
template <>
std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const json & messages) {
std::vector<common_chat_msg> msgs;
@@ -618,11 +631,13 @@ const char * common_chat_format_name(common_chat_format format) {
case COMMON_CHAT_FORMAT_FIREFUNCTION_V2: return "FireFunction v2";
case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2: return "Functionary v3.2";
case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1: return "Functionary v3.1 Llama 3.1";
case COMMON_CHAT_FORMAT_DEEPSEEK_V3_1: return "DeepSeek V3.1";
case COMMON_CHAT_FORMAT_HERMES_2_PRO: return "Hermes 2 Pro";
case COMMON_CHAT_FORMAT_COMMAND_R7B: return "Command R7B";
case COMMON_CHAT_FORMAT_GRANITE: return "Granite";
case COMMON_CHAT_FORMAT_GPT_OSS: return "GPT-OSS";
case COMMON_CHAT_FORMAT_SEED_OSS: return "Seed-OSS";
case COMMON_CHAT_FORMAT_NEMOTRON_V2: return "Nemotron V2";
default:
throw std::runtime_error("Unknown chat format");
}
@@ -684,11 +699,13 @@ static void parse_json_tool_calls(
size_t from = std::string::npos;
auto first = true;
while (true) {
auto start_pos = builder.pos();
auto res = function_regex_start_only && first
? builder.try_consume_regex(*function_regex_start_only)
: function_regex
? builder.try_find_regex(*function_regex, from)
: std::nullopt;
if (res) {
std::string name;
if (get_function_name) {
@@ -723,6 +740,8 @@ static void parse_json_tool_calls(
return;
}
throw common_chat_msg_partial_exception("incomplete tool call");
} else {
builder.move_to(start_pos);
}
break;
}
@@ -1184,6 +1203,67 @@ static common_chat_params common_chat_params_init_llama_3_x(const common_chat_te
});
return data;
}
static common_chat_params common_chat_params_init_nemotron_v2(const common_chat_template & tmpl, const struct templates_params & inputs) {
common_chat_params data;
// Generate the prompt using the apply() function with the template
data.prompt = apply(tmpl, inputs);
data.format = COMMON_CHAT_FORMAT_NEMOTRON_V2;
// Handle thinking tags appropriately based on inputs.enable_thinking
if (string_ends_with(data.prompt, "<think>\n")) {
if (!inputs.enable_thinking) {
data.prompt += "</think>";
} else {
data.thinking_forced_open = true;
}
}
// When tools are present, build grammar for the <TOOLCALL> format, similar to CommandR, but without tool call ID
if (!inputs.tools.is_null() && inputs.tools.is_array() && !inputs.tools.empty()) {
data.grammar_lazy = true;
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
auto schemas = json::array();
foreach_function(inputs.tools, [&](const json & tool) {
const auto & function = tool.at("function");
schemas.push_back({
{ "type", "object" },
{ "properties",
{
{ "name",
{
{ "type", "string" },
{ "const", function.at("name") },
} },
{ "arguments", function.at("parameters") },
} },
{ "required", json::array({ "name", "arguments" }) },
});
});
auto schema = json{
{ "type", "array" },
{ "items", schemas.size() == 1 ? schemas[0] : json{ { "anyOf", schemas } } },
{ "minItems", 1 },
};
if (!inputs.parallel_tool_calls) {
schema["maxItems"] = 1;
}
builder.add_rule("root",
std::string(data.thinking_forced_open ? "( \"</think>\" space )? " : "") +
"\"<TOOLCALL>\" " + builder.add_schema("tool_calls", schema) +
" \"</TOOLCALL>\"");
});
data.grammar_triggers.push_back({ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
// If thinking_forced_open, then we capture the </think> tag in the grammar,
// (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar)
std::string(data.thinking_forced_open ?
"[\\s\\S]*?(</think>\\s*)" :
"(?:<think>[\\s\\S]*?</think>\\s*)?") +
"(<TOOLCALL>)[\\s\\S]*" });
}
return data;
}
static void common_chat_parse_llama_3_1(common_chat_msg_parser & builder, bool with_builtin_tools = false) {
if (!builder.syntax().parse_tool_calls) {
builder.add_content(builder.consume_rest());
@@ -1313,6 +1393,71 @@ static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_
}
return data;
}
static common_chat_params common_chat_params_init_deepseek_v3_1(const common_chat_template & tmpl, const struct templates_params & inputs) {
common_chat_params data;
// Pass thinking context for DeepSeek V3.1 template
json additional_context = {
{"thinking", inputs.enable_thinking},
};
auto prompt = apply(tmpl, inputs,
/* messages_override= */ inputs.messages,
/* tools_override= */ std::nullopt,
additional_context);
data.prompt = prompt;
data.format = COMMON_CHAT_FORMAT_DEEPSEEK_V3_1;
if (string_ends_with(data.prompt, "<think>")) {
if (!inputs.enable_thinking) {
data.prompt += "</think>";
} else {
data.thinking_forced_open = true;
}
}
if (inputs.tools.is_array() && !inputs.tools.empty()) {
data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED && inputs.json_schema.is_null();
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
std::vector<std::string> tool_rules;
foreach_function(inputs.tools, [&](const json & tool) {
const auto & function = tool.at("function");
std::string name = function.at("name");
auto parameters = function.at("parameters");
builder.resolve_refs(parameters);
tool_rules.push_back(builder.add_rule(name + "-call",
"( \"<tool▁call▁begin>\" )? \"" + name + "<tool▁sep>"
"\" " + builder.add_schema(name + "-args", parameters) + " "
"\"<tool▁call▁end>\""));
});
// Distill Qwen 7B & 32B models seem confused re/ syntax of their tool call opening tag,
// so we accept common variants (then it's all constrained)
builder.add_rule("root",
std::string(data.thinking_forced_open ? "( \"</think>\" space )? " : "") +
"( \"<tool▁calls▁begin>\" | \"<tool_calls_begin>\" | \"<tool calls begin>\" | \"<tool\\\\_calls\\\\_begin>\" | \"<tool▁calls>\" ) "
"(" + string_join(tool_rules, " | ") + ")" + (inputs.parallel_tool_calls ? "*" : "") + " "
"\"<tool▁calls▁end>\""
" space");
data.grammar_triggers.push_back({
COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
// If thinking_forced_open, then we capture the </think> tag in the grammar,
// (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar)
std::string(data.thinking_forced_open ? "[\\s\\S]*?(</think>\\s*)" : "(?:<think>[\\s\\S]*?</think>\\s*)?") +
"(<tool▁calls▁begin>|<tool_calls_begin>|<tool calls begin>|<tool\\\\_calls\\\\_begin>|<tool▁calls>)[\\s\\S]*"
});
data.preserved_tokens = {
"<think>",
"</think>",
"<tool▁calls▁begin>",
"<tool▁call▁begin>",
"<tool▁sep>",
"<tool▁call▁end>",
"<tool▁calls▁end>",
};
});
}
return data;
}
static void common_chat_parse_deepseek_r1(common_chat_msg_parser & builder) {
builder.try_parse_reasoning("<think>", "</think>");
if (!builder.syntax().parse_tool_calls) {
@@ -1334,6 +1479,66 @@ static void common_chat_parse_deepseek_r1(common_chat_msg_parser & builder) {
tool_calls_end);
}
static void common_chat_parse_deepseek_v3_1_content(common_chat_msg_parser & builder) {
static const common_regex function_regex("(?:<tool▁call▁begin>)?([^\\n<]+)(?:<tool▁sep>)");
static const common_regex close_regex("(?:[\\s]*)?<tool▁call▁end>");
static const common_regex tool_calls_begin("(?:<tool▁calls▁begin>|<tool_calls_begin>|<tool calls begin>|<tool\\\\_calls\\\\_begin>|<tool▁calls>)");
static const common_regex tool_calls_end("<tool▁calls▁end>");
if (!builder.syntax().parse_tool_calls) {
LOG_DBG("%s: not parse_tool_calls\n", __func__);
builder.add_content(builder.consume_rest());
return;
}
LOG_DBG("%s: parse_tool_calls\n", __func__);
parse_json_tool_calls(
builder,
/* block_open= */ tool_calls_begin,
/* function_regex_start_only= */ std::nullopt,
function_regex,
close_regex,
tool_calls_end);
}
static void common_chat_parse_deepseek_v3_1(common_chat_msg_parser & builder) {
// DeepSeek V3.1 outputs reasoning content between "<think>" and "</think>" tags, followed by regular content
// First try to parse using the standard reasoning parsing method
LOG_DBG("%s: thinking_forced_open: %s\n", __func__, std::to_string(builder.syntax().thinking_forced_open).c_str());
auto start_pos = builder.pos();
auto found_end_think = builder.try_find_literal("</think>");
builder.move_to(start_pos);
if (builder.syntax().thinking_forced_open && !builder.is_partial() && !found_end_think) {
LOG_DBG("%s: no end_think, not partial, adding content\n", __func__);
common_chat_parse_deepseek_v3_1_content(builder);
} else if (builder.try_parse_reasoning("<think>", "</think>")) {
// If reasoning was parsed successfully, the remaining content is regular content
LOG_DBG("%s: parsed reasoning, adding content\n", __func__);
// </think><tool▁calls▁begin><tool▁call▁begin>function<tool▁sep>NAME\n```json\nJSON\n```<tool▁call▁end><tool▁calls▁end>
common_chat_parse_deepseek_v3_1_content(builder);
} else {
if (builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_NONE) {
LOG_DBG("%s: reasoning_format none, adding content\n", __func__);
common_chat_parse_deepseek_v3_1_content(builder);
return;
}
// If no reasoning tags found, check if we should treat everything as reasoning
if (builder.syntax().thinking_forced_open) {
// If thinking is forced open but no tags found, treat everything as reasoning
LOG_DBG("%s: thinking_forced_open, adding reasoning content\n", __func__);
builder.add_reasoning_content(builder.consume_rest());
} else {
LOG_DBG("%s: no thinking_forced_open, adding content\n", __func__);
// <tool▁call▁begin>NAME<tool▁sep>JSON<tool▁call▁end>
common_chat_parse_deepseek_v3_1_content(builder);
}
}
}
static common_chat_params common_chat_params_init_gpt_oss(const common_chat_template & tmpl, const struct templates_params & inputs) {
common_chat_params data;
auto prompt = apply(tmpl, inputs);
@@ -1830,7 +2035,7 @@ static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat
// If thinking_forced_open, then we capture the </think> tag in the grammar,
// (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar)
std::string(data.thinking_forced_open ? "[\\s\\S]*?(</think>\\s*)" : "(?:<think>[\\s\\S]*?</think>\\s*)?") + (
"(\\s*"
"\\s*("
"(?:<tool_call>"
"|<function"
"|(?:```(?:json|xml)?\n\\s*)?(?:<function_call>|<tools>|<xml><json>|<response>)?"
@@ -2060,6 +2265,33 @@ static void common_chat_parse_granite(common_chat_msg_parser & builder) {
}
}
static void common_chat_parse_nemotron_v2(common_chat_msg_parser & builder) {
// Parse thinking tags
builder.try_parse_reasoning("<think>", "</think>");
if (!builder.syntax().parse_tool_calls) {
builder.add_content(builder.consume_rest());
return;
}
// Look for tool calls
static const common_regex tool_call_regex(regex_escape("<TOOLCALL>"));
if (auto res = builder.try_find_regex(tool_call_regex)) {
builder.move_to(res->groups[0].end);
// Expect JSON array of tool calls
auto tool_calls_data = builder.consume_json();
if (tool_calls_data.json.is_array()) {
if (!builder.try_consume_literal("</TOOLCALL>")) {
throw common_chat_msg_partial_exception("Incomplete tool call");
}
builder.add_tool_calls(tool_calls_data.json);
} else {
throw common_chat_msg_partial_exception("Incomplete tool call");
}
}
builder.add_content(builder.consume_rest());
}
static void common_chat_parse_seed_oss(common_chat_msg_parser & builder) {
// Parse thinking tags first - this handles the main reasoning content
builder.try_parse_reasoning("<seed:think>", "</seed:think>");
@@ -2263,6 +2495,12 @@ static common_chat_params common_chat_templates_apply_jinja(
}
}
// DeepSeek V3.1: detect based on specific patterns in the template
if (src.find("message['prefix'] is defined and message['prefix'] and thinking") != std::string::npos &&
params.json_schema.is_null()) {
return common_chat_params_init_deepseek_v3_1(tmpl, params);
}
// DeepSeek R1: use handler in all cases except json schema (thinking / tools).
if (src.find("<tool▁calls▁begin>") != std::string::npos && params.json_schema.is_null()) {
return common_chat_params_init_deepseek_r1(tmpl, params);
@@ -2293,6 +2531,11 @@ static common_chat_params common_chat_templates_apply_jinja(
return common_chat_params_init_seed_oss(tmpl, params, inputs);
}
// Nemotron v2
if (src.find("<SPECIAL_10>") != std::string::npos) {
return common_chat_params_init_nemotron_v2(tmpl, params);
}
// Use generic handler when mixing tools + JSON schema.
// TODO: support that mix in handlers below.
if ((params.tools.is_array() && params.json_schema.is_object())) {
@@ -2430,6 +2673,9 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
case COMMON_CHAT_FORMAT_DEEPSEEK_R1:
common_chat_parse_deepseek_r1(builder);
break;
case COMMON_CHAT_FORMAT_DEEPSEEK_V3_1:
common_chat_parse_deepseek_v3_1(builder);
break;
case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2:
common_chat_parse_functionary_v3_2(builder);
break;
@@ -2454,6 +2700,9 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
case COMMON_CHAT_FORMAT_SEED_OSS:
common_chat_parse_seed_oss(builder);
break;
case COMMON_CHAT_FORMAT_NEMOTRON_V2:
common_chat_parse_nemotron_v2(builder);
break;
default:
throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(builder.syntax().format));
}

View File

@@ -107,11 +107,13 @@ enum common_chat_format {
COMMON_CHAT_FORMAT_FIREFUNCTION_V2,
COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2,
COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
COMMON_CHAT_FORMAT_DEEPSEEK_V3_1,
COMMON_CHAT_FORMAT_HERMES_2_PRO,
COMMON_CHAT_FORMAT_COMMAND_R7B,
COMMON_CHAT_FORMAT_GRANITE,
COMMON_CHAT_FORMAT_GPT_OSS,
COMMON_CHAT_FORMAT_SEED_OSS,
COMMON_CHAT_FORMAT_NEMOTRON_V2,
COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
};
@@ -198,6 +200,8 @@ common_chat_msg common_chat_parse(const std::string & input, bool is_p
common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);
bool common_chat_templates_support_enable_thinking(const common_chat_templates * chat_templates);
// Parses a JSON array of messages in OpenAI's chat completion API format.
// T can be std::string containing JSON or nlohmann::ordered_json
template <class T> std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const T & messages);

View File

@@ -193,10 +193,11 @@ struct common_params_sampling {
};
struct common_params_model {
std::string path = ""; // model local path // NOLINT
std::string url = ""; // model url to download // NOLINT
std::string hf_repo = ""; // HF repo // NOLINT
std::string hf_file = ""; // HF file // NOLINT
std::string path = ""; // model local path // NOLINT
std::string url = ""; // model url to download // NOLINT
std::string hf_repo = ""; // HF repo // NOLINT
std::string hf_file = ""; // HF file // NOLINT
std::string docker_repo = ""; // Docker repo // NOLINT
};
struct common_params_speculative {
@@ -287,9 +288,9 @@ struct common_params {
float rope_freq_base = 0.0f; // RoPE base frequency
float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor
float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor
float yarn_beta_fast = 32.0f; // YaRN low correction dim
float yarn_beta_slow = 1.0f; // YaRN high correction dim
float yarn_attn_factor = -1.0f; // YaRN magnitude scaling factor
float yarn_beta_fast = -1.0f; // YaRN low correction dim
float yarn_beta_slow = -1.0f; // YaRN high correction dim
int32_t yarn_orig_ctx = 0; // YaRN original context length
// offload params
@@ -452,7 +453,7 @@ struct common_params {
std::string slot_save_path;
float slot_prompt_similarity = 0.5f;
float slot_prompt_similarity = 0.1f;
// batched-bench params
bool is_pp_shared = false;
@@ -733,6 +734,20 @@ const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
}
//
// MoE utils
//
const char * const LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate)_exps";
static std::string llm_ffn_exps_block_regex(int idx) {
return string_format("blk\\.%d%s", idx, LLM_FFN_EXPS_REGEX);
}
static llama_model_tensor_buft_override llm_ffn_exps_cpu_override() {
return { LLM_FFN_EXPS_REGEX, ggml_backend_cpu_buffer_type() };
}
//
// training utils
//

View File

@@ -257,12 +257,13 @@ std::unordered_map<std::string, BuiltinRule> STRING_FORMAT_RULES = {
};
static bool is_reserved_name(const std::string & name) {
static std::unordered_set<std::string> RESERVED_NAMES;
if (RESERVED_NAMES.empty()) {
RESERVED_NAMES.insert("root");
for (const auto &p : PRIMITIVE_RULES) RESERVED_NAMES.insert(p.first);
for (const auto &p : STRING_FORMAT_RULES) RESERVED_NAMES.insert(p.first);
}
static const std::unordered_set<std::string> RESERVED_NAMES = [] {
std::unordered_set<std::string> s;
s.insert("root");
for (const auto & p : PRIMITIVE_RULES) s.insert(p.first);
for (const auto & p : STRING_FORMAT_RULES) s.insert(p.first);
return s;
}();
return RESERVED_NAMES.find(name) != RESERVED_NAMES.end();
}
@@ -843,9 +844,10 @@ public:
_build_object_rule(
properties, required, name,
schema.contains("additionalProperties") ? schema["additionalProperties"] : json()));
} else if ((schema_type.is_null() || schema_type == "object") && schema.contains("allOf")) {
} else if ((schema_type.is_null() || schema_type == "object" || schema_type == "string") && schema.contains("allOf")) {
std::unordered_set<std::string> required;
std::vector<std::pair<std::string, json>> properties;
std::map<std::string, size_t> enum_values;
std::string hybrid_name = name;
std::function<void(const json &, bool)> add_component = [&](const json & comp_schema, bool is_required) {
if (comp_schema.contains("$ref")) {
@@ -857,6 +859,14 @@ public:
required.insert(prop.key());
}
}
} else if (comp_schema.contains("enum")) {
for (const auto & v : comp_schema["enum"]) {
const auto rule = _generate_constant_rule(v);
if (enum_values.find(rule) == enum_values.end()) {
enum_values[rule] = 0;
}
enum_values[rule] += 1;
}
} else {
// todo warning
}
@@ -870,6 +880,17 @@ public:
add_component(t, true);
}
}
if (!enum_values.empty()) {
std::vector<std::string> enum_intersection;
for (const auto & p : enum_values) {
if (p.second == schema["allOf"].size()) {
enum_intersection.push_back(p.first);
}
}
if (!enum_intersection.empty()) {
return _add_rule(rule_name, "(" + string_join(enum_intersection, " | ") + ") space");
}
}
return _add_rule(rule_name, _build_object_rule(properties, required, hybrid_name, json()));
} else if ((schema_type.is_null() || schema_type == "array") && (schema.contains("items") || schema.contains("prefixItems"))) {
json items = schema.contains("items") ? schema["items"] : schema["prefixItems"];

View File

@@ -4,17 +4,52 @@
#include <condition_variable>
#include <cstdarg>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <mutex>
#include <sstream>
#include <thread>
#include <vector>
#if defined(_WIN32)
# include <io.h>
# include <windows.h>
# define isatty _isatty
# define fileno _fileno
#else
# include <unistd.h>
#endif // defined(_WIN32)
int common_log_verbosity_thold = LOG_DEFAULT_LLAMA;
void common_log_set_verbosity_thold(int verbosity) {
common_log_verbosity_thold = verbosity;
}
// Auto-detect if colors should be enabled based on terminal and environment
static bool common_log_should_use_colors_auto() {
// Check NO_COLOR environment variable (https://no-color.org/)
if (const char * no_color = std::getenv("NO_COLOR")) {
if (no_color[0] != '\0') {
return false;
}
}
// Check TERM environment variable
if (const char * term = std::getenv("TERM")) {
if (std::strcmp(term, "dumb") == 0) {
return false;
}
}
// Check if stdout and stderr are connected to a terminal
// We check both because log messages can go to either
bool stdout_is_tty = isatty(fileno(stdout));
bool stderr_is_tty = isatty(fileno(stderr));
return stdout_is_tty || stderr_is_tty;
}
static int64_t t_us() {
return std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
}
@@ -353,6 +388,11 @@ struct common_log * common_log_init() {
struct common_log * common_log_main() {
static struct common_log log;
static std::once_flag init_flag;
std::call_once(init_flag, [&]() {
// Set default to auto-detect colors
log.set_colors(common_log_should_use_colors_auto());
});
return &log;
}
@@ -380,8 +420,19 @@ void common_log_set_file(struct common_log * log, const char * file) {
log->set_file(file);
}
void common_log_set_colors(struct common_log * log, bool colors) {
log->set_colors(colors);
void common_log_set_colors(struct common_log * log, log_colors colors) {
if (colors == LOG_COLORS_AUTO) {
log->set_colors(common_log_should_use_colors_auto());
return;
}
if (colors == LOG_COLORS_DISABLED) {
log->set_colors(false);
return;
}
GGML_ASSERT(colors == LOG_COLORS_ENABLED);
log->set_colors(true);
}
void common_log_set_prefix(struct common_log * log, bool prefix) {

View File

@@ -24,6 +24,12 @@
#define LOG_DEFAULT_DEBUG 1
#define LOG_DEFAULT_LLAMA 0
enum log_colors {
LOG_COLORS_AUTO = -1,
LOG_COLORS_DISABLED = 0,
LOG_COLORS_ENABLED = 1,
};
// needed by the LOG_TMPL macro to avoid computing log arguments if the verbosity lower
// set via common_log_set_verbosity()
extern int common_log_verbosity_thold;
@@ -65,10 +71,10 @@ void common_log_add(struct common_log * log, enum ggml_log_level level, const ch
// D - debug (stderr, V = LOG_DEFAULT_DEBUG)
//
void common_log_set_file (struct common_log * log, const char * file); // not thread-safe
void common_log_set_colors (struct common_log * log, bool colors); // not thread-safe
void common_log_set_prefix (struct common_log * log, bool prefix); // whether to output prefix to each log
void common_log_set_timestamps(struct common_log * log, bool timestamps); // whether to output timestamps in the prefix
void common_log_set_file (struct common_log * log, const char * file); // not thread-safe
void common_log_set_colors (struct common_log * log, log_colors colors); // not thread-safe
void common_log_set_prefix (struct common_log * log, bool prefix); // whether to output prefix to each log
void common_log_set_timestamps(struct common_log * log, bool timestamps); // whether to output timestamps in the prefix
// helper macros for logging
// use these to avoid computing log arguments if the verbosity of the log is higher than the threshold

View File

@@ -735,6 +735,9 @@ class TextModel(ModelBase):
if chkhsh == "d4540891389ea895b53b399da6ac824becc30f2fba0e9ddbb98f92e55ca0e97c":
# ref: https://huggingface.co/Qwen/Qwen3-Embedding-0.6B
res = "qwen2"
if chkhsh == "66b8d4e19ab16c3bfd89bce5d785fb7e0155e8648708a1f42077cb9fe002c273":
# ref: https://huggingface.co/alvarobartt/grok-2-tokenizer
res = "grok-2"
if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
# ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
res = "llama-bpe"
@@ -885,6 +888,9 @@ class TextModel(ModelBase):
if chkhsh == "a1e163ecab2e718a4c829d1148b6e86824ec36163bb71941c3dca9cd5ac25756":
# ref: https://huggingface.co/JetBrains/Mellum-4b-base
res = "mellum"
if chkhsh == "9b1be57e70d20d9501b2b3186e792d81181ae36ada3903c26f9fea418cf87206":
# ref: https://huggingface.co/inclusionAI/LLaDA-MoE-7B-A1B-Base
res = "llada-moe"
if res is None:
logger.warning("\n")
@@ -2682,12 +2688,20 @@ class BitnetModel(TextModel):
yield (new_name, data_torch)
@ModelBase.register("GrokForCausalLM")
@ModelBase.register("GrokForCausalLM", "Grok1ForCausalLM")
class GrokModel(TextModel):
model_arch = gguf.MODEL_ARCH.GROK
def set_vocab(self):
self._set_vocab_sentencepiece()
if (self.dir_model / 'tokenizer.model').is_file():
self._set_vocab_sentencepiece()
return
if not (self.dir_model / 'tokenizer.json').is_file() or not (self.dir_model / 'chat_template.jinja').is_file():
logger.error('Error: Missing vocab and chat template, download files from https://huggingface.co/alvarobartt/grok-2-tokenizer')
sys.exit(1)
self._set_vocab_gpt2()
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
@@ -2695,11 +2709,46 @@ class GrokModel(TextModel):
def set_gguf_parameters(self):
super().set_gguf_parameters()
_experts: list[dict[str, Tensor]] | None = None
self.gguf_writer.add_attn_logit_softcapping(self.hparams.get("attn_logit_softcapping", 30.0))
self.gguf_writer.add_router_logit_softcapping(self.hparams.get("router_logit_softcapping", 30.0))
if (final_logit_softcap := self.hparams.get("final_logit_softcapping")):
self.gguf_writer.add_final_logit_softcapping(final_logit_softcap)
if (rope_dim := self.hparams.get("head_dim")) is None:
rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
# Treat "original" as "yarn", seems to have been a mistake
if self.hparams.get("rope_type") in ("yarn", "original"):
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
self.gguf_writer.add_rope_scaling_factor(self.hparams["scaling_factor"])
self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["original_max_position_embeddings"])
self.gguf_writer.add_rope_scaling_yarn_ext_factor(self.hparams["extrapolation_factor"])
self.gguf_writer.add_rope_scaling_yarn_attn_factor(self.hparams["attn_factor"])
self.gguf_writer.add_rope_scaling_yarn_beta_fast(self.hparams["beta_fast"])
self.gguf_writer.add_rope_scaling_yarn_beta_slow(self.hparams["beta_slow"])
if temp_len := self.hparams.get("attn_temperature_len"):
self.gguf_writer.add_attn_temperature_length(temp_len)
self.gguf_writer.add_attn_output_scale(self.hparams.get("attn_output_multiplier", rope_dim**-0.5))
self.gguf_writer.add_embedding_scale(self.hparams["embedding_multiplier_scale"])
self.gguf_writer.add_logit_scale(self.hparams["output_multiplier_scale"])
_experts: list[dict[str, list[Tensor]]] | None = None
_cur_expert = ""
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
tensors: list[tuple[str, Tensor]] = []
is_expert = ".moe." in name or ".block_sparse_moe.experts." in name
if not is_expert:
tensors.append((self.map_tensor_name(name), data_torch))
# process the experts separately
if name.find(".moe.") != -1:
if is_expert or self._cur_expert:
n_experts = self.hparams["num_local_experts"]
assert bid is not None
@@ -2707,32 +2756,41 @@ class GrokModel(TextModel):
if self._experts is None:
self._experts = [{} for _ in range(self.block_count)]
self._experts[bid][name] = data_torch
if len(self._experts[bid]) >= n_experts * 3:
tensors: list[tuple[str, Tensor]] = []
# merge the experts into a single 3d tensor
for wid in ["linear", "linear_1", "linear_v"]:
datas: list[Tensor] = []
for xid in range(n_experts):
ename = f"transformer.decoder_layer.{bid}.moe.{xid}.{wid}.weight"
datas.append(self._experts[bid][ename])
del self._experts[bid][ename]
data_torch = torch.stack(datas, dim=0)
merged_name = f"transformer.decoder_layer.{bid}.moe.{wid}.weight"
new_name = self.map_tensor_name(merged_name)
tensors.append((new_name, data_torch))
return tensors
else:
# concatenate split tensors
if name in self._experts[bid]:
self._cur_expert = name
self._experts[bid][name].append(data_torch)
return []
elif is_expert:
self._cur_expert = name
self._experts[bid][name] = [data_torch]
return []
else:
self._cur_expert = ""
return [(self.map_tensor_name(name), data_torch)]
for bid in range(self.block_count):
if len(self._experts[bid]) >= n_experts * 3:
# merge the experts into a single 3d tensor
for wid in [("linear", "w1", 0), ("linear_1", "w2", 1), ("linear_v", "w3", 0)]:
datas: list[Tensor] = []
for xid in range(n_experts):
ename = f"transformer.decoder_layer.{bid}.moe.{xid}.{wid[0]}.weight"
if ename not in self._experts[bid]:
ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid[1]}.weight"
tensor_list = self._experts[bid][ename]
datas.append(torch.cat(tensor_list, dim=wid[2]) if len(tensor_list) > 1 else tensor_list[0])
del self._experts[bid][ename]
data_torch = torch.stack(datas, dim=0)
merged_name = f"transformer.decoder_layer.{bid}.moe.{wid[0]}.weight"
new_name = self.map_tensor_name(merged_name)
yield (new_name, data_torch)
yield from tensors
@ModelBase.register("DbrxForCausalLM")
@@ -5122,6 +5180,29 @@ class Gemma3Model(TextModel):
return [(self.map_tensor_name(name), data_torch)]
@ModelBase.register("Gemma3TextModel")
class EmbeddingGemma(Gemma3Model):
model_arch = gguf.MODEL_ARCH.GEMMA_EMBEDDING
def set_gguf_parameters(self):
super().set_gguf_parameters()
# Override the sliding window size as it gets adjusted by the Gemma3TextConfig
# constructor. We want to use the value from the original model's config.json.
# ref: https://github.com/huggingface/transformers/pull/40700
with open(self.dir_model / "config.json", "r", encoding="utf-8") as f:
config = json.load(f)
orig_sliding_window = config.get("sliding_window")
if orig_sliding_window is None:
raise ValueError("sliding_window not found in model config - this is required for the model")
logger.info(f"Using original sliding_window from config: {orig_sliding_window} "
f"instead of {self.hparams['sliding_window']}")
self.gguf_writer.add_sliding_window(orig_sliding_window)
self._try_set_pooling_type()
@ModelBase.register("Gemma3ForConditionalGeneration")
class Gemma3VisionModel(MmprojModel):
def set_gguf_parameters(self):
@@ -5928,9 +6009,34 @@ class SeedOssModel(TextModel):
@ModelBase.register("Olmo2ForCausalLM")
@ModelBase.register("Olmo3ForCausalLM")
class Olmo2Model(TextModel):
model_arch = gguf.MODEL_ARCH.OLMO2
def set_gguf_parameters(self):
super().set_gguf_parameters()
rope_scaling = self.hparams.get("rope_scaling") or {}
if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling:
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"])
self.gguf_writer.add_rope_scaling_attn_factors(rope_scaling["attention_factor"])
self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"])
if "sliding_window" in self.hparams:
self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
sliding_window_pattern = []
if "layer_types" in self.hparams:
sliding_window_pattern = [t == "sliding_attention" for t in self.hparams["layer_types"]]
else:
# Olmo2 does not use sliding window attention.
# Olmo3 defaults to using sliding window for all layers except every 4th.
for i in range(self.hparams["num_hidden_layers"]):
sliding_window_pattern.append((i + 1) % 4 != 0)
self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern)
@ModelBase.register("OlmoeForCausalLM")
class OlmoeModel(TextModel):
@@ -6678,6 +6784,8 @@ class T5Model(TextModel):
self.gguf_writer.add_embedding_length(self.hparams["d_model"])
self.gguf_writer.add_feed_forward_length(self.hparams["d_ff"])
self.gguf_writer.add_block_count(self.hparams["num_layers"])
if (dec_n_layer := self.hparams.get("num_decoder_layers")) is not None:
self.gguf_writer.add_decoder_block_count(dec_n_layer)
self.gguf_writer.add_head_count(self.hparams["num_heads"])
self.gguf_writer.add_key_length(self.hparams["d_kv"])
self.gguf_writer.add_value_length(self.hparams["d_kv"])
@@ -8159,6 +8267,76 @@ class HunYuanMoEModel(TextModel):
raise ValueError(f"Unprocessed experts: {experts}")
@ModelBase.register("LLaDAMoEModel", "LLaDAMoEModelLM")
class LLaDAMoEModel(TextModel):
model_arch = gguf.MODEL_ARCH.LLADA_MOE
def set_gguf_parameters(self):
super().set_gguf_parameters()
if (n_experts := self.hparams.get("num_experts")) is not None:
self.gguf_writer.add_expert_count(n_experts)
if (expert_intermediate_size := self.hparams.get("expert_intermediate_size")) is not None:
self.gguf_writer.add_expert_feed_forward_length(expert_intermediate_size)
# number of experts used per token (top-k)
if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
self.gguf_writer.add_expert_used_count(n_experts_used)
self.gguf_writer.add_mask_token_id(156895)
self.gguf_writer.add_causal_attention(False)
self.gguf_writer.add_diffusion_shift_logits(False)
_experts: list[dict[str, Tensor]] | None = None
# Copied from: Qwen2MoeModel
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
# process the experts separately
if name.find("experts") != -1:
n_experts = self.hparams["num_experts"]
assert bid is not None
if self._experts is None:
self._experts = [{} for _ in range(self.block_count)]
self._experts[bid][name] = data_torch
if len(self._experts[bid]) >= n_experts * 3:
tensors: list[tuple[str, Tensor]] = []
# merge the experts into a single 3d tensor
for w_name in ["down_proj", "gate_proj", "up_proj"]:
datas: list[Tensor] = []
for xid in range(n_experts):
ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
datas.append(self._experts[bid][ename])
del self._experts[bid][ename]
data_torch = torch.stack(datas, dim=0)
merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
new_name = self.map_tensor_name(merged_name)
tensors.append((new_name, data_torch))
return tensors
else:
return []
return [(self.map_tensor_name(name), data_torch)]
# Copied from: Qwen2MoeModel
def prepare_tensors(self):
super().prepare_tensors()
if self._experts is not None:
# flatten `list[dict[str, Tensor]]` into `list[str]`
experts = [k for d in self._experts for k in d.keys()]
if len(experts) > 0:
raise ValueError(f"Unprocessed experts: {experts}")
@ModelBase.register("HunYuanDenseV1ForCausalLM")
class HunYuanModel(TextModel):
model_arch = gguf.MODEL_ARCH.HUNYUAN_DENSE

View File

@@ -139,6 +139,7 @@ models = [
{"name": "lfm2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LiquidAI/LFM2-Tokenizer"},
{"name": "exaone4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B", },
{"name": "mellum", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/JetBrains/Mellum-4b-base", },
{"name": "llada-moe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inclusionAI/LLaDA-MoE-7B-A1B-Base", },
]
# some models are known to be broken upstream, so we will skip them as exceptions
@@ -158,6 +159,7 @@ pre_computed_hashes = [
{"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-34B-Base", "chkhsh": "48f8e02c0359c0bbdd82f26909171fac1c18a457bb47573ed1fe3bbb2c1cfd4b"},
{"name": "kimi-k2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/moonshotai/Kimi-K2-Base", "chkhsh": "81212dc7cdb7e0c1074ca62c5aeab0d43c9f52b8a737be7b12a777c953027890"},
{"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen3-Embedding-0.6B", "chkhsh": "d4540891389ea895b53b399da6ac824becc30f2fba0e9ddbb98f92e55ca0e97c"},
{"name": "grok-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/alvarobartt/grok-2-tokenizer", "chkhsh": "66b8d4e19ab16c3bfd89bce5d785fb7e0155e8648708a1f42077cb9fe002c273"},
]

View File

@@ -12,7 +12,7 @@ import json
from math import prod
from pathlib import Path
from typing import TYPE_CHECKING, Any, Callable, Iterable, Iterator, Sequence, SupportsIndex, cast
from transformers import AutoConfig
from transformers import AutoConfig, AutoTokenizer
import torch
@@ -26,6 +26,8 @@ import gguf
# reuse model definitions from convert_hf_to_gguf.py
from convert_hf_to_gguf import LazyTorchTensor, ModelBase
from gguf.constants import GGUFValueType
logger = logging.getLogger("lora-to-gguf")
@@ -369,7 +371,31 @@ if __name__ == '__main__':
self.gguf_writer.add_string(gguf.Keys.Adapter.TYPE, "lora")
def set_gguf_parameters(self):
logger.debug("GGUF KV: %s = %d", gguf.Keys.Adapter.LORA_ALPHA, self.lora_alpha)
self.gguf_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, self.lora_alpha)
alora_invocation_tokens = lparams.get("alora_invocation_tokens")
invocation_string = lparams.get("invocation_string")
if invocation_string and not alora_invocation_tokens:
logger.debug("Tokenizing invocation_string -> alora_invocation_tokens")
base_model_path_or_id = hparams.get("_name_or_path")
try:
tokenizer = AutoTokenizer.from_pretrained(base_model_path_or_id)
except ValueError:
logger.error("Unable to load tokenizer from %s", base_model_path_or_id)
raise
# NOTE: There's an off-by-one with the older aLoRAs where
# the invocation string includes the "<|start_of_turn|>"
# token, but the adapters themselves were trained to
# activate _after_ that first token, so we drop it here.
alora_invocation_tokens = tokenizer(invocation_string)["input_ids"][1:]
if alora_invocation_tokens:
logger.debug("GGUF KV: %s = %s", gguf.Keys.Adapter.ALORA_INVOCATION_TOKENS, alora_invocation_tokens)
self.gguf_writer.add_key_value(
gguf.Keys.Adapter.ALORA_INVOCATION_TOKENS,
alora_invocation_tokens,
GGUFValueType.ARRAY,
GGUFValueType.UINT32,
)
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
# Never add extra tensors (e.g. rope_freqs) for LoRA adapters

View File

@@ -293,17 +293,14 @@ We would like to thank Tuo Dai, Shanni Li, and all of the project maintainers fr
## Environment variable setup
### GGML_CANN_ASYNC_MODE
Enables asynchronous operator submission. Disabled by default.
### GGML_CANN_MEM_POOL
Specifies the memory pool management strategy:
Specifies the memory pool management strategy, Default is vmm.
- vmm: Utilizes a virtual memory manager pool. If hardware support for VMM is unavailable, falls back to the legacy (leg) memory pool.
- prio: Employs a priority queue-based memory pool management.
- leg: Uses a fixed-size buffer pool.
### GGML_CANN_DISABLE_BUF_POOL_CLEAN
@@ -312,5 +309,16 @@ Controls automatic cleanup of the memory pool. This option is only effective whe
### GGML_CANN_WEIGHT_NZ
Converting the matmul weight format from ND to NZ can significantly improve performance on the 310I DUO NPU.
Converting the matmul weight format from ND to NZ to improve performance. Enabled by default.
### GGML_CANN_ACL_GRAPH
Operators are executed using ACL graph execution, rather than in op-by-op (eager) mode. Enabled by default.
### GGML_CANN_GRAPH_CACHE_CAPACITY
Maximum number of compiled CANN graphs kept in the LRU cache, default is 12. When the number of cached graphs exceeds this capacity, the least recently used graph will be evicted.
### GGML_CANN_PREFILL_USE_GRAPH
Enable ACL graph execution during the prefill stage, default is false. This option is only effective when FA is enabled.

View File

@@ -42,18 +42,6 @@ cmake --build build --config Release -j $(nproc)
cmake --build build --config Release -j $(nproc)
```
- By default, NNPA is disabled by default. To enable it:
```bash
cmake -S . -B build \
-DCMAKE_BUILD_TYPE=Release \
-DGGML_BLAS=ON \
-DGGML_BLAS_VENDOR=OpenBLAS \
-DGGML_NNPA=ON
cmake --build build --config Release -j $(nproc)
```
- For debug builds:
```bash
@@ -164,15 +152,11 @@ All models need to be converted to Big-Endian. You can achieve this in three cas
Only available in IBM z15/LinuxONE 3 or later system with the `-DGGML_VXE=ON` (turned on by default) compile flag. No hardware acceleration is possible with llama.cpp with older systems, such as IBM z14/arch12. In such systems, the APIs can still run but will use a scalar implementation.
### 2. NNPA Vector Intrinsics Acceleration
Only available in IBM z16/LinuxONE 4 or later system with the `-DGGML_NNPA=ON` (turned off by default) compile flag. No hardware acceleration is possible with llama.cpp with older systems, such as IBM z15/arch13. In such systems, the APIs can still run but will use a scalar implementation.
### 3. zDNN Accelerator (WIP)
### 2. zDNN Accelerator (WIP)
Only available in IBM z17/LinuxONE 5 or later system with the `-DGGML_ZDNN=ON` compile flag. No hardware acceleration is possible with llama.cpp with older systems, such as IBM z15/arch13. In such systems, the APIs will default back to CPU routines.
### 4. Spyre Accelerator
### 3. Spyre Accelerator
_Only available with IBM z17 / LinuxONE 5 or later system. No support currently available._
@@ -230,10 +214,6 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl
CXXFLAGS="-include cstdint" pip3 install -r requirements.txt
```
5. `-DGGML_NNPA=ON` generates gibberish output
Answer: We are aware of this as detailed in [this issue](https://github.com/ggml-org/llama.cpp/issues/14877). Please either try reducing the number of threads, or disable the compile option using `-DGGML_NNPA=OFF`.
## Getting Help on IBM Z & LinuxONE
1. **Bugs, Feature Requests**
@@ -258,38 +238,38 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl
## Appendix B: SIMD Support Matrix
| | VX/VXE/VXE2 | NNPA | zDNN | Spyre |
| ---------- | ----------- | ---- | ---- | ----- |
| FP32 | ✅ | ✅ | ✅ | ❓ |
| FP16 | ✅ | ✅ | ❓ | ❓ |
| BF16 | 🚫 | 🚫 | ❓ | ❓ |
| Q4_0 | ✅ | ✅ | ❓ | ❓ |
| Q4_1 | ✅ | ✅ | ❓ | ❓ |
| MXFP4 | 🚫 | 🚫 | ❓ | ❓ |
| Q5_0 | ✅ | ✅ | ❓ | ❓ |
| Q5_1 | ✅ | ✅ | ❓ | ❓ |
| Q8_0 | ✅ | ✅ | ❓ | ❓ |
| Q2_K | 🚫 | 🚫 | ❓ | ❓ |
| Q3_K | ✅ | ✅ | ❓ | ❓ |
| Q4_K | ✅ | ✅ | ❓ | ❓ |
| Q5_K | ✅ | ✅ | ❓ | ❓ |
| Q6_K | ✅ | ✅ | ❓ | ❓ |
| TQ1_0 | 🚫 | 🚫 | ❓ | ❓ |
| TQ2_0 | 🚫 | 🚫 | ❓ | ❓ |
| IQ2_XXS | 🚫 | 🚫 | ❓ | ❓ |
| IQ2_XS | 🚫 | 🚫 | ❓ | ❓ |
| IQ2_S | 🚫 | 🚫 | ❓ | ❓ |
| IQ3_XXS | 🚫 | 🚫 | ❓ | ❓ |
| IQ3_S | 🚫 | 🚫 | ❓ | ❓ |
| IQ1_S | 🚫 | 🚫 | ❓ | ❓ |
| IQ1_M | 🚫 | 🚫 | ❓ | ❓ |
| IQ4_NL | ✅ | ✅ | ❓ | ❓ |
| IQ4_XS | ✅ | ✅ | ❓ | ❓ |
| FP32->FP16 | 🚫 | ✅ | ❓ | ❓ |
| FP16->FP32 | 🚫 | ✅ | ❓ | ❓ |
| | VX/VXE/VXE2 | zDNN | Spyre |
|------------|-------------|------|-------|
| FP32 | ✅ | ✅ | ❓ |
| FP16 | ✅ | ✅ | ❓ |
| BF16 | 🚫 | | ❓ |
| Q4_0 | ✅ | ❓ | ❓ |
| Q4_1 | ✅ | ❓ | ❓ |
| MXFP4 | 🚫 | ❓ | ❓ |
| Q5_0 | ✅ | ❓ | ❓ |
| Q5_1 | ✅ | ❓ | ❓ |
| Q8_0 | ✅ | ❓ | ❓ |
| Q2_K | 🚫 | ❓ | ❓ |
| Q3_K | ✅ | ❓ | ❓ |
| Q4_K | ✅ | ❓ | ❓ |
| Q5_K | ✅ | ❓ | ❓ |
| Q6_K | ✅ | ❓ | ❓ |
| TQ1_0 | 🚫 | ❓ | ❓ |
| TQ2_0 | 🚫 | ❓ | ❓ |
| IQ2_XXS | 🚫 | ❓ | ❓ |
| IQ2_XS | 🚫 | ❓ | ❓ |
| IQ2_S | 🚫 | ❓ | ❓ |
| IQ3_XXS | 🚫 | ❓ | ❓ |
| IQ3_S | 🚫 | ❓ | ❓ |
| IQ1_S | 🚫 | ❓ | ❓ |
| IQ1_M | 🚫 | ❓ | ❓ |
| IQ4_NL | ✅ | ❓ | ❓ |
| IQ4_XS | ✅ | ❓ | ❓ |
| FP32->FP16 | 🚫 | ❓ | ❓ |
| FP16->FP32 | 🚫 | ❓ | ❓ |
- ✅ - acceleration available
- 🚫 - acceleration unavailable, will still run using scalar implementation
- ❓ - acceleration unknown, please contribute if you can test it yourself
Last Updated by **Aaron Teo (aaron.teo1@ibm.com)** on Aug 22, 2025.
Last Updated by **Aaron Teo (aaron.teo1@ibm.com)** on Sep 7, 2025.

View File

@@ -18,6 +18,7 @@ Legend:
| ACC | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
| ADD | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
| ADD1 | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ |
| ADD_ID | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| ARANGE | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ |
| ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
| ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
@@ -26,6 +27,7 @@ Legend:
| CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ❌ |
| CONV_2D | ❌ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ |
| CONV_2D_DW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
| CONV_3D | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
| CONV_TRANSPOSE_2D | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ |
| COS | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ |
@@ -49,9 +51,11 @@ Legend:
| GET_ROWS | ❌ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | ❌ |
| GET_ROWS_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ | ❌ | ❌ |
| GROUP_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| GROUP_NORM_MUL_ADD | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| HARDSIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
| HARDSWISH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
| IM2COL | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ |
| IM2COL_3D | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| L2_NORM | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
| LEAKY_RELU | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
| LOG | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ |
@@ -61,7 +65,9 @@ Legend:
| MUL_MAT_ID | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ❌ |
| NEG | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
| NORM | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
| NORM_MUL_ADD | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| OPT_STEP_ADAMW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
| OPT_STEP_SGD | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| OUT_PROD | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ |
| PAD | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| PAD_REFLECT_1D | ❌ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ |
@@ -98,6 +104,7 @@ Legend:
| SUM | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ |
| SUM_ROWS | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| SWIGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
| SWIGLU_OAI | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| TANH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | 🟡 | ❌ |
| TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| UPSCALE | ❌ | 🟡 | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ❌ |

File diff suppressed because it is too large Load Diff

View File

@@ -333,17 +333,17 @@ static void print_params(struct my_llama_hparams * params) {
}
static void print_tensor_info(const struct ggml_context * ctx) {
for (auto t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
for (auto * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
LOG_INF("%s: Allocating ", __func__);
int64_t total = 1;
int i = 0;
for (; i < ggml_n_dims(t); ++i) {
if (i > 0) LOG("x ");
LOG("[%" PRId64 "] ", t->ne[i]);
if (i > 0) { LOG_INF("x "); }
LOG_INF("[%" PRId64 "] ", t->ne[i]);
total *= t->ne[i];
}
if (i > 1) LOG("= [%" PRId64 "] ", total);
LOG("float space for %s\n", ggml_get_name(t));
if (i > 1) { LOG_INF("= [%" PRId64 "] ", total); }
LOG_INF("float space for %s\n", ggml_get_name(t));
}
}

View File

@@ -510,19 +510,27 @@ static void diffusion_generate(llama_context * ctx,
n_generated = params.max_length;
}
static std::string format_input_text(const std::string & prompt, bool use_chat_template, llama_model * model) {
static std::string format_input_text(const std::string & prompt, const std::string & system_prompt, bool use_chat_template, llama_model * model) {
if (!use_chat_template) {
return prompt;
}
auto chat_templates = common_chat_templates_init(model, "");
common_chat_templates_inputs inputs;
common_chat_msg user_msg;
user_msg.role = "user";
user_msg.content = prompt;
inputs.add_generation_prompt = true;
common_chat_msg system_msg;
if (!system_prompt.empty()) {
system_msg.role = "system";
system_msg.content = system_prompt;
inputs.messages.push_back(system_msg);
}
common_chat_msg user_msg;
user_msg.role = "user";
user_msg.content = prompt;
inputs.messages.push_back(user_msg);
inputs.add_generation_prompt = true;
auto result = common_chat_templates_apply(chat_templates.get(), inputs);
@@ -579,7 +587,8 @@ int main(int argc, char ** argv) {
llama_set_n_threads(ctx, params.cpuparams.n_threads, params.cpuparams_batch.n_threads);
const llama_vocab * vocab = llama_model_get_vocab(model);
std::string formatted_prompt = format_input_text(params.prompt, params.enable_chat_template, model);
std::string formatted_prompt = format_input_text(params.prompt, params.system_prompt, params.enable_chat_template, model);
std::vector<llama_token> input_tokens = common_tokenize(vocab,
formatted_prompt,
@@ -596,6 +605,7 @@ int main(int argc, char ** argv) {
}
llama_token mask_token_id = llama_vocab_mask(vocab);
GGML_ASSERT(mask_token_id != LLAMA_TOKEN_NULL);
bool visual_mode = params.diffusion.visual_mode;

View File

@@ -28,6 +28,15 @@ static std::string ggml_ne_string(const ggml_tensor * t) {
return str;
}
static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
union {
float f;
uint32_t i;
} u;
u.i = (uint32_t)h.bits << 16;
return u.f;
}
static float ggml_get_float_value(uint8_t * data, ggml_type type, const size_t * nb, size_t i0, size_t i1, size_t i2, size_t i3) {
size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
float v;
@@ -43,6 +52,8 @@ static float ggml_get_float_value(uint8_t * data, ggml_type type, const size_t *
v = (float) *(int16_t *) &data[i];
} else if (type == GGML_TYPE_I8) {
v = (float) *(int8_t *) &data[i];
} else if (type == GGML_TYPE_BF16) {
v = ggml_compute_bf16_to_fp32(*(ggml_bf16_t *) &data[i]);
} else {
GGML_ABORT("fatal error");
}

View File

@@ -586,9 +586,10 @@ class SchemaConverter:
properties = list(schema.get('properties', {}).items())
return self._add_rule(rule_name, self._build_object_rule(properties, required, name, schema.get('additionalProperties')))
elif schema_type in (None, 'object') and 'allOf' in schema:
elif schema_type in (None, 'object', 'string') and 'allOf' in schema:
required = set()
properties = []
enum_sets = []
hybrid_name = name
def add_component(comp_schema, is_required):
if (ref := comp_schema.get('$ref')) is not None:
@@ -600,6 +601,9 @@ class SchemaConverter:
if is_required:
required.add(prop_name)
if 'enum' in comp_schema:
enum_sets.append(set(comp_schema['enum']))
for t in schema['allOf']:
if 'anyOf' in t:
for tt in t['anyOf']:
@@ -607,6 +611,15 @@ class SchemaConverter:
else:
add_component(t, is_required=True)
if enum_sets:
enum_intersection = enum_sets[0]
for s in enum_sets[1:]:
enum_intersection &= s
if enum_intersection:
rule = '(' + ' | '.join((self._generate_constant_rule(v) for v in sorted(enum_intersection))) + ') space'
return self._add_rule(rule_name, rule)
return self._add_rule(rule_name, self._build_object_rule(properties, required, hybrid_name, additional_properties=None))
elif schema_type in (None, 'array') and ('items' in schema or 'prefixItems' in schema):

View File

@@ -63,7 +63,7 @@ causal-verify-logits: causal-run-original-model causal-run-converted-model
@MODEL_PATH="$(MODEL_PATH)" ./scripts/utils/check-nmse.py -m ${MODEL_PATH}
causal-run-original-embeddings:
@./scripts/causal/run-casual-gen-embeddings-org.sh
@./scripts/causal/run-casual-gen-embeddings-org.py
causal-run-converted-embeddings:
@./scripts/causal/run-converted-model-embeddings-logits.sh

View File

@@ -1,5 +1,6 @@
--extra-index-url https://download.pytorch.org/whl/cpu
torch~=2.6.0
torchvision~=0.21.0
transformers~=4.55.0
huggingface-hub~=0.34.0
torch
torchvision
transformers
huggingface-hub
accelerate

View File

@@ -1,4 +1,4 @@
#/bin/bash
#!/usr/bin/env bash
set -e

View File

@@ -1,4 +1,4 @@
#!/bin/bash
#!/usr/bin/env bash
set -e

View File

@@ -3,11 +3,10 @@
import argparse
import os
import importlib
import sys
import torch
import numpy as np
from transformers import AutoTokenizer, AutoConfig, AutoModel, AutoModelForCausalLM
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM
from pathlib import Path
unreleased_model_name = os.getenv('UNRELEASED_MODEL_NAME')
@@ -43,6 +42,8 @@ if unreleased_model_name:
model = model_class.from_pretrained(model_path)
except (ImportError, AttributeError) as e:
print(f"Failed to import or load model: {e}")
print("Falling back to AutoModelForCausalLM")
model = AutoModelForCausalLM.from_pretrained(model_path)
else:
model = AutoModelForCausalLM.from_pretrained(model_path)
print(f"Model class: {type(model)}")

View File

@@ -1,4 +1,4 @@
#!/bin/bash
#!/usr/bin/env bash
set -e

View File

@@ -1,4 +1,4 @@
#!/bin/bash
#!/usr/bin/env bash
set -e

View File

@@ -9,15 +9,134 @@ from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
import torch
import numpy as np
unreleased_model_name = os.getenv('UNRELEASED_MODEL_NAME')
### If you want to dump RoPE activations, apply this monkey patch to the model
### class from Transformers that you are running (replace apertus.modeling_apertus
### with the proper package and class for your model
### === START ROPE DEBUG ===
# from transformers.models.apertus.modeling_apertus import apply_rotary_pos_emb
parser = argparse.ArgumentParser(description='Process model with specified path')
parser.add_argument('--model-path', '-m', help='Path to the model')
# orig_rope = apply_rotary_pos_emb
# torch.set_printoptions(threshold=float('inf'))
# torch.set_printoptions(precision=6, sci_mode=False)
# def debug_rope(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
# # log inputs
# summarize(q, "RoPE.q_in")
# summarize(k, "RoPE.k_in")
# # call original
# q_out, k_out = orig_rope(q, k, cos, sin, position_ids, unsqueeze_dim)
# # log outputs
# summarize(q_out, "RoPE.q_out")
# summarize(k_out, "RoPE.k_out")
# return q_out, k_out
# # Patch it
# import transformers.models.apertus.modeling_apertus as apertus_mod # noqa: E402
# apertus_mod.apply_rotary_pos_emb = debug_rope
### == END ROPE DEBUG ===
def summarize(tensor: torch.Tensor, name: str, max_seq: int = 3, max_vals: int = 3):
"""
Print a tensor in llama.cpp debug style.
Supports:
- 2D tensors (seq, hidden)
- 3D tensors (batch, seq, hidden)
- 4D tensors (batch, seq, heads, dim_per_head) via flattening heads × dim_per_head
Shows first and last max_vals of each vector per sequence position.
"""
t = tensor.detach().to(torch.float32).cpu()
# Determine dimensions
if t.ndim == 3:
_, s, _ = t.shape
elif t.ndim == 2:
_, s = 1, t.shape[0]
t = t.unsqueeze(0)
elif t.ndim == 4:
_, s, _, _ = t.shape
else:
print(f"Skipping tensor due to unsupported dimensions: {t.ndim}")
return
ten_shape = t.shape
print(f"ggml_debug: {name} = (f32) ... = {{{ten_shape}}}")
print(" [")
print(" [")
# Determine indices for first and last sequences
first_indices = list(range(min(s, max_seq)))
last_indices = list(range(max(0, s - max_seq), s))
# Check if there's an overlap between first and last indices or if we're at the edge case of s = 2 * max_seq
has_overlap = bool(set(first_indices) & set(last_indices)) or (max_seq * 2 == s)
# Combine indices
if has_overlap:
# If there's overlap, just use the combined unique indices
indices = sorted(list(set(first_indices + last_indices)))
separator_index = None
else:
# If no overlap, we'll add a separator between first and last sequences
indices = first_indices + last_indices
separator_index = len(first_indices)
for i, si in enumerate(indices):
# Add separator if needed
if separator_index is not None and i == separator_index:
print(" ...")
# Extract appropriate slice
vec = t[0, si]
if vec.ndim == 2: # 4D case: flatten heads × dim_per_head
flat = vec.flatten().tolist()
else: # 2D or 3D case
flat = vec.tolist()
# First and last slices
first = flat[:max_vals]
last = flat[-max_vals:] if len(flat) >= max_vals else flat
first_str = ", ".join(f"{v:12.4f}" for v in first)
last_str = ", ".join(f"{v:12.4f}" for v in last)
print(f" [{first_str}, ..., {last_str}]")
print(" ],")
print(" ]")
print(f" sum = {t.sum().item():.6f}\n")
def debug_hook(name):
def fn(_m, input, output):
if isinstance(input, torch.Tensor):
summarize(input, name + "_in")
elif isinstance(input, (tuple, list)) and isinstance(input[0], torch.Tensor):
summarize(input[0], name + "_in")
if isinstance(output, torch.Tensor):
summarize(output, name + "_out")
elif isinstance(output, (tuple, list)) and isinstance(output[0], torch.Tensor):
summarize(output[0], name + "_out")
return fn
unreleased_model_name = os.getenv("UNRELEASED_MODEL_NAME")
parser = argparse.ArgumentParser(description="Process model with specified path")
parser.add_argument("--model-path", "-m", help="Path to the model")
args = parser.parse_args()
model_path = os.environ.get('MODEL_PATH', args.model_path)
model_path = os.environ.get("MODEL_PATH", args.model_path)
if model_path is None:
parser.error("Model path must be specified either via --model-path argument or MODEL_PATH environment variable")
parser.error(
"Model path must be specified either via --model-path argument or MODEL_PATH environment variable"
)
config = AutoConfig.from_pretrained(model_path)
@@ -34,18 +153,30 @@ config = AutoConfig.from_pretrained(model_path)
if unreleased_model_name:
model_name_lower = unreleased_model_name.lower()
unreleased_module_path = f"transformers.models.{model_name_lower}.modular_{model_name_lower}"
unreleased_module_path = (
f"transformers.models.{model_name_lower}.modular_{model_name_lower}"
)
class_name = f"{unreleased_model_name}ForCausalLM"
print(f"Importing unreleased model module: {unreleased_module_path}")
try:
model_class = getattr(importlib.import_module(unreleased_module_path), class_name)
model = model_class.from_pretrained(model_path) # Note: from_pretrained, not fromPretrained
model_class = getattr(
importlib.import_module(unreleased_module_path), class_name
)
model = model_class.from_pretrained(
model_path
) # Note: from_pretrained, not fromPretrained
except (ImportError, AttributeError) as e:
print(f"Failed to import or load model: {e}")
exit(1)
else:
model = AutoModelForCausalLM.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(
model_path, device_map="auto", offload_folder="offload"
)
for name, module in model.named_modules():
if len(list(module.children())) == 0: # only leaf modules
module.register_forward_hook(debug_hook(name))
model_name = os.path.basename(model_path)
# Printing the Model class to allow for easier debugging. This can be useful

View File

@@ -1,4 +1,4 @@
#/bin/bash
#!/usr/bin/env bash
set -e

View File

@@ -1,4 +1,4 @@
#!/bin/bash
#!/usr/bin/env bash
set -e

View File

@@ -7,7 +7,7 @@ base_model:
Recommended way to run this model:
```sh
llama-server -hf {namespace}/{model_name}-GGUF
llama-server -hf {namespace}/{model_name}-GGUF --embeddings
```
Then the endpoint can be accessed at http://localhost:8080/embedding, for

View File

@@ -1,4 +1,4 @@
#!/bin/bash
#!/usr/bin/env bash
set -e

View File

@@ -1,4 +1,6 @@
#!/usr/bin/env bash
COLLECTION_SLUG=$(python ./create_collection.py --return-slug)
echo "Created collection: $COLLECTION_SLUG"

View File

@@ -0,0 +1,6 @@
#!/usr/bin/env bash
curl --request POST \
--url http://localhost:8080/embedding \
--header "Content-Type: application/json" \
--data '{"input": "Hello world today"}' \
--silent

View File

@@ -1,4 +1,4 @@
#!/bin/bash
#!/usr/bin/env bash
# First try command line argument, then environment variable, then file
CONVERTED_MODEL="${1:-"$CONVERTED_MODEL"}"

View File

@@ -40,7 +40,7 @@ if os.path.exists(index_path):
file_path = os.path.join(model_path, file_name)
print(f"\n--- From {file_name} ---")
with safe_open(file_path, framework="pt") as f:
with safe_open(file_path, framework="pt") as f: # type: ignore
for tensor_name in sorted(tensor_names):
tensor = f.get_tensor(tensor_name)
print(f"- {tensor_name} : shape = {tensor.shape}, dtype = {tensor.dtype}")
@@ -49,7 +49,7 @@ elif os.path.exists(single_file_path):
# Single file model (original behavior)
print("Single-file model detected")
with safe_open(single_file_path, framework="pt") as f:
with safe_open(single_file_path, framework="pt") as f: # type: ignore
keys = f.keys()
print("Tensors in model:")
for key in sorted(keys):

View File

@@ -1,4 +1,4 @@
#!/bin/bash
#!/usr/bin/env bash
set -e

View File

@@ -1,4 +1,4 @@
#!/bin/bash
#!/usr/bin/env bash
set -e

View File

@@ -1,4 +1,4 @@
#!/bin/bash
#!/usr/bin/env bash
set -e

View File

@@ -1,4 +1,4 @@
#!/bin/bash
#!/usr/bin/env bash
set -e

View File

@@ -1,4 +1,4 @@
#!/bin/bash
#!/usr/bin/env bash
set -e
#

View File

@@ -145,6 +145,20 @@ int main(int argc, char ** argv) {
llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());
if (llama_model_has_encoder(model)) {
if (llama_encode(ctx, batch)) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return 1;
}
llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
if (decoder_start_token_id == LLAMA_TOKEN_NULL) {
decoder_start_token_id = llama_vocab_bos(vocab);
}
batch = llama_batch_get_one(&decoder_start_token_id, 1);
}
// main loop
const auto t_main_start = ggml_time_us();

View File

@@ -129,10 +129,11 @@ endif()
option(GGML_LASX "ggml: enable lasx" ON)
option(GGML_LSX "ggml: enable lsx" ON)
option(GGML_RVV "ggml: enable rvv" ON)
option(GGML_RV_ZFH "ggml: enable riscv zfh" OFF)
option(GGML_RV_ZFH "ggml: enable riscv zfh" ON)
option(GGML_RV_ZVFH "ggml: enable riscv zvfh" ON)
option(GGML_RV_ZICBOP "ggml: enable riscv zicbop" ON)
option(GGML_XTHEADVECTOR "ggml: enable xtheadvector" OFF)
option(GGML_VXE "ggml: enable vxe" ON)
option(GGML_NNPA "ggml: enable nnpa" OFF) # temp disabled by default, see: https://github.com/ggml-org/llama.cpp/issues/14877
option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")

View File

@@ -132,6 +132,8 @@ extern "C" {
GGML_BACKEND_DEVICE_TYPE_CPU,
// GPU device using dedicated memory
GGML_BACKEND_DEVICE_TYPE_GPU,
// integrated GPU device using host memory
GGML_BACKEND_DEVICE_TYPE_IGPU,
// accelerator devices intended to be used together with the CPU backend (e.g. BLAS or AMX)
GGML_BACKEND_DEVICE_TYPE_ACCEL
};
@@ -150,11 +152,21 @@ extern "C" {
// all the device properties
struct ggml_backend_dev_props {
// device name
const char * name;
// device description
const char * description;
// device free memory in bytes
size_t memory_free;
// device total memory in bytes
size_t memory_total;
// device type
enum ggml_backend_dev_type type;
// device id
// for PCI devices, this should be the PCI bus id formatted as "domain:bus:device.function" (e.g. "0000:01:00.0")
// if the id is unknown, this should be NULL
const char * device_id;
// device capabilities
struct ggml_backend_dev_caps caps;
};

View File

@@ -101,7 +101,6 @@ extern "C" {
GGML_BACKEND_API int ggml_cpu_has_riscv_v (void);
GGML_BACKEND_API int ggml_cpu_has_vsx (void);
GGML_BACKEND_API int ggml_cpu_has_vxe (void);
GGML_BACKEND_API int ggml_cpu_has_nnpa (void);
GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void);
GGML_BACKEND_API int ggml_cpu_has_llamafile (void);
@@ -135,6 +134,7 @@ extern "C" {
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
GGML_BACKEND_API void ggml_cpu_fp32_to_fp32(const float *, float *, int64_t);
GGML_BACKEND_API void ggml_cpu_fp32_to_i32 (const float *, int32_t *, int64_t);
GGML_BACKEND_API void ggml_cpu_fp32_to_fp16(const float *, ggml_fp16_t *, int64_t);
GGML_BACKEND_API void ggml_cpu_fp16_to_fp32(const ggml_fp16_t *, float *, int64_t);
GGML_BACKEND_API void ggml_cpu_fp32_to_bf16(const float *, ggml_bf16_t *, int64_t);

View File

@@ -43,14 +43,8 @@ GGML_BACKEND_API ggml_backend_t ggml_backend_metal_init(void);
GGML_BACKEND_API bool ggml_backend_is_metal(ggml_backend_t backend);
GGML_DEPRECATED(
GGML_BACKEND_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size),
"obsoleted by the new device interface - https://github.com/ggml-org/llama.cpp/pull/9713");
GGML_BACKEND_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data);
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
// helper to check if the device supports a specific family
// ideally, the user code should be doing these checks
// ref: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf

View File

@@ -7,8 +7,6 @@
extern "C" {
#endif
GGML_BACKEND_API ggml_backend_t ggml_backend_zdnn_init(void);
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_zdnn_reg(void);
#ifdef __cplusplus

View File

@@ -511,6 +511,7 @@ extern "C" {
GGML_OP_CONV_TRANSPOSE_1D,
GGML_OP_IM2COL,
GGML_OP_IM2COL_BACK,
GGML_OP_IM2COL_3D,
GGML_OP_CONV_2D,
GGML_OP_CONV_3D,
GGML_OP_CONV_2D_DW,
@@ -1403,6 +1404,7 @@ extern "C" {
struct ggml_tensor * a,
struct ggml_tensor * b);
// note: casting from f32 to i32 will discard the fractional part
GGML_API struct ggml_tensor * ggml_cast(
struct ggml_context * ctx,
struct ggml_tensor * a,
@@ -1527,7 +1529,11 @@ extern "C" {
struct ggml_context * ctx,
struct ggml_tensor * a);
// supports 3D: a->ne[2] == b->ne[1]
// supports 4D a:
// a [n_embd, ne1, ne2, ne3]
// b I32 [n_rows, ne2, ne3, 1]
//
// return [n_embd, n_rows, ne2, ne3]
GGML_API struct ggml_tensor * ggml_get_rows(
struct ggml_context * ctx,
struct ggml_tensor * a, // data
@@ -1870,6 +1876,41 @@ extern "C" {
int d0, // dilation dimension 0
int d1); // dilation dimension 1
GGML_API struct ggml_tensor * ggml_im2col_3d(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
int64_t IC,
int s0, // stride width
int s1, // stride height
int s2, // stride depth
int p0, // padding width
int p1, // padding height
int p2, // padding depth
int d0, // dilation width
int d1, // dilation height
int d2, // dilation depth
enum ggml_type dst_type);
// a: [OC*IC, KD, KH, KW]
// b: [N*IC, ID, IH, IW]
// result: [N*OC, OD, OH, OW]
GGML_API struct ggml_tensor * ggml_conv_3d(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
int64_t IC,
int s0, // stride width
int s1, // stride height
int s2, // stride depth
int p0, // padding width
int p1, // padding height
int p2, // padding depth
int d0, // dilation width
int d1, // dilation height
int d2 // dilation depth
);
// kernel size is a->ne[0] x a->ne[1]
// stride is equal to kernel size
// padding is zero
@@ -1941,7 +1982,7 @@ extern "C" {
int d0, // dilation dimension 0
int d1); // dilation dimension 1
GGML_API struct ggml_tensor * ggml_conv_3d(
GGML_API struct ggml_tensor * ggml_conv_3d_direct(
struct ggml_context * ctx,
struct ggml_tensor * a, // kernel [KW, KH, KD, IC * OC]
struct ggml_tensor * b, // input [W, H, D, C * N]
@@ -2048,6 +2089,19 @@ extern "C" {
int p2,
int p3);
GGML_API struct ggml_tensor * ggml_pad_ext(
struct ggml_context * ctx,
struct ggml_tensor * a,
int lp0,
int rp0,
int lp1,
int rp1,
int lp2,
int rp2,
int lp3,
int rp3
);
// pad each dimension with reflection: [a, b, c, d] -> [b, a, b, c, d, c]
GGML_API struct ggml_tensor * ggml_pad_reflect_1d(
struct ggml_context * ctx,

View File

@@ -8,7 +8,7 @@
extern "C" {
#endif
#define GGML_BACKEND_API_VERSION 1
#define GGML_BACKEND_API_VERSION 2
//
// Backend buffer type
@@ -114,6 +114,9 @@ extern "C" {
void (*event_record)(ggml_backend_t backend, ggml_backend_event_t event);
// wait for an event on on a different stream
void (*event_wait) (ggml_backend_t backend, ggml_backend_event_t event);
// (optional) sort/optimize the nodes in the graph
void (*optimize_graph) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
};
struct ggml_backend {

View File

@@ -400,9 +400,8 @@ ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const
ggml_backend_t ggml_backend_init_best(void) {
ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU);
if (!dev) {
dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
}
dev = dev ? dev : ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU);
dev = dev ? dev : ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
if (!dev) {
return nullptr;
}

View File

@@ -463,6 +463,13 @@ void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event)
backend->iface.event_wait(backend, event);
}
static void ggml_backend_optimize_graph(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
GGML_ASSERT(backend);
if (backend->iface.optimize_graph != NULL) {
backend->iface.optimize_graph(backend, cgraph);
}
}
// Backend device
const char * ggml_backend_dev_name(ggml_backend_dev_t device) {
@@ -651,7 +658,7 @@ static bool ggml_is_view_op(enum ggml_op op) {
#endif
#ifndef GGML_SCHED_MAX_SPLIT_INPUTS
#define GGML_SCHED_MAX_SPLIT_INPUTS GGML_MAX_SRC
#define GGML_SCHED_MAX_SPLIT_INPUTS 30
#endif
#ifndef GGML_SCHED_MAX_COPIES
@@ -1298,6 +1305,10 @@ void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgra
struct ggml_backend_sched_split * split = &sched->splits[i];
split->graph = ggml_graph_view(graph, split->i_start, split->i_end);
// Optimize this split of the graph. This needs to happen before we make graph_copy,
// so they are in sync.
ggml_backend_optimize_graph(sched->backends[split->backend_id], &split->graph);
// add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
for (int j = 0; j < split->n_inputs; j++) {
assert(graph_copy->size > (graph_copy->n_nodes + 1));

View File

@@ -270,6 +270,7 @@ static struct ggml_backend_i blas_backend_i = {
/* .graph_compute = */ ggml_backend_blas_graph_compute,
/* .event_record = */ NULL,
/* .event_wait = */ NULL,
/* .optimize_graph = */ NULL,
};
static ggml_guid_t ggml_backend_blas_guid(void) {

View File

@@ -70,6 +70,8 @@
#include <aclnnop/aclnn_zero.h>
#include <aclnnop/aclnn_index_copy.h>
#include <aclnnop/aclnn_index_select.h>
#include <aclnnop/aclnn_clamp.h>
#include <aclnnop/aclnn_threshold.h>
#include <float.h>
#include <cmath>
@@ -587,9 +589,16 @@ void ggml_cann_pad(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
// the position of elements in the array means which dirction to padding,
// each position means: [dim0.front, dim0.behind, dim1.front, dim1.behind,
// dim2.front, dim2.behind, dim3.front, dim3.behind]
int64_t paddings[] = {
0, dst->ne[0] - src->ne[0], 0, dst->ne[1] - src->ne[1],
0, dst->ne[2] - src->ne[2], 0, dst->ne[3] - src->ne[3]};
const int32_t lp0 = ggml_get_op_params_i32(dst, 0);
const int32_t rp0 = ggml_get_op_params_i32(dst, 1);
const int32_t lp1 = ggml_get_op_params_i32(dst, 2);
const int32_t rp1 = ggml_get_op_params_i32(dst, 3);
const int32_t lp2 = ggml_get_op_params_i32(dst, 4);
const int32_t rp2 = ggml_get_op_params_i32(dst, 5);
const int32_t lp3 = ggml_get_op_params_i32(dst, 6);
const int32_t rp3 = ggml_get_op_params_i32(dst, 7);
int64_t paddings[] = {lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3};
aclnn_pad(ctx, acl_src, acl_dst, paddings);
ggml_cann_release_resources(ctx, acl_src, acl_dst);
}
@@ -973,18 +982,19 @@ void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
);
// build rstd, zero...
size_t acl_rstd_nb[GGML_MAX_DIMS];
int64_t acl_rstd_ne[] = {src->ne[1], src->ne[2], src->ne[3]};
size_t acl_rstd_nb[GGML_MAX_DIMS - 1];
acl_rstd_nb[0] = sizeof(float);
for (int i = 1; i < GGML_MAX_DIMS; i++) {
acl_rstd_nb[i] = acl_rstd_nb[i - 1] * src->ne[i - 1];
for (int i = 1; i < GGML_MAX_DIMS - 1; i++) {
acl_rstd_nb[i] = acl_rstd_nb[i - 1] * acl_rstd_ne[i - 1];
}
aclTensor* acl_rstd = get_f32_cache_acl_tensor(
ctx,
&ctx.rms_norm_zero_tensor_cache.cache,
ctx.rms_norm_zero_tensor_cache.size,
src->ne,
acl_rstd_ne,
acl_rstd_nb,
GGML_MAX_DIMS,
GGML_MAX_DIMS - 1,
0.0f // value
);
@@ -1423,21 +1433,25 @@ static void aclnn_pow_tensor_tensor(ggml_backend_cann_context& ctx,
* @param start Starting exponent offset.
* @param stop Stopping exponent offset (exclusive).
* @param step Step size for the exponent increment.
* @param dtype Data type for slope tensor.
*/
static void aclnn_get_slope_inner(ggml_backend_cann_context& ctx, void* slope_buffer,
float m, int64_t size, float start, float stop, float step){
int64_t ne[] = {size};
size_t nb[] = {sizeof(uint16_t)};
float m, int64_t size, float start, float stop, float step, ggml_type dtype){
aclDataType acl_type = ggml_cann_type_mapping(dtype);
size_t type_size = ggml_type_size(dtype);
ggml_cann_pool_alloc arange_allocator(ctx.pool(), size * sizeof(uint16_t));
int64_t ne[] = {size};
size_t nb[] = {type_size};
ggml_cann_pool_alloc arange_allocator(ctx.pool(), size * type_size);
void* arange_buffer = arange_allocator.get();
aclTensor* arange_tensor = ggml_cann_create_tensor(
arange_buffer, ACL_FLOAT16, sizeof(uint16_t), ne, nb, 1);
arange_buffer, acl_type, type_size, ne, nb, 1);
aclnn_arange(ctx, arange_tensor, start, stop, step, size);
aclTensor* slope_tensor = ggml_cann_create_tensor(
slope_buffer, ACL_FLOAT16, sizeof(uint16_t), ne, nb, 1);
slope_buffer, acl_type, type_size, ne, nb, 1);
aclScalar* sc = aclCreateScalar(&m, aclDataType::ACL_FLOAT);
@@ -1468,10 +1482,11 @@ static void aclnn_get_slope_inner(ggml_backend_cann_context& ctx, void* slope_bu
* @param n_head Total number of attention heads.
* @param slope_buffer Pointer to the output buffer (float array) for storing slopes.
* @param max_bias Maximum bias value for slope computation.
* @param dtype Data type for slope tensor.
*
*/
static void aclnn_get_slope(ggml_backend_cann_context & ctx, int64_t n_head,
void* slope_buffer, float max_bias) {
void* slope_buffer, float max_bias, ggml_type dtype) {
const int n_head_log2 = 1u << (uint32_t) floor(log2(n_head));
float m0 = powf(2.0f, -(max_bias) / n_head_log2);
@@ -1488,7 +1503,7 @@ static void aclnn_get_slope(ggml_backend_cann_context & ctx, int64_t n_head,
float step = 1;
float count = n_head_log2;
// end needs to be +1 because aclnn uses a left-closed, right-open interval.
aclnn_get_slope_inner(ctx, slope_buffer, m0, count, start, end + 1, step);
aclnn_get_slope_inner(ctx, slope_buffer, m0, count, start, end + 1, step, dtype);
if (n_head_log2 < n_head) {
// arange2
start = 2 * (n_head_log2 - n_head_log2) + 1;
@@ -1497,7 +1512,7 @@ static void aclnn_get_slope(ggml_backend_cann_context & ctx, int64_t n_head,
count = n_head - n_head_log2;
aclnn_get_slope_inner(
ctx, (char *) slope_buffer + n_head_log2 * sizeof(float),
m1, count, start, end + 1, step);
m1, count, start, end + 1, step, dtype);
}
}
@@ -1534,7 +1549,7 @@ static void aclnn_add_alibi(ggml_backend_cann_context& ctx, ggml_tensor* mask,
ggml_cann_pool_alloc bias_allocator(
ctx.pool(), ggml_nelements(dst) * ggml_element_size(dst));
bias_buffer = bias_allocator.get();
aclnn_get_slope(ctx, n_heads, slope_buffer, max_bias);
aclnn_get_slope(ctx, n_heads, slope_buffer, max_bias, GGML_TYPE_F32);
}
// broadcast for mask, slop and dst;
@@ -1760,10 +1775,10 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
case GGML_TYPE_F16: {
aclTensor* acl_src0 = ggml_cann_create_tensor(src0);
ggml_cann_pool_alloc src_buffer_allocator(
ctx.pool(), ggml_nelements(src0) * sizeof(float_t));
ctx.pool(), ggml_nelements(src0) * sizeof(float));
void* src_trans_buffer = src_buffer_allocator.get();
size_t src_trans_nb[GGML_MAX_DIMS];
src_trans_nb[0] = sizeof(float_t);
src_trans_nb[0] = sizeof(float);
for (int i = 1; i < GGML_MAX_DIMS; i++) {
src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
}
@@ -1807,14 +1822,14 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
// [3,4,5,64] -> [3,4,5,2,32]
dequant_ne = weight_ne;
dequant_nb[0] = sizeof(float_t);
dequant_nb[0] = sizeof(float);
for (int i = 1; i < GGML_MAX_DIMS + 1; i++) {
dequant_nb[i] = dequant_nb[i - 1] * dequant_ne[i - 1];
}
scale_offset = ggml_nelements(src0) * sizeof(int8_t);
ggml_cann_pool_alloc dequant_buffer_allocator(
ctx.pool(), ggml_nelements(src0) * sizeof(float_t));
ctx.pool(), ggml_nelements(src0) * sizeof(float));
aclTensor* acl_weight_tensor = ggml_cann_create_tensor(
src0->data, ACL_INT8, sizeof(int8_t), weight_ne, weight_nb,
@@ -1823,11 +1838,11 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
src0->data, ACL_FLOAT16, sizeof(uint16_t), scale_ne, scale_nb,
GGML_MAX_DIMS + 1, ACL_FORMAT_ND, scale_offset);
aclTensor* dequant_tensor = ggml_cann_create_tensor(
dequant_buffer_allocator.get(), ACL_FLOAT, sizeof(float_t),
dequant_buffer_allocator.get(), ACL_FLOAT, sizeof(float),
dequant_ne, dequant_nb, GGML_MAX_DIMS + 1);
aclnn_mul(ctx, acl_weight_tensor, acl_scale_tensor, dequant_tensor);
dequant_nb[0] = sizeof(float_t);
dequant_nb[0] = sizeof(float);
dequant_ne = src0->ne;
for (int i = 1; i < GGML_MAX_DIMS; i++) {
dequant_nb[i] = dequant_nb[i - 1] * src0->ne[i - 1];
@@ -1948,7 +1963,7 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
aclTensor* acl_weight_tensor;
// Only check env once.
static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or(""));
static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or("on"));
if (weight_to_nz && is_matmul_weight(weight)) {
int64_t acl_stride[2] = {1, transpose_ne[1]};
@@ -2253,8 +2268,6 @@ static void aclnn_index_fill_tensor(ggml_backend_cann_context& ctx,
* stream, and persistent buffers for rope init/cache.
* @param dst The destination ggml_tensor whose computation
* depends on the RoPE values (usually Qcur/Kcur).
* @param sin_tensor_buffer Pre-allocated buffer for storing repeated sin values.
* @param cos_tensor_buffer Pre-allocated buffer for storing repeated cos values.
* @param theta_scale Scalar exponent base for computing theta scale values.
* @param freq_scale Frequency scaling factor, applied to theta scale.
* @param attn_factor Attention scaling factor, applied to sin/cos.
@@ -2262,20 +2275,27 @@ static void aclnn_index_fill_tensor(ggml_backend_cann_context& ctx,
* (dim expansion vs repeat_interleave).
*/
static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
void* sin_tensor_buffer, void* cos_tensor_buffer,
float* corr_dims, float ext_factor,
float theta_scale, float freq_scale,
float attn_factor, bool is_neox) {
// int sin/cos cache, cache has different repeat method depond on
// @param.is_neox
ggml_tensor* src0 = dst->src[0]; // input
ggml_tensor* src1 = dst->src[1]; // position
ggml_tensor* src2 = dst->src[2]; // freq_factors
if(src2 == nullptr && ctx.rope_cache.cached
&& ctx.rope_cache.ext_factor == ext_factor
&& ctx.rope_cache.theta_scale == theta_scale
&& ctx.rope_cache.freq_scale == freq_scale
&& ctx.rope_cache.attn_factor == attn_factor
&& ctx.rope_cache.is_neox == is_neox) {
// use cache.
return;
}
int64_t theta_scale_length = src0->ne[0] / 2;
int64_t theta_scale_ne[] = {theta_scale_length, 1, 1, 1};
size_t theta_scale_nb[] = {sizeof(float_t), sizeof(float_t), sizeof(float_t),
theta_scale_length * sizeof(float_t)};
size_t theta_scale_nb[] = {sizeof(float), sizeof(float), sizeof(float),
theta_scale_length * sizeof(float)};
GGML_ASSERT(src1->type == GGML_TYPE_I32);
int64_t position_length = src1->ne[0];
@@ -2285,7 +2305,7 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
int64_t theta_ne[] = {theta_scale_length, 1, position_length, 1};
size_t theta_nb[GGML_MAX_DIMS];
theta_nb[0] = sizeof(float_t);
theta_nb[0] = sizeof(float);
for (int i = 1; i < GGML_MAX_DIMS; i++) {
theta_nb[i] = theta_nb[i - 1] * theta_ne[i - 1];
}
@@ -2300,16 +2320,14 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
ctx.rope_cache.freq_scale != freq_scale) {
ctx.rope_cache.theta_scale_length = theta_scale_length;
ctx.rope_cache.theta_scale = theta_scale;
ctx.rope_cache.freq_scale = freq_scale;
if (ctx.rope_cache.theta_scale_cache != nullptr) {
ACL_CHECK(aclrtFree(ctx.rope_cache.theta_scale_cache));
}
ACL_CHECK(aclrtMalloc(&ctx.rope_cache.theta_scale_cache, theta_scale_length * sizeof(float_t), ACL_MEM_MALLOC_HUGE_FIRST));
ACL_CHECK(aclrtMalloc(&ctx.rope_cache.theta_scale_cache, theta_scale_length * sizeof(float), ACL_MEM_MALLOC_HUGE_FIRST));
acl_theta_scale_tensor =
ggml_cann_create_tensor(ctx.rope_cache.theta_scale_cache, ACL_FLOAT, sizeof(float_t),
ggml_cann_create_tensor(ctx.rope_cache.theta_scale_cache, ACL_FLOAT, sizeof(float),
theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS);
float start = 0;
@@ -2318,39 +2336,97 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
float n_elements = theta_scale_length;
aclnn_arange(ctx, acl_theta_scale_tensor, start, stop, step, n_elements);
ggml_cann_pool_alloc yarn_ramp_allocator(ctx.pool());
aclTensor* acl_yarn_ramp_tensor = nullptr;
if (ext_factor != 0) {
// -rope_yarn_ramp
// const float y = (i0 / 2 - low) / MAX(0.001f, high - low);
// return MIN(1, MAX(0, y)) - 1;
yarn_ramp_allocator.alloc(theta_scale_length * sizeof(float));
void* yarn_ramp_buffer = yarn_ramp_allocator.get();
acl_yarn_ramp_tensor = ggml_cann_create_tensor(yarn_ramp_buffer, ACL_FLOAT, sizeof(float),
theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS);
float zero_value = 0, one_value = 1;
float denom_safe_value = MAX(0.001f, corr_dims[1] - corr_dims[0]);
aclScalar* low = aclCreateScalar(&corr_dims[0], aclDataType::ACL_FLOAT);
aclScalar* zero = aclCreateScalar(&zero_value, aclDataType::ACL_FLOAT);
aclScalar* one = aclCreateScalar(&one_value, aclDataType::ACL_FLOAT);
aclScalar* denom_safe = aclCreateScalar(&denom_safe_value, aclDataType::ACL_FLOAT);
aclScalar* ext_factor_sc = aclCreateScalar(&ext_factor, aclDataType::ACL_FLOAT);
GGML_CANN_CALL_ACLNN_OP(ctx, Subs, acl_theta_scale_tensor, low, one, acl_yarn_ramp_tensor);
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceDivs, acl_yarn_ramp_tensor, denom_safe);
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceThreshold, acl_yarn_ramp_tensor, zero, zero);
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceClampMax, acl_yarn_ramp_tensor, one);
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceSubs, acl_yarn_ramp_tensor, one, one);
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMuls, acl_yarn_ramp_tensor, ext_factor_sc);
// theta_interp = freq_scale * theta_extrap;
// theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
// theta = freq_scale * theta_extrap * (1 - ramp_mix) + theta_extrap * ramp_mix;
// theta = freq_scale * theta_extrap - freq_scale * theta_extrap * ramp_mix + theta_extrap * ramp_mix;
// theta = theta_extrap * (freq_scale - freq_scale * ramp_mix + ramp_mix);
//
// we cache (freq_scale - freq_scale * ramp_mix + ramp_mix), Considering that the rope_yarn_ramp here is the inverse
// cache freq_scale + (freq_scale - 1) * ramp_mix
float freq_scale_1 = freq_scale - 1;
aclScalar* freq_scale_sc = aclCreateScalar(&freq_scale, aclDataType::ACL_FLOAT);
aclScalar* freq_scale_1_sc = aclCreateScalar(&freq_scale_1, aclDataType::ACL_FLOAT);
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMuls, acl_yarn_ramp_tensor, freq_scale_1_sc);
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceAdds, acl_yarn_ramp_tensor, freq_scale_sc, one);
ggml_cann_release_resources(ctx, low, zero, one, denom_safe, ext_factor_sc, freq_scale_sc, freq_scale_1_sc);
}
// power
aclScalar* acl_theta_scale = aclCreateScalar(&theta_scale, aclDataType::ACL_FLOAT);
GGML_CANN_CALL_ACLNN_OP(ctx, PowScalarTensor, acl_theta_scale, acl_theta_scale_tensor,
acl_theta_scale_tensor);
// freq_scale
if (freq_scale != 1) {
if (ext_factor != 0) {
aclnn_mul(ctx, acl_theta_scale_tensor, acl_yarn_ramp_tensor);
} else if (freq_scale != 1) {
aclnn_muls(ctx, acl_theta_scale_tensor, freq_scale, nullptr, true);
}
ggml_cann_release_resources(ctx, acl_theta_scale);
ggml_cann_release_resources(ctx, acl_yarn_ramp_tensor, acl_theta_scale);
} else {
// use cache
acl_theta_scale_tensor =
ggml_cann_create_tensor(ctx.rope_cache.theta_scale_cache, ACL_FLOAT, sizeof(float_t),
ggml_cann_create_tensor(ctx.rope_cache.theta_scale_cache, ACL_FLOAT, sizeof(float),
theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS);
}
ggml_cann_pool_alloc freq_fac_res_allocator(ctx.pool());
// freq_factors
if (src2) {
freq_fac_res_allocator.alloc(theta_scale_length * sizeof(float_t));
freq_fac_res_allocator.alloc(theta_scale_length * sizeof(float));
void* freq_fac_res_ptr = freq_fac_res_allocator.get();
aclTensor* acl_freq_factors_tensor = ggml_cann_create_tensor(
src2->data, ggml_cann_type_mapping(src2->type),
ggml_type_size(src2->type), theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS);
aclTensor* acl_freq_fac_res_tensor = ggml_cann_create_tensor(
freq_fac_res_ptr, ACL_FLOAT, sizeof(float_t),
freq_fac_res_ptr, ACL_FLOAT, sizeof(float),
theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS);
aclnn_div(ctx, acl_theta_scale_tensor, acl_freq_factors_tensor, acl_freq_fac_res_tensor);
std::swap(acl_theta_scale_tensor, acl_freq_fac_res_tensor);
ggml_cann_release_resources(ctx, acl_freq_factors_tensor, acl_freq_fac_res_tensor);
}
// init sin_repeat && cos_repeat, only to accelerate first layer on each device
if (position_length > ctx.rope_cache.position_length) {
ctx.rope_cache.position_length = position_length;
if (ctx.rope_cache.sin_cache != nullptr) {
ACL_CHECK(aclrtFree(ctx.rope_cache.sin_cache));
}
if (ctx.rope_cache.cos_cache != nullptr) {
ACL_CHECK(aclrtFree(ctx.rope_cache.cos_cache));
}
int64_t repeat_theta_length = theta_scale_length * position_length * 2;
ACL_CHECK(aclrtMalloc(&ctx.rope_cache.sin_cache, repeat_theta_length * sizeof(float), ACL_MEM_MALLOC_HUGE_FIRST));
ACL_CHECK(aclrtMalloc(&ctx.rope_cache.cos_cache, repeat_theta_length * sizeof(float), ACL_MEM_MALLOC_HUGE_FIRST));
}
// position
aclTensor* acl_position_tensor = ggml_cann_create_tensor(
src1->data, ggml_cann_type_mapping(src1->type),
@@ -2359,32 +2435,36 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
// power * position
int64_t theta_length = theta_scale_length * position_length;
ggml_cann_pool_alloc theta_allocator(ctx.pool(),
theta_length * sizeof(float_t));
theta_length * sizeof(float));
void* theta_buffer = theta_allocator.get();
aclTensor* acl_theta_tensor =
ggml_cann_create_tensor(theta_buffer, ACL_FLOAT, sizeof(float_t),
ggml_cann_create_tensor(theta_buffer, ACL_FLOAT, sizeof(float),
theta_ne, theta_nb, GGML_MAX_DIMS);
aclnn_mul(ctx, acl_position_tensor, acl_theta_scale_tensor,
acl_theta_tensor);
// sin/cos
ggml_cann_pool_alloc sin_allocator(ctx.pool(),
theta_length * sizeof(float_t));
theta_length * sizeof(float));
void* sin_buffer = sin_allocator.get();
aclTensor* acl_sin_tensor = ggml_cann_create_tensor(
sin_buffer, ACL_FLOAT, sizeof(float_t), theta_ne, theta_nb,
sin_buffer, ACL_FLOAT, sizeof(float), theta_ne, theta_nb,
GGML_MAX_DIMS, ACL_FORMAT_ND);
aclnn_sin(ctx, acl_theta_tensor, acl_sin_tensor);
ggml_cann_pool_alloc cos_allocator(ctx.pool(),
theta_length * sizeof(float_t));
theta_length * sizeof(float));
void* cos_buffer = cos_allocator.get();
aclTensor* acl_cos_tensor = ggml_cann_create_tensor(
cos_buffer, ACL_FLOAT, sizeof(float_t), theta_ne, theta_nb,
cos_buffer, ACL_FLOAT, sizeof(float), theta_ne, theta_nb,
GGML_MAX_DIMS, ACL_FORMAT_ND);
aclnn_cos(ctx, acl_theta_tensor, acl_cos_tensor);
if (ext_factor != 0) {
attn_factor *= 1.0f + 0.1f * logf(1.0f / freq_scale);
}
// attn_factor
if (attn_factor != 1) {
aclnn_muls(ctx, acl_sin_tensor, attn_factor, nullptr, true);
@@ -2393,15 +2473,15 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
int64_t sin_reshape_ne[4] = {src0->ne[0], 1, src0->ne[2], 1};
size_t sin_reshape_nb[GGML_MAX_DIMS];
sin_reshape_nb[0] = sizeof(float_t);
sin_reshape_nb[0] = sizeof(float);
for (int i = 1; i < GGML_MAX_DIMS; i++) {
sin_reshape_nb[i] = sin_reshape_nb[i - 1] * sin_reshape_ne[i - 1];
}
aclTensor* acl_sin_repeat_tensor =
ggml_cann_create_tensor(sin_tensor_buffer, ACL_FLOAT, sizeof(float_t),
ggml_cann_create_tensor(ctx.rope_cache.sin_cache, ACL_FLOAT, sizeof(float),
sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
aclTensor* acl_cos_repeat_tensor =
ggml_cann_create_tensor(cos_tensor_buffer, ACL_FLOAT, sizeof(float_t),
ggml_cann_create_tensor(ctx.rope_cache.cos_cache, ACL_FLOAT, sizeof(float),
sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
// repeat
@@ -2419,6 +2499,14 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
num_repeats, output_size);
}
// Other layers use cache except first layer.
ctx.rope_cache.cached = true;
ctx.rope_cache.ext_factor = ext_factor;
ctx.rope_cache.theta_scale = theta_scale;
ctx.rope_cache.freq_scale = freq_scale;
ctx.rope_cache.attn_factor = attn_factor;
ctx.rope_cache.is_neox = is_neox;
ggml_cann_release_resources(ctx, acl_theta_scale_tensor, acl_position_tensor,
acl_theta_tensor, acl_sin_tensor, acl_sin_repeat_tensor, acl_cos_tensor,
acl_cos_repeat_tensor);
@@ -2440,10 +2528,7 @@ aclnnStatus aclnnRotaryPositionEmbedding(void* workspace,
#endif
void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
// TODO: use ascendc
// Only test with LLAMA model.
ggml_tensor* src0 = dst->src[0]; // input
ggml_tensor* src1 = dst->src[1];
// param
float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
@@ -2465,8 +2550,6 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
// TODO: n_dims <= ne0
GGML_ASSERT(n_dims == ne0);
GGML_ASSERT(n_dims % 2 == 0);
// TODO: ext_factor != 0
GGML_ASSERT(ext_factor == 0);
const float theta_scale = powf(freq_base, -2.0f / n_dims);
@@ -2476,28 +2559,21 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
// sin/cos tensor length.
int64_t repeat_theta_length = src0->ne[0] * src1->ne[0];
ggml_cann_pool_alloc sin_tensor_allocator(ctx.pool(), repeat_theta_length * sizeof(float));
ggml_cann_pool_alloc cos_tensor_allocator(ctx.pool(), repeat_theta_length * sizeof(float));
void *sin_tensor_buffer = sin_tensor_allocator.get();
void *cos_tensor_buffer = cos_tensor_allocator.get();
// init ctx.rope_cos/rope_sin cache
aclnn_cache_init(ctx, dst, sin_tensor_buffer, cos_tensor_buffer,
aclnn_cache_init(ctx, dst, corr_dims, ext_factor,
theta_scale, freq_scale, attn_factor, is_neox);
int64_t sin_reshape_ne[4] = {ne00, 1, ne02, 1};
size_t sin_reshape_nb[GGML_MAX_DIMS];
sin_reshape_nb[0] = sizeof(float_t);
sin_reshape_nb[0] = sizeof(float);
for (int i = 1; i < GGML_MAX_DIMS; i++) {
sin_reshape_nb[i] = sin_reshape_nb[i - 1] * sin_reshape_ne[i - 1];
}
aclTensor* acl_sin_reshape_tensor =
ggml_cann_create_tensor(sin_tensor_buffer, ACL_FLOAT, sizeof(float_t),
ggml_cann_create_tensor(ctx.rope_cache.sin_cache, ACL_FLOAT, sizeof(float),
sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
aclTensor* acl_cos_reshape_tensor =
ggml_cann_create_tensor(cos_tensor_buffer, ACL_FLOAT, sizeof(float_t),
ggml_cann_create_tensor(ctx.rope_cache.cos_cache, ACL_FLOAT, sizeof(float),
sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
aclTensor* acl_src = ggml_cann_create_tensor(src0);
@@ -2512,7 +2588,7 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
void* minus_one_scale_buffer = nullptr;
ggml_cann_pool_alloc roll_allocator(ctx.pool(), ggml_nbytes(src0));
ggml_cann_pool_alloc minus_one_scale_allocator(
ctx.pool(), sizeof(float_t) * src0->ne[0]);
ctx.pool(), sizeof(float) * src0->ne[0]);
if (!is_neox) {
// roll input: [q0,q1,q2,q3,...] -> [q1,q0,q3,q2,...]
input_roll_buffer = roll_allocator.get();
@@ -2542,13 +2618,13 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1};
size_t minus_one_nb[GGML_MAX_DIMS];
minus_one_nb[0] = sizeof(float_t);
minus_one_nb[0] = sizeof(float);
for (int i = 1; i < GGML_MAX_DIMS; i++) {
minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1];
}
acl_minus_one_tensor = aclnn_values(
ctx, minus_one_scale_buffer, sizeof(float_t) * src0->ne[0],
minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), 1);
ctx, minus_one_scale_buffer, sizeof(float) * src0->ne[0],
minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float), 1);
int64_t dim = 3;
int64_t* index = new int64_t[src0->ne[0]];
for (int i = 0; i < src0->ne[0]; i++) {
@@ -2576,22 +2652,22 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
minus_one_scale_buffer = minus_one_scale_allocator.get();
int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1};
size_t minus_one_nb[GGML_MAX_DIMS];
minus_one_nb[0] = sizeof(float_t);
minus_one_nb[0] = sizeof(float);
for (int i = 1; i < GGML_MAX_DIMS; i++) {
minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1];
}
acl_minus_one_tensor = aclnn_values(
ctx, minus_one_scale_buffer, sizeof(float_t) * src0->ne[0],
minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), 1);
ctx, minus_one_scale_buffer, sizeof(float) * src0->ne[0],
minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float), 1);
// -1 * first half
int64_t first_half_ne[4] = {src0->ne[0] / 2, 1, 1, 1};
size_t first_half_nb[GGML_MAX_DIMS];
first_half_nb[0] = sizeof(float_t);
first_half_nb[0] = sizeof(float);
for (int i = 1; i < GGML_MAX_DIMS; i++) {
first_half_nb[i] = first_half_nb[i - 1] * first_half_ne[i - 1];
}
aclTensor* acl_first_half_tensor = ggml_cann_create_tensor(
minus_one_scale_buffer, ACL_FLOAT, sizeof(float_t), first_half_ne,
minus_one_scale_buffer, ACL_FLOAT, sizeof(float), first_half_ne,
first_half_nb, GGML_MAX_DIMS);
bool inplace = true;
float scale = -1;
@@ -2631,28 +2707,28 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
// TODO: ne0 != n_dims in mode2
} else if (src0->type == GGML_TYPE_F16) {
size_t input_fp32_nb[GGML_MAX_DIMS];
input_fp32_nb[0] = sizeof(float_t);
input_fp32_nb[0] = sizeof(float);
for (int i = 1; i < GGML_MAX_DIMS; i++) {
input_fp32_nb[i] = input_fp32_nb[i - 1] * dst->ne[i - 1];
}
ggml_cann_pool_alloc fp32_allocator1(
ctx.pool(), ggml_nelements(dst) * sizeof(float_t));
ctx.pool(), ggml_nelements(dst) * sizeof(float));
void* input_fp32_buffer1 = fp32_allocator1.get();
aclTensor* input_fp32_tensor1 = ggml_cann_create_tensor(
input_fp32_buffer1, ACL_FLOAT, sizeof(float_t), dst->ne,
input_fp32_buffer1, ACL_FLOAT, sizeof(float), dst->ne,
input_fp32_nb, GGML_MAX_DIMS);
ggml_cann_pool_alloc fp32_allocator2(
ctx.pool(), ggml_nelements(dst) * sizeof(float_t));
ctx.pool(), ggml_nelements(dst) * sizeof(float));
void* input_fp32_buffer2 = fp32_allocator2.get();
aclTensor* input_fp32_tensor2 = ggml_cann_create_tensor(
input_fp32_buffer2, ACL_FLOAT, sizeof(float_t), dst->ne,
input_fp32_buffer2, ACL_FLOAT, sizeof(float), dst->ne,
input_fp32_nb, GGML_MAX_DIMS);
ggml_cann_pool_alloc fp32_allocator(
ctx.pool(), ggml_nelements(dst) * sizeof(float_t));
ctx.pool(), ggml_nelements(dst) * sizeof(float));
output_fp32_buffer = fp32_allocator.get();
aclTensor* output_fp32_tensor = ggml_cann_create_tensor(
output_fp32_buffer, ACL_FLOAT, sizeof(float_t), dst->ne,
output_fp32_buffer, ACL_FLOAT, sizeof(float), dst->ne,
input_fp32_nb, GGML_MAX_DIMS);
aclnn_mul(ctx, acl_src, acl_cos_reshape_tensor, input_fp32_tensor1);
aclnn_mul(ctx, acl_input_roll_mul_scale_tensor, acl_sin_reshape_tensor,
@@ -2749,8 +2825,6 @@ void ggml_cann_conv_transpose_1d(ggml_backend_cann_context& ctx, ggml_tensor* ds
aclIntArray *padding = aclCreateIntArray(paddingVal, 1);
int64_t dilationVal[] = {1};
aclIntArray *dilation = aclCreateIntArray(dilationVal, 1);
bool transposed = true;
int64_t groups = 1;
int8_t cubeMathType = 0;
#ifdef ASCEND_310P
@@ -2758,7 +2832,7 @@ void ggml_cann_conv_transpose_1d(ggml_backend_cann_context& ctx, ggml_tensor* ds
#endif
GGML_CANN_CALL_ACLNN_OP(ctx, Convolution, acl_input, acl_weight, nullptr, stride,
padding, dilation, transposed, padding, groups, acl_dst, cubeMathType);
padding, dilation, true, padding, 1, acl_dst, cubeMathType);
ggml_cann_release_resources(ctx, acl_weight, acl_dst, stride, padding, dilation);
}
@@ -3220,7 +3294,7 @@ void ggml_cann_flash_attn_ext(ggml_backend_cann_context& ctx, ggml_tensor* dst){
const int64_t n_heads = src0->ne[2];
ggml_cann_pool_alloc slope_allocator(ctx.pool(), n_heads * sizeof(uint16_t));
void* slope_buffer = slope_allocator.get();
aclnn_get_slope(ctx, n_heads, slope_buffer, maxBias);
aclnn_get_slope(ctx, n_heads, slope_buffer, maxBias, GGML_TYPE_F16);
int64_t slope_ne[] = {1, 1, n_heads, 1};
size_t slope_nb[GGML_MAX_DIMS];

View File

@@ -38,6 +38,7 @@
#include <unistd.h>
#include <functional>
#include <optional>
#include <list>
#include "../include/ggml-cann.h"
#include "../include/ggml.h"
@@ -106,6 +107,7 @@ int32_t ggml_cann_get_device();
std::optional<std::string> get_env(const std::string& name);
bool parse_bool(const std::string& value);
int parse_integer(const std::string& value);
/**
* @brief Abstract base class for memory pools used by CANN.
@@ -350,7 +352,7 @@ struct ggml_graph_node_properties {
struct ggml_cann_graph {
~ggml_cann_graph() {
if (graph != nullptr) {
aclmdlRIDestroy(graph);
ACL_CHECK(aclmdlRIDestroy(graph));
}
}
@@ -358,6 +360,64 @@ struct ggml_cann_graph {
std::vector<ggml_graph_node_properties> ggml_graph_properties;
};
/**
* @brief LRU cache for managing ggml_cann_graph objects.
*
* This class maintains a list of shared_ptr to ggml_cann_graph objects
* and enforces a maximum capacity. It provides methods to push new graphs,
* move existing graphs to the front (most recently used), and clear the cache.
*/
struct ggml_cann_graph_lru_cache {
size_t capacity; /**< Maximum number of graphs in the cache. */
std::list<ggml_cann_graph*> cache_list; /**< List storing cached graphs as raw pointers. */
ggml_cann_graph_lru_cache() {
capacity = parse_integer(get_env("GGML_CANN_GRAPH_CACHE_CAPACITY").value_or("12"));
}
/**
* @brief Push a new graph to the front of the cache.
* If the cache exceeds capacity, the least recently used graph is deleted.
* @param new_node Pointer to the new ggml_cann_graph to cache.
* Ownership is transferred to the cache (cache will delete it).
*/
void push(ggml_cann_graph* new_node) {
if (cache_list.size() >= capacity) {
ggml_cann_graph* old = cache_list.back();
cache_list.pop_back();
delete old; // free the old graph
}
cache_list.push_front(new_node);
}
/**
* @brief Move an existing graph to the front of the cache.
* @param node Pointer to the ggml_cann_graph to move.
*/
void move_to_front(ggml_cann_graph* node) {
cache_list.remove(node);
cache_list.push_front(node);
}
/**
* @brief Clear all graphs from the cache (also frees memory).
*/
void clear() {
for (auto ptr : cache_list) {
delete ptr;
}
cache_list.clear();
}
/**
* @brief Destructor that clears the cache and frees all cached graphs.
*/
~ggml_cann_graph_lru_cache() {
clear();
}
};
#endif // USE_ACL_GRAPH
struct ggml_cann_rope_cache {
@@ -365,12 +425,27 @@ struct ggml_cann_rope_cache {
if(theta_scale_cache != nullptr) {
ACL_CHECK(aclrtFree(theta_scale_cache));
}
if(sin_cache != nullptr) {
ACL_CHECK(aclrtFree(sin_cache));
}
if(cos_cache != nullptr) {
ACL_CHECK(aclrtFree(cos_cache));
}
}
void* theta_scale_cache = nullptr;
int64_t theta_scale_length = 0;
// sin/cos cache, used only to accelerate first layer on each device
void* sin_cache = nullptr;
void* cos_cache = nullptr;
int64_t position_length = 0;
// Properties to check before reusing the sincos cache
bool cached = false;
float ext_factor = 0.0f;
float theta_scale = 0.0f;
float freq_scale = 0.0f;
float attn_factor = 0.0f;
bool is_neox = false;
};
struct ggml_cann_tensor_cache {
@@ -394,7 +469,8 @@ struct ggml_backend_cann_context {
aclrtEvent copy_event = nullptr; /**< Event for managing copy operations. */
#ifdef USE_ACL_GRAPH
/// Cached CANN ACL graph used for executing the current ggml computation graph.
std::unique_ptr<ggml_cann_graph> cann_graph;
ggml_cann_graph_lru_cache graph_lru_cache;
bool acl_graph_mode = true;
#endif
cann_task_queue task_queue;
bool async_mode;
@@ -404,7 +480,6 @@ struct ggml_backend_cann_context {
ggml_cann_tensor_cache rms_norm_one_tensor_cache;
ggml_cann_tensor_cache rms_norm_zero_tensor_cache;
aclrtStream streams[GGML_CANN_MAX_STREAMS] = {nullptr}; /**< Array of streams for the device. */
/**
@@ -419,6 +494,13 @@ struct ggml_backend_cann_context {
async_mode = parse_bool(get_env("GGML_CANN_ASYNC_MODE").value_or(""));
GGML_LOG_INFO("%s: device %d async operator submission is %s\n", __func__,
device, async_mode ? "ON" : "OFF");
#ifdef USE_ACL_GRAPH
acl_graph_mode = parse_bool(get_env("GGML_CANN_ACL_GRAPH").value_or("on"));
GGML_LOG_INFO("%s: device %d execution mode is %s (%s)\n",
__func__, device,
acl_graph_mode ? "GRAPH" : "EAGER",
acl_graph_mode ? "acl graph enabled" : "acl graph disabled");
#endif
}
/**
@@ -444,7 +526,10 @@ struct ggml_backend_cann_context {
*/
aclrtStream stream(int stream) {
if (streams[stream] == nullptr) {
ggml_cann_set_device(device);
// If the device is not set here, destroying the stream later may cause a mismatch
// between the thread contexts where the stream was created and destroyed.
// However, I printed the device_id, thread_id, and stream, and they are all consistent.
ACL_CHECK(aclrtSetDevice(device));
ACL_CHECK(aclrtCreateStream(&streams[stream]));
}
return streams[stream];

View File

@@ -75,13 +75,12 @@
* @param device The device ID to set.
*/
void ggml_cann_set_device(const int32_t device) {
// TODO: uncomment these lines after empty context has fixed.
// int current_device;
// ACL_CHECK(aclrtGetDevice(&current_device));
int current_device = -1;
aclrtGetDevice(&current_device);
// if (device == current_device) {
// return;
// }
if (device == current_device) {
return;
}
ACL_CHECK(aclrtSetDevice(device));
}
@@ -116,6 +115,24 @@ bool parse_bool(const std::string& value) {
return valid_values.find(value) != valid_values.end();
}
/**
* @brief Parse a string as an integer, returning 0 if invalid.
*
* This function attempts to convert the input string `value` to an `int`.
* If the string is not a valid integer or is out of the `int` range,
* it returns 0.
*
* @param value The string to parse.
* @return The parsed integer, or 0 if conversion fails.
*/
int parse_integer(const std::string& value) {
try {
return std::stoi(value);
} catch (...) {
return 0;
}
}
/**
* @brief Initialize the CANN device information.
*
@@ -1116,30 +1133,65 @@ static enum ggml_status ggml_backend_cann_buffer_init_tensor(
return GGML_STATUS_SUCCESS;
}
// ND to NZ Workspace Cache Management. Thread-safety: Not guaranteed
namespace {
void* g_nz_workspace = nullptr;
size_t g_nz_workspace_allocated = 0;
/**
* @brief Workspace for caching NZ buffers per device.
*
* This struct manages a device buffer used in NZ computations. It supports
* allocation, reallocation, and clearing of cached memory. The struct is
* designed to be used with a global array, one per device.
*/
struct ggml_cann_nz_workspace {
void* ptr; // Pointer to allocated device buffer
size_t allocated; // Size of currently allocated buffer in bytes
void release_nz_workspace() {
if (g_nz_workspace) {
aclrtFree(g_nz_workspace);
g_nz_workspace = nullptr;
g_nz_workspace_allocated = 0;
/**
* @brief Constructor. Initializes the workspace with no allocated memory.
*/
ggml_cann_nz_workspace() : ptr(nullptr), allocated(0) {}
/**
* @brief Free cached memory and reset the workspace.
*
* If a buffer has been allocated, this function releases it using
* aclrtFree and resets internal state.
*/
void clear() {
if (ptr) {
ACL_CHECK(aclrtFree(ptr));
ptr = nullptr;
allocated = 0;
}
}
void relloc_nz_workspace(size_t new_size) {
if (new_size > g_nz_workspace_allocated) {
if (g_nz_workspace) {
aclrtFree(g_nz_workspace);
g_nz_workspace = nullptr;
/**
* @brief Allocate or reallocate the workspace buffer.
*
* If the requested size is larger than the currently allocated size,
* the old buffer will be freed and a new buffer of the requested size
* will be allocated on the device.
*
* @param new_size Size in bytes to allocate for the workspace.
*/
void realloc(size_t new_size) {
if (new_size > allocated) {
clear();
ACL_CHECK(aclrtMalloc(&ptr, new_size, ACL_MEM_MALLOC_HUGE_FIRST));
allocated = new_size;
}
ACL_CHECK(aclrtMalloc(&g_nz_workspace, new_size, ACL_MEM_MALLOC_HUGE_FIRST));
g_nz_workspace_allocated = new_size;
}
}
}
/**
* @brief Get the device buffer pointer.
*
* @return Pointer to the allocated buffer, or nullptr if not allocated.
*/
void* get() const { return ptr; }
};
/**
* @brief Global array of NZ workspaces, one per device.
*/
static ggml_cann_nz_workspace g_nz_workspaces[GGML_CANN_MAX_DEVICES];
/**
* @brief Convert tensor weights to NZ format using Ascend CANN API.
@@ -1149,13 +1201,13 @@ namespace {
* improve performance on certain hardware.
*
* @param tensor Pointer to the input ggml_tensor containing the weights.
* @param data Pointer to the raw data buffer for the tensor weights.
* @param offset Byte offset within the tensor data buffer where weights start.
* @param device device id.
*
* @note The workspace buffer used in this function is managed globally and reused
* across calls. This reduces overhead from repeated memory allocation and deallocation.
*/
static void weight_format_to_nz(ggml_tensor *tensor, size_t offset) {
static void weight_format_to_nz(ggml_tensor *tensor, size_t offset, int device) {
aclTensor* weightTransposed = ggml_cann_create_tensor(tensor, tensor->ne,
tensor->nb, 2, ACL_FORMAT_ND, offset);
uint64_t workspaceSize = 0;
@@ -1165,7 +1217,9 @@ static void weight_format_to_nz(ggml_tensor *tensor, size_t offset) {
ACL_CHECK(aclnnTransMatmulWeightGetWorkspaceSize(weightTransposed,
&workspaceSize, &executor));
// Avoid frequent malloc/free of the workspace.
relloc_nz_workspace(workspaceSize);
g_nz_workspaces[device].realloc(workspaceSize);
void* g_nz_workspace = g_nz_workspaces[device].get();
ACL_CHECK(aclnnTransMatmulWeight(g_nz_workspace, workspaceSize, executor, nullptr));
ACL_CHECK(aclDestroyTensor(weightTransposed));
@@ -1196,14 +1250,14 @@ static void ggml_backend_cann_buffer_set_tensor(
// Why aclrtSynchronizeDevice?
// Only check env once.
static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or(""));
static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or("on"));
if (!need_transform(tensor->type)) {
ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size, data, size,
ACL_MEMCPY_HOST_TO_DEVICE));
if (weight_to_nz && is_matmul_weight((const ggml_tensor*)tensor)) {
GGML_ASSERT(tensor->ne[2] == 1);
GGML_ASSERT(tensor->ne[3] == 1);
weight_format_to_nz(tensor, offset);
weight_format_to_nz(tensor, offset, ctx->device);
}
} else {
void *transform_buffer = malloc(size);
@@ -1279,6 +1333,10 @@ static bool ggml_backend_cann_buffer_cpy_tensor(
ACL_MEMCPY_DEVICE_TO_DEVICE));
return true;
} else {
#ifdef ASCEND_310P
// TODO: Support 310p P2P copy
return false;
#endif
// Different device but can access by peer.
int32_t canAccessPeer = 0;
ACL_CHECK(aclrtDeviceCanAccessPeer(&canAccessPeer, src_ctx->device,
@@ -1439,7 +1497,7 @@ static size_t ggml_backend_cann_buffer_type_get_alloc_size(
int64_t ne0 = tensor->ne[0];
// Only check env once.
static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or(""));
static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or("on"));
// last line must bigger than 32, because every single op deal at
// least 32 bytes.
@@ -1670,6 +1728,7 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
ggml_cann_get_rows(ctx, dst);
break;
case GGML_OP_SET_ROWS:
std::cout << "lcg GGML_OP_SET_ROWS"<< std::endl;
ggml_cann_set_rows(ctx, dst);
break;
case GGML_OP_DUP:
@@ -2000,6 +2059,8 @@ static bool ggml_backend_cann_cpy_tensor_async(
GGML_ASSERT(ggml_backend_is_cann(backend_src) ||
ggml_backend_is_cann(backend_dst));
GGML_ASSERT(!is_matmul_weight((const ggml_tensor*)src));
if (!ggml_backend_buffer_is_cann(src->buffer) ||
!ggml_backend_buffer_is_cann(dst->buffer)) {
return false;
@@ -2020,6 +2081,10 @@ static bool ggml_backend_cann_cpy_tensor_async(
return true;
}
if (backend_src != backend_dst) {
#ifdef ASCEND_310P
// TODO: Support 310p P2P copy
return false;
#endif
ggml_backend_cann_buffer_context* buf_ctx_src =
(ggml_backend_cann_buffer_context*)buf_src->context;
ggml_backend_cann_buffer_context* buf_ctx_dst =
@@ -2036,7 +2101,6 @@ static bool ggml_backend_cann_cpy_tensor_async(
}
// need open both directions for memcpyasync between devices.
ggml_cann_set_device(cann_ctx_dst->device);
ACL_CHECK(aclrtDeviceEnablePeerAccess(cann_ctx_src->device, 0));
ggml_cann_set_device(cann_ctx_src->device);
ACL_CHECK(aclrtDeviceEnablePeerAccess(cann_ctx_dst->device, 0));
@@ -2046,9 +2110,17 @@ static bool ggml_backend_cann_cpy_tensor_async(
ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size,
ACL_MEMCPY_DEVICE_TO_DEVICE,
cann_ctx_src->stream()));
// record event on src stream after the copy
// TODO: this event is not effective with acl graph mode, change to use aclrtSynchronizeStream
// if (!cann_ctx_src->copy_event) {
// ACL_CHECK(aclrtCreateEventWithFlag(&cann_ctx_src->copy_event, ACL_EVENT_SYNC));
// }
// ACL_CHECK(aclrtRecordEvent(cann_ctx_src->copy_event, cann_ctx_src->stream()));
//TODO: workaround for Event didn`t work here.
aclrtSynchronizeStream(cann_ctx_src->stream());
// // wait on dst stream for the copy to complete
// ggml_cann_set_device(cann_ctx_dst->device);
// ACL_CHECK(aclrtStreamWaitEvent(cann_ctx_dst->stream(), cann_ctx_src->copy_event));
ACL_CHECK(aclrtSynchronizeStream(cann_ctx_src->stream()));
} else {
// src and dst are on the same backend
ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size,
@@ -2077,30 +2149,52 @@ static void ggml_backend_cann_synchronize(ggml_backend_t backend) {
#ifdef USE_ACL_GRAPH
/**
* @brief Populate the internal CANN graph node properties from the ggml computation graph.
* @brief Add a new CANN graph to the LRU cache by populating node properties from the ggml graph.
*
* This function copies all node attributes (operation type, dimensions, strides, input sources,
* and operation parameters) into the cached CANN graph structure for later reuse or comparison.
* This function creates a new ggml_cann_graph object and fills its node properties
* (operation type, dimensions, strides, input sources, and operation parameters)
* based on the current ggml computation graph.
*
* @param cann_ctx The CANN backend context.
* @param cgraph The ggml computational graph.
* Each node in the ggml graph is mapped to a property entry in the new CANN graph:
* - node address
* - operation type
* - shape (ne) and strides (nb)
* - source tensor addresses
* - operation parameters
*
* After initialization, the new graph is pushed into the LRU cache owned by the
* CANN backend context. The cache takes ownership of the graph and manages its
* lifetime (including deletion upon eviction).
*
* @param cann_ctx The CANN backend context containing the graph cache.
* @param cgraph The current ggml computation graph.
*/
static void set_ggml_graph_node_properties(ggml_backend_cann_context * cann_ctx, ggml_cgraph * cgraph) {
for (int node_idx = 0; node_idx < cgraph->n_nodes; node_idx++) {
ggml_tensor * node = cgraph->nodes[node_idx];
cann_ctx->cann_graph->ggml_graph_properties[node_idx].node_address = node->data;
cann_ctx->cann_graph->ggml_graph_properties[node_idx].node_op = node->op;
static void add_lru_matched_graph_node_properties(
ggml_backend_cann_context * cann_ctx,
ggml_cgraph * cgraph) {
// Create a new ggml_cann_graph object on the heap (its lifetime is managed by the cache).
ggml_cann_graph * new_graph = new ggml_cann_graph();
new_graph->ggml_graph_properties.resize(cgraph->n_nodes);
for (int dim = 0; dim < GGML_MAX_DIMS; dim++) {
cann_ctx->cann_graph->ggml_graph_properties[node_idx].ne[dim] = node->ne[dim];
cann_ctx->cann_graph->ggml_graph_properties[node_idx].nb[dim] = node->nb[dim];
for (int node_idx = 0; node_idx < cgraph->n_nodes; ++node_idx) {
ggml_tensor * node = cgraph->nodes[node_idx];
auto & prop = new_graph->ggml_graph_properties[node_idx];
prop.node_address = node->data;
prop.node_op = node->op;
std::copy_n(node->ne, GGML_MAX_DIMS, prop.ne);
std::copy_n(node->nb, GGML_MAX_DIMS, prop.nb);
for (int src = 0; src < GGML_MAX_SRC; ++src) {
prop.src_address[src] = node->src[src] ? node->src[src]->data : nullptr;
}
for (int src = 0; src < GGML_MAX_SRC; src++) {
cann_ctx->cann_graph->ggml_graph_properties[node_idx].src_address[src] =
node->src[src] ? node->src[src]->data : nullptr;
}
memcpy(cann_ctx->cann_graph->ggml_graph_properties[node_idx].op_params, node->op_params, GGML_MAX_OP_PARAMS);
memcpy(prop.op_params, node->op_params, GGML_MAX_OP_PARAMS);
}
// Insert into the LRU cache (cache takes ownership and will delete it when evicted).
cann_ctx->graph_lru_cache.push(new_graph);
}
/**
@@ -2145,30 +2239,45 @@ static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_gra
}
/**
* @brief Determine if the CANN graph needs to be rebuilt due to graph changes.
* @brief Check whether there is a cached CANN graph that matches the current ggml graph.
*
* This checks whether the number or properties of ggml graph nodes have changed
* compared to the last captured CANN graph. If so, the CANN graph must be re-captured.
* This function iterates through the cached CANN graphs stored in the LRU cache and
* compares them against the given ggml computation graph. A match requires that the
* number of nodes is the same and that each nodes properties (operation type,
* dimensions, strides, inputs, and operation parameters) are identical.
*
* @param cann_ctx The CANN backend context.
* If a matching graph is found, it is promoted to the front of the LRU cache and the
* function returns true. Otherwise, the function returns false, indicating that a new
* CANN graph needs to be captured.
*
* @param cann_ctx The CANN backend context containing the graph cache.
* @param cgraph The current ggml computation graph.
* @return true if an update is required; false otherwise.
* @return true if a matching cached graph exists; false otherwise.
*/
static bool is_cann_graph_update_required(ggml_backend_cann_context * cann_ctx, ggml_cgraph * cgraph) {
// The number of nodes is different, so the graph needs to be reconstructed.
if (cann_ctx->cann_graph->ggml_graph_properties.size() != (size_t)cgraph->n_nodes) {
cann_ctx->cann_graph->ggml_graph_properties.resize(cgraph->n_nodes);
return true;
}
static bool is_matched_graph(ggml_backend_cann_context * cann_ctx, ggml_cgraph * cgraph) {
ggml_cann_graph_lru_cache &lru_cache = cann_ctx->graph_lru_cache;
for (auto &graph_ptr : lru_cache.cache_list) {
// Skip graphs with a different number of nodes.
if (graph_ptr->ggml_graph_properties.size() != static_cast<size_t>(cgraph->n_nodes)) {
continue;
}
// The number of nodes is the same; iterate over each node to check whether they match.
for (int i = 0; i < cgraph->n_nodes; i++) {
bool has_matching_properties = ggml_graph_node_has_matching_properties(
cgraph->nodes[i], &cann_ctx->cann_graph->ggml_graph_properties[i]);
if(!has_matching_properties) {
// Check if all nodes match.
bool all_match = true;
for (int i = 0; i < cgraph->n_nodes; ++i) {
if (!ggml_graph_node_has_matching_properties(cgraph->nodes[i], &graph_ptr->ggml_graph_properties[i])) {
all_match = false;
break;
}
}
if (all_match) {
// update cache_list && renturn graph_ptr
lru_cache.move_to_front(graph_ptr);
return true;
}
}
return false;
}
#endif // USE_ACL_GRAPH
@@ -2187,17 +2296,13 @@ static bool is_cann_graph_update_required(ggml_backend_cann_context * cann_ctx,
* @param cann_graph_update_required Whether graph capture is needed due to graph changes.
*/
static void evaluate_and_capture_cann_graph(ggml_backend_cann_context * cann_ctx, ggml_cgraph * cgraph,
bool & use_cann_graph, bool & cann_graph_update_required) {
bool & use_cann_graph, bool & cann_graph_update_required) {
#ifdef USE_ACL_GRAPH
ggml_cann_graph* matched_graph = cann_ctx->graph_lru_cache.cache_list.front();
if (use_cann_graph && cann_graph_update_required) {
if (cann_ctx->cann_graph->graph != nullptr) {
ACL_CHECK(aclmdlRIDestroy(cann_ctx->cann_graph->graph));
cann_ctx->cann_graph->graph = nullptr;
}
ACL_CHECK(aclmdlRICaptureBegin(cann_ctx->stream(), ACL_MODEL_RI_CAPTURE_MODE_GLOBAL));
}
#endif // USE_ACL_GRAPH
// Only perform the graph execution if CANN graphs are not enabled, or we are capturing the graph.
// With the use of CANN graphs, the execution will be performed by the graph launch.
if (!use_cann_graph || cann_graph_update_required) {
@@ -2218,12 +2323,12 @@ static void evaluate_and_capture_cann_graph(ggml_backend_cann_context * cann_ctx
#ifdef USE_ACL_GRAPH
if (use_cann_graph && cann_graph_update_required) { // End CANN graph capture
ACL_CHECK(aclmdlRICaptureEnd(cann_ctx->stream(), &cann_ctx->cann_graph->graph));
ACL_CHECK(aclmdlRICaptureEnd(cann_ctx->stream(), &matched_graph->graph));
}
if (use_cann_graph) {
// Execute graph
ACL_CHECK(aclmdlRIExecuteAsync(cann_ctx->cann_graph->graph, cann_ctx->stream()));
ACL_CHECK(aclmdlRIExecuteAsync(matched_graph->graph, cann_ctx->stream()));
}
#endif // USE_ACL_GRAPH
}
@@ -2246,26 +2351,46 @@ static enum ggml_status ggml_backend_cann_graph_compute(
ggml_backend_cann_context* cann_ctx =
(ggml_backend_cann_context*)backend->context;
ggml_cann_set_device(cann_ctx->device);
release_nz_workspace();
g_nz_workspaces[cann_ctx->device].clear();
// calculate rope cache for fist layer in current device.
cann_ctx->rope_cache.cached = false;
#ifdef USE_ACL_GRAPH
bool use_cann_graph = true;
bool cann_graph_update_required = false;
if (use_cann_graph) {
if (cann_ctx->cann_graph == nullptr) {
cann_ctx->cann_graph.reset(new ggml_cann_graph());
cann_graph_update_required = true;
static bool prefill_use_graph = parse_bool(get_env("GGML_CANN_PREFILL_USE_GRAPH").value_or(""));
if (!prefill_use_graph) {
// Do not use acl_graph for prefill.
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_tensor * node = cgraph->nodes[i];
// TODO: Optimize here. Currently, we can only
// get seq_len by FA's input.
if (node->op == GGML_OP_FLASH_ATTN_EXT) {
// Q -> src[0], shape: [B, S, N, D]
use_cann_graph = (node->src[0]->ne[1] == 1);
break;
}
}
}
cann_graph_update_required = is_cann_graph_update_required(cann_ctx, cgraph);
set_ggml_graph_node_properties(cann_ctx, cgraph);
if (!cann_ctx->acl_graph_mode) {
use_cann_graph = false;
}
if (use_cann_graph) {
// If no matching graph is found, the graph needs to be recaptured.
cann_graph_update_required = !is_matched_graph(cann_ctx, cgraph);
if (cann_graph_update_required) {
// If no matching graph is found, add a new ACL graph.
add_lru_matched_graph_node_properties(cann_ctx, cgraph);
}
}
#else
bool use_cann_graph = false;
bool cann_graph_update_required = false;
#endif // USE_ACL_GRAPH
evaluate_and_capture_cann_graph(
cann_ctx,
cgraph,
@@ -2401,16 +2526,10 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
}
case GGML_OP_ROPE: {
// TODO: with ops-test v == 1
float ext_factor = 0.0f;
memcpy(&ext_factor, (const float *) op->op_params + 7, sizeof(float));
// TODO: n_dims <= ne0
if (op->src[0]->ne[0] != op->op_params[1]) {
return false;
}
// TODO: ext_factor != 0
if (ext_factor != 0) {
return false;
}
const int mode = ((const int32_t *) op->op_params)[2];
if (mode & GGML_ROPE_TYPE_MROPE) {
@@ -2419,10 +2538,11 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
if (mode & GGML_ROPE_TYPE_VISION) {
return false;
}
#ifdef ASCEND_310P
if(!ggml_is_contiguous(op->src[0])){
return false;
}
#endif
return true;
}
case GGML_OP_UPSCALE: {
@@ -2484,12 +2604,14 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
case GGML_OP_ARGMAX:
case GGML_OP_COS:
case GGML_OP_SIN:
case GGML_OP_CONV_TRANSPOSE_1D:
case GGML_OP_LOG:
case GGML_OP_MEAN:
case GGML_OP_PAD_REFLECT_1D:
case GGML_OP_COUNT_EQUAL:
return true;
case GGML_OP_CONV_TRANSPOSE_1D:
// TODO: ((weightL - 1) * dilationW - padLeft)=1336 should not be larger than 255.
return (op->src[0]->ne[0] - 1) <= 255;
case GGML_OP_SCALE:
float bias;
memcpy(&bias, (const float *)(op->op_params) + 1, sizeof(float));
@@ -2523,13 +2645,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
// different head sizes of K and V are not supported yet
return false;
}
if (op->src[0]->ne[0] == 192) {
return false;
}
if (op->src[0]->ne[0] == 576) {
// DeepSeek MLA
return false;
}
if (op->src[0]->ne[0] % 16 != 0) {
// TODO: padding to support
return false;
@@ -2642,6 +2757,7 @@ static const ggml_backend_i ggml_backend_cann_interface = {
/* .graph_compute = */ ggml_backend_cann_graph_compute,
/* .event_record = */ ggml_backend_cann_event_record,
/* .event_wait = */ ggml_backend_cann_event_wait,
/* .optimize_graph = */ NULL,
};
/**

View File

@@ -224,7 +224,13 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
foreach(feature DOTPROD SVE MATMUL_INT8 FMA FP16_VECTOR_ARITHMETIC SME)
string(FIND "${ARM_FEATURE}" "__ARM_FEATURE_${feature} 1" feature_pos)
if (NOT ${feature_pos} EQUAL -1)
message(STATUS "ARM feature ${feature} enabled")
# Special handling for MATMUL_INT8 when machine doesn't support i8mm
if ("${feature}" STREQUAL "MATMUL_INT8" AND GGML_MACHINE_SUPPORTS_noi8mm)
message(STATUS "ARM feature ${feature} detected but unsetting due to machine not supporting i8mm")
list(APPEND ARCH_FLAGS -U__ARM_FEATURE_MATMUL_INT8)
else()
message(STATUS "ARM feature ${feature} enabled")
endif()
endif()
endforeach()
endif()
@@ -433,15 +439,22 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
ggml-cpu/arch/riscv/quants.c
ggml-cpu/arch/riscv/repack.cpp
)
if (GGML_RVV)
if (GGML_XTHEADVECTOR)
list(APPEND ARCH_FLAGS -march=rv64gc_zfhmin_xtheadvector -mabi=lp64d)
elseif (GGML_RV_ZFH)
list(APPEND ARCH_FLAGS -march=rv64gcv_zfhmin -mabi=lp64d)
else()
list(APPEND ARCH_FLAGS -march=rv64gcv -mabi=lp64d)
set(MARCH_STR "rv64gc")
if (GGML_RV_ZFH)
string(APPEND MARCH_STR "_zfh")
endif()
if (GGML_XTHEADVECTOR)
string(APPEND MARCH_STR "_xtheadvector")
elseif (GGML_RVV)
string(APPEND MARCH_STR "_v")
if (GGML_RV_ZVFH)
string(APPEND MARCH_STR "_zvfh")
endif()
endif()
if (GGML_RV_ZICBOP)
string(APPEND MARCH_STR "_zicbop")
endif()
list(APPEND ARCH_FLAGS "-march=${MARCH_STR}" -mabi=lp64d)
elseif (GGML_SYSTEM_ARCH STREQUAL "s390x")
message(STATUS "s390x detected")
list(APPEND GGML_CPU_SOURCES ggml-cpu/arch/s390/quants.c)
@@ -450,7 +463,6 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
# TODO: Separation to determine activation of VX/VXE/VXE2
if (${S390X_M} MATCHES "8561|8562")
set(GGML_NNPA OFF)
message(STATUS "z15 target")
list(APPEND ARCH_FLAGS -march=z15)
elseif (${S390X_M} MATCHES "3931")
@@ -472,11 +484,6 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
list(APPEND ARCH_FLAGS -mvx -mzvector)
list(APPEND ARCH_DEFINITIONS GGML_VXE)
endif()
if (GGML_NNPA)
message(STATUS "NNPA enabled")
list(APPEND ARCH_DEFINITIONS GGML_NNPA)
endif()
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "wasm")
message(STATUS "Wasm detected")
list (APPEND GGML_CPU_SOURCES ggml-cpu/arch/wasm/quants.c)

View File

@@ -1270,29 +1270,40 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
const float d = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].d);
const float dmin = y[i].d * GGML_CPU_FP16_TO_FP32(x[i].dmin);
int tmp, tmp2, sumi;
float ftmp, ft2;
const uint8_t * restrict q40;
const uint8_t * restrict q41;
const uint8_t * restrict q42;
const uint8_t * restrict q43;
const int8_t * restrict q80;
const int8_t * restrict q81;
const int8_t * restrict q82;
const int8_t * restrict q83;
int s0, s1, s2, s3;
__asm__ __volatile__(
"vsetivli zero, 12, e8, m1\n\t"
"vle8.v v1, (%[s6b])\n\t" // {aux[0], aux[1], aux[2]}
"vsetivli zero, 4, e32, m1\n\t"
"li %[s1], 8\n\t"
"vsetivli zero, 4, e32, m1, ta, ma\n\t"
"vle32.v v1, (%[s6b])\n\t"
"vslide1down.vx v1, v1, zero\n\t"
"vmv.v.x v16, zero\n\t"
"vslidedown.vi v2, v1, 2\n\t"
"vmv1r.v v3, v2\n\t"
"vslideup.vi v2, v3, 1\n\t" // {aux[2], aux[2]}
"vsetivli zero, 2, e32, m1\n\t"
"vsetivli zero, 2, e32, m1, ta, ma\n\t"
"vmv.v.i v4, 4\n\t"
"vand.vx v8, v1, %[kmask1]\n\t"
"vslide1up.vx v5, v4, zero\n\t" // {0, 4}
"vsrl.vi v6, v1, 6\n\t"
"vsrl.vv v7, v2, v5\n\t"
"vsse32.v v8, (%[utmp]), %[s1]\n\t"
"vand.vx v0, v6, %[kmask3]\n\t"
"vand.vx v2, v7, %[kmask2]\n\t"
"vsll.vi v6, v0, 4\n\t"
"li %[t2], 8\n\t"
"addi %[t1], %[utmp], 4\n\t"
"addi %[s0], %[utmp], 4\n\t"
"vor.vv v1, v6, v2\n\t"
"vsse32.v v8, (%[utmp]), %[t2]\n\t"
"vsse32.v v1, (%[t1]), %[t2]\n\t"
"vsetivli zero, 8, e16, m1\n\t"
"vsse32.v v1, (%[s0]), %[s1]\n\t"
"vsetivli zero, 8, e16, m1, ta, ma\n\t"
"vle32.v v2, (%[bsums])\n\t"
"vnsrl.wi v0, v2, 0\n\t"
"vnsrl.wi v1, v2, 16\n\t"
@@ -1300,13 +1311,131 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
"vle8.v v3, (%[mins])\n\t"
"vzext.vf2 v4, v3\n\t"
"vwmul.vv v6, v4, v2\n\t"
"vsetivli zero, 4, e32, m1, ta, ma\n\t"
"vredsum.vs v0, v6, v16\n\t"
"vredsum.vs v0, v7, v0\n\t"
"vfcvt.f.x.v v0, v0\n\t"
"vfmv.f.s %[ftmp], v0\n\t"
"vsetivli zero, 16, e8, m1, ta, ma\n\t"
"vle8.v v0, (%[xs])\n\t"
"fnmsub.s %[sumf], %[dmin], %[ftmp], %[sumf]\n\t"
"addi %[q40], %[xs], 64\n\t"
"addi %[q41], %[xs], 16\n\t"
"addi %[q42], %[xs], 32\n\t"
"addi %[q43], %[xs], 48\n\t"
"addi %[q80], %[ys], 64\n\t"
"vle8.v v1, (%[q41])\n\t"
"vle8.v v2, (%[q42])\n\t"
"addi %[q81], %[ys], 16\n\t"
"addi %[q41], %[q41], 64\n\t"
"addi %[q82], %[ys], 32\n\t"
"vle8.v v3, (%[q43])\n\t"
"vle8.v v8, (%[ys])\n\t"
"addi %[q42], %[q42], 64\n\t"
"addi %[q83], %[ys], 48\n\t"
"addi %[q43], %[q43], 64\n\t"
"vsrl.vi v4, v0, 4\n\t"
"vle8.v v9, (%[q81])\n\t"
"vle8.v v10, (%[q82])\n\t"
"vand.vi v0, v0, 0xF\n\t"
"addi %[q81], %[q81], 64\n\t"
"vsrl.vi v5, v1, 4\n\t"
"addi %[q82], %[q82], 64\n\t"
"vle8.v v11, (%[q83])\n\t"
"vle8.v v12, (%[q80])\n\t"
"vand.vi v1, v1, 0xF\n\t"
"addi %[q83], %[q83], 64\n\t"
"vsrl.vi v6, v2, 4\n\t"
"addi %[q80], %[q80], 64\n\t"
"vle8.v v13, (%[q81])\n\t"
"vle8.v v14, (%[q82])\n\t"
"vand.vi v2, v2, 0xF\n\t"
"addi %[q81], %[q81], 64\n\t"
"vsrl.vi v7, v3, 4\n\t"
"addi %[q82], %[q82], 64\n\t"
"vwmul.vv v16, v0, v8\n\t"
"vle8.v v15, (%[q83])\n\t"
"vle8.v v0, (%[q40])\n\t"
"vand.vi v3, v3, 0xF\n\t"
"addi %[q83], %[q83], 64\n\t"
"vwmul.vv v24, v2, v12\n\t"
"vwmul.vv v20, v4, v10\n\t"
"vwmul.vv v28, v6, v14\n\t"
"vwmacc.vv v16, v1, v9\n\t"
"vle8.v v1, (%[q41])\n\t"
"vle8.v v2, (%[q42])\n\t"
"vwmacc.vv v24, v3, v13\n\t"
"vwmacc.vv v20, v5, v11\n\t"
"vwmacc.vv v28, v7, v15\n\t"
"addi %[q40], %[q80], 64\n\t"
"addi %[q41], %[q81], 64\n\t"
"vle8.v v3, (%[q43])\n\t"
"vle8.v v8, (%[q80])\n\t"
"addi %[q42], %[q82], 64\n\t"
"addi %[q43], %[q83], 64\n\t"
"vsrl.vi v4, v0, 4\n\t"
"vle8.v v9, (%[q81])\n\t"
"vle8.v v10, (%[q82])\n\t"
"vand.vi v0, v0, 0xF\n\t"
"vsrl.vi v5, v1, 4\n\t"
"vsrl.vi v7, v3, 4\n\t"
"vand.vi v3, v3, 0xF\n\t"
"vle8.v v11, (%[q83])\n\t"
"vle8.v v12, (%[q40])\n\t"
"vand.vi v1, v1, 0xF\n\t"
"vsrl.vi v6, v2, 4\n\t"
"vand.vi v2, v2, 0xF\n\t"
"vwmul.vv v18, v0, v8\n\t"
"vle8.v v13, (%[q41])\n\t"
"vle8.v v14, (%[q42])\n\t"
"vwmul.vv v26, v2, v12\n\t"
"vwmul.vv v22, v4, v10\n\t"
"vwmul.vv v30, v6, v14\n\t"
"vwmacc.vv v18, v1, v9\n\t"
"vle8.v v15, (%[q43])\n\t"
"vwmacc.vv v26, v3, v13\n\t"
"vwmacc.vv v22, v5, v11\n\t"
"vwmacc.vv v30, v7, v15\n\t"
"vmv.v.x v0, zero\n\t"
"vsetivli zero, 8, e32, m2\n\t"
"vredsum.vs v0, v6, v0\n\t"
"vmv.x.s %[sumi], v0"
: [t1] "=&r" (tmp), [t2] "=&r" (tmp2), [sumi] "=&r" (sumi)
: [bsums] "r" (y[i].bsums), [mins] "r" (mins), [utmp] "r" (utmp)
, [s6b] "r" (x[i].scales), [kmask1] "r" (kmask1)
"vsetivli zero, 16, e16, m2, ta, ma\n\t"
"vwredsum.vs v4, v16, v0\n\t"
"lbu %[s0], 0(%[scale])\n\t"
"vwredsum.vs v5, v20, v0\n\t"
"lbu %[s1], 1(%[scale])\n\t"
"vwredsum.vs v6, v24, v0\n\t"
"lbu %[s2], 2(%[scale])\n\t"
"vwredsum.vs v7, v28, v0\n\t"
"lbu %[s3], 3(%[scale])\n\t"
"vwredsum.vs v8, v18, v0\n\t"
"lbu %[q40], 4(%[scale])\n\t"
"vwredsum.vs v9, v22, v0\n\t"
"lbu %[q41], 5(%[scale])\n\t"
"vwredsum.vs v10, v26, v0\n\t"
"lbu %[q42], 6(%[scale])\n\t"
"vwredsum.vs v11, v30, v0\n\t"
"lbu %[q43], 7(%[scale])\n\t"
"vsetivli zero, 4, e32, m1, ta, ma\n\t"
"vmul.vx v0, v4, %[s0]\n\t"
"vmul.vx v1, v8, %[q40]\n\t"
"vmacc.vx v0, %[s1], v5\n\t"
"vmacc.vx v1, %[q41], v9\n\t"
"vmacc.vx v0, %[s2], v6\n\t"
"vmacc.vx v1, %[q42], v10\n\t"
"vmacc.vx v0, %[s3], v7\n\t"
"vmacc.vx v1, %[q43], v11\n\t"
"vfcvt.f.x.v v0, v0\n\t"
"vfcvt.f.x.v v1, v1\n\t"
"vfmv.f.s %[ft2], v0\n\t"
"vfmv.f.s %[ftmp], v1\n\t"
"fadd.s %[ft2], %[ft2], %[ftmp]\n\t"
"fmadd.s %[sumf], %[d], %[ft2], %[sumf]"
: [ftmp] "=&f" (ftmp), [sumf] "+&f" (sumf), [ft2] "=&f" (ft2)
, [s0] "=&r" (s0), [s1] "=&r" (s1), [s2] "=&r" (s2), [s3] "=&r" (s3)
, [q40] "=&r" (q40), [q41] "=&r" (q41), [q42] "=&r" (q42), [q43] "=&r" (q43)
, [q80] "=&r" (q80), [q81] "=&r" (q81), [q82] "=&r" (q82), [q83] "=&r" (q83)
: [d] "f" (d), [ys] "r" (y[i].qs), [xs] "r" (x[i].qs), [scale] "r" (scales)
, [bsums] "r" (y[i].bsums), [mins] "r" (mins), [utmp] "r" (utmp)
, [s6b] "r" (&x[i]), [kmask1] "r" (kmask1), [dmin] "f" (dmin)
, [kmask2] "r" (kmask2), [kmask3] "r" (kmask3)
: "memory"
, "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
@@ -1314,59 +1443,6 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
, "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
, "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
);
sumf -= dmin * sumi;
const uint8_t * restrict q4 = x[i].qs;
const int8_t * restrict q8 = y[i].qs;
sumi = 0;
const uint8_t * scale = scales;
for (int j = 0; j < QK_K/128; ++j) {
int vl128 = 128, vl64 = 64, vl32 = 32;
__asm__ __volatile__(
"vsetvli zero, %[vl128], e8, m8\n\t"
"vle8.v v8, (%[q8])\n\t"
"vsetvli zero, %[vl64], e8, m4\n\t"
"vle8.v v0, (%[q4])\n\t"
"vsrl.vi v4, v0, 4\n\t"
"vand.vi v0, v0, 0xF\n\t"
"vsetvli zero, %[vl32], e8, m2\n\t"
"vwmul.vv v28, v6, v14\n\t"
"vwmul.vv v20, v4, v10\n\t"
"vwmul.vv v24, v2, v12\n\t"
"vwmul.vv v16, v0, v8\n\t"
"vsetivli zero, 4, e32, m1\n\t"
"vle8.v v2, (%[scale])\n\t"
"vmv.v.x v0, zero\n\t"
"vzext.vf4 v1, v2\n\t"
"vsetvli zero, %[vl32], e16, m4\n\t"
"vwredsum.vs v6, v24, v0\n\t"
"vwredsum.vs v7, v28, v0\n\t"
"vwredsum.vs v4, v16, v0\n\t"
"vwredsum.vs v5, v20, v0\n\t"
"vsetivli zero, 4, e32, m1\n\t"
"vslideup.vi v6, v7, 1\n\t"
"vslideup.vi v4, v5, 1\n\t"
"vslideup.vi v4, v6, 2\n\t"
"vmul.vv v8, v4, v1\n\t"
"vredsum.vs v0, v8, v0\n\t"
"vmv.x.s %[tmp], v0\n\t"
"add %[sumi], %[sumi], %[tmp]"
: [tmp] "=&r" (tmp), [sumi] "+&r" (sumi)
: [vl128] "r" (vl128), [vl64] "r" (vl64), [vl32] "r" (vl32)
, [q4] "r" (q4), [q8] "r" (q8), [scale] "r" (scale)
: "memory"
, "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
, "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
, "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
, "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
);
q4 += 64; q8 += 128; scale += 4;
}
sumf += d * sumi;
}
break;
default:
@@ -1693,6 +1769,8 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
case 128:
for (int i = 0; i < nb; ++i) {
__builtin_prefetch(&x[i + 1].d, 0, 1);
const float d = GGML_CPU_FP16_TO_FP32(x[i].d) * y[i].d;
const uint8_t * restrict q6 = x[i].ql;
@@ -1701,23 +1779,59 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
const int8_t * restrict scale = x[i].scales;
int sum_t = 0;
int t0;
int q6h;
float ftmp;
for (int j = 0; j < QK_K/128; ++j) {
__asm__ __volatile__(
"addi %[q6h], %[q6], 32\n\t"
"ld t0, 0(%[scale])\n\t"
"addi %[scale], %[scale], 8\n\t"
"slli t6, t0, 1 * 8\n\t"
"lb zero, 0(%[q6])\n\t"
"slli t5, t0, 2 * 8\n\t"
"slli t4, t0, 3 * 8\n\t"
"lb zero, 0(%[q6h])\n\t"
"slli t3, t0, 4 * 8\n\t"
"slli t2, t0, 5 * 8\n\t"
"lb zero, 0(%[qh])\n\t"
"lb zero, 31(%[q6h])\n\t"
"slli t1, t0, 6 * 8\n\t"
"srai a7, t0, 56\n\t"
"vsetvli zero, %[vl32], e8, m2\n\t"
"vle8.v v8, (%[q6])\n\t"
"srai t6, t6, 56\n\t"
"srai t5, t5, 56\n\t"
"srai t4, t4, 56\n\t"
"srai t3, t3, 56\n\t"
"vle8.v v10, (%[q6h])\n\t"
"addi %[q6], %[q6], 64\n\t"
"slli t0, t0, 7 * 8\n\t"
"srai t2, t2, 56\n\t"
"srai t1, t1, 56\n\t"
"srai t0, t0, 56\n\t"
"vle8.v v4, (%[qh])\n\t"
"vsrl.vi v12, v8, 4\n\t"
"vsrl.vi v14, v10, 4\n\t"
"lb zero, 0(%[q8])\n\t"
"vand.vi v8, v8, 0xF\n\t"
"vand.vi v10, v10, 0xF\n\t"
"lb zero, 32(%[q8])\n\t"
"vsll.vi v0, v4, 4\n\t"
"vsll.vi v2, v4, 2\n\t"
"lb zero, 64(%[q8])\n\t"
"vsrl.vi v6, v4, 2\n\t"
"vsetvli zero, %[vl64], e8, m4\n\t"
"vle8.v v8, (%[q6])\n\t"
"vsrl.vi v12, v8, 4\n\t"
"vand.vi v8, v8, 0xF\n\t"
"vsetvli zero, %[vl128], e8, m8\n\t"
"vand.vx v0, v0, %[mask]\n\t"
"lb zero, 96(%[q8])\n\t"
"vand.vx v2, v2, %[mask]\n\t"
"vand.vx v4, v4, %[mask]\n\t"
"vand.vx v6, v6, %[mask]\n\t"
"vor.vv v8, v8, v0\n\t"
"lb zero, 127(%[q8])\n\t"
"vor.vv v10, v10, v2\n\t"
"vor.vv v12, v12, v4\n\t"
"vor.vv v14, v14, v6\n\t"
"vsetvli zero, %[vl128], e8, m8\n\t"
"vle8.v v0, (%[q8])\n\t"
"vsub.vx v8, v8, %[vl32]\n\t"
"vsetvli zero, %[vl64], e8, m4\n\t"
@@ -1734,34 +1848,34 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
"vwredsum.vs v13, v28, v0\n\t"
"vwredsum.vs v14, v30, v0\n\t"
"vsetivli zero, 4, e32, m1\n\t"
"vslideup.vi v10, v9, 1\n\t"
"vslideup.vi v8, v7, 1\n\t"
"vslideup.vi v11, v12, 1\n\t"
"vslideup.vi v13, v14, 1\n\t"
"vslideup.vi v10, v8, 2\n\t"
"vslideup.vi v11, v13, 2\n\t"
"vsetivli zero, 8, e32, m2\n\t"
"vle8.v v2, (%[scale])\n\t"
"vsext.vf4 v4, v2\n\t"
"vmul.vv v2, v4, v10\n\t"
"vredsum.vs v0, v2, v0\n\t"
"vmv.x.s %[t0], v0\n\t"
"add %[sumi], %[sumi], %[t0]"
: [sumi] "+&r" (sum_t), [t0] "=&r" (t0)
: [qh] "r" (qh), [q6] "r" (q6), [q8] "r" (q8), [scale] "r" (scale)
"vmul.vx v0, v10, t0\n\t"
"vmul.vx v1, v9, t1\n\t"
"vmacc.vx v0, t2, v8\n\t"
"vmacc.vx v1, t3, v7\n\t"
"vmacc.vx v0, t4, v11\n\t"
"vmacc.vx v1, t5, v12\n\t"
"vmacc.vx v0, t6, v13\n\t"
"vmacc.vx v1, a7, v14\n\t"
"vadd.vv v0, v0, v1\n\t"
"vfcvt.f.x.v v0, v0\n\t"
"vfmv.f.s %[ftmp], v0\n\t"
"fmadd.s %[sumf], %[d], %[ftmp], %[sumf]"
: [q6] "+&r" (q6), [q6h] "=&r" (q6h)
, [scale] "+&r" (scale)
, [sumf] "+&f" (sumf), [ftmp] "=&f" (ftmp)
: [qh] "r" (qh), [q8] "r" (q8)
, [vl32] "r" (32), [vl64] "r" (64), [vl128] "r" (128)
, [mask] "r" (0x30)
, [mask] "r" (0x30), [d] "f" (d)
: "memory"
, "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
, "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
, "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"
, "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
, "t0", "t1", "t2", "t3", "t4", "t5", "t6", "a7"
, "a6", "a5", "a4", "a3"
);
q6 += 64; qh += 32; q8 += 128; scale += 8;
qh += 32; q8 += 128;
}
sumf += d * sum_t;
}
break;
default:

View File

@@ -53,9 +53,9 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
#if defined(__VXE__) || defined(__VXE2__)
for (int i = 0; i < nb; i++) {
__vector float srcv [8];
__vector float asrcv[8];
__vector float amaxv[8];
float32x4_t srcv [8];
float32x4_t asrcv[8];
float32x4_t amaxv[8];
for (int j = 0; j < 8; j++) srcv[j] = vec_xl(0, x + i*32 + 4*j);
for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]);
@@ -74,8 +74,8 @@ void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
y[i].d = GGML_CPU_FP32_TO_FP16(d);
for (int j = 0; j < 8; j++) {
const __vector float v = vec_mul(srcv[j], vec_splats(id));
const __vector int32_t vi = vec_signed(v);
const float32x4_t v = vec_mul(srcv[j], vec_splats(id));
const int32x4_t vi = vec_signed(v);
y[i].qs[4*j + 0] = vec_extract(vi, 0);
y[i].qs[4*j + 1] = vec_extract(vi, 1);
@@ -98,9 +98,9 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
#if defined(__VXE__) || defined(__VXE2__)
for (int i = 0; i < nb; i++) {
__vector float srcv [8];
__vector float asrcv[8];
__vector float amaxv[8];
float32x4_t srcv [8];
float32x4_t asrcv[8];
float32x4_t amaxv[8];
for (int j = 0; j < 8; j++) srcv[j] = vec_xl(0, x + i*32 + 4*j);
for (int j = 0; j < 8; j++) asrcv[j] = vec_abs(srcv[j]);
@@ -118,11 +118,11 @@ void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, i
y[i].d = GGML_CPU_FP32_TO_FP16(d);
__vector int32_t acc = vec_splats(0);
int32x4_t acc = vec_splats(0);
for (int j = 0; j < 8; j++) {
const __vector float v = vec_mul(srcv[j], vec_splats(id));
const __vector int32_t vi = vec_signed(v);
const float32x4_t v = vec_mul(srcv[j], vec_splats(id));
const int32x4_t vi = vec_signed(v);
y[i].qs[4*j + 0] = vec_extract(vi, 0);
y[i].qs[4*j + 1] = vec_extract(vi, 1);
@@ -162,37 +162,36 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
float sumf = 0;
#if defined(__VXE__) || defined(__VXE2__)
__vector float acc = vec_splats(0.0f);
float32x4_t acc = vec_splats(0.0f);
const __vector uint8_t v_m = vec_splats((const uint8_t)0x0F);
const __vector int8_t v_s = vec_splats( (const int8_t)0x08);
const uint8x16_t v_m = vec_splats((const uint8_t)0x0F);
const int8x16_t v_s = vec_splats( (const int8_t)0x08);
for (; ib < nb; ++ib) {
const __vector uint8_t v_x = vec_xl(0, x[ib].qs);
const __vector int8_t v_xl = (const __vector int8_t)(v_x & v_m);
const __vector int8_t v_xh = (const __vector int8_t)(v_x >> 4);
const uint8x16_t v_x = vec_xl(0, x[ib].qs);
const int8x16_t v_xl = (const int8x16_t)(v_x & v_m);
const int8x16_t v_xh = (const int8x16_t)(v_x >> 4);
const __vector int8_t v_xls = vec_sub(v_xl, v_s);
const __vector int8_t v_xhs = vec_sub(v_xh, v_s);
const int8x16_t v_xls = vec_sub(v_xl, v_s);
const int8x16_t v_xhs = vec_sub(v_xh, v_s);
const __vector int8_t v_yl = vec_xl(0 , y[ib].qs);
const __vector int8_t v_yh = vec_xl(QK8_0/2, y[ib].qs);
const int8x16_t v_yl = vec_xl(0 , y[ib].qs);
const int8x16_t v_yh = vec_xl(QK8_0/2, y[ib].qs);
const __vector int16_t v_xylso = vec_mulo(v_xls, v_yl);
const __vector int16_t v_xylse = vec_mule(v_xls, v_yl);
const __vector int16_t v_xyhso = vec_mulo(v_xhs, v_yh);
const __vector int16_t v_xyhse = vec_mule(v_xhs, v_yh);
const int16x8_t v_xylso = vec_mulo(v_xls, v_yl);
const int16x8_t v_xylse = vec_mule(v_xls, v_yl);
const int16x8_t v_xyhso = vec_mulo(v_xhs, v_yh);
const int16x8_t v_xyhse = vec_mule(v_xhs, v_yh);
__vector int16_t v_xy_ = v_xylso + v_xylse + v_xyhso + v_xyhse; v_xy_ += vec_reve(v_xy_);
int16x8_t v_xy_ = v_xylso + v_xylse + v_xyhso + v_xyhse; v_xy_ += vec_reve(v_xy_);
const __vector float v_xy = vec_float(vec_unpackh(v_xy_));
const __vector float v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d));
const float32x4_t v_xy = vec_float(vec_unpackh(v_xy_));
const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d));
acc = vec_madd(v_xy, v_d, acc);
}
sumf = acc[0] + acc[1] + acc[2] + acc[3];
sumf = vec_hsum_f32x4(acc);
*s = sumf;
#else
UNUSED(nb);
@@ -249,8 +248,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
acc = vec_madd(v_xy, v_d, acc);
}
sumf = acc[0] + acc[1] + acc[2] + acc[3] + summs;
sumf = vec_hsum_f32x4(acc) + summs;
*s = sumf;
#else
UNUSED(nb);
@@ -351,7 +349,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
v_sum1 = vec_madd(v_xy1f, v_d1, v_sum1);
}
sumf += vec_hsum(v_sum0) + vec_hsum(v_sum1);
sumf += vec_hsum_f32x4(v_sum0) + vec_hsum_f32x4(v_sum1);
#pragma GCC unroll 4
for (; ib < nb; ++ib) {
@@ -390,7 +388,7 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d));
const float32x4_t v_acc = vec_madd(v_xyf, v_d, vec_splats(0.0f));
sumf += vec_hsum(v_acc);
sumf += vec_hsum_f32x4(v_acc);
}
*s = sumf;
@@ -502,7 +500,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
v_sum1 = vec_madd(v_xy1f, v_d1, v_sum1);
}
sumf += vec_hsum(v_sum0) + vec_hsum(v_sum1) + summs0 + summs1;
sumf += vec_hsum_f32x4(v_sum0) + vec_hsum_f32x4(v_sum1) + summs0 + summs1;
#pragma GCC unroll 4
for (; ib < nb; ++ib) {
@@ -543,7 +541,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi
const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d));
const float32x4_t v_acc = vec_madd(v_xyf, v_d, v_acc);
sumf += vec_hsum(v_acc) + summs;
sumf += vec_hsum_f32x4(v_acc) + summs;
}
*s = sumf;
@@ -575,7 +573,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
float sumf = 0;
#if defined(__VXE__) || defined(__VXE2__)
__vector float acc = vec_splats(0.0f);
float32x4_t acc = vec_splats(0.0f);
#pragma GCC unroll 8
for (; ib < nb; ++ib) {
@@ -594,7 +592,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
acc = vec_madd(v_xy, v_d, acc);
}
sumf = acc[0] + acc[1] + acc[2] + acc[3];
sumf = vec_hsum_f32x4(acc);
*s = sumf;
#else
@@ -718,10 +716,10 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
isum2 = ggml_vec_dot(v_z, q3bytes[2], q8bytes[6]);
isum3 = ggml_vec_dot(v_z, q3bytes[3], q8bytes[7]);
isum += (isum0[0] + isum0[1] + isum0[2] + isum0[3]) * scale[0];
isum += (isum1[0] + isum1[1] + isum1[2] + isum1[3]) * scale[1];
isum += (isum2[0] + isum2[1] + isum2[2] + isum2[3]) * scale[2];
isum += (isum3[0] + isum3[1] + isum3[2] + isum3[3]) * scale[3];
isum += vec_hsum_i32x4(isum0) * scale[0];
isum += vec_hsum_i32x4(isum1) * scale[1];
isum += vec_hsum_i32x4(isum2) * scale[2];
isum += vec_hsum_i32x4(isum3) * scale[3];
scale += 4;
@@ -819,7 +817,7 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
v_xl[1] = (int8x16_t)vec_and(v_x[1], v_lm);
const int32x4_t p1 = ggml_vec_dot(ggml_vec_dot(v_z, v_xl[0], v_y[0]), v_xl[1], v_y[1]);
sumi1 += (p1[0] + p1[1] + p1[2] + p1[3]) * scales[2*j+0];
sumi1 += vec_hsum_i32x4(p1) * scales[2*j+0];
v_y[0] = vec_xl(0 , y0);
v_y[1] = vec_xl(16, y0);
@@ -829,7 +827,7 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
v_xl[1] = (int8x16_t)vec_sr(v_x[1], 4);
const int32x4_t p2 = ggml_vec_dot(ggml_vec_dot(v_z, v_xl[0], v_y[0]), v_xl[1], v_y[1]);
sumi2 += (p2[0] + p2[1] + p2[2] + p2[3]) * scales[2*j+1];
sumi2 += vec_hsum_i32x4(p2) * scales[2*j+1];
}
sumf += d * (sumi1 + sumi2);
@@ -911,7 +909,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
const int32x4_t v_minsho = vec_mulo(v_ysums, v_minsh);
const int32x4_t v_minshe = vec_mule(v_ysums, v_minsh);
const int32x4_t v_mins = vec_add(v_minsho, v_minshe);
const int32_t mins = v_mins[0] + v_mins[1] + v_mins[2] + v_mins[3];
const int32_t mins = vec_hsum_i32x4(v_mins);
const uint8_t * scales = (const uint8_t *)utmp;
const uint8_t * GGML_RESTRICT x0l = x[i].qs;
@@ -948,8 +946,8 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
int32x4_t sumi0 = ggml_vec_dot(ggml_vec_dot(v_z, q5b[0], v_y[0]), q5b[1], v_y[1]);
int32x4_t sumi1 = ggml_vec_dot(ggml_vec_dot(v_z, q5b[2], v_y[2]), q5b[3], v_y[3]);
sumi += (sumi0[0] + sumi0[1] + sumi0[2] + sumi0[3]) * *scales++;
sumi += (sumi1[0] + sumi1[1] + sumi1[2] + sumi1[3]) * *scales++;
sumi += vec_hsum_i32x4(sumi0) * *scales++;
sumi += vec_hsum_i32x4(sumi1) * *scales++;
}
sumf += d * sumi - dmin * mins;
@@ -1020,7 +1018,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
const int32x4_t v_minshe = vec_mule(v_ysumsh, v_scaleh);
const int32x4_t v_mins = v_minslo + v_minsle + v_minsho + v_minshe;
const int32_t mins = v_mins[0] + v_mins[1] + v_mins[2] + v_mins[3];
const int32_t mins = vec_hsum_i32x4(v_mins);
int32_t isum = 0;
for (int j = 0; j < QK_K/128; ++j) {
@@ -1060,10 +1058,10 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
int32x4_t summs2 = ggml_vec_dot(v_z, q6b[2], v_y[2]);
int32x4_t summs3 = ggml_vec_dot(v_z, q6b[3], v_y[3]);
isum += (summs0[0] + summs0[1] + summs0[2] + summs0[3]) * scale[0] +
(summs1[0] + summs1[1] + summs1[2] + summs1[3]) * scale[1] +
(summs2[0] + summs2[1] + summs2[2] + summs2[3]) * scale[2] +
(summs3[0] + summs3[1] + summs3[2] + summs3[3]) * scale[3];
isum += vec_hsum_i32x4(summs0) * scale[0] +
vec_hsum_i32x4(summs1) * scale[1] +
vec_hsum_i32x4(summs2) * scale[2] +
vec_hsum_i32x4(summs3) * scale[3];
scale += 4;
@@ -1094,10 +1092,10 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
summs2 = ggml_vec_dot(v_z, q6b[2], v_y[2]);
summs3 = ggml_vec_dot(v_z, q6b[3], v_y[3]);
isum += (summs0[0] + summs0[1] + summs0[2] + summs0[3]) * scale[0] +
(summs1[0] + summs1[1] + summs1[2] + summs1[3]) * scale[1] +
(summs2[0] + summs2[1] + summs2[2] + summs2[3]) * scale[2] +
(summs3[0] + summs3[1] + summs3[2] + summs3[3]) * scale[3];
isum += vec_hsum_i32x4(summs0) * scale[0] +
vec_hsum_i32x4(summs1) * scale[1] +
vec_hsum_i32x4(summs2) * scale[2] +
vec_hsum_i32x4(summs3) * scale[3];
scale += 4;
}
@@ -1285,7 +1283,7 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const v
const int8x16_t v_yh = vec_xl(QK8_0/2, y0->qs);
const int32x4_t v_xy = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xl, v_yl), v_xh, v_yh);
sumf += GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d) * (v_xy[0] + v_xy[1] + v_xy[2] + v_xy[3]);
sumf += GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d) * vec_hsum_i32x4(v_xy);
}
*s = sumf;
@@ -1354,8 +1352,8 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const v
h >>= 4;
sumi1 += (vsumi0[0] + vsumi0[1] + vsumi0[2] + vsumi0[3]) * ls1;
sumi2 += (vsumi1[0] + vsumi1[1] + vsumi1[2] + vsumi1[3]) * ls2;
sumi1 += vec_hsum_i32x4(vsumi0) * ls1;
sumi2 += vec_hsum_i32x4(vsumi1) * ls2;
}
sumf += GGML_CPU_FP16_TO_FP32(x[ibl].d) * y[ibl].d * (sumi1 + sumi2);

View File

@@ -68,12 +68,6 @@ struct ggml_compute_params {
#endif // __VXE2__
#endif // __s390x__ && __VEC__
#if defined(__s390x__) && defined(GGML_NNPA)
#ifndef __NNPA__
#define __NNPA__
#endif // __NNPA__
#endif // __s390x__ && GGML_NNPA
#if defined(__ARM_FEATURE_SVE)
#include <sys/prctl.h>
#endif
@@ -489,11 +483,16 @@ inline static int16x8_t vec_padd_s16(int16x8_t a, int16x8_t b) {
/**
* @see https://github.com/ggml-org/llama.cpp/pull/14037
*/
inline static float vec_hsum(float32x4_t v) {
inline static float vec_hsum_f32x4(float32x4_t v) {
float32x4_t v_temp = v + vec_reve(v);
return v_temp[0] + v_temp[1];
}
inline static int32_t vec_hsum_i32x4(int32x4_t v) {
int32x4_t v_temp = v + vec_reve(v);
return v_temp[0] + v_temp[1];
}
inline static int32x4_t ggml_vec_dot(int32x4_t acc, int8x16_t a, int8x16_t b) {
const int16x8_t p = vec_mule(a, b) + vec_mulo(a, b);
return acc + (vec_unpackh(p) + vec_unpackl(p));

View File

@@ -373,6 +373,9 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
.vec_dot_type = GGML_TYPE_Q8_K,
.nrows = 1,
},
[GGML_TYPE_I32] = {
.from_float = (ggml_from_float_t) ggml_cpu_fp32_to_i32,
},
};
const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type) {
@@ -1876,6 +1879,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_im2col_back_f32(params, tensor);
} break;
case GGML_OP_IM2COL_3D:
{
ggml_compute_forward_im2col_3d(params, tensor);
} break;
case GGML_OP_CONV_2D:
{
ggml_compute_forward_conv_2d(params, tensor);
@@ -2255,6 +2262,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
} break;
case GGML_OP_IM2COL:
case GGML_OP_IM2COL_BACK:
case GGML_OP_IM2COL_3D:
case GGML_OP_CONV_2D:
case GGML_OP_CONV_3D:
case GGML_OP_CONV_2D_DW:
@@ -2691,7 +2699,10 @@ struct ggml_cplan ggml_graph_plan(
if (ggml_is_quantized(node->type) ||
// F16 -> BF16 and BF16 -> F16 copies go through intermediate F32
(node->src[0]->type == GGML_TYPE_F16 && node->src[1] && node->src[1]->type == GGML_TYPE_BF16) ||
(node->src[0]->type == GGML_TYPE_BF16 && node->src[1] && node->src[1]->type == GGML_TYPE_F16)) {
(node->src[0]->type == GGML_TYPE_BF16 && node->src[1] && node->src[1]->type == GGML_TYPE_F16) ||
// conversion between F32 and I32
(node->src[0]->type == GGML_TYPE_F32 && node->src[1] && node->src[1]->type == GGML_TYPE_I32) ||
(node->src[0]->type == GGML_TYPE_I32 && node->src[1] && node->src[1]->type == GGML_TYPE_F32)) {
cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
}
} break;
@@ -3206,20 +3217,12 @@ void ggml_cpu_fp32_to_fp16(const float * x, ggml_fp16_t * y, int64_t n) {
__m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
_mm_storel_epi64((__m128i *)(y + i), y_vec);
}
#elif defined(__NNPA__)
for (; i + 7 < n; i += 8) {
float32x4_t v_xh = vec_xl(0, (const float *)(x + i + 0));
float32x4_t v_xl = vec_xl(0, (const float *)(x + i + 4));
uint16x8_t v_yd = vec_round_from_fp32(v_xh, v_xl, 0);
uint16x8_t v_y = vec_convert_to_fp16(v_yd, 0);
vec_xst(v_y, 0, (ggml_fp16_t *)(y + i));
}
for (; i + 3 < n; i += 4) {
float32x4_t v_x = vec_xl(0, (const float *)(x + i));
float32x4_t v_zero = vec_splats(0.0f);
uint16x8_t v_yd = vec_round_from_fp32(v_x, v_zero, 0);
uint16x8_t v_y = vec_convert_to_fp16(v_yd, 0);
vec_xst(v_y, 0, (ggml_fp16_t *)(y + i));
#elif defined(__riscv_zvfh)
for (int vl; i < n; i += vl) {
vl = __riscv_vsetvl_e32m2(n - i);
vfloat32m2_t vx = __riscv_vle32_v_f32m2(&x[i], vl);
vfloat16m1_t vy = __riscv_vfncvt_f_f_w_f16m1(vx, vl);
__riscv_vse16_v_f16m1((_Float16 *)&y[i], vy, vl);
}
#endif
for (; i < n; ++i) {
@@ -3247,21 +3250,6 @@ void ggml_cpu_fp16_to_fp32(const ggml_fp16_t * x, float * y, int64_t n) {
__m128 y_vec = _mm_cvtph_ps(x_vec);
_mm_storeu_ps(y + i, y_vec);
}
#elif defined(__NNPA__)
for (; i + 7 < n; i += 8) {
uint16x8_t v_x = vec_xl(0, (const ggml_fp16_t *)(x + i));
uint16x8_t v_yd = vec_convert_from_fp16(v_x, 0);
float32x4_t v_yh = vec_extend_to_fp32_hi(v_yd, 0);
float32x4_t v_yl = vec_extend_to_fp32_lo(v_yd, 0);
vec_xst(v_yh, 0, (float *)(y + i + 0));
vec_xst(v_yl, 0, (float *)(y + i + 4));
}
for (; i + 3 < n; i += 4) {
uint16x8_t v_x = vec_xl(0, (const ggml_fp16_t *)(x + i));
uint16x8_t v_yd = vec_convert_from_fp16(v_x, 0);
float32x4_t v_yh = vec_extend_to_fp32_hi(v_yd, 0);
vec_xst(v_yh, 0, (float *)(y + i));
}
#endif
for (; i < n; ++i) {
@@ -3276,6 +3264,13 @@ void ggml_cpu_fp32_to_bf16(const float * x, ggml_bf16_t * y, int64_t n) {
}
}
void ggml_cpu_fp32_to_i32(const float * x, int32_t * y, int64_t n) {
int64_t i = 0;
for (; i < n; ++i) {
y[i] = x[i];
}
}
void ggml_cpu_bf16_to_fp32(const ggml_bf16_t * x, float * y, int64_t n) {
int64_t i = 0;
#if defined(__AVX2__)
@@ -3465,14 +3460,6 @@ int ggml_cpu_has_vxe(void) {
#endif
}
int ggml_cpu_has_nnpa(void) {
#if defined(GGML_NNPA)
return 1;
#else
return 0;
#endif
}
int ggml_cpu_has_neon(void) {
#if defined(__ARM_ARCH) && defined(__ARM_NEON)
return 1;

View File

@@ -190,6 +190,7 @@ static const struct ggml_backend_i ggml_backend_cpu_i = {
/* .graph_compute = */ ggml_backend_cpu_graph_compute,
/* .event_record = */ NULL,
/* .event_wait = */ NULL,
/* .optimize_graph = */ NULL,
};
static ggml_guid_t ggml_backend_cpu_guid(void) {
@@ -348,8 +349,10 @@ static void ggml_backend_cpu_device_get_memory(ggml_backend_dev_t dev, size_t *
long pages = sysconf(_SC_PHYS_PAGES);
long page_size = sysconf(_SC_PAGE_SIZE);
*total = pages * page_size;
// "free" system memory is ill-defined, for practical purposes assume that all of it is free:
*free = *total;
#endif
#endif // _WIN32
GGML_UNUSED(dev);
}
@@ -576,9 +579,6 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
if (ggml_cpu_has_vxe()) {
features.push_back({ "VXE", "1" });
}
if (ggml_cpu_has_nnpa()) {
features.push_back({ "NNPA", "1" });
}
if (ggml_cpu_has_wasm_simd()) {
features.push_back({ "WASM_SIMD", "1" });
}

View File

@@ -154,7 +154,7 @@ class tensor_traits : public ggml::cpu::tensor_traits {
if (dst->src[0]->type == GGML_TYPE_Q4_0) {
return compute_forward_q4_0(params, dst);
} else if (dst->src[0]->type == GGML_TYPE_F16) {
return compute_forward_kv_cache(params, dst);
return compute_forward_fp16(params, dst);
}
} else if (dst->op == GGML_OP_GET_ROWS) {
if (dst->src[0]->type == GGML_TYPE_Q4_0) {
@@ -164,7 +164,7 @@ class tensor_traits : public ggml::cpu::tensor_traits {
return false;
}
bool compute_forward_kv_cache(ggml_compute_params * params, struct ggml_tensor * dst) {
bool compute_forward_fp16(ggml_compute_params * params, struct ggml_tensor * dst) {
static std::atomic_flag first_to_arrive = ATOMIC_FLAG_INIT;
const ggml_tensor * src0 = dst->src[0];
@@ -515,9 +515,6 @@ class extra_buffer_type : ggml::cpu::extra_buffer_type {
op->src[0]->buffer &&
(ggml_n_dims(op->src[0]) == 2) &&
op->src[0]->buffer->buft == ggml_backend_cpu_kleidiai_buffer_type() && ctx.kernels) {
if (op->op == GGML_OP_GET_ROWS && op->src[1]->ne[0] != 8) {
return false;
}
if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
return false;
}
@@ -534,13 +531,8 @@ class extra_buffer_type : ggml::cpu::extra_buffer_type {
if (op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_cpu_kleidiai_buffer_type()) {
return (ggml::cpu::tensor_traits *) op->src[0]->extra;
}
else if (ggml_kleidiai_select_kernels(ctx.features, op) &&
op->src[0]->op == GGML_OP_VIEW &&
(op->src[1]->op == GGML_OP_PERMUTE || op->src[1]->op == GGML_OP_SOFT_MAX) &&
op->src[1]->ne[1] > 1) {
if ((op->src[0]->nb[0] != 2) ||
(op->src[1]->nb[0] != 4) ||
(op->src[0]->nb[1] * op->src[0]->ne[1] != op->src[0]->nb[2]) ||
else if (ggml_kleidiai_select_kernels(ctx.features, op) && op->src[1]->ne[1] > 1) {
if ((op->src[0]->nb[1] * op->src[0]->ne[1] != op->src[0]->nb[2]) ||
(op->src[1]->nb[1] * op->src[1]->ne[1] != op->src[1]->nb[2])) {
return nullptr;
}

View File

@@ -776,6 +776,24 @@ static void ggml_compute_forward_dup_f32(
id += ne00 * (ne01 - ir1);
}
}
} else if (dst->type == GGML_TYPE_I32) {
size_t id = 0;
int32_t * dst_ptr = (int32_t *) dst->data;
for (int i03 = 0; i03 < ne03; i03++) {
for (int i02 = 0; i02 < ne02; i02++) {
id += ne00 * ir0;
for (int i01 = ir0; i01 < ir1; i01++) {
for (int i00 = 0; i00 < ne00; i00++) {
const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
dst_ptr[id] = *src0_ptr;
id++;
}
}
id += ne00 * (ne01 - ir1);
}
}
} else {
GGML_ABORT("fatal error"); // TODO: implement
}
@@ -947,6 +965,144 @@ static void ggml_compute_forward_dup_f32(
}
}
}
} else if (dst->type == GGML_TYPE_I32) {
for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
i10 += ne00 * ir0;
while (i10 >= ne0) {
i10 -= ne0;
if (++i11 == ne1) {
i11 = 0;
if (++i12 == ne2) {
i12 = 0;
if (++i13 == ne3) {
i13 = 0;
}
}
}
}
for (int64_t i01 = ir0; i01 < ir1; i01++) {
for (int64_t i00 = 0; i00 < ne00; i00++) {
const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
*(int32_t *) dst_ptr = *(const float *) src0_ptr;
if (++i10 == ne0) {
i10 = 0;
if (++i11 == ne1) {
i11 = 0;
if (++i12 == ne2) {
i12 = 0;
if (++i13 == ne3) {
i13 = 0;
}
}
}
}
}
}
i10 += ne00 * (ne01 - ir1);
while (i10 >= ne0) {
i10 -= ne0;
if (++i11 == ne1) {
i11 = 0;
if (++i12 == ne2) {
i12 = 0;
if (++i13 == ne3) {
i13 = 0;
}
}
}
}
}
}
} else {
GGML_ABORT("fatal error"); // TODO: implement
}
}
static void ggml_compute_forward_dup_i32(
const ggml_compute_params * params,
ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
GGML_TENSOR_UNARY_OP_LOCALS
const int ith = params->ith; // thread index
const int nth = params->nth; // number of threads
// parallelize by rows
const int nr = ne01;
// number of rows per thread
const int dr = (nr + nth - 1) / nth;
// row range for this thread
const int ir0 = dr * ith;
const int ir1 = MIN(ir0 + dr, nr);
// dst counters
int64_t i10 = 0;
int64_t i11 = 0;
int64_t i12 = 0;
int64_t i13 = 0;
// TODO: not optimal, but works
if (dst->type == GGML_TYPE_F32) {
for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
i10 += ne00 * ir0;
while (i10 >= ne0) {
i10 -= ne0;
if (++i11 == ne1) {
i11 = 0;
if (++i12 == ne2) {
i12 = 0;
if (++i13 == ne3) {
i13 = 0;
}
}
}
}
for (int64_t i01 = ir0; i01 < ir1; i01++) {
for (int64_t i00 = 0; i00 < ne00; i00++) {
const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
*(float *) dst_ptr = *(const int32_t *) src0_ptr;
if (++i10 == ne0) {
i10 = 0;
if (++i11 == ne1) {
i11 = 0;
if (++i12 == ne2) {
i12 = 0;
if (++i13 == ne3) {
i13 = 0;
}
}
}
}
}
}
i10 += ne00 * (ne01 - ir1);
while (i10 >= ne0) {
i10 -= ne0;
if (++i11 == ne1) {
i11 = 0;
if (++i12 == ne2) {
i12 = 0;
if (++i13 == ne3) {
i13 = 0;
}
}
}
}
}
}
} else {
GGML_ABORT("fatal error"); // TODO: implement
}
@@ -1177,6 +1333,10 @@ void ggml_compute_forward_dup(
{
ggml_compute_forward_dup_f32(params, dst);
} break;
case GGML_TYPE_I32:
{
ggml_compute_forward_dup_i32(params, dst);
} break;
default:
{
if (ggml_is_quantized(src0->type) && dst->type == GGML_TYPE_F32) {
@@ -7027,6 +7187,209 @@ void ggml_compute_forward_im2col_back_f32(
}
}
// ggml_compute_forward_im2col_3d_f16
// src0: kernel [OC*IC, KD, KH, KW]
// src1: image [N*IC, ID, IH, IW]
// dst: result [N*OD, OH, OW, IC * KD * KH * KW]
static void ggml_compute_forward_im2col_3d_f16(
const ggml_compute_params * params,
ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const ggml_tensor * src1 = dst->src[1];
GGML_ASSERT(src0->type == GGML_TYPE_F16);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F16);
GGML_TENSOR_BINARY_OP_LOCALS;
const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
const int32_t s1 = ((const int32_t *)(dst->op_params))[1];
const int32_t s2 = ((const int32_t *)(dst->op_params))[2];
const int32_t p0 = ((const int32_t *)(dst->op_params))[3];
const int32_t p1 = ((const int32_t *)(dst->op_params))[4];
const int32_t p2 = ((const int32_t *)(dst->op_params))[5];
const int32_t d0 = ((const int32_t *)(dst->op_params))[6];
const int32_t d1 = ((const int32_t *)(dst->op_params))[7];
const int32_t d2 = ((const int32_t *)(dst->op_params))[8];
const int32_t IC = ((const int32_t *)(dst->op_params))[9];
const int ith = params->ith;
const int nth = params->nth;
const int64_t N = ne13 / IC;
const int64_t ID = ne12;
const int64_t IH = ne11;
const int64_t IW = ne10;
const int64_t OC = ne03 / IC;
GGML_UNUSED(OC);
const int64_t KD = ne02;
const int64_t KH = ne01;
const int64_t KW = ne00;
const int64_t OD = ne3 / N;
const int64_t OH = ne2;
const int64_t OW = ne1;
const int64_t OH_OW = OH*OW;
const int64_t KD_KH_KW = KD*KH*KW;
const int64_t KH_KW = KH*KW;
const int64_t IC_KD_KH_KW = IC*KD*KH*KW;
GGML_ASSERT(nb10 == sizeof(float));
// im2col: [N*IC, ID, IH, IW] => [N*OD, OH, OW, IC * KD * KH * KW]
{
ggml_fp16_t * const wdata = (ggml_fp16_t *) dst->data;
for (int64_t in = 0; in < N; in++) {
for (int64_t iod = 0; iod < OD; iod++) {
for (int64_t ioh = 0; ioh < OH; ioh++) {
for (int64_t iow = 0; iow < OW; iow++) {
for (int64_t iic = ith; iic < IC; iic += nth) {
// micro kernel
ggml_fp16_t * dst_data = wdata + (in*OD*OH_OW + iod*OH_OW + ioh*OW + iow)*IC_KD_KH_KW; // [IC, KD, KH, KW]
const float * const src_data = (const float *) ((const char *)src1->data + (in*IC + iic)*nb13); // [ID, IH, IW]
for (int64_t ikd = 0; ikd < KD; ikd++) {
for (int64_t ikh = 0; ikh < KH; ikh++) {
for (int64_t ikw = 0; ikw < KW; ikw++) {
const int64_t iiw = iow*s0 + ikw*d0 - p0;
const int64_t iih = ioh*s1 + ikh*d1 - p1;
const int64_t iid = iod*s2 + ikd*d2 - p2;
if (iid < 0 || iid >= ID || iih < 0 || iih >= IH || iiw < 0 || iiw >= IW || iid < 0 || iid >= ID) {
dst_data[iic*KD_KH_KW + ikd * KH_KW + ikh*KW + ikw] = 0;
} else {
const float * const s = (const float *) ((const char *)src_data + iid*nb12 + iih*nb11 + iiw*nb10); // [ID, IH, IW]
dst_data[iic*KD_KH_KW + ikd * KH_KW + ikh*KW + ikw] = GGML_CPU_FP32_TO_FP16(*s);
}
}
}
}
}
}
}
}
}
}
}
// ggml_compute_forward_im2col_3d_f32
// src0: kernel [OC*IC, KD, KH, KW]
// src1: image [N*IC, ID, IH, IW]
// dst: result [N*OD, OH, OW, IC * KD * KH * KW]
static void ggml_compute_forward_im2col_3d_f32(
const ggml_compute_params * params,
ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const ggml_tensor * src1 = dst->src[1];
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
GGML_TENSOR_BINARY_OP_LOCALS;
const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
const int32_t s1 = ((const int32_t *)(dst->op_params))[1];
const int32_t s2 = ((const int32_t *)(dst->op_params))[2];
const int32_t p0 = ((const int32_t *)(dst->op_params))[3];
const int32_t p1 = ((const int32_t *)(dst->op_params))[4];
const int32_t p2 = ((const int32_t *)(dst->op_params))[5];
const int32_t d0 = ((const int32_t *)(dst->op_params))[6];
const int32_t d1 = ((const int32_t *)(dst->op_params))[7];
const int32_t d2 = ((const int32_t *)(dst->op_params))[8];
const int32_t IC = ((const int32_t *)(dst->op_params))[9];
const int ith = params->ith;
const int nth = params->nth;
const int64_t N = ne13 / IC;
const int64_t ID = ne12;
const int64_t IH = ne11;
const int64_t IW = ne10;
const int64_t OC = ne03 / IC;
GGML_UNUSED(OC);
const int64_t KD = ne02;
const int64_t KH = ne01;
const int64_t KW = ne00;
const int64_t OD = ne3 / N;
const int64_t OH = ne2;
const int64_t OW = ne1;
const int64_t OH_OW = OH*OW;
const int64_t KD_KH_KW = KD*KH*KW;
const int64_t KH_KW = KH*KW;
const int64_t IC_KD_KH_KW = IC*KD*KH*KW;
GGML_ASSERT(nb10 == sizeof(float));
// im2col: [N*IC, ID, IH, IW] => [N*OD, OH, OW, IC * KD * KH * KW]
{
float * const wdata = (float *) dst->data;
for (int64_t in = 0; in < N; in++) {
for (int64_t iod = 0; iod < OD; iod++) {
for (int64_t ioh = 0; ioh < OH; ioh++) {
for (int64_t iow = 0; iow < OW; iow++) {
for (int64_t iic = ith; iic < IC; iic += nth) {
// micro kernel
float * dst_data = wdata + (in*OD*OH_OW + iod*OH_OW + ioh*OW + iow)*IC_KD_KH_KW; // [IC, KD, KH, KW]
const float * const src_data = (const float *) ((const char *)src1->data + (in*IC + iic)*nb13); // [ID, IH, IW]
for (int64_t ikd = 0; ikd < KD; ikd++) {
for (int64_t ikh = 0; ikh < KH; ikh++) {
for (int64_t ikw = 0; ikw < KW; ikw++) {
const int64_t iiw = iow*s0 + ikw*d0 - p0;
const int64_t iih = ioh*s1 + ikh*d1 - p1;
const int64_t iid = iod*s2 + ikd*d2 - p2;
if (iid < 0 || iid >= ID || iih < 0 || iih >= IH || iiw < 0 || iiw >= IW || iid < 0 || iid >= ID) {
dst_data[iic*KD_KH_KW + ikd * KH_KW + ikh*KW + ikw] = 0;
} else {
const float * const s = (const float *) ((const char *)src_data + iid*nb12 + iih*nb11 + iiw*nb10); // [ID, IH, IW]
dst_data[iic*KD_KH_KW + ikd * KH_KW + ikh*KW + ikw] = *s;
}
}
}
}
}
}
}
}
}
}
}
void ggml_compute_forward_im2col_3d(
const ggml_compute_params * params,
ggml_tensor * dst) {
switch (dst->type) {
case GGML_TYPE_F16:
{
ggml_compute_forward_im2col_3d_f16(params, dst);
} break;
case GGML_TYPE_F32:
{
ggml_compute_forward_im2col_3d_f32(params, dst);
} break;
default:
{
GGML_ABORT("fatal error");
}
}
}
static void ggml_call_mul_mat(ggml_type type, const ggml_compute_params * params, int64_t m, int64_t n, int64_t k,
void * a, void * b, float * c) {
const ggml_type_traits * traits = ggml_get_type_traits(type);
@@ -8014,6 +8377,15 @@ static void ggml_compute_forward_pad_f32(
GGML_TENSOR_UNARY_OP_LOCALS
float * dst_ptr = (float *) dst->data;
const int32_t lp0 = ggml_get_op_params_i32(dst, 0);
const int32_t rp0 = ggml_get_op_params_i32(dst, 1);
const int32_t lp1 = ggml_get_op_params_i32(dst, 2);
const int32_t rp1 = ggml_get_op_params_i32(dst, 3);
const int32_t lp2 = ggml_get_op_params_i32(dst, 4);
const int32_t rp2 = ggml_get_op_params_i32(dst, 5);
const int32_t lp3 = ggml_get_op_params_i32(dst, 6);
const int32_t rp3 = ggml_get_op_params_i32(dst, 7);
// TODO: optimize
@@ -8022,10 +8394,12 @@ static void ggml_compute_forward_pad_f32(
for (int64_t i0 = 0; i0 < ne0; ++i0) {
for (int64_t i3 = 0; i3 < ne3; ++i3) {
const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0;
const float * src_ptr = (const float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
if ((i0 >= lp0 && i0 < ne0 - rp0) \
&& (i1 >= lp1 && i1 < ne1 - rp1) \
&& (i2 >= lp2 && i2 < ne2 - rp2) \
&& (i3 >= lp3 && i3 < ne3 - rp3)) {
const int64_t src_idx = (i3 - lp3)*nb03 + (i2 - lp2)*nb02 + (i1 - lp1)*nb01 + (i0 - lp0)*nb00;
const float * src_ptr = (const float *)((char *) src0->data + src_idx);
dst_ptr[dst_idx] = *src_ptr;
} else {
dst_ptr[dst_idx] = 0;
@@ -8224,7 +8598,7 @@ static void ggml_compute_forward_timestep_embedding_f32(
embed_data[j + half] = sinf(arg);
}
if (dim % 2 != 0 && ith == 0) {
embed_data[dim] = 0.f;
embed_data[2 * half] = 0.f;
}
}
}

View File

@@ -69,6 +69,7 @@ void ggml_compute_forward_clamp(const struct ggml_compute_params * params, struc
void ggml_compute_forward_conv_transpose_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_im2col(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_im2col_back_f32(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_im2col_3d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_conv_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_conv_3d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_conv_transpose_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);

View File

@@ -114,26 +114,6 @@ extern "C" {
#define GGML_CPU_COMPUTE_FP32_TO_FP16(x) riscv_compute_fp32_to_fp16(x)
#define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x)
#define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x)
#elif defined(__NNPA__)
#define GGML_CPU_COMPUTE_FP16_TO_FP32(x) nnpa_compute_fp16_to_fp32(x)
#define GGML_CPU_COMPUTE_FP32_TO_FP16(x) nnpa_compute_fp32_to_fp16(x)
#define GGML_CPU_FP16_TO_FP32(x) GGML_CPU_COMPUTE_FP16_TO_FP32(x)
#define GGML_CPU_FP32_TO_FP16(x) GGML_CPU_COMPUTE_FP32_TO_FP16(x)
static inline float nnpa_compute_fp16_to_fp32(ggml_fp16_t h) {
uint16x8_t v_h = vec_splats(h);
uint16x8_t v_hd = vec_convert_from_fp16(v_h, 0);
return vec_extend_to_fp32_hi(v_hd, 0)[0];
}
static inline ggml_fp16_t nnpa_compute_fp32_to_fp16(float f) {
float32x4_t v_f = vec_splats(f);
float32x4_t v_zero = vec_splats(0.0f);
uint16x8_t v_hd = vec_round_from_fp32(v_f, v_zero, 0);
uint16x8_t v_h = vec_convert_to_fp16(v_hd, 0);
return vec_extract(v_h, 0);
}
#endif
// precomputed f32 table for f16 (256 KB)
@@ -1156,11 +1136,6 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
#define GGML_F16_EPR GGML_F32_EPR
static inline float32x4_t __lzs_f16cx4_load(const ggml_fp16_t * x) {
#if defined(__NNPA__)
uint16x8_t v_x = vec_xl(0, (const ggml_fp16_t *)x);
uint16x8_t v_xd = vec_convert_from_fp16(v_x, 0);
return vec_extend_to_fp32_hi(v_xd, 0);
#else
float tmp[4];
for (int i = 0; i < 4; i++) {
@@ -1170,20 +1145,9 @@ static inline float32x4_t __lzs_f16cx4_load(const ggml_fp16_t * x) {
// note: keep type-cast here to prevent compiler bugs
// see: https://github.com/ggml-org/llama.cpp/issues/12846
return vec_xl(0, (const float *)(tmp));
#endif
}
static inline void __lzs_f16cx4_store(ggml_fp16_t * x, float32x4_t v_y) {
#if defined(__NNPA__)
float32x4_t v_zero = vec_splats(0.0f);
uint16x8_t v_xd = vec_round_from_fp32(v_y, v_zero, 0);
uint16x8_t v_x = vec_convert_to_fp16(v_xd, 0);
x[0] = vec_extract(v_x, 0);
x[1] = vec_extract(v_x, 1);
x[2] = vec_extract(v_x, 2);
x[3] = vec_extract(v_x, 3);
#else
float arr[4];
// note: keep type-cast here to prevent compiler bugs
@@ -1193,7 +1157,6 @@ static inline void __lzs_f16cx4_store(ggml_fp16_t * x, float32x4_t v_y) {
for (int i = 0; i < 4; i++) {
x[i] = GGML_CPU_FP32_TO_FP16(arr[i]);
}
#endif
}
#define GGML_F16_VEC GGML_F32x4

View File

@@ -85,15 +85,21 @@ void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * G
// reduce sum1,sum2 to sum1
GGML_F32_VEC_REDUCE(sumf, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8);
#elif defined(__riscv_v_intrinsic)
vfloat32m1_t vsum = __riscv_vfmv_v_f_f32m1(0.0f, 1);
for (int i = 0, avl; i < n; i += avl) {
avl = __riscv_vsetvl_e32m8(n - i);
vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[i], avl);
vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
vfloat32m8_t prod = __riscv_vfmul_vv_f32m8(ax, ay, avl);
vsum = __riscv_vfredusum_vs_f32m8_f32m1(prod, vsum, avl);
int vl = __riscv_vsetvlmax_e32m8();
vfloat32m1_t vs = __riscv_vfmv_v_f_f32m1(0.0f, 1);
vfloat32m8_t vsum;
vfloat32m8_t ax;
vfloat32m8_t ay;
vsum = __riscv_vfmv_v_f_f32m8_tu(vsum, 0.0f, vl);
for (int i = 0; i < n; i += vl) {
vl = __riscv_vsetvl_e32m8(n - i);
ax = __riscv_vle32_v_f32m8_tu(ax, &x[i], vl);
ay = __riscv_vle32_v_f32m8_tu(ay, &y[i], vl);
vsum = __riscv_vfmacc_vv_f32m8_tu(vsum, ax, ay, vl);
}
sumf += __riscv_vfmv_f_s_f32m1_f32(vsum);
vl = __riscv_vsetvlmax_e32m8();
vs = __riscv_vfredusum_vs_f32m8_f32m1(vsum, vs, vl);
sumf += __riscv_vfmv_f_s_f32m1_f32(vs);
#else
const int np = (n & ~(GGML_F32_STEP - 1));
@@ -208,7 +214,7 @@ void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * G
ggml_float sumf = 0.0;
#if defined(GGML_SIMD) && !defined(__riscv_v_intrinsic)
#if defined(GGML_SIMD)
#if defined(__ARM_FEATURE_SVE)
const int sve_register_length = svcntb() * 8; //get vector length
const int ggml_f16_epr = sve_register_length / 16; // running when 16
@@ -271,6 +277,29 @@ void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * G
sum1 = svmad_f16_x(pg, hx, hy, sum1);
}
GGML_F16x_VEC_REDUCE(sumf, sum1, sum2, sum3, sum4);
#elif defined(__riscv_v_intrinsic)
#if defined(__riscv_zvfh)
int vl = __riscv_vsetvlmax_e32m2();
vfloat32m1_t vs = __riscv_vfmv_v_f_f32m1(0.0f, 1);
vfloat32m2_t vsum;
vfloat16m1_t ax;
vfloat16m1_t ay;
vsum = __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vmv_v_x_u32m2(0, vl));
for (int i = 0; i < n; i += vl) {
vl = __riscv_vsetvl_e16m1(n - i);
ax = __riscv_vle16_v_f16m1_tu(ax, (const _Float16 *)&x[i], vl);
ay = __riscv_vle16_v_f16m1_tu(ay, (const _Float16 *)&y[i], vl);
vsum = __riscv_vfwmacc_vv_f32m2_tu(vsum, ax, ay, vl);
}
vl = __riscv_vsetvlmax_e32m1();
vfloat32m1_t ac0 = __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(vsum, 0), __riscv_vget_v_f32m2_f32m1(vsum, 1), vl);
vs = __riscv_vfredusum_vs_f32m1_f32m1(ac0, vs, vl);
sumf += __riscv_vfmv_f_s_f32m1_f32(vs);
#else
for (int i = 0; i < n; ++i) {
sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i]));
}
#endif // __riscv_zvfh
#else
const int np = (n & ~(GGML_F16_STEP - 1));
@@ -302,7 +331,7 @@ void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * G
for (int i = 0; i < n; ++i) {
sumf += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[i])*GGML_CPU_FP16_TO_FP32(y[i]));
}
#endif
#endif // GGML_SIMD
*s = sumf;
}
@@ -361,6 +390,14 @@ void ggml_vec_swiglu_f32(const int n, float * y, const float * x, const float *
for (; i + 3 < n; i += 4) {
vst1q_f32(y + i, vmulq_f32(ggml_v_silu(vld1q_f32(x + i)), vld1q_f32(g + i)));
}
#elif defined(__riscv_v_intrinsic)
for (int vl; i < n; i += vl) {
vl = __riscv_vsetvl_e32m2(n - i);
vfloat32m2_t vx = __riscv_vle32_v_f32m2(&x[i], vl);
vfloat32m2_t vg = __riscv_vle32_v_f32m2(&g[i], vl);
vfloat32m2_t vy = __riscv_vfmul_vv_f32m2(ggml_v_silu_m2(vx, vl), vg, vl);
__riscv_vse32_v_f32m2(&y[i], vy, vl);
}
#endif
for (; i < n; ++i) {
y[i] = ggml_silu_f32(x[i]) * g[i];

View File

@@ -1269,6 +1269,14 @@ inline static vfloat32m2_t ggml_v_expf_m2(vfloat32m2_t x, int vl) {
vl);
}
// computes silu x/(1+exp(-x)) in single precision vector
inline static vfloat32m2_t ggml_v_silu_m2(vfloat32m2_t x, int vl) {
const vfloat32m2_t neg_x = __riscv_vfneg_v_f32m2(x, vl);
const vfloat32m2_t exp_neg_x = ggml_v_expf_m2(neg_x, vl);
const vfloat32m2_t one_plus_exp_neg_x = __riscv_vfadd_vf_f32m2(exp_neg_x, 1.0f, vl);
return __riscv_vfdiv_vv_f32m2(x, one_plus_exp_neg_x, vl);
}
#endif // __ARM_NEON / __AVX2__ / __SSE2__ / __riscv_v_intrinsic
inline static void ggml_vec_silu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {

View File

@@ -44,6 +44,8 @@ if (CUDAToolkit_FOUND)
list(APPEND GGML_SOURCES_CUDA ${SRCS})
file(GLOB SRCS "template-instances/mmq*.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})
file(GLOB SRCS "template-instances/mmf*.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})
if (GGML_CUDA_FA_ALL_QUANTS)
file(GLOB SRCS "template-instances/fattn-vec*.cu")

View File

@@ -23,28 +23,44 @@ static __device__ __forceinline__ float op_div(const float a, const float b) {
return a / b;
}
template <float (*bin_op)(const float, const float),
typename src0_t,
typename src1_t,
typename dst_t,
typename... src1_ptrs>
static __global__ void k_bin_bcast(const src0_t * src0,
const src1_t * src1,
dst_t * dst,
const int ne0,
const int ne1,
const int ne2,
const uint3 ne3,
const uint3 ne10,
const uint3 ne11,
const uint3 ne12,
const uint3 ne13,
/*int s0, */ const int s1,
const int s2,
const int s3,
/*int s00,*/ const int s01,
const int s02,
const int s03,
/*int s10,*/ const int s11,
const int s12,
const int s13,
src1_ptrs... src1s) {
const uint32_t i0s = blockDim.x * blockIdx.x + threadIdx.x;
const uint32_t i1 = (blockDim.y * blockIdx.y + threadIdx.y);
const uint32_t i2 = fastdiv((blockDim.z * blockIdx.z + threadIdx.z), ne3);
const uint32_t i3 = (blockDim.z * blockIdx.z + threadIdx.z) - (i2 * ne3.z);
template <float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t, typename... src1_ptrs>
static __global__ void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
const int ne0, const int ne1, const int ne2, const int ne3,
const int ne10, const int ne11, const int ne12, const int ne13,
/*int s0, */ const int s1, const int s2, const int s3,
/*int s00,*/ const int s01, const int s02, const int s03,
/*int s10,*/ const int s11, const int s12, const int s13,
src1_ptrs... src1s) {
const int i0s = blockDim.x*blockIdx.x + threadIdx.x;
const int i1 = (blockDim.y*blockIdx.y + threadIdx.y);
const int i2 = (blockDim.z*blockIdx.z + threadIdx.z) / ne3;
const int i3 = (blockDim.z*blockIdx.z + threadIdx.z) % ne3;
if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3.z) {
return;
}
const int i11 = i1 % ne11;
const int i12 = i2 % ne12;
const int i13 = i3 % ne13;
const uint32_t i11 = fastmodulo(i1, ne11);
const uint32_t i12 = fastmodulo(i2, ne12);
const uint32_t i13 = fastmodulo(i3, ne13);
const size_t i_src0 = i3*s03 + i2*s02 + i1*s01;
const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
@@ -53,8 +69,8 @@ static __global__ void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst
const src0_t * src0_row = src0 ? (src0 + i_src0) : nullptr;
dst_t * dst_row = dst + i_dst;
for (int i0 = i0s; i0 < ne0; i0 += blockDim.x*gridDim.x) {
const int i10 = i0 % ne10;
for (int i0 = i0s; i0 < ne0; i0 += blockDim.x * gridDim.x) {
const uint32_t i10 = fastmodulo(i0, ne10);
float result = src0_row ? (float) src0_row[i0] : 0.0f;
if constexpr (sizeof...(src1_ptrs) > 0) {
@@ -67,28 +83,48 @@ static __global__ void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst
}
}
template <float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t, typename... src1_ptrs>
static __global__ void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst,
const int ne0, const int ne1, const int ne2,const int ne3,
const int ne10, const int ne11, const int ne12, const int ne13,
/*int s0, */ const int s1, const int s2, const int s3,
/*int s00,*/ const int s01, const int s02, const int s03,
/*int s10,*/ const int s11, const int s12, const int s13,
src1_ptrs ... src1s) {
template <float (*bin_op)(const float, const float),
typename src0_t,
typename src1_t,
typename dst_t,
typename... src1_ptrs>
static __global__ void k_bin_bcast_unravel(const src0_t * src0,
const src1_t * src1,
dst_t * dst,
const uint3 ne0,
const uint3 ne1,
const uint3 ne2,
const uint32_t ne3,
const uint3 prod_012,
const uint3 prod_01,
const uint3 ne10,
const uint3 ne11,
const uint3 ne12,
const uint3 ne13,
/*int s0, */ const int s1,
const int s2,
const int s3,
/*int s00,*/ const int s01,
const int s02,
const int s03,
/*int s10,*/ const int s11,
const int s12,
const int s13,
src1_ptrs... src1s) {
const int i = blockDim.x*blockIdx.x + threadIdx.x;
const int i3 = i/(ne2*ne1*ne0);
const int i2 = (i/(ne1*ne0)) % ne2;
const int i1 = (i/ne0) % ne1;
const int i0 = i % ne0;
const uint32_t i3 = fastdiv(i, prod_012);
const uint32_t i2 = fastdiv(i - i3 * prod_012.z, prod_01);
const uint32_t i1 = fastdiv(i - i3 * prod_012.z - i2 * prod_01.z, ne0);
const uint32_t i0 = i - i3 * prod_012.z - i2 * prod_01.z - i1 * ne0.z;
if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
if (i0 >= ne0.z || i1 >= ne1.z || i2 >= ne2.z || i3 >= ne3) {
return;
}
const int i11 = i1 % ne11;
const int i12 = i2 % ne12;
const int i13 = i3 % ne13;
const int i11 = fastmodulo(i1, ne11);
const int i12 = fastmodulo(i2, ne12);
const int i13 = fastmodulo(i3, ne13);
const size_t i_src0 = i3*s03 + i2*s02 + i1*s01;
const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
@@ -97,7 +133,7 @@ static __global__ void k_bin_bcast_unravel(const src0_t * src0, const src1_t *
const src0_t * src0_row = src0 ? (src0 + i_src0) : nullptr;
dst_t * dst_row = dst + i_dst;
const int i10 = i0 % ne10;
const int i10 = fastmodulo(i0, ne10);
float result = src0_row ? (float) src0_row[i0] : 0.0f;
if constexpr (sizeof...(src1_ptrs) > 0) {
@@ -170,11 +206,6 @@ static void launch_bin_bcast_pack(const ggml_tensor * src0, const ggml_tensor *
//int64_t ne02 = cne0[2]; GGML_UNUSED(ne02);
//int64_t ne03 = cne0[3]; GGML_UNUSED(ne03);
int64_t ne10 = cne1[0];
int64_t ne11 = cne1[1];
int64_t ne12 = cne1[2];
int64_t ne13 = cne1[3];
size_t nb0 = cnb[0];
size_t nb1 = cnb[1];
size_t nb2 = cnb[2];
@@ -233,48 +264,51 @@ static void launch_bin_bcast_pack(const ggml_tensor * src0, const ggml_tensor *
block_dims.y = std::min<unsigned int>(ne1, block_size / block_dims.x);
block_dims.z = std::min(std::min<unsigned int>(ne2 * ne3, block_size / block_dims.x / block_dims.y), 64U);
dim3 block_nums((hne0 + block_dims.x - 1) / block_dims.x,
(ne1 + block_dims.y - 1) / block_dims.y,
dim3 block_nums((hne0 + block_dims.x - 1) / block_dims.x, (ne1 + block_dims.y - 1) / block_dims.y,
(ne2 * ne3 + block_dims.z - 1) / block_dims.z);
const uint3 ne10 = init_fastdiv_values((uint32_t) cne1[0]);
const uint3 ne11 = init_fastdiv_values((uint32_t) cne1[1]);
const uint3 ne12 = init_fastdiv_values((uint32_t) cne1[2]);
const uint3 ne13 = init_fastdiv_values((uint32_t) cne1[3]);
if (block_nums.z > 65535) {
int block_num = (ne0 * ne1 * ne2 * ne3 + block_size - 1) / block_size;
int block_num = (ne0 * ne1 * ne2 * ne3 + block_size - 1) / block_size;
const uint3 prod_012 = init_fastdiv_values((uint32_t) (ne0 * ne1 * ne2));
const uint3 prod_01 = init_fastdiv_values((uint32_t) (ne0 * ne1));
const uint3 ne0_fastdiv = init_fastdiv_values((uint32_t) ne0);
const uint3 ne1_fastdiv = init_fastdiv_values((uint32_t) ne1);
const uint3 ne2_fastdiv = init_fastdiv_values((uint32_t) ne2);
if constexpr (sizeof...(I) > 0) {
k_bin_bcast_unravel<bin_op, src0_t, src1_t, dst_t>
<<<block_num, block_size, 0, stream>>>(src0_dd, src1_dd, dst_dd,
ne0, ne1, ne2, ne3,
ne10, ne11, ne12, ne13,
/* s0, */ s1, s2, s3,
/* s00,*/ s01, s02, s03,
/* s10,*/ s11, s12,s13,
(const src1_t *) dst->src[I + 1]->data...);
k_bin_bcast_unravel<bin_op, src0_t, src1_t, dst_t><<<block_num, block_size, 0, stream>>>(
src0_dd, src1_dd, dst_dd, ne0_fastdiv, ne1_fastdiv, ne2_fastdiv, ne3, prod_012, prod_01, ne10, ne11,
ne12, ne13,
/* s0, */ s1, s2, s3,
/* s00,*/ s01, s02, s03,
/* s10,*/ s11, s12, s13, (const src1_t *) dst->src[I + 1]->data...);
} else {
k_bin_bcast_unravel<bin_op, src0_t, src1_t, dst_t>
<<<block_num, block_size, 0, stream>>>(src0_dd, src1_dd, dst_dd,
ne0, ne1, ne2, ne3,
ne10, ne11, ne12, ne13,
/* s0, */ s1, s2, s3,
/* s00,*/ s01, s02, s03,
/* s10,*/ s11, s12,s13);
<<<block_num, block_size, 0, stream>>>(src0_dd, src1_dd, dst_dd, ne0_fastdiv, ne1_fastdiv,
ne2_fastdiv, ne3, prod_012, prod_01, ne10, ne11, ne12, ne13,
/* s0, */ s1, s2, s3,
/* s00,*/ s01, s02, s03,
/* s10,*/ s11, s12, s13);
}
} else {
const uint3 ne3_fastdiv = init_fastdiv_values((uint32_t) ne3);
if constexpr (sizeof...(I) > 0) {
k_bin_bcast<bin_op, src0_t, src1_t, dst_t>
<<<block_nums, block_dims, 0, stream>>>(src0_dd, src1_dd, dst_dd,
ne0, ne1, ne2, ne3,
ne10, ne11, ne12, ne13,
/* s0, */ s1, s2, s3,
/* s00,*/ s01, s02, s03,
/* s10,*/ s11, s12,s13,
(const src1_t *) dst->src[I + 1]->data...);
k_bin_bcast<bin_op, src0_t, src1_t, dst_t><<<block_nums, block_dims, 0, stream>>>(
src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3_fastdiv, ne10, ne11, ne12, ne13,
/* s0, */ s1, s2, s3,
/* s00,*/ s01, s02, s03,
/* s10,*/ s11, s12, s13, (const src1_t *) dst->src[I + 1]->data...);
} else {
k_bin_bcast<bin_op, src0_t, src1_t, dst_t>
<<<block_nums, block_dims, 0, stream>>>(src0_dd, src1_dd, dst_dd,
ne0, ne1, ne2, ne3,
ne10, ne11, ne12, ne13,
/* s0, */ s1, s2, s3,
/* s00,*/ s01, s02, s03,
/* s10,*/ s11, s12,s13);
k_bin_bcast<bin_op, src0_t, src1_t, dst_t><<<block_nums, block_dims, 0, stream>>>(
src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3_fastdiv, ne10, ne11, ne12, ne13,
/* s0, */ s1, s2, s3,
/* s00,*/ s01, s02, s03,
/* s10,*/ s11, s12, s13);
}
}
}

View File

@@ -545,6 +545,45 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i
#endif // defined(GGML_USE_HIP)
}
static __device__ __forceinline__ void ggml_cuda_mad(float & acc, const float v, const float u) {
acc += v*u;
}
static __device__ __forceinline__ void ggml_cuda_mad(float & acc, const float2 v, const float2 u) {
acc += v.x*u.x;
acc += v.y*u.y;
}
static __device__ __forceinline__ void ggml_cuda_mad(float & acc, const half2 v, const half2 u) {
#if defined(GGML_USE_HIP) && (defined(RDNA2) || defined(RDNA3) || defined(RDNA4) || defined(__gfx906__) || defined(CDNA))
asm volatile("v_dot2_f32_f16 %0, %1, %2, %0" : "+v"(acc) : "v"(v), "v"(u));
#else
#ifdef FAST_FP16_AVAILABLE
const float2 tmp = __half22float2(v*u);
acc += tmp.x + tmp.y;
#else
const float2 tmpv = __half22float2(v);
const float2 tmpu = __half22float2(u);
acc += tmpv.x * tmpu.x;
acc += tmpv.y * tmpu.y;
#endif // FAST_FP16_AVAILABLE
#endif // defined(GGML_USE_HIP) && (defined(RDNA2) || defined(RDNA3) || defined(RDNA4) || defined(GCN5) || defined(CDNA))
}
// Aligned memory transfers of 8/16 bytes can be faster than 2 transfers with 4 bytes, especially on AMD.
template <int nbytes>
static __device__ __forceinline__ void ggml_cuda_memcpy_1(void * __restrict__ dst, const void * __restrict__ src) {
if constexpr (nbytes == 4) {
*(int *) dst = *(const int *) src;
} else if constexpr (nbytes == 8) {
*(int2 *) dst = *(const int2 *) src;
} else if constexpr (nbytes == 16) {
*(int4 *) dst = *(const int4 *) src;
} else {
static_assert(nbytes == 0 && nbytes == -1, "bad nbytes");
}
}
static __device__ __forceinline__ float ggml_cuda_e8m0_to_fp32(uint8_t x) {
#if CUDART_VERSION >= 12080
const nv_bfloat16 e = __nv_cvt_e8m0_to_bf16raw(x);
@@ -563,6 +602,40 @@ static __device__ __forceinline__ float ggml_cuda_e8m0_to_fp32(uint8_t x) {
#endif // CUDART_VERSION >= 12050
}
// See https://gmplib.org/~tege/divcnst-pldi94.pdf figure 4.1.
// Precompute mp (m' in the paper) and L such that division
// can be computed using a multiply (high 32b of 64b result)
// and a shift:
//
// n/d = (mulhi(n, mp) + n) >> L;
static const uint3 init_fastdiv_values(uint32_t d) {
GGML_ASSERT(d != 0);
// compute L = ceil(log2(d));
uint32_t L = 0;
while (L < 32 && (uint32_t{ 1 } << L) < d) {
L++;
}
uint32_t mp = (uint32_t) ((uint64_t{ 1 } << 32) * ((uint64_t{ 1 } << L) - d) / d + 1);
// pack divisor as well to reduce error surface
return make_uint3(mp, L, d);
}
static __device__ __forceinline__ uint32_t fastdiv(uint32_t n, const uint3 fastdiv_values) {
// expects fastdiv_values to contain <mp, L, divisor> in <x, y, z>
// fastdiv_values.z is unused and optimized away by the compiler.
// Compute high 32 bits of n * mp
const uint32_t hi = __umulhi(n, fastdiv_values.x);
// add n, apply bit shift
return (hi + n) >> fastdiv_values.y;
}
static __device__ __forceinline__ uint32_t fastmodulo(uint32_t n, const uint3 fastdiv_values) {
// expects fastdiv_values to contain <mp, L, divisor> in <x, y, z> (see init_fastdiv_values)
return n - fastdiv(n, fastdiv_values) * fastdiv_values.z;
}
typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, float2 & v);
static __device__ __forceinline__ float get_alibi_slope(

View File

@@ -38,6 +38,8 @@ template<typename dst_t, typename src_t>
return __float2bfloat16(float(x));
} else if constexpr(std::is_same_v<src_t, nv_bfloat16>) {
return __bfloat162float(x);
} else if constexpr(std::is_same_v<dst_t, int32_t>) {
return int32_t(x);
} else {
return float(x);
}

View File

@@ -374,6 +374,10 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
ggml_cpy_flt_cuda<nv_bfloat16, half> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
} else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F32) {
ggml_cpy_flt_cuda<nv_bfloat16, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_I32) {
ggml_cpy_flt_cuda<float, int32_t> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
} else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_F32) {
ggml_cpy_flt_cuda<int32_t, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
} else {
GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__,
ggml_type_name(src0->type), ggml_type_name(src1->type));

View File

@@ -1,371 +0,0 @@
#include "common.cuh"
#include "fattn-common.cuh"
#include "fattn-tile-f16.cuh"
#define FATTN_KQ_STRIDE_TILE_F16 64
template<int D, int ncols, int nwarps, bool use_logit_softcap> // D == head size
#if !defined(GGML_USE_HIP)
__launch_bounds__(nwarps*WARP_SIZE, 2)
#endif // !defined(GGML_USE_HIP)
static __global__ void flash_attn_tile_ext_f16(
const char * __restrict__ Q,
const char * __restrict__ K,
const char * __restrict__ V,
const char * __restrict__ mask,
const char * __restrict__ sinks,
const int * __restrict__ KV_max,
float * __restrict__ dst,
float2 * __restrict__ dst_meta,
const float scale,
const float max_bias,
const float m0,
const float m1,
const uint32_t n_head_log2,
const float logit_softcap,
const int32_t ne00, const int32_t ne01, const int32_t ne02, const int32_t ne03,
const int32_t nb01, const int32_t nb02, const int32_t nb03,
const int32_t ne10, const int32_t ne11, const int32_t ne12, const int32_t ne13,
const int32_t nb11, const int32_t nb12, const int64_t nb13,
const int32_t nb21, const int32_t nb22, const int64_t nb23,
const int32_t ne31, const int32_t ne32, const int32_t ne33,
const int32_t nb31, const int32_t nb32, const int64_t nb33) {
#if defined(FLASH_ATTN_AVAILABLE) && defined(FP16_AVAILABLE)
// Skip unused kernel variants for faster compilation:
#ifdef FP16_MMA_AVAILABLE
NO_DEVICE_CODE;
return;
#endif // FP16_MMA_AVAILABLE
if (use_logit_softcap && !(D == 128 || D == 256)) {
NO_DEVICE_CODE;
return;
}
//In this kernel Q, K, V are matrices while i, j, k are matrix indices.
const int ic0 = blockIdx.x * ncols; // Index of the Q/QKV column to work on.
const int sequence = blockIdx.z / ne02;
const int head = blockIdx.z - sequence*ne02;
const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
const float2 * Q_f2 = (const float2 *) (Q + nb03* sequence + nb02* head + nb01*ic0);
const half2 * K_h2 = (const half2 *) (K + nb13* sequence + nb12*(head / gqa_ratio));
const half2 * V_h2 = (const half2 *) (V + nb13* sequence + nb12*(head / gqa_ratio)); // K and V have same shape
const half * maskh = (const half *) (mask + nb33*(sequence % ne33) + nb31*ic0);
const float * sinksf = (const float *) (sinks);
const int stride_KV2 = nb11 / sizeof(half2);
const float slopef = get_alibi_slope(max_bias, head, n_head_log2, m0, m1);
const half slopeh = __float2half(slopef);
static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64.");
__shared__ half KQ[ncols*FATTN_KQ_STRIDE_TILE_F16];
half2 * KQ2 = (half2 *) KQ;
__shared__ half2 KV_tmp[FATTN_KQ_STRIDE_TILE_F16][D/2 + 1]; // Pad D to avoid memory bank conflicts.
half kqmax[ncols/nwarps];
#pragma unroll
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
kqmax[j0/nwarps] = -HALF_MAX_HALF;
}
half2 kqsum[ncols/nwarps] = {{0.0f, 0.0f}};
half2 VKQ[ncols/nwarps][(D/2)/WARP_SIZE] = {{{0.0f, 0.0f}}};
// Convert Q to half2 and store in registers:
__shared__ half2 Q_h2[ncols][D/2];
#pragma unroll
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
const int j = j0 + threadIdx.y;
#pragma unroll
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
const int i = i0 + threadIdx.x;
const float2 tmp = ic0 + j < ne01 ? Q_f2[j*(nb01/sizeof(float2)) + i] : make_float2(0.0f, 0.0f);
Q_h2[j][i] = make_half2(scale, scale) * make_half2(tmp.x, tmp.y);
}
}
__syncthreads();
const int k_VKQ_max = KV_max ? KV_max[sequence*gridDim.x + blockIdx.x] : ne11;
for (int k_VKQ_0 = blockIdx.y*FATTN_KQ_STRIDE_TILE_F16; k_VKQ_0 < k_VKQ_max; k_VKQ_0 += gridDim.y*FATTN_KQ_STRIDE_TILE_F16) {
// Calculate KQ tile and keep track of new maximum KQ values:
half kqmax_new[ncols/nwarps];
#pragma unroll
for (int j = 0; j < ncols/nwarps; ++j) {
kqmax_new[j] = kqmax[j];
}
#pragma unroll
for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE_TILE_F16; i_KQ_0 += nwarps) {
const int i_KQ = i_KQ_0 + threadIdx.y;
#pragma unroll
for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += WARP_SIZE) {
const int k_KQ = k_KQ_0 + threadIdx.x;
KV_tmp[i_KQ][k_KQ] = K_h2[int64_t(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ];
}
}
__syncthreads();
half2 sum2[FATTN_KQ_STRIDE_TILE_F16/WARP_SIZE][ncols/nwarps] = {{{0.0f, 0.0f}}};
#pragma unroll
for (int k_KQ = 0; k_KQ < D/2; ++k_KQ) {
half2 K_k[FATTN_KQ_STRIDE_TILE_F16/WARP_SIZE];
half2 Q_k[ncols/nwarps];
#pragma unroll
for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE_TILE_F16; i_KQ_0 += WARP_SIZE) {
const int i_KQ = i_KQ_0 + threadIdx.x;
K_k[i_KQ_0/WARP_SIZE] = KV_tmp[i_KQ][k_KQ];
}
#pragma unroll
for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) {
const int j_KQ = j_KQ_0 + threadIdx.y;
Q_k[j_KQ_0/nwarps] = Q_h2[j_KQ][k_KQ];
}
#pragma unroll
for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE_TILE_F16; i_KQ_0 += WARP_SIZE) {
#pragma unroll
for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) {
sum2[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps] += K_k[i_KQ_0/WARP_SIZE]*Q_k[j_KQ_0/nwarps];
}
}
}
#pragma unroll
for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE_TILE_F16; i_KQ_0 += WARP_SIZE) {
const int i_KQ = i_KQ_0 + threadIdx.x;
#pragma unroll
for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) {
const int j_KQ = j_KQ_0 + threadIdx.y;
half sum;
if (use_logit_softcap) {
const float2 tmp = __half22float2(sum2[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]);
sum = logit_softcap * tanhf(tmp.x + tmp.y);
} else {
sum = __low2half(sum2[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]) + __high2half(sum2[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]);
}
sum += mask ? slopeh*maskh[j_KQ*ne11 + k_VKQ_0 + i_KQ] : __float2half(0.0f);
kqmax_new[j_KQ_0/nwarps] = ggml_cuda_hmax(kqmax_new[j_KQ_0/nwarps], sum);
KQ[j_KQ*FATTN_KQ_STRIDE_TILE_F16 + i_KQ] = sum;
}
}
__syncthreads();
#pragma unroll
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
const int j = j0 + threadIdx.y;
kqmax_new[j0/nwarps] = warp_reduce_max(kqmax_new[j0/nwarps]);
const half2 KQ_max_scale = __half2half2(hexp(kqmax[j0/nwarps] - kqmax_new[j0/nwarps]));
kqmax[j0/nwarps] = kqmax_new[j0/nwarps];
#pragma unroll
for (int i0 = 0; i0 < FATTN_KQ_STRIDE_TILE_F16/2; i0 += WARP_SIZE) {
const int i = i0 + threadIdx.x;
const half2 diff = KQ2[j*(FATTN_KQ_STRIDE_TILE_F16/2) + i] - __half2half2(kqmax[j0/nwarps]);
const half2 val = h2exp(diff);
kqsum[j0/nwarps] = kqsum[j0/nwarps]*KQ_max_scale + val;
KQ2[j*(FATTN_KQ_STRIDE_TILE_F16/2) + i] = val;
}
#pragma unroll
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
VKQ[j0/nwarps][i0/WARP_SIZE] *= KQ_max_scale;
}
}
__syncthreads();
#pragma unroll
for (int k0 = 0; k0 < FATTN_KQ_STRIDE_TILE_F16; k0 += nwarps) {
const int k = k0 + threadIdx.y;
#pragma unroll
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
const int i = i0 + threadIdx.x;
KV_tmp[k][i] = V_h2[int64_t(k_VKQ_0 + k)*stride_KV2 + i];
}
}
__syncthreads();
#pragma unroll
for (int k0 = 0; k0 < FATTN_KQ_STRIDE_TILE_F16; k0 += 2) {
half2 V_k[(D/2)/WARP_SIZE][2];
half2 KQ_k[ncols/nwarps];
#pragma unroll
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
const int i = i0 + threadIdx.x;
V_k[i0/WARP_SIZE][0] = KV_tmp[k0 + 0][i];
V_k[i0/WARP_SIZE][1] = KV_tmp[k0 + 1][i];
}
#pragma unroll
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
const int j = j0 + threadIdx.y;
KQ_k[j0/nwarps] = KQ2[j*(FATTN_KQ_STRIDE_TILE_F16/2) + k0/2];
}
#pragma unroll
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
#pragma unroll
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
VKQ[j0/nwarps][i0/WARP_SIZE] += V_k[i0/WARP_SIZE][0]* __low2half2(KQ_k[j0/nwarps]);
VKQ[j0/nwarps][i0/WARP_SIZE] += V_k[i0/WARP_SIZE][1]*__high2half2(KQ_k[j0/nwarps]);
}
}
}
__syncthreads();
}
//Attention sink: adjust running max and sum once per head
if (sinksf && blockIdx.y == 0) {
const half sink = __float2half(sinksf[head]);
#pragma unroll
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
half kqmax_new_j = fmaxf(kqmax[j0/nwarps], sink);
kqmax_new_j = warp_reduce_max(kqmax_new_j);
const half2 KQ_max_scale = __half2half2(hexp(kqmax[j0/nwarps] - kqmax_new_j));
kqmax[j0/nwarps] = kqmax_new_j;
const half val = hexp(sink - kqmax[j0/nwarps]);
kqsum[j0/nwarps] = kqsum[j0/nwarps] * KQ_max_scale;
if (threadIdx.x == 0) {
kqsum[j0/nwarps].x = __hadd(__low2half(kqsum[j0/nwarps]), val);
}
#pragma unroll
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
VKQ[j0/nwarps][i0/WARP_SIZE] *= KQ_max_scale;
}
}
}
float2 * dst2 = (float2 *) dst;
#pragma unroll
for (int j_VKQ_0 = 0; j_VKQ_0 < ncols; j_VKQ_0 += nwarps) {
const int j_VKQ = j_VKQ_0 + threadIdx.y;
if (ic0 + j_VKQ >= ne01) {
return;
}
half kqsum_j = __low2half(kqsum[j_VKQ_0/nwarps]) + __high2half(kqsum[j_VKQ_0/nwarps]);
kqsum_j = warp_reduce_sum((float)kqsum_j);
const int j_dst_unrolled = ((sequence*ne01 + ic0 + j_VKQ)*ne02 + head)*gridDim.y + blockIdx.y;
#pragma unroll
for (int i00 = 0; i00 < D/2; i00 += WARP_SIZE) {
const int i0 = i00 + threadIdx.x;
half2 dst_val = VKQ[j_VKQ_0/nwarps][i0/WARP_SIZE];
if (gridDim.y == 1) {
dst_val /= __half2half2(kqsum_j);
}
dst2[j_dst_unrolled*(D/2) + i0] = __half22float2(dst_val);
}
if (gridDim.y != 1 && threadIdx.x == 0) {
dst_meta[j_dst_unrolled] = make_float2(kqmax[j_VKQ_0/nwarps], kqsum_j);
}
}
#else
GGML_UNUSED_VARS(Q, K, V, mask, sinks, KV_max, dst, dst_meta, scale,
max_bias, m0, m1, n_head_log2, logit_softcap,
ne00, ne01, ne02, ne03,
nb01, nb02, nb03,
ne10, ne11, ne12, ne13,
nb11, nb12, nb13,
nb21, nb22, nb23,
ne31, ne32, ne33,
nb31, nb32, nb33);
NO_DEVICE_CODE;
#endif // defined(FLASH_ATTN_AVAILABLE) && defined(FP16_AVAILABLE)
}
template <int cols_per_block, bool use_logit_softcap>
void launch_fattn_tile_f16_64_128(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * Q = dst->src[0];
switch (Q->ne[0]) {
case 64: {
constexpr int D = 64;
constexpr int nwarps = 8;
constexpr size_t nbytes_shared = 0;
fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f16<D, cols_per_block, nwarps, use_logit_softcap>;
launch_fattn<D, cols_per_block, 1>
(ctx, dst, fattn_kernel, nwarps, nbytes_shared, FATTN_KQ_STRIDE_TILE_F16, true, true, false);
} break;
case 128: {
constexpr int D = 128;
constexpr int nwarps = 8;
constexpr size_t nbytes_shared = 0;
fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f16<D, cols_per_block, nwarps, use_logit_softcap>;
launch_fattn<D, cols_per_block, 1>
(ctx, dst, fattn_kernel, nwarps, nbytes_shared, FATTN_KQ_STRIDE_TILE_F16, true, true, false);
} break;
default: {
GGML_ABORT("FlashAttention without tensor cores only supports head sizes 64 and 128.");
} break;
}
}
void ggml_cuda_flash_attn_ext_tile_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * KQV = dst;
const ggml_tensor * Q = dst->src[0];
const int32_t precision = KQV->op_params[3];
GGML_ASSERT(precision == GGML_PREC_DEFAULT);
float logit_softcap;
memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));
if (Q->ne[1] <= 16) {
constexpr int cols_per_block = 16;
if (logit_softcap == 0.0f) {
constexpr bool use_logit_softcap = false;
launch_fattn_tile_f16_64_128<cols_per_block, use_logit_softcap>(ctx, dst);
} else {
constexpr bool use_logit_softcap = true;
launch_fattn_tile_f16_64_128<cols_per_block, use_logit_softcap>(ctx, dst);
}
return;
}
constexpr int cols_per_block = 32;
if (logit_softcap == 0.0f) {
constexpr bool use_logit_softcap = false;
launch_fattn_tile_f16_64_128<cols_per_block, use_logit_softcap>(ctx, dst);
} else {
constexpr bool use_logit_softcap = true;
launch_fattn_tile_f16_64_128<cols_per_block, use_logit_softcap>(ctx, dst);
}
}

View File

@@ -1,3 +0,0 @@
#include "common.cuh"
void ggml_cuda_flash_attn_ext_tile_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

View File

@@ -1,379 +0,0 @@
#include "common.cuh"
#include "fattn-common.cuh"
#include "fattn-tile-f32.cuh"
#define FATTN_KQ_STRIDE_TILE_F32 32
template<int D, int ncols, int nwarps, bool use_logit_softcap> // D == head size
#if !defined(GGML_USE_HIP)
__launch_bounds__(nwarps*WARP_SIZE, 2)
#endif // !defined(GGML_USE_HIP)
static __global__ void flash_attn_tile_ext_f32(
const char * __restrict__ Q,
const char * __restrict__ K,
const char * __restrict__ V,
const char * __restrict__ mask,
const char * __restrict__ sinks,
const int * __restrict__ KV_max,
float * __restrict__ dst,
float2 * __restrict__ dst_meta,
const float scale,
const float max_bias,
const float m0,
const float m1,
const uint32_t n_head_log2,
const float logit_softcap,
const int32_t ne00, const int32_t ne01, const int32_t ne02, const int32_t ne03,
const int32_t nb01, const int32_t nb02, const int32_t nb03,
const int32_t ne10, const int32_t ne11, const int32_t ne12, const int32_t ne13,
const int32_t nb11, const int32_t nb12, const int64_t nb13,
const int32_t nb21, const int32_t nb22, const int64_t nb23,
const int32_t ne31, const int32_t ne32, const int32_t ne33,
const int32_t nb31, const int32_t nb32, const int64_t nb33) {
#ifdef FLASH_ATTN_AVAILABLE
// Skip unused kernel variants for faster compilation:
#ifdef FP16_MMA_AVAILABLE
NO_DEVICE_CODE;
return;
#endif // FP16_MMA_AVAILABLE
if (use_logit_softcap && !(D == 128 || D == 256)) {
GGML_UNUSED_VARS(Q, K, V, mask, sinks, KV_max, dst, dst_meta, scale,
max_bias, m0, m1, n_head_log2, logit_softcap,
ne00, ne01, ne02, ne03,
nb01, nb02, nb03,
ne10, ne11, ne12, ne13,
nb11, nb12, nb13,
nb21, nb22, nb23,
ne31, ne32, ne33,
nb31, nb32, nb33);
NO_DEVICE_CODE;
return;
}
// In this kernel Q, K, V are matrices while i, j, k are matrix indices.
const int ic0 = blockIdx.x * ncols; // Index of the Q/QKV column to work on.
const int sequence = blockIdx.z / ne02;
const int head = blockIdx.z - sequence*ne02;
const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
const float2 * Q_f2 = (const float2 *) (Q + nb03* sequence + nb02* head + nb01*ic0);
const half2 * K_h2 = (const half2 *) (K + nb13* sequence + nb12*(head / gqa_ratio));
const half2 * V_h2 = (const half2 *) (V + nb13* sequence + nb12*(head / gqa_ratio)); // K and V have same shape
const half * maskh = (const half *) (mask + nb33*(sequence % ne33) + nb31*ic0);
const float * sinksf = (const float *) (sinks);
const int stride_KV2 = nb11 / sizeof(half2);
const float slope = get_alibi_slope(max_bias, head, n_head_log2, m0, m1);
static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64.");
__shared__ float KQ[ncols*FATTN_KQ_STRIDE_TILE_F32];
__shared__ float KV_tmp[FATTN_KQ_STRIDE_TILE_F32][D + 1]; // Pad D to avoid memory bank conflicts.
float2 * KV_tmp2 = (float2 *) KV_tmp;
float kqmax[ncols/nwarps];
#pragma unroll
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
kqmax[j0/nwarps] = -FLT_MAX/2.0f;
}
float kqsum[ncols/nwarps] = {0.0f};
float2 VKQ[ncols/nwarps][(D/2)/WARP_SIZE] = {{{0.0f, 0.0f}}};
// Convert Q to half2 and store in registers:
__shared__ float Q_f[ncols][D];
#pragma unroll
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
const int j = j0 + threadIdx.y;
#pragma unroll
for (int i0 = 0; i0 < D; i0 += 2*WARP_SIZE) {
float2 tmp = ic0 + j < ne01 ? Q_f2[j*(nb01/sizeof(float2)) + i0/2 + threadIdx.x] : make_float2(0.0f, 0.0f);
Q_f[j][i0 + 0*WARP_SIZE + threadIdx.x] = tmp.x * scale;
Q_f[j][i0 + 1*WARP_SIZE + threadIdx.x] = tmp.y * scale;
}
}
__syncthreads();
const int k_VKQ_max = KV_max ? KV_max[sequence*gridDim.x + blockIdx.x] : ne11;
for (int k_VKQ_0 = blockIdx.y*FATTN_KQ_STRIDE_TILE_F32; k_VKQ_0 < k_VKQ_max; k_VKQ_0 += gridDim.y*FATTN_KQ_STRIDE_TILE_F32) {
// Calculate KQ tile and keep track of new maximum KQ values:
float kqmax_new[ncols/nwarps];
#pragma unroll
for (int j = 0; j < ncols/nwarps; ++j) {
kqmax_new[j] = kqmax[j];
}
#pragma unroll
for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE_TILE_F32; i_KQ_0 += nwarps) {
const int i_KQ = i_KQ_0 + threadIdx.y;
#pragma unroll
for (int k_KQ_0 = 0; k_KQ_0 < D; k_KQ_0 += 2*WARP_SIZE) {
const half2 tmp = K_h2[int64_t(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ_0/2 + threadIdx.x];
KV_tmp[i_KQ][k_KQ_0 + 0*WARP_SIZE + threadIdx.x] = __low2float(tmp);
KV_tmp[i_KQ][k_KQ_0 + 1*WARP_SIZE + threadIdx.x] = __high2float(tmp);
}
}
__syncthreads();
float sum[FATTN_KQ_STRIDE_TILE_F32/WARP_SIZE][ncols/nwarps] = {{0.0f}};
#pragma unroll
for (int k_KQ = 0; k_KQ < D; ++k_KQ) {
float K_k[FATTN_KQ_STRIDE_TILE_F32/WARP_SIZE];
float Q_k[ncols/nwarps];
#pragma unroll
for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE_TILE_F32; i_KQ_0 += WARP_SIZE) {
const int i_KQ = i_KQ_0 + threadIdx.x;
K_k[i_KQ_0/WARP_SIZE] = KV_tmp[i_KQ][k_KQ];
}
#pragma unroll
for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) {
const int j_KQ = j_KQ_0 + threadIdx.y;
Q_k[j_KQ_0/nwarps] = Q_f[j_KQ][k_KQ];
}
#pragma unroll
for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE_TILE_F32; i_KQ_0 += WARP_SIZE) {
#pragma unroll
for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) {
sum[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps] += K_k[i_KQ_0/WARP_SIZE] * Q_k[j_KQ_0/nwarps];
}
}
}
#pragma unroll
for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE_TILE_F32; i_KQ_0 += WARP_SIZE) {
const int i_KQ = i_KQ_0 + threadIdx.x;
#pragma unroll
for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) {
const int j_KQ = j_KQ_0 + threadIdx.y;
if (use_logit_softcap) {
sum[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps] = logit_softcap * tanhf(sum[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]);
}
sum[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps] += mask ? slope*__half2float(maskh[j_KQ*ne11 + k_VKQ_0 + i_KQ]) : 0.0f;
kqmax_new[j_KQ_0/nwarps] = fmaxf(kqmax_new[j_KQ_0/nwarps], sum[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps]);
KQ[j_KQ*FATTN_KQ_STRIDE_TILE_F32 + i_KQ] = sum[i_KQ_0/WARP_SIZE][j_KQ_0/nwarps];
}
}
__syncthreads();
#pragma unroll
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
const int j = j0 + threadIdx.y;
kqmax_new[j0/nwarps] = warp_reduce_max(kqmax_new[j0/nwarps]);
const float KQ_max_scale = expf(kqmax[j0/nwarps] - kqmax_new[j0/nwarps]);
kqmax[j0/nwarps] = kqmax_new[j0/nwarps];
float kqsum_add = 0.0f;
#pragma unroll
for (int i0 = 0; i0 < FATTN_KQ_STRIDE_TILE_F32; i0 += WARP_SIZE) {
const int i = i0 + threadIdx.x;
const float diff = KQ[j*FATTN_KQ_STRIDE_TILE_F32 + i] - kqmax[j0/nwarps];
const float val = expf(diff);
kqsum_add += val;
KQ[j*FATTN_KQ_STRIDE_TILE_F32 + i] = val;
}
kqsum[j0/nwarps] = kqsum[j0/nwarps]*KQ_max_scale + kqsum_add;
#pragma unroll
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
VKQ[j0/nwarps][i0/WARP_SIZE].x *= KQ_max_scale;
VKQ[j0/nwarps][i0/WARP_SIZE].y *= KQ_max_scale;
}
}
__syncthreads();
#pragma unroll
for (int k0 = 0; k0 < FATTN_KQ_STRIDE_TILE_F32; k0 += nwarps) {
const int k = k0 + threadIdx.y;
#pragma unroll
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
const int i = i0 + threadIdx.x;
const half2 tmp = V_h2[int64_t(k_VKQ_0 + k)*stride_KV2 + i];
KV_tmp2[k*(D/2) + i].x = __low2float(tmp);
KV_tmp2[k*(D/2) + i].y = __high2float(tmp);
}
}
__syncthreads();
#pragma unroll
for (int k = 0; k < FATTN_KQ_STRIDE_TILE_F32; ++k) {
float2 V_k[(D/2)/WARP_SIZE];
float KQ_k[ncols/nwarps];
#pragma unroll
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
const int i = i0 + threadIdx.x;
V_k[i0/WARP_SIZE] = KV_tmp2[k*(D/2) + i];
}
#pragma unroll
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
const int j = j0 + threadIdx.y;
KQ_k[j0/nwarps] = KQ[j*FATTN_KQ_STRIDE_TILE_F32 + k];
}
#pragma unroll
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
#pragma unroll
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
VKQ[j0/nwarps][i0/WARP_SIZE].x += V_k[i0/WARP_SIZE].x*KQ_k[j0/nwarps];
VKQ[j0/nwarps][i0/WARP_SIZE].y += V_k[i0/WARP_SIZE].y*KQ_k[j0/nwarps];
}
}
}
__syncthreads();
}
//Attention sink: adjust running max and sum once per head
if (sinksf && blockIdx.y == 0) {
const float sink = sinksf[head];
#pragma unroll
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
float kqmax_new_j = fmaxf(kqmax[j0/nwarps], sink);
kqmax_new_j = warp_reduce_max(kqmax_new_j);
const float KQ_max_scale = expf(kqmax[j0/nwarps] - kqmax_new_j);
kqmax[j0/nwarps] = kqmax_new_j;
const float val = expf(sink - kqmax[j0/nwarps]);
kqsum[j0/nwarps] = kqsum[j0/nwarps] * KQ_max_scale;
if (threadIdx.x == 0) {
kqsum[j0/nwarps] += val;
}
#pragma unroll
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
VKQ[j0/nwarps][i0/WARP_SIZE].x *= KQ_max_scale;
VKQ[j0/nwarps][i0/WARP_SIZE].y *= KQ_max_scale;
}
}
}
float2 * dst2 = (float2 *) dst;
#pragma unroll
for (int j_VKQ_0 = 0; j_VKQ_0 < ncols; j_VKQ_0 += nwarps) {
const int j_VKQ = j_VKQ_0 + threadIdx.y;
if (ic0 + j_VKQ >= ne01) {
return;
}
float kqsum_j = kqsum[j_VKQ_0/nwarps];
kqsum_j = warp_reduce_sum(kqsum_j);
const int j_dst_unrolled = ((sequence*ne01 + ic0 + j_VKQ)*ne02 + head)*gridDim.y + blockIdx.y;
#pragma unroll
for (int i00 = 0; i00 < D/2; i00 += WARP_SIZE) {
const int i0 = i00 + threadIdx.x;
float2 dst_val = VKQ[j_VKQ_0/nwarps][i0/WARP_SIZE];
if (gridDim.y == 1) {
dst_val.x /= kqsum_j;
dst_val.y /= kqsum_j;
}
dst2[j_dst_unrolled*(D/2) + i0] = dst_val;
}
if (gridDim.y != 1 && threadIdx.x == 0) {
dst_meta[j_dst_unrolled] = make_float2(kqmax[j_VKQ_0/nwarps], kqsum_j);
}
}
#else
GGML_UNUSED_VARS(Q, K, V, mask, sinks, KV_max, dst, dst_meta, scale,
max_bias, m0, m1, n_head_log2, logit_softcap,
ne00, ne01, ne02, ne03,
nb01, nb02, nb03,
ne10, ne11, ne12, ne13,
nb11, nb12, nb13,
nb21, nb22, nb23,
ne31, ne32, ne33,
nb31, nb32, nb33);
NO_DEVICE_CODE;
#endif // FLASH_ATTN_AVAILABLE
}
template <int cols_per_block, bool use_logit_softcap>
void launch_fattn_tile_f32_64_128(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * Q = dst->src[0];
switch (Q->ne[0]) {
case 64: {
constexpr int D = 64;
constexpr int nwarps = 8;
constexpr size_t nbytes_shared = 0;
fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f32<D, cols_per_block, nwarps, use_logit_softcap>;
launch_fattn<D, cols_per_block, 1>
(ctx, dst, fattn_kernel, nwarps, nbytes_shared, FATTN_KQ_STRIDE_TILE_F32, true, true, false);
} break;
case 128: {
constexpr int D = 128;
constexpr int nwarps = 8;
constexpr size_t nbytes_shared = 0;
fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f32<D, cols_per_block, nwarps, use_logit_softcap>;
launch_fattn<D, cols_per_block, 1>
(ctx, dst, fattn_kernel, nwarps, nbytes_shared, FATTN_KQ_STRIDE_TILE_F32, true, true, false);
} break;
default: {
GGML_ABORT("FlashAttention without tensor cores only supports head sizes 64 and 128.");
} break;
}
}
void ggml_cuda_flash_attn_ext_tile_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * KQV = dst;
const ggml_tensor * Q = dst->src[0];
float logit_softcap;
memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));
if (Q->ne[1] <= 16) {
constexpr int cols_per_block = 16;
if (logit_softcap == 0.0f) {
constexpr bool use_logit_softcap = false;
launch_fattn_tile_f32_64_128<cols_per_block, use_logit_softcap>(ctx, dst);
} else {
constexpr bool use_logit_softcap = true;
launch_fattn_tile_f32_64_128<cols_per_block, use_logit_softcap>(ctx, dst);
}
return;
}
constexpr int cols_per_block = 32;
if (logit_softcap == 0.0f) {
constexpr bool use_logit_softcap = false;
launch_fattn_tile_f32_64_128<cols_per_block, use_logit_softcap>(ctx, dst);
} else {
constexpr bool use_logit_softcap = true;
launch_fattn_tile_f32_64_128<cols_per_block, use_logit_softcap>(ctx, dst);
}
}

View File

@@ -1,3 +0,0 @@
#include "common.cuh"
void ggml_cuda_flash_attn_ext_tile_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

View File

@@ -0,0 +1,660 @@
#include "common.cuh"
#include "fattn-common.cuh"
#include "fattn-tile.cuh"
#define FATTN_TILE_NTHREADS 256
static int fattn_tile_get_kq_stride_host(const int D, const int ncols, const int cc, const int warp_size) {
if (GGML_CUDA_CC_IS_AMD(cc)) {
switch (D) {
case 64:
return 64;
case 128:
case 256:
if (GGML_CUDA_CC_IS_GCN(cc) || GGML_CUDA_CC_IS_CDNA(cc)) {
return ncols <= 16 ? 64 : 32;
} else {
return 64;
}
default:
GGML_ABORT("fatal error");
return -1;
}
}
if (fast_fp16_available(cc)) {
switch (D) {
case 64:
case 128:
return 128;
case 256:
return ncols <= 16 ? 128 : 64;
default:
GGML_ABORT("fatal error");
return -1;
}
}
switch (D) {
case 64:
return ncols <= 16 ? 128 : 64;
case 128:
return ncols <= 16 ? 64 : 32;
case 256:
return 32;
default:
GGML_ABORT("fatal error");
return -1;
}
GGML_UNUSED(warp_size);
}
static constexpr __device__ int fattn_tile_get_kq_stride_device(int D, int ncols, int warp_size) {
#ifdef GGML_USE_HIP
switch (D) {
case 64:
return 64;
case 128:
#if defined(GCN) || defined(CDNA)
return ncols <= 16 ? 64 : 32;
#else
return 64;
#endif // defined(GCN) || defined(CDNA)
case 256:
#if defined(GCN) || defined(CDNA)
return ncols <= 16 ? 64 : 32;
#else
return 64;
#endif // defined(GCN) || defined(CDNA)
default:
return -1;
}
#else
#ifdef FAST_FP16_AVAILABLE
switch (D) {
case 64:
case 128:
return 128;
case 256:
return ncols <= 16 ? 128 : 64;
default:
return -1;
}
#else
switch (D) {
case 64:
return ncols <= 16 ? 128 : 64;
case 128:
return ncols <= 16 ? 64 : 32;
case 256:
return 32;
default:
return -1;
}
#endif // FAST_FP16_AVAILABLE
#endif // GGML_USE_HIP
GGML_UNUSED_VARS(ncols, warp_size);
}
static constexpr __device__ int fattn_tile_get_kq_nbatch_device(int D, int ncols, int warp_size) {
#ifdef GGML_USE_HIP
switch (D) {
case 64:
return 64;
case 128:
#if defined(GCN) || defined(CDNA)
return ncols <= 16 ? 64 : 128;
#else
return 64;
#endif // defined(GCN) || defined(CDNA)
case 256:
#if defined(GCN) || defined(CDNA)
return ncols <= 16 ? 64 : 128;
#else
return ncols <= 16 ? 64 : 256;
#endif // defined(GCN) || defined(CDNA)
default:
return -1;
}
#else
#ifdef FAST_FP16_AVAILABLE
switch (D) {
case 64:
return 64;
case 128:
return ncols <= 16 ? 128 : 64;
case 256:
return ncols <= 16 ? 64 : 128;
default:
return -1;
}
#else
switch (D) {
case 64:
return 64;
case 128:
return 128;
case 256:
return ncols <= 16 ? 128 : 64;
default:
return -1;
}
#endif // FAST_FP16_AVAILABLE
#endif // GGML_USE_HIP
GGML_UNUSED_VARS(ncols, warp_size);
}
template<int D, int ncols, bool use_logit_softcap> // D == head size
#ifdef GGML_USE_HIP
__launch_bounds__(FATTN_TILE_NTHREADS, 1)
#else
__launch_bounds__(FATTN_TILE_NTHREADS, 2)
#endif // GGML_USE_HIP
static __global__ void flash_attn_tile(
const char * __restrict__ Q,
const char * __restrict__ K,
const char * __restrict__ V,
const char * __restrict__ mask,
const char * __restrict__ sinks,
const int * __restrict__ KV_max,
float * __restrict__ dst,
float2 * __restrict__ dst_meta,
const float scale,
const float max_bias,
const float m0,
const float m1,
const uint32_t n_head_log2,
const float logit_softcap,
const int32_t ne00, const int32_t ne01, const int32_t ne02, const int32_t ne03,
const int32_t nb01, const int32_t nb02, const int32_t nb03,
const int32_t ne10, const int32_t ne11, const int32_t ne12, const int32_t ne13,
const int32_t nb11, const int32_t nb12, const int64_t nb13,
const int32_t nb21, const int32_t nb22, const int64_t nb23,
const int32_t ne31, const int32_t ne32, const int32_t ne33,
const int32_t nb31, const int32_t nb32, const int64_t nb33) {
#ifdef FLASH_ATTN_AVAILABLE
// Skip unused kernel variants for faster compilation:
#ifdef FP16_MMA_AVAILABLE
NO_DEVICE_CODE;
return;
#endif // FP16_MMA_AVAILABLE
if (use_logit_softcap && !(D == 128 || D == 256)) {
GGML_UNUSED_VARS(Q, K, V, mask, sinks, KV_max, dst, dst_meta, scale,
max_bias, m0, m1, n_head_log2, logit_softcap,
ne00, ne01, ne02, ne03,
nb01, nb02, nb03,
ne10, ne11, ne12, ne13,
nb11, nb12, nb13,
nb21, nb22, nb23,
ne31, ne32, ne33,
nb31, nb32, nb33);
NO_DEVICE_CODE;
return;
}
constexpr int warp_size = 32;
constexpr int nwarps = FATTN_TILE_NTHREADS / warp_size;
constexpr int kq_stride = fattn_tile_get_kq_stride_device(D, ncols, warp_size);
static_assert(kq_stride % warp_size == 0, "kq_stride not divisable by warp_size.");
constexpr int kq_nbatch = fattn_tile_get_kq_nbatch_device(D, ncols, warp_size);
static_assert(kq_nbatch % (2*warp_size) == 0, "bad kq_nbatch");
// In this kernel Q, K, V are matrices while i, j, k are matrix indices.
const int ic0 = blockIdx.x * ncols; // Index of the Q/QKV column to work on.
const int sequence = blockIdx.z / ne02;
const int head = blockIdx.z - sequence*ne02;
const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
const float2 * Q_f2 = (const float2 *) (Q + nb03* sequence + nb02* head + nb01*ic0);
const half2 * K_h2 = (const half2 *) (K + nb13* sequence + nb12*(head / gqa_ratio));
const half2 * V_h2 = (const half2 *) (V + nb13* sequence + nb12*(head / gqa_ratio)); // K and V have same shape
const half * maskh = (const half *) (mask + nb33*(sequence % ne33) + nb31*ic0);
const float * sinksf = (const float *) (sinks);
const int stride_KV2 = nb11 / sizeof(half2);
const float slope = get_alibi_slope(max_bias, head, n_head_log2, m0, m1);
#if defined(GGML_USE_HIP)
constexpr int cpy_nb = 16;
#else
constexpr int cpy_nb = 8;
#endif // defined(GGML_USE_HIP) && defined(GCN)
constexpr int cpy_ne = cpy_nb / 4;
__shared__ float KQ[ncols][kq_stride];
#ifdef FAST_FP16_AVAILABLE
__shared__ half2 Q_tmp[ncols][D/2];
__shared__ half2 KV_tmp_h2[kq_stride * (kq_nbatch/2 + cpy_ne)]; // Padded to avoid memory bank conflicts.
half2 VKQ[ncols/nwarps][D/(2*warp_size)] = {{{0.0f, 0.0f}}};
#else
__shared__ float Q_tmp[ncols][D];
__shared__ float KV_tmp_f[kq_stride * (kq_nbatch + cpy_ne)]; // Padded to avoid memory bank conflicts.
float2 * KV_tmp_f2 = (float2 *) KV_tmp_f;
float2 VKQ[ncols/nwarps][D/(2*warp_size)] = {{{0.0f, 0.0f}}};
#endif // FAST_FP16_AVAILABLE
float kqmax[ncols/nwarps];
#pragma unroll
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
kqmax[j0/nwarps] = -FLT_MAX/2.0f;
}
float kqsum[ncols/nwarps] = {0.0f};
#pragma unroll
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
const int j = j0 + threadIdx.y;
#pragma unroll
for (int i0 = 0; i0 < D/2; i0 += warp_size) {
const float2 tmp = ic0 + j < ne01 ? Q_f2[j*(nb01/sizeof(float2)) + i0 + threadIdx.x] : make_float2(0.0f, 0.0f);
#ifdef FAST_FP16_AVAILABLE
Q_tmp[j][i0 + threadIdx.x] = make_half2(tmp.x * scale, tmp.y * scale);
#else
Q_tmp[j][2*i0 + threadIdx.x] = tmp.x * scale;
Q_tmp[j][2*i0 + warp_size + threadIdx.x] = tmp.y * scale;
#endif // FAST_FP16_AVAILABLE
}
}
__syncthreads();
const int k_VKQ_max = KV_max ? KV_max[sequence*gridDim.x + blockIdx.x] : ne11;
for (int k_VKQ_0 = blockIdx.y*kq_stride; k_VKQ_0 < k_VKQ_max; k_VKQ_0 += gridDim.y*kq_stride) {
// Calculate KQ tile and keep track of new maximum KQ values:
float kqmax_new[ncols/nwarps];
#pragma unroll
for (int j = 0; j < ncols/nwarps; ++j) {
kqmax_new[j] = kqmax[j];
}
float sum[kq_stride/warp_size][ncols/nwarps] = {{0.0f}};
#pragma unroll
for (int k_KQ_0 = 0; k_KQ_0 < D; k_KQ_0 += kq_nbatch) {
#pragma unroll
for (int i_KQ_0 = 0; i_KQ_0 < kq_stride; i_KQ_0 += nwarps) {
const int i_KQ = i_KQ_0 + threadIdx.y;
#pragma unroll
for (int k_KQ_1 = 0; k_KQ_1 < kq_nbatch/2; k_KQ_1 += warp_size) {
const half2 tmp_h2 = K_h2[int64_t(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ_0/2 + k_KQ_1 + threadIdx.x];
#ifdef FAST_FP16_AVAILABLE
KV_tmp_h2[i_KQ*(kq_nbatch/2 + cpy_ne) + k_KQ_1 + threadIdx.x] = tmp_h2;
#else
const float2 tmp_f2 = __half22float2(tmp_h2);
KV_tmp_f[i_KQ*(kq_nbatch + cpy_ne) + 2*k_KQ_1 + threadIdx.x] = tmp_f2.x;
KV_tmp_f[i_KQ*(kq_nbatch + cpy_ne) + 2*k_KQ_1 + warp_size + threadIdx.x] = tmp_f2.y;
#endif // FAST_FP16_AVAILABLE
}
}
__syncthreads();
#ifdef FAST_FP16_AVAILABLE
#pragma unroll
for (int k_KQ_1 = 0; k_KQ_1 < kq_nbatch/2; k_KQ_1 += cpy_ne) {
half2 K_k[kq_stride/warp_size][cpy_ne];
half2 Q_k[ncols/nwarps][cpy_ne];
#else
#pragma unroll
for (int k_KQ_1 = 0; k_KQ_1 < kq_nbatch; k_KQ_1 += cpy_ne) {
float K_k[kq_stride/warp_size][cpy_ne];
float Q_k[ncols/nwarps][cpy_ne];
#endif // FAST_FP16_AVAILABLE
#pragma unroll
for (int i_KQ_0 = 0; i_KQ_0 < kq_stride; i_KQ_0 += warp_size) {
const int i_KQ = i_KQ_0 + threadIdx.x;
#ifdef FAST_FP16_AVAILABLE
ggml_cuda_memcpy_1<cpy_nb>(&K_k[i_KQ_0/warp_size], &KV_tmp_h2[i_KQ*(kq_nbatch/2 + cpy_ne) + k_KQ_1]);
#else
ggml_cuda_memcpy_1<cpy_nb>(&K_k[i_KQ_0/warp_size], &KV_tmp_f [i_KQ*(kq_nbatch + cpy_ne) + k_KQ_1]);
#endif // FAST_FP16_AVAILABLE
}
#pragma unroll
for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) {
const int j_KQ = j_KQ_0 + threadIdx.y;
#ifdef FAST_FP16_AVAILABLE
ggml_cuda_memcpy_1<cpy_nb>(&Q_k[j_KQ_0/nwarps], &Q_tmp[j_KQ][k_KQ_0/2 + k_KQ_1]);
#else
ggml_cuda_memcpy_1<cpy_nb>(&Q_k[j_KQ_0/nwarps], &Q_tmp[j_KQ][k_KQ_0 + k_KQ_1]);
#endif // FAST_FP16_AVAILABLE
}
#pragma unroll
for (int i_KQ_0 = 0; i_KQ_0 < kq_stride; i_KQ_0 += warp_size) {
#pragma unroll
for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) {
#pragma unroll
for (int k = 0; k < cpy_ne; ++k) {
ggml_cuda_mad(sum[i_KQ_0/warp_size][j_KQ_0/nwarps], K_k[i_KQ_0/warp_size][k], Q_k[j_KQ_0/nwarps][k]);
}
}
}
}
if (k_KQ_0 + kq_nbatch < D) {
__syncthreads(); // Sync not needed on last iteration.
}
}
#pragma unroll
for (int i_KQ_0 = 0; i_KQ_0 < kq_stride; i_KQ_0 += warp_size) {
const int i_KQ = i_KQ_0 + threadIdx.x;
#pragma unroll
for (int j_KQ_0 = 0; j_KQ_0 < ncols; j_KQ_0 += nwarps) {
const int j_KQ = j_KQ_0 + threadIdx.y;
if (use_logit_softcap) {
sum[i_KQ_0/warp_size][j_KQ_0/nwarps] = logit_softcap * tanhf(sum[i_KQ_0/warp_size][j_KQ_0/nwarps]);
}
sum[i_KQ_0/warp_size][j_KQ_0/nwarps] += mask ? slope*__half2float(maskh[j_KQ*ne11 + k_VKQ_0 + i_KQ]) : 0.0f;
kqmax_new[j_KQ_0/nwarps] = fmaxf(kqmax_new[j_KQ_0/nwarps], sum[i_KQ_0/warp_size][j_KQ_0/nwarps]);
KQ[j_KQ][i_KQ] = sum[i_KQ_0/warp_size][j_KQ_0/nwarps];
}
}
__syncthreads();
#pragma unroll
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
const int j = j0 + threadIdx.y;
kqmax_new[j0/nwarps] = warp_reduce_max<warp_size>(kqmax_new[j0/nwarps]);
const float KQ_max_scale = expf(kqmax[j0/nwarps] - kqmax_new[j0/nwarps]);
kqmax[j0/nwarps] = kqmax_new[j0/nwarps];
float kqsum_add = 0.0f;
if (kq_stride % (4*warp_size) == 0 && cpy_ne % 4 == 0) {
#pragma unroll
for (int i0 = 0; i0 < kq_stride; i0 += 4*warp_size) {
const int i = i0 + 4*threadIdx.x;
float4 val = *(const float4 *) &KQ[j][i];
val.x = expf(val.x - kqmax[j0/nwarps]);
val.y = expf(val.y - kqmax[j0/nwarps]);
val.z = expf(val.z - kqmax[j0/nwarps]);
val.w = expf(val.w - kqmax[j0/nwarps]);
kqsum_add += val.x + val.y + val.z + val.w;
#ifdef FAST_FP16_AVAILABLE
const half2 tmp[2] = {make_half2(val.x, val.y), make_half2(val.z, val.w)};
ggml_cuda_memcpy_1<sizeof(tmp)>(&KQ[j][i/2], &tmp);
#else
ggml_cuda_memcpy_1<sizeof(val)>(&KQ[j][i], &val);
#endif // FAST_FP16_AVAILABLE
}
} else if (kq_stride % (2*warp_size) == 0 && cpy_ne % 2 == 0) {
#pragma unroll
for (int i0 = 0; i0 < kq_stride; i0 += 2*warp_size) {
const int i = i0 + 2*threadIdx.x;
float2 val = *(const float2 *) &KQ[j][i];
val.x = expf(val.x - kqmax[j0/nwarps]);
val.y = expf(val.y - kqmax[j0/nwarps]);
kqsum_add += val.x + val.y;
#ifdef FAST_FP16_AVAILABLE
const half2 tmp = make_half2(val.x, val.y);
ggml_cuda_memcpy_1<sizeof(tmp)>(&KQ[j][i/2], &tmp);
#else
ggml_cuda_memcpy_1<sizeof(val)>(&KQ[j][i], &val);
#endif // FAST_FP16_AVAILABLE
}
} else {
for (int i0 = 0; i0 < kq_stride; i0 += warp_size) {
const int i = i0 + threadIdx.x;
const float diff = KQ[j][i] - kqmax[j0/nwarps];
const float val = expf(diff);
kqsum_add += val;
#ifdef FAST_FP16_AVAILABLE
((half *) KQ[j])[i] = val;
#else
KQ[j][i] = val;
#endif // FAST_FP16_AVAILABLE
}
}
kqsum[j0/nwarps] = kqsum[j0/nwarps]*KQ_max_scale + kqsum_add;
#ifdef FAST_FP16_AVAILABLE
const half2 KQ_max_scale_h2 = make_half2(KQ_max_scale, KQ_max_scale);
#pragma unroll
for (int i0 = 0; i0 < D/2; i0 += warp_size) {
VKQ[j0/nwarps][i0/warp_size] *= KQ_max_scale_h2;
}
#else
#pragma unroll
for (int i0 = 0; i0 < D/2; i0 += warp_size) {
VKQ[j0/nwarps][i0/warp_size].x *= KQ_max_scale;
VKQ[j0/nwarps][i0/warp_size].y *= KQ_max_scale;
}
#endif // FAST_FP16_AVAILABLE
}
constexpr int V_cols_per_iter = kq_stride*kq_nbatch / D;
static_assert(kq_stride % V_cols_per_iter == 0, "bad V_cols_per_iter");
#pragma unroll
for (int k0 = 0; k0 < kq_stride; k0 += V_cols_per_iter) {
#pragma unroll
for (int k1 = 0; k1 < V_cols_per_iter; k1 += nwarps) {
const int k_tile = k1 + threadIdx.y;
#pragma unroll
for (int i0 = 0; i0 < D/2; i0 += warp_size) {
const int i = i0 + threadIdx.x;
const half2 tmp = V_h2[int64_t(k_VKQ_0 + k0 + k_tile)*stride_KV2 + i];
#ifdef FAST_FP16_AVAILABLE
KV_tmp_h2[k_tile*(D/2) + i] = tmp;
#else
KV_tmp_f2[k_tile*(D/2) + i] = __half22float2(tmp);
#endif // FAST_FP16_AVAILABLE
}
}
__syncthreads();
#pragma unroll
for (int k1 = 0; k1 < V_cols_per_iter; ++k1) {
#ifdef FAST_FP16_AVAILABLE
half2 V_k[(D/2)/warp_size];
half2 KQ_k[ncols/nwarps];
#else
float2 V_k[(D/2)/warp_size];
float KQ_k[ncols/nwarps];
#endif // FAST_FP16_AVAILABLE
#pragma unroll
for (int i0 = 0; i0 < D/2; i0 += warp_size) {
const int i = i0 + threadIdx.x;
#ifdef FAST_FP16_AVAILABLE
V_k[i0/warp_size] = KV_tmp_h2[k1*(D/2) + i];
#else
V_k[i0/warp_size] = KV_tmp_f2[k1*(D/2) + i];
#endif // FAST_FP16_AVAILABLE
}
#pragma unroll
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
const int j = j0 + threadIdx.y;
#ifdef FAST_FP16_AVAILABLE
KQ_k[j0/nwarps] = __half2half2(((const half *)KQ[j])[k0 + k1]);
#else
KQ_k[j0/nwarps] = KQ[j][k0 + k1];
#endif // FAST_FP16_AVAILABLE
}
#pragma unroll
for (int i0 = 0; i0 < D/2; i0 += warp_size) {
#pragma unroll
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
#ifdef FAST_FP16_AVAILABLE
VKQ[j0/nwarps][i0/warp_size] += V_k[i0/warp_size] *KQ_k[j0/nwarps];
#else
VKQ[j0/nwarps][i0/warp_size].x += V_k[i0/warp_size].x*KQ_k[j0/nwarps];
VKQ[j0/nwarps][i0/warp_size].y += V_k[i0/warp_size].y*KQ_k[j0/nwarps];
#endif // FAST_FP16_AVAILABLE
}
}
}
__syncthreads();
}
}
// Attention sink: adjust running max and sum once per head
if (sinksf && blockIdx.y == 0) {
const float sink = sinksf[head];
#pragma unroll
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
float kqmax_new_j = fmaxf(kqmax[j0/nwarps], sink);
kqmax_new_j = warp_reduce_max<warp_size>(kqmax_new_j);
const float KQ_max_scale = expf(kqmax[j0/nwarps] - kqmax_new_j);
kqmax[j0/nwarps] = kqmax_new_j;
const float val = expf(sink - kqmax[j0/nwarps]);
kqsum[j0/nwarps] = kqsum[j0/nwarps] * KQ_max_scale;
if (threadIdx.x == 0) {
kqsum[j0/nwarps] += val;
}
#ifdef FAST_FP16_AVAILABLE
const half2 KQ_max_scale_h2 = make_half2(KQ_max_scale, KQ_max_scale);
#pragma unroll
for (int i0 = 0; i0 < D/2; i0 += warp_size) {
VKQ[j0/nwarps][i0/warp_size] *= KQ_max_scale_h2;
}
#else
#pragma unroll
for (int i0 = 0; i0 < D/2; i0 += warp_size) {
VKQ[j0/nwarps][i0/warp_size].x *= KQ_max_scale;
VKQ[j0/nwarps][i0/warp_size].y *= KQ_max_scale;
}
#endif // FAST_FP16_AVAILABLE
}
}
float2 * dst2 = (float2 *) dst;
#pragma unroll
for (int j_VKQ_0 = 0; j_VKQ_0 < ncols; j_VKQ_0 += nwarps) {
const int j_VKQ = j_VKQ_0 + threadIdx.y;
if (ic0 + j_VKQ >= ne01) {
return;
}
float kqsum_j = kqsum[j_VKQ_0/nwarps];
kqsum_j = warp_reduce_sum<warp_size>(kqsum_j);
const int j_dst_unrolled = ((sequence*ne01 + ic0 + j_VKQ)*ne02 + head)*gridDim.y + blockIdx.y;
#pragma unroll
for (int i00 = 0; i00 < D/2; i00 += warp_size) {
const int i0 = i00 + threadIdx.x;
#ifdef FAST_FP16_AVAILABLE
float2 dst_val = __half22float2(VKQ[j_VKQ_0/nwarps][i0/warp_size]);
#else
float2 dst_val = VKQ[j_VKQ_0/nwarps][i0/warp_size];
#endif // FAST_FP16_AVAILABLE
if (gridDim.y == 1) {
dst_val.x /= kqsum_j;
dst_val.y /= kqsum_j;
}
dst2[j_dst_unrolled*(D/2) + i0] = dst_val;
}
if (gridDim.y != 1 && threadIdx.x == 0) {
dst_meta[j_dst_unrolled] = make_float2(kqmax[j_VKQ_0/nwarps], kqsum_j);
}
}
#else
GGML_UNUSED_VARS(Q, K, V, mask, sinks, KV_max, dst, dst_meta, scale,
max_bias, m0, m1, n_head_log2, logit_softcap,
ne00, ne01, ne02, ne03,
nb01, nb02, nb03,
ne10, ne11, ne12, ne13,
nb11, nb12, nb13,
nb21, nb22, nb23,
ne31, ne32, ne33,
nb31, nb32, nb33);
NO_DEVICE_CODE;
#endif // FLASH_ATTN_AVAILABLE
}
template <int D, bool use_logit_softcap>
static void launch_fattn_tile_switch_ncols(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * Q = dst->src[0];
const int id = ggml_cuda_get_device();
const int cc = ggml_cuda_info().devices[id].cc;
const int warp_size = 32;
const int nwarps = FATTN_TILE_NTHREADS / warp_size;
constexpr size_t nbytes_shared = 0;
if (Q->ne[1] > 16) {
constexpr int cols_per_block = 32;
fattn_kernel_t fattn_kernel = flash_attn_tile<D, cols_per_block, use_logit_softcap>;
const int kq_stride = fattn_tile_get_kq_stride_host(D, cols_per_block, cc, warp_size);
launch_fattn<D, cols_per_block, 1>
(ctx, dst, fattn_kernel, nwarps, nbytes_shared, kq_stride, true, true, false, warp_size);
return;
}
constexpr int cols_per_block = 16;
fattn_kernel_t fattn_kernel = flash_attn_tile<D, cols_per_block, use_logit_softcap>;
const int kq_stride = fattn_tile_get_kq_stride_host(D, cols_per_block, cc, warp_size);
launch_fattn<D, cols_per_block, 1>
(ctx, dst, fattn_kernel, nwarps, nbytes_shared, kq_stride, true, true, false, warp_size);
}
template <bool use_logit_softcap>
static void launch_fattn_tile_switch_head_size(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * Q = dst->src[0];
switch (Q->ne[0]) {
case 64: {
launch_fattn_tile_switch_ncols< 64, use_logit_softcap>(ctx, dst);
} break;
case 128: {
launch_fattn_tile_switch_ncols<128, use_logit_softcap>(ctx, dst);
} break;
case 256: {
launch_fattn_tile_switch_ncols<256, use_logit_softcap>(ctx, dst);
} break;
default: {
GGML_ABORT("Unsupported head size");
} break;
}
}
void ggml_cuda_flash_attn_ext_tile(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * KQV = dst;
float logit_softcap;
memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));
if (logit_softcap == 0.0f) {
constexpr bool use_logit_softcap = false;
launch_fattn_tile_switch_head_size<use_logit_softcap>(ctx, dst);
} else {
constexpr bool use_logit_softcap = true;
launch_fattn_tile_switch_head_size<use_logit_softcap>(ctx, dst);
}
}

View File

@@ -0,0 +1,3 @@
#include "common.cuh"
void ggml_cuda_flash_attn_ext_tile(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

View File

@@ -1,8 +1,7 @@
#include "common.cuh"
#include "fattn-common.cuh"
#include "fattn-mma-f16.cuh"
#include "fattn-tile-f16.cuh"
#include "fattn-tile-f32.cuh"
#include "fattn-tile.cuh"
#include "fattn-vec-f16.cuh"
#include "fattn-vec-f32.cuh"
#include "fattn-wmma-f16.cuh"
@@ -271,8 +270,7 @@ static void ggml_cuda_flash_attn_ext_vec_f32(ggml_backend_cuda_context & ctx, gg
// Best FlashAttention kernel for a specific GPU:
enum best_fattn_kernel {
BEST_FATTN_KERNEL_NONE = 0,
BEST_FATTN_KERNEL_TILE_F32 = 200,
BEST_FATTN_KERNEL_TILE_F16 = 210,
BEST_FATTN_KERNEL_TILE = 200,
BEST_FATTN_KERNEL_VEC_F32 = 100,
BEST_FATTN_KERNEL_VEC_F16 = 110,
BEST_FATTN_KERNEL_WMMA_F16 = 300,
@@ -411,10 +409,7 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
}
// If there is no suitable kernel for tensor cores or small batch sizes, use the generic kernel for large batch sizes:
if (prec == GGML_PREC_DEFAULT && fast_fp16_available(cc)) {
return BEST_FATTN_KERNEL_TILE_F16;
}
return BEST_FATTN_KERNEL_TILE_F32;
return BEST_FATTN_KERNEL_TILE;
}
void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
@@ -422,11 +417,8 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
switch (ggml_cuda_get_best_fattn_kernel(ggml_cuda_get_device(), dst)) {
case BEST_FATTN_KERNEL_NONE:
GGML_ABORT("fatal error");
case BEST_FATTN_KERNEL_TILE_F32:
ggml_cuda_flash_attn_ext_tile_f32(ctx, dst);
break;
case BEST_FATTN_KERNEL_TILE_F16:
ggml_cuda_flash_attn_ext_tile_f16(ctx, dst);
case BEST_FATTN_KERNEL_TILE:
ggml_cuda_flash_attn_ext_tile(ctx, dst);
break;
case BEST_FATTN_KERNEL_VEC_F32:
ggml_cuda_flash_attn_ext_vec_f32(ctx, dst);

View File

@@ -6,64 +6,66 @@ template<int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
static __global__ void k_get_rows(
const void * __restrict__ src0, const int32_t * __restrict__ src1, dst_t * __restrict__ dst,
const int64_t ne00, /*const int64_t ne01, const int64_t ne02, const int64_t ne03,*/
/*const int64_t ne10, const int64_t ne11,*/ const int64_t ne12, /*const int64_t ne13,*/
/*const int64_t ne10,*/ const int64_t ne11, const int64_t ne12, /*const int64_t ne13,*/
/*const size_t s0,*/ const size_t s1, const size_t s2, const size_t s3,
/*const size_t nb00,*/ const size_t nb01, const size_t nb02, const size_t nb03,
const size_t s10, const size_t s11, const size_t s12/*, const size_t s13*/) {
// The x and y dimensions of the grid are swapped because the maximum allowed grid size for x is higher.
const int i00 = (blockIdx.y * blockDim.x + threadIdx.x)*2;
const int i10 = blockIdx.x;
const int i11 = blockIdx.z / ne12;
const int i12 = blockIdx.z % ne12;
for (int64_t z = blockIdx.z; z < ne11*ne12; z += gridDim.z) {
for (int64_t i00 = 2*(blockIdx.y*blockDim.x + threadIdx.x); i00 < ne00; i00 += gridDim.y*blockDim.x) {
// The x and y dimensions of the grid are swapped because the maximum allowed grid size for x is higher.
const int i10 = blockIdx.x;
const int i11 = z / ne12; // TODO fastdiv
const int i12 = z % ne12;
if (i00 >= ne00) {
return;
const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
const void * src0_row = (const char *) src0 + i01*nb01 + i11*nb02 + i12*nb03;
const int ib = i00/qk; // block index
const int iqs = (i00%qk)/qr; // quant index
const int iybs = i00 - i00%qk; // dst block start index
const int y_offset = qr == 1 ? 1 : qk/2;
// dequantize
float2 v;
dequantize_kernel(src0_row, ib, iqs, v);
dst_row[iybs + iqs + 0] = ggml_cuda_cast<dst_t>(v.x);
dst_row[iybs + iqs + y_offset] = ggml_cuda_cast<dst_t>(v.y);
}
}
const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
const void * src0_row = (const char *) src0 + i01*nb01 + i11*nb02 + i12*nb03;
const int ib = i00/qk; // block index
const int iqs = (i00%qk)/qr; // quant index
const int iybs = i00 - i00%qk; // dst block start index
const int y_offset = qr == 1 ? 1 : qk/2;
// dequantize
float2 v;
dequantize_kernel(src0_row, ib, iqs, v);
dst_row[iybs + iqs + 0] = ggml_cuda_cast<dst_t>(v.x);
dst_row[iybs + iqs + y_offset] = ggml_cuda_cast<dst_t>(v.y);
}
template<typename src0_t, typename dst_t>
static __global__ void k_get_rows_float(
const src0_t * __restrict__ src0, const int32_t * __restrict__ src1, dst_t * __restrict__ dst,
const int64_t ne00, /*const int64_t ne01, const int64_t ne02, const int64_t ne03,*/
/*const int64_t ne10, const int64_t ne11,*/ const int64_t ne12, /*const int64_t ne13,*/
/*const int64_t ne10,*/ const int64_t ne11, const int64_t ne12, /*const int64_t ne13,*/
/*const size_t s0,*/ const size_t s1, const size_t s2, const size_t s3,
/*const size_t nb00,*/ const size_t nb01, const size_t nb02, const size_t nb03,
const size_t s10, const size_t s11, const size_t s12/*, const size_t s13*/) {
// The x and y dimensions of the grid are swapped because the maximum allowed grid size for x is higher.
const int i00 = blockIdx.y * blockDim.x + threadIdx.x;
const int i10 = blockIdx.x;
const int i11 = blockIdx.z / ne12;
const int i12 = blockIdx.z % ne12;
for (int64_t z = blockIdx.z; z < ne11*ne12; z += gridDim.z) {
for (int64_t i00 = blockIdx.y*blockDim.x + threadIdx.x; i00 < ne00; i00 += gridDim.y*blockDim.x) {
// The x and y dimensions of the grid are swapped because the maximum allowed grid size for x is higher.
const int i10 = blockIdx.x;
const int i11 = z / ne12; // TODO fastdiv
const int i12 = z % ne12;
if (i00 >= ne00) {
return;
if (i00 >= ne00) {
return;
}
const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
const src0_t * src0_row = (const src0_t *)((const char *) src0 + i01*nb01 + i11*nb02 + i12*nb03);
dst_row[i00] = ggml_cuda_cast<dst_t>(src0_row[i00]);
}
}
const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
const src0_t * src0_row = (const src0_t *)((const char *) src0 + i01*nb01 + i11*nb02 + i12*nb03);
dst_row[i00] = ggml_cuda_cast<dst_t>(src0_row[i00]);
}
template<typename grad_t, typename dst_t>
@@ -98,7 +100,7 @@ static void get_rows_cuda_q(
cudaStream_t stream) {
const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
const int block_num_y = (ne00 + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
const dim3 block_nums(ne10, block_num_y, ne11*ne12);
const dim3 block_nums(ne10, MIN(block_num_y, UINT16_MAX), MIN(ne11*ne12, UINT16_MAX));
// strides in elements
// const size_t s0 = nb0 / sizeof(dst_t);
@@ -116,7 +118,7 @@ static void get_rows_cuda_q(
k_get_rows<qk, qr, dq><<<block_nums, block_dims, 0, stream>>>(
src0_d, src1_d, dst_d,
ne00, /*ne01, ne02, ne03,*/
/*ne10, ne11,*/ ne12, /*ne13,*/
/*ne10,*/ ne11, ne12, /*ne13,*/
/* s0,*/ s1, s2, s3,
/* nb00,*/ nb01, nb02, nb03,
s10, s11, s12/*, s13*/);
@@ -131,7 +133,7 @@ static void get_rows_cuda_float(
cudaStream_t stream) {
const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
const int block_num_y = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE;
const dim3 block_nums(ne10, block_num_y, ne11*ne12);
const dim3 block_nums(ne10, MIN(block_num_y, UINT16_MAX), MIN(ne11*ne12, UINT16_MAX));
// strides in elements
// const size_t s0 = nb0 / sizeof(dst_t);
@@ -147,7 +149,7 @@ static void get_rows_cuda_float(
k_get_rows_float<<<block_nums, block_dims, 0, stream>>>(
src0_d, src1_d, dst_d,
ne00, /*ne01, ne02, ne03,*/
/*ne10, ne11,*/ ne12, /*ne13,*/
/*ne10,*/ ne11, ne12, /*ne13,*/
/* s0,*/ s1, s2, s3,
/* nb00,*/ nb01, nb02, nb03,
s10, s11, s12/*, s13*/);

View File

@@ -2109,6 +2109,11 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
ggml_cuda_mul_mat_q(ctx, src0, src1, ids, dst);
return;
}
if (ggml_cuda_should_use_mmf(src0->type, cc, WARP_SIZE, src0->ne, src1->ne[2])) {
ggml_cuda_mul_mat_f(ctx, src0, src1, ids, dst);
return;
}
}
cudaStream_t stream = ctx.stream();
@@ -2452,6 +2457,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_IM2COL:
ggml_cuda_op_im2col(ctx, dst);
break;
case GGML_OP_IM2COL_3D:
ggml_cuda_op_im2col_3d(ctx, dst);
break;
case GGML_OP_CONV_2D:
ggml_cuda_op_conv2d(ctx, dst);
break;
@@ -3132,6 +3140,7 @@ static const ggml_backend_i ggml_backend_cuda_interface = {
/* .graph_compute = */ ggml_backend_cuda_graph_compute,
/* .event_record = */ ggml_backend_cuda_event_record,
/* .event_wait = */ ggml_backend_cuda_event_wait,
/* .optimize_graph = */ NULL,
};
static ggml_guid_t ggml_backend_cuda_guid() {
@@ -3201,6 +3210,7 @@ struct ggml_backend_cuda_device_context {
int device;
std::string name;
std::string description;
std::string pci_bus_id;
};
static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
@@ -3225,9 +3235,12 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
}
static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
props->name = ggml_backend_cuda_device_get_name(dev);
props->description = ggml_backend_cuda_device_get_description(dev);
props->type = ggml_backend_cuda_device_get_type(dev);
props->device_id = ctx->pci_bus_id.empty() ? nullptr : ctx->pci_bus_id.c_str();
ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);
bool host_buffer = getenv("GGML_CUDA_NO_PINNED") == nullptr;
@@ -3458,6 +3471,12 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_IQ4_NL) {
return true;
}
if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_I32) {
return true;
}
if (src0_type == GGML_TYPE_I32 && src1_type == GGML_TYPE_F32) {
return true;
}
if (src0_type == src1_type && ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1])) {
return true;
}
@@ -3559,6 +3578,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
return op->src[0]->nb[0] == ggml_type_size(op->src[0]->type) && ggml_is_contiguous_2(op->src[0]);
}
case GGML_OP_IM2COL:
case GGML_OP_IM2COL_3D:
case GGML_OP_CONV_2D:
case GGML_OP_CONV_2D_DW:
case GGML_OP_CONV_TRANSPOSE_2D:
@@ -3570,9 +3590,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_SUM_ROWS:
case GGML_OP_MEAN:
case GGML_OP_GROUP_NORM:
case GGML_OP_PAD:
return ggml_is_contiguous(op->src[0]);
case GGML_OP_UPSCALE:
case GGML_OP_PAD:
case GGML_OP_PAD_REFLECT_1D:
case GGML_OP_ARANGE:
case GGML_OP_TIMESTEP_EMBEDDING:
@@ -3788,6 +3808,10 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
dev_ctx->description = prop.name;
char pci_bus_id[16] = {};
snprintf(pci_bus_id, sizeof(pci_bus_id), "%04x:%02x:%02x.0", prop.pciDomainID, prop.pciBusID, prop.pciDeviceID);
dev_ctx->pci_bus_id = pci_bus_id;
ggml_backend_dev_t dev = new ggml_backend_device {
/* .iface = */ ggml_backend_cuda_device_interface,
/* .reg = */ &reg,

View File

@@ -112,3 +112,153 @@ void ggml_cuda_op_im2col(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
im2col_cuda_f32(src1_d, (float *) dst_d, IW, IH, OW, OH, KW, KH, IC, N, IC_IH_IW, IH_IW, s0, s1, p0, p1, d0, d1, stream);
}
}
// [N*IC, ID, IH, IW] => [N*OD, OH, OW, IC * KD * KH * KW]
template <typename T>
static __global__ void im2col_3d_kernel(
const float * src, T * dst,
int64_t N, int64_t IC, int64_t ID, int64_t IH, int64_t IW, int64_t OC,
int64_t KD, int64_t KH, int64_t KW, int64_t OD, int64_t OH, int64_t OW,
int64_t OH_OW, int64_t KD_KH_KW, int64_t ID_IH_IW, int64_t KH_KW, int64_t IH_IW, int64_t IC_ID_IH_IW,
int64_t IC_KD_KH_KW, int64_t OW_KD_KH_KW, int64_t OD_OH_OW_IC_KD_KH_KW, int64_t OH_OW_IC_KD_KH_KW,
int64_t OW_IC_KD_KH_KW, int64_t N_OD_OH, int64_t OD_OH,
int64_t stride_q, int64_t stride_z, int64_t stride_y, int64_t stride_x,
int s0, int s1, int s2, int p0, int p1, int p2, int d0, int d1, int d2) {
const int64_t i = threadIdx.x + blockIdx.x * blockDim.x;
if (i >= IC_KD_KH_KW) {
return;
}
GGML_UNUSED(N); GGML_UNUSED(OC); GGML_UNUSED(OH_OW); GGML_UNUSED(OD); GGML_UNUSED(OW); GGML_UNUSED(KD); GGML_UNUSED(KH);
GGML_UNUSED(ID_IH_IW); GGML_UNUSED(IH_IW); GGML_UNUSED(IC_ID_IH_IW); GGML_UNUSED(OW_KD_KH_KW);
const int64_t iic = i / KD_KH_KW;
const int64_t ikd = (i - iic * KD_KH_KW) / KH_KW;
const int64_t ikh = (i - iic * KD_KH_KW - ikd * KH_KW) / KW;
const int64_t ikw = i % KW;
const int64_t iow = blockIdx.y;
for (int64_t iz = blockIdx.z; iz < N_OD_OH; iz+=MAX_GRIDDIM_Z) {
const int64_t in = iz / OD_OH;
const int64_t iod = (iz - in*OD_OH) / OH;
const int64_t ioh = iz % OH;
const int64_t iiw = iow * s0 + ikw * d0 - p0;
const int64_t iih = ioh * s1 + ikh * d1 - p1;
const int64_t iid = iod * s2 + ikd * d2 - p2;
const int64_t offset_dst = in*OD_OH_OW_IC_KD_KH_KW + iod*OH_OW_IC_KD_KH_KW + ioh*OW_IC_KD_KH_KW + iow*IC_KD_KH_KW + iic*KD_KH_KW + ikd * KH_KW + ikh*KW + ikw;
if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW || iid < 0 || iid >= ID) {
dst[offset_dst] = 0.0f;
} else {
const int64_t offset_src = ((in * IC + iic) * stride_q) + (iid * stride_z) + (iih * stride_y) + (iiw * stride_x);
dst[offset_dst] = src[offset_src];
}
}
}
// [N*IC, ID, IH, IW] => [N*OD, OH, OW, IC * KD * KH * KW]
template <typename T>
static void im2col_3d_cuda(const float * src, T* dst,
int64_t N, int64_t IC, int64_t ID, int64_t IH, int64_t IW, int64_t OC,
int64_t KD, int64_t KH, int64_t KW, int64_t OD, int64_t OH, int64_t OW,
int64_t stride_q, int64_t stride_z, int64_t stride_y, int64_t stride_x,
int s0, int s1, int s2, int p0, int p1, int p2, int d0, int d1, int d2, cudaStream_t stream) {
const int64_t OH_OW = OH*OW;
const int64_t KD_KH_KW = KD*KH*KW;
const int64_t ID_IH_IW = ID*IH*IW;
const int64_t KH_KW = KH*KW;
const int64_t IH_IW = IH*IW;
const int64_t IC_KD_KH_KW = IC*KD*KH*KW;
const int64_t OW_KD_KH_KW = OW*KD*KH*KW;
const int64_t N_OD_OH = N*OD*OH;
const int64_t OD_OH = OD*OH;
const int64_t IC_ID_IH_IW = IC*ID*IH*IW;
const int64_t OD_OH_OW_IC_KD_KH_KW = OD*OH*OW*IC*KD*KH*KW;
const int64_t OH_OW_IC_KD_KH_KW = OH*OW*IC*KD*KH*KW;
const int64_t OW_IC_KD_KH_KW = OW*IC*KD*KH*KW;
const int64_t num_blocks = (IC_KD_KH_KW + CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE;
dim3 block_nums(num_blocks, OW, MIN(N_OD_OH, MAX_GRIDDIM_Z));
im2col_3d_kernel<<<block_nums, MIN(IC_KD_KH_KW, CUDA_IM2COL_BLOCK_SIZE) , 0, stream>>>(src, dst, N, IC, ID, IH, IW, OC, KD, KH, KW, OD, OH, OW,
OH_OW, KD_KH_KW, ID_IH_IW, KH_KW, IH_IW, IC_ID_IH_IW,
IC_KD_KH_KW, OW_KD_KH_KW, OD_OH_OW_IC_KD_KH_KW,
OH_OW_IC_KD_KH_KW, OW_IC_KD_KH_KW, N_OD_OH, OD_OH,
stride_q, stride_z, stride_y, stride_x,
s0, s1, s2, p0, p1, p2, d0, d1, d2);
}
static void im2col_3d_cuda_f16(const float * src, half * dst,
int64_t N, int64_t IC, int64_t ID, int64_t IH, int64_t IW, int64_t OC,
int64_t KD, int64_t KH, int64_t KW, int64_t OD, int64_t OH, int64_t OW,
int64_t stride_q, int64_t stride_z, int64_t stride_y, int64_t stride_x,
int s0, int s1, int s2, int p0, int p1, int p2, int d0, int d1, int d2, cudaStream_t stream) {
im2col_3d_cuda<half>(src, dst, N, IC, ID, IH, IW, OC, KD, KH, KW, OD, OH, OW,
stride_q, stride_z, stride_y, stride_x,
s0, s1, s2, p0, p1, p2, d0, d1, d2, stream);
}
static void im2col_3d_cuda_f32(const float * src, float * dst,
int64_t N, int64_t IC, int64_t ID, int64_t IH, int64_t IW, int64_t OC,
int64_t KD, int64_t KH, int64_t KW, int64_t OD, int64_t OH, int64_t OW,
int64_t stride_q, int64_t stride_z, int64_t stride_y, int64_t stride_x,
int s0, int s1, int s2, int p0, int p1, int p2, int d0, int d1, int d2, cudaStream_t stream) {
im2col_3d_cuda<float>(src, dst, N, IC, ID, IH, IW, OC, KD, KH, KW, OD, OH, OW,
stride_q, stride_z, stride_y, stride_x,
s0, s1, s2, p0, p1, p2, d0, d1, d2, stream);
}
void ggml_cuda_op_im2col_3d(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const ggml_tensor * src1 = dst->src[1];
const float * src1_d = (const float *)src1->data;
float * dst_d = (float *)dst->data;
cudaStream_t stream = ctx.stream();
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
GGML_TENSOR_BINARY_OP_LOCALS
const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
const int32_t s1 = ((const int32_t *)(dst->op_params))[1];
const int32_t s2 = ((const int32_t *)(dst->op_params))[2];
const int32_t p0 = ((const int32_t *)(dst->op_params))[3];
const int32_t p1 = ((const int32_t *)(dst->op_params))[4];
const int32_t p2 = ((const int32_t *)(dst->op_params))[5];
const int32_t d0 = ((const int32_t *)(dst->op_params))[6];
const int32_t d1 = ((const int32_t *)(dst->op_params))[7];
const int32_t d2 = ((const int32_t *)(dst->op_params))[8];
const int32_t IC = ((const int32_t *)(dst->op_params))[9];
const int64_t N = ne13 / IC;
const int64_t ID = ne12;
const int64_t IH = ne11;
const int64_t IW = ne10;
const int64_t OC = ne03 / IC;
const int64_t KD = ne02;
const int64_t KH = ne01;
const int64_t KW = ne00;
const int64_t OD = ne3 / N;
const int64_t OH = ne2;
const int64_t OW = ne1;
const size_t es = ggml_element_size(src1);
const int64_t stride_x = src1->nb[0] / es;
const int64_t stride_y = src1->nb[1] / es;
const int64_t stride_z = src1->nb[2] / es;
const int64_t stride_q = src1->nb[3] / es;
if(dst->type == GGML_TYPE_F16) {
im2col_3d_cuda_f16(src1_d, (half *) dst_d, N, IC, ID, IH, IW, OC, KD, KH, KW, OD, OH, OW,
stride_q, stride_z, stride_y, stride_x,
s0, s1, s2, p0, p1, p2, d0, d1, d2, stream);
} else {
im2col_3d_cuda_f32(src1_d, (float *) dst_d, N, IC, ID, IH, IW, OC, KD, KH, KW, OD, OH, OW,
stride_q, stride_z, stride_y, stride_x,
s0, s1, s2, p0, p1, p2, d0, d1, d2, stream);
}
}

View File

@@ -3,3 +3,4 @@
#define CUDA_IM2COL_BLOCK_SIZE 256
void ggml_cuda_op_im2col(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_im2col_3d(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

View File

@@ -1,3 +1,4 @@
#pragma once
// This file contains primitives that expose the tensor core PTX instructions for CUDA code.
// The primitives can be used in a similar way as the nvcuda::wmma interface but with a well-defined memory layout.
// The documentation for the PTX instructions can be found under:

View File

@@ -1,343 +1,12 @@
#include "ggml.h"
#include "common.cuh"
#include "mma.cuh"
#include "mmf.cuh"
using namespace ggml_cuda_mma;
#define MMF_ROWS_PER_BLOCK 32
template <typename T, int rows_per_block, int cols_per_block, int nwarps>
__launch_bounds__(ggml_cuda_get_physical_warp_size()*nwarps, 1)
static __global__ void mul_mat_f(
const T * __restrict__ x, const float * __restrict__ y, const int32_t * __restrict__ ids, float * __restrict__ dst,
const int ncols, const int nchannels_y, const int stride_row, const int stride_col_y, const int stride_col_dst,
const int channel_ratio, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst,
const int sample_ratio, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst) {
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
typedef tile<16, 8, T> tile_A;
typedef tile< 8, 8, T> tile_B;
typedef tile<16, 8, float> tile_C;
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
constexpr int tile_k_padded = warp_size + 4;
constexpr int ntA = rows_per_block / tile_A::I;
constexpr int ntB = (cols_per_block + tile_B::I - 1) / tile_B::I;
const int row0 = blockIdx.x * rows_per_block;
const int channel_dst = blockIdx.y;
const int channel_x = channel_dst / channel_ratio;
const int channel_y = channel_dst;
const int sample_dst = blockIdx.z;
const int sample_x = sample_dst / sample_ratio;
const int sample_y = sample_dst;
x += int64_t(sample_x) *stride_sample_x + channel_x *stride_channel_x + row0*stride_row ;
y += int64_t(sample_y) *stride_sample_y + channel_y *stride_channel_y;
dst += int64_t(sample_dst)*stride_sample_dst + channel_dst*stride_channel_dst;
const float2 * y2 = (const float2 *) y;
extern __shared__ char data_mmv[];
tile_C C[ntA][ntB];
T * tile_xy = (T *) data_mmv + threadIdx.y*(tile_A::I * tile_k_padded);
for (int col = threadIdx.y*warp_size + threadIdx.x; col < ncols; col += nwarps*warp_size) {
tile_A A[ntA][warp_size / tile_A::J];
#pragma unroll
for (int itA = 0; itA < ntA; ++itA) {
#pragma unroll
for (int i = 0; i < tile_A::I; ++i) {
tile_xy[i*tile_k_padded + threadIdx.x] = x[(itA*tile_A::I + i)*stride_row + col];
}
#pragma unroll
for (int k0 = 0; k0 < warp_size; k0 += tile_A::J) {
load_ldmatrix(A[itA][k0/tile_A::J], tile_xy + k0, tile_k_padded);
}
}
#pragma unroll
for (int itB = 0; itB < ntB; ++itB) {
if constexpr (std::is_same_v<T, float>) {
#pragma unroll
for (int j0 = 0; j0 < tile_B::I; ++j0) {
const int j = j0 + itB*tile_B::I;
tile_xy[j0*tile_k_padded + threadIdx.x] = j < cols_per_block ? y[j*stride_col_y + col] : 0.0f;
}
} else if constexpr (std::is_same_v<T, half2> || std::is_same_v<T, nv_bfloat162>) {
#pragma unroll
for (int j0 = 0; j0 < tile_B::I; ++j0) {
const int j = j0 + itB*tile_B::I;
const float2 tmp = j < cols_per_block ? y2[j*stride_col_y + col] : make_float2(0.0f, 0.0f);
tile_xy[j0*tile_k_padded + threadIdx.x] = {tmp.x, tmp.y};
}
} else {
static_assert(std::is_same_v<T, void>, "unsupported type");
}
#pragma unroll
for (int k0 = 0; k0 < warp_size; k0 += tile_B::J) {
tile_B B;
load_ldmatrix(B, tile_xy + k0, tile_k_padded);
#pragma unroll
for (int itA = 0; itA < ntA; ++itA) {
mma(C[itA][itB], A[itA][k0/tile_B::J], B);
}
}
}
}
float * buf_iw = (float *) data_mmv;
constexpr int kiw = nwarps*rows_per_block + 4;
if (nwarps > 1) {
__syncthreads();
}
#pragma unroll
for (int itB = 0; itB < ntB; ++itB) {
#pragma unroll
for (int itA = 0; itA < ntA; ++itA) {
#pragma unroll
for (int l = 0; l < tile_C::ne; ++l) {
const int i = threadIdx.y*rows_per_block + itA*tile_C::I + tile_C::get_i(l);
const int j = itB*tile_C::J + tile_C::get_j(l);
buf_iw[j*kiw + i] = C[itA][itB].x[l];
}
}
}
if (nwarps > 1) {
__syncthreads();
}
#pragma unroll
for (int j0 = 0; j0 < cols_per_block; j0 += nwarps) {
const int j = j0 + threadIdx.y;
if (j0 + nwarps > cols_per_block && j >= cols_per_block) {
return;
}
float sum = 0.0f;
static_assert(rows_per_block == warp_size, "need loop/check");
#pragma unroll
for (int i0 = 0; i0 < nwarps*rows_per_block; i0 += rows_per_block) {
const int i = i0 + threadIdx.x;
sum += buf_iw[j*kiw + i];
}
dst[j*stride_col_dst + row0 + threadIdx.x] = sum;
}
#else
GGML_UNUSED_VARS(x, y, ids, dst,
ncols, nchannels_y, stride_row, stride_col_y, stride_col_dst,
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
NO_DEVICE_CODE;
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
}
template <typename T, int cols_per_block>
static void mul_mat_f_cuda(
const T * x, const float * y, const int32_t * ids, float * dst,
const int64_t ncols_x, const int64_t nrows_x,
const int64_t stride_row, const int64_t stride_col_y, const int64_t stride_col_dst,
const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst,
const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x,
const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst,
cudaStream_t stream) {
typedef tile<16, 8, T> tile_A;
typedef tile< 8, 8, T> tile_B;
GGML_ASSERT(!ids && "mul_mat_id not implemented");
GGML_ASSERT(ncols_x % 2 == 0);
GGML_ASSERT(stride_row % 2 == 0);
GGML_ASSERT(stride_col_y % 2 == 0);
GGML_ASSERT(ids || nchannels_dst % nchannels_x == 0);
GGML_ASSERT( nsamples_dst % nsamples_x == 0);
const int64_t channel_ratio = nchannels_dst / nchannels_x;
const int64_t sample_ratio = nsamples_dst / nsamples_x;
const int device = ggml_cuda_get_device();
const int warp_size = ggml_cuda_info().devices[device].warp_size;
int64_t nwarps_best = 1;
int64_t niter_best = (ncols_x + warp_size*2 - 1) / (warp_size*2);
int64_t max_block_size = 256;
for (int64_t nwarps = 2; nwarps <= max_block_size/warp_size; nwarps++) {
const int64_t niter = (ncols_x + nwarps*warp_size*2 - 1) / (nwarps*warp_size*2);
if (niter < niter_best) {
niter_best = niter;
nwarps_best = nwarps;
}
}
constexpr int rows_per_block = MMF_ROWS_PER_BLOCK;
const int nbytes_shared_iter = nwarps_best * tile_A::I * (warp_size + 4) * 4;
const int nbytes_shared_combine = GGML_PAD(cols_per_block, tile_B::I) * (nwarps_best*rows_per_block + 4) * 4;
const int nbytes_shared = std::max(nbytes_shared_iter, nbytes_shared_combine);
const dim3 block_nums(nrows_x/rows_per_block, nchannels_dst, nsamples_dst);
const dim3 block_dims(warp_size, nwarps_best, 1);
switch (nwarps_best) {
case 1: {
mul_mat_f<T, rows_per_block, cols_per_block, 1><<<block_nums, block_dims, nbytes_shared, stream>>>
(x, y, ids, dst, ncols_x, nchannels_y, stride_row, stride_col_y, stride_col_dst,
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
} break;
case 2: {
mul_mat_f<T, rows_per_block, cols_per_block, 2><<<block_nums, block_dims, nbytes_shared, stream>>>
(x, y, ids, dst, ncols_x, nchannels_y, stride_row, stride_col_y, stride_col_dst,
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
} break;
case 3: {
mul_mat_f<T, rows_per_block, cols_per_block, 3><<<block_nums, block_dims, nbytes_shared, stream>>>
(x, y, ids, dst, ncols_x, nchannels_y, stride_row, stride_col_y, stride_col_dst,
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
} break;
case 4: {
mul_mat_f<T, rows_per_block, cols_per_block, 4><<<block_nums, block_dims, nbytes_shared, stream>>>
(x, y, ids, dst, ncols_x, nchannels_y, stride_row, stride_col_y, stride_col_dst,
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
} break;
case 5: {
mul_mat_f<T, rows_per_block, cols_per_block, 5><<<block_nums, block_dims, nbytes_shared, stream>>>
(x, y, ids, dst, ncols_x, nchannels_y, stride_row, stride_col_y, stride_col_dst,
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
} break;
case 6: {
mul_mat_f<T, rows_per_block, cols_per_block, 6><<<block_nums, block_dims, nbytes_shared, stream>>>
(x, y, ids, dst, ncols_x, nchannels_y, stride_row, stride_col_y, stride_col_dst,
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
} break;
case 7: {
mul_mat_f<T, rows_per_block, cols_per_block, 7><<<block_nums, block_dims, nbytes_shared, stream>>>
(x, y, ids, dst, ncols_x, nchannels_y, stride_row, stride_col_y, stride_col_dst,
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
} break;
case 8: {
mul_mat_f<T, rows_per_block, cols_per_block, 8><<<block_nums, block_dims, nbytes_shared, stream>>>
(x, y, ids, dst, ncols_x, nchannels_y, stride_row, stride_col_y, stride_col_dst,
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
} break;
default: {
GGML_ABORT("fatal error");
} break;
}
}
template <typename T>
static void mul_mat_f_switch_cols_per_block(
const T * x, const float * y, const int32_t * ids, float * dst,
const int64_t ncols_x, const int64_t nrows_x, const int64_t ncols_dst,
const int64_t stride_row, const int64_t stride_col_y, const int64_t stride_col_dst,
const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst,
const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x,
const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst,
cudaStream_t stream) {
switch (ncols_dst) {
case 1: {
mul_mat_f_cuda<T, 1>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
} break;
case 2: {
mul_mat_f_cuda<T, 2>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
} break;
case 3: {
mul_mat_f_cuda<T, 3>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
} break;
case 4: {
mul_mat_f_cuda<T, 4>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
} break;
case 5: {
mul_mat_f_cuda<T, 5>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
} break;
case 6: {
mul_mat_f_cuda<T, 6>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
} break;
case 7: {
mul_mat_f_cuda<T, 7>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
} break;
case 8: {
mul_mat_f_cuda<T, 8>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
} break;
case 9: {
mul_mat_f_cuda<T, 9>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
} break;
case 10: {
mul_mat_f_cuda<T, 10>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
} break;
case 11: {
mul_mat_f_cuda<T, 11>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
} break;
case 12: {
mul_mat_f_cuda<T, 12>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
} break;
case 13: {
mul_mat_f_cuda<T, 13>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
} break;
case 14: {
mul_mat_f_cuda<T, 14>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
} break;
case 15: {
mul_mat_f_cuda<T, 15>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
} break;
case 16: {
mul_mat_f_cuda<T, 16>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
} break;
default: {
GGML_ABORT("fatal error");
} break;
}
}
void ggml_cuda_mul_mat_f(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) {
GGML_ASSERT( src1->type == GGML_TYPE_F32);
GGML_ASSERT(!ids || ids->type == GGML_TYPE_I32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
GGML_TENSOR_BINARY_OP_LOCALS;
const size_t ts_src0 = ggml_type_size(src0->type);
@@ -365,55 +34,72 @@ void ggml_cuda_mul_mat_f(ggml_backend_cuda_context & ctx, const ggml_tensor * sr
const int64_t s13 = src1->nb[3] / ts_src1;
const int64_t s3 = dst->nb[3] / ts_dst;
const int64_t ids_s0 = ids ? ids->nb[0] / ggml_type_size(ids->type) : 0;
const int64_t ids_s1 = ids ? ids->nb[1] / ggml_type_size(ids->type) : 0;
// For MUL_MAT_ID the memory layout is different than for MUL_MAT:
const int64_t ncols_dst = ids ? ne2 : ne1;
const int64_t nchannels_y = ids ? ne11 : ne12;
const int64_t nchannels_dst = ids ? ne1 : ne2;
const int64_t stride_channel_dst = ids ? s1 : s2;
const int64_t stride_channel_y = ids ? s11 : s12;
const int64_t nchannels_dst = ids ? ne1 : ne2;
GGML_ASSERT(!ids || ncols_dst == 1);
const int64_t stride_col_dst = ids ? s2 : s1;
const int64_t stride_col_y = ids ? s12 : s11;
const int64_t stride_channel_dst = ids ? s1 : s2;
int64_t stride_channel_y = ids ? s11 : s12;
int64_t nchannels_y = ids ? ne11 : ne12;
//mul_mat_id: handle broadcast
if (ids && nchannels_y == 1) {
stride_channel_y = 0;
nchannels_y = ids->ne[0];
}
switch (src0->type) {
case GGML_TYPE_F32: {
const float * src0_d = (const float *) src0->data;
constexpr int vals_per_T = 1;
mul_mat_f_switch_cols_per_block(
src0_d, src1_d, ids_d, dst_d, ne00/vals_per_T, ne01, ncols_dst, s01/vals_per_T, s11/vals_per_T, s1,
ne02, nchannels_y, nchannels_dst, s02/vals_per_T, stride_channel_y, stride_channel_dst,
ne03, ne3, s03/vals_per_T, s13, s3, ctx.stream());
src0_d, src1_d, ids_d, dst_d, ne00/vals_per_T, ne01, ncols_dst, s01/vals_per_T, stride_col_y/vals_per_T, stride_col_dst,
ids_s0, ids_s1, ne02, nchannels_y, nchannels_dst, s02/vals_per_T, stride_channel_y, stride_channel_dst,
ne03, ne3, s03/vals_per_T, s13, s3, ctx.stream());
} break;
case GGML_TYPE_F16: {
const half2 * src0_d = (const half2 *) src0->data;
constexpr int vals_per_T = 2;
mul_mat_f_switch_cols_per_block(
src0_d, src1_d, ids_d, dst_d, ne00/vals_per_T, ne01, ncols_dst, s01/vals_per_T, s11/vals_per_T, s1,
ne02, nchannels_y, nchannels_dst, s02/vals_per_T, stride_channel_y, stride_channel_dst,
ne03, ne3, s03/vals_per_T, s13, s3, ctx.stream());
src0_d, src1_d, ids_d, dst_d, ne00/vals_per_T, ne01, ncols_dst, s01/vals_per_T, stride_col_y/vals_per_T, stride_col_dst,
ids_s0, ids_s1, ne02, nchannels_y, nchannels_dst, s02/vals_per_T, stride_channel_y, stride_channel_dst,
ne03, ne3, s03/vals_per_T, s13, s3, ctx.stream());
} break;
case GGML_TYPE_BF16: {
const nv_bfloat162 * src0_d = (const nv_bfloat162 *) src0->data;
constexpr int vals_per_T = 2;
mul_mat_f_switch_cols_per_block(
src0_d, src1_d, ids_d, dst_d, ne00/vals_per_T, ne01, ncols_dst, s01/vals_per_T, s11/vals_per_T, s1,
ne02, nchannels_y, nchannels_dst, s02/vals_per_T, stride_channel_y, stride_channel_dst,
ne03, ne3, s03/vals_per_T, s13, s3, ctx.stream());
src0_d, src1_d, ids_d, dst_d, ne00/vals_per_T, ne01, ncols_dst, s01/vals_per_T, stride_col_y/vals_per_T, stride_col_dst,
ids_s0, ids_s1, ne02, nchannels_y, nchannels_dst, s02/vals_per_T, stride_channel_y, stride_channel_dst,
ne03, ne3, s03/vals_per_T, s13, s3, ctx.stream());
} break;
default:
GGML_ABORT("unsupported type: %s", ggml_type_name(src0->type));
}
}
bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const int64_t * src0_ne, int64_t ne11) {
bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const int64_t * src0_ne, const int src1_ncols) {
if (ggml_is_quantized(type)) {
return false;
}
if (src0_ne[0] % (warp_size * (4/ggml_type_size(type))) != 0) {
return false;
}
if (src0_ne[1] % MMF_ROWS_PER_BLOCK != 0) {
return false;
}
if (ne11 > 16) {
if (src1_ncols > 16) {
return false;
}
switch (type) {
case GGML_TYPE_F32:
return ampere_mma_available(cc);

View File

@@ -1,5 +1,461 @@
#pragma once
#include "mma.cuh"
#include "common.cuh"
using namespace ggml_cuda_mma;
#define MMF_ROWS_PER_BLOCK 32
void ggml_cuda_mul_mat_f(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst);
bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const int64_t * scr0_ne, int64_t ne11);
bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const int64_t * scr0_ne, const int src1_ncols);
template <typename T, int rows_per_block, int cols_per_block, int nwarps, bool has_ids>
__launch_bounds__(ggml_cuda_get_physical_warp_size()*nwarps, 1)
static __global__ void mul_mat_f(
const T * __restrict__ x, const float * __restrict__ y, const int32_t * __restrict__ ids, float * __restrict__ dst,
const int ncols, const int nchannels_dst, const int stride_row, const int stride_col_y, const int stride_col_dst,
const int stride_col_id, const int stride_row_id,
const int channel_ratio, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst,
const int sample_ratio, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst) {
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
typedef tile<16, 8, T> tile_A;
typedef tile< 8, 8, T> tile_B;
typedef tile<16, 8, float> tile_C;
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
constexpr int tile_k_padded = warp_size + 4;
constexpr int ntA = rows_per_block / tile_A::I;
constexpr int ntB = (cols_per_block + tile_B::I - 1) / tile_B::I;
const int row0 = blockIdx.x * rows_per_block;
const int expert_idx = has_ids ? blockIdx.y : 0;
const int channel_dst = has_ids ? 0 : blockIdx.y;
const int channel_x = has_ids ? expert_idx : (channel_dst / channel_ratio);
const int channel_y = channel_dst;
const int sample_dst = blockIdx.z;
const int sample_x = sample_dst / sample_ratio;
const int sample_y = sample_dst;
x += int64_t(sample_x) *stride_sample_x + channel_x *stride_channel_x + row0*stride_row ;
y += int64_t(sample_y) *stride_sample_y + (has_ids ? 0 : channel_y *stride_channel_y);
dst += int64_t(sample_dst)*stride_sample_dst + (has_ids ? 0 : channel_dst*stride_channel_dst);
const float2 * y2 = (const float2 *) y;
extern __shared__ char data_mmv[];
char * shmem_base = data_mmv;
int * slot_map = (int *) shmem_base;
char * compute_base = has_ids ? (shmem_base + GGML_PAD(cols_per_block, 16) * sizeof(int)) : shmem_base;
tile_C C[ntA][ntB];
T * tile_xy = (T *) compute_base + threadIdx.y*(tile_A::I * tile_k_padded);
if constexpr (has_ids) {
int found = 0;
for (int j0 = 0; j0 < cols_per_block; j0 += nwarps) {
const int j = j0 + threadIdx.y;
const int32_t * __restrict__ id_row = ids + j*stride_row_id;
if (threadIdx.x == 0) {
slot_map[j] = -1;
}
for (int k = threadIdx.x; k < nchannels_dst; k += warp_size) {
int match = id_row[k*stride_col_id] == expert_idx;
if (match) {
slot_map[j] = k;
found = 1;
break;
}
}
}
if (!__syncthreads_or(found)) {
return;
}
}
for (int col = threadIdx.y*warp_size + threadIdx.x; col < ncols; col += nwarps*warp_size) {
tile_A A[ntA][warp_size / tile_A::J];
#pragma unroll
for (int itA = 0; itA < ntA; ++itA) {
#pragma unroll
for (int i = 0; i < tile_A::I; ++i) {
tile_xy[i*tile_k_padded + threadIdx.x] = x[(itA*tile_A::I + i)*stride_row + col];
}
#pragma unroll
for (int k0 = 0; k0 < warp_size; k0 += tile_A::J) {
load_ldmatrix(A[itA][k0/tile_A::J], tile_xy + k0, tile_k_padded);
}
}
#pragma unroll
for (int itB = 0; itB < ntB; ++itB) {
if constexpr (std::is_same_v<T, float>) {
#pragma unroll
for (int j0 = 0; j0 < tile_B::I; ++j0) {
const int j = j0 + itB*tile_B::I;
if constexpr (!has_ids) {
tile_xy[j0*tile_k_padded + threadIdx.x] = j < cols_per_block ? y[j*stride_col_y + col] : 0.0f;
} else {
tile_xy[j0*tile_k_padded + threadIdx.x] = j < cols_per_block ? y[slot_map[j]*stride_channel_y + j*stride_col_y + col] : 0.0f;
}
}
} else if constexpr (std::is_same_v<T, half2> || std::is_same_v<T, nv_bfloat162>) {
#pragma unroll
for (int j0 = 0; j0 < tile_B::I; ++j0) {
const int j = j0 + itB*tile_B::I;
if constexpr (!has_ids) {
const float2 tmp = j < cols_per_block ? y2[j*stride_col_y + col] : make_float2(0.0f, 0.0f);
tile_xy[j0*tile_k_padded + threadIdx.x] = {tmp.x, tmp.y};
} else {
float2 tmp = j < cols_per_block && slot_map[j] >= 0 ? *(const float2*) &y[slot_map[j]*stride_channel_y + 2*(j*stride_col_y + col)] : make_float2(0.0f, 0.0f);
tile_xy[j0*tile_k_padded + threadIdx.x] = {tmp.x, tmp.y};
}
}
} else {
static_assert(std::is_same_v<T, void>, "unsupported type");
}
#pragma unroll
for (int k0 = 0; k0 < warp_size; k0 += tile_B::J) {
tile_B B;
load_ldmatrix(B, tile_xy + k0, tile_k_padded);
#pragma unroll
for (int itA = 0; itA < ntA; ++itA) {
mma(C[itA][itB], A[itA][k0/tile_B::J], B);
}
}
}
}
float * buf_iw = (float *) compute_base;
constexpr int kiw = nwarps*rows_per_block + 4;
if (nwarps > 1) {
__syncthreads();
}
#pragma unroll
for (int itB = 0; itB < ntB; ++itB) {
#pragma unroll
for (int itA = 0; itA < ntA; ++itA) {
#pragma unroll
for (int l = 0; l < tile_C::ne; ++l) {
const int i = threadIdx.y*rows_per_block + itA*tile_C::I + tile_C::get_i(l);
const int j = itB*tile_C::J + tile_C::get_j(l);
buf_iw[j*kiw + i] = C[itA][itB].x[l];
}
}
}
if (nwarps > 1) {
__syncthreads();
}
#pragma unroll
for (int j0 = 0; j0 < cols_per_block; j0 += nwarps) {
const int j = j0 + threadIdx.y;
if (j0 + nwarps > cols_per_block && j >= cols_per_block) {
return;
}
float sum = 0.0f;
static_assert(rows_per_block == warp_size, "need loop/check");
#pragma unroll
for (int i0 = 0; i0 < nwarps*rows_per_block; i0 += rows_per_block) {
const int i = i0 + threadIdx.x;
sum += buf_iw[j*kiw + i];
}
if constexpr (!has_ids) {
dst[j*stride_col_dst + row0 + threadIdx.x] = sum;
} else {
const int slot = (j < cols_per_block) ? slot_map[j] : -1;
if (slot >= 0) {
dst[slot*stride_channel_dst + j*stride_col_dst + row0 + threadIdx.x] = sum;
}
}
}
#else
GGML_UNUSED_VARS(x, y, ids, dst,
ncols, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id,
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
NO_DEVICE_CODE;
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
}
template<typename T, int cols_per_block, int nwarps>
static inline void mul_mat_f_switch_ids(
const T * x, const float * y, const int32_t * ids, float * dst,
const int64_t ncols_x, const int64_t nchannels_dst,
const int64_t stride_row, const int64_t stride_col_y, const int64_t stride_col_dst,
const int64_t stride_col_id, const int64_t stride_row_id,
const int64_t channel_ratio, const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst,
const int64_t sample_ratio, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst,
const dim3 & block_nums, const dim3 & block_dims, const int nbytes_shared_total, cudaStream_t stream) {
if (ids) {
mul_mat_f<T, MMF_ROWS_PER_BLOCK, cols_per_block, nwarps, true><<<block_nums, block_dims, nbytes_shared_total, stream>>>
(x, y, ids, dst, ncols_x, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
} else {
mul_mat_f<T, MMF_ROWS_PER_BLOCK, cols_per_block, nwarps, false><<<block_nums, block_dims, nbytes_shared_total, stream>>>
(x, y, ids, dst, ncols_x, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
}
}
template <typename T, int cols_per_block>
void mul_mat_f_cuda(
const T * x, const float * y, const int32_t * ids, float * dst,
const int64_t ncols_x, const int64_t nrows_x, const int64_t stride_row, const int64_t stride_col_y, const int64_t stride_col_dst,
const int64_t stride_col_id, const int64_t stride_row_id,
const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst,
const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x,
const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst,
cudaStream_t stream) {
typedef tile<16, 8, T> tile_A;
typedef tile< 8, 8, T> tile_B;
GGML_ASSERT(ncols_x % 2 == 0);
GGML_ASSERT(stride_row % 2 == 0);
GGML_ASSERT(stride_col_y % 2 == 0);
GGML_ASSERT(ids || nchannels_dst % nchannels_x == 0);
GGML_ASSERT( nsamples_dst % nsamples_x == 0);
const int64_t channel_ratio = nchannels_dst / nchannels_x;
const int64_t sample_ratio = nsamples_dst / nsamples_x;
const int device = ggml_cuda_get_device();
const int warp_size = ggml_cuda_info().devices[device].warp_size;
int64_t nwarps_best = 1;
int64_t niter_best = (ncols_x + warp_size*2 - 1) / (warp_size*2);
int64_t max_block_size = 256;
for (int64_t nwarps = 2; nwarps <= max_block_size/warp_size; nwarps++) {
const int64_t niter = (ncols_x + nwarps*warp_size*2 - 1) / (nwarps*warp_size*2);
if (niter < niter_best) {
niter_best = niter;
nwarps_best = nwarps;
}
}
constexpr int rows_per_block = MMF_ROWS_PER_BLOCK;
const int nbytes_shared_iter = nwarps_best * tile_A::I * (warp_size + 4) * 4;
const int nbytes_shared_combine = GGML_PAD(cols_per_block, tile_B::I) * (nwarps_best*rows_per_block + 4) * 4;
const int nbytes_shared = std::max(nbytes_shared_iter, nbytes_shared_combine);
const int nbytes_slotmap = ids ? GGML_PAD(cols_per_block, 16) * sizeof(int) : 0;
const int nbytes_shared_total = nbytes_shared + nbytes_slotmap;
const int64_t grid_y = ids ? nchannels_x : nchannels_dst; // per expert when ids present
const dim3 block_nums(nrows_x/rows_per_block, grid_y, nsamples_dst);
const dim3 block_dims(warp_size, nwarps_best, 1);
switch (nwarps_best) {
case 1: {
mul_mat_f_switch_ids<T, cols_per_block, 1>(
x, y, ids, dst, ncols_x, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, block_nums, block_dims, nbytes_shared_total, stream);
} break;
case 2: {
mul_mat_f_switch_ids<T, cols_per_block, 2>(
x, y, ids, dst, ncols_x, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, block_nums, block_dims, nbytes_shared_total, stream);
} break;
case 3: {
mul_mat_f_switch_ids<T, cols_per_block, 3>(
x, y, ids, dst, ncols_x, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, block_nums, block_dims, nbytes_shared_total, stream);
} break;
case 4: {
mul_mat_f_switch_ids<T, cols_per_block, 4>(
x, y, ids, dst, ncols_x, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, block_nums, block_dims, nbytes_shared_total, stream);
} break;
case 5: {
mul_mat_f_switch_ids<T, cols_per_block, 5>(
x, y, ids, dst, ncols_x, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, block_nums, block_dims, nbytes_shared_total, stream);
} break;
case 6: {
mul_mat_f_switch_ids<T, cols_per_block, 6>(
x, y, ids, dst, ncols_x, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, block_nums, block_dims, nbytes_shared_total, stream);
} break;
case 7: {
mul_mat_f_switch_ids<T, cols_per_block, 7>(
x, y, ids, dst, ncols_x, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, block_nums, block_dims, nbytes_shared_total, stream);
} break;
case 8: {
mul_mat_f_switch_ids<T, cols_per_block, 8>(
x, y, ids, dst, ncols_x, nchannels_dst, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, block_nums, block_dims, nbytes_shared_total, stream);
} break;
default: {
GGML_ABORT("fatal error");
} break;
}
GGML_UNUSED_VARS(nchannels_y);
}
template <typename T>
static void mul_mat_f_switch_cols_per_block(
const T * x, const float * y, const int32_t * ids, float * dst,
const int64_t ncols_x, const int64_t nrows_x, const int64_t ncols_dst,
const int64_t stride_row, const int64_t stride_col_y, const int64_t stride_col_dst,
const int64_t stride_col_id, const int stride_row_id,
const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst,
const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x,
const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst,
cudaStream_t stream) {
switch (ncols_dst) {
case 1: {
mul_mat_f_cuda<T, 1>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
} break;
case 2: {
mul_mat_f_cuda<T, 2>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
} break;
case 3: {
mul_mat_f_cuda<T, 3>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
} break;
case 4: {
mul_mat_f_cuda<T, 4>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
} break;
case 5: {
mul_mat_f_cuda<T, 5>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
} break;
case 6: {
mul_mat_f_cuda<T, 6>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
} break;
case 7: {
mul_mat_f_cuda<T, 7>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
} break;
case 8: {
mul_mat_f_cuda<T, 8>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
} break;
case 9: {
mul_mat_f_cuda<T, 9>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
} break;
case 10: {
mul_mat_f_cuda<T, 10>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
} break;
case 11: {
mul_mat_f_cuda<T, 11>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
} break;
case 12: {
mul_mat_f_cuda<T, 12>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
} break;
case 13: {
mul_mat_f_cuda<T, 13>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
} break;
case 14: {
mul_mat_f_cuda<T, 14>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
} break;
case 15: {
mul_mat_f_cuda<T, 15>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
} break;
case 16: {
mul_mat_f_cuda<T, 16>(x, y, ids, dst, ncols_x, nrows_x, stride_row, stride_col_y, stride_col_dst,
stride_col_id, stride_row_id, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
} break;
default: {
GGML_ABORT("fatal error");
} break;
}
}
#define DECL_MMF_CASE_HELPER(T, ncols_dst) \
template void mul_mat_f_cuda<T, ncols_dst>( \
const T * x, const float * y, const int32_t * ids, float * dst, \
const int64_t ncols_x, const int64_t nrows_x, const int64_t stride_row, const int64_t stride_col_y, const int64_t stride_col_dst, \
const int64_t stride_col_id, const int64_t stride_row_id, \
const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst, \
const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x,\
const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst, \
cudaStream_t stream);
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
#define DECL_MMF_CASE_EXTERN(ncols_dst) \
extern DECL_MMF_CASE_HELPER(float, ncols_dst) \
extern DECL_MMF_CASE_HELPER(half2, ncols_dst) \
extern DECL_MMF_CASE_HELPER(nv_bfloat162, ncols_dst)
#define DECL_MMF_CASE(ncols_dst) \
DECL_MMF_CASE_HELPER(float, ncols_dst) \
DECL_MMF_CASE_HELPER(half2, ncols_dst) \
DECL_MMF_CASE_HELPER(nv_bfloat162, ncols_dst)
DECL_MMF_CASE_EXTERN(1);
DECL_MMF_CASE_EXTERN(2);
DECL_MMF_CASE_EXTERN(3);
DECL_MMF_CASE_EXTERN(4);
DECL_MMF_CASE_EXTERN(5);
DECL_MMF_CASE_EXTERN(6);
DECL_MMF_CASE_EXTERN(7);
DECL_MMF_CASE_EXTERN(8);
DECL_MMF_CASE_EXTERN(9);
DECL_MMF_CASE_EXTERN(10);
DECL_MMF_CASE_EXTERN(11);
DECL_MMF_CASE_EXTERN(12);
DECL_MMF_CASE_EXTERN(13);
DECL_MMF_CASE_EXTERN(14);
DECL_MMF_CASE_EXTERN(15);
DECL_MMF_CASE_EXTERN(16);
#else
#define DECL_MMF_CASE(ncols_dst)
#endif

View File

@@ -141,9 +141,10 @@ template <ggml_type type, int ncols_dst>
__launch_bounds__(calc_nwarps(ncols_dst, get_device_table_id())*ggml_cuda_get_physical_warp_size(), 1)
static __global__ void mul_mat_vec_q(
const void * __restrict__ vx, const void * __restrict__ vy, const int32_t * __restrict__ ids, float * __restrict__ dst,
const int ncols_x, const int nchannels_y, const int stride_row_x, const int stride_col_y, const int stride_col_dst,
const int channel_ratio, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst,
const int sample_ratio, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst) {
const uint32_t ncols_x, const uint3 nchannels_y, const uint32_t stride_row_x, const uint32_t stride_col_y,
const uint32_t stride_col_dst, const uint3 channel_ratio, const uint32_t stride_channel_x,
const uint32_t stride_channel_y, const uint32_t stride_channel_dst, const uint3 sample_ratio,
const uint32_t stride_sample_x, const uint32_t stride_sample_y, const uint32_t stride_sample_dst) {
constexpr int qk = ggml_cuda_type_traits<type>::qk;
constexpr int qi = ggml_cuda_type_traits<type>::qi;
@@ -161,12 +162,12 @@ static __global__ void mul_mat_vec_q(
constexpr int blocks_per_iter = vdr * nwarps*warp_size / qi;
// The MUL_MAT_ID code path with ids != nullptr is only implemented for ncols_dst == 1.
const int channel_dst = blockIdx.y;
const int channel_x = ncols_dst == 1 && ids ? ids[channel_dst] : channel_dst / channel_ratio;
const int channel_y = ncols_dst == 1 && ids ? channel_dst % nchannels_y : channel_dst;
const int sample_dst = blockIdx.z;
const int sample_x = sample_dst / sample_ratio;
const int sample_y = sample_dst;
const uint32_t channel_dst = blockIdx.y;
const uint32_t channel_x = ncols_dst == 1 && ids ? ids[channel_dst] : fastdiv(channel_dst, channel_ratio);
const uint32_t channel_y = ncols_dst == 1 && ids ? fastmodulo(channel_dst, nchannels_y) : channel_dst;
const uint32_t sample_dst = blockIdx.z;
const uint32_t sample_x = fastdiv(sample_dst, sample_ratio);
const uint32_t sample_y = sample_dst;
// partial sum for each thread
float tmp[ncols_dst][rows_per_cuda_block] = {{0.0f}};
@@ -247,8 +248,9 @@ static void mul_mat_vec_q_switch_ncols_dst(
GGML_ASSERT(ncols_x % ggml_blck_size(type) == 0);
GGML_ASSERT(ncols_dst <= MMVQ_MAX_BATCH_SIZE);
const int channel_ratio = nchannels_dst / nchannels_x;
const int sample_ratio = nsamples_dst / nsamples_x;
const uint3 nchannels_y_fd = ids ? init_fastdiv_values(nchannels_y) : make_uint3(0, 0, 0);
const uint3 channel_ratio_fd = ids ? make_uint3(0, 0, 0) : init_fastdiv_values(nchannels_dst / nchannels_x);
const uint3 sample_ratio_fd = init_fastdiv_values(nsamples_dst / nsamples_x);
const int device = ggml_cuda_get_device();
const int warp_size = ggml_cuda_info().devices[device].warp_size;
@@ -256,86 +258,70 @@ static void mul_mat_vec_q_switch_ncols_dst(
GGML_ASSERT(!ids || ncols_dst == 1);
switch (ncols_dst) {
case 1:
{
case 1: {
constexpr int c_ncols_dst = 1;
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
mul_mat_vec_q<type, c_ncols_dst><<<dims.first, dims.second, 0, stream>>>
(vx, vy, ids, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
break;
}
case 2:
{
(vx, vy, ids, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst);
} break;
case 2: {
constexpr int c_ncols_dst = 2;
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
mul_mat_vec_q<type, c_ncols_dst><<<dims.first, dims.second, 0, stream>>>
(vx, vy, ids, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
break;
}
case 3:
{
(vx, vy, ids, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst);
} break;
case 3: {
constexpr int c_ncols_dst = 3;
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
mul_mat_vec_q<type, c_ncols_dst><<<dims.first, dims.second, 0, stream>>>
(vx, vy, ids, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
break;
}
case 4:
{
(vx, vy, ids, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst);
} break;
case 4: {
constexpr int c_ncols_dst = 4;
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
mul_mat_vec_q<type, c_ncols_dst><<<dims.first, dims.second, 0, stream>>>
(vx, vy, ids, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
break;
}
case 5:
{
(vx, vy, ids, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst);
} break;
case 5: {
constexpr int c_ncols_dst = 5;
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
mul_mat_vec_q<type, c_ncols_dst><<<dims.first, dims.second, 0, stream>>>
(vx, vy, ids, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
break;
}
case 6:
{
(vx, vy, ids, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst);
} break;
case 6: {
constexpr int c_ncols_dst = 6;
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
mul_mat_vec_q<type, c_ncols_dst><<<dims.first, dims.second, 0, stream>>>
(vx, vy, ids, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
break;
}
case 7:
{
(vx, vy, ids, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst);
} break;
case 7: {
constexpr int c_ncols_dst = 7;
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
mul_mat_vec_q<type, c_ncols_dst><<<dims.first, dims.second, 0, stream>>>
(vx, vy, ids, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
break;
}
case 8:
{
(vx, vy, ids, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst);
} break;
case 8: {
constexpr int c_ncols_dst = 8;
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
mul_mat_vec_q<type, c_ncols_dst><<<dims.first, dims.second, 0, stream>>>
(vx, vy, ids, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
break;
}
(vx, vy, ids, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst);
} break;
default:
GGML_ABORT("fatal error");
break;

View File

@@ -105,29 +105,29 @@ static __global__ void group_norm_f32(const float * x, float * dst, const int gr
}
template <int block_size, bool do_multiply = false, bool do_add = false>
static __global__ void rms_norm_f32(const float * x, float * dst,
static __global__ void rms_norm_f32(const float * x,
float * dst,
const int ncols,
const int64_t stride_row,
const int64_t stride_channel,
const int64_t stride_sample,
const float eps,
const float * mul = nullptr,
const int64_t mul_stride_row = 0,
const int64_t mul_stride_channel = 0,
const int64_t mul_stride_sample = 0,
const int mul_ncols = 0,
const int mul_nrows = 0,
const int mul_nchannels = 0,
const int mul_nsamples = 0,
const float * add = nullptr,
const int64_t add_stride_row = 0,
const int64_t add_stride_channel = 0,
const int64_t add_stride_sample = 0,
const int add_ncols = 0,
const int add_nrows = 0,
const int add_nchannels = 0,
const int add_nsamples = 0) {
const float * mul = nullptr,
const int64_t mul_stride_row = 0,
const int64_t mul_stride_channel = 0,
const int64_t mul_stride_sample = 0,
const uint3 mul_ncols_packed = make_uint3(0, 0, 0),
const uint3 mul_nrows_packed = make_uint3(0, 0, 0),
const uint3 mul_nchannels_packed = make_uint3(0, 0, 0),
const uint3 mul_nsamples_packed = make_uint3(0, 0, 0),
const float * add = nullptr,
const int64_t add_stride_row = 0,
const int64_t add_stride_channel = 0,
const int64_t add_stride_sample = 0,
const uint3 add_ncols_packed = make_uint3(0, 0, 0),
const uint3 add_nrows_packed = make_uint3(0, 0, 0),
const uint3 add_nchannels_packed = make_uint3(0, 0, 0),
const uint3 add_nsamples_packed = make_uint3(0, 0, 0)) {
const int nrows = gridDim.x;
const int nchannels = gridDim.y;
@@ -142,16 +142,16 @@ static __global__ void rms_norm_f32(const float * x, float * dst,
dst += ((sample*nchannels + channel)*nrows + row)*ncols;
if constexpr (do_multiply) {
const int mul_row = row % mul_nrows;
const int mul_channel = channel % mul_nchannels;
const int mul_sample = sample % mul_nsamples;
mul += mul_sample*mul_stride_sample + mul_channel*mul_stride_channel + mul_row*mul_stride_row;
const uint32_t mul_row = fastmodulo(row, mul_nrows_packed);
const uint32_t mul_channel = fastmodulo(channel, mul_nchannels_packed);
const uint32_t mul_sample = fastmodulo(sample, mul_nsamples_packed);
mul += mul_sample * mul_stride_sample + mul_channel * mul_stride_channel + mul_row * mul_stride_row;
}
if constexpr (do_add) {
const int add_row = row % add_nrows;
const int add_channel = channel % add_nchannels;
const int add_sample = sample % add_nsamples;
const int add_row = fastmodulo(row, add_nrows_packed);
const int add_channel = fastmodulo(channel, add_nchannels_packed);
const int add_sample = fastmodulo(sample, add_nsamples_packed);
add += add_sample * add_stride_sample + add_channel * add_stride_channel + add_row * add_stride_row;
}
@@ -165,15 +165,18 @@ static __global__ void rms_norm_f32(const float * x, float * dst,
// sum up partial sums
tmp = warp_reduce_sum(tmp);
if constexpr (block_size > WARP_SIZE) {
static_assert(block_size == 1024, "unexpected block_size");
static_assert((block_size <= 1024) && (block_size % 32 == 0), "unexpected block_size");
__shared__ float s_sum[32];
const int warp_id = threadIdx.x / WARP_SIZE;
const int lane_id = threadIdx.x % WARP_SIZE;
const int warp_id = tid / WARP_SIZE;
const int lane_id = tid % WARP_SIZE;
if (lane_id == 0) {
s_sum[warp_id] = tmp;
}
__syncthreads();
tmp = s_sum[lane_id];
tmp = 0.0f;
if (lane_id < (block_size / WARP_SIZE)) {
tmp = s_sum[lane_id];
}
tmp = warp_reduce_sum(tmp);
}
@@ -182,12 +185,12 @@ static __global__ void rms_norm_f32(const float * x, float * dst,
for (int col = tid; col < ncols; col += block_size) {
if constexpr (do_multiply && do_add) {
const int mul_col = col % mul_ncols;
const int add_col = col % add_ncols;
dst[col] = scale * x[col] * mul[mul_col] + add[add_col];
const int mul_col = fastmodulo(col, mul_ncols_packed);
const int add_col = fastmodulo(col, add_ncols_packed);
dst[col] = scale * x[col] * mul[mul_col] + add[add_col];
} else if constexpr (do_multiply) {
const int mul_col = col % mul_ncols;
dst[col] = scale * x[col] * mul[mul_col];
const int mul_col = fastmodulo(col, mul_ncols_packed);
dst[col] = scale * x[col] * mul[mul_col];
} else {
dst[col] = scale * x[col];
}
@@ -354,77 +357,86 @@ static void rms_norm_f32_cuda(
const int64_t stride_row, const int64_t stride_channel, const int64_t stride_sample, const float eps, cudaStream_t stream) {
const dim3 blocks_num(nrows, nchannels, nsamples);
if (ncols < 1024) {
const dim3 block_dims(WARP_SIZE, 1, 1);
rms_norm_f32<WARP_SIZE, false><<<blocks_num, block_dims, 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
const dim3 block_dims(256, 1, 1);
rms_norm_f32<256, false><<<blocks_num, block_dims, 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
} else {
const dim3 block_dims(1024, 1, 1);
rms_norm_f32<1024, false><<<blocks_num, block_dims, 0, stream>>>(x, dst, ncols, stride_row, stride_channel, stride_sample, eps);
}
}
static void rms_norm_mul_f32_cuda(const float * x,
const float * mul,
const float * add,
float * dst,
const int ncols,
const int nrows,
const int nchannels,
const int nsamples,
const int64_t stride_row,
const int64_t stride_channel,
const int64_t stride_sample,
const int64_t mul_stride_row,
const int64_t mul_stride_channel,
const int64_t mul_stride_sample,
const int mul_ncols,
const int mul_nrows,
const int mul_nchannels,
const int mul_nsamples,
const int64_t add_stride_row,
const int64_t add_stride_channel,
const int64_t add_stride_sample,
const int add_ncols,
const int add_nrows,
const int add_nchannels,
const int add_nsamples,
const float eps,
cudaStream_t stream) {
static void rms_norm_mul_f32_cuda(const float * x,
const float * mul,
const float * add,
float * dst,
const int ncols,
const int nrows,
const int nchannels,
const int nsamples,
const int64_t stride_row,
const int64_t stride_channel,
const int64_t stride_sample,
const int64_t mul_stride_row,
const int64_t mul_stride_channel,
const int64_t mul_stride_sample,
const uint32_t mul_ncols,
const uint32_t mul_nrows,
const uint32_t mul_nchannels,
const uint32_t mul_nsamples,
const int64_t add_stride_row,
const int64_t add_stride_channel,
const int64_t add_stride_sample,
const uint32_t add_ncols,
const uint32_t add_nrows,
const uint32_t add_nchannels,
const uint32_t add_nsamples,
const float eps,
cudaStream_t stream) {
const dim3 blocks_num(nrows, nchannels, nsamples);
if (mul == nullptr) {
rms_norm_f32_cuda(x, dst, ncols, nrows, nchannels, nsamples, stride_row, stride_channel, stride_sample, eps, stream);
return;
}
if (add == nullptr) {
const uint3 mul_ncols_packed = init_fastdiv_values(mul_ncols);
const uint3 mul_nrows_packed = init_fastdiv_values(mul_nrows);
const uint3 mul_nchannels_packed = init_fastdiv_values(mul_nchannels);
const uint3 mul_nsamples_packed = init_fastdiv_values(mul_nsamples);
if (ncols < 1024) {
const dim3 block_dims(WARP_SIZE, 1, 1);
rms_norm_f32<WARP_SIZE, true><<<blocks_num, block_dims, 0, stream>>>(x, dst,
ncols, stride_row, stride_channel, stride_sample, eps,
mul, mul_stride_row, mul_stride_channel, mul_stride_sample,
mul_ncols, mul_nrows, mul_nchannels, mul_nsamples);
const dim3 block_dims(256, 1, 1);
rms_norm_f32<256, true><<<blocks_num, block_dims, 0, stream>>>(
x, dst, ncols, stride_row, stride_channel, stride_sample, eps, mul, mul_stride_row, mul_stride_channel,
mul_stride_sample, mul_ncols_packed, mul_nrows_packed, mul_nchannels_packed, mul_nsamples_packed);
} else {
const dim3 block_dims(1024, 1, 1);
rms_norm_f32<1024, true><<<blocks_num, block_dims, 0, stream>>>(x, dst,
ncols, stride_row, stride_channel, stride_sample, eps,
mul, mul_stride_row, mul_stride_channel, mul_stride_sample,
mul_ncols, mul_nrows, mul_nchannels, mul_nsamples);
rms_norm_f32<1024, true><<<blocks_num, block_dims, 0, stream>>>(
x, dst, ncols, stride_row, stride_channel, stride_sample, eps, mul, mul_stride_row, mul_stride_channel,
mul_stride_sample, mul_ncols_packed, mul_nrows_packed, mul_nchannels_packed, mul_nsamples_packed);
}
} else {
const uint3 mul_ncols_packed = init_fastdiv_values(mul_ncols);
const uint3 mul_nrows_packed = init_fastdiv_values(mul_nrows);
const uint3 mul_nchannels_packed = init_fastdiv_values(mul_nchannels);
const uint3 mul_nsamples_packed = init_fastdiv_values(mul_nsamples);
const uint3 add_ncols_packed = init_fastdiv_values(add_ncols);
const uint3 add_nrows_packed = init_fastdiv_values(add_nrows);
const uint3 add_nchannels_packed = init_fastdiv_values(add_nchannels);
const uint3 add_nsamples_packed = init_fastdiv_values(add_nsamples);
if (ncols < 1024) {
const dim3 block_dims(WARP_SIZE, 1, 1);
rms_norm_f32<WARP_SIZE, true, true><<<blocks_num, block_dims, 0, stream>>>(x, dst,
ncols, stride_row, stride_channel, stride_sample, eps,
mul, mul_stride_row, mul_stride_channel, mul_stride_sample,
mul_ncols, mul_nrows, mul_nchannels, mul_nsamples,
add, add_stride_row, add_stride_channel, add_stride_sample,
add_ncols, add_nrows, add_nchannels, add_nsamples);
const dim3 block_dims(256, 1, 1);
rms_norm_f32<256, true, true><<<blocks_num, block_dims, 0, stream>>>(
x, dst, ncols, stride_row, stride_channel, stride_sample, eps, mul, mul_stride_row, mul_stride_channel,
mul_stride_sample, mul_ncols_packed, mul_nrows_packed, mul_nchannels_packed, mul_nsamples_packed, add,
add_stride_row, add_stride_channel, add_stride_sample, add_ncols_packed, add_nrows_packed,
add_nchannels_packed, add_nsamples_packed);
} else {
const dim3 block_dims(1024, 1, 1);
rms_norm_f32<1024, true, true><<<blocks_num, block_dims, 0, stream>>>(x, dst,
ncols, stride_row, stride_channel, stride_sample, eps,
mul, mul_stride_row, mul_stride_channel, mul_stride_sample,
mul_ncols, mul_nrows, mul_nchannels, mul_nsamples,
add, add_stride_row, add_stride_channel, add_stride_sample,
add_ncols, add_nrows, add_nchannels, add_nsamples);
rms_norm_f32<1024, true, true><<<blocks_num, block_dims, 0, stream>>>(
x, dst, ncols, stride_row, stride_channel, stride_sample, eps, mul, mul_stride_row, mul_stride_channel,
mul_stride_sample, mul_ncols_packed, mul_nrows_packed, mul_nchannels_packed, mul_nsamples_packed, add,
add_stride_row, add_stride_channel, add_stride_sample, add_ncols_packed, add_nrows_packed,
add_nchannels_packed, add_nsamples_packed);
}
}
}

View File

@@ -1,36 +1,50 @@
#include "pad.cuh"
static __global__ void pad_f32(const float * x, float * dst, const int ne0, const int ne00, const int ne01, const int ne02, const int ne03) {
// blockIdx.z: idx of ne2*ne3, aka ne02*ne03
// blockIdx.y: idx of ne1
// blockIDx.x: idx of ne0 / BLOCK_SIZE
int nidx = threadIdx.x + blockIdx.x * blockDim.x;
if (nidx >= ne0) {
static __global__ void pad_f32(const float * src, float * dst,
const int lp0, const int rp0, const int lp1, const int rp1,
const int lp2, const int rp2, const int lp3, const int rp3,
const int ne0, const int ne1, const int ne2, const int ne3) {
// blockIdx.z: i3*ne2+i2
// blockIdx.y: i1
// blockIDx.x: i0 / CUDA_PAD_BLOCK_SIZE
// gridDim.y: ne1
int i0 = threadIdx.x + blockIdx.x * blockDim.x;
int i1 = blockIdx.y;
int i2 = blockIdx.z % ne2;
int i3 = blockIdx.z / ne2;
if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
return;
}
// operation
int offset_dst =
nidx +
blockIdx.y * ne0 +
blockIdx.z * ne0 * gridDim.y;
if (nidx < ne00 && blockIdx.y < (unsigned)ne01 && blockIdx.z < (unsigned)(ne02*ne03)) {
int offset_src =
nidx +
blockIdx.y * ne00 +
blockIdx.z * ne00 * ne01;
dst[offset_dst] = x[offset_src];
const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0;
if ((i0 >= lp0 && i0 < ne0 - rp0) &&
(i1 >= lp1 && i1 < ne1 - rp1) &&
(i2 >= lp2 && i2 < ne2 - rp2) &&
(i3 >= lp3 && i3 < ne3 - rp3)) {
const int64_t i00 = i0 - lp0;
const int64_t i01 = i1 - lp1;
const int64_t i02 = i2 - lp2;
const int64_t i03 = i3 - lp3;
const int64_t ne02 = ne2 - lp2 - rp2;
const int64_t ne01 = ne1 - lp1 - rp1;
const int64_t ne00 = ne0 - lp0 - rp0;
const int64_t src_idx = i03*(ne00*ne01*ne02) + i02*(ne00*ne01) + i01*ne00 + i00;
dst[dst_idx] = src[src_idx];
} else {
dst[offset_dst] = 0.0f;
dst[dst_idx] = 0.0f;
}
}
static void pad_f32_cuda(const float * x, float * dst,
const int ne00, const int ne01, const int ne02, const int ne03,
static void pad_f32_cuda(const float * src, float * dst,
const int lp0, const int rp0, const int lp1, const int rp1,
const int lp2, const int rp2, const int lp3, const int rp3,
const int ne0, const int ne1, const int ne2, const int ne3, cudaStream_t stream) {
int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE;
dim3 gridDim(num_blocks, ne1, ne2*ne3);
pad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(x, dst, ne0, ne00, ne01, ne02, ne03);
pad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(src, dst, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3, ne0, ne1, ne2, ne3);
}
void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
@@ -41,9 +55,18 @@ void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F32);
GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
GGML_ASSERT(ggml_is_contiguous(src0));
const int32_t lp0 = ((const int32_t*)(dst->op_params))[0];
const int32_t rp0 = ((const int32_t*)(dst->op_params))[1];
const int32_t lp1 = ((const int32_t*)(dst->op_params))[2];
const int32_t rp1 = ((const int32_t*)(dst->op_params))[3];
const int32_t lp2 = ((const int32_t*)(dst->op_params))[4];
const int32_t rp2 = ((const int32_t*)(dst->op_params))[5];
const int32_t lp3 = ((const int32_t*)(dst->op_params))[6];
const int32_t rp3 = ((const int32_t*)(dst->op_params))[7];
pad_f32_cuda(src0_d, dst_d,
src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream);
lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3,
dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream);
}

View File

@@ -1,26 +1,27 @@
#include "quantize.cuh"
#include <cstdint>
__launch_bounds__(CUDA_QUANTIZE_BLOCK_SIZE, 1)
static __global__ void quantize_q8_1(
const float * __restrict__ x, void * __restrict__ vy,
const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03,
const int64_t ne0, const int ne1, const int ne2) {
const int64_t ne0, const uint32_t ne1, const uint3 ne2) {
const int64_t i0 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x;
if (i0 >= ne0) {
return;
}
const int64_t i3 = fastdiv(blockIdx.z, ne2);
const int64_t i2 = blockIdx.z - i3*ne2.z;
const int64_t i1 = blockIdx.y;
const int64_t i2 = blockIdx.z % ne2;
const int64_t i3 = blockIdx.z / ne2;
const int64_t & i00 = i0;
const int64_t & i01 = i1;
const int64_t & i02 = i2;
const int64_t & i03 = i3;
const int64_t i_cont = ((i3*ne2 + i2) * ne1 + i1) * ne0 + i0;
const int64_t i_cont = ((i3*ne2.z + i2) * ne1 + i1) * ne0 + i0;
block_q8_1 * y = (block_q8_1 *) vy;
@@ -31,10 +32,10 @@ static __global__ void quantize_q8_1(
float amax = fabsf(xi);
float sum = xi;
amax = warp_reduce_max(amax);
sum = warp_reduce_sum(sum);
amax = warp_reduce_max<QK8_1>(amax);
sum = warp_reduce_sum<QK8_1>(sum);
const float d = amax / 127;
const float d = amax / 127.0f;
const int8_t q = amax == 0.0f ? 0 : roundf(xi / d);
y[ib].qs[iqs] = q;
@@ -43,8 +44,7 @@ static __global__ void quantize_q8_1(
return;
}
reinterpret_cast<half&>(y[ib].ds.x) = d;
reinterpret_cast<half&>(y[ib].ds.y) = sum;
y[ib].ds = make_half2(d, sum);
}
template <mmq_q8_1_ds_layout ds_layout>
@@ -152,10 +152,12 @@ void quantize_row_q8_1_cuda(
GGML_ASSERT(!ids);
GGML_ASSERT(ne0 % QK8_1 == 0);
const uint3 ne2_fastdiv = init_fastdiv_values(ne2);
const int64_t block_num_x = (ne0 + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
const dim3 num_blocks(block_num_x, ne1, ne2*ne3);
const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE, 1, 1);
quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, ne00, s01, s02, s03, ne0, ne1, ne2);
quantize_q8_1<<<num_blocks, block_size, 0, stream>>>(x, vy, ne00, s01, s02, s03, ne0, ne1, ne2_fastdiv);
GGML_UNUSED(type_src0);
}

View File

@@ -1,18 +1,19 @@
#include "scale.cuh"
static __global__ void scale_f32(const float * x, float * dst, const float scale, const float bias, const int k) {
const int i = blockDim.x*blockIdx.x + threadIdx.x;
#define MAX_GRIDDIM_X 0x7FFFFFFF
if (i >= k) {
return;
static __global__ void scale_f32(const float * x, float * dst, const float scale, const float bias, const int64_t nelements) {
int64_t tid = (int64_t)blockIdx.x * (int64_t)blockDim.x + (int64_t)threadIdx.x;
int64_t stride = (int64_t)blockDim.x * (int64_t)gridDim.x;
for (int64_t i = tid; i < nelements; i += stride) {
dst[i] = scale * x[i] + bias;
}
dst[i] = scale * x[i] + bias;
}
static void scale_f32_cuda(const float * x, float * dst, const float scale, const float bias, const int k, cudaStream_t stream) {
const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
scale_f32<<<num_blocks, CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, bias, k);
static void scale_f32_cuda(const float * x, float * dst, const float scale, const float bias, const int64_t nelements, cudaStream_t stream) {
const int64_t num_blocks = (nelements + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
scale_f32<<<MIN(MAX_GRIDDIM_X, num_blocks), CUDA_SCALE_BLOCK_SIZE, 0, stream>>>(x, dst, scale, bias, nelements);
}
void ggml_cuda_op_scale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {

View File

@@ -24,7 +24,7 @@ TYPES_MMQ = [
"GGML_TYPE_Q4_0", "GGML_TYPE_Q4_1", "GGML_TYPE_Q5_0", "GGML_TYPE_Q5_1", "GGML_TYPE_Q8_0",
"GGML_TYPE_Q2_K", "GGML_TYPE_Q3_K", "GGML_TYPE_Q4_K", "GGML_TYPE_Q5_K", "GGML_TYPE_Q6_K",
"GGML_TYPE_IQ2_XXS", "GGML_TYPE_IQ2_XS", "GGML_TYPE_IQ2_S", "GGML_TYPE_IQ3_XXS", "GGML_TYPE_IQ3_S",
"GGML_TYPE_IQ1_S", "GGML_TYPE_IQ4_NL", "GGML_TYPE_IQ4_XS"
"GGML_TYPE_IQ1_S", "GGML_TYPE_IQ4_NL", "GGML_TYPE_IQ4_XS", "GGML_TYPE_MXFP4"
]
SOURCE_MMQ = """// This file has been autogenerated by generate_cu_files.py, do not edit manually.
@@ -34,6 +34,13 @@ SOURCE_MMQ = """// This file has been autogenerated by generate_cu_files.py, do
DECL_MMQ_CASE({type});
"""
SOURCE_MMF = """// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../mmf.cuh"
DECL_MMF_CASE({type});
"""
def get_short_name(long_quant_name):
return long_quant_name.replace("GGML_TYPE_", "").lower()
@@ -76,3 +83,7 @@ for ncols in [8, 16, 32, 64]:
for type in TYPES_MMQ:
with open(f"mmq-instance-{get_short_name(type)}.cu", "w") as f:
f.write(SOURCE_MMQ.format(type=type))
for type in range(1, 17):
with open(f"mmf-instance-ncols_{type}.cu", "w") as f:
f.write(SOURCE_MMF.format(type=type))

View File

@@ -0,0 +1,5 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../mmf.cuh"
DECL_MMF_CASE(1);

View File

@@ -0,0 +1,5 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.
#include "../mmf.cuh"
DECL_MMF_CASE(10);

Some files were not shown because too many files have changed in this diff Show More