Compare commits

..

21 Commits

Author SHA1 Message Date
Francis Couture-Harpin
d9b625edb6 ggml-quants : handle imatrix for MXFP4 2025-08-11 22:12:10 -04:00
hipudding
be48528b06 CANN: Add broadcast for softmax and FA (#15208)
* refactor softmax

* fix fa

* fix mask shape

* format

* add comments

* Remove whitespace
2025-08-11 22:50:31 +08:00
rainred
cf9e5648a7 mtmd : Fix MinicpmV model converter and clip to avoid using hardcode. (#14750)
* Fix MinicpmV model converter and clip to avoid using hardcode.

* Code update for pr/14750

* Remove unused field, update script path in docs.

* Add version 5 for fallback code.

---------

Co-authored-by: lzhang <zhanglei@modelbest.cn>
2025-08-11 16:12:12 +02:00
Xuan-Son Nguyen
fba5c0d680 chat : hotfix gpt-oss jinja raising an exception (#15243)
* chat : hotfix gpt-oss jinja raising an exception

* fix
2025-08-11 15:31:35 +02:00
Xuan-Son Nguyen
53d0a12658 server : allow specifying reasoning_format in HTTP request (#15238) 2025-08-11 14:48:41 +02:00
Zagaj
27093afe78 readme : update infra list (#15234) 2025-08-11 15:27:54 +03:00
Georgi Gerganov
228f724d9c kv-cache : fix seq_rm with seq_id == -1 (#15226)
* kv-cache : fix seq_rm with seq_id == -1

ggml-ci

* cont : iterate over streams

ggml-ci
2025-08-11 13:58:24 +03:00
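A minimal sketch of the call this fix concerns, assuming the long-standing `llama_kv_cache_seq_rm` entry point from `llama.h` (newer releases expose the same operation under a different name, so treat the exact identifier as an assumption): `seq_id == -1` acts as a wildcard that removes the given range from every sequence.

```cpp
// Sketch only: assumes the llama.h sequence-removal API; the function has been
// renamed in newer releases, so the exact name here is an assumption.
#include "llama.h"

static void clear_all_sequences(llama_context * ctx) {
    // seq_id == -1 matches every sequence; p0 == -1 and p1 == -1 cover the full range
    llama_kv_cache_seq_rm(ctx, /*seq_id=*/-1, /*p0=*/-1, /*p1=*/-1);
}
```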
Daniel Bevenius
cd3069dfcb kv-cache : log (debug) all streams in find_slot (#15176)
This commit updates `llama_kv_cache_unified::find_slot` to log
information for all streams when debug is enabled.

The motivation for this change is that if a non-unified kv-cache is used,
only one stream is logged because the code currently uses `seq_to_stream[1]`.
2025-08-11 11:21:19 +02:00
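Illustrative only (the names below are placeholders, not the actual `find_slot` internals): the debug path now walks every stream rather than reading a single hard-coded entry such as `seq_to_stream[1]`.

```cpp
// Hypothetical sketch of the logging pattern described above; n_stream and
// cells_used are placeholder names, not actual llama.cpp members.
#include <cstdint>
#include <cstdio>

static void log_all_streams(uint32_t n_stream, const uint32_t * cells_used) {
    for (uint32_t s = 0; s < n_stream; ++s) {
        fprintf(stderr, "find_slot: stream %u: %u cells used\n", s, cells_used[s]);
    }
}
```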
Sigbjørn Skjæret
50e81bdf5d convert : fix merge conflicts (#15229) 2025-08-11 11:15:44 +02:00
Daniel Bevenius
1ebbaddff2 perplexity : update comments/error msg to use decode [no ci] (#15227)
This commit updates comments and error messages to use "decode" instead
of "eval" in perplexity.cpp.

The motivation for this is that `llama_eval` was renamed to
`llama_decode` a while ago, but the comments and error messages
still referred to "eval". This change ensures consistency and clarity.
2025-08-11 11:21:24 +03:00
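For reference, a minimal sketch of the renamed call: the old `llama_eval` took raw token pointers, while `llama_decode` takes a `llama_batch` (this assumes the current two-argument `llama_batch_get_one` helper).

```cpp
// Sketch: decode a prompt with the batch-based API that replaced llama_eval.
// Assumes ctx is an initialized llama_context and tokens holds the prompt.
#include <cstdio>
#include <vector>
#include "llama.h"

static int decode_prompt(llama_context * ctx, std::vector<llama_token> & tokens) {
    llama_batch batch = llama_batch_get_one(tokens.data(), (int32_t) tokens.size());
    if (llama_decode(ctx, batch) != 0) {
        fprintf(stderr, "%s : llama_decode() failed\n", __func__);
        return 1;
    }
    return 0;
}
```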
Julien Denize
a3a7874272 convert : improve Mistral models integration (#14737)
* Improve Mistral models integration with llama.cpp

* Revert changes and fix gguf

* Revert change

* refactor convert_mistral_to_gguf.py in convert_hf_to_gguf.py

* Revert collateral

* Rename model name

* refactor

* revert

* remove duplicate

* Remove duplication code

* Fixes

* Fix flake issues

* Apply comments

* Apply comments

* Apply comments

* Fix remote

* add default chat template

* Revert

* nit
2025-08-11 10:07:49 +02:00
Charles Xu
002cb1bb33 kleidiai: fix unsigned overflow bug (#15150)
* kleidiai: fix unsigned overflow bug

* address review comments
2025-08-11 09:59:26 +02:00
David Zhao
79c1160b07 cuda: refactored ssm_scan and use CUB (#13291)
* cuda: refactored ssm_scan to use CUB

* fixed compilation error when not using CUB

* assign L to constant and use size_t instead of int

* deduplicated functions

* change min blocks per mp to 1

* Use cub load and store warp transpose

* suppress clang warning
2025-08-09 20:29:43 +02:00
Aman Gupta
34c9d765bf CUDA: add attention sinks for tile and wmma (#15178)
* CUDA: add attention sinks for tile and wmma

* Review: formatting changes + remove syncthreads from tile + remove warp_reduce_max from wmma
2025-08-09 20:00:24 +08:00
compilade
e54d41befc gguf-py : add Numpy MXFP4 de/quantization support (#15111)
* gguf-py : add MXFP4 de/quantization support

* ggml-quants : handle zero amax for MXFP4
2025-08-08 17:48:26 -04:00
Johannes Gäßler
4850b52aed server-bench: external OAI servers, sqlite (#15179)
* server-bench: external OAI servers, sqlite

* Update scripts/server-bench.py

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update scripts/server-bench.py

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update scripts/server-bench.py

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* raise_for_status

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
2025-08-08 23:04:36 +02:00
AN Long
cd6983d56d ggml : fix field name when new ggml_backend (#14944)
2025-08-08 14:37:22 +02:00
Olivier Chafik
6c7e9a5440 vendor: sync minja (#15161)
* vendor: sync minja

* Update minja.hpp

* Apply suggestions from code review

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
2025-08-08 10:45:18 +01:00
Johannes Gäßler
1425f587a8 CUDA: attention sinks for mma FlashAttention (#15157)
2025-08-08 08:19:58 +02:00
lhez
aaa3d07ae7 opencl: support sink in soft_max (attn sinks) (#15152) 2025-08-07 21:47:03 -07:00
Xuan-Son Nguyen
50aa938901 convert : support non-mxfp4 HF model (#15153)
* convert : support non-mxfp4 HF model

* rm redundant check

* disable debug check
2025-08-07 23:26:03 +02:00
48 changed files with 1856 additions and 853 deletions

View File

@@ -240,7 +240,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
<details>
<summary>Infrastructure</summary>
- [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp
- [Paddler](https://github.com/intentee/paddler) - Open-source LLMOps platform for hosting and scaling AI in your own infrastructure
- [GPUStack](https://github.com/gpustack/gpustack) - Manage GPU clusters for running LLMs
- [llama_cpp_canister](https://github.com/onicai/llama_cpp_canister) - llama.cpp as a smart contract on the Internet Computer, using WebAssembly
- [llama-swap](https://github.com/mostlygeek/llama-swap) - transparent proxy that adds automatic model switching with llama-server

View File

@@ -2949,11 +2949,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
"- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)\n"
"(default: auto)",
[](common_params & params, const std::string & value) {
/**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; }
else if (value == "deepseek-legacy") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY; }
else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; }
else if (value == "auto") { params.reasoning_format = COMMON_REASONING_FORMAT_AUTO; }
else { throw std::invalid_argument("invalid value"); }
params.reasoning_format = common_reasoning_format_from_name(value);
}
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK"));
add_opt(common_arg(

View File

@@ -552,6 +552,17 @@ common_chat_templates_ptr common_chat_templates_init(
default_template_src = CHATML_TEMPLATE_SRC;
}
}
// TODO @ngxson : this is a temporary hack to prevent chat template from throwing an error
// Ref: https://github.com/ggml-org/llama.cpp/pull/15230#issuecomment-3173959633
if (default_template_src.find("<|channel|>") != std::string::npos
// search for the error message and patch it
&& default_template_src.find("in message.content or") != std::string::npos) {
string_replace_all(default_template_src,
"{%- if \"<|channel|>analysis<|message|>\" in message.content or \"<|channel|>final<|message|>\" in message.content %}",
"{%- if false %}");
}
std::string token_bos = bos_token_override;
std::string token_eos = eos_token_override;
bool add_bos = false;
@@ -625,6 +636,19 @@ const char * common_reasoning_format_name(common_reasoning_format format) {
}
}
common_reasoning_format common_reasoning_format_from_name(const std::string & format) {
if (format == "none") {
return COMMON_REASONING_FORMAT_NONE;
} else if (format == "auto") {
return COMMON_REASONING_FORMAT_AUTO;
} else if (format == "deepseek") {
return COMMON_REASONING_FORMAT_DEEPSEEK;
} else if (format == "deepseek-legacy") {
return COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY;
}
throw std::runtime_error("Unknown reasoning format: " + format);
}
static std::string wrap_code_as_arguments(common_chat_msg_parser & builder, const std::string & code) {
std::string arguments;
if (builder.is_partial()) {

View File

@@ -191,6 +191,7 @@ std::string common_chat_format_example(
const char* common_chat_format_name(common_chat_format format);
const char* common_reasoning_format_name(common_reasoning_format format);
common_reasoning_format common_reasoning_format_from_name(const std::string & format);
common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax);
common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);

View File

@@ -28,6 +28,14 @@ if TYPE_CHECKING:
if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf
from gguf.vocab import MistralTokenizerType, MistralVocab
from mistral_common.tokens.tokenizers.base import TokenizerVersion
from mistral_common.tokens.tokenizers.multimodal import DATASET_MEAN, DATASET_STD
from mistral_common.tokens.tokenizers.tekken import Tekkenizer
from mistral_common.tokens.tokenizers.sentencepiece import (
SentencePieceTokenizer,
)
logger = logging.getLogger("hf-to-gguf")
@@ -81,6 +89,8 @@ class ModelBase:
block_count: int
tensor_map: gguf.TensorNameMap
is_mistral_format: bool = False
def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, *, is_big_endian: bool = False,
use_temp_file: bool = False, eager: bool = False,
metadata_override: Path | None = None, model_name: str | None = None,
@@ -106,16 +116,17 @@ class ModelBase:
logger.info(f"Using remote model with HuggingFace id: {remote_hf_model_id}")
remote_tensors = gguf.utility.SafetensorRemote.get_list_tensors_hf_model(remote_hf_model_id)
self.tensor_names = set(name for name in remote_tensors.keys())
for name, remote_tensor in gguf.utility.SafetensorRemote.get_list_tensors_hf_model(remote_hf_model_id).items():
for name, remote_tensor in remote_tensors.items():
yield (name, LazyTorchTensor.from_remote_tensor(remote_tensor))
self.get_tensors = get_remote_tensors
else:
self.part_names = ModelBase.get_model_part_names(self.dir_model, "model", ".safetensors")
prefix = "model" if not self.is_mistral_format else "consolidated"
self.part_names = ModelBase.get_model_part_names(self.dir_model, prefix, ".safetensors")
self.is_safetensors = len(self.part_names) > 0
if not self.is_safetensors:
self.part_names = ModelBase.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
self.hparams = ModelBase.load_hparams(self.dir_model) if hparams is None else hparams
self.hparams = ModelBase.load_hparams(self.dir_model, self.is_mistral_format) if hparams is None else hparams
self.tensor_names = None
self.metadata_override = metadata_override
self.model_name = model_name
@@ -153,19 +164,23 @@ class ModelBase:
def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
tensor_names_from_parts: set[str] = set()
index_name = "model.safetensors" if self.is_safetensors else "pytorch_model.bin"
index_name += ".index.json"
index_file = self.dir_model / index_name
if not self.is_mistral_format:
index_name = "model.safetensors" if self.is_safetensors else "pytorch_model.bin"
index_name += ".index.json"
index_file = self.dir_model / index_name
if index_file.is_file():
self.tensor_names = set()
logger.info(f"gguf: loading model weight map from '{index_name}'")
with open(index_file, "r", encoding="utf-8") as f:
index: dict[str, Any] = json.load(f)
weight_map = index.get("weight_map")
if weight_map is None or not isinstance(weight_map, dict):
raise ValueError(f"Can't load 'weight_map' from {index_name!r}")
self.tensor_names.update(weight_map.keys())
if index_file.is_file():
self.tensor_names = set()
logger.info(f"gguf: loading model weight map from '{index_name}'")
with open(index_file, "r", encoding="utf-8") as f:
index: dict[str, Any] = json.load(f)
weight_map = index.get("weight_map")
if weight_map is None or not isinstance(weight_map, dict):
raise ValueError(f"Can't load 'weight_map' from {index_name!r}")
self.tensor_names.update(weight_map.keys())
else:
self.tensor_names = tensor_names_from_parts
weight_map = {}
else:
self.tensor_names = tensor_names_from_parts
weight_map = {}
@@ -426,7 +441,12 @@ class ModelBase:
return part_names
@staticmethod
def load_hparams(dir_model: Path):
def load_hparams(dir_model: Path, is_mistral_format: bool):
if is_mistral_format:
with open(dir_model / "params.json", "r", encoding="utf-8") as f:
config = json.load(f)
return config
try:
# for security reason, we don't allow loading remote code by default
# if a model need remote code, we will fallback to config.json
@@ -476,7 +496,10 @@ class TextModel(ModelBase):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.hf_arch = get_model_architecture(self.hparams, self.model_type)
if not self.is_mistral_format:
self.hf_arch = get_model_architecture(self.hparams, self.model_type)
else:
self.hf_arch = ""
if "text_config" in self.hparams:
# move the text_config to the root level
@@ -542,14 +565,14 @@ class TextModel(ModelBase):
self.gguf_writer.add_head_count(n_head)
logger.info(f"gguf: head count = {n_head}")
if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None:
if (n_head_kv := self.find_hparam(["num_key_value_heads", "n_kv_heads"], optional=True)) is not None:
self.gguf_writer.add_head_count_kv(n_head_kv)
logger.info(f"gguf: key-value head count = {n_head_kv}")
if (rope_theta := self.hparams.get("rope_theta")) is not None:
self.gguf_writer.add_rope_freq_base(rope_theta)
logger.info(f"gguf: rope theta = {rope_theta}")
if (f_rms_eps := self.hparams.get("rms_norm_eps")) is not None:
if (f_rms_eps := self.find_hparam(["rms_norm_eps", "norm_eps"], optional=True)) is not None:
self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps)
logger.info(f"gguf: rms norm epsilon = {f_rms_eps}")
if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True)) is not None:
@@ -1210,12 +1233,19 @@ class MmprojModel(ModelBase):
raise TypeError("MmprojModel must be subclassed with model_arch = gguf.MODEL_ARCH.MMPROJ")
# get n_embd of the text model
if "text_config" not in self.hparams:
self.hparams["text_config"] = {}
if "audio_config" not in self.hparams:
self.hparams["audio_config"] = {}
text_config = {**self.hparams, **self.hparams["text_config"]}
self.n_embd_text = text_config.get("hidden_size", text_config.get("n_embd", 0))
if not self.is_mistral_format:
if "text_config" not in self.hparams:
self.hparams["text_config"] = {}
if "audio_config" not in self.hparams:
self.hparams["audio_config"] = {}
text_config = {**self.hparams, **self.hparams["text_config"]}
self.n_embd_text = text_config.get("hidden_size", text_config.get("n_embd", 0))
else:
text_config = {
k: v for k, v in self.hparams.items() if k not in ["vision_encoder", "audio_encoder"]
}
self.n_embd_text = text_config.get("hidden_dim", 0)
assert self.n_embd_text > 0, "n_embd not found in hparams"
# move vision config to the top level, while preserving the original hparams in global_config
@@ -1236,11 +1266,13 @@ class MmprojModel(ModelBase):
self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.MMPROJ, self.block_count)
# load preprocessor config
with open(self.dir_model / "preprocessor_config.json", "r", encoding="utf-8") as f:
self.preprocessor_config = json.load(f)
if not self.is_mistral_format:
with open(self.dir_model / "preprocessor_config.json", "r", encoding="utf-8") as f:
self.preprocessor_config = json.load(f)
def get_vision_config(self) -> dict[str, Any] | None:
return self.global_config.get("vision_config")
config_name = "vision_config" if not self.is_mistral_format else "vision_encoder"
return self.global_config.get(config_name)
def get_audio_config(self) -> dict[str, Any] | None:
return self.global_config.get("audio_config")
@@ -1264,8 +1296,11 @@ class MmprojModel(ModelBase):
self.gguf_writer.add_vision_head_count(self.find_vparam(["num_attention_heads"]))
# preprocessor config
self.gguf_writer.add_vision_image_mean(self.preprocessor_config["image_mean"])
self.gguf_writer.add_vision_image_std(self.preprocessor_config["image_std"])
image_mean = DATASET_MEAN if self.is_mistral_format else self.preprocessor_config["image_mean"]
image_std = DATASET_STD if self.is_mistral_format else self.preprocessor_config["image_std"]
self.gguf_writer.add_vision_image_mean(image_mean)
self.gguf_writer.add_vision_image_std(image_std)
if self.has_audio_encoder:
self.gguf_writer.add_clip_has_audio_encoder(True)
@@ -1924,11 +1959,63 @@ class LlamaModel(TextModel):
if self.hf_arch == "VLlama3ForCausalLM":
self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32)
def _set_vocab_mistral(self):
vocab = MistralVocab(self.dir_model)
logger.info(
f"Converting tokenizer {vocab.tokenizer_type} of size {vocab.vocab_size}."
)
self.gguf_writer.add_tokenizer_model(vocab.gguf_tokenizer_model)
tokens = []
scores = []
toktypes = []
for text, score, toktype in vocab.all_tokens():
tokens.append(text)
scores.append(score)
toktypes.append(toktype)
assert len(tokens) == vocab.vocab_size, (
f"token count ({len(tokens)}) != vocab size ({vocab.vocab_size})"
)
if vocab.tokenizer_type == MistralTokenizerType.tekken:
self.gguf_writer.add_tokenizer_pre("tekken")
self.gguf_writer.add_token_merges(
vocab.extract_vocab_merges_from_model()
)
logger.info(
f"Setting bos, eos, unk and pad token IDs to {vocab.bos_id}, {vocab.eos_id}, {vocab.unk_id}, {vocab.pad_id}."
)
self.gguf_writer.add_bos_token_id(vocab.bos_id)
self.gguf_writer.add_eos_token_id(vocab.eos_id)
self.gguf_writer.add_unk_token_id(vocab.unk_id)
self.gguf_writer.add_pad_token_id(vocab.pad_id)
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_scores(scores)
self.gguf_writer.add_token_types(toktypes)
self.gguf_writer.add_vocab_size(vocab.vocab_size)
self.gguf_writer.add_add_bos_token(True)
self.gguf_writer.add_add_eos_token(False)
template_dir = Path(__file__).parent / "models/templates/"
template = MistralModel.get_community_chat_template(vocab, template_dir)
self.gguf_writer.add_chat_template(template)
def set_vocab(self):
if self.is_mistral_format:
return self._set_vocab_mistral()
path_tekken_json = self.dir_model / "tekken.json"
path_tokenizer_json = self.dir_model / "tokenizer.json"
if path_tekken_json.is_file() and not path_tokenizer_json.is_file():
return self.set_vocab_tekken()
self._set_vocab_mistral()
try:
self._set_vocab_sentencepiece()
@@ -1962,56 +2049,12 @@ class LlamaModel(TextModel):
if self.hparams.get("vocab_size", 32000) == 49152:
self.gguf_writer.add_add_bos_token(False)
def set_vocab_tekken(self):
vocab = gguf.vocab.MistralVocab(self.dir_model)
self.gguf_writer.add_tokenizer_model(vocab.gguf_tokenizer_model)
tokens = []
scores = []
toktypes = []
for text, score, toktype in vocab.all_tokens():
tokens.append(text)
scores.append(score)
toktypes.append(toktype)
assert len(tokens) == vocab.vocab_size, (
f"token count ({len(tokens)}) != vocab size ({vocab.vocab_size})"
)
if vocab.tokenizer_type == gguf.vocab.MistralTokenizerType.tekken:
self.gguf_writer.add_tokenizer_pre("tekken")
self.gguf_writer.add_token_merges(
vocab.extract_vocab_merges_from_model()
)
logger.info(
f"Setting bos, eos, unk and pad token IDs to {vocab.bos_id}, {vocab.eos_id}, {vocab.unk_id}, {vocab.pad_id}."
)
self.gguf_writer.add_bos_token_id(vocab.bos_id)
self.gguf_writer.add_eos_token_id(vocab.eos_id)
self.gguf_writer.add_unk_token_id(vocab.unk_id)
self.gguf_writer.add_pad_token_id(vocab.pad_id)
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_scores(scores)
self.gguf_writer.add_token_types(toktypes)
self.gguf_writer.add_vocab_size(vocab.vocab_size)
self.gguf_writer.add_add_bos_token(True)
self.gguf_writer.add_add_eos_token(False)
script_dir = Path(__file__).parent
template_path = script_dir / "models/templates/unsloth-mistral-Devstral-Small-2507.jinja"
with open(template_path, "r", encoding="utf-8") as f:
template = f.read()
self.gguf_writer.add_chat_template(template)
def set_gguf_parameters(self):
super().set_gguf_parameters()
hparams = self.hparams
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
if not self.is_mistral_format:
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
if (rope_dim := hparams.get("head_dim")) is None:
rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
@@ -2033,13 +2076,25 @@ class LlamaModel(TextModel):
_experts: list[dict[str, Tensor]] | None = None
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
n_head = self.hparams["num_attention_heads"]
n_kv_head = self.hparams.get("num_key_value_heads")
n_head = self.find_hparam(["n_heads", "num_attention_heads"])
n_kv_head = self.find_hparam(["n_kv_heads", "num_key_value_heads"])
vision_prefixes = [
"vision_encoder.",
"vision_language_adapter.",
"patch_merger.",
"pre_mm_projector_norm",
]
is_multimodal_tensor = "vision_tower" in name \
or "vision_model" in name \
or "audio_tower" in name \
or "model.connector" in name \
or "multi_modal_projector" in name
or "multi_modal_projector" in name \
or any(
name.startswith(prefix)
for prefix in vision_prefixes
)
if is_multimodal_tensor:
return [] # skip vision tensors
@@ -2155,13 +2210,18 @@ class LlavaVisionModel(MmprojModel):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
if self.hparams["model_type"] == "pixtral":
if self.hparams.get("model_type") == "pixtral":
# layer_norm_eps is not in config.json, it is hard-coded in modeling_pixtral.py
self.hparams["layer_norm_eps"] = self.hparams.get("layer_norm_eps", 1e-5)
self.img_break_tok_id = self.get_token_id("[IMG_BREAK]")
logger.info(f"Image break token id: {self.img_break_tok_id}")
elif self.is_mistral_format:
# hparams is already vision config here so norm_eps is only defined in global_config.
self.hparams["norm_eps"] = self.global_config.get("norm_eps", None)
assert self.hparams["norm_eps"] is not None, "norm_eps not found in params.json"
self.img_break_tok_id = self.find_vparam(["image_break_token_id"])
else:
raise ValueError(f"Unsupported model type: {self.hparams['model_type']}")
logger.info(f"Image break token id: {self.img_break_tok_id}")
def get_token_id(self, token: str) -> int:
tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
@@ -2175,7 +2235,7 @@ class LlavaVisionModel(MmprojModel):
def set_gguf_parameters(self):
super().set_gguf_parameters()
hparams = self.hparams
if hparams["model_type"] == "pixtral":
if hparams.get("model_type") == "pixtral":
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.PIXTRAL)
self.gguf_writer.add_vision_attention_layernorm_eps(hparams["layer_norm_eps"])
@@ -2193,18 +2253,30 @@ class LlavaVisionModel(MmprojModel):
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
del bid # unused
n_head = self.hparams["num_attention_heads"]
n_head = (
self.hparams["num_attention_heads"] if not self.is_mistral_format else self.find_vparam(["num_attention_heads"])
)
n_kv_head = n_head
if name.startswith("multi_modal_projector.") or name.startswith("vision_tower."):
valid_prefixes = (
"multi_modal_projector.",
"vision_tower.",
"vision_encoder.",
"vision_language_adapter.",
"patch_merger.",
"pre_mm_projector_norm",
)
if any(name.startswith(prefix) for prefix in valid_prefixes):
# process vision tensors
if name.endswith(("q_proj.weight", "q_proj.bias")):
if name.endswith(("q_proj.weight", "q_proj.bias")) and not self.is_mistral_format:
data_torch = LlamaModel.permute(data_torch, n_head, n_head)
if name.endswith(("k_proj.weight", "k_proj.bias")):
if name.endswith(("k_proj.weight", "k_proj.bias")) and not self.is_mistral_format:
data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
return [(self.map_tensor_name(name), data_torch)]
if self.img_break_tok_id > 0 and "embed_tokens.weight" in name:
embed_key = "embed_tokens.weight" if not self.is_mistral_format else "tok_embeddings.weight"
if self.img_break_tok_id > 0 and embed_key in name:
logger.info(f"Extracting [IMG_BREAK] token embedding from {name}")
# for pixtral model, we need to extract the [IMG_BREAK] token embedding
img_break_embd = data_torch[self.img_break_tok_id]
@@ -3526,7 +3598,7 @@ class Qwen3MoeModel(Qwen2MoeModel):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
hparams = ModelBase.load_hparams(self.dir_model)
hparams = ModelBase.load_hparams(self.dir_model, False)
self.origin_hf_arch = hparams.get('architectures', [None])[0]
def set_vocab(self):
@@ -4683,7 +4755,7 @@ class NomicBertModel(BertModel):
def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, **kwargs: Any):
hparams = kwargs.pop("hparams", None)
if hparams is None:
hparams = ModelBase.load_hparams(dir_model)
hparams = ModelBase.load_hparams(dir_model, False)
self.is_moe = bool(hparams.get("moe_every_n_layers"))
self.model_arch = gguf.MODEL_ARCH.NOMIC_BERT_MOE if self.is_moe else gguf.MODEL_ARCH.NOMIC_BERT
@@ -8102,7 +8174,6 @@ class GptOssModel(TextModel):
def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
blocks0: Tensor = torch.zeros(1)
blocks1: Tensor = torch.zeros(1)
found_mxfp4_tensors = False
# we assume that tensors are loaded in the correct order
for name, data_torch in self.get_tensors():
if "mlp.experts.down_proj_blocks" in name:
@@ -8110,7 +8181,6 @@ class GptOssModel(TextModel):
elif "mlp.experts.down_proj_scales" in name:
new_name = self.map_tensor_name(name.replace("_scales", ".weight"))
self.repack_mxfp4(new_name, blocks0, data_torch)
found_mxfp4_tensors = True
elif "mlp.experts.gate_up_proj_blocks" in name:
blocks0, blocks1 = data_torch[:, ::2, :, :], data_torch[:, 1::2, :, :]
elif "mlp.experts.gate_up_proj_scales" in name:
@@ -8119,9 +8189,6 @@ class GptOssModel(TextModel):
new_name_up = self.map_tensor_name(name.replace("gate_up_proj_scales", "up_proj.weight"))
self.repack_mxfp4(new_name_gate, blocks0, scales0)
self.repack_mxfp4(new_name_up, blocks1, scales1)
found_mxfp4_tensors = True
if not found_mxfp4_tensors:
raise ValueError("No MXFP4 tensors found in the model. Please make sure you are using MXFP4 model.")
return []
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
@@ -8134,7 +8201,12 @@ class GptOssModel(TextModel):
if "down_proj" in name:
if name.endswith("_bias"):
name = name.replace("down_proj_bias", "down_proj.bias")
elif "_blocks" not in name and "_scales" not in name:
logger.warning(f"{name} is not in MXFP4, performance may be degraded")
name = name.replace("down_proj", "down_proj.weight")
data_torch = data_torch.transpose(-1, -2)
else:
# otherwise, it should already be repacked to ggml MXFP4 format
return []
# split the gate_up into gate and up
@@ -8147,7 +8219,18 @@ class GptOssModel(TextModel):
(self.map_tensor_name(name_gate), gate_proj_bias),
(self.map_tensor_name(name_up), up_proj_bias)
]
elif "_blocks" not in name and "_scales" not in name:
logger.warning(f"{name} is not in MXFP4, performance may be degraded")
name_up = name.replace("gate_up_proj", "up_proj.weight")
name_gate = name.replace("gate_up_proj", "gate_proj.weight")
data_torch = data_torch.transpose(-1, -2)
gate_proj_weight, up_proj_weight = data_torch[:, ::2, :], data_torch[:, 1::2, :]
return [
(self.map_tensor_name(name_gate), gate_proj_weight),
(self.map_tensor_name(name_up), up_proj_weight)
]
else:
# otherwise, it should already be repacked to ggml MXFP4 format
return []
return [(self.map_tensor_name(name), data_torch)]
@@ -8293,6 +8376,77 @@ class SmallThinkerModel(TextModel):
if len(experts) > 0:
raise ValueError(f"Unprocessed experts: {experts}")
class MistralModel(LlamaModel):
model_arch = gguf.MODEL_ARCH.LLAMA
model_name = "Mistral"
hf_arch = ""
is_mistral_format = True
undo_permute = False
@staticmethod
def get_community_chat_template(vocab: MistralVocab, templates_dir: Path):
assert TokenizerVersion is not None, "mistral_common is not installed"
assert isinstance(vocab.tokenizer, (Tekkenizer, SentencePieceTokenizer)), (
f"Expected Tekkenizer or SentencePieceTokenizer, got {type(vocab.tokenizer)}"
)
if vocab.tokenizer.version == TokenizerVersion.v1:
return "mistral-v1"
elif vocab.tokenizer.version == TokenizerVersion.v3 and vocab.tokenizer_type == MistralTokenizerType.spm:
return "mistral-v3"
elif vocab.tokenizer.version == TokenizerVersion.v3 and vocab.tokenizer_type == MistralTokenizerType.tekken:
return "mistral-v3-tekken"
elif vocab.tokenizer.version == TokenizerVersion.v7 and vocab.tokenizer_type == MistralTokenizerType.spm:
return "mistral-v7"
elif vocab.tokenizer.version == TokenizerVersion.v7 and vocab.tokenizer_type == MistralTokenizerType.tekken:
return "mistral-v7-tekken"
elif vocab.tokenizer.version == TokenizerVersion.v11:
template_file = "Mistral-Small-3.2-24B-Instruct-2506.jinja"
elif vocab.tokenizer.version == TokenizerVersion.v13:
template_file = "unsloth-mistral-Devstral-Small-2507.jinja"
else:
raise ValueError(f"Unknown tokenizer type: {vocab.tokenizer_type} and version {vocab.tokenizer.version}")
template_path = templates_dir / template_file
if not template_path.exists():
raise FileNotFoundError(f"Template file not found: {template_path}")
with open(template_path, "r", encoding="utf-8") as f:
template = f.read()
return template
class PixtralModel(LlavaVisionModel):
model_name = "Pixtral"
hf_arch = ""
is_mistral_format = True
def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.PIXTRAL)
self.gguf_writer.add_vision_attention_layernorm_eps(
self.find_hparam(["norm_eps"])
)
self.gguf_writer.add_rope_freq_base(self.find_vparam(["rope_theta"]))
self.gguf_writer.add_vision_use_silu(True)
# spatial_merge_size
if self.find_vparam(["mm_projector_id"]) == "patch_merge":
self.gguf_writer.add_vision_spatial_merge_size(
self.find_vparam(["spatial_merge_size"])
)
def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str:
if name == "vision_language_adapter.w_in.weight":
return "mm.1.weight"
elif name == "vision_language_adapter.w_out.weight":
return "mm.2.weight"
return super().map_tensor_name(name, try_suffixes)
###### CONVERSION LOGIC ######
@@ -8443,6 +8597,10 @@ def parse_args() -> argparse.Namespace:
"--mmproj", action="store_true",
help="(Experimental) Export multimodal projector (mmproj) for vision models. This will only work on some vision models. A prefix 'mmproj-' will be added to the output file name.",
)
parser.add_argument(
"--mistral-format", action="store_true",
help="Whether the model is stored following the Mistral format.",
)
args = parser.parse_args()
if not args.print_supported_models and args.model is None:
@@ -8548,17 +8706,25 @@ def main() -> None:
if "mmproj" not in fname_out.name:
fname_out = ModelBase.add_prefix_to_filename(fname_out, "mmproj-")
is_mistral_format = args.mistral_format
with torch.inference_mode():
output_type = ftype_map[args.outtype]
model_type = ModelType.MMPROJ if args.mmproj else ModelType.TEXT
hparams = ModelBase.load_hparams(dir_model)
model_architecture = get_model_architecture(hparams, model_type)
logger.info(f"Model architecture: {model_architecture}")
try:
model_class = ModelBase.from_model_architecture(model_architecture, model_type=model_type)
except NotImplementedError:
logger.error(f"Model {model_architecture} is not supported")
sys.exit(1)
hparams = ModelBase.load_hparams(dir_model, is_mistral_format)
if not is_mistral_format:
model_architecture = get_model_architecture(hparams, model_type)
logger.info(f"Model architecture: {model_architecture}")
try:
model_class = ModelBase.from_model_architecture(model_architecture, model_type=model_type)
except NotImplementedError:
logger.error(f"Model {model_architecture} is not supported")
sys.exit(1)
elif args.mmproj:
assert hparams.get("vision_encoder") is not None, "This model does not support multimodal"
model_class = PixtralModel
else:
model_class = MistralModel
model_instance = model_class(dir_model, output_type, fname_out,
is_big_endian=args.bigendian, use_temp_file=args.use_temp_file,
@@ -8567,7 +8733,8 @@ def main() -> None:
split_max_tensors=args.split_max_tensors,
split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run,
small_first_shard=args.no_tensor_first_split,
remote_hf_model_id=hf_repo_id)
remote_hf_model_id=hf_repo_id,
)
if args.vocab_only:
logger.info("Exporting model vocab...")

View File

@@ -340,7 +340,7 @@ if __name__ == '__main__':
sys.exit(1)
else:
logger.info(f"Loading base model: {dir_base_model.name}")
hparams = ModelBase.load_hparams(dir_base_model)
hparams = ModelBase.load_hparams(dir_base_model, False)
with torch.inference_mode():
try:

View File

@@ -13,7 +13,7 @@ If there are differences in usage, please refer to the official build [documenta
Clone llama.cpp:
```bash
git clone https://github.com/ggerganov/llama.cpp
git clone https://github.com/ggml-org/llama.cpp
cd llama.cpp
```

View File

@@ -12,7 +12,7 @@ If there are differences in usage, please refer to the official build [documenta
Clone llama.cpp:
```bash
git clone https://github.com/ggerganov/llama.cpp
git clone https://github.com/ggml-org/llama.cpp
cd llama.cpp
```

View File

@@ -281,10 +281,10 @@ ggml_backend_t ggml_backend_blas_init(void) {
ggml_backend_blas_context * ctx = new ggml_backend_blas_context;
ggml_backend_t backend = new ggml_backend {
/* .guid = */ ggml_backend_blas_guid(),
/* .interface = */ blas_backend_i,
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_blas_reg(), 0),
/* .context = */ ctx,
/* .guid = */ ggml_backend_blas_guid(),
/* .iface = */ blas_backend_i,
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_blas_reg(), 0),
/* .context = */ ctx,
};
#if defined(OPENBLAS_VERSION) && defined(GGML_USE_OPENMP)

View File

@@ -812,7 +812,7 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
ggml_cann_release_resources(ctx, src_trans_tensor);
return;
} else {
GGML_ABORT("Unsupport dst is not tontiguous.");
GGML_ABORT("Unsupport dst is not contiguous.");
}
}
ggml_cann_release_resources(ctx, acl_src, acl_dst);
@@ -1330,160 +1330,196 @@ static void aclnn_pow_tensor_tensor(ggml_backend_cann_context& ctx,
}
/**
* @brief Applies the Alibi (Attention with Linear Biases) mechanism to the
* @details This function implements the Alibi mechanism, which introduces
* learnable biases into the attention scores to simulate relative
* position encoding without the need for explicit positional
* embeddings.
* @brief Generate a range of values and apply a scalar base exponentiation.
*
* @param ctx The backend CANN context for executing operations.
* @param acl_src The source tensor representing the query or key.
* @param acl_position The position tensor containing relative positions.
* @param acl_dst The destination tensor where the result will be stored.
* @param n_head The number of attention heads.
* @param src_ne The dimensions of the source tensor.
* @param src_nb0 The byte size of the first dimension of the source
tensor.
* @param max_bias The maximum bias value used in the Alibi mechanism.
* @param dst The destination tensor object for additional metadata.
* This function creates an evenly spaced sequence from `start` to `stop` (exclusive),
* with step size `step`, stores it in a temporary buffer, and then computes:
*
* The function performs the following steps:
* 1. Calculates the logarithm floor of the number of heads to determine the
base for bias calculation.
* 2. Initializes arrays with arithmetic sequences and fills them with bias
values.
* 3. Computes the bias tensor based on the calculated biases and arithmetic
sequences.
* 4. Reshapes the bias tensor to match the dimensions of the input tensors.
* 5. Multiplies the position tensor by the bias tensor.
* 6. Adds the result of the multiplication to the source tensor to produce the
final output.
* @f[
* slope[i] = m^{\left( start + i \cdot step \right)}, \quad 0 \le i < size
* @f]
*
* The results are written to the provided @p slope_buffer.
*
* @param ctx CANN backend context for memory allocation and operator execution.
* @param slope_buffer Pointer to the output buffer (float array) for the computed slope values.
* @param m Scalar base for the exponentiation.
* @param size Number of elements in the generated sequence.
* @param start Starting exponent offset.
* @param stop Stopping exponent offset (exclusive).
* @param step Step size for the exponent increment.
*/
static void aclnn_alibi(ggml_backend_cann_context& ctx, aclTensor* acl_src,
aclTensor* acl_position, aclTensor* acl_dst,
const int n_head, int64_t* src_ne, const size_t src_nb0,
float max_bias, ggml_tensor* dst) {
const int64_t ne2_ne3 = src_ne[2] * src_ne[3];
GGML_ASSERT(src_nb0 == sizeof(float));
GGML_ASSERT(n_head == src_ne[2]);
static void aclnn_get_slope_inner(ggml_backend_cann_context& ctx, void* slope_buffer,
float m, int64_t size, float start, float stop, float step){
int64_t ne[] = {size};
size_t nb[] = {sizeof(float)};
const int n_heads_log2_floor = 1u << (uint32_t)floor(log2(n_head));
ggml_cann_pool_alloc arange_allocator(ctx.pool(), size * sizeof(float));
void* arange_buffer = arange_allocator.get();
float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
aclTensor* arange_tensor = ggml_cann_create_tensor(
arange_buffer, ACL_FLOAT, sizeof(float), ne, nb, 1);
aclnn_arange(ctx, arange_tensor, start, stop, step, size);
// init arange
ggml_cann_pool_alloc arange_allocator(ctx.pool(),
ne2_ne3 * ggml_type_size(dst->type));
void* tmp_arange_buffer = arange_allocator.get();
aclTensor* slope_tensor = ggml_cann_create_tensor(
slope_buffer, ACL_FLOAT, sizeof(float), ne, nb, 1);
// arange1: [1, ..., n_heads_log2_floor+1)
float start = 1;
float stop = n_heads_log2_floor + 1;
float step = 1;
int64_t n_elements_arange = n_heads_log2_floor;
aclScalar* sc = aclCreateScalar(&m, aclDataType::ACL_FLOAT);
int64_t tmp_arange1_ne[] = {n_heads_log2_floor};
size_t tmp_arange1_nb[] = {sizeof(dst->type)};
aclTensor* tmp_arange1_tensor = ggml_cann_create_tensor(
tmp_arange_buffer, ggml_cann_type_mapping(dst->type),
ggml_type_size(dst->type), tmp_arange1_ne, tmp_arange1_nb,
GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
aclnn_arange(ctx, tmp_arange1_tensor, start, stop, step, n_elements_arange);
aclTensor* tmp_arange2_tensor = nullptr;
if (n_heads_log2_floor < ne2_ne3) {
// arange2: [1, ..., 2 * (k - n_heads_log2_floor) + 1)
start = 1;
stop = 2 * (ne2_ne3 - n_heads_log2_floor) + 1;
step = 2;
n_elements_arange = ne2_ne3 - n_heads_log2_floor;
int64_t tmp_arange2_ne[] = {ne2_ne3 - n_heads_log2_floor};
size_t tmp_arange2_nb[] = {sizeof(dst->type)};
aclTensor* tmp_arange2_tensor = ggml_cann_create_tensor(
(char*)tmp_arange_buffer +
n_heads_log2_floor * ggml_type_size(dst->type),
ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
tmp_arange2_ne, tmp_arange2_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
aclnn_arange(ctx, tmp_arange2_tensor, start, stop, step,
n_elements_arange);
}
// init mk_base
ggml_cann_pool_alloc mk_base_allocator(ctx.pool(),
ne2_ne3 * ggml_type_size(dst->type));
void* tmp_mk_base_buffer = mk_base_allocator.get();
int64_t tmp_mk_base1_ne[] = {n_heads_log2_floor};
size_t tmp_mk_base1_nb[] = {sizeof(dst->type)};
aclTensor* tmp_mk_base1_tensor = ggml_cann_create_tensor(
tmp_mk_base_buffer, ggml_cann_type_mapping(dst->type),
ggml_type_size(dst->type), tmp_mk_base1_ne, tmp_mk_base1_nb,
GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
aclnn_fill_scalar(ctx, m0, tmp_mk_base1_tensor);
aclTensor* tmp_mk_base2_tensor = nullptr;
if (n_heads_log2_floor < ne2_ne3) {
int64_t tmp_mk_base2_ne[] = {ne2_ne3 - n_heads_log2_floor};
size_t tmp_mk_base2_nb[] = {sizeof(dst->type)};
aclTensor* tmp_mk_base2_tensor = ggml_cann_create_tensor(
(char*)tmp_mk_base_buffer +
n_heads_log2_floor * ggml_type_size(dst->type),
ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type),
tmp_mk_base2_ne, tmp_mk_base2_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
aclnn_fill_scalar(ctx, m1, tmp_mk_base2_tensor);
}
// init mk
int64_t tmp_mk_base_ne[] = {ne2_ne3};
size_t tmp_mk_base_nb[] = {sizeof(dst->type)};
aclTensor* tmp_mk_base_tensor = ggml_cann_create_tensor(
tmp_mk_base_buffer, ggml_cann_type_mapping(dst->type),
ggml_type_size(dst->type), tmp_mk_base_ne, tmp_mk_base_nb,
GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
aclTensor* tmp_arange_tensor = ggml_cann_create_tensor(
tmp_arange_buffer, ggml_cann_type_mapping(dst->type),
ggml_type_size(dst->type), tmp_mk_base_ne, tmp_mk_base_nb,
GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
aclnn_pow_tensor_tensor(ctx, tmp_mk_base_tensor, tmp_arange_tensor);
// reshape mk
int64_t tmp_mk_ne[] = {1, 1, src_ne[2], src_ne[3]};
size_t tmp_mk_nb[GGML_MAX_DIMS];
tmp_mk_nb[0] = ggml_type_size(dst->type);
for (int i = 1; i < GGML_MAX_DIMS; i++) {
tmp_mk_nb[i] = tmp_mk_nb[i - 1] * tmp_mk_ne[i - 1];
}
aclTensor* tmp_mk_tensor = ggml_cann_create_tensor(
tmp_mk_base_buffer, ggml_cann_type_mapping(dst->type),
ggml_type_size(dst->type), tmp_mk_ne, tmp_mk_nb, GGML_MAX_DIMS,
ACL_FORMAT_ND);
// acl_position * mk
int64_t tmp_output_ne[] = {src_ne[0], src_ne[1], src_ne[2], src_ne[3]};
size_t tmp_output_nb[GGML_MAX_DIMS];
tmp_output_nb[0] = ggml_type_size(dst->type);
for (int i = 1; i < GGML_MAX_DIMS; i++) {
tmp_output_nb[i] = tmp_output_nb[i - 1] * tmp_output_ne[i - 1];
}
ggml_cann_pool_alloc output_allocator(ctx.pool(), ggml_nbytes(dst));
void* tmp_output_buffer = output_allocator.get();
aclTensor* tmp_output_tensor = ggml_cann_create_tensor(
tmp_output_buffer, ggml_cann_type_mapping(dst->type),
ggml_type_size(dst->type), tmp_output_ne, tmp_output_nb, GGML_MAX_DIMS,
ACL_FORMAT_ND);
aclnn_mul(ctx, acl_position, tmp_mk_tensor, tmp_output_tensor);
// add
aclnn_add(ctx, tmp_output_tensor, acl_src, acl_dst);
ggml_cann_release_resources(ctx, tmp_arange1_tensor, tmp_arange2_tensor,
tmp_mk_base1_tensor, tmp_mk_base2_tensor, tmp_mk_base_tensor,
tmp_arange_tensor, tmp_mk_tensor, tmp_output_tensor);
GGML_CANN_CALL_ACLNN_OP(ctx, PowScalarTensor, sc, arange_tensor, slope_tensor);
ggml_cann_release_resources(ctx, sc, arange_tensor, slope_tensor);
}
void ggml_cann_cpy(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
/**
* @brief Compute slope values for multiple attention heads based on ALiBi bias parameters.
*
* This function generates slope values for each attention head according to the ALiBi
* (Attention with Linear Biases) method. It splits the computation into two ranges depending
* on whether the head index is less than @p n_head_log2 or not, and uses different base values
* (`m0` and `m1`) for the exponentiation.
*
* @f[
* slope[h] =
* \begin{cases}
* m_0^{(h + 1)}, & h < n\_head\_log2 \\
* m_1^{\left( 2 \cdot (h - n\_head\_log2) + 1 \right)}, & h \geq n\_head\_log2
* \end{cases}
* \quad , \quad \text{if } max\_bias > 0
* @f]
*
* If @p max_bias <= 0, all slope values are set to 1.0.
*
* @param ctx CANN backend context for memory allocation and operator execution.
* @param n_head Total number of attention heads.
* @param slope_buffer Pointer to the output buffer (float array) for storing slopes.
* @param max_bias Maximum bias value for slope computation.
*
*/
static void aclnn_get_slope(ggml_backend_cann_context & ctx, int64_t n_head,
void* slope_buffer, float max_bias) {
const int n_head_log2 = 1u << (uint32_t) floor(log2(n_head));
float m0 = powf(2.0f, -(max_bias) / n_head_log2);
float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
// const float slope = (max_bias > 0.0f) ?
// h < n_head_log2 ?
// powf(m0, h + 1) :
// powf(m1, 2*(h - n_head_log2) + 1) :
// 1.0f;
// arange1
float start = 0 + 1;
float end = (n_head_log2 - 1) + 1;
float step = 1;
float count = n_head_log2;
// end needs to be +1 because aclnn uses a left-closed, right-open interval.
aclnn_get_slope_inner(ctx, slope_buffer, m0, count, start, end + 1, step);
if (n_head_log2 < n_head) {
// arange2
start = 2 * (n_head_log2 - n_head_log2) + 1;
end = 2 * ((n_head - 1) - n_head_log2) + 1;
step = 2;
count = n_head - n_head_log2;
aclnn_get_slope_inner(
ctx, (char *) slope_buffer + n_head_log2 * sizeof(float),
m1, count, start, end + 1, step);
}
}
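
For reference, the piecewise slope formula documented above can be written as a short host-side sketch (plain C++ with illustrative names, not part of the CANN backend):

```cpp
#include <cmath>
#include <vector>

// Host-side sketch of the ALiBi slope formula documented above.
// With max_bias <= 0 every slope is 1.0f.
static std::vector<float> alibi_slopes_ref(int64_t n_head, float max_bias) {
    const int64_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
    const float m0 = powf(2.0f, -max_bias          / n_head_log2);
    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);

    std::vector<float> slope(n_head, 1.0f);
    if (max_bias <= 0.0f) {
        return slope;
    }
    for (int64_t h = 0; h < n_head; ++h) {
        slope[h] = h < n_head_log2
            ? powf(m0, (float) (h + 1))                     // exponents 1, 2, ..., n_head_log2
            : powf(m1, (float) (2*(h - n_head_log2) + 1));  // exponents 1, 3, 5, ...
    }
    return slope;
}
```

The two aclnn_get_slope_inner calls generate exactly these two exponent ranges, so each range needs only one arange plus one PowScalarTensor launch.
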
/**
* @brief Add ALiBi (Attention with Linear Biases) positional biases to the attention mask.
*
* This function computes the ALiBi slopes for each attention head (if max_bias > 0),
* multiplies them with the attention mask to produce bias tensors, and adds these biases
* to the destination tensor (@p dst).
*
* The function performs necessary broadcasting of the mask and slope tensors to match
* the shape of the destination tensor, then applies element-wise multiplication and addition
* using CANN operators.
*
* @param ctx CANN backend context for memory management and operator execution.
* @param mask Input attention mask tensor, assumed to be contiguous.
* @param dst Destination tensor to which ALiBi biases will be added.
* @param dst_ptr Pointer to the memory of the destination tensor.
* @param max_bias Maximum bias value controlling the slope scaling.
*
* @note
* - Write data into dst_ptr using only the shape information of the dst tensor.
* - `GGML_MAX_DIMS + 2` is used to extend tensor dimensions for broadcasting.
*/
static void aclnn_add_alibi(ggml_backend_cann_context& ctx, ggml_tensor* mask,
ggml_tensor* dst, void* dst_ptr, float max_bias) {
void* slope_buffer = nullptr;
void* bias_buffer = nullptr;
if (max_bias > 0.0f) {
int64_t n_heads = dst->ne[2];
ggml_cann_pool_alloc slope_allocator(ctx.pool(), n_heads * sizeof(float));
slope_buffer = slope_allocator.get();
ggml_cann_pool_alloc bias_allocator(
ctx.pool(), ggml_nelements(dst) * ggml_element_size(dst));
bias_buffer = bias_allocator.get();
aclnn_get_slope(ctx, n_heads, slope_buffer, max_bias);
}
// broadcast for mask, slope and dst
int64_t nr2 = dst->ne[2] / mask->ne[2];
int64_t nr3 = dst->ne[3] / mask->ne[3];
// broadcast the mask across rows
int64_t mask_ne[] = { mask->ne[0], dst->ne[1], mask->ne[2], 1, mask->ne[3], 1 };
size_t mask_nb[] = {
mask_nb[0] = mask->nb[0], mask_nb[1] = mask->nb[1], mask_nb[2] = mask->nb[2],
mask_nb[3] = mask->nb[2], mask_nb[4] = mask->nb[3], mask_nb[5] = mask->nb[3]
};
int64_t dst_ne[] = { dst->ne[0], dst->ne[1], mask->ne[2], nr2, mask->ne[3], nr3 };
size_t dst_nb[] = {
dst_nb[0] = dst->nb[0], dst_nb[1] = dst->nb[1], dst_nb[2] = dst->nb[2],
dst_nb[3] = dst->nb[2], dst_nb[4] = dst->nb[3], dst_nb[5] = dst->nb[3]
};
// slope is a 1 dim tensor, slope.ne2 == dst.ne2
int64_t slope_ne[] = { 1, 1, mask->ne[2], nr2, 1, 1 };
size_t slope_nb[GGML_MAX_DIMS + 2];
slope_nb[0] = sizeof(float);
for (int i = 1; i < GGML_MAX_DIMS + 2; i++) {
slope_nb[i] = slope_nb[i - 1] * slope_ne[i - 1];
}
aclTensor* acl_slope = ggml_cann_create_tensor(
slope_buffer, ACL_FLOAT, sizeof(float),
slope_ne, slope_nb, GGML_MAX_DIMS + 2);
aclTensor* acl_mask = ggml_cann_create_tensor(
mask, mask_ne, mask_nb, GGML_MAX_DIMS + 2);
// write data into dst_ptr using only the shape information of the dst tensor.
aclTensor* acl_dst = ggml_cann_create_tensor(
dst_ptr, ggml_cann_type_mapping(dst->type),
ggml_type_size(dst->type), dst_ne, dst_nb,
GGML_MAX_DIMS + 2);
if (max_bias > 0.0f) {
int64_t bias_ne[] = { mask->ne[0], dst->ne[1], mask->ne[2], nr2, mask->ne[3], 1 };
size_t bias_nb[GGML_MAX_DIMS + 2];
bias_nb[0] = sizeof(float);
for (int i = 1; i < GGML_MAX_DIMS + 2; i++) {
bias_nb[i] = bias_nb[i - 1] * bias_ne[i - 1];
}
aclTensor* bias_tensor = ggml_cann_create_tensor(
bias_buffer, ACL_FLOAT, sizeof(float),
bias_ne, bias_nb, GGML_MAX_DIMS + 2);
aclnn_mul(ctx, acl_slope, acl_mask, bias_tensor);
aclnn_add(ctx, acl_dst, bias_tensor);
ggml_cann_release_resources(ctx, bias_tensor);
} else {
aclnn_add(ctx, acl_dst, acl_mask);
}
ggml_cann_release_resources(ctx, acl_slope, acl_mask, acl_dst);
}
void ggml_cann_cpy(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
ggml_cann_dup(ctx, dst);
}
@@ -1501,118 +1537,41 @@ void ggml_cann_cpy(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
* @param acl_dst The destination tensor where the softmax results will be
* stored.
*/
static void aclnn_softmax(ggml_backend_cann_context& ctx, aclTensor* acl_src,
int64_t dim, aclTensor* acl_dst) {
static void aclnn_softmax(ggml_backend_cann_context & ctx,
aclTensor* acl_src, int64_t dim, aclTensor * acl_dst) {
GGML_CANN_CALL_ACLNN_OP(ctx, Softmax, acl_src, dim, acl_dst);
}
void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
void ggml_cann_softmax(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
ggml_tensor* src0 = dst->src[0];
ggml_tensor* src1 = dst->src[1]; // mask
aclTensor* acl_src0 = ggml_cann_create_tensor(src0);
aclTensor* acl_dst = ggml_cann_create_tensor(dst);
aclTensor* acl_dst = ggml_cann_create_tensor(dst);
float scale = 1.0f;
float scale = 1.0f;
float max_bias = 0.0f;
memcpy(&scale, (float*)dst->op_params + 0, sizeof(float));
memcpy(&max_bias, (float*)dst->op_params + 1, sizeof(float));
memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));
// input mul scale
aclScalar* acl_scale = aclCreateScalar(&scale, aclDataType::ACL_FLOAT);
ggml_cann_pool_alloc src_tensor_allocator(ctx.pool(), ggml_nbytes(src0));
void* src_tensor_buffer = src_tensor_allocator.get();
aclTensor* softmax_tensor = ggml_cann_create_tensor(
src_tensor_buffer, ggml_cann_type_mapping(src0->type),
ggml_element_size(src0), src0->ne, src0->nb,GGML_MAX_DIMS);
size_t n_bytes = ggml_nbytes(src0);
ggml_cann_pool_alloc mul_scale_allocator(ctx.pool(), n_bytes);
void* input_mul_scale_buffer = mul_scale_allocator.get();
aclTensor* acl_input_mul_scale_tensor = ggml_cann_create_tensor(
input_mul_scale_buffer, ACL_FLOAT, ggml_type_size(src0->type), src0->ne,
src0->nb, GGML_MAX_DIMS);
bool inplace = false;
aclnn_muls(ctx, acl_src0, scale, acl_input_mul_scale_tensor, inplace);
aclnn_muls(ctx, acl_src0, scale, softmax_tensor, false);
// mask
aclTensor* acl_src1_fp32_tensor = nullptr;
aclTensor* tmp_mask_tensor = nullptr;
ggml_cann_pool_alloc src1_fp32_allocator(ctx.pool());
if (src1) {
const bool use_f16 = src1->type == GGML_TYPE_F16;
if (use_f16) {
// cast to fp32
size_t n_bytes = ggml_nelements(src1) * sizeof(float_t);
size_t src1_fp32_nb[GGML_MAX_DIMS];
src1_fp32_nb[0] = sizeof(float_t);
for (int i = 1; i < GGML_MAX_DIMS; i++) {
src1_fp32_nb[i] = src1_fp32_nb[i - 1] * src1->ne[i - 1];
}
src1_fp32_allocator.alloc(n_bytes);
void* src1_fp32_buffer = src1_fp32_allocator.get();
acl_src1_fp32_tensor = ggml_cann_create_tensor(
src1_fp32_buffer, ACL_FLOAT, sizeof(float), src1->ne,
src1_fp32_nb, GGML_MAX_DIMS);
aclTensor* acl_src1 = ggml_cann_create_tensor(src1);
aclnn_cast(ctx, acl_src1, acl_src1_fp32_tensor, ACL_FLOAT);
ggml_cann_release_resources(ctx, acl_src1);
} else {
acl_src1_fp32_tensor = ggml_cann_create_tensor(src1);
}
// broadcast the mask across rows, only use ne11 of ne01 in mask
if (src1->ne[1] != src0->ne[1]) {
// mask shape: [1,1,ne11,ne10]
int64_t tmp_mask_ne[] = {src0->ne[0], src0->ne[1], 1, 1};
size_t tmp_mask_nb[GGML_MAX_DIMS];
tmp_mask_nb[0] = sizeof(float_t);
for (int i = 1; i < GGML_MAX_DIMS; i++) {
tmp_mask_nb[i] = tmp_mask_nb[i - 1] * tmp_mask_ne[i - 1];
}
tmp_mask_tensor = ggml_cann_create_tensor(
src1->data, ACL_FLOAT, sizeof(float), tmp_mask_ne, tmp_mask_nb,
GGML_MAX_DIMS, ACL_FORMAT_ND);
}
// alibi
const int n_head = src0->ne[2];
const size_t src_nb0 = src0->nb[0];
n_bytes = ggml_nbytes(dst);
ggml_cann_pool_alloc output_allocator(ctx.pool(), n_bytes);
void* output_buffer = output_allocator.get();
aclTensor* alibi_output_tensor = ggml_cann_create_tensor(
output_buffer, ACL_FLOAT, ggml_type_size(dst->type), dst->ne,
dst->nb, GGML_MAX_DIMS);
if (max_bias <= 0.0f) {
// slope = 1.0
if (tmp_mask_tensor) {
aclnn_add(ctx, tmp_mask_tensor, acl_input_mul_scale_tensor,
alibi_output_tensor);
} else {
aclnn_add(ctx, acl_src1_fp32_tensor, acl_input_mul_scale_tensor,
alibi_output_tensor);
}
} else {
// slope != 1.0
if (tmp_mask_tensor) {
aclnn_alibi(ctx, acl_input_mul_scale_tensor, tmp_mask_tensor,
alibi_output_tensor, n_head, src0->ne, src_nb0,
max_bias, dst);
} else {
aclnn_alibi(ctx, acl_input_mul_scale_tensor,
acl_src1_fp32_tensor, alibi_output_tensor, n_head,
src0->ne, src_nb0, max_bias, dst);
}
}
// softmax
aclnn_softmax(ctx, alibi_output_tensor, 3, acl_dst);
ggml_cann_release_resources(ctx, alibi_output_tensor);
} else {
aclnn_softmax(ctx, acl_input_mul_scale_tensor, 3, acl_dst);
aclnn_add_alibi(ctx, src1, src0, src_tensor_buffer, max_bias);
}
ggml_cann_release_resources(ctx, acl_src0, acl_src1_fp32_tensor, acl_dst,
acl_scale, acl_input_mul_scale_tensor, tmp_mask_tensor);
// softmax
aclnn_softmax(ctx, softmax_tensor, 3, acl_dst);
ggml_cann_release_resources(ctx, acl_src0, acl_dst, acl_scale, softmax_tensor);
}
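
As a scalar reference for what the refactored path computes overall — scale the logits, add the (optionally ALiBi-weighted) mask broadcast across heads, then softmax each row — the following sketch may help. Names and the flat memory layout are illustrative assumptions, and the slope vector can be filled with the alibi_slopes_ref sketch further above:

```cpp
#include <algorithm>
#include <cmath>
#include <vector>

// Scalar reference: dst = softmax(src0*scale + slope[head]*mask) over the row
// dimension, with the mask row reused for every head. This is only a model of
// the semantics, not the CANN implementation.
static void soft_max_ref(const float * src0, const float * mask, float * dst,
                         int64_t ne0, int64_t n_rows, int64_t n_head,
                         float scale, const std::vector<float> & slope /* size n_head */) {
    for (int64_t h = 0; h < n_head; ++h) {
        for (int64_t r = 0; r < n_rows; ++r) {
            const float * x = src0 + (h*n_rows + r)*ne0;
            const float * m = mask ? mask + r*ne0 : nullptr;  // same mask row for every head
            float       * y = dst  + (h*n_rows + r)*ne0;

            float vmax = -INFINITY;
            for (int64_t i = 0; i < ne0; ++i) {
                y[i] = x[i]*scale + (m ? slope[h]*m[i] : 0.0f);
                vmax = std::max(vmax, y[i]);
            }
            float sum = 0.0f;
            for (int64_t i = 0; i < ne0; ++i) {
                y[i] = expf(y[i] - vmax);
                sum += y[i];
            }
            for (int64_t i = 0; i < ne0; ++i) {
                y[i] /= sum;
            }
        }
    }
}
```
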
/**
@@ -3208,104 +3167,24 @@ void ggml_cann_flash_attn_ext(ggml_backend_cann_context& ctx, ggml_tensor* dst){
// Compute the slope if needed. Derived from ggml_cann_softmax().
if(maxBias != 0.0f){
// alibi
const int64_t ne2_ne3 = src0->ne[2] * src0->ne[3];
const int64_t n_head = src0->ne[2];
const int n_heads_log2_floor = 1u << (uint32_t)floor(log2(n_head));
float m0 = powf(2.0f, -(maxBias) / n_heads_log2_floor);
float m1 = powf(2.0f, -(maxBias / 2.0f) / n_heads_log2_floor);
// init arange
ggml_cann_pool_alloc arange_allocator(ctx.pool(),
ne2_ne3 * faElemSize);
void* tmp_arange_buffer = arange_allocator.get();
const int64_t n_heads = src0->ne[2];
ggml_cann_pool_alloc slope_allocator(ctx.pool(), n_heads * sizeof(float));
void* slope_buffer = slope_allocator.get();
aclnn_get_slope(ctx, n_heads, slope_buffer, maxBias);
// arange1: [1, ..., n_heads_log2_floor+1)
float start = 1;
float stop = n_heads_log2_floor + 1;
float step = 1;
int64_t n_elements_arange = n_heads_log2_floor;
int64_t tmp_arange1_ne[] = {n_heads_log2_floor};
size_t tmp_arange1_nb[] = {faElemSize};
aclTensor* tmp_arange1_tensor = ggml_cann_create_tensor(
tmp_arange_buffer, faDataType, faElemSize,
tmp_arange1_ne, tmp_arange1_nb,
GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
aclnn_arange(ctx, tmp_arange1_tensor, start, stop, step, n_elements_arange);
aclTensor* tmp_arange2_tensor = nullptr;
if (n_heads_log2_floor < ne2_ne3) {
// arange2: [1, ..., 2 * (k - n_heads_log2_floor) + 1)
start = 1;
stop = 2 * (ne2_ne3 - n_heads_log2_floor) + 1;
step = 2;
n_elements_arange = ne2_ne3 - n_heads_log2_floor;
int64_t tmp_arange2_ne[] = {ne2_ne3 - n_heads_log2_floor};
size_t tmp_arange2_nb[] = {faElemSize};
aclTensor* tmp_arange2_tensor = ggml_cann_create_tensor(
(char*)tmp_arange_buffer +
n_heads_log2_floor * faElemSize,
faDataType, faElemSize,
tmp_arange2_ne, tmp_arange2_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
aclnn_arange(ctx, tmp_arange2_tensor, start, stop, step,
n_elements_arange);
int64_t slope_ne[] = {1, 1, n_heads, 1};
size_t slope_nb[GGML_MAX_DIMS];
slope_nb[0] = sizeof(float);
for(int i = 1;i<GGML_MAX_DIMS;i++) {
slope_nb[i] = slope_nb[i-1] * slope_ne[0];
}
// init mk_base
ggml_cann_pool_alloc mk_base_allocator(ctx.pool(),
ne2_ne3 * faElemSize);
void* tmp_mk_base_buffer = mk_base_allocator.get();
int64_t tmp_mk_base1_ne[] = {n_heads_log2_floor};
size_t tmp_mk_base1_nb[] = {faElemSize};
aclTensor* tmp_mk_base1_tensor = ggml_cann_create_tensor(
tmp_mk_base_buffer, faDataType, faElemSize,
tmp_mk_base1_ne, tmp_mk_base1_nb,
GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
aclTensor* slope_tensor = ggml_cann_create_tensor(
slope_buffer, ACL_FLOAT, sizeof(float),
slope_ne, slope_nb, GGML_MAX_DIMS);
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMul, bcast_pse_tensor, slope_tensor);
aclnn_fill_scalar(ctx, m0, tmp_mk_base1_tensor);
aclTensor* tmp_mk_base2_tensor = nullptr;
if (n_heads_log2_floor < ne2_ne3) {
int64_t tmp_mk_base2_ne[] = {ne2_ne3 - n_heads_log2_floor};
size_t tmp_mk_base2_nb[] = {faElemSize};
aclTensor* tmp_mk_base2_tensor = ggml_cann_create_tensor(
(char*)tmp_mk_base_buffer +
n_heads_log2_floor * faElemSize,
faDataType, faElemSize,
tmp_mk_base2_ne, tmp_mk_base2_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
aclnn_fill_scalar(ctx, m1, tmp_mk_base2_tensor);
}
// init mk
int64_t tmp_mk_base_ne[] = {ne2_ne3};
size_t tmp_mk_base_nb[] = {faElemSize};
aclTensor* tmp_mk_base_tensor = ggml_cann_create_tensor(
tmp_mk_base_buffer, faDataType, faElemSize,
tmp_mk_base_ne, tmp_mk_base_nb,
GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
aclTensor* tmp_arange_tensor = ggml_cann_create_tensor(
tmp_arange_buffer, faDataType, faElemSize,
tmp_mk_base_ne, tmp_mk_base_nb,
GGML_MAX_DIMS - 3, ACL_FORMAT_ND);
aclnn_pow_tensor_tensor(ctx, tmp_mk_base_tensor, tmp_arange_tensor);
// reshape mk
int64_t tmp_mk_ne[] = {1, 1, src0->ne[2], src0->ne[3]};
size_t tmp_mk_nb[GGML_MAX_DIMS];
tmp_mk_nb[0] = faElemSize;
for (int i = 1; i < GGML_MAX_DIMS; i++) {
tmp_mk_nb[i] = tmp_mk_nb[i - 1] * tmp_mk_ne[i - 1];
}
aclTensor* tmp_mk_tensor = ggml_cann_create_tensor(
tmp_mk_base_buffer, faDataType, faElemSize,
tmp_mk_ne, tmp_mk_nb, GGML_MAX_DIMS,
ACL_FORMAT_ND);
GGML_CANN_CALL_ACLNN_OP(ctx, InplaceMul, bcast_pse_tensor, tmp_mk_tensor);
ggml_cann_release_resources(ctx, tmp_arange1_tensor, tmp_arange2_tensor,
tmp_mk_base1_tensor, tmp_mk_base2_tensor, tmp_mk_base_tensor,
tmp_arange_tensor, tmp_mk_tensor);
ggml_cann_release_resources(ctx, slope_tensor);
}
}


@@ -2391,7 +2391,7 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
// only support F32 and F16.
return false;
}
return true;
return ggml_is_contiguous(op);
} break;
case GGML_OP_CONT: {
// TODO: support GGML_TYPE_BF16
@@ -2456,8 +2456,9 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
// value of paddingW should be at most half of kernelW
return (p0 <= (k0 / 2)) && (p1 <= (k1 / 2));
}
case GGML_OP_SUM:
case GGML_OP_DUP:
return ggml_is_contiguous(op);
case GGML_OP_SUM:
case GGML_OP_IM2COL:
case GGML_OP_CONCAT:
case GGML_OP_REPEAT:
@@ -2503,9 +2504,7 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
if (op->src[2]) {
return false;
}
// TODO: support broadcast
// ref: https://github.com/ggml-org/llama.cpp/pull/14435
return !op->src[1] || (op->src[1]->ne[2] == 1 && op->src[1]->ne[3] == 1);
return true;
case GGML_OP_FLASH_ATTN_EXT:{
// derived from [ggml-cuda.cu]
if(op->src[1]->type != GGML_TYPE_F16 || op->src[2]->type != GGML_TYPE_F16){
@@ -2532,11 +2531,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
// DeepSeek MLA
return false;
}
// TODO: support broadcast
// ref: https://github.com/ggml-org/llama.cpp/pull/14435
if (op->src[0]->ne[3] != 1) {
return false;
}
float logitSoftcap = 0.0f;
memcpy(&logitSoftcap, (float*)op->op_params + 2, sizeof(float));
if(logitSoftcap != 0.0f) {


@@ -214,10 +214,10 @@ ggml_backend_t ggml_backend_cpu_init(void) {
ctx->abort_callback_data = NULL;
ggml_backend_t cpu_backend = new ggml_backend {
/* .guid = */ ggml_backend_cpu_guid(),
/* .interface = */ ggml_backend_cpu_i,
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
/* .context = */ ctx,
/* .guid = */ ggml_backend_cpu_guid(),
/* .iface = */ ggml_backend_cpu_i,
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
/* .context = */ ctx,
};
if (cpu_backend == NULL) {


@@ -259,7 +259,10 @@ class tensor_traits : public ggml::cpu::tensor_traits {
const int64_t m_start = 0;
const int64_t n_step = static_cast<int64_t>(kernel->get_n_step());
const int64_t num_threads = KAI_MIN(n / n_step, nth);
int64_t num_threads = KAI_MIN(n / n_step, nth);
if (num_threads <= 0) {
num_threads = 1;
}
if (ith < num_threads) {
const int64_t num_n_per_thread0 = round_down(n / num_threads, n_step);
@@ -309,7 +312,8 @@ class tensor_traits : public ggml::cpu::tensor_traits {
GGML_ASSERT(kernel);
const int ith = params->ith;
const int nth = params->nth;
const int nth_raw = params->nth;
const int nth = nth_raw > 0 ? nth_raw : 1;
const size_t k = ne00;
const size_t m = ne11;
@@ -327,9 +331,12 @@ class tensor_traits : public ggml::cpu::tensor_traits {
const size_t num_n_per_thread = kai_roundup(kai_roundup(n, nth) / nth, n_step);
const size_t n_start = ith * num_n_per_thread;
size_t n_to_process = num_n_per_thread;
if ((n_start + n_to_process) > n) {
n_to_process = n - n_start;
size_t n_to_process = 0;
if (n_start < n) {
n_to_process = num_n_per_thread;
if ((n_start + n_to_process) > n) {
n_to_process = n - n_start;
}
}
// Calculate number of columns to be processed per thread
@@ -361,8 +368,10 @@ class tensor_traits : public ggml::cpu::tensor_traits {
const void* lhs_ptr = (const void*)((const char *)lhs_packed + lhs_packed_offset);
float *dst_ptr = reinterpret_cast<float *>(static_cast<uint8_t *>(dst->data) + dst_offset);
variant_call<void>(kernel->run_kernel, m, n_to_process, k, QK4_0, lhs_ptr, rhs_ptr, dst_ptr, dst_stride,
sizeof(float), -FLT_MAX, FLT_MAX);
if (n_to_process > 0) {
variant_call<void>(kernel->run_kernel, m, n_to_process, k, QK4_0, lhs_ptr, rhs_ptr, dst_ptr, dst_stride,
sizeof(float), -FLT_MAX, FLT_MAX);
}
return true;
}
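
The added guards handle the case where there are more threads than column blocks. A minimal sketch of the partitioning arithmetic, with a hypothetical roundup standing in for kai_roundup:

```cpp
#include <cstdint>
#include <cstdio>

// Hypothetical stand-in for kai_roundup: round x up to a multiple of m.
static int64_t roundup(int64_t x, int64_t m) { return (x + m - 1) / m * m; }

int main() {
    const int64_t n = 24, n_step = 16, nth = 4;                        // small n, several threads
    const int64_t per_thread = roundup(roundup(n, nth) / nth, n_step); // 16 in this example
    for (int64_t ith = 0; ith < nth; ++ith) {
        const int64_t n_start = ith * per_thread;
        int64_t n_to_process = 0;
        if (n_start < n) {                        // threads past the end get no work
            n_to_process = per_thread;
            if (n_start + n_to_process > n) {
                n_to_process = n - n_start;       // clamp the last working thread
            }
        }
        printf("thread %lld: start=%lld count=%lld\n",
               (long long) ith, (long long) n_start, (long long) n_to_process);
    }
    return 0;
}
```

With n = 24, n_step = 16 and 4 threads, thread 0 gets 16 columns, thread 1 gets the clamped remainder of 8, and threads 2 and 3 get zero work, which is why the kernel call above is now skipped when n_to_process is 0.
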


@@ -785,6 +785,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
const half2 * const __restrict__ K_h2,
const half2 * const __restrict__ V_h2,
const half2 * const __restrict__ mask_h2,
const float * const __restrict__ sinks_f,
float2 * const __restrict__ dstk,
float2 * const __restrict__ dstk_fixup,
const float scale,
@@ -957,6 +958,52 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
}
}
// If attention sinks are used, potentially re-scale if KQ_max is small.
// Also add the sink as a value to KQ_rowsum; this is done after synchronization of KQ_rowsum
// so it's being done unconditionally for every thread.
if (!is_fixup && (np == 1 || threadIdx.y % np == 0) && sinks_f) {
float KQ_max_scale[cols_per_thread];
#pragma unroll
for (int col = 0; col < cols_per_thread; ++col) {
static_assert(ntiles == 1 || ntiles == 2, "ntiles > 2 not implemented");
const int jc = ntiles == 1 ? 2*tile_C_VKQ::get_j(col/2) + col % 2 : tile_C_VKQ_16::get_i(col);
const float sink = sinks_f[jc % ncols2];
const float KQ_max_new = fmaxf(KQ_max[col], sink);
const float KQ_max_diff = KQ_max[col] - KQ_max_new;
KQ_max_scale[col] = expf(KQ_max_diff);
KQ_max[col] = KQ_max_new;
*((uint32_t *) &KQ_max_scale[col]) *= KQ_max_diff >= SOFTMAX_FTZ_THRESHOLD;
const float KQ_max_add = expf(sink - KQ_max_new);
KQ_rowsum[col] = KQ_max_scale[col]*KQ_rowsum[col] + KQ_max_add;
}
if (ntiles == 1) {
const half2 KQ_max_scale_h2 = make_half2(KQ_max_scale[0], KQ_max_scale[1]);
#pragma unroll
for (int i = 0; i < DV/tile_C_VKQ::I; ++i) {
#pragma unroll
for (int l = 0; l < tile_C_VKQ::ne; ++l) {
VKQ_C[i].x[l] *= KQ_max_scale_h2;
}
}
} else {
#pragma unroll
for (int col = 0; col < cols_per_thread; ++col) {
const half2 KQ_max_scale_h2 = make_half2(KQ_max_scale[col], KQ_max_scale[col]);
#pragma unroll
for (int i = 0; i < DV/tile_C_VKQ_16::J; ++i) {
#pragma unroll
for (int l0 = 0; l0 < tile_C_VKQ_16::ne; l0 += 2) {
VKQ_C_16[i*ntiles/2 + col/2].x[l0 + col % 2] *= KQ_max_scale_h2;
}
}
}
}
}
// Combine VKQ accumulator values if np > 1.
// It's also faster to do small writes to shared memory, then large write to VRAM than to do small writes to VRAM.
// So also write VKQ accumulators to shared memory in column-major format if np == 1.
@@ -1271,18 +1318,21 @@ static __global__ void flash_attn_ext_f16(
while (kbc < kbc_stop && kb0_stop == iter_k) {
const int sequence = kbc / (iter_k*iter_j*(ne02/ncols2));
const int head = (kbc - iter_k*iter_j*(ne02/ncols2)*sequence) / (iter_k*iter_j);
const int jt = (kbc - iter_k*iter_j*(ne02/ncols2)*sequence - iter_k*iter_j*head) / iter_k; // j index of current tile.
const int zt = (kbc - iter_k*iter_j*(ne02/ncols2)*sequence) / (iter_k*iter_j); // head in units of ncols2
const int jt = (kbc - iter_k*iter_j*(ne02/ncols2)*sequence - iter_k*iter_j*zt) / iter_k; // j index of current tile.
const float2 * Q_f2 = (const float2 *) (Q + nb03*sequence + nb02*(head*ncols2));
const half2 * K_h2 = (const half2 *) (K + nb13*sequence + nb12*(head*ncols2 / gqa_ratio));
const int head0 = zt * ncols2;
const float2 * Q_f2 = (const float2 *) (Q + nb03*sequence + nb02* head0);
const half2 * K_h2 = (const half2 *) (K + nb13*sequence + nb12*(head0 / gqa_ratio));
const half2 * mask_h2 = ncols2 == 1 && !mask ? nullptr :
(const half2 *) (mask + nb33*(sequence % ne33) + nb31*jt*ncols1);
float2 * dstk = ((float2 *) dst) + (sequence*ne01*ne02 + head*ncols2) * (DV/2);
float2 * dstk = ((float2 *) dst) + (sequence*ne01*ne02 + head0) * (DV/2);
const half2 * V_h2 = mla ? K_h2 + (DKQ/2 - DV/2) : (const half2 *) (V + nb23*sequence + nb22*(head*ncols2 / gqa_ratio));
const half2 * V_h2 = mla ? K_h2 + (DKQ/2 - DV/2) : (const half2 *) (V + nb23*sequence + nb22*(head0 / gqa_ratio));
const float * sinks_f = sinks ? (const float *) sinks + head0 : nullptr;
const float slope = ncols2 == 1 ? get_alibi_slope(max_bias, head, n_head_log2, m0, m1) : 1.0f;
const float slope = ncols2 == 1 ? get_alibi_slope(max_bias, head0, n_head_log2, m0, m1) : 1.0f;
const int kb0_start_kernel = kb0_start * kb_niter;
int kb0_stop_kernel = kb0_stop * kb_niter;
@@ -1295,12 +1345,12 @@ static __global__ void flash_attn_ext_f16(
if (kb0_start == 0) {
constexpr bool needs_fixup = false; // CUDA block is working on an entire tile.
flash_attn_ext_f16_process_tile<DKQ, DV, ncols1, ncols2, nwarps, ntiles, use_logit_softcap, mla, needs_fixup, is_fixup>
(Q_f2, K_h2, V_h2, mask_h2, dstk, dst_meta, scale, slope, logit_softcap,
(Q_f2, K_h2, V_h2, mask_h2, sinks_f, dstk, dst_meta, scale, slope, logit_softcap,
ne01, ne02, stride_Q1, stride_Q2, stride_K, stride_V, stride_mask, jt, kb0_start_kernel, kb0_stop_kernel);
} else {
constexpr bool needs_fixup = true; // CUDA block is working on the beginning of a tile.
flash_attn_ext_f16_process_tile<DKQ, DV, ncols1, ncols2, nwarps, ntiles, use_logit_softcap, mla, needs_fixup, is_fixup>
(Q_f2, K_h2, V_h2, mask_h2, dstk, dst_meta, scale, slope, logit_softcap,
(Q_f2, K_h2, V_h2, mask_h2, sinks_f, dstk, dst_meta, scale, slope, logit_softcap,
ne01, ne02, stride_Q1, stride_Q2, stride_K, stride_V, stride_mask, jt, kb0_start_kernel, kb0_stop_kernel);
}
@@ -1316,18 +1366,21 @@ static __global__ void flash_attn_ext_f16(
}
const int sequence = kbc / (iter_k*iter_j*(ne02/ncols2));
const int head = (kbc - iter_k*iter_j*(ne02/ncols2)*sequence) / (iter_k*iter_j);
const int jt = (kbc - iter_k*iter_j*(ne02/ncols2)*sequence - iter_k*iter_j*head) / iter_k; // j index of current tile.
const int zt = (kbc - iter_k*iter_j*(ne02/ncols2)*sequence) / (iter_k*iter_j); // head in units of ncols2
const int jt = (kbc - iter_k*iter_j*(ne02/ncols2)*sequence - iter_k*iter_j*zt) / iter_k; // j index of current tile.
const float2 * Q_f2 = (const float2 *) (Q + nb03*sequence + nb02*(head*ncols2));
const half2 * K_h2 = (const half2 *) (K + nb13*sequence + nb12*(head*ncols2 / gqa_ratio));
const int head0 = zt * ncols2;
const float2 * Q_f2 = (const float2 *) (Q + nb03*sequence + nb02* head0);
const half2 * K_h2 = (const half2 *) (K + nb13*sequence + nb12*(head0 / gqa_ratio));
const half2 * mask_h2 = ncols2 == 1 && !mask ? nullptr :
(const half2 *) (mask + nb33*(sequence % ne33) + nb31*jt*ncols1);
float2 * dstk = ((float2 *) dst) + (sequence*ne01*ne02 + head*ncols2) * (DV/2);
float2 * dstk = ((float2 *) dst) + (sequence*ne01*ne02 + head0) * (DV/2);
const half2 * V_h2 = mla ? K_h2 + (DKQ/2 - DV/2) : (const half2 *) (V + nb23*sequence + nb22*(head*ncols2 / gqa_ratio));
const half2 * V_h2 = mla ? K_h2 + (DKQ/2 - DV/2) : (const half2 *) (V + nb23*sequence + nb22*(head0 / gqa_ratio));
const float * sinks_f = sinks ? (const float *) sinks + head0 : nullptr;
const float slope = ncols2 == 1 ? get_alibi_slope(max_bias, head, n_head_log2, m0, m1) : 1.0f;
const float slope = ncols2 == 1 ? get_alibi_slope(max_bias, head0, n_head_log2, m0, m1) : 1.0f;
const int kb0_start_kernel = kb0_start * kb_niter;
int kb0_stop_kernel = kb0_stop * kb_niter;
@@ -1339,7 +1392,7 @@ static __global__ void flash_attn_ext_f16(
constexpr bool is_fixup = true; // Last index writes its data to fixup buffer to avoid data races with other blocks.
constexpr bool needs_fixup = false;
flash_attn_ext_f16_process_tile<DKQ, DV, ncols1, ncols2, nwarps, ntiles, use_logit_softcap, mla, needs_fixup, is_fixup>
(Q_f2, K_h2, V_h2, mask_h2, dstk, dst_meta, scale, slope, logit_softcap,
(Q_f2, K_h2, V_h2, mask_h2, sinks_f, dstk, dst_meta, scale, slope, logit_softcap,
ne01, ne02, stride_Q1, stride_Q2, stride_K, stride_V, stride_mask, jt, kb0_start_kernel, kb0_stop_kernel);
#else
GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask); GGML_UNUSED(sinks);
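
The sink handling added above folds one extra per-head logit into the running (online) softmax after the KV loop: the running maximum may grow, the previously accumulated values and the row sum are rescaled, and exp(sink - max) is added to the normalizer while contributing no value vector. A scalar sketch of the same update, outside any kernel and with illustrative names:

```cpp
#include <cmath>

// Scalar model of the attention-sink update: given the running softmax state
// (max, rowsum) and the accumulated weighted values acc[], fold in one sink logit.
// The sink contributes to the normalizer only, not to the weighted sum of V.
struct softmax_state {
    float   max;
    float   rowsum;
    float * acc;   // length DV, running sum of softmax(logit)*V
    int     DV;
};

static void apply_sink(softmax_state & st, float sink) {
    const float max_new = fmaxf(st.max, sink);
    const float scale   = expf(st.max - max_new);   // <= 1, rescales what was accumulated so far
    st.max    = max_new;
    st.rowsum = st.rowsum * scale + expf(sink - max_new);
    for (int i = 0; i < st.DV; ++i) {
        st.acc[i] *= scale;                         // the sink adds no value vector of its own
    }
    // The final output is acc[i] / rowsum, as in the epilogues of the kernels above.
}
```
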


@@ -49,10 +49,11 @@ static __global__ void flash_attn_tile_ext_f16(
const int sequence = blockIdx.z / ne02;
const int head = blockIdx.z - sequence*ne02;
const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
const float2 * Q_f2 = (const float2 *) (Q + nb03* sequence + nb02* head + nb01*ic0);
const half2 * K_h2 = (const half2 *) (K + nb13* sequence + nb12*(head / gqa_ratio));
const half2 * V_h2 = (const half2 *) (V + nb13* sequence + nb12*(head / gqa_ratio)); // K and V have same shape
const half * maskh = (const half *) (mask + nb33*(sequence % ne33) + nb31*ic0);
const float2 * Q_f2 = (const float2 *) (Q + nb03* sequence + nb02* head + nb01*ic0);
const half2 * K_h2 = (const half2 *) (K + nb13* sequence + nb12*(head / gqa_ratio));
const half2 * V_h2 = (const half2 *) (V + nb13* sequence + nb12*(head / gqa_ratio)); // K and V have same shape
const half * maskh = (const half *) (mask + nb33*(sequence % ne33) + nb31*ic0);
const float * sinksf = (const float *) (sinks);
const int stride_KV2 = nb11 / sizeof(half2);
@@ -242,6 +243,31 @@ static __global__ void flash_attn_tile_ext_f16(
__syncthreads();
}
//Attention sink: adjust running max and sum once per head
if (sinksf && blockIdx.y == 0) {
const half sink = __float2half(sinksf[head]);
#pragma unroll
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
half kqmax_new_j = fmaxf(kqmax[j0/nwarps], sink);
kqmax_new_j = warp_reduce_max(kqmax_new_j);
const half2 KQ_max_scale = __half2half2(hexp(kqmax[j0/nwarps] - kqmax_new_j));
kqmax[j0/nwarps] = kqmax_new_j;
const half val = hexp(sink - kqmax[j0/nwarps]);
kqsum[j0/nwarps] = kqsum[j0/nwarps] * KQ_max_scale;
if (threadIdx.x == 0) {
kqsum[j0/nwarps].x = __hadd(kqsum[j0/nwarps].x, val);
}
#pragma unroll
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
VKQ[j0/nwarps][i0/WARP_SIZE] *= KQ_max_scale;
}
}
}
float2 * dst2 = (float2 *) dst;
#pragma unroll


@@ -60,10 +60,11 @@ static __global__ void flash_attn_tile_ext_f32(
const int sequence = blockIdx.z / ne02;
const int head = blockIdx.z - sequence*ne02;
const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
const float2 * Q_f2 = (const float2 *) (Q + nb03* sequence + nb02* head + nb01*ic0);
const half2 * K_h2 = (const half2 *) (K + nb13* sequence + nb12*(head / gqa_ratio));
const half2 * V_h2 = (const half2 *) (V + nb13* sequence + nb12*(head / gqa_ratio)); // K and V have same shape
const half * maskh = (const half *) (mask + nb33*(sequence % ne33) + nb31*ic0);
const float2 * Q_f2 = (const float2 *) (Q + nb03* sequence + nb02* head + nb01*ic0);
const half2 * K_h2 = (const half2 *) (K + nb13* sequence + nb12*(head / gqa_ratio));
const half2 * V_h2 = (const half2 *) (V + nb13* sequence + nb12*(head / gqa_ratio)); // K and V have same shape
const half * maskh = (const half *) (mask + nb33*(sequence % ne33) + nb31*ic0);
const float * sinksf = (const float *) (sinks);
const int stride_KV2 = nb11 / sizeof(half2);
@@ -252,6 +253,33 @@ static __global__ void flash_attn_tile_ext_f32(
__syncthreads();
}
//Attention sink: adjust running max and sum once per head
if (sinksf && blockIdx.y == 0) {
const float sink = sinksf[head];
#pragma unroll
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
float kqmax_new_j = fmaxf(kqmax[j0/nwarps], sink);
kqmax_new_j = warp_reduce_max(kqmax_new_j);
const float KQ_max_scale = expf(kqmax[j0/nwarps] - kqmax_new_j);
kqmax[j0/nwarps] = kqmax_new_j;
const float val = expf(sink - kqmax[j0/nwarps]);
kqsum[j0/nwarps] = kqsum[j0/nwarps] * KQ_max_scale;
if (threadIdx.x == 0) {
kqsum[j0/nwarps] += val;
}
#pragma unroll
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
VKQ[j0/nwarps][i0/WARP_SIZE].x *= KQ_max_scale;
VKQ[j0/nwarps][i0/WARP_SIZE].y *= KQ_max_scale;
}
}
}
float2 * dst2 = (float2 *) dst;
#pragma unroll


@@ -82,11 +82,12 @@ static __global__ void flash_attn_ext_f16(
const int sequence = blockIdx.z / ne02;
const int head = blockIdx.z - sequence*ne02;
const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
const float * Q_f = (const float *) (Q + nb03* sequence + nb02* head + nb01*ic0);
const half * K_h = (const half *) (K + nb13* sequence + nb12*(head / gqa_ratio));
const half * V_h = (const half *) (V + nb13* sequence + nb12*(head / gqa_ratio)); // K and V have same shape
const half * maskh = (const half *) (mask + nb33*(sequence % ne33) + nb31*ic0);
const half2 * mask2 = (const half2 *) maskh;
const float * Q_f = (const float *) (Q + nb03* sequence + nb02* head + nb01*ic0);
const half * K_h = (const half *) (K + nb13* sequence + nb12*(head / gqa_ratio));
const half * V_h = (const half *) (V + nb13* sequence + nb12*(head / gqa_ratio)); // K and V have same shape
const half * maskh = (const half *) (mask + nb33*(sequence % ne33) + nb31*ic0);
const half2 * mask2 = (const half2 *) maskh;
const float * sinksf = (const float *) sinks;
const int stride_Q = nb01 / sizeof(float);
const int stride_KV = nb11 / sizeof(half);
@@ -381,6 +382,53 @@ static __global__ void flash_attn_ext_f16(
__syncthreads();
}
// Apply attention sinks
if (sinksf && blockIdx.y == 0) {
const float sinkf = sinksf[head];
const half sinkh = __float2half(sinkf);
#pragma unroll
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
const int j = j0 + threadIdx.y;
if (std::is_same<KQ_acc_t, float>::value) {
float kqmax_new = fmaxf(KQ_max_f[j0/nwarps], sinkf);
const float KQ_max_scale = expf(KQ_max_f[j0/nwarps] - kqmax_new);
KQ_max_f[j0/nwarps] = kqmax_new;
KQ_rowsum_f[j0/nwarps] = KQ_rowsum_f[j0/nwarps] * KQ_max_scale + expf(sinkf - KQ_max_f[j0/nwarps]);
const half2 scale_h2 = make_half2(KQ_max_scale, KQ_max_scale);
#pragma unroll
for (int i0 = 0; i0 < D/2; i0 += warp_size) {
const int i = i0 + threadIdx.x;
if (i0 + warp_size > D/2 && i >= D/2) break;
VKQ2[j*(D_padded/2) + i] *= scale_h2;
}
} else {
half kqmax_old = __low2half(KQ_max_h2[j0/nwarps]);
half kqmax_new = fmaxf(kqmax_old, sinkh);
KQ_max_h2[j0/nwarps] = __half2half2(kqmax_new);
const half KQ_max_scale_h = hexp(kqmax_old - kqmax_new);
const half2 KQ_max_scale = __half2half2(KQ_max_scale_h);
KQ_rowsum_h2[j0/nwarps] = KQ_rowsum_h2[j0/nwarps] * KQ_max_scale;
const half val = hexp(sinkh - kqmax_new);
KQ_rowsum_h2[j0/nwarps].x = __hadd(KQ_rowsum_h2[j0/nwarps].x, val);
#pragma unroll
for (int i0 = 0; i0 < D/2; i0 += warp_size) {
const int i = i0 + threadIdx.x;
if (i0 + warp_size > D/2 && i >= D/2) break;
VKQ2[j*(D_padded/2) + i] *= KQ_max_scale;
}
}
}
__syncthreads();
}
#pragma unroll
for (int j0 = 0; j0 < ncols; j0 += nwarps) {
const int j_VKQ = j0 + threadIdx.y;


@@ -274,23 +274,12 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
const ggml_tensor * K = dst->src[1];
const ggml_tensor * V = dst->src[2];
const ggml_tensor * mask = dst->src[3];
const ggml_tensor * sinks = dst->src[4];
ggml_cuda_set_device(ctx.device);
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
const int warp_size = ggml_cuda_info().devices[ggml_cuda_get_device()].warp_size;
const enum ggml_prec prec = ggml_flash_attn_ext_get_prec(KQV);
// TODO: currently only vec implementation for sinks is supported [TAG_ATTN_SINKS]
if (sinks) {
if (prec == GGML_PREC_DEFAULT && fast_fp16_available(cc)) {
ggml_cuda_flash_attn_ext_vec_f16(ctx, dst);
} else {
ggml_cuda_flash_attn_ext_vec_f32(ctx, dst);
}
return;
}
#if defined(GGML_HIP_ROCWMMA_FATTN)
if (GGML_CUDA_CC_IS_AMD(cc) && fp16_mma_available(cc)) {
ggml_cuda_flash_attn_ext_wmma_f16(ctx, dst);


@@ -3532,7 +3532,8 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
return op->src[1]->ne[0] == 576 && op->src[2]->ne[0] == 512 && op->src[3] && gqa_ratio % 16 == 0;
}
// TODO: more general-purpose attention sink support [TAG_ATTN_SINKS]
if (op->src[4] && op->src[0]->ne[0] != 64 && op->src[0]->ne[0] != 128) { // currently only sinks for head_size 64 and 128 are supported
if (op->src[4] && !fp16_mma_available(ggml_cuda_info().devices[dev_ctx->device].cc)
&& op->src[0]->ne[0] != 64 && op->src[0]->ne[0] != 128) {
return false;
}
if (op->src[0]->ne[0] == 192) {
@@ -3798,10 +3799,10 @@ ggml_backend_t ggml_backend_cuda_init(int device) {
}
ggml_backend_t cuda_backend = new ggml_backend {
/* .guid = */ ggml_backend_cuda_guid(),
/* .interface = */ ggml_backend_cuda_interface,
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), device),
/* .context = */ ctx,
/* .guid = */ ggml_backend_cuda_guid(),
/* .iface = */ ggml_backend_cuda_interface,
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), device),
/* .context = */ ctx,
};
return cuda_backend;


@@ -1,87 +1,117 @@
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11070
#define USE_CUB
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11070
#ifdef USE_CUB
#include <cub/cub.cuh>
using namespace cub;
#endif // USE_CUB
#include "ssm-scan.cuh"
template <size_t splitD, size_t N>
__global__ void __launch_bounds__(splitD, 2)
ssm_scan_f32(const float * __restrict__ src0, const float * __restrict__ src1, const float * __restrict__ src2,
const float * __restrict__ src3, const float * __restrict__ src4, const float * __restrict__ src5,
// We would like to keep pragma unroll for cases where L_template is not 0,
// so we suppress the clang transformation warning.
#ifdef __clang__
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wpass-failed"
#endif // __clang__
template <size_t splitD, size_t N, size_t L_template>
__global__ void __launch_bounds__(splitD, 1)
ssm_scan_f32(const float *__restrict__ src0, const float *__restrict__ src1, const float *__restrict__ src2,
const float *__restrict__ src3, const float *__restrict__ src4, const float *__restrict__ src5,
const int32_t * __restrict__ src6, float * __restrict__ dst,
const int src0_nb2, const int src0_nb3, const int src1_nb2, const int src1_nb3,
const int src2_nb1, const int src2_nb2, const int src3_nb1,
const int src4_nb2, const int src4_nb3, const int src5_nb2, const int src5_nb3,
const int64_t s_off, const int64_t d_inner, const int64_t L) {
const int64_t s_off, const int64_t d_inner, const int64_t L_param)
{
const size_t L = L_template == 0 ? L_param : L_template;
const float *s0_block = (const float *)((const char *)src0 + src6[blockIdx.x] * src0_nb3 + blockIdx.y * splitD * src0_nb2);
const float *x_block = (const float *)((const char *)src1 + (blockIdx.x * src1_nb3) + blockIdx.y * splitD * sizeof(float));
const float *dt_block = (const float *)((const char *)src2 + (blockIdx.x * src2_nb2) + blockIdx.y * splitD * sizeof(float));
const float *A_block = (const float *)((const char *)src3 + blockIdx.y * splitD * src3_nb1);
const float *B_block = (const float *)((const char *)src4 + (blockIdx.x * src4_nb3));
const float *C_block = (const float *)((const char *)src5 + (blockIdx.x * src5_nb3));
float *y_block = (float *)((char *)dst + (blockIdx.x * d_inner * L * sizeof(float)) + blockIdx.y * splitD * sizeof(float));
float *s_block = (float *)((char *)dst + s_off + blockIdx.x * src0_nb3 + blockIdx.y * splitD * src0_nb2);
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
const int bidx = blockIdx.x; // split along B (sequences)
const int bidy = blockIdx.y; // split along D (d_inner)
const int tid = threadIdx.x;
const int wid = tid / 32;
const int wtid = tid % 32;
extern __shared__ float smem[];
const int stride_sA = N + 1;
const int stride_ss0 = N + 1;
float * smem_A = smem;
float * smem_s0 = smem_A + splitD * stride_sA;
const float * s0_block = (const float *) ((const char *) src0 + src6[bidx] * src0_nb3 + bidy * splitD * src0_nb2);
const float * x_block = (const float *) ((const char *) src1 + (bidx * src1_nb3) + bidy * splitD * sizeof(float));
const float * dt_block = (const float *) ((const char *) src2 + (bidx * src2_nb2) + bidy * splitD * sizeof(float));
const float * A_block = (const float *) ((const char *) src3 + bidy * splitD * src3_nb1);
const float * B_block = (const float *) ((const char *) src4 + (bidx * src4_nb3));
const float * C_block = (const float *) ((const char *) src5 + (bidx * src5_nb3));
float * y_block = (float *) ((char *) dst + (bidx * d_inner * L * sizeof(float)) + bidy * splitD * sizeof(float));
float * s_block = (float *) ((char *) dst + s_off + bidx * src0_nb3 + bidy * splitD * src0_nb2);
const int stride_s0 = src0_nb2 / sizeof(float);
const int stride_x = src1_nb2 / sizeof(float);
const int stride_x = src1_nb2 / sizeof(float);
const int stride_dt = src2_nb1 / sizeof(float);
const int stride_A = src3_nb1 / sizeof(float);
const int stride_B = src4_nb2 / sizeof(float);
const int stride_C = src5_nb2 / sizeof(float);
const int stride_s = stride_s0;
const int stride_y = d_inner;
const int stride_B = src4_nb2 / sizeof(float);
const int stride_C = src5_nb2 / sizeof(float);
const int stride_y = d_inner;
// can N not be 16? for example 32?
if (N == 16) {
float regA[N];
float regs0[N];
__shared__ float smemB[N];
__shared__ float smemC[N];
#ifdef USE_CUB
using BlockLoad = cub::BlockLoad<float, splitD, N, cub::BLOCK_LOAD_WARP_TRANSPOSE>;
using BlockStore = cub::BlockStore<float, splitD, N, cub::BLOCK_STORE_WARP_TRANSPOSE>;
union CubTempStorage {
typename BlockLoad::TempStorage load_temp;
typename BlockStore::TempStorage store_temp;
};
__shared__ CubTempStorage cub_temp_storage;
BlockLoad(cub_temp_storage.load_temp).Load(A_block, regA);
BlockLoad(cub_temp_storage.load_temp).Load(s0_block, regs0);
#else
const int stride_s0 = src0_nb2 / sizeof(float);
const int stride_A = src3_nb1 / sizeof(float);
#pragma unroll
for (size_t i = 0; i < splitD / 4; i += 2) {
float value = A_block[(wid * warp_size + i) * stride_A + wtid];
// todo: bank conflict
// I am always confused with how to use the swizzling method to solve
// bank conflit. Hoping somebody can tell me.
smem_A[(wid * warp_size + i) * stride_sA + wtid + ((wtid / 16) > 0 ? 1 : 0)] = value;
}
#pragma unroll
for (size_t i = 0; i < splitD / 4; i += 2) {
float value = s0_block[(wid * warp_size + i) * stride_s0 + wtid];
smem_s0[(wid * warp_size + i) * stride_ss0 + wtid + ((wtid / 16) > 0 ? 1 : 0)] = value;
}
for (size_t n = 0; n < N; ++n)
{
regA[n] = A_block[threadIdx.x * stride_A + n];
regs0[n] = s0_block[threadIdx.x * stride_s0 + n];
}
#endif
__syncthreads();
for (int64_t i = 0; i < L; i++) {
float dt_soft_plus = dt_block[i * stride_dt + tid];
if (dt_soft_plus <= 20.0f) {
dt_soft_plus = log1pf(exp(dt_soft_plus));
}
float x_dt = x_block[i * stride_x + tid] * dt_soft_plus;
float sumf = 0.0f;
#pragma unroll
for (size_t j = 0; j < N; j++) {
float state = (smem_s0[tid * stride_ss0 + j] * expf(dt_soft_plus * smem_A[tid * stride_sA + j])) +
(B_block[i * stride_B + j] * x_dt);
sumf += state * C_block[i * stride_C + j];
if (i == L - 1) {
s_block[tid * stride_s + j] = state;
} else {
smem_s0[tid * stride_ss0 + j] = state;
}
for (size_t i = 0; i < L; i++)
{
if (threadIdx.x < N)
{
smemB[threadIdx.x] = B_block[i * stride_B + threadIdx.x];
smemC[threadIdx.x] = C_block[i * stride_C + threadIdx.x];
}
__syncthreads();
y_block[i * stride_y + tid] = sumf;
float dt_soft_plus = dt_block[i * stride_dt + threadIdx.x];
if (dt_soft_plus <= 20.0f)
{
dt_soft_plus = log1pf(expf(dt_soft_plus));
}
float x_dt = x_block[i * stride_x + threadIdx.x] * dt_soft_plus;
float sumf = 0.0f;
#pragma unroll
for (size_t n = 0; n < N; n++)
{
float state = regs0[n] * expf(dt_soft_plus * regA[n]) + smemB[n] * x_dt;
sumf += state * smemC[n];
regs0[n] = state;
}
y_block[i * stride_y + threadIdx.x] = sumf;
}
#ifdef USE_CUB
BlockStore(cub_temp_storage.store_temp).Store(s_block, regs0);
#else
const int stride_s = stride_s0;
#pragma unroll
for (size_t n = 0; n < N; ++n)
{
s_block[threadIdx.x * stride_s + n] = regs0[n];
}
#endif
}
#ifdef __clang__
#pragma clang diagnostic pop
#endif // __clang__
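
For orientation, the per-channel recurrence that both the old shared-memory path and the new register-based path implement is the Mamba-1 selective scan; a scalar sketch with illustrative names:

```cpp
#include <cmath>

// Scalar reference of the selective-scan recurrence computed per (channel, token):
//   dt'  = softplus(dt)
//   s[n] = s[n]*exp(dt'*A[n]) + B[n]*(x*dt')   for n in [0, N)
//   y    = sum_n s[n]*C[n]
// s (length N) is the running state carried across the L tokens of a sequence.
static float ssm_scan_step(float * s, const float * A, const float * B, const float * C,
                           float x, float dt, int N) {
    const float dt_soft_plus = dt <= 20.0f ? log1pf(expf(dt)) : dt;  // softplus with overflow guard
    const float x_dt = x * dt_soft_plus;
    float y = 0.0f;
    for (int n = 0; n < N; ++n) {
        const float state = s[n]*expf(dt_soft_plus*A[n]) + B[n]*x_dt;
        y   += state * C[n];
        s[n] = state;
    }
    return y;
}
```
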
// assumes as many threads as d_state
template <int splitH, int d_state>
@@ -201,11 +231,11 @@ static void ssm_scan_f32_cuda(const float * src0, const float * src1, const floa
const int src5_nb3, const int64_t s_off, const int64_t d_state, const int64_t head_dim,
const int64_t n_head, const int64_t n_group, const int64_t n_tok, const int64_t n_seq,
cudaStream_t stream) {
const int threads = 128;
// NOTE: if you change conditions here, be sure to update the corresponding supports_op condition!
if (src3_nb1 == sizeof(float)) {
// Mamba-2
if (d_state == 128) {
const int threads = 128;
GGML_ASSERT(d_state % threads == 0);
// NOTE: can be any power of two between 4 and 64
const int splitH = 16;
@@ -229,7 +259,6 @@ static void ssm_scan_f32_cuda(const float * src0, const float * src1, const floa
GGML_ABORT("doesn't support d_state!=(128 or 256).");
}
} else {
const int threads = 128;
// Mamba-1
GGML_ASSERT(n_head % threads == 0);
GGML_ASSERT(head_dim == 1);
@@ -237,10 +266,63 @@ static void ssm_scan_f32_cuda(const float * src0, const float * src1, const floa
const dim3 blocks(n_seq, (n_head + threads - 1) / threads, 1);
const int smem_size = (threads * (d_state + 1) * 2) * sizeof(float);
if (d_state == 16) {
ssm_scan_f32<128, 16><<<blocks, threads, smem_size, stream>>>(
src0, src1, src2, src3, src4, src5, src6, dst,
switch (n_tok)
{
case 1:
ssm_scan_f32<threads, 16, 1><<<blocks, threads, smem_size, stream>>>(
src0, src1, src2, src3, src4, src5, src6, dst,
src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2,
src3_nb1, src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, n_tok);
break;
case 2:
ssm_scan_f32<threads, 16, 2><<<blocks, threads, smem_size, stream>>>(
src0, src1, src2, src3, src4, src5, src6, dst,
src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2,
src3_nb1, src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, n_tok);
break;
case 3:
ssm_scan_f32<threads, 16, 3><<<blocks, threads, smem_size, stream>>>(
src0, src1, src2, src3, src4, src5, src6, dst,
src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2,
src3_nb1, src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, n_tok);
break;
case 4:
ssm_scan_f32<threads, 16, 4><<<blocks, threads, smem_size, stream>>>(
src0, src1, src2, src3, src4, src5, src6, dst,
src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2,
src3_nb1, src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, n_tok);
break;
case 5:
ssm_scan_f32<threads, 16, 5><<<blocks, threads, smem_size, stream>>>(
src0, src1, src2, src3, src4, src5, src6, dst,
src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2,
src3_nb1, src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, n_tok);
break;
case 6:
ssm_scan_f32<threads, 16, 6><<<blocks, threads, smem_size, stream>>>(
src0, src1, src2, src3, src4, src5, src6, dst,
src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2,
src3_nb1, src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, n_tok);
break;
case 7:
ssm_scan_f32<threads, 16, 7><<<blocks, threads, smem_size, stream>>>(
src0, src1, src2, src3, src4, src5, src6, dst,
src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2,
src3_nb1, src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, n_tok);
break;
case 8:
ssm_scan_f32<threads, 16, 8><<<blocks, threads, smem_size, stream>>>(
src0, src1, src2, src3, src4, src5, src6, dst,
src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2,
src3_nb1, src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, n_tok);
break;
default:
ssm_scan_f32<threads, 16, 0><<<blocks, threads, smem_size, stream>>>(
src0, src1, src2, src3, src4, src5, src6, dst,
src0_nb2, src0_nb3, src1_nb2, src1_nb3, src2_nb1, src2_nb2,
src3_nb1, src4_nb2, src4_nb3, src5_nb2, src5_nb3, s_off, n_head, n_tok);
break;
}
} else {
GGML_ABORT("doesn't support d_state!=16.");
}
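
The switch above gives the common short-sequence cases a compile-time trip count so the token loop can be fully unrolled, with L_template == 0 falling back to the runtime length. The same dispatch pattern in miniature (illustrative names):

```cpp
// Miniature version of the dispatch pattern: a template parameter carries the
// compile-time trip count when it is known, and 0 means "use the runtime value".
template <int L_template>
void process(const float * x, float * y, int L_param) {
    const int L = L_template == 0 ? L_param : L_template;
    // (#pragma unroll in the CUDA kernel) fully unrollable when L_template != 0
    for (int i = 0; i < L; ++i) {
        y[i] = x[i] * 2.0f;
    }
}

void process_dispatch(const float * x, float * y, int L) {
    switch (L) {
        case 1:  process<1>(x, y, L); break;
        case 2:  process<2>(x, y, L); break;
        case 3:  process<3>(x, y, L); break;
        case 4:  process<4>(x, y, L); break;
        default: process<0>(x, y, L); break;   // runtime-length fallback
    }
}
```
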


@@ -468,9 +468,22 @@ static inline float ggml_e8m0_to_fp32_half(uint8_t x) {
return result;
}
static inline uint8_t ggml_fp32_to_e8m0(float x) {
uint32_t bits;
memcpy(&bits, &x, sizeof(float));
// round half-way away from zero
bits += (bits & 0x00400000) << 1;
return (uint8_t) (bits >> 23);
}
#define GGML_E8M0_TO_FP32(x) ggml_e8m0_to_fp32(x)
#define GGML_E8M0_TO_FP32_HALF(x) ggml_e8m0_to_fp32_half(x)
#define GGML_FP32_TO_E8M0(x) ggml_fp32_to_e8m0(x)
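
E8M0 keeps only an 8-bit power-of-two exponent; the conversion above rounds to the nearest exponent by folding the mantissa MSB (bit 22) into the exponent field before shifting. A small self-contained round-trip sketch using local, illustrative copies of the helpers:

```cpp
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>

// Local illustrative copies: e8m0 encodes x as roughly 2^(e - 127).
static uint8_t fp32_to_e8m0_ref(float x) {
    uint32_t bits;
    memcpy(&bits, &x, sizeof(float));
    bits += (bits & 0x00400000) << 1;   // round half-way cases away from zero
    return (uint8_t) (bits >> 23);
}

static float e8m0_to_fp32_ref(uint8_t e) {
    return ldexpf(1.0f, (int) e - 127); // 2^(e - 127)
}

int main() {
    const float xs[] = { 1.0f, 1.4f, 1.6f, 3.0f, 0.25f };
    for (float x : xs) {
        const uint8_t e = fp32_to_e8m0_ref(x);
        printf("%.2f -> e8m0 %3u -> %.2f\n", x, (unsigned) e, e8m0_to_fp32_ref(e));
    }
    // 1.4 rounds down to 1.0, while 1.6 and 3.0 round up to 2.0 and 4.0.
    return 0;
}
```
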
/**
* Converts brain16 to float32.
*


@@ -2520,8 +2520,6 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
case GGML_OP_CLAMP:
return op->src[0]->type == GGML_TYPE_F32;
case GGML_OP_SOFT_MAX:
// TODO: support attention sinks [TAG_ATTN_SINKS]
return op->src[2] == nullptr;
case GGML_OP_NORM:
case GGML_OP_RMS_NORM:
return true;
@@ -2626,10 +2624,10 @@ ggml_backend_t ggml_backend_opencl_init(void) {
ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(dev);
ggml_backend_t backend = new ggml_backend {
/* .guid = */ ggml_backend_opencl_guid(),
/* .interface = */ ggml_backend_opencl_i,
/* .device = */ dev,
/* .context = */ backend_ctx
/* .guid = */ ggml_backend_opencl_guid(),
/* .iface = */ ggml_backend_opencl_i,
/* .device = */ dev,
/* .context = */ backend_ctx
};
return backend;
@@ -6594,17 +6592,24 @@ static void ggml_cl_soft_max(ggml_backend_t backend, const ggml_tensor * src0, c
GGML_ASSERT(src1->extra);
}
const ggml_tensor * src2 = dst->src[2];
if (src2) {
GGML_ASSERT(src2->extra);
}
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
ggml_tensor_extra_cl * extra1 = src1 ? (ggml_tensor_extra_cl *)src1->extra : nullptr;
ggml_tensor_extra_cl * extra2 = src2 ? (ggml_tensor_extra_cl *)src2->extra : nullptr;
cl_ulong offset0 = extra0->offset + src0->view_offs;
cl_ulong offsetd = extrad->offset + dst->view_offs;
cl_ulong offset1 = extra1 ? extra1->offset + src1->view_offs : offset0;
cl_ulong offset2 = extra2 ? extra2->offset + src2->view_offs : offset0;
const int ne00 = src0->ne[0];
const int ne01 = src0->ne[1];
@@ -6672,25 +6677,27 @@ static void ggml_cl_soft_max(ggml_backend_t backend, const ggml_tensor * src0, c
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), extra1 ? &extra1->data_device : &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00));
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb01));
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb02));
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb03));
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12));
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne13));
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb11));
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb12));
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb13));
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb1));
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb2));
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb3));
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(float), &scale));
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(float), &max_bias));
CL_CHECK(clSetKernelArg(kernel, 20, sizeof(float), &m0));
CL_CHECK(clSetKernelArg(kernel, 21, sizeof(float), &m1));
CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &n_head_log2));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), extra2 ? &extra2->data_device : &extra0->data_device));
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offset2));
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device));
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offsetd));
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00));
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb01));
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb02));
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb03));
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne12));
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne13));
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb11));
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb12));
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb13));
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb1));
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb2));
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb3));
CL_CHECK(clSetKernelArg(kernel, 20, sizeof(float), &scale));
CL_CHECK(clSetKernelArg(kernel, 21, sizeof(float), &max_bias));
CL_CHECK(clSetKernelArg(kernel, 22, sizeof(float), &m0));
CL_CHECK(clSetKernelArg(kernel, 23, sizeof(float), &m1));
CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &n_head_log2));
size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
size_t local_work_size[] = {(size_t)nth, 1, 1};

View File

@@ -26,6 +26,8 @@ kernel void kernel_soft_max_4_f16(
ulong offset0,
global char * src1,
ulong offset1,
global char * src2,
ulong offset2,
global char * dst,
ulong offsetd,
int ne00,
@@ -48,6 +50,7 @@ kernel void kernel_soft_max_4_f16(
) {
src0 = src0 + offset0;
src1 = src1 + offset1;
src2 = src2 + offset2;
dst = dst + offsetd;
int i03 = get_group_id(2);
@@ -60,6 +63,7 @@ kernel void kernel_soft_max_4_f16(
global float4 * psrc4 = (global float4 *)(src0 + i01*nb01 + i02*nb02 + i03*nb03);
global half4 * pmask = src1 != src0 ? (global half4 *)(src1 + i11*nb11 + i12*nb12 + i13*nb13) : 0;
global float * psrc2 = src2 != src0 ? (global float *)(src2) : 0;
global float4 * pdst4 = (global float4 *)(dst + i01*nb1 + i02*nb2 + i03*nb3);
float slope = 1.0f;
@@ -75,7 +79,7 @@ kernel void kernel_soft_max_4_f16(
}
// parallel max
float4 lmax4 = -INFINITY;
float4 lmax4 = psrc2 ? psrc2[i02] : -INFINITY;
for (int i00 = get_local_id(0); i00 < ne00/4; i00 += get_local_size(0)) {
lmax4 = fmax(lmax4, psrc4[i00]*scale + slope*(pmask ? convert_float4(pmask[i00]) : 0.0f));
}
@@ -92,7 +96,11 @@ kernel void kernel_soft_max_4_f16(
}
float lsum = lsum4.s0 + lsum4.s1 + lsum4.s2 + lsum4.s3;
const float sum = sub_group_reduce_add(lsum);
float sum = sub_group_reduce_add(lsum);
if (psrc2) {
sum += exp(psrc2[i02] - max);
}
for (int i00 = get_local_id(0); i00 < ne00/4; i00 += get_local_size(0)) {
pdst4[i00] /= sum;
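The src2 ("attention sink") handling added in this and the remaining soft_max kernels amounts to seeding the running max with the per-head sink value and adding one extra term to the denominator; a simplified numpy sketch of the idea (an illustrative assumption, not the OpenCL code):

import numpy as np

def softmax_with_sink(logits: np.ndarray, sink: float, scale: float = 1.0) -> np.ndarray:
    # the sink takes part in the max and in the denominator, but no probability is emitted for it
    z = logits * scale
    m = max(z.max(), sink)
    e = np.exp(z - m)
    return e / (e.sum() + np.exp(sink - m))

probs = softmax_with_sink(np.array([1.0, 2.0, 3.0]), sink=2.5)
print(probs, probs.sum())  # the row now sums to less than 1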

View File

@@ -26,6 +26,8 @@ kernel void kernel_soft_max_4(
ulong offset0,
global char * src1,
ulong offset1,
global char * src2,
ulong offset2,
global char * dst,
ulong offsetd,
int ne00,
@@ -48,6 +50,7 @@ kernel void kernel_soft_max_4(
) {
src0 = src0 + offset0;
src1 = src1 + offset1;
src2 = src2 + offset2;
dst = dst + offsetd;
int i03 = get_group_id(2);
@@ -60,6 +63,7 @@ kernel void kernel_soft_max_4(
global float4 * psrc4 = (global float4 *)(src0 + i01*nb01 + i02*nb02 + i03*nb03);
global float4 * pmask = src1 != src0 ? (global float4 *)(src1 + i11*nb11 + i12*nb12 + i13*nb13) : 0;
global float * psrc2 = src2 != src0 ? (global float *)(src2) : 0;
global float4 * pdst4 = (global float4 *)(dst + i01*nb1 + i02*nb2 + i03*nb3);
float slope = 1.0f;
@@ -75,7 +79,7 @@ kernel void kernel_soft_max_4(
}
// parallel max
float4 lmax4 = -INFINITY;
float4 lmax4 = psrc2 ? psrc2[i02] : -INFINITY;
for (int i00 = get_local_id(0); i00 < ne00/4; i00 += get_local_size(0)) {
lmax4 = fmax(lmax4, psrc4[i00]*scale + (pmask ? slope*pmask[i00] : 0.0f));
}
@@ -92,7 +96,11 @@ kernel void kernel_soft_max_4(
}
float lsum = lsum4.s0 + lsum4.s1 + lsum4.s2 + lsum4.s3;
const float sum = sub_group_reduce_add(lsum);
float sum = sub_group_reduce_add(lsum);
if (psrc2) {
sum += exp(psrc2[i02] - max);
}
for (int i00 = get_local_id(0); i00 < ne00/4; i00 += get_local_size(0)) {
pdst4[i00] /= sum;

View File

@@ -26,6 +26,8 @@ kernel void kernel_soft_max_f16(
ulong offset0,
global char * src1,
ulong offset1,
global char * src2,
ulong offset2,
global char * dst,
ulong offsetd,
int ne00,
@@ -48,6 +50,7 @@ kernel void kernel_soft_max_f16(
) {
src0 = src0 + offset0;
src1 = src1 + offset1;
src2 = src2 + offset2;
dst = dst + offsetd;
int i03 = get_group_id(2);
@@ -60,6 +63,7 @@ kernel void kernel_soft_max_f16(
global float * psrc0 = (global float *)(src0 + i01*nb01 + i02*nb02 + i03*nb03);
global half * pmask = src1 != src0 ? (global half *)(src1 + i11*nb11 + i12*nb12 + i13*nb13) : 0;
global float * psrc2 = src2 != src0 ? (global float *)(src2) : 0;
global float * pdst = (global float *)(dst + i01*nb1 + i02*nb2 + i03*nb3);
float slope = 1.0f;
@@ -75,7 +79,7 @@ kernel void kernel_soft_max_f16(
}
// parallel max
float lmax = -INFINITY;
float lmax = psrc2 ? psrc2[i02] : -INFINITY;
for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
lmax = fmax(lmax, psrc0[i00]*scale + (pmask ? slope*pmask[i00] : 0.0f));
}
@@ -91,7 +95,11 @@ kernel void kernel_soft_max_f16(
pdst[i00] = exp_psrc0;
}
const float sum = sub_group_reduce_add(lsum);
float sum = sub_group_reduce_add(lsum);
if (psrc2) {
sum += exp(psrc2[i02] - max);
}
for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
pdst[i00] /= sum;

View File

@@ -26,6 +26,8 @@ kernel void kernel_soft_max(
ulong offset0,
global char * src1,
ulong offset1,
global char * src2,
ulong offset2,
global char * dst,
ulong offsetd,
int ne00,
@@ -48,6 +50,7 @@ kernel void kernel_soft_max(
) {
src0 = src0 + offset0;
src1 = src1 + offset1;
src2 = src2 + offset2;
dst = dst + offsetd;
int i03 = get_group_id(2);
@@ -60,6 +63,7 @@ kernel void kernel_soft_max(
global float * psrc0 = (global float *)(src0 + i01*nb01 + i02*nb02 + i03*nb03);
global float * pmask = src1 != src0 ? (global float *)(src1 + i11*nb11 + i12*nb12 + i13*nb13) : 0;
global float * psrc2 = src2 != src0 ? (global float *)(src2) : 0;
global float * pdst = (global float *)(dst + i01*nb1 + i02*nb2 + i03*nb3);
float slope = 1.0f;
@@ -75,7 +79,7 @@ kernel void kernel_soft_max(
}
// parallel max
float lmax = -INFINITY;
float lmax = psrc2 ? psrc2[i02] : -INFINITY;
for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
lmax = fmax(lmax, psrc0[i00]*scale + (pmask ? slope*pmask[i00] : 0.0f));
}
@@ -91,7 +95,11 @@ kernel void kernel_soft_max(
pdst[i00] = exp_psrc0;
}
const float sum = sub_group_reduce_add(lsum);
float sum = sub_group_reduce_add(lsum);
if (psrc2) {
sum += exp(psrc2[i02] - max);
}
for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) {
pdst[i00] /= sum;

View File

@@ -288,7 +288,11 @@ void quantize_row_mxfp4_ref(const float * GGML_RESTRICT x, block_mxfp4 * GGML_RE
}
}
const uint8_t e = (uint8_t) (floorf(log2f(amax)) - 2 + 127);
// use -4.0f to 4.0f for the range; -6.0f to 6.0f yields worse results
// because this is a naive quantization
// TODO: use make_qkxs_nl_e8m0 instead
const uint8_t e = GGML_FP32_TO_E8M0(amax / 4.0f);
// const uint8_t e = amax > 0.0f ? (uint8_t) (floorf(log2f(amax)) - 2 + 127) : 0;
const float d = GGML_E8M0_TO_FP32_HALF(e);
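A worked sketch of that scale choice (an assumed numpy mirror of GGML_FP32_TO_E8M0 and GGML_E8M0_TO_FP32_HALF, following the gguf-py MXFP4 reference further down in this diff):

import numpy as np

def fp32_to_e8m0(v) -> np.uint8:
    bits = np.float32(v).view(np.uint32)
    bits = bits + ((bits & np.uint32(0x00400000)) << np.uint32(1))  # round the exponent away from zero
    return np.uint8((bits >> np.uint32(23)) & np.uint32(0xFF))

def e8m0_to_fp32_half(e):
    bits = np.where(e < 2, np.uint32(0x00200000) << np.uint32(e), (np.uint32(e) - 1) << np.uint32(23))
    return bits.view(np.float32)

amax = np.float32(5.0)
e = fp32_to_e8m0(amax / np.float32(4.0))  # 127: exponent of amax/4
d = e8m0_to_fp32_half(e)                  # 0.5: the half scale, because the kvalues are doubled e2m1 values
assert float(d) * 12.0 >= amax            # 12 is the largest doubled e2m1 value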
@@ -448,6 +452,303 @@ static inline int nearest_int(float fval) {
return (i & 0x007fffff) - 0x00400000;
}
// Fast sorting of scales with a hybrid non-comparative sort
struct k_sort {
int n;
int k; // number of k_values
// some useful info about the k_values
int8_t kmin; // absmin k_value (but with its sign)
int8_t kmax; // absmax k_value (but with its sign)
int8_t mid_k; // id of kmin into kvalues
// These have size k
const int8_t * k_values; // if NULL, it's assumed to be linear (i - mid_k)
float * odd; // k_values[i + 1] + k_values[i] (odd numbers when linear, hence the name)
float * step; // k_values[i + 1] - k_values[i] (if NULL, assumed to be 1)
// All of the below arrays need to have size n at least.
int32_t * ids; // original ids (into the full-precision block)
int32_t * k_ids; // denominator ids (into odd and step)
int32_t * aux_ids; // argsort ids;
float * frac; // what is actually being sorted
// temporary buffer when sorting the other buffers
union {
float * aux_f;
int32_t * aux_i;
};
// Holds indices into the bucket counts
uint16_t * Iaux;
// Where the histogram will be counted
// TODO: experiment with different bucket sizes than n
uint16_t * buckets;
// For faster non-linear rounding, always 510 bytes in size
// TODO: make this a static buffer, but how to avoid including it when non-linear quants are unused?
int8_t * k_indices;
};
// helper for k_sort buffer sizes
#define K_SORT_BUF_SIZE(n, k, range, nl) ( \
(/* odd, step */ (k) * (sizeof(float) * (1 + !!(nl)))) + \
(/* ids, k_ids, aux_ids, frac, aux */ ((n) * (range)) * (sizeof(int32_t) * 3 + sizeof(float) * 2)) + \
(/* Iaux, buckets */ ((n) * (range) * (sizeof(uint16_t) * 2))) + \
(/* k_indices */ ((nl) ? 510 * sizeof(int8_t) : 0)) \
)
// For non-linear quants.
// k is the number of possible k-values,
// range is the larger of the number of k-values on either side of the middle one,
// block is the size of a block.
#define K_SORT_BUF_SIZE_NL(block, k, range) (K_SORT_BUF_SIZE((block), (k), (range), 1))
// For linear quants. nmin should be <= 0, and nmax >= 0. block is the size of a block.
#define K_SORT_BUF_SIZE_LINEAR(block, nmin, nmax) (K_SORT_BUF_SIZE((block), (nmax) - (nmin) + 1, (nmax) > -(nmin) ? (nmax) : -(nmin), 0))
// for non-linear quants
// TODO: maybe use an array of structs instead, or malloc to simplify initialization
static void k_sort_init(struct k_sort * s, int n, int k, const int8_t * kvalues, uint8_t * buf) {
s->n = 0;
s->k = k;
const uint8_t * buf_start = buf;
s->k_values = kvalues;
s->odd = (float *) (buf);
s->step = (float *) (buf + k * sizeof(float));
buf += (2 * k) * sizeof(float);
int k_amin = abs(kvalues[0]);
int k_amax = abs(kvalues[0]);
int mid_k = 0;
int max_k = 0;
for (int i = 1; i < k; ++i) {
const int ak = abs(kvalues[i]);
if (ak < k_amin) { k_amin = ak; mid_k = i; }
if (ak > k_amax) { k_amax = ak; max_k = i; }
}
const int max_range = (mid_k > (k - mid_k)) ? mid_k : k - mid_k;
s->ids = (int32_t *) (buf + max_range * n * (sizeof(int32_t) * 0));
s->k_ids = (int32_t *) (buf + max_range * n * (sizeof(int32_t) * 1));
s->aux_ids = (int32_t *) (buf + max_range * n * (sizeof(int32_t) * 2));
s->frac = (float *) (buf + max_range * n * (sizeof(int32_t) * 3));
s->aux_f = (float *) (buf + max_range * n * (sizeof(int32_t) * 3 + sizeof(float)));
buf += max_range * n * (sizeof(int32_t) * 3 + sizeof(float) * 2);
s->Iaux = (uint16_t *) (buf);
s->buckets = (uint16_t *) (buf + n * max_range * sizeof(uint16_t));
buf += 2 * n * max_range * sizeof(uint16_t);
s->k_indices = (int8_t *) buf;
buf += 510;
GGML_ASSERT((int64_t) (buf - buf_start) == (int64_t) K_SORT_BUF_SIZE_NL(n, k, max_range));
for (int i = 1; i < k; ++i) {
// 0 to k - 1, skipping mid_k; only transitions are stored
const int j = i - ((int) (i <= mid_k));
s->odd[j] = abs(kvalues[i] + kvalues[i - 1]);
s->step[j] = abs(kvalues[i] - kvalues[i - 1]);
}
s->odd[mid_k] = 1.0f;
s->step[mid_k] = 1.0f;
s->kmin = kvalues[mid_k];
s->kmax = kvalues[max_k];
s->mid_k = mid_k;
// for faster non-linear rounding
{
int cur_k = 0;
int cur = (int) kvalues[cur_k] * 2;
int next = (int) kvalues[cur_k + 1] * 2; // assuming k is at least 2
for (int i = -256; i < 254; ++i) {
// TODO: is this always correct?
if (next != cur && abs(i - next) <= abs(i - cur)) {
cur = next;
cur_k += 1;
if (cur_k + 1 < k) {
next = (int) kvalues[cur_k + 1] * 2;
}
}
s->k_indices[i + 256] = cur_k;
}
}
}
// buf should have size from K_SORT_BUF_SIZE_LINEAR(n, nmin, nmax)
static void k_sort_init_linear(struct k_sort * s, int n, int nmin, int nmax, uint8_t * buf) {
nmin = MIN(0, nmin);
nmax = MAX(0, nmax);
const int max_range = (nmax > -nmin ? nmax : -nmin);
s->n = 0;
s->k = nmax - nmin + 1;
s->mid_k = -nmin;
s->kmin = 0;
s->kmax = -nmin > nmax ? nmin : nmax;
s->k_values = NULL;
s->odd = (float *) (buf);
s->step = NULL;
buf += s->k * sizeof(float);
s->ids = (int32_t *) (buf + max_range * n * (sizeof(int32_t) * 0));
s->k_ids = (int32_t *) (buf + max_range * n * (sizeof(int32_t) * 1));
s->aux_ids = (int32_t *) (buf + max_range * n * (sizeof(int32_t) * 2));
s->frac = (float *) (buf + max_range * n * (sizeof(int32_t) * 3));
s->aux_f = (float *) (buf + max_range * n * (sizeof(int32_t) * 3 + sizeof(float)));
buf += max_range * n * (sizeof(int32_t) * 3 + sizeof(float) * 2);
s->Iaux = (uint16_t *) (buf);
s->buckets = (uint16_t *) (buf + n * max_range * sizeof(uint16_t));
s->k_indices = NULL;
for (int i = nmin; i < nmax; ++i) {
const int j = i - nmin + (i >= 0);
s->odd[j] = abs(i + (i + 1));
}
s->odd[-nmin] = 1.0f;
}
static inline int k_sort_best_index(struct k_sort * s, float x) {
if (x <= -128.0f) {
return 0;
}
if (x >= 127.0f) {
return s->k - 1;
}
// (-256 to 253) --> (0 to 509)
// const int i = (int)floorf(x) + lroundf(x) + 256;
// NOTE: using faster primitives for rounding
const int i = (int) (x + 128.0f) + nearest_int(x) + 128;
return s->k_indices[i];
}
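The 510-entry k_indices table built in k_sort_init and consumed here may be easier to follow as a standalone sketch (an assumed Python mirror using the mxfp4 kvalues from this diff, not the ggml code itself):

import math

kvalues = [-12, -8, -6, -4, -3, -2, -1, 0, 1, 2, 3, 4, 6, 8, 12]  # sorted, already-doubled e2m1 values

table = []
cur_k = 0
cur, nxt = kvalues[0] * 2, kvalues[1] * 2  # doubled once more, to match the ~2*x index below
for i in range(-256, 254):
    if nxt != cur and abs(i - nxt) <= abs(i - cur):
        cur_k += 1
        cur = nxt
        if cur_k + 1 < len(kvalues):
            nxt = kvalues[cur_k + 1] * 2
    table.append(cur_k)

def best_index(x: float) -> int:
    if x <= -128.0:
        return 0
    if x >= 127.0:
        return len(kvalues) - 1
    return table[math.floor(x) + round(x) + 256]  # floor(x) + round(x) ~ 2*x

assert kvalues[best_index(2.4)] == 2    # 2.4 is nearer to 2 than to 3
assert kvalues[best_index(-5.1)] == -6  # snaps to the nearest non-linear level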
// Interpolation sort using a hybrid of non-comparative counting sort and insertion sort.
static void k_sort_frac_descending(struct k_sort * s) {
const int N_BUCKETS = s->n;
memset(s->buckets, 0, N_BUCKETS * sizeof(*(s->buckets)));
float max_frac = s->frac[0];
float min_frac = max_frac;
for (int i = 1; i < s->n; ++i) {
const float f = s->frac[i];
if (f > max_frac) { max_frac = f; }
if (f < min_frac) { min_frac = f; }
}
if (max_frac - min_frac > GROUP_MAX_EPS) {
const float iscale = (N_BUCKETS - 1) / (max_frac - min_frac);
// Counting sort (descending)
// This partially sorts the values and works best for uniform distributions.
for (int i = 0; i < s->n; ++i) {
const int j = N_BUCKETS - 1 - MAX(0, MIN(nearest_int((s->frac[i] - min_frac) * iscale), N_BUCKETS - 1));
s->buckets[j] += 1;
s->Iaux[i] = j;
}
for (int j = 1; j < N_BUCKETS; ++j) {
s->buckets[j] += s->buckets[j - 1];
}
for (int i = s->n - 1; i >= 0; --i) {
const int l = s->Iaux[i];
const int j = --(s->buckets[l]);
s->aux_ids[j] = i;
s->aux_f[j] = s->frac[i];
}
{ float * tmp = s->frac; s->frac = s->aux_f; s->aux_f = tmp; }
for (int i = 0; i < s->n; ++i) {
const int j = s->aux_ids[i];
s->aux_i[i] = s->k_ids[j];
}
{ int32_t * tmp = s->k_ids; s->k_ids = s->aux_i; s->aux_i = tmp; }
for (int i = 0; i < s->n; ++i) {
const int j = s->aux_ids[i];
s->aux_i[i] = s->ids[j];
}
{ int32_t * tmp = s->ids; s->ids = s->aux_i; s->aux_i = tmp; }
}
// Insertion sort (descending)
// This is very fast on mostly-sorted data,
// but will be slow if everything ended up
// in a single bucket in the previous step.
// TODO: use another adaptive sort algorithm with a better worst case time complexity
for (int i = 1; i < s->n; ++i) {
const float tmp = s->frac[i];
const int32_t tmp_k_id = s->k_ids[i];
const int32_t tmp_id = s->ids[i];
int j = i;
for (; j > 0 && s->frac[j - 1] < tmp; --j) {
s->frac[j] = s->frac[j - 1];
s->k_ids[j] = s->k_ids[j - 1];
s->ids[j] = s->ids[j - 1];
}
if (j != i) {
s->frac[j] = tmp;
s->k_ids[j] = tmp_k_id;
s->ids[j] = tmp_id;
}
}
}
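As a rough standalone illustration of that counting-then-insertion strategy (assumed toy data, simplified away from the parallel id/k_id buffers):

def hybrid_sort_descending(vals: list[float]) -> list[float]:
    n = len(vals)
    lo, hi = min(vals), max(vals)
    if hi - lo > 1e-12:
        # bucket/counting pass: roughly orders the values, best on uniform data
        scale = (n - 1) / (hi - lo)
        buckets = [[] for _ in range(n)]
        for v in vals:
            buckets[n - 1 - min(n - 1, max(0, round((v - lo) * scale)))].append(v)
        vals = [v for b in buckets for v in b]
    # insertion pass: close to O(n) once the data is mostly sorted
    for i in range(1, n):
        v, j = vals[i], i
        while j > 0 and vals[j - 1] < v:
            vals[j] = vals[j - 1]
            j -= 1
        vals[j] = v
    return vals

assert hybrid_sort_descending([0.3, 0.9, 0.1, 0.7]) == [0.9, 0.7, 0.3, 0.1]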
static void k_sort_set_x_L(struct k_sort * s, int n, int w_amax_i, const float * GGML_RESTRICT x,
const int8_t * GGML_RESTRICT L, bool negative_scale) {
const float wmax = fabsf(x[w_amax_i]);
const int k = s->k;
// Extrapolate the extremities (assuming k is at least 2)
const float max_odd = (x[w_amax_i] < 0.0f) != negative_scale ? s->odd[0] + fabsf(s->odd[0] - s->odd[1]) :
s->odd[k - 1] + fabsf(s->odd[k - 1] - s->odd[k - 2]);
int m = 0;
for (int i = 0; i < n; ++i) {
if (x[i] == 0.0f) { continue; }
const float v = fabsf(x[i]);
const float v_max_odd = v * max_odd;
const int odd_dir = (x[i] < 0.0f) != negative_scale ? -1 : 1;
for (int j = L[i] + odd_dir; 0 <= j && j < s->k; j += odd_dir) {
const float odd = s->odd[j];
// Only include scales which would not clamp the "most important" value
if (wmax * odd < v_max_odd) {
s->frac[m] = v / odd;
s->ids[m] = i;
s->k_ids[m] = j;
m += 1;
} else {
break;
}
}
}
s->n = m;
k_sort_frac_descending(s);
}
static float make_qx_quants(int n, int nmax, const float * GGML_RESTRICT x, int8_t * GGML_RESTRICT L, int rmse_type,
const float * GGML_RESTRICT qw) {
float max = 0;
@@ -700,6 +1001,106 @@ static float make_qkx2_quants(int n, int nmax, const float * GGML_RESTRICT x, co
return scale;
}
// non-linear (nearly) exhaustive search with cumulative sums
// assumes E8M0 scale and symmetric non-linear mappings (because only one sign is tried for the scale)
// also assumes the kvalues are 2 times their actual value
// (intended to be a good fit for mxfp4, which is non-linear and symmetric)
static uint8_t make_qkxs_nl_e8m0_quants(int n, const float * GGML_RESTRICT x, const float * GGML_RESTRICT weights, int8_t * GGML_RESTRICT L, int8_t * GGML_RESTRICT Laux, struct k_sort * GGML_RESTRICT k_sort) {
float sumlx = 0.0f;
float suml2 = 0.0f;
float amax = 0.0f;
float w_amax = -1.0f;
int w_amax_i = -1;
const int8_t kmin = k_sort->kmin;
for (int i = 0; i < n; ++i) {
const float w = weights ? weights[i] : x[i] * x[i];
const float ax = fabsf(x[i]);
const float wax = w * ax;
if (ax > amax) {
amax = ax;
}
if (wax > w_amax) {
w_amax = wax;
w_amax_i = i;
}
sumlx += w * x[i] * kmin;
suml2 += w * kmin * kmin;
}
if (amax < GROUP_MAX_EPS) { // all zero
memset(L, 0, n);
return 0;
}
memset(Laux, k_sort->mid_k, n);
memset(L, k_sort->mid_k, n);
// NOTE: for mxfp4, it doesn't seem beneficial to skip small max values
// {
// // start with the max at 4
// const float s = 4.0f / amax;
// sumlx = 0.0f;
// suml2 = 0.0f;
// for (int i = 0; i < n; ++i) {
// const int l = k_sort_best_index(k_sort, x[i] * s);
// const float w = weights ? weights[i] : x[i] * x[i];
// Laux[i] = l;
// L[i] = l;
// sumlx += w * k_sort->k_values[l] * x[i];
// suml2 += w * k_sort->k_values[l] * k_sort->k_values[l];
// }
// }
k_sort_set_x_L(k_sort, n, w_amax_i, x, Laux, false);
float best_err;
uint8_t best_scale_e8;
if (suml2 != 0.0f) {
const float scale = sumlx / suml2;
const uint8_t e8 = GGML_FP32_TO_E8M0(2.0f * scale);
const float new_scale = GGML_E8M0_TO_FP32_HALF(e8);
// expansion of sum((new_scale * l[i] - x[i])**2) without the sumx2 factor
const float sq_err = suml2 * (new_scale * new_scale) - 2 * sumlx * new_scale;
best_err = sq_err;
best_scale_e8 = e8;
} else {
best_err = 0.0f; // the actual best is -sumx2
best_scale_e8 = 0;
}
int best_i = -1; // consecutive with 0..k_sort->n
for (int i = 0; i < k_sort->n; ++i) {
const int ii = k_sort->ids[i];
const int k_i = k_sort->k_ids[i];
const float odd = k_sort->odd[k_i];
const float step = k_sort->step[k_i];
const float w = weights ? weights[ii] : x[ii] * x[ii];
sumlx += w * (fabsf(x[ii]) * step);
suml2 += w * (odd * step);
Laux[ii] = k_i;
if (suml2 > 0.0f) {
const float scale = sumlx / suml2;
const uint8_t e8 = GGML_FP32_TO_E8M0(2.0f * scale);
const float new_scale = GGML_E8M0_TO_FP32_HALF(e8);
// expansion of sum((new_scale * l[i] - x[i])**2) without the `+ x**2` factor
const float sq_err = suml2 * (new_scale * new_scale) - 2 * sumlx * new_scale;
if (sq_err < best_err) {
best_err = sq_err;
best_scale_e8 = e8;
if (i == best_i + 1) {
// reduce copies for consecutive bests
L[ii] = k_i;
} else {
memcpy(L, Laux, n);
}
best_i = i;
}
}
}
return best_scale_e8;
}
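The candidate ranking above leans on expanding the weighted squared error; a small self-contained check of that identity (toy numbers, not ggml data):

# sum(w*(s*l - x)^2) = s^2*sum(w*l^2) - 2*s*sum(w*l*x) + sum(w*x^2),
# so candidate scales can be compared with s^2*suml2 - 2*s*sumlx alone:
# the sum(w*x^2) term is the same for every candidate.
x = [0.8, -1.9, 3.1]
l = [1, -2, 3]
w = [xi * xi for xi in x]

suml2 = sum(wi * li * li for wi, li in zip(w, l))
sumlx = sum(wi * li * xi for wi, li, xi in zip(w, l, x))
sumx2 = sum(wi * xi * xi for wi, xi in zip(w, x))

for s in (0.9, 1.0, 1.1):
    full = sum(wi * (s * li - xi) ** 2 for wi, li, xi in zip(w, l, x))
    assert abs(full - (suml2 * s * s - 2.0 * sumlx * s + sumx2)) < 1e-9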
static inline void get_scale_min_k4(int j, const uint8_t * GGML_RESTRICT q, uint8_t * GGML_RESTRICT d, uint8_t * GGML_RESTRICT m) {
if (j < 4) {
*d = q[j] & 63; *m = q[j + 4] & 63;
@@ -2092,10 +2493,71 @@ size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst,
return nrow * row_size;
}
static void quantize_row_mxfp4_impl(const float * GGML_RESTRICT x, block_mxfp4 * GGML_RESTRICT y, int64_t n_per_row, const float * quant_weights) {
if (!quant_weights) {
quantize_row_mxfp4_ref(x, y, n_per_row);
return;
}
// like kvalues_mxfp4, but sorted
const int8_t kvalues_mxfp4_sorted[15] = {-12, -8, -6, -4, -3, -2, -1, 0, 1, 2, 3, 4, 6, 8, 12};
float weight[QK_MXFP4];
int8_t L[QK_MXFP4];
int8_t Laux[QK_MXFP4];
struct k_sort k_sort;
uint8_t buf[K_SORT_BUF_SIZE_NL(QK_MXFP4, 15, 8)] = {0};
k_sort_init(&k_sort, QK_MXFP4, 15, kvalues_mxfp4_sorted, buf);
float sum_x2 = 0;
for (int j = 0; j < n_per_row; ++j) {
sum_x2 += x[j] * x[j];
}
const float sigma2 = sum_x2 / n_per_row;
const int nb = n_per_row / QK_MXFP4;
for (int ib = 0; ib < nb; ++ib) {
const float * xb = x + QK_MXFP4 * ib;
const float * qw = quant_weights + QK_MXFP4 * ib;
for (int j = 0; j < QK_MXFP4; ++j) {
weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
}
const uint8_t e = make_qkxs_nl_e8m0_quants(QK_MXFP4, xb, weight, L, Laux, &k_sort);
y[ib].e = e;
for (int j = 0; j < QK_MXFP4; ++j) {
int8_t l = L[j] - k_sort.mid_k;
L[j] = (l & 0x08) | abs(l);
}
for (int j = 0; j < QK_MXFP4/2; ++j) {
const uint8_t x0 = L[j];
const uint8_t x1 = L[QK_MXFP4/2 + j];
y[ib].qs[j] = x0;
y[ib].qs[j] |= x1 << 4;
}
}
}
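The sign/magnitude remapping `(l & 0x08) | abs(l)` above is easy to misread; a standalone check (assumed Python, reusing the two kvalues tables that appear in this diff) that it reproduces the unsorted kvalues_mxfp4 order:

kvalues_mxfp4 = [0, 1, 2, 3, 4, 6, 8, 12, 0, -1, -2, -3, -4, -6, -8, -12]
kvalues_sorted = [-12, -8, -6, -4, -3, -2, -1, 0, 1, 2, 3, 4, 6, 8, 12]
mid_k = kvalues_sorted.index(0)  # 7

for si, v in enumerate(kvalues_sorted):
    l = si - mid_k
    nibble = (8 if l < 0 else 0) | abs(l)  # same as (l & 0x08) | abs(l) on an int8
    assert kvalues_mxfp4[nibble] == v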
size_t quantize_mxfp4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
GGML_UNUSED(quant_weights);
quantize_row_mxfp4_ref(src, dst, (int64_t)nrow*n_per_row);
return nrow * ggml_row_size(GGML_TYPE_MXFP4, n_per_row);
if (!quant_weights) {
quantize_row_mxfp4_ref(src, dst, (int64_t)nrow*n_per_row);
return nrow * ggml_row_size(GGML_TYPE_MXFP4, n_per_row);
}
size_t row_size = ggml_row_size(GGML_TYPE_MXFP4, n_per_row);
char * qrow = (char *)dst;
for (int64_t row = 0; row < nrow; ++row) {
quantize_row_mxfp4_impl(src, (block_mxfp4*)qrow, n_per_row, quant_weights);
src += n_per_row;
qrow += row_size;
}
return nrow * row_size;
}
// ====================== Ternary (de)-quantization (BitNet b1.58 and TriLMs)

View File

@@ -823,10 +823,10 @@ ggml_backend_t ggml_backend_rpc_init(const char * endpoint) {
};
ggml_backend_t backend = new ggml_backend {
/* .guid = */ ggml_backend_rpc_guid(),
/* .interface = */ ggml_backend_rpc_interface,
/* .device = */ ggml_backend_rpc_add_device(endpoint),
/* .context = */ ctx
/* .guid = */ ggml_backend_rpc_guid(),
/* .iface = */ ggml_backend_rpc_interface,
/* .device = */ ggml_backend_rpc_add_device(endpoint),
/* .context = */ ctx
};
return backend;
}

View File

@@ -4586,10 +4586,10 @@ ggml_backend_t ggml_backend_sycl_init(int device) {
};
ggml_backend_t sycl_backend = new ggml_backend {
/* .guid = */ ggml_backend_sycl_guid(),
/* .interface = */ ggml_backend_sycl_interface,
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_sycl_reg(), device),
/* .context = */ ctx
/* .guid = */ ggml_backend_sycl_guid(),
/* .iface = */ ggml_backend_sycl_interface,
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_sycl_reg(), device),
/* .context = */ ctx
};
return sycl_backend;

View File

@@ -10767,10 +10767,10 @@ ggml_backend_t ggml_backend_vk_init(size_t dev_num) {
ggml_vk_init(ctx, dev_num);
ggml_backend_t vk_backend = new ggml_backend {
/* .guid = */ ggml_backend_vk_guid(),
/* .interface = */ ggml_backend_vk_interface,
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_vk_reg(), dev_num),
/* .context = */ ctx,
/* .guid = */ ggml_backend_vk_guid(),
/* .iface = */ ggml_backend_vk_interface,
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_vk_reg(), dev_num),
/* .context = */ ctx,
};
return vk_backend;

View File

@@ -228,8 +228,7 @@ class Q4_0(__Quant, qtype=GGMLQuantizationType.Q4_0):
d = max / -8
with np.errstate(divide="ignore"):
id = np.where(d == 0, 0, 1 / d)
# FIXME: Q4_0's reference rounding is cursed and depends on FMA
qs = np.trunc((np.float64(blocks) * np.float64(id)) + np.float64(8.5), dtype=np.float32).astype(np.uint8).clip(0, 15)
qs = np.trunc((blocks * id) + np.float32(8.5), dtype=np.float32).astype(np.uint8).clip(0, 15)
qs = qs.reshape((n_blocks, 2, cls.block_size // 2))
qs = qs[..., 0, :] | (qs[..., 1, :] << np.uint8(4))
@@ -300,8 +299,7 @@ class Q5_0(__Quant, qtype=GGMLQuantizationType.Q5_0):
d = max / -16
with np.errstate(divide="ignore"):
id = np.where(d == 0, 0, 1 / d)
# FIXME: Q5_0's reference rounding is cursed and depends on FMA
q = np.trunc((np.float64(blocks) * np.float64(id)) + np.float64(16.5), dtype=np.float32).astype(np.uint8).clip(0, 31)
q = np.trunc((blocks * id) + np.float32(16.5), dtype=np.float32).astype(np.uint8).clip(0, 31)
qs = q.reshape((n_blocks, 2, cls.block_size // 2))
qs = (qs[..., 0, :] & np.uint8(0x0F)) | (qs[..., 1, :] << np.uint8(4))
@@ -655,6 +653,59 @@ class TQ2_0(__Quant, qtype=GGMLQuantizationType.TQ2_0):
return (d * qs.astype(np.float32))
class MXFP4(__Quant, qtype=GGMLQuantizationType.MXFP4):
# e2m1 values (doubled)
# ref: https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
kvalues = (0, 1, 2, 3, 4, 6, 8, 12, 0, -1, -2, -3, -4, -6, -8, -12)
@staticmethod
# see ggml_e8m0_to_fp32_half in ggml-impl.h
def e8m0_to_fp32_half(x: np.ndarray) -> np.ndarray:
bits = np.where(x < 2, np.uint32(0x00200000) << np.uint32(x), np.uint32(x - 1) << np.uint32(23))
return bits.view(np.float32)
@classmethod
def quantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
n_blocks = blocks.shape[0]
d = abs(blocks).max(axis=-1, keepdims=True)
scale = (d / np.float32(4)).view(np.uint32)
# round away from zero
scale += (scale & np.uint32(0x00400000)) << 1
e = ((scale >> 23) & np.uint32(0xFF)).astype(np.uint8)
d = cls.e8m0_to_fp32_half(e)
kvalues = np.array(cls.kvalues, dtype=np.int8).reshape((1, 1, 16))
errs = np.abs(d.reshape((n_blocks, 1, 1)) * kvalues.astype(np.float32) - blocks.reshape((n_blocks, cls.block_size, 1)))
best = np.argmin(errs, axis=-1, keepdims=True)
qs = best.reshape(n_blocks, 2, cls.block_size // 2).astype(np.uint8)
qs = qs[:, 0] | (qs[:, 1] << np.uint8(4))
qs = qs.reshape((n_blocks, cls.block_size // 2))
return np.concatenate([e, qs], axis=-1)
@classmethod
def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
n_blocks = blocks.shape[0]
e, qs = np.hsplit(blocks, [1])
d = cls.e8m0_to_fp32_half(e)
qs = qs.reshape((n_blocks, 1, cls.block_size // 2)) >> np.array([0, 4], dtype=np.uint8).reshape((1, 2, 1))
qs = (qs & np.uint8(0x0F)).view(np.int8)
kvalues = np.array(cls.kvalues, dtype=np.int8).reshape(1, 1, 16)
qs = np.take_along_axis(kvalues, qs, axis=-1).reshape((n_blocks, cls.block_size))
return (d * qs.astype(np.float32))
class IQ2_XXS(__Quant, qtype=GGMLQuantizationType.IQ2_XXS):
ksigns: bytes = (
b"\x00\x81\x82\x03\x84\x05\x06\x87\x88\x09\x0a\x8b\x0c\x8d\x8e\x0f"

View File

@@ -1119,7 +1119,8 @@ class TensorNameMap:
"model.vision_tower.embeddings.patch_embeddings.projection", # Intern-S1
"vpm.embeddings.patch_embedding",
"model.vision_model.embeddings.patch_embedding", # SmolVLM
"vision_tower.patch_conv", # pixtral
"vision_tower.patch_conv", # pixtral-hf
"vision_encoder.patch_conv", # pixtral
"vision_model.patch_embedding.linear", # llama 4
"visual.patch_embed.proj", # qwen2vl
),
@@ -1138,7 +1139,8 @@ class TensorNameMap:
"vpm.encoder.layers.{bid}.self_attn.q_proj",
"model.vision_model.encoder.layers.{bid}.self_attn.q_proj", # SmolVLM
"vision_model.model.layers.{bid}.self_attn.q_proj", # llama4
"vision_tower.transformer.layers.{bid}.attention.q_proj", # pixtral
"vision_tower.transformer.layers.{bid}.attention.q_proj", # pixtral-hf
"vision_encoder.transformer.layers.{bid}.attention.wq", # pixtral
"visual.blocks.{bid}.attn.q", # qwen2vl, generated
),
@@ -1153,7 +1155,8 @@ class TensorNameMap:
"vpm.encoder.layers.{bid}.self_attn.k_proj",
"model.vision_model.encoder.layers.{bid}.self_attn.k_proj", # SmolVLM
"vision_model.model.layers.{bid}.self_attn.k_proj", # llama4
"vision_tower.transformer.layers.{bid}.attention.k_proj", # pixtral
"vision_tower.transformer.layers.{bid}.attention.k_proj", # pixtral-hf
"vision_encoder.transformer.layers.{bid}.attention.wk", # pixtral
"visual.blocks.{bid}.attn.k", # qwen2vl, generated
),
@@ -1168,7 +1171,8 @@ class TensorNameMap:
"vpm.encoder.layers.{bid}.self_attn.v_proj",
"model.vision_model.encoder.layers.{bid}.self_attn.v_proj", # SmolVLM
"vision_model.model.layers.{bid}.self_attn.v_proj", # llama4
"vision_tower.transformer.layers.{bid}.attention.v_proj", # pixtral
"vision_tower.transformer.layers.{bid}.attention.v_proj", # pixtral-hf
"vision_encoder.transformer.layers.{bid}.attention.wv", # pixtral
"visual.blocks.{bid}.attn.v", # qwen2vl, generated
),
@@ -1178,7 +1182,8 @@ class TensorNameMap:
"model.vision_tower.encoder.layer.{bid}.layernorm_before", # Intern-S1
"vpm.encoder.layers.{bid}.layer_norm1",
"model.vision_model.encoder.layers.{bid}.layer_norm1", # SmolVLM
"vision_tower.transformer.layers.{bid}.attention_norm", # pixtral
"vision_tower.transformer.layers.{bid}.attention_norm", # pixtral-hf
"vision_encoder.transformer.layers.{bid}.attention_norm", # pixtral
"vision_model.model.layers.{bid}.input_layernorm", # llama4
"visual.blocks.{bid}.norm1", # qwen2vl
),
@@ -1190,7 +1195,8 @@ class TensorNameMap:
"vpm.encoder.layers.{bid}.self_attn.out_proj",
"model.vision_model.encoder.layers.{bid}.self_attn.out_proj", # SmolVLM
"vision_model.model.layers.{bid}.self_attn.o_proj", # llama4
"vision_tower.transformer.layers.{bid}.attention.o_proj", # pixtral
"vision_tower.transformer.layers.{bid}.attention.o_proj", # pixtral-hf
"vision_encoder.transformer.layers.{bid}.attention.wo", # pixtral
"visual.blocks.{bid}.attn.proj", # qwen2vl
),
@@ -1201,7 +1207,8 @@ class TensorNameMap:
"vpm.encoder.layers.{bid}.layer_norm2",
"model.vision_model.encoder.layers.{bid}.layer_norm2", # SmolVLM
"vision_model.model.layers.{bid}.post_attention_layernorm", # llama4
"vision_tower.transformer.layers.{bid}.ffn_norm", # pixtral
"vision_tower.transformer.layers.{bid}.ffn_norm", # pixtral-hf
"vision_encoder.transformer.layers.{bid}.ffn_norm", # pixtral
"visual.blocks.{bid}.norm2", # qwen2vl
),
@@ -1210,14 +1217,16 @@ class TensorNameMap:
"model.vision_tower.encoder.layer.{bid}.mlp.fc1", # Intern-S1
"vpm.encoder.layers.{bid}.mlp.fc1",
"model.vision_model.encoder.layers.{bid}.mlp.fc1", # SmolVLM, gemma3
"vision_tower.transformer.layers.{bid}.feed_forward.up_proj", # pixtral
"vision_tower.transformer.layers.{bid}.feed_forward.up_proj", # pixtral-hf
"vision_encoder.transformer.layers.{bid}.feed_forward.w3", # pixtral
"vision_model.model.layers.{bid}.mlp.fc1", # llama4
"visual.blocks.{bid}.mlp.fc1", # qwen2vl
"visual.blocks.{bid}.mlp.up_proj", # qwen2.5vl
),
MODEL_TENSOR.V_ENC_FFN_GATE: (
"vision_tower.transformer.layers.{bid}.feed_forward.gate_proj", # pixtral
"vision_tower.transformer.layers.{bid}.feed_forward.gate_proj", # pixtral-hf
"vision_encoder.transformer.layers.{bid}.feed_forward.w1", # pixtral
"visual.blocks.{bid}.mlp.gate_proj", # qwen2.5vl
),
@@ -1226,7 +1235,8 @@ class TensorNameMap:
"model.vision_tower.encoder.layer.{bid}.mlp.fc2", # Intern-S1
"vpm.encoder.layers.{bid}.mlp.fc2",
"model.vision_model.encoder.layers.{bid}.mlp.fc2", # SmolVLM, gemma3
"vision_tower.transformer.layers.{bid}.feed_forward.down_proj", # pixtral
"vision_tower.transformer.layers.{bid}.feed_forward.down_proj", # pixtral-hf
"vision_encoder.transformer.layers.{bid}.feed_forward.w2", # pixtral
"vision_model.model.layers.{bid}.mlp.fc2", # llama4
"visual.blocks.{bid}.mlp.fc2", # qwen2vl
"visual.blocks.{bid}.mlp.down_proj", # qwen2.5vl
@@ -1244,7 +1254,8 @@ class TensorNameMap:
MODEL_TENSOR.V_PRE_NORM: (
"vision_tower.vision_model.pre_layrnorm",
"vision_tower.ln_pre", # pixtral
"vision_tower.ln_pre", # pixtral-hf
"vision_encoder.ln_pre", # pixtral
"vision_model.layernorm_pre", # llama4
),
@@ -1261,6 +1272,7 @@ class TensorNameMap:
MODEL_TENSOR.V_MM_INP_NORM: (
"multi_modal_projector.norm",
"pre_mm_projector_norm",
),
MODEL_TENSOR.V_MM_SOFT_EMB_NORM: (
@@ -1316,7 +1328,8 @@ class TensorNameMap:
),
MODEL_TENSOR.V_MM_PATCH_MERGER: (
"multi_modal_projector.patch_merger.merging_layer", # mistral small 3.1
"multi_modal_projector.patch_merger.merging_layer", # mistral small 3.1 - hf
"patch_merger.merging_layer", # mistral
),
# audio (mtmd)

View File

@@ -145,7 +145,11 @@ class SafetensorRemote:
tensors[key] = val
return tensors
raise ValueError(f"Model {model_id} does not have any safetensor files")
raise ValueError(
f"No safetensor file has been found for model {model_id}."
"If the repo has safetensor files, make sure the model is public or you have a "
"valid Hugging Face token set in the environment variable HF_TOKEN."
)
@classmethod
def get_list_tensors(cls, url: str) -> dict[str, RemoteTensor]:

View File

@@ -67,6 +67,7 @@ class GGMLQuants:
"q4_0", "q4_1", "q5_0", "q5_1", "q8_0",
"q2_K", "q3_K", "q4_K", "q5_K", "q6_K",
"tq1_0", "tq2_0",
"mxfp4",
"iq2_xxs", "iq2_xs", "iq2_s", "iq3_xxs", "iq3_s", "iq1_s", "iq1_m",
"iq4_nl", "iq4_xs",
):
@@ -140,14 +141,21 @@ def compare_tensors(t1: np.ndarray, t2: np.ndarray, qtype: GGMLQuantizationType)
return False
def do_test(libggml_path: Path, quick: bool = False):
def do_test(libggml_path: Path, quick: bool = False, user_type: GGMLQuantizationType | None = None):
ggml_quants = GGMLQuants(libggml_path)
np.set_printoptions(precision=None, threshold=(4 * 256) + 1, formatter={"int": lambda n: "0x%02X" % n})
r = np.random.randn(8, 1024, 1024).astype(np.float32, copy=False)
# test zero blocks
r[0, 0, :] = 0
## Maybe test infinities? (can make NANs, not really useful in practice)
# r[0, 1, 0] = np.inf
# r[0, 2, 0] = -np.inf
# r[0, 3, 0] = np.inf
# r[0, 3, 1] = -np.inf
for qtype in (GGMLQuantizationType.F16, *gguf.quants._type_traits.keys()):
for qtype in ((GGMLQuantizationType.F16, *gguf.quants._type_traits.keys()) if user_type is None else (user_type,)):
has_dequantize = False
has_quantize = False
@@ -228,11 +236,12 @@ def do_test(libggml_path: Path, quick: bool = False):
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Test Python (de)quantization against the reference C implementation")
parser.add_argument("--libggml", type=Path, default=Path(__file__).parent.parent.parent / "build" / "ggml" / "src" / "libggml.so", help="The path to libggml.so")
parser.add_argument("--libggml", type=Path, default=Path(__file__).parent.parent.parent / "build" / "bin" / "libggml.so", help="The path to libggml.so")
parser.add_argument("--quick", action="store_true", help="Don't quantize with C when it's not strictly necessary")
parser.add_argument("--type", type=str, help="The quant type to test (all by default)")
args = parser.parse_args()
logging.basicConfig(level=logging.DEBUG)
do_test(args.libggml, args.quick)
do_test(args.libggml, args.quick, GGMLQuantizationType[args.type.upper()] if args.type is not None else None)

View File

@@ -4,6 +4,7 @@ import argparse
import json
import os
import random
import sqlite3
import subprocess
from time import sleep, time
from typing import Optional, Union
@@ -47,6 +48,8 @@ def get_prompts_rng(prompt_lengths: list[int]) -> list[list[int]]:
def get_server(path_server: str, path_log: Optional[str]) -> dict:
if path_server.startswith("http://") or path_server.startswith("https://"):
return {"process": None, "address": path_server, "fout": None}
if os.environ.get("LLAMA_ARG_HOST") is None:
logger.info("LLAMA_ARG_HOST not explicitly set, using 127.0.0.1")
os.environ["LLAMA_ARG_HOST"] = "127.0.0.1"
@@ -89,15 +92,13 @@ def get_prompt_length(data: dict) -> int:
f"{server_address}/apply-template",
json={"messages": [{"role": "user", "content": data["prompt"], "stream": True}]}
)
if response.status_code != 200:
raise RuntimeError(f"Server returned status code {response.status_code}: {response.text}")
response.raise_for_status()
prompt: str = json.loads(response.text)["prompt"]
response = session.post(
f"{server_address}/tokenize",
json={"content": prompt, "add_special": True}
)
if response.status_code != 200:
raise RuntimeError(f"Server returned status code {response.status_code}: {response.text}")
response.raise_for_status()
tokens: list[str] = json.loads(response.text)["tokens"]
return len(tokens)
@@ -107,7 +108,12 @@ def send_prompt(data: dict) -> tuple[float, list[float]]:
server_address: str = data["server_address"]
t_submit = time()
if data["synthetic_prompt"]:
if data["external_server"]:
json_data: dict = {
"prompt": data["prompt"], "ignore_eos": True,
"seed": data["seed"], "max_tokens": data["n_predict"], "stream": True}
response = session.post(f"{server_address}/v1/completions", json=json_data, stream=True)
elif data["synthetic_prompt"]:
json_data: dict = {
"prompt": data["prompt"], "ignore_eos": True, "cache_prompt": False,
"seed": data["seed"], "n_predict": data["n_predict"], "stream": True}
@@ -117,34 +123,38 @@ def send_prompt(data: dict) -> tuple[float, list[float]]:
f"{server_address}/apply-template",
json={"messages": [{"role": "user", "content": data["prompt"], "stream": True}]}
)
if response.status_code != 200:
raise RuntimeError(f"Server returned status code {response.status_code}: {response.text}")
response.raise_for_status()
prompt: str = json.loads(response.text)["prompt"]
json_data: dict = {"prompt": prompt, "seed": data["seed"], "n_predict": data["n_predict"], "stream": True}
response = session.post(f"{server_address}/completion", json=json_data, stream=True)
response.raise_for_status()
lines = []
token_arrival_times: list[float] = []
for line in response.iter_lines(decode_unicode=False):
if not line.startswith(b"data: "):
continue
lines.append(line)
token_arrival_times.append(time())
token_arrival_times = token_arrival_times[:-1]
if response.status_code != 200:
raise RuntimeError(f"Server returned status code {response.status_code}: {response.text}")
if len(lines) > 1 and "timings" in json.loads(lines[-2][6:]):
token_arrival_times = token_arrival_times[:-1]
return (t_submit, token_arrival_times)
def benchmark(path_server: str, path_log: Optional[str], prompt_source: str, n_prompts: int, n_predict: int, n_predict_min: int, seed_offset: int):
def benchmark(
path_server: str, path_log: Optional[str], path_db: Optional[str], name: Optional[str], prompt_source: str, n_prompts: int,
n_predict: int, n_predict_min: int, seed_offset: int):
external_server: bool = path_server.startswith("http://") or path_server.startswith("https://")
if os.environ.get("LLAMA_ARG_N_PARALLEL") is None:
logger.info("LLAMA_ARG_N_PARALLEL not explicitly set, using 32")
os.environ["LLAMA_ARG_N_PARALLEL"] = "32"
if os.environ.get("LLAMA_ARG_N_GPU_LAYERS") is None:
if not external_server and os.environ.get("LLAMA_ARG_N_GPU_LAYERS") is None:
logger.info("LLAMA_ARG_N_GPU_LAYERS not explicitly set, using 999")
os.environ["LLAMA_ARG_N_GPU_LAYERS"] = "999"
if os.environ.get("LLAMA_ARG_FLASH_ATTN") is None:
if not external_server and os.environ.get("LLAMA_ARG_FLASH_ATTN") is None:
logger.info("LLAMA_ARG_FLASH_ATTN not explicitly set, using 'true'")
os.environ["LLAMA_ARG_FLASH_ATTN"] = "true"
@@ -165,7 +175,7 @@ def benchmark(path_server: str, path_log: Optional[str], prompt_source: str, n_p
else:
n_predict_min = n_predict
if os.environ.get("LLAMA_ARG_CTX_SIZE") is None:
if not external_server and os.environ.get("LLAMA_ARG_CTX_SIZE") is None:
context_per_slot: int = int(1.05 * (n_predict + (np.max(prompt_n) if synthetic_prompts else 2048)))
context_total: int = context_per_slot * parallel
os.environ["LLAMA_ARG_CTX_SIZE"] = str(context_total)
@@ -176,6 +186,7 @@ def benchmark(path_server: str, path_log: Optional[str], prompt_source: str, n_p
try:
server = get_server(path_server, path_log)
server_address: str = server["address"]
assert external_server == (server["process"] is None)
adapter = requests.adapters.HTTPAdapter(pool_connections=parallel, pool_maxsize=parallel) # type: ignore
session = requests.Session()
@@ -188,8 +199,9 @@ def benchmark(path_server: str, path_log: Optional[str], prompt_source: str, n_p
if seed_offset >= 0:
random.seed(3 * (seed_offset + 1000 * i) + 1)
data.append({
"session": session, "server_address": server_address, "prompt": p, "synthetic_prompt": synthetic_prompts,
"n_predict": random.randint(n_predict_min, n_predict), "seed": (3 * (seed_offset + 1000 * i) + 2) if seed_offset >= 0 else -1})
"session": session, "server_address": server_address, "external_server": external_server, "prompt": p,
"synthetic_prompt": synthetic_prompts, "n_predict": random.randint(n_predict_min, n_predict),
"seed": (3 * (seed_offset + 1000 * i) + 2) if seed_offset >= 0 else -1})
if not synthetic_prompts:
logger.info("Getting the prompt lengths...")
@@ -199,7 +211,7 @@ def benchmark(path_server: str, path_log: Optional[str], prompt_source: str, n_p
t0 = time()
results: list[tuple[float, list[float]]] = thread_map(send_prompt, data, max_workers=parallel, chunksize=1)
finally:
if server is not None:
if server is not None and server["process"] is not None:
server["process"].terminate()
server["process"].wait()
if session is not None:
@@ -233,15 +245,24 @@ def benchmark(path_server: str, path_log: Optional[str], prompt_source: str, n_p
logger.info(f"Average generation depth: {depth_sum / token_t.shape[0]:.2f} tokens")
logger.info(f"Average total generation speed: {token_t.shape[0] / token_t_last:.2f} tokens/s")
logger.info(f"Average generation speed per slot: {token_t.shape[0] / (parallel * token_t_last):.2f} tokens/s / slot")
logger.info("")
logger.info(
"The above numbers are the speeds as observed by the Python script and may differ from the performance reported by the server, "
"particularly when the server is fast vs. the network or Python script (e.g. when serving a very small model).")
if path_db is not None:
con = sqlite3.connect(path_db)
cursor = con.cursor()
cursor.execute(
"CREATE TABLE IF NOT EXISTS server_bench"
"(name TEXT, n_parallel INTEGER, prompt_source TEXT, n_prompts INTEGER, "
"n_predict INTEGER, n_predict_min INTEGER, seed_offset INTEGER, runtime REAL);")
cursor.execute(
"INSERT INTO server_bench VALUES (?, ?, ?, ?, ?, ?, ?, ?);",
[name, parallel, prompt_source, n_prompts, n_predict, n_predict_min, seed_offset, token_t_last])
con.commit()
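A hypothetical read-back of what the block above stores (the file name is an assumption; the table and columns are the ones created here):

import sqlite3

con = sqlite3.connect("server-bench.sqlite")
rows = con.execute(
    "SELECT name, n_parallel, n_prompts, n_predict, runtime FROM server_bench ORDER BY runtime;")
for name, n_parallel, n_prompts, n_predict, runtime in rows:
    print(f"{name}: {n_prompts} prompts, {n_predict} max tokens, {n_parallel} slots, {runtime:.2f} s")
con.close()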
plt.figure()
plt.scatter(prompt_n, 1e3 * prompt_t, s=10.0, marker=".", alpha=0.25)
plt.xlim(0, 1.05e0 * np.max(prompt_n))
plt.ylim(0, 1.05e3 * np.max(prompt_t))
plt.title(name or "")
plt.xlabel("Prompt length [tokens]")
plt.ylabel("Time to first token [ms]")
plt.savefig("prompt_time.png", dpi=240)
@@ -250,6 +271,7 @@ def benchmark(path_server: str, path_log: Optional[str], prompt_source: str, n_p
plt.figure()
plt.hist(token_t, np.arange(0, bin_max))
plt.xlim(0, bin_max + 1)
plt.title(name or "")
plt.xlabel("Time [s]")
plt.ylabel("Num. tokens generated per second")
plt.savefig("gen_rate.png", dpi=240)
@@ -259,9 +281,13 @@ if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Tool for benchmarking the throughput of the llama.cpp HTTP server. "
"Results are printed to console and visualized as plots (saved to current working directory). "
"To pass arguments such as the model path to the server, set the corresponding environment variables (see llama-server --help).")
"To pass arguments such as the model path to the server, set the corresponding environment variables (see llama-server --help). "
"The reported numbers are the speeds as observed by the Python script and may differ from the performance reported by the server, "
"particularly when the server is fast vs. the network or Python script (e.g. when serving a very small model).")
parser.add_argument("--path_server", type=str, default="llama-server", help="Path to the llama.cpp server binary")
parser.add_argument("--path_log", type=str, default="server-bench-{port}.log", help="Path to the model to use for the benchmark")
parser.add_argument("--path_db", type=str, default=None, help="Path to an sqlite database to store the benchmark results in")
parser.add_argument("--name", type=str, default=None, help="Name to label plots and database entries with")
parser.add_argument(
"--prompt_source", type=str, default="rng-1024-2048",
help="How to get the prompts for the benchmark, either 'mmlu' for MMLU questions or "

View File

@@ -223,12 +223,7 @@ void llama_kv_cache_unified::clear(bool data) {
}
bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size());
auto & cells = v_cells[seq_to_stream[seq_id]];
auto & head = v_heads[seq_to_stream[seq_id]];
uint32_t new_head = cells.size();
GGML_ASSERT(seq_id == -1 || (seq_id >= 0 && (size_t) seq_id < seq_to_stream.size()));
if (p0 < 0) {
p0 = 0;
@@ -239,6 +234,11 @@ bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
}
if (seq_id >= 0) {
auto & cells = v_cells[seq_to_stream[seq_id]];
auto & head = v_heads[seq_to_stream[seq_id]];
uint32_t new_head = cells.size();
for (uint32_t i = 0; i < cells.size(); ++i) {
if (!cells.pos_in(i, p0, p1)) {
continue;
@@ -250,26 +250,38 @@ bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
}
}
}
// If we freed up a slot, set head to it so searching can start there.
if (new_head != cells.size() && new_head < head) {
head = new_head;
}
} else {
// match any sequence
for (uint32_t i = 0; i < cells.size(); ++i) {
if (!cells.pos_in(i, p0, p1)) {
continue;
for (uint32_t s = 0; s < n_stream; ++s) {
auto & cells = v_cells[s];
auto & head = v_heads[s];
uint32_t new_head = cells.size();
for (uint32_t i = 0; i < cells.size(); ++i) {
if (!cells.pos_in(i, p0, p1)) {
continue;
}
cells.rm(i);
if (new_head == cells.size()) {
new_head = i;
}
}
cells.rm(i);
if (new_head == cells.size()) {
new_head = i;
// If we freed up a slot, set head to it so searching can start there.
if (new_head != cells.size() && new_head < head) {
head = new_head;
}
}
}
// If we freed up a slot, set head to it so searching can start there.
if (new_head != cells.size() && new_head < head) {
head = new_head;
}
return true;
}
@@ -738,66 +750,70 @@ bool llama_kv_cache_unified::update(llama_context * lctx, bool do_shift, const d
}
llama_kv_cache_unified::slot_info llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch, bool cont) const {
if (debug > 0) {
const auto & cells = v_cells[seq_to_stream[1]];
for (uint32_t s = 0; s < ubatch.n_seqs_unq; ++s) {
const auto seq_id = ubatch.seq_id_unq[s];
const auto stream_id = seq_to_stream[seq_id];
const auto & cells = v_cells[stream_id];
const uint32_t head_cur = v_heads[stream_id];
const uint32_t head_cur = v_heads[1];
LLAMA_LOG_DEBUG("%s: stream[%d], n = %5d, used = %5d, head = %5d, size = %5d, n_swa = %5d\n",
__func__, stream_id, cells.used_max_p1(), cells.get_used(), head_cur, get_size(), n_swa);
LLAMA_LOG_DEBUG("%s: n = %5d, used = %5d, head = %5d, size = %5d, n_swa = %5d\n",
__func__, cells.used_max_p1(), cells.get_used(), head_cur, get_size(), n_swa);
if ((debug == 2 && n_swa > 0) || debug > 2) {
std::string ss;
for (uint32_t i = 0; i < cells.size(); ++i) {
if (cells.is_empty(i)) {
ss += '.';
} else {
assert(cells.seq_count(i) >= 1);
if (cells.seq_count(i) == 1) {
ss += std::to_string(cells.seq_get(i));
if ((debug == 2 && n_swa > 0) || debug > 2) {
std::string ss;
for (uint32_t i = 0; i < cells.size(); ++i) {
if (cells.is_empty(i)) {
ss += '.';
} else {
ss += 'M';
assert(cells.seq_count(i) >= 1);
if (cells.seq_count(i) == 1) {
ss += std::to_string(cells.seq_get(i));
} else {
ss += 'M';
}
}
if (i%256 == 255) {
ss += " *";
ss += '\n';
}
}
if (i%256 == 255) {
ss += " *";
ss += '\n';
}
}
LLAMA_LOG_DEBUG("\n%s\n", ss.c_str());
}
if ((debug == 2 && n_swa > 0) || debug > 2) {
std::string ss;
for (uint32_t i = 0; i < cells.size(); ++i) {
std::string cur;
if (cells.is_empty(i)) {
cur = '.';
} else {
cur = std::to_string(cells.pos_get(i));
}
const int n = cur.size();
for (int j = 0; j < 5 - n; ++j) {
cur += ' ';
}
ss += cur;
if (i%256 == 255) {
ss += " *";
}
if (i%64 == 63) {
ss += '\n';
}
}
LLAMA_LOG_DEBUG("\n%s\n", ss.c_str());
}
for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
if (cells.seq_pos_min(s) < 0) {
continue;
LLAMA_LOG_DEBUG("\n%s\n", ss.c_str());
}
LLAMA_LOG_DEBUG("%s: min[%d] = %5d, max[%d] = %5d\n", __func__, s, cells.seq_pos_min(s), s, cells.seq_pos_max(s));
if ((debug == 2 && n_swa > 0) || debug > 2) {
std::string ss;
for (uint32_t i = 0; i < cells.size(); ++i) {
std::string cur;
if (cells.is_empty(i)) {
cur = '.';
} else {
cur = std::to_string(cells.pos_get(i));
}
const int n = cur.size();
for (int j = 0; j < 5 - n; ++j) {
cur += ' ';
}
ss += cur;
if (i%256 == 255) {
ss += " *";
}
if (i%64 == 63) {
ss += '\n';
}
}
LLAMA_LOG_DEBUG("\n%s\n", ss.c_str());
}
for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
if (cells.seq_pos_min(s) < 0) {
continue;
}
LLAMA_LOG_DEBUG("%s: stream[%d] min[%d] = %5d, max[%d] = %5d\n", __func__, stream_id, s, cells.seq_pos_min(s), s, cells.seq_pos_max(s));
}
}
}

View File

@@ -999,7 +999,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
new_size += llama_tensor_quantize_impl(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);
// TODO: temporary sanity check that the F16 -> MXFP4 is lossless
#if 1
#if 0
if (new_type == GGML_TYPE_MXFP4) {
auto * x = f32_data_03;

View File

@@ -44,6 +44,7 @@
#define KEY_WIN_ATTN_PATTERN "clip.vision.n_wa_pattern"
#define KEY_ATTN_WINDOW_SIZE "clip.vision.window_size"
#define KEY_MINICPMV_VERSION "clip.minicpmv_version"
#define KEY_MINICPMV_QUERY_NUM "clip.minicpmv_query_num"
// audio-specific
#define KEY_A_NUM_MEL_BINS "clip.audio.num_mel_bins"

View File

@@ -201,6 +201,7 @@ struct clip_hparams {
// legacy
bool has_llava_projector = false;
int minicpmv_version = 0;
int32_t minicpmv_query_num = 0; // MiniCPM-V query number
};
struct clip_layer {
@@ -866,21 +867,8 @@ struct clip_graph {
int n_embd = clip_n_mmproj_embd(ctx);
const int d_head = 128;
int n_head = n_embd/d_head;
int num_query = 96;
if (ctx->model.hparams.minicpmv_version == 2) {
// MiniCPM-V 2.5
num_query = 96;
} else if (ctx->model.hparams.minicpmv_version == 3) {
// MiniCPM-V 2.6
num_query = 64;
} else if (ctx->model.hparams.minicpmv_version == 4) {
// MiniCPM-o 2.6
num_query = 64;
} else if (ctx->model.hparams.minicpmv_version == 5) {
// MiniCPM-V 4.0
num_query = 64;
}
// Use the query number from the model config (the loader falls back to hardcoded values for legacy models)
int num_query = ctx->model.hparams.minicpmv_query_num;
ggml_tensor * Q = ggml_add(ctx0,
ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q),
model.mm_model_attn_q_b);
@@ -2138,7 +2126,19 @@ struct clip_model_loader {
get_u32(KEY_PATCH_SIZE, hparams.patch_size);
get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false);
get_i32(KEY_MINICPMV_VERSION, hparams.minicpmv_version, false); // legacy
get_u32(KEY_MINICPMV_QUERY_NUM, hparams.minicpmv_query_num, false);
if (hparams.minicpmv_query_num == 0) {
// Fallback to hardcoded values for legacy models
if (hparams.minicpmv_version == 3) {
hparams.minicpmv_query_num = 64;
} else if (hparams.minicpmv_version == 4) {
hparams.minicpmv_query_num = 64;
} else if (hparams.minicpmv_version == 5) {
hparams.minicpmv_query_num = 64;
} else {
hparams.minicpmv_query_num = 96;
}
}
} else if (is_audio) {
get_u32(KEY_A_NUM_MEL_BINS, hparams.n_mel_bins);
@@ -3556,20 +3556,23 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
} break;
case PROJECTOR_TYPE_MINICPMV:
{
if (params.minicpmv_version == 2) {
// MiniCPM-V 2.5
n_patches_sq = 96;
} else if (params.minicpmv_version == 3) {
// MiniCPM-V 2.6
n_patches_sq = 64;
} else if (params.minicpmv_version == 4) {
// MiniCPM-o 2.6
n_patches_sq = 64;
} else if (params.minicpmv_version == 5) {
// MiniCPM-V 4.0
n_patches_sq = 64;
// Use actual config value if available, otherwise fall back to hardcoded values
if (params.minicpmv_query_num > 0) {
n_patches_sq = params.minicpmv_query_num;
} else {
GGML_ABORT("Unknown minicpmv version");
// Fallback to hardcoded values for legacy models
if (params.minicpmv_version == 2) {
n_patches_sq = 96;
} else if (params.minicpmv_version == 3) {
n_patches_sq = 64;
} else if (params.minicpmv_version == 4) {
n_patches_sq = 64;
} else if (params.minicpmv_version == 5) {
// MiniCPM-V 4.0
n_patches_sq = 64;
} else {
GGML_ABORT("Unknown minicpmv version");
}
}
} break;
case PROJECTOR_TYPE_QWEN2VL:
@@ -4102,7 +4105,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
}
int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
const auto & hparams = ctx->model.hparams;
switch (ctx->model.proj_type) {
case PROJECTOR_TYPE_LDP:
return ctx->model.mm_model_block_1_block_2_1_b->ne[0];
@@ -4114,20 +4116,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
case PROJECTOR_TYPE_MLP_NORM:
return ctx->model.mm_3_b->ne[0];
case PROJECTOR_TYPE_MINICPMV:
if (hparams.minicpmv_version == 2) {
// MiniCPM-V 2.5
return 4096;
} else if (hparams.minicpmv_version == 3) {
// MiniCPM-V 2.6
return 3584;
} else if (hparams.minicpmv_version == 4) {
// MiniCPM-o 2.6
return 3584;
} else if (hparams.minicpmv_version == 5) {
// MiniCPM-V 4.0
return 2560;
}
GGML_ABORT("Unknown minicpmv version");
return ctx->model.mm_model_proj->ne[0];
case PROJECTOR_TYPE_GLM_EDGE:
return ctx->model.mm_model_mlp_3_w->ne[1];
case PROJECTOR_TYPE_QWEN2VL:

View File

@@ -517,6 +517,16 @@ if args.use_f32:
# output in the same directory as the model if output_dir is None
dir_model = args.model_dir
# Read config.json to get actual model configuration
config_path = os.path.join(dir_model, "config.json")
model_config = {}
if os.path.isfile(config_path):
with open(config_path, "r", encoding="utf-8") as f:
model_config = json.load(f)
print(f"Loaded config from {config_path}")
else:
print(f"Warning: config.json not found at {config_path}")
# If minicpmv_projector is not specified but the default path exists, use the default path
if args.minicpmv_projector is None:
default_projector_path = os.path.join(dir_model, "minicpmv.projector")
@@ -555,37 +565,62 @@ if args.use_f32:
# processor = CLIPProcessor.from_pretrained(dir_model)
minicpmv_version = args.minicpmv_version
emb_dim = 4096
block_count = 26
if minicpmv_version == 1: # MiniCPM-V 2.0
emb_dim = 2304
block_count = 26
elif minicpmv_version == 2: # MiniCPM-V 2.5
emb_dim = 4096
block_count = 27
elif minicpmv_version == 3: # MiniCPM-V 2.6
emb_dim = 3584
block_count = 27
elif minicpmv_version == 4: # MiniCPM-o 2.6
emb_dim = 3584
block_count = 27
elif minicpmv_version == 5: # MiniCPM-V 4.0
emb_dim = 2560
block_count = 27
default_vision_config = {
"hidden_size": 1152,
"image_size": 980,
"intermediate_size": 4304,
"model_type": "idefics2",
"num_attention_heads": 16,
"num_hidden_layers": 27,
"patch_size": 14,
# Use actual config values instead of hardcoded ones
if model_config:
# For the projector/resampler, use the main model's hidden_size
emb_dim = model_config.get("hidden_size", 1536)
# For the vision model, use vision_config values
vision_config_dict = model_config.get("vision_config", {})
default_vision_config = {
"hidden_size": vision_config_dict.get("hidden_size", 1152),
"image_size": vision_config_dict.get("image_size", 980),
"intermediate_size": vision_config_dict.get("intermediate_size", 4304),
"model_type": vision_config_dict.get("model_type", "siglip"),
"num_attention_heads": vision_config_dict.get("num_attention_heads", 16),
"num_hidden_layers": vision_config_dict.get("num_hidden_layers", 27),
"patch_size": vision_config_dict.get("patch_size", 14),
}
# Use vision model's num_hidden_layers for block_count
block_count = vision_config_dict.get("num_hidden_layers", 27)
print(f"Using config values: emb_dim={emb_dim}, block_count={block_count}")
print(f"Vision config: {default_vision_config}")
else:
# Fallback to original hardcoded logic if config.json not found
emb_dim = 4096
block_count = 26
if minicpmv_version == 1:
emb_dim = 2304
block_count = 26
elif minicpmv_version == 2:
emb_dim = 4096
block_count = 27
elif minicpmv_version == 3:
emb_dim = 3584
block_count = 27
elif minicpmv_version == 4:
emb_dim = 3584
block_count = 27
elif minicpmv_version == 5:
emb_dim = 2560
block_count = 27
default_vision_config = {
"hidden_size": 1152,
"image_size": 980,
"intermediate_size": 4304,
"model_type": "idefics2",
"num_attention_heads": 16,
"num_hidden_layers": 27,
"patch_size": 14,
}
vision_config = Idefics2VisionConfig(**default_vision_config)
model = Idefics2VisionTransformer(vision_config)
if minicpmv_version == 3:
if minicpmv_version == 3 or (model_config and model_config.get("vision_config", {}).get("model_type") == "siglip"):
vision_config = SiglipVisionConfig(**default_vision_config)
model = SiglipVisionTransformer(vision_config)
elif minicpmv_version == 4:
@@ -644,16 +679,27 @@ else:
fout.add_description("two-tower CLIP model")
if has_vision_encoder:
# vision_model hparams
fout.add_uint32("clip.vision.image_size", 448)
fout.add_uint32("clip.vision.patch_size", 14)
fout.add_uint32(add_key_str(KEY_EMBEDDING_LENGTH, VISION), 1152)
fout.add_uint32(add_key_str(KEY_FEED_FORWARD_LENGTH, VISION), 4304)
# vision_model hparams - use actual config values
vision_image_size = model_config.get("image_size", 448) if model_config else 448
vision_patch_size = default_vision_config.get("patch_size", 14)
vision_hidden_size = default_vision_config.get("hidden_size", 1152)
vision_intermediate_size = default_vision_config.get("intermediate_size", 4304)
vision_attention_heads = default_vision_config.get("num_attention_heads", 16)
fout.add_uint32("clip.vision.image_size", vision_image_size)
fout.add_uint32("clip.vision.patch_size", vision_patch_size)
fout.add_uint32(add_key_str(KEY_EMBEDDING_LENGTH, VISION), vision_hidden_size)
fout.add_uint32(add_key_str(KEY_FEED_FORWARD_LENGTH, VISION), vision_intermediate_size)
fout.add_uint32("clip.vision.projection_dim", 0)
fout.add_uint32(add_key_str(KEY_ATTENTION_HEAD_COUNT, VISION), 16)
fout.add_uint32(add_key_str(KEY_ATTENTION_HEAD_COUNT, VISION), vision_attention_heads)
fout.add_float32(add_key_str(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6)
fout.add_uint32(add_key_str(KEY_BLOCK_COUNT, VISION), block_count)
# Add MiniCPM-V specific parameters
query_num = model_config.get("query_num", 0) if model_config else 0
resampler_emb_dim = model_config.get("hidden_size", 0) if model_config else 0
fout.add_uint32("clip.minicpmv_query_num", query_num)
if processor is not None:
image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean
image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std
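A minimal sketch (not part of the diff) of the config-driven lookup this converter change introduces, assuming a model directory with a standard Hugging Face config.json; the helper name is hypothetical and the fallback defaults mirror the values shown above:

import json, os

def load_minicpmv_hparams(model_dir):
    # Read config.json if present; a None return tells the caller to use the hardcoded fallback.
    config_path = os.path.join(model_dir, "config.json")
    if not os.path.exists(config_path):
        return None
    with open(config_path, "r", encoding="utf-8") as f:
        model_config = json.load(f)
    vision = model_config.get("vision_config", {})
    return {
        "emb_dim": model_config.get("hidden_size", 1536),        # resampler/projector width
        "block_count": vision.get("num_hidden_layers", 27),      # vision transformer depth
        "image_size": vision.get("image_size", 980),
        "patch_size": vision.get("patch_size", 14),
        "hidden_size": vision.get("hidden_size", 1152),
        "intermediate_size": vision.get("intermediate_size", 4304),
        "num_attention_heads": vision.get("num_attention_heads", 16),
        "query_num": model_config.get("query_num", 0),
    }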


@@ -16,6 +16,8 @@ mm_tensors = [k for k, v in checkpoint.items() if k.startswith("resampler")]
# store these tensors in a new dictionary and torch.save them
projector = {name: checkpoint[name].float() for name in mm_tensors}
if 'resampler.proj' in projector.keys() and hasattr(model.llm.config, 'scale_emb'):
    projector['resampler.proj'] = projector['resampler.proj'] / model.llm.config.scale_emb
torch.save(projector, f"{args.model}/minicpmv.projector")
clip_tensors = [k for k, v in checkpoint.items() if k.startswith("vpm")]


@@ -525,7 +525,7 @@ static results_perplexity perplexity(llama_context * ctx, const common_params &
}
// We get the logits for all the tokens in the context window (params.n_ctx)
// from llama_eval above. Now, based on https://huggingface.co/docs/transformers/perplexity,
// from llama_decode below. Now, based on https://huggingface.co/docs/transformers/perplexity,
// calculate the perplexity over the last half of the window (so the model always has
// some context to predict the token).
//
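A sketch (not from the source) of the calculation this comment describes, assuming per-token log-probabilities for each n_ctx-sized chunk are already available; only the second half of each window is scored, and the full run averages the negative log-likelihood across chunks before exponentiating:

import math

def chunk_nll(token_logprobs, n_ctx):
    # token_logprobs[i] = log p(token[i] | tokens[:i]) for one chunk of n_ctx tokens.
    # Score only the second half of the window, so every scored token has at
    # least n_ctx/2 tokens of preceding context.
    start = n_ctx // 2
    scored = token_logprobs[start:n_ctx]
    return -sum(scored) / len(scored)

def perplexity(all_chunk_logprobs, n_ctx):
    # Average the NLL over all chunks, then exponentiate.
    nlls = [chunk_nll(chunk, n_ctx) for chunk in all_chunk_logprobs]
    return math.exp(sum(nlls) / len(nlls))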
@@ -559,7 +559,7 @@ static results_perplexity perplexity(llama_context * ctx, const common_params &
for (int seq = 0; seq < n_seq_batch; seq++) {
int seq_start = batch_start + seq*n_ctx;
// save original token and restore it after eval
// save original token and restore it after decode
const auto token_org = tokens[seq_start];
// add BOS token for the first batch of each chunk
@@ -584,7 +584,7 @@ static results_perplexity perplexity(llama_context * ctx, const common_params &
}
if (llama_decode(ctx, batch)) {
LOG_INF("%s : failed to eval\n", __func__);
LOG_INF("%s : failed to decode\n", __func__);
return {tokens, -1, logit_history, prob_history};
}


@@ -1132,6 +1132,12 @@ The `response_format` parameter supports both plain JSON output (e.g. `{"type":
`chat_template_kwargs`: Allows sending additional parameters to the json templating system. For example: `{"enable_thinking": false}`
`reasoning_format`: The reasoning format to be parsed. If set to `none`, it will output the raw generated text.
`thinking_forced_open`: Force a reasoning model to always output the reasoning. Only works on certain models.
`parse_tool_calls`: Whether to parse the generated tool call.
*Examples:*
You can use the Python `openai` library with appropriate checkpoints:
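For example, a minimal sketch assuming a llama-server instance on the default local port; the model name and API key are placeholders, and the per-request fields ride along via extra_body:

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8080/v1", api_key="sk-no-key-required")

response = client.chat.completions.create(
    model="local-model",  # llama-server serves whatever model it was started with
    messages=[{"role": "user", "content": "Why is the sky blue?"}],
    extra_body={
        "reasoning_format": "none",                      # per-request override
        "chat_template_kwargs": {"enable_thinking": False},
    },
)
print(response.choices[0].message.content)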



@@ -383,8 +383,12 @@ struct server_task {
} else {
params.oaicompat_chat_syntax.format = defaults.oaicompat_chat_syntax.format;
}
params.oaicompat_chat_syntax.reasoning_format = params_base.reasoning_format;
params.oaicompat_chat_syntax.reasoning_in_content = params.stream && (params_base.reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY);
common_reasoning_format reasoning_format = params_base.reasoning_format;
if (data.contains("reasoning_format")) {
reasoning_format = common_reasoning_format_from_name(data.at("reasoning_format").get<std::string>());
}
params.oaicompat_chat_syntax.reasoning_format = reasoning_format;
params.oaicompat_chat_syntax.reasoning_in_content = params.stream && (reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY);
params.oaicompat_chat_syntax.thinking_forced_open = json_value(data, "thinking_forced_open", false);
params.oaicompat_chat_syntax.parse_tool_calls = json_value(data, "parse_tool_calls", false);
}
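In effect the handler now resolves the reasoning format per request, falling back to the server-wide setting; a rough Python paraphrase (the function name is illustrative, not part of the code above):

def effective_reasoning_format(server_default, request_body):
    # A "reasoning_format" field in the request body overrides the value the
    # server was started with; otherwise the server-wide default applies.
    return request_body.get("reasoning_format", server_default)

Streamed responses that use the legacy DeepSeek format additionally fold the reasoning back into the content, as the condition above shows.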


@@ -209,6 +209,7 @@ export const AppContextProvider = ({
messages,
stream: true,
cache_prompt: true,
reasoning_format: 'none',
samplers: config.samplers,
temperature: config.temperature,
dynatemp_range: config.dynatemp_range,


@@ -162,8 +162,15 @@ class chat_template {
}), false);
caps_.supports_tools = contains(out, "some_tool");
auto out_empty = try_raw_render(json::array({dummy_user_msg, {{"role", "assistant"}, {"content", ""}}}), {}, false);
auto out_null = try_raw_render(json::array({dummy_user_msg, {{"role", "assistant"}, {"content", nullptr}}}), {}, false);
const auto render_with_content = [&](const json & content) {
const json assistant_msg {{"role", "assistant"}, {"content", content}};
// Render two assistant messages as some templates like QwQ-32B are handling
// the content differently depending on whether it's the last message or not
// (to remove the <think> tag in all but the last message).
return try_raw_render(json::array({dummy_user_msg, assistant_msg, dummy_user_msg, assistant_msg}), {}, false);
};
auto out_empty = render_with_content("");
auto out_null = render_with_content(json());
caps_.requires_non_null_content = contains(out_empty, user_needle) && !contains(out_null, user_needle);
json j_null;
@@ -191,12 +198,12 @@ class chat_template {
dummy_user_msg,
make_tool_calls_msg(json::array({make_tool_call("ipython", dummy_args_obj.dump())})),
}), {}, false);
auto tool_call_renders_str_arguments = contains(out, "\"argument_needle\":") || contains(out, "'argument_needle':");
auto tool_call_renders_str_arguments = contains(out, "<parameter=argument_needle>") || contains(out, "\"argument_needle\":") || contains(out, "'argument_needle':");
out = try_raw_render(json::array({
dummy_user_msg,
make_tool_calls_msg(json::array({make_tool_call("ipython", dummy_args_obj)})),
}), {}, false);
auto tool_call_renders_obj_arguments = contains(out, "\"argument_needle\":") || contains(out, "'argument_needle':");
auto tool_call_renders_obj_arguments = contains(out, "<parameter=argument_needle>") || contains(out, "\"argument_needle\":") || contains(out, "'argument_needle':");
caps_.supports_tool_calls = tool_call_renders_str_arguments || tool_call_renders_obj_arguments;
caps_.requires_object_arguments = !tool_call_renders_str_arguments && tool_call_renders_obj_arguments;
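Roughly what this capability probe does, sketched with Python's jinja2 rather than the minja engine used above; the helper name and message layout are illustrative, and real chat templates may need extra variables (bos_token, tools) and can reject unusual inputs:

from jinja2 import Environment

def probe_tool_call_arguments(template_source):
    # Render a dummy tool call twice - once with the arguments as a JSON string,
    # once as an object - and look for the needle in the output, including the
    # <parameter=...> style some templates emit.
    template = Environment().from_string(template_source)
    needles = ('"argument_needle":', "'argument_needle':", "<parameter=argument_needle>")

    def renders(arguments):
        messages = [
            {"role": "user", "content": "Hey"},
            {"role": "assistant", "content": None, "tool_calls": [{
                "id": "call_1", "type": "function",
                "function": {"name": "ipython", "arguments": arguments},
            }]},
        ]
        out = template.render(messages=messages, add_generation_prompt=False)
        return any(needle in out for needle in needles)

    str_args = renders('{"argument_needle": "print(\'Hello\')"}')
    obj_args = renders({"argument_needle": "print('Hello')"})
    return {
        "supports_tool_calls": str_args or obj_args,
        "requires_object_arguments": not str_args and obj_args,
    }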


@@ -1291,6 +1291,12 @@ public:
}
};
static bool in(const Value & value, const Value & container) {
return (((container.is_array() || container.is_object()) && container.contains(value)) ||
(value.is_string() && container.is_string() &&
container.to_str().find(value.to_str()) != std::string::npos));
}
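A Python paraphrase of the helper's semantics, for reference only (the function name is made up): membership for arrays and objects, substring search for strings, false otherwise.

def jinja_in(value, container):
    # Arrays and objects use membership (object membership tests the keys);
    # two strings fall back to a substring search.
    if isinstance(container, (list, dict)):
        return value in container
    if isinstance(value, str) and isinstance(container, str):
        return value in container
    return False

assert jinja_in("b", ["a", "b"])
assert jinja_in("needle", "a needle in a haystack")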
class BinaryOpExpr : public Expression {
public:
enum class Op { StrConcat, Add, Sub, Mul, MulMul, Div, DivDiv, Mod, Eq, Ne, Lt, Gt, Le, Ge, And, Or, In, NotIn, Is, IsNot };
@@ -1355,13 +1361,8 @@ public:
case Op::Gt: return l > r;
case Op::Le: return l <= r;
case Op::Ge: return l >= r;
case Op::In: return (((r.is_array() || r.is_object()) && r.contains(l)) ||
(l.is_string() && r.is_string() &&
r.to_str().find(l.to_str()) != std::string::npos));
case Op::NotIn:
return !(((r.is_array() || r.is_object()) && r.contains(l)) ||
(l.is_string() && r.is_string() &&
r.to_str().find(l.to_str()) != std::string::npos));
case Op::In: return in(l, r);
case Op::NotIn: return !in(l, r);
default: break;
}
throw std::runtime_error("Unknown binary operator");
@@ -1500,6 +1501,13 @@ public:
} else if (method->get_name() == "pop") {
vargs.expectArgs("pop method", {1, 1}, {0, 0});
return obj.pop(vargs.args[0]);
} else if (method->get_name() == "keys") {
vargs.expectArgs("keys method", {0, 0}, {0, 0});
auto result = Value::array();
for (const auto& key : obj.keys()) {
result.push_back(Value(key));
}
return result;
} else if (method->get_name() == "get") {
vargs.expectArgs("get method", {1, 2}, {0, 0});
auto key = vargs.args[0];
@@ -1541,6 +1549,16 @@ public:
} else if (method->get_name() == "capitalize") {
vargs.expectArgs("capitalize method", {0, 0}, {0, 0});
return Value(capitalize(str));
} else if (method->get_name() == "upper") {
vargs.expectArgs("upper method", {0, 0}, {0, 0});
auto result = str;
std::transform(result.begin(), result.end(), result.begin(), ::toupper);
return Value(result);
} else if (method->get_name() == "lower") {
vargs.expectArgs("lower method", {0, 0}, {0, 0});
auto result = str;
std::transform(result.begin(), result.end(), result.begin(), ::tolower);
return Value(result);
} else if (method->get_name() == "endswith") {
vargs.expectArgs("endswith method", {1, 1}, {0, 0});
auto suffix = vargs.args[0].get<std::string>();
@@ -2646,15 +2664,11 @@ inline std::shared_ptr<Context> Context::builtins() {
auto items = Value::array();
if (args.contains("object")) {
auto & obj = args.at("object");
if (obj.is_string()) {
auto json_obj = json::parse(obj.get<std::string>());
for (const auto & kv : json_obj.items()) {
items.push_back(Value::array({kv.key(), kv.value()}));
}
} else if (!obj.is_null()) {
for (auto & key : obj.keys()) {
items.push_back(Value::array({key, obj.at(key)}));
}
if (!obj.is_object()) {
throw std::runtime_error("Can only get item pairs from a mapping");
}
for (auto & key : obj.keys()) {
items.push_back(Value::array({key, obj.at(key)}));
}
}
return items;
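A rough Python paraphrase of the stricter items() behaviour (hypothetical function name): only mappings yield key/value pairs, and JSON strings are no longer parsed implicitly.

def jinja_items(obj):
    if not isinstance(obj, dict):
        raise TypeError("Can only get item pairs from a mapping")
    return [[key, value] for key, value in obj.items()]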
@@ -2782,6 +2796,9 @@ inline std::shared_ptr<Context> Context::builtins() {
if (!items.is_array()) throw std::runtime_error("object is not iterable");
return items;
}));
globals.set("in", simple_function("in", { "item", "items" }, [](const std::shared_ptr<Context> &, Value & args) -> Value {
return in(args.at("item"), args.at("items"));
}));
globals.set("unique", simple_function("unique", { "items" }, [](const std::shared_ptr<Context> &, Value & args) -> Value {
auto & items = args.at("items");
if (!items.is_array()) throw std::runtime_error("object is not iterable");