Compare commits

..

28 Commits
b6065 ... b6093

Author SHA1 Message Date
Romain Biessy
3306ceabf0 sycl: fix mul_mat selection (#15092) 2025-08-05 18:39:55 +02:00
Juk Armstrong
c81de6e107 Fix glm4moe bug (#15088)
Some checks are pending
CI / macOS-latest-cmake-arm64 (push) Waiting to run
CI / macOS-latest-cmake-x64 (push) Waiting to run
CI / macOS-latest-cmake-arm64-webgpu (push) Waiting to run
CI / ubuntu-cpu-cmake (arm64, ubuntu-22.04-arm) (push) Waiting to run
CI / ubuntu-cpu-cmake (x64, ubuntu-22.04) (push) Waiting to run
CI / ubuntu-latest-cmake-sanitizer (Debug, ADDRESS) (push) Waiting to run
CI / ubuntu-latest-cmake-sanitizer (Debug, THREAD) (push) Waiting to run
CI / ubuntu-latest-cmake-sanitizer (Debug, UNDEFINED) (push) Waiting to run
CI / ubuntu-latest-llguidance (push) Waiting to run
CI / ubuntu-latest-cmake-rpc (push) Waiting to run
CI / ubuntu-22-cmake-vulkan (push) Waiting to run
CI / ubuntu-22-cmake-webgpu (push) Waiting to run
CI / ubuntu-22-cmake-hip (push) Waiting to run
CI / ubuntu-22-cmake-musa (push) Waiting to run
CI / ubuntu-22-cmake-sycl (push) Waiting to run
CI / ubuntu-22-cmake-sycl-fp16 (push) Waiting to run
CI / build-linux-cross (push) Waiting to run
CI / build-cmake-pkg (push) Waiting to run
CI / macOS-latest-cmake-ios (push) Waiting to run
CI / macOS-latest-cmake-tvos (push) Waiting to run
CI / macOS-latest-cmake-visionos (push) Waiting to run
CI / macOS-latest-swift (generic/platform=iOS) (push) Waiting to run
CI / macOS-latest-swift (generic/platform=macOS) (push) Waiting to run
CI / macOS-latest-swift (generic/platform=tvOS) (push) Waiting to run
CI / windows-msys2 (Release, clang-x86_64, CLANG64) (push) Waiting to run
CI / windows-msys2 (Release, ucrt-x86_64, UCRT64) (push) Waiting to run
CI / windows-latest-cmake (arm64, llvm-arm64, -G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON) (push) Waiting to run
CI / windows-latest-cmake (arm64, llvm-arm64-opencl-adreno, -G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON) (push) Waiting to run
CI / windows-latest-cmake (x64, cpu-x64 (static), -G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF) (push) Waiting to run
CI / windows-latest-cmake (x64, openblas-x64, -G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_BLAS=… (push) Waiting to run
CI / windows-latest-cmake (x64, vulkan-x64, -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON) (push) Waiting to run
CI / ubuntu-latest-cmake-cuda (push) Waiting to run
CI / windows-2022-cmake-cuda (12.4) (push) Waiting to run
CI / windows-latest-cmake-sycl (push) Waiting to run
CI / windows-latest-cmake-hip (push) Waiting to run
CI / ios-xcode-build (push) Waiting to run
CI / android-build (push) Waiting to run
CI / openEuler-latest-cmake-cann (aarch64, Release, 8.1.RC1.alpha001-910b-openeuler22.03-py3.10, ascend910b3) (push) Waiting to run
CI / openEuler-latest-cmake-cann (x86, Release, 8.1.RC1.alpha001-910b-openeuler22.03-py3.10, ascend910b3) (push) Waiting to run
2025-08-05 13:56:44 +01:00
Alex Wu
22f060c9c4 webui: fix markdown table (#15081)
* webui: fix markdown table

* webui: fix table display with themes
2025-08-05 13:56:44 +02:00
compilade
ee3a9fcf88 context : fix index overflow on huge outputs (#15080)
* context : fix overflow when re-ordering huge outputs

* context : fix logits size overflow for huge batches
2025-08-05 11:27:45 +02:00
Diego Devesa
ec428b02c3 llama : add --n-cpu-moe option (#15077)
Some checks are pending
CI / macOS-latest-cmake-arm64 (push) Waiting to run
CI / macOS-latest-cmake-x64 (push) Waiting to run
CI / macOS-latest-cmake-arm64-webgpu (push) Waiting to run
CI / ubuntu-cpu-cmake (arm64, ubuntu-22.04-arm) (push) Waiting to run
CI / ubuntu-cpu-cmake (x64, ubuntu-22.04) (push) Waiting to run
CI / ubuntu-latest-cmake-sanitizer (Debug, ADDRESS) (push) Waiting to run
CI / ubuntu-latest-cmake-sanitizer (Debug, THREAD) (push) Waiting to run
CI / ubuntu-latest-cmake-sanitizer (Debug, UNDEFINED) (push) Waiting to run
CI / ubuntu-latest-llguidance (push) Waiting to run
CI / ubuntu-latest-cmake-rpc (push) Waiting to run
CI / ubuntu-22-cmake-vulkan (push) Waiting to run
CI / ubuntu-22-cmake-webgpu (push) Waiting to run
CI / ubuntu-22-cmake-hip (push) Waiting to run
CI / ubuntu-22-cmake-musa (push) Waiting to run
CI / ubuntu-22-cmake-sycl (push) Waiting to run
CI / ubuntu-22-cmake-sycl-fp16 (push) Waiting to run
CI / build-linux-cross (push) Waiting to run
CI / build-cmake-pkg (push) Waiting to run
CI / macOS-latest-cmake-ios (push) Waiting to run
CI / macOS-latest-cmake-tvos (push) Waiting to run
CI / macOS-latest-cmake-visionos (push) Waiting to run
CI / macOS-latest-swift (generic/platform=iOS) (push) Waiting to run
CI / macOS-latest-swift (generic/platform=macOS) (push) Waiting to run
CI / macOS-latest-swift (generic/platform=tvOS) (push) Waiting to run
CI / windows-msys2 (Release, clang-x86_64, CLANG64) (push) Waiting to run
CI / windows-msys2 (Release, ucrt-x86_64, UCRT64) (push) Waiting to run
CI / windows-latest-cmake (arm64, llvm-arm64, -G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON) (push) Waiting to run
CI / windows-latest-cmake (arm64, llvm-arm64-opencl-adreno, -G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON) (push) Waiting to run
CI / windows-latest-cmake (x64, cpu-x64 (static), -G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF) (push) Waiting to run
CI / windows-latest-cmake (x64, openblas-x64, -G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_BLAS=… (push) Waiting to run
CI / windows-latest-cmake (x64, vulkan-x64, -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON) (push) Waiting to run
CI / ubuntu-latest-cmake-cuda (push) Waiting to run
CI / windows-2022-cmake-cuda (12.4) (push) Waiting to run
CI / windows-latest-cmake-sycl (push) Waiting to run
CI / windows-latest-cmake-hip (push) Waiting to run
CI / ios-xcode-build (push) Waiting to run
CI / android-build (push) Waiting to run
CI / openEuler-latest-cmake-cann (aarch64, Release, 8.1.RC1.alpha001-910b-openeuler22.03-py3.10, ascend910b3) (push) Waiting to run
CI / openEuler-latest-cmake-cann (x86, Release, 8.1.RC1.alpha001-910b-openeuler22.03-py3.10, ascend910b3) (push) Waiting to run
* llama : add --n-cpu-moe option

Keeps the MoE weights of the first N layers in the CPU
2025-08-05 01:05:36 +02:00
compilade
19f68fa5a4 imatrix : warn when GGUF imatrix is saved without .gguf suffix (#15076)
Some checks failed
CI / macOS-latest-cmake-arm64 (push) Waiting to run
CI / macOS-latest-cmake-x64 (push) Waiting to run
CI / macOS-latest-cmake-arm64-webgpu (push) Waiting to run
CI / ubuntu-cpu-cmake (arm64, ubuntu-22.04-arm) (push) Waiting to run
CI / ubuntu-cpu-cmake (x64, ubuntu-22.04) (push) Waiting to run
CI / ubuntu-latest-cmake-sanitizer (Debug, ADDRESS) (push) Waiting to run
CI / ubuntu-latest-cmake-sanitizer (Debug, THREAD) (push) Waiting to run
CI / ubuntu-latest-cmake-sanitizer (Debug, UNDEFINED) (push) Waiting to run
CI / ubuntu-latest-llguidance (push) Waiting to run
CI / ubuntu-latest-cmake-rpc (push) Waiting to run
CI / ubuntu-22-cmake-vulkan (push) Waiting to run
CI / ubuntu-22-cmake-webgpu (push) Waiting to run
CI / ubuntu-22-cmake-hip (push) Waiting to run
CI / ubuntu-22-cmake-musa (push) Waiting to run
CI / ubuntu-22-cmake-sycl (push) Waiting to run
CI / ubuntu-22-cmake-sycl-fp16 (push) Waiting to run
CI / build-linux-cross (push) Waiting to run
CI / build-cmake-pkg (push) Waiting to run
CI / macOS-latest-cmake-ios (push) Waiting to run
CI / macOS-latest-cmake-tvos (push) Waiting to run
CI / macOS-latest-cmake-visionos (push) Waiting to run
CI / macOS-latest-swift (generic/platform=iOS) (push) Waiting to run
CI / macOS-latest-swift (generic/platform=macOS) (push) Waiting to run
CI / macOS-latest-swift (generic/platform=tvOS) (push) Waiting to run
CI / windows-msys2 (Release, clang-x86_64, CLANG64) (push) Waiting to run
CI / windows-msys2 (Release, ucrt-x86_64, UCRT64) (push) Waiting to run
CI / windows-latest-cmake (arm64, llvm-arm64, -G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON) (push) Waiting to run
CI / windows-latest-cmake (arm64, llvm-arm64-opencl-adreno, -G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON) (push) Waiting to run
CI / windows-latest-cmake (x64, cpu-x64 (static), -G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF) (push) Waiting to run
CI / windows-latest-cmake (x64, openblas-x64, -G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_BLAS=… (push) Waiting to run
CI / windows-latest-cmake (x64, vulkan-x64, -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON) (push) Waiting to run
CI / ubuntu-latest-cmake-cuda (push) Waiting to run
CI / windows-2022-cmake-cuda (12.4) (push) Waiting to run
CI / windows-latest-cmake-sycl (push) Waiting to run
CI / windows-latest-cmake-hip (push) Waiting to run
CI / ios-xcode-build (push) Waiting to run
CI / android-build (push) Waiting to run
CI / openEuler-latest-cmake-cann (aarch64, Release, 8.1.RC1.alpha001-910b-openeuler22.03-py3.10, ascend910b3) (push) Waiting to run
CI / openEuler-latest-cmake-cann (x86, Release, 8.1.RC1.alpha001-910b-openeuler22.03-py3.10, ascend910b3) (push) Waiting to run
Check Pre-Tokenizer Hashes / pre-tokenizer-hashes (push) Has been cancelled
Python check requirements.txt / check-requirements (push) Has been cancelled
flake8 Lint / Lint (push) Has been cancelled
Python Type-Check / pyright type-check (push) Has been cancelled
* imatrix : add warning when suffix is not .gguf for GGUF imatrix

* imatrix : only warn about suffix when output format is unspecified
2025-08-04 23:26:52 +02:00
Christian Kastner
41613437ff cmake: Add GGML_BACKEND_DIR option (#15074)
* cmake: Add GGML_BACKEND_DIR option

This can be used by distributions to specify where to look for backends
when ggml is built with GGML_BACKEND_DL=ON.

* Fix phrasing
2025-08-04 21:29:14 +02:00
Sigbjørn Skjæret
e5bebe5251 gguf-py : add --chat-template-file to gguf_new_metadata (#15075) 2025-08-04 21:01:48 +02:00
Sam
ef0144c087 model: support GLM 4.5 family of models (#14939)
* model: Add GLM 4.5 (#14921)

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Merge in PR suggestions

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* model: Add GLM 4.5 family of models (#14921)

1. Updated tensor_mapping.py with NextN tensor mappings

- Added proper tensor mappings for all NextN/MTP tensors in /Users/samm/git/llama.cpp/gguf-py/gguf/tensor_mapping.py
- Added mappings for: eh_proj, embed_tokens, enorm, hnorm, shared_head.head, shared_head.norm

2. Added num_nextn_predict_layers configuration

- Added LLM_KV_NUM_NEXTN_PREDICT_LAYERS constant to llama-arch.h and llama-arch.cpp
- Added num_nextn_predict_layers field to llama_hparams struct
- Updated GLM4_MOE parameter loading in llama-model.cpp to read this parameter
- Modified tensor loading logic to conditionally load NextN tensors based on num_nextn_predict_layers
- Added GGUF writer support in gguf_writer.py with add_num_nextn_predict_layers() method
- Updated conversion script to extract and write this parameter from HuggingFace config

3. Added FIM tokens for GLM4_MOE

- Added GLM-4.5's FIM tokens to llama-vocab.cpp:
  - <|code_prefix|> for FIM_PRE
  - <|code_suffix|> for FIM_SUF
  - <|code_middle|> for FIM_MID

4. Removed manual NextN tensor handling

- Removed the special-case handling in convert_hf_to_gguf.py that manually mapped NextN tensors
- NextN tensors are now handled automatically through the proper tensor mapping system

* glm 4.5 update tensors names

* model: glm 4.5 apply suggestions from code review

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update src/llama-model.cpp

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* model: glm 4.5 apply suggestions from code review

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* model: glm 4.5 apply suggestions from code review

* Apply suggestions from code review

* patch broken chat template

* typings fix

* add TENSOR_SKIP flag


Co-authored-by: Diego Devesa <slarengh@gmail.com>

* Update src/llama-model-loader.h

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
Co-authored-by: Diego Devesa <slarengh@gmail.com>
2025-08-04 20:29:25 +02:00
Sigbjørn Skjæret
2721257e3e quantize : fix confusing error message if ftype is invalid (#15071) 2025-08-04 18:11:02 +02:00
Reese Levine
587d0118f5 ggml: WebGPU backend host improvements and style fixing (#14978)
* Add parameter buffer pool, batching of submissions, refactor command building/submission

* Add header for linux builds

* Free staged parameter buffers at once

* Format with clang-format

* Fix thread-safe implementation

* Use device implicit synchronization

* Update workflow to use custom release

* Remove testing branch workflow
2025-08-04 08:52:43 -07:00
Jeff Bolz
5aa1105da2 vulkan: fix build when using glslang that does not support coopmat2 (#15062)
Some checks are pending
CI / macOS-latest-cmake-arm64 (push) Waiting to run
CI / macOS-latest-cmake-x64 (push) Waiting to run
CI / macOS-latest-cmake-arm64-webgpu (push) Waiting to run
CI / ubuntu-cpu-cmake (arm64, ubuntu-22.04-arm) (push) Waiting to run
CI / ubuntu-cpu-cmake (x64, ubuntu-22.04) (push) Waiting to run
CI / ubuntu-latest-cmake-sanitizer (Debug, ADDRESS) (push) Waiting to run
CI / ubuntu-latest-cmake-sanitizer (Debug, THREAD) (push) Waiting to run
CI / ubuntu-latest-cmake-sanitizer (Debug, UNDEFINED) (push) Waiting to run
CI / ubuntu-latest-llguidance (push) Waiting to run
CI / ubuntu-latest-cmake-rpc (push) Waiting to run
CI / ubuntu-22-cmake-vulkan (push) Waiting to run
CI / ubuntu-22-cmake-webgpu (push) Waiting to run
CI / ubuntu-22-cmake-hip (push) Waiting to run
CI / ubuntu-22-cmake-musa (push) Waiting to run
CI / ubuntu-22-cmake-sycl (push) Waiting to run
CI / ubuntu-22-cmake-sycl-fp16 (push) Waiting to run
CI / build-linux-cross (push) Waiting to run
CI / build-cmake-pkg (push) Waiting to run
CI / macOS-latest-cmake-ios (push) Waiting to run
CI / macOS-latest-cmake-tvos (push) Waiting to run
CI / macOS-latest-cmake-visionos (push) Waiting to run
CI / macOS-latest-swift (generic/platform=iOS) (push) Waiting to run
CI / macOS-latest-swift (generic/platform=macOS) (push) Waiting to run
CI / macOS-latest-swift (generic/platform=tvOS) (push) Waiting to run
CI / windows-msys2 (Release, clang-x86_64, CLANG64) (push) Waiting to run
CI / windows-msys2 (Release, ucrt-x86_64, UCRT64) (push) Waiting to run
CI / windows-latest-cmake (arm64, llvm-arm64, -G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON) (push) Waiting to run
CI / windows-latest-cmake (arm64, llvm-arm64-opencl-adreno, -G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON) (push) Waiting to run
CI / windows-latest-cmake (x64, cpu-x64 (static), -G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF) (push) Waiting to run
CI / windows-latest-cmake (x64, openblas-x64, -G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_BLAS=… (push) Waiting to run
CI / windows-latest-cmake (x64, vulkan-x64, -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON) (push) Waiting to run
CI / ubuntu-latest-cmake-cuda (push) Waiting to run
CI / windows-2022-cmake-cuda (12.4) (push) Waiting to run
CI / windows-latest-cmake-sycl (push) Waiting to run
CI / windows-latest-cmake-hip (push) Waiting to run
CI / ios-xcode-build (push) Waiting to run
CI / android-build (push) Waiting to run
CI / openEuler-latest-cmake-cann (aarch64, Release, 8.1.RC1.alpha001-910b-openeuler22.03-py3.10, ascend910b3) (push) Waiting to run
CI / openEuler-latest-cmake-cann (x86, Release, 8.1.RC1.alpha001-910b-openeuler22.03-py3.10, ascend910b3) (push) Waiting to run
2025-08-04 07:09:19 +02:00
compilade
d31192b4ee imatrix : use GGUF by default (#14842)
Some checks are pending
CI / macOS-latest-cmake-arm64 (push) Waiting to run
CI / macOS-latest-cmake-x64 (push) Waiting to run
CI / macOS-latest-cmake-arm64-webgpu (push) Waiting to run
CI / ubuntu-cpu-cmake (arm64, ubuntu-22.04-arm) (push) Waiting to run
CI / ubuntu-cpu-cmake (x64, ubuntu-22.04) (push) Waiting to run
CI / ubuntu-latest-cmake-sanitizer (Debug, ADDRESS) (push) Waiting to run
CI / ubuntu-latest-cmake-sanitizer (Debug, THREAD) (push) Waiting to run
CI / ubuntu-latest-cmake-sanitizer (Debug, UNDEFINED) (push) Waiting to run
CI / ubuntu-latest-llguidance (push) Waiting to run
CI / ubuntu-latest-cmake-rpc (push) Waiting to run
CI / ubuntu-22-cmake-vulkan (push) Waiting to run
CI / ubuntu-22-cmake-webgpu (push) Waiting to run
CI / ubuntu-22-cmake-hip (push) Waiting to run
CI / ubuntu-22-cmake-musa (push) Waiting to run
CI / ubuntu-22-cmake-sycl (push) Waiting to run
CI / ubuntu-22-cmake-sycl-fp16 (push) Waiting to run
CI / build-linux-cross (push) Waiting to run
CI / build-cmake-pkg (push) Waiting to run
CI / macOS-latest-cmake-ios (push) Waiting to run
CI / macOS-latest-cmake-tvos (push) Waiting to run
CI / macOS-latest-cmake-visionos (push) Waiting to run
CI / macOS-latest-swift (generic/platform=iOS) (push) Waiting to run
CI / macOS-latest-swift (generic/platform=macOS) (push) Waiting to run
CI / macOS-latest-swift (generic/platform=tvOS) (push) Waiting to run
CI / windows-msys2 (Release, clang-x86_64, CLANG64) (push) Waiting to run
CI / windows-msys2 (Release, ucrt-x86_64, UCRT64) (push) Waiting to run
CI / windows-latest-cmake (arm64, llvm-arm64, -G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON) (push) Waiting to run
CI / windows-latest-cmake (arm64, llvm-arm64-opencl-adreno, -G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON) (push) Waiting to run
CI / windows-latest-cmake (x64, cpu-x64 (static), -G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF) (push) Waiting to run
CI / windows-latest-cmake (x64, openblas-x64, -G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_BLAS=… (push) Waiting to run
CI / windows-latest-cmake (x64, vulkan-x64, -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON) (push) Waiting to run
CI / ubuntu-latest-cmake-cuda (push) Waiting to run
CI / windows-2022-cmake-cuda (12.4) (push) Waiting to run
CI / windows-latest-cmake-sycl (push) Waiting to run
CI / windows-latest-cmake-hip (push) Waiting to run
CI / ios-xcode-build (push) Waiting to run
CI / android-build (push) Waiting to run
CI / openEuler-latest-cmake-cann (aarch64, Release, 8.1.RC1.alpha001-910b-openeuler22.03-py3.10, ascend910b3) (push) Waiting to run
CI / openEuler-latest-cmake-cann (x86, Release, 8.1.RC1.alpha001-910b-openeuler22.03-py3.10, ascend910b3) (push) Waiting to run
Check Pre-Tokenizer Hashes / pre-tokenizer-hashes (push) Waiting to run
Python check requirements.txt / check-requirements (push) Waiting to run
flake8 Lint / Lint (push) Waiting to run
Python Type-Check / pyright type-check (push) Waiting to run
* imatrix : use GGUF by default

* imatrix : use GGUF regardless of the output filename

The legacy format can only be produced with --output-format dat
2025-08-03 22:00:05 +02:00
compilade
0a2f5496be imatrix : fix 3d activation handling for hybrid and recurrent models (#14994)
* imatrix : use a single count for dense 3d tensors

* imatrix : fix 3d activations when model tensor is 2d

* imatrix : fix 3d tensor counts
2025-08-03 21:49:13 +02:00
compilade
11a3811164 memory : handle kv_unified for hybrid models (#15050) 2025-08-03 21:43:07 +02:00
Csaba Kecskemeti
97366dc6ab vocab : JetBrains Mellum pre-tokenizer (#15045) 2025-08-03 21:38:18 +02:00
Gabriel Larson
83bc2f288c model : add text-only support for Kimi-VL (and find special tokens in text_config) (#15051)
* basic kimi-vl textmodel conversion

* check config["text_config"] for special tokens
2025-08-03 16:56:25 +02:00
Jeff Bolz
6c7a441161 vulkan: Use coopmat2 for conv2d (#14982)
Some checks are pending
CI / macOS-latest-cmake-arm64 (push) Waiting to run
CI / macOS-latest-cmake-x64 (push) Waiting to run
CI / macOS-latest-cmake-arm64-webgpu (push) Waiting to run
CI / ubuntu-cpu-cmake (arm64, ubuntu-22.04-arm) (push) Waiting to run
CI / ubuntu-cpu-cmake (x64, ubuntu-22.04) (push) Waiting to run
CI / ubuntu-latest-cmake-sanitizer (Debug, ADDRESS) (push) Waiting to run
CI / ubuntu-latest-cmake-sanitizer (Debug, THREAD) (push) Waiting to run
CI / ubuntu-latest-cmake-sanitizer (Debug, UNDEFINED) (push) Waiting to run
CI / ubuntu-latest-llguidance (push) Waiting to run
CI / ubuntu-latest-cmake-rpc (push) Waiting to run
CI / ubuntu-22-cmake-vulkan (push) Waiting to run
CI / ubuntu-22-cmake-webgpu (push) Waiting to run
CI / ubuntu-22-cmake-hip (push) Waiting to run
CI / ubuntu-22-cmake-musa (push) Waiting to run
CI / ubuntu-22-cmake-sycl (push) Waiting to run
CI / ubuntu-22-cmake-sycl-fp16 (push) Waiting to run
CI / build-linux-cross (push) Waiting to run
CI / build-cmake-pkg (push) Waiting to run
CI / macOS-latest-cmake-ios (push) Waiting to run
CI / macOS-latest-cmake-tvos (push) Waiting to run
CI / macOS-latest-cmake-visionos (push) Waiting to run
CI / macOS-latest-swift (generic/platform=iOS) (push) Waiting to run
CI / macOS-latest-swift (generic/platform=macOS) (push) Waiting to run
CI / macOS-latest-swift (generic/platform=tvOS) (push) Waiting to run
CI / windows-msys2 (Release, clang-x86_64, CLANG64) (push) Waiting to run
CI / windows-msys2 (Release, ucrt-x86_64, UCRT64) (push) Waiting to run
CI / windows-latest-cmake (arm64, llvm-arm64, -G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON) (push) Waiting to run
CI / windows-latest-cmake (arm64, llvm-arm64-opencl-adreno, -G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON) (push) Waiting to run
CI / windows-latest-cmake (x64, cpu-x64 (static), -G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF) (push) Waiting to run
CI / windows-latest-cmake (x64, openblas-x64, -G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_BLAS=… (push) Waiting to run
CI / windows-latest-cmake (x64, vulkan-x64, -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON) (push) Waiting to run
CI / ubuntu-latest-cmake-cuda (push) Waiting to run
CI / windows-2022-cmake-cuda (12.4) (push) Waiting to run
CI / windows-latest-cmake-sycl (push) Waiting to run
CI / windows-latest-cmake-hip (push) Waiting to run
CI / ios-xcode-build (push) Waiting to run
CI / android-build (push) Waiting to run
CI / openEuler-latest-cmake-cann (aarch64, Release, 8.1.RC1.alpha001-910b-openeuler22.03-py3.10, ascend910b3) (push) Waiting to run
CI / openEuler-latest-cmake-cann (x86, Release, 8.1.RC1.alpha001-910b-openeuler22.03-py3.10, ascend910b3) (push) Waiting to run
2025-08-03 14:23:57 +02:00
lhez
5c0eb5ef54 opencl: fix adreno compiler detection logic (#15029)
Some checks are pending
CI / macOS-latest-cmake-arm64 (push) Waiting to run
CI / macOS-latest-cmake-x64 (push) Waiting to run
CI / macOS-latest-cmake-arm64-webgpu (push) Waiting to run
CI / ubuntu-cpu-cmake (arm64, ubuntu-22.04-arm) (push) Waiting to run
CI / ubuntu-cpu-cmake (x64, ubuntu-22.04) (push) Waiting to run
CI / ubuntu-latest-cmake-sanitizer (Debug, ADDRESS) (push) Waiting to run
CI / ubuntu-latest-cmake-sanitizer (Debug, THREAD) (push) Waiting to run
CI / ubuntu-latest-cmake-sanitizer (Debug, UNDEFINED) (push) Waiting to run
CI / ubuntu-latest-llguidance (push) Waiting to run
CI / ubuntu-latest-cmake-rpc (push) Waiting to run
CI / ubuntu-22-cmake-vulkan (push) Waiting to run
CI / ubuntu-22-cmake-webgpu (push) Waiting to run
CI / ubuntu-22-cmake-hip (push) Waiting to run
CI / ubuntu-22-cmake-musa (push) Waiting to run
CI / ubuntu-22-cmake-sycl (push) Waiting to run
CI / ubuntu-22-cmake-sycl-fp16 (push) Waiting to run
CI / build-linux-cross (push) Waiting to run
CI / build-cmake-pkg (push) Waiting to run
CI / macOS-latest-cmake-ios (push) Waiting to run
CI / macOS-latest-cmake-tvos (push) Waiting to run
CI / macOS-latest-cmake-visionos (push) Waiting to run
CI / macOS-latest-swift (generic/platform=iOS) (push) Waiting to run
CI / macOS-latest-swift (generic/platform=macOS) (push) Waiting to run
CI / macOS-latest-swift (generic/platform=tvOS) (push) Waiting to run
CI / windows-msys2 (Release, clang-x86_64, CLANG64) (push) Waiting to run
CI / windows-msys2 (Release, ucrt-x86_64, UCRT64) (push) Waiting to run
CI / windows-latest-cmake (arm64, llvm-arm64, -G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON) (push) Waiting to run
CI / windows-latest-cmake (arm64, llvm-arm64-opencl-adreno, -G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON) (push) Waiting to run
CI / windows-latest-cmake (x64, cpu-x64 (static), -G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF) (push) Waiting to run
CI / windows-latest-cmake (x64, openblas-x64, -G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_BLAS=… (push) Waiting to run
CI / windows-latest-cmake (x64, vulkan-x64, -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON) (push) Waiting to run
CI / ubuntu-latest-cmake-cuda (push) Waiting to run
CI / windows-2022-cmake-cuda (12.4) (push) Waiting to run
CI / windows-latest-cmake-sycl (push) Waiting to run
CI / windows-latest-cmake-hip (push) Waiting to run
CI / ios-xcode-build (push) Waiting to run
CI / android-build (push) Waiting to run
CI / openEuler-latest-cmake-cann (aarch64, Release, 8.1.RC1.alpha001-910b-openeuler22.03-py3.10, ascend910b3) (push) Waiting to run
CI / openEuler-latest-cmake-cann (x86, Release, 8.1.RC1.alpha001-910b-openeuler22.03-py3.10, ascend910b3) (push) Waiting to run
Check Pre-Tokenizer Hashes / pre-tokenizer-hashes (push) Waiting to run
Python check requirements.txt / check-requirements (push) Waiting to run
flake8 Lint / Lint (push) Waiting to run
Python Type-Check / pyright type-check (push) Waiting to run
2025-08-02 19:51:18 +02:00
Johannes Gäßler
03d4698218 CUDA: use mma FA kernel for gqa > 4 on RTX 4000 (#15035) 2025-08-02 16:37:08 +02:00
leejet
3303c19b16 cuda: make im2col a little faster (#15025) 2025-08-02 17:15:36 +03:00
Daniel Bevenius
4fdea540bd kv-cache : skip alignment of n_stream in kv-cache log msg [no ci] (#15040)
This commit removes the right alignment the `n_stream` value in the
log message in the `llama_kv_cache_unified` constructor.

The motivation for this change is to enhance the readability of log
message. Currently the output looks like this:
```console
llama_kv_cache_unified: size = 2048.00 MiB (  4096 cells,  32 layers,  1/ 1 seqs), K (f16): 1024.00 MiB, V (f16): 1024.00 MiB
```
Notice that the `n_stream` value is right aligned, which makes it a
little harder to read.

With the change in this commit the output will look like
```console
llama_kv_cache_unified: size = 2048.00 MiB (  4096 cells,  32 layers, 1/1 seqs), K (f16): 1024.00 MiB, V (f16): 1024.00 MiB
```
2025-08-02 17:14:57 +03:00
Georgi Gerganov
a4569c41fd llama : enable LLAMA_SET_ROWS=1 by default (#14959)
ggml-ci
2025-08-02 17:14:21 +03:00
Georgi Gerganov
15e92fd337 cuda, sycl : fix batched gemm when ne02 == 1 && ne03 > 1 (#15038)
* cuda, sycl : fix batched gemm when ne02 == 1 && ne03 > 1

ggml-ci

* cont : fix cont types

ggml-ci

* cont : adopt variable names and comment from the other branch
2025-08-02 17:13:05 +03:00
Sigbjørn Skjæret
2bf3fbf0b5 ci : check that pre-tokenizer hashes are up-to-date (#15032)
* torch is not required for convert_hf_to_gguf_update

* add --check-missing parameter

* check that pre-tokenizer hashes are up-to-date
2025-08-02 14:39:01 +02:00
Douglas Hanley
711d5e6fe6 convert : fix Qwen3-Embedding pre-tokenizer hash (#15030)
Some checks are pending
CI / macOS-latest-cmake-arm64 (push) Waiting to run
CI / macOS-latest-cmake-x64 (push) Waiting to run
CI / macOS-latest-cmake-arm64-webgpu (push) Waiting to run
CI / ubuntu-cpu-cmake (arm64, ubuntu-22.04-arm) (push) Waiting to run
CI / ubuntu-cpu-cmake (x64, ubuntu-22.04) (push) Waiting to run
CI / ubuntu-latest-cmake-sanitizer (Debug, ADDRESS) (push) Waiting to run
CI / ubuntu-latest-cmake-sanitizer (Debug, THREAD) (push) Waiting to run
CI / ubuntu-latest-cmake-sanitizer (Debug, UNDEFINED) (push) Waiting to run
CI / ubuntu-latest-llguidance (push) Waiting to run
CI / ubuntu-latest-cmake-rpc (push) Waiting to run
CI / ubuntu-22-cmake-vulkan (push) Waiting to run
CI / ubuntu-22-cmake-webgpu (push) Waiting to run
CI / ubuntu-22-cmake-hip (push) Waiting to run
CI / ubuntu-22-cmake-musa (push) Waiting to run
CI / ubuntu-22-cmake-sycl (push) Waiting to run
CI / ubuntu-22-cmake-sycl-fp16 (push) Waiting to run
CI / build-linux-cross (push) Waiting to run
CI / build-cmake-pkg (push) Waiting to run
CI / macOS-latest-cmake-ios (push) Waiting to run
CI / macOS-latest-cmake-tvos (push) Waiting to run
CI / macOS-latest-cmake-visionos (push) Waiting to run
CI / macOS-latest-swift (generic/platform=iOS) (push) Waiting to run
CI / macOS-latest-swift (generic/platform=macOS) (push) Waiting to run
CI / macOS-latest-swift (generic/platform=tvOS) (push) Waiting to run
CI / windows-msys2 (Release, clang-x86_64, CLANG64) (push) Waiting to run
CI / windows-msys2 (Release, ucrt-x86_64, UCRT64) (push) Waiting to run
CI / windows-latest-cmake (arm64, llvm-arm64, -G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON) (push) Waiting to run
CI / windows-latest-cmake (arm64, llvm-arm64-opencl-adreno, -G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON) (push) Waiting to run
CI / windows-latest-cmake (x64, cpu-x64 (static), -G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF) (push) Waiting to run
CI / windows-latest-cmake (x64, openblas-x64, -G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_BLAS=… (push) Waiting to run
CI / windows-latest-cmake (x64, vulkan-x64, -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON) (push) Waiting to run
CI / ubuntu-latest-cmake-cuda (push) Waiting to run
CI / windows-2022-cmake-cuda (12.4) (push) Waiting to run
CI / windows-latest-cmake-sycl (push) Waiting to run
CI / windows-latest-cmake-hip (push) Waiting to run
CI / ios-xcode-build (push) Waiting to run
CI / android-build (push) Waiting to run
CI / openEuler-latest-cmake-cann (aarch64, Release, 8.1.RC1.alpha001-910b-openeuler22.03-py3.10, ascend910b3) (push) Waiting to run
CI / openEuler-latest-cmake-cann (x86, Release, 8.1.RC1.alpha001-910b-openeuler22.03-py3.10, ascend910b3) (push) Waiting to run
Python check requirements.txt / check-requirements (push) Waiting to run
flake8 Lint / Lint (push) Waiting to run
Python Type-Check / pyright type-check (push) Waiting to run
2025-08-02 12:51:02 +02:00
Jhen-Jie Hong
f738989dcb chat : fix multiple tool_calls on hermes-2-pro (#14962) 2025-08-02 18:04:48 +08:00
Jeff Bolz
4cb208c93c vulkan: coopmat2 mul_mat optimizations (#14934)
- Increase tile size for k-quants, to match non-k-quants
- Choose more carefully between large and medium tiles, considering how it
  interacts with split_k
- Allow larger/non-power of two split_k, and make the splits a multiple of 256
- Use split_k==3 to when >1/2 and <=2/3 of the SMs would hae been used
2025-08-02 11:21:37 +02:00
47 changed files with 1571 additions and 605 deletions

View File

@@ -159,31 +159,15 @@ jobs:
- name: Dawn Dependency
id: dawn-depends
run: |
ARTIFACTS_JSON=$(curl -s -L \
-H "Accept: application/vnd.github+json" \
-H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
-H "X-GitHub-Api-Version: 2022-11-28" \
"https://api.github.com/repos/google/dawn/actions/artifacts")
echo "Finding latest macos-latest-Release artifact..."
DOWNLOAD_URL=$(echo "$ARTIFACTS_JSON" | jq -r '.artifacts
| sort_by(.created_at)
| reverse
| map(select(.name | test("macos-latest-Release$")))
| .[0].archive_download_url')
if [ "$DOWNLOAD_URL" = "null" ] || [ -z "$DOWNLOAD_URL" ]; then
echo "No suitable Dawn artifact found!"
exit 1
fi
echo "Downloading from: $DOWNLOAD_URL"
curl -L \
-H "Accept: application/vnd.github+json" \
-H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
-o artifact.zip "$DOWNLOAD_URL"
unzip artifact.zip
DAWN_VERSION="v1.0.0"
DAWN_OWNER="reeselevine"
DAWN_REPO="dawn"
DAWN_ASSET_NAME="Dawn-a1a6b45cced25a3b7f4fb491e0ae70796cc7f22b-macos-latest-Release.tar.gz"
echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}"
curl -L -o artifact.tar.gz \
"https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}"
mkdir dawn
tar_file=$(find . -name '*.tar.gz' | head -n 1)
echo "Extracting: $tar_file"
tar -xvf "$tar_file" -C dawn --strip-components=1
tar -xvf artifact.tar.gz -C dawn --strip-components=1
- name: Build
id: cmake_build
@@ -433,31 +417,15 @@ jobs:
id: dawn-depends
run: |
sudo apt-get install -y libxrandr-dev libxinerama-dev libxcursor-dev mesa-common-dev libx11-xcb-dev libxi-dev
ARTIFACTS_JSON=$(curl -s -L \
-H "Accept: application/vnd.github+json" \
-H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
-H "X-GitHub-Api-Version: 2022-11-28" \
"https://api.github.com/repos/google/dawn/actions/artifacts")
echo "Finding latest ubuntu-latest-Release artifact..."
DOWNLOAD_URL=$(echo "$ARTIFACTS_JSON" | jq -r '.artifacts
| sort_by(.created_at)
| reverse
| map(select(.name | test("ubuntu-latest-Release$")))
| .[0].archive_download_url')
if [ "$DOWNLOAD_URL" = "null" ] || [ -z "$DOWNLOAD_URL" ]; then
echo "No suitable Dawn artifact found!"
exit 1
fi
echo "Downloading from: $DOWNLOAD_URL"
curl -L \
-H "Accept: application/vnd.github+json" \
-H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
-o artifact.zip "$DOWNLOAD_URL"
unzip artifact.zip
DAWN_VERSION="v1.0.0"
DAWN_OWNER="reeselevine"
DAWN_REPO="dawn"
DAWN_ASSET_NAME="Dawn-a1a6b45cced25a3b7f4fb491e0ae70796cc7f22b-ubuntu-latest-Release.tar.gz"
echo "Fetching release asset from https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}"
curl -L -o artifact.tar.gz \
"https://github.com/${DAWN_OWNER}/${DAWN_REPO}/releases/download/${DAWN_VERSION}/${DAWN_ASSET_NAME}"
mkdir dawn
tar_file=$(find . -name '*.tar.gz' | head -n 1)
echo "Extracting: $tar_file"
tar -xvf "$tar_file" -C dawn --strip-components=1
tar -xvf artifact.tar.gz -C dawn --strip-components=1
- name: Build
id: cmake_build

View File

@@ -0,0 +1,45 @@
name: Check Pre-Tokenizer Hashes
on:
push:
paths:
- 'convert_hf_to_gguf.py'
- 'convert_hf_to_gguf_update.py'
pull_request:
paths:
- 'convert_hf_to_gguf.py'
- 'convert_hf_to_gguf_update.py'
jobs:
pre-tokenizer-hashes:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.11'
- name: Install Python dependencies
run: |
python3 -m venv .venv
.venv/bin/pip install -r requirements/requirements-convert_hf_to_gguf_update.txt
- name: Update pre-tokenizer hashes
run: |
cp convert_hf_to_gguf.py /tmp
.venv/bin/python convert_hf_to_gguf_update.py --check-missing
- name: Check if committed pre-tokenizer hashes matches generated version
run: |
if ! diff -q convert_hf_to_gguf.py /tmp/convert_hf_to_gguf.py; then
echo "Model pre-tokenizer hashes (in convert_hf_to_gguf.py) do not match generated hashes (from convert_hf_to_gguf_update.py)."
echo "To fix: run ./convert_hf_to_gguf_update.py and commit the updated convert_hf_to_gguf.py along with your changes"
echo "Differences found:"
diff convert_hf_to_gguf.py /tmp/convert_hf_to_gguf.py || true
exit 1
fi
echo "Model pre-tokenizer hashes are up to date."

View File

@@ -24,6 +24,7 @@
#include <cstdarg>
#include <filesystem>
#include <fstream>
#include <list>
#include <regex>
#include <set>
#include <string>
@@ -2375,20 +2376,35 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
}
throw std::invalid_argument("unknown buffer type");
}
// FIXME: this leaks memory
params.tensor_buft_overrides.push_back({strdup(tensor_name.c_str()), buft_list.at(buffer_type)});
// keep strings alive and avoid leaking memory by storing them in a static vector
static std::list<std::string> buft_overrides;
buft_overrides.push_back(tensor_name);
params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), buft_list.at(buffer_type)});
}
}
));
add_opt(common_arg(
{"--cpu-moe"},
"use CPU for Mixture of Experts (MoE) weights",
{"--cpu-moe", "-cmoe"},
"keep all Mixture of Experts (MoE) weights in the CPU",
[](common_params & params) {
params.tensor_buft_overrides.push_back({"\\.ffn_up_exps\\.weight$", ggml_backend_cpu_buffer_type()});
params.tensor_buft_overrides.push_back({"\\.ffn_down_exps\\.weight$", ggml_backend_cpu_buffer_type()});
params.tensor_buft_overrides.push_back({"\\.ffn_gate_exps\\.weight$", ggml_backend_cpu_buffer_type()});
params.tensor_buft_overrides.push_back({"\\.ffn_(up|down|gate)_exps", ggml_backend_cpu_buffer_type()});
}
).set_env("LLAMA_ARG_CPU_MOE"));
add_opt(common_arg(
{"--n-cpu-moe", "-ncmoe"}, "N",
"keep the Mixture of Experts (MoE) weights of the first N layers in the CPU",
[](common_params & params, int value) {
if (value < 0) {
throw std::invalid_argument("invalid value");
}
for (int i = 0; i < value; ++i) {
// keep strings alive and avoid leaking memory by storing them in a static vector
static std::list<std::string> buft_overrides;
buft_overrides.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i));
params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), ggml_backend_cpu_buffer_type()});
}
}
).set_env("LLAMA_ARG_N_CPU_MOE"));
add_opt(common_arg(
{"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
"number of layers to store in VRAM",
@@ -2647,6 +2663,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.n_out_freq = value;
}
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
add_opt(common_arg(
{"--output-format"}, "{gguf,dat}",
string_format("output format for imatrix file (default: %s)", params.imat_dat > 0 ? "dat" : "gguf"),
[](common_params & params, const std::string & value) {
/**/ if (value == "gguf") { params.imat_dat = -1; }
else if (value == "dat") { params.imat_dat = 1; }
else { throw std::invalid_argument("invalid output format"); }
}
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
add_opt(common_arg(
{"--save-frequency"}, "N",
string_format("save an imatrix copy every N iterations (default: %d)", params.n_save_freq),

View File

@@ -1646,7 +1646,7 @@ static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {
"|<function name=\"([^\"]+)\">" // match 5 (function name again)
);
if (auto res = builder.try_find_regex(open_regex)) {
while (auto res = builder.try_find_regex(open_regex)) {
const auto & block_start = res->groups[1];
std::string block_end = block_start.empty() ? "" : "```";
@@ -1668,7 +1668,6 @@ static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {
builder.consume_literal(block_end);
builder.consume_spaces();
}
builder.add_content(builder.consume_rest());
} else {
throw common_chat_msg_partial_exception("failed to parse tool call");
}
@@ -1693,11 +1692,10 @@ static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {
builder.consume_spaces();
}
}
builder.add_content(builder.consume_rest());
}
} else {
builder.add_content(builder.consume_rest());
}
builder.add_content(builder.consume_rest());
}
static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) {

View File

@@ -439,6 +439,7 @@ struct common_params {
int32_t n_out_freq = 10; // output the imatrix every n_out_freq iterations
int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations
int32_t i_chunk = 0; // start processing from this chunk
int8_t imat_dat = 0; // whether the legacy imatrix.dat format should be output (gguf <= 0 < dat)
bool process_output = false; // collect data for the output tensor
bool compute_ppl = true; // whether to compute perplexity

View File

@@ -678,6 +678,9 @@ class TextModel(ModelBase):
if chkhsh == "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2":
# ref: https://huggingface.co/THUDM/glm-4-9b-hf
res = "glm4"
if chkhsh == "9ca2dd618e8afaf09731a7cf6e2105b373ba6a1821559f258b272fe83e6eb902":
# ref: https://huggingface.co/zai-org/GLM-4.5-Air
res = "glm4"
if chkhsh == "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35":
# ref: https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0
res = "minerva-7b"
@@ -702,6 +705,9 @@ class TextModel(ModelBase):
if chkhsh == "81212dc7cdb7e0c1074ca62c5aeab0d43c9f52b8a737be7b12a777c953027890":
# ref: https://huggingface.co/moonshotai/Kimi-K2-Base
res = "kimi-k2"
if chkhsh == "d4540891389ea895b53b399da6ac824becc30f2fba0e9ddbb98f92e55ca0e97c":
# ref: https://huggingface.co/Qwen/Qwen3-Embedding-0.6B
res = "qwen2"
if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
# ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
res = "llama-bpe"
@@ -849,9 +855,9 @@ class TextModel(ModelBase):
if chkhsh == "2085e1638f6c377a0aa4ead21b27bb4cb941bf800df86ed391011769c1758dfb":
# ref: https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B
res = "exaone4"
if chkhsh == "d4540891389ea895b53b399da6ac824becc30f2fba0e9ddbb98f92e55ca0e97c":
# ref: https://huggingface.co/Qwen/Qwen3-Embedding-8B
res = "qwen2"
if chkhsh == "a1e163ecab2e718a4c829d1148b6e86824ec36163bb71941c3dca9cd5ac25756":
# ref: https://huggingface.co/JetBrains/Mellum-4b-base
res = "mellum"
if res is None:
logger.warning("\n")
@@ -6059,6 +6065,7 @@ class DeepseekModel(TextModel):
@ModelBase.register("DeepseekV2ForCausalLM")
@ModelBase.register("DeepseekV3ForCausalLM")
@ModelBase.register("KimiVLForConditionalGeneration")
class DeepseekV2Model(TextModel):
model_arch = gguf.MODEL_ARCH.DEEPSEEK2
@@ -6161,6 +6168,13 @@ class DeepseekV2Model(TextModel):
_experts: list[dict[str, Tensor]] | None = None
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
# skip vision tensors and remove "language_model." for Kimi-VL
if "vision_tower" in name or "multi_modal_projector" in name:
return []
if name.startswith("language_model."):
name = name.replace("language_model.", "")
# rename e_score_correction_bias tensors
if name.endswith("e_score_correction_bias"):
name = name.replace("e_score_correction_bias", "e_score_correction.bias")
@@ -6685,6 +6699,139 @@ class Glm4Model(TextModel):
return super().modify_tensors(data_torch, name, bid)
@ModelBase.register("Glm4MoeForCausalLM")
class Glm4MoeModel(TextModel):
model_arch = gguf.MODEL_ARCH.GLM4_MOE
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
# GLM4_MOE has num_hidden_layers + 1 actual layers (including NextN layer)
self.block_count = self.hparams["num_hidden_layers"] + self.hparams.get("num_nextn_predict_layers", 0)
self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
def set_vocab(self):
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
tokens, toktypes, tokpre = self.get_vocab_base()
self.gguf_writer.add_tokenizer_model("gpt2")
self.gguf_writer.add_tokenizer_pre(tokpre)
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_types(toktypes)
# Special tokens
# Note: Using <|endoftext|> (151329) for eot causes endless generation
special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["[gMASK]"]) # 151331
special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"]) # 151336
special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"]) # 151329
special_vocab._set_special_token("eom", tokenizer.get_added_vocab()["<|observation|>"]) # 151338
# Patch broken chat template
if isinstance(special_vocab.chat_template, str) and "visible_text(m.content).endswith" in special_vocab.chat_template:
special_vocab.chat_template = special_vocab.chat_template.replace(
"""{{ visible_text(m.content) }}\n{{- '/nothink' if (enable_thinking is defined and not enable_thinking and not visible_text(m.content).endswith("/nothink")) else '' -}}""",
"""{% set content = visible_text(m.content) %}{{ content }}\n{{- '/nothink' if (enable_thinking is defined and not enable_thinking and not content.endswith("/nothink")) else '' -}}""")
special_vocab.add_to_gguf(self.gguf_writer)
def set_gguf_parameters(self):
super().set_gguf_parameters()
if (rope_dim := self.hparams.get("head_dim")) is None:
rope_dim = (
self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
)
self.gguf_writer.add_rope_dimension_count(
int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5))
)
# MoE parameters - Use only routed expert count (shared experts handled separately)
if (n_routed_experts := self.hparams.get("n_routed_experts")) is not None:
self.gguf_writer.add_expert_count(n_routed_experts)
if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
if (n_shared_experts := self.hparams.get("n_shared_experts")) is not None:
self.gguf_writer.add_expert_shared_count(n_shared_experts)
if (first_k_dense_replace := self.hparams.get("first_k_dense_replace")) is not None:
self.gguf_writer.add_leading_dense_block_count(first_k_dense_replace)
# Expert gating function (sigmoid for GLM4_MOE)
self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
# Routed scaling factor
if (routed_scaling_factor := self.hparams.get("routed_scaling_factor")) is not None:
self.gguf_writer.add_expert_weights_scale(routed_scaling_factor)
# Normalise topk probabilities
if (norm_topk_prob := self.hparams.get("norm_topk_prob")) is not None:
self.gguf_writer.add_expert_weights_norm(norm_topk_prob)
# NextN/MTP prediction layers
if (num_nextn_predict_layers := self.hparams.get("num_nextn_predict_layers")) is not None:
self.gguf_writer.add_nextn_predict_layers(num_nextn_predict_layers)
_experts: list[dict[str, Tensor]] | None = None
def modify_tensors(
self, data_torch: Tensor, name: str, bid: int | None
) -> Iterable[tuple[str, Tensor]]:
if name.startswith("model.visual."): # ignore visual part
return []
elif name.startswith("model.language_model."):
name = name.replace("language_model.", "") # for multimodal variants
# Handle main token embedding (but not layer-specific NextN embeddings)
if name == "model.embed_tokens.weight" and ".layers." not in name:
return [(self.map_tensor_name("token_embd.weight"), data_torch)]
# Handle routed experts
if name.find("mlp.experts") != -1:
n_experts = self.hparams["n_routed_experts"]
assert bid is not None
if self._experts is None:
self._experts = [{} for _ in range(self.block_count)]
self._experts[bid][name] = data_torch
if len(self._experts[bid]) >= n_experts * 3:
tensors: list[tuple[str, Tensor]] = []
# merge the experts into a single 3d tensor
for w_name in ["down_proj", "gate_proj", "up_proj"]:
datas: list[Tensor] = []
for xid in range(n_experts):
ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
datas.append(self._experts[bid][ename])
del self._experts[bid][ename]
data_torch = torch.stack(datas, dim=0)
merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
new_name = self.map_tensor_name(merged_name)
tensors.append((new_name, data_torch))
return tensors
else:
return []
if name.endswith("e_score_correction_bias"):
name = name.replace("e_score_correction_bias", "e_score_correction.bias")
new_name = self.map_tensor_name(name)
return [(new_name, data_torch)]
def prepare_tensors(self):
super().prepare_tensors()
if self._experts is not None:
# flatten `list[dict[str, Tensor]]` into `list[str]`
experts = [k for d in self._experts for k in d.keys()]
if len(experts) > 0:
raise ValueError(f"Unprocessed experts: {experts}")
@ModelBase.register("GlmForCausalLM", "ChatGLMModel", "ChatGLMForConditionalGeneration")
class ChatGLMModel(TextModel):
model_arch = gguf.MODEL_ARCH.CHATGLM

View File

@@ -59,6 +59,10 @@ parser.add_argument(
"--full", action="store_true",
help="download full list of models - make sure you have access to all of them",
)
parser.add_argument(
"--check-missing", action="store_true",
help="only check for missing pre-tokenizer hashes",
)
parser.add_argument(
"hf_token",
help="optional HF token",
@@ -70,6 +74,10 @@ hf_token = args.hf_token if args.hf_token is not None else hf_token
if hf_token is None:
logger.warning("HF token not found. You can provide it as an argument or set it in ~/.cache/huggingface/token")
if args.check_missing and args.full:
logger.warning("Downloading full list of models requested, ignoring --check-missing!")
args.check_missing = False
# TODO: this string has to exercise as much pre-tokenizer functionality as possible
# will be updated with time - contributions welcome
CHK_TXT = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天 ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
@@ -130,6 +138,7 @@ models = [
{"name": "midm-2.0", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct", },
{"name": "lfm2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LiquidAI/LFM2-Tokenizer"},
{"name": "exaone4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B", },
{"name": "mellum", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/JetBrains/Mellum-4b-base", },
]
# some models are known to be broken upstream, so we will skip them as exceptions
@@ -138,6 +147,7 @@ pre_computed_hashes = [
{"name": "chatglm-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-chat", "chkhsh": "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b"},
{"name": "chatglm-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-chat", "chkhsh": "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516"},
{"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", "chkhsh": "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2"},
{"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/zai-org/GLM-4.5-Air", "chkhsh": "9ca2dd618e8afaf09731a7cf6e2105b373ba6a1821559f258b272fe83e6eb902"},
{"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", "chkhsh": "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35"},
{"name": "hunyuan", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tencent/Hunyuan-A13B-Instruct", "chkhsh": "7e57df22b1fe23a7b1e1c7f3dc4e3f96d43a4eb0836d0c6bdc3436d7b2f1c664"},
{"name": "hunyuan-dense", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tencent/Hunyuan-4B-Instruct", "chkhsh": "bba3b3366b646dbdded5dbc42d59598b849371afc42f7beafa914afaa5b70aa6"},
@@ -147,6 +157,7 @@ pre_computed_hashes = [
{"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-7B-Base", "chkhsh": "3eda48b4c4dc7de733d1a8b3e3b4a85243dbbf704da2ee9d42c6beced8897896"},
{"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-34B-Base", "chkhsh": "48f8e02c0359c0bbdd82f26909171fac1c18a457bb47573ed1fe3bbb2c1cfd4b"},
{"name": "kimi-k2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/moonshotai/Kimi-K2-Base", "chkhsh": "81212dc7cdb7e0c1074ca62c5aeab0d43c9f52b8a737be7b12a777c953027890"},
{"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen3-Embedding-0.6B", "chkhsh": "d4540891389ea895b53b399da6ac824becc30f2fba0e9ddbb98f92e55ca0e97c"},
]
@@ -221,12 +232,13 @@ if not args.full:
all_models = models.copy()
models = [model for model in all_models if model["name"] not in existing_models]
logging.info(f"Downloading {len(models)} models...")
for model in models:
try:
download_model(model)
except Exception as e:
logger.error(f"Failed to download model {model['name']}. Error: {e}")
if not args.check_missing:
logging.info(f"Downloading {len(models)} models...")
for model in models:
try:
download_model(model)
except Exception as e:
logger.error(f"Failed to download model {model['name']}. Error: {e}")
# generate the source code for the convert_hf_to_gguf.py:get_vocab_base_pre() function:

View File

@@ -39,8 +39,9 @@ if (WIN32)
set(CMAKE_SHARED_MODULE_PREFIX "")
endif()
option(BUILD_SHARED_LIBS "ggml: build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})
option(GGML_BACKEND_DL "ggml: build backends as dynamic libraries (requires BUILD_SHARED_LIBS)" OFF)
option(BUILD_SHARED_LIBS "ggml: build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})
option(GGML_BACKEND_DL "ggml: build backends as dynamic libraries (requires BUILD_SHARED_LIBS)" OFF)
set(GGML_BACKEND_DIR "" CACHE PATH "ggml: directory to load dynamic backends from (requires GGML_BACKEND_DL")
#
# option list

View File

@@ -106,7 +106,7 @@ if(NOT TARGET ggml::ggml)
find_library(GGML_LIBRARY ggml
REQUIRED
HINTS ${GGML_LIB_DIR}
HINTS ${GGML_LIB_DIR} ${GGML_BACKEND_DIR}
NO_CMAKE_FIND_ROOT_PATH)
add_library(ggml::ggml UNKNOWN IMPORTED)

View File

@@ -214,6 +214,13 @@ add_library(ggml
ggml-backend-reg.cpp)
add_library(ggml::ggml ALIAS ggml)
if (GGML_BACKEND_DIR)
if (NOT GGML_BACKEND_DL)
message(FATAL_ERROR "GGML_BACKEND_DIR requires GGML_BACKEND_DL")
endif()
target_compile_definitions(ggml PUBLIC GGML_BACKEND_DIR="${GGML_BACKEND_DIR}")
endif()
target_link_libraries(ggml PUBLIC ggml-base)
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
@@ -227,7 +234,11 @@ function(ggml_add_backend_library backend)
set_target_properties(${backend} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
target_compile_definitions(${backend} PRIVATE GGML_BACKEND_DL)
add_dependencies(ggml ${backend})
install(TARGETS ${backend} LIBRARY DESTINATION ${CMAKE_INSTALL_BINDIR})
if (GGML_BACKEND_DIR)
install(TARGETS ${backend} LIBRARY DESTINATION ${GGML_BACKEND_DIR})
else()
install(TARGETS ${backend} LIBRARY DESTINATION ${CMAKE_INSTALL_BINDIR})
endif()
else()
add_library(${backend} ${ARGN})
target_link_libraries(ggml PUBLIC ${backend})

View File

@@ -498,6 +498,9 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
std::vector<fs::path> search_paths;
if (user_search_path == nullptr) {
#ifdef GGML_BACKEND_DIR
search_paths.push_back(fs::u8path(GGML_BACKEND_DIR));
#endif
// default search paths: executable directory, current directory
search_paths.push_back(get_executable_path());
search_paths.push_back(fs::current_path());

View File

@@ -315,8 +315,9 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
const bool gqa_opt_applies = ((Q->ne[2] / K->ne[2]) % 2 == 0) && mask; // The mma-based kernels have GQA-specific optimizations
const bool mma_needs_data_conversion = K->type != GGML_TYPE_F16 || V->type != GGML_TYPE_F16;
const bool mma_faster_for_bs1 = new_mma_available(cc) && gqa_opt_applies &&
(Q->ne[3] > 1 || cc < GGML_CUDA_CC_ADA_LOVELACE) && !mma_needs_data_conversion;
const bool mma_faster_for_rtx4000 = Q->ne[3] > 1 || (Q->ne[2] > 4*K->ne[2] && K->ne[1] >= 8192);
const bool mma_faster_for_bs1 = new_mma_available(cc) && gqa_opt_applies && !mma_needs_data_conversion &&
(cc < GGML_CUDA_CC_ADA_LOVELACE || mma_faster_for_rtx4000);
const bool can_use_vector_kernel = Q->ne[0] <= 256 && Q->ne[0] % (2*warp_size) == 0;
if (Q->ne[1] == 1 && can_use_vector_kernel && !mma_faster_for_bs1) {
if (prec == GGML_PREC_DEFAULT) {

View File

@@ -1852,6 +1852,9 @@ static void ggml_cuda_mul_mat_batched_cublas_impl(ggml_backend_cuda_context & ct
ggml_cuda_pool_alloc<cuda_t> src0_alloc(ctx.pool());
ggml_cuda_pool_alloc<cuda_t> src1_alloc(ctx.pool());
bool is_src0_cont_2 = ggml_is_contiguous_2(src0);
bool is_src1_cont_2 = ggml_is_contiguous_2(src1);
// Handle src0
src0_ptr = (const cuda_t *) src0->data;
@@ -1870,6 +1873,8 @@ static void ggml_cuda_mul_mat_batched_cublas_impl(ggml_backend_cuda_context & ct
s11 = ne10;
s12 = ne11*s11;
s13 = ne12*s12;
is_src1_cont_2 = true;
}
// Setup destination buffer
@@ -1918,15 +1923,19 @@ static void ggml_cuda_mul_mat_batched_cublas_impl(ggml_backend_cuda_context & ct
const int64_t r2 = ne12/ne02;
const int64_t r3 = ne13/ne03;
if (r2 == 1 && r3 == 1 && ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) {
if (r2 == 1 && r3 == 1 && is_src0_cont_2 && is_src1_cont_2) {
// with a [0, 2, 1, 3] perm. and ne02==1 the matrix strides need to be determined from dim 3:
const int64_t sma = ne02 == 1 ? nb03/nb00 : nb02/nb00;
const int64_t smb = ne12 == 1 ? s13 : s12;
// there is no broadcast and src0, src1 are contiguous across dims 2, 3
// use cublasGemmStridedBatchedEx
CUBLAS_CHECK(
cublasGemmStridedBatchedEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N,
ne01, ne11, ne10,
alpha, src0_ptr, cu_data_type_a, nb01/nb00, nb02/nb00, // strideA
src1_ptr, cu_data_type_b, s11, s12, // strideB
beta, dst_t, cu_data_type, ne0, ne1*ne0, // strideC
alpha, src0_ptr, cu_data_type_a, nb01/nb00, sma, // strideA
src1_ptr, cu_data_type_b, s11, smb, // strideB
beta, dst_t, cu_data_type, ne0, ne1*ne0, // strideC
ne12*ne13,
cu_compute_type,
CUBLAS_GEMM_DEFAULT_TENSOR_OP));

View File

@@ -1,65 +1,75 @@
#include "im2col.cuh"
#define MIN(a, b) (a) < (b) ? (a) : (b)
#define MAX_GRIDDIM_Z 65535
template <typename T>
static __global__ void im2col_kernel(
const float * x, T * dst, int64_t batch_offset,
int64_t offset_delta, int64_t IC, int64_t IW, int64_t IH, int64_t OH, int64_t OW, int64_t KW, int64_t KH, int64_t pelements, int64_t CHW,
const float * x, T * dst,
int64_t IC, int64_t IW, int64_t IH, int64_t OH, int64_t OW, int64_t KW, int64_t KH,
int64_t IC_IH_IW, int64_t IH_IW, int64_t N_OH, int64_t KH_KW, int64_t IC_KH_KW,
int s0, int s1, int p0, int p1, int d0, int d1) {
const int64_t i = threadIdx.x + blockIdx.x * blockDim.x;
if (i >= pelements) {
if (i >= IC_KH_KW) {
return;
}
const int64_t ksize = OW * KH;
const int64_t kx = i / ksize;
const int64_t kd = kx * ksize;
const int64_t ky = (i - kd) / OW;
const int64_t ix = i % OW;
const int64_t iic = i / (KH_KW);
const int64_t rem = i - iic * KH_KW;
const int64_t ikh = rem / KW;
const int64_t ikw = rem - ikh * KW;
const int64_t oh = blockIdx.y;
const int64_t batch = blockIdx.z / IC;
const int64_t ic = blockIdx.z % IC;
const int64_t iow = blockIdx.y;
for (int64_t iz = blockIdx.z; iz < N_OH; iz+=MAX_GRIDDIM_Z) {
const int64_t in = iz / OH;
const int64_t ioh = iz - in * OH;
const int64_t iiw = ix * s0 + kx * d0 - p0;
const int64_t iih = oh * s1 + ky * d1 - p1;
const int64_t iiw = iow * s0 + ikw * d0 - p0;
const int64_t iih = ioh * s1 + ikh * d1 - p1;
const int64_t offset_dst =
((batch * OH + oh) * OW + ix) * CHW +
(ic * (KW * KH) + ky * KW + kx);
const int64_t offset_dst =
((in * OH + ioh) * OW + iow) * IC_KH_KW + iic * KH_KW + ikh * KW + ikw;
if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
dst[offset_dst] = 0.0f;
} else {
const int64_t offset_src = ic * offset_delta + batch * batch_offset;
dst[offset_dst] = x[offset_src + iih * IW + iiw];
if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
dst[offset_dst] = 0.0f;
} else {
const int64_t offset_src = iic * IC_IH_IW + in * IH_IW;
dst[offset_dst] = x[offset_src + iih * IW + iiw];
}
}
}
// im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
template <typename T>
static void im2col_cuda(const float * x, T* dst,
int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC,
int64_t batch, int64_t batch_offset, int64_t offset_delta,
int64_t N, int64_t IC_IH_IW, int64_t IH_IW,
int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {
const int parallel_elements = OW * KW * KH;
const int num_blocks = (parallel_elements + CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE;
dim3 block_nums(num_blocks, OH, batch * IC);
im2col_kernel<<<block_nums, CUDA_IM2COL_BLOCK_SIZE, 0, stream>>>(x, dst, batch_offset, offset_delta, IC, IW, IH, OH, OW, KW, KH, parallel_elements, (IC * KH * KW), s0, s1, p0, p1, d0, d1);
const int64_t IC_KH_KW = IC * KH * KW;
const int64_t num_blocks = (IC_KH_KW + CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE;
const int64_t N_OH = N * OH;
const int64_t KH_KW = KW*KH;
dim3 block_nums(num_blocks, OW, MIN(N_OH, MAX_GRIDDIM_Z));
im2col_kernel<<<block_nums, MIN(IC_KH_KW, CUDA_IM2COL_BLOCK_SIZE) , 0, stream>>>(x, dst, IC, IW, IH, OH, OW, KW, KH,
IC_IH_IW, IH_IW, N_OH, KH_KW, IC_KH_KW,
s0, s1, p0, p1, d0, d1);
}
static void im2col_cuda_f16(const float * x, half * dst,
int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC,
int64_t batch, int64_t batch_offset, int64_t offset_delta,
int64_t N, int64_t IC_IH_IW, int64_t IH_IW,
int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {
im2col_cuda<half>(x, dst, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, offset_delta, s0, s1, p0, p1, d0, d1, stream);
im2col_cuda<half>(x, dst, IW, IH, OW, OH, KW, KH, IC, N, IC_IH_IW, IH_IW, s0, s1, p0, p1, d0, d1, stream);
}
static void im2col_cuda_f32(const float * x, float * dst,
int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC,
int64_t batch, int64_t batch_offset, int64_t offset_delta,
int64_t N, int64_t IC_IH_IW, int64_t IH_IW,
int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) {
im2col_cuda<float>(x, dst, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, offset_delta, s0, s1, p0, p1, d0, d1, stream);
im2col_cuda<float>(x, dst, IW, IH, OW, OH, KW, KH, IC, N, IC_IH_IW, IH_IW, s0, s1, p0, p1, d0, d1, stream);
}
void ggml_cuda_op_im2col(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
@@ -91,13 +101,13 @@ void ggml_cuda_op_im2col(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const int64_t OH = is_2D ? dst->ne[2] : 1;
const int64_t OW = dst->ne[1];
const size_t delta_offset = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
const int64_t batch = src1->ne[is_2D ? 3 : 2];
const size_t batch_offset = src1->nb[is_2D ? 3 : 2] / 4; // nb is byte offset, src is type float32
const int64_t IC_IH_IW = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
const int64_t N = src1->ne[is_2D ? 3 : 2];
const int64_t IH_IW = src1->nb[is_2D ? 3 : 2] / 4; // nb is byte offset, src is type float32
if(dst->type == GGML_TYPE_F16) {
im2col_cuda_f16(src1_d, (half *) dst_d, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, stream);
im2col_cuda_f16(src1_d, (half *) dst_d, IW, IH, OW, OH, KW, KH, IC, N, IC_IH_IW, IH_IW, s0, s1, p0, p1, d0, d1, stream);
} else {
im2col_cuda_f32(src1_d, (float *) dst_d, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, stream);
im2col_cuda_f32(src1_d, (float *) dst_d, IW, IH, OW, OH, KW, KH, IC, N, IC_IH_IW, IH_IW, s0, s1, p0, p1, d0, d1, stream);
}
}

View File

@@ -2046,8 +2046,8 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
backend_ctx->adreno_cl_compiler_version = get_adreno_cl_compiler_version(driver_version);
backend_ctx->has_vector_subgroup_broadcast =
backend_ctx->adreno_cl_compiler_version.major >= 47 ||
backend_ctx->adreno_cl_compiler_version.major == 17;
(backend_ctx->adreno_cl_compiler_version.type == E031 && backend_ctx->adreno_cl_compiler_version.major >= 47) ||
(backend_ctx->adreno_cl_compiler_version.type == DX && backend_ctx->adreno_cl_compiler_version.major >= 17);
GGML_LOG_INFO("ggml_opencl: vector subgroup broadcast support: %s\n",
backend_ctx->has_vector_subgroup_broadcast ? "true" : "false");

View File

@@ -2609,6 +2609,8 @@ static void ggml_sycl_mul_mat_vec_nc(ggml_backend_sycl_context & ctx, const ggml
GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(src0->buffer));
GGML_ASSERT(src0->type == GGML_TYPE_F16);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT(src1->ne[1] == 1);
GGML_ASSERT(src1->ne[3] == 1);
const int64_t ne00 = src0->ne[0];
const int64_t ne01 = src0->ne[1];
@@ -2688,6 +2690,9 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
const size_t type_size_src0 = ggml_type_size(src0->type);
const size_t type_size_src1 = ggml_type_size(src1->type);
bool is_src0_cont_2 = ggml_is_contiguous_2(src0);
bool is_src1_cont_2 = ggml_is_contiguous_2(src1);
// SRC1 strides
int64_t s11 = nb11 / type_size_src1;
int64_t s12 = nb12 / type_size_src1;
@@ -2737,6 +2742,8 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
s11 = ne10;
s12 = ne11 * s11;
s13 = ne12 * s12;
is_src1_cont_2 = true;
}
ggml_sycl_pool_alloc<sycl::half> dst_f16(ctx.pool());
@@ -2852,12 +2859,16 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
else
#endif
{
if (r2 == 1 && r3 == 1 && ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) {
if (r2 == 1 && r3 == 1 && is_src0_cont_2 && is_src1_cont_2) {
// with a [0, 2, 1, 3] perm. and ne02==1 the matrix strides need to be determined from dim 3:
const int64_t sma = ne02 == 1 ? nb03/nb00 : nb02/nb00;
const int64_t smb = ne12 == 1 ? s13 : s12;
// there is no broadcast and src0, src1 are contiguous across dims 2, 3
SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(*queue, oneapi::math::transpose::trans,
oneapi::math::transpose::nontrans, ne01, ne11, ne10, alpha,
src0_f16, dpct::library_data_t::real_half, nb01 / nb00, nb02 / nb00,
src1_f16, dpct::library_data_t::real_half, s11, s12, beta, dst_ddf,
src0_f16, dpct::library_data_t::real_half, nb01 / nb00, sma,
src1_f16, dpct::library_data_t::real_half, s11, smb, beta, dst_ddf,
mkl_data_type, ne0, ne1 * ne0, ne12 * ne13, mkl_compute_type)));
} else {
const int ne23 = ne12 * ne13;
@@ -3187,7 +3198,7 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
// The kernel from the if path is faster for that specific case, but does not support all mul mats.
ggml_sycl_mul_mat_batched_sycl(ctx, src0, src1, dst);
}
} else if (!split && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
} else if (!split && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1 && src1->ne[3] == 1) {
// KQV single-batch
ggml_sycl_mul_mat_vec_nc(ctx, src0, src1, dst);
} else if (!split && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2] * src1->ne[3] > 1) {

View File

@@ -2106,12 +2106,12 @@ static void ggml_vk_load_shaders(vk_device& device) {
s_mmq_wg_denoms = { 32, 64, 1 };
// spec constants and tile sizes for quant matmul (Qi_K)
l_warptile_mmq_k = { 256, 64, 128, 64, 1 };
m_warptile_mmq_k = { 256, 32, 64, 64, 0 };
s_warptile_mmq_k = { 256, 32, 32, 128, 0 };
l_mmq_wg_denoms_k = { 64, 128, 1 };
m_mmq_wg_denoms_k = { 32, 64, 1 };
s_mmq_wg_denoms_k = { 32, 32, 1 };
l_warptile_mmq_k = { 256, 128, 256, 64, 1 };
m_warptile_mmq_k = { 256, 128, 128, 64, 1 };
s_warptile_mmq_k = { 256, 32, 64, 128, 0 };
l_mmq_wg_denoms_k = { 128, 256, 1 };
m_mmq_wg_denoms_k = { 128, 128, 1 };
s_mmq_wg_denoms_k = { 32, 64, 1 };
// spec constants and tile sizes for quant matmul_id
l_warptile_mmqid = { 256, 128, 128, 16, 0 };
@@ -3096,6 +3096,12 @@ static void ggml_vk_load_shaders(vk_device& device) {
uint32_t conv2d_SHMEM_PAD = 4;
bool conv2d_UNROLL = true;
#if defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
if (device->coopmat2) {
conv2d_SHMEM_PAD = 8; // 8 float16_t
}
#endif
if (device->vendor_id == VK_VENDOR_ID_INTEL) {
conv2d_SHMEM_PAD = 0;
conv2d_UNROLL = false;
@@ -3154,6 +3160,16 @@ static void ggml_vk_load_shaders(vk_device& device) {
std::array<uint32_t, 3> wg_denoms = { conv2d_BS_K, conv2d_BS_NPQ, 1 };
std::vector<uint32_t> spec_constants = { conv2d_WG_SIZE, conv2d_BS_K, conv2d_BS_CRS, conv2d_BS_NPQ, conv2d_TS_K, use_collectives, conv2d_SHMEM_PAD };
#if defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
if (device->coopmat2) {
ggml_vk_create_pipeline(
device, device->pipeline_conv2d_f32[s], "conv2d_f32", conv2d_f32_cm2_len, conv2d_f32_cm2_data, "main", 3,
sizeof(vk_op_conv2d_push_constants), wg_denoms, spec_constants, 1, true, use_collectives);
ggml_vk_create_pipeline(
device, device->pipeline_conv2d_f16_f32[s], "conv2d_f16_f32", conv2d_f16_f32_cm2_len, conv2d_f16_f32_cm2_data, "main", 3,
sizeof(vk_op_conv2d_push_constants), wg_denoms, spec_constants, 1, true, use_collectives);
} else
#endif
if (conv2d_UNROLL) {
ggml_vk_create_pipeline(
device, device->pipeline_conv2d_f32[s], "conv2d_f32", conv2d_f32_unroll_len, conv2d_f32_unroll_data, "main", 3,
@@ -5022,26 +5038,37 @@ static void ggml_vk_buffer_memset(vk_buffer& dst, size_t offset, uint32_t c, siz
ggml_vk_queue_command_pools_cleanup(dst->device);
}
static uint32_t ggml_vk_guess_split_k(ggml_backend_vk_context * ctx, int m, int n, int k, const vk_pipeline& pipeline) {
static uint32_t ggml_vk_guess_split_k(ggml_backend_vk_context * ctx, uint32_t m, uint32_t n, uint32_t k, const vk_pipeline& pipeline) {
VK_LOG_DEBUG("ggml_vk_guess_split_k(" << m << ", " << n << ", " << k << ")");
uint32_t split_k = 1;
if (ctx->device->shader_core_count != 0 && m >= (int)pipeline->wg_denoms[0] && n >= (int)pipeline->wg_denoms[1]) {
if (ctx->device->shader_core_count != 0 && m >= pipeline->wg_denoms[0] && n >= pipeline->wg_denoms[1]) {
// If k is 'large' and the SMs will fill less than halfway, use split_k.
uint32_t m_tiles = CEIL_DIV(m, pipeline->wg_denoms[0]);
uint32_t n_tiles = CEIL_DIV(n, pipeline->wg_denoms[1]);
if (k >= 2048 && m_tiles * n_tiles < ctx->device->shader_core_count / 2) {
split_k = ctx->device->shader_core_count / (m_tiles * n_tiles);
// Clamp to 2 or 4
split_k = std::min(split_k, 4u);
if (split_k == 3) {
split_k = 2;
if (k >= 2048) {
if (m_tiles * n_tiles <= ctx->device->shader_core_count / 2) {
split_k = ctx->device->shader_core_count / (m_tiles * n_tiles);
} else if (m_tiles * n_tiles <= ctx->device->shader_core_count * 2 / 3) {
split_k = 3;
}
if (ctx->device->coopmat2) {
// coopmat2 shader expects splits to be aligned to 256
while (split_k > 1 && ((k / split_k) % 256) != 0) {
split_k /= 2;
// Cap the split at 8x. Unless k is huge this is a lot of overhead.
split_k = std::min(split_k, 8u);
// ggml_vk_matmul will align the splits to be a multiple of 256.
// If this rounded up size would cause the last split to be empty,
// then reduce the split count.
while (true) {
if (split_k == 1) {
break;
}
uint32_t k_split = CEIL_DIV(k, split_k);
k_split = ROUNDUP_POW2(k_split, 256);
if (k_split * (split_k - 1) < k) {
break;
}
split_k--;
}
}
}
@@ -5053,9 +5080,22 @@ static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx,
VK_LOG_DEBUG("ggml_vk_guess_matmul_pipeline(" << m << ", " << n << ", " << aligned << ", " << ggml_type_name(src0_type) << ", " << ggml_type_name(src1_type) << ")");
if (ctx->device->coopmat2) {
const uint32_t shader_core_count = ctx->device->shader_core_count;
const uint32_t tiles_l = CEIL_DIV(m, mmp->a_l->wg_denoms[0]) * CEIL_DIV(n, mmp->a_l->wg_denoms[1]);
const uint32_t tiles_m = CEIL_DIV(m, mmp->a_m->wg_denoms[0]) * CEIL_DIV(n, mmp->a_m->wg_denoms[1]);
// Use large shader when the N dimension is greater than the medium shader's tile size
uint32_t crossover_large = mmp->m->wg_denoms[1];
if ((ctx->device->mul_mat_l[src0_type] && (n > crossover_large)) || (!ctx->device->mul_mat_m[src0_type] && !ctx->device->mul_mat_s[src0_type])) {
// Prefer large over medium if either:
// - medium or large tiles would overfill the GPU
// - large tiles with a split_k==3 fits in the GPU and medium tiles with split_k==2 does not
// (medium with split_k==2 is probably better if it fits - more workgroups running and less split_k overhead)
bool prefer_large = tiles_m > shader_core_count || tiles_l > shader_core_count ||
// split_k==3 with large tiles likely better than medium tiles with no split_k.
(tiles_l <= shader_core_count / 3 && tiles_m > shader_core_count / 2);
if ((ctx->device->mul_mat_l[src0_type] && (n > crossover_large && prefer_large)) || (!ctx->device->mul_mat_m[src0_type] && !ctx->device->mul_mat_s[src0_type])) {
return aligned ? mmp->a_l : mmp->l;
}
// Use medium shader when the N dimension is greater than the small shader's tile size
@@ -5099,7 +5139,11 @@ static void ggml_vk_matmul(
GGML_ASSERT(batch_stride_d == m * n);
const vk_mat_mat_push_constants pc1 = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, CEIL_DIV(k, split_k), ne02, ne12, broadcast2, broadcast3, padded_n };
// Round the split size up to a multiple of 256 (k-quant alignment)
uint32_t k_split = CEIL_DIV(k, split_k);
k_split = ROUNDUP_POW2(k_split, 256);
const vk_mat_mat_push_constants pc1 = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, k_split, ne02, ne12, broadcast2, broadcast3, padded_n };
// Make sure enough workgroups get assigned for split k to work
ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, split_k_buffer }, pc1, { (CEIL_DIV(m, pipeline->wg_denoms[0]) * pipeline->wg_denoms[0]) * split_k, n, batch });
ggml_vk_sync_buffers(subctx);

View File

@@ -1,6 +1,11 @@
#version 450
#extension GL_EXT_control_flow_attributes : enable
#ifdef COOPMAT2
#extension GL_NV_cooperative_matrix2 : enable
#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
#extension GL_KHR_memory_scope_semantics : enable
#endif
#ifdef USE_COLLECTIVES
# extension GL_KHR_shader_subgroup_shuffle : enable
@@ -91,6 +96,12 @@ uint32_t n_elems_out = K * NPQ;
// Number of blocktiles per input
uint32_t NB_CRS = splitWork(CRS, BS_CRS);
#ifdef COOPMAT2
#define SHMEM_TYPE float16_t
#else
#define SHMEM_TYPE float
#endif
const uint32_t Ash_stride = BS_CRS + SHMEM_PAD;
const uint32_t Bsh_stride = BS_NPQ + SHMEM_PAD;
@@ -100,8 +111,8 @@ const uint32_t Bsh_numel = BS_CRS * BS_NPQ;
const uint32_t Ash_len = BS_K * Ash_stride;
const uint32_t Bsh_len = BS_CRS * Bsh_stride;
shared float Ash[Ash_len]; // K x CRS
shared float Bsh[Bsh_len]; // CRS x NPQ
shared SHMEM_TYPE Ash[Ash_len]; // K x CRS
shared SHMEM_TYPE Bsh[Bsh_len]; // CRS x NPQ
// Threadtile sizes
const uint32_t TS_NPQ = BS_K * BS_NPQ / WG_SIZE / TS_K;
@@ -110,10 +121,6 @@ const uint32_t TS_NPQ = BS_K * BS_NPQ / WG_SIZE / TS_K;
const uint32_t NT_K = BS_K / TS_K;
const uint32_t NT_NPQ = BS_NPQ / TS_NPQ;
float regA[TS_K];
float regB[TS_NPQ];
float regC[TS_K][TS_NPQ];
/*
Compute
KxCRS @ CRSxNPQ = K x NPQ
@@ -145,12 +152,36 @@ uint fastdiv(uint n, uint mp, uint L) {
return (msbs + n) >> L;
}
#ifdef COOPMAT2
#define ACC_TYPE float16_t
ACC_TYPE perElemOpStore(const in uint32_t r, const in uint32_t c, const in ACC_TYPE elem)
{
uint32_t K_idx = B_idx_K * BS_K + r;
uint32_t NPQ_idx = B_idx_NPQ * BS_NPQ + c;
uint32_t N_idx = fastdiv(NPQ_idx, p.OWOHmp, p.OWOHL); // divide by p.OH * p.OW;
uint32_t OH_idx = fastdiv(NPQ_idx - N_idx * p.OH * p.OW, p.OWmp, p.OWL); // divide by p.OW;
uint32_t OW_idx = NPQ_idx - N_idx * p.OH * p.OW - OH_idx * p.OW;
uint32_t dst_idx = OW_idx + OH_idx * p.nb1 + K_idx * p.nb2 + N_idx * p.nb3;
if (K_idx < K && NPQ_idx < NPQ) {
dst_data[dst_idx] = D_TYPE(elem);
}
return elem;
}
#endif
void main() {
#ifdef COOPMAT2
coopmat<ACC_TYPE, gl_ScopeWorkgroup, BS_K, BS_NPQ, gl_MatrixUseAccumulator> matC;
matC = coopmat<ACC_TYPE, gl_ScopeWorkgroup, BS_K, BS_NPQ, gl_MatrixUseAccumulator>(0.0);
#else
float regC[TS_K][TS_NPQ];
for (uint32_t T_ly = 0; T_ly < TS_K; T_ly++) {
for (uint32_t T_lx = 0; T_lx < TS_NPQ; T_lx++) {
regC[T_ly][T_lx] = 0.0;
}
}
#endif
/* Advance block in CRS dim */
for (uint32_t B_idx_CRS = 0; B_idx_CRS < NB_CRS; B_idx_CRS++) {
uint32_t CRS_idx_a;
@@ -199,7 +230,7 @@ void main() {
if (K_idx >= K || CRS_idx_a >= CRS) {
val = 0.0;
}
Ash[B_ly * Ash_stride + B_lx] = val;
Ash[B_ly * Ash_stride + B_lx] = SHMEM_TYPE(val);
}
/* Load input to B_block: (BS_CRS x BS_NPQ) */
UNROLL for (uint32_t r_offset = 0; r_offset < BS_CRS; r_offset += BrpWg) {
@@ -244,11 +275,21 @@ void main() {
if (CRS_idx_b >= CRS || NPQ_idx >= NPQ || H_idx < 0 || H_idx >= p.H || W_idx < 0 || W_idx >= p.W) {
val = 0.0;
}
Bsh[B_ly * Bsh_stride + B_lx] = val;
Bsh[B_ly * Bsh_stride + B_lx] = SHMEM_TYPE(val);
}
barrier();
#ifdef COOPMAT2
coopmat<float16_t, gl_ScopeWorkgroup, BS_K, BS_CRS, gl_MatrixUseA> matA;
coopmat<float16_t, gl_ScopeWorkgroup, BS_CRS, BS_NPQ, gl_MatrixUseB> matB;
coopMatLoad(matA, Ash, 0, Ash_stride, gl_CooperativeMatrixLayoutRowMajor);
coopMatLoad(matB, Bsh, 0, Bsh_stride, gl_CooperativeMatrixLayoutRowMajor);
matC = coopMatMulAdd(matA, matB, matC);
#else
if (T_y * TS_K < K) {
UNROLL for (uint32_t CRS_lidx = 0; CRS_lidx < BS_CRS; CRS_lidx++) {
float regA[TS_K];
float regB[TS_NPQ];
for (uint32_t T_ly = 0; T_ly < TS_K; T_ly++) {
regA[T_ly] = Ash[(T_y * TS_K + T_ly) * Ash_stride + CRS_lidx];
}
@@ -262,9 +303,13 @@ void main() {
}
}
}
#endif
barrier();
}
/* Save C* */
#ifdef COOPMAT2
coopMatPerElementNV(matC, matC, perElemOpStore);
#else
if (T_y * TS_K < K) {
for (uint32_t T_ly = 0; T_ly < TS_K; T_ly++) {
for (uint32_t T_lx = 0; T_lx < TS_NPQ; T_lx++) {
@@ -280,4 +325,5 @@ void main() {
}
}
}
#endif
}

View File

@@ -661,6 +661,11 @@ void process_shaders() {
string_to_spv("conv2d_f32", "conv2d_mm.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"USE_COLLECTIVES", "1"}, {"UNROLL", ""}});
string_to_spv("conv2d_f16_f32", "conv2d_mm.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"USE_COLLECTIVES", "1"}, {"UNROLL", ""}});
#if defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
string_to_spv("conv2d_f32", "conv2d_mm.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"USE_COLLECTIVES", "1"}, {"UNROLL", "[[unroll]]"}, {"COOPMAT2", "1"}}, true, false, true);
string_to_spv("conv2d_f16_f32", "conv2d_mm.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"USE_COLLECTIVES", "1"}, {"UNROLL", "[[unroll]]"}, {"COOPMAT2", "1"}}, true, false, true);
#endif
string_to_spv("conv2d_dw_whcn_f32", "conv2d_dw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"WHCN", "1"}}));
string_to_spv("conv2d_dw_cwhn_f32", "conv2d_dw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"CWHN", "1"}}));

File diff suppressed because it is too large Load Diff

View File

@@ -105,6 +105,7 @@ class Keys:
EXPERT_WEIGHTS_NORM = "{arch}.expert_weights_norm"
EXPERT_GATING_FUNC = "{arch}.expert_gating_func"
MOE_EVERY_N_LAYERS = "{arch}.moe_every_n_layers"
NEXTN_PREDICT_LAYERS = "{arch}.nextn_predict_layers"
POOLING_TYPE = "{arch}.pooling_type"
LOGIT_SCALE = "{arch}.logit_scale"
DECODER_START_TOKEN_ID = "{arch}.decoder_start_token_id"
@@ -357,6 +358,7 @@ class MODEL_ARCH(IntEnum):
DEEPSEEK2 = auto()
CHATGLM = auto()
GLM4 = auto()
GLM4_MOE = auto()
BITNET = auto()
T5 = auto()
T5ENCODER = auto()
@@ -614,6 +616,13 @@ class MODEL_TENSOR(IntEnum):
A_MMPROJ_FC = auto()
A_MM_NORM_PRE = auto()
A_MM_NORM_MID = auto()
# nextn/mtp
NEXTN_EH_PROJ = auto()
NEXTN_EMBED_TOKENS = auto()
NEXTN_ENORM = auto()
NEXTN_HNORM = auto()
NEXTN_SHARED_HEAD_HEAD = auto()
NEXTN_SHARED_HEAD_NORM = auto()
MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
@@ -678,6 +687,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.DEEPSEEK2: "deepseek2",
MODEL_ARCH.CHATGLM: "chatglm",
MODEL_ARCH.GLM4: "glm4",
MODEL_ARCH.GLM4_MOE: "glm4moe",
MODEL_ARCH.BITNET: "bitnet",
MODEL_ARCH.T5: "t5",
MODEL_ARCH.T5ENCODER: "t5encoder",
@@ -936,6 +946,13 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
MODEL_TENSOR.A_MMPROJ_FC: "mm.a.fc",
MODEL_TENSOR.A_MM_NORM_PRE: "mm.a.norm_pre",
MODEL_TENSOR.A_MM_NORM_MID: "mm.a.norm_mid",
# NextN/MTP
MODEL_TENSOR.NEXTN_EH_PROJ: "blk.{bid}.nextn.eh_proj",
MODEL_TENSOR.NEXTN_EMBED_TOKENS: "blk.{bid}.nextn.embed_tokens",
MODEL_TENSOR.NEXTN_ENORM: "blk.{bid}.nextn.enorm",
MODEL_TENSOR.NEXTN_HNORM: "blk.{bid}.nextn.hnorm",
MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD: "blk.{bid}.nextn.shared_head_head",
MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM: "blk.{bid}.nextn.shared_head_norm",
}
MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
@@ -2124,6 +2141,37 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.ATTN_POST_NORM,
MODEL_TENSOR.FFN_POST_NORM,
],
MODEL_ARCH.GLM4_MOE: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_POST_NORM,
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K,
MODEL_TENSOR.ATTN_V,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.ATTN_Q_NORM,
MODEL_TENSOR.ATTN_K_NORM,
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
MODEL_TENSOR.FFN_GATE_INP,
MODEL_TENSOR.FFN_GATE_EXP,
MODEL_TENSOR.FFN_DOWN_EXP,
MODEL_TENSOR.FFN_UP_EXP,
MODEL_TENSOR.FFN_GATE_SHEXP,
MODEL_TENSOR.FFN_DOWN_SHEXP,
MODEL_TENSOR.FFN_UP_SHEXP,
MODEL_TENSOR.FFN_EXP_PROBS_B,
# NextN/MTP tensors - preserved but unused
MODEL_TENSOR.NEXTN_EH_PROJ,
MODEL_TENSOR.NEXTN_EMBED_TOKENS,
MODEL_TENSOR.NEXTN_ENORM,
MODEL_TENSOR.NEXTN_HNORM,
MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD,
MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM,
],
MODEL_ARCH.BITNET: [
MODEL_TENSOR.ATTN_Q,
MODEL_TENSOR.ATTN_K,

View File

@@ -753,6 +753,9 @@ class GGUFWriter:
def add_moe_every_n_layers(self, value: int) -> None:
self.add_uint32(Keys.LLM.MOE_EVERY_N_LAYERS.format(arch=self.arch), value)
def add_nextn_predict_layers(self, count: int) -> None:
self.add_uint32(Keys.LLM.NEXTN_PREDICT_LAYERS.format(arch=self.arch), count)
def add_swin_norm(self, value: bool) -> None:
self.add_bool(Keys.LLM.SWIN_NORM.format(arch=self.arch), value)

View File

@@ -111,6 +111,7 @@ def main() -> None:
parser.add_argument("--general-description", type=str, help="The models general.description", metavar='"Description ..."')
parser.add_argument("--chat-template", type=str, help="Chat template string (or JSON string containing templates)", metavar='"{% ... %} ..."')
parser.add_argument("--chat-template-config", type=Path, help="Config file containing chat template(s)", metavar='tokenizer_config.json')
parser.add_argument("--chat-template-file", type=Path, help="Jinja file containing chat template", metavar='chat_template.jinja')
parser.add_argument("--pre-tokenizer", type=str, help="The models tokenizer.ggml.pre", metavar='"pre tokenizer"')
parser.add_argument("--remove-metadata", action="append", type=str, help="Remove metadata (by key name) from output model", metavar='general.url')
parser.add_argument("--special-token", action="append", type=str, help="Special token by value", nargs=2, metavar=(' | '.join(token_names.keys()), '"<token>"'))
@@ -134,12 +135,17 @@ def main() -> None:
new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE] = MetadataDetails(gguf.GGUFValueType.STRING, json.loads(args.chat_template) if args.chat_template.startswith('[') else args.chat_template)
if args.chat_template_config:
with open(args.chat_template_config, 'r') as fp:
with open(args.chat_template_config, 'r', encoding='utf-8') as fp:
config = json.load(fp)
template = config.get('chat_template')
if template:
new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE] = MetadataDetails(gguf.GGUFValueType.STRING, template)
if args.chat_template_file:
with open(args.chat_template_file, 'r', encoding='utf-8') as fp:
template = fp.read()
new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE] = MetadataDetails(gguf.GGUFValueType.STRING, template)
if args.pre_tokenizer:
new_metadata[gguf.Keys.Tokenizer.PRE] = MetadataDetails(gguf.GGUFValueType.STRING, args.pre_tokenizer)

View File

@@ -1369,6 +1369,31 @@ class TensorNameMap:
MODEL_TENSOR.A_MM_NORM_MID: (
"audio.multi_modal_projector.ln_mid", # ultravox
),
# NextN/MTP tensors for GLM4_MOE
MODEL_TENSOR.NEXTN_EH_PROJ: (
"model.layers.{bid}.eh_proj",
),
MODEL_TENSOR.NEXTN_EMBED_TOKENS: (
"model.layers.{bid}.embed_tokens",
),
MODEL_TENSOR.NEXTN_ENORM: (
"model.layers.{bid}.enorm",
),
MODEL_TENSOR.NEXTN_HNORM: (
"model.layers.{bid}.hnorm",
),
MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD: (
"model.layers.{bid}.shared_head.head",
),
MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM: (
"model.layers.{bid}.shared_head.norm",
),
}
# architecture-specific block mappings

View File

@@ -312,7 +312,11 @@ class SpecialVocab:
with open(config_file, encoding = 'utf-8') as f:
config = json.load(f)
for typ in self.special_token_types:
self._set_special_token(typ, config.get(f'{typ}_token_id'))
token_id = config.get(f'{typ}_token_id')
# If not found at root, check in text_config (for multimodal models like Kimi-VL)
if token_id is None and 'text_config' in config:
token_id = config['text_config'].get(f'{typ}_token_id')
self._set_special_token(typ, token_id)
return True

View File

@@ -21,4 +21,5 @@ These templates can be updated with the following commands:
./scripts/get_chat_template.py Qwen/Qwen2.5-7B-Instruct > models/templates/Qwen-Qwen2.5-7B-Instruct.jinja
./scripts/get_chat_template.py Qwen/QwQ-32B > models/templates/Qwen-QwQ-32B.jinja
./scripts/get_chat_template.py Qwen/Qwen3-0.6B > models/templates/Qwen-Qwen3-0.6B.jinja
```
./scripts/get_chat_template.py zai-org/GLM-4.5 > models/templates/zai-org-GLM-4.5.jinja
```

View File

@@ -1,7 +1 @@
-r ./requirements-convert_legacy_llama.txt
--extra-index-url https://download.pytorch.org/whl/cpu
torch~=2.2.1; platform_machine != "s390x"
# torch s390x packages can only be found from nightly builds
--extra-index-url https://download.pytorch.org/whl/nightly
torch>=0.0.0.dev0; platform_machine == "s390x"

View File

@@ -62,6 +62,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_DEEPSEEK2, "deepseek2" },
{ LLM_ARCH_CHATGLM, "chatglm" },
{ LLM_ARCH_GLM4, "glm4" },
{ LLM_ARCH_GLM4_MOE, "glm4moe" },
{ LLM_ARCH_BITNET, "bitnet" },
{ LLM_ARCH_T5, "t5" },
{ LLM_ARCH_T5ENCODER, "t5encoder" },
@@ -127,6 +128,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_EXPERT_WEIGHTS_NORM, "%s.expert_weights_norm" },
{ LLM_KV_EXPERT_GATING_FUNC, "%s.expert_gating_func" },
{ LLM_KV_MOE_EVERY_N_LAYERS, "%s.moe_every_n_layers" },
{ LLM_KV_NEXTN_PREDICT_LAYERS, "%s.nextn_predict_layers" },
{ LLM_KV_POOLING_TYPE, "%s.pooling_type" },
{ LLM_KV_LOGIT_SCALE, "%s.logit_scale" },
{ LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },
@@ -1391,6 +1393,40 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
},
},
{
LLM_ARCH_GLM4_MOE,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
{ LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
{ LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
{ LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
// NextN/MTP tensors - preserved but unused (in final layer, dynamic layer number)
{ LLM_TENSOR_NEXTN_EH_PROJ, "blk.%d.nextn.eh_proj" },
{ LLM_TENSOR_NEXTN_EMBED_TOKENS, "blk.%d.nextn.embed_tokens" },
{ LLM_TENSOR_NEXTN_ENORM, "blk.%d.nextn.enorm" },
{ LLM_TENSOR_NEXTN_HNORM, "blk.%d.nextn.hnorm" },
{ LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "blk.%d.nextn.shared_head_head" },
{ LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "blk.%d.nextn.shared_head_norm" },
},
},
{
LLM_ARCH_BITNET,
{
@@ -2181,6 +2217,14 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
{LLM_TENSOR_SHORTCONV_CONV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_CONV}},
{LLM_TENSOR_SHORTCONV_INPROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_SHORTCONV_OUTPROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
// NextN/MTP tensors are currently ignored (reserved for future MTP support)
// These tensors only exist in the last layer(s) and are treated as output tensors
{LLM_TENSOR_NEXTN_EH_PROJ, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
{LLM_TENSOR_NEXTN_EMBED_TOKENS, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_GET_ROWS}},
{LLM_TENSOR_NEXTN_ENORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_GET_ROWS}},
{LLM_TENSOR_NEXTN_HNORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
{LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
{LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
};
LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {}

View File

@@ -66,6 +66,7 @@ enum llm_arch {
LLM_ARCH_DEEPSEEK2,
LLM_ARCH_CHATGLM,
LLM_ARCH_GLM4,
LLM_ARCH_GLM4_MOE,
LLM_ARCH_BITNET,
LLM_ARCH_T5,
LLM_ARCH_T5ENCODER,
@@ -131,6 +132,7 @@ enum llm_kv {
LLM_KV_EXPERT_WEIGHTS_NORM,
LLM_KV_EXPERT_GATING_FUNC,
LLM_KV_MOE_EVERY_N_LAYERS,
LLM_KV_NEXTN_PREDICT_LAYERS,
LLM_KV_POOLING_TYPE,
LLM_KV_LOGIT_SCALE,
LLM_KV_DECODER_START_TOKEN_ID,
@@ -409,6 +411,12 @@ enum llm_tensor {
LLM_TENSOR_SHORTCONV_CONV,
LLM_TENSOR_SHORTCONV_INPROJ,
LLM_TENSOR_SHORTCONV_OUTPROJ,
LLM_TENSOR_NEXTN_EH_PROJ,
LLM_TENSOR_NEXTN_EMBED_TOKENS,
LLM_TENSOR_NEXTN_ENORM,
LLM_TENSOR_NEXTN_HNORM,
LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD,
LLM_TENSOR_NEXTN_SHARED_HEAD_NORM,
};
enum llm_tensor_layer {

View File

@@ -105,7 +105,7 @@ llama_context::llama_context(
{
const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS");
supports_set_rows = LLAMA_SET_ROWS ? (atoi(LLAMA_SET_ROWS) != 0) : false;
supports_set_rows = LLAMA_SET_ROWS ? (atoi(LLAMA_SET_ROWS) != 0) : supports_set_rows;
if (!supports_set_rows && !cparams.kv_unified) {
LLAMA_LOG_WARN("%s: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache\n", __func__);
@@ -786,7 +786,7 @@ int llama_context::encode(const llama_batch & batch_inp) {
const auto & hparams = model.hparams;
const int64_t n_embd = hparams.n_embd;
const int32_t n_vocab = model.vocab.n_tokens();
const int64_t n_vocab = model.vocab.n_tokens();
// note: during encode, we always pass the full sequence starting from pos = 0
if (!balloc->init(batch_inp, model.vocab, nullptr, n_embd, cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, true)) {
@@ -959,7 +959,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
const auto & vocab = model.vocab;
const auto & hparams = model.hparams;
const int32_t n_vocab = vocab.n_tokens();
const int64_t n_vocab = vocab.n_tokens();
const int64_t n_embd = hparams.n_embd;
// when computing embeddings, all tokens are output
@@ -1328,21 +1328,21 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
}
void llama_context::output_reorder() {
const uint32_t n_vocab = model.vocab.n_tokens();
const uint64_t n_vocab = model.vocab.n_tokens();
const uint64_t n_embd = model.hparams.n_embd;
for (uint32_t s = 0; s < output_swaps.size(); ++s) {
const uint32_t i0 = output_swaps[s].i0;
const uint32_t i1 = output_swaps[s].i1;
for (size_t s = 0; s < output_swaps.size(); ++s) {
const uint64_t i0 = output_swaps[s].i0;
const uint64_t i1 = output_swaps[s].i1;
if (logits_size > 0) {
for (uint32_t k = 0; k < n_vocab; k++) {
for (uint64_t k = 0; k < n_vocab; k++) {
std::swap(logits[i0*n_vocab + k], logits[i1*n_vocab + k]);
}
}
if (embd_size > 0) {
for (uint32_t k = 0; k < n_embd; k++) {
for (uint64_t k = 0; k < n_embd; k++) {
std::swap(embd[i0*n_embd + k], embd[i1*n_embd + k]);
}
}

View File

@@ -289,7 +289,7 @@ private:
// env: LLAMA_SET_ROWS (temporary)
// ref: https://github.com/ggml-org/llama.cpp/pull/14285
bool supports_set_rows = false;
bool supports_set_rows = true;
// env: LLAMA_GRAPH_REUSE_DISABLE
bool graph_reuse_disable = false;

View File

@@ -749,8 +749,8 @@ ggml_tensor * llm_graph_context::build_ffn(
if (down) {
cur = build_lora_mm(down, cur);
if (arch == LLM_ARCH_GLM4) {
// GLM4 seems to have numerical issues with half-precision accumulators
if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE) {
// GLM4 and GLM4_MOE seem to have numerical issues with half-precision accumulators
ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
}
}
@@ -1391,8 +1391,8 @@ ggml_tensor * llm_graph_context::build_attn(
if (wo) {
cur = build_lora_mm(wo, cur);
if (arch == LLM_ARCH_GLM4) {
// GLM4 seems to have numerical issues with half-precision accumulators
if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE) {
// GLM4 and GLM4_MOE seem to have numerical issues with half-precision accumulators
ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
}
}

View File

@@ -73,6 +73,7 @@ struct llama_hparams {
bool expert_weights_norm = false;
uint32_t expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE;
uint32_t moe_every_n_layers = 0;
uint32_t nextn_predict_layers = 0;
float f_norm_eps;
float f_norm_rms_eps;

View File

@@ -39,6 +39,10 @@ llama_kv_cache_unified::llama_kv_cache_unified(
if (model.arch == LLM_ARCH_GEMMA3N) {
n_layer_cache = 20;
}
if (model.arch == LLM_ARCH_GLM4_MOE) {
// GLM-4.5: Only process up to last layer, skip final NextN layer
n_layer_cache = hparams.n_layer - hparams.nextn_predict_layers;
}
// create a context for each buffer type
std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
@@ -183,7 +187,7 @@ llama_kv_cache_unified::llama_kv_cache_unified(
const size_t memory_size_k = size_k_bytes();
const size_t memory_size_v = size_v_bytes();
LLAMA_LOG_INFO("%s: size = %7.2f MiB (%6u cells, %3d layers, %2u/%2u seqs), K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
LLAMA_LOG_INFO("%s: size = %7.2f MiB (%6u cells, %3d layers, %2u/%u seqs), K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
(float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f), kv_size, (int) layers.size(), n_seq_max, n_stream,
ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
@@ -193,7 +197,7 @@ llama_kv_cache_unified::llama_kv_cache_unified(
debug = LLAMA_KV_CACHE_DEBUG ? atoi(LLAMA_KV_CACHE_DEBUG) : 0;
const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS");
supports_set_rows = LLAMA_SET_ROWS ? atoi(LLAMA_SET_ROWS) != 0 : 0;
supports_set_rows = LLAMA_SET_ROWS ? atoi(LLAMA_SET_ROWS) != 0 : supports_set_rows;
if (!supports_set_rows) {
// ref: https://github.com/ggml-org/llama.cpp/pull/14363

View File

@@ -230,7 +230,7 @@ private:
// env: LLAMA_SET_ROWS (temporary)
// ref: https://github.com/ggml-org/llama.cpp/pull/14285
bool supports_set_rows = false;
bool supports_set_rows = true;
const llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;

View File

@@ -25,6 +25,7 @@ llama_memory_hybrid::llama_memory_hybrid(
/* common */
uint32_t n_seq_max,
bool offload,
bool unified,
/* layer filters */
layer_filter_cb && filter_attn,
layer_filter_cb && filter_recr) :
@@ -38,7 +39,7 @@ llama_memory_hybrid::llama_memory_hybrid(
type_v,
v_trans,
offload,
1,
unified,
kv_size,
n_seq_max,
n_pad,

View File

@@ -39,6 +39,7 @@ public:
/* common */
uint32_t n_seq_max,
bool offload,
bool unified,
/* layer filters */
layer_filter_cb && filter_attn = nullptr,
layer_filter_cb && filter_recr = nullptr);

View File

@@ -58,8 +58,9 @@ struct llama_model_loader {
}
};
static const int TENSOR_NOT_REQUIRED = 1;
static const int TENSOR_DUPLICATED = 2;
static const int TENSOR_NOT_REQUIRED = 1 << 0;
static const int TENSOR_DUPLICATED = 1 << 1;
static const int TENSOR_SKIP = 1 << 2;
int n_kv = 0;
int n_tensors = 0;

View File

@@ -109,8 +109,10 @@ const char * llm_type_name(llm_type type) {
case LLM_TYPE_A13B: return "A13B";
case LLM_TYPE_21B_A3B: return "21B.A3B";
case LLM_TYPE_30B_A3B: return "30B.A3B";
case LLM_TYPE_106B_A12B: return "106B.A12B";
case LLM_TYPE_235B_A22B: return "235B.A22B";
case LLM_TYPE_300B_A47B: return "300B.A47B";
case LLM_TYPE_355B_A32B: return "355B.A32B";
case LLM_TYPE_E2B: return "E2B";
case LLM_TYPE_E4B: return "E4B";
default: return "?B";
@@ -1434,6 +1436,34 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_GLM4_MOE:
{
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
// MoE parameters
ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert);
ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used);
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead, false);
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
// Expert gating function (GLM-4.5 uses sigmoid)
ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
}
// NextN/MTP parameters
ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
switch (hparams.n_layer) {
case 47: type = LLM_TYPE_106B_A12B; break; // GLM-4.5-Air (46 layers + 1 NextN layer)
case 93: type = LLM_TYPE_355B_A32B; break; // GLM-4.5 (92 layers + 1 NextN layer)
default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_BITNET:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -1949,6 +1979,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
const auto TENSOR_DUPLICATED = llama_model_loader::TENSOR_DUPLICATED;
const auto TENSOR_NOT_REQUIRED = llama_model_loader::TENSOR_NOT_REQUIRED;
const auto TENSOR_SKIP = llama_model_loader::TENSOR_SKIP;
// create tensors for the weights
{
@@ -2004,7 +2035,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
}
// skip unused tensors
if (info.op == GGML_OP_NONE) {
if (info.op == GGML_OP_NONE || flags & TENSOR_SKIP) {
const size_t nbytes = ggml_nbytes(t_meta);
LLAMA_LOG_WARN("model has unused tensor %s (size = %zu bytes) -- ignoring\n", tn.str().c_str(), nbytes);
@@ -4427,6 +4458,105 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
}
} break;
case LLM_ARCH_GLM4_MOE:
{
const int64_t n_expert = hparams.n_expert;
const int64_t n_expert_used = hparams.n_expert_used;
const int64_t n_expert_shared = hparams.n_expert_shared;
GGML_ASSERT(hparams.n_expert > 0 && "n_expert must be > 0 for GLM4_MOE MoE layers");
GGML_ASSERT(hparams.n_expert_used > 0 && "n_expert_used must be > 0 for GLM4_MOE MoE layers");
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0);
// output
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED);
// if output is NULL, init from the input tok embed
if (output == NULL) {
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED);
}
// Load ALL tensors including NextN layer to satisfy total tensor count
// but only PROCESS up to last layer (skipping final NextN layer) in forward pass
for (int i = 0; i < n_layer; ++i) {
int flags = 0;
if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
// skip all tensors in the NextN layers
flags |= TENSOR_SKIP;
}
auto & layer = layers[i];
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, flags);
// GLM-style attention with bias terms
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, flags);
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), { n_embd, n_embd_k_gqa }, flags);
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), { n_embd, n_embd_v_gqa }, flags);
layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), { n_embd_head_k * n_head }, flags);
layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), { n_embd_k_gqa }, flags);
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), { n_embd_v_gqa }, flags);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, flags);
// K/Q norm tensors (optional for GLM-4.5 355B variant)
layer.attn_q_norm = create_tensor(
tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), { n_embd_head_k }, TENSOR_NOT_REQUIRED | flags);
layer.attn_k_norm = create_tensor(
tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), { n_embd_head_k }, TENSOR_NOT_REQUIRED | flags);
layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, flags);
// Check if this layer uses MoE or dense FFN based on n_layer_dense_lead
// GLM 4.5 uses hybrid architecture: layer 0 is dense, layers 1+ are MoE
const bool use_moe = (static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead);
if (use_moe) {
// MoE layers
layer.ffn_gate_inp =
create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, flags);
layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), { n_expert }, flags);
// MoE branch
const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff / n_expert_used;
layer.ffn_gate_exps = create_tensor(
tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, flags);
layer.ffn_down_exps = create_tensor(
tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, flags);
layer.ffn_up_exps = create_tensor(
tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, flags);
// Shared expert
if (n_expert_shared > 0) {
const int64_t n_ff_shexp = n_ff_exp * n_expert_shared;
layer.ffn_gate_shexp = create_tensor(
tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp }, flags);
layer.ffn_down_shexp = create_tensor(
tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_shexp, n_embd }, flags);
layer.ffn_up_shexp = create_tensor(
tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp }, flags);
}
} else {
// Dense layers (first k layers) - GLM uses separate gate/up projections
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, flags);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, flags);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, flags);
}
// NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, flags);
layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags);
layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, flags);
layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, flags);
}
}
}
break;
case LLM_ARCH_NEMOTRON:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@@ -13564,6 +13694,165 @@ struct llm_build_glm4 : public llm_graph_context {
}
};
struct llm_build_glm4_moe : public llm_graph_context {
llm_build_glm4_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
ggml_tensor * cur;
ggml_tensor * inpL;
inpL = build_inp_embd(model.tok_embd);
// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();
auto * inp_attn = build_attn_inp_kv_unified();
ggml_tensor * inp_out_ids = build_inp_out_ids();
// Only process up to last layer (skip final NextN layer)
// Final layer tensors are loaded but not processed in forward pass
const int n_transformer_layers = n_layer - hparams.nextn_predict_layers;
for (int il = 0; il < n_transformer_layers; ++il) {
ggml_tensor * inpSA = inpL;
// Pre-attention norm
cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
cb(cur, "attn_norm", il);
// self-attention
{
ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
if (model.layers[il].bq) {
Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
}
cb(Qcur, "Qcur", il);
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
if (model.layers[il].bk) {
Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
}
cb(Kcur, "Kcur", il);
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
if (model.layers[il].bv) {
Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
}
cb(Vcur, "Vcur", il);
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
// Apply Q/K norm if available (GLM-4.5 355B variant)
if (model.layers[il].attn_q_norm) {
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
cb(Qcur, "Qcur_normed", il);
}
if (model.layers[il].attn_k_norm) {
Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
cb(Kcur, "Kcur_normed", il);
}
Qcur = ggml_rope_ext(
ctx0, Qcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
Kcur = ggml_rope_ext(
ctx0, Kcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
cur = build_attn(inp_attn,
model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);
}
if (il == n_transformer_layers - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}
ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
cb(ffn_inp, "ffn_inp", il);
// Post-attention norm
cur = build_norm(ffn_inp, model.layers[il].attn_post_norm, NULL, LLM_NORM_RMS, il);
cb(cur, "post_attn_norm", il);
// Check if this is a dense layer (n_layer_dense_lead=1, so layer 0 is dense)
if (static_cast<uint32_t>(il) < hparams.n_layer_dense_lead) {
// Dense FFN layer
cur = build_ffn(cur,
model.layers[il].ffn_up, NULL, NULL,
model.layers[il].ffn_gate, NULL, NULL,
model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, il);
cb(cur, "ffn_out", il);
} else {
// Process routed experts using existing MoE infrastructure
ggml_tensor * routed_out = build_moe_ffn(cur,
model.layers[il].ffn_gate_inp,
model.layers[il].ffn_up_exps,
model.layers[il].ffn_gate_exps,
model.layers[il].ffn_down_exps,
model.layers[il].ffn_exp_probs_b,
n_expert, n_expert_used,
LLM_FFN_SILU, hparams.expert_weights_norm,
true, hparams.expert_weights_scale,
(llama_expert_gating_func_type) hparams.expert_gating_func,
il);
cb(routed_out, "ffn_moe_out", il);
// Process shared expert on original input
ggml_tensor * shared_out = build_ffn(cur,
model.layers[il].ffn_up_shexp, NULL, NULL,
model.layers[il].ffn_gate_shexp, NULL, NULL,
model.layers[il].ffn_down_shexp, NULL, NULL,
NULL,
LLM_FFN_SILU, LLM_FFN_PAR, il);
cb(shared_out, "ffn_shexp_out", il);
// Final output: routed_output + shared_output
cur = ggml_add(ctx0, routed_out, shared_out);
cb(cur, "ffn_out", il);
}
cur = ggml_add(ctx0, cur, ffn_inp);
cur = build_cvec(cur, il);
cb(cur, "l_out", il);
// input for next layer
inpL = cur;
}
cur = inpL;
cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
cb(cur, "result_norm", -1);
res->t_embd = cur;
// lm_head
cur = build_lora_mm(model.output, cur);
cb(cur, "result_output", -1);
res->t_logits = cur;
ggml_build_forward_expand(gf, cur);
}
};
struct llm_build_nemotron : public llm_graph_context {
llm_build_nemotron(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -17598,6 +17887,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
/* recurrent_kv_size */ std::max((uint32_t) 1, cparams.n_seq_max),
/* n_seq_max */ cparams.n_seq_max,
/* offload */ cparams.offload_kqv,
/* unified */ cparams.kv_unified,
/* filter_attn */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr,
/* filter_recr */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr);
} else {
@@ -17876,6 +18166,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
{
llm = std::make_unique<llm_build_glm4>(*this, params);
} break;
case LLM_ARCH_GLM4_MOE:
{
llm = std::make_unique<llm_build_glm4_moe>(*this, params);
} break;
case LLM_ARCH_BITNET:
{
llm = std::make_unique<llm_build_bitnet>(*this, params);
@@ -18207,6 +18501,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_HUNYUAN_DENSE:
case LLM_ARCH_LFM2:
case LLM_ARCH_SMALLTHINKER:
case LLM_ARCH_GLM4_MOE:
return LLAMA_ROPE_TYPE_NEOX;
case LLM_ARCH_QWEN2VL:

View File

@@ -101,8 +101,10 @@ enum llm_type {
LLM_TYPE_A13B,
LLM_TYPE_21B_A3B, // Ernie MoE small
LLM_TYPE_30B_A3B,
LLM_TYPE_106B_A12B, // GLM-4.5-Air
LLM_TYPE_235B_A22B,
LLM_TYPE_300B_A47B, // Ernie MoE big
LLM_TYPE_355B_A32B, // GLM-4.5
LLM_TYPE_E2B,
LLM_TYPE_E4B,
};
@@ -166,6 +168,15 @@ struct llama_layer_shortconv {
struct ggml_tensor * out_proj = nullptr;
};
struct llama_layer_nextn {
struct ggml_tensor * eh_proj = nullptr;
struct ggml_tensor * embed_tokens = nullptr;
struct ggml_tensor * enorm = nullptr;
struct ggml_tensor * hnorm = nullptr;
struct ggml_tensor * shared_head_head = nullptr;
struct ggml_tensor * shared_head_norm = nullptr;
};
struct llama_layer {
// normalization
struct ggml_tensor * attn_norm = nullptr;
@@ -354,6 +365,8 @@ struct llama_layer {
struct llama_layer_convnext convnext;
struct llama_layer_shortconv shortconv;
struct llama_layer_nextn nextn;
};
struct llama_model {

View File

@@ -1856,7 +1856,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
tokenizer_pre == "gigachat" ||
tokenizer_pre == "jina-v2-es" ||
tokenizer_pre == "jina-v2-de" ||
tokenizer_pre == "a.x-4.0") {
tokenizer_pre == "a.x-4.0" ||
tokenizer_pre == "mellum") {
pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
} else if (
tokenizer_pre == "jina-v1-en" ||
@@ -2190,6 +2191,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|| t.first == "<fim▁begin>" // DeepSeek
|| t.first == "<PRE>"
|| t.first == "▁<PRE>" // CodeLlama
|| t.first == "<|code_prefix|>" // GLM-4.5
) {
special_fim_pre_id = t.second;
if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -2209,6 +2211,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|| t.first == "<fim▁hole>" // DeepSeek
|| t.first == "<SUF>"
|| t.first == "▁<SUF>" // CodeLlama
|| t.first == "<|code_suffix|>" // GLM-4.5
) {
special_fim_suf_id = t.second;
if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -2228,6 +2231,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|| t.first == "<fim▁end>" // DeepSeek
|| t.first == "<MID>"
|| t.first == "▁<MID>" // CodeLlama
|| t.first == "<|code_middle|>" // GLM-4.5
) {
special_fim_mid_id = t.second;
if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {

View File

@@ -953,6 +953,33 @@ static void test_template_output_parsers() {
/* is_partial= */ false,
{COMMON_CHAT_FORMAT_HERMES_2_PRO}));
// Test multiple tool calls
common_chat_msg message_assist_multiple_calls;
message_assist_multiple_calls.role = "assistant";
message_assist_multiple_calls.content = "";
message_assist_multiple_calls.tool_calls.push_back({"special_function", "{\"arg1\": 1}", ""});
message_assist_multiple_calls.tool_calls.push_back({"python", "{\"code\":\"print('hello')\"}", ""});
assert_msg_equals(
message_assist_multiple_calls,
common_chat_parse(
"<tool_call>\n"
"{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n"
"</tool_call>\n"
"<tool_call>\n"
"{\"name\": \"python\", \"arguments\": {\"code\":\"print('hello')\"}}\n"
"</tool_call>",
/* is_partial= */ false,
{COMMON_CHAT_FORMAT_HERMES_2_PRO}));
assert_msg_equals(
message_assist_multiple_calls,
common_chat_parse(
"<function=special_function>{\"arg1\": 1}</function>\n"
"<function=python>{\"code\":\"print('hello')\"}</function>",
/* is_partial= */ false,
{COMMON_CHAT_FORMAT_HERMES_2_PRO}));
assert_msg_equals(
simple_assist_msg(
"This is not a tool call:",
@@ -1039,6 +1066,22 @@ static void test_template_output_parsers() {
"<tool_call>\n"
"{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n"
"</tool_call>");
// Test multiple tool calls with template
common_chat_msg message_assist_multiple_calls_template;
message_assist_multiple_calls_template.role = "assistant";
message_assist_multiple_calls_template.content = "";
message_assist_multiple_calls_template.tool_calls.push_back({"special_function", "{\"arg1\": 1}", ""});
message_assist_multiple_calls_template.tool_calls.push_back({"python", "{\"code\":\"print('test')\"}", ""});
test_templates(tmpls.get(), end_tokens, message_assist_multiple_calls_template, tools,
"<tool_call>\n"
"{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n"
"</tool_call>\n"
"<tool_call>\n"
"{\"name\": \"python\", \"arguments\": {\"code\":\"print('test')\"}}\n"
"</tool_call>");
test_templates(tmpls.get(), end_tokens, message_assist_call_python_lines, tools,
"<tool_call>\n"
"{\"name\": \"python\", \"arguments\": {\"code\":\"# This is a program:\\nprint('hey')\"}}\n"

View File

@@ -7,7 +7,7 @@ More information is available in <https://github.com/ggml-org/llama.cpp/pull/486
```
./llama-imatrix \
-m model.gguf -f some-text.txt [-o imatrix.gguf] [--no-ppl] \
-m model.gguf -f some-text.txt [-o imatrix.gguf] [--output-format {gguf,dat}] [--no-ppl] \
[--process-output] [--chunk 123] [--save-frequency 0] [--output-frequency 10] \
[--in-file imatrix-prev-0.gguf --in-file imatrix-prev-1.gguf ...] [--parse-special] \
[--show-statistics] [...]
@@ -20,6 +20,7 @@ The parameters in square brackets are optional and have the following meaning:
* `-lv | --verbosity` specifies the verbosity level. If set to `0`, no output other than the perplexity of the processed chunks will be generated. If set to `1`, each time the results are saved a message is written to `stderr`. If `>=2`, a message is output each time data is collected for any tensor. Default verbosity level is `1`.
* `-o | --output-file` specifies the name of the file where the computed data will be stored. If missing `imatrix.gguf` is used.
* `-ofreq | --output-frequency` specifies how often the so far computed result is saved to disk. Default is 10 (i.e., every 10 chunks)
* `--output-format` specifies the output format of the generated imatrix file. Either "gguf", or "dat" (the legacy format). Defaults to "gguf".
* `--save-frequency` specifies how often to save a copy of the imatrix in a separate file. Default is 0 (i.e., never)
* `--process-output` specifies if data will be collected for the `output.weight` tensor. Typically, it is better not to utilize the importance matrix when quantizing `output.weight`, so this is set to `false` by default.
* `--in-file` one or more existing imatrix files to load and combine. Useful for merging files from multiple runs/datasets.
@@ -45,14 +46,19 @@ Recent versions of `llama-imatrix` store data in GGUF format by default. For the
```bash
# generate and save the imatrix using legacy format
./llama-imatrix -m ggml-model-f16.gguf -f calibration-data.txt -o imatrix-legcy-format.dat -ngl 99
./llama-imatrix -m ggml-model-f16.gguf -f calibration-data.txt --output-format dat -o imatrix-legcy-format.dat -ngl 99
```
```bash
# covert legacy (binary) imatrix format to new (GGUF) format
# convert legacy (binary) imatrix format to new (GGUF) format
./llama-imatrix --in-file imatrix-legacy-format.dat -o imatrix-new-format.gguf
```
```bash
# convert new (GGUF) imatrix format to legacy (binary) format
./llama-imatrix --in-file imatrix-new-format.gguf --output-format dat -o imatrix-legacy-format.dat
```
```bash
# combine existing imatrices
./llama-imatrix --in-file imatrix-prev-0.gguf --in-file imatrix-prev-1.gguf -o imatrix-combined.gguf

View File

@@ -26,7 +26,7 @@
static void print_usage(int, char ** argv) {
LOG("\nexample usage:\n");
LOG("\n %s \\\n"
" -m model.gguf -f some-text.txt [-o imatrix.gguf] [--no-ppl] \\\n"
" -m model.gguf -f some-text.txt [-o imatrix.gguf] [--output-format {gguf,dat}] [--no-ppl] \\\n"
" [--process-output] [--chunk 123] [--save-frequency 0] [--output-frequency 10] \\\n"
" [--in-file imatrix-prev-0.gguf --in-file imatrix-prev-1.gguf ...] [--parse-special] \\\n"
" [--show-statistics] [...]\n" , argv[0]);
@@ -250,13 +250,6 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
const char * data = is_host ? (const char *) src1->data : m_src1_data.data();
GGML_ASSERT(src1->nb[0] == ggml_element_size(src1));
// TODO: 4d? (is that even used in practice?)
// the extra dimension would need to be stored somewhere to be reflected in the imatrix file
if (ggml_nrows(src1) != src1->ne[1] * src1->ne[2]) {
LOG_ERR("%s: tensor has more than 3 dimensions: %s", __func__, wname.c_str());
GGML_ASSERT(false);
}
// this has been adapted to the new format of storing merged experts in a single 3d tensor
// ref: https://github.com/ggml-org/llama.cpp/pull/6387
if (t->op == GGML_OP_MUL_MAT_ID) {
@@ -272,6 +265,12 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
GGML_ASSERT(ids->ne[1] == src1->ne[2]);
// the extra dimension would need to be stored somewhere to be reflected in the imatrix file
if (ggml_nrows(src1) != src1->ne[1] * src1->ne[2]) {
LOG_ERR("%s: tensor has more than 3 dimensions: %s", __func__, wname.c_str());
GGML_ASSERT(false);
}
m_ids.resize(ggml_nbytes(ids));
ggml_backend_tensor_get(ids, m_ids.data(), 0, ggml_nbytes(ids));
@@ -335,29 +334,40 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
}
} else {
auto & e = m_stats[wname];
const int64_t n_mat = src1->ne[2] * src1->ne[3];
const int64_t n_mat = src0->ne[2] * src0->ne[3];
// use a single count per dense tensor
// (necessary when merging older GGUF-imatrix files with 3d tensors)
if (e.counts.size() > 1) {
bool all_equal = true;
for (size_t i = 1; i < e.counts.size(); ++i) {
if (e.counts[0] != e.counts[i]) {
all_equal = false;
break;
}
}
if (all_equal) {
e.counts.resize(1);
}
}
if (e.values.empty()) {
e.values.resize(src1->ne[0] * n_mat, 0);
e.counts.resize(n_mat, 0);
e.counts.resize(1, 0);
}
else if (e.values.size() != (size_t)(src1->ne[0] * n_mat)) {
LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)(src1->ne[0] * n_mat));
exit(1); //GGML_ABORT("fatal error");
}
else if (e.counts.size() != (size_t)n_mat) {
LOG_ERR("%s: inconsistent expert count for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.counts.size(), (int)n_mat);
exit(1); //GGML_ABORT("fatal error");
}
LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d x %5d, %d\n", __func__, m_last_chunk, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->ne[2], (int)src1->type);
for (int64_t i3 = 0; i3 < src1->ne[3]; ++i3) {
for (int64_t i2 = 0; i2 < src1->ne[2]; ++i2) {
const int64_t mat_id = i3 * src1->ne[2] + i2;
// handle 3D+ tensors, but flatten 3D+ activations when model tensor is 2D
const int64_t mat_id = (i3 % src0->ne[3]) * src0->ne[2] + (i2 % src0->ne[2]);
const int64_t mat_start = mat_id * src1->ne[0];
for (int64_t row = 0; row < src1->ne[1]; ++row) {
const float * x = (const float *) (data + row * src1->nb[1] + i2 * src1->nb[2] + i3 * src1->ne[3]);
e.counts[mat_id]++;
const float * x = (const float *) (data + row * src1->nb[1] + i2 * src1->nb[2] + i3 * src1->nb[3]);
for (int64_t j = 0; j < src1->ne[0]; ++j) {
e.values[mat_start + j] += x[j] * x[j];
if (!std::isfinite((float)e.values[j])) {
@@ -366,16 +376,20 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
}
}
}
const int32_t n_chunk = e.counts[mat_id] / chunk_size;
if (n_chunk > m_last_chunk) {
const int32_t chunk_step = n_chunk - m_last_chunk;
m_last_chunk = n_chunk;
if ((m_last_chunk % m_params.n_out_freq) / chunk_step == 0) {
save_imatrix();
}
if (m_params.n_save_freq > 0 && (m_last_chunk % m_params.n_save_freq) / chunk_step == 0) {
save_imatrix(m_last_chunk);
}
}
}
// only 1 count in practice, except when a tensor is used for both MUL_MAT_ID and MUL_MAT
for (size_t i = 0; i < e.counts.size(); ++i) {
e.counts[i] += ggml_nrows(src1) / n_mat;
const int32_t n_chunk = e.counts[i] / chunk_size;
if (n_chunk > m_last_chunk) {
const int32_t chunk_step = n_chunk - m_last_chunk;
m_last_chunk = n_chunk;
if ((m_last_chunk % m_params.n_out_freq) / chunk_step == 0) {
save_imatrix();
}
if (m_params.n_save_freq > 0 && (m_last_chunk % m_params.n_save_freq) / chunk_step == 0) {
save_imatrix(m_last_chunk);
}
}
}
@@ -492,13 +506,17 @@ void IMatrixCollector::save_imatrix_legacy(int32_t ncall) const {
void IMatrixCollector::save_imatrix(int32_t n_chunk) const {
auto fname = m_params.out_file;
int8_t use_legacy_format = m_params.imat_dat;
// TODO: use the new format in more cases
if (!string_ends_with(fname, ".gguf")) {
LOG_WRN("\n%s: saving to legacy imatrix format because output suffix is not .gguf\n", __func__);
if (use_legacy_format > 0) {
this->save_imatrix_legacy(n_chunk);
return;
}
// only warn when `--output-format gguf` is not specified
if (use_legacy_format == 0 && !string_ends_with(fname, ".gguf")) {
LOG_WRN("\n%s: saving imatrix using GGUF format with a different suffix than .gguf\n", __func__);
LOG_WRN("%s: if you want the previous imatrix format, use --output-format dat\n", __func__);
}
if (n_chunk > 0) {
fname += ".at_";

View File

@@ -611,7 +611,7 @@ int main(int argc, char ** argv) {
return 1;
}
if (!try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
fprintf(stderr, "%s: invalid ftype '%s'\n", __func__, argv[3]);
fprintf(stderr, "%s: invalid ftype '%s'\n", __func__, argv[arg_idx]);
return 1;
}
if (ftype_str == "COPY") {

Binary file not shown.

View File

@@ -31,7 +31,24 @@ html {
hr {
@apply my-4 border-base-content/20 border-1;
}
/* TODO: fix markdown table */
table {
@apply w-full border-collapse text-sm font-sans my-4 text-base-content;
}
thead {
@apply bg-base-200 text-base-content;
}
th {
@apply border border-base-300 px-4 py-2 text-left font-semibold;
}
td {
@apply border border-base-300 px-4 py-2 align-top;
}
tbody tr:nth-child(even) {
@apply bg-base-100;
}
tbody tr:hover {
@apply bg-base-200;
}
}
.btn-mini {