Compare commits

...

62 Commits

Author SHA1 Message Date
Georgi Gerganov
c0c3e428dd refactor 2026-02-16 23:02:45 +02:00
Georgi Gerganov
7f049860b4 reasoning and error handling 2026-02-16 22:16:15 +02:00
Georgi Gerganov
2ffa45edfc add tokens 2026-02-16 21:52:54 +02:00
Georgi Gerganov
9c29be1177 store full response 2026-02-16 21:44:29 +02:00
Georgi Gerganov
013963cfd5 add html 2026-02-16 21:22:06 +02:00
Georgi Gerganov
e2e998a2d6 fix prompts 2026-02-16 21:02:25 +02:00
Georgi Gerganov
6c41664b8b simplify 2026-02-16 19:50:27 +02:00
Georgi Gerganov
7b84af8051 fix counts 2026-02-16 16:38:31 +02:00
Georgi Gerganov
60a501e138 cleanup 2026-02-16 16:31:14 +02:00
Georgi Gerganov
e6e777cfb3 resume eval 2026-02-16 16:21:36 +02:00
Georgi Gerganov
ad3a54eb68 ignore errors 2026-02-16 15:23:23 +02:00
Georgi Gerganov
c6d70b9bea add AGENTS.md 2026-02-16 13:13:35 +02:00
Georgi Gerganov
de956a6ca8 cleanup 2026-02-16 12:02:16 +02:00
Georgi Gerganov
350e7c1409 datasets : fix aime2025 2026-02-16 11:55:57 +02:00
Georgi Gerganov
db10dda1f3 grade : improve regex + logs 2026-02-16 11:51:36 +02:00
Georgi Gerganov
52759bf078 grader : update prompt 2026-02-16 11:17:53 +02:00
Georgi Gerganov
99e3c3d02c datasets : add aime2025 2026-02-16 11:07:54 +02:00
Georgi Gerganov
c6315655b7 cont 2026-02-16 10:56:58 +02:00
Georgi Gerganov
f762a71d56 grader : improve example answers 2026-02-16 10:51:41 +02:00
Georgi Gerganov
73e61d5b75 rename 2026-02-16 10:30:10 +02:00
Georgi Gerganov
cffd268bb3 add gpqa + sampling + docs 2026-02-16 00:52:33 +02:00
Georgi Gerganov
e8a807519a datasets : add gsm8k 2026-02-15 23:19:46 +02:00
Georgi Gerganov
1db8428f00 remove old files 2026-02-15 22:16:54 +02:00
Georgi Gerganov
7751ae2796 docs 2026-02-15 22:15:50 +02:00
Georgi Gerganov
d2b10302ce improve grader 2026-02-15 22:12:02 +02:00
Georgi Gerganov
68dde884d6 minor 2026-02-15 21:21:40 +02:00
Georgi Gerganov
fd90796da2 eval : support multiple dataset runs 2026-02-15 21:08:24 +02:00
Georgi Gerganov
8156d549f6 sim : fix answer matching 2026-02-15 21:08:24 +02:00
Georgi Gerganov
9695e6feb4 test : fix path 2026-02-15 21:08:24 +02:00
Georgi Gerganov
fb1481d60d eval : add prompts 2026-02-15 21:08:24 +02:00
Georgi Gerganov
812ae13ec1 eval : print progress 2026-02-15 21:08:24 +02:00
Georgi Gerganov
e79e8d02d5 examples: add task summary table to llama-eval-new.py 2026-02-15 21:08:23 +02:00
Georgi Gerganov
a939f4c47e docs: update llama-eval-discussion.md with threading and model parameter updates
- Add threading support implementation details
- Document ThreadPoolExecutor usage and thread safety
- Add model parameter implementation details
- Include testing results for both features
2026-02-15 21:08:23 +02:00
Georgi Gerganov
62b04cef54 examples: add threading support and model parameter to llama-eval-new.py
- Add ThreadPoolExecutor for parallel request processing controlled by --threads
- Add --model argument to specify model name in request data
- Refactor process() to use thread-safe _process_single_case() method
- Update progress tracking to work with concurrent execution
2026-02-15 21:08:23 +02:00
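The parallelism described in this commit might be wired up roughly as in the sketch below. This is illustrative only: `_process_single_case` from the commit message is replaced by a trivial placeholder, and the surrounding names are assumptions, not the actual llama-eval-new.py code.

```python
# Rough sketch: submit one request per case to a thread pool sized by
# --threads and collect results as they complete.
from concurrent.futures import ThreadPoolExecutor, as_completed

def process_single_case(case: dict) -> bool:
    # placeholder for the real per-case logic (send request, grade response)
    return case.get("expected") == case.get("predicted")

def process_all(cases: list, n_threads: int = 32) -> dict:
    results = {}
    with ThreadPoolExecutor(max_workers=n_threads) as pool:
        futures = {pool.submit(process_single_case, case): case["id"] for case in cases}
        for fut in as_completed(futures):
            case_id = futures[fut]
            results[case_id] = fut.result()
            print(f"[{len(results)}/{len(cases)}] case {case_id} done")
    return results
```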
Georgi Gerganov
37b26cafee docs: update llama-eval-discussion.md with session work summary 2026-02-15 21:08:23 +02:00
Georgi Gerganov
04f6872116 examples: use cached dataset path in simulator to avoid HF Hub requests 2026-02-15 21:08:23 +02:00
Georgi Gerganov
c2619c18bf examples: use cached dataset path to avoid HF Hub requests 2026-02-15 21:08:23 +02:00
Georgi Gerganov
87f8930968 examples: remove HF_HUB_OFFLINE to allow dataset download 2026-02-15 21:08:23 +02:00
Georgi Gerganov
9453f9de12 examples: use HF_HUB_OFFLINE to avoid HF Hub warnings 2026-02-15 21:08:23 +02:00
Georgi Gerganov
5a1be6ce37 examples: implement flexible grader system for answer validation
- Add Grader class supporting regex and CLI-based grading
- Implement built-in regex patterns for AIME, GSM8K, MMLU, HellaSwag, ARC, WinoGrande
- Add CLI grader interface: python script.py --answer <pred> --expected <gold>
- Add HF telemetry disable to avoid warnings
- Support exact match requirement for regex patterns
- Add 30-second timeout for CLI grader
- Handle both boxed and plain text formats for AIME answers
2026-02-15 21:08:23 +02:00
Georgi Gerganov
a80814e97b docs: remove README.md from llama-eval 2026-02-15 21:08:23 +02:00
Georgi Gerganov
5cc2258e82 examples: add simplified llama-eval-new.py for AIME evaluation
- Create new simplified evaluation script focused only on AIME
- Implement EvalState and Processor dataclasses for structured state management
- Add real-time feedback showing correct/incorrect status per case
- Abstract grading interface for external grader support
- Use structured JSON output for eval state
- Apply HuggingFace dataset caching to avoid repeated downloads
- Remove Levenshtein matching - eval script only sends requests and validates answers
2026-02-15 21:08:22 +02:00
Georgi Gerganov
c87af1d527 docs: update llama-eval-discussion.md with session work summary
Add summary of llama-server-simulator implementation work including
features, testing results, technical decisions, and refactoring.
2026-02-15 21:08:22 +02:00
Georgi Gerganov
23d4e21a81 examples: refactor test-simulator.sh for better readability
Extract repeating question string into TEST_QUESTION variable and
create make_request() helper function to reduce code duplication.
Add proper error handling for error responses.
2026-02-15 21:08:22 +02:00
Georgi Gerganov
07d5e1e0ea examples: add llama-server simulator for testing eval scripts
Add a standalone Python script that simulates a llama-server HTTP endpoint
for testing the eval script. The simulator:

- Implements /v1/chat/completions endpoint with OpenAI-compatible format
- Loads AIME dataset from HuggingFace with local caching
- Uses Levenshtein distance for intelligent question matching
- Supports configurable success rate for correct/wrong answer generation
- Provides debug logging for troubleshooting

Also includes test scripts and documentation for testing and understanding
the simulator functionality.
2026-02-15 21:08:22 +02:00
gatbontonpc
8839037528 add checkpointing 2026-02-15 21:08:22 +02:00
gatbontonpc
89cab3dbc5 Add readme 2026-02-15 21:08:22 +02:00
gatbontonpc
c2d83ca048 multi source llama-eval 2026-02-15 21:08:22 +02:00
gatbontonpc
c05df17ce3 working llama-eval mc and math suite 2026-02-15 21:08:19 +02:00
David Friehs
27b93cbd15 cuda: optimize iq2xxs/iq2xs/iq3xxs dequantization (#19624)
* cuda: optimize iq2xxs/iq2xs/iq3xxs dequantization

- load all 8 int8 for a grid position in one load
- calculate signs via popcnt instead of fetching from ksigns table
- broadcast signs to drop individual shift/mask

* cuda: iq2xxs: simplify sum scaling

express `(sum * scale + sum / 2) / 4` as `(sum * (scale * 2 + 1)) / 8`
express `((aux32 >> 28) * 2 + 1)` as `(aux32 >> 27 | 1)`

saves 3 registers for mul_mat_vec_q (152 -> 149) according to nsight
AFAICT no overflow can occur here as iq2xxs values are far too small

* uint -> uint32_t

error: identifier "uint" is undefined
2026-02-15 22:38:42 +05:30
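The two integer rewrites in this commit can be sanity-checked by brute force. The snippet below assumes non-negative sums and scales (where Python's floor division agrees with C's truncating division); it is a verification sketch, not code from the PR.

```python
# Verify (sum * scale + sum / 2) / 4 == (sum * (scale * 2 + 1)) / 8 and
# ((aux32 >> 28) * 2 + 1) == (aux32 >> 27 | 1) on a sample of inputs.
import random

for s in range(0, 2048):
    for c in range(0, 16):          # scale fits in 4 bits (aux32 >> 28)
        assert (s * c + s // 2) // 4 == (s * (c * 2 + 1)) // 8

random.seed(0)
for _ in range(100000):
    aux32 = random.getrandbits(32)
    assert (aux32 >> 28) * 2 + 1 == ((aux32 >> 27) | 1)

print("both rewrites hold on the tested range")
```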
Aaron Teo
6e67fd2144 docs: update s390x build docs (#19643) 2026-02-16 00:33:34 +08:00
Adrien Gallouët
9e118b97c4 build : remove LLAMA_HTTPLIB option (#19623)
This option was introduced as a workaround because cpp-httplib could not
build on visionOS. Since it has been fixed and now compiles on all platforms,
we can remove it and simplify many things.

Signed-off-by: Adrien Gallouët <angt@huggingface.co>
2026-02-15 15:38:50 +01:00
Daniel Bevenius
57088276d4 cmake : check if KleidiAI API has been fetched (#19640)
This commit addresses a build issue with the KleidiAI backend when
building multiple CPU backends. Commit
3a00c98584 ("cmake : fix KleidiAI install
target failure with EXCLUDE_FROM_ALL") introduced a change where
FetchContent_Populate is called instead of FetchContent_MakeAvailable,
where the latter does handle this case (it is idempotent but
FetchContent_Populate is not).

I missed this during my review and I should not have committed without
verifying the CI failure, sorry about that.
2026-02-15 13:59:38 +01:00
Georgi Gerganov
341bc7d23c context : fix output reorder with backend sampling (#19638) 2026-02-15 14:57:40 +02:00
Georgi Gerganov
08e6d914b8 ggml : avoid UB in gemm ukernel (#19642) 2026-02-15 14:56:35 +02:00
Aaron Teo
184c694f45 ggml-cpu: optimize ggml_vec_dot_bf16 for s390x (#19399) 2026-02-15 18:20:35 +08:00
Aman Gupta
684b36101c ggml-cpu: FA add GEMM microkernel (#19422)
* ggml-cpu: FA add GEMM microkernel

* add guard for sizeless vector types

* fix case where DV % GGML_F32_EPR !=0

* move memset out of the loop

* move another memset out of the loop

* use RM=4 for arm

* simd_gemm: convert everything to int

* convert everything to size_t to avoid warnings

* fixup

* add pragma for ignoring aggressive loop optimizations
2026-02-15 11:09:24 +05:30
SamareshSingh
3a00c98584 cmake : fix KleidiAI install target failure with EXCLUDE_FROM_ALL (#19581)
* cmake: fix KleidiAI install target failure with EXCLUDE_FROM_ALL

Fix for the bug #19501 by adding EXCLUDE_FROM_ALL to FetchContent_Declare. This properly excludes KleidiAI from both build and install targets, preventing install failures when GGML_CPU_KLEIDIAI=ON is used.

The KleidiAI source files are still compiled into libggml-cpu.so, preserving all functionality.

* addressed code review comments
2026-02-15 06:22:53 +01:00
Sigbjørn Skjæret
079feab9e3 convert : ensure all models handle new experts count (#19621)
* ensure all models handle new experts count

* revert removal for PhiMoeModel, does not inherit from base
2026-02-14 22:22:32 +01:00
Anav Prasad
01d8eaa28d mtmd : Add Nemotron Nano 12B v2 VL support (#19547)
* nemotron nano v2 vlm support added

* simplified code; addressed reviews

* pre-downsample position embeddings during GGUF conversion for fixed input size
2026-02-14 14:07:00 +01:00
Georgi Gerganov
1725e316c1 models : optimize qwen3next graph (#19375)
* models : optimizing qwen3next graph

* cont

* wip

* wip

* wip

* wip

* wip

* wip

* wip

* wip

* wip

* wip

* cont : remove redundant q, g chunking

* minor

* minor

* avoid passing masks around

* avoid concats during chunking

* naming + shapes

* update names and use prefix to disable CUDA graphs
2026-02-14 12:57:36 +02:00
Adrien Gallouët
b7742cf321 ggml : fix GGML_DEBUG with OpenMP (#19599)
last_graph is only available without OpenMP, but
ggml_graph_compute_thread() is called in both cases.

Signed-off-by: Adrien Gallouët <angt@huggingface.co>
2026-02-14 11:22:57 +01:00
41 changed files with 2839 additions and 604 deletions

View File

@@ -112,7 +112,6 @@ option(LLAMA_TOOLS_INSTALL "llama: install tools" ${LLAMA_TOOLS_INSTALL_
option(LLAMA_TESTS_INSTALL "llama: install tests" ON)
# 3rd party libs
option(LLAMA_HTTPLIB "llama: httplib for downloading functionality" ON)
option(LLAMA_OPENSSL "llama: use openssl to support HTTPS" ON)
option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured output in common utils" OFF)
@@ -197,9 +196,7 @@ add_subdirectory(src)
if (LLAMA_BUILD_COMMON)
add_subdirectory(common)
if (LLAMA_HTTPLIB)
add_subdirectory(vendor/cpp-httplib)
endif()
add_subdirectory(vendor/cpp-httplib)
endif()
if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)

View File

@@ -449,10 +449,9 @@ cmake -B build-visionos -G Xcode \
-DCMAKE_SYSTEM_NAME=visionOS \
-DCMAKE_OSX_SYSROOT=xros \
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xros \
-DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \
-DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \
-DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
-DLLAMA_OPENSSL=OFF \
-DLLAMA_HTTPLIB=OFF \
-DLLAMA_BUILD_SERVER=OFF \
-S .
cmake --build build-visionos --config Release -- -quiet
@@ -465,10 +464,9 @@ cmake -B build-visionos-sim -G Xcode \
-DCMAKE_SYSTEM_NAME=visionOS \
-DCMAKE_OSX_SYSROOT=xrsimulator \
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xrsimulator \
-DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \
-DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \
-DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
-DLLAMA_OPENSSL=OFF \
-DLLAMA_HTTPLIB=OFF \
-DLLAMA_BUILD_SERVER=OFF \
-S .
cmake --build build-visionos-sim --config Release -- -quiet

View File

@@ -112,11 +112,7 @@ endif()
# TODO: use list(APPEND LLAMA_COMMON_EXTRA_LIBS ...)
set(LLAMA_COMMON_EXTRA_LIBS build_info)
if (LLAMA_HTTPLIB)
target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_HTTPLIB)
set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} cpp-httplib)
endif()
set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} cpp-httplib)
if (LLAMA_LLGUIDANCE)
include(ExternalProject)

View File

@@ -19,9 +19,7 @@
#include <thread>
#include <vector>
#if defined(LLAMA_USE_HTTPLIB)
#include "http.h"
#endif
#ifndef __EMSCRIPTEN__
#ifdef __linux__
@@ -142,8 +140,6 @@ std::pair<std::string, std::string> common_download_split_repo_tag(const std::st
return {hf_repo, tag};
}
#if defined(LLAMA_USE_HTTPLIB)
class ProgressBar {
static inline std::mutex mutex;
static inline std::map<const ProgressBar *, int> lines;
@@ -768,30 +764,6 @@ std::string common_docker_resolve_model(const std::string & docker) {
}
}
#else
common_hf_file_res common_get_hf_file(const std::string &, const std::string &, bool, const common_header_list &) {
throw std::runtime_error("download functionality is not enabled in this build");
}
bool common_download_model(const common_params_model &, const std::string &, bool, const common_header_list &) {
throw std::runtime_error("download functionality is not enabled in this build");
}
std::string common_docker_resolve_model(const std::string &) {
throw std::runtime_error("download functionality is not enabled in this build");
}
int common_download_file_single(const std::string &,
const std::string &,
const std::string &,
bool,
const common_header_list &) {
throw std::runtime_error("download functionality is not enabled in this build");
}
#endif // defined(LLAMA_USE_HTTPLIB)
std::vector<common_cached_model_info> common_list_cached_models() {
std::vector<common_cached_model_info> models;
const std::string cache_dir = fs_get_cache_directory();

View File

@@ -2726,8 +2726,6 @@ class AfmoeModel(LlamaModel):
super().set_gguf_parameters()
# MoE parameters
if (n_experts := self.hparams.get("num_experts")) is not None:
self.gguf_writer.add_expert_count(n_experts)
if (n_shared_experts := self.hparams.get("num_shared_experts")) is not None:
self.gguf_writer.add_expert_shared_count(n_shared_experts)
if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
@@ -2749,7 +2747,7 @@ class AfmoeModel(LlamaModel):
# Handle expert weights - they're already merged in the HF format
# process the experts separately
if name.find("mlp.experts") != -1:
n_experts = self.hparams["num_experts"]
n_experts = self.find_hparam(["num_local_experts", "num_experts"])
assert bid is not None
if self._experts is None:
@@ -4074,6 +4072,87 @@ class InternVisionModel(MmprojModel):
yield from super().modify_tensors(data_torch, name, bid)
@ModelBase.register(
"NemotronH_Nano_VL_V2",
"RADIOModel",
)
class NemotronNanoV2VLModel(MmprojModel):
# ViT-Huge architecture parameters for RADIO v2.5-h
_vit_hidden_size = 1280
_vit_intermediate_size = 5120
_vit_num_layers = 32
_vit_num_heads = 16
def get_vision_config(self) -> dict[str, Any] | None:
# RADIO config doesn't have standard ViT parameters, so they need to be constructed manually
vision_config = self.global_config.get("vision_config")
if vision_config is None:
return None
# Add ViT-H parameters
vision_config = {
**vision_config,
"hidden_size": self._vit_hidden_size,
"intermediate_size": self._vit_intermediate_size,
"num_hidden_layers": self._vit_num_layers,
"num_attention_heads": self._vit_num_heads,
"image_size": self.global_config.get("force_image_size", 512),
}
return vision_config
def set_gguf_parameters(self):
if "image_mean" not in self.preprocessor_config:
self.preprocessor_config["image_mean"] = [0.485, 0.456, 0.406]
if "image_std" not in self.preprocessor_config:
self.preprocessor_config["image_std"] = [0.229, 0.224, 0.225]
super().set_gguf_parameters()
hparams = self.global_config
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.NEMOTRON_V2_VL)
self.gguf_writer.add_vision_attention_layernorm_eps(1e-6)
self.gguf_writer.add_vision_use_gelu(True)
downsample_ratio = hparams.get("downsample_ratio", 0.5)
self.gguf_writer.add_vision_projector_scale_factor(int(1.0 / downsample_ratio))
def tensor_force_quant(self, name, new_name, bid, n_dims):
if ".position_embd." in new_name or "pos_embed" in new_name:
return gguf.GGMLQuantizationType.F32
return super().tensor_force_quant(name, new_name, bid, n_dims)
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
if "input_conditioner" in name:
return
# RADIO's pos_embed doesn't have .weight suffix, but clip.cpp expects it
if "patch_generator.pos_embed" in name:
if not name.endswith(".weight"):
name += ".weight"
# Downsample position embeddings for fixed 512x512 image size
import torch.nn.functional as F
n_embd = self.hparams["hidden_size"]
image_size = self.global_config.get("force_image_size", 512)
patch_size = self.hparams["patch_size"]
target_patches_per_side = image_size // patch_size # 32
max_patches_per_side = int((data_torch.shape[1]) ** 0.5) # 128
if target_patches_per_side != max_patches_per_side:
# Reshape to grid, interpolate, flatten back
data_torch = data_torch.reshape(1, max_patches_per_side, max_patches_per_side, n_embd)
data_torch = data_torch.permute(0, 3, 1, 2).float() # [1, n_embd, 128, 128]
data_torch = F.interpolate(data_torch, size=(target_patches_per_side, target_patches_per_side),
mode='bilinear', align_corners=True)
data_torch = data_torch.permute(0, 2, 3, 1) # [1, 32, 32, n_embd]
data_torch = data_torch.reshape(1, target_patches_per_side * target_patches_per_side, n_embd)
# Reshape linear patch embedding to conv2d format for ggml_conv_2d
# From [n_embd, patch_size*patch_size*3] to [n_embd, 3, patch_size, patch_size]
if "patch_generator.embedder" in name:
patch_size = self.hparams["patch_size"]
n_embd = self.hparams["hidden_size"]
data_torch = data_torch.reshape(n_embd, 3, patch_size, patch_size)
if name.startswith("vision_model.radio_model.model.") or name.startswith("mlp1."):
yield from super().modify_tensors(data_torch, name, bid)
@ModelBase.register("WavTokenizerDec")
class WavTokenizerDecModel(TextModel):
model_arch = gguf.MODEL_ARCH.WAVTOKENIZER_DEC
@@ -4116,8 +4195,6 @@ class Qwen2MoeModel(TextModel):
def set_gguf_parameters(self):
super().set_gguf_parameters()
if (n_experts := self.hparams.get("num_experts")) is not None:
self.gguf_writer.add_expert_count(n_experts)
if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
logger.info(f"gguf: expert feed forward length = {moe_intermediate_size}")
@@ -4162,7 +4239,7 @@ class Qwen2MoeModel(TextModel):
return
if name.find("experts") != -1:
n_experts = self.hparams["num_experts"]
n_experts = self.find_hparam(["num_local_experts", "num_experts"])
assert bid is not None
if self._experts is None:
@@ -4913,13 +4990,13 @@ class PhiMoeModel(Phi3MiniModel):
def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_expert_used_count(self.hparams["num_experts_per_tok"])
self.gguf_writer.add_expert_count(self.hparams["num_local_experts"])
self.gguf_writer.add_expert_used_count(self.find_hparam(["num_experts_per_tok", "num_experts_per_token"]))
self.gguf_writer.add_expert_count(self.find_hparam(["num_local_experts", "num_experts"]))
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
# process the experts separately
if name.find("block_sparse_moe.experts") != -1:
n_experts = self.hparams["num_local_experts"]
n_experts = self.find_hparam(["num_local_experts", "num_experts"])
assert bid is not None
if self._experts is None:
@@ -5331,7 +5408,7 @@ class KimiLinearModel(TextModel):
# process the experts separately
if name.find("block_sparse_moe.experts") != -1:
n_experts = self.find_hparam(["num_local_experts", "num_experts"], optional=False)
n_experts = self.find_hparam(["num_local_experts", "num_experts"])
assert bid is not None
if self._experts is None:
@@ -5926,12 +6003,13 @@ class NomicBertModel(BertModel):
if "mlp.experts.bias" in name:
return # Explicitly return.
n_experts = self.find_hparam(["num_local_experts", "num_experts"])
if "mlp.experts.mlp.w1" in name:
data_torch = data_torch.view(self.hparams["num_experts"], self.hparams["n_inner"], self.hparams["n_embd"])
data_torch = data_torch.view(n_experts, self.hparams["n_inner"], self.hparams["n_embd"])
name += ".weight"
if "mlp.experts.mlp.w2" in name:
data_torch = data_torch.view(self.hparams["num_experts"], self.hparams["n_inner"], self.hparams["n_embd"])
data_torch = data_torch.view(n_experts, self.hparams["n_inner"], self.hparams["n_embd"])
data_torch = data_torch.transpose(1, 2)
name += ".weight"
@@ -5941,7 +6019,6 @@ class NomicBertModel(BertModel):
super().set_gguf_parameters()
if self.is_moe:
self.gguf_writer.add_moe_every_n_layers(self.hparams["moe_every_n_layers"])
self.gguf_writer.add_expert_count(self.hparams["num_experts"])
self.gguf_writer.add_expert_used_count(self.hparams["moe_top_k"])
def _is_tokenizer_xlmroberta(self) -> bool:
@@ -7055,6 +7132,8 @@ class Mamba2Model(TextModel):
if hparams is None:
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
hparams = json.load(f)
if "llm_config" in hparams:
hparams["text_config"] = hparams["llm_config"]
super().__init__(dir_model, *args, hparams=hparams, **kwargs)
self.d_model = self.find_hparam(["hidden_size", "d_model", "dim"])
self.d_inner = self.find_hparam(["mamba_d_ssm", "intermediate_size", "d_inner"], optional=True) or 2 * self.d_model
@@ -7176,8 +7255,8 @@ class JambaModel(TextModel):
self.gguf_writer.add_ssm_state_size(d_state)
self.gguf_writer.add_ssm_time_step_rank(dt_rank)
self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps)
self.gguf_writer.add_expert_count(self.hparams["num_experts"])
self.gguf_writer.add_expert_used_count(self.hparams["num_experts_per_tok"])
self.gguf_writer.add_expert_count(self.find_hparam(["num_local_experts", "num_experts"]))
self.gguf_writer.add_expert_used_count(self.find_hparam(["num_experts_per_tok", "num_experts_per_token"]))
self.gguf_writer.add_file_type(self.ftype)
_experts: list[dict[str, Tensor]] | None = None
@@ -7195,7 +7274,7 @@ class JambaModel(TextModel):
# process the experts separately
if ".feed_forward.experts." in name:
n_experts = self.hparams["num_experts"]
n_experts = self.find_hparam(["num_local_experts", "num_experts"])
assert bid is not None
@@ -7343,8 +7422,6 @@ class OlmoeModel(TextModel):
def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_layer_norm_rms_eps(1e-5)
if (n_experts := self.hparams.get("num_experts")) is not None:
self.gguf_writer.add_expert_count(n_experts)
_experts: list[dict[str, Tensor]] | None = None
@@ -7352,7 +7429,7 @@ class OlmoeModel(TextModel):
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
# process the experts separately
if name.find("experts") != -1:
n_experts = self.hparams["num_experts"]
n_experts = self.find_hparam(["num_local_experts", "num_experts"])
assert bid is not None
if self._experts is None:
@@ -7933,10 +8010,6 @@ class MiniMaxM2Model(TextModel):
model_arch = gguf.MODEL_ARCH.MINIMAXM2
_experts_cache: dict[int, dict[str, Tensor]] = {}
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.hparams["num_experts"] = self.hparams["num_local_experts"]
def set_gguf_parameters(self):
super().set_gguf_parameters()
@@ -7949,7 +8022,7 @@ class MiniMaxM2Model(TextModel):
# merge expert weights
if 'experts' in name:
n_experts = self.hparams["num_experts"]
n_experts = self.find_hparam(["num_local_experts", "num_experts"])
assert bid is not None
expert_cache = self._experts_cache.setdefault(bid, {})
@@ -9154,7 +9227,6 @@ class ExaoneMoEModel(Exaone4Model):
def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_expert_count(self.hparams["num_experts"])
moe_intermediate_size = self.hparams["moe_intermediate_size"]
num_shared_experts = self.hparams["num_shared_experts"]
self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
@@ -9195,7 +9267,7 @@ class ExaoneMoEModel(Exaone4Model):
name = name.replace("e_score_correction_bias", "e_score_correction.bias")
if name.find("mlp.experts") != -1:
n_experts = self.hparams["num_experts"]
n_experts = self.find_hparam(["num_local_experts", "num_experts"])
assert bid is not None
if self._experts is None:
@@ -9346,7 +9418,7 @@ class GraniteHybridModel(Mamba2Model, GraniteMoeModel):
# case, the model architecture needs to be updated to a standard
# "granite" or "granitemoe" model
if not self._ssm_layers:
has_experts = self.find_hparam(["num_experts_per_tok"], optional=True)
has_experts = self.find_hparam(["num_experts_per_tok", "num_experts_per_token"], optional=True)
new_arch = (
gguf.MODEL_ARCH.GRANITE_MOE
if has_experts else
@@ -9542,6 +9614,14 @@ class NemotronHModel(GraniteHybridModel):
self.gguf_writer.add_add_bos_token(True)
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
# Skip vision model and projector tensors for VLM models (handled by mmproj) (e.g., Nemotron Nano 12B v2 VL)
if name.startswith(("vision_model.", "mlp1.")):
return
# Strip language_model. prefix for VLM models (e.g., Nemotron Nano 12B v2 VL)
if name.startswith("language_model."):
name = name[len("language_model."):]
if self.is_moe and bid is not None:
if name.endswith("mixer.gate.e_score_correction_bias"):
new_name = name.replace("e_score_correction_bias", "e_score_correction.bias")
@@ -9636,7 +9716,6 @@ class BailingMoeModel(TextModel):
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
self.gguf_writer.add_expert_weights_scale(1.0)
self.gguf_writer.add_expert_count(hparams["num_experts"])
self.gguf_writer.add_expert_shared_count(hparams["num_shared_experts"])
self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"])
@@ -9670,7 +9749,7 @@ class BailingMoeModel(TextModel):
yield from super().modify_tensors(v,self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), bid)
return
elif name.find("mlp.experts") != -1:
n_experts = self.hparams["num_experts"]
n_experts = self.find_hparam(["num_local_experts", "num_experts"])
assert bid is not None
if self._experts is None:
@@ -9741,7 +9820,6 @@ class BailingMoeV2Model(TextModel):
self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
self.gguf_writer.add_expert_shared_feed_forward_length(hparams.get("moe_shared_expert_intermediate_size", hparams["moe_intermediate_size"] * hparams["num_shared_experts"]))
self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"])
self.gguf_writer.add_expert_count(hparams["num_experts"])
self.gguf_writer.add_expert_shared_count(hparams["num_shared_experts"])
self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"])
@@ -9752,7 +9830,7 @@ class BailingMoeV2Model(TextModel):
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
if "mlp.experts" in name:
n_experts = self.hparams["num_experts"]
n_experts = self.find_hparam(["num_local_experts", "num_experts"])
assert bid is not None
if self._experts is None:
@@ -9798,8 +9876,6 @@ class GroveMoeModel(TextModel):
def set_gguf_parameters(self):
super().set_gguf_parameters()
if (n_experts := self.hparams.get("num_experts")) is not None:
self.gguf_writer.add_expert_count(n_experts)
if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None:
self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
logger.info(f"gguf: expert feed forward length = {moe_intermediate_size}")
@@ -9820,7 +9896,7 @@ class GroveMoeModel(TextModel):
# process the experts separately
if name.find("chunk_experts") != -1:
n_experts = self.hparams["num_experts"] // 2 # see add_experts_per_group
n_experts = self.find_hparam(["num_local_experts", "num_experts"]) // 2 # see add_experts_per_group
assert bid is not None
if self._chunk_experts is None:
@@ -9847,7 +9923,7 @@ class GroveMoeModel(TextModel):
else:
return
elif name.find("experts") != -1:
n_experts = self.hparams["num_experts"]
n_experts = self.find_hparam(["num_local_experts", "num_experts"])
assert bid is not None
if self._experts is None:
@@ -10240,7 +10316,6 @@ class HunYuanMoEModel(TextModel):
super().set_gguf_parameters()
hparams = self.hparams
self.gguf_writer.add_expert_count(hparams["num_experts"])
self.gguf_writer.add_expert_shared_feed_forward_length(hparams["intermediate_size"])
moe_intermediate_size = hparams["moe_intermediate_size"]
@@ -10283,7 +10358,7 @@ class HunYuanMoEModel(TextModel):
return
if name.find("mlp.experts") != -1:
n_experts = self.hparams["num_experts"]
n_experts = self.find_hparam(["num_local_experts", "num_experts"])
assert bid is not None
if self._experts is None:
@@ -10325,16 +10400,9 @@ class LLaDAMoEModel(TextModel):
def set_gguf_parameters(self):
super().set_gguf_parameters()
if (n_experts := self.hparams.get("num_experts")) is not None:
self.gguf_writer.add_expert_count(n_experts)
if (expert_intermediate_size := self.hparams.get("expert_intermediate_size")) is not None:
self.gguf_writer.add_expert_feed_forward_length(expert_intermediate_size)
# number of experts used per token (top-k)
if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
self.gguf_writer.add_expert_used_count(n_experts_used)
self.gguf_writer.add_mask_token_id(156895)
self.gguf_writer.add_causal_attention(False)
self.gguf_writer.add_diffusion_shift_logits(False)
@@ -10345,7 +10413,7 @@ class LLaDAMoEModel(TextModel):
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
# process the experts separately
if name.find("experts") != -1:
n_experts = self.hparams["num_experts"]
n_experts = self.find_hparam(["num_local_experts", "num_experts"])
assert bid is not None
if self._experts is None:
@@ -10682,7 +10750,6 @@ class LFM2MoeModel(TextModel):
super().set_gguf_parameters()
self.gguf_writer.add_expert_count(self.hparams["num_experts"])
self.gguf_writer.add_expert_feed_forward_length(self.hparams["moe_intermediate_size"])
self.gguf_writer.add_leading_dense_block_count(self.hparams["num_dense_layers"])
self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
@@ -10703,7 +10770,7 @@ class LFM2MoeModel(TextModel):
# merge expert weights
if 'experts' in name:
n_experts = self.hparams["num_experts"]
n_experts = self.find_hparam(["num_local_experts", "num_experts"])
assert bid is not None
expert_cache = self._experts_cache.setdefault(bid, {})
@@ -10813,9 +10880,9 @@ class SmallThinkerModel(TextModel):
def set_gguf_parameters(self):
super().set_gguf_parameters()
if (n_experts := self.hparams.get("num_experts", self.hparams.get("moe_num_primary_experts"))) is not None:
if (n_experts := self.hparams.get("moe_num_primary_experts")) is not None:
self.gguf_writer.add_expert_count(n_experts)
if (n_experts_used := self.hparams.get("num_experts_per_tok", self.hparams.get("moe_num_active_primary_experts"))) is not None:
if (n_experts_used := self.hparams.get("moe_num_active_primary_experts")) is not None:
self.gguf_writer.add_expert_used_count(n_experts_used)
if (moe_intermediate_size := self.hparams.get("moe_ffn_hidden_size")) is not None:
self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
@@ -10840,7 +10907,7 @@ class SmallThinkerModel(TextModel):
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
# process the experts separately
if name.find("experts") != -1:
n_experts = self.hparams.get("num_experts", self.hparams.get("moe_num_primary_experts"))
n_experts = self.hparams.get("moe_num_primary_experts") or self.find_hparam(["num_local_experts", "num_experts"])
assert bid is not None
if self._experts is None:
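The recurring `self.find_hparam(["num_local_experts", "num_experts"])` pattern in the hunks above boils down to a fallback lookup over several candidate hyperparameter names. A standalone illustration (not the converter's actual implementation):

```python
# Return the value of the first key present in the hparams dict;
# raise unless the lookup is marked optional.
from typing import Any, Optional

def find_hparam(hparams: dict, keys: list, optional: bool = False) -> Optional[Any]:
    for key in keys:
        if key in hparams:
            return hparams[key]
    if optional:
        return None
    raise KeyError(f"none of {keys} found in hparams")

hparams = {"num_local_experts": 16}
print(find_hparam(hparams, ["num_local_experts", "num_experts"]))    # 16
print(find_hparam(hparams, ["num_experts_per_tok"], optional=True))  # None
```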

View File

@@ -242,10 +242,10 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl
|------------|-------------|------|-------|
| FP32 | ✅ | ✅ | ❓ |
| FP16 | ✅ | ✅ | ❓ |
| BF16 | 🚫 | ✅ | ❓ |
| BF16 | | ✅ | ❓ |
| Q4_0 | ✅ | ❓ | ❓ |
| Q4_1 | ✅ | ❓ | ❓ |
| MXFP4 | 🚫 | ❓ | ❓ |
| MXFP4 | | ❓ | ❓ |
| Q5_0 | ✅ | ❓ | ❓ |
| Q5_1 | ✅ | ❓ | ❓ |
| Q8_0 | ✅ | ❓ | ❓ |
@@ -272,4 +272,4 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl
- 🚫 - acceleration unavailable, will still run using scalar implementation
- ❓ - acceleration unknown, please contribute if you can test it yourself
Last Updated by **Aaron Teo (aaron.teo1@ibm.com)** on Sep 7, 2025.
Last Updated by **Aaron Teo (aaron.teo1@ibm.com)** on Feb 15, 2026.

View File

@@ -0,0 +1,190 @@
# llama-eval Codebase Guidelines
## Overview
This directory contains Python evaluation tools for llama.cpp:
- `llama-eval.py` - Main evaluation tool with multiple datasets (AIME, AIME2025, GSM8K, GPQA)
- `llama-server-simulator.py` - Flask-based server simulator for testing
- `test-simulator.sh` - Test script for the simulator
## Build/Run Commands
### Virtual Environment
The project uses a virtual environment located at `venv/`:
```bash
source venv/bin/activate
```
### Running the Main Evaluator
```bash
python llama-eval.py \
--server http://127.0.0.1:8013 \
--model gpt-oss-20b-hf-low \
--dataset aime \
--n_cases 10 \
--grader-type llm \
--seed 42
```
### Running the Simulator (for testing)
```bash
python llama-server-simulator.py --port 8033 --success-rate 0.8
```
### Running Tests
```bash
./test-simulator.sh
```
## Code Style Guidelines
### Imports
- Standard library imports first (argparse, json, os, re, subprocess, sys, time)
- Third-party imports (requests, tqdm, datasets, flask) after standard library
- Relative imports not used
- Group imports by category with blank line between groups
### Formatting
- 4-space indentation
- Max line length: 125 characters (per parent project's .flake8)
- Use double quotes for strings
- Use triple double quotes for docstrings
- Binary operators at the beginning of continued lines
### Naming Conventions
- Classes: PascalCase (e.g., `AimeDataset`, `Grader`, `Processor`)
- Functions: snake_case (e.g., `normalize_number`, `get_prompt`)
- Variables: snake_case (e.g., `question_text`, `correct_count`)
- Constants: UPPER_SNAKE_CASE (e.g., `GRADER_PATTERNS`, `TEMPLATE_REGISTRY`)
- Private methods: prefix with underscore (e.g., `_load_dataset`, `_grade_regex`)
### Types
- Use type hints for all function signatures
- Import from `typing` module: `Dict`, `List`, `Optional`, `Any`, `Tuple`
- Use `@dataclass` for data structures
- Prefer `Optional[T]` over `Union[T, None]`
### Error Handling
- Use try/except for network requests and file operations
- Return `None` or `False` on errors when appropriate
- Use `ValueError` for invalid arguments
- Use `FileNotFoundError` for missing files
- CLI scripts should handle exceptions gracefully
### Dataclasses
- Use `@dataclass` for structured data
- Define fields with explicit types
- Use `Optional[T]` for nullable fields
- Provide default values where appropriate
### String Formatting
- Use f-strings for formatting (Python 3.6+)
- Use triple double quotes for multi-line strings
- Escape backslashes in regex patterns: `r'\\boxed{(\d+)}'`
### File Paths
- Use `pathlib.Path` instead of string paths
- Create directories with `mkdir(parents=True, exist_ok=True)`
- Use `Path.home()` for user home directory
### Logging
- Use `print()` for user-facing output
- Use `sys.stderr` for debug logging
- Simulator writes debug logs to `/tmp/simulator-debug.log`
### Testing
- Test script uses bash with `set -e` for strict error handling
- Simulator runs in background with PID tracking
- Tests verify correct answers, error cases, and edge cases
- Use `curl` for HTTP testing in shell scripts
### Whitespace Cleanup
- Remove trailing whitespace from all lines
- When making edits, do not leave trailing whitespace
## Dataset Support
### AIME Dataset
- 90 questions from 2025 AIME competition
- Answers in `\boxed{answer}` format
- Supports regex, CLI, and LLM grading
### AIME2025 Dataset
- 30 questions from 2025 AIME I & II
- Answers in `\boxed{answer}` format
- Requires loading two config parts
### GSM8K Dataset
- 7473 math word problems
- Answers are numeric values with a `####` separator
- Supports regex, CLI, and LLM grading
### GPQA Dataset
- 198 questions from GPQA Diamond
- Multiple choice with shuffled options (A, B, C, D)
- **Requires LLM grader** (returns letter A/B/C/D)
## Grading Types
### Regex Grader
- Built-in patterns per dataset
- Prioritizes `\boxed{}` for AIME datasets
- Extracts last number for GSM8K
### CLI Grader
- External script interface
- Call: `grader.sh --answer <pred> --expected <gold>`
- Exit code 0 = correct, non-zero = incorrect
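A minimal grader conforming to this interface could look like the following sketch; the string comparison is only a placeholder for real answer checking.

```python
#!/usr/bin/env python3
# Hypothetical CLI grader: exit code 0 means correct, non-zero means incorrect.
import argparse
import sys

def main() -> int:
    parser = argparse.ArgumentParser()
    parser.add_argument("--answer", required=True, help="model prediction")
    parser.add_argument("--expected", required=True, help="gold answer")
    args = parser.parse_args()
    ok = args.answer.strip().lower() == args.expected.strip().lower()
    return 0 if ok else 1

if __name__ == "__main__":
    sys.exit(main())
```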
### LLM Grader
- Uses judge model for answer extraction
- Includes few-shot examples
- Case-insensitive comparison
- Required for GPQA
## Configuration
### Sampling Parameters (Optional)
- `--temperature`: Sampling temperature
- `--top-k`: Top K sampling
- `--top-p`: Top P sampling
- `--min-p`: Min P sampling
- Only passed to API if explicitly specified
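In practice this amounts to attaching a sampling parameter to the request body only when it was given on the command line; a sketch with assumed argument names:

```python
# Illustrative request builder: sampling parameters are only included in the
# payload when explicitly specified (i.e. not None).
def build_request_body(model: str, prompt: str,
                       temperature=None, top_k=None, top_p=None, min_p=None) -> dict:
    body = {"model": model, "messages": [{"role": "user", "content": prompt}]}
    for key, value in {"temperature": temperature, "top_k": top_k,
                       "top_p": top_p, "min_p": min_p}.items():
        if value is not None:
            body[key] = value
    return body

print(build_request_body("llama", "What is 2+2?", temperature=0.2))
```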
### Default Values
- `--n_predict`: -1 (infinite)
- `--grader-type`: llm
- `--seed`: 1234
- `--threads`: 32
- `--output`: llama-eval-state.json
## Output Format
### Progress Table
- Shows task ID, dataset, prompt (truncated to 43 chars), expected answer, status
- Uses `tqdm` for progress bars
### Results Summary
- Format: `Results: X/Y correct (Z%)`
- Displayed after all tasks complete
### JSON Output
- Complete eval state saved to output file
- Contains: task IDs, correctness, prompts, extracted answers, sampling config
- Uses `dataclasses.asdict()` for serialization
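A minimal sketch of that serialization step, reusing the `EvalState` fields shown later in this changeset (the field values here are made up):

```python
import json
from dataclasses import dataclass, asdict
from typing import Any, Dict, List

@dataclass
class EvalState:
    id: str
    tasks: List[str]
    task_states: Dict[str, Dict[str, Any]]
    sampling_config: Dict[str, Any]

state = EvalState(id="demo", tasks=["aime"], task_states={},
                  sampling_config={"seed": 1234})
# write the complete eval state as JSON, as the eval script does
with open("llama-eval-state.json", "w") as f:
    json.dump(asdict(state), f, indent=2)
```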
## HuggingFace Datasets
- Cache directory: `~/.cache/huggingface/datasets`
- Set via `HF_DATASETS_CACHE` environment variable
- Telemetry disabled via `HF_HUB_DISABLE_TELEMETRY=1`
- Datasets loaded with `datasets.load_dataset()`
## Flask Simulator
- Runs on configurable port (default: 5000)
- Endpoint: `/v1/chat/completions` (OpenAI-compatible)
- Uses Dice coefficient for question matching
- Configurable success rate for testing
- Debug logs to `/tmp/simulator-debug.log`

View File

@@ -0,0 +1,94 @@
# llama-eval Implementation Summary
## Overview
Simple evaluation tool for llama.cpp with support for multiple datasets (AIME, GSM8K, GPQA) and flexible grading (regex, CLI, LLM).
## Key Features
- **Multiple Datasets**: AIME, GSM8K, GPQA with proper answer extraction
- **Flexible Grading**: Regex, CLI, or LLM-based grading
- **Parallel Processing**: Configurable thread count for concurrent requests
- **Sampling Parameters**: Temperature, Top K, Top P, Min P (optional)
- **Real-time Feedback**: Progress tracking with detailed output
- **JSON Output**: Complete eval state saved for debugging
- **GPQA Support**: Answer shuffling with reproducible results
## Architecture
### Eval State
```python
@dataclass
class EvalState:
id: str
tasks: List[str]
task_states: Dict[str, Dict[str, Any]]
sampling_config: Dict[str, Any]
```
### Processor
- Handles processing, grading, and state management
- Thread-safe concurrent execution
- Configurable sampling parameters
### Grader
- Abstract grading interface supporting multiple types
- Regex grader with dataset-specific patterns
- CLI grader with external script interface
- LLM grader with configurable server and model
### Datasets
- `AimeDataset`: 90 AIME 2025 questions
- `Aime2025Dataset`: 30 AIME 2025 I & II questions
- `Gsm8kDataset`: 7473 math word problems
- `GpqaDataset`: 198 GPQA Diamond questions with shuffling
## Configuration
### Sampling Parameters (Optional)
- `--temperature`: Sampling temperature
- `--top-k`: Top K sampling
- `--top-p`: Top P sampling
- `--min-p`: Min P sampling
- Only passed if explicitly specified
### Grading Types
- **regex**: Built-in patterns for each dataset
- **cli**: External script with `--answer` and `--expected` args
- **llm**: LLM-based extraction with few-shot examples and configurable server/model
### Dataset Requirements
- **AIME**: Supports regex, CLI, or LLM grader
- **AIME2025**: Supports regex, CLI, or LLM grader
- **GSM8K**: Supports regex, CLI, or LLM grader
- **GPQA**: Requires LLM grader
## Output Format
### Progress Table
```
Task ID Dataset Prompt (first 43 chars) Expected Status
aime_000_001 AIME Complete the following reactions and sel... A pending
```
### Results Summary
```
============================================================
Results: 8/10 correct (80.0%)
============================================================
```
### JSON Output
Complete eval state with task IDs, correctness, prompts, extracted answers, and sampling configuration.
## Technical Details
- Default max tokens: -1 (infinite)
- Default grader type: llm
- Default seed: 1234
- Default threads: 32
- Prompt truncation: First 43 chars + padding + "..."
- Response truncation: Last 10 lines for grading
- GPQA requires LLM grader (returns letter A/B/C/D)
- Judge model defaults to evaluated model if not specified
- Sample answers defined in SAMPLE_ANSWERS dict for few-shot learning
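The truncation rules above amount to something like the following; the exact padding and formatting are a guess.

```python
def truncate_prompt(prompt: str, width: int = 43) -> str:
    # first 43 chars + padding + "..."
    return prompt[:width].ljust(width) + "..."

def truncate_response_for_grading(response: str, n_lines: int = 10) -> str:
    # keep only the last 10 lines of the model response
    return "\n".join(response.splitlines()[-n_lines:])
```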

View File

@@ -0,0 +1,112 @@
# llama-eval Evaluation Tool
Simple evaluation tool for llama.cpp with support for multiple datasets.
## Features
- **Multiple Datasets**: AIME, GSM8K, GPQA
- **Flexible Grading**: Regex, CLI, or LLM-based grading
- **Parallel Processing**: Configurable thread count
- **Real-time Feedback**: Progress tracking with detailed output
- **Sampling Parameters**: Temperature, Top K, Top P, Min P
- **JSON Output**: Complete eval state saved for debugging
## Usage
```bash
python llama-eval.py \
--server http://127.0.0.1:8013 \
--model gpt-oss-20b-hf-low \
--judge-model gpt-oss-20b-hf-medium \
--dataset aime \
--n_cases 10 \
--grader-type llm \
--seed 42
```
## CLI Arguments
- `--server`: llama-server URL (default: http://127.0.0.1:8013)
- `--model`: Model name for evaluation (default: llama)
- `--judge-model`: Model name for LLM judge (default: same as main model)
- `--judge-server`: Server URL for LLM judge (default: same as main server)
- `--dataset`: Dataset type (aime, aime2025, gsm8k, gpqa)
- `--n_cases`: Number of cases to evaluate (default: all)
- `--n_predict`: Max tokens to predict per prompt (default: -1, infinite)
- `--temperature`: Sampling temperature (default: not passed)
- `--top-k`: Top K sampling (default: not passed)
- `--top-p`: Top P sampling (default: not passed)
- `--min-p`: Min P sampling (default: not passed)
- `--threads`: Number of threads for parallel requests (default: 32)
- `--verbose`: Show detailed output for each case
- `--output`: Output file for eval state (default: llama-eval-state.json)
- `--grader-type`: Grader type (regex, cli, llm, default: llm)
- `--grader-script`: Path to CLI grader script (required for --grader-type cli)
- `--seed`: Random seed for shuffling (default: 1234)
## Datasets
### AIME
- 90 questions from 2025 AIME competition
- Answers in boxed format: `\boxed{answer}`
- Requires regex grader or LLM grader
### AIME2025
- 30 questions from 2025 AIME I & II competitions
- Answers in boxed format: `\boxed{answer}`
- Supports regex, CLI, or LLM grader
### GSM8K
- 7473 math word problems
- Answers are numeric values
- Requires regex grader or LLM grader
### GPQA
- 198 questions from GPQA Diamond dataset
- Multiple choice with shuffled options
- Requires LLM grader (returns letter A, B, C, or D)
## Grading Types
### Regex Grader
Built-in patterns for different datasets:
- AIME: `\boxed{(\d+)}|\b(\d+)\b`
- AIME2025: `\boxed{(\d+)}|\b(\d+)\b`
- GSM8K: `\b(\d+)\b`
- GPQA: Letter extraction (A, B, C, D)
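One way to apply these patterns, prioritizing `\boxed{}` and falling back to the last bare number as described in AGENTS.md; illustrative only, not the tool's exact extraction code:

```python
import re

def extract_answer(text: str):
    m = re.search(r"\\boxed\{(\d+)\}", text)  # prefer \boxed{...}
    if m:
        return m.group(1)
    numbers = re.findall(r"\b\d+\b", text)    # otherwise take the last number
    return numbers[-1] if numbers else None

print(extract_answer(r"... so the result is \boxed{116}."))  # 116
print(extract_answer("The final answer is 72"))              # 72
```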
### CLI Grader
External script interface:
```bash
./grader.sh --answer <pred> --expected <gold>
```
Returns exit code 0 if correct, non-zero if incorrect.
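On the caller side, the eval script would invoke such a grader roughly like this (a sketch; the 30-second timeout matches the value mentioned in the commit history):

```python
import subprocess

def grade_with_cli(grader_script: str, pred: str, gold: str) -> bool:
    try:
        result = subprocess.run([grader_script, "--answer", pred, "--expected", gold],
                                timeout=30)
        return result.returncode == 0   # exit code 0 == correct
    except subprocess.TimeoutExpired:
        return False
```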
### LLM Grader
Uses LLM to extract and compare answers:
- Configurable server and model
- Includes few-shot examples from sample answers
- Case-insensitive comparison
- Required for GPQA dataset
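Sketch of an LLM-grading round trip against an OpenAI-compatible endpoint; the URL handling, model name, and prompt wording are illustrative, not the tool's actual judge prompt:

```python
import requests

def grade_with_llm(judge_server: str, judge_model: str,
                   response_text: str, expected: str) -> bool:
    prompt = ("Extract the final answer from the response below. "
              "Reply with the answer only.\n\n" + response_text)
    r = requests.post(f"{judge_server}/v1/chat/completions",
                      json={"model": judge_model,
                            "messages": [{"role": "user", "content": prompt}]},
                      timeout=120)
    extracted = r.json()["choices"][0]["message"]["content"].strip()
    return extracted.lower() == expected.strip().lower()  # case-insensitive
```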
## Output
### Progress Table
```
Task ID Dataset Prompt (first 43 chars) Expected Status
aime_000_001 AIME Complete the following reactions and sel... A pending
```
### Results
```
============================================================
Results: 8/10 correct (80.0%)
============================================================
```
### JSON Output
Complete eval state saved to output file with:
- Task IDs and correctness status
- Prompts and extracted answers
- Sampling configuration
- Processing metadata

examples/llama-eval/llama-eval.py (executable file, 1229 lines)

File diff suppressed because it is too large

View File

@@ -0,0 +1,36 @@
# llama-server-simulator
Standalone Python script simulating llama-server HTTP endpoint for testing.
## Features
- HTTP Server with OpenAI-compatible `/v1/chat/completions` endpoint
- AIME Dataset Integration - Loads 90 questions from HuggingFace
- Intelligent Question Matching - Uses exact matching, LaTeX removal, and Levenshtein distance
- Configurable Success Rate - Control correct/wrong answer generation (0-1)
- Debug Logging - Troubleshoot matching issues
## Usage
```bash
python llama-server-simulator.py --success-rate 0.8
```
## Arguments
- `--success-rate`: Probability of returning correct answer (0.0-1.0, default: 0.8)
- `--port`: Server port (default: 8033)
- `--debug`: Enable debug logging (default: False)
## Testing
```bash
./test-simulator.sh
```
## Implementation Details
- Uses Levenshtein distance for partial matching (threshold: 0.3)
- Automatic caching via HuggingFace datasets library
- Wrong answers generated by incrementing expected answer
- Debug output written to stderr

View File

@@ -0,0 +1,283 @@
#!/usr/bin/env python3
import argparse
import json
import random
import re
import time
import sys
import os
from typing import Dict, List, Optional
from dataclasses import dataclass, asdict
from pathlib import Path
import datasets
from flask import Flask, request, jsonify
# Set cache directory for HuggingFace datasets
cache_dir = Path.home() / ".cache" / "huggingface" / "datasets"
cache_dir.mkdir(parents=True, exist_ok=True)
os.environ["HF_DATASETS_CACHE"] = str(cache_dir)
def dice(s1: str, s2: str) -> float:
"""Calculate Dice coefficient between two strings based on bigram overlap."""
if not s1 and not s2:
return 1.0
def _bigrams(s: str):
return [s[i : i + 2] for i in range(len(s) - 1)]
bigrams1 = _bigrams(s1)
bigrams2 = _bigrams(s2)
if not bigrams1 and not bigrams2:
return 1.0
from collections import Counter
freq1 = Counter(bigrams1)
freq2 = Counter(bigrams2)
intersection = sum(min(freq1[bg], freq2[bg]) for bg in freq1)
dice_coeff = 2 * intersection / (len(bigrams1) + len(bigrams2))
return dice_coeff
def debug_log(message: str):
"""Log debug messages to both stdout and a file"""
print(message, file=sys.stderr)
with open("/tmp/simulator-debug.log", "a") as f:
f.write(message + "\n")
app = Flask(__name__)
@dataclass
class EvalState:
id: str
tasks: List[str]
task_states: Dict[str, Dict]
sampling_config: Dict
def normalize_number(s: str) -> Optional[int]:
match = re.match(r"\d+", s) # match digits from the start
if not match:
return None
return int(match.group(0))
class AimeDataset:
def __init__(self, split: str = "train"):
self.split = split
self.questions: List[Dict] = []
self._load_dataset()
def _load_dataset(self):
print(f"Loading AIME dataset (split: {self.split})...")
cache_path = Path.home() / ".cache" / "huggingface" / "datasets" / "AI-MO___aimo-validation-aime" / "default" / "0.0.0"
if cache_path.exists():
print(f"Using cached dataset from {cache_path}")
ds = datasets.load_dataset("AI-MO/aimo-validation-aime", split=self.split, cache_dir=str(cache_path))
else:
ds = datasets.load_dataset("AI-MO/aimo-validation-aime", split=self.split)
self.questions = list(ds)
print(f"AIME dataset loaded: {len(self.questions)} questions")
def find_question(self, request_text: str) -> Optional[Dict]:
best_match = None
best_distance = -1
best_index = -1
for i, question in enumerate(self.questions):
question_text = question["problem"]
request_lower = request_text.lower()
question_lower = question_text.lower()
# Exact match
if question_lower == request_lower:
debug_log(f"DEBUG: Found exact match at index {i}")
return question
# Remove LaTeX formatting for more flexible matching
question_no_latex = re.sub(r'\$[^$]+\$', '', question_text)
if question_no_latex.lower() == request_lower:
debug_log(f"DEBUG: Found match (no LaTeX) at index {i}")
return question
# Calculate Dice-coefficient similarity (see dice() above) for partial matches
# Only consider if request is at least 50% of question length
if len(request_lower) >= len(question_lower) * 0.5:
distance = dice(question_lower, request_lower)
if distance > best_distance:
best_distance = distance
best_match = question
best_index = i
if best_match and best_distance > 0.3: # Threshold for partial match
debug_log(f"DEBUG: Found best partial match at index {best_index} with distance {best_distance:.3f}")
return best_match
debug_log(f"DEBUG: No matching question found for: {request_text[:100]}...")
return None
def get_answer(self, question: Dict) -> str:
answer = question["answer"]
if isinstance(answer, str):
normalized = normalize_number(answer)
return str(normalized) if normalized is not None else answer
return str(answer)
class Simulator:
def __init__(
self,
port: int = 8033,
host: str = "localhost",
success_rate: float = 0.8,
dataset_split: str = "train"
):
self.port = port
self.host = host
self.success_rate = success_rate
self.dataset = AimeDataset(dataset_split)
self.eval_state = EvalState(
id="aime-2025",
tasks=["aime"],
task_states={},
sampling_config={"temperature": 0, "max_tokens": 2048}
)
def _generate_response(
self,
question: Dict,
should_be_correct: bool
) -> Dict:
expected_answer = self.dataset.get_answer(question)
if should_be_correct:
response_text = expected_answer
else:
response_text = self._generate_wrong_answer(question)
return {
"id": f"chatcmpl-{int(time.time())}",
"object": "chat.completion",
"created": int(time.time()),
"model": "llama",
"choices": [
{
"index": 0,
"message": {
"role": "assistant",
"content": response_text
},
"finish_reason": "stop"
}
],
"usage": {
"prompt_tokens": 100,
"completion_tokens": 50,
"total_tokens": 150
}
}
def _generate_wrong_answer(self, question: Dict) -> str:
expected_answer = self.dataset.get_answer(question)
if expected_answer.isdigit():
wrong_answer = str(int(expected_answer) + 1)
else:
wrong_answer = expected_answer + " (wrong)"
return wrong_answer
def _process_request(self, request_data: Dict) -> Dict:
messages = request_data.get("messages", [])
if not messages:
return {"error": "No messages in request"}
request_text = messages[0].get("content", "")
debug_log(f"DEBUG: Received request with content: {request_text[:150]}...")
question = self.dataset.find_question(request_text)
if not question:
debug_log(f"DEBUG: find_question returned None")
return {"error": "No matching question found"}
should_be_correct = random.random() < self.success_rate
response = self._generate_response(question, should_be_correct)
task_id = "aime"
self.eval_state.task_states[task_id] = {
"correct": should_be_correct,
"expected": self.dataset.get_answer(question),
"predicted": response["choices"][0]["message"]["content"]
}
return response
@app.route('/v1/chat/completions', methods=['POST'])
def chat_completions():
try:
request_data = request.get_json()
if not request_data:
return jsonify({"error": "Invalid JSON"}), 400
response = simulator._process_request(request_data)
return jsonify(response)
except Exception as e:
print(f"Error processing request: {e}")
return jsonify({"error": str(e)}), 500
def main():
parser = argparse.ArgumentParser(
description="llama-server simulator for testing eval scripts"
)
parser.add_argument(
"--port",
type=int,
default=8033,
help="Server port (default: 8033)"
)
parser.add_argument(
"--host",
type=str,
default="localhost",
help="Server host (default: localhost)"
)
parser.add_argument(
"--success-rate",
type=float,
default=0.8,
help="Success rate 0-1 (default: 0.8)"
)
parser.add_argument(
"--dataset-split",
type=str,
default="train",
help="AIME dataset split to use (default: train)"
)
args = parser.parse_args()
global simulator
simulator = Simulator(
port=args.port,
host=args.host,
success_rate=args.success_rate,
dataset_split=args.dataset_split
)
print("\n=== llama-server-simulator ===")
print(f"Server running on http://{args.host}:{args.port}")
print(f"Success rate: {args.success_rate}")
print(f"AIME dataset loaded: {len(simulator.dataset.questions)} questions")
print("\nPress Ctrl+C to stop\n")
app.run(host=args.host, port=args.port, debug=False)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,86 @@
#!/bin/bash
set -e
# Get the directory where this script is located
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
echo "=== llama-server-simulator Test Script ==="
echo ""
PORT=8033
SUCCESS_RATE=0.8
TEST_PORT=8034
echo "Starting simulator on port $PORT with success rate $SUCCESS_RATE..."
source "$SCRIPT_DIR/venv/bin/activate"
python3 "$SCRIPT_DIR/llama-server-simulator.py" --port $PORT --success-rate $SUCCESS_RATE > /tmp/simulator-test.log 2>&1 &
SIMULATOR_PID=$!
echo "Waiting for simulator to start..."
sleep 5
# Helper function to make a request and extract the answer
make_request() {
local question="$1"
curl -s -X POST http://localhost:$PORT/v1/chat/completions \
-H "Content-Type: application/json" \
-d "{
\"model\": \"llama\",
\"messages\": [
{\"role\": \"user\", \"content\": \"$question\"}
],
\"temperature\": 0,
\"max_tokens\": 2048
}" | python3 -c "import sys, json; data = json.load(sys.stdin); print(data.get('choices', [{}])[0].get('message', {}).get('content', data.get('error', 'No response')))"
}
# Test question (repeated in multiple tests)
TEST_QUESTION="Quadratic polynomials P(x) and Q(x) have leading coefficients 2 and -2, respectively. The graphs of both polynomials pass through the two points (16,54) and (20,53). Find P(0) + Q(0)."
echo ""
echo "=== Test 1: Correct Answer ==="
echo "Sending request with known question..."
answer=$(make_request "$TEST_QUESTION")
echo "Answer: $answer"
echo "Expected: 116"
echo "Correct: $([ "$answer" == "116" ] && echo "Yes" || echo "No")"
echo ""
echo "=== Test 2: Wrong Answer ==="
echo "Sending request with known question (success rate 0.0)..."
answer=$(make_request "$TEST_QUESTION")
echo "Answer: $answer"
echo "Expected: 116"
echo "Correct: $([ "$answer" == "116" ] && echo "Yes" || echo "No")"
echo ""
echo "=== Test 3: No Matching Question ==="
echo "Sending request with non-matching text..."
response=$(make_request "What is the capital of France?")
echo "Response: $response"
echo "Expected: No matching question found"
echo "Correct: $([ "$response" == "No matching question found" ] && echo "Yes" || echo "No")"
echo ""
echo "=== Test 4: Success Rate Verification ==="
echo "Sending 10 requests to test success rate..."
correct_count=0
for i in {1..10}; do
answer=$(make_request "$TEST_QUESTION")
if [ "$answer" == "116" ]; then
correct_count=$((correct_count + 1))
fi
echo " Request $i: Answer = $answer"
done
echo "Correct answers: $correct_count/10"
echo "Expected: ~8/10 (80% success rate)"
echo "Success rate: $(echo "scale=1; $correct_count * 10" | bc)%"
echo ""
echo "=== Test Complete ==="
echo "Stopping simulator..."
kill $SIMULATOR_PID 2>/dev/null
wait $SIMULATOR_PID 2>/dev/null || true
echo "Simulator stopped."

View File

@@ -569,27 +569,24 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
cmake_policy(SET CMP0135 NEW)
endif()
# TODO: Use FetchContent_MakeAvailable with EXCLUDE_FROM_ALL after bumping minimum CMake version to 3.28+
# Using FetchContent_Populate instead to avoid EXCLUDE_FROM_ALL which requires CMake 3.28
FetchContent_Declare(KleidiAI_Download
URL ${KLEIDIAI_DOWNLOAD_URL}
DOWNLOAD_EXTRACT_TIMESTAMP NEW
URL_HASH MD5=${KLEIDIAI_ARCHIVE_MD5})
FetchContent_MakeAvailable(KleidiAI_Download)
FetchContent_GetProperties(KleidiAI_Download
SOURCE_DIR KLEIDIAI_SRC
POPULATED KLEIDIAI_POPULATED)
if (NOT KLEIDIAI_POPULATED)
message(FATAL_ERROR "KleidiAI source downloaded failed.")
FetchContent_Populate(KleidiAI_Download)
FetchContent_GetProperties(KleidiAI_Download SOURCE_DIR KLEIDIAI_SRC)
endif()
add_compile_definitions(GGML_USE_CPU_KLEIDIAI)
# Remove kleidiai target after fetching it
if (TARGET kleidiai)
set_target_properties(kleidiai PROPERTIES EXCLUDE_FROM_ALL TRUE)
endif()
list(APPEND GGML_CPU_SOURCES
ggml-cpu/kleidiai/kleidiai.cpp
ggml-cpu/kleidiai/kernels.cpp

View File

@@ -6,8 +6,8 @@
#include "ggml-impl.h"
#include "simd-mappings.h"
#define GGML_FA_TILE_Q 32
#define GGML_FA_TILE_KV 16
#define GGML_FA_TILE_Q 64
#define GGML_FA_TILE_KV 64
#ifdef __cplusplus

View File

@@ -2874,8 +2874,8 @@ struct ggml_cplan ggml_graph_plan(
const int64_t DV = node->src[2]->ne[0];
// Tiled flash attention scratch (tile sizes defined in common.h)
// Per-thread: Q_q + KQ + mask + VKQ32 + V32 + padding
size_t prefill = sizeof(float)*(GGML_FA_TILE_Q*DK + 2*GGML_FA_TILE_Q*GGML_FA_TILE_KV + GGML_FA_TILE_Q*DV + GGML_FA_TILE_KV*DV)*n_tasks;
// Per-thread: Q_q + KQ + mask + VKQ32 + V32 + K_f32 + padding
size_t prefill = sizeof(float)*(GGML_FA_TILE_Q*DK + 2*GGML_FA_TILE_Q*GGML_FA_TILE_KV + GGML_FA_TILE_Q*DV + GGML_FA_TILE_KV*DV + GGML_FA_TILE_KV*DK)*n_tasks;
// Decode path: n_kv_chunks = n_tasks (one chunk per thread)
// Per-thread: VKQ accmulator (DV), partial M, partial S + intra-thread scratch for V, Q and VKQ
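To make the new prefill scratch formula concrete, a quick back-of-the-envelope size check (a sketch; DK = DV = 128 is an assumed head size, and the 64x64 tiles are the new GGML_FA_TILE_Q / GGML_FA_TILE_KV values above):

    # per-thread prefill scratch in floats: Q_q + KQ + mask + VKQ32 + V32 + K_f32
    Q, KV, DK, DV = 64, 64, 128, 128
    floats = Q*DK + 2*Q*KV + Q*DV + KV*DV + KV*DK
    print(floats, floats * 4 // 1024, "KiB per thread")   # 40960 floats -> 160 KiB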
@@ -2947,7 +2947,11 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
/*.use_ref =*/ cplan->use_ref,
};
GGML_PRINT_DEBUG("thread #%d compute-start cplan %p last-graph %d \n", state->ith, cplan, state->last_graph);
#ifdef GGML_USE_OPENMP
GGML_PRINT_DEBUG("thread #%d compute-start cplan %p\n", state->ith, (const void *)cplan);
#else
GGML_PRINT_DEBUG("thread #%d compute-start cplan %p last-graph %d\n", state->ith, (const void *)cplan, state->last_graph);
#endif
for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) {
struct ggml_tensor * node = cgraph->nodes[node_n];
@@ -2974,7 +2978,11 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
}
}
GGML_PRINT_DEBUG("thread #%d compute-done cplan %p last-graph %d \n", state->ith, cplan, state->last_graph);
#ifdef GGML_USE_OPENMP
GGML_PRINT_DEBUG("thread #%d compute-done cplan %p\n", state->ith, (const void *)cplan);
#else
GGML_PRINT_DEBUG("thread #%d compute-done cplan %p last-graph %d\n", state->ith, (const void *)cplan, state->last_graph);
#endif
ggml_barrier(state->threadpool);

View File

@@ -3,6 +3,7 @@
#include "ggml-cpu.h"
#include "ggml-impl.h"
#include "binary-ops.h"
#include "simd-gemm.h"
#include "ggml.h"
#include "unary-ops.h"
#include "vec.h"
@@ -8389,10 +8390,6 @@ static void ggml_compute_forward_flash_attn_ext_tiled(
GGML_ASSERT(k->type == v->type);
const ggml_type kv_type = k->type;
const auto * kv_type_traits_cpu = ggml_get_type_traits_cpu(kv_type);
const ggml_from_float_t kv_from_float = kv_type_traits_cpu->from_float;
const ggml_vec_dot_t kv_vec_dot = kv_type_traits_cpu->vec_dot;
const size_t kv_type_size = ggml_type_size(kv_type);
// broadcast factors
const int64_t rk2 = neq2/nek2;
@@ -8424,8 +8421,6 @@ static void ggml_compute_forward_flash_attn_ext_tiled(
static constexpr int Q_TILE_SZ = ggml_fa_tile_config::Q;
static constexpr int KV_TILE_SZ = ggml_fa_tile_config::KV;
GGML_ASSERT(nek1 % KV_TILE_SZ == 0 && "KV sequence length must be divisible by KV_TILE_SZ");
int ir = ir0;
while (ir < ir1) {
// q indices for the start of this tile
@@ -8452,18 +8447,20 @@ static void ggml_compute_forward_flash_attn_ext_tiled(
}
// Per-thread scratch layout:
// Q_q: Q_TILE_SZ * DK (converted Q tile in KV type)
// Q_q: Q_TILE_SZ * DK (converted Q tile — F32 for GEMM, KV type for scalar)
// KQ: Q_TILE_SZ * KV_TILE_SZ (attention scores in float)
// mask: Q_TILE_SZ * KV_TILE_SZ (mask in float)
// VKQ32: Q_TILE_SZ * DV (FP32 output accumulator)
// V32: KV_TILE_SZ * DV (F32 buffer for V tile - used for f166 conversion)
float * base = (float *) params->wdata + ith*(Q_TILE_SZ*DK + 2*Q_TILE_SZ*KV_TILE_SZ + Q_TILE_SZ*DV + KV_TILE_SZ*DV + CACHE_LINE_SIZE_F32);
// V32: KV_TILE_SZ * DV (F32 buffer for V tile)
// K_f32: KV_TILE_SZ * DK (F32 buffer for K tile — GEMM path)
float * base = (float *) params->wdata + ith*(Q_TILE_SZ*DK + 2*Q_TILE_SZ*KV_TILE_SZ + Q_TILE_SZ*DV + KV_TILE_SZ*DV + KV_TILE_SZ*DK + CACHE_LINE_SIZE_F32);
void * Q_q = base;
float * KQ = (float *)((char *)base + Q_TILE_SZ * DK * sizeof(float));
float * mask32 = KQ + Q_TILE_SZ * KV_TILE_SZ;
float * VKQ32 = mask32 + Q_TILE_SZ * KV_TILE_SZ;
float * V32 = VKQ32 + Q_TILE_SZ * DV; // F32 buffer for V tile
float * V32 = VKQ32 + Q_TILE_SZ * DV;
float * K_f32 = V32 + KV_TILE_SZ * DV;
memset(VKQ32, 0, Q_TILE_SZ * DV * sizeof(float));
memset(mask32, 0, Q_TILE_SZ * KV_TILE_SZ * sizeof(float));
@@ -8476,28 +8473,38 @@ static void ggml_compute_forward_flash_attn_ext_tiled(
const int iv3 = iq3 / rv3;
const int iv2 = iq2 / rv2;
for (int tq = 0; tq < tile_rows; tq++) {
const float * pq = (const float *) ((char *) q->data + ((iq1 + tq)*nbq1 + iq2*nbq2 + iq3*nbq3));
kv_from_float(pq, (char *)Q_q + tq * DK * kv_type_size, DK);
}
// Zero-pad remaining rows
for (int tq = tile_rows; tq < Q_TILE_SZ; tq++) {
memset((char *)Q_q + tq * DK * kv_type_size, 0, DK * kv_type_size);
{
float * Q_f32 = (float *)Q_q;
for (int tq = 0; tq < tile_rows; tq++) {
const float * pq = (const float *) ((char *) q->data + ((iq1 + tq)*nbq1 + iq2*nbq2 + iq3*nbq3));
memcpy(Q_f32 + tq * DK, pq, DK * sizeof(float));
}
for (int tq = tile_rows; tq < Q_TILE_SZ; tq++) {
memset(Q_f32 + tq * DK, 0, DK * sizeof(float));
}
}
memset(K_f32, 0, DK * KV_TILE_SZ * sizeof(float));
memset(V32, 0, KV_TILE_SZ * DV * sizeof(float));
for (int64_t ic = 0; ic < nek1; ic += KV_TILE_SZ) {
const int kv_tile = (int)std::min((int64_t)KV_TILE_SZ, nek1 - ic);
// skip the tile entirely if all the masks are -inf
if (mask) {
bool can_skip = true;
for (int tq = 0; tq < tile_rows; tq++) {
const ggml_fp16_t * mp_row = (const ggml_fp16_t *)((const char *) mask->data + (iq1 + tq)*mask->nb[1] + (iq2%mask->ne[2])*mask->nb[2] + (iq3%mask->ne[3])*mask->nb[3]);
for (int tk = 0; tk < KV_TILE_SZ; tk++) {
for (int tk = 0; tk < kv_tile; tk++) {
mask32[tq * KV_TILE_SZ + tk] = slope * GGML_CPU_FP16_TO_FP32(mp_row[ic + tk]);
if (mask32[tq * KV_TILE_SZ + tk] != -INFINITY) {
can_skip = false;
}
}
// Pad remaining mask entries with -inf
for (int tk = kv_tile; tk < KV_TILE_SZ; tk++) {
mask32[tq * KV_TILE_SZ + tk] = -INFINITY;
}
}
if (can_skip) {
@@ -8505,13 +8512,32 @@ static void ggml_compute_forward_flash_attn_ext_tiled(
}
}
for (int tq = 0; tq < Q_TILE_SZ; tq++) {
const void * q_row = (const char *)Q_q + tq * DK * kv_type_size;
for (int tk = 0; tk < KV_TILE_SZ; tk++) {
const void * k_row = (const char *) k->data + ((ic + tk)*nbk1 + ik2*nbk2 + ik3*nbk3);
float s;
kv_vec_dot(DK, &s, 0, k_row, 0, q_row, 0, 1);
KQ[tq * KV_TILE_SZ + tk] = s * scale;
// Pack K tile transposed: K_f32[dk][kv] so KV_TILE is contiguous (SIMD dim)
// Zero-pad the last tile so the GEMM always operates on KV_TILE_SZ columns
for (int tk = 0; tk < kv_tile; tk++) {
const char * k_data = (const char *)k->data + (ic + tk)*nbk1 + ik2*nbk2 + ik3*nbk3;
if (kv_type == GGML_TYPE_F16) {
const ggml_fp16_t * k_f16 = (const ggml_fp16_t *)k_data;
for (int64_t dk = 0; dk < DK; dk++) {
K_f32[dk * KV_TILE_SZ + tk] = GGML_CPU_FP16_TO_FP32(k_f16[dk]);
}
} else {
const float * k_f32_src = (const float *)k_data;
for (int64_t dk = 0; dk < DK; dk++) {
K_f32[dk * KV_TILE_SZ + tk] = k_f32_src[dk];
}
}
}
memset(KQ, 0, Q_TILE_SZ * KV_TILE_SZ * sizeof(float));
simd_gemm(KQ, (const float *)Q_q, K_f32, Q_TILE_SZ, DK, KV_TILE_SZ);
ggml_vec_scale_f32(Q_TILE_SZ * KV_TILE_SZ, KQ, scale);
// Set padded KQ entries to -inf so softmax gives them zero weight
if (kv_tile < KV_TILE_SZ) {
for (int tq = 0; tq < Q_TILE_SZ; tq++) {
for (int tk = kv_tile; tk < KV_TILE_SZ; tk++) {
KQ[tq * KV_TILE_SZ + tk] = -INFINITY;
}
}
}
@@ -8551,33 +8577,22 @@ static void ggml_compute_forward_flash_attn_ext_tiled(
S[tq] += ggml_vec_soft_max_f32(KV_TILE_SZ, kq_row, kq_row, Mnew);
}
// Convert V tile to F32 first (if F16), then do MAD
// On x86, ggml_vec_mad_f16 internall converts F16<->F32 on every load/store, so pre-converting is faster.
// TODO: on ARM, native f16 should be faster
if (kv_type == GGML_TYPE_F16) {
for (int tk = 0; tk < KV_TILE_SZ; tk++) {
const ggml_fp16_t * v_row = (const ggml_fp16_t *)((const char *) v->data + ((ic + tk)*nbv1 + iv2*nbv2 + iv3*nbv3));
ggml_fp16_to_fp32_row(v_row, V32 + tk * DV, DV);
}
for (int tq = 0; tq < Q_TILE_SZ; tq++) {
if (skip[tq]) continue;
float * vkq_row = VKQ32 + tq * DV;
for (int tk = 0; tk < KV_TILE_SZ; tk++) {
const float p = KQ[tq * KV_TILE_SZ + tk];
ggml_vec_mad_f32(DV, vkq_row, V32 + tk * DV, p);
}
}
} else {
for (int tq = 0; tq < Q_TILE_SZ; tq++) {
if (skip[tq]) continue;
float * vkq_row = VKQ32 + tq * DV;
for (int tk = 0; tk < KV_TILE_SZ; tk++) {
const float p = KQ[tq * KV_TILE_SZ + tk];
const float * v_row = (const float *)((const char *) v->data + ((ic + tk)*nbv1 + iv2*nbv2 + iv3*nbv3));
ggml_vec_mad_f32(DV, vkq_row, v_row, p);
}
// V accumulation: VKQ32 += softmax(KQ) * V
// Pack V tile to contiguous F32, zero-padded
for (int tk = 0; tk < kv_tile; tk++) {
const char * v_data = (const char *)v->data + (ic + tk)*nbv1 + iv2*nbv2 + iv3*nbv3;
if (kv_type == GGML_TYPE_F16) {
ggml_fp16_to_fp32_row((const ggml_fp16_t *)v_data, V32 + tk * DV, DV);
} else {
memcpy(V32 + tk * DV, v_data, DV * sizeof(float));
}
}
for (int tq = 0; tq < Q_TILE_SZ; tq++) {
if (skip[tq]) {
memset(KQ + tq * KV_TILE_SZ, 0, KV_TILE_SZ * sizeof(float));
}
}
simd_gemm(VKQ32, KQ, V32, Q_TILE_SZ, KV_TILE_SZ, DV);
}
// sinks (apply only to valid rows in the tile)
@@ -8794,15 +8809,15 @@ static void ggml_compute_forward_flash_attn_ext_f16(
const int64_t dr = (nr + nchunk - 1) / nchunk;
static constexpr int64_t KV_TILE_SZ = ggml_fa_tile_config::KV;
static constexpr int64_t Q_TILE_SZ = ggml_fa_tile_config::Q;
const bool use_tiled = !use_ref &&
bool use_tiled = !use_ref &&
(q->type == GGML_TYPE_F32 &&
kv_is_f32_or_f16 &&
k->type == v->type &&
nek1 % KV_TILE_SZ == 0 &&
neq1 >= Q_TILE_SZ);
#ifdef GGML_SIMD
use_tiled &= (DV % GGML_F32_EPR == 0);
#endif
int current_chunk = ith;
while (current_chunk < nchunk) {

View File

@@ -0,0 +1,136 @@
#pragma once
// Computes C[M x N] += A[M x K] * B[K x N]
#include "simd-mappings.h"
// TODO: add support for sizeless vector types
#if defined(GGML_SIMD) && !defined(__ARM_FEATURE_SVE) && !defined(__riscv_v_intrinsic)
// TODO: untested on avx512
// These are in units of GGML_F32_EPR
#if defined(__AVX512F__) || defined (__ARM_NEON__)
static constexpr int GEMM_RM = 4;
static constexpr int GEMM_RN = 4; // 16+4+1 = 21/32
#elif defined(__AVX2__) || defined(__AVX__)
static constexpr int GEMM_RM = 6;
static constexpr int GEMM_RN = 2; // 12+2+1 = 15/16
#else
static constexpr int GEMM_RM = 2;
static constexpr int GEMM_RN = 2;
#endif
template <int RM, int RN>
static inline void simd_gemm_ukernel(
float * GGML_RESTRICT C,
const float * GGML_RESTRICT A,
const float * GGML_RESTRICT B,
int K, int N)
{
static constexpr int KN = GGML_F32_EPR;
GGML_F32_VEC acc[RM][RN];
for (int64_t i = 0; i < RM; i++) {
for (int r = 0; r < RN; r++) {
acc[i][r] = GGML_F32_VEC_LOAD(C + i * N + r * KN);
}
}
for (int64_t kk = 0; kk < K; kk++) {
GGML_F32_VEC Bv[RN];
for (int r = 0; r < RN; r++) {
Bv[r] = GGML_F32_VEC_LOAD(B + kk * N + r * KN);
}
for (int64_t i = 0; i < RM; i++) {
GGML_F32_VEC p = GGML_F32_VEC_SET1(A[i * K + kk]);
for (int r = 0; r < RN; r++) {
acc[i][r] = GGML_F32_VEC_FMA(acc[i][r], Bv[r], p);
}
}
}
for (int64_t i = 0; i < RM; i++) {
for (int r = 0; r < RN; r++) {
GGML_F32_VEC_STORE(C + i * N + r * KN, acc[i][r]);
}
}
}
// C[M x N] += A[M x K] * B[K x N]
static void simd_gemm(
float * GGML_RESTRICT C,
const float * GGML_RESTRICT A,
const float * GGML_RESTRICT B,
int M, int K, int N)
{
static constexpr int KN = GGML_F32_EPR;
int64_t ii = 0;
for (; ii + GEMM_RM <= M; ii += GEMM_RM) {
int64_t jj = 0;
for (; jj + GEMM_RN * KN <= N; jj += GEMM_RN * KN) {
simd_gemm_ukernel<GEMM_RM, GEMM_RN>(C + jj, A, B + jj, K, N);
}
for (; jj + KN <= N; jj += KN) {
simd_gemm_ukernel<GEMM_RM, 1>(C + jj, A, B + jj, K, N);
}
for (; jj < N; jj++) {
for (int64_t i = 0; i < GEMM_RM; i++) {
float a = C[i * N + jj];
for (int64_t kk = 0; kk < K; kk++) {
a += A[i * K + kk] * B[kk * N + jj];
}
C[i * N + jj] = a;
}
}
A += GEMM_RM * K;
C += GEMM_RM * N;
}
// Tail rows: one at a time
for (; ii < M; ii++) {
int64_t jj = 0;
for (; jj + GEMM_RN * KN <= N; jj += GEMM_RN * KN) {
simd_gemm_ukernel<1, GEMM_RN>(C + jj, A, B + jj, K, N);
}
for (; jj + KN <= N; jj += KN) {
simd_gemm_ukernel<1, 1>(C + jj, A, B + jj, K, N);
}
for (; jj < N; jj++) {
float a = C[jj];
for (int64_t kk = 0; kk < K; kk++) {
a += A[kk] * B[kk * N + jj];
}
C[jj] = a;
}
A += K;
C += N;
}
}
#if defined(__GNUC__) && !defined(__clang__)
#pragma GCC diagnostic pop
#endif
#else // scalar path
static void simd_gemm(
float * GGML_RESTRICT C,
const float * GGML_RESTRICT A,
const float * GGML_RESTRICT B,
int M, int K, int N)
{
for (int64_t i = 0; i < M; i++) {
for (int64_t j = 0; j < N; j++) {
float sum = C[i * N + j];
for (int64_t kk = 0; kk < K; kk++) {
sum += A[i * K + kk] * B[kk * N + j];
}
C[i * N + j] = sum;
}
}
}
#endif // GGML_SIMD

View File

@@ -1160,6 +1160,14 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
float32x4_t tmp = x[0] + vec_reve(x[0]); \
res = tmp[0] + tmp[1]; \
}
#define GGML_F32x4_REDUCE_4(res, s0, s1, s2, s3) \
{ \
float32x4_t v = vec_add(vec_add(s0, s1), \
vec_add(s2, s3)); \
v = vec_add(v, vec_sld(v, v, 8)); \
v = vec_add(v, vec_sld(v, v, 4)); \
res += (ggml_float)vec_extract(v, 0); \
}
#define GGML_F32_VEC GGML_F32x4
#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
@@ -1209,6 +1217,24 @@ static inline void __lzs_f16cx4_store(ggml_fp16_t * x, float32x4_t v_y) {
#define GGML_F16_VEC_MUL GGML_F32x4_MUL
#define GGML_F16_VEC_REDUCE GGML_F32x4_REDUCE
// BF16 s390x
#define GGML_BF16_STEP 16
#define GGML_BF16_EPR 8
#define GGML_BF16x8 __vector unsigned short
#define GGML_BF16x8_ZERO vec_splats((unsigned short)0)
#define GGML_BF16x8_LOAD(p) vec_xl(0, (const unsigned short *)(p))
#define GGML_BF16_VEC GGML_BF16x8
#define GGML_BF16_VEC_ZERO GGML_BF16x8_ZERO
#define GGML_BF16_VEC_LOAD GGML_BF16x8_LOAD
#define GGML_BF16_TO_F32_LO(v) ((float32x4_t) vec_mergel((v), GGML_BF16_VEC_ZERO))
#define GGML_BF16_TO_F32_HI(v) ((float32x4_t) vec_mergeh((v), GGML_BF16_VEC_ZERO))
#define GGML_BF16_FMA_LO(acc, x, y) \
(acc) = GGML_F32x4_FMA((acc), GGML_BF16_TO_F32_LO(x), GGML_BF16_TO_F32_LO(y))
#define GGML_BF16_FMA_HI(acc, x, y) \
(acc) = GGML_F32x4_FMA((acc), GGML_BF16_TO_F32_HI(x), GGML_BF16_TO_F32_HI(y))
#elif defined(__riscv_v_intrinsic)
// compatible with vlen >= 128
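Aside on the new BF16 helpers added above: merging a bf16 half-word with zero (GGML_BF16_TO_F32_LO/HI) effectively places the bf16 bits in the upper half of each f32 lane given the target's element order, which is exactly the bf16-to-f32 conversion. A minimal stand-alone check of that identity (illustrative, independent of the s390x intrinsics):

    import struct

    def bf16_to_f32(bits16: int) -> float:
        # bf16 -> f32: the 16 bf16 bits become the upper 16 bits of the f32 word
        return struct.unpack("<f", struct.pack("<I", (bits16 & 0xFFFF) << 16))[0]

    assert bf16_to_f32(0x3F80) == 1.0    # bf16 encoding of 1.0
    assert bf16_to_f32(0xC0A0) == -5.0   # bf16 encoding of -5.0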

View File

@@ -236,8 +236,7 @@ void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t *
vfloat32m1_t redsum = __riscv_vfredusum_vs_f32m4_f32m1(vsum0, __riscv_vfmv_v_f_f32m1(0.0f, 1), vl);
sumf += __riscv_vfmv_f_s_f32m1_f32(redsum);
#endif
#if defined(__POWER9_VECTOR__)
#elif defined(__POWER9_VECTOR__) || defined(__VXE__) || defined(__VXE2__)
const int np = (n & ~(GGML_BF16_STEP - 1));
if (np > 0) {
GGML_F32_VEC sum[4] = {GGML_F32_VEC_ZERO};

View File

@@ -2872,6 +2872,7 @@ static bool ggml_cuda_graph_check_compability(ggml_cgraph * cgraph) {
const std::string ffn_moe_down_bias_prefix = "ffn_moe_down_biased";
const std::string nemotron_h_block_out_prefix = "nemotron_h_block_out";
const std::string mamba2_y_add_d_prefix = "mamba2_y_add_d";
const std::string delta_net_prefix = "dnet_add";
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_tensor * node = cgraph->nodes[i];
@@ -2902,7 +2903,8 @@ static bool ggml_cuda_graph_check_compability(ggml_cgraph * cgraph) {
strncmp(node->name, ffn_moe_up_bias_prefix.c_str(), ffn_moe_up_bias_prefix.size()) != 0 &&
strncmp(node->name, ffn_moe_down_bias_prefix.c_str(), ffn_moe_down_bias_prefix.size()) != 0 &&
strncmp(node->name, nemotron_h_block_out_prefix.c_str(), nemotron_h_block_out_prefix.size()) != 0 &&
strncmp(node->name, mamba2_y_add_d_prefix.c_str(), mamba2_y_add_d_prefix.size()) != 0) {
strncmp(node->name, mamba2_y_add_d_prefix.c_str(), mamba2_y_add_d_prefix.size()) != 0 &&
strncmp(node->name, delta_net_prefix.c_str(), delta_net_prefix.size()) != 0) {
// disable CUDA graphs for batch size > 1 for now while excluding the matrix-matrix addition as part of Gemma3n's `project_per_layer_input` operation
// by means of matching node names. See
// https://github.com/ggml-org/llama.cpp/blob/f9a31eea06a859e34cecb88b4d020c7f03d86cc4/src/llama-model.cpp#L10199-L10241 and
@@ -4544,6 +4546,8 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_UNARY_OP_CEIL:
case GGML_UNARY_OP_ROUND:
case GGML_UNARY_OP_TRUNC:
// TODO: should become:
//return ggml_is_contiguous_rows(op->src[0]);
return ggml_is_contiguous(op->src[0]);
default:
return false;

View File

@@ -2715,14 +2715,14 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
#pragma unroll
for (int l = 0; l < QR2_XXS; ++l) {
const int * grid_pos = (const int *) (iq2xxs_grid + aux8[l]);
const int signs_packed = ksigns_iq2xs[(aux32 >> (7*l)) & 0x7F];
const uint2 grid_pos = ((const uint2*)iq2xxs_grid)[aux8[l]];
const uint32_t signs = unpack_ksigns(aux32 >> (7 * l));
const int signs0 = __vcmpne4(((signs_packed & 0x03) << 7) | ((signs_packed & 0x0C) << 21), 0x00000000);
const int grid0 = __vsub4(grid_pos[0] ^ signs0, signs0);
const int signs0 = __vcmpne4(signs & 0x08040201, 0);
const int grid0 = __vsub4(grid_pos.x ^ signs0, signs0);
const int signs1 = __vcmpne4(((signs_packed & 0x30) << 3) | ((signs_packed & 0xC0) << 17), 0x00000000);
const int grid1 = __vsub4(grid_pos[1] ^ signs1, signs1);
const int signs1 = __vcmpne4(signs & 0x80402010, 0);
const int grid1 = __vsub4(grid_pos.y ^ signs1, signs1);
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + 8*kqsx + (2*l + 0)] = grid0;
@@ -2733,12 +2733,12 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
}
const int ls = aux32 >> 28;
const int ls = aux32 >> 27 | 1; // (scale * 2 + 1)
const float d = bxi->d;
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + kqsx] = (ls*d + d/2)/4;
x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + kqsx] = d * ls / 8; // (d * scale + d / 2) / 4
#else
x_df[i*(MMQ_TILE_NE_K/4) + i/4 + kqsx] = (ls*d + d/2)/4;
x_df[i*(MMQ_TILE_NE_K/4) + i/4 + kqsx] = d * ls / 8; // (d * scale + d / 2) / 4
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
}
}
@@ -2776,11 +2776,14 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
#pragma unroll
for (int l = 0; l < QR2_XS; ++l) {
const uint32_t * grid_pos = (const uint32_t *)(iq2xs_grid + (q2[l] & 0x000001FF));
const uint32_t * signs = (const uint32_t *)(ksigns64 + (q2[l] >> 9));
const uint2 grid_pos = ((const uint2*)iq2xs_grid)[q2[l] & 0x1FF];
const uint32_t signs = unpack_ksigns(q2[l] >> 9);
const int grid_l = __vsub4(grid_pos[0] ^ signs[0], signs[0]);
const int grid_h = __vsub4(grid_pos[1] ^ signs[1], signs[1]);
const int signs0 = __vcmpne4(signs & 0x08040201, 0);
const int grid_l = __vsub4(grid_pos.x ^ signs0, signs0);
const int signs1 = __vcmpne4(signs & 0x80402010, 0);
const int grid_h = __vsub4(grid_pos.y ^ signs1, signs1);
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
x_qs[i*MMQ_MMA_TILE_X_K_Q3_K + 8*kqsx + (2*l + 0)] = grid_l;
@@ -2904,11 +2907,13 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
#pragma unroll
for (int l = 0; l < QR3_XXS; ++l) {
const int2 grid_pos = make_int2(iq3xxs_grid[q3[2*l+0]], iq3xxs_grid[q3[2*l+1]]);
const uint32_t signs = unpack_ksigns(aux32 >> (7*l));
const int * signs = (const int *)(ksigns64 + ((aux32 >> (7*l)) & 0x7F));
const int signs0 = __vcmpne4(signs & 0x08040201, 0);
const int grid_l = __vsub4(grid_pos.x ^ signs0, signs0);
const int grid_l = __vsub4(grid_pos.x ^ signs[0], signs[0]);
const int grid_h = __vsub4(grid_pos.y ^ signs[1], signs[1]);
const int signs1 = __vcmpne4(signs & 0x80402010, 0);
const int grid_h = __vsub4(grid_pos.y ^ signs1, signs1);
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + 8*kqsx + (2*l + 0)] = grid_l;

View File

@@ -94,6 +94,15 @@ static __device__ __forceinline__ int2 get_int_from_table_16(const int & q4, con
#endif
}
static __device__ __forceinline__ uint32_t unpack_ksigns(const uint8_t v) {
// v is a 7-bit sign field; the 8th sign is recoverable as the popcount parity of the low 7 bits
// with xor we can "correct" the bit instead of having to mask
const uint32_t p = __popc(v) & 1;
const uint32_t s = v ^ p << 7;
// broadcast over uint to allow for 0x08040201 / 0x80402010 as selectors
return s * 0x01010101;
}
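A plain-Python model of the same trick, for illustration (mirrors the helper above; not part of the patch):

    def unpack_ksigns(v: int) -> int:
        v &= 0xFF                            # the CUDA helper takes a uint8_t
        p = bin(v).count("1") & 1            # popcount parity supplies the 8th sign
        s = v ^ (p << 7)                     # xor sets/"corrects" bit 7, so no 0x7F mask is needed
        return (s * 0x01010101) & 0xFFFFFFFF # broadcast the byte into all four 8-bit lanes

    s = unpack_ksigns(0b0101101)
    lo_sel = s & 0x08040201   # one sign bit per byte for grid values 0..3 (fed to __vcmpne4)
    hi_sel = s & 0x80402010   # one sign bit per byte for grid values 4..7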
// VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called
// MMVQ = mul_mat_vec_q, MMQ = mul_mat_q
@@ -905,22 +914,22 @@ static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1(
int sumi = 0;
#pragma unroll
for (int k0 = 0; k0 < 8; k0 += 2) {
const int * grid_pos = (const int *) (iq2xxs_grid + aux8[k0/2]);
const int signs_packed = ksigns_iq2xs[(aux32 >> (7*k0/2)) & 0x7F];
const uint2 grid_pos = ((const uint2*)iq2xxs_grid)[aux8[k0/2]];
const uint32_t signs = unpack_ksigns(aux32 >> (7 * k0 / 2));
const int signs0 = __vcmpne4(((signs_packed & 0x03) << 7) | ((signs_packed & 0x0C) << 21), 0x00000000);
const int grid0 = __vsub4(grid_pos[0] ^ signs0, signs0);
const int signs0 = __vcmpne4(signs & 0x08040201, 0);
const int grid0 = __vsub4(grid_pos.x ^ signs0, signs0);
const int u0 = get_int_b4(bq8_1[iqs/2].qs, k0 + 0);
sumi = ggml_cuda_dp4a(grid0, u0, sumi);
const int signs1 = __vcmpne4(((signs_packed & 0x30) << 3) | ((signs_packed & 0xC0) << 17), 0x00000000);
const int grid1 = __vsub4(grid_pos[1] ^ signs1, signs1);
const int signs1 = __vcmpne4(signs & 0x80402010, 0);
const int grid1 = __vsub4(grid_pos.y ^ signs1, signs1);
const int u1 = get_int_b4(bq8_1[iqs/2].qs, k0 + 1);
sumi = ggml_cuda_dp4a(grid1, u1, sumi);
}
const int ls = aux32 >> 28;
sumi = (ls*sumi + sumi/2)/4;
const int ls = aux32 >> 27 | 1; // (scale * 2 + 1)
sumi = sumi * ls / 8; // (sumi * scale + sumi / 2) / 4
const float d = __half2float(bq2->d) * __low2float(bq8_1[iqs/2].ds);
return d * sumi;
}
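The scale rewrite in these kernels is an algebraic simplification rather than a behavior change: with ls = (aux32 >> 27) | 1, i.e. 2*scale + 1, the old (x*scale + x/2)/4 folds into x*ls/8. A quick check of the identity in exact arithmetic (illustrative):

    # (x*scale + x/2) / 4  ==  x*(2*scale + 1) / 8  ==  x*ls / 8
    x = 1000.0
    for scale in range(16):        # the 4-bit scale field (aux32 >> 28)
        ls = 2*scale + 1           # what (aux32 >> 27) | 1 evaluates to
        assert (x*scale + x/2) / 4 == x*ls / 8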
@@ -942,13 +951,15 @@ static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
int sumi1 = 0;
#pragma unroll
for (int l0 = 0; l0 < 8; l0 += 2) {
const uint32_t * grid_pos = (const uint32_t *)(iq2xs_grid + (q2[l0/2] & 0x000001FF));
const uint32_t * signs = (const uint32_t *)(ksigns64 + (q2[l0/2] >> 9));
const int grid_l = __vsub4(grid_pos[0] ^ signs[0], signs[0]);
const int grid_h = __vsub4(grid_pos[1] ^ signs[1], signs[1]);
const uint2 grid_pos = ((const uint2*)iq2xs_grid)[q2[l0/2] & 0x1FF];
const uint32_t signs = unpack_ksigns(q2[l0/2] >> 9);
const int signs0 = __vcmpne4(signs & 0x08040201, 0);
const int grid_l = __vsub4(grid_pos.x ^ signs0, signs0);
const int u0 = get_int_b4(bq8_1[iqs/2].qs, l0 + 0);
const int signs1 = __vcmpne4(signs & 0x80402010, 0);
const int grid_h = __vsub4(grid_pos.y ^ signs1, signs1);
const int u1 = get_int_b4(bq8_1[iqs/2].qs, l0 + 1);
if (l0 < 4) {
@@ -1028,13 +1039,16 @@ static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1(
#pragma unroll
for (int l0 = 0; l0 < 8; l0 += 2) {
const int2 grid_pos = make_int2(iq3xxs_grid[q3[l0 + 0]], iq3xxs_grid[q3[l0 + 1]]);
const uint32_t signs = unpack_ksigns(aux32 >> (7*l0/2));
const int * signs = (const int *)(ksigns64 + ((aux32 >> (7*l0/2)) & 0x7F));
const int grid_l = __vsub4(grid_pos.x ^ signs[0], signs[0]);
const int grid_h = __vsub4(grid_pos.y ^ signs[1], signs[1]);
const int signs0 = __vcmpne4(signs & 0x08040201, 0);
const int grid_l = __vsub4(grid_pos.x ^ signs0, signs0);
const int u0 = get_int_b4(bq8_1[iqs/2].qs, l0 + 0);
const int signs1 = __vcmpne4(signs & 0x80402010, 0);
const int grid_h = __vsub4(grid_pos.y ^ signs1, signs1);
const int u1 = get_int_b4(bq8_1[iqs/2].qs, l0 + 1);
sumi = ggml_cuda_dp4a(grid_l, u0, sumi);

View File

@@ -273,6 +273,7 @@ static std::vector<int> ggml_metal_graph_optimize_reorder(const std::vector<node
case GGML_OP_DIAG:
case GGML_OP_MUL:
case GGML_OP_ADD:
case GGML_OP_SUB:
case GGML_OP_DIV:
case GGML_OP_GLU:
case GGML_OP_SCALE:

View File

@@ -3830,6 +3830,7 @@ class VisionProjectorType:
MUSIC_FLAMINGO = "musicflamingo" # audio
GLM4V = "glm4v"
YOUTUVL = "youtuvl"
NEMOTRON_V2_VL = "nemotron_v2_vl"
# Items here are (block size, type size)

View File

@@ -1346,6 +1346,7 @@ class TensorNameMap:
"model.vision_tower.embeddings.cls_token", # Intern-S1
"vision_model.class_embedding", # llama 4
"model.vision.patch_embedding.cls_embedding", # cogvlm
"vision_model.radio_model.model.patch_generator.cls_token.token", # Nemotron Nano v2 VL
),
MODEL_TENSOR.V_ENC_EMBD_PATCH: (
@@ -1360,6 +1361,7 @@ class TensorNameMap:
"vision_tower.patch_embed.proj", # kimi-vl
"model.vision.patch_embedding.proj", # cogvlm
"siglip2.vision_model.embeddings.patch_embedding",
"vision_model.radio_model.model.patch_generator.embedder", # Nemotron Nano v2 VL
),
MODEL_TENSOR.V_ENC_EMBD_NORM: (
@@ -1376,12 +1378,14 @@ class TensorNameMap:
"visual.pos_embed", # qwen3vl
"model.vision.patch_embedding.position_embedding", # cogvlm
"visual.embeddings.position_embedding", # glm4v
"vision_model.radio_model.model.patch_generator.pos_embed", # Nemotron Nano v2 VL
),
MODEL_TENSOR.V_ENC_ATTN_QKV: (
"visual.blocks.{bid}.attn.qkv", # qwen3vl
"model.vision.transformer.layers.{bid}.attention.query_key_value", # cogvlm
"vision_tower.encoder.blocks.{bid}.wqkv" # Kimi-K2.5
"vision_tower.encoder.blocks.{bid}.wqkv", # Kimi-K2.5
"vision_model.radio_model.model.blocks.{bid}.attn.qkv", # Nemotron Nano v2 VL
),
MODEL_TENSOR.V_ENC_ATTN_Q: (
@@ -1446,6 +1450,7 @@ class TensorNameMap:
"vision_tower.encoder.blocks.{bid}.norm0", # kimi-vl (norm0/norm1)
"model.vision.transformer.layers.{bid}.input_layernorm", # cogvlm
"siglip2.vision_model.encoder.layers.{bid}.layer_norm1",
"vision_model.radio_model.model.blocks.{bid}.norm1", # Nemotron Nano v2 VL
),
MODEL_TENSOR.V_ENC_ATTN_O: (
@@ -1462,6 +1467,7 @@ class TensorNameMap:
"vision_tower.encoder.blocks.{bid}.wo", # kimi-vl
"model.vision.transformer.layers.{bid}.attention.dense", # cogvlm
"siglip2.vision_model.encoder.layers.{bid}.self_attn.out_proj", # youtuvl
"vision_model.radio_model.model.blocks.{bid}.attn.proj", # Nemotron Nano v2 VL
),
MODEL_TENSOR.V_ENC_POST_ATTN_NORM: (
@@ -1477,6 +1483,7 @@ class TensorNameMap:
"vision_tower.encoder.blocks.{bid}.norm1", # kimi-vl (norm0/norm1)
"model.vision.transformer.layers.{bid}.post_attention_layernorm", # cogvlm
"siglip2.vision_model.encoder.layers.{bid}.layer_norm2",
"vision_model.radio_model.model.blocks.{bid}.norm2", # Nemotron Nano v2 VL
),
MODEL_TENSOR.V_ENC_FFN_UP: (
@@ -1493,6 +1500,7 @@ class TensorNameMap:
"vision_tower.encoder.blocks.{bid}.mlp.fc0", # kimi-vl (fc0/fc1)
"model.vision.transformer.layers.{bid}.mlp.fc1", # cogvlm
"siglip2.vision_model.encoder.layers.{bid}.mlp.fc1",
"vision_model.radio_model.model.blocks.{bid}.mlp.fc1", # Nemotron Nano v2 VL
),
MODEL_TENSOR.V_ENC_FFN_GATE: (
@@ -1515,6 +1523,7 @@ class TensorNameMap:
"vision_tower.encoder.blocks.{bid}.mlp.fc1", # kimi-vl (fc0/fc1)
"model.vision.transformer.layers.{bid}.mlp.fc2", # cogvlm
"siglip2.vision_model.encoder.layers.{bid}.mlp.fc2",
"vision_model.radio_model.model.blocks.{bid}.mlp.fc2", # Nemotron Nano v2 VL
),
MODEL_TENSOR.V_LAYER_SCALE_1: (

View File

@@ -5,7 +5,7 @@ import os
import sys
import subprocess
HTTPLIB_VERSION = "f80864ca031932351abef49b74097c67f14719c6"
HTTPLIB_VERSION = "d4180e923f846b44a3d30acd938438d6e64fc9f6"
vendor = {
"https://github.com/nlohmann/json/releases/latest/download/json.hpp": "vendor/nlohmann/json.hpp",

View File

@@ -878,6 +878,7 @@ const llama_token * llama_context::get_sampled_candidates_ith(int32_t idx) {
}
} catch (const std::exception & err) {
// fallback to full vocab list
GGML_UNUSED(err);
}
return sampling.token_ids_full_vocab.data();
@@ -1809,7 +1810,6 @@ int llama_context::decode(const llama_batch & batch_inp) {
//
uint32_t llama_context::output_reserve(int32_t n_outputs) {
const auto & hparams = model.hparams;
const auto & vocab = model.vocab;
@@ -1893,11 +1893,6 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
embd = has_embd ? buffer_view<float>{(float *) (base + offset), embd.size} : buffer_view<float>{nullptr, 0};
offset += embd.size * sizeof(float);
sampling.logits = {nullptr, 0};
sampling.probs = {nullptr, 0};
sampling.sampled = {nullptr, 0};
sampling.candidates = {nullptr, 0};
if (has_sampling) {
sampling.logits = {(float *) (base + offset), (size_t)(n_vocab*n_outputs_max)};
offset += sampling.logits.size * sizeof(float);
@@ -1923,6 +1918,15 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
std::fill(sampling.candidates_count.begin(), sampling.candidates_count.end(), 0);
std::fill_n(sampling.sampled.data, sampling.sampled.size, LLAMA_TOKEN_NULL);
} else {
sampling.logits = {nullptr, 0};
sampling.probs = {nullptr, 0};
sampling.sampled = {nullptr, 0};
sampling.candidates = {nullptr, 0};
sampling.logits_count.clear();
sampling.probs_count.clear();
sampling.candidates_count.clear();
}
// set all ids as invalid (negative)
@@ -1953,37 +1957,30 @@ void llama_context::output_reorder() {
}
}
if (sampling.logits.has_data()) {
if (!sampling.samplers.empty()) {
assert(sampling.logits.size > 0);
assert(sampling.probs.size > 0);
assert(sampling.candidates.size > 0);
assert(sampling.sampled.size > 0);
assert(sampling.logits_count.size() > 0);
assert(sampling.probs_count.size() > 0);
assert(sampling.candidates_count.size() > 0);
for (uint64_t k = 0; k < n_vocab; ++k) {
std::swap(sampling.logits.data[i0*n_vocab + k], sampling.logits.data[i1*n_vocab + k]);
}
}
if (sampling.probs.has_data()) {
for (uint64_t k = 0; k < n_vocab; ++k) {
std::swap(sampling.probs.data[i0*n_vocab + k], sampling.probs.data[i1*n_vocab + k]);
}
}
if (sampling.candidates.has_data()) {
for (uint64_t k = 0; k < n_vocab; ++k) {
std::swap(sampling.candidates.data[i0*n_vocab + k], sampling.candidates.data[i1*n_vocab + k]);
}
}
if (sampling.sampled.has_data()) {
std::swap(sampling.sampled.data[i0], sampling.sampled.data[i1]);
}
if (!sampling.logits_count.empty()) {
std::swap(sampling.logits_count[i0], sampling.logits_count[i1]);
}
if (!sampling.probs_count.empty()) {
std::swap(sampling.probs_count[i0], sampling.probs_count[i1]);
}
if (!sampling.candidates_count.empty()) {
std::swap(sampling.sampled.data[i0], sampling.sampled.data[i1]);
std::swap(sampling.logits_count[i0], sampling.logits_count[i1]);
std::swap(sampling.probs_count[i0], sampling.probs_count[i1]);
std::swap(sampling.candidates_count[i0], sampling.candidates_count[i1]);
}
}

View File

@@ -265,24 +265,26 @@ private:
std::unique_ptr<llama_memory_i> memory;
// decode output (2-dimensional array: [n_outputs][n_vocab])
struct buffer_view<float> logits = {nullptr, 0};
buffer_view<float> logits = {nullptr, 0};
// embeddings output (2-dimensional array: [n_outputs][n_embd])
// populated only when pooling_type == LLAMA_POOLING_TYPE_NONE
struct buffer_view<float> embd = {nullptr, 0};
buffer_view<float> embd = {nullptr, 0};
struct sampling_info {
// !samplers.empty() to check if any samplers are active
std::map<llama_seq_id, llama_sampler *> samplers;
struct buffer_view<float> logits = {nullptr, 0};
struct buffer_view<llama_token> sampled = {nullptr, 0};
struct buffer_view<float> probs = {nullptr, 0};
struct buffer_view<llama_token> candidates = {nullptr, 0};
buffer_view<float> logits = {nullptr, 0};
buffer_view<llama_token> sampled = {nullptr, 0};
buffer_view<float> probs = {nullptr, 0};
buffer_view<llama_token> candidates = {nullptr, 0};
std::vector<uint32_t> logits_count;
std::vector<uint32_t> probs_count;
std::vector<uint32_t> candidates_count;
// optimization
std::vector<llama_token> token_ids_full_vocab;
};

View File

@@ -489,9 +489,6 @@ private:
ggml_tensor * build_layer_attn_linear(
llm_graph_input_rs * inp,
ggml_tensor * cur,
ggml_tensor * causal_mask,
ggml_tensor * identity,
ggml_tensor * diag_mask,
int il);
ggml_tensor * build_layer_ffn(
@@ -506,9 +503,6 @@ private:
ggml_tensor * g,
ggml_tensor * beta,
ggml_tensor * state,
ggml_tensor * causal_mask,
ggml_tensor * identity,
ggml_tensor * diag_mask,
int il);
// returns pair of output and new state

View File

@@ -16,17 +16,6 @@ llm_build_qwen3next::llm_build_qwen3next(const llama_model & model, const llm_gr
ggml_tensor * inp_pos = build_inp_pos();
ggml_tensor * inp_out_ids = build_inp_out_ids();
ggml_tensor * causal_mask =
ggml_tri(ctx0, ggml_fill_inplace(ctx0, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, CHUNK_SIZE, CHUNK_SIZE), 1.0f),
GGML_TRI_TYPE_LOWER);
ggml_tensor * identity = ggml_diag(ctx0, ggml_fill_inplace(ctx0, ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, CHUNK_SIZE), 1.0f));
ggml_tensor * diag_mask = ggml_add(ctx0, causal_mask, identity);
ggml_build_forward_expand(gf, causal_mask);
ggml_build_forward_expand(gf, identity);
ggml_build_forward_expand(gf, diag_mask);
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * inpSA = inpL;
@@ -36,7 +25,7 @@ llm_build_qwen3next::llm_build_qwen3next(const llama_model & model, const llm_gr
// Determine layer type and build appropriate attention mechanism
if (hparams.is_recurrent(il)) {
// Linear attention layer (gated delta net)
cur = build_layer_attn_linear(inp->get_recr(), cur, causal_mask, identity, diag_mask, il);
cur = build_layer_attn_linear(inp->get_recr(), cur, il);
} else {
// Full attention layer
cur = build_layer_attn(inp->get_attn(), cur, inp_pos, il);
@@ -99,11 +88,8 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen3next::build_delta_net_chu
ggml_tensor * k,
ggml_tensor * v,
ggml_tensor * g,
ggml_tensor * beta,
ggml_tensor * state,
ggml_tensor * causal_mask,
ggml_tensor * identity,
ggml_tensor * diag_mask,
ggml_tensor * b,
ggml_tensor * s,
int il) {
const int64_t S_k = q->ne[0];
const int64_t H_k = q->ne[1];
@@ -113,134 +99,123 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen3next::build_delta_net_chu
const int64_t S_v = v->ne[0];
const int64_t H_v = v->ne[1];
GGML_ASSERT(v->ne[2] == n_tokens);
GGML_ASSERT(k->ne[2] == n_tokens);
GGML_ASSERT(g->ne[0] == H_v && g->ne[1] == n_tokens && g->ne[2] == n_seqs);
GGML_ASSERT(beta->ne[0] == H_v && beta->ne[2] == n_tokens && beta->ne[3] == n_seqs);
GGML_ASSERT(state->ne[0] == S_v && state->ne[1] == S_v * H_v && state->ne[2] == 1 && state->ne[3] == n_seqs);
GGML_ASSERT(S_k == S_v);
GGML_ASSERT(H_v % H_k == 0);
GGML_ASSERT(q->ne[0] == S_k && q->ne[1] == H_k && q->ne[2] == n_tokens && q->ne[3] == n_seqs);
GGML_ASSERT(k->ne[0] == S_k && k->ne[1] == H_k && k->ne[2] == n_tokens && k->ne[3] == n_seqs);
GGML_ASSERT(v->ne[0] == S_v && v->ne[1] == H_v && v->ne[2] == n_tokens && v->ne[3] == n_seqs);
GGML_ASSERT(H_k == H_v); // we did a repeat to make sure this is the case
GGML_ASSERT(g->ne[0] == H_v && g->ne[1] == n_tokens && g->ne[2] == n_seqs);
GGML_ASSERT(b->ne[0] == H_v && b->ne[2] == n_tokens && b->ne[3] == n_seqs);
GGML_ASSERT(s->ne[0] == S_v && s->ne[1] == S_v && s->ne[2] == H_v && s->ne[3] == n_seqs);
const float eps_norm = hparams.f_norm_rms_eps;
q = ggml_l2_norm(ctx0, q, eps_norm);
k = ggml_l2_norm(ctx0, k, eps_norm);
const float scale = 1.0f / sqrtf(S_v);
const float scale = 1.0f / sqrtf(S_k);
q = ggml_scale(ctx0, q, scale);
beta = ggml_sigmoid(ctx0, beta);
cb(q, "q_in", il);
cb(k, "k_in", il);
cb(v, "v_in", il);
cb(beta, "beta_in", il);
cb(b, "b_in", il);
cb(g, "g_in", il);
q = ggml_cont_4d(ctx0, ggml_permute(ctx0, q, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs);
k = ggml_cont_4d(ctx0, ggml_permute(ctx0, k, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs);
v = ggml_cont_4d(ctx0, ggml_permute(ctx0, v, 0, 2, 1, 3), S_v, n_tokens, H_v, n_seqs);
g = ggml_cont_4d(ctx0, ggml_permute(ctx0, g, 2, 0, 3, 1), n_tokens, 1, H_k, n_seqs);
q = ggml_permute(ctx0, q, 0, 2, 1, 3); // [S_k, n_tokens, H_k, n_seqs]
k = ggml_permute(ctx0, k, 0, 2, 1, 3); // [S_k, n_tokens, H_k, n_seqs]
v = ggml_permute(ctx0, v, 0, 2, 1, 3); // [S_v, n_tokens, H_v, n_seqs]
g = ggml_permute(ctx0, g, 2, 1, 3, 0); // [ 1, n_tokens, H_v, n_seqs]
b = ggml_permute(ctx0, b, 2, 0, 1, 3); // [ 1, n_tokens, H_v, n_seqs]
beta = ggml_cont(ctx0, ggml_permute(ctx0, beta, 2, 0, 1, 3));
state = ggml_reshape_4d(ctx0, state, S_v, S_v, H_v, n_seqs);
const int CS = CHUNK_SIZE;
cb(q, "q_perm", il);
cb(k, "k_perm", il);
cb(v, "v_perm", il);
cb(beta, "beta_perm", il);
cb(g, "g_perm", il);
cb(state, "state_in", il);
GGML_ASSERT(q->ne[1] == n_tokens && q->ne[0] == S_k && q->ne[2] == H_k && q->ne[3] == n_seqs);
GGML_ASSERT(k->ne[1] == n_tokens && k->ne[0] == S_k && k->ne[2] == H_k && k->ne[3] == n_seqs);
GGML_ASSERT(v->ne[1] == n_tokens && v->ne[0] == S_v && v->ne[2] == H_k && v->ne[3] == n_seqs);
GGML_ASSERT(beta->ne[1] == n_tokens && beta->ne[2] == H_k && beta->ne[0] == 1 && beta->ne[3] == n_seqs);
// Do padding
const int64_t chunk_size = CHUNK_SIZE;
const int64_t pad = (chunk_size - n_tokens % chunk_size) % chunk_size;
const int64_t n_chunks = (n_tokens + pad) / chunk_size;
const int pad = (CS - n_tokens % CS) % CS;
const int n_chunks = (n_tokens + pad) / CS;
q = ggml_pad(ctx0, q, 0, pad, 0, 0);
k = ggml_pad(ctx0, k, 0, pad, 0, 0);
v = ggml_pad(ctx0, v, 0, pad, 0, 0);
g = ggml_pad(ctx0, g, pad, 0, 0, 0);
beta = ggml_pad(ctx0, beta, 0, pad, 0, 0);
g = ggml_pad(ctx0, g, 0, pad, 0, 0);
b = ggml_pad(ctx0, b, 0, pad, 0, 0);
cb(q, "q_pad", il);
cb(k, "k_pad", il);
cb(v, "v_pad", il);
cb(beta, "beta_pad", il);
cb(g, "g_pad", il);
ggml_tensor * v_b = ggml_mul(ctx0, v, b);
ggml_tensor * k_b = ggml_mul(ctx0, k, b);
ggml_tensor * v_beta = ggml_mul(ctx0, v, beta);
ggml_tensor * k_beta = ggml_mul(ctx0, k, beta);
cb(v_b, "v_b", il);
cb(k_b, "k_b", il);
cb(v_beta, "v_beta", il);
cb(k_beta, "k_beta", il);
q = ggml_reshape_4d(ctx0, q, S_k, CS, n_chunks, H_k * n_seqs);
k = ggml_reshape_4d(ctx0, k, S_k, CS, n_chunks, H_k * n_seqs);
k_b = ggml_reshape_4d(ctx0, k_b, S_k, CS, n_chunks, H_v * n_seqs);
v = ggml_reshape_4d(ctx0, v, S_v, CS, n_chunks, H_v * n_seqs);
v_b = ggml_reshape_4d(ctx0, v_b, S_v, CS, n_chunks, H_v * n_seqs);
q = ggml_reshape_4d(ctx0, q, S_k, chunk_size, n_chunks, H_k * n_seqs);
k = ggml_reshape_4d(ctx0, k, S_k, chunk_size, n_chunks, H_k * n_seqs);
k_beta = ggml_reshape_4d(ctx0, k_beta, S_k, chunk_size, n_chunks, H_k * n_seqs);
v = ggml_reshape_4d(ctx0, v, S_v, chunk_size, n_chunks, H_v * n_seqs);
v_beta = ggml_reshape_4d(ctx0, v_beta, S_v, chunk_size, n_chunks, H_v * n_seqs);
g = ggml_reshape_4d(ctx0, g, CS, 1, n_chunks, H_v * n_seqs);
b = ggml_reshape_4d(ctx0, b, 1, CS, n_chunks, H_v * n_seqs);
g = ggml_reshape_4d(ctx0, g, chunk_size, 1, n_chunks, H_k * n_seqs);
beta = ggml_reshape_4d(ctx0, beta, 1, chunk_size, n_chunks, H_k * n_seqs);
// [CS, 1, n_chunks, H_v * n_seqs]
ggml_tensor * g_cs = ggml_cumsum(ctx0, g);
cb(g_cs, "g_cs", il);
ggml_tensor * g_cumsum = ggml_cumsum(ctx0, g);
cb(g_cumsum, "g_cumsum", il); // shape: (chunk_size, 1, n_chunks, H_v * n_seqs)
ggml_tensor * g_cs_i = g_cs;
ggml_tensor * g_cs_j = ggml_reshape_4d(ctx0, g_cs, 1, CS, n_chunks, H_v * n_seqs);
ggml_tensor * gcs_i = g_cumsum; // ggml_reshape_4d(ctx0, g_cumsum, chunk_size, 1, n_chunks, H_v * n_seqs);
ggml_tensor * gcs_j = ggml_reshape_4d(ctx0, g_cumsum, 1, chunk_size, n_chunks, H_v * n_seqs);
g_cs_j = ggml_repeat_4d(ctx0, g_cs_j, CS, CS, n_chunks, H_v * n_seqs);
ggml_tensor * gcs_j_broadcast =
ggml_repeat_4d(ctx0, gcs_j, chunk_size, chunk_size, n_chunks, H_v * n_seqs);
ggml_tensor * decay_mask = ggml_sub(ctx0, gcs_j_broadcast, gcs_i);
cb(decay_mask, "decay_mask", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs)
decay_mask = ggml_mul(ctx0, decay_mask, diag_mask);
// [CS, CS, n_chunks, H_v * n_seqs]
ggml_tensor * decay_mask;
decay_mask = ggml_sub(ctx0, g_cs_j, g_cs_i);
decay_mask = ggml_tri(ctx0, decay_mask, GGML_TRI_TYPE_LOWER_DIAG);
decay_mask = ggml_exp(ctx0, decay_mask);
decay_mask = ggml_mul(ctx0, decay_mask, diag_mask);
cb(decay_mask, "decay_mask", il);
ggml_tensor * kmulkbeta = ggml_mul_mat(ctx0, k, k_beta);
// [CS, CS, n_chunks, H_k * n_seqs]
ggml_tensor * kb;
kb = ggml_mul_mat(ctx0, k, k_b);
kb = ggml_mul (ctx0, kb, decay_mask);
ggml_tensor * k_decay = ggml_mul(ctx0, kmulkbeta, decay_mask);
ggml_tensor * attn = ggml_neg(ctx0, ggml_mul(ctx0, k_decay, causal_mask));
cb(attn, "attn_pre_solve", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs)
// [CS, CS, n_chunks, H_k * n_seqs]
ggml_tensor * attn;
attn = ggml_tri(ctx0, kb, GGML_TRI_TYPE_LOWER);
ggml_tensor * attn_lower = ggml_mul(ctx0, attn, causal_mask);
ggml_tensor * lhs = ggml_sub(ctx0, ggml_repeat(ctx0, identity, attn_lower), attn_lower);
ggml_tensor * identity;
identity = ggml_view_1d(ctx0, attn, CS, 0);
identity = ggml_fill (ctx0, identity, 1.0f);
identity = ggml_diag (ctx0, identity);
ggml_tensor * lin_solve = ggml_solve_tri(ctx0, lhs, attn, true, true, false);
attn = ggml_mul(ctx0, lin_solve, causal_mask);
attn = ggml_add(ctx0, attn, identity);
cb(attn, "attn_solved", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs)
ggml_tensor * lhs = ggml_add(ctx0, attn, identity);
cb(lhs, "dnet_add_ch_lhs", il);
v = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, v_beta)), attn);
attn = ggml_neg(ctx0, attn);
ggml_tensor * g_cumsum_t = ggml_cont(ctx0, ggml_transpose(ctx0, g_cumsum));
ggml_tensor * gexp = ggml_exp(ctx0, g_cumsum_t);
ggml_tensor * lin_solve = ggml_solve_tri(ctx0, lhs, attn, true, true, false);
attn = ggml_add(ctx0, lin_solve, identity);
cb(attn, "dnet_add_ch_attn_solved", il); // [CS, CS, n_chunks, H_k * n_seqs]
ggml_tensor * kbeta_gexp = ggml_mul(ctx0, k_beta, gexp);
cb(kbeta_gexp, "kbeta_gexp", il); // shape: (S_k, chunk_size, n_chunks, H_v * n_seqs)
// [S_v, CS, n_chunks, H_v * n_seqs]
v = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, v_b)), attn);
ggml_tensor * k_cumdecay =
ggml_cont(ctx0, ggml_transpose(ctx0, ggml_mul_mat(ctx0, attn, ggml_cont(ctx0, ggml_transpose(ctx0, kbeta_gexp)))));
cb(k_cumdecay, "k_cumdecay", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs)
// [CS, 1, n_chunks, H_v * n_seqs]
ggml_tensor * g_exp = ggml_exp(ctx0, g_cs);
ggml_tensor * attn_kq = ggml_mul_mat(ctx0, k, q);
attn_kq = ggml_mul(ctx0, attn_kq, decay_mask);
attn_kq = ggml_mul(ctx0, attn_kq, diag_mask);
cb(attn_kq, "attn_kq", il); // shape: (chunk_size, chunk_size, n_chunks, H_v * n_seqs)
k_b = ggml_cont(ctx0, ggml_transpose(ctx0, k_b));
// [CS, S_k, n_chunks, H_k * n_seqs]
ggml_tensor * kbg = ggml_mul(ctx0, k_b, g_exp);
cb(kbg, "k_beta_g_exp", il);
// [S_k, CS, n_chunks, H_k * n_seqs]
ggml_tensor * k_cd = ggml_mul_mat(ctx0, kbg, attn);
cb(k_cd, "k_cumdecay", il);
// [S_k, CS, n_chunks, H_k * n_seqs]
ggml_tensor * g_exp_t = ggml_transpose(ctx0, g_exp);
ggml_tensor * q_g_exp = ggml_mul(ctx0, q, g_exp_t);
// [CS, CS, n_chunks, H_k * n_seqs]
ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
kq = ggml_mul(ctx0, kq, decay_mask);
kq = ggml_tri(ctx0, kq, GGML_TRI_TYPE_LOWER_DIAG);
cb(kq, "kq", il);
// vectorized calculation of key_gdiff
// improved from the chunked version:
@@ -250,109 +225,98 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen3next::build_delta_net_chu
// kgdmulvnew = (key_gdiff).transpose(-1, -2) @ v_new
// last_recurrent_state = last_recurrent_state * g_last + kgdmulvnew
// get last element in g_cumsum along chunk_size dimension (ne0)
// get last element in g_cumsum along CS dimension (ne0)
// example: [[x, y, z, ..., last], ...] -> [[last], ...]
ggml_tensor * g_last = ggml_view_4d(ctx0, g_cumsum, 1, 1, g_cumsum->ne[2], g_cumsum->ne[3],
g_cumsum->nb[1], g_cumsum->nb[2], g_cumsum->nb[3],
(g_cumsum->ne[0] - 1) * ggml_element_size(g_cumsum));
// [1, 1, n_chunks, H_v * n_seqs]
ggml_tensor * g_last = ggml_view_4d(ctx0, g_cs, 1, 1, g_cs->ne[2], g_cs->ne[3],
g_cs->nb[1],
g_cs->nb[2],
g_cs->nb[3],
ggml_row_size(g_cs->type, g_cs->ne[0] - 1));
cb(g_last, "g_last", il);
// TODO: remove this cont when CUDA supports non-cont unary ops
g_last = ggml_cont(ctx0, g_last);
cb(g_last, "g_last", il); // shape: (1, 1, n_chunks, H_v * n_seqs)
// [1, 1, n_chunks, H_v * n_seqs]
ggml_tensor * g_last_exp = ggml_exp(ctx0, g_last);
cb(g_last_exp, "g_last_exp", il); // shape: (1, 1, n_chunks, H_v * n_seqs)
cb(g_last_exp, "g_last_exp", il);
ggml_tensor * g_diff = ggml_neg(ctx0, ggml_sub(ctx0, g_cumsum, g_last));
cb(g_diff, "g_diff", il); // shape: (chunk_size, 1, n_chunks, H_v * n_seqs)
// [CS, 1, n_chunks, H_v * n_seqs]
ggml_tensor * g_diff = ggml_neg(ctx0, ggml_sub(ctx0, g_cs, g_last));
cb(g_diff, "g_diff", il);
ggml_tensor * g_diff_exp = ggml_exp(ctx0, g_diff);
ggml_tensor * g_diff_exp_t = ggml_reshape_4d(ctx0, g_diff_exp,
1, chunk_size, n_chunks, g_diff_exp->ne[3]);
ggml_tensor * g_diff_exp = ggml_exp(ctx0, g_diff);
ggml_tensor * g_diff_exp_t = ggml_transpose(ctx0, g_diff_exp);
ggml_tensor * key_gdiff = ggml_mul(ctx0, k, g_diff_exp_t);
cb(key_gdiff, "key_gdiff", il); // shape: (S_k, chunk_size, n_chunks, H_v * n_seqs)
// [S_k, CS, n_chunks, H_v * n_seqs]
ggml_tensor * kg = ggml_mul(ctx0, k, g_diff_exp_t);
cb(kg, "key_gdiff", il);
ggml_tensor * key_gdiff_t = ggml_cont(ctx0, ggml_transpose(ctx0, key_gdiff));
cb(key_gdiff_t, "key_gdiff_t", il); // shape: (chunk_size, S_k, n_chunks, H_v * n_seqs)
// [CS, S_k, n_chunks, H_v * n_seqs]
ggml_tensor * kg_t = ggml_cont(ctx0, ggml_transpose(ctx0, kg));
cb(kg_t, "key_gdiff_t", il);
ggml_tensor * s_t = ggml_transpose(ctx0, s);
s_t = ggml_cont_4d(ctx0, s_t, S_v, S_v, 1, H_v * n_seqs);
cb(s_t, "dnet_add_ch_state", il);
// state to be updated per chunk
ggml_tensor * new_state = state; // ggml_dup(ctx0, state);
cb(new_state, "new_state", il); // shape: (S_v, S_v, H_v, n_seqs)
// shape after loop of chunks: (S_v, chunk_size, n_chunks, H_v * n_seqs)
ggml_tensor * core_attn_out = nullptr;
// [CS, S_v, n_chunks, H_v * n_seqs]
ggml_tensor * v_t = ggml_cont(ctx0, ggml_transpose(ctx0, v));
for (int64_t chunk = 0; chunk < n_chunks; chunk++) {
// shape: (S_k, chunk_size, 1, H_k * n_seqs)
ggml_tensor * q_chunk = get_slice_2d(ctx0, q, chunk); // (no cont), next op: ggml_mul
ggml_tensor * ch_k_cd = get_slice_2d(ctx0, k_cd, chunk); // [S_k, CS, 1, H_k * n_seqs]
ggml_tensor * ch_v_t = get_slice_2d(ctx0, v_t, chunk); // [ CS, S_v, 1, H_v * n_seqs]
ggml_tensor * ch_kq = get_slice_2d(ctx0, kq, chunk); // [ CS, CS, 1, H_k * n_seqs]
ggml_tensor * ch_q_g_exp = get_slice_2d(ctx0, q_g_exp, chunk); // [S_k, CS, 1, H_k * n_seqs]
ggml_tensor * ch_kg_t = get_slice_2d(ctx0, kg_t, chunk); // [ CS, S_k, 1, H_v * n_seqs]
// shape: (S_v, chunk_size, 1, H_v * n_seqs)
ggml_tensor * v_chunk = get_slice_2d(ctx0, v, chunk); // (no cont), next op: ggml_repeat
// [CS, S_v, 1, H_v * n_seqs]
ggml_tensor * v_t_p = ggml_mul_mat(ctx0, ch_k_cd, s_t);
cb(v_t_p, "v_prime", il);
// shape: (chunk_size, 1, n_chunks, H_v * n_seqs)
ggml_tensor * gexp_chunk = get_slice_2d(ctx0, gexp, chunk); // (no cont), next op: ggml_mul
// [CS, S_v, 1, H_v * n_seqs]
ggml_tensor * v_t_new = ggml_sub(ctx0, ch_v_t, v_t_p);
cb(v_t_new, "v_t_new", il);
// shape: (chunk_size, 1, H_v * n_seqs)
ggml_tensor * k_cumdecay_chunk = get_slice_2d(ctx0, k_cumdecay, chunk); // (no cont), next op: ggml_mul_mat
// [S_v, CS, 1, H_v * n_seqs]
ggml_tensor * v_attn = ggml_mul_mat(ctx0, v_t_new, ch_kq);
cb(v_attn, "v_attn", il);
// attn = (q_i @ k_i.transpose(-1, -2) * decay_mask[:, :, i]).masked_fill_(mask, 0)
// replaced by precomputed attn_kq
ggml_tensor * attn_chunk = get_slice_2d(ctx0, attn_kq, chunk);
cb(attn_chunk, "attn_chunk", il);
// [S_v, CS, 1, H_v * n_seqs]
ggml_tensor * attn_inter = ggml_mul_mat(ctx0, s_t, ch_q_g_exp);
cb(attn_inter, "attn_inter", il);
ggml_tensor * state_t = ggml_cont_4d(ctx0, ggml_permute(ctx0, new_state, 1, 0, 2, 3), S_v, S_v, 1, H_v * n_seqs);
// [S_v, CS, 1, H_v * n_seqs]
ggml_tensor * o_ch = ggml_add(ctx0, attn_inter, v_attn);
cb(o_ch, "dnet_add_ch_attn_out", il);
// v_prime = (k_cumdecay[:, :, i]) @ last_recurrent_state
ggml_tensor * v_prime = ggml_mul_mat(ctx0, state_t, k_cumdecay_chunk);
cb(v_prime, "v_prime_chunk", il); // shape: (S_v, 1, H_v * n_seqs)
// v_new = v_i - v_prime
ggml_tensor * v_new = ggml_sub(ctx0, ggml_repeat(ctx0, v_chunk, v_prime), v_prime);
ggml_tensor * v_new_t = ggml_cont(ctx0, ggml_transpose(ctx0, v_new));
cb(v_new, "v_new_chunk", il);
// attn_inter = (q_i * g[:, :, i, :, None].exp()) @ last_recurrent_state
ggml_tensor * q_g_exp = ggml_mul(ctx0, q_chunk, gexp_chunk);
ggml_tensor * attn_inter = ggml_mul_mat(ctx0, state_t, q_g_exp);
cb(attn_inter, "attn_inter_chunk", il);
// core_attn_out[:, :, i] = attn_inter + attn @ v_new
ggml_tensor * v_attn = ggml_mul_mat(ctx0, v_new_t, attn_chunk);
cb(v_attn, "v_attn_chunk", il);
ggml_tensor * core_attn_out_chunk = ggml_add(ctx0, attn_inter, v_attn);
cb(core_attn_out_chunk, "core_attn_out_chunk", il); // shape: (S_v, chunk_size, 1, H_v * n_seqs)
core_attn_out = core_attn_out == nullptr
? core_attn_out_chunk
: ggml_concat(ctx0, core_attn_out, core_attn_out_chunk, 2);
v = ggml_set_inplace(ctx0, v, o_ch, v->nb[1], v->nb[2], v->nb[3], chunk * v->nb[2]);
// kgdmulvnew = (key_gdiff).transpose(-1, -2) @ v_new
ggml_tensor * k_gdiff_t = get_slice_2d(ctx0, key_gdiff_t, chunk);
//ggml_tensor * kgdmulvnew = ggml_mul_mat(ctx0, k_gdiff, v_new); // this is slower on metal, why?
ggml_tensor * kgdmulvnew = ggml_mul_mat(ctx0, v_new_t, k_gdiff_t);
// TODO: head broadcast might not work here - probably will need a transpose
ggml_tensor * kgv = ggml_mul_mat(ctx0, ch_kg_t, v_t_new); // [S_k, S_v, 1, H_k * n_seqs]
// last_recurrent_state = last_recurrent_state * g_last + kgdmulvnew
ggml_tensor * gexp_last_chunk = ggml_cont(ctx0, get_slice_2d(ctx0, g_last_exp, chunk));
new_state = ggml_add(ctx0,
ggml_mul(ctx0, new_state, ggml_reshape_4d(ctx0, gexp_last_chunk, gexp_last_chunk->ne[0], gexp_last_chunk->ne[1], H_v, n_seqs)),
ggml_reshape_4d(ctx0, kgdmulvnew, kgdmulvnew->ne[0], kgdmulvnew->ne[1], H_v, n_seqs));
ggml_tensor * ch_g_last_exp = get_slice_2d(ctx0, g_last_exp, chunk);
s_t = ggml_mul(ctx0, s_t, ch_g_last_exp);
s_t = ggml_add(ctx0, s_t, kgv);
cb(s_t, "dnet_add_ch_state", il);
}
s_t = ggml_reshape_4d(ctx0, s_t, S_v, S_v, H_v, n_seqs);
// truncate padded tokens
ggml_tensor * output_tokens = ggml_view_4d(ctx0, core_attn_out,
ggml_tensor * o = ggml_view_4d(ctx0, v,
S_v, n_tokens, H_v, n_seqs,
ggml_row_size(core_attn_out->type, S_v),
ggml_row_size(core_attn_out->type, S_v * chunk_size * n_chunks),
ggml_row_size(core_attn_out->type, S_v * chunk_size * n_chunks * H_v), 0);
output_tokens = ggml_cont(ctx0, output_tokens);
cb(output_tokens, "output_tokens", il);
ggml_row_size(v->type, S_v),
ggml_row_size(v->type, S_v * CS * n_chunks),
ggml_row_size(v->type, S_v * CS * n_chunks * H_v), 0);
// permute back to (S_v, H_v, n_tokens, n_seqs)
output_tokens = ggml_permute(ctx0, output_tokens, 0, 2, 1, 3);
output_tokens = ggml_cont(ctx0, output_tokens);
o = ggml_permute (ctx0, o, 0, 2, 1, 3); // [S_v, H_v, n_tokens, n_seqs]
s = ggml_transpose(ctx0, s_t); // [S_v, S_v, H_v, n_seqs]
return {output_tokens, new_state};
return {o, s};
}
std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen3next::build_delta_net_autoregressive(
@@ -360,8 +324,8 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen3next::build_delta_net_aut
ggml_tensor * k,
ggml_tensor * v,
ggml_tensor * g,
ggml_tensor * beta,
ggml_tensor * state,
ggml_tensor * b, // beta
ggml_tensor * s, // state
int il) {
const int64_t S_k = q->ne[0];
const int64_t H_k = q->ne[1];
@@ -371,75 +335,72 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen3next::build_delta_net_aut
const int64_t S_v = v->ne[0];
const int64_t H_v = v->ne[1];
GGML_ASSERT(n_tokens == 1); // This function is optimized for single token processing
GGML_ASSERT(v->ne[2] == n_tokens);
GGML_ASSERT(k->ne[2] == n_tokens);
GGML_ASSERT(g->ne[0] == H_v && g->ne[1] == n_tokens && g->ne[2] == n_seqs);
GGML_ASSERT(beta->ne[0] == H_v && beta->ne[2] == n_tokens && beta->ne[3] == n_seqs);
GGML_ASSERT(state->ne[0] == S_v && state->ne[1] == S_v * H_v && state->ne[2] == 1 && state->ne[3] == n_seqs);
GGML_ASSERT(n_tokens == 1);
GGML_ASSERT(S_k == S_v);
GGML_ASSERT(H_v % H_k == 0);
GGML_ASSERT(q->ne[0] == S_k && q->ne[1] == H_k && q->ne[2] == n_tokens && q->ne[3] == n_seqs);
GGML_ASSERT(k->ne[0] == S_k && k->ne[1] == H_k && k->ne[2] == n_tokens && k->ne[3] == n_seqs);
GGML_ASSERT(v->ne[0] == S_v && v->ne[1] == H_v && v->ne[2] == n_tokens && v->ne[3] == n_seqs);
GGML_ASSERT(H_k == H_v); // we did a repeat to make sure this is the case
GGML_ASSERT(g->ne[0] == H_v && g->ne[1] == n_tokens && g->ne[2] == n_seqs);
GGML_ASSERT(b->ne[0] == H_v && b->ne[2] == n_tokens && b->ne[3] == n_seqs);
GGML_ASSERT(s->ne[0] == S_v && s->ne[1] == S_v && s->ne[2] == H_v && s->ne[3] == n_seqs);
const float eps_norm = hparams.f_norm_rms_eps;
const float scale = 1.0f / sqrtf(S_k);
q = ggml_l2_norm(ctx0, q, eps_norm);
k = ggml_l2_norm(ctx0, k, eps_norm);
q = ggml_scale(ctx0, q, scale);
const float scale = 1.0f / sqrtf(S_v);
q = ggml_scale(ctx0, q, scale);
beta = ggml_sigmoid(ctx0, beta);
q = ggml_permute(ctx0, q, 0, 2, 1, 3); // [S_k, n_tokens, H_k, n_seqs]
k = ggml_permute(ctx0, k, 0, 2, 1, 3); // [S_k, n_tokens, H_k, n_seqs]
v = ggml_permute(ctx0, v, 0, 2, 1, 3); // [S_v, n_tokens, H_v, n_seqs]
cb(q, "q_in", il);
cb(k, "k_in", il);
cb(v, "v_in", il);
cb(beta, "beta_in", il);
cb(b, "b_in", il);
cb(g, "g_in", il);
state = ggml_reshape_4d(ctx0, state, S_v, S_v, H_v, n_seqs);
g = ggml_reshape_4d(ctx0, g, 1, 1, H_v, n_seqs);
b = ggml_reshape_4d(ctx0, b, 1, 1, H_v, n_seqs);
ggml_tensor * g_t = ggml_reshape_4d(ctx0, ggml_transpose(ctx0, g), 1, 1, H_k, n_seqs);
ggml_tensor * beta_t = ggml_reshape_4d(ctx0, ggml_transpose(ctx0, beta), 1, 1, H_k, n_seqs);
// [S_v, S_v, H_v, n_seqs]
g = ggml_exp(ctx0, g);
s = ggml_mul(ctx0, s, g);
// Apply exponential to g_t
g_t = ggml_exp(ctx0, g_t);
ggml_tensor * s_t = ggml_cont(ctx0, ggml_transpose(ctx0, s));
// Apply the gated delta rule for the single timestep
// last_recurrent_state = last_recurrent_state * g_t
state = ggml_mul(ctx0, state, g_t);
// [1, S_v, H_v, n_seqs]
ggml_tensor * sk;
sk = ggml_mul (ctx0, s_t, k);
sk = ggml_sum_rows(ctx0, sk);
// kv_mem = (last_recurrent_state * k_t.unsqueeze(-1)).sum(dim=-2)
ggml_tensor * k_t_unsqueezed = ggml_reshape_4d(ctx0, k, 1, S_v, H_v, n_seqs);
ggml_tensor * kv_mem = ggml_mul(ctx0, state, k_t_unsqueezed);
// we need to sum over dim=-2, so we transpose, sum, then transpose again
kv_mem = ggml_transpose(ctx0, ggml_sum_rows(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, kv_mem))));
// [S_v, 1, H_v, n_seqs]
ggml_tensor * d;
d = ggml_sub(ctx0, v, ggml_transpose(ctx0, sk));
d = ggml_mul(ctx0, d, b);
// v_t = v.unsqueeze(2) (we insert the singleton dimension after n_seqs and H_v)
ggml_tensor * v_t = ggml_reshape_4d(ctx0, v, S_v, 1, H_v, n_seqs);
// delta = (v_t - kv_mem) * beta_t
ggml_tensor * v_diff = ggml_sub(ctx0, v_t, kv_mem); // both should be [S_v, 1, H_v, n_seqs]
ggml_tensor * delta = ggml_mul(ctx0, v_diff, beta_t);
// [1, S_v, H_v, n_seqs]
ggml_tensor * d_t;
d_t = ggml_transpose(ctx0, d);
// last_recurrent_state = last_recurrent_state + k_t.unsqueeze(-1) * delta
ggml_tensor * k_t_delta = ggml_mul(ctx0, ggml_repeat_4d(ctx0, k_t_unsqueezed, S_v, S_v, H_v, n_seqs), delta);
state = ggml_add(ctx0, state, k_t_delta);
// [S_v, S_v, H_v, n_seqs]
ggml_tensor * kd;
k = ggml_repeat(ctx0, k, s);
kd = ggml_mul (ctx0, k, d_t);
// Compute the attention output
// core_attn_out = (last_recurrent_state * q_t.unsqueeze(-1)).sum(dim=-2)
ggml_tensor * q_t_unsqueezed = ggml_reshape_4d(ctx0, q, 1, S_v, H_v, n_seqs); // unsqueeze q_t
ggml_tensor * state_q = ggml_mul(ctx0, state, q_t_unsqueezed);
// again, since it's over dim = -2, transpose, sum, transpose back
ggml_tensor * core_attn_out =
ggml_transpose(ctx0, ggml_sum_rows(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, state_q))));
s_t = ggml_add(ctx0, s_t, kd);
// core_attn_out should be [S_v, 1, H_v, n_seqs] after this
cb(core_attn_out, "output_tokens", il);
cb(state, "new_state", il);
cb(s_t, "dnet_add_ar_state", il);
return {core_attn_out, state};
ggml_tensor * s_q = ggml_mul (ctx0, s_t, q);
ggml_tensor * o = ggml_sum_rows(ctx0, s_q);
o = ggml_permute (ctx0, o, 2, 0, 1, 3); // [S_v, H_v, n_tokens, n_seqs]
s = ggml_transpose(ctx0, s_t); // [S_v, S_v, H_v, n_seqs]
return {o, s};
}
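For orientation, the single-token path above is the gated delta rule (in the graph, q and k are L2-normalized, q is pre-scaled by 1/sqrt(S_k), and beta = sigmoid(b) is applied by the caller). A hedged NumPy sketch of the per-head update, with illustrative names rather than the ggml tensor names:

    import numpy as np

    def delta_rule_step(S, q, k, v, g, beta):
        # S: [S_v, S_k] recurrent state; q, k: [S_k]; v: [S_v]; g, beta: per-head scalars
        S = np.exp(g) * S                # decay the state (s = s * exp(g))
        kv_mem = S @ k                   # what the decayed state predicts for this key
        delta = beta * (v - kv_mem)      # gated correction toward the new value
        S = S + np.outer(delta, k)       # rank-1 state update
        o = S @ q                        # attention output for this token
        return o, S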
ggml_tensor * llm_build_qwen3next::build_norm_gated(
@@ -472,39 +433,29 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn(
// Split Q projection into query and gate
// The split should be along dimension 0 (the feature dimension)
ggml_tensor * Qcur = ggml_view_4d(ctx0, Qcur_full, n_embd_head, n_head, n_tokens, 1,
Qcur_full->nb[1], Qcur_full->nb[2], Qcur_full->nb[3], 0);
Qcur_full->nb[1], Qcur_full->nb[2], Qcur_full->nb[3], 0);
cb(Qcur, "Qcur_view", il);
ggml_tensor * gate =
ggml_view_4d(ctx0, Qcur_full, n_embd_head, n_head, n_tokens, 1,
Qcur_full->nb[1], Qcur_full->nb[2], Qcur_full->nb[3], n_embd_head * ggml_element_size(Qcur_full));
cb(Qcur, "Qcur", il);
cb(gate, "gate", il);
// Now reshape Qcur to [n_embd_head, n_head, n_tokens] for multi-head attention
Qcur = ggml_cont_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
cb(Qcur, "Qcur_reshaped", il);
// Apply Q normalization
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, nullptr, LLM_NORM_RMS, il);
cb(Qcur, "Qcur_normed", il);
ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
cb(Kcur, "Kcur", il);
ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
cb(Vcur, "Vcur", il);
// Apply K normalization
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, nullptr, LLM_NORM_RMS, il);
cb(Qcur, "Qcur_normed", il);
Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, nullptr, LLM_NORM_RMS, il);
cb(Kcur, "Kcur_normed", il);
// Reshape gate to [n_embd, n_tokens] for the sigmoid gating (flatten the heads)
gate = ggml_cont_2d(ctx0, gate, n_embd_head * n_head, n_tokens);
cb(gate, "gate_reshaped", il);
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
// Apply RoPE
Qcur = ggml_rope_ext(
ctx0, Qcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -519,7 +470,6 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn(
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);
// Attention computation
const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f / sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
cur = build_attn(inp,
@@ -527,10 +477,15 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn(
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
cb(cur, "attn_pregate", il);
ggml_tensor * gate_sigmoid = ggml_sigmoid(ctx0, gate);
cb(gate_sigmoid, "gate_sigmoid", il);
// TODO: CUDA is missing non-contiguous unary ops. when implemented: remove this cont
gate = ggml_cont_2d(ctx0, gate, n_embd_head * n_head, n_tokens);
cur = ggml_mul(ctx0, cur, gate_sigmoid);
gate = ggml_sigmoid(ctx0, gate);
cb(gate, "gate_sigmoid", il);
gate = ggml_reshape_2d(ctx0, gate, n_embd_head * n_head, n_tokens);
cur = ggml_mul(ctx0, cur, gate);
cb(cur, "attn_gated", il);
cur = build_lora_mm(model.layers[il].wo, cur);
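// Illustrative sketch of the output gating above: the fused Q projection yields
// [query | gate] per head, and after attention the result is scaled elementwise by
// sigmoid(gate) before the wo projection. Plain C++ over flattened heads for one token;
// the function name and buffers are illustrative, not the ggml API.
#include <cmath>
#include <cstddef>
#include <vector>

static std::vector<float> gate_attn_output(const std::vector<float> & attn_pregate,   // n_head * n_embd_head
                                           const std::vector<float> & gate_logits) {  // same size, gate half of wq
    std::vector<float> gated(attn_pregate.size());
    for (std::size_t i = 0; i < attn_pregate.size(); ++i) {
        const float g = 1.0f / (1.0f + std::exp(-gate_logits[i]));  // gate_sigmoid
        gated[i] = attn_pregate[i] * g;                             // attn_gated
    }
    return gated;
}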
@@ -560,7 +515,6 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen3next::build_qkvz(
cb(z, "z", il);
return { qkv_mixed, z };
} else {
// legacy (slower) path
ggml_tensor * mixed_qkvz = build_lora_mm(model.layers[il].ssm_in, input);
@@ -624,9 +578,6 @@ std::pair<ggml_tensor *, ggml_tensor *> llm_build_qwen3next::build_qkvz(
ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
llm_graph_input_rs * inp,
ggml_tensor * cur,
ggml_tensor * causal_mask,
ggml_tensor * identity,
ggml_tensor * diag_mask,
int il) {
const auto * mctx_cur = inp->mctx;
@@ -671,7 +622,12 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
split_sizes_ba[0] * ggml_element_size(mixed_ba_reshaped));
cb(a, "a", il);
ggml_tensor * beta = ggml_cont_4d(ctx0, b, num_v_heads, 1, n_seq_tokens, n_seqs);
// TODO: CUDA is missing non-contiguous unary ops. when implemented: remove this cont
b = ggml_cont(ctx0, b);
ggml_tensor * beta = ggml_sigmoid(ctx0, b);
beta = ggml_reshape_4d(ctx0, beta, num_v_heads, 1, n_seq_tokens, n_seqs);
// Reshape a to merge head dimensions: [batch, seq_len, num_k_heads, num_v_heads/num_k_heads] -> [batch, seq_len, num_v_heads]
ggml_tensor * alpha = ggml_cont_3d(ctx0, a, num_v_heads, n_seq_tokens, n_seqs);
@@ -679,6 +635,7 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
ggml_tensor * alpha_biased = ggml_add(ctx0, alpha, model.layers[il].ssm_dt);
ggml_tensor * alpha_softplus = ggml_softplus(ctx0, alpha_biased);
cb(alpha_softplus, "a_softplus", il);
ggml_tensor * gate = ggml_mul(ctx0, alpha_softplus, model.layers[il].ssm_a); // -A_log.exp() * softplus
cb(gate, "gate", il);
@@ -686,8 +643,6 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
ggml_tensor * conv_states_all = mctx_cur->get_r_l(il);
ggml_tensor * ssm_states_all = mctx_cur->get_s_l(il);
// bool use_precomputed_states = n_seq_tokens == 1 && mctx_cur->has_previous_state();
// Build the convolution states tensor
ggml_tensor * conv_states = build_rs(inp, conv_states_all, hparams.n_embd_r(), n_seqs);
cb(conv_states, "conv_states", il);
@@ -696,11 +651,12 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
ggml_tensor * conv_kernel = model.layers[il].ssm_conv1d;
const int64_t conv_kernel_size = conv_kernel->ne[0];
const int64_t conv_channels = d_inner + 2 * hparams.ssm_n_group * hparams.ssm_d_state;
conv_states = ggml_reshape_3d(ctx0, conv_states, conv_kernel_size - 1, conv_channels, n_seqs);
cb(conv_states, "conv_states_reshaped", il);
qkv_mixed = ggml_permute(ctx0, qkv_mixed, 1, 0, 2, 3);
cb(qkv_mixed, "qkv_mixed_permuted", il);
qkv_mixed = ggml_transpose(ctx0, qkv_mixed);
cb(qkv_mixed, "qkv_mixed_transposed", il);
ggml_tensor * conv_input = ggml_concat(ctx0, conv_states, qkv_mixed, 0);
cb(conv_input, "conv_input", il);
@@ -720,7 +676,10 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
ggml_build_forward_expand(gf, ggml_cpy(ctx0, last_conv_states, state_update_target));
cb(conv_states_all, "conv_states_updated", il);
// Apply SSM convolution
ggml_tensor * state = build_rs(inp, ssm_states_all, hparams.n_embd_s(), n_seqs);
state = ggml_reshape_4d(ctx0, state, head_v_dim, head_v_dim, num_v_heads, n_seqs);
cb(state, "state_predelta", il);
ggml_tensor * conv_output_proper = ggml_ssm_conv(ctx0, conv_input, conv_kernel);
cb(conv_output_proper, "conv_output_raw", il);
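// Illustrative sketch of the rolling causal convolution above: the cached last
// (kernel_size - 1) inputs per channel are prepended to the new tokens, a depthwise
// 1-D convolution runs over the concatenation, and the tail of the input becomes the
// next cache. Plain C++ for a single channel; it mirrors ggml_ssm_conv only in spirit.
#include <cstddef>
#include <vector>

static std::vector<float> causal_conv1d_channel(std::vector<float> & cache,          // kernel_size - 1 values
                                                const std::vector<float> & x_new,    // n_tokens new inputs
                                                const std::vector<float> & kernel) { // kernel_size taps
    const std::size_t k = kernel.size();
    std::vector<float> x = cache;                     // conv_input = concat(conv_states, x_new)
    x.insert(x.end(), x_new.begin(), x_new.end());

    std::vector<float> y(x_new.size(), 0.0f);
    for (std::size_t t = 0; t < x_new.size(); ++t) {
        for (std::size_t i = 0; i < k; ++i) {
            y[t] += kernel[i] * x[t + i];             // outputs aligned with the new tokens only
        }
    }
    cache.assign(x.end() - (k - 1), x.end());         // keep the last k-1 inputs for the next ubatch
    return y;
}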
@@ -734,26 +693,36 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
int64_t nb1_qkv = ggml_row_size(conv_qkv_mix->type, qkv_dim);
// Extract the convolved Q, K, V from conv_output
ggml_tensor * q_conv =
ggml_view_2d(ctx0, conv_qkv_mix, head_k_dim * num_k_heads, n_seq_tokens * n_seqs, nb1_qkv, 0);
ggml_tensor * q_conv = ggml_view_4d(ctx0, conv_qkv_mix, head_k_dim, num_k_heads, n_seq_tokens, n_seqs,
ggml_row_size(conv_qkv_mix->type, head_k_dim),
nb1_qkv,
nb1_qkv * n_seq_tokens,
0);
ggml_tensor * k_conv = ggml_view_4d(ctx0, conv_qkv_mix, head_k_dim, num_k_heads, n_seq_tokens, n_seqs,
ggml_row_size(conv_qkv_mix->type, head_k_dim),
nb1_qkv,
nb1_qkv * n_seq_tokens,
head_k_dim * num_k_heads * ggml_element_size(conv_qkv_mix));
ggml_tensor * v_conv = ggml_view_4d(ctx0, conv_qkv_mix, head_v_dim, num_v_heads, n_seq_tokens, n_seqs,
ggml_row_size(conv_qkv_mix->type, head_v_dim),
nb1_qkv,
nb1_qkv * n_seq_tokens,
ggml_row_size(conv_qkv_mix->type, 2 * head_k_dim * num_k_heads));
cb(q_conv, "q_conv", il);
ggml_tensor * k_conv =
ggml_view_2d(ctx0, conv_qkv_mix, head_k_dim * num_k_heads, n_seq_tokens * n_seqs, nb1_qkv,
head_k_dim * num_k_heads * ggml_element_size(conv_qkv_mix));
cb(k_conv, "k_conv", il);
ggml_tensor * v_conv =
ggml_view_2d(ctx0, conv_qkv_mix, head_v_dim * num_v_heads, n_seq_tokens * n_seqs, nb1_qkv,
2 * head_k_dim * num_k_heads * ggml_element_size(conv_qkv_mix));
cb(v_conv, "v_conv", il);
// Unsqueeze them
q_conv = ggml_cont_4d(ctx0, q_conv, head_k_dim, num_k_heads, n_seq_tokens, n_seqs);
k_conv = ggml_cont_4d(ctx0, k_conv, head_k_dim, num_k_heads, n_seq_tokens, n_seqs);
v_conv = ggml_cont_4d(ctx0, v_conv, head_v_dim, num_v_heads, n_seq_tokens, n_seqs);
const float eps_norm = hparams.f_norm_rms_eps;
ggml_tensor * state = build_rs(inp, ssm_states_all, hparams.n_embd_s(), n_seqs);
state = ggml_reshape_4d(ctx0, state, head_v_dim, head_v_dim * num_v_heads, 1, n_seqs);
cb(state, "state_predelta", il);
q_conv = ggml_l2_norm(ctx0, q_conv, eps_norm);
k_conv = ggml_l2_norm(ctx0, k_conv, eps_norm);
//q_conv = ggml_cont_4d(ctx0, q_conv, head_k_dim, num_k_heads, n_seq_tokens, n_seqs);
//k_conv = ggml_cont_4d(ctx0, k_conv, head_k_dim, num_k_heads, n_seq_tokens, n_seqs);
//v_conv = ggml_cont_4d(ctx0, v_conv, head_v_dim, num_v_heads, n_seq_tokens, n_seqs);
// if the number of K heads differs from the number of V heads, repeat tensors so their shapes match
if (num_k_heads != num_v_heads) {
@@ -786,7 +755,7 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
if (n_seq_tokens == 1) {
attn_out = build_delta_net_autoregressive(q_conv, k_conv, v_conv, gate, beta, state, il);
} else {
attn_out = build_delta_net_chunking(q_conv, k_conv, v_conv, gate, beta, state, causal_mask, identity, diag_mask, il);
attn_out = build_delta_net_chunking(q_conv, k_conv, v_conv, gate, beta, state, il);
}
ggml_tensor * output = attn_out.first;
ggml_tensor * new_state = attn_out.second;
@@ -795,19 +764,15 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
// Update the recurrent states
ggml_build_forward_expand(gf,
ggml_cpy(ctx0, new_state,
ggml_view_1d(ctx0, ssm_states_all, hparams.n_embd_s() * n_seqs,
kv_head * hparams.n_embd_s() * ggml_element_size(ssm_states_all))));
// Reshape both attn_out_final and z to 2D tensors for normalization
// attn_out_final: [head_dim, n_heads, n_tokens, n_seqs] -> [n_heads * n_tokens * n_seqs, head_dim]
ggml_tensor * attn_out_2d_final = ggml_reshape_2d(ctx0, output, head_v_dim, num_v_heads * n_seq_tokens * n_seqs);
ggml_cpy(ctx0, new_state,
ggml_view_1d(ctx0, ssm_states_all, hparams.n_embd_s() * n_seqs,
kv_head * hparams.n_embd_s() * ggml_element_size(ssm_states_all))));
// z: [head_dim, n_heads, n_tokens, n_seqs] -> [n_heads * n_tokens * n_seqs, head_dim]
ggml_tensor * z_2d = ggml_reshape_2d(ctx0, z, head_v_dim, num_v_heads * n_seq_tokens * n_seqs);
ggml_tensor * z_2d = ggml_reshape_4d(ctx0, z, head_v_dim, num_v_heads, n_seq_tokens, n_seqs);
// Apply gated normalization: self.norm(core_attn_out, z)
ggml_tensor * attn_out_norm = build_norm_gated(attn_out_2d_final, model.layers[il].ssm_norm, z_2d, il);
ggml_tensor * attn_out_norm = build_norm_gated(output, model.layers[il].ssm_norm, z_2d, il);
// Final reshape: [head_dim, n_heads, n_tokens, n_seqs] -> [n_tokens, n_seqs, n_heads * head_dim]
ggml_tensor * final_output = ggml_reshape_3d(ctx0, attn_out_norm, head_v_dim * num_v_heads, n_seq_tokens, n_seqs);
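// Illustrative sketch of the gated normalization ("self.norm(core_attn_out, z)").
// The exact formulation is defined by build_norm_gated, which is not shown in this hunk;
// a common gated RMSNorm variant is assumed here: gate by silu(z), RMS-normalize, then
// scale by the learned weight. Plain C++ over one row of head_v_dim values.
#include <cmath>

static void rms_norm_gated_row(float * x, const float * z, const float * weight,
                               int n, float eps) {
    double sum_sq = 0.0;
    for (int i = 0; i < n; ++i) {
        const float silu_z = z[i] / (1.0f + std::exp(-z[i]));  // silu(z) = z * sigmoid(z)
        x[i] *= silu_z;                                        // gate the hidden states
        sum_sq += (double) x[i] * x[i];
    }
    const float inv_rms = 1.0f / std::sqrt((float) (sum_sq / n) + eps);
    for (int i = 0; i < n; ++i) {
        x[i] = x[i] * inv_rms * weight[i];                     // normalize and scale
    }
}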
@@ -818,7 +783,8 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn_linear(
cb(cur, "linear_attn_out", il);
// Reshape back to original dimensions
cur = ggml_cont_2d(ctx0, cur, n_embd, n_seq_tokens * n_seqs);
cur = ggml_reshape_2d(ctx0, cur, n_embd, n_seq_tokens * n_seqs);
return cur;
}
@@ -839,7 +805,7 @@ ggml_tensor * llm_build_qwen3next::build_layer_ffn(ggml_tensor * cur, const int
if (model.layers[il].ffn_up_shexp != nullptr) {
ggml_tensor * ffn_shexp =
build_ffn(cur,
model.layers[il].ffn_up_shexp, NULL, NULL,
model.layers[il].ffn_gate_shexp, NULL, NULL,
model.layers[il].ffn_down_shexp, NULL, NULL,
NULL,
@@ -852,11 +818,9 @@ ggml_tensor * llm_build_qwen3next::build_layer_ffn(ggml_tensor * cur, const int
ggml_tensor * shared_gate = build_lora_mm(model.layers[il].ffn_gate_inp_shexp, cur);
cb(shared_gate, "shared_expert_gate", il);
// Apply sigmoid to the gate
shared_gate = ggml_sigmoid(ctx0, shared_gate);
cb(shared_gate, "shared_expert_gate_sigmoid", il);
// Apply the gate to the shared expert output
ffn_shexp = ggml_mul(ctx0, ffn_shexp, shared_gate);
cb(ffn_shexp, "ffn_shexp_gated", il);

View File

@@ -8301,7 +8301,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
//for (int kv : { 1, 17, 31, 33, 61, 113, 65, 127, 129, 130, 255, 260, 371, 380, 407, 512, 1024, }) {
for (int kv : { 113, 512, 1024, }) {
if (nr2 != 1 && kv != 512) continue;
for (int nb : { 1, 3, 32, 35, }) {
for (int nb : { 1, 3, 32, 75, }) {
for (ggml_prec prec : {GGML_PREC_F32, GGML_PREC_DEFAULT}) {
if (hsk != 128 && prec == GGML_PREC_DEFAULT) continue;
for (ggml_type type_KV : {GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0}) {

View File

@@ -20,6 +20,7 @@ add_library(mtmd
models/internvl.cpp
models/kimivl.cpp
models/kimik25.cpp
models/nemotron-v2-vl.cpp
models/llama4.cpp
models/llava.cpp
models/minicpmv.cpp

View File

@@ -236,6 +236,7 @@ enum projector_type {
PROJECTOR_TYPE_GLM4V,
PROJECTOR_TYPE_YOUTUVL,
PROJECTOR_TYPE_KIMIK25,
PROJECTOR_TYPE_NEMOTRON_V2_VL,
PROJECTOR_TYPE_UNKNOWN,
};
@@ -270,6 +271,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
{ PROJECTOR_TYPE_GLM4V, "glm4v"},
{ PROJECTOR_TYPE_YOUTUVL, "youtuvl"},
{ PROJECTOR_TYPE_KIMIK25, "kimik25"},
{ PROJECTOR_TYPE_NEMOTRON_V2_VL, "nemotron_v2_vl"},
};
static projector_type clip_projector_type_from_string(const std::string & str) {

View File

@@ -15,6 +15,7 @@ enum ffn_op_type {
FFN_GELU_ERF,
FFN_SILU,
FFN_GELU_QUICK,
FFN_RELU_SQR,
};
enum norm_type {

View File

@@ -559,6 +559,12 @@ ggml_tensor * clip_graph::build_ffn(
cur = ggml_gelu_quick(ctx0, cur);
cb(cur, "ffn_gelu_quick", il);
} break;
case FFN_RELU_SQR:
{
cur = ggml_relu(ctx0, cur);
cur = ggml_sqr(ctx0, cur);
cb(cur, "ffn_relu_sqr", il);
} break;
}
if (down) {
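// Illustrative sketch of the new FFN_RELU_SQR activation added above: ggml_relu followed
// by ggml_sqr is the elementwise squared ReLU, f(x) = max(x, 0)^2.
#include <algorithm>

static inline float relu_sqr(float x) {
    const float r = std::max(x, 0.0f);
    return r * r;
}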
@@ -810,6 +816,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
{
builder = std::make_unique<clip_graph_internvl>(ctx, img);
} break;
case PROJECTOR_TYPE_NEMOTRON_V2_VL:
{
builder = std::make_unique<clip_graph_nemotron_v2_vl>(ctx, img);
} break;
case PROJECTOR_TYPE_LLAMA4:
{
builder = std::make_unique<clip_graph_llama4>(ctx, img);
@@ -1110,6 +1120,7 @@ struct clip_model_loader {
}
} break;
case PROJECTOR_TYPE_INTERNVL:
case PROJECTOR_TYPE_NEMOTRON_V2_VL:
{
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
} break;
@@ -1767,6 +1778,12 @@ struct clip_model_loader {
model.mm_3_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "weight"));
model.mm_3_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "bias"));
} break;
case PROJECTOR_TYPE_NEMOTRON_V2_VL:
{
model.mm_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight"));
model.mm_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight"));
model.mm_3_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "weight"));
} break;
case PROJECTOR_TYPE_GLMA:
{
model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
@@ -3088,6 +3105,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
case PROJECTOR_TYPE_GLM_EDGE:
case PROJECTOR_TYPE_GEMMA3:
case PROJECTOR_TYPE_INTERNVL: // TODO @ngxson : support dynamic resolution
case PROJECTOR_TYPE_NEMOTRON_V2_VL:
{
clip_image_u8 resized_image;
int sz = params.image_size;
@@ -3397,6 +3415,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
case PROJECTOR_TYPE_GEMMA3:
case PROJECTOR_TYPE_IDEFICS3:
case PROJECTOR_TYPE_INTERNVL:
case PROJECTOR_TYPE_NEMOTRON_V2_VL:
case PROJECTOR_TYPE_LLAMA4:
{
// both X and Y are downscaled by the scale factor
@@ -3805,6 +3824,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
case PROJECTOR_TYPE_GEMMA3NV:
case PROJECTOR_TYPE_IDEFICS3:
case PROJECTOR_TYPE_INTERNVL:
case PROJECTOR_TYPE_NEMOTRON_V2_VL:
case PROJECTOR_TYPE_QWEN2A:
case PROJECTOR_TYPE_GLMA:
case PROJECTOR_TYPE_ULTRAVOX:
@@ -3968,6 +3988,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
return ctx->model.mm_2_w->ne[1];
case PROJECTOR_TYPE_INTERNVL:
case PROJECTOR_TYPE_NEMOTRON_V2_VL:
return ctx->model.mm_3_w->ne[1];
case PROJECTOR_TYPE_LLAMA4:
return ctx->model.mm_model_proj->ne[1];

View File

@@ -42,6 +42,11 @@ struct clip_graph_internvl : clip_graph {
ggml_cgraph * build() override;
};
struct clip_graph_nemotron_v2_vl : clip_graph {
clip_graph_nemotron_v2_vl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
};
struct clip_graph_llama4 : clip_graph {
clip_graph_llama4(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;

View File

@@ -0,0 +1,35 @@
#include "models.h"
ggml_cgraph * clip_graph_nemotron_v2_vl::build() {
GGML_ASSERT(model.class_embedding != nullptr);
GGML_ASSERT(model.position_embeddings != nullptr);
const int n_registers = model.class_embedding->ne[1];
const int n_pos = n_patches + n_registers;
ggml_tensor * inp = build_inp();
// add position embeddings (pre-downsampled during GGUF conversion for fixed 512x512 input)
inp = ggml_add(ctx0, inp, model.position_embeddings);
cb(inp, "inp_pos", -1);
inp = ggml_concat(ctx0, model.class_embedding, inp, 1);
ggml_tensor * cur = build_vit(inp, n_pos, NORM_TYPE_NORMAL, hparams.ffn_op, nullptr, nullptr);
cur = ggml_view_2d(ctx0, cur,
n_embd, n_patches,
ggml_row_size(cur->type, n_embd),
n_registers * ggml_row_size(cur->type, n_embd));
cur = build_patch_merge_permute(cur, model.hparams.n_merge);
{
cur = build_norm(cur, model.mm_0_w, nullptr, NORM_TYPE_RMS, 1e-6, -1);
cur = build_ffn(cur, model.mm_1_w, nullptr, nullptr, nullptr, model.mm_3_w, nullptr, FFN_RELU_SQR, -1);
}
ggml_build_forward_expand(gf, cur);
return gf;
}
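// Illustrative sketch of the register-token handling above: n_registers learned tokens are
// prepended before the ViT, and the output view skips those rows (a byte offset of
// n_registers * row_size) so only the patch tokens reach the patch-merge + MLP projector.
// Plain C++ over a row-major [n_pos x n_embd] buffer; the helper is illustrative.
#include <cstddef>

static const float * skip_register_tokens(const float * vit_out, int n_embd, int n_registers) {
    // row r starts at vit_out + r * n_embd; drop the first n_registers rows
    return vit_out + (std::size_t) n_registers * n_embd;
}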

View File

@@ -28,10 +28,6 @@ target_link_libraries(${TARGET} PUBLIC common mtmd ${CMAKE_THREAD_LIBS_INIT})
set(TARGET llama-server)
if (NOT LLAMA_HTTPLIB)
message(FATAL_ERROR "LLAMA_HTTPLIB is OFF, cannot build llama-server. Hint: to skip building server, set -DLLAMA_BUILD_SERVER=OFF")
endif()
set(TARGET_SRCS
server.cpp
server-http.cpp

View File

@@ -1264,78 +1264,32 @@ int poll_wrapper(struct pollfd *fds, nfds_t nfds, int timeout) {
#endif
}
template <bool Read>
ssize_t select_impl(socket_t sock, time_t sec, time_t usec) {
#ifdef __APPLE__
if (sock >= FD_SETSIZE) { return -1; }
fd_set fds, *rfds, *wfds;
FD_ZERO(&fds);
FD_SET(sock, &fds);
rfds = (Read ? &fds : nullptr);
wfds = (Read ? nullptr : &fds);
timeval tv;
tv.tv_sec = static_cast<long>(sec);
tv.tv_usec = static_cast<decltype(tv.tv_usec)>(usec);
return handle_EINTR([&]() {
return select(static_cast<int>(sock + 1), rfds, wfds, nullptr, &tv);
});
#else
ssize_t select_impl(socket_t sock, short events, time_t sec,
time_t usec) {
struct pollfd pfd;
pfd.fd = sock;
pfd.events = (Read ? POLLIN : POLLOUT);
pfd.events = events;
pfd.revents = 0;
auto timeout = static_cast<int>(sec * 1000 + usec / 1000);
return handle_EINTR([&]() { return poll_wrapper(&pfd, 1, timeout); });
#endif
}
ssize_t select_read(socket_t sock, time_t sec, time_t usec) {
return select_impl<true>(sock, sec, usec);
return select_impl(sock, POLLIN, sec, usec);
}
ssize_t select_write(socket_t sock, time_t sec, time_t usec) {
return select_impl<false>(sock, sec, usec);
return select_impl(sock, POLLOUT, sec, usec);
}
Error wait_until_socket_is_ready(socket_t sock, time_t sec,
time_t usec) {
#ifdef __APPLE__
if (sock >= FD_SETSIZE) { return Error::Connection; }
fd_set fdsr, fdsw;
FD_ZERO(&fdsr);
FD_ZERO(&fdsw);
FD_SET(sock, &fdsr);
FD_SET(sock, &fdsw);
timeval tv;
tv.tv_sec = static_cast<long>(sec);
tv.tv_usec = static_cast<decltype(tv.tv_usec)>(usec);
auto ret = handle_EINTR([&]() {
return select(static_cast<int>(sock + 1), &fdsr, &fdsw, nullptr, &tv);
});
if (ret == 0) { return Error::ConnectionTimeout; }
if (ret > 0 && (FD_ISSET(sock, &fdsr) || FD_ISSET(sock, &fdsw))) {
auto error = 0;
socklen_t len = sizeof(error);
auto res = getsockopt(sock, SOL_SOCKET, SO_ERROR,
reinterpret_cast<char *>(&error), &len);
auto successful = res >= 0 && !error;
return successful ? Error::Success : Error::Connection;
}
return Error::Connection;
#else
struct pollfd pfd_read;
pfd_read.fd = sock;
pfd_read.events = POLLIN | POLLOUT;
pfd_read.revents = 0;
auto timeout = static_cast<int>(sec * 1000 + usec / 1000);
@@ -1354,7 +1308,6 @@ Error wait_until_socket_is_ready(socket_t sock, time_t sec,
}
return Error::Connection;
#endif
}
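// Illustrative sketch of the unified readiness check above: the separate select()-based
// Apple path is gone, and a single poll() call waits for the requested events (POLLIN for
// readability, POLLOUT for writability) with a millisecond timeout. POSIX-only sketch;
// httplib's handle_EINTR/poll_wrapper handling is omitted.
#include <poll.h>

static int wait_for_socket(int fd, short events, long sec, long usec) {
    struct pollfd pfd;
    pfd.fd      = fd;
    pfd.events  = events;              // POLLIN, POLLOUT, or both
    pfd.revents = 0;
    const int timeout_ms = (int) (sec * 1000 + usec / 1000);
    return poll(&pfd, 1, timeout_ms);  // > 0 ready, 0 timeout, < 0 error (check errno for EINTR)
}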
bool is_socket_alive(socket_t sock) {
@@ -7138,17 +7091,6 @@ Server::process_request(Stream &strm, const std::string &remote_addr,
res.version = "HTTP/1.1";
res.headers = default_headers_;
#ifdef __APPLE__
// Socket file descriptor exceeded FD_SETSIZE...
if (strm.socket() >= FD_SETSIZE) {
Headers dummy;
detail::read_headers(strm, dummy);
res.status = StatusCode::InternalServerError_500;
output_error_log(Error::ExceedMaxSocketDescriptorCount, &req);
return write_response(strm, close_connection, req, res);
}
#endif
// Request line and headers
if (!parse_request_line(line_reader.ptr(), req)) {
res.status = StatusCode::BadRequest_400;
@@ -12063,7 +12005,7 @@ bool get_cert_sans(cert_t cert, std::vector<SanEntry> &sans) {
if (!names) return true; // No SANs is valid
auto count = sk_GENERAL_NAME_num(names);
for (int i = 0; i < count; i++) {
for (decltype(count) i = 0; i < count; i++) {
auto gen = sk_GENERAL_NAME_value(names, i);
if (!gen) continue;

View File

@@ -8,8 +8,8 @@
#ifndef CPPHTTPLIB_HTTPLIB_H
#define CPPHTTPLIB_HTTPLIB_H
#define CPPHTTPLIB_VERSION "0.31.0"
#define CPPHTTPLIB_VERSION_NUM "0x001F00"
#define CPPHTTPLIB_VERSION "0.32.0"
#define CPPHTTPLIB_VERSION_NUM "0x002000"
/*
* Platform compatibility check