Compare commits

..

53 Commits

Author SHA1 Message Date
Georgi Gerganov
c0c3e428dd refactor 2026-02-16 23:02:45 +02:00
Georgi Gerganov
7f049860b4 reasoning and error handling 2026-02-16 22:16:15 +02:00
Georgi Gerganov
2ffa45edfc add tokens 2026-02-16 21:52:54 +02:00
Georgi Gerganov
9c29be1177 store full response 2026-02-16 21:44:29 +02:00
Georgi Gerganov
013963cfd5 add html 2026-02-16 21:22:06 +02:00
Georgi Gerganov
e2e998a2d6 fix prompts 2026-02-16 21:02:25 +02:00
Georgi Gerganov
6c41664b8b simplify 2026-02-16 19:50:27 +02:00
Georgi Gerganov
7b84af8051 fix counts 2026-02-16 16:38:31 +02:00
Georgi Gerganov
60a501e138 cleanup 2026-02-16 16:31:14 +02:00
Georgi Gerganov
e6e777cfb3 resume eval 2026-02-16 16:21:36 +02:00
Georgi Gerganov
ad3a54eb68 ignore errors 2026-02-16 15:23:23 +02:00
Georgi Gerganov
c6d70b9bea add AGENTS.md 2026-02-16 13:13:35 +02:00
Georgi Gerganov
de956a6ca8 cleanup 2026-02-16 12:02:16 +02:00
Georgi Gerganov
350e7c1409 datasets : fix aime2025 2026-02-16 11:55:57 +02:00
Georgi Gerganov
db10dda1f3 grade : improve regex + logs 2026-02-16 11:51:36 +02:00
Georgi Gerganov
52759bf078 grader : update prompt 2026-02-16 11:17:53 +02:00
Georgi Gerganov
99e3c3d02c datasets : add aime2025 2026-02-16 11:07:54 +02:00
Georgi Gerganov
c6315655b7 cont 2026-02-16 10:56:58 +02:00
Georgi Gerganov
f762a71d56 grader : improve example answers 2026-02-16 10:51:41 +02:00
Georgi Gerganov
73e61d5b75 rename 2026-02-16 10:30:10 +02:00
Georgi Gerganov
cffd268bb3 add gpqa + sampling + docs 2026-02-16 00:52:33 +02:00
Georgi Gerganov
e8a807519a datasets : add gsm8k 2026-02-15 23:19:46 +02:00
Georgi Gerganov
1db8428f00 remove old files 2026-02-15 22:16:54 +02:00
Georgi Gerganov
7751ae2796 docs 2026-02-15 22:15:50 +02:00
Georgi Gerganov
d2b10302ce improve grader 2026-02-15 22:12:02 +02:00
Georgi Gerganov
68dde884d6 minor 2026-02-15 21:21:40 +02:00
Georgi Gerganov
fd90796da2 eval : support multiple dataset runs 2026-02-15 21:08:24 +02:00
Georgi Gerganov
8156d549f6 sim : fix answer matching 2026-02-15 21:08:24 +02:00
Georgi Gerganov
9695e6feb4 test : fix path 2026-02-15 21:08:24 +02:00
Georgi Gerganov
fb1481d60d eval : add prompts 2026-02-15 21:08:24 +02:00
Georgi Gerganov
812ae13ec1 eval : print progress 2026-02-15 21:08:24 +02:00
Georgi Gerganov
e79e8d02d5 examples: add task summary table to llama-eval-new.py 2026-02-15 21:08:23 +02:00
Georgi Gerganov
a939f4c47e docs: update llama-eval-discussion.md with threading and model parameter updates
- Add threading support implementation details
- Document ThreadPoolExecutor usage and thread safety
- Add model parameter implementation details
- Include testing results for both features
2026-02-15 21:08:23 +02:00
Georgi Gerganov
62b04cef54 examples: add threading support and model parameter to llama-eval-new.py
- Add ThreadPoolExecutor for parallel request processing controlled by --threads
- Add --model argument to specify model name in request data
- Refactor process() to use thread-safe _process_single_case() method
- Update progress tracking to work with concurrent execution
2026-02-15 21:08:23 +02:00
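For reference, a minimal sketch of the ThreadPoolExecutor fan-out described in this commit, assuming a per-case callable like the `_process_single_case()` mentioned above; this is illustrative only, not the actual `llama-eval-new.py` code.

```python
# Illustrative sketch only -- not the actual llama-eval-new.py implementation.
# Fan out one request per case across a thread pool sized by --threads.
from concurrent.futures import ThreadPoolExecutor, as_completed

def run_cases(cases, process_single_case, threads=32):
    results = {}
    with ThreadPoolExecutor(max_workers=threads) as pool:
        futures = {pool.submit(process_single_case, case): case for case in cases}
        for future in as_completed(futures):
            case = futures[future]
            results[case["id"]] = future.result()  # assumes each case dict carries an "id"
    return results
```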
Georgi Gerganov
37b26cafee docs: update llama-eval-discussion.md with session work summary 2026-02-15 21:08:23 +02:00
Georgi Gerganov
04f6872116 examples: use cached dataset path in simulator to avoid HF Hub requests 2026-02-15 21:08:23 +02:00
Georgi Gerganov
c2619c18bf examples: use cached dataset path to avoid HF Hub requests 2026-02-15 21:08:23 +02:00
Georgi Gerganov
87f8930968 examples: remove HF_HUB_OFFLINE to allow dataset download 2026-02-15 21:08:23 +02:00
Georgi Gerganov
9453f9de12 examples: use HF_HUB_OFFLINE to avoid HF Hub warnings 2026-02-15 21:08:23 +02:00
Georgi Gerganov
5a1be6ce37 examples: implement flexible grader system for answer validation
- Add Grader class supporting regex and CLI-based grading
- Implement built-in regex patterns for AIME, GSM8K, MMLU, HellaSwag, ARC, WinoGrande
- Add CLI grader interface: python script.py --answer <pred> --expected <gold>
- Add HF telemetry disable to avoid warnings
- Support exact match requirement for regex patterns
- Add 30-second timeout for CLI grader
- Handle both boxed and plain text formats for AIME answers
2026-02-15 21:08:23 +02:00
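As a reference for the CLI grader interface named above (`python script.py --answer <pred> --expected <gold>`), here is a minimal sketch of such a grader script; it is a hypothetical example, not a file added by this change.

```python
#!/usr/bin/env python3
# Hypothetical CLI grader matching the interface described in the commit above:
# exit code 0 means the prediction is correct, non-zero means incorrect.
import argparse
import re
import sys

def last_number(text):
    numbers = re.findall(r"-?\d+(?:\.\d+)?", text)
    return numbers[-1] if numbers else None

parser = argparse.ArgumentParser()
parser.add_argument("--answer", required=True)
parser.add_argument("--expected", required=True)
args = parser.parse_args()

sys.exit(0 if last_number(args.answer) == last_number(args.expected) else 1)
```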
Georgi Gerganov
a80814e97b docs: remove README.md from llama-eval 2026-02-15 21:08:23 +02:00
Georgi Gerganov
5cc2258e82 examples: add simplified llama-eval-new.py for AIME evaluation
- Create new simplified evaluation script focused only on AIME
- Implement EvalState and Processor dataclasses for structured state management
- Add real-time feedback showing correct/incorrect status per case
- Abstract grading interface for external grader support
- Use structured JSON output for eval state
- Apply HuggingFace dataset caching to avoid repeated downloads
- Remove Levenshtein matching - eval script only sends requests and validates answers
2026-02-15 21:08:22 +02:00
Georgi Gerganov
c87af1d527 docs: update llama-eval-discussion.md with session work summary
Add summary of llama-server-simulator implementation work including
features, testing results, technical decisions, and refactoring.
2026-02-15 21:08:22 +02:00
Georgi Gerganov
23d4e21a81 examples: refactor test-simulator.sh for better readability
Extract repeating question string into TEST_QUESTION variable and
create make_request() helper function to reduce code duplication.
Add proper error handling for error responses.
2026-02-15 21:08:22 +02:00
Georgi Gerganov
07d5e1e0ea examples: add llama-server simulator for testing eval scripts
Add a standalone Python script that simulates a llama-server HTTP endpoint
for testing the eval script. The simulator:

- Implements /v1/chat/completions endpoint with OpenAI-compatible format
- Loads AIME dataset from HuggingFace with local caching
- Uses Levenshtein distance for intelligent question matching
- Supports configurable success rate for correct/wrong answer generation
- Provides debug logging for troubleshooting

Also includes test scripts and documentation for testing and understanding
the simulator functionality.
2026-02-15 21:08:22 +02:00
gatbontonpc
8839037528 add checkpointing 2026-02-15 21:08:22 +02:00
gatbontonpc
89cab3dbc5 Add readme 2026-02-15 21:08:22 +02:00
gatbontonpc
c2d83ca048 multi source llama-eval 2026-02-15 21:08:22 +02:00
gatbontonpc
c05df17ce3 working llama-eval mc and math suite 2026-02-15 21:08:19 +02:00
David Friehs
27b93cbd15 cuda: optimize iq2xxs/iq2xs/iq3xxs dequantization (#19624)
* cuda: optimize iq2xxs/iq2xs/iq3xxs dequantization

- load all 8 int8 for a grid position in one load
- calculate signs via popcnt instead of fetching from ksigns table
- broadcast signs to drop individual shift/mask

* cuda: iq2xxs: simplify sum scaling

express `(sum * scale + sum / 2) / 4` as `(sum * (scale * 2 + 1)) / 8`
express `((aux32 >> 28) * 2 + 1)` as `(aux32 >> 27 | 1)`

saves 3 registers for mul_mat_vec_q (152 -> 149) according to nsight
AFAICT no overflow can occur here as iq2xxs values are far too small

* uint -> uint32_t

error: identifier "uint" is undefined
2026-02-15 22:38:42 +05:30
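The two rewrites above can be sanity-checked numerically; the snippet below is an editorial check under C-style truncating division, not part of the diff.

```python
# Editorial sanity check of the two equivalences stated in the commit message,
# using C-style (truncate-toward-zero) integer division.
def c_div(a, b):
    q = abs(a) // abs(b)
    return q if (a >= 0) == (b >= 0) else -q

# (sum * scale + sum / 2) / 4  ==  (sum * (scale * 2 + 1)) / 8
for scale in range(16):                    # 4-bit scale
    for s in range(-4096, 4097):           # representative partial sums
        assert c_div(s * scale + c_div(s, 2), 4) == c_div(s * (scale * 2 + 1), 8)

# ((aux32 >> 28) * 2 + 1)  ==  (aux32 >> 27 | 1)
for aux32 in range(0, 1 << 32, 1 << 24):   # only the high bits matter here
    assert ((aux32 >> 28) * 2 + 1) == (aux32 >> 27 | 1)
```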
Aaron Teo
6e67fd2144 docs: update s390x build docs (#19643) 2026-02-16 00:33:34 +08:00
Adrien Gallouët
9e118b97c4 build : remove LLAMA_HTTPLIB option (#19623)
This option was introduced as a workaround because cpp-httplib could not
build on visionOS. Since it has been fixed and now compiles on all platforms,
we can remove it and simplify many things.

Signed-off-by: Adrien Gallouët <angt@huggingface.co>
2026-02-15 15:38:50 +01:00
Daniel Bevenius
57088276d4 cmake : check if KleidiAI API has been fetched (#19640)
This commit addresses a build issue with the KleidiAI backend when
building multiple cpu backends. Commit
3a00c98584 ("cmake : fix KleidiAI install
target failure with EXCLUDE_FROM_ALL") introduced a change where
FetchContent_Populate is called instead of FetchContent_MakeAvailable,
where the latter does handle this case (it is idempotent but
FetchContent_Populate is not).

I missed this during my review and I should not have committed without
verifying the CI failure, sorry about that.
2026-02-15 13:59:38 +01:00
19 changed files with 2104 additions and 154 deletions

View File

@@ -112,7 +112,6 @@ option(LLAMA_TOOLS_INSTALL "llama: install tools" ${LLAMA_TOOLS_INSTALL_
option(LLAMA_TESTS_INSTALL "llama: install tests" ON)
# 3rd party libs
option(LLAMA_HTTPLIB "llama: httplib for downloading functionality" ON)
option(LLAMA_OPENSSL "llama: use openssl to support HTTPS" ON)
option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured output in common utils" OFF)
@@ -197,9 +196,7 @@ add_subdirectory(src)
if (LLAMA_BUILD_COMMON)
add_subdirectory(common)
if (LLAMA_HTTPLIB)
add_subdirectory(vendor/cpp-httplib)
endif()
add_subdirectory(vendor/cpp-httplib)
endif()
if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)

View File

@@ -449,10 +449,9 @@ cmake -B build-visionos -G Xcode \
-DCMAKE_SYSTEM_NAME=visionOS \
-DCMAKE_OSX_SYSROOT=xros \
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xros \
-DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \
-DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \
-DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
-DLLAMA_OPENSSL=OFF \
-DLLAMA_HTTPLIB=OFF \
-DLLAMA_BUILD_SERVER=OFF \
-S .
cmake --build build-visionos --config Release -- -quiet
@@ -465,10 +464,9 @@ cmake -B build-visionos-sim -G Xcode \
-DCMAKE_SYSTEM_NAME=visionOS \
-DCMAKE_OSX_SYSROOT=xrsimulator \
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xrsimulator \
-DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \
-DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \
-DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
-DLLAMA_OPENSSL=OFF \
-DLLAMA_HTTPLIB=OFF \
-DLLAMA_BUILD_SERVER=OFF \
-S .
cmake --build build-visionos-sim --config Release -- -quiet

View File

@@ -112,11 +112,7 @@ endif()
# TODO: use list(APPEND LLAMA_COMMON_EXTRA_LIBS ...)
set(LLAMA_COMMON_EXTRA_LIBS build_info)
if (LLAMA_HTTPLIB)
target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_HTTPLIB)
set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} cpp-httplib)
endif()
set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} cpp-httplib)
if (LLAMA_LLGUIDANCE)
include(ExternalProject)

View File

@@ -19,9 +19,7 @@
#include <thread>
#include <vector>
#if defined(LLAMA_USE_HTTPLIB)
#include "http.h"
#endif
#ifndef __EMSCRIPTEN__
#ifdef __linux__
@@ -142,8 +140,6 @@ std::pair<std::string, std::string> common_download_split_repo_tag(const std::st
return {hf_repo, tag};
}
#if defined(LLAMA_USE_HTTPLIB)
class ProgressBar {
static inline std::mutex mutex;
static inline std::map<const ProgressBar *, int> lines;
@@ -768,30 +764,6 @@ std::string common_docker_resolve_model(const std::string & docker) {
}
}
#else
common_hf_file_res common_get_hf_file(const std::string &, const std::string &, bool, const common_header_list &) {
throw std::runtime_error("download functionality is not enabled in this build");
}
bool common_download_model(const common_params_model &, const std::string &, bool, const common_header_list &) {
throw std::runtime_error("download functionality is not enabled in this build");
}
std::string common_docker_resolve_model(const std::string &) {
throw std::runtime_error("download functionality is not enabled in this build");
}
int common_download_file_single(const std::string &,
const std::string &,
const std::string &,
bool,
const common_header_list &) {
throw std::runtime_error("download functionality is not enabled in this build");
}
#endif // defined(LLAMA_USE_HTTPLIB)
std::vector<common_cached_model_info> common_list_cached_models() {
std::vector<common_cached_model_info> models;
const std::string cache_dir = fs_get_cache_directory();

View File

@@ -242,10 +242,10 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl
|------------|-------------|------|-------|
| FP32 | ✅ | ✅ | ❓ |
| FP16 | ✅ | ✅ | ❓ |
| BF16 | 🚫 | ✅ | ❓ |
| BF16 | | ✅ | ❓ |
| Q4_0 | ✅ | ❓ | ❓ |
| Q4_1 | ✅ | ❓ | ❓ |
| MXFP4 | 🚫 | ❓ | ❓ |
| MXFP4 | | ❓ | ❓ |
| Q5_0 | ✅ | ❓ | ❓ |
| Q5_1 | ✅ | ❓ | ❓ |
| Q8_0 | ✅ | ❓ | ❓ |
@@ -272,4 +272,4 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl
- 🚫 - acceleration unavailable, will still run using scalar implementation
- ❓ - acceleration unknown, please contribute if you can test it yourself
Last Updated by **Aaron Teo (aaron.teo1@ibm.com)** on Sep 7, 2025.
Last Updated by **Aaron Teo (aaron.teo1@ibm.com)** on Feb 15, 2026.

View File

@@ -0,0 +1,190 @@
# llama-eval Codebase Guidelines
## Overview
This directory contains Python evaluation tools for llama.cpp:
- `llama-eval.py` - Main evaluation tool with multiple datasets (AIME, AIME2025, GSM8K, GPQA)
- `llama-server-simulator.py` - Flask-based server simulator for testing
- `test-simulator.sh` - Test script for the simulator
## Build/Run Commands
### Virtual Environment
The project uses a virtual environment located at `venv/`:
```bash
source venv/bin/activate
```
### Running the Main Evaluator
```bash
python llama-eval.py \
--server http://127.0.0.1:8013 \
--model gpt-oss-20b-hf-low \
--dataset aime \
--n_cases 10 \
--grader-type llm \
--seed 42
```
### Running the Simulator (for testing)
```bash
python llama-server-simulator.py --port 8033 --success-rate 0.8
```
### Running Tests
```bash
./test-simulator.sh
```
## Code Style Guidelines
### Imports
- Standard library imports first (argparse, json, os, re, subprocess, sys, time)
- Third-party imports (requests, tqdm, datasets, flask) after standard library
- Relative imports not used
- Group imports by category with blank line between groups
### Formatting
- 4-space indentation
- Max line length: 125 characters (per parent project's .flake8)
- Use double quotes for strings
- Use triple double quotes for docstrings
- Binary operators at the beginning of continued lines
### Naming Conventions
- Classes: PascalCase (e.g., `AimeDataset`, `Grader`, `Processor`)
- Functions: snake_case (e.g., `normalize_number`, `get_prompt`)
- Variables: snake_case (e.g., `question_text`, `correct_count`)
- Constants: UPPER_SNAKE_CASE (e.g., `GRADER_PATTERNS`, `TEMPLATE_REGISTRY`)
- Private methods: prefix with underscore (e.g., `_load_dataset`, `_grade_regex`)
### Types
- Use type hints for all function signatures
- Import from `typing` module: `Dict`, `List`, `Optional`, `Any`, `Tuple`
- Use `@dataclass` for data structures
- Prefer `Optional[T]` over `Union[T, None]`
### Error Handling
- Use try/except for network requests and file operations
- Return `None` or `False` on errors when appropriate
- Use `ValueError` for invalid arguments
- Use `FileNotFoundError` for missing files
- CLI scripts should handle exceptions gracefully
### Dataclasses
- Use `@dataclass` for structured data
- Define fields with explicit types
- Use `Optional[T]` for nullable fields
- Provide default values where appropriate
### String Formatting
- Use f-strings for formatting (Python 3.6+)
- Use triple double quotes for multi-line strings
- Escape backslashes in regex patterns: `r'\\boxed{(\d+)}'`
### File Paths
- Use `pathlib.Path` instead of string paths
- Create directories with `mkdir(parents=True, exist_ok=True)`
- Use `Path.home()` for user home directory
### Logging
- Use `print()` for user-facing output
- Use `sys.stderr` for debug logging
- Simulator writes debug logs to `/tmp/simulator-debug.log`
### Testing
- Test script uses bash with `set -e` for strict error handling
- Simulator runs in background with PID tracking
- Tests verify correct answers, error cases, and edge cases
- Use `curl` for HTTP testing in shell scripts
### Whitespace Cleanup
- Remove trailing whitespace from all lines
- When making edits, do not leave trailing whitespace
## Dataset Support
### AIME Dataset
- 90 questions from 2025 AIME competition
- Answers in `\boxed{answer}` format
- Supports regex, CLI, and LLM grading
### AIME2025 Dataset
- 30 questions from 2025 AIME I & II
- Answers in `\boxed{answer}` format
- Requires loading two config parts
### GSM8K Dataset
- 7473 math word problems
- Answers are numeric values with `####` separator
- Supports regex, CLI, and LLM grading
### GPQA Dataset
- 198 questions from GPQA Diamond
- Multiple choice with shuffled options (A, B, C, D)
- **Requires LLM grader** (returns letter A/B/C/D)
## Grading Types
### Regex Grader
- Built-in patterns per dataset
- Prioritizes `\boxed{}` for AIME datasets
- Extracts last number for GSM8K
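An illustrative sketch (editorial, not part of this file) of the priority described above, assuming the patterns are applied in order:

```python
import re
from typing import Optional

# Editorial sketch of the extraction order described above: prefer \boxed{...}
# for AIME-style answers, otherwise fall back to the last number in the text.
def extract_answer(text: str, dataset: str) -> Optional[str]:
    if dataset in ("aime", "aime2025"):
        boxed = re.findall(r"\\boxed\{(\d+)\}", text)
        if boxed:
            return boxed[-1]
    numbers = re.findall(r"\d+", text)
    return numbers[-1] if numbers else None
```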
### CLI Grader
- External script interface
- Call: `grader.sh --answer <pred> --expected <gold>`
- Exit code 0 = correct, non-zero = incorrect
### LLM Grader
- Uses judge model for answer extraction
- Includes few-shot examples
- Case-insensitive comparison
- Required for GPQA
## Configuration
### Sampling Parameters (Optional)
- `--temperature`: Sampling temperature
- `--top-k`: Top K sampling
- `--top-p`: Top P sampling
- `--min-p`: Min P sampling
- Only passed to API if explicitly specified
### Default Values
- `--n_predict`: -1 (infinite)
- `--grader-type`: llm
- `--seed`: 1234
- `--threads`: 32
- `--output`: llama-eval-state.json
## Output Format
### Progress Table
- Shows task ID, dataset, prompt (truncated to 43 chars), expected answer, status
- Uses `tqdm` for progress bars
### Results Summary
- Format: `Results: X/Y correct (Z%)`
- Displayed after all tasks complete
### JSON Output
- Complete eval state saved to output file
- Contains: task IDs, correctness, prompts, extracted answers, sampling config
- Uses `dataclasses.asdict()` for serialization
## HuggingFace Datasets
- Cache directory: `~/.cache/huggingface/datasets`
- Set via `HF_DATASETS_CACHE` environment variable
- Telemetry disabled via `HF_HUB_DISABLE_TELEMETRY=1`
- Datasets loaded with `datasets.load_dataset()`
## Flask Simulator
- Runs on configurable port (default: 5000)
- Endpoint: `/v1/chat/completions` (OpenAI-compatible)
- Uses Dice coefficient for question matching
- Configurable success rate for testing
- Debug logs to `/tmp/simulator-debug.log`

View File

@@ -0,0 +1,94 @@
# llama-eval Implementation Summary
## Overview
Simple evaluation tool for llama.cpp with support for multiple datasets (AIME, GSM8K, GPQA) and flexible grading (regex, CLI, LLM).
## Key Features
- **Multiple Datasets**: AIME, GSM8K, GPQA with proper answer extraction
- **Flexible Grading**: Regex, CLI, or LLM-based grading
- **Parallel Processing**: Configurable thread count for concurrent requests
- **Sampling Parameters**: Temperature, Top K, Top P, Min P (optional)
- **Real-time Feedback**: Progress tracking with detailed output
- **JSON Output**: Complete eval state saved for debugging
- **GPQA Support**: Answer shuffling with reproducible results
## Architecture
### Eval State
```python
@dataclass
class EvalState:
id: str
tasks: List[str]
task_states: Dict[str, Dict[str, Any]]
sampling_config: Dict[str, Any]
```
### Processor
- Handles processing, grading, and state management
- Thread-safe concurrent execution
- Configurable sampling parameters
### Grader
- Abstract grading interface supporting multiple types
- Regex grader with dataset-specific patterns
- CLI grader with external script interface
- LLM grader with configurable server and model
### Datasets
- `AimeDataset`: 90 AIME 2025 questions
- `Aime2025Dataset`: 30 AIME 2025 I & II questions
- `Gsm8kDataset`: 7473 math word problems
- `GpqaDataset`: 198 GPQA Diamond questions with shuffling
## Configuration
### Sampling Parameters (Optional)
- `--temperature`: Sampling temperature
- `--top-k`: Top K sampling
- `--top-p`: Top P sampling
- `--min-p`: Min P sampling
- Only passed if explicitly specified
### Grading Types
- **regex**: Built-in patterns for each dataset
- **cli**: External script with `--answer` and `--expected` args
- **llm**: LLM-based extraction with few-shot examples and configurable server/model
### Dataset Requirements
- **AIME**: Supports regex, CLI, or LLM grader
- **AIME2025**: Supports regex, CLI, or LLM grader
- **GSM8K**: Supports regex, CLI, or LLM grader
- **GPQA**: Requires LLM grader
## Output Format
### Progress Table
```
Task ID Dataset Prompt (first 43 chars) Expected Status
aime_000_001 AIME Complete the following reactions and sel... A pending
```
### Results Summary
```
============================================================
Results: 8/10 correct (80.0%)
============================================================
```
### JSON Output
Complete eval state with task IDs, correctness, prompts, extracted answers, and sampling configuration.
## Technical Details
- Default max tokens: -1 (infinite)
- Default grader type: llm
- Default seed: 1234
- Default threads: 32
- Prompt truncation: First 43 chars + padding + "..."
- Response truncation: Last 10 lines for grading
- GPQA requires LLM grader (returns letter A/B/C/D)
- Judge model defaults to evaluated model if not specified
- Sample answers defined in SAMPLE_ANSWERS dict for few-shot learning
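To make the LLM-grading flow above concrete, here is a hedged sketch of how a judge request could look; the prompt wording and helper name are assumptions, not the actual llama-eval.py code.

```python
# Hedged sketch of the LLM-grader flow summarized above: ask a judge model to
# extract the final answer, then compare case-insensitively with the expected
# answer. Prompt wording and function name are illustrative assumptions.
import requests

def llm_grade(judge_server, judge_model, response_text, expected):
    prompt = (
        "Extract the final answer from the response below and reply with only that answer.\n\n"
        f"Response:\n{response_text}"
    )
    r = requests.post(
        f"{judge_server}/v1/chat/completions",
        json={"model": judge_model, "messages": [{"role": "user", "content": prompt}]},
        timeout=60,
    )
    extracted = r.json()["choices"][0]["message"]["content"].strip()
    return extracted.lower() == str(expected).strip().lower()
```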

View File

@@ -0,0 +1,112 @@
# llama-eval Evaluation Tool
Simple evaluation tool for llama.cpp with support for multiple datasets.
## Features
- **Multiple Datasets**: AIME, GSM8K, GPQA
- **Flexible Grading**: Regex, CLI, or LLM-based grading
- **Parallel Processing**: Configurable thread count
- **Real-time Feedback**: Progress tracking with detailed output
- **Sampling Parameters**: Temperature, Top K, Top P, Min P
- **JSON Output**: Complete eval state saved for debugging
## Usage
```bash
python llama-eval.py \
--server http://127.0.0.1:8013 \
--model gpt-oss-20b-hf-low \
--judge-model gpt-oss-20b-hf-medium \
--dataset aime \
--n_cases 10 \
--grader-type llm \
--seed 42
```
## CLI Arguments
- `--server`: llama-server URL (default: http://127.0.0.1:8013)
- `--model`: Model name for evaluation (default: llama)
- `--judge-model`: Model name for LLM judge (default: same as main model)
- `--judge-server`: Server URL for LLM judge (default: same as main server)
- `--dataset`: Dataset type (aime, aime2025, gsm8k, gpqa)
- `--n_cases`: Number of cases to evaluate (default: all)
- `--n_predict`: Max tokens to predict per prompt (default: -1, infinite)
- `--temperature`: Sampling temperature (default: not passed)
- `--top-k`: Top K sampling (default: not passed)
- `--top-p`: Top P sampling (default: not passed)
- `--min-p`: Min P sampling (default: not passed)
- `--threads`: Number of threads for parallel requests (default: 32)
- `--verbose`: Show detailed output for each case
- `--output`: Output file for eval state (default: llama-eval-state.json)
- `--grader-type`: Grader type (regex, cli, llm, default: llm)
- `--grader-script`: Path to CLI grader script (required for --grader-type cli)
- `--seed`: Random seed for shuffling (default: 1234)
## Datasets
### AIME
- 90 questions from 2025 AIME competition
- Answers in boxed format: `\boxed{answer}`
- Requires regex grader or LLM grader
### AIME2025
- 30 questions from 2025 AIME I & II competitions
- Answers in boxed format: `\boxed{answer}`
- Supports regex, CLI, or LLM grader
### GSM8K
- 7473 math word problems
- Answers are numeric values
- Requires regex grader or LLM grader
### GPQA
- 198 questions from GPQA Diamond dataset
- Multiple choice with shuffled options
- Requires LLM grader (returns letter A, B, C, or D)
## Grading Types
### Regex Grader
Built-in patterns for different datasets:
- AIME: `\boxed{(\d+)}|\b(\d+)\b`
- AIME2025: `\boxed{(\d+)}|\b(\d+)\b`
- GSM8K: `\b(\d+)\b`
- GPQA: Letter extraction (A, B, C, D)
### CLI Grader
External script interface:
```bash
./grader.sh --answer <pred> --expected <gold>
```
Returns exit code 0 if correct, non-zero if incorrect.
### LLM Grader
Uses LLM to extract and compare answers:
- Configurable server and model
- Includes few-shot examples from sample answers
- Case-insensitive comparison
- Required for GPQA dataset
## Output
### Progress Table
```
Task ID Dataset Prompt (first 43 chars) Expected Status
aime_000_001 AIME Complete the following reactions and sel... A pending
```
### Results
```
============================================================
Results: 8/10 correct (80.0%)
============================================================
```
### JSON Output
Complete eval state saved to output file with:
- Task IDs and correctness status
- Prompts and extracted answers
- Sampling configuration
- Processing metadata

1229
examples/llama-eval/llama-eval.py Executable file

File diff suppressed because it is too large

View File

@@ -0,0 +1,36 @@
# llama-server-simulator
Standalone Python script simulating llama-server HTTP endpoint for testing.
## Features
- HTTP Server with OpenAI-compatible `/v1/chat/completions` endpoint
- AIME Dataset Integration - Loads 90 questions from HuggingFace
- Intelligent Question Matching - Uses exact matching, LaTeX removal, and Levenshtein distance
- Configurable Success Rate - Control correct/wrong answer generation (0-1)
- Debug Logging - Troubleshoot matching issues
## Usage
```bash
python llama-server-simulator.py --success-rate 0.8
```
## Arguments
- `--success-rate`: Probability of returning correct answer (0.0-1.0, default: 0.8)
- `--port`: Server port (default: 8033)
- `--debug`: Enable debug logging (default: False)
## Testing
```bash
./test-simulator.sh
```
## Implementation Details
- Uses Levenshtein distance for partial matching (threshold: 0.3)
- Automatic caching via HuggingFace datasets library
- Wrong answers generated by incrementing expected answer
- Debug output written to stderr

View File

@@ -0,0 +1,283 @@
#!/usr/bin/env python3
import argparse
import json
import random
import re
import time
import sys
import os
from typing import Dict, List, Optional
from dataclasses import dataclass, asdict
from pathlib import Path
import datasets
from flask import Flask, request, jsonify
# Set cache directory for HuggingFace datasets
cache_dir = Path.home() / ".cache" / "huggingface" / "datasets"
cache_dir.mkdir(parents=True, exist_ok=True)
os.environ["HF_DATASETS_CACHE"] = str(cache_dir)
def dice(s1: str, s2: str) -> float:
"""Calculate Dice coefficient between two strings based on bigram overlap."""
if not s1 and not s2:
return 1.0
def _bigrams(s: str):
return [s[i : i + 2] for i in range(len(s) - 1)]
bigrams1 = _bigrams(s1)
bigrams2 = _bigrams(s2)
if not bigrams1 and not bigrams2:
return 1.0
from collections import Counter
freq1 = Counter(bigrams1)
freq2 = Counter(bigrams2)
intersection = sum(min(freq1[bg], freq2[bg]) for bg in freq1)
dice_coeff = 2 * intersection / (len(bigrams1) + len(bigrams2))
return dice_coeff
def debug_log(message: str):
"""Log debug messages to both stdout and a file"""
print(message, file=sys.stderr)
with open("/tmp/simulator-debug.log", "a") as f:
f.write(message + "\n")
app = Flask(__name__)
@dataclass
class EvalState:
id: str
tasks: List[str]
task_states: Dict[str, Dict]
sampling_config: Dict
def normalize_number(s: str) -> Optional[int]:
match = re.match(r"\d+", s) # match digits from the start
if not match:
return None
return int(match.group(0))
class AimeDataset:
def __init__(self, split: str = "train"):
self.split = split
self.questions: List[Dict] = []
self._load_dataset()
def _load_dataset(self):
print(f"Loading AIME dataset (split: {self.split})...")
cache_path = Path.home() / ".cache" / "huggingface" / "datasets" / "AI-MO___aimo-validation-aime" / "default" / "0.0.0"
if cache_path.exists():
print(f"Using cached dataset from {cache_path}")
ds = datasets.load_dataset("AI-MO/aimo-validation-aime", split=self.split, cache_dir=str(cache_path))
else:
ds = datasets.load_dataset("AI-MO/aimo-validation-aime", split=self.split)
self.questions = list(ds)
print(f"AIME dataset loaded: {len(self.questions)} questions")
def find_question(self, request_text: str) -> Optional[Dict]:
best_match = None
best_distance = -1
best_index = -1
for i, question in enumerate(self.questions):
question_text = question["problem"]
request_lower = request_text.lower()
question_lower = question_text.lower()
# Exact match
if question_lower == request_lower:
debug_log(f"DEBUG: Found exact match at index {i}")
return question
# Remove LaTeX formatting for more flexible matching
question_no_latex = re.sub(r'\$[^$]+\$', '', question_text)
if question_no_latex.lower() == request_lower:
debug_log(f"DEBUG: Found match (no LaTeX) at index {i}")
return question
# Calculate Levenshtein distance for partial matches
# Only consider if request is at least 50% of question length
if len(request_lower) >= len(question_lower) * 0.5:
distance = dice(question_lower, request_lower)
if distance > best_distance:
best_distance = distance
best_match = question
best_index = i
if best_match and best_distance > 0.3: # Threshold for partial match
debug_log(f"DEBUG: Found best partial match at index {best_index} with distance {best_distance:.3f}")
return best_match
debug_log(f"DEBUG: No matching question found for: {request_text[:100]}...")
return None
def get_answer(self, question: Dict) -> str:
answer = question["answer"]
if isinstance(answer, str):
normalized = normalize_number(answer)
return str(normalized) if normalized is not None else answer
return str(answer)
class Simulator:
def __init__(
self,
port: int = 8033,
host: str = "localhost",
success_rate: float = 0.8,
dataset_split: str = "train"
):
self.port = port
self.host = host
self.success_rate = success_rate
self.dataset = AimeDataset(dataset_split)
self.eval_state = EvalState(
id="aime-2025",
tasks=["aime"],
task_states={},
sampling_config={"temperature": 0, "max_tokens": 2048}
)
def _generate_response(
self,
question: Dict,
should_be_correct: bool
) -> Dict:
expected_answer = self.dataset.get_answer(question)
if should_be_correct:
response_text = expected_answer
else:
response_text = self._generate_wrong_answer(question)
return {
"id": f"chatcmpl-{int(time.time())}",
"object": "chat.completion",
"created": int(time.time()),
"model": "llama",
"choices": [
{
"index": 0,
"message": {
"role": "assistant",
"content": response_text
},
"finish_reason": "stop"
}
],
"usage": {
"prompt_tokens": 100,
"completion_tokens": 50,
"total_tokens": 150
}
}
def _generate_wrong_answer(self, question: Dict) -> str:
expected_answer = self.dataset.get_answer(question)
if expected_answer.isdigit():
wrong_answer = str(int(expected_answer) + 1)
else:
wrong_answer = expected_answer + " (wrong)"
return wrong_answer
def _process_request(self, request_data: Dict) -> Dict:
messages = request_data.get("messages", [])
if not messages:
return {"error": "No messages in request"}
request_text = messages[0].get("content", "")
debug_log(f"DEBUG: Received request with content: {request_text[:150]}...")
question = self.dataset.find_question(request_text)
if not question:
debug_log(f"DEBUG: find_question returned None")
return {"error": "No matching question found"}
should_be_correct = random.random() < self.success_rate
response = self._generate_response(question, should_be_correct)
task_id = "aime"
self.eval_state.task_states[task_id] = {
"correct": should_be_correct,
"expected": self.dataset.get_answer(question),
"predicted": response["choices"][0]["message"]["content"]
}
return response
@app.route('/v1/chat/completions', methods=['POST'])
def chat_completions():
try:
request_data = request.get_json()
if not request_data:
return jsonify({"error": "Invalid JSON"}), 400
response = simulator._process_request(request_data)
return jsonify(response)
except Exception as e:
print(f"Error processing request: {e}")
return jsonify({"error": str(e)}), 500
def main():
parser = argparse.ArgumentParser(
description="llama-server simulator for testing eval scripts"
)
parser.add_argument(
"--port",
type=int,
default=8033,
help="Server port (default: 8033)"
)
parser.add_argument(
"--host",
type=str,
default="localhost",
help="Server host (default: localhost)"
)
parser.add_argument(
"--success-rate",
type=float,
default=0.8,
help="Success rate 0-1 (default: 0.8)"
)
parser.add_argument(
"--dataset-split",
type=str,
default="train",
help="AIME dataset split to use (default: train)"
)
args = parser.parse_args()
global simulator
simulator = Simulator(
port=args.port,
host=args.host,
success_rate=args.success_rate,
dataset_split=args.dataset_split
)
print("\n=== llama-server-simulator ===")
print(f"Server running on http://{args.host}:{args.port}")
print(f"Success rate: {args.success_rate}")
print(f"AIME dataset loaded: {len(simulator.dataset.questions)} questions")
print("\nPress Ctrl+C to stop\n")
app.run(host=args.host, port=args.port, debug=False)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,86 @@
#!/bin/bash
set -e
# Get the directory where this script is located
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
echo "=== llama-server-simulator Test Script ==="
echo ""
PORT=8033
SUCCESS_RATE=0.8
TEST_PORT=8034
echo "Starting simulator on port $PORT with success rate $SUCCESS_RATE..."
source "$SCRIPT_DIR/venv/bin/activate"
python3 "$SCRIPT_DIR/llama-server-simulator.py" --port $PORT --success-rate $SUCCESS_RATE > /tmp/simulator-test.log 2>&1 &
SIMULATOR_PID=$!
echo "Waiting for simulator to start..."
sleep 5
# Helper function to make a request and extract the answer
make_request() {
local question="$1"
curl -s -X POST http://localhost:$PORT/v1/chat/completions \
-H "Content-Type: application/json" \
-d "{
\"model\": \"llama\",
\"messages\": [
{\"role\": \"user\", \"content\": \"$question\"}
],
\"temperature\": 0,
\"max_tokens\": 2048
}" | python3 -c "import sys, json; data = json.load(sys.stdin); print(data.get('choices', [{}])[0].get('message', {}).get('content', data.get('error', 'No response')))"
}
# Test question (repeated in multiple tests)
TEST_QUESTION="Quadratic polynomials P(x) and Q(x) have leading coefficients 2 and -2, respectively. The graphs of both polynomials pass through the two points (16,54) and (20,53). Find P(0) + Q(0)."
echo ""
echo "=== Test 1: Correct Answer ==="
echo "Sending request with known question..."
answer=$(make_request "$TEST_QUESTION")
echo "Answer: $answer"
echo "Expected: 116"
echo "Correct: $([ "$answer" == "116" ] && echo "Yes" || echo "No")"
echo ""
echo "=== Test 2: Wrong Answer ==="
echo "Sending request with known question (success rate 0.0)..."
answer=$(make_request "$TEST_QUESTION")
echo "Answer: $answer"
echo "Expected: 116"
echo "Correct: $([ "$answer" == "116" ] && echo "Yes" || echo "No")"
echo ""
echo "=== Test 3: No Matching Question ==="
echo "Sending request with non-matching text..."
response=$(make_request "What is the capital of France?")
echo "Response: $response"
echo "Expected: No matching question found"
echo "Correct: $([ "$response" == "No matching question found" ] && echo "Yes" || echo "No")"
echo ""
echo "=== Test 4: Success Rate Verification ==="
echo "Sending 10 requests to test success rate..."
correct_count=0
for i in {1..10}; do
answer=$(make_request "$TEST_QUESTION")
if [ "$answer" == "116" ]; then
correct_count=$((correct_count + 1))
fi
echo " Request $i: Answer = $answer"
done
echo "Correct answers: $correct_count/10"
echo "Expected: ~8/10 (80% success rate)"
echo "Success rate: $(echo "scale=1; $correct_count * 10" | bc)%"
echo ""
echo "=== Test Complete ==="
echo "Stopping simulator..."
kill $SIMULATOR_PID 2>/dev/null
wait $SIMULATOR_PID 2>/dev/null || true
echo "Simulator stopped."

View File

@@ -576,13 +576,13 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
DOWNLOAD_EXTRACT_TIMESTAMP NEW
URL_HASH MD5=${KLEIDIAI_ARCHIVE_MD5})
FetchContent_Populate(KleidiAI_Download)
FetchContent_GetProperties(KleidiAI_Download
SOURCE_DIR KLEIDIAI_SRC
POPULATED KLEIDIAI_POPULATED)
if (NOT KLEIDIAI_POPULATED)
message(FATAL_ERROR "KleidiAI source downloaded failed.")
FetchContent_Populate(KleidiAI_Download)
FetchContent_GetProperties(KleidiAI_Download SOURCE_DIR KLEIDIAI_SRC)
endif()
add_compile_definitions(GGML_USE_CPU_KLEIDIAI)

View File

@@ -2715,14 +2715,14 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
#pragma unroll
for (int l = 0; l < QR2_XXS; ++l) {
const int * grid_pos = (const int *) (iq2xxs_grid + aux8[l]);
const int signs_packed = ksigns_iq2xs[(aux32 >> (7*l)) & 0x7F];
const uint2 grid_pos = ((const uint2*)iq2xxs_grid)[aux8[l]];
const uint32_t signs = unpack_ksigns(aux32 >> (7 * l));
const int signs0 = __vcmpne4(((signs_packed & 0x03) << 7) | ((signs_packed & 0x0C) << 21), 0x00000000);
const int grid0 = __vsub4(grid_pos[0] ^ signs0, signs0);
const int signs0 = __vcmpne4(signs & 0x08040201, 0);
const int grid0 = __vsub4(grid_pos.x ^ signs0, signs0);
const int signs1 = __vcmpne4(((signs_packed & 0x30) << 3) | ((signs_packed & 0xC0) << 17), 0x00000000);
const int grid1 = __vsub4(grid_pos[1] ^ signs1, signs1);
const int signs1 = __vcmpne4(signs & 0x80402010, 0);
const int grid1 = __vsub4(grid_pos.y ^ signs1, signs1);
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + 8*kqsx + (2*l + 0)] = grid0;
@@ -2733,12 +2733,12 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
}
const int ls = aux32 >> 28;
const int ls = aux32 >> 27 | 1; // (scale * 2 + 1)
const float d = bxi->d;
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + kqsx] = (ls*d + d/2)/4;
x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + kqsx] = d * ls / 8; // (d * scale + d / 2) / 4
#else
x_df[i*(MMQ_TILE_NE_K/4) + i/4 + kqsx] = (ls*d + d/2)/4;
x_df[i*(MMQ_TILE_NE_K/4) + i/4 + kqsx] = d * ls / 8; // (d * scale + d / 2) / 4
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
}
}
@@ -2776,11 +2776,14 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
#pragma unroll
for (int l = 0; l < QR2_XS; ++l) {
const uint32_t * grid_pos = (const uint32_t *)(iq2xs_grid + (q2[l] & 0x000001FF));
const uint32_t * signs = (const uint32_t *)(ksigns64 + (q2[l] >> 9));
const uint2 grid_pos = ((const uint2*)iq2xs_grid)[q2[l] & 0x1FF];
const uint32_t signs = unpack_ksigns(q2[l] >> 9);
const int grid_l = __vsub4(grid_pos[0] ^ signs[0], signs[0]);
const int grid_h = __vsub4(grid_pos[1] ^ signs[1], signs[1]);
const int signs0 = __vcmpne4(signs & 0x08040201, 0);
const int grid_l = __vsub4(grid_pos.x ^ signs0, signs0);
const int signs1 = __vcmpne4(signs & 0x80402010, 0);
const int grid_h = __vsub4(grid_pos.y ^ signs1, signs1);
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
x_qs[i*MMQ_MMA_TILE_X_K_Q3_K + 8*kqsx + (2*l + 0)] = grid_l;
@@ -2904,11 +2907,13 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
#pragma unroll
for (int l = 0; l < QR3_XXS; ++l) {
const int2 grid_pos = make_int2(iq3xxs_grid[q3[2*l+0]], iq3xxs_grid[q3[2*l+1]]);
const uint32_t signs = unpack_ksigns(aux32 >> (7*l));
const int * signs = (const int *)(ksigns64 + ((aux32 >> (7*l)) & 0x7F));
const int signs0 = __vcmpne4(signs & 0x08040201, 0);
const int grid_l = __vsub4(grid_pos.x ^ signs0, signs0);
const int grid_l = __vsub4(grid_pos.x ^ signs[0], signs[0]);
const int grid_h = __vsub4(grid_pos.y ^ signs[1], signs[1]);
const int signs1 = __vcmpne4(signs & 0x80402010, 0);
const int grid_h = __vsub4(grid_pos.y ^ signs1, signs1);
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + 8*kqsx + (2*l + 0)] = grid_l;

View File

@@ -94,6 +94,15 @@ static __device__ __forceinline__ int2 get_int_from_table_16(const int & q4, con
#endif
}
static __device__ __forceinline__ uint32_t unpack_ksigns(const uint8_t v) {
// v is a 7 bit int, with the 8th sign being encodable as popcnt
// with xor we can "correct" the bit instead of having to mask
const uint32_t p = __popc(v) & 1;
const uint32_t s = v ^ p << 7;
// broadcast over uint to allow for 0x08040201 / 0x80402010 as selectors
return s * 0x01010101;
}
// VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called
// MMVQ = mul_mat_vec_q, MMQ = mul_mat_q
@@ -905,22 +914,22 @@ static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1(
int sumi = 0;
#pragma unroll
for (int k0 = 0; k0 < 8; k0 += 2) {
const int * grid_pos = (const int *) (iq2xxs_grid + aux8[k0/2]);
const int signs_packed = ksigns_iq2xs[(aux32 >> (7*k0/2)) & 0x7F];
const uint2 grid_pos = ((const uint2*)iq2xxs_grid)[aux8[k0/2]];
const uint32_t signs = unpack_ksigns(aux32 >> (7 * k0 / 2));
const int signs0 = __vcmpne4(((signs_packed & 0x03) << 7) | ((signs_packed & 0x0C) << 21), 0x00000000);
const int grid0 = __vsub4(grid_pos[0] ^ signs0, signs0);
const int signs0 = __vcmpne4(signs & 0x08040201, 0);
const int grid0 = __vsub4(grid_pos.x ^ signs0, signs0);
const int u0 = get_int_b4(bq8_1[iqs/2].qs, k0 + 0);
sumi = ggml_cuda_dp4a(grid0, u0, sumi);
const int signs1 = __vcmpne4(((signs_packed & 0x30) << 3) | ((signs_packed & 0xC0) << 17), 0x00000000);
const int grid1 = __vsub4(grid_pos[1] ^ signs1, signs1);
const int signs1 = __vcmpne4(signs & 0x80402010, 0);
const int grid1 = __vsub4(grid_pos.y ^ signs1, signs1);
const int u1 = get_int_b4(bq8_1[iqs/2].qs, k0 + 1);
sumi = ggml_cuda_dp4a(grid1, u1, sumi);
}
const int ls = aux32 >> 28;
sumi = (ls*sumi + sumi/2)/4;
const int ls = aux32 >> 27 | 1; // (scale * 2 + 1)
sumi = sumi * ls / 8; // (sumi * scale + sumi / 2) / 4
const float d = __half2float(bq2->d) * __low2float(bq8_1[iqs/2].ds);
return d * sumi;
}
@@ -942,13 +951,15 @@ static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
int sumi1 = 0;
#pragma unroll
for (int l0 = 0; l0 < 8; l0 += 2) {
const uint32_t * grid_pos = (const uint32_t *)(iq2xs_grid + (q2[l0/2] & 0x000001FF));
const uint32_t * signs = (const uint32_t *)(ksigns64 + (q2[l0/2] >> 9));
const int grid_l = __vsub4(grid_pos[0] ^ signs[0], signs[0]);
const int grid_h = __vsub4(grid_pos[1] ^ signs[1], signs[1]);
const uint2 grid_pos = ((const uint2*)iq2xs_grid)[q2[l0/2] & 0x1FF];
const uint32_t signs = unpack_ksigns(q2[l0/2] >> 9);
const int signs0 = __vcmpne4(signs & 0x08040201, 0);
const int grid_l = __vsub4(grid_pos.x ^ signs0, signs0);
const int u0 = get_int_b4(bq8_1[iqs/2].qs, l0 + 0);
const int signs1 = __vcmpne4(signs & 0x80402010, 0);
const int grid_h = __vsub4(grid_pos.y ^ signs1, signs1);
const int u1 = get_int_b4(bq8_1[iqs/2].qs, l0 + 1);
if (l0 < 4) {
@@ -1028,13 +1039,16 @@ static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1(
#pragma unroll
for (int l0 = 0; l0 < 8; l0 += 2) {
const int2 grid_pos = make_int2(iq3xxs_grid[q3[l0 + 0]], iq3xxs_grid[q3[l0 + 1]]);
const uint32_t signs = unpack_ksigns(aux32 >> (7*l0/2));
const int * signs = (const int *)(ksigns64 + ((aux32 >> (7*l0/2)) & 0x7F));
const int grid_l = __vsub4(grid_pos.x ^ signs[0], signs[0]);
const int grid_h = __vsub4(grid_pos.y ^ signs[1], signs[1]);
const int signs0 = __vcmpne4(signs & 0x08040201, 0);
const int grid_l = __vsub4(grid_pos.x ^ signs0, signs0);
const int u0 = get_int_b4(bq8_1[iqs/2].qs, l0 + 0);
const int signs1 = __vcmpne4(signs & 0x80402010, 0);
const int grid_h = __vsub4(grid_pos.y ^ signs1, signs1);
const int u1 = get_int_b4(bq8_1[iqs/2].qs, l0 + 1);
sumi = ggml_cuda_dp4a(grid_l, u0, sumi);
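As an editorial aside on the `unpack_ksigns` helper introduced in this file: the sketch below (Python, not CUDA) mirrors the described approach of completing the 7-bit sign group from its parity, broadcasting it, and selecting one sign bit per byte; the `vcmpne4` stand-in is an assumption for illustration.

```python
# Editorial sketch of the sign unpacking described above: the xor "corrects"
# bit 7 from the parity of the lower bits (even if the input carried a stray
# high bit), and the byte is broadcast so the masks 0x08040201 / 0x80402010
# select one sign bit per byte.
def unpack_ksigns(v: int) -> int:
    v &= 0xFF                                  # uint8_t parameter truncation
    p = bin(v).count("1") & 1                  # __popc(v) & 1
    s = v ^ (p << 7)                           # restore the 8th sign via parity
    return (s * 0x01010101) & 0xFFFFFFFF       # broadcast to all four bytes

def vcmpne4(x: int) -> int:                    # stand-in for __vcmpne4: 0xFF per non-zero byte
    return sum(0xFF << (8 * i) for i in range(4) if (x >> (8 * i)) & 0xFF)

signs = unpack_ksigns(0b0101101)
signs0 = vcmpne4(signs & 0x08040201)           # sign bits 0..3, one per output byte
signs1 = vcmpne4(signs & 0x80402010)           # sign bits 4..7, one per output byte
```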

View File

@@ -5,7 +5,7 @@ import os
import sys
import subprocess
HTTPLIB_VERSION = "f80864ca031932351abef49b74097c67f14719c6"
HTTPLIB_VERSION = "d4180e923f846b44a3d30acd938438d6e64fc9f6"
vendor = {
"https://github.com/nlohmann/json/releases/latest/download/json.hpp": "vendor/nlohmann/json.hpp",

View File

@@ -28,10 +28,6 @@ target_link_libraries(${TARGET} PUBLIC common mtmd ${CMAKE_THREAD_LIBS_INIT})
set(TARGET llama-server)
if (NOT LLAMA_HTTPLIB)
message(FATAL_ERROR "LLAMA_HTTPLIB is OFF, cannot build llama-server. Hint: to skip building server, set -DLLAMA_BUILD_SERVER=OFF")
endif()
set(TARGET_SRCS
server.cpp
server-http.cpp

View File

@@ -1264,78 +1264,32 @@ int poll_wrapper(struct pollfd *fds, nfds_t nfds, int timeout) {
#endif
}
template <bool Read>
ssize_t select_impl(socket_t sock, time_t sec, time_t usec) {
#ifdef __APPLE__
if (sock >= FD_SETSIZE) { return -1; }
fd_set fds, *rfds, *wfds;
FD_ZERO(&fds);
FD_SET(sock, &fds);
rfds = (Read ? &fds : nullptr);
wfds = (Read ? nullptr : &fds);
timeval tv;
tv.tv_sec = static_cast<long>(sec);
tv.tv_usec = static_cast<decltype(tv.tv_usec)>(usec);
return handle_EINTR([&]() {
return select(static_cast<int>(sock + 1), rfds, wfds, nullptr, &tv);
});
#else
ssize_t select_impl(socket_t sock, short events, time_t sec,
time_t usec) {
struct pollfd pfd;
pfd.fd = sock;
pfd.events = (Read ? POLLIN : POLLOUT);
pfd.events = events;
pfd.revents = 0;
auto timeout = static_cast<int>(sec * 1000 + usec / 1000);
return handle_EINTR([&]() { return poll_wrapper(&pfd, 1, timeout); });
#endif
}
ssize_t select_read(socket_t sock, time_t sec, time_t usec) {
return select_impl<true>(sock, sec, usec);
return select_impl(sock, POLLIN, sec, usec);
}
ssize_t select_write(socket_t sock, time_t sec, time_t usec) {
return select_impl<false>(sock, sec, usec);
return select_impl(sock, POLLOUT, sec, usec);
}
Error wait_until_socket_is_ready(socket_t sock, time_t sec,
time_t usec) {
#ifdef __APPLE__
if (sock >= FD_SETSIZE) { return Error::Connection; }
fd_set fdsr, fdsw;
FD_ZERO(&fdsr);
FD_ZERO(&fdsw);
FD_SET(sock, &fdsr);
FD_SET(sock, &fdsw);
timeval tv;
tv.tv_sec = static_cast<long>(sec);
tv.tv_usec = static_cast<decltype(tv.tv_usec)>(usec);
auto ret = handle_EINTR([&]() {
return select(static_cast<int>(sock + 1), &fdsr, &fdsw, nullptr, &tv);
});
if (ret == 0) { return Error::ConnectionTimeout; }
if (ret > 0 && (FD_ISSET(sock, &fdsr) || FD_ISSET(sock, &fdsw))) {
auto error = 0;
socklen_t len = sizeof(error);
auto res = getsockopt(sock, SOL_SOCKET, SO_ERROR,
reinterpret_cast<char *>(&error), &len);
auto successful = res >= 0 && !error;
return successful ? Error::Success : Error::Connection;
}
return Error::Connection;
#else
struct pollfd pfd_read;
pfd_read.fd = sock;
pfd_read.events = POLLIN | POLLOUT;
pfd_read.revents = 0;
auto timeout = static_cast<int>(sec * 1000 + usec / 1000);
@@ -1354,7 +1308,6 @@ Error wait_until_socket_is_ready(socket_t sock, time_t sec,
}
return Error::Connection;
#endif
}
bool is_socket_alive(socket_t sock) {
@@ -7138,17 +7091,6 @@ Server::process_request(Stream &strm, const std::string &remote_addr,
res.version = "HTTP/1.1";
res.headers = default_headers_;
#ifdef __APPLE__
// Socket file descriptor exceeded FD_SETSIZE...
if (strm.socket() >= FD_SETSIZE) {
Headers dummy;
detail::read_headers(strm, dummy);
res.status = StatusCode::InternalServerError_500;
output_error_log(Error::ExceedMaxSocketDescriptorCount, &req);
return write_response(strm, close_connection, req, res);
}
#endif
// Request line and headers
if (!parse_request_line(line_reader.ptr(), req)) {
res.status = StatusCode::BadRequest_400;
@@ -12063,7 +12005,7 @@ bool get_cert_sans(cert_t cert, std::vector<SanEntry> &sans) {
if (!names) return true; // No SANs is valid
auto count = sk_GENERAL_NAME_num(names);
for (int i = 0; i < count; i++) {
for (decltype(count) i = 0; i < count; i++) {
auto gen = sk_GENERAL_NAME_value(names, i);
if (!gen) continue;

View File

@@ -8,8 +8,8 @@
#ifndef CPPHTTPLIB_HTTPLIB_H
#define CPPHTTPLIB_HTTPLIB_H
#define CPPHTTPLIB_VERSION "0.31.0"
#define CPPHTTPLIB_VERSION_NUM "0x001F00"
#define CPPHTTPLIB_VERSION "0.32.0"
#define CPPHTTPLIB_VERSION_NUM "0x002000"
/*
* Platform compatibility check