Compare commits

..

53 Commits

Author SHA1 Message Date
Georgi Gerganov
c0c3e428dd refactor 2026-02-16 23:02:45 +02:00
Georgi Gerganov
7f049860b4 reasoning and error handling 2026-02-16 22:16:15 +02:00
Georgi Gerganov
2ffa45edfc add tokens 2026-02-16 21:52:54 +02:00
Georgi Gerganov
9c29be1177 store full response 2026-02-16 21:44:29 +02:00
Georgi Gerganov
013963cfd5 add html 2026-02-16 21:22:06 +02:00
Georgi Gerganov
e2e998a2d6 fix prompts 2026-02-16 21:02:25 +02:00
Georgi Gerganov
6c41664b8b simplify 2026-02-16 19:50:27 +02:00
Georgi Gerganov
7b84af8051 fix counts 2026-02-16 16:38:31 +02:00
Georgi Gerganov
60a501e138 cleanup 2026-02-16 16:31:14 +02:00
Georgi Gerganov
e6e777cfb3 resume eval 2026-02-16 16:21:36 +02:00
Georgi Gerganov
ad3a54eb68 ignore errors 2026-02-16 15:23:23 +02:00
Georgi Gerganov
c6d70b9bea add AGENTS.md 2026-02-16 13:13:35 +02:00
Georgi Gerganov
de956a6ca8 cleanup 2026-02-16 12:02:16 +02:00
Georgi Gerganov
350e7c1409 datasets : fix aime2025 2026-02-16 11:55:57 +02:00
Georgi Gerganov
db10dda1f3 grade : improve regex + logs 2026-02-16 11:51:36 +02:00
Georgi Gerganov
52759bf078 grader : update prompt 2026-02-16 11:17:53 +02:00
Georgi Gerganov
99e3c3d02c datasets : add aime2025 2026-02-16 11:07:54 +02:00
Georgi Gerganov
c6315655b7 cont 2026-02-16 10:56:58 +02:00
Georgi Gerganov
f762a71d56 grader : improve example answers 2026-02-16 10:51:41 +02:00
Georgi Gerganov
73e61d5b75 rename 2026-02-16 10:30:10 +02:00
Georgi Gerganov
cffd268bb3 add gpqa + sampling + docs 2026-02-16 00:52:33 +02:00
Georgi Gerganov
e8a807519a datasets : add gsm8k 2026-02-15 23:19:46 +02:00
Georgi Gerganov
1db8428f00 remove old files 2026-02-15 22:16:54 +02:00
Georgi Gerganov
7751ae2796 docs 2026-02-15 22:15:50 +02:00
Georgi Gerganov
d2b10302ce improve grader 2026-02-15 22:12:02 +02:00
Georgi Gerganov
68dde884d6 minor 2026-02-15 21:21:40 +02:00
Georgi Gerganov
fd90796da2 eval : support multiple dataset runs 2026-02-15 21:08:24 +02:00
Georgi Gerganov
8156d549f6 sim : fix answer matching 2026-02-15 21:08:24 +02:00
Georgi Gerganov
9695e6feb4 test : fix path 2026-02-15 21:08:24 +02:00
Georgi Gerganov
fb1481d60d eval : add prompts 2026-02-15 21:08:24 +02:00
Georgi Gerganov
812ae13ec1 eval : print progress 2026-02-15 21:08:24 +02:00
Georgi Gerganov
e79e8d02d5 examples: add task summary table to llama-eval-new.py 2026-02-15 21:08:23 +02:00
Georgi Gerganov
a939f4c47e docs: update llama-eval-discussion.md with threading and model parameter updates
- Add threading support implementation details
- Document ThreadPoolExecutor usage and thread safety
- Add model parameter implementation details
- Include testing results for both features
2026-02-15 21:08:23 +02:00
Georgi Gerganov
62b04cef54 examples: add threading support and model parameter to llama-eval-new.py
- Add ThreadPoolExecutor for parallel request processing controlled by --threads
- Add --model argument to specify model name in request data
- Refactor process() to use thread-safe _process_single_case() method
- Update progress tracking to work with concurrent execution
2026-02-15 21:08:23 +02:00
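For reference, a minimal sketch of the ThreadPoolExecutor fan-out described in this commit, assuming a per-case callable like the `_process_single_case()` mentioned above; this is illustrative only, not the actual `llama-eval-new.py` code.

```python
# Illustrative sketch only -- not the actual llama-eval-new.py implementation.
# Fan out one request per case across a thread pool sized by --threads.
from concurrent.futures import ThreadPoolExecutor, as_completed

def run_cases(cases, process_single_case, threads=32):
    results = {}
    with ThreadPoolExecutor(max_workers=threads) as pool:
        futures = {pool.submit(process_single_case, case): case for case in cases}
        for future in as_completed(futures):
            case = futures[future]
            results[case["id"]] = future.result()  # assumes each case dict carries an "id"
    return results
```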
Georgi Gerganov
37b26cafee docs: update llama-eval-discussion.md with session work summary 2026-02-15 21:08:23 +02:00
Georgi Gerganov
04f6872116 examples: use cached dataset path in simulator to avoid HF Hub requests 2026-02-15 21:08:23 +02:00
Georgi Gerganov
c2619c18bf examples: use cached dataset path to avoid HF Hub requests 2026-02-15 21:08:23 +02:00
Georgi Gerganov
87f8930968 examples: remove HF_HUB_OFFLINE to allow dataset download 2026-02-15 21:08:23 +02:00
Georgi Gerganov
9453f9de12 examples: use HF_HUB_OFFLINE to avoid HF Hub warnings 2026-02-15 21:08:23 +02:00
Georgi Gerganov
5a1be6ce37 examples: implement flexible grader system for answer validation
- Add Grader class supporting regex and CLI-based grading
- Implement built-in regex patterns for AIME, GSM8K, MMLU, HellaSwag, ARC, WinoGrande
- Add CLI grader interface: python script.py --answer <pred> --expected <gold>
- Add HF telemetry disable to avoid warnings
- Support exact match requirement for regex patterns
- Add 30-second timeout for CLI grader
- Handle both boxed and plain text formats for AIME answers
2026-02-15 21:08:23 +02:00
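As a reference for the CLI grader interface named above (`python script.py --answer <pred> --expected <gold>`), here is a minimal sketch of such a grader script; it is a hypothetical example, not a file added by this change.

```python
#!/usr/bin/env python3
# Hypothetical CLI grader matching the interface described in the commit above:
# exit code 0 means the prediction is correct, non-zero means incorrect.
import argparse
import re
import sys

def last_number(text):
    numbers = re.findall(r"-?\d+(?:\.\d+)?", text)
    return numbers[-1] if numbers else None

parser = argparse.ArgumentParser()
parser.add_argument("--answer", required=True)
parser.add_argument("--expected", required=True)
args = parser.parse_args()

sys.exit(0 if last_number(args.answer) == last_number(args.expected) else 1)
```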
Georgi Gerganov
a80814e97b docs: remove README.md from llama-eval 2026-02-15 21:08:23 +02:00
Georgi Gerganov
5cc2258e82 examples: add simplified llama-eval-new.py for AIME evaluation
- Create new simplified evaluation script focused only on AIME
- Implement EvalState and Processor dataclasses for structured state management
- Add real-time feedback showing correct/incorrect status per case
- Abstract grading interface for external grader support
- Use structured JSON output for eval state
- Apply HuggingFace dataset caching to avoid repeated downloads
- Remove Levenshtein matching - eval script only sends requests and validates answers
2026-02-15 21:08:22 +02:00
Georgi Gerganov
c87af1d527 docs: update llama-eval-discussion.md with session work summary
Add summary of llama-server-simulator implementation work including
features, testing results, technical decisions, and refactoring.
2026-02-15 21:08:22 +02:00
Georgi Gerganov
23d4e21a81 examples: refactor test-simulator.sh for better readability
Extract repeating question string into TEST_QUESTION variable and
create make_request() helper function to reduce code duplication.
Add proper error handling for error responses.
2026-02-15 21:08:22 +02:00
Georgi Gerganov
07d5e1e0ea examples: add llama-server simulator for testing eval scripts
Add a standalone Python script that simulates a llama-server HTTP endpoint
for testing the eval script. The simulator:

- Implements /v1/chat/completions endpoint with OpenAI-compatible format
- Loads AIME dataset from HuggingFace with local caching
- Uses Levenshtein distance for intelligent question matching
- Supports configurable success rate for correct/wrong answer generation
- Provides debug logging for troubleshooting

Also includes test scripts and documentation for testing and understanding
the simulator functionality.
2026-02-15 21:08:22 +02:00
gatbontonpc
8839037528 add checkpointing 2026-02-15 21:08:22 +02:00
gatbontonpc
89cab3dbc5 Add readme 2026-02-15 21:08:22 +02:00
gatbontonpc
c2d83ca048 multi source llama-eval 2026-02-15 21:08:22 +02:00
gatbontonpc
c05df17ce3 working llama-eval mc and math suite 2026-02-15 21:08:19 +02:00
David Friehs
27b93cbd15 cuda: optimize iq2xxs/iq2xs/iq3xxs dequantization (#19624)
* cuda: optimize iq2xxs/iq2xs/iq3xxs dequantization

- load all 8 int8 for a grid position in one load
- calculate signs via popcnt instead of fetching from ksigns table
- broadcast signs to drop individual shift/mask

* cuda: iq2xxs: simplify sum scaling

express `(sum * scale + sum / 2) / 4` as `(sum * (scale * 2 + 1)) / 8`
express `((aux32 >> 28) * 2 + 1)` as `(aux32 >> 27 | 1)`

saves 3 registers for mul_mat_vec_q (152 -> 149) according to nsight
AFAICT no overflow can occur here as iq2xxs values are far too small

* uint -> uint32_t

error: identifier "uint" is undefined
2026-02-15 22:38:42 +05:30
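The two rewrites above can be sanity-checked numerically; the snippet below is an editorial check under C-style truncating division, not part of the diff.

```python
# Editorial sanity check of the two equivalences stated in the commit message,
# using C-style (truncate-toward-zero) integer division.
def c_div(a, b):
    q = abs(a) // abs(b)
    return q if (a >= 0) == (b >= 0) else -q

# (sum * scale + sum / 2) / 4  ==  (sum * (scale * 2 + 1)) / 8
for scale in range(16):                    # 4-bit scale
    for s in range(-4096, 4097):           # representative partial sums
        assert c_div(s * scale + c_div(s, 2), 4) == c_div(s * (scale * 2 + 1), 8)

# ((aux32 >> 28) * 2 + 1)  ==  (aux32 >> 27 | 1)
for aux32 in range(0, 1 << 32, 1 << 24):   # only the high bits matter here
    assert ((aux32 >> 28) * 2 + 1) == (aux32 >> 27 | 1)
```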
Aaron Teo
6e67fd2144 docs: update s390x build docs (#19643) 2026-02-16 00:33:34 +08:00
Adrien Gallouët
9e118b97c4 build : remove LLAMA_HTTPLIB option (#19623)
This option was introduced as a workaround because cpp-httplib could not
build on visionOS. Since it has been fixed and now compiles on all platforms,
we can remove it and simplify many things.

Signed-off-by: Adrien Gallouët <angt@huggingface.co>
2026-02-15 15:38:50 +01:00
Daniel Bevenius
57088276d4 cmake : check if KleidiAI API has been fetched (#19640)
This commit addresses a build issue with the KleidiAI backend when
building multiple cpu backends. Commit
3a00c98584 ("cmake : fix KleidiAI install
target failure with EXCLUDE_FROM_ALL") introduced a change where
FetchContent_Populate is called instead of FetchContent_MakeAvailable,
where the latter does handle this case (it is idempotent but
FetchContent_Populate is not).

I missed this during my review and I should not have committed without
verifying the CI failure, sorry about that.
2026-02-15 13:59:38 +01:00
19 changed files with 2104 additions and 154 deletions

View File

@@ -112,7 +112,6 @@ option(LLAMA_TOOLS_INSTALL "llama: install tools" ${LLAMA_TOOLS_INSTALL_
option(LLAMA_TESTS_INSTALL "llama: install tests" ON)
# 3rd party libs
option(LLAMA_HTTPLIB "llama: httplib for downloading functionality" ON)
option(LLAMA_OPENSSL "llama: use openssl to support HTTPS" ON)
option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured output in common utils" OFF)
@@ -197,9 +196,7 @@ add_subdirectory(src)
if (LLAMA_BUILD_COMMON)
add_subdirectory(common)
if (LLAMA_HTTPLIB)
add_subdirectory(vendor/cpp-httplib)
endif()
add_subdirectory(vendor/cpp-httplib)
endif()
if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)

View File

@@ -449,10 +449,9 @@ cmake -B build-visionos -G Xcode \
-DCMAKE_SYSTEM_NAME=visionOS \
-DCMAKE_OSX_SYSROOT=xros \
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xros \
-DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \
-DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \
-DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
-DLLAMA_OPENSSL=OFF \
-DLLAMA_HTTPLIB=OFF \
-DLLAMA_BUILD_SERVER=OFF \
-S .
cmake --build build-visionos --config Release -- -quiet
@@ -465,10 +464,9 @@ cmake -B build-visionos-sim -G Xcode \
-DCMAKE_SYSTEM_NAME=visionOS \
-DCMAKE_OSX_SYSROOT=xrsimulator \
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xrsimulator \
-DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \
-DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \
-DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
-DLLAMA_OPENSSL=OFF \
-DLLAMA_HTTPLIB=OFF \
-DLLAMA_BUILD_SERVER=OFF \
-S .
cmake --build build-visionos-sim --config Release -- -quiet

View File

@@ -112,11 +112,7 @@ endif()
# TODO: use list(APPEND LLAMA_COMMON_EXTRA_LIBS ...)
set(LLAMA_COMMON_EXTRA_LIBS build_info)
if (LLAMA_HTTPLIB)
target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_HTTPLIB)
set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} cpp-httplib)
endif()
set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} cpp-httplib)
if (LLAMA_LLGUIDANCE)
include(ExternalProject)

View File

@@ -19,9 +19,7 @@
#include <thread>
#include <vector>
#if defined(LLAMA_USE_HTTPLIB)
#include "http.h"
#endif
#ifndef __EMSCRIPTEN__
#ifdef __linux__
@@ -142,8 +140,6 @@ std::pair<std::string, std::string> common_download_split_repo_tag(const std::st
return {hf_repo, tag};
}
#if defined(LLAMA_USE_HTTPLIB)
class ProgressBar {
static inline std::mutex mutex;
static inline std::map<const ProgressBar *, int> lines;
@@ -768,30 +764,6 @@ std::string common_docker_resolve_model(const std::string & docker) {
}
}
#else
common_hf_file_res common_get_hf_file(const std::string &, const std::string &, bool, const common_header_list &) {
throw std::runtime_error("download functionality is not enabled in this build");
}
bool common_download_model(const common_params_model &, const std::string &, bool, const common_header_list &) {
throw std::runtime_error("download functionality is not enabled in this build");
}
std::string common_docker_resolve_model(const std::string &) {
throw std::runtime_error("download functionality is not enabled in this build");
}
int common_download_file_single(const std::string &,
const std::string &,
const std::string &,
bool,
const common_header_list &) {
throw std::runtime_error("download functionality is not enabled in this build");
}
#endif // defined(LLAMA_USE_HTTPLIB)
std::vector<common_cached_model_info> common_list_cached_models() {
std::vector<common_cached_model_info> models;
const std::string cache_dir = fs_get_cache_directory();

View File

@@ -242,10 +242,10 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl
|------------|-------------|------|-------|
| FP32 | ✅ | ✅ | ❓ |
| FP16 | ✅ | ✅ | ❓ |
| BF16 | 🚫 | ✅ | ❓ |
| BF16 | | ✅ | ❓ |
| Q4_0 | ✅ | ❓ | ❓ |
| Q4_1 | ✅ | ❓ | ❓ |
| MXFP4 | 🚫 | ❓ | ❓ |
| MXFP4 | | ❓ | ❓ |
| Q5_0 | ✅ | ❓ | ❓ |
| Q5_1 | ✅ | ❓ | ❓ |
| Q8_0 | ✅ | ❓ | ❓ |
@@ -272,4 +272,4 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl
- 🚫 - acceleration unavailable, will still run using scalar implementation
- ❓ - acceleration unknown, please contribute if you can test it yourself
Last Updated by **Aaron Teo (aaron.teo1@ibm.com)** on Sep 7, 2025.
Last Updated by **Aaron Teo (aaron.teo1@ibm.com)** on Feb 15, 2026.

View File

@@ -0,0 +1,190 @@
# llama-eval Codebase Guidelines
## Overview
This directory contains Python evaluation tools for llama.cpp:
- `llama-eval.py` - Main evaluation tool with multiple datasets (AIME, AIME2025, GSM8K, GPQA)
- `llama-server-simulator.py` - Flask-based server simulator for testing
- `test-simulator.sh` - Test script for the simulator
## Build/Run Commands
### Virtual Environment
The project uses a virtual environment located at `venv/`:
```bash
source venv/bin/activate
```
### Running the Main Evaluator
```bash
python llama-eval.py \
--server http://127.0.0.1:8013 \
--model gpt-oss-20b-hf-low \
--dataset aime \
--n_cases 10 \
--grader-type llm \
--seed 42
```
### Running the Simulator (for testing)
```bash
python llama-server-simulator.py --port 8033 --success-rate 0.8
```
### Running Tests
```bash
./test-simulator.sh
```
## Code Style Guidelines
### Imports
- Standard library imports first (argparse, json, os, re, subprocess, sys, time)
- Third-party imports (requests, tqdm, datasets, flask) after standard library
- Relative imports not used
- Group imports by category with blank line between groups
### Formatting
- 4-space indentation
- Max line length: 125 characters (per parent project's .flake8)
- Use double quotes for strings
- Use triple double quotes for docstrings
- Binary operators at the beginning of continued lines
### Naming Conventions
- Classes: PascalCase (e.g., `AimeDataset`, `Grader`, `Processor`)
- Functions: snake_case (e.g., `normalize_number`, `get_prompt`)
- Variables: snake_case (e.g., `question_text`, `correct_count`)
- Constants: UPPER_SNAKE_CASE (e.g., `GRADER_PATTERNS`, `TEMPLATE_REGISTRY`)
- Private methods: prefix with underscore (e.g., `_load_dataset`, `_grade_regex`)
### Types
- Use type hints for all function signatures
- Import from `typing` module: `Dict`, `List`, `Optional`, `Any`, `Tuple`
- Use `@dataclass` for data structures
- Prefer `Optional[T]` over `Union[T, None]`
### Error Handling
- Use try/except for network requests and file operations
- Return `None` or `False` on errors when appropriate
- Use `ValueError` for invalid arguments
- Use `FileNotFoundError` for missing files
- CLI scripts should handle exceptions gracefully
### Dataclasses
- Use `@dataclass` for structured data
- Define fields with explicit types
- Use `Optional[T]` for nullable fields
- Provide default values where appropriate
### String Formatting
- Use f-strings for formatting (Python 3.6+)
- Use triple double quotes for multi-line strings
- Escape backslashes in regex patterns: `r'\\boxed{(\d+)}'`
### File Paths
- Use `pathlib.Path` instead of string paths
- Create directories with `mkdir(parents=True, exist_ok=True)`
- Use `Path.home()` for user home directory
### Logging
- Use `print()` for user-facing output
- Use `sys.stderr` for debug logging
- Simulator writes debug logs to `/tmp/simulator-debug.log`
### Testing
- Test script uses bash with `set -e` for strict error handling
- Simulator runs in background with PID tracking
- Tests verify correct answers, error cases, and edge cases
- Use `curl` for HTTP testing in shell scripts
### Whitespace Cleanup
- Remove trailing whitespace from all lines
- When making edits, do not leave trailing whitespace
## Dataset Support
### AIME Dataset
- 90 questions from 2025 AIME competition
- Answers in `\boxed{answer}` format
- Supports regex, CLI, and LLM grading
### AIME2025 Dataset
- 30 questions from 2025 AIME I & II
- Answers in `\boxed{answer}` format
- Requires loading two config parts
### GSM8K Dataset
- 7473 math word problems
- Answers are numeric values with `####` separator
- Supports regex, CLI, and LLM grading
### GPQA Dataset
- 198 questions from GPQA Diamond
- Multiple choice with shuffled options (A, B, C, D)
- **Requires LLM grader** (returns letter A/B/C/D)
## Grading Types
### Regex Grader
- Built-in patterns per dataset
- Prioritizes `\boxed{}` for AIME datasets
- Extracts last number for GSM8K
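An illustrative sketch (editorial, not part of this file) of the priority described above, assuming the patterns are applied in order:

```python
import re
from typing import Optional

# Editorial sketch of the extraction order described above: prefer \boxed{...}
# for AIME-style answers, otherwise fall back to the last number in the text.
def extract_answer(text: str, dataset: str) -> Optional[str]:
    if dataset in ("aime", "aime2025"):
        boxed = re.findall(r"\\boxed\{(\d+)\}", text)
        if boxed:
            return boxed[-1]
    numbers = re.findall(r"\d+", text)
    return numbers[-1] if numbers else None
```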
### CLI Grader
- External script interface
- Call: `grader.sh --answer <pred> --expected <gold>`
- Exit code 0 = correct, non-zero = incorrect
### LLM Grader
- Uses judge model for answer extraction
- Includes few-shot examples
- Case-insensitive comparison
- Required for GPQA
## Configuration
### Sampling Parameters (Optional)
- `--temperature`: Sampling temperature
- `--top-k`: Top K sampling
- `--top-p`: Top P sampling
- `--min-p`: Min P sampling
- Only passed to API if explicitly specified
### Default Values
- `--n_predict`: -1 (infinite)
- `--grader-type`: llm
- `--seed`: 1234
- `--threads`: 32
- `--output`: llama-eval-state.json
## Output Format
### Progress Table
- Shows task ID, dataset, prompt (truncated to 43 chars), expected answer, status
- Uses `tqdm` for progress bars
### Results Summary
- Format: `Results: X/Y correct (Z%)`
- Displayed after all tasks complete
### JSON Output
- Complete eval state saved to output file
- Contains: task IDs, correctness, prompts, extracted answers, sampling config
- Uses `dataclasses.asdict()` for serialization
## HuggingFace Datasets
- Cache directory: `~/.cache/huggingface/datasets`
- Set via `HF_DATASETS_CACHE` environment variable
- Telemetry disabled via `HF_HUB_DISABLE_TELEMETRY=1`
- Datasets loaded with `datasets.load_dataset()`
## Flask Simulator
- Runs on configurable port (default: 5000)
- Endpoint: `/v1/chat/completions` (OpenAI-compatible)
- Uses Dice coefficient for question matching
- Configurable success rate for testing
- Debug logs to `/tmp/simulator-debug.log`

View File

@@ -0,0 +1,94 @@
# llama-eval Implementation Summary
## Overview
Simple evaluation tool for llama.cpp with support for multiple datasets (AIME, GSM8K, GPQA) and flexible grading (regex, CLI, LLM).
## Key Features
- **Multiple Datasets**: AIME, GSM8K, GPQA with proper answer extraction
- **Flexible Grading**: Regex, CLI, or LLM-based grading
- **Parallel Processing**: Configurable thread count for concurrent requests
- **Sampling Parameters**: Temperature, Top K, Top P, Min P (optional)
- **Real-time Feedback**: Progress tracking with detailed output
- **JSON Output**: Complete eval state saved for debugging
- **GPQA Support**: Answer shuffling with reproducible results
## Architecture
### Eval State
```python
@dataclass
class EvalState:
id: str
tasks: List[str]
task_states: Dict[str, Dict[str, Any]]
sampling_config: Dict[str, Any]
```
### Processor
- Handles processing, grading, and state management
- Thread-safe concurrent execution
- Configurable sampling parameters
### Grader
- Abstract grading interface supporting multiple types
- Regex grader with dataset-specific patterns
- CLI grader with external script interface
- LLM grader with configurable server and model
### Datasets
- `AimeDataset`: 90 AIME 2025 questions
- `Aime2025Dataset`: 30 AIME 2025 I & II questions
- `Gsm8kDataset`: 7473 math word problems
- `GpqaDataset`: 198 GPQA Diamond questions with shuffling
## Configuration
### Sampling Parameters (Optional)
- `--temperature`: Sampling temperature
- `--top-k`: Top K sampling
- `--top-p`: Top P sampling
- `--min-p`: Min P sampling
- Only passed if explicitly specified
### Grading Types
- **regex**: Built-in patterns for each dataset
- **cli**: External script with `--answer` and `--expected` args
- **llm**: LLM-based extraction with few-shot examples and configurable server/model
### Dataset Requirements
- **AIME**: Supports regex, CLI, or LLM grader
- **AIME2025**: Supports regex, CLI, or LLM grader
- **GSM8K**: Supports regex, CLI, or LLM grader
- **GPQA**: Requires LLM grader
## Output Format
### Progress Table
```
Task ID Dataset Prompt (first 43 chars) Expected Status
aime_000_001 AIME Complete the following reactions and sel... A pending
```
### Results Summary
```
============================================================
Results: 8/10 correct (80.0%)
============================================================
```
### JSON Output
Complete eval state with task IDs, correctness, prompts, extracted answers, and sampling configuration.
## Technical Details
- Default max tokens: -1 (infinite)
- Default grader type: llm
- Default seed: 1234
- Default threads: 32
- Prompt truncation: First 43 chars + padding + "..."
- Response truncation: Last 10 lines for grading
- GPQA requires LLM grader (returns letter A/B/C/D)
- Judge model defaults to evaluated model if not specified
- Sample answers defined in SAMPLE_ANSWERS dict for few-shot learning
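To make the LLM-grading flow above concrete, here is a hedged sketch of how a judge request could look; the prompt wording and helper name are assumptions, not the actual llama-eval.py code.

```python
# Hedged sketch of the LLM-grader flow summarized above: ask a judge model to
# extract the final answer, then compare case-insensitively with the expected
# answer. Prompt wording and function name are illustrative assumptions.
import requests

def llm_grade(judge_server, judge_model, response_text, expected):
    prompt = (
        "Extract the final answer from the response below and reply with only that answer.\n\n"
        f"Response:\n{response_text}"
    )
    r = requests.post(
        f"{judge_server}/v1/chat/completions",
        json={"model": judge_model, "messages": [{"role": "user", "content": prompt}]},
        timeout=60,
    )
    extracted = r.json()["choices"][0]["message"]["content"].strip()
    return extracted.lower() == str(expected).strip().lower()
```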

View File

@@ -0,0 +1,112 @@
# llama-eval Evaluation Tool
Simple evaluation tool for llama.cpp with support for multiple datasets.
## Features
- **Multiple Datasets**: AIME, GSM8K, GPQA
- **Flexible Grading**: Regex, CLI, or LLM-based grading
- **Parallel Processing**: Configurable thread count
- **Real-time Feedback**: Progress tracking with detailed output
- **Sampling Parameters**: Temperature, Top K, Top P, Min P
- **JSON Output**: Complete eval state saved for debugging
## Usage
```bash
python llama-eval.py \
--server http://127.0.0.1:8013 \
--model gpt-oss-20b-hf-low \
--judge-model gpt-oss-20b-hf-medium \
--dataset aime \
--n_cases 10 \
--grader-type llm \
--seed 42
```
## CLI Arguments
- `--server`: llama-server URL (default: http://127.0.0.1:8013)
- `--model`: Model name for evaluation (default: llama)
- `--judge-model`: Model name for LLM judge (default: same as main model)
- `--judge-server`: Server URL for LLM judge (default: same as main server)
- `--dataset`: Dataset type (aime, aime2025, gsm8k, gpqa)
- `--n_cases`: Number of cases to evaluate (default: all)
- `--n_predict`: Max tokens to predict per prompt (default: -1, infinite)
- `--temperature`: Sampling temperature (default: not passed)
- `--top-k`: Top K sampling (default: not passed)
- `--top-p`: Top P sampling (default: not passed)
- `--min-p`: Min P sampling (default: not passed)
- `--threads`: Number of threads for parallel requests (default: 32)
- `--verbose`: Show detailed output for each case
- `--output`: Output file for eval state (default: llama-eval-state.json)
- `--grader-type`: Grader type (regex, cli, llm, default: llm)
- `--grader-script`: Path to CLI grader script (required for --grader-type cli)
- `--seed`: Random seed for shuffling (default: 1234)
## Datasets
### AIME
- 90 questions from 2025 AIME competition
- Answers in boxed format: `\boxed{answer}`
- Requires regex grader or LLM grader
### AIME2025
- 30 questions from 2025 AIME I & II competitions
- Answers in boxed format: `\boxed{answer}`
- Supports regex, CLI, or LLM grader
### GSM8K
- 7473 math word problems
- Answers are numeric values
- Requires regex grader or LLM grader
### GPQA
- 198 questions from GPQA Diamond dataset
- Multiple choice with shuffled options
- Requires LLM grader (returns letter A, B, C, or D)
## Grading Types
### Regex Grader
Built-in patterns for different datasets:
- AIME: `\boxed{(\d+)}|\b(\d+)\b`
- AIME2025: `\boxed{(\d+)}|\b(\d+)\b`
- GSM8K: `\b(\d+)\b`
- GPQA: Letter extraction (A, B, C, D)
### CLI Grader
External script interface:
```bash
./grader.sh --answer <pred> --expected <gold>
```
Returns exit code 0 if correct, non-zero if incorrect.
### LLM Grader
Uses LLM to extract and compare answers:
- Configurable server and model
- Includes few-shot examples from sample answers
- Case-insensitive comparison
- Required for GPQA dataset
## Output
### Progress Table
```
Task ID Dataset Prompt (first 43 chars) Expected Status
aime_000_001 AIME Complete the following reactions and sel... A pending
```
### Results
```
============================================================
Results: 8/10 correct (80.0%)
============================================================
```
### JSON Output
Complete eval state saved to output file with:
- Task IDs and correctness status
- Prompts and extracted answers
- Sampling configuration
- Processing metadata

1229
examples/llama-eval/llama-eval.py Executable file

File diff suppressed because it is too large

View File

@@ -0,0 +1,36 @@
# llama-server-simulator
Standalone Python script simulating llama-server HTTP endpoint for testing.
## Features
- HTTP Server with OpenAI-compatible `/v1/chat/completions` endpoint
- AIME Dataset Integration - Loads 90 questions from HuggingFace
- Intelligent Question Matching - Uses exact matching, LaTeX removal, and Levenshtein distance
- Configurable Success Rate - Control correct/wrong answer generation (0-1)
- Debug Logging - Troubleshoot matching issues
## Usage
```bash
python llama-server-simulator.py --success-rate 0.8
```
## Arguments
- `--success-rate`: Probability of returning correct answer (0.0-1.0, default: 0.8)
- `--port`: Server port (default: 8033)
- `--debug`: Enable debug logging (default: False)
## Testing
```bash
./test-simulator.sh
```
## Implementation Details
- Uses Levenshtein distance for partial matching (threshold: 0.3)
- Automatic caching via HuggingFace datasets library
- Wrong answers generated by incrementing expected answer
- Debug output written to stderr

View File

@@ -0,0 +1,283 @@
#!/usr/bin/env python3
import argparse
import json
import random
import re
import time
import sys
import os
from typing import Dict, List, Optional
from dataclasses import dataclass, asdict
from pathlib import Path
import datasets
from flask import Flask, request, jsonify
# Set cache directory for HuggingFace datasets
cache_dir = Path.home() / ".cache" / "huggingface" / "datasets"
cache_dir.mkdir(parents=True, exist_ok=True)
os.environ["HF_DATASETS_CACHE"] = str(cache_dir)
def dice(s1: str, s2: str) -> float:
"""Calculate Dice coefficient between two strings based on bigram overlap."""
if not s1 and not s2:
return 1.0
def _bigrams(s: str):
return [s[i : i + 2] for i in range(len(s) - 1)]
bigrams1 = _bigrams(s1)
bigrams2 = _bigrams(s2)
if not bigrams1 and not bigrams2:
return 1.0
from collections import Counter
freq1 = Counter(bigrams1)
freq2 = Counter(bigrams2)
intersection = sum(min(freq1[bg], freq2[bg]) for bg in freq1)
dice_coeff = 2 * intersection / (len(bigrams1) + len(bigrams2))
return dice_coeff
def debug_log(message: str):
"""Log debug messages to both stdout and a file"""
print(message, file=sys.stderr)
with open("/tmp/simulator-debug.log", "a") as f:
f.write(message + "\n")
app = Flask(__name__)
@dataclass
class EvalState:
id: str
tasks: List[str]
task_states: Dict[str, Dict]
sampling_config: Dict
def normalize_number(s: str) -> Optional[int]:
match = re.match(r"\d+", s) # match digits from the start
if not match:
return None
return int(match.group(0))
class AimeDataset:
def __init__(self, split: str = "train"):
self.split = split
self.questions: List[Dict] = []
self._load_dataset()
def _load_dataset(self):
print(f"Loading AIME dataset (split: {self.split})...")
cache_path = Path.home() / ".cache" / "huggingface" / "datasets" / "AI-MO___aimo-validation-aime" / "default" / "0.0.0"
if cache_path.exists():
print(f"Using cached dataset from {cache_path}")
ds = datasets.load_dataset("AI-MO/aimo-validation-aime", split=self.split, cache_dir=str(cache_path))
else:
ds = datasets.load_dataset("AI-MO/aimo-validation-aime", split=self.split)
self.questions = list(ds)
print(f"AIME dataset loaded: {len(self.questions)} questions")
def find_question(self, request_text: str) -> Optional[Dict]:
best_match = None
best_distance = -1
best_index = -1
for i, question in enumerate(self.questions):
question_text = question["problem"]
request_lower = request_text.lower()
question_lower = question_text.lower()
# Exact match
if question_lower == request_lower:
debug_log(f"DEBUG: Found exact match at index {i}")
return question
# Remove LaTeX formatting for more flexible matching
question_no_latex = re.sub(r'\$[^$]+\$', '', question_text)
if question_no_latex.lower() == request_lower:
debug_log(f"DEBUG: Found match (no LaTeX) at index {i}")
return question
# Calculate Levenshtein distance for partial matches
# Only consider if request is at least 50% of question length
if len(request_lower) >= len(question_lower) * 0.5:
distance = dice(question_lower, request_lower)
if distance > best_distance:
best_distance = distance
best_match = question
best_index = i
if best_match and best_distance > 0.3: # Threshold for partial match
debug_log(f"DEBUG: Found best partial match at index {best_index} with distance {best_distance:.3f}")
return best_match
debug_log(f"DEBUG: No matching question found for: {request_text[:100]}...")
return None
def get_answer(self, question: Dict) -> str:
answer = question["answer"]
if isinstance(answer, str):
normalized = normalize_number(answer)
return str(normalized) if normalized is not None else answer
return str(answer)
class Simulator:
def __init__(
self,
port: int = 8033,
host: str = "localhost",
success_rate: float = 0.8,
dataset_split: str = "train"
):
self.port = port
self.host = host
self.success_rate = success_rate
self.dataset = AimeDataset(dataset_split)
self.eval_state = EvalState(
id="aime-2025",
tasks=["aime"],
task_states={},
sampling_config={"temperature": 0, "max_tokens": 2048}
)
def _generate_response(
self,
question: Dict,
should_be_correct: bool
) -> Dict:
expected_answer = self.dataset.get_answer(question)
if should_be_correct:
response_text = expected_answer
else:
response_text = self._generate_wrong_answer(question)
return {
"id": f"chatcmpl-{int(time.time())}",
"object": "chat.completion",
"created": int(time.time()),
"model": "llama",
"choices": [
{
"index": 0,
"message": {
"role": "assistant",
"content": response_text
},
"finish_reason": "stop"
}
],
"usage": {
"prompt_tokens": 100,
"completion_tokens": 50,
"total_tokens": 150
}
}
def _generate_wrong_answer(self, question: Dict) -> str:
expected_answer = self.dataset.get_answer(question)
if expected_answer.isdigit():
wrong_answer = str(int(expected_answer) + 1)
else:
wrong_answer = expected_answer + " (wrong)"
return wrong_answer
def _process_request(self, request_data: Dict) -> Dict:
messages = request_data.get("messages", [])
if not messages:
return {"error": "No messages in request"}
request_text = messages[0].get("content", "")
debug_log(f"DEBUG: Received request with content: {request_text[:150]}...")
question = self.dataset.find_question(request_text)
if not question:
debug_log(f"DEBUG: find_question returned None")
return {"error": "No matching question found"}
should_be_correct = random.random() < self.success_rate
response = self._generate_response(question, should_be_correct)
task_id = "aime"
self.eval_state.task_states[task_id] = {
"correct": should_be_correct,
"expected": self.dataset.get_answer(question),
"predicted": response["choices"][0]["message"]["content"]
}
return response
@app.route('/v1/chat/completions', methods=['POST'])
def chat_completions():
try:
request_data = request.get_json()
if not request_data:
return jsonify({"error": "Invalid JSON"}), 400
response = simulator._process_request(request_data)
return jsonify(response)
except Exception as e:
print(f"Error processing request: {e}")
return jsonify({"error": str(e)}), 500
def main():
parser = argparse.ArgumentParser(
description="llama-server simulator for testing eval scripts"
)
parser.add_argument(
"--port",
type=int,
default=8033,
help="Server port (default: 8033)"
)
parser.add_argument(
"--host",
type=str,
default="localhost",
help="Server host (default: localhost)"
)
parser.add_argument(
"--success-rate",
type=float,
default=0.8,
help="Success rate 0-1 (default: 0.8)"
)
parser.add_argument(
"--dataset-split",
type=str,
default="train",
help="AIME dataset split to use (default: train)"
)
args = parser.parse_args()
global simulator
simulator = Simulator(
port=args.port,
host=args.host,
success_rate=args.success_rate,
dataset_split=args.dataset_split
)
print("\n=== llama-server-simulator ===")
print(f"Server running on http://{args.host}:{args.port}")
print(f"Success rate: {args.success_rate}")
print(f"AIME dataset loaded: {len(simulator.dataset.questions)} questions")
print("\nPress Ctrl+C to stop\n")
app.run(host=args.host, port=args.port, debug=False)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,86 @@
#!/bin/bash
set -e
# Get the directory where this script is located
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
echo "=== llama-server-simulator Test Script ==="
echo ""
PORT=8033
SUCCESS_RATE=0.8
TEST_PORT=8034
echo "Starting simulator on port $PORT with success rate $SUCCESS_RATE..."
source "$SCRIPT_DIR/venv/bin/activate"
python3 "$SCRIPT_DIR/llama-server-simulator.py" --port $PORT --success-rate $SUCCESS_RATE > /tmp/simulator-test.log 2>&1 &
SIMULATOR_PID=$!
echo "Waiting for simulator to start..."
sleep 5
# Helper function to make a request and extract the answer
make_request() {
local question="$1"
curl -s -X POST http://localhost:$PORT/v1/chat/completions \
-H "Content-Type: application/json" \
-d "{
\"model\": \"llama\",
\"messages\": [
{\"role\": \"user\", \"content\": \"$question\"}
],
\"temperature\": 0,
\"max_tokens\": 2048
}" | python3 -c "import sys, json; data = json.load(sys.stdin); print(data.get('choices', [{}])[0].get('message', {}).get('content', data.get('error', 'No response')))"
}
# Test question (repeated in multiple tests)
TEST_QUESTION="Quadratic polynomials P(x) and Q(x) have leading coefficients 2 and -2, respectively. The graphs of both polynomials pass through the two points (16,54) and (20,53). Find P(0) + Q(0)."
echo ""
echo "=== Test 1: Correct Answer ==="
echo "Sending request with known question..."
answer=$(make_request "$TEST_QUESTION")
echo "Answer: $answer"
echo "Expected: 116"
echo "Correct: $([ "$answer" == "116" ] && echo "Yes" || echo "No")"
echo ""
echo "=== Test 2: Wrong Answer ==="
echo "Sending request with known question (success rate 0.0)..."
answer=$(make_request "$TEST_QUESTION")
echo "Answer: $answer"
echo "Expected: 116"
echo "Correct: $([ "$answer" == "116" ] && echo "Yes" || echo "No")"
echo ""
echo "=== Test 3: No Matching Question ==="
echo "Sending request with non-matching text..."
response=$(make_request "What is the capital of France?")
echo "Response: $response"
echo "Expected: No matching question found"
echo "Correct: $([ "$response" == "No matching question found" ] && echo "Yes" || echo "No")"
echo ""
echo "=== Test 4: Success Rate Verification ==="
echo "Sending 10 requests to test success rate..."
correct_count=0
for i in {1..10}; do
answer=$(make_request "$TEST_QUESTION")
if [ "$answer" == "116" ]; then
correct_count=$((correct_count + 1))
fi
echo " Request $i: Answer = $answer"
done
echo "Correct answers: $correct_count/10"
echo "Expected: ~8/10 (80% success rate)"
echo "Success rate: $(echo "scale=1; $correct_count * 10" | bc)%"
echo ""
echo "=== Test Complete ==="
echo "Stopping simulator..."
kill $SIMULATOR_PID 2>/dev/null
wait $SIMULATOR_PID 2>/dev/null || true
echo "Simulator stopped."

View File

@@ -576,13 +576,13 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
DOWNLOAD_EXTRACT_TIMESTAMP NEW
URL_HASH MD5=${KLEIDIAI_ARCHIVE_MD5})
FetchContent_Populate(KleidiAI_Download)
FetchContent_GetProperties(KleidiAI_Download
SOURCE_DIR KLEIDIAI_SRC
POPULATED KLEIDIAI_POPULATED)
if (NOT KLEIDIAI_POPULATED)
message(FATAL_ERROR "KleidiAI source downloaded failed.")
FetchContent_Populate(KleidiAI_Download)
FetchContent_GetProperties(KleidiAI_Download SOURCE_DIR KLEIDIAI_SRC)
endif()
add_compile_definitions(GGML_USE_CPU_KLEIDIAI)

View File

@@ -2715,14 +2715,14 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
#pragma unroll
for (int l = 0; l < QR2_XXS; ++l) {
const int * grid_pos = (const int *) (iq2xxs_grid + aux8[l]);
const int signs_packed = ksigns_iq2xs[(aux32 >> (7*l)) & 0x7F];
const uint2 grid_pos = ((const uint2*)iq2xxs_grid)[aux8[l]];
const uint32_t signs = unpack_ksigns(aux32 >> (7 * l));
const int signs0 = __vcmpne4(((signs_packed & 0x03) << 7) | ((signs_packed & 0x0C) << 21), 0x00000000);
const int grid0 = __vsub4(grid_pos[0] ^ signs0, signs0);
const int signs0 = __vcmpne4(signs & 0x08040201, 0);
const int grid0 = __vsub4(grid_pos.x ^ signs0, signs0);
const int signs1 = __vcmpne4(((signs_packed & 0x30) << 3) | ((signs_packed & 0xC0) << 17), 0x00000000);
const int grid1 = __vsub4(grid_pos[1] ^ signs1, signs1);
const int signs1 = __vcmpne4(signs & 0x80402010, 0);
const int grid1 = __vsub4(grid_pos.y ^ signs1, signs1);
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + 8*kqsx + (2*l + 0)] = grid0;
@@ -2733,12 +2733,12 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
}
const int ls = aux32 >> 28;
const int ls = aux32 >> 27 | 1; // (scale * 2 + 1)
const float d = bxi->d;
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + kqsx] = (ls*d + d/2)/4;
x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + kqsx] = d * ls / 8; // (d * scale + d / 2) / 4
#else
x_df[i*(MMQ_TILE_NE_K/4) + i/4 + kqsx] = (ls*d + d/2)/4;
x_df[i*(MMQ_TILE_NE_K/4) + i/4 + kqsx] = d * ls / 8; // (d * scale + d / 2) / 4
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
}
}
@@ -2776,11 +2776,14 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
#pragma unroll
for (int l = 0; l < QR2_XS; ++l) {
const uint32_t * grid_pos = (const uint32_t *)(iq2xs_grid + (q2[l] & 0x000001FF));
const uint32_t * signs = (const uint32_t *)(ksigns64 + (q2[l] >> 9));
const uint2 grid_pos = ((const uint2*)iq2xs_grid)[q2[l] & 0x1FF];
const uint32_t signs = unpack_ksigns(q2[l] >> 9);
const int grid_l = __vsub4(grid_pos[0] ^ signs[0], signs[0]);
const int grid_h = __vsub4(grid_pos[1] ^ signs[1], signs[1]);
const int signs0 = __vcmpne4(signs & 0x08040201, 0);
const int grid_l = __vsub4(grid_pos.x ^ signs0, signs0);
const int signs1 = __vcmpne4(signs & 0x80402010, 0);
const int grid_h = __vsub4(grid_pos.y ^ signs1, signs1);
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
x_qs[i*MMQ_MMA_TILE_X_K_Q3_K + 8*kqsx + (2*l + 0)] = grid_l;
@@ -2904,11 +2907,13 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
#pragma unroll
for (int l = 0; l < QR3_XXS; ++l) {
const int2 grid_pos = make_int2(iq3xxs_grid[q3[2*l+0]], iq3xxs_grid[q3[2*l+1]]);
const uint32_t signs = unpack_ksigns(aux32 >> (7*l));
const int * signs = (const int *)(ksigns64 + ((aux32 >> (7*l)) & 0x7F));
const int signs0 = __vcmpne4(signs & 0x08040201, 0);
const int grid_l = __vsub4(grid_pos.x ^ signs0, signs0);
const int grid_l = __vsub4(grid_pos.x ^ signs[0], signs[0]);
const int grid_h = __vsub4(grid_pos.y ^ signs[1], signs[1]);
const int signs1 = __vcmpne4(signs & 0x80402010, 0);
const int grid_h = __vsub4(grid_pos.y ^ signs1, signs1);
#if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + 8*kqsx + (2*l + 0)] = grid_l;

View File

@@ -94,6 +94,15 @@ static __device__ __forceinline__ int2 get_int_from_table_16(const int & q4, con
#endif
}
static __device__ __forceinline__ uint32_t unpack_ksigns(const uint8_t v) {
// v is a 7 bit int, with the 8th sign being encodable as popcnt
// with xor we can "correct" the bit instead of having to mask
const uint32_t p = __popc(v) & 1;
const uint32_t s = v ^ p << 7;
// broadcast over uint to allow for 0x08040201 / 0x80402010 as selectors
return s * 0x01010101;
}
// VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called
// MMVQ = mul_mat_vec_q, MMQ = mul_mat_q
@@ -905,22 +914,22 @@ static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1(
int sumi = 0;
#pragma unroll
for (int k0 = 0; k0 < 8; k0 += 2) {
const int * grid_pos = (const int *) (iq2xxs_grid + aux8[k0/2]);
const int signs_packed = ksigns_iq2xs[(aux32 >> (7*k0/2)) & 0x7F];
const uint2 grid_pos = ((const uint2*)iq2xxs_grid)[aux8[k0/2]];
const uint32_t signs = unpack_ksigns(aux32 >> (7 * k0 / 2));
const int signs0 = __vcmpne4(((signs_packed & 0x03) << 7) | ((signs_packed & 0x0C) << 21), 0x00000000);
const int grid0 = __vsub4(grid_pos[0] ^ signs0, signs0);
const int signs0 = __vcmpne4(signs & 0x08040201, 0);
const int grid0 = __vsub4(grid_pos.x ^ signs0, signs0);
const int u0 = get_int_b4(bq8_1[iqs/2].qs, k0 + 0);
sumi = ggml_cuda_dp4a(grid0, u0, sumi);
const int signs1 = __vcmpne4(((signs_packed & 0x30) << 3) | ((signs_packed & 0xC0) << 17), 0x00000000);
const int grid1 = __vsub4(grid_pos[1] ^ signs1, signs1);
const int signs1 = __vcmpne4(signs & 0x80402010, 0);
const int grid1 = __vsub4(grid_pos.y ^ signs1, signs1);
const int u1 = get_int_b4(bq8_1[iqs/2].qs, k0 + 1);
sumi = ggml_cuda_dp4a(grid1, u1, sumi);
}
const int ls = aux32 >> 28;
sumi = (ls*sumi + sumi/2)/4;
const int ls = aux32 >> 27 | 1; // (scale * 2 + 1)
sumi = sumi * ls / 8; // (sumi * scale + sumi / 2) / 4
const float d = __half2float(bq2->d) * __low2float(bq8_1[iqs/2].ds);
return d * sumi;
}
@@ -942,13 +951,15 @@ static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
int sumi1 = 0;
#pragma unroll
for (int l0 = 0; l0 < 8; l0 += 2) {
const uint32_t * grid_pos = (const uint32_t *)(iq2xs_grid + (q2[l0/2] & 0x000001FF));
const uint32_t * signs = (const uint32_t *)(ksigns64 + (q2[l0/2] >> 9));
const int grid_l = __vsub4(grid_pos[0] ^ signs[0], signs[0]);
const int grid_h = __vsub4(grid_pos[1] ^ signs[1], signs[1]);
const uint2 grid_pos = ((const uint2*)iq2xs_grid)[q2[l0/2] & 0x1FF];
const uint32_t signs = unpack_ksigns(q2[l0/2] >> 9);
const int signs0 = __vcmpne4(signs & 0x08040201, 0);
const int grid_l = __vsub4(grid_pos.x ^ signs0, signs0);
const int u0 = get_int_b4(bq8_1[iqs/2].qs, l0 + 0);
const int signs1 = __vcmpne4(signs & 0x80402010, 0);
const int grid_h = __vsub4(grid_pos.y ^ signs1, signs1);
const int u1 = get_int_b4(bq8_1[iqs/2].qs, l0 + 1);
if (l0 < 4) {
@@ -1028,13 +1039,16 @@ static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1(
#pragma unroll
for (int l0 = 0; l0 < 8; l0 += 2) {
const int2 grid_pos = make_int2(iq3xxs_grid[q3[l0 + 0]], iq3xxs_grid[q3[l0 + 1]]);
const uint32_t signs = unpack_ksigns(aux32 >> (7*l0/2));
const int * signs = (const int *)(ksigns64 + ((aux32 >> (7*l0/2)) & 0x7F));
const int grid_l = __vsub4(grid_pos.x ^ signs[0], signs[0]);
const int grid_h = __vsub4(grid_pos.y ^ signs[1], signs[1]);
const int signs0 = __vcmpne4(signs & 0x08040201, 0);
const int grid_l = __vsub4(grid_pos.x ^ signs0, signs0);
const int u0 = get_int_b4(bq8_1[iqs/2].qs, l0 + 0);
const int signs1 = __vcmpne4(signs & 0x80402010, 0);
const int grid_h = __vsub4(grid_pos.y ^ signs1, signs1);
const int u1 = get_int_b4(bq8_1[iqs/2].qs, l0 + 1);
sumi = ggml_cuda_dp4a(grid_l, u0, sumi);
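As an editorial aside on the `unpack_ksigns` helper introduced in this file: the sketch below (Python, not CUDA) mirrors the described approach of completing the 7-bit sign group from its parity, broadcasting it, and selecting one sign bit per byte; the `vcmpne4` stand-in is an assumption for illustration.

```python
# Editorial sketch of the sign unpacking described above: the xor "corrects"
# bit 7 from the parity of the lower bits (even if the input carried a stray
# high bit), and the byte is broadcast so the masks 0x08040201 / 0x80402010
# select one sign bit per byte.
def unpack_ksigns(v: int) -> int:
    v &= 0xFF                                  # uint8_t parameter truncation
    p = bin(v).count("1") & 1                  # __popc(v) & 1
    s = v ^ (p << 7)                           # restore the 8th sign via parity
    return (s * 0x01010101) & 0xFFFFFFFF       # broadcast to all four bytes

def vcmpne4(x: int) -> int:                    # stand-in for __vcmpne4: 0xFF per non-zero byte
    return sum(0xFF << (8 * i) for i in range(4) if (x >> (8 * i)) & 0xFF)

signs = unpack_ksigns(0b0101101)
signs0 = vcmpne4(signs & 0x08040201)           # sign bits 0..3, one per output byte
signs1 = vcmpne4(signs & 0x80402010)           # sign bits 4..7, one per output byte
```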

View File

@@ -5,7 +5,7 @@ import os
import sys
import subprocess
HTTPLIB_VERSION = "f80864ca031932351abef49b74097c67f14719c6"
HTTPLIB_VERSION = "d4180e923f846b44a3d30acd938438d6e64fc9f6"
vendor = {
"https://github.com/nlohmann/json/releases/latest/download/json.hpp": "vendor/nlohmann/json.hpp",

View File

@@ -28,10 +28,6 @@ target_link_libraries(${TARGET} PUBLIC common mtmd ${CMAKE_THREAD_LIBS_INIT})
set(TARGET llama-server)
if (NOT LLAMA_HTTPLIB)
message(FATAL_ERROR "LLAMA_HTTPLIB is OFF, cannot build llama-server. Hint: to skip building server, set -DLLAMA_BUILD_SERVER=OFF")
endif()
set(TARGET_SRCS
server.cpp
server-http.cpp

View File

@@ -1264,78 +1264,32 @@ int poll_wrapper(struct pollfd *fds, nfds_t nfds, int timeout) {
#endif
}
template <bool Read>
ssize_t select_impl(socket_t sock, time_t sec, time_t usec) {
#ifdef __APPLE__
if (sock >= FD_SETSIZE) { return -1; }
fd_set fds, *rfds, *wfds;
FD_ZERO(&fds);
FD_SET(sock, &fds);
rfds = (Read ? &fds : nullptr);
wfds = (Read ? nullptr : &fds);
timeval tv;
tv.tv_sec = static_cast<long>(sec);
tv.tv_usec = static_cast<decltype(tv.tv_usec)>(usec);
return handle_EINTR([&]() {
return select(static_cast<int>(sock + 1), rfds, wfds, nullptr, &tv);
});
#else
ssize_t select_impl(socket_t sock, short events, time_t sec,
time_t usec) {
struct pollfd pfd;
pfd.fd = sock;
pfd.events = (Read ? POLLIN : POLLOUT);
pfd.events = events;
pfd.revents = 0;
auto timeout = static_cast<int>(sec * 1000 + usec / 1000);
return handle_EINTR([&]() { return poll_wrapper(&pfd, 1, timeout); });
#endif
}
ssize_t select_read(socket_t sock, time_t sec, time_t usec) {
return select_impl<true>(sock, sec, usec);
return select_impl(sock, POLLIN, sec, usec);
}
ssize_t select_write(socket_t sock, time_t sec, time_t usec) {
return select_impl<false>(sock, sec, usec);
return select_impl(sock, POLLOUT, sec, usec);
}
Error wait_until_socket_is_ready(socket_t sock, time_t sec,
time_t usec) {
#ifdef __APPLE__
if (sock >= FD_SETSIZE) { return Error::Connection; }
fd_set fdsr, fdsw;
FD_ZERO(&fdsr);
FD_ZERO(&fdsw);
FD_SET(sock, &fdsr);
FD_SET(sock, &fdsw);
timeval tv;
tv.tv_sec = static_cast<long>(sec);
tv.tv_usec = static_cast<decltype(tv.tv_usec)>(usec);
auto ret = handle_EINTR([&]() {
return select(static_cast<int>(sock + 1), &fdsr, &fdsw, nullptr, &tv);
});
if (ret == 0) { return Error::ConnectionTimeout; }
if (ret > 0 && (FD_ISSET(sock, &fdsr) || FD_ISSET(sock, &fdsw))) {
auto error = 0;
socklen_t len = sizeof(error);
auto res = getsockopt(sock, SOL_SOCKET, SO_ERROR,
reinterpret_cast<char *>(&error), &len);
auto successful = res >= 0 && !error;
return successful ? Error::Success : Error::Connection;
}
return Error::Connection;
#else
struct pollfd pfd_read;
pfd_read.fd = sock;
pfd_read.events = POLLIN | POLLOUT;
pfd_read.revents = 0;
auto timeout = static_cast<int>(sec * 1000 + usec / 1000);
@@ -1354,7 +1308,6 @@ Error wait_until_socket_is_ready(socket_t sock, time_t sec,
}
return Error::Connection;
#endif
}
bool is_socket_alive(socket_t sock) {
@@ -7138,17 +7091,6 @@ Server::process_request(Stream &strm, const std::string &remote_addr,
res.version = "HTTP/1.1";
res.headers = default_headers_;
#ifdef __APPLE__
// Socket file descriptor exceeded FD_SETSIZE...
if (strm.socket() >= FD_SETSIZE) {
Headers dummy;
detail::read_headers(strm, dummy);
res.status = StatusCode::InternalServerError_500;
output_error_log(Error::ExceedMaxSocketDescriptorCount, &req);
return write_response(strm, close_connection, req, res);
}
#endif
// Request line and headers
if (!parse_request_line(line_reader.ptr(), req)) {
res.status = StatusCode::BadRequest_400;
@@ -12063,7 +12005,7 @@ bool get_cert_sans(cert_t cert, std::vector<SanEntry> &sans) {
if (!names) return true; // No SANs is valid
auto count = sk_GENERAL_NAME_num(names);
for (int i = 0; i < count; i++) {
for (decltype(count) i = 0; i < count; i++) {
auto gen = sk_GENERAL_NAME_value(names, i);
if (!gen) continue;

View File

@@ -8,8 +8,8 @@
#ifndef CPPHTTPLIB_HTTPLIB_H
#define CPPHTTPLIB_HTTPLIB_H
#define CPPHTTPLIB_VERSION "0.31.0"
#define CPPHTTPLIB_VERSION_NUM "0x001F00"
#define CPPHTTPLIB_VERSION "0.32.0"
#define CPPHTTPLIB_VERSION_NUM "0x002000"
/*
* Platform compatibility check