Mirror of https://github.com/ggerganov/llama.cpp.git, synced 2026-02-19 14:13:22 +02:00

Compare commits: b8060...gg/scripts (53 commits)
Commit SHA1s (author and date columns were not captured):

c0c3e428dd, 7f049860b4, 2ffa45edfc, 9c29be1177, 013963cfd5, e2e998a2d6,
6c41664b8b, 7b84af8051, 60a501e138, e6e777cfb3, ad3a54eb68, c6d70b9bea,
de956a6ca8, 350e7c1409, db10dda1f3, 52759bf078, 99e3c3d02c, c6315655b7,
f762a71d56, 73e61d5b75, cffd268bb3, e8a807519a, 1db8428f00, 7751ae2796,
d2b10302ce, 68dde884d6, fd90796da2, 8156d549f6, 9695e6feb4, fb1481d60d,
812ae13ec1, e79e8d02d5, a939f4c47e, 62b04cef54, 37b26cafee, 04f6872116,
c2619c18bf, 87f8930968, 9453f9de12, 5a1be6ce37, a80814e97b, 5cc2258e82,
c87af1d527, 23d4e21a81, 07d5e1e0ea, 8839037528, 89cab3dbc5, c2d83ca048,
c05df17ce3, 27b93cbd15, 6e67fd2144, 9e118b97c4, 57088276d4
@@ -112,7 +112,6 @@ option(LLAMA_TOOLS_INSTALL "llama: install tools" ${LLAMA_TOOLS_INSTALL_
 option(LLAMA_TESTS_INSTALL "llama: install tests" ON)
 
 # 3rd party libs
-option(LLAMA_HTTPLIB "llama: httplib for downloading functionality" ON)
 option(LLAMA_OPENSSL "llama: use openssl to support HTTPS" ON)
 option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured output in common utils" OFF)
 
@@ -197,9 +196,7 @@ add_subdirectory(src)
 
 if (LLAMA_BUILD_COMMON)
     add_subdirectory(common)
-    if (LLAMA_HTTPLIB)
-        add_subdirectory(vendor/cpp-httplib)
-    endif()
+    add_subdirectory(vendor/cpp-httplib)
 endif()
 
 if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
@@ -449,10 +449,9 @@ cmake -B build-visionos -G Xcode \
     -DCMAKE_SYSTEM_NAME=visionOS \
     -DCMAKE_OSX_SYSROOT=xros \
     -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xros \
-    -DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \
-    -DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \
+    -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
+    -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
     -DLLAMA_OPENSSL=OFF \
-    -DLLAMA_HTTPLIB=OFF \
     -DLLAMA_BUILD_SERVER=OFF \
     -S .
 cmake --build build-visionos --config Release -- -quiet
@@ -465,10 +464,9 @@ cmake -B build-visionos-sim -G Xcode \
     -DCMAKE_SYSTEM_NAME=visionOS \
     -DCMAKE_OSX_SYSROOT=xrsimulator \
     -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xrsimulator \
-    -DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \
-    -DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \
+    -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
+    -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
     -DLLAMA_OPENSSL=OFF \
-    -DLLAMA_HTTPLIB=OFF \
     -DLLAMA_BUILD_SERVER=OFF \
     -S .
 cmake --build build-visionos-sim --config Release -- -quiet
@@ -112,11 +112,7 @@ endif()
 
 # TODO: use list(APPEND LLAMA_COMMON_EXTRA_LIBS ...)
 set(LLAMA_COMMON_EXTRA_LIBS build_info)
-
-if (LLAMA_HTTPLIB)
-    target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_HTTPLIB)
-    set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} cpp-httplib)
-endif()
+set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} cpp-httplib)
 
 if (LLAMA_LLGUIDANCE)
     include(ExternalProject)
@@ -19,9 +19,7 @@
 #include <thread>
 #include <vector>
 
-#if defined(LLAMA_USE_HTTPLIB)
 #include "http.h"
-#endif
 
 #ifndef __EMSCRIPTEN__
 #ifdef __linux__
@@ -142,8 +140,6 @@ std::pair<std::string, std::string> common_download_split_repo_tag(const std::st
     return {hf_repo, tag};
 }
 
-#if defined(LLAMA_USE_HTTPLIB)
-
 class ProgressBar {
     static inline std::mutex mutex;
     static inline std::map<const ProgressBar *, int> lines;
@@ -768,30 +764,6 @@ std::string common_docker_resolve_model(const std::string & docker) {
     }
 }
 
-#else
-
-common_hf_file_res common_get_hf_file(const std::string &, const std::string &, bool, const common_header_list &) {
-    throw std::runtime_error("download functionality is not enabled in this build");
-}
-
-bool common_download_model(const common_params_model &, const std::string &, bool, const common_header_list &) {
-    throw std::runtime_error("download functionality is not enabled in this build");
-}
-
-std::string common_docker_resolve_model(const std::string &) {
-    throw std::runtime_error("download functionality is not enabled in this build");
-}
-
-int common_download_file_single(const std::string &,
-                                const std::string &,
-                                const std::string &,
-                                bool,
-                                const common_header_list &) {
-    throw std::runtime_error("download functionality is not enabled in this build");
-}
-
-#endif // defined(LLAMA_USE_HTTPLIB)
-
 std::vector<common_cached_model_info> common_list_cached_models() {
     std::vector<common_cached_model_info> models;
     const std::string cache_dir = fs_get_cache_directory();
@@ -242,10 +242,10 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl
 |------------|-------------|------|-------|
 | FP32       | ✅          | ✅   | ❓    |
 | FP16       | ✅          | ✅   | ❓    |
-| BF16       | 🚫          | ✅   | ❓    |
+| BF16       | ✅          | ✅   | ❓    |
 | Q4_0       | ✅          | ❓   | ❓    |
 | Q4_1       | ✅          | ❓   | ❓    |
-| MXFP4      | 🚫          | ❓   | ❓    |
+| MXFP4      | ✅          | ❓   | ❓    |
 | Q5_0       | ✅          | ❓   | ❓    |
 | Q5_1       | ✅          | ❓   | ❓    |
 | Q8_0       | ✅          | ❓   | ❓    |
@@ -272,4 +272,4 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl
 - 🚫 - acceleration unavailable, will still run using scalar implementation
 - ❓ - acceleration unknown, please contribute if you can test it yourself
 
-Last Updated by **Aaron Teo (aaron.teo1@ibm.com)** on Sep 7, 2025.
+Last Updated by **Aaron Teo (aaron.teo1@ibm.com)** on Feb 15, 2026.
examples/llama-eval/AGENTS.md (new file, 190 lines)
@@ -0,0 +1,190 @@
# llama-eval Codebase Guidelines

## Overview

This directory contains Python evaluation tools for llama.cpp:
- `llama-eval.py` - Main evaluation tool with multiple datasets (AIME, AIME2025, GSM8K, GPQA)
- `llama-server-simulator.py` - Flask-based server simulator for testing
- `test-simulator.sh` - Test script for the simulator

## Build/Run Commands

### Virtual Environment
The project uses a virtual environment located at `venv/`:
```bash
source venv/bin/activate
```

### Running the Main Evaluator
```bash
python llama-eval.py \
    --server http://127.0.0.1:8013 \
    --model gpt-oss-20b-hf-low \
    --dataset aime \
    --n_cases 10 \
    --grader-type llm \
    --seed 42
```

### Running the Simulator (for testing)
```bash
python llama-server-simulator.py --port 8033 --success-rate 0.8
```

### Running Tests
```bash
./test-simulator.sh
```

## Code Style Guidelines

### Imports
- Standard library imports first (argparse, json, os, re, subprocess, sys, time)
- Third-party imports (requests, tqdm, datasets, flask) after the standard library
- Relative imports are not used
- Group imports by category with a blank line between groups

### Formatting
- 4-space indentation
- Max line length: 125 characters (per the parent project's .flake8)
- Use double quotes for strings
- Use triple double quotes for docstrings
- Binary operators at the beginning of continued lines

### Naming Conventions
- Classes: PascalCase (e.g., `AimeDataset`, `Grader`, `Processor`)
- Functions: snake_case (e.g., `normalize_number`, `get_prompt`)
- Variables: snake_case (e.g., `question_text`, `correct_count`)
- Constants: UPPER_SNAKE_CASE (e.g., `GRADER_PATTERNS`, `TEMPLATE_REGISTRY`)
- Private methods: prefix with underscore (e.g., `_load_dataset`, `_grade_regex`)

### Types
- Use type hints for all function signatures
- Import from the `typing` module: `Dict`, `List`, `Optional`, `Any`, `Tuple`
- Use `@dataclass` for data structures
- Prefer `Optional[T]` over `Union[T, None]`

### Error Handling
- Use try/except for network requests and file operations
- Return `None` or `False` on errors when appropriate
- Use `ValueError` for invalid arguments
- Use `FileNotFoundError` for missing files
- CLI scripts should handle exceptions gracefully

### Dataclasses
- Use `@dataclass` for structured data
- Define fields with explicit types
- Use `Optional[T]` for nullable fields
- Provide default values where appropriate

### String Formatting
- Use f-strings for formatting (Python 3.6+)
- Use triple double quotes for multi-line strings
- Escape backslashes in regex patterns: `r'\\boxed{(\d+)}'`

### File Paths
- Use `pathlib.Path` instead of string paths
- Create directories with `mkdir(parents=True, exist_ok=True)`
- Use `Path.home()` for the user home directory

### Logging
- Use `print()` for user-facing output
- Use `sys.stderr` for debug logging
- The simulator writes debug logs to `/tmp/simulator-debug.log`

### Testing

- The test script uses bash with `set -e` for strict error handling
- The simulator runs in the background with PID tracking
- Tests verify correct answers, error cases, and edge cases
- Use `curl` for HTTP testing in shell scripts

### Whitespace Cleanup
- Remove trailing whitespace from all lines
- When making edits, do not leave trailing whitespace

## Dataset Support

### AIME Dataset
- 90 questions from past AIME competitions
- Answers in `\boxed{answer}` format
- Supports regex, CLI, and LLM grading

### AIME2025 Dataset
- 30 questions from 2025 AIME I & II
- Answers in `\boxed{answer}` format
- Requires loading two config parts

### GSM8K Dataset
- 7473 math word problems
- Answers are numeric values following the `####` separator
- Supports regex, CLI, and LLM grading

### GPQA Dataset
- 198 questions from GPQA Diamond
- Multiple choice with shuffled options (A, B, C, D)
- **Requires LLM grader** (returns letter A/B/C/D)

## Grading Types

### Regex Grader
- Built-in patterns per dataset
- Prioritizes `\boxed{}` for the AIME datasets
- Extracts the last number for GSM8K

### CLI Grader
- External script interface
- Call: `grader.sh --answer <pred> --expected <gold>`
- Exit code 0 = correct, non-zero = incorrect

### LLM Grader
- Uses a judge model for answer extraction
- Includes few-shot examples
- Case-insensitive comparison
- Required for GPQA

## Configuration

### Sampling Parameters (Optional)
- `--temperature`: Sampling temperature
- `--top-k`: Top K sampling
- `--top-p`: Top P sampling
- `--min-p`: Min P sampling
- Only passed to the API if explicitly specified (see the sketch below)
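A minimal sketch of how this opt-in forwarding can work; the function and attribute names here are illustrative assumptions, not the actual `llama-eval.py` internals:

```python
# Sketch only: argparse defaults of None mean "not specified", and such keys
# are simply omitted from the request body so the server keeps its defaults.
def build_sampling_config(args) -> dict:
    cfg = {}
    for key in ("temperature", "top_k", "top_p", "min_p"):
        val = getattr(args, key, None)
        if val is not None:
            cfg[key] = val
    return cfg
```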
### Default Values
- `--n_predict`: -1 (infinite)
- `--grader-type`: llm
- `--seed`: 1234
- `--threads`: 32
- `--output`: llama-eval-state.json

## Output Format

### Progress Table
- Shows task ID, dataset, prompt (truncated to 43 chars), expected answer, status
- Uses `tqdm` for progress bars

### Results Summary
- Format: `Results: X/Y correct (Z%)`
- Displayed after all tasks complete

### JSON Output
- Complete eval state saved to the output file
- Contains: task IDs, correctness, prompts, extracted answers, sampling config
- Uses `dataclasses.asdict()` for serialization

## HuggingFace Datasets

- Cache directory: `~/.cache/huggingface/datasets`
- Set via the `HF_DATASETS_CACHE` environment variable
- Telemetry disabled via `HF_HUB_DISABLE_TELEMETRY=1`
- Datasets loaded with `datasets.load_dataset()`

## Flask Simulator

- Runs on a configurable port (default: 8033)
- Endpoint: `/v1/chat/completions` (OpenAI-compatible)
- Uses the Dice coefficient for question matching
- Configurable success rate for testing
- Debug logs to `/tmp/simulator-debug.log`
examples/llama-eval/IMPLEMENTATION.md (new file, 94 lines)
@@ -0,0 +1,94 @@
# llama-eval Implementation Summary

## Overview

Simple evaluation tool for llama.cpp with support for multiple datasets (AIME, GSM8K, GPQA) and flexible grading (regex, CLI, LLM).

## Key Features

- **Multiple Datasets**: AIME, GSM8K, GPQA with proper answer extraction
- **Flexible Grading**: Regex, CLI, or LLM-based grading
- **Parallel Processing**: Configurable thread count for concurrent requests
- **Sampling Parameters**: Temperature, Top K, Top P, Min P (optional)
- **Real-time Feedback**: Progress tracking with detailed output
- **JSON Output**: Complete eval state saved for debugging
- **GPQA Support**: Answer shuffling with reproducible results

## Architecture

### Eval State
```python
@dataclass
class EvalState:
    id: str
    tasks: List[str]
    task_states: Dict[str, Dict[str, Any]]
    sampling_config: Dict[str, Any]
```
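How a state like this ends up in the JSON output file can be sketched as follows, reusing the `EvalState` defined above and matching the documented use of `dataclasses.asdict()`; the field values are invented for illustration:

```python
import json
from dataclasses import asdict

state = EvalState(
    id="run-1",
    tasks=["aime_000_001"],
    task_states={"aime_000_001": {"correct": True}},
    sampling_config={"seed": 42},
)

# The whole dataclass tree serializes to plain dicts/lists for the state file.
with open("llama-eval-state.json", "w") as f:
    json.dump(asdict(state), f, indent=2)
```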
### Processor
- Handles processing, grading, and state management
- Thread-safe concurrent execution
- Configurable sampling parameters

### Grader
- Abstract grading interface supporting multiple types
- Regex grader with dataset-specific patterns
- CLI grader with an external script interface
- LLM grader with configurable server and model

### Datasets
- `AimeDataset`: 90 past AIME questions
- `Aime2025Dataset`: 30 AIME 2025 I & II questions
- `Gsm8kDataset`: 7473 math word problems
- `GpqaDataset`: 198 GPQA Diamond questions with shuffling

## Configuration

### Sampling Parameters (Optional)
- `--temperature`: Sampling temperature
- `--top-k`: Top K sampling
- `--top-p`: Top P sampling
- `--min-p`: Min P sampling
- Only passed if explicitly specified

### Grading Types
- **regex**: Built-in patterns for each dataset
- **cli**: External script with `--answer` and `--expected` args
- **llm**: LLM-based extraction with few-shot examples and configurable server/model

### Dataset Requirements
- **AIME**: Supports regex, CLI, or LLM grader
- **AIME2025**: Supports regex, CLI, or LLM grader
- **GSM8K**: Supports regex, CLI, or LLM grader
- **GPQA**: Requires LLM grader

## Output Format

### Progress Table
```
Task ID       Dataset  Prompt (first 43 chars)                      Expected  Status
aime_000_001  AIME     Complete the following reactions and sel...  A         pending
```

### Results Summary
```
============================================================
Results: 8/10 correct (80.0%)
============================================================
```

### JSON Output
Complete eval state with task IDs, correctness, prompts, extracted answers, and sampling configuration.

## Technical Details

- Default max tokens: -1 (infinite)
- Default grader type: llm
- Default seed: 1234
- Default threads: 32
- Prompt truncation: first 43 chars + padding + "..."
- Response truncation: last 10 lines for grading
- GPQA requires the LLM grader (returns letter A/B/C/D)
- Judge model defaults to the evaluated model if not specified
- Sample answers defined in the SAMPLE_ANSWERS dict for few-shot learning
examples/llama-eval/README.md (new file, 112 lines)
@@ -0,0 +1,112 @@
# llama-eval Evaluation Tool

Simple evaluation tool for llama.cpp with support for multiple datasets.

## Features

- **Multiple Datasets**: AIME, GSM8K, GPQA
- **Flexible Grading**: Regex, CLI, or LLM-based grading
- **Parallel Processing**: Configurable thread count
- **Real-time Feedback**: Progress tracking with detailed output
- **Sampling Parameters**: Temperature, Top K, Top P, Min P
- **JSON Output**: Complete eval state saved for debugging

## Usage

```bash
python llama-eval.py \
    --server http://127.0.0.1:8013 \
    --model gpt-oss-20b-hf-low \
    --judge-model gpt-oss-20b-hf-medium \
    --dataset aime \
    --n_cases 10 \
    --grader-type llm \
    --seed 42
```

## CLI Arguments

- `--server`: llama-server URL (default: http://127.0.0.1:8013)
- `--model`: Model name for evaluation (default: llama)
- `--judge-model`: Model name for the LLM judge (default: same as main model)
- `--judge-server`: Server URL for the LLM judge (default: same as main server)
- `--dataset`: Dataset type (aime, aime2025, gsm8k, gpqa)
- `--n_cases`: Number of cases to evaluate (default: all)
- `--n_predict`: Max tokens to predict per prompt (default: -1, infinite)
- `--temperature`: Sampling temperature (default: not passed)
- `--top-k`: Top K sampling (default: not passed)
- `--top-p`: Top P sampling (default: not passed)
- `--min-p`: Min P sampling (default: not passed)
- `--threads`: Number of threads for parallel requests (default: 32)
- `--verbose`: Show detailed output for each case
- `--output`: Output file for eval state (default: llama-eval-state.json)
- `--grader-type`: Grader type (regex, cli, llm; default: llm)
- `--grader-script`: Path to the CLI grader script (required for --grader-type cli)
- `--seed`: Random seed for shuffling (default: 1234)

## Datasets

### AIME
- 90 questions from past AIME competitions
- Answers in boxed format: `\boxed{answer}`
- Requires the regex grader or LLM grader

### AIME2025
- 30 questions from the 2025 AIME I & II competitions
- Answers in boxed format: `\boxed{answer}`
- Supports regex, CLI, or LLM grader

### GSM8K
- 7473 math word problems
- Answers are numeric values
- Requires the regex grader or LLM grader

### GPQA
- 198 questions from the GPQA Diamond dataset
- Multiple choice with shuffled options
- Requires the LLM grader (returns letter A, B, C, or D)

## Grading Types

### Regex Grader
Built-in patterns for different datasets (a sketch of this extraction follows the list):
- AIME: `\boxed{(\d+)}|\b(\d+)\b`
- AIME2025: `\boxed{(\d+)}|\b(\d+)\b`
- GSM8K: `\b(\d+)\b`
- GPQA: Letter extraction (A, B, C, D)
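A minimal sketch of the documented behaviour, preferring `\boxed{...}` and falling back to the last bare number; the function name is illustrative, and the real patterns live in `llama-eval.py`:

```python
import re

def extract_aime_answer(text: str):
    # Prefer an explicit \boxed{...} answer ...
    m = re.search(r"\\boxed\{(\d+)\}", text)
    if m:
        return m.group(1)
    # ... otherwise fall back to the last bare integer in the response.
    nums = re.findall(r"\b\d+\b", text)
    return nums[-1] if nums else None

print(extract_aime_answer(r"so the answer is \boxed{116}"))  # -> 116
```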
### CLI Grader
External script interface:
```bash
./grader.sh --answer <pred> --expected <gold>
```
Returns exit code 0 if correct, non-zero if incorrect.
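From the evaluator's side, invoking such a script can be sketched like this (hypothetical helper; only the `--answer`/`--expected` contract above is from the source):

```python
import subprocess

def grade_cli(grader_script: str, predicted: str, expected: str) -> bool:
    # Exit code 0 means the CLI grader judged the answer correct.
    proc = subprocess.run([grader_script, "--answer", predicted, "--expected", expected])
    return proc.returncode == 0
```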
### LLM Grader
Uses an LLM to extract and compare answers:
- Configurable server and model
- Includes few-shot examples from sample answers
- Case-insensitive comparison
- Required for the GPQA dataset
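A rough sketch of such a judge call against the OpenAI-compatible endpoint; the actual prompt, few-shot examples, and parsing live in `llama-eval.py`, so everything below is an illustrative assumption:

```python
import requests

def grade_llm(judge_server: str, judge_model: str, predicted: str, expected: str) -> bool:
    prompt = (
        "Extract the final answer from the response below. Reply with the answer only.\n\n"
        f"Response:\n{predicted}"
    )
    r = requests.post(
        f"{judge_server}/v1/chat/completions",
        json={"model": judge_model, "messages": [{"role": "user", "content": prompt}]},
        timeout=300,
    )
    extracted = r.json()["choices"][0]["message"]["content"].strip()
    return extracted.lower() == expected.strip().lower()  # case-insensitive comparison
```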
## Output

### Progress Table
```
Task ID       Dataset  Prompt (first 43 chars)                      Expected  Status
aime_000_001  AIME     Complete the following reactions and sel...  A         pending
```

### Results
```
============================================================
Results: 8/10 correct (80.0%)
============================================================
```

### JSON Output
Complete eval state saved to the output file with:
- Task IDs and correctness status
- Prompts and extracted answers
- Sampling configuration
- Processing metadata

examples/llama-eval/llama-eval.py (new executable file, 1229 lines)
File diff suppressed because it is too large.
examples/llama-eval/llama-server-simulator-README.md (new file, 36 lines)
@@ -0,0 +1,36 @@
# llama-server-simulator

Standalone Python script simulating the llama-server HTTP endpoint for testing.

## Features

- HTTP server with an OpenAI-compatible `/v1/chat/completions` endpoint
- AIME Dataset Integration - loads 90 questions from HuggingFace
- Intelligent Question Matching - uses exact matching, LaTeX removal, and Dice bigram similarity
- Configurable Success Rate - control correct/wrong answer generation (0-1)
- Debug Logging - troubleshoot matching issues

## Usage

```bash
python llama-server-simulator.py --success-rate 0.8
```

## Arguments

- `--success-rate`: Probability of returning the correct answer (0.0-1.0, default: 0.8)
- `--port`: Server port (default: 8033)
- `--host`: Server host (default: localhost)
- `--dataset-split`: AIME dataset split to use (default: train)

## Testing

```bash
./test-simulator.sh
```

## Implementation Details

- Uses a Dice bigram similarity score for partial matching (threshold: 0.3)
- Automatic caching via the HuggingFace datasets library
- Wrong answers are generated by incrementing the expected answer
- Debug output is written to stderr
examples/llama-eval/llama-server-simulator.py (new executable file, 283 lines)
@@ -0,0 +1,283 @@
#!/usr/bin/env python3

import argparse
import json
import random
import re
import time
import sys
import os
from collections import Counter
from typing import Dict, List, Optional
from dataclasses import dataclass, asdict
from pathlib import Path

import datasets
from flask import Flask, request, jsonify

# Set cache directory for HuggingFace datasets
cache_dir = Path.home() / ".cache" / "huggingface" / "datasets"
cache_dir.mkdir(parents=True, exist_ok=True)
os.environ["HF_DATASETS_CACHE"] = str(cache_dir)


def dice(s1: str, s2: str) -> float:
    """Calculate the Dice coefficient between two strings based on bigram overlap."""
    if not s1 and not s2:
        return 1.0

    def _bigrams(s: str):
        return [s[i : i + 2] for i in range(len(s) - 1)]

    bigrams1 = _bigrams(s1)
    bigrams2 = _bigrams(s2)

    if not bigrams1 and not bigrams2:
        return 1.0

    freq1 = Counter(bigrams1)
    freq2 = Counter(bigrams2)

    intersection = sum(min(freq1[bg], freq2[bg]) for bg in freq1)
    dice_coeff = 2 * intersection / (len(bigrams1) + len(bigrams2))
    return dice_coeff


def debug_log(message: str):
    """Log debug messages to both stderr and a file."""
    print(message, file=sys.stderr)
    with open("/tmp/simulator-debug.log", "a") as f:
        f.write(message + "\n")


app = Flask(__name__)


@dataclass
class EvalState:
    id: str
    tasks: List[str]
    task_states: Dict[str, Dict]
    sampling_config: Dict
def normalize_number(s: str) -> Optional[int]:
    match = re.match(r"\d+", s)  # match digits from the start
    if not match:
        return None
    return int(match.group(0))


class AimeDataset:
    def __init__(self, split: str = "train"):
        self.split = split
        self.questions: List[Dict] = []
        self._load_dataset()

    def _load_dataset(self):
        print(f"Loading AIME dataset (split: {self.split})...")

        cache_path = Path.home() / ".cache" / "huggingface" / "datasets" / "AI-MO___aimo-validation-aime" / "default" / "0.0.0"
        if cache_path.exists():
            print(f"Using cached dataset from {cache_path}")
            ds = datasets.load_dataset("AI-MO/aimo-validation-aime", split=self.split, cache_dir=str(cache_path))
        else:
            ds = datasets.load_dataset("AI-MO/aimo-validation-aime", split=self.split)

        self.questions = list(ds)
        print(f"AIME dataset loaded: {len(self.questions)} questions")

    def find_question(self, request_text: str) -> Optional[Dict]:
        best_match = None
        best_distance = -1
        best_index = -1

        for i, question in enumerate(self.questions):
            question_text = question["problem"]
            request_lower = request_text.lower()
            question_lower = question_text.lower()

            # Exact match
            if question_lower == request_lower:
                debug_log(f"DEBUG: Found exact match at index {i}")
                return question

            # Remove LaTeX formatting for more flexible matching
            question_no_latex = re.sub(r'\$[^$]+\$', '', question_text)
            if question_no_latex.lower() == request_lower:
                debug_log(f"DEBUG: Found match (no LaTeX) at index {i}")
                return question

            # Compute the Dice bigram similarity for partial matches (higher = more similar).
            # Only consider if the request is at least 50% of the question length.
            if len(request_lower) >= len(question_lower) * 0.5:
                distance = dice(question_lower, request_lower)

                if distance > best_distance:
                    best_distance = distance
                    best_match = question
                    best_index = i

        if best_match and best_distance > 0.3:  # similarity threshold for a partial match
            debug_log(f"DEBUG: Found best partial match at index {best_index} with similarity {best_distance:.3f}")
            return best_match

        debug_log(f"DEBUG: No matching question found for: {request_text[:100]}...")
        return None

    def get_answer(self, question: Dict) -> str:
        answer = question["answer"]
        if isinstance(answer, str):
            normalized = normalize_number(answer)
            return str(normalized) if normalized is not None else answer
        return str(answer)
class Simulator:
    def __init__(
        self,
        port: int = 8033,
        host: str = "localhost",
        success_rate: float = 0.8,
        dataset_split: str = "train"
    ):
        self.port = port
        self.host = host
        self.success_rate = success_rate
        self.dataset = AimeDataset(dataset_split)
        self.eval_state = EvalState(
            id="aime-2025",
            tasks=["aime"],
            task_states={},
            sampling_config={"temperature": 0, "max_tokens": 2048}
        )

    def _generate_response(
        self,
        question: Dict,
        should_be_correct: bool
    ) -> Dict:
        expected_answer = self.dataset.get_answer(question)

        if should_be_correct:
            response_text = expected_answer
        else:
            response_text = self._generate_wrong_answer(question)

        return {
            "id": f"chatcmpl-{int(time.time())}",
            "object": "chat.completion",
            "created": int(time.time()),
            "model": "llama",
            "choices": [
                {
                    "index": 0,
                    "message": {
                        "role": "assistant",
                        "content": response_text
                    },
                    "finish_reason": "stop"
                }
            ],
            "usage": {
                "prompt_tokens": 100,
                "completion_tokens": 50,
                "total_tokens": 150
            }
        }

    def _generate_wrong_answer(self, question: Dict) -> str:
        expected_answer = self.dataset.get_answer(question)

        if expected_answer.isdigit():
            wrong_answer = str(int(expected_answer) + 1)
        else:
            wrong_answer = expected_answer + " (wrong)"

        return wrong_answer

    def _process_request(self, request_data: Dict) -> Dict:
        messages = request_data.get("messages", [])
        if not messages:
            return {"error": "No messages in request"}

        request_text = messages[0].get("content", "")
        debug_log(f"DEBUG: Received request with content: {request_text[:150]}...")

        question = self.dataset.find_question(request_text)
        if not question:
            debug_log("DEBUG: find_question returned None")
            return {"error": "No matching question found"}

        should_be_correct = random.random() < self.success_rate

        response = self._generate_response(question, should_be_correct)

        task_id = "aime"
        self.eval_state.task_states[task_id] = {
            "correct": should_be_correct,
            "expected": self.dataset.get_answer(question),
            "predicted": response["choices"][0]["message"]["content"]
        }

        return response


@app.route('/v1/chat/completions', methods=['POST'])
def chat_completions():
    try:
        request_data = request.get_json()

        if not request_data:
            return jsonify({"error": "Invalid JSON"}), 400

        response = simulator._process_request(request_data)

        return jsonify(response)

    except Exception as e:
        print(f"Error processing request: {e}")
        return jsonify({"error": str(e)}), 500


def main():
    parser = argparse.ArgumentParser(
        description="llama-server simulator for testing eval scripts"
    )
    parser.add_argument(
        "--port",
        type=int,
        default=8033,
        help="Server port (default: 8033)"
    )
    parser.add_argument(
        "--host",
        type=str,
        default="localhost",
        help="Server host (default: localhost)"
    )
    parser.add_argument(
        "--success-rate",
        type=float,
        default=0.8,
        help="Success rate 0-1 (default: 0.8)"
    )
    parser.add_argument(
        "--dataset-split",
        type=str,
        default="train",
        help="AIME dataset split to use (default: train)"
    )

    args = parser.parse_args()

    global simulator
    simulator = Simulator(
        port=args.port,
        host=args.host,
        success_rate=args.success_rate,
        dataset_split=args.dataset_split
    )

    print("\n=== llama-server-simulator ===")
    print(f"Server running on http://{args.host}:{args.port}")
    print(f"Success rate: {args.success_rate}")
    print(f"AIME dataset loaded: {len(simulator.dataset.questions)} questions")
    print("\nPress Ctrl+C to stop\n")

    app.run(host=args.host, port=args.port, debug=False)


if __name__ == "__main__":
    main()
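For intuition about the matcher above, the Dice score counts shared bigrams; this tiny check (illustrative only, not part of the shipped files) uses the classic "night"/"nacht" pair, which shares only the bigram "ht":

```python
# dice(a, b) = 2 * |shared bigrams| / (|bigrams(a)| + |bigrams(b)|)
assert dice("night", "night") == 1.0
assert dice("night", "nacht") == 0.25  # only "ht" is shared: 2*1/(4+4)
assert dice("abc", "xyz") == 0.0
```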
examples/llama-eval/test-simulator.sh (new executable file, 86 lines)
@@ -0,0 +1,86 @@
#!/bin/bash

set -e

# Get the directory where this script is located
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

echo "=== llama-server-simulator Test Script ==="
echo ""

PORT=8033
SUCCESS_RATE=0.8
TEST_PORT=8034

echo "Starting simulator on port $PORT with success rate $SUCCESS_RATE..."
source "$SCRIPT_DIR/venv/bin/activate"
python3 "$SCRIPT_DIR/llama-server-simulator.py" --port $PORT --success-rate $SUCCESS_RATE > /tmp/simulator-test.log 2>&1 &
SIMULATOR_PID=$!

echo "Waiting for simulator to start..."
sleep 5

# Helper function to make a request and extract the answer
make_request() {
    local question="$1"
    curl -s -X POST http://localhost:$PORT/v1/chat/completions \
        -H "Content-Type: application/json" \
        -d "{
            \"model\": \"llama\",
            \"messages\": [
                {\"role\": \"user\", \"content\": \"$question\"}
            ],
            \"temperature\": 0,
            \"max_tokens\": 2048
        }" | python3 -c "import sys, json; data = json.load(sys.stdin); print(data.get('choices', [{}])[0].get('message', {}).get('content', data.get('error', 'No response')))"
}

# Test question (repeated in multiple tests)
TEST_QUESTION="Quadratic polynomials P(x) and Q(x) have leading coefficients 2 and -2, respectively. The graphs of both polynomials pass through the two points (16,54) and (20,53). Find P(0) + Q(0)."

echo ""
echo "=== Test 1: Correct Answer ==="
echo "Sending request with known question..."
answer=$(make_request "$TEST_QUESTION")
echo "Answer: $answer"
echo "Expected: 116"
echo "Correct: $([ "$answer" == "116" ] && echo "Yes" || echo "No")"

echo ""
echo "=== Test 2: Wrong Answer ==="
echo "Sending the same question again (simulator success rate is $SUCCESS_RATE, so a wrong answer is possible)..."
answer=$(make_request "$TEST_QUESTION")
echo "Answer: $answer"
echo "Expected: 116"
echo "Correct: $([ "$answer" == "116" ] && echo "Yes" || echo "No")"

echo ""
echo "=== Test 3: No Matching Question ==="
echo "Sending request with non-matching text..."
response=$(make_request "What is the capital of France?")
echo "Response: $response"
echo "Expected: No matching question found"
echo "Correct: $([ "$response" == "No matching question found" ] && echo "Yes" || echo "No")"

echo ""
echo "=== Test 4: Success Rate Verification ==="
echo "Sending 10 requests to test success rate..."
correct_count=0
for i in {1..10}; do
    answer=$(make_request "$TEST_QUESTION")
    if [ "$answer" == "116" ]; then
        correct_count=$((correct_count + 1))
    fi
    echo "  Request $i: Answer = $answer"
done
echo "Correct answers: $correct_count/10"
echo "Expected: ~8/10 (80% success rate)"
echo "Success rate: $(echo "scale=1; $correct_count * 10" | bc)%"

echo ""
echo "=== Test Complete ==="
echo "Stopping simulator..."
kill $SIMULATOR_PID 2>/dev/null
wait $SIMULATOR_PID 2>/dev/null || true

echo "Simulator stopped."
@@ -576,13 +576,13 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
             DOWNLOAD_EXTRACT_TIMESTAMP NEW
             URL_HASH MD5=${KLEIDIAI_ARCHIVE_MD5})
 
-        FetchContent_Populate(KleidiAI_Download)
         FetchContent_GetProperties(KleidiAI_Download
             SOURCE_DIR  KLEIDIAI_SRC
             POPULATED   KLEIDIAI_POPULATED)
 
         if (NOT KLEIDIAI_POPULATED)
-            message(FATAL_ERROR "KleidiAI source downloaded failed.")
+            FetchContent_Populate(KleidiAI_Download)
+            FetchContent_GetProperties(KleidiAI_Download SOURCE_DIR KLEIDIAI_SRC)
         endif()
 
         add_compile_definitions(GGML_USE_CPU_KLEIDIAI)
@@ -2715,14 +2715,14 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
 
 #pragma unroll
     for (int l = 0; l < QR2_XXS; ++l) {
-        const int * grid_pos = (const int *) (iq2xxs_grid + aux8[l]);
-        const int signs_packed = ksigns_iq2xs[(aux32 >> (7*l)) & 0x7F];
+        const uint2 grid_pos = ((const uint2*)iq2xxs_grid)[aux8[l]];
+        const uint32_t signs = unpack_ksigns(aux32 >> (7 * l));
 
-        const int signs0 = __vcmpne4(((signs_packed & 0x03) << 7) | ((signs_packed & 0x0C) << 21), 0x00000000);
-        const int grid0 = __vsub4(grid_pos[0] ^ signs0, signs0);
+        const int signs0 = __vcmpne4(signs & 0x08040201, 0);
+        const int grid0 = __vsub4(grid_pos.x ^ signs0, signs0);
 
-        const int signs1 = __vcmpne4(((signs_packed & 0x30) << 3) | ((signs_packed & 0xC0) << 17), 0x00000000);
-        const int grid1 = __vsub4(grid_pos[1] ^ signs1, signs1);
+        const int signs1 = __vcmpne4(signs & 0x80402010, 0);
+        const int grid1 = __vsub4(grid_pos.y ^ signs1, signs1);
 
 #if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
         x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + 8*kqsx + (2*l + 0)] = grid0;
@@ -2733,12 +2733,12 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
 #endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
     }
 
-    const int ls = aux32 >> 28;
+    const int ls = aux32 >> 27 | 1; // (scale * 2 + 1)
     const float d = bxi->d;
 #if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
-    x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + kqsx] = (ls*d + d/2)/4;
+    x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + kqsx] = d * ls / 8; // (d * scale + d / 2) / 4
 #else
-    x_df[i*(MMQ_TILE_NE_K/4) + i/4 + kqsx] = (ls*d + d/2)/4;
+    x_df[i*(MMQ_TILE_NE_K/4) + i/4 + kqsx] = d * ls / 8; // (d * scale + d / 2) / 4
 #endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
     }
 }
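The new scale computation is an algebraic rewrite rather than a behavioural change. With the old 4-bit scale s = aux32 >> 28, the new ls = (aux32 >> 27) | 1 equals 2s + 1 (the OR forces the low bit regardless of the discarded bit 27), so the identity the inline comments refer to is exact in the float x_df case and holds up to integer rounding in the sumi variant further below:

```latex
\mathrm{ls} = (\mathrm{aux32} \gg 27) \,|\, 1 = 2s + 1,
\qquad
\frac{d \cdot \mathrm{ls}}{8} = \frac{d\,(2s+1)}{8} = \frac{d\,s + d/2}{4}.
```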
@@ -2776,11 +2776,14 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
 
 #pragma unroll
     for (int l = 0; l < QR2_XS; ++l) {
-        const uint32_t * grid_pos = (const uint32_t *)(iq2xs_grid + (q2[l] & 0x000001FF));
-        const uint32_t * signs = (const uint32_t *)(ksigns64 + (q2[l] >> 9));
+        const uint2 grid_pos = ((const uint2*)iq2xs_grid)[q2[l] & 0x1FF];
+        const uint32_t signs = unpack_ksigns(q2[l] >> 9);
 
-        const int grid_l = __vsub4(grid_pos[0] ^ signs[0], signs[0]);
-        const int grid_h = __vsub4(grid_pos[1] ^ signs[1], signs[1]);
+        const int signs0 = __vcmpne4(signs & 0x08040201, 0);
+        const int grid_l = __vsub4(grid_pos.x ^ signs0, signs0);
+
+        const int signs1 = __vcmpne4(signs & 0x80402010, 0);
+        const int grid_h = __vsub4(grid_pos.y ^ signs1, signs1);
 
 #if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
         x_qs[i*MMQ_MMA_TILE_X_K_Q3_K + 8*kqsx + (2*l + 0)] = grid_l;
@@ -2904,11 +2907,13 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
 #pragma unroll
         for (int l = 0; l < QR3_XXS; ++l) {
             const int2 grid_pos = make_int2(iq3xxs_grid[q3[2*l+0]], iq3xxs_grid[q3[2*l+1]]);
+            const uint32_t signs = unpack_ksigns(aux32 >> (7*l));
 
-            const int * signs = (const int *)(ksigns64 + ((aux32 >> (7*l)) & 0x7F));
+            const int signs0 = __vcmpne4(signs & 0x08040201, 0);
+            const int grid_l = __vsub4(grid_pos.x ^ signs0, signs0);
 
-            const int grid_l = __vsub4(grid_pos.x ^ signs[0], signs[0]);
-            const int grid_h = __vsub4(grid_pos.y ^ signs[1], signs[1]);
+            const int signs1 = __vcmpne4(signs & 0x80402010, 0);
+            const int grid_h = __vsub4(grid_pos.y ^ signs1, signs1);
 
 #if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
             x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + 8*kqsx + (2*l + 0)] = grid_l;
@@ -94,6 +94,15 @@ static __device__ __forceinline__ int2 get_int_from_table_16(const int & q4, con
 #endif
 }
 
+static __device__ __forceinline__ uint32_t unpack_ksigns(const uint8_t v) {
+    // v is a 7 bit int, with the 8th sign being encodable as popcnt
+    // with xor we can "correct" the bit instead of having to mask
+    const uint32_t p = __popc(v) & 1;
+    const uint32_t s = v ^ p << 7;
+    // broadcast over uint to allow for 0x08040201 / 0x80402010 as selectors
+    return s * 0x01010101;
+}
+
 // VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called
 // MMVQ = mul_mat_vec_q, MMQ = mul_mat_q
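The parity trick is easy to sanity-check outside CUDA. A small Python sketch (my own illustration, mirroring the function above): the 8th sign is chosen so that the full 8-bit mask has even popcount, which is exactly what XOR-ing in the parity of the 7 explicit bits reconstructs.

```python
def unpack_ksigns(v: int) -> int:
    assert 0 <= v < 128              # 7-bit packed sign field
    p = bin(v).count("1") & 1        # parity of the 7 explicit sign bits
    s = v ^ (p << 7)                 # "correct" bit 7 instead of masking
    return s * 0x01010101            # broadcast across 4 bytes, as in the CUDA code

for v in range(128):
    s = unpack_ksigns(v) & 0xFF
    assert bin(s).count("1") % 2 == 0  # total sign popcount is always even
```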
@@ -905,22 +914,22 @@ static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1(
     int sumi = 0;
 #pragma unroll
     for (int k0 = 0; k0 < 8; k0 += 2) {
-        const int * grid_pos = (const int *) (iq2xxs_grid + aux8[k0/2]);
-        const int signs_packed = ksigns_iq2xs[(aux32 >> (7*k0/2)) & 0x7F];
+        const uint2 grid_pos = ((const uint2*)iq2xxs_grid)[aux8[k0/2]];
+        const uint32_t signs = unpack_ksigns(aux32 >> (7 * k0 / 2));
 
-        const int signs0 = __vcmpne4(((signs_packed & 0x03) << 7) | ((signs_packed & 0x0C) << 21), 0x00000000);
-        const int grid0 = __vsub4(grid_pos[0] ^ signs0, signs0);
+        const int signs0 = __vcmpne4(signs & 0x08040201, 0);
+        const int grid0 = __vsub4(grid_pos.x ^ signs0, signs0);
         const int u0 = get_int_b4(bq8_1[iqs/2].qs, k0 + 0);
         sumi = ggml_cuda_dp4a(grid0, u0, sumi);
 
-        const int signs1 = __vcmpne4(((signs_packed & 0x30) << 3) | ((signs_packed & 0xC0) << 17), 0x00000000);
-        const int grid1 = __vsub4(grid_pos[1] ^ signs1, signs1);
+        const int signs1 = __vcmpne4(signs & 0x80402010, 0);
+        const int grid1 = __vsub4(grid_pos.y ^ signs1, signs1);
         const int u1 = get_int_b4(bq8_1[iqs/2].qs, k0 + 1);
         sumi = ggml_cuda_dp4a(grid1, u1, sumi);
     }
 
-    const int ls = aux32 >> 28;
-    sumi = (ls*sumi + sumi/2)/4;
+    const int ls = aux32 >> 27 | 1; // (scale * 2 + 1)
+    sumi = sumi * ls / 8; // (sumi * scale + sumi / 2) / 4
     const float d = __half2float(bq2->d) * __low2float(bq8_1[iqs/2].ds);
     return d * sumi;
 }
@@ -942,13 +951,15 @@ static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
     int sumi1 = 0;
 #pragma unroll
     for (int l0 = 0; l0 < 8; l0 += 2) {
-        const uint32_t * grid_pos = (const uint32_t *)(iq2xs_grid + (q2[l0/2] & 0x000001FF));
-        const uint32_t * signs = (const uint32_t *)(ksigns64 + (q2[l0/2] >> 9));
+        const uint2 grid_pos = ((const uint2*)iq2xs_grid)[q2[l0/2] & 0x1FF];
+        const uint32_t signs = unpack_ksigns(q2[l0/2] >> 9);
 
-        const int grid_l = __vsub4(grid_pos[0] ^ signs[0], signs[0]);
-        const int grid_h = __vsub4(grid_pos[1] ^ signs[1], signs[1]);
+        const int signs0 = __vcmpne4(signs & 0x08040201, 0);
+        const int grid_l = __vsub4(grid_pos.x ^ signs0, signs0);
         const int u0 = get_int_b4(bq8_1[iqs/2].qs, l0 + 0);
+
+        const int signs1 = __vcmpne4(signs & 0x80402010, 0);
+        const int grid_h = __vsub4(grid_pos.y ^ signs1, signs1);
         const int u1 = get_int_b4(bq8_1[iqs/2].qs, l0 + 1);
 
         if (l0 < 4) {
@@ -1028,13 +1039,16 @@ static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1(
 #pragma unroll
     for (int l0 = 0; l0 < 8; l0 += 2) {
         const int2 grid_pos = make_int2(iq3xxs_grid[q3[l0 + 0]], iq3xxs_grid[q3[l0 + 1]]);
+        const uint32_t signs = unpack_ksigns(aux32 >> (7*l0/2));
 
-        const int * signs = (const int *)(ksigns64 + ((aux32 >> (7*l0/2)) & 0x7F));
-
-        const int grid_l = __vsub4(grid_pos.x ^ signs[0], signs[0]);
-        const int grid_h = __vsub4(grid_pos.y ^ signs[1], signs[1]);
+        const int signs0 = __vcmpne4(signs & 0x08040201, 0);
+        const int grid_l = __vsub4(grid_pos.x ^ signs0, signs0);
 
         const int u0 = get_int_b4(bq8_1[iqs/2].qs, l0 + 0);
+
+        const int signs1 = __vcmpne4(signs & 0x80402010, 0);
+        const int grid_h = __vsub4(grid_pos.y ^ signs1, signs1);
 
         const int u1 = get_int_b4(bq8_1[iqs/2].qs, l0 + 1);
 
         sumi = ggml_cuda_dp4a(grid_l, u0, sumi);
@@ -5,7 +5,7 @@ import os
 import sys
 import subprocess
 
-HTTPLIB_VERSION = "f80864ca031932351abef49b74097c67f14719c6"
+HTTPLIB_VERSION = "d4180e923f846b44a3d30acd938438d6e64fc9f6"
 
 vendor = {
     "https://github.com/nlohmann/json/releases/latest/download/json.hpp": "vendor/nlohmann/json.hpp",
@@ -28,10 +28,6 @@ target_link_libraries(${TARGET} PUBLIC common mtmd ${CMAKE_THREAD_LIBS_INIT})
 
 set(TARGET llama-server)
 
-if (NOT LLAMA_HTTPLIB)
-    message(FATAL_ERROR "LLAMA_HTTPLIB is OFF, cannot build llama-server. Hint: to skip building server, set -DLLAMA_BUILD_SERVER=OFF")
-endif()
-
 set(TARGET_SRCS
     server.cpp
     server-http.cpp
vendor/cpp-httplib/httplib.cpp (vendored, 74 lines changed)
@@ -1264,78 +1264,32 @@ int poll_wrapper(struct pollfd *fds, nfds_t nfds, int timeout) {
 #endif
 }
 
-template <bool Read>
-ssize_t select_impl(socket_t sock, time_t sec, time_t usec) {
-#ifdef __APPLE__
-  if (sock >= FD_SETSIZE) { return -1; }
-
-  fd_set fds, *rfds, *wfds;
-  FD_ZERO(&fds);
-  FD_SET(sock, &fds);
-  rfds = (Read ? &fds : nullptr);
-  wfds = (Read ? nullptr : &fds);
-
-  timeval tv;
-  tv.tv_sec = static_cast<long>(sec);
-  tv.tv_usec = static_cast<decltype(tv.tv_usec)>(usec);
-
-  return handle_EINTR([&]() {
-    return select(static_cast<int>(sock + 1), rfds, wfds, nullptr, &tv);
-  });
-#else
+ssize_t select_impl(socket_t sock, short events, time_t sec,
+                    time_t usec) {
   struct pollfd pfd;
   pfd.fd = sock;
-  pfd.events = (Read ? POLLIN : POLLOUT);
+  pfd.events = events;
   pfd.revents = 0;
 
   auto timeout = static_cast<int>(sec * 1000 + usec / 1000);
 
   return handle_EINTR([&]() { return poll_wrapper(&pfd, 1, timeout); });
-#endif
 }
 
 ssize_t select_read(socket_t sock, time_t sec, time_t usec) {
-  return select_impl<true>(sock, sec, usec);
+  return select_impl(sock, POLLIN, sec, usec);
 }
 
 ssize_t select_write(socket_t sock, time_t sec, time_t usec) {
-  return select_impl<false>(sock, sec, usec);
+  return select_impl(sock, POLLOUT, sec, usec);
 }
 
 Error wait_until_socket_is_ready(socket_t sock, time_t sec,
                                  time_t usec) {
-#ifdef __APPLE__
-  if (sock >= FD_SETSIZE) { return Error::Connection; }
-
-  fd_set fdsr, fdsw;
-  FD_ZERO(&fdsr);
-  FD_ZERO(&fdsw);
-  FD_SET(sock, &fdsr);
-  FD_SET(sock, &fdsw);
-
-  timeval tv;
-  tv.tv_sec = static_cast<long>(sec);
-  tv.tv_usec = static_cast<decltype(tv.tv_usec)>(usec);
-
-  auto ret = handle_EINTR([&]() {
-    return select(static_cast<int>(sock + 1), &fdsr, &fdsw, nullptr, &tv);
-  });
-
-  if (ret == 0) { return Error::ConnectionTimeout; }
-
-  if (ret > 0 && (FD_ISSET(sock, &fdsr) || FD_ISSET(sock, &fdsw))) {
-    auto error = 0;
-    socklen_t len = sizeof(error);
-    auto res = getsockopt(sock, SOL_SOCKET, SO_ERROR,
-                          reinterpret_cast<char *>(&error), &len);
-    auto successful = res >= 0 && !error;
-    return successful ? Error::Success : Error::Connection;
-  }
-
-  return Error::Connection;
-#else
   struct pollfd pfd_read;
   pfd_read.fd = sock;
   pfd_read.events = POLLIN | POLLOUT;
   pfd_read.revents = 0;
 
   auto timeout = static_cast<int>(sec * 1000 + usec / 1000);
 
@@ -1354,7 +1308,6 @@ Error wait_until_socket_is_ready(socket_t sock, time_t sec,
   }
 
   return Error::Connection;
-#endif
 }
 
 bool is_socket_alive(socket_t sock) {
@@ -7138,17 +7091,6 @@ Server::process_request(Stream &strm, const std::string &remote_addr,
   res.version = "HTTP/1.1";
   res.headers = default_headers_;
 
-#ifdef __APPLE__
-  // Socket file descriptor exceeded FD_SETSIZE...
-  if (strm.socket() >= FD_SETSIZE) {
-    Headers dummy;
-    detail::read_headers(strm, dummy);
-    res.status = StatusCode::InternalServerError_500;
-    output_error_log(Error::ExceedMaxSocketDescriptorCount, &req);
-    return write_response(strm, close_connection, req, res);
-  }
-#endif
-
   // Request line and headers
   if (!parse_request_line(line_reader.ptr(), req)) {
     res.status = StatusCode::BadRequest_400;
@@ -12063,7 +12005,7 @@ bool get_cert_sans(cert_t cert, std::vector<SanEntry> &sans) {
   if (!names) return true; // No SANs is valid
 
   auto count = sk_GENERAL_NAME_num(names);
-  for (int i = 0; i < count; i++) {
+  for (decltype(count) i = 0; i < count; i++) {
     auto gen = sk_GENERAL_NAME_value(names, i);
     if (!gen) continue;
vendor/cpp-httplib/httplib.h (vendored, 4 lines changed)
@@ -8,8 +8,8 @@
 #ifndef CPPHTTPLIB_HTTPLIB_H
 #define CPPHTTPLIB_HTTPLIB_H
 
-#define CPPHTTPLIB_VERSION "0.31.0"
-#define CPPHTTPLIB_VERSION_NUM "0x001F00"
+#define CPPHTTPLIB_VERSION "0.32.0"
+#define CPPHTTPLIB_VERSION_NUM "0x002000"
 
 /*
  * Platform compatibility check