Compare commits

..

2 Commits

Author SHA1 Message Date
Georgi Gerganov
c64e772d35 pi : add rule to use gh CLI for GitHub resources
Assisted-by: llama.cpp:local pi
2026-04-30 09:50:39 +03:00
Georgi Gerganov
c6dbd31146 docs : update speculative decoding parameters after refactor (#22397)
Update docs/speculative.md to reflect the new parameter naming scheme
introduced in PR #22397:

- Replace --draft-max/--draft-min with --spec-draft-n-max/--spec-draft-n-min
- Replace --spec-ngram-size-n/m with per-implementation variants
- Add documentation for all new --spec-ngram-*- parameters
- Update all example commands

Assisted-by: llama.cpp:local pi
2026-04-30 09:44:48 +03:00
9 changed files with 155 additions and 77 deletions

View File

@@ -31,7 +31,7 @@ jobs:
uses: actions/setup-python@v6
with:
python-version: "3.11"
pip-install: -r requirements/requirements-all.txt ty==0.0.33
pip-install: -r requirements/requirements-all.txt ty==0.0.26
# - name: Type-check with Pyright
# uses: jakebailey/pyright-action@v2
# with:

View File

@@ -4,6 +4,7 @@ General:
- Be very precise and concise when writing code, comments, explanations, etc.
- PR and commit titles format: `<module> : <title>`. Lookup recents for examples
- Don't try to build or run the code unless you are explicitly asked to do so
- Use the `gh` CLI tool when querying PRs, issues, or other GitHub resources
Coding:
- When in doubt, always refer to the CONTRIBUTING.md file of the project

View File

@@ -6658,7 +6658,7 @@ class BertModel(TextModel):
tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
scores: list[float] = [-10000.0] * vocab_size
toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size # ty: ignore[invalid-assignment]
if isinstance(tokenizer, SentencePieceProcessor):
for token_id in range(tokenizer.vocab_size()):

View File

@@ -33,18 +33,18 @@ An example to use this approach can be the rewriting of source code by a LLM.
This implementation looks for the last n-gram in history that matches the current n-gram and creates a draft using the m tokens following the matched n-gram. It is the simplest self-speculative approach with minimal overhead.
```
llama-server [...] --spec-type ngram-simple --draft-max 64
llama-server [...] --spec-type ngram-simple --spec-draft-n-max 64
```
#### n-gram Map Key (`ngram-map-k`)
This implementation looks for the current n-gram of size n (called the _key_) in the token history. If the key n-gram is followed by the same m tokens (called the _mgram_) multiple times, it creates a draft using these m tokens. This approach requires a minimum number of occurrences (argument `--spec-ngram-min-hits`, default is 1) before generating drafts.
This implementation looks for the current n-gram of size n (called the _key_) in the token history. If the key n-gram is followed by the same m tokens (called the _mgram_) multiple times, it creates a draft using these m tokens. This approach requires a minimum number of occurrences (argument `--spec-ngram-map-k-min-hits`, default is 1) before generating drafts.
The number of accepted tokens is stored for each used n-gram.
**Example:**
```
llama-server [...] --spec-type ngram-map-k --draft-max 64
llama-server [...] --spec-type ngram-map-k --spec-draft-n-max 64
```
#### n-gram Map Key-4-Values (`ngram-map-k4v`)
@@ -55,7 +55,7 @@ The number of accepted tokens is stored for each used n-gram.
**Example:** Server options to be used if there are a lot of longer repetitions.
```
llama-server [...] --spec-type ngram-map-k4v --spec-ngram-size-n 8 --spec-ngram-size-m 8 --spec-ngram-min-hits 2 --draft-max 64
llama-server [...] --spec-type ngram-map-k4v --spec-ngram-map-k4v-size-n 8 --spec-ngram-map-k4v-size-m 8 --spec-ngram-map-k4v-min-hits 2 --spec-draft-n-max 64
```
### n-gram Mod (`ngram-mod`)
@@ -80,9 +80,9 @@ Currently, a single hash pool is shared across all server slots, so different re
# notes:
# - small `n` are not recommended
# - MoEs require long drafts
# - dense models: can reduce `--draft-min` and `--draft-max`
# - dense models: can reduce `--spec-ngram-mod-n-min` and `--spec-ngram-mod-n-max`
llama-server ... --spec-type ngram-mod --spec-ngram-size-n 24 --draft-min 48 --draft-max 64
llama-server ... --spec-type ngram-mod --spec-ngram-mod-n-match 24 --spec-ngram-mod-n-min 48 --spec-ngram-mod-n-max 64
```
Applications:
@@ -105,21 +105,90 @@ Example Video:
If a draft model is combined with a draftless decoding, the draftless decoding takes precedence.
### General Speculative Parameters
```
--draft, --draft-n, --draft-max N number of tokens to draft for speculative decoding (default: 16)
(env: LLAMA_ARG_DRAFT_MAX)
--draft-min, --draft-n-min N minimum number of draft tokens to use for speculative decoding
(default: 0)
(env: LLAMA_ARG_DRAFT_MIN)
[...]
--spec-type [none|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod]
type of speculative decoding to use when no draft model is provided
(default: none)
--spec-ngram-size-n N ngram size N for ngram-simple/ngram-map speculative decoding, length
of lookup n-gram (default: 12)
--spec-ngram-size-m N ngram size M for ngram-simple/ngram-map speculative decoding, length
of draft m-gram (default: 48)
--spec-ngram-min-hits N minimum hits for ngram-map speculative decoding (default: 1)
(env: LLAMA_ARG_SPEC_TYPE)
--spec-default use default speculative decoding
```
### Draft Model Parameters
```
--spec-draft-model, -md, --model-draft FNAME
draft model for speculative decoding (default: unused)
(env: LLAMA_ARG_SPEC_DRAFT_MODEL)
--spec-draft-hf, -hfd, -hfrd, --hf-repo-draft <user>/<model>[:quant]
HuggingFace repository for the draft model
--spec-draft-n-max N
number of tokens to draft for speculative decoding (default: 16)
(env: LLAMA_ARG_SPEC_DRAFT_N_MAX)
--spec-draft-n-min N
minimum number of draft tokens to use for speculative decoding (default: 0)
(env: LLAMA_ARG_SPEC_DRAFT_N_MIN)
--spec-draft-p-split, --draft-p-split P
speculative decoding split probability (default: 0.10)
(env: LLAMA_ARG_SPEC_DRAFT_P_SPLIT)
--spec-draft-p-min, --draft-p-min P
minimum speculative decoding probability (greedy) (default: 0.75)
(env: LLAMA_ARG_SPEC_DRAFT_P_MIN)
--spec-draft-ctx-size, -cd, --ctx-size-draft N
size of the prompt context for the draft model (default: 0, 0 = loaded from model)
(env: LLAMA_ARG_SPEC_DRAFT_CTX_SIZE)
--spec-draft-ngl, -ngld, --gpu-layers-draft, --n-gpu-layers-draft N
max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto)
(env: LLAMA_ARG_N_GPU_LAYERS_DRAFT)
--spec-draft-device, -devd, --device-draft <dev1,dev2,..>
comma-separated list of devices to use for offloading the draft model
--spec-draft-replace, --spec-replace TARGET DRAFT
translate the string in TARGET into DRAFT if the draft model and main model are not compatible
```
### n-gram Mod Parameters
```
--spec-ngram-mod-n-match N
ngram-mod lookup length (default: 24)
--spec-ngram-mod-n-min N
minimum number of ngram tokens to use for ngram-based speculative decoding (default: 48)
--spec-ngram-mod-n-max N
maximum number of ngram tokens to use for ngram-based speculative decoding (default: 64)
```
### n-gram Simple Parameters
```
--spec-ngram-simple-size-n N
ngram size N for ngram-simple speculative decoding, length of lookup n-gram (default: 12)
--spec-ngram-simple-size-m N
ngram size M for ngram-simple speculative decoding, length of draft m-gram (default: 48)
--spec-ngram-simple-min-hits N
minimum hits for ngram-simple speculative decoding (default: 1)
```
### n-gram Map Key Parameters
```
--spec-ngram-map-k-size-n N
ngram size N for ngram-map-k speculative decoding, length of lookup n-gram (default: 12)
--spec-ngram-map-k-size-m N
ngram size M for ngram-map-k speculative decoding, length of draft m-gram (default: 48)
--spec-ngram-map-k-min-hits N
minimum hits for ngram-map-k speculative decoding (default: 1)
```
### n-gram Map Key-4-Values Parameters
```
--spec-ngram-map-k4v-size-n N
ngram size N for ngram-map-k4v speculative decoding, length of lookup n-gram (default: 12)
--spec-ngram-map-k4v-size-m N
ngram size M for ngram-map-k4v speculative decoding, length of draft m-gram (default: 48)
--spec-ngram-map-k4v-min-hits N
minimum hits for ngram-map-k4v speculative decoding (default: 1)
```
### `--spec-type TYPE`
@@ -140,21 +209,40 @@ Specifies a type of speculative decoding without draft model.
./llama-server [...] --spec-type ngram-simple
```
### `--spec-ngram-size-n N`
### `--spec-ngram-*-size-n N`
Sets the size N of the lookup n-gram for n-gram map based speculative decoding.
The n-gram size N determines how many tokens in a row to look back when searching for matching patterns.
### `--spec-ngram-size-m M`
Each n-gram implementation has its own parameter:
- `--spec-ngram-simple-size-n` for `ngram-simple`
- `--spec-ngram-map-k-size-n` for `ngram-map-k`
- `--spec-ngram-map-k4v-size-n` for `ngram-map-k4v`
- `--spec-ngram-mod-n-match` for `ngram-mod`
### `--spec-ngram-*-size-m M`
Sets the size M of the draft m-gram for n-gram map based speculative decoding.
The m-gram size determines how many tokens to draft when a match is found.
Larger values can provide more speedup but may reduce acceptance rate.
### `--spec-ngram-min-hits H`
Each n-gram implementation has its own parameter:
- `--spec-ngram-simple-size-m` for `ngram-simple`
- `--spec-ngram-map-k-size-m` for `ngram-map-k`
- `--spec-ngram-map-k4v-size-m` for `ngram-map-k4v`
### `--spec-ngram-*-min-hits H`
This option defines how often a key has to appear in the token history to be used as a draft (default is 1).
Each n-gram implementation has its own parameter:
- `--spec-ngram-simple-min-hits` for `ngram-simple`
- `--spec-ngram-map-k-min-hits` for `ngram-map-k`
- `--spec-ngram-map-k4v-min-hits` for `ngram-map-k4v`
## Statistics
Each speculative decoding implementation prints statistics.
@@ -180,4 +268,3 @@ statistics ngram_map_k: #calls(b,g,a) = 6 1690 26, #gen drafts = 26, #acc drafts
- `#gen tokens`: number of tokens generated by this implementation (including rejected tokens)
- `#acc tokens`: number of tokens accepted by the main model
- `dur(b,g,a)`: durations of begin (new prompt), generation and accumulation (process acceptance).

View File

@@ -68,7 +68,7 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_nv
GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 16, 256, 2, 64, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 32, 256, 2, 64, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(320, 256, 16, 256, 2, 64, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(320, 256, 32, 256, 2, 64, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 4, 128, 2, 64, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 8, 256, 2, 64, 64)
@@ -130,7 +130,7 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_nv
GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 16, 256, 2, 32, 128)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 32, 256, 2, 32, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(320, 256, 16, 256, 2, 32, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(320, 256, 32, 256, 2, 32, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 4, 128, 2, 32, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512, 8, 256, 2, 32, 64)
@@ -1124,7 +1124,7 @@ static void launch_fattn_tile_switch_ncols1(ggml_backend_cuda_context & ctx, ggm
constexpr size_t nbytes_shared = 0;
#ifdef GGML_USE_HIP
if constexpr (DKQ <= 128) {
if constexpr (DV <= 128) {
if (Q->ne[1] > 32/ncols2) {
constexpr int cols_per_block = 64;
const int nwarps = ggml_cuda_fattn_tile_get_nthreads (DKQ, DV, cols_per_block, cc) / warp_size;
@@ -1138,7 +1138,7 @@ static void launch_fattn_tile_switch_ncols1(ggml_backend_cuda_context & ctx, ggm
#endif // GGML_USE_HIP
#ifndef GGML_USE_HIP
if constexpr (DKQ <= 256)
if constexpr (DV <= 256)
#endif // GGML_USE_HIP
{
if (Q->ne[1] > 16/ncols2) {
@@ -1220,22 +1220,11 @@ static void launch_fattn_tile_switch_ncols2(ggml_backend_cuda_context & ctx, ggm
const int gqa_limit = nvidia && gqa_ratio <= 4 && DV <= 256 ? 16 : INT_MAX;
const bool use_gqa_opt = mask && max_bias == 0.0f && Q->ne[1] <= gqa_limit && K->ne[1] % FATTN_KQ_STRIDE == 0;
if constexpr (DKQ == 320) {
// This branch is only used for Mistral Small 4 which has a GQA ratio of 32.
// On AMD, simply use that GQA ratio with 32 columns / block since we always have enough SRAM.
// On NVIDIA however, the tile kernel is only used for GPUs that can't use the mma kernel (Pascal and older).
// Therefore, use a GQA ratio of 16 with 16 columns / block to stay below 48 kiB of SRAM / block.
#ifdef GGML_USE_HIP
if constexpr (DKQ == 320) { // Mistral Small 4
if (use_gqa_opt && gqa_ratio % 32 == 0) {
launch_fattn_tile_switch_ncols1<DKQ, DV, 32, use_logit_softcap>(ctx, dst);
return;
}
#else
if (use_gqa_opt && gqa_ratio % 16 == 0) {
launch_fattn_tile_switch_ncols1<DKQ, DV, 16, use_logit_softcap>(ctx, dst);
return;
}
#endif // GGML_USE_HIP
GGML_ABORT("flash-attn tile (320/256): expected GQA ratio multiple of 32");
}

View File

@@ -20,7 +20,6 @@ from PySide6.QtCore import Qt, QRect, QSize
from jinja2 import TemplateSyntaxError
from jinja2.sandbox import ImmutableSandboxedEnvironment
from datetime import datetime
from typing import Callable
def format_template_content(template_content):
@@ -396,7 +395,7 @@ class JinjaTester(QMainWindow):
ensure_ascii=ensure_ascii,
)
)
env.globals["strftime_now"]: Callable[[str], str] = lambda format: datetime.now().strftime(format)
env.globals["strftime_now"] = lambda format: datetime.now().strftime(format) # ty: ignore[invalid-assignment]
env.globals["raise_exception"] = raise_exception # ty: ignore[invalid-assignment]
try:
template = env.from_string(template_str)

View File

@@ -5,7 +5,7 @@ import os
import sys
import subprocess
HTTPLIB_VERSION = "refs/tags/v0.43.2"
HTTPLIB_VERSION = "refs/tags/v0.43.1"
vendor = {
"https://github.com/nlohmann/json/releases/latest/download/json.hpp": "vendor/nlohmann/json.hpp",

View File

@@ -1464,9 +1464,8 @@ bool mmap::open(const char *path) {
auto wpath = u8string_to_wstring(path);
if (wpath.empty()) { return false; }
hFile_ =
::CreateFile2(wpath.c_str(), GENERIC_READ,
FILE_SHARE_READ | FILE_SHARE_WRITE, OPEN_EXISTING, NULL);
hFile_ = ::CreateFile2(wpath.c_str(), GENERIC_READ, FILE_SHARE_READ,
OPEN_EXISTING, NULL);
if (hFile_ == INVALID_HANDLE_VALUE) { return false; }
@@ -2053,50 +2052,56 @@ int getaddrinfo_with_timeout(const char *node, const char *service,
return 0;
#elif defined(_GNU_SOURCE) && defined(__GLIBC__) && \
(__GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2))
// #2431: gai_cancel() is non-blocking and may return EAI_NOTCANCELED while
// the resolver worker still references the stack-local gaicb. The cancel
// path therefore waits (gai_suspend with no timeout) for the worker to
// actually finish before letting the stack frame go. The trade-off is that
// a wedged DNS server can hold this thread for the system resolver timeout
// (~30s by default) past the caller's connection timeout.
struct gaicb request {};
// Linux implementation using getaddrinfo_a for asynchronous DNS resolution
struct gaicb request;
struct gaicb *requests[1] = {&request};
struct sigevent sevp {};
struct timespec timeout {
timeout_sec, 0
};
struct sigevent sevp;
struct timespec timeout;
// Initialize the request structure
memset(&request, 0, sizeof(request));
request.ar_name = node;
request.ar_service = service;
request.ar_request = hints;
// Set up timeout
timeout.tv_sec = timeout_sec;
timeout.tv_nsec = 0;
// Initialize sigevent structure (not used, but required)
memset(&sevp, 0, sizeof(sevp));
sevp.sigev_notify = SIGEV_NONE;
int rc = getaddrinfo_a(GAI_NOWAIT, requests, 1, &sevp);
if (rc != 0) { return rc; }
// Start asynchronous resolution
int start_result = getaddrinfo_a(GAI_NOWAIT, requests, 1, &sevp);
if (start_result != 0) { return start_result; }
auto cleanup = scope_exit([&] {
if (request.ar_result) { freeaddrinfo(request.ar_result); }
});
int wait_result = gai_suspend(requests, 1, &timeout);
// Wait for completion with timeout
int wait_result =
gai_suspend((const struct gaicb *const *)requests, 1, &timeout);
if (wait_result == 0 || wait_result == EAI_ALLDONE) {
// Completed successfully, get the result
int gai_result = gai_error(&request);
if (gai_result == 0) {
*res = request.ar_result;
request.ar_result = nullptr;
return 0;
} else {
// Clean up on error
if (request.ar_result) { freeaddrinfo(request.ar_result); }
return gai_result;
}
return gai_result;
} else if (wait_result == EAI_AGAIN) {
// Timeout occurred, cancel the request
gai_cancel(&request);
return EAI_AGAIN;
} else {
// Other error occurred
gai_cancel(&request);
return wait_result;
}
gai_cancel(&request);
while (gai_error(&request) == EAI_INPROGRESS) {
gai_suspend(requests, 1, nullptr);
}
return wait_result;
#else
// Fallback implementation using thread-based timeout for other Unix systems.
// Fallback implementation using thread-based timeout for other Unix systems
struct GetAddrInfoState {
~GetAddrInfoState() {
@@ -14137,9 +14142,6 @@ ssize_t read(session_t session, void *buf, size_t len, TlsError &err) {
err.code = impl::map_mbedtls_error(ret, err.sys_errno);
err.backend_code = static_cast<uint64_t>(-ret);
impl::mbedtls_last_error() = ret;
// mbedTLS signals a clean close_notify via a negative error code rather
// than 0; surface it as a clean EOF the way OpenSSL/wolfSSL do.
if (err.code == ErrorCode::PeerClosed) { return 0; }
return -1;
}

View File

@@ -8,8 +8,8 @@
#ifndef CPPHTTPLIB_HTTPLIB_H
#define CPPHTTPLIB_HTTPLIB_H
#define CPPHTTPLIB_VERSION "0.43.2"
#define CPPHTTPLIB_VERSION_NUM "0x002b02"
#define CPPHTTPLIB_VERSION "0.43.1"
#define CPPHTTPLIB_VERSION_NUM "0x002b01"
#ifdef _WIN32
#if defined(_WIN32_WINNT) && _WIN32_WINNT < 0x0A00