pi : add rule to use gh CLI for GitHub resources

Assisted-by: llama.cpp:local pi
docs : update speculative decoding parameters after refactor (#22397)
2026-04-30 16:47:31 +03:00 · 2026-04-30 09:50:39 +03:00 · 2026-04-30 09:44:48 +03:00
9 changed files with 155 additions and 77 deletions
--- a/.github/workflows/python-type-check.yml
+++ b/.github/workflows/python-type-check.yml
@@ -31,7 +31,7 @@ jobs:
        uses: actions/setup-python@v6
        with:
          python-version: "3.11"
-          pip-install: -r requirements/requirements-all.txt ty==0.0.33
+          pip-install: -r requirements/requirements-all.txt ty==0.0.26
      # - name: Type-check with Pyright
      #   uses: jakebailey/pyright-action@v2
      #   with:
--- a/.pi/gg/SYSTEM.md
+++ b/.pi/gg/SYSTEM.md
@@ -4,6 +4,7 @@ General:
 - By very precise and concise when writing code, comments, explanations, etc.
 - PR and commit titles format: `<module> : <title>`. Lookup recents for examples
 - Don't try to build or run the code unless you are explicitly asked to do so
+- Use the `gh` CLI tool when querying PRs, issues, or other GitHub resources

 Coding:
 - When in doubt, always refer to the CONTRIBUTING.md file of the project
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -6658,7 +6658,7 @@ class BertModel(TextModel):

        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
        scores: list[float] = [-10000.0] * vocab_size
-        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNUSED] * vocab_size  # ty: ignore[invalid-assignment]

        if isinstance(tokenizer, SentencePieceProcessor):
            for token_id in range(tokenizer.vocab_size()):
--- a/docs/speculative.md
+++ b/docs/speculative.md
@@ -33,18 +33,18 @@ An example to use this approach can be the rewriting of source code by a LLM.
 This implementation looks for the last n-gram in history that matches the current n-gram and creates a draft using the m tokens following the matched n-gram. It is the simplest self-speculative approach with minimal overhead.

 ```
-llama-server [...] --spec-type ngram-simple --draft-max 64
+llama-server [...] --spec-type ngram-simple --spec-draft-n-max 64
 ```

 #### n-gram Map Key (`ngram-map-k`)

-This implementation looks for the current n-gram of size n (called the _key_) in the token history. If the key n-gram is followed by the same m tokens (called the _mgram_) multiple times, it creates a draft using these m tokens. This approach requires a minimum number of occurrences (argument `--spec-ngram-min-hits`, default is 1) before generating drafts.
+This implementation looks for the current n-gram of size n (called the _key_) in the token history. If the key n-gram is followed by the same m tokens (called the _mgram_) multiple times, it creates a draft using these m tokens. This approach requires a minimum number of occurrences (argument `--spec-ngram-map-k-min-hits`, default is 1) before generating drafts.

 The number of accepted tokens is stored for each used n-gram.

 **Example:**
 ```
-llama-server [...] --spec-type ngram-map-k --draft-max 64
+llama-server [...] --spec-type ngram-map-k --spec-draft-n-max 64
 ```

 #### n-gram Map Key-4-Values (`ngram-map-k4v`)
@@ -55,7 +55,7 @@ The number of accepted tokens is stored for each used n-gram.

 **Example:** Server options to be used if there are a lot of longer repetitions.
 ```
-llama-server [...] --spec-type ngram-map-k4v --spec-ngram-size-n 8 --spec-ngram-size-m 8 --spec-ngram-min-hits 2 --draft-max 64
+llama-server [...] --spec-type ngram-map-k4v --spec-ngram-map-k4v-size-n 8 --spec-ngram-map-k4v-size-m 8 --spec-ngram-map-k4v-min-hits 2 --spec-draft-n-max 64
 ```

 ### n-gram Mod (`ngram-mod`)
@@ -80,9 +80,9 @@ Currently, a single hash pool is shared across all server slots, so different re
 # notes:
 # - small `n` are not recommended
 # - MoEs require long drafts
-# - dense models: can reduce `--draft-min` and `--draft-max`
+# - dense models: can reduce `--spec-ngram-mod-n-min` and `--spec-ngram-mod-n-max`

-llama-server ... --spec-type ngram-mod --spec-ngram-size-n 24 --draft-min 48 --draft-max 64
+llama-server ... --spec-type ngram-mod --spec-ngram-mod-n-match 24 --spec-ngram-mod-n-min 48 --spec-ngram-mod-n-max 64
 ```

 Applications:
@@ -105,21 +105,90 @@ Example Video:

 If a draft model is combined with a draftless decoding the draftless decoding has higher precedence.

+### General Speculative Parameters
+
 ```
--draft, --draft-n, --draft-max N       number of tokens to draft for speculative decoding (default: 16)
-                                        (env: LLAMA_ARG_DRAFT_MAX)
--draft-min, --draft-n-min N            minimum number of draft tokens to use for speculative decoding
-                                        (default: 0)
-                                        (env: LLAMA_ARG_DRAFT_MIN)
-[...]
 --spec-type [none|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod]
                                        type of speculative decoding to use when no draft model is provided
                                        (default: none)
--spec-ngram-size-n N                   ngram size N for ngram-simple/ngram-map speculative decoding, length
-                                        of lookup n-gram (default: 12)
--spec-ngram-size-m N                   ngram size M for ngram-simple/ngram-map speculative decoding, length
-                                        of draft m-gram (default: 48)
--spec-ngram-min-hits N                 minimum hits for ngram-map speculative decoding (default: 1)
+                                        (env: LLAMA_ARG_SPEC_TYPE)
+--spec-default                          use default speculative decoding
+```
+
+### Draft Model Parameters
+
+```
+--spec-draft-model, -md, --model-draft  FNAME
+                                        draft model for speculative decoding (default: unused)
+                                        (env: LLAMA_ARG_SPEC_DRAFT_MODEL)
+--spec-draft-hf, -hfd, -hfrd, --hf-repo-draft  <user>/<model>[:quant]
+                                        HuggingFace repository for the draft model
+--spec-draft-n-max                      N
+                                        number of tokens to draft for speculative decoding (default: 16)
+                                        (env: LLAMA_ARG_SPEC_DRAFT_N_MAX)
+--spec-draft-n-min                      N
+                                        minimum number of draft tokens to use for speculative decoding (default: 0)
+                                        (env: LLAMA_ARG_SPEC_DRAFT_N_MIN)
+--spec-draft-p-split, --draft-p-split   P
+                                        speculative decoding split probability (default: 0.10)
+                                        (env: LLAMA_ARG_SPEC_DRAFT_P_SPLIT)
+--spec-draft-p-min, --draft-p-min       P
+                                        minimum speculative decoding probability (greedy) (default: 0.75)
+                                        (env: LLAMA_ARG_SPEC_DRAFT_P_MIN)
+--spec-draft-ctx-size, -cd, --ctx-size-draft  N
+                                        size of the prompt context for the draft model (default: 0, 0 = loaded from model)
+                                        (env: LLAMA_ARG_SPEC_DRAFT_CTX_SIZE)
+--spec-draft-ngl, -ngld, --gpu-layers-draft, --n-gpu-layers-draft  N
+                                        max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: auto)
+                                        (env: LLAMA_ARG_N_GPU_LAYERS_DRAFT)
+--spec-draft-device, -devd, --device-draft  <dev1,dev2,..>
+                                        comma-separated list of devices to use for offloading the draft model
+--spec-draft-replace, --spec-replace    TARGET  DRAFT
+                                        translate the string in TARGET into DRAFT if the draft model and main model are not compatible
+```
+
+### n-gram Mod Parameters
+
+```
+--spec-ngram-mod-n-match                N
+                                        ngram-mod lookup length (default: 24)
+--spec-ngram-mod-n-min                  N
+                                        minimum number of ngram tokens to use for ngram-based speculative decoding (default: 48)
+--spec-ngram-mod-n-max                  N
+                                        maximum number of ngram tokens to use for ngram-based speculative decoding (default: 64)
+```
+
+### n-gram Simple Parameters
+
+```
+--spec-ngram-simple-size-n              N
+                                        ngram size N for ngram-simple speculative decoding, length of lookup n-gram (default: 12)
+--spec-ngram-simple-size-m              N
+                                        ngram size M for ngram-simple speculative decoding, length of draft m-gram (default: 48)
+--spec-ngram-simple-min-hits            N
+                                        minimum hits for ngram-simple speculative decoding (default: 1)
+```
+
+### n-gram Map Key Parameters
+
+```
+--spec-ngram-map-k-size-n               N
+                                        ngram size N for ngram-map-k speculative decoding, length of lookup n-gram (default: 12)
+--spec-ngram-map-k-size-m               N
+                                        ngram size M for ngram-map-k speculative decoding, length of draft m-gram (default: 48)
+--spec-ngram-map-k-min-hits             N
+                                        minimum hits for ngram-map-k speculative decoding (default: 1)
+```
+
+### n-gram Map Key-4-Values Parameters
+
+```
+--spec-ngram-map-k4v-size-n             N
+                                        ngram size N for ngram-map-k4v speculative decoding, length of lookup n-gram (default: 12)
+--spec-ngram-map-k4v-size-m             N
+                                        ngram size M for ngram-map-k4v speculative decoding, length of draft m-gram (default: 48)
+--spec-ngram-map-k4v-min-hits           N
+                                        minimum hits for ngram-map-k4v speculative decoding (default: 1)
 ```

 ### `--spec-type TYPE`
@@ -140,21 +209,40 @@ Specifies a type of speculative decoding without draft model.
 ./llama-server [...] --spec-type ngram-simple
 ```

-### `--spec-ngram-size-n N`
+### `--spec-ngram-*-size-n N`

 Sets the size N of the lookup n-gram for n-gram map based speculative decoding.
 The n-gram size N determines how many tokens in a row to look back when searching for matching patterns.

-### `--spec-ngram-size-m M`
+Each n-gram implementation has its own parameter:
+
+- `--spec-ngram-simple-size-n` for `ngram-simple`
+- `--spec-ngram-map-k-size-n` for `ngram-map-k`
+- `--spec-ngram-map-k4v-size-n` for `ngram-map-k4v`
+- `--spec-ngram-mod-n-match` for `ngram-mod`
+
+### `--spec-ngram-*-size-m M`

 Sets the size M of the draft m-gram for n-gram map based speculative decoding.
 The m-gram size determines how many tokens to draft when a match is found.
 Larger values can provide more speedup but may reduce acceptance rate.

-### `--spec-ngram-min-hits H`
+Each n-gram implementation has its own parameter:
+
+- `--spec-ngram-simple-size-m` for `ngram-simple`
+- `--spec-ngram-map-k-size-m` for `ngram-map-k`
+- `--spec-ngram-map-k4v-size-m` for `ngram-map-k4v`
+
+### `--spec-ngram-*-min-hits H`

 This option defines how often a key has to appear in the token history to be used as a draft (default is 1).

+Each n-gram implementation has its own parameter:
+
+- `--spec-ngram-simple-min-hits` for `ngram-simple`
+- `--spec-ngram-map-k-min-hits` for `ngram-map-k`
+- `--spec-ngram-map-k4v-min-hits` for `ngram-map-k4v`
+
 ## Statistics
 Each speculative decoding implementation prints statistics.

@@ -180,4 +268,3 @@ statistics ngram_map_k: #calls(b,g,a) = 6 1690 26, #gen drafts = 26, #acc drafts
 - `#gen tokens`: number of tokens generated by this implementation (including rejected tokens)
 - `#acc tokens`: number of tokens accepted by the main model
 - `dur(b,g,a): durations of begin (new prompt), generation and accumulation (process acceptance).
-
--- a/ggml/src/ggml-cuda/fattn-tile.cuh
+++ b/ggml/src/ggml-cuda/fattn-tile.cuh
@@ -68,7 +68,7 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_nv
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 16, 256, 2,  64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 32, 256, 2,  64,  64)

-    GGML_CUDA_FATTN_TILE_CONFIG_CASE(320, 256, 16, 256, 2,  64,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(320, 256, 32, 256, 2,  64,  64)

    GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512,  4, 128, 2,  64,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512,  8, 256, 2,  64,  64)
@@ -130,7 +130,7 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_nv
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 16, 256, 2,  32, 128)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(256, 256, 32, 256, 2,  32,  64)

-    GGML_CUDA_FATTN_TILE_CONFIG_CASE(320, 256, 16, 256, 2,  32,  64)
+    GGML_CUDA_FATTN_TILE_CONFIG_CASE(320, 256, 32, 256, 2,  32,  64)

    GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512,  4, 128, 2,  32,  64)
    GGML_CUDA_FATTN_TILE_CONFIG_CASE(512, 512,  8, 256, 2,  32,  64)
@@ -1124,7 +1124,7 @@ static void launch_fattn_tile_switch_ncols1(ggml_backend_cuda_context & ctx, ggm
    constexpr size_t nbytes_shared = 0;

 #ifdef GGML_USE_HIP
-    if constexpr (DKQ <= 128) {
+    if constexpr (DV <= 128) {
        if (Q->ne[1] > 32/ncols2) {
            constexpr int cols_per_block = 64;
            const int nwarps    = ggml_cuda_fattn_tile_get_nthreads (DKQ, DV, cols_per_block, cc) / warp_size;
@@ -1138,7 +1138,7 @@ static void launch_fattn_tile_switch_ncols1(ggml_backend_cuda_context & ctx, ggm
 #endif // GGML_USE_HIP

 #ifndef GGML_USE_HIP
-    if constexpr (DKQ <= 256)
+    if constexpr (DV <= 256)
 #endif // GGML_USE_HIP
    {
        if (Q->ne[1] > 16/ncols2) {
@@ -1220,22 +1220,11 @@ static void launch_fattn_tile_switch_ncols2(ggml_backend_cuda_context & ctx, ggm
    const int gqa_limit = nvidia && gqa_ratio <= 4 && DV <= 256 ? 16 : INT_MAX;
    const bool use_gqa_opt = mask && max_bias == 0.0f && Q->ne[1] <= gqa_limit && K->ne[1] % FATTN_KQ_STRIDE == 0;

-    if constexpr (DKQ == 320) {
-        // This branch is only used for Mistral Small 4 which has a GQA ratio of 32.
-        // On AMD, simply use that GQA ratio with 32 columns / block since we always have enough SRAM.
-        // On NVIDIA however, the tile kernel is only used for GPUs that can't use the mma kernel (Pascal and older).
-        // Therefore, use a GQA ratio of 16 with 16 columns / block to stay below 48 kiB of SRAM / block.
-#ifdef GGML_USE_HIP
+    if constexpr (DKQ == 320) { // Mistral Small 4
        if (use_gqa_opt && gqa_ratio % 32 == 0) {
            launch_fattn_tile_switch_ncols1<DKQ, DV, 32, use_logit_softcap>(ctx, dst);
            return;
        }
-#else
-        if (use_gqa_opt && gqa_ratio % 16 == 0) {
-            launch_fattn_tile_switch_ncols1<DKQ, DV, 16, use_logit_softcap>(ctx, dst);
-            return;
-        }
-#endif // GGML_USE_HIP
        GGML_ABORT("flash-attn tile (320/256): expected GQA ratio multiple of 32");
    }

--- a/scripts/jinja/jinja-tester.py
+++ b/scripts/jinja/jinja-tester.py
@@ -20,7 +20,6 @@ from PySide6.QtCore import Qt, QRect, QSize
 from jinja2 import TemplateSyntaxError
 from jinja2.sandbox import ImmutableSandboxedEnvironment
 from datetime import datetime
-from typing import Callable


 def format_template_content(template_content):
@@ -396,7 +395,7 @@ class JinjaTester(QMainWindow):
                ensure_ascii=ensure_ascii,
            )
        )
-        env.globals["strftime_now"]: Callable[[str], str] = lambda format: datetime.now().strftime(format)
+        env.globals["strftime_now"] = lambda format: datetime.now().strftime(format)  # ty: ignore[invalid-assignment]
        env.globals["raise_exception"] = raise_exception  # ty: ignore[invalid-assignment]
        try:
            template = env.from_string(template_str)
--- a/scripts/sync_vendor.py
+++ b/scripts/sync_vendor.py
@@ -5,7 +5,7 @@ import os
 import sys
 import subprocess

-HTTPLIB_VERSION = "refs/tags/v0.43.2"
+HTTPLIB_VERSION = "refs/tags/v0.43.1"

 vendor = {
    "https://github.com/nlohmann/json/releases/latest/download/json.hpp":     "vendor/nlohmann/json.hpp",
--- a/vendor/cpp-httplib/httplib.cpp
+++ b/vendor/cpp-httplib/httplib.cpp
@@ -1464,9 +1464,8 @@ bool mmap::open(const char *path) {
  auto wpath = u8string_to_wstring(path);
  if (wpath.empty()) { return false; }

-  hFile_ =
-      ::CreateFile2(wpath.c_str(), GENERIC_READ,
-                    FILE_SHARE_READ | FILE_SHARE_WRITE, OPEN_EXISTING, NULL);
+  hFile_ = ::CreateFile2(wpath.c_str(), GENERIC_READ, FILE_SHARE_READ,
+                         OPEN_EXISTING, NULL);

  if (hFile_ == INVALID_HANDLE_VALUE) { return false; }

@@ -2053,50 +2052,56 @@ int getaddrinfo_with_timeout(const char *node, const char *service,
  return 0;
 #elif defined(_GNU_SOURCE) && defined(__GLIBC__) &&                            \
    (__GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2))
-  // #2431: gai_cancel() is non-blocking and may return EAI_NOTCANCELED while
-  // the resolver worker still references the stack-local gaicb. The cancel
-  // path therefore waits (gai_suspend with no timeout) for the worker to
-  // actually finish before letting the stack frame go. The trade-off is that
-  // a wedged DNS server can hold this thread for the system resolver timeout
-  // (~30s by default) past the caller's connection timeout.
-  struct gaicb request {};
+  // Linux implementation using getaddrinfo_a for asynchronous DNS resolution
+  struct gaicb request;
  struct gaicb *requests[1] = {&request};
-  struct sigevent sevp {};
-  struct timespec timeout {
-    timeout_sec, 0
-  };
+  struct sigevent sevp;
+  struct timespec timeout;

+  // Initialize the request structure
+  memset(&request, 0, sizeof(request));
  request.ar_name = node;
  request.ar_service = service;
  request.ar_request = hints;
+
+  // Set up timeout
+  timeout.tv_sec = timeout_sec;
+  timeout.tv_nsec = 0;
+
+  // Initialize sigevent structure (not used, but required)
+  memset(&sevp, 0, sizeof(sevp));
  sevp.sigev_notify = SIGEV_NONE;

-  int rc = getaddrinfo_a(GAI_NOWAIT, requests, 1, &sevp);
-  if (rc != 0) { return rc; }
+  // Start asynchronous resolution
+  int start_result = getaddrinfo_a(GAI_NOWAIT, requests, 1, &sevp);
+  if (start_result != 0) { return start_result; }

-  auto cleanup = scope_exit([&] {
-    if (request.ar_result) { freeaddrinfo(request.ar_result); }
-  });
-
-  int wait_result = gai_suspend(requests, 1, &timeout);
+  // Wait for completion with timeout
+  int wait_result =
+      gai_suspend((const struct gaicb *const *)requests, 1, &timeout);

  if (wait_result == 0 || wait_result == EAI_ALLDONE) {
+    // Completed successfully, get the result
    int gai_result = gai_error(&request);
    if (gai_result == 0) {
      *res = request.ar_result;
-      request.ar_result = nullptr;
      return 0;
+    } else {
+      // Clean up on error
+      if (request.ar_result) { freeaddrinfo(request.ar_result); }
+      return gai_result;
    }
-    return gai_result;
+  } else if (wait_result == EAI_AGAIN) {
+    // Timeout occurred, cancel the request
+    gai_cancel(&request);
+    return EAI_AGAIN;
+  } else {
+    // Other error occurred
+    gai_cancel(&request);
+    return wait_result;
  }
-
-  gai_cancel(&request);
-  while (gai_error(&request) == EAI_INPROGRESS) {
-    gai_suspend(requests, 1, nullptr);
-  }
-  return wait_result;
 #else
-  // Fallback implementation using thread-based timeout for other Unix systems.
+  // Fallback implementation using thread-based timeout for other Unix systems

  struct GetAddrInfoState {
    ~GetAddrInfoState() {
@@ -14137,9 +14142,6 @@ ssize_t read(session_t session, void *buf, size_t len, TlsError &err) {
  err.code = impl::map_mbedtls_error(ret, err.sys_errno);
  err.backend_code = static_cast<uint64_t>(-ret);
  impl::mbedtls_last_error() = ret;
-  // mbedTLS signals a clean close_notify via a negative error code rather
-  // than 0; surface it as a clean EOF the way OpenSSL/wolfSSL do.
-  if (err.code == ErrorCode::PeerClosed) { return 0; }
  return -1;
 }

--- a/vendor/cpp-httplib/httplib.h
+++ b/vendor/cpp-httplib/httplib.h
@@ -8,8 +8,8 @@
 #ifndef CPPHTTPLIB_HTTPLIB_H
 #define CPPHTTPLIB_HTTPLIB_H

-#define CPPHTTPLIB_VERSION "0.43.2"
-#define CPPHTTPLIB_VERSION_NUM "0x002b02"
+#define CPPHTTPLIB_VERSION "0.43.1"
+#define CPPHTTPLIB_VERSION_NUM "0x002b01"

 #ifdef _WIN32
 #if defined(_WIN32_WINNT) && _WIN32_WINNT < 0x0A00