Files
llama.cpp/common/ngram-cache.h
Sascha Rogmann 72d3b1898a spec : add self‑speculative decoding (no draft model required) + refactor (#18471)
* server: introduce self-speculative decoding

* server: moved self-call into speculative.cpp

* can_speculate() includes self-speculation

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* server: can_speculate() tests self-spec

* server: replace can_speculate() with slot.can_speculate()

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* common: use %zu format specifier for size_t in logging

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* server: can_speculate() requires a task instance

* common: ngram map, config self-speculative decoding

* common: add enum common_speculative_type

* common: add vector of speculative states

* common: add option --spec-draftless

* server: cleanup (remove slot.batch_spec, rename)

* common: moved self-spec impl to ngram-map

* common: cleanup (use common_speculative_state_draft)

* spec : refactor

* cont : naming

* spec: remove --spec-config

* doc: (draftless) speculative decoding

* common: print performance in spec decoding

* minor : cleanup

* common : better names

* minor : cleanup + fix build

* minor: comments

* CODEOWNERS: add common/ngram-map.* (#18471)

* common : rename speculative.draftless_type -> speculative.type

* ngram-map : fix uninitialized values

* ngram-map : take into account the input can become shorter

* ngram-map : revert len check for now

* arg : change `--spec-draftless` -> `--spec-type`

* spec : add common_speculative_state::accept()

* spec : refactor + add common_speculative_begin()

* spec : fix begin() call with mtmd

* spec : additional refactor + remove common_speculative_params

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
2026-01-28 19:42:42 +02:00

102 lines
4.0 KiB
C++

#pragma once
#include "llama.h"
#include <unordered_map>
#include <string>
#include <vector>
#define LLAMA_NGRAM_MIN 1
#define LLAMA_NGRAM_MAX 4
#define LLAMA_NGRAM_STATIC 2
// Data structures to map n-grams to empirical token probabilities:
struct common_ngram {
llama_token tokens[LLAMA_NGRAM_MAX];
common_ngram() {
for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
tokens[i] = LLAMA_TOKEN_NULL;
}
}
common_ngram(const llama_token * input, const int ngram_size) {
for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
tokens[i] = i < ngram_size ? input[i] : LLAMA_TOKEN_NULL;
}
}
bool operator==(const common_ngram & other) const {
for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
if (tokens[i] != other.tokens[i]) {
return false;
}
}
return true;
}
};
struct common_token_hash_function {
size_t operator()(const llama_token token) const {
// see https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
return token * 11400714819323198485llu;
}
};
struct common_ngram_hash_function {
size_t operator()(const common_ngram & ngram) const {
size_t hash = common_token_hash_function{}(ngram.tokens[0]);
for (int i = 1; i < LLAMA_NGRAM_MAX; ++i) {
hash ^= common_token_hash_function{}(ngram.tokens[i]);
}
return hash;
}
};
// token -> number of times token has been seen
typedef std::unordered_map<llama_token, int32_t> common_ngram_cache_part;
// n-gram -> empirical distribution of following tokens
typedef std::unordered_map<common_ngram, common_ngram_cache_part, common_ngram_hash_function> common_ngram_cache;
// Update an ngram cache with tokens.
// ngram_cache: the cache to modify.
// ngram_min/ngram_max: the min/max size of the ngrams to extract from inp_data.
// inp_data: the token sequence with which to update ngram_cache.
// nnew: how many new tokens have been appended to inp_data since the last call to this function.
// print_progress: whether to print progress to stderr.
//
// In order to get correct results inp_data can ONLY BE APPENDED TO.
// Changes in the middle need a complete rebuild.
void common_ngram_cache_update(
common_ngram_cache & ngram_cache, int ngram_min, int ngram_max, std::vector<llama_token> & inp_data, int nnew, bool print_progress);
// Try to draft tokens from ngram caches.
// inp: the tokens generated so far.
// draft: the token sequence to draft. Expected to initially contain the previously sampled token.
// n_draft: maximum number of tokens to add to draft.
// ngram_min/gram_max: the min/max size of the ngrams in nc_context and nc_dynamic.
// nc_context: ngram cache based on current context.
// nc_dynamic: ngram cache based on previous user generations.
// nc_static: ngram cache generated from a large text corpus, used for validation.
void common_ngram_cache_draft(
std::vector<llama_token> & inp, std::vector<llama_token> & draft, int n_draft, int ngram_min, int ngram_max,
common_ngram_cache & nc_context, common_ngram_cache & nc_dynamic, common_ngram_cache & nc_static);
// Save an ngram cache to a file.
// ngram_cache: the ngram cache to save.
// filename: the path under which to save the ngram cache.
void common_ngram_cache_save(common_ngram_cache & ngram_cache, const std::string & filename);
// Load an ngram cache saved with common_ngram_cache_save.
// filename: the path from which to load the ngram cache.
// returns: an ngram cache containing the information saved to filename.
common_ngram_cache common_ngram_cache_load(const std::string & filename);
// Merge two ngram caches.
// ngram_cache_target: the ngram cache to which to add the information from ngram_cache_add.
// ngram_cache_add: the ngram cache to add to ngram_cache_target.
void common_ngram_cache_merge(common_ngram_cache & ngram_cache_target, common_ngram_cache & ngram_cache_add);