mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-04-02 16:13:48 +03:00
Compare commits
6 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
1f5d15e665 | ||
|
|
c46758d28f | ||
|
|
bf934f28db | ||
|
|
5c1a7b8355 | ||
|
|
59d840209a | ||
|
|
ff934e29bc |
@@ -1,11 +1,13 @@
|
||||
ARG UBUNTU_VERSION=22.04
|
||||
ARG UBUNTU_VERSION=24.04
|
||||
|
||||
FROM ubuntu:$UBUNTU_VERSION AS build
|
||||
|
||||
ARG TARGETARCH
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y build-essential git cmake libssl-dev
|
||||
apt-get install -y gcc-14 g++-14 build-essential git cmake libssl-dev
|
||||
|
||||
ENV CC=gcc-14 CXX=g++-14
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
@@ -55,8 +57,9 @@ RUN apt-get update \
|
||||
git \
|
||||
python3 \
|
||||
python3-pip \
|
||||
&& pip install --upgrade pip setuptools wheel \
|
||||
&& pip install -r requirements.txt \
|
||||
python3-wheel \
|
||||
&& pip install --break-system-packages --upgrade setuptools \
|
||||
&& pip install --break-system-packages -r requirements.txt \
|
||||
&& apt autoremove -y \
|
||||
&& apt clean -y \
|
||||
&& rm -rf /tmp/* /var/tmp/* \
|
||||
|
||||
@@ -41,6 +41,7 @@
|
||||
effectiveStdenv ? if useCuda then cudaPackages.backendStdenv else stdenv,
|
||||
enableStatic ? effectiveStdenv.hostPlatform.isStatic,
|
||||
precompileMetalShaders ? false,
|
||||
useWebUi ? true,
|
||||
}:
|
||||
|
||||
let
|
||||
@@ -164,6 +165,7 @@ effectiveStdenv.mkDerivation (finalAttrs: {
|
||||
cmakeFlags =
|
||||
[
|
||||
(cmakeBool "LLAMA_BUILD_SERVER" true)
|
||||
(cmakeBool "LLAMA_BUILD_WEBUI" useWebUi)
|
||||
(cmakeBool "BUILD_SHARED_LIBS" (!enableStatic))
|
||||
(cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
|
||||
(cmakeBool "GGML_NATIVE" false)
|
||||
|
||||
24
.github/workflows/docker.yml
vendored
24
.github/workflows/docker.yml
vendored
@@ -36,18 +36,16 @@ jobs:
|
||||
matrix:
|
||||
config:
|
||||
# Multi-stage build
|
||||
# Note: the arm64 images are failing, which prevents the amd64 images from being built
|
||||
# https://github.com/ggml-org/llama.cpp/issues/11888
|
||||
#- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: false }
|
||||
- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" }
|
||||
- { tag: "cuda cuda12", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-22.04", cuda_version: "12.4.0", ubuntu_version: "22.04" }
|
||||
- { tag: "cuda13", dockerfile: ".devops/cuda-new.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-22.04", cuda_version: "13.1.0", ubuntu_version: "24.04" }
|
||||
- { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-22.04" }
|
||||
- { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-22.04" }
|
||||
- { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" }
|
||||
- { tag: "s390x", dockerfile: ".devops/s390x.Dockerfile", platforms: "linux/s390x", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04-s390x" }
|
||||
- { tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-22.04" }
|
||||
- { tag: "openvino", dockerfile: ".devops/openvino.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" }
|
||||
- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/arm64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-24.04" }
|
||||
- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-24.04" }
|
||||
- { tag: "cuda cuda12", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-24.04", cuda_version: "12.4.0", ubuntu_version: "22.04" }
|
||||
- { tag: "cuda13", dockerfile: ".devops/cuda-new.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-24.04", cuda_version: "13.1.0", ubuntu_version: "24.04" }
|
||||
- { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-24.04" }
|
||||
- { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-24.04" }
|
||||
- { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-24.04" }
|
||||
- { tag: "s390x", dockerfile: ".devops/s390x.Dockerfile", platforms: "linux/s390x", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-24.04-s390x" }
|
||||
- { tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-24.04" }
|
||||
- { tag: "openvino", dockerfile: ".devops/openvino.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-24.04" }
|
||||
steps:
|
||||
- name: Check out the repo
|
||||
uses: actions/checkout@v6
|
||||
@@ -58,7 +56,7 @@ jobs:
|
||||
if: ${{ matrix.config.tag != 's390x' }}
|
||||
uses: docker/setup-qemu-action@c7c53464625b32c7a7e944ae62b3e17d2b600130 # v3
|
||||
with:
|
||||
image: tonistiigi/binfmt:qemu-v7.0.0-28
|
||||
image: tonistiigi/binfmt:qemu-v10.2.1
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # v3
|
||||
|
||||
@@ -108,6 +108,7 @@ option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_TOOLS "llama: build tools" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_WEBUI "llama: build the embedded Web UI for server" ON)
|
||||
option(LLAMA_TOOLS_INSTALL "llama: install tools" ${LLAMA_TOOLS_INSTALL_DEFAULT})
|
||||
option(LLAMA_TESTS_INSTALL "llama: install tests" ON)
|
||||
|
||||
|
||||
@@ -2807,6 +2807,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
params.port = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PORT"));
|
||||
add_opt(common_arg(
|
||||
{"--reuse-port"},
|
||||
string_format("allow multiple sockets to bind to the same port (default: %s)", params.reuse_port ? "enabled" : "disabled"),
|
||||
[](common_params & params) {
|
||||
params.reuse_port = true;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_REUSE_PORT"));
|
||||
add_opt(common_arg(
|
||||
{"--path"}, "PATH",
|
||||
string_format("path to serve static files from (default: %s)", params.public_path.c_str()),
|
||||
|
||||
@@ -287,7 +287,7 @@ void analyze_reasoning::compare_reasoning_presence() {
|
||||
return p.literal(reasoning_content) + p.space() + p.optional(p.tag("post", (p.marker() + p.space())) + p.rest());
|
||||
});
|
||||
auto parser_wrapped = build_tagged_peg_parser([&](common_peg_parser_builder &p) {
|
||||
return p.tag("pre", p.marker()) + p.space() + p.literal(reasoning_content) + p.space() + p.tag("post", (p.marker() + p.space())) + p.rest();
|
||||
return p.tag("pre", p.marker() + p.space()) + p.literal(reasoning_content) + p.space() + p.tag("post", (p.marker() + p.space())) + p.rest();
|
||||
});
|
||||
// try the more aggressive parse first, if it fails, fall back to the delimiter one
|
||||
auto result = parser_wrapped.parse_anywhere_and_extract(comparison->output_B);
|
||||
@@ -297,7 +297,7 @@ void analyze_reasoning::compare_reasoning_presence() {
|
||||
if (result.result.success()) {
|
||||
if (!result.tags["pre"].empty() && !result.tags["post"].empty()) {
|
||||
mode = reasoning_mode::TAG_BASED;
|
||||
start = trim_whitespace(result.tags["pre"]);
|
||||
start = trim_leading_whitespace(result.tags["pre"]);
|
||||
end = trim_trailing_whitespace(result.tags["post"]);
|
||||
} else if (!result.tags["post"].empty()) {
|
||||
mode = reasoning_mode::TAG_BASED;
|
||||
@@ -333,7 +333,7 @@ void analyze_reasoning::compare_thinking_enabled() {
|
||||
if (left_trimmed.empty() && !diff.right.empty()) {
|
||||
if (!right_trimmed.empty() && string_ends_with(comparison->output_B, right_trimmed)) {
|
||||
if (start.empty()) {
|
||||
start = right_trimmed;
|
||||
start = trim_leading_whitespace(diff.right);
|
||||
mode = reasoning_mode::TAG_BASED;
|
||||
}
|
||||
}
|
||||
@@ -344,7 +344,7 @@ void analyze_reasoning::compare_thinking_enabled() {
|
||||
if (seg.size() >= 2 && seg[seg.size() - 1].value == left_trimmed && seg[seg.size() - 2].type == segment_type::MARKER) {
|
||||
start = seg[seg.size() - 2].value;
|
||||
}
|
||||
end = left_trimmed;
|
||||
end = trim_trailing_whitespace(diff.left);
|
||||
mode = reasoning_mode::TAG_BASED;
|
||||
}
|
||||
}
|
||||
@@ -363,15 +363,23 @@ void analyze_reasoning::compare_thinking_enabled() {
|
||||
size_t len = std::min(base.size(), anchor_len);
|
||||
std::string anchor = base.substr(base.size() - len);
|
||||
auto pos = extended.rfind(anchor);
|
||||
if (pos == std::string::npos || pos + len >= extended.size()) continue;
|
||||
if (pos == std::string::npos || pos + len >= extended.size()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
std::string extra = trim_whitespace(extended.substr(pos + len));
|
||||
if (extra.empty()) continue;
|
||||
if (extra.empty()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
auto seg = prune_whitespace_segments(segmentize_markers(extra));
|
||||
if (seg.size() == 2 && seg[0].type == segment_type::MARKER && seg[1].type == segment_type::MARKER) {
|
||||
if (start.empty()) start = seg[0].value;
|
||||
if (end.empty()) end = seg[1].value;
|
||||
if (start.empty()) {
|
||||
start = seg[0].value;
|
||||
}
|
||||
if (end.empty()) {
|
||||
end = seg[1].value;
|
||||
}
|
||||
mode = reasoning_mode::TAG_BASED;
|
||||
break;
|
||||
}
|
||||
@@ -423,7 +431,7 @@ void analyze_reasoning::compare_reasoning_scope() {
|
||||
LOG_DBG(ANSI_ORANGE "%s: Detected TOOLS_ONLY reasoning mode\n" ANSI_RESET, __func__);
|
||||
|
||||
auto parser_wrapped = build_tagged_peg_parser([&](common_peg_parser_builder &p) {
|
||||
return p.tag("pre", p.marker()) + p.space() + p.literal(reasoning_content) + p.space() + p.tag("post", (p.marker() + p.space()));
|
||||
return p.tag("pre", p.marker() + p.space()) + p.literal(reasoning_content) + p.space() + p.tag("post", (p.marker() + p.space()));
|
||||
});
|
||||
auto result = parser_wrapped.parse_anywhere_and_extract(comparison->output_B);
|
||||
if (result.result.success()) {
|
||||
@@ -516,7 +524,7 @@ analyze_content::analyze_content(const common_chat_template & tmpl, const analyz
|
||||
// Take the more promising diff
|
||||
std::string pure_content = rdiff.length() > diff_tools.left.length() ? rdiff : diff_tools.left;
|
||||
auto parser_wrapped = build_tagged_peg_parser([&](common_peg_parser_builder &p) {
|
||||
return p.tag("pre", p.marker()) + p.space() + p.literal(response) + p.space() + p.tag("post", (p.marker() + p.space())) + p.rest();
|
||||
return p.tag("pre", p.marker() + p.space()) + p.literal(response) + p.space() + p.tag("post", (p.marker() + p.space())) + p.rest();
|
||||
});
|
||||
auto result = parser_wrapped.parse_anywhere_and_extract(pure_content);
|
||||
start = result.tags["pre"];
|
||||
|
||||
@@ -656,6 +656,38 @@ bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_over
|
||||
return true;
|
||||
}
|
||||
|
||||
// simple glob: * matches non-/ chars, ** matches anything including /
|
||||
static inline bool glob_match(const char * pattern, const char * str) {
|
||||
if (*pattern == '\0') {
|
||||
return *str == '\0';
|
||||
}
|
||||
if (pattern[0] == '*' && pattern[1] == '*') {
|
||||
const char * p = pattern + 2;
|
||||
if (*p == '/') p++;
|
||||
if (glob_match(p, str)) return true;
|
||||
if (*str != '\0') return glob_match(pattern, str + 1);
|
||||
return false;
|
||||
}
|
||||
if (*pattern == '*') {
|
||||
const char * p = pattern + 1;
|
||||
for (; *str != '\0' && *str != '/'; str++) {
|
||||
if (glob_match(p, str)) return true;
|
||||
}
|
||||
return glob_match(p, str);
|
||||
}
|
||||
if (*pattern == '?' && *str != '\0' && *str != '/') {
|
||||
return glob_match(pattern + 1, str + 1);
|
||||
}
|
||||
if (*pattern == *str) {
|
||||
return glob_match(pattern + 1, str + 1);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool glob_match(const std::string & pattern, const std::string & str) {
|
||||
return glob_match(pattern.c_str(), str.c_str());
|
||||
}
|
||||
|
||||
//
|
||||
// Filesystem utils
|
||||
//
|
||||
|
||||
@@ -573,6 +573,7 @@ struct common_params {
|
||||
|
||||
// server params
|
||||
int32_t port = 8080; // server listens on this network port
|
||||
bool reuse_port = false; // allow multiple sockets to bind to the same port
|
||||
int32_t timeout_read = 600; // http read timeout in seconds
|
||||
int32_t timeout_write = timeout_read; // http write timeout in seconds
|
||||
int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
|
||||
@@ -793,6 +794,8 @@ std::string string_from(const std::vector<int> & values);
|
||||
std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens);
|
||||
std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch);
|
||||
|
||||
bool glob_match(const std::string & pattern, const std::string & str);
|
||||
|
||||
//
|
||||
// Filesystem utils
|
||||
//
|
||||
|
||||
@@ -115,9 +115,11 @@ static void common_reasoning_budget_accept(struct llama_sampler * smpl, llama_to
|
||||
break;
|
||||
}
|
||||
case REASONING_BUDGET_FORCING:
|
||||
// force_pos is advanced in apply(), not here.
|
||||
// This ensures the first forced token isn't skipped when the sampler
|
||||
// is initialized directly in FORCING state (e.g. COUNTING + budget=0)
|
||||
ctx->force_pos++;
|
||||
if (ctx->force_pos >= ctx->forced_tokens.size()) {
|
||||
ctx->state = REASONING_BUDGET_DONE;
|
||||
LOG_INF("reasoning-budget: forced sequence complete, done\n");
|
||||
}
|
||||
break;
|
||||
case REASONING_BUDGET_DONE:
|
||||
break;
|
||||
@@ -144,14 +146,6 @@ static void common_reasoning_budget_apply(struct llama_sampler * smpl, llama_tok
|
||||
cur_p->data[i].logit = -INFINITY;
|
||||
}
|
||||
}
|
||||
|
||||
// advance to next forced token (done here rather than in accept so that
|
||||
// the first forced token isn't skipped when starting in FORCING state)
|
||||
ctx->force_pos++;
|
||||
if (ctx->force_pos >= ctx->forced_tokens.size()) {
|
||||
ctx->state = REASONING_BUDGET_DONE;
|
||||
LOG_INF("reasoning-budget: forced sequence complete, done\n");
|
||||
}
|
||||
}
|
||||
|
||||
static void common_reasoning_budget_reset(struct llama_sampler * smpl) {
|
||||
@@ -261,3 +255,10 @@ struct llama_sampler * common_reasoning_budget_init(
|
||||
common_reasoning_budget_state initial_state) {
|
||||
return common_reasoning_budget_init_state(vocab, start_tokens, end_tokens, forced_tokens, budget, initial_state);
|
||||
}
|
||||
|
||||
common_reasoning_budget_state common_reasoning_budget_get_state(const struct llama_sampler * smpl) {
|
||||
if (!smpl) {
|
||||
return REASONING_BUDGET_IDLE;
|
||||
}
|
||||
return ((const common_reasoning_budget_ctx *)smpl->ctx)->state;
|
||||
}
|
||||
|
||||
@@ -51,3 +51,5 @@ struct llama_sampler * common_reasoning_budget_init(
|
||||
const std::vector<llama_token> & forced_tokens,
|
||||
int32_t budget,
|
||||
common_reasoning_budget_state initial_state);
|
||||
|
||||
common_reasoning_budget_state common_reasoning_budget_get_state(const struct llama_sampler * smpl);
|
||||
|
||||
@@ -7,6 +7,7 @@
|
||||
|
||||
#include <algorithm>
|
||||
#include <cctype>
|
||||
#include <climits>
|
||||
#include <cmath>
|
||||
#include <cstring>
|
||||
#include <unordered_map>
|
||||
@@ -109,6 +110,7 @@ struct common_sampler {
|
||||
common_params_sampling params;
|
||||
|
||||
struct llama_sampler * grmr;
|
||||
struct llama_sampler * rbudget;
|
||||
struct llama_sampler * chain;
|
||||
|
||||
ring_buffer<llama_token> prev;
|
||||
@@ -188,6 +190,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
|
||||
lparams.no_perf = params.no_perf;
|
||||
|
||||
llama_sampler * grmr = nullptr;
|
||||
llama_sampler * rbudget = nullptr;
|
||||
llama_sampler * chain = llama_sampler_chain_init(lparams);
|
||||
|
||||
std::vector<llama_sampler *> samplers;
|
||||
@@ -270,7 +273,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
|
||||
}
|
||||
}
|
||||
|
||||
if (grmr) {
|
||||
if (grmr && !params.grammar_lazy) {
|
||||
try {
|
||||
for (const auto & token : prefill_tokens) {
|
||||
llama_sampler_accept(grmr, token);
|
||||
@@ -284,15 +287,15 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
|
||||
}
|
||||
}
|
||||
|
||||
// reasoning budget sampler — added first so it can force tokens before other samplers
|
||||
if (params.reasoning_budget_tokens >= 0 && !params.reasoning_budget_forced.empty()) {
|
||||
samplers.push_back(common_reasoning_budget_init(
|
||||
// reasoning budget sampler
|
||||
if (!params.reasoning_budget_start.empty() && !params.reasoning_budget_end.empty()) {
|
||||
rbudget = common_reasoning_budget_init(
|
||||
vocab,
|
||||
params.reasoning_budget_start,
|
||||
params.reasoning_budget_end,
|
||||
params.reasoning_budget_forced,
|
||||
params.reasoning_budget_tokens,
|
||||
prefill_tokens));
|
||||
params.reasoning_budget_tokens < 0 ? INT_MAX : params.reasoning_budget_tokens,
|
||||
prefill_tokens);
|
||||
}
|
||||
|
||||
if (params.has_logit_bias()) {
|
||||
@@ -383,6 +386,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
|
||||
auto * result = new common_sampler {
|
||||
/* .params = */ params,
|
||||
/* .grmr = */ grmr,
|
||||
/* .rbudget = */ rbudget,
|
||||
/* .chain = */ chain,
|
||||
/* .prev = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
|
||||
/* .cur = */ {},
|
||||
@@ -398,11 +402,27 @@ void common_sampler_free(struct common_sampler * gsmpl) {
|
||||
}
|
||||
|
||||
llama_sampler_free(gsmpl->grmr);
|
||||
llama_sampler_free(gsmpl->rbudget);
|
||||
llama_sampler_free(gsmpl->chain);
|
||||
|
||||
delete gsmpl;
|
||||
}
|
||||
|
||||
static bool grammar_should_apply(struct common_sampler * gsmpl) {
|
||||
if (!gsmpl->grmr) {
|
||||
return false;
|
||||
}
|
||||
if (!gsmpl->rbudget) {
|
||||
return true;
|
||||
}
|
||||
if (gsmpl->params.grammar_lazy) {
|
||||
// if grammar is lazy, only apply when reasoning budget is not active
|
||||
const auto state = common_reasoning_budget_get_state(gsmpl->rbudget);
|
||||
return state == REASONING_BUDGET_IDLE || state == REASONING_BUDGET_DONE;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
|
||||
if (!gsmpl) {
|
||||
return;
|
||||
@@ -410,6 +430,11 @@ void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, boo
|
||||
|
||||
const auto tm = gsmpl->tm();
|
||||
|
||||
// grammar_should_apply() checks the reasoning budget state, so calculate this before we accept
|
||||
accept_grammar = accept_grammar && grammar_should_apply(gsmpl);
|
||||
|
||||
llama_sampler_accept(gsmpl->rbudget, token);
|
||||
|
||||
if (gsmpl->grmr && accept_grammar) {
|
||||
llama_sampler_accept(gsmpl->grmr, token);
|
||||
}
|
||||
@@ -431,6 +456,7 @@ struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
|
||||
return new common_sampler {
|
||||
/* .params = */ gsmpl->params,
|
||||
/* .grmr = */ llama_sampler_clone(gsmpl->grmr),
|
||||
/* .rbudget = */ llama_sampler_clone(gsmpl->rbudget),
|
||||
/* .chain = */ llama_sampler_clone(gsmpl->chain),
|
||||
/* .prev = */ gsmpl->prev,
|
||||
/* .cur = */ gsmpl->cur,
|
||||
@@ -500,6 +526,7 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
|
||||
llama_token id = LLAMA_TOKEN_NULL;
|
||||
|
||||
auto & grmr = gsmpl->grmr;
|
||||
auto & rbudget = gsmpl->rbudget;
|
||||
auto & chain = gsmpl->chain;
|
||||
auto & cur_p = gsmpl->cur_p; // initialized by set_logits
|
||||
|
||||
@@ -511,7 +538,8 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
|
||||
if (id != LLAMA_TOKEN_NULL) {
|
||||
LOG_DBG("%s: Backend sampler selected token: '%d'. Will not run any CPU samplers\n", __func__, id);
|
||||
|
||||
GGML_ASSERT(!gsmpl->grmr && "using grammar in combination with backend sampling is not supported");
|
||||
GGML_ASSERT(!gsmpl->grmr && "using grammar in combination with backend sampling is not supported");
|
||||
GGML_ASSERT(!gsmpl->rbudget && "using reasoning budget in combination with backend sampling is not supported");
|
||||
|
||||
// TODO: simplify
|
||||
gsmpl->cur.resize(1);
|
||||
@@ -524,7 +552,10 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
|
||||
|
||||
gsmpl->set_logits(ctx, idx);
|
||||
|
||||
if (grammar_first) {
|
||||
// apply reasoning budget first
|
||||
llama_sampler_apply(rbudget, &cur_p);
|
||||
|
||||
if (grammar_first && grammar_should_apply(gsmpl)) {
|
||||
llama_sampler_apply(grmr, &cur_p);
|
||||
}
|
||||
|
||||
@@ -532,7 +563,7 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
|
||||
|
||||
id = cur_p.data[cur_p.selected].id;
|
||||
|
||||
if (grammar_first) {
|
||||
if (grammar_first || !grammar_should_apply(gsmpl)) {
|
||||
return id;
|
||||
}
|
||||
|
||||
@@ -553,7 +584,12 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
|
||||
// if the token is not valid, sample again, but first apply the grammar sampler and then the sampling chain
|
||||
gsmpl->set_logits(ctx, idx);
|
||||
|
||||
llama_sampler_apply(grmr, &cur_p);
|
||||
llama_sampler_apply(rbudget, &cur_p);
|
||||
|
||||
if (grammar_should_apply(gsmpl)) {
|
||||
llama_sampler_apply(grmr, &cur_p);
|
||||
}
|
||||
|
||||
llama_sampler_apply(chain, &cur_p);
|
||||
|
||||
GGML_ASSERT(cur_p.selected != -1 && "no selected token during sampling - check your sampling configuration");
|
||||
|
||||
@@ -1330,7 +1330,7 @@ static void test_nemotron_reasoning_detection(testing & t) {
|
||||
analysis.analyze_template(tmpl);
|
||||
|
||||
// Check reasoning markers
|
||||
t.assert_equal("reasoning_start should be '<think>'", "<think>", analysis.reasoning.start);
|
||||
t.assert_equal("reasoning_start should be '<think>\\n'", "<think>\n", analysis.reasoning.start);
|
||||
t.assert_equal("reasoning_end should be '</think>'", "</think>", analysis.reasoning.end);
|
||||
|
||||
// Check reasoning mode detection
|
||||
|
||||
@@ -805,7 +805,8 @@ struct peg_test_case {
|
||||
common_chat_templates_inputs params;
|
||||
std::string input;
|
||||
common_chat_msg expect;
|
||||
bool is_partial = false;
|
||||
bool is_partial = false;
|
||||
bool expect_reconstruction = false;
|
||||
};
|
||||
|
||||
struct make_peg_parser {
|
||||
@@ -828,6 +829,12 @@ struct make_peg_parser {
|
||||
}
|
||||
};
|
||||
|
||||
// Global template filter for --template flag
|
||||
static std::string g_template_filter;
|
||||
|
||||
// When true, run reconstruction test on every non-partial test and report results
|
||||
static bool g_force_reconstruction_test = false;
|
||||
|
||||
static void test_peg_parser(common_chat_templates * tmpls,
|
||||
const std::function<void(peg_test_case &)> & init,
|
||||
bool detailed_debug) {
|
||||
@@ -936,75 +943,158 @@ static void test_peg_parser(common_chat_templates * tmpls,
|
||||
throw std::runtime_error("Failed to build grammar: " + parser.params_.grammar);
|
||||
}
|
||||
|
||||
// Find the earliest trigger position to determine the constrained portion
|
||||
auto earliest_trigger_pos = std::string::npos;
|
||||
for (const auto & trigger : parser.params_.grammar_triggers) {
|
||||
size_t pos = std::string::npos;
|
||||
std::smatch match;
|
||||
switch (trigger.type) {
|
||||
case COMMON_GRAMMAR_TRIGGER_TYPE_WORD:
|
||||
{
|
||||
const auto & word = trigger.value;
|
||||
pos = tc.input.find(word);
|
||||
break;
|
||||
}
|
||||
case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN:
|
||||
{
|
||||
const auto & pattern = std::regex(trigger.value);
|
||||
if (std::regex_search(tc.input, match, pattern)) {
|
||||
pos = match.position(pattern.mark_count());
|
||||
}
|
||||
break;
|
||||
}
|
||||
case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL:
|
||||
{
|
||||
const auto & pattern = trigger.value;
|
||||
if (std::regex_match(tc.input, match, std::regex(pattern))) {
|
||||
auto mpos = std::string::npos;
|
||||
for (size_t i = 1; i < match.size(); ++i) {
|
||||
if (match[i].length() > 0) {
|
||||
mpos = match.position(i);
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (mpos == std::string::npos) {
|
||||
mpos = match.position(0);
|
||||
}
|
||||
pos = mpos;
|
||||
}
|
||||
break;
|
||||
}
|
||||
default:
|
||||
throw std::runtime_error("Unknown trigger type");
|
||||
}
|
||||
if (pos != std::string::npos) {
|
||||
if (earliest_trigger_pos == std::string::npos || pos < earliest_trigger_pos) {
|
||||
earliest_trigger_pos = pos;
|
||||
// In production, grammar triggers match against the full generated text
|
||||
// including the generation prompt. All positions are in full_input coordinates.
|
||||
const auto & gen_prompt = parser.params_.generation_prompt;
|
||||
std::string full_input = gen_prompt + tc.input;
|
||||
|
||||
// Determine whether the reasoning-budget sampler path applies: tool-call grammar
|
||||
// with all WORD triggers and thinking tags present. In production, the reasoning
|
||||
// budget sampler inhibits grammar application while inside thinking blocks —
|
||||
// triggers inside <think>...</think> are suppressed.
|
||||
bool use_reasoning_budget_path = false;
|
||||
if (parser.params_.grammar_lazy && !parser.params_.thinking_end_tag.empty()) {
|
||||
use_reasoning_budget_path = true;
|
||||
for (const auto & trigger : parser.params_.grammar_triggers) {
|
||||
if (trigger.type != COMMON_GRAMMAR_TRIGGER_TYPE_WORD) {
|
||||
use_reasoning_budget_path = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Determine the constrained portion of input to test against grammar
|
||||
std::string constrained = tc.input;
|
||||
// Find the earliest trigger position to determine the constrained portion
|
||||
auto earliest_trigger_pos = std::string::npos;
|
||||
|
||||
if (use_reasoning_budget_path) {
|
||||
// Reasoning-budget path: simulate thinking-aware trigger detection.
|
||||
// Walk through full_input tracking thinking state; only match triggers
|
||||
// when outside thinking blocks.
|
||||
const auto & think_start = parser.params_.thinking_start_tag;
|
||||
const auto & think_end = parser.params_.thinking_end_tag;
|
||||
|
||||
bool in_thinking = false;
|
||||
for (size_t i = 0; i < full_input.size(); ++i) {
|
||||
if (!in_thinking && !think_start.empty()
|
||||
&& full_input.compare(i, think_start.size(), think_start) == 0) {
|
||||
in_thinking = true;
|
||||
i += think_start.size() - 1;
|
||||
continue;
|
||||
}
|
||||
if (in_thinking && full_input.compare(i, think_end.size(), think_end) == 0) {
|
||||
in_thinking = false;
|
||||
i += think_end.size() - 1;
|
||||
continue;
|
||||
}
|
||||
if (in_thinking) {
|
||||
continue;
|
||||
}
|
||||
// Outside thinking — check if any trigger word starts here
|
||||
for (const auto & trigger : parser.params_.grammar_triggers) {
|
||||
if (full_input.compare(i, trigger.value.size(), trigger.value) == 0) {
|
||||
if (earliest_trigger_pos == std::string::npos || i < earliest_trigger_pos) {
|
||||
earliest_trigger_pos = i;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (earliest_trigger_pos != std::string::npos) {
|
||||
break; // found the earliest
|
||||
}
|
||||
}
|
||||
|
||||
// If the reasoning-budget path found no trigger outside thinking but the test
|
||||
// expects tool calls, this template nests tool calls inside thinking
|
||||
// blocks (e.g. Kimi). Fall back to the legacy path for this case.
|
||||
if (earliest_trigger_pos == std::string::npos && !tc.expect.tool_calls.empty()) {
|
||||
use_reasoning_budget_path = false;
|
||||
}
|
||||
}
|
||||
|
||||
if (!use_reasoning_budget_path) {
|
||||
// Legacy path: find triggers without thinking-awareness
|
||||
for (const auto & trigger : parser.params_.grammar_triggers) {
|
||||
size_t pos = std::string::npos;
|
||||
std::smatch match;
|
||||
switch (trigger.type) {
|
||||
case COMMON_GRAMMAR_TRIGGER_TYPE_WORD:
|
||||
{
|
||||
const auto & word = trigger.value;
|
||||
pos = full_input.find(word);
|
||||
break;
|
||||
}
|
||||
case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN:
|
||||
{
|
||||
const auto & compiled = std::regex(trigger.value);
|
||||
if (std::regex_search(full_input, match, compiled)) {
|
||||
pos = match.position(compiled.mark_count());
|
||||
}
|
||||
break;
|
||||
}
|
||||
case COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL:
|
||||
{
|
||||
// In production, PATTERN_FULL triggers are checked against
|
||||
// the text generated so far, growing token by token. Simulate
|
||||
// by trying every prefix of full_input.
|
||||
const auto & compiled = std::regex(trigger.value);
|
||||
for (size_t end = gen_prompt.size(); end <= full_input.size(); ++end) {
|
||||
std::string prefix = full_input.substr(0, end);
|
||||
if (std::regex_match(prefix, match, compiled)) {
|
||||
pos = std::string::npos;
|
||||
for (size_t gi = 1; gi < match.size(); ++gi) {
|
||||
if (match[gi].length() > 0) {
|
||||
pos = match.position(gi);
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (pos == std::string::npos) {
|
||||
pos = match.position(0);
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
default:
|
||||
throw std::runtime_error("Unknown trigger type");
|
||||
}
|
||||
if (pos != std::string::npos) {
|
||||
if (earliest_trigger_pos == std::string::npos || pos < earliest_trigger_pos) {
|
||||
earliest_trigger_pos = pos;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If the test expects tool calls and the grammar is lazy, the trigger must fire.
|
||||
// Otherwise the grammar would never activate in production and tool calls wouldn't
|
||||
// be constrained. A silent skip here would hide broken triggers.
|
||||
if (parser.params_.grammar_lazy && !tc.expect.tool_calls.empty() && !tc.is_partial
|
||||
&& earliest_trigger_pos == std::string::npos) {
|
||||
std::string trigger_desc;
|
||||
for (const auto & trigger : parser.params_.grammar_triggers) {
|
||||
trigger_desc += "\n [type=" + std::to_string(trigger.type) + "] " + trigger.value;
|
||||
}
|
||||
throw std::runtime_error(
|
||||
"Grammar trigger did not fire, but test expects tool calls (lazy grammar).\n"
|
||||
">>> Input: " + full_input + "\n"
|
||||
">>> Triggers (" + std::to_string(parser.params_.grammar_triggers.size()) + "):" + trigger_desc);
|
||||
}
|
||||
|
||||
// Determine the constrained portion of input to test against grammar.
|
||||
// If the trigger position falls inside the generation prompt, the grammar
|
||||
// sampler was already active before model output began — constrain from the
|
||||
// start of the model output (i.e. tc.input).
|
||||
std::string constrained = full_input;
|
||||
bool grammar_triggered = false;
|
||||
if (earliest_trigger_pos != std::string::npos) {
|
||||
constrained = tc.input.substr(earliest_trigger_pos);
|
||||
auto constrain_from = std::max(earliest_trigger_pos, gen_prompt.size());
|
||||
constrained = full_input.substr(constrain_from);
|
||||
grammar_triggered = true;
|
||||
} else if (!parser.params_.grammar_lazy) {
|
||||
// For non-lazy grammars, the entire input should match
|
||||
grammar_triggered = true;
|
||||
}
|
||||
|
||||
// For non-lazy grammars, prepend reasoning prefill to grammar input, just like
|
||||
// PEG parsing does. The grammar includes the full reasoning pattern (e.g. optional
|
||||
// <think>...</think>), but the model output may start mid-reasoning if the template
|
||||
// already placed the opening tag in the prompt.
|
||||
// For lazy grammars, the grammar only activates from the trigger position, so the
|
||||
// reasoning prefill is irrelevant — reasoning is handled by the PEG parser.
|
||||
if (!parser.params_.generation_prompt.empty() && earliest_trigger_pos == std::string::npos) {
|
||||
constrained = parser.params_.generation_prompt + constrained;
|
||||
}
|
||||
|
||||
// Test the constrained portion against the grammar
|
||||
if (grammar_triggered && !tc.is_partial) {
|
||||
auto result = match_string_detailed(constrained, grammar.get());
|
||||
@@ -1036,10 +1126,57 @@ static void test_peg_parser(common_chat_templates * tmpls,
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Global template filter for --template flag
|
||||
static std::string g_template_filter;
|
||||
// Reconstruction test: verify that appending the parsed message to the original
|
||||
// messages and re-rendering the template (without generation prompt) reproduces
|
||||
// the original prompt + input exactly, or as a proper prefix (the template may
|
||||
// append end-of-turn tokens after the assistant message).
|
||||
if ((tc.expect_reconstruction || g_force_reconstruction_test) && !tc.is_partial) {
|
||||
// Start from tc.expect but copy tool call arguments from the actual parser
|
||||
// output, which preserves original JSON formatting (e.g. {"arg1":1} vs {"arg1": 1}).
|
||||
auto reconstruction_msg = tc.expect;
|
||||
auto parsed_msg = parser.parse(tc.input, false);
|
||||
for (size_t i = 0; i < reconstruction_msg.tool_calls.size() && i < parsed_msg.tool_calls.size(); i++) {
|
||||
reconstruction_msg.tool_calls[i].arguments = parsed_msg.tool_calls[i].arguments;
|
||||
}
|
||||
common_chat_templates_inputs reconstruction_inputs = tc.params;
|
||||
reconstruction_inputs.messages.push_back(reconstruction_msg);
|
||||
reconstruction_inputs.add_generation_prompt = false;
|
||||
|
||||
auto reconstruction_params = common_chat_templates_apply(tmpls, reconstruction_inputs);
|
||||
std::string expected_text = parser.params_.prompt + tc.input;
|
||||
bool match = reconstruction_params.prompt == expected_text ||
|
||||
(reconstruction_params.prompt.size() > expected_text.size() &&
|
||||
reconstruction_params.prompt.compare(0, expected_text.size(), expected_text) == 0);
|
||||
if (!match && g_force_reconstruction_test && !tc.expect_reconstruction) {
|
||||
// In forced mode, report mismatch but don't fail
|
||||
// Find the first difference position
|
||||
size_t diff_pos = 0;
|
||||
size_t min_len = std::min(expected_text.size(), reconstruction_params.prompt.size());
|
||||
while (diff_pos < min_len && expected_text[diff_pos] == reconstruction_params.prompt[diff_pos]) {
|
||||
diff_pos++;
|
||||
}
|
||||
size_t ctx_start = diff_pos > 60 ? diff_pos - 60 : 0;
|
||||
size_t ctx_end_e = std::min(expected_text.size(), diff_pos + 40);
|
||||
size_t ctx_end_r = std::min(reconstruction_params.prompt.size(), diff_pos + 40);
|
||||
LOG_ERR("\x1b[31m[RECONSTRUCTION FAIL]\x1b[0m "
|
||||
"first diff at byte %zu (expected len=%zu, reconstructed len=%zu)\n"
|
||||
" expected: ...%s...\n"
|
||||
" reconstructed: ...%s...\n",
|
||||
diff_pos, expected_text.size(), reconstruction_params.prompt.size(),
|
||||
expected_text.substr(ctx_start, ctx_end_e - ctx_start).c_str(),
|
||||
reconstruction_params.prompt.substr(ctx_start, ctx_end_r - ctx_start).c_str());
|
||||
} else if (!match) {
|
||||
std::string error_msg =
|
||||
"Reconstruction mismatch:\n\n"
|
||||
">>> Expected (prompt + input):\n" + expected_text +
|
||||
"\n\n>>> Reconstructed:\n" + reconstruction_params.prompt;
|
||||
throw std::runtime_error(error_msg);
|
||||
} else if (g_force_reconstruction_test) {
|
||||
LOG_INF("\x1b[32m[RECONSTRUCTION OK]\x1b[0m\n");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Fluent builder for PEG parser tests
|
||||
class peg_test_builder;
|
||||
@@ -1099,6 +1236,11 @@ class peg_test_builder {
|
||||
return *this;
|
||||
}
|
||||
|
||||
peg_test_builder & expect_reconstruction(bool val = true) {
|
||||
tc_.expect_reconstruction = val;
|
||||
return *this;
|
||||
}
|
||||
|
||||
// Expect setters
|
||||
peg_test_builder & expect(const common_chat_msg & msg) {
|
||||
tc_.expect = msg;
|
||||
@@ -1272,16 +1414,18 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
// Ministral-3-14B-Reasoning-2512
|
||||
auto tst = peg_tester("models/templates/mistralai-Ministral-3-14B-Reasoning-2512.jinja", detailed_debug);
|
||||
|
||||
tst.test("Hello, world!\nWhat's up?").expect(message_assist).run();
|
||||
tst.test("Hello, world!\nWhat's up?").expect(message_assist).expect_reconstruction().run();
|
||||
|
||||
tst.test("[THINK]I'm\nthinking[/THINK]Hello, world!\nWhat's up?")
|
||||
.expect_content("[THINK]I'm\nthinking[/THINK]Hello, world!\nWhat's up?")
|
||||
.expect_reconstruction()
|
||||
.run();
|
||||
|
||||
tst.test("[THINK]I'm\nthinking[/THINK]Hello, world!\nWhat's up?")
|
||||
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
|
||||
.enable_thinking(true)
|
||||
.expect(message_assist_thoughts)
|
||||
.expect_reconstruction()
|
||||
.run();
|
||||
|
||||
tst.test(R"([TOOL_CALLS]special_function[ARGS]{"arg1":1})")
|
||||
@@ -1311,6 +1455,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
{ "special_function", R"({"arg1": 1})", {} },
|
||||
{ "special_function_with_opt", R"({"arg1": 1, "arg2": 2})", {} },
|
||||
})
|
||||
.expect_reconstruction()
|
||||
.run();
|
||||
|
||||
tst.test(
|
||||
@@ -1323,6 +1468,20 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
.expect_reasoning("I need to output the invoice details in JSON")
|
||||
.expect_content(R"({"amount": 123.45, "date": "2025-12-03"})")
|
||||
.run();
|
||||
|
||||
// fake tool call marker in reasoning
|
||||
tst.test(
|
||||
"[THINK]Let me think about [TOOL_CALLS]special_function[ARGS]{\"arg1\":1} and more[/THINK]"
|
||||
R"([TOOL_CALLS]special_function[ARGS]{"arg1": 1})")
|
||||
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
|
||||
.enable_thinking(true)
|
||||
.tools({ special_function_tool })
|
||||
.expect_reasoning("Let me think about [TOOL_CALLS]special_function[ARGS]{\"arg1\":1} and more")
|
||||
.expect_tool_calls({
|
||||
{ "special_function", R"({"arg1": 1})", {} },
|
||||
})
|
||||
.expect_reconstruction()
|
||||
.run();
|
||||
}
|
||||
|
||||
{
|
||||
@@ -1425,6 +1584,50 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
.expect_reasoning("I need to output the invoice details in JSON")
|
||||
.expect_content(R"({"amount": 123.45, "date": "2025-12-03"})")
|
||||
.run();
|
||||
|
||||
// tool call segment in reasoning
|
||||
tst.test(
|
||||
"Let's call a tool: <tool_call>\n"
|
||||
"<function=python>\n"
|
||||
"<parameter=code>\n"
|
||||
"def hello():\n"
|
||||
" print(\"Not the real call!\")\n"
|
||||
"\n"
|
||||
"hello()\n"
|
||||
"</parameter>\n"
|
||||
"</function>\n"
|
||||
"</tool_call></think>\n"
|
||||
"<tool_call>\n"
|
||||
"<function=python>\n"
|
||||
"<parameter=code>\n"
|
||||
"def hello():\n"
|
||||
" print(\"Hello, world!\")\n"
|
||||
"\n"
|
||||
"hello()\n"
|
||||
"</parameter>\n"
|
||||
"</function>\n"
|
||||
"</tool_call>"
|
||||
)
|
||||
.enable_thinking(true)
|
||||
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
|
||||
.tools({
|
||||
python_tool
|
||||
})
|
||||
.expect_reasoning("Let's call a tool: <tool_call>\n"
|
||||
"<function=python>\n"
|
||||
"<parameter=code>\n"
|
||||
"def hello():\n"
|
||||
" print(\"Not the real call!\")\n"
|
||||
"\n"
|
||||
"hello()\n"
|
||||
"</parameter>\n"
|
||||
"</function>\n"
|
||||
"</tool_call>")
|
||||
.expect_tool_calls({
|
||||
{ "python", "{\"code\": \"def hello():\\n print(\\\"Hello, world!\\\")\\n\\nhello()\"}", {} },
|
||||
})
|
||||
.run();
|
||||
|
||||
}
|
||||
|
||||
{
|
||||
@@ -1481,9 +1684,9 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
// Google Gemma 2 2B - does not support tool calling
|
||||
auto tst = peg_tester("models/templates/google-gemma-2-2b-it.jinja");
|
||||
|
||||
tst.test("Hello, world!").expect(simple_assist_msg("Hello, world!")).run();
|
||||
tst.test("Hello, world!").expect(simple_assist_msg("Hello, world!")).expect_reconstruction().run();
|
||||
|
||||
tst.test("Line 1\nLine 2\nLine 3").expect(simple_assist_msg("Line 1\nLine 2\nLine 3")).run();
|
||||
tst.test("Line 1\nLine 2\nLine 3").expect(simple_assist_msg("Line 1\nLine 2\nLine 3")).expect_reconstruction().run();
|
||||
}
|
||||
|
||||
{
|
||||
@@ -1526,7 +1729,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
// Test simple content-only template
|
||||
auto tst = peg_tester("models/templates/google-gemma-2-2b-it.jinja", detailed_debug);
|
||||
|
||||
tst.test("Hello, world!\nWhat's up?").expect(message_assist).run();
|
||||
tst.test("Hello, world!\nWhat's up?").expect(message_assist).expect_reconstruction().run();
|
||||
}
|
||||
{
|
||||
// IBM Granite (reasoning and tool calling model)
|
||||
@@ -1638,7 +1841,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
// Qwen3-Coder (tool calling with XML-style format)
|
||||
auto tst = peg_tester("models/templates/Qwen3-Coder.jinja", detailed_debug);
|
||||
|
||||
tst.test("Hello, world!\nWhat's up?").expect(message_assist).run();
|
||||
tst.test("Hello, world!\nWhat's up?").expect(message_assist).expect_reconstruction().run();
|
||||
|
||||
tst.test(
|
||||
"<tool_call>\n"
|
||||
@@ -1650,6 +1853,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
"</tool_call>")
|
||||
.tools({ special_function_tool })
|
||||
.expect(message_assist_call)
|
||||
.expect_reconstruction()
|
||||
.run();
|
||||
|
||||
tst.test(
|
||||
@@ -1678,6 +1882,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
{ "special_function", R"({"arg1": 1})", {} },
|
||||
{ "special_function_with_opt", R"({"arg1": 1, "arg2": 2})", {} },
|
||||
})
|
||||
.expect_reconstruction()
|
||||
.run();
|
||||
|
||||
// Test with code content (multiline)
|
||||
@@ -1698,6 +1903,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
.expect_tool_calls({
|
||||
{ "python", "{\"code\": \"def hello():\\n print(\\\"Hello, world!\\\")\\n\\nhello()\"}", {} },
|
||||
})
|
||||
.expect_reconstruction()
|
||||
.run();
|
||||
|
||||
// Test with code content (asian unicode chars)
|
||||
@@ -1715,6 +1921,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
.expect_tool_calls({
|
||||
{ "python", "{\"code\": \"格\"}", {} },
|
||||
})
|
||||
.expect_reconstruction()
|
||||
.run();
|
||||
|
||||
// Test with HTML tag content
|
||||
@@ -1736,6 +1943,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
.expect_tool_calls({
|
||||
{ "html", "{\"markup\": \"<html>\\n <head>\\n <title>Hello!</title>\\n </head>\\n</html>\"}", {} },
|
||||
})
|
||||
.expect_reconstruction()
|
||||
.run();
|
||||
|
||||
// Test with TODO list (array of objects)
|
||||
@@ -1753,6 +1961,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
.expect_tool_calls({
|
||||
{ "todo_list", "{\"todos\": [{\"item\": \"Check stuff\", \"selected\": false}, {\"item\": \"Prepare stuff\", \"selected\": true}]}", {} },
|
||||
})
|
||||
.expect_reconstruction()
|
||||
.run();
|
||||
|
||||
// Test flexible optional argument ordering (2 required + 4 optional, reversed optional order)
|
||||
@@ -1769,6 +1978,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
.expect_tool_calls({
|
||||
{ "tool_2req_4opt", R"({"req1": "hello", "req2": 42, "opt4": 100, "opt2": 200})", {} },
|
||||
})
|
||||
.expect_reconstruction()
|
||||
.run();
|
||||
|
||||
// Test flexible optional argument ordering (2 required + 5 optional, reversed optional order)
|
||||
@@ -1786,6 +1996,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
.expect_tool_calls({
|
||||
{ "tool_2req_5opt", R"({"req1": "world", "req2": 7, "opt5": "last", "opt3": "middle", "opt1": "first"})", {} },
|
||||
})
|
||||
.expect_reconstruction()
|
||||
.run();
|
||||
|
||||
// Test flexible optional argument ordering (2 required + 5 optional, all 5 in shuffled order)
|
||||
@@ -1805,6 +2016,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
.expect_tool_calls({
|
||||
{ "tool_2req_5opt", R"({"req1": "test", "req2": 99, "opt3": "c", "opt1": "a", "opt5": "e", "opt4": 4, "opt2": 2})", {} },
|
||||
})
|
||||
.expect_reconstruction()
|
||||
.run();
|
||||
}
|
||||
{
|
||||
@@ -1885,6 +2097,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
tst.test("Hello, world!\nWhat's up?")
|
||||
.enable_thinking(false)
|
||||
.expect(message_assist)
|
||||
.expect_reconstruction()
|
||||
.run();
|
||||
|
||||
// Reasoning with content (forced-open mode - input starts after <think>)
|
||||
@@ -1892,6 +2105,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
.enable_thinking(true)
|
||||
.reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK)
|
||||
.expect(message_assist_thoughts)
|
||||
.expect_reconstruction()
|
||||
.run();
|
||||
|
||||
// Tool call without reasoning
|
||||
@@ -1902,6 +2116,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
.enable_thinking(false)
|
||||
.tools({ special_function_tool })
|
||||
.expect(message_assist_call)
|
||||
.expect_reconstruction()
|
||||
.run();
|
||||
|
||||
// Tool call with reasoning (forced-open mode)
|
||||
@@ -1914,6 +2129,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
.reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK)
|
||||
.tools({ special_function_tool })
|
||||
.expect(message_assist_call_thoughts)
|
||||
.expect_reconstruction()
|
||||
.run();
|
||||
|
||||
tst.test(
|
||||
@@ -1933,6 +2149,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
{ "special_function", R"({"arg1": 1})", {} },
|
||||
{ "special_function_with_opt", R"({"arg1": 1, "arg2": 2})", {} },
|
||||
})
|
||||
.expect_reconstruction()
|
||||
.run();
|
||||
|
||||
// #20650: tool with no required args, model emits <tool_call>name</tool_call> with no arg tags.
|
||||
@@ -1950,6 +2167,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
.tools({ no_args_tool })
|
||||
.expect_reasoning("Let me read the diff content.")
|
||||
.expect_tool_calls({{ "read_file_diff_md", "{}", {} }})
|
||||
.expect_reconstruction()
|
||||
.run();
|
||||
}
|
||||
}
|
||||
@@ -2208,22 +2426,24 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
|
||||
// Kimi-K2 old template
|
||||
auto tst = peg_tester("models/templates/moonshotai-Kimi-K2.jinja", detailed_debug);
|
||||
tst.test("Hello, world!\nWhat's up?").expect(message_assist).run();
|
||||
tst.test("Hello, world!\nWhat's up?").expect(message_assist).expect_reconstruction().run();
|
||||
tst.test(
|
||||
"<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:0<|tool_call_argument_begin|>"
|
||||
"{\"arg1\": 1}<|tool_call_end|><|tool_calls_section_end|>")
|
||||
.tools({ special_function_tool })
|
||||
.expect(kimi_id_special_func_tool_call)
|
||||
.expect_reconstruction()
|
||||
.run();
|
||||
|
||||
// Kimi-K2-Instruct
|
||||
auto tst2 = peg_tester("models/templates/Kimi-K2-Instruct.jinja", detailed_debug);
|
||||
tst2.test("Hello, world!\nWhat's up?").expect(message_assist).run();
|
||||
tst2.test("Hello, world!\nWhat's up?").expect(message_assist).expect_reconstruction().run();
|
||||
tst2.test(
|
||||
"<|tool_calls_section_begin|><|tool_call_begin|>functions.special_function:0<|tool_call_argument_begin|>"
|
||||
"{\"arg1\": 1}<|tool_call_end|><|tool_calls_section_end|>")
|
||||
.tools({ special_function_tool })
|
||||
.expect(kimi_id_special_func_tool_call)
|
||||
.expect_reconstruction()
|
||||
.run();
|
||||
}
|
||||
|
||||
@@ -2297,6 +2517,19 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
.tools({ empty_args_tool })
|
||||
.expect(simple_assist_msg("", "", "empty_args", "{}"))
|
||||
.run();
|
||||
|
||||
// fake tool call marker in reasoning
|
||||
tst.test(
|
||||
"<think>Let me think about <|tool_call_start|>[special_function(arg1=1)]<|tool_call_end|> hmm</think>"
|
||||
"<|tool_call_start|>[special_function(arg1=1)]<|tool_call_end|>")
|
||||
.enable_thinking(true)
|
||||
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
|
||||
.tools({ special_function_tool })
|
||||
.expect_reasoning("Let me think about <|tool_call_start|>[special_function(arg1=1)]<|tool_call_end|> hmm")
|
||||
.expect_tool_calls({
|
||||
{ "special_function", R"({"arg1": 1})", {} },
|
||||
})
|
||||
.run();
|
||||
}
|
||||
|
||||
// Apertus-8B-Instruct tests - FUNC_NAME_AS_KEY format
|
||||
@@ -2306,6 +2539,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
tst.test("<|tools_prefix|>[{\"special_function\": {\"arg1\": 1}}]<|tools_suffix|>")
|
||||
.tools({ special_function_tool })
|
||||
.expect(message_assist_call)
|
||||
.expect_reconstruction()
|
||||
.run();
|
||||
}
|
||||
|
||||
@@ -2314,7 +2548,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
{
|
||||
auto tst = peg_tester("models/templates/MiniMax-M2.jinja", detailed_debug);
|
||||
tst.test(
|
||||
"</think><minimax:tool_call>\n<invoke name=\"special_function\">\n<parameter "
|
||||
"<minimax:tool_call>\n<invoke name=\"special_function\">\n<parameter "
|
||||
"name=\"arg1\">1</parameter>\n</invoke>\n</minimax:tool_call>")
|
||||
.tools({ special_function_tool })
|
||||
.expect(message_assist_call)
|
||||
@@ -2364,37 +2598,41 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
// mistralai-Mistral-Nemo-Instruct-2407.jinja
|
||||
{
|
||||
auto tst = peg_tester("models/templates/mistralai-Mistral-Nemo-Instruct-2407.jinja", detailed_debug);
|
||||
tst.test("Hello, world!\nWhat's up?").expect(message_assist).run();
|
||||
tst.test("Hello, world!\nWhat's up?").expect(message_assist).expect_reconstruction().run();
|
||||
tst.test("[TOOL_CALLS][{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}, \"id\": \"123456789\"}]")
|
||||
.tools({ special_function_tool })
|
||||
.expect(message_assist_call_id)
|
||||
.expect_reconstruction()
|
||||
.run();
|
||||
}
|
||||
{
|
||||
auto tst = peg_tester("models/templates/meetkai-functionary-medium-v3.1.jinja", detailed_debug);
|
||||
tst.test("Hello, world!\nWhat's up?").expect(message_assist).run();
|
||||
tst.test("Hello, world!\nWhat's up?").expect(message_assist).expect_reconstruction().run();
|
||||
tst.test("<function=special_function>{\"arg1\": 1}</function>")
|
||||
.tools({ special_function_tool })
|
||||
.expect(message_assist_call)
|
||||
.expect_reconstruction()
|
||||
.run();
|
||||
}
|
||||
// Functionary v3.2 - recipient-based format: >>>recipient\n{content}
|
||||
{
|
||||
auto tst = peg_tester("models/templates/meetkai-functionary-medium-v3.2.jinja", detailed_debug);
|
||||
tst.test("all\nHello, world!\nWhat's up?").expect(message_assist).run();
|
||||
tst.test("all\nHello, world!\nWhat's up?").expect(message_assist).expect_reconstruction().run();
|
||||
tst.test("special_function\n{\"arg1\": 1}")
|
||||
.tools({ special_function_tool })
|
||||
.expect(message_assist_call)
|
||||
.expect_reconstruction()
|
||||
.run();
|
||||
}
|
||||
|
||||
// FireFunction
|
||||
{
|
||||
auto tst = peg_tester("models/templates/fireworks-ai-llama-3-firefunction-v2.jinja", detailed_debug);
|
||||
tst.test("Hello, world!\nWhat's up?").expect(message_assist).run();
|
||||
tst.test("Hello, world!\nWhat's up?").expect(message_assist).expect_reconstruction().run();
|
||||
tst.test(" functools[{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}]")
|
||||
.tools({ special_function_tool })
|
||||
.expect(message_assist_call)
|
||||
.expect_reconstruction()
|
||||
.run();
|
||||
}
|
||||
|
||||
@@ -2455,10 +2693,11 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
{ "models/templates/MiMo-VL.jinja", "models/templates/NousResearch-Hermes-3-Llama-3.1-8B-tool_use.jinja",
|
||||
"models/templates/Qwen-Qwen2.5-7B-Instruct.jinja" }) {
|
||||
auto tst = peg_tester(path, detailed_debug);
|
||||
tst.test("Hello, world!\nWhat's up?").expect(message_assist).run();
|
||||
tst.test("Hello, world!\nWhat's up?").expect(message_assist).expect_reconstruction().run();
|
||||
tst.test("<tool_call>\n{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}\n</tool_call>")
|
||||
.tools({ special_function_tool })
|
||||
.expect(message_assist_call)
|
||||
.expect_reconstruction()
|
||||
.run();
|
||||
}
|
||||
|
||||
@@ -2481,6 +2720,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
|
||||
.enable_thinking(true)
|
||||
.expect(simple_assist_msg("Hello, world!\nWhat's up?", "Here are my reasoning steps:\nI'm\nthinking"))
|
||||
.expect_reconstruction()
|
||||
.run();
|
||||
|
||||
// Reasoning + Tool calls
|
||||
@@ -2497,42 +2737,45 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
// Mistral Small 3.2 - FUNC_BRACKET_TAG format: [TOOL_CALLS]func_name[CALL_ID]id[ARGS]{...}
|
||||
{
|
||||
auto tst = peg_tester("models/templates/Mistral-Small-3.2-24B-Instruct-2506.jinja", detailed_debug);
|
||||
tst.test("Hello, world!\nWhat's up?").expect(message_assist).run();
|
||||
tst.test("Hello, world!\nWhat's up?").expect(message_assist).expect_reconstruction().run();
|
||||
tst.test("[TOOL_CALLS]special_function[CALL_ID]123456789[ARGS]{\"arg1\": 1}")
|
||||
.tools({ special_function_tool })
|
||||
.expect(message_assist_call_id)
|
||||
.expect_reconstruction()
|
||||
.run();
|
||||
}
|
||||
// Devstral
|
||||
{
|
||||
auto tst = peg_tester("models/templates/unsloth-mistral-Devstral-Small-2507.jinja", detailed_debug);
|
||||
tst.test("Hello, world!\nWhat's up?").expect(message_assist).run();
|
||||
tst.test("Hello, world!\nWhat's up?").expect(message_assist).expect_reconstruction().run();
|
||||
tst.test("[TOOL_CALLS]special_function[ARGS]{\"arg1\": 1}")
|
||||
.tools({ special_function_tool })
|
||||
.expect(message_assist_call)
|
||||
.expect_reconstruction()
|
||||
.run();
|
||||
tst.test("Hello, world!\nWhat's up?[TOOL_CALLS]special_function[ARGS]{\"arg1\": 1}")
|
||||
.tools({ special_function_tool })
|
||||
.expect(message_assist_call_content)
|
||||
.expect_reconstruction()
|
||||
.run();
|
||||
}
|
||||
|
||||
{
|
||||
// Llama 3.1
|
||||
auto tst = peg_tester("models/templates/meta-llama-Llama-3.1-8B-Instruct.jinja", detailed_debug);
|
||||
tst.test("Hello, world!\nWhat's up?").tools({ special_function_tool }).expect(message_assist).run();
|
||||
tst.test("Hello, world!\nWhat's up?").tools({ special_function_tool }).expect(message_assist).expect_reconstruction().run();
|
||||
}
|
||||
|
||||
{
|
||||
// Llama 3.2
|
||||
auto tst = peg_tester("models/templates/meta-llama-Llama-3.2-3B-Instruct.jinja", detailed_debug);
|
||||
tst.test("Hello, world!\nWhat's up?").tools({ special_function_tool }).expect(message_assist).run();
|
||||
tst.test("Hello, world!\nWhat's up?").tools({ special_function_tool }).expect(message_assist).expect_reconstruction().run();
|
||||
}
|
||||
|
||||
{
|
||||
// Llama 3.3
|
||||
auto tst = peg_tester("models/templates/meta-llama-Llama-3.3-70B-Instruct.jinja", detailed_debug);
|
||||
tst.test("Hello, world!\nWhat's up?").tools({ python_tool }).expect(message_assist).run();
|
||||
tst.test("Hello, world!\nWhat's up?").tools({ python_tool }).expect(message_assist).expect_reconstruction().run();
|
||||
}
|
||||
|
||||
// GPT-OSS format tests
|
||||
@@ -2836,10 +3079,11 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
// GigaChat V3
|
||||
{
|
||||
auto tst = peg_tester("models/templates/GigaChat3-10B-A1.8B.jinja", detailed_debug);
|
||||
tst.test("Hello, world!\nWhat's up?").expect(message_assist).run();
|
||||
tst.test("Hello, world!\nWhat's up?").expect(message_assist).expect_reconstruction().run();
|
||||
tst.test("<|message_sep|>\n\nfunction call<|role_sep|>\n{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}")
|
||||
.tools({ special_function_tool })
|
||||
.expect(message_assist_call)
|
||||
.expect_reconstruction()
|
||||
.run();
|
||||
|
||||
tst.test(
|
||||
@@ -2848,16 +3092,18 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
)
|
||||
.tools({ special_function_tool })
|
||||
.expect(message_assist_call_content)
|
||||
.expect_reconstruction()
|
||||
.run();
|
||||
}
|
||||
|
||||
// GigaChat V3.1
|
||||
{
|
||||
auto tst = peg_tester("models/templates/GigaChat3.1-10B-A1.8B.jinja", detailed_debug);
|
||||
tst.test("Hello, world!\nWhat's up?").expect(message_assist).run();
|
||||
tst.test("Hello, world!\nWhat's up?").expect(message_assist).expect_reconstruction().run();
|
||||
tst.test("<|function_call|>{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}")
|
||||
.tools({ special_function_tool })
|
||||
.expect(message_assist_call)
|
||||
.expect_reconstruction()
|
||||
.run();
|
||||
|
||||
tst.test(
|
||||
@@ -2866,6 +3112,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
)
|
||||
.tools({ special_function_tool })
|
||||
.expect(message_assist_call_content)
|
||||
.expect_reconstruction()
|
||||
.run();
|
||||
}
|
||||
}
|
||||
@@ -3002,6 +3249,10 @@ int main(int argc, char ** argv) {
|
||||
detailed_debug = true;
|
||||
common_log_set_verbosity_thold(999);
|
||||
}
|
||||
if (arg == "--force-reconstruction-test") {
|
||||
g_force_reconstruction_test = true;
|
||||
only_run_filtered = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (only_run_filtered) {
|
||||
|
||||
@@ -61,8 +61,6 @@ static void test_reasoning_budget(
|
||||
|
||||
// Feed the sequence and track when forcing occurs
|
||||
for (size_t i = 0; i < sequence.size(); i++) {
|
||||
llama_sampler_accept(sampler, sequence[i]);
|
||||
|
||||
// Check if we're in forcing state by applying and seeing if logits are modified
|
||||
cur_p.selected = -1;
|
||||
for (size_t j = 0; j < cur.size(); j++) {
|
||||
@@ -81,6 +79,8 @@ static void test_reasoning_budget(
|
||||
}
|
||||
}
|
||||
|
||||
llama_sampler_accept(sampler, sequence[i]);
|
||||
|
||||
fprintf(stderr, " i=%zu: token=%d, finite_count=%zu, finite_token=%d\n", i, (int)sequence[i], finite_count, (int)finite_token);
|
||||
|
||||
if (finite_count == 1) {
|
||||
@@ -167,9 +167,9 @@ int main(void) {
|
||||
}
|
||||
|
||||
// Test 2: Budget exhausted, forcing should occur
|
||||
// Flow: i=0 accept(100)->COUNTING, i=1 accept(50)->remaining=1, i=2 accept(51)->remaining=0->FORCING
|
||||
// Forcing is active at i=2 and i=3 (when apply() is called while in FORCING state)
|
||||
// At i=4, force_pos becomes 2 which equals forced_tokens.size(), so state becomes DONE
|
||||
// Flow: i=0 apply()->passthrough, accept(100)->COUNTING; i=1 accept(50)->remaining=1
|
||||
// i=2 accept(51)->remaining=0->FORCING; i=3 apply() forces token[0]; i=4 apply() forces token[1]
|
||||
// At i=4, accept() advances force_pos to 2 which equals forced_tokens.size(), so state becomes DONE
|
||||
{
|
||||
const std::vector<llama_token> start = {100};
|
||||
const std::vector<llama_token> end = {101};
|
||||
@@ -179,13 +179,12 @@ int main(void) {
|
||||
test_reasoning_budget("budget exhausted forcing", sequence, start, end, forced,
|
||||
2, // budget of 2 tokens
|
||||
REASONING_BUDGET_IDLE,
|
||||
2, // forcing starts at i=2 (after accept(51) depletes budget, apply() forces)
|
||||
3); // forcing continues through i=3 (at i=4 state becomes DONE)
|
||||
3, // forcing starts at i=3 (accept at i=2 depletes budget, apply at i=3 forces)
|
||||
4); // forcing continues through i=4 (accept at i=4 transitions to DONE)
|
||||
}
|
||||
|
||||
// Test 3: Activate immediately with budget=0, forcing should start right away
|
||||
// Flow: Since no start token in sequence, state stays IDLE (no start/end configured means passthrough)
|
||||
// This test needs start token to be in the sequence or use activate_immediately with start token present
|
||||
// Flow: init promotes COUNTING+budget=0 to FORCING, so apply() sees FORCING at i=0
|
||||
{
|
||||
const std::vector<llama_token> start = {100};
|
||||
const std::vector<llama_token> end = {101};
|
||||
@@ -195,8 +194,8 @@ int main(void) {
|
||||
test_reasoning_budget("activate immediately budget=0", sequence, start, end, forced,
|
||||
0, // budget of 0 tokens
|
||||
REASONING_BUDGET_COUNTING, // starts counting, promoted to FORCING since budget=0
|
||||
0, // forcing starts at i=0 (after accept(100), budget=0 goes straight to FORCING)
|
||||
1); // forcing continues through i=1 (at i=2 state becomes DONE)
|
||||
0, // forcing starts at i=0 (initialized in FORCING, apply forces immediately)
|
||||
1); // forcing continues through i=1 (accept at i=1 transitions to DONE)
|
||||
}
|
||||
|
||||
// Test 4: No start/end tokens configured - passthrough (no forcing)
|
||||
@@ -214,7 +213,7 @@ int main(void) {
|
||||
|
||||
// Test 5: Activate immediately with budget > 0, count down then force
|
||||
// Flow: i=0 accept(50)->remaining=1, i=1 accept(51)->remaining=0->FORCING
|
||||
// So forcing starts at i=1 (apply after accept sees FORCING with force_pos=0)
|
||||
// Forcing starts at i=2 (apply sees FORCING after accept at i=1 transitioned)
|
||||
{
|
||||
const std::vector<llama_token> start = {100};
|
||||
const std::vector<llama_token> end = {101};
|
||||
@@ -224,8 +223,8 @@ int main(void) {
|
||||
test_reasoning_budget("activate immediately with budget", sequence, start, end, forced,
|
||||
2, // budget of 2 tokens
|
||||
REASONING_BUDGET_COUNTING,
|
||||
1, // forcing starts at i=1 (after 2 accepts deplete budget)
|
||||
2); // forcing continues through i=2
|
||||
2, // forcing starts at i=2 (after 2 accepts deplete budget, apply at i=2 forces)
|
||||
3); // forcing continues through i=3
|
||||
}
|
||||
|
||||
printf("OK (5 tests passed)\n");
|
||||
|
||||
@@ -100,7 +100,7 @@ struct cli_context {
|
||||
}
|
||||
|
||||
// reasoning budget sampler
|
||||
if (reasoning_budget >= 0 && !chat_params.thinking_end_tag.empty()) {
|
||||
if (!chat_params.thinking_end_tag.empty()) {
|
||||
const llama_vocab * vocab = llama_model_get_vocab(
|
||||
llama_get_model(ctx_server.get_llama_context()));
|
||||
|
||||
@@ -224,10 +224,11 @@ struct cli_context {
|
||||
};
|
||||
|
||||
// TODO?: Make this reusable, enums, docs
|
||||
static const std::array<const std::string, 6> cmds = {
|
||||
static const std::array<const std::string, 7> cmds = {
|
||||
"/audio ",
|
||||
"/clear",
|
||||
"/exit",
|
||||
"/glob ",
|
||||
"/image ",
|
||||
"/read ",
|
||||
"/regen",
|
||||
@@ -258,7 +259,7 @@ static std::vector<std::pair<std::string, size_t>> auto_completion_callback(std:
|
||||
}
|
||||
}
|
||||
|
||||
if (!cmd.empty() && line.length() >= cmd.length() && cursor_byte_pos >= cmd.length()) {
|
||||
if (!cmd.empty() && cmd != "/glob " && line.length() >= cmd.length() && cursor_byte_pos >= cmd.length()) {
|
||||
const std::string path_prefix = std::string(line.substr(cmd.length(), cursor_byte_pos - cmd.length()));
|
||||
const std::string path_postfix = std::string(line.substr(cursor_byte_pos));
|
||||
auto cur_dir = std::filesystem::current_path();
|
||||
@@ -339,6 +340,8 @@ static std::vector<std::pair<std::string, size_t>> auto_completion_callback(std:
|
||||
return matches;
|
||||
}
|
||||
|
||||
static constexpr size_t FILE_GLOB_MAX_RESULTS = 100;
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
common_params params;
|
||||
|
||||
@@ -430,7 +433,8 @@ int main(int argc, char ** argv) {
|
||||
console::log(" /exit or Ctrl+C stop or exit\n");
|
||||
console::log(" /regen regenerate the last response\n");
|
||||
console::log(" /clear clear the chat history\n");
|
||||
console::log(" /read add a text file\n");
|
||||
console::log(" /read <file> add a text file\n");
|
||||
console::log(" /glob <pattern> add text files using globbing pattern\n");
|
||||
if (inf.has_inp_image) {
|
||||
console::log(" /image <file> add an image file\n");
|
||||
}
|
||||
@@ -441,6 +445,27 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// interactive loop
|
||||
std::string cur_msg;
|
||||
|
||||
auto add_text_file = [&](const std::string & fname) -> bool {
|
||||
std::string marker = ctx_cli.load_input_file(fname, false);
|
||||
if (marker.empty()) {
|
||||
console::error("file does not exist or cannot be opened: '%s'\n", fname.c_str());
|
||||
return false;
|
||||
}
|
||||
if (inf.fim_sep_token != LLAMA_TOKEN_NULL) {
|
||||
cur_msg += common_token_to_piece(ctx_cli.ctx_server.get_llama_context(), inf.fim_sep_token, true);
|
||||
cur_msg += fname;
|
||||
cur_msg.push_back('\n');
|
||||
} else {
|
||||
cur_msg += "--- File: ";
|
||||
cur_msg += fname;
|
||||
cur_msg += " ---\n";
|
||||
}
|
||||
cur_msg += marker;
|
||||
console::log("Loaded text from '%s'\n", fname.c_str());
|
||||
return true;
|
||||
};
|
||||
|
||||
while (true) {
|
||||
std::string buffer;
|
||||
console::set_display(DISPLAY_TYPE_USER_INPUT);
|
||||
@@ -525,22 +550,60 @@ int main(int argc, char ** argv) {
|
||||
continue;
|
||||
} else if (string_starts_with(buffer, "/read ")) {
|
||||
std::string fname = string_strip(buffer.substr(6));
|
||||
std::string marker = ctx_cli.load_input_file(fname, false);
|
||||
if (marker.empty()) {
|
||||
console::error("file does not exist or cannot be opened: '%s'\n", fname.c_str());
|
||||
continue;
|
||||
add_text_file(fname);
|
||||
continue;
|
||||
} else if (string_starts_with(buffer, "/glob ")) {
|
||||
std::error_code ec;
|
||||
size_t count = 0;
|
||||
auto curdir = std::filesystem::current_path();
|
||||
std::string pattern = string_strip(buffer.substr(6));
|
||||
std::filesystem::path rel_path;
|
||||
|
||||
auto startglob = pattern.find_first_of("![*?");
|
||||
if (startglob != std::string::npos && startglob != 0) {
|
||||
auto endpath = pattern.substr(0, startglob).find_last_of('/');
|
||||
if (endpath != std::string::npos) {
|
||||
std::string rel_pattern = pattern.substr(0, endpath);
|
||||
#if !defined(_WIN32)
|
||||
if (string_starts_with(rel_pattern, "~")) {
|
||||
const char * home = std::getenv("HOME");
|
||||
if (home && home[0]) {
|
||||
rel_pattern = std::string(home) + rel_pattern.substr(1);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
rel_path = rel_pattern;
|
||||
pattern.erase(0, endpath + 1);
|
||||
curdir /= rel_path;
|
||||
}
|
||||
}
|
||||
if (inf.fim_sep_token != LLAMA_TOKEN_NULL) {
|
||||
cur_msg += common_token_to_piece(ctx_cli.ctx_server.get_llama_context(), inf.fim_sep_token, true);
|
||||
cur_msg += fname;
|
||||
cur_msg.push_back('\n');
|
||||
} else {
|
||||
cur_msg += "--- File: ";
|
||||
cur_msg += fname;
|
||||
cur_msg += " ---\n";
|
||||
|
||||
for (const auto & entry : std::filesystem::recursive_directory_iterator(curdir,
|
||||
std::filesystem::directory_options::skip_permission_denied, ec)) {
|
||||
if (!entry.is_regular_file()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
std::string rel = std::filesystem::relative(entry.path(), curdir, ec).string();
|
||||
if (ec) {
|
||||
ec.clear();
|
||||
continue;
|
||||
}
|
||||
std::replace(rel.begin(), rel.end(), '\\', '/');
|
||||
|
||||
if (!glob_match(pattern, rel)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!add_text_file((rel_path / rel).string())) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (++count >= FILE_GLOB_MAX_RESULTS) {
|
||||
console::error("Maximum number of globbed files allowed (%zu) reached.\n", FILE_GLOB_MAX_RESULTS);
|
||||
break;
|
||||
}
|
||||
}
|
||||
cur_msg += marker;
|
||||
console::log("Loaded text from '%s'\n", fname.c_str());
|
||||
continue;
|
||||
} else {
|
||||
// not a command
|
||||
|
||||
@@ -37,22 +37,29 @@ set(TARGET_SRCS
|
||||
server-models.cpp
|
||||
server-models.h
|
||||
)
|
||||
set(PUBLIC_ASSETS
|
||||
index.html.gz
|
||||
loading.html
|
||||
)
|
||||
|
||||
foreach(asset ${PUBLIC_ASSETS})
|
||||
set(input "${CMAKE_CURRENT_SOURCE_DIR}/public/${asset}")
|
||||
set(output "${CMAKE_CURRENT_BINARY_DIR}/${asset}.hpp")
|
||||
list(APPEND TARGET_SRCS ${output})
|
||||
add_custom_command(
|
||||
DEPENDS "${input}"
|
||||
OUTPUT "${output}"
|
||||
COMMAND "${CMAKE_COMMAND}" "-DINPUT=${input}" "-DOUTPUT=${output}" -P "${PROJECT_SOURCE_DIR}/scripts/xxd.cmake"
|
||||
option(LLAMA_BUILD_WEBUI "Build the embedded Web UI" ON)
|
||||
|
||||
if (LLAMA_BUILD_WEBUI)
|
||||
set(PUBLIC_ASSETS
|
||||
index.html.gz
|
||||
loading.html
|
||||
)
|
||||
set_source_files_properties(${output} PROPERTIES GENERATED TRUE)
|
||||
endforeach()
|
||||
|
||||
foreach(asset ${PUBLIC_ASSETS})
|
||||
set(input "${CMAKE_CURRENT_SOURCE_DIR}/public/${asset}")
|
||||
set(output "${CMAKE_CURRENT_BINARY_DIR}/${asset}.hpp")
|
||||
list(APPEND TARGET_SRCS ${output})
|
||||
add_custom_command(
|
||||
DEPENDS "${input}"
|
||||
OUTPUT "${output}"
|
||||
COMMAND "${CMAKE_COMMAND}" "-DINPUT=${input}" "-DOUTPUT=${output}" -P "${PROJECT_SOURCE_DIR}/scripts/xxd.cmake"
|
||||
)
|
||||
set_source_files_properties(${output} PROPERTIES GENERATED TRUE)
|
||||
endforeach()
|
||||
add_definitions(-DLLAMA_BUILD_WEBUI)
|
||||
else()
|
||||
endif()
|
||||
|
||||
add_executable(${TARGET} ${TARGET_SRCS})
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
|
||||
@@ -188,6 +188,7 @@ For the full list of features, please refer to [server's changelog](https://gith
|
||||
| `--tags STRING` | set model tags, comma-separated (informational, not used for routing)<br/>(env: LLAMA_ARG_TAGS) |
|
||||
| `--host HOST` | ip address to listen, or bind to an UNIX socket if the address ends with .sock (default: 127.0.0.1)<br/>(env: LLAMA_ARG_HOST) |
|
||||
| `--port PORT` | port to listen (default: 8080)<br/>(env: LLAMA_ARG_PORT) |
|
||||
| `--reuse-port` | allow multiple sockets to bind to the same port (default: disabled)<br/>(env: LLAMA_ARG_REUSE_PORT) |
|
||||
| `--path PATH` | path to serve static files from (default: )<br/>(env: LLAMA_ARG_STATIC_PATH) |
|
||||
| `--api-prefix PREFIX` | prefix path the server serves from, without the trailing slash (default: )<br/>(env: LLAMA_ARG_API_PREFIX) |
|
||||
| `--webui-config JSON` | JSON that provides default WebUI settings (overrides WebUI defaults)<br/>(env: LLAMA_ARG_WEBUI_CONFIG) |
|
||||
|
||||
@@ -1110,7 +1110,7 @@ json oaicompat_chat_params_parse(
|
||||
reasoning_budget = json_value(body, "thinking_budget_tokens", -1);
|
||||
}
|
||||
|
||||
if (reasoning_budget >= 0 && !chat_params.thinking_end_tag.empty()) {
|
||||
if (!chat_params.thinking_end_tag.empty()) {
|
||||
llama_params["reasoning_budget_tokens"] = reasoning_budget;
|
||||
llama_params["reasoning_budget_start_tag"] = chat_params.thinking_start_tag;
|
||||
llama_params["reasoning_budget_end_tag"] = chat_params.thinking_end_tag;
|
||||
|
||||
@@ -8,9 +8,11 @@
|
||||
#include <string>
|
||||
#include <thread>
|
||||
|
||||
#ifdef LLAMA_BUILD_WEBUI
|
||||
// auto generated files (see README.md for details)
|
||||
#include "index.html.gz.hpp"
|
||||
#include "loading.html.hpp"
|
||||
#endif
|
||||
|
||||
//
|
||||
// HTTP implementation using cpp-httplib
|
||||
@@ -110,6 +112,22 @@ bool server_http_context::init(const common_params & params) {
|
||||
// set timeouts and change hostname and port
|
||||
srv->set_read_timeout (params.timeout_read);
|
||||
srv->set_write_timeout(params.timeout_write);
|
||||
srv->set_socket_options([reuse_port = params.reuse_port](socket_t sock) {
|
||||
int opt = 1;
|
||||
#ifdef _WIN32
|
||||
const char * optval = (const char *)&opt;
|
||||
#else
|
||||
const void * optval = &opt;
|
||||
#endif
|
||||
setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, optval, sizeof(opt));
|
||||
if (reuse_port) {
|
||||
#ifdef SO_REUSEPORT
|
||||
setsockopt(sock, SOL_SOCKET, SO_REUSEPORT, optval, sizeof(opt));
|
||||
#else
|
||||
LOG_WRN("%s: SO_REUSEPORT is not supported\n", __func__);
|
||||
#endif
|
||||
}
|
||||
});
|
||||
|
||||
if (params.api_keys.size() == 1) {
|
||||
auto key = params.api_keys[0];
|
||||
@@ -181,11 +199,14 @@ bool server_http_context::init(const common_params & params) {
|
||||
auto middleware_server_state = [this](const httplib::Request & req, httplib::Response & res) {
|
||||
bool ready = is_ready.load();
|
||||
if (!ready) {
|
||||
#ifdef LLAMA_BUILD_WEBUI
|
||||
auto tmp = string_split<std::string>(req.path, '.');
|
||||
if (req.path == "/" || tmp.back() == "html") {
|
||||
res.status = 503;
|
||||
res.set_content(reinterpret_cast<const char*>(loading_html), loading_html_len, "text/html; charset=utf-8");
|
||||
} else {
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
// no endpoints is allowed to be accessed when the server is not ready
|
||||
// this is to prevent any data races or inconsistent states
|
||||
res.status = 503;
|
||||
@@ -255,6 +276,7 @@ bool server_http_context::init(const common_params & params) {
|
||||
return 1;
|
||||
}
|
||||
} else {
|
||||
#ifdef LLAMA_BUILD_WEBUI
|
||||
// using embedded static index.html
|
||||
srv->Get(params.api_prefix + "/", [](const httplib::Request & req, httplib::Response & res) {
|
||||
if (req.get_header_value("Accept-Encoding").find("gzip") == std::string::npos) {
|
||||
@@ -268,6 +290,7 @@ bool server_http_context::init(const common_params & params) {
|
||||
}
|
||||
return false;
|
||||
});
|
||||
#endif
|
||||
}
|
||||
}
|
||||
return true;
|
||||
|
||||
@@ -478,19 +478,17 @@ task_params server_task::params_from_json_cmpl(
|
||||
// Parse reasoning budget sampler parameters
|
||||
{
|
||||
const int32_t budget = json_value(data, "reasoning_budget_tokens", (int32_t) -1);
|
||||
if (budget >= 0) {
|
||||
const auto start_tag = json_value(data, "reasoning_budget_start_tag", std::string());
|
||||
const auto end_tag = json_value(data, "reasoning_budget_end_tag", std::string());
|
||||
const auto message = json_value(data, "reasoning_budget_message", std::string());
|
||||
params.sampling.reasoning_budget_tokens = budget;
|
||||
const auto start_tag = json_value(data, "reasoning_budget_start_tag", std::string());
|
||||
const auto end_tag = json_value(data, "reasoning_budget_end_tag", std::string());
|
||||
const auto message = json_value(data, "reasoning_budget_message", std::string());
|
||||
params.sampling.reasoning_budget_tokens = budget;
|
||||
|
||||
if (!start_tag.empty()) {
|
||||
params.sampling.reasoning_budget_start = common_tokenize(vocab, start_tag, false, true);
|
||||
}
|
||||
if (!end_tag.empty()) {
|
||||
params.sampling.reasoning_budget_end = common_tokenize(vocab, end_tag, false, true);
|
||||
params.sampling.reasoning_budget_forced = common_tokenize(vocab, message + end_tag, false, true);
|
||||
}
|
||||
if (!start_tag.empty()) {
|
||||
params.sampling.reasoning_budget_start = common_tokenize(vocab, start_tag, false, true);
|
||||
}
|
||||
if (!end_tag.empty()) {
|
||||
params.sampling.reasoning_budget_end = common_tokenize(vocab, end_tag, false, true);
|
||||
params.sampling.reasoning_budget_forced = common_tokenize(vocab, message + end_tag, false, true);
|
||||
|
||||
SRV_DBG("reasoning budget: tokens=%d, generation_prompt='%s', start=%zu toks, end=%zu toks, forced=%zu toks\n",
|
||||
budget, params.sampling.generation_prompt.c_str(),
|
||||
|
||||
@@ -101,38 +101,6 @@ static run_proc_result run_process(
|
||||
return res;
|
||||
}
|
||||
|
||||
// simple glob: * matches non-/ chars, ** matches anything including /
|
||||
static bool glob_match(const char * pattern, const char * str) {
|
||||
if (*pattern == '\0') {
|
||||
return *str == '\0';
|
||||
}
|
||||
if (pattern[0] == '*' && pattern[1] == '*') {
|
||||
const char * p = pattern + 2;
|
||||
if (*p == '/') p++;
|
||||
if (glob_match(p, str)) return true;
|
||||
if (*str != '\0') return glob_match(pattern, str + 1);
|
||||
return false;
|
||||
}
|
||||
if (*pattern == '*') {
|
||||
const char * p = pattern + 1;
|
||||
for (; *str != '\0' && *str != '/'; str++) {
|
||||
if (glob_match(p, str)) return true;
|
||||
}
|
||||
return glob_match(p, str);
|
||||
}
|
||||
if (*pattern == '?' && *str != '\0' && *str != '/') {
|
||||
return glob_match(pattern + 1, str + 1);
|
||||
}
|
||||
if (*pattern == *str) {
|
||||
return glob_match(pattern + 1, str + 1);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
static bool glob_match(const std::string & pattern, const std::string & str) {
|
||||
return glob_match(pattern.c_str(), str.c_str());
|
||||
}
|
||||
|
||||
json server_tool::to_json() {
|
||||
return {
|
||||
{"display_name", display_name},
|
||||
|
||||
Reference in New Issue
Block a user