Compare commits

..

7 Commits
b8400 ... b8407

Author SHA1 Message Date
Pop Flamingo
312cf03328 llama : re-enable manual LoRA adapter free (#19983)
* Re-enable manual LoRA adapter free

* Remove stale "all adapters must be loaded before context creation" comments
2026-03-18 12:03:26 +02:00
Masato Nakasaka
f4049ad735 tests : fix test-jinja-py Windows failures by bypassing command-line args [no ci] (#20483)
* Fix errors occurring on Windows

* Reverted fix

#20365 will take care of the CRLF issue

* Changed to write directly to stdin

* Prevent fclose from happening twice
2026-03-18 10:43:31 +01:00
Aldehir Rojas
5e8910a0db common : rework gpt-oss parser (#20393)
* common : rework gpt-oss parser

* cont : fix gpt-oss tests

* cont : add structured output test

* cont : rename final to final_msg
2026-03-18 10:41:25 +01:00
Aaron Teo
fe00a84b4b tests: enable kv_unified to prevent cuda oom error on rtx 2060 (#20645)
Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
2026-03-18 17:40:22 +08:00
Aleksander Grygier
7ab321d40d webui: Fix duplicated messages on q param (#20715)
* fix: Remove duplicate message sending on `?q` param

* chore: update webui build output
2026-03-18 10:32:43 +01:00
uvos
7533a7d509 HIP : ignore return of hipMemAdvise [no ci] (#20696) 2026-03-18 09:53:13 +01:00
Andreas Obersteiner
a69d54f990 context : fix graph not resetting when control vector changes (#20381) 2026-03-18 08:10:13 +02:00
13 changed files with 102 additions and 134 deletions

View File

@@ -933,17 +933,12 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
// Copy reasoning to the "thinking" field as expected by the gpt-oss template
auto adjusted_messages = json::array();
for (const auto & msg : inputs.messages) {
auto has_reasoning_content = msg.contains("reasoning_content") && msg.at("reasoning_content").is_string();
auto has_tool_calls = msg.contains("tool_calls") && msg.at("tool_calls").is_array();
if (has_reasoning_content && has_tool_calls) {
auto adjusted_message = msg;
adjusted_message["thinking"] = msg.at("reasoning_content");
adjusted_messages.push_back(adjusted_message);
} else {
adjusted_messages.push_back(msg);
for (auto msg : inputs.messages) {
if (msg.contains("reasoning_content") && msg.at("reasoning_content").is_string()) {
msg["thinking"] = msg.at("reasoning_content");
msg.erase("content");
}
adjusted_messages.push_back(msg);
}
auto prompt = common_chat_template_direct_apply(tmpl, inputs, /* messages_override= */ adjusted_messages);
@@ -969,45 +964,31 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
"<|channel|>", "<|constrain|>", "<|message|>", "<|start|>", "<|end|>",
};
auto has_tools = inputs.tools.is_array() && !inputs.tools.empty();
auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
auto include_grammar = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE && has_tools;
auto has_tools = inputs.tools.is_array() && !inputs.tools.empty();
auto has_response_format = !inputs.json_schema.is_null() && inputs.json_schema.is_object();
auto include_grammar = has_response_format || (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE);
auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
const std::string END = "<|end|>";
const std::string START = "<|start|>";
const std::string MESSAGE = "<|message|>";
const std::string CHANNEL = "<|channel|>";
const std::string CONSTRAIN = "<|constrain|>";
const std::string START_ASSISTANT = START + "assistant";
const std::string CHANNEL_ANALYSIS = CHANNEL + "analysis";
const std::string CHANNEL_COMMENTARY = CHANNEL + "commentary";
const std::string CHANNEL_FINAL = CHANNEL + "final";
auto start = p.rule("start", p.literal("<|start|>assistant"));
auto end = p.rule("end", p.literal("<|end|>"));
auto content = p.rule("message-content", p.until("<|end|>"));
auto channel = p.literal("<|channel|>") + (p.literal("commentary") | p.literal("analysis"));
auto constrain_type = p.chars("[A-Za-z0-9_-]", 1, -1);
auto the_end = END | p.end();
auto analysis = p.rule("analysis", p.literal("<|channel|>analysis<|message|>") + p.reasoning(content) + end);
auto preamble = p.rule("preamble", p.literal("<|channel|>commentary<|message|>") + p.content(content) + end);
auto final_msg = p.rule("final", p.literal("<|channel|>final<|message|>") + p.content(content));
auto any = p.rule("any", preamble | analysis);
const std::string analysis_header = CHANNEL_ANALYSIS + MESSAGE;
auto segment_content = p.until(END);
auto analysis_segment = extract_reasoning ?
p.literal(analysis_header) + p.reasoning(segment_content) + p.until(END) + the_end :
p.content(analysis_header + p.until(END) + the_end);
if (has_response_format) {
auto constraint = p.optional(p.space() + p.literal("<|constrain|>") + constrain_type);
auto response_format = p.rule("response-format",
p.literal("<|channel|>final") + constraint + p.literal("<|message|>") +
p.content(p.schema(p.json(), "response-format-schema", inputs.json_schema)));
auto channel_header_content = p.until_one_of({ " to=functions.", MESSAGE });
auto content_header = p.choice({ p.literal(CHANNEL_COMMENTARY), p.literal(CHANNEL_FINAL) });
auto content_segment = p.rule("content-segment", content_header + channel_header_content + MESSAGE +
p.content(segment_content) + the_end);
if (!inputs.json_schema.is_null()) {
auto final_header = p.literal(CHANNEL_FINAL);
auto constraint = p.optional(p.space() + p.literal(CONSTRAIN) + channel_header_content);
return p.optional(analysis_segment) + final_header + constraint + MESSAGE +
p.content(p.schema(p.json(), "response-format", inputs.json_schema));
return response_format | (analysis + p.zero_or_more(start + analysis) + start + response_format);
}
auto segment = p.optional(START_ASSISTANT + p.space()) + p.choice({ content_segment, analysis_segment });
auto contents = p.optional(segment + p.repeat(p.optional(p.space()) + segment, 0, -1)) + p.end();
// Tool call parser
if (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE) {
auto tool_choice = p.choice();
@@ -1016,42 +997,37 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
std::string name = function.at("name");
const auto & params = function.at("parameters");
// Tool call can appear as:
// 1. In role header: " to=functions.NAME<|channel|>..."
// 2. In channel: "<|channel|>(analysis|commentary) to=functions.NAME..."
auto func_name = p.literal(" to=functions.") + p.tool_name(p.literal(name));
auto channel = p.literal(CHANNEL_COMMENTARY) | p.literal(CHANNEL_ANALYSIS);
auto constraint = p.space() + p.optional(p.literal(CONSTRAIN) + channel_header_content);
auto func_name = p.literal(" to=functions.") + p.tool_name(p.literal(name));
auto constraint = p.optional(p.space() + p.literal("<|constrain|>") + constrain_type);
auto args = p.tool_args(p.schema(p.json(), "tool-" + name + "-schema", params));
// Pattern 1: recipient in role header
// " to=functions.NAME<|channel|>(analysis|commentary)[constraint]<|message|>ARGS"
auto tool_in_role = p.tool(p.tool_open(func_name + channel) + constraint + MESSAGE + args);
// recipient in role header
// <|start|>assistant to=functions.NAME<|channel|>(commentary|analysis)[constraint]<|message|>ARGS
auto tool_in_role = p.tool(p.tool_open(func_name + channel + constraint + p.literal("<|message|>")) + args);
// Pattern 2: recipient in channel header
// "<|channel|>(analysis|commentary) to=functions.NAME[constraint]<|message|>ARGS"
auto tool_in_channel = p.tool(channel + p.tool_open(func_name + constraint + MESSAGE) + args);
// recipient in channel header
// <|channel|>(commentary|analysis) to=functions.NAME[constraint]<|message|>ARGS
auto tool_in_channel = p.tool(p.tool_open(channel + func_name + constraint + p.literal("<|message|>")) + args);
tool_choice |= tool_in_role | tool_in_channel;
tool_choice |= p.rule("tool-" + name, tool_in_role | tool_in_channel);
});
auto min_calls = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED ? 1 : 0;
auto max_calls = inputs.parallel_tool_calls ? -1 : 1;
auto tool_call = p.trigger_rule("tool-call", tool_choice);
auto role_start = p.optional(p.space() + p.literal(START_ASSISTANT));
auto tool_call = p.rule("tool-call", p.repeat(role_start + tool_choice, min_calls, max_calls) + p.end());
if (inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED) {
return tool_call | ( any + p.zero_or_more(start + any) + start + tool_call);
}
return p.choice({ p.trigger_rule("single-tool", tool_call), p.trigger_rule("tools", p.one_or_more(segment) + tool_call) });
return tool_call | final_msg | (any + p.zero_or_more(start + any) + start + (tool_call | final_msg));
}
return contents;
return final_msg | (any + p.zero_or_more(start + any) + start + final_msg);
});
data.parser = parser.save();
if (include_grammar) {
data.grammar_lazy = has_tools && inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_AUTO;
data.grammar_lazy = !(has_response_format || (has_tools && inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED));
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
foreach_function(inputs.tools, [&](const json & tool) {
const auto & function = tool.at("function");
@@ -1062,10 +1038,9 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
});
data.grammar_triggers = {
{ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN, "^(?:<\\|start\\|>assistant\\s*)?(\\s+to=functions)" },
{ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN, "(?:<\\|end\\|>)(?:<\\|start\\|>assistant\\s*)?(\\s+to=functions)" },
{ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
"(?:<\\|start\\|>assistant\\s*)?(<\\|channel\\|>(?:commentary|analysis)\\s+to=functions)" }
{ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN, "^\\s+to$" },
{ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN, "<\\|start\\|>assistant(\\s+to)" },
{ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN, "<\\|start\\|>assistant(<\\|channel\\|>(?:commentary|analysis)\\s+to)" }
};
}

View File

@@ -1067,7 +1067,7 @@ common_init_result::common_init_result(common_params & params) :
const llama_vocab * vocab = llama_model_get_vocab(model);
// load and optionally apply lora adapters (must be loaded before context creation)
// load and optionally apply lora adapters
for (auto & la : params.lora_adapters) {
llama_adapter_lora_ptr lora;
lora.reset(llama_adapter_lora_init(model, la.path.c_str()));

View File

@@ -126,7 +126,7 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device)
if (err == hipSuccess) {
// hipMemAdviseSetCoarseGrain is an optional performance hint;
// ignore errors (e.g. hipErrorInvalidValue on some APU/iGPU configs).
cudaMemAdvise(*ptr, size, hipMemAdviseSetCoarseGrain, device);
(void)cudaMemAdvise(*ptr, size, hipMemAdviseSetCoarseGrain, device);
(void)hipGetLastError(); // clear any error
}

View File

@@ -21,9 +21,7 @@ struct llama_sampler_deleter {
};
struct llama_adapter_lora_deleter {
void operator()(llama_adapter_lora *) {
// llama_adapter_lora_free is deprecated
}
void operator()(llama_adapter_lora * adapter) { llama_adapter_lora_free(adapter); }
};
typedef std::unique_ptr<llama_model, llama_model_deleter> llama_model_ptr;

View File

@@ -636,7 +636,6 @@ extern "C" {
// Load a LoRA adapter from file
// The adapter is valid as long as the associated model is not freed
// All adapters must be loaded before context creation
LLAMA_API struct llama_adapter_lora * llama_adapter_lora_init(
struct llama_model * model,
const char * path_lora);
@@ -660,9 +659,8 @@ extern "C" {
LLAMA_API int32_t llama_adapter_meta_val_str_by_index(const struct llama_adapter_lora * adapter, int32_t i, char * buf, size_t buf_size);
// Manually free a LoRA adapter
// NOTE: loaded adapters will be free when the associated model is deleted
LLAMA_API DEPRECATED(void llama_adapter_lora_free(struct llama_adapter_lora * adapter),
"adapters are now freed together with the associated model");
// NOTE: loaded adapters that are not manually freed will be freed when the associated model is deleted
LLAMA_API void llama_adapter_lora_free(struct llama_adapter_lora * adapter);
// Get the invocation tokens if the current lora is an alora
LLAMA_API uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter);

View File

@@ -418,7 +418,7 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
}
llama_adapter_lora * llama_adapter_lora_init(llama_model * model, const char * path_lora) {
llama_adapter_lora * adapter = new llama_adapter_lora();
llama_adapter_lora * adapter = new llama_adapter_lora(model);
try {
llama_adapter_lora_init_impl(*model, path_lora, *adapter);
@@ -471,8 +471,17 @@ int32_t llama_adapter_meta_val_str_by_index(const llama_adapter_lora * adapter,
return snprintf(buf, buf_size, "%s", it->second.c_str());
}
void llama_adapter_lora_free(llama_adapter_lora *) {
// deprecated: adapters are freed by llama_model's destructor
void llama_adapter_lora_free(llama_adapter_lora * adapter) {
if (adapter == nullptr) {
return;
}
if (adapter->model != nullptr) {
adapter->model->loras.erase(adapter);
adapter->model = nullptr;
}
delete adapter;
}
uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter) {

View File

@@ -61,6 +61,8 @@ struct llama_adapter_lora_weight {
};
struct llama_adapter_lora {
llama_model * model = nullptr;
// map tensor name to lora_a_b
std::unordered_map<std::string, llama_adapter_lora_weight> ab_map;
@@ -75,7 +77,7 @@ struct llama_adapter_lora {
// activated lora (aLoRA)
std::vector<llama_token> alora_invocation_tokens;
llama_adapter_lora() = default;
explicit llama_adapter_lora(llama_model * model) : model(model) {}
~llama_adapter_lora() = default;
llama_adapter_lora_weight * get_weight(ggml_tensor * w);

View File

@@ -1165,9 +1165,11 @@ bool llama_context::set_adapter_cvec(
int32_t il_end) {
LLAMA_LOG_DEBUG("%s: il_start = %d, il_end = %d\n", __func__, il_start, il_end);
// TODO: should we reserve?
bool res = cvec->apply(model, data, len, n_embd, il_start, il_end);
return cvec->apply(model, data, len, n_embd, il_start, il_end);
sched_need_reserve = true;
return res;
}
llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_context_i * mctx, ggml_status & ret) {

View File

@@ -89,6 +89,7 @@ struct test_context {
cparams.n_batch = 512;
cparams.samplers = configs.data();
cparams.n_samplers = configs.size();
cparams.kv_unified = true;
// If n_seq_max is not specified, calculate it from configs
if (n_seq_max < 0) {

View File

@@ -2448,7 +2448,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
// Analysis channel (reasoning) with final channel (content)
tst.test(
"<|channel|>analysis<|message|>I'm\nthinking<|end|>\n<|channel|>final<|message|>Hello, world!\nWhat's "
"<|channel|>analysis<|message|>I'm\nthinking<|end|><|start|>assistant<|channel|>final<|message|>Hello, world!\nWhat's "
"up?")
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
.expect(message_assist_thoughts)
@@ -2461,15 +2461,6 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
.expect_reasoning("I'm\nthinking")
.run();
// Reasoning format none - reasoning stays in content
tst.test(
"<|channel|>analysis<|message|>I'm\nthinking<|end|>\n<|channel|>final<|message|>Hello, world!\nWhat's "
"up?")
.reasoning_format(COMMON_REASONING_FORMAT_NONE)
.expect_content(
"<|channel|>analysis<|message|>I'm\nthinking<|end|>Hello, world!\nWhat's up?")
.run();
// Tool call with recipient in role header: " to=functions.NAME<|channel|>analysis<|message|>JSON"
tst.test(" to=functions.special_function<|channel|>analysis<|message|>{\"arg1\": 1}")
.tools({ special_function_tool })
@@ -2496,37 +2487,16 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
// Tool call with reasoning + content (analysis first, then tool call)
tst.test(
"<|channel|>analysis<|message|>I'm\nthinking<|end|>\n"
"<|channel|>analysis<|message|>I'm\nthinking<|end|>"
"<|start|>assistant to=functions.special_function<|channel|>analysis<|message|>{\"arg1\": 1}")
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
.tools({ special_function_tool })
.expect(message_assist_call_thoughts)
.run();
// Tool calling with extra channel before
// Complex tool calling
tst.test(
"<|channel|>analysis<|message|>I'm\nthinking<|end|><|start|>assistant<|channel|>commentary"
" to=functions.special_function <|message|>{\"arg1\": 1}")
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
.tools({ special_function_tool })
.expect(message_assist_call_thoughts)
.run();
// Reasoning after final channel
// Tool calling after final channel
tst.test(
"<|channel|>final<|message|><|end|>"
"<|start|>assistant<|channel|>analysis<|message|>Thinking about edit..."
)
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
.expect_reasoning("Thinking about edit...")
.expect_content("")
.run();
// Tool calling after final channel
tst.test(
"<|channel|>final<|message|><|end|>"
"<|start|>assistant<|channel|>analysis<|message|>Thinking about edit...<|end|>"
"<|channel|>analysis<|message|>Thinking about edit...<|end|>"
"<|start|>assistant<|channel|>commentary to=functions.edit <|constrain|>json"
"<|message|>{\"oldString\": \"if (part < railCount - 1) {\", \"newString\": \"if (part < 4) {\", \"replaceAll\": false}"
)
@@ -2561,19 +2531,17 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
})
.run();
// Parallel tool calls
// Structured output
tst.test(
" to=functions.special_function<|channel|>analysis<|message|>{\"arg1\": 1}\n"
"<|start|>assistant to=functions.special_function_with_opt<|channel|>analysis<|message|>{\"arg1\": 1, "
"\"arg2\": 2}")
.parallel_tool_calls(true)
.tools({
special_function_tool, special_function_tool_with_optional_param
})
.expect_tool_calls({
{ "special_function", R"({"arg1": 1})", {} },
{ "special_function_with_opt", R"({"arg1": 1, "arg2": 2})", {} },
})
"<|channel|>analysis<|message|>I need to output the invoice details in JSON<|end|>"
"<|start|>assistant<|channel|>final <|constrain|>json"
"<|message|>"
R"({"amount": 123.45, "date": "2025-12-03"})"
)
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
.json_schema(invoice_schema)
.expect_reasoning("I need to output the invoice details in JSON")
.expect_content(R"({"amount": 123.45, "date": "2025-12-03"})")
.run();
}

View File

@@ -1897,8 +1897,9 @@ import sys
from datetime import datetime
from jinja2.sandbox import SandboxedEnvironment
tmpl = json.loads(sys.argv[1])
vars_json = json.loads(sys.argv[2])
merged_input = json.loads(sys.stdin.buffer.read().decode("utf-8"))
tmpl = merged_input["tmpl"]
vars_json = merged_input["vars"]
env = SandboxedEnvironment(
trim_blocks=True,
@@ -1921,8 +1922,9 @@ sys.stdout.buffer.write(result.encode())
static void test_template_py(testing & t, const std::string & name, const std::string & tmpl, const json & vars, const std::string & expect) {
t.test(name, [&tmpl, &vars, &expect](testing & t) {
// Prepare arguments
std::string tmpl_json = json(tmpl).dump();
std::string vars_json = vars.dump();
json merged;
merged["tmpl"] = json(tmpl);
merged["vars"] = vars;
#ifdef _WIN32
const char * python_executable = "python.exe";
@@ -1930,7 +1932,7 @@ static void test_template_py(testing & t, const std::string & name, const std::s
const char * python_executable = "python3";
#endif
const char * command_line[] = {python_executable, "-c", py_script.c_str(), tmpl_json.c_str(), vars_json.c_str(), NULL};
const char * command_line[] = {python_executable, "-c", py_script.c_str(), NULL};
struct subprocess_s subprocess;
int options = subprocess_option_combined_stdout_stderr
@@ -1944,6 +1946,20 @@ static void test_template_py(testing & t, const std::string & name, const std::s
t.assert_true("subprocess creation", false);
return;
}
FILE * p_stdin = subprocess_stdin(&subprocess);
// Write input
std::string input = merged.dump();
auto written = fwrite(input.c_str(), 1, input.size(), p_stdin);
if (written != input.size()) {
t.log("Failed to write complete input to subprocess stdin");
t.assert_true("subprocess stdin write", false);
subprocess_destroy(&subprocess);
return;
}
fflush(p_stdin);
fclose(p_stdin); // Close stdin to signal EOF to the Python process
subprocess.stdin_file = nullptr;
// Read output
std::string output;

Binary file not shown.

View File

@@ -57,7 +57,6 @@
// Handle ?q= parameter - create new conversation and send message
if (qParam !== null) {
await conversationsStore.createConversation();
await chatStore.sendMessage(qParam);
clearUrlParams();
} else if (modelParam || newChatParam === 'true') {
clearUrlParams();