mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-03-19 14:53:28 +02:00
Compare commits
17 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
07ba6d275b | ||
|
|
6729d4920c | ||
|
|
d13d60af1d | ||
|
|
5744d7ec43 | ||
|
|
8ced5f41f9 | ||
|
|
78d550b541 | ||
|
|
4efd326e71 | ||
|
|
b08f7322ee | ||
|
|
79187f2fb8 | ||
|
|
48e61238e1 | ||
|
|
312cf03328 | ||
|
|
f4049ad735 | ||
|
|
5e8910a0db | ||
|
|
fe00a84b4b | ||
|
|
7ab321d40d | ||
|
|
7533a7d509 | ||
|
|
a69d54f990 |
109
common/chat.cpp
109
common/chat.cpp
@@ -933,17 +933,12 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
|
||||
|
||||
// Copy reasoning to the "thinking" field as expected by the gpt-oss template
|
||||
auto adjusted_messages = json::array();
|
||||
for (const auto & msg : inputs.messages) {
|
||||
auto has_reasoning_content = msg.contains("reasoning_content") && msg.at("reasoning_content").is_string();
|
||||
auto has_tool_calls = msg.contains("tool_calls") && msg.at("tool_calls").is_array();
|
||||
|
||||
if (has_reasoning_content && has_tool_calls) {
|
||||
auto adjusted_message = msg;
|
||||
adjusted_message["thinking"] = msg.at("reasoning_content");
|
||||
adjusted_messages.push_back(adjusted_message);
|
||||
} else {
|
||||
adjusted_messages.push_back(msg);
|
||||
for (auto msg : inputs.messages) {
|
||||
if (msg.contains("reasoning_content") && msg.at("reasoning_content").is_string()) {
|
||||
msg["thinking"] = msg.at("reasoning_content");
|
||||
msg.erase("content");
|
||||
}
|
||||
adjusted_messages.push_back(msg);
|
||||
}
|
||||
|
||||
auto prompt = common_chat_template_direct_apply(tmpl, inputs, /* messages_override= */ adjusted_messages);
|
||||
@@ -969,45 +964,31 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
|
||||
"<|channel|>", "<|constrain|>", "<|message|>", "<|start|>", "<|end|>",
|
||||
};
|
||||
|
||||
auto has_tools = inputs.tools.is_array() && !inputs.tools.empty();
|
||||
auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE;
|
||||
auto include_grammar = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE && has_tools;
|
||||
auto has_tools = inputs.tools.is_array() && !inputs.tools.empty();
|
||||
auto has_response_format = !inputs.json_schema.is_null() && inputs.json_schema.is_object();
|
||||
auto include_grammar = has_response_format || (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE);
|
||||
|
||||
auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) {
|
||||
const std::string END = "<|end|>";
|
||||
const std::string START = "<|start|>";
|
||||
const std::string MESSAGE = "<|message|>";
|
||||
const std::string CHANNEL = "<|channel|>";
|
||||
const std::string CONSTRAIN = "<|constrain|>";
|
||||
const std::string START_ASSISTANT = START + "assistant";
|
||||
const std::string CHANNEL_ANALYSIS = CHANNEL + "analysis";
|
||||
const std::string CHANNEL_COMMENTARY = CHANNEL + "commentary";
|
||||
const std::string CHANNEL_FINAL = CHANNEL + "final";
|
||||
auto start = p.rule("start", p.literal("<|start|>assistant"));
|
||||
auto end = p.rule("end", p.literal("<|end|>"));
|
||||
auto content = p.rule("message-content", p.until("<|end|>"));
|
||||
auto channel = p.literal("<|channel|>") + (p.literal("commentary") | p.literal("analysis"));
|
||||
auto constrain_type = p.chars("[A-Za-z0-9_-]", 1, -1);
|
||||
|
||||
auto the_end = END | p.end();
|
||||
auto analysis = p.rule("analysis", p.literal("<|channel|>analysis<|message|>") + p.reasoning(content) + end);
|
||||
auto preamble = p.rule("preamble", p.literal("<|channel|>commentary<|message|>") + p.content(content) + end);
|
||||
auto final_msg = p.rule("final", p.literal("<|channel|>final<|message|>") + p.content(content));
|
||||
auto any = p.rule("any", preamble | analysis);
|
||||
|
||||
const std::string analysis_header = CHANNEL_ANALYSIS + MESSAGE;
|
||||
auto segment_content = p.until(END);
|
||||
auto analysis_segment = extract_reasoning ?
|
||||
p.literal(analysis_header) + p.reasoning(segment_content) + p.until(END) + the_end :
|
||||
p.content(analysis_header + p.until(END) + the_end);
|
||||
if (has_response_format) {
|
||||
auto constraint = p.optional(p.space() + p.literal("<|constrain|>") + constrain_type);
|
||||
auto response_format = p.rule("response-format",
|
||||
p.literal("<|channel|>final") + constraint + p.literal("<|message|>") +
|
||||
p.content(p.schema(p.json(), "response-format-schema", inputs.json_schema)));
|
||||
|
||||
auto channel_header_content = p.until_one_of({ " to=functions.", MESSAGE });
|
||||
auto content_header = p.choice({ p.literal(CHANNEL_COMMENTARY), p.literal(CHANNEL_FINAL) });
|
||||
auto content_segment = p.rule("content-segment", content_header + channel_header_content + MESSAGE +
|
||||
p.content(segment_content) + the_end);
|
||||
|
||||
if (!inputs.json_schema.is_null()) {
|
||||
auto final_header = p.literal(CHANNEL_FINAL);
|
||||
auto constraint = p.optional(p.space() + p.literal(CONSTRAIN) + channel_header_content);
|
||||
return p.optional(analysis_segment) + final_header + constraint + MESSAGE +
|
||||
p.content(p.schema(p.json(), "response-format", inputs.json_schema));
|
||||
return response_format | (analysis + p.zero_or_more(start + analysis) + start + response_format);
|
||||
}
|
||||
|
||||
auto segment = p.optional(START_ASSISTANT + p.space()) + p.choice({ content_segment, analysis_segment });
|
||||
auto contents = p.optional(segment + p.repeat(p.optional(p.space()) + segment, 0, -1)) + p.end();
|
||||
|
||||
// Tool call parser
|
||||
if (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE) {
|
||||
auto tool_choice = p.choice();
|
||||
|
||||
@@ -1016,42 +997,37 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
|
||||
std::string name = function.at("name");
|
||||
const auto & params = function.at("parameters");
|
||||
|
||||
// Tool call can appear as:
|
||||
// 1. In role header: " to=functions.NAME<|channel|>..."
|
||||
// 2. In channel: "<|channel|>(analysis|commentary) to=functions.NAME..."
|
||||
auto func_name = p.literal(" to=functions.") + p.tool_name(p.literal(name));
|
||||
|
||||
auto channel = p.literal(CHANNEL_COMMENTARY) | p.literal(CHANNEL_ANALYSIS);
|
||||
auto constraint = p.space() + p.optional(p.literal(CONSTRAIN) + channel_header_content);
|
||||
auto func_name = p.literal(" to=functions.") + p.tool_name(p.literal(name));
|
||||
auto constraint = p.optional(p.space() + p.literal("<|constrain|>") + constrain_type);
|
||||
auto args = p.tool_args(p.schema(p.json(), "tool-" + name + "-schema", params));
|
||||
|
||||
// Pattern 1: recipient in role header
|
||||
// " to=functions.NAME<|channel|>(analysis|commentary)[constraint]<|message|>ARGS"
|
||||
auto tool_in_role = p.tool(p.tool_open(func_name + channel) + constraint + MESSAGE + args);
|
||||
// recipient in role header
|
||||
// <|start|>assistant to=functions.NAME<|channel|>(commentary|analysis)[constraint]<|message|>ARGS
|
||||
auto tool_in_role = p.tool(p.tool_open(func_name + channel + constraint + p.literal("<|message|>")) + args);
|
||||
|
||||
// Pattern 2: recipient in channel header
|
||||
// "<|channel|>(analysis|commentary) to=functions.NAME[constraint]<|message|>ARGS"
|
||||
auto tool_in_channel = p.tool(channel + p.tool_open(func_name + constraint + MESSAGE) + args);
|
||||
// recipient in channel header
|
||||
// <|channel|>(commentary|analysis) to=functions.NAME[constraint]<|message|>ARGS
|
||||
auto tool_in_channel = p.tool(p.tool_open(channel + func_name + constraint + p.literal("<|message|>")) + args);
|
||||
|
||||
tool_choice |= tool_in_role | tool_in_channel;
|
||||
tool_choice |= p.rule("tool-" + name, tool_in_role | tool_in_channel);
|
||||
});
|
||||
|
||||
auto min_calls = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED ? 1 : 0;
|
||||
auto max_calls = inputs.parallel_tool_calls ? -1 : 1;
|
||||
auto tool_call = p.trigger_rule("tool-call", tool_choice);
|
||||
|
||||
auto role_start = p.optional(p.space() + p.literal(START_ASSISTANT));
|
||||
auto tool_call = p.rule("tool-call", p.repeat(role_start + tool_choice, min_calls, max_calls) + p.end());
|
||||
if (inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED) {
|
||||
return tool_call | ( any + p.zero_or_more(start + any) + start + tool_call);
|
||||
}
|
||||
|
||||
return p.choice({ p.trigger_rule("single-tool", tool_call), p.trigger_rule("tools", p.one_or_more(segment) + tool_call) });
|
||||
return tool_call | final_msg | (any + p.zero_or_more(start + any) + start + (tool_call | final_msg));
|
||||
}
|
||||
|
||||
return contents;
|
||||
return final_msg | (any + p.zero_or_more(start + any) + start + final_msg);
|
||||
});
|
||||
|
||||
data.parser = parser.save();
|
||||
|
||||
if (include_grammar) {
|
||||
data.grammar_lazy = has_tools && inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_AUTO;
|
||||
data.grammar_lazy = !(has_response_format || (has_tools && inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED));
|
||||
data.grammar = build_grammar([&](const common_grammar_builder & builder) {
|
||||
foreach_function(inputs.tools, [&](const json & tool) {
|
||||
const auto & function = tool.at("function");
|
||||
@@ -1062,10 +1038,9 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
|
||||
});
|
||||
|
||||
data.grammar_triggers = {
|
||||
{ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN, "^(?:<\\|start\\|>assistant\\s*)?(\\s+to=functions)" },
|
||||
{ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN, "(?:<\\|end\\|>)(?:<\\|start\\|>assistant\\s*)?(\\s+to=functions)" },
|
||||
{ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
|
||||
"(?:<\\|start\\|>assistant\\s*)?(<\\|channel\\|>(?:commentary|analysis)\\s+to=functions)" }
|
||||
{ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN, "^\\s+to$" },
|
||||
{ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN, "<\\|start\\|>assistant(\\s+to)" },
|
||||
{ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN, "<\\|start\\|>assistant(<\\|channel\\|>(?:commentary|analysis)\\s+to)" }
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
@@ -1067,7 +1067,7 @@ common_init_result::common_init_result(common_params & params) :
|
||||
|
||||
const llama_vocab * vocab = llama_model_get_vocab(model);
|
||||
|
||||
// load and optionally apply lora adapters (must be loaded before context creation)
|
||||
// load and optionally apply lora adapters
|
||||
for (auto & la : params.lora_adapters) {
|
||||
llama_adapter_lora_ptr lora;
|
||||
lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
|
||||
|
||||
@@ -4,7 +4,7 @@ project("ggml" C CXX ASM)
|
||||
### GGML Version
|
||||
set(GGML_VERSION_MAJOR 0)
|
||||
set(GGML_VERSION_MINOR 9)
|
||||
set(GGML_VERSION_PATCH 7)
|
||||
set(GGML_VERSION_PATCH 8)
|
||||
set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")
|
||||
|
||||
find_program(GIT_EXE NAMES git git.exe NO_CMAKE_FIND_ROOT_PATH)
|
||||
|
||||
@@ -733,6 +733,10 @@ extern "C" {
|
||||
GGML_API size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block
|
||||
GGML_API size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
|
||||
|
||||
GGML_DEPRECATED(
|
||||
GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float
|
||||
"use ggml_row_size() instead");
|
||||
|
||||
GGML_API const char * ggml_type_name(enum ggml_type type);
|
||||
GGML_API const char * ggml_op_name (enum ggml_op op);
|
||||
GGML_API const char * ggml_op_symbol(enum ggml_op op);
|
||||
|
||||
@@ -1544,8 +1544,8 @@ static void aclnn_get_slope(ggml_backend_cann_context & ctx,
|
||||
end = 2 * ((n_head - 1) - n_head_log2) + 1;
|
||||
step = 2;
|
||||
count = n_head - n_head_log2;
|
||||
aclnn_get_slope_inner(ctx, (char *) slope_buffer + n_head_log2 * sizeof(float), m1, count, start, end + 1, step,
|
||||
dtype);
|
||||
aclnn_get_slope_inner(ctx, (char *) slope_buffer + n_head_log2 * ggml_type_size(dtype), m1, count, start, end + 1,
|
||||
step, dtype);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3599,6 +3599,44 @@ void ggml_cann_flash_attn_ext(ggml_backend_cann_context & ctx, ggml_tensor * dst
|
||||
acl_k_tensor = ggml_cann_create_tensor(src1, src1_bsnd_ne, src1_bsnd_nb, GGML_MAX_DIMS);
|
||||
acl_v_tensor = ggml_cann_create_tensor(src2, src2_bsnd_ne, src2_bsnd_nb, GGML_MAX_DIMS);
|
||||
|
||||
// Step 2.5: Pad Q, K, V along head dimension if D is not a multiple of 16
|
||||
// (required by FusedInferAttentionScoreV2)
|
||||
const int64_t D = src0->ne[0];
|
||||
const int64_t D_padded = GGML_PAD(D, 16);
|
||||
const bool needs_padding = (D != D_padded);
|
||||
|
||||
ggml_cann_pool_alloc q_pad_allocator(ctx.pool());
|
||||
ggml_cann_pool_alloc k_pad_allocator(ctx.pool());
|
||||
ggml_cann_pool_alloc v_pad_allocator(ctx.pool());
|
||||
|
||||
if (needs_padding) {
|
||||
int64_t paddings[] = { 0, D_padded - D, 0, 0, 0, 0, 0, 0 };
|
||||
|
||||
auto pad_fa_tensor = [&](acl_tensor_ptr & tensor, const int64_t * bsnd_ne,
|
||||
ggml_cann_pool_alloc & allocator) {
|
||||
int64_t pad_ne[GGML_MAX_DIMS] = { D_padded, bsnd_ne[1], bsnd_ne[2], bsnd_ne[3] };
|
||||
size_t pad_nb[GGML_MAX_DIMS];
|
||||
pad_nb[0] = faElemSize;
|
||||
for (int i = 1; i < GGML_MAX_DIMS; ++i) {
|
||||
pad_nb[i] = pad_nb[i - 1] * pad_ne[i - 1];
|
||||
}
|
||||
int64_t nelements = pad_ne[0] * pad_ne[1] * pad_ne[2] * pad_ne[3];
|
||||
void * buffer = allocator.alloc(nelements * faElemSize);
|
||||
acl_tensor_ptr padded =
|
||||
ggml_cann_create_tensor(buffer, faDataType, faElemSize, pad_ne, pad_nb, GGML_MAX_DIMS);
|
||||
aclnn_pad(ctx, tensor.get(), padded.get(), paddings);
|
||||
tensor = std::move(padded);
|
||||
};
|
||||
|
||||
pad_fa_tensor(acl_q_tensor, src0_bsnd_ne, q_pad_allocator);
|
||||
pad_fa_tensor(acl_k_tensor, src1_bsnd_ne, k_pad_allocator);
|
||||
pad_fa_tensor(acl_v_tensor, src2_bsnd_ne, v_pad_allocator);
|
||||
|
||||
src0_bsnd_ne[0] = D_padded;
|
||||
src1_bsnd_ne[0] = D_padded;
|
||||
src2_bsnd_ne[0] = D_padded;
|
||||
}
|
||||
|
||||
// Step 3: create the PSEShift tensor if needed
|
||||
// this tensor is considered as mask (f16) in the llama.cpp
|
||||
acl_tensor_ptr bcast_pse_tensor;
|
||||
@@ -3688,17 +3726,16 @@ void ggml_cann_flash_attn_ext(ggml_backend_cann_context & ctx, ggml_tensor * dst
|
||||
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
||||
acl_tensor_ptr fa_dst_tensor;
|
||||
acl_tensor_ptr acl_dst_tensor;
|
||||
ggml_cann_pool_alloc out_f16_allocator(ctx.pool());
|
||||
if (dst->type == GGML_TYPE_F32) {
|
||||
void * out_f16_buffer = out_f16_allocator.alloc(ggml_nelements(dst) * faElemSize);
|
||||
|
||||
if (dst->type == GGML_TYPE_F32 || needs_padding) {
|
||||
int64_t * out_f16_ne = src0_bsnd_ne;
|
||||
size_t out_f16_nb[GGML_MAX_DIMS];
|
||||
out_f16_nb[0] = faElemSize;
|
||||
for (int i = 1; i < GGML_MAX_DIMS; ++i) {
|
||||
out_f16_nb[i] = out_f16_nb[i - 1] * out_f16_ne[i - 1];
|
||||
}
|
||||
int64_t out_nelements = out_f16_ne[0] * out_f16_ne[1] * out_f16_ne[2] * out_f16_ne[3];
|
||||
void * out_f16_buffer = out_f16_allocator.alloc(out_nelements * faElemSize);
|
||||
|
||||
fa_dst_tensor =
|
||||
ggml_cann_create_tensor(out_f16_buffer, faDataType, faElemSize, out_f16_ne, out_f16_nb, GGML_MAX_DIMS);
|
||||
@@ -3730,8 +3767,33 @@ void ggml_cann_flash_attn_ext(ggml_backend_cann_context & ctx, ggml_tensor * dst
|
||||
nullptr // softmaxLse
|
||||
);
|
||||
|
||||
if (dst->type == GGML_TYPE_F32) {
|
||||
// Step 6: post-processing, permute and cast to f32
|
||||
// Step 6: post-processing — slice padded output and/or cast to f32
|
||||
if (needs_padding) {
|
||||
ggml_cann_pool_alloc sliced_f16_allocator(ctx.pool());
|
||||
|
||||
if (dst->type == GGML_TYPE_F32) {
|
||||
int64_t sliced_ne[GGML_MAX_DIMS] = { D, src0_bsnd_ne[1], src0_bsnd_ne[2], src0_bsnd_ne[3] };
|
||||
size_t sliced_nb[GGML_MAX_DIMS];
|
||||
sliced_nb[0] = faElemSize;
|
||||
for (int i = 1; i < GGML_MAX_DIMS; ++i) {
|
||||
sliced_nb[i] = sliced_nb[i - 1] * sliced_ne[i - 1];
|
||||
}
|
||||
int64_t sliced_nelements = sliced_ne[0] * sliced_ne[1] * sliced_ne[2] * sliced_ne[3];
|
||||
void * sliced_buffer = sliced_f16_allocator.alloc(sliced_nelements * faElemSize);
|
||||
acl_tensor_ptr sliced_f16_tensor = ggml_cann_create_tensor(sliced_buffer, faDataType, faElemSize,
|
||||
sliced_ne, sliced_nb, GGML_MAX_DIMS);
|
||||
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, Slice, fa_dst_tensor.get(),
|
||||
(int64_t) -1, (int64_t) 0, D, (int64_t) 1, sliced_f16_tensor.get());
|
||||
|
||||
acl_tensor_ptr acl_dst_tensor = ggml_cann_create_tensor(dst);
|
||||
aclnn_cast(ctx, sliced_f16_tensor.get(), acl_dst_tensor.get(), ggml_cann_type_mapping(dst->type));
|
||||
} else {
|
||||
acl_tensor_ptr acl_dst_tensor = ggml_cann_create_tensor(dst);
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, Slice, fa_dst_tensor.get(),
|
||||
(int64_t) -1, (int64_t) 0, D, (int64_t) 1, acl_dst_tensor.get());
|
||||
}
|
||||
} else if (dst->type == GGML_TYPE_F32) {
|
||||
acl_tensor_ptr acl_dst_tensor = ggml_cann_create_tensor(dst);
|
||||
aclnn_cast(ctx, fa_dst_tensor.get(), acl_dst_tensor.get(), ggml_cann_type_mapping(dst->type));
|
||||
}
|
||||
|
||||
@@ -2503,10 +2503,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, const ggml_ten
|
||||
// different head sizes of K and V are not supported yet
|
||||
return false;
|
||||
}
|
||||
if (op->src[0]->ne[0] % 16 != 0) {
|
||||
// TODO: padding to support
|
||||
return false;
|
||||
}
|
||||
float logitSoftcap = 0.0f;
|
||||
memcpy(&logitSoftcap, (const float *) (op->op_params) + 2, sizeof(float));
|
||||
if (logitSoftcap != 0.0f) {
|
||||
|
||||
@@ -531,7 +531,6 @@ static void gemv_q4_b32_8x8_q8_0_lut_avx(int n, float * GGML_RESTRICT s, size_t
|
||||
|
||||
UNUSED(bs);
|
||||
|
||||
__m128i changemask = _mm_set_epi8(15, 14, 7, 6, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0);
|
||||
__m256i finalpermutemask = _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0);
|
||||
|
||||
// Permute mask used for easier vector processing at later stages
|
||||
@@ -580,6 +579,7 @@ static void gemv_q4_b32_8x8_q8_0_lut_avx(int n, float * GGML_RESTRICT s, size_t
|
||||
if constexpr (
|
||||
std::is_same_v<block_tx8, block_q4_0x8> ||
|
||||
std::is_same_v<block_tx8, block_iq4_nlx8>) {
|
||||
const __m128i changemask = _mm_set_epi8(15, 14, 7, 6, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0);
|
||||
col_scale_f32 = GGML_F32Cx8_REARRANGE_LOAD(b_ptr[b].d, changemask);
|
||||
} else if constexpr (std::is_same_v<block_tx8, block_mxfp4x8>) {
|
||||
// Load 8 E8M0 exponents and convert to float via LUT
|
||||
|
||||
@@ -126,7 +126,7 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device)
|
||||
if (err == hipSuccess) {
|
||||
// hipMemAdviseSetCoarseGrain is an optional performance hint;
|
||||
// ignore errors (e.g. hipErrorInvalidValue on some APU/iGPU configs).
|
||||
cudaMemAdvise(*ptr, size, hipMemAdviseSetCoarseGrain, device);
|
||||
(void)cudaMemAdvise(*ptr, size, hipMemAdviseSetCoarseGrain, device);
|
||||
(void)hipGetLastError(); // clear any error
|
||||
}
|
||||
|
||||
|
||||
@@ -509,50 +509,39 @@ static void ggml_backend_webgpu_wait_profile_futures(webgpu_global_context &
|
||||
static void ggml_backend_webgpu_wait(webgpu_global_context & ctx,
|
||||
std::vector<webgpu_submission> & subs,
|
||||
bool block = true) {
|
||||
// If we have too many in-flight submissions, wait on the oldest one first.
|
||||
if (subs.empty()) {
|
||||
return;
|
||||
}
|
||||
while (subs.size() >= WEBGPU_MAX_INFLIGHT_SUBS_PER_THREAD) {
|
||||
auto waitStatus = ctx->instance.WaitAny(1, &subs[0].submit_done, UINT64_MAX);
|
||||
if (ggml_backend_webgpu_handle_wait_status(waitStatus)) {
|
||||
|
||||
bool blocking_wait = block || subs.size() >= WEBGPU_MAX_INFLIGHT_SUBS_PER_THREAD;
|
||||
while (blocking_wait) {
|
||||
auto waitStatus = ctx->instance.WaitAny(1, &subs[0].submit_done, 0);
|
||||
if (ggml_backend_webgpu_handle_wait_status(waitStatus, true)) {
|
||||
#ifdef GGML_WEBGPU_GPU_PROFILE
|
||||
ggml_backend_webgpu_wait_profile_futures(ctx, subs[0].profile_futures, true);
|
||||
#endif
|
||||
subs.erase(subs.begin());
|
||||
}
|
||||
blocking_wait = (block && !subs.empty()) || subs.size() >= WEBGPU_MAX_INFLIGHT_SUBS_PER_THREAD;
|
||||
}
|
||||
|
||||
if (subs.empty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (block) {
|
||||
for (auto & sub : subs) {
|
||||
while (!sub.submit_done.completed) {
|
||||
auto waitStatus = ctx->instance.WaitAny(1, &sub.submit_done, UINT64_MAX);
|
||||
ggml_backend_webgpu_handle_wait_status(waitStatus);
|
||||
}
|
||||
// Poll each submit future once and remove completed submissions.
|
||||
for (auto sub = subs.begin(); sub != subs.end();) {
|
||||
auto waitStatus = ctx->instance.WaitAny(1, &sub->submit_done, 0);
|
||||
bool success = ggml_backend_webgpu_handle_wait_status(waitStatus, true);
|
||||
#ifdef GGML_WEBGPU_GPU_PROFILE
|
||||
ggml_backend_webgpu_wait_profile_futures(ctx, sub.profile_futures, true);
|
||||
#endif
|
||||
}
|
||||
subs.clear();
|
||||
} else {
|
||||
// Poll each submit future once and remove completed submissions.
|
||||
for (auto sub = subs.begin(); sub != subs.end();) {
|
||||
auto waitStatus = ctx->instance.WaitAny(1, &sub->submit_done, 0);
|
||||
ggml_backend_webgpu_handle_wait_status(waitStatus, true);
|
||||
#ifdef GGML_WEBGPU_GPU_PROFILE
|
||||
ggml_backend_webgpu_wait_profile_futures(ctx, sub->profile_futures, false);
|
||||
if (sub->submit_done.completed && sub->profile_futures.empty()) {
|
||||
ggml_backend_webgpu_wait_profile_futures(ctx, sub->profile_futures, false);
|
||||
if (success && sub->profile_futures.empty()) {
|
||||
#else
|
||||
if (sub->submit_done.completed) {
|
||||
if (success) {
|
||||
#endif
|
||||
sub = subs.erase(sub);
|
||||
} else {
|
||||
++sub;
|
||||
}
|
||||
sub = subs.erase(sub);
|
||||
} else {
|
||||
++sub;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -2961,17 +2950,16 @@ static ggml_backend_buffer_type_t ggml_backend_webgpu_device_get_buffer_type(ggm
|
||||
|
||||
static struct ggml_backend_buffer_type ggml_backend_webgpu_buffer_type = {
|
||||
/* .iface = */ {
|
||||
/* .get_name = */ ggml_backend_webgpu_buffer_type_get_name,
|
||||
/* .alloc_buffer = */
|
||||
ggml_backend_webgpu_buffer_type_alloc_buffer, /* .get_alignment = */
|
||||
ggml_backend_webgpu_buffer_type_get_alignment, /* .get_max_size = */
|
||||
ggml_backend_webgpu_buffer_type_get_max_size, /* .get_alloc_size = */
|
||||
ggml_backend_webgpu_buffer_type_get_alloc_size, /* .is_host = */ NULL, // defaults to false
|
||||
/* .get_name = */ ggml_backend_webgpu_buffer_type_get_name,
|
||||
/* .alloc_buffer = */ ggml_backend_webgpu_buffer_type_alloc_buffer,
|
||||
/* .get_alignment = */ ggml_backend_webgpu_buffer_type_get_alignment,
|
||||
/* .get_max_size = */ ggml_backend_webgpu_buffer_type_get_max_size,
|
||||
/* .get_alloc_size = */ ggml_backend_webgpu_buffer_type_get_alloc_size,
|
||||
/* .is_host = */ NULL, // defaults to false
|
||||
},
|
||||
/* .device = */
|
||||
dev,
|
||||
/* .context = */
|
||||
NULL
|
||||
dev,
|
||||
/* .context = */ NULL
|
||||
};
|
||||
|
||||
return &ggml_backend_webgpu_buffer_type;
|
||||
|
||||
@@ -1294,6 +1294,12 @@ size_t ggml_row_size(enum ggml_type type, int64_t ne) {
|
||||
return ggml_type_size(type)*ne/ggml_blck_size(type);
|
||||
}
|
||||
|
||||
double ggml_type_sizef(enum ggml_type type) {
|
||||
assert(type >= 0);
|
||||
assert(type < GGML_TYPE_COUNT);
|
||||
return ((double)(type_traits[type].type_size))/type_traits[type].blck_size;
|
||||
}
|
||||
|
||||
const char * ggml_type_name(enum ggml_type type) {
|
||||
assert(type >= 0);
|
||||
assert(type < GGML_TYPE_COUNT);
|
||||
|
||||
@@ -425,8 +425,7 @@ class GGUFWriter:
|
||||
fout = self.fout[file_id]
|
||||
|
||||
# pop the first tensor info
|
||||
# TODO: cleaner way to get the first key
|
||||
first_tensor_name = [name for name, _ in zip(self.tensors[file_id].keys(), range(1))][0]
|
||||
first_tensor_name = next(iter(self.tensors[file_id]))
|
||||
ti = self.tensors[file_id].pop(first_tensor_name)
|
||||
assert ti.nbytes == tensor.nbytes
|
||||
|
||||
|
||||
@@ -21,9 +21,7 @@ struct llama_sampler_deleter {
|
||||
};
|
||||
|
||||
struct llama_adapter_lora_deleter {
|
||||
void operator()(llama_adapter_lora *) {
|
||||
// llama_adapter_lora_free is deprecated
|
||||
}
|
||||
void operator()(llama_adapter_lora * adapter) { llama_adapter_lora_free(adapter); }
|
||||
};
|
||||
|
||||
typedef std::unique_ptr<llama_model, llama_model_deleter> llama_model_ptr;
|
||||
|
||||
@@ -636,7 +636,6 @@ extern "C" {
|
||||
|
||||
// Load a LoRA adapter from file
|
||||
// The adapter is valid as long as the associated model is not freed
|
||||
// All adapters must be loaded before context creation
|
||||
LLAMA_API struct llama_adapter_lora * llama_adapter_lora_init(
|
||||
struct llama_model * model,
|
||||
const char * path_lora);
|
||||
@@ -660,9 +659,8 @@ extern "C" {
|
||||
LLAMA_API int32_t llama_adapter_meta_val_str_by_index(const struct llama_adapter_lora * adapter, int32_t i, char * buf, size_t buf_size);
|
||||
|
||||
// Manually free a LoRA adapter
|
||||
// NOTE: loaded adapters will be free when the associated model is deleted
|
||||
LLAMA_API DEPRECATED(void llama_adapter_lora_free(struct llama_adapter_lora * adapter),
|
||||
"adapters are now freed together with the associated model");
|
||||
// NOTE: loaded adapters that are not manually freed will be freed when the associated model is deleted
|
||||
LLAMA_API void llama_adapter_lora_free(struct llama_adapter_lora * adapter);
|
||||
|
||||
// Get the invocation tokens if the current lora is an alora
|
||||
LLAMA_API uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter);
|
||||
|
||||
@@ -1 +1 @@
|
||||
553552e1d88be2b214b85e5159eedd39a63e2c34
|
||||
c044a8eeae2591faa0950c8b5e514cbc4bbfc4ca
|
||||
|
||||
@@ -418,7 +418,7 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
|
||||
}
|
||||
|
||||
llama_adapter_lora * llama_adapter_lora_init(llama_model * model, const char * path_lora) {
|
||||
llama_adapter_lora * adapter = new llama_adapter_lora();
|
||||
llama_adapter_lora * adapter = new llama_adapter_lora(model);
|
||||
|
||||
try {
|
||||
llama_adapter_lora_init_impl(*model, path_lora, *adapter);
|
||||
@@ -471,8 +471,17 @@ int32_t llama_adapter_meta_val_str_by_index(const llama_adapter_lora * adapter,
|
||||
return snprintf(buf, buf_size, "%s", it->second.c_str());
|
||||
}
|
||||
|
||||
void llama_adapter_lora_free(llama_adapter_lora *) {
|
||||
// deprecated: adapters are freed by llama_model's destructor
|
||||
void llama_adapter_lora_free(llama_adapter_lora * adapter) {
|
||||
if (adapter == nullptr) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (adapter->model != nullptr) {
|
||||
adapter->model->loras.erase(adapter);
|
||||
adapter->model = nullptr;
|
||||
}
|
||||
|
||||
delete adapter;
|
||||
}
|
||||
|
||||
uint64_t llama_adapter_get_alora_n_invocation_tokens(const struct llama_adapter_lora * adapter) {
|
||||
|
||||
@@ -61,6 +61,8 @@ struct llama_adapter_lora_weight {
|
||||
};
|
||||
|
||||
struct llama_adapter_lora {
|
||||
llama_model * model = nullptr;
|
||||
|
||||
// map tensor name to lora_a_b
|
||||
std::unordered_map<std::string, llama_adapter_lora_weight> ab_map;
|
||||
|
||||
@@ -75,7 +77,7 @@ struct llama_adapter_lora {
|
||||
// activated lora (aLoRA)
|
||||
std::vector<llama_token> alora_invocation_tokens;
|
||||
|
||||
llama_adapter_lora() = default;
|
||||
explicit llama_adapter_lora(llama_model * model) : model(model) {}
|
||||
~llama_adapter_lora() = default;
|
||||
|
||||
llama_adapter_lora_weight * get_weight(ggml_tensor * w);
|
||||
|
||||
@@ -1165,9 +1165,11 @@ bool llama_context::set_adapter_cvec(
|
||||
int32_t il_end) {
|
||||
LLAMA_LOG_DEBUG("%s: il_start = %d, il_end = %d\n", __func__, il_start, il_end);
|
||||
|
||||
// TODO: should we reserve?
|
||||
bool res = cvec->apply(model, data, len, n_embd, il_start, il_end);
|
||||
|
||||
return cvec->apply(model, data, len, n_embd, il_start, il_end);
|
||||
sched_need_reserve = true;
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_context_i * mctx, ggml_status & ret) {
|
||||
|
||||
@@ -121,6 +121,9 @@ llm_build_bitnet::llm_build_bitnet(const llama_model & model, const llm_graph_pa
|
||||
cur = ggml_add(ctx0, cur, ffn_inp);
|
||||
cb(cur, "l_out", il);
|
||||
|
||||
cur = build_cvec(cur, il);
|
||||
cb(cur, "l_out", il);
|
||||
|
||||
// input for next layer
|
||||
inpL = cur;
|
||||
}
|
||||
|
||||
@@ -111,8 +111,13 @@ llm_build_chatglm::llm_build_chatglm(const llama_model & model, const llm_graph_
|
||||
|
||||
}
|
||||
|
||||
inpL = ggml_add(ctx0, cur, ffn_inp);
|
||||
cb(inpL, "l_out", il);
|
||||
cur = ggml_add(ctx0, cur, ffn_inp);
|
||||
|
||||
cur = build_cvec(cur, il);
|
||||
cb(cur, "l_out", il);
|
||||
|
||||
// input for next layer
|
||||
inpL = cur;
|
||||
}
|
||||
|
||||
cur = build_norm(inpL,
|
||||
|
||||
@@ -86,6 +86,10 @@ llm_build_cogvlm::llm_build_cogvlm(const llama_model & model, const llm_graph_pa
|
||||
cur = ggml_add(ctx0, cur, ffn_inp);
|
||||
cb(cur, "ffn_out", il);
|
||||
|
||||
cur = build_cvec(cur, il);
|
||||
cb(cur, "l_out", il);
|
||||
|
||||
// input for next layer
|
||||
inpL = cur;
|
||||
}
|
||||
|
||||
|
||||
@@ -82,6 +82,7 @@ llm_build_eurobert::llm_build_eurobert(const llama_model & model, const llm_grap
|
||||
|
||||
cur = ggml_add(ctx0, cur, ffn_inp);
|
||||
|
||||
// input for next layer
|
||||
inpL = cur;
|
||||
}
|
||||
cur = inpL;
|
||||
|
||||
@@ -66,8 +66,14 @@ llm_build_jais::llm_build_jais(const llama_model & model, const llm_graph_params
|
||||
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
}
|
||||
inpL = ggml_add(ctx0, cur, ffn_inp);
|
||||
cb(inpL, "l_out", il);
|
||||
|
||||
cur = ggml_add(ctx0, cur, ffn_inp);
|
||||
|
||||
cur = build_cvec(cur, il);
|
||||
cb(cur, "l_out", il);
|
||||
|
||||
// input for next layer
|
||||
inpL = cur;
|
||||
}
|
||||
cur = build_norm(inpL,
|
||||
model.output_norm,
|
||||
|
||||
@@ -362,6 +362,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll
|
||||
cur = build_cvec(cur, il);
|
||||
cb(cur, "l_out", il);
|
||||
|
||||
// input for next layer
|
||||
inpL = cur;
|
||||
}
|
||||
cur = inpL;
|
||||
|
||||
@@ -177,6 +177,9 @@ llm_build_lfm2<iswa>::llm_build_lfm2(const llama_model & model, const llm_graph_
|
||||
cb(ffn_norm_out, "model.layers.{}.ffn_out", il);
|
||||
|
||||
cur = ggml_add(ctx0, cur, ffn_out);
|
||||
|
||||
cur = build_cvec(cur, il);
|
||||
cb(cur, "l_out", il);
|
||||
}
|
||||
|
||||
cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
|
||||
|
||||
@@ -71,6 +71,7 @@ llm_build_plamo2::llm_build_plamo2(const llama_model & model, const llm_graph_pa
|
||||
cur = ggml_add(ctx0, cur, residual);
|
||||
cb(cur, "ffn_residual", il);
|
||||
|
||||
// input for next layer
|
||||
inpL = cur;
|
||||
}
|
||||
|
||||
|
||||
@@ -109,6 +109,8 @@ llm_build_plamo3<iswa>::llm_build_plamo3(const llama_model & model, const llm_gr
|
||||
|
||||
cur = build_cvec(cur, il);
|
||||
cb(cur, "l_out", il);
|
||||
|
||||
// input for next layer
|
||||
inpL = cur;
|
||||
}
|
||||
|
||||
|
||||
@@ -64,6 +64,9 @@ llm_build_qwen35::llm_build_qwen35(const llama_model & model, const llm_graph_pa
|
||||
cur = ggml_add(ctx0, cur, ffn_residual);
|
||||
cb(cur, "post_ffn", il);
|
||||
|
||||
cur = build_cvec(cur, il);
|
||||
cb(cur, "l_out", il);
|
||||
|
||||
// Input for next layer
|
||||
inpL = cur;
|
||||
}
|
||||
|
||||
@@ -64,6 +64,9 @@ llm_build_qwen35moe::llm_build_qwen35moe(const llama_model & model, const llm_gr
|
||||
cur = ggml_add(ctx0, cur, ffn_residual);
|
||||
cb(cur, "post_moe", il);
|
||||
|
||||
cur = build_cvec(cur, il);
|
||||
cb(cur, "l_out", il);
|
||||
|
||||
// Input for next layer
|
||||
inpL = cur;
|
||||
}
|
||||
|
||||
@@ -56,6 +56,9 @@ llm_build_qwen3next::llm_build_qwen3next(const llama_model & model, const llm_gr
|
||||
cur = ggml_add(ctx0, cur, ffn_residual);
|
||||
cb(cur, "post_moe", il);
|
||||
|
||||
cur = build_cvec(cur, il);
|
||||
cb(cur, "l_out", il);
|
||||
|
||||
// Input for next layer
|
||||
inpL = cur;
|
||||
}
|
||||
|
||||
@@ -101,6 +101,7 @@ llm_build_smallthinker<iswa>::llm_build_smallthinker(const llama_model & model,
|
||||
cur = ffn_out;
|
||||
|
||||
cur = ggml_add(ctx0, cur, ffn_inp);
|
||||
|
||||
cur = build_cvec(cur, il);
|
||||
cb(cur, "l_out", il);
|
||||
|
||||
|
||||
@@ -145,9 +145,11 @@ llm_build_step35_iswa::llm_build_step35_iswa(const llama_model & model, const ll
|
||||
cb(cur, "ffn_out", il);
|
||||
}
|
||||
cur = ggml_add(ctx0, cur, ffn_inp);
|
||||
|
||||
cur = build_cvec(cur, il);
|
||||
cb(cur, "l_out", il);
|
||||
|
||||
// input for next layer
|
||||
inpL = cur;
|
||||
}
|
||||
|
||||
|
||||
@@ -89,6 +89,7 @@ struct test_context {
|
||||
cparams.n_batch = 512;
|
||||
cparams.samplers = configs.data();
|
||||
cparams.n_samplers = configs.size();
|
||||
cparams.kv_unified = true;
|
||||
|
||||
// If n_seq_max is not specified, calculate it from configs
|
||||
if (n_seq_max < 0) {
|
||||
|
||||
@@ -2448,7 +2448,7 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
|
||||
// Analysis channel (reasoning) with final channel (content)
|
||||
tst.test(
|
||||
"<|channel|>analysis<|message|>I'm\nthinking<|end|>\n<|channel|>final<|message|>Hello, world!\nWhat's "
|
||||
"<|channel|>analysis<|message|>I'm\nthinking<|end|><|start|>assistant<|channel|>final<|message|>Hello, world!\nWhat's "
|
||||
"up?")
|
||||
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
|
||||
.expect(message_assist_thoughts)
|
||||
@@ -2461,15 +2461,6 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
.expect_reasoning("I'm\nthinking")
|
||||
.run();
|
||||
|
||||
// Reasoning format none - reasoning stays in content
|
||||
tst.test(
|
||||
"<|channel|>analysis<|message|>I'm\nthinking<|end|>\n<|channel|>final<|message|>Hello, world!\nWhat's "
|
||||
"up?")
|
||||
.reasoning_format(COMMON_REASONING_FORMAT_NONE)
|
||||
.expect_content(
|
||||
"<|channel|>analysis<|message|>I'm\nthinking<|end|>Hello, world!\nWhat's up?")
|
||||
.run();
|
||||
|
||||
// Tool call with recipient in role header: " to=functions.NAME<|channel|>analysis<|message|>JSON"
|
||||
tst.test(" to=functions.special_function<|channel|>analysis<|message|>{\"arg1\": 1}")
|
||||
.tools({ special_function_tool })
|
||||
@@ -2496,37 +2487,16 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
|
||||
// Tool call with reasoning + content (analysis first, then tool call)
|
||||
tst.test(
|
||||
"<|channel|>analysis<|message|>I'm\nthinking<|end|>\n"
|
||||
"<|channel|>analysis<|message|>I'm\nthinking<|end|>"
|
||||
"<|start|>assistant to=functions.special_function<|channel|>analysis<|message|>{\"arg1\": 1}")
|
||||
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
|
||||
.tools({ special_function_tool })
|
||||
.expect(message_assist_call_thoughts)
|
||||
.run();
|
||||
|
||||
// Tool calling with extra channel before
|
||||
// Complex tool calling
|
||||
tst.test(
|
||||
"<|channel|>analysis<|message|>I'm\nthinking<|end|><|start|>assistant<|channel|>commentary"
|
||||
" to=functions.special_function <|message|>{\"arg1\": 1}")
|
||||
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
|
||||
.tools({ special_function_tool })
|
||||
.expect(message_assist_call_thoughts)
|
||||
.run();
|
||||
|
||||
// Reasoning after final channel
|
||||
// Tool calling after final channel
|
||||
tst.test(
|
||||
"<|channel|>final<|message|><|end|>"
|
||||
"<|start|>assistant<|channel|>analysis<|message|>Thinking about edit..."
|
||||
)
|
||||
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
|
||||
.expect_reasoning("Thinking about edit...")
|
||||
.expect_content("")
|
||||
.run();
|
||||
|
||||
// Tool calling after final channel
|
||||
tst.test(
|
||||
"<|channel|>final<|message|><|end|>"
|
||||
"<|start|>assistant<|channel|>analysis<|message|>Thinking about edit...<|end|>"
|
||||
"<|channel|>analysis<|message|>Thinking about edit...<|end|>"
|
||||
"<|start|>assistant<|channel|>commentary to=functions.edit <|constrain|>json"
|
||||
"<|message|>{\"oldString\": \"if (part < railCount - 1) {\", \"newString\": \"if (part < 4) {\", \"replaceAll\": false}"
|
||||
)
|
||||
@@ -2561,19 +2531,17 @@ static void test_template_output_peg_parsers(bool detailed_debug) {
|
||||
})
|
||||
.run();
|
||||
|
||||
// Parallel tool calls
|
||||
// Structured output
|
||||
tst.test(
|
||||
" to=functions.special_function<|channel|>analysis<|message|>{\"arg1\": 1}\n"
|
||||
"<|start|>assistant to=functions.special_function_with_opt<|channel|>analysis<|message|>{\"arg1\": 1, "
|
||||
"\"arg2\": 2}")
|
||||
.parallel_tool_calls(true)
|
||||
.tools({
|
||||
special_function_tool, special_function_tool_with_optional_param
|
||||
})
|
||||
.expect_tool_calls({
|
||||
{ "special_function", R"({"arg1": 1})", {} },
|
||||
{ "special_function_with_opt", R"({"arg1": 1, "arg2": 2})", {} },
|
||||
})
|
||||
"<|channel|>analysis<|message|>I need to output the invoice details in JSON<|end|>"
|
||||
"<|start|>assistant<|channel|>final <|constrain|>json"
|
||||
"<|message|>"
|
||||
R"({"amount": 123.45, "date": "2025-12-03"})"
|
||||
)
|
||||
.reasoning_format(COMMON_REASONING_FORMAT_AUTO)
|
||||
.json_schema(invoice_schema)
|
||||
.expect_reasoning("I need to output the invoice details in JSON")
|
||||
.expect_content(R"({"amount": 123.45, "date": "2025-12-03"})")
|
||||
.run();
|
||||
}
|
||||
|
||||
|
||||
@@ -1897,8 +1897,9 @@ import sys
|
||||
from datetime import datetime
|
||||
from jinja2.sandbox import SandboxedEnvironment
|
||||
|
||||
tmpl = json.loads(sys.argv[1])
|
||||
vars_json = json.loads(sys.argv[2])
|
||||
merged_input = json.loads(sys.stdin.buffer.read().decode("utf-8"))
|
||||
tmpl = merged_input["tmpl"]
|
||||
vars_json = merged_input["vars"]
|
||||
|
||||
env = SandboxedEnvironment(
|
||||
trim_blocks=True,
|
||||
@@ -1921,8 +1922,9 @@ sys.stdout.buffer.write(result.encode())
|
||||
static void test_template_py(testing & t, const std::string & name, const std::string & tmpl, const json & vars, const std::string & expect) {
|
||||
t.test(name, [&tmpl, &vars, &expect](testing & t) {
|
||||
// Prepare arguments
|
||||
std::string tmpl_json = json(tmpl).dump();
|
||||
std::string vars_json = vars.dump();
|
||||
json merged;
|
||||
merged["tmpl"] = json(tmpl);
|
||||
merged["vars"] = vars;
|
||||
|
||||
#ifdef _WIN32
|
||||
const char * python_executable = "python.exe";
|
||||
@@ -1930,7 +1932,7 @@ static void test_template_py(testing & t, const std::string & name, const std::s
|
||||
const char * python_executable = "python3";
|
||||
#endif
|
||||
|
||||
const char * command_line[] = {python_executable, "-c", py_script.c_str(), tmpl_json.c_str(), vars_json.c_str(), NULL};
|
||||
const char * command_line[] = {python_executable, "-c", py_script.c_str(), NULL};
|
||||
|
||||
struct subprocess_s subprocess;
|
||||
int options = subprocess_option_combined_stdout_stderr
|
||||
@@ -1944,6 +1946,20 @@ static void test_template_py(testing & t, const std::string & name, const std::s
|
||||
t.assert_true("subprocess creation", false);
|
||||
return;
|
||||
}
|
||||
FILE * p_stdin = subprocess_stdin(&subprocess);
|
||||
|
||||
// Write input
|
||||
std::string input = merged.dump();
|
||||
auto written = fwrite(input.c_str(), 1, input.size(), p_stdin);
|
||||
if (written != input.size()) {
|
||||
t.log("Failed to write complete input to subprocess stdin");
|
||||
t.assert_true("subprocess stdin write", false);
|
||||
subprocess_destroy(&subprocess);
|
||||
return;
|
||||
}
|
||||
fflush(p_stdin);
|
||||
fclose(p_stdin); // Close stdin to signal EOF to the Python process
|
||||
subprocess.stdin_file = nullptr;
|
||||
|
||||
// Read output
|
||||
std::string output;
|
||||
|
||||
Binary file not shown.
@@ -148,7 +148,7 @@
|
||||
</Tooltip.Trigger>
|
||||
|
||||
<Tooltip.Content side="right">
|
||||
<p>Images require vision models to be processed</p>
|
||||
<p>Image processing requires a vision model</p>
|
||||
</Tooltip.Content>
|
||||
</Tooltip.Root>
|
||||
{/if}
|
||||
@@ -173,7 +173,7 @@
|
||||
</Tooltip.Trigger>
|
||||
|
||||
<Tooltip.Content side="right">
|
||||
<p>Audio files require audio models to be processed</p>
|
||||
<p>Audio files processing requires an audio model</p>
|
||||
</Tooltip.Content>
|
||||
</Tooltip.Root>
|
||||
{/if}
|
||||
|
||||
@@ -57,7 +57,6 @@
|
||||
// Handle ?q= parameter - create new conversation and send message
|
||||
if (qParam !== null) {
|
||||
await conversationsStore.createConversation();
|
||||
await chatStore.sendMessage(qParam);
|
||||
clearUrlParams();
|
||||
} else if (modelParam || newChatParam === 'true') {
|
||||
clearUrlParams();
|
||||
|
||||
Reference in New Issue
Block a user