Compare commits

...

11 Commits
b8685 ... b8696

Author SHA1 Message Date
Aaron Teo
69c28f1547 llama-server: fix model params not propagated (#21509)
Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
2026-04-07 21:39:41 +08:00
Son H. Nguyen
0d049d6a92 unicode : add custom Qwen2 regex handler to fix segfault on long input (#21257)
* unicode : add custom Qwen2 regex handler to fix segfault on long input

std::regex uses recursive backtracking internally, which causes a stack
overflow (segfault) when tokenizing long sequences of repeated characters
(e.g. 43K 'A's). The Qwen2 tokenizer regex differs from Llama3 only in
the digit pattern (\p{N} vs \p{N}{1,3}), so it was falling through to
the std::regex fallback path instead of using a custom handler.

Add unicode_regex_split_custom_qwen2() following the established pattern
used by gpt2, llama3, kimi_k2, and afmoe custom handlers.

Closes: https://github.com/ggml-org/llama.cpp/issues/21113

* cont : remove TODO comment

* cont : update comment to reflect original regex

* use the correct regex in the comment this time... [no ci]

---------

Co-authored-by: Aldehir Rojas <hello@alde.dev>
2026-04-07 16:13:38 +03:00
Johannes Gäßler
a8ec0df461 llama: remove per-arch tensor name lists (#21531) 2026-04-07 15:02:03 +02:00
Georgi Gerganov
e8f5082697 server : fix restore for checkpoints with pos_min == 0 (#21510) 2026-04-07 15:29:17 +03:00
Georgi Gerganov
22fc79134e ggml : deprecate GGML_OP_ADD1 (#21363)
* ggml : deprecate GGML_OP_ADD1

* cont : remove tests

* cont : re-enable vulkan check
2026-04-07 15:28:27 +03:00
Tom Overlund
2a619f6fbc ggml: Vulkan build, Linux -- output error string for errno on fork failure (#20868) (#20904) 2026-04-07 13:54:55 +02:00
mkoker
edd4d9bca5 vulkan: add FA dequant for q4_1, q5_0, q5_1, iq4_nl (#21029)
Add dequantize4() implementations for Q4_1, Q5_0, Q5_1, and IQ4_NL
in the flash attention base shader. Register them in the shader
generator, pipeline creation, and enable in the scalar/coopmat1 FA
support check.
2026-04-07 13:41:29 +02:00
Aldehir Rojas
482192f12d webui : store reasoning_content so it is sent back in subsequent requests (#21249) 2026-04-07 13:32:44 +02:00
Antoine Viallon
71a81f6fcc ggml-cuda : fix CDNA2 compute capability constant for gfx90a (MI210) (#21519)
GGML_CUDA_CC_CDNA2 was set to 0x910
Fix by setting the constant to 0x90a to match the actual gfx90a ISA.
2026-04-07 12:18:55 +02:00
Aleksander Grygier
ecce0087da fix: Detect streaming state in reasoning content blocks (#21549) 2026-04-07 12:04:41 +02:00
Kabir08
d1f82e382d Fix rtl text rendering (#21382)
* Fix Arabic RTL text rendering in web UI

- Add dir='auto' attributes to markdown containers and blocks
- Implement post-processing to add dir='auto' to all text elements
- Replace directional CSS properties with logical properties for proper RTL list alignment
- Ensure bidirectional text support for mixed Arabic/English content

* Clean up commented duplicate function

Remove the commented-out duplicate transformMdastNode function
that was left over from refactoring.

* Fix Arabic RTL text rendering in web UI

- Add dir='auto' attributes to markdown containers and blocks
- Implement post-processing to add dir='auto' to all text elements
- Replace directional CSS properties with logical properties for proper RTL list alignment
- Minor code formatting improvements

This ensures bidirectional text support for mixed Arabic/English content in the llama.cpp web UI.

* Implement rehype plugin for comprehensive RTL text support

- Add rehypeRtlSupport plugin that applies dir='auto' to all elements with children
- Replace DOMParser-based approach with efficient HAST tree processing
- Remove hardcoded element lists for better maintainability
- Ensure proper bidirectional text rendering for mixed RTL/LTR content

* Fix RTL text rendering with rehype plugin and cleanup

* fix: prettier formatting
2026-04-07 11:37:20 +02:00
22 changed files with 446 additions and 2193 deletions

View File

@@ -902,15 +902,17 @@ extern "C" {
struct ggml_tensor * b,
struct ggml_tensor * ids);
GGML_API struct ggml_tensor * ggml_add1(
GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_add1(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b);
struct ggml_tensor * b),
"use ggml_add instead");
GGML_API struct ggml_tensor * ggml_add1_inplace(
GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_add1_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b);
struct ggml_tensor * b),
"use ggml_add_inplace instead");
// dst = a
// view(dst, nb1, nb2, nb3, offset) += b

View File

@@ -65,7 +65,7 @@
#define GGML_CUDA_CC_VEGA (GGML_CUDA_CC_OFFSET_AMD + 0x900) // Vega56/64, minimum for fp16 dual issue
#define GGML_CUDA_CC_VEGA20 (GGML_CUDA_CC_OFFSET_AMD + 0x906) // MI50/Radeon VII, minimum for dp4a
#define GGML_CUDA_CC_CDNA1 (GGML_CUDA_CC_OFFSET_AMD + 0x908) // MI100, minimum for MFMA, acc registers
#define GGML_CUDA_CC_CDNA2 (GGML_CUDA_CC_OFFSET_AMD + 0x910) // MI210, minimum acc register renameing
#define GGML_CUDA_CC_CDNA2 (GGML_CUDA_CC_OFFSET_AMD + 0x90a) // MI210 (gfx90a), minimum acc register renaming
#define GGML_CUDA_CC_CDNA3 (GGML_CUDA_CC_OFFSET_AMD + 0x942) // MI300
// RDNA removes MFMA, dp4a, xnack, acc registers, wave size is 32

View File

@@ -3447,11 +3447,19 @@ static void ggml_vk_load_shaders(vk_device& device) {
CREATE_FA(GGML_TYPE_F16, f16, FA_SCALAR, )
CREATE_FA(GGML_TYPE_Q4_0, q4_0, FA_SCALAR, )
CREATE_FA(GGML_TYPE_Q8_0, q8_0, FA_SCALAR, )
CREATE_FA(GGML_TYPE_Q4_1, q4_1, FA_SCALAR, )
CREATE_FA(GGML_TYPE_Q5_0, q5_0, FA_SCALAR, )
CREATE_FA(GGML_TYPE_Q5_1, q5_1, FA_SCALAR, )
CREATE_FA(GGML_TYPE_IQ4_NL, iq4_nl, FA_SCALAR, )
} else {
CREATE_FA(GGML_TYPE_F32, f32, FA_SCALAR, _fp32)
CREATE_FA(GGML_TYPE_F16, f16, FA_SCALAR, _fp32)
CREATE_FA(GGML_TYPE_Q4_0, q4_0, FA_SCALAR, _fp32)
CREATE_FA(GGML_TYPE_Q8_0, q8_0, FA_SCALAR, _fp32)
CREATE_FA(GGML_TYPE_Q4_1, q4_1, FA_SCALAR, _fp32)
CREATE_FA(GGML_TYPE_Q5_0, q5_0, FA_SCALAR, _fp32)
CREATE_FA(GGML_TYPE_Q5_1, q5_1, FA_SCALAR, _fp32)
CREATE_FA(GGML_TYPE_IQ4_NL, iq4_nl, FA_SCALAR, _fp32)
}
#if defined(VK_KHR_cooperative_matrix) && defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
if (device->coopmat1_fa_support) {
@@ -3459,6 +3467,10 @@ static void ggml_vk_load_shaders(vk_device& device) {
CREATE_FA(GGML_TYPE_F16, f16, FA_COOPMAT1, _cm1)
CREATE_FA(GGML_TYPE_Q4_0, q4_0, FA_COOPMAT1, _cm1)
CREATE_FA(GGML_TYPE_Q8_0, q8_0, FA_COOPMAT1, _cm1)
CREATE_FA(GGML_TYPE_Q4_1, q4_1, FA_COOPMAT1, _cm1)
CREATE_FA(GGML_TYPE_Q5_0, q5_0, FA_COOPMAT1, _cm1)
CREATE_FA(GGML_TYPE_Q5_1, q5_1, FA_COOPMAT1, _cm1)
CREATE_FA(GGML_TYPE_IQ4_NL, iq4_nl, FA_COOPMAT1, _cm1)
}
#endif
#if defined(VK_NV_cooperative_matrix2) && defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
@@ -15331,11 +15343,12 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
case GGML_TYPE_F32:
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q8_0:
// supported in scalar and coopmat2 paths
break;
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
case GGML_TYPE_Q5_1:
case GGML_TYPE_IQ4_NL:
// supported in scalar and coopmat2 paths
break;
// K dequants currently disabled because D dimension is rounded up to 256 and runs inefficiently
//case GGML_TYPE_Q2_K:
//case GGML_TYPE_Q3_K:
@@ -15350,12 +15363,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
//case GGML_TYPE_IQ3_XXS:
//case GGML_TYPE_IQ3_S:
//case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ4_NL:
// currently supported only in coopmat2 path
if (!coopmat2) {
return false;
}
break;
default:
return false;
}

View File

@@ -110,6 +110,97 @@ FLOAT_TYPEV4 dequantize4(uint ib, uint iqs, uint a_offset, uint binding_idx) {
#if defined(DATA_A_Q4_0)
#define BLOCK_BYTE_SIZE 18
#elif defined(DATA_A_Q4_1)
#define BLOCK_BYTE_SIZE 20
#endif
#if defined(DATA_A_Q4_0) || defined(DATA_A_Q4_1)
// Dequantize 4 consecutive Q4_0/Q4_1 values from block `ib`, starting at
// element index `iqs`, reading from either the K or V buffer depending on
// `binding_idx`. The quant data (qs) is accessed as packed 16-bit words:
// (iqs & 0xF) / 2 selects the pair of adjacent words that hold the 4 nibbles.
FLOAT_TYPEV4 dequantize4(uint ib, uint iqs, uint a_offset, uint binding_idx) {
if (binding_idx == BINDING_IDX_K) {
uint vui_lo = uint(k_packed.k_data_packed16[a_offset + ib].qs[(iqs & 0xF) / 2 + 0]);
uint vui_hi = uint(k_packed.k_data_packed16[a_offset + ib].qs[(iqs & 0xF) / 2 + 1]);
// shift is 0 or 4: selects the low nibbles (iqs < 16) or high nibbles
// (iqs >= 16) of each byte in the two 16-bit words.
uint shift = (iqs & 0x10) >> 2;
vui_lo >>= shift;
vui_hi >>= shift;
// One 4-bit quant from each byte of the two words.
FLOAT_TYPEV4 nibbles = FLOAT_TYPEV4(vui_lo & 0xF, (vui_lo >> 8) & 0xF, vui_hi & 0xF, (vui_hi >> 8) & 0xF);
#ifdef DATA_A_Q4_1
// Q4_1: x = d * q + m
return FLOAT_TYPE(k_packed.k_data_packed16[a_offset + ib].d) * nibbles + FLOAT_TYPE(k_packed.k_data_packed16[a_offset + ib].m);
#else
// Q4_0: x = d * (q - 8)
return FLOAT_TYPE(k_packed.k_data_packed16[a_offset + ib].d) * (nibbles - FLOAT_TYPE(8.0f));
#endif
} else {
// Same extraction as above, but from the V buffer.
uint vui_lo = uint(v_packed.v_data_packed16[a_offset + ib].qs[(iqs & 0xF) / 2 + 0]);
uint vui_hi = uint(v_packed.v_data_packed16[a_offset + ib].qs[(iqs & 0xF) / 2 + 1]);
uint shift = (iqs & 0x10) >> 2;
vui_lo >>= shift;
vui_hi >>= shift;
FLOAT_TYPEV4 nibbles = FLOAT_TYPEV4(vui_lo & 0xF, (vui_lo >> 8) & 0xF, vui_hi & 0xF, (vui_hi >> 8) & 0xF);
#ifdef DATA_A_Q4_1
return FLOAT_TYPE(v_packed.v_data_packed16[a_offset + ib].d) * nibbles + FLOAT_TYPE(v_packed.v_data_packed16[a_offset + ib].m);
#else
return FLOAT_TYPE(v_packed.v_data_packed16[a_offset + ib].d) * (nibbles - FLOAT_TYPE(8.0f));
#endif
}
}
#endif
#if defined(DATA_A_Q5_0)
#define BLOCK_BYTE_SIZE 22
#elif defined(DATA_A_Q5_1)
#define BLOCK_BYTE_SIZE 24
#endif
#if defined(DATA_A_Q5_0) || defined(DATA_A_Q5_1)
// Dequantize 4 consecutive Q5_0/Q5_1 values from block `ib`, starting at
// element index `iqs`, from the K or V buffer per `binding_idx`.
// Each value is 5 bits: a 4-bit nibble from qs plus one high bit from qh.
FLOAT_TYPEV4 dequantize4(uint ib, uint iqs, uint a_offset, uint binding_idx) {
if (binding_idx == BINDING_IDX_K) {
// qs accessed as packed 16-bit words; shift (0 or 4) selects low/high nibbles.
uint vui_lo = uint(k_packed.k_data_packed16[a_offset + ib].qs[(iqs & 0xF) / 2 + 0]);
uint vui_hi = uint(k_packed.k_data_packed16[a_offset + ib].qs[(iqs & 0xF) / 2 + 1]);
uint shift = (iqs & 0x10) >> 2;
vui_lo >>= shift;
vui_hi >>= shift;
#ifdef DATA_A_Q5_1
// Q5_1 stores qh as a single field; Q5_0 stores it as two 16-bit halves.
uint qh = k_packed.k_data_packed16[a_offset + ib].qh;
#else
uint qh = uint(k_packed.k_data_packed16[a_offset + ib].qh[0]) | (uint(k_packed.k_data_packed16[a_offset + ib].qh[1]) << 16);
#endif
// High (5th) bit of each of the 4 values, pre-scaled into position (bit 4 => 16).
FLOAT_TYPEV4 hb = FLOAT_TYPEV4((qh >> iqs) & 1, (qh >> (iqs + 1)) & 1, (qh >> (iqs + 2)) & 1, (qh >> (iqs + 3)) & 1) * FLOAT_TYPE(16.0f);
FLOAT_TYPEV4 nibbles = FLOAT_TYPEV4(vui_lo & 0xF, (vui_lo >> 8) & 0xF, vui_hi & 0xF, (vui_hi >> 8) & 0xF);
#ifdef DATA_A_Q5_1
// Q5_1: x = d * q + m
return FLOAT_TYPE(k_packed.k_data_packed16[a_offset + ib].d) * (nibbles + hb) + FLOAT_TYPE(k_packed.k_data_packed16[a_offset + ib].m);
#else
// Q5_0: x = d * (q - 16)
return FLOAT_TYPE(k_packed.k_data_packed16[a_offset + ib].d) * (nibbles + hb - FLOAT_TYPE(16.0f));
#endif
} else {
// Same extraction as above, but from the V buffer.
uint vui_lo = uint(v_packed.v_data_packed16[a_offset + ib].qs[(iqs & 0xF) / 2 + 0]);
uint vui_hi = uint(v_packed.v_data_packed16[a_offset + ib].qs[(iqs & 0xF) / 2 + 1]);
uint shift = (iqs & 0x10) >> 2;
vui_lo >>= shift;
vui_hi >>= shift;
#ifdef DATA_A_Q5_1
uint qh = v_packed.v_data_packed16[a_offset + ib].qh;
#else
uint qh = uint(v_packed.v_data_packed16[a_offset + ib].qh[0]) | (uint(v_packed.v_data_packed16[a_offset + ib].qh[1]) << 16);
#endif
FLOAT_TYPEV4 hb = FLOAT_TYPEV4((qh >> iqs) & 1, (qh >> (iqs + 1)) & 1, (qh >> (iqs + 2)) & 1, (qh >> (iqs + 3)) & 1) * FLOAT_TYPE(16.0f);
FLOAT_TYPEV4 nibbles = FLOAT_TYPEV4(vui_lo & 0xF, (vui_lo >> 8) & 0xF, vui_hi & 0xF, (vui_hi >> 8) & 0xF);
#ifdef DATA_A_Q5_1
return FLOAT_TYPE(v_packed.v_data_packed16[a_offset + ib].d) * (nibbles + hb) + FLOAT_TYPE(v_packed.v_data_packed16[a_offset + ib].m);
#else
return FLOAT_TYPE(v_packed.v_data_packed16[a_offset + ib].d) * (nibbles + hb - FLOAT_TYPE(16.0f));
#endif
}
}
#endif
#if defined(DATA_A_IQ4_NL)
#define BLOCK_BYTE_SIZE 18
FLOAT_TYPEV4 dequantize4(uint ib, uint iqs, uint a_offset, uint binding_idx) {
if (binding_idx == BINDING_IDX_K) {
@@ -119,7 +210,11 @@ FLOAT_TYPEV4 dequantize4(uint ib, uint iqs, uint a_offset, uint binding_idx) {
vui_lo >>= shift;
vui_hi >>= shift;
return FLOAT_TYPE(k_packed.k_data_packed16[a_offset + ib].d) * (FLOAT_TYPEV4(vui_lo & 0xF, (vui_lo >> 8) & 0xF, vui_hi & 0xF, (vui_hi >> 8) & 0xF) - FLOAT_TYPE(8.0f));
return FLOAT_TYPE(k_packed.k_data_packed16[a_offset + ib].d) * FLOAT_TYPEV4(
kvalues_iq4nl[vui_lo & 0xF],
kvalues_iq4nl[(vui_lo >> 8) & 0xF],
kvalues_iq4nl[vui_hi & 0xF],
kvalues_iq4nl[(vui_hi >> 8) & 0xF]);
} else {
uint vui_lo = uint(v_packed.v_data_packed16[a_offset + ib].qs[(iqs & 0xF) / 2 + 0]);
uint vui_hi = uint(v_packed.v_data_packed16[a_offset + ib].qs[(iqs & 0xF) / 2 + 1]);
@@ -127,11 +222,14 @@ FLOAT_TYPEV4 dequantize4(uint ib, uint iqs, uint a_offset, uint binding_idx) {
vui_lo >>= shift;
vui_hi >>= shift;
return FLOAT_TYPE(v_packed.v_data_packed16[a_offset + ib].d) * (FLOAT_TYPEV4(vui_lo & 0xF, (vui_lo >> 8) & 0xF, vui_hi & 0xF, (vui_hi >> 8) & 0xF) - FLOAT_TYPE(8.0f));
return FLOAT_TYPE(v_packed.v_data_packed16[a_offset + ib].d) * FLOAT_TYPEV4(
kvalues_iq4nl[vui_lo & 0xF],
kvalues_iq4nl[(vui_lo >> 8) & 0xF],
kvalues_iq4nl[vui_hi & 0xF],
kvalues_iq4nl[(vui_hi >> 8) & 0xF]);
}
}
#endif
#if defined(DATA_A_Q8_0)
#define BLOCK_BYTE_SIZE 34
FLOAT_TYPEV4 dequantize4(uint ib, uint iqs, uint a_offset, uint binding_idx) {

View File

@@ -137,6 +137,7 @@ void execute_command(std::vector<std::string>& command, std::string& stdout_str,
pid_t pid = fork();
if (pid < 0) {
std::cerr << strerror(errno) << "\n";
throw std::runtime_error("Failed to fork process");
}
@@ -655,7 +656,7 @@ void process_shaders() {
if (tname == "f16") {
string_to_spv("flash_attn_f32_f16_" + tname, "flash_attn_cm1.comp",
merge_maps(fa_base_dict, {{"Q_TYPE", "float"}, {"D_TYPE", "float"}, {"D_TYPEV4", "vec4"}, {"COOPMAT", "1"}}), fp16, true, false, f16acc);
} else if (tname == "q4_0" || tname == "q8_0" || tname == "f32") {
} else if (tname == "q4_0" || tname == "q4_1" || tname == "q5_0" || tname == "q5_1" || tname == "iq4_nl" || tname == "q8_0" || tname == "f32") {
std::string data_a_key = "DATA_A_" + to_uppercase(tname);
string_to_spv("flash_attn_f32_f16_" + tname, "flash_attn_cm1.comp",
merge_maps(fa_base_dict, {{data_a_key, "1"}, {"Q_TYPE", "float"}, {"D_TYPE", "float"}, {"D_TYPEV4", "vec4"}, {"BLOCK_SIZE", "QUANT_K_"+to_uppercase(tname)}, {"COOPMAT", "1"}}), fp16, true, false, f16acc);
@@ -666,7 +667,7 @@ void process_shaders() {
if (tname == "f16") {
string_to_spv("flash_attn_f32_f16_" + tname, "flash_attn.comp",
merge_maps(fa_base_dict, {{"Q_TYPE", "float"}, {"D_TYPE", "float"}, {"D_TYPEV4", "vec4"}}), fp16, false, false, f16acc);
} else if (tname == "q4_0" || tname == "q8_0" || tname == "f32") {
} else if (tname == "q4_0" || tname == "q4_1" || tname == "q5_0" || tname == "q5_1" || tname == "iq4_nl" || tname == "q8_0" || tname == "f32") {
std::string data_a_key = "DATA_A_" + to_uppercase(tname);
string_to_spv("flash_attn_f32_f16_" + tname, "flash_attn.comp",
merge_maps(fa_base_dict, {{data_a_key, "1"}, {"Q_TYPE", "float"}, {"D_TYPE", "float"}, {"D_TYPEV4", "vec4"}, {"BLOCK_SIZE", "QUANT_K_"+to_uppercase(tname) }}), fp16, false, false, f16acc);

File diff suppressed because it is too large Load Diff

View File

@@ -585,8 +585,6 @@ struct LLM_TN_IMPL {
const int bid;
const int xid;
const std::set<llm_tensor> model_tensors;
LLM_TN_IMPL(llm_arch arch, llm_tensor tensor, const char * suffix, int bid, int xid);
std::string str() const;

View File

@@ -470,6 +470,141 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
return bpe_offsets;
}
// Qwen2 system regex: "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
// Hand-written splitter for the regex above. It differs from the llama3 handler
// only in the digit alternative (\p{N} instead of \p{N}{1,3}). A custom handler
// avoids the std::regex fallback, whose recursive backtracking can overflow the
// stack on very long runs of repeated characters.
// `offsets` holds the codepoint lengths of consecutive fragments of `text`;
// the returned vector holds the codepoint lengths of the resulting words.
static std::vector<size_t> unicode_regex_split_custom_qwen2(const std::string & text, const std::vector<size_t> & offsets) {
std::vector<size_t> bpe_offsets; // store the offset of each word
bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
const auto cpts = unicode_cpts_from_utf8(text);
size_t start = 0;
for (auto offset : offsets) {
const size_t offset_ini = start;
const size_t offset_end = start + offset;
assert(offset_end <= cpts.size());
start = offset_end;
// Sentinel returned for positions outside the current fragment.
static const uint32_t OUT_OF_RANGE = 0xFFFFFFFF;
auto _get_cpt = [&] (const size_t pos) -> uint32_t {
return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE;
};
// Out-of-range positions yield all-zero flags, so lookahead checks at the
// fragment boundary fall through naturally.
auto _get_flags = [&] (const size_t pos) -> unicode_cpt_flags {
return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags_from_cpt(cpts[pos]) : unicode_cpt_flags{};
};
size_t _prev_end = offset_ini;
// Emit the span [_prev_end, end) as one word (if non-empty), advance
// _prev_end, and return the span length so callers can advance `pos`.
auto _add_token = [&] (const size_t end) -> size_t {
assert(_prev_end <= end && end <= offset_end);
size_t len = end - _prev_end;
if (len > 0) {
bpe_offsets.push_back(len);
}
_prev_end = end;
//if (len > 0) {
// std::string s = "";
// for(size_t p = end-len; p < end; p++)
// s += unicode_cpt_to_utf8(cpts[p]);
// printf(">>> '%s'\n", s.c_str());
//}
return len;
};
// Try each regex alternative in order at the current position; every branch
// that matches advances `pos` past the match and emits a token.
for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) {
const uint32_t cpt = _get_cpt(pos);
const auto flags = _get_flags(pos);
// regex: (?i:'s|'t|'re|'ve|'m|'ll|'d) // case insensitive
if (cpt == '\'' && pos+1 < offset_end) {
uint32_t cpt_next = unicode_tolower(_get_cpt(pos+1));
if (cpt_next == 's' || cpt_next == 't' || cpt_next == 'm' || cpt_next == 'd') {
pos += _add_token(pos+2);
continue;
}
if (pos+2 < offset_end) {
uint32_t cpt_next_next = unicode_tolower(_get_cpt(pos+2));
if ((cpt_next == 'r' && cpt_next_next == 'e') ||
(cpt_next == 'v' && cpt_next_next == 'e') ||
(cpt_next == 'l' && cpt_next_next == 'l')) {
pos += _add_token(pos+3);
continue;
}
}
}
// regex: [^\r\n\p{L}\p{N}]?\p{L}+
if (!(cpt == '\r' || cpt == '\n' || flags.is_number)) {
if (flags.is_letter || _get_flags(pos+1).is_letter) { // one or more letters
pos++;
while (_get_flags(pos).is_letter) {
pos++;
}
_add_token(pos);
continue;
}
}
// regex: \p{N}
// single digit per token — this is the only difference from the llama3
// handler, which matches \p{N}{1,3}
if (flags.is_number) {
pos++;
_add_token(pos);
continue;
}
// regex: <space>?[^\s\p{L}\p{N}]+[\r\n]*
auto flags2 = (cpt == ' ' ? _get_flags(pos+1) : flags);
if (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags.as_uint()) {
pos += (cpt == ' ');
while (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags2.as_uint()) {
flags2 = _get_flags(++pos);
}
uint32_t cpt2 = _get_cpt(pos);
while (cpt2 == '\r' || cpt2 == '\n') {
cpt2 = _get_cpt(++pos);
}
_add_token(pos);
continue;
}
// Count the whitespace run starting at `pos` and remember the position
// just past the last \r or \n inside it (0 if none).
size_t num_whitespaces = 0;
size_t last_end_r_or_n = 0;
while (_get_flags(pos+num_whitespaces).is_whitespace) {
uint32_t cpt2 = _get_cpt(pos+num_whitespaces);
if (cpt2 == '\r' || cpt2 == '\n') {
last_end_r_or_n = pos + num_whitespaces + 1;
}
num_whitespaces++;
}
// regex: \s*[\r\n]+
if (last_end_r_or_n > 0) {
pos = last_end_r_or_n;
_add_token(pos);
continue;
}
// regex: \s+(?!\S)
// leave the final whitespace for the next token when non-whitespace follows
if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != OUT_OF_RANGE) {
pos += num_whitespaces - 1;
_add_token(pos);
continue;
}
// regex: \s+
if (num_whitespaces > 0) {
pos += num_whitespaces;
_add_token(pos);
continue;
}
// no matches
_add_token(++pos);
}
}
return bpe_offsets;
}
template <typename CharT>
static std::vector<size_t> unicode_regex_split_stl(const std::basic_string<CharT> & text, const std::basic_string<CharT> & regex, const std::vector<size_t> & offsets) {
using BidirIt = typename std::basic_string<CharT>::const_iterator;
@@ -790,8 +925,10 @@ static std::vector<size_t> unicode_regex_split_custom(const std::string & text,
} else if (
regex_expr == "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" ||
regex_expr == "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+") {
bpe_offsets = unicode_regex_split_custom_llama3(text, offsets);
} else if (
regex_expr == "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+") {
bpe_offsets = unicode_regex_split_custom_qwen2(text, offsets);
} else if (regex_expr == "\\p{Han}+") {
// K2's first pattern - handle all K2 patterns together
bpe_offsets = unicode_regex_split_custom_kimi_k2(text, offsets);

View File

@@ -3129,39 +3129,6 @@ struct test_add_id : public test_case {
}
};
// GGML_OP_ADD1
struct test_add1 : public test_case {
const ggml_type type;
const std::array<int64_t, 4> ne;
std::string vars() override {
return VARS_TO_STR2(type, ne);
}
test_add1(ggml_type type = GGML_TYPE_F32,
std::array<int64_t, 4> ne = {10, 5, 4, 3})
: type(type), ne(ne) {}
ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
ggml_set_param(a);
ggml_set_name(a, "a");
ggml_tensor * b = ggml_new_tensor_1d(ctx, type, 1);
// ggml_set_param(b); // TODO: implement
ggml_set_name(b, "b");
ggml_tensor * out = ggml_add1(ctx, a, b);
ggml_set_name(out, "out");
return out;
}
float grad_eps() override {
return 0.1f * ne[0]*ne[1]*ne[2]*ne[3];
}
};
// GGML_OP_SCALE
struct test_scale : public test_case {
const ggml_type type;
@@ -7886,8 +7853,6 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {16, 5, 4, 3}, {2, 2, 2, 2}, 8));
test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {16, 5, 4, 3}, {1, 1, 1, 1}, 16));
test_cases.emplace_back(new test_add1());
test_cases.emplace_back(new test_add1(GGML_TYPE_F32, {1024, 1024, 1, 1}));
test_cases.emplace_back(new test_scale());
test_cases.emplace_back(new test_scale(GGML_TYPE_F32, {10, 10, 10, 10}, 2.0f, 1.0f));
test_cases.emplace_back(new test_scale(GGML_TYPE_F32, {10, 10, 10, 10}, 2.0f, 1.0f, true)); // inplace test

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -18,7 +18,7 @@
<div style="display: contents">
<script>
{
__sveltekit_1trm5n9 = {
__sveltekit_10avopp = {
base: new URL('.', location).pathname.slice(0, -1)
};

View File

@@ -632,7 +632,7 @@ private:
// load the model and initialize llama_context
// this may also be called to resume from sleeping state
bool load_model(const common_params & params) {
bool load_model(common_params & params) {
bool is_resume = sleeping;
SRV_INF("loading model '%s'\n", params.model.path.c_str());
@@ -641,6 +641,9 @@ private:
llama_init = common_init_from_params(params_base);
// propagate model-metadata sampling defaults back to caller
params.sampling = params_base.sampling;
model = llama_init->model();
ctx = llama_init->context();
@@ -2404,7 +2407,7 @@ private:
// guarantee that a checkpoint will result in at least one token being processed [TAG_PROMPT_LOGITS]
LOG_INF("slot %12.*s: id %2d | task %d | Checking checkpoint with [%d, %d] against %d...\n", 12,
func_name, (slot).id, ((slot).task ? (slot).task->id : -1), cur.pos_min, cur.pos_max, pos_min_thold);
return cur.pos_min < pos_min_thold;
return cur.pos_min < pos_min_thold || cur.pos_min == 0;
}
);
@@ -2978,7 +2981,7 @@ private:
server_context::server_context() : impl(new server_context_impl()) {}
server_context::~server_context() = default;
bool server_context::load_model(const common_params & params) {
bool server_context::load_model(common_params & params) {
return impl->load_model(params);
}

View File

@@ -56,7 +56,7 @@ struct server_context {
// load the model and initialize llama_context
// returns true on success
bool load_model(const common_params & params);
bool load_model(common_params & params);
// this function will block main thread until termination
void start_loop();

View File

@@ -51,7 +51,6 @@
"eslint-config-prettier": "^10.0.1",
"eslint-plugin-storybook": "^10.2.4",
"eslint-plugin-svelte": "^3.0.0",
"fflate": "^0.8.2",
"globals": "^16.0.0",
"http-server": "^14.1.1",
"mdast": "^3.0.0",
@@ -5051,13 +5050,6 @@
}
}
},
"node_modules/fflate": {
"version": "0.8.2",
"resolved": "https://registry.npmjs.org/fflate/-/fflate-0.8.2.tgz",
"integrity": "sha512-cPJU47OaAoCbg0pBvzsgpTPhmhqI5eJjh/JIu8tPj5q+T7iLvW/JAYUqmE7KOB4R1ZyEhzBaIQpQpardBF5z8A==",
"dev": true,
"license": "MIT"
},
"node_modules/file-entry-cache": {
"version": "8.0.0",
"resolved": "https://registry.npmjs.org/file-entry-cache/-/file-entry-cache-8.0.0.tgz",

View File

@@ -33,7 +33,7 @@
const showToolCallInProgress = $derived(config().showToolCallInProgress as boolean);
const showThoughtInProgress = $derived(config().showThoughtInProgress as boolean);
const sections = $derived(deriveAgenticSections(message, toolMessages, []));
const sections = $derived(deriveAgenticSections(message, toolMessages, [], isStreaming));
// Parse tool results with images
const sectionsParsed = $derived(

View File

@@ -16,6 +16,7 @@
import { rehypeEnhanceLinks } from '$lib/markdown/enhance-links';
import { rehypeEnhanceCodeBlocks } from '$lib/markdown/enhance-code-blocks';
import { rehypeResolveAttachmentImages } from '$lib/markdown/resolve-attachment-images';
import { rehypeRtlSupport } from '$lib/markdown/rehype-rtl-support';
import { remarkLiteralHtml } from '$lib/markdown/literal-html';
import { copyCodeToClipboard, preprocessLaTeX, getImageErrorFallbackHtml } from '$lib/utils';
import {
@@ -101,6 +102,7 @@
.use(rehypeEnhanceLinks) // Add target="_blank" to links
.use(rehypeEnhanceCodeBlocks) // Wrap code blocks with header and actions
.use(rehypeResolveAttachmentImages, { attachments })
.use(rehypeRtlSupport) // Add bidirectional text support
.use(rehypeStringify, { allowDangerousHtml: true }); // Convert to HTML string
});
@@ -781,19 +783,19 @@
/* Lists */
div :global(ul) {
list-style-type: disc;
margin-left: 1.5rem;
margin-inline-start: 1.5rem;
margin-bottom: 1rem;
}
div :global(ol) {
list-style-type: decimal;
margin-left: 1.5rem;
margin-inline-start: 1.5rem;
margin-bottom: 1rem;
}
div :global(li) {
margin-bottom: 0.25rem;
padding-left: 0.5rem;
padding-inline-start: 0.5rem;
}
div :global(li::marker) {
@@ -816,8 +818,8 @@
/* Task lists */
div :global(.task-list-item) {
list-style: none;
margin-left: 0;
padding-left: 0;
margin-inline-start: 0;
padding-inline-start: 0;
}
div :global(.task-list-item-checkbox) {

View File

@@ -0,0 +1,28 @@
/**
* Rehype plugin to provide comprehensive RTL support by adding dir="auto"
* to all text-containing elements.
*
* This operates directly on the HAST tree, ensuring that all elements
* (including those not in a predefined list) receive the attribute.
*/
import type { Plugin } from 'unified';
import type { Root, Element } from 'hast';
import { visit } from 'unist-util-visit';
/**
 * Rehype plugin that sets `dir="auto"` on every element that has children.
 * Letting the browser pick the direction per element provides bidirectional
 * text support for mixed RTL/LTR (e.g. Arabic/English) content.
 */
export const rehypeRtlSupport: Plugin<[], Root> = () => (tree: Root) => {
  visit(tree, 'element', (node: Element) => {
    if (!node.children?.length) {
      return;
    }
    node.properties = { ...node.properties, dir: 'auto' };
  });
};

View File

@@ -474,6 +474,7 @@ class AgenticStore {
sessionMessages.push({
role: MessageRole.ASSISTANT,
content: turnContent || undefined,
reasoning_content: turnReasoningContent || undefined,
tool_calls: normalizedCalls
});

View File

@@ -41,6 +41,7 @@ export type AgenticMessage =
| {
role: MessageRole.ASSISTANT;
content?: string | ApiChatMessageContentPart[];
reasoning_content?: string;
tool_calls?: AgenticToolCallPayload[];
}
| {

View File

@@ -38,14 +38,19 @@ export type ToolResultLine = {
function deriveSingleTurnSections(
message: DatabaseMessage,
toolMessages: DatabaseMessage[] = [],
streamingToolCalls: ApiChatCompletionToolCall[] = []
streamingToolCalls: ApiChatCompletionToolCall[] = [],
isStreaming: boolean = false
): AgenticSection[] {
const sections: AgenticSection[] = [];
// 1. Reasoning content (from dedicated field)
if (message.reasoningContent) {
const toolCalls = parseToolCalls(message.toolCalls);
const hasContentAfterReasoning =
!!message.content?.trim() || toolCalls.length > 0 || streamingToolCalls.length > 0;
const isPending = isStreaming && !hasContentAfterReasoning;
sections.push({
type: AgenticSectionType.REASONING,
type: isPending ? AgenticSectionType.REASONING_PENDING : AgenticSectionType.REASONING,
content: message.reasoningContent
});
}
@@ -104,12 +109,13 @@ function deriveSingleTurnSections(
export function deriveAgenticSections(
message: DatabaseMessage,
toolMessages: DatabaseMessage[] = [],
streamingToolCalls: ApiChatCompletionToolCall[] = []
streamingToolCalls: ApiChatCompletionToolCall[] = [],
isStreaming: boolean = false
): AgenticSection[] {
const hasAssistantContinuations = toolMessages.some((m) => m.role === MessageRole.ASSISTANT);
if (!hasAssistantContinuations) {
return deriveSingleTurnSections(message, toolMessages, streamingToolCalls);
return deriveSingleTurnSections(message, toolMessages, streamingToolCalls, isStreaming);
}
const sections: AgenticSection[] = [];
@@ -127,7 +133,12 @@ export function deriveAgenticSections(
const isLastTurn = i + 1 + turnToolMsgs.length >= toolMessages.length;
sections.push(
...deriveSingleTurnSections(msg, turnToolMsgs, isLastTurn ? streamingToolCalls : [])
...deriveSingleTurnSections(
msg,
turnToolMsgs,
isLastTurn ? streamingToolCalls : [],
isLastTurn && isStreaming
)
);
i += 1 + turnToolMsgs.length;

View File

@@ -162,6 +162,36 @@ describe('deriveAgenticSections', () => {
expect(sections[4].content).toBe('Here is the analysis.');
});
it('returns REASONING_PENDING when streaming with only reasoning content', () => {
const msg = makeAssistant({
reasoningContent: 'Let me think about this...'
});
const sections = deriveAgenticSections(msg, [], [], true);
expect(sections).toHaveLength(1);
expect(sections[0].type).toBe(AgenticSectionType.REASONING_PENDING);
expect(sections[0].content).toBe('Let me think about this...');
});
it('returns REASONING (not pending) when streaming but text content has appeared', () => {
const msg = makeAssistant({
content: 'The answer is',
reasoningContent: 'Let me think...'
});
const sections = deriveAgenticSections(msg, [], [], true);
expect(sections).toHaveLength(2);
expect(sections[0].type).toBe(AgenticSectionType.REASONING);
expect(sections[1].type).toBe(AgenticSectionType.TEXT);
});
it('returns REASONING (not pending) when not streaming', () => {
const msg = makeAssistant({
reasoningContent: 'Let me think...'
});
const sections = deriveAgenticSections(msg, [], [], false);
expect(sections).toHaveLength(1);
expect(sections[0].type).toBe(AgenticSectionType.REASONING);
});
it('multi-turn: streaming tool calls on last turn', () => {
const assistant1 = makeAssistant({
toolCalls: JSON.stringify([