mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-04-16 16:27:32 +03:00
Compare commits
11 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
69c28f1547 | ||
|
|
0d049d6a92 | ||
|
|
a8ec0df461 | ||
|
|
e8f5082697 | ||
|
|
22fc79134e | ||
|
|
2a619f6fbc | ||
|
|
edd4d9bca5 | ||
|
|
482192f12d | ||
|
|
71a81f6fcc | ||
|
|
ecce0087da | ||
|
|
d1f82e382d |
@@ -902,15 +902,17 @@ extern "C" {
|
||||
struct ggml_tensor * b,
|
||||
struct ggml_tensor * ids);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_add1(
|
||||
GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_add1(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b);
|
||||
struct ggml_tensor * b),
|
||||
"use ggml_add instead");
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_add1_inplace(
|
||||
GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_add1_inplace(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b);
|
||||
struct ggml_tensor * b),
|
||||
"use ggml_add_inplace instead");
|
||||
|
||||
// dst = a
|
||||
// view(dst, nb1, nb2, nb3, offset) += b
|
||||
|
||||
@@ -65,7 +65,7 @@
|
||||
#define GGML_CUDA_CC_VEGA (GGML_CUDA_CC_OFFSET_AMD + 0x900) // Vega56/64, minimum for fp16 dual issue
|
||||
#define GGML_CUDA_CC_VEGA20 (GGML_CUDA_CC_OFFSET_AMD + 0x906) // MI50/Radeon VII, minimum for dp4a
|
||||
#define GGML_CUDA_CC_CDNA1 (GGML_CUDA_CC_OFFSET_AMD + 0x908) // MI100, minimum for MFMA, acc registers
|
||||
#define GGML_CUDA_CC_CDNA2 (GGML_CUDA_CC_OFFSET_AMD + 0x910) // MI210, minimum acc register renameing
|
||||
#define GGML_CUDA_CC_CDNA2 (GGML_CUDA_CC_OFFSET_AMD + 0x90a) // MI210 (gfx90a), minimum acc register renaming
|
||||
#define GGML_CUDA_CC_CDNA3 (GGML_CUDA_CC_OFFSET_AMD + 0x942) // MI300
|
||||
|
||||
// RDNA removes MFMA, dp4a, xnack, acc registers, wave size is 32
|
||||
|
||||
@@ -3447,11 +3447,19 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
CREATE_FA(GGML_TYPE_F16, f16, FA_SCALAR, )
|
||||
CREATE_FA(GGML_TYPE_Q4_0, q4_0, FA_SCALAR, )
|
||||
CREATE_FA(GGML_TYPE_Q8_0, q8_0, FA_SCALAR, )
|
||||
CREATE_FA(GGML_TYPE_Q4_1, q4_1, FA_SCALAR, )
|
||||
CREATE_FA(GGML_TYPE_Q5_0, q5_0, FA_SCALAR, )
|
||||
CREATE_FA(GGML_TYPE_Q5_1, q5_1, FA_SCALAR, )
|
||||
CREATE_FA(GGML_TYPE_IQ4_NL, iq4_nl, FA_SCALAR, )
|
||||
} else {
|
||||
CREATE_FA(GGML_TYPE_F32, f32, FA_SCALAR, _fp32)
|
||||
CREATE_FA(GGML_TYPE_F16, f16, FA_SCALAR, _fp32)
|
||||
CREATE_FA(GGML_TYPE_Q4_0, q4_0, FA_SCALAR, _fp32)
|
||||
CREATE_FA(GGML_TYPE_Q8_0, q8_0, FA_SCALAR, _fp32)
|
||||
CREATE_FA(GGML_TYPE_Q4_1, q4_1, FA_SCALAR, _fp32)
|
||||
CREATE_FA(GGML_TYPE_Q5_0, q5_0, FA_SCALAR, _fp32)
|
||||
CREATE_FA(GGML_TYPE_Q5_1, q5_1, FA_SCALAR, _fp32)
|
||||
CREATE_FA(GGML_TYPE_IQ4_NL, iq4_nl, FA_SCALAR, _fp32)
|
||||
}
|
||||
#if defined(VK_KHR_cooperative_matrix) && defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT)
|
||||
if (device->coopmat1_fa_support) {
|
||||
@@ -3459,6 +3467,10 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
CREATE_FA(GGML_TYPE_F16, f16, FA_COOPMAT1, _cm1)
|
||||
CREATE_FA(GGML_TYPE_Q4_0, q4_0, FA_COOPMAT1, _cm1)
|
||||
CREATE_FA(GGML_TYPE_Q8_0, q8_0, FA_COOPMAT1, _cm1)
|
||||
CREATE_FA(GGML_TYPE_Q4_1, q4_1, FA_COOPMAT1, _cm1)
|
||||
CREATE_FA(GGML_TYPE_Q5_0, q5_0, FA_COOPMAT1, _cm1)
|
||||
CREATE_FA(GGML_TYPE_Q5_1, q5_1, FA_COOPMAT1, _cm1)
|
||||
CREATE_FA(GGML_TYPE_IQ4_NL, iq4_nl, FA_COOPMAT1, _cm1)
|
||||
}
|
||||
#endif
|
||||
#if defined(VK_NV_cooperative_matrix2) && defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
|
||||
@@ -15331,11 +15343,12 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
||||
case GGML_TYPE_F32:
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q8_0:
|
||||
// supported in scalar and coopmat2 paths
|
||||
break;
|
||||
case GGML_TYPE_Q4_1:
|
||||
case GGML_TYPE_Q5_0:
|
||||
case GGML_TYPE_Q5_1:
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
// supported in scalar and coopmat2 paths
|
||||
break;
|
||||
// K dequants currently disabled because D dimension is rounded up to 256 and runs inefficiently
|
||||
//case GGML_TYPE_Q2_K:
|
||||
//case GGML_TYPE_Q3_K:
|
||||
@@ -15350,12 +15363,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
||||
//case GGML_TYPE_IQ3_XXS:
|
||||
//case GGML_TYPE_IQ3_S:
|
||||
//case GGML_TYPE_IQ4_XS:
|
||||
case GGML_TYPE_IQ4_NL:
|
||||
// currently supported only in coopmat2 path
|
||||
if (!coopmat2) {
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -110,6 +110,97 @@ FLOAT_TYPEV4 dequantize4(uint ib, uint iqs, uint a_offset, uint binding_idx) {
|
||||
|
||||
#if defined(DATA_A_Q4_0)
|
||||
#define BLOCK_BYTE_SIZE 18
|
||||
#elif defined(DATA_A_Q4_1)
|
||||
#define BLOCK_BYTE_SIZE 20
|
||||
#endif
|
||||
|
||||
#if defined(DATA_A_Q4_0) || defined(DATA_A_Q4_1)
|
||||
FLOAT_TYPEV4 dequantize4(uint ib, uint iqs, uint a_offset, uint binding_idx) {
|
||||
if (binding_idx == BINDING_IDX_K) {
|
||||
uint vui_lo = uint(k_packed.k_data_packed16[a_offset + ib].qs[(iqs & 0xF) / 2 + 0]);
|
||||
uint vui_hi = uint(k_packed.k_data_packed16[a_offset + ib].qs[(iqs & 0xF) / 2 + 1]);
|
||||
uint shift = (iqs & 0x10) >> 2;
|
||||
vui_lo >>= shift;
|
||||
vui_hi >>= shift;
|
||||
|
||||
FLOAT_TYPEV4 nibbles = FLOAT_TYPEV4(vui_lo & 0xF, (vui_lo >> 8) & 0xF, vui_hi & 0xF, (vui_hi >> 8) & 0xF);
|
||||
#ifdef DATA_A_Q4_1
|
||||
return FLOAT_TYPE(k_packed.k_data_packed16[a_offset + ib].d) * nibbles + FLOAT_TYPE(k_packed.k_data_packed16[a_offset + ib].m);
|
||||
#else
|
||||
return FLOAT_TYPE(k_packed.k_data_packed16[a_offset + ib].d) * (nibbles - FLOAT_TYPE(8.0f));
|
||||
#endif
|
||||
} else {
|
||||
uint vui_lo = uint(v_packed.v_data_packed16[a_offset + ib].qs[(iqs & 0xF) / 2 + 0]);
|
||||
uint vui_hi = uint(v_packed.v_data_packed16[a_offset + ib].qs[(iqs & 0xF) / 2 + 1]);
|
||||
uint shift = (iqs & 0x10) >> 2;
|
||||
vui_lo >>= shift;
|
||||
vui_hi >>= shift;
|
||||
|
||||
FLOAT_TYPEV4 nibbles = FLOAT_TYPEV4(vui_lo & 0xF, (vui_lo >> 8) & 0xF, vui_hi & 0xF, (vui_hi >> 8) & 0xF);
|
||||
#ifdef DATA_A_Q4_1
|
||||
return FLOAT_TYPE(v_packed.v_data_packed16[a_offset + ib].d) * nibbles + FLOAT_TYPE(v_packed.v_data_packed16[a_offset + ib].m);
|
||||
#else
|
||||
return FLOAT_TYPE(v_packed.v_data_packed16[a_offset + ib].d) * (nibbles - FLOAT_TYPE(8.0f));
|
||||
#endif
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(DATA_A_Q5_0)
|
||||
#define BLOCK_BYTE_SIZE 22
|
||||
#elif defined(DATA_A_Q5_1)
|
||||
#define BLOCK_BYTE_SIZE 24
|
||||
#endif
|
||||
|
||||
#if defined(DATA_A_Q5_0) || defined(DATA_A_Q5_1)
|
||||
FLOAT_TYPEV4 dequantize4(uint ib, uint iqs, uint a_offset, uint binding_idx) {
|
||||
if (binding_idx == BINDING_IDX_K) {
|
||||
uint vui_lo = uint(k_packed.k_data_packed16[a_offset + ib].qs[(iqs & 0xF) / 2 + 0]);
|
||||
uint vui_hi = uint(k_packed.k_data_packed16[a_offset + ib].qs[(iqs & 0xF) / 2 + 1]);
|
||||
uint shift = (iqs & 0x10) >> 2;
|
||||
vui_lo >>= shift;
|
||||
vui_hi >>= shift;
|
||||
|
||||
#ifdef DATA_A_Q5_1
|
||||
uint qh = k_packed.k_data_packed16[a_offset + ib].qh;
|
||||
#else
|
||||
uint qh = uint(k_packed.k_data_packed16[a_offset + ib].qh[0]) | (uint(k_packed.k_data_packed16[a_offset + ib].qh[1]) << 16);
|
||||
#endif
|
||||
FLOAT_TYPEV4 hb = FLOAT_TYPEV4((qh >> iqs) & 1, (qh >> (iqs + 1)) & 1, (qh >> (iqs + 2)) & 1, (qh >> (iqs + 3)) & 1) * FLOAT_TYPE(16.0f);
|
||||
|
||||
FLOAT_TYPEV4 nibbles = FLOAT_TYPEV4(vui_lo & 0xF, (vui_lo >> 8) & 0xF, vui_hi & 0xF, (vui_hi >> 8) & 0xF);
|
||||
#ifdef DATA_A_Q5_1
|
||||
return FLOAT_TYPE(k_packed.k_data_packed16[a_offset + ib].d) * (nibbles + hb) + FLOAT_TYPE(k_packed.k_data_packed16[a_offset + ib].m);
|
||||
#else
|
||||
return FLOAT_TYPE(k_packed.k_data_packed16[a_offset + ib].d) * (nibbles + hb - FLOAT_TYPE(16.0f));
|
||||
#endif
|
||||
} else {
|
||||
uint vui_lo = uint(v_packed.v_data_packed16[a_offset + ib].qs[(iqs & 0xF) / 2 + 0]);
|
||||
uint vui_hi = uint(v_packed.v_data_packed16[a_offset + ib].qs[(iqs & 0xF) / 2 + 1]);
|
||||
uint shift = (iqs & 0x10) >> 2;
|
||||
vui_lo >>= shift;
|
||||
vui_hi >>= shift;
|
||||
|
||||
#ifdef DATA_A_Q5_1
|
||||
uint qh = v_packed.v_data_packed16[a_offset + ib].qh;
|
||||
#else
|
||||
uint qh = uint(v_packed.v_data_packed16[a_offset + ib].qh[0]) | (uint(v_packed.v_data_packed16[a_offset + ib].qh[1]) << 16);
|
||||
#endif
|
||||
FLOAT_TYPEV4 hb = FLOAT_TYPEV4((qh >> iqs) & 1, (qh >> (iqs + 1)) & 1, (qh >> (iqs + 2)) & 1, (qh >> (iqs + 3)) & 1) * FLOAT_TYPE(16.0f);
|
||||
|
||||
FLOAT_TYPEV4 nibbles = FLOAT_TYPEV4(vui_lo & 0xF, (vui_lo >> 8) & 0xF, vui_hi & 0xF, (vui_hi >> 8) & 0xF);
|
||||
#ifdef DATA_A_Q5_1
|
||||
return FLOAT_TYPE(v_packed.v_data_packed16[a_offset + ib].d) * (nibbles + hb) + FLOAT_TYPE(v_packed.v_data_packed16[a_offset + ib].m);
|
||||
#else
|
||||
return FLOAT_TYPE(v_packed.v_data_packed16[a_offset + ib].d) * (nibbles + hb - FLOAT_TYPE(16.0f));
|
||||
#endif
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(DATA_A_IQ4_NL)
|
||||
#define BLOCK_BYTE_SIZE 18
|
||||
|
||||
FLOAT_TYPEV4 dequantize4(uint ib, uint iqs, uint a_offset, uint binding_idx) {
|
||||
if (binding_idx == BINDING_IDX_K) {
|
||||
@@ -119,7 +210,11 @@ FLOAT_TYPEV4 dequantize4(uint ib, uint iqs, uint a_offset, uint binding_idx) {
|
||||
vui_lo >>= shift;
|
||||
vui_hi >>= shift;
|
||||
|
||||
return FLOAT_TYPE(k_packed.k_data_packed16[a_offset + ib].d) * (FLOAT_TYPEV4(vui_lo & 0xF, (vui_lo >> 8) & 0xF, vui_hi & 0xF, (vui_hi >> 8) & 0xF) - FLOAT_TYPE(8.0f));
|
||||
return FLOAT_TYPE(k_packed.k_data_packed16[a_offset + ib].d) * FLOAT_TYPEV4(
|
||||
kvalues_iq4nl[vui_lo & 0xF],
|
||||
kvalues_iq4nl[(vui_lo >> 8) & 0xF],
|
||||
kvalues_iq4nl[vui_hi & 0xF],
|
||||
kvalues_iq4nl[(vui_hi >> 8) & 0xF]);
|
||||
} else {
|
||||
uint vui_lo = uint(v_packed.v_data_packed16[a_offset + ib].qs[(iqs & 0xF) / 2 + 0]);
|
||||
uint vui_hi = uint(v_packed.v_data_packed16[a_offset + ib].qs[(iqs & 0xF) / 2 + 1]);
|
||||
@@ -127,11 +222,14 @@ FLOAT_TYPEV4 dequantize4(uint ib, uint iqs, uint a_offset, uint binding_idx) {
|
||||
vui_lo >>= shift;
|
||||
vui_hi >>= shift;
|
||||
|
||||
return FLOAT_TYPE(v_packed.v_data_packed16[a_offset + ib].d) * (FLOAT_TYPEV4(vui_lo & 0xF, (vui_lo >> 8) & 0xF, vui_hi & 0xF, (vui_hi >> 8) & 0xF) - FLOAT_TYPE(8.0f));
|
||||
return FLOAT_TYPE(v_packed.v_data_packed16[a_offset + ib].d) * FLOAT_TYPEV4(
|
||||
kvalues_iq4nl[vui_lo & 0xF],
|
||||
kvalues_iq4nl[(vui_lo >> 8) & 0xF],
|
||||
kvalues_iq4nl[vui_hi & 0xF],
|
||||
kvalues_iq4nl[(vui_hi >> 8) & 0xF]);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(DATA_A_Q8_0)
|
||||
#define BLOCK_BYTE_SIZE 34
|
||||
FLOAT_TYPEV4 dequantize4(uint ib, uint iqs, uint a_offset, uint binding_idx) {
|
||||
|
||||
@@ -137,6 +137,7 @@ void execute_command(std::vector<std::string>& command, std::string& stdout_str,
|
||||
|
||||
pid_t pid = fork();
|
||||
if (pid < 0) {
|
||||
std::cerr << strerror(errno) << "\n";
|
||||
throw std::runtime_error("Failed to fork process");
|
||||
}
|
||||
|
||||
@@ -655,7 +656,7 @@ void process_shaders() {
|
||||
if (tname == "f16") {
|
||||
string_to_spv("flash_attn_f32_f16_" + tname, "flash_attn_cm1.comp",
|
||||
merge_maps(fa_base_dict, {{"Q_TYPE", "float"}, {"D_TYPE", "float"}, {"D_TYPEV4", "vec4"}, {"COOPMAT", "1"}}), fp16, true, false, f16acc);
|
||||
} else if (tname == "q4_0" || tname == "q8_0" || tname == "f32") {
|
||||
} else if (tname == "q4_0" || tname == "q4_1" || tname == "q5_0" || tname == "q5_1" || tname == "iq4_nl" || tname == "q8_0" || tname == "f32") {
|
||||
std::string data_a_key = "DATA_A_" + to_uppercase(tname);
|
||||
string_to_spv("flash_attn_f32_f16_" + tname, "flash_attn_cm1.comp",
|
||||
merge_maps(fa_base_dict, {{data_a_key, "1"}, {"Q_TYPE", "float"}, {"D_TYPE", "float"}, {"D_TYPEV4", "vec4"}, {"BLOCK_SIZE", "QUANT_K_"+to_uppercase(tname)}, {"COOPMAT", "1"}}), fp16, true, false, f16acc);
|
||||
@@ -666,7 +667,7 @@ void process_shaders() {
|
||||
if (tname == "f16") {
|
||||
string_to_spv("flash_attn_f32_f16_" + tname, "flash_attn.comp",
|
||||
merge_maps(fa_base_dict, {{"Q_TYPE", "float"}, {"D_TYPE", "float"}, {"D_TYPEV4", "vec4"}}), fp16, false, false, f16acc);
|
||||
} else if (tname == "q4_0" || tname == "q8_0" || tname == "f32") {
|
||||
} else if (tname == "q4_0" || tname == "q4_1" || tname == "q5_0" || tname == "q5_1" || tname == "iq4_nl" || tname == "q8_0" || tname == "f32") {
|
||||
std::string data_a_key = "DATA_A_" + to_uppercase(tname);
|
||||
string_to_spv("flash_attn_f32_f16_" + tname, "flash_attn.comp",
|
||||
merge_maps(fa_base_dict, {{data_a_key, "1"}, {"Q_TYPE", "float"}, {"D_TYPE", "float"}, {"D_TYPEV4", "vec4"}, {"BLOCK_SIZE", "QUANT_K_"+to_uppercase(tname) }}), fp16, false, false, f16acc);
|
||||
|
||||
2026
src/llama-arch.cpp
2026
src/llama-arch.cpp
File diff suppressed because it is too large
Load Diff
@@ -585,8 +585,6 @@ struct LLM_TN_IMPL {
|
||||
const int bid;
|
||||
const int xid;
|
||||
|
||||
const std::set<llm_tensor> model_tensors;
|
||||
|
||||
LLM_TN_IMPL(llm_arch arch, llm_tensor tensor, const char * suffix, int bid, int xid);
|
||||
|
||||
std::string str() const;
|
||||
|
||||
139
src/unicode.cpp
139
src/unicode.cpp
@@ -470,6 +470,141 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
|
||||
return bpe_offsets;
|
||||
}
|
||||
|
||||
// Qwen2 system regex: "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
|
||||
static std::vector<size_t> unicode_regex_split_custom_qwen2(const std::string & text, const std::vector<size_t> & offsets) {
|
||||
std::vector<size_t> bpe_offsets; // store the offset of each word
|
||||
bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
|
||||
|
||||
const auto cpts = unicode_cpts_from_utf8(text);
|
||||
|
||||
size_t start = 0;
|
||||
for (auto offset : offsets) {
|
||||
const size_t offset_ini = start;
|
||||
const size_t offset_end = start + offset;
|
||||
assert(offset_end <= cpts.size());
|
||||
start = offset_end;
|
||||
|
||||
static const uint32_t OUT_OF_RANGE = 0xFFFFFFFF;
|
||||
auto _get_cpt = [&] (const size_t pos) -> uint32_t {
|
||||
return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE;
|
||||
};
|
||||
|
||||
auto _get_flags = [&] (const size_t pos) -> unicode_cpt_flags {
|
||||
return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags_from_cpt(cpts[pos]) : unicode_cpt_flags{};
|
||||
};
|
||||
|
||||
size_t _prev_end = offset_ini;
|
||||
auto _add_token = [&] (const size_t end) -> size_t {
|
||||
assert(_prev_end <= end && end <= offset_end);
|
||||
size_t len = end - _prev_end;
|
||||
if (len > 0) {
|
||||
bpe_offsets.push_back(len);
|
||||
}
|
||||
_prev_end = end;
|
||||
//if (len > 0) {
|
||||
// std::string s = "";
|
||||
// for(size_t p = end-len; p < end; p++)
|
||||
// s += unicode_cpt_to_utf8(cpts[p]);
|
||||
// printf(">>> '%s'\n", s.c_str());
|
||||
//}
|
||||
return len;
|
||||
};
|
||||
|
||||
for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) {
|
||||
const uint32_t cpt = _get_cpt(pos);
|
||||
const auto flags = _get_flags(pos);
|
||||
|
||||
// regex: (?i:'s|'t|'re|'ve|'m|'ll|'d) // case insensitive
|
||||
if (cpt == '\'' && pos+1 < offset_end) {
|
||||
uint32_t cpt_next = unicode_tolower(_get_cpt(pos+1));
|
||||
if (cpt_next == 's' || cpt_next == 't' || cpt_next == 'm' || cpt_next == 'd') {
|
||||
pos += _add_token(pos+2);
|
||||
continue;
|
||||
}
|
||||
if (pos+2 < offset_end) {
|
||||
uint32_t cpt_next_next = unicode_tolower(_get_cpt(pos+2));
|
||||
if ((cpt_next == 'r' && cpt_next_next == 'e') ||
|
||||
(cpt_next == 'v' && cpt_next_next == 'e') ||
|
||||
(cpt_next == 'l' && cpt_next_next == 'l')) {
|
||||
pos += _add_token(pos+3);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// regex: [^\r\n\p{L}\p{N}]?\p{L}+
|
||||
if (!(cpt == '\r' || cpt == '\n' || flags.is_number)) {
|
||||
if (flags.is_letter || _get_flags(pos+1).is_letter) { // one or more letters
|
||||
pos++;
|
||||
while (_get_flags(pos).is_letter) {
|
||||
pos++;
|
||||
}
|
||||
_add_token(pos);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// regex: \p{N}
|
||||
if (flags.is_number) {
|
||||
pos++;
|
||||
_add_token(pos);
|
||||
continue;
|
||||
}
|
||||
|
||||
// regex: <space>?[^\s\p{L}\p{N}]+[\r\n]*
|
||||
auto flags2 = (cpt == ' ' ? _get_flags(pos+1) : flags);
|
||||
if (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags.as_uint()) {
|
||||
pos += (cpt == ' ');
|
||||
while (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags2.as_uint()) {
|
||||
flags2 = _get_flags(++pos);
|
||||
}
|
||||
uint32_t cpt2 = _get_cpt(pos);
|
||||
while (cpt2 == '\r' || cpt2 == '\n') {
|
||||
cpt2 = _get_cpt(++pos);
|
||||
}
|
||||
_add_token(pos);
|
||||
continue;
|
||||
}
|
||||
|
||||
size_t num_whitespaces = 0;
|
||||
size_t last_end_r_or_n = 0;
|
||||
while (_get_flags(pos+num_whitespaces).is_whitespace) {
|
||||
uint32_t cpt2 = _get_cpt(pos+num_whitespaces);
|
||||
if (cpt2 == '\r' || cpt2 == '\n') {
|
||||
last_end_r_or_n = pos + num_whitespaces + 1;
|
||||
}
|
||||
num_whitespaces++;
|
||||
}
|
||||
|
||||
// regex: \s*[\r\n]+
|
||||
if (last_end_r_or_n > 0) {
|
||||
pos = last_end_r_or_n;
|
||||
_add_token(pos);
|
||||
continue;
|
||||
}
|
||||
|
||||
// regex: \s+(?!\S)
|
||||
if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != OUT_OF_RANGE) {
|
||||
pos += num_whitespaces - 1;
|
||||
_add_token(pos);
|
||||
continue;
|
||||
}
|
||||
|
||||
// regex: \s+
|
||||
if (num_whitespaces > 0) {
|
||||
pos += num_whitespaces;
|
||||
_add_token(pos);
|
||||
continue;
|
||||
}
|
||||
|
||||
// no matches
|
||||
_add_token(++pos);
|
||||
}
|
||||
}
|
||||
|
||||
return bpe_offsets;
|
||||
}
|
||||
|
||||
template <typename CharT>
|
||||
static std::vector<size_t> unicode_regex_split_stl(const std::basic_string<CharT> & text, const std::basic_string<CharT> & regex, const std::vector<size_t> & offsets) {
|
||||
using BidirIt = typename std::basic_string<CharT>::const_iterator;
|
||||
@@ -790,8 +925,10 @@ static std::vector<size_t> unicode_regex_split_custom(const std::string & text,
|
||||
} else if (
|
||||
regex_expr == "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" ||
|
||||
regex_expr == "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+") {
|
||||
|
||||
bpe_offsets = unicode_regex_split_custom_llama3(text, offsets);
|
||||
} else if (
|
||||
regex_expr == "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+") {
|
||||
bpe_offsets = unicode_regex_split_custom_qwen2(text, offsets);
|
||||
} else if (regex_expr == "\\p{Han}+") {
|
||||
// K2's first pattern - handle all K2 patterns together
|
||||
bpe_offsets = unicode_regex_split_custom_kimi_k2(text, offsets);
|
||||
|
||||
@@ -3129,39 +3129,6 @@ struct test_add_id : public test_case {
|
||||
}
|
||||
};
|
||||
|
||||
// GGML_OP_ADD1
|
||||
struct test_add1 : public test_case {
|
||||
const ggml_type type;
|
||||
const std::array<int64_t, 4> ne;
|
||||
|
||||
std::string vars() override {
|
||||
return VARS_TO_STR2(type, ne);
|
||||
}
|
||||
|
||||
test_add1(ggml_type type = GGML_TYPE_F32,
|
||||
std::array<int64_t, 4> ne = {10, 5, 4, 3})
|
||||
: type(type), ne(ne) {}
|
||||
|
||||
ggml_tensor * build_graph(ggml_context * ctx) override {
|
||||
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
|
||||
ggml_set_param(a);
|
||||
ggml_set_name(a, "a");
|
||||
|
||||
ggml_tensor * b = ggml_new_tensor_1d(ctx, type, 1);
|
||||
// ggml_set_param(b); // TODO: implement
|
||||
ggml_set_name(b, "b");
|
||||
|
||||
ggml_tensor * out = ggml_add1(ctx, a, b);
|
||||
ggml_set_name(out, "out");
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
float grad_eps() override {
|
||||
return 0.1f * ne[0]*ne[1]*ne[2]*ne[3];
|
||||
}
|
||||
};
|
||||
|
||||
// GGML_OP_SCALE
|
||||
struct test_scale : public test_case {
|
||||
const ggml_type type;
|
||||
@@ -7886,8 +7853,6 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
|
||||
test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {16, 5, 4, 3}, {2, 2, 2, 2}, 8));
|
||||
test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {16, 5, 4, 3}, {1, 1, 1, 1}, 16));
|
||||
|
||||
test_cases.emplace_back(new test_add1());
|
||||
test_cases.emplace_back(new test_add1(GGML_TYPE_F32, {1024, 1024, 1, 1}));
|
||||
test_cases.emplace_back(new test_scale());
|
||||
test_cases.emplace_back(new test_scale(GGML_TYPE_F32, {10, 10, 10, 10}, 2.0f, 1.0f));
|
||||
test_cases.emplace_back(new test_scale(GGML_TYPE_F32, {10, 10, 10, 10}, 2.0f, 1.0f, true)); // inplace test
|
||||
|
||||
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -18,7 +18,7 @@
|
||||
<div style="display: contents">
|
||||
<script>
|
||||
{
|
||||
__sveltekit_1trm5n9 = {
|
||||
__sveltekit_10avopp = {
|
||||
base: new URL('.', location).pathname.slice(0, -1)
|
||||
};
|
||||
|
||||
|
||||
@@ -632,7 +632,7 @@ private:
|
||||
|
||||
// load the model and initialize llama_context
|
||||
// this may also be called to resume from sleeping state
|
||||
bool load_model(const common_params & params) {
|
||||
bool load_model(common_params & params) {
|
||||
bool is_resume = sleeping;
|
||||
|
||||
SRV_INF("loading model '%s'\n", params.model.path.c_str());
|
||||
@@ -641,6 +641,9 @@ private:
|
||||
|
||||
llama_init = common_init_from_params(params_base);
|
||||
|
||||
// propagate model-metadata sampling defaults back to caller
|
||||
params.sampling = params_base.sampling;
|
||||
|
||||
model = llama_init->model();
|
||||
ctx = llama_init->context();
|
||||
|
||||
@@ -2404,7 +2407,7 @@ private:
|
||||
// guarantee that a checkpoint will result in at least one token being processed [TAG_PROMPT_LOGITS]
|
||||
LOG_INF("slot %12.*s: id %2d | task %d | Checking checkpoint with [%d, %d] against %d...\n", 12,
|
||||
func_name, (slot).id, ((slot).task ? (slot).task->id : -1), cur.pos_min, cur.pos_max, pos_min_thold);
|
||||
return cur.pos_min < pos_min_thold;
|
||||
return cur.pos_min < pos_min_thold || cur.pos_min == 0;
|
||||
}
|
||||
);
|
||||
|
||||
@@ -2978,7 +2981,7 @@ private:
|
||||
server_context::server_context() : impl(new server_context_impl()) {}
|
||||
server_context::~server_context() = default;
|
||||
|
||||
bool server_context::load_model(const common_params & params) {
|
||||
bool server_context::load_model(common_params & params) {
|
||||
return impl->load_model(params);
|
||||
}
|
||||
|
||||
|
||||
@@ -56,7 +56,7 @@ struct server_context {
|
||||
|
||||
// load the model and initialize llama_context
|
||||
// returns true on success
|
||||
bool load_model(const common_params & params);
|
||||
bool load_model(common_params & params);
|
||||
|
||||
// this function will block main thread until termination
|
||||
void start_loop();
|
||||
|
||||
8
tools/server/webui/package-lock.json
generated
8
tools/server/webui/package-lock.json
generated
@@ -51,7 +51,6 @@
|
||||
"eslint-config-prettier": "^10.0.1",
|
||||
"eslint-plugin-storybook": "^10.2.4",
|
||||
"eslint-plugin-svelte": "^3.0.0",
|
||||
"fflate": "^0.8.2",
|
||||
"globals": "^16.0.0",
|
||||
"http-server": "^14.1.1",
|
||||
"mdast": "^3.0.0",
|
||||
@@ -5051,13 +5050,6 @@
|
||||
}
|
||||
}
|
||||
},
|
||||
"node_modules/fflate": {
|
||||
"version": "0.8.2",
|
||||
"resolved": "https://registry.npmjs.org/fflate/-/fflate-0.8.2.tgz",
|
||||
"integrity": "sha512-cPJU47OaAoCbg0pBvzsgpTPhmhqI5eJjh/JIu8tPj5q+T7iLvW/JAYUqmE7KOB4R1ZyEhzBaIQpQpardBF5z8A==",
|
||||
"dev": true,
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/file-entry-cache": {
|
||||
"version": "8.0.0",
|
||||
"resolved": "https://registry.npmjs.org/file-entry-cache/-/file-entry-cache-8.0.0.tgz",
|
||||
|
||||
@@ -33,7 +33,7 @@
|
||||
const showToolCallInProgress = $derived(config().showToolCallInProgress as boolean);
|
||||
const showThoughtInProgress = $derived(config().showThoughtInProgress as boolean);
|
||||
|
||||
const sections = $derived(deriveAgenticSections(message, toolMessages, []));
|
||||
const sections = $derived(deriveAgenticSections(message, toolMessages, [], isStreaming));
|
||||
|
||||
// Parse tool results with images
|
||||
const sectionsParsed = $derived(
|
||||
|
||||
@@ -16,6 +16,7 @@
|
||||
import { rehypeEnhanceLinks } from '$lib/markdown/enhance-links';
|
||||
import { rehypeEnhanceCodeBlocks } from '$lib/markdown/enhance-code-blocks';
|
||||
import { rehypeResolveAttachmentImages } from '$lib/markdown/resolve-attachment-images';
|
||||
import { rehypeRtlSupport } from '$lib/markdown/rehype-rtl-support';
|
||||
import { remarkLiteralHtml } from '$lib/markdown/literal-html';
|
||||
import { copyCodeToClipboard, preprocessLaTeX, getImageErrorFallbackHtml } from '$lib/utils';
|
||||
import {
|
||||
@@ -101,6 +102,7 @@
|
||||
.use(rehypeEnhanceLinks) // Add target="_blank" to links
|
||||
.use(rehypeEnhanceCodeBlocks) // Wrap code blocks with header and actions
|
||||
.use(rehypeResolveAttachmentImages, { attachments })
|
||||
.use(rehypeRtlSupport) // Add bidirectional text support
|
||||
.use(rehypeStringify, { allowDangerousHtml: true }); // Convert to HTML string
|
||||
});
|
||||
|
||||
@@ -781,19 +783,19 @@
|
||||
/* Lists */
|
||||
div :global(ul) {
|
||||
list-style-type: disc;
|
||||
margin-left: 1.5rem;
|
||||
margin-inline-start: 1.5rem;
|
||||
margin-bottom: 1rem;
|
||||
}
|
||||
|
||||
div :global(ol) {
|
||||
list-style-type: decimal;
|
||||
margin-left: 1.5rem;
|
||||
margin-inline-start: 1.5rem;
|
||||
margin-bottom: 1rem;
|
||||
}
|
||||
|
||||
div :global(li) {
|
||||
margin-bottom: 0.25rem;
|
||||
padding-left: 0.5rem;
|
||||
padding-inline-start: 0.5rem;
|
||||
}
|
||||
|
||||
div :global(li::marker) {
|
||||
@@ -816,8 +818,8 @@
|
||||
/* Task lists */
|
||||
div :global(.task-list-item) {
|
||||
list-style: none;
|
||||
margin-left: 0;
|
||||
padding-left: 0;
|
||||
margin-inline-start: 0;
|
||||
padding-inline-start: 0;
|
||||
}
|
||||
|
||||
div :global(.task-list-item-checkbox) {
|
||||
|
||||
28
tools/server/webui/src/lib/markdown/rehype-rtl-support.ts
Normal file
28
tools/server/webui/src/lib/markdown/rehype-rtl-support.ts
Normal file
@@ -0,0 +1,28 @@
|
||||
/**
|
||||
* Rehype plugin to provide comprehensive RTL support by adding dir="auto"
|
||||
* to all text-containing elements.
|
||||
*
|
||||
* This operates directly on the HAST tree, ensuring that all elements
|
||||
* (including those not in a predefined list) receive the attribute.
|
||||
*/
|
||||
|
||||
import type { Plugin } from 'unified';
|
||||
import type { Root, Element } from 'hast';
|
||||
import { visit } from 'unist-util-visit';
|
||||
|
||||
/**
|
||||
* Rehype plugin to add dir="auto" to all elements that have children.
|
||||
* This provides bidirectional text support for mixed RTL/LTR content.
|
||||
*/
|
||||
export const rehypeRtlSupport: Plugin<[], Root> = () => {
|
||||
return (tree: Root) => {
|
||||
visit(tree, 'element', (node: Element) => {
|
||||
if (node.children && node.children.length > 0) {
|
||||
node.properties = {
|
||||
...node.properties,
|
||||
dir: 'auto'
|
||||
};
|
||||
}
|
||||
});
|
||||
};
|
||||
};
|
||||
@@ -474,6 +474,7 @@ class AgenticStore {
|
||||
sessionMessages.push({
|
||||
role: MessageRole.ASSISTANT,
|
||||
content: turnContent || undefined,
|
||||
reasoning_content: turnReasoningContent || undefined,
|
||||
tool_calls: normalizedCalls
|
||||
});
|
||||
|
||||
|
||||
@@ -41,6 +41,7 @@ export type AgenticMessage =
|
||||
| {
|
||||
role: MessageRole.ASSISTANT;
|
||||
content?: string | ApiChatMessageContentPart[];
|
||||
reasoning_content?: string;
|
||||
tool_calls?: AgenticToolCallPayload[];
|
||||
}
|
||||
| {
|
||||
|
||||
@@ -38,14 +38,19 @@ export type ToolResultLine = {
|
||||
function deriveSingleTurnSections(
|
||||
message: DatabaseMessage,
|
||||
toolMessages: DatabaseMessage[] = [],
|
||||
streamingToolCalls: ApiChatCompletionToolCall[] = []
|
||||
streamingToolCalls: ApiChatCompletionToolCall[] = [],
|
||||
isStreaming: boolean = false
|
||||
): AgenticSection[] {
|
||||
const sections: AgenticSection[] = [];
|
||||
|
||||
// 1. Reasoning content (from dedicated field)
|
||||
if (message.reasoningContent) {
|
||||
const toolCalls = parseToolCalls(message.toolCalls);
|
||||
const hasContentAfterReasoning =
|
||||
!!message.content?.trim() || toolCalls.length > 0 || streamingToolCalls.length > 0;
|
||||
const isPending = isStreaming && !hasContentAfterReasoning;
|
||||
sections.push({
|
||||
type: AgenticSectionType.REASONING,
|
||||
type: isPending ? AgenticSectionType.REASONING_PENDING : AgenticSectionType.REASONING,
|
||||
content: message.reasoningContent
|
||||
});
|
||||
}
|
||||
@@ -104,12 +109,13 @@ function deriveSingleTurnSections(
|
||||
export function deriveAgenticSections(
|
||||
message: DatabaseMessage,
|
||||
toolMessages: DatabaseMessage[] = [],
|
||||
streamingToolCalls: ApiChatCompletionToolCall[] = []
|
||||
streamingToolCalls: ApiChatCompletionToolCall[] = [],
|
||||
isStreaming: boolean = false
|
||||
): AgenticSection[] {
|
||||
const hasAssistantContinuations = toolMessages.some((m) => m.role === MessageRole.ASSISTANT);
|
||||
|
||||
if (!hasAssistantContinuations) {
|
||||
return deriveSingleTurnSections(message, toolMessages, streamingToolCalls);
|
||||
return deriveSingleTurnSections(message, toolMessages, streamingToolCalls, isStreaming);
|
||||
}
|
||||
|
||||
const sections: AgenticSection[] = [];
|
||||
@@ -127,7 +133,12 @@ export function deriveAgenticSections(
|
||||
const isLastTurn = i + 1 + turnToolMsgs.length >= toolMessages.length;
|
||||
|
||||
sections.push(
|
||||
...deriveSingleTurnSections(msg, turnToolMsgs, isLastTurn ? streamingToolCalls : [])
|
||||
...deriveSingleTurnSections(
|
||||
msg,
|
||||
turnToolMsgs,
|
||||
isLastTurn ? streamingToolCalls : [],
|
||||
isLastTurn && isStreaming
|
||||
)
|
||||
);
|
||||
|
||||
i += 1 + turnToolMsgs.length;
|
||||
|
||||
@@ -162,6 +162,36 @@ describe('deriveAgenticSections', () => {
|
||||
expect(sections[4].content).toBe('Here is the analysis.');
|
||||
});
|
||||
|
||||
it('returns REASONING_PENDING when streaming with only reasoning content', () => {
|
||||
const msg = makeAssistant({
|
||||
reasoningContent: 'Let me think about this...'
|
||||
});
|
||||
const sections = deriveAgenticSections(msg, [], [], true);
|
||||
expect(sections).toHaveLength(1);
|
||||
expect(sections[0].type).toBe(AgenticSectionType.REASONING_PENDING);
|
||||
expect(sections[0].content).toBe('Let me think about this...');
|
||||
});
|
||||
|
||||
it('returns REASONING (not pending) when streaming but text content has appeared', () => {
|
||||
const msg = makeAssistant({
|
||||
content: 'The answer is',
|
||||
reasoningContent: 'Let me think...'
|
||||
});
|
||||
const sections = deriveAgenticSections(msg, [], [], true);
|
||||
expect(sections).toHaveLength(2);
|
||||
expect(sections[0].type).toBe(AgenticSectionType.REASONING);
|
||||
expect(sections[1].type).toBe(AgenticSectionType.TEXT);
|
||||
});
|
||||
|
||||
it('returns REASONING (not pending) when not streaming', () => {
|
||||
const msg = makeAssistant({
|
||||
reasoningContent: 'Let me think...'
|
||||
});
|
||||
const sections = deriveAgenticSections(msg, [], [], false);
|
||||
expect(sections).toHaveLength(1);
|
||||
expect(sections[0].type).toBe(AgenticSectionType.REASONING);
|
||||
});
|
||||
|
||||
it('multi-turn: streaming tool calls on last turn', () => {
|
||||
const assistant1 = makeAssistant({
|
||||
toolCalls: JSON.stringify([
|
||||
|
||||
Reference in New Issue
Block a user