mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-03-19 14:53:28 +02:00
Compare commits
13 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
4065c1a3a6 | ||
|
|
1e64534570 | ||
|
|
cd708db0cc | ||
|
|
512bba6ee0 | ||
|
|
b486c17b3e | ||
|
|
1b9bbaa357 | ||
|
|
07feeaa92e | ||
|
|
3fee84e156 | ||
|
|
811397745e | ||
|
|
c014c3f83a | ||
|
|
7f2cbd9a4d | ||
|
|
509a31d00f | ||
|
|
ea01d196d7 |
@@ -936,7 +936,9 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
|
||||
for (auto msg : inputs.messages) {
|
||||
if (msg.contains("reasoning_content") && msg.at("reasoning_content").is_string()) {
|
||||
msg["thinking"] = msg.at("reasoning_content");
|
||||
msg.erase("content");
|
||||
if (msg.contains("tool_calls") && msg.at("tool_calls").is_array() && !msg.at("tool_calls").empty()) {
|
||||
msg.erase("content");
|
||||
}
|
||||
}
|
||||
adjusted_messages.push_back(msg);
|
||||
}
|
||||
|
||||
@@ -1062,6 +1062,10 @@ class TextModel(ModelBase):
|
||||
self.gguf_writer.add_head_count_kv(n_head_kv)
|
||||
logger.info(f"gguf: key-value head count = {n_head_kv}")
|
||||
|
||||
if self.hparams.get("is_causal") is False:
|
||||
self.gguf_writer.add_causal_attention(False)
|
||||
logger.info("gguf: causal attention = False")
|
||||
|
||||
# TODO: Handle "sliding_attention" similarly when models start implementing it
|
||||
rope_params = self.rope_parameters.get("full_attention", self.rope_parameters)
|
||||
if (rope_type := rope_params.get("rope_type")) is not None:
|
||||
|
||||
@@ -28,6 +28,9 @@ Additionally, there the following images, similar to the above:
|
||||
- `ghcr.io/ggml-org/llama.cpp:full-vulkan`: Same as `full` but compiled with Vulkan support. (platforms: `linux/amd64`)
|
||||
- `ghcr.io/ggml-org/llama.cpp:light-vulkan`: Same as `light` but compiled with Vulkan support. (platforms: `linux/amd64`)
|
||||
- `ghcr.io/ggml-org/llama.cpp:server-vulkan`: Same as `server` but compiled with Vulkan support. (platforms: `linux/amd64`)
|
||||
- `ghcr.io/ggml-org/llama.cpp:full-openvino`: Same as `full` but compiled with OpenVino support. (platforms: `linux/amd64`)
|
||||
- `ghcr.io/ggml-org/llama.cpp:light-openvino`: Same as `light` but compiled with OpenVino support. (platforms: `linux/amd64`)
|
||||
- `ghcr.io/ggml-org/llama.cpp:server-openvino`: Same as `server` but compiled with OpenVino support. (platforms: `linux/amd64`)
|
||||
|
||||
The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](../.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](../.github/workflows/docker.yml). If you need different settings (for example, a different CUDA, ROCm or MUSA library, you'll need to build the images locally for now).
|
||||
|
||||
|
||||
@@ -37,7 +37,7 @@ Legend:
|
||||
| CROSS_ENTROPY_LOSS | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
||||
| CROSS_ENTROPY_LOSS_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
||||
| CUMSUM | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ |
|
||||
| DIAG | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
|
||||
| DIAG | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ |
|
||||
| DIAG_MASK_INF | ❌ | ✅ | ✅ | ✅ | ❌ | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| DIV | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| DUP | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
@@ -62,7 +62,7 @@ Legend:
|
||||
| HARDSWISH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||
| IM2COL | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| IM2COL_3D | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
|
||||
| L2_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| L2_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| LEAKY_RELU | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ |
|
||||
| LOG | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | ✅ | ✅ | ❌ | ❌ |
|
||||
| MEAN | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
@@ -115,7 +115,7 @@ Legend:
|
||||
| TANH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||
| TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| TOP_K | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
|
||||
| TRI | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| TRI | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| TRUNC | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
|
||||
| UPSCALE | ❌ | 🟡 | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| XIELU | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ |
|
||||
|
||||
@@ -5744,49 +5744,61 @@
|
||||
"WebGPU: WebGPU","NORM","type=f32,ne=[64,5,4,3],v=1,eps=0.000000","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","RMS_NORM","type=f32,ne=[64,5,4,3],v=1,eps=0.000000,inplace=0","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","RMS_NORM_BACK","type=f32,ne=[64,5,4,3],eps=0.000000","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[64,5,4,3]","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[64,5,4,3],eps=0.000000,v=0","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[64,5,4,3],eps=0.000000,v=1","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","NORM","type=f32,ne=[1025,5,4,3],v=0,eps=0.000000","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","RMS_NORM","type=f32,ne=[1025,5,4,3],v=0,eps=0.000000,inplace=0","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","NORM","type=f32,ne=[1025,5,4,3],v=1,eps=0.000000","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","RMS_NORM","type=f32,ne=[1025,5,4,3],v=1,eps=0.000000,inplace=0","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","RMS_NORM_BACK","type=f32,ne=[1025,5,4,3],eps=0.000000","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[1025,5,4,3]","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[1025,5,4,3],eps=0.000000,v=0","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[1025,5,4,3],eps=0.000000,v=1","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","NORM","type=f32,ne=[64,5,4,3],v=0,eps=0.000001","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","RMS_NORM","type=f32,ne=[64,5,4,3],v=0,eps=0.000001,inplace=0","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","NORM","type=f32,ne=[64,5,4,3],v=1,eps=0.000001","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","RMS_NORM","type=f32,ne=[64,5,4,3],v=1,eps=0.000001,inplace=0","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","RMS_NORM_BACK","type=f32,ne=[64,5,4,3],eps=0.000001","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[64,5,4,3]","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[64,5,4,3],eps=0.000001,v=0","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[64,5,4,3],eps=0.000001,v=1","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","NORM","type=f32,ne=[1025,5,4,3],v=0,eps=0.000001","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","RMS_NORM","type=f32,ne=[1025,5,4,3],v=0,eps=0.000001,inplace=0","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","NORM","type=f32,ne=[1025,5,4,3],v=1,eps=0.000001","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","RMS_NORM","type=f32,ne=[1025,5,4,3],v=1,eps=0.000001,inplace=0","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","RMS_NORM_BACK","type=f32,ne=[1025,5,4,3],eps=0.000001","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[1025,5,4,3]","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[1025,5,4,3],eps=0.000001,v=0","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[1025,5,4,3],eps=0.000001,v=1","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","NORM","type=f32,ne=[64,5,4,3],v=0,eps=0.000100","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","RMS_NORM","type=f32,ne=[64,5,4,3],v=0,eps=0.000100,inplace=0","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","NORM","type=f32,ne=[64,5,4,3],v=1,eps=0.000100","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","RMS_NORM","type=f32,ne=[64,5,4,3],v=1,eps=0.000100,inplace=0","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","RMS_NORM_BACK","type=f32,ne=[64,5,4,3],eps=0.000100","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[64,5,4,3]","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[64,5,4,3],eps=0.000100,v=0","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[64,5,4,3],eps=0.000100,v=1","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","NORM","type=f32,ne=[1025,5,4,3],v=0,eps=0.000100","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","RMS_NORM","type=f32,ne=[1025,5,4,3],v=0,eps=0.000100,inplace=0","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","NORM","type=f32,ne=[1025,5,4,3],v=1,eps=0.000100","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","RMS_NORM","type=f32,ne=[1025,5,4,3],v=1,eps=0.000100,inplace=0","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","RMS_NORM_BACK","type=f32,ne=[1025,5,4,3],eps=0.000100","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[1025,5,4,3]","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[1025,5,4,3],eps=0.000100,v=0","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[1025,5,4,3],eps=0.000100,v=1","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","NORM","type=f32,ne=[64,5,4,3],v=0,eps=0.100000","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","RMS_NORM","type=f32,ne=[64,5,4,3],v=0,eps=0.100000,inplace=0","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","NORM","type=f32,ne=[64,5,4,3],v=1,eps=0.100000","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","RMS_NORM","type=f32,ne=[64,5,4,3],v=1,eps=0.100000,inplace=0","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","RMS_NORM_BACK","type=f32,ne=[64,5,4,3],eps=0.100000","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[64,5,4,3]","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[64,5,4,3],eps=0.100000,v=0","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[64,5,4,3],eps=0.100000,v=1","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","NORM","type=f32,ne=[1025,5,4,3],v=0,eps=0.100000","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","RMS_NORM","type=f32,ne=[1025,5,4,3],v=0,eps=0.100000,inplace=0","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","NORM","type=f32,ne=[1025,5,4,3],v=1,eps=0.100000","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","RMS_NORM","type=f32,ne=[1025,5,4,3],v=1,eps=0.100000,inplace=0","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","RMS_NORM_BACK","type=f32,ne=[1025,5,4,3],eps=0.100000","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[1025,5,4,3]","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[1025,5,4,3],eps=0.100000,v=0","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[1025,5,4,3],eps=0.100000,v=1","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[64,5,4,3],eps=10.000000,v=0","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[64,5,4,3],eps=10.000000,v=1","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[1025,5,4,3],eps=10.000000,v=0","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[1025,5,4,3],eps=10.000000,v=1","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","RMS_NORM","type=f32,ne=[64,5,4,3],v=0,eps=0.000001,inplace=1","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","SSM_CONV","type=f32,ne_a=[3,1024,1,1],ne_b=[3,1024,1,1]","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","SSM_CONV","type=f32,ne_a=[6,1024,1,1],ne_b=[3,1024,1,1]","support","0","no","WebGPU"
|
||||
@@ -10036,17 +10048,17 @@
|
||||
"WebGPU: WebGPU","CUMSUM","type=f32,ne=[375960,1,1,1]","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","CUMSUM","type=f32,ne=[20481,4,1,1]","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","XIELU","type=f32,ne=[10,5,4,3]","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","TRI","type=f32,ne=[10,10,4,3],tri_type=3","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","TRI","type=f32,ne=[10,10,4,3],tri_type=2","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","TRI","type=f32,ne=[10,10,4,3],tri_type=1","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","TRI","type=f32,ne=[10,10,4,3],tri_type=0","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","TRI","type=f32,ne=[10,10,4,3],tri_type=3","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","TRI","type=f32,ne=[10,10,4,3],tri_type=2","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","TRI","type=f32,ne=[10,10,4,3],tri_type=1","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","TRI","type=f32,ne=[10,10,4,3],tri_type=0","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","FILL","type=f32,ne=[10,10,4,3],c=0.000000","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","FILL","type=f32,ne=[303,207,11,3],c=2.000000","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","FILL","type=f32,ne=[800,600,4,4],c=-152.000000","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","FILL","type=f32,ne=[2048,512,2,2],c=3.500000","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","DIAG","type=f32,ne=[10,1,4,3]","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","DIAG","type=f32,ne=[79,1,19,13]","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","DIAG","type=f32,ne=[256,1,8,16]","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","DIAG","type=f32,ne=[10,1,4,3]","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","DIAG","type=f32,ne=[79,1,19,13]","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","DIAG","type=f32,ne=[256,1,8,16]","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","SOLVE_TRI","type=f32,ne_lhs=[10,10,4,3],ne_rhs=[3,10,4,3]","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","SOLVE_TRI","type=f32,ne_lhs=[11,11,1,1],ne_rhs=[5,11,1,1]","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","SOLVE_TRI","type=f32,ne_lhs=[17,17,2,4],ne_rhs=[9,17,2,4]","support","0","no","WebGPU"
|
||||
|
||||
|
Can't render this file because it is too large.
|
@@ -2943,6 +2943,27 @@ void ggml_cann_rope(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
||||
// Rotate full tensor (no tail), using trans tensors
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src_trans_tensor.get(), acl_cos_reshape_tensor.get(),
|
||||
acl_sin_reshape_tensor.get(), acl_mode, acl_dst_trans_tensor.get());
|
||||
} else if (src0->data == dst->data && !ggml_is_contiguous(src0)) {
|
||||
// In-place on non-contiguous tensor: RotaryPositionEmbedding cannot safely
|
||||
// read and write the same non-contiguous buffer. Use contiguous temporaries.
|
||||
size_t contiguous_nb[GGML_MAX_DIMS];
|
||||
contiguous_nb[0] = sizeof(float);
|
||||
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||
contiguous_nb[i] = contiguous_nb[i - 1] * src0->ne[i - 1];
|
||||
}
|
||||
int64_t total_elements = ggml_nelements(src0);
|
||||
ggml_cann_pool_alloc inplace_src_alloc(ctx.pool(), total_elements * sizeof(float));
|
||||
ggml_cann_pool_alloc inplace_dst_alloc(ctx.pool(), total_elements * sizeof(float));
|
||||
|
||||
acl_tensor_ptr acl_src_contig = ggml_cann_create_tensor(inplace_src_alloc.get(), ACL_FLOAT, sizeof(float),
|
||||
src0->ne, contiguous_nb, GGML_MAX_DIMS);
|
||||
acl_tensor_ptr acl_dst_contig = ggml_cann_create_tensor(inplace_dst_alloc.get(), ACL_FLOAT, sizeof(float),
|
||||
dst->ne, contiguous_nb, GGML_MAX_DIMS);
|
||||
|
||||
cann_copy(ctx, acl_src.get(), acl_src_contig.get());
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src_contig.get(), acl_cos_reshape_tensor.get(),
|
||||
acl_sin_reshape_tensor.get(), acl_mode, acl_dst_contig.get());
|
||||
cann_copy(ctx, acl_dst_contig.get(), acl_dst.get());
|
||||
} else {
|
||||
// Rotate full tensor (no tail), using original tensors
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src.get(), acl_cos_reshape_tensor.get(),
|
||||
|
||||
@@ -570,24 +570,34 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz")
|
||||
set(KLEIDIAI_ARCHIVE_MD5 "54049037570ab0ee0a0d126b2ba5ece1")
|
||||
|
||||
if (POLICY CMP0135)
|
||||
cmake_policy(SET CMP0135 NEW)
|
||||
endif()
|
||||
|
||||
# TODO: Use FetchContent_MakeAvailable with EXCLUDE_FROM_ALL after bumping minimum CMake version to 3.28+
|
||||
# Using FetchContent_Populate instead to avoid EXCLUDE_FROM_ALL which requires CMake 3.28
|
||||
FetchContent_Declare(KleidiAI_Download
|
||||
set(KLEIDIAI_FETCH_ARGS
|
||||
URL ${KLEIDIAI_DOWNLOAD_URL}
|
||||
DOWNLOAD_EXTRACT_TIMESTAMP NEW
|
||||
URL_HASH MD5=${KLEIDIAI_ARCHIVE_MD5})
|
||||
URL_HASH MD5=${KLEIDIAI_ARCHIVE_MD5}
|
||||
)
|
||||
|
||||
FetchContent_GetProperties(KleidiAI_Download
|
||||
SOURCE_DIR KLEIDIAI_SRC
|
||||
POPULATED KLEIDIAI_POPULATED)
|
||||
if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.28")
|
||||
FetchContent_Declare(KleidiAI_Download
|
||||
${KLEIDIAI_FETCH_ARGS}
|
||||
EXCLUDE_FROM_ALL
|
||||
)
|
||||
|
||||
if (NOT KLEIDIAI_POPULATED)
|
||||
FetchContent_Populate(KleidiAI_Download)
|
||||
FetchContent_MakeAvailable(KleidiAI_Download)
|
||||
FetchContent_GetProperties(KleidiAI_Download SOURCE_DIR KLEIDIAI_SRC)
|
||||
else()
|
||||
FetchContent_Declare(KleidiAI_Download
|
||||
${KLEIDIAI_FETCH_ARGS}
|
||||
)
|
||||
|
||||
FetchContent_GetProperties(KleidiAI_Download
|
||||
SOURCE_DIR KLEIDIAI_SRC
|
||||
POPULATED KLEIDIAI_POPULATED
|
||||
)
|
||||
|
||||
if (NOT KLEIDIAI_POPULATED)
|
||||
FetchContent_Populate(KleidiAI_Download)
|
||||
FetchContent_GetProperties(KleidiAI_Download SOURCE_DIR KLEIDIAI_SRC)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
add_compile_definitions(GGML_USE_CPU_KLEIDIAI)
|
||||
|
||||
@@ -444,19 +444,20 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
|
||||
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
|
||||
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2;
|
||||
|
||||
const uint ib = idx / 128; // 2 values per idx
|
||||
const uint ib32 = (idx % 128) / 16; // 0..7
|
||||
const uint iq = 16 * ib32 + 2 * (idx % 8);
|
||||
const uint ib = idx / 64; // 4 values per idx
|
||||
const uint ib32 = (idx % 64) / 8; // 0..7
|
||||
const uint iq = 4 * ib32 + (idx % 4);
|
||||
|
||||
const uint sl = (data_a[ib].scales_l[ib32/2] >> (4 * (ib32 & 1))) & 0xF;
|
||||
const uint sh = ((data_a[ib].scales_h) >> (2 * ib32)) & 3;
|
||||
const uint qshift = (idx & 8) >> 1;
|
||||
u8vec2 qs = unpack8((uint(data_a_packed16[ib].qs[iq/2]) >> qshift) & 0x0F0F).xy;
|
||||
const uint qshift = idx & 4;
|
||||
u8vec4 qs = unpack8((uint(data_a_packed32[ib].qs[iq]) >> qshift) & 0x0F0F0F0F);
|
||||
|
||||
const float d = float(data_a[ib].d);
|
||||
const vec2 v = d * float(int(sl | (sh << 4)) - 32) * vec2(kvalues_iq4nl[qs.x], kvalues_iq4nl[qs.y]);
|
||||
const vec4 v = d * float(int(sl | (sh << 4)) - 32) * vec4(kvalues_iq4nl[qs.x], kvalues_iq4nl[qs.y], kvalues_iq4nl[qs.z], kvalues_iq4nl[qs.w]);
|
||||
|
||||
buf_a[buf_idx ] = FLOAT_TYPE_VEC2(v.xy);
|
||||
buf_a[buf_idx + 1] = FLOAT_TYPE_VEC2(v.zw);
|
||||
#elif defined(DATA_A_IQ4_NL)
|
||||
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
|
||||
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 4;
|
||||
|
||||
@@ -554,7 +554,7 @@ void matmul_shaders(bool fp16, MatMulIdType matmul_id_type, bool coopmat, bool c
|
||||
std::string load_vec_quant = "2";
|
||||
if ((tname == "q4_0") || (tname == "q4_1") || (tname == "q5_1") || (tname == "iq1_s") || (tname == "iq1_m") || (tname == "iq2_xxs") || (tname == "iq2_xs") || (tname == "iq2_s"))
|
||||
load_vec_quant = "8";
|
||||
else if ((tname == "q5_0") || (tname == "q8_0") || (tname == "q2_k") || (tname == "q4_k") || (tname == "q5_k") || (tname == "iq3_xxs") || (tname == "iq3_s") || (tname == "iq4_nl") || (tname == "mxfp4"))
|
||||
else if ((tname == "q5_0") || (tname == "q8_0") || (tname == "q2_k") || (tname == "q4_k") || (tname == "q5_k") || (tname == "iq3_xxs") || (tname == "iq3_s") || (tname == "iq4_xs") || (tname == "iq4_nl") || (tname == "mxfp4"))
|
||||
load_vec_quant = "4";
|
||||
|
||||
if (tname == "bf16") {
|
||||
|
||||
@@ -151,6 +151,26 @@ struct ggml_webgpu_get_rows_pipeline_key_hash {
|
||||
}
|
||||
};
|
||||
|
||||
/** Row Norm **/
|
||||
|
||||
struct ggml_webgpu_row_norm_pipeline_key {
|
||||
ggml_op op;
|
||||
bool inplace;
|
||||
|
||||
bool operator==(const ggml_webgpu_row_norm_pipeline_key & other) const {
|
||||
return op == other.op && inplace == other.inplace;
|
||||
}
|
||||
};
|
||||
|
||||
struct ggml_webgpu_row_norm_pipeline_key_hash {
|
||||
size_t operator()(const ggml_webgpu_row_norm_pipeline_key & key) const {
|
||||
size_t seed = 0;
|
||||
ggml_webgpu_hash_combine(seed, key.op);
|
||||
ggml_webgpu_hash_combine(seed, key.inplace);
|
||||
return seed;
|
||||
}
|
||||
};
|
||||
|
||||
/** Pad **/
|
||||
struct ggml_webgpu_pad_pipeline_key {
|
||||
bool circular;
|
||||
@@ -244,13 +264,15 @@ struct ggml_webgpu_binary_pipeline_key_hash {
|
||||
/** Unary **/
|
||||
|
||||
struct ggml_webgpu_unary_pipeline_key {
|
||||
int type;
|
||||
int op;
|
||||
bool is_unary; // many unary operators fall under the GGML_OP_UNARY umbrella
|
||||
bool inplace;
|
||||
int type;
|
||||
int op;
|
||||
bool is_unary; // many unary operators fall under the GGML_OP_UNARY umbrella
|
||||
bool inplace;
|
||||
ggml_tri_type ttype; // only used for GGML_OP_TRI
|
||||
|
||||
bool operator==(const ggml_webgpu_unary_pipeline_key & other) const {
|
||||
return type == other.type && op == other.op && is_unary == other.is_unary && inplace == other.inplace;
|
||||
return type == other.type && op == other.op && is_unary == other.is_unary && inplace == other.inplace &&
|
||||
ttype == other.ttype;
|
||||
}
|
||||
};
|
||||
|
||||
@@ -261,6 +283,7 @@ struct ggml_webgpu_unary_pipeline_key_hash {
|
||||
ggml_webgpu_hash_combine(seed, key.op);
|
||||
ggml_webgpu_hash_combine(seed, key.is_unary);
|
||||
ggml_webgpu_hash_combine(seed, key.inplace);
|
||||
ggml_webgpu_hash_combine(seed, key.ttype);
|
||||
return seed;
|
||||
}
|
||||
};
|
||||
@@ -435,6 +458,8 @@ class ggml_webgpu_shader_lib {
|
||||
std::unordered_map<int, webgpu_pipeline> argsort_pipelines; // key is order
|
||||
std::unordered_map<int, webgpu_pipeline> argsort_merge_pipelines; // key is order
|
||||
std::unordered_map<int, webgpu_pipeline> cumsum_pipelines; // key is fixed, no variants yet
|
||||
std::unordered_map<ggml_webgpu_row_norm_pipeline_key, webgpu_pipeline, ggml_webgpu_row_norm_pipeline_key_hash>
|
||||
row_norm_pipelines; // op/inplace
|
||||
std::unordered_map<ggml_webgpu_get_rows_pipeline_key, webgpu_pipeline, ggml_webgpu_get_rows_pipeline_key_hash>
|
||||
get_rows_pipelines; // src_type, vectorized
|
||||
std::unordered_map<ggml_webgpu_unary_pipeline_key, webgpu_pipeline, ggml_webgpu_unary_pipeline_key_hash>
|
||||
@@ -479,6 +504,44 @@ class ggml_webgpu_shader_lib {
|
||||
return sum_rows_pipelines[1];
|
||||
}
|
||||
|
||||
webgpu_pipeline get_row_norm_pipeline(const ggml_webgpu_shader_lib_context & context) {
|
||||
ggml_webgpu_row_norm_pipeline_key key = {
|
||||
.op = context.dst->op,
|
||||
.inplace = context.inplace,
|
||||
};
|
||||
|
||||
auto it = row_norm_pipelines.find(key);
|
||||
if (it != row_norm_pipelines.end()) {
|
||||
return it->second;
|
||||
}
|
||||
std::vector<std::string> defines;
|
||||
std::string variant;
|
||||
|
||||
switch (key.op) {
|
||||
case GGML_OP_RMS_NORM:
|
||||
defines.push_back("OP_RMS_NORM");
|
||||
variant = "rms_norm";
|
||||
break;
|
||||
case GGML_OP_L2_NORM:
|
||||
defines.push_back("OP_L2_NORM");
|
||||
variant = "l2_norm";
|
||||
break;
|
||||
default:
|
||||
GGML_ABORT("Unsupported op for row_norm shader");
|
||||
}
|
||||
|
||||
if (key.inplace) {
|
||||
defines.push_back("INPLACE");
|
||||
variant += "_inplace";
|
||||
}
|
||||
|
||||
defines.push_back(std::string("WG_SIZE=") + std::to_string(context.max_wg_size));
|
||||
|
||||
auto processed = preprocessor.preprocess(wgsl_row_norm, defines);
|
||||
row_norm_pipelines[key] = ggml_webgpu_create_pipeline(device, processed, variant);
|
||||
return row_norm_pipelines[key];
|
||||
}
|
||||
|
||||
webgpu_pipeline get_argmax_pipeline(const ggml_webgpu_shader_lib_context & context) {
|
||||
bool vec4 = context.src0->ne[0] % 4 == 0;
|
||||
|
||||
@@ -1058,6 +1121,7 @@ class ggml_webgpu_shader_lib {
|
||||
.op = op,
|
||||
.is_unary = is_unary,
|
||||
.inplace = context.inplace,
|
||||
.ttype = (ggml_tri_type) ggml_get_op_params_i32(context.dst, 0),
|
||||
};
|
||||
|
||||
auto it = unary_pipelines.find(key);
|
||||
@@ -1088,6 +1152,29 @@ class ggml_webgpu_shader_lib {
|
||||
variant += "_inplace";
|
||||
}
|
||||
|
||||
if (op == GGML_OP_TRI) {
|
||||
switch (key.ttype) {
|
||||
case GGML_TRI_TYPE_LOWER:
|
||||
defines.push_back("TRI_TYPE_LOWER");
|
||||
variant += "_tri_type_lower";
|
||||
break;
|
||||
case GGML_TRI_TYPE_LOWER_DIAG:
|
||||
defines.push_back("TRI_TYPE_LOWER_DIAG");
|
||||
variant += "_tri_type_lower_diag";
|
||||
break;
|
||||
case GGML_TRI_TYPE_UPPER:
|
||||
defines.push_back("TRI_TYPE_UPPER");
|
||||
variant += "_tri_type_upper";
|
||||
break;
|
||||
case GGML_TRI_TYPE_UPPER_DIAG:
|
||||
defines.push_back("TRI_TYPE_UPPER_DIAG");
|
||||
variant += "_tri_upper_diag";
|
||||
break;
|
||||
default:
|
||||
GGML_ABORT("Unsupported ggml_tri_type for unary shader");
|
||||
}
|
||||
}
|
||||
|
||||
defines.push_back(std::string("WG_SIZE=") + std::to_string(context.max_wg_size));
|
||||
|
||||
auto processed = preprocessor.preprocess(wgsl_unary, defines);
|
||||
|
||||
@@ -366,7 +366,6 @@ struct webgpu_context_struct {
|
||||
|
||||
std::map<int, std::map<int, webgpu_pipeline>> cpy_pipelines; // src_type, dst_type
|
||||
|
||||
std::map<int, webgpu_pipeline> rms_norm_pipelines; // inplace
|
||||
std::map<int, std::map<int, std::map<int, webgpu_pipeline>>> rope_pipelines; // type, ff, inplace
|
||||
std::map<int, std::map<int, std::map<int, webgpu_pipeline>>> glu_pipelines; // glu_op, type, split
|
||||
|
||||
@@ -1598,8 +1597,8 @@ static webgpu_command ggml_webgpu_repeat(webgpu_context & ctx, ggml_tensor * src
|
||||
return ggml_backend_webgpu_build(ctx->global_ctx, ctx->param_buf_pool, pipeline, params, entries, wg_x);
|
||||
}
|
||||
|
||||
static webgpu_command ggml_webgpu_rms_norm(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * dst) {
|
||||
int inplace = ggml_webgpu_tensor_equal(src, dst);
|
||||
static webgpu_command ggml_webgpu_row_norm(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * dst) {
|
||||
bool inplace = ggml_webgpu_tensor_equal(src, dst);
|
||||
|
||||
std::vector<uint32_t> params = {
|
||||
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src) / ggml_type_size(src->type)),
|
||||
@@ -1630,8 +1629,15 @@ static webgpu_command ggml_webgpu_rms_norm(webgpu_context & ctx, ggml_tensor * s
|
||||
.size = ggml_webgpu_tensor_binding_size(ctx, dst) });
|
||||
}
|
||||
|
||||
return ggml_backend_webgpu_build(ctx->global_ctx, ctx->param_buf_pool, ctx->rms_norm_pipelines[inplace], params,
|
||||
entries, ggml_nrows(src));
|
||||
ggml_webgpu_shader_lib_context shader_lib_ctx = {
|
||||
.src0 = src,
|
||||
.dst = dst,
|
||||
.max_wg_size = WEBGPU_ROW_SPLIT_WG_SIZE,
|
||||
.inplace = inplace,
|
||||
};
|
||||
|
||||
webgpu_pipeline pipeline = ctx->shader_lib->get_row_norm_pipeline(shader_lib_ctx);
|
||||
return ggml_backend_webgpu_build(ctx->global_ctx, ctx->param_buf_pool, pipeline, params, entries, ggml_nrows(src));
|
||||
}
|
||||
|
||||
static webgpu_command ggml_webgpu_rope(webgpu_context & ctx,
|
||||
@@ -2192,7 +2198,8 @@ static std::optional<webgpu_command> ggml_webgpu_encode_node(webgpu_context ctx,
|
||||
case GGML_OP_REPEAT:
|
||||
return ggml_webgpu_repeat(ctx, src0, node);
|
||||
case GGML_OP_RMS_NORM:
|
||||
return ggml_webgpu_rms_norm(ctx, src0, node);
|
||||
case GGML_OP_L2_NORM:
|
||||
return ggml_webgpu_row_norm(ctx, src0, node);
|
||||
case GGML_OP_ROPE:
|
||||
return ggml_webgpu_rope(ctx, src0, src1, src2, node);
|
||||
case GGML_OP_GLU:
|
||||
@@ -2209,6 +2216,8 @@ static std::optional<webgpu_command> ggml_webgpu_encode_node(webgpu_context ctx,
|
||||
case GGML_OP_SQRT:
|
||||
case GGML_OP_SIN:
|
||||
case GGML_OP_COS:
|
||||
case GGML_OP_DIAG:
|
||||
case GGML_OP_TRI:
|
||||
return ggml_webgpu_unary_op(ctx, src0, node);
|
||||
case GGML_OP_PAD:
|
||||
return ggml_webgpu_pad(ctx, src0, node);
|
||||
@@ -2614,15 +2623,6 @@ static void ggml_webgpu_init_cpy_pipeline(webgpu_context & webgpu_ctx) {
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->global_ctx->device, wgsl_cpy_f16_f16, "cpy_f16_f16", constants);
|
||||
}
|
||||
|
||||
static void ggml_webgpu_init_rms_norm_pipeline(webgpu_context & webgpu_ctx) {
|
||||
std::vector<wgpu::ConstantEntry> constants = ggml_webgpu_wg_size_entry(WEBGPU_ROW_SPLIT_WG_SIZE);
|
||||
|
||||
webgpu_ctx->rms_norm_pipelines[0] =
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->global_ctx->device, wgsl_rms_norm, "rms_norm", constants);
|
||||
webgpu_ctx->rms_norm_pipelines[1] = ggml_webgpu_create_pipeline(
|
||||
webgpu_ctx->global_ctx->device, wgsl_rms_norm_inplace, "rms_norm_inplace", constants);
|
||||
}
|
||||
|
||||
static void ggml_webgpu_init_rope_pipeline(webgpu_context & webgpu_ctx) {
|
||||
std::vector<wgpu::ConstantEntry> constants = ggml_webgpu_wg_size_entry(WEBGPU_MAX_WG_SIZE);
|
||||
|
||||
@@ -2907,7 +2907,6 @@ static webgpu_context initialize_webgpu_context(ggml_backend_dev_t dev) {
|
||||
wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead, "set_rows_host_error_buf");
|
||||
|
||||
ggml_webgpu_init_cpy_pipeline(webgpu_ctx);
|
||||
ggml_webgpu_init_rms_norm_pipeline(webgpu_ctx);
|
||||
ggml_webgpu_init_rope_pipeline(webgpu_ctx);
|
||||
ggml_webgpu_init_glu_pipeline(webgpu_ctx);
|
||||
ggml_webgpu_init_soft_max_pipeline(webgpu_ctx);
|
||||
@@ -3118,6 +3117,7 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const
|
||||
break;
|
||||
}
|
||||
case GGML_OP_RMS_NORM:
|
||||
case GGML_OP_L2_NORM:
|
||||
supports_op = op->type == GGML_TYPE_F32 && src0->type == GGML_TYPE_F32;
|
||||
break;
|
||||
case GGML_OP_ROPE:
|
||||
@@ -3201,6 +3201,12 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const
|
||||
case GGML_OP_COS:
|
||||
supports_op = (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) && (src0->type == op->type);
|
||||
break;
|
||||
case GGML_OP_DIAG:
|
||||
supports_op = (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) && (src0->type == op->type);
|
||||
break;
|
||||
case GGML_OP_TRI:
|
||||
supports_op = (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) && (src0->type == op->type);
|
||||
break;
|
||||
case GGML_OP_PAD:
|
||||
supports_op = op->type == GGML_TYPE_F32 && src0->type == GGML_TYPE_F32;
|
||||
break;
|
||||
|
||||
@@ -1,21 +1,11 @@
|
||||
#define(VARIANTS)
|
||||
|
||||
[
|
||||
{
|
||||
"DECLS": ["NOT_INPLACE"]
|
||||
},
|
||||
{
|
||||
"SHADER_SUFFIX": "inplace",
|
||||
"DECLS": ["INPLACE"]
|
||||
},
|
||||
]
|
||||
|
||||
#end(VARIANTS)
|
||||
|
||||
#define(DECLS)
|
||||
|
||||
#decl(NOT_INPLACE)
|
||||
#ifdef INPLACE
|
||||
fn update(src_offset: u32, dst_offset: u32, scale: f32) {
|
||||
src[dst_offset] = scale * src[src_offset];
|
||||
}
|
||||
|
||||
@group(0) @binding(1)
|
||||
var<uniform> params: Params;
|
||||
#else
|
||||
fn update(src_offset: u32, dst_offset: u32, scale: f32) {
|
||||
dst[dst_offset] = scale * src[src_offset];
|
||||
}
|
||||
@@ -25,23 +15,7 @@ var<storage, read_write> dst: array<f32>;
|
||||
|
||||
@group(0) @binding(2)
|
||||
var<uniform> params: Params;
|
||||
|
||||
#enddecl(NOT_INPLACE)
|
||||
|
||||
#decl(INPLACE)
|
||||
|
||||
fn update(src_offset: u32, dst_offset: u32, scale: f32) {
|
||||
src[dst_offset] = scale * src[src_offset];
|
||||
}
|
||||
|
||||
@group(0) @binding(1)
|
||||
var<uniform> params: Params;
|
||||
|
||||
#enddecl(INPLACE)
|
||||
|
||||
#end(DECLS)
|
||||
|
||||
#define(SHADER)
|
||||
#endif
|
||||
|
||||
struct Params {
|
||||
offset_src: u32, // in elements
|
||||
@@ -68,12 +42,9 @@ struct Params {
|
||||
@group(0) @binding(0)
|
||||
var<storage, read_write> src: array<f32>;
|
||||
|
||||
DECLS
|
||||
var<workgroup> scratch: array<f32, WG_SIZE>;
|
||||
|
||||
override wg_size: u32;
|
||||
var<workgroup> scratch: array<f32, wg_size>;
|
||||
|
||||
@compute @workgroup_size(wg_size)
|
||||
@compute @workgroup_size(WG_SIZE)
|
||||
fn main(@builtin(workgroup_id) wid: vec3<u32>,
|
||||
@builtin(local_invocation_id) lid: vec3<u32>) {
|
||||
|
||||
@@ -86,7 +57,7 @@ fn main(@builtin(workgroup_id) wid: vec3<u32>,
|
||||
let i_src_row = params.offset_src + i3 * params.stride_src3 + i2 * params.stride_src2 + i1 * params.stride_src1;
|
||||
let i_dst_row = params.offset_dst + i3 * params.stride_dst3 + i2 * params.stride_dst2 + i1 * params.stride_dst1;
|
||||
|
||||
let elems = (params.ne0 + wg_size - 1) / wg_size;
|
||||
let elems = (params.ne0 + WG_SIZE - 1) / WG_SIZE;
|
||||
|
||||
var sum = 0.0f;
|
||||
var col = lid.x;
|
||||
@@ -95,12 +66,12 @@ fn main(@builtin(workgroup_id) wid: vec3<u32>,
|
||||
break;
|
||||
}
|
||||
sum += pow(src[i_src_row + col], 2.0);
|
||||
col += wg_size;
|
||||
col += WG_SIZE;
|
||||
}
|
||||
|
||||
scratch[lid.x] = sum;
|
||||
workgroupBarrier();
|
||||
var offset = wg_size / 2;
|
||||
var offset: u32 = WG_SIZE / 2;
|
||||
while (offset > 0) {
|
||||
if (lid.x < offset) {
|
||||
scratch[lid.x] += scratch[lid.x + offset];
|
||||
@@ -110,14 +81,17 @@ fn main(@builtin(workgroup_id) wid: vec3<u32>,
|
||||
}
|
||||
sum = scratch[0];
|
||||
|
||||
#ifdef OP_RMS_NORM
|
||||
let scale = 1.0/sqrt(sum/f32(params.ne0) + params.eps);
|
||||
#elif OP_L2_NORM
|
||||
let scale = 1.0/max(sqrt(sum), params.eps);
|
||||
#endif
|
||||
col = lid.x;
|
||||
for (var j: u32 = 0; j < elems; j++) {
|
||||
if (col >= params.ne0) {
|
||||
break;
|
||||
}
|
||||
update(i_src_row + col, i_dst_row + col, scale);
|
||||
col += wg_size;
|
||||
col += WG_SIZE;
|
||||
}
|
||||
}
|
||||
#end(SHADER)
|
||||
@@ -5,7 +5,6 @@ enable f16;
|
||||
#define TYPE f32
|
||||
#endif
|
||||
|
||||
|
||||
@group(0) @binding(0)
|
||||
var<storage, read_write> src: array<TYPE>;
|
||||
|
||||
@@ -57,12 +56,20 @@ fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
|
||||
return;
|
||||
}
|
||||
var i = gid.x;
|
||||
let i3 = i / (params.ne2 * params.ne1 * params.ne0);
|
||||
i = i % (params.ne2 * params.ne1 * params.ne0);
|
||||
let i2 = i / (params.ne1 * params.ne0);
|
||||
i = i % (params.ne1 * params.ne0);
|
||||
let i1 = i / params.ne0;
|
||||
let i0 = i % params.ne0;
|
||||
let ne2 = params.ne2;
|
||||
#ifdef DIAG
|
||||
let ne1 = params.ne0;
|
||||
#else
|
||||
let ne1 = params.ne1;
|
||||
#endif
|
||||
let ne0 = params.ne0;
|
||||
|
||||
let i3 = i / (ne2 * ne1 * ne0);
|
||||
i = i % (ne2 * ne1 * ne0);
|
||||
let i2 = i / (ne1 * ne0);
|
||||
i = i % (ne1 * ne0);
|
||||
let i1 = i / ne0;
|
||||
let i0 = i % ne0;
|
||||
|
||||
let src_idx = i0 * params.stride_src0 + i1 * params.stride_src1 +
|
||||
i2 * params.stride_src2 + i3 * params.stride_src3;
|
||||
@@ -184,6 +191,20 @@ fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
|
||||
let res_f32 = cos(f32(src[params.offset_src + src_idx]));
|
||||
let res = TYPE(res_f32);
|
||||
#endif
|
||||
#ifdef DIAG
|
||||
let res = select(0.0, src[params.offset_src + i0 + i2 * params.stride_src2 + i3 * params.stride_src3], i0 == i1);
|
||||
#endif
|
||||
#ifdef TRI
|
||||
#ifdef TRI_TYPE_LOWER
|
||||
let res = select(0.0, src[params.offset_src + src_idx], i0 < i1);
|
||||
#elif TRI_TYPE_LOWER_DIAG
|
||||
let res = select(0.0, src[params.offset_src + src_idx], i0 <= i1);
|
||||
#elif TRI_TYPE_UPPER
|
||||
let res = select(0.0, src[params.offset_src + src_idx], i0 > i1);
|
||||
#elif TRI_TYPE_UPPER_DIAG
|
||||
let res = select(0.0, src[params.offset_src + src_idx], i0 >= i1);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef INPLACE
|
||||
src[params.offset_src + src_idx] = res;
|
||||
|
||||
@@ -2129,19 +2129,28 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
throw std::runtime_error("cannot find tokenizer vocab in model file\n");
|
||||
}
|
||||
|
||||
const uint32_t n_tokens = gguf_get_arr_n(ctx, token_idx);
|
||||
|
||||
const float * scores = nullptr;
|
||||
const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str());
|
||||
if (score_idx != -1) {
|
||||
const uint32_t n_scores = gguf_get_arr_n(ctx, score_idx);
|
||||
if (n_scores < n_tokens) {
|
||||
throw std::runtime_error("Index out of array bounds for scores (" + std::to_string(n_scores) + " < " + std::to_string(n_tokens) + ")\n");
|
||||
}
|
||||
scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
|
||||
}
|
||||
|
||||
const int * toktypes = nullptr;
|
||||
const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
|
||||
if (toktype_idx != -1) {
|
||||
const uint32_t n_toktypes = gguf_get_arr_n(ctx, toktype_idx);
|
||||
if (n_toktypes < n_tokens) {
|
||||
throw std::runtime_error("Index out of array bounds for toktypes (" + std::to_string(n_toktypes) + " < " + std::to_string(n_tokens) + ")\n");
|
||||
}
|
||||
toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
|
||||
}
|
||||
|
||||
uint32_t n_tokens = gguf_get_arr_n(ctx, token_idx);
|
||||
id_to_token.resize(n_tokens);
|
||||
|
||||
for (uint32_t i = 0; i < n_tokens; i++) {
|
||||
|
||||
@@ -41,6 +41,11 @@ struct clip_graph {
|
||||
virtual ~clip_graph() = default;
|
||||
virtual ggml_cgraph * build() = 0;
|
||||
|
||||
// wrapper around ggml_mul_mat, allow hooking (e.g. LoRA, clamping) depending on the model
|
||||
// tensor w should be the weight matrix, and tensor x should be the input
|
||||
virtual ggml_tensor * build_mm(ggml_tensor * w, ggml_tensor * x) const;
|
||||
// TODO: build_mm(w, b, x) to support bias
|
||||
|
||||
//
|
||||
// utility functions
|
||||
//
|
||||
|
||||
@@ -255,6 +255,10 @@ clip_graph::clip_graph(clip_ctx * ctx, const clip_image_f32 & img) :
|
||||
gf = ggml_new_graph_custom(ctx0, ctx->max_nodes, false);
|
||||
}
|
||||
|
||||
ggml_tensor * clip_graph::build_mm(ggml_tensor * w, ggml_tensor * x) const {
|
||||
return ggml_mul_mat(ctx0, w, x);
|
||||
}
|
||||
|
||||
void clip_graph::cb(ggml_tensor * cur, const char * name, int il) const {
|
||||
if (il >= 0) {
|
||||
ggml_format_name(cur, "%s-%d", name, il);
|
||||
@@ -326,7 +330,7 @@ ggml_tensor * clip_graph::build_vit(
|
||||
ggml_tensor * Vcur = nullptr;
|
||||
if (layer.qkv_w != nullptr) {
|
||||
// fused qkv
|
||||
cur = ggml_mul_mat(ctx0, layer.qkv_w, cur);
|
||||
cur = build_mm(layer.qkv_w, cur);
|
||||
if (layer.qkv_b != nullptr) {
|
||||
cur = ggml_add(ctx0, cur, layer.qkv_b);
|
||||
}
|
||||
@@ -360,17 +364,17 @@ ggml_tensor * clip_graph::build_vit(
|
||||
|
||||
} else {
|
||||
// separate q, k, v
|
||||
Qcur = ggml_mul_mat(ctx0, layer.q_w, cur);
|
||||
Qcur = build_mm(layer.q_w, cur);
|
||||
if (layer.q_b) {
|
||||
Qcur = ggml_add(ctx0, Qcur, layer.q_b);
|
||||
}
|
||||
|
||||
Kcur = ggml_mul_mat(ctx0, layer.k_w, cur);
|
||||
Kcur = build_mm(layer.k_w, cur);
|
||||
if (layer.k_b) {
|
||||
Kcur = ggml_add(ctx0, Kcur, layer.k_b);
|
||||
}
|
||||
|
||||
Vcur = ggml_mul_mat(ctx0, layer.v_w, cur);
|
||||
Vcur = build_mm(layer.v_w, cur);
|
||||
if (layer.v_b) {
|
||||
Vcur = ggml_add(ctx0, Vcur, layer.v_b);
|
||||
}
|
||||
@@ -517,7 +521,7 @@ ggml_tensor * clip_graph::build_ffn(
|
||||
ffn_op_type type_op,
|
||||
int il) const {
|
||||
|
||||
ggml_tensor * tmp = up ? ggml_mul_mat(ctx0, up, cur) : cur;
|
||||
ggml_tensor * tmp = up ? build_mm(up, cur) : cur;
|
||||
cb(tmp, "ffn_up", il);
|
||||
|
||||
if (up_b) {
|
||||
@@ -526,7 +530,7 @@ ggml_tensor * clip_graph::build_ffn(
|
||||
}
|
||||
|
||||
if (gate) {
|
||||
cur = ggml_mul_mat(ctx0, gate, cur);
|
||||
cur = build_mm(gate, cur);
|
||||
cb(cur, "ffn_gate", il);
|
||||
|
||||
if (gate_b) {
|
||||
@@ -580,7 +584,7 @@ ggml_tensor * clip_graph::build_ffn(
|
||||
}
|
||||
|
||||
if (down) {
|
||||
cur = ggml_mul_mat(ctx0, down, cur);
|
||||
cur = build_mm(down, cur);
|
||||
}
|
||||
|
||||
if (down_b) {
|
||||
@@ -646,7 +650,7 @@ ggml_tensor * clip_graph::build_attn(
|
||||
cb(cur, "kqv_out", il);
|
||||
|
||||
if (wo) {
|
||||
cur = ggml_mul_mat(ctx0, wo, cur);
|
||||
cur = build_mm(wo, cur);
|
||||
}
|
||||
|
||||
if (wo_b) {
|
||||
|
||||
@@ -19,7 +19,7 @@ ggml_cgraph * clip_graph_cogvlm::build() {
|
||||
auto & layer = model.layers[il];
|
||||
ggml_tensor * cur = inpL;
|
||||
|
||||
cur = ggml_mul_mat(ctx0, layer.qkv_w, cur);
|
||||
cur = build_mm(layer.qkv_w, cur);
|
||||
|
||||
cur = ggml_add(ctx0, cur, layer.qkv_b);
|
||||
|
||||
@@ -67,7 +67,7 @@ ggml_cgraph * clip_graph_cogvlm::build() {
|
||||
ggml_row_size(inpL->type, n_embd), 0);
|
||||
|
||||
// Multiply with mm_model_proj
|
||||
cur = ggml_mul_mat(ctx0, model.mm_model_proj, cur);
|
||||
cur = build_mm(model.mm_model_proj, cur);
|
||||
|
||||
// Apply layernorm, weight, bias
|
||||
cur = build_norm(cur, model.mm_post_fc_norm_w, model.mm_post_fc_norm_b, NORM_TYPE_NORMAL, 1e-5, -1);
|
||||
@@ -76,16 +76,16 @@ ggml_cgraph * clip_graph_cogvlm::build() {
|
||||
cur = ggml_gelu_inplace(ctx0, cur);
|
||||
|
||||
// Branch 1: multiply with mm_h_to_4h_w
|
||||
ggml_tensor * h_to_4h = ggml_mul_mat(ctx0, model.mm_h_to_4h_w, cur);
|
||||
ggml_tensor * h_to_4h = build_mm(model.mm_h_to_4h_w, cur);
|
||||
|
||||
// Branch 2: multiply with mm_gate_w
|
||||
ggml_tensor * gate = ggml_mul_mat(ctx0, model.mm_gate_w, cur);
|
||||
ggml_tensor * gate = build_mm(model.mm_gate_w, cur);
|
||||
|
||||
// Apply silu
|
||||
gate = ggml_swiglu_split(ctx0, gate, h_to_4h);
|
||||
|
||||
// Apply mm_4h_to_h_w
|
||||
cur = ggml_mul_mat(ctx0, model.mm_4h_to_h_w, gate);
|
||||
cur = build_mm(model.mm_4h_to_h_w, gate);
|
||||
|
||||
// Concatenate with boi and eoi
|
||||
cur = ggml_concat(ctx0, model.mm_boi, cur, 1);
|
||||
|
||||
@@ -56,7 +56,7 @@ ggml_cgraph * clip_graph_conformer::build() {
|
||||
cur = ggml_reshape_2d(ctx0, cur, cur->ne[0] * cur->ne[1], cur->ne[2]);
|
||||
|
||||
// calculate out
|
||||
cur = ggml_mul_mat(ctx0, model.pre_encode_out_w, cur);
|
||||
cur = build_mm(model.pre_encode_out_w, cur);
|
||||
cur = ggml_add(ctx0, cur, model.pre_encode_out_b);
|
||||
cb(cur, "conformer.pre_encode.out", -1);
|
||||
}
|
||||
@@ -87,7 +87,7 @@ ggml_cgraph * clip_graph_conformer::build() {
|
||||
cur = build_norm(residual, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, 1e-5, il);
|
||||
cb(cur, "conformer.layers.{}.norm_self_att", il);
|
||||
|
||||
ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.q_w, cur);
|
||||
ggml_tensor * Qcur = build_mm(layer.q_w, cur);
|
||||
Qcur = ggml_add(ctx0, Qcur, layer.q_b);
|
||||
Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, Qcur->ne[1]);
|
||||
ggml_tensor * Q_bias_u = ggml_add(ctx0, Qcur, layer.pos_bias_u);
|
||||
@@ -96,12 +96,12 @@ ggml_cgraph * clip_graph_conformer::build() {
|
||||
Q_bias_v = ggml_permute(ctx0, Q_bias_v, 0, 2, 1, 3);
|
||||
|
||||
// TODO @ngxson : some cont can/should be removed when ggml_mul_mat support these cases
|
||||
ggml_tensor * Kcur = ggml_mul_mat(ctx0, layer.k_w, cur);
|
||||
ggml_tensor * Kcur = build_mm(layer.k_w, cur);
|
||||
Kcur = ggml_add(ctx0, Kcur, layer.k_b);
|
||||
Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, Kcur->ne[1]);
|
||||
Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
|
||||
|
||||
ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.v_w, cur);
|
||||
ggml_tensor * Vcur = build_mm(layer.v_w, cur);
|
||||
Vcur = ggml_add(ctx0, Vcur, layer.v_b);
|
||||
Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, Vcur->ne[1]);
|
||||
Vcur = ggml_cont(ctx0, ggml_permute(ctx0, Vcur, 1, 2, 0, 3));
|
||||
@@ -111,7 +111,7 @@ ggml_cgraph * clip_graph_conformer::build() {
|
||||
matrix_ac = ggml_cont(ctx0, ggml_permute(ctx0, matrix_ac, 1, 0, 2, 3));
|
||||
cb(matrix_ac, "conformer.layers.{}.self_attn.id3", il);
|
||||
|
||||
auto * p = ggml_mul_mat(ctx0, layer.linear_pos_w, pos_emb);
|
||||
auto * p = build_mm(layer.linear_pos_w, pos_emb);
|
||||
cb(p, "conformer.layers.{}.self_attn.linear_pos", il);
|
||||
p = ggml_reshape_3d(ctx0, p, d_head, n_head, p->ne[1]);
|
||||
p = ggml_permute(ctx0, p, 0, 2, 1, 3);
|
||||
@@ -143,7 +143,7 @@ ggml_cgraph * clip_graph_conformer::build() {
|
||||
x = ggml_permute(ctx0, x, 2, 0, 1, 3);
|
||||
x = ggml_cont_2d(ctx0, x, x->ne[0] * x->ne[1], x->ne[2]);
|
||||
|
||||
ggml_tensor * out = ggml_mul_mat(ctx0, layer.o_w, x);
|
||||
ggml_tensor * out = build_mm(layer.o_w, x);
|
||||
out = ggml_add(ctx0, out, layer.o_b);
|
||||
cb(out, "conformer.layers.{}.self_attn.linear_out", il);
|
||||
|
||||
@@ -157,7 +157,7 @@ ggml_cgraph * clip_graph_conformer::build() {
|
||||
// conv
|
||||
{
|
||||
auto * x = cur;
|
||||
x = ggml_mul_mat(ctx0, layer.conv_pw1_w, x);
|
||||
x = build_mm(layer.conv_pw1_w, x);
|
||||
x = ggml_add(ctx0, x, layer.conv_pw1_b);
|
||||
cb(x, "conformer.layers.{}.conv.pointwise_conv1", il);
|
||||
|
||||
@@ -181,7 +181,7 @@ ggml_cgraph * clip_graph_conformer::build() {
|
||||
x = ggml_silu(ctx0, x);
|
||||
|
||||
// pointwise_conv2
|
||||
x = ggml_mul_mat(ctx0, layer.conv_pw2_w, x);
|
||||
x = build_mm(layer.conv_pw2_w, x);
|
||||
x = ggml_add(ctx0, x, layer.conv_pw2_b);
|
||||
|
||||
cur = x;
|
||||
|
||||
@@ -97,7 +97,7 @@ ggml_cgraph * clip_graph_glm4v::build() {
|
||||
|
||||
// FC projector
|
||||
{
|
||||
cur = ggml_mul_mat(ctx0, model.projection, cur);
|
||||
cur = build_mm(model.projection, cur);
|
||||
// default LayerNorm (post_projection_norm)
|
||||
cur = build_norm(cur, model.mm_post_norm_w, model.mm_post_norm_b, NORM_TYPE_NORMAL, 1e-5, -1);
|
||||
cur = ggml_gelu_erf(ctx0, cur);
|
||||
|
||||
@@ -22,7 +22,7 @@ ggml_cgraph * clip_graph_llama4::build() {
|
||||
ggml_tensor * kernel = ggml_reshape_4d(ctx0, model.patch_embeddings_0,
|
||||
patch_size, patch_size, 3, n_embd);
|
||||
inp = ggml_im2col(ctx0, kernel, inp, patch_size, patch_size, 0, 0, 1, 1, true, inp->type);
|
||||
inp = ggml_mul_mat(ctx0, model.patch_embeddings_0, inp);
|
||||
inp = build_mm(model.patch_embeddings_0, inp);
|
||||
inp = ggml_reshape_2d(ctx0, inp, n_embd, n_patches);
|
||||
cb(inp, "patch_conv", -1);
|
||||
}
|
||||
@@ -78,15 +78,15 @@ ggml_cgraph * clip_graph_llama4::build() {
|
||||
|
||||
// based on Llama4VisionMLP2 (always uses GELU activation, no bias)
|
||||
{
|
||||
cur = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, cur);
|
||||
cur = build_mm(model.mm_model_mlp_1_w, cur);
|
||||
cur = ggml_gelu(ctx0, cur);
|
||||
cur = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, cur);
|
||||
cur = build_mm(model.mm_model_mlp_2_w, cur);
|
||||
cur = ggml_gelu(ctx0, cur);
|
||||
cb(cur, "adapter_mlp", -1);
|
||||
}
|
||||
|
||||
// Llama4MultiModalProjector
|
||||
cur = ggml_mul_mat(ctx0, model.mm_model_proj, cur);
|
||||
cur = build_mm(model.mm_model_proj, cur);
|
||||
cb(cur, "projected", -1);
|
||||
|
||||
// build the graph
|
||||
|
||||
@@ -70,17 +70,17 @@ ggml_cgraph * clip_graph_llava::build() {
|
||||
|
||||
// self-attention
|
||||
{
|
||||
ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.q_w, cur);
|
||||
ggml_tensor * Qcur = build_mm(layer.q_w, cur);
|
||||
if (layer.q_b) {
|
||||
Qcur = ggml_add(ctx0, Qcur, layer.q_b);
|
||||
}
|
||||
|
||||
ggml_tensor * Kcur = ggml_mul_mat(ctx0, layer.k_w, cur);
|
||||
ggml_tensor * Kcur = build_mm(layer.k_w, cur);
|
||||
if (layer.k_b) {
|
||||
Kcur = ggml_add(ctx0, Kcur, layer.k_b);
|
||||
}
|
||||
|
||||
ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.v_w, cur);
|
||||
ggml_tensor * Vcur = build_mm(layer.v_w, cur);
|
||||
if (layer.v_b) {
|
||||
Vcur = ggml_add(ctx0, Vcur, layer.v_b);
|
||||
}
|
||||
@@ -164,17 +164,17 @@ ggml_cgraph * clip_graph_llava::build() {
|
||||
|
||||
// llava projector
|
||||
if (proj_type == PROJECTOR_TYPE_MLP) {
|
||||
embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
|
||||
embeddings = build_mm(model.mm_0_w, embeddings);
|
||||
embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
|
||||
|
||||
embeddings = ggml_gelu(ctx0, embeddings);
|
||||
if (model.mm_2_w) {
|
||||
embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
|
||||
embeddings = build_mm(model.mm_2_w, embeddings);
|
||||
embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
|
||||
}
|
||||
}
|
||||
else if (proj_type == PROJECTOR_TYPE_MLP_NORM) {
|
||||
embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
|
||||
embeddings = build_mm(model.mm_0_w, embeddings);
|
||||
embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
|
||||
// ggml_tensor_printf(embeddings, "mm_0_w",0,true,false);
|
||||
// First LayerNorm
|
||||
@@ -186,7 +186,7 @@ ggml_cgraph * clip_graph_llava::build() {
|
||||
embeddings = ggml_gelu(ctx0, embeddings);
|
||||
|
||||
// Second linear layer
|
||||
embeddings = ggml_mul_mat(ctx0, model.mm_3_w, embeddings);
|
||||
embeddings = build_mm(model.mm_3_w, embeddings);
|
||||
embeddings = ggml_add(ctx0, embeddings, model.mm_3_b);
|
||||
|
||||
// Second LayerNorm
|
||||
@@ -197,10 +197,10 @@ ggml_cgraph * clip_graph_llava::build() {
|
||||
else if (proj_type == PROJECTOR_TYPE_LDP) {
|
||||
// MobileVLM projector
|
||||
int n_patch = 24;
|
||||
ggml_tensor * mlp_1 = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, embeddings);
|
||||
ggml_tensor * mlp_1 = build_mm(model.mm_model_mlp_1_w, embeddings);
|
||||
mlp_1 = ggml_add(ctx0, mlp_1, model.mm_model_mlp_1_b);
|
||||
mlp_1 = ggml_gelu(ctx0, mlp_1);
|
||||
ggml_tensor * mlp_3 = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, mlp_1);
|
||||
ggml_tensor * mlp_3 = build_mm(model.mm_model_mlp_3_w, mlp_1);
|
||||
mlp_3 = ggml_add(ctx0, mlp_3, model.mm_model_mlp_3_b);
|
||||
// mlp_3 shape = [1, 576, 2048], ne = [2048, 576, 1, 1]
|
||||
|
||||
@@ -229,10 +229,10 @@ ggml_cgraph * clip_graph_llava::build() {
|
||||
// block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
|
||||
// pointwise conv
|
||||
block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]);
|
||||
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc1_w, block_1);
|
||||
block_1 = build_mm(model.mm_model_block_1_block_1_fc1_w, block_1);
|
||||
block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc1_b);
|
||||
block_1 = ggml_relu(ctx0, block_1);
|
||||
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc2_w, block_1);
|
||||
block_1 = build_mm(model.mm_model_block_1_block_1_fc2_w, block_1);
|
||||
block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc2_b);
|
||||
block_1 = ggml_hardsigmoid(ctx0, block_1);
|
||||
// block_1_hw shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1], block_1 shape = [1, 2048], ne = [2048, 1, 1, 1]
|
||||
@@ -244,7 +244,7 @@ ggml_cgraph * clip_graph_llava::build() {
|
||||
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));
|
||||
|
||||
// block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1]
|
||||
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_2_0_w, block_1);
|
||||
block_1 = build_mm(model.mm_model_block_1_block_2_0_w, block_1);
|
||||
block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]);
|
||||
|
||||
// block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
|
||||
@@ -277,10 +277,10 @@ ggml_cgraph * clip_graph_llava::build() {
|
||||
// block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
|
||||
// pointwise conv
|
||||
block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]);
|
||||
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc1_w, block_1);
|
||||
block_1 = build_mm(model.mm_model_block_2_block_1_fc1_w, block_1);
|
||||
block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc1_b);
|
||||
block_1 = ggml_relu(ctx0, block_1);
|
||||
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc2_w, block_1);
|
||||
block_1 = build_mm(model.mm_model_block_2_block_1_fc2_w, block_1);
|
||||
block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc2_b);
|
||||
block_1 = ggml_hardsigmoid(ctx0, block_1);
|
||||
|
||||
@@ -292,7 +292,7 @@ ggml_cgraph * clip_graph_llava::build() {
|
||||
block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]);
|
||||
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));
|
||||
// block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1]
|
||||
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_2_0_w, block_1);
|
||||
block_1 = build_mm(model.mm_model_block_2_block_2_0_w, block_1);
|
||||
block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]);
|
||||
|
||||
|
||||
@@ -307,10 +307,10 @@ ggml_cgraph * clip_graph_llava::build() {
|
||||
else if (proj_type == PROJECTOR_TYPE_LDPV2)
|
||||
{
|
||||
int n_patch = 24;
|
||||
ggml_tensor * mlp_0 = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings);
|
||||
ggml_tensor * mlp_0 = build_mm(model.mm_model_mlp_0_w, embeddings);
|
||||
mlp_0 = ggml_add(ctx0, mlp_0, model.mm_model_mlp_0_b);
|
||||
mlp_0 = ggml_gelu(ctx0, mlp_0);
|
||||
ggml_tensor * mlp_2 = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, mlp_0);
|
||||
ggml_tensor * mlp_2 = build_mm(model.mm_model_mlp_2_w, mlp_0);
|
||||
mlp_2 = ggml_add(ctx0, mlp_2, model.mm_model_mlp_2_b);
|
||||
// mlp_2 ne = [2048, 576, 1, 1]
|
||||
// // AVG Pool Layer 2*2, strides = 2
|
||||
@@ -344,15 +344,15 @@ ggml_cgraph * clip_graph_llava::build() {
|
||||
embeddings = ggml_add(ctx0, embeddings, model.mm_model_adapter_conv_b);
|
||||
// GLU
|
||||
{
|
||||
embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings);
|
||||
embeddings = build_mm(model.mm_model_mlp_0_w, embeddings);
|
||||
embeddings = ggml_norm(ctx0, embeddings, eps);
|
||||
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_q_w), model.mm_model_ln_q_b);
|
||||
embeddings = ggml_gelu_inplace(ctx0, embeddings);
|
||||
ggml_tensor * x = embeddings;
|
||||
embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, embeddings);
|
||||
x = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w,x);
|
||||
embeddings = build_mm(model.mm_model_mlp_2_w, embeddings);
|
||||
x = build_mm(model.mm_model_mlp_1_w,x);
|
||||
embeddings = ggml_swiglu_split(ctx0, embeddings, x);
|
||||
embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, embeddings);
|
||||
embeddings = build_mm(model.mm_model_mlp_3_w, embeddings);
|
||||
}
|
||||
// arrangement of BOI/EOI token embeddings
|
||||
// note: these embeddings are not present in text model, hence we cannot process them as text tokens
|
||||
|
||||
@@ -38,7 +38,7 @@ ggml_cgraph * clip_graph_minicpmv::build() {
|
||||
// resampler projector (it is just another transformer)
|
||||
|
||||
ggml_tensor * q = model.mm_model_query;
|
||||
ggml_tensor * v = ggml_mul_mat(ctx0, model.mm_model_kv_proj, embeddings);
|
||||
ggml_tensor * v = build_mm(model.mm_model_kv_proj, embeddings);
|
||||
|
||||
// norm
|
||||
q = build_norm(q, model.mm_model_ln_q_w, model.mm_model_ln_q_b, NORM_TYPE_NORMAL, eps, -1);
|
||||
@@ -77,13 +77,13 @@ ggml_cgraph * clip_graph_minicpmv::build() {
|
||||
// Use actual config value if available, otherwise fall back to hardcoded values
|
||||
int num_query = hparams.minicpmv_query_num;
|
||||
ggml_tensor * Q = ggml_add(ctx0,
|
||||
ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q),
|
||||
build_mm(model.mm_model_attn_q_w, q),
|
||||
model.mm_model_attn_q_b);
|
||||
ggml_tensor * K = ggml_add(ctx0,
|
||||
ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k),
|
||||
build_mm(model.mm_model_attn_k_w, k),
|
||||
model.mm_model_attn_k_b);
|
||||
ggml_tensor * V = ggml_add(ctx0,
|
||||
ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v),
|
||||
build_mm(model.mm_model_attn_v_w, v),
|
||||
model.mm_model_attn_v_b);
|
||||
|
||||
Q = ggml_reshape_3d(ctx0, Q, d_head, n_head, num_query);
|
||||
@@ -105,7 +105,7 @@ ggml_cgraph * clip_graph_minicpmv::build() {
|
||||
embeddings = build_norm(embeddings, model.mm_model_ln_post_w, model.mm_model_ln_post_b, NORM_TYPE_NORMAL, eps, -1);
|
||||
|
||||
// projection
|
||||
embeddings = ggml_mul_mat(ctx0, model.mm_model_proj, embeddings);
|
||||
embeddings = build_mm(model.mm_model_proj, embeddings);
|
||||
|
||||
// build the graph
|
||||
ggml_build_forward_expand(gf, embeddings);
|
||||
|
||||
@@ -429,7 +429,7 @@ ggml_cgraph * clip_graph_mobilenetv5::build() {
|
||||
// PyTorch: embedding_projection = nn.Linear(vision_hidden, text_hidden, bias=False)
|
||||
// Weight stored as [out_features, in_features] = [text_hidden_size, vision_hidden_size]
|
||||
if (model.mm_input_proj_w) {
|
||||
cur = ggml_mul_mat(ctx0, model.mm_input_proj_w, cur);
|
||||
cur = build_mm(model.mm_input_proj_w, cur);
|
||||
}
|
||||
|
||||
// 5. POST PROJECTION NORM
|
||||
|
||||
@@ -43,7 +43,7 @@ ggml_cgraph * clip_graph_pixtral::build() {
|
||||
|
||||
// project to n_embd
|
||||
cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], cur->ne[1] * cur->ne[2]);
|
||||
cur = ggml_mul_mat(ctx0, model.mm_patch_merger_w, cur);
|
||||
cur = build_mm(model.mm_patch_merger_w, cur);
|
||||
}
|
||||
|
||||
// LlavaMultiModalProjector (always using GELU activation)
|
||||
|
||||
@@ -90,11 +90,11 @@ ggml_cgraph * clip_graph_qwen2vl::build() {
|
||||
// self-attention
|
||||
{
|
||||
ggml_tensor * Qcur = ggml_add(ctx0,
|
||||
ggml_mul_mat(ctx0, layer.q_w, cur), layer.q_b);
|
||||
build_mm(layer.q_w, cur), layer.q_b);
|
||||
ggml_tensor * Kcur = ggml_add(ctx0,
|
||||
ggml_mul_mat(ctx0, layer.k_w, cur), layer.k_b);
|
||||
build_mm(layer.k_w, cur), layer.k_b);
|
||||
ggml_tensor * Vcur = ggml_add(ctx0,
|
||||
ggml_mul_mat(ctx0, layer.v_w, cur), layer.v_b);
|
||||
build_mm(layer.v_w, cur), layer.v_b);
|
||||
|
||||
Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_patches);
|
||||
Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_patches);
|
||||
|
||||
@@ -85,7 +85,7 @@ ggml_cgraph * clip_graph_qwen3vl::build() {
|
||||
|
||||
// self-attention
|
||||
{
|
||||
cur = ggml_mul_mat(ctx0, layer.qkv_w, cur);
|
||||
cur = build_mm(layer.qkv_w, cur);
|
||||
cur = ggml_add(ctx0, cur, layer.qkv_b);
|
||||
|
||||
ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos,
|
||||
|
||||
@@ -43,7 +43,7 @@ ggml_cgraph * clip_graph_siglip::build() {
|
||||
// https://github.com/huggingface/transformers/blob/0a950e0bbe1ed58d5401a6b547af19f15f0c195e/src/transformers/models/idefics3/modeling_idefics3.py#L578
|
||||
const int scale_factor = model.hparams.n_merge;
|
||||
cur = build_patch_merge_permute(cur, scale_factor);
|
||||
cur = ggml_mul_mat(ctx0, model.projection, cur);
|
||||
cur = build_mm(model.projection, cur);
|
||||
|
||||
} else if (proj_type == PROJECTOR_TYPE_LFM2) {
|
||||
// pixel unshuffle block
|
||||
|
||||
@@ -59,7 +59,7 @@ ggml_cgraph * clip_graph_whisper_enc::build() {
|
||||
cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w);
|
||||
|
||||
// ffn in
|
||||
cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
|
||||
cur = build_mm(model.mm_1_w, cur);
|
||||
|
||||
// swiglu
|
||||
// see SwiGLU in ultravox_model.py, the second half passed through is silu, not the first half
|
||||
@@ -70,11 +70,11 @@ ggml_cgraph * clip_graph_whisper_enc::build() {
|
||||
cur = ggml_mul(ctx0, cur, model.mm_norm_mid_w);
|
||||
|
||||
// ffn out
|
||||
cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
|
||||
cur = build_mm(model.mm_2_w, cur);
|
||||
|
||||
} else if (proj_type == PROJECTOR_TYPE_QWEN2A) {
|
||||
// projector
|
||||
cur = ggml_mul_mat(ctx0, model.mm_fc_w, cur);
|
||||
cur = build_mm(model.mm_fc_w, cur);
|
||||
cur = ggml_add(ctx0, cur, model.mm_fc_b);
|
||||
|
||||
} else if (proj_type == PROJECTOR_TYPE_VOXTRAL) {
|
||||
|
||||
@@ -43,7 +43,7 @@ ggml_cgraph * clip_graph_youtuvl::build() {
|
||||
ctx0, inp,
|
||||
3*patch_size* patch_size, Hm * Wm * m * m, 1);
|
||||
}
|
||||
inp = ggml_mul_mat(ctx0, model.patch_embeddings_0, inp);
|
||||
inp = build_mm(model.patch_embeddings_0, inp);
|
||||
|
||||
if (model.patch_bias) {
|
||||
inp = ggml_add(ctx0, inp, model.patch_bias);
|
||||
@@ -97,11 +97,11 @@ ggml_cgraph * clip_graph_youtuvl::build() {
|
||||
// self-attention
|
||||
{
|
||||
ggml_tensor * Qcur = ggml_add(ctx0,
|
||||
ggml_mul_mat(ctx0, layer.q_w, cur), layer.q_b);
|
||||
build_mm(layer.q_w, cur), layer.q_b);
|
||||
ggml_tensor * Kcur = ggml_add(ctx0,
|
||||
ggml_mul_mat(ctx0, layer.k_w, cur), layer.k_b);
|
||||
build_mm(layer.k_w, cur), layer.k_b);
|
||||
ggml_tensor * Vcur = ggml_add(ctx0,
|
||||
ggml_mul_mat(ctx0, layer.v_w, cur), layer.v_b);
|
||||
build_mm(layer.v_w, cur), layer.v_b);
|
||||
|
||||
Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_patches);
|
||||
Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_patches);
|
||||
|
||||
Binary file not shown.
@@ -5,9 +5,12 @@
|
||||
import Label from '$lib/components/ui/label/label.svelte';
|
||||
import * as Select from '$lib/components/ui/select';
|
||||
import { Textarea } from '$lib/components/ui/textarea';
|
||||
import { SETTING_CONFIG_DEFAULT, SETTING_CONFIG_INFO, SETTINGS_KEYS } from '$lib/constants';
|
||||
import { SETTING_CONFIG_INFO, SETTINGS_KEYS } from '$lib/constants';
|
||||
import { SettingsFieldType } from '$lib/enums/settings';
|
||||
import { settingsStore } from '$lib/stores/settings.svelte';
|
||||
import { serverStore } from '$lib/stores/server.svelte';
|
||||
import { modelsStore, selectedModelName } from '$lib/stores/models.svelte';
|
||||
import { normalizeFloatingPoint } from '$lib/utils/precision';
|
||||
import { ChatSettingsParameterSourceIndicator } from '$lib/components/app';
|
||||
import type { Component } from 'svelte';
|
||||
|
||||
@@ -20,35 +23,36 @@
|
||||
|
||||
let { fields, localConfig, onConfigChange, onThemeChange }: Props = $props();
|
||||
|
||||
// Helper function to get parameter source info for syncable parameters
|
||||
function getParameterSourceInfo(key: string) {
|
||||
if (!settingsStore.canSyncParameter(key)) {
|
||||
return null;
|
||||
// server sampling defaults for placeholders
|
||||
let sp = $derived.by(() => {
|
||||
if (serverStore.isRouterMode) {
|
||||
const m = selectedModelName();
|
||||
if (m) {
|
||||
const p = modelsStore.getModelProps(m);
|
||||
return (p?.default_generation_settings?.params ?? {}) as Record<string, unknown>;
|
||||
}
|
||||
}
|
||||
|
||||
return settingsStore.getParameterInfo(key);
|
||||
}
|
||||
return (serverStore.defaultParams ?? {}) as Record<string, unknown>;
|
||||
});
|
||||
</script>
|
||||
|
||||
{#each fields as field (field.key)}
|
||||
<div class="space-y-2">
|
||||
{#if field.type === SettingsFieldType.INPUT}
|
||||
{@const paramInfo = getParameterSourceInfo(field.key)}
|
||||
{@const currentValue = String(localConfig[field.key] ?? '')}
|
||||
{@const propsDefault = paramInfo?.serverDefault}
|
||||
{@const serverDefault = sp[field.key]}
|
||||
{@const isCustomRealTime = (() => {
|
||||
if (!paramInfo || propsDefault === undefined) return false;
|
||||
if (serverDefault == null) return false;
|
||||
if (currentValue === '') return false;
|
||||
|
||||
// Apply same rounding logic for real-time comparison
|
||||
const inputValue = currentValue;
|
||||
const numericInput = parseFloat(inputValue);
|
||||
const numericInput = parseFloat(currentValue);
|
||||
const normalizedInput = !isNaN(numericInput)
|
||||
? Math.round(numericInput * 1000000) / 1000000
|
||||
: inputValue;
|
||||
: currentValue;
|
||||
const normalizedDefault =
|
||||
typeof propsDefault === 'number'
|
||||
? Math.round(propsDefault * 1000000) / 1000000
|
||||
: propsDefault;
|
||||
typeof serverDefault === 'number'
|
||||
? Math.round(serverDefault * 1000000) / 1000000
|
||||
: serverDefault;
|
||||
|
||||
return normalizedInput !== normalizedDefault;
|
||||
})()}
|
||||
@@ -74,7 +78,9 @@
|
||||
// Update local config immediately for real-time badge feedback
|
||||
onConfigChange(field.key, e.currentTarget.value);
|
||||
}}
|
||||
placeholder={`Default: ${SETTING_CONFIG_DEFAULT[field.key] ?? 'none'}`}
|
||||
placeholder={sp[field.key] != null
|
||||
? `Default: ${normalizeFloatingPoint(sp[field.key])}`
|
||||
: ''}
|
||||
class="w-full {isCustomRealTime ? 'pr-8' : ''}"
|
||||
/>
|
||||
{#if isCustomRealTime}
|
||||
@@ -82,9 +88,7 @@
|
||||
type="button"
|
||||
onclick={() => {
|
||||
settingsStore.resetParameterToServerDefault(field.key);
|
||||
// Trigger UI update by calling onConfigChange with the default value
|
||||
const defaultValue = propsDefault ?? SETTING_CONFIG_DEFAULT[field.key];
|
||||
onConfigChange(field.key, String(defaultValue));
|
||||
onConfigChange(field.key, '');
|
||||
}}
|
||||
class="absolute top-1/2 right-2 inline-flex h-5 w-5 -translate-y-1/2 items-center justify-center rounded transition-colors hover:bg-muted"
|
||||
aria-label="Reset to default"
|
||||
@@ -112,7 +116,7 @@
|
||||
id={field.key}
|
||||
value={String(localConfig[field.key] ?? '')}
|
||||
onchange={(e) => onConfigChange(field.key, e.currentTarget.value)}
|
||||
placeholder={`Default: ${SETTING_CONFIG_DEFAULT[field.key] ?? 'none'}`}
|
||||
placeholder=""
|
||||
class="min-h-[10rem] w-full md:max-w-2xl"
|
||||
/>
|
||||
|
||||
@@ -140,14 +144,12 @@
|
||||
(opt: { value: string; label: string; icon?: Component }) =>
|
||||
opt.value === localConfig[field.key]
|
||||
)}
|
||||
{@const paramInfo = getParameterSourceInfo(field.key)}
|
||||
{@const currentValue = localConfig[field.key]}
|
||||
{@const propsDefault = paramInfo?.serverDefault}
|
||||
{@const serverDefault = sp[field.key]}
|
||||
{@const isCustomRealTime = (() => {
|
||||
if (!paramInfo || propsDefault === undefined) return false;
|
||||
|
||||
// For select fields, do direct comparison (no rounding needed)
|
||||
return currentValue !== propsDefault;
|
||||
if (serverDefault == null) return false;
|
||||
if (currentValue === '' || currentValue === undefined) return false;
|
||||
return currentValue !== serverDefault;
|
||||
})()}
|
||||
|
||||
<div class="flex items-center gap-2">
|
||||
@@ -190,9 +192,7 @@
|
||||
type="button"
|
||||
onclick={() => {
|
||||
settingsStore.resetParameterToServerDefault(field.key);
|
||||
// Trigger UI update by calling onConfigChange with the default value
|
||||
const defaultValue = propsDefault ?? SETTING_CONFIG_DEFAULT[field.key];
|
||||
onConfigChange(field.key, String(defaultValue));
|
||||
onConfigChange(field.key, '');
|
||||
}}
|
||||
class="absolute top-1/2 right-8 inline-flex h-5 w-5 -translate-y-1/2 items-center justify-center rounded transition-colors hover:bg-muted"
|
||||
aria-label="Reset to default"
|
||||
|
||||
@@ -28,6 +28,11 @@
|
||||
|
||||
let parsed = $derived(ModelsService.parseModelId(modelId));
|
||||
let resolvedShowRaw = $derived(showRaw ?? (config().showRawModelNames as boolean) ?? false);
|
||||
let displayName = $derived(
|
||||
aliases && aliases.length > 0 ? aliases[0] : (parsed.modelName ?? modelId)
|
||||
);
|
||||
let remainingAliases = $derived(aliases && aliases.length > 1 ? aliases.slice(1) : []);
|
||||
let allTags = $derived([...(parsed.tags ?? []), ...(tags ?? [])]);
|
||||
</script>
|
||||
|
||||
{#if resolvedShowRaw}
|
||||
@@ -35,7 +40,7 @@
|
||||
{:else}
|
||||
<span class="flex min-w-0 flex-wrap items-center gap-1 {className}">
|
||||
<span class="min-w-0 truncate font-medium">
|
||||
{#if showOrgName && parsed.orgName}{parsed.orgName}/{/if}{parsed.modelName ?? modelId}
|
||||
{#if showOrgName && parsed.orgName && !(aliases && aliases.length > 0)}{parsed.orgName}/{/if}{displayName}
|
||||
</span>
|
||||
|
||||
{#if parsed.params}
|
||||
@@ -50,14 +55,14 @@
|
||||
</span>
|
||||
{/if}
|
||||
|
||||
{#if aliases && aliases.length > 0}
|
||||
{#each aliases as alias (alias)}
|
||||
{#if remainingAliases.length > 0}
|
||||
{#each remainingAliases as alias (alias)}
|
||||
<span class={badgeClass}>{alias}</span>
|
||||
{/each}
|
||||
{/if}
|
||||
|
||||
{#if tags && tags.length > 0}
|
||||
{#each tags as tag (tag)}
|
||||
{#if allTags.length > 0}
|
||||
{#each allTags as tag (tag)}
|
||||
<span class={tagBadgeClass}>{tag}</span>
|
||||
{/each}
|
||||
{/if}
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
export const CONFIG_LOCALSTORAGE_KEY = 'LlamaCppWebui.config';
|
||||
export const USER_OVERRIDES_LOCALSTORAGE_KEY = 'LlamaCppWebui.userOverrides';
|
||||
export const FAVOURITE_MODELS_LOCALSTORAGE_KEY = 'LlamaCppWebui.favouriteModels';
|
||||
export const MCP_DEFAULT_ENABLED_LOCALSTORAGE_KEY = 'LlamaCppWebui.mcpDefaultEnabled';
|
||||
|
||||
@@ -11,10 +11,16 @@ export const MODEL_ID_SEGMENT_SEPARATOR = '-';
|
||||
export const MODEL_ID_QUANTIZATION_SEPARATOR = ':';
|
||||
|
||||
/**
|
||||
* Matches a trailing ALL-CAPS format segment, e.g. `GGUF`, `BF16`, `Q4_K_M`.
|
||||
* Must be at least 2 uppercase letters, optionally followed by uppercase letters or digits.
|
||||
* Matches a quantization/precision segment, e.g. `Q4_K_M`, `IQ4_XS`, `F16`, `BF16`, `MXFP4`.
|
||||
* Case-insensitive to handle both uppercase and lowercase inputs.
|
||||
*/
|
||||
export const MODEL_FORMAT_SEGMENT_RE = /^[A-Z]{2,}[A-Z0-9]*$/;
|
||||
export const MODEL_QUANTIZATION_SEGMENT_RE =
|
||||
/^(I?Q\d+(_[A-Z0-9]+)*|F\d+|BF\d+|MXFP\d+(_[A-Z0-9]+)*)$/i;
|
||||
|
||||
/**
|
||||
* Matches prefix for custom quantization types, e.g. `UD-Q8_K_XL`.
|
||||
*/
|
||||
export const MODEL_CUSTOM_QUANTIZATION_PREFIX_RE = /^UD$/i;
|
||||
|
||||
/**
|
||||
* Matches a parameter-count segment, e.g. `7B`, `1.5b`, `120M`.
|
||||
@@ -22,7 +28,12 @@ export const MODEL_FORMAT_SEGMENT_RE = /^[A-Z]{2,}[A-Z0-9]*$/;
|
||||
export const MODEL_PARAMS_RE = /^\d+(\.\d+)?[BbMmKkTt]$/;
|
||||
|
||||
/**
|
||||
* Matches an activated-parameter-count segment, e.g. `A10B`, `A2.4b`.
|
||||
* The leading `A` distinguishes it from a regular params segment.
|
||||
* Matches an activated-parameter-count segment, e.g. `A10B`, `a2.4b`.
|
||||
* The leading `A`/`a` distinguishes it from a regular params segment.
|
||||
*/
|
||||
export const MODEL_ACTIVATED_PARAMS_RE = /^A\d+(\.\d+)?[BbMmKkTt]$/;
|
||||
export const MODEL_ACTIVATED_PARAMS_RE = /^[Aa]\d+(\.\d+)?[BbMmKkTt]$/;
|
||||
|
||||
/**
|
||||
* Container format segments to exclude from tags (every model uses these).
|
||||
*/
|
||||
export const MODEL_IGNORED_SEGMENTS = new Set(['GGUF', 'GGML']);
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
import { ColorMode } from '$lib/enums/ui';
|
||||
import { Monitor, Moon, Sun } from '@lucide/svelte';
|
||||
|
||||
export const SETTING_CONFIG_DEFAULT: Record<string, string | number | boolean> = {
|
||||
// Note: in order not to introduce breaking changes, please keep the same data type (number, string, etc) if you want to change the default value. Do not use null or undefined for default value.
|
||||
export const SETTING_CONFIG_DEFAULT: Record<string, string | number | boolean | undefined> = {
|
||||
// Note: in order not to introduce breaking changes, please keep the same data type (number, string, etc) if you want to change the default value.
|
||||
// Do not use nested objects, keep it single level. Prefix the key if you need to group them.
|
||||
apiKey: '',
|
||||
systemMessage: '',
|
||||
@@ -30,27 +30,30 @@ export const SETTING_CONFIG_DEFAULT: Record<string, string | number | boolean> =
|
||||
agenticMaxToolPreviewLines: 25,
|
||||
showToolCallInProgress: false,
|
||||
alwaysShowAgenticTurns: false,
|
||||
// make sure these default values are in sync with `common.h`
|
||||
samplers: 'top_k;typ_p;top_p;min_p;temperature',
|
||||
// sampling params: empty means "use server default"
|
||||
// the server / preset is the source of truth
|
||||
// empty values are shown as placeholders from /props in the UI
|
||||
// and are NOT sent in API requests, letting the server decide
|
||||
samplers: '',
|
||||
backend_sampling: false,
|
||||
temperature: 0.8,
|
||||
dynatemp_range: 0.0,
|
||||
dynatemp_exponent: 1.0,
|
||||
top_k: 40,
|
||||
top_p: 0.95,
|
||||
min_p: 0.05,
|
||||
xtc_probability: 0.0,
|
||||
xtc_threshold: 0.1,
|
||||
typ_p: 1.0,
|
||||
repeat_last_n: 64,
|
||||
repeat_penalty: 1.0,
|
||||
presence_penalty: 0.0,
|
||||
frequency_penalty: 0.0,
|
||||
dry_multiplier: 0.0,
|
||||
dry_base: 1.75,
|
||||
dry_allowed_length: 2,
|
||||
dry_penalty_last_n: -1,
|
||||
max_tokens: -1,
|
||||
temperature: undefined,
|
||||
dynatemp_range: undefined,
|
||||
dynatemp_exponent: undefined,
|
||||
top_k: undefined,
|
||||
top_p: undefined,
|
||||
min_p: undefined,
|
||||
xtc_probability: undefined,
|
||||
xtc_threshold: undefined,
|
||||
typ_p: undefined,
|
||||
repeat_last_n: undefined,
|
||||
repeat_penalty: undefined,
|
||||
presence_penalty: undefined,
|
||||
frequency_penalty: undefined,
|
||||
dry_multiplier: undefined,
|
||||
dry_base: undefined,
|
||||
dry_allowed_length: undefined,
|
||||
dry_penalty_last_n: undefined,
|
||||
max_tokens: undefined,
|
||||
custom: '', // custom json-stringified object
|
||||
// experimental features
|
||||
pyInterpreterEnabled: false,
|
||||
|
||||
@@ -2,9 +2,11 @@ import { ServerModelStatus } from '$lib/enums';
|
||||
import { apiFetch, apiPost } from '$lib/utils';
|
||||
import type { ParsedModelId } from '$lib/types/models';
|
||||
import {
|
||||
MODEL_FORMAT_SEGMENT_RE,
|
||||
MODEL_QUANTIZATION_SEGMENT_RE,
|
||||
MODEL_CUSTOM_QUANTIZATION_PREFIX_RE,
|
||||
MODEL_PARAMS_RE,
|
||||
MODEL_ACTIVATED_PARAMS_RE,
|
||||
MODEL_IGNORED_SEGMENTS,
|
||||
MODEL_ID_NOT_FOUND,
|
||||
MODEL_ID_ORG_SEPARATOR,
|
||||
MODEL_ID_SEGMENT_SEPARATOR,
|
||||
@@ -119,8 +121,9 @@ export class ModelsService {
|
||||
/**
|
||||
* Parse a model ID string into its structured components.
|
||||
*
|
||||
* Handles the convention:
|
||||
* `<org>/<ModelName>-<Parameters>(-<ActivatedParameters>)-<Format>:<QuantizationType>`
|
||||
* Handles conventions like:
|
||||
* `<org>/<ModelName>-<Parameters>(-<ActivatedParameters>)(-<Tags>)(-<Quantization>):<Quantization>`
|
||||
* `<ModelName>.<Quantization>` (dot-separated quantization, e.g. `model.Q4_K_M`)
|
||||
*
|
||||
* @param modelId - Raw model identifier string
|
||||
* @returns Structured {@link ParsedModelId} with all detected fields
|
||||
@@ -132,11 +135,11 @@ export class ModelsService {
|
||||
modelName: null,
|
||||
params: null,
|
||||
activatedParams: null,
|
||||
format: null,
|
||||
quantization: null,
|
||||
tags: []
|
||||
};
|
||||
|
||||
// 1. Extract colon-separated quantization (e.g. `model:Q4_K_M`)
|
||||
const colonIdx = modelId.indexOf(MODEL_ID_QUANTIZATION_SEPARATOR);
|
||||
let modelPath: string;
|
||||
|
||||
@@ -147,6 +150,7 @@ export class ModelsService {
|
||||
modelPath = modelId;
|
||||
}
|
||||
|
||||
// 2. Extract org name (e.g. `org/model` -> org = "org")
|
||||
const slashIdx = modelPath.indexOf(MODEL_ID_ORG_SEPARATOR);
|
||||
let modelStr: string;
|
||||
|
||||
@@ -157,37 +161,66 @@ export class ModelsService {
|
||||
modelStr = modelPath;
|
||||
}
|
||||
|
||||
const segments = modelStr.split(MODEL_ID_SEGMENT_SEPARATOR);
|
||||
// 3. Handle dot-separated quantization (e.g. `model-name.Q4_K_M`)
|
||||
const dotIdx = modelStr.lastIndexOf('.');
|
||||
|
||||
if (segments.length > 0 && MODEL_FORMAT_SEGMENT_RE.test(segments[segments.length - 1])) {
|
||||
result.format = segments.pop()!;
|
||||
if (dotIdx !== MODEL_ID_NOT_FOUND && !result.quantization) {
|
||||
const afterDot = modelStr.slice(dotIdx + 1);
|
||||
|
||||
if (MODEL_QUANTIZATION_SEGMENT_RE.test(afterDot)) {
|
||||
result.quantization = afterDot;
|
||||
modelStr = modelStr.slice(0, dotIdx);
|
||||
}
|
||||
}
|
||||
|
||||
const paramsRe = MODEL_PARAMS_RE;
|
||||
const activatedParamsRe = MODEL_ACTIVATED_PARAMS_RE;
|
||||
const segments = modelStr.split(MODEL_ID_SEGMENT_SEPARATOR);
|
||||
|
||||
// 4. Detect trailing quantization from dash-separated segments
|
||||
// Handle UD-prefixed quantization (e.g. `UD-Q8_K_XL`) and
|
||||
// standalone quantization (e.g. `Q4_K_M`, `BF16`, `F16`, `MXFP4`)
|
||||
if (!result.quantization && segments.length > 1) {
|
||||
const last = segments[segments.length - 1];
|
||||
const secondLast = segments.length > 2 ? segments[segments.length - 2] : null;
|
||||
|
||||
if (MODEL_QUANTIZATION_SEGMENT_RE.test(last)) {
|
||||
if (secondLast && MODEL_CUSTOM_QUANTIZATION_PREFIX_RE.test(secondLast)) {
|
||||
result.quantization = `${secondLast}-${last}`;
|
||||
segments.splice(segments.length - 2, 2);
|
||||
} else {
|
||||
result.quantization = last;
|
||||
segments.pop();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 5. Find params and activated params
|
||||
let paramsIdx = MODEL_ID_NOT_FOUND;
|
||||
let activatedParamsIdx = MODEL_ID_NOT_FOUND;
|
||||
|
||||
for (let i = 0; i < segments.length; i++) {
|
||||
const seg = segments[i];
|
||||
if (paramsIdx === -1 && paramsRe.test(seg)) {
|
||||
|
||||
if (paramsIdx === MODEL_ID_NOT_FOUND && MODEL_PARAMS_RE.test(seg)) {
|
||||
paramsIdx = i;
|
||||
result.params = seg.toUpperCase();
|
||||
} else if (activatedParamsRe.test(seg)) {
|
||||
} else if (paramsIdx !== MODEL_ID_NOT_FOUND && MODEL_ACTIVATED_PARAMS_RE.test(seg)) {
|
||||
activatedParamsIdx = i;
|
||||
result.activatedParams = seg.toUpperCase();
|
||||
}
|
||||
}
|
||||
|
||||
// 6. Model name = segments before params; tags = remaining segments after params
|
||||
const pivotIdx = paramsIdx !== MODEL_ID_NOT_FOUND ? paramsIdx : segments.length;
|
||||
|
||||
result.modelName = segments.slice(0, pivotIdx).join(MODEL_ID_SEGMENT_SEPARATOR) || null;
|
||||
|
||||
if (paramsIdx !== MODEL_ID_NOT_FOUND) {
|
||||
result.tags = segments
|
||||
.slice(paramsIdx + 1)
|
||||
.filter((_, relIdx) => paramsIdx + 1 + relIdx !== activatedParamsIdx);
|
||||
result.tags = segments.slice(paramsIdx + 1).filter((_, relIdx) => {
|
||||
const absIdx = paramsIdx + 1 + relIdx;
|
||||
if (absIdx === activatedParamsIdx) return false;
|
||||
|
||||
return !MODEL_IGNORED_SEGMENTS.has(segments[absIdx].toUpperCase());
|
||||
});
|
||||
}
|
||||
|
||||
return result;
|
||||
|
||||
@@ -36,7 +36,8 @@ import {
|
||||
ISO_TIME_SEPARATOR,
|
||||
ISO_TIME_SEPARATOR_REPLACEMENT,
|
||||
NON_ALPHANUMERIC_REGEX,
|
||||
MULTIPLE_UNDERSCORE_REGEX
|
||||
MULTIPLE_UNDERSCORE_REGEX,
|
||||
MCP_DEFAULT_ENABLED_LOCALSTORAGE_KEY
|
||||
} from '$lib/constants';
|
||||
|
||||
class ConversationsStore {
|
||||
@@ -61,7 +62,37 @@ class ConversationsStore {
|
||||
isInitialized = $state(false);
|
||||
|
||||
/** Pending MCP server overrides for new conversations (before first message) */
|
||||
pendingMcpServerOverrides = $state<McpServerOverride[]>([]);
|
||||
pendingMcpServerOverrides = $state<McpServerOverride[]>(ConversationsStore.loadMcpDefaults());
|
||||
|
||||
/** Load MCP default overrides from localStorage */
|
||||
private static loadMcpDefaults(): McpServerOverride[] {
|
||||
if (typeof globalThis.localStorage === 'undefined') return [];
|
||||
try {
|
||||
const raw = localStorage.getItem(MCP_DEFAULT_ENABLED_LOCALSTORAGE_KEY);
|
||||
if (!raw) return [];
|
||||
const parsed = JSON.parse(raw);
|
||||
if (!Array.isArray(parsed)) return [];
|
||||
return parsed.filter(
|
||||
(o: unknown) => typeof o === 'object' && o !== null && 'serverId' in o && 'enabled' in o
|
||||
) as McpServerOverride[];
|
||||
} catch {
|
||||
return [];
|
||||
}
|
||||
}
|
||||
|
||||
/** Persist MCP default overrides to localStorage */
|
||||
private saveMcpDefaults(): void {
|
||||
if (typeof globalThis.localStorage === 'undefined') return;
|
||||
const plain = this.pendingMcpServerOverrides.map((o) => ({
|
||||
serverId: o.serverId,
|
||||
enabled: o.enabled
|
||||
}));
|
||||
if (plain.length > 0) {
|
||||
localStorage.setItem(MCP_DEFAULT_ENABLED_LOCALSTORAGE_KEY, JSON.stringify(plain));
|
||||
} else {
|
||||
localStorage.removeItem(MCP_DEFAULT_ENABLED_LOCALSTORAGE_KEY);
|
||||
}
|
||||
}
|
||||
|
||||
/** Callback for title update confirmation dialog */
|
||||
titleUpdateConfirmationCallback?: (currentTitle: string, newTitle: string) => Promise<boolean>;
|
||||
@@ -261,6 +292,8 @@ class ConversationsStore {
|
||||
clearActiveConversation(): void {
|
||||
this.activeConversation = null;
|
||||
this.activeMessages = [];
|
||||
// reload MCP defaults so new chats inherit persisted state
|
||||
this.pendingMcpServerOverrides = ConversationsStore.loadMcpDefaults();
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -597,6 +630,7 @@ class ConversationsStore {
|
||||
this.pendingMcpServerOverrides = [...this.pendingMcpServerOverrides, { serverId, enabled }];
|
||||
}
|
||||
}
|
||||
this.saveMcpDefaults();
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -621,6 +655,7 @@ class ConversationsStore {
|
||||
*/
|
||||
clearPendingMcpServerOverrides(): void {
|
||||
this.pendingMcpServerOverrides = [];
|
||||
this.saveMcpDefaults();
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
@@ -208,23 +208,16 @@ class MCPStore {
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks if a server is enabled, considering per-chat overrides.
|
||||
* Checks if a server is enabled for a given chat.
|
||||
* Only per-chat overrides (persisted in localStorage for new chats,
|
||||
* or in IndexedDB for existing conversations) control enabled state.
|
||||
*/
|
||||
#checkServerEnabled(
|
||||
server: MCPServerSettingsEntry,
|
||||
perChatOverrides?: McpServerOverride[]
|
||||
): boolean {
|
||||
if (!server.enabled) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (perChatOverrides) {
|
||||
const override = perChatOverrides.find((o) => o.serverId === server.id);
|
||||
|
||||
return override?.enabled ?? false;
|
||||
}
|
||||
|
||||
return false;
|
||||
const override = perChatOverrides?.find((o) => o.serverId === server.id);
|
||||
return override?.enabled ?? false;
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -570,18 +563,8 @@ class MCPStore {
|
||||
getEnabledServersForConversation(
|
||||
perChatOverrides?: McpServerOverride[]
|
||||
): MCPServerSettingsEntry[] {
|
||||
if (!perChatOverrides?.length) {
|
||||
return [];
|
||||
}
|
||||
|
||||
return this.getServers().filter((server) => {
|
||||
if (!server.enabled) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const override = perChatOverrides.find((o) => o.serverId === server.id);
|
||||
|
||||
return override?.enabled ?? false;
|
||||
return this.#checkServerEnabled(server, perChatOverrides);
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
@@ -289,16 +289,10 @@ class SettingsStore {
|
||||
const serverDefaults = this.getServerDefaults();
|
||||
|
||||
if (serverDefaults[key] !== undefined) {
|
||||
const value = normalizeFloatingPoint(serverDefaults[key]);
|
||||
|
||||
this.config[key as keyof SettingsConfigType] =
|
||||
value as SettingsConfigType[keyof SettingsConfigType];
|
||||
} else {
|
||||
if (key in SETTING_CONFIG_DEFAULT) {
|
||||
const defaultValue = getConfigValue(SETTING_CONFIG_DEFAULT, key);
|
||||
|
||||
setConfigValue(this.config, key, defaultValue);
|
||||
}
|
||||
// sampling param known by server: clear it, let server decide
|
||||
setConfigValue(this.config, key, '');
|
||||
} else if (key in SETTING_CONFIG_DEFAULT) {
|
||||
setConfigValue(this.config, key, getConfigValue(SETTING_CONFIG_DEFAULT, key));
|
||||
}
|
||||
|
||||
this.userOverrides.delete(key);
|
||||
@@ -319,12 +313,7 @@ class SettingsStore {
|
||||
*/
|
||||
syncWithServerDefaults(): void {
|
||||
const propsDefaults = this.getServerDefaults();
|
||||
|
||||
if (Object.keys(propsDefaults).length === 0) {
|
||||
console.warn('No server defaults available for initialization');
|
||||
|
||||
return;
|
||||
}
|
||||
if (Object.keys(propsDefaults).length === 0) return;
|
||||
|
||||
for (const [key, propsValue] of Object.entries(propsDefaults)) {
|
||||
const currentValue = getConfigValue(this.config, key);
|
||||
@@ -332,17 +321,14 @@ class SettingsStore {
|
||||
const normalizedCurrent = normalizeFloatingPoint(currentValue);
|
||||
const normalizedDefault = normalizeFloatingPoint(propsValue);
|
||||
|
||||
// if user value matches server, it's not a real override
|
||||
if (normalizedCurrent === normalizedDefault) {
|
||||
this.userOverrides.delete(key);
|
||||
setConfigValue(this.config, key, propsValue);
|
||||
} else if (!this.userOverrides.has(key)) {
|
||||
setConfigValue(this.config, key, propsValue);
|
||||
}
|
||||
}
|
||||
|
||||
this.saveConfig();
|
||||
console.log('Settings initialized with props defaults:', propsDefaults);
|
||||
console.log('Current user overrides after sync:', Array.from(this.userOverrides));
|
||||
console.log('User overrides after sync:', Array.from(this.userOverrides));
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -352,19 +338,11 @@ class SettingsStore {
|
||||
*/
|
||||
forceSyncWithServerDefaults(): void {
|
||||
const propsDefaults = this.getServerDefaults();
|
||||
const syncableKeys = ParameterSyncService.getSyncableParameterKeys();
|
||||
|
||||
for (const key of syncableKeys) {
|
||||
for (const key of ParameterSyncService.getSyncableParameterKeys()) {
|
||||
if (propsDefaults[key] !== undefined) {
|
||||
const normalizedValue = normalizeFloatingPoint(propsDefaults[key]);
|
||||
|
||||
setConfigValue(this.config, key, normalizedValue);
|
||||
} else {
|
||||
if (key in SETTING_CONFIG_DEFAULT) {
|
||||
const defaultValue = getConfigValue(SETTING_CONFIG_DEFAULT, key);
|
||||
|
||||
setConfigValue(this.config, key, defaultValue);
|
||||
}
|
||||
setConfigValue(this.config, key, '');
|
||||
} else if (key in SETTING_CONFIG_DEFAULT) {
|
||||
setConfigValue(this.config, key, getConfigValue(SETTING_CONFIG_DEFAULT, key));
|
||||
}
|
||||
|
||||
this.userOverrides.delete(key);
|
||||
|
||||
1
tools/server/webui/src/lib/types/models.d.ts
vendored
1
tools/server/webui/src/lib/types/models.d.ts
vendored
@@ -25,7 +25,6 @@ export interface ParsedModelId {
|
||||
modelName: string | null;
|
||||
params: string | null;
|
||||
activatedParams: string | null;
|
||||
format: string | null;
|
||||
quantization: string | null;
|
||||
tags: string[];
|
||||
}
|
||||
|
||||
@@ -5,7 +5,7 @@ import type { DatabaseMessageExtra } from './database';
|
||||
import type { ParameterSource, SyncableParameterType, SettingsFieldType } from '$lib/enums';
|
||||
import type { Icon } from '@lucide/svelte';
|
||||
|
||||
export type SettingsConfigValue = string | number | boolean;
|
||||
export type SettingsConfigValue = string | number | boolean | undefined;
|
||||
|
||||
export interface SettingsFieldConfig {
|
||||
key: string;
|
||||
|
||||
270
tools/server/webui/tests/unit/model-id-parser.test.ts
Normal file
270
tools/server/webui/tests/unit/model-id-parser.test.ts
Normal file
@@ -0,0 +1,270 @@
|
||||
import { describe, expect, it } from 'vitest';
|
||||
import { ModelsService } from '$lib/services/models.service';
|
||||
|
||||
const { parseModelId } = ModelsService;
|
||||
|
||||
describe('parseModelId', () => {
|
||||
it('handles unknown patterns correctly', () => {
|
||||
expect(parseModelId('model-name-1')).toStrictEqual({
|
||||
activatedParams: null,
|
||||
modelName: 'model-name-1',
|
||||
orgName: null,
|
||||
params: null,
|
||||
quantization: null,
|
||||
raw: 'model-name-1',
|
||||
tags: []
|
||||
});
|
||||
|
||||
expect(parseModelId('org/model-name-2')).toStrictEqual({
|
||||
activatedParams: null,
|
||||
modelName: 'model-name-2',
|
||||
orgName: 'org',
|
||||
params: null,
|
||||
quantization: null,
|
||||
raw: 'org/model-name-2',
|
||||
tags: []
|
||||
});
|
||||
});
|
||||
|
||||
it('extracts model parameters correctly', () => {
|
||||
expect(parseModelId('model-100B-BF16')).toMatchObject({ params: '100B' });
|
||||
expect(parseModelId('model-100B:Q4_K_M')).toMatchObject({ params: '100B' });
|
||||
});
|
||||
|
||||
it('extracts model parameters correctly in lowercase', () => {
|
||||
expect(parseModelId('model-100b-bf16')).toMatchObject({ params: '100B' });
|
||||
expect(parseModelId('model-100b:q4_k_m')).toMatchObject({ params: '100B' });
|
||||
});
|
||||
|
||||
it('extracts activated parameters correctly', () => {
|
||||
expect(parseModelId('model-100B-A10B-BF16')).toMatchObject({ activatedParams: 'A10B' });
|
||||
expect(parseModelId('model-100B-A10B:Q4_K_M')).toMatchObject({ activatedParams: 'A10B' });
|
||||
});
|
||||
|
||||
it('extracts activated parameters correctly in lowercase', () => {
|
||||
expect(parseModelId('model-100b-a10b-bf16')).toMatchObject({ activatedParams: 'A10B' });
|
||||
expect(parseModelId('model-100b-a10b:q4_k_m')).toMatchObject({ activatedParams: 'A10B' });
|
||||
});
|
||||
|
||||
it('extracts quantization correctly', () => {
|
||||
// Dash-separated quantization
|
||||
expect(parseModelId('model-100B-UD-IQ1_S')).toMatchObject({ quantization: 'UD-IQ1_S' });
|
||||
expect(parseModelId('model-100B-IQ4_XS')).toMatchObject({ quantization: 'IQ4_XS' });
|
||||
expect(parseModelId('model-100B-Q4_K_M')).toMatchObject({ quantization: 'Q4_K_M' });
|
||||
expect(parseModelId('model-100B-Q8_0')).toMatchObject({ quantization: 'Q8_0' });
|
||||
expect(parseModelId('model-100B-UD-Q8_K_XL')).toMatchObject({ quantization: 'UD-Q8_K_XL' });
|
||||
expect(parseModelId('model-100B-F16')).toMatchObject({ quantization: 'F16' });
|
||||
expect(parseModelId('model-100B-BF16')).toMatchObject({ quantization: 'BF16' });
|
||||
expect(parseModelId('model-100B-MXFP4')).toMatchObject({ quantization: 'MXFP4' });
|
||||
|
||||
// Colon-separated quantization
|
||||
expect(parseModelId('model-100B:UD-IQ1_S')).toMatchObject({ quantization: 'UD-IQ1_S' });
|
||||
expect(parseModelId('model-100B:IQ4_XS')).toMatchObject({ quantization: 'IQ4_XS' });
|
||||
expect(parseModelId('model-100B:Q4_K_M')).toMatchObject({ quantization: 'Q4_K_M' });
|
||||
expect(parseModelId('model-100B:Q8_0')).toMatchObject({ quantization: 'Q8_0' });
|
||||
expect(parseModelId('model-100B:UD-Q8_K_XL')).toMatchObject({ quantization: 'UD-Q8_K_XL' });
|
||||
expect(parseModelId('model-100B:F16')).toMatchObject({ quantization: 'F16' });
|
||||
expect(parseModelId('model-100B:BF16')).toMatchObject({ quantization: 'BF16' });
|
||||
expect(parseModelId('model-100B:MXFP4')).toMatchObject({ quantization: 'MXFP4' });
|
||||
|
||||
// Dot-separated quantization
|
||||
expect(parseModelId('nomic-embed-text-v2-moe.Q4_K_M')).toMatchObject({
|
||||
quantization: 'Q4_K_M'
|
||||
});
|
||||
});
|
||||
|
||||
it('extracts additional tags correctly', () => {
|
||||
expect(parseModelId('model-100B-foobar-Q4_K_M')).toMatchObject({ tags: ['foobar'] });
|
||||
expect(parseModelId('model-100B-A10B-foobar-1M-BF16')).toMatchObject({
|
||||
tags: ['foobar', '1M']
|
||||
});
|
||||
expect(parseModelId('model-100B-1M-foobar:UD-Q8_K_XL')).toMatchObject({
|
||||
tags: ['1M', 'foobar']
|
||||
});
|
||||
});
|
||||
|
||||
it('filters out container format segments from tags', () => {
|
||||
expect(parseModelId('model-100B-GGUF-Instruct-BF16')).toMatchObject({
|
||||
tags: ['Instruct']
|
||||
});
|
||||
expect(parseModelId('model-100B-GGML-Instruct:Q4_K_M')).toMatchObject({
|
||||
tags: ['Instruct']
|
||||
});
|
||||
});
|
||||
|
||||
it('handles real-world examples correctly', () => {
|
||||
expect(parseModelId('meta-llama/Llama-3.1-8B')).toStrictEqual({
|
||||
activatedParams: null,
|
||||
modelName: 'Llama-3.1',
|
||||
orgName: 'meta-llama',
|
||||
params: '8B',
|
||||
quantization: null,
|
||||
raw: 'meta-llama/Llama-3.1-8B',
|
||||
tags: []
|
||||
});
|
||||
|
||||
expect(parseModelId('openai/gpt-oss-120b-MXFP4')).toStrictEqual({
|
||||
activatedParams: null,
|
||||
modelName: 'gpt-oss',
|
||||
orgName: 'openai',
|
||||
params: '120B',
|
||||
quantization: 'MXFP4',
|
||||
raw: 'openai/gpt-oss-120b-MXFP4',
|
||||
tags: []
|
||||
});
|
||||
|
||||
expect(parseModelId('openai/gpt-oss-20b:Q4_K_M')).toStrictEqual({
|
||||
activatedParams: null,
|
||||
modelName: 'gpt-oss',
|
||||
orgName: 'openai',
|
||||
params: '20B',
|
||||
quantization: 'Q4_K_M',
|
||||
raw: 'openai/gpt-oss-20b:Q4_K_M',
|
||||
tags: []
|
||||
});
|
||||
|
||||
expect(parseModelId('Qwen/Qwen3-Coder-30B-A3B-Instruct-1M-BF16')).toStrictEqual({
|
||||
activatedParams: 'A3B',
|
||||
modelName: 'Qwen3-Coder',
|
||||
orgName: 'Qwen',
|
||||
params: '30B',
|
||||
quantization: 'BF16',
|
||||
raw: 'Qwen/Qwen3-Coder-30B-A3B-Instruct-1M-BF16',
|
||||
tags: ['Instruct', '1M']
|
||||
});
|
||||
});
|
||||
|
||||
it('handles real-world examples with quantization in segments', () => {
|
||||
expect(parseModelId('meta-llama/Llama-4-Scout-17B-16E-Instruct-Q4_K_M')).toStrictEqual({
|
||||
activatedParams: null,
|
||||
modelName: 'Llama-4-Scout',
|
||||
orgName: 'meta-llama',
|
||||
params: '17B',
|
||||
quantization: 'Q4_K_M',
|
||||
raw: 'meta-llama/Llama-4-Scout-17B-16E-Instruct-Q4_K_M',
|
||||
tags: ['16E', 'Instruct']
|
||||
});
|
||||
|
||||
expect(parseModelId('MiniMaxAI/MiniMax-M2-IQ4_XS')).toStrictEqual({
|
||||
activatedParams: null,
|
||||
modelName: 'MiniMax-M2',
|
||||
orgName: 'MiniMaxAI',
|
||||
params: null,
|
||||
quantization: 'IQ4_XS',
|
||||
raw: 'MiniMaxAI/MiniMax-M2-IQ4_XS',
|
||||
tags: []
|
||||
});
|
||||
|
||||
expect(parseModelId('MiniMaxAI/MiniMax-M2-UD-Q3_K_XL')).toStrictEqual({
|
||||
activatedParams: null,
|
||||
modelName: 'MiniMax-M2',
|
||||
orgName: 'MiniMaxAI',
|
||||
params: null,
|
||||
quantization: 'UD-Q3_K_XL',
|
||||
raw: 'MiniMaxAI/MiniMax-M2-UD-Q3_K_XL',
|
||||
tags: []
|
||||
});
|
||||
|
||||
expect(parseModelId('mistralai/Devstral-2-123B-Instruct-2512-Q4_K_M')).toStrictEqual({
|
||||
activatedParams: null,
|
||||
modelName: 'Devstral-2',
|
||||
orgName: 'mistralai',
|
||||
params: '123B',
|
||||
quantization: 'Q4_K_M',
|
||||
raw: 'mistralai/Devstral-2-123B-Instruct-2512-Q4_K_M',
|
||||
tags: ['Instruct', '2512']
|
||||
});
|
||||
|
||||
expect(parseModelId('mistralai/Devstral-Small-2-24B-Instruct-2512-Q8_0')).toStrictEqual({
|
||||
activatedParams: null,
|
||||
modelName: 'Devstral-Small-2',
|
||||
orgName: 'mistralai',
|
||||
params: '24B',
|
||||
quantization: 'Q8_0',
|
||||
raw: 'mistralai/Devstral-Small-2-24B-Instruct-2512-Q8_0',
|
||||
tags: ['Instruct', '2512']
|
||||
});
|
||||
|
||||
expect(parseModelId('noctrex/GLM-4.7-Flash-MXFP4_MOE')).toStrictEqual({
|
||||
activatedParams: null,
|
||||
modelName: 'GLM-4.7-Flash',
|
||||
orgName: 'noctrex',
|
||||
params: null,
|
||||
quantization: 'MXFP4_MOE',
|
||||
raw: 'noctrex/GLM-4.7-Flash-MXFP4_MOE',
|
||||
tags: []
|
||||
});
|
||||
|
||||
expect(parseModelId('Qwen/Qwen3-Coder-Next-Q4_K_M')).toStrictEqual({
|
||||
activatedParams: null,
|
||||
modelName: 'Qwen3-Coder-Next',
|
||||
orgName: 'Qwen',
|
||||
params: null,
|
||||
quantization: 'Q4_K_M',
|
||||
raw: 'Qwen/Qwen3-Coder-Next-Q4_K_M',
|
||||
tags: []
|
||||
});
|
||||
|
||||
expect(parseModelId('openai/gpt-oss-120b-Q4_K_M')).toStrictEqual({
|
||||
activatedParams: null,
|
||||
modelName: 'gpt-oss',
|
||||
orgName: 'openai',
|
||||
params: '120B',
|
||||
quantization: 'Q4_K_M',
|
||||
raw: 'openai/gpt-oss-120b-Q4_K_M',
|
||||
tags: []
|
||||
});
|
||||
|
||||
expect(parseModelId('openai/gpt-oss-20b-F16')).toStrictEqual({
|
||||
activatedParams: null,
|
||||
modelName: 'gpt-oss',
|
||||
orgName: 'openai',
|
||||
params: '20B',
|
||||
quantization: 'F16',
|
||||
raw: 'openai/gpt-oss-20b-F16',
|
||||
tags: []
|
||||
});
|
||||
|
||||
expect(parseModelId('nomic-embed-text-v2-moe.Q4_K_M')).toStrictEqual({
|
||||
activatedParams: null,
|
||||
modelName: 'nomic-embed-text-v2-moe',
|
||||
orgName: null,
|
||||
params: null,
|
||||
quantization: 'Q4_K_M',
|
||||
raw: 'nomic-embed-text-v2-moe.Q4_K_M',
|
||||
tags: []
|
||||
});
|
||||
});
|
||||
|
||||
it('handles ambiguous model names', () => {
|
||||
// Qwen3.5 Instruct vs Thinking — tags should distinguish them
|
||||
expect(parseModelId('Qwen/Qwen3.5-30B-A3B-Instruct')).toMatchObject({
|
||||
modelName: 'Qwen3.5',
|
||||
params: '30B',
|
||||
activatedParams: 'A3B',
|
||||
tags: ['Instruct']
|
||||
});
|
||||
|
||||
expect(parseModelId('Qwen/Qwen3.5-30B-A3B-Thinking')).toMatchObject({
|
||||
modelName: 'Qwen3.5',
|
||||
params: '30B',
|
||||
activatedParams: 'A3B',
|
||||
tags: ['Thinking']
|
||||
});
|
||||
|
||||
// Dot-separated quantization with variant suffixes
|
||||
expect(parseModelId('gemma-3-27b-it-heretic-v2.Q8_0')).toMatchObject({
|
||||
modelName: 'gemma-3',
|
||||
params: '27B',
|
||||
quantization: 'Q8_0',
|
||||
tags: ['it', 'heretic', 'v2']
|
||||
});
|
||||
|
||||
expect(parseModelId('gemma-3-27b-it.Q8_0')).toMatchObject({
|
||||
modelName: 'gemma-3',
|
||||
params: '27B',
|
||||
quantization: 'Q8_0',
|
||||
tags: ['it']
|
||||
});
|
||||
});
|
||||
});
|
||||
Reference in New Issue
Block a user