Compare commits

...

13 Commits

Author SHA1 Message Date
Pascal
4065c1a3a6 Server becomes the source of truth for sampling parameter defaults (#20558)
* webui: make server the source of truth for sampling defaults

* webui: fix Custom badge for sampling parameters

* webui: log user overrides after server sync

* chore: update webui build output

* fix: Default values for sampling settings config object

* chore: update webui build output

---------

Co-authored-by: Aleksander Grygier <aleksander.grygier@gmail.com>
2026-03-19 13:20:39 +01:00
Xuan-Son Nguyen
1e64534570 mtmd: add clip_graph::build_mm() (#20751)
* clip: add build_mm()

* apply to all models

* add TODO for bias overload
2026-03-19 13:11:39 +01:00
Pascal
cd708db0cc WebUI: Persist the on/off state of the MCP servers for new conversations (#20750)
* webui: add persistent storage for MCP server on/off state in new chats

* webui: simplify MCP enabled checks, remove dead server.enabled fallback

* chore: update webui build output

* chore: update webui build output

---------

Co-authored-by: Aleksander Grygier <aleksander.grygier@gmail.com>
2026-03-19 12:54:06 +01:00
Aleksander Grygier
512bba6ee0 webui: Improve model parsing logic + add unit tests (#20749)
* add tests for model id parser

* add test case having activated params

* add structured tests for model id parser

* add ToDo

* feat: Improve model parsing logic + tests

* chore: update webui build output

---------

Co-authored-by: bluemoehre <bluemoehre@gmx.de>
2026-03-19 12:25:50 +01:00
Dowon
b486c17b3e convert : support is_causal hyperparameter (#20746)
* convert : support is_causal hyperparameter

Check for the `is_causal` attribute in the Hugging Face model configuration and include it in the GGUF metadata.

* Update convert_hf_to_gguf.py

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* style: fix F541 f-string is missing placeholders

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
2026-03-19 11:41:11 +01:00
Aldehir Rojas
1b9bbaa357 common : fix gpt-oss content removal (#20745) 2026-03-19 11:40:39 +01:00
Eve
07feeaa92e vulkan: dequantize iq4_xs 4 at a time (#20657) 2026-03-19 11:32:04 +01:00
Charles Xu
3fee84e156 cmake : fix build warning when kleidiai is enabled (#20457)
* cmake : fix build warning when kleidiai is enabled

* remove LLAMA_ARG_THREADS from KleidiAI backend
2026-03-19 10:14:48 +02:00
Sigbjørn Skjæret
811397745e vocab : assert array size of scores and toktypes (#20737) 2026-03-19 08:34:04 +01:00
Kevin Hannon
c014c3f83a docs: add information about openvino in the docker page (#20743) 2026-03-19 15:08:47 +08:00
Chenguang Li
7f2cbd9a4d CANN: handle in-place ROPE on non-contiguous f32 tensors (#20274)
RotaryPositionEmbedding on CANN fails when src and dst share the same
non-contiguous buffer (inplace + view), because the operator overwrites
source data before it is fully read.

Add a branch that detects this case and uses contiguous temporary
buffers: copy src to temp, run ROPE into another temp, then copy back
to the non-contiguous dst. Fixes 20 failing ROPE tests (f32, v=1,
inplace=1).

Signed-off-by: noemotiovon <757486878@qq.com>
2026-03-19 14:05:01 +08:00
Masashi Yoshimura
509a31d00f ggml-webgpu: Update the RMS_NORM preprocessor and add L2_NORM (#20665)
* Update the preprocessor of RMS_NORM and add L2_NORM.

* Fix the name of rms_norm to row_norm.
2026-03-18 21:08:59 -07:00
Masashi Yoshimura
ea01d196d7 ggml-webgpu: Add supports for DIAG and TRI (#20664)
* Add supports for DIAG and TRI.

* Remove extra ttype and add a comment for TRI op.
2026-03-18 21:08:35 -07:00
42 changed files with 794 additions and 317 deletions

View File

@@ -936,7 +936,9 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
for (auto msg : inputs.messages) {
if (msg.contains("reasoning_content") && msg.at("reasoning_content").is_string()) {
msg["thinking"] = msg.at("reasoning_content");
msg.erase("content");
if (msg.contains("tool_calls") && msg.at("tool_calls").is_array() && !msg.at("tool_calls").empty()) {
msg.erase("content");
}
}
adjusted_messages.push_back(msg);
}

View File

@@ -1062,6 +1062,10 @@ class TextModel(ModelBase):
self.gguf_writer.add_head_count_kv(n_head_kv)
logger.info(f"gguf: key-value head count = {n_head_kv}")
if self.hparams.get("is_causal") is False:
self.gguf_writer.add_causal_attention(False)
logger.info("gguf: causal attention = False")
# TODO: Handle "sliding_attention" similarly when models start implementing it
rope_params = self.rope_parameters.get("full_attention", self.rope_parameters)
if (rope_type := rope_params.get("rope_type")) is not None:

View File

@@ -28,6 +28,9 @@ Additionally, there the following images, similar to the above:
- `ghcr.io/ggml-org/llama.cpp:full-vulkan`: Same as `full` but compiled with Vulkan support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:light-vulkan`: Same as `light` but compiled with Vulkan support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:server-vulkan`: Same as `server` but compiled with Vulkan support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:full-openvino`: Same as `full` but compiled with OpenVINO support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:light-openvino`: Same as `light` but compiled with OpenVINO support. (platforms: `linux/amd64`)
- `ghcr.io/ggml-org/llama.cpp:server-openvino`: Same as `server` but compiled with OpenVINO support. (platforms: `linux/amd64`)
The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](../.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](../.github/workflows/docker.yml). If you need different settings (for example, a different CUDA, ROCm or MUSA library), you'll need to build the images locally for now.

View File

@@ -37,7 +37,7 @@ Legend:
| CROSS_ENTROPY_LOSS | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| CROSS_ENTROPY_LOSS_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| CUMSUM | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ |
| DIAG | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | | ❌ | ❌ |
| DIAG | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | | ❌ | ❌ |
| DIAG_MASK_INF | ❌ | ✅ | ✅ | ✅ | ❌ | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
| DIV | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
| DUP | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
@@ -62,7 +62,7 @@ Legend:
| HARDSWISH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| IM2COL | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
| IM2COL_3D | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
| L2_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | | ❌ | ❌ |
| L2_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | | ❌ | ❌ |
| LEAKY_RELU | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ |
| LOG | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | ✅ | ✅ | ❌ | ❌ |
| MEAN | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
@@ -115,7 +115,7 @@ Legend:
| TANH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
| TOP_K | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
| TRI | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | | ❌ | ❌ |
| TRI | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | | ❌ | ❌ |
| TRUNC | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
| UPSCALE | ❌ | 🟡 | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
| XIELU | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ |

View File

@@ -5744,49 +5744,61 @@
"WebGPU: WebGPU","NORM","type=f32,ne=[64,5,4,3],v=1,eps=0.000000","support","0","no","WebGPU"
"WebGPU: WebGPU","RMS_NORM","type=f32,ne=[64,5,4,3],v=1,eps=0.000000,inplace=0","support","1","yes","WebGPU"
"WebGPU: WebGPU","RMS_NORM_BACK","type=f32,ne=[64,5,4,3],eps=0.000000","support","0","no","WebGPU"
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[64,5,4,3]","support","0","no","WebGPU"
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[64,5,4,3],eps=0.000000,v=0","support","1","yes","WebGPU"
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[64,5,4,3],eps=0.000000,v=1","support","1","yes","WebGPU"
"WebGPU: WebGPU","NORM","type=f32,ne=[1025,5,4,3],v=0,eps=0.000000","support","0","no","WebGPU"
"WebGPU: WebGPU","RMS_NORM","type=f32,ne=[1025,5,4,3],v=0,eps=0.000000,inplace=0","support","1","yes","WebGPU"
"WebGPU: WebGPU","NORM","type=f32,ne=[1025,5,4,3],v=1,eps=0.000000","support","0","no","WebGPU"
"WebGPU: WebGPU","RMS_NORM","type=f32,ne=[1025,5,4,3],v=1,eps=0.000000,inplace=0","support","1","yes","WebGPU"
"WebGPU: WebGPU","RMS_NORM_BACK","type=f32,ne=[1025,5,4,3],eps=0.000000","support","0","no","WebGPU"
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[1025,5,4,3]","support","0","no","WebGPU"
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[1025,5,4,3],eps=0.000000,v=0","support","1","yes","WebGPU"
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[1025,5,4,3],eps=0.000000,v=1","support","1","yes","WebGPU"
"WebGPU: WebGPU","NORM","type=f32,ne=[64,5,4,3],v=0,eps=0.000001","support","0","no","WebGPU"
"WebGPU: WebGPU","RMS_NORM","type=f32,ne=[64,5,4,3],v=0,eps=0.000001,inplace=0","support","1","yes","WebGPU"
"WebGPU: WebGPU","NORM","type=f32,ne=[64,5,4,3],v=1,eps=0.000001","support","0","no","WebGPU"
"WebGPU: WebGPU","RMS_NORM","type=f32,ne=[64,5,4,3],v=1,eps=0.000001,inplace=0","support","1","yes","WebGPU"
"WebGPU: WebGPU","RMS_NORM_BACK","type=f32,ne=[64,5,4,3],eps=0.000001","support","0","no","WebGPU"
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[64,5,4,3]","support","0","no","WebGPU"
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[64,5,4,3],eps=0.000001,v=0","support","1","yes","WebGPU"
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[64,5,4,3],eps=0.000001,v=1","support","1","yes","WebGPU"
"WebGPU: WebGPU","NORM","type=f32,ne=[1025,5,4,3],v=0,eps=0.000001","support","0","no","WebGPU"
"WebGPU: WebGPU","RMS_NORM","type=f32,ne=[1025,5,4,3],v=0,eps=0.000001,inplace=0","support","1","yes","WebGPU"
"WebGPU: WebGPU","NORM","type=f32,ne=[1025,5,4,3],v=1,eps=0.000001","support","0","no","WebGPU"
"WebGPU: WebGPU","RMS_NORM","type=f32,ne=[1025,5,4,3],v=1,eps=0.000001,inplace=0","support","1","yes","WebGPU"
"WebGPU: WebGPU","RMS_NORM_BACK","type=f32,ne=[1025,5,4,3],eps=0.000001","support","0","no","WebGPU"
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[1025,5,4,3]","support","0","no","WebGPU"
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[1025,5,4,3],eps=0.000001,v=0","support","1","yes","WebGPU"
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[1025,5,4,3],eps=0.000001,v=1","support","1","yes","WebGPU"
"WebGPU: WebGPU","NORM","type=f32,ne=[64,5,4,3],v=0,eps=0.000100","support","0","no","WebGPU"
"WebGPU: WebGPU","RMS_NORM","type=f32,ne=[64,5,4,3],v=0,eps=0.000100,inplace=0","support","1","yes","WebGPU"
"WebGPU: WebGPU","NORM","type=f32,ne=[64,5,4,3],v=1,eps=0.000100","support","0","no","WebGPU"
"WebGPU: WebGPU","RMS_NORM","type=f32,ne=[64,5,4,3],v=1,eps=0.000100,inplace=0","support","1","yes","WebGPU"
"WebGPU: WebGPU","RMS_NORM_BACK","type=f32,ne=[64,5,4,3],eps=0.000100","support","0","no","WebGPU"
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[64,5,4,3]","support","0","no","WebGPU"
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[64,5,4,3],eps=0.000100,v=0","support","1","yes","WebGPU"
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[64,5,4,3],eps=0.000100,v=1","support","1","yes","WebGPU"
"WebGPU: WebGPU","NORM","type=f32,ne=[1025,5,4,3],v=0,eps=0.000100","support","0","no","WebGPU"
"WebGPU: WebGPU","RMS_NORM","type=f32,ne=[1025,5,4,3],v=0,eps=0.000100,inplace=0","support","1","yes","WebGPU"
"WebGPU: WebGPU","NORM","type=f32,ne=[1025,5,4,3],v=1,eps=0.000100","support","0","no","WebGPU"
"WebGPU: WebGPU","RMS_NORM","type=f32,ne=[1025,5,4,3],v=1,eps=0.000100,inplace=0","support","1","yes","WebGPU"
"WebGPU: WebGPU","RMS_NORM_BACK","type=f32,ne=[1025,5,4,3],eps=0.000100","support","0","no","WebGPU"
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[1025,5,4,3]","support","0","no","WebGPU"
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[1025,5,4,3],eps=0.000100,v=0","support","1","yes","WebGPU"
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[1025,5,4,3],eps=0.000100,v=1","support","1","yes","WebGPU"
"WebGPU: WebGPU","NORM","type=f32,ne=[64,5,4,3],v=0,eps=0.100000","support","0","no","WebGPU"
"WebGPU: WebGPU","RMS_NORM","type=f32,ne=[64,5,4,3],v=0,eps=0.100000,inplace=0","support","1","yes","WebGPU"
"WebGPU: WebGPU","NORM","type=f32,ne=[64,5,4,3],v=1,eps=0.100000","support","0","no","WebGPU"
"WebGPU: WebGPU","RMS_NORM","type=f32,ne=[64,5,4,3],v=1,eps=0.100000,inplace=0","support","1","yes","WebGPU"
"WebGPU: WebGPU","RMS_NORM_BACK","type=f32,ne=[64,5,4,3],eps=0.100000","support","0","no","WebGPU"
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[64,5,4,3]","support","0","no","WebGPU"
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[64,5,4,3],eps=0.100000,v=0","support","1","yes","WebGPU"
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[64,5,4,3],eps=0.100000,v=1","support","1","yes","WebGPU"
"WebGPU: WebGPU","NORM","type=f32,ne=[1025,5,4,3],v=0,eps=0.100000","support","0","no","WebGPU"
"WebGPU: WebGPU","RMS_NORM","type=f32,ne=[1025,5,4,3],v=0,eps=0.100000,inplace=0","support","1","yes","WebGPU"
"WebGPU: WebGPU","NORM","type=f32,ne=[1025,5,4,3],v=1,eps=0.100000","support","0","no","WebGPU"
"WebGPU: WebGPU","RMS_NORM","type=f32,ne=[1025,5,4,3],v=1,eps=0.100000,inplace=0","support","1","yes","WebGPU"
"WebGPU: WebGPU","RMS_NORM_BACK","type=f32,ne=[1025,5,4,3],eps=0.100000","support","0","no","WebGPU"
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[1025,5,4,3]","support","0","no","WebGPU"
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[1025,5,4,3],eps=0.100000,v=0","support","1","yes","WebGPU"
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[1025,5,4,3],eps=0.100000,v=1","support","1","yes","WebGPU"
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[64,5,4,3],eps=10.000000,v=0","support","1","yes","WebGPU"
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[64,5,4,3],eps=10.000000,v=1","support","1","yes","WebGPU"
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[1025,5,4,3],eps=10.000000,v=0","support","1","yes","WebGPU"
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[1025,5,4,3],eps=10.000000,v=1","support","1","yes","WebGPU"
"WebGPU: WebGPU","RMS_NORM","type=f32,ne=[64,5,4,3],v=0,eps=0.000001,inplace=1","support","1","yes","WebGPU"
"WebGPU: WebGPU","SSM_CONV","type=f32,ne_a=[3,1024,1,1],ne_b=[3,1024,1,1]","support","0","no","WebGPU"
"WebGPU: WebGPU","SSM_CONV","type=f32,ne_a=[6,1024,1,1],ne_b=[3,1024,1,1]","support","0","no","WebGPU"
@@ -10036,17 +10048,17 @@
"WebGPU: WebGPU","CUMSUM","type=f32,ne=[375960,1,1,1]","support","1","yes","WebGPU"
"WebGPU: WebGPU","CUMSUM","type=f32,ne=[20481,4,1,1]","support","1","yes","WebGPU"
"WebGPU: WebGPU","XIELU","type=f32,ne=[10,5,4,3]","support","1","yes","WebGPU"
"WebGPU: WebGPU","TRI","type=f32,ne=[10,10,4,3],tri_type=3","support","0","no","WebGPU"
"WebGPU: WebGPU","TRI","type=f32,ne=[10,10,4,3],tri_type=2","support","0","no","WebGPU"
"WebGPU: WebGPU","TRI","type=f32,ne=[10,10,4,3],tri_type=1","support","0","no","WebGPU"
"WebGPU: WebGPU","TRI","type=f32,ne=[10,10,4,3],tri_type=0","support","0","no","WebGPU"
"WebGPU: WebGPU","TRI","type=f32,ne=[10,10,4,3],tri_type=3","support","1","yes","WebGPU"
"WebGPU: WebGPU","TRI","type=f32,ne=[10,10,4,3],tri_type=2","support","1","yes","WebGPU"
"WebGPU: WebGPU","TRI","type=f32,ne=[10,10,4,3],tri_type=1","support","1","yes","WebGPU"
"WebGPU: WebGPU","TRI","type=f32,ne=[10,10,4,3],tri_type=0","support","1","yes","WebGPU"
"WebGPU: WebGPU","FILL","type=f32,ne=[10,10,4,3],c=0.000000","support","1","yes","WebGPU"
"WebGPU: WebGPU","FILL","type=f32,ne=[303,207,11,3],c=2.000000","support","1","yes","WebGPU"
"WebGPU: WebGPU","FILL","type=f32,ne=[800,600,4,4],c=-152.000000","support","1","yes","WebGPU"
"WebGPU: WebGPU","FILL","type=f32,ne=[2048,512,2,2],c=3.500000","support","1","yes","WebGPU"
"WebGPU: WebGPU","DIAG","type=f32,ne=[10,1,4,3]","support","0","no","WebGPU"
"WebGPU: WebGPU","DIAG","type=f32,ne=[79,1,19,13]","support","0","no","WebGPU"
"WebGPU: WebGPU","DIAG","type=f32,ne=[256,1,8,16]","support","0","no","WebGPU"
"WebGPU: WebGPU","DIAG","type=f32,ne=[10,1,4,3]","support","1","yes","WebGPU"
"WebGPU: WebGPU","DIAG","type=f32,ne=[79,1,19,13]","support","1","yes","WebGPU"
"WebGPU: WebGPU","DIAG","type=f32,ne=[256,1,8,16]","support","1","yes","WebGPU"
"WebGPU: WebGPU","SOLVE_TRI","type=f32,ne_lhs=[10,10,4,3],ne_rhs=[3,10,4,3]","support","0","no","WebGPU"
"WebGPU: WebGPU","SOLVE_TRI","type=f32,ne_lhs=[11,11,1,1],ne_rhs=[5,11,1,1]","support","0","no","WebGPU"
"WebGPU: WebGPU","SOLVE_TRI","type=f32,ne_lhs=[17,17,2,4],ne_rhs=[9,17,2,4]","support","0","no","WebGPU"
Can't render this file because it is too large.

View File

@@ -2943,6 +2943,27 @@ void ggml_cann_rope(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
// Rotate full tensor (no tail), using trans tensors
GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src_trans_tensor.get(), acl_cos_reshape_tensor.get(),
acl_sin_reshape_tensor.get(), acl_mode, acl_dst_trans_tensor.get());
} else if (src0->data == dst->data && !ggml_is_contiguous(src0)) {
// In-place on non-contiguous tensor: RotaryPositionEmbedding cannot safely
// read and write the same non-contiguous buffer. Use contiguous temporaries.
size_t contiguous_nb[GGML_MAX_DIMS];
contiguous_nb[0] = sizeof(float);
for (int i = 1; i < GGML_MAX_DIMS; i++) {
contiguous_nb[i] = contiguous_nb[i - 1] * src0->ne[i - 1];
}
int64_t total_elements = ggml_nelements(src0);
ggml_cann_pool_alloc inplace_src_alloc(ctx.pool(), total_elements * sizeof(float));
ggml_cann_pool_alloc inplace_dst_alloc(ctx.pool(), total_elements * sizeof(float));
acl_tensor_ptr acl_src_contig = ggml_cann_create_tensor(inplace_src_alloc.get(), ACL_FLOAT, sizeof(float),
src0->ne, contiguous_nb, GGML_MAX_DIMS);
acl_tensor_ptr acl_dst_contig = ggml_cann_create_tensor(inplace_dst_alloc.get(), ACL_FLOAT, sizeof(float),
dst->ne, contiguous_nb, GGML_MAX_DIMS);
cann_copy(ctx, acl_src.get(), acl_src_contig.get());
GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src_contig.get(), acl_cos_reshape_tensor.get(),
acl_sin_reshape_tensor.get(), acl_mode, acl_dst_contig.get());
cann_copy(ctx, acl_dst_contig.get(), acl_dst.get());
} else {
// Rotate full tensor (no tail), using original tensors
GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src.get(), acl_cos_reshape_tensor.get(),

View File

@@ -570,24 +570,34 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz")
set(KLEIDIAI_ARCHIVE_MD5 "54049037570ab0ee0a0d126b2ba5ece1")
if (POLICY CMP0135)
cmake_policy(SET CMP0135 NEW)
endif()
# TODO: Use FetchContent_MakeAvailable with EXCLUDE_FROM_ALL after bumping minimum CMake version to 3.28+
# Using FetchContent_Populate instead to avoid EXCLUDE_FROM_ALL which requires CMake 3.28
FetchContent_Declare(KleidiAI_Download
set(KLEIDIAI_FETCH_ARGS
URL ${KLEIDIAI_DOWNLOAD_URL}
DOWNLOAD_EXTRACT_TIMESTAMP NEW
URL_HASH MD5=${KLEIDIAI_ARCHIVE_MD5})
URL_HASH MD5=${KLEIDIAI_ARCHIVE_MD5}
)
FetchContent_GetProperties(KleidiAI_Download
SOURCE_DIR KLEIDIAI_SRC
POPULATED KLEIDIAI_POPULATED)
if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.28")
FetchContent_Declare(KleidiAI_Download
${KLEIDIAI_FETCH_ARGS}
EXCLUDE_FROM_ALL
)
if (NOT KLEIDIAI_POPULATED)
FetchContent_Populate(KleidiAI_Download)
FetchContent_MakeAvailable(KleidiAI_Download)
FetchContent_GetProperties(KleidiAI_Download SOURCE_DIR KLEIDIAI_SRC)
else()
FetchContent_Declare(KleidiAI_Download
${KLEIDIAI_FETCH_ARGS}
)
FetchContent_GetProperties(KleidiAI_Download
SOURCE_DIR KLEIDIAI_SRC
POPULATED KLEIDIAI_POPULATED
)
if (NOT KLEIDIAI_POPULATED)
FetchContent_Populate(KleidiAI_Download)
FetchContent_GetProperties(KleidiAI_Download SOURCE_DIR KLEIDIAI_SRC)
endif()
endif()
add_compile_definitions(GGML_USE_CPU_KLEIDIAI)

View File

@@ -444,19 +444,20 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2;
const uint ib = idx / 128; // 2 values per idx
const uint ib32 = (idx % 128) / 16; // 0..7
const uint iq = 16 * ib32 + 2 * (idx % 8);
const uint ib = idx / 64; // 4 values per idx
const uint ib32 = (idx % 64) / 8; // 0..7
const uint iq = 4 * ib32 + (idx % 4);
const uint sl = (data_a[ib].scales_l[ib32/2] >> (4 * (ib32 & 1))) & 0xF;
const uint sh = ((data_a[ib].scales_h) >> (2 * ib32)) & 3;
const uint qshift = (idx & 8) >> 1;
u8vec2 qs = unpack8((uint(data_a_packed16[ib].qs[iq/2]) >> qshift) & 0x0F0F).xy;
const uint qshift = idx & 4;
u8vec4 qs = unpack8((uint(data_a_packed32[ib].qs[iq]) >> qshift) & 0x0F0F0F0F);
const float d = float(data_a[ib].d);
const vec2 v = d * float(int(sl | (sh << 4)) - 32) * vec2(kvalues_iq4nl[qs.x], kvalues_iq4nl[qs.y]);
const vec4 v = d * float(int(sl | (sh << 4)) - 32) * vec4(kvalues_iq4nl[qs.x], kvalues_iq4nl[qs.y], kvalues_iq4nl[qs.z], kvalues_iq4nl[qs.w]);
buf_a[buf_idx ] = FLOAT_TYPE_VEC2(v.xy);
buf_a[buf_idx + 1] = FLOAT_TYPE_VEC2(v.zw);
#elif defined(DATA_A_IQ4_NL)
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 4;

View File

@@ -554,7 +554,7 @@ void matmul_shaders(bool fp16, MatMulIdType matmul_id_type, bool coopmat, bool c
std::string load_vec_quant = "2";
if ((tname == "q4_0") || (tname == "q4_1") || (tname == "q5_1") || (tname == "iq1_s") || (tname == "iq1_m") || (tname == "iq2_xxs") || (tname == "iq2_xs") || (tname == "iq2_s"))
load_vec_quant = "8";
else if ((tname == "q5_0") || (tname == "q8_0") || (tname == "q2_k") || (tname == "q4_k") || (tname == "q5_k") || (tname == "iq3_xxs") || (tname == "iq3_s") || (tname == "iq4_nl") || (tname == "mxfp4"))
else if ((tname == "q5_0") || (tname == "q8_0") || (tname == "q2_k") || (tname == "q4_k") || (tname == "q5_k") || (tname == "iq3_xxs") || (tname == "iq3_s") || (tname == "iq4_xs") || (tname == "iq4_nl") || (tname == "mxfp4"))
load_vec_quant = "4";
if (tname == "bf16") {

View File

@@ -151,6 +151,26 @@ struct ggml_webgpu_get_rows_pipeline_key_hash {
}
};
/** Row Norm **/
struct ggml_webgpu_row_norm_pipeline_key {
ggml_op op;
bool inplace;
bool operator==(const ggml_webgpu_row_norm_pipeline_key & other) const {
return op == other.op && inplace == other.inplace;
}
};
struct ggml_webgpu_row_norm_pipeline_key_hash {
size_t operator()(const ggml_webgpu_row_norm_pipeline_key & key) const {
size_t seed = 0;
ggml_webgpu_hash_combine(seed, key.op);
ggml_webgpu_hash_combine(seed, key.inplace);
return seed;
}
};
/** Pad **/
struct ggml_webgpu_pad_pipeline_key {
bool circular;
@@ -244,13 +264,15 @@ struct ggml_webgpu_binary_pipeline_key_hash {
/** Unary **/
struct ggml_webgpu_unary_pipeline_key {
int type;
int op;
bool is_unary; // many unary operators fall under the GGML_OP_UNARY umbrella
bool inplace;
int type;
int op;
bool is_unary; // many unary operators fall under the GGML_OP_UNARY umbrella
bool inplace;
ggml_tri_type ttype; // only used for GGML_OP_TRI
bool operator==(const ggml_webgpu_unary_pipeline_key & other) const {
return type == other.type && op == other.op && is_unary == other.is_unary && inplace == other.inplace;
return type == other.type && op == other.op && is_unary == other.is_unary && inplace == other.inplace &&
ttype == other.ttype;
}
};
@@ -261,6 +283,7 @@ struct ggml_webgpu_unary_pipeline_key_hash {
ggml_webgpu_hash_combine(seed, key.op);
ggml_webgpu_hash_combine(seed, key.is_unary);
ggml_webgpu_hash_combine(seed, key.inplace);
ggml_webgpu_hash_combine(seed, key.ttype);
return seed;
}
};
@@ -435,6 +458,8 @@ class ggml_webgpu_shader_lib {
std::unordered_map<int, webgpu_pipeline> argsort_pipelines; // key is order
std::unordered_map<int, webgpu_pipeline> argsort_merge_pipelines; // key is order
std::unordered_map<int, webgpu_pipeline> cumsum_pipelines; // key is fixed, no variants yet
std::unordered_map<ggml_webgpu_row_norm_pipeline_key, webgpu_pipeline, ggml_webgpu_row_norm_pipeline_key_hash>
row_norm_pipelines; // op/inplace
std::unordered_map<ggml_webgpu_get_rows_pipeline_key, webgpu_pipeline, ggml_webgpu_get_rows_pipeline_key_hash>
get_rows_pipelines; // src_type, vectorized
std::unordered_map<ggml_webgpu_unary_pipeline_key, webgpu_pipeline, ggml_webgpu_unary_pipeline_key_hash>
@@ -479,6 +504,44 @@ class ggml_webgpu_shader_lib {
return sum_rows_pipelines[1];
}
webgpu_pipeline get_row_norm_pipeline(const ggml_webgpu_shader_lib_context & context) {
ggml_webgpu_row_norm_pipeline_key key = {
.op = context.dst->op,
.inplace = context.inplace,
};
auto it = row_norm_pipelines.find(key);
if (it != row_norm_pipelines.end()) {
return it->second;
}
std::vector<std::string> defines;
std::string variant;
switch (key.op) {
case GGML_OP_RMS_NORM:
defines.push_back("OP_RMS_NORM");
variant = "rms_norm";
break;
case GGML_OP_L2_NORM:
defines.push_back("OP_L2_NORM");
variant = "l2_norm";
break;
default:
GGML_ABORT("Unsupported op for row_norm shader");
}
if (key.inplace) {
defines.push_back("INPLACE");
variant += "_inplace";
}
defines.push_back(std::string("WG_SIZE=") + std::to_string(context.max_wg_size));
auto processed = preprocessor.preprocess(wgsl_row_norm, defines);
row_norm_pipelines[key] = ggml_webgpu_create_pipeline(device, processed, variant);
return row_norm_pipelines[key];
}
webgpu_pipeline get_argmax_pipeline(const ggml_webgpu_shader_lib_context & context) {
bool vec4 = context.src0->ne[0] % 4 == 0;
@@ -1058,6 +1121,7 @@ class ggml_webgpu_shader_lib {
.op = op,
.is_unary = is_unary,
.inplace = context.inplace,
.ttype = (ggml_tri_type) ggml_get_op_params_i32(context.dst, 0),
};
auto it = unary_pipelines.find(key);
@@ -1088,6 +1152,29 @@ class ggml_webgpu_shader_lib {
variant += "_inplace";
}
if (op == GGML_OP_TRI) {
switch (key.ttype) {
case GGML_TRI_TYPE_LOWER:
defines.push_back("TRI_TYPE_LOWER");
variant += "_tri_type_lower";
break;
case GGML_TRI_TYPE_LOWER_DIAG:
defines.push_back("TRI_TYPE_LOWER_DIAG");
variant += "_tri_type_lower_diag";
break;
case GGML_TRI_TYPE_UPPER:
defines.push_back("TRI_TYPE_UPPER");
variant += "_tri_type_upper";
break;
case GGML_TRI_TYPE_UPPER_DIAG:
defines.push_back("TRI_TYPE_UPPER_DIAG");
variant += "_tri_upper_diag";
break;
default:
GGML_ABORT("Unsupported ggml_tri_type for unary shader");
}
}
defines.push_back(std::string("WG_SIZE=") + std::to_string(context.max_wg_size));
auto processed = preprocessor.preprocess(wgsl_unary, defines);

View File

@@ -366,7 +366,6 @@ struct webgpu_context_struct {
std::map<int, std::map<int, webgpu_pipeline>> cpy_pipelines; // src_type, dst_type
std::map<int, webgpu_pipeline> rms_norm_pipelines; // inplace
std::map<int, std::map<int, std::map<int, webgpu_pipeline>>> rope_pipelines; // type, ff, inplace
std::map<int, std::map<int, std::map<int, webgpu_pipeline>>> glu_pipelines; // glu_op, type, split
@@ -1598,8 +1597,8 @@ static webgpu_command ggml_webgpu_repeat(webgpu_context & ctx, ggml_tensor * src
return ggml_backend_webgpu_build(ctx->global_ctx, ctx->param_buf_pool, pipeline, params, entries, wg_x);
}
static webgpu_command ggml_webgpu_rms_norm(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * dst) {
int inplace = ggml_webgpu_tensor_equal(src, dst);
static webgpu_command ggml_webgpu_row_norm(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * dst) {
bool inplace = ggml_webgpu_tensor_equal(src, dst);
std::vector<uint32_t> params = {
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src) / ggml_type_size(src->type)),
@@ -1630,8 +1629,15 @@ static webgpu_command ggml_webgpu_rms_norm(webgpu_context & ctx, ggml_tensor * s
.size = ggml_webgpu_tensor_binding_size(ctx, dst) });
}
return ggml_backend_webgpu_build(ctx->global_ctx, ctx->param_buf_pool, ctx->rms_norm_pipelines[inplace], params,
entries, ggml_nrows(src));
ggml_webgpu_shader_lib_context shader_lib_ctx = {
.src0 = src,
.dst = dst,
.max_wg_size = WEBGPU_ROW_SPLIT_WG_SIZE,
.inplace = inplace,
};
webgpu_pipeline pipeline = ctx->shader_lib->get_row_norm_pipeline(shader_lib_ctx);
return ggml_backend_webgpu_build(ctx->global_ctx, ctx->param_buf_pool, pipeline, params, entries, ggml_nrows(src));
}
static webgpu_command ggml_webgpu_rope(webgpu_context & ctx,
@@ -2192,7 +2198,8 @@ static std::optional<webgpu_command> ggml_webgpu_encode_node(webgpu_context ctx,
case GGML_OP_REPEAT:
return ggml_webgpu_repeat(ctx, src0, node);
case GGML_OP_RMS_NORM:
return ggml_webgpu_rms_norm(ctx, src0, node);
case GGML_OP_L2_NORM:
return ggml_webgpu_row_norm(ctx, src0, node);
case GGML_OP_ROPE:
return ggml_webgpu_rope(ctx, src0, src1, src2, node);
case GGML_OP_GLU:
@@ -2209,6 +2216,8 @@ static std::optional<webgpu_command> ggml_webgpu_encode_node(webgpu_context ctx,
case GGML_OP_SQRT:
case GGML_OP_SIN:
case GGML_OP_COS:
case GGML_OP_DIAG:
case GGML_OP_TRI:
return ggml_webgpu_unary_op(ctx, src0, node);
case GGML_OP_PAD:
return ggml_webgpu_pad(ctx, src0, node);
@@ -2614,15 +2623,6 @@ static void ggml_webgpu_init_cpy_pipeline(webgpu_context & webgpu_ctx) {
ggml_webgpu_create_pipeline(webgpu_ctx->global_ctx->device, wgsl_cpy_f16_f16, "cpy_f16_f16", constants);
}
static void ggml_webgpu_init_rms_norm_pipeline(webgpu_context & webgpu_ctx) {
std::vector<wgpu::ConstantEntry> constants = ggml_webgpu_wg_size_entry(WEBGPU_ROW_SPLIT_WG_SIZE);
webgpu_ctx->rms_norm_pipelines[0] =
ggml_webgpu_create_pipeline(webgpu_ctx->global_ctx->device, wgsl_rms_norm, "rms_norm", constants);
webgpu_ctx->rms_norm_pipelines[1] = ggml_webgpu_create_pipeline(
webgpu_ctx->global_ctx->device, wgsl_rms_norm_inplace, "rms_norm_inplace", constants);
}
static void ggml_webgpu_init_rope_pipeline(webgpu_context & webgpu_ctx) {
std::vector<wgpu::ConstantEntry> constants = ggml_webgpu_wg_size_entry(WEBGPU_MAX_WG_SIZE);
@@ -2907,7 +2907,6 @@ static webgpu_context initialize_webgpu_context(ggml_backend_dev_t dev) {
wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead, "set_rows_host_error_buf");
ggml_webgpu_init_cpy_pipeline(webgpu_ctx);
ggml_webgpu_init_rms_norm_pipeline(webgpu_ctx);
ggml_webgpu_init_rope_pipeline(webgpu_ctx);
ggml_webgpu_init_glu_pipeline(webgpu_ctx);
ggml_webgpu_init_soft_max_pipeline(webgpu_ctx);
@@ -3118,6 +3117,7 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const
break;
}
case GGML_OP_RMS_NORM:
case GGML_OP_L2_NORM:
supports_op = op->type == GGML_TYPE_F32 && src0->type == GGML_TYPE_F32;
break;
case GGML_OP_ROPE:
@@ -3201,6 +3201,12 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const
case GGML_OP_COS:
supports_op = (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) && (src0->type == op->type);
break;
case GGML_OP_DIAG:
supports_op = (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) && (src0->type == op->type);
break;
case GGML_OP_TRI:
supports_op = (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) && (src0->type == op->type);
break;
case GGML_OP_PAD:
supports_op = op->type == GGML_TYPE_F32 && src0->type == GGML_TYPE_F32;
break;

View File

@@ -1,21 +1,11 @@
#define(VARIANTS)
[
{
"DECLS": ["NOT_INPLACE"]
},
{
"SHADER_SUFFIX": "inplace",
"DECLS": ["INPLACE"]
},
]
#end(VARIANTS)
#define(DECLS)
#decl(NOT_INPLACE)
#ifdef INPLACE
fn update(src_offset: u32, dst_offset: u32, scale: f32) {
src[dst_offset] = scale * src[src_offset];
}
@group(0) @binding(1)
var<uniform> params: Params;
#else
fn update(src_offset: u32, dst_offset: u32, scale: f32) {
dst[dst_offset] = scale * src[src_offset];
}
@@ -25,23 +15,7 @@ var<storage, read_write> dst: array<f32>;
@group(0) @binding(2)
var<uniform> params: Params;
#enddecl(NOT_INPLACE)
#decl(INPLACE)
fn update(src_offset: u32, dst_offset: u32, scale: f32) {
src[dst_offset] = scale * src[src_offset];
}
@group(0) @binding(1)
var<uniform> params: Params;
#enddecl(INPLACE)
#end(DECLS)
#define(SHADER)
#endif
struct Params {
offset_src: u32, // in elements
@@ -68,12 +42,9 @@ struct Params {
@group(0) @binding(0)
var<storage, read_write> src: array<f32>;
DECLS
var<workgroup> scratch: array<f32, WG_SIZE>;
override wg_size: u32;
var<workgroup> scratch: array<f32, wg_size>;
@compute @workgroup_size(wg_size)
@compute @workgroup_size(WG_SIZE)
fn main(@builtin(workgroup_id) wid: vec3<u32>,
@builtin(local_invocation_id) lid: vec3<u32>) {
@@ -86,7 +57,7 @@ fn main(@builtin(workgroup_id) wid: vec3<u32>,
let i_src_row = params.offset_src + i3 * params.stride_src3 + i2 * params.stride_src2 + i1 * params.stride_src1;
let i_dst_row = params.offset_dst + i3 * params.stride_dst3 + i2 * params.stride_dst2 + i1 * params.stride_dst1;
let elems = (params.ne0 + wg_size - 1) / wg_size;
let elems = (params.ne0 + WG_SIZE - 1) / WG_SIZE;
var sum = 0.0f;
var col = lid.x;
@@ -95,12 +66,12 @@ fn main(@builtin(workgroup_id) wid: vec3<u32>,
break;
}
sum += pow(src[i_src_row + col], 2.0);
col += wg_size;
col += WG_SIZE;
}
scratch[lid.x] = sum;
workgroupBarrier();
var offset = wg_size / 2;
var offset: u32 = WG_SIZE / 2;
while (offset > 0) {
if (lid.x < offset) {
scratch[lid.x] += scratch[lid.x + offset];
@@ -110,14 +81,17 @@ fn main(@builtin(workgroup_id) wid: vec3<u32>,
}
sum = scratch[0];
#ifdef OP_RMS_NORM
let scale = 1.0/sqrt(sum/f32(params.ne0) + params.eps);
#elif OP_L2_NORM
let scale = 1.0/max(sqrt(sum), params.eps);
#endif
col = lid.x;
for (var j: u32 = 0; j < elems; j++) {
if (col >= params.ne0) {
break;
}
update(i_src_row + col, i_dst_row + col, scale);
col += wg_size;
col += WG_SIZE;
}
}
#end(SHADER)

View File

@@ -5,7 +5,6 @@ enable f16;
#define TYPE f32
#endif
@group(0) @binding(0)
var<storage, read_write> src: array<TYPE>;
@@ -57,12 +56,20 @@ fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
return;
}
var i = gid.x;
let i3 = i / (params.ne2 * params.ne1 * params.ne0);
i = i % (params.ne2 * params.ne1 * params.ne0);
let i2 = i / (params.ne1 * params.ne0);
i = i % (params.ne1 * params.ne0);
let i1 = i / params.ne0;
let i0 = i % params.ne0;
let ne2 = params.ne2;
#ifdef DIAG
let ne1 = params.ne0;
#else
let ne1 = params.ne1;
#endif
let ne0 = params.ne0;
let i3 = i / (ne2 * ne1 * ne0);
i = i % (ne2 * ne1 * ne0);
let i2 = i / (ne1 * ne0);
i = i % (ne1 * ne0);
let i1 = i / ne0;
let i0 = i % ne0;
let src_idx = i0 * params.stride_src0 + i1 * params.stride_src1 +
i2 * params.stride_src2 + i3 * params.stride_src3;
@@ -184,6 +191,20 @@ fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
let res_f32 = cos(f32(src[params.offset_src + src_idx]));
let res = TYPE(res_f32);
#endif
#ifdef DIAG
let res = select(0.0, src[params.offset_src + i0 + i2 * params.stride_src2 + i3 * params.stride_src3], i0 == i1);
#endif
#ifdef TRI
#ifdef TRI_TYPE_LOWER
let res = select(0.0, src[params.offset_src + src_idx], i0 < i1);
#elif TRI_TYPE_LOWER_DIAG
let res = select(0.0, src[params.offset_src + src_idx], i0 <= i1);
#elif TRI_TYPE_UPPER
let res = select(0.0, src[params.offset_src + src_idx], i0 > i1);
#elif TRI_TYPE_UPPER_DIAG
let res = select(0.0, src[params.offset_src + src_idx], i0 >= i1);
#endif
#endif
#ifdef INPLACE
src[params.offset_src + src_idx] = res;

View File

@@ -2129,19 +2129,28 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
throw std::runtime_error("cannot find tokenizer vocab in model file\n");
}
const uint32_t n_tokens = gguf_get_arr_n(ctx, token_idx);
const float * scores = nullptr;
const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str());
if (score_idx != -1) {
const uint32_t n_scores = gguf_get_arr_n(ctx, score_idx);
if (n_scores < n_tokens) {
throw std::runtime_error("Index out of array bounds for scores (" + std::to_string(n_scores) + " < " + std::to_string(n_tokens) + ")\n");
}
scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
}
const int * toktypes = nullptr;
const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
if (toktype_idx != -1) {
const uint32_t n_toktypes = gguf_get_arr_n(ctx, toktype_idx);
if (n_toktypes < n_tokens) {
throw std::runtime_error("Index out of array bounds for toktypes (" + std::to_string(n_toktypes) + " < " + std::to_string(n_tokens) + ")\n");
}
toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
}
uint32_t n_tokens = gguf_get_arr_n(ctx, token_idx);
id_to_token.resize(n_tokens);
for (uint32_t i = 0; i < n_tokens; i++) {

View File

@@ -41,6 +41,11 @@ struct clip_graph {
virtual ~clip_graph() = default;
virtual ggml_cgraph * build() = 0;
// wrapper around ggml_mul_mat, allow hooking (e.g. LoRA, clamping) depending on the model
// tensor w should be the weight matrix, and tensor x should be the input
virtual ggml_tensor * build_mm(ggml_tensor * w, ggml_tensor * x) const;
// TODO: build_mm(w, b, x) to support bias
//
// utility functions
//

View File

@@ -255,6 +255,10 @@ clip_graph::clip_graph(clip_ctx * ctx, const clip_image_f32 & img) :
gf = ggml_new_graph_custom(ctx0, ctx->max_nodes, false);
}
ggml_tensor * clip_graph::build_mm(ggml_tensor * w, ggml_tensor * x) const {
return ggml_mul_mat(ctx0, w, x);
}
void clip_graph::cb(ggml_tensor * cur, const char * name, int il) const {
if (il >= 0) {
ggml_format_name(cur, "%s-%d", name, il);
@@ -326,7 +330,7 @@ ggml_tensor * clip_graph::build_vit(
ggml_tensor * Vcur = nullptr;
if (layer.qkv_w != nullptr) {
// fused qkv
cur = ggml_mul_mat(ctx0, layer.qkv_w, cur);
cur = build_mm(layer.qkv_w, cur);
if (layer.qkv_b != nullptr) {
cur = ggml_add(ctx0, cur, layer.qkv_b);
}
@@ -360,17 +364,17 @@ ggml_tensor * clip_graph::build_vit(
} else {
// separate q, k, v
Qcur = ggml_mul_mat(ctx0, layer.q_w, cur);
Qcur = build_mm(layer.q_w, cur);
if (layer.q_b) {
Qcur = ggml_add(ctx0, Qcur, layer.q_b);
}
Kcur = ggml_mul_mat(ctx0, layer.k_w, cur);
Kcur = build_mm(layer.k_w, cur);
if (layer.k_b) {
Kcur = ggml_add(ctx0, Kcur, layer.k_b);
}
Vcur = ggml_mul_mat(ctx0, layer.v_w, cur);
Vcur = build_mm(layer.v_w, cur);
if (layer.v_b) {
Vcur = ggml_add(ctx0, Vcur, layer.v_b);
}
@@ -517,7 +521,7 @@ ggml_tensor * clip_graph::build_ffn(
ffn_op_type type_op,
int il) const {
ggml_tensor * tmp = up ? ggml_mul_mat(ctx0, up, cur) : cur;
ggml_tensor * tmp = up ? build_mm(up, cur) : cur;
cb(tmp, "ffn_up", il);
if (up_b) {
@@ -526,7 +530,7 @@ ggml_tensor * clip_graph::build_ffn(
}
if (gate) {
cur = ggml_mul_mat(ctx0, gate, cur);
cur = build_mm(gate, cur);
cb(cur, "ffn_gate", il);
if (gate_b) {
@@ -580,7 +584,7 @@ ggml_tensor * clip_graph::build_ffn(
}
if (down) {
cur = ggml_mul_mat(ctx0, down, cur);
cur = build_mm(down, cur);
}
if (down_b) {
@@ -646,7 +650,7 @@ ggml_tensor * clip_graph::build_attn(
cb(cur, "kqv_out", il);
if (wo) {
cur = ggml_mul_mat(ctx0, wo, cur);
cur = build_mm(wo, cur);
}
if (wo_b) {

View File

@@ -19,7 +19,7 @@ ggml_cgraph * clip_graph_cogvlm::build() {
auto & layer = model.layers[il];
ggml_tensor * cur = inpL;
cur = ggml_mul_mat(ctx0, layer.qkv_w, cur);
cur = build_mm(layer.qkv_w, cur);
cur = ggml_add(ctx0, cur, layer.qkv_b);
@@ -67,7 +67,7 @@ ggml_cgraph * clip_graph_cogvlm::build() {
ggml_row_size(inpL->type, n_embd), 0);
// Multiply with mm_model_proj
cur = ggml_mul_mat(ctx0, model.mm_model_proj, cur);
cur = build_mm(model.mm_model_proj, cur);
// Apply layernorm, weight, bias
cur = build_norm(cur, model.mm_post_fc_norm_w, model.mm_post_fc_norm_b, NORM_TYPE_NORMAL, 1e-5, -1);
@@ -76,16 +76,16 @@ ggml_cgraph * clip_graph_cogvlm::build() {
cur = ggml_gelu_inplace(ctx0, cur);
// Branch 1: multiply with mm_h_to_4h_w
ggml_tensor * h_to_4h = ggml_mul_mat(ctx0, model.mm_h_to_4h_w, cur);
ggml_tensor * h_to_4h = build_mm(model.mm_h_to_4h_w, cur);
// Branch 2: multiply with mm_gate_w
ggml_tensor * gate = ggml_mul_mat(ctx0, model.mm_gate_w, cur);
ggml_tensor * gate = build_mm(model.mm_gate_w, cur);
// Apply silu
gate = ggml_swiglu_split(ctx0, gate, h_to_4h);
// Apply mm_4h_to_h_w
cur = ggml_mul_mat(ctx0, model.mm_4h_to_h_w, gate);
cur = build_mm(model.mm_4h_to_h_w, gate);
// Concatenate with boi and eoi
cur = ggml_concat(ctx0, model.mm_boi, cur, 1);

View File

@@ -56,7 +56,7 @@ ggml_cgraph * clip_graph_conformer::build() {
cur = ggml_reshape_2d(ctx0, cur, cur->ne[0] * cur->ne[1], cur->ne[2]);
// calculate out
cur = ggml_mul_mat(ctx0, model.pre_encode_out_w, cur);
cur = build_mm(model.pre_encode_out_w, cur);
cur = ggml_add(ctx0, cur, model.pre_encode_out_b);
cb(cur, "conformer.pre_encode.out", -1);
}
@@ -87,7 +87,7 @@ ggml_cgraph * clip_graph_conformer::build() {
cur = build_norm(residual, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, 1e-5, il);
cb(cur, "conformer.layers.{}.norm_self_att", il);
ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.q_w, cur);
ggml_tensor * Qcur = build_mm(layer.q_w, cur);
Qcur = ggml_add(ctx0, Qcur, layer.q_b);
Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, Qcur->ne[1]);
ggml_tensor * Q_bias_u = ggml_add(ctx0, Qcur, layer.pos_bias_u);
@@ -96,12 +96,12 @@ ggml_cgraph * clip_graph_conformer::build() {
Q_bias_v = ggml_permute(ctx0, Q_bias_v, 0, 2, 1, 3);
// TODO @ngxson : some cont can/should be removed when ggml_mul_mat support these cases
ggml_tensor * Kcur = ggml_mul_mat(ctx0, layer.k_w, cur);
ggml_tensor * Kcur = build_mm(layer.k_w, cur);
Kcur = ggml_add(ctx0, Kcur, layer.k_b);
Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, Kcur->ne[1]);
Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.v_w, cur);
ggml_tensor * Vcur = build_mm(layer.v_w, cur);
Vcur = ggml_add(ctx0, Vcur, layer.v_b);
Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, Vcur->ne[1]);
Vcur = ggml_cont(ctx0, ggml_permute(ctx0, Vcur, 1, 2, 0, 3));
@@ -111,7 +111,7 @@ ggml_cgraph * clip_graph_conformer::build() {
matrix_ac = ggml_cont(ctx0, ggml_permute(ctx0, matrix_ac, 1, 0, 2, 3));
cb(matrix_ac, "conformer.layers.{}.self_attn.id3", il);
auto * p = ggml_mul_mat(ctx0, layer.linear_pos_w, pos_emb);
auto * p = build_mm(layer.linear_pos_w, pos_emb);
cb(p, "conformer.layers.{}.self_attn.linear_pos", il);
p = ggml_reshape_3d(ctx0, p, d_head, n_head, p->ne[1]);
p = ggml_permute(ctx0, p, 0, 2, 1, 3);
@@ -143,7 +143,7 @@ ggml_cgraph * clip_graph_conformer::build() {
x = ggml_permute(ctx0, x, 2, 0, 1, 3);
x = ggml_cont_2d(ctx0, x, x->ne[0] * x->ne[1], x->ne[2]);
ggml_tensor * out = ggml_mul_mat(ctx0, layer.o_w, x);
ggml_tensor * out = build_mm(layer.o_w, x);
out = ggml_add(ctx0, out, layer.o_b);
cb(out, "conformer.layers.{}.self_attn.linear_out", il);
@@ -157,7 +157,7 @@ ggml_cgraph * clip_graph_conformer::build() {
// conv
{
auto * x = cur;
x = ggml_mul_mat(ctx0, layer.conv_pw1_w, x);
x = build_mm(layer.conv_pw1_w, x);
x = ggml_add(ctx0, x, layer.conv_pw1_b);
cb(x, "conformer.layers.{}.conv.pointwise_conv1", il);
@@ -181,7 +181,7 @@ ggml_cgraph * clip_graph_conformer::build() {
x = ggml_silu(ctx0, x);
// pointwise_conv2
x = ggml_mul_mat(ctx0, layer.conv_pw2_w, x);
x = build_mm(layer.conv_pw2_w, x);
x = ggml_add(ctx0, x, layer.conv_pw2_b);
cur = x;

View File

@@ -97,7 +97,7 @@ ggml_cgraph * clip_graph_glm4v::build() {
// FC projector
{
cur = ggml_mul_mat(ctx0, model.projection, cur);
cur = build_mm(model.projection, cur);
// default LayerNorm (post_projection_norm)
cur = build_norm(cur, model.mm_post_norm_w, model.mm_post_norm_b, NORM_TYPE_NORMAL, 1e-5, -1);
cur = ggml_gelu_erf(ctx0, cur);

View File

@@ -22,7 +22,7 @@ ggml_cgraph * clip_graph_llama4::build() {
ggml_tensor * kernel = ggml_reshape_4d(ctx0, model.patch_embeddings_0,
patch_size, patch_size, 3, n_embd);
inp = ggml_im2col(ctx0, kernel, inp, patch_size, patch_size, 0, 0, 1, 1, true, inp->type);
inp = ggml_mul_mat(ctx0, model.patch_embeddings_0, inp);
inp = build_mm(model.patch_embeddings_0, inp);
inp = ggml_reshape_2d(ctx0, inp, n_embd, n_patches);
cb(inp, "patch_conv", -1);
}
@@ -78,15 +78,15 @@ ggml_cgraph * clip_graph_llama4::build() {
// based on Llama4VisionMLP2 (always uses GELU activation, no bias)
{
cur = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, cur);
cur = build_mm(model.mm_model_mlp_1_w, cur);
cur = ggml_gelu(ctx0, cur);
cur = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, cur);
cur = build_mm(model.mm_model_mlp_2_w, cur);
cur = ggml_gelu(ctx0, cur);
cb(cur, "adapter_mlp", -1);
}
// Llama4MultiModalProjector
cur = ggml_mul_mat(ctx0, model.mm_model_proj, cur);
cur = build_mm(model.mm_model_proj, cur);
cb(cur, "projected", -1);
// build the graph

View File

@@ -70,17 +70,17 @@ ggml_cgraph * clip_graph_llava::build() {
// self-attention
{
ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.q_w, cur);
ggml_tensor * Qcur = build_mm(layer.q_w, cur);
if (layer.q_b) {
Qcur = ggml_add(ctx0, Qcur, layer.q_b);
}
ggml_tensor * Kcur = ggml_mul_mat(ctx0, layer.k_w, cur);
ggml_tensor * Kcur = build_mm(layer.k_w, cur);
if (layer.k_b) {
Kcur = ggml_add(ctx0, Kcur, layer.k_b);
}
ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.v_w, cur);
ggml_tensor * Vcur = build_mm(layer.v_w, cur);
if (layer.v_b) {
Vcur = ggml_add(ctx0, Vcur, layer.v_b);
}
@@ -164,17 +164,17 @@ ggml_cgraph * clip_graph_llava::build() {
// llava projector
if (proj_type == PROJECTOR_TYPE_MLP) {
embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
embeddings = build_mm(model.mm_0_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
embeddings = ggml_gelu(ctx0, embeddings);
if (model.mm_2_w) {
embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
embeddings = build_mm(model.mm_2_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
}
}
else if (proj_type == PROJECTOR_TYPE_MLP_NORM) {
embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
embeddings = build_mm(model.mm_0_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
// ggml_tensor_printf(embeddings, "mm_0_w",0,true,false);
// First LayerNorm
@@ -186,7 +186,7 @@ ggml_cgraph * clip_graph_llava::build() {
embeddings = ggml_gelu(ctx0, embeddings);
// Second linear layer
embeddings = ggml_mul_mat(ctx0, model.mm_3_w, embeddings);
embeddings = build_mm(model.mm_3_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_3_b);
// Second LayerNorm
@@ -197,10 +197,10 @@ ggml_cgraph * clip_graph_llava::build() {
else if (proj_type == PROJECTOR_TYPE_LDP) {
// MobileVLM projector
int n_patch = 24;
ggml_tensor * mlp_1 = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, embeddings);
ggml_tensor * mlp_1 = build_mm(model.mm_model_mlp_1_w, embeddings);
mlp_1 = ggml_add(ctx0, mlp_1, model.mm_model_mlp_1_b);
mlp_1 = ggml_gelu(ctx0, mlp_1);
ggml_tensor * mlp_3 = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, mlp_1);
ggml_tensor * mlp_3 = build_mm(model.mm_model_mlp_3_w, mlp_1);
mlp_3 = ggml_add(ctx0, mlp_3, model.mm_model_mlp_3_b);
// mlp_3 shape = [1, 576, 2048], ne = [2048, 576, 1, 1]
@@ -229,10 +229,10 @@ ggml_cgraph * clip_graph_llava::build() {
// block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
// pointwise conv
block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]);
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc1_w, block_1);
block_1 = build_mm(model.mm_model_block_1_block_1_fc1_w, block_1);
block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc1_b);
block_1 = ggml_relu(ctx0, block_1);
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc2_w, block_1);
block_1 = build_mm(model.mm_model_block_1_block_1_fc2_w, block_1);
block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc2_b);
block_1 = ggml_hardsigmoid(ctx0, block_1);
// block_1_hw shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1], block_1 shape = [1, 2048], ne = [2048, 1, 1, 1]
@@ -244,7 +244,7 @@ ggml_cgraph * clip_graph_llava::build() {
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));
// block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1]
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_2_0_w, block_1);
block_1 = build_mm(model.mm_model_block_1_block_2_0_w, block_1);
block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]);
// block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
@@ -277,10 +277,10 @@ ggml_cgraph * clip_graph_llava::build() {
// block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
// pointwise conv
block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]);
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc1_w, block_1);
block_1 = build_mm(model.mm_model_block_2_block_1_fc1_w, block_1);
block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc1_b);
block_1 = ggml_relu(ctx0, block_1);
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc2_w, block_1);
block_1 = build_mm(model.mm_model_block_2_block_1_fc2_w, block_1);
block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc2_b);
block_1 = ggml_hardsigmoid(ctx0, block_1);
@@ -292,7 +292,7 @@ ggml_cgraph * clip_graph_llava::build() {
block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]);
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));
// block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1]
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_2_0_w, block_1);
block_1 = build_mm(model.mm_model_block_2_block_2_0_w, block_1);
block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]);
@@ -307,10 +307,10 @@ ggml_cgraph * clip_graph_llava::build() {
else if (proj_type == PROJECTOR_TYPE_LDPV2)
{
int n_patch = 24;
ggml_tensor * mlp_0 = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings);
ggml_tensor * mlp_0 = build_mm(model.mm_model_mlp_0_w, embeddings);
mlp_0 = ggml_add(ctx0, mlp_0, model.mm_model_mlp_0_b);
mlp_0 = ggml_gelu(ctx0, mlp_0);
ggml_tensor * mlp_2 = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, mlp_0);
ggml_tensor * mlp_2 = build_mm(model.mm_model_mlp_2_w, mlp_0);
mlp_2 = ggml_add(ctx0, mlp_2, model.mm_model_mlp_2_b);
// mlp_2 ne = [2048, 576, 1, 1]
// // AVG Pool Layer 2*2, strides = 2
@@ -344,15 +344,15 @@ ggml_cgraph * clip_graph_llava::build() {
embeddings = ggml_add(ctx0, embeddings, model.mm_model_adapter_conv_b);
// GLU
{
embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings);
embeddings = build_mm(model.mm_model_mlp_0_w, embeddings);
embeddings = ggml_norm(ctx0, embeddings, eps);
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_q_w), model.mm_model_ln_q_b);
embeddings = ggml_gelu_inplace(ctx0, embeddings);
ggml_tensor * x = embeddings;
embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, embeddings);
x = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w,x);
embeddings = build_mm(model.mm_model_mlp_2_w, embeddings);
x = build_mm(model.mm_model_mlp_1_w,x);
embeddings = ggml_swiglu_split(ctx0, embeddings, x);
embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, embeddings);
embeddings = build_mm(model.mm_model_mlp_3_w, embeddings);
}
// arrangement of BOI/EOI token embeddings
// note: these embeddings are not present in text model, hence we cannot process them as text tokens

View File

@@ -38,7 +38,7 @@ ggml_cgraph * clip_graph_minicpmv::build() {
// resampler projector (it is just another transformer)
ggml_tensor * q = model.mm_model_query;
ggml_tensor * v = ggml_mul_mat(ctx0, model.mm_model_kv_proj, embeddings);
ggml_tensor * v = build_mm(model.mm_model_kv_proj, embeddings);
// norm
q = build_norm(q, model.mm_model_ln_q_w, model.mm_model_ln_q_b, NORM_TYPE_NORMAL, eps, -1);
@@ -77,13 +77,13 @@ ggml_cgraph * clip_graph_minicpmv::build() {
// Use actual config value if available, otherwise fall back to hardcoded values
int num_query = hparams.minicpmv_query_num;
ggml_tensor * Q = ggml_add(ctx0,
ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q),
build_mm(model.mm_model_attn_q_w, q),
model.mm_model_attn_q_b);
ggml_tensor * K = ggml_add(ctx0,
ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k),
build_mm(model.mm_model_attn_k_w, k),
model.mm_model_attn_k_b);
ggml_tensor * V = ggml_add(ctx0,
ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v),
build_mm(model.mm_model_attn_v_w, v),
model.mm_model_attn_v_b);
Q = ggml_reshape_3d(ctx0, Q, d_head, n_head, num_query);
@@ -105,7 +105,7 @@ ggml_cgraph * clip_graph_minicpmv::build() {
embeddings = build_norm(embeddings, model.mm_model_ln_post_w, model.mm_model_ln_post_b, NORM_TYPE_NORMAL, eps, -1);
// projection
embeddings = ggml_mul_mat(ctx0, model.mm_model_proj, embeddings);
embeddings = build_mm(model.mm_model_proj, embeddings);
// build the graph
ggml_build_forward_expand(gf, embeddings);

View File

@@ -429,7 +429,7 @@ ggml_cgraph * clip_graph_mobilenetv5::build() {
// PyTorch: embedding_projection = nn.Linear(vision_hidden, text_hidden, bias=False)
// Weight stored as [out_features, in_features] = [text_hidden_size, vision_hidden_size]
if (model.mm_input_proj_w) {
cur = ggml_mul_mat(ctx0, model.mm_input_proj_w, cur);
cur = build_mm(model.mm_input_proj_w, cur);
}
// 5. POST PROJECTION NORM

View File

@@ -43,7 +43,7 @@ ggml_cgraph * clip_graph_pixtral::build() {
// project to n_embd
cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], cur->ne[1] * cur->ne[2]);
cur = ggml_mul_mat(ctx0, model.mm_patch_merger_w, cur);
cur = build_mm(model.mm_patch_merger_w, cur);
}
// LlavaMultiModalProjector (always using GELU activation)

View File

@@ -90,11 +90,11 @@ ggml_cgraph * clip_graph_qwen2vl::build() {
// self-attention
{
ggml_tensor * Qcur = ggml_add(ctx0,
ggml_mul_mat(ctx0, layer.q_w, cur), layer.q_b);
build_mm(layer.q_w, cur), layer.q_b);
ggml_tensor * Kcur = ggml_add(ctx0,
ggml_mul_mat(ctx0, layer.k_w, cur), layer.k_b);
build_mm(layer.k_w, cur), layer.k_b);
ggml_tensor * Vcur = ggml_add(ctx0,
ggml_mul_mat(ctx0, layer.v_w, cur), layer.v_b);
build_mm(layer.v_w, cur), layer.v_b);
Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_patches);
Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_patches);

View File

@@ -85,7 +85,7 @@ ggml_cgraph * clip_graph_qwen3vl::build() {
// self-attention
{
cur = ggml_mul_mat(ctx0, layer.qkv_w, cur);
cur = build_mm(layer.qkv_w, cur);
cur = ggml_add(ctx0, cur, layer.qkv_b);
ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos,

View File

@@ -43,7 +43,7 @@ ggml_cgraph * clip_graph_siglip::build() {
// https://github.com/huggingface/transformers/blob/0a950e0bbe1ed58d5401a6b547af19f15f0c195e/src/transformers/models/idefics3/modeling_idefics3.py#L578
const int scale_factor = model.hparams.n_merge;
cur = build_patch_merge_permute(cur, scale_factor);
cur = ggml_mul_mat(ctx0, model.projection, cur);
cur = build_mm(model.projection, cur);
} else if (proj_type == PROJECTOR_TYPE_LFM2) {
// pixel unshuffle block

View File

@@ -59,7 +59,7 @@ ggml_cgraph * clip_graph_whisper_enc::build() {
cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w);
// ffn in
cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
cur = build_mm(model.mm_1_w, cur);
// swiglu
// see SwiGLU in ultravox_model.py, the second half passed through is silu, not the first half
@@ -70,11 +70,11 @@ ggml_cgraph * clip_graph_whisper_enc::build() {
cur = ggml_mul(ctx0, cur, model.mm_norm_mid_w);
// ffn out
cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
cur = build_mm(model.mm_2_w, cur);
} else if (proj_type == PROJECTOR_TYPE_QWEN2A) {
// projector
cur = ggml_mul_mat(ctx0, model.mm_fc_w, cur);
cur = build_mm(model.mm_fc_w, cur);
cur = ggml_add(ctx0, cur, model.mm_fc_b);
} else if (proj_type == PROJECTOR_TYPE_VOXTRAL) {

View File

@@ -43,7 +43,7 @@ ggml_cgraph * clip_graph_youtuvl::build() {
ctx0, inp,
3*patch_size* patch_size, Hm * Wm * m * m, 1);
}
inp = ggml_mul_mat(ctx0, model.patch_embeddings_0, inp);
inp = build_mm(model.patch_embeddings_0, inp);
if (model.patch_bias) {
inp = ggml_add(ctx0, inp, model.patch_bias);
@@ -97,11 +97,11 @@ ggml_cgraph * clip_graph_youtuvl::build() {
// self-attention
{
ggml_tensor * Qcur = ggml_add(ctx0,
ggml_mul_mat(ctx0, layer.q_w, cur), layer.q_b);
build_mm(layer.q_w, cur), layer.q_b);
ggml_tensor * Kcur = ggml_add(ctx0,
ggml_mul_mat(ctx0, layer.k_w, cur), layer.k_b);
build_mm(layer.k_w, cur), layer.k_b);
ggml_tensor * Vcur = ggml_add(ctx0,
ggml_mul_mat(ctx0, layer.v_w, cur), layer.v_b);
build_mm(layer.v_w, cur), layer.v_b);
Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_patches);
Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_patches);

Binary file not shown.

View File

@@ -5,9 +5,12 @@
import Label from '$lib/components/ui/label/label.svelte';
import * as Select from '$lib/components/ui/select';
import { Textarea } from '$lib/components/ui/textarea';
import { SETTING_CONFIG_DEFAULT, SETTING_CONFIG_INFO, SETTINGS_KEYS } from '$lib/constants';
import { SETTING_CONFIG_INFO, SETTINGS_KEYS } from '$lib/constants';
import { SettingsFieldType } from '$lib/enums/settings';
import { settingsStore } from '$lib/stores/settings.svelte';
import { serverStore } from '$lib/stores/server.svelte';
import { modelsStore, selectedModelName } from '$lib/stores/models.svelte';
import { normalizeFloatingPoint } from '$lib/utils/precision';
import { ChatSettingsParameterSourceIndicator } from '$lib/components/app';
import type { Component } from 'svelte';
@@ -20,35 +23,36 @@
let { fields, localConfig, onConfigChange, onThemeChange }: Props = $props();
// Helper function to get parameter source info for syncable parameters
function getParameterSourceInfo(key: string) {
if (!settingsStore.canSyncParameter(key)) {
return null;
// server sampling defaults for placeholders
let sp = $derived.by(() => {
if (serverStore.isRouterMode) {
const m = selectedModelName();
if (m) {
const p = modelsStore.getModelProps(m);
return (p?.default_generation_settings?.params ?? {}) as Record<string, unknown>;
}
}
return settingsStore.getParameterInfo(key);
}
return (serverStore.defaultParams ?? {}) as Record<string, unknown>;
});
</script>
{#each fields as field (field.key)}
<div class="space-y-2">
{#if field.type === SettingsFieldType.INPUT}
{@const paramInfo = getParameterSourceInfo(field.key)}
{@const currentValue = String(localConfig[field.key] ?? '')}
{@const propsDefault = paramInfo?.serverDefault}
{@const serverDefault = sp[field.key]}
{@const isCustomRealTime = (() => {
if (!paramInfo || propsDefault === undefined) return false;
if (serverDefault == null) return false;
if (currentValue === '') return false;
// Apply same rounding logic for real-time comparison
const inputValue = currentValue;
const numericInput = parseFloat(inputValue);
const numericInput = parseFloat(currentValue);
const normalizedInput = !isNaN(numericInput)
? Math.round(numericInput * 1000000) / 1000000
: inputValue;
: currentValue;
const normalizedDefault =
typeof propsDefault === 'number'
? Math.round(propsDefault * 1000000) / 1000000
: propsDefault;
typeof serverDefault === 'number'
? Math.round(serverDefault * 1000000) / 1000000
: serverDefault;
return normalizedInput !== normalizedDefault;
})()}
@@ -74,7 +78,9 @@
// Update local config immediately for real-time badge feedback
onConfigChange(field.key, e.currentTarget.value);
}}
placeholder={`Default: ${SETTING_CONFIG_DEFAULT[field.key] ?? 'none'}`}
placeholder={sp[field.key] != null
? `Default: ${normalizeFloatingPoint(sp[field.key])}`
: ''}
class="w-full {isCustomRealTime ? 'pr-8' : ''}"
/>
{#if isCustomRealTime}
@@ -82,9 +88,7 @@
type="button"
onclick={() => {
settingsStore.resetParameterToServerDefault(field.key);
// Trigger UI update by calling onConfigChange with the default value
const defaultValue = propsDefault ?? SETTING_CONFIG_DEFAULT[field.key];
onConfigChange(field.key, String(defaultValue));
onConfigChange(field.key, '');
}}
class="absolute top-1/2 right-2 inline-flex h-5 w-5 -translate-y-1/2 items-center justify-center rounded transition-colors hover:bg-muted"
aria-label="Reset to default"
@@ -112,7 +116,7 @@
id={field.key}
value={String(localConfig[field.key] ?? '')}
onchange={(e) => onConfigChange(field.key, e.currentTarget.value)}
placeholder={`Default: ${SETTING_CONFIG_DEFAULT[field.key] ?? 'none'}`}
placeholder=""
class="min-h-[10rem] w-full md:max-w-2xl"
/>
@@ -140,14 +144,12 @@
(opt: { value: string; label: string; icon?: Component }) =>
opt.value === localConfig[field.key]
)}
{@const paramInfo = getParameterSourceInfo(field.key)}
{@const currentValue = localConfig[field.key]}
{@const propsDefault = paramInfo?.serverDefault}
{@const serverDefault = sp[field.key]}
{@const isCustomRealTime = (() => {
if (!paramInfo || propsDefault === undefined) return false;
// For select fields, do direct comparison (no rounding needed)
return currentValue !== propsDefault;
if (serverDefault == null) return false;
if (currentValue === '' || currentValue === undefined) return false;
return currentValue !== serverDefault;
})()}
<div class="flex items-center gap-2">
@@ -190,9 +192,7 @@
type="button"
onclick={() => {
settingsStore.resetParameterToServerDefault(field.key);
// Trigger UI update by calling onConfigChange with the default value
const defaultValue = propsDefault ?? SETTING_CONFIG_DEFAULT[field.key];
onConfigChange(field.key, String(defaultValue));
onConfigChange(field.key, '');
}}
class="absolute top-1/2 right-8 inline-flex h-5 w-5 -translate-y-1/2 items-center justify-center rounded transition-colors hover:bg-muted"
aria-label="Reset to default"

View File

@@ -28,6 +28,11 @@
let parsed = $derived(ModelsService.parseModelId(modelId));
let resolvedShowRaw = $derived(showRaw ?? (config().showRawModelNames as boolean) ?? false);
let displayName = $derived(
aliases && aliases.length > 0 ? aliases[0] : (parsed.modelName ?? modelId)
);
let remainingAliases = $derived(aliases && aliases.length > 1 ? aliases.slice(1) : []);
let allTags = $derived([...(parsed.tags ?? []), ...(tags ?? [])]);
</script>
{#if resolvedShowRaw}
@@ -35,7 +40,7 @@
{:else}
<span class="flex min-w-0 flex-wrap items-center gap-1 {className}">
<span class="min-w-0 truncate font-medium">
{#if showOrgName && parsed.orgName}{parsed.orgName}/{/if}{parsed.modelName ?? modelId}
{#if showOrgName && parsed.orgName && !(aliases && aliases.length > 0)}{parsed.orgName}/{/if}{displayName}
</span>
{#if parsed.params}
@@ -50,14 +55,14 @@
</span>
{/if}
{#if aliases && aliases.length > 0}
{#each aliases as alias (alias)}
{#if remainingAliases.length > 0}
{#each remainingAliases as alias (alias)}
<span class={badgeClass}>{alias}</span>
{/each}
{/if}
{#if tags && tags.length > 0}
{#each tags as tag (tag)}
{#if allTags.length > 0}
{#each allTags as tag (tag)}
<span class={tagBadgeClass}>{tag}</span>
{/each}
{/if}

View File

@@ -1,3 +1,4 @@
export const CONFIG_LOCALSTORAGE_KEY = 'LlamaCppWebui.config';
export const USER_OVERRIDES_LOCALSTORAGE_KEY = 'LlamaCppWebui.userOverrides';
export const FAVOURITE_MODELS_LOCALSTORAGE_KEY = 'LlamaCppWebui.favouriteModels';
export const MCP_DEFAULT_ENABLED_LOCALSTORAGE_KEY = 'LlamaCppWebui.mcpDefaultEnabled';

View File

@@ -11,10 +11,16 @@ export const MODEL_ID_SEGMENT_SEPARATOR = '-';
export const MODEL_ID_QUANTIZATION_SEPARATOR = ':';
/**
* Matches a trailing ALL-CAPS format segment, e.g. `GGUF`, `BF16`, `Q4_K_M`.
* Must be at least 2 uppercase letters, optionally followed by uppercase letters or digits.
* Matches a quantization/precision segment, e.g. `Q4_K_M`, `IQ4_XS`, `F16`, `BF16`, `MXFP4`.
* Case-insensitive to handle both uppercase and lowercase inputs.
*/
export const MODEL_FORMAT_SEGMENT_RE = /^[A-Z]{2,}[A-Z0-9]*$/;
export const MODEL_QUANTIZATION_SEGMENT_RE =
/^(I?Q\d+(_[A-Z0-9]+)*|F\d+|BF\d+|MXFP\d+(_[A-Z0-9]+)*)$/i;
/**
* Matches prefix for custom quantization types, e.g. `UD-Q8_K_XL`.
*/
export const MODEL_CUSTOM_QUANTIZATION_PREFIX_RE = /^UD$/i;
/**
* Matches a parameter-count segment, e.g. `7B`, `1.5b`, `120M`.
@@ -22,7 +28,12 @@ export const MODEL_FORMAT_SEGMENT_RE = /^[A-Z]{2,}[A-Z0-9]*$/;
export const MODEL_PARAMS_RE = /^\d+(\.\d+)?[BbMmKkTt]$/;
/**
* Matches an activated-parameter-count segment, e.g. `A10B`, `A2.4b`.
* The leading `A` distinguishes it from a regular params segment.
* Matches an activated-parameter-count segment, e.g. `A10B`, `a2.4b`.
* The leading `A`/`a` distinguishes it from a regular params segment.
*/
export const MODEL_ACTIVATED_PARAMS_RE = /^A\d+(\.\d+)?[BbMmKkTt]$/;
export const MODEL_ACTIVATED_PARAMS_RE = /^[Aa]\d+(\.\d+)?[BbMmKkTt]$/;
/**
* Container format segments to exclude from tags (every model uses these).
*/
export const MODEL_IGNORED_SEGMENTS = new Set(['GGUF', 'GGML']);

View File

@@ -1,8 +1,8 @@
import { ColorMode } from '$lib/enums/ui';
import { Monitor, Moon, Sun } from '@lucide/svelte';
export const SETTING_CONFIG_DEFAULT: Record<string, string | number | boolean> = {
// Note: in order not to introduce breaking changes, please keep the same data type (number, string, etc) if you want to change the default value. Do not use null or undefined for default value.
export const SETTING_CONFIG_DEFAULT: Record<string, string | number | boolean | undefined> = {
// Note: in order not to introduce breaking changes, please keep the same data type (number, string, etc) if you want to change the default value.
// Do not use nested objects, keep it single level. Prefix the key if you need to group them.
apiKey: '',
systemMessage: '',
@@ -30,27 +30,30 @@ export const SETTING_CONFIG_DEFAULT: Record<string, string | number | boolean> =
agenticMaxToolPreviewLines: 25,
showToolCallInProgress: false,
alwaysShowAgenticTurns: false,
// make sure these default values are in sync with `common.h`
samplers: 'top_k;typ_p;top_p;min_p;temperature',
// sampling params: empty means "use server default"
// the server / preset is the source of truth
// empty values are shown as placeholders from /props in the UI
// and are NOT sent in API requests, letting the server decide
samplers: '',
backend_sampling: false,
temperature: 0.8,
dynatemp_range: 0.0,
dynatemp_exponent: 1.0,
top_k: 40,
top_p: 0.95,
min_p: 0.05,
xtc_probability: 0.0,
xtc_threshold: 0.1,
typ_p: 1.0,
repeat_last_n: 64,
repeat_penalty: 1.0,
presence_penalty: 0.0,
frequency_penalty: 0.0,
dry_multiplier: 0.0,
dry_base: 1.75,
dry_allowed_length: 2,
dry_penalty_last_n: -1,
max_tokens: -1,
temperature: undefined,
dynatemp_range: undefined,
dynatemp_exponent: undefined,
top_k: undefined,
top_p: undefined,
min_p: undefined,
xtc_probability: undefined,
xtc_threshold: undefined,
typ_p: undefined,
repeat_last_n: undefined,
repeat_penalty: undefined,
presence_penalty: undefined,
frequency_penalty: undefined,
dry_multiplier: undefined,
dry_base: undefined,
dry_allowed_length: undefined,
dry_penalty_last_n: undefined,
max_tokens: undefined,
custom: '', // custom json-stringified object
// experimental features
pyInterpreterEnabled: false,

View File

@@ -2,9 +2,11 @@ import { ServerModelStatus } from '$lib/enums';
import { apiFetch, apiPost } from '$lib/utils';
import type { ParsedModelId } from '$lib/types/models';
import {
MODEL_FORMAT_SEGMENT_RE,
MODEL_QUANTIZATION_SEGMENT_RE,
MODEL_CUSTOM_QUANTIZATION_PREFIX_RE,
MODEL_PARAMS_RE,
MODEL_ACTIVATED_PARAMS_RE,
MODEL_IGNORED_SEGMENTS,
MODEL_ID_NOT_FOUND,
MODEL_ID_ORG_SEPARATOR,
MODEL_ID_SEGMENT_SEPARATOR,
@@ -119,8 +121,9 @@ export class ModelsService {
/**
* Parse a model ID string into its structured components.
*
* Handles the convention:
* `<org>/<ModelName>-<Parameters>(-<ActivatedParameters>)-<Format>:<QuantizationType>`
* Handles conventions like:
* `<org>/<ModelName>-<Parameters>(-<ActivatedParameters>)(-<Tags>)(-<Quantization>):<Quantization>`
* `<ModelName>.<Quantization>` (dot-separated quantization, e.g. `model.Q4_K_M`)
*
* @param modelId - Raw model identifier string
* @returns Structured {@link ParsedModelId} with all detected fields
@@ -132,11 +135,11 @@ export class ModelsService {
modelName: null,
params: null,
activatedParams: null,
format: null,
quantization: null,
tags: []
};
// 1. Extract colon-separated quantization (e.g. `model:Q4_K_M`)
const colonIdx = modelId.indexOf(MODEL_ID_QUANTIZATION_SEPARATOR);
let modelPath: string;
@@ -147,6 +150,7 @@ export class ModelsService {
modelPath = modelId;
}
// 2. Extract org name (e.g. `org/model` -> org = "org")
const slashIdx = modelPath.indexOf(MODEL_ID_ORG_SEPARATOR);
let modelStr: string;
@@ -157,37 +161,66 @@ export class ModelsService {
modelStr = modelPath;
}
const segments = modelStr.split(MODEL_ID_SEGMENT_SEPARATOR);
// 3. Handle dot-separated quantization (e.g. `model-name.Q4_K_M`)
const dotIdx = modelStr.lastIndexOf('.');
if (segments.length > 0 && MODEL_FORMAT_SEGMENT_RE.test(segments[segments.length - 1])) {
result.format = segments.pop()!;
if (dotIdx !== MODEL_ID_NOT_FOUND && !result.quantization) {
const afterDot = modelStr.slice(dotIdx + 1);
if (MODEL_QUANTIZATION_SEGMENT_RE.test(afterDot)) {
result.quantization = afterDot;
modelStr = modelStr.slice(0, dotIdx);
}
}
const paramsRe = MODEL_PARAMS_RE;
const activatedParamsRe = MODEL_ACTIVATED_PARAMS_RE;
const segments = modelStr.split(MODEL_ID_SEGMENT_SEPARATOR);
// 4. Detect trailing quantization from dash-separated segments
// Handle UD-prefixed quantization (e.g. `UD-Q8_K_XL`) and
// standalone quantization (e.g. `Q4_K_M`, `BF16`, `F16`, `MXFP4`)
if (!result.quantization && segments.length > 1) {
const last = segments[segments.length - 1];
const secondLast = segments.length > 2 ? segments[segments.length - 2] : null;
if (MODEL_QUANTIZATION_SEGMENT_RE.test(last)) {
if (secondLast && MODEL_CUSTOM_QUANTIZATION_PREFIX_RE.test(secondLast)) {
result.quantization = `${secondLast}-${last}`;
segments.splice(segments.length - 2, 2);
} else {
result.quantization = last;
segments.pop();
}
}
}
// 5. Find params and activated params
let paramsIdx = MODEL_ID_NOT_FOUND;
let activatedParamsIdx = MODEL_ID_NOT_FOUND;
for (let i = 0; i < segments.length; i++) {
const seg = segments[i];
if (paramsIdx === -1 && paramsRe.test(seg)) {
if (paramsIdx === MODEL_ID_NOT_FOUND && MODEL_PARAMS_RE.test(seg)) {
paramsIdx = i;
result.params = seg.toUpperCase();
} else if (activatedParamsRe.test(seg)) {
} else if (paramsIdx !== MODEL_ID_NOT_FOUND && MODEL_ACTIVATED_PARAMS_RE.test(seg)) {
activatedParamsIdx = i;
result.activatedParams = seg.toUpperCase();
}
}
// 6. Model name = segments before params; tags = remaining segments after params
const pivotIdx = paramsIdx !== MODEL_ID_NOT_FOUND ? paramsIdx : segments.length;
result.modelName = segments.slice(0, pivotIdx).join(MODEL_ID_SEGMENT_SEPARATOR) || null;
if (paramsIdx !== MODEL_ID_NOT_FOUND) {
result.tags = segments
.slice(paramsIdx + 1)
.filter((_, relIdx) => paramsIdx + 1 + relIdx !== activatedParamsIdx);
result.tags = segments.slice(paramsIdx + 1).filter((_, relIdx) => {
const absIdx = paramsIdx + 1 + relIdx;
if (absIdx === activatedParamsIdx) return false;
return !MODEL_IGNORED_SEGMENTS.has(segments[absIdx].toUpperCase());
});
}
return result;

View File

@@ -36,7 +36,8 @@ import {
ISO_TIME_SEPARATOR,
ISO_TIME_SEPARATOR_REPLACEMENT,
NON_ALPHANUMERIC_REGEX,
MULTIPLE_UNDERSCORE_REGEX
MULTIPLE_UNDERSCORE_REGEX,
MCP_DEFAULT_ENABLED_LOCALSTORAGE_KEY
} from '$lib/constants';
class ConversationsStore {
@@ -61,7 +62,37 @@ class ConversationsStore {
isInitialized = $state(false);
/** Pending MCP server overrides for new conversations (before first message) */
pendingMcpServerOverrides = $state<McpServerOverride[]>([]);
pendingMcpServerOverrides = $state<McpServerOverride[]>(ConversationsStore.loadMcpDefaults());
/** Load MCP default overrides from localStorage */
private static loadMcpDefaults(): McpServerOverride[] {
if (typeof globalThis.localStorage === 'undefined') return [];
try {
const raw = localStorage.getItem(MCP_DEFAULT_ENABLED_LOCALSTORAGE_KEY);
if (!raw) return [];
const parsed = JSON.parse(raw);
if (!Array.isArray(parsed)) return [];
return parsed.filter(
(o: unknown) => typeof o === 'object' && o !== null && 'serverId' in o && 'enabled' in o
) as McpServerOverride[];
} catch {
return [];
}
}
/** Persist MCP default overrides to localStorage */
private saveMcpDefaults(): void {
if (typeof globalThis.localStorage === 'undefined') return;
const plain = this.pendingMcpServerOverrides.map((o) => ({
serverId: o.serverId,
enabled: o.enabled
}));
if (plain.length > 0) {
localStorage.setItem(MCP_DEFAULT_ENABLED_LOCALSTORAGE_KEY, JSON.stringify(plain));
} else {
localStorage.removeItem(MCP_DEFAULT_ENABLED_LOCALSTORAGE_KEY);
}
}
/** Callback for title update confirmation dialog */
titleUpdateConfirmationCallback?: (currentTitle: string, newTitle: string) => Promise<boolean>;
@@ -261,6 +292,8 @@ class ConversationsStore {
clearActiveConversation(): void {
this.activeConversation = null;
this.activeMessages = [];
// reload MCP defaults so new chats inherit persisted state
this.pendingMcpServerOverrides = ConversationsStore.loadMcpDefaults();
}
/**
@@ -597,6 +630,7 @@ class ConversationsStore {
this.pendingMcpServerOverrides = [...this.pendingMcpServerOverrides, { serverId, enabled }];
}
}
this.saveMcpDefaults();
}
/**
@@ -621,6 +655,7 @@ class ConversationsStore {
*/
clearPendingMcpServerOverrides(): void {
this.pendingMcpServerOverrides = [];
this.saveMcpDefaults();
}
/**

View File

@@ -208,23 +208,16 @@ class MCPStore {
}
/**
* Checks if a server is enabled, considering per-chat overrides.
* Checks if a server is enabled for a given chat.
* Only per-chat overrides (persisted in localStorage for new chats,
* or in IndexedDB for existing conversations) control enabled state.
*/
#checkServerEnabled(
server: MCPServerSettingsEntry,
perChatOverrides?: McpServerOverride[]
): boolean {
if (!server.enabled) {
return false;
}
if (perChatOverrides) {
const override = perChatOverrides.find((o) => o.serverId === server.id);
return override?.enabled ?? false;
}
return false;
const override = perChatOverrides?.find((o) => o.serverId === server.id);
return override?.enabled ?? false;
}
/**
@@ -570,18 +563,8 @@ class MCPStore {
getEnabledServersForConversation(
perChatOverrides?: McpServerOverride[]
): MCPServerSettingsEntry[] {
if (!perChatOverrides?.length) {
return [];
}
return this.getServers().filter((server) => {
if (!server.enabled) {
return false;
}
const override = perChatOverrides.find((o) => o.serverId === server.id);
return override?.enabled ?? false;
return this.#checkServerEnabled(server, perChatOverrides);
});
}

View File

@@ -289,16 +289,10 @@ class SettingsStore {
const serverDefaults = this.getServerDefaults();
if (serverDefaults[key] !== undefined) {
const value = normalizeFloatingPoint(serverDefaults[key]);
this.config[key as keyof SettingsConfigType] =
value as SettingsConfigType[keyof SettingsConfigType];
} else {
if (key in SETTING_CONFIG_DEFAULT) {
const defaultValue = getConfigValue(SETTING_CONFIG_DEFAULT, key);
setConfigValue(this.config, key, defaultValue);
}
// sampling param known by server: clear it, let server decide
setConfigValue(this.config, key, '');
} else if (key in SETTING_CONFIG_DEFAULT) {
setConfigValue(this.config, key, getConfigValue(SETTING_CONFIG_DEFAULT, key));
}
this.userOverrides.delete(key);
@@ -319,12 +313,7 @@ class SettingsStore {
*/
syncWithServerDefaults(): void {
const propsDefaults = this.getServerDefaults();
if (Object.keys(propsDefaults).length === 0) {
console.warn('No server defaults available for initialization');
return;
}
if (Object.keys(propsDefaults).length === 0) return;
for (const [key, propsValue] of Object.entries(propsDefaults)) {
const currentValue = getConfigValue(this.config, key);
@@ -332,17 +321,14 @@ class SettingsStore {
const normalizedCurrent = normalizeFloatingPoint(currentValue);
const normalizedDefault = normalizeFloatingPoint(propsValue);
// if user value matches server, it's not a real override
if (normalizedCurrent === normalizedDefault) {
this.userOverrides.delete(key);
setConfigValue(this.config, key, propsValue);
} else if (!this.userOverrides.has(key)) {
setConfigValue(this.config, key, propsValue);
}
}
this.saveConfig();
console.log('Settings initialized with props defaults:', propsDefaults);
console.log('Current user overrides after sync:', Array.from(this.userOverrides));
console.log('User overrides after sync:', Array.from(this.userOverrides));
}
/**
@@ -352,19 +338,11 @@ class SettingsStore {
*/
forceSyncWithServerDefaults(): void {
const propsDefaults = this.getServerDefaults();
const syncableKeys = ParameterSyncService.getSyncableParameterKeys();
for (const key of syncableKeys) {
for (const key of ParameterSyncService.getSyncableParameterKeys()) {
if (propsDefaults[key] !== undefined) {
const normalizedValue = normalizeFloatingPoint(propsDefaults[key]);
setConfigValue(this.config, key, normalizedValue);
} else {
if (key in SETTING_CONFIG_DEFAULT) {
const defaultValue = getConfigValue(SETTING_CONFIG_DEFAULT, key);
setConfigValue(this.config, key, defaultValue);
}
setConfigValue(this.config, key, '');
} else if (key in SETTING_CONFIG_DEFAULT) {
setConfigValue(this.config, key, getConfigValue(SETTING_CONFIG_DEFAULT, key));
}
this.userOverrides.delete(key);

View File

@@ -25,7 +25,6 @@ export interface ParsedModelId {
modelName: string | null;
params: string | null;
activatedParams: string | null;
format: string | null;
quantization: string | null;
tags: string[];
}

View File

@@ -5,7 +5,7 @@ import type { DatabaseMessageExtra } from './database';
import type { ParameterSource, SyncableParameterType, SettingsFieldType } from '$lib/enums';
import type { Icon } from '@lucide/svelte';
export type SettingsConfigValue = string | number | boolean;
export type SettingsConfigValue = string | number | boolean | undefined;
export interface SettingsFieldConfig {
key: string;

View File

@@ -0,0 +1,270 @@
import { describe, expect, it } from 'vitest';
import { ModelsService } from '$lib/services/models.service';
const { parseModelId } = ModelsService;
describe('parseModelId', () => {
it('handles unknown patterns correctly', () => {
expect(parseModelId('model-name-1')).toStrictEqual({
activatedParams: null,
modelName: 'model-name-1',
orgName: null,
params: null,
quantization: null,
raw: 'model-name-1',
tags: []
});
expect(parseModelId('org/model-name-2')).toStrictEqual({
activatedParams: null,
modelName: 'model-name-2',
orgName: 'org',
params: null,
quantization: null,
raw: 'org/model-name-2',
tags: []
});
});
it('extracts model parameters correctly', () => {
expect(parseModelId('model-100B-BF16')).toMatchObject({ params: '100B' });
expect(parseModelId('model-100B:Q4_K_M')).toMatchObject({ params: '100B' });
});
it('extracts model parameters correctly in lowercase', () => {
expect(parseModelId('model-100b-bf16')).toMatchObject({ params: '100B' });
expect(parseModelId('model-100b:q4_k_m')).toMatchObject({ params: '100B' });
});
it('extracts activated parameters correctly', () => {
expect(parseModelId('model-100B-A10B-BF16')).toMatchObject({ activatedParams: 'A10B' });
expect(parseModelId('model-100B-A10B:Q4_K_M')).toMatchObject({ activatedParams: 'A10B' });
});
it('extracts activated parameters correctly in lowercase', () => {
expect(parseModelId('model-100b-a10b-bf16')).toMatchObject({ activatedParams: 'A10B' });
expect(parseModelId('model-100b-a10b:q4_k_m')).toMatchObject({ activatedParams: 'A10B' });
});
it('extracts quantization correctly', () => {
// Dash-separated quantization
expect(parseModelId('model-100B-UD-IQ1_S')).toMatchObject({ quantization: 'UD-IQ1_S' });
expect(parseModelId('model-100B-IQ4_XS')).toMatchObject({ quantization: 'IQ4_XS' });
expect(parseModelId('model-100B-Q4_K_M')).toMatchObject({ quantization: 'Q4_K_M' });
expect(parseModelId('model-100B-Q8_0')).toMatchObject({ quantization: 'Q8_0' });
expect(parseModelId('model-100B-UD-Q8_K_XL')).toMatchObject({ quantization: 'UD-Q8_K_XL' });
expect(parseModelId('model-100B-F16')).toMatchObject({ quantization: 'F16' });
expect(parseModelId('model-100B-BF16')).toMatchObject({ quantization: 'BF16' });
expect(parseModelId('model-100B-MXFP4')).toMatchObject({ quantization: 'MXFP4' });
// Colon-separated quantization
expect(parseModelId('model-100B:UD-IQ1_S')).toMatchObject({ quantization: 'UD-IQ1_S' });
expect(parseModelId('model-100B:IQ4_XS')).toMatchObject({ quantization: 'IQ4_XS' });
expect(parseModelId('model-100B:Q4_K_M')).toMatchObject({ quantization: 'Q4_K_M' });
expect(parseModelId('model-100B:Q8_0')).toMatchObject({ quantization: 'Q8_0' });
expect(parseModelId('model-100B:UD-Q8_K_XL')).toMatchObject({ quantization: 'UD-Q8_K_XL' });
expect(parseModelId('model-100B:F16')).toMatchObject({ quantization: 'F16' });
expect(parseModelId('model-100B:BF16')).toMatchObject({ quantization: 'BF16' });
expect(parseModelId('model-100B:MXFP4')).toMatchObject({ quantization: 'MXFP4' });
// Dot-separated quantization
expect(parseModelId('nomic-embed-text-v2-moe.Q4_K_M')).toMatchObject({
quantization: 'Q4_K_M'
});
});
it('extracts additional tags correctly', () => {
expect(parseModelId('model-100B-foobar-Q4_K_M')).toMatchObject({ tags: ['foobar'] });
expect(parseModelId('model-100B-A10B-foobar-1M-BF16')).toMatchObject({
tags: ['foobar', '1M']
});
expect(parseModelId('model-100B-1M-foobar:UD-Q8_K_XL')).toMatchObject({
tags: ['1M', 'foobar']
});
});
it('filters out container format segments from tags', () => {
expect(parseModelId('model-100B-GGUF-Instruct-BF16')).toMatchObject({
tags: ['Instruct']
});
expect(parseModelId('model-100B-GGML-Instruct:Q4_K_M')).toMatchObject({
tags: ['Instruct']
});
});
it('handles real-world examples correctly', () => {
expect(parseModelId('meta-llama/Llama-3.1-8B')).toStrictEqual({
activatedParams: null,
modelName: 'Llama-3.1',
orgName: 'meta-llama',
params: '8B',
quantization: null,
raw: 'meta-llama/Llama-3.1-8B',
tags: []
});
expect(parseModelId('openai/gpt-oss-120b-MXFP4')).toStrictEqual({
activatedParams: null,
modelName: 'gpt-oss',
orgName: 'openai',
params: '120B',
quantization: 'MXFP4',
raw: 'openai/gpt-oss-120b-MXFP4',
tags: []
});
expect(parseModelId('openai/gpt-oss-20b:Q4_K_M')).toStrictEqual({
activatedParams: null,
modelName: 'gpt-oss',
orgName: 'openai',
params: '20B',
quantization: 'Q4_K_M',
raw: 'openai/gpt-oss-20b:Q4_K_M',
tags: []
});
expect(parseModelId('Qwen/Qwen3-Coder-30B-A3B-Instruct-1M-BF16')).toStrictEqual({
activatedParams: 'A3B',
modelName: 'Qwen3-Coder',
orgName: 'Qwen',
params: '30B',
quantization: 'BF16',
raw: 'Qwen/Qwen3-Coder-30B-A3B-Instruct-1M-BF16',
tags: ['Instruct', '1M']
});
});
it('handles real-world examples with quantization in segments', () => {
expect(parseModelId('meta-llama/Llama-4-Scout-17B-16E-Instruct-Q4_K_M')).toStrictEqual({
activatedParams: null,
modelName: 'Llama-4-Scout',
orgName: 'meta-llama',
params: '17B',
quantization: 'Q4_K_M',
raw: 'meta-llama/Llama-4-Scout-17B-16E-Instruct-Q4_K_M',
tags: ['16E', 'Instruct']
});
expect(parseModelId('MiniMaxAI/MiniMax-M2-IQ4_XS')).toStrictEqual({
activatedParams: null,
modelName: 'MiniMax-M2',
orgName: 'MiniMaxAI',
params: null,
quantization: 'IQ4_XS',
raw: 'MiniMaxAI/MiniMax-M2-IQ4_XS',
tags: []
});
expect(parseModelId('MiniMaxAI/MiniMax-M2-UD-Q3_K_XL')).toStrictEqual({
activatedParams: null,
modelName: 'MiniMax-M2',
orgName: 'MiniMaxAI',
params: null,
quantization: 'UD-Q3_K_XL',
raw: 'MiniMaxAI/MiniMax-M2-UD-Q3_K_XL',
tags: []
});
expect(parseModelId('mistralai/Devstral-2-123B-Instruct-2512-Q4_K_M')).toStrictEqual({
activatedParams: null,
modelName: 'Devstral-2',
orgName: 'mistralai',
params: '123B',
quantization: 'Q4_K_M',
raw: 'mistralai/Devstral-2-123B-Instruct-2512-Q4_K_M',
tags: ['Instruct', '2512']
});
expect(parseModelId('mistralai/Devstral-Small-2-24B-Instruct-2512-Q8_0')).toStrictEqual({
activatedParams: null,
modelName: 'Devstral-Small-2',
orgName: 'mistralai',
params: '24B',
quantization: 'Q8_0',
raw: 'mistralai/Devstral-Small-2-24B-Instruct-2512-Q8_0',
tags: ['Instruct', '2512']
});
expect(parseModelId('noctrex/GLM-4.7-Flash-MXFP4_MOE')).toStrictEqual({
activatedParams: null,
modelName: 'GLM-4.7-Flash',
orgName: 'noctrex',
params: null,
quantization: 'MXFP4_MOE',
raw: 'noctrex/GLM-4.7-Flash-MXFP4_MOE',
tags: []
});
expect(parseModelId('Qwen/Qwen3-Coder-Next-Q4_K_M')).toStrictEqual({
activatedParams: null,
modelName: 'Qwen3-Coder-Next',
orgName: 'Qwen',
params: null,
quantization: 'Q4_K_M',
raw: 'Qwen/Qwen3-Coder-Next-Q4_K_M',
tags: []
});
expect(parseModelId('openai/gpt-oss-120b-Q4_K_M')).toStrictEqual({
activatedParams: null,
modelName: 'gpt-oss',
orgName: 'openai',
params: '120B',
quantization: 'Q4_K_M',
raw: 'openai/gpt-oss-120b-Q4_K_M',
tags: []
});
expect(parseModelId('openai/gpt-oss-20b-F16')).toStrictEqual({
activatedParams: null,
modelName: 'gpt-oss',
orgName: 'openai',
params: '20B',
quantization: 'F16',
raw: 'openai/gpt-oss-20b-F16',
tags: []
});
expect(parseModelId('nomic-embed-text-v2-moe.Q4_K_M')).toStrictEqual({
activatedParams: null,
modelName: 'nomic-embed-text-v2-moe',
orgName: null,
params: null,
quantization: 'Q4_K_M',
raw: 'nomic-embed-text-v2-moe.Q4_K_M',
tags: []
});
});
it('handles ambiguous model names', () => {
// Qwen3.5 Instruct vs Thinking — tags should distinguish them
expect(parseModelId('Qwen/Qwen3.5-30B-A3B-Instruct')).toMatchObject({
modelName: 'Qwen3.5',
params: '30B',
activatedParams: 'A3B',
tags: ['Instruct']
});
expect(parseModelId('Qwen/Qwen3.5-30B-A3B-Thinking')).toMatchObject({
modelName: 'Qwen3.5',
params: '30B',
activatedParams: 'A3B',
tags: ['Thinking']
});
// Dot-separated quantization with variant suffixes
expect(parseModelId('gemma-3-27b-it-heretic-v2.Q8_0')).toMatchObject({
modelName: 'gemma-3',
params: '27B',
quantization: 'Q8_0',
tags: ['it', 'heretic', 'v2']
});
expect(parseModelId('gemma-3-27b-it.Q8_0')).toMatchObject({
modelName: 'gemma-3',
params: '27B',
quantization: 'Q8_0',
tags: ['it']
});
});
});