mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2026-04-30 16:47:31 +03:00
Compare commits
22 Commits
gg/gguf-fi
...
b8429
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
1e64534570 | ||
|
|
cd708db0cc | ||
|
|
512bba6ee0 | ||
|
|
b486c17b3e | ||
|
|
1b9bbaa357 | ||
|
|
07feeaa92e | ||
|
|
3fee84e156 | ||
|
|
811397745e | ||
|
|
c014c3f83a | ||
|
|
7f2cbd9a4d | ||
|
|
509a31d00f | ||
|
|
ea01d196d7 | ||
|
|
07ba6d275b | ||
|
|
6729d4920c | ||
|
|
d13d60af1d | ||
|
|
5744d7ec43 | ||
|
|
8ced5f41f9 | ||
|
|
78d550b541 | ||
|
|
4efd326e71 | ||
|
|
b08f7322ee | ||
|
|
79187f2fb8 | ||
|
|
48e61238e1 |
@@ -936,7 +936,9 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp
|
||||
for (auto msg : inputs.messages) {
|
||||
if (msg.contains("reasoning_content") && msg.at("reasoning_content").is_string()) {
|
||||
msg["thinking"] = msg.at("reasoning_content");
|
||||
msg.erase("content");
|
||||
if (msg.contains("tool_calls") && msg.at("tool_calls").is_array() && !msg.at("tool_calls").empty()) {
|
||||
msg.erase("content");
|
||||
}
|
||||
}
|
||||
adjusted_messages.push_back(msg);
|
||||
}
|
||||
|
||||
@@ -1062,6 +1062,10 @@ class TextModel(ModelBase):
|
||||
self.gguf_writer.add_head_count_kv(n_head_kv)
|
||||
logger.info(f"gguf: key-value head count = {n_head_kv}")
|
||||
|
||||
if self.hparams.get("is_causal") is False:
|
||||
self.gguf_writer.add_causal_attention(False)
|
||||
logger.info("gguf: causal attention = False")
|
||||
|
||||
# TODO: Handle "sliding_attention" similarly when models start implementing it
|
||||
rope_params = self.rope_parameters.get("full_attention", self.rope_parameters)
|
||||
if (rope_type := rope_params.get("rope_type")) is not None:
|
||||
|
||||
@@ -28,6 +28,9 @@ Additionally, there the following images, similar to the above:
|
||||
- `ghcr.io/ggml-org/llama.cpp:full-vulkan`: Same as `full` but compiled with Vulkan support. (platforms: `linux/amd64`)
|
||||
- `ghcr.io/ggml-org/llama.cpp:light-vulkan`: Same as `light` but compiled with Vulkan support. (platforms: `linux/amd64`)
|
||||
- `ghcr.io/ggml-org/llama.cpp:server-vulkan`: Same as `server` but compiled with Vulkan support. (platforms: `linux/amd64`)
|
||||
- `ghcr.io/ggml-org/llama.cpp:full-openvino`: Same as `full` but compiled with OpenVino support. (platforms: `linux/amd64`)
|
||||
- `ghcr.io/ggml-org/llama.cpp:light-openvino`: Same as `light` but compiled with OpenVino support. (platforms: `linux/amd64`)
|
||||
- `ghcr.io/ggml-org/llama.cpp:server-openvino`: Same as `server` but compiled with OpenVino support. (platforms: `linux/amd64`)
|
||||
|
||||
The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](../.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](../.github/workflows/docker.yml). If you need different settings (for example, a different CUDA, ROCm or MUSA library, you'll need to build the images locally for now).
|
||||
|
||||
|
||||
@@ -37,7 +37,7 @@ Legend:
|
||||
| CROSS_ENTROPY_LOSS | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
||||
| CROSS_ENTROPY_LOSS_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
|
||||
| CUMSUM | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ |
|
||||
| DIAG | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
|
||||
| DIAG | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ |
|
||||
| DIAG_MASK_INF | ❌ | ✅ | ✅ | ✅ | ❌ | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| DIV | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| DUP | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
@@ -62,7 +62,7 @@ Legend:
|
||||
| HARDSWISH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||
| IM2COL | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| IM2COL_3D | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
|
||||
| L2_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| L2_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| LEAKY_RELU | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ |
|
||||
| LOG | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | ✅ | ✅ | ❌ | ❌ |
|
||||
| MEAN | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
@@ -115,7 +115,7 @@ Legend:
|
||||
| TANH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
|
||||
| TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| TOP_K | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
|
||||
| TRI | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| TRI | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ |
|
||||
| TRUNC | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
|
||||
| UPSCALE | ❌ | 🟡 | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ |
|
||||
| XIELU | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ |
|
||||
|
||||
@@ -5744,49 +5744,61 @@
|
||||
"WebGPU: WebGPU","NORM","type=f32,ne=[64,5,4,3],v=1,eps=0.000000","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","RMS_NORM","type=f32,ne=[64,5,4,3],v=1,eps=0.000000,inplace=0","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","RMS_NORM_BACK","type=f32,ne=[64,5,4,3],eps=0.000000","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[64,5,4,3]","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[64,5,4,3],eps=0.000000,v=0","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[64,5,4,3],eps=0.000000,v=1","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","NORM","type=f32,ne=[1025,5,4,3],v=0,eps=0.000000","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","RMS_NORM","type=f32,ne=[1025,5,4,3],v=0,eps=0.000000,inplace=0","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","NORM","type=f32,ne=[1025,5,4,3],v=1,eps=0.000000","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","RMS_NORM","type=f32,ne=[1025,5,4,3],v=1,eps=0.000000,inplace=0","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","RMS_NORM_BACK","type=f32,ne=[1025,5,4,3],eps=0.000000","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[1025,5,4,3]","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[1025,5,4,3],eps=0.000000,v=0","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[1025,5,4,3],eps=0.000000,v=1","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","NORM","type=f32,ne=[64,5,4,3],v=0,eps=0.000001","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","RMS_NORM","type=f32,ne=[64,5,4,3],v=0,eps=0.000001,inplace=0","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","NORM","type=f32,ne=[64,5,4,3],v=1,eps=0.000001","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","RMS_NORM","type=f32,ne=[64,5,4,3],v=1,eps=0.000001,inplace=0","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","RMS_NORM_BACK","type=f32,ne=[64,5,4,3],eps=0.000001","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[64,5,4,3]","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[64,5,4,3],eps=0.000001,v=0","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[64,5,4,3],eps=0.000001,v=1","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","NORM","type=f32,ne=[1025,5,4,3],v=0,eps=0.000001","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","RMS_NORM","type=f32,ne=[1025,5,4,3],v=0,eps=0.000001,inplace=0","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","NORM","type=f32,ne=[1025,5,4,3],v=1,eps=0.000001","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","RMS_NORM","type=f32,ne=[1025,5,4,3],v=1,eps=0.000001,inplace=0","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","RMS_NORM_BACK","type=f32,ne=[1025,5,4,3],eps=0.000001","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[1025,5,4,3]","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[1025,5,4,3],eps=0.000001,v=0","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[1025,5,4,3],eps=0.000001,v=1","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","NORM","type=f32,ne=[64,5,4,3],v=0,eps=0.000100","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","RMS_NORM","type=f32,ne=[64,5,4,3],v=0,eps=0.000100,inplace=0","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","NORM","type=f32,ne=[64,5,4,3],v=1,eps=0.000100","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","RMS_NORM","type=f32,ne=[64,5,4,3],v=1,eps=0.000100,inplace=0","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","RMS_NORM_BACK","type=f32,ne=[64,5,4,3],eps=0.000100","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[64,5,4,3]","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[64,5,4,3],eps=0.000100,v=0","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[64,5,4,3],eps=0.000100,v=1","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","NORM","type=f32,ne=[1025,5,4,3],v=0,eps=0.000100","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","RMS_NORM","type=f32,ne=[1025,5,4,3],v=0,eps=0.000100,inplace=0","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","NORM","type=f32,ne=[1025,5,4,3],v=1,eps=0.000100","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","RMS_NORM","type=f32,ne=[1025,5,4,3],v=1,eps=0.000100,inplace=0","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","RMS_NORM_BACK","type=f32,ne=[1025,5,4,3],eps=0.000100","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[1025,5,4,3]","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[1025,5,4,3],eps=0.000100,v=0","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[1025,5,4,3],eps=0.000100,v=1","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","NORM","type=f32,ne=[64,5,4,3],v=0,eps=0.100000","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","RMS_NORM","type=f32,ne=[64,5,4,3],v=0,eps=0.100000,inplace=0","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","NORM","type=f32,ne=[64,5,4,3],v=1,eps=0.100000","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","RMS_NORM","type=f32,ne=[64,5,4,3],v=1,eps=0.100000,inplace=0","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","RMS_NORM_BACK","type=f32,ne=[64,5,4,3],eps=0.100000","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[64,5,4,3]","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[64,5,4,3],eps=0.100000,v=0","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[64,5,4,3],eps=0.100000,v=1","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","NORM","type=f32,ne=[1025,5,4,3],v=0,eps=0.100000","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","RMS_NORM","type=f32,ne=[1025,5,4,3],v=0,eps=0.100000,inplace=0","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","NORM","type=f32,ne=[1025,5,4,3],v=1,eps=0.100000","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","RMS_NORM","type=f32,ne=[1025,5,4,3],v=1,eps=0.100000,inplace=0","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","RMS_NORM_BACK","type=f32,ne=[1025,5,4,3],eps=0.100000","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[1025,5,4,3]","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[1025,5,4,3],eps=0.100000,v=0","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[1025,5,4,3],eps=0.100000,v=1","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[64,5,4,3],eps=10.000000,v=0","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[64,5,4,3],eps=10.000000,v=1","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[1025,5,4,3],eps=10.000000,v=0","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","L2_NORM","type=f32,ne=[1025,5,4,3],eps=10.000000,v=1","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","RMS_NORM","type=f32,ne=[64,5,4,3],v=0,eps=0.000001,inplace=1","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","SSM_CONV","type=f32,ne_a=[3,1024,1,1],ne_b=[3,1024,1,1]","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","SSM_CONV","type=f32,ne_a=[6,1024,1,1],ne_b=[3,1024,1,1]","support","0","no","WebGPU"
|
||||
@@ -10036,17 +10048,17 @@
|
||||
"WebGPU: WebGPU","CUMSUM","type=f32,ne=[375960,1,1,1]","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","CUMSUM","type=f32,ne=[20481,4,1,1]","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","XIELU","type=f32,ne=[10,5,4,3]","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","TRI","type=f32,ne=[10,10,4,3],tri_type=3","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","TRI","type=f32,ne=[10,10,4,3],tri_type=2","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","TRI","type=f32,ne=[10,10,4,3],tri_type=1","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","TRI","type=f32,ne=[10,10,4,3],tri_type=0","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","TRI","type=f32,ne=[10,10,4,3],tri_type=3","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","TRI","type=f32,ne=[10,10,4,3],tri_type=2","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","TRI","type=f32,ne=[10,10,4,3],tri_type=1","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","TRI","type=f32,ne=[10,10,4,3],tri_type=0","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","FILL","type=f32,ne=[10,10,4,3],c=0.000000","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","FILL","type=f32,ne=[303,207,11,3],c=2.000000","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","FILL","type=f32,ne=[800,600,4,4],c=-152.000000","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","FILL","type=f32,ne=[2048,512,2,2],c=3.500000","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","DIAG","type=f32,ne=[10,1,4,3]","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","DIAG","type=f32,ne=[79,1,19,13]","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","DIAG","type=f32,ne=[256,1,8,16]","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","DIAG","type=f32,ne=[10,1,4,3]","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","DIAG","type=f32,ne=[79,1,19,13]","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","DIAG","type=f32,ne=[256,1,8,16]","support","1","yes","WebGPU"
|
||||
"WebGPU: WebGPU","SOLVE_TRI","type=f32,ne_lhs=[10,10,4,3],ne_rhs=[3,10,4,3]","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","SOLVE_TRI","type=f32,ne_lhs=[11,11,1,1],ne_rhs=[5,11,1,1]","support","0","no","WebGPU"
|
||||
"WebGPU: WebGPU","SOLVE_TRI","type=f32,ne_lhs=[17,17,2,4],ne_rhs=[9,17,2,4]","support","0","no","WebGPU"
|
||||
|
||||
|
Can't render this file because it is too large.
|
@@ -4,7 +4,7 @@ project("ggml" C CXX ASM)
|
||||
### GGML Version
|
||||
set(GGML_VERSION_MAJOR 0)
|
||||
set(GGML_VERSION_MINOR 9)
|
||||
set(GGML_VERSION_PATCH 7)
|
||||
set(GGML_VERSION_PATCH 8)
|
||||
set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")
|
||||
|
||||
find_program(GIT_EXE NAMES git git.exe NO_CMAKE_FIND_ROOT_PATH)
|
||||
|
||||
@@ -733,6 +733,10 @@ extern "C" {
|
||||
GGML_API size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block
|
||||
GGML_API size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
|
||||
|
||||
GGML_DEPRECATED(
|
||||
GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float
|
||||
"use ggml_row_size() instead");
|
||||
|
||||
GGML_API const char * ggml_type_name(enum ggml_type type);
|
||||
GGML_API const char * ggml_op_name (enum ggml_op op);
|
||||
GGML_API const char * ggml_op_symbol(enum ggml_op op);
|
||||
|
||||
@@ -1544,8 +1544,8 @@ static void aclnn_get_slope(ggml_backend_cann_context & ctx,
|
||||
end = 2 * ((n_head - 1) - n_head_log2) + 1;
|
||||
step = 2;
|
||||
count = n_head - n_head_log2;
|
||||
aclnn_get_slope_inner(ctx, (char *) slope_buffer + n_head_log2 * sizeof(float), m1, count, start, end + 1, step,
|
||||
dtype);
|
||||
aclnn_get_slope_inner(ctx, (char *) slope_buffer + n_head_log2 * ggml_type_size(dtype), m1, count, start, end + 1,
|
||||
step, dtype);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2943,6 +2943,27 @@ void ggml_cann_rope(ggml_backend_cann_context & ctx, ggml_tensor * dst) {
|
||||
// Rotate full tensor (no tail), using trans tensors
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src_trans_tensor.get(), acl_cos_reshape_tensor.get(),
|
||||
acl_sin_reshape_tensor.get(), acl_mode, acl_dst_trans_tensor.get());
|
||||
} else if (src0->data == dst->data && !ggml_is_contiguous(src0)) {
|
||||
// In-place on non-contiguous tensor: RotaryPositionEmbedding cannot safely
|
||||
// read and write the same non-contiguous buffer. Use contiguous temporaries.
|
||||
size_t contiguous_nb[GGML_MAX_DIMS];
|
||||
contiguous_nb[0] = sizeof(float);
|
||||
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||
contiguous_nb[i] = contiguous_nb[i - 1] * src0->ne[i - 1];
|
||||
}
|
||||
int64_t total_elements = ggml_nelements(src0);
|
||||
ggml_cann_pool_alloc inplace_src_alloc(ctx.pool(), total_elements * sizeof(float));
|
||||
ggml_cann_pool_alloc inplace_dst_alloc(ctx.pool(), total_elements * sizeof(float));
|
||||
|
||||
acl_tensor_ptr acl_src_contig = ggml_cann_create_tensor(inplace_src_alloc.get(), ACL_FLOAT, sizeof(float),
|
||||
src0->ne, contiguous_nb, GGML_MAX_DIMS);
|
||||
acl_tensor_ptr acl_dst_contig = ggml_cann_create_tensor(inplace_dst_alloc.get(), ACL_FLOAT, sizeof(float),
|
||||
dst->ne, contiguous_nb, GGML_MAX_DIMS);
|
||||
|
||||
cann_copy(ctx, acl_src.get(), acl_src_contig.get());
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src_contig.get(), acl_cos_reshape_tensor.get(),
|
||||
acl_sin_reshape_tensor.get(), acl_mode, acl_dst_contig.get());
|
||||
cann_copy(ctx, acl_dst_contig.get(), acl_dst.get());
|
||||
} else {
|
||||
// Rotate full tensor (no tail), using original tensors
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, RotaryPositionEmbedding, acl_src.get(), acl_cos_reshape_tensor.get(),
|
||||
@@ -3599,6 +3620,44 @@ void ggml_cann_flash_attn_ext(ggml_backend_cann_context & ctx, ggml_tensor * dst
|
||||
acl_k_tensor = ggml_cann_create_tensor(src1, src1_bsnd_ne, src1_bsnd_nb, GGML_MAX_DIMS);
|
||||
acl_v_tensor = ggml_cann_create_tensor(src2, src2_bsnd_ne, src2_bsnd_nb, GGML_MAX_DIMS);
|
||||
|
||||
// Step 2.5: Pad Q, K, V along head dimension if D is not a multiple of 16
|
||||
// (required by FusedInferAttentionScoreV2)
|
||||
const int64_t D = src0->ne[0];
|
||||
const int64_t D_padded = GGML_PAD(D, 16);
|
||||
const bool needs_padding = (D != D_padded);
|
||||
|
||||
ggml_cann_pool_alloc q_pad_allocator(ctx.pool());
|
||||
ggml_cann_pool_alloc k_pad_allocator(ctx.pool());
|
||||
ggml_cann_pool_alloc v_pad_allocator(ctx.pool());
|
||||
|
||||
if (needs_padding) {
|
||||
int64_t paddings[] = { 0, D_padded - D, 0, 0, 0, 0, 0, 0 };
|
||||
|
||||
auto pad_fa_tensor = [&](acl_tensor_ptr & tensor, const int64_t * bsnd_ne,
|
||||
ggml_cann_pool_alloc & allocator) {
|
||||
int64_t pad_ne[GGML_MAX_DIMS] = { D_padded, bsnd_ne[1], bsnd_ne[2], bsnd_ne[3] };
|
||||
size_t pad_nb[GGML_MAX_DIMS];
|
||||
pad_nb[0] = faElemSize;
|
||||
for (int i = 1; i < GGML_MAX_DIMS; ++i) {
|
||||
pad_nb[i] = pad_nb[i - 1] * pad_ne[i - 1];
|
||||
}
|
||||
int64_t nelements = pad_ne[0] * pad_ne[1] * pad_ne[2] * pad_ne[3];
|
||||
void * buffer = allocator.alloc(nelements * faElemSize);
|
||||
acl_tensor_ptr padded =
|
||||
ggml_cann_create_tensor(buffer, faDataType, faElemSize, pad_ne, pad_nb, GGML_MAX_DIMS);
|
||||
aclnn_pad(ctx, tensor.get(), padded.get(), paddings);
|
||||
tensor = std::move(padded);
|
||||
};
|
||||
|
||||
pad_fa_tensor(acl_q_tensor, src0_bsnd_ne, q_pad_allocator);
|
||||
pad_fa_tensor(acl_k_tensor, src1_bsnd_ne, k_pad_allocator);
|
||||
pad_fa_tensor(acl_v_tensor, src2_bsnd_ne, v_pad_allocator);
|
||||
|
||||
src0_bsnd_ne[0] = D_padded;
|
||||
src1_bsnd_ne[0] = D_padded;
|
||||
src2_bsnd_ne[0] = D_padded;
|
||||
}
|
||||
|
||||
// Step 3: create the PSEShift tensor if needed
|
||||
// this tensor is considered as mask (f16) in the llama.cpp
|
||||
acl_tensor_ptr bcast_pse_tensor;
|
||||
@@ -3688,17 +3747,16 @@ void ggml_cann_flash_attn_ext(ggml_backend_cann_context & ctx, ggml_tensor * dst
|
||||
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
||||
acl_tensor_ptr fa_dst_tensor;
|
||||
acl_tensor_ptr acl_dst_tensor;
|
||||
ggml_cann_pool_alloc out_f16_allocator(ctx.pool());
|
||||
if (dst->type == GGML_TYPE_F32) {
|
||||
void * out_f16_buffer = out_f16_allocator.alloc(ggml_nelements(dst) * faElemSize);
|
||||
|
||||
if (dst->type == GGML_TYPE_F32 || needs_padding) {
|
||||
int64_t * out_f16_ne = src0_bsnd_ne;
|
||||
size_t out_f16_nb[GGML_MAX_DIMS];
|
||||
out_f16_nb[0] = faElemSize;
|
||||
for (int i = 1; i < GGML_MAX_DIMS; ++i) {
|
||||
out_f16_nb[i] = out_f16_nb[i - 1] * out_f16_ne[i - 1];
|
||||
}
|
||||
int64_t out_nelements = out_f16_ne[0] * out_f16_ne[1] * out_f16_ne[2] * out_f16_ne[3];
|
||||
void * out_f16_buffer = out_f16_allocator.alloc(out_nelements * faElemSize);
|
||||
|
||||
fa_dst_tensor =
|
||||
ggml_cann_create_tensor(out_f16_buffer, faDataType, faElemSize, out_f16_ne, out_f16_nb, GGML_MAX_DIMS);
|
||||
@@ -3730,8 +3788,33 @@ void ggml_cann_flash_attn_ext(ggml_backend_cann_context & ctx, ggml_tensor * dst
|
||||
nullptr // softmaxLse
|
||||
);
|
||||
|
||||
if (dst->type == GGML_TYPE_F32) {
|
||||
// Step 6: post-processing, permute and cast to f32
|
||||
// Step 6: post-processing — slice padded output and/or cast to f32
|
||||
if (needs_padding) {
|
||||
ggml_cann_pool_alloc sliced_f16_allocator(ctx.pool());
|
||||
|
||||
if (dst->type == GGML_TYPE_F32) {
|
||||
int64_t sliced_ne[GGML_MAX_DIMS] = { D, src0_bsnd_ne[1], src0_bsnd_ne[2], src0_bsnd_ne[3] };
|
||||
size_t sliced_nb[GGML_MAX_DIMS];
|
||||
sliced_nb[0] = faElemSize;
|
||||
for (int i = 1; i < GGML_MAX_DIMS; ++i) {
|
||||
sliced_nb[i] = sliced_nb[i - 1] * sliced_ne[i - 1];
|
||||
}
|
||||
int64_t sliced_nelements = sliced_ne[0] * sliced_ne[1] * sliced_ne[2] * sliced_ne[3];
|
||||
void * sliced_buffer = sliced_f16_allocator.alloc(sliced_nelements * faElemSize);
|
||||
acl_tensor_ptr sliced_f16_tensor = ggml_cann_create_tensor(sliced_buffer, faDataType, faElemSize,
|
||||
sliced_ne, sliced_nb, GGML_MAX_DIMS);
|
||||
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, Slice, fa_dst_tensor.get(),
|
||||
(int64_t) -1, (int64_t) 0, D, (int64_t) 1, sliced_f16_tensor.get());
|
||||
|
||||
acl_tensor_ptr acl_dst_tensor = ggml_cann_create_tensor(dst);
|
||||
aclnn_cast(ctx, sliced_f16_tensor.get(), acl_dst_tensor.get(), ggml_cann_type_mapping(dst->type));
|
||||
} else {
|
||||
acl_tensor_ptr acl_dst_tensor = ggml_cann_create_tensor(dst);
|
||||
GGML_CANN_CALL_ACLNN_OP(ctx, Slice, fa_dst_tensor.get(),
|
||||
(int64_t) -1, (int64_t) 0, D, (int64_t) 1, acl_dst_tensor.get());
|
||||
}
|
||||
} else if (dst->type == GGML_TYPE_F32) {
|
||||
acl_tensor_ptr acl_dst_tensor = ggml_cann_create_tensor(dst);
|
||||
aclnn_cast(ctx, fa_dst_tensor.get(), acl_dst_tensor.get(), ggml_cann_type_mapping(dst->type));
|
||||
}
|
||||
|
||||
@@ -2503,10 +2503,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, const ggml_ten
|
||||
// different head sizes of K and V are not supported yet
|
||||
return false;
|
||||
}
|
||||
if (op->src[0]->ne[0] % 16 != 0) {
|
||||
// TODO: padding to support
|
||||
return false;
|
||||
}
|
||||
float logitSoftcap = 0.0f;
|
||||
memcpy(&logitSoftcap, (const float *) (op->op_params) + 2, sizeof(float));
|
||||
if (logitSoftcap != 0.0f) {
|
||||
|
||||
@@ -570,24 +570,34 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz")
|
||||
set(KLEIDIAI_ARCHIVE_MD5 "54049037570ab0ee0a0d126b2ba5ece1")
|
||||
|
||||
if (POLICY CMP0135)
|
||||
cmake_policy(SET CMP0135 NEW)
|
||||
endif()
|
||||
|
||||
# TODO: Use FetchContent_MakeAvailable with EXCLUDE_FROM_ALL after bumping minimum CMake version to 3.28+
|
||||
# Using FetchContent_Populate instead to avoid EXCLUDE_FROM_ALL which requires CMake 3.28
|
||||
FetchContent_Declare(KleidiAI_Download
|
||||
set(KLEIDIAI_FETCH_ARGS
|
||||
URL ${KLEIDIAI_DOWNLOAD_URL}
|
||||
DOWNLOAD_EXTRACT_TIMESTAMP NEW
|
||||
URL_HASH MD5=${KLEIDIAI_ARCHIVE_MD5})
|
||||
URL_HASH MD5=${KLEIDIAI_ARCHIVE_MD5}
|
||||
)
|
||||
|
||||
FetchContent_GetProperties(KleidiAI_Download
|
||||
SOURCE_DIR KLEIDIAI_SRC
|
||||
POPULATED KLEIDIAI_POPULATED)
|
||||
if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.28")
|
||||
FetchContent_Declare(KleidiAI_Download
|
||||
${KLEIDIAI_FETCH_ARGS}
|
||||
EXCLUDE_FROM_ALL
|
||||
)
|
||||
|
||||
if (NOT KLEIDIAI_POPULATED)
|
||||
FetchContent_Populate(KleidiAI_Download)
|
||||
FetchContent_MakeAvailable(KleidiAI_Download)
|
||||
FetchContent_GetProperties(KleidiAI_Download SOURCE_DIR KLEIDIAI_SRC)
|
||||
else()
|
||||
FetchContent_Declare(KleidiAI_Download
|
||||
${KLEIDIAI_FETCH_ARGS}
|
||||
)
|
||||
|
||||
FetchContent_GetProperties(KleidiAI_Download
|
||||
SOURCE_DIR KLEIDIAI_SRC
|
||||
POPULATED KLEIDIAI_POPULATED
|
||||
)
|
||||
|
||||
if (NOT KLEIDIAI_POPULATED)
|
||||
FetchContent_Populate(KleidiAI_Download)
|
||||
FetchContent_GetProperties(KleidiAI_Download SOURCE_DIR KLEIDIAI_SRC)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
add_compile_definitions(GGML_USE_CPU_KLEIDIAI)
|
||||
|
||||
@@ -531,7 +531,6 @@ static void gemv_q4_b32_8x8_q8_0_lut_avx(int n, float * GGML_RESTRICT s, size_t
|
||||
|
||||
UNUSED(bs);
|
||||
|
||||
__m128i changemask = _mm_set_epi8(15, 14, 7, 6, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0);
|
||||
__m256i finalpermutemask = _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0);
|
||||
|
||||
// Permute mask used for easier vector processing at later stages
|
||||
@@ -580,6 +579,7 @@ static void gemv_q4_b32_8x8_q8_0_lut_avx(int n, float * GGML_RESTRICT s, size_t
|
||||
if constexpr (
|
||||
std::is_same_v<block_tx8, block_q4_0x8> ||
|
||||
std::is_same_v<block_tx8, block_iq4_nlx8>) {
|
||||
const __m128i changemask = _mm_set_epi8(15, 14, 7, 6, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0);
|
||||
col_scale_f32 = GGML_F32Cx8_REARRANGE_LOAD(b_ptr[b].d, changemask);
|
||||
} else if constexpr (std::is_same_v<block_tx8, block_mxfp4x8>) {
|
||||
// Load 8 E8M0 exponents and convert to float via LUT
|
||||
|
||||
@@ -444,19 +444,20 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin
|
||||
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
|
||||
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2;
|
||||
|
||||
const uint ib = idx / 128; // 2 values per idx
|
||||
const uint ib32 = (idx % 128) / 16; // 0..7
|
||||
const uint iq = 16 * ib32 + 2 * (idx % 8);
|
||||
const uint ib = idx / 64; // 4 values per idx
|
||||
const uint ib32 = (idx % 64) / 8; // 0..7
|
||||
const uint iq = 4 * ib32 + (idx % 4);
|
||||
|
||||
const uint sl = (data_a[ib].scales_l[ib32/2] >> (4 * (ib32 & 1))) & 0xF;
|
||||
const uint sh = ((data_a[ib].scales_h) >> (2 * ib32)) & 3;
|
||||
const uint qshift = (idx & 8) >> 1;
|
||||
u8vec2 qs = unpack8((uint(data_a_packed16[ib].qs[iq/2]) >> qshift) & 0x0F0F).xy;
|
||||
const uint qshift = idx & 4;
|
||||
u8vec4 qs = unpack8((uint(data_a_packed32[ib].qs[iq]) >> qshift) & 0x0F0F0F0F);
|
||||
|
||||
const float d = float(data_a[ib].d);
|
||||
const vec2 v = d * float(int(sl | (sh << 4)) - 32) * vec2(kvalues_iq4nl[qs.x], kvalues_iq4nl[qs.y]);
|
||||
const vec4 v = d * float(int(sl | (sh << 4)) - 32) * vec4(kvalues_iq4nl[qs.x], kvalues_iq4nl[qs.y], kvalues_iq4nl[qs.z], kvalues_iq4nl[qs.w]);
|
||||
|
||||
buf_a[buf_idx ] = FLOAT_TYPE_VEC2(v.xy);
|
||||
buf_a[buf_idx + 1] = FLOAT_TYPE_VEC2(v.zw);
|
||||
#elif defined(DATA_A_IQ4_NL)
|
||||
const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row;
|
||||
const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 4;
|
||||
|
||||
@@ -554,7 +554,7 @@ void matmul_shaders(bool fp16, MatMulIdType matmul_id_type, bool coopmat, bool c
|
||||
std::string load_vec_quant = "2";
|
||||
if ((tname == "q4_0") || (tname == "q4_1") || (tname == "q5_1") || (tname == "iq1_s") || (tname == "iq1_m") || (tname == "iq2_xxs") || (tname == "iq2_xs") || (tname == "iq2_s"))
|
||||
load_vec_quant = "8";
|
||||
else if ((tname == "q5_0") || (tname == "q8_0") || (tname == "q2_k") || (tname == "q4_k") || (tname == "q5_k") || (tname == "iq3_xxs") || (tname == "iq3_s") || (tname == "iq4_nl") || (tname == "mxfp4"))
|
||||
else if ((tname == "q5_0") || (tname == "q8_0") || (tname == "q2_k") || (tname == "q4_k") || (tname == "q5_k") || (tname == "iq3_xxs") || (tname == "iq3_s") || (tname == "iq4_xs") || (tname == "iq4_nl") || (tname == "mxfp4"))
|
||||
load_vec_quant = "4";
|
||||
|
||||
if (tname == "bf16") {
|
||||
|
||||
@@ -151,6 +151,26 @@ struct ggml_webgpu_get_rows_pipeline_key_hash {
|
||||
}
|
||||
};
|
||||
|
||||
/** Row Norm **/
|
||||
|
||||
struct ggml_webgpu_row_norm_pipeline_key {
|
||||
ggml_op op;
|
||||
bool inplace;
|
||||
|
||||
bool operator==(const ggml_webgpu_row_norm_pipeline_key & other) const {
|
||||
return op == other.op && inplace == other.inplace;
|
||||
}
|
||||
};
|
||||
|
||||
struct ggml_webgpu_row_norm_pipeline_key_hash {
|
||||
size_t operator()(const ggml_webgpu_row_norm_pipeline_key & key) const {
|
||||
size_t seed = 0;
|
||||
ggml_webgpu_hash_combine(seed, key.op);
|
||||
ggml_webgpu_hash_combine(seed, key.inplace);
|
||||
return seed;
|
||||
}
|
||||
};
|
||||
|
||||
/** Pad **/
|
||||
struct ggml_webgpu_pad_pipeline_key {
|
||||
bool circular;
|
||||
@@ -244,13 +264,15 @@ struct ggml_webgpu_binary_pipeline_key_hash {
|
||||
/** Unary **/
|
||||
|
||||
struct ggml_webgpu_unary_pipeline_key {
|
||||
int type;
|
||||
int op;
|
||||
bool is_unary; // many unary operators fall under the GGML_OP_UNARY umbrella
|
||||
bool inplace;
|
||||
int type;
|
||||
int op;
|
||||
bool is_unary; // many unary operators fall under the GGML_OP_UNARY umbrella
|
||||
bool inplace;
|
||||
ggml_tri_type ttype; // only used for GGML_OP_TRI
|
||||
|
||||
bool operator==(const ggml_webgpu_unary_pipeline_key & other) const {
|
||||
return type == other.type && op == other.op && is_unary == other.is_unary && inplace == other.inplace;
|
||||
return type == other.type && op == other.op && is_unary == other.is_unary && inplace == other.inplace &&
|
||||
ttype == other.ttype;
|
||||
}
|
||||
};
|
||||
|
||||
@@ -261,6 +283,7 @@ struct ggml_webgpu_unary_pipeline_key_hash {
|
||||
ggml_webgpu_hash_combine(seed, key.op);
|
||||
ggml_webgpu_hash_combine(seed, key.is_unary);
|
||||
ggml_webgpu_hash_combine(seed, key.inplace);
|
||||
ggml_webgpu_hash_combine(seed, key.ttype);
|
||||
return seed;
|
||||
}
|
||||
};
|
||||
@@ -435,6 +458,8 @@ class ggml_webgpu_shader_lib {
|
||||
std::unordered_map<int, webgpu_pipeline> argsort_pipelines; // key is order
|
||||
std::unordered_map<int, webgpu_pipeline> argsort_merge_pipelines; // key is order
|
||||
std::unordered_map<int, webgpu_pipeline> cumsum_pipelines; // key is fixed, no variants yet
|
||||
std::unordered_map<ggml_webgpu_row_norm_pipeline_key, webgpu_pipeline, ggml_webgpu_row_norm_pipeline_key_hash>
|
||||
row_norm_pipelines; // op/inplace
|
||||
std::unordered_map<ggml_webgpu_get_rows_pipeline_key, webgpu_pipeline, ggml_webgpu_get_rows_pipeline_key_hash>
|
||||
get_rows_pipelines; // src_type, vectorized
|
||||
std::unordered_map<ggml_webgpu_unary_pipeline_key, webgpu_pipeline, ggml_webgpu_unary_pipeline_key_hash>
|
||||
@@ -479,6 +504,44 @@ class ggml_webgpu_shader_lib {
|
||||
return sum_rows_pipelines[1];
|
||||
}
|
||||
|
||||
webgpu_pipeline get_row_norm_pipeline(const ggml_webgpu_shader_lib_context & context) {
|
||||
ggml_webgpu_row_norm_pipeline_key key = {
|
||||
.op = context.dst->op,
|
||||
.inplace = context.inplace,
|
||||
};
|
||||
|
||||
auto it = row_norm_pipelines.find(key);
|
||||
if (it != row_norm_pipelines.end()) {
|
||||
return it->second;
|
||||
}
|
||||
std::vector<std::string> defines;
|
||||
std::string variant;
|
||||
|
||||
switch (key.op) {
|
||||
case GGML_OP_RMS_NORM:
|
||||
defines.push_back("OP_RMS_NORM");
|
||||
variant = "rms_norm";
|
||||
break;
|
||||
case GGML_OP_L2_NORM:
|
||||
defines.push_back("OP_L2_NORM");
|
||||
variant = "l2_norm";
|
||||
break;
|
||||
default:
|
||||
GGML_ABORT("Unsupported op for row_norm shader");
|
||||
}
|
||||
|
||||
if (key.inplace) {
|
||||
defines.push_back("INPLACE");
|
||||
variant += "_inplace";
|
||||
}
|
||||
|
||||
defines.push_back(std::string("WG_SIZE=") + std::to_string(context.max_wg_size));
|
||||
|
||||
auto processed = preprocessor.preprocess(wgsl_row_norm, defines);
|
||||
row_norm_pipelines[key] = ggml_webgpu_create_pipeline(device, processed, variant);
|
||||
return row_norm_pipelines[key];
|
||||
}
|
||||
|
||||
webgpu_pipeline get_argmax_pipeline(const ggml_webgpu_shader_lib_context & context) {
|
||||
bool vec4 = context.src0->ne[0] % 4 == 0;
|
||||
|
||||
@@ -1058,6 +1121,7 @@ class ggml_webgpu_shader_lib {
|
||||
.op = op,
|
||||
.is_unary = is_unary,
|
||||
.inplace = context.inplace,
|
||||
.ttype = (ggml_tri_type) ggml_get_op_params_i32(context.dst, 0),
|
||||
};
|
||||
|
||||
auto it = unary_pipelines.find(key);
|
||||
@@ -1088,6 +1152,29 @@ class ggml_webgpu_shader_lib {
|
||||
variant += "_inplace";
|
||||
}
|
||||
|
||||
if (op == GGML_OP_TRI) {
|
||||
switch (key.ttype) {
|
||||
case GGML_TRI_TYPE_LOWER:
|
||||
defines.push_back("TRI_TYPE_LOWER");
|
||||
variant += "_tri_type_lower";
|
||||
break;
|
||||
case GGML_TRI_TYPE_LOWER_DIAG:
|
||||
defines.push_back("TRI_TYPE_LOWER_DIAG");
|
||||
variant += "_tri_type_lower_diag";
|
||||
break;
|
||||
case GGML_TRI_TYPE_UPPER:
|
||||
defines.push_back("TRI_TYPE_UPPER");
|
||||
variant += "_tri_type_upper";
|
||||
break;
|
||||
case GGML_TRI_TYPE_UPPER_DIAG:
|
||||
defines.push_back("TRI_TYPE_UPPER_DIAG");
|
||||
variant += "_tri_upper_diag";
|
||||
break;
|
||||
default:
|
||||
GGML_ABORT("Unsupported ggml_tri_type for unary shader");
|
||||
}
|
||||
}
|
||||
|
||||
defines.push_back(std::string("WG_SIZE=") + std::to_string(context.max_wg_size));
|
||||
|
||||
auto processed = preprocessor.preprocess(wgsl_unary, defines);
|
||||
|
||||
@@ -366,7 +366,6 @@ struct webgpu_context_struct {
|
||||
|
||||
std::map<int, std::map<int, webgpu_pipeline>> cpy_pipelines; // src_type, dst_type
|
||||
|
||||
std::map<int, webgpu_pipeline> rms_norm_pipelines; // inplace
|
||||
std::map<int, std::map<int, std::map<int, webgpu_pipeline>>> rope_pipelines; // type, ff, inplace
|
||||
std::map<int, std::map<int, std::map<int, webgpu_pipeline>>> glu_pipelines; // glu_op, type, split
|
||||
|
||||
@@ -509,50 +508,39 @@ static void ggml_backend_webgpu_wait_profile_futures(webgpu_global_context &
|
||||
static void ggml_backend_webgpu_wait(webgpu_global_context & ctx,
|
||||
std::vector<webgpu_submission> & subs,
|
||||
bool block = true) {
|
||||
// If we have too many in-flight submissions, wait on the oldest one first.
|
||||
if (subs.empty()) {
|
||||
return;
|
||||
}
|
||||
while (subs.size() >= WEBGPU_MAX_INFLIGHT_SUBS_PER_THREAD) {
|
||||
auto waitStatus = ctx->instance.WaitAny(1, &subs[0].submit_done, UINT64_MAX);
|
||||
if (ggml_backend_webgpu_handle_wait_status(waitStatus)) {
|
||||
|
||||
bool blocking_wait = block || subs.size() >= WEBGPU_MAX_INFLIGHT_SUBS_PER_THREAD;
|
||||
while (blocking_wait) {
|
||||
auto waitStatus = ctx->instance.WaitAny(1, &subs[0].submit_done, 0);
|
||||
if (ggml_backend_webgpu_handle_wait_status(waitStatus, true)) {
|
||||
#ifdef GGML_WEBGPU_GPU_PROFILE
|
||||
ggml_backend_webgpu_wait_profile_futures(ctx, subs[0].profile_futures, true);
|
||||
#endif
|
||||
subs.erase(subs.begin());
|
||||
}
|
||||
blocking_wait = (block && !subs.empty()) || subs.size() >= WEBGPU_MAX_INFLIGHT_SUBS_PER_THREAD;
|
||||
}
|
||||
|
||||
if (subs.empty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (block) {
|
||||
for (auto & sub : subs) {
|
||||
while (!sub.submit_done.completed) {
|
||||
auto waitStatus = ctx->instance.WaitAny(1, &sub.submit_done, UINT64_MAX);
|
||||
ggml_backend_webgpu_handle_wait_status(waitStatus);
|
||||
}
|
||||
// Poll each submit future once and remove completed submissions.
|
||||
for (auto sub = subs.begin(); sub != subs.end();) {
|
||||
auto waitStatus = ctx->instance.WaitAny(1, &sub->submit_done, 0);
|
||||
bool success = ggml_backend_webgpu_handle_wait_status(waitStatus, true);
|
||||
#ifdef GGML_WEBGPU_GPU_PROFILE
|
||||
ggml_backend_webgpu_wait_profile_futures(ctx, sub.profile_futures, true);
|
||||
#endif
|
||||
}
|
||||
subs.clear();
|
||||
} else {
|
||||
// Poll each submit future once and remove completed submissions.
|
||||
for (auto sub = subs.begin(); sub != subs.end();) {
|
||||
auto waitStatus = ctx->instance.WaitAny(1, &sub->submit_done, 0);
|
||||
ggml_backend_webgpu_handle_wait_status(waitStatus, true);
|
||||
#ifdef GGML_WEBGPU_GPU_PROFILE
|
||||
ggml_backend_webgpu_wait_profile_futures(ctx, sub->profile_futures, false);
|
||||
if (sub->submit_done.completed && sub->profile_futures.empty()) {
|
||||
ggml_backend_webgpu_wait_profile_futures(ctx, sub->profile_futures, false);
|
||||
if (success && sub->profile_futures.empty()) {
|
||||
#else
|
||||
if (sub->submit_done.completed) {
|
||||
if (success) {
|
||||
#endif
|
||||
sub = subs.erase(sub);
|
||||
} else {
|
||||
++sub;
|
||||
}
|
||||
sub = subs.erase(sub);
|
||||
} else {
|
||||
++sub;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1609,8 +1597,8 @@ static webgpu_command ggml_webgpu_repeat(webgpu_context & ctx, ggml_tensor * src
|
||||
return ggml_backend_webgpu_build(ctx->global_ctx, ctx->param_buf_pool, pipeline, params, entries, wg_x);
|
||||
}
|
||||
|
||||
static webgpu_command ggml_webgpu_rms_norm(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * dst) {
|
||||
int inplace = ggml_webgpu_tensor_equal(src, dst);
|
||||
static webgpu_command ggml_webgpu_row_norm(webgpu_context & ctx, ggml_tensor * src, ggml_tensor * dst) {
|
||||
bool inplace = ggml_webgpu_tensor_equal(src, dst);
|
||||
|
||||
std::vector<uint32_t> params = {
|
||||
(uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src) / ggml_type_size(src->type)),
|
||||
@@ -1641,8 +1629,15 @@ static webgpu_command ggml_webgpu_rms_norm(webgpu_context & ctx, ggml_tensor * s
|
||||
.size = ggml_webgpu_tensor_binding_size(ctx, dst) });
|
||||
}
|
||||
|
||||
return ggml_backend_webgpu_build(ctx->global_ctx, ctx->param_buf_pool, ctx->rms_norm_pipelines[inplace], params,
|
||||
entries, ggml_nrows(src));
|
||||
ggml_webgpu_shader_lib_context shader_lib_ctx = {
|
||||
.src0 = src,
|
||||
.dst = dst,
|
||||
.max_wg_size = WEBGPU_ROW_SPLIT_WG_SIZE,
|
||||
.inplace = inplace,
|
||||
};
|
||||
|
||||
webgpu_pipeline pipeline = ctx->shader_lib->get_row_norm_pipeline(shader_lib_ctx);
|
||||
return ggml_backend_webgpu_build(ctx->global_ctx, ctx->param_buf_pool, pipeline, params, entries, ggml_nrows(src));
|
||||
}
|
||||
|
||||
static webgpu_command ggml_webgpu_rope(webgpu_context & ctx,
|
||||
@@ -2203,7 +2198,8 @@ static std::optional<webgpu_command> ggml_webgpu_encode_node(webgpu_context ctx,
|
||||
case GGML_OP_REPEAT:
|
||||
return ggml_webgpu_repeat(ctx, src0, node);
|
||||
case GGML_OP_RMS_NORM:
|
||||
return ggml_webgpu_rms_norm(ctx, src0, node);
|
||||
case GGML_OP_L2_NORM:
|
||||
return ggml_webgpu_row_norm(ctx, src0, node);
|
||||
case GGML_OP_ROPE:
|
||||
return ggml_webgpu_rope(ctx, src0, src1, src2, node);
|
||||
case GGML_OP_GLU:
|
||||
@@ -2220,6 +2216,8 @@ static std::optional<webgpu_command> ggml_webgpu_encode_node(webgpu_context ctx,
|
||||
case GGML_OP_SQRT:
|
||||
case GGML_OP_SIN:
|
||||
case GGML_OP_COS:
|
||||
case GGML_OP_DIAG:
|
||||
case GGML_OP_TRI:
|
||||
return ggml_webgpu_unary_op(ctx, src0, node);
|
||||
case GGML_OP_PAD:
|
||||
return ggml_webgpu_pad(ctx, src0, node);
|
||||
@@ -2625,15 +2623,6 @@ static void ggml_webgpu_init_cpy_pipeline(webgpu_context & webgpu_ctx) {
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->global_ctx->device, wgsl_cpy_f16_f16, "cpy_f16_f16", constants);
|
||||
}
|
||||
|
||||
static void ggml_webgpu_init_rms_norm_pipeline(webgpu_context & webgpu_ctx) {
|
||||
std::vector<wgpu::ConstantEntry> constants = ggml_webgpu_wg_size_entry(WEBGPU_ROW_SPLIT_WG_SIZE);
|
||||
|
||||
webgpu_ctx->rms_norm_pipelines[0] =
|
||||
ggml_webgpu_create_pipeline(webgpu_ctx->global_ctx->device, wgsl_rms_norm, "rms_norm", constants);
|
||||
webgpu_ctx->rms_norm_pipelines[1] = ggml_webgpu_create_pipeline(
|
||||
webgpu_ctx->global_ctx->device, wgsl_rms_norm_inplace, "rms_norm_inplace", constants);
|
||||
}
|
||||
|
||||
static void ggml_webgpu_init_rope_pipeline(webgpu_context & webgpu_ctx) {
|
||||
std::vector<wgpu::ConstantEntry> constants = ggml_webgpu_wg_size_entry(WEBGPU_MAX_WG_SIZE);
|
||||
|
||||
@@ -2918,7 +2907,6 @@ static webgpu_context initialize_webgpu_context(ggml_backend_dev_t dev) {
|
||||
wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead, "set_rows_host_error_buf");
|
||||
|
||||
ggml_webgpu_init_cpy_pipeline(webgpu_ctx);
|
||||
ggml_webgpu_init_rms_norm_pipeline(webgpu_ctx);
|
||||
ggml_webgpu_init_rope_pipeline(webgpu_ctx);
|
||||
ggml_webgpu_init_glu_pipeline(webgpu_ctx);
|
||||
ggml_webgpu_init_soft_max_pipeline(webgpu_ctx);
|
||||
@@ -2961,17 +2949,16 @@ static ggml_backend_buffer_type_t ggml_backend_webgpu_device_get_buffer_type(ggm
|
||||
|
||||
static struct ggml_backend_buffer_type ggml_backend_webgpu_buffer_type = {
|
||||
/* .iface = */ {
|
||||
/* .get_name = */ ggml_backend_webgpu_buffer_type_get_name,
|
||||
/* .alloc_buffer = */
|
||||
ggml_backend_webgpu_buffer_type_alloc_buffer, /* .get_alignment = */
|
||||
ggml_backend_webgpu_buffer_type_get_alignment, /* .get_max_size = */
|
||||
ggml_backend_webgpu_buffer_type_get_max_size, /* .get_alloc_size = */
|
||||
ggml_backend_webgpu_buffer_type_get_alloc_size, /* .is_host = */ NULL, // defaults to false
|
||||
/* .get_name = */ ggml_backend_webgpu_buffer_type_get_name,
|
||||
/* .alloc_buffer = */ ggml_backend_webgpu_buffer_type_alloc_buffer,
|
||||
/* .get_alignment = */ ggml_backend_webgpu_buffer_type_get_alignment,
|
||||
/* .get_max_size = */ ggml_backend_webgpu_buffer_type_get_max_size,
|
||||
/* .get_alloc_size = */ ggml_backend_webgpu_buffer_type_get_alloc_size,
|
||||
/* .is_host = */ NULL, // defaults to false
|
||||
},
|
||||
/* .device = */
|
||||
dev,
|
||||
/* .context = */
|
||||
NULL
|
||||
dev,
|
||||
/* .context = */ NULL
|
||||
};
|
||||
|
||||
return &ggml_backend_webgpu_buffer_type;
|
||||
@@ -3130,6 +3117,7 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const
|
||||
break;
|
||||
}
|
||||
case GGML_OP_RMS_NORM:
|
||||
case GGML_OP_L2_NORM:
|
||||
supports_op = op->type == GGML_TYPE_F32 && src0->type == GGML_TYPE_F32;
|
||||
break;
|
||||
case GGML_OP_ROPE:
|
||||
@@ -3213,6 +3201,12 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const
|
||||
case GGML_OP_COS:
|
||||
supports_op = (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) && (src0->type == op->type);
|
||||
break;
|
||||
case GGML_OP_DIAG:
|
||||
supports_op = (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) && (src0->type == op->type);
|
||||
break;
|
||||
case GGML_OP_TRI:
|
||||
supports_op = (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) && (src0->type == op->type);
|
||||
break;
|
||||
case GGML_OP_PAD:
|
||||
supports_op = op->type == GGML_TYPE_F32 && src0->type == GGML_TYPE_F32;
|
||||
break;
|
||||
|
||||
@@ -1,21 +1,11 @@
|
||||
#define(VARIANTS)
|
||||
|
||||
[
|
||||
{
|
||||
"DECLS": ["NOT_INPLACE"]
|
||||
},
|
||||
{
|
||||
"SHADER_SUFFIX": "inplace",
|
||||
"DECLS": ["INPLACE"]
|
||||
},
|
||||
]
|
||||
|
||||
#end(VARIANTS)
|
||||
|
||||
#define(DECLS)
|
||||
|
||||
#decl(NOT_INPLACE)
|
||||
#ifdef INPLACE
|
||||
fn update(src_offset: u32, dst_offset: u32, scale: f32) {
|
||||
src[dst_offset] = scale * src[src_offset];
|
||||
}
|
||||
|
||||
@group(0) @binding(1)
|
||||
var<uniform> params: Params;
|
||||
#else
|
||||
fn update(src_offset: u32, dst_offset: u32, scale: f32) {
|
||||
dst[dst_offset] = scale * src[src_offset];
|
||||
}
|
||||
@@ -25,23 +15,7 @@ var<storage, read_write> dst: array<f32>;
|
||||
|
||||
@group(0) @binding(2)
|
||||
var<uniform> params: Params;
|
||||
|
||||
#enddecl(NOT_INPLACE)
|
||||
|
||||
#decl(INPLACE)
|
||||
|
||||
fn update(src_offset: u32, dst_offset: u32, scale: f32) {
|
||||
src[dst_offset] = scale * src[src_offset];
|
||||
}
|
||||
|
||||
@group(0) @binding(1)
|
||||
var<uniform> params: Params;
|
||||
|
||||
#enddecl(INPLACE)
|
||||
|
||||
#end(DECLS)
|
||||
|
||||
#define(SHADER)
|
||||
#endif
|
||||
|
||||
struct Params {
|
||||
offset_src: u32, // in elements
|
||||
@@ -68,12 +42,9 @@ struct Params {
|
||||
@group(0) @binding(0)
|
||||
var<storage, read_write> src: array<f32>;
|
||||
|
||||
DECLS
|
||||
var<workgroup> scratch: array<f32, WG_SIZE>;
|
||||
|
||||
override wg_size: u32;
|
||||
var<workgroup> scratch: array<f32, wg_size>;
|
||||
|
||||
@compute @workgroup_size(wg_size)
|
||||
@compute @workgroup_size(WG_SIZE)
|
||||
fn main(@builtin(workgroup_id) wid: vec3<u32>,
|
||||
@builtin(local_invocation_id) lid: vec3<u32>) {
|
||||
|
||||
@@ -86,7 +57,7 @@ fn main(@builtin(workgroup_id) wid: vec3<u32>,
|
||||
let i_src_row = params.offset_src + i3 * params.stride_src3 + i2 * params.stride_src2 + i1 * params.stride_src1;
|
||||
let i_dst_row = params.offset_dst + i3 * params.stride_dst3 + i2 * params.stride_dst2 + i1 * params.stride_dst1;
|
||||
|
||||
let elems = (params.ne0 + wg_size - 1) / wg_size;
|
||||
let elems = (params.ne0 + WG_SIZE - 1) / WG_SIZE;
|
||||
|
||||
var sum = 0.0f;
|
||||
var col = lid.x;
|
||||
@@ -95,12 +66,12 @@ fn main(@builtin(workgroup_id) wid: vec3<u32>,
|
||||
break;
|
||||
}
|
||||
sum += pow(src[i_src_row + col], 2.0);
|
||||
col += wg_size;
|
||||
col += WG_SIZE;
|
||||
}
|
||||
|
||||
scratch[lid.x] = sum;
|
||||
workgroupBarrier();
|
||||
var offset = wg_size / 2;
|
||||
var offset: u32 = WG_SIZE / 2;
|
||||
while (offset > 0) {
|
||||
if (lid.x < offset) {
|
||||
scratch[lid.x] += scratch[lid.x + offset];
|
||||
@@ -110,14 +81,17 @@ fn main(@builtin(workgroup_id) wid: vec3<u32>,
|
||||
}
|
||||
sum = scratch[0];
|
||||
|
||||
#ifdef OP_RMS_NORM
|
||||
let scale = 1.0/sqrt(sum/f32(params.ne0) + params.eps);
|
||||
#elif OP_L2_NORM
|
||||
let scale = 1.0/max(sqrt(sum), params.eps);
|
||||
#endif
|
||||
col = lid.x;
|
||||
for (var j: u32 = 0; j < elems; j++) {
|
||||
if (col >= params.ne0) {
|
||||
break;
|
||||
}
|
||||
update(i_src_row + col, i_dst_row + col, scale);
|
||||
col += wg_size;
|
||||
col += WG_SIZE;
|
||||
}
|
||||
}
|
||||
#end(SHADER)
|
||||
@@ -5,7 +5,6 @@ enable f16;
|
||||
#define TYPE f32
|
||||
#endif
|
||||
|
||||
|
||||
@group(0) @binding(0)
|
||||
var<storage, read_write> src: array<TYPE>;
|
||||
|
||||
@@ -57,12 +56,20 @@ fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
|
||||
return;
|
||||
}
|
||||
var i = gid.x;
|
||||
let i3 = i / (params.ne2 * params.ne1 * params.ne0);
|
||||
i = i % (params.ne2 * params.ne1 * params.ne0);
|
||||
let i2 = i / (params.ne1 * params.ne0);
|
||||
i = i % (params.ne1 * params.ne0);
|
||||
let i1 = i / params.ne0;
|
||||
let i0 = i % params.ne0;
|
||||
let ne2 = params.ne2;
|
||||
#ifdef DIAG
|
||||
let ne1 = params.ne0;
|
||||
#else
|
||||
let ne1 = params.ne1;
|
||||
#endif
|
||||
let ne0 = params.ne0;
|
||||
|
||||
let i3 = i / (ne2 * ne1 * ne0);
|
||||
i = i % (ne2 * ne1 * ne0);
|
||||
let i2 = i / (ne1 * ne0);
|
||||
i = i % (ne1 * ne0);
|
||||
let i1 = i / ne0;
|
||||
let i0 = i % ne0;
|
||||
|
||||
let src_idx = i0 * params.stride_src0 + i1 * params.stride_src1 +
|
||||
i2 * params.stride_src2 + i3 * params.stride_src3;
|
||||
@@ -184,6 +191,20 @@ fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
|
||||
let res_f32 = cos(f32(src[params.offset_src + src_idx]));
|
||||
let res = TYPE(res_f32);
|
||||
#endif
|
||||
#ifdef DIAG
|
||||
let res = select(0.0, src[params.offset_src + i0 + i2 * params.stride_src2 + i3 * params.stride_src3], i0 == i1);
|
||||
#endif
|
||||
#ifdef TRI
|
||||
#ifdef TRI_TYPE_LOWER
|
||||
let res = select(0.0, src[params.offset_src + src_idx], i0 < i1);
|
||||
#elif TRI_TYPE_LOWER_DIAG
|
||||
let res = select(0.0, src[params.offset_src + src_idx], i0 <= i1);
|
||||
#elif TRI_TYPE_UPPER
|
||||
let res = select(0.0, src[params.offset_src + src_idx], i0 > i1);
|
||||
#elif TRI_TYPE_UPPER_DIAG
|
||||
let res = select(0.0, src[params.offset_src + src_idx], i0 >= i1);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef INPLACE
|
||||
src[params.offset_src + src_idx] = res;
|
||||
|
||||
@@ -1294,6 +1294,12 @@ size_t ggml_row_size(enum ggml_type type, int64_t ne) {
|
||||
return ggml_type_size(type)*ne/ggml_blck_size(type);
|
||||
}
|
||||
|
||||
double ggml_type_sizef(enum ggml_type type) {
|
||||
assert(type >= 0);
|
||||
assert(type < GGML_TYPE_COUNT);
|
||||
return ((double)(type_traits[type].type_size))/type_traits[type].blck_size;
|
||||
}
|
||||
|
||||
const char * ggml_type_name(enum ggml_type type) {
|
||||
assert(type >= 0);
|
||||
assert(type < GGML_TYPE_COUNT);
|
||||
|
||||
@@ -425,8 +425,7 @@ class GGUFWriter:
|
||||
fout = self.fout[file_id]
|
||||
|
||||
# pop the first tensor info
|
||||
# TODO: cleaner way to get the first key
|
||||
first_tensor_name = [name for name, _ in zip(self.tensors[file_id].keys(), range(1))][0]
|
||||
first_tensor_name = next(iter(self.tensors[file_id]))
|
||||
ti = self.tensors[file_id].pop(first_tensor_name)
|
||||
assert ti.nbytes == tensor.nbytes
|
||||
|
||||
|
||||
@@ -1 +1 @@
|
||||
553552e1d88be2b214b85e5159eedd39a63e2c34
|
||||
c044a8eeae2591faa0950c8b5e514cbc4bbfc4ca
|
||||
|
||||
@@ -2129,19 +2129,28 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
throw std::runtime_error("cannot find tokenizer vocab in model file\n");
|
||||
}
|
||||
|
||||
const uint32_t n_tokens = gguf_get_arr_n(ctx, token_idx);
|
||||
|
||||
const float * scores = nullptr;
|
||||
const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str());
|
||||
if (score_idx != -1) {
|
||||
const uint32_t n_scores = gguf_get_arr_n(ctx, score_idx);
|
||||
if (n_scores < n_tokens) {
|
||||
throw std::runtime_error("Index out of array bounds for scores (" + std::to_string(n_scores) + " < " + std::to_string(n_tokens) + ")\n");
|
||||
}
|
||||
scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
|
||||
}
|
||||
|
||||
const int * toktypes = nullptr;
|
||||
const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
|
||||
if (toktype_idx != -1) {
|
||||
const uint32_t n_toktypes = gguf_get_arr_n(ctx, toktype_idx);
|
||||
if (n_toktypes < n_tokens) {
|
||||
throw std::runtime_error("Index out of array bounds for toktypes (" + std::to_string(n_toktypes) + " < " + std::to_string(n_tokens) + ")\n");
|
||||
}
|
||||
toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
|
||||
}
|
||||
|
||||
uint32_t n_tokens = gguf_get_arr_n(ctx, token_idx);
|
||||
id_to_token.resize(n_tokens);
|
||||
|
||||
for (uint32_t i = 0; i < n_tokens; i++) {
|
||||
|
||||
@@ -121,6 +121,9 @@ llm_build_bitnet::llm_build_bitnet(const llama_model & model, const llm_graph_pa
|
||||
cur = ggml_add(ctx0, cur, ffn_inp);
|
||||
cb(cur, "l_out", il);
|
||||
|
||||
cur = build_cvec(cur, il);
|
||||
cb(cur, "l_out", il);
|
||||
|
||||
// input for next layer
|
||||
inpL = cur;
|
||||
}
|
||||
|
||||
@@ -111,8 +111,13 @@ llm_build_chatglm::llm_build_chatglm(const llama_model & model, const llm_graph_
|
||||
|
||||
}
|
||||
|
||||
inpL = ggml_add(ctx0, cur, ffn_inp);
|
||||
cb(inpL, "l_out", il);
|
||||
cur = ggml_add(ctx0, cur, ffn_inp);
|
||||
|
||||
cur = build_cvec(cur, il);
|
||||
cb(cur, "l_out", il);
|
||||
|
||||
// input for next layer
|
||||
inpL = cur;
|
||||
}
|
||||
|
||||
cur = build_norm(inpL,
|
||||
|
||||
@@ -86,6 +86,10 @@ llm_build_cogvlm::llm_build_cogvlm(const llama_model & model, const llm_graph_pa
|
||||
cur = ggml_add(ctx0, cur, ffn_inp);
|
||||
cb(cur, "ffn_out", il);
|
||||
|
||||
cur = build_cvec(cur, il);
|
||||
cb(cur, "l_out", il);
|
||||
|
||||
// input for next layer
|
||||
inpL = cur;
|
||||
}
|
||||
|
||||
|
||||
@@ -82,6 +82,7 @@ llm_build_eurobert::llm_build_eurobert(const llama_model & model, const llm_grap
|
||||
|
||||
cur = ggml_add(ctx0, cur, ffn_inp);
|
||||
|
||||
// input for next layer
|
||||
inpL = cur;
|
||||
}
|
||||
cur = inpL;
|
||||
|
||||
@@ -66,8 +66,14 @@ llm_build_jais::llm_build_jais(const llama_model & model, const llm_graph_params
|
||||
LLM_FFN_SILU, LLM_FFN_PAR, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
}
|
||||
inpL = ggml_add(ctx0, cur, ffn_inp);
|
||||
cb(inpL, "l_out", il);
|
||||
|
||||
cur = ggml_add(ctx0, cur, ffn_inp);
|
||||
|
||||
cur = build_cvec(cur, il);
|
||||
cb(cur, "l_out", il);
|
||||
|
||||
// input for next layer
|
||||
inpL = cur;
|
||||
}
|
||||
cur = build_norm(inpL,
|
||||
model.output_norm,
|
||||
|
||||
@@ -362,6 +362,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll
|
||||
cur = build_cvec(cur, il);
|
||||
cb(cur, "l_out", il);
|
||||
|
||||
// input for next layer
|
||||
inpL = cur;
|
||||
}
|
||||
cur = inpL;
|
||||
|
||||
@@ -177,6 +177,9 @@ llm_build_lfm2<iswa>::llm_build_lfm2(const llama_model & model, const llm_graph_
|
||||
cb(ffn_norm_out, "model.layers.{}.ffn_out", il);
|
||||
|
||||
cur = ggml_add(ctx0, cur, ffn_out);
|
||||
|
||||
cur = build_cvec(cur, il);
|
||||
cb(cur, "l_out", il);
|
||||
}
|
||||
|
||||
cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
|
||||
|
||||
@@ -71,6 +71,7 @@ llm_build_plamo2::llm_build_plamo2(const llama_model & model, const llm_graph_pa
|
||||
cur = ggml_add(ctx0, cur, residual);
|
||||
cb(cur, "ffn_residual", il);
|
||||
|
||||
// input for next layer
|
||||
inpL = cur;
|
||||
}
|
||||
|
||||
|
||||
@@ -109,6 +109,8 @@ llm_build_plamo3<iswa>::llm_build_plamo3(const llama_model & model, const llm_gr
|
||||
|
||||
cur = build_cvec(cur, il);
|
||||
cb(cur, "l_out", il);
|
||||
|
||||
// input for next layer
|
||||
inpL = cur;
|
||||
}
|
||||
|
||||
|
||||
@@ -64,6 +64,9 @@ llm_build_qwen35::llm_build_qwen35(const llama_model & model, const llm_graph_pa
|
||||
cur = ggml_add(ctx0, cur, ffn_residual);
|
||||
cb(cur, "post_ffn", il);
|
||||
|
||||
cur = build_cvec(cur, il);
|
||||
cb(cur, "l_out", il);
|
||||
|
||||
// Input for next layer
|
||||
inpL = cur;
|
||||
}
|
||||
|
||||
@@ -64,6 +64,9 @@ llm_build_qwen35moe::llm_build_qwen35moe(const llama_model & model, const llm_gr
|
||||
cur = ggml_add(ctx0, cur, ffn_residual);
|
||||
cb(cur, "post_moe", il);
|
||||
|
||||
cur = build_cvec(cur, il);
|
||||
cb(cur, "l_out", il);
|
||||
|
||||
// Input for next layer
|
||||
inpL = cur;
|
||||
}
|
||||
|
||||
@@ -56,6 +56,9 @@ llm_build_qwen3next::llm_build_qwen3next(const llama_model & model, const llm_gr
|
||||
cur = ggml_add(ctx0, cur, ffn_residual);
|
||||
cb(cur, "post_moe", il);
|
||||
|
||||
cur = build_cvec(cur, il);
|
||||
cb(cur, "l_out", il);
|
||||
|
||||
// Input for next layer
|
||||
inpL = cur;
|
||||
}
|
||||
|
||||
@@ -101,6 +101,7 @@ llm_build_smallthinker<iswa>::llm_build_smallthinker(const llama_model & model,
|
||||
cur = ffn_out;
|
||||
|
||||
cur = ggml_add(ctx0, cur, ffn_inp);
|
||||
|
||||
cur = build_cvec(cur, il);
|
||||
cb(cur, "l_out", il);
|
||||
|
||||
|
||||
@@ -145,9 +145,11 @@ llm_build_step35_iswa::llm_build_step35_iswa(const llama_model & model, const ll
|
||||
cb(cur, "ffn_out", il);
|
||||
}
|
||||
cur = ggml_add(ctx0, cur, ffn_inp);
|
||||
|
||||
cur = build_cvec(cur, il);
|
||||
cb(cur, "l_out", il);
|
||||
|
||||
// input for next layer
|
||||
inpL = cur;
|
||||
}
|
||||
|
||||
|
||||
@@ -41,6 +41,11 @@ struct clip_graph {
|
||||
virtual ~clip_graph() = default;
|
||||
virtual ggml_cgraph * build() = 0;
|
||||
|
||||
// wrapper around ggml_mul_mat, allow hooking (e.g. LoRA, clamping) depending on the model
|
||||
// tensor w should be the weight matrix, and tensor x should be the input
|
||||
virtual ggml_tensor * build_mm(ggml_tensor * w, ggml_tensor * x) const;
|
||||
// TODO: build_mm(w, b, x) to support bias
|
||||
|
||||
//
|
||||
// utility functions
|
||||
//
|
||||
|
||||
@@ -255,6 +255,10 @@ clip_graph::clip_graph(clip_ctx * ctx, const clip_image_f32 & img) :
|
||||
gf = ggml_new_graph_custom(ctx0, ctx->max_nodes, false);
|
||||
}
|
||||
|
||||
ggml_tensor * clip_graph::build_mm(ggml_tensor * w, ggml_tensor * x) const {
|
||||
return ggml_mul_mat(ctx0, w, x);
|
||||
}
|
||||
|
||||
void clip_graph::cb(ggml_tensor * cur, const char * name, int il) const {
|
||||
if (il >= 0) {
|
||||
ggml_format_name(cur, "%s-%d", name, il);
|
||||
@@ -326,7 +330,7 @@ ggml_tensor * clip_graph::build_vit(
|
||||
ggml_tensor * Vcur = nullptr;
|
||||
if (layer.qkv_w != nullptr) {
|
||||
// fused qkv
|
||||
cur = ggml_mul_mat(ctx0, layer.qkv_w, cur);
|
||||
cur = build_mm(layer.qkv_w, cur);
|
||||
if (layer.qkv_b != nullptr) {
|
||||
cur = ggml_add(ctx0, cur, layer.qkv_b);
|
||||
}
|
||||
@@ -360,17 +364,17 @@ ggml_tensor * clip_graph::build_vit(
|
||||
|
||||
} else {
|
||||
// separate q, k, v
|
||||
Qcur = ggml_mul_mat(ctx0, layer.q_w, cur);
|
||||
Qcur = build_mm(layer.q_w, cur);
|
||||
if (layer.q_b) {
|
||||
Qcur = ggml_add(ctx0, Qcur, layer.q_b);
|
||||
}
|
||||
|
||||
Kcur = ggml_mul_mat(ctx0, layer.k_w, cur);
|
||||
Kcur = build_mm(layer.k_w, cur);
|
||||
if (layer.k_b) {
|
||||
Kcur = ggml_add(ctx0, Kcur, layer.k_b);
|
||||
}
|
||||
|
||||
Vcur = ggml_mul_mat(ctx0, layer.v_w, cur);
|
||||
Vcur = build_mm(layer.v_w, cur);
|
||||
if (layer.v_b) {
|
||||
Vcur = ggml_add(ctx0, Vcur, layer.v_b);
|
||||
}
|
||||
@@ -517,7 +521,7 @@ ggml_tensor * clip_graph::build_ffn(
|
||||
ffn_op_type type_op,
|
||||
int il) const {
|
||||
|
||||
ggml_tensor * tmp = up ? ggml_mul_mat(ctx0, up, cur) : cur;
|
||||
ggml_tensor * tmp = up ? build_mm(up, cur) : cur;
|
||||
cb(tmp, "ffn_up", il);
|
||||
|
||||
if (up_b) {
|
||||
@@ -526,7 +530,7 @@ ggml_tensor * clip_graph::build_ffn(
|
||||
}
|
||||
|
||||
if (gate) {
|
||||
cur = ggml_mul_mat(ctx0, gate, cur);
|
||||
cur = build_mm(gate, cur);
|
||||
cb(cur, "ffn_gate", il);
|
||||
|
||||
if (gate_b) {
|
||||
@@ -580,7 +584,7 @@ ggml_tensor * clip_graph::build_ffn(
|
||||
}
|
||||
|
||||
if (down) {
|
||||
cur = ggml_mul_mat(ctx0, down, cur);
|
||||
cur = build_mm(down, cur);
|
||||
}
|
||||
|
||||
if (down_b) {
|
||||
@@ -646,7 +650,7 @@ ggml_tensor * clip_graph::build_attn(
|
||||
cb(cur, "kqv_out", il);
|
||||
|
||||
if (wo) {
|
||||
cur = ggml_mul_mat(ctx0, wo, cur);
|
||||
cur = build_mm(wo, cur);
|
||||
}
|
||||
|
||||
if (wo_b) {
|
||||
|
||||
@@ -19,7 +19,7 @@ ggml_cgraph * clip_graph_cogvlm::build() {
|
||||
auto & layer = model.layers[il];
|
||||
ggml_tensor * cur = inpL;
|
||||
|
||||
cur = ggml_mul_mat(ctx0, layer.qkv_w, cur);
|
||||
cur = build_mm(layer.qkv_w, cur);
|
||||
|
||||
cur = ggml_add(ctx0, cur, layer.qkv_b);
|
||||
|
||||
@@ -67,7 +67,7 @@ ggml_cgraph * clip_graph_cogvlm::build() {
|
||||
ggml_row_size(inpL->type, n_embd), 0);
|
||||
|
||||
// Multiply with mm_model_proj
|
||||
cur = ggml_mul_mat(ctx0, model.mm_model_proj, cur);
|
||||
cur = build_mm(model.mm_model_proj, cur);
|
||||
|
||||
// Apply layernorm, weight, bias
|
||||
cur = build_norm(cur, model.mm_post_fc_norm_w, model.mm_post_fc_norm_b, NORM_TYPE_NORMAL, 1e-5, -1);
|
||||
@@ -76,16 +76,16 @@ ggml_cgraph * clip_graph_cogvlm::build() {
|
||||
cur = ggml_gelu_inplace(ctx0, cur);
|
||||
|
||||
// Branch 1: multiply with mm_h_to_4h_w
|
||||
ggml_tensor * h_to_4h = ggml_mul_mat(ctx0, model.mm_h_to_4h_w, cur);
|
||||
ggml_tensor * h_to_4h = build_mm(model.mm_h_to_4h_w, cur);
|
||||
|
||||
// Branch 2: multiply with mm_gate_w
|
||||
ggml_tensor * gate = ggml_mul_mat(ctx0, model.mm_gate_w, cur);
|
||||
ggml_tensor * gate = build_mm(model.mm_gate_w, cur);
|
||||
|
||||
// Apply silu
|
||||
gate = ggml_swiglu_split(ctx0, gate, h_to_4h);
|
||||
|
||||
// Apply mm_4h_to_h_w
|
||||
cur = ggml_mul_mat(ctx0, model.mm_4h_to_h_w, gate);
|
||||
cur = build_mm(model.mm_4h_to_h_w, gate);
|
||||
|
||||
// Concatenate with boi and eoi
|
||||
cur = ggml_concat(ctx0, model.mm_boi, cur, 1);
|
||||
|
||||
@@ -56,7 +56,7 @@ ggml_cgraph * clip_graph_conformer::build() {
|
||||
cur = ggml_reshape_2d(ctx0, cur, cur->ne[0] * cur->ne[1], cur->ne[2]);
|
||||
|
||||
// calculate out
|
||||
cur = ggml_mul_mat(ctx0, model.pre_encode_out_w, cur);
|
||||
cur = build_mm(model.pre_encode_out_w, cur);
|
||||
cur = ggml_add(ctx0, cur, model.pre_encode_out_b);
|
||||
cb(cur, "conformer.pre_encode.out", -1);
|
||||
}
|
||||
@@ -87,7 +87,7 @@ ggml_cgraph * clip_graph_conformer::build() {
|
||||
cur = build_norm(residual, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, 1e-5, il);
|
||||
cb(cur, "conformer.layers.{}.norm_self_att", il);
|
||||
|
||||
ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.q_w, cur);
|
||||
ggml_tensor * Qcur = build_mm(layer.q_w, cur);
|
||||
Qcur = ggml_add(ctx0, Qcur, layer.q_b);
|
||||
Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, Qcur->ne[1]);
|
||||
ggml_tensor * Q_bias_u = ggml_add(ctx0, Qcur, layer.pos_bias_u);
|
||||
@@ -96,12 +96,12 @@ ggml_cgraph * clip_graph_conformer::build() {
|
||||
Q_bias_v = ggml_permute(ctx0, Q_bias_v, 0, 2, 1, 3);
|
||||
|
||||
// TODO @ngxson : some cont can/should be removed when ggml_mul_mat support these cases
|
||||
ggml_tensor * Kcur = ggml_mul_mat(ctx0, layer.k_w, cur);
|
||||
ggml_tensor * Kcur = build_mm(layer.k_w, cur);
|
||||
Kcur = ggml_add(ctx0, Kcur, layer.k_b);
|
||||
Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, Kcur->ne[1]);
|
||||
Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 0, 2, 1, 3));
|
||||
|
||||
ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.v_w, cur);
|
||||
ggml_tensor * Vcur = build_mm(layer.v_w, cur);
|
||||
Vcur = ggml_add(ctx0, Vcur, layer.v_b);
|
||||
Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, Vcur->ne[1]);
|
||||
Vcur = ggml_cont(ctx0, ggml_permute(ctx0, Vcur, 1, 2, 0, 3));
|
||||
@@ -111,7 +111,7 @@ ggml_cgraph * clip_graph_conformer::build() {
|
||||
matrix_ac = ggml_cont(ctx0, ggml_permute(ctx0, matrix_ac, 1, 0, 2, 3));
|
||||
cb(matrix_ac, "conformer.layers.{}.self_attn.id3", il);
|
||||
|
||||
auto * p = ggml_mul_mat(ctx0, layer.linear_pos_w, pos_emb);
|
||||
auto * p = build_mm(layer.linear_pos_w, pos_emb);
|
||||
cb(p, "conformer.layers.{}.self_attn.linear_pos", il);
|
||||
p = ggml_reshape_3d(ctx0, p, d_head, n_head, p->ne[1]);
|
||||
p = ggml_permute(ctx0, p, 0, 2, 1, 3);
|
||||
@@ -143,7 +143,7 @@ ggml_cgraph * clip_graph_conformer::build() {
|
||||
x = ggml_permute(ctx0, x, 2, 0, 1, 3);
|
||||
x = ggml_cont_2d(ctx0, x, x->ne[0] * x->ne[1], x->ne[2]);
|
||||
|
||||
ggml_tensor * out = ggml_mul_mat(ctx0, layer.o_w, x);
|
||||
ggml_tensor * out = build_mm(layer.o_w, x);
|
||||
out = ggml_add(ctx0, out, layer.o_b);
|
||||
cb(out, "conformer.layers.{}.self_attn.linear_out", il);
|
||||
|
||||
@@ -157,7 +157,7 @@ ggml_cgraph * clip_graph_conformer::build() {
|
||||
// conv
|
||||
{
|
||||
auto * x = cur;
|
||||
x = ggml_mul_mat(ctx0, layer.conv_pw1_w, x);
|
||||
x = build_mm(layer.conv_pw1_w, x);
|
||||
x = ggml_add(ctx0, x, layer.conv_pw1_b);
|
||||
cb(x, "conformer.layers.{}.conv.pointwise_conv1", il);
|
||||
|
||||
@@ -181,7 +181,7 @@ ggml_cgraph * clip_graph_conformer::build() {
|
||||
x = ggml_silu(ctx0, x);
|
||||
|
||||
// pointwise_conv2
|
||||
x = ggml_mul_mat(ctx0, layer.conv_pw2_w, x);
|
||||
x = build_mm(layer.conv_pw2_w, x);
|
||||
x = ggml_add(ctx0, x, layer.conv_pw2_b);
|
||||
|
||||
cur = x;
|
||||
|
||||
@@ -97,7 +97,7 @@ ggml_cgraph * clip_graph_glm4v::build() {
|
||||
|
||||
// FC projector
|
||||
{
|
||||
cur = ggml_mul_mat(ctx0, model.projection, cur);
|
||||
cur = build_mm(model.projection, cur);
|
||||
// default LayerNorm (post_projection_norm)
|
||||
cur = build_norm(cur, model.mm_post_norm_w, model.mm_post_norm_b, NORM_TYPE_NORMAL, 1e-5, -1);
|
||||
cur = ggml_gelu_erf(ctx0, cur);
|
||||
|
||||
@@ -22,7 +22,7 @@ ggml_cgraph * clip_graph_llama4::build() {
|
||||
ggml_tensor * kernel = ggml_reshape_4d(ctx0, model.patch_embeddings_0,
|
||||
patch_size, patch_size, 3, n_embd);
|
||||
inp = ggml_im2col(ctx0, kernel, inp, patch_size, patch_size, 0, 0, 1, 1, true, inp->type);
|
||||
inp = ggml_mul_mat(ctx0, model.patch_embeddings_0, inp);
|
||||
inp = build_mm(model.patch_embeddings_0, inp);
|
||||
inp = ggml_reshape_2d(ctx0, inp, n_embd, n_patches);
|
||||
cb(inp, "patch_conv", -1);
|
||||
}
|
||||
@@ -78,15 +78,15 @@ ggml_cgraph * clip_graph_llama4::build() {
|
||||
|
||||
// based on Llama4VisionMLP2 (always uses GELU activation, no bias)
|
||||
{
|
||||
cur = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, cur);
|
||||
cur = build_mm(model.mm_model_mlp_1_w, cur);
|
||||
cur = ggml_gelu(ctx0, cur);
|
||||
cur = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, cur);
|
||||
cur = build_mm(model.mm_model_mlp_2_w, cur);
|
||||
cur = ggml_gelu(ctx0, cur);
|
||||
cb(cur, "adapter_mlp", -1);
|
||||
}
|
||||
|
||||
// Llama4MultiModalProjector
|
||||
cur = ggml_mul_mat(ctx0, model.mm_model_proj, cur);
|
||||
cur = build_mm(model.mm_model_proj, cur);
|
||||
cb(cur, "projected", -1);
|
||||
|
||||
// build the graph
|
||||
|
||||
@@ -70,17 +70,17 @@ ggml_cgraph * clip_graph_llava::build() {
|
||||
|
||||
// self-attention
|
||||
{
|
||||
ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.q_w, cur);
|
||||
ggml_tensor * Qcur = build_mm(layer.q_w, cur);
|
||||
if (layer.q_b) {
|
||||
Qcur = ggml_add(ctx0, Qcur, layer.q_b);
|
||||
}
|
||||
|
||||
ggml_tensor * Kcur = ggml_mul_mat(ctx0, layer.k_w, cur);
|
||||
ggml_tensor * Kcur = build_mm(layer.k_w, cur);
|
||||
if (layer.k_b) {
|
||||
Kcur = ggml_add(ctx0, Kcur, layer.k_b);
|
||||
}
|
||||
|
||||
ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.v_w, cur);
|
||||
ggml_tensor * Vcur = build_mm(layer.v_w, cur);
|
||||
if (layer.v_b) {
|
||||
Vcur = ggml_add(ctx0, Vcur, layer.v_b);
|
||||
}
|
||||
@@ -164,17 +164,17 @@ ggml_cgraph * clip_graph_llava::build() {
|
||||
|
||||
// llava projector
|
||||
if (proj_type == PROJECTOR_TYPE_MLP) {
|
||||
embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
|
||||
embeddings = build_mm(model.mm_0_w, embeddings);
|
||||
embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
|
||||
|
||||
embeddings = ggml_gelu(ctx0, embeddings);
|
||||
if (model.mm_2_w) {
|
||||
embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
|
||||
embeddings = build_mm(model.mm_2_w, embeddings);
|
||||
embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
|
||||
}
|
||||
}
|
||||
else if (proj_type == PROJECTOR_TYPE_MLP_NORM) {
|
||||
embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
|
||||
embeddings = build_mm(model.mm_0_w, embeddings);
|
||||
embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
|
||||
// ggml_tensor_printf(embeddings, "mm_0_w",0,true,false);
|
||||
// First LayerNorm
|
||||
@@ -186,7 +186,7 @@ ggml_cgraph * clip_graph_llava::build() {
|
||||
embeddings = ggml_gelu(ctx0, embeddings);
|
||||
|
||||
// Second linear layer
|
||||
embeddings = ggml_mul_mat(ctx0, model.mm_3_w, embeddings);
|
||||
embeddings = build_mm(model.mm_3_w, embeddings);
|
||||
embeddings = ggml_add(ctx0, embeddings, model.mm_3_b);
|
||||
|
||||
// Second LayerNorm
|
||||
@@ -197,10 +197,10 @@ ggml_cgraph * clip_graph_llava::build() {
|
||||
else if (proj_type == PROJECTOR_TYPE_LDP) {
|
||||
// MobileVLM projector
|
||||
int n_patch = 24;
|
||||
ggml_tensor * mlp_1 = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, embeddings);
|
||||
ggml_tensor * mlp_1 = build_mm(model.mm_model_mlp_1_w, embeddings);
|
||||
mlp_1 = ggml_add(ctx0, mlp_1, model.mm_model_mlp_1_b);
|
||||
mlp_1 = ggml_gelu(ctx0, mlp_1);
|
||||
ggml_tensor * mlp_3 = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, mlp_1);
|
||||
ggml_tensor * mlp_3 = build_mm(model.mm_model_mlp_3_w, mlp_1);
|
||||
mlp_3 = ggml_add(ctx0, mlp_3, model.mm_model_mlp_3_b);
|
||||
// mlp_3 shape = [1, 576, 2048], ne = [2048, 576, 1, 1]
|
||||
|
||||
@@ -229,10 +229,10 @@ ggml_cgraph * clip_graph_llava::build() {
|
||||
// block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
|
||||
// pointwise conv
|
||||
block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]);
|
||||
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc1_w, block_1);
|
||||
block_1 = build_mm(model.mm_model_block_1_block_1_fc1_w, block_1);
|
||||
block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc1_b);
|
||||
block_1 = ggml_relu(ctx0, block_1);
|
||||
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc2_w, block_1);
|
||||
block_1 = build_mm(model.mm_model_block_1_block_1_fc2_w, block_1);
|
||||
block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc2_b);
|
||||
block_1 = ggml_hardsigmoid(ctx0, block_1);
|
||||
// block_1_hw shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1], block_1 shape = [1, 2048], ne = [2048, 1, 1, 1]
|
||||
@@ -244,7 +244,7 @@ ggml_cgraph * clip_graph_llava::build() {
|
||||
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));
|
||||
|
||||
// block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1]
|
||||
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_2_0_w, block_1);
|
||||
block_1 = build_mm(model.mm_model_block_1_block_2_0_w, block_1);
|
||||
block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]);
|
||||
|
||||
// block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
|
||||
@@ -277,10 +277,10 @@ ggml_cgraph * clip_graph_llava::build() {
|
||||
// block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
|
||||
// pointwise conv
|
||||
block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]);
|
||||
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc1_w, block_1);
|
||||
block_1 = build_mm(model.mm_model_block_2_block_1_fc1_w, block_1);
|
||||
block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc1_b);
|
||||
block_1 = ggml_relu(ctx0, block_1);
|
||||
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc2_w, block_1);
|
||||
block_1 = build_mm(model.mm_model_block_2_block_1_fc2_w, block_1);
|
||||
block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc2_b);
|
||||
block_1 = ggml_hardsigmoid(ctx0, block_1);
|
||||
|
||||
@@ -292,7 +292,7 @@ ggml_cgraph * clip_graph_llava::build() {
|
||||
block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]);
|
||||
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));
|
||||
// block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1]
|
||||
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_2_0_w, block_1);
|
||||
block_1 = build_mm(model.mm_model_block_2_block_2_0_w, block_1);
|
||||
block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]);
|
||||
|
||||
|
||||
@@ -307,10 +307,10 @@ ggml_cgraph * clip_graph_llava::build() {
|
||||
else if (proj_type == PROJECTOR_TYPE_LDPV2)
|
||||
{
|
||||
int n_patch = 24;
|
||||
ggml_tensor * mlp_0 = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings);
|
||||
ggml_tensor * mlp_0 = build_mm(model.mm_model_mlp_0_w, embeddings);
|
||||
mlp_0 = ggml_add(ctx0, mlp_0, model.mm_model_mlp_0_b);
|
||||
mlp_0 = ggml_gelu(ctx0, mlp_0);
|
||||
ggml_tensor * mlp_2 = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, mlp_0);
|
||||
ggml_tensor * mlp_2 = build_mm(model.mm_model_mlp_2_w, mlp_0);
|
||||
mlp_2 = ggml_add(ctx0, mlp_2, model.mm_model_mlp_2_b);
|
||||
// mlp_2 ne = [2048, 576, 1, 1]
|
||||
// // AVG Pool Layer 2*2, strides = 2
|
||||
@@ -344,15 +344,15 @@ ggml_cgraph * clip_graph_llava::build() {
|
||||
embeddings = ggml_add(ctx0, embeddings, model.mm_model_adapter_conv_b);
|
||||
// GLU
|
||||
{
|
||||
embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings);
|
||||
embeddings = build_mm(model.mm_model_mlp_0_w, embeddings);
|
||||
embeddings = ggml_norm(ctx0, embeddings, eps);
|
||||
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_q_w), model.mm_model_ln_q_b);
|
||||
embeddings = ggml_gelu_inplace(ctx0, embeddings);
|
||||
ggml_tensor * x = embeddings;
|
||||
embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, embeddings);
|
||||
x = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w,x);
|
||||
embeddings = build_mm(model.mm_model_mlp_2_w, embeddings);
|
||||
x = build_mm(model.mm_model_mlp_1_w,x);
|
||||
embeddings = ggml_swiglu_split(ctx0, embeddings, x);
|
||||
embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, embeddings);
|
||||
embeddings = build_mm(model.mm_model_mlp_3_w, embeddings);
|
||||
}
|
||||
// arrangement of BOI/EOI token embeddings
|
||||
// note: these embeddings are not present in text model, hence we cannot process them as text tokens
|
||||
|
||||
@@ -38,7 +38,7 @@ ggml_cgraph * clip_graph_minicpmv::build() {
|
||||
// resampler projector (it is just another transformer)
|
||||
|
||||
ggml_tensor * q = model.mm_model_query;
|
||||
ggml_tensor * v = ggml_mul_mat(ctx0, model.mm_model_kv_proj, embeddings);
|
||||
ggml_tensor * v = build_mm(model.mm_model_kv_proj, embeddings);
|
||||
|
||||
// norm
|
||||
q = build_norm(q, model.mm_model_ln_q_w, model.mm_model_ln_q_b, NORM_TYPE_NORMAL, eps, -1);
|
||||
@@ -77,13 +77,13 @@ ggml_cgraph * clip_graph_minicpmv::build() {
|
||||
// Use actual config value if available, otherwise fall back to hardcoded values
|
||||
int num_query = hparams.minicpmv_query_num;
|
||||
ggml_tensor * Q = ggml_add(ctx0,
|
||||
ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q),
|
||||
build_mm(model.mm_model_attn_q_w, q),
|
||||
model.mm_model_attn_q_b);
|
||||
ggml_tensor * K = ggml_add(ctx0,
|
||||
ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k),
|
||||
build_mm(model.mm_model_attn_k_w, k),
|
||||
model.mm_model_attn_k_b);
|
||||
ggml_tensor * V = ggml_add(ctx0,
|
||||
ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v),
|
||||
build_mm(model.mm_model_attn_v_w, v),
|
||||
model.mm_model_attn_v_b);
|
||||
|
||||
Q = ggml_reshape_3d(ctx0, Q, d_head, n_head, num_query);
|
||||
@@ -105,7 +105,7 @@ ggml_cgraph * clip_graph_minicpmv::build() {
|
||||
embeddings = build_norm(embeddings, model.mm_model_ln_post_w, model.mm_model_ln_post_b, NORM_TYPE_NORMAL, eps, -1);
|
||||
|
||||
// projection
|
||||
embeddings = ggml_mul_mat(ctx0, model.mm_model_proj, embeddings);
|
||||
embeddings = build_mm(model.mm_model_proj, embeddings);
|
||||
|
||||
// build the graph
|
||||
ggml_build_forward_expand(gf, embeddings);
|
||||
|
||||
@@ -429,7 +429,7 @@ ggml_cgraph * clip_graph_mobilenetv5::build() {
|
||||
// PyTorch: embedding_projection = nn.Linear(vision_hidden, text_hidden, bias=False)
|
||||
// Weight stored as [out_features, in_features] = [text_hidden_size, vision_hidden_size]
|
||||
if (model.mm_input_proj_w) {
|
||||
cur = ggml_mul_mat(ctx0, model.mm_input_proj_w, cur);
|
||||
cur = build_mm(model.mm_input_proj_w, cur);
|
||||
}
|
||||
|
||||
// 5. POST PROJECTION NORM
|
||||
|
||||
@@ -43,7 +43,7 @@ ggml_cgraph * clip_graph_pixtral::build() {
|
||||
|
||||
// project to n_embd
|
||||
cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], cur->ne[1] * cur->ne[2]);
|
||||
cur = ggml_mul_mat(ctx0, model.mm_patch_merger_w, cur);
|
||||
cur = build_mm(model.mm_patch_merger_w, cur);
|
||||
}
|
||||
|
||||
// LlavaMultiModalProjector (always using GELU activation)
|
||||
|
||||
@@ -90,11 +90,11 @@ ggml_cgraph * clip_graph_qwen2vl::build() {
|
||||
// self-attention
|
||||
{
|
||||
ggml_tensor * Qcur = ggml_add(ctx0,
|
||||
ggml_mul_mat(ctx0, layer.q_w, cur), layer.q_b);
|
||||
build_mm(layer.q_w, cur), layer.q_b);
|
||||
ggml_tensor * Kcur = ggml_add(ctx0,
|
||||
ggml_mul_mat(ctx0, layer.k_w, cur), layer.k_b);
|
||||
build_mm(layer.k_w, cur), layer.k_b);
|
||||
ggml_tensor * Vcur = ggml_add(ctx0,
|
||||
ggml_mul_mat(ctx0, layer.v_w, cur), layer.v_b);
|
||||
build_mm(layer.v_w, cur), layer.v_b);
|
||||
|
||||
Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_patches);
|
||||
Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_patches);
|
||||
|
||||
@@ -85,7 +85,7 @@ ggml_cgraph * clip_graph_qwen3vl::build() {
|
||||
|
||||
// self-attention
|
||||
{
|
||||
cur = ggml_mul_mat(ctx0, layer.qkv_w, cur);
|
||||
cur = build_mm(layer.qkv_w, cur);
|
||||
cur = ggml_add(ctx0, cur, layer.qkv_b);
|
||||
|
||||
ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos,
|
||||
|
||||
@@ -43,7 +43,7 @@ ggml_cgraph * clip_graph_siglip::build() {
|
||||
// https://github.com/huggingface/transformers/blob/0a950e0bbe1ed58d5401a6b547af19f15f0c195e/src/transformers/models/idefics3/modeling_idefics3.py#L578
|
||||
const int scale_factor = model.hparams.n_merge;
|
||||
cur = build_patch_merge_permute(cur, scale_factor);
|
||||
cur = ggml_mul_mat(ctx0, model.projection, cur);
|
||||
cur = build_mm(model.projection, cur);
|
||||
|
||||
} else if (proj_type == PROJECTOR_TYPE_LFM2) {
|
||||
// pixel unshuffle block
|
||||
|
||||
@@ -59,7 +59,7 @@ ggml_cgraph * clip_graph_whisper_enc::build() {
|
||||
cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w);
|
||||
|
||||
// ffn in
|
||||
cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
|
||||
cur = build_mm(model.mm_1_w, cur);
|
||||
|
||||
// swiglu
|
||||
// see SwiGLU in ultravox_model.py, the second half passed through is silu, not the first half
|
||||
@@ -70,11 +70,11 @@ ggml_cgraph * clip_graph_whisper_enc::build() {
|
||||
cur = ggml_mul(ctx0, cur, model.mm_norm_mid_w);
|
||||
|
||||
// ffn out
|
||||
cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
|
||||
cur = build_mm(model.mm_2_w, cur);
|
||||
|
||||
} else if (proj_type == PROJECTOR_TYPE_QWEN2A) {
|
||||
// projector
|
||||
cur = ggml_mul_mat(ctx0, model.mm_fc_w, cur);
|
||||
cur = build_mm(model.mm_fc_w, cur);
|
||||
cur = ggml_add(ctx0, cur, model.mm_fc_b);
|
||||
|
||||
} else if (proj_type == PROJECTOR_TYPE_VOXTRAL) {
|
||||
|
||||
@@ -43,7 +43,7 @@ ggml_cgraph * clip_graph_youtuvl::build() {
|
||||
ctx0, inp,
|
||||
3*patch_size* patch_size, Hm * Wm * m * m, 1);
|
||||
}
|
||||
inp = ggml_mul_mat(ctx0, model.patch_embeddings_0, inp);
|
||||
inp = build_mm(model.patch_embeddings_0, inp);
|
||||
|
||||
if (model.patch_bias) {
|
||||
inp = ggml_add(ctx0, inp, model.patch_bias);
|
||||
@@ -97,11 +97,11 @@ ggml_cgraph * clip_graph_youtuvl::build() {
|
||||
// self-attention
|
||||
{
|
||||
ggml_tensor * Qcur = ggml_add(ctx0,
|
||||
ggml_mul_mat(ctx0, layer.q_w, cur), layer.q_b);
|
||||
build_mm(layer.q_w, cur), layer.q_b);
|
||||
ggml_tensor * Kcur = ggml_add(ctx0,
|
||||
ggml_mul_mat(ctx0, layer.k_w, cur), layer.k_b);
|
||||
build_mm(layer.k_w, cur), layer.k_b);
|
||||
ggml_tensor * Vcur = ggml_add(ctx0,
|
||||
ggml_mul_mat(ctx0, layer.v_w, cur), layer.v_b);
|
||||
build_mm(layer.v_w, cur), layer.v_b);
|
||||
|
||||
Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_patches);
|
||||
Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_patches);
|
||||
|
||||
Binary file not shown.
@@ -148,7 +148,7 @@
|
||||
</Tooltip.Trigger>
|
||||
|
||||
<Tooltip.Content side="right">
|
||||
<p>Images require vision models to be processed</p>
|
||||
<p>Image processing requires a vision model</p>
|
||||
</Tooltip.Content>
|
||||
</Tooltip.Root>
|
||||
{/if}
|
||||
@@ -173,7 +173,7 @@
|
||||
</Tooltip.Trigger>
|
||||
|
||||
<Tooltip.Content side="right">
|
||||
<p>Audio files require audio models to be processed</p>
|
||||
<p>Audio files processing requires an audio model</p>
|
||||
</Tooltip.Content>
|
||||
</Tooltip.Root>
|
||||
{/if}
|
||||
|
||||
@@ -28,6 +28,11 @@
|
||||
|
||||
let parsed = $derived(ModelsService.parseModelId(modelId));
|
||||
let resolvedShowRaw = $derived(showRaw ?? (config().showRawModelNames as boolean) ?? false);
|
||||
let displayName = $derived(
|
||||
aliases && aliases.length > 0 ? aliases[0] : (parsed.modelName ?? modelId)
|
||||
);
|
||||
let remainingAliases = $derived(aliases && aliases.length > 1 ? aliases.slice(1) : []);
|
||||
let allTags = $derived([...(parsed.tags ?? []), ...(tags ?? [])]);
|
||||
</script>
|
||||
|
||||
{#if resolvedShowRaw}
|
||||
@@ -35,7 +40,7 @@
|
||||
{:else}
|
||||
<span class="flex min-w-0 flex-wrap items-center gap-1 {className}">
|
||||
<span class="min-w-0 truncate font-medium">
|
||||
{#if showOrgName && parsed.orgName}{parsed.orgName}/{/if}{parsed.modelName ?? modelId}
|
||||
{#if showOrgName && parsed.orgName && !(aliases && aliases.length > 0)}{parsed.orgName}/{/if}{displayName}
|
||||
</span>
|
||||
|
||||
{#if parsed.params}
|
||||
@@ -50,14 +55,14 @@
|
||||
</span>
|
||||
{/if}
|
||||
|
||||
{#if aliases && aliases.length > 0}
|
||||
{#each aliases as alias (alias)}
|
||||
{#if remainingAliases.length > 0}
|
||||
{#each remainingAliases as alias (alias)}
|
||||
<span class={badgeClass}>{alias}</span>
|
||||
{/each}
|
||||
{/if}
|
||||
|
||||
{#if tags && tags.length > 0}
|
||||
{#each tags as tag (tag)}
|
||||
{#if allTags.length > 0}
|
||||
{#each allTags as tag (tag)}
|
||||
<span class={tagBadgeClass}>{tag}</span>
|
||||
{/each}
|
||||
{/if}
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
export const CONFIG_LOCALSTORAGE_KEY = 'LlamaCppWebui.config';
|
||||
export const USER_OVERRIDES_LOCALSTORAGE_KEY = 'LlamaCppWebui.userOverrides';
|
||||
export const FAVOURITE_MODELS_LOCALSTORAGE_KEY = 'LlamaCppWebui.favouriteModels';
|
||||
export const MCP_DEFAULT_ENABLED_LOCALSTORAGE_KEY = 'LlamaCppWebui.mcpDefaultEnabled';
|
||||
|
||||
@@ -11,10 +11,16 @@ export const MODEL_ID_SEGMENT_SEPARATOR = '-';
|
||||
export const MODEL_ID_QUANTIZATION_SEPARATOR = ':';
|
||||
|
||||
/**
|
||||
* Matches a trailing ALL-CAPS format segment, e.g. `GGUF`, `BF16`, `Q4_K_M`.
|
||||
* Must be at least 2 uppercase letters, optionally followed by uppercase letters or digits.
|
||||
* Matches a quantization/precision segment, e.g. `Q4_K_M`, `IQ4_XS`, `F16`, `BF16`, `MXFP4`.
|
||||
* Case-insensitive to handle both uppercase and lowercase inputs.
|
||||
*/
|
||||
export const MODEL_FORMAT_SEGMENT_RE = /^[A-Z]{2,}[A-Z0-9]*$/;
|
||||
export const MODEL_QUANTIZATION_SEGMENT_RE =
|
||||
/^(I?Q\d+(_[A-Z0-9]+)*|F\d+|BF\d+|MXFP\d+(_[A-Z0-9]+)*)$/i;
|
||||
|
||||
/**
|
||||
* Matches prefix for custom quantization types, e.g. `UD-Q8_K_XL`.
|
||||
*/
|
||||
export const MODEL_CUSTOM_QUANTIZATION_PREFIX_RE = /^UD$/i;
|
||||
|
||||
/**
|
||||
* Matches a parameter-count segment, e.g. `7B`, `1.5b`, `120M`.
|
||||
@@ -22,7 +28,12 @@ export const MODEL_FORMAT_SEGMENT_RE = /^[A-Z]{2,}[A-Z0-9]*$/;
|
||||
export const MODEL_PARAMS_RE = /^\d+(\.\d+)?[BbMmKkTt]$/;
|
||||
|
||||
/**
|
||||
* Matches an activated-parameter-count segment, e.g. `A10B`, `A2.4b`.
|
||||
* The leading `A` distinguishes it from a regular params segment.
|
||||
* Matches an activated-parameter-count segment, e.g. `A10B`, `a2.4b`.
|
||||
* The leading `A`/`a` distinguishes it from a regular params segment.
|
||||
*/
|
||||
export const MODEL_ACTIVATED_PARAMS_RE = /^A\d+(\.\d+)?[BbMmKkTt]$/;
|
||||
export const MODEL_ACTIVATED_PARAMS_RE = /^[Aa]\d+(\.\d+)?[BbMmKkTt]$/;
|
||||
|
||||
/**
|
||||
* Container format segments to exclude from tags (every model uses these).
|
||||
*/
|
||||
export const MODEL_IGNORED_SEGMENTS = new Set(['GGUF', 'GGML']);
|
||||
|
||||
@@ -2,9 +2,11 @@ import { ServerModelStatus } from '$lib/enums';
|
||||
import { apiFetch, apiPost } from '$lib/utils';
|
||||
import type { ParsedModelId } from '$lib/types/models';
|
||||
import {
|
||||
MODEL_FORMAT_SEGMENT_RE,
|
||||
MODEL_QUANTIZATION_SEGMENT_RE,
|
||||
MODEL_CUSTOM_QUANTIZATION_PREFIX_RE,
|
||||
MODEL_PARAMS_RE,
|
||||
MODEL_ACTIVATED_PARAMS_RE,
|
||||
MODEL_IGNORED_SEGMENTS,
|
||||
MODEL_ID_NOT_FOUND,
|
||||
MODEL_ID_ORG_SEPARATOR,
|
||||
MODEL_ID_SEGMENT_SEPARATOR,
|
||||
@@ -119,8 +121,9 @@ export class ModelsService {
|
||||
/**
|
||||
* Parse a model ID string into its structured components.
|
||||
*
|
||||
* Handles the convention:
|
||||
* `<org>/<ModelName>-<Parameters>(-<ActivatedParameters>)-<Format>:<QuantizationType>`
|
||||
* Handles conventions like:
|
||||
* `<org>/<ModelName>-<Parameters>(-<ActivatedParameters>)(-<Tags>)(-<Quantization>):<Quantization>`
|
||||
* `<ModelName>.<Quantization>` (dot-separated quantization, e.g. `model.Q4_K_M`)
|
||||
*
|
||||
* @param modelId - Raw model identifier string
|
||||
* @returns Structured {@link ParsedModelId} with all detected fields
|
||||
@@ -132,11 +135,11 @@ export class ModelsService {
|
||||
modelName: null,
|
||||
params: null,
|
||||
activatedParams: null,
|
||||
format: null,
|
||||
quantization: null,
|
||||
tags: []
|
||||
};
|
||||
|
||||
// 1. Extract colon-separated quantization (e.g. `model:Q4_K_M`)
|
||||
const colonIdx = modelId.indexOf(MODEL_ID_QUANTIZATION_SEPARATOR);
|
||||
let modelPath: string;
|
||||
|
||||
@@ -147,6 +150,7 @@ export class ModelsService {
|
||||
modelPath = modelId;
|
||||
}
|
||||
|
||||
// 2. Extract org name (e.g. `org/model` -> org = "org")
|
||||
const slashIdx = modelPath.indexOf(MODEL_ID_ORG_SEPARATOR);
|
||||
let modelStr: string;
|
||||
|
||||
@@ -157,37 +161,66 @@ export class ModelsService {
|
||||
modelStr = modelPath;
|
||||
}
|
||||
|
||||
const segments = modelStr.split(MODEL_ID_SEGMENT_SEPARATOR);
|
||||
// 3. Handle dot-separated quantization (e.g. `model-name.Q4_K_M`)
|
||||
const dotIdx = modelStr.lastIndexOf('.');
|
||||
|
||||
if (segments.length > 0 && MODEL_FORMAT_SEGMENT_RE.test(segments[segments.length - 1])) {
|
||||
result.format = segments.pop()!;
|
||||
if (dotIdx !== MODEL_ID_NOT_FOUND && !result.quantization) {
|
||||
const afterDot = modelStr.slice(dotIdx + 1);
|
||||
|
||||
if (MODEL_QUANTIZATION_SEGMENT_RE.test(afterDot)) {
|
||||
result.quantization = afterDot;
|
||||
modelStr = modelStr.slice(0, dotIdx);
|
||||
}
|
||||
}
|
||||
|
||||
const paramsRe = MODEL_PARAMS_RE;
|
||||
const activatedParamsRe = MODEL_ACTIVATED_PARAMS_RE;
|
||||
const segments = modelStr.split(MODEL_ID_SEGMENT_SEPARATOR);
|
||||
|
||||
// 4. Detect trailing quantization from dash-separated segments
|
||||
// Handle UD-prefixed quantization (e.g. `UD-Q8_K_XL`) and
|
||||
// standalone quantization (e.g. `Q4_K_M`, `BF16`, `F16`, `MXFP4`)
|
||||
if (!result.quantization && segments.length > 1) {
|
||||
const last = segments[segments.length - 1];
|
||||
const secondLast = segments.length > 2 ? segments[segments.length - 2] : null;
|
||||
|
||||
if (MODEL_QUANTIZATION_SEGMENT_RE.test(last)) {
|
||||
if (secondLast && MODEL_CUSTOM_QUANTIZATION_PREFIX_RE.test(secondLast)) {
|
||||
result.quantization = `${secondLast}-${last}`;
|
||||
segments.splice(segments.length - 2, 2);
|
||||
} else {
|
||||
result.quantization = last;
|
||||
segments.pop();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 5. Find params and activated params
|
||||
let paramsIdx = MODEL_ID_NOT_FOUND;
|
||||
let activatedParamsIdx = MODEL_ID_NOT_FOUND;
|
||||
|
||||
for (let i = 0; i < segments.length; i++) {
|
||||
const seg = segments[i];
|
||||
if (paramsIdx === -1 && paramsRe.test(seg)) {
|
||||
|
||||
if (paramsIdx === MODEL_ID_NOT_FOUND && MODEL_PARAMS_RE.test(seg)) {
|
||||
paramsIdx = i;
|
||||
result.params = seg.toUpperCase();
|
||||
} else if (activatedParamsRe.test(seg)) {
|
||||
} else if (paramsIdx !== MODEL_ID_NOT_FOUND && MODEL_ACTIVATED_PARAMS_RE.test(seg)) {
|
||||
activatedParamsIdx = i;
|
||||
result.activatedParams = seg.toUpperCase();
|
||||
}
|
||||
}
|
||||
|
||||
// 6. Model name = segments before params; tags = remaining segments after params
|
||||
const pivotIdx = paramsIdx !== MODEL_ID_NOT_FOUND ? paramsIdx : segments.length;
|
||||
|
||||
result.modelName = segments.slice(0, pivotIdx).join(MODEL_ID_SEGMENT_SEPARATOR) || null;
|
||||
|
||||
if (paramsIdx !== MODEL_ID_NOT_FOUND) {
|
||||
result.tags = segments
|
||||
.slice(paramsIdx + 1)
|
||||
.filter((_, relIdx) => paramsIdx + 1 + relIdx !== activatedParamsIdx);
|
||||
result.tags = segments.slice(paramsIdx + 1).filter((_, relIdx) => {
|
||||
const absIdx = paramsIdx + 1 + relIdx;
|
||||
if (absIdx === activatedParamsIdx) return false;
|
||||
|
||||
return !MODEL_IGNORED_SEGMENTS.has(segments[absIdx].toUpperCase());
|
||||
});
|
||||
}
|
||||
|
||||
return result;
|
||||
|
||||
@@ -36,7 +36,8 @@ import {
|
||||
ISO_TIME_SEPARATOR,
|
||||
ISO_TIME_SEPARATOR_REPLACEMENT,
|
||||
NON_ALPHANUMERIC_REGEX,
|
||||
MULTIPLE_UNDERSCORE_REGEX
|
||||
MULTIPLE_UNDERSCORE_REGEX,
|
||||
MCP_DEFAULT_ENABLED_LOCALSTORAGE_KEY
|
||||
} from '$lib/constants';
|
||||
|
||||
class ConversationsStore {
|
||||
@@ -61,7 +62,37 @@ class ConversationsStore {
|
||||
isInitialized = $state(false);
|
||||
|
||||
/** Pending MCP server overrides for new conversations (before first message) */
|
||||
pendingMcpServerOverrides = $state<McpServerOverride[]>([]);
|
||||
pendingMcpServerOverrides = $state<McpServerOverride[]>(ConversationsStore.loadMcpDefaults());
|
||||
|
||||
/** Load MCP default overrides from localStorage */
|
||||
private static loadMcpDefaults(): McpServerOverride[] {
|
||||
if (typeof globalThis.localStorage === 'undefined') return [];
|
||||
try {
|
||||
const raw = localStorage.getItem(MCP_DEFAULT_ENABLED_LOCALSTORAGE_KEY);
|
||||
if (!raw) return [];
|
||||
const parsed = JSON.parse(raw);
|
||||
if (!Array.isArray(parsed)) return [];
|
||||
return parsed.filter(
|
||||
(o: unknown) => typeof o === 'object' && o !== null && 'serverId' in o && 'enabled' in o
|
||||
) as McpServerOverride[];
|
||||
} catch {
|
||||
return [];
|
||||
}
|
||||
}
|
||||
|
||||
/** Persist MCP default overrides to localStorage */
|
||||
private saveMcpDefaults(): void {
|
||||
if (typeof globalThis.localStorage === 'undefined') return;
|
||||
const plain = this.pendingMcpServerOverrides.map((o) => ({
|
||||
serverId: o.serverId,
|
||||
enabled: o.enabled
|
||||
}));
|
||||
if (plain.length > 0) {
|
||||
localStorage.setItem(MCP_DEFAULT_ENABLED_LOCALSTORAGE_KEY, JSON.stringify(plain));
|
||||
} else {
|
||||
localStorage.removeItem(MCP_DEFAULT_ENABLED_LOCALSTORAGE_KEY);
|
||||
}
|
||||
}
|
||||
|
||||
/** Callback for title update confirmation dialog */
|
||||
titleUpdateConfirmationCallback?: (currentTitle: string, newTitle: string) => Promise<boolean>;
|
||||
@@ -261,6 +292,8 @@ class ConversationsStore {
|
||||
clearActiveConversation(): void {
|
||||
this.activeConversation = null;
|
||||
this.activeMessages = [];
|
||||
// reload MCP defaults so new chats inherit persisted state
|
||||
this.pendingMcpServerOverrides = ConversationsStore.loadMcpDefaults();
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -597,6 +630,7 @@ class ConversationsStore {
|
||||
this.pendingMcpServerOverrides = [...this.pendingMcpServerOverrides, { serverId, enabled }];
|
||||
}
|
||||
}
|
||||
this.saveMcpDefaults();
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -621,6 +655,7 @@ class ConversationsStore {
|
||||
*/
|
||||
clearPendingMcpServerOverrides(): void {
|
||||
this.pendingMcpServerOverrides = [];
|
||||
this.saveMcpDefaults();
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
@@ -208,23 +208,16 @@ class MCPStore {
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks if a server is enabled, considering per-chat overrides.
|
||||
* Checks if a server is enabled for a given chat.
|
||||
* Only per-chat overrides (persisted in localStorage for new chats,
|
||||
* or in IndexedDB for existing conversations) control enabled state.
|
||||
*/
|
||||
#checkServerEnabled(
|
||||
server: MCPServerSettingsEntry,
|
||||
perChatOverrides?: McpServerOverride[]
|
||||
): boolean {
|
||||
if (!server.enabled) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (perChatOverrides) {
|
||||
const override = perChatOverrides.find((o) => o.serverId === server.id);
|
||||
|
||||
return override?.enabled ?? false;
|
||||
}
|
||||
|
||||
return false;
|
||||
const override = perChatOverrides?.find((o) => o.serverId === server.id);
|
||||
return override?.enabled ?? false;
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -570,18 +563,8 @@ class MCPStore {
|
||||
getEnabledServersForConversation(
|
||||
perChatOverrides?: McpServerOverride[]
|
||||
): MCPServerSettingsEntry[] {
|
||||
if (!perChatOverrides?.length) {
|
||||
return [];
|
||||
}
|
||||
|
||||
return this.getServers().filter((server) => {
|
||||
if (!server.enabled) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const override = perChatOverrides.find((o) => o.serverId === server.id);
|
||||
|
||||
return override?.enabled ?? false;
|
||||
return this.#checkServerEnabled(server, perChatOverrides);
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
1
tools/server/webui/src/lib/types/models.d.ts
vendored
1
tools/server/webui/src/lib/types/models.d.ts
vendored
@@ -25,7 +25,6 @@ export interface ParsedModelId {
|
||||
modelName: string | null;
|
||||
params: string | null;
|
||||
activatedParams: string | null;
|
||||
format: string | null;
|
||||
quantization: string | null;
|
||||
tags: string[];
|
||||
}
|
||||
|
||||
270
tools/server/webui/tests/unit/model-id-parser.test.ts
Normal file
270
tools/server/webui/tests/unit/model-id-parser.test.ts
Normal file
@@ -0,0 +1,270 @@
|
||||
import { describe, expect, it } from 'vitest';
|
||||
import { ModelsService } from '$lib/services/models.service';
|
||||
|
||||
const { parseModelId } = ModelsService;
|
||||
|
||||
describe('parseModelId', () => {
|
||||
it('handles unknown patterns correctly', () => {
|
||||
expect(parseModelId('model-name-1')).toStrictEqual({
|
||||
activatedParams: null,
|
||||
modelName: 'model-name-1',
|
||||
orgName: null,
|
||||
params: null,
|
||||
quantization: null,
|
||||
raw: 'model-name-1',
|
||||
tags: []
|
||||
});
|
||||
|
||||
expect(parseModelId('org/model-name-2')).toStrictEqual({
|
||||
activatedParams: null,
|
||||
modelName: 'model-name-2',
|
||||
orgName: 'org',
|
||||
params: null,
|
||||
quantization: null,
|
||||
raw: 'org/model-name-2',
|
||||
tags: []
|
||||
});
|
||||
});
|
||||
|
||||
it('extracts model parameters correctly', () => {
|
||||
expect(parseModelId('model-100B-BF16')).toMatchObject({ params: '100B' });
|
||||
expect(parseModelId('model-100B:Q4_K_M')).toMatchObject({ params: '100B' });
|
||||
});
|
||||
|
||||
it('extracts model parameters correctly in lowercase', () => {
|
||||
expect(parseModelId('model-100b-bf16')).toMatchObject({ params: '100B' });
|
||||
expect(parseModelId('model-100b:q4_k_m')).toMatchObject({ params: '100B' });
|
||||
});
|
||||
|
||||
it('extracts activated parameters correctly', () => {
|
||||
expect(parseModelId('model-100B-A10B-BF16')).toMatchObject({ activatedParams: 'A10B' });
|
||||
expect(parseModelId('model-100B-A10B:Q4_K_M')).toMatchObject({ activatedParams: 'A10B' });
|
||||
});
|
||||
|
||||
it('extracts activated parameters correctly in lowercase', () => {
|
||||
expect(parseModelId('model-100b-a10b-bf16')).toMatchObject({ activatedParams: 'A10B' });
|
||||
expect(parseModelId('model-100b-a10b:q4_k_m')).toMatchObject({ activatedParams: 'A10B' });
|
||||
});
|
||||
|
||||
it('extracts quantization correctly', () => {
	// Quantization labels checked after both the dash and the colon separator.
	const quantizations = [
		'UD-IQ1_S',
		'IQ4_XS',
		'Q4_K_M',
		'Q8_0',
		'UD-Q8_K_XL',
		'F16',
		'BF16',
		'MXFP4'
	];

	// Dash-separated quantization first, then colon-separated quantization.
	// The ids are assembled as `model-100B<separator><quantization>`.
	for (const separator of ['-', ':']) {
		for (const quantization of quantizations) {
			const modelId = `model-100B${separator}${quantization}`;

			expect(parseModelId(modelId)).toMatchObject({ quantization });
		}
	}

	// Dot-separated quantization
	const dotSeparated = parseModelId('nomic-embed-text-v2-moe.Q4_K_M');

	expect(dotSeparated).toMatchObject({ quantization: 'Q4_K_M' });
});
|
||||
|
||||
it('extracts additional tags correctly', () => {
	// Each case pairs a model id with the tags expected for it, in order.
	const cases = [
		{ id: 'model-100B-foobar-Q4_K_M', tags: ['foobar'] },
		{ id: 'model-100B-A10B-foobar-1M-BF16', tags: ['foobar', '1M'] },
		{ id: 'model-100B-1M-foobar:UD-Q8_K_XL', tags: ['1M', 'foobar'] }
	];

	for (const { id, tags } of cases) {
		expect(parseModelId(id)).toMatchObject({ tags });
	}
});
|
||||
|
||||
it('filters out container format segments from tags', () => {
	// Ids carrying a 'GGUF' / 'GGML' segment: only 'Instruct' may remain as a tag.
	const modelIds = ['model-100B-GGUF-Instruct-BF16', 'model-100B-GGML-Instruct:Q4_K_M'];

	for (const modelId of modelIds) {
		expect(parseModelId(modelId)).toMatchObject({ tags: ['Instruct'] });
	}
});
|
||||
|
||||
it('handles real-world examples correctly', () => {
	// Full expected parse results. In every case `raw` is exactly the id that
	// is fed to parseModelId, so it doubles as the test input below.
	const expectedResults = [
		{
			raw: 'meta-llama/Llama-3.1-8B',
			orgName: 'meta-llama',
			modelName: 'Llama-3.1',
			params: '8B',
			activatedParams: null,
			quantization: null,
			tags: []
		},
		{
			raw: 'openai/gpt-oss-120b-MXFP4',
			orgName: 'openai',
			modelName: 'gpt-oss',
			params: '120B',
			activatedParams: null,
			quantization: 'MXFP4',
			tags: []
		},
		{
			raw: 'openai/gpt-oss-20b:Q4_K_M',
			orgName: 'openai',
			modelName: 'gpt-oss',
			params: '20B',
			activatedParams: null,
			quantization: 'Q4_K_M',
			tags: []
		},
		{
			raw: 'Qwen/Qwen3-Coder-30B-A3B-Instruct-1M-BF16',
			orgName: 'Qwen',
			modelName: 'Qwen3-Coder',
			params: '30B',
			activatedParams: 'A3B',
			quantization: 'BF16',
			tags: ['Instruct', '1M']
		}
	];

	for (const expected of expectedResults) {
		expect(parseModelId(expected.raw)).toStrictEqual(expected);
	}
});
|
||||
|
||||
it('handles real-world examples with quantization in segments', () => {
	// Full expected parse results. In every case `raw` is exactly the id that
	// is fed to parseModelId, so it doubles as the test input below.
	const expectedResults = [
		{
			raw: 'meta-llama/Llama-4-Scout-17B-16E-Instruct-Q4_K_M',
			orgName: 'meta-llama',
			modelName: 'Llama-4-Scout',
			params: '17B',
			activatedParams: null,
			quantization: 'Q4_K_M',
			tags: ['16E', 'Instruct']
		},
		{
			raw: 'MiniMaxAI/MiniMax-M2-IQ4_XS',
			orgName: 'MiniMaxAI',
			modelName: 'MiniMax-M2',
			params: null,
			activatedParams: null,
			quantization: 'IQ4_XS',
			tags: []
		},
		{
			raw: 'MiniMaxAI/MiniMax-M2-UD-Q3_K_XL',
			orgName: 'MiniMaxAI',
			modelName: 'MiniMax-M2',
			params: null,
			activatedParams: null,
			quantization: 'UD-Q3_K_XL',
			tags: []
		},
		{
			raw: 'mistralai/Devstral-2-123B-Instruct-2512-Q4_K_M',
			orgName: 'mistralai',
			modelName: 'Devstral-2',
			params: '123B',
			activatedParams: null,
			quantization: 'Q4_K_M',
			tags: ['Instruct', '2512']
		},
		{
			raw: 'mistralai/Devstral-Small-2-24B-Instruct-2512-Q8_0',
			orgName: 'mistralai',
			modelName: 'Devstral-Small-2',
			params: '24B',
			activatedParams: null,
			quantization: 'Q8_0',
			tags: ['Instruct', '2512']
		},
		{
			raw: 'noctrex/GLM-4.7-Flash-MXFP4_MOE',
			orgName: 'noctrex',
			modelName: 'GLM-4.7-Flash',
			params: null,
			activatedParams: null,
			quantization: 'MXFP4_MOE',
			tags: []
		},
		{
			raw: 'Qwen/Qwen3-Coder-Next-Q4_K_M',
			orgName: 'Qwen',
			modelName: 'Qwen3-Coder-Next',
			params: null,
			activatedParams: null,
			quantization: 'Q4_K_M',
			tags: []
		},
		{
			raw: 'openai/gpt-oss-120b-Q4_K_M',
			orgName: 'openai',
			modelName: 'gpt-oss',
			params: '120B',
			activatedParams: null,
			quantization: 'Q4_K_M',
			tags: []
		},
		{
			raw: 'openai/gpt-oss-20b-F16',
			orgName: 'openai',
			modelName: 'gpt-oss',
			params: '20B',
			activatedParams: null,
			quantization: 'F16',
			tags: []
		},
		{
			raw: 'nomic-embed-text-v2-moe.Q4_K_M',
			orgName: null,
			modelName: 'nomic-embed-text-v2-moe',
			params: null,
			activatedParams: null,
			quantization: 'Q4_K_M',
			tags: []
		}
	];

	for (const expected of expectedResults) {
		expect(parseModelId(expected.raw)).toStrictEqual(expected);
	}
});
|
||||
|
||||
it('handles ambiguous model names', () => {
	// Each case lists only the fields that are checked (partial match).
	const cases = [
		// Qwen3.5 Instruct vs Thinking — tags should distinguish them
		{
			id: 'Qwen/Qwen3.5-30B-A3B-Instruct',
			expected: {
				modelName: 'Qwen3.5',
				params: '30B',
				activatedParams: 'A3B',
				tags: ['Instruct']
			}
		},
		{
			id: 'Qwen/Qwen3.5-30B-A3B-Thinking',
			expected: {
				modelName: 'Qwen3.5',
				params: '30B',
				activatedParams: 'A3B',
				tags: ['Thinking']
			}
		},
		// Dot-separated quantization with variant suffixes
		{
			id: 'gemma-3-27b-it-heretic-v2.Q8_0',
			expected: {
				modelName: 'gemma-3',
				params: '27B',
				quantization: 'Q8_0',
				tags: ['it', 'heretic', 'v2']
			}
		},
		{
			id: 'gemma-3-27b-it.Q8_0',
			expected: {
				modelName: 'gemma-3',
				params: '27B',
				quantization: 'Q8_0',
				tags: ['it']
			}
		}
	];

	for (const { id, expected } of cases) {
		expect(parseModelId(id)).toMatchObject(expected);
	}
});
|
||||
});
|
||||
Reference in New Issue
Block a user