Compare commits


40 Commits
b7261 ... b7301

Author SHA1 Message Date
Daniel Bevenius
444f00b0ec llama : remove quantization sanity check (#17788)
* llama : remove quantization sanity check

This commit removes the quantization sanity check for attention layers.

The motivation for this is that there are hybrid models that have
recurrent layers, expert layers, and attention layers. For these models
the current check fails as the expert layers are not taken into account.
After consideration, it was decided that this check
is not strictly necessary, and can be removed to allow for more flexible
model architectures.

* llama : remove unused pruned_attention_w and is_clip_model vars
2025-12-06 12:26:20 +01:00
Jeff Bolz
2960eb2975 vulkan: Use one row per workgroup for f32 mmv (#17711)
The MoE models have a mul_mat_vec with very small m (32, 64, 128) right before
the topk_moe selection. Running multiple rows per wg doesn't utilize the SMs
well. I think even for larger m, f32 is so bandwidth-limited that running
multiple rows doesn't help.
2025-12-06 11:12:26 +01:00
Xuan-Son Nguyen
dbc15a7967 convert: support Mistral 3 Large MoE (#17730)
* convert: support Mistral 3 Large MoE

* filter out vision tensors, add missing keys

* handle vocab

* add temperature_length

* fix mscale_all_dim

* clean up

* Apply suggestions from code review

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* fix

* Update gguf-py/gguf/tensor_mapping.py

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
2025-12-06 10:49:33 +01:00
Jeff Bolz
c6c5e85979 vulkan: support solve_tri with larger N/K values (#17781)
Split N into chunks to fit into shared memory.
If K > 128, use a larger workgroup with enough invocations.
Add perf tests matching qwen3next.
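As an illustration only (not the shader code, and the names below are made up), splitting a dimension into chunks that fit a fixed shared-memory budget reduces to a ceil-divided tile loop:

    #include <algorithm>
    #include <cstdint>

    // C++ sketch: visit columns [0, N) in tiles of at most max_cols,
    // where max_cols is whatever still fits the shared-memory budget.
    void for_each_tile(int64_t N, int64_t max_cols) {
        const int64_t n_tiles = (N + max_cols - 1) / max_cols; // ceil division
        for (int64_t t = 0; t < n_tiles; ++t) {
            const int64_t n0 = t * max_cols;
            const int64_t n  = std::min(max_cols, N - n0);
            // a real kernel would solve the triangular block for columns [n0, n0 + n)
            (void) n;
        }
    }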
2025-12-06 08:56:45 +01:00
Georgi Gerganov
8e5f4987b1 contrib : stale PRs (#17803) 2025-12-06 09:34:18 +02:00
Georgi Gerganov
8ce774a102 metal : fix build (#17799)
* metal : fix build

* tests : fix context destruction
2025-12-06 09:33:59 +02:00
Masato Nakasaka
67788f6846 vulkan: Replace deprecated VK_EXT_validation_features (#17637)
* replaced deprecated VK_EXT_validation_features

* forgot to remove old code
2025-12-06 06:39:42 +01:00
Masato Nakasaka
d8c0a7b085 vulkan: Fix mismatch in TOPK_MOE unit test (#17541)
* Fix shader to support 2D workgroup mapping to a single subgroup

* Set required_subgroup_size

topk_moe shader requires static WARP_SIZE and actual subgroup size to match
2025-12-06 06:23:30 +01:00
Jeff Bolz
933414c0b6 vulkan: add more num_blocks instantiations in rms_norm (#17701) 2025-12-05 22:08:56 +01:00
Jeff Bolz
a0f3897d53 vulkan: fix top_k bug when there are ties in the input (#17659)
* vulkan: Reduce temporary memory usage for TOP_K

- Compute row size for the temp buffer based on the output of the first pass.
- Update shader addressing math to use the output row size
- Pass the output row size as "ncols_output", what used to be "ncols_output" is now "k"

For the common case of K=40 and src0=(200000,1,1,1), this reduces the temporary buffer
from about 3.2MB to 500KB.

* vulkan: fix top_k bug when there are ties in the input

I noticed by inspection a bug in the vulkan top_k shader: if the least
value in the top_k appears multiple times, we could end up writing those
extra copies out rather than some larger values (if the larger values are
on higher-numbered threads).

I rewrote the test verification to handle this case, where the final index set
is not necessarily the same.
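A tie-tolerant check can be sketched as follows (a hedged illustration, not the PR's test code): two top-k results are treated as equivalent when they select the same multiset of values, even if the chosen indices differ.

    #include <algorithm>
    #include <vector>

    // Compare the selected values, not the selected indices: with ties,
    // several different index sets are equally valid top-k answers.
    static bool same_topk_values(std::vector<float> a, std::vector<float> b) {
        if (a.size() != b.size()) {
            return false;
        }
        std::sort(a.begin(), a.end());
        std::sort(b.begin(), b.end());
        return a == b;
    }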

* Update tests/test-backend-ops.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2025-12-05 22:03:19 +01:00
Acly
e15cd06a94 vulkan : support conv-2d with large output size (#17685) 2025-12-05 21:46:39 +01:00
Reese Levine
fd57b24c0f ggml webgpu: unary op support, code refactoring, ops support (#17764)
* Squashed commit of the following:

commit b3c6bf4b0450d8d452b934df27a0fb7cb53cd755
Author: Abhijit Ramesh <abhijitramesh2k@gmail.com>
Date:   Mon Dec 1 18:29:00 2025 -0800

    ggml webgpu: fix xielu parameter passing (#11)

    The XIELU operation was incorrectly using static_cast to convert
    float parameters to uint32_t, which converted numeric values instead
    of preserving IEEE 754 bit patterns. This caused incorrect values
    to be interpreted by the GPU shader.

    * Use reinterpret_cast to preserve float bit patterns when passing
      through uint32_t params buffer
    * Update WGSL shader parameter types from u32 to f32
    * Re-enable XIELU support (was disabled due to numerical issues)

    Fixes NMSE test failures for XIELU operation on WebGPU backend.
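
    As a minimal C++ illustration of the difference (not the PR's code; it uses
    std::bit_cast where the fix above uses reinterpret_cast):

        #include <bit>
        #include <cstdint>
        #include <cstdio>

        int main() {
            const float alpha = 1.5f;

            // value conversion: yields the integer 1 (bit pattern 0x00000001)
            const uint32_t as_value = static_cast<uint32_t>(alpha);

            // bit reinterpretation: preserves the IEEE 754 encoding of 1.5f (0x3FC00000)
            const uint32_t as_bits = std::bit_cast<uint32_t>(alpha);

            // only as_bits round-trips back to 1.5 when the shader reads the slot as f32
            printf("value cast: 0x%08X, bit cast: 0x%08X\n", as_value, as_bits);
            return 0;
        }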

commit 5ca9b5e49e
Author: neha-ha <137219201+neha-ha@users.noreply.github.com>
Date:   Tue Nov 18 12:17:00 2025 -0800

    Refactored pipelines and workgroup calculations (#10)

    * refactored pipelines

    * refactored workgroup calculation

    * removed commented out block of prior maps

    * Clean up ceiling division pattern

    ---------

    Co-authored-by: Neha Abbas <nehaabbas@eduroam-169-233-141-223.ucsc.edu>
    Co-authored-by: Reese Levine <reeselevine1@gmail.com>

Author: James Contini <jamescontini@gmail.com>
Date:   Wed Oct 29 23:13:06 2025 -0700

    formatted embed wgsl and ggml-webgpu.cpp

commit e1f6baea31
Author: James Contini <jamescontini@gmail.com>
Date:   Wed Oct 29 23:08:37 2025 -0700

    implemented REPL_Template support and removed bug in unary operators kernel

commit 8c70b8fece
Author: James Contini <jamescontini@gmail.com>
Date:   Wed Oct 15 16:14:20 2025 -0700

    responded and dealt with PR comments

commit f9282c660c
Author: James Contini <jamescontini@gmail.com>
Date:   Sun Oct 12 13:41:41 2025 -0700

    removed unnecessary checking if node->src[1] exists for unary operators

commit 4cf28d7dec
Author: James Contini <jamescontini@gmail.com>
Date:   Sun Oct 12 13:32:45 2025 -0700

    All operators (including xielu) working

commit 74c6add176
Author: James Contini <jamescontini@gmail.com>
Date:   Fri Oct 10 13:16:48 2025 -0700

    fixed autoconfig

commit 362749910b
Author: James Contini <jamescontini@gmail.com>
Date:   Fri Oct 10 13:10:46 2025 -0700

    removed vestigial files

commit cb08583337
Author: James Contini <jamescontini@gmail.com>
Date:   Fri Oct 10 12:59:32 2025 -0700

    abides by editor-config

commit 5360e2852a
Author: James Contini <jamescontini@gmail.com>
Date:   Fri Oct 10 12:45:57 2025 -0700

    rms_norm double declaration bug atoned

commit 7b09baa4aa
Merge: 8a6ec843 74b8fc17
Author: James Contini <jamescontini@gmail.com>
Date:   Fri Oct 10 11:50:03 2025 -0700

    resolving merge conflicts

commit 8a6ec843a5
Author: James Contini <jamescontini@gmail.com>
Date:   Wed Oct 8 18:06:47 2025 -0700

    unary operators pass ggml tests

commit c3ae38278a
Author: James Contini <jamescontini@gmail.com>
Date:   Wed Oct 1 16:22:40 2025 -0700

    neg passes backend test

commit aa1c9b2f88
Author: James Contini <jamescontini@gmail.com>
Date:   Tue Sep 30 23:55:27 2025 -0700

    neg f16xf32xip builds and runs; haven't actually run a model that uses the neg kernel yet though

Co-authored-by: James Contini <jamescontini@gmail.com>
Co-authored-by: Neha Abbas <neabbas@ucsc.edu>
Co-authored-by: Abhijit Ramesh <abhijitramesh2k@gmail.com>

* Remove extra code and format

* Add ops documentation (finally)

* Update ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

---------

Co-authored-by: James Contini <jamescontini@gmail.com>
Co-authored-by: Neha Abbas <neabbas@ucsc.edu>
Co-authored-by: Abhijit Ramesh <abhijitramesh2k@gmail.com>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
2025-12-05 12:25:51 -08:00
Jeff Bolz
6ab0d64960 vulkan: enable mmvq for q2_k on NVIDIA (#17675) 2025-12-05 21:21:57 +01:00
Jeff Bolz
93bb92664e vulkan: set all memory allocations to high priority (#17624)
* vulkan: set all memory allocations to high priority

* gate by env var
2025-12-05 21:21:04 +01:00
Georgi Gerganov
8160b38a5f rpc : fix alloc size logic (#17116)
* rpc : fix alloc size logic

* rpc : bump version
2025-12-05 19:39:04 +02:00
Georgi Gerganov
c41bde6fbd metal : add residency sets keep-alive heartbeat (#17766)
* examples : add idle

* metal : attach residency sets to queue

* idle : add link

* idle : adjust intervals

* metal : add residency sets keep-alive heartbeat

* cont : adjust default keep-alive time
2025-12-05 19:38:54 +02:00
Johannes Gäßler
6016d0bd41 HIP : fix RDNA4 build (#17792) 2025-12-05 13:47:52 +01:00
Pascal
1be97831e4 fix: prevent segfault in tokenizer on highly repetitive input (#17786)
Add nosubs|optimize flags to std::regex constructors to prevent
catastrophic backtracking when processing prompts with repeated
identical characters (e.g., 'A' * 10000).

The nosubs flag disables subgroup capture, significantly reducing
memory usage and backtracking on uniform token sequences.
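A minimal sketch of constructing a std::regex with these flags (the pattern below is hypothetical, chosen only to show the constructor arguments):

    #include <regex>
    #include <string>

    int main() {
        // nosubs: marked sub-expressions are treated as non-capturing
        // optimize: favor matching speed over construction speed
        const std::regex re("[A-Za-z]+|[0-9]+", std::regex::nosubs | std::regex::optimize);

        const std::string input(10000, 'A'); // highly repetitive input
        return std::regex_search(input, re) ? 0 : 1;
    }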
2025-12-05 13:52:23 +02:00
Adrien Gallouët
a6cfc212ed ci : fix winget workflow (#17790) 2025-12-05 19:44:17 +08:00
shalinib-ibm
3a0d10533a Q4/Q8 Tiled Gemm Optimization. (#16999) 2025-12-05 19:41:51 +08:00
Piotr Wilkin (ilintar)
6648989673 Add pwilkin to CODEOWNERS for chat files (#17789)
* Add pwilkin to CODEOWNERS for chat files

* Reorder alphabetically
2025-12-05 12:00:57 +01:00
Johannes Gäßler
e95d0bc8fd CUDA: fix FA VKQ accumulator overflow (#17746) 2025-12-05 09:18:10 +01:00
Jiacheng (Jason) Chen
668ed76574 HIP: enable WMMA-MMQ INT kernels for RDNA 3 (#17576)
* enabled wmma instructions for most quantizations other than q2k

* fixed the last q2_k test case failure

* address comments: fix out of bound write for RDNA4, add comments after #endif

* clean up rebase: fix ne error in half2

* fix the EditorConfig CI
2025-12-05 09:17:37 +01:00
Sigbjørn Skjæret
03d9a77b85 ci : transform release binary root dir in tar to llama-bXXXX (#17773)
* transform release binary root dir in tar to llama-bXXXX

* bsdtar supports -s instead of --transform
2025-12-05 01:50:19 +01:00
Gabe Goodhart
3143a755c8 docs : update ops.md (Metal, BLAS) (#17768)
* docs: Regen Metal.csv

Branch: UpdateOpsMd

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* docs: Regen BLAS.csv

Branch: UpdateOpsMd

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* docs: Update ops.md

Branch: UpdateOpsMd

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

---------

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
2025-12-05 00:55:34 +01:00
Piotr Wilkin (ilintar)
96fe9badfc Add support for CUMSUM and TRI for CUDA. (#17584)
* Add support for CUMSUM and TRI for CUDA.

* Minor optimizations.

* Correct warp_prefix_inclusive_sum in float2 variant to return float2

* Optimize TRI

* Whitespace

* Fix strides.

* Implement double loop

* Whitespace

* Fix HIP compilation bugs

* Optimizations + big case performance tests

* Implement using CUB with fallback to custom kernel

* Remove error message.

* Fixes from code review

* Comment out CPU-unsupported F16/BF16 cases to fix CI

* Fine, you win :P

* Fix last cast, use NO_DEVICE_CODE and GGML_UNUSED_VARS

* Vary warp-size based on physical warp size

* Add GGML_UNUSED_VARS in tri as well

* Use constexpr and call prefix_inclusive with warp_size template param

* Update ggml/src/ggml-cuda/cumsum.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Apply suggestions from code review

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* Change to tid % warp_size

* Fix strides; hardcode mask; add ggml_lane_mask_t

* Missing renames, remove unused get_warp_mask(), explicit calls to ggml_cuda_info()

* Too hasty...

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
2025-12-04 22:19:51 +01:00
Gabe Goodhart
bde188d60f metal: TRI, FILL, EXPM1, SOFTPLUS (#16623)
* feat(wip): Port initial TRI impl from pervious work

The kernel does not work and is not optimized, but the
code compiles and runs, so this will be the starting point
now that the core op has been merged.

Branch: ggml-cumsum-tri

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix: Remove argument for constant val override

This was added in the original draft, but later removed. With this, the
kernel now passes tests.

Branch: ggml-cumsum-tri

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat: Move the ttype conditional to templating to avoid conditional in kernel

Branch: ggml-cumsum-tri

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix: Type fixes

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* feat: Add softplus for metal

Branch: ggml-cumsum-tri

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat: Add EXPM1 for metal

Branch: ggml-cumsum-tri

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* feat: Add FILL for metal

Branch: ggml-cumsum-tri

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* refactor: Branchless version of tri using _ggml_vec_tri_cmp as a mask

Branch: ggml-cumsum-tri

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* fix: Remove unused arguments

Branch: ggml-cumsum-tri

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

* refactor: Use select instead of branch for softplus non-vec

Branch: ggml-cumsum-tri

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>

---------

Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2025-12-04 19:12:19 +02:00
Xuan-Son Nguyen
9d0229967a server: strip content-length header on proxy (#17734) 2025-12-04 16:32:57 +01:00
Xuan-Son Nguyen
c4c10bfb86 server: move msg diffs tracking to HTTP thread (#17740)
* server: move msg diffs tracking to HTTP thread

* wip

* tool call tests ok

* minor : style

* cont : fix

* move states to server_response_reader

* add safe-guard

* fix

* fix 2

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2025-12-04 15:46:08 +01:00
Daniel Bevenius
817d743cc1 examples : add missing code block end marker [no ci] (#17756)
This commit adds the missing code block end marker in simple-cmake-pkg
to correct the formatting.
2025-12-04 14:17:30 +01:00
Daniel Bevenius
bd4ef13476 common : skip model validation when --help is requested (#17755)
This commit skips the model validation check when the user specifies the
--help option.

The motivation for this is that currently an error is thrown before
--help can be processed. Validation is now skipped if params.usage is
set, allowing help to be displayed without requiring --model.

Resolves: https://github.com/ggml-org/llama.cpp/issues/17754
2025-12-04 13:36:50 +01:00
Alberto Cabrera Pérez
87a2084c45 ggml-cpu : remove asserts always evaluating to false (#17728) 2025-12-04 13:16:38 +01:00
SmartestWashingMachine
3659aa28e9 convert: use existing local chat_template if mistral-format model has one. (#17749)
* conversion: use existing local chat_template.jinja file if mistral-format model has one.

* fix --mistral-format mistakenly assuming some <=v7 chat template names are file paths and reading them.

* Update convert_hf_to_gguf.py - change from exists() to is_file()

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
2025-12-04 12:12:45 +01:00
Adrien Gallouët
2a73f81f8a cmake : simplify build info detection using standard variables (#17423)
The current approach has several drawbacks. Most notably, when
cross-compiling, invoking the compiler binary directly to query the
machine hardware can behave unexpectedly depending on the toolchain
wrapper (using COMPILER_TARGET, CFLAGS, etc.).

As CMake is the official tool to build llama.cpp, I propose to only rely
on it to get those variables (`CMAKE_SYSTEM_NAME` and
`CMAKE_SYSTEM_PROCESSOR`).

Signed-off-by: Adrien Gallouët <angt@huggingface.co>
2025-12-04 12:42:13 +02:00
Sigbjørn Skjæret
7dba049b07 ci : disable ggml-ci-x64-amd-* (#17753) 2025-12-04 11:25:08 +01:00
Adrien Gallouët
83c1171529 common: use native MultiByteToWideChar (#17738)
`std::codecvt_utf8<wchar_t>` is deprecated and produces warnings:

    common/common.cpp:792:31: warning: 'codecvt_utf8<wchar_t>' is deprecated [-Wdeprecated-declarations]
      792 |     std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
          |

Signed-off-by: Adrien Gallouët <angt@huggingface.co>
2025-12-04 12:06:49 +02:00
Georgi Gerganov
0d1324856f metal : use params per pipeline instance (#17739) 2025-12-04 10:34:11 +02:00
Georgi Gerganov
a67ef0f47f llama : fix sanity checks during quantization (#17721) 2025-12-04 10:33:42 +02:00
Adrien Gallouët
ef75a89fdb build : move _WIN32_WINNT definition to headers (#17736)
Previously, CMake forced `_WIN32_WINNT=0x0A00` for MinGW builds. This
caused "macro redefined" warnings with toolchains that already define the version.

This also removes the `GGML_WIN_VER` variable as it is no longer needed.

Signed-off-by: Adrien Gallouët <angt@huggingface.co>
2025-12-04 07:04:02 +01:00
Jeff Bolz
d8b5cdc4fe build: enable parallel builds in msbuild using MTT (#17708)
* build: enable parallel builds in msbuild using MTT

* check LLAMA_STANDALONE
2025-12-03 22:42:29 -06:00
72 changed files with 54367 additions and 12303 deletions

View File

@@ -1602,33 +1602,33 @@ jobs:
run: |
bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
ggml-ci-x64-amd-vulkan:
runs-on: [self-hosted, Linux, X64, AMD]
# ggml-ci-x64-amd-vulkan:
# runs-on: [self-hosted, Linux, X64, AMD]
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
# steps:
# - name: Clone
# id: checkout
# uses: actions/checkout@v4
- name: Test
id: ggml-ci
run: |
vulkaninfo --summary
GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
# - name: Test
# id: ggml-ci
# run: |
# vulkaninfo --summary
# GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
ggml-ci-x64-amd-rocm:
runs-on: [self-hosted, Linux, X64, AMD]
# ggml-ci-x64-amd-rocm:
# runs-on: [self-hosted, Linux, X64, AMD]
steps:
- name: Clone
id: checkout
uses: actions/checkout@v4
# steps:
# - name: Clone
# id: checkout
# uses: actions/checkout@v4
- name: Test
id: ggml-ci
run: |
amd-smi static
GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
# - name: Test
# id: ggml-ci
# run: |
# amd-smi static
# GG_BUILD_ROCM=1 GG_BUILD_AMDGPU_TARGETS="gfx1101" bash ./ci/run.sh ~/results/llama.cpp /mnt/llama.cpp
ggml-ci-mac-metal:
runs-on: [self-hosted, macOS, ARM64]

View File

@@ -67,7 +67,7 @@ jobs:
run: |
cp LICENSE ./build/bin/
zip -y -r llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/*
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.tar.gz -C ./build/bin .
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.tar.gz -s ",./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
- name: Upload artifacts (zip)
uses: actions/upload-artifact@v4
@@ -128,7 +128,7 @@ jobs:
run: |
cp LICENSE ./build/bin/
zip -y -r llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/*
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-macos-x64.tar.gz -C ./build/bin .
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-macos-x64.tar.gz -s ",./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
- name: Upload artifacts (zip)
uses: actions/upload-artifact@v4
@@ -197,7 +197,7 @@ jobs:
run: |
cp LICENSE ./build/bin/
zip -y -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip ./build/bin/*
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.tar.gz -C ./build/bin .
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
- name: Upload artifacts (zip)
uses: actions/upload-artifact@v4
@@ -257,7 +257,7 @@ jobs:
run: |
cp LICENSE ./build/bin/
zip -y -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.zip ./build/bin/*
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.tar.gz -C ./build/bin .
tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-x64.tar.gz --transform "s,./,llama-${{ steps.tag.outputs.name }}/," -C ./build/bin .
- name: Upload artifacts (zip)
uses: actions/upload-artifact@v4

View File

@@ -9,7 +9,7 @@ jobs:
update:
name: Update Winget Package
runs-on: ubuntu-latest
if: ${{ github.repository.owner.login == 'ggml-org' }}
if: github.repository_owner == 'ggml-org'
steps:
- name: Install cargo binstall

View File

@@ -72,6 +72,12 @@ if (MSVC)
add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/bigobj>")
endif()
if (LLAMA_STANDALONE)
# enable parallel builds for msbuild
list(APPEND CMAKE_VS_GLOBALS UseMultiToolTask=true)
list(APPEND CMAKE_VS_GLOBALS EnforceProcessCountAcrossBuilds=true)
endif()
if (CMAKE_SYSTEM_NAME STREQUAL "iOS")
set(LLAMA_TOOLS_INSTALL_DEFAULT OFF)
else()
@@ -193,11 +199,6 @@ if (NOT TARGET ggml AND NOT LLAMA_USE_SYSTEM_GGML)
# ... otherwise assume ggml is added by a parent CMakeLists.txt
endif()
if (MINGW)
# Target Windows 8 for PrefetchVirtualMemory
add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER})
endif()
#
# build the library
#

View File

@@ -10,6 +10,7 @@
/common/arg.* @ggerganov
/common/base64.hpp.* @ggerganov
/common/build-info.* @ggerganov
/common/chat.* @pwilkin
/common/chat-peg-parser.* @aldehir
/common/common.* @ggerganov
/common/console.* @ggerganov
@@ -84,6 +85,7 @@
/src/llama-vocab.* @CISC
/src/models/ @CISC
/tests/ @ggerganov
/tests/test-chat-.* @pwilkin
/tools/batched-bench/ @ggerganov
/tools/main/ @ggerganov
/tools/mtmd/ @ngxson

View File

@@ -16,7 +16,7 @@ The project differentiates between 3 levels of contributors:
- If you modified a `ggml` operator or added a new one, add the corresponding test cases to `test-backend-ops`
- Create separate PRs for each feature or fix. Avoid combining unrelated changes in a single PR
- Consider allowing write access to your branch for faster reviews, as reviewers can push commits directly
- If your PR becomes stale, don't hesitate to ping the maintainers in the comments
- If your PR becomes stale, rebase it on top of the latest `master` to get the maintainers' attention
- Maintainers will rely on your insights and approval when making a final decision to approve and merge a PR
- Consider adding yourself to [CODEOWNERS](CODEOWNERS) to indicate your availability for reviewing related PRs
- Using AI to generate PRs is permitted. However, you must (1) explicitly disclose how AI was used and (2) conduct a thorough manual review before publishing the PR. Note that trivial tab autocompletions do not require disclosure.

View File

@@ -39,26 +39,10 @@ if(Git_FOUND)
endif()
endif()
if(MSVC)
set(BUILD_COMPILER "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
if (CMAKE_VS_PLATFORM_NAME)
set(BUILD_TARGET ${CMAKE_VS_PLATFORM_NAME})
else()
set(BUILD_TARGET "${CMAKE_SYSTEM_NAME} ${CMAKE_SYSTEM_PROCESSOR}")
endif()
else()
execute_process(
COMMAND ${CMAKE_C_COMPILER} --version
OUTPUT_VARIABLE OUT
OUTPUT_STRIP_TRAILING_WHITESPACE
)
string(REGEX REPLACE " *\n.*" "" OUT "${OUT}")
set(BUILD_COMPILER ${OUT})
set(BUILD_COMPILER "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
execute_process(
COMMAND ${CMAKE_C_COMPILER} -dumpmachine
OUTPUT_VARIABLE OUT
OUTPUT_STRIP_TRAILING_WHITESPACE
)
set(BUILD_TARGET ${OUT})
if(CMAKE_VS_PLATFORM_NAME)
set(BUILD_TARGET ${CMAKE_VS_PLATFORM_NAME})
else()
set(BUILD_TARGET "${CMAKE_SYSTEM_NAME} ${CMAKE_SYSTEM_PROCESSOR}")
endif()

View File

@@ -427,7 +427,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
// model is required (except for server)
// TODO @ngxson : maybe show a list of available models in CLI in this case
if (params.model.path.empty() && ctx_arg.ex != LLAMA_EXAMPLE_SERVER) {
if (params.model.path.empty() && ctx_arg.ex != LLAMA_EXAMPLE_SERVER && !params.usage) {
throw std::invalid_argument("error: --model is required\n");
}

View File

@@ -786,11 +786,29 @@ bool fs_validate_filename(const std::string & filename, bool allow_subdirs) {
#include <iostream>
#ifdef _WIN32
static std::wstring utf8_to_wstring(const std::string & str) {
    if (str.empty()) {
        return std::wstring();
    }

    int size = MultiByteToWideChar(CP_UTF8, 0, str.c_str(), (int)str.size(), NULL, 0);
    if (size <= 0) {
        return std::wstring();
    }

    std::wstring wstr(size, 0);
    MultiByteToWideChar(CP_UTF8, 0, str.c_str(), (int)str.size(), &wstr[0], size);
    return wstr;
}
#endif
// returns true if successful, false otherwise
bool fs_create_directory_with_parents(const std::string & path) {
#ifdef _WIN32
std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
std::wstring wpath = converter.from_bytes(path);
std::wstring wpath = utf8_to_wstring(path);
// if the path already exists, check whether it's a directory
const DWORD attributes = GetFileAttributesW(wpath.c_str());

View File

@@ -12,6 +12,10 @@
#include <vector>
#include <map>
#if defined(_WIN32) && !defined(_WIN32_WINNT)
#define _WIN32_WINNT 0x0A00
#endif
#ifdef _WIN32
#define DIRECTORY_SEPARATOR '\\'
#else

View File

@@ -1524,6 +1524,79 @@ class TextModel(ModelBase):
special_vocab._set_special_token("bos", 151643)
special_vocab.add_to_gguf(self.gguf_writer)
def _set_vocab_mistral(self):
if not _mistral_common_installed:
raise ImportError(_mistral_import_error_msg)
vocab = MistralVocab(self.dir_model)
logger.info(
f"Converting tokenizer {vocab.tokenizer_type} of size {vocab.vocab_size}."
)
self.gguf_writer.add_tokenizer_model(vocab.gguf_tokenizer_model)
tokens = []
scores = []
toktypes = []
for text, score, toktype in vocab.all_tokens():
tokens.append(text)
scores.append(score)
toktypes.append(toktype)
assert len(tokens) == vocab.vocab_size, (
f"token count ({len(tokens)}) != vocab size ({vocab.vocab_size})"
)
if vocab.tokenizer_type == MistralTokenizerType.tekken:
self.gguf_writer.add_tokenizer_pre("tekken")
self.gguf_writer.add_token_merges(
vocab.extract_vocab_merges_from_model()
)
logger.info(
f"Setting bos, eos, unk and pad token IDs to {vocab.bos_id}, {vocab.eos_id}, {vocab.unk_id}, {vocab.pad_id}."
)
self.gguf_writer.add_bos_token_id(vocab.bos_id)
self.gguf_writer.add_eos_token_id(vocab.eos_id)
self.gguf_writer.add_unk_token_id(vocab.unk_id)
self.gguf_writer.add_pad_token_id(vocab.pad_id)
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_scores(scores)
self.gguf_writer.add_token_types(toktypes)
self.gguf_writer.add_vocab_size(vocab.vocab_size)
self.gguf_writer.add_add_bos_token(True)
self.gguf_writer.add_add_eos_token(False)
local_template_file_path = self.dir_model / "chat_template.jinja"
if self.is_mistral_format and local_template_file_path.is_file():
# Ministral-3 and other new Mistral models come with chat templates.
# ref: https://huggingface.co/mistralai/Ministral-3-14B-Instruct-2512/tree/main
logger.info("Using an existing Mistral local chat template.")
with open(local_template_file_path, "r", encoding="utf-8") as f:
template = f.read()
elif not self.is_mistral_format or not self.disable_mistral_community_chat_template:
template_dir = Path(__file__).parent / "models/templates/"
# Log only for Mistral format that the official tokenization and detokenization is via `mistral-common`.
if self.is_mistral_format:
logger.info(
"Using a Mistral community chat template. These templates can be subject to errors in early days or weeks after a release. "
"Mistral recommends to use `mistral-common` to perform tokenization and detokenization."
)
template = MistralModel.get_community_chat_template(vocab, template_dir, self.is_mistral_format)
else:
logger.info("Not using a Mistral local or community chat template. Ensure to perform the tokenization and detokenization via `mistral-common`.")
template = None
if template is not None:
self.gguf_writer.add_chat_template(template)
class MmprojModel(ModelBase):
model_type = ModelType.MMPROJ
@@ -2294,67 +2367,6 @@ class LlamaModel(TextModel):
if self.hf_arch == "VLlama3ForCausalLM":
self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32)
def _set_vocab_mistral(self):
if not _mistral_common_installed:
raise ImportError(_mistral_import_error_msg)
vocab = MistralVocab(self.dir_model)
logger.info(
f"Converting tokenizer {vocab.tokenizer_type} of size {vocab.vocab_size}."
)
self.gguf_writer.add_tokenizer_model(vocab.gguf_tokenizer_model)
tokens = []
scores = []
toktypes = []
for text, score, toktype in vocab.all_tokens():
tokens.append(text)
scores.append(score)
toktypes.append(toktype)
assert len(tokens) == vocab.vocab_size, (
f"token count ({len(tokens)}) != vocab size ({vocab.vocab_size})"
)
if vocab.tokenizer_type == MistralTokenizerType.tekken:
self.gguf_writer.add_tokenizer_pre("tekken")
self.gguf_writer.add_token_merges(
vocab.extract_vocab_merges_from_model()
)
logger.info(
f"Setting bos, eos, unk and pad token IDs to {vocab.bos_id}, {vocab.eos_id}, {vocab.unk_id}, {vocab.pad_id}."
)
self.gguf_writer.add_bos_token_id(vocab.bos_id)
self.gguf_writer.add_eos_token_id(vocab.eos_id)
self.gguf_writer.add_unk_token_id(vocab.unk_id)
self.gguf_writer.add_pad_token_id(vocab.pad_id)
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_scores(scores)
self.gguf_writer.add_token_types(toktypes)
self.gguf_writer.add_vocab_size(vocab.vocab_size)
self.gguf_writer.add_add_bos_token(True)
self.gguf_writer.add_add_eos_token(False)
template_dir = Path(__file__).parent / "models/templates/"
if not self.is_mistral_format or not self.disable_mistral_community_chat_template:
# Log only for Mistral format that the official tokenization and detokenization is via `mistral-common`.
if self.is_mistral_format:
logger.info(
"Using a Mistral community chat template. These templates can be subject to errors in early days or weeks after a release. "
"Mistral recommends to use `mistral-common` to perform tokenization and detokenization."
)
template = MistralModel.get_community_chat_template(vocab, template_dir, self.is_mistral_format)
self.gguf_writer.add_chat_template(template)
else:
logger.info("Not using a Mistral community chat template. Ensure to perform the tokenization and detokenization via `mistral-common`.")
def set_vocab(self):
if self.is_mistral_format:
return self._set_vocab_mistral()
@@ -9912,17 +9924,109 @@ class MistralModel(LlamaModel):
def set_gguf_parameters(self):
super().set_gguf_parameters()
if "yarn" in self.hparams:
yarn_params = self.hparams["yarn"]
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
self.gguf_writer.add_rope_scaling_factor(yarn_params["factor"])
self.gguf_writer.add_rope_scaling_yarn_beta_fast(yarn_params["beta"])
self.gguf_writer.add_rope_scaling_yarn_beta_slow(yarn_params["alpha"])
self.gguf_writer.add_rope_scaling_yarn_log_mul(1.0) # mscale_all_dim
self.gguf_writer.add_rope_scaling_orig_ctx_len(yarn_params["original_max_position_embeddings"])
MistralModel.set_mistral_config(self.gguf_writer, self.hparams)
if "llama_4_scaling" in self.hparams:
self.gguf_writer.add_attn_temperature_scale(self.hparams["llama_4_scaling"]["beta"])
@staticmethod
def set_mistral_config(gguf_writer: gguf.GGUFWriter, hparams: dict):
if "yarn" in hparams:
yarn_params = hparams["yarn"]
gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
gguf_writer.add_rope_scaling_factor(yarn_params["factor"])
gguf_writer.add_rope_scaling_yarn_beta_fast(yarn_params["beta"])
gguf_writer.add_rope_scaling_yarn_beta_slow(yarn_params["alpha"])
gguf_writer.add_rope_scaling_yarn_log_mul(1.0) # mscale_all_dim
gguf_writer.add_rope_scaling_orig_ctx_len(yarn_params["original_max_position_embeddings"])
if "llama_4_scaling" in hparams:
gguf_writer.add_attn_temperature_scale(hparams["llama_4_scaling"]["beta"])
class MistralMoeModel(DeepseekV2Model):
model_arch = gguf.MODEL_ARCH.DEEPSEEK2
model_name = "Mistral"
hf_arch = ""
is_mistral_format = True
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
logger.info("Using MistralMoeModel")
# remap hparams from Mistral MoE format to DeepseekV2 format
# we do this way to be able to reuse DeepseekV2Model set_gguf_parameters logic
# ref: https://github.com/vllm-project/vllm/blob/b294e28db2c5dee61bc25157664edcada8b90b31/vllm/transformers_utils/configs/mistral.py
config = self.hparams
# Mistral key -> HF key
config_mapping = {
"dim": "hidden_size",
"norm_eps": "rms_norm_eps",
"n_kv_heads": "num_key_value_heads",
"n_layers": "num_hidden_layers",
"n_heads": "num_attention_heads",
"hidden_dim": "intermediate_size",
}
# HF key -> (Mistral key, default value)
top_level_mapping_with_default = {
"model_type": ("model_type", "transformer"),
"hidden_act": ("activation", "silu"),
"tie_word_embeddings": ("tied_embeddings", False),
"max_seq_len": ("max_seq_len", config.get("max_position_embeddings", 128_000)),
"max_position_embeddings": ("max_position_embeddings", 128_000),
}
# mapping top-level keys
for key, new_key in config_mapping.items():
if key in config:
config[new_key] = config[key]
for new_key, (key, default_value) in top_level_mapping_with_default.items():
config[new_key] = config.get(key, default_value)
# mapping MoE-specific keys
moe_config_map = {
"route_every_n": "moe_layer_freq",
"first_k_dense_replace": "first_k_dense_replace",
"num_experts_per_tok": "num_experts_per_tok",
"num_experts": "n_routed_experts",
"expert_hidden_dim": "moe_intermediate_size",
"routed_scale": "routed_scaling_factor",
"num_shared_experts": "n_shared_experts",
"num_expert_groups": "n_group",
"num_expert_groups_per_tok": "topk_group",
}
moe = config["moe"]
for key, new_key in moe_config_map.items():
if key in moe:
config[new_key] = moe[key]
# provide missing values
config["topk_method"] = None
config["norm_topk_prob"] = True
config["scoring_func"] = "softmax"
def set_vocab(self):
self._set_vocab_mistral()
def set_gguf_parameters(self):
super().set_gguf_parameters()
MistralModel.set_mistral_config(self.gguf_writer, self.hparams)
yarn_params = self.hparams["yarn"]
self.gguf_writer.add_attn_temperature_length(yarn_params["original_max_position_embeddings"])
self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1) # mscale_all_dim * 0.1
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
if name.startswith("vision_") or name.startswith("patch_merger.") or "mm_projector" in name:
return []
# rename certain tensors so that we can reuse DeepseekV2Model modify_tensors logic
if name.endswith(".qscale_act"):
name = name.replace(".qscale_act", ".input_scale")
if name.endswith(".qscale_weight"):
name = name.replace(".qscale_weight", ".weight_scale")
if ".wkv_b." in name:
name = name.replace(".wkv_b.", ".kv_b_proj.")
if ".experts." in name:
name = name.replace(".experts.", ".mlp.experts.")
name = name.replace(".w1.", ".gate_proj.")
name = name.replace(".w2.", ".down_proj.")
name = name.replace(".w3.", ".up_proj.")
name = "model." + name
return super().modify_tensors(data_torch, name, bid)
class PixtralModel(LlavaVisionModel):
@@ -10478,6 +10582,8 @@ def main() -> None:
elif args.mmproj:
assert hparams.get("vision_encoder") is not None, "This model does not support multimodal"
model_class = PixtralModel
elif "moe" in hparams:
model_class = MistralMoeModel
else:
model_class = MistralModel

View File

@@ -12,111 +12,111 @@ Legend:
- 🟡 Partially supported by this backend
- ❌ Not supported by this backend
| Operation | BLAS | CANN | CPU | CUDA | Metal | OpenCL | SYCL | Vulkan | zDNN |
|-----------|------|------|------|------|------|------|------|------|------|
| ABS | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ❌ |
| ACC | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
| ADD | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
| ADD1 | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ |
| ADD_ID | ❌ | ❌ | ✅ | ✅ | | ❌ | ❌ | ✅ | ❌ |
| ARANGE | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
| ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
| ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| CEIL | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ |
| CLAMP | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
| CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ❌ |
| CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ❌ |
| CONV_2D | ❌ | ❌ | ✅ | ✅ | | ✅ | ❌ | ✅ | ❌ |
| CONV_2D_DW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
| CONV_3D | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
| CONV_TRANSPOSE_2D | ❌ | ❌ | ✅ | ✅ | | ❌ | ❌ | ✅ | ❌ |
| COS | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | 🟡 | ❌ |
| COUNT_EQUAL | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ |
| CPY | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
| CROSS_ENTROPY_LOSS | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ |
| CROSS_ENTROPY_LOSS_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ |
| CUMSUM | ❌ | ❌ | ✅ | ❌ | | ❌ | ❌ | ✅ | ❌ |
| DIAG_MASK_INF | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
| DIV | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
| DUP | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | ✅ | ❌ |
| ELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | ❌ | ❌ |
| EXP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ❌ |
| EXPM1 | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ | ❌ | ❌ |
| FILL | ❌ | ❌ | ✅ | ❌ | | ❌ | ❌ | ✅ | ❌ |
| FLASH_ATTN_EXT | ❌ | 🟡 | ✅ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ |
| FLOOR | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ |
| GATED_LINEAR_ATTN | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ |
| GEGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
| GEGLU_ERF | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
| GEGLU_QUICK | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
| GELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ❌ |
| GELU_ERF | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ❌ |
| GELU_QUICK | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ❌ |
| GET_ROWS | ❌ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | ❌ |
| GET_ROWS_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ | ❌ | ❌ |
| GROUP_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| GROUP_NORM_MUL_ADD | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| HARDSIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ❌ |
| HARDSWISH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ❌ |
| IM2COL | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ |
| IM2COL_3D | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
| L2_NORM | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
| LEAKY_RELU | ❌ | ✅ | ✅ | ✅ | | ❌ | ✅ | 🟡 | ❌ |
| LOG | ❌ | ✅ | ✅ | ✅ | | ❌ | 🟡 | ✅ | ❌ |
| MEAN | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
| MUL | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
| MUL_MAT | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
| MUL_MAT_ID | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ❌ |
| NEG | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ❌ |
| NORM | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
| NORM_MUL_ADD | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| OPT_STEP_ADAMW | ❌ | ❌ | ✅ | ✅ | | ❌ | ❌ | ✅ | ❌ |
| OPT_STEP_SGD | ❌ | ❌ | ✅ | ✅ | | ❌ | ❌ | ✅ | ❌ |
| OUT_PROD | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ |
| PAD | ❌ | ✅ | ✅ | 🟡 | | ✅ | 🟡 | ✅ | ❌ |
| PAD_REFLECT_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ |
| POOL_2D | ❌ | 🟡 | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
| REGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
| RELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ❌ |
| REPEAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | ❌ |
| REPEAT_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ |
| RMS_NORM | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ |
| RMS_NORM_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ |
| RMS_NORM_MUL_ADD | ❌ | ✅ | ❌ | ❌ | | ✅ | ❌ | ❌ | ❌ |
| ROLL | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ |
| ROPE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| ROPE_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
| ROUND | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ |
| RWKV_WKV6 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
| RWKV_WKV7 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
| SCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| SET | ❌ | ❌ | ✅ | ✅ | | ❌ | 🟡 | ❌ | ❌ |
| SET_ROWS | ❌ | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
| SGN | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | ❌ | ❌ |
| SIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ❌ |
| SILU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ❌ |
| SILU_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
| SIN | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | 🟡 | ❌ |
| SOFTCAP | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| SOFTPLUS | ❌ | ❌ | ✅ | 🟡 | | ❌ | ❌ | 🟡 | ❌ |
| SOFT_MAX | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| SOFT_MAX_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ✅ | ❌ |
| SOLVE_TRI | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | 🟡 | ❌ |
| SQR | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | 🟡 | ❌ |
| SQRT | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | 🟡 | ❌ |
| SSM_CONV | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
| SSM_SCAN | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | 🟡 | ❌ |
| STEP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ❌ |
| SUB | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ |
| SUM | ❌ | ✅ | ✅ | 🟡 | | ❌ | 🟡 | 🟡 | ❌ |
| SUM_ROWS | ❌ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ |
| SWIGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
| SWIGLU_OAI | ❌ | ❌ | ✅ | ✅ | | ❌ | ❌ | 🟡 | ❌ |
| TANH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | 🟡 | ❌ |
| TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| TOP_K | ❌ | ❌ | ❌ | ❌ | | ❌ | ❌ | 🟡 | ❌ |
| TRI | ❌ | ❌ | ✅ | ❌ | | ❌ | ❌ | ✅ | ❌ |
| TRUNC | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ |
| UPSCALE | ❌ | 🟡 | ✅ | ✅ | 🟡 | ✅ | 🟡 | 🟡 | ❌ |
| XIELU | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| Operation | BLAS | CANN | CPU | CUDA | Metal | OpenCL | SYCL | Vulkan | WebGPU | zDNN |
|-----------|------|------|------|------|------|------|------|------|------|------|
| ABS | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ |
| ACC | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ |
| ADD | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ✅ | ❌ |
| ADD1 | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ |
| ADD_ID | ❌ | ❌ | ✅ | ✅ | | ❌ | ❌ | ✅ | ❌ | ❌ |
| ARANGE | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ |
| ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ |
| ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
| CEIL | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ |
| CLAMP | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ |
| CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ❌ | ❌ |
| CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ❌ |
| CONV_2D | ❌ | ❌ | ✅ | ✅ | | ✅ | ❌ | ✅ | ❌ | ❌ |
| CONV_2D_DW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ |
| CONV_3D | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ |
| CONV_TRANSPOSE_2D | ❌ | ❌ | ✅ | ✅ | | ❌ | ❌ | ✅ | ❌ | ❌ |
| COS | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ |
| COUNT_EQUAL | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ |
| CPY | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
| CROSS_ENTROPY_LOSS | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| CROSS_ENTROPY_LOSS_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| CUMSUM | ❌ | ❌ | ✅ | ❌ | | ❌ | ❌ | ✅ | ❌ | ❌ |
| DIAG_MASK_INF | ❌ | ✅ | ✅ | ✅ | | 🟡 | ✅ | ✅ | ❌ | ❌ |
| DIV | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ✅ | ❌ |
| DUP | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | ✅ | ❌ | ❌ |
| ELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | ❌ | ✅ | ❌ |
| EXP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ |
| EXPM1 | ❌ | ❌ | ✅ | 🟡 | 🟡 | ❌ | ❌ | ❌ | ❌ | ❌ |
| FILL | ❌ | ❌ | ✅ | ❌ | | ❌ | ❌ | ✅ | ❌ | ❌ |
| FLASH_ATTN_EXT | ❌ | 🟡 | ✅ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ |
| FLOOR | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ |
| GATED_LINEAR_ATTN | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
| GEGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ |
| GEGLU_ERF | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ |
| GEGLU_QUICK | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ |
| GELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ |
| GELU_ERF | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ |
| GELU_QUICK | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ |
| GET_ROWS | ❌ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
| GET_ROWS_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| GROUP_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
| GROUP_NORM_MUL_ADD | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| HARDSIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ |
| HARDSWISH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ |
| IM2COL | ❌ | ✅ | ✅ | ✅ | | ✅ | ✅ | ✅ | ❌ | ❌ |
| IM2COL_3D | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ |
| L2_NORM | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ |
| LEAKY_RELU | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ | ❌ |
| LOG | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | ✅ | ❌ | ❌ |
| MEAN | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ |
| MUL | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ✅ | ❌ |
| MUL_MAT | 🟡 | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 |
| MUL_MAT_ID | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ❌ | ❌ |
| NEG | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ |
| NORM | ❌ | ✅ | ✅ | ✅ | | ✅ | ✅ | 🟡 | ❌ | ❌ |
| NORM_MUL_ADD | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| OPT_STEP_ADAMW | ❌ | ❌ | ✅ | ✅ | | ❌ | ❌ | ✅ | ❌ | ❌ |
| OPT_STEP_SGD | ❌ | ❌ | ✅ | ✅ | | ❌ | ❌ | ✅ | ❌ | ❌ |
| OUT_PROD | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ |
| PAD | ❌ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ |
| PAD_REFLECT_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ |
| POOL_2D | ❌ | 🟡 | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ |
| REGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ |
| RELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ |
| REPEAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | ❌ | ❌ |
| REPEAT_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ |
| RMS_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| RMS_NORM_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ |
| RMS_NORM_MUL_ADD | ❌ | ✅ | ❌ | ❌ | | ✅ | ❌ | ❌ | ❌ | ❌ |
| ROLL | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ |
| ROPE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| ROPE_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ |
| ROUND | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ |
| RWKV_WKV6 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ |
| RWKV_WKV7 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ |
| SCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| SET | ❌ | ❌ | ✅ | ✅ | | ❌ | 🟡 | ❌ | ❌ | ❌ |
| SET_ROWS | ❌ | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ |
| SGN | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | ❌ | ✅ | ❌ |
| SIGMOID | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ |
| SILU | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | ✅ | ❌ |
| SILU_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ |
| SIN | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ |
| SOFTCAP | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| SOFTPLUS | ❌ | ❌ | ✅ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ |
| SOFT_MAX | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| SOFT_MAX_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ✅ | ❌ | ❌ |
| SOLVE_TRI | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | 🟡 | ❌ | ❌ |
| SQR | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ |
| SQRT | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ |
| SSM_CONV | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ |
| SSM_SCAN | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | 🟡 | ❌ | ❌ |
| STEP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ |
| SUB | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ✅ | ❌ |
| SUM | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | 🟡 | ❌ | ❌ |
| SUM_ROWS | ❌ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ |
| SWIGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ |
| SWIGLU_OAI | ❌ | ❌ | ✅ | ✅ | | ❌ | ❌ | 🟡 | ✅ | ❌ |
| TANH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ |
| TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
| TOP_K | ❌ | ❌ | ❌ | ❌ | | ❌ | ❌ | 🟡 | ❌ | ❌ |
| TRI | ❌ | ❌ | ✅ | ❌ | | ❌ | ❌ | ✅ | ❌ | ❌ |
| TRUNC | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ |
| UPSCALE | ❌ | 🟡 | ✅ | ✅ | 🟡 | ✅ | 🟡 | 🟡 | ❌ | ❌ |
| XIELU | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ |

File diff suppressed because it is too large.

File diff suppressed because it is too large.

docs/ops/WebGPU.csv (new file, 18741 lines)

File diff suppressed because it is too large.

View File

@@ -20,6 +20,7 @@ else()
add_subdirectory(gguf-hash)
add_subdirectory(gguf)
add_subdirectory(idle)
add_subdirectory(lookahead)
add_subdirectory(lookup)
add_subdirectory(parallel)

View File

@@ -0,0 +1,5 @@
set(TARGET llama-idle)
add_executable(${TARGET} idle.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

examples/idle/README.md (new file, 3 lines)
View File

@@ -0,0 +1,3 @@
# llama.cpp/example/idle
https://github.com/ggml-org/llama.cpp/pull/17766

examples/idle/idle.cpp (new file, 110 lines)
View File

@@ -0,0 +1,110 @@
#include "arg.h"
#include "common.h"
#include "log.h"
#include "llama.h"
#include <cmath>
#include <cstdio>
#include <cstring>
#include <string>
#include <thread>
#include <vector>

static void print_usage(int /*argc*/, char ** argv) {
    printf("\nexample usage:\n");
    printf("\n %s -m model.gguf [-ngl n_gpu_layers]\n", argv[0]);
    printf("\n");
}

int main(int argc, char ** argv) {
    common_params params;

    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
        return 1;
    }

    common_init();

    // init LLM
    llama_backend_init();
    llama_numa_init(params.numa);

    // initialize the model
    llama_model_params model_params = common_model_params_to_llama(params);

    llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params);
    if (model == NULL) {
        LOG_ERR("%s: error: unable to load model\n" , __func__);
        return 1;
    }

    const llama_vocab * vocab = llama_model_get_vocab(model);

    // we need just a dummy token to evaluate
    std::vector<llama_token> prompt_tokens(1, llama_vocab_bos(vocab));

    llama_context_params ctx_params = llama_context_default_params();
    ctx_params.n_ctx = 512;
    ctx_params.n_batch = 512;
    ctx_params.no_perf = false;

    llama_context * ctx = llama_init_from_model(model, ctx_params);
    if (ctx == NULL) {
        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
        return 1;
    }

    llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());

    const int n_iters = 3;

    // warm-up
    llama_decode(ctx, batch);
    llama_memory_clear(llama_get_memory(ctx), true);
    llama_synchronize(ctx);

    for (int64_t t_pause_ms = 0; t_pause_ms <= 4000; t_pause_ms += 800) {
        double t_sum_us = 0.0;
        double t_sum2_us = 0.0;

        for (int i = 0; i < n_iters; i++) {
            // this pause is important - it simulates "idle GPU"
            std::this_thread::sleep_for(std::chrono::milliseconds(t_pause_ms));

            const int64_t t_start_us = llama_time_us();

            // this should take constant time
            llama_decode(ctx, batch);
            llama_synchronize(ctx);

            const int64_t t_end_us = llama_time_us();

            const double t_cur_us = t_end_us - t_start_us;

#if 1
            // print individual decode times
            printf(" - decode time: %8.2f ms\n", t_cur_us / 1000);
#endif

            t_sum_us += t_cur_us;
            t_sum2_us += t_cur_us * t_cur_us;

            llama_memory_clear(llama_get_memory(ctx), true);
            llama_synchronize(ctx); // just in case
        }

        const double t_avg_us = t_sum_us / n_iters;
        const double t_dev_us = sqrt((t_sum2_us / (n_iters - 1)) - (t_avg_us * t_avg_us * n_iters) / (n_iters - 1));

        printf("iters: %4d, pause: %5d ms, avg decode time: %8.2f +/- %4.2f ms\n", n_iters, (int) t_pause_ms, t_avg_us / 1000, t_dev_us / 1000);
        fflush(stdout);
    }

    llama_free(ctx);
    llama_model_free(model);

    return 0;
}
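
For reference, the deviation printed by the loop above is the usual sample standard deviation, computed from the running sums as

    s = \sqrt{ \left( \sum_i t_i^2 - n \, \bar{t}^{\,2} \right) / (n - 1) }

with n = n_iters and \bar{t} the mean decode time.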

View File

@@ -241,6 +241,12 @@ int main(int argc, char ** argv) {
llama_batch_free(batch);
// this one is managed by common_init_result
//llama_free(ctx);
llama_free(ctx2);
llama_free(ctx3);
if (result0 != result2) {
fprintf(stderr, "\n%s : error : the seq restore generation is different\n", __func__);
return 1;

View File

@@ -18,6 +18,7 @@ cd llama.cpp
cmake -S . -B build
cmake --build build
cmake --install build --prefix inst
```
### Build simple-cmake-pkg

View File

@@ -175,11 +175,6 @@ option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requi
set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")
set(GGML_CPU_POWERPC_CPUTYPE "" CACHE STRING "ggml: CPU type for PowerPC")
if (MINGW)
set(GGML_WIN_VER "0xA00" CACHE STRING "ggml: Windows version")
endif()
# ggml core
set(GGML_SCHED_MAX_COPIES "4" CACHE STRING "ggml: max input copies for pipeline parallelism")
option(GGML_CPU "ggml: enable CPU backend" ON)

View File

@@ -1,6 +1,5 @@
#pragma once
#include "ggml.h"
#include "ggml-backend.h"
#ifdef __cplusplus
@@ -8,7 +7,7 @@ extern "C" {
#endif
#define RPC_PROTO_MAJOR_VERSION 3
#define RPC_PROTO_MINOR_VERSION 5
#define RPC_PROTO_MINOR_VERSION 6
#define RPC_PROTO_PATCH_VERSION 0
#define GGML_RPC_MAX_SERVERS 16

View File

@@ -204,6 +204,10 @@
# define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
#endif
#if defined(_WIN32) && !defined(_WIN32_WINNT)
# define _WIN32_WINNT 0x0A00
#endif
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

View File

@@ -127,10 +127,6 @@ if (NOT MSVC)
endif()
endif()
if (MINGW)
add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER})
endif()
#
# POSIX conformance
#

View File

@@ -505,7 +505,6 @@ void ggml_gemv_q4_K_8x4_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
constexpr int blocklen = 8;
assert(n % qk == 0);
assert(nr % 4 == 0);
assert(nc % ncols_interleaved == 0);
UNUSED(nb);
@@ -645,7 +644,6 @@ void ggml_gemv_q4_K_8x8_q8_K(int n,
constexpr int blocklen = 8;
assert(n % qk == 0);
assert(nr % 4 == 0);
assert(nc % ncols_interleaved == 0);
UNUSED(nb);

View File

@@ -0,0 +1,333 @@
#pragma once
typedef vector unsigned char vec_t;
typedef __vector_quad acc_t;
template <typename TA>
class tinyBLAS_Q0_PPC {
public:
tinyBLAS_Q0_PPC(int64_t k,
const TA *A, int64_t lda,
const block_q8_0 *B, int64_t ldb,
float *C, int64_t ldc,
int ith, int nth);
void matmul(int64_t m, int64_t n);
void matmul_tiled_q0(int64_t m, int64_t n, int64_t mc, int64_t nc, int64_t kc) {
vec_t A_pack[mc*kc*2];
vec_t B_pack[nc*kc*2];
int comparray[mc*kc];
constexpr bool is_Ablock_q4 = std::is_same_v<TA, block_q4_0>;
int64_t ytiles = m / mc;
int64_t xtiles = n / nc;
int64_t tiles = xtiles * ytiles;
int64_t duty = (tiles + nth - 1) / nth;
int64_t start = duty * ith;
int64_t end = start + duty;
if (end > tiles) {
end = tiles;
}
for (int64_t job = start; job < end; ++job) {
int64_t ii = (job / xtiles) * mc;
int64_t jj = (job % xtiles) * nc;
for (int64_t kk = 0; kk < k; kk += kc) {
if constexpr(is_Ablock_q4) {
packNormalInt4_large(A + ii*lda + kk, lda, mc, 4, (int8_t*)A_pack, comparray);
} else {
packNormal_large<int8_t, vector signed char>(A + ii*lda + kk, lda, mc, 8, (int8_t*)A_pack, false, comparray);
}
packNormal_large<uint8_t, vector unsigned char>(B + jj*ldb + kk, ldb, nc, 8, (uint8_t*)B_pack, true);
KERNEL_Q0(ii, jj, mc, nc, kc, kk, A_pack, B_pack, comparray);
}
}
}
private:
inline void save_res(int ii, int jj, int idx, vector float* fin_res, int RM=4, int RN=4) {
for (int I = 0; I < RM; I++) {
for (int J = 0; J < RN; J++) {
*((float*)(C+ii+((jj+J)*ldc)+I)) = *((float*)&fin_res[idx+I]+J);
}
}
}
inline void add_save_res(int ii, int jj, int idx, vector float* fin_res, int RM=4, int RN=4) {
for (int I = 0; I < RM; I++) {
for (int J = 0; J < RN; J++) {
float * c_ptr = (float *)(C+ii+((jj+J)*ldc)+I);
*c_ptr += *((float*)&fin_res[idx+I]+J);
}
}
}
template<typename ArrayType>
inline void compute(acc_t* ACC, int c_idx, int s_idx, ArrayType& comparray, vector float* vs, vector float* fin_res) {
vector signed int vec_C[4];
vector float CA[4] = {0};
vector float res[4] = {0};
__builtin_mma_disassemble_acc(vec_C, ACC);
for (int i = 0; i < 4; i++) {
CA[i] = vec_splats((float)(((double)comparray[c_idx+i]) * -128.0));
res[i] = vec_add(vec_ctf(vec_C[i], 0), CA[i]);
fin_res[s_idx+i] = vec_madd(res[i], vs[s_idx+i], fin_res[s_idx+i]);
}
}
inline void process_q4_elements(vector signed char (&c)[2], int* ca) {
const vector signed char lowMask = vec_splats((signed char)0xF);
const vector unsigned char v4 = vec_splats((unsigned char)0x4);
const vector signed char v8 = vec_splats((signed char)0x8);
vector signed int vsum = {0};
vector signed int vsum2 = {0};
c[0] = vec_and(c[1], lowMask);
c[1] = vec_sr(c[1], v4);
c[0] = vec_sub(c[0], v8);
c[1] = vec_sub(c[1], v8);
vsum = vec_sum4s(c[0], vsum);
vsum2 = vec_sum4s(c[1], vsum2);
vsum = vec_add(vsum, vsum2);
*(ca) = vsum[0] + vsum[1] + vsum[2] + vsum[3];
}
template <typename V1, typename V2>
inline void vector_permute_store(V2 &s1, V2 &s2, V2 &s3, V2 &s4, V1 *vecOffset, bool flip) {
vector unsigned char swiz1 = {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23};
vector unsigned char swiz2 = {8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31};
vector unsigned char swiz3 = {0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27};
vector unsigned char swiz4 = {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31};
V2 t1, t2, t3, t4, t5, t6, t7, t8;
vector unsigned char xor_vector;
uint8_t flip_vec = 0x80;
xor_vector = vec_splats(flip_vec);
t1 = vec_perm(s1, s2, swiz1);
t2 = vec_perm(s1, s2, swiz2);
t3 = vec_perm(s3, s4, swiz1);
t4 = vec_perm(s3, s4, swiz2);
t5 = vec_perm(t1, t3, swiz3);
t6 = vec_perm(t1, t3, swiz4);
t7 = vec_perm(t2, t4, swiz3);
t8 = vec_perm(t2, t4, swiz4);
if (flip == true) {
t5 = vec_xor(t5, xor_vector);
t6 = vec_xor(t6, xor_vector);
t7 = vec_xor(t7, xor_vector);
t8 = vec_xor(t8, xor_vector);
}
vec_xst(t5, 0, vecOffset);
vec_xst(t6, 0, vecOffset+16);
vec_xst(t7, 0, vecOffset+32);
vec_xst(t8, 0, vecOffset+48);
}
template<int RM, int RN>
inline void kernel(int64_t ii, int64_t jj) {
if constexpr(RM == 4 && RN == 8) {
KERNEL_4x8(ii,jj);
} else if constexpr(RM == 8 && RN == 4) {
KERNEL_8x4(ii,jj);
} else if constexpr(RM == 8 && RN == 8) {
KERNEL_8x8(ii,jj);
} else {
assert(false && "RN/RM values not supported");
}
}
template<int size>
void packNormalInt4(const TA* a, int64_t lda, int rows, int cols, int8_t* vec, std::array<int, size>& comparray);
template<typename VA, typename VB>
void packNormal(const block_q8_0* a, int64_t lda, int rows, int cols, VA* vec, bool flip);
void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n);
void KERNEL_4x8(int64_t ii, int64_t jj);
void KERNEL_8x4(int64_t ii, int64_t jj);
void KERNEL_8x8(int64_t ii, int64_t jj);
void gemm_small(int64_t m0, int64_t m, int64_t n0, int64_t n, int RM, int RN);
template <int RM, int RN>
void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n);
void compute_scale(int64_t ii, int64_t jj, int blk, vector float* vs){
for (int I = 0; I<8; I++) {
float a_scale = unhalf((A+((ii+I)*lda)+blk)->d);
for (int J = 0; J<4; J++) {
*((float*)&vs[I]+J) = (a_scale * unhalf((B+((jj+J)*ldb)+blk)->d));
*((float*)&vs[I+8]+J) = (a_scale * unhalf((B+((jj+J+4)*ldb)+blk)->d));
}
}
}
inline void process_q8_elements(const int8_t *qs, int *ca) {
vector signed char c1 = vec_xl(0, qs);
vector signed char c2 = vec_xl(16, qs);
vector signed int vsum1 = {0};
vector signed int vsum2 = {0};
vsum1 = vec_sum4s(c1, vsum1);
vsum2 = vec_sum4s(c2, vsum2);
vector signed int vsum = vec_add(vsum1, vsum2);
*ca = vsum[0] + vsum[1] + vsum[2] + vsum[3];
}
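// packNormal_large: packs rows (in groups of 8) x kc blocks of q8_0 quants into the
// interleaved layout consumed by the MMA kernels via vector_permute_store, optionally
// XOR-ing 0x80 onto every byte (flip) and, when comparray is non-null, recording the
// per-row, per-block byte sums used later for the -128 compensation.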
template<typename VA, typename VB>
void packNormal_large(const block_q8_0* a, int64_t lda, int rows, int cols, VA* vec, bool flip, int* comparray=nullptr) {
int64_t i, j;
block_q8_0 *aoffset = NULL;
VA *vecOffset = NULL;
block_q8_0* aoffsets[8];
__vector_pair arr[8];
VB c[8][2] = {0};
VB c1[8] = {0}; VB c2[8] = {0};
aoffset = const_cast<block_q8_0*>(a);
vecOffset = vec;
j = (rows >> 3);
int index = 0;
if (j > 0) {
do {
for (int it = 0; it < 8; it++)
aoffsets[it] = aoffset + it*lda;
aoffset += 8 * lda;
for (int blk = 0; blk < kc; blk++) {
for (int it = 0; it < 8; it++) {
arr[it] = __builtin_vsx_lxvp(0, (__vector_pair*)(aoffsets[it]+blk)->qs);
__builtin_vsx_disassemble_pair(c[it], &arr[it]);
c1[it] = c[it][0];
c2[it] = c[it][1];
if (comparray){
process_q8_elements((aoffsets[it]+ blk)->qs, &comparray[index + 8*blk + it]);
}
}
vector_permute_store<VA, VB>(c1[0], c1[1], c1[2], c1[3], vecOffset, flip);
vector_permute_store<VA, VB>(c2[0], c2[1], c2[2], c2[3], vecOffset+64, flip);
vector_permute_store<VA, VB>(c1[4], c1[5], c1[6], c1[7], vecOffset+128, flip);
vector_permute_store<VA, VB>(c2[4], c2[5], c2[6], c2[7], vecOffset+192, flip);
vecOffset += 256;
}
j--;
index += 8*kc;
} while(j > 0);
}
}
void packNormalInt4_large(const TA* a, int64_t lda, int rows, int cols, int8_t* vec, int*comparray) {
int64_t i, j;
TA *aoffset = NULL;
int8_t *vecOffset = NULL;
TA *aoffset1 = NULL, *aoffset2 = NULL, *aoffset3 = NULL, *aoffset4 = NULL;
TA *aoffset5 = NULL, *aoffset6 = NULL, *aoffset7 = NULL, *aoffset8 = NULL;
vector signed char c1[2] = {0}, c2[2] = {0}, c3[2] = {0}, c4[2] = {0};
vector signed char c5[2] = {0}, c6[2] = {0}, c7[2] = {0}, c8[2] = {0};
aoffset = const_cast<TA*>(a);
vecOffset = vec;
int index = 0;
j = (rows >> 3);
if (j > 0) {
do {
aoffset1 = aoffset;
aoffset2 = aoffset1 + lda;
aoffset3 = aoffset2 + lda;
aoffset4 = aoffset3 + lda;
aoffset5 = aoffset4 + lda;
aoffset6 = aoffset5 + lda;
aoffset7 = aoffset6 + lda;
aoffset8 = aoffset7 + lda;
aoffset += 8 * lda;
for (int blk = 0; blk < kc; blk++) {
c1[1] = reinterpret_cast<vector signed char>(vec_xl(0, (aoffset1+blk)->qs));
c2[1] = reinterpret_cast<vector signed char>(vec_xl(0, (aoffset2+blk)->qs));
c3[1] = reinterpret_cast<vector signed char>(vec_xl(0, (aoffset3+blk)->qs));
c4[1] = reinterpret_cast<vector signed char>(vec_xl(0, (aoffset4+blk)->qs));
c5[1] = reinterpret_cast<vector signed char>(vec_xl(0, (aoffset5+blk)->qs));
c6[1] = reinterpret_cast<vector signed char>(vec_xl(0, (aoffset6+blk)->qs));
c7[1] = reinterpret_cast<vector signed char>(vec_xl(0, (aoffset7+blk)->qs));
c8[1] = reinterpret_cast<vector signed char>(vec_xl(0, (aoffset8+blk)->qs));
process_q4_elements(c1, &comparray[index + 8*blk+0]);
process_q4_elements(c2, &comparray[index + 8*blk+1]);
process_q4_elements(c3, &comparray[index + 8*blk+2]);
process_q4_elements(c4, &comparray[index + 8*blk+3]);
process_q4_elements(c5, &comparray[index + 8*blk+4]);
process_q4_elements(c6, &comparray[index + 8*blk+5]);
process_q4_elements(c7, &comparray[index + 8*blk+6]);
process_q4_elements(c8, &comparray[index + 8*blk+7]);
vector_permute_store<int8_t, vector signed char>(c1[0], c2[0], c3[0], c4[0], vecOffset, false);
vector_permute_store<int8_t, vector signed char>(c1[1], c2[1], c3[1], c4[1], vecOffset+64, false);
vector_permute_store<int8_t, vector signed char>(c5[0], c6[0], c7[0], c8[0], vecOffset+128, false);
vector_permute_store<int8_t, vector signed char>(c5[1], c6[1], c7[1], c8[1], vecOffset+192, false);
vecOffset += 256;
}
j--;
index += 8*kc;
} while (j > 0);
}
}
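// KERNEL_Q0: computes an mc x nc tile of C from the pre-packed panels vec_A/vec_B, walking the
// kc quant blocks two at a time with eight MMA accumulators per 8x8 sub-tile; each accumulator
// is converted to float with the -128*rowsum compensation and the d_A*d_B scales
// (compute/compute_scale), then stored on the first k chunk (l == 0) or accumulated into C.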
void KERNEL_Q0(int64_t ii, int64_t jj, int64_t mc, int64_t nc, int64_t kc, int64_t l, vec_t *vec_A, vec_t *vec_B, int *comparray) {
acc_t acc[8];
for (int i = 0; i < mc ; i += 8) {
for (int j = 0; j < nc; j += 8) {
vector float fin_res[16] = {0};
vector float vs[16] = {0};
for (int64_t kk = 0; kk < kc; kk+=2) {
for (int x = 0; x < 8; x++) {
__builtin_mma_xxsetaccz(&acc[x]);
}
int A_block_idx = (i/8)*(16*kc) + kk*16;
int B_block_idx = (j/8)*(16*kc)+ kk*16;
vec_t *A_block = &vec_A[A_block_idx];
vec_t *B_block = &vec_B[B_block_idx];
for (int x = 0; x < 8; x++) {
__builtin_mma_xvi8ger4pp(&acc[0], A_block[x], B_block[x]);
__builtin_mma_xvi8ger4pp(&acc[1], A_block[x + 8], B_block[x]);
__builtin_mma_xvi8ger4pp(&acc[2], A_block[x], B_block[x+8]);
__builtin_mma_xvi8ger4pp(&acc[3], A_block[x+8], B_block[x+8]);
}
compute_scale(ii+i, jj+j, l+kk, vs);
int c_index = (i/8)*(8*kc)+ kk*8;
int* c_block = &comparray[c_index];
compute(&acc[0], 0, 0, c_block, vs, fin_res);
compute(&acc[1], 4, 4, c_block, vs, fin_res);
compute(&acc[2], 0, 8, c_block, vs, fin_res);
compute(&acc[3], 4, 12, c_block, vs, fin_res);
A_block_idx = (i/8)*(16*kc) + (kk+1)*16;
B_block_idx = (j/8)*(16*kc)+ (kk+1)*16;
A_block = &vec_A[A_block_idx];
B_block = &vec_B[B_block_idx];
for (int x = 0; x < 8; x++) {
__builtin_mma_xvi8ger4pp(&acc[4], A_block[x], B_block[x]);
__builtin_mma_xvi8ger4pp(&acc[5], A_block[x + 8], B_block[x]);
__builtin_mma_xvi8ger4pp(&acc[6], A_block[x], B_block[x+8]);
__builtin_mma_xvi8ger4pp(&acc[7], A_block[x+8], B_block[x+8]);
}
compute_scale(ii+i, jj+j, l+kk+1, vs);
c_index = (i/8)*(8*kc)+ (kk+1)*8;
c_block = &comparray[c_index];
compute(&acc[4], 0, 0, c_block, vs, fin_res);
compute(&acc[5], 4, 4, c_block, vs, fin_res);
compute(&acc[6], 0, 8, c_block, vs, fin_res);
compute(&acc[7], 4, 12, c_block, vs, fin_res);
}
if (l == 0) {
save_res(ii+i, jj+j, 0, fin_res);
save_res(ii+i+4, jj+j, 4, fin_res);
save_res(ii+i, jj+j+4, 8, fin_res);
save_res(ii+i+4, jj+j+4, 12, fin_res);
} else {
add_save_res(ii+i, jj+j, 0, fin_res);
add_save_res(ii+i+4, jj+j, 4, fin_res);
add_save_res(ii+i, jj+j+4, 8, fin_res);
add_save_res(ii+i+4, jj+j+4, 12, fin_res);
}
}
}
}
const TA *const A;
const block_q8_0 *const B;
float *C;
const int64_t k;
int64_t kc;
const int64_t lda;
const int64_t ldb;
const int64_t ldc;
const int ith;
const int nth;
};

View File

@@ -117,8 +117,7 @@ inline float32x4_t mul(float32x4_t x, float32x4_t y) { return vec_mul(x, y); }
#endif
#if defined(__MMA__)
typedef vector unsigned char vec_t;
typedef __vector_quad acc_t;
#include "sgemm-ppc.h"
#endif
////////////////////////////////////////////////////////////////////////////////////////////////////
// VECTORIZED FUSED MULTIPLY ADD
@@ -1573,95 +1572,35 @@ class tinyBLAS_BF16_PPC {
const int nth;
};
template <typename TA>
class tinyBLAS_Q0_PPC {
public:
tinyBLAS_Q0_PPC(int64_t k,
const TA *A, int64_t lda,
const block_q8_0 *B, int64_t ldb,
float *C, int64_t ldc,
int ith, int nth)
template <typename TA>
tinyBLAS_Q0_PPC<TA>::tinyBLAS_Q0_PPC(int64_t k,
const TA *A, int64_t lda,
const block_q8_0 *B, int64_t ldb,
float *C, int64_t ldc,
int ith, int nth)
: A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
kc = 64;
}
void matmul(int64_t m, int64_t n) {
mnpack(0, m, 0, n);
}
private:
inline void save_res(int ii, int jj, int idx, vector float* fin_res, int RM=4, int RN=4) {
for (int I = 0; I < RM; I++) {
for (int J = 0; J < RN; J++) {
*((float*)(C+ii+((jj+J)*ldc)+I)) = *((float*)&fin_res[idx+I]+J);
}
}
}
template<int size>
inline void compute(acc_t* ACC, int c_idx, int s_idx, std::array<int, size>& comparray, vector float* vs, vector float* fin_res) {
vector signed int vec_C[4];
vector float CA[4] = {0};
vector float res[4] = {0};
__builtin_mma_disassemble_acc(vec_C, ACC);
for (int i = 0; i < 4; i++) {
CA[i] = vec_splats((float)(((double)comparray[c_idx+i]) * -128.0));
res[i] = vec_add(vec_ctf(vec_C[i], 0), CA[i]);
fin_res[s_idx+i] = vec_madd(res[i], vs[s_idx+i], fin_res[s_idx+i]);
}
}
/* This function processes quantized data from block_q4_0 elements.
 * First we extract the two int4 values stored in a single int8_t into two vectors of signed int8.
 * Then we subtract 8 from each resulting element to convert the unsigned 4-bit values to signed int8.
 * We also compute the row sum, which is required to compensate for the above conversion. */
inline void process_q4_elements(vector signed char (&c)[2], int* ca) {
const vector signed char lowMask = vec_splats((signed char)0xF);
const vector unsigned char v4 = vec_splats((unsigned char)0x4);
const vector signed char v8 = vec_splats((signed char)0x8);
vector signed int vsum = {0};
vector signed int vsum2 = {0};
c[0] = vec_and(c[1], lowMask);
c[1] = vec_sr(c[1], v4);
c[0] = vec_sub(c[0], v8);
c[1] = vec_sub(c[1], v8);
vsum = vec_sum4s(c[0], vsum);
vsum2 = vec_sum4s(c[1], vsum2);
vsum = vec_add(vsum, vsum2);
*(ca) = vsum[0] + vsum[1] + vsum[2] + vsum[3];
}
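For readers without the VSX intrinsics at hand, a minimal scalar sketch of what process_q4_elements computes per 16-byte load (an editorial illustration, assuming the usual block_q4_0 layout of 32 weights stored as biased nibbles; unpack_q4_block is a made-up name):
#include <cstdint>
// Unpack 32 q4_0 weights (two per byte, bias 8) into signed int8 and return the
// row sum that later compensates the signed/unsigned MMA product.
static int unpack_q4_block(const uint8_t qs[16], int8_t out[32]) {
    int sum = 0;
    for (int i = 0; i < 16; ++i) {
        const int lo = (qs[i] & 0x0F) - 8;  // low nibble  -> element i
        const int hi = (qs[i] >> 4)   - 8;  // high nibble -> element i + 16
        out[i]      = (int8_t) lo;
        out[i + 16] = (int8_t) hi;
        sum += lo + hi;
    }
    return sum; // the value process_q4_elements writes into *ca
}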
template <typename V1, typename V2>
inline void vector_permute_store(V2 &s1, V2 &s2, V2 &s3, V2 &s4, V1 *vecOffset, bool flip) {
vector unsigned char swiz1 = {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23};
vector unsigned char swiz2 = {8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31};
vector unsigned char swiz3 = {0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27};
vector unsigned char swiz4 = {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31};
V2 t1, t2, t3, t4, t5, t6, t7, t8;
vector unsigned char xor_vector;
uint8_t flip_vec = 0x80;
xor_vector = vec_splats(flip_vec);
t1 = vec_perm(s1, s2, swiz1);
t2 = vec_perm(s1, s2, swiz2);
t3 = vec_perm(s3, s4, swiz1);
t4 = vec_perm(s3, s4, swiz2);
t5 = vec_perm(t1, t3, swiz3);
t6 = vec_perm(t1, t3, swiz4);
t7 = vec_perm(t2, t4, swiz3);
t8 = vec_perm(t2, t4, swiz4);
if (flip == true) {
t5 = vec_xor(t5, xor_vector);
t6 = vec_xor(t6, xor_vector);
t7 = vec_xor(t7, xor_vector);
t8 = vec_xor(t8, xor_vector);
template<typename TA>
void tinyBLAS_Q0_PPC<TA>::matmul(int64_t m, int64_t n) {
int mc = 64; int nc = 64;
if (n % 8 == 0 && n < nc) {
nc = n;
mc = 32 ;
kc = 32;
}
const bool is_aligned = ((m & (mc - 1)) == 0) & ((n & (nc - 1)) == 0) & ((k & (kc - 1)) == 0);
if (is_aligned) {
this->matmul_tiled_q0(m, n, mc, nc, kc);
} else {
mnpack(0, m, 0, n);
}
vec_xst(t5, 0, vecOffset);
vec_xst(t6, 0, vecOffset+16);
vec_xst(t7, 0, vecOffset+32);
vec_xst(t8, 0, vecOffset+48);
}
template<int size>
void packNormalInt4(const TA* a, int64_t lda, int rows, int cols, int8_t* vec, std::array<int, size>& comparray) {
template<typename TA>
template<int size>
void tinyBLAS_Q0_PPC<TA>::packNormalInt4(const TA* a, int64_t lda, int rows, int cols, int8_t* vec, std::array<int, size>& comparray) {
int64_t i, j;
TA *aoffset = NULL;
int8_t *vecOffset = NULL;
@@ -1781,8 +1720,10 @@ class tinyBLAS_Q0_PPC {
}
}
}
template<typename TA>
template<typename VA, typename VB>
void packNormal(const block_q8_0* a, int64_t lda, int rows, int cols, VA* vec, bool flip) {
void tinyBLAS_Q0_PPC<TA>::packNormal(const block_q8_0* a, int64_t lda, int rows, int cols, VA* vec, bool flip) {
int64_t i, j;
block_q8_0 *aoffset = NULL;
VA *vecOffset = NULL;
@@ -1822,7 +1763,6 @@ class tinyBLAS_Q0_PPC {
j--;
} while(j > 0);
}
if (rows & 4) {
aoffsets[0] = aoffset;
for (int it = 1; it < 4; it++ )
@@ -1878,7 +1818,8 @@ class tinyBLAS_Q0_PPC {
}
}
void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) {
template<typename TA>
void tinyBLAS_Q0_PPC<TA>::mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) {
int m_rem = MIN(m - m0, 16);
int n_rem = MIN(n - n0, 16);
@@ -1915,7 +1856,8 @@ class tinyBLAS_Q0_PPC {
}
void KERNEL_4x8(int64_t ii, int64_t jj) {
template<typename TA>
void tinyBLAS_Q0_PPC<TA>::KERNEL_4x8(int64_t ii, int64_t jj) {
vec_t vec_A[8], vec_B[16] = {0};
acc_t acc_0, acc_1;
std::array<int, 4> comparray {};
@@ -1953,14 +1895,15 @@ class tinyBLAS_Q0_PPC {
aoffset += lda;
}
}
compute<4>(&acc_0, 0, 0, comparray, vs, fin_res);
compute<4>(&acc_1, 0, 4, comparray, vs, fin_res);
compute(&acc_0, 0, 0, comparray, vs, fin_res);
compute(&acc_1, 0, 4, comparray, vs, fin_res);
}
save_res(ii, jj, 0, fin_res);
save_res(ii, jj+4, 4, fin_res);
}
void KERNEL_8x4(int64_t ii, int64_t jj) {
template<typename TA>
void tinyBLAS_Q0_PPC<TA>::KERNEL_8x4(int64_t ii, int64_t jj) {
vec_t vec_A[16], vec_B[8] = {0};
acc_t acc_0, acc_1;
std::array<int, 8> comparray {};
@@ -1997,16 +1940,18 @@ class tinyBLAS_Q0_PPC {
aoffset += lda;
}
}
compute<8>(&acc_0, 0, 0, comparray, vs, fin_res);
compute<8>(&acc_1, 4, 4, comparray, vs, fin_res);
compute(&acc_0, 0, 0, comparray, vs, fin_res);
compute(&acc_1, 4, 4, comparray, vs, fin_res);
}
save_res(ii, jj, 0, fin_res);
save_res(ii+4, jj, 4, fin_res);
}
void KERNEL_8x8(int64_t ii, int64_t jj) {
template<typename TA>
void tinyBLAS_Q0_PPC<TA>::KERNEL_8x8(int64_t ii, int64_t jj) {
vec_t vec_A[16], vec_B[16] = {0};
acc_t acc_0, acc_1, acc_2, acc_3;
acc_t acc_4, acc_5, acc_6, acc_7;
std::array<int, 8> comparray {};
vector float fin_res[16] = {0};
vector float vs[16] = {0};
@@ -2046,10 +1991,10 @@ class tinyBLAS_Q0_PPC {
aoffset += lda;
}
}
compute<8>(&acc_0, 0, 0, comparray, vs, fin_res);
compute<8>(&acc_1, 4, 4, comparray, vs, fin_res);
compute<8>(&acc_2, 0, 8, comparray, vs, fin_res);
compute<8>(&acc_3, 4, 12, comparray, vs, fin_res);
compute(&acc_0, 0, 0, comparray, vs, fin_res);
compute(&acc_1, 4, 4, comparray, vs, fin_res);
compute(&acc_2, 0, 8, comparray, vs, fin_res);
compute(&acc_3, 4, 12, comparray, vs, fin_res);
}
save_res(ii, jj, 0, fin_res);
save_res(ii+4, jj, 4, fin_res);
@@ -2057,7 +2002,8 @@ class tinyBLAS_Q0_PPC {
save_res(ii+4, jj+4, 12, fin_res);
}
void gemm_small(int64_t m0, int64_t m, int64_t n0, int64_t n, int RM, int RN) {
template<typename TA>
void tinyBLAS_Q0_PPC<TA>::gemm_small(int64_t m0, int64_t m, int64_t n0, int64_t n, int RM, int RN) {
int64_t ytiles = (m - m0) / RM;
int64_t xtiles = (n - n0) / RN;
int64_t tiles = xtiles * ytiles;
@@ -2125,21 +2071,9 @@ class tinyBLAS_Q0_PPC {
}
}
template<int RM, int RN>
inline void kernel(int64_t ii, int64_t jj) {
if constexpr(RM == 4 && RN == 8) {
KERNEL_4x8(ii,jj);
} else if constexpr(RM == 8 && RN == 4) {
KERNEL_8x4(ii,jj);
} else if constexpr(RM == 8 && RN == 8) {
KERNEL_8x8(ii,jj);
} else {
assert(false && "RN/RM values not supported");
}
}
template<typename TA>
template <int RM, int RN>
NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) {
NOINLINE void tinyBLAS_Q0_PPC<TA>::gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) {
int64_t ytiles = (m - m0) / RM;
int64_t xtiles = (n - n0) / RN;
int64_t tiles = xtiles * ytiles;
@@ -2151,20 +2085,12 @@ class tinyBLAS_Q0_PPC {
for (int64_t job = start; job < end; ++job) {
int64_t ii = m0 + job / xtiles * RM;
int64_t jj = n0 + job % xtiles * RN;
kernel<RM, RN>(ii, jj);
this->kernel<RM, RN>(ii, jj);
}
}
const TA *const A;
const block_q8_0 *const B;
float *C;
const int64_t k;
const int64_t lda;
const int64_t ldb;
const int64_t ldc;
const int ith;
const int nth;
};
template class tinyBLAS_Q0_PPC<block_q4_0>;
template class tinyBLAS_Q0_PPC<block_q8_0>;
class tinyBLAS_PPC {
public:

View File

@@ -6,6 +6,12 @@
#include <vecintrin.h>
#endif
#ifdef _MSC_VER
#define NOINLINE __declspec(noinline)
#else
#define NOINLINE __attribute__((__noinline__))
#endif
#ifdef __cplusplus
extern "C" {
#endif

View File

@@ -226,7 +226,7 @@ static const char * cu_get_error_str(CUresult err) {
#define AMD_MFMA_AVAILABLE
#endif // defined(GGML_USE_HIP) && defined(CDNA) && !defined(GGML_HIP_NO_MMQ_MFMA)
#if defined(GGML_USE_HIP) && defined(RDNA4)
#if defined(GGML_USE_HIP) && (defined(RDNA4) || defined(RDNA3))
#define AMD_WMMA_AVAILABLE
#endif // defined(GGML_USE_HIP) && defined(RDNA4)
@@ -294,7 +294,7 @@ static bool amd_mfma_available(const int cc) {
}
static bool amd_wmma_available(const int cc) {
return GGML_CUDA_CC_IS_RDNA4(cc);
return (GGML_CUDA_CC_IS_RDNA4(cc) || GGML_CUDA_CC_IS_RDNA3(cc));
}
static bool volta_mma_available(const int cc) {
@@ -463,6 +463,53 @@ static __device__ __forceinline__ float warp_reduce_max(float x) {
return x;
}
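// Inclusive prefix sum over a (sub-)warp of `width` lanes using shuffle-up steps
// (Hillis-Steele scan): after log2(width) doubling steps, lane i holds the sum of x
// over lanes 0..i. The float2/half2 overloads below apply the same scan per component.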
template<typename T, int width = WARP_SIZE>
static __device__ __forceinline__ T warp_prefix_inclusive_sum(T x) {
const int lane_id = threadIdx.x % width;
#pragma unroll
for (int offset = 1; offset < width; offset <<= 1) {
const T t = __shfl_up_sync(0xffffffff, x, offset, width);
if (lane_id >= offset) {
x += t;
}
}
return x;
}
template<int width = WARP_SIZE>
static __device__ __forceinline__ float2 warp_prefix_inclusive_sum(float2 a) {
const int lane_id = threadIdx.x % width;
#pragma unroll
for (int offset = 1; offset < width; offset <<= 1) {
const float t_x = __shfl_up_sync(0xffffffff, a.x, offset, width);
const float t_y = __shfl_up_sync(0xffffffff, a.y, offset, width);
if (lane_id >= offset) {
a.x += t_x;
a.y += t_y;
}
}
return a;
}
template<int width = WARP_SIZE>
static __device__ __forceinline__ half2 warp_prefix_inclusive_sum(half2 a) {
#ifdef FP16_AVAILABLE
const int lane_id = threadIdx.x % width;
#pragma unroll
for (int offset = 1; offset < width; offset <<= 1) {
const half2 t = __shfl_up_sync(0xffffffff, a, offset, width);
if (lane_id >= offset) {
a = __hadd2(a, t);
}
}
return a;
#else
NO_DEVICE_CODE;
return a;
#endif // FP16_AVAILABLE
}
static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b) {
#ifdef FP16_AVAILABLE

View File

@@ -0,0 +1,237 @@
#include <algorithm>
#include "cumsum.cuh"
#include "convert.cuh"
#include "ggml-cuda/common.cuh"
#include "ggml.h"
#ifdef GGML_CUDA_USE_CUB
# include <cub/device/device_scan.cuh>
#endif // GGML_CUDA_USE_CUB
template<typename T, int BLOCK_SIZE>
static __global__ void cumsum_cub_kernel(
const T * __restrict__ src,
T * __restrict__ dst,
const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
const int64_t s01, const int64_t s02, const int64_t s03,
const int64_t s1, const int64_t s2, const int64_t s3) {
#ifdef GGML_CUDA_USE_CUB
using BlockScan = cub::BlockScan<T, BLOCK_SIZE>;
__shared__ typename BlockScan::TempStorage temp_storage;
__shared__ T block_carry; // carry from previous tile
const int tid = threadIdx.x;
const int64_t i1 = blockIdx.x;
const int64_t i2 = blockIdx.y;
const int64_t i3 = blockIdx.z;
if (i1 >= ne01 || i2 >= ne02 || i3 >= ne03) {
return;
}
const T * src_row = src + i1 * s01 + i2 * s02 + i3 * s03;
T * dst_row = dst + i1 * s1 + i2 * s2 + i3 * s3;
if (tid == 0) {
block_carry = 0;
}
__syncthreads();
for (int64_t start = 0; start < ne00; start += BLOCK_SIZE) {
int64_t idx = start + tid;
T x = (idx < ne00) ? src_row[idx] : T(0);
T inclusive;
T block_total;
BlockScan(temp_storage).InclusiveSum(x, inclusive, block_total);
__syncthreads();
T final_val = inclusive + block_carry;
// store result
if (idx < ne00) {
dst_row[idx] = final_val;
}
__syncthreads();
if (tid == 0) {
block_carry += block_total;
}
__syncthreads();
}
#else
NO_DEVICE_CODE;
#endif // GGML_CUDA_USE_CUB
}
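As a plain-CPU reference for what both cumsum kernels compute per row (an editorial sketch; the chunk/carry structure mirrors the BLOCK_SIZE tiling above, and cumsum_row is an illustrative name):
#include <algorithm>
#include <cstddef>
#include <vector>
// Inclusive cumulative sum of one row, processed in fixed-size chunks with a running
// carry -- the same shape as the tiled GPU scans above.
std::vector<float> cumsum_row(const std::vector<float> & src, size_t chunk = 256) {
    std::vector<float> dst(src.size());
    float carry = 0.0f;                       // block_carry / s_carry
    for (size_t start = 0; start < src.size(); start += chunk) {
        const size_t end = std::min(start + chunk, src.size());
        float chunk_total = 0.0f;
        for (size_t i = start; i < end; ++i) {
            chunk_total += src[i];            // in-chunk inclusive scan
            dst[i] = chunk_total + carry;     // add carry from previous chunks
        }
        carry += chunk_total;                 // propagate to the next chunk
    }
    return dst;
}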
// Fallback kernel implementation (original)
template<typename T>
static __global__ void cumsum_kernel(
const T * src, T * dst,
const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
const int64_t s00, const int64_t s01, const int64_t s02, const int64_t s03,
const int64_t s0, const int64_t s1, const int64_t s2, const int64_t s3) {
GGML_UNUSED_VARS(s00, s0);
const int tid = threadIdx.x;
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
const int lane = tid % warp_size;
const int warp = tid / warp_size;
const int warps_per_block = blockDim.x / warp_size;
extern __shared__ float smem[];
float * s_vals = smem;
float * s_warp_sums = smem + blockDim.x;
float * s_carry = smem + blockDim.x + warps_per_block;
float * s_chunk_total = s_carry + 1;
// Initialize carry
if (tid == 0) {
*s_carry = 0.0f;
}
__syncthreads();
const int64_t i3 = blockIdx.z;
const int64_t i2 = blockIdx.y;
const int64_t i1 = blockIdx.x;
if (i3 >= ne03 || i2 >= ne02 || i1 >= ne01) {
return;
}
const T * src_row = src + i1 * s01 + i2 * s02 + i3 * s03;
T * dst_row = dst + i1 * s1 + i2 * s2 + i3 * s3;
for (int64_t start = 0; start < ne00; start += blockDim.x) {
int64_t idx = start + tid;
float val = (idx < ne00) ? ggml_cuda_cast<float, T>(src_row[idx]) : 0.0f;
// 1. Warp inclusive scan
val = warp_prefix_inclusive_sum<T, warp_size>(val);
s_vals[tid] = val;
// Store warp total
if (lane == warp_size - 1) {
s_warp_sums[warp] = val;
}
__syncthreads();
// 2. Exclusive scan of warp sums (warp 0 only)
if (warp == 0) {
float w = (tid < warps_per_block) ? s_warp_sums[tid] : 0.0f;
float inc = warp_prefix_inclusive_sum<T, warp_size>(w);
if (tid < warps_per_block) {
s_warp_sums[tid] = inc - w; // exclusive sum
}
if (tid == warps_per_block - 1) {
*s_chunk_total = inc; // total sum of this chunk
}
}
__syncthreads();
float carry = *s_carry;
float final_val = s_vals[tid] + s_warp_sums[warp] + carry;
if (idx < ne00) {
dst_row[idx] = ggml_cuda_cast<T, float>(final_val);
}
__syncthreads();
// Update carry for next chunk
if (tid == 0) {
*s_carry += *s_chunk_total;
}
__syncthreads();
}
}
template<typename T>
static void cumsum_cuda(
const T * src, T * dst,
const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
const int64_t nb00, const int64_t nb01, const int64_t nb02, const int64_t nb03,
const int64_t nb0, const int64_t nb1, const int64_t nb2, const int64_t nb3,
cudaStream_t stream) {
const size_t type_size = sizeof(T);
bool use_cub = false;
#ifdef GGML_CUDA_USE_CUB
// Check if we can use CUB (data must be contiguous along innermost dimension)
const bool is_contiguous = (nb00 == type_size) && (nb0 == type_size);
if (is_contiguous) {
use_cub = true;
}
#endif // GGML_CUDA_USE_CUB
dim3 grid_dims(ne01, ne02, ne03);
const auto &info = ggml_cuda_info().devices[ggml_cuda_get_device()];
const int warp_size = info.warp_size;
const int num_warps = (ne00 + warp_size - 1) / warp_size;
int block_size = num_warps * warp_size;
block_size = std::min(block_size, CUDA_CUMSUM_BLOCK_SIZE);
dim3 block_dims(block_size, 1, 1);
const int warps_per_block = block_size / warp_size;
const size_t shmem_size = (block_size + warps_per_block + 2) * sizeof(float);
if (use_cub) {
cumsum_cub_kernel<T, CUDA_CUMSUM_BLOCK_SIZE><<<grid_dims, CUDA_CUMSUM_BLOCK_SIZE, 0, stream>>>(
src, dst,
ne00, ne01, ne02, ne03,
nb01 / type_size, nb02 / type_size, nb03 / type_size,
nb1 / type_size, nb2 / type_size, nb3 / type_size
);
} else {
cumsum_kernel<<<grid_dims, block_dims, shmem_size, stream>>>(
src, dst,
ne00, ne01, ne02, ne03,
nb00 / type_size, nb01 / type_size, nb02 / type_size, nb03 / type_size,
nb0 / type_size, nb1 / type_size, nb2 / type_size, nb3 / type_size
);
}
}
void ggml_cuda_op_cumsum(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
cudaStream_t stream = ctx.stream();
GGML_ASSERT(src0->type == dst->type);
switch(src0->type) {
case GGML_TYPE_F32:
{
cumsum_cuda(
(const float *)src0->data, (float *)dst->data,
src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3],
stream
);
} break;
// These types are not supported on the CPU backend for now anyway, so keep them commented out because they cause errors on some CI platforms
/*case GGML_TYPE_F16:
{
cumsum_cuda(
(const half *)src0->data, (half *)dst->data,
src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3],
stream
);
} break;
case GGML_TYPE_BF16:
{
cumsum_cuda(
(const nv_bfloat16 *)src0->data, (nv_bfloat16 *)dst->data,
src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3],
stream
);
} break;*/
default:
GGML_ABORT("fatal error");
}
}

View File

@@ -0,0 +1,5 @@
#include "common.cuh"
#define CUDA_CUMSUM_BLOCK_SIZE 256
void ggml_cuda_op_cumsum(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

View File

@@ -10,6 +10,12 @@
#define HALF_MAX_HALF __float2half(65504.0f/2) // Use neg. of this instead of -INFINITY to initialize KQ max vals to avoid NaN upon subtraction.
#define SOFTMAX_FTZ_THRESHOLD -20.0f // Softmax exp. of values smaller than this are flushed to zero to avoid NaNs.
// log(2) = 0.6931, by adding this to the KQ maximum used for the softmax the numerical range representable
// by the VKQ accumulators is effectively being shifted up by a factor of 8.
// This reduces issues with numerical overflow but also causes larger values to be flushed to zero.
// However, as the output from FlashAttention will usually be used as an input for a matrix multiplication this should be negligible.
#define FATTN_KQ_MAX_OFFSET 0.6931f
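A short editorial note on the mechanics of this constant: with running maximum $m$ and offset $c = \ln 2$, each softmax numerator becomes

$$\exp\bigl(s - (m + c)\bigr) = e^{-c}\exp(s - m) = \tfrac{1}{2}\exp(s - m),$$

so every term accumulated into VKQ is scaled down, leaving more headroom at the top of the accumulator range while pushing more small terms below SOFTMAX_FTZ_THRESHOLD; because the row sum uses the same shifted maximum, the common factor cancels in the final normalization.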
typedef void (* fattn_kernel_t)(
const char * __restrict__ Q,
const char * __restrict__ K,

View File

@@ -532,7 +532,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
#pragma unroll
for (int l = 0; l < T_C_KQ::ne; ++l) {
if (!oob_check || k0 + T_C_KQ::get_i(l) < k_VKQ_sup) {
KQ_max_new[l % 2] = fmaxf(KQ_max_new[l % 2], KQ_C[k0/(np*T_C_KQ::I)].x[l]);
KQ_max_new[l % 2] = fmaxf(KQ_max_new[l % 2], KQ_C[k0/(np*T_C_KQ::I)].x[l] + FATTN_KQ_MAX_OFFSET);
}
}
}
@@ -585,7 +585,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
for (int l = 0; l < T_C_KQ::ne; ++l) {
if (!oob_check || k0 + T_C_KQ::get_j(l) < k_VKQ_sup) {
// Turing + Volta:
KQ_max_new[(l/2) % 2] = fmaxf(KQ_max_new[(l/2) % 2], KQ_C[(k0/(np*T_C_KQ::J))].x[l]);
KQ_max_new[(l/2) % 2] = fmaxf(KQ_max_new[(l/2) % 2], KQ_C[(k0/(np*T_C_KQ::J))].x[l] + FATTN_KQ_MAX_OFFSET);
}
}
}

View File

@@ -572,7 +572,7 @@ static __device__ __forceinline__ void flash_attn_tile_iter(
KQ_acc[(i_KQ_0/(np*warp_size))*cpw + jc0] += (ncols2 > 1 || mask) ?
slope*__half2float(mask[j*stride_mask + k_VKQ_0 + i_KQ]) : 0.0f;
KQ_max_new[jc0] = fmaxf(KQ_max_new[jc0], KQ_acc[(i_KQ_0/(np*warp_size))*cpw + jc0]);
KQ_max_new[jc0] = fmaxf(KQ_max_new[jc0], KQ_acc[(i_KQ_0/(np*warp_size))*cpw + jc0] + FATTN_KQ_MAX_OFFSET);
}
}

View File

@@ -270,7 +270,7 @@ static __global__ void flash_attn_ext_vec(
sum += slope*__half2float(maskh[j*ne11 + i_KQ]);
}
KQ_max_new[j] = fmaxf(KQ_max_new[j], sum);
KQ_max_new[j] = fmaxf(KQ_max_new[j], sum + FATTN_KQ_MAX_OFFSET);
if ((nthreads_KQ == WARP_SIZE ? threadIdx.x : threadIdx.x % nthreads_KQ) == uint32_t(i_KQ_0)) {
KQ_reg[j] = sum;

View File

@@ -220,7 +220,7 @@ static __global__ void flash_attn_ext_f16(
KQ_f_tmp[k0/warp_size] += mask && ic0 + j < int(ne01.z) ?
__half2float(slopeh*maskh[j*(nb31/sizeof(half)) + k_VKQ_0 + k]) : 0.0f;
KQ_max_new = max(KQ_max_new, KQ_f_tmp[k0/warp_size]);
KQ_max_new = max(KQ_max_new, KQ_f_tmp[k0/warp_size] + FATTN_KQ_MAX_OFFSET);
}
KQ_max_new = warp_reduce_max<warp_size>(KQ_max_new);

View File

@@ -54,6 +54,8 @@
#include "ggml-cuda/set-rows.cuh"
#include "ggml-cuda/pad_reflect_1d.cuh"
#include "ggml-cuda/solve_tri.cuh"
#include "ggml-cuda/tri.cuh"
#include "ggml-cuda/cumsum.cuh"
#include "ggml.h"
#include <algorithm>
@@ -2701,6 +2703,12 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_CROSS_ENTROPY_LOSS:
ggml_cuda_cross_entropy_loss(ctx, dst);
break;
case GGML_OP_CUMSUM:
ggml_cuda_op_cumsum(ctx, dst);
break;
case GGML_OP_TRI:
ggml_cuda_op_tri(ctx, dst);
break;
case GGML_OP_RWKV_WKV6:
ggml_cuda_op_rwkv_wkv6(ctx, dst);
break;
@@ -4609,6 +4617,8 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
case GGML_OP_OPT_STEP_ADAMW:
case GGML_OP_OPT_STEP_SGD:
case GGML_OP_CUMSUM:
case GGML_OP_TRI:
return true;
case GGML_OP_SOLVE_TRI:
return op->src[0]->ne[0] <= 64 && op->src[1]->ne[0] <= 32;

View File

@@ -173,6 +173,9 @@ namespace ggml_cuda_mma {
#elif defined(AMD_WMMA_AVAILABLE)
#if defined(RDNA4)
static constexpr int ne = I * J / 32;
#elif defined(RDNA3)
static constexpr int ne = (I == 16 && J == 16) ? I * J / 32 : I * J / 16;
#endif // defined(RDNA4)
T x[ne] = {0};
static constexpr __device__ bool supported() {
@@ -182,7 +185,11 @@ namespace ggml_cuda_mma {
static __device__ __forceinline__ int get_i(const int l) {
if constexpr (I == 16 && J == 16) {
#if defined(RDNA4)
return 8 * (threadIdx.x / 16) + l;
#elif defined(RDNA3)
return 2 * l + (threadIdx.x / 16);
#endif // defined(RDNA4)
} else {
NO_DEVICE_CODE;
return -1;
@@ -197,7 +204,6 @@ namespace ggml_cuda_mma {
return -1;
}
}
#endif
#else
static constexpr int ne = I * J / 32;
T x[ne] = {0};
@@ -284,6 +290,7 @@ namespace ggml_cuda_mma {
}
}
#elif defined(AMD_WMMA_AVAILABLE)
static constexpr int ne = I * J / 32;
half2 x[ne] = {{0.0f, 0.0f}};
@@ -544,18 +551,34 @@ namespace ggml_cuda_mma {
} else if constexpr (std::is_same_v<T, int>) {
if constexpr (I == 16 && J == 4) {
int64_t * xi = (int64_t *) t.x;
#if defined(RDNA4)
const int64_t * xs = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 2 * (threadIdx.x / t.I));
xi[0] = xs[0];
}else if constexpr (I == 16 && J == 8) {
#elif defined(RDNA3)
static_assert(tile<I,J,T>::ne >= 4, "fragment too small");
const int64_t * xs = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride);
xi[0] = xs[0];
xi[1] = xs[1];
#endif // defined(RDNA4)
} else if constexpr (I == 16 && J == 8) {
int64_t * xi = (int64_t *) t.x;
#if defined(RDNA4)
const int64_t * xs = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 4 * (threadIdx.x / t.I));
xi[0] = xs[0];
const int64_t * xs1 = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 4 * (threadIdx.x / t.I) + 2);
xi[1] = xs1[0];
}else{
#elif defined(RDNA3)
static_assert(tile<I,J,T>::ne >= 8, "fragment too small");
const int64_t * xs = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride);
// four contiguous 64-bit chunks per lane for the wider RDNA3 fragment
xi[0] = xs[0];
xi[1] = xs[1];
const int64_t * xs1 = xs + 2;
xi[2] = xs1[0];
xi[3] = xs1[1];
#endif // defined(RDNA4)
} else {
NO_DEVICE_CODE;
}
} else {
@@ -858,12 +881,14 @@ namespace ggml_cuda_mma {
: "r"(Axi[2]), "r"(Axi[3]), "r"(Bxi[3]));
#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
#elif defined(AMD_WMMA_AVAILABLE)
#if defined(RDNA4)
using halfx8_t = __attribute__((ext_vector_type(8))) _Float16;
using floatx8_t = __attribute__((ext_vector_type(8))) float;
floatx8_t& acc_frag = reinterpret_cast<floatx8_t&>(D.x[0]);
const halfx8_t& a_frag = reinterpret_cast<const halfx8_t&>(A.x[0]);
const halfx8_t& b_frag = reinterpret_cast<const halfx8_t&>(B.x[0]);
acc_frag = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12(a_frag, b_frag, acc_frag);
#endif // RDNA4
#else
GGML_UNUSED_VARS(D, A, B);
NO_DEVICE_CODE;
@@ -873,12 +898,14 @@ namespace ggml_cuda_mma {
static __device__ __forceinline__ void mma(
tile<16, 16, float> & D, const tile<16, 8, nv_bfloat162> & A, const tile<16, 8, nv_bfloat162> & B) {
#if defined(AMD_WMMA_AVAILABLE)
#if defined(RDNA4)
using bf16x8_t = __attribute__((ext_vector_type(8))) __bf16;
using floatx8_t = __attribute__((ext_vector_type(8))) float;
floatx8_t& acc_frag = reinterpret_cast<floatx8_t&>(D.x[0]);
const bf16x8_t& a_frag = reinterpret_cast<const bf16x8_t&>(A.x[0]);
const bf16x8_t& b_frag = reinterpret_cast<const bf16x8_t&>(B.x[0]);
acc_frag = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12(a_frag, b_frag, acc_frag);
#endif // RDNA4
#else
GGML_UNUSED_VARS(D, A, B);
NO_DEVICE_CODE;
@@ -907,14 +934,14 @@ namespace ggml_cuda_mma {
#endif // defined(CDNA3)
#elif defined(AMD_WMMA_AVAILABLE)
using int32x2_t = __attribute__((__vector_size__(2 * sizeof(int)))) int;
int32x2_t * a_vec = (int32x2_t *) A.x;
int32x2_t * b_vec = (int32x2_t *) B.x;
using int32x8_t = __attribute__((__vector_size__(8 * sizeof(int)))) int;
int32x8_t * acc = (int32x8_t *) D.x;
#if defined(RDNA4)
using int32x2_t = __attribute__((__vector_size__(2 * sizeof(int)))) int;
int32x2_t * a_vec = (int32x2_t *) A.x;
int32x2_t * b_vec = (int32x2_t *) B.x;
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(
true,
@@ -933,7 +960,30 @@ namespace ggml_cuda_mma {
acc[0],
true
);
#endif // defined(RDNA4)
#elif defined(RDNA3)
using int32x4_t = __attribute__((__vector_size__(4 * sizeof(int)))) int;
int32x4_t * a_vec = (int32x4_t *) A.x;
int32x4_t * b_vec = (int32x4_t *) B.x;
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32(
true,
a_vec[0],
true,
b_vec[0],
acc[0],
true
);
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32(
true,
a_vec[1],
true,
b_vec[1],
acc[0],
true
);
#endif // RDNA4
#else
GGML_UNUSED_VARS(D, A, B);
@@ -1020,27 +1070,40 @@ namespace ggml_cuda_mma {
static __device__ __forceinline__ void mma(
tile<16, 16, int> & D, const tile<16, 4, int> & A, const tile<16, 4, int> & B) {
#if defined(AMD_WMMA_AVAILABLE)
using int32x2_t = __attribute__((__vector_size__(2 * sizeof(int)))) int;
int32x2_t * a_vec = (int32x2_t *) A.x;
int32x2_t * b_vec = (int32x2_t *) B.x;
using int32x8_t = __attribute__((__vector_size__(8 * sizeof(int)))) int;
int32x8_t * acc = (int32x8_t *) D.x;
#if defined(RDNA4)
using int32x2_t = __attribute__((__vector_size__(2 * sizeof(int)))) int;
int32x2_t * a_vec = (int32x2_t *) A.x;
int32x2_t * b_vec = (int32x2_t *) B.x;
using int32x8_t = __attribute__((__vector_size__(8 * sizeof(int)))) int;
int32x8_t * acc = (int32x8_t *) D.x;
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(
true,
a_vec[0],
true,
b_vec[0],
acc[0],
false
);
#elif defined(RDNA3)
using int32x4_t = __attribute__((__vector_size__(4 * sizeof(int)))) int;
int32x4_t * a_vec = (int32x4_t *) A.x;
int32x4_t * b_vec = (int32x4_t *) B.x;
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(
true,
a_vec[0],
true,
b_vec[0],
acc[0],
false
);
acc[0] = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32(
true,
a_vec[0],
true,
b_vec[0],
acc[0],
false
);
#endif // RDNA4
#else
GGML_UNUSED(D);
GGML_UNUSED(A);
GGML_UNUSED(B);
NO_DEVICE_CODE;
#endif
#endif // AMD_WMMA_AVAILABLE
}
}

View File

@@ -307,10 +307,9 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
}
if (amd_wmma_available(cc)) {
if (GGML_CUDA_CC_IS_RDNA4(cc)) {
return true;
}
return true;
}
return (!GGML_CUDA_CC_IS_RDNA3(cc) && !GGML_CUDA_CC_IS_CDNA(cc)) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
return (!GGML_CUDA_CC_IS_CDNA(cc)) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
}

View File

@@ -1542,8 +1542,10 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mma(
tile_C Cm;
if (k01 >= MMQ_TILE_NE_K * 3/4) {
tile_A A1;
A1.x[0] = 0x01010101;
A1.x[1] = 0x01010101;
#pragma unroll
for (int l = 0; l < tile_A::ne; ++l) {
A1.x[l] = 0x01010101;
}
mma(Cm, A1, B);
}

ggml/src/ggml-cuda/tri.cu (new file, 136 lines)
View File

@@ -0,0 +1,136 @@
#include "common.cuh"
#include "convert.cuh"
#include "tri.cuh"
#include "ggml.h"
template<typename T, bool prefix_keep, int add_to_split>
static __global__ void tri_kernel(
const T * src, T * dst,
const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
const int64_t nb00, const int64_t nb01, const int64_t nb02, const int64_t nb03,
const int64_t nb0, const int64_t nb1, const int64_t nb2, const int64_t nb3) {
const int64_t i3 = blockIdx.z;
const int64_t i2 = blockIdx.y;
const int64_t i1 = blockIdx.x;
const int64_t split_point = i1 + add_to_split;
GGML_UNUSED_VARS(nb00, nb0);
if (i3 >= ne03 || i2 >= ne02 || i1 >= ne01) {
return;
}
const T * src_row = src + i1*nb01 + i2*nb02 + i3*nb03;
T * dst_row = dst + i1*nb1 + i2*nb2 + i3*nb3;
if constexpr (prefix_keep) {
for (int64_t i0 = threadIdx.x; i0 < split_point; i0 += blockDim.x) {
dst_row[i0] = src_row[i0];
}
for (int64_t i0 = threadIdx.x + split_point; i0 < ne00; i0 += blockDim.x) {
dst_row[i0] = ggml_cuda_cast<T, float>(0.0f);
}
} else {
for (int64_t i0 = threadIdx.x; i0 < split_point; i0 += blockDim.x) {
dst_row[i0] = ggml_cuda_cast<T, float>(0.0f);
}
for (int64_t i0 = threadIdx.x + split_point; i0 < ne00; i0 += blockDim.x) {
dst_row[i0] = src_row[i0];
}
}
}
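A scalar sketch of the per-row behaviour (editorial; tri_row is an illustrative name, and the mapping add_to_split = 1 for LOWER_DIAG/UPPER, prefix_keep = true for LOWER/LOWER_DIAG comes from tri_cuda below):
#include <cstdint>
// Reference for one row i1 of tri: keep either the prefix [0, i1 + add_to_split)
// or the suffix, and zero the rest.
void tri_row(const float * src, float * dst, int64_t ne00, int64_t i1,
             bool prefix_keep, int add_to_split) {
    const int64_t split = i1 + add_to_split;
    for (int64_t i0 = 0; i0 < ne00; ++i0) {
        const bool in_prefix = i0 < split;
        dst[i0] = (in_prefix == prefix_keep) ? src[i0] : 0.0f;
    }
}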
template<typename T>
static void tri_cuda(
const T * src, T * dst,
const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
const int64_t nb00, const int64_t nb01, const int64_t nb02, const int64_t nb03,
const int64_t nb0, const int64_t nb1, const int64_t nb2, const int64_t nb3,
const ggml_tri_type ttype,
cudaStream_t stream) {
dim3 block_dims(CUDA_TRI_BLOCK_SIZE, 1, 1);
dim3 grid_dims(ne01, ne02, ne03);
const size_t type_size = sizeof(T);
const int add_to_split = (ttype == GGML_TRI_TYPE_LOWER_DIAG || ttype == GGML_TRI_TYPE_UPPER) ? 1 : 0;
const bool prefix_keep = (ttype == GGML_TRI_TYPE_LOWER || ttype == GGML_TRI_TYPE_LOWER_DIAG);
if (prefix_keep) {
if (add_to_split == 0) {
tri_kernel<T, true, 0><<<grid_dims, block_dims, 0, stream>>>(
src, dst,
ne00, ne01, ne02, ne03,
nb00 / type_size, nb01 / type_size, nb02 / type_size, nb03 / type_size,
nb0 / type_size, nb1 / type_size, nb2 / type_size, nb3 / type_size
);
} else { // only 0 and 1 supported
tri_kernel<T, true, 1><<<grid_dims, block_dims, 0, stream>>>(
src, dst,
ne00, ne01, ne02, ne03,
nb00 / type_size, nb01 / type_size, nb02 / type_size, nb03 / type_size,
nb0 / type_size, nb1 / type_size, nb2 / type_size, nb3 / type_size
);
}
} else {
if (add_to_split == 0) {
tri_kernel<T, false, 0><<<grid_dims, block_dims, 0, stream>>>(
src, dst,
ne00, ne01, ne02, ne03,
nb00 / type_size, nb01 / type_size, nb02 / type_size, nb03 / type_size,
nb0 / type_size, nb1 / type_size, nb2 / type_size, nb3 / type_size
);
} else {
tri_kernel<T, false, 1><<<grid_dims, block_dims, 0, stream>>>(
src, dst,
ne00, ne01, ne02, ne03,
nb00 / type_size, nb01 / type_size, nb02 / type_size, nb03 / type_size,
nb0 / type_size, nb1 / type_size, nb2 / type_size, nb3 / type_size
);
}
}
}
void ggml_cuda_op_tri(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
cudaStream_t stream = ctx.stream();
const ggml_tri_type ttype = static_cast<ggml_tri_type>(ggml_get_op_params_i32(dst, 0));
GGML_ASSERT(src0->type == dst->type);
switch(src0->type) {
case GGML_TYPE_F32:
{
tri_cuda(
(const float *)src0->data, (float *)dst->data,
src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3],
ttype, stream
);
} break;
case GGML_TYPE_F16:
{
tri_cuda(
(const half *)src0->data, (half *)dst->data,
src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3],
ttype, stream
);
} break;
case GGML_TYPE_BF16:
{
tri_cuda(
(const nv_bfloat16 *)src0->data, (nv_bfloat16 *)dst->data,
src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3],
ttype, stream
);
} break;
default:
GGML_ABORT("fatal error");
}
}

View File

@@ -0,0 +1,5 @@
#include "common.cuh"
#define CUDA_TRI_BLOCK_SIZE 256
void ggml_cuda_op_tri(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

View File

@@ -24,9 +24,6 @@ struct ggml_metal_command_buffer {
};
struct ggml_metal {
id<MTLDevice> device;
id<MTLCommandQueue> queue; // currently a pointer to the device queue, but might become separate queue [TAG_QUEUE_PER_BACKEND]
ggml_metal_device_t dev;
ggml_metal_library_t lib;
@@ -91,15 +88,15 @@ ggml_metal_t ggml_metal_init(ggml_metal_device_t dev) {
// init context
ggml_metal_t res = calloc(1, sizeof(struct ggml_metal));
res->device = ggml_metal_device_get_obj(dev);
id<MTLDevice> device = ggml_metal_device_get_obj(dev);
GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[res->device name] UTF8String]);
GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]);
// TODO: would it be better to have one queue for the backend and one queue for the device?
// the graph encoders and async ops would use the backend queue while the sync ops would use the device queue?
//res->queue = [device newCommandQueue]; [TAG_QUEUE_PER_BACKEND]
res->queue = ggml_metal_device_get_queue(dev);
if (res->queue == nil) {
id<MTLCommandQueue> queue = ggml_metal_device_get_queue(dev);
if (queue == nil) {
GGML_LOG_ERROR("%s: error: failed to create command queue\n", __func__);
return NULL;
}
@@ -274,7 +271,8 @@ static struct ggml_metal_buffer_id ggml_metal_get_buffer_id(const struct ggml_te
void ggml_metal_set_tensor_async(ggml_metal_t ctx, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
@autoreleasepool {
// wrap the source data into a Metal buffer
id<MTLBuffer> buf_src = [ctx->device newBufferWithBytes:data
id<MTLDevice> device = ggml_metal_device_get_obj(ctx->dev);
id<MTLBuffer> buf_src = [device newBufferWithBytes:data
length:size
options:MTLResourceStorageModeShared];
@@ -289,7 +287,8 @@ void ggml_metal_set_tensor_async(ggml_metal_t ctx, struct ggml_tensor * tensor,
// queue the copy operation into the queue of the Metal context
// this will be queued at the end, after any currently ongoing GPU operations
id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBuffer];
id<MTLCommandQueue> queue = ggml_metal_device_get_queue(ctx->dev);
id<MTLCommandBuffer> cmd_buf = [queue commandBuffer];
id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
[encoder copyFromBuffer:buf_src
@@ -315,7 +314,8 @@ void ggml_metal_set_tensor_async(ggml_metal_t ctx, struct ggml_tensor * tensor,
void ggml_metal_get_tensor_async(ggml_metal_t ctx, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
@autoreleasepool {
id<MTLBuffer> buf_dst = [ctx->device newBufferWithBytesNoCopy:data
id<MTLDevice> device = ggml_metal_device_get_obj(ctx->dev);
id<MTLBuffer> buf_dst = [device newBufferWithBytesNoCopy:data
length:size
options:MTLResourceStorageModeShared
deallocator:nil];
@@ -331,7 +331,8 @@ void ggml_metal_get_tensor_async(ggml_metal_t ctx, const struct ggml_tensor * te
// queue the copy operation into the queue of the Metal context
// this will be queued at the end, after any currently ongoing GPU operations
id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBuffer];
id<MTLCommandQueue> queue = ggml_metal_device_get_queue(ctx->dev);
id<MTLCommandBuffer> cmd_buf = [queue commandBuffer];
id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
[encoder copyFromBuffer:bid_src.metal
@@ -362,6 +363,9 @@ enum ggml_status ggml_metal_graph_compute(ggml_metal_t ctx, struct ggml_cgraph *
// number of threads in addition to the main thread
const int n_cb = ctx->n_cb;
// keep the memory wired
ggml_metal_device_rsets_keep_alive(ctx->dev);
// submit the ggml compute graph to the GPU by creating command buffers and encoding the ops in them
// the first n_nodes_0 are encoded and submitted for processing directly by the calling thread
// while these nodes are processing, we start n_cb threads to enqueue the rest of the nodes
@@ -389,7 +393,8 @@ enum ggml_status ggml_metal_graph_compute(ggml_metal_t ctx, struct ggml_cgraph *
if (!ctx->capture_started) {
// create capture scope
ctx->capture_scope = [[MTLCaptureManager sharedCaptureManager] newCaptureScopeWithDevice:ctx->device];
id<MTLDevice> device = ggml_metal_device_get_obj(ctx->dev);
ctx->capture_scope = [[MTLCaptureManager sharedCaptureManager] newCaptureScopeWithDevice:device];
MTLCaptureDescriptor * descriptor = [MTLCaptureDescriptor new];
descriptor.captureObject = ctx->capture_scope;
@@ -406,10 +411,13 @@ enum ggml_status ggml_metal_graph_compute(ggml_metal_t ctx, struct ggml_cgraph *
}
}
// short-hand
id<MTLCommandQueue> queue = ggml_metal_device_get_queue(ctx->dev);
// the main thread commits the first few commands immediately
// cmd_buf[n_cb]
{
id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBufferWithUnretainedReferences];
id<MTLCommandBuffer> cmd_buf = [queue commandBufferWithUnretainedReferences];
[cmd_buf retain];
if (ctx->cmd_bufs[n_cb].obj) {
@@ -428,7 +436,7 @@ enum ggml_status ggml_metal_graph_compute(ggml_metal_t ctx, struct ggml_cgraph *
// prepare the rest of the command buffers asynchronously (optional)
// cmd_buf[0.. n_cb)
for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) {
id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBufferWithUnretainedReferences];
id<MTLCommandBuffer> cmd_buf = [queue commandBufferWithUnretainedReferences];
[cmd_buf retain];
if (ctx->cmd_bufs[cb_idx].obj) {
@@ -589,9 +597,11 @@ void ggml_metal_set_abort_callback(ggml_metal_t ctx, ggml_abort_callback abort_c
}
bool ggml_metal_supports_family(ggml_metal_t ctx, int family) {
GGML_ASSERT(ctx->device != nil);
GGML_ASSERT(ctx->dev != nil);
return [ctx->device supportsFamily:(MTLGPUFamilyApple1 + family - 1)];
id<MTLDevice> device = ggml_metal_device_get_obj(ctx->dev);
return [device supportsFamily:(MTLGPUFamilyApple1 + family - 1)];
}
void ggml_metal_capture_next_compute(ggml_metal_t ctx) {

File diff suppressed because it is too large

View File

@@ -35,20 +35,6 @@ typedef struct ggml_metal_pipeline * ggml_metal_pipeline_t;
ggml_metal_pipeline_t ggml_metal_pipeline_init(void);
void ggml_metal_pipeline_free(ggml_metal_pipeline_t pipeline);
void ggml_metal_pipeline_set_nsg(ggml_metal_pipeline_t pipeline, int nsg);
int ggml_metal_pipeline_get_nsg(ggml_metal_pipeline_t pipeline);
void ggml_metal_pipeline_set_nr0(ggml_metal_pipeline_t pipeline, int nr0);
int ggml_metal_pipeline_get_nr0(ggml_metal_pipeline_t pipeline);
void ggml_metal_pipeline_set_nr1(ggml_metal_pipeline_t pipeline, int nr1);
int ggml_metal_pipeline_get_nr1(ggml_metal_pipeline_t pipeline);
void ggml_metal_pipeline_set_smem(ggml_metal_pipeline_t pipeline, size_t smem);
size_t ggml_metal_pipeline_get_smem(ggml_metal_pipeline_t pipeline);
int ggml_metal_pipeline_max_theads_per_threadgroup(ggml_metal_pipeline_t pipeline);
// a collection of pipelines
typedef struct ggml_metal_pipelines * ggml_metal_pipelines_t;
@@ -58,6 +44,19 @@ void ggml_metal_pipelines_free(ggml_metal_pipelines_t ppls);
void ggml_metal_pipelines_add(ggml_metal_pipelines_t ppls, const char * name, ggml_metal_pipeline_t pipeline);
ggml_metal_pipeline_t ggml_metal_pipelines_get(ggml_metal_pipelines_t ppls, const char * name);
struct ggml_metal_pipeline_with_params {
ggml_metal_pipeline_t pipeline;
int nsg;
int nr0;
int nr1;
size_t smem;
};
int ggml_metal_pipeline_max_theads_per_threadgroup(struct ggml_metal_pipeline_with_params pipeline);
//
// MTLCommandBuffer wrapper
//
@@ -76,7 +75,7 @@ void ggml_metal_encoder_free(ggml_metal_encoder_t encoder);
void ggml_metal_encoder_debug_group_push(ggml_metal_encoder_t encoder, const char * name);
void ggml_metal_encoder_debug_group_pop (ggml_metal_encoder_t encoder);
void ggml_metal_encoder_set_pipeline(ggml_metal_encoder_t encoder, ggml_metal_pipeline_t pipeline);
void ggml_metal_encoder_set_pipeline(ggml_metal_encoder_t encoder, struct ggml_metal_pipeline_with_params pipeline);
void ggml_metal_encoder_set_bytes (ggml_metal_encoder_t encoder, void * data, size_t size, int idx);
void ggml_metal_encoder_set_buffer(ggml_metal_encoder_t encoder, struct ggml_metal_buffer_id buffer, int idx);
@@ -100,66 +99,67 @@ ggml_metal_library_t ggml_metal_library_init_from_source(ggml_metal_device_t dev
void ggml_metal_library_free(ggml_metal_library_t lib);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline (ggml_metal_library_t lib, const char * name);
ggml_metal_pipeline_t ggml_metal_library_compile_pipeline(ggml_metal_library_t lib, const char * base, const char * name, ggml_metal_cv_t cv);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline (ggml_metal_library_t lib, const char * name);
struct ggml_metal_pipeline_with_params ggml_metal_library_compile_pipeline(ggml_metal_library_t lib, const char * base, const char * name, ggml_metal_cv_t cv);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_base (ggml_metal_library_t lib, enum ggml_op op);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_cpy (ggml_metal_library_t lib, enum ggml_type tsrc, enum ggml_type tdst);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_pool_2d (ggml_metal_library_t lib, const struct ggml_tensor * op, enum ggml_op_pool op_pool);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_get_rows (ggml_metal_library_t lib, enum ggml_type tsrc);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_set_rows (ggml_metal_library_t lib, enum ggml_type tidx, enum ggml_type tdst);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_repeat (ggml_metal_library_t lib, enum ggml_type tsrc);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_unary (ggml_metal_library_t lib, const struct ggml_tensor * op);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_glu (ggml_metal_library_t lib, const struct ggml_tensor * op);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_sum (ggml_metal_library_t lib, const struct ggml_tensor * op);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_sum_rows (ggml_metal_library_t lib, const struct ggml_tensor * op);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_cumsum_blk (ggml_metal_library_t lib, const struct ggml_tensor * op);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_cumsum_add (ggml_metal_library_t lib, const struct ggml_tensor * op);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_soft_max (ggml_metal_library_t lib, const struct ggml_tensor * op);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_ssm_conv (ggml_metal_library_t lib, const struct ggml_tensor * op);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_ssm_scan (ggml_metal_library_t lib, const struct ggml_tensor * op);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_rwkv (ggml_metal_library_t lib, const struct ggml_tensor * op);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_mul_mv_ext (ggml_metal_library_t lib, enum ggml_type tsrc0, enum ggml_type tsrc1, int nsg, int nxpsg, int r1ptg);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_mul_mm (ggml_metal_library_t lib, const struct ggml_tensor * op);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_mul_mv (ggml_metal_library_t lib, const struct ggml_tensor * op);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_mul_mm_id_map0 (ggml_metal_library_t lib, int ne02, int ne20);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_mul_mm_id (ggml_metal_library_t lib, const struct ggml_tensor * op);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_mul_mv_id (ggml_metal_library_t lib, const struct ggml_tensor * op);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_argmax (ggml_metal_library_t lib, const struct ggml_tensor * op);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_argsort (ggml_metal_library_t lib, const struct ggml_tensor * op);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_argsort_merge (ggml_metal_library_t lib, const struct ggml_tensor * op);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_top_k (ggml_metal_library_t lib, const struct ggml_tensor * op);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_top_k_merge (ggml_metal_library_t lib, const struct ggml_tensor * op);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_bin (ggml_metal_library_t lib, enum ggml_op op, int32_t n_fuse, bool row);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_l2_norm (ggml_metal_library_t lib, const struct ggml_tensor * op);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_group_norm (ggml_metal_library_t lib, const struct ggml_tensor * op);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_norm (ggml_metal_library_t lib, const struct ggml_tensor * op, int32_t n_fuse);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_rope (ggml_metal_library_t lib, const struct ggml_tensor * op);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_im2col (ggml_metal_library_t lib, const struct ggml_tensor * op);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_conv_transpose_1d (ggml_metal_library_t lib, const struct ggml_tensor * op);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_conv_transpose_2d (ggml_metal_library_t lib, const struct ggml_tensor * op);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_conv_2d (ggml_metal_library_t lib, const struct ggml_tensor * op);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_upscale (ggml_metal_library_t lib, const struct ggml_tensor * op);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_pad (ggml_metal_library_t lib, const struct ggml_tensor * op);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_pad_reflect_1d (ggml_metal_library_t lib, const struct ggml_tensor * op);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_arange (ggml_metal_library_t lib, const struct ggml_tensor * op);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_timestep_embedding(ggml_metal_library_t lib, const struct ggml_tensor * op);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_opt_step_adamw (ggml_metal_library_t lib, const struct ggml_tensor * op);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_opt_step_sgd (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_base (ggml_metal_library_t lib, enum ggml_op op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_cpy (ggml_metal_library_t lib, enum ggml_type tsrc, enum ggml_type tdst);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_pool_2d (ggml_metal_library_t lib, const struct ggml_tensor * op, enum ggml_op_pool op_pool);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_get_rows (ggml_metal_library_t lib, enum ggml_type tsrc);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_set_rows (ggml_metal_library_t lib, enum ggml_type tidx, enum ggml_type tdst);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_repeat (ggml_metal_library_t lib, enum ggml_type tsrc);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_unary (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_glu (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_sum (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_sum_rows (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_cumsum_blk (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_cumsum_add (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_tri (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_soft_max (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_ssm_conv (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_ssm_scan (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_rwkv (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mv_ext (ggml_metal_library_t lib, enum ggml_type tsrc0, enum ggml_type tsrc1, int nsg, int nxpsg, int r1ptg);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mm (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mv (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mm_id_map0 (ggml_metal_library_t lib, int ne02, int ne20);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mm_id (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_mul_mv_id (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_argmax (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_argsort (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_argsort_merge (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_top_k (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_top_k_merge (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_bin (ggml_metal_library_t lib, enum ggml_op op, int32_t n_fuse, bool row);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_l2_norm (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_group_norm (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_norm (ggml_metal_library_t lib, const struct ggml_tensor * op, int32_t n_fuse);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_rope (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_im2col (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_conv_transpose_1d (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_conv_transpose_2d (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_conv_2d (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_upscale (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_pad (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_pad_reflect_1d (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_arange (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_timestep_embedding(ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_opt_step_adamw (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_opt_step_sgd (ggml_metal_library_t lib, const struct ggml_tensor * op);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_flash_attn_ext_pad(
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_flash_attn_ext_pad(
ggml_metal_library_t lib,
const struct ggml_tensor * op,
bool has_mask,
int32_t ncpsg);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_flash_attn_ext_blk(
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_flash_attn_ext_blk(
ggml_metal_library_t lib,
const struct ggml_tensor * op,
int32_t nqptg,
int32_t ncpsg);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_flash_attn_ext(
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_flash_attn_ext(
ggml_metal_library_t lib,
const struct ggml_tensor * op,
bool has_mask,
@@ -169,7 +169,7 @@ ggml_metal_pipeline_t ggml_metal_library_get_pipeline_flash_attn_ext(
bool has_kvpad,
int32_t nsg);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_flash_attn_ext_vec(
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_flash_attn_ext_vec(
ggml_metal_library_t lib,
const struct ggml_tensor * op,
bool has_mask,
@@ -180,12 +180,22 @@ ggml_metal_pipeline_t ggml_metal_library_get_pipeline_flash_attn_ext_vec(
int32_t nsg,
int32_t nwg);
ggml_metal_pipeline_t ggml_metal_library_get_pipeline_flash_attn_ext_vec_reduce(
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_flash_attn_ext_vec_reduce(
ggml_metal_library_t lib,
const struct ggml_tensor * op,
int32_t dv,
int32_t nwg);
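For context: the declarations above now return a small by-value bundle instead of a bare ggml_metal_pipeline_t. The struct definition itself is not part of this excerpt; the following is a minimal sketch of the assumed layout, with field names and order inferred from the designated initializers further down in this diff:

// assumed layout of the by-value pipeline bundle (sketch, not the actual definition)
struct ggml_metal_pipeline_with_params {
    ggml_metal_pipeline_t pipeline; // compiled pipeline state, nil on failure
    int                   nr0;      // suggested dispatch size
    int                   nr1;      // suggested dispatch size
    int                   nsg;      // suggested number of simdgroups
    size_t                smem;     // threadgroup (shared) memory size in bytes
};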
// MTLResidencySet wrapper
typedef void * ggml_metal_rset_t;
// a collection of residency sets (non-owning)
typedef struct ggml_metal_rsets * ggml_metal_rsets_t;
ggml_metal_rsets_t ggml_metal_rsets_init(void);
void ggml_metal_rsets_free(ggml_metal_rsets_t rsets);
//
// device
//
@@ -219,6 +229,11 @@ void * ggml_metal_device_get_queue(ggml_metal_device_t dev); // id<MTLCommandQue
ggml_metal_library_t ggml_metal_device_get_library(ggml_metal_device_t dev);
void ggml_metal_device_rsets_add(ggml_metal_device_t dev, ggml_metal_rset_t rset);
void ggml_metal_device_rsets_rm (ggml_metal_device_t dev, ggml_metal_rset_t rset);
void ggml_metal_device_rsets_keep_alive(ggml_metal_device_t dev);
void ggml_metal_device_get_memory(ggml_metal_device_t dev, size_t * free, size_t * total);
bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_tensor * op);
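A rough sketch of how the residency-set hooks declared above fit together; the buffer call sites appear later in this diff, while the graph-compute call site is assumed and not shown in this excerpt (dev and size are placeholder variables):

// illustrative lifecycle of a residency set within the device collection
ggml_metal_buffer_t buf = ggml_metal_buffer_init(dev, size, /*shared =*/ true);
//   -> internally calls ggml_metal_device_rsets_add(dev, buf->rset)

// presumably invoked from the graph-compute path to restart the keep-alive countdown
ggml_metal_device_rsets_keep_alive(dev);

ggml_metal_buffer_free(buf);
//   -> internally calls ggml_metal_device_rsets_rm(buf->dev, buf->rset)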

View File

@@ -1,7 +1,6 @@
#import "ggml-metal-device.h"
#import "ggml-impl.h"
#import "ggml-threading.h"
#include <Foundation/Foundation.h>
@@ -75,14 +74,6 @@ void ggml_metal_cv_set_bool(ggml_metal_cv_t cv, bool value, int32_t idx) {
struct ggml_metal_pipeline {
id<MTLComputePipelineState> obj;
// suggested dispatch sizes
int nsg;
int nr0;
int nr1;
size_t smem;
};
ggml_metal_pipeline_t ggml_metal_pipeline_init(void) {
@@ -90,10 +81,6 @@ ggml_metal_pipeline_t ggml_metal_pipeline_init(void) {
*res = (struct ggml_metal_pipeline) {
/*.obj =*/ nil,
/*.nsg =*/ 0,
/*.nr0 =*/ 0,
/*.nr1 =*/ 0,
/*.smem =*/ 0,
};
return res;
@@ -105,40 +92,8 @@ void ggml_metal_pipeline_free(ggml_metal_pipeline_t pipeline) {
free(pipeline);
}
void ggml_metal_pipeline_set_nsg(ggml_metal_pipeline_t pipeline, int nsg) {
pipeline->nsg = nsg;
}
int ggml_metal_pipeline_get_nsg(ggml_metal_pipeline_t pipeline) {
return pipeline->nsg;
}
void ggml_metal_pipeline_set_nr0(ggml_metal_pipeline_t pipeline, int nr0) {
pipeline->nr0 = nr0;
}
int ggml_metal_pipeline_get_nr0(ggml_metal_pipeline_t pipeline) {
return pipeline->nr0;
}
void ggml_metal_pipeline_set_nr1(ggml_metal_pipeline_t pipeline, int nr1) {
pipeline->nr1 = nr1;
}
int ggml_metal_pipeline_get_nr1(ggml_metal_pipeline_t pipeline) {
return pipeline->nr1;
}
void ggml_metal_pipeline_set_smem(ggml_metal_pipeline_t pipeline, size_t smem) {
pipeline->smem = smem;
}
size_t ggml_metal_pipeline_get_smem(ggml_metal_pipeline_t pipeline) {
return pipeline->smem;
}
int ggml_metal_pipeline_max_theads_per_threadgroup(ggml_metal_pipeline_t pipeline) {
return pipeline->obj.maxTotalThreadsPerThreadgroup;
int ggml_metal_pipeline_max_theads_per_threadgroup(struct ggml_metal_pipeline_with_params pipeline) {
return pipeline.pipeline->obj.maxTotalThreadsPerThreadgroup;
}
struct ggml_metal_library {
@@ -389,28 +344,42 @@ void ggml_metal_library_free(ggml_metal_library_t lib) {
free(lib);
}
ggml_metal_pipeline_t ggml_metal_library_get_pipeline(ggml_metal_library_t lib, const char * name) {
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline(ggml_metal_library_t lib, const char * name) {
[lib->lock lock];
ggml_metal_pipeline_t res = ggml_metal_pipelines_get(lib->pipelines, name);
struct ggml_metal_pipeline_with_params res = {
/*.pipeline =*/ nil,
/*.nr0 =*/ 0,
/*.nr1 =*/ 0,
/*.nsg =*/ 0,
/*.smem =*/ 0,
};
res.pipeline = ggml_metal_pipelines_get(lib->pipelines, name);
[lib->lock unlock];
return res;
}
ggml_metal_pipeline_t ggml_metal_library_compile_pipeline(ggml_metal_library_t lib, const char * base, const char * name, ggml_metal_cv_t cv) {
struct ggml_metal_pipeline_with_params ggml_metal_library_compile_pipeline(ggml_metal_library_t lib, const char * base, const char * name, ggml_metal_cv_t cv) {
struct ggml_metal_pipeline_with_params res = {
/*.pipeline =*/ nil,
/*.nr0 =*/ 0,
/*.nr1 =*/ 0,
/*.nsg =*/ 0,
/*.smem =*/ 0,
};
[lib->lock lock];
ggml_metal_pipeline_t res = ggml_metal_pipelines_get(lib->pipelines, name);
if (res) {
res.pipeline = ggml_metal_pipelines_get(lib->pipelines, name);
if (res.pipeline) {
[lib->lock unlock];
return res;
}
res = ggml_metal_pipeline_init();
@autoreleasepool {
NSError * error = nil;
@@ -432,26 +401,43 @@ ggml_metal_pipeline_t ggml_metal_library_compile_pipeline(ggml_metal_library_t l
GGML_LOG_ERROR("%s: %s\n", __func__, [[error description] UTF8String]);
}
return nil;
return res;
}
res->obj = [lib->device newComputePipelineStateWithFunction:mtl_function error:&error];
id<MTLComputePipelineState> obj = [lib->device newComputePipelineStateWithFunction:mtl_function error:&error];
[mtl_function release];
GGML_LOG_DEBUG("%s: loaded %-40s %16p | th_max = %4d | th_width = %4d\n", __func__, name, (void *) res->obj,
(int) res->obj.maxTotalThreadsPerThreadgroup,
(int) res->obj.threadExecutionWidth);
if (!obj) {
[lib->lock unlock];
GGML_LOG_ERROR("%s: failed to create pipeline state: base = '%s', name = '%s'\n", __func__, base, name);
if (error) {
GGML_LOG_ERROR("%s: %s\n", __func__, [[error description] UTF8String]);
}
return res;
}
GGML_LOG_DEBUG("%s: loaded %-40s %16p | th_max = %4d | th_width = %4d\n", __func__, name,
(void *) obj,
(int) obj.maxTotalThreadsPerThreadgroup,
(int) obj.threadExecutionWidth);
if (obj.maxTotalThreadsPerThreadgroup == 0 || obj.threadExecutionWidth == 0) {
[obj release];
if (res->obj.maxTotalThreadsPerThreadgroup == 0 || res->obj.threadExecutionWidth == 0) {
[lib->lock unlock];
GGML_LOG_ERROR("%s: incompatible pipeline %s\n", __func__, name);
return nil;
return res;
}
ggml_metal_pipelines_add(lib->pipelines, name, res);
res.pipeline = ggml_metal_pipeline_init();
res.pipeline->obj = obj;
ggml_metal_pipelines_add(lib->pipelines, name, res.pipeline);
}
[lib->lock unlock];
@@ -496,8 +482,8 @@ void ggml_metal_encoder_debug_group_pop (ggml_metal_encoder_t encoder) {
[encoder->obj popDebugGroup];
}
void ggml_metal_encoder_set_pipeline(ggml_metal_encoder_t encoder, ggml_metal_pipeline_t pipeline) {
[encoder->obj setComputePipelineState:pipeline->obj];
void ggml_metal_encoder_set_pipeline(ggml_metal_encoder_t encoder, struct ggml_metal_pipeline_with_params pipeline) {
[encoder->obj setComputePipelineState:pipeline.pipeline->obj];
}
void ggml_metal_encoder_set_bytes(ggml_metal_encoder_t encoder, void * data, size_t size, int idx) {
@@ -532,11 +518,106 @@ struct ggml_metal_device {
// ref: https://github.com/ggml-org/llama.cpp/pull/15906
id<MTLCommandQueue> mtl_queue;
ggml_metal_rsets_t rsets;
ggml_metal_library_t library;
struct ggml_metal_device_props props;
};
//
// MTLResidenceSet wrapper
//
struct ggml_metal_rsets {
NSLock * lock;
NSMutableArray * data;
// number of seconds to keep the residency sets wired after the last graph computation
// (prevents the OS from reclaiming the memory in the meantime)
int keep_alive_s;
// background heartbeat thread to keep the residency sets alive
atomic_bool d_stop;
atomic_int d_loop;
dispatch_group_t d_group;
};
ggml_metal_rsets_t ggml_metal_rsets_init(void) {
ggml_metal_rsets_t res = calloc(1, sizeof(struct ggml_metal_rsets));
res->lock = [[NSLock alloc] init];
res->data = [[NSMutableArray alloc] init];
// by default keep the memory wired for 3 minutes
res->keep_alive_s = 3*60;
const char * GGML_METAL_RESIDENCY_KEEP_ALIVE_S = getenv("GGML_METAL_RESIDENCY_KEEP_ALIVE_S");
if (GGML_METAL_RESIDENCY_KEEP_ALIVE_S) {
res->keep_alive_s = atoi(GGML_METAL_RESIDENCY_KEEP_ALIVE_S);
}
if (res->keep_alive_s <= 0) {
res->keep_alive_s = 3*60;
}
GGML_LOG_INFO("%s: creating a residency set collection (keep_alive = %d s)\n", __func__, res->keep_alive_s);
atomic_store_explicit(&res->d_stop, false, memory_order_relaxed);
atomic_store_explicit(&res->d_loop, 2*res->keep_alive_s, memory_order_relaxed);
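// d_loop counts heartbeat iterations: the loop below sleeps 0.5 s per pass, so
// 2*keep_alive_s passes correspond to keep_alive_s seconds of residency requests
// after the most recent refresh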
res->d_group = dispatch_group_create();
// start a background thread that periodically requests residency for all the currently active sets in the collection
// the requests stop after a certain amount of time (keep_alive_s) of inactivity
dispatch_queue_t d_queue = dispatch_get_global_queue(QOS_CLASS_DEFAULT, 0);
dispatch_group_async(res->d_group, d_queue, ^{
#if defined(GGML_METAL_HAS_RESIDENCY_SETS)
if (@available(macOS 15.0, iOS 18.0, tvOS 18.0, visionOS 2.0, *)) {
while (!atomic_load_explicit(&res->d_stop, memory_order_relaxed)) {
if (atomic_load_explicit(&res->d_loop, memory_order_relaxed) > 0) {
[res->lock lock];
for (int i = 0; i < (int) res->data.count; ++i) {
[res->data[i] requestResidency];
}
atomic_fetch_sub_explicit(&res->d_loop, 1, memory_order_relaxed);
[res->lock unlock];
}
// half a second
usleep(500 * 1000);
}
}
#endif
});
return res;
}
void ggml_metal_rsets_free(ggml_metal_rsets_t rsets) {
if (rsets == NULL) {
return;
}
// note: if you hit this assert, most likely you haven't deallocated all Metal resources before exiting
GGML_ASSERT([rsets->data count] == 0);
atomic_store_explicit(&rsets->d_stop, true, memory_order_relaxed);
dispatch_group_wait(rsets->d_group, DISPATCH_TIME_FOREVER);
dispatch_release(rsets->d_group);
[rsets->data release];
[rsets->lock release];
free(rsets);
}
ggml_metal_device_t ggml_metal_device_init(void) {
ggml_metal_device_t dev = calloc(1, sizeof(struct ggml_metal_device));
@@ -622,8 +703,8 @@ ggml_metal_device_t ggml_metal_device_init(void) {
GGML_LOG_WARN("%s: - the tensor API is not supported in this environment - disabling\n", __func__);
dev->props.has_tensor = false;
} else {
ggml_metal_pipeline_t ppl = ggml_metal_library_compile_pipeline(lib, "dummy_kernel", "dummy_kernel", nil);
if (!ppl) {
struct ggml_metal_pipeline_with_params ppl = ggml_metal_library_compile_pipeline(lib, "dummy_kernel", "dummy_kernel", nil);
if (!ppl.pipeline) {
GGML_LOG_WARN("%s: - the tensor API is not supported in this environment - disabling\n", __func__);
dev->props.has_tensor = false;
}
@@ -672,8 +753,8 @@ ggml_metal_device_t ggml_metal_device_init(void) {
GGML_LOG_WARN("%s: - the tensor API does not support bfloat - disabling bfloat support\n", __func__);
dev->props.has_bfloat = false;
} else {
ggml_metal_pipeline_t ppl = ggml_metal_library_compile_pipeline(lib, "dummy_kernel", "dummy_kernel", nil);
if (!ppl) {
struct ggml_metal_pipeline_with_params ppl = ggml_metal_library_compile_pipeline(lib, "dummy_kernel", "dummy_kernel", nil);
if (!ppl.pipeline) {
GGML_LOG_WARN("%s: - the tensor API does not support bfloat - disabling bfloat support\n", __func__);
dev->props.has_bfloat = false;
}
@@ -705,7 +786,11 @@ ggml_metal_device_t ggml_metal_device_init(void) {
GGML_LOG_ERROR("%s: error: failed to create library\n", __func__);
}
// --------------------------------------------------
if (dev->props.use_residency_sets) {
dev->rsets = ggml_metal_rsets_init();
} else {
dev->rsets = nil;
}
// print MTL GPU family:
GGML_LOG_INFO("%s: GPU name: %s\n", __func__, dev->props.name);
@@ -758,6 +843,8 @@ ggml_metal_device_t ggml_metal_device_init(void) {
void ggml_metal_device_free(ggml_metal_device_t dev) {
assert(dev != NULL);
ggml_metal_rsets_free(dev->rsets);
ggml_metal_library_free(dev->library);
dev->library = NULL;
@@ -786,6 +873,42 @@ ggml_metal_library_t ggml_metal_device_get_library(ggml_metal_device_t dev) {
return dev->library;
}
void ggml_metal_device_rsets_add(ggml_metal_device_t dev, ggml_metal_rset_t rset) {
if (rset == nil) {
return;
}
GGML_ASSERT(dev->rsets);
[dev->rsets->lock lock];
[dev->rsets->data addObject:rset];
[dev->rsets->lock unlock];
}
void ggml_metal_device_rsets_rm(ggml_metal_device_t dev, ggml_metal_rset_t rset) {
if (rset == nil) {
return;
}
GGML_ASSERT(dev->rsets);
[dev->rsets->lock lock];
[dev->rsets->data removeObject:rset];
[dev->rsets->lock unlock];
}
void ggml_metal_device_rsets_keep_alive(ggml_metal_device_t dev) {
if (dev->rsets == NULL) {
return;
}
atomic_store_explicit(&dev->rsets->d_loop, 2*dev->rsets->keep_alive_s, memory_order_relaxed);
}
void ggml_metal_device_get_memory(ggml_metal_device_t dev, size_t * free, size_t * total) {
if (@available(macOS 10.12, iOS 16.0, *)) {
*total = dev->mtl_device.recommendedMaxWorkingSetSize;
@@ -831,6 +954,8 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
case GGML_UNARY_OP_HARDSWISH:
case GGML_UNARY_OP_HARDSIGMOID:
case GGML_UNARY_OP_EXP:
case GGML_UNARY_OP_SOFTPLUS:
case GGML_UNARY_OP_EXPM1:
return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
default:
return false;
@@ -863,6 +988,7 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
case GGML_OP_ACC:
case GGML_OP_REPEAT:
case GGML_OP_SCALE:
case GGML_OP_FILL:
case GGML_OP_CONV_TRANSPOSE_1D:
return true;
case GGML_OP_CONV_TRANSPOSE_2D:
@@ -880,6 +1006,8 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
case GGML_OP_SUM:
return has_simdgroup_reduction && ggml_is_contiguous(op->src[0]);
case GGML_OP_TRI:
return ggml_is_contiguous_rows(op->src[0]);
case GGML_OP_SUM_ROWS:
case GGML_OP_CUMSUM:
case GGML_OP_MEAN:
@@ -1074,9 +1202,8 @@ struct ggml_metal_buffer {
// note: cannot explicitly use "id<MTLResidencySet>" here because it is not available on certain OSes
id rset;
// pointers to global device objects
id<MTLDevice> device;
id<MTLCommandQueue> queue;
// pointer to the global device
ggml_metal_device_t dev;
};
static void ggml_metal_log_allocated_size(id<MTLDevice> device, size_t size_aligned) {
@@ -1119,7 +1246,7 @@ static bool ggml_metal_buffer_rset_init(ggml_metal_buffer_t buf) {
desc.initialCapacity = buf->n_buffers;
NSError * error;
buf->rset = [buf->device newResidencySetWithDescriptor:desc error:&error];
buf->rset = [buf->dev->mtl_device newResidencySetWithDescriptor:desc error:&error];
if (error) {
GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
[desc release];
@@ -1180,6 +1307,8 @@ static void * ggml_metal_host_malloc(size_t n) {
ggml_metal_buffer_t ggml_metal_buffer_init(ggml_metal_device_t dev, size_t size, bool shared) {
ggml_metal_buffer_t res = calloc(1, sizeof(struct ggml_metal_buffer));
res->dev = dev;
const size_t size_page = sysconf(_SC_PAGESIZE);
size_t size_aligned = size;
@@ -1204,9 +1333,6 @@ ggml_metal_buffer_t ggml_metal_buffer_init(ggml_metal_device_t dev, size_t size,
res->owned = true;
res->device = ggml_metal_device_get_obj(dev);
res->queue = ggml_metal_device_get_queue(dev);
res->n_buffers = 1;
if (res->all_data != NULL) {
@@ -1215,12 +1341,12 @@ ggml_metal_buffer_t ggml_metal_buffer_init(ggml_metal_device_t dev, size_t size,
if (size_aligned > 0) {
if (props_dev->use_shared_buffers && shared) {
res->buffers[0].metal = [res->device newBufferWithBytesNoCopy:res->all_data
res->buffers[0].metal = [res->dev->mtl_device newBufferWithBytesNoCopy:res->all_data
length:size_aligned
options:MTLResourceStorageModeShared
deallocator:nil];
} else {
res->buffers[0].metal = [res->device newBufferWithLength:size_aligned options:MTLResourceStorageModePrivate];
res->buffers[0].metal = [res->dev->mtl_device newBufferWithLength:size_aligned options:MTLResourceStorageModePrivate];
}
}
@@ -1241,6 +1367,8 @@ ggml_metal_buffer_t ggml_metal_buffer_init(ggml_metal_device_t dev, size_t size,
return NULL;
}
ggml_metal_device_rsets_add(dev, res->rset);
//ggml_metal_log_allocated_size(device, size_aligned);
return res;
@@ -1249,6 +1377,8 @@ ggml_metal_buffer_t ggml_metal_buffer_init(ggml_metal_device_t dev, size_t size,
ggml_metal_buffer_t ggml_metal_buffer_map(ggml_metal_device_t dev, void * ptr, size_t size, size_t max_tensor_size) {
ggml_metal_buffer_t res = calloc(1, sizeof(struct ggml_metal_buffer));
res->dev = dev;
res->all_data = ptr;
res->all_size = size;
@@ -1271,9 +1401,6 @@ ggml_metal_buffer_t ggml_metal_buffer_map(ggml_metal_device_t dev, void * ptr, s
size_aligned += (size_page - (size_aligned % size_page));
}
res->device = ggml_metal_device_get_obj(dev);
res->queue = ggml_metal_device_get_queue(dev);
const struct ggml_metal_device_props * props_dev = ggml_metal_device_get_props(dev);
// the buffer fits into the max buffer size allowed by the device
@@ -1283,7 +1410,7 @@ ggml_metal_buffer_t ggml_metal_buffer_map(ggml_metal_device_t dev, void * ptr, s
res->buffers[res->n_buffers].metal = nil;
if (size_aligned > 0) {
res->buffers[res->n_buffers].metal = [res->device newBufferWithBytesNoCopy:ptr length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
res->buffers[res->n_buffers].metal = [res->dev->mtl_device newBufferWithBytesNoCopy:ptr length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
if (res->buffers[res->n_buffers].metal == nil) {
GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
@@ -1292,7 +1419,7 @@ ggml_metal_buffer_t ggml_metal_buffer_map(ggml_metal_device_t dev, void * ptr, s
}
}
ggml_metal_log_allocated_size(res->device, size_aligned);
ggml_metal_log_allocated_size(res->dev->mtl_device, size_aligned);
++res->n_buffers;
} else {
@@ -1310,7 +1437,7 @@ ggml_metal_buffer_t ggml_metal_buffer_map(ggml_metal_device_t dev, void * ptr, s
res->buffers[res->n_buffers].metal = nil;
if (size_step_aligned > 0) {
res->buffers[res->n_buffers].metal = [res->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) ptr + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
res->buffers[res->n_buffers].metal = [res->dev->mtl_device newBufferWithBytesNoCopy:(void *) ((uint8_t *) ptr + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
if (res->buffers[res->n_buffers].metal == nil) {
GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_step_aligned / 1024.0 / 1024.0);
@@ -1319,7 +1446,7 @@ ggml_metal_buffer_t ggml_metal_buffer_map(ggml_metal_device_t dev, void * ptr, s
}
}
ggml_metal_log_allocated_size(res->device, size_step_aligned);
ggml_metal_log_allocated_size(res->dev->mtl_device, size_step_aligned);
if (i + size_step < size) {
GGML_LOG_INFO("\n");
@@ -1337,10 +1464,14 @@ ggml_metal_buffer_t ggml_metal_buffer_map(ggml_metal_device_t dev, void * ptr, s
return NULL;
}
ggml_metal_device_rsets_add(dev, res->rset);
return res;
}
void ggml_metal_buffer_free(ggml_metal_buffer_t buf) {
ggml_metal_device_rsets_rm(buf->dev, buf->rset);
for (int i = 0; i < buf->n_buffers; i++) {
[buf->buffers[i].metal release];
}
@@ -1377,8 +1508,7 @@ void ggml_metal_buffer_memset_tensor(ggml_metal_buffer_t buf, struct ggml_tensor
struct ggml_metal_buffer_id bid_dst = ggml_metal_buffer_get_id(buf, tensor);
bid_dst.offs += offset;
id<MTLCommandQueue> queue = buf->queue;
id<MTLCommandBuffer> cmd_buf = [queue commandBufferWithUnretainedReferences];
id<MTLCommandBuffer> cmd_buf = [buf->dev->mtl_queue commandBufferWithUnretainedReferences];
{
id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
@@ -1404,7 +1534,7 @@ void ggml_metal_buffer_set_tensor(ggml_metal_buffer_t buf, struct ggml_tensor *
@autoreleasepool {
// src
void * data_ptr = (void *)(uintptr_t) data; // "const cast" the src data
id<MTLBuffer> buf_src = [buf->device newBufferWithBytesNoCopy:data_ptr
id<MTLBuffer> buf_src = [buf->dev->mtl_device newBufferWithBytesNoCopy:data_ptr
length:size
options:MTLResourceStorageModeShared
deallocator:nil];
@@ -1419,8 +1549,7 @@ void ggml_metal_buffer_set_tensor(ggml_metal_buffer_t buf, struct ggml_tensor *
// this is an alternative to waitUntilCompleted, which should be faster, but doesn't seem to make much difference
dispatch_semaphore_t completion_semaphore = dispatch_semaphore_create(0);
id<MTLCommandQueue> queue = buf->queue;
id<MTLCommandBuffer> cmd_buf = [queue commandBufferWithUnretainedReferences];
id<MTLCommandBuffer> cmd_buf = [buf->dev->mtl_queue commandBufferWithUnretainedReferences];
{
id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
@@ -1462,15 +1591,14 @@ void ggml_metal_buffer_get_tensor(ggml_metal_buffer_t buf, const struct ggml_ten
bid_src.offs += offset;
// dst
id<MTLBuffer> buf_dst = [buf->device newBufferWithBytesNoCopy:data
id<MTLBuffer> buf_dst = [buf->dev->mtl_device newBufferWithBytesNoCopy:data
length:size
options:MTLResourceStorageModeShared
deallocator:nil];
GGML_ASSERT(buf_dst);
id<MTLCommandQueue> queue = buf->queue;
id<MTLCommandBuffer> cmd_buf = [queue commandBufferWithUnretainedReferences];
id<MTLCommandBuffer> cmd_buf = [buf->dev->mtl_queue commandBufferWithUnretainedReferences];
{
id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
@@ -1496,8 +1624,7 @@ void ggml_metal_buffer_clear(ggml_metal_buffer_t buf, uint8_t value) {
}
@autoreleasepool {
id<MTLCommandQueue> queue = buf->queue;
id<MTLCommandBuffer> cmd_buf = [queue commandBufferWithUnretainedReferences];
id<MTLCommandBuffer> cmd_buf = [buf->dev->mtl_queue commandBufferWithUnretainedReferences];
{
id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];

View File

@@ -182,6 +182,10 @@ typedef struct {
float bias;
} ggml_metal_kargs_scale;
typedef struct {
float val;
} ggml_metal_kargs_fill;
typedef struct {
float min;
float max;
@@ -831,6 +835,25 @@ typedef struct {
float slope;
} ggml_metal_kargs_leaky_relu;
typedef struct {
int32_t ne00;
int32_t ne01;
int32_t ne02;
int32_t ne03;
uint64_t nb00;
uint64_t nb01;
uint64_t nb02;
uint64_t nb03;
int32_t ne0;
int32_t ne1;
int32_t ne2;
int32_t ne3;
uint64_t nb0;
uint64_t nb1;
uint64_t nb2;
uint64_t nb3;
} ggml_metal_kargs_tri;
typedef struct {
int32_t ne00;
int32_t ne01;

View File

@@ -286,6 +286,10 @@ static int ggml_metal_op_encode_impl(ggml_metal_op_t ctx, int idx) {
{
n_fuse = ggml_metal_op_scale(ctx, idx);
} break;
case GGML_OP_FILL:
{
n_fuse = ggml_metal_op_fill(ctx, idx);
} break;
case GGML_OP_CLAMP:
{
n_fuse = ggml_metal_op_clamp(ctx, idx);
@@ -414,6 +418,10 @@ static int ggml_metal_op_encode_impl(ggml_metal_op_t ctx, int idx) {
{
n_fuse = ggml_metal_op_leaky_relu(ctx, idx);
} break;
case GGML_OP_TRI:
{
n_fuse = ggml_metal_op_tri(ctx, idx);
} break;
case GGML_OP_FLASH_ATTN_EXT:
{
n_fuse = ggml_metal_op_flash_attn_ext(ctx, idx);
@@ -524,7 +532,7 @@ int ggml_metal_op_concat(ggml_metal_op_t ctx, int idx) {
/*.dim =*/ dim,
};
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_base(lib, GGML_OP_CONCAT);
auto pipeline = ggml_metal_library_get_pipeline_base(lib, GGML_OP_CONCAT);
ggml_metal_encoder_set_pipeline(enc, pipeline);
ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
@@ -550,7 +558,7 @@ int ggml_metal_op_repeat(ggml_metal_op_t ctx, int idx) {
GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_repeat(lib, op->type);
auto pipeline = ggml_metal_library_get_pipeline_repeat(lib, op->type);
ggml_metal_kargs_repeat args = {
/*.ne00 =*/ ne00,
@@ -616,7 +624,7 @@ int ggml_metal_op_acc(ggml_metal_op_t ctx, int idx) {
// TODO: make a simpler cpy_bytes kernel
//const id<MTLComputePipelineState> pipeline = ctx->pipelines[GGML_METAL_PIPELINE_TYPE_CPY_F32_F32].obj;
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_cpy(lib, op->src[0]->type, op->type);
auto pipeline = ggml_metal_library_get_pipeline_cpy(lib, op->src[0]->type, op->type);
ggml_metal_kargs_cpy args = {
/*.nk0 =*/ ne00,
@@ -679,7 +687,7 @@ int ggml_metal_op_acc(ggml_metal_op_t ctx, int idx) {
/*.o1 =*/ { 0 },
};
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_bin(lib, GGML_OP_ADD, 1, false);
auto pipeline = ggml_metal_library_get_pipeline_bin(lib, GGML_OP_ADD, 1, false);
ggml_metal_encoder_set_pipeline(enc, pipeline);
ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
@@ -721,7 +729,42 @@ int ggml_metal_op_scale(ggml_metal_op_t ctx, int idx) {
n /= 4;
}
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_unary(lib, op);
auto pipeline = ggml_metal_library_get_pipeline_unary(lib, op);
ggml_metal_encoder_set_pipeline(enc, pipeline);
ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 2);
ggml_metal_encoder_dispatch_threadgroups(enc, n, 1, 1, 1, 1, 1);
return 1;
}
int ggml_metal_op_fill(ggml_metal_op_t ctx, int idx) {
ggml_tensor * op = ctx->node(idx);
ggml_metal_library_t lib = ctx->lib;
ggml_metal_encoder_t enc = ctx->enc;
GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
const float val = ggml_get_op_params_f32(op, 0);
ggml_metal_kargs_fill args = {
/*.val =*/ val
};
int64_t n = ggml_nelements(op);
if (n % 4 == 0) {
n /= 4;
}
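// when the element count is divisible by 4, the dispatch count is reduced accordingly;
// this is presumably paired with the float4 kernel variant (kernel_fill_f32_4 later in
// this diff), which writes four values per thread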
auto pipeline = ggml_metal_library_get_pipeline_unary(lib, op);
ggml_metal_encoder_set_pipeline(enc, pipeline);
ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
@@ -760,7 +803,7 @@ int ggml_metal_op_clamp(ggml_metal_op_t ctx, int idx) {
n /= 4;
}
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_unary(lib, op);
auto pipeline = ggml_metal_library_get_pipeline_unary(lib, op);
ggml_metal_encoder_set_pipeline(enc, pipeline);
ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
@@ -789,7 +832,7 @@ int ggml_metal_op_unary(ggml_metal_op_t ctx, int idx) {
n /= 4;
}
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_unary(lib, op);
auto pipeline = ggml_metal_library_get_pipeline_unary(lib, op);
ggml_metal_encoder_set_pipeline(enc, pipeline);
ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 0);
@@ -817,7 +860,7 @@ int ggml_metal_op_glu(ggml_metal_op_t ctx, int idx) {
GGML_ASSERT(ggml_are_same_shape(op->src[0], op->src[1]));
}
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_glu(lib, op);
auto pipeline = ggml_metal_library_get_pipeline_glu(lib, op);
const int32_t swp = ggml_get_op_params_i32(op, 1);
const float alpha = ggml_get_op_params_f32(op, 2);
@@ -870,7 +913,7 @@ int ggml_metal_op_sum(ggml_metal_op_t ctx, int idx) {
/*.np =*/ n,
};
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_sum(lib, op);
auto pipeline = ggml_metal_library_get_pipeline_sum(lib, op);
int nth = 32; // SIMD width
@@ -925,7 +968,7 @@ int ggml_metal_op_sum_rows(ggml_metal_op_t ctx, int idx) {
/*.nb3 =*/ nb3,
};
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_sum_rows(lib, op);
auto pipeline = ggml_metal_library_get_pipeline_sum_rows(lib, op);
int nth = 32; // SIMD width
@@ -936,7 +979,7 @@ int ggml_metal_op_sum_rows(ggml_metal_op_t ctx, int idx) {
nth = std::min(nth, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
nth = std::min(nth, ne00);
const size_t smem = ggml_metal_pipeline_get_smem(pipeline);
const size_t smem = pipeline.smem;
ggml_metal_encoder_set_pipeline(enc, pipeline);
ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
@@ -963,7 +1006,7 @@ int ggml_metal_op_cumsum(ggml_metal_op_t ctx, int idx) {
GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
ggml_metal_pipeline_t pipeline_blk = ggml_metal_library_get_pipeline_cumsum_blk(lib, op);
auto pipeline_blk = ggml_metal_library_get_pipeline_cumsum_blk(lib, op);
int nth = 1;
while (nth < ne00 && 2*nth <= ggml_metal_pipeline_max_theads_per_threadgroup(pipeline_blk)) {
@@ -1060,7 +1103,7 @@ int ggml_metal_op_cumsum(ggml_metal_op_t ctx, int idx) {
ggml_metal_op_concurrency_reset(ctx);
{
ggml_metal_pipeline_t pipeline_add = ggml_metal_library_get_pipeline_cumsum_add(lib, op);
auto pipeline_add = ggml_metal_library_get_pipeline_cumsum_add(lib, op);
ggml_metal_kargs_cumsum_add args = {
/*.ne00 =*/ ne00,
@@ -1106,7 +1149,7 @@ int ggml_metal_op_get_rows(ggml_metal_op_t ctx, int idx) {
GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_get_rows(lib, op->src[0]->type);
auto pipeline = ggml_metal_library_get_pipeline_get_rows(lib, op->src[0]->type);
ggml_metal_kargs_get_rows args = {
/*.ne00t =*/ ggml_is_quantized(op->src[0]->type) ? ne00/16 : ne00,
@@ -1151,7 +1194,7 @@ int ggml_metal_op_set_rows(ggml_metal_op_t ctx, int idx) {
GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_set_rows(lib, op->src[1]->type, op->type);
auto pipeline = ggml_metal_library_get_pipeline_set_rows(lib, op->src[1]->type, op->type);
const int32_t nk0 = ne0/ggml_blck_size(op->type);
@@ -1252,7 +1295,7 @@ int ggml_metal_op_soft_max(ggml_metal_op_t ctx, int idx) {
/*.n_head_log2 =*/ n_head_log2,
};
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_soft_max(lib, op);
auto pipeline = ggml_metal_library_get_pipeline_soft_max(lib, op);
int nth = 32; // SIMD width
@@ -1266,7 +1309,7 @@ int ggml_metal_op_soft_max(ggml_metal_op_t ctx, int idx) {
}
}
const size_t smem = ggml_metal_pipeline_get_smem(pipeline);
const size_t smem = pipeline.smem;
ggml_metal_encoder_set_pipeline(enc, pipeline);
ggml_metal_encoder_set_bytes(enc, &args, sizeof(args), 0);
@@ -1322,7 +1365,7 @@ int ggml_metal_op_ssm_conv(ggml_metal_op_t ctx, int idx) {
/*.nb2 =*/ nb2,
};
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_ssm_conv(lib, op);
auto pipeline = ggml_metal_library_get_pipeline_ssm_conv(lib, op);
ggml_metal_encoder_set_pipeline(enc, pipeline);
ggml_metal_encoder_set_bytes(enc, &args, sizeof(args), 0);
@@ -1409,11 +1452,11 @@ int ggml_metal_op_ssm_scan(ggml_metal_op_t ctx, int idx) {
/*.nb0 =*/ nb0,
};
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_ssm_scan(lib, op);
auto pipeline = ggml_metal_library_get_pipeline_ssm_scan(lib, op);
GGML_ASSERT(d_state <= ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
const size_t sms = ggml_metal_pipeline_get_smem(pipeline);
const size_t smem = pipeline.smem;
ggml_metal_encoder_set_pipeline(enc, pipeline);
ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
@@ -1426,7 +1469,7 @@ int ggml_metal_op_ssm_scan(ggml_metal_op_t ctx, int idx) {
ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[6]), 7);
ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 8);
ggml_metal_encoder_set_threadgroup_memory_size(enc, sms, 0);
ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
ggml_metal_encoder_dispatch_threadgroups(enc, d_inner, n_head, n_seqs, d_state, 1, 1);
@@ -1449,7 +1492,7 @@ int ggml_metal_op_rwkv(ggml_metal_op_t ctx, int idx) {
const int64_t C = op->ne[0];
const int64_t H = op->src[0]->ne[1];
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_rwkv(lib, op);
auto pipeline = ggml_metal_library_get_pipeline_rwkv(lib, op);
int ida = 0;
@@ -1485,7 +1528,7 @@ int ggml_metal_op_cpy(ggml_metal_op_t ctx, int idx) {
GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_cpy(lib, op->src[0]->type, op->type);
auto pipeline = ggml_metal_library_get_pipeline_cpy(lib, op->src[0]->type, op->type);
GGML_ASSERT(ne00 % ggml_blck_size(op->src[0]->type) == 0);
@@ -1592,7 +1635,7 @@ int ggml_metal_op_pool_2d(ggml_metal_op_t ctx, int idx) {
/* .np = */ np
};
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_pool_2d(lib, op, op_pool);
auto pipeline = ggml_metal_library_get_pipeline_pool_2d(lib, op, op_pool);
const int nth = std::min(ggml_metal_pipeline_max_theads_per_threadgroup(pipeline), (int) np);
const int ntg = (np + nth - 1) / nth;
@@ -1701,7 +1744,7 @@ int ggml_metal_op_mul_mat(ggml_metal_op_t ctx, int idx) {
GGML_ABORT("unsupported ne11");
};
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_mul_mv_ext(lib, op->src[0]->type, op->src[1]->type, nsg, nxpsg, r1ptg);
auto pipeline = ggml_metal_library_get_pipeline_mul_mv_ext(lib, op->src[0]->type, op->src[1]->type, nsg, nxpsg, r1ptg);
ggml_metal_kargs_mul_mv_ext args = {
/*.ne00 =*/ ne00,
@@ -1748,7 +1791,7 @@ int ggml_metal_op_mul_mat(ggml_metal_op_t ctx, int idx) {
// default: break;
//}
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_mul_mm(lib, op);
auto pipeline = ggml_metal_library_get_pipeline_mul_mm(lib, op);
ggml_metal_kargs_mul_mm args = {
/*.ne00 =*/ ne00,
@@ -1773,18 +1816,18 @@ int ggml_metal_op_mul_mat(ggml_metal_op_t ctx, int idx) {
ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), 2);
ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 3);
const size_t smem = ggml_metal_pipeline_get_smem(pipeline);
const size_t smem = pipeline.smem;
ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
ggml_metal_encoder_dispatch_threadgroups(enc, ((ne11 + 31)/32), ((ne01 + 63)/64), ne12*ne13, 128, 1, 1);
} else {
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_mul_mv(lib, op);
auto pipeline = ggml_metal_library_get_pipeline_mul_mv(lib, op);
const int nr0 = ggml_metal_pipeline_get_nr0(pipeline);
const int nr1 = ggml_metal_pipeline_get_nr1(pipeline);
const int nsg = ggml_metal_pipeline_get_nsg(pipeline);
const int nr0 = pipeline.nr0;
const int nr1 = pipeline.nr1;
const int nsg = pipeline.nsg;
const size_t smem = ggml_metal_pipeline_get_smem(pipeline);
const size_t smem = pipeline.smem;
ggml_metal_kargs_mul_mv args = {
/*.ne00 =*/ ne00,
@@ -1915,9 +1958,9 @@ int ggml_metal_op_mul_mat_id(ggml_metal_op_t ctx, int idx) {
nb21,
};
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_mul_mm_id_map0(lib, ne02, ne20);
auto pipeline = ggml_metal_library_get_pipeline_mul_mm_id_map0(lib, ne02, ne20);
const size_t smem = ggml_metal_pipeline_get_smem(pipeline);
const size_t smem = pipeline.smem;
GGML_ASSERT(ne02 <= ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
@@ -1938,7 +1981,7 @@ int ggml_metal_op_mul_mat_id(ggml_metal_op_t ctx, int idx) {
ggml_metal_op_concurrency_reset(ctx);
{
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_mul_mm_id(lib, op);
auto pipeline = ggml_metal_library_get_pipeline_mul_mm_id(lib, op);
ggml_metal_kargs_mul_mm_id args = {
/*.ne00 =*/ ne00,
@@ -1967,20 +2010,20 @@ int ggml_metal_op_mul_mat_id(ggml_metal_op_t ctx, int idx) {
ggml_metal_encoder_set_buffer (enc, bid_ids, 4);
ggml_metal_encoder_set_buffer (enc, bid_dst, 5);
const size_t smem = ggml_metal_pipeline_get_smem(pipeline);
const size_t smem = pipeline.smem;
ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0);
ggml_metal_encoder_dispatch_threadgroups(enc, (ne21 + 31)/32, (ne01 + 63)/64, ne02, 128, 1, 1);
}
} else {
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_mul_mv_id(lib, op);
auto pipeline = ggml_metal_library_get_pipeline_mul_mv_id(lib, op);
const int nr0 = ggml_metal_pipeline_get_nr0(pipeline);
const int nr1 = ggml_metal_pipeline_get_nr1(pipeline);
const int nsg = ggml_metal_pipeline_get_nsg(pipeline);
const int nr0 = pipeline.nr0;
const int nr1 = pipeline.nr1;
const int nsg = pipeline.nsg;
const size_t smem = ggml_metal_pipeline_get_smem(pipeline);
const size_t smem = pipeline.smem;
ggml_metal_kargs_mul_mv_id args = {
/*.nei0 =*/ ne20,
@@ -2064,7 +2107,7 @@ int ggml_metal_op_add_id(ggml_metal_op_t ctx, int idx) {
/*.nb21 =*/ nb21,
};
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_base(lib, GGML_OP_ADD_ID);
auto pipeline = ggml_metal_library_get_pipeline_base(lib, GGML_OP_ADD_ID);
ggml_metal_encoder_set_pipeline(enc, pipeline);
ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
@@ -2308,7 +2351,7 @@ int ggml_metal_op_flash_attn_ext(ggml_metal_op_t ctx, int idx) {
/*.nb33 =*/nb33,
};
ggml_metal_pipeline_t pipeline0 = ggml_metal_library_get_pipeline_flash_attn_ext_pad(lib, op, has_mask, ncpsg);
auto pipeline0 = ggml_metal_library_get_pipeline_flash_attn_ext_pad(lib, op, has_mask, ncpsg);
ggml_metal_encoder_set_pipeline(enc, pipeline0);
ggml_metal_encoder_set_bytes (enc, &args0, sizeof(args0), 0);
@@ -2339,7 +2382,7 @@ int ggml_metal_op_flash_attn_ext(ggml_metal_op_t ctx, int idx) {
/*.nb33 =*/ nb33,
};
ggml_metal_pipeline_t pipeline0 = ggml_metal_library_get_pipeline_flash_attn_ext_blk(lib, op, nqptg, ncpsg);
auto pipeline0 = ggml_metal_library_get_pipeline_flash_attn_ext_blk(lib, op, nqptg, ncpsg);
ggml_metal_encoder_set_pipeline(enc, pipeline0);
ggml_metal_encoder_set_bytes (enc, &args0, sizeof(args0), 0);
@@ -2424,7 +2467,7 @@ int ggml_metal_op_flash_attn_ext(ggml_metal_op_t ctx, int idx) {
/*.logit_softcap =*/ logit_softcap,
};
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_flash_attn_ext(lib, op, has_mask, has_sinks, has_bias, has_scap, has_kvpad, nsg);
auto pipeline = ggml_metal_library_get_pipeline_flash_attn_ext(lib, op, has_mask, has_sinks, has_bias, has_scap, has_kvpad, nsg);
ggml_metal_encoder_set_pipeline(enc, pipeline);
ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
@@ -2476,7 +2519,7 @@ int ggml_metal_op_flash_attn_ext(ggml_metal_op_t ctx, int idx) {
/*.nb33 =*/nb33,
};
ggml_metal_pipeline_t pipeline0 = ggml_metal_library_get_pipeline_flash_attn_ext_pad(lib, op, has_mask, ncpsg);
auto pipeline0 = ggml_metal_library_get_pipeline_flash_attn_ext_pad(lib, op, has_mask, ncpsg);
ggml_metal_encoder_set_pipeline(enc, pipeline0);
ggml_metal_encoder_set_bytes (enc, &args0, sizeof(args0), 0);
@@ -2578,7 +2621,7 @@ int ggml_metal_op_flash_attn_ext(ggml_metal_op_t ctx, int idx) {
/*.logit_softcap =*/ logit_softcap,
};
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_flash_attn_ext_vec(lib, op, has_mask, has_sinks, has_bias, has_scap, has_kvpad, nsg, nwg);
auto pipeline = ggml_metal_library_get_pipeline_flash_attn_ext_vec(lib, op, has_mask, has_sinks, has_bias, has_scap, has_kvpad, nsg, nwg);
GGML_ASSERT(nsg*32 <= ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
@@ -2630,7 +2673,7 @@ int ggml_metal_op_flash_attn_ext(ggml_metal_op_t ctx, int idx) {
nrows,
};
ggml_metal_pipeline_t pipeline0 = ggml_metal_library_get_pipeline_flash_attn_ext_vec_reduce(lib, op, ne20, nwg);
auto pipeline0 = ggml_metal_library_get_pipeline_flash_attn_ext_vec_reduce(lib, op, ne20, nwg);
ggml_metal_encoder_set_pipeline(enc, pipeline0);
ggml_metal_encoder_set_bytes (enc, &args0, sizeof(args0), 0);
@@ -2762,7 +2805,7 @@ int ggml_metal_op_bin(ggml_metal_op_t ctx, int idx) {
// the offsets of src1 and all fused buffers are relative to the start of the src1 buffer
bid_src1.offs = 0;
ggml_metal_pipeline_t pipeline = nullptr;
struct ggml_metal_pipeline_with_params pipeline;
if (ggml_nelements(op->src[1]) == ne10 && ggml_is_contiguous(op->src[1]) && ne00 % 4 == 0 && ne10 % 4 == 0) {
GGML_ASSERT(ggml_is_contiguous(op->src[0]));
@@ -2835,7 +2878,7 @@ int ggml_metal_op_l2_norm(ggml_metal_op_t ctx, int idx) {
/*.eps =*/ eps,
};
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_l2_norm(lib, op);
auto pipeline = ggml_metal_library_get_pipeline_l2_norm(lib, op);
while (nth < ne00/4 && nth < ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) {
nth *= 2;
@@ -2844,7 +2887,7 @@ int ggml_metal_op_l2_norm(ggml_metal_op_t ctx, int idx) {
nth = std::min(nth, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
nth = std::min(nth, ne00/4);
const size_t smem = ggml_metal_pipeline_get_smem(pipeline);
const size_t smem = pipeline.smem;
const int64_t nrows = ggml_nrows(op->src[0]);
@@ -2887,7 +2930,7 @@ int ggml_metal_op_group_norm(ggml_metal_op_t ctx, int idx) {
/*.eps =*/ eps,
};
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_group_norm(lib, op);
auto pipeline = ggml_metal_library_get_pipeline_group_norm(lib, op);
int nth = 32; // SIMD width
//while (nth < ne00/4 && nth < ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) {
@@ -2897,7 +2940,7 @@ int ggml_metal_op_group_norm(ggml_metal_op_t ctx, int idx) {
//nth = std::min(nth, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
//nth = std::min(nth, ne00/4);
const size_t smem = ggml_metal_pipeline_get_smem(pipeline);
const size_t smem = pipeline.smem;
ggml_metal_encoder_set_pipeline(enc, pipeline);
ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
@@ -3022,7 +3065,7 @@ int ggml_metal_op_norm(ggml_metal_op_t ctx, int idx) {
}
}
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_norm(lib, op, n_fuse);
auto pipeline = ggml_metal_library_get_pipeline_norm(lib, op, n_fuse);
int nth = 32; // SIMD width
@@ -3033,7 +3076,7 @@ int ggml_metal_op_norm(ggml_metal_op_t ctx, int idx) {
nth = std::min(nth, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
nth = std::min(nth, args.ne00_t);
const size_t smem = ggml_metal_pipeline_get_smem(pipeline);
const size_t smem = pipeline.smem;
ggml_metal_encoder_set_pipeline(enc, pipeline);
ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
@@ -3127,7 +3170,7 @@ int ggml_metal_op_rope(ggml_metal_op_t ctx, int idx) {
/* src2 =*/ op->src[2] != nullptr,
};
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_rope(lib, op);
auto pipeline = ggml_metal_library_get_pipeline_rope(lib, op);
ggml_metal_encoder_set_pipeline(enc, pipeline);
ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
@@ -3199,7 +3242,7 @@ int ggml_metal_op_im2col(ggml_metal_op_t ctx, int idx) {
/*.KHW =*/ KH * KW,
};
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_im2col(lib, op);
auto pipeline = ggml_metal_library_get_pipeline_im2col(lib, op);
GGML_ASSERT(KH*KW <= ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
@@ -3270,7 +3313,7 @@ int ggml_metal_op_conv_2d(ggml_metal_op_t ctx, int idx) {
/*.d1 =*/ d1,
};
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_conv_2d(lib, op);
auto pipeline = ggml_metal_library_get_pipeline_conv_2d(lib, op);
int nth = ggml_metal_pipeline_max_theads_per_threadgroup(pipeline);
nth = std::min(nth, 256);
@@ -3325,7 +3368,7 @@ int ggml_metal_op_conv_transpose_1d(ggml_metal_op_t ctx, int idx) {
/*.nb1 =*/ nb1,
};
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_conv_transpose_1d(lib, op);
auto pipeline = ggml_metal_library_get_pipeline_conv_transpose_1d(lib, op);
ggml_metal_encoder_set_pipeline(enc, pipeline);
ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
@@ -3377,7 +3420,7 @@ int ggml_metal_op_conv_transpose_2d(ggml_metal_op_t ctx, int idx) {
/*.nb2 =*/ nb2,
};
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_conv_transpose_2d(lib, op);
auto pipeline = ggml_metal_library_get_pipeline_conv_transpose_2d(lib, op);
ggml_metal_encoder_set_pipeline(enc, pipeline);
ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
@@ -3433,7 +3476,7 @@ int ggml_metal_op_upscale(ggml_metal_op_t ctx, int idx) {
/*.sf3 =*/ sf3
};
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_upscale(lib, op);
auto pipeline = ggml_metal_library_get_pipeline_upscale(lib, op);
const int nth = std::min(ggml_metal_pipeline_max_theads_per_threadgroup(pipeline), ne0);
@@ -3477,7 +3520,7 @@ int ggml_metal_op_pad(ggml_metal_op_t ctx, int idx) {
/*.nb3 =*/ nb3
};
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_pad(lib, op);
auto pipeline = ggml_metal_library_get_pipeline_pad(lib, op);
const int nth = std::min(1024, ne0);
@@ -3523,7 +3566,7 @@ int ggml_metal_op_pad_reflect_1d(ggml_metal_op_t ctx, int idx) {
/*.p1 =*/ ((const int32_t *)(op->op_params))[1]
};
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_pad_reflect_1d(lib, op);
auto pipeline = ggml_metal_library_get_pipeline_pad_reflect_1d(lib, op);
const int nth = std::min(1024, ne0);
@@ -3560,7 +3603,7 @@ int ggml_metal_op_arange(ggml_metal_op_t ctx, int idx) {
const int nth = std::min(1024, ne0);
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_arange(lib, op);
auto pipeline = ggml_metal_library_get_pipeline_arange(lib, op);
ggml_metal_encoder_set_pipeline(enc, pipeline);
ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
@@ -3591,7 +3634,7 @@ int ggml_metal_op_timestep_embedding(ggml_metal_op_t ctx, int idx) {
/*.max_period =*/ max_period,
};
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_timestep_embedding(lib, op);
auto pipeline = ggml_metal_library_get_pipeline_timestep_embedding(lib, op);
const int nth = std::max(1, std::min(1024, dim/2));
@@ -3621,7 +3664,7 @@ int ggml_metal_op_argmax(ggml_metal_op_t ctx, int idx) {
/*.nb01 = */ nb01,
};
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_argmax(lib, op);
auto pipeline = ggml_metal_library_get_pipeline_argmax(lib, op);
const int64_t nrows = ggml_nrows(op->src[0]);
@@ -3630,7 +3673,7 @@ int ggml_metal_op_argmax(ggml_metal_op_t ctx, int idx) {
nth *= 2;
}
const size_t smem = ggml_metal_pipeline_get_smem(pipeline);
const size_t smem = pipeline.smem;
ggml_metal_encoder_set_pipeline(enc, pipeline);
ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
@@ -3657,7 +3700,7 @@ int ggml_metal_op_argsort(ggml_metal_op_t ctx, int idx) {
GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_argsort(lib, op);
auto pipeline = ggml_metal_library_get_pipeline_argsort(lib, op);
// bitonic sort requires the number of elements to be a power of 2
int nth = 1;
@@ -3706,7 +3749,7 @@ int ggml_metal_op_argsort(ggml_metal_op_t ctx, int idx) {
ggml_metal_encoder_dispatch_threadgroups(enc, npr*ne01, ne02, ne03, nth, 1, 1);
ggml_metal_pipeline_t pipeline_merge = ggml_metal_library_get_pipeline_argsort_merge(lib, op);
auto pipeline_merge = ggml_metal_library_get_pipeline_argsort_merge(lib, op);
int len = nth;
@@ -3764,7 +3807,7 @@ int ggml_metal_op_top_k(ggml_metal_op_t ctx, int idx) {
GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_top_k(lib, op);
auto pipeline = ggml_metal_library_get_pipeline_top_k(lib, op);
// bitonic sort requires the number of elements to be a power of 2
int nth = 1;
@@ -3818,7 +3861,7 @@ int ggml_metal_op_top_k(ggml_metal_op_t ctx, int idx) {
ggml_metal_encoder_dispatch_threadgroups(enc, npr*ne01, ne02, ne03, nth, 1, 1);
ggml_metal_pipeline_t pipeline_merge = ggml_metal_library_get_pipeline_top_k_merge(lib, op);
auto pipeline_merge = ggml_metal_library_get_pipeline_top_k_merge(lib, op);
int len = args.top_k;
@@ -3881,7 +3924,7 @@ int ggml_metal_op_leaky_relu(ggml_metal_op_t ctx, int idx) {
/*.slope =*/ slope
};
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_unary(lib, op);
auto pipeline = ggml_metal_library_get_pipeline_unary(lib, op);
int64_t n = ggml_nelements(op);
@@ -3899,6 +3942,57 @@ int ggml_metal_op_leaky_relu(ggml_metal_op_t ctx, int idx) {
return 1;
}
int ggml_metal_op_tri(ggml_metal_op_t ctx, int idx) {
ggml_tensor * op = ctx->node(idx);
ggml_metal_library_t lib = ctx->lib;
ggml_metal_encoder_t enc = ctx->enc;
GGML_TENSOR_LOCALS( int32_t, ne0, op->src[0], ne);
GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
ggml_metal_kargs_tri args = {
/*.ne00 =*/ ne00,
/*.ne01 =*/ ne01,
/*.ne02 =*/ ne02,
/*.ne03 =*/ ne03,
/*.nb00 =*/ nb00,
/*.nb01 =*/ nb01,
/*.nb02 =*/ nb02,
/*.nb03 =*/ nb03,
/*.ne0 =*/ ne0,
/*.ne1 =*/ ne1,
/*.ne2 =*/ ne2,
/*.ne3 =*/ ne3,
/*.nb0 =*/ nb0,
/*.nb1 =*/ nb1,
/*.nb2 =*/ nb2,
/*.nb3 =*/ nb3,
};
auto pipeline = ggml_metal_library_get_pipeline_tri(lib, op);
int nth = 32; // SIMD width
while (nth < ne00 && nth < ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) {
nth *= 2;
}
nth = std::min(nth, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
nth = std::min(nth, ne00);
ggml_metal_encoder_set_pipeline(enc, pipeline);
ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 2);
ggml_metal_encoder_dispatch_threadgroups(enc, ne01, ne02, ne03, nth, 1, 1);
return 1;
}
int ggml_metal_op_opt_step_adamw(ggml_metal_op_t ctx, int idx) {
ggml_tensor * op = ctx->node(idx);
@@ -3910,7 +4004,7 @@ int ggml_metal_op_opt_step_adamw(ggml_metal_op_t ctx, int idx) {
GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_opt_step_adamw(lib, op);
auto pipeline = ggml_metal_library_get_pipeline_opt_step_adamw(lib, op);
const int64_t np = ggml_nelements(op->src[0]);
ggml_metal_kargs_opt_step_adamw args = {
@@ -3946,7 +4040,7 @@ int ggml_metal_op_opt_step_sgd(ggml_metal_op_t ctx, int idx) {
GGML_TENSOR_LOCALS( int32_t, ne, op, ne);
GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_opt_step_sgd(lib, op);
auto pipeline = ggml_metal_library_get_pipeline_opt_step_sgd(lib, op);
const int64_t np = ggml_nelements(op->src[0]);
ggml_metal_kargs_opt_step_sgd args = {

View File

@@ -47,6 +47,7 @@ int ggml_metal_op_concat (ggml_metal_op_t ctx, int idx);
int ggml_metal_op_repeat (ggml_metal_op_t ctx, int idx);
int ggml_metal_op_acc (ggml_metal_op_t ctx, int idx);
int ggml_metal_op_scale (ggml_metal_op_t ctx, int idx);
int ggml_metal_op_fill (ggml_metal_op_t ctx, int idx);
int ggml_metal_op_clamp (ggml_metal_op_t ctx, int idx);
int ggml_metal_op_unary (ggml_metal_op_t ctx, int idx);
int ggml_metal_op_glu (ggml_metal_op_t ctx, int idx);
@@ -83,6 +84,7 @@ int ggml_metal_op_argmax (ggml_metal_op_t ctx, int idx);
int ggml_metal_op_argsort (ggml_metal_op_t ctx, int idx);
int ggml_metal_op_top_k (ggml_metal_op_t ctx, int idx);
int ggml_metal_op_leaky_relu (ggml_metal_op_t ctx, int idx);
int ggml_metal_op_tri (ggml_metal_op_t ctx, int idx);
int ggml_metal_op_opt_step_adamw (ggml_metal_op_t ctx, int idx);
int ggml_metal_op_opt_step_sgd (ggml_metal_op_t ctx, int idx);

View File

@@ -1249,6 +1249,22 @@ kernel void kernel_scale_f32_4(
dst[tpig] = src0[tpig] * args.scale + args.bias;
}
kernel void kernel_fill_f32(
constant ggml_metal_kargs_fill & args,
device const float * src0,
device float * dst,
uint tpig[[thread_position_in_grid]]) {
dst[tpig] = args.val;
}
kernel void kernel_fill_f32_4(
constant ggml_metal_kargs_fill & args,
device const float4 * src0,
device float4 * dst,
uint tpig[[thread_position_in_grid]]) {
dst[tpig] = args.val;
}
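For reference, the two fill kernels above simply broadcast the scalar from the op params over the destination, ignoring the source values. A hypothetical CPU-side equivalent (a sketch, not part of the change):

#include <stdint.h>

// hypothetical CPU reference for GGML_OP_FILL over n contiguous F32 elements
static void fill_f32_ref(float * dst, int64_t n, float val) {
    for (int64_t i = 0; i < n; ++i) {
        dst[i] = val;
    }
}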
kernel void kernel_clamp_f32(
constant ggml_metal_kargs_clamp & args,
device const float * src0,
@@ -1595,6 +1611,36 @@ kernel void kernel_exp_f32_4(
dst[tpig] = exp(src0[tpig]);
}
kernel void kernel_softplus_f32(
device const float * src0,
device float * dst,
uint tpig[[thread_position_in_grid]]) {
device const float & x = src0[tpig];
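// numerically stable softplus: for large x, log(1 + exp(x)) ~= x and exp(x) would
// overflow, so values above the threshold are passed through unchanged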
dst[tpig] = select(log(1.0f + exp(x)), x, x > 20.0f);
}
kernel void kernel_softplus_f32_4(
device const float4 * src0,
device float4 * dst,
uint tpig[[thread_position_in_grid]]) {
device const float4 & x = src0[tpig];
dst[tpig] = select(log(1.0f + exp(x)), x, x > 20.0f);
}
kernel void kernel_expm1_f32(
device const float * src0,
device float * dst,
uint tpig[[thread_position_in_grid]]) {
dst[tpig] = exp(src0[tpig]) - 1.0f;
}
kernel void kernel_expm1_f32_4(
device const float4 * src0,
device float4 * dst,
uint tpig[[thread_position_in_grid]]) {
dst[tpig] = exp(src0[tpig]) - 1.0f;
}
kernel void kernel_reglu_f32(
constant ggml_metal_kargs_glu & args,
device const char * src0,
@@ -1943,6 +1989,75 @@ typedef decltype(kernel_cumsum_add<float>) kernel_cumsum_add_t;
template [[host_name("kernel_cumsum_add_f32")]] kernel kernel_cumsum_add_t kernel_cumsum_add<float>;
template<uint32_t ttype>
bool _ggml_vec_tri_cmp(const int i, const int r);
template<>
bool _ggml_vec_tri_cmp</* GGML_TRI_TYPE_LOWER */ 3>(const int i, const int r) {
return i < r;
}
template<>
bool _ggml_vec_tri_cmp</* GGML_TRI_TYPE_LOWER_DIAG */ 2>(const int i, const int r) {
return i <= r;
}
template<>
bool _ggml_vec_tri_cmp</* GGML_TRI_TYPE_UPPER */ 1>(const int i, const int r) {
return i > r;
}
template<>
bool _ggml_vec_tri_cmp</* GGML_TRI_TYPE_UPPER_DIAG */ 0>(const int i, const int r) {
return i >= r;
}
template<typename T, int ttype>
kernel void kernel_tri(
constant ggml_metal_kargs_tri & args,
device const char * src0,
device const char * dst,
uint3 tgpig[[threadgroup_position_in_grid]],
ushort3 tpitg[[thread_position_in_threadgroup]],
ushort3 ntg[[threads_per_threadgroup]]) {
const int i3 = tgpig.z;
const int i2 = tgpig.y;
const int i1 = tgpig.x;
if (i3 >= args.ne03 || i2 >= args.ne02 || i1 >= args.ne01) {
return;
}
device const T * src_row = (device const T *) ((device const char *) src0 + i1*args.nb01 + i2*args.nb02 + i3*args.nb03);
device T * dst_row = (device T *) ((device char *) dst + i1*args.nb1 + i2*args.nb2 + i3*args.nb3);
// Each thread handles one element of the row when ne00 fits in the threadgroup;
// otherwise the loop strides by ntg.x so that every thread visits each index it
// is responsible for
for (int64_t i0 = tpitg.x; i0 < args.ne00; i0 += ntg.x) {
// Use the comparison result as a 0/1 multiplier so the write stays branchless
dst_row[i0] = static_cast<T>(_ggml_vec_tri_cmp<ttype>(i0, i1)) * src_row[i0];
}
}
typedef decltype(kernel_tri<float, 0>) kernel_tri_t;
template [[host_name("kernel_tri_f32_0")]] kernel kernel_tri_t kernel_tri<float, 0>;
template [[host_name("kernel_tri_f32_1")]] kernel kernel_tri_t kernel_tri<float, 1>;
template [[host_name("kernel_tri_f32_2")]] kernel kernel_tri_t kernel_tri<float, 2>;
template [[host_name("kernel_tri_f32_3")]] kernel kernel_tri_t kernel_tri<float, 3>;
template [[host_name("kernel_tri_f16_0")]] kernel kernel_tri_t kernel_tri<half, 0>;
template [[host_name("kernel_tri_f16_1")]] kernel kernel_tri_t kernel_tri<half, 1>;
template [[host_name("kernel_tri_f16_2")]] kernel kernel_tri_t kernel_tri<half, 2>;
template [[host_name("kernel_tri_f16_3")]] kernel kernel_tri_t kernel_tri<half, 3>;
#if defined(GGML_METAL_HAS_BF16)
template [[host_name("kernel_tri_bf16_0")]] kernel kernel_tri_t kernel_tri<bfloat, 0>;
template [[host_name("kernel_tri_bf16_1")]] kernel kernel_tri_t kernel_tri<bfloat, 1>;
template [[host_name("kernel_tri_bf16_2")]] kernel kernel_tri_t kernel_tri<bfloat, 2>;
template [[host_name("kernel_tri_bf16_3")]] kernel kernel_tri_t kernel_tri<bfloat, 3>;
#endif
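
For reference, a minimal CPU-side sketch of the same triangular masking, assuming the ttype values noted in the comments above (an illustration, not ggml's CPU implementation):

```cpp
// Zero out everything outside the selected triangle by multiplying with the
// 0/1 result of the comparison, exactly like the branchless form in the kernel.
enum tri_type { UPPER_DIAG = 0, UPPER = 1, LOWER_DIAG = 2, LOWER = 3 }; // values mirror the comments above

static bool tri_keep(tri_type t, int i, int r) {
    switch (t) {
        case LOWER:      return i <  r;
        case LOWER_DIAG: return i <= r;
        case UPPER:      return i >  r;
        default:         return i >= r; // UPPER_DIAG
    }
}

void tri_ref(const float * src, float * dst, int ne00, int ne01, tri_type t) {
    for (int r = 0; r < ne01; ++r) {
        for (int i = 0; i < ne00; ++i) {
            dst[r * ne00 + i] = float(tri_keep(t, i, r)) * src[r * ne00 + i];
        }
    }
}
```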
template<typename T>
kernel void kernel_soft_max(
constant ggml_metal_kargs_soft_max & args,

View File

@@ -128,6 +128,7 @@ struct rpc_msg_device_count_rsp {
struct rpc_msg_get_alloc_size_req {
uint32_t device;
rpc_tensor tensor;
rpc_tensor srcs[GGML_MAX_SRC];
};
struct rpc_msg_get_alloc_size_rsp {
@@ -572,6 +573,11 @@ static void * ggml_backend_rpc_buffer_get_base(ggml_backend_buffer_t buffer) {
static rpc_tensor serialize_tensor(const ggml_tensor * tensor) {
rpc_tensor result;
if (!tensor) {
memset(&result, 0, sizeof(result));
return result;
}
result.id = reinterpret_cast<uint64_t>(tensor);
result.type = tensor->type;
if (tensor->buffer) {
@@ -753,23 +759,41 @@ static size_t ggml_backend_rpc_get_max_size(ggml_backend_buffer_type_t buft) {
}
static size_t ggml_backend_rpc_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
// should we query the remote server for the actual size
bool rpc_get = false;
// See comments in init_tensor.
if (ggml_is_quantized(tensor->type) && (tensor->ne[0] % 512 != 0) && (tensor->view_src == nullptr)) {
rpc_get |= ggml_is_quantized(tensor->type) && (tensor->ne[0] % 512 != 0) && (tensor->view_src == nullptr);
// ops that require additional memory for fleeting data on certain backends
// ref: https://github.com/ggml-org/llama.cpp/pull/15966
rpc_get |= tensor->op == GGML_OP_FLASH_ATTN_EXT;
rpc_get |= tensor->op == GGML_OP_MUL_MAT_ID;
if (rpc_get) {
ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context;
auto sock = get_socket(buft_ctx->endpoint);
rpc_msg_get_alloc_size_req request;
request.device = buft_ctx->device;
request.tensor = serialize_tensor(tensor);
rpc_msg_get_alloc_size_req request = {
/*.device =*/ buft_ctx->device,
/*.tensor =*/ serialize_tensor(tensor),
/*.srcs =*/ {},
};
// .get_alloc_size could be a function of the tensor's srcs, so we must serialize them as well
for (int i = 0; i < GGML_MAX_SRC; i++) {
request.srcs[i] = serialize_tensor(tensor->src[i]);
}
// TODO: cache the alloc responses to avoid extra RPC calls?
rpc_msg_get_alloc_size_rsp response;
bool status = send_rpc_cmd(sock, RPC_CMD_GET_ALLOC_SIZE, &request, sizeof(request), &response, sizeof(response));
RPC_STATUS_ASSERT(status);
return response.alloc_size;
} else {
return ggml_nbytes(tensor);
}
return ggml_nbytes(tensor);
}
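
The client only falls back to a remote query when the local size estimate may be wrong. A minimal host-side sketch of that predicate, mirroring the conditions in the diff above (the helper name is illustrative and not part of the RPC backend):

```cpp
#include "ggml.h"

static bool rpc_needs_remote_alloc_size(const ggml_tensor * tensor) {
    bool rpc_get = false;
    // quantized rows that are not a multiple of 512 may be padded by the backend
    rpc_get |= ggml_is_quantized(tensor->type) && (tensor->ne[0] % 512 != 0) && (tensor->view_src == nullptr);
    // ops that need extra scratch memory on some backends
    rpc_get |= tensor->op == GGML_OP_FLASH_ATTN_EXT;
    rpc_get |= tensor->op == GGML_OP_MUL_MAT_ID;
    return rpc_get;
}
```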
static ggml_backend_buffer_type_i ggml_backend_rpc_buffer_type_interface = {
@@ -1017,7 +1041,7 @@ bool rpc_server::get_alloc_size(const rpc_msg_get_alloc_size_req & request, rpc_
}
ggml_backend_buffer_type_t buft;
struct ggml_init_params params {
/*.mem_size =*/ ggml_tensor_overhead(),
/*.mem_size =*/ ggml_tensor_overhead()*(1 + GGML_MAX_SRC),
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true,
};
@@ -1025,12 +1049,18 @@ bool rpc_server::get_alloc_size(const rpc_msg_get_alloc_size_req & request, rpc_
ggml_context_ptr ctx_ptr { ggml_init(params) };
GGML_ASSERT(ctx_ptr != nullptr);
ggml_context * ctx = ctx_ptr.get();
ggml_tensor * tensor = deserialize_tensor(ctx, &request.tensor);
if (tensor == nullptr) {
GGML_LOG_ERROR("Null tensor pointer passed to server get_alloc_size function.\n");
return false;
}
for (int i = 0; i < GGML_MAX_SRC; i++) {
if (request.srcs[i].id != 0) {
tensor->src[i] = deserialize_tensor(ctx, &request.srcs[i]);
}
}
LOG_DBG("[%s] device: %d, buffer: %p, data: %p\n", __func__, dev_id, (void*)tensor->buffer, tensor->data);
if (tensor->buffer == nullptr) {
// No buffer allocated.

View File

@@ -353,10 +353,17 @@ enum vk_conv_shapes {
CONV_SHAPE_COUNT,
};
uint32_t conv_shapes_wg_denoms[][3] = {
{ 128, 128, 1 },
{ 64, 32, 1 },
{ 32, 256, 1 },
struct vk_conv_block_size {
uint32_t K;
uint32_t NPQ;
uint32_t CRS;
};
vk_conv_block_size vk_conv_block_sizes[CONV_SHAPE_COUNT] = {
// K NPQ CRS
{ 128, 128, 16 }, // CONV_SHAPE_128x128
{ 64, 32, 32 }, // CONV_SHAPE_64x32
{ 32, 256, 16 }, // CONV_SHAPE_32x256
};
enum dmmv_wg_sizes {
@@ -519,6 +526,7 @@ struct vk_device_struct {
bool fp16;
bool bf16;
bool pipeline_robustness;
bool memory_priority;
vk::Device device;
uint32_t vendor_id;
vk::DriverId driver_id;
@@ -1343,20 +1351,11 @@ struct vk_op_conv2d_push_constants {
uint32_t Cin;
uint32_t N;
uint32_t KW;
uint32_t KH;
uint32_t W;
uint32_t H;
uint32_t OW;
uint32_t OH;
uint32_t s0;
uint32_t s1;
uint32_t p0;
uint32_t p1;
uint32_t d0;
uint32_t d1;
uint32_t nb01;
uint32_t nb02;
uint32_t nb03;
@@ -1380,48 +1379,6 @@ template <> void init_pushconst_fastdiv(vk_op_conv2d_push_constants &p) {
init_fastdiv_values(p.OW*p.OH, p.OWOHmp, p.OWOHL);
}
struct vk_op_conv_transpose_2d_push_constants {
uint32_t Cout;
uint32_t Cin;
uint32_t N;
uint32_t KW;
uint32_t KH;
uint32_t W;
uint32_t H;
uint32_t OW;
uint32_t OH;
uint32_t s0;
uint32_t s1;
uint32_t p0;
uint32_t p1;
uint32_t d0;
uint32_t d1;
uint32_t nb01;
uint32_t nb02;
uint32_t nb03;
uint32_t nb11;
uint32_t nb12;
uint32_t nb13;
uint32_t nb1;
uint32_t nb2;
uint32_t nb3;
// init_fastdiv_values constants for dividing by OW, OW*OH
uint32_t OWmp; uint32_t OWL;
uint32_t OWOHmp; uint32_t OWOHL;
};
template <> void init_pushconst_fastdiv(vk_op_conv_transpose_2d_push_constants &p) {
// Compute magic values to divide by OW, OW*OH
init_fastdiv_values(p.OW, p.OWmp, p.OWL);
init_fastdiv_values(p.OW*p.OH, p.OWOHmp, p.OWOHL);
}
struct vk_op_conv2d_dw_push_constants {
uint32_t ne;
uint32_t batches;
@@ -2369,7 +2326,13 @@ static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, const std
vk::PhysicalDeviceMemoryProperties mem_props = device->physical_device.getMemoryProperties();
const vk::MemoryAllocateFlagsInfo mem_flags_info { mem_flags };
const vk::MemoryPriorityAllocateInfoEXT mem_priority_info { 1.0f };
vk::MemoryAllocateFlagsInfo mem_flags_info { mem_flags };
if (device->memory_priority) {
mem_flags_info.setPNext(&mem_priority_info);
}
for (auto it = req_flags_list.begin(); it != req_flags_list.end(); it++) {
const auto & req_flags = *it;
@@ -3574,7 +3537,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
SHADER_REDUCTION_MODE_SHMEM;
for (uint32_t i = 0; i < mul_mat_vec_max_cols; ++i) {
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_F32 ][i], "mul_mat_vec_f32_f32_f32", arr_dmmv_f32_f32_f32_len[reduc], arr_dmmv_f32_f32_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {wg_size_subgroup, 2, i+1}, 1, false, use_subgroups, force_subgroup_size);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_F32 ][i], "mul_mat_vec_f32_f32_f32", arr_dmmv_f32_f32_f32_len[reduc], arr_dmmv_f32_f32_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {wg_size_subgroup, 1, i+1}, 1, false, use_subgroups, force_subgroup_size);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_F16 ][i], "mul_mat_vec_f16_f32_f32", arr_dmmv_f16_f32_f32_len[reduc], arr_dmmv_f16_f32_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {wg_size_subgroup, 2, i+1}, 1, false, use_subgroups, force_subgroup_size);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_BF16][i], "mul_mat_vec_bf16_f32_f32", arr_dmmv_bf16_f32_f32_len[reduc], arr_dmmv_bf16_f32_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {wg_size_subgroup, 2, i+1}, 1, false, use_subgroups, force_subgroup_size);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_Q4_0][i], "mul_mat_vec_q4_0_f32_f32", arr_dmmv_q4_0_f32_f32_len[reduc], arr_dmmv_q4_0_f32_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size);
@@ -3598,7 +3561,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_IQ4_NL][i], "mul_mat_vec_iq4_nl_f32_f32", arr_dmmv_iq4_nl_f32_f32_len[reduc16], arr_dmmv_iq4_nl_f32_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[w][GGML_TYPE_MXFP4][i], "mul_mat_vec_mxfp4_f32_f32", arr_dmmv_mxfp4_f32_f32_len[reduc16], arr_dmmv_mxfp4_f32_f32_data[reduc16], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {rm_iq, 1, 1}, {wg_size_subgroup16, rm_iq, i+1}, 1, true, use_subgroups16, force_subgroup_size16);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_F32 ][i], "mul_mat_vec_f32_f16_f32", arr_dmmv_f32_f16_f32_len[reduc], arr_dmmv_f32_f16_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {wg_size_subgroup, 2, i+1}, 1, false, use_subgroups, force_subgroup_size);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_F32 ][i], "mul_mat_vec_f32_f16_f32", arr_dmmv_f32_f16_f32_len[reduc], arr_dmmv_f32_f16_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {wg_size_subgroup, 1, i+1}, 1, false, use_subgroups, force_subgroup_size);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_F16 ][i], "mul_mat_vec_f16_f16_f32", arr_dmmv_f16_f16_f32_len[reduc], arr_dmmv_f16_f16_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {wg_size_subgroup, 2, i+1}, 1, false, use_subgroups, force_subgroup_size);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_BF16][i], "mul_mat_vec_bf16_f16_f32", arr_dmmv_bf16_f16_f32_len[reduc], arr_dmmv_bf16_f16_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {wg_size_subgroup, 2, i+1}, 1, false, use_subgroups, force_subgroup_size);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[w][GGML_TYPE_Q4_0][i], "mul_mat_vec_q4_0_f16_f32", arr_dmmv_q4_0_f16_f32_len[reduc], arr_dmmv_q4_0_f16_f32_data[reduc], "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq, i+1}, 1, true, use_subgroups, force_subgroup_size);
@@ -3644,7 +3607,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
#endif // GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT
}
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_F32 ], "mul_mat_vec_id_f32_f32", arr_dmmv_id_f32_f32_f32_len[reduc], arr_dmmv_id_f32_f32_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {wg_size_subgroup, 2}, 1, false, use_subgroups, force_subgroup_size);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_F32 ], "mul_mat_vec_id_f32_f32", arr_dmmv_id_f32_f32_f32_len[reduc], arr_dmmv_id_f32_f32_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {wg_size_subgroup, 1}, 1, false, use_subgroups, force_subgroup_size);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_F16 ], "mul_mat_vec_id_f16_f32", arr_dmmv_id_f16_f32_f32_len[reduc], arr_dmmv_id_f16_f32_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {wg_size_subgroup, 2}, 1, false, use_subgroups, force_subgroup_size);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_BF16], "mul_mat_vec_id_bf16_f32", arr_dmmv_id_bf16_f32_f32_len[reduc], arr_dmmv_id_bf16_f32_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {wg_size_subgroup, 2}, 1, false, use_subgroups, force_subgroup_size);
ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[w][GGML_TYPE_Q4_0], "mul_mat_vec_id_q4_0_f32", arr_dmmv_id_q4_0_f32_f32_len[reduc], arr_dmmv_id_q4_0_f32_f32_data[reduc], "main", mul_mat_vec_id_num_bindings, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {wg_size_subgroup, 2*rm_stdq}, 1, true, use_subgroups, force_subgroup_size);
@@ -4050,7 +4013,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
uint32_t nary_shmem = 2 * sizeof(int) * BLOCK_SIZE +
sizeof(int) * device->subgroup_size +
2 * sizeof(int) +
(BLOCK_SIZE / device->subgroup_size) * sizeof(int);
2 * (BLOCK_SIZE / device->subgroup_size) * sizeof(int);
if (device->subgroup_arithmetic && device->subgroup_require_full_support && device->subgroup_shuffle && device->subgroup_ballot &&
nary_shmem <= device->properties.limits.maxComputeSharedMemorySize) {
ggml_vk_create_pipeline2(device, device->pipeline_topk_f32[i], "topk_f32_"+std::to_string(i), topk_nary_search_f32_len, topk_nary_search_f32_data, "main", 2, sizeof(vk_op_topk_push_constants), {BLOCK_SIZE, 1, 1}, {BLOCK_SIZE, device->subgroup_size, device->subgroup_size_log2}, 1, true, true, device->subgroup_size);
@@ -4070,10 +4033,16 @@ static void ggml_vk_load_shaders(vk_device& device) {
for (auto &s : device->pipeline_solve_tri_f32) {
const vk_solve_tri_pipeline_state &state = s.first;
// Max number of rows to load at a time, limited by shared memory
const uint32_t batch_N = device->properties.limits.maxComputeSharedMemorySize / ((state.N + state.K) * sizeof(float));
// Need at least K invocations, and prefer a minimum of 128 to spread out loading shared memory
const uint32_t block_size = std::max(128u, 1u << (uint32_t)ceilf(log2f(float(state.K))));
ggml_vk_create_pipeline(
device, s.second, "solve_tri_f32",
solve_tri_f32_len, solve_tri_f32_data, "main", 3,
sizeof(vk_op_binary_push_constants), {1, 1, 1}, { 0, state.N, state.K }, 1, true);
sizeof(vk_op_binary_push_constants), {1, 1, 1}, { 0, state.N, state.K, batch_N, block_size }, 1, true);
}
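
A worked example of the sizing above, assuming 32 KiB of shared memory and N = K = 256 (illustrative values):

```cpp
// shA holds BATCH_N*N floats and shB holds BATCH_N*K floats, so
// BATCH_N = shmem / ((N + K) * sizeof(float)); the workgroup needs at least
// K invocations, with a floor of 128.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
    const uint32_t shmem = 32 * 1024, N = 256, K = 256;
    const uint32_t batch_N    = shmem / ((N + K) * sizeof(float));                                // 16 rows per batch
    const uint32_t block_size = std::max(128u, 1u << (uint32_t)std::ceil(std::log2((double)K))); // 256 invocations
    std::printf("batch_N=%u block_size=%u\n", batch_N, block_size);
}
```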
#define IM2COL(bda) \
@@ -4119,12 +4088,10 @@ static void ggml_vk_load_shaders(vk_device& device) {
// conv2d, conv_transpose_2d
for (uint32_t s = 0; s < CONV_SHAPE_COUNT; ++s) {
uint32_t conv2d_WG_SIZE = 256;
uint32_t conv2d_BS_K = 128;
uint32_t conv2d_BS_CRS = 16;
uint32_t use_collectives = 0; // Enables subgroup ops for preventing the re-calculation of indices.
uint32_t conv2d_BS_NPQ = 128;
uint32_t conv2d_TS_K = 8;
uint32_t conv2d_TS_K = (s == CONV_SHAPE_64x32) ? 4 : 8;
uint32_t conv2d_SHMEM_PAD = 4;
vk_conv_block_size conv2d_BS = vk_conv_block_sizes[s];
bool conv2d_UNROLL = true;
#if defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
@@ -4138,29 +4105,9 @@ static void ggml_vk_load_shaders(vk_device& device) {
conv2d_UNROLL = false;
} else if (device->vendor_id == VK_VENDOR_ID_AMD) {
conv2d_SHMEM_PAD = device->architecture == vk_device_architecture::AMD_GCN ? 1 : 4;
}
switch (s) {
default:
case CONV_SHAPE_128x128:
conv2d_BS_K = conv_shapes_wg_denoms[CONV_SHAPE_128x128][0];
conv2d_BS_NPQ = conv_shapes_wg_denoms[CONV_SHAPE_128x128][1];
conv2d_BS_CRS = 16;
if (device->vendor_id == VK_VENDOR_ID_AMD && device->architecture != vk_device_architecture::AMD_GCN) {
if (s == CONV_SHAPE_128x128 && device->architecture != vk_device_architecture::AMD_GCN) {
conv2d_UNROLL = false;
}
break;
case CONV_SHAPE_64x32:
conv2d_BS_K = conv_shapes_wg_denoms[CONV_SHAPE_64x32][0];
conv2d_BS_NPQ = conv_shapes_wg_denoms[CONV_SHAPE_64x32][1];
conv2d_BS_CRS = 32;
conv2d_TS_K = 4;
break;
case CONV_SHAPE_32x256:
conv2d_BS_K = conv_shapes_wg_denoms[CONV_SHAPE_32x256][0];
conv2d_BS_NPQ = conv_shapes_wg_denoms[CONV_SHAPE_32x256][1];
conv2d_BS_CRS = 16;
break;
}
// Use collectives on pre-Turing NVIDIA GPUs and GCN AMD cards, which had slower integer math.
@@ -4174,22 +4121,22 @@ static void ggml_vk_load_shaders(vk_device& device) {
allow_collectives_nv &&
allow_collectives_amd) {
use_collectives = 1;
conv2d_BS_CRS = std::min(
conv2d_BS.CRS = std::min(
device->subgroup_size,
conv2d_BS_CRS); // CRS block size should be capped at subgroup size for correctness when shuffle is used.
conv2d_BS.CRS); // CRS block size should be capped at subgroup size for correctness when shuffle is used.
}
uint32_t conv2d_shmem_req =
(conv2d_BS_K * (conv2d_BS_CRS + conv2d_SHMEM_PAD) + conv2d_BS_CRS * (conv2d_BS_NPQ + conv2d_SHMEM_PAD)) * sizeof(float);
(conv2d_BS.K * (conv2d_BS.CRS + conv2d_SHMEM_PAD) + conv2d_BS.CRS * (conv2d_BS.NPQ + conv2d_SHMEM_PAD)) * sizeof(float);
if (device->properties.limits.maxComputeSharedMemorySize < conv2d_shmem_req) {
conv2d_BS_CRS = 8;
conv2d_BS.CRS = 8;
if (use_collectives) {
conv2d_BS_CRS = std::min(device->subgroup_size, conv2d_BS_CRS);
conv2d_BS.CRS = std::min(device->subgroup_size, conv2d_BS.CRS);
}
}
std::array<uint32_t, 3> wg_denoms = { conv2d_BS_K, conv2d_BS_NPQ, 1 };
std::vector<uint32_t> spec_constants = { conv2d_WG_SIZE, conv2d_BS_K, conv2d_BS_CRS, conv2d_BS_NPQ, conv2d_TS_K, use_collectives, conv2d_SHMEM_PAD };
std::array<uint32_t, 3> wg_denoms = { conv2d_BS.K, 1, 1 };
std::vector<uint32_t> spec_constants = { conv2d_WG_SIZE, conv2d_BS.K, conv2d_BS.CRS, conv2d_BS.NPQ, conv2d_TS_K, use_collectives, conv2d_SHMEM_PAD };
#define CREATE_CONV(name, type_suffix, spv_suffix) \
for (auto &c : device->pipeline_##name##type_suffix[s]) { \
@@ -4206,15 +4153,13 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline( \
device, c.second, #name #type_suffix, \
name##type_suffix##spv_suffix##_len, name##type_suffix##spv_suffix##_data, "main", 3, \
sizeof(vk_op_##name##_push_constants), wg_denoms, spec_constants_cpy, 1, true, use_collectives); \
sizeof(vk_op_conv2d_push_constants), wg_denoms, spec_constants_cpy, 1, true, use_collectives); \
}
#define CREATE_CONVS(spv_suffix) \
CREATE_CONV(conv2d, _f32, spv_suffix) \
CREATE_CONV(conv2d, _f16_f32, spv_suffix) \
if (device->properties.limits.maxPushConstantsSize >= sizeof(vk_op_conv_transpose_2d_push_constants)) { \
CREATE_CONV(conv_transpose_2d, _f32, spv_suffix) \
CREATE_CONV(conv_transpose_2d, _f16_f32, spv_suffix) \
}
CREATE_CONV(conv_transpose_2d, _f32, spv_suffix) \
CREATE_CONV(conv_transpose_2d, _f16_f32, spv_suffix)
#if defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
if (device->coopmat2) {
CREATE_CONVS(_cm2)
@@ -4235,9 +4180,9 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_cwhn_f16_f32, "conv2d_dw_cwhn_f16_f32", conv2d_dw_cwhn_f16_f32_len, conv2d_dw_cwhn_f16_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1);
for (uint32_t i = 0; i < num_topk_moe_pipelines; ++i) {
ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_EARLY_SOFTMAX], "topk_moe_f32_early_softmax_"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 0, 0}, 1, true, true);
ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_EARLY_SOFTMAX_NORM], "topk_moe_f32_early_softmax_norm"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 1, 0}, 1, true, true);
ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_LATE_SOFTMAX], "topk_moe_f32_late_softmax"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 0, 1}, 1, true, true);
ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_EARLY_SOFTMAX], "topk_moe_f32_early_softmax_"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 0, 0}, 1, true, true, device->subgroup_size);
ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_EARLY_SOFTMAX_NORM], "topk_moe_f32_early_softmax_norm"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 1, 0}, 1, true, true, device->subgroup_size);
ggml_vk_create_pipeline2(device, device->pipeline_topk_moe[i][TOPK_MOE_LATE_SOFTMAX], "topk_moe_f32_late_softmax"+std::to_string(i), topk_moe_f32_len, topk_moe_f32_data, "main", 3, sizeof(vk_op_topk_moe_push_constants), {1, 1, 1}, {device->subgroup_size, 1u<<i, 0, 1}, 1, true, true, device->subgroup_size);
}
for (auto &c : compiles) {
@@ -4340,6 +4285,9 @@ static vk_device ggml_vk_get_device(size_t idx) {
#endif
} else if (strcmp("VK_KHR_pipeline_executable_properties", properties.extensionName) == 0) {
pipeline_executable_properties_support = true;
} else if (strcmp("VK_EXT_memory_priority", properties.extensionName) == 0 &&
getenv("GGML_VK_ENABLE_MEMORY_PRIORITY")) {
device->memory_priority = true;
}
}
@@ -4531,6 +4479,16 @@ static vk_device ggml_vk_get_device(size_t idx) {
device_extensions.push_back("VK_EXT_pipeline_robustness");
}
VkPhysicalDeviceMemoryPriorityFeaturesEXT memory_priority_features;
memory_priority_features.pNext = nullptr;
memory_priority_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MEMORY_PRIORITY_FEATURES_EXT;
memory_priority_features.memoryPriority = VK_FALSE;
if (device->memory_priority) {
last_struct->pNext = (VkBaseOutStructure *)&memory_priority_features;
last_struct = (VkBaseOutStructure *)&memory_priority_features;
device_extensions.push_back("VK_EXT_memory_priority");
}
VkPhysicalDeviceSubgroupSizeControlFeaturesEXT subgroup_size_control_features;
subgroup_size_control_features.pNext = nullptr;
subgroup_size_control_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_SIZE_CONTROL_FEATURES_EXT;
@@ -5110,7 +5068,7 @@ static void ggml_vk_print_gpu_info(size_t idx) {
}
}
static bool ggml_vk_instance_validation_ext_available();
static bool ggml_vk_instance_layer_settings_available();
static bool ggml_vk_instance_portability_enumeration_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions);
static bool ggml_vk_instance_debug_utils_ext_available(const std::vector<vk::ExtensionProperties> & instance_extensions);
static bool ggml_vk_device_is_supported(const vk::PhysicalDevice & vkdev);
@@ -5139,19 +5097,19 @@ static void ggml_vk_instance_init() {
vk::ApplicationInfo app_info{ "ggml-vulkan", 1, nullptr, 0, api_version };
const std::vector<vk::ExtensionProperties> instance_extensions = vk::enumerateInstanceExtensionProperties();
const bool validation_ext = ggml_vk_instance_validation_ext_available();
const bool layer_settings = ggml_vk_instance_layer_settings_available();
#ifdef __APPLE__
const bool portability_enumeration_ext = ggml_vk_instance_portability_enumeration_ext_available(instance_extensions);
#endif
const bool debug_utils_ext = ggml_vk_instance_debug_utils_ext_available(instance_extensions) && getenv("GGML_VK_DEBUG_MARKERS") != nullptr;
std::vector<const char*> layers;
if (validation_ext) {
if (layer_settings) {
layers.push_back("VK_LAYER_KHRONOS_validation");
}
std::vector<const char*> extensions;
if (validation_ext) {
extensions.push_back("VK_EXT_validation_features");
if (layer_settings) {
extensions.push_back("VK_EXT_layer_settings");
}
#ifdef __APPLE__
if (portability_enumeration_ext) {
@@ -5161,26 +5119,24 @@ static void ggml_vk_instance_init() {
if (debug_utils_ext) {
extensions.push_back("VK_EXT_debug_utils");
}
vk::InstanceCreateInfo instance_create_info(vk::InstanceCreateFlags{}, &app_info, layers, extensions);
VkBool32 enable_best_practice = layer_settings;
std::vector<vk::LayerSettingEXT> settings = {
{
"VK_LAYER_KHRONOS_validation",
"validate_best_practices",
vk::LayerSettingTypeEXT::eBool32,
1,
&enable_best_practice
},
};
vk::LayerSettingsCreateInfoEXT layer_setting_info(settings);
vk::InstanceCreateInfo instance_create_info(vk::InstanceCreateFlags{}, &app_info, layers, extensions, &layer_setting_info);
#ifdef __APPLE__
if (portability_enumeration_ext) {
instance_create_info.flags |= vk::InstanceCreateFlagBits::eEnumeratePortabilityKHR;
}
#endif
std::vector<vk::ValidationFeatureEnableEXT> features_enable;
vk::ValidationFeaturesEXT validation_features;
if (validation_ext) {
features_enable = { vk::ValidationFeatureEnableEXT::eBestPractices };
validation_features = {
features_enable,
{},
};
validation_features.setPNext(nullptr);
instance_create_info.setPNext(&validation_features);
GGML_LOG_DEBUG("ggml_vulkan: Validation layers enabled\n");
}
vk_instance.instance = vk::createInstance(instance_create_info);
vk_instance_initialized = true;
@@ -6928,6 +6884,10 @@ static bool ggml_vk_should_use_mmvq(const vk_device& device, uint32_t m, uint32_
// Quantization overhead is not worth it for small k
switch (device->vendor_id) {
case VK_VENDOR_ID_NVIDIA:
if (src0_type == GGML_TYPE_Q2_K) {
return true;
}
if (k <= 4096) {
return false;
}
@@ -8260,59 +8220,23 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
}
}
static std::array<uint32_t, 3> ggml_vk_get_conv_elements(const ggml_tensor *dst) {
const ggml_tensor *src0 = dst->src[0];
const ggml_tensor *src1 = dst->src[1];
// src0 - kernel: [KW, KH, Cin, Cout]
// src1 - input: [W, H, Cin, N]
// dst - result: [OW, OH, Cout, N]
// Copied from ggml.c: int64_t ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p, int d)
auto calc_conv_output_size = [](int64_t ins, int64_t ks, int s, int p, int d) -> int64_t {
return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
static vk_conv_shapes ggml_vk_conv_select_shape(ggml_backend_vk_context * ctx, uint32_t K, uint32_t NPQ) {
auto n_tiles = [&](vk_conv_shapes s) {
return CEIL_DIV(K, vk_conv_block_sizes[s].K)
* CEIL_DIV(NPQ, vk_conv_block_sizes[s].NPQ);
};
// parallelize in {OW/BS_K, OH/BS_NPQ, 1}
int64_t W = src1->ne[0];
int64_t H = src1->ne[1];
int64_t KW = src0->ne[0];
int64_t KH = src0->ne[1];
int64_t Cout = src0->ne[3];
int64_t N = src1->ne[3];
int64_t OH = calc_conv_output_size(H, KH, dst->op_params[1], dst->op_params[3], dst->op_params[5]);
int64_t OW = calc_conv_output_size(W, KW, dst->op_params[0], dst->op_params[2], dst->op_params[4]);
int64_t NPQ = N * OW * OH;
// Tile output matrix to (K/NB_K, NPQ/NB_NPQ, 1) workgroups
std::array<uint32_t, 3> elements = { static_cast<uint32_t>(Cout), static_cast<uint32_t>(NPQ), 1 };
return elements;
}
// We can't query the number of shader cores on Intel; use 32 as a placeholder
// so that small convolutions still choose a smaller tile.
const uint32_t shader_core_count = ctx->device->shader_core_count > 0 ? ctx->device->shader_core_count : 32;
static std::array<uint32_t, 3> ggml_vk_get_conv_transpose_2d_elements(const ggml_tensor *dst) {
const ggml_tensor *src0 = dst->src[0];
const ggml_tensor *src1 = dst->src[1];
// src0 - kernel: [KW, KH, Cout, Cin]
// src1 - input: [W, H, Cin, N]
// dst - result: [OW, OH, Cout, N]
auto calc_conv_output_size = [](int64_t ins, int64_t ks, int s, int p, int d) -> int64_t {
return (ins - 1) * s - 2 * p + (ks - 1) * d + 1;
};
// parallelize in {OW/BS_K, OH/BS_NPQ, 1}
int64_t W = src1->ne[0];
int64_t H = src1->ne[1];
int64_t KW = src0->ne[0];
int64_t KH = src0->ne[1];
int64_t Cout = src0->ne[2];
int64_t N = src1->ne[3];
int64_t OH = calc_conv_output_size(H, KH, dst->op_params[0], 0, 1);
int64_t OW = calc_conv_output_size(W, KW, dst->op_params[0], 0, 1);
int64_t NPQ = N * OW * OH;
// Tile output matrix to (K/NB_K, NPQ/NB_NPQ, 1) workgroups
std::array<uint32_t, 3> elements = { static_cast<uint32_t>(Cout), static_cast<uint32_t>(NPQ), 1 };
return elements;
if (K > 64 && n_tiles(CONV_SHAPE_128x128) >= shader_core_count * 2) {
return CONV_SHAPE_128x128;
} else if (K <= 32 && n_tiles(CONV_SHAPE_32x256) >= shader_core_count * 2) {
return CONV_SHAPE_32x256;
} else {
return CONV_SHAPE_64x32;
}
}
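
A worked example of the tile heuristic, assuming 32 shader cores and an illustrative problem size:

```cpp
// The large 128x128 tile is only chosen when it still yields at least two
// workgroups per shader core; otherwise a smaller shape keeps the GPU busy.
#include <cstdio>

static unsigned ceil_div(unsigned a, unsigned b) { return (a + b - 1) / b; }

int main() {
    const unsigned cores = 32, K = 256, NPQ = 4096;
    const unsigned tiles_128x128 = ceil_div(K, 128) * ceil_div(NPQ, 128); // 2 * 32 = 64
    std::printf("128x128 tiles: %u (need >= %u) -> %s\n",
                tiles_128x128, cores * 2,
                (K > 64 && tiles_128x128 >= cores * 2) ? "CONV_SHAPE_128x128" : "smaller shape");
}
```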
static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * dst, ggml_op op) {
@@ -8775,39 +8699,20 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
return nullptr;
case GGML_OP_CONV_2D:
case GGML_OP_CONV_TRANSPOSE_2D:
if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 &&
ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && ggml_is_contiguous(dst)) {
std::array<uint32_t, 3> elements{};
if (op == GGML_OP_CONV_2D) elements = ggml_vk_get_conv_elements(dst);
else if (op == GGML_OP_CONV_TRANSPOSE_2D) elements = ggml_vk_get_conv_transpose_2d_elements(dst);
vk_conv_shapes shape;
uint32_t tiles[CONV_SHAPE_COUNT];
for (uint32_t i = 0; i < CONV_SHAPE_COUNT; ++i) {
tiles[i] = CEIL_DIV(elements[0], conv_shapes_wg_denoms[i][0]) * CEIL_DIV(elements[1], conv_shapes_wg_denoms[i][1]);
}
// We can't query number of shader cores on Intel, use 32 as a placeholder
// so small convolutions will still choose a smaller tile.
const uint32_t shader_core_count = ctx->device->shader_core_count > 0 ? ctx->device->shader_core_count : 32;
if (elements[0] > 64 && tiles[CONV_SHAPE_128x128] >= shader_core_count * 2) {
shape = CONV_SHAPE_128x128;
} else if (elements[0] <= 32 && tiles[CONV_SHAPE_32x256] >= shader_core_count * 2) {
shape = CONV_SHAPE_32x256;
} else {
shape = CONV_SHAPE_64x32;
}
uint32_t KW = static_cast<uint32_t>(src0->ne[0]);
uint32_t KH = static_cast<uint32_t>(src0->ne[1]);
uint32_t s0 = static_cast<uint32_t>(dst->op_params[0]);
uint32_t s1 = op == GGML_OP_CONV_2D ? static_cast<uint32_t>(dst->op_params[1]) : static_cast<uint32_t>(dst->op_params[0]);
uint32_t p0 = op == GGML_OP_CONV_2D ? static_cast<uint32_t>(dst->op_params[2]) : 0;
uint32_t p1 = op == GGML_OP_CONV_2D ? static_cast<uint32_t>(dst->op_params[3]) : 0;
uint32_t d0 = op == GGML_OP_CONV_2D ? static_cast<uint32_t>(dst->op_params[4]) : 1;
uint32_t d1 = op == GGML_OP_CONV_2D ? static_cast<uint32_t>(dst->op_params[5]) : 1;
if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
uint32_t K = dst->ne[2]; // Cout
uint32_t NPQ = dst->ne[3] * dst->ne[1] * dst->ne[0]; // N * OH * OW
vk_conv_shapes shape = ggml_vk_conv_select_shape(ctx, K, NPQ);
bool transpose = dst->op == GGML_OP_CONV_TRANSPOSE_2D;
uint32_t KW = (uint32_t)src0->ne[0];
uint32_t KH = (uint32_t)src0->ne[1];
uint32_t s0 = (uint32_t)(ggml_get_op_params_i32(dst, 0));
uint32_t s1 = !transpose ? (uint32_t)ggml_get_op_params_i32(dst, 1) : s0;
uint32_t p0 = !transpose ? (uint32_t)ggml_get_op_params_i32(dst, 2) : 0;
uint32_t p1 = !transpose ? (uint32_t)ggml_get_op_params_i32(dst, 3) : 0;
uint32_t d0 = !transpose ? (uint32_t)ggml_get_op_params_i32(dst, 4) : 1;
uint32_t d1 = !transpose ? (uint32_t)ggml_get_op_params_i32(dst, 5) : 1;
vk_conv2d_pipeline_state conv2d_pipeline_state(s0, s1, p0, p1, d0, d1, KW, KH);
std::map<vk_conv2d_pipeline_state, vk_pipeline> *pipelines = nullptr;
@@ -9126,13 +9031,21 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
elements = { N * OC * OH * OW, 1, 1};
} break;
case GGML_OP_CONV_2D:
{
elements = ggml_vk_get_conv_elements(dst);
} break;
case GGML_OP_CONV_TRANSPOSE_2D:
{
elements = ggml_vk_get_conv_transpose_2d_elements(dst);
} break;
if constexpr (std::is_same_v<PC, vk_op_conv2d_push_constants>) {
const uint32_t NPQ = pc.N * pc.OH * pc.OW;
const vk_conv_shapes shape = ggml_vk_conv_select_shape(ctx, pc.Cout, NPQ);
const uint32_t NPQ_blocks = CEIL_DIV(NPQ, vk_conv_block_sizes[shape].NPQ);
elements = { pc.Cout, NPQ_blocks, 1 };
if (elements[1] > 512) {
elements[2] = CEIL_DIV(elements[1], 512);
elements[1] = 512;
}
} else {
GGML_ABORT("invalid push constant type for CONV_2D");
}
break;
case GGML_OP_ADD:
case GGML_OP_SUB:
case GGML_OP_DIV:
@@ -10683,30 +10596,24 @@ static void ggml_vk_conv_2d(ggml_backend_vk_context * ctx, vk_context & subctx,
GGML_ASSERT(dst->type == GGML_TYPE_F32);
GGML_TENSOR_BINARY_OP_LOCALS
GGML_ASSERT(nb00 == sizeof(float) || nb00 == sizeof(ggml_fp16_t));
GGML_ASSERT(nb10 == sizeof(float));
GGML_ASSERT(nb0 == sizeof(float));
bool transpose = dst->op == GGML_OP_CONV_TRANSPOSE_2D;
vk_op_conv2d_push_constants p{};
p.Cout = static_cast<uint32_t>(ne03);
p.Cin = static_cast<uint32_t>(ne02);
p.Cout = static_cast<uint32_t>(!transpose ? ne03 : ne02);
p.Cin = static_cast<uint32_t>(!transpose ? ne02 : ne03);
p.N = static_cast<uint32_t>(ne13);
GGML_ASSERT(p.Cout == ne2);
GGML_ASSERT(p.Cin == ne12);
p.KW = static_cast<uint32_t>(ne00);
p.KH = static_cast<uint32_t>(ne01);
p.W = static_cast<uint32_t>(ne10);
p.H = static_cast<uint32_t>(ne11);
p.OW = static_cast<uint32_t>(ne0);
p.OH = static_cast<uint32_t>(ne1);
p.s0 = static_cast<uint32_t>(dst->op_params[0]);
p.s1 = static_cast<uint32_t>(dst->op_params[1]);
p.p0 = static_cast<uint32_t>(dst->op_params[2]);
p.p1 = static_cast<uint32_t>(dst->op_params[3]);
p.d0 = static_cast<uint32_t>(dst->op_params[4]);
p.d1 = static_cast<uint32_t>(dst->op_params[5]);
p.nb01 = static_cast<uint32_t>(nb01 / nb00);
p.nb02 = static_cast<uint32_t>(nb02 / nb00);
p.nb03 = static_cast<uint32_t>(nb03 / nb00);
@@ -10719,59 +10626,7 @@ static void ggml_vk_conv_2d(ggml_backend_vk_context * ctx, vk_context & subctx,
p.nb2 = static_cast<uint32_t>(nb2 / nb0);
p.nb3 = static_cast<uint32_t>(nb3 / nb0);
GGML_ASSERT(ne03 == ne2);
GGML_ASSERT(ne02 == ne12);
ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_CONV_2D, std::move(p));
}
static void ggml_vk_conv_transpose_2d(ggml_backend_vk_context * ctx, vk_context & subctx, const ggml_tensor * src0,
const ggml_tensor * src1, ggml_tensor * dst) {
GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F32);
GGML_TENSOR_BINARY_OP_LOCALS
GGML_ASSERT(nb00 == sizeof(float) || nb00 == sizeof(ggml_fp16_t));
GGML_ASSERT(nb10 == sizeof(float));
GGML_ASSERT(nb0 == sizeof(float));
vk_op_conv_transpose_2d_push_constants p{};
p.Cout = static_cast<uint32_t>(ne02);
p.Cin = static_cast<uint32_t>(ne03);
p.N = static_cast<uint32_t>(ne13);
p.KW = static_cast<uint32_t>(ne00);
p.KH = static_cast<uint32_t>(ne01);
p.W = static_cast<uint32_t>(ne10);
p.H = static_cast<uint32_t>(ne11);
p.OW = static_cast<uint32_t>(ne0);
p.OH = static_cast<uint32_t>(ne1);
p.s0 = static_cast<uint32_t>(dst->op_params[0]);
p.s1 = static_cast<uint32_t>(dst->op_params[0]);
p.p0 = 0;
p.p1 = 0;
p.d0 = 1;
p.d1 = 1;
p.nb01 = static_cast<uint32_t>(nb01 / nb00);
p.nb02 = static_cast<uint32_t>(nb02 / nb00);
p.nb03 = static_cast<uint32_t>(nb03 / nb00);
p.nb11 = static_cast<uint32_t>(nb11 / nb10);
p.nb12 = static_cast<uint32_t>(nb12 / nb10);
p.nb13 = static_cast<uint32_t>(nb13 / nb10);
p.nb1 = static_cast<uint32_t>(nb1 / nb0);
p.nb2 = static_cast<uint32_t>(nb2 / nb0);
p.nb3 = static_cast<uint32_t>(nb3 / nb0);
GGML_ASSERT(ne02 == ne2);
GGML_ASSERT(ne03 == ne12);
ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_CONV_TRANSPOSE_2D, std::move(p));
ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, dst->op, std::move(p));
}
static void ggml_vk_conv_2d_dw(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -12142,11 +11997,8 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
break;
case GGML_OP_CONV_2D:
ggml_vk_conv_2d(ctx, compute_ctx, src0, src1, node);
break;
case GGML_OP_CONV_TRANSPOSE_2D:
ggml_vk_conv_transpose_2d(ctx, compute_ctx, src0, src1, node);
ggml_vk_conv_2d(ctx, compute_ctx, src0, src1, node);
break;
case GGML_OP_CONV_2D_DW:
@@ -14179,10 +14031,12 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
const uint32_t N = op->src[0]->ne[0];
const uint32_t K = op->src[1]->ne[0];
// K dimension limited to workgroup size
if (K > 128) {
if (K > 1u << device->max_workgroup_size_log2) {
return false;
}
if (N * N * sizeof(float) + N * K * sizeof(float) > device->properties.limits.maxComputeSharedMemorySize) {
const uint32_t batch_N = device->properties.limits.maxComputeSharedMemorySize / ((N + K) * sizeof(float));
if (batch_N == 0) {
return false;
}
return true;
@@ -14255,13 +14109,6 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
case GGML_OP_CONV_2D:
case GGML_OP_CONV_TRANSPOSE_2D:
{
// Op is disabled for Apple because it segfaults at pipeline create time on MoltenVK
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
const vk_device& device = ggml_vk_get_device(ctx->device);
if (op->op == GGML_OP_CONV_TRANSPOSE_2D &&
device->properties.limits.maxPushConstantsSize < sizeof(vk_op_conv_transpose_2d_push_constants)) {
return false;
}
// Channel-contiguous format is not supported yet.
return ((op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) &&
op->src[1]->type == GGML_TYPE_F32 &&
@@ -14386,21 +14233,21 @@ ggml_backend_reg_t ggml_backend_vk_reg() {
}
// Extension availability
static bool ggml_vk_instance_validation_ext_available() {
static bool ggml_vk_instance_layer_settings_available() {
#ifdef GGML_VULKAN_VALIDATE
// Check if validation layer provides the extension
const std::string layer_name = "VK_LAYER_KHRONOS_validation";
for (const auto& layer : vk::enumerateInstanceLayerProperties()) {
if (layer_name == layer.layerName.data()) {
for (const auto& ext : vk::enumerateInstanceExtensionProperties(layer_name)) {
if (strcmp("VK_EXT_validation_features", ext.extensionName.data()) == 0) {
if (strcmp("VK_EXT_layer_settings", ext.extensionName.data()) == 0) {
return true;
}
}
}
}
std::cerr << "ggml_vulkan: WARNING: Validation layer or layer extension VK_EXT_validation_features not found." << std::endl;
std::cerr << "ggml_vulkan: WARNING: Validation layer or layer extension VK_EXT_layer_settings not found." << std::endl;
#endif
return false;
}

View File

@@ -32,22 +32,12 @@ layout(push_constant) uniform parameter {
uint32_t Cin;
uint32_t N;
// Tensor spatial sizes: kernel, input, output
uint32_t KW;
uint32_t KH;
// Tensor spatial sizes: input, output
uint32_t W;
uint32_t H;
uint32_t OW;
uint32_t OH;
// Parameters: stride, padding, dilation - 0=y, 1=x
uint32_t s0;
uint32_t s1;
uint32_t p0;
uint32_t p1;
uint32_t d0;
uint32_t d1;
// Strides in elements
uint32_t nb01;
uint32_t nb02;
@@ -77,13 +67,14 @@ layout(constant_id = 3) const uint BS_NPQ = 128;
layout(constant_id = 4) const uint TS_K = 8;
layout(constant_id = 5) const uint use_collectives = 1;
layout(constant_id = 6) const uint SHMEM_PAD = 4;
// Stride, padding, dilation
layout(constant_id = 7) const uint s0 = 1;
layout(constant_id = 8) const uint s1 = 1;
layout(constant_id = 9) const uint p0 = 0;
layout(constant_id = 10) const uint p1 = 0;
layout(constant_id = 11) const uint d0 = 1;
layout(constant_id = 12) const uint d1 = 1;
// Kernel spatial sizes
layout(constant_id = 13) const uint KW = 1;
layout(constant_id = 14) const uint KH = 1;
@@ -138,7 +129,7 @@ P,Q=OH,OW
*/
uint32_t B_idx_K = gl_WorkGroupID.x;
uint32_t B_idx_NPQ = gl_WorkGroupID.y;
uint32_t B_idx_NPQ = gl_WorkGroupID.y + gl_WorkGroupID.z * 512;
uint32_t T_y = tid / NT_NPQ;
uint32_t T_x = tid % NT_NPQ;
@@ -178,6 +169,10 @@ ACC_TYPE perElemOpStore(const in uint32_t r, const in uint32_t c, const in ACC_T
#endif
void main() {
if (B_idx_NPQ * BS_NPQ >= NPQ) {
return;
}
#ifdef COOPMAT2
coopmat<ACC_TYPE, gl_ScopeWorkgroup, BS_K, BS_NPQ, gl_MatrixUseAccumulator> matC;
matC = coopmat<ACC_TYPE, gl_ScopeWorkgroup, BS_K, BS_NPQ, gl_MatrixUseAccumulator>(0.0);
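
A worked example of the 512-wide y/z split that pairs with the B_idx_NPQ reconstruction above, using illustrative sizes:

```cpp
// With BS_NPQ = 128 and NPQ = 100000 the grid needs 782 NPQ blocks, which
// exceeds the 512 cap on the y dimension, so the dispatch becomes
// {Cout, 512, 2}; the shader rebuilds B_idx_NPQ = y + z*512 and returns
// early for the padded tail.
#include <cstdio>

int main() {
    const unsigned BS_NPQ = 128, NPQ = 100000;
    unsigned blocks = (NPQ + BS_NPQ - 1) / BS_NPQ;   // 782
    unsigned y = blocks, z = 1;
    if (y > 512) { z = (y + 511) / 512; y = 512; }   // y = 512, z = 2
    std::printf("NPQ blocks: %u -> dispatch y=%u z=%u\n", blocks, y, z);
}
```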

View File

@@ -131,8 +131,12 @@ void main() {
rms_norm(num_blocks);
} else if (num_blocks > 16) {
rms_norm(32);
} else if (num_blocks > 8) {
} else if (num_blocks > 12) {
rms_norm(16);
} else if (num_blocks > 10) {
rms_norm(12);
} else if (num_blocks > 8) {
rms_norm(10);
} else if (num_blocks > 4) {
rms_norm(8);
} else if (num_blocks == 4) {

View File

@@ -5,8 +5,9 @@
layout (constant_id = 1) const uint N = 64;
layout (constant_id = 2) const uint K = 32;
layout (constant_id = 3) const uint BATCH_N = 32;
layout(local_size_x = 128, local_size_y = 1, local_size_z = 1) in;
layout(local_size_x_id = 4, local_size_y = 1, local_size_z = 1) in;
uint a_base, b_base, x_base;
@@ -22,8 +23,8 @@ void store_x(uint r, uint c, FLOAT_TYPE v) {
data_d[x_base + r * p.nb21 + c * p.nb20] = D_TYPE(v);
}
shared FLOAT_TYPE shA[N * N];
shared FLOAT_TYPE shB[N * K];
shared FLOAT_TYPE shA[BATCH_N * N];
shared FLOAT_TYPE shB[BATCH_N * K];
void main() {
const uint batch = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x;
@@ -39,34 +40,42 @@ void main() {
b_base = get_boffset() + i2 * p.nb12 + i3 * p.nb13;
x_base = get_doffset() + i2 * p.nb22 + i3 * p.nb23;
// Load the A matrix into shA
[[unroll]] for (uint i = 0; i < N * N; i += gl_WorkGroupSize.x) {
uint idx = i + tid;
if (((N * N) % gl_WorkGroupSize.x == 0) || idx < N * N) {
shA[idx] = get_a(idx / N, idx % N);
}
}
// Load the B matrix into shB
[[unroll]] for (uint i = 0; i < N * K; i += gl_WorkGroupSize.x) {
uint idx = i + tid;
if (((N * K) % gl_WorkGroupSize.x == 0) || idx < N * K) {
shB[idx] = get_b(idx / K, idx % K);
}
}
barrier();
FLOAT_TYPE X[N];
// Each thread solves one column
if (tid < K) {
[[unroll]] for (int r = 0; r < N; ++r) {
FLOAT_TYPE b = shB[r * K + tid];
// Compute x[r,c] = (b[r,c] - sum(a[r,c]*x[c])) / a[r,r]
[[unroll]] for (int c = 0; c < r; ++c) {
b -= shA[r * N + c] * X[c];
// Loop over batches of rows
[[unroll]] for (uint row_base = 0; row_base < N; row_base += BATCH_N) {
const uint cur_N = min(BATCH_N, N - row_base);
// Load the A matrix batch into shA
[[unroll]] for (uint i = 0; i < cur_N * N; i += gl_WorkGroupSize.x) {
uint idx = i + tid;
if (((cur_N * N) % gl_WorkGroupSize.x == 0) || idx < cur_N * N) {
shA[idx] = get_a(row_base + idx / N, idx % N);
}
FLOAT_TYPE x = b / shA[r * N + r];
X[r] = x;
store_x(r, tid, x);
}
// Load the B matrix batch into shB
[[unroll]] for (uint i = 0; i < cur_N * K; i += gl_WorkGroupSize.x) {
uint idx = i + tid;
if (((cur_N * K) % gl_WorkGroupSize.x == 0) || idx < cur_N * K) {
shB[idx] = get_b(row_base + idx / K, idx % K);
}
}
barrier();
// Each thread solves one column
if (tid < K) {
[[unroll]] for (uint row_offset = 0; row_offset < cur_N; ++row_offset) {
uint r = row_base + row_offset;
FLOAT_TYPE b = shB[row_offset * K + tid];
// Compute x[r,c] = (b[r,c] - sum(a[r,c]*x[c])) / a[r,r]
[[unroll]] for (int c = 0; c < r; ++c) {
b -= shA[row_offset * N + c] * X[c];
}
FLOAT_TYPE x = b / shA[row_offset * N + r];
X[r] = x;
store_x(r, tid, x);
}
}
barrier();
}
}
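
For reference, a scalar sketch of the forward substitution this shader performs in row batches (an illustration, not the ggml CPU path):

```cpp
// Solves A * X = B column by column for a lower-triangular N x N matrix A,
// with B and X of size N x K:
// x[r][c] = (b[r][c] - sum_{i<r} a[r][i] * x[i][c]) / a[r][r]
#include <vector>

void solve_tri_ref(const std::vector<float> & A, const std::vector<float> & B,
                   std::vector<float> & X, int N, int K) {
    for (int c = 0; c < K; ++c) {
        for (int r = 0; r < N; ++r) {
            float b = B[r * K + c];
            for (int i = 0; i < r; ++i) {
                b -= A[r * N + i] * X[i * K + c];
            }
            X[r * K + c] = b / A[r * N + r];
        }
    }
}
```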

View File

@@ -75,7 +75,7 @@ void softmax_warp_inplace(inout float vals[experts_per_thread], const uint limit
}
void main() {
const uint row = gl_WorkGroupID.x * gl_WorkGroupSize.y + gl_LocalInvocationID.y;
const uint row = gl_WorkGroupID.x * gl_WorkGroupSize.y + gl_SubgroupID;
if (row >= n_rows) {
return;
}
@@ -83,17 +83,18 @@ void main() {
const uint logits_offset = n_experts * row;
const uint weights_offset = n_expert_used * row;
const uint ids_offset = n_experts * row;
const uint lane = gl_SubgroupInvocationID;
float wt[experts_per_thread];
[[unroll]]
for (uint i = 0; i < n_experts; i += WARP_SIZE) {
const uint expert = i + gl_LocalInvocationID.x;
const uint expert = i + lane;
wt[i / WARP_SIZE] = (n_experts % WARP_SIZE == 0 || expert < n_experts) ? logits[logits_offset + expert] : -INFINITY;
}
if (!late_softmax) {
softmax_warp_inplace(wt, n_experts, gl_LocalInvocationID.x, false);
softmax_warp_inplace(wt, n_experts, lane, false);
}
// at this point, each thread holds a portion of softmax,
@@ -111,11 +112,11 @@ void main() {
for (int k = 0; k < n_expert_used; k++) {
float max_val = wt[0];
uint max_expert = gl_LocalInvocationID.x;
uint max_expert = lane;
[[unroll]]
for (int i = 1; i < experts_per_thread; i++) {
const uint expert = gl_LocalInvocationID.x + i * WARP_SIZE;
const uint expert = lane + i * WARP_SIZE;
if ((n_experts % WARP_SIZE == 0 || expert < n_experts) && wt[i] > max_val) {
max_val = wt[i];
max_expert = expert;
@@ -132,11 +133,11 @@ void main() {
}
}
if ((k & (WARP_SIZE - 1)) == gl_LocalInvocationID.x) {
if ((k & (WARP_SIZE - 1)) == lane) {
output_weights[k / WARP_SIZE] = max_val;
}
if ((max_expert & (WARP_SIZE - 1)) == gl_LocalInvocationID.x) {
if ((max_expert & (WARP_SIZE - 1)) == lane) {
wt[max_expert / WARP_SIZE] = -INFINITY;
ids[ids_offset + k] = max_expert;
@@ -158,12 +159,12 @@ void main() {
}
if (late_softmax) {
softmax_warp_inplace(output_weights, n_expert_used, gl_LocalInvocationID.x, true);
softmax_warp_inplace(output_weights, n_expert_used, lane, true);
}
[[unroll]]
for (uint i = 0; i < experts_per_thread; ++i) {
uint idx = i * WARP_SIZE + gl_LocalInvocationID.x;
uint idx = i * WARP_SIZE + lane;
if (idx < n_expert_used) {
weights[weights_offset + idx] = output_weights[i];
}
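
The lane-based indexing above follows the usual strided ownership of experts within a subgroup; a small sketch with illustrative sizes:

```cpp
// Each subgroup handles one row, and lane L of a WARP_SIZE-wide subgroup
// owns experts L, L + WARP_SIZE, L + 2*WARP_SIZE, ...
#include <cstdio>

int main() {
    const unsigned WARP_SIZE = 32, n_experts = 128, lane = 5;
    for (unsigned i = 0; i < n_experts; i += WARP_SIZE) {
        std::printf("lane %u owns expert %u\n", lane, i + lane); // 5, 37, 69, 101
    }
}
```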

View File

@@ -38,6 +38,7 @@ shared int counts[SUBGROUP_SIZE];
shared int sh_min_idx;
shared uint sh_total;
shared uint offset_partials[BLOCK_SIZE / SUBGROUP_SIZE];
shared uint eq_min_partials[BLOCK_SIZE / SUBGROUP_SIZE];
// Map float values to uint such that comparisons still work.
// Positive values set the high bit, negative values are inverted.
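
A host-side sketch of that order-preserving mapping, assuming IEEE-754 single precision (the helper mirrors the shader's f2ui but is illustrative):

```cpp
// Positive floats get the sign bit set, negative floats are bitwise inverted,
// so unsigned comparisons order the results the same way as float comparisons.
#include <cstdint>
#include <cstring>

static uint32_t f2ui_ref(float f) {
    uint32_t u;
    std::memcpy(&u, &f, sizeof(u));              // bit-cast the float
    return (u & 0x80000000u) ? ~u                // negative: invert all bits
                             : (u | 0x80000000u); // positive: set the high bit
}
```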
@@ -156,25 +157,66 @@ void topk(const uint row) {
// We need to compact these values to the start of the dst_row array.
// Have each subgroup count how many items it'll store, so other
// subgroups can compute their base offset.
bool top = f2ui(intBitsToFloat(v.y)) >= range_min;
uvec4 b = subgroupBallot(top);
uint bit_count = subgroupBallotBitCount(b);
if ((tid % SUBGROUP_SIZE) == 0) {
offset_partials[tid / SUBGROUP_SIZE] = bit_count;
}
barrier();
uint out_idx = 0;
[[unroll]] for (int i = 0; i < BLOCK_SIZE / SUBGROUP_SIZE; ++i) {
if (i < tid / SUBGROUP_SIZE) {
out_idx += offset_partials[i];
// Values strictly greater than range_min must be stored. For values equal
// to range_min, there can be ties and it's possible we'll need to store
// an arbitrary subset of them.
// If total == p.k, have a fast path where we don't need to handle ties.
if (total == p.k) {
bool top = f2ui(intBitsToFloat(v.y)) >= range_min;
uvec4 b = subgroupBallot(top);
uint bit_count = subgroupBallotBitCount(b);
if ((tid % SUBGROUP_SIZE) == 0) {
offset_partials[tid / SUBGROUP_SIZE] = bit_count;
}
}
barrier();
uint bit_count_ex = subgroupBallotExclusiveBitCount(b);
if (top) {
// TODO: Copy directly to the output?
dst_row[out_idx + bit_count_ex] = v;
uint out_idx = 0;
[[unroll]] for (int i = 0; i < BLOCK_SIZE / SUBGROUP_SIZE; ++i) {
if (i < tid / SUBGROUP_SIZE) {
out_idx += offset_partials[i];
}
}
uint bit_count_ex = subgroupBallotExclusiveBitCount(b);
if (top) {
// TODO: Copy directly to the output?
dst_row[out_idx + bit_count_ex] = v;
}
} else {
bool top = f2ui(intBitsToFloat(v.y)) > range_min;
bool eq_min = f2ui(intBitsToFloat(v.y)) == range_min;
uvec4 b_top = subgroupBallot(top);
uvec4 b_eq_min = subgroupBallot(eq_min);
uint bit_count_top = subgroupBallotBitCount(b_top);
uint bit_count_eq_min = subgroupBallotBitCount(b_eq_min);
if ((tid % SUBGROUP_SIZE) == 0) {
offset_partials[tid / SUBGROUP_SIZE] = bit_count_top;
eq_min_partials[tid / SUBGROUP_SIZE] = bit_count_eq_min;
}
barrier();
uint out_idx = 0;
uint eq_min_base = 0;
uint eq_min_idx = 0;
[[unroll]] for (int i = 0; i < BLOCK_SIZE / SUBGROUP_SIZE; ++i) {
if (i < tid / SUBGROUP_SIZE) {
out_idx += offset_partials[i];
eq_min_idx += eq_min_partials[i];
}
eq_min_base += offset_partials[i];
}
// range_min values are stored at the end
eq_min_idx += eq_min_base;
uint bit_count_ex_top = subgroupBallotExclusiveBitCount(b_top);
uint bit_count_ex_eq_min = subgroupBallotExclusiveBitCount(b_eq_min);
if (top) {
// TODO: Copy directly to the output?
dst_row[out_idx + bit_count_ex_top] = v;
}
if (eq_min && eq_min_idx + bit_count_ex_eq_min < p.k) {
dst_row[eq_min_idx + bit_count_ex_eq_min] = v;
}
}
barrier();
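
A scalar sketch of the tie-aware compaction (not the shader itself), assuming range_min is the k-th largest value and the number of ties can push the candidate count past k:

```cpp
// Strictly greater values go first, then just enough range_min ties are kept
// to reach exactly k outputs; the surplus ties are dropped.
#include <utility>
#include <vector>

std::vector<std::pair<float,int>> compact_topk(const std::vector<std::pair<float,int>> & vals,
                                               float range_min, size_t k) {
    std::vector<std::pair<float,int>> out;
    for (const auto & v : vals) {                 // pass 1: strictly greater
        if (v.first > range_min) out.push_back(v);
    }
    for (const auto & v : vals) {                 // pass 2: ties, capped at k
        if (out.size() == k) break;
        if (v.first == range_min) out.push_back(v);
    }
    return out;
}
```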

File diff suppressed because it is too large

View File

@@ -19,6 +19,15 @@ def parse_decls(decls_text):
return decls
def replace_repl_placeholders(variant, template_map):
for repl, code in variant["REPLS"].items():
for key, val in template_map.items():
# Match "key" and avoid matching subsequences using by using \b
code = re.sub(rf'\b{re.escape(str(key))}\b', str(val), code)
variant["REPLS"][repl] = code
return variant
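
The same word-boundary idea, shown as a small C++ example for consistency with the other sketches (illustrative strings; the generator itself uses Python's re.sub):

```cpp
#include <iostream>
#include <regex>
#include <string>

int main() {
    // \b keeps "TYPE" from also matching inside "SUBTYPE" (illustrative names).
    std::string code = "TYPE x = SUBTYPE(y);";
    std::string out  = std::regex_replace(code, std::regex("\\bTYPE\\b"), "f32");
    std::cout << out << "\n"; // prints: f32 x = SUBTYPE(y);
}
```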
def replace_placeholders(shader_text, replacements):
for key, val in replacements.items():
# Match {{KEY}} literally, where KEY is escaped
@@ -71,6 +80,10 @@ def generate_variants(fname, input_dir, output_dir, outfile):
decls_map = parse_decls(extract_block(text, "DECLS"))
except ValueError:
decls_map = {}
try:
templates_map = ast.literal_eval(extract_block(text, "REPL_TEMPLATES"))
except ValueError:
templates_map = {}
for fname in sorted(os.listdir(input_dir)):
if fname.endswith(".tmpl"):
@@ -90,9 +103,11 @@ def generate_variants(fname, input_dir, output_dir, outfile):
if key not in decls_map:
raise ValueError(f"DECLS key '{key}' not found.")
decls_code += decls_map[key] + "\n\n"
final_shader = re.sub(r'\bDECLS\b', decls_code, shader_template)
if "REPLS" in variant:
variant = replace_repl_placeholders(variant, templates_map)
final_shader = replace_placeholders(final_shader, variant["REPLS"])
# second run to expand placeholders in repl_template
final_shader = replace_placeholders(final_shader, variant["REPLS"])
final_shader = expand_includes(final_shader, input_dir)

View File

@@ -0,0 +1,461 @@
#define(REPL_TEMPLATES)
{
"XIELU_FUNC": "{{MUTATE}}[dst_i] = select(((exp(min(src[src_i], {{TYPE}}(params.eps))) - 1.0) - src[src_i]) * {{TYPE}}(params.alpha_n) + {{TYPE}}(params.beta) * src[src_i], {{TYPE}}(params.alpha_p) * src[src_i] * src[src_i] + {{TYPE}}(params.beta) * src[src_i], src[src_i] > 0.0);",
"ABS_FUNC": "{{MUTATE}}[dst_i] = abs(src[src_i]);",
"SGN_FUNC": "{{MUTATE}}[dst_i] = select({{TYPE}}(select(0.0, -1.0, src[src_i] < 0.0)), {{TYPE}}(1.0), src[src_i] > 0.0);",
"NEG_FUNC": "{{MUTATE}}[dst_i] = -src[src_i];",
"STEP_FUNC": "{{MUTATE}}[dst_i] = {{TYPE}}(select(0.0, 1.0, src[src_i] > 0.0));",
"TANH_FUNC": "{{MUTATE}}[dst_i] = tanh(clamp(src[src_i], -9.010913, 9.010913)); // Regarding tanh() domain restrictions in wgsl https://github.com/gpuweb/gpuweb/issues/4458",
"RELU_FUNC": "{{MUTATE}}[dst_i] = select(0.0, src[src_i], src[src_i] > 0.0);",
"ELU_FUNC": "{{MUTATE}}[dst_i] = select(exp(src[src_i]) - 1.0, src[src_i], src[src_i] > 0.0);",
"HARDSIGMOID_FUNC": "{{MUTATE}}[dst_i] = min(1.0, max(0.0, (src[src_i] + 3.0) / 6.0));",
"SIGMOID_FUNC": "{{MUTATE}}[dst_i] = 1.0 / (1.0 + exp(-src[src_i]));",
"SILU_FUNC": "{{MUTATE}}[dst_i] = src[src_i] / (1.0 + exp(-src[src_i]));",
"EXP_FUNC": "{{MUTATE}}[dst_i] = exp(src[src_i]);",
"HARDSWISH_FUNC": "{{MUTATE}}[dst_i] = src[src_i] * min(1.0, max(0.0, (src[src_i] + 3.0) / 6.0));",
"GELU_FUNC": "{{MUTATE}}[dst_i] = 0.5 * src[src_i] * (1.0 + tanh(clamp(sqrt(2.0 / 3.14159265) * (src[src_i] + 0.044715 * pow(src[src_i], 3.0)), -9.010913, 9.010913))); // Regarding tanh() domain restrictions in wgsl https://github.com/gpuweb/gpuweb/issues/4458",
"GELU_QUICK_FUNC": "{{MUTATE}}[dst_i] = src[src_i] * 0.5 * (1.0 + tanh(clamp(0.79788456 * (src[src_i] + 0.044715 * src[src_i] * src[src_i] * src[src_i]), -9.010913, 9.010913))); // Regarding tanh() domain restrictions in wgsl https://github.com/gpuweb/gpuweb/issues/4458",
"GELU_ERF_FUNC": "{{MUTATE}}[dst_i] = 0.5 * src[src_i] * (1.0 + tanh(clamp(0.79788456 * (src[src_i] + 0.044715 * src[src_i] * src[src_i] * src[src_i]), -9.010913, 9.010913))); // Regarding tanh() domain restrictions in wgsl https://github.com/gpuweb/gpuweb/issues/4458"
}
#end(REPL_TEMPLATES)
#define(VARIANTS)
[
{
"SHADER_NAME": "abs_f32",
"REPLS": { "TYPE": "f32", "FUNC": "ABS_FUNC", "EXT_PARAMS": "", "MUTATE": "dst" },
"DECLS": ["NOT_INPLACE"]
},
{
"SHADER_NAME": "abs_f16",
"REPLS": { "TYPE": "f16", "FUNC": "ABS_FUNC", "EXT_PARAMS": "", "MUTATE": "dst" },
"DECLS": ["NOT_INPLACE"]
},
{
"SHADER_NAME": "abs_inplace_f32",
"REPLS": { "TYPE": "f32", "FUNC": "ABS_FUNC", "EXT_PARAMS": "", "MUTATE": "src" },
"DECLS": ["INPLACE"]
},
{
"SHADER_NAME": "abs_inplace_f16",
"REPLS": { "TYPE": "f16", "FUNC": "ABS_FUNC", "EXT_PARAMS": "", "MUTATE": "src" },
"DECLS": ["INPLACE"]
},
{
"SHADER_NAME": "sgn_f32",
"REPLS": { "TYPE": "f32", "FUNC": "SGN_FUNC", "EXT_PARAMS": "", "MUTATE": "dst" },
"DECLS": ["NOT_INPLACE"]
},
{
"SHADER_NAME": "sgn_f16",
"REPLS": { "TYPE": "f16", "FUNC": "SGN_FUNC", "EXT_PARAMS": "", "MUTATE": "dst" },
"DECLS": ["NOT_INPLACE"]
},
{
"SHADER_NAME": "sgn_inplace_f32",
"REPLS": { "TYPE": "f32", "FUNC": "SGN_FUNC", "EXT_PARAMS": "", "MUTATE": "src" },
"DECLS": ["INPLACE"]
},
{
"SHADER_NAME": "sgn_inplace_f16",
"REPLS": { "TYPE": "f16", "FUNC": "SGN_FUNC", "EXT_PARAMS": "", "MUTATE": "src" },
"DECLS": ["INPLACE"]
},
{
"SHADER_NAME": "neg_f32",
"REPLS": { "TYPE": "f32", "FUNC": "NEG_FUNC", "EXT_PARAMS": "", "MUTATE": "dst" },
"DECLS": ["NOT_INPLACE"]
},
{
"SHADER_NAME": "neg_f16",
"REPLS": { "TYPE": "f16", "FUNC": "NEG_FUNC", "EXT_PARAMS": "", "MUTATE": "dst" },
"DECLS": ["NOT_INPLACE"]
},
{
"SHADER_NAME": "neg_inplace_f32",
"REPLS": { "TYPE": "f32", "FUNC": "NEG_FUNC", "EXT_PARAMS": "", "MUTATE": "src" },
"DECLS": ["INPLACE"]
},
{
"SHADER_NAME": "neg_inplace_f16",
"REPLS": { "TYPE": "f16", "FUNC": "NEG_FUNC", "EXT_PARAMS": "", "MUTATE": "src" },
"DECLS": ["INPLACE"]
},
{
"SHADER_NAME": "step_f32",
"REPLS": { "TYPE": "f32", "FUNC": "STEP_FUNC", "EXT_PARAMS": "", "MUTATE": "dst" },
"DECLS": ["NOT_INPLACE"]
},
{
"SHADER_NAME": "step_f16",
"REPLS": { "TYPE": "f16", "FUNC": "STEP_FUNC", "EXT_PARAMS": "", "MUTATE": "dst" },
"DECLS": ["NOT_INPLACE"]
},
{
"SHADER_NAME": "step_inplace_f32",
"REPLS": { "TYPE": "f32", "FUNC": "STEP_FUNC", "EXT_PARAMS": "", "MUTATE": "src" },
"DECLS": ["INPLACE"]
},
{
"SHADER_NAME": "step_inplace_f16",
"REPLS": { "TYPE": "f16", "FUNC": "STEP_FUNC", "EXT_PARAMS": "", "MUTATE": "src" },
"DECLS": ["INPLACE"]
},
{
"SHADER_NAME": "tanh_f32",
"REPLS": { "TYPE": "f32", "FUNC": "TANH_FUNC", "EXT_PARAMS": "", "MUTATE": "dst" },
"DECLS": ["NOT_INPLACE"]
},
{
"SHADER_NAME": "tanh_f16",
"REPLS": { "TYPE": "f16", "FUNC": "TANH_FUNC", "EXT_PARAMS": "", "MUTATE": "dst" },
"DECLS": ["NOT_INPLACE"]
},
{
"SHADER_NAME": "tanh_inplace_f32",
"REPLS": { "TYPE": "f32", "FUNC": "TANH_FUNC", "EXT_PARAMS": "", "MUTATE": "src" },
"DECLS": ["INPLACE"]
},
{
"SHADER_NAME": "tanh_inplace_f16",
"REPLS": { "TYPE": "f16", "FUNC": "TANH_FUNC", "EXT_PARAMS": "", "MUTATE": "src" },
"DECLS": ["INPLACE"]
},
{
"SHADER_NAME": "elu_f32",
"REPLS": { "TYPE": "f32", "FUNC": "ELU_FUNC", "EXT_PARAMS": "", "MUTATE": "dst" },
"DECLS": ["NOT_INPLACE"]
},
{
"SHADER_NAME": "elu_f16",
"REPLS": { "TYPE": "f16", "FUNC": "ELU_FUNC", "EXT_PARAMS": "", "MUTATE": "dst" },
"DECLS": ["NOT_INPLACE"]
},
{
"SHADER_NAME": "elu_inplace_f32",
"REPLS": { "TYPE": "f32", "FUNC": "ELU_FUNC", "EXT_PARAMS": "", "MUTATE": "src" },
"DECLS": ["INPLACE"]
},
{
"SHADER_NAME": "elu_inplace_f16",
"REPLS": { "TYPE": "f16", "FUNC": "ELU_FUNC", "EXT_PARAMS": "", "MUTATE": "src" },
"DECLS": ["INPLACE"]
},
{
"SHADER_NAME": "relu_f32",
"REPLS": { "TYPE": "f32", "FUNC": "RELU_FUNC", "EXT_PARAMS": "", "MUTATE": "dst" },
"DECLS": ["NOT_INPLACE"]
},
{
"SHADER_NAME": "relu_f16",
"REPLS": { "TYPE": "f16", "FUNC": "RELU_FUNC", "EXT_PARAMS": "", "MUTATE": "dst" },
"DECLS": ["NOT_INPLACE"]
},
{
"SHADER_NAME": "relu_inplace_f32",
"REPLS": { "TYPE": "f32", "FUNC": "RELU_FUNC", "EXT_PARAMS": "", "MUTATE": "src" },
"DECLS": ["INPLACE"]
},
{
"SHADER_NAME": "relu_inplace_f16",
"REPLS": { "TYPE": "f16", "FUNC": "RELU_FUNC", "EXT_PARAMS": "", "MUTATE": "src" },
"DECLS": ["INPLACE"]
},
{
"SHADER_NAME": "sigmoid_f32",
"REPLS": { "TYPE": "f32", "FUNC": "SIGMOID_FUNC", "EXT_PARAMS": "", "MUTATE": "dst" },
"DECLS": ["NOT_INPLACE"]
},
{
"SHADER_NAME": "sigmoid_f16",
"REPLS": { "TYPE": "f16", "FUNC": "SIGMOID_FUNC", "EXT_PARAMS": "", "MUTATE": "dst" },
"DECLS": ["NOT_INPLACE"]
},
{
"SHADER_NAME": "sigmoid_inplace_f32",
"REPLS": { "TYPE": "f32", "FUNC": "SIGMOID_FUNC", "EXT_PARAMS": "", "MUTATE": "src" },
"DECLS": ["INPLACE"]
},
{
"SHADER_NAME": "sigmoid_inplace_f16",
"REPLS": { "TYPE": "f16", "FUNC": "SIGMOID_FUNC", "EXT_PARAMS": "", "MUTATE": "src" },
"DECLS": ["INPLACE"]
},
{
"SHADER_NAME": "silu_f32",
"REPLS": { "TYPE": "f32", "FUNC": "SILU_FUNC", "EXT_PARAMS": "", "MUTATE": "dst" },
"DECLS": ["NOT_INPLACE"]
},
{
"SHADER_NAME": "silu_f16",
"REPLS": { "TYPE": "f16", "FUNC": "SILU_FUNC", "EXT_PARAMS": "", "MUTATE": "dst" },
"DECLS": ["NOT_INPLACE"]
},
{
"SHADER_NAME": "silu_inplace_f32",
"REPLS": { "TYPE": "f32", "FUNC": "SILU_FUNC", "EXT_PARAMS": "", "MUTATE": "src" },
"DECLS": ["INPLACE"]
},
{
"SHADER_NAME": "silu_inplace_f16",
"REPLS": { "TYPE": "f16", "FUNC": "SILU_FUNC", "EXT_PARAMS": "", "MUTATE": "src" },
"DECLS": ["INPLACE"]
},
{
"SHADER_NAME": "exp_f32",
"REPLS": { "TYPE": "f32", "FUNC": "EXP_FUNC", "EXT_PARAMS": "", "MUTATE": "dst" },
"DECLS": ["NOT_INPLACE"]
},
{
"SHADER_NAME": "exp_f16",
"REPLS": { "TYPE": "f16", "FUNC": "EXP_FUNC", "EXT_PARAMS": "", "MUTATE": "dst" },
"DECLS": ["NOT_INPLACE"]
},
{
"SHADER_NAME": "exp_inplace_f32",
"REPLS": { "TYPE": "f32", "FUNC": "EXP_FUNC", "EXT_PARAMS": "", "MUTATE": "src" },
"DECLS": ["INPLACE"]
},
{
"SHADER_NAME": "exp_inplace_f16",
"REPLS": { "TYPE": "f16", "FUNC": "EXP_FUNC", "EXT_PARAMS": "", "MUTATE": "src" },
"DECLS": ["INPLACE"]
},
{
"SHADER_NAME": "hardsigmoid_f32",
"REPLS": { "TYPE": "f32", "FUNC": "HARDSIGMOID_FUNC", "EXT_PARAMS": "", "MUTATE": "dst" },
"DECLS": ["NOT_INPLACE"]
},
{
"SHADER_NAME": "hardsigmoid_f16",
"REPLS": { "TYPE": "f16", "FUNC": "HARDSIGMOID_FUNC", "EXT_PARAMS": "", "MUTATE": "dst" },
"DECLS": ["NOT_INPLACE"]
},
{
"SHADER_NAME": "hardsigmoid_inplace_f32",
"REPLS": { "TYPE": "f32", "FUNC": "HARDSIGMOID_FUNC", "EXT_PARAMS": "", "MUTATE": "src" },
"DECLS": ["INPLACE"]
},
{
"SHADER_NAME": "hardsigmoid_inplace_f16",
"REPLS": { "TYPE": "f16", "FUNC": "HARDSIGMOID_FUNC", "EXT_PARAMS": "", "MUTATE": "src" },
"DECLS": ["INPLACE"]
},
{
"SHADER_NAME": "hardswish_f32",
"REPLS": { "TYPE": "f32", "FUNC": "HARDSWISH_FUNC", "EXT_PARAMS": "", "MUTATE": "dst" },
"DECLS": ["NOT_INPLACE"]
},
{
"SHADER_NAME": "hardswish_f16",
"REPLS": { "TYPE": "f16", "FUNC": "HARDSWISH_FUNC", "EXT_PARAMS": "", "MUTATE": "dst" },
"DECLS": ["NOT_INPLACE"]
},
{
"SHADER_NAME": "hardswish_inplace_f32",
"REPLS": { "TYPE": "f32", "FUNC": "HARDSWISH_FUNC", "EXT_PARAMS": "", "MUTATE": "src" },
"DECLS": ["INPLACE"]
},
{
"SHADER_NAME": "hardswish_inplace_f16",
"REPLS": { "TYPE": "f16", "FUNC": "HARDSWISH_FUNC", "EXT_PARAMS": "", "MUTATE": "src" },
"DECLS": ["INPLACE"]
},
{
"SHADER_NAME": "gelu_f32",
"REPLS": { "TYPE": "f32", "FUNC": "GELU_FUNC", "EXT_PARAMS": "", "MUTATE": "dst" },
"DECLS": ["NOT_INPLACE"]
},
{
"SHADER_NAME": "gelu_f16",
"REPLS": { "TYPE": "f16", "FUNC": "GELU_FUNC", "EXT_PARAMS": "", "MUTATE": "dst" },
"DECLS": ["NOT_INPLACE"]
},
{
"SHADER_NAME": "gelu_inplace_f32",
"REPLS": { "TYPE": "f32", "FUNC": "GELU_FUNC", "EXT_PARAMS": "", "MUTATE": "src" },
"DECLS": ["INPLACE"]
},
{
"SHADER_NAME": "gelu_inplace_f16",
"REPLS": { "TYPE": "f16", "FUNC": "GELU_FUNC", "EXT_PARAMS": "", "MUTATE": "src" },
"DECLS": ["INPLACE"]
},
{
"SHADER_NAME": "gelu_quick_f32",
"REPLS": { "TYPE": "f32", "FUNC": "GELU_QUICK_FUNC", "EXT_PARAMS": "", "MUTATE": "dst" },
"DECLS": ["NOT_INPLACE"]
},
{
"SHADER_NAME": "gelu_quick_f16",
"REPLS": { "TYPE": "f16", "FUNC": "GELU_QUICK_FUNC", "EXT_PARAMS": "", "MUTATE": "dst" },
"DECLS": ["NOT_INPLACE"]
},
{
"SHADER_NAME": "gelu_quick_inplace_f32",
"REPLS": { "TYPE": "f32", "FUNC": "GELU_QUICK_FUNC", "EXT_PARAMS": "", "MUTATE": "src" },
"DECLS": ["INPLACE"]
},
{
"SHADER_NAME": "gelu_quick_inplace_f16",
"REPLS": { "TYPE": "f16", "FUNC": "GELU_QUICK_FUNC", "EXT_PARAMS": "", "MUTATE": "src" },
"DECLS": ["INPLACE"]
},
{
"SHADER_NAME": "xielu_f32",
"REPLS": { "TYPE": "f32", "FUNC": "XIELU_FUNC", "EXT_PARAMS": "alpha_n: f32, alpha_p: f32, beta: f32, eps: f32", "MUTATE": "dst" },
"DECLS": ["NOT_INPLACE"]
},
{
"SHADER_NAME": "xielu_f16",
"REPLS": { "TYPE": "f16", "FUNC": "XIELU_FUNC", "EXT_PARAMS": "alpha_n: f32, alpha_p: f32, beta: f32, eps: f32", "MUTATE": "dst" },
"DECLS": ["NOT_INPLACE"]
},
{
"SHADER_NAME": "xielu_inplace_f32",
"REPLS": { "TYPE": "f32", "FUNC": "XIELU_FUNC", "EXT_PARAMS": "alpha_n: f32, alpha_p: f32, beta: f32, eps: f32", "MUTATE": "src" },
"DECLS": ["INPLACE"]
},
{
"SHADER_NAME": "xielu_inplace_f16",
"REPLS": { "TYPE": "f16", "FUNC": "XIELU_FUNC", "EXT_PARAMS": "alpha_n: f32, alpha_p: f32, beta: f32, eps: f32", "MUTATE": "src" },
"DECLS": ["INPLACE"]
},
{
"SHADER_NAME": "gelu_erf_f32",
"REPLS": { "TYPE": "f32", "FUNC": "GELU_ERF_FUNC", "EXT_PARAMS": "", "MUTATE": "dst" },
"DECLS": ["NOT_INPLACE"]
},
{
"SHADER_NAME": "gelu_erf_f16",
"REPLS": { "TYPE": "f16", "FUNC": "GELU_ERF_FUNC", "EXT_PARAMS": "", "MUTATE": "dst" },
"DECLS": ["NOT_INPLACE"]
},
{
"SHADER_NAME": "gelu_erf_inplace_f32",
"REPLS": { "TYPE": "f32", "FUNC": "GELU_ERF_FUNC", "EXT_PARAMS": "", "MUTATE": "src" },
"DECLS": ["INPLACE"]
},
{
"SHADER_NAME": "gelu_erf_inplace_f16",
"REPLS": { "TYPE": "f16", "FUNC": "GELU_ERF_FUNC", "EXT_PARAMS": "", "MUTATE": "src" },
"DECLS": ["INPLACE"]
}
]
#end(VARIANTS)
#define(DECLS)
#decl(INPLACE)
@group(0) @binding(1)
var<uniform> params: Params;
#enddecl(INPLACE)
#decl(NOT_INPLACE)
@group(0) @binding(1)
var<storage, read_write> dst: array<{{TYPE}}>;
@group(0) @binding(2)
var<uniform> params: Params;
#enddecl(NOT_INPLACE)
#end(DECLS)
#define(SHADER)
enable f16;
fn update(dst_i: u32, src_i: u32) {
{{FUNC}}
}
@group(0) @binding(0)
var<storage, read_write> src: array<{{TYPE}}>;
DECLS
struct Params {
ne: u32, // total number of elements
offset_src: u32, // in elements
offset_dst: u32, // in elements
// Strides (in elements) — may be permuted
stride_src0: u32,
stride_src1: u32,
stride_src2: u32,
stride_src3: u32,
stride_dst0: u32,
stride_dst1: u32,
stride_dst2: u32,
stride_dst3: u32,
// Logical shapes
src_ne0: u32,
src_ne1: u32,
src_ne2: u32,
dst_ne0: u32,
dst_ne1: u32,
dst_ne2: u32,
{{EXT_PARAMS}}
};
override wg_size: u32;
@compute @workgroup_size(wg_size)
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
if (gid.x >= params.ne) {
return;
}
var i = gid.x;
let i3 = i / (params.src_ne2 * params.src_ne1 * params.src_ne0);
i = i % (params.src_ne2 * params.src_ne1 * params.src_ne0);
let i2 = i / (params.src_ne1 * params.src_ne0);
i = i % (params.src_ne1 * params.src_ne0);
let i1 = i / params.src_ne0;
let i0 = i % params.src_ne0;
var j = gid.x;
let j3 = j / (params.dst_ne2 * params.dst_ne1 * params.dst_ne0);
j = j % (params.dst_ne2 * params.dst_ne1 * params.dst_ne0);
let j2 = j / (params.dst_ne1 * params.dst_ne0);
j = j % (params.dst_ne1 * params.dst_ne0);
let j1 = j / params.dst_ne0;
let j0 = j % params.dst_ne0;
let src_idx = i0 * params.stride_src0 + i1 * params.stride_src1 +
i2 * params.stride_src2 + i3 * params.stride_src3;
let dst_idx = j0 * params.stride_dst0 + j1 * params.stride_dst1 +
j2 * params.stride_dst2 + j3 * params.stride_dst3;
update(params.offset_dst + dst_idx, params.offset_src + src_idx);
}
#end(SHADER)
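
The template above generates one pipeline per unary op: main() takes the flat global invocation id, decomposes it against the logical source and destination shapes, and only then applies the (possibly permuted) strides before calling update(). A small host-side sketch of the same index math may help; the names here (Shape4, flat_to_offset) are illustrative only and not part of the repository.

#include <array>
#include <cstdint>
#include <cstdio>

struct Shape4 {
    std::array<uint32_t, 4> ne;     // logical extents ne0..ne3
    std::array<uint32_t, 4> stride; // strides in elements, may be permuted
};

// Decompose a flat element index into (i0, i1, i2, i3) over the logical shape,
// then apply the strides to reach the element in the (possibly non-contiguous)
// buffer. This mirrors the i0..i3 / j0..j3 arithmetic in main() above.
static uint32_t flat_to_offset(uint32_t idx, const Shape4 & s) {
    const uint32_t i3 = idx / (s.ne[2] * s.ne[1] * s.ne[0]);
    idx %= s.ne[2] * s.ne[1] * s.ne[0];
    const uint32_t i2 = idx / (s.ne[1] * s.ne[0]);
    idx %= s.ne[1] * s.ne[0];
    const uint32_t i1 = idx / s.ne[0];
    const uint32_t i0 = idx % s.ne[0];
    return i0 * s.stride[0] + i1 * s.stride[1] + i2 * s.stride[2] + i3 * s.stride[3];
}

int main() {
    // transposed view of a contiguous 3x4 buffer: ne0 = 4, ne1 = 3,
    // stepping along dim 0 jumps 3 elements, stepping along dim 1 jumps 1
    const Shape4 src = { {4, 3, 1, 1}, {3, 1, 12, 12} };
    for (uint32_t i = 0; i < 12; ++i) {
        std::printf("flat %2u -> offset %2u\n", i, flat_to_offset(i, src));
    }
    return 0;
}
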

View File

@@ -376,6 +376,7 @@ class TensorNameMap:
"model.layers.{bid}.block_sparse_moe.primary_router", # smallthinker
"model.layers.{bid}.feed_forward.gate", # lfm2moe
"model.layers.{bid}.mlp.router.gate", # afmoe
"layers.{bid}.gate", # mistral-large
),
MODEL_TENSOR.FFN_GATE_INP_SHEXP: (
@@ -450,6 +451,7 @@ class TensorNameMap:
"model.layers.{bid}.feed_forward.shared_expert.up_proj", # llama4
"model.layers.{bid}.feed_forward.down_proj",
"model.layers.{bid}.mlp.shared_mlp.up_proj", # hunyuan
"layers.{bid}.shared_experts.w3", # mistral-large
),
MODEL_TENSOR.FFN_UP_CHEXP: (
@@ -496,6 +498,7 @@ class TensorNameMap:
"model.layers.{bid}.mlp.shared_experts.gate_proj", # deepseek deepseek2
"model.layers.{bid}.feed_forward.shared_expert.gate_proj", # llama4
"model.layers.{bid}.mlp.shared_mlp.gate_proj", # hunyuan
"layers.{bid}.shared_experts.w1", # mistral-large
),
MODEL_TENSOR.FFN_GATE_CHEXP: (
@@ -557,6 +560,7 @@ class TensorNameMap:
"model.layers.{bid}.feed_forward.shared_expert.down_proj", # llama4
"model.layers.{bid}.shared_mlp.output_linear", # granitemoe
"model.layers.{bid}.mlp.shared_mlp.down_proj", # hunyuan
"layers.{bid}.shared_experts.w2", # mistral-large
),
MODEL_TENSOR.FFN_DOWN_CHEXP: (
@@ -924,14 +928,17 @@ class TensorNameMap:
MODEL_TENSOR.ATTN_Q_A: (
"model.layers.{bid}.self_attn.q_a_proj", # deepseek2
"layers.{bid}.attention.wq_a", # mistral-large
),
MODEL_TENSOR.ATTN_Q_B: (
"model.layers.{bid}.self_attn.q_b_proj", # deepseek2
"layers.{bid}.attention.wq_b", # mistral-large
),
MODEL_TENSOR.ATTN_KV_A_MQA: (
"model.layers.{bid}.self_attn.kv_a_proj_with_mqa", # deepseek2
"layers.{bid}.attention.wkv_a_with_mqa", # mistral-large
),
MODEL_TENSOR.ATTN_KV_B: (
@@ -940,18 +947,22 @@ class TensorNameMap:
MODEL_TENSOR.ATTN_K_B: (
"model.layers.{bid}.self_attn.k_b_proj", # deepseek2
"layers.{bid}.attention.k_b_proj", # mistral-large
),
MODEL_TENSOR.ATTN_V_B: (
"model.layers.{bid}.self_attn.v_b_proj", # deepseek2
"layers.{bid}.attention.v_b_proj", # mistral-large
),
MODEL_TENSOR.ATTN_Q_A_NORM: (
"model.layers.{bid}.self_attn.q_a_layernorm", # deepseek2
"layers.{bid}.attention.q_a_norm", # mistral-large
),
MODEL_TENSOR.ATTN_KV_A_NORM: (
"model.layers.{bid}.self_attn.kv_a_layernorm", # deepseek2
"layers.{bid}.attention.kv_a_norm", # mistral-large
),
MODEL_TENSOR.ATTN_SUB_NORM: (

View File

@@ -666,7 +666,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
std::map<int, std::string> mapped;
int blk_id = 0;
int pruned_attention_w = 0;
// make a list of weights
std::vector<const llama_model_loader::llama_tensor_weight *> tensors;
@@ -674,11 +673,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
for (const auto & it : ml.weights_map) {
const std::string remapped_name(remap_layer(it.first, prune_list, mapped, blk_id));
if (remapped_name.empty()) {
if (it.first.find("attn_v.weight") != std::string::npos ||
it.first.find("attn_qkv.weight") != std::string::npos ||
it.first.find("attn_kv_b.weight") != std::string::npos) {
pruned_attention_w++;
}
LLAMA_LOG_DEBUG("%s: pruning tensor %s\n", __func__, it.first.c_str());
continue;
}
@@ -703,7 +697,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
});
}
bool is_clip_model = false;
for (const auto * it : tensors) {
const struct ggml_tensor * tensor = it->tensor;
@@ -717,32 +710,10 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
} else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
qs.has_output = true;
}
is_clip_model |= name.rfind("mm.", 0) == 0; // check the "mm." prefix
}
qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
// sanity checks for models that have attention layers
if (qs.n_attention_wv != 0 && !is_clip_model)
{
const auto & n_head_kv_iter = model.hparams.n_head_kv_arr.begin();
// attention layers have a non-zero number of kv heads
int32_t n_layer_attn = model.hparams.n_layer - std::count(n_head_kv_iter, n_head_kv_iter + model.hparams.n_layer, 0);
if (llama_model_has_encoder(&model)) {
// now n_layer_attn is the number of attention layers in the encoder
// for each decoder block, there are 2 attention layers
n_layer_attn += 2 * model.hparams.dec_n_layer;
}
// note: for linear-attention models (such as Qwen3 Next) this is the number of linear layers
const int32_t n_layer_recr = std::count(model.hparams.recurrent_layer_arr.begin(), model.hparams.recurrent_layer_arr.end(), true);
LLAMA_LOG_INFO("%s: n_layer_attn = %d, n_layer_recr = %d, pruned_attention_w = %d\n", __func__, n_layer_attn, n_layer_recr, pruned_attention_w);
GGML_ASSERT((qs.n_attention_wv == n_layer_attn - pruned_attention_w - n_layer_recr) && "n_attention_wv is unexpected");
}
size_t total_size_org = 0;
size_t total_size_new = 0;
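
For reference, the assert removed above compared a simple layer count. With made-up numbers (hypothetical values only, not taken from any real model) the bookkeeping it performed looks like this:

#include <cstdio>

int main() {
    // hypothetical counts, for illustration only
    const int n_layer_attn       = 30; // layers with a non-zero KV-head count
    const int pruned_attention_w =  2; // attention tensors dropped by layer pruning
    const int n_layer_recr       =  4; // layers flagged recurrent / linear-attention
    // the removed GGML_ASSERT expected the attn_v/attn_qkv tensor count to equal:
    std::printf("expected n_attention_wv = %d\n",
                n_layer_attn - pruned_attention_w - n_layer_recr); // prints 24
    return 0;
}
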

View File

@@ -499,7 +499,7 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
// use std::wregex to split the text
static std::vector<size_t> unicode_regex_split_stl(const std::wstring & wtext, const std::wstring & regex_expr, const std::vector<size_t> & offsets) {
std::wregex expr(regex_expr);
std::wregex expr(regex_expr, std::regex_constants::optimize | std::regex_constants::nosubs);
std::vector<size_t> bpe_offsets; // store the offset of each word
bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
size_t start = 0;
@@ -529,7 +529,7 @@ static std::vector<size_t> unicode_regex_split_stl(const std::wstring & wtext, c
// use std::regex to split the text
static std::vector<size_t> unicode_regex_split_stl(const std::string & text, const std::string & regex_expr, const std::vector<size_t> & offsets) {
std::regex expr(regex_expr);
std::regex expr(regex_expr, std::regex_constants::optimize | std::regex_constants::nosubs);
std::vector<size_t> bpe_offsets; // store the offset of each word
bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
size_t start = 0;
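
Both flags added above are standard <regex> syntax options: nosubs marks every group as non-capturing and optimize favors matching speed over construction time, which suits a pattern that is compiled once and reused across many offsets. A minimal standalone sketch of the same construction:

#include <iostream>
#include <regex>
#include <string>

int main() {
    const std::string text = "hello   world 123";
    // same flags as the change above: no capture groups needed, pattern reused a lot
    const std::regex expr("[a-z]+|[0-9]+|\\s+",
                          std::regex_constants::optimize | std::regex_constants::nosubs);
    for (std::sregex_iterator it(text.begin(), text.end(), expr), end; it != end; ++it) {
        std::cout << "token: '" << it->str() << "'\n";
    }
    return 0;
}
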

View File

@@ -286,10 +286,11 @@ static double nmse(const float * a, const float * b, size_t n) {
return mse_a_b / mse_a_0;
}
// difference between 2 integer sets (Jaccard distance, 0 - no difference, 1 - no overlap)
static double jdst(const int32_t * a, const int32_t * b, size_t n) {
std::unordered_map<int32_t, size_t> set_a;
std::unordered_map<int32_t, size_t> set_b;
// difference between 2 sets (Jaccard distance, 0 - no difference, 1 - no overlap)
template <typename T>
static double jdst(const T * a, const T * b, size_t n) {
std::unordered_map<T, size_t> set_a;
std::unordered_map<T, size_t> set_b;
for (size_t i = 0; i < n; ++i) {
set_a[a[i]]++;
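
The templated jdst() above treats each array as a multiset and returns a Jaccard-style distance. A rough standalone version (a hypothetical free function, not the test harness itself) illustrates the intended behaviour:

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <unordered_map>

template <typename T>
static double jaccard_distance(const T * a, const T * b, size_t n) {
    std::unordered_map<T, size_t> set_a;
    std::unordered_map<T, size_t> set_b;
    for (size_t i = 0; i < n; ++i) {
        set_a[a[i]]++;
        set_b[b[i]]++;
    }
    size_t inter = 0;
    size_t uni   = 0;
    for (const auto & kv : set_a) {
        const auto it = set_b.find(kv.first);
        const size_t cnt_b = (it == set_b.end()) ? 0 : it->second;
        inter += std::min(kv.second, cnt_b);
        uni   += std::max(kv.second, cnt_b);
    }
    for (const auto & kv : set_b) {
        if (set_a.find(kv.first) == set_a.end()) {
            uni += kv.second; // values only present in b still enlarge the union
        }
    }
    return uni == 0 ? 0.0 : 1.0 - (double) inter / (double) uni;
}

int main() {
    const int32_t a[] = {1, 2, 3, 4};
    const int32_t b[] = {4, 3, 2, 1}; // same values, different order -> distance 0
    const int32_t c[] = {5, 6, 7, 8}; // no overlap                   -> distance 1
    std::printf("d(a,b) = %.2f\n", jaccard_distance(a, b, 4));
    std::printf("d(a,c) = %.2f\n", jaccard_distance(a, c, 4));
    return 0;
}
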
@@ -5001,42 +5002,94 @@ struct test_top_k : public test_case {
const ggml_type type;
const std::array<int64_t, 4> ne;
const int k;
const bool ties;
ggml_tensor * input {};
std::string vars() override {
return VARS_TO_STR3(type, ne, k);
return VARS_TO_STR4(type, ne, k, ties);
}
test_top_k(ggml_type type = GGML_TYPE_F32,
std::array<int64_t, 4> ne = {16, 10, 10, 10},
int k = 4)
: type(type), ne(ne), k(k) {}
int k = 4, bool ties = false)
: type(type), ne(ne), k(k), ties(ties) {}
double max_err() override {
return 0.0;
}
// When there are ties, only validate the final result.
// The logic in err can't handle the sentinel tensors.
bool run_whole_graph() override { return ties; }
double err(const float * a, const float * b, size_t n) override {
std::vector<int32_t> ia(n);
std::vector<int32_t> ib(n);
// When there are no ties, we expect the exact same set of indices,
// but possibly in a different order. When there are ties, the indices
// can be different but the input values they correspond to should be
// the same. The logic for ties could work for non-ties, but only for
// the output tensor, not for the sentinel tensors.
if (ties) {
std::vector<float> src(ggml_nelements(input));
double diff = 0.0f;
ggml_backend_tensor_get(input, src.data(), 0, ggml_nelements(input) * ggml_type_size(type));
for (size_t i = 0; i < n; i++) {
ia[i] = (int32_t) a[i];
ib[i] = (int32_t) b[i];
double diff = 0.0f;
// penalize the result if the data is not integer valued
diff += std::fabs(a[i] - ia[i]);
diff += std::fabs(b[i] - ib[i]);
GGML_ASSERT(n == (size_t)(ggml_nrows(input) * k));
int64_t cols = input->ne[0];
std::vector<int32_t> ia(k);
std::vector<int32_t> ib(k);
std::vector<float> asrc(k);
std::vector<float> bsrc(k);
for (int64_t r = 0; r < ggml_nrows(input); r++) {
// Convert indices for the row back to integer
for (int64_t c = 0; c < k; c++) {
ia[c] = (int32_t)a[r * k + c];
ib[c] = (int32_t)b[r * k + c];
}
// The src values for each row should match.
for (int64_t c = 0; c < k; c++) {
asrc[c] = src[r * cols + ia[c]];
bsrc[c] = src[r * cols + ib[c]];
}
diff += jdst(asrc.data(), bsrc.data(), k);
// There should be no duplicate indices
std::sort(ia.begin(), ia.end());
std::sort(ib.begin(), ib.end());
if (std::adjacent_find(ia.begin(), ia.end()) != ia.end()) {
diff += 1;
}
if (std::adjacent_find(ib.begin(), ib.end()) != ib.end()) {
diff += 1;
}
}
return diff;
} else {
std::vector<int32_t> ia(n);
std::vector<int32_t> ib(n);
double diff = 0.0f;
for (size_t i = 0; i < n; i++) {
ia[i] = (int32_t) a[i];
ib[i] = (int32_t) b[i];
// penalize the result if the data is not integer valued
diff += std::fabs(a[i] - ia[i]);
diff += std::fabs(b[i] - ib[i]);
}
return diff + jdst(ia.data(), ib.data(), n);
}
return diff + jdst(ia.data(), ib.data(), n);
}
ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
ggml_set_name(a, "a");
// Save 'a' for err()
input = a;
ggml_tensor * out = ggml_top_k(ctx, a, k);
ggml_set_name(out, "out");
@@ -5047,11 +5100,16 @@ struct test_top_k : public test_case {
std::random_device rd;
std::default_random_engine rng(rd());
for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
// initialize with unique values to avoid ties
int tie_denom = std::max(1, std::min(10, k / 2));
for (int64_t r = 0; r < ggml_nrows(t); r++) {
std::vector<float> data(t->ne[0]);
for (int i = 0; i < t->ne[0]; i++) {
data[i] = i;
if (ties) {
// integer division to introduce duplicates
data[i] = i / tie_denom;
} else {
data[i] = i;
}
}
std::shuffle(data.begin(), data.end(), rng);
ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(float));
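
Taken together, the changes above make the top_k test tie-aware: rows are filled with i / tie_denom so duplicate values appear, and err() then only requires that both backends select the same multiset of source values with no repeated indices. A compact sketch of that acceptance rule, with illustrative names only:

#include <algorithm>
#include <cstdio>
#include <vector>

// two top-k index lists are accepted as equivalent if they select the same
// multiset of source values and contain no duplicate indices
static bool topk_equivalent(const std::vector<float> & row,
                            std::vector<int> ia, std::vector<int> ib) {
    std::vector<float> va;
    std::vector<float> vb;
    for (int i : ia) { va.push_back(row[i]); }
    for (int i : ib) { vb.push_back(row[i]); }
    std::sort(va.begin(), va.end());
    std::sort(vb.begin(), vb.end());
    std::sort(ia.begin(), ia.end());
    std::sort(ib.begin(), ib.end());
    const bool no_dup_a = std::adjacent_find(ia.begin(), ia.end()) == ia.end();
    const bool no_dup_b = std::adjacent_find(ib.begin(), ib.end()) == ib.end();
    return va == vb && no_dup_a && no_dup_b;
}

int main() {
    // value 3.0f is tied between indices 2 and 5
    const std::vector<float> row = {0.0f, 1.0f, 3.0f, 2.0f, 4.0f, 3.0f};
    // {4, 2} and {4, 5} are both valid top-2 results: same values, different indices
    std::printf("equivalent: %d\n", topk_equivalent(row, {4, 2}, {4, 5}));
    // returning the same tied index twice must be rejected
    std::printf("equivalent: %d\n", topk_equivalent(row, {2, 2}, {4, 5}));
    return 0;
}
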
@@ -6146,6 +6204,15 @@ struct test_solve_tri : public test_case {
std::string vars() override { return VARS_TO_STR3(type, ne_lhs, ne_rhs); }
uint64_t op_flops(ggml_tensor * t) override {
GGML_UNUSED(t);
int64_t n = ne_lhs[0];
int64_t k = ne_rhs[0];
int64_t batch = ne_lhs[2] * ne_lhs[3];
// n * (n + 1) / 2 non-zero elements of lhs, 2 flops each, for each col of rhs
return n * (n + 1) * k * batch;
}
test_solve_tri(ggml_type type = GGML_TYPE_F32,
std::array<int64_t, 4> ne_lhs = { 10, 10, 4, 3 },
std::array<int64_t, 4> ne_rhs = { 3, 10, 4, 3 }
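
The flop estimate added above counts n*(n+1)/2 non-zero elements in the triangular matrix, two flops per element, for every right-hand-side column and every matrix in the batch. For the qwen3next-like perf case { 64, 64, 8, 32 } / { 64, 64, 8, 32 } that works out as follows:

#include <cstdint>
#include <cstdio>

int main() {
    const int64_t n     = 64;      // ne_lhs[0]: triangular matrix dimension
    const int64_t k     = 64;      // ne_rhs[0]: number of right-hand-side columns
    const int64_t batch = 8 * 32;  // ne_lhs[2] * ne_lhs[3]
    // n*(n+1)/2 non-zero lhs elements, 2 flops each, for every rhs column
    const int64_t flops = n * (n + 1) * k * batch;
    std::printf("estimated flops: %lld\n", (long long) flops); // 68157440
    return 0;
}
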
@@ -6982,6 +7049,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
test_cases.emplace_back(new test_conv_transpose_2d({3, 2, 3, 1}, {2, 2, 1, 3}, 1));
test_cases.emplace_back(new test_conv_transpose_2d({10, 10, 9, 1}, {3, 3, 1, 9}, 2));
test_cases.emplace_back(new test_conv_transpose_2d({129, 63, 35, 1}, {3, 3, 48, 35}, 1));
test_cases.emplace_back(new test_count_equal(GGML_TYPE_F32, {4, 500, 1, 1}));
test_cases.emplace_back(new test_count_equal(GGML_TYPE_F32, {4, 5000, 1, 1}));
@@ -7656,6 +7724,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
if (k <= 1<<i) {
test_cases.emplace_back(new test_top_k(GGML_TYPE_F32, {(1<<i), 1, 1, 1}, k));
test_cases.emplace_back(new test_top_k(GGML_TYPE_F32, {(1<<i) + 11, 1, 2, 1}, k));
test_cases.emplace_back(new test_top_k(GGML_TYPE_F32, {(1<<i) + 11, 1, 2, 1}, k, true));
}
}
}
@@ -7725,6 +7794,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
test_cases.emplace_back(new test_cumsum(GGML_TYPE_F32, { 10, 5, 4, 3 }));
test_cases.emplace_back(new test_cumsum(GGML_TYPE_F32, { 127, 5, 4, 3 }));
test_cases.emplace_back(new test_cumsum(GGML_TYPE_F32, { 128, 5, 4, 3 }));
test_cases.emplace_back(new test_cumsum(GGML_TYPE_F32, { 128, 128, 4, 4 }));
test_cases.emplace_back(new test_cumsum(GGML_TYPE_F32, { 255, 5, 4, 3 }));
test_cases.emplace_back(new test_cumsum(GGML_TYPE_F32, { 256, 5, 4, 3 }));
test_cases.emplace_back(new test_cumsum(GGML_TYPE_F32, { 511, 5, 4, 3 }));
@@ -7755,6 +7825,8 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 42, 42, 5, 2 }, { 10, 42, 5, 2 }));
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 2, 2 }, { 10, 64, 2, 2 }));
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 100, 100, 4, 4 }, { 41, 100, 4, 4 }));
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 4 }, { 31, 128, 4, 4 }));
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 4 }, { 300, 64, 4, 4 }));
for (bool v : {false, true}) {
test_cases.emplace_back(new test_pad_ext(GGML_TYPE_F32, {512, 512, 1, 1}, 0, 1, 0, 1, 0, 0, 0, 0, v));
@@ -7896,6 +7968,8 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
{ 58, 3, 64, 32, 8 },
// A deep layer of a ConvNet, several images in the batch
{ 16, 3, 512, 128, 8 },
// High resolution output (large NPQ)
{1536, 3, 64, 32, 1 },
};
for (auto kernel_type : {GGML_TYPE_F32, GGML_TYPE_F16}) {
@@ -7953,6 +8027,17 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 4, 2 }, { 6, 64, 4, 2 }));
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 1 }, { 8, 128, 4, 1 }));
// qwen3next with CHUNK_SIZE 64
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 64, 64, 8, 32 }, { 64, 64, 8, 32 }));
// qwen3next with CHUNK_SIZE 128
test_cases.emplace_back(new test_solve_tri(GGML_TYPE_F32, { 128, 128, 4, 32 }, { 128, 128, 4, 32 }));
test_cases.emplace_back(new test_tri(GGML_TRI_TYPE_LOWER, GGML_TYPE_F32, { 256, 256, 4, 4 }));
test_cases.emplace_back(new test_tri(GGML_TRI_TYPE_UPPER_DIAG, GGML_TYPE_F32, { 1024, 1024, 8, 4 }));
test_cases.emplace_back(new test_cumsum(GGML_TYPE_F32, { 128, 128, 4, 4 }));
test_cases.emplace_back(new test_cumsum(GGML_TYPE_F32, { 2048, 16, 5, 4 }));
test_cases.emplace_back(new test_cumsum(GGML_TYPE_F32, { 20000, 10, 4, 1 }));
for (int bs : {1, 2, 3, 4, 5, 8, 512}) {
for (ggml_type type_a : all_types) {

View File

@@ -2,11 +2,6 @@ set(TARGET llama-server)
include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})
if (MINGW)
# fix: https://github.com/ggml-org/llama.cpp/actions/runs/9651004652/job/26617901362?pr=8006
add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER})
endif()
if (NOT LLAMA_HTTPLIB)
message(FATAL_ERROR "LLAMA_HTTPLIB is OFF, cannot build llama-server. Hint: to skip building server, set -DLLAMA_BUILD_SERVER=OFF")
endif()

View File

@@ -101,8 +101,6 @@ struct server_slot {
std::string generated_text;
llama_tokens generated_tokens;
common_chat_msg chat_msg;
std::vector<completion_token_output> generated_token_probs;
bool has_next_token = true;
@@ -153,9 +151,6 @@ struct server_slot {
llama_token sampled;
common_chat_format chat_format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
std::vector<std::string> generated_tool_call_ids;
// stats
size_t n_sent_text = 0; // number of sent text characters
@@ -183,13 +178,10 @@ struct server_slot {
stop = STOP_TYPE_NONE;
stopping_word = "";
n_sent_text = 0;
chat_format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
generated_tokens.clear();
generated_token_probs.clear();
chat_msg = {};
json_schema = json();
generated_tool_call_ids.clear();
// clear speculative decoding stats
n_draft_total = 0;
@@ -302,23 +294,6 @@ struct server_slot {
return timings;
}
const common_chat_msg & update_chat_msg(std::vector<common_chat_msg_diff> & diffs) {
GGML_ASSERT(task);
auto previous_msg = chat_msg;
SRV_DBG("Parsing chat message: %s\n", generated_text.c_str());
auto new_msg = common_chat_parse(
generated_text,
/* is_partial= */ stop != STOP_TYPE_EOS,
task->params.oaicompat_chat_syntax);
if (!new_msg.empty()) {
new_msg.set_tool_call_ids(generated_tool_call_ids, gen_tool_call_id);
chat_msg = new_msg;
diffs = common_chat_msg_diff::compute_diffs(previous_msg, new_msg.empty() ? previous_msg : new_msg);
}
return chat_msg;
}
size_t find_stopping_strings(const std::string & text, const size_t last_token_size, bool is_full_stop) {
GGML_ASSERT(task);
@@ -1284,8 +1259,6 @@ struct server_context_impl {
} else {
res->content = tkn.text_to_send;
res->tokens = { tkn.tok };
slot.update_chat_msg(res->oaicompat_msg_diffs);
}
res->n_decoded = slot.n_decoded;
@@ -1317,8 +1290,14 @@ struct server_context_impl {
res->id_slot = slot.id;
res->index = slot.task->index;
res->content = slot.generated_text;
res->tokens = std::move(slot.generated_tokens);
// in stream mode, content and tokens are already in last partial chunk
if (slot.task->params.stream) {
res->content = "";
res->tokens = llama_tokens{};
} else {
res->content = std::move(slot.generated_text);
res->tokens = std::move(slot.generated_tokens);
}
res->timings = slot.get_timings();
res->prompt = slot.task->tokens.detokenize(ctx, true);
res->response_fields = std::move(slot.task->params.response_fields);
@@ -1338,7 +1317,6 @@ struct server_context_impl {
res->res_type = slot.task->params.res_type;
res->oaicompat_model = slot.task->params.oaicompat_model;
res->oaicompat_cmpl_id = slot.task->params.oaicompat_cmpl_id;
res->oaicompat_msg = slot.update_chat_msg(res->oaicompat_msg_diffs);
// populate res.probs_output
if (slot.task->params.sampling.n_probs > 0) {
@@ -2596,6 +2574,9 @@ static std::unique_ptr<server_res_generator> handle_completions_impl(
try {
std::vector<server_task> tasks;
// tracking generation state and partial tool calls
std::vector<task_result_state> states;
const auto & prompt = data.at("prompt");
// TODO: this log can become very long, put it behind a flag or think about a more compact format
//SRV_DBG("Prompt: %s\n", prompt.is_string() ? prompt.get<std::string>().c_str() : prompt.dump(2).c_str());
@@ -2611,6 +2592,7 @@ static std::unique_ptr<server_res_generator> handle_completions_impl(
inputs = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, prompt, true, true);
}
tasks.reserve(inputs.size());
states.reserve(inputs.size());
for (size_t i = 0; i < inputs.size(); i++) {
server_task task = server_task(type);
@@ -2628,10 +2610,12 @@ static std::unique_ptr<server_res_generator> handle_completions_impl(
task.params.res_type = res_type;
task.params.oaicompat_cmpl_id = completion_id;
task.params.oaicompat_model = ctx_server.model_name;
states.push_back(task.params.oaicompat_chat_syntax);
tasks.push_back(std::move(task));
}
rd.set_states(std::move(states));
rd.post_tasks(std::move(tasks));
} catch (const std::exception & e) {
res->error(format_error_response(e.what(), ERROR_TYPE_INVALID_REQUEST));
@@ -2657,7 +2641,6 @@ static std::unique_ptr<server_res_generator> handle_completions_impl(
// if single request, return single object instead of array
res->ok(arr.size() == 1 ? arr[0] : arr);
}
} else {
// in streaming mode, the first error must be treated as a non-stream response
// this is to match the OAI API behavior
@@ -2676,76 +2659,92 @@ static std::unique_ptr<server_res_generator> handle_completions_impl(
}
// next responses are streamed
// to be sent immediately
json first_result_json = first_result->to_json();
if (res_type == TASK_RESPONSE_TYPE_ANTHROPIC) {
res->data = format_anthropic_sse(first_result->to_json());
res->data = format_anthropic_sse(first_result_json);
} else {
res->data = format_oai_sse(first_result->to_json()); // to be sent immediately
res->data = format_oai_sse(first_result_json);
}
res->status = 200;
res->content_type = "text/event-stream";
res->next = [res_this = res.get(), res_type, &should_stop](std::string & output) -> bool {
if (should_stop()) {
SRV_DBG("%s", "stopping streaming due to should_stop condition\n");
return false; // should_stop condition met
}
if (!res_this->data.empty()) {
// flush the first chunk
output = std::move(res_this->data);
res_this->data.clear();
return true;
}
server_response_reader & rd = res_this->rd;
// check if there is more data
if (!rd.has_next()) {
static auto format_error = [](task_response_type res_type, const json & res_json) {
if (res_type == TASK_RESPONSE_TYPE_ANTHROPIC) {
// Anthropic doesn't send [DONE], message_stop was already sent
output = "";
} else if (res_type != TASK_RESPONSE_TYPE_NONE) {
output = "data: [DONE]\n\n";
} else {
output = "";
}
SRV_DBG("%s", "all results received, terminating stream\n");
return false; // no more data, terminate
}
// receive subsequent results
auto result = rd.next(should_stop);
if (result == nullptr) {
SRV_DBG("%s", "stopping streaming due to should_stop condition\n");
return false; // should_stop condition met
}
// send the results
json res_json = result->to_json();
if (result->is_error()) {
if (res_type == TASK_RESPONSE_TYPE_ANTHROPIC) {
output = format_anthropic_sse({
return format_anthropic_sse({
{"event", "error"},
{"data", res_json},
});
} else {
output = format_oai_sse(json {{ "error", res_json }});
return format_oai_sse(json {{ "error", res_json }});
}
SRV_DBG("%s", "error received during streaming, terminating stream\n");
return false; // terminate on error
} else {
GGML_ASSERT(
dynamic_cast<server_task_result_cmpl_partial*>(result.get()) != nullptr
|| dynamic_cast<server_task_result_cmpl_final*>(result.get()) != nullptr
);
if (res_type == TASK_RESPONSE_TYPE_ANTHROPIC) {
output = format_anthropic_sse(res_json);
} else {
output = format_oai_sse(res_json);
}
}
};
// has next data, continue
return true;
try {
if (should_stop()) {
SRV_DBG("%s", "stopping streaming due to should_stop condition\n");
return false; // should_stop condition met
}
if (!res_this->data.empty()) {
// flush the first chunk
output = std::move(res_this->data);
res_this->data.clear();
return true;
}
server_response_reader & rd = res_this->rd;
// check if there is more data
if (!rd.has_next()) {
if (res_type == TASK_RESPONSE_TYPE_ANTHROPIC) {
// Anthropic doesn't send [DONE], message_stop was already sent
output = "";
} else if (res_type != TASK_RESPONSE_TYPE_NONE) {
output = "data: [DONE]\n\n";
} else {
output = "";
}
SRV_DBG("%s", "all results received, terminating stream\n");
return false; // no more data, terminate
}
// receive subsequent results
auto result = rd.next(should_stop);
if (result == nullptr) {
SRV_DBG("%s", "stopping streaming due to should_stop condition\n");
return false; // should_stop condition met
}
// send the results
if (result->is_error()) {
json res_json = result->to_json();
output = format_error(res_type, res_json);
SRV_DBG("%s", "error received during streaming, terminating stream\n");
return false; // terminate on error
} else {
GGML_ASSERT(
dynamic_cast<server_task_result_cmpl_partial*>(result.get()) != nullptr
|| dynamic_cast<server_task_result_cmpl_final*>(result.get()) != nullptr
);
json res_json = result->to_json();
if (res_type == TASK_RESPONSE_TYPE_ANTHROPIC) {
output = format_anthropic_sse(res_json);
} else {
output = format_oai_sse(res_json);
}
}
// has next data, continue
return true;
} catch (const std::exception & e) {
json error_json = format_error_response(e.what(), ERROR_TYPE_SERVER);
output = format_error(res_type, error_json);
// terminate on exception
return false;
}
};
}
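
The reworked streaming callback above does two things: it hoists the SSE error formatting into a reusable format_error lambda and it wraps the whole body in try/catch so an exception becomes one final error event instead of escaping the handler. A much-reduced sketch of that shape, using stand-in types rather than the server's real ones:

#include <exception>
#include <functional>
#include <iostream>
#include <stdexcept>
#include <string>

// stand-ins for the server's SSE plumbing; none of these are real llama.cpp symbols
using next_chunk_fn = std::function<bool(std::string & out)>; // returns false to stop

static std::string format_sse(const std::string & payload) {
    return "data: " + payload + "\n\n";
}

static next_chunk_fn make_stream(std::function<std::string()> produce) {
    return [produce](std::string & output) -> bool {
        try {
            const std::string chunk = produce();
            if (chunk.empty()) {
                output = "data: [DONE]\n\n";
                return false; // no more data, terminate
            }
            output = format_sse(chunk);
            return true; // has next data, continue
        } catch (const std::exception & e) {
            // exceptions must not escape the callback: emit a final error event instead
            output = format_sse(std::string("{\"error\":\"") + e.what() + "\"}");
            return false; // terminate on exception
        }
    };
}

int main() {
    int n = 0;
    next_chunk_fn next = make_stream([&n]() -> std::string {
        if (++n == 3) { throw std::runtime_error("backend failure"); }
        return "{\"token\":" + std::to_string(n) + "}";
    });
    std::string out;
    while (next(out)) { std::cout << out; }
    std::cout << out; // last chunk: either [DONE] or the error event
    return 0;
}
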

View File

@@ -900,6 +900,7 @@ static bool should_strip_proxy_header(const std::string & header_name) {
// Headers that get duplicated when router forwards child responses
if (header_name == "server" ||
header_name == "transfer-encoding" ||
header_name == "content-length" || // quick fix for https://github.com/ggml-org/llama.cpp/issues/17710
header_name == "keep-alive") {
return true;
}
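
The one-line addition above extends the router's strip list with content-length, so the forwarded response is re-framed by the router itself rather than carrying the child's length header. The idea, reduced to a standalone sketch (header names are assumed to be lower-cased before the lookup):

#include <iostream>
#include <string>
#include <unordered_set>

static bool should_strip(const std::string & name) {
    static const std::unordered_set<std::string> strip = {
        "server", "transfer-encoding", "content-length", "keep-alive",
    };
    return strip.count(name) > 0;
}

int main() {
    for (const std::string h : {"content-type", "content-length", "server", "x-request-id"}) {
        std::cout << h << (should_strip(h) ? " -> strip (router sets its own)\n" : " -> forward\n");
    }
    return 0;
}
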

View File

@@ -271,6 +271,10 @@ void server_response::terminate() {
// server_response_reader
//
void server_response_reader::set_states(std::vector<task_result_state> && states) {
this->states = std::move(states);
}
void server_response_reader::post_tasks(std::vector<server_task> && tasks) {
id_tasks = server_task::get_list_id(tasks);
queue_results.add_waiting_tasks(tasks);
@@ -298,6 +302,12 @@ server_task_result_ptr server_response_reader::next(const std::function<bool()>
SRV_DBG("%s", "received error result, stopping further processing\n");
return result;
}
if (!states.empty()) {
// update the generation state if needed
size_t idx = result->get_index();
GGML_ASSERT(idx < states.size());
result->update(states[idx]);
}
if (result->is_stop()) {
received_count++;
}
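
With set_states() in place, the reader owns one task_result_state per posted task and routes each incoming result to its slot via get_index() before the caller ever sees it. A minimal sketch of that dispatch, with hypothetical types in place of the server's:

#include <cassert>
#include <cstdio>
#include <string>
#include <vector>

// hypothetical stand-ins for task_result_state / server_task_result
struct stream_state {
    std::string generated_text;
};

struct partial_result {
    size_t      index;   // which posted task this chunk belongs to
    std::string content; // newly generated text
    void update(stream_state & st) const { st.generated_text += content; }
};

int main() {
    std::vector<stream_state> states(2); // one slot per posted task, like set_states()
    const partial_result results[] = { {0, "Hel"}, {1, "Bon"}, {0, "lo"}, {1, "jour"} };
    for (const partial_result & r : results) {
        assert(r.index < states.size()); // mirrors the GGML_ASSERT above
        r.update(states[r.index]);
    }
    for (size_t i = 0; i < states.size(); ++i) {
        std::printf("task %zu: %s\n", i, states[i].generated_text.c_str());
    }
    return 0;
}
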

View File

@@ -7,6 +7,8 @@
#include <mutex>
#include <unordered_set>
// struct for managing server tasks
// in most cases, use server_response_reader to post new tasks and retrieve results
struct server_queue {
private:
int id = 0;
@@ -67,6 +69,8 @@ private:
void cleanup_pending_task(int id_target);
};
// struct for managing server responses
// in most cases, use server_response_reader to retrieve results
struct server_response {
private:
bool running = true;
@@ -120,6 +124,10 @@ struct server_response_reader {
bool cancelled = false;
int polling_interval_seconds;
// tracking generation state and partial tool calls
// only used by streaming completions
std::vector<task_result_state> states;
// should_stop function will be called each polling_interval_seconds
server_response_reader(std::pair<server_queue &, server_response &> server_queues, int polling_interval_seconds)
: queue_tasks(server_queues.first), queue_results(server_queues.second), polling_interval_seconds(polling_interval_seconds) {}
@@ -127,6 +135,7 @@ struct server_response_reader {
stop();
}
void set_states(std::vector<task_result_state> && states);
void post_tasks(std::vector<server_task> && tasks);
bool has_next() const;

View File

@@ -565,6 +565,7 @@ std::vector<unsigned char> completion_token_output::str_to_bytes(const std::stri
// server_task_result_cmpl_final
//
json server_task_result_cmpl_final::to_json() {
GGML_ASSERT(is_updated && "update() must be called before to_json()");
switch (res_type) {
case TASK_RESPONSE_TYPE_NONE:
return to_json_non_oaicompat();
@@ -582,8 +583,8 @@ json server_task_result_cmpl_final::to_json() {
json server_task_result_cmpl_final::to_json_non_oaicompat() {
json res = json {
{"index", index},
{"content", stream ? "" : content}, // in stream mode, content is already in last partial chunk
{"tokens", stream ? llama_tokens {} : tokens},
{"content", content},
{"tokens", tokens},
{"id_slot", id_slot},
{"stop", true},
{"model", oaicompat_model},
@@ -619,7 +620,7 @@ json server_task_result_cmpl_final::to_json_oaicompat() {
json res = json {
{"choices", json::array({
json{
{"text", stream ? "" : content}, // in stream mode, content is already in last partial chunk
{"text", content},
{"index", index},
{"logprobs", logprobs},
{"finish_reason", finish_reason},
@@ -700,6 +701,25 @@ json server_task_result_cmpl_final::to_json_oaicompat_chat() {
return res;
}
common_chat_msg task_result_state::update_chat_msg(
const std::string & text_added,
bool is_partial,
std::vector<common_chat_msg_diff> & diffs) {
generated_text += text_added;
auto msg_prv_copy = chat_msg;
SRV_DBG("Parsing chat message: %s\n", generated_text.c_str());
auto new_msg = common_chat_parse(
generated_text,
is_partial,
oaicompat_chat_syntax);
if (!new_msg.empty()) {
new_msg.set_tool_call_ids(generated_tool_call_ids, gen_tool_call_id);
chat_msg = new_msg;
diffs = common_chat_msg_diff::compute_diffs(msg_prv_copy, new_msg.empty() ? msg_prv_copy : new_msg);
}
return chat_msg;
}
json server_task_result_cmpl_final::to_json_oaicompat_chat_stream() {
std::time_t t = std::time(0);
std::string finish_reason = "length";
@@ -956,6 +976,7 @@ json server_task_result_cmpl_final::to_json_anthropic_stream() {
// server_task_result_cmpl_partial
//
json server_task_result_cmpl_partial::to_json() {
GGML_ASSERT(is_updated && "update() must be called before to_json()");
switch (res_type) {
case TASK_RESPONSE_TYPE_NONE:
return to_json_non_oaicompat();
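
update_chat_msg() above accumulates the generated text, re-parses the whole buffer on every chunk, and reports only the diff against the previous parse so partial tool calls can stream incrementally. A toy sketch of that accumulate/re-parse/diff loop, with a stand-in parser instead of common_chat_parse():

#include <iostream>
#include <string>

struct chat_msg {
    std::string content;
};

// stand-in for common_chat_parse(): a partial parse simply drops a trailing,
// possibly incomplete word; the real parser handles tool calls and chat syntax
static chat_msg parse(const std::string & text, bool is_partial) {
    chat_msg msg;
    msg.content = text;
    if (is_partial) {
        const size_t pos = msg.content.find_last_of(' ');
        if (pos != std::string::npos) {
            msg.content.resize(pos);
        }
    }
    return msg;
}

struct stream_parser_state {
    std::string generated_text;
    chat_msg    current;

    // append the new chunk, re-parse the whole buffer, return only the diff
    std::string update(const std::string & text_added, bool is_partial) {
        generated_text += text_added;
        const chat_msg previous = current;
        current = parse(generated_text, is_partial);
        return current.content.substr(previous.content.size());
    }
};

int main() {
    stream_parser_state st;
    for (const std::string chunk : {"Hello wor", "ld, how ar", "e you?"}) {
        std::cout << "diff: '" << st.update(chunk, /*is_partial=*/true) << "'\n";
    }
    std::cout << "final diff: '" << st.update("", /*is_partial=*/false) << "'\n";
    return 0;
}
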

View File

@@ -161,6 +161,25 @@ struct result_prompt_progress {
json to_json() const;
};
// struct for tracking the state of a task (e.g., for streaming)
struct task_result_state {
// tracking diffs for partial tool calls
std::vector<common_chat_msg_diff> diffs;
common_chat_syntax oaicompat_chat_syntax;
common_chat_msg chat_msg;
std::string generated_text; // append new chunks of generated text here
std::vector<std::string> generated_tool_call_ids;
task_result_state(const common_chat_syntax & oaicompat_chat_syntax)
: oaicompat_chat_syntax(oaicompat_chat_syntax) {}
// parse partial tool calls and update the internal state
common_chat_msg update_chat_msg(
const std::string & text_added,
bool is_partial,
std::vector<common_chat_msg_diff> & diffs);
};
struct server_task_result {
int id = -1;
int id_slot = -1;
@@ -175,6 +194,9 @@ struct server_task_result {
virtual int get_index() {
return -1;
}
virtual void update(task_result_state &) {
// only used by server_task_result_cmpl_*
}
virtual json to_json() = 0;
virtual ~server_task_result() = default;
};
@@ -233,9 +255,10 @@ struct server_task_result_cmpl_final : server_task_result {
task_response_type res_type = TASK_RESPONSE_TYPE_NONE;
std::string oaicompat_model;
std::string oaicompat_cmpl_id;
common_chat_msg oaicompat_msg;
common_chat_msg oaicompat_msg; // to be populated by update()
std::vector<common_chat_msg_diff> oaicompat_msg_diffs;
std::vector<common_chat_msg_diff> oaicompat_msg_diffs; // to be populated by update()
bool is_updated = false;
virtual int get_index() override {
return index;
@@ -247,6 +270,11 @@ struct server_task_result_cmpl_final : server_task_result {
virtual json to_json() override;
virtual void update(task_result_state & state) override {
is_updated = true;
oaicompat_msg = state.update_chat_msg(content, false, oaicompat_msg_diffs);
}
json to_json_non_oaicompat();
json to_json_oaicompat();
@@ -280,7 +308,8 @@ struct server_task_result_cmpl_partial : server_task_result {
task_response_type res_type = TASK_RESPONSE_TYPE_NONE;
std::string oaicompat_model;
std::string oaicompat_cmpl_id;
std::vector<common_chat_msg_diff> oaicompat_msg_diffs;
std::vector<common_chat_msg_diff> oaicompat_msg_diffs; // to be populated by update()
bool is_updated = false;
virtual int get_index() override {
return index;
@@ -292,6 +321,11 @@ struct server_task_result_cmpl_partial : server_task_result {
virtual json to_json() override;
virtual void update(task_result_state & state) override {
is_updated = true;
state.update_chat_msg(content, true, oaicompat_msg_diffs);
}
json to_json_non_oaicompat();
json to_json_oaicompat();