mtmd : remove libllava, remove clip-quantize-cli (⚠️ breaking change) (#13460 )

* mtmd : remove libllava, remove clip-quantize-cli * rm clip_model_quantize
scripts : support arbitrary input file formats in compare-llama-bench.py (#13455 )
2026-02-12 14:03:20 +02:00 · 2025-05-13 15:33:58 +02:00 · 2025-05-13 15:31:12 +02:00 · 2025-05-13 15:12:01 +02:00 · 2025-05-13 14:02:28 +03:00 · 2025-05-13 00:31:37 +02:00
147 changed files with 7215 additions and 3598 deletions
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -15,7 +15,6 @@ concurrency:
  cancel-in-progress: true

 env:
-  BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
  GGML_NLOOP: 3
  GGML_N_THREADS: 1
  LLAMA_LOG_COLORS: 1
@@ -308,7 +307,7 @@ jobs:
        run: |
          cd build
          # This is using llvmpipe and runs slower than other backends
-          ctest -L main --verbose --timeout 2700
+          ctest -L main --verbose --timeout 3600

  ubuntu-22-cmake-hip:
    runs-on: ubuntu-22.04
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -42,8 +42,7 @@ jobs:
          - { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
          - { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
          - { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true }
-          # Note: the intel images are failing due to an out of disk space error
-          # - { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
+          - { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true }
          - { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
          # Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete
          #- {tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: true }
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -16,11 +16,6 @@ concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
  cancel-in-progress: true

-# Fine-grant permission
-# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
-permissions:
-  contents: write # for creating release
-
 env:
  BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
  CMAKE_ARGS: "-DLLAMA_BUILD_EXAMPLES=OFF -DLLAMA_BUILD_TESTS=OFF -DLLAMA_BUILD_TOOLS=ON -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON"
@@ -416,28 +411,27 @@ jobs:
          CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
        run: |
          cp $env:CURL_PATH\bin\libcurl-x64.dll .\build\bin\Release\libcurl-x64.dll
-          7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip .\build\bin\Release\*
+          7z a llama-${{ steps.tag.outputs.name }}-bin-win-cuda${{ matrix.cuda }}-x64.zip .\build\bin\Release\*

      - name: Upload artifacts
        uses: actions/upload-artifact@v4
        with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip
-          name: llama-bin-win-cu${{ matrix.cuda }}-x64.zip
+          path: llama-${{ steps.tag.outputs.name }}-bin-win-cuda${{ matrix.cuda }}-x64.zip
+          name: llama-bin-win-cuda${{ matrix.cuda }}-x64.zip

      - name: Copy and pack Cuda runtime
-        if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }}
        run: |
          echo "Cuda install location: ${{ env.CUDA_PATH }}"
          $dst='.\build\bin\cudart\'
          robocopy "${{env.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
          robocopy "${{env.CUDA_PATH}}\lib" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
-          7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip $dst\*
+          7z a cudart-llama-bin-win-cuda${{ matrix.cuda }}-x64.zip $dst\*

      - name: Upload Cuda runtime
        uses: actions/upload-artifact@v4
        with:
-          path: cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip
-          name: cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip
+          path: cudart-llama-bin-win-cuda${{ matrix.cuda }}-x64.zip
+          name: cudart-llama-bin-win-cuda${{ matrix.cuda }}-x64.zip

  windows-sycl:
    runs-on: windows-latest
@@ -646,6 +640,11 @@ jobs:
  release:
    if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}

+    # Fine-grant permission
+    # https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
+    permissions:
+        contents: write # for creating release
+
    runs-on: ubuntu-latest

    needs:
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -252,20 +252,3 @@ configure_file(cmake/llama.pc.in

 install(FILES "${CMAKE_CURRENT_BINARY_DIR}/llama.pc"
        DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)
-
-#
-# copy the license files
-#
-
-# Check if running in GitHub Actions
-if(DEFINED ENV{GITHUB_ACTIONS} AND "$ENV{GITHUB_ACTIONS}" STREQUAL "true")
-    message(STATUS "Running inside GitHub Actions - copying license files")
-
-    # Copy all files from licenses/ to build/bin/
-    file(GLOB LICENSE_FILES "${CMAKE_SOURCE_DIR}/licenses/*")
-    foreach(LICENSE_FILE ${LICENSE_FILES})
-        get_filename_component(FILENAME ${LICENSE_FILE} NAME)
-        configure_file(${LICENSE_FILE} "${CMAKE_BINARY_DIR}/bin/${FILENAME}" COPYONLY)
-    endforeach()
-endif()
-
--- a/README.md
+++ b/README.md
@@ -16,8 +16,9 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)

 ## Hot topics

+- 🔥 Multimodal support arrived in `llama-server`: [#12898](https://github.com/ggml-org/llama.cpp/pull/12898) | [documentation](./docs/multimodal.md)
 - **GGML developer experience survey (organized and reviewed by NVIDIA):** [link](https://forms.gle/Gasw3cRgyhNEnrwK9)
- A new binary `llama-mtmd-cli` is introduced to replace `llava-cli`, `minicpmv-cli`, `gemma3-cli` ([#13012](https://github.com/ggml-org/llama.cpp/pull/13012)) and `qwen2vl-cli` ([#13141]((https://github.com/ggml-org/llama.cpp/pull/13141))), `libllava` will be deprecated
+- A new binary `llama-mtmd-cli` is introduced to replace `llava-cli`, `minicpmv-cli`, `gemma3-cli` ([#13012](https://github.com/ggml-org/llama.cpp/pull/13012)) and `qwen2vl-cli` ([#13141](https://github.com/ggml-org/llama.cpp/pull/13141)), `libllava` will be deprecated
 - VS Code extension for FIM completions: https://github.com/ggml-org/llama.vscode
 - Universal [tool call support](./docs/function-calling.md) in `llama-server` https://github.com/ggml-org/llama.cpp/pull/9639
 - Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim
--- a/build-xcframework.sh
+++ b/build-xcframework.sh
@@ -117,6 +117,7 @@ setup_framework_structure() {
    # Copy all required headers (common for all platforms)
    cp include/llama.h             ${header_path}
    cp ggml/include/ggml.h         ${header_path}
+    cp ggml/include/ggml-opt.h     ${header_path}
    cp ggml/include/ggml-alloc.h   ${header_path}
    cp ggml/include/ggml-backend.h ${header_path}
    cp ggml/include/ggml-metal.h   ${header_path}
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -119,8 +119,8 @@ if (LLAMA_LLGUIDANCE)

    ExternalProject_Add(llguidance_ext
        GIT_REPOSITORY https://github.com/guidance-ai/llguidance
-        # v0.7.10:
-        GIT_TAG 0309d2a6bf40abda35344a362edc71e06d5009f8
+        # v0.7.19 (+ fancy-regex build fix):
+        GIT_TAG b59f98f85269892a7de3d3641ad155366f13daa6
        PREFIX ${CMAKE_BINARY_DIR}/llguidance
        SOURCE_DIR ${LLGUIDANCE_SRC}
        BUILD_IN_SOURCE TRUE
@@ -144,3 +144,27 @@ endif ()
 target_include_directories(${TARGET} PUBLIC .)
 target_compile_features   (${TARGET} PUBLIC cxx_std_17)
 target_link_libraries     (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
+
+
+#
+# copy the license files
+#
+
+# Check if running in GitHub Actions
+if (DEFINED ENV{GITHUB_ACTIONS} AND "$ENV{GITHUB_ACTIONS}" STREQUAL "true")
+    message(STATUS "Running inside GitHub Actions - copying license files")
+
+    # Copy all files from licenses/ to build/bin/
+    file(GLOB LICENSE_FILES "${CMAKE_SOURCE_DIR}/licenses/*")
+    foreach(LICENSE_FILE ${LICENSE_FILES})
+        get_filename_component(FILENAME ${LICENSE_FILE} NAME)
+        add_custom_command(
+            POST_BUILD
+            TARGET ${TARGET}
+            COMMAND ${CMAKE_COMMAND} -E copy_if_different
+                "${LICENSE_FILE}"
+                "$<TARGET_FILE_DIR:llama>/${FILENAME}"
+            COMMENT "Copying ${FILENAME} to ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}")
+        message(STATUS "Copying ${LICENSE_FILE} to ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${FILENAME}")
+    endforeach()
+endif()
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -40,7 +40,7 @@ using json = nlohmann::ordered_json;

 std::initializer_list<enum llama_example> mmproj_examples = {
    LLAMA_EXAMPLE_LLAVA,
-    // TODO: add LLAMA_EXAMPLE_SERVER when it's ready
+    LLAMA_EXAMPLE_SERVER,
 };

 static std::string read_file(const std::string & fname) {
@@ -2204,32 +2204,33 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING"));
    add_opt(common_arg(
        {"--mmproj"}, "FILE",
-        "path to a multimodal projector file. see tools/mtmd/README.md",
+        "path to a multimodal projector file. see tools/mtmd/README.md\n"
+        "note: if -hf is used, this argument can be omitted",
        [](common_params & params, const std::string & value) {
            params.mmproj.path = value;
        }
-    ).set_examples(mmproj_examples));
+    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ"));
    add_opt(common_arg(
        {"--mmproj-url"}, "URL",
        "URL to a multimodal projector file. see tools/mtmd/README.md",
        [](common_params & params, const std::string & value) {
            params.mmproj.url = value;
        }
-    ).set_examples(mmproj_examples));
+    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_URL"));
    add_opt(common_arg(
        {"--no-mmproj"},
        "explicitly disable multimodal projector, useful when using -hf",
        [](common_params & params) {
            params.no_mmproj = true;
        }
-    ).set_examples(mmproj_examples));
+    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ"));
    add_opt(common_arg(
        {"--no-mmproj-offload"},
        "do not offload multimodal projector to GPU",
        [](common_params & params) {
            params.mmproj_use_gpu = false;
        }
-    ).set_examples(mmproj_examples));
+    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ_OFFLOAD"));
    add_opt(common_arg(
        {"--image"}, "FILE",
        "path to an image file. use with multimodal models. Specify multiple times for batching",
@@ -2436,6 +2437,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            }
        }
    ));
+    add_opt(common_arg(
+        {"--no-op-offload"},
+        string_format("disable offloading host tensor operations to device (default: %s)", params.no_op_offload ? "true" : "false"),
+        [](common_params & params) {
+            params.no_op_offload = true;
+        }
+    ));
    add_opt(common_arg(
        {"--lora"}, "FNAME",
        "path to LoRA adapter (can be repeated to use multiple adapters)",
@@ -2627,6 +2635,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.i_chunk = value;
        }
    ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
+    add_opt(common_arg(
+        {"--parse-special"},
+        string_format("prase special tokens (chat, tool, etc) (default: %s)", params.parse_special ? "true" : "false"),
+        [](common_params & params) {
+            params.parse_special = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
    add_opt(common_arg(
        {"-pps"},
        string_format("is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false"),
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1113,6 +1113,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
    cparams.offload_kqv       = !params.no_kv_offload;
    cparams.flash_attn        = params.flash_attn;
    cparams.no_perf           = params.no_perf;
+    cparams.op_offload        = !params.no_op_offload;

    if (params.reranking) {
        cparams.embeddings    = true;
@@ -1564,3 +1565,20 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c

    return result;
 }
+
+ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride) {
+    const int64_t ne_datapoint = llama_n_ctx(ctx);
+    const int64_t ndata        = (tokens.size() - ne_datapoint - 1) / stride;
+    ggml_opt_dataset_t result = ggml_opt_dataset_init(
+        GGML_TYPE_I32, GGML_TYPE_I32, ne_datapoint, ne_datapoint, ndata, /*ndata_shard =*/ 1);
+
+    llama_token * data   = (llama_token *) ggml_opt_dataset_data(result)->data;
+    llama_token * labels = (llama_token *) ggml_opt_dataset_labels(result)->data;
+
+    for (int64_t idata = 0; idata < ndata; ++idata) {
+        memcpy(data   + idata*ne_datapoint, tokens.data() + idata*stride + 0, ne_datapoint*sizeof(llama_token));
+        memcpy(labels + idata*ne_datapoint, tokens.data() + idata*stride + 1, ne_datapoint*sizeof(llama_token));
+    }
+
+    return result;
+}
--- a/common/common.h
+++ b/common/common.h
@@ -332,6 +332,7 @@ struct common_params {
    bool no_kv_offload     = false; // disable KV offloading
    bool warmup            = true;  // warmup run
    bool check_tensors     = false; // validate tensor data
+    bool no_op_offload     = false; // globally disable offload host tensor operations to device

    bool single_turn       = false; // single turn chat conversation

@@ -409,6 +410,7 @@ struct common_params {

    bool process_output = false; // collect data for the output tensor
    bool compute_ppl    = true;  // whether to compute perplexity
+    bool parse_special  = false; // whether to parse special tokens during imatrix tokenization

    // cvector-generator params
    int n_pca_batch = 100;
@@ -664,3 +666,9 @@ const char * const LLM_KV_SPLIT_COUNT         = "split.count";
 const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";

 }
+
+//
+// training utils
+//
+
+ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std::vector<llama_token> & tokens, int64_t stride);
--- a/common/llguidance.cpp
+++ b/common/llguidance.cpp
@@ -189,6 +189,7 @@ static LlgTokenizer * llama_sampler_llg_new_tokenizer(const llama_vocab * vocab)
        /* .tokenize_fn                        = */ llama_sampler_llg_tokenize_fn,
        /* .use_approximate_greedy_tokenize_fn = */ false,
        /* .tokenize_user_data                 = */ vocab,
+        /* .slices                             = */ nullptr,
    };

    char           error_buffer[1024];
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -426,7 +426,11 @@ class ModelBase:
            logger.warning(f"Failed to load model config from {dir_model}: {e}")
            logger.warning("Trying to load config.json instead")
            with open(dir_model / "config.json", "r", encoding="utf-8") as f:
-                return json.load(f)
+                config = json.load(f)
+                if "llm_config" in config:
+                    # rename for InternVL
+                    config["text_config"] = config["llm_config"]
+                return config

    @classmethod
    def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]:
@@ -794,6 +798,9 @@ class TextModel(ModelBase):
        if chkhsh == "0e9433cbbb161f89e264eb32e8e64bfe69e834973ffca5d41d3948a604a3e2a3":
            # ref: https://huggingface.co/mistral-community/pixtral-12b
            res = "pixtral"
+        if chkhsh == "d5f1dd6f980fec569fb218a81a7658ac45fc56b38c5a0adeb1c232fbe04ef5ec":
+            # ref: https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base
+            res = "seed-coder"

        if res is None:
            logger.warning("\n")
@@ -2606,6 +2613,11 @@ class Qwen2Model(TextModel):
    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        if self.hf_arch == "Qwen2Model":
            name = f"model.{name}"  # map to Qwen2ForCausalLM tensors
+        if "language_model." in name:
+            name = name.replace("language_model.", "") # for InternVL
+        if name.startswith("mlp") or name.startswith("vision_model"):
+            # skip visual tensors
+            return []
        yield from super().modify_tensors(data_torch, name, bid)


@@ -2709,6 +2721,62 @@ class Qwen2VLVisionModel(VisionModel):
        return [] # skip other tensors


+@ModelBase.register("InternVisionModel")
+class InternVisionModel(VisionModel):
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.INTERNVL)
+        self.gguf_writer.add_vision_attention_layernorm_eps(hparams["layer_norm_eps"])
+        # hidden_act
+        if hparams["hidden_act"] == "silu":
+            self.gguf_writer.add_vision_use_silu(True)
+        elif hparams["hidden_act"] == "gelu":
+            self.gguf_writer.add_vision_use_gelu(True)
+        else:
+            raise ValueError(f"Unsupported hidden_act: {hparams['hidden_act']}")
+        # downsample_ratio
+        downsample_ratio = self.global_config.get("downsample_ratio")
+        assert downsample_ratio is not None
+        self.gguf_writer.add_vision_projector_scale_factor(int(1.0 / downsample_ratio))
+
+    def tensor_force_quant(self, name, new_name, bid, n_dims):
+        del bid, name, n_dims  # unused
+        if ".patch_embd." in new_name:
+            return gguf.GGMLQuantizationType.F16
+        if ".position_embd." in new_name:
+            return gguf.GGMLQuantizationType.F32
+        return False
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+        if name.startswith("vision_model") or name.startswith("mlp"):
+            # process visual tensors
+            # correct name
+            if name.startswith("vision_model"):
+                name = "vision_tower." + name
+            if (".ls" in name or "position_embedding" in name) and not name.endswith(".weight"):
+                name += ".weight"
+            # split QKV tensors if needed
+            if ".qkv." in name:
+                if data_torch.ndim == 2: # weight
+                    c3, _ = data_torch.shape
+                else: # bias
+                    c3 = data_torch.shape[0]
+                assert c3 % 3 == 0
+                c = c3 // 3
+                wq = data_torch[:c]
+                wk = data_torch[c: c * 2]
+                wv = data_torch[c * 2:]
+                return [
+                    (self.map_tensor_name(name.replace("attn.qkv", "self_attn.q_proj")), wq),
+                    (self.map_tensor_name(name.replace("attn.qkv", "self_attn.k_proj")), wk),
+                    (self.map_tensor_name(name.replace("attn.qkv", "self_attn.v_proj")), wv),
+                ]
+            return [(self.map_tensor_name(name), data_torch)]
+        return [] # skip other tensors
+
+
@ModelBase.register("WavTokenizerDec")
 class WavTokenizerDecModel(TextModel):
    model_arch = gguf.MODEL_ARCH.WAVTOKENIZER_DEC
@@ -3360,6 +3428,11 @@ class InternLM2Model(TextModel):
        head_dim = n_embd // num_heads
        num_groups = num_heads // q_per_kv

+        name = name.replace("language_model.", "") # InternVL
+        if name.startswith("mlp") or name.startswith("vision_model"):
+            # skip visual tensors
+            return []
+
        if bid is not None and f"model.layers.{bid}.attention.wqkv" in name:
            qkv = data_torch

@@ -3433,6 +3506,10 @@ class InternLM3Model(TextModel):
    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        n_head = self.hparams["num_attention_heads"]
        n_kv_head = self.hparams.get("num_key_value_heads")
+        name = name.replace("language_model.", "") # InternVL
+        if name.startswith("mlp") or name.startswith("vision_model"):
+            # skip visual tensors
+            return []
        if name.endswith(("q_proj.weight", "q_proj.bias")):
            data_torch = LlamaModel.permute(data_torch, n_head, n_head)
        if name.endswith(("k_proj.weight", "k_proj.bias")):
@@ -5669,11 +5746,20 @@ class GraniteModel(LlamaModel):
            logger.info("gguf: (granite) logits_scale = %s", logits_scale)


-@ModelBase.register("GraniteMoeForCausalLM")
+@ModelBase.register("GraniteMoeForCausalLM", "GraniteMoeSharedForCausalLM")
 class GraniteMoeModel(GraniteModel):
    """Conversion for IBM's GraniteMoeForCausalLM"""
    model_arch = gguf.MODEL_ARCH.GRANITE_MOE

+    def set_gguf_parameters(self):
+        """GraniteMoeShared uses GraniteMoe parameters plus the following:
+        - shared_intermediate_size
+        """
+        super().set_gguf_parameters()
+        if shared_feed_forward_length := self.hparams.get("shared_intermediate_size"):
+            self.gguf_writer.add_expert_shared_feed_forward_length(shared_feed_forward_length)
+            logger.info("gguf: (granitemoeshared) shared_feed_forward_length = %s", shared_feed_forward_length)
+
    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        """In modeling_granitemoe, the JetMoe implementation of parallel experts
        is used. This essentially merges w1 and w3 into a single tensor with 2x
@@ -5684,12 +5770,21 @@ class GraniteMoeModel(GraniteModel):
        if name.endswith("block_sparse_moe.input_linear.weight"):
            ffn_dim = self.hparams["intermediate_size"]
            assert data_torch.shape[-2] == 2 * ffn_dim, "Merged FFN tensor size must be 2 * intermediate_size"
-            gate, up = data_torch[..., :ffn_dim, :], data_torch[..., ffn_dim:, :]
+            gate, up = data_torch.split(ffn_dim, dim=-2)
            return [
                (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_EXP, bid), gate),
                (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_EXP, bid), up),
            ]

+        if name.endswith("shared_mlp.input_linear.weight"):
+            ffn_dim = self.hparams["shared_intermediate_size"]
+            assert data_torch.shape[-2] == 2 * ffn_dim, "Merged FFN tensor size must be 2 * shared_intermediate_size"
+            gate, up = data_torch.split(ffn_dim, dim=-2)
+            return [
+                (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_SHEXP, bid), gate),
+                (self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP_SHEXP, bid), up),
+            ]
+
        return super().modify_tensors(data_torch, name, bid)


--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@@ -116,6 +116,7 @@ models = [
    {"name": "llama4",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct", },
    {"name": "glm4",             "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", },
    {"name": "pixtral",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistral-community/pixtral-12b", },
+    {"name": "seed-coder",       "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base", },
 ]


--- a/docs/multimodal.md
+++ b/docs/multimodal.md
@@ -0,0 +1,77 @@
+# Multimodal
+
+llama.cpp supports multimodal input via `libmtmd`. Currently, there are 2 tools support this feature:
+- [llama-mtmd-cli](../tools/mtmd/README.md)
+- [llama-server](../tools/server/README.md) via OpenAI-compatible `/chat/completions` API
+
+To enable it, can use use one of the 2 methods below:
+
+- Use `-hf` option with a supported model (see a list of pre-quantized model below)
+    - To load a model using `-hf` while disabling multimodal, use `--no-mmproj`
+    - To load a model using `-hf` while using a custom mmproj file, use `--mmproj local_file.gguf`
+- Use `-m model.gguf` option with `--mmproj file.gguf` to specify text and multimodal projector respectively
+
+By default, multimodal projector will be offloaded to GPU. To disable this, add `--no-mmproj-offload`
+
+For example:
+
+```sh
+# simple usage with CLI
+llama-mtmd-cli -hf ggml-org/gemma-3-4b-it-GGUF
+
+# simple usage with server
+llama-server -hf ggml-org/gemma-3-4b-it-GGUF
+
+# using local file
+llama-server -m gemma-3-4b-it-Q4_K_M.gguf --mmproj mmproj-gemma-3-4b-it-Q4_K_M.gguf
+
+# no GPU offload
+llama-server -hf ggml-org/gemma-3-4b-it-GGUF --no-mmproj-offload
+```
+
+## Pre-quantized models
+
+These are ready-to-use models, most of them come with `Q4_K_M` quantization by default.
+
+Replaces the `(tool_name)` with the name of binary you want to use. For example, `llama-mtmd-cli` or `llama-server`
+
+NOTE: some models may require large context window, for example: `-c 8192`
+
+```sh
+# Gemma 3
+(tool_name) -hf ggml-org/gemma-3-4b-it-GGUF
+(tool_name) -hf ggml-org/gemma-3-12b-it-GGUF
+(tool_name) -hf ggml-org/gemma-3-27b-it-GGUF
+
+# SmolVLM
+(tool_name) -hf ggml-org/SmolVLM-Instruct-GGUF
+(tool_name) -hf ggml-org/SmolVLM-256M-Instruct-GGUF
+(tool_name) -hf ggml-org/SmolVLM-500M-Instruct-GGUF
+(tool_name) -hf ggml-org/SmolVLM2-2.2B-Instruct-GGUF
+(tool_name) -hf ggml-org/SmolVLM2-256M-Video-Instruct-GGUF
+(tool_name) -hf ggml-org/SmolVLM2-500M-Video-Instruct-GGUF
+
+# Pixtral 12B
+(tool_name) -hf ggml-org/pixtral-12b-GGUF
+
+# Qwen 2 VL
+(tool_name) -hf ggml-org/Qwen2-VL-2B-Instruct-GGUF
+(tool_name) -hf ggml-org/Qwen2-VL-7B-Instruct-GGUF
+
+# Qwen 2.5 VL
+(tool_name) -hf ggml-org/Qwen2.5-VL-3B-Instruct-GGUF
+(tool_name) -hf ggml-org/Qwen2.5-VL-7B-Instruct-GGUF
+(tool_name) -hf ggml-org/Qwen2.5-VL-32B-Instruct-GGUF
+(tool_name) -hf ggml-org/Qwen2.5-VL-72B-Instruct-GGUF
+
+# Mistral Small 3.1 24B (IQ2_M quantization)
+(tool_name) -hf ggml-org/Mistral-Small-3.1-24B-Instruct-2503-GGUF
+
+# InternVL 2.5 and 3
+(tool_name) -hf ggml-org/InternVL2_5-1B-GGUF
+(tool_name) -hf ggml-org/InternVL2_5-4B-GGUF
+(tool_name) -hf ggml-org/InternVL3-1B-Instruct-GGUF
+(tool_name) -hf ggml-org/InternVL3-2B-Instruct-GGUF
+(tool_name) -hf ggml-org/InternVL3-8B-Instruct-GGUF
+(tool_name) -hf ggml-org/InternVL3-14B-Instruct-GGUF
+```
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -32,6 +32,7 @@ else()
    add_subdirectory(speculative)
    add_subdirectory(speculative-simple)
    add_subdirectory(gen-docs)
+    add_subdirectory(training)
    if (NOT GGML_BACKEND_DL)
        add_subdirectory(convert-llama2c-to-ggml)
        # these examples use the backends directly and cannot be built with dynamic loading
--- a/examples/training/CMakeLists.txt
+++ b/examples/training/CMakeLists.txt
@@ -0,0 +1,5 @@
+set(TARGET llama-finetune)
+add_executable(${TARGET} finetune.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
--- a/examples/training/README.md
+++ b/examples/training/README.md
@@ -0,0 +1,17 @@
+# llama.cpp/examples/training
+
+This directory contains examples related to language model training using llama.cpp/GGML.
+So far finetuning is technically functional (for FP32 models and limited hardware setups) but the code is very much WIP.
+Finetuning of Stories 260K and LLaMA 3.2 1b seems to work with 24 GB of memory.
+**For CPU training, compile llama.cpp without any additional backends such as CUDA.**
+**For CUDA training, use the maximum number of GPU layers.**
+
+Proof of concept:
+
+``` sh
+export model_name=llama_3.2-1b && export quantization=f32
+./build/bin/finetune --file wikitext-2-raw/wiki.test.raw -ngl 999 --model models/${model_name}-${quantization}.gguf -c 512 -b 512 -ub 512
+./build/bin/perplexity --file wikitext-2-raw/wiki.test.raw -ngl 999 --model finetuned-model.gguf
+```
+
+The perplexity value of the finetuned model should be lower after training on the test set for 2 epochs.
--- a/examples/training/finetune.cpp
+++ b/examples/training/finetune.cpp
@@ -0,0 +1,96 @@
+#include "arg.h"
+#include "common.h"
+#include "log.h"
+#include "llama.h"
+
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <ctime>
+#include <vector>
+
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
+int main(int argc, char ** argv) {
+    common_params params;
+
+    params.escape = false;
+
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PERPLEXITY)) {
+        return 1;
+    }
+
+    if (params.use_mmap) {
+        LOG_INF("%s: force disabling memory mapping because it would result in-read-only pointers to the weights\n", __func__);
+        params.use_mmap = false;
+    }
+    if (params.cache_type_k != GGML_TYPE_F32) {
+        LOG_INF("%s: force changing k cache type to f32 due to a lack of f16 support for OUT_PROD\n", __func__);
+        params.cache_type_k = GGML_TYPE_F32;
+    }
+    if (params.cache_type_v != GGML_TYPE_F32) {
+        LOG_INF("%s: force changing v cache type to f32 due to a lack of f16 support for OUT_PROD\n", __func__);
+        params.cache_type_v = GGML_TYPE_F32;
+    }
+
+    common_init();
+    llama_backend_init();
+    llama_numa_init(params.numa);
+
+    // load the model and apply lora adapter, if any
+    common_init_result llama_init = common_init_from_params(params);
+    llama_model_ptr   & model = llama_init.model;
+    llama_context_ptr & ctx   = llama_init.context;
+
+    if (model == NULL) {
+        LOG_ERR("%s: unable to load model\n", __func__);
+        return 1;
+    }
+
+    // print system information
+    {
+        LOG_INF("\n");
+        LOG_INF("%s\n", common_params_get_system_info(params).c_str());
+    }
+
+    constexpr float val_split = 0.05f;
+
+    std::vector<llama_token> tokens = common_tokenize(ctx.get(), params.prompt, true);
+    ggml_opt_dataset_t dataset = common_opt_dataset_init(ctx.get(), tokens, llama_n_ctx(ctx.get())/2);
+
+    struct ggml_opt_optimizer_params optimizer_params = ggml_opt_get_default_optimizer_params(nullptr);
+    optimizer_params.adamw.alpha = 1e-7f; // learning rate
+
+    struct llama_opt_params lopt_params {
+        /*n_ctx_train     =*/ 0,
+        /*param_filter    =*/ llama_opt_param_filter_all,
+        /*param_filter_ud =*/ nullptr,
+        /*get_opt_pars    =*/ ggml_opt_get_constant_optimizer_params,
+        /*get_opt_pars_ud =*/ &optimizer_params,
+    };
+    llama_opt_init(ctx.get(), model.get(), lopt_params);
+
+    const int64_t idata_split = ggml_opt_dataset_ndata(dataset) * (1.0f - val_split);
+
+    ggml_opt_result_t result_train = ggml_opt_result_init();
+    ggml_opt_result_t result_eval  = ggml_opt_result_init();
+
+    for (int epoch = 0; epoch < 2; ++epoch) {
+        llama_opt_epoch(ctx.get(), dataset, result_train, result_eval, idata_split,
+            ggml_opt_epoch_callback_progress_bar, ggml_opt_epoch_callback_progress_bar);
+        fprintf(stderr, "\n");
+
+        ggml_opt_result_reset(result_train);
+        ggml_opt_result_reset(result_eval);
+    }
+    ggml_opt_result_free(result_train);
+    ggml_opt_result_free(result_eval);
+
+    llama_model_save_to_file(model.get(), "finetuned-model.gguf");
+
+    llama_backend_free();
+
+    return 0;
+}
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -248,7 +248,7 @@ extern "C" {
        // preferrably to run on the same backend as the buffer
        ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);

-        sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false);
+        sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false, true);

        // initialize buffers from a max size graph (optional)
        reserve_graph = build_graph(sched, max_batch_size);
@@ -289,7 +289,7 @@ extern "C" {
    typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);

    // Initialize a backend scheduler, backends with low index are given priority over backends with high index
-    GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel);
+    GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel, bool op_offload);
    GGML_API void                 ggml_backend_sched_free(ggml_backend_sched_t sched);

    // Initialize backend buffers from a measure graph
--- a/ggml/include/ggml-opt.h
+++ b/ggml/include/ggml-opt.h
@@ -37,13 +37,16 @@ extern "C" {
    // ====== Dataset ======

    GGML_API ggml_opt_dataset_t ggml_opt_dataset_init(
-            int64_t ne_datapoint, // number of elements per datapoint
-            int64_t ne_label,     // number of elements per label
-            int64_t ndata,        // total number of datapoints/labels
-            int64_t ndata_shard); // number of datapoints/labels per shard (unit at which the dataset is shuffled/copied)
+            enum ggml_type type_data,    // the type for the internal data tensor
+            enum ggml_type type_label,   // the type for the internal labels tensor
+            int64_t        ne_datapoint, // number of elements per datapoint
+            int64_t        ne_label,     // number of elements per label
+            int64_t        ndata,        // total number of datapoints/labels
+            int64_t        ndata_shard); // number of datapoints/labels per shard (unit at which the dataset is shuffled/copied)
    GGML_API void ggml_opt_dataset_free(ggml_opt_dataset_t dataset);

    // get underlying tensors that store the data
+    GGML_API int64_t              ggml_opt_dataset_ndata (ggml_opt_dataset_t dataset);
    GGML_API struct ggml_tensor * ggml_opt_dataset_data  (ggml_opt_dataset_t dataset); // shape = [ne_datapoint, ndata]
    GGML_API struct ggml_tensor * ggml_opt_dataset_labels(ggml_opt_dataset_t dataset); // shape = [nd_label,     ndata]

@@ -56,13 +59,19 @@ extern "C" {
            struct ggml_tensor * data_batch,   // shape = [ne_datapoint, ndata_batch]
            struct ggml_tensor * labels_batch, // shape = [ne_label,     ndata_batch]
            int64_t              ibatch);
+    GGML_API void ggml_opt_dataset_get_batch_host(
+            ggml_opt_dataset_t   dataset,
+            void               * data_batch,
+            size_t               nb_data_batch,
+            void               * labels_batch,
+            int64_t              ibatch);

    // ====== Model / Context ======

    enum ggml_opt_build_type {
-        GGML_OPT_BUILD_TYPE_FORWARD,
-        GGML_OPT_BUILD_TYPE_GRAD,
-        GGML_OPT_BUILD_TYPE_OPT,
+        GGML_OPT_BUILD_TYPE_FORWARD = 10,
+        GGML_OPT_BUILD_TYPE_GRAD    = 20,
+        GGML_OPT_BUILD_TYPE_OPT     = 30,
    };

    // parameters that control which optimizer is used and how said optimizer tries to find the minimal loss
@@ -81,20 +90,22 @@ extern "C" {
    // userdata can be used to pass arbitrary data
    typedef struct ggml_opt_optimizer_params (*ggml_opt_get_optimizer_params)(void * userdata);

-    // returns the default optimizer params (constant)
+    // returns the default optimizer params (constant, hard-coded values)
    // userdata is not used
    GGML_API struct ggml_opt_optimizer_params ggml_opt_get_default_optimizer_params(void * userdata);

+    // casts userdata to ggml_opt_optimizer_params and returns it
+    GGML_API struct ggml_opt_optimizer_params ggml_opt_get_constant_optimizer_params(void * userdata);
+
    // parameters for initializing a new optimization context
    struct ggml_opt_params {
        ggml_backend_sched_t backend_sched; // defines which backends are used to construct the compute graphs

-        struct ggml_context * ctx_compute; // created in user code, holds non-static tensors
-
-        // the forward graph is defined by inputs and outputs
-        // those tensors and all tensors inbetween are not intended to be reusable between multiple optimization contexts
-        struct ggml_tensor * inputs;
-        struct ggml_tensor * outputs;
+        // by default the forward graph needs to be reconstructed for each eval
+        // if ctx_compute, inputs, and outputs are set the graphs are instead allocated statically
+        struct ggml_context * ctx_compute;
+        struct ggml_tensor  * inputs;
+        struct ggml_tensor  * outputs;

        enum ggml_opt_loss_type  loss_type;
        enum ggml_opt_build_type build_type;
@@ -107,12 +118,9 @@ extern "C" {

    // get parameters for an optimization context with defaults set where possible
    // parameters for which no sensible defaults exist are supplied as arguments to this function
-    GGML_API ggml_opt_params ggml_opt_default_params(
-            ggml_backend_sched_t      backend_sched,
-            struct ggml_context     * ctx_compute,
-            struct ggml_tensor      * inputs,
-            struct ggml_tensor      * outputs,
-            enum ggml_opt_loss_type   loss_type);
+    GGML_API struct ggml_opt_params ggml_opt_default_params(
+            ggml_backend_sched_t    backend_sched,
+            enum ggml_opt_loss_type loss_type);

    GGML_API ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params);
    GGML_API void ggml_opt_free(ggml_opt_context_t opt_ctx);
@@ -121,6 +129,7 @@ extern "C" {
    GGML_API void ggml_opt_reset(ggml_opt_context_t opt_ctx, bool optimizer);

    // get underlying tensors that store data
+    // if not using static graphs these pointers become invalid with the next call to ggml_opt_alloc
    GGML_API struct ggml_tensor * ggml_opt_inputs(  ggml_opt_context_t opt_ctx); // forward graph input tensor
    GGML_API struct ggml_tensor * ggml_opt_outputs( ggml_opt_context_t opt_ctx); // forward graph output tensor
    GGML_API struct ggml_tensor * ggml_opt_labels(  ggml_opt_context_t opt_ctx); // labels to compare outputs against
@@ -128,11 +137,12 @@ extern "C" {
    GGML_API struct ggml_tensor * ggml_opt_pred(    ggml_opt_context_t opt_ctx); // predictions made by outputs
    GGML_API struct ggml_tensor * ggml_opt_ncorrect(ggml_opt_context_t opt_ctx); // number of matching predictions between outputs and labels

+    // get the gradient accumulator for a node from the forward graph
    GGML_API struct ggml_tensor * ggml_opt_grad_acc(ggml_opt_context_t opt_ctx, struct ggml_tensor * node);

    // ====== Optimization Result ======

-    GGML_API ggml_opt_result_t ggml_opt_result_init();
+    GGML_API ggml_opt_result_t ggml_opt_result_init(void);
    GGML_API void ggml_opt_result_free(ggml_opt_result_t result);
    GGML_API void ggml_opt_result_reset(ggml_opt_result_t result);

@@ -144,11 +154,20 @@ extern "C" {

    // ====== Computation ======

-    // do forward pass, increment result if not NULL
-    GGML_API void ggml_opt_forward(ggml_opt_context_t opt_ctx, ggml_opt_result_t result);
+    // if not using static graphs, this function must be called prior to ggml_opt_alloc
+    GGML_API void ggml_opt_prepare_alloc(
+        ggml_opt_context_t    opt_ctx,
+        struct ggml_context * ctx_compute,
+        struct ggml_cgraph  * gf,
+        struct ggml_tensor  * inputs,
+        struct ggml_tensor  * outputs);

-    // do forward pass, increment result if not NULL, do backward pass
-    GGML_API void ggml_opt_forward_backward(ggml_opt_context_t opt_ctx, ggml_opt_result_t result);
+    // allocate the next graph for evaluation, either forward or forward + backward
+    // must be called exactly once prior to calling ggml_opt_eval
+    GGML_API void ggml_opt_alloc(ggml_opt_context_t opt_ctx, bool backward);
+
+    // do forward pass, increment result if not NULL, do backward pass if allocated
+    GGML_API void ggml_opt_eval(ggml_opt_context_t opt_ctx, ggml_opt_result_t result);

    // ############################################################################
    // ## The high-level functions start here. They do not depend on any private ##
@@ -200,9 +219,9 @@ extern "C" {
    // fit model defined by inputs and outputs to dataset
    GGML_API void ggml_opt_fit(
            ggml_backend_sched_t            backend_sched,  // backend scheduler for constructing the compute graphs
-            ggml_context                  * ctx_compute,    // context with temporarily allocated tensors to calculate the outputs
-            ggml_tensor                   * inputs,         // input tensor with shape [ne_datapoint, ndata_batch]
-            ggml_tensor                   * outputs,        // output tensor, must have shape [ne_label, ndata_batch] if labels are used
+            struct ggml_context           * ctx_compute,    // context with temporarily allocated tensors to calculate the outputs
+            struct ggml_tensor            * inputs,         // input tensor with shape [ne_datapoint, ndata_batch]
+            struct ggml_tensor            * outputs,        // output tensor, must have shape [ne_label, ndata_batch] if labels are used
            ggml_opt_dataset_t              dataset,        // dataset with data and optionally also labels
            enum ggml_opt_loss_type         loss_type,      // loss to minimize
            ggml_opt_get_optimizer_params   get_opt_pars,   // callback to get optimizer params, userdata is pointer to epoch (of type int64_t)
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -768,7 +768,7 @@ extern "C" {
    // Tensor flags
    GGML_API void ggml_set_input(struct ggml_tensor * tensor);
    GGML_API void ggml_set_output(struct ggml_tensor * tensor);
-    GGML_API void ggml_set_param(struct ggml_context * ctx, struct ggml_tensor * tensor);
+    GGML_API void ggml_set_param(struct ggml_tensor * tensor);
    GGML_API void ggml_set_loss(struct ggml_tensor * tensor);

    //
@@ -938,7 +938,7 @@ extern "C" {
    GGML_API struct ggml_tensor * ggml_repeat_back(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
+            struct ggml_tensor  * b); // sum up values that are adjacent in dims > 0 instead of repeated with same stride

    // concat a and b along dim
    // used in stable-diffusion
@@ -2049,15 +2049,14 @@ extern "C" {

    GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
    GGML_API void ggml_build_backward_expand(
-        struct ggml_context * ctx_static,  // context for static gradients (loss + gradient accumulation)
-        struct ggml_context * ctx_compute, // context for gradient computation
-        struct ggml_cgraph  * cgraph,
-        bool                  accumulate); // whether or not gradients should be accumulated, requires static allocation of tensors in ctx_static
+        struct ggml_context *  ctx,        // context for gradient computation
+        struct ggml_cgraph  *  cgraph,
+        struct ggml_tensor  ** grad_accs);

    // graph allocation in a context
    GGML_API struct ggml_cgraph * ggml_new_graph       (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
    GGML_API struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads);
-    GGML_API struct ggml_cgraph * ggml_graph_dup       (struct ggml_context * ctx, struct ggml_cgraph * cgraph);
+    GGML_API struct ggml_cgraph * ggml_graph_dup       (struct ggml_context * ctx, struct ggml_cgraph * cgraph, bool force_grads);
    GGML_API void                 ggml_graph_cpy       (struct ggml_cgraph * src, struct ggml_cgraph * dst);
    GGML_API void                 ggml_graph_reset     (struct ggml_cgraph * cgraph); // set regular grads + optimizer momenta to 0, set loss grad to 1
    GGML_API void                 ggml_graph_clear     (struct ggml_cgraph * cgraph);
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -674,6 +674,8 @@ struct ggml_backend_sched {
    char * context_buffer;
    size_t context_buffer_size;

+    bool op_offload;
+
    int debug;
 };

@@ -766,7 +768,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
        if (tensor->op != GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
            int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor);
            // check if a backend with higher prio wants to offload the op
-            if (src_backend_id == sched->n_backends - 1 && ggml_backend_buffer_is_host(src->buffer)) {
+            if (sched->op_offload && src_backend_id == sched->n_backends - 1 && ggml_backend_buffer_is_host(src->buffer)) {
                for (int b = 0; b < src_backend_id; b++) {
                    if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) {
                        SET_CAUSE(tensor, "1.off");
@@ -1109,7 +1111,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg

            const int node_backend_id = tensor_backend_id(node);

-            assert(node_backend_id != -1); // all nodes should be assigned by now
+            assert(node_backend_id != -1); // all nodes should be assigned by now, this can happen if there is no CPU fallback

            // check if we should start a new split based on the sources of the current node
            bool need_new_split = false;
@@ -1452,7 +1454,8 @@ ggml_backend_sched_t ggml_backend_sched_new(
        ggml_backend_buffer_type_t * bufts,
        int n_backends,
        size_t graph_size,
-        bool parallel) {
+        bool parallel,
+        bool op_offload) {
    GGML_ASSERT(n_backends > 0);
    GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
    GGML_ASSERT(ggml_backend_dev_type(ggml_backend_get_device(backends[n_backends - 1])) == GGML_BACKEND_DEVICE_TYPE_CPU);
@@ -1497,6 +1500,7 @@ ggml_backend_sched_t ggml_backend_sched_new(
    }

    sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends);
+    sched->op_offload = op_offload;

    ggml_backend_sched_reset(sched);

--- a/ggml/src/ggml-cpu/CMakeLists.txt
+++ b/ggml/src/ggml-cpu/CMakeLists.txt
@@ -428,6 +428,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
            ${KLEIDIAI_SRC}/kai/ukernels/
            ${KLEIDIAI_SRC}/kai/ukernels/matmul/
            ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/
+            ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/
            ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/)

        set(ARCH_FLAGS_TEMP "${ARCH_FLAGS}")
@@ -438,17 +439,19 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
        string(FIND "${ARCH_FLAGS_TEMP}" "+i8mm" I8MM_ENABLED)
        string(FIND "${ARCH_FLAGS_TEMP}" "+sme" SME_ENABLED)

-        set(PRIVATE_ARCH_FLAGS ${ARCH_FLAGS})
+        set(PRIVATE_ARCH_FLAGS ${ARCH_FLAGS_TEMP})

-        list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32.c)
-        list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.c)
-        list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.c)
-        list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.c)
+        list(APPEND GGML_KLEIDIAI_SOURCES
+            ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32.c
+            ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.c
+            ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_quant_pack_qsi8d32p_f32_neon.c
+            ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.c)

        if (NOT DOTPROD_ENABLED MATCHES -1)
-            list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.c)
-            list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod.c)
-            list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod.c)
+            list(APPEND GGML_KLEIDIAI_SOURCES
+                ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.c
+                ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod.c
+                ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod.c)
        endif()

        if (NOT I8MM_ENABLED MATCHES -1)
@@ -456,9 +459,13 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
        endif()

        if (NOT SME_ENABLED MATCHES -1)
-            list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.c)
-            list(APPEND GGML_KLEIDIAI_SOURCES ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.c)
-            set(PRIVATE_ARCH_FLAGS "${PRIVATE_ARCH_FLAGS}+sve+sve2")
+            list(APPEND GGML_KLEIDIAI_SOURCES
+                ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.c
+                ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_f32_qsi8d32p_qsi4c32p/kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.c
+                ${KLEIDIAI_SRC}/kai/ukernels/matmul/matmul_clamp_fp32_bf16p_bf16p/kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa.c
+                ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_lhs_pack_bf16p2vlx2_f32_sme.c
+                ${KLEIDIAI_SRC}/kai/ukernels/matmul/pack/kai_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme.c)
+            set(PRIVATE_ARCH_FLAGS "-fno-tree-vectorize;${PRIVATE_ARCH_FLAGS}+sve+sve2")
        endif()

        set_source_files_properties(${GGML_KLEIDIAI_SOURCES} PROPERTIES COMPILE_OPTIONS "${PRIVATE_ARCH_FLAGS}")
--- a/ggml/src/ggml-cpu/kleidiai/kernels.cpp
+++ b/ggml/src/ggml-cpu/kleidiai/kernels.cpp
@@ -4,16 +4,22 @@

 // KleidiAI micro-kernels
 #include "kai_matmul_clamp_f32_qsi8d32p_qsi4c32p_interface.h"
-#include "kai_lhs_quant_pack_qsi8d32p_f32.h"
-#include "kai_lhs_quant_pack_qsi8d32p_f32_neon.h"
-#include "kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.h"
-#include "kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.h"
 #include "kai_matmul_clamp_f32_qsi8d32p1x8_qsi4c32p4x8_1x4x32_neon_dotprod.h"
 #include "kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4x4_1x4_neon_dotprod.h"
 #include "kai_matmul_clamp_f32_qsi8d32p4x4_qsi4c32p4x4_16x4_neon_dotprod.h"
 #include "kai_matmul_clamp_f32_qsi8d32p4x8_qsi4c32p4x8_16x4_neon_i8mm.h"
 #include "kai_matmul_clamp_f32_qsi8d32p1vlx4_qsi4c32p4vlx4_1vlx4vl_sme2_mopa.h"
 #include "kai_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot.h"
+#include "kai_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa.h"
+
+#include "kai_lhs_pack_bf16p2vlx2_f32_sme.h"
+#include "kai_lhs_quant_pack_qsi8d32p_f32.h"
+#include "kai_lhs_quant_pack_qsi8d32p_f32_neon.h"
+
+#include "kai_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme.h"
+#include "kai_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0.h"
+#include "kai_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon.h"
+
 #include "kai_common.h"

 #include "kernels.h"
@@ -61,6 +67,53 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
            /* .pack_func   = */ kai_run_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon,
        },
        /* .required_cpu       = */ CPU_FEATURE_SME,
+        /* .lhs_type           = */ GGML_TYPE_F32,
+        /* .rhs_type           = */ GGML_TYPE_Q4_0,
+        /* .op_type            = */ GGML_TYPE_F32,
+    },
+    {
+        /* SME GEMM */
+        /* .kern_info = */ {
+            /* .get_m_step            = */ kai_get_m_step_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
+            /* .get_n_step            = */ kai_get_n_step_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
+            /* .get_mr                = */ kai_get_mr_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
+            /* .get_nr                = */ kai_get_nr_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
+            /* .get_kr                = */ kai_get_kr_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
+            /* .get_sr                = */ kai_get_sr_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
+            /* .get_lhs_offset        = */ kai_get_lhs_packed_offset_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
+            /* .get_rhs_packed_offset = */ kai_get_rhs_packed_offset_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
+            /* .get_dst_offset        = */ kai_get_dst_offset_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
+            /* .get_dst_size          = */ kai_get_dst_size_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
+            /* .run_kernel            = */ kai_run_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
+        },
+        /* SME GEMV */
+        /* .kern_info = */ {
+            /* .get_m_step            = */ kai_get_m_step_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
+            /* .get_n_step            = */ kai_get_n_step_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
+            /* .get_mr                = */ kai_get_mr_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
+            /* .get_nr                = */ kai_get_nr_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
+            /* .get_kr                = */ kai_get_kr_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
+            /* .get_sr                = */ kai_get_sr_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
+            /* .get_lhs_offset        = */ kai_get_lhs_packed_offset_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
+            /* .get_rhs_packed_offset = */ kai_get_rhs_packed_offset_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
+            /* .get_dst_offset        = */ kai_get_dst_offset_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
+            /* .get_dst_size          = */ kai_get_dst_size_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
+            /* .run_kernel            = */ kai_run_matmul_clamp_f32_bf16p2vlx2_bf16p2vlx2_2vlx2vl_sme2_mopa,
+        },
+        /* .lhs_info = */ {
+            /* .get_offset            = */ kai_get_lhs_offset_lhs_pack_bf16p2vlx2_f32_sme,
+            /* .get_packed_offset     = */ kai_get_lhs_packed_offset_lhs_pack_bf16p2vlx2_f32_sme,
+            /* .packed_size           = */ kai_get_lhs_packed_size_lhs_pack_bf16p2vlx2_f32_sme,
+            /* .pack_func             = */ kai_run_lhs_pack_bf16p2vlx2_f32_sme,
+        },
+        /* .rhs_info = */ {
+            /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme,
+            /* .pack_func   = */ kai_run_rhs_pack_kxn_bf16p2vlx2b_f32_x32_sme,
+        },
+        /* .required_cpu       = */ CPU_FEATURE_SME,
+        /* .lhs_type           = */ GGML_TYPE_F32,
+        /* .rhs_type           = */ GGML_TYPE_F16,
+        /* .op_type            = */ GGML_TYPE_F32,
    },
 #endif
 #if defined(__APPLE__)
@@ -105,6 +158,9 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
            /* .pack_func   = */ kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
        },
        /* .required_cpu       = */ CPU_FEATURE_DOTPROD,
+        /* .lhs_type           = */ GGML_TYPE_F32,
+        /* .rhs_type           = */ GGML_TYPE_Q4_0,
+        /* .op_type            = */ GGML_TYPE_F32,
    },
 #endif
 #if defined(__ARM_FEATURE_MATMUL_INT8)
@@ -148,6 +204,9 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
            /* .pack_func   = */ kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
        },
        /* .required_cpu       = */ CPU_FEATURE_DOTPROD | CPU_FEATURE_I8MM,
+        /* .lhs_type           = */ GGML_TYPE_F32,
+        /* .rhs_type           = */ GGML_TYPE_Q4_0,
+        /* .op_type            = */ GGML_TYPE_F32,
    },
 #endif
 #else
@@ -192,6 +251,9 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
            /* .pack_func   = */ kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
        },
        /* .required_cpu       = */ CPU_FEATURE_DOTPROD | CPU_FEATURE_I8MM,
+        /* .lhs_type           = */ GGML_TYPE_F32,
+        /* .rhs_type           = */ GGML_TYPE_Q4_0,
+        /* .op_type            = */ GGML_TYPE_F32,
    },
 #endif
 #if defined(__ARM_FEATURE_DOTPROD)
@@ -235,12 +297,33 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
            /* .pack_func   = */ kai_run_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
        },
        /* .required_cpu       = */ CPU_FEATURE_DOTPROD,
+        /* .lhs_type           = */ GGML_TYPE_F32,
+        /* .rhs_type           = */ GGML_TYPE_Q4_0,
+        /* .op_type            = */ GGML_TYPE_F32,
    },
 #endif
 #endif
 };

-ggml_kleidiai_kernels * ggml_kleidiai_select_kernels(cpu_feature features) {
+ggml_kleidiai_kernels * ggml_kleidiai_select_kernels(cpu_feature cpu_features, const ggml_tensor * tensor) {
+    ggml_kleidiai_kernels * kernel = nullptr;
+
+    if (tensor->op == GGML_OP_MUL_MAT && tensor->src[0] != nullptr && tensor->src[1] != nullptr) {
+        for (size_t i = 0; i < NELEMS(gemm_gemv_kernels); ++i) {
+            if ((cpu_features & gemm_gemv_kernels[i].required_cpu) == gemm_gemv_kernels[i].required_cpu &&
+                gemm_gemv_kernels[i].lhs_type == tensor->src[1]->type &&
+                gemm_gemv_kernels[i].rhs_type == tensor->src[0]->type &&
+                gemm_gemv_kernels[i].op_type  == tensor->type) {
+                kernel = &gemm_gemv_kernels[i];
+                break;
+            }
+        }
+    }
+
+    return kernel;
+}
+
+ggml_kleidiai_kernels * ggml_kleidiai_select_kernels_q4_0(cpu_feature features) {
    ggml_kleidiai_kernels * kernels = nullptr;

    for (size_t i = 0; i < NELEMS(gemm_gemv_kernels); ++i) {
--- a/ggml/src/ggml-cpu/kleidiai/kernels.h
+++ b/ggml/src/ggml-cpu/kleidiai/kernels.h
@@ -4,6 +4,9 @@

 #pragma once

+#include <functional>
+#include "ggml.h"
+
 enum cpu_feature {
    CPU_FEATURE_NONE    = 0,
    CPU_FEATURE_DOTPROD = 1,
@@ -26,26 +29,53 @@ struct kernel_info {
    size_t (*get_nr)(void);
    size_t (*get_kr)(void);
    size_t (*get_sr)(void);
-    size_t (*get_lhs_offset)(size_t m_idx, size_t k, size_t bl);
-    size_t (*get_rhs_packed_offset)(size_t n_idx, size_t k, size_t bl);
+    std::variant<
+        std::function<size_t(size_t n_idx, size_t k, size_t bl)>,
+        std::function<size_t(size_t m_idx, size_t k)>
+    > get_lhs_offset;
+    std::variant<
+        std::function<size_t(size_t n_idx, size_t k, size_t bl)>,
+        std::function<size_t(size_t n_idx, size_t k)>
+    > get_rhs_packed_offset;
    size_t (*get_dst_offset)(size_t m_idx, size_t n_idx, size_t stride);
    size_t (*get_dst_size)(size_t m, size_t n);
-    void (*run_kernel)(size_t m, size_t n, size_t k, size_t bl, const void* lhs_packed, const void* rhs_packed,
-                         float* dst, size_t dst_stride_row, size_t dst_stride_col, float scalar_min, float scalar_max);
+    std::variant<
+        std::function<void(size_t m, size_t n, size_t k, size_t bl, const void* lhs_packed, const void* rhs_packed,
+            float* dst, size_t dst_stride_row, size_t dst_stride_col, float scalar_min, float scalar_max)>,
+        std::function<void(size_t m, size_t n, size_t k, const void* lhs_packed, const void* rhs_packed, void* dst, size_t dst_stride_row,
+            size_t dst_stride_col, float clamp_min, float clamp_max)>
+    > run_kernel;
 };

 struct lhs_packing_info {
    size_t (*get_offset)(size_t m_idx, size_t lhs_stride);
-    size_t (*get_packed_offset)(size_t m_idx, size_t k, size_t bl, size_t mr, size_t kr, size_t sr);
-    size_t (*packed_size)(size_t m, size_t k, size_t bl, size_t mr, size_t kr, size_t sr);
-    void (*pack_func)(size_t m, size_t k, size_t bl, size_t mr, size_t kr, size_t sr, size_t m_idx_start, const float* lhs,
-                      size_t lhs_stride, void* lhs_packed);
+    std::variant<
+        std::function<size_t(size_t m_idx, size_t k, size_t bl, size_t mr, size_t kr, size_t sr)>,
+        std::function<size_t(size_t m_idx, size_t k, size_t mr, size_t kr, size_t sr)>
+    > get_packed_offset;
+    std::variant<
+        std::function<size_t(size_t m_idx, size_t k, size_t bl, size_t mr, size_t kr, size_t sr)>,
+        std::function<size_t(size_t m, size_t k, size_t mr, size_t kr, size_t sr)>
+    > packed_size;
+    std::variant<
+        std::function<void(size_t m, size_t k, size_t bl, size_t mr, size_t kr, size_t sr, size_t m_idx_start, const float* lhs,
+            size_t lhs_stride, void* lhs_packed)>,
+        std::function<void(size_t m, size_t k, size_t mr, size_t kr, size_t sr, size_t m_idx_start, const void* lhs, size_t lhs_stride,
+        void* lhs_packed)>
+    > pack_func;
 };

 struct rhs_packing_info {
-    size_t (*packed_size)(size_t n, size_t k, size_t nr, size_t kr, size_t bl);
-    void (*pack_func)(size_t num_groups, size_t n, size_t k, size_t nr, size_t kr, size_t sr, size_t bl, const uint8_t* rhs,
-                      const float* bias, void* rhs_packed, size_t extra_bytes, const struct kai_rhs_pack_qs4cxs1s0_param* params);
+    std::variant<
+        std::function<size_t(size_t n, size_t k, size_t nr, size_t kr, size_t bl)>,
+        std::function<size_t(size_t n, size_t k)>
+    > packed_size;
+    std::variant<
+        std::function<void(size_t num_groups, size_t n, size_t k, size_t nr, size_t kr, size_t sr, size_t bl, const uint8_t* rhs,
+            const float* bias, void* rhs_packed, size_t extra_bytes, const struct kai_rhs_pack_qs4cxs1s0_param* params)>,
+        std::function<void(size_t num_groups, size_t n, size_t k, size_t nr, size_t kr, size_t sr, size_t rhs_stride, const void* rhs,
+            const void* bias, const void* scale, void* rhs_packed, size_t extra_bytes, const void* params)>
+    > pack_func;
 };

 struct ggml_kleidiai_kernels {
@@ -55,6 +85,10 @@ struct ggml_kleidiai_kernels {
    rhs_packing_info rhs_info;

    cpu_feature required_cpu;
+    ggml_type lhs_type;
+    ggml_type rhs_type;
+    ggml_type op_type;
 };

-ggml_kleidiai_kernels * ggml_kleidiai_select_kernels(cpu_feature cpu_features);
+ggml_kleidiai_kernels * ggml_kleidiai_select_kernels(cpu_feature cpu_features, const ggml_tensor * tensor);
+ggml_kleidiai_kernels * ggml_kleidiai_select_kernels_q4_0(cpu_feature features);
--- a/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp
+++ b/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp
@@ -34,8 +34,9 @@
 #include "ggml-common.h"

 struct ggml_kleidiai_context {
+    cpu_feature features;
    ggml_kleidiai_kernels * kernels;
-} static ctx = { NULL };
+} static ctx = { CPU_FEATURE_NONE, NULL };

 static void init_kleidiai_context(void) {

@@ -47,18 +48,18 @@ static void init_kleidiai_context(void) {
        const char *env_var = getenv("GGML_KLEIDIAI_SME");
        int sme_enabled = 0;

-        cpu_feature features  = (ggml_cpu_has_dotprod()     ? CPU_FEATURE_DOTPROD : CPU_FEATURE_NONE) |
-                                (ggml_cpu_has_matmul_int8() ? CPU_FEATURE_I8MM    : CPU_FEATURE_NONE) |
-                                (ggml_cpu_has_sve()         ? CPU_FEATURE_SVE     : CPU_FEATURE_NONE);
+        ctx.features  = (ggml_cpu_has_dotprod()     ? CPU_FEATURE_DOTPROD : CPU_FEATURE_NONE) |
+                        (ggml_cpu_has_matmul_int8() ? CPU_FEATURE_I8MM    : CPU_FEATURE_NONE) |
+                        (ggml_cpu_has_sve()         ? CPU_FEATURE_SVE     : CPU_FEATURE_NONE);

        if (env_var) {
            sme_enabled = atoi(env_var);
        }

        if (sme_enabled != 0) {
-            features |= ggml_cpu_has_sme() ? CPU_FEATURE_SME : CPU_FEATURE_NONE;
+            ctx.features |= ggml_cpu_has_sme() ? CPU_FEATURE_SME : CPU_FEATURE_NONE;
        }
-        ctx.kernels = ggml_kleidiai_select_kernels(features);
+        ctx.kernels = ggml_kleidiai_select_kernels_q4_0(ctx.features);
    }
    ggml_critical_section_end();
 }
@@ -68,95 +69,275 @@ static inline int64_t ggml_ne(const ggml_tensor * tensor, int dim) {
    return tensor->ne[dim];
 }

+template<typename Ret, typename Variant, typename... Args>
+static Ret variant_call(const Variant & var, Args&&... args) {
+    return std::visit([&](auto&& func) -> Ret {
+        if constexpr (std::is_invocable_r_v<Ret, decltype(func), Args...>) {
+            return func(std::forward<Args>(args)...);
+        } else {
+            throw std::runtime_error("Invalid function type in variant_call");
+        }
+    }, var);
+}
+
 namespace ggml::cpu::kleidiai {
+
+static size_t round_down(size_t x, size_t y) {
+    return y == 0 ? x : x - (x % y);
+}
+
+static void transpose_f32kxn_f16nxk(size_t n, size_t k, float * dst, const uint16_t * src, size_t rhs_stride) {
+    size_t src_stride = rhs_stride / sizeof(uint16_t);
+    size_t dst_stride = n;
+
+    for (size_t k_idx = 0; k_idx < k; ++k_idx) {
+        for (size_t n_idx = 0; n_idx < n; ++n_idx) {
+            uint16_t v = *(src + k_idx + n_idx * src_stride);
+            *(dst + n_idx + k_idx * dst_stride) = kai_cast_f32_f16(v);
+        }
+    }
+}
+
 class tensor_traits : public ggml::cpu::tensor_traits {
    bool work_size(int /* n_threads */, const struct ggml_tensor * op, size_t & size) override {
-        GGML_ASSERT(ctx.kernels);
-        kernel_info * kernel = op->src[1]->ne[1] == 1 ? &ctx.kernels->gemv : &ctx.kernels->gemm;
+        ggml_kleidiai_kernels *kernels = ggml_kleidiai_select_kernels(ctx.features, op);
+        GGML_ASSERT(kernels);
+        kernel_info * kernel = op->src[1]->ne[1] == 1 ? &kernels->gemv : &kernels->gemm;

        size_t k = op->src[0]->ne[0];
+        size_t n = op->src[0]->ne[1];
        size_t m = op->src[1]->ne[1];

        size_t mr = kernel->get_mr();
        size_t kr = kernel->get_kr();
        size_t sr = kernel->get_sr();

-        size = ctx.kernels->lhs_info.packed_size(m, k, QK4_0, mr, kr, sr);
+        if (kernels->rhs_type == GGML_TYPE_Q4_0) {
+            size = variant_call<size_t>(kernels->lhs_info.packed_size, m, k, QK4_0, mr, kr, sr);
+        } else if (kernels->rhs_type == GGML_TYPE_F16) {
+            size = variant_call<size_t>(kernels->lhs_info.packed_size, m, k, mr, kr, sr) +
+                   variant_call<size_t>(kernels->rhs_info.packed_size, n, k) +
+                   k * n * sizeof(float) + n * sizeof(float);
+        } else {
+            GGML_ASSERT(false);
+        }

        return true;
    }

+
    bool compute_forward(struct ggml_compute_params * params, struct ggml_tensor * dst) override {
        if (dst->op == GGML_OP_MUL_MAT) {
-            const ggml_tensor * src0 = dst->src[0];
-            const ggml_tensor * src1 = dst->src[1];
+            if (dst->src[0]->type == GGML_TYPE_Q4_0) {
+                return compute_forward_q4_0(params, dst);
+            } else if (dst->src[0]->type == GGML_TYPE_F16) {
+                return compute_forward_kv_cache(params, dst);
+            }
+        }
+        return false;
+    }

-            GGML_TENSOR_BINARY_OP_LOCALS
+    bool compute_forward_kv_cache(ggml_compute_params * params, struct ggml_tensor * dst) {
+        static std::atomic_flag first_to_arrive = ATOMIC_FLAG_INIT;

-            GGML_ASSERT(ctx.kernels);
-            kernel_info * kernel = src1->ne[1] == 1 ? &ctx.kernels->gemv : &ctx.kernels->gemm;
-            lhs_packing_info * lhs_info = &ctx.kernels->lhs_info;
+        const ggml_tensor * src0 = dst->src[0];
+        const ggml_tensor * src1 = dst->src[1];

-            GGML_ASSERT(kernel);
+        GGML_TENSOR_BINARY_OP_LOCALS

-            const int ith = params->ith;
-            const int nth = params->nth;
+        ggml_kleidiai_kernels *kernels = ggml_kleidiai_select_kernels(ctx.features, dst);
+        GGML_ASSERT(kernels);

-            const size_t k = ne00;
-            const size_t m = ne11;
-            const size_t n = ne01;
+        kernel_info * kernel = src1->ne[1] == 1 ? &kernels->gemv : &kernels->gemm;
+        GGML_ASSERT(kernel);

-            const size_t n_step = kernel->get_n_step();
-            const size_t num_n_per_thread = kai_roundup(kai_roundup(n, nth) / nth, n_step);
-            const size_t n_start = ith * num_n_per_thread;
+        const int nth = params->nth;
+        const int ith = params->ith;

-            size_t n_to_process = num_n_per_thread;
-            if ((n_start + n_to_process) > n) {
-                n_to_process = n - n_start;
+        const int64_t lhs_batch_size0 = ne12;
+        const int64_t rhs_batch_size0 = ne02;
+        const int64_t batch_size      = rhs_batch_size0;
+
+        const int64_t r = lhs_batch_size0 / rhs_batch_size0;
+
+        const int64_t m = ne11 * r;
+        const int64_t n = ne01;
+        const int64_t k = ne00;
+
+        const size_t lhs_stride = src1->nb[1];
+        const size_t rhs_stride = src0->nb[1];
+        const size_t dst_stride = dst->nb[1];
+
+        const int64_t mr = static_cast<int64_t>(kernel->get_mr());
+        const int64_t nr = static_cast<int64_t>(kernel->get_nr());
+        const int64_t kr = static_cast<int64_t>(kernel->get_kr());
+        const int64_t sr = static_cast<int64_t>(kernel->get_sr());
+
+        const size_t lhs_packed_size = variant_call<size_t>(kernels->lhs_info.packed_size, m, k, mr, kr, sr);
+        const size_t rhs_packed_size = variant_call<size_t>(kernels->rhs_info.packed_size, n, k);
+        const size_t kxn_size        = k * n * sizeof(float);
+        const size_t bias_size       = n * sizeof(float);
+
+        const size_t wsize_required = lhs_packed_size + rhs_packed_size + kxn_size + bias_size;
+        GGML_ASSERT(wsize_required <= params->wsize);
+
+        uint8_t * lhs_packed = static_cast<uint8_t *>(params->wdata);
+        uint8_t * rhs_packed = lhs_packed + lhs_packed_size;
+        uint8_t * rhs_kxn    = rhs_packed + rhs_packed_size;
+        uint8_t * bias       = rhs_kxn + kxn_size;
+
+        for (int64_t batch_idx = 0; batch_idx < batch_size; ++batch_idx) {
+            const uint8_t * lhs_batch = static_cast<const uint8_t *>(src1->data) + batch_idx * m * lhs_stride;
+            const uint8_t * rhs_batch = static_cast<const uint8_t *>(src0->data) + batch_idx * n * rhs_stride;
+            uint8_t * dst_batch       = static_cast<uint8_t *>(dst->data) + batch_idx * m * dst_stride;
+
+            // LHS packing
+            {
+                const int64_t m_roundup_mr = kai_roundup(m, mr);
+                const int64_t num_threads  = KAI_MIN(m_roundup_mr / mr, nth);
+
+                if (ith < num_threads) {
+                    const int64_t num_m_per_thread0   = round_down(m_roundup_mr / num_threads, mr);
+                    const int64_t num_m_per_threadN_1 = m - (num_threads - 1) * num_m_per_thread0;
+
+                    const int64_t m_start          = ith * num_m_per_thread0;
+                    const int64_t num_m_per_thread = (ith == num_threads - 1) ? num_m_per_threadN_1 : num_m_per_thread0;
+
+                    const size_t lhs_offset        = variant_call<size_t>(kernels->gemm.get_lhs_offset, m_start, lhs_stride);
+                    const size_t lhs_packed_offset = variant_call<size_t>(kernels->lhs_info.get_packed_offset, m_start, k, mr, kr, sr);
+
+                    const void * src_ptr = static_cast<const uint8_t *>(lhs_batch) + lhs_offset;
+                    void * dst_ptr       = static_cast<uint8_t *>(lhs_packed) + lhs_packed_offset;
+
+                    variant_call<void>(kernels->lhs_info.pack_func, num_m_per_thread, k, mr, kr, sr, 0, src_ptr, lhs_stride, dst_ptr);
+                }
            }

-            const uint8_t * lhs        = static_cast<const uint8_t *>(src1->data);
-            uint8_t * lhs_packed       = (uint8_t*)params->wdata;
-            const uint8_t * rhs_packed = static_cast<const uint8_t *>(src0->data);
+            // RHS packing
+            if (first_to_arrive.test_and_set(std::memory_order_acquire) == false) {
+                // First thread to reach this point handles RHS packing
+                memset(bias, 0, n * sizeof(float));
+                transpose_f32kxn_f16nxk(n, k, reinterpret_cast<float *>(rhs_kxn),
+                                        reinterpret_cast<const uint16_t *>(rhs_batch), rhs_stride);

-            size_t mr = kernel->get_mr();
-            size_t kr = kernel->get_kr();
-            size_t sr = kernel->get_sr();
-
-            // Calculate number of columns to be processed per thread
-            const size_t num_m_per_thread = kai_roundup(m, mr * nth) / nth;
-            const size_t m_start = ith * num_m_per_thread;
-            size_t m_to_process = num_m_per_thread;
-            if ((m_start + m_to_process) > m) {
-                m_to_process = m - m_start;
-            }
-
-            if(m_start < m) {
-                // Transform LHS
-                const size_t src_stride        = src1->nb[1];
-                const float * src_ptr          = reinterpret_cast<const float *>(lhs + lhs_info->get_offset(m_start, dst->src[1]->nb[1]));
-                const size_t lhs_packed_offset = lhs_info->get_packed_offset(m_start, k, QK4_0, mr, kr, sr);
-                void * lhs_packed_ptr          = static_cast<void *>(lhs_packed + lhs_packed_offset);
-
-                lhs_info->pack_func(m_to_process, k, QK4_0, mr, kr, sr, 0, src_ptr, src_stride, lhs_packed_ptr);
+                variant_call<void>(kernels->rhs_info.pack_func, 1, n, k, nr, kr, sr, n * sizeof(float),
+                             rhs_kxn, bias, nullptr, rhs_packed, 0, nullptr);
            }

            ggml_barrier(params->threadpool);

-            // Perform the operation
-            const size_t dst_stride        = dst->nb[1];
-            const size_t lhs_packed_offset = lhs_info->get_packed_offset(0, k, QK4_0, mr, kr, sr);
-            const size_t rhs_packed_offset = kernel->get_rhs_packed_offset(n_start, k, QK4_0);
-            const size_t dst_offset        = kernel->get_dst_offset(0, n_start, dst_stride);
-            const void * rhs_ptr           = static_cast<const void *>(rhs_packed + rhs_packed_offset);
-            const void* lhs_ptr            = (const void*)((const char *)lhs_packed + lhs_packed_offset);
-            float *dst_ptr                 = reinterpret_cast<float *>(static_cast<uint8_t *>(dst->data) + dst_offset);
+            first_to_arrive.clear(std::memory_order_release);

-            kernel->run_kernel(m, n_to_process, k, QK4_0, lhs_ptr, rhs_ptr, dst_ptr,
-                               dst_stride, sizeof(float), -FLT_MAX, FLT_MAX);
-            return true;
+            // Perform the matmul
+            {
+                const int64_t m_to_process = m;
+                const int64_t m_start      = 0;
+
+                const int64_t n_step      = static_cast<int64_t>(kernel->get_n_step());
+                const int64_t num_threads = KAI_MIN(n / n_step, nth);
+
+                if (ith < num_threads) {
+                    const int64_t num_n_per_thread0   = round_down(n / num_threads, n_step);
+                    const int64_t num_n_per_threadN_1 = n - (num_threads - 1) * num_n_per_thread0;
+
+                    const int64_t n_start      = ith * num_n_per_thread0;
+                    const int64_t n_to_process = (ith == num_threads - 1) ? num_n_per_threadN_1 : num_n_per_thread0;
+
+                    const size_t lhs_packed_offset = variant_call<size_t>(kernel->get_lhs_offset, m_start, k);
+                    const size_t rhs_packed_offset = variant_call<size_t>(kernel->get_rhs_packed_offset, n_start, k);
+                    const size_t dst_offset        = kernel->get_dst_offset(m_start, n_start, dst_stride);
+
+                    const void * lhs_ptr = lhs_packed + lhs_packed_offset;
+                    const void * rhs_ptr = rhs_packed + rhs_packed_offset;
+                    float * dst_ptr      = reinterpret_cast<float *>(dst_batch + dst_offset);
+
+                    variant_call<void>(kernel->run_kernel, m_to_process, n_to_process, k, lhs_ptr, rhs_ptr, dst_ptr, dst_stride, sizeof(float), -FLT_MAX, FLT_MAX);
+                }
+            }
+
+            if (batch_idx != batch_size - 1) {
+                // This barrier is necessary when the batch size is larger than 1. While processing a batch,
+                // the work data buffer (params->wdata) is used as temporary storage which means that only
+                // a single batch can be processed at any given time. No barrier is needed for the last
+                // batch since GGML inserts a barrier between the execution of every operator.
+                ggml_barrier(params->threadpool);
+            }
        }
-        return false;
+
+        return true;
+    }
+
+    bool compute_forward_q4_0(struct ggml_compute_params * params, struct ggml_tensor * dst) {
+        const ggml_tensor * src0 = dst->src[0];
+        const ggml_tensor * src1 = dst->src[1];
+
+        GGML_TENSOR_BINARY_OP_LOCALS
+
+        ggml_kleidiai_kernels *kernels = ggml_kleidiai_select_kernels(ctx.features, dst);
+        GGML_ASSERT(kernels);
+
+        kernel_info * kernel = src1->ne[1] == 1 ? &kernels->gemv : &kernels->gemm;
+        lhs_packing_info * lhs_info = &kernels->lhs_info;
+
+        GGML_ASSERT(kernel);
+
+        const int ith = params->ith;
+        const int nth = params->nth;
+
+        const size_t k = ne00;
+        const size_t m = ne11;
+        const size_t n = ne01;
+
+        size_t mr = kernel->get_mr();
+        size_t kr = kernel->get_kr();
+        size_t sr = kernel->get_sr();
+
+        const uint8_t * lhs        = static_cast<const uint8_t *>(src1->data);
+        uint8_t * lhs_packed       = (uint8_t*)params->wdata;
+        const uint8_t * rhs_packed = static_cast<const uint8_t *>(src0->data);
+
+        const size_t n_step = kernel->get_n_step();
+        const size_t num_n_per_thread = kai_roundup(kai_roundup(n, nth) / nth, n_step);
+        const size_t n_start = ith * num_n_per_thread;
+
+        size_t n_to_process = num_n_per_thread;
+        if ((n_start + n_to_process) > n) {
+            n_to_process = n - n_start;
+        }
+
+        // Calculate number of columns to be processed per thread
+        const size_t num_m_per_thread = kai_roundup(m, mr * nth) / nth;
+        const size_t m_start = ith * num_m_per_thread;
+        size_t m_to_process = num_m_per_thread;
+        if ((m_start + m_to_process) > m) {
+            m_to_process = m - m_start;
+        }
+
+        if (m_start < m) {
+            // Transform LHS
+            const size_t src_stride        = src1->nb[1];
+            const float * src_ptr          = reinterpret_cast<const float *>(lhs + lhs_info->get_offset(m_start, dst->src[1]->nb[1]));
+            const size_t lhs_packed_offset = variant_call<size_t>(lhs_info->get_packed_offset, m_start, k, QK4_0, mr, kr, sr);
+            void * lhs_packed_ptr          = static_cast<void *>(lhs_packed + lhs_packed_offset);
+
+            variant_call<void>(lhs_info->pack_func, m_to_process, k, QK4_0, mr, kr, sr, 0, src_ptr, src_stride, lhs_packed_ptr);
+        }
+
+        ggml_barrier(params->threadpool);
+
+        // Perform the operation
+        const size_t dst_stride        = dst->nb[1];
+        const size_t lhs_packed_offset = variant_call<size_t>(lhs_info->get_packed_offset, 0, k, QK4_0, mr, kr, sr);
+        const size_t rhs_packed_offset = variant_call<size_t>(kernel->get_rhs_packed_offset, n_start, k, QK4_0);
+        const size_t dst_offset        = kernel->get_dst_offset(0, n_start, dst_stride);
+        const void * rhs_ptr           = static_cast<const void *>(rhs_packed + rhs_packed_offset);
+        const void* lhs_ptr            = (const void*)((const char *)lhs_packed + lhs_packed_offset);
+        float *dst_ptr                 = reinterpret_cast<float *>(static_cast<uint8_t *>(dst->data) + dst_offset);
+
+        variant_call<void>(kernel->run_kernel, m, n_to_process, k, QK4_0, lhs_ptr, rhs_ptr, dst_ptr, dst_stride,
+                           sizeof(float), -FLT_MAX, FLT_MAX);
+
+        return true;
    }

 public:
@@ -169,13 +350,13 @@ public:
        size_t sr      = ctx.kernels->gemm.get_sr();

 #ifndef NDEBUG
-        const size_t repacked_size = ctx.kernels->rhs_info.packed_size(n, k, nr, kr, QK4_0);
+        const size_t repacked_size = variant_call<size_t>(ctx.kernels->rhs_info.packed_size, n, k, nr, kr, QK4_0);
        GGML_ASSERT(repacked_size <= data_size && "repacked size larger than the packed size!");
 #endif
        struct kai_rhs_pack_qs4cxs1s0_param params;
        params.lhs_zero_point = 1;
        params.rhs_zero_point = 8;
-        ctx.kernels->rhs_info.pack_func(1, n, k, nr, kr, sr, QK4_0, (const uint8_t *)data, NULL, tensor->data, 0, &params);
+        variant_call<void>(ctx.kernels->rhs_info.pack_func, 1, n, k, nr, kr, sr, QK4_0, (const uint8_t*)data, nullptr, tensor->data, 0, &params);

        return 0;

@@ -189,7 +370,7 @@ static ggml::cpu::tensor_traits * get_tensor_traits(ggml_backend_buffer_t, struc
 }
 }  // namespace ggml::cpu::kleidiai

-GGML_API enum ggml_status ggml_backend_cpu_kleidiai_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
+static enum ggml_status ggml_backend_cpu_kleidiai_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
    tensor->extra = (void *) ggml::cpu::kleidiai::get_tensor_traits(buffer, tensor);

    GGML_UNUSED(buffer);
@@ -238,12 +419,11 @@ static size_t ggml_backend_cpu_kleidiai_buffer_type_get_alignment(ggml_backend_b
 namespace ggml::cpu::kleidiai {
 class extra_buffer_type : ggml::cpu::extra_buffer_type {
    bool supports_op(ggml_backend_dev_t, const struct ggml_tensor * op) override {
-        if (    op->op == GGML_OP_MUL_MAT &&
-                op->src[0]->type == GGML_TYPE_Q4_0 &&
-                op->src[0]->buffer &&
-                (ggml_n_dims(op->src[0]) == 2) &&
-                op->src[0]->buffer->buft == ggml_backend_cpu_kleidiai_buffer_type() && ctx.kernels
-                ) {
+        if (op->op == GGML_OP_MUL_MAT &&
+            op->src[0]->type == GGML_TYPE_Q4_0 &&
+            op->src[0]->buffer &&
+            (ggml_n_dims(op->src[0]) == 2) &&
+            op->src[0]->buffer->buft == ggml_backend_cpu_kleidiai_buffer_type() && ctx.kernels) {
            if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
                return false;
            }
@@ -260,6 +440,19 @@ class extra_buffer_type : ggml::cpu::extra_buffer_type {
            if (op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_cpu_kleidiai_buffer_type()) {
                return (ggml::cpu::tensor_traits *) op->src[0]->extra;
            }
+            else if (ggml_kleidiai_select_kernels(ctx.features, op) &&
+                     op->src[0]->op == GGML_OP_VIEW &&
+                     (op->src[1]->op == GGML_OP_PERMUTE || op->src[1]->op ==  GGML_OP_SOFT_MAX) &&
+                     op->src[1]->ne[1] > 1) {
+                if ((op->src[0]->nb[0] != 2) ||
+                    (op->src[1]->nb[0] != 4) ||
+                    (op->src[0]->nb[1] * op->src[0]->ne[1] != op->src[0]->nb[2]) ||
+                    (op->src[1]->nb[1] * op->src[1]->ne[1] != op->src[1]->nb[2])) {
+                    return nullptr;
+                }
+
+                return ggml::cpu::kleidiai::get_tensor_traits(NULL, NULL);
+            }
        }
        return nullptr;
    }
--- a/ggml/src/ggml-cuda/CMakeLists.txt
+++ b/ggml/src/ggml-cuda/CMakeLists.txt
@@ -118,7 +118,7 @@ if (CUDAToolkit_FOUND)

    set(CUDA_CXX_FLAGS "")

-    set(CUDA_FLAGS -use_fast_math)
+    set(CUDA_FLAGS -use_fast_math -extended-lambda)

    if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "12.8")
        # Options are:
--- a/ggml/src/ggml-cuda/acc.cu
+++ b/ggml/src/ggml-cuda/acc.cu
@@ -1,47 +1,61 @@
 #include "acc.cuh"

-static __global__ void acc_f32(const float * x, const float * y, float * dst, const int ne,
-    const int ne10, const int ne11, const int ne12,
-    const int nb1, const int nb2, int offset) {
-    const int i = blockDim.x * blockIdx.x + threadIdx.x;
+static __global__ void acc_f32(const float * x, const float * y, float * dst, const int64_t ne,
+        const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t ne13,
+        const int64_t s11, const int64_t s12, const int64_t s13, const int64_t offset) {
+    const int64_t i = blockDim.x * blockIdx.x + threadIdx.x;
+
    if (i >= ne) {
        return;
    }
-    int src1_idx = i - offset;
-    int oz = src1_idx / nb2;
-    int oy = (src1_idx - (oz * nb2)) / nb1;
-    int ox = src1_idx % nb1;
-    if (src1_idx >= 0 && ox < ne10 && oy < ne11 && oz < ne12) {
-        dst[i] = x[i] + y[ox + oy * ne10 + oz * ne10 * ne11];
-    } else {
-        dst[i] = x[i];
+
+    int64_t src1_idx = i - offset;
+
+    int64_t tmp = src1_idx;
+    const int64_t i13 = tmp / s13;
+    tmp -= i13 * s13;
+    const int64_t i12 = tmp / s12;
+    tmp -= i12 * s12;
+    const int64_t i11 = tmp / s11;
+    tmp -= i11 * s11;
+    const int64_t i10 = tmp;
+
+    float val = x[i];
+    if (src1_idx >= 0 && i10 < ne10 && i11 < ne11 && i12 < ne12 && i13 < ne13) {
+        val += y[((i13*ne12 + i12) * ne11 + i11) * ne10 + i10];
    }
+    dst[i] = val;
 }

-static void acc_f32_cuda(const float * x, const float * y, float * dst, const int n_elements,
-    const int ne10, const int ne11, const int ne12,
-    const int nb1, const int nb2, const int offset, cudaStream_t stream) {
-    int num_blocks = (n_elements + CUDA_ACC_BLOCK_SIZE - 1) / CUDA_ACC_BLOCK_SIZE;
-    acc_f32<<<num_blocks, CUDA_ACC_BLOCK_SIZE, 0, stream>>>(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset);
+static void acc_f32_cuda(const float * x, const float * y, float * dst, const int64_t n_elements,
+        const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t ne13,
+        const int64_t s1, const int64_t s2, const int64_t s3, const int64_t offset, cudaStream_t stream) {
+    const int num_blocks = (n_elements + CUDA_ACC_BLOCK_SIZE - 1) / CUDA_ACC_BLOCK_SIZE;
+    acc_f32<<<num_blocks, CUDA_ACC_BLOCK_SIZE, 0, stream>>>(x, y, dst, n_elements, ne10, ne11, ne12, ne13, s1, s2, s3, offset);
 }

 void ggml_cuda_op_acc(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const ggml_tensor * src1 = dst->src[1];
-    const float * src0_d = (const float *)src0->data;
-    const float * src1_d = (const float *)src1->data;
-    float * dst_d = (float *)dst->data;
+
+    const float * src0_d = (const float *) src0->data;
+    const float * src1_d = (const float *) src1->data;
+    float       * dst_d  = (float       *)  dst->data;
+
    cudaStream_t stream = ctx.stream();

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT(src1->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->ne[3] == 1); // just 3D tensors supported

-    int nb1 = dst->op_params[0] / 4; // 4 bytes of float32
-    int nb2 = dst->op_params[1] / 4; // 4 bytes of float32
-    // int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused
-    int offset = dst->op_params[3] / 4; // offset in bytes
+    GGML_ASSERT(ggml_is_contiguous(src1));
+    GGML_ASSERT(dst->nb[0] == ggml_element_size(dst));
+    GGML_ASSERT(ggml_is_contiguously_allocated(dst));

-    acc_f32_cuda(src0_d, src1_d, dst_d, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], nb1, nb2, offset, stream);
+    const int64_t s1     = dst->op_params[0] / sizeof(float);
+    const int64_t s2     = dst->op_params[1] / sizeof(float);
+    const int64_t s3     = dst->op_params[2] / sizeof(float);
+    const int64_t offset = dst->op_params[3] / sizeof(float);
+
+    acc_f32_cuda(src0_d, src1_d, dst_d, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], s1, s2, s3, offset, stream);
 }
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -296,6 +296,25 @@ static __device__ void no_device_code(
 #define NO_DEVICE_CODE //GGML_ABORT("NO_DEVICE_CODE not valid in host code.")
 #endif // __CUDA_ARCH__

+// The compiler is always able to unroll loops if they contain continue expressions.
+// In such cases loop unrolling can still be achieved via recursion:
+template <int n>
+struct ggml_cuda_unroll {
+    template <typename Func, typename... Args>
+    __device__ void operator()(const Func & f, Args... args) const {
+        f(n - 1, args...);
+        ggml_cuda_unroll<n - 1>{}(f, args...);
+    }
+};
+
+template <>
+struct ggml_cuda_unroll<1> {
+    template <typename Func, typename... Args>
+    __device__ void operator()(const Func & f, Args... args) const {
+        f(0, args...);
+    }
+};
+
 template<int width = WARP_SIZE>
 static __device__ __forceinline__ int warp_reduce_sum(int x) {
 #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
--- a/ggml/src/ggml-cuda/cp-async.cuh
+++ b/ggml/src/ggml-cuda/cp-async.cuh
@@ -2,6 +2,17 @@

 #include "common.cuh"

+
+static __device__ __forceinline__ unsigned int ggml_cuda_cvta_generic_to_shared(void * generic_ptr) {
+#ifdef CP_ASYNC_AVAILABLE
+    return __cvta_generic_to_shared(generic_ptr);
+#else
+    GGML_UNUSED(generic_ptr);
+    NO_DEVICE_CODE;
+    return 0;
+#endif // CP_ASYNC_AVAILABLE
+}
+
 // Copies data from global to shared memory, cg == cache global.
 // Both the src and dst pointers must be aligned to 16 bit.
 // Shared memory uses 32 bit addressing, the pointer is passed as unsigned int.
--- a/ggml/src/ggml-cuda/fattn-common.cuh
+++ b/ggml/src/ggml-cuda/fattn-common.cuh
@@ -516,7 +516,7 @@ constexpr __device__ dequantize_1_f32_t get_dequantize_1_f32(ggml_type type_V) {
        nullptr;
 }

-template<int D, int ncols1, int ncols2, int KQ_stride> // D == head size
+template<int D, int ncols1, int ncols2> // D == head size
 __launch_bounds__(D, 1)
 static __global__ void flash_attn_stream_k_fixup(
        float * __restrict__ dst, const float2 * __restrict__ dst_fixup, const int ne01, const int ne02, const int ne11) {
@@ -665,13 +665,13 @@ static void on_no_fattn_vec_case(const int D) {
        fprintf(stderr, "Compile with GGML_CUDA_FA_ALL_QUANTS for all combinations of q4_0, q4_1, q5_0, q5_1, q8_0, and f16.\n");
        GGML_ABORT("fatal error");
    } else {
-        fprintf(stderr, "Unsupported KV type combination for head_size 256.\n");
+        fprintf(stderr, "Unsupported KV type combination for head_size %d.\n", D);
        fprintf(stderr, "Only f16 is supported.\n");
        GGML_ABORT("fatal error");
    }
 }

-template <int D, int ncols1, int ncols2, int KQ_stride>
+template <int DV, int ncols1, int ncols2>
 void launch_fattn(
    ggml_backend_cuda_context & ctx, ggml_tensor * dst, fattn_kernel_t fattn_kernel, const int nwarps, const size_t nbytes_shared,
    const int KQ_row_granularity, const bool need_f16_K, const bool need_f16_V, const bool stream_k, const int warp_size = WARP_SIZE
@@ -691,7 +691,7 @@ void launch_fattn(

    GGML_ASSERT(!mask || mask->type == GGML_TYPE_F16);
    GGML_ASSERT(!mask || mask->ne[1] >= GGML_PAD(Q->ne[1], 16) &&
-                                "the Flash-Attention CUDA kernel requires the mask to be padded to 16 and at least n_queries big");
+        "the Flash-Attention CUDA kernel requires the mask to be padded to 16 and at least n_queries big");

    GGML_ASSERT(K->ne[1] % FATTN_KQ_STRIDE == 0 && "Incorrect KV cache padding.");

@@ -754,10 +754,13 @@ void launch_fattn(
    const int ntiles_total = ntiles_x * (Q->ne[2] / ncols2) * Q->ne[3];

    const dim3 block_dim(warp_size, nwarps, 1);
+    int max_blocks_per_sm = 1; // Max. number of active blocks limited by occupancy.
+    CUDA_CHECK(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm, fattn_kernel, block_dim.x * block_dim.y * block_dim.z, nbytes_shared));
+
    dim3 blocks_num;
    if (stream_k) {
        // For short contexts it can be faster to have the SMs work on whole tiles because this lets us skip the fixup.
-        const int max_blocks = 2*nsm;
+        const int max_blocks = max_blocks_per_sm*nsm;
        const int tiles_nwaves = (ntiles_total + max_blocks - 1) / max_blocks;
        const int tiles_efficiency_percent = 100 * ntiles_total / (max_blocks*tiles_nwaves);

@@ -769,14 +772,11 @@ void launch_fattn(
        blocks_num.y = 1;
        blocks_num.z = 1;

-        dst_tmp_meta.alloc(blocks_num.x*ncols * (2*2 + D) * sizeof(float));
+        dst_tmp_meta.alloc(blocks_num.x*ncols * (2*2 + DV) * sizeof(float));
    } else {
        GGML_ASSERT(K->ne[1] % KQ_row_granularity == 0);
        const int ntiles_KQ = K->ne[1] / KQ_row_granularity; // Max. number of parallel blocks limited by tensor size.

-        int max_blocks_per_sm = 1; // Max. number of active blocks limited by occupancy.
-        CUDA_CHECK(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm, fattn_kernel, block_dim.x * block_dim.y * block_dim.z, nbytes_shared));
-
        // parallel_blocks should be at least large enough to achieve max. occupancy for a single wave:
        parallel_blocks = std::max((nsm * max_blocks_per_sm) / ntiles_total, 1);

@@ -853,19 +853,19 @@ void launch_fattn(

    if (stream_k) {
        if (ntiles_total % blocks_num.x != 0) { // Fixup is only needed if the SMs work on fractional tiles.
-            const dim3 block_dim_combine(D, 1, 1);
+            const dim3 block_dim_combine(DV, 1, 1);
            const dim3 blocks_num_combine = {blocks_num.x, ncols1, ncols2};

-            flash_attn_stream_k_fixup<D, ncols1, ncols2, KQ_stride>
+            flash_attn_stream_k_fixup<DV, ncols1, ncols2>
                <<<blocks_num_combine, block_dim_combine, 0, main_stream>>>
                ((float *) KQV->data, dst_tmp_meta.ptr, Q->ne[1], Q->ne[2], K->ne[1]);
        }
    } else if (parallel_blocks > 1) {
-        const dim3 block_dim_combine(D, 1, 1);
+        const dim3 block_dim_combine(DV, 1, 1);
        const dim3 blocks_num_combine(Q->ne[1], 1, blocks_num.z);
        const size_t nbytes_shared_combine = parallel_blocks*sizeof(float2);

-        flash_attn_combine_results<D>
+        flash_attn_combine_results<DV>
            <<<blocks_num_combine, block_dim_combine, nbytes_shared_combine, main_stream>>>
            (dst_tmp.ptr, dst_tmp_meta.ptr, (float *) KQV->data, parallel_blocks);
    }
--- a/ggml/src/ggml-cuda/fattn-mma-f16.cuh
+++ b/ggml/src/ggml-cuda/fattn-mma-f16.cuh
--- a/ggml/src/ggml-cuda/fattn-tile-f16.cu
+++ b/ggml/src/ggml-cuda/fattn-tile-f16.cu
@@ -307,7 +307,7 @@ void launch_fattn_tile_f16_64_128(ggml_backend_cuda_context & ctx, ggml_tensor *
            constexpr int    nwarps        = 8;
            constexpr size_t nbytes_shared = 0;
            fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f16<D, cols_per_block, nwarps, use_logit_softcap>;
-            launch_fattn<D, cols_per_block, 1, -1>
+            launch_fattn<D, cols_per_block, 1>
                (ctx, dst, fattn_kernel, nwarps, nbytes_shared, FATTN_KQ_STRIDE_TILE_F16, true, true, false);
        } break;
        case 128: {
@@ -315,7 +315,7 @@ void launch_fattn_tile_f16_64_128(ggml_backend_cuda_context & ctx, ggml_tensor *
            constexpr int    nwarps        = 8;
            constexpr size_t nbytes_shared = 0;
            fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f16<D, cols_per_block, nwarps, use_logit_softcap>;
-            launch_fattn<D, cols_per_block, 1, -1>
+            launch_fattn<D, cols_per_block, 1>
                (ctx, dst, fattn_kernel, nwarps, nbytes_shared, FATTN_KQ_STRIDE_TILE_F16, true, true, false);
        } break;
        default: {
--- a/ggml/src/ggml-cuda/fattn-tile-f32.cu
+++ b/ggml/src/ggml-cuda/fattn-tile-f32.cu
@@ -318,7 +318,7 @@ void launch_fattn_tile_f32_64_128(ggml_backend_cuda_context & ctx, ggml_tensor *
            constexpr int    nwarps        = 8;
            constexpr size_t nbytes_shared = 0;
            fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f32<D, cols_per_block, nwarps, use_logit_softcap>;
-            launch_fattn<D, cols_per_block, 1, -1>
+            launch_fattn<D, cols_per_block, 1>
                (ctx, dst, fattn_kernel, nwarps, nbytes_shared, FATTN_KQ_STRIDE_TILE_F32, true, true, false);
        } break;
        case 128: {
@@ -326,7 +326,7 @@ void launch_fattn_tile_f32_64_128(ggml_backend_cuda_context & ctx, ggml_tensor *
            constexpr int    nwarps        = 8;
            constexpr size_t nbytes_shared = 0;
            fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f32<D, cols_per_block, nwarps, use_logit_softcap>;
-            launch_fattn<D, cols_per_block, 1, -1>
+            launch_fattn<D, cols_per_block, 1>
                (ctx, dst, fattn_kernel, nwarps, nbytes_shared, FATTN_KQ_STRIDE_TILE_F32, true, true, false);
        } break;
        default: {
--- a/ggml/src/ggml-cuda/fattn-vec-f16.cuh
+++ b/ggml/src/ggml-cuda/fattn-vec-f16.cuh
@@ -168,6 +168,7 @@ static __global__ void flash_attn_vec_ext_f16(
    for (int j = 0; j < ncols; ++j) {
        KQ[j*D + tid] = -HALF_MAX_HALF;
    }
+    __syncthreads();

    half2 VKQ[ncols] = {{0.0f, 0.0f}};

@@ -315,7 +316,7 @@ void ggml_cuda_flash_attn_ext_vec_f16_case_impl(ggml_backend_cuda_context & ctx,
    constexpr bool need_f16_K = D != 128;
    constexpr bool need_f16_V = D != 128 && D != 64;
    constexpr size_t nbytes_shared = 0;
-    launch_fattn<D, cols_per_block, 1, -1>(ctx, dst, fattn_kernel, nwarps, nbytes_shared, D, need_f16_K, need_f16_V, false);
+    launch_fattn<D, cols_per_block, 1>(ctx, dst, fattn_kernel, nwarps, nbytes_shared, D, need_f16_K, need_f16_V, false);
 }

 template <int D, ggml_type type_K, ggml_type type_V>
--- a/ggml/src/ggml-cuda/fattn-vec-f32.cuh
+++ b/ggml/src/ggml-cuda/fattn-vec-f32.cuh
@@ -310,7 +310,7 @@ void ggml_cuda_flash_attn_ext_vec_f32_case_impl(ggml_backend_cuda_context & ctx,
    constexpr bool need_f16_K = D != 128;
    constexpr bool need_f16_V = D != 128 && D != 64;
    constexpr size_t nbytes_shared = 0;
-    launch_fattn<D, cols_per_block, 1, -1>(ctx, dst, fattn_kernel, nwarps, nbytes_shared, D, need_f16_K, need_f16_V, false);
+    launch_fattn<D, cols_per_block, 1>(ctx, dst, fattn_kernel, nwarps, nbytes_shared, D, need_f16_K, need_f16_V, false);
 }

 template <int D, ggml_type type_K, ggml_type type_V>
--- a/ggml/src/ggml-cuda/fattn-wmma-f16.cu
+++ b/ggml/src/ggml-cuda/fattn-wmma-f16.cu
@@ -490,7 +490,7 @@ void ggml_cuda_flash_attn_ext_wmma_f16_case(ggml_backend_cuda_context & ctx, ggm
        fattn_kernel = flash_attn_ext_f16<
            D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), KQ_acc_t, use_logit_softcap>;
    }
-    launch_fattn<D, cols_per_block, 1, -1>(ctx, dst, fattn_kernel, nwarps, 0, FATTN_KQ_STRIDE, true, true, false, warp_size);
+    launch_fattn<D, cols_per_block, 1>(ctx, dst, fattn_kernel, nwarps, 0, FATTN_KQ_STRIDE, true, true, false, warp_size);
 }

 void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
--- a/ggml/src/ggml-cuda/fattn.cu
+++ b/ggml/src/ggml-cuda/fattn.cu
@@ -8,58 +8,32 @@
 #include "fattn-wmma-f16.cuh"
 #include "fattn.cuh"

-template <int D, int ncols2>
+template <int DKQ, int DV, int ncols2>
 static void ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * Q = dst->src[0];

-    if (Q->ne[1] <= 8/ncols2) {
-        ggml_cuda_flash_attn_ext_mma_f16_case<D, 8/ncols2, ncols2>(ctx, dst);
-        return;
+    if constexpr (ncols2 <= 8) {
+        if (Q->ne[1] <= 8/ncols2) {
+            ggml_cuda_flash_attn_ext_mma_f16_case<DKQ, DV, 8/ncols2, ncols2>(ctx, dst);
+            return;
+        }
    }

    if (Q->ne[1] <= 16/ncols2) {
-        ggml_cuda_flash_attn_ext_mma_f16_case<D, 16/ncols2, ncols2>(ctx, dst);
+        ggml_cuda_flash_attn_ext_mma_f16_case<DKQ, DV, 16/ncols2, ncols2>(ctx, dst);
        return;
    }

    if (Q->ne[1] <= 32/ncols2) {
-        ggml_cuda_flash_attn_ext_mma_f16_case<D, 32/ncols2, ncols2>(ctx, dst);
+        ggml_cuda_flash_attn_ext_mma_f16_case<DKQ, DV, 32/ncols2, ncols2>(ctx, dst);
        return;
    }

-    ggml_cuda_flash_attn_ext_mma_f16_case<D, 64/ncols2, ncols2>(ctx, dst);
+    ggml_cuda_flash_attn_ext_mma_f16_case<DKQ, DV, 64/ncols2, ncols2>(ctx, dst);
 }

-template <int ncols2>
-static void ggml_cuda_flash_attn_ext_mma_f16_switch_hs(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * Q = dst->src[0];
-
-    switch (Q->ne[0]) {
-        case 64:
-            ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1< 64, ncols2>(ctx, dst);
-            break;
-        case 80:
-            ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1< 80, ncols2>(ctx, dst);
-            break;
-        case 96:
-            ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1< 96, ncols2>(ctx, dst);
-            break;
-        case 112:
-            ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<112, ncols2>(ctx, dst);
-            break;
-        case 128:
-            ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<128, ncols2>(ctx, dst);
-            break;
-        case 256:
-            ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<256, ncols2>(ctx, dst);
-            break;
-        default:
-            GGML_ABORT("fatal error");
-            break;
-    }
-}
-
-static void ggml_cuda_flash_attn_ext_mma_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+template <int DKQ, int DV>
+static void ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * KQV  = dst;
    const ggml_tensor * Q    = dst->src[0];
    const ggml_tensor * K    = dst->src[1];
@@ -68,27 +42,79 @@ static void ggml_cuda_flash_attn_ext_mma_f16(ggml_backend_cuda_context & ctx, gg
    float max_bias = 0.0f;
    memcpy(&max_bias, (const float *) KQV->op_params + 1, sizeof(float));

-    const float use_gqa_opt = mask && max_bias == 0.0f;
+    const bool use_gqa_opt = mask && max_bias == 0.0f;

    GGML_ASSERT(Q->ne[2] % K->ne[2] == 0);
    const int gqa_ratio = Q->ne[2] / K->ne[2];

    if (use_gqa_opt && gqa_ratio % 8 == 0) {
-        ggml_cuda_flash_attn_ext_mma_f16_switch_hs<8>(ctx, dst);
+        ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<DKQ, DV, 8>(ctx, dst);
        return;
    }

-    if (use_gqa_opt && gqa_ratio == 4) {
-        ggml_cuda_flash_attn_ext_mma_f16_switch_hs<4>(ctx, dst);
+    if (use_gqa_opt && gqa_ratio % 4 == 0) {
+        ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<DKQ, DV, 4>(ctx, dst);
        return;
    }

-    if (use_gqa_opt && gqa_ratio == 2) {
-        ggml_cuda_flash_attn_ext_mma_f16_switch_hs<2>(ctx, dst);
+    if (use_gqa_opt && gqa_ratio % 2 == 0) {
+        ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<DKQ, DV, 2>(ctx, dst);
        return;
    }

-    ggml_cuda_flash_attn_ext_mma_f16_switch_hs<1>(ctx, dst);
+    ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<DKQ, DV, 1>(ctx, dst);
+}
+
+static void ggml_cuda_flash_attn_ext_mma_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * KQV  = dst;
+    const ggml_tensor * Q    = dst->src[0];
+    const ggml_tensor * K    = dst->src[1];
+    const ggml_tensor * V    = dst->src[2];
+    const ggml_tensor * mask = dst->src[3];
+
+    switch (Q->ne[0]) {
+        case 64:
+            GGML_ASSERT(V->ne[0] == 64);
+            ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2< 64,  64>(ctx, dst);
+            break;
+        case 80:
+            GGML_ASSERT(V->ne[0] == 80);
+            ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2< 80,  80>(ctx, dst);
+            break;
+        case 96:
+            GGML_ASSERT(V->ne[0] == 96);
+            ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2< 96,  96>(ctx, dst);
+            break;
+        case 112:
+            GGML_ASSERT(V->ne[0] == 112);
+            ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2<112, 112>(ctx, dst);
+            break;
+        case 128:
+            GGML_ASSERT(V->ne[0] == 128);
+            ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2<128, 128>(ctx, dst);
+            break;
+        case 256:
+            GGML_ASSERT(V->ne[0] == 256);
+            ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2<256, 256>(ctx, dst);
+            break;
+        case 576: {
+            // For Deepseek, go straight to the ncols1 switch to avoid compiling unnecessary kernels.
+            GGML_ASSERT(V->ne[0] == 512);
+            float max_bias = 0.0f;
+            memcpy(&max_bias, (const float *) KQV->op_params + 1, sizeof(float));
+
+            const bool use_gqa_opt = mask && max_bias == 0.0f;
+            GGML_ASSERT(use_gqa_opt);
+
+            GGML_ASSERT(Q->ne[2] % K->ne[2] == 0);
+            const int gqa_ratio = Q->ne[2] / K->ne[2];
+            GGML_ASSERT(gqa_ratio % 16 == 0);
+            ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<576, 512, 16>(ctx, dst);
+        } break;
+        default:
+            GGML_ABORT("fatal error");
+            break;
+    }
 }

 #define FATTN_VEC_F16_CASE(D, type_K, type_V)                               \
@@ -299,7 +325,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
    const bool gqa_opt_applies = ((Q->ne[2] / K->ne[2]) % 2 == 0) && mask; // The mma-based kernels have GQA-specific optimizations
    const bool mma_needs_data_conversion = K->type != GGML_TYPE_F16 || V->type != GGML_TYPE_F16;
    const bool mma_faster_for_bs1 = new_mma_available(cc) && gqa_opt_applies && cc < GGML_CUDA_CC_ADA_LOVELACE && !mma_needs_data_conversion;
-    const bool can_use_vector_kernel = Q->ne[0] % (2*warp_size) == 0;
+    const bool can_use_vector_kernel = Q->ne[0] <= 256 && Q->ne[0] % (2*warp_size) == 0;
    if (Q->ne[1] == 1 && can_use_vector_kernel && !mma_faster_for_bs1) {
        if (prec == GGML_PREC_DEFAULT) {
            ggml_cuda_flash_attn_ext_vec_f16(ctx, dst);
--- a/ggml/src/ggml-cuda/getrows.cu
+++ b/ggml/src/ggml-cuda/getrows.cu
@@ -10,10 +10,11 @@ static __global__ void k_get_rows(
        /*const size_t nb00,*/ const size_t nb01, const size_t nb02, const size_t nb03,
        const size_t s10, const size_t s11, const size_t s12/*, const size_t s13*/) {

-    const int i00 = (blockIdx.x*blockDim.x + threadIdx.x)*2;
-    const int i10 =  blockDim.y*blockIdx.y + threadIdx.y;
-    const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12;
-    const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12;
+    // The x and y dimensions of the grid are swapped because the maximum allowed grid size for x is higher.
+    const int i00 = (blockIdx.y * blockDim.x + threadIdx.x)*2;
+    const int i10 =  blockIdx.x;
+    const int i11 =  blockIdx.z / ne12;
+    const int i12 =  blockIdx.z % ne12;

    if (i00 >= ne00) {
        return;
@@ -46,10 +47,11 @@ static __global__ void k_get_rows_float(
        /*const size_t nb00,*/ const size_t nb01, const size_t nb02, const size_t nb03,
        const size_t s10, const size_t s11, const size_t s12/*, const size_t s13*/) {

-    const int i00 =  blockIdx.x*blockDim.x + threadIdx.x;
-    const int i10 =  blockDim.y*blockIdx.y + threadIdx.y;
-    const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12;
-    const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12;
+    // The x and y dimensions of the grid are swapped because the maximum allowed grid size for x is higher.
+    const int i00 = blockIdx.y * blockDim.x + threadIdx.x;
+    const int i10 = blockIdx.x;
+    const int i11 = blockIdx.z / ne12;
+    const int i12 = blockIdx.z % ne12;

    if (i00 >= ne00) {
        return;
@@ -94,8 +96,8 @@ static void get_rows_cuda_q(
        const size_t nb1, const size_t nb2, const size_t nb3,
        cudaStream_t stream) {
    const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
-    const int block_num_x = (ne00 + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
-    const dim3 block_nums(block_num_x, ne10, ne11*ne12);
+    const int block_num_y = (ne00 + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
+    const dim3 block_nums(ne10, block_num_y, ne11*ne12);

    // strides in elements
    // const size_t s0 = nb0 / sizeof(dst_t);
@@ -127,8 +129,8 @@ static void get_rows_cuda_float(
        const size_t nb1, const size_t nb2, const size_t nb3,
        cudaStream_t stream) {
    const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
-    const int block_num_x = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE;
-    const dim3 block_nums(block_num_x, ne10, ne11*ne12);
+    const int block_num_y = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE;
+    const dim3 block_nums(ne10, block_num_y, ne11*ne12);

    // strides in elements
    // const size_t s0 = nb0 / sizeof(dst_t);
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -1909,13 +1909,19 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
 static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft);

+    // If src0 is a temporary compute buffer it may have some padding that needs to be cleared for mul_mat_vec_q or mul_mat_q.
+    // But if src0 is also a view of another tensor then this cannot be done safely because it may overwrite valid tensor data.
+    // Therefore, in such cases use cuBLAS.
+    const bool bad_padding_clear = ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE
+        && ggml_nbytes(src0) != ggml_backend_buffer_get_alloc_size(src0->buffer, src0) && src0->view_src;
+
    bool use_mul_mat_vec   = (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16)
        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
        && src0->ne[0] % 2 == 0 && src1->ne[1] == 1;
-    bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
+    bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) && !bad_padding_clear
        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
        && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
-    bool use_mul_mat_q     = ggml_is_quantized(src0->type)
+    bool use_mul_mat_q     = ggml_is_quantized(src0->type) && !bad_padding_clear
        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;

    bool any_gpus_with_slow_fp16   = false;
@@ -3215,16 +3221,16 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
            return false;
 #endif // FLASH_ATTN_AVAILABLE
            if (op->src[1]->ne[0] != op->src[2]->ne[0]) {
-                // different head sizes of K and V are not supported yet
-                return false;
+                const int cc = ggml_cuda_info().devices[dev_ctx->device].cc;
+                if (!new_mma_available(cc) || cc < GGML_CUDA_CC_AMPERE) {
+                    return false;
+                }
+                const int gqa_ratio = op->src[0]->ne[2] / op->src[1]->ne[2];
+                return op->src[1]->ne[0] == 576 && op->src[2]->ne[0] == 512 && op->src[3] && gqa_ratio % 16 == 0;
            }
            if (op->src[0]->ne[0] == 192) {
                return false;
            }
-            if (op->src[0]->ne[0] == 576) {
-                // DeepSeek MLA
-                return false;
-            }
            if (op->src[0]->ne[3] != 1) {
                return false;
            }
--- a/ggml/src/ggml-cuda/mmq.cu
+++ b/ggml/src/ggml-cuda/mmq.cu
@@ -91,11 +91,11 @@ void ggml_cuda_mul_mat_q(

    // If src0 is a temporary compute buffer, clear any potential padding.
    if (ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
-        GGML_ASSERT(ggml_is_contiguously_allocated(src0));
-        GGML_ASSERT(!src0->view_src);
        const size_t size_data  = ggml_nbytes(src0);
        const size_t size_alloc = ggml_backend_buffer_get_alloc_size(src0->buffer, src0);
        if (size_alloc > size_data) {
+            GGML_ASSERT(ggml_is_contiguously_allocated(src0));
+            GGML_ASSERT(!src0->view_src);
            CUDA_CHECK(cudaMemsetAsync((char *) src0->data + size_data, 0, size_alloc - size_data, stream));
        }
    }
--- a/ggml/src/ggml-cuda/mmvq.cu
+++ b/ggml/src/ggml-cuda/mmvq.cu
@@ -515,11 +515,11 @@ void ggml_cuda_mul_mat_vec_q(

    // If src0 is a temporary compute buffer, clear any potential padding.
    if (ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
-        GGML_ASSERT(ggml_is_contiguously_allocated(src0));
-        GGML_ASSERT(!src0->view_src);
        const size_t size_data  = ggml_nbytes(src0);
        const size_t size_alloc = ggml_backend_buffer_get_alloc_size(src0->buffer, src0);
        if (size_alloc > size_data) {
+            GGML_ASSERT(ggml_is_contiguously_allocated(src0));
+            GGML_ASSERT(!src0->view_src);
            CUDA_CHECK(cudaMemsetAsync((char *) src0->data + size_data, 0, size_alloc - size_data, stream));
        }
    }
--- a/ggml/src/ggml-cuda/sum.cu
+++ b/ggml/src/ggml-cuda/sum.cu
@@ -31,7 +31,7 @@ void ggml_cuda_op_sum(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-    GGML_ASSERT(ggml_is_contiguous(src0));
+    GGML_ASSERT(ggml_is_contiguously_allocated(src0));

    const float * src0_d = (const float *) src0->data;
    float * dst_d = (float *) dst->data;
--- a/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_16.cu
+++ b/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_16.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-mma-f16.cuh"
+
+DECL_FATTN_MMA_F16_CASE(576, 512, 1, 16);
--- a/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_8.cu
+++ b/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_1-ncols2_8.cu
@@ -2,9 +2,9 @@

 #include "../fattn-mma-f16.cuh"

-DECL_FATTN_MMA_F16_CASE(64, 1, 8);
-DECL_FATTN_MMA_F16_CASE(80, 1, 8);
-DECL_FATTN_MMA_F16_CASE(96, 1, 8);
-DECL_FATTN_MMA_F16_CASE(112, 1, 8);
-DECL_FATTN_MMA_F16_CASE(128, 1, 8);
-DECL_FATTN_MMA_F16_CASE(256, 1, 8);
+DECL_FATTN_MMA_F16_CASE(64, 64, 1, 8);
+DECL_FATTN_MMA_F16_CASE(80, 80, 1, 8);
+DECL_FATTN_MMA_F16_CASE(96, 96, 1, 8);
+DECL_FATTN_MMA_F16_CASE(112, 112, 1, 8);
+DECL_FATTN_MMA_F16_CASE(128, 128, 1, 8);
+DECL_FATTN_MMA_F16_CASE(256, 256, 1, 8);
--- a/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_1.cu
+++ b/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_1.cu
@@ -2,9 +2,9 @@

 #include "../fattn-mma-f16.cuh"

-DECL_FATTN_MMA_F16_CASE(64, 16, 1);
-DECL_FATTN_MMA_F16_CASE(80, 16, 1);
-DECL_FATTN_MMA_F16_CASE(96, 16, 1);
-DECL_FATTN_MMA_F16_CASE(112, 16, 1);
-DECL_FATTN_MMA_F16_CASE(128, 16, 1);
-DECL_FATTN_MMA_F16_CASE(256, 16, 1);
+DECL_FATTN_MMA_F16_CASE(64, 64, 16, 1);
+DECL_FATTN_MMA_F16_CASE(80, 80, 16, 1);
+DECL_FATTN_MMA_F16_CASE(96, 96, 16, 1);
+DECL_FATTN_MMA_F16_CASE(112, 112, 16, 1);
+DECL_FATTN_MMA_F16_CASE(128, 128, 16, 1);
+DECL_FATTN_MMA_F16_CASE(256, 256, 16, 1);
--- a/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_2.cu
+++ b/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_2.cu
@@ -2,9 +2,9 @@

 #include "../fattn-mma-f16.cuh"

-DECL_FATTN_MMA_F16_CASE(64, 16, 2);
-DECL_FATTN_MMA_F16_CASE(80, 16, 2);
-DECL_FATTN_MMA_F16_CASE(96, 16, 2);
-DECL_FATTN_MMA_F16_CASE(112, 16, 2);
-DECL_FATTN_MMA_F16_CASE(128, 16, 2);
-DECL_FATTN_MMA_F16_CASE(256, 16, 2);
+DECL_FATTN_MMA_F16_CASE(64, 64, 16, 2);
+DECL_FATTN_MMA_F16_CASE(80, 80, 16, 2);
+DECL_FATTN_MMA_F16_CASE(96, 96, 16, 2);
+DECL_FATTN_MMA_F16_CASE(112, 112, 16, 2);
+DECL_FATTN_MMA_F16_CASE(128, 128, 16, 2);
+DECL_FATTN_MMA_F16_CASE(256, 256, 16, 2);
--- a/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu
+++ b/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_16-ncols2_4.cu
@@ -2,9 +2,9 @@

 #include "../fattn-mma-f16.cuh"

-DECL_FATTN_MMA_F16_CASE(64, 16, 4);
-DECL_FATTN_MMA_F16_CASE(80, 16, 4);
-DECL_FATTN_MMA_F16_CASE(96, 16, 4);
-DECL_FATTN_MMA_F16_CASE(112, 16, 4);
-DECL_FATTN_MMA_F16_CASE(128, 16, 4);
-DECL_FATTN_MMA_F16_CASE(256, 16, 4);
+DECL_FATTN_MMA_F16_CASE(64, 64, 16, 4);
+DECL_FATTN_MMA_F16_CASE(80, 80, 16, 4);
+DECL_FATTN_MMA_F16_CASE(96, 96, 16, 4);
+DECL_FATTN_MMA_F16_CASE(112, 112, 16, 4);
+DECL_FATTN_MMA_F16_CASE(128, 128, 16, 4);
+DECL_FATTN_MMA_F16_CASE(256, 256, 16, 4);
--- a/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_16.cu
+++ b/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_16.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-mma-f16.cuh"
+
+DECL_FATTN_MMA_F16_CASE(576, 512, 2, 16);
--- a/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu
+++ b/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_4.cu
@@ -2,9 +2,9 @@

 #include "../fattn-mma-f16.cuh"

-DECL_FATTN_MMA_F16_CASE(64, 2, 4);
-DECL_FATTN_MMA_F16_CASE(80, 2, 4);
-DECL_FATTN_MMA_F16_CASE(96, 2, 4);
-DECL_FATTN_MMA_F16_CASE(112, 2, 4);
-DECL_FATTN_MMA_F16_CASE(128, 2, 4);
-DECL_FATTN_MMA_F16_CASE(256, 2, 4);
+DECL_FATTN_MMA_F16_CASE(64, 64, 2, 4);
+DECL_FATTN_MMA_F16_CASE(80, 80, 2, 4);
+DECL_FATTN_MMA_F16_CASE(96, 96, 2, 4);
+DECL_FATTN_MMA_F16_CASE(112, 112, 2, 4);
+DECL_FATTN_MMA_F16_CASE(128, 128, 2, 4);
+DECL_FATTN_MMA_F16_CASE(256, 256, 2, 4);
--- a/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_8.cu
+++ b/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_2-ncols2_8.cu
@@ -2,9 +2,9 @@

 #include "../fattn-mma-f16.cuh"

-DECL_FATTN_MMA_F16_CASE(64, 2, 8);
-DECL_FATTN_MMA_F16_CASE(80, 2, 8);
-DECL_FATTN_MMA_F16_CASE(96, 2, 8);
-DECL_FATTN_MMA_F16_CASE(112, 2, 8);
-DECL_FATTN_MMA_F16_CASE(128, 2, 8);
-DECL_FATTN_MMA_F16_CASE(256, 2, 8);
+DECL_FATTN_MMA_F16_CASE(64, 64, 2, 8);
+DECL_FATTN_MMA_F16_CASE(80, 80, 2, 8);
+DECL_FATTN_MMA_F16_CASE(96, 96, 2, 8);
+DECL_FATTN_MMA_F16_CASE(112, 112, 2, 8);
+DECL_FATTN_MMA_F16_CASE(128, 128, 2, 8);
+DECL_FATTN_MMA_F16_CASE(256, 256, 2, 8);
--- a/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_1.cu
+++ b/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_1.cu
@@ -2,9 +2,9 @@

 #include "../fattn-mma-f16.cuh"

-DECL_FATTN_MMA_F16_CASE(64, 32, 1);
-DECL_FATTN_MMA_F16_CASE(80, 32, 1);
-DECL_FATTN_MMA_F16_CASE(96, 32, 1);
-DECL_FATTN_MMA_F16_CASE(112, 32, 1);
-DECL_FATTN_MMA_F16_CASE(128, 32, 1);
-DECL_FATTN_MMA_F16_CASE(256, 32, 1);
+DECL_FATTN_MMA_F16_CASE(64, 64, 32, 1);
+DECL_FATTN_MMA_F16_CASE(80, 80, 32, 1);
+DECL_FATTN_MMA_F16_CASE(96, 96, 32, 1);
+DECL_FATTN_MMA_F16_CASE(112, 112, 32, 1);
+DECL_FATTN_MMA_F16_CASE(128, 128, 32, 1);
+DECL_FATTN_MMA_F16_CASE(256, 256, 32, 1);
--- a/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_2.cu
+++ b/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_32-ncols2_2.cu
@@ -2,9 +2,9 @@

 #include "../fattn-mma-f16.cuh"

-DECL_FATTN_MMA_F16_CASE(64, 32, 2);
-DECL_FATTN_MMA_F16_CASE(80, 32, 2);
-DECL_FATTN_MMA_F16_CASE(96, 32, 2);
-DECL_FATTN_MMA_F16_CASE(112, 32, 2);
-DECL_FATTN_MMA_F16_CASE(128, 32, 2);
-DECL_FATTN_MMA_F16_CASE(256, 32, 2);
+DECL_FATTN_MMA_F16_CASE(64, 64, 32, 2);
+DECL_FATTN_MMA_F16_CASE(80, 80, 32, 2);
+DECL_FATTN_MMA_F16_CASE(96, 96, 32, 2);
+DECL_FATTN_MMA_F16_CASE(112, 112, 32, 2);
+DECL_FATTN_MMA_F16_CASE(128, 128, 32, 2);
+DECL_FATTN_MMA_F16_CASE(256, 256, 32, 2);
--- a/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_16.cu
+++ b/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_16.cu
@@ -0,0 +1,5 @@
+// This file has been autogenerated by generate_cu_files.py, do not edit manually.
+
+#include "../fattn-mma-f16.cuh"
+
+DECL_FATTN_MMA_F16_CASE(576, 512, 4, 16);
--- a/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_2.cu
+++ b/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_2.cu
@@ -2,9 +2,9 @@

 #include "../fattn-mma-f16.cuh"

-DECL_FATTN_MMA_F16_CASE(64, 4, 2);
-DECL_FATTN_MMA_F16_CASE(80, 4, 2);
-DECL_FATTN_MMA_F16_CASE(96, 4, 2);
-DECL_FATTN_MMA_F16_CASE(112, 4, 2);
-DECL_FATTN_MMA_F16_CASE(128, 4, 2);
-DECL_FATTN_MMA_F16_CASE(256, 4, 2);
+DECL_FATTN_MMA_F16_CASE(64, 64, 4, 2);
+DECL_FATTN_MMA_F16_CASE(80, 80, 4, 2);
+DECL_FATTN_MMA_F16_CASE(96, 96, 4, 2);
+DECL_FATTN_MMA_F16_CASE(112, 112, 4, 2);
+DECL_FATTN_MMA_F16_CASE(128, 128, 4, 2);
+DECL_FATTN_MMA_F16_CASE(256, 256, 4, 2);
--- a/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu
+++ b/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_4.cu
@@ -2,9 +2,9 @@

 #include "../fattn-mma-f16.cuh"

-DECL_FATTN_MMA_F16_CASE(64, 4, 4);
-DECL_FATTN_MMA_F16_CASE(80, 4, 4);
-DECL_FATTN_MMA_F16_CASE(96, 4, 4);
-DECL_FATTN_MMA_F16_CASE(112, 4, 4);
-DECL_FATTN_MMA_F16_CASE(128, 4, 4);
-DECL_FATTN_MMA_F16_CASE(256, 4, 4);
+DECL_FATTN_MMA_F16_CASE(64, 64, 4, 4);
+DECL_FATTN_MMA_F16_CASE(80, 80, 4, 4);
+DECL_FATTN_MMA_F16_CASE(96, 96, 4, 4);
+DECL_FATTN_MMA_F16_CASE(112, 112, 4, 4);
+DECL_FATTN_MMA_F16_CASE(128, 128, 4, 4);
+DECL_FATTN_MMA_F16_CASE(256, 256, 4, 4);
--- a/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_8.cu
+++ b/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_4-ncols2_8.cu
@@ -2,9 +2,9 @@

 #include "../fattn-mma-f16.cuh"

-DECL_FATTN_MMA_F16_CASE(64, 4, 8);
-DECL_FATTN_MMA_F16_CASE(80, 4, 8);
-DECL_FATTN_MMA_F16_CASE(96, 4, 8);
-DECL_FATTN_MMA_F16_CASE(112, 4, 8);
-DECL_FATTN_MMA_F16_CASE(128, 4, 8);
-DECL_FATTN_MMA_F16_CASE(256, 4, 8);
+DECL_FATTN_MMA_F16_CASE(64, 64, 4, 8);
+DECL_FATTN_MMA_F16_CASE(80, 80, 4, 8);
+DECL_FATTN_MMA_F16_CASE(96, 96, 4, 8);
+DECL_FATTN_MMA_F16_CASE(112, 112, 4, 8);
+DECL_FATTN_MMA_F16_CASE(128, 128, 4, 8);
+DECL_FATTN_MMA_F16_CASE(256, 256, 4, 8);
--- a/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_64-ncols2_1.cu
+++ b/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_64-ncols2_1.cu
@@ -2,9 +2,9 @@

 #include "../fattn-mma-f16.cuh"

-DECL_FATTN_MMA_F16_CASE(64, 64, 1);
-DECL_FATTN_MMA_F16_CASE(80, 64, 1);
-DECL_FATTN_MMA_F16_CASE(96, 64, 1);
-DECL_FATTN_MMA_F16_CASE(112, 64, 1);
-DECL_FATTN_MMA_F16_CASE(128, 64, 1);
-DECL_FATTN_MMA_F16_CASE(256, 64, 1);
+DECL_FATTN_MMA_F16_CASE(64, 64, 64, 1);
+DECL_FATTN_MMA_F16_CASE(80, 80, 64, 1);
+DECL_FATTN_MMA_F16_CASE(96, 96, 64, 1);
+DECL_FATTN_MMA_F16_CASE(112, 112, 64, 1);
+DECL_FATTN_MMA_F16_CASE(128, 128, 64, 1);
+DECL_FATTN_MMA_F16_CASE(256, 256, 64, 1);
--- a/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_1.cu
+++ b/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_1.cu
@@ -2,9 +2,9 @@

 #include "../fattn-mma-f16.cuh"

-DECL_FATTN_MMA_F16_CASE(64, 8, 1);
-DECL_FATTN_MMA_F16_CASE(80, 8, 1);
-DECL_FATTN_MMA_F16_CASE(96, 8, 1);
-DECL_FATTN_MMA_F16_CASE(112, 8, 1);
-DECL_FATTN_MMA_F16_CASE(128, 8, 1);
-DECL_FATTN_MMA_F16_CASE(256, 8, 1);
+DECL_FATTN_MMA_F16_CASE(64, 64, 8, 1);
+DECL_FATTN_MMA_F16_CASE(80, 80, 8, 1);
+DECL_FATTN_MMA_F16_CASE(96, 96, 8, 1);
+DECL_FATTN_MMA_F16_CASE(112, 112, 8, 1);
+DECL_FATTN_MMA_F16_CASE(128, 128, 8, 1);
+DECL_FATTN_MMA_F16_CASE(256, 256, 8, 1);
--- a/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_2.cu
+++ b/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_2.cu
@@ -2,9 +2,9 @@

 #include "../fattn-mma-f16.cuh"

-DECL_FATTN_MMA_F16_CASE(64, 8, 2);
-DECL_FATTN_MMA_F16_CASE(80, 8, 2);
-DECL_FATTN_MMA_F16_CASE(96, 8, 2);
-DECL_FATTN_MMA_F16_CASE(112, 8, 2);
-DECL_FATTN_MMA_F16_CASE(128, 8, 2);
-DECL_FATTN_MMA_F16_CASE(256, 8, 2);
+DECL_FATTN_MMA_F16_CASE(64, 64, 8, 2);
+DECL_FATTN_MMA_F16_CASE(80, 80, 8, 2);
+DECL_FATTN_MMA_F16_CASE(96, 96, 8, 2);
+DECL_FATTN_MMA_F16_CASE(112, 112, 8, 2);
+DECL_FATTN_MMA_F16_CASE(128, 128, 8, 2);
+DECL_FATTN_MMA_F16_CASE(256, 256, 8, 2);
--- a/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu
+++ b/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_4.cu
@@ -2,9 +2,9 @@

 #include "../fattn-mma-f16.cuh"

-DECL_FATTN_MMA_F16_CASE(64, 8, 4);
-DECL_FATTN_MMA_F16_CASE(80, 8, 4);
-DECL_FATTN_MMA_F16_CASE(96, 8, 4);
-DECL_FATTN_MMA_F16_CASE(112, 8, 4);
-DECL_FATTN_MMA_F16_CASE(128, 8, 4);
-DECL_FATTN_MMA_F16_CASE(256, 8, 4);
+DECL_FATTN_MMA_F16_CASE(64, 64, 8, 4);
+DECL_FATTN_MMA_F16_CASE(80, 80, 8, 4);
+DECL_FATTN_MMA_F16_CASE(96, 96, 8, 4);
+DECL_FATTN_MMA_F16_CASE(112, 112, 8, 4);
+DECL_FATTN_MMA_F16_CASE(128, 128, 8, 4);
+DECL_FATTN_MMA_F16_CASE(256, 256, 8, 4);
--- a/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_8.cu
+++ b/ggml/src/ggml-cuda/template-instances/fattn-mma-f16-instance-ncols1_8-ncols2_8.cu
@@ -2,9 +2,9 @@

 #include "../fattn-mma-f16.cuh"

-DECL_FATTN_MMA_F16_CASE(64, 8, 8);
-DECL_FATTN_MMA_F16_CASE(80, 8, 8);
-DECL_FATTN_MMA_F16_CASE(96, 8, 8);
-DECL_FATTN_MMA_F16_CASE(112, 8, 8);
-DECL_FATTN_MMA_F16_CASE(128, 8, 8);
-DECL_FATTN_MMA_F16_CASE(256, 8, 8);
+DECL_FATTN_MMA_F16_CASE(64, 64, 8, 8);
+DECL_FATTN_MMA_F16_CASE(80, 80, 8, 8);
+DECL_FATTN_MMA_F16_CASE(96, 96, 8, 8);
+DECL_FATTN_MMA_F16_CASE(112, 112, 8, 8);
+DECL_FATTN_MMA_F16_CASE(128, 128, 8, 8);
+DECL_FATTN_MMA_F16_CASE(256, 256, 8, 8);
--- a/ggml/src/ggml-cuda/template-instances/generate_cu_files.py
+++ b/ggml/src/ggml-cuda/template-instances/generate_cu_files.py
@@ -18,7 +18,7 @@ SOURCE_FATTN_MMA_START = """// This file has been autogenerated by generate_cu_f

 """

-SOURCE_FATTN_MMA_CASE = "DECL_FATTN_MMA_F16_CASE({head_size}, {ncols1}, {ncols2});\n"
+SOURCE_FATTN_MMA_CASE = "DECL_FATTN_MMA_F16_CASE({head_size_kq}, {head_size_v}, {ncols1}, {ncols2});\n"

 TYPES_MMQ = [
    "GGML_TYPE_Q4_0", "GGML_TYPE_Q4_1", "GGML_TYPE_Q5_0", "GGML_TYPE_Q5_1", "GGML_TYPE_Q8_0",
@@ -57,18 +57,21 @@ for vkq_size in [16, 32]:
                with open(f"fattn-vec-f{vkq_size}-instance-hs{head_size}-{get_short_name(type_k)}-{get_short_name(type_v)}.cu", "w") as f:
                    f.write(SOURCE_FATTN_VEC.format(vkq_size=vkq_size, head_size=head_size, type_k=type_k, type_v=type_v))

-for ncols in [8, 16, 32, 64, 128]:
-    for ncols2 in [1, 2, 4, 8]:
+for ncols in [8, 16, 32, 64]:
+    for ncols2 in [1, 2, 4, 8, 16]:
+        if ncols2 > ncols:
+            continue
        ncols1 = ncols // ncols2
-        if ncols == 128:
-            continue  # Too much register pressure.
        with open(f"fattn-mma-f16-instance-ncols1_{ncols1}-ncols2_{ncols2}.cu", "w") as f:
            f.write(SOURCE_FATTN_MMA_START)

-            for head_size in [64, 80, 96, 112, 128, 256]:
-                if ncols == 128 and head_size == 256:
-                    continue  # Needs too much shared memory.
-                f.write(SOURCE_FATTN_MMA_CASE.format(ncols1=ncols1, ncols2=ncols2, head_size=head_size))
+            for head_size_kq in [64, 80, 96, 112, 128, 256, 576]:
+                if head_size_kq != 576 and ncols2 == 16:
+                    continue
+                if head_size_kq == 576 and ncols2 != 16:
+                    continue
+                head_size_v = head_size_kq if head_size_kq != 576 else 512
+                f.write(SOURCE_FATTN_MMA_CASE.format(ncols1=ncols1, ncols2=ncols2, head_size_kq=head_size_kq, head_size_v=head_size_v))

 for type in TYPES_MMQ:
    with open(f"mmq-instance-{get_short_name(type)}.cu", "w") as f:
--- a/ggml/src/ggml-metal/ggml-metal-impl.h
+++ b/ggml/src/ggml-metal/ggml-metal-impl.h
@@ -207,6 +207,10 @@ typedef struct {
    float    attn_factor;
    float    beta_fast;
    float    beta_slow;
+    int32_t  sect_0;
+    int32_t  sect_1;
+    int32_t  sect_2;
+    int32_t  sect_3;
 } ggml_metal_kargs_rope;

 typedef struct {
@@ -299,21 +303,42 @@ typedef struct {
 } ggml_metal_kargs_mul_mv_ext;

 typedef struct {
-    int32_t  nei0;
-    int32_t  nei1;
-    uint64_t nbi1;
+    int32_t  ne10;
+    int32_t  ne11;  // n_expert_used (bcast)
+    uint64_t nb11;
+    uint64_t nb12;
+    int32_t  neh11; // n_tokens
+    uint64_t nbh11;
+    int32_t  ne20;  // n_expert_used
+    uint64_t nb21;
+} ggml_metal_kargs_mul_mm_id_map0;
+
+typedef struct {
+    int32_t  ne20; // n_expert_used
+    int32_t  neh0;
+    int32_t  neh1;
+    uint64_t nbh1;
+    uint64_t nbh2;
+    int32_t  ne0;
+    uint64_t nb1;
+    uint64_t nb2;
+} ggml_metal_kargs_mul_mm_id_map1;
+
+typedef struct {
    int32_t  ne00;
    int32_t  ne02;
    uint64_t nb01;
    uint64_t nb02;
-    int32_t  ne11;
-    int32_t  ne12;
-    int32_t  ne13;
-    uint64_t nb10;
-    uint64_t nb11;
-    uint64_t nb12;
-    int32_t  ne0;
-    int32_t  ne1;
+    uint64_t nb03;
+    int32_t  neh12;
+    uint64_t nbh10;
+    uint64_t nbh11;
+    uint64_t nbh12;
+    uint64_t nbh13;
+    int32_t  neh0;
+    int32_t  neh1;
+    int16_t  r2;
+    int16_t  r3;
 } ggml_metal_kargs_mul_mm_id;

 typedef struct {
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -306,30 +306,36 @@ enum ggml_metal_kernel_type {
    GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_M_F32,
    GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32,
    GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_BF16_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_K_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32,
-    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP0_F16,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP1_F32,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F16,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F16,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_BF16_F16,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F16,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F16,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F16,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F16,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F16,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F16,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F16,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F16,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_K_F16,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F16,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F16,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F16,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F16,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F16,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F16,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F16,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F16,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F16,
+    GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F16,
    GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32,
    GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16,
+    GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F32,
+    GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F16,
+    GGML_METAL_KERNEL_TYPE_ROPE_VISION_F32,
+    GGML_METAL_KERNEL_TYPE_ROPE_VISION_F16,
    GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32,
    GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F16,
    GGML_METAL_KERNEL_TYPE_IM2COL_F16,
@@ -650,7 +656,8 @@ static void ggml_metal_mem_pool_reset(struct ggml_metal_mem_pool * mem_pool) {
    }

    if (mem_pool->heaps_to_remove.count > 0) {
-        for (NSUInteger i = 0; i < [mem_pool->heaps_to_remove count]; i++) {
+        // remove in reverse order
+        for (NSUInteger i = [mem_pool->heaps_to_remove count] - 1; ; --i) {
            NSUInteger index = [[mem_pool->heaps_to_remove objectAtIndex:i] intValue];
            ggml_metal_heap_ptr * ptr = [mem_pool->heaps objectAtIndex:index];

@@ -659,6 +666,10 @@ static void ggml_metal_mem_pool_reset(struct ggml_metal_mem_pool * mem_pool) {

            [mem_pool->heaps removeObjectAtIndex:index];
            [ptr release];
+
+            if (i == 0) {
+                break;
+            }
        }

        [mem_pool->heaps_to_remove removeAllObjects];
@@ -672,7 +683,7 @@ static void ggml_metal_mem_pool_clear(struct ggml_metal_mem_pool * mem_pool) {
 }

 static id<MTLBuffer> ggml_metal_mem_pool_alloc(struct ggml_metal_mem_pool * mem_pool, size_t size) {
-    const size_t alignment = 32;
+    const size_t alignment = 256;

    const size_t size_aligned = GGML_PAD(size, alignment);

@@ -1242,30 +1253,36 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_M_F32,                mul_mm_iq1_m_f32,                has_simdgroup_mm);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32,               mul_mm_iq4_nl_f32,               has_simdgroup_mm);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32,               mul_mm_iq4_xs_f32,               has_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32,               mul_mm_id_f32_f32,               has_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32,               mul_mm_id_f16_f32,               has_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_BF16_F32,              mul_mm_id_bf16_f32,              has_simdgroup_mm && use_bfloat);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32,              mul_mm_id_q4_0_f32,              has_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F32,              mul_mm_id_q4_1_f32,              has_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F32,              mul_mm_id_q5_0_f32,              has_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F32,              mul_mm_id_q5_1_f32,              has_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F32,              mul_mm_id_q8_0_f32,              has_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F32,              mul_mm_id_q2_K_f32,              has_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F32,              mul_mm_id_q3_K_f32,              has_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F32,              mul_mm_id_q4_K_f32,              has_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_K_F32,              mul_mm_id_q5_K_f32,              has_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F32,              mul_mm_id_q6_K_f32,              has_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32,           mul_mm_id_iq2_xxs_f32,           has_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32,            mul_mm_id_iq2_xs_f32,            has_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32,           mul_mm_id_iq3_xxs_f32,           has_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32,             mul_mm_id_iq3_s_f32,             has_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F32,             mul_mm_id_iq2_s_f32,             has_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32,             mul_mm_id_iq1_s_f32,             has_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F32,             mul_mm_id_iq1_m_f32,             has_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32,            mul_mm_id_iq4_nl_f32,            has_simdgroup_mm);
-        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32,            mul_mm_id_iq4_xs_f32,            has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP0_F16,              mul_mm_id_map0_f16,              has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP1_F32,              mul_mm_id_map1_f32,              has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F16,               mul_mm_id_f32_f16,               has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F16,               mul_mm_id_f16_f16,               has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_BF16_F16,              mul_mm_id_bf16_f16,              has_simdgroup_mm && use_bfloat);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F16,              mul_mm_id_q4_0_f16,              has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F16,              mul_mm_id_q4_1_f16,              has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F16,              mul_mm_id_q5_0_f16,              has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F16,              mul_mm_id_q5_1_f16,              has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F16,              mul_mm_id_q8_0_f16,              has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F16,              mul_mm_id_q2_K_f16,              has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F16,              mul_mm_id_q3_K_f16,              has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F16,              mul_mm_id_q4_K_f16,              has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_K_F16,              mul_mm_id_q5_K_f16,              has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F16,              mul_mm_id_q6_K_f16,              has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F16,           mul_mm_id_iq2_xxs_f16,           has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F16,            mul_mm_id_iq2_xs_f16,            has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F16,           mul_mm_id_iq3_xxs_f16,           has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F16,             mul_mm_id_iq3_s_f16,             has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F16,             mul_mm_id_iq2_s_f16,             has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F16,             mul_mm_id_iq1_s_f16,             has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F16,             mul_mm_id_iq1_m_f16,             has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F16,            mul_mm_id_iq4_nl_f16,            has_simdgroup_mm);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F16,            mul_mm_id_iq4_xs_f16,            has_simdgroup_mm);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32,                   rope_norm_f32,                   true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16,                   rope_norm_f16,                   true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F32,                  rope_multi_f32,                  true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F16,                  rope_multi_f16,                  true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_VISION_F32,                 rope_vision_f32,                 true);
+        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_VISION_F16,                 rope_vision_f16,                 true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32,                   rope_neox_f32,                   true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F16,                   rope_neox_f16,                   true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F16,                      im2col_f16,                      true);
@@ -1628,16 +1645,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
        case GGML_OP_NORM:
            return has_simdgroup_reduction && (op->ne[0] % 4 == 0 && ggml_is_contiguous_1(op->src[0]));
        case GGML_OP_ROPE:
-            {
-                const int mode = ((const int32_t *) op->op_params)[2];
-                if (mode & GGML_ROPE_TYPE_MROPE) {
-                    return false;
-                }
-                if (mode & GGML_ROPE_TYPE_VISION) {
-                    return false;
-                }
-                return true;
-            }
+            return true;
        case GGML_OP_IM2COL:
            return op->src[0]->type == GGML_TYPE_F16;
        case GGML_OP_POOL_1D:
@@ -2999,7 +3007,7 @@ static bool ggml_metal_encode_node(
                    [encoder setBuffer:id_dst  offset:offs_dst     atIndex:3];

                    [encoder setThreadgroupMemoryLength:8192 atIndex:0];
-                    [encoder dispatchThreadgroups:MTLSizeMake( (ne11 + 31)/32, (ne01 + 63)/64, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
+                    [encoder dispatchThreadgroups:MTLSizeMake((ne11 + 31)/32, (ne01 + 63)/64, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
                } else {
                    id<MTLComputePipelineState> pipeline = nil;

@@ -3219,8 +3227,6 @@ static bool ggml_metal_encode_node(
            } break;
        case GGML_OP_MUL_MAT_ID:
            {
-                const int n_as = src0->ne[2];
-
                // src2 = ids
                const enum ggml_type src2t = src2->type; GGML_UNUSED(src2t);

@@ -3234,24 +3240,21 @@ static bool ggml_metal_encode_node(
                GGML_ASSERT(ne03 == 1);
                GGML_ASSERT(ne13 == 1);

+                const uint32_t r2 = 1;
+                const uint32_t r3 = 1;
+
                // find the break-even point where the matrix-matrix kernel becomes more efficient compared
                // to the matrix-vector kernel
                // ne20 = n_used_experts
-                // ne21 = n_rows
-                const int dst_rows = ne20*ne21;
-                const int dst_rows_min = n_as;
-                const int dst_rows_max = (device.maxThreadgroupMemoryLength/2 - 8192)/4;
-
-                // max size of the rowids array in the kernel shared buffer
-                //GGML_ASSERT(dst_rows <= dst_rows_max);
+                // ne21 = n_rows (batch size)
+                const int ne21_mm_id_min = 32;

                // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
                // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
                if ([device supportsFamily:MTLGPUFamilyApple7] &&
                        ne00 % 32 == 0 && ne00 >= 64 &&
-                        //ne01 / ne02 >= 512 &&    // NOTE: this is based on Mixtral shapes, might need adjustments
-                        dst_rows >  dst_rows_min &&
-                        dst_rows <= dst_rows_max) {
+                        (ne21 >= ne21_mm_id_min)) {
+                    GGML_ASSERT(ne00 % 4 == 0);

                    // some Metal matrix data types require aligned pointers
                    // ref: https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf (Table 2.5)
@@ -3262,62 +3265,169 @@ static bool ggml_metal_encode_node(
                        default: break;
                    }

-                    id<MTLComputePipelineState> pipeline = nil;
+                    const int64_t neh10 = ne10; // n_embd
+                    const int64_t neh11 = ne21; // n_tokens
+                    const int64_t neh12 = ne02; // n_expert

-                    switch (src0->type) {
-                        case GGML_TYPE_F32:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32    ].pipeline; break;
-                        case GGML_TYPE_F16:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32    ].pipeline; break;
-                        case GGML_TYPE_BF16:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_BF16_F32   ].pipeline; break;
-                        case GGML_TYPE_Q4_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32   ].pipeline; break;
-                        case GGML_TYPE_Q4_1:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F32   ].pipeline; break;
-                        case GGML_TYPE_Q5_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F32   ].pipeline; break;
-                        case GGML_TYPE_Q5_1:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F32   ].pipeline; break;
-                        case GGML_TYPE_Q8_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F32   ].pipeline; break;
-                        case GGML_TYPE_Q2_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F32   ].pipeline; break;
-                        case GGML_TYPE_Q3_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F32   ].pipeline; break;
-                        case GGML_TYPE_Q4_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F32   ].pipeline; break;
-                        case GGML_TYPE_Q5_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_K_F32   ].pipeline; break;
-                        case GGML_TYPE_Q6_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F32   ].pipeline; break;
-                        case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32].pipeline; break;
-                        case GGML_TYPE_IQ2_XS:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32 ].pipeline; break;
-                        case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32].pipeline; break;
-                        case GGML_TYPE_IQ3_S:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32  ].pipeline; break;
-                        case GGML_TYPE_IQ2_S:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F32  ].pipeline; break;
-                        case GGML_TYPE_IQ1_S:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32  ].pipeline; break;
-                        case GGML_TYPE_IQ1_M:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F32  ].pipeline; break;
-                        case GGML_TYPE_IQ4_NL:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32 ].pipeline; break;
-                        case GGML_TYPE_IQ4_XS:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32 ].pipeline; break;
-                        default: GGML_ABORT("MUL_MAT_ID not implemented");
+                    const uint64_t nbh10 = ggml_type_size(GGML_TYPE_F16);
+                    const uint64_t nbh11 = nbh10*neh10;
+                    const uint64_t nbh12 = nbh11*neh11;
+                    const uint64_t nbh13 = nbh12*neh12;
+
+                    const size_t s_src1 = ggml_type_size(GGML_TYPE_F16)*neh10*neh11*neh12;
+                    id<MTLBuffer> h_src1 = ggml_metal_mem_pool_alloc(mem_pool, s_src1);
+                    if (!h_src1) {
+                        GGML_LOG_ERROR("%s: failed to allocate buffer from memory pool, size = %zu\n", __func__, s_src1);
+                        return false;
                    }

-                    ggml_metal_kargs_mul_mm_id args = {
-                        /*.nei0 =*/ ne20,
-                        /*.nei1 =*/ ne21,
-                        /*.nbi1 =*/ nb21,
-                        /*.ne00 =*/ ne00,
-                        /*.ne02 =*/ ne02,
-                        /*.nb01 =*/ nb01,
-                        /*.nb02 =*/ nb02,
-                        /*.ne11 =*/ ne11,
-                        /*.ne12 =*/ ne12,
-                        /*.ne13 =*/ ne13,
-                        /*.nb10 =*/ nb10,
-                        /*.nb11 =*/ nb11,
-                        /*.nb12 =*/ nb12,
-                        /*.ne0  =*/ ne0,
-                        /*.ne1  =*/ ne1,
-                    };
+                    const int64_t neh0 = ne0;
+                    const int64_t neh1 = ne21;
+                    const int64_t neh2 = ne02;

-                    [encoder setComputePipelineState:pipeline];
-                    [encoder setBytes:&args    length:sizeof(args) atIndex:0];
-                    [encoder setBuffer:id_src0 offset:offs_src0    atIndex:1];
-                    [encoder setBuffer:id_src1 offset:offs_src1    atIndex:2];
-                    [encoder setBuffer:id_dst  offset:offs_dst     atIndex:3];
-                    [encoder setBuffer:id_src2 offset:offs_src2    atIndex:4];
+                    const uint64_t nbh0 = ggml_type_size(GGML_TYPE_F32);
+                    const uint64_t nbh1 = nbh0*neh0;
+                    const uint64_t nbh2 = nbh1*neh1;
+                  //const uint64_t nbh3 = nbh2*neh2;

-                    [encoder setThreadgroupMemoryLength:GGML_PAD(8192 + dst_rows*4/*sizeof(ushort2)*/, 16) atIndex:0];
+                    const size_t s_dst = ggml_type_size(GGML_TYPE_F32)*neh0*neh1*neh2;
+                    id<MTLBuffer> h_dst = ggml_metal_mem_pool_alloc(mem_pool, s_dst);
+                    if (!h_dst) {
+                        GGML_LOG_ERROR("%s: failed to allocate buffer from memory pool, size = %zu\n", __func__, s_dst);
+                        return false;
+                    }

-                    [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 31)/32, (ne01 + 63)/64, n_as) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
+                    // tokens per expert
+                    const size_t s_tpe = ggml_type_size(GGML_TYPE_I32)*ne02;
+                    id<MTLBuffer> h_tpe = ggml_metal_mem_pool_alloc(mem_pool, s_tpe);
+                    if (!h_tpe) {
+                        GGML_LOG_ERROR("%s: failed to allocate buffer from memory pool, size = %zu\n", __func__, s_tpe);
+                        return false;
+                    }
+
+                    // id map
+                    // [n_expert_used, n_tokens]
+                    const size_t s_ids = ggml_type_size(GGML_TYPE_I32)*ne20*ne21;
+                    id<MTLBuffer> h_ids = ggml_metal_mem_pool_alloc(mem_pool, s_ids);
+                    if (!h_ids) {
+                        GGML_LOG_ERROR("%s: failed to allocate buffer from memory pool, size = %zu\n", __func__, s_ids);
+                        return false;
+                    }
+
+                    {
+                        const int nth = MIN(1024, ne10/4);
+
+                        ggml_metal_kargs_mul_mm_id_map0 args = {
+                            ne10,
+                            ne11,  // n_expert_used (bcast)
+                            nb11,
+                            nb12,
+                            neh11, // n_tokens
+                            nbh11,
+                            ne20,  // n_expert_used
+                            nb21,
+                        };
+
+                        id<MTLComputePipelineState> pipeline = nil;
+
+                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP0_F16].pipeline;
+
+                        [encoder setComputePipelineState:pipeline];
+                        [encoder setBytes:&args    length:sizeof(args) atIndex:0];
+                        [encoder setBuffer:id_src1 offset:offs_src1    atIndex:1];
+                        [encoder setBuffer:id_src2 offset:offs_src2    atIndex:2];
+                        [encoder setBuffer: h_src1 offset:0            atIndex:3];
+                        [encoder setBuffer: h_tpe  offset:0            atIndex:4];
+                        [encoder setBuffer: h_ids  offset:0            atIndex:5];
+
+                        [encoder dispatchThreadgroups:MTLSizeMake(ne02, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+                    }
+
+                    {
+                        id<MTLComputePipelineState> pipeline = nil;
+
+                        switch (src0->type) {
+                            case GGML_TYPE_F32:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F16    ].pipeline; break;
+                            case GGML_TYPE_F16:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F16    ].pipeline; break;
+                            case GGML_TYPE_BF16:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_BF16_F16   ].pipeline; break;
+                            case GGML_TYPE_Q4_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F16   ].pipeline; break;
+                            case GGML_TYPE_Q4_1:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F16   ].pipeline; break;
+                            case GGML_TYPE_Q5_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F16   ].pipeline; break;
+                            case GGML_TYPE_Q5_1:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F16   ].pipeline; break;
+                            case GGML_TYPE_Q8_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F16   ].pipeline; break;
+                            case GGML_TYPE_Q2_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F16   ].pipeline; break;
+                            case GGML_TYPE_Q3_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F16   ].pipeline; break;
+                            case GGML_TYPE_Q4_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F16   ].pipeline; break;
+                            case GGML_TYPE_Q5_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_K_F16   ].pipeline; break;
+                            case GGML_TYPE_Q6_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F16   ].pipeline; break;
+                            case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F16].pipeline; break;
+                            case GGML_TYPE_IQ2_XS:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F16 ].pipeline; break;
+                            case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F16].pipeline; break;
+                            case GGML_TYPE_IQ3_S:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F16  ].pipeline; break;
+                            case GGML_TYPE_IQ2_S:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F16  ].pipeline; break;
+                            case GGML_TYPE_IQ1_S:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F16  ].pipeline; break;
+                            case GGML_TYPE_IQ1_M:   pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F16  ].pipeline; break;
+                            case GGML_TYPE_IQ4_NL:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F16 ].pipeline; break;
+                            case GGML_TYPE_IQ4_XS:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F16 ].pipeline; break;
+                            default: GGML_ABORT("MUL_MAT_ID not implemented");
+                        }
+
+                        ggml_metal_kargs_mul_mm_id args = {
+                            /*.ne00  =*/ ne00,
+                            /*.ne02  =*/ ne02,
+                            /*.nb01  =*/ nb01,
+                            /*.nb02  =*/ nb02,
+                            /*.nb03  =*/ nb03,
+                            /*.neh12 =*/ neh12,
+                            /*.nbh10 =*/ nbh10,
+                            /*.nbh11 =*/ nbh11,
+                            /*.nbh12 =*/ nbh12,
+                            /*.nbh13 =*/ nbh13,
+                            /*.neh0  =*/ neh0,
+                            /*.neh1  =*/ neh1,
+                            /*.r2    =*/ r2,
+                            /*.r3    =*/ r3,
+                        };
+
+                        [encoder setComputePipelineState:pipeline];
+                        [encoder setBytes:&args    length:sizeof(args) atIndex:0];
+                        [encoder setBuffer:id_src0 offset:offs_src0    atIndex:1];
+                        [encoder setBuffer: h_src1 offset:0            atIndex:2];
+                        [encoder setBuffer: h_tpe  offset:0            atIndex:3];
+                        [encoder setBuffer: h_dst  offset:0            atIndex:4];
+
+                        [encoder setThreadgroupMemoryLength:8192 atIndex:0];
+                        [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 31)/32, (ne01 + 63)/64, ne02) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
+                    }
+
+                    {
+                        GGML_ASSERT(ne0 % 4 == 0);
+
+                        const int nth = MIN(1024, ne0/4);
+
+                        ggml_metal_kargs_mul_mm_id_map1 args = {
+                            ne20, // n_expert_used
+                            neh0,
+                            neh1,
+                            nbh1,
+                            nbh2,
+                            ne0,
+                            nb1,
+                            nb2,
+                        };
+
+                        id<MTLComputePipelineState> pipeline = nil;
+
+                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_MAP1_F32].pipeline;
+
+                        [encoder setComputePipelineState:pipeline];
+                        [encoder setBytes:&args   length:sizeof(args) atIndex:0];
+                        [encoder setBuffer: h_dst offset:0            atIndex:1];
+                        [encoder setBuffer: h_ids offset:0            atIndex:2];
+                        [encoder setBuffer:id_dst offset:offs_dst     atIndex:3];
+
+                        [encoder dispatchThreadgroups:MTLSizeMake(ne20, ne21, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+                    }
                } else {
                    id<MTLComputePipelineState> pipeline = nil;

@@ -3511,7 +3621,7 @@ static bool ggml_metal_encode_node(
                    [encoder setBuffer:id_src2 offset:offs_src2 atIndex:4];

                    const int64_t _ne1 = 1;
-                    const int64_t ne123 = dst_rows;
+                    const int64_t ne123 = ne20*ne21;

                    if (smem > 0) {
                        [encoder setThreadgroupMemoryLength:smem atIndex:0];
@@ -3715,6 +3825,7 @@ static bool ggml_metal_encode_node(
            } break;
        case GGML_OP_ROPE:
            {
+
                // make sure we have one or more position id(ne10) per token(ne02)
                GGML_ASSERT(ne10 % ne02 == 0);
                GGML_ASSERT(ne10 >= ne02);
@@ -3741,20 +3852,42 @@ static bool ggml_metal_encode_node(
                memcpy(&beta_fast,   (const int32_t *) dst->op_params +  9, sizeof(float));
                memcpy(&beta_slow,   (const int32_t *) dst->op_params + 10, sizeof(float));

-                const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
+                const bool is_neox   = mode & GGML_ROPE_TYPE_NEOX;
+                const bool is_mrope  = mode & GGML_ROPE_TYPE_MROPE;
+                const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
+
+                // mrope
+                const int sect_0 = ((const int32_t *) dst->op_params)[11];
+                const int sect_1 = ((const int32_t *) dst->op_params)[12];
+                const int sect_2 = ((const int32_t *) dst->op_params)[13];
+                const int sect_3 = ((const int32_t *) dst->op_params)[14];

                id<MTLComputePipelineState> pipeline = nil;

-                if (!is_neox) {
+                if (is_neox) {
                    switch (src0->type) {
-                        case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32].pipeline; break;
-                        case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16].pipeline; break;
+                        case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32].pipeline; break;
+                        case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F16].pipeline; break;
+                        default: GGML_ABORT("fatal error");
+                    };
+                } else if (is_mrope && !is_vision) {
+                    GGML_ASSERT(ne10*4 >= ne02); // need at least 4 pos per token
+                    switch (src0->type) {
+                        case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F32].pipeline; break;
+                        case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_MULTI_F16].pipeline; break;
+                        default: GGML_ABORT("fatal error");
+                    };
+                } else if (is_vision) {
+                    GGML_ASSERT(ne10*4 >= ne02); // need at least 4 pos per token
+                    switch (src0->type) {
+                        case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_VISION_F32].pipeline; break;
+                        case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_VISION_F16].pipeline; break;
                        default: GGML_ABORT("fatal error");
                    };
                } else {
                    switch (src0->type) {
-                        case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32].pipeline; break;
-                        case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F16].pipeline; break;
+                        case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32].pipeline; break;
+                        case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16].pipeline; break;
                        default: GGML_ABORT("fatal error");
                    };
                }
@@ -3785,6 +3918,10 @@ static bool ggml_metal_encode_node(
                    /*.attn_factor =*/ attn_factor,
                    /*.beta_fast   =*/ beta_fast,
                    /*.beta_slow   =*/ beta_slow,
+                    /* sect_0      =*/ sect_0,
+                    /* sect_1      =*/ sect_1,
+                    /* sect_2      =*/ sect_2,
+                    /* sect_3      =*/ sect_3,
                };

                [encoder setComputePipelineState:pipeline];
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -2713,8 +2713,148 @@ kernel void kernel_rope_neox(
    }
 }

+template<typename T>
+kernel void kernel_rope_multi(
+        constant ggml_metal_kargs_rope & args,
+        device const char * src0,
+        device const char * src1,
+        device const char * src2,
+        device       char * dst,
+        ushort  tiitg[[thread_index_in_threadgroup]],
+        ushort3 tptg [[threads_per_threadgroup]],
+        uint3   tgpig[[threadgroup_position_in_grid]]) {
+    const int i3 = tgpig[2];
+    const int i2 = tgpig[1];
+    const int i1 = tgpig[0];
+
+    float corr_dims[2];
+    rope_yarn_corr_dims(args.n_dims, args.n_ctx_orig, args.freq_base, args.beta_fast, args.beta_slow, corr_dims);
+
+    device const int32_t * pos = (device const int32_t *) src1;
+
+    const float inv_ndims = -1.f/args.n_dims;
+
+    float cos_theta;
+    float sin_theta;
+
+    for (int i0 = 2*tiitg; i0 < args.ne0; i0 += 2*tptg.x) {
+        if (i0 < args.n_dims) {
+            const int ic = i0/2;
+
+            // mrope theta calculations
+            // note: the rest is the same as kernel_rope_neox
+            const int sect_dims = args.sect_0 + args.sect_1 + args.sect_2 + args.sect_3;
+            const int sec_w01   = args.sect_0 + args.sect_1;               // end of section 1
+            const int sec_w012  = args.sect_0 + args.sect_1 + args.sect_2; // end of section 2
+            const int sector    = ic % sect_dims;
+
+            float theta_base;
+            if (sector < args.sect_0) {
+                theta_base = (float) pos[i2];
+            } else if (sector < sec_w01) {
+                theta_base = (float) pos[i2 + args.ne02];
+            } else if (sector < sec_w012) {
+                theta_base = (float) pos[i2 + args.ne02 * 2];
+            } else {
+                theta_base = (float) pos[i2 + args.ne02 * 3];
+            }
+            // end of mrope
+
+            const float theta = theta_base * pow(args.freq_base, inv_ndims*i0);
+
+            const float freq_factor = src2 != src0 ? ((device const float *) src2)[ic] : 1.0f;
+
+            rope_yarn(theta/freq_factor, args.freq_scale, corr_dims, i0, args.ext_factor, args.attn_factor, &cos_theta, &sin_theta);
+
+            device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + ic*args.nb00);
+            device       T * dst_data  = (device T *)( dst + i3*args.nb3  + i2*args.nb2  + i1*args.nb1  + ic*args.nb0);
+
+            const float x0 = src[0];
+            const float x1 = src[args.n_dims/2];
+
+            dst_data[0]             = x0*cos_theta - x1*sin_theta;
+            dst_data[args.n_dims/2] = x0*sin_theta + x1*cos_theta;
+        } else {
+            device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + i0*args.nb00);
+            device       T * dst_data  = (device T *)( dst + i3*args.nb3  + i2*args.nb2  + i1*args.nb1  + i0*args.nb0);
+
+            dst_data[0] = src[0];
+            dst_data[1] = src[1];
+        }
+    }
+}
+
+template<typename T>
+kernel void kernel_rope_vision(
+        constant ggml_metal_kargs_rope & args,
+        device const char * src0,
+        device const char * src1,
+        device const char * src2,
+        device       char * dst,
+        ushort  tiitg[[thread_index_in_threadgroup]],
+        ushort3 tptg [[threads_per_threadgroup]],
+        uint3   tgpig[[threadgroup_position_in_grid]]) {
+    const int i3 = tgpig[2];
+    const int i2 = tgpig[1];
+    const int i1 = tgpig[0];
+
+    float corr_dims[2];
+    rope_yarn_corr_dims(args.n_dims, args.n_ctx_orig, args.freq_base, args.beta_fast, args.beta_slow, corr_dims);
+
+    device const int32_t * pos = (device const int32_t *) src1;
+
+    const float inv_ndims = -1.f/args.n_dims;
+
+    float cos_theta;
+    float sin_theta;
+
+    for (int i0 = 2*tiitg; i0 < args.ne0; i0 += 2*tptg.x) {
+        if (i0 < 2*args.n_dims) { // different from kernel_rope_multi
+            const int ic = i0/2;
+
+            // mrope theta calculations (only support 2 dimensions)
+            const int sect_dims = args.sect_0 + args.sect_1;
+            const int sector    = ic % sect_dims;
+
+            float p;
+            float theta_base;
+            if (sector < args.sect_1) {
+                p = (float) sector;
+                theta_base = (float) pos[i2];
+            } else {
+                p = (float) sector - args.sect_0;
+                theta_base = (float) pos[i2 + args.ne02];
+            }
+
+            const float theta = theta_base * pow(args.freq_base, 2.0f * inv_ndims * p);
+            // end of mrope
+
+            const float freq_factor = src2 != src0 ? ((device const float *) src2)[ic] : 1.0f;
+
+            rope_yarn(theta/freq_factor, args.freq_scale, corr_dims, i0, args.ext_factor, args.attn_factor, &cos_theta, &sin_theta);
+
+            device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + ic*args.nb00);
+            device       T * dst_data  = (device T *)( dst + i3*args.nb3  + i2*args.nb2  + i1*args.nb1  + ic*args.nb0);
+
+            const float x0 = src[0];
+            const float x1 = src[args.n_dims]; // different from kernel_rope_multi
+
+            dst_data[0]           = x0*cos_theta - x1*sin_theta;
+            dst_data[args.n_dims] = x0*sin_theta + x1*cos_theta; // different from kernel_rope_multi
+        } else {
+            device const T * const src = (device T *)(src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01 + i0*args.nb00);
+            device       T * dst_data  = (device T *)( dst + i3*args.nb3  + i2*args.nb2  + i1*args.nb1  + i0*args.nb0);
+
+            dst_data[0] = src[0];
+            dst_data[1] = src[1];
+        }
+    }
+}
+
 typedef decltype(kernel_rope_norm<float>) kernel_rope_norm_t;
 typedef decltype(kernel_rope_neox<float>) kernel_rope_neox_t;
+typedef decltype(kernel_rope_multi<float>) kernel_rope_multi_t;
+typedef decltype(kernel_rope_vision<float>) kernel_rope_vision_t;

 template [[host_name("kernel_rope_norm_f32")]] kernel kernel_rope_norm_t kernel_rope_norm<float>;
 template [[host_name("kernel_rope_norm_f16")]] kernel kernel_rope_norm_t kernel_rope_norm<half>;
@@ -2722,6 +2862,12 @@ template [[host_name("kernel_rope_norm_f16")]] kernel kernel_rope_norm_t kernel_
 template [[host_name("kernel_rope_neox_f32")]] kernel kernel_rope_neox_t kernel_rope_neox<float>;
 template [[host_name("kernel_rope_neox_f16")]] kernel kernel_rope_neox_t kernel_rope_neox<half>;

+template [[host_name("kernel_rope_multi_f32")]] kernel kernel_rope_multi_t kernel_rope_multi<float>;
+template [[host_name("kernel_rope_multi_f16")]] kernel kernel_rope_multi_t kernel_rope_multi<half>;
+
+template [[host_name("kernel_rope_vision_f32")]] kernel kernel_rope_vision_t kernel_rope_vision<float>;
+template [[host_name("kernel_rope_vision_f16")]] kernel kernel_rope_vision_t kernel_rope_vision<half>;
+
 typedef void (im2col_t)(
        device const float * x,
        device        char * dst,
@@ -6336,127 +6482,219 @@ kernel void kernel_mul_mm(
    }
 }

-// same as kernel_mul_mm_impl, but src1 and dst are accessed via indices stored in rowids
-// TODO: this kernel needs to be reimplemented from scratch for better performance
-template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread half4x4 &)>
-void kernel_mul_mm_id_impl(
-        int32_t  ne00,
-        int32_t  ne02,
-        uint64_t nb01,
-        uint64_t nb02,
-        int32_t  ne11,
-        int32_t  ne12,
-        uint64_t nb10,
-        uint64_t nb11,
-        uint64_t nb12,
-        int32_t  ne0,
-        int32_t  ne1,
-        int64_t  ne0ne1,
-        device   const char * src0,
-        device   const char * src1,
-        threadgroup ushort2 * rowids,
-        device         char * dst,
-        threadgroup    char * shmem,
+template<typename T4>
+kernel void kernel_mul_mm_id_map0(
+        constant ggml_metal_kargs_mul_mm_id_map0 & args,
+        device  const char * src1,
+        device  const char * src2,
+        device        char * hsrc1,
+        device        char * htpe,
+        device        char * hids,
+        uint3   tgpig[[threadgroup_position_in_grid]],
+        ushort3 tpitg[[thread_position_in_threadgroup]],
+        ushort3   ntg[[threads_per_threadgroup]]) {
+    const int ide = tgpig[0]; // expert id
+
+    int n_all = 0;
+
+    device int32_t * ids_i32 = (device int32_t *) (hids);
+
+    for (int i21 = 0; i21 < args.neh11; i21++) { // n_tokens
+        device const int32_t * src2_i32 = (device const int32_t *) (src2 + i21*args.nb21);
+
+        for (int i20 = 0; i20 < args.ne20; i20++) { // n_expert_used
+            if (src2_i32[i20] != ide) {
+                continue;
+            }
+
+            device const float4 *  src1_f32x4 = (device const float4 *) ( src1 + i21*args.nb12 + (i20%args.ne11)*args.nb11);
+            device       T4     * hsrc1_f32x4 = (device       T4     *) (hsrc1 + (ide*args.neh11 + n_all)*args.nbh11);
+
+            for (int64_t i00 = tpitg.x; i00 < args.ne10/4; i00 += ntg.x) {
+                hsrc1_f32x4[i00] = (T4) (src1_f32x4[i00]);
+            }
+
+            if (tpitg.x == 0) {
+                ids_i32[i21*args.ne20 + i20] = ide*args.neh11 + n_all;
+            }
+
+            ++n_all;
+        }
+    }
+
+    if (tpitg.x == 0) {
+        device int32_t * tpe_i32 = (device int32_t *) (htpe);
+        tpe_i32[ide] = n_all;
+    }
+}
+
+typedef decltype(kernel_mul_mm_id_map0<half4>) kernel_mul_mm_id_map0_t;
+
+template [[host_name("kernel_mul_mm_id_map0_f16")]] kernel kernel_mul_mm_id_map0_t kernel_mul_mm_id_map0<half4>;
+
+template<typename T>
+kernel void kernel_mul_mm_id_map1(
+        constant ggml_metal_kargs_mul_mm_id_map1 & args,
+        device  const char * hdst,
+        device  const char * hids,
+        device        char * dst,
+        uint3   tgpig[[threadgroup_position_in_grid]],
+        ushort3 tpitg[[thread_position_in_threadgroup]],
+        ushort3   ntg[[threads_per_threadgroup]]) {
+    const int i20 = tgpig[0]; // used expert
+    const int i21 = tgpig[1]; // token
+
+    device const int32_t * ids_i32    = (device const int32_t *) (hids);
+    device       float4  * dst_f32x4  = (device       float4  *) (dst + i20*args.nb1 + i21*args.nb2);
+
+    const int id = ids_i32[i21*args.ne20 + i20];
+
+    const int ide = id / args.neh1;
+    const int idt = id % args.neh1;
+
+    device const float4 * hdst_f32x4 = (device const float4 *) (hdst + idt*args.nbh1 + ide*args.nbh2);
+
+    for (int64_t i0 = tpitg.x; i0 < args.neh0/4; i0 += ntg.x) {
+        dst_f32x4[i0] = hdst_f32x4[i0];
+    }
+}
+
+typedef decltype(kernel_mul_mm_id_map1<float>) kernel_mul_mm_id_map1_t;
+
+template [[host_name("kernel_mul_mm_id_map1_f32")]] kernel kernel_mul_mm_id_map1_t kernel_mul_mm_id_map1<float>;
+
+template<typename T, typename T4x4, typename simdgroup_T8x8, typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread T4x4 &)>
+kernel void kernel_mul_mm_id(
+        constant ggml_metal_kargs_mul_mm_id & args,
+        device const char * src0,
+        device const char * src1,
+        device const char * tpe,
+        device       char * dst,
+        threadgroup  char * shmem [[threadgroup(0)]],
        uint3  tgpig[[threadgroup_position_in_grid]],
        ushort tiitg[[thread_index_in_threadgroup]],
        ushort sgitg[[simdgroup_index_in_threadgroup]]) {

-    threadgroup half  * sa = (threadgroup half  *)(shmem);
-    threadgroup float * sb = (threadgroup float *)(shmem + 4096);
+    threadgroup T    * sa = (threadgroup T    *)(shmem);
+    threadgroup half * sb = (threadgroup half *)(shmem + 4096);

    const int r0 = tgpig.y;
    const int r1 = tgpig.x;
+    const int im = tgpig.z;

-    if (r1*BLOCK_SIZE_N >= ne1) return;
+    device const int32_t * tpe_i32 = (device const int32_t *) (tpe);
+
+    const int neh1 = tpe_i32[im];
+
+    if (r1*BLOCK_SIZE_N >= neh1) {
+        return;
+    }

    // if this block is of 64x32 shape or smaller
-    short n_rows = (ne0 - r0 * BLOCK_SIZE_M < BLOCK_SIZE_M) ? (ne0 - r0 * BLOCK_SIZE_M) : BLOCK_SIZE_M;
-    short n_cols = (ne1 - r1 * BLOCK_SIZE_N < BLOCK_SIZE_N) ? (ne1 - r1 * BLOCK_SIZE_N) : BLOCK_SIZE_N;
+    const short n_rows = (args.neh0 - r0*BLOCK_SIZE_M < BLOCK_SIZE_M) ? (args.neh0 - r0*BLOCK_SIZE_M) : BLOCK_SIZE_M;
+    const short n_cols = (     neh1 - r1*BLOCK_SIZE_N < BLOCK_SIZE_N) ? (     neh1 - r1*BLOCK_SIZE_N) : BLOCK_SIZE_N;

    // a thread shouldn't load data outside of the matrix
-    short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1;
-    short thread_col = ((short)tiitg/THREAD_PER_COL) < n_cols ? ((short)tiitg/THREAD_PER_COL) : n_cols - 1;
+    const short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1;
+    const short thread_col = ((short)tiitg/THREAD_PER_COL) < n_cols ? ((short)tiitg/THREAD_PER_COL) : n_cols - 1;

-    simdgroup_half8x8  ma[4];
-    simdgroup_float8x8 mb[2];
+    simdgroup_T8x8     ma[4];
+    simdgroup_half8x8  mb[2];
    simdgroup_float8x8 mc[8];
-    for (int i = 0; i < 8; i++){
+
+    for (short i = 0; i < 8; i++){
        mc[i] = make_filled_simdgroup_matrix<float, 8>(0.f);
    }
+
    short il = (tiitg % THREAD_PER_ROW);

-    ushort offset1 = il/nl;
+    const int i12 = im%args.neh12;
+    const int i13 = im/args.neh12;

-    threadgroup const auto & id = rowids[r1 * BLOCK_SIZE_N + thread_col];
+    const uint64_t offset0 = (i12/args.r2)*args.nb02 + (i13/args.r3)*args.nb03;
+    const short    offset1 = il/nl;

-    device const block_q * x = (device const block_q *)(src0 + (r0 * BLOCK_SIZE_M + thread_row) * nb01) + offset1;
-    device const float   * y = (device const float   *)(src1
-        + nb12 * id[1]
-        + nb11 * (id[0] % ne11)
-        + nb10 * (BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL)));
+    device const block_q * x = (device const block_q *)(src0
+        + args.nb01*(r0*BLOCK_SIZE_M + thread_row) + offset0) + offset1;

-    for (int loop_k = 0; loop_k < ne00; loop_k += BLOCK_SIZE_K) {
+    device const half   * y = (device const half   *)(src1
+        + args.nbh13*i13
+        + args.nbh12*i12
+        + args.nbh11*(r1*BLOCK_SIZE_N + thread_col)
+        + args.nbh10*(BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL)));
+
+    for (int loop_k = 0; loop_k < args.ne00; loop_k += BLOCK_SIZE_K) {
        // load data and store to threadgroup memory
-        half4x4 temp_a;
+        T4x4 temp_a;
        dequantize_func(x, il, temp_a);
+
        threadgroup_barrier(mem_flags::mem_threadgroup);

-        for (int i = 0; i < 16; i++) {
-            *(sa + SG_MAT_SIZE * ((tiitg / THREAD_PER_ROW / 8) \
-            +                     (tiitg % THREAD_PER_ROW) * 16 + (i / 8) * 8) \
-            +                     (tiitg / THREAD_PER_ROW) % 8  + (i & 7) * 8) = temp_a[i/4][i%4];
+        #pragma unroll(16)
+        for (short i = 0; i < 16; i++) {
+            *(sa + SG_MAT_SIZE * ((tiitg/THREAD_PER_ROW/8) \
+            +                     (tiitg%THREAD_PER_ROW)*16 + (i/8)*8) \
+            +                     (tiitg/THREAD_PER_ROW)%8  + (i&7)*8) = temp_a[i/4][i%4];
        }

-        *(threadgroup float2x4 *)(sb + (tiitg % THREAD_PER_COL) * 8 * 32 + 8 * (tiitg / THREAD_PER_COL)) = *((device float2x4 *)y);
+        *(threadgroup half2x4 *)(sb + 32*8*(tiitg%THREAD_PER_COL) + 8*(tiitg/THREAD_PER_COL)) = *((device half2x4 *) y);

        il = (il + 2 < nl) ? il + 2 : il % 2;
-        x  = (il < 2) ? x + (2+nl-1)/nl : x;
+        x  = (il < 2) ? x + (2 + nl - 1)/nl : x;
        y += BLOCK_SIZE_K;

        threadgroup_barrier(mem_flags::mem_threadgroup);

        // load matrices from threadgroup memory and conduct outer products
-        threadgroup half  * lsma = (sa + THREAD_MAT_M * SG_MAT_SIZE * (sgitg % 2));
-        threadgroup float * lsmb = (sb + THREAD_MAT_N * SG_MAT_SIZE * (sgitg / 2));
+        threadgroup const T    * lsma = (sa + THREAD_MAT_M*SG_MAT_SIZE*(sgitg%2));
+        threadgroup const half * lsmb = (sb + THREAD_MAT_N*SG_MAT_SIZE*(sgitg/2));

-        #pragma unroll(BLOCK_SIZE_K/8)
-        for (int ik = 0; ik < BLOCK_SIZE_K / 8; ik++) {
+        #pragma unroll(4)
+        for (short ik = 0; ik < BLOCK_SIZE_K/8; ik++) {
            #pragma unroll(4)
-            for (int i = 0; i < 4; i++) {
+            for (short i = 0; i < 4; i++) {
                simdgroup_load(ma[i], lsma + SG_MAT_SIZE * i);
            }
+
            simdgroup_barrier(mem_flags::mem_none);
+
            #pragma unroll(2)
-            for (int i = 0; i < 2; i++) {
+            for (short i = 0; i < 2; i++) {
                simdgroup_load(mb[i], lsmb + SG_MAT_SIZE * i);
            }

-            lsma += BLOCK_SIZE_M / SG_MAT_ROW * SG_MAT_SIZE;
-            lsmb += BLOCK_SIZE_N / SG_MAT_ROW * SG_MAT_SIZE;
-
            #pragma unroll(8)
-            for (int i = 0; i < 8; i++){
+            for (short i = 0; i < 8; i++){
                simdgroup_multiply_accumulate(mc[i], mb[i/4], ma[i%4], mc[i]);
            }
+
+            lsma += (BLOCK_SIZE_M/SG_MAT_ROW)*SG_MAT_SIZE;
+            lsmb += (BLOCK_SIZE_N/SG_MAT_ROW)*SG_MAT_SIZE;
        }
    }

-    {
+    if ((r0 + 1) * BLOCK_SIZE_M <= args.neh0 && (r1 + 1) * BLOCK_SIZE_N <= neh1) {
+        device float * C = (device float *) dst +
+            (BLOCK_SIZE_M * r0 + 32*(sgitg &  1)) + \
+            (BLOCK_SIZE_N * r1 + 16*(sgitg >> 1)) * args.neh0 + im*args.neh1*args.neh0;
+
+        for (short i = 0; i < 8; i++) {
+            simdgroup_store(mc[i], C + 8 * (i%4) + 8 * args.neh0 * (i/4), args.neh0);
+        }
+    } else {
+        // block is smaller than 64x32, we should avoid writing data outside of the matrix
        threadgroup_barrier(mem_flags::mem_threadgroup);
        threadgroup float * temp_str = ((threadgroup float *) shmem) \
-                                      + 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M;
-        for (int i = 0; i < 8; i++) {
-            simdgroup_store(mc[i], temp_str + 8 * (i%4) + 8 * BLOCK_SIZE_M * (i/4), BLOCK_SIZE_M);
+                                     + 32*(sgitg&1) + (16*(sgitg >> 1))*BLOCK_SIZE_M;
+        for (short i = 0; i < 8; i++) {
+            simdgroup_store(mc[i], temp_str + 8*(i%4) + 8*BLOCK_SIZE_M*(i/4), BLOCK_SIZE_M);
        }

        threadgroup_barrier(mem_flags::mem_threadgroup);

        if (sgitg == 0) {
            for (int j = tiitg; j < n_cols; j += BLOCK_SIZE_N) {
-                threadgroup const auto & jid = rowids[r1 * BLOCK_SIZE_N + j];
-                int64_t joff = jid[0]*ne0 + jid[1]*ne0ne1;
-
-                device float  * D  = (device float  *) dst + (r0*BLOCK_SIZE_M) + joff;
+                device float  * D  = (device float  *) dst + (r0*BLOCK_SIZE_M) + (r1*BLOCK_SIZE_N + j)*args.neh0 + im*args.neh1*args.neh0;
                device float4 * D4 = (device float4 *) D;

                threadgroup float  * C  = temp_str + (j*BLOCK_SIZE_M);
@@ -6476,66 +6714,6 @@ void kernel_mul_mm_id_impl(
    }
 }

-template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread half4x4 &)>
-kernel void kernel_mul_mm_id(
-        constant ggml_metal_kargs_mul_mm_id & args,
-        device const char * src0s,
-        device const char * src1,
-        device       char * dst,
-        device const char * ids,
-        threadgroup  char * shmem [[threadgroup(0)]],
-        uint3  tgpig[[threadgroup_position_in_grid]],
-        ushort tiitg[[thread_index_in_threadgroup]],
-        ushort sgitg[[simdgroup_index_in_threadgroup]]) {
-
-    const int32_t i02 = tgpig.z;
-
-    tgpig.z = 0;
-
-    device const char * src0 = src0s + i02*args.nb02;
-
-    // row indices
-    threadgroup ushort2 * rowids = (threadgroup ushort2 *)(shmem + 8192);
-
-    // TODO: parallelize this loop
-    int32_t _ne1 = 0;
-    for (ushort ii1 = 0; ii1 < args.nei1; ii1++) {
-        for (ushort ii0 = 0; ii0 < args.nei0; ii0++) {
-            int32_t id = ((device int32_t *) (ids + ii1*args.nbi1))[ii0];
-            if (id == i02) {
-                if (tiitg == 0) {
-                    rowids[_ne1] = ushort2(ii0, ii1);
-                }
-                _ne1++;
-            }
-        }
-    }
-
-    threadgroup_barrier(mem_flags::mem_threadgroup);
-
-    kernel_mul_mm_id_impl<block_q, nl, dequantize_func>(
-        args.ne00,
-        args.ne02,
-        args.nb01,
-        args.nb02,
-        args.ne11,
-        args.ne12,
-        args.nb10,
-        args.nb11,
-        args.nb12,
-        args.ne0,
-        _ne1,
-        (int64_t)args.ne0*args.ne1,
-        src0,
-        src1,
-        rowids,
-        dst,
-        shmem,
-        tgpig,
-        tiitg,
-        sgitg);
-}
-
 #define QK_NL 16

 //
@@ -6576,63 +6754,64 @@ template [[host_name("kernel_get_rows_iq4_xs")]]  kernel get_rows_q_t kernel_get
 // matrix-matrix multiplication
 //

-typedef decltype(kernel_mul_mm<half, half4x4, simdgroup_half8x8, float4x4, 1, dequantize_f32>) mat_mm_t;
+typedef decltype(kernel_mul_mm<half, half4x4, simdgroup_half8x8, float4x4, 1, dequantize_f32>) mul_mm_t;

-template [[host_name("kernel_mul_mm_f32_f32")]]     kernel mat_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   float4x4,      1,     dequantize_f32>;
-template [[host_name("kernel_mul_mm_f16_f32")]]     kernel mat_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   half4x4,       1,     dequantize_f16>;
+template [[host_name("kernel_mul_mm_f32_f32")]]     kernel mul_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   float4x4,      1,     dequantize_f32>;
+template [[host_name("kernel_mul_mm_f16_f32")]]     kernel mul_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   half4x4,       1,     dequantize_f16>;
 #if defined(GGML_METAL_USE_BF16)
-template [[host_name("kernel_mul_mm_bf16_f32")]]    kernel mat_mm_t kernel_mul_mm<bfloat, bfloat4x4, simdgroup_bfloat8x8, bfloat4x4,     1,     dequantize_bf16>;
+template [[host_name("kernel_mul_mm_bf16_f32")]]    kernel mul_mm_t kernel_mul_mm<bfloat, bfloat4x4, simdgroup_bfloat8x8, bfloat4x4,     1,     dequantize_bf16>;
 #endif
-template [[host_name("kernel_mul_mm_q4_0_f32")]]    kernel mat_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   block_q4_0,    2,     dequantize_q4_0>;
-template [[host_name("kernel_mul_mm_q4_1_f32")]]    kernel mat_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   block_q4_1,    2,     dequantize_q4_1>;
-template [[host_name("kernel_mul_mm_q5_0_f32")]]    kernel mat_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   block_q5_0,    2,     dequantize_q5_0>;
-template [[host_name("kernel_mul_mm_q5_1_f32")]]    kernel mat_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   block_q5_1,    2,     dequantize_q5_1>;
-template [[host_name("kernel_mul_mm_q8_0_f32")]]    kernel mat_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   block_q8_0,    2,     dequantize_q8_0>;
-template [[host_name("kernel_mul_mm_q2_K_f32")]]    kernel mat_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   block_q2_K,    QK_NL, dequantize_q2_K>;
-template [[host_name("kernel_mul_mm_q3_K_f32")]]    kernel mat_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   block_q3_K,    QK_NL, dequantize_q3_K>;
-template [[host_name("kernel_mul_mm_q4_K_f32")]]    kernel mat_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   block_q4_K,    QK_NL, dequantize_q4_K>;
-template [[host_name("kernel_mul_mm_q5_K_f32")]]    kernel mat_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   block_q5_K,    QK_NL, dequantize_q5_K>;
-template [[host_name("kernel_mul_mm_q6_K_f32")]]    kernel mat_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   block_q6_K,    QK_NL, dequantize_q6_K>;
-template [[host_name("kernel_mul_mm_iq2_xxs_f32")]] kernel mat_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   block_iq2_xxs, QK_NL, dequantize_iq2_xxs>;
-template [[host_name("kernel_mul_mm_iq2_xs_f32")]]  kernel mat_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   block_iq2_xs,  QK_NL, dequantize_iq2_xs>;
-template [[host_name("kernel_mul_mm_iq3_xxs_f32")]] kernel mat_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
-template [[host_name("kernel_mul_mm_iq3_s_f32")]]   kernel mat_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   block_iq3_s,   QK_NL, dequantize_iq3_s>;
-template [[host_name("kernel_mul_mm_iq2_s_f32")]]   kernel mat_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   block_iq2_s,   QK_NL, dequantize_iq2_s>;
-template [[host_name("kernel_mul_mm_iq1_s_f32")]]   kernel mat_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   block_iq1_s,   QK_NL, dequantize_iq1_s>;
-template [[host_name("kernel_mul_mm_iq1_m_f32")]]   kernel mat_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   block_iq1_m,   QK_NL, dequantize_iq1_m>;
-template [[host_name("kernel_mul_mm_iq4_nl_f32")]]  kernel mat_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   block_iq4_nl,  2,     dequantize_iq4_nl>;
-template [[host_name("kernel_mul_mm_iq4_xs_f32")]]  kernel mat_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   block_iq4_xs,  QK_NL, dequantize_iq4_xs>;
+template [[host_name("kernel_mul_mm_q4_0_f32")]]    kernel mul_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   block_q4_0,    2,     dequantize_q4_0>;
+template [[host_name("kernel_mul_mm_q4_1_f32")]]    kernel mul_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   block_q4_1,    2,     dequantize_q4_1>;
+template [[host_name("kernel_mul_mm_q5_0_f32")]]    kernel mul_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   block_q5_0,    2,     dequantize_q5_0>;
+template [[host_name("kernel_mul_mm_q5_1_f32")]]    kernel mul_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   block_q5_1,    2,     dequantize_q5_1>;
+template [[host_name("kernel_mul_mm_q8_0_f32")]]    kernel mul_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   block_q8_0,    2,     dequantize_q8_0>;
+template [[host_name("kernel_mul_mm_q2_K_f32")]]    kernel mul_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   block_q2_K,    QK_NL, dequantize_q2_K>;
+template [[host_name("kernel_mul_mm_q3_K_f32")]]    kernel mul_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   block_q3_K,    QK_NL, dequantize_q3_K>;
+template [[host_name("kernel_mul_mm_q4_K_f32")]]    kernel mul_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   block_q4_K,    QK_NL, dequantize_q4_K>;
+template [[host_name("kernel_mul_mm_q5_K_f32")]]    kernel mul_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   block_q5_K,    QK_NL, dequantize_q5_K>;
+template [[host_name("kernel_mul_mm_q6_K_f32")]]    kernel mul_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   block_q6_K,    QK_NL, dequantize_q6_K>;
+template [[host_name("kernel_mul_mm_iq2_xxs_f32")]] kernel mul_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   block_iq2_xxs, QK_NL, dequantize_iq2_xxs>;
+template [[host_name("kernel_mul_mm_iq2_xs_f32")]]  kernel mul_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   block_iq2_xs,  QK_NL, dequantize_iq2_xs>;
+template [[host_name("kernel_mul_mm_iq3_xxs_f32")]] kernel mul_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
+template [[host_name("kernel_mul_mm_iq3_s_f32")]]   kernel mul_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   block_iq3_s,   QK_NL, dequantize_iq3_s>;
+template [[host_name("kernel_mul_mm_iq2_s_f32")]]   kernel mul_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   block_iq2_s,   QK_NL, dequantize_iq2_s>;
+template [[host_name("kernel_mul_mm_iq1_s_f32")]]   kernel mul_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   block_iq1_s,   QK_NL, dequantize_iq1_s>;
+template [[host_name("kernel_mul_mm_iq1_m_f32")]]   kernel mul_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   block_iq1_m,   QK_NL, dequantize_iq1_m>;
+template [[host_name("kernel_mul_mm_iq4_nl_f32")]]  kernel mul_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   block_iq4_nl,  2,     dequantize_iq4_nl>;
+template [[host_name("kernel_mul_mm_iq4_xs_f32")]]  kernel mul_mm_t kernel_mul_mm<half,   half4x4,   simdgroup_half8x8,   block_iq4_xs,  QK_NL, dequantize_iq4_xs>;

 //
 // indirect matrix-matrix multiplication
 //

-typedef decltype(kernel_mul_mm_id<float4x4, 1, dequantize_f32>) mat_mm_id_t;
+typedef decltype(kernel_mul_mm_id<half, half4x4, simdgroup_half8x8, float4x4, 1, dequantize_f32>) mul_mm_id;

-template [[host_name("kernel_mul_mm_id_f32_f32")]]     kernel mat_mm_id_t kernel_mul_mm_id<float4x4,      1,     dequantize_f32>;
-template [[host_name("kernel_mul_mm_id_f16_f32")]]     kernel mat_mm_id_t kernel_mul_mm_id<half4x4,       1,     dequantize_f16>;
+template [[host_name("kernel_mul_mm_id_f32_f16")]]     kernel mul_mm_id kernel_mul_mm_id<half,   half4x4,   simdgroup_half8x8,   float4x4,      1,     dequantize_f32>;
+template [[host_name("kernel_mul_mm_id_f16_f16")]]     kernel mul_mm_id kernel_mul_mm_id<half,   half4x4,   simdgroup_half8x8,   half4x4,       1,     dequantize_f16>;
 #if defined(GGML_METAL_USE_BF16)
-template [[host_name("kernel_mul_mm_id_bf16_f32")]]    kernel mat_mm_id_t kernel_mul_mm_id<bfloat4x4,     1,     dequantize_bf16>;
+template [[host_name("kernel_mul_mm_id_bf16_f16")]]    kernel mul_mm_id kernel_mul_mm_id<bfloat, bfloat4x4, simdgroup_bfloat8x8, bfloat4x4,     1,     dequantize_bf16>;
 #endif
-template [[host_name("kernel_mul_mm_id_q4_0_f32")]]    kernel mat_mm_id_t kernel_mul_mm_id<block_q4_0,    2,     dequantize_q4_0>;
-template [[host_name("kernel_mul_mm_id_q4_1_f32")]]    kernel mat_mm_id_t kernel_mul_mm_id<block_q4_1,    2,     dequantize_q4_1>;
-template [[host_name("kernel_mul_mm_id_q5_0_f32")]]    kernel mat_mm_id_t kernel_mul_mm_id<block_q5_0,    2,     dequantize_q5_0>;
-template [[host_name("kernel_mul_mm_id_q5_1_f32")]]    kernel mat_mm_id_t kernel_mul_mm_id<block_q5_1,    2,     dequantize_q5_1>;
-template [[host_name("kernel_mul_mm_id_q8_0_f32")]]    kernel mat_mm_id_t kernel_mul_mm_id<block_q8_0,    2,     dequantize_q8_0>;
-template [[host_name("kernel_mul_mm_id_q2_K_f32")]]    kernel mat_mm_id_t kernel_mul_mm_id<block_q2_K,    QK_NL, dequantize_q2_K>;
-template [[host_name("kernel_mul_mm_id_q3_K_f32")]]    kernel mat_mm_id_t kernel_mul_mm_id<block_q3_K,    QK_NL, dequantize_q3_K>;
-template [[host_name("kernel_mul_mm_id_q4_K_f32")]]    kernel mat_mm_id_t kernel_mul_mm_id<block_q4_K,    QK_NL, dequantize_q4_K>;
-template [[host_name("kernel_mul_mm_id_q5_K_f32")]]    kernel mat_mm_id_t kernel_mul_mm_id<block_q5_K,    QK_NL, dequantize_q5_K>;
-template [[host_name("kernel_mul_mm_id_q6_K_f32")]]    kernel mat_mm_id_t kernel_mul_mm_id<block_q6_K,    QK_NL, dequantize_q6_K>;
-template [[host_name("kernel_mul_mm_id_iq2_xxs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq2_xxs, QK_NL, dequantize_iq2_xxs>;
-template [[host_name("kernel_mul_mm_id_iq2_xs_f32")]]  kernel mat_mm_id_t kernel_mul_mm_id<block_iq2_xs,  QK_NL, dequantize_iq2_xs>;
-template [[host_name("kernel_mul_mm_id_iq3_xxs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id<block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
-template [[host_name("kernel_mul_mm_id_iq3_s_f32")]]   kernel mat_mm_id_t kernel_mul_mm_id<block_iq3_s,   QK_NL, dequantize_iq3_s>;
-template [[host_name("kernel_mul_mm_id_iq2_s_f32")]]   kernel mat_mm_id_t kernel_mul_mm_id<block_iq2_s,   QK_NL, dequantize_iq2_s>;
-template [[host_name("kernel_mul_mm_id_iq1_s_f32")]]   kernel mat_mm_id_t kernel_mul_mm_id<block_iq1_s,   QK_NL, dequantize_iq1_s>;
-template [[host_name("kernel_mul_mm_id_iq1_m_f32")]]   kernel mat_mm_id_t kernel_mul_mm_id<block_iq1_m,   QK_NL, dequantize_iq1_m>;
-template [[host_name("kernel_mul_mm_id_iq4_nl_f32")]]  kernel mat_mm_id_t kernel_mul_mm_id<block_iq4_nl,  2,     dequantize_iq4_nl>;
-template [[host_name("kernel_mul_mm_id_iq4_xs_f32")]]  kernel mat_mm_id_t kernel_mul_mm_id<block_iq4_xs,  QK_NL, dequantize_iq4_xs>;
+template [[host_name("kernel_mul_mm_id_q4_0_f16")]]    kernel mul_mm_id kernel_mul_mm_id<half,   half4x4,   simdgroup_half8x8,   block_q4_0,    2,     dequantize_q4_0>;
+template [[host_name("kernel_mul_mm_id_q4_1_f16")]]    kernel mul_mm_id kernel_mul_mm_id<half,   half4x4,   simdgroup_half8x8,   block_q4_1,    2,     dequantize_q4_1>;
+template [[host_name("kernel_mul_mm_id_q5_0_f16")]]    kernel mul_mm_id kernel_mul_mm_id<half,   half4x4,   simdgroup_half8x8,   block_q5_0,    2,     dequantize_q5_0>;
+template [[host_name("kernel_mul_mm_id_q5_1_f16")]]    kernel mul_mm_id kernel_mul_mm_id<half,   half4x4,   simdgroup_half8x8,   block_q5_1,    2,     dequantize_q5_1>;
+template [[host_name("kernel_mul_mm_id_q8_0_f16")]]    kernel mul_mm_id kernel_mul_mm_id<half,   half4x4,   simdgroup_half8x8,   block_q8_0,    2,     dequantize_q8_0>;
+template [[host_name("kernel_mul_mm_id_q2_K_f16")]]    kernel mul_mm_id kernel_mul_mm_id<half,   half4x4,   simdgroup_half8x8,   block_q2_K,    QK_NL, dequantize_q2_K>;
+template [[host_name("kernel_mul_mm_id_q3_K_f16")]]    kernel mul_mm_id kernel_mul_mm_id<half,   half4x4,   simdgroup_half8x8,   block_q3_K,    QK_NL, dequantize_q3_K>;
+template [[host_name("kernel_mul_mm_id_q4_K_f16")]]    kernel mul_mm_id kernel_mul_mm_id<half,   half4x4,   simdgroup_half8x8,   block_q4_K,    QK_NL, dequantize_q4_K>;
+template [[host_name("kernel_mul_mm_id_q5_K_f16")]]    kernel mul_mm_id kernel_mul_mm_id<half,   half4x4,   simdgroup_half8x8,   block_q5_K,    QK_NL, dequantize_q5_K>;
+template [[host_name("kernel_mul_mm_id_q6_K_f16")]]    kernel mul_mm_id kernel_mul_mm_id<half,   half4x4,   simdgroup_half8x8,   block_q6_K,    QK_NL, dequantize_q6_K>;
+template [[host_name("kernel_mul_mm_id_iq2_xxs_f16")]] kernel mul_mm_id kernel_mul_mm_id<half,   half4x4,   simdgroup_half8x8,   block_iq2_xxs, QK_NL, dequantize_iq2_xxs>;
+template [[host_name("kernel_mul_mm_id_iq2_xs_f16")]]  kernel mul_mm_id kernel_mul_mm_id<half,   half4x4,   simdgroup_half8x8,   block_iq2_xs,  QK_NL, dequantize_iq2_xs>;
+template [[host_name("kernel_mul_mm_id_iq3_xxs_f16")]] kernel mul_mm_id kernel_mul_mm_id<half,   half4x4,   simdgroup_half8x8,   block_iq3_xxs, QK_NL, dequantize_iq3_xxs>;
+template [[host_name("kernel_mul_mm_id_iq3_s_f16")]]   kernel mul_mm_id kernel_mul_mm_id<half,   half4x4,   simdgroup_half8x8,   block_iq3_s,   QK_NL, dequantize_iq3_s>;
+template [[host_name("kernel_mul_mm_id_iq2_s_f16")]]   kernel mul_mm_id kernel_mul_mm_id<half,   half4x4,   simdgroup_half8x8,   block_iq2_s,   QK_NL, dequantize_iq2_s>;
+template [[host_name("kernel_mul_mm_id_iq1_s_f16")]]   kernel mul_mm_id kernel_mul_mm_id<half,   half4x4,   simdgroup_half8x8,   block_iq1_s,   QK_NL, dequantize_iq1_s>;
+template [[host_name("kernel_mul_mm_id_iq1_m_f16")]]   kernel mul_mm_id kernel_mul_mm_id<half,   half4x4,   simdgroup_half8x8,   block_iq1_m,   QK_NL, dequantize_iq1_m>;
+template [[host_name("kernel_mul_mm_id_iq4_nl_f16")]]  kernel mul_mm_id kernel_mul_mm_id<half,   half4x4,   simdgroup_half8x8,   block_iq4_nl,  2,     dequantize_iq4_nl>;
+template [[host_name("kernel_mul_mm_id_iq4_xs_f16")]]  kernel mul_mm_id kernel_mul_mm_id<half,   half4x4,   simdgroup_half8x8,   block_iq4_xs,  QK_NL, dequantize_iq4_xs>;
+

 //
 // matrix-vector multiplication
--- a/ggml/src/ggml-opencl/ggml-opencl.cpp
+++ b/ggml/src/ggml-opencl/ggml-opencl.cpp
@@ -4855,8 +4855,6 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
            if (!any_on_device) {
                return false;
            }
-            GGML_ASSERT(ggml_is_contiguous(src0));
-            GGML_ASSERT(ggml_is_contiguous(src1));
            func = ggml_cl_add;
            break;
        case GGML_OP_MUL:
--- a/ggml/src/ggml-opt.cpp
+++ b/ggml/src/ggml-opt.cpp
@@ -28,16 +28,19 @@ struct ggml_opt_dataset {
 };

 struct ggml_opt_context {
-    ggml_backend_sched_t    backend_sched        = nullptr;
-    ggml_cgraph           * allocated_graph      = nullptr;
-    ggml_cgraph           * allocated_graph_copy = nullptr;
-    struct ggml_context   * ctx_static           = nullptr;
-    struct ggml_context   * ctx_static_cpu       = nullptr;
-    struct ggml_context   * ctx_compute          = nullptr;
-    struct ggml_context   * ctx_copy             = nullptr;
-    ggml_backend_buffer_t   buf_static           = nullptr;
-    ggml_backend_buffer_t   buf_static_cpu       = nullptr;
-    std::mt19937            rng;
+    ggml_backend_sched_t       backend_sched        = nullptr;
+    ggml_cgraph              * allocated_graph      = nullptr;
+    ggml_cgraph              * allocated_graph_copy = nullptr;
+    struct ggml_context      * ctx_static           = nullptr;
+    struct ggml_context      * ctx_cpu              = nullptr;
+    struct ggml_context      * ctx_compute          = nullptr;
+    struct ggml_context      * ctx_copy             = nullptr;
+    ggml_backend_buffer_t      buf_static           = nullptr;
+    ggml_backend_buffer_t      buf_cpu              = nullptr;
+    std::mt19937               rng;
+    enum ggml_opt_loss_type    loss_type;
+    enum ggml_opt_build_type   build_type;
+    enum ggml_opt_build_type   build_type_alloc;

    struct ggml_tensor * inputs  = nullptr;
    struct ggml_tensor * outputs = nullptr;
@@ -50,6 +53,11 @@ struct ggml_opt_context {
    struct ggml_cgraph * gf      = nullptr;
    struct ggml_cgraph * gb_grad = nullptr;
    struct ggml_cgraph * gb_opt  = nullptr;
+    bool static_graphs           = false;
+    bool eval_ready              = false;
+    std::vector<struct ggml_tensor *> grad_accs;
+    std::vector<struct ggml_tensor *> grad_m;
+    std::vector<struct ggml_tensor *> grad_v;

    int64_t iter               = 1;
    int32_t opt_period         = 1;
@@ -73,7 +81,13 @@ struct ggml_opt_result {

 // ====== Dataset ======

-ggml_opt_dataset_t ggml_opt_dataset_init(int64_t ne_datapoint, int64_t ne_label, int64_t ndata, int64_t ndata_shard) {
+ggml_opt_dataset_t ggml_opt_dataset_init(
+        enum ggml_type type_data,
+        enum ggml_type type_label,
+        int64_t        ne_datapoint,
+        int64_t        ne_label,
+        int64_t        ndata,
+        int64_t        ndata_shard) {
    GGML_ASSERT(ne_datapoint >  0);
    GGML_ASSERT(ne_label     >= 0);
    GGML_ASSERT(ndata        >  0);
@@ -92,11 +106,11 @@ ggml_opt_dataset_t ggml_opt_dataset_init(int64_t ne_datapoint, int64_t ne_label,
        result->ctx = ggml_init(params);
    }

-    result->data = ggml_new_tensor_2d(result->ctx, GGML_TYPE_F32, ne_datapoint, ndata);
+    result->data = ggml_new_tensor_2d(result->ctx, type_data, ne_datapoint, ndata);
    result->nbs_data = ggml_nbytes(result->data) * ndata_shard/ndata;

    if (ne_label > 0) {
-        result->labels = ggml_new_tensor_2d(result->ctx, GGML_TYPE_F32, ne_label, ndata);
+        result->labels = ggml_new_tensor_2d(result->ctx, type_label, ne_label, ndata);
        result->nbs_labels = ggml_nbytes(result->labels) * ndata_shard/ndata;
    } else {
        result->labels = nullptr;
@@ -119,6 +133,10 @@ void ggml_opt_dataset_free(ggml_opt_dataset_t dataset) {
    delete dataset;
 }

+int64_t ggml_opt_dataset_ndata(ggml_opt_dataset_t dataset) {
+    return dataset->ndata;
+}
+
 struct ggml_tensor * ggml_opt_dataset_data(ggml_opt_dataset_t dataset) {
    return dataset->data;
 }
@@ -144,6 +162,8 @@ void ggml_opt_dataset_get_batch(ggml_opt_dataset_t dataset, struct ggml_tensor *
    GGML_ASSERT(   data_batch && ggml_is_contiguous(data_batch));
    GGML_ASSERT(!labels_batch || ggml_is_contiguous(labels_batch));
    GGML_ASSERT((labels_batch == nullptr) == (dataset->labels == nullptr));
+    GGML_ASSERT(                   data_batch->type == dataset->data->type);
+    GGML_ASSERT(!labels_batch || labels_batch->type == dataset->labels->type);

    const size_t nb_data_batch = ggml_nbytes(data_batch);
    GGML_ASSERT(nb_data_batch % dataset->nbs_data == 0);
@@ -171,6 +191,31 @@ void ggml_opt_dataset_get_batch(ggml_opt_dataset_t dataset, struct ggml_tensor *
    }
 }

+void ggml_opt_dataset_get_batch_host(ggml_opt_dataset_t dataset, void * data_batch, size_t nb_data_batch, void * labels_batch, int64_t ibatch) {
+    GGML_ASSERT((labels_batch == nullptr) == (dataset->labels == nullptr));
+    GGML_ASSERT(nb_data_batch % dataset->nbs_data == 0);
+
+    const int64_t shards_per_batch = nb_data_batch / dataset->nbs_data;
+
+    GGML_ASSERT((ibatch + 1)*shards_per_batch <= int64_t(dataset->permutation.size()));
+
+    for (int64_t ishard_batch = 0; ishard_batch < shards_per_batch; ++ishard_batch) {
+        const int64_t ishard = dataset->permutation[ibatch*shards_per_batch + ishard_batch];
+
+        const char * ptr_data       = (const char *) dataset->data->data + ishard      *dataset->nbs_data;
+        char       * ptr_data_batch = (char       *) data_batch          + ishard_batch*dataset->nbs_data;
+        memcpy(ptr_data_batch, ptr_data, dataset->nbs_data);
+
+        if (!labels_batch) {
+            continue;
+        }
+
+        const char * ptr_labels       = (const char *) dataset->labels->data + ishard      *dataset->nbs_labels;
+        char       * ptr_labels_batch = (char       *) labels_batch          + ishard_batch*dataset->nbs_labels;
+        memcpy(ptr_labels_batch, ptr_labels, dataset->nbs_labels);
+    }
+}
+
 // ====== Model / Context ======

 struct ggml_opt_optimizer_params ggml_opt_get_default_optimizer_params(void * userdata) {
@@ -187,17 +232,18 @@ struct ggml_opt_optimizer_params ggml_opt_get_default_optimizer_params(void * us
    return result;
 }

+struct ggml_opt_optimizer_params ggml_opt_get_constant_optimizer_params(void * userdata) {
+    return *((struct ggml_opt_optimizer_params *) userdata);
+}
+
 struct ggml_opt_params ggml_opt_default_params(
        ggml_backend_sched_t      backend_sched,
-        struct ggml_context     * ctx_compute,
-        struct ggml_tensor      * inputs,
-        struct ggml_tensor      * outputs,
        enum ggml_opt_loss_type   loss_type) {
    return {
        /*backend_sched   =*/ backend_sched,
-        /*ctx_compute     =*/ ctx_compute,
-        /*inputs          =*/ inputs,
-        /*logits          =*/ outputs,
+        /*ctx_compute     =*/ nullptr,
+        /*inputs          =*/ nullptr,
+        /*logits          =*/ nullptr,
        /*loss_type       =*/ loss_type,
        /*build_type      =*/ GGML_OPT_BUILD_TYPE_OPT,
        /*opt_period      =*/ 1,
@@ -266,195 +312,246 @@ static ggml_cgraph * dup_graph(ggml_context * ctx, ggml_cgraph * src) {
    return dst;
 }

-static void ggml_opt_alloc_graph(ggml_opt_context_t opt_ctx, ggml_cgraph * graph) {
-    GGML_ASSERT(graph);
-    if (opt_ctx->allocated_graph == graph) {
-        return;
-    }
+static void ggml_opt_build(ggml_opt_context_t opt_ctx) {
+    GGML_ASSERT(opt_ctx->ctx_compute && "no compute context set, either use static graphs or set one with ggml_opt_prepare_alloc");
+    GGML_ASSERT((!opt_ctx->static_graphs || opt_ctx->inputs->data) && "when using static graphs the inputs must be allocated statically");

-    ggml_backend_sched_reset(opt_ctx->backend_sched); // clear allocation of previous graph
+    const bool accumulate = opt_ctx->build_type_alloc >= GGML_OPT_BUILD_TYPE_GRAD &&
+        !(opt_ctx->static_graphs && opt_ctx->build_type_alloc == GGML_OPT_BUILD_TYPE_OPT && opt_ctx->opt_period == 1);

-    {
-        ggml_init_params params = {
-            /*.mem_size   =*/ ggml_tensor_overhead() * GGML_DEFAULT_GRAPH_SIZE,
-            /*.mem_buffer =*/ nullptr,
-            /*.no_alloc   =*/ true,
-        };
-        ggml_free(opt_ctx->ctx_copy);
-        opt_ctx->ctx_copy = ggml_init(params);
-    }
-
-    opt_ctx->allocated_graph_copy = dup_graph(opt_ctx->ctx_copy, graph);
-
-    ggml_backend_sched_alloc_graph(opt_ctx->backend_sched, opt_ctx->allocated_graph_copy);
-    opt_ctx->allocated_graph = graph;
-}
-
-ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params) {
-    ggml_opt_context_t result = new struct ggml_opt_context;
-    result->backend_sched   = params.backend_sched;
-    result->ctx_compute     = params.ctx_compute;
-    result->inputs          = params.inputs;
-    result->outputs         = params.outputs;
-    result->opt_period      = params.opt_period;
-    result->get_opt_pars    = params.get_opt_pars;
-    result->get_opt_pars_ud = params.get_opt_pars_ud;
-
-    GGML_ASSERT(result->inputs->data && "the inputs must be allocated statically");
-    GGML_ASSERT(result->opt_period >= 1);
-
-    const bool accumulate = params.build_type == GGML_OPT_BUILD_TYPE_GRAD ||
-        (params.build_type == GGML_OPT_BUILD_TYPE_OPT && result->opt_period > 1);
-
-    ggml_set_input(result->inputs);
-    ggml_set_output(result->outputs);
-
-    result->gf = ggml_new_graph_custom(result->ctx_compute, GGML_DEFAULT_GRAPH_SIZE, /*grads =*/ true); // Forward pass.
-    ggml_build_forward_expand(result->gf, result->outputs);
+    ggml_set_input(opt_ctx->inputs);
+    ggml_set_output(opt_ctx->outputs);

    int n_param = 0;
-    for (int i = 0; i < result->gf->n_nodes; ++i) {
-        if (result->gf->nodes[i]->flags & GGML_TENSOR_FLAG_PARAM) {
+    for (int i = 0; i < opt_ctx->gf->n_nodes; ++i) {
+        const struct ggml_tensor * node = opt_ctx->gf->nodes[i];
+        if (node->flags & GGML_TENSOR_FLAG_PARAM) {
            n_param++;
        }
+        GGML_ASSERT(!(node->flags & GGML_TENSOR_FLAG_LOSS) && "support for extra loss terms not implemented");
    }

-    {
+    if (!opt_ctx->ctx_static) {
        // The static context is used for:
-        //   - gradients (1 tensor per param if using gradient accumulation)
+        //   - gradients (1 per loss, 1 tensor per param if using gradient accumulation)
        //   - optimizer momenta (2 tensors per param)
-        //   - labels
-        //   - loss + its gradient (up to 5 tensors)
-        //   - pred
-        //   - ncorrect (2 tensors).
-        const size_t tensors_per_param = (accumulate ? 1 : 0) + (params.build_type == GGML_OPT_BUILD_TYPE_OPT ? 2 : 0);
-        const size_t size_meta = (tensors_per_param*n_param + 9) * ggml_tensor_overhead();
+        //   - labels (if using static graphs)
+        //   - loss (if using static graphs, up to 5 tensors)
+        //   - pred (if using static graphs)
+        //   - ncorrect (if using static graphs, 2 tensors).
+        constexpr size_t n_loss = 1;
+        const size_t tensors_per_param = (accumulate ? 1 : 0) +
+            (opt_ctx->build_type_alloc == GGML_OPT_BUILD_TYPE_OPT ? 2 : 0);
+        const size_t tensors_const = opt_ctx->static_graphs ? 9 : 0;
+        const size_t size_meta = (n_loss + tensors_per_param*n_param + tensors_const) * ggml_tensor_overhead();
        struct ggml_init_params params = {
            /*.mem_size   =*/ size_meta,
            /*.mem_buffer =*/ nullptr,
            /*.no_alloc   =*/ true,
        };
-        result->ctx_static = ggml_init(params);
+        opt_ctx->ctx_static = ggml_init(params);
    }
+    GGML_ASSERT(opt_ctx->build_type <= opt_ctx->build_type_alloc);
+
    {
-        // The static cpu context is used for:
-        //   - optimizer parameters (1 for the entire context)
+        // The cpu context is allocated statically if using static graphs, dynamically otherwise.
+        // It is used for:
+        //   - optimizer parameters (1 shared for all optimizer invocations)
        const size_t size_meta = 1 * ggml_tensor_overhead();
        struct ggml_init_params params = {
            /*.mem_size   =*/ size_meta,
            /*.mem_buffer =*/ nullptr,
            /*.no_alloc   =*/ true,
        };
-        result->ctx_static_cpu = ggml_init(params);
+        ggml_free(opt_ctx->ctx_cpu);
+        opt_ctx->ctx_cpu = ggml_init(params);
+
+        ggml_backend_buffer_free(opt_ctx->buf_cpu);
+        opt_ctx->buf_cpu = nullptr;
    }

+    struct ggml_context * ctx_results = opt_ctx->static_graphs ? opt_ctx->ctx_static : opt_ctx->ctx_compute;

-    switch (params.loss_type) {
+    switch (opt_ctx->loss_type) {
        case GGML_OPT_LOSS_TYPE_MEAN: {
-            result->loss = ggml_sum(result->ctx_static, result->outputs);
-            ggml_set_name(result->loss, "loss_sum");
-            const float scale = 1.0f / (result->opt_period * ggml_nelements(result->outputs));
-            result->loss = ggml_scale(result->ctx_static, result->loss, scale);
-            ggml_set_name(result->loss, "loss_mean");
-            result->loss_per_datapoint = true;
+            opt_ctx->loss = ggml_sum(ctx_results, opt_ctx->outputs);
+            ggml_set_name(opt_ctx->loss, "loss_sum");
+            const float scale = 1.0f / (opt_ctx->opt_period * ggml_nelements(opt_ctx->outputs));
+            opt_ctx->loss = ggml_scale(ctx_results, opt_ctx->loss, scale);
+            ggml_set_name(opt_ctx->loss, "loss_mean");
+            opt_ctx->loss_per_datapoint = true;
            break;
        }
        case GGML_OPT_LOSS_TYPE_SUM: {
-            result->loss = ggml_sum(result->ctx_static, result->outputs);
-            ggml_set_name(result->loss, "loss_sum");
-            result->loss_per_datapoint = false;
+            opt_ctx->loss = ggml_sum(ctx_results, opt_ctx->outputs);
+            ggml_set_name(opt_ctx->loss, "loss_sum");
+            opt_ctx->loss_per_datapoint = false;
            break;
        }
        case GGML_OPT_LOSS_TYPE_CROSS_ENTROPY: {
-            result->labels = ggml_dup_tensor(result->ctx_static, result->outputs);
-            ggml_set_input(result->labels);
-            ggml_set_name(result->labels, "labels");
-            result->loss = ggml_cross_entropy_loss(result->ctx_static, result->outputs, result->labels);
-            ggml_set_name(result->loss, "loss_cross_entropy");
-            if (result->opt_period > 1) {
-                result->loss = ggml_scale(result->ctx_static, result->loss, 1.0f / result->opt_period);
-                ggml_set_name(result->loss, "loss_cross_entropy_scaled");
+            opt_ctx->labels = ggml_dup_tensor(ctx_results, opt_ctx->outputs);
+            ggml_set_input(opt_ctx->labels);
+            ggml_set_name(opt_ctx->labels, "labels");
+            opt_ctx->loss = ggml_cross_entropy_loss(ctx_results, opt_ctx->outputs, opt_ctx->labels);
+            ggml_set_name(opt_ctx->loss, "loss_cross_entropy");
+            if (opt_ctx->opt_period > 1) {
+                opt_ctx->loss = ggml_scale(ctx_results, opt_ctx->loss, 1.0f / opt_ctx->opt_period);
+                ggml_set_name(opt_ctx->loss, "loss_cross_entropy_scaled");
            }
-            result->loss_per_datapoint = true;
+            opt_ctx->loss_per_datapoint = true;
            break;
        }
        case GGML_OPT_LOSS_TYPE_MEAN_SQUARED_ERROR: {
-            result->labels = ggml_dup_tensor(result->ctx_static, result->outputs);
-            ggml_set_input(result->labels);
-            ggml_set_name(result->labels, "labels");
-            result->loss = ggml_sub(result->ctx_static, result->outputs, result->labels);
-            ggml_set_name(result->loss, "loss_error");
-            result->loss = ggml_sqr(result->ctx_static, result->loss);
-            ggml_set_name(result->loss, "loss_squared_error");
-            result->loss = ggml_sum(result->ctx_static, result->loss);
-            ggml_set_name(result->loss, "loss_sum_squared_error");
-            const float scale = 1.0f / (result->opt_period * ggml_nelements(result->outputs));
-            result->loss = ggml_scale(result->ctx_static, result->loss, scale);
-            ggml_set_name(result->loss, "loss_mean_squared_error");
-            result->loss_per_datapoint = true;
+            opt_ctx->labels = ggml_dup_tensor(ctx_results, opt_ctx->outputs);
+            ggml_set_input(opt_ctx->labels);
+            ggml_set_name(opt_ctx->labels, "labels");
+            opt_ctx->loss = ggml_sub(ctx_results, opt_ctx->outputs, opt_ctx->labels);
+            ggml_set_name(opt_ctx->loss, "loss_error");
+            opt_ctx->loss = ggml_sqr(ctx_results, opt_ctx->loss);
+            ggml_set_name(opt_ctx->loss, "loss_squared_error");
+            opt_ctx->loss = ggml_sum(ctx_results, opt_ctx->loss);
+            ggml_set_name(opt_ctx->loss, "loss_sum_squared_error");
+            const float scale = 1.0f / (opt_ctx->opt_period * ggml_nelements(opt_ctx->outputs));
+            opt_ctx->loss = ggml_scale(ctx_results, opt_ctx->loss, scale);
+            ggml_set_name(opt_ctx->loss, "loss_mean_squared_error");
+            opt_ctx->loss_per_datapoint = true;
            break;
        }
    }
-    ggml_set_output(result->loss);
-    ggml_set_loss(result->loss);
-    ggml_build_forward_expand(result->gf, result->loss);
+    ggml_set_output(opt_ctx->loss);
+    ggml_set_loss(opt_ctx->loss);
+    ggml_build_forward_expand(opt_ctx->gf, opt_ctx->loss);

-    result->pred = ggml_argmax(result->ctx_static, result->outputs);
-    ggml_set_name(result->pred, "pred");
-    ggml_set_output(result->pred);
-    ggml_build_forward_expand(result->gf, result->pred);
+    if (opt_ctx->loss_type == GGML_OPT_LOSS_TYPE_CROSS_ENTROPY) {
+        opt_ctx->pred = ggml_argmax(ctx_results, opt_ctx->outputs);
+        ggml_set_name(opt_ctx->pred, "pred");
+        ggml_set_output(opt_ctx->pred);
+        ggml_build_forward_expand(opt_ctx->gf, opt_ctx->pred);

-    if (result->labels) {
-        result->ncorrect = ggml_count_equal(result->ctx_static, result->pred, ggml_argmax(result->ctx_static, result->labels));
-        ggml_set_name(result->ncorrect, "ncorrect");
-        ggml_set_output(result->ncorrect);
-        ggml_build_forward_expand(result->gf, result->ncorrect);
-    } else {
-        result->ncorrect = nullptr;
+        opt_ctx->ncorrect = ggml_count_equal(ctx_results, opt_ctx->pred, ggml_argmax(ctx_results, opt_ctx->labels));
+        ggml_set_name(opt_ctx->ncorrect, "ncorrect");
+        ggml_set_output(opt_ctx->ncorrect);
+        ggml_build_forward_expand(opt_ctx->gf, opt_ctx->ncorrect);
    }

-    if (params.build_type == GGML_OPT_BUILD_TYPE_FORWARD) {
-        result->buf_static = ggml_backend_alloc_ctx_tensors(result->ctx_static, ggml_backend_sched_get_backend(result->backend_sched, 0));
-        return result;
+    if (opt_ctx->buf_static) {
+        if (opt_ctx->build_type == GGML_OPT_BUILD_TYPE_FORWARD) {
+            return;
+        }
+    } else if (opt_ctx->build_type_alloc == GGML_OPT_BUILD_TYPE_FORWARD) {
+        opt_ctx->buf_static = ggml_backend_alloc_ctx_tensors(
+            opt_ctx->ctx_static, ggml_backend_sched_get_backend(opt_ctx->backend_sched, 0));
+        return;
    }

-    // gb_grad == graph backward gradients, forward pass, then backward pass to calculate gradients.
-    result->gb_grad = ggml_graph_dup(result->ctx_compute, result->gf);
-    ggml_build_backward_expand(result->ctx_static, result->ctx_compute, result->gb_grad, accumulate);
+    if (opt_ctx->grad_accs.empty()) {
+        GGML_ASSERT(opt_ctx->build_type_alloc >= GGML_OPT_BUILD_TYPE_GRAD);

-    if (params.build_type == GGML_OPT_BUILD_TYPE_GRAD) {
-        result->buf_static = ggml_backend_alloc_ctx_tensors(result->ctx_static, ggml_backend_sched_get_backend(result->backend_sched, 0));
-        ggml_graph_reset(result->gb_grad);
-        return result;
-    }
+        const int n_nodes = opt_ctx->gf->n_nodes;
+        opt_ctx->grad_accs.resize(n_nodes);
+        for (int i = 0; i < n_nodes; ++i) {
+            ggml_tensor * node = opt_ctx->gf->nodes[i];
+            if ((accumulate && (node->flags & GGML_TENSOR_FLAG_PARAM)) || (node->flags & GGML_TENSOR_FLAG_LOSS)) {
+                opt_ctx->grad_accs[i] = ggml_new_tensor(opt_ctx->ctx_static, GGML_TYPE_F32, GGML_MAX_DIMS, node->ne);
+            } else {
+                opt_ctx->grad_accs[i] = nullptr;
+            }
+        }

-    GGML_ASSERT(params.build_type == GGML_OPT_BUILD_TYPE_OPT);
-
-    // gb_opt == graph backward optimize, forward pass, then backward pass to calculate gradients, then optimizer step.
-    result->gb_opt = ggml_graph_dup(result->ctx_compute, result->gb_grad);
-
-    result->adamw_params = ggml_new_tensor_1d(result->ctx_static_cpu, GGML_TYPE_F32, 7);
-    ggml_set_input(result->adamw_params);
-    ggml_set_name(result->adamw_params, "adamw_params");
-
-    for (int i = result->gf->n_nodes-1; i >= 0; --i) {
-        struct ggml_tensor * node = result->gb_opt->nodes[i];
-        struct ggml_tensor * grad = ggml_graph_get_grad(result->gb_opt, node);
-
-        if (node->flags & GGML_TENSOR_FLAG_PARAM) {
-            struct ggml_tensor * m        = ggml_dup_tensor(result->ctx_static, node);
-            struct ggml_tensor * v        = ggml_dup_tensor(result->ctx_static, node);
-            struct ggml_tensor * opt_step = ggml_opt_step_adamw(result->ctx_compute, node, grad, m, v, result->adamw_params);
-            ggml_build_forward_expand(result->gb_opt, opt_step);
+        if (opt_ctx->build_type_alloc >= GGML_OPT_BUILD_TYPE_OPT) {
+            opt_ctx->grad_m.resize(n_nodes);
+            opt_ctx->grad_v.resize(n_nodes);
+            for (int i = 0; i < n_nodes; ++i) {
+                ggml_tensor * node = opt_ctx->gf->nodes[i];
+                if (node->flags & GGML_TENSOR_FLAG_PARAM) {
+                    opt_ctx->grad_m[i] = ggml_new_tensor(opt_ctx->ctx_static, GGML_TYPE_F32, GGML_MAX_DIMS, node->ne);
+                    opt_ctx->grad_v[i] = ggml_new_tensor(opt_ctx->ctx_static, GGML_TYPE_F32, GGML_MAX_DIMS, node->ne);
+                } else {
+                    opt_ctx->grad_m[i] = nullptr;
+                    opt_ctx->grad_v[i] = nullptr;
+                }
+            }
        }
    }

-    result->buf_static = ggml_backend_alloc_ctx_tensors(
-        result->ctx_static, ggml_backend_sched_get_backend(result->backend_sched, 0));
+    // gb_grad == graph backward gradients, forward pass, then backward pass to calculate gradients.
+    opt_ctx->gb_grad = ggml_graph_dup(opt_ctx->ctx_compute, opt_ctx->gf, /*force_grads =*/ true);
+    ggml_build_backward_expand(opt_ctx->ctx_compute, opt_ctx->gb_grad, opt_ctx->grad_accs.data());

-    result->buf_static_cpu = ggml_backend_alloc_ctx_tensors_from_buft(result->ctx_static_cpu, ggml_backend_cpu_buffer_type());
+    if (opt_ctx->buf_static) {
+        if (opt_ctx->build_type == GGML_OPT_BUILD_TYPE_GRAD) {
+            return;
+        }
+    } else if (opt_ctx->build_type_alloc == GGML_OPT_BUILD_TYPE_GRAD) {
+        opt_ctx->buf_static = ggml_backend_alloc_ctx_tensors(opt_ctx->ctx_static, ggml_backend_sched_get_backend(opt_ctx->backend_sched, 0));
+        ggml_graph_reset(opt_ctx->gb_grad);
+    }

-    ggml_graph_reset(result->gb_opt);
+    GGML_ASSERT(opt_ctx->build_type_alloc == GGML_OPT_BUILD_TYPE_OPT);
+
+    // gb_opt == graph backward optimize, forward pass, then backward pass to calculate gradients, then optimizer step.
+    opt_ctx->gb_opt = ggml_graph_dup(opt_ctx->ctx_compute, opt_ctx->gb_grad, /*force_grads =*/ true);
+
+    opt_ctx->adamw_params = ggml_new_tensor_1d(opt_ctx->ctx_cpu, GGML_TYPE_F32, 7);
+    ggml_set_input(opt_ctx->adamw_params);
+    ggml_set_name(opt_ctx->adamw_params, "adamw_params");
+
+    for (int i = opt_ctx->gf->n_nodes-1; i >= 0; --i) {
+        struct ggml_tensor * node = opt_ctx->gb_opt->nodes[i];
+        struct ggml_tensor * grad = ggml_graph_get_grad(opt_ctx->gb_opt, node);
+
+        if (grad && (node->flags & GGML_TENSOR_FLAG_PARAM)) {
+            struct ggml_tensor * m        = opt_ctx->grad_m[i];
+            struct ggml_tensor * v        = opt_ctx->grad_v[i];
+            struct ggml_tensor * opt_step = ggml_opt_step_adamw(opt_ctx->ctx_compute, node, grad, m, v, opt_ctx->adamw_params);
+
+            ggml_set_name(m,        (std::string("AdamW m for ")    + std::string(node->name)).c_str());
+            ggml_set_name(v,        (std::string("AdamW v for ")    + std::string(node->name)).c_str());
+            ggml_set_name(opt_step, (std::string("AdamW step for ") + std::string(node->name)).c_str());
+
+            ggml_build_forward_expand(opt_ctx->gb_opt, opt_step);
+        }
+    }
+
+    if (!opt_ctx->buf_static) {
+        opt_ctx->buf_static = ggml_backend_alloc_ctx_tensors(
+            opt_ctx->ctx_static, ggml_backend_sched_get_backend(opt_ctx->backend_sched, 0));
+        ggml_graph_reset(opt_ctx->gb_opt);
+    }
+
+    opt_ctx->buf_cpu = ggml_backend_alloc_ctx_tensors_from_buft(opt_ctx->ctx_cpu, ggml_backend_cpu_buffer_type());
+}
+
+ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params) {
+    ggml_opt_context_t result = new struct ggml_opt_context;
+    result->backend_sched    = params.backend_sched;
+    result->ctx_compute      = params.ctx_compute;
+    result->loss_type        = params.loss_type;
+    result->build_type       = params.build_type;
+    result->build_type_alloc = params.build_type;
+    result->inputs           = params.inputs;
+    result->outputs          = params.outputs;
+    result->opt_period       = params.opt_period;
+    result->get_opt_pars     = params.get_opt_pars;
+    result->get_opt_pars_ud  = params.get_opt_pars_ud;
+
+    GGML_ASSERT(result->opt_period >= 1);
+
+    result->static_graphs = result->ctx_compute;
+
+    if (!result->static_graphs) {
+        GGML_ASSERT(!result->inputs);
+        GGML_ASSERT(!result->outputs);
+        return result;
+    }
+
+    GGML_ASSERT(result->inputs);
+    GGML_ASSERT(result->outputs);
+
+    result->gf = ggml_new_graph_custom(result->ctx_compute, GGML_DEFAULT_GRAPH_SIZE, /*grads =*/ true); // Forward pass.
+    ggml_build_forward_expand(result->gf, result->outputs);
+
+    ggml_opt_build(result);

    return result;
 }
@@ -464,9 +561,9 @@ void ggml_opt_free(ggml_opt_context_t opt_ctx) {
        return;
    }
    ggml_backend_buffer_free(opt_ctx->buf_static);
-    ggml_backend_buffer_free(opt_ctx->buf_static_cpu);
+    ggml_backend_buffer_free(opt_ctx->buf_cpu);
    ggml_free(opt_ctx->ctx_static);
-    ggml_free(opt_ctx->ctx_static_cpu);
+    ggml_free(opt_ctx->ctx_cpu);
    delete opt_ctx;
 }

@@ -582,8 +679,79 @@ void ggml_opt_result_accuracy(ggml_opt_result_t result, double * accuracy, doubl

 // ====== Computation ======

-static void ggml_opt_eval_graph(ggml_opt_context_t opt_ctx, ggml_cgraph * graph, ggml_opt_result * result) {
-    if (graph != opt_ctx->gf) {
+void ggml_opt_prepare_alloc(
+        ggml_opt_context_t    opt_ctx,
+        struct ggml_context * ctx_compute,
+        struct ggml_cgraph  * gf,
+        struct ggml_tensor  * inputs,
+        struct ggml_tensor  * outputs) {
+    GGML_ASSERT(!opt_ctx->static_graphs);
+    opt_ctx->ctx_compute = ctx_compute;
+    opt_ctx->gf          = gf;
+    opt_ctx->inputs      = inputs;
+    opt_ctx->outputs     = outputs;
+}
+
+void ggml_opt_alloc(ggml_opt_context_t opt_ctx, bool backward) {
+    GGML_ASSERT(!opt_ctx->eval_ready);
+    if (opt_ctx->build_type == GGML_OPT_BUILD_TYPE_OPT && opt_ctx->opt_period > 1 && opt_ctx->opt_i == 0) {
+        ggml_graph_reset(opt_ctx->gb_grad);
+    }
+    if (backward) {
+        const int32_t opt_i_next = (opt_ctx->opt_i + 1) % opt_ctx->opt_period;
+        opt_ctx->build_type = opt_i_next == 0 ? GGML_OPT_BUILD_TYPE_OPT : GGML_OPT_BUILD_TYPE_GRAD;
+    } else {
+        opt_ctx->build_type = GGML_OPT_BUILD_TYPE_FORWARD;
+    }
+
+    if (!opt_ctx->static_graphs) {
+        ggml_opt_build(opt_ctx);
+    }
+
+    struct ggml_cgraph * graph = nullptr;
+    switch (opt_ctx->build_type) {
+        case GGML_OPT_BUILD_TYPE_FORWARD: {
+            graph = opt_ctx->gf;
+        } break;
+        case GGML_OPT_BUILD_TYPE_GRAD: {
+            graph = opt_ctx->gb_grad;
+        } break;
+        case GGML_OPT_BUILD_TYPE_OPT: {
+            graph = opt_ctx->gb_opt;
+        } break;
+    }
+    GGML_ASSERT(graph);
+
+    if (opt_ctx->allocated_graph == graph) {
+        opt_ctx->eval_ready = true;
+        return;
+    }
+
+    ggml_backend_sched_reset(opt_ctx->backend_sched); // clear allocation of previous graph
+
+    if (opt_ctx->static_graphs) {
+        ggml_init_params params = {
+            /*.mem_size   =*/ graph->size*ggml_tensor_overhead() + ggml_graph_overhead_custom(graph->size, graph->grads),
+            /*.mem_buffer =*/ nullptr,
+            /*.no_alloc   =*/ true,
+        };
+        ggml_free(opt_ctx->ctx_copy);
+        opt_ctx->ctx_copy = ggml_init(params);
+
+        opt_ctx->allocated_graph_copy = dup_graph(opt_ctx->ctx_copy, graph);
+    } else {
+        opt_ctx->allocated_graph_copy = graph;
+    }
+
+    ggml_backend_sched_alloc_graph(opt_ctx->backend_sched, opt_ctx->allocated_graph_copy);
+    opt_ctx->allocated_graph = graph;
+
+    opt_ctx->eval_ready = true;
+}
+
+void ggml_opt_eval(ggml_opt_context_t opt_ctx, ggml_opt_result_t result) {
+    GGML_ASSERT(opt_ctx->eval_ready);
+    if (opt_ctx->allocated_graph == opt_ctx->gb_opt) {
        struct ggml_opt_optimizer_params opt_pars = opt_ctx->get_opt_pars(opt_ctx->get_opt_pars_ud);

        GGML_ASSERT(opt_pars.adamw.alpha >  0.0f);
@@ -609,9 +777,19 @@ static void ggml_opt_eval_graph(ggml_opt_context_t opt_ctx, ggml_cgraph * graph,
        adamw_par_data[6] = beta2h;
    }

-    ggml_opt_alloc_graph(opt_ctx, graph);
    ggml_backend_sched_graph_compute(opt_ctx->backend_sched, opt_ctx->allocated_graph_copy);
    opt_ctx->iter += opt_ctx->allocated_graph == opt_ctx->gb_opt;
+    opt_ctx->opt_i = (opt_ctx->opt_i + 1) % opt_ctx->opt_period;
+
+    if (!opt_ctx->static_graphs) {
+        opt_ctx->gf                   = nullptr;
+        opt_ctx->gb_grad              = nullptr;
+        opt_ctx->gb_opt               = nullptr;
+        opt_ctx->allocated_graph      = nullptr;
+        opt_ctx->allocated_graph_copy = nullptr;
+    }
+
+    opt_ctx->eval_ready = false;

    if (!result) {
        return;
@@ -635,12 +813,14 @@ static void ggml_opt_eval_graph(ggml_opt_context_t opt_ctx, ggml_cgraph * graph,
    ggml_backend_tensor_get(opt_ctx->loss, &loss, 0, ggml_nbytes(opt_ctx->loss));
    result->loss.push_back(loss);

-    GGML_ASSERT(opt_ctx->pred->type == GGML_TYPE_I32);
-    std::vector<int32_t> pred(ndata);
-    ggml_backend_tensor_get(opt_ctx->pred, pred.data(), 0, ggml_nbytes(opt_ctx->pred));
-    result->pred.insert(result->pred.end(), pred.begin(), pred.end());
+    if (opt_ctx->pred) {
+        GGML_ASSERT(opt_ctx->pred->type == GGML_TYPE_I32);
+        std::vector<int32_t> pred(ndata);
+        ggml_backend_tensor_get(opt_ctx->pred, pred.data(), 0, ggml_nbytes(opt_ctx->pred));
+        result->pred.insert(result->pred.end(), pred.begin(), pred.end());
+    }

-    if (!opt_ctx->labels || result->ncorrect < 0) {
+    if (!opt_ctx->ncorrect || result->ncorrect < 0) {
        result->ncorrect = -1;
        return;
    }
@@ -652,26 +832,6 @@ static void ggml_opt_eval_graph(ggml_opt_context_t opt_ctx, ggml_cgraph * graph,
    result->ncorrect += ncorrect;
 }

-void ggml_opt_forward(ggml_opt_context_t opt_ctx, ggml_opt_result * result) {
-    ggml_opt_eval_graph(opt_ctx, opt_ctx->gf, result);
-}
-
-void ggml_opt_forward_backward(ggml_opt_context_t opt_ctx, ggml_opt_result * result) {
-    if (opt_ctx->opt_period == 1) {
-        ggml_opt_eval_graph(opt_ctx, opt_ctx->gb_opt, result);
-        return;
-    }
-
-    const int32_t opt_i_next = (opt_ctx->opt_i + 1) % opt_ctx->opt_period;
-    if (opt_i_next == 0) {
-        ggml_opt_eval_graph(opt_ctx, opt_ctx->gb_opt, result);
-        ggml_opt_reset(opt_ctx, /*optimizer =*/ false);
-    } else {
-        ggml_opt_eval_graph(opt_ctx, opt_ctx->gb_grad, result);
-    }
-    opt_ctx->opt_i = opt_i_next;
-}
-
 // ====== High-Level Functions ======

 void ggml_opt_epoch(
@@ -700,16 +860,18 @@ void ggml_opt_epoch(
    int64_t ibatch = 0;
    int64_t t_loop_start = ggml_time_us();
    for (; ibatch < ibatch_split; ++ibatch) {
+        ggml_opt_alloc(opt_ctx, /*backward =*/ true);
        ggml_opt_dataset_get_batch(dataset, inputs, labels, ibatch);
-        ggml_opt_forward_backward(opt_ctx, result_train);
+        ggml_opt_eval(opt_ctx, result_train);
        if (callback_train) {
            callback_train(true, opt_ctx, dataset, result_train, ibatch+1, ibatch_split, t_loop_start);
        }
    }
    t_loop_start = ggml_time_us();
    for (; ibatch < nbatches; ++ibatch) {
+        ggml_opt_alloc(opt_ctx, /*backward =*/ false);
        ggml_opt_dataset_get_batch(dataset, inputs, labels, ibatch);
-        ggml_opt_forward(opt_ctx, result_eval);
+        ggml_opt_eval(opt_ctx, result_eval);
        if (callback_eval) {
            callback_eval(false, opt_ctx, dataset, result_eval, ibatch+1-ibatch_split, nbatches-ibatch_split, t_loop_start);
        }
@@ -726,13 +888,26 @@ void ggml_opt_epoch_callback_progress_bar(
        int64_t            t_start_us) {
    fprintf(stderr, "%s[", train ? "train: " : "val:   ");

-    constexpr int64_t bar_length = 25;
+    // The progress bar consists of partially filled blocks, unicode has 8 separate fill levels.
+    constexpr int64_t bar_length = 8;
+    const int64_t ibatch8 = 8 * ibatch;
    for (int64_t j = 0; j < bar_length; ++j) {
-        const int64_t ibatch_j = ibatch_max * j/bar_length;
-        if (ibatch_j < ibatch) {
-            fprintf(stderr, "=");
-        } else if (ibatch_max * (j - 1)/bar_length < ibatch) {
-            fprintf(stderr, ">");
+        if        (ibatch_max * (8*j + 8) / bar_length < ibatch8) {
+            fprintf(stderr, "\u2588"); // full block
+        } else if (ibatch_max * (8*j + 7) / bar_length < ibatch8) {
+            fprintf(stderr, "\u2589"); // 7/8 filled
+        } else if (ibatch_max * (8*j + 6) / bar_length < ibatch8) {
+            fprintf(stderr, "\u258A"); // 6/8 filled
+        } else if (ibatch_max * (8*j + 5) / bar_length < ibatch8) {
+            fprintf(stderr, "\u258B"); // 5/8 filled
+        } else if (ibatch_max * (8*j + 4) / bar_length < ibatch8) {
+            fprintf(stderr, "\u258C"); // 4/8 filled
+        } else if (ibatch_max * (8*j + 3) / bar_length < ibatch8) {
+            fprintf(stderr, "\u258D"); // 3/8 filled
+        } else if (ibatch_max * (8*j + 2) / bar_length < ibatch8) {
+            fprintf(stderr, "\u258E"); // 2/8 filled
+        } else if (ibatch_max * (8*j + 1) / bar_length < ibatch8) {
+            fprintf(stderr, "\u258F"); // 1/8 filled
        } else {
            fprintf(stderr, " ");
        }
@@ -764,8 +939,8 @@ void ggml_opt_epoch_callback_progress_bar(
    const int64_t t_eta_m = t_eta_s / 60;
    t_eta_s -= t_eta_m * 60;

-    fprintf(stderr, "| data=%06" PRId64 "/%06" PRId64 ", loss=%.6lf+-%.6lf, accuracy=%.2lf+-%.2lf%%, "
-            "t=%02" PRId64 ":%02" PRId64 ":%02" PRId64 ", ETA=%02" PRId64 ":%02" PRId64 ":%02" PRId64 "]\r",
+    fprintf(stderr, "] data=%07" PRId64 "/%07" PRId64 " loss=%.5lf±%.5lf acc=%.2lf±%.2lf%% "
+            "t=%02" PRId64 ":%02" PRId64 ":%02" PRId64 " ETA=%02" PRId64 ":%02" PRId64 ":%02" PRId64 " \r",
            idata, idata_max, loss, loss_unc, 100.0*accuracy, 100.0*accuracy_unc,
            t_ibatch_h, t_ibatch_m, t_ibatch_s, t_eta_h, t_eta_m, t_eta_s);
    if (ibatch == ibatch_max) {
@@ -806,7 +981,10 @@ void ggml_opt_fit(

    int64_t epoch = 1;

-    ggml_opt_params params = ggml_opt_default_params(backend_sched, ctx_compute, inputs, outputs, loss_type);
+    ggml_opt_params params = ggml_opt_default_params(backend_sched, loss_type);
+    params.ctx_compute     = ctx_compute;
+    params.inputs          = inputs;
+    params.outputs         = outputs;
    params.opt_period      = opt_period;
    params.get_opt_pars    = get_opt_pars;
    params.get_opt_pars_ud = &epoch;
--- a/ggml/src/ggml-rpc/ggml-rpc.cpp
+++ b/ggml/src/ggml-rpc/ggml-rpc.cpp
@@ -151,6 +151,12 @@ struct rpc_msg_buffer_clear_req {
    uint8_t value;
 };

+struct rpc_msg_set_tensor_hash_req {
+    rpc_tensor tensor;
+    uint64_t offset;
+    uint64_t hash;
+};
+
 struct rpc_msg_set_tensor_hash_rsp {
    uint8_t result;
 };
@@ -548,15 +554,12 @@ static void ggml_backend_rpc_buffer_set_tensor(ggml_backend_buffer_t buffer, ggm
    ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
    rpc_tensor rpc_tensor = serialize_tensor(tensor);
    if (size > HASH_THRESHOLD) {
-        // input serialization format: | rpc_tensor | offset (8 bytes) | hash (8 bytes)
-        size_t input_size = sizeof(rpc_tensor) + sizeof(uint64_t) + sizeof(uint64_t);
-        std::vector<uint8_t> input(input_size, 0);
-        uint64_t hash = fnv_hash((const uint8_t*)data, size);
-        memcpy(input.data(), &rpc_tensor, sizeof(rpc_tensor));
-        memcpy(input.data() + sizeof(rpc_tensor), &offset, sizeof(offset));
-        memcpy(input.data() + sizeof(rpc_tensor) + sizeof(offset), &hash, sizeof(hash));
+        rpc_msg_set_tensor_hash_req request;
+        request.tensor = rpc_tensor;
+        request.offset = offset;
+        request.hash = fnv_hash((const uint8_t*)data, size);
        rpc_msg_set_tensor_hash_rsp response;
-        bool status = send_rpc_cmd(ctx->sock, RPC_CMD_SET_TENSOR_HASH, input.data(), input.size(), &response, sizeof(response));
+        bool status = send_rpc_cmd(ctx->sock, RPC_CMD_SET_TENSOR_HASH, &request, sizeof(request), &response, sizeof(response));
        GGML_ASSERT(status);
        if (response.result) {
            // the server has the same data, no need to send it
@@ -864,7 +867,7 @@ public:
    bool free_buffer(const rpc_msg_free_buffer_req & request);
    bool buffer_clear(const rpc_msg_buffer_clear_req & request);
    bool set_tensor(const std::vector<uint8_t> & input);
-    bool set_tensor_hash(const std::vector<uint8_t> & input, rpc_msg_set_tensor_hash_rsp & response);
+    bool set_tensor_hash(const rpc_msg_set_tensor_hash_req & request, rpc_msg_set_tensor_hash_rsp & response);
    bool get_tensor(const rpc_msg_get_tensor_req & request, std::vector<uint8_t> & response);
    bool copy_tensor(const rpc_msg_copy_tensor_req & request, rpc_msg_copy_tensor_rsp & response);
    bool graph_compute(const std::vector<uint8_t> & input, rpc_msg_graph_compute_rsp & response);
@@ -1101,18 +1104,10 @@ bool rpc_server::get_cached_file(uint64_t hash, std::vector<uint8_t> & data) {
    return true;
 }

-bool rpc_server::set_tensor_hash(const std::vector<uint8_t> & input, rpc_msg_set_tensor_hash_rsp & response)
+bool rpc_server::set_tensor_hash(const rpc_msg_set_tensor_hash_req & request, rpc_msg_set_tensor_hash_rsp & response)
 {
-    // serialization format: | rpc_tensor | offset (8 bytes) | hash (8 bytes) |
-    if (input.size() != sizeof(rpc_tensor) + 16) {
-        return false;
-    }
-    const rpc_tensor * in_tensor = (const rpc_tensor *)input.data();
-    uint64_t offset;
-    memcpy(&offset, input.data() + sizeof(rpc_tensor), sizeof(offset));
-    const uint64_t * hash = (const uint64_t *)(input.data() + sizeof(rpc_tensor) + sizeof(offset));
    std::vector<uint8_t> cached_file;
-    if (!get_cached_file(*hash, cached_file)) {
+    if (!get_cached_file(request.hash, cached_file)) {
        response.result = 0;
        return true;
    }
@@ -1125,25 +1120,28 @@ bool rpc_server::set_tensor_hash(const std::vector<uint8_t> & input, rpc_msg_set
    ggml_context_ptr ctx_ptr { ggml_init(params) };
    GGML_ASSERT(ctx_ptr != nullptr);
    ggml_context * ctx = ctx_ptr.get();
-    ggml_tensor * tensor = deserialize_tensor(ctx, in_tensor);
+    ggml_tensor * tensor = deserialize_tensor(ctx, &request.tensor);
    if (tensor == nullptr) {
        GGML_LOG_ERROR("[%s] error deserializing tensor\n", __func__);
        return false;
    }
-    GGML_PRINT_DEBUG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %zu, hash: %" PRIx64 "\n", __func__, (void*)tensor->buffer, tensor->data, offset, size, *hash);
+    GGML_PRINT_DEBUG("[%s] buffer: %p, data: %p, offset: %" PRIu64 ", size: %zu, hash: %" PRIx64 "\n",
+        __func__, (void*)tensor->buffer, tensor->data, request.offset, size, request.hash);

    // sanitize tensor->data
    {
        const size_t p0 = (size_t) ggml_backend_buffer_get_base(tensor->buffer);
        const size_t p1 = p0 + ggml_backend_buffer_get_size(tensor->buffer);

-        if (in_tensor->data + offset < p0 || in_tensor->data + offset >= p1 || size > (p1 - in_tensor->data - offset)) {
+        if (request.tensor.data + request.offset < p0
+         || request.tensor.data + request.offset >= p1
+         || size > (p1 - request.tensor.data - request.offset)) {
            GGML_LOG_ERROR("[%s] tensor data region (data=0x%" PRIx64 ", offset=%" PRIu64 ", size=%zu, hash=0x%" PRIx64 ") out of buffer bounds [0x%zx, 0x%zx)\n",
-                           __func__, in_tensor->data, offset, size, *hash, p0, p1);
+                           __func__, request.tensor.data, request.offset, size, request.hash, p0, p1);
            return false;
        }
    }
-    ggml_backend_tensor_set(tensor, cached_file.data(), offset, size);
+    ggml_backend_tensor_set(tensor, cached_file.data(), request.offset, size);
    response.result = 1;
    return true;
 }
@@ -1503,12 +1501,12 @@ static void rpc_serve_client(ggml_backend_t backend, const char * cache_dir,
                break;
            }
            case RPC_CMD_SET_TENSOR_HASH: {
-                std::vector<uint8_t> input;
-                if (!recv_msg(sockfd, input)) {
+                rpc_msg_set_tensor_hash_req request;
+                if (!recv_msg(sockfd, &request, sizeof(request))) {
                    return;
                }
                rpc_msg_set_tensor_hash_rsp response;
-                if (!server.set_tensor_hash(input, response)) {
+                if (!server.set_tensor_hash(request, response)) {
                    return;
                }
                if (!send_msg(sockfd, &response, sizeof(response))) {
--- a/ggml/src/ggml-sycl/CMakeLists.txt
+++ b/ggml/src/ggml-sycl/CMakeLists.txt
@@ -52,9 +52,8 @@ target_compile_options(ggml-sycl PRIVATE "-Wno-narrowing")
 find_package(DNNL)
 set(GGML_SYCL_DNNL 0)
 if(DNNL_FOUND)
-    if (DEFINED ENV{ONEAPI_ROOT} AND NOT DEFINED DNNL_GPU_VENDOR)
-        # Assuming oneDNN packaged with oneapi release is used which
-        # supports only intel target
+    if (NOT DEFINED DNNL_GPU_VENDOR)
+        # default to intel target
        set(DNNL_GPU_VENDOR "INTEL")
        if(NOT "${GGML_SYCL_TARGET}" STREQUAL "INTEL")
            message(WARNING "oneDNN builds bundled with oneapi release only support INTEL target")
@@ -108,6 +107,9 @@ endif()
 if (GGML_SYCL_TARGET STREQUAL "INTEL")
    # Intel devices use Intel oneMKL directly instead of oneMath to avoid the limitation of linking Intel oneMKL statically
    # See https://github.com/uxlfoundation/oneMath/issues/654
+    if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+        set(SYCL_COMPILER ON)
+    endif()
    find_package(MKL REQUIRED)
    target_link_libraries(ggml-sycl PRIVATE MKL::MKL_SYCL::BLAS)
    target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_USE_INTEL_ONEMKL)
--- a/ggml/src/ggml-sycl/backend.hpp
+++ b/ggml/src/ggml-sycl/backend.hpp
@@ -14,23 +14,24 @@
 #define GGML_SYCL_BACKEND_HPP

 #include "binbcast.hpp"
-#include "concat.hpp"
 #include "common.hpp"
+#include "concat.hpp"
 #include "conv.hpp"
 #include "convert.hpp"
+#include "cpy.hpp"
 #include "dequantize.hpp"
 #include "dmmv.hpp"
+#include "element_wise.hpp"
+#include "gla.hpp"
+#include "im2col.hpp"
 #include "mmq.hpp"
 #include "mmvq.hpp"
-#include "rope.hpp"
 #include "norm.hpp"
+#include "outprod.hpp"
+#include "quants.hpp"
+#include "rope.hpp"
 #include "softmax.hpp"
 #include "tsembd.hpp"
-#include "im2col.hpp"
 #include "wkv.hpp"
-#include "outprod.hpp"
-#include "element_wise.hpp"
-#include "cpy.hpp"
-#include "gla.hpp"

-#endif // GGML_SYCL_BACKEND_HPP
+#endif  // GGML_SYCL_BACKEND_HPP
--- a/ggml/src/ggml-sycl/common.hpp
+++ b/ggml/src/ggml-sycl/common.hpp
@@ -42,6 +42,7 @@ void ggml_sycl_host_free(void* ptr);

 extern int g_ggml_sycl_debug;
 extern int g_ggml_sycl_disable_optimize;
+extern int g_ggml_sycl_prioritize_dmmv;

 #define GGML_SYCL_DEBUG(...)        \
  do {                              \
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -49,6 +49,7 @@ static bool g_sycl_loaded = false;
 int g_ggml_sycl_debug = 0;
 int g_ggml_sycl_disable_optimize = 0;
 int g_ggml_sycl_disable_graph = 0;
+int g_ggml_sycl_prioritize_dmmv = 0;

 static ggml_sycl_device_info ggml_sycl_init() {
    ggml_sycl_device_info info = {};
@@ -195,11 +196,13 @@ static void ggml_check_sycl() try {
        g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
        g_ggml_sycl_disable_optimize= get_sycl_env("GGML_SYCL_DISABLE_OPT", 1);
        g_ggml_sycl_disable_graph = get_sycl_env("GGML_SYCL_DISABLE_GRAPH", 1);
+        g_ggml_sycl_prioritize_dmmv = get_sycl_env("GGML_SYCL_PRIORITIZE_DMMV", 0);
        GGML_SYCL_DEBUG("[SYCL] call ggml_check_sycl\n");
        GGML_LOG_INFO("Running with Environment Variables:\n");
        GGML_LOG_INFO("  GGML_SYCL_DEBUG: %d\n", g_ggml_sycl_debug);
        GGML_LOG_INFO("  GGML_SYCL_DISABLE_OPT: %d\n", g_ggml_sycl_disable_optimize);
        GGML_LOG_INFO("  GGML_SYCL_DISABLE_GRAPH: %d\n", g_ggml_sycl_disable_graph);
+        GGML_LOG_INFO("  GGML_SYCL_PRIORITIZE_DMMV: %d\n", g_ggml_sycl_prioritize_dmmv);
        GGML_LOG_INFO("Build with Macros:\n");
 #if defined(GGML_SYCL_FORCE_MMQ)
        GGML_LOG_INFO("  GGML_SYCL_FORCE_MMQ: yes\n");
@@ -2822,12 +2825,45 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, cons
    std::exit(1);
 }

+enum class mul_mat_algo {
+    DMMV         = 0,
+    MMVQ         = 1,
+    MUL_MAT_SYCL = 2,
+};
+
 inline bool ggml_sycl_supports_mmq(enum ggml_type type) {
    // TODO: accuracy issues in MMQ
    GGML_UNUSED(type);
    return false;
 }

+inline bool ggml_sycl_supports_reorder_mul_mat_sycl(enum ggml_type type) {
+    switch (type) {
+        case GGML_TYPE_Q4_0:
+            return true;
+        default:
+            return false;
+    }
+}
+
+inline bool ggml_sycl_supports_reorder_dmmv(enum ggml_type type) {
+    switch (type) {
+        case GGML_TYPE_Q4_0:
+            return true;
+        default:
+            return false;
+    }
+}
+
+inline bool ggml_sycl_supports_reorder_mmvq(enum ggml_type type) {
+    switch (type) {
+        case GGML_TYPE_Q4_0:
+            return true;
+        default:
+            return false;
+    }
+}
+
 static bool ggml_sycl_supports_dmmv(enum ggml_type type) {
    switch (type) {
        case GGML_TYPE_Q4_0:
@@ -2856,7 +2892,7 @@ static void reorder_qw(char *data_device, const int ncols, const int nrows,
    GGML_ASSERT((size % sizeof(block_q4_0) == 0));
    GGML_ASSERT((offset % sizeof(block_q4_0) == 0));
    int offset_blks = offset / sizeof(block_q4_0);
-    auto qs_ptr = (uint8_t*)data_device + offset_blks * QK4_0 / 2;;
+    auto qs_ptr = (uint8_t*)data_device + offset_blks * QK4_0 / 2;
    auto d_ptr = (sycl::half*)(qs_ptr + ncols * nrows / 2) + offset_blks;

    stream->parallel_for(
@@ -2884,25 +2920,44 @@ static void reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) {
    reorder_qw(data_device, ncols, nrows, size, 0, stream);
 }

-/*
-* This function could be called when the OP (mul_mat) function support reorder optimizition.
-*/
-static void opt_for_reorder(ggml_backend_sycl_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1,
-    ggml_tensor * dst) {
-    if (!g_ggml_sycl_disable_optimize && //allow optimize, controlled by $GGML_SYCL_DISABLE_OPT
-        ctx->opt_feature.reorder &&      //allow this device due to good perf, skip the devices with bad perf.
-        dst->op == GGML_OP_MUL_MAT &&    //limit to some supported cases of Q4_0, to do for more cases.
-        src0->type == GGML_TYPE_Q4_0 &&
-        src1->ne[2]==1 && src1->ne[3]==1) {
+static bool should_reorder_tensor(ggml_backend_sycl_context& ctx, const ggml_tensor * dst) {
+    return !g_ggml_sycl_disable_optimize && //allow optimize, controlled by $GGML_SYCL_DISABLE_OPT
+            ctx.opt_feature.reorder &&      //allow this device due to good perf, skip the devices with bad perf.
+            dst->op == GGML_OP_MUL_MAT &&   //limit to some supported cases of Q4_0, to do for more cases.
+            dst->src[1]->ne[2]==1 && dst->src[1]->ne[3]==1;
+}

-        ggml_tensor_extra_gpu* extra = (ggml_tensor_extra_gpu*)src0->extra;
-        if (!extra) return; //only happen in CI/UT permute case.
-
-        if (extra->optimized_feature.reorder) return; //skip the tensor which is handled for reorder.
-
-        reorder_qw(src0, ctx->stream());
-        extra->optimized_feature.reorder = true; //used to decode/dequan in next steps.
+static void opt_for_reorder(ggml_backend_sycl_context * ctx, const ggml_tensor * src0, const ggml_tensor * /* src1 */,
+                            ggml_tensor * dst, mul_mat_algo mm_algorithm) {
+    if (!should_reorder_tensor(*ctx, dst)) {
+        return;
    }
+
+    ggml_tensor_extra_gpu * extra = static_cast<ggml_tensor_extra_gpu *>(src0->extra);
+    if (!extra || extra->optimized_feature.reorder) {
+        return;  // Skip permutations and already reordered tensors
+    }
+
+    switch (mm_algorithm) {
+        case mul_mat_algo::DMMV:
+            if (!ggml_sycl_supports_reorder_dmmv(src0->type)) {
+                return;
+            }
+            break;
+        case mul_mat_algo::MMVQ:
+            if (!ggml_sycl_supports_reorder_mmvq(src0->type)) {
+                return;
+            }
+            break;
+        case mul_mat_algo::MUL_MAT_SYCL:
+            if (!ggml_sycl_supports_reorder_mul_mat_sycl(src0->type)) {
+                return;
+            }
+            break;
+    }
+
+    reorder_qw(src0, ctx->stream());
+    extra->optimized_feature.reorder = true;  // Used to decode/dequan in next steps and avoid re-reordering
 }

 static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -2911,7 +2966,8 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
    int64_t min_compute_capability = INT_MAX;

    if (split) {
-        ggml_backend_sycl_split_buffer_type_context * buft_ctx = (ggml_backend_sycl_split_buffer_type_context *) src0->buffer->buft->context;
+        ggml_backend_sycl_split_buffer_type_context * buft_ctx =
+            (ggml_backend_sycl_split_buffer_type_context *) src0->buffer->buft->context;
        auto & tensor_split = buft_ctx->tensor_split;
        for (int id = 0; id < ggml_sycl_info().device_count; ++id) {
            // skip devices that are not going to do any work:
@@ -2924,7 +2980,7 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
            }
        }
    } else {
-        min_compute_capability    = ggml_sycl_info().devices[ctx.device].cc;
+        min_compute_capability = ggml_sycl_info().devices[ctx.device].cc;
    }

    // check data types and tensor shapes for custom matrix multiplication kernels:
@@ -2946,9 +3002,15 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
    use_mul_mat_q = use_mul_mat_q && (src1->ne[1] <= MMQ_MAX_BATCH_SIZE);
 #endif // SYCL_USE_XMX

+
    // mmvq path is faster in the CUDA backend.
-    if (ctx.stream()->get_backend() == sycl::backend::ext_oneapi_cuda)
+    if (!g_ggml_sycl_prioritize_dmmv && (ctx.stream()->get_backend() == sycl::backend::ext_oneapi_cuda
+        // Dispatch becomes obscure with the reorder, MMVQ when the reorder optimization
+        // is enabled takes precedence over DMMV, the current if-else implementation
+        // requires disabling DMMV if both conditions are met
+        || (should_reorder_tensor(ctx, dst) && ggml_sycl_supports_reorder_mmvq(src0->type)))) {
        use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
+    }

    if (!split && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
        // TODO: Refactor and cleanup of mul mat dispatching.
@@ -2967,17 +3029,23 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
        // KQ + KQV multi-batch
        ggml_sycl_mul_mat_batched_sycl(ctx, src0, src1, dst);
    } else if (use_dequantize_mul_mat_vec) {
-        opt_for_reorder(&ctx, src0, src1, dst); //the OP function in this branch support reorder.
-        ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_dequantize_mul_mat_vec, false);
-        // save_tensor_txt("1/dst_1.txt", (float*) dst->data, src0->ne[1], sizeof(float), ctx.stream());
+        constexpr bool convert_src1_to_q8_1 = false;
+        opt_for_reorder(&ctx, src0, src1, dst, mul_mat_algo::DMMV);
+        ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_dequantize_mul_mat_vec, convert_src1_to_q8_1);
    } else if (use_mul_mat_vec_q) {
-        ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_vec_q, true);
+        constexpr bool convert_src1_to_q8_1 = true;
+        opt_for_reorder(&ctx, src0, src1, dst, mul_mat_algo::MMVQ);
+        ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_vec_q, convert_src1_to_q8_1);
    } else if (use_mul_mat_q) {
-        ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_q, true);
+        constexpr bool convert_src1_to_q8_1 = true;
+        ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_q, convert_src1_to_q8_1);
    } else {
-        opt_for_reorder(&ctx, src0, src1, dst); //the OP function in this branch support reorder.
-        ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_sycl, false);
+        constexpr bool convert_src1_to_q8_1 = false;
+        // MUL_MAT_SYCL supports reorder
+        opt_for_reorder(&ctx, src0, src1, dst, mul_mat_algo::MUL_MAT_SYCL);
+        ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_sycl, convert_src1_to_q8_1);
    }
+    GGML_SYCL_DEBUG("call %s done\n", __func__);
 }


--- a/ggml/src/ggml-sycl/mmvq.cpp
+++ b/ggml/src/ggml-sycl/mmvq.cpp
@@ -1,6 +1,60 @@
 #include "mmvq.hpp"
+
+#include "ggml.h"
+#include "common.hpp"
+#include "quants.hpp"
 #include "vecdotq.hpp"
-#include <cassert>
+
+template <typename reorder_vec_dot_q_sycl>
+static void mul_mat_vec_q_reorder(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+                                  const int ncols, const int nrows, const sycl::nd_item<3> & nd_item) {
+    using block_type   = ggml_sycl_reordered::block_q_t<reorder_vec_dot_q_sycl::gtype>;
+    using block_traits = typename block_type::traits;
+
+    const auto sg           = nd_item.get_sub_group();
+    const int  sg_range     = sg.get_group_linear_range();
+    const int  workgroup_id = nd_item.get_group_linear_id();
+    const int  sg_id        = sg.get_group_linear_id();
+    const int  row          = workgroup_id * sg_range + sg_id;
+
+    if (row >= nrows) {
+        return;
+    }
+
+    const int     blocks_per_row              = ncols / block_traits::qk;
+    constexpr int blocks_per_subgroup         = ceil_div(block_traits::vdr_mmvq * WARP_SIZE, block_traits::qi);
+    constexpr int block_elements_per_subgroup = block_traits::qi / block_traits::vdr_mmvq;
+
+    static_assert(blocks_per_subgroup > 0);
+    static_assert(block_elements_per_subgroup > 0);
+
+    const block_q8_1 * y = (const block_q8_1 *) vy;
+
+    float partial_sum = 0.0f;
+    for (int i = sg.get_local_linear_id() / block_elements_per_subgroup; i < blocks_per_row; i += blocks_per_subgroup) {
+        const int ibx       = row * blocks_per_row + i;  // x block index
+        // TODO: Generalize offsets, right now only works for quantizations that don't split high and low bits
+        const int bx_offset = block_type::get_block_offset(ibx);
+        const int d_offset  = block_type::get_d_offset(nrows, ncols, ibx);
+
+        // Y block index that aligns with ibx
+        const int iby = i * block_type::block_to_q8_1_ratio();
+
+#pragma unroll
+        for (int elem = 0; elem < block_elements_per_subgroup; elem += WARP_SIZE) {
+            // x block quant index when casting the quants to int
+            const int iqs = elem + block_traits::vdr_mmvq * (sg.get_local_linear_id() % block_elements_per_subgroup);
+
+            partial_sum += reorder_vec_dot_q_sycl()(vx, bx_offset, d_offset, &y[iby], iqs);
+        }
+    }
+
+    auto sum = sycl::reduce_over_group(nd_item.get_sub_group(), partial_sum, std::plus<>());
+
+    if (sg.leader()) {
+        dst[row] = sum;
+    }
+}

 template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_sycl_t vec_dot_q_sycl>
 static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
@@ -480,26 +534,39 @@ static void mul_mat_vec_q_iq4_xs_q8_1(const void *__restrict__ vx,
    }
 }

-static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy,
-                                       float *dst, const int ncols,
-                                       const int nrows,
+static void reorder_mul_mat_vec_q4_0_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols,
+                                                    const int nrows, dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK4_0 == 0);
+    const int        block_num_y   = ceil_div(nrows, GGML_SYCL_MMV_Y);
+    constexpr size_t num_subgroups = 16;
+    GGML_ASSERT(block_num_y % num_subgroups == 0);
+
+    const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, (block_num_y * WARP_SIZE));
+    const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE);
+
+    stream->submit([&](sycl::handler & cgh) {
+        cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size),
+                         [=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+                             mul_mat_vec_q_reorder<reorder_vec_dot_q_sycl<GGML_TYPE_Q4_0>>(vx, vy, dst, ncols, nrows,
+                                                                                           nd_item);
+                         });
+    });
+}
+
+static void mul_mat_vec_q4_0_q8_1_sycl(const void * vx, const void * vy, float * dst, const int ncols, const int nrows,
                                       dpct::queue_ptr stream) {
    GGML_ASSERT(ncols % QK4_0 == 0);
    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
    const sycl::range<3> block_nums(1, 1, block_num_y);
    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
+
    {
-
-        stream->submit([&](sycl::handler &cgh) {
-
-            cgh.parallel_for(
-                sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                [=](sycl::nd_item<3> item_ct1)
-                    [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
-                        mul_mat_vec_q<QK4_0, QI4_0, block_q4_0,
-                                      VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>(
-                            vx, vy, dst, ncols, nrows, item_ct1);
-                    });
+        stream->submit([&](sycl::handler & cgh) {
+            cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                             [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
+                                 mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>(
+                                     vx, vy, dst, ncols, nrows, item_ct1);
+                             });
        });
    }
 }
@@ -916,93 +983,95 @@ static void mul_mat_vec_iq4_xs_q8_1_sycl(const void *vx, const void *vy,
    }
 }

-void ggml_sycl_op_mul_mat_vec_q(
-    ggml_backend_sycl_context & ctx,
-    const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst,
-    const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i,
-    float *dst_dd_i, const int64_t row_low, const int64_t row_high,
-    const int64_t src1_ncols, const int64_t src1_padded_col_size,
-    const dpct::queue_ptr &stream) {
-
+void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1,
+                                ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
+                                const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low,
+                                const int64_t row_high, const int64_t src1_ncols, const int64_t src1_padded_col_size,
+                                const dpct::queue_ptr & stream) {
    const int64_t ne10 = src1->ne[0];
    GGML_ASSERT(ne10 % QK8_1 == 0);

-    const int64_t ne00 = src0->ne[0];
+    const int64_t ne00     = src0->ne[0];
    const int64_t row_diff = row_high - row_low;

    int id;
-    SYCL_CHECK(
-        CHECK_TRY_ERROR(id = get_current_device_id()));
+    SYCL_CHECK(CHECK_TRY_ERROR(id = get_current_device_id()));
    const size_t q8_1_ts = sizeof(block_q8_1);
    const size_t q8_1_bs = QK8_1;
    // the main device has a larger memory buffer to hold the results from all GPUs
    // nrows_dst == nrows of the matrix that the kernel writes into

-    for (int i = 0; i < src1_ncols; i++)
-    {
+    for (int i = 0; i < src1_ncols; i++) {
        const size_t src1_ddq_i_offset = i * src1_padded_col_size * q8_1_ts / q8_1_bs;
-        const char* src1_ddq_i_bs = src1_ddq_i + src1_ddq_i_offset;
-        float* dst_dd_i_bs = dst_dd_i + i * dst->ne[0];
+        const char * src1_ddq_i_bs     = src1_ddq_i + src1_ddq_i_offset;
+        float *      dst_dd_i_bs       = dst_dd_i + i * dst->ne[0];
        switch (src0->type) {
-        case GGML_TYPE_Q4_0:
-            mul_mat_vec_q4_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q4_1:
-            mul_mat_vec_q4_1_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q5_0:
-            mul_mat_vec_q5_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q5_1:
-            mul_mat_vec_q5_1_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q8_0:
-            mul_mat_vec_q8_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q2_K:
-            mul_mat_vec_q2_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q3_K:
-            mul_mat_vec_q3_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q4_K:
-            mul_mat_vec_q4_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q5_K:
-            mul_mat_vec_q5_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q6_K:
-            mul_mat_vec_q6_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_IQ1_S:
-            mul_mat_vec_iq1_s_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_IQ1_M:
-            mul_mat_vec_iq1_m_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_IQ2_XXS:
-            mul_mat_vec_iq2_xxs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_IQ2_XS:
-            mul_mat_vec_iq2_xs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_IQ2_S:
-            mul_mat_vec_iq2_s_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_IQ3_XXS:
-            mul_mat_vec_iq3_xxs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_IQ3_S:
-            mul_mat_vec_iq3_s_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_IQ4_NL:
-            mul_mat_vec_iq4_nl_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_IQ4_XS:
-            mul_mat_vec_iq4_xs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
-            break;
-        default:
-            GGML_ABORT("fatal error");
+            case GGML_TYPE_Q4_0:
+                if ((ggml_tensor_extra_gpu *) dst->src[0]->extra &&
+                    ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
+                    GGML_SYCL_DEBUG("Calling reorder_mul_mat_vec_q4_0_q8_1_sycl\n");
+                    reorder_mul_mat_vec_q4_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+                } else {
+                    GGML_SYCL_DEBUG("Calling mul_mat_vec_q4_0_q8_1_sycl\n");
+                    mul_mat_vec_q4_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+                }
+                break;
+            case GGML_TYPE_Q4_1:
+                mul_mat_vec_q4_1_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+                break;
+            case GGML_TYPE_Q5_0:
+                mul_mat_vec_q5_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+                break;
+            case GGML_TYPE_Q5_1:
+                mul_mat_vec_q5_1_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+                break;
+            case GGML_TYPE_Q8_0:
+                mul_mat_vec_q8_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+                break;
+            case GGML_TYPE_Q2_K:
+                mul_mat_vec_q2_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+                break;
+            case GGML_TYPE_Q3_K:
+                mul_mat_vec_q3_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+                break;
+            case GGML_TYPE_Q4_K:
+                mul_mat_vec_q4_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+                break;
+            case GGML_TYPE_Q5_K:
+                mul_mat_vec_q5_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+                break;
+            case GGML_TYPE_Q6_K:
+                mul_mat_vec_q6_K_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+                break;
+            case GGML_TYPE_IQ1_S:
+                mul_mat_vec_iq1_s_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+                break;
+            case GGML_TYPE_IQ1_M:
+                mul_mat_vec_iq1_m_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+                break;
+            case GGML_TYPE_IQ2_XXS:
+                mul_mat_vec_iq2_xxs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+                break;
+            case GGML_TYPE_IQ2_XS:
+                mul_mat_vec_iq2_xs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+                break;
+            case GGML_TYPE_IQ2_S:
+                mul_mat_vec_iq2_s_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+                break;
+            case GGML_TYPE_IQ3_XXS:
+                mul_mat_vec_iq3_xxs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+                break;
+            case GGML_TYPE_IQ3_S:
+                mul_mat_vec_iq3_s_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+                break;
+            case GGML_TYPE_IQ4_NL:
+                mul_mat_vec_iq4_nl_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+                break;
+            case GGML_TYPE_IQ4_XS:
+                mul_mat_vec_iq4_xs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
+                break;
+            default:
+                GGML_ABORT("fatal error");
        }
    }
    GGML_UNUSED(src1);
--- a/ggml/src/ggml-sycl/quants.hpp
+++ b/ggml/src/ggml-sycl/quants.hpp
@@ -0,0 +1,61 @@
+//
+// MIT license
+// Copyright (C) 2025 Codeplay Software Ltd.
+// Copyright (C) 2025 Intel Corporation
+// SPDX-License-Identifier: MIT
+//
+
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+
+#ifndef GGML_SYCL_QUANTS_HPP
+#define GGML_SYCL_QUANTS_HPP
+
+#include "ggml-common.h"
+#include "ggml.h"
+
+namespace ggml_sycl_reordered {
+
+
+// The reordered block moves quants (qs) and  scales(d) to two
+// uniform regions of memory that is contiguous in the same tensor.
+// What this means is that instead of having:
+// [d0, qs0] [d1, qs1] [d2, qs2] ... [dN, qsN]
+// We have:
+// [qs0, qs1, qs2, ..., qsN]  [d0, d1, d2, ..., dN]
+//
+// Notes: out-of-bounds qs will run into d values
+// Aligment relies on the allocated size of qs
+
+template <ggml_type type> struct block_q_t;
+
+
+// qk number of weights / quants in a block
+// qr number of weights in a byte (described as 'before dequantization')
+//    for quantization types that has low and high bits split, qr is calculated with
+//    using the lower bits, e.g for Q6 quants QR6 is 2
+// qi number of 32 bit integers needed to represent all the quants from a block (`qs` field)
+// See ggml-common.h to see how these are calculated
+template <> struct block_q_t<GGML_TYPE_Q4_0> {
+    struct traits {
+        static constexpr uint32_t qk       = QK4_0;
+        static constexpr uint32_t qi       = QI4_0;
+        static constexpr uint32_t qr       = QR4_0;
+        static constexpr uint32_t vdr_mmvq = 2;
+    };
+
+    static constexpr int get_block_offset(const int block_index) { return block_index * (traits::qk / traits::qr); }
+
+    static constexpr int get_d_offset(int nrows, int ncols, const int block_index) {
+        return (ncols / traits::qr * nrows) + block_index * sizeof(ggml_half);
+    }
+
+    static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; }
+};
+
+}  // namespace ggml_sycl_reordered
+
+#endif  // GGML_SYCL_QUANTS_HPP
--- a/ggml/src/ggml-sycl/vecdotq.hpp
+++ b/ggml/src/ggml-sycl/vecdotq.hpp
@@ -1,6 +1,6 @@
 //
 // MIT license
-// Copyright (C) 2024 Intel Corporation
+// Copyright (C) 2025 Intel Corporation
 // SPDX-License-Identifier: MIT
 //

@@ -14,8 +14,11 @@
 #define GGML_SYCL_VECDOTQ_HPP

 #include "dpct/helper.hpp"
+#include "ggml.h"
+#include "quants.hpp"

-typedef float (*vec_dot_q_sycl_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs);
+typedef float (*vec_dot_q_sycl_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1,
+                                  const int & iqs);

 static __dpct_inline__ int get_int_from_int8(const int8_t* x8, const int& i32) {
  const uint16_t* x16 =
@@ -252,13 +255,60 @@ vec_dot_q6_K_q8_1_impl_mmvq(const int &vl, const int &vh,
 // VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called
 // MMVQ = mul_mat_vec_q, MMQ = mul_mat_q

+template <ggml_type T> struct reorder_vec_dot_q_sycl {
+    static_assert(T != T, "ggml_type for reorder vecdot not implemented");
+};
+
+template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q4_0> {
+    static constexpr ggml_type gtype = GGML_TYPE_Q4_0;
+
+    using q4_0_block  = ggml_sycl_reordered::block_q_t<GGML_TYPE_Q4_0>;
+    using q4_0_traits = typename q4_0_block::traits;
+
+    __dpct_inline__ float vec_dot_q4_0_q8_1_impl(const int * v, const int * u, const float & d4, const sycl::half2 & ds8) {
+        int sumi = 0;
+
+#pragma unroll
+        for (size_t i = 0; i < q4_0_traits::vdr_mmvq; ++i) {
+            const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
+            const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
+
+            // SIMD dot product of quantized values
+            sumi = dpct::dp4a(vi0, u[2 * i + 0], sumi);
+            sumi = dpct::dp4a(vi1, u[2 * i + 1], sumi);
+        }
+
+        const sycl::float2 ds8f = ds8.convert<float, sycl::rounding_mode::automatic>();
+
+        // second part effectively subtracts 8 from each quant value
+        return d4 * (sumi * ds8f.x() - (8 * q4_0_traits::vdr_mmvq / q4_0_traits::qi) * ds8f.y());
+    }
+
+    __dpct_inline__ float operator()(const void * __restrict__ vbq, const int ibx_offset, const int d_offset,
+                     const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
+        const uint8_t * bq4_0 = static_cast<const uint8_t *>(vbq) + ibx_offset;
+        const ggml_half d     = *(reinterpret_cast<const ggml_half *>(static_cast<const uint8_t *>(vbq) + d_offset));
+        int             v[q4_0_traits::vdr_mmvq];
+        int             u[2 * q4_0_traits::vdr_mmvq];
+
+#pragma unroll
+
+        for (size_t i = 0; i < q4_0_traits::vdr_mmvq; ++i) {
+            v[i]         = get_int_from_uint8(bq4_0, iqs + i);
+            u[2 * i + 0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
+            u[2 * i + 1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + q4_0_traits::qi);
+        }
+
+        return vec_dot_q4_0_q8_1_impl(v, u, d, bq8_1->ds);
+    };
+};
+
 #define VDR_Q4_0_Q8_1_MMVQ 2
 #define VDR_Q4_0_Q8_1_MMQ  4

 template <int vdr>
-static __dpct_inline__ float vec_dot_q4_0_q8_1_impl(const int *v, const int *u,
-                                                    const float &d4,
-                                                    const sycl::half2 &ds8) {
+static __dpct_inline__ float vec_dot_q4_0_q8_1_impl(const int * v, const int * u, const float & d4,
+                                                    const sycl::half2 & ds8) {
    int sumi = 0;
 #pragma unroll
    for (int i = 0; i < vdr; ++i) {
@@ -270,8 +320,7 @@ static __dpct_inline__ float vec_dot_q4_0_q8_1_impl(const int *v, const int *u,
        sumi = dpct::dp4a(vi1, u[2 * i + 1], sumi);
    }

-    const sycl::float2 ds8f =
-        ds8.convert<float, sycl::rounding_mode::automatic>();
+    const sycl::float2 ds8f = ds8.convert<float, sycl::rounding_mode::automatic>();

    // second part effectively subtracts 8 from each quant value
    return d4 * (sumi * ds8f.x() - (8 * vdr / QI4_0) * ds8f.y());
@@ -456,13 +505,13 @@ vec_dot_q4_0_q8_1(const void *__restrict__ vbq,
    const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;

    int v[VDR_Q4_0_Q8_1_MMVQ];
-    int u[2*VDR_Q4_0_Q8_1_MMVQ];
+    int u[2 * VDR_Q4_0_Q8_1_MMVQ];

 #pragma unroll
    for (int i = 0; i < VDR_Q4_0_Q8_1_MMVQ; ++i) {
-        v[i]     = get_int_from_uint8(bq4_0->qs, iqs + i);
-        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
-        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_0);
+        v[i]         = get_int_from_uint8(bq4_0->qs, iqs + i);
+        u[2 * i + 0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
+        u[2 * i + 1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_0);
    }

    return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMVQ>(v, u, bq4_0->d, bq8_1->ds);
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -275,6 +275,7 @@ struct vk_device_struct {
    bool prefer_host_memory;
    bool float_controls_rte_fp16;
    bool subgroup_add;
+    bool subgroup_shuffle;

    bool integer_dot_product;

@@ -402,12 +403,20 @@ struct vk_device_struct {
    vk_pipeline pipeline_conv2d_dw_cwhn_f32;

    // [2][2][2] is for {f16acc,f32acc}x{large,small_rows}x{unaligned, aligned}
+    vk_pipeline pipeline_flash_attn_f32_f16_D64_cm2[GGML_TYPE_COUNT][2][2][2];
+    vk_pipeline pipeline_flash_attn_f32_f16_D80_cm2[GGML_TYPE_COUNT][2][2][2];
+    vk_pipeline pipeline_flash_attn_f32_f16_D96_cm2[GGML_TYPE_COUNT][2][2][2];
+    vk_pipeline pipeline_flash_attn_f32_f16_D112_cm2[GGML_TYPE_COUNT][2][2][2];
+    vk_pipeline pipeline_flash_attn_f32_f16_D128_cm2[GGML_TYPE_COUNT][2][2][2];
+    vk_pipeline pipeline_flash_attn_f32_f16_D256_cm2[GGML_TYPE_COUNT][2][2][2];
+
    vk_pipeline pipeline_flash_attn_f32_f16_D64[GGML_TYPE_COUNT][2][2][2];
    vk_pipeline pipeline_flash_attn_f32_f16_D80[GGML_TYPE_COUNT][2][2][2];
    vk_pipeline pipeline_flash_attn_f32_f16_D96[GGML_TYPE_COUNT][2][2][2];
    vk_pipeline pipeline_flash_attn_f32_f16_D112[GGML_TYPE_COUNT][2][2][2];
    vk_pipeline pipeline_flash_attn_f32_f16_D128[GGML_TYPE_COUNT][2][2][2];
    vk_pipeline pipeline_flash_attn_f32_f16_D256[GGML_TYPE_COUNT][2][2][2];
+
    vk_pipeline pipeline_flash_attn_split_k_reduce;

    std::unordered_map<std::string, vk_pipeline_ref> pipelines;
@@ -1581,13 +1590,29 @@ static void ggml_vk_wait_events(vk_context& ctx, std::vector<vk::Event>&& events

 // number of rows/cols for flash attention shader
 static constexpr uint32_t flash_attention_num_small_rows = 32;
-static std::array<uint32_t, 2> fa_rows_cols(uint32_t D, uint32_t clamp, ggml_type type, bool small_rows) {
+static constexpr uint32_t scalar_flash_attention_num_small_rows = 1;
+static constexpr uint32_t scalar_flash_attention_num_large_rows = 8;
+
+static uint32_t get_fa_num_small_rows(bool scalar) {
+    return scalar ? scalar_flash_attention_num_small_rows : flash_attention_num_small_rows;
+}
+
+static std::array<uint32_t, 2> fa_rows_cols(bool scalar, uint32_t D, uint32_t clamp, ggml_type type, bool small_rows) {
    GGML_UNUSED(clamp);

+    if (scalar) {
+        if (small_rows) {
+            return {scalar_flash_attention_num_small_rows, 64};
+        } else {
+            return {scalar_flash_attention_num_large_rows, 32};
+        }
+    }
+
    // small rows, large cols
    if (small_rows) {
-        return {flash_attention_num_small_rows, 64};
+        return {get_fa_num_small_rows(scalar), 32};
    }
+
    // small cols to reduce register count
    if (ggml_is_quantized(type) || D == 256) {
        return {64, 32};
@@ -1632,7 +1657,7 @@ static bool ggml_vk_matmul_shmem_support(const vk_device& device, const std::vec
    const uint32_t warps = warptile[0] / warptile[10];

    const uint32_t load_bufs = (warptile[1] + warptile[2]) * (warptile[3] + bank_conflict_offset) * type_size;
-    const uint32_t mmid_row_ids = mul_mat_id ? 3072 * sizeof(uint32_t) : 0;
+    const uint32_t mmid_row_ids = mul_mat_id ? 4096 * sizeof(uint32_t) : 0;
    const uint32_t coopmat_stage = device->coopmat_support ? warptile[7] * warptile[8] / warps * sizeof(float) : 0;

    const uint32_t total_size = load_bufs + mmid_row_ids + coopmat_stage + lut_size;
@@ -1882,65 +1907,66 @@ static void ggml_vk_load_shaders(vk_device& device) {
                                      parameter_count, wg_denoms, specialization_constants, disable_robustness, require_full_subgroups, required_subgroup_size));
    };

+    auto const &fa_wg_denoms = [&](bool scalar, uint32_t D, uint32_t clamp, ggml_type type, bool small_rows) -> std::array<uint32_t, 3> {
+        return {fa_rows_cols(scalar, D, clamp, type, small_rows)[0], 1, 1};
+    };
+
+    auto const &fa_spec_constants = [&](bool scalar, uint32_t D, uint32_t clamp, ggml_type type, bool small_rows) -> std::vector<uint32_t> {
+        // For large number of rows, 128 invocations seems to work best.
+        // For small number of rows (e.g. N==1), 256 works better. But matrix granularity for 256 is 32, so we
+        // can't use 256 for D==80.
+        // For scalar, use 128 (arbitrary)
+        uint32_t wg_size = scalar ? 128 : ((small_rows && (D % 32) == 0) ? 256 : 128);
+        auto rows_cols = fa_rows_cols(scalar, D, clamp, type, small_rows);
+
+        // D_split can't be larger than a subgroup because we use subgroupShuffle to reduce it.
+        // D_split can't be larger than the LSB of D divided by 4 due to vectorization in the shader.
+        const uint32_t D_lsb = D ^ (D & (D-1));
+        uint32_t D_split = std::min(std::min(device->subgroup_size, 8u), D_lsb / 4);
+
+        // mask dim1 is padded to 64, we rely on this to avoid clamping mask loads
+        GGML_ASSERT((GGML_KQ_MASK_PAD % rows_cols[0]) == 0);
+        return {wg_size, rows_cols[0], rows_cols[1], (D), clamp, D_split};
+    };
+
+#define CREATE_FA2(TYPE, NAMELC, SCALAR, SUFFIX, D) \
+        ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D ## SUFFIX[TYPE][0][0][0], "flash_attn_f32_f16_D" #D "_f16acc"         #NAMELC #SUFFIX,           flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len,  flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data,  "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(SCALAR, D,1,TYPE,false), fa_spec_constants(SCALAR, D,1,TYPE,false), 1, true);     \
+        ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D ## SUFFIX[TYPE][0][0][1], "flash_attn_f32_f16_D" #D "_aligned_f16acc" #NAMELC #SUFFIX,           flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len,  flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data,  "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(SCALAR, D,0,TYPE,false), fa_spec_constants(SCALAR, D,0,TYPE,false), fa_rows_cols(SCALAR,D,0,TYPE,false)[1], true);     \
+        ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D ## SUFFIX[TYPE][1][0][0], "flash_attn_f32_f16_D" #D "_f32acc"         #NAMELC #SUFFIX,           flash_attn_f32_f16_ ## NAMELC ##     SUFFIX ## _len,         flash_attn_f32_f16_ ## NAMELC ##     SUFFIX ## _data,         "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(SCALAR, D,1,TYPE,false), fa_spec_constants(SCALAR, D,1,TYPE,false), 1, true);     \
+        ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D ## SUFFIX[TYPE][1][0][1], "flash_attn_f32_f16_D" #D "_aligned_f32acc" #NAMELC #SUFFIX,           flash_attn_f32_f16_ ## NAMELC ##     SUFFIX ## _len,         flash_attn_f32_f16_ ## NAMELC ##     SUFFIX ## _data,         "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(SCALAR, D,0,TYPE,false), fa_spec_constants(SCALAR, D,0,TYPE,false), fa_rows_cols(SCALAR,D,0,TYPE,false)[1], true);     \
+        ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D ## SUFFIX[TYPE][0][1][0], "flash_attn_f32_f16_D" #D "_f16acc_smallrows"         #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len,  flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data,  "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(SCALAR, D,1,TYPE,true), fa_spec_constants(SCALAR, D,1,TYPE,true), 1, true);     \
+        ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D ## SUFFIX[TYPE][0][1][1], "flash_attn_f32_f16_D" #D "_aligned_f16acc_smallrows" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len,  flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data,  "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(SCALAR, D,0,TYPE,true), fa_spec_constants(SCALAR, D,0,TYPE,true), fa_rows_cols(SCALAR,D,0,TYPE,true)[1], true);     \
+        ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D ## SUFFIX[TYPE][1][1][0], "flash_attn_f32_f16_D" #D "_f32acc_smallrows"         #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ##     SUFFIX ## _len,         flash_attn_f32_f16_ ## NAMELC ##     SUFFIX ## _data,         "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(SCALAR, D,1,TYPE,true), fa_spec_constants(SCALAR, D,1,TYPE,true), 1, true);     \
+        ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D ## SUFFIX[TYPE][1][1][1], "flash_attn_f32_f16_D" #D "_aligned_f32acc_smallrows" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ##     SUFFIX ## _len,         flash_attn_f32_f16_ ## NAMELC ##     SUFFIX ## _data,         "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(SCALAR, D,0,TYPE,true), fa_spec_constants(SCALAR, D,0,TYPE,true), fa_rows_cols(SCALAR,D,0,TYPE,true)[1], true);     \
+
+#define CREATE_FA(TYPE, NAMELC, SCALAR, SUFFIX) \
+        CREATE_FA2(TYPE, NAMELC, SCALAR, SUFFIX, 64) \
+        CREATE_FA2(TYPE, NAMELC, SCALAR, SUFFIX, 80) \
+        CREATE_FA2(TYPE, NAMELC, SCALAR, SUFFIX, 96) \
+        CREATE_FA2(TYPE, NAMELC, SCALAR, SUFFIX, 112) \
+        CREATE_FA2(TYPE, NAMELC, SCALAR, SUFFIX, 128) \
+        CREATE_FA2(TYPE, NAMELC, SCALAR, SUFFIX, 256)
+
+    CREATE_FA(GGML_TYPE_F16, f16, true, )
+    CREATE_FA(GGML_TYPE_Q4_0, q4_0, true, )
+    CREATE_FA(GGML_TYPE_Q8_0, q8_0, true, )
 #if defined(VK_NV_cooperative_matrix2) && defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
    if (device->coopmat2) {
-
-        auto const &fa_wg_denoms = [&](uint32_t D, uint32_t clamp, ggml_type type, bool small_rows) -> std::array<uint32_t, 3> {
-            return {fa_rows_cols(D, clamp, type, small_rows)[0], 1, 1};
-        };
-
-        auto const &fa_spec_constants = [&](uint32_t D, uint32_t clamp, ggml_type type, bool small_rows) -> std::vector<uint32_t> {
-            // For large number of rows, 128 invocations seems to work best.
-            // For small number of rows (e.g. N==1), 256 works better. But matrix granularity for 256 is 32, so we
-            // can't use 256 for D==80.
-            uint32_t wg_size = (small_rows && (D % 32) == 0) ? 256 : 128;
-            auto rows_cols = fa_rows_cols(D, clamp, type, small_rows);
-            // mask dim1 is padded to 64, we rely on this to avoid clamping mask loads
-            GGML_ASSERT((GGML_KQ_MASK_PAD % rows_cols[0]) == 0);
-            return {wg_size, rows_cols[0], rows_cols[1], (D), clamp};
-        };
-
-#define CREATE_FA2(TYPE, NAMELC, D) \
-        ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D[TYPE][0][0][0], "flash_attn_f32_f16_D" #D "_f16acc"         #NAMELC,           flash_attn_f32_f16_ ## NAMELC ## _f16acc_cm2_len,  flash_attn_f32_f16_ ## NAMELC ## _f16acc_cm2_data,  "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(D,1,TYPE,false), fa_spec_constants(D,1,TYPE,false), 1);     \
-        ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D[TYPE][0][0][1], "flash_attn_f32_f16_D" #D "_aligned_f16acc" #NAMELC,           flash_attn_f32_f16_ ## NAMELC ## _f16acc_cm2_len,  flash_attn_f32_f16_ ## NAMELC ## _f16acc_cm2_data,  "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(D,0,TYPE,false), fa_spec_constants(D,0,TYPE,false), fa_rows_cols(D,0,TYPE,false)[1]);     \
-        ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D[TYPE][1][0][0], "flash_attn_f32_f16_D" #D "_f32acc"         #NAMELC,           flash_attn_f32_f16_ ## NAMELC ## _cm2_len,         flash_attn_f32_f16_ ## NAMELC ## _cm2_data,         "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(D,1,TYPE,false), fa_spec_constants(D,1,TYPE,false), 1);     \
-        ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D[TYPE][1][0][1], "flash_attn_f32_f16_D" #D "_aligned_f32acc" #NAMELC,           flash_attn_f32_f16_ ## NAMELC ## _cm2_len,         flash_attn_f32_f16_ ## NAMELC ## _cm2_data,         "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(D,0,TYPE,false), fa_spec_constants(D,0,TYPE,false), fa_rows_cols(D,0,TYPE,false)[1]);     \
-        ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D[TYPE][0][1][0], "flash_attn_f32_f16_D" #D "_f16acc_smallrows"         #NAMELC, flash_attn_f32_f16_ ## NAMELC ## _f16acc_cm2_len,  flash_attn_f32_f16_ ## NAMELC ## _f16acc_cm2_data,  "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(D,1,TYPE,true), fa_spec_constants(D,1,TYPE,true), 1);     \
-        ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D[TYPE][0][1][1], "flash_attn_f32_f16_D" #D "_aligned_f16acc_smallrows" #NAMELC, flash_attn_f32_f16_ ## NAMELC ## _f16acc_cm2_len,  flash_attn_f32_f16_ ## NAMELC ## _f16acc_cm2_data,  "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(D,0,TYPE,true), fa_spec_constants(D,0,TYPE,true), fa_rows_cols(D,0,TYPE,true)[1]);     \
-        ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D[TYPE][1][1][0], "flash_attn_f32_f16_D" #D "_f32acc_smallrows"         #NAMELC, flash_attn_f32_f16_ ## NAMELC ## _cm2_len,         flash_attn_f32_f16_ ## NAMELC ## _cm2_data,         "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(D,1,TYPE,true), fa_spec_constants(D,1,TYPE,true), 1);     \
-        ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16_D ## D[TYPE][1][1][1], "flash_attn_f32_f16_D" #D "_aligned_f32acc_smallrows" #NAMELC, flash_attn_f32_f16_ ## NAMELC ## _cm2_len,         flash_attn_f32_f16_ ## NAMELC ## _cm2_data,         "main", 5, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(D,0,TYPE,true), fa_spec_constants(D,0,TYPE,true), fa_rows_cols(D,0,TYPE,true)[1]);     \
-
-#define CREATE_FA(TYPE, NAMELC) \
-        CREATE_FA2(TYPE, NAMELC, 64) \
-        CREATE_FA2(TYPE, NAMELC, 80) \
-        CREATE_FA2(TYPE, NAMELC, 96) \
-        CREATE_FA2(TYPE, NAMELC, 112) \
-        CREATE_FA2(TYPE, NAMELC, 128) \
-        CREATE_FA2(TYPE, NAMELC, 256)
-
-        CREATE_FA(GGML_TYPE_F16, f16)
-        CREATE_FA(GGML_TYPE_Q4_0, q4_0)
-        CREATE_FA(GGML_TYPE_Q4_1, q4_1)
-        CREATE_FA(GGML_TYPE_Q5_0, q5_0)
-        CREATE_FA(GGML_TYPE_Q5_1, q5_1)
-        CREATE_FA(GGML_TYPE_Q8_0, q8_0)
-        // K dequants currently disabled because D dimension is rounded up to 256 and runs inefficiently
-        //CREATE_FA(GGML_TYPE_Q2_K, q2_k)
-        //CREATE_FA(GGML_TYPE_Q3_K, q3_k)
-        //CREATE_FA(GGML_TYPE_Q4_K, q4_k)
-        //CREATE_FA(GGML_TYPE_Q5_K, q5_k)
-        //CREATE_FA(GGML_TYPE_Q6_K, q6_k)
-        //CREATE_FA(GGML_TYPE_IQ1_S, iq1_s)
-        //CREATE_FA(GGML_TYPE_IQ1_M, iq1_m)
-        //CREATE_FA(GGML_TYPE_IQ2_XXS, iq2_xxs)
-        //CREATE_FA(GGML_TYPE_IQ2_XS, iq2_xs)
-        //CREATE_FA(GGML_TYPE_IQ2_S, iq2_s)
-        //CREATE_FA(GGML_TYPE_IQ3_XXS, iq3_xxs)
-        //CREATE_FA(GGML_TYPE_IQ3_S, iq3_s)
-        //CREATE_FA(GGML_TYPE_IQ4_XS, iq4_xs)
-        CREATE_FA(GGML_TYPE_IQ4_NL, iq4_nl)
+        CREATE_FA(GGML_TYPE_F16, f16, false, _cm2)
+        CREATE_FA(GGML_TYPE_Q4_0, q4_0, false, _cm2)
+        CREATE_FA(GGML_TYPE_Q4_1, q4_1, false, _cm2)
+        CREATE_FA(GGML_TYPE_Q5_0, q5_0, false, _cm2)
+        CREATE_FA(GGML_TYPE_Q5_1, q5_1, false, _cm2)
+        CREATE_FA(GGML_TYPE_Q8_0, q8_0, false, _cm2)
+        CREATE_FA(GGML_TYPE_IQ4_NL, iq4_nl, false, _cm2)
+    }
+#endif
+#undef CREATE_FA2
 #undef CREATE_FA

+#if defined(VK_NV_cooperative_matrix2) && defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
+    if (device->coopmat2) {
+
        // Create 6 variants, {s,m,l}x{unaligned,aligned}
 #define CREATE_MM(PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT) \
        ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _cm2_len, NAMELC ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1);   \
@@ -2837,6 +2863,9 @@ static vk_device ggml_vk_get_device(size_t idx) {
        device->subgroup_add = (vk11_props.subgroupSupportedStages & vk::ShaderStageFlagBits::eCompute) &&
                               (vk11_props.subgroupSupportedOperations & vk::SubgroupFeatureFlagBits::eArithmetic);

+        device->subgroup_shuffle = (vk11_props.subgroupSupportedStages & vk::ShaderStageFlagBits::eCompute) &&
+                                   (vk11_props.subgroupSupportedOperations & vk::SubgroupFeatureFlagBits::eShuffle);
+
        const bool force_disable_f16 = getenv("GGML_VK_DISABLE_F16") != nullptr;

        device->fp16 = !force_disable_f16 && fp16_storage && fp16_compute;
@@ -5260,7 +5289,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context&

    const uint64_t nei0 = ids->ne[0];
    const uint64_t nei1 = ids->ne[1];
-    GGML_ASSERT(nei0 * nei1 <= 3072);
+    GGML_ASSERT(nei0 * nei1 <= 4096);

    const uint32_t nbi1 = ids->nb[1];
    const uint32_t nbi2 = ids->nb[2];
@@ -5709,20 +5738,57 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
    assert(q->type == GGML_TYPE_F32);
    assert(k->type == v->type);

+    bool scalar = !ctx->device->coopmat2;
+
+    uint32_t gqa_ratio = 1;
+    uint32_t qk_ratio = neq2 / nek2;
+    uint32_t workgroups_x = (uint32_t)neq1;
+    uint32_t workgroups_y = (uint32_t)neq2;
+    uint32_t workgroups_z = (uint32_t)neq3;
+
+    // For scalar FA, we can use the "large" size to accommodate qga.
+    // For coopmat FA, we always use the small size (which is still pretty large for gqa).
+    const uint32_t max_gqa = scalar ? scalar_flash_attention_num_large_rows : get_fa_num_small_rows(false);
+
+    if (N == 1 && qk_ratio > 1 && qk_ratio <= max_gqa &&
+        qk_ratio * nek2 == neq2 && nek2 == nev2 && neq3 == 1 && nek3 == 1 && nev3 == 1) {
+        // grouped query attention - make the N dimension equal to gqa_ratio, reduce
+        // workgroups proportionally in y dimension. The shader will detect gqa_ratio > 1
+        // and change addressing calculations to index Q's dimension 2.
+        gqa_ratio = qk_ratio;
+        N = gqa_ratio;
+        workgroups_y /= N;
+    }
+
    vk_pipeline *pipelines;
    // XXX TODO other backends may be changing accumulator precision to default to f32 soon
-    bool f32acc = dst->op_params[3] == GGML_PREC_F32;
-    bool small_rows = N <= flash_attention_num_small_rows;
-    switch (D) {
-    case 64: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D64[k->type][f32acc][small_rows][0]; break;
-    case 80: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D80[k->type][f32acc][small_rows][0]; break;
-    case 96: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D96[k->type][f32acc][small_rows][0]; break;
-    case 112: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D112[k->type][f32acc][small_rows][0]; break;
-    case 128: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D128[k->type][f32acc][small_rows][0]; break;
-    case 256: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D256[k->type][f32acc][small_rows][0]; break;
-    default:
-        assert(!"unsupported D value");
-        return;
+    bool f32acc = scalar || dst->op_params[3] == GGML_PREC_F32;
+    bool small_rows = N <= get_fa_num_small_rows(scalar);
+
+    if (scalar) {
+        switch (D) {
+        case 64: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D64[k->type][f32acc][small_rows][0]; break;
+        case 80: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D80[k->type][f32acc][small_rows][0]; break;
+        case 96: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D96[k->type][f32acc][small_rows][0]; break;
+        case 112: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D112[k->type][f32acc][small_rows][0]; break;
+        case 128: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D128[k->type][f32acc][small_rows][0]; break;
+        case 256: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D256[k->type][f32acc][small_rows][0]; break;
+        default:
+            GGML_ASSERT(!"unsupported D value");
+            return;
+        }
+    } else {
+        switch (D) {
+        case 64: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D64_cm2[k->type][f32acc][small_rows][0]; break;
+        case 80: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D80_cm2[k->type][f32acc][small_rows][0]; break;
+        case 96: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D96_cm2[k->type][f32acc][small_rows][0]; break;
+        case 112: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D112_cm2[k->type][f32acc][small_rows][0]; break;
+        case 128: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D128_cm2[k->type][f32acc][small_rows][0]; break;
+        case 256: pipelines = &ctx->device->pipeline_flash_attn_f32_f16_D256_cm2[k->type][f32acc][small_rows][0]; break;
+        default:
+            GGML_ASSERT(!"unsupported D value");
+            return;
+        }
    }
    assert(pipelines);

@@ -5740,27 +5806,14 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
    vk_pipeline pipeline = pipelines[aligned];
    assert(pipeline);

-    uint32_t gqa_ratio = 1;
-    uint32_t qk_ratio = neq2 / nek2;
-    uint32_t workgroups_x = (uint32_t)neq1;
-    uint32_t workgroups_y = (uint32_t)neq2;
-    uint32_t workgroups_z = (uint32_t)neq3;
-
-    if (N == 1 && qk_ratio > 1 && gqa_ratio <= flash_attention_num_small_rows &&
-        qk_ratio * nek2 == neq2 && nek2 == nev2 && neq3 == 1 && nek3 == 1 && nev3 == 1) {
-        // grouped query attention - make the N dimension equal to gqa_ratio, reduce
-        // workgroups proportionally in y dimension. The shader will detect gqa_ratio > 1
-        // and change addressing calculations to index Q's dimension 2.
-        gqa_ratio = qk_ratio;
-        N = gqa_ratio;
-        workgroups_y /= N;
-    }
-
    uint32_t split_kv = KV;
    uint32_t split_k = 1;

+    // Use a placeholder core count if one isn't available. split_k is a big help for perf.
+    const uint32_t shader_core_count = ctx->device->shader_core_count ? ctx->device->shader_core_count : 16;
+
    // Try to use split_k when KV is large enough to be worth the overhead
-    if (workgroups_x == 1 && ctx->device->shader_core_count > 0 && KV >= 512) {
+    if (workgroups_x == 1 && shader_core_count > 0 && KV >= 512) {
        // Try to run two workgroups per SM.
        split_k = ctx->device->shader_core_count * 2 / workgroups_y;
        if (split_k > 1) {
@@ -9530,9 +9583,8 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
        case GGML_OP_FLASH_ATTN_EXT:
            {
                ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
-                if (!ggml_vk_get_device(ctx->device)->coopmat2) {
-                    return false;
-                }
+                auto device = ggml_vk_get_device(ctx->device);
+                bool coopmat2 = device->coopmat2;
                switch (op->src[0]->ne[0]) {
                case 64:
                case 80:
@@ -9540,7 +9592,6 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
                case 112:
                case 128:
                case 256:
-                case 575: // DeepSeek MLA
                    break;
                default:
                    return false;
@@ -9566,10 +9617,12 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
                switch (op->src[1]->type) {
                case GGML_TYPE_F16:
                case GGML_TYPE_Q4_0:
+                case GGML_TYPE_Q8_0:
+                    // supported in scalar and coopmat2 paths
+                    break;
                case GGML_TYPE_Q4_1:
                case GGML_TYPE_Q5_0:
                case GGML_TYPE_Q5_1:
-                case GGML_TYPE_Q8_0:
                // K dequants currently disabled because D dimension is rounded up to 256 and runs inefficiently
                //case GGML_TYPE_Q2_K:
                //case GGML_TYPE_Q3_K:
@@ -9585,10 +9638,18 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
                //case GGML_TYPE_IQ3_S:
                //case GGML_TYPE_IQ4_XS:
                case GGML_TYPE_IQ4_NL:
+                    // currently supported only in coopmat2 path
+                    if (!coopmat2) {
+                        return false;
+                    }
                    break;
                default:
                    return false;
                }
+                if (!coopmat2 && !device->subgroup_shuffle) {
+                    // scalar FA uses subgroupShuffle
+                    return false;
+                }
                return true;
            }
        case GGML_OP_GET_ROWS:
--- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp
@@ -0,0 +1,483 @@
+#version 450
+
+#extension GL_EXT_control_flow_attributes : enable
+#extension GL_EXT_shader_16bit_storage : require
+
+#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require
+#extension GL_EXT_shader_explicit_arithmetic_types_int32 : require
+
+#extension GL_KHR_shader_subgroup_shuffle : enable
+
+#include "types.comp"
+
+layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
+
+layout (constant_id = 1) const uint32_t Br = 1;
+layout (constant_id = 2) const uint32_t Bc = 32;
+layout (constant_id = 3) const uint32_t D = 32;
+
+layout (constant_id = 5) const uint32_t D_split = 16;
+const uint32_t D_per_thread = D / D_split;
+
+const uint32_t cols_per_iter = gl_WorkGroupSize.x / D_split;
+const uint32_t cols_per_thread = Bc / cols_per_iter;
+
+layout (push_constant) uniform parameter {
+    uint32_t N;
+    uint32_t KV;
+
+    uint32_t ne1;
+    uint32_t ne2;
+    uint32_t ne3;
+
+    uint32_t neq2;
+    uint32_t neq3;
+    uint32_t nek2;
+    uint32_t nek3;
+    uint32_t nev2;
+    uint32_t nev3;
+    uint32_t nem1;
+
+    uint32_t nb01;
+    uint32_t nb02;
+    uint32_t nb03;
+    uint32_t nb11;
+    uint32_t nb12;
+    uint32_t nb13;
+    uint32_t nb21;
+    uint32_t nb22;
+    uint32_t nb23;
+    uint32_t nb31;
+
+    float scale;
+    float max_bias;
+    float logit_softcap;
+
+    uint32_t mask;
+    uint32_t n_head_log2;
+    float m0;
+    float m1;
+
+    uint32_t gqa_ratio;
+    uint32_t split_kv;
+    uint32_t k_num;
+} p;
+
+layout (binding = 0) readonly buffer Q {float data_q[];};
+layout (binding = 0) readonly buffer QV4 {vec4 data_qv4[];};
+layout (binding = 1) readonly buffer K {float16_t data_k[];};
+layout (binding = 1) readonly buffer KV4 {f16vec4 data_kv4[];};
+layout (binding = 2) readonly buffer V {float16_t data_v[];};
+layout (binding = 2) readonly buffer VV4 {f16vec4 data_vv4[];};
+layout (binding = 3) readonly buffer M {float16_t data_m[];};
+layout (binding = 4) writeonly buffer O {D_TYPE data_o[];};
+
+#if defined(A_TYPE_PACKED16)
+#define BINDING_IDX_K 0
+#define BINDING_IDX_V 1
+layout (binding = 1) readonly buffer KV_PACKED16 {A_TYPE_PACKED16 data_packed16[];} kv_packed[2];
+#endif
+
+#if defined(DATA_A_Q4_0)
+#define BLOCK_BYTE_SIZE 18
+
+vec4 dequantize4(uint ib, uint iqs, uint a_offset, uint binding_idx) {
+    uint vui_lo = uint(kv_packed[binding_idx].data_packed16[a_offset + ib].qs[(iqs & 0xF) / 2 + 0]);
+    uint vui_hi = uint(kv_packed[binding_idx].data_packed16[a_offset + ib].qs[(iqs & 0xF) / 2 + 1]);
+    uint shift = (iqs & 0x10) >> 2;
+    vui_lo >>= shift;
+    vui_hi >>= shift;
+
+    return float(kv_packed[binding_idx].data_packed16[a_offset + ib].d) * (vec4(vui_lo & 0xF, (vui_lo >> 8) & 0xF, vui_hi & 0xF, (vui_hi >> 8) & 0xF) - 8.0f);
+}
+#endif
+
+#if defined(DATA_A_Q8_0)
+#define BLOCK_BYTE_SIZE 34
+vec4 dequantize4(uint ib, uint iqs, uint a_offset, uint binding_idx) {
+    const i8vec2 v0 = unpack8(int32_t(kv_packed[binding_idx].data_packed16[a_offset + ib].qs[iqs / 2])).xy; // vec4 used due to #12147
+    const i8vec2 v1 = unpack8(int32_t(kv_packed[binding_idx].data_packed16[a_offset + ib].qs[iqs / 2 + 1])).xy;
+
+    return float(kv_packed[binding_idx].data_packed16[a_offset + ib].d) * vec4(v0.x, v0.y, v1.x, v1.y);
+}
+#endif
+
+#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
+
+// Store the output when doing grouped query attention.
+// Rows index by Q's dimension 2, and the first N rows are valid.
+D_TYPE perElemOpGqaStore(const in uint32_t r, const in uint32_t c, const in D_TYPE elem, const in uint32_t o_offset, const in uint32_t iq2, const in uint32_t N)
+{
+    uint32_t offset = (iq2 + r) * D + c;
+    data_o[o_offset + offset] = D_TYPE(elem);
+    return elem;
+}
+
+// Store column zero. This is used to save per-row m and L values for split_k.
+ACC_TYPE perElemOpStoreCol0(const in uint32_t r, const in uint32_t c, const in ACC_TYPE elem, const in uint32_t o_offset, const in uint32_t iq2, const in uint32_t N)
+{
+    if (r < N && c == 0) {
+        uint32_t offset = iq2 + r;
+        data_o[o_offset + offset] = D_TYPE(elem);
+    }
+    return elem;
+}
+
+// Load the slope matrix, indexed by Q's dimension 2.
+ACC_TYPE perElemOpComputeSlope(const in uint32_t r, const in uint32_t c, const in ACC_TYPE elem, const in uint32_t iq2)
+{
+    const uint32_t h = iq2 + (r % p.gqa_ratio);
+
+    const ACC_TYPE base = ACC_TYPE(h < p.n_head_log2 ? p.m0 : p.m1);
+    const int      exph = int(h < p.n_head_log2 ? h + 1 : 2*(h - p.n_head_log2) + 1);
+
+    return ACC_TYPE(pow(base, ACC_TYPE(exph)));
+}
+
+shared FLOAT_TYPE tmpsh[gl_WorkGroupSize.x];
+shared vec4 tmpshv4[gl_WorkGroupSize.x];
+
+shared float masksh[Bc][Br];
+shared vec4 Qf[Br][D / 4];
+
+void main() {
+#ifdef NEEDS_INIT_IQ_SHMEM
+    init_iq_shmem(gl_WorkGroupSize);
+#endif
+
+    const uint32_t tid = gl_LocalInvocationIndex;
+    const uint32_t N = p.N;
+    const uint32_t KV = p.KV;
+
+    const uint32_t d_tid = gl_LocalInvocationIndex % D_split;
+    const uint32_t col_tid = gl_LocalInvocationIndex / D_split;
+
+    uint32_t i = gl_WorkGroupID.x;
+    uint32_t split_k_index = 0;
+
+    if (p.k_num > 1) {
+        i = 0;
+        split_k_index = gl_WorkGroupID.x;
+    }
+
+    const uint32_t Tr = CEIL_DIV(N, Br);
+
+    const uint32_t start_j = split_k_index * p.split_kv / Bc;
+    const uint32_t end_j = CEIL_DIV(min(KV, (split_k_index + 1) * p.split_kv), Bc);
+
+    // When not using grouped query attention, all rows share the same iq2, equal to gl_WorkGroupID.y.
+    // When using grouped query attention, each workgroup does gqa_ratio consecutive values of iq2.
+    const uint32_t iq2 = gl_WorkGroupID.y * p.gqa_ratio;
+    const uint32_t iq3 = gl_WorkGroupID.z;
+
+    // broadcast factors
+    const uint32_t rk2 = p.neq2/p.nek2;
+    const uint32_t rk3 = p.neq3/p.nek3;
+
+    const uint32_t rv2 = p.neq2/p.nev2;
+    const uint32_t rv3 = p.neq3/p.nev3;
+
+    // k indices
+    const uint32_t ik3 = iq3 / rk3;
+    const uint32_t ik2 = iq2 / rk2;
+
+    // v indices
+    const uint32_t iv3 = iq3 / rv3;
+    const uint32_t iv2 = iq2 / rv2;
+
+    // nb?1 are already divided by the type size and are in units of elements.
+    // When using grouped query attention, Q is indexed by iq2, so the stride
+    // should be nb02 (which is in bytes).
+    uint32_t q_stride = p.gqa_ratio > 1 ? (p.nb02 / 4) : p.nb01;
+    uint32_t k_stride = p.nb11;
+    uint32_t v_stride = p.nb21;
+    // When using grouped query attention, all rows use the same mask (stride 0).
+    // "p.gqa_ratio >> 16" is just a roundabout way of writing zero
+    // that prevents the compiler from folding the "&" through the select
+    // and breaking the alignment detection.
+    uint32_t m_stride = (p.gqa_ratio > 1) ? (p.gqa_ratio >> 16) : KV;
+
+    uint32_t q_offset = (iq2*p.nb02+iq3*p.nb03) / 4;
+
+    [[unroll]] for (uint32_t idx = 0; idx < Br * D / 4; idx += gl_WorkGroupSize.x) {
+        uint32_t d = (idx + tid) % (D / 4);
+        uint32_t r = (idx + tid) / (D / 4);
+        if (r < Br && d < D / 4 &&
+            i * Br + r < N) {
+            Qf[r][d] = vec4(data_qv4[q_offset / 4 + (i * Br + r) * q_stride / 4 + d]) * p.scale;
+        }
+    }
+    barrier();
+
+    vec4 Of[Br][D_per_thread / 4];
+    [[unroll]] for (uint32_t d = 0; d < D_per_thread / 4; ++d) {
+        [[unroll]] for (uint32_t r = 0; r < Br; ++r) {
+            Of[r][d] = vec4(0.0);
+        }
+    }
+
+    float Lf[Br], Mf[Br];
+
+    // Use -FLT_MAX/2 rather than -inf to reduce the possibility of NaNs, e.g. when computing Mold-M.
+    const float NEG_FLT_MAX_OVER_2 = uintBitsToFloat(0xFEFFFFFF);
+
+    [[unroll]] for (uint32_t r = 0; r < Br; ++r) {
+        Lf[r] = 0;
+        Mf[r] = NEG_FLT_MAX_OVER_2;
+    }
+
+    float slope[Br];
+    [[unroll]] for (uint32_t r = 0; r < Br; ++r) {
+        slope[r] = 1.0;
+    }
+
+    // ALiBi
+    if (p.max_bias > 0.0f) {
+        [[unroll]] for (uint32_t r = 0; r < Br; ++r) {
+            slope[r] = perElemOpComputeSlope(r, col_tid, ACC_TYPE(0), iq2);
+        }
+    }
+
+#if BLOCK_SIZE > 1
+    uint32_t k_offset = (ik2*p.nb12 + ik3*p.nb13) / BLOCK_BYTE_SIZE;
+    uint32_t v_offset = (iv2*p.nb22 + iv3*p.nb23) / BLOCK_BYTE_SIZE;
+#else
+    uint32_t k_offset = (ik2*p.nb12 + ik3*p.nb13) / 2;
+    uint32_t v_offset = (iv2*p.nb22 + iv3*p.nb23) / 2;
+#endif
+
+    [[dont_unroll]]
+    for (uint32_t j = start_j; j < end_j; ++j) {
+
+        float Sf[Br][cols_per_thread];
+        [[unroll]] for (uint32_t r = 0; r < Br; ++r) {
+            [[unroll]] for (uint32_t c = 0; c < cols_per_thread; ++c) {
+                Sf[r][c] = 0.0;
+            }
+        }
+
+
+        [[unroll]] for (uint32_t c = 0; c < cols_per_thread; ++c) {
+            [[unroll]] for (uint32_t d = 0; d < D_per_thread / 4; ++d) {
+#if BLOCK_SIZE > 1
+                uint coord = (j * Bc + c * cols_per_iter + col_tid) * k_stride * BLOCK_SIZE + 4 * (d * D_split + d_tid);
+                uint ib = coord / BLOCK_SIZE;
+                uint iqs = (coord % BLOCK_SIZE);
+                vec4 K_Tf = dequantize4(ib, iqs, k_offset, BINDING_IDX_K);
+#else
+                vec4 K_Tf = vec4(data_kv4[k_offset / 4 + (j * Bc + c * cols_per_iter + col_tid) * k_stride / 4 + d * D_split + d_tid]);
+#endif
+                [[unroll]] for (uint32_t r = 0; r < Br; ++r) {
+                    Sf[r][c] += dot(Qf[r][d * D_split + d_tid], K_Tf);
+                }
+            }
+        }
+
+        [[unroll]] for (uint32_t c = 0; c < cols_per_thread; ++c) {
+            // Compute sum across the D_split
+            [[unroll]] for (uint s = D_split / 2; s > 0; s >>= 1) {
+                [[unroll]] for (uint32_t r = 0; r < Br; ++r) {
+                    Sf[r][c] += subgroupShuffleXor(Sf[r][c], s);
+                }
+            }
+        }
+
+        if (p.logit_softcap != 0.0f) {
+            [[unroll]] for (uint32_t r = 0; r < Br; ++r) {
+                [[unroll]] for (uint32_t c = 0; c < cols_per_thread; ++c) {
+                    Sf[r][c] = p.logit_softcap * tanh(Sf[r][c]);
+                }
+            }
+        }
+
+        if (p.mask != 0) {
+
+            [[unroll]] for (uint32_t idx = 0; idx < Bc * Br; idx += gl_WorkGroupSize.x) {
+                uint32_t c = (idx + tid) % Bc;
+                uint32_t r = (idx + tid) / Bc;
+                if (idx + tid < Bc * Br) {
+                    masksh[c][r] = float(data_m[(i * Br + r) * m_stride + (j * Bc + c)]);
+                }
+            }
+            barrier();
+
+            [[unroll]] for (uint32_t c = 0; c < cols_per_thread; ++c) {
+                [[unroll]] for (uint32_t r = 0; r < Br; ++r) {
+                    float mvf = masksh[c * cols_per_iter + col_tid][r];
+
+                    Sf[r][c] += slope[r]*mvf;
+                }
+            }
+            barrier();
+        }
+
+        float rowmaxf[Br], Pf[Br][cols_per_thread], rowsumf[Br], eMf[Br], Moldf[Br];
+        [[unroll]] for (uint32_t r = 0; r < Br; ++r) {
+            rowmaxf[r] = Sf[r][0];
+            [[unroll]] for (uint32_t c = 0; c < cols_per_thread; ++c) {
+                rowmaxf[r] = max(rowmaxf[r], Sf[r][c]);
+            }
+            Moldf[r] = Mf[r];
+
+            // M = max(rowmax, Mold)
+            // P = e^(S - M)
+            // eM = e^(Mold - M)
+            Mf[r] = max(rowmaxf[r], Moldf[r]);
+            [[unroll]] for (uint32_t c = 0; c < cols_per_thread; ++c) {
+                Pf[r][c] = exp(Sf[r][c] - Mf[r]);
+            }
+            eMf[r] = exp(Moldf[r] - Mf[r]);
+
+            // Compute sum across row of P
+            rowsumf[r] = 0.0;
+            [[unroll]] for (uint32_t c = 0; c < cols_per_thread; ++c) {
+                rowsumf[r] += Pf[r][c];
+            }
+
+            Lf[r] = eMf[r]*Lf[r] + rowsumf[r];
+        }
+
+        [[unroll]] for (uint32_t d = 0; d < D_per_thread / 4; ++d) {
+            [[unroll]] for (uint32_t r = 0; r < Br; ++r) {
+                Of[r][d] = eMf[r] * Of[r][d];
+            }
+        }
+
+        [[unroll]] for (uint32_t c = 0; c < cols_per_thread; ++c) {
+            [[unroll]] for (uint32_t d = 0; d < D_per_thread / 4; ++d) {
+#if BLOCK_SIZE > 1
+                uint coord = (j * Bc + c * cols_per_iter + col_tid) * v_stride * BLOCK_SIZE + 4 * (d * D_split + d_tid);
+                uint ib = coord / BLOCK_SIZE;
+                uint iqs = (coord % BLOCK_SIZE);
+                vec4 Vf = dequantize4(ib, iqs, v_offset, BINDING_IDX_V);
+#else
+                vec4 Vf = vec4(data_vv4[v_offset / 4 + (j * Bc + c * cols_per_iter + col_tid) * v_stride / 4 + d * D_split + d_tid]);
+#endif
+                [[unroll]] for (uint32_t r = 0; r < Br; ++r) {
+                    Of[r][d] += Pf[r][c] * Vf;
+                }
+            }
+        }
+
+        barrier();
+    }
+
+    // reduce across threads
+
+    [[unroll]] for (uint32_t r = 0; r < Br; ++r) {
+        float rowmaxf, eMf;
+
+        tmpsh[tid] = Mf[r];
+        // Compute max across the row
+        barrier();
+        [[unroll]] for (int s = int(gl_WorkGroupSize.x) / 2; s >= D_split; s >>= 1) {
+            if (tid < s) {
+                tmpsh[tid] = max(tmpsh[tid], tmpsh[tid + s]);
+            }
+            barrier();
+        }
+        rowmaxf = tmpsh[d_tid];
+        barrier();
+
+        float Moldf = Mf[r];
+
+        // M = max(rowmax, Mold)
+        // eM = e^(Mold - M)
+        Mf[r] = max(rowmaxf, Moldf);
+        eMf = exp(Moldf - Mf[r]);
+
+        Lf[r] = eMf*Lf[r];
+
+        tmpsh[tid] = Lf[r];
+
+        // Compute sum across the row
+        barrier();
+        [[unroll]] for (int s = int(gl_WorkGroupSize.x) / 2; s >= D_split; s >>= 1) {
+            if (tid < s) {
+                tmpsh[tid] = tmpsh[tid] + tmpsh[tid + s];
+            }
+            barrier();
+        }
+        Lf[r] = tmpsh[d_tid];
+        barrier();
+
+        [[unroll]] for (uint32_t d = 0; d < D_per_thread / 4; ++d) {
+
+            Of[r][d] = eMf * Of[r][d];
+            tmpshv4[tid] = Of[r][d];
+
+            barrier();
+            [[unroll]] for (int s = int(gl_WorkGroupSize.x) / 2; s >= D_split; s >>= 1) {
+                if (tid < s) {
+                    Of[r][d] += tmpshv4[tid + s];
+                    tmpshv4[tid] = Of[r][d];
+                }
+                barrier();
+            }
+            Of[r][d] = tmpshv4[d_tid];
+            barrier();
+        }
+    }
+
+
+    // If there is split_k, then the split_k resolve shader does the final
+    // division by L. Store the intermediate O value and per-row m and L values.
+    if (p.k_num > 1) {
+        uint32_t o_offset = D * p.ne1 * split_k_index;
+
+        [[unroll]] for (uint32_t r = 0; r < Br; ++r) {
+            if (r < N) {
+                [[unroll]] for (uint32_t d = 0; d < D_per_thread / 4; ++d) {
+                    [[unroll]] for (uint32_t comp = 0; comp < 4; ++comp) {
+                        perElemOpGqaStore(r, 4*(d * D_split + d_tid) + comp, Of[r][d][comp], o_offset, iq2, N);
+                    }
+                }
+            }
+        }
+
+        o_offset = D * p.ne1 * p.k_num + p.ne1 * split_k_index * 2;
+        [[unroll]] for (uint32_t r = 0; r < Br; ++r) {
+            if (r < N) {
+                perElemOpStoreCol0(r, 0u, ACC_TYPE(Lf[r]), o_offset, iq2, N);
+                perElemOpStoreCol0(r, 0u, ACC_TYPE(Mf[r]), o_offset + p.ne1, iq2, N);
+            }
+        }
+
+        return;
+    }
+
+    float Lfrcp[Br];
+    [[unroll]] for (uint32_t r = 0; r < Br; ++r) {
+        Lfrcp[r] = 1.0 / Lf[r];
+    }
+
+    [[unroll]] for (uint32_t d = 0; d < D_per_thread / 4; ++d) {
+        [[unroll]] for (uint32_t r = 0; r < Br; ++r) {
+            Of[r][d] *= Lfrcp[r];
+        }
+    }
+
+    uint32_t o_offset = iq3*p.ne2*p.ne1;
+
+    if (p.gqa_ratio > 1) {
+        [[unroll]] for (uint32_t r = 0; r < Br; ++r) {
+            if (r < N) {
+                [[unroll]] for (uint32_t d = 0; d < D_per_thread / 4; ++d) {
+                    [[unroll]] for (uint32_t comp = 0; comp < 4; ++comp) {
+                        perElemOpGqaStore(r, 4*(d * D_split + d_tid) + comp, Of[r][d][comp], o_offset, iq2, N);
+                    }
+                }
+            }
+        }
+    } else {
+        [[unroll]] for (uint32_t r = 0; r < Br; ++r) {
+            if (i * Br + r < N) {
+                [[unroll]] for (uint32_t d = 0; d < D_per_thread / 4; ++d) {
+                    [[unroll]] for (uint32_t comp = 0; comp < 4; ++comp) {
+                        data_o[o_offset + iq2 * D + (i * Br + r) * p.ne1 * D + 4*(d * D_split + d_tid) + comp] = D_TYPE(Of[r][d][comp]);
+                    }
+                }
+            }
+        }
+    }
+}
--- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp
@@ -103,7 +103,7 @@ shared FLOAT_TYPE buf_a[BM * SHMEM_STRIDE];
 shared FLOAT_TYPE buf_b[BN * SHMEM_STRIDE];

 #ifdef MUL_MAT_ID
-shared u16vec2 row_ids[3072];
+shared u16vec2 row_ids[4096];
 #endif // MUL_MAT_ID

 #define NUM_WARPS (BLOCK_SIZE / WARP)
--- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp
@@ -92,7 +92,7 @@ layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};
 #ifdef MUL_MAT_ID
 layout (binding = 3) readonly buffer IDS {int data_ids[];};

-shared u16vec4 row_ids[3072];
+shared u16vec4 row_ids[4096];

 layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufB {
   B_TYPE b[];
--- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp
@@ -101,7 +101,7 @@ shared FLOAT_TYPE_VEC2 buf_b_ds[BN];
 #define LOAD_VEC_B 4

 #ifdef MUL_MAT_ID
-shared u16vec2 row_ids[3072];
+shared u16vec2 row_ids[4096];
 #endif // MUL_MAT_ID

 #define NUM_WARPS (BLOCK_SIZE / WARP)
--- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
@@ -421,7 +421,6 @@ void process_shaders() {
 #endif
    }

-#if defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
    // flash attention
    for (const auto& f16acc : {false, true}) {
        std::string acctype = f16acc ? "float16_t" : "float";
@@ -432,6 +431,7 @@ void process_shaders() {
            }
            if (tname == "bf16") continue;

+#if defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
            if (tname == "f16") {
                string_to_spv("flash_attn_f32_f16_" + tname, "flash_attn_cm2.comp",
                    merge_maps(base_dict, {{"Q_TYPE", "float"}, {"D_TYPE", "float"}, {"ACC_TYPE", acctype}}), true, false, true, f16acc);
@@ -440,9 +440,17 @@ void process_shaders() {
                string_to_spv("flash_attn_f32_f16_" + tname, "flash_attn_cm2.comp",
                    merge_maps(base_dict, {{data_a_key, "1"}, {"Q_TYPE", "float"}, {"D_TYPE", "float"}, {"ACC_TYPE", acctype}, {"DEQUANTFUNC", "dequantFunc"+to_uppercase(tname) }, {"BLOCK_SIZE", "QUANT_K_"+to_uppercase(tname) }}), true, false, true, f16acc);
            }
+#endif
+            if (tname == "f16") {
+                string_to_spv("flash_attn_f32_f16_" + tname, "flash_attn.comp",
+                    merge_maps(base_dict, {{"Q_TYPE", "float"}, {"D_TYPE", "float"}, {"ACC_TYPE", acctype}}), true, false, false, f16acc);
+            } else if (tname == "q4_0" || tname == "q8_0") {
+                std::string data_a_key = "DATA_A_" + to_uppercase(tname);
+                string_to_spv("flash_attn_f32_f16_" + tname, "flash_attn.comp",
+                    merge_maps(base_dict, {{data_a_key, "1"}, {"Q_TYPE", "float"}, {"D_TYPE", "float"}, {"ACC_TYPE", acctype}, {"BLOCK_SIZE", "QUANT_K_"+to_uppercase(tname) }}), true, false, false, f16acc);
+            }
        }
    }
-#endif

    for (const auto& tname : type_names) {
        // mul mat vec
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -2732,11 +2732,11 @@ void ggml_mul_mat_set_prec(
    c = ggml_mul_mat_id(ctx, as, b, ids);

    as  -> [cols, rows, n_expert]
-    ids -> [n_experts_used, n_tokens] (i32)
    b   -> [cols, n_expert_used, n_tokens]
+    ids -> [n_expert_used, n_tokens] (i32)
    c   -> [rows, n_expert_used, n_tokens]

-    in b, n_experts_used can be broadcasted to match the n_expert_used of ids
+    in b, n_expert_used can be broadcasted to match the n_expert_used of ids

    c ~= as[:,:,i] @ b[:,i%r,t], i = ids[e,t] for all e,t in ids
 */
@@ -5499,7 +5499,7 @@ static void ggml_compute_backward(
            // tensor = src0 * 1 + src1 * 0
            if (src0_needs_grads) {
                // dsrc0 = dtensor * 1
-                ggml_add_or_set(ctx, cgraph, isrc0, grad);
+                ggml_add_or_set(ctx, cgraph, isrc0, ggml_reshape(ctx, grad, src0));
            }
            if (src1_needs_grads) {
                // dsrc1 = dtensor * 0 -> noop
@@ -5780,10 +5780,9 @@ void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor *
 }

 void ggml_build_backward_expand(
-        struct ggml_context * ctx_static,
-        struct ggml_context * ctx_compute,
-        struct ggml_cgraph  * cgraph,
-        bool                  accumulate) {
+        struct ggml_context *  ctx,
+        struct ggml_cgraph  *  cgraph,
+        struct ggml_tensor  ** grad_accs) {
    GGML_ASSERT(cgraph->n_nodes > 0);
    GGML_ASSERT(cgraph->grads);
    GGML_ASSERT(cgraph->grad_accs);
@@ -5856,21 +5855,24 @@ void ggml_build_backward_expand(
        GGML_ASSERT(!node->view_src || node->op == GGML_OP_CPY || node->op == GGML_OP_VIEW ||
            node->op == GGML_OP_RESHAPE || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_TRANSPOSE);

-        const size_t igrad = ggml_hash_find(&cgraph->visited_hash_set, node);
-        GGML_ASSERT(igrad != GGML_HASHSET_FULL);
-        GGML_ASSERT(ggml_bitset_get(cgraph->visited_hash_set.used, igrad));
-        if ((accumulate && (node->flags & GGML_TENSOR_FLAG_PARAM)) || (node->flags & GGML_TENSOR_FLAG_LOSS)) {
-            cgraph->grad_accs[igrad] = ggml_dup_tensor(ctx_static, node);
-            cgraph->grads[igrad]     = cgraph->grad_accs[igrad];
-            ggml_format_name(cgraph->grad_accs[igrad], "grad acc for %s", node->name);
+        const size_t ihash = ggml_hash_find(&cgraph->visited_hash_set, node);
+        GGML_ASSERT(ihash != GGML_HASHSET_FULL);
+        GGML_ASSERT(ggml_bitset_get(cgraph->visited_hash_set.used, ihash));
+        if (grad_accs && grad_accs[i]) {
+            cgraph->grad_accs[ihash] = grad_accs[i];
+            cgraph->grads[ihash]     = cgraph->grad_accs[ihash];
+        } else if (node->flags & GGML_TENSOR_FLAG_LOSS) {
+            // loss tensors always need a gradient accumulator
+            cgraph->grad_accs[ihash] = ggml_new_tensor(ctx, GGML_TYPE_F32, GGML_MAX_DIMS, node->ne);
+            cgraph->grads[ihash]     = cgraph->grad_accs[ihash];
        }
-        grads_needed[igrad] = true;
+        grads_needed[ihash] = true;
    }

    for (int i = n_nodes_f - 1; i >= 0; --i) {
        // inplace operations to add gradients are not created by ggml_compute_backward except for gradient accumulation
        // use allocator to automatically make inplace operations
-        ggml_compute_backward(ctx_compute, cgraph, i, grads_needed);
+        ggml_compute_backward(ctx, cgraph, i, grads_needed);
    }

    free(grads_needed);
@@ -6016,8 +6018,8 @@ void ggml_graph_cpy(struct ggml_cgraph * src, struct ggml_cgraph * dst) {
    }
 }

-struct ggml_cgraph * ggml_graph_dup(struct ggml_context * ctx, struct ggml_cgraph * cgraph) {
-    struct ggml_cgraph * result = ggml_new_graph_custom(ctx, cgraph->size, cgraph->grads != NULL);
+struct ggml_cgraph * ggml_graph_dup(struct ggml_context * ctx, struct ggml_cgraph * cgraph, bool force_grads) {
+    struct ggml_cgraph * result = ggml_new_graph_custom(ctx, cgraph->size, cgraph->grads || force_grads);
    ggml_graph_cpy(cgraph, result);
    return result;
 }
@@ -6036,6 +6038,9 @@ struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor) {
 }

 void ggml_graph_reset(struct ggml_cgraph * cgraph) {
+    if (!cgraph) {
+        return;
+    }
    GGML_ASSERT(cgraph->grads != NULL);

    for (int i = 0; i < cgraph->n_nodes; i++) {
@@ -6345,8 +6350,8 @@ void ggml_set_output(struct ggml_tensor * tensor) {
    tensor->flags |= GGML_TENSOR_FLAG_OUTPUT;
 }

-void ggml_set_param(struct ggml_context * ctx, struct ggml_tensor * tensor) {
-    GGML_UNUSED(ctx); // TODO: remove this parameter
+void ggml_set_param(struct ggml_tensor * tensor) {
+    GGML_ASSERT(tensor->op == GGML_OP_NONE);
    tensor->flags |= GGML_TENSOR_FLAG_PARAM;
 }

--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -483,7 +483,9 @@ class MODEL_TENSOR(IntEnum):
    V_ENC_EMBD_PATCH     = auto()
    V_ENC_EMBD_POS       = auto()
    V_ENC_ATTN_Q         = auto()
+    V_ENC_ATTN_Q_NORM    = auto()
    V_ENC_ATTN_K         = auto()
+    V_ENC_ATTN_K_NORM    = auto()
    V_ENC_ATTN_V         = auto()
    V_ENC_INPUT_NORM     = auto()
    V_ENC_OUTPUT         = auto()
@@ -491,6 +493,8 @@ class MODEL_TENSOR(IntEnum):
    V_ENC_FFN_UP         = auto()
    V_ENC_FFN_GATE       = auto()
    V_ENC_FFN_DOWN       = auto()
+    V_LAYER_SCALE_1      = auto()
+    V_LAYER_SCALE_2      = auto()
    V_PRE_NORM           = auto()
    V_POST_NORM          = auto()
    V_MM_INP_NORM        = auto()
@@ -740,7 +744,9 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
    MODEL_TENSOR.V_ENC_EMBD_PATCH:          "v.patch_embd",
    MODEL_TENSOR.V_ENC_EMBD_POS:            "v.position_embd",
    MODEL_TENSOR.V_ENC_ATTN_Q:              "v.blk.{bid}.attn_q",
+    MODEL_TENSOR.V_ENC_ATTN_Q_NORM:         "v.blk.{bid}.attn_q_norm",
    MODEL_TENSOR.V_ENC_ATTN_K:              "v.blk.{bid}.attn_k",
+    MODEL_TENSOR.V_ENC_ATTN_K_NORM:         "v.blk.{bid}.attn_k_norm",
    MODEL_TENSOR.V_ENC_ATTN_V:              "v.blk.{bid}.attn_v",
    MODEL_TENSOR.V_ENC_INPUT_NORM:          "v.blk.{bid}.ln1",
    MODEL_TENSOR.V_ENC_OUTPUT:              "v.blk.{bid}.attn_out",
@@ -748,6 +754,8 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
    MODEL_TENSOR.V_ENC_FFN_UP:              "v.blk.{bid}.ffn_up",
    MODEL_TENSOR.V_ENC_FFN_GATE:            "v.blk.{bid}.ffn_gate",
    MODEL_TENSOR.V_ENC_FFN_DOWN:            "v.blk.{bid}.ffn_down",
+    MODEL_TENSOR.V_LAYER_SCALE_1:           "v.blk.{bid}.ls1",
+    MODEL_TENSOR.V_LAYER_SCALE_2:           "v.blk.{bid}.ls2",
    MODEL_TENSOR.V_PRE_NORM:                "v.pre_ln",
    MODEL_TENSOR.V_POST_NORM:               "v.post_ln",
    MODEL_TENSOR.V_MM_INP_PROJ:             "mm.input_projection",
@@ -778,7 +786,9 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
        MODEL_TENSOR.V_ENC_EMBD_PATCH,
        MODEL_TENSOR.V_ENC_EMBD_POS,
        MODEL_TENSOR.V_ENC_ATTN_Q,
+        MODEL_TENSOR.V_ENC_ATTN_Q_NORM,
        MODEL_TENSOR.V_ENC_ATTN_K,
+        MODEL_TENSOR.V_ENC_ATTN_K_NORM,
        MODEL_TENSOR.V_ENC_ATTN_V,
        MODEL_TENSOR.V_ENC_INPUT_NORM,
        MODEL_TENSOR.V_ENC_OUTPUT,
@@ -786,6 +796,8 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
        MODEL_TENSOR.V_ENC_FFN_UP,
        MODEL_TENSOR.V_ENC_FFN_GATE,
        MODEL_TENSOR.V_ENC_FFN_DOWN,
+        MODEL_TENSOR.V_LAYER_SCALE_1,
+        MODEL_TENSOR.V_LAYER_SCALE_2,
        MODEL_TENSOR.V_PRE_NORM,
        MODEL_TENSOR.V_POST_NORM,
        MODEL_TENSOR.V_MM_INP_PROJ,
@@ -1893,6 +1905,9 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
        MODEL_TENSOR.FFN_GATE_EXP,
        MODEL_TENSOR.FFN_DOWN_EXP,
        MODEL_TENSOR.FFN_UP_EXP,
+        MODEL_TENSOR.FFN_GATE_SHEXP,
+        MODEL_TENSOR.FFN_UP_SHEXP,
+        MODEL_TENSOR.FFN_DOWN_SHEXP,
    ],
    MODEL_ARCH.CHAMELEON: [
        MODEL_TENSOR.TOKEN_EMBD,
@@ -2167,6 +2182,7 @@ class VisionProjectorType:
    PIXTRAL = "pixtral"
    QWEN2VL = "qwen2vl_merger"
    QWEN25VL = "qwen2.5vl_merger"
+    INTERNVL = "internvl"


 # Items here are (block size, type size)
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -428,6 +428,7 @@ class TensorNameMap:
            "model.layers.{bid}.mlp.shared_expert.down_proj",  # qwen2moe
            "model.layers.{bid}.mlp.shared_experts.down_proj", # deepseek deepseek2
            "language_model.model.layers.{bid}.feed_forward.shared_expert.down_proj", # llama4
+            "model.layers.{bid}.shared_mlp.output_linear",     # granitemoe
        ),

        MODEL_TENSOR.ATTN_Q_NORM: (
@@ -905,6 +906,7 @@ class TensorNameMap:

        MODEL_TENSOR.V_MMPROJ_MLP: (
            "model.mm_projector.mlp.mlp.{bid}",
+            "mlp1.{bid}", # InternVL
        ),

        MODEL_TENSOR.V_MMPROJ_PEG: (
@@ -937,6 +939,10 @@ class TensorNameMap:
            "visual.blocks.{bid}.attn.q", # qwen2vl, generated
        ),

+        MODEL_TENSOR.V_ENC_ATTN_Q_NORM: (
+            "vision_tower.vision_model.encoder.layers.{bid}.attn.q_norm", # InternVL
+        ),
+
        MODEL_TENSOR.V_ENC_ATTN_K: (
            "vision_tower.vision_model.encoder.layers.{bid}.self_attn.k_proj",
            "vpm.encoder.layers.{bid}.self_attn.k_proj",
@@ -945,6 +951,10 @@ class TensorNameMap:
            "visual.blocks.{bid}.attn.k", # qwen2vl, generated
        ),

+        MODEL_TENSOR.V_ENC_ATTN_K_NORM: (
+            "vision_tower.vision_model.encoder.layers.{bid}.attn.k_norm", # InternVL
+        ),
+
        MODEL_TENSOR.V_ENC_ATTN_V: (
            "vision_tower.vision_model.encoder.layers.{bid}.self_attn.v_proj",
            "vpm.encoder.layers.{bid}.self_attn.v_proj",
@@ -955,6 +965,7 @@ class TensorNameMap:

        MODEL_TENSOR.V_ENC_INPUT_NORM: (
            "vision_tower.vision_model.encoder.layers.{bid}.layer_norm1",
+            "vision_tower.vision_model.encoder.layers.{bid}.norm1", # InternVL
            "vpm.encoder.layers.{bid}.layer_norm1",
            "model.vision_model.encoder.layers.{bid}.layer_norm1", # SmolVLM
            "vision_tower.transformer.layers.{bid}.attention_norm", # pixtral
@@ -963,6 +974,7 @@ class TensorNameMap:

        MODEL_TENSOR.V_ENC_OUTPUT: (
            "vision_tower.vision_model.encoder.layers.{bid}.self_attn.out_proj",
+            "vision_tower.vision_model.encoder.layers.{bid}.attn.proj", # InternVL
            "vpm.encoder.layers.{bid}.self_attn.out_proj",
            "model.vision_model.encoder.layers.{bid}.self_attn.out_proj", # SmolVLM
            "vision_tower.transformer.layers.{bid}.attention.o_proj", # pixtral
@@ -971,6 +983,7 @@ class TensorNameMap:

        MODEL_TENSOR.V_ENC_OUTPUT_NORM: (
            "vision_tower.vision_model.encoder.layers.{bid}.layer_norm2",
+            "vision_tower.vision_model.encoder.layers.{bid}.norm2", # InternVL
            "vpm.encoder.layers.{bid}.layer_norm2",
            "model.vision_model.encoder.layers.{bid}.layer_norm2", # SmolVLM
            "vision_tower.transformer.layers.{bid}.ffn_norm", # pixtral
@@ -1000,6 +1013,14 @@ class TensorNameMap:
            "visual.blocks.{bid}.mlp.down_proj", # qwen2.5vl
        ),

+        MODEL_TENSOR.V_LAYER_SCALE_1: (
+            "vision_tower.vision_model.encoder.layers.{bid}.ls1", # InternVL
+        ),
+
+        MODEL_TENSOR.V_LAYER_SCALE_2: (
+            "vision_tower.vision_model.encoder.layers.{bid}.ls2", # InternVL
+        ),
+
        MODEL_TENSOR.V_PRE_NORM: (
            "vision_tower.vision_model.pre_layrnorm",
            "vision_tower.ln_pre", # pixtral
--- a/include/llama.h
+++ b/include/llama.h
@@ -4,6 +4,7 @@
 #include "ggml.h"
 #include "ggml-cpu.h"
 #include "ggml-backend.h"
+#include "ggml-opt.h"

 #include <stddef.h>
 #include <stdint.h>
@@ -112,6 +113,7 @@ extern "C" {
        LLAMA_VOCAB_PRE_TYPE_BAILINGMOE     = 32,
        LLAMA_VOCAB_PRE_TYPE_LLAMA4         = 33,
        LLAMA_VOCAB_PRE_TYPE_PIXTRAL        = 34,
+        LLAMA_VOCAB_PRE_TYPE_SEED_CODER     = 35,
    };

    enum llama_rope_type {
@@ -343,7 +345,7 @@ extern "C" {
        float    yarn_beta_fast;   // YaRN low correction dim
        float    yarn_beta_slow;   // YaRN high correction dim
        uint32_t yarn_orig_ctx;    // YaRN original context size
-        float    defrag_thold;     // defragment the KV cache if holes/size > thold, < 0 disabled (default)
+        float    defrag_thold;     // defragment the KV cache if holes/size > thold, <= 0 disabled (default)

        ggml_backend_sched_eval_callback cb_eval;
        void * cb_eval_user_data;
@@ -362,6 +364,7 @@ extern "C" {
        bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
        bool flash_attn;  // whether to use flash attention [EXPERIMENTAL]
        bool no_perf;     // whether to measure performance timings
+        bool op_offload;  // whether to offload host tensor operations to device
    };

    // model quantization parameters
@@ -443,6 +446,10 @@ extern "C" {
                                 size_t    n_paths,
              struct llama_model_params    params);

+    LLAMA_API void llama_model_save_to_file(
+            const struct llama_model * model,
+                        const char * path_model);
+
    DEPRECATED(LLAMA_API void llama_free_model(struct llama_model * model),
            "use llama_model_free instead");

@@ -1431,6 +1438,37 @@ extern "C" {
    LLAMA_API void                           llama_perf_sampler_print(const struct llama_sampler * chain);
    LLAMA_API void                           llama_perf_sampler_reset(      struct llama_sampler * chain);

+    //
+    // training
+    //
+
+    // function that returns whether or not a given tensor contains trainable parameters
+    typedef bool (*llama_opt_param_filter)(const struct ggml_tensor * tensor, void * userdata);
+
+    // always returns true
+    LLAMA_API bool llama_opt_param_filter_all(const struct ggml_tensor * tensor, void * userdata);
+
+    struct llama_opt_params {
+        uint32_t n_ctx_train; // assumed context size post training, use context size specified in llama_context if 0
+
+        llama_opt_param_filter param_filter; // callback for determining which tensors contain trainable parameters
+        void * param_filter_ud;              // userdata for determining which tensors contain trainable parameters
+
+        ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters
+        void * get_opt_pars_ud;                     // userdata for calculating optimizer parameters
+    };
+
+    LLAMA_API void llama_opt_init(struct llama_context * lctx, struct llama_model * model, struct llama_opt_params lopt_params);
+
+    LLAMA_API void llama_opt_epoch(
+            struct llama_context    * lctx,
+            ggml_opt_dataset_t        dataset,
+            ggml_opt_result_t         result_train,
+            ggml_opt_result_t         result_eval,
+            int64_t                   idata_split,
+            ggml_opt_epoch_callback   callback_train,
+            ggml_opt_epoch_callback   callback_eval);
+
 #ifdef __cplusplus
 }
 #endif
--- a/scripts/compare-llama-bench.py
+++ b/scripts/compare-llama-bench.py
@@ -7,6 +7,10 @@ import sys
 import os
 from glob import glob
 import sqlite3
+import json
+import csv
+from typing import Optional, Union
+from collections.abc import Iterator, Sequence

 try:
    import git
@@ -17,6 +21,28 @@ except ImportError as e:

 logger = logging.getLogger("compare-llama-bench")

+# All llama-bench SQL fields
+DB_FIELDS = [
+    "build_commit", "build_number", "cpu_info",       "gpu_info",   "backends",     "model_filename",
+    "model_type",   "model_size",   "model_n_params", "n_batch",    "n_ubatch",     "n_threads",
+    "cpu_mask",     "cpu_strict",   "poll",           "type_k",     "type_v",       "n_gpu_layers",
+    "split_mode",   "main_gpu",     "no_kv_offload",  "flash_attn", "tensor_split", "tensor_buft_overrides",
+    "defrag_thold",
+    "use_mmap",     "embeddings",   "no_op_offload",  "n_prompt",   "n_gen",        "n_depth",
+    "test_time",    "avg_ns",       "stddev_ns",      "avg_ts",     "stddev_ts",
+]
+
+DB_TYPES = [
+    "TEXT",    "INTEGER", "TEXT",    "TEXT",    "TEXT",    "TEXT",
+    "TEXT",    "INTEGER", "INTEGER", "INTEGER", "INTEGER", "INTEGER",
+    "TEXT",    "INTEGER", "INTEGER", "TEXT",    "TEXT",    "INTEGER",
+    "TEXT",    "INTEGER", "INTEGER", "INTEGER", "TEXT",    "TEXT",
+    "REAL",
+    "INTEGER", "INTEGER", "INTEGER", "INTEGER", "INTEGER", "INTEGER",
+    "TEXT",    "INTEGER", "INTEGER", "REAL",    "REAL",
+]
+assert len(DB_FIELDS) == len(DB_TYPES)
+
 # Properties by which to differentiate results per commit:
 KEY_PROPERTIES = [
    "cpu_info", "gpu_info", "backends", "n_gpu_layers", "tensor_buft_overrides", "model_filename", "model_type",
@@ -42,7 +68,7 @@ DEFAULT_HIDE = ["model_filename"]  # Always hide these properties by default.
 GPU_NAME_STRIP = ["NVIDIA GeForce ", "Tesla ", "AMD Radeon "]  # Strip prefixes for smaller tables.
 MODEL_SUFFIX_REPLACE = {" - Small": "_S", " - Medium": "_M", " - Large": "_L"}

-DESCRIPTION = """Creates tables from llama-bench data written to an SQLite database. Example usage (Linux):
+DESCRIPTION = """Creates tables from llama-bench data written to multiple JSON/CSV files, a single JSONL file or SQLite database. Example usage (Linux):

 $ git checkout master
 $ make clean && make llama-bench
@@ -70,12 +96,13 @@ help_c = (
 )
 parser.add_argument("-c", "--compare", help=help_c)
 help_i = (
-    "Input SQLite file for comparing commits. "
+    "JSON/JSONL/SQLite/CSV files for comparing commits. "
+    "Specify multiple times to use multiple input files (JSON/CSV only). "
    "Defaults to 'llama-bench.sqlite' in the current working directory. "
    "If no such file is found and there is exactly one .sqlite file in the current directory, "
    "that file is instead used as input."
 )
-parser.add_argument("-i", "--input", help=help_i)
+parser.add_argument("-i", "--input", action="append", help=help_i)
 help_o = (
    "Output format for the table. "
    "Defaults to 'pipe' (GitHub compatible). "
@@ -110,119 +137,321 @@ if unknown_args:
    sys.exit(1)

 input_file = known_args.input
-if input_file is None and os.path.exists("./llama-bench.sqlite"):
-    input_file = "llama-bench.sqlite"
-if input_file is None:
+if not input_file and os.path.exists("./llama-bench.sqlite"):
+    input_file = ["llama-bench.sqlite"]
+if not input_file:
    sqlite_files = glob("*.sqlite")
    if len(sqlite_files) == 1:
-        input_file = sqlite_files[0]
+        input_file = sqlite_files

-if input_file is None:
+if not input_file:
    logger.error("Cannot find a suitable input file, please provide one.\n")
    parser.print_help()
    sys.exit(1)

-connection = sqlite3.connect(input_file)
-cursor = connection.cursor()

-build_len_min: int = cursor.execute("SELECT MIN(LENGTH(build_commit)) from test;").fetchone()[0]
-build_len_max: int = cursor.execute("SELECT MAX(LENGTH(build_commit)) from test;").fetchone()[0]
+class LlamaBenchData:
+    repo: Optional[git.Repo]
+    build_len_min: int
+    build_len_max: int
+    build_len: int = 8
+    builds: list[str] = []
+    check_keys = set(KEY_PROPERTIES + ["build_commit", "test_time", "avg_ts"])

-if build_len_min != build_len_max:
-    logger.warning(f"{input_file} contains commit hashes of differing lengths. It's possible that the wrong commits will be compared. "
-                   "Try purging the the database of old commits.")
-    cursor.execute(f"UPDATE test SET build_commit = SUBSTRING(build_commit, 1, {build_len_min});")
+    def __init__(self):
+        try:
+            self.repo = git.Repo(".", search_parent_directories=True)
+        except git.InvalidGitRepositoryError:
+            self.repo = None

-build_len: int = build_len_min
+    def _builds_init(self):
+        self.build_len = self.build_len_min

-builds = cursor.execute("SELECT DISTINCT build_commit FROM test;").fetchall()
-builds = list(map(lambda b: b[0], builds))  # list[tuple[str]] -> list[str]
-
-if not builds:
-    raise RuntimeError(f"{input_file} does not contain any builds.")
-
-try:
-    repo = git.Repo(".", search_parent_directories=True)
-except git.InvalidGitRepositoryError:
-    repo = None
-
-
-def find_parent_in_data(commit: git.Commit):
-    """Helper function to find the most recent parent measured in number of commits for which there is data."""
-    heap: list[tuple[int, git.Commit]] = [(0, commit)]
-    seen_hexsha8 = set()
-    while heap:
-        depth, current_commit = heapq.heappop(heap)
-        current_hexsha8 = commit.hexsha[:build_len]
-        if current_hexsha8 in builds:
-            return current_hexsha8
-        for parent in commit.parents:
-            parent_hexsha8 = parent.hexsha[:build_len]
-            if parent_hexsha8 not in seen_hexsha8:
-                seen_hexsha8.add(parent_hexsha8)
-                heapq.heappush(heap, (depth + 1, parent))
-    return None
-
-
-def get_all_parent_hexsha8s(commit: git.Commit):
-    """Helper function to recursively get hexsha8 values for all parents of a commit."""
-    unvisited = [commit]
-    visited   = []
-
-    while unvisited:
-        current_commit = unvisited.pop(0)
-        visited.append(current_commit.hexsha[:build_len])
-        for parent in current_commit.parents:
-            if parent.hexsha[:build_len] not in visited:
-                unvisited.append(parent)
-
-    return visited
-
-
-def get_commit_name(hexsha8: str):
-    """Helper function to find a human-readable name for a commit if possible."""
-    if repo is None:
-        return hexsha8
-    for h in repo.heads:
-        if h.commit.hexsha[:build_len] == hexsha8:
-            return h.name
-    for t in repo.tags:
-        if t.commit.hexsha[:build_len] == hexsha8:
-            return t.name
-    return hexsha8
-
-
-def get_commit_hexsha8(name: str):
-    """Helper function to search for a commit given a human-readable name."""
-    if repo is None:
+    def _check_keys(self, keys: set) -> Optional[set]:
+        """Private helper method that checks against required data keys and returns missing ones."""
+        if not keys >= self.check_keys:
+            return self.check_keys - keys
        return None
-    for h in repo.heads:
-        if h.name == name:
-            return h.commit.hexsha[:build_len]
-    for t in repo.tags:
-        if t.name == name:
-            return t.commit.hexsha[:build_len]
-    for c in repo.iter_commits("--all"):
-        if c.hexsha[:build_len] == name[:build_len]:
-            return c.hexsha[:build_len]
-    return None
+
+    def find_parent_in_data(self, commit: git.Commit) -> Optional[str]:
+        """Helper method to find the most recent parent measured in number of commits for which there is data."""
+        heap: list[tuple[int, git.Commit]] = [(0, commit)]
+        seen_hexsha8 = set()
+        while heap:
+            depth, current_commit = heapq.heappop(heap)
+            current_hexsha8 = commit.hexsha[:self.build_len]
+            if current_hexsha8 in self.builds:
+                return current_hexsha8
+            for parent in commit.parents:
+                parent_hexsha8 = parent.hexsha[:self.build_len]
+                if parent_hexsha8 not in seen_hexsha8:
+                    seen_hexsha8.add(parent_hexsha8)
+                    heapq.heappush(heap, (depth + 1, parent))
+        return None
+
+    def get_all_parent_hexsha8s(self, commit: git.Commit) -> Sequence[str]:
+        """Helper method to recursively get hexsha8 values for all parents of a commit."""
+        unvisited = [commit]
+        visited   = []
+
+        while unvisited:
+            current_commit = unvisited.pop(0)
+            visited.append(current_commit.hexsha[:self.build_len])
+            for parent in current_commit.parents:
+                if parent.hexsha[:self.build_len] not in visited:
+                    unvisited.append(parent)
+
+        return visited
+
+    def get_commit_name(self, hexsha8: str) -> str:
+        """Helper method to find a human-readable name for a commit if possible."""
+        if self.repo is None:
+            return hexsha8
+        for h in self.repo.heads:
+            if h.commit.hexsha[:self.build_len] == hexsha8:
+                return h.name
+        for t in self.repo.tags:
+            if t.commit.hexsha[:self.build_len] == hexsha8:
+                return t.name
+        return hexsha8
+
+    def get_commit_hexsha8(self, name: str) -> Optional[str]:
+        """Helper method to search for a commit given a human-readable name."""
+        if self.repo is None:
+            return None
+        for h in self.repo.heads:
+            if h.name == name:
+                return h.commit.hexsha[:self.build_len]
+        for t in self.repo.tags:
+            if t.name == name:
+                return t.commit.hexsha[:self.build_len]
+        for c in self.repo.iter_commits("--all"):
+            if c.hexsha[:self.build_len] == name[:self.build_len]:
+                return c.hexsha[:self.build_len]
+        return None
+
+    def builds_timestamp(self, reverse: bool = False) -> Union[Iterator[tuple], Sequence[tuple]]:
+        """Helper method that gets rows of (build_commit, test_time) sorted by the latter."""
+        return []
+
+    def get_rows(self, properties: list[str], hexsha8_baseline: str, hexsha8_compare: str) -> Sequence[tuple]:
+        """
+        Helper method that gets table rows for some list of properties.
+        Rows are created by combining those where all provided properties are equal.
+        The resulting rows are then grouped by the provided properties and the t/s values are averaged.
+        The returned rows are unique in terms of property combinations.
+        """
+        return []
+
+
+class LlamaBenchDataSQLite3(LlamaBenchData):
+    connection: sqlite3.Connection
+    cursor: sqlite3.Cursor
+
+    def __init__(self):
+        super().__init__()
+        self.connection = sqlite3.connect(":memory:")
+        self.cursor = self.connection.cursor()
+        self.cursor.execute(f"CREATE TABLE test({', '.join(' '.join(x) for x in zip(DB_FIELDS, DB_TYPES))});")
+
+    def _builds_init(self):
+        if self.connection:
+            self.build_len_min = self.cursor.execute("SELECT MIN(LENGTH(build_commit)) from test;").fetchone()[0]
+            self.build_len_max = self.cursor.execute("SELECT MAX(LENGTH(build_commit)) from test;").fetchone()[0]
+
+            if self.build_len_min != self.build_len_max:
+                logger.warning("Data contains commit hashes of differing lengths. It's possible that the wrong commits will be compared. "
+                               "Try purging the the database of old commits.")
+                self.cursor.execute(f"UPDATE test SET build_commit = SUBSTRING(build_commit, 1, {self.build_len_min});")
+
+            builds = self.cursor.execute("SELECT DISTINCT build_commit FROM test;").fetchall()
+            self.builds = list(map(lambda b: b[0], builds))  # list[tuple[str]] -> list[str]
+        super()._builds_init()
+
+    def builds_timestamp(self, reverse: bool = False) -> Union[Iterator[tuple], Sequence[tuple]]:
+        data = self.cursor.execute(
+            "SELECT build_commit, test_time FROM test ORDER BY test_time;").fetchall()
+        return reversed(data) if reverse else data
+
+    def get_rows(self, properties: list[str], hexsha8_baseline: str, hexsha8_compare: str) -> Sequence[tuple]:
+        select_string = ", ".join(
+            [f"tb.{p}" for p in properties] + ["tb.n_prompt", "tb.n_gen", "tb.n_depth", "AVG(tb.avg_ts)", "AVG(tc.avg_ts)"])
+        equal_string = " AND ".join(
+            [f"tb.{p} = tc.{p}" for p in KEY_PROPERTIES] + [
+                f"tb.build_commit = '{hexsha8_baseline}'", f"tc.build_commit = '{hexsha8_compare}'"]
+        )
+        group_order_string = ", ".join([f"tb.{p}" for p in properties] + ["tb.n_gen", "tb.n_prompt", "tb.n_depth"])
+        query = (f"SELECT {select_string} FROM test tb JOIN test tc ON {equal_string} "
+                 f"GROUP BY {group_order_string} ORDER BY {group_order_string};")
+        return self.cursor.execute(query).fetchall()
+
+
+class LlamaBenchDataSQLite3File(LlamaBenchDataSQLite3):
+    def __init__(self, data_file: str):
+        super().__init__()
+
+        self.connection.close()
+        self.connection = sqlite3.connect(data_file)
+        self.cursor = self.connection.cursor()
+        self._builds_init()
+
+    @staticmethod
+    def valid_format(data_file: str) -> bool:
+        connection = sqlite3.connect(data_file)
+        cursor = connection.cursor()
+
+        try:
+            if cursor.execute("PRAGMA schema_version;").fetchone()[0] == 0:
+                raise sqlite3.DatabaseError("The provided input file does not exist or is empty.")
+        except sqlite3.DatabaseError as e:
+            logger.debug(f'"{data_file}" is not a valid SQLite3 file.', exc_info=e)
+            cursor = None
+
+        connection.close()
+        return True if cursor else False
+
+
+class LlamaBenchDataJSONL(LlamaBenchDataSQLite3):
+    def __init__(self, data_file: str):
+        super().__init__()
+
+        with open(data_file, "r", encoding="utf-8") as fp:
+            for i, line in enumerate(fp):
+                parsed = json.loads(line)
+
+                for k in parsed.keys() - set(DB_FIELDS):
+                    del parsed[k]
+
+                if (missing_keys := self._check_keys(parsed.keys())):
+                    raise RuntimeError(f"Missing required data key(s) at line {i + 1}: {', '.join(missing_keys)}")
+
+                self.cursor.execute(f"INSERT INTO test({', '.join(parsed.keys())}) VALUES({', '.join('?' * len(parsed))});", tuple(parsed.values()))
+
+        self._builds_init()
+
+    @staticmethod
+    def valid_format(data_file: str) -> bool:
+        try:
+            with open(data_file, "r", encoding="utf-8") as fp:
+                for line in fp:
+                    json.loads(line)
+                    break
+        except Exception as e:
+            logger.debug(f'"{data_file}" is not a valid JSONL file.', exc_info=e)
+            return False
+
+        return True
+
+
+class LlamaBenchDataJSON(LlamaBenchDataSQLite3):
+    def __init__(self, data_files: list[str]):
+        super().__init__()
+
+        for data_file in data_files:
+            with open(data_file, "r", encoding="utf-8") as fp:
+                parsed = json.load(fp)
+
+                for i, entry in enumerate(parsed):
+                    for k in entry.keys() - set(DB_FIELDS):
+                        del entry[k]
+
+                    if (missing_keys := self._check_keys(entry.keys())):
+                        raise RuntimeError(f"Missing required data key(s) at entry {i + 1}: {', '.join(missing_keys)}")
+
+                    self.cursor.execute(f"INSERT INTO test({', '.join(entry.keys())}) VALUES({', '.join('?' * len(entry))});", tuple(entry.values()))
+
+        self._builds_init()
+
+    @staticmethod
+    def valid_format(data_files: list[str]) -> bool:
+        if not data_files:
+            return False
+
+        for data_file in data_files:
+            try:
+                with open(data_file, "r", encoding="utf-8") as fp:
+                    json.load(fp)
+            except Exception as e:
+                logger.debug(f'"{data_file}" is not a valid JSON file.', exc_info=e)
+                return False
+
+        return True
+
+
+class LlamaBenchDataCSV(LlamaBenchDataSQLite3):
+    def __init__(self, data_files: list[str]):
+        super().__init__()
+
+        for data_file in data_files:
+            with open(data_file, "r", encoding="utf-8") as fp:
+                for i, parsed in enumerate(csv.DictReader(fp)):
+                    keys = set(parsed.keys())
+
+                    for k in keys - set(DB_FIELDS):
+                        del parsed[k]
+
+                    if (missing_keys := self._check_keys(keys)):
+                        raise RuntimeError(f"Missing required data key(s) at line {i + 1}: {', '.join(missing_keys)}")
+
+                    self.cursor.execute(f"INSERT INTO test({', '.join(parsed.keys())}) VALUES({', '.join('?' * len(parsed))});", tuple(parsed.values()))
+
+        self._builds_init()
+
+    @staticmethod
+    def valid_format(data_files: list[str]) -> bool:
+        if not data_files:
+            return False
+
+        for data_file in data_files:
+            try:
+                with open(data_file, "r", encoding="utf-8") as fp:
+                    for parsed in csv.DictReader(fp):
+                        break
+            except Exception as e:
+                logger.debug(f'"{data_file}" is not a valid CSV file.', exc_info=e)
+                return False
+
+        return True
+
+
+bench_data = None
+if len(input_file) == 1:
+    if LlamaBenchDataSQLite3File.valid_format(input_file[0]):
+        bench_data = LlamaBenchDataSQLite3File(input_file[0])
+    elif LlamaBenchDataJSON.valid_format(input_file):
+        bench_data = LlamaBenchDataJSON(input_file)
+    elif LlamaBenchDataJSONL.valid_format(input_file[0]):
+        bench_data = LlamaBenchDataJSONL(input_file[0])
+    elif LlamaBenchDataCSV.valid_format(input_file):
+        bench_data = LlamaBenchDataCSV(input_file)
+else:
+    if LlamaBenchDataJSON.valid_format(input_file):
+        bench_data = LlamaBenchDataJSON(input_file)
+    elif LlamaBenchDataCSV.valid_format(input_file):
+        bench_data = LlamaBenchDataCSV(input_file)
+
+if not bench_data:
+    raise RuntimeError("No valid (or some invalid) input files found.")
+
+if not bench_data.builds:
+    raise RuntimeError(f"{input_file} does not contain any builds.")


 hexsha8_baseline = name_baseline = None

 # If the user specified a baseline, try to find a commit for it:
 if known_args.baseline is not None:
-    if known_args.baseline in builds:
+    if known_args.baseline in bench_data.builds:
        hexsha8_baseline = known_args.baseline
    if hexsha8_baseline is None:
-        hexsha8_baseline = get_commit_hexsha8(known_args.baseline)
+        hexsha8_baseline = bench_data.get_commit_hexsha8(known_args.baseline)
        name_baseline = known_args.baseline
    if hexsha8_baseline is None:
        logger.error(f"cannot find data for baseline={known_args.baseline}.")
        sys.exit(1)
 # Otherwise, search for the most recent parent of master for which there is data:
-elif repo is not None:
-    hexsha8_baseline = find_parent_in_data(repo.heads.master.commit)
+elif bench_data.repo is not None:
+    hexsha8_baseline = bench_data.find_parent_in_data(bench_data.repo.heads.master.commit)

    if hexsha8_baseline is None:
        logger.error("No baseline was provided and did not find data for any master branch commits.\n")
@@ -235,27 +464,25 @@ else:
    sys.exit(1)


-name_baseline = get_commit_name(hexsha8_baseline)
+name_baseline = bench_data.get_commit_name(hexsha8_baseline)

 hexsha8_compare = name_compare = None

 # If the user has specified a compare value, try to find a corresponding commit:
 if known_args.compare is not None:
-    if known_args.compare in builds:
+    if known_args.compare in bench_data.builds:
        hexsha8_compare = known_args.compare
    if hexsha8_compare is None:
-        hexsha8_compare = get_commit_hexsha8(known_args.compare)
+        hexsha8_compare = bench_data.get_commit_hexsha8(known_args.compare)
        name_compare = known_args.compare
    if hexsha8_compare is None:
        logger.error(f"cannot find data for compare={known_args.compare}.")
        sys.exit(1)
 # Otherwise, search for the commit for llama-bench was most recently run
 # and that is not a parent of master:
-elif repo is not None:
-    hexsha8s_master = get_all_parent_hexsha8s(repo.heads.master.commit)
-    builds_timestamp = cursor.execute(
-        "SELECT build_commit, test_time FROM test ORDER BY test_time;").fetchall()
-    for (hexsha8, _) in reversed(builds_timestamp):
+elif bench_data.repo is not None:
+    hexsha8s_master = bench_data.get_all_parent_hexsha8s(bench_data.repo.heads.master.commit)
+    for (hexsha8, _) in bench_data.builds_timestamp(reverse=True):
        if hexsha8 not in hexsha8s_master:
            hexsha8_compare = hexsha8
            break
@@ -270,26 +497,7 @@ else:
    parser.print_help()
    sys.exit(1)

-name_compare = get_commit_name(hexsha8_compare)
-
-
-def get_rows(properties):
-    """
-    Helper function that gets table rows for some list of properties.
-    Rows are created by combining those where all provided properties are equal.
-    The resulting rows are then grouped by the provided properties and the t/s values are averaged.
-    The returned rows are unique in terms of property combinations.
-    """
-    select_string = ", ".join(
-        [f"tb.{p}" for p in properties] + ["tb.n_prompt", "tb.n_gen", "tb.n_depth", "AVG(tb.avg_ts)", "AVG(tc.avg_ts)"])
-    equal_string = " AND ".join(
-        [f"tb.{p} = tc.{p}" for p in KEY_PROPERTIES] + [
-            f"tb.build_commit = '{hexsha8_baseline}'", f"tc.build_commit = '{hexsha8_compare}'"]
-    )
-    group_order_string = ", ".join([f"tb.{p}" for p in properties] + ["tb.n_gen", "tb.n_prompt", "tb.n_depth"])
-    query = (f"SELECT {select_string} FROM test tb JOIN test tc ON {equal_string} "
-             f"GROUP BY {group_order_string} ORDER BY {group_order_string};")
-    return cursor.execute(query).fetchall()
+name_compare = bench_data.get_commit_name(hexsha8_compare)


 # If the user provided columns to group the results by, use them:
@@ -303,10 +511,10 @@ if known_args.show is not None:
        logger.error(f"Unknown values for --show: {', '.join(unknown_cols)}")
        parser.print_usage()
        sys.exit(1)
-    rows_show = get_rows(show)
+    rows_show = bench_data.get_rows(show, hexsha8_baseline, hexsha8_compare)
 # Otherwise, select those columns where the values are not all the same:
 else:
-    rows_full = get_rows(KEY_PROPERTIES)
+    rows_full = bench_data.get_rows(KEY_PROPERTIES, hexsha8_baseline, hexsha8_compare)
    properties_different = []
    for i, kp_i in enumerate(KEY_PROPERTIES):
        if kp_i in DEFAULT_SHOW or kp_i in ["n_prompt", "n_gen", "n_depth"]:
@@ -318,7 +526,7 @@ else:

    show = []
    # Show CPU and/or GPU by default even if the hardware for all results is the same:
-    if "n_gpu_layers" not in properties_different:
+    if rows_full and "n_gpu_layers" not in properties_different:
        ngl = int(rows_full[0][KEY_PROPERTIES.index("n_gpu_layers")])

        if ngl != 99 and "cpu_info" not in properties_different:
@@ -336,7 +544,11 @@ else:
            show.remove(prop)
        except ValueError:
            pass
-    rows_show = get_rows(show)
+    rows_show = bench_data.get_rows(show, hexsha8_baseline, hexsha8_compare)
+
+if not rows_show:
+    logger.error(f"No comparable data was found between {name_baseline} and {name_compare}.\n")
+    sys.exit(1)

 table = []
 for row in rows_show:
--- a/scripts/sync-ggml.last
+++ b/scripts/sync-ggml.last
@@ -1 +1 @@
-b59bddafe278877dfa22a80e53a637513862babb
+9b048bb72b811f50b0c30d9e5c84d6ff9f4bf005
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -23,6 +23,7 @@ add_library(llama
            llama-memory.cpp
            llama-mmap.cpp
            llama-model-loader.cpp
+            llama-model-saver.cpp
            llama-model.cpp
            llama-quant.cpp
            llama-sampling.cpp
--- a/src/llama-adapter.cpp
+++ b/src/llama-adapter.cpp
@@ -253,6 +253,9 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
    std::vector<ggml_backend_buffer_type_t> buft_extra;
    {
        auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+        if (!cpu_dev) {
+            throw std::runtime_error(format("%s: no CPU backend found", __func__));
+        }
        auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);

        auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
@@ -291,6 +294,9 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
                LLAMA_LOG_WARN("%s: lora for '%s' cannot use buft '%s', fallback to CPU\n", __func__, model_tensor->name, ggml_backend_buft_name(buft));

                auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+                if (!cpu_dev) {
+                    throw std::runtime_error(format("%s: no CPU backend found", __func__));
+                }
                buft = ggml_backend_dev_buffer_type(cpu_dev);

                break;
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -1481,6 +1481,9 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
            { LLM_TENSOR_FFN_GATE_EXPS,   "blk.%d.ffn_gate_exps" },
            { LLM_TENSOR_FFN_DOWN_EXPS,   "blk.%d.ffn_down_exps" },
            { LLM_TENSOR_FFN_UP_EXPS,     "blk.%d.ffn_up_exps" },
+            { LLM_TENSOR_FFN_GATE_SHEXP,  "blk.%d.ffn_gate_shexp" },
+            { LLM_TENSOR_FFN_DOWN_SHEXP,  "blk.%d.ffn_down_shexp" },
+            { LLM_TENSOR_FFN_UP_SHEXP,    "blk.%d.ffn_up_shexp" },
        },
    },
    {
--- a/src/llama-chat.cpp
+++ b/src/llama-chat.cpp
@@ -35,6 +35,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
    { "mistral-v3",        LLM_CHAT_TEMPLATE_MISTRAL_V3        },
    { "mistral-v3-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN },
    { "mistral-v7",        LLM_CHAT_TEMPLATE_MISTRAL_V7        },
+    { "mistral-v7-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V7_TEKKEN },
    { "phi3",              LLM_CHAT_TEMPLATE_PHI_3             },
    { "phi4",              LLM_CHAT_TEMPLATE_PHI_4             },
    { "falcon3",           LLM_CHAT_TEMPLATE_FALCON_3          },
@@ -202,19 +203,20 @@ int32_t llm_chat_apply_template(
        if (add_ass) {
            ss << "<|im_start|>assistant\n";
        }
-    } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7) {
+    } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7 || tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7_TEKKEN) {
        // Official mistral 'v7' template
        // See: https://huggingface.co/mistralai/Mistral-Large-Instruct-2411#basic-instruct-template-v7
+        //      https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503#basic-instruct-template-v7-tekken
+        const char * trailing_space = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7 ? " " : "";
        for (auto message : chat) {
            std::string role(message->role);
            std::string content(message->content);
            if (role == "system") {
-                ss << "[SYSTEM_PROMPT] " << content << "[/SYSTEM_PROMPT]";
+                ss << "[SYSTEM_PROMPT]" << trailing_space << content << "[/SYSTEM_PROMPT]";
            } else if (role == "user") {
-                ss << "[INST] " << content << "[/INST]";
-            }
-            else {
-                ss << " " << content << "</s>";
+                ss << "[INST]" << trailing_space << content << "[/INST]";
+            } else {
+                ss << trailing_space << content << "</s>";
            }
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V1
--- a/src/llama-chat.h
+++ b/src/llama-chat.h
@@ -14,6 +14,7 @@ enum llm_chat_template {
    LLM_CHAT_TEMPLATE_MISTRAL_V3,
    LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN,
    LLM_CHAT_TEMPLATE_MISTRAL_V7,
+    LLM_CHAT_TEMPLATE_MISTRAL_V7_TEKKEN,
    LLM_CHAT_TEMPLATE_PHI_3,
    LLM_CHAT_TEMPLATE_PHI_4,
    LLM_CHAT_TEMPLATE_FALCON_3,
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -93,6 +93,7 @@ llama_context::llama_context(
    }

    cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
+    cparams.op_offload = params.op_offload;

    const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;

@@ -243,7 +244,7 @@ llama_context::llama_context(
            }
        }

-        sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, pipeline_parallel));
+        sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, pipeline_parallel, cparams.op_offload));

        if (pipeline_parallel) {
            LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(sched.get()));
@@ -358,7 +359,9 @@ llama_context::llama_context(
    }
 }

-llama_context::~llama_context() = default;
+llama_context::~llama_context() {
+    ggml_opt_free(opt_ctx);
+}

 void llama_context::synchronize() {
    ggml_backend_sched_synchronize(sched.get());
@@ -1787,10 +1790,13 @@ size_t llama_context::state_read_data(llama_io_read_i & io) {
        }
    }

-    LLAMA_LOG_DEBUG("%s: - reading KV self\n", __func__);
-    llama_kv_cache * kv_self = static_cast<llama_kv_cache *>(memory.get());
+    if (memory) {
+        LLAMA_LOG_DEBUG("%s: - reading KV self\n", __func__);

-    kv_self->state_read(io);
+        llama_kv_cache * kv_self = static_cast<llama_kv_cache *>(memory.get());
+
+        kv_self->state_read(io);
+    }

    return io.n_bytes();
 }
@@ -1798,9 +1804,11 @@ size_t llama_context::state_read_data(llama_io_read_i & io) {
 size_t llama_context::state_seq_write_data(llama_io_write_i & io, llama_seq_id seq_id) {
    GGML_UNUSED(seq_id);

-    llama_kv_cache * kv_self = static_cast<llama_kv_cache *>(memory.get());
+    if (memory) {
+        llama_kv_cache * kv_self = static_cast<llama_kv_cache *>(memory.get());

-    kv_self->state_write(io, seq_id);
+        kv_self->state_write(io, seq_id);
+    }

    return io.n_bytes();
 }
@@ -1808,9 +1816,11 @@ size_t llama_context::state_seq_write_data(llama_io_write_i & io, llama_seq_id s
 size_t llama_context::state_seq_read_data(llama_io_read_i & io, llama_seq_id seq_id) {
    GGML_UNUSED(seq_id);

-    llama_kv_cache * kv_self = static_cast<llama_kv_cache *>(memory.get());
+    if (memory) {
+        llama_kv_cache * kv_self = static_cast<llama_kv_cache *>(memory.get());

-    kv_self->state_read(io, seq_id);
+        kv_self->state_read(io, seq_id);
+    }

    return io.n_bytes();
 }
@@ -1838,6 +1848,215 @@ void llama_context::perf_reset() {
    t_p_eval_us = n_p_eval = 0;
 }

+//
+// training
+//
+
+static void llama_set_param(struct ggml_tensor * tensor, llama_opt_param_filter param_filter, void * userdata) {
+    if (!tensor || tensor->type != GGML_TYPE_F32) {
+        return;
+    }
+    if (!param_filter(tensor, userdata)) {
+        return;
+    }
+    if (strcmp(tensor->name, "token_embd.weight") == 0) {
+        return; // FIXME
+    }
+    if (strcmp(tensor->name, "rope_freqs.weight") == 0) {
+        return; // FIXME
+    }
+    ggml_set_param(tensor);
+}
+
+void llama_context::opt_init(struct llama_model * model, struct llama_opt_params lopt_params) {
+    GGML_ASSERT(!opt_ctx);
+    model->hparams.n_ctx_train = lopt_params.n_ctx_train > 0 ? lopt_params.n_ctx_train : n_ctx();
+    const uint32_t n_batch     = std::min(this->n_batch(),  model->hparams.n_ctx_train);
+    const uint32_t n_ubatch    = std::min(this->n_ubatch(), n_batch);
+    GGML_ASSERT(model->hparams.n_ctx_train % n_batch  == 0);
+    GGML_ASSERT(n_batch                    % n_ubatch == 0);
+
+    ggml_opt_params opt_params = ggml_opt_default_params(sched.get(), GGML_OPT_LOSS_TYPE_CROSS_ENTROPY);
+    opt_params.opt_period      = n_batch / n_ubatch;
+    opt_params.get_opt_pars    = lopt_params.get_opt_pars;
+    opt_params.get_opt_pars_ud = lopt_params.get_opt_pars_ud;
+
+    opt_ctx = ggml_opt_init(opt_params);
+
+    llama_opt_param_filter param_filter = lopt_params.param_filter;
+    void * param_filter_ud              = lopt_params.param_filter_ud;
+
+  //llama_set_param(model->tok_embd,        param_filter, param_filter_ud); // FIXME
+    llama_set_param(model->type_embd,       param_filter, param_filter_ud);
+    llama_set_param(model->pos_embd,        param_filter, param_filter_ud);
+    llama_set_param(model->tok_norm,        param_filter, param_filter_ud);
+    llama_set_param(model->tok_norm_b,      param_filter, param_filter_ud);
+    llama_set_param(model->output_norm,     param_filter, param_filter_ud);
+    llama_set_param(model->output_norm_b,   param_filter, param_filter_ud);
+    llama_set_param(model->output,          param_filter, param_filter_ud);
+    llama_set_param(model->output_b,        param_filter, param_filter_ud);
+    llama_set_param(model->output_norm_enc, param_filter, param_filter_ud);
+    llama_set_param(model->cls,             param_filter, param_filter_ud);
+    llama_set_param(model->cls_b,           param_filter, param_filter_ud);
+    llama_set_param(model->cls_out,         param_filter, param_filter_ud);
+    llama_set_param(model->cls_out_b,       param_filter, param_filter_ud);
+
+    for (struct llama_layer & layer : model->layers) {
+        for (size_t i = 0; i < sizeof(layer)/sizeof(struct ggml_tensor *); ++i) {
+            llama_set_param(reinterpret_cast<struct ggml_tensor **>(&layer)[i], param_filter, param_filter_ud);
+        }
+    }
+}
+
+void llama_context::opt_epoch_iter(
+        ggml_opt_dataset_t               dataset,
+        ggml_opt_result_t                result,
+        const std::vector<llama_token> & tokens,
+        const std::vector<llama_token> & labels_sparse,
+        llama_batch                    & batch,
+        ggml_opt_epoch_callback          callback,
+        bool                             train,
+        int64_t                          idata_in_loop,
+        int64_t                          ndata_in_loop,
+        int64_t                          t_loop_start) {
+    GGML_ASSERT(opt_ctx);
+    const uint32_t n_ctx    = llama_model_n_ctx_train(&model);
+    const uint32_t n_batch  = std::min(this->n_batch(),  n_ctx);
+    const uint32_t n_ubatch = std::min(this->n_ubatch(), n_batch);
+
+    llama_kv_cache * kv_self = static_cast<llama_kv_cache *>(memory.get());
+
+    kv_self->clear();
+    llama_kv_cache_guard kv_guard(kv_self);
+
+    for (uint32_t pos_ctx = 0; pos_ctx < n_ctx; pos_ctx += n_batch) {
+        batch.n_tokens = n_batch;
+        for (uint32_t pos_batch = 0; pos_batch < n_batch; ++pos_batch) {
+            batch.token   [pos_batch]    = tokens[pos_ctx + pos_batch];
+            batch.pos     [pos_batch]    = pos_ctx + pos_batch;
+            batch.n_seq_id[pos_batch]    = 1;
+            batch.seq_id  [pos_batch][0] = 0;
+            batch.logits  [pos_batch]    = true;
+        }
+
+        const auto n_tokens_all = batch.n_tokens;
+
+        n_queued_tokens += n_tokens_all;
+
+        // this indicates we are doing pooled embedding, so we ignore batch.logits and output all tokens
+        const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE;
+
+        embd_seq.clear();
+
+        int64_t n_outputs_all = n_tokens_all;
+
+        llama_sbatch sbatch = kv_self->sbatch_init(batch, /*logits_all =*/ true);
+
+        // reserve output buffer
+        if (output_reserve(n_outputs_all) < n_outputs_all) {
+            LLAMA_LOG_ERROR("%s: could not reserve space for batch with %" PRId64 " outputs\n", __func__, n_outputs_all);
+            GGML_ABORT("TODO: handle this error");
+        };
+
+        for (uint32_t pos_batch = 0; pos_batch < n_batch; pos_batch += n_ubatch) {
+            llama_ubatch ubatch = kv_self->ubatch_next(sbatch, cparams.n_ubatch, embd_pooled);
+
+            n_outputs = ubatch.n_tokens;
+
+            // TODO: not sure if this is needed
+            if (!kv_self->find_slot(ubatch)) {
+                LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens);
+
+                GGML_ABORT("TODO: handle this error");
+            }
+
+            auto * gf = graph_init();
+            auto res = graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT);
+
+            struct ggml_context * ctx_compute_opt;
+            {
+                const size_t size_gf = ggml_graph_size(gf);
+                const size_t size_meta = 4*size_gf*ggml_tensor_overhead() + 2*ggml_graph_overhead_custom(size_gf, /*grads = */ true);
+                struct ggml_init_params params = {
+                    /*.mem_size   =*/ size_meta,
+                    /*.mem_buffer =*/ nullptr,
+                    /*.no_alloc   =*/ true,
+                };
+                ctx_compute_opt = ggml_init(params);
+            }
+            ggml_opt_prepare_alloc(opt_ctx, ctx_compute_opt, gf, res->get_tokens(), res->get_logits());
+            ggml_opt_alloc(opt_ctx, train);
+            res->set_inputs(&ubatch);
+            {
+                struct ggml_tensor * labels = ggml_opt_labels(opt_ctx);
+                GGML_ASSERT(labels->ne[1] == n_ubatch);
+                ggml_set_zero(labels);
+                const float onef = 1.0f;
+                for (uint32_t pos_ubatch = 0; pos_ubatch < n_ubatch; ++pos_ubatch) {
+                    const uint32_t ilabel = pos_ctx + pos_batch + pos_ubatch;
+                    GGML_ASSERT(labels_sparse[ilabel] < labels->ne[0]);
+                    ggml_backend_tensor_set(labels, &onef, (pos_ubatch*labels->ne[0] + labels_sparse[ilabel])*sizeof(float), sizeof(float));
+                }
+            }
+            ggml_opt_eval(opt_ctx, result);
+            if (callback) {
+                callback(train, opt_ctx, dataset, result, idata_in_loop + (pos_ctx + pos_batch)/n_ubatch + 1, ndata_in_loop, t_loop_start);
+            }
+            ggml_free(ctx_compute_opt);
+        }
+    }
+
+    kv_guard.commit();
+}
+
+void llama_context::opt_epoch(
+        ggml_opt_dataset_t        dataset,
+        ggml_opt_result_t         result_train,
+        ggml_opt_result_t         result_eval,
+        int64_t                   idata_split,
+        ggml_opt_epoch_callback   callback_train,
+        ggml_opt_epoch_callback   callback_eval) {
+    const uint32_t n_ctx    = this->n_ctx();
+    const uint32_t n_batch  = std::min(cparams.n_batch,  n_ctx);
+    const uint32_t n_ubatch = std::min(cparams.n_ubatch, n_batch);
+    const  int64_t ndata    = ggml_opt_dataset_ndata(dataset);
+
+    GGML_ASSERT(idata_split >= 0);
+    GGML_ASSERT(idata_split <= ndata);
+
+    const uint32_t ubatch_per_ctx = n_ctx / n_ubatch;
+
+    struct llama_batch batch = llama_batch_init(n_batch, 0, 1);
+    std::vector<llama_token>        tokens(n_ctx);
+    std::vector<llama_token> labels_sparse(n_ctx);
+
+    int64_t idata = 0;
+
+    int64_t t_loop_start = ggml_time_us();
+    int64_t ndata_in_loop = idata_split*ubatch_per_ctx;
+    for (; idata < idata_split; ++idata) {
+        constexpr bool train = true;
+        const int64_t idata_in_loop = idata*ubatch_per_ctx;
+
+        ggml_opt_dataset_get_batch_host(dataset, tokens.data(), n_ctx*sizeof(llama_token), labels_sparse.data(), idata);
+        opt_epoch_iter(dataset, result_train, tokens, labels_sparse, batch,
+            callback_train, train, idata_in_loop, ndata_in_loop, t_loop_start);
+    }
+
+    t_loop_start = ggml_time_us();
+    ndata_in_loop = (ndata - idata_split)*ubatch_per_ctx;
+    for (; idata < ndata; ++idata) {
+        constexpr bool train = false;
+        const int64_t idata_in_loop = (idata - idata_split)*ubatch_per_ctx;
+
+        ggml_opt_dataset_get_batch_host(dataset, tokens.data(), n_ctx*sizeof(llama_token), labels_sparse.data(), idata);
+        opt_epoch_iter(dataset, result_eval, tokens, labels_sparse, batch,
+            callback_eval, train, idata_in_loop, ndata_in_loop, t_loop_start);
+    }
+
+    llama_batch_free(batch);
+}
+
 //
 // interface implementation
 //
@@ -1871,6 +2090,7 @@ llama_context_params llama_context_default_params() {
        /*.offload_kqv                 =*/ true,
        /*.flash_attn                  =*/ false,
        /*.no_perf                     =*/ true,
+        /*.op_offload                  =*/ true,
    };

    return result;
@@ -2455,3 +2675,34 @@ void llama_perf_context_print(const llama_context * ctx) {
 void llama_perf_context_reset(llama_context * ctx) {
    ctx->perf_reset();
 }
+
+//
+// training
+//
+
+bool llama_opt_param_filter_all(const struct ggml_tensor * tensor, void * userdata) {
+    GGML_UNUSED(tensor);
+    GGML_UNUSED(userdata);
+    return true;
+}
+
+void llama_opt_init(struct llama_context * ctx, struct llama_model * model, struct llama_opt_params lopt_params) {
+    ctx->opt_init(model, lopt_params);
+}
+
+void llama_opt_epoch(
+        struct llama_context    * ctx,
+        ggml_opt_dataset_t        dataset,
+        ggml_opt_result_t         result_train,
+        ggml_opt_result_t         result_eval,
+        int64_t                   idata_split,
+        ggml_opt_epoch_callback   callback_train,
+        ggml_opt_epoch_callback   callback_eval) {
+    ctx->opt_epoch(
+        dataset,
+        result_train,
+        result_eval,
+        idata_split,
+        callback_train,
+        callback_eval);
+}
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -7,6 +7,7 @@
 #include "llama-adapter.h"

 #include "ggml-cpp.h"
+#include "ggml-opt.h"

 #include <map>
 #include <vector>
@@ -133,6 +134,32 @@ struct llama_context {
    llama_perf_context_data perf_get_data() const;
    void perf_reset();

+    //
+    // training
+    //
+
+    void opt_init(struct llama_model * model, struct llama_opt_params lopt_params);
+
+    void opt_epoch(
+            ggml_opt_dataset_t      dataset,
+            ggml_opt_result_t       result_train,
+            ggml_opt_result_t       result_eval,
+            int64_t                 idata_split,
+            ggml_opt_epoch_callback callback_train,
+            ggml_opt_epoch_callback callback_eval);
+
+    void opt_epoch_iter(
+            ggml_opt_dataset_t               dataset,
+            ggml_opt_result_t                result,
+            const std::vector<llama_token> & tokens,
+            const std::vector<llama_token> & labels_sparse,
+            llama_batch                    & batch,
+            ggml_opt_epoch_callback          callback,
+            bool                             train,
+            int64_t                          idata_in_loop,
+            int64_t                          ndata_in_loop,
+            int64_t                          t_loop_start);
+
 private:
    //
    // output
@@ -212,6 +239,9 @@ private:

    ggml_context_ptr ctx_compute;

+    // training
+    ggml_opt_context_t opt_ctx = nullptr;
+
    ggml_threadpool_t threadpool       = nullptr;
    ggml_threadpool_t threadpool_batch = nullptr;

--- a/src/llama-cparams.h
+++ b/src/llama-cparams.h
@@ -30,6 +30,7 @@ struct llama_cparams {
    bool flash_attn;
    bool no_perf;
    bool warmup;
+    bool op_offload;

    enum llama_pooling_type pooling_type;

--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -971,6 +971,7 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
        inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
        //cb(inp->tokens, "inp_tokens", -1);
        ggml_set_input(inp->tokens);
+        res->t_tokens = inp->tokens;

        cur = ggml_get_rows(ctx0, tok_embd, inp->tokens);

@@ -1227,8 +1228,19 @@ ggml_tensor * llm_graph_context::build_attn_mha(
        ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);

        if (v_mla) {
+#if 0
+            // v_mla can be applied as a matrix-vector multiplication with broadcasting across dimension 3 == n_tokens.
+            // However, the code is optimized for dimensions 0 and 1 being large, so this is ineffient.
            cur = ggml_reshape_4d(ctx0, cur, v_mla->ne[0], 1, n_head, n_tokens);
            cur = ggml_mul_mat(ctx0, v_mla, cur);
+#else
+            // It's preferable to do the calculation as a matrix-matrix multiplication with n_tokens in dimension 1.
+            // The permutations are noops and only change how the tensor data is interpreted.
+            cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
+            cur = ggml_mul_mat(ctx0, v_mla, cur);
+            cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
+            cur = ggml_cont(ctx0, cur); // Needed because ggml_reshape_2d expects contiguous inputs.
+#endif
        }

        cur = ggml_reshape_2d(ctx0, cur, cur->ne[0]*n_head, n_tokens);
--- a/src/llama-graph.h
+++ b/src/llama-graph.h
@@ -298,6 +298,7 @@ class llm_graph_result_i {
 public:
    virtual ~llm_graph_result_i() = default;

+    virtual ggml_tensor * get_tokens()      = 0;
    virtual ggml_tensor * get_logits()      = 0;
    virtual ggml_tensor * get_embd()        = 0;
    virtual ggml_tensor * get_embd_pooled() = 0;
@@ -312,6 +313,7 @@ class llm_graph_result : public llm_graph_result_i {
 public:
    virtual ~llm_graph_result() = default;

+    ggml_tensor * get_tokens()      override { return t_tokens; }
    ggml_tensor * get_logits()      override { return t_logits; }
    ggml_tensor * get_embd()        override { return t_embd; }
    ggml_tensor * get_embd_pooled() override { return t_embd_pooled; }
@@ -328,6 +330,7 @@ public:
    }

    // important graph nodes
+    ggml_tensor * t_tokens      = nullptr;
    ggml_tensor * t_logits      = nullptr;
    ggml_tensor * t_embd        = nullptr;
    ggml_tensor * t_embd_pooled = nullptr;
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -301,12 +301,12 @@ namespace GGUFMeta {
            GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta.get(), kid);

        switch (arr_info.gt) {
-            case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value)); break;
-            case GGUF_TYPE_INT32:   GGML_ASSERT(
-                                            (std::is_same<T,  int32_t>::value) ||
-                                            (std::is_same<T, uint32_t>::value));  break;
+            case GGUF_TYPE_UINT32:
+            case GGUF_TYPE_INT32:   GGML_ASSERT((std::is_same<T,  int32_t>::value) ||
+                                                (std::is_same<T, uint32_t>::value)); break;
+            case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T,    float>::value)); break;
            default:
-                throw std::runtime_error(format("%s is not a float32, int32 array", key.c_str()));
+                throw std::runtime_error(format("%s is not a float32/uint32/int32 array", key.c_str()));
        }

        result.resize(arr_info.length);
@@ -330,12 +330,12 @@ namespace GGUFMeta {
            GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta.get(), kid);

        switch (arr_info.gt) {
-            case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value)); break;
-            case GGUF_TYPE_INT32:   GGML_ASSERT(
-                                            (std::is_same<T,  int32_t>::value) ||
-                                            (std::is_same<T, uint32_t>::value));  break;
+            case GGUF_TYPE_UINT32:
+            case GGUF_TYPE_INT32:   GGML_ASSERT((std::is_same<T,  int32_t>::value) ||
+                                                (std::is_same<T, uint32_t>::value)); break;
+            case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T,    float>::value)); break;
            default:
-                throw std::runtime_error(format("%s is not a float32, int32 array", key.c_str()));
+                throw std::runtime_error(format("%s is not a float32/uint32/int32 array", key.c_str()));
        }

        if (arr_info.length > N_MAX) {
@@ -823,6 +823,10 @@ void llama_model_loader::init_mappings(bool prefetch, llama_mlocks * mlock_mmaps
        mmaps_used.reserve(files.size());
        for (const auto & file : files) {
            auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU));
+            if (!reg) {
+                throw std::runtime_error(format("%s: no CPU backend found", __func__));
+            }
+
            auto * is_numa_fn = (decltype(ggml_is_numa) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_is_numa");
            std::unique_ptr<llama_mmap> mapping = std::make_unique<llama_mmap>(file.get(), prefetch ? -1 : 0, is_numa_fn());
            mmaps_used.emplace_back(mapping->size(), 0);
--- a/src/llama-model-saver.cpp
+++ b/src/llama-model-saver.cpp
@@ -0,0 +1,281 @@
+#include "llama-model-saver.h"
+
+#include "gguf.h"
+
+#include "llama.h"
+#include "llama-hparams.h"
+#include "llama-model.h"
+#include "llama-vocab.h"
+
+#include <string>
+
+llama_model_saver::llama_model_saver(const struct llama_model & model) : model(model), llm_kv(model.arch) {
+    gguf_ctx = gguf_init_empty();
+}
+
+llama_model_saver::~llama_model_saver() {
+    gguf_free(gguf_ctx);
+}
+
+void llama_model_saver::add_kv(const enum llm_kv key, const uint32_t value) {
+    gguf_set_val_u32(gguf_ctx, llm_kv(key).c_str(), value);
+}
+
+void llama_model_saver::add_kv(const enum llm_kv key, const int32_t value) {
+    gguf_set_val_i32(gguf_ctx, llm_kv(key).c_str(), value);
+}
+
+void llama_model_saver::add_kv(const enum llm_kv key, const float value) {
+    gguf_set_val_f32(gguf_ctx, llm_kv(key).c_str(), value);
+}
+
+void llama_model_saver::add_kv(const enum llm_kv key, const bool value) {
+    gguf_set_val_bool(gguf_ctx, llm_kv(key).c_str(), value);
+}
+
+void llama_model_saver::add_kv(const enum llm_kv key, const char * value) {
+    gguf_set_val_str(gguf_ctx, llm_kv(key).c_str(), value);
+}
+
+[[noreturn]]
+void llama_model_saver::add_kv(const enum llm_kv key, const char value) {
+    GGML_UNUSED(key);
+    GGML_UNUSED(value);
+    GGML_ABORT("fatal error"); // this should never be called, only needed to make the template below compile
+}
+
+template <typename Container>
+void llama_model_saver::add_kv(const enum llm_kv key, const Container & value, const bool per_layer) {
+    const size_t n_values = per_layer ? size_t(model.hparams.n_layer) : value.size();
+    GGML_ASSERT(n_values <= value.size());
+
+    if (n_values == 0) {
+        return;
+    }
+
+    if (per_layer) {
+        bool all_values_the_same = true;
+        for (size_t i = 1; i < n_values; ++i) {
+            if (value[i] != value[0]) {
+                all_values_the_same = false;
+                break;
+            }
+        }
+        if (all_values_the_same) {
+            add_kv(key, value[0]);
+            return;
+        }
+    }
+
+    if (std::is_same<typename Container::value_type, uint8_t>::value) {
+        gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_UINT8, value.data(), n_values);
+    } else if (std::is_same<typename Container::value_type, int8_t>::value) {
+        gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_INT8, value.data(), n_values);
+    } else if (std::is_same<typename Container::value_type, uint32_t>::value) {
+        gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_UINT32, value.data(), n_values);
+    } else if (std::is_same<typename Container::value_type, int32_t>::value) {
+        gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_INT32, value.data(), n_values);
+    } else if (std::is_same<typename Container::value_type, float>::value) {
+        gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_FLOAT32, value.data(), n_values);
+    } else if (std::is_same<Container, std::string>::value) {
+        gguf_set_val_str(gguf_ctx, llm_kv(key).c_str(), reinterpret_cast<const char *>(value.data()));
+    } else {
+        GGML_ABORT("fatal error");
+    }
+}
+
+void llama_model_saver::add_kv(const enum llm_kv key, const std::vector<std::string> & value) {
+    std::vector<const char *> tmp(value.size());
+    for (size_t i = 0; i < value.size(); ++i) {
+        tmp[i] = value[i].c_str();
+    }
+    gguf_set_arr_str(gguf_ctx, llm_kv(key).c_str(), tmp.data(), tmp.size());
+}
+
+void llama_model_saver::add_tensor(const struct ggml_tensor * tensor) {
+    if (!tensor) {
+        return;
+    }
+    if (gguf_find_tensor(gguf_ctx, tensor->name) >= 0) {
+        GGML_ASSERT(std::string(tensor->name) == "rope_freqs.weight"); // FIXME
+        return;
+    }
+    gguf_add_tensor(gguf_ctx, tensor);
+}
+
+void llama_model_saver::add_kv_from_model() {
+    const llama_hparams & hparams = model.hparams;
+    const llama_vocab   & vocab   = model.vocab;
+
+    const int32_t n_vocab = vocab.n_tokens();
+    std::vector<std::string> tokens(n_vocab);
+    std::vector<float>       scores(n_vocab);
+    std::vector<int32_t>     token_types(n_vocab);
+
+    for (int32_t id = 0; id < n_vocab; ++id) {
+        const llama_vocab::token_data & token_data = vocab.get_token_data(id);
+
+        tokens[id] = token_data.text;
+        scores[id] = token_data.score;
+
+        switch(token_data.attr) {
+            case LLAMA_TOKEN_ATTR_UNKNOWN:      token_types[id] = LLAMA_TOKEN_TYPE_UNKNOWN;      break;
+            case LLAMA_TOKEN_ATTR_UNUSED:       token_types[id] = LLAMA_TOKEN_TYPE_UNUSED;       break;
+            case LLAMA_TOKEN_ATTR_NORMAL:       token_types[id] = LLAMA_TOKEN_TYPE_NORMAL;       break;
+            case LLAMA_TOKEN_ATTR_CONTROL:      token_types[id] = LLAMA_TOKEN_TYPE_CONTROL;      break;
+            case LLAMA_TOKEN_ATTR_USER_DEFINED: token_types[id] = LLAMA_TOKEN_TYPE_USER_DEFINED; break;
+            case LLAMA_TOKEN_ATTR_BYTE:         token_types[id] = LLAMA_TOKEN_TYPE_BYTE;         break;
+            case LLAMA_TOKEN_ATTR_UNDEFINED:
+            default:                            token_types[id] = LLAMA_TOKEN_TYPE_UNDEFINED;    break;
+        }
+    }
+
+    // add_kv(LLM_KV_GENERAL_TYPE,                      ???);
+    add_kv(LLM_KV_GENERAL_ARCHITECTURE,              model.arch_name());
+    // add_kv(LLM_KV_GENERAL_QUANTIZATION_VERSION,      ???);
+    // add_kv(LLM_KV_GENERAL_ALIGNMENT,                 ???);
+    add_kv(LLM_KV_GENERAL_NAME,                      model.name);
+    // add_kv(LLM_KV_GENERAL_AUTHOR,                    ???);
+    // add_kv(LLM_KV_GENERAL_VERSION,                   ???);
+    // add_kv(LLM_KV_GENERAL_URL,                       ???);
+    // add_kv(LLM_KV_GENERAL_DESCRIPTION,               ???);
+    // add_kv(LLM_KV_GENERAL_LICENSE,                   ???);
+    // add_kv(LLM_KV_GENERAL_SOURCE_URL,                ???);
+    // add_kv(LLM_KV_GENERAL_SOURCE_HF_REPO,            ???);
+
+    add_kv(LLM_KV_VOCAB_SIZE,                        vocab.n_tokens());
+    add_kv(LLM_KV_CONTEXT_LENGTH,                    hparams.n_ctx_train);
+    add_kv(LLM_KV_EMBEDDING_LENGTH,                  hparams.n_embd);
+    add_kv(LLM_KV_BLOCK_COUNT,                       hparams.n_layer);
+    add_kv(LLM_KV_LEADING_DENSE_BLOCK_COUNT,         hparams.n_layer_dense_lead);
+    add_kv(LLM_KV_FEED_FORWARD_LENGTH,               hparams.n_ff_arr, true);
+    add_kv(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp);
+    add_kv(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+    add_kv(LLM_KV_USE_PARALLEL_RESIDUAL,             hparams.use_par_res);
+    // add_kv(LLM_KV_TENSOR_DATA_LAYOUT,                ???);
+    add_kv(LLM_KV_EXPERT_COUNT,                      hparams.n_expert);
+    add_kv(LLM_KV_EXPERT_USED_COUNT,                 hparams.n_expert_used);
+    add_kv(LLM_KV_EXPERT_SHARED_COUNT,               hparams.n_expert_shared);
+    add_kv(LLM_KV_EXPERT_WEIGHTS_SCALE,              hparams.expert_weights_scale);
+    add_kv(LLM_KV_POOLING_TYPE,                      uint32_t(hparams.pooling_type));
+    add_kv(LLM_KV_LOGIT_SCALE,                       hparams.f_logit_scale);
+    add_kv(LLM_KV_DECODER_START_TOKEN_ID,            hparams.dec_start_token_id);
+    add_kv(LLM_KV_ATTN_LOGIT_SOFTCAPPING,            hparams.f_attn_logit_softcapping);
+    add_kv(LLM_KV_FINAL_LOGIT_SOFTCAPPING,           hparams.f_final_logit_softcapping);
+    add_kv(LLM_KV_SWIN_NORM,                         hparams.swin_norm);
+    add_kv(LLM_KV_RESCALE_EVERY_N_LAYERS,            hparams.rescale_every_n_layers);
+    add_kv(LLM_KV_TIME_MIX_EXTRA_DIM,                hparams.time_mix_extra_dim);
+    add_kv(LLM_KV_TIME_DECAY_EXTRA_DIM,              hparams.time_decay_extra_dim);
+    add_kv(LLM_KV_RESIDUAL_SCALE,                    hparams.f_residual_scale);
+    add_kv(LLM_KV_EMBEDDING_SCALE,                   hparams.f_embedding_scale);
+
+    add_kv(LLM_KV_ATTENTION_HEAD_COUNT,              hparams.n_head_arr, true);
+    add_kv(LLM_KV_ATTENTION_HEAD_COUNT_KV,           hparams.n_head_kv_arr, true);
+    add_kv(LLM_KV_ATTENTION_MAX_ALIBI_BIAS,          hparams.f_max_alibi_bias);
+    add_kv(LLM_KV_ATTENTION_CLAMP_KQV,               hparams.f_clamp_kqv);
+    add_kv(LLM_KV_ATTENTION_KEY_LENGTH,              hparams.n_embd_head_k);
+    add_kv(LLM_KV_ATTENTION_VALUE_LENGTH,            hparams.n_embd_head_v);
+    add_kv(LLM_KV_ATTENTION_LAYERNORM_EPS,           hparams.f_norm_eps);
+    add_kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,       hparams.f_norm_rms_eps);
+    add_kv(LLM_KV_ATTENTION_CAUSAL,                  hparams.causal_attn);
+    add_kv(LLM_KV_ATTENTION_Q_LORA_RANK,             hparams.n_lora_q);
+    add_kv(LLM_KV_ATTENTION_KV_LORA_RANK,            hparams.n_lora_kv);
+    add_kv(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,  hparams.n_rel_attn_bkts);
+    add_kv(LLM_KV_ATTENTION_SLIDING_WINDOW,          hparams.n_swa);
+    add_kv(LLM_KV_ATTENTION_SCALE,                   hparams.f_attention_scale);
+
+    const float rope_scaling_factor = hparams.rope_freq_scale_train == 1.0f ? 0.0f : 1.0f/hparams.rope_freq_scale_train;
+
+    add_kv(LLM_KV_ROPE_DIMENSION_COUNT,              hparams.n_rot);
+    add_kv(LLM_KV_ROPE_FREQ_BASE,                    hparams.rope_freq_base_train);
+    // add_kv(LLM_KV_ROPE_SCALE_LINEAR,                 rope_scaling_factor); // old name
+    add_kv(LLM_KV_ROPE_SCALING_TYPE,                 llama_rope_scaling_type_name(hparams.rope_scaling_type_train));
+    add_kv(LLM_KV_ROPE_SCALING_FACTOR,               rope_scaling_factor);
+    add_kv(LLM_KV_ROPE_SCALING_ATTN_FACTOR,          hparams.rope_attn_factor);
+    add_kv(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN,         hparams.n_ctx_orig_yarn);
+    add_kv(LLM_KV_ROPE_SCALING_FINETUNED,            hparams.rope_finetuned);
+    add_kv(LLM_KV_ROPE_SCALING_YARN_LOG_MUL,         hparams.rope_yarn_log_mul);
+
+    // TODO: implement split file support
+    // add_kv(LLM_KV_SPLIT_NO,                          ???);
+    // add_kv(LLM_KV_SPLIT_COUNT,                       ???);
+    // add_kv(LLM_KV_SPLIT_TENSORS_COUNT,               ???);
+
+    add_kv(LLM_KV_SSM_INNER_SIZE,                    hparams.ssm_d_inner);
+    add_kv(LLM_KV_SSM_CONV_KERNEL,                   hparams.ssm_d_conv);
+    add_kv(LLM_KV_SSM_STATE_SIZE,                    hparams.ssm_d_state);
+    add_kv(LLM_KV_SSM_TIME_STEP_RANK,                hparams.ssm_dt_rank);
+    add_kv(LLM_KV_SSM_DT_B_C_RMS,                    hparams.ssm_dt_b_c_rms);
+
+    add_kv(LLM_KV_WKV_HEAD_SIZE,                     hparams.wkv_head_size);
+
+    add_kv(LLM_KV_TOKENIZER_MODEL,                   vocab.get_tokenizer_model());
+    add_kv(LLM_KV_TOKENIZER_PRE,                     vocab.get_tokenizer_pre());
+    add_kv(LLM_KV_TOKENIZER_LIST,                    tokens);
+    add_kv(LLM_KV_TOKENIZER_TOKEN_TYPE,              token_types);
+    add_kv(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT,        vocab.n_token_types());
+    add_kv(LLM_KV_TOKENIZER_SCORES,                  scores);
+    add_kv(LLM_KV_TOKENIZER_MERGES,                  vocab.get_bpe_merges());
+    // FIXME llama_token is type i32 but when reading in a GGUF file u32 is expected, not an issue for writing though
+    add_kv(LLM_KV_TOKENIZER_BOS_ID,                  uint32_t(vocab.token_bos()));
+    add_kv(LLM_KV_TOKENIZER_EOS_ID,                  uint32_t(vocab.token_eos()));
+    add_kv(LLM_KV_TOKENIZER_EOT_ID,                  uint32_t(vocab.token_eot()));
+    add_kv(LLM_KV_TOKENIZER_EOM_ID,                  uint32_t(vocab.token_eom()));
+    add_kv(LLM_KV_TOKENIZER_UNK_ID,                  uint32_t(vocab.token_unk()));
+    add_kv(LLM_KV_TOKENIZER_SEP_ID,                  uint32_t(vocab.token_sep()));
+    add_kv(LLM_KV_TOKENIZER_PAD_ID,                  uint32_t(vocab.token_pad()));
+    // add_kv(LLM_KV_TOKENIZER_CLS_ID,                  uint32_t(vocab.token_bos())); // deprecated
+    // add_kv(LLM_KV_TOKENIZER_MASK_ID,                 ???);
+    add_kv(LLM_KV_TOKENIZER_ADD_BOS,                 vocab.get_add_bos());
+    add_kv(LLM_KV_TOKENIZER_ADD_EOS,                 vocab.get_add_eos());
+    add_kv(LLM_KV_TOKENIZER_ADD_PREFIX,              vocab.get_add_space_prefix());
+    add_kv(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS,         vocab.get_remove_extra_whitespaces());
+    add_kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP,    vocab.get_precompiled_charsmap());
+    // add_kv(LLM_KV_TOKENIZER_HF_JSON,                 ???);
+    // add_kv(LLM_KV_TOKENIZER_RWKV,                    ???);
+    add_kv(LLM_KV_TOKENIZER_FIM_PRE_ID,              uint32_t(vocab.token_fim_pre()));
+    add_kv(LLM_KV_TOKENIZER_FIM_SUF_ID,              uint32_t(vocab.token_fim_suf()));
+    add_kv(LLM_KV_TOKENIZER_FIM_MID_ID,              uint32_t(vocab.token_fim_mid()));
+    add_kv(LLM_KV_TOKENIZER_FIM_PAD_ID,              uint32_t(vocab.token_fim_pad()));
+    add_kv(LLM_KV_TOKENIZER_FIM_REP_ID,              uint32_t(vocab.token_fim_rep()));
+    add_kv(LLM_KV_TOKENIZER_FIM_SEP_ID,              uint32_t(vocab.token_fim_sep()));
+
+    // TODO: implement LoRA support
+    // add_kv(LLM_KV_ADAPTER_TYPE,                      ???);
+    // add_kv(LLM_KV_ADAPTER_LORA_ALPHA,                ???);
+
+    // deprecated
+    // add_kv(LLM_KV_TOKENIZER_PREFIX_ID,               ???);
+    // add_kv(LLM_KV_TOKENIZER_SUFFIX_ID,               ???);
+    // add_kv(LLM_KV_TOKENIZER_MIDDLE_ID,               ???);
+}
+
+void llama_model_saver::add_tensors_from_model() {
+    if (std::string(model.output->name) != std::string(model.tok_embd->name)) {
+        add_tensor(model.tok_embd); // some models use the same tensor for tok_embd and output
+    }
+    add_tensor(model.type_embd);
+    add_tensor(model.pos_embd);
+    add_tensor(model.tok_norm);
+    add_tensor(model.tok_norm_b);
+    add_tensor(model.output_norm);
+    add_tensor(model.output_norm_b);
+    add_tensor(model.output);
+    add_tensor(model.output_b);
+    add_tensor(model.output_norm_enc);
+    add_tensor(model.cls);
+    add_tensor(model.cls_b);
+    add_tensor(model.cls_out);
+    add_tensor(model.cls_out_b);
+
+    for (const struct llama_layer & layer : model.layers) {
+        for (size_t i = 0; i < sizeof(layer)/sizeof(struct ggml_tensor *); ++i) {
+            add_tensor(reinterpret_cast<const struct ggml_tensor * const *>(&layer)[i]);
+        }
+    }
+}
+
+void llama_model_saver::save(const std::string & path_model) {
+    gguf_write_to_file(gguf_ctx, path_model.c_str(), false);
+}
+
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Xuan-Son Nguyen	b4726345ac	mtmd : remove libllava, remove clip-quantize-cli (⚠️ breaking change) (#13460 ) * mtmd : remove libllava, remove clip-quantize-cli * rm clip_model_quantize	2025-05-13 15:33:58 +02:00
Sigbjørn Skjæret	bf79371120	scripts : support arbitrary input file formats in compare-llama-bench.py (#13455 )	2025-05-13 15:31:12 +02:00
Gabe Goodhart	d590cd4c24	model : Granite MoE shared (#13269 ) * feat: Add GGUF conversion for granitemoeshared Branch: GraniteMoEShared Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * feat: hparam and arch plumbing for granitemoeshared Branch: GraniteMoEShared Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * fix: Split MoE fused tensors for shared experts in conversion Branch: GraniteMoEShared Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * feat: First WIP cut at model arch in cpp The hparam and architecture plumbing should be correct, but the implementation of the shared experts seems to still be broken. Branch: GraniteMoEShared Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * fix: Cleaner (maybe more correct?) splitting for gate/up Branch: GraniteMoEShared Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * fix: Fix the input to the shared experts I had misread that the shared experts take the inputs _before_ the standard MoE layer and was feeding the output of the MoE to the shared experts. Branch: GraniteMoEShared Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * fix: Avoid architecture-specific checks for Granite MoE Shared This is a cleaner way that will allow more flexibility in architecture strings going forward. Branch: GraniteMoEShared Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * refactor: Split granite architectures out of llm_build_llama This helps de-clutter the llama-family graph construction and allows granite to diverge further (in preparation for Granite 4). NOTE: I removed the granite scale factors from llm_build_deci because they appear to only be there as copy-paste from llm_build_llama. The HF config does not seem to set those values: https://huggingface.co/Deci/DeciLM-7B/blob/main/config.json Branch: GraniteMoEShared Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * fix: Fix compiler warning about uninitialized inp_pos This should not have been reachable, but it warns on some compliers Branch: GraniteMoEShared Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * fix: Consoladate GraniteMoEShared into GraniteMoE for conversion Branch: GraniteMoEShared Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * fix: Consolidate GraniteMoEShared into GraniteMoE on the c++ side Branch: GraniteMoEShared Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> --------- Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>	2025-05-13 15:12:01 +02:00
Georgi Gerganov	1e2809bc4b	sync : ggml	2025-05-13 14:02:28 +03:00
Diego Devesa	cf0a43bb64	llama-bench : add defrag-thold, check for invalid ranges (#13487 )	2025-05-13 00:31:37 +02:00
lhez	f0d46ef157	opencl: remove unnecessary assert for `add` (#13257 )	2025-05-12 13:13:49 -07:00
Xuan-Son Nguyen	de4c07f937	clip : cap max image size 1024 for qwen vl model (#13478 )	2025-05-12 15:06:51 +02:00
Johannes Gäßler	10d2af0eaa	llama/ggml: add LLM training support (#10544 ) * llama/ggml: add LLM training support more compact progress bar llama_save_model_to_file llama_opt_param_filter ggml_graph_dup force_grads refactor ggml_opt, fix test-opt * remove logits_all * refactor CUDA implementation for ACC * reset graph at beginning of opt period	2025-05-12 14:44:49 +02:00
Georgi Gerganov	064cc596ac	context : fix state io for memory-less contexts (#13470 ) ggml-ci	2025-05-12 15:12:27 +03:00
Anudit Nagar	91159ee9df	server : allow content to be null in oaicompat_completion_params_parse (#13477 )	2025-05-12 13:56:42 +02:00
Diego Devesa	22cdab343b	llama-bench : accept ranges for integer parameters (#13410 )	2025-05-12 13:08:22 +02:00
Dan Johansson	a71a4075cd	ggml-cpu: Integrate fp32=bf16xbf16 SME KleidiAI kernel (#13053 ) * ggml-cpu: Integrate fp32=bf16xbf16 SME KleidiAI kernel Signed-off-by: Dan Johansson <dan.johansson@arm.com> * * code review fixes Signed-off-by: Dan Johansson <dan.johansson@arm.com> * * adds a comment that clarifies barrier usage Signed-off-by: Dan Johansson <dan.johansson@arm.com> --------- Signed-off-by: Dan Johansson <dan.johansson@arm.com> Co-authored-by: Charles Xu <charles.xu@arm.com>	2025-05-12 13:06:19 +02:00
Johannes Gäßler	95e18884fc	CUDA: fix misaligned synchronization in FA (#13469 )	2025-05-12 10:51:21 +02:00
Xuan-Son Nguyen	df8491922f	ggml : add mrope kernel for metal (#13457 )	2025-05-12 10:29:13 +02:00
Atharva Dubey	14492144c2	enable dpcpp nightly builds with libraries (#13406 )	2025-05-12 13:15:32 +08:00
City	c104023994	mtmd : Use RMS norm for InternVL 3 38B and 78B mmproj (#13459 )	2025-05-12 00:39:06 +02:00
Anthony Umfer	9a390c4829	tools : fix uninitialized llama_batch in server (#13436 ) * add constructor to initialize server_context::batch, preventing destructor's call to llama_batch_free from causing an invalid free() * Update tools/server/server.cpp Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com> * use C++11 initializer syntax * switch from Copy-list-initialization to Direct-list-initialization --------- Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com>	2025-05-11 17:08:26 +02:00
Sigbjørn Skjæret	09232370fc	scripts : exit compare-llama-bench.py gracefully when there's nothing to compare (#13451 )	2025-05-11 16:20:39 +02:00
Johannes Gäßler	7474e00b34	CUDA: fix crash with partial offloading of MoE (#13439 )	2025-05-11 16:09:33 +02:00
David Huang	7f323a589f	Add `--no-op-offload` to improve `-ot` pp perf in MoE models like llama4 400B (#13386 )	2025-05-11 14:18:39 +02:00
City	3eac209319	mtmd : support InternVL 3 38B and 78B mmproj (#13443 ) * Support InternVL 3 38B and 78B mmproj * Swap norms in clip.cpp * Group variables together	2025-05-11 11:35:52 +02:00
Xuan-Son Nguyen	a634d75d1b	mtmd : move helpers to dedicated file (#13442 ) * mtmd : move helpers to dedicated file * fix windows build * rm redundant include	2025-05-11 11:34:23 +02:00
Thomas Germer	62d4250e52	docs : Fix typo in InternVL3 model name (#13440 )	2025-05-10 22:26:46 +02:00
Johannes Gäßler	0208355f42	CUDA: fix race conditions FlashAttention kernels (#13438 )	2025-05-10 22:22:48 +02:00
Sigbjørn Skjæret	d2a4ef05c6	vocab : add ByteDance-Seed/Seed-Coder (#13423 )	2025-05-10 22:08:07 +02:00
Xuan-Son Nguyen	15e6125a39	mtmd : add hard limit on image resolution for qwen2vl / qwen2.5vl (#13434 ) * mtmd : add hard limit on image resolution for qwen2vl / qwen2.5vl * fix typo	2025-05-10 19:57:54 +02:00
Xuan-Son Nguyen	3b24d26c22	server : update docs (#13432 )	2025-05-10 18:44:49 +02:00
Sigbjørn Skjæret	43dfd741a5	llguidance : set tokenizer slices to default (#13424 )	2025-05-10 17:19:52 +02:00
Thammachart Chinvarapon	b064a51a4e	ci: free_disk_space flag enabled for intel variant (#13426 ) before cleanup: 20G after cleanup: 44G after all built and pushed: 24G https://github.com/Thammachart/llama.cpp/actions/runs/14945093573/job/41987371245	2025-05-10 16:34:48 +02:00
Xuan-Son Nguyen	053367d149	mtmd : support InternVL 2.5 and 3 (#13422 ) * convert : internvl support * InternVL3-1B working * fix regression * rm mobilevlm from test * fix conversion * add test for internvl * add to list of pre-quant * restore boi/eoi check * add clarify comment for norm eps	2025-05-10 16:26:42 +02:00
Johannes Gäßler	d8919424f1	CUDA: fix FlashAttention on Turing (#13415 )	2025-05-10 09:16:52 +02:00
Xuan-Son Nguyen	7fef11766c	arg : add env var to control mmproj (#13416 ) * arg : add env var to control mmproj * small note about -hf --mmproj	2025-05-10 08:16:29 +02:00
Jeff Bolz	dc1d2adfc0	vulkan: scalar flash attention implementation (#13324 ) * vulkan: scalar flash attention implementation * vulkan: always use fp32 for scalar flash attention * vulkan: use vector loads in scalar flash attention shader * vulkan: remove PV matrix, helps with register usage * vulkan: reduce register usage in scalar FA, but perf may be slightly worse * vulkan: load each Q value once. optimize O reduction. more tuning * vulkan: support q4_0/q8_0 KV in scalar FA * CI: increase timeout to accommodate newly-supported tests * vulkan: for scalar FA, select between 1 and 8 rows * vulkan: avoid using Float16 capability in scalar FA	2025-05-10 08:07:07 +02:00
Helton Reis	7c28a74e07	chore(llguidance): use tagged version that does not break the build (#13413 )	2025-05-09 23:15:39 +03:00
Xuan-Son Nguyen	33eff40240	server : vision support via libmtmd (#12898 ) * server : (experimental) vision support via libmtmd * mtmd : add more api around mtmd_image_tokens * mtmd : add more api around mtmd_image_tokens * mtmd : ability to calc image hash * shared_ptr for mtmd_image_tokens * move hash to user-define ID (fixed) * abstract out the batch management * small fix * refactor logic adding tokens to batch * implement hashing image * use FNV hash, now hash bitmap instead of file data * allow decoding image embedding to be split into batches * rm whitespace * disable some features when mtmd is on * fix --no-mmproj-offload * mtmd_context_params no timings * refactor server_inp to server_tokens * fix the failing test case * init * wip * working version * add mtmd::bitmaps * add test target * rm redundant define * test: mtmd_input_chunks_free * rm outdated comment * fix merging issue * explicitly create mtmd::input_chunks * mtmd_input_chunk_copy * add clone() * improve server_input struct * clip : fix confused naming ffn_up and ffn_down * rm ffn_i/o/g naming * rename n_embd, n_ff * small fix * no check n_ff * fix detokenize * add const to various places * add warning about breaking changes * add c api * helper: use mtmd_image_tokens_get_n_pos * fix ctx_shift * fix name shadowing * more strict condition * support remote image_url * remote image_url log * add CI test * do not log base64 * add "has_multimodal" to /props * remove dangling image * speculative: use slot.cache_tokens.insert * Apply suggestions from code review Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * rm can_be_detokenized * on prmpt processing done, assert cache_tokens.size * handle_completions_impl returns void * adapt the new web ui * update docs and hot topics * rm assert * small fix (2) --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>	2025-05-09 19:29:37 +02:00
Alberto Cabrera Pérez	17512a94d6	sycl : implementation of reordered Q4_0 MMVQ for Intel GPUs (#12858 ) * sycl : Implemented reorder Q4_0 mmvq Signed-off-by: Alberto Cabrera <alberto.cabrera@codeplay.com> * sycl : Fixed mmvq being called when reorder is disabled * sycl : Improved comments in the quants header Signed-off-by: Alberto Cabrera <alberto.cabrera@codeplay.com> * Use static_assert * safe_div -> ceil_div * Clarify qi comment * change the reorder tensor from init to execute OP * dbg * Undo changes to test-backend-ops * Refactor changes on top of q4_0 reorder fix * Missing Reverts * Refactored opt_for_reorder logic to simplify code path * Explicit inlining and unroll * Renamed mul_mat_algo enum for consistency --------- Signed-off-by: Alberto Cabrera <alberto.cabrera@codeplay.com> Co-authored-by: romain.biessy <romain.biessy@codeplay.com>	2025-05-09 16:34:08 +01:00
Georgi Gerganov	611aa914ef	metal : optimize MoE for large batches (#13388 ) ggml-ci	2025-05-09 15:14:56 +03:00
Johannes Gäßler	0cf6725e9f	CUDA: FA support for Deepseek (Ampere or newer) (#13306 ) * CUDA: FA support for Deepseek (Ampere or newer) * do loop unrolling via C++ template	2025-05-09 13:34:58 +02:00
Diego Devesa	27ebfcacba	llama : do not crash if there is no CPU backend (#13395 ) * llama : do not crash if there is no CPU backend * add checks to examples	2025-05-09 13:02:07 +02:00
Johannes Gäßler	5c86c9ed3e	CUDA: fix crash on large batch size for MoE models (#13384 )	2025-05-09 12:14:04 +02:00
Bartowski	efb8b47eda	imatrix : Add --parse-special for enabling parsing of special tokens in imatrix calculation (#13389 ) * Add --parse-special for enabling parsing of special tokens in imatrix calculation * whitespace	2025-05-09 11:53:58 +02:00
R0CKSTAR	0527771dd8	llama-run: add support for downloading models from ModelScope (#13370 ) Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>	2025-05-09 10:25:50 +01:00
Xuan-Son Nguyen	2189fd3b63	mtmd : fix batch_view for m-rope (#13397 ) * mtmd : fix batch_view for m-rope * nits : fix comment	2025-05-09 11:18:02 +02:00
Xuan-Son Nguyen	3f96aeff39	llama : one-off chat template fix for Mistral-Small-2503 (#13398 ) * llama : one-off chat template fix for Mistral-Small-2503 * update readme * add mistral-v7-tekken	2025-05-09 11:17:51 +02:00
Radoslav Gerganov	b486ba05bf	rpc : add rpc_msg_set_tensor_hash_req (#13353 ) * rpc : add rpc_msg_set_tensor_hash_req Use a dedicated struct for the request of RPC_CMD_SET_TENSOR_HASH which makes the code cleaner. * fix	2025-05-09 10:31:07 +03:00
Jeff Bolz	02115dcd9a	vulkan: Allow up to 4096 elements for mul_mat_id row_ids (#13326 ) This assert fired running Qwen_Qwen3-30B-A3B-Q2_K.gguf: GGML_ASSERT(nei0 * nei1 <= 3072); The tensor is 8 x 512. Increase this array size to accommodate.	2025-05-09 09:23:41 +02:00
Xuan-Son Nguyen	d9c4accaff	server : (webui) rename has_multimodal --> modalities (#13393 ) * server : (webui) rename has_multimodal --> modalities * allow converting SVG to PNG * less complicated code	2025-05-09 09:06:37 +02:00
Diego Devesa	15e03282bb	ci : limit write permission to only the release step + fixes (#13392 ) * ci : limit write permission to only the release step * fix win cuda file name * fix license file copy on multi-config generators	2025-05-08 23:45:22 +02:00
Matt Clayton	f05a6d71a0	mtmd : Expose helper_decode_image_chunk (#13366 ) * mtmd: Expose helper_decode_image, output_embd_copy, image_tokens_copy/free * Slim down * Cleanups	2025-05-08 20:25:39 +02:00
Xuan-Son Nguyen	ee01d71e58	server : (webui) fix a very small misalignment (#13387 ) * server : (webui) fix a very small misalignment * restore font-bold	2025-05-08 18:51:45 +02:00