mirror of https://github.com/ggerganov/llama.cpp.git
synced 2026-04-23 16:37:33 +03:00

Compare commits: b6966 ... compilade/ (6 commits)

Commits:
- 93fbd407f3
- 0d5cfed596
- adec43d774
- 899398277d
- 1ae6ab7601
- de12f8ac50
.devops/cpu.Dockerfile

@@ -24,9 +24,8 @@ RUN --mount=type=cache,target=/root/.ccache \
        -DCMAKE_C_COMPILER_LAUNCHER=ccache \
        -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
        -DLLAMA_BUILD_TESTS=OFF \
        -DGGML_BACKEND_DL=OFF \
        -DGGML_NATIVE=OFF \
        -DGGML_BACKEND_DL=ON \
        -DGGML_CPU_ALL_VARIANTS=ON \
        -DGGML_BLAS=ON \
        -DGGML_BLAS_VENDOR=OpenBLAS && \
    cmake --build build --config Release -j $(nproc) && \

@@ -104,7 +103,6 @@ FROM base AS light
WORKDIR /llama.cpp/bin

# Copy llama.cpp binaries and libraries
COPY --from=collector /llama.cpp/bin/*.so /llama.cpp/bin
COPY --from=collector /llama.cpp/bin/llama-cli /llama.cpp/bin

ENTRYPOINT [ "/llama.cpp/bin/llama-cli" ]

@@ -118,7 +116,6 @@ ENV LLAMA_ARG_HOST=0.0.0.0
WORKDIR /llama.cpp/bin

# Copy llama.cpp binaries and libraries
COPY --from=collector /llama.cpp/bin/*.so /llama.cpp/bin
COPY --from=collector /llama.cpp/bin/llama-server /llama.cpp/bin

EXPOSE 8080
.github/labeler.yml (vendored, 4 changes)

@@ -76,10 +76,6 @@ ggml:
  - changed-files:
      - any-glob-to-any-file:
          - ggml/**
model:
  - changed-files:
      - any-glob-to-any-file:
          - src/models/**
nix:
  - changed-files:
      - any-glob-to-any-file:
.github/workflows/build-linux-cross.yml (vendored, 74 changes)

@@ -4,49 +4,49 @@ on:
  workflow_call:

jobs:
# ubuntu-24-riscv64-cpu-cross:
#   runs-on: ubuntu-24.04
  ubuntu-24-riscv64-cpu-cross:
    runs-on: ubuntu-24.04

#   steps:
#     - uses: actions/checkout@v4
#     - name: Setup Riscv
#       run: |
#         sudo dpkg --add-architecture riscv64
    steps:
      - uses: actions/checkout@v4
      - name: Setup Riscv
        run: |
          sudo dpkg --add-architecture riscv64

#         # Add arch-specific repositories for non-amd64 architectures
#         cat << EOF | sudo tee /etc/apt/sources.list.d/riscv64-ports.list
#         deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
#         deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
#         deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
#         deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
#         EOF
          # Add arch-specific repositories for non-amd64 architectures
          cat << EOF | sudo tee /etc/apt/sources.list.d/riscv64-ports.list
          deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble main universe
          deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-updates main universe
          deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-security main universe
          deb [arch=riscv64] http://ports.ubuntu.com/ubuntu-ports/ noble-backports main universe
          EOF

#         sudo apt-get update || true ;# Prevent failure due to missing URLs.
          sudo apt-get update || true ;# Prevent failure due to missing URLs.

#         sudo apt-get install -y --no-install-recommends \
#                 build-essential \
#                 gcc-14-riscv64-linux-gnu \
#                 g++-14-riscv64-linux-gnu
          sudo apt-get install -y --no-install-recommends \
                  build-essential \
                  gcc-14-riscv64-linux-gnu \
                  g++-14-riscv64-linux-gnu

#     - name: Build
#       run: |
#         cmake -B build -DLLAMA_CURL=OFF \
#                 -DCMAKE_BUILD_TYPE=Release \
#                 -DGGML_OPENMP=OFF \
#                 -DLLAMA_BUILD_EXAMPLES=ON \
#                 -DLLAMA_BUILD_TOOLS=ON \
#                 -DLLAMA_BUILD_TESTS=OFF \
#                 -DCMAKE_SYSTEM_NAME=Linux \
#                 -DCMAKE_SYSTEM_PROCESSOR=riscv64 \
#                 -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
#                 -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
#                 -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
#                 -DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \
#                 -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
#                 -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
#                 -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
      - name: Build
        run: |
          cmake -B build -DLLAMA_CURL=OFF \
                  -DCMAKE_BUILD_TYPE=Release \
                  -DGGML_OPENMP=OFF \
                  -DLLAMA_BUILD_EXAMPLES=ON \
                  -DLLAMA_BUILD_TOOLS=ON \
                  -DLLAMA_BUILD_TESTS=OFF \
                  -DCMAKE_SYSTEM_NAME=Linux \
                  -DCMAKE_SYSTEM_PROCESSOR=riscv64 \
                  -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
                  -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
                  -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
                  -DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \
                  -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
                  -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
                  -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH

#         cmake --build build --config Release -j $(nproc)
          cmake --build build --config Release -j $(nproc)

# ubuntu-24-riscv64-vulkan-cross:
#   runs-on: ubuntu-24.04
.github/workflows/docker.yml (vendored, 2 changes)

@@ -40,7 +40,7 @@ jobs:
          # https://github.com/ggml-org/llama.cpp/issues/11888
          #- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: false }
          - { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" }
          - { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-22.04" }
          - { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" }
          - { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-22.04" }
          - { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true, runs_on: "ubuntu-22.04" }
          - { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false, runs_on: "ubuntu-22.04" }
.github/workflows/release.yml (vendored, 4 changes)

@@ -134,8 +134,8 @@ jobs:
        include:
          - build: 'x64'
            os: ubuntu-22.04
          - build: 's390x'
            os: ubuntu-24.04-s390x
          - build: 's390x-z15' # z15 because our CI runners are on z15
            os: ubuntu-22.04-s390x
          # GGML_BACKEND_DL and GGML_CPU_ALL_VARIANTS are not currently supported on arm
          # - build: 'arm64'
          #   os: ubuntu-22.04-arm
CODEOWNERS

@@ -65,7 +65,7 @@
/ggml/src/ggml-impl.h @ggerganov @slaren
/ggml/src/ggml-metal/ @ggerganov
/ggml/src/ggml-opencl/ @lhez @max-krasnyansky
/ggml/src/ggml-hexagon/ @max-krasnyansky @lhez
/ggml/src/ggml-hexagon/ @max-krasnyansky
/ggml/src/ggml-opt.cpp @JohannesGaessler
/ggml/src/ggml-quants.* @ggerganov
/ggml/src/ggml-rpc/ @rgerganov

@@ -89,7 +89,6 @@
/src/llama-model-loader.* @slaren
/src/llama-model.* @CISC
/src/llama-vocab.* @CISC
/src/models/ @CISC
/tests/ @ggerganov
/tests/test-backend-ops.cpp @slaren
/tests/test-thread-safety.cpp @slaren
README.md

@@ -17,13 +17,14 @@ LLM inference in C/C++

## Hot topics

- **[guide : using the new WebUI of llama.cpp](https://github.com/ggml-org/llama.cpp/discussions/16938)**
- [guide : running gpt-oss with llama.cpp](https://github.com/ggml-org/llama.cpp/discussions/15396)
- [[FEEDBACK] Better packaging for llama.cpp to support downstream consumers 🤗](https://github.com/ggml-org/llama.cpp/discussions/15313)
- **[guide : running gpt-oss with llama.cpp](https://github.com/ggml-org/llama.cpp/discussions/15396)**
- **[[FEEDBACK] Better packaging for llama.cpp to support downstream consumers 🤗](https://github.com/ggml-org/llama.cpp/discussions/15313)**
- Support for the `gpt-oss` model with native MXFP4 format has been added | [PR](https://github.com/ggml-org/llama.cpp/pull/15091) | [Collaboration with NVIDIA](https://blogs.nvidia.com/blog/rtx-ai-garage-openai-oss) | [Comment](https://github.com/ggml-org/llama.cpp/discussions/15095)
- Hot PRs: [All](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Apr+label%3Ahot+) | [Open](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Apr+label%3Ahot+is%3Aopen)
- Multimodal support arrived in `llama-server`: [#12898](https://github.com/ggml-org/llama.cpp/pull/12898) | [documentation](./docs/multimodal.md)
- VS Code extension for FIM completions: https://github.com/ggml-org/llama.vscode
- Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim
- Introducing GGUF-my-LoRA https://github.com/ggml-org/llama.cpp/discussions/10123
- Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggml-org/llama.cpp/discussions/9669
- Hugging Face GGUF editor: [discussion](https://github.com/ggml-org/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor)

@@ -83,7 +84,6 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
- [X] [Mistral 7B](https://huggingface.co/mistralai/Mistral-7B-v0.1)
- [x] [Mixtral MoE](https://huggingface.co/models?search=mistral-ai/Mixtral)
- [x] [DBRX](https://huggingface.co/databricks/dbrx-instruct)
- [x] [Jamba](https://huggingface.co/ai21labs)
- [X] [Falcon](https://huggingface.co/models?search=tiiuae/falcon)
- [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) and [Chinese LLaMA-2 / Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2)
- [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne)
common/arg.cpp

@@ -2030,7 +2030,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                params.system_prompt.pop_back();
            }
        }
    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_DIFFUSION}));
    ).set_examples({LLAMA_EXAMPLE_MAIN}));
    add_opt(common_arg(
        {"--in-file"}, "FNAME",
        "an input file (repeat to specify multiple files)",

@@ -2768,20 +2768,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.image.emplace_back(value);
        }
    ).set_examples({LLAMA_EXAMPLE_MTMD}));
    add_opt(common_arg(
        {"--image-min-tokens"}, "N",
        "minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)",
        [](common_params & params, int value) {
            params.image_min_tokens = value;
        }
    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_IMAGE_MIN_TOKENS"));
    add_opt(common_arg(
        {"--image-max-tokens"}, "N",
        "maximum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)",
        [](common_params & params, int value) {
            params.image_max_tokens = value;
        }
    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_IMAGE_MAX_TOKENS"));
    if (llama_supports_rpc()) {
        add_opt(common_arg(
            {"--rpc"}, "SERVERS",

@@ -3217,7 +3203,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
    ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
    add_opt(common_arg(
        {"--parse-special"},
        string_format("parse special tokens (chat, tool, etc) (default: %s)", params.parse_special ? "true" : "false"),
        string_format("prase special tokens (chat, tool, etc) (default: %s)", params.parse_special ? "true" : "false"),
        [](common_params & params) {
            params.parse_special = true;
        }

@@ -3262,7 +3248,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
    ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
    add_opt(common_arg(
        {"--embd-output-format"}, "FORMAT",
        "empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix, \"raw\" = plain whitespace-delimited output (one embedding per line)",
        "empty = default, \"array\" = [[],[]...], \"json\" = openai style, \"json+\" = same \"json\" + cosine similarity matrix",
        [](common_params & params, const std::string & value) {
            params.embd_out = value;
        }
common/chat.cpp (217 changes)

@@ -9,11 +9,8 @@
#include <minja/chat-template.hpp>
#include <minja/minja.hpp>

#include <algorithm>
#include <cstdio>
#include <cctype>
#include <exception>
#include <functional>
#include <iostream>
#include <optional>
#include <stdexcept>

@@ -313,6 +310,7 @@ json common_chat_msgs_to_json_oaicompat(const std::vector<common_chat_msg> & msg
    }
    if (!msg.reasoning_content.empty()) {
        jmsg["reasoning_content"] = msg.reasoning_content;
        jmsg["thinking"] = msg.reasoning_content; // gpt-oss
    }
    if (!msg.tool_name.empty()) {
        jmsg["name"] = msg.tool_name;

@@ -642,7 +640,6 @@ const char * common_chat_format_name(common_chat_format format) {
        case COMMON_CHAT_FORMAT_SEED_OSS: return "Seed-OSS";
        case COMMON_CHAT_FORMAT_NEMOTRON_V2: return "Nemotron V2";
        case COMMON_CHAT_FORMAT_APERTUS: return "Apertus";
        case COMMON_CHAT_FORMAT_LFM2_WITH_JSON_TOOLS: return "LFM2 with JSON tools";
        default:
            throw std::runtime_error("Unknown chat format");
    }

@@ -989,126 +986,6 @@ static common_chat_params common_chat_params_init_mistral_nemo(const common_chat
    return data;
}

// Case-insensitive find
static size_t ifind_string(const std::string & haystack, const std::string & needle, size_t pos = 0) {
    auto it = std::search(
        haystack.begin() + pos, haystack.end(),
        needle.begin(), needle.end(),
        [](char a, char b) { return std::tolower(a) == std::tolower(b); }
    );
    return (it == haystack.end()) ? std::string::npos : std::distance(haystack.begin(), it);
}

static common_chat_params common_chat_params_init_lfm2(const common_chat_template & tmpl, const struct templates_params & inputs) {
    common_chat_params data;
    const auto is_json_schema_provided = !inputs.json_schema.is_null();
    const auto is_grammar_provided = !inputs.grammar.empty();
    const auto are_tools_provided = inputs.tools.is_array() && !inputs.tools.empty();

    // the logic requires potentially modifying the messages
    auto tweaked_messages = inputs.messages;

    auto replace_json_schema_marker = [](json & messages) -> bool {
        static std::string marker1 = "force json schema.\n";
        static std::string marker2 = "force json schema.";

        if (messages.empty() || messages.at(0).at("role") != "system") {
            return false;
        }

        std::string content = messages.at(0).at("content");

        for (const auto & marker : {marker1, marker2}) {
            const auto pos = ifind_string(content, marker);
            if (pos != std::string::npos) {
                content.replace(pos, marker.length(), "");
                // inject modified content back into the messages
                messages.at(0).at("content") = content;
                return true;
            }
        }

        return false;
    };

    // Lfm2 model does not natively work with json, but can generally understand the tools structure
    //
    // Example of the pytorch dialog structure:
    // <|startoftext|><|im_start|>system
    // List of tools: <|tool_list_start|>[{"name": "get_candidate_status", "description": "Retrieves the current status of a candidate in the recruitment process", "parameters": {"type": "object", "properties": {"candidate_id": {"type": "string", "description": "Unique identifier for the candidate"}}, "required": ["candidate_id"]}}]<|tool_list_end|><|im_end|>
    // <|im_start|>user
    // What is the current status of candidate ID 12345?<|im_end|>
    // <|im_start|>assistant
    // <|tool_call_start|>[get_candidate_status(candidate_id="12345")]<|tool_call_end|>Checking the current status of candidate ID 12345.<|im_end|>
    // <|im_start|>tool
    // <|tool_response_start|>{"candidate_id": "12345", "status": "Interview Scheduled", "position": "Clinical Research Associate", "date": "2023-11-20"}<|tool_response_end|><|im_end|>
    // <|im_start|>assistant
    // The candidate with ID 12345 is currently in the "Interview Scheduled" stage for the position of Clinical Research Associate, with an interview date set for 2023-11-20.<|im_end|>
    //
    // For the llama server compatibility with json tools semantic,
    // the client can add "Follow json schema." line into the system message prompt to force the json output.
    //
    if (are_tools_provided && (is_json_schema_provided || is_grammar_provided)) {
        // server/utils.hpp prohibits that branch for the custom grammar anyways
        throw std::runtime_error("Tools call must not use \"json_schema\" or \"grammar\", use non-tool invocation if you want to use custom grammar");
    } else if (are_tools_provided && replace_json_schema_marker(tweaked_messages)) {
        LOG_INF("%s: Using tools to build a grammar\n", __func__);

        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
            auto schemas = json::array();
            foreach_function(inputs.tools, [&](const json & tool) {
                const auto & function = tool.at("function");
                schemas.push_back({
                    {"type", "object"},
                    {"properties", {
                        {"name", {
                            {"type", "string"},
                            {"const", function.at("name")},
                        }},
                        {"arguments", function.at("parameters")},
                    }},
                    {"required", json::array({"name", "arguments", "id"})},
                });
            });
            auto schema = json {
                {"type", "array"},
                {"items", schemas.size() == 1 ? schemas[0] : json {{"anyOf", schemas}}},
                {"minItems", 1},
            };
            if (!inputs.parallel_tool_calls) {
                schema["maxItems"] = 1;
            }

            builder.add_rule("root", "\"<|tool_call_start|>\"" + builder.add_schema("tool_calls", schema) + "\"<|tool_call_end|>\"");
        });
        // model has no concept of tool selection mode choice,
        // if the system prompt rendered correctly it will produce a tool call
        // the grammar goes inside the tool call body
        data.grammar_lazy = true;
        data.grammar_triggers = {{COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL, "\\s*<\\|tool_call_start\\|>\\s*\\["}};
        data.preserved_tokens = {"<|tool_call_start|>", "<|tool_call_end|>"};
        data.format = COMMON_CHAT_FORMAT_LFM2_WITH_JSON_TOOLS;
    } else if (are_tools_provided && (!is_json_schema_provided && !is_grammar_provided)) {
        LOG_INF("%s: Using tools without json schema or grammar\n", __func__);
        // output those tokens
        data.preserved_tokens = {"<|tool_call_start|>", "<|tool_call_end|>"};
    } else if (is_json_schema_provided) {
        LOG_INF("%s: Using provided json schema to build a grammar\n", __func__);
        data.grammar = json_schema_to_grammar(inputs.json_schema);
    } else if (is_grammar_provided) {
        LOG_INF("%s: Using provided grammar\n", __func__);
        data.grammar = inputs.grammar;
    } else {
        LOG_INF("%s: Using content relying on the template\n", __func__);
    }

    data.prompt = apply(tmpl, inputs, /* messages_override= */ tweaked_messages);
    LOG_DBG("%s: Prompt: %s\n", __func__, data.prompt.c_str());

    return data;
}
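The grammar construction above is easier to follow with the schema written out. Below is a hypothetical Python sketch (not part of the diff; `build_tool_call_schema` is an invented helper) of the tool-call schema that `common_chat_params_init_lfm2` assembles with nlohmann::json:

```python
# Hypothetical sketch: build the same per-tool JSON schema that the C++
# code above constructs, using plain dicts.
import json

def build_tool_call_schema(tools: list[dict], parallel_tool_calls: bool) -> dict:
    schemas = []
    for tool in tools:
        function = tool["function"]
        schemas.append({
            "type": "object",
            "properties": {
                "name": {"type": "string", "const": function["name"]},
                "arguments": function["parameters"],
            },
            "required": ["name", "arguments", "id"],
        })
    schema = {
        "type": "array",
        "items": schemas[0] if len(schemas) == 1 else {"anyOf": schemas},
        "minItems": 1,
    }
    if not parallel_tool_calls:
        # without parallel tool calls, at most one call is allowed
        schema["maxItems"] = 1
    return schema

tools = [{"function": {"name": "get_candidate_status",
                       "parameters": {"type": "object",
                                      "properties": {"candidate_id": {"type": "string"}},
                                      "required": ["candidate_id"]}}}]
print(json.dumps(build_tool_call_schema(tools, parallel_tool_calls=False), indent=2))
```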
static common_chat_params common_chat_params_init_magistral(const common_chat_template & tmpl, const struct templates_params & inputs) {
    common_chat_params data;
    data.prompt = apply(tmpl, inputs);

@@ -1809,23 +1686,7 @@ static void common_chat_parse_deepseek_v3_1(common_chat_msg_parser & builder) {

static common_chat_params common_chat_params_init_gpt_oss(const common_chat_template & tmpl, const struct templates_params & inputs) {
    common_chat_params data;

    // Copy reasoning to the "thinking" field as expected by the gpt-oss template
    auto adjusted_messages = json::array();
    for (const auto & msg : inputs.messages) {
        auto has_reasoning_content = msg.contains("reasoning_content") && msg.at("reasoning_content").is_string();
        auto has_tool_calls = msg.contains("tool_calls") && msg.at("tool_calls").is_array();

        if (has_reasoning_content && has_tool_calls) {
            auto adjusted_message = msg;
            adjusted_message["thinking"] = msg.at("reasoning_content");
            adjusted_messages.push_back(adjusted_message);
        } else {
            adjusted_messages.push_back(msg);
        }
    }

    auto prompt = apply(tmpl, inputs, /* messages_override= */ adjusted_messages);
    auto prompt = apply(tmpl, inputs);

    // Check if we need to replace the return token with end token during
    // inference and without generation prompt. For more details see:
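The message adjustment in this hunk is a small transform worth seeing in isolation. A minimal Python sketch (hypothetical, not the project's code) of the same logic: copy `reasoning_content` into the `thinking` field that the gpt-oss template expects, but only for messages that also carry tool calls.

```python
# Hypothetical sketch of the gpt-oss message adjustment above.
def adjust_messages(messages: list[dict]) -> list[dict]:
    adjusted = []
    for msg in messages:
        has_reasoning = isinstance(msg.get("reasoning_content"), str)
        has_tool_calls = isinstance(msg.get("tool_calls"), list)
        if has_reasoning and has_tool_calls:
            # shallow copy; add "thinking" alongside the original fields
            msg = {**msg, "thinking": msg["reasoning_content"]}
        adjusted.append(msg)
    return adjusted

msgs = [{"role": "assistant", "reasoning_content": "check the tool",
         "tool_calls": [{"name": "f"}]}]
assert adjust_messages(msgs)[0]["thinking"] == "check the tool"
```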
@@ -2638,71 +2499,6 @@ static void common_chat_parse_apertus(common_chat_msg_parser & builder) {
    builder.add_content(builder.consume_rest());
}

static void common_chat_parse_lfm2(common_chat_msg_parser & builder) {
    if (!builder.syntax().parse_tool_calls) {
        builder.add_content(builder.consume_rest());
        return;
    }

    // LFM2 format: <|tool_call_start|>[{"name": "get_current_time", "arguments": {"location": "Paris"}}]<|tool_call_end|>
    static const common_regex tool_call_start_regex(regex_escape("<|tool_call_start|>"));
    static const common_regex tool_call_end_regex(regex_escape("<|tool_call_end|>"));

    // Loop through all tool calls
    while (auto res = builder.try_find_regex(tool_call_start_regex, std::string::npos, /* add_prelude_to_content= */ true)) {
        builder.move_to(res->groups[0].end);

        // Parse JSON array format: [{"name": "...", "arguments": {...}}]
        auto tool_calls_data = builder.consume_json();

        // Consume end marker
        builder.consume_spaces();
        if (!builder.try_consume_regex(tool_call_end_regex)) {
            throw common_chat_msg_partial_exception("Expected <|tool_call_end|>");
        }

        // Process each tool call in the array
        if (tool_calls_data.json.is_array()) {
            for (const auto & tool_call : tool_calls_data.json) {
                if (!tool_call.is_object()) {
                    throw common_chat_msg_partial_exception("Tool call must be an object");
                }

                if (!tool_call.contains("name")) {
                    throw common_chat_msg_partial_exception("Tool call missing 'name' field");
                }

                std::string function_name = tool_call.at("name");
                std::string arguments = "{}";

                if (tool_call.contains("arguments")) {
                    if (tool_call.at("arguments").is_object()) {
                        arguments = tool_call.at("arguments").dump();
                    } else if (tool_call.at("arguments").is_string()) {
                        arguments = tool_call.at("arguments");
                    }
                }

                if (!builder.add_tool_call(function_name, "", arguments)) {
                    throw common_chat_msg_partial_exception("Incomplete tool call");
                }
            }
        } else {
            throw common_chat_msg_partial_exception("Expected JSON array for tool calls");
        }

        // Consume any trailing whitespace after this tool call
        builder.consume_spaces();
    }

    // Consume any remaining content after all tool calls
    auto remaining = builder.consume_rest();
    if (!string_strip(remaining).empty()) {
        builder.add_content(remaining);
    }
}
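For readers who want to experiment with the LFM2 output format without the incremental parser machinery, here is a hypothetical Python sketch (regex-based, not the project's streaming parser) that extracts every `<|tool_call_start|>[...]<|tool_call_end|>` block and normalizes each entry to a `(name, arguments-as-JSON-string)` pair:

```python
# Hypothetical sketch of LFM2 tool-call extraction (non-streaming).
import json
import re

TOOL_CALL_RE = re.compile(r"<\|tool_call_start\|>(.*?)<\|tool_call_end\|>", re.DOTALL)

def parse_lfm2_tool_calls(text: str) -> list[tuple[str, str]]:
    calls = []
    for block in TOOL_CALL_RE.findall(text):
        data = json.loads(block)
        if not isinstance(data, list):
            raise ValueError("Expected JSON array for tool calls")
        for call in data:
            if not isinstance(call, dict) or "name" not in call:
                raise ValueError("Tool call missing 'name' field")
            args = call.get("arguments", {})
            # arguments may already be a JSON string, or an object to dump
            args = args if isinstance(args, str) else json.dumps(args)
            calls.append((call["name"], args))
    return calls

out = '<|tool_call_start|>[{"name": "get_current_time", "arguments": {"location": "Paris"}}]<|tool_call_end|>'
assert parse_lfm2_tool_calls(out) == [("get_current_time", '{"location": "Paris"}')]
```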
static void common_chat_parse_seed_oss(common_chat_msg_parser & builder) {
    // Parse thinking tags first - this handles the main reasoning content
    builder.try_parse_reasoning("<seed:think>", "</seed:think>");

@@ -2952,12 +2748,6 @@ static common_chat_params common_chat_templates_apply_jinja(
        return common_chat_params_init_apertus(tmpl, params);
    }

    // LFM2 (w/ tools)
    if (src.find("List of tools: <|tool_list_start|>[") != std::string::npos &&
        src.find("]<|tool_list_end|>") != std::string::npos) {
        return common_chat_params_init_lfm2(tmpl, params);
    }

    // Use generic handler when mixing tools + JSON schema.
    // TODO: support that mix in handlers below.
    if ((params.tools.is_array() && params.json_schema.is_object())) {

@@ -3136,9 +2926,6 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
        case COMMON_CHAT_FORMAT_APERTUS:
            common_chat_parse_apertus(builder);
            break;
        case COMMON_CHAT_FORMAT_LFM2_WITH_JSON_TOOLS:
            common_chat_parse_lfm2(builder);
            break;
        default:
            throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(builder.syntax().format));
    }
common/chat.h

@@ -116,7 +116,6 @@ enum common_chat_format {
    COMMON_CHAT_FORMAT_SEED_OSS,
    COMMON_CHAT_FORMAT_NEMOTRON_V2,
    COMMON_CHAT_FORMAT_APERTUS,
    COMMON_CHAT_FORMAT_LFM2_WITH_JSON_TOOLS,

    COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
};
common/common.h

@@ -406,8 +406,6 @@ struct common_params {
    bool mmproj_use_gpu = true;     // use GPU for multimodal model
    bool no_mmproj = false;         // explicitly disable multimodal model
    std::vector<std::string> image; // path to image file(s)
    int image_min_tokens = -1;
    int image_max_tokens = -1;

    // finetune
    struct lr_opt lr;

@@ -507,10 +505,6 @@ struct common_params {
    // return false from callback to abort model loading or true to continue
    llama_progress_callback load_progress_callback = NULL;
    void * load_progress_callback_user_data = NULL;

    bool has_speculative() const {
        return !speculative.model.path.empty() || !speculative.model.hf_repo.empty();
    }
};

// call once at the start of a program if it uses libcommon
common/json-schema-to-grammar.cpp

@@ -601,10 +601,7 @@ private:
    }

    std::string _resolve_ref(const std::string & ref) {
        auto it = ref.find('#');
        std::string ref_fragment = it != std::string::npos ? ref.substr(it + 1) : ref;
        static const std::regex nonalphanumeric_regex(R"([^a-zA-Z0-9-]+)");
        std::string ref_name = "ref" + std::regex_replace(ref_fragment, nonalphanumeric_regex, "-");
        std::string ref_name = ref.substr(ref.find_last_of('/') + 1);
        if (_rules.find(ref_name) == _rules.end() && _refs_being_resolved.find(ref) == _refs_being_resolved.end()) {
            _refs_being_resolved.insert(ref);
            json resolved = _refs[ref];

@@ -777,24 +774,11 @@ public:
            std::vector<std::string> tokens = string_split(pointer, "/");
            for (size_t i = 1; i < tokens.size(); ++i) {
                std::string sel = tokens[i];
                if (target.is_object() && target.contains(sel)) {
                    target = target[sel];
                } else if (target.is_array()) {
                    size_t sel_index;
                    try {
                        sel_index = std::stoul(sel);
                    } catch (const std::invalid_argument & e) {
                        sel_index = target.size();
                    }
                    if (sel_index >= target.size()) {
                        _errors.push_back("Error resolving ref " + ref + ": " + sel + " not in " + target.dump());
                        return;
                    }
                    target = target[sel_index];
                } else {
                if (target.is_null() || !target.contains(sel)) {
                    _errors.push_back("Error resolving ref " + ref + ": " + sel + " not in " + target.dump());
                    return;
                }
                target = target[sel];
            }
            _refs[ref] = target;
        }
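The JSON-pointer walk in the hunk above follows a standard pattern. A hypothetical Python sketch (invented helper, not the project's code) of the same resolution loop, indexing objects by key and arrays by integer position:

```python
# Hypothetical sketch of the "$ref" JSON-pointer walk: split on '/',
# skip the leading fragment marker, then descend step by step.
def resolve_pointer(doc, pointer: str):
    target = doc
    for sel in pointer.split("/")[1:]:
        if isinstance(target, dict) and sel in target:
            target = target[sel]
        elif isinstance(target, list):
            try:
                idx = int(sel)
            except ValueError:
                idx = len(target)  # force the out-of-range error below
            if idx >= len(target):
                raise KeyError(f"Error resolving ref: {sel} not in {target!r}")
            target = target[idx]
        else:
            raise KeyError(f"Error resolving ref: {sel} not in {target!r}")
    return target

doc = {"definitions": {"foo": [{"type": "string"}]}}
assert resolve_pointer(doc, "#/definitions/foo/0") == {"type": "string"}
```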
convert_hf_to_gguf.py

@@ -742,12 +742,6 @@ class TextModel(ModelBase):
        if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None:
            self.gguf_writer.add_expert_used_count(n_experts_used)
            logger.info(f"gguf: experts used count = {n_experts_used}")
        if (n_expert_groups := self.hparams.get("n_group")) is not None:
            self.gguf_writer.add_expert_group_count(n_expert_groups)
            logger.info(f"gguf: expert groups count = {n_expert_groups}")
        if (n_group_used := self.hparams.get("topk_group")) is not None:
            self.gguf_writer.add_expert_group_used_count(n_group_used)
            logger.info(f"gguf: expert groups used count = {n_group_used}")

        if (head_dim := self.hparams.get("head_dim")) is not None:
            self.gguf_writer.add_key_length(head_dim)

@@ -1054,9 +1048,6 @@ class TextModel(ModelBase):
        if chkhsh == "53e325976a6e142379c19b09afcae354f2f496f147afa8f9e189a33fe4e3024e":
            # ref: https://huggingface.co/ibm-granite/granite-docling-258M
            res = "granite-docling"
        if chkhsh == "f4f37b6c8eb9ea29b3eac6bb8c8487c5ab7885f8d8022e67edc1c68ce8403e95":
            # ref: https://huggingface.co/MiniMaxAI/MiniMax-M2
            res = "minimax-m2"

        if res is None:
            logger.warning("\n")

@@ -1506,17 +1497,6 @@ class MmprojModel(ModelBase):
    def set_type(self):
        self.gguf_writer.add_type(gguf.GGUFType.MMPROJ)

    def prepare_metadata(self, vocab_only: bool):
        super().prepare_metadata(vocab_only=vocab_only)

        output_type: str = self.ftype.name.partition("_")[2]

        if self.fname_out.is_dir():
            fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, size_label=None, output_type=output_type, model_type=None)
            self.fname_out = self.fname_out / f"mmproj-{fname_default}.gguf"
        else:
            self.fname_out = self.fname_out.parent / gguf.fill_templated_filename(self.fname_out.name, output_type)

    def set_gguf_parameters(self):
        self.gguf_writer.add_file_type(self.ftype)

@@ -1531,7 +1511,7 @@ class MmprojModel(ModelBase):
        self.gguf_writer.add_vision_embedding_length(self.find_vparam(["hidden_size"]))
        self.gguf_writer.add_vision_feed_forward_length(self.find_vparam(["intermediate_size"]))
        self.gguf_writer.add_vision_block_count(self.find_vparam(self.n_block_keys))
        self.gguf_writer.add_vision_head_count(self.find_vparam(["num_attention_heads", "num_heads"]))
        self.gguf_writer.add_vision_head_count(self.find_vparam(["num_attention_heads"]))

        # preprocessor config
        image_mean = _MISTRAL_COMMON_DATASET_MEAN if self.is_mistral_format else self.preprocessor_config["image_mean"]

@@ -2463,21 +2443,18 @@ class ArceeModel(LlamaModel):
)
class LlavaVisionModel(MmprojModel):
    img_break_tok_id = -1
    use_break_tok = True

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        if self.hparams.get("model_type") == "pixtral":
            # layer_norm_eps is not in config.json, it is hard-coded in modeling_pixtral.py
            self.hparams["layer_norm_eps"] = self.hparams.get("layer_norm_eps", 1e-5)
            if self.use_break_tok:
                self.img_break_tok_id = self.get_token_id("[IMG_BREAK]")
            self.img_break_tok_id = self.get_token_id("[IMG_BREAK]")
        elif self.is_mistral_format:
            # hparams is already vision config here so norm_eps is only defined in global_config.
            self.hparams["norm_eps"] = self.global_config.get("norm_eps", None)
            assert self.hparams["norm_eps"] is not None, "norm_eps not found in params.json"
            if self.use_break_tok:
                self.img_break_tok_id = self.find_vparam(["image_break_token_id"])
            self.img_break_tok_id = self.find_vparam(["image_break_token_id"])
        else:
            raise ValueError(f"Unsupported model type: {self.hparams['model_type']}")
        logger.info(f"Image break token id: {self.img_break_tok_id}")

@@ -3855,43 +3832,7 @@ class Qwen2MoeModel(TextModel):
    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        # process the experts separately
        name = name.replace("language_model.", "") # InternVL

        # handle aggregated expert tensors
        # GGUF stores dimensions reversed from PyTorch, so:
        # PyTorch (A,B,C) -> GGUF writes [C,B,A] -> GGML reads ne={C,B,A}
        # Input shapes from HF: (n_expert, n_ff_exp, n_embd) or (n_expert, n_embd, n_ff_exp)
        # Expected GGML ne: {n_embd, n_ff_exp, n_expert} for gate/up, {n_ff_exp, n_embd, n_expert} for down
        if name.endswith("mlp.experts.down_proj") or name.endswith("mlp.experts.down_proj.weight"):
            mapped = f"{name}.weight" if not name.endswith(".weight") else name
            # Input: (n_expert=128, n_ff_exp=768, n_embd=2048)
            # Want GGML ne: {n_ff_exp, n_embd, n_expert} = {768, 2048, 128}
            # Need PyTorch: (128, 2048, 768) [reversed of GGML]
            # So: permute(0, 2, 1): (128, 768, 2048) -> (128, 2048, 768)
            permuted = data_torch.permute(0, 2, 1).contiguous()
            return [(self.map_tensor_name(mapped), permuted)]

        if name.endswith("mlp.experts.gate_up_proj") or name.endswith("mlp.experts.gate_up_proj.weight"):
            if data_torch.ndim < 3 or data_torch.shape[-1] % 2 != 0:
                raise ValueError(f"Unexpected gate_up_proj shape for {name}: {tuple(data_torch.shape)}")
            split_dim = data_torch.shape[-1] // 2
            gate = data_torch[..., :split_dim].contiguous()
            up = data_torch[..., split_dim:].contiguous()
            # Input gate/up: (n_expert=128, n_embd=2048, n_ff_exp=768)
            # Want GGML ne: {n_embd, n_ff_exp, n_expert} = {2048, 768, 128}
            # Need PyTorch: (128, 768, 2048) [reversed of GGML]
            # So: permute(0, 2, 1): (128, 2048, 768) -> (128, 768, 2048)
            base_name = name.removesuffix(".weight")
            base = base_name.rsplit('.', 1)[0]
            mapped_gate = f"{base}.gate_proj.weight"
            mapped_up = f"{base}.up_proj.weight"
            perm_gate = gate.permute(0, 2, 1).contiguous()
            perm_up = up.permute(0, 2, 1).contiguous()
            return [
                (self.map_tensor_name(mapped_gate), perm_gate),
                (self.map_tensor_name(mapped_up), perm_up),
            ]

        if name.startswith("mlp") or name.startswith("vision_model") or name.startswith("model.vision_tower") or name.startswith("model.multi_modal_projector") or name.startswith("model.visual"):
        if name.startswith("mlp") or name.startswith("vision_model") or name.startswith("model.vision_tower") or name.startswith("model.multi_modal_projector"):
            # skip visual tensors
            return []
        if name.find("experts") != -1:
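The dimension-reversal reasoning in the comments above is easy to verify numerically. A hypothetical shape check (not part of the diff; the sizes 128/768/2048 come from the comments themselves):

```python
# Hypothetical sketch: GGUF stores dimensions reversed from PyTorch, so a
# HF down_proj of (n_expert, n_ff_exp, n_embd) must be permuted to
# (n_expert, n_embd, n_ff_exp) for GGML to read ne = {n_ff_exp, n_embd, n_expert}.
import torch

n_expert, n_ff_exp, n_embd = 128, 768, 2048

down = torch.zeros(n_expert, n_ff_exp, n_embd)
assert down.permute(0, 2, 1).contiguous().shape == (n_expert, n_embd, n_ff_exp)

# gate_up is stored fused along the last axis: split, then permute each half
gate_up = torch.zeros(n_expert, n_embd, 2 * n_ff_exp)
split_dim = gate_up.shape[-1] // 2
gate, up = gate_up[..., :split_dim], gate_up[..., split_dim:]
assert gate.permute(0, 2, 1).shape == (n_expert, n_ff_exp, n_embd)
assert up.permute(0, 2, 1).shape == (n_expert, n_ff_exp, n_embd)
```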
@@ -4004,10 +3945,6 @@ class Qwen3Model(Qwen2Model):
        return torch.stack([true_row, false_row], dim=0)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        if "model.vision_" in name:
            # skip multimodal tensors
            return []

        if self.is_rerank:
            is_tied_head = self.is_tied_embeddings and "embed_tokens" in name
            is_real_head = not self.is_tied_embeddings and "lm_head" in name

@@ -4043,187 +3980,6 @@ class Qwen3MoeModel(Qwen2MoeModel):
        super().set_vocab()


@ModelBase.register("Qwen3VLForConditionalGeneration", "Qwen3VLMoeForConditionalGeneration")
class Qwen3VLVisionModel(MmprojModel):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        assert self.hparams_vision is not None
        # Compute image_size if not present
        if "image_size" not in self.hparams_vision:
            # For Qwen3VL/Qwen3VLMoe, compute from num_position_embeddings
            num_pos = self.hparams_vision.get("num_position_embeddings", 2304)
            patch_size = self.hparams_vision.get("patch_size", 16)
            # num_position_embeddings = (image_size / patch_size) ** 2
            # So image_size = sqrt(num_position_embeddings) * patch_size
            image_size = int(num_pos**0.5 * patch_size)
            self.hparams_vision["image_size"] = image_size

        # Rename config values for compatibility
        self.hparams_vision["num_attention_heads"] = self.hparams_vision.get("num_heads")
        self.hparams_vision["num_hidden_layers"] = self.hparams_vision.get("depth")

        self.is_deepstack_layers = [False] * int(self.hparams_vision["num_hidden_layers"] or 0)
        for idx in self.hparams_vision.get("deepstack_visual_indexes", []):
            self.is_deepstack_layers[idx] = True
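The image-size derivation above inverts the relation stated in the comment. A hypothetical one-liner check with the default values used in the code:

```python
# Hypothetical sketch: num_position_embeddings = (image_size / patch_size) ** 2,
# so image_size = sqrt(num_position_embeddings) * patch_size.
num_pos, patch_size = 2304, 16
image_size = int(num_pos ** 0.5 * patch_size)
assert image_size == 768  # 48 patches per side * 16 px per patch
```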
    def set_gguf_parameters(self):
        super().set_gguf_parameters()
        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN3VL)
        self.gguf_writer.add_vision_use_gelu(True)

        if self.hparams_vision is not None:
            merge_size = self.hparams_vision.get("spatial_merge_size")
            if merge_size is not None:
                self.gguf_writer.add_vision_spatial_merge_size(int(merge_size))

        # Use text config's rms_norm_eps for vision attention layernorm eps
        rms_norm_eps = self.global_config.get("text_config", {}).get("rms_norm_eps", 1e-6)
        self.gguf_writer.add_vision_attention_layernorm_eps(rms_norm_eps)

        if self.is_deepstack_layers:
            self.gguf_writer.add_vision_is_deepstack_layers(self.is_deepstack_layers)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        assert self.hparams_vision is not None
        # Skip text model tensors - they go in the text model file
        if name.startswith("model.language_model.") or name.startswith("lm_head."):
            return []

        if name.startswith("model.visual."):
            name = name.replace("model.visual.", "visual.", 1)

        if name.startswith("visual.deepstack_merger_list."):
            prefix, rest = name.split(".", maxsplit=3)[2:]
            # prefix is the layer index, convert to absolute clip layer index!
            idx = self.hparams_vision.get("deepstack_visual_indexes", [])[int(prefix)]
            target = rest

            tensor_type: gguf.MODEL_TENSOR
            if target.startswith("norm."):
                tensor_type = gguf.MODEL_TENSOR.V_DS_NORM
                suffix = target.split(".", 1)[1]
            elif target.startswith("linear_fc1."):
                tensor_type = gguf.MODEL_TENSOR.V_DS_FC1
                suffix = target.split(".", 1)[1]
            elif target.startswith("linear_fc2."):
                tensor_type = gguf.MODEL_TENSOR.V_DS_FC2
                suffix = target.split(".", 1)[1]
            else:
                raise ValueError(f"Unexpected deepstack tensor: {name}")

            new_name = self.format_tensor_name(tensor_type, idx, suffix=f".{suffix}")
            return [(new_name, data_torch)]

        if name.startswith("visual.merger."):
            suffix = name.split(".", 2)[2]
            if suffix.startswith("linear_fc"):
                fc_idx_str, tail = suffix.split(".", 1)
                fc_num = int(fc_idx_str.replace("linear_fc", ""))
                # Qwen3VL has linear_fc1 and linear_fc2
                # Map to indices 0 and 2 (matching Qwen2VL which uses indices 0 and 2)
                if fc_num == 1:
                    fc_idx = 0
                elif fc_num == 2:
                    fc_idx = 2
                else:
                    raise ValueError(f"unexpected fc index {fc_num} in {name}")
                new_name = self.format_tensor_name(gguf.MODEL_TENSOR.V_MMPROJ, fc_idx, suffix=f".{tail}")
            elif suffix.startswith("norm."):
                new_name = self.format_tensor_name(gguf.MODEL_TENSOR.V_POST_NORM, suffix=f".{suffix.split('.', 1)[1]}")
            else:
                raise ValueError(f"Unexpected merger tensor: {name}")
            return [(new_name, data_torch)]

        if name == "visual.patch_embed.proj.weight":
            # split Conv3D into Conv2Ds along temporal dimension
            c1, c2, kt, _, _ = data_torch.shape
            del c1, c2
            if kt != 2:
                raise ValueError("Current implementation only supports temporal_patch_size of 2")
            return [
                (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight", data_torch[:, :, 0, ...]),
                (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight.1", data_torch[:, :, 1, ...]),
            ]
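The Conv3D split above is just a slice along the temporal kernel axis. A hypothetical shape sketch (the channel and kernel sizes are illustrative, not from the diff):

```python
# Hypothetical sketch: split a (c_out, c_in, kt, kh, kw) Conv3D kernel with
# temporal_patch_size kt == 2 into two (c_out, c_in, kh, kw) Conv2D kernels.
import torch

c_out, c_in, kt, kh, kw = 1152, 3, 2, 16, 16  # illustrative sizes
w = torch.zeros(c_out, c_in, kt, kh, kw)
w0, w1 = w[:, :, 0, ...], w[:, :, 1, ...]
assert w0.shape == (c_out, c_in, kh, kw)
assert w1.shape == (c_out, c_in, kh, kw)
```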
        if name == "visual.patch_embed.proj.bias":
            # Include the bias - it's used by the C++ code
            return [(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".bias", data_torch)]

        if name.startswith("visual."):
            return [(self.map_tensor_name(name), data_torch)]

        # Fall back to parent class for other tensors
        return super().modify_tensors(data_torch, name, bid)


@ModelBase.register("Qwen3VLForConditionalGeneration")
class Qwen3VLTextModel(Qwen3Model):
    model_arch = gguf.MODEL_ARCH.QWEN3VL

    def set_gguf_parameters(self):
        super().set_gguf_parameters()

        # Handle MRoPE (Multi-axis Rotary Position Embedding) for Qwen3-VL
        text_config = self.hparams.get("text_config", {})
        # rope_scaling is deprecated in V5, use rope_parameters instead
        rope_scaling = text_config.get("rope_scaling") or text_config.get("rope_parameters") or {}

        if rope_scaling.get("mrope_section"):
            # mrope_section contains [time, height, width] dimensions
            mrope_section = rope_scaling["mrope_section"]
            # Pad to 4 dimensions [time, height, width, extra]
            while len(mrope_section) < 4:
                mrope_section.append(0)
            self.gguf_writer.add_rope_dimension_sections(mrope_section[:4])

            logger.info(f"MRoPE sections: {mrope_section[:4]}")

        vision_config = self.hparams.get("vision_config", {})
        deepstack_layer_num = len(vision_config.get("deepstack_visual_indexes", []))
        self.gguf_writer.add_num_deepstack_layers(deepstack_layer_num)
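The MRoPE handling above pads the `[time, height, width]` sections to four entries before writing them. A hypothetical standalone version of that padding (invented helper name):

```python
# Hypothetical sketch: pad mrope_section to 4 dims [time, height, width, extra].
def pad_mrope(mrope_section: list[int]) -> list[int]:
    mrope_section = list(mrope_section)  # avoid mutating the config in place
    while len(mrope_section) < 4:
        mrope_section.append(0)
    return mrope_section[:4]

assert pad_mrope([24, 20, 20]) == [24, 20, 20, 0]
```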
    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        # Skip vision tensors - they go in the mmproj file
        if name.startswith("model.visual."):
            return []

        return super().modify_tensors(data_torch, name, bid)


@ModelBase.register("Qwen3VLMoeForConditionalGeneration")
class Qwen3VLMoeTextModel(Qwen3MoeModel):
    model_arch = gguf.MODEL_ARCH.QWEN3VLMOE

    def set_gguf_parameters(self):
        super().set_gguf_parameters()

        # Handle MRoPE (Multi-axis Rotary Position Embedding) for Qwen3-VL
        text_config = self.hparams.get("text_config", {})
        # rope_scaling is deprecated in V5, use rope_parameters instead
        rope_scaling = text_config.get("rope_scaling") or text_config.get("rope_parameters") or {}

        if rope_scaling.get("mrope_section"):
            # mrope_section contains [time, height, width] dimensions
            mrope_section = rope_scaling["mrope_section"]
            # Pad to 4 dimensions [time, height, width, extra]
            while len(mrope_section) < 4:
                mrope_section.append(0)
            self.gguf_writer.add_rope_dimension_sections(mrope_section[:4])

            logger.info(f"MRoPE sections: {mrope_section[:4]}")

        vision_config = self.hparams.get("vision_config", {})
        deepstack_layer_num = len(vision_config.get("deepstack_visual_indexes", []))
        self.gguf_writer.add_num_deepstack_layers(deepstack_layer_num)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        # Skip vision tensors - they go in the mmproj file
        if name.startswith("model.visual."):
            return []

        return super().modify_tensors(data_torch, name, bid)


@ModelBase.register("GPT2LMHeadModel")
class GPT2Model(TextModel):
    model_arch = gguf.MODEL_ARCH.GPT2

@@ -7129,100 +6885,6 @@ class DeepseekV2Model(TextModel):
            raise ValueError(f"Unprocessed experts: {experts}")


@ModelBase.register("MiniMaxM2ForCausalLM")
class MiniMaxM2Model(TextModel):
    model_arch = gguf.MODEL_ARCH.MINIMAXM2
    _experts_cache: dict[int, dict[str, Tensor]] = {}

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.hparams["num_experts"] = self.hparams["num_local_experts"]

    def set_gguf_parameters(self):
        super().set_gguf_parameters()
        if self.hparams["scoring_func"] == "sigmoid":
            self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
        elif self.hparams["scoring_func"] == "softmax":
            self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX)
        else:
            raise ValueError(f"Unsupported scoring_func value: {self.hparams['scoring_func']}")

        self.gguf_writer.add_expert_feed_forward_length(self.find_hparam(["intermediate_size"]))
        self.gguf_writer.add_rope_dimension_count(self.find_hparam(["rotary_dim"]))

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
        if name.endswith("e_score_correction_bias"):
            name = name.replace("e_score_correction_bias", "e_score_correction.bias")

        # merge expert weights
        if 'experts' in name:
            n_experts = self.hparams["num_experts"]
            assert bid is not None

            expert_cache = self._experts_cache.setdefault(bid, {})
            expert_cache[name] = data_torch
            expert_weights = ["w1", "w2", "w3"]

            # not enough expert weights to merge
            if len(expert_cache) < n_experts * len(expert_weights):
                return []

            tensors: list[tuple[str, Tensor]] = []
            for w_name in expert_weights:
                datas: list[Tensor] = []

                for xid in range(n_experts):
                    ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{w_name}.weight"
                    datas.append(expert_cache[ename])
                    del expert_cache[ename]

                data_torch = torch.stack(datas, dim=0)
                merged_name = f"model.layers.{bid}.block_sparse_moe.experts.{w_name}.weight"
                new_name = self.map_tensor_name(merged_name)
                tensors.append((new_name, data_torch))

            del self._experts_cache[bid]
            return tensors

        return super().modify_tensors(data_torch, name, bid)
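The expert-merge in `MiniMaxM2Model.modify_tensors` buffers per-expert tensors until a layer is complete, then stacks them. A hypothetical standalone sketch of just the merge step (invented helper, same naming scheme as the code above):

```python
# Hypothetical sketch: once all n_experts * 3 weights of layer `bid` are
# cached, stack each weight kind along a new leading expert dimension.
import torch

def merge_experts(expert_cache: dict[str, torch.Tensor], bid: int, n_experts: int):
    merged = []
    for w_name in ("w1", "w2", "w3"):
        datas = [
            expert_cache[f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{w_name}.weight"]
            for xid in range(n_experts)
        ]
        merged.append((f"model.layers.{bid}.block_sparse_moe.experts.{w_name}.weight",
                       torch.stack(datas, dim=0)))
    return merged

cache = {f"model.layers.0.block_sparse_moe.experts.{x}.{w}.weight": torch.zeros(4, 8)
         for x in range(2) for w in ("w1", "w2", "w3")}
name, stacked = merge_experts(cache, bid=0, n_experts=2)[0]
assert stacked.shape == (2, 4, 8)  # experts stacked on the leading axis
```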
@ModelBase.register("PanguEmbeddedForCausalLM")
class PanguEmbeddedModel(TextModel):
    model_arch = gguf.MODEL_ARCH.PANGU_EMBED

    def set_vocab(self):
        self._set_vocab_sentencepiece()

        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
        if tokenizer_config_file.is_file():
            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
                tokenizer_config_json = json.load(f)
                if "add_prefix_space" in tokenizer_config_json:
                    self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])

    def set_gguf_parameters(self):
        super().set_gguf_parameters()
        hparams = self.hparams
        self.gguf_writer.add_vocab_size(hparams["vocab_size"])

        # PanguEmbedded's hparam loaded from config.json without head_dim
        if (rope_dim := hparams.get("head_dim")) is None:
            rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"]
        self.gguf_writer.add_rope_dimension_count(rope_dim)

        if hparams.get("head_dim") is None:
            self.gguf_writer.add_key_length(rope_dim)
            self.gguf_writer.add_value_length(rope_dim)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        if name == "lm_head.weight":
            if self.hparams.get("tie_word_embeddings", False):
                logger.info("Skipping tied output layer 'lm_head.weight'")
                return []
        return [(self.map_tensor_name(name), data_torch)]


@ModelBase.register("Dots1ForCausalLM")
class Dots1Model(Qwen2MoeModel):
    model_arch = gguf.MODEL_ARCH.DOTS1

@@ -8560,6 +8222,8 @@ class BailingMoeV2Model(TextModel):
            self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"])
            self.gguf_writer.add_expert_count(hparams["num_experts"])
            self.gguf_writer.add_expert_shared_count(hparams["num_shared_experts"])
            self.gguf_writer.add_expert_group_count(hparams["n_group"])
            self.gguf_writer.add_expert_group_used_count(hparams["topk_group"])
            self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"])

            if hparams["score_function"] == "sigmoid":

@@ -9279,13 +8943,6 @@ class SmolLM3Model(LlamaModel):
class GptOssModel(TextModel):
    model_arch = gguf.MODEL_ARCH.GPT_OSS

    # TODO: remove once MXFP4 is supported more generally
    def dequant_model(self):
        quant_config = self.hparams.get("quantization_config")
        if quant_config is not None and quant_config.get("quant_method") == "mxfp4":
            return
        return super().dequant_model()

    def transform_nibble_layout(self, tensor):
        assert tensor.dtype == torch.uint8
        assert tensor.shape[-1] == 16

@@ -9756,21 +9413,6 @@ class PixtralModel(LlavaVisionModel):
        return super().map_tensor_name(name, try_suffixes)


@ModelBase.register("LightOnOCRForConditionalGeneration")
class LightOnOCRVisionModel(LlavaVisionModel):
    is_mistral_format = False
    use_break_tok = False

    def set_gguf_parameters(self):
        super().set_gguf_parameters()
        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.LIGHTONOCR)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
        name = name.replace("model.vision_encoder.", "vision_tower.")
        name = name.replace("model.vision_projection.", "multi_modal_projector.")
        return super().modify_tensors(data_torch, name, bid)


@ModelBase.register("KimiVLForConditionalGeneration")
class KimiVLModel(MmprojModel):
    def __init__(self, *args, **kwargs):

@@ -9807,144 +9449,6 @@ class KimiVLModel(MmprojModel):

        return [] # skip other tensors


@ModelBase.register("CogVLMForCausalLM")
class CogVLMVisionModel(MmprojModel):

    def set_gguf_parameters(self):
        super().set_gguf_parameters()
        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6))
        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.COGVLM)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        del bid # unused

        if not name.startswith("model.vision."):
            return []

        return [(self.map_tensor_name(name), data_torch)]


@ModelBase.register("CogVLMForCausalLM")
class CogVLMModel(LlamaModel):
    model_arch = gguf.MODEL_ARCH.COGVLM

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        del bid # unused

        # block vision tensors
        if name.startswith("model.vision."):
            return []

        return [(self.map_tensor_name(name), data_torch)]


@ModelBase.register("JanusForConditionalGeneration")
class JanusProModel(LlamaModel):
    model_arch = gguf.MODEL_ARCH.LLAMA # reuse Llama arch

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        # Skip vision, aligner, and generation tensors
        skip_prefixes = (
            'model.vision_model.',
            'model.aligner.',
            'model.vqmodel.',
            'model.generation_embeddings.',
            'model.generation_aligner.',
            'model.generation_head.',
        )
        if name.startswith(skip_prefixes):
            return []

        if name.startswith('model.language_model.'):
            name = name.replace('model.language_model.', 'model.')
        elif name.startswith('language_model.'):
            name = name.replace('language_model.', '')

        return super().modify_tensors(data_torch, name, bid)


@ModelBase.register("JanusForConditionalGeneration")
class JanusProVisionModel(MmprojModel):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        assert self.hparams_vision is not None
        if "intermediate_size" not in self.hparams_vision:
            mlp_ratio = self.hparams_vision.get("mlp_ratio")
            hidden_size = self.hparams_vision.get("hidden_size")
            if mlp_ratio is not None and hidden_size is not None:
                self.hparams_vision["intermediate_size"] = int(round(hidden_size * mlp_ratio))

    def set_gguf_parameters(self):
        super().set_gguf_parameters()
        assert self.hparams_vision is not None

        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.JANUS_PRO)

        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams_vision.get("layer_norm_eps", 1e-6))

        hidden_act = str(self.hparams_vision.get("hidden_act", "")).lower()
        if hidden_act == "gelu":
            self.gguf_writer.add_vision_use_gelu(True)
        elif hidden_act == "silu":
            self.gguf_writer.add_vision_use_silu(True)

    def _map_aligner_tensor(self, data_torch: Tensor, name: str) -> Iterable[tuple[str, Tensor]]:
        """Map aligner tensors to projector format"""
        suffix = ".bias" if name.endswith(".bias") else ".weight"

        if name.startswith("model.aligner."):
            local_name = name[len("model.aligner."):]
        elif name.startswith("aligner."):
            local_name = name[len("aligner."):]
        else:
            raise ValueError(f"Unsupported Janus aligner prefix: {name}")

        if local_name.startswith("fc1."):
            mm_index = 0
        elif local_name.startswith("hidden_layers."):
            parts = local_name.split(".", 2)
            if len(parts) < 3:
                raise ValueError(f"Unexpected Janus aligner tensor name: {name}")
            mm_index = int(parts[1]) + 1
        else:
            raise ValueError(f"Unsupported Janus aligner tensor: {name}")

        tensor_name = self.format_tensor_name(gguf.MODEL_TENSOR.V_MMPROJ, mm_index, suffix=suffix)
        return [(tensor_name, data_torch)]
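The aligner index mapping above ("fc1" feeds projector index 0, "hidden_layers.N" feeds index N + 1) is compact enough to test in isolation. A hypothetical sketch (invented helper name):

```python
# Hypothetical sketch of the Janus aligner -> projector index mapping.
def aligner_mm_index(local_name: str) -> int:
    if local_name.startswith("fc1."):
        return 0
    if local_name.startswith("hidden_layers."):
        parts = local_name.split(".", 2)
        if len(parts) < 3:
            raise ValueError(f"Unexpected aligner tensor name: {local_name}")
        return int(parts[1]) + 1  # hidden layer N maps to projector index N + 1
    raise ValueError(f"Unsupported aligner tensor: {local_name}")

assert aligner_mm_index("fc1.weight") == 0
assert aligner_mm_index("hidden_layers.0.weight") == 1
```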
    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        del bid # unused

        # Skip language model tensors as they will be handled by `JanusProModel`
        if name.startswith(('model.language_model.', 'language_model.')):
            return []

        # Skip generation-related components
        skip_generation_prefixes = (
            'model.vqmodel.',
            'vqmodel.',
            'model.generation_embeddings.',
            'generation_embeddings.',
            'model.generation_aligner.',
            'generation_aligner.',
            'model.generation_head.',
            'generation_head.',
        )
        if name.startswith(skip_generation_prefixes):
            return []

        # Handle aligner tensors
        if name.startswith(('model.aligner.', 'aligner.')):
            return list(self._map_aligner_tensor(data_torch, name))

        # Handle vision tensors
        if name.startswith(('model.vision_model.', 'vision_model.')):
            return [(self.map_tensor_name(name), data_torch)]

        return []


###### CONVERSION LOGIC ######


@@ -10218,6 +9722,10 @@ def main() -> None:

    logger.info(f"Loading model: {dir_model.name}")

    if args.mmproj:
        if "mmproj" not in fname_out.name:
            fname_out = ModelBase.add_prefix_to_filename(fname_out, "mmproj-")

    is_mistral_format = args.mistral_format
    if is_mistral_format and not _mistral_common_installed:
        raise ImportError(_mistral_import_error_msg)
convert_hf_to_gguf_update.py

@@ -141,7 +141,6 @@ models = [
    {"name": "mellum", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/JetBrains/Mellum-4b-base", },
    {"name": "bailingmoe2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inclusionAI/Ling-mini-base-2.0", },
    {"name": "granite-docling", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ibm-granite/granite-docling-258M", },
    {"name": "minimax-m2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/MiniMaxAI/MiniMax-M2", },
]

# some models are known to be broken upstream, so we will skip them as exceptions

@@ -436,7 +435,7 @@ for model in models:
            tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}", use_fast=False)
        else:
            tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
    except (OSError, TypeError) as e:
    except OSError as e:
        logger.error(f"Failed to load tokenizer for model {name}. Error: {e}")
        continue  # Skip this model and continue with the next one in the loop
@@ -39,23 +39,18 @@ The llama.cpp OpenCL backend is designed to enable llama.cpp on **Qualcomm Adren
| Adreno 830 (Snapdragon 8 Elite) | Support |
| Adreno X85 (Snapdragon X Elite) | Support |

> A6x GPUs with a recent driver and compiler are supported; they are usually found in IoT platforms.
However, A6x GPUs in phones are likely not supported due to the outdated driver and compiler.

## DataType Supports

| DataType | Status |
|:----------------------:|:--------------------------:|
| Q4_0 | Support |
| Q6_K | Support, but not optimized |
| Q8_0 | Support |
| MXFP4 | Support |

## Model Preparation

You can refer to the general [llama-quantize tool](/tools/quantize/README.md) for steps to convert a model in Hugging Face safetensor format to GGUF with quantization.
You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model preparation.

Currently we support `Q4_0` quantization and have optimized for it. To achieve best performance on Adreno GPU, add `--pure` to `llama-quantize` (i.e., make all weights in `Q4_0`). For example,
Currently we support `Q4_0` quantization and have optimized for it. To achieve best performance on Adreno GPU, add `--pure` to `llama-quantize`. For example,

```sh
./llama-quantize --pure ggml-model-qwen2.5-3b-f16.gguf ggml-model-qwen-3b-Q4_0.gguf Q4_0
@@ -63,17 +58,6 @@ Currently we support `Q4_0` quantization and have optimized for it. To achieve b

Since `Q6_K` is also supported, `Q4_0` quantization without `--pure` will also work. However, the performance will be worse compared to pure `Q4_0` quantization.

### `MXFP4` MoE Models

OpenAI gpt-oss models are MoE models in `MXFP4`. The quantized model will be in `MXFP4_MOE`, a mixture of `MXFP4` and `Q8_0`.
For this quantization, there is no need to specify `--pure`.
For the gpt-oss-20b model, you can directly [download](https://huggingface.co/ggml-org/gpt-oss-20b-GGUF) the quantized GGUF file in `MXFP4_MOE` from Hugging Face.

Although it is possible to quantize the gpt-oss-20b model in pure `Q4_0` (all weights in `Q4_0`), it is not recommended since `MXFP4` has been optimized for MoE while `Q4_0` is not. In addition, accuracy should degrade with such pure `Q4_0` quantization.
Hence, using the default `MXFP4_MOE` quantization (see the link above) is recommended for this model.

> Note that the `Q4_0` model found [here](https://huggingface.co/unsloth/gpt-oss-20b-GGUF/blob/main/gpt-oss-20b-Q4_0.gguf) is a mixture of `Q4_0`, `Q8_0` and `MXFP4` and gives better performance than `MXFP4_MOE` quantization.

## CMake Options

The OpenCL backend has the following CMake options that control the behavior of the backend.
@@ -162,13 +146,10 @@ A Snapdragon X Elite device with Windows 11 Arm64 is used. Make sure the followi
* Ninja
* Visual Studio 2022
* PowerShell 7
* Python

Visual Studio provides the necessary headers and libraries although it is not directly used for building.
Alternatively, Visual Studio Build Tools can be installed instead of the full Visual Studio.

> Note that building using Visual Studio's cl compiler is not supported. Clang must be used. Clang depends on libraries provided by Visual Studio to work. Therefore, Visual Studio must be installed. Alternatively, Visual Studio Build Tools can be installed instead of the full Visual Studio.

PowerShell 7 is used for the following commands.
If an older version of PowerShell is used, these commands may not work as they are.

@@ -220,12 +201,9 @@ ninja

## Known Issues

- Flash attention does not always improve performance.
- Currently the OpenCL backend works on A6xx GPUs with recent drivers and compilers (usually found in IoT platforms).
However, it does not work on A6xx GPUs found in phones with old drivers and compilers.
- Currently the OpenCL backend does not work on Adreno 6xx GPUs.

## TODO

- Optimization for Q6_K
- Support and optimization for Q4_K
- Improve flash attention
@@ -178,48 +178,6 @@ GeForce RTX 3070 8.6
cmake -B build -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES="86;89"
```

### Overriding the CUDA Version

If you have multiple CUDA installations on your system and want to compile llama.cpp for a specific one, e.g. for CUDA 11.7 installed under `/opt/cuda-11.7`:

```bash
cmake -B build -DGGML_CUDA=ON -DCMAKE_CUDA_COMPILER=/opt/cuda-11.7/bin/nvcc -DCMAKE_INSTALL_RPATH="/opt/cuda-11.7/lib64;\$ORIGIN" -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON
```

#### Fixing Compatibility Issues with Old CUDA and New glibc

If you try to use an old CUDA version (e.g. v11.7) with a new glibc version you can get errors like this:

```
/usr/include/bits/mathcalls.h(83): error: exception specification is
incompatible with that of previous function "cospi"

/opt/cuda-11.7/bin/../targets/x86_64-linux/include/crt/math_functions.h(5545):
here
```

It seems the least bad solution is to patch the CUDA installation to declare the correct signatures.
Replace the following lines in `/path/to/your/cuda/installation/targets/x86_64-linux/include/crt/math_functions.h`:

```C++
// original lines
extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double cospi(double x);
extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float  cospif(float x);
extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double sinpi(double x);
extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float  sinpif(float x);
extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double rsqrt(double x);
extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float  rsqrtf(float x);

// edited lines
extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double cospi(double x) noexcept (true);
extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float  cospif(float x) noexcept (true);
extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double sinpi(double x) noexcept (true);
extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float  sinpif(float x) noexcept (true);
extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double rsqrt(double x) noexcept (true);
extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float  rsqrtf(float x) noexcept (true);
```

### Runtime CUDA environment variables

You may set the [CUDA environment variables](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) at runtime.
@@ -303,12 +261,10 @@ You can download it from your Linux distro's package manager or from here: [ROCm
- Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU):
  ```bash
  HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
      cmake -S . -B build -DGGML_HIP=ON -DGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
      cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
      && cmake --build build --config Release -- -j 16
  ```

  Note: `GPU_TARGETS` is optional; omitting it will build the code for all GPUs in the current system.

  To enhance flash attention performance on RDNA3+ or CDNA architectures, you can utilize the rocWMMA library by enabling the `-DGGML_HIP_ROCWMMA_FATTN=ON` option. This requires rocWMMA headers to be installed on the build system.

  The rocWMMA library is included by default when installing the ROCm SDK using the `rocm` meta package provided by AMD. Alternatively, if you are not using the meta package, you can install the library using the `rocwmma-dev` or `rocwmma-devel` package, depending on your system's package manager.
@@ -326,17 +282,17 @@ You can download it from your Linux distro's package manager or from here: [ROCm
  ```bash
  HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -p)" \
      HIP_DEVICE_LIB_PATH=<directory-you-just-found> \
      cmake -S . -B build -DGGML_HIP=ON -DGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
      cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
      && cmake --build build -- -j 16
  ```

- Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU):
  ```bash
  set PATH=%HIP_PATH%\bin;%PATH%
  cmake -S . -B build -G Ninja -DGPU_TARGETS=gfx1100 -DGGML_HIP=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release
  cmake -S . -B build -G Ninja -DAMDGPU_TARGETS=gfx1100 -DGGML_HIP=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release
  cmake --build build
  ```
  If necessary, adapt `GPU_TARGETS` to the GPU arch you want to compile for. The above example uses `gfx1100`, which corresponds to the Radeon RX 7900XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors).
  Make sure that `AMDGPU_TARGETS` is set to the GPU arch you want to compile for. The above example uses `gfx1100`, which corresponds to the Radeon RX 7900XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors).
  Find your GPU version string by matching the most significant version information from `rocminfo | grep gfx | head -1 | awk '{print $2}'` with the list of processors, e.g. `gfx1035` maps to `gfx1030`.

@@ -7,9 +7,9 @@
## Images
We have three Docker images available for this project:

1. `ghcr.io/ggml-org/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`)
2. `ghcr.io/ggml-org/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`)
3. `ghcr.io/ggml-org/llama.cpp:server`: This image only includes the server executable file. (platforms: `linux/amd64`, `linux/arm64`, `linux/s390x`)
1. `ghcr.io/ggml-org/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`)
2. `ghcr.io/ggml-org/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`)
3. `ghcr.io/ggml-org/llama.cpp:server`: This image only includes the server executable file. (platforms: `linux/amd64`, `linux/arm64`)

Additionally, there are the following images, similar to the above:

14
docs/ops.md
@@ -22,11 +22,11 @@ Legend:
| ARANGE | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ |
| ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
| ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| CEIL | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | ✅ | ❌ | ❌ |
| CEIL | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ |
| CLAMP | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ❌ |
| CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ❌ |
| CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | 🟡 | ✅ | ❌ |
| CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ❌ |
| CONV_2D | ❌ | ❌ | ✅ | 🟡 | ❌ | ✅ | ❌ | ✅ | ❌ |
| CONV_2D | ❌ | ❌ | ✅ | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ |
| CONV_2D_DW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
| CONV_3D | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
| CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
@@ -42,7 +42,7 @@ Legend:
| ELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
| EXP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
| FLASH_ATTN_EXT | ❌ | 🟡 | ✅ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ |
| FLOOR | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | ✅ | ❌ | ❌ |
| FLOOR | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ |
| GATED_LINEAR_ATTN | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ |
| GEGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
| GEGLU_ERF | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ |
@@ -79,12 +79,12 @@ Legend:
| REPEAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | ❌ |
| REPEAT_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
| RMS_NORM | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ |
| RMS_NORM_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ |
| RMS_NORM_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
| RMS_NORM_MUL_ADD | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| ROLL | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ |
| ROPE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| ROPE_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ |
| ROUND | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | ✅ | ❌ | ❌ |
| ROUND | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ |
| RWKV_WKV6 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
| RWKV_WKV7 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
| SCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
@@ -111,6 +111,6 @@ Legend:
| TANH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | 🟡 | ❌ |
| TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| TOPK_MOE | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ |
| TRUNC | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | ✅ | ❌ | ❌ |
| TRUNC | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ |
| UPSCALE | ❌ | 🟡 | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ❌ |
| XIELU | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ |
1620
docs/ops/CUDA.csv
File diff suppressed because it is too large
@@ -5637,25 +5637,25 @@
"SYCL0","RMS_NORM","type=f32,ne=[64,5,4,3],v=0,eps=0.000000,inplace=0","support","1","yes","SYCL"
"SYCL0","NORM","type=f32,ne=[64,5,4,3],v=1,eps=0.000000","support","1","yes","SYCL"
"SYCL0","RMS_NORM","type=f32,ne=[64,5,4,3],v=1,eps=0.000000,inplace=0","support","1","yes","SYCL"
"SYCL0","RMS_NORM_BACK","type=f32,ne=[64,5,4,3],eps=0.000000","support","1","yes","SYCL"
"SYCL0","RMS_NORM_BACK","type=f32,ne=[64,5,4,3],eps=0.000000","support","0","no","SYCL"
"SYCL0","L2_NORM","type=f32,ne=[64,5,4,3]","support","1","yes","SYCL"
"SYCL0","NORM","type=f32,ne=[64,5,4,3],v=0,eps=0.000001","support","1","yes","SYCL"
"SYCL0","RMS_NORM","type=f32,ne=[64,5,4,3],v=0,eps=0.000001,inplace=0","support","1","yes","SYCL"
"SYCL0","NORM","type=f32,ne=[64,5,4,3],v=1,eps=0.000001","support","1","yes","SYCL"
"SYCL0","RMS_NORM","type=f32,ne=[64,5,4,3],v=1,eps=0.000001,inplace=0","support","1","yes","SYCL"
"SYCL0","RMS_NORM_BACK","type=f32,ne=[64,5,4,3],eps=0.000001","support","1","yes","SYCL"
"SYCL0","RMS_NORM_BACK","type=f32,ne=[64,5,4,3],eps=0.000001","support","0","no","SYCL"
"SYCL0","L2_NORM","type=f32,ne=[64,5,4,3]","support","1","yes","SYCL"
"SYCL0","NORM","type=f32,ne=[64,5,4,3],v=0,eps=0.000100","support","1","yes","SYCL"
"SYCL0","RMS_NORM","type=f32,ne=[64,5,4,3],v=0,eps=0.000100,inplace=0","support","1","yes","SYCL"
"SYCL0","NORM","type=f32,ne=[64,5,4,3],v=1,eps=0.000100","support","1","yes","SYCL"
"SYCL0","RMS_NORM","type=f32,ne=[64,5,4,3],v=1,eps=0.000100,inplace=0","support","1","yes","SYCL"
"SYCL0","RMS_NORM_BACK","type=f32,ne=[64,5,4,3],eps=0.000100","support","1","yes","SYCL"
"SYCL0","RMS_NORM_BACK","type=f32,ne=[64,5,4,3],eps=0.000100","support","0","no","SYCL"
"SYCL0","L2_NORM","type=f32,ne=[64,5,4,3]","support","1","yes","SYCL"
"SYCL0","NORM","type=f32,ne=[64,5,4,3],v=0,eps=0.100000","support","1","yes","SYCL"
"SYCL0","RMS_NORM","type=f32,ne=[64,5,4,3],v=0,eps=0.100000,inplace=0","support","1","yes","SYCL"
"SYCL0","NORM","type=f32,ne=[64,5,4,3],v=1,eps=0.100000","support","1","yes","SYCL"
"SYCL0","RMS_NORM","type=f32,ne=[64,5,4,3],v=1,eps=0.100000,inplace=0","support","1","yes","SYCL"
"SYCL0","RMS_NORM_BACK","type=f32,ne=[64,5,4,3],eps=0.100000","support","1","yes","SYCL"
"SYCL0","RMS_NORM_BACK","type=f32,ne=[64,5,4,3],eps=0.100000","support","0","no","SYCL"
"SYCL0","L2_NORM","type=f32,ne=[64,5,4,3]","support","1","yes","SYCL"
"SYCL0","RMS_NORM","type=f32,ne=[64,5,4,3],v=0,eps=0.000001,inplace=1","support","1","yes","SYCL"
"SYCL0","RMS_NORM_MUL_ADD","type=f32,ne=[64,5,4,3],eps=0.000000,broadcast=0,multi_add=0","support","1","yes","SYCL"
@@ -9307,37 +9307,37 @@
"SYCL0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=24,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=0,inplace=1","support","1","yes","SYCL"
"SYCL0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=24,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=0,inplace=1","support","1","yes","SYCL"
"SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=0","support","1","yes","SYCL"
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=0","support","0","yes","SYCL"
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=0","support","0","no","SYCL"
"SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=0","support","1","yes","SYCL"
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=0","support","0","yes","SYCL"
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=0","support","0","no","SYCL"
"SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=0","support","1","yes","SYCL"
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=0","support","0","yes","SYCL"
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=0","support","0","no","SYCL"
"SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=0","support","1","yes","SYCL"
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=0","support","0","yes","SYCL"
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=0","support","0","no","SYCL"
"SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=1","support","1","yes","SYCL"
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=1","support","0","yes","SYCL"
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=1","support","0","no","SYCL"
"SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=1","support","1","yes","SYCL"
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=1","support","0","yes","SYCL"
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=1","support","0","no","SYCL"
"SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=1","support","1","yes","SYCL"
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=1","support","0","yes","SYCL"
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=1","support","0","no","SYCL"
"SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=1","support","1","yes","SYCL"
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=1","support","0","yes","SYCL"
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=1","support","0","no","SYCL"
"SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=2","support","1","yes","SYCL"
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=2","support","0","yes","SYCL"
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=2","support","0","no","SYCL"
"SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=2","support","1","yes","SYCL"
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=2","support","0","yes","SYCL"
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=2","support","0","no","SYCL"
"SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=2","support","1","yes","SYCL"
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=2","support","0","yes","SYCL"
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=2","support","0","no","SYCL"
"SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=2","support","1","yes","SYCL"
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=2","support","0","yes","SYCL"
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=2","support","0","no","SYCL"
"SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=3","support","1","yes","SYCL"
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=3","support","0","yes","SYCL"
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=3","support","0","no","SYCL"
"SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=3","support","1","yes","SYCL"
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=3","support","0","yes","SYCL"
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=3","support","0","no","SYCL"
"SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=3","support","1","yes","SYCL"
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=3","support","0","yes","SYCL"
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=3","support","0","no","SYCL"
"SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=3","support","1","yes","SYCL"
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=3","support","0","yes","SYCL"
"SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=3","support","0","no","SYCL"
"SYCL0","ARGSORT","type=f32,ne=[8,1,1,1],order=0","support","1","yes","SYCL"
"SYCL0","ARGSORT","type=f32,ne=[16,10,10,10],order=0","support","1","yes","SYCL"
"SYCL0","ARGSORT","type=f32,ne=[60,10,10,10],order=0","support","1","yes","SYCL"

Can't render this file because it is too large.
@@ -38,7 +38,6 @@ The above command will output space-separated float values.
|           | multiple embeddings | $[[x_1,...,x_n],[x_1,...,x_n],...,[x_1,...,x_n]]$
| 'json'    | openai style |
| 'json+'   | add cosine similarity matrix |
| 'raw'     | plain text output |

### --embd-separator $"string"$
| $"string"$ | |
@@ -70,29 +70,6 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
    }
}

// plain, pipe-friendly output: one embedding per line
static void print_raw_embeddings(const float * emb,
                                 int n_embd_count,
                                 int n_embd,
                                 const llama_model * model,
                                 enum llama_pooling_type pooling_type,
                                 int embd_normalize) {
    const uint32_t n_cls_out = llama_model_n_cls_out(model);
    const bool is_rank = (pooling_type == LLAMA_POOLING_TYPE_RANK);
    const int cols = is_rank ? std::min<int>(n_embd, (int) n_cls_out) : n_embd;

    for (int j = 0; j < n_embd_count; ++j) {
        for (int i = 0; i < cols; ++i) {
            if (embd_normalize == 0) {
                LOG("%1.0f%s", emb[j * n_embd + i], (i + 1 < cols ? " " : ""));
            } else {
                LOG("%1.7f%s", emb[j * n_embd + i], (i + 1 < cols ? " " : ""));
            }
        }
        LOG("\n");
    }
}

int main(int argc, char ** argv) {
    common_params params;

@@ -395,8 +372,6 @@ int main(int argc, char ** argv) {
        }

        if (notArray) LOG("\n}\n");
    } else if (params.embd_out == "raw") {
        print_raw_embeddings(emb, n_embd_count, n_embd, model, pooling_type, params.embd_normalize);
    }

    LOG("\n");

@@ -184,13 +184,8 @@ static bool gguf_ex_read_1(const std::string & fname, bool check_data) {
        const char * name   = gguf_get_tensor_name  (ctx, i);
        const size_t size   = gguf_get_tensor_size  (ctx, i);
        const size_t offset = gguf_get_tensor_offset(ctx, i);
        const auto   type   = gguf_get_tensor_type  (ctx, i);

        const char * type_name  = ggml_type_name(type);
        const size_t type_size  = ggml_type_size(type);
        const size_t n_elements = size / type_size;

        printf("%s: tensor[%d]: name = %s, size = %zu, offset = %zu, type = %s, n_elts = %zu\n", __func__, i, name, size, offset, type_name, n_elements);
        printf("%s: tensor[%d]: name = %s, size = %zu, offset = %zu\n", __func__, i, name, size, offset);
    }
}

@@ -371,17 +371,8 @@ class SchemaConverter:
                raise ValueError(f'Unsupported ref {ref}')

            for sel in ref.split('#')[-1].split('/')[1:]:
                assert target is not None, f'Error resolving ref {ref}: {sel} not in {target}'
                if isinstance(target, list):
                    try:
                        sel_index = int(sel)
                    except ValueError:
                        raise ValueError(f'Error resolving ref {ref}: {sel} not in {target}')
                    assert 0 <= sel_index < len(target), f'Error resolving ref {ref}: {sel} not in {target}'
                    target = target[sel_index]
                else:
                    assert sel in target, f'Error resolving ref {ref}: {sel} not in {target}'
                    target = target[sel]
                assert target is not None and sel in target, f'Error resolving ref {ref}: {sel} not in {target}'
                target = target[sel]

            self._refs[ref] = target
        else:
@@ -556,8 +547,7 @@ class SchemaConverter:


    def _resolve_ref(self, ref):
        ref_fragment = ref.split('#')[-1]
        ref_name = 'ref' + re.sub(r'[^a-zA-Z0-9-]+', '-', ref_fragment)
        ref_name = ref.split('/')[-1]
        if ref_name not in self._rules and ref not in self._refs_being_resolved:
            self._refs_being_resolved.add(ref)
            resolved = self._refs[ref]
@@ -138,10 +138,7 @@ if model_path is None:
        "Model path must be specified either via --model-path argument or MODEL_PATH environment variable"
    )

print("Loading model and tokenizer using AutoTokenizer:", model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
config = AutoConfig.from_pretrained(model_path)

print("Model type: ", config.model_type)
print("Vocab size: ", config.vocab_size)
@@ -150,6 +147,10 @@ print("Number of layers: ", config.num_hidden_layers)
print("BOS token id: ", config.bos_token_id)
print("EOS token id: ", config.eos_token_id)

print("Loading model and tokenizer using AutoTokenizer:", model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)
config = AutoConfig.from_pretrained(model_path)

if unreleased_model_name:
    model_name_lower = unreleased_model_name.lower()
    unreleased_module_path = (
@@ -170,7 +171,7 @@ if unreleased_model_name:
        exit(1)
else:
    model = AutoModelForCausalLM.from_pretrained(
        model_path, device_map="auto", offload_folder="offload", trust_remote_code=True, config=config
        model_path, device_map="auto", offload_folder="offload"
    )

for name, module in model.named_modules():

@@ -242,7 +242,6 @@
#define GGML_ROPE_TYPE_NEOX   2
#define GGML_ROPE_TYPE_MROPE  8
#define GGML_ROPE_TYPE_VISION 24
#define GGML_ROPE_TYPE_IMROPE 40 // binary: 101000

#define GGML_MROPE_SECTIONS   4

@@ -2108,7 +2107,6 @@ extern "C" {
    enum ggml_scale_mode {
        GGML_SCALE_MODE_NEAREST  = 0,
        GGML_SCALE_MODE_BILINEAR = 1,
        GGML_SCALE_MODE_BICUBIC  = 2,

        GGML_SCALE_MODE_COUNT
    };

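The rope type constants above are bit flags, and the IMROPE value (40, binary 101000) deliberately includes the MROPE bit (8) so that generic m-rope handling still triggers for the interleaved variant. A minimal standalone sketch of how the flags compose, using local stand-in constants rather than the ggml headers:

```cpp
// Sketch only: local stand-ins for the GGML_ROPE_TYPE_* defines above.
// IMROPE = 40 = 0b101000 sets the MROPE bit (8), so (mode & MROPE) matches
// both variants while (mode == IMROPE) selects only the interleaved layout.
#include <cstdio>

enum : int {
    ROPE_TYPE_NEOX   = 2,
    ROPE_TYPE_MROPE  = 8,
    ROPE_TYPE_VISION = 24,
    ROPE_TYPE_IMROPE = 40,
};

int main() {
    for (int mode : { ROPE_TYPE_NEOX, ROPE_TYPE_MROPE, ROPE_TYPE_VISION, ROPE_TYPE_IMROPE }) {
        const bool is_mrope  = mode & ROPE_TYPE_MROPE;   // any m-rope variant
        const bool is_imrope = mode == ROPE_TYPE_IMROPE; // interleaved variant only
        std::printf("mode=%2d is_mrope=%d is_imrope=%d\n", mode, is_mrope, is_imrope);
    }
}
```

This is the same test pattern the rope compute code in this diff uses when it sets `is_mrope` and `is_imrope`.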
@@ -308,10 +308,6 @@ function(ggml_add_cpu_backend_variant tag_name)
        set(GGML_INTERNAL_${feat} ON)
    endforeach()
elseif (GGML_SYSTEM_ARCH STREQUAL "s390x")
    foreach (feat VXE2 NNPA)
        set(GGML_INTERNAL_${feat} OFF)
    endforeach()

    foreach (feat ${ARGN})
        set(GGML_INTERNAL_${feat} ON)
    endforeach()
@@ -381,8 +377,9 @@ if (GGML_CPU_ALL_VARIANTS)
    endif()
elseif (GGML_SYSTEM_ARCH STREQUAL "s390x")
    if (CMAKE_SYSTEM_NAME MATCHES "Linux")
        ggml_add_cpu_backend_variant(z15 Z15 VXE2)
        ggml_add_cpu_backend_variant(z16 Z16 VXE2 NNPA)
        ggml_add_cpu_backend_variant(s390x_z15 Z15 VXE)
        # ggml_add_cpu_backend_variant(s390x_z16 Z16 VXE)
        # ggml_add_cpu_backend_variant(s390x_z17 Z17 VXE)
    else()
        message(FATAL_ERROR "Unsupported s390x target OS: ${CMAKE_SYSTEM_NAME}")
    endif()

@@ -226,23 +226,16 @@ static struct buffer_address ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * al
    }

    if (best_fit_block == -1) {
        // no suitable block found, try the last block (this may grow a chunk's size)
        int64_t best_reuse = INT64_MIN;
        // no suitable block found, try the last block (this will grow a chunk's size)
        for (int c = 0; c < alloc->n_chunks; ++c) {
            struct tallocr_chunk * chunk = alloc->chunks[c];
            if (chunk->n_free_blocks > 0) {
                struct free_block * block = &chunk->free_blocks[chunk->n_free_blocks - 1];
                max_avail = MAX(max_avail, block->size);
                int64_t reuse_factor = chunk->max_size - block->offset - size;
                // reuse_factor < 0 : amount of extra memory that needs to be allocated
                // reuse_factor = 0 : allocated free space exactly matches tensor size
                // reuse_factor > 0 : superfluous memory that will remain unused
                bool better_reuse = best_reuse < 0 && reuse_factor > best_reuse;
                bool better_fit = reuse_factor >= 0 && reuse_factor < best_reuse;
                if (block->size >= size && (better_reuse || better_fit)) {
                if (block->size >= size) {
                    best_fit_chunk = c;
                    best_fit_block = chunk->n_free_blocks - 1;
                    best_reuse = reuse_factor;
                    break;
                }
            }
        }
@@ -275,7 +268,7 @@ static struct buffer_address ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * al
#ifdef GGML_ALLOCATOR_DEBUG
    add_allocated_tensor(alloc, addr, tensor);
    size_t cur_max = addr.offset + size;
    if (cur_max > chunk->max_size) {
    if (cur_max > alloc->max_size[addr.chunk]) {
        // sort allocated_tensors by chunk/offset
        for (int i = 0; i < 1024; i++) {
            for (int j = i + 1; j < 1024; j++) {

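The `reuse_factor` heuristic added here prefers, among the last free blocks of each chunk, the block that either fits without growing the chunk while wasting the least, or, failing that, grows the chunk by the smallest amount. A toy sketch of just that selection rule, with simplified types standing in for the real ggml allocator structs:

```cpp
// Toy sketch of the reuse_factor selection above; Block is a simplified
// stand-in for the ggml free_block/chunk pair, not the real structs.
#include <climits>
#include <cstdint>
#include <vector>

struct Block { int64_t offset, size, chunk_max; };

// Returns the index of the block that best balances "fits without growing
// the chunk" against "grows the chunk by the least amount", or -1.
int pick_block(const std::vector<Block> & blocks, int64_t size) {
    int     best       = -1;
    int64_t best_reuse = INT64_MIN;
    for (int i = 0; i < (int) blocks.size(); ++i) {
        const Block & b = blocks[i];
        if (b.size < size) continue;
        // reuse < 0: extra memory the chunk must grow by
        // reuse = 0: exact fit; reuse > 0: leftover that stays unused
        const int64_t reuse = b.chunk_max - b.offset - size;
        const bool better_reuse = best_reuse < 0 && reuse > best_reuse;
        const bool better_fit   = reuse >= 0 && reuse < best_reuse;
        if (better_reuse || better_fit) {
            best       = i;
            best_reuse = reuse;
        }
    }
    return best;
}
```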
@@ -2234,7 +2234,7 @@ static void aclnn_cache_init(ggml_backend_cann_context & ctx,
                             ACL_MEM_MALLOC_HUGE_FIRST));

    acl_theta_scale_tensor = ggml_cann_create_tensor(ctx.rope_cache.theta_scale_cache, ACL_FLOAT, sizeof(float),
                                                     theta_scale_ne, theta_scale_nb, 1);
                                                     theta_scale_ne, theta_scale_nb, GGML_MAX_DIMS);

    float start = 0;
    float step = 1;
@@ -2251,7 +2251,7 @@ static void aclnn_cache_init(ggml_backend_cann_context & ctx,
    yarn_ramp_allocator.alloc(theta_scale_length * sizeof(float));
    void * yarn_ramp_buffer = yarn_ramp_allocator.get();
    acl_yarn_ramp_tensor = ggml_cann_create_tensor(yarn_ramp_buffer, ACL_FLOAT, sizeof(float), theta_scale_ne,
                                                   theta_scale_nb, 1);
                                                   theta_scale_nb, GGML_MAX_DIMS);
    float zero_value = 0, one_value = 1;
    float denom_safe_value = MAX(0.001f, corr_dims[1] - corr_dims[0]);
    aclScalar * low = aclCreateScalar(&corr_dims[0], aclDataType::ACL_FLOAT);

@@ -67,30 +67,19 @@
    GGML_ABORT("CANN error");
}

// Thread-local variable to record the current device of this thread.
thread_local int g_current_cann_device = -1;

/**
 * @brief Set the CANN device to be used.
 * @brief Sets the device to be used by CANN.
 *
 * @param device The target device ID to set.
 * @param device The device ID to set.
 */
void ggml_cann_set_device(const int32_t device) {
    // int current_device = -1;
    // Note: In some CANN versions, if no device has been set yet,
    // aclrtGetDevice(&current_device) may return 0 by default.
    // aclrtGetDevice(&current_device);
    int current_device = -1;
    aclrtGetDevice(&current_device);

    // If the current device is already the target one, no need to switch.
    if (device == g_current_cann_device) {
    if (device == current_device) {
        return;
    }

    // Switch to the new device.
    ACL_CHECK(aclrtSetDevice(device));

    // Update the global device record.
    g_current_cann_device = device;
}

/**

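The change above stops querying the runtime on every call and instead caches the active device in a `thread_local`, which also sidesteps CANN versions where the query returns 0 before any device has been set. A minimal sketch of that caching pattern, with a hypothetical `vendor_set_device` standing in for `aclrtSetDevice`:

```cpp
// Sketch of the thread-local device-caching pattern used above.
// vendor_set_device is a hypothetical stand-in for the CANN runtime call.
static void vendor_set_device(int device) {
    (void) device; // imagine an expensive runtime call here
}

thread_local int g_current_device = -1; // -1: no device set on this thread yet

void set_device(int device) {
    if (device == g_current_device) {
        return; // already active on this thread, skip the runtime call
    }
    vendor_set_device(device);
    g_current_device = device; // record per thread, never trust a default of 0
}
```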
@@ -504,18 +504,11 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
        endforeach()
    endif()

    if (GGML_VXE OR GGML_INTERNAL_VXE2)
        message(STATUS "VXE2 enabled")
    if (GGML_VXE OR GGML_INTERNAL_VXE)
        message(STATUS "VX/VXE/VXE2 enabled")
        list(APPEND ARCH_FLAGS -mvx -mzvector)
        list(APPEND ARCH_DEFINITIONS GGML_USE_VXE2)
        list(APPEND ARCH_DEFINITIONS GGML_VXE)
    endif()

    if (GGML_INTERNAL_NNPA)
        message(STATUS "NNPA enabled")
        list(APPEND ARCH_DEFINITIONS GGML_USE_NNPA)
    endif()

    ggml_add_cpu_backend_features(${GGML_CPU_NAME} s390 ${ARCH_DEFINITIONS})
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "wasm")
    message(STATUS "Wasm detected")
    list (APPEND GGML_CPU_SOURCES ggml-cpu/arch/wasm/quants.c)

@@ -700,8 +700,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
    for (; ib + 1 < nb; ib += 2) {

        // Compute combined scale for the block 0 and 1
        const float ft0 = GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d);
        const __m128 d_0_1 = (__m128)(v4f32){ft0, ft0, ft0, ft0};
        const __m128 d_0_1 = (__m128)__lsx_vreplgr2vr_w( GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d) );

        const __m128i tmp_0_1 = __lsx_vld((const __m128i *)x[ib].qs, 0);

@@ -715,9 +714,11 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
        bx_1 = __lsx_vsub_b(bx_1, off);
        const __m128i i32_1 = mul_sum_i8_pairs(bx_1, by_1);

        //_mm_prefetch(&x[ib] + 2 * sizeof(block_q4_0), _MM_HINT_T0);
        //_mm_prefetch(&y[ib] + 2 * sizeof(block_q8_0), _MM_HINT_T0);

        // Compute combined scale for the block 2 and 3
        const float ft1 = GGML_CPU_FP16_TO_FP32(x[ib + 1].d) * GGML_CPU_FP16_TO_FP32(y[ib + 1].d);
        const __m128 d_2_3 = (__m128)(v4f32){ft1, ft1, ft1, ft1};
        const __m128 d_2_3 = (__m128)__lsx_vreplgr2vr_w( GGML_CPU_FP16_TO_FP32(x[ib + 1].d) * GGML_CPU_FP16_TO_FP32(y[ib + 1].d) );

        const __m128i tmp_2_3 = __lsx_vld((const __m128i *)x[ib + 1].qs, 0);

@@ -1,50 +0,0 @@
#include "ggml-backend-impl.h"

#if defined(__s390x__)
#include <sys/auxv.h>

// find hwcap bits in asm/elf.h
#ifndef HWCAP_VXRS_EXT2
#define HWCAP_VXRS_EXT2 (1 << 15)
#endif

#ifndef HWCAP_NNPA
#define HWCAP_NNPA (1 << 20)
#endif

struct s390x_features {
    bool has_vxe2 = false;
    bool has_nnpa = false;

    s390x_features() {
        uint32_t hwcap = getauxval(AT_HWCAP);
        // NOTE: use hwcap2 with DFLT for z17 and later
        // uint32_t hwcap2 = getauxval(AT_HWCAP2);

        has_vxe2 = !!(hwcap & HWCAP_VXRS_EXT2);
        has_nnpa = !!(hwcap & HWCAP_NNPA);
    }
};

static int ggml_backend_cpu_s390x_score() {
    int score = 1;
    s390x_features sf;

// IBM z15 / LinuxONE 3
#ifdef GGML_USE_VXE2
    if (!sf.has_vxe2) { return 0; }
    score += 1 << 1;
#endif

// IBM z16 / LinuxONE 4 and z17 / LinuxONE 5
#ifdef GGML_USE_NNPA
    if (!sf.has_nnpa) { return 0; }
    score += 1 << 2;
#endif

    return score;
}

GGML_BACKEND_DL_SCORE_IMPL(ggml_backend_cpu_s390x_score)

#endif // __s390x__
@@ -500,15 +500,13 @@ inline static int32x4_t ggml_vec_dot(int32x4_t acc, int8x16_t a, int8x16_t b) {

#endif

#if defined(__loongarch_sx)
#if defined(__loongarch_asx)
/* float type data load instructions */
static __m128 __lsx_vreplfr2vr_s(const float val) {
    v4f32 res = {val, val, val, val};
    return (__m128)res;
}
#endif

#if defined(__loongarch_asx)
static __m256 __lasx_xvreplfr2vr_s(const float val) {
    v8f32 res = {val, val, val, val, val, val, val, val};
    return (__m256)res;

@@ -1613,8 +1613,13 @@ static void ggml_compute_forward_mul_mat_id(
        chunk_size = 64;
    }

#if defined(__aarch64__)
    // disable for ARM
    const bool disable_chunking = true;
#else
    // disable for NUMA
    const bool disable_chunking = ggml_is_numa();
#endif // defined(__aarch64__)

    int64_t nchunk0 = (nr0 + chunk_size - 1) / chunk_size;
    int64_t nchunk1 = (nr1 + chunk_size - 1) / chunk_size;

@@ -5474,7 +5474,7 @@ static void ggml_rope_cache_init(
}

static void ggml_mrope_cache_init(
    float theta_base_t, float theta_base_h, float theta_base_w, float theta_base_e, int sections[4], bool is_imrope, bool indep_sects,
    float theta_base_t, float theta_base_h, float theta_base_w, float theta_base_e, int sections[4], bool indep_sects,
    float freq_scale, const float * freq_factors, float corr_dims[2], int64_t ne0, float ext_factor, float mscale,
    float * cache, float sin_sign, float theta_scale) {
    // ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py
@@ -5509,26 +5509,14 @@ static void ggml_mrope_cache_init(
    }

    float theta = theta_t;
    if (is_imrope) { // qwen3vl applies interleaved mrope
        if (sector % 3 == 1 && sector < 3 * sections[1]) {
            theta = theta_h;
        } else if (sector % 3 == 2 && sector < 3 * sections[2]) {
            theta = theta_w;
        } else if (sector % 3 == 0 && sector < 3 * sections[0]) {
            theta = theta_t;
        } else {
            theta = theta_e;
        }
    } else {
        if (sector >= sections[0] && sector < sec_w) {
            theta = theta_h;
        }
        else if (sector >= sec_w && sector < sec_w + sections[2]) {
            theta = theta_w;
        }
        else if (sector >= sec_w + sections[2]) {
            theta = theta_e;
        }
    }
    if (sector >= sections[0] && sector < sec_w) {
        theta = theta_h;
    }
    else if (sector >= sec_w && sector < sec_w + sections[2]) {
        theta = theta_w;
    }
    else if (sector >= sec_w + sections[2]) {
        theta = theta_e;
    }

    rope_yarn(
@@ -5601,7 +5589,6 @@ static void ggml_compute_forward_rope_f32(

    const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
    const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;  // ggml_rope_multi, multimodal rotary position embedding
    const bool is_imrope = mode == GGML_ROPE_TYPE_IMROPE;  // qwen3vl applies interleaved mrope
    const bool is_vision = mode == GGML_ROPE_TYPE_VISION;

    if (is_mrope) {
@@ -5640,7 +5627,7 @@ static void ggml_compute_forward_rope_f32(
        const int64_t p_w = pos[i2 + ne2 * 2];
        const int64_t p_e = pos[i2 + ne2 * 3];
        ggml_mrope_cache_init(
            p_t, p_h, p_w, p_e, sections, is_imrope, is_vision,
            p_t, p_h, p_w, p_e, sections, is_vision,
            freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
    }

@@ -5788,7 +5775,6 @@ static void ggml_compute_forward_rope_f16(

    const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
    const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;
    const bool is_imrope = mode == GGML_ROPE_TYPE_IMROPE;
    const bool is_vision = mode == GGML_ROPE_TYPE_VISION;

    if (is_mrope) {
@@ -5827,7 +5813,7 @@ static void ggml_compute_forward_rope_f16(
        const int64_t p_w = pos[i2 + ne2 * 2];
        const int64_t p_e = pos[i2 + ne2 * 3];
        ggml_mrope_cache_init(
            p_t, p_h, p_w, p_e, sections, is_imrope, is_vision,
            p_t, p_h, p_w, p_e, sections, is_vision,
            freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
    }

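In the interleaved variant in this hunk, the time/height/width axes rotate through rope sectors modulo 3 instead of occupying contiguous section ranges as in plain m-rope. A standalone sketch of the sector-to-axis selection, mirroring the two branches above with made-up section counts:

```cpp
// Toy sketch of the interleaved (imrope) vs. contiguous (mrope) sector layout
// from the hunk above. sections[] holds per-axis sector counts {t, h, w, e};
// the values in main() are illustrative only.
#include <cstdio>

enum Axis { T, H, W, E };

Axis imrope_axis(int sector, const int sections[4]) {
    if (sector % 3 == 1 && sector < 3 * sections[1]) return H;
    if (sector % 3 == 2 && sector < 3 * sections[2]) return W;
    if (sector % 3 == 0 && sector < 3 * sections[0]) return T;
    return E;
}

Axis mrope_axis(int sector, const int sections[4]) {
    const int sec_w = sections[0] + sections[1];
    if (sector >= sections[0] && sector < sec_w)         return H;
    if (sector >= sec_w && sector < sec_w + sections[2]) return W;
    if (sector >= sec_w + sections[2])                   return E;
    return T;
}

int main() {
    const int sections[4] = {4, 4, 4, 0}; // assumed example split
    for (int s = 0; s < 12; ++s) {
        std::printf("sector %2d: mrope axis=%d  imrope axis=%d\n",
                    s, mrope_axis(s, sections), imrope_axis(s, sections));
    }
}
```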
@@ -7084,11 +7070,7 @@ static void ggml_compute_forward_conv_2d_dw_cwhn(
    const int64_t row_end = MIN(row_start + rows_per_thread, rows_total);

#ifdef GGML_SIMD
#if defined(__ARM_FEATURE_SVE)
    const int64_t pkg_size = svcntw();
#else
    const int64_t pkg_size = GGML_F32_EPR;
#endif
    const int64_t pkg_size = GGML_F32_EPR;
    const int64_t pkg_count = c / pkg_size;
    const int64_t c_pkg_end = pkg_count * pkg_size;
#else
@@ -7511,17 +7493,10 @@ static void ggml_compute_forward_upscale_f32(
    float sf1 = (float)ne1/src0->ne[1];
    float sf2 = (float)ne2/src0->ne[2];
    float sf3 = (float)ne3/src0->ne[3];
    float pixel_offset = 0.5f;

    const int32_t mode_flags = ggml_get_op_params_i32(dst, 0);
    const ggml_scale_mode mode = (ggml_scale_mode) (mode_flags & 0xFF);

    if (mode_flags & GGML_SCALE_FLAG_ALIGN_CORNERS) {
        pixel_offset = 0.0f;
        sf0 = ne0 > 1 && ne00 > 1 ? (float)(ne0 - 1) / (ne00 - 1) : sf0;
        sf1 = ne1 > 1 && ne01 > 1 ? (float)(ne1 - 1) / (ne01 - 1) : sf1;
    }

    if (mode == GGML_SCALE_MODE_NEAREST) {
        for (int64_t i3 = 0; i3 < ne3; i3++) {
            const int64_t i03 = i3 / sf3;
@@ -7541,6 +7516,13 @@ static void ggml_compute_forward_upscale_f32(
            }
        }
    } else if (mode == GGML_SCALE_MODE_BILINEAR) {
        float pixel_offset = 0.5f;
        if (mode_flags & GGML_SCALE_FLAG_ALIGN_CORNERS) {
            pixel_offset = 0.0f;
            sf0 = (float)(ne0 - 1) / (src0->ne[0] - 1);
            sf1 = (float)(ne1 - 1) / (src0->ne[1] - 1);
        }

        for (int64_t i3 = 0; i3 < ne3; i3++) {
            const int64_t i03 = i3 / sf3;
            for (int64_t i2 = ith; i2 < ne2; i2 += nth) {
@@ -7575,51 +7557,6 @@ static void ggml_compute_forward_upscale_f32(

                    const float val = a*(1 - dx)*(1 - dy) + b*dx*(1 - dy) + c*(1 - dx)*dy + d*dx*dy;

                    float * y_dst = (float *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
                    *y_dst = val;
                }
            }
        }
    }
} else if (mode == GGML_SCALE_MODE_BICUBIC) {
    // https://en.wikipedia.org/wiki/Bicubic_interpolation#Bicubic_convolution_algorithm
    const float a = -0.75f; // use alpha = -0.75 (same as PyTorch)
    auto weight1 = [a](float x) { return ((a + 2) * x - (a + 3)) * x * x + 1; };
    auto weight2 = [a](float x) { return ((a * x - 5 * a) * x + 8 * a) * x - 4 * a; };
    auto bicubic = [=](float p0, float p1, float p2, float p3, float x) {
        const float w0 = weight2(x + 1);
        const float w1 = weight1(x + 0);
        const float w2 = weight1(1 - x);
        const float w3 = weight2(2 - x);
        return p0*w0 + p1*w1 + p2*w2 + p3*w3;
    };

    for (int64_t i3 = 0; i3 < ne3; i3++) {
        const int64_t i03 = i3 / sf3;
        for (int64_t i2 = ith; i2 < ne2; i2 += nth) {
            const int64_t i02 = i2 / sf2;
            for (int64_t i1 = 0; i1 < ne1; i1++) {
                const float y = ((float)i1 + pixel_offset) / sf1 - pixel_offset;
                const int64_t y0 = (int64_t)floorf(y);
                const float dy = y - (float)y0;

                for (int64_t i0 = 0; i0 < ne0; i0++) {
                    const float x = ((float)i0 + pixel_offset) / sf0 - pixel_offset;
                    const int64_t x0 = (int64_t)floorf(x);
                    const float dx = x - (float)x0;

                    auto p = [=](int64_t x_off, int64_t y_off) -> float {
                        int64_t i00 = std::max(int64_t(0), std::min(x0 + x_off, ne00 - 1));
                        int64_t i01 = std::max(int64_t(0), std::min(y0 + y_off, ne01 - 1));
                        return *(const float *)((const char *)src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
                    };

                    const float val = bicubic(
                        bicubic(p(-1,-1), p(0,-1), p(1,-1), p(2,-1), dx),
                        bicubic(p(-1, 0), p(0, 0), p(1, 0), p(2, 0), dx),
                        bicubic(p(-1, 1), p(0, 1), p(1, 1), p(2, 1), dx),
                        bicubic(p(-1, 2), p(0, 2), p(1, 2), p(2, 2), dx), dy);

                    float * y_dst = (float *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
                    *y_dst = val;
                }

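The bicubic convolution kernel removed above evaluates four weights per axis from the fractional offset, and for any offset the four weights sum to 1, which keeps the filter brightness-preserving. A small standalone check of that property using the same alpha = -0.75 polynomials:

```cpp
// Standalone check of the bicubic convolution weights from the hunk above
// (alpha = -0.75, as in PyTorch): for any x in [0, 1), w0+w1+w2+w3 == 1.
#include <cstdio>

int main() {
    const float a = -0.75f;
    auto weight1 = [a](float x) { return ((a + 2) * x - (a + 3)) * x * x + 1; };
    auto weight2 = [a](float x) { return ((a * x - 5 * a) * x + 8 * a) * x - 4 * a; };

    for (float x = 0.0f; x < 1.0f; x += 0.25f) {
        const float w0 = weight2(x + 1); // taps at x0-1, x0, x0+1, x0+2
        const float w1 = weight1(x + 0);
        const float w2 = weight1(1 - x);
        const float w3 = weight2(2 - x);
        std::printf("x=%.2f  w=[% .4f % .4f % .4f % .4f]  sum=%.4f\n",
                    x, w0, w1, w2, w3, w0 + w1 + w2 + w3);
    }
}
```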
@@ -7972,10 +7909,10 @@ void ggml_compute_forward_argsort(

// ggml_compute_forward_flash_attn_ext

static void ggml_compute_forward_flash_attn_ext_f16_one_chunk(
static void ggml_compute_forward_flash_attn_ext_f16(
        const ggml_compute_params * params,
        ggml_tensor * dst,
        int ir0, int ir1) {
        ggml_tensor * dst) {

    const ggml_tensor * q = dst->src[0];
    const ggml_tensor * k = dst->src[1];
    const ggml_tensor * v = dst->src[2];
@@ -7991,6 +7928,9 @@ static void ggml_compute_forward_flash_attn_ext_f16_one_chunk(
    GGML_TENSOR_LOCALS(int64_t, ne, dst, ne)
    GGML_TENSOR_LOCALS(size_t, nb, dst, nb)

    const int ith = params->ith;
    const int nth = params->nth;

    const int64_t DK = nek0;
    const int64_t DV = nev0;
    const int64_t N = neq1;
@@ -8024,6 +7964,16 @@ static void ggml_compute_forward_flash_attn_ext_f16_one_chunk(

    // parallelize by q rows using ggml_vec_dot_f32

    // total rows in q
    const int nr = neq1*neq2*neq3;

    // rows per thread
    const int dr = (nr + nth - 1)/nth;

    // row range for this thread
    const int ir0 = dr*ith;
    const int ir1 = MIN(ir0 + dr, nr);

    float scale         = 1.0f;
    float max_bias      = 0.0f;
    float logit_softcap = 0.0f;
@@ -8050,8 +8000,6 @@ static void ggml_compute_forward_flash_attn_ext_f16_one_chunk(
    GGML_ASSERT(( q_to_vec_dot) && "fattn: unsupported K-type");
    GGML_ASSERT((v->type == GGML_TYPE_F32 || v_to_float ) && "fattn: unsupported V-type");

    int ith = params->ith;

    // loop over n_batch and n_head
    for (int ir = ir0; ir < ir1; ++ir) {
        // q indices
@@ -8199,91 +8147,6 @@ static void ggml_compute_forward_flash_attn_ext_f16_one_chunk(
    }
}

static void ggml_compute_forward_flash_attn_ext_f16(
        const ggml_compute_params * params,
        ggml_tensor * dst) {

    const ggml_tensor * q = dst->src[0];
    const ggml_tensor * k = dst->src[1];
    const ggml_tensor * v = dst->src[2];

    GGML_TENSOR_LOCALS(int64_t, neq, q, ne)
    GGML_TENSOR_LOCALS(size_t, nbq, q, nb)
    GGML_TENSOR_LOCALS(int64_t, nek, k, ne)
    GGML_TENSOR_LOCALS(size_t, nbk, k, nb)
    GGML_TENSOR_LOCALS(int64_t, nev, v, ne)
    GGML_TENSOR_LOCALS(size_t, nbv, v, nb)
    GGML_TENSOR_LOCALS(int64_t, ne, dst, ne)
    GGML_TENSOR_LOCALS(size_t, nb, dst, nb)

    const int64_t DK = nek0;
    const int64_t DV = nev0;
    const int64_t N = neq1;

    GGML_ASSERT(ne0 == DV);
    GGML_ASSERT(ne2 == N);

    // input tensor rows must be contiguous
    GGML_ASSERT(nbq0 == ggml_type_size(q->type));
    GGML_ASSERT(nbk0 == ggml_type_size(k->type));
    GGML_ASSERT(nbv0 == ggml_type_size(v->type));

    GGML_ASSERT(neq0 == DK);
    GGML_ASSERT(nek0 == DK);
    GGML_ASSERT(nev0 == DV);

    GGML_ASSERT(neq1 == N);

    // dst cannot be transposed or permuted
    GGML_ASSERT(nb0 == sizeof(float));
    GGML_ASSERT(nb0 <= nb1);
    GGML_ASSERT(nb1 <= nb2);
    GGML_ASSERT(nb2 <= nb3);

    // parallelize by q rows using ggml_vec_dot_f32

    // total rows in q
    const int64_t nr = neq1*neq2*neq3;

    // rows per thread
    const int ith = params->ith;
    const int nth = params->nth;

    // disable for NUMA
    const bool disable_chunking = ggml_is_numa();

    // 4x chunks per thread
    int nth_scaled = nth * 4;
    int64_t chunk_size = (nr + nth_scaled - 1) / nth_scaled;
    int64_t nchunk = (nr + chunk_size - 1) / chunk_size;

    if (nth == 1 || nchunk < nth || disable_chunking) {
        nchunk = nth;
    }

    if (ith == 0) {
        // Every thread starts at ith, so the first unprocessed chunk is nth. This saves a bit of coordination right at the start.
        ggml_threadpool_chunk_set(params->threadpool, nth);
    }

    ggml_barrier(params->threadpool);

    // The number of elements in each chunk
    const int64_t dr = (nr + nchunk - 1) / nchunk;

    // The first chunk comes from our thread_id, the rest will get auto-assigned.
    int current_chunk = ith;

    while (current_chunk < nchunk) {
        const int64_t ir0 = dr * current_chunk;
        const int64_t ir1 = MIN(ir0 + dr, nr);

        ggml_compute_forward_flash_attn_ext_f16_one_chunk(params, dst, ir0, ir1);

        current_chunk = ggml_threadpool_chunk_add(params->threadpool, 1);
    }
}

void ggml_compute_forward_flash_attn_ext(
        const ggml_compute_params * params,
        ggml_tensor * dst) {

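The chunked dispatch in this hunk is a work-stealing-style loop over a shared counter: every thread seeds itself with its own index, and once it finishes a chunk it grabs the next unclaimed one. A minimal sketch of the same scheduling pattern, with `std::atomic` standing in for the ggml threadpool counter and made-up sizes:

```cpp
// Minimal sketch of the dynamic chunk scheduling above; std::atomic stands in
// for ggml_threadpool_chunk_set/add, and the sizes are illustrative.
#include <atomic>
#include <cstdio>
#include <thread>
#include <vector>

int main() {
    const int nth    = 4;                          // threads
    const int nr     = 103;                        // total rows
    const int nchunk = nth * 4;                    // ~4 chunks per thread
    const int dr     = (nr + nchunk - 1) / nchunk; // rows per chunk

    // chunks 0..nth-1 are pre-claimed, one per thread, so the counter starts at nth
    std::atomic<int> next_chunk{nth};

    std::vector<std::thread> threads;
    for (int ith = 0; ith < nth; ++ith) {
        threads.emplace_back([&, ith] {
            for (int chunk = ith; chunk < nchunk; chunk = next_chunk.fetch_add(1)) {
                const int ir0 = dr * chunk;
                const int ir1 = ir0 + dr < nr ? ir0 + dr : nr;
                std::printf("thread %d: rows [%d, %d)\n", ith, ir0, ir1);
            }
        });
    }
    for (auto & t : threads) t.join();
}
```

Threads that finish early simply pull more chunks, so slow rows do not serialize the whole batch.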
@@ -1600,32 +1600,6 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
        return false;
    }

    void forward_mul_mat_one_chunk(ggml_compute_params * params, ggml_tensor * op, int64_t src0_start, int64_t src0_end) {
        const ggml_tensor * src0 = op->src[0];
        const ggml_tensor * src1 = op->src[1];
        ggml_tensor * dst = op;

        GGML_TENSOR_BINARY_OP_LOCALS

        const void * src1_wdata = params->wdata;
        const size_t src1_col_stride = ggml_row_size(PARAM_TYPE, ne10);

        // If there are more than three rows in src1, use gemm; otherwise, use gemv.
        if (ne11 > 3) {
            gemm<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00,
                (float *) ((char *) dst->data) + src0_start, ne01,
                (const char *) src0->data + src0_start * nb01,
                (const char *) src1_wdata, ne11 - ne11 % 4, src0_end - src0_start);
        }
        for (int iter = ne11 - ne11 % 4; iter < ne11; iter++) {
            gemv<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00,
                (float *) ((char *) dst->data + (iter * nb1)) + src0_start, ne01,
                (const char *) src0->data + src0_start * nb01,
                (const char *) src1_wdata + (src1_col_stride * iter), 1,
                src0_end - src0_start);
        }
    }

    void forward_mul_mat(ggml_compute_params * params, ggml_tensor * op) {
        const ggml_tensor * src0 = op->src[0];
        const ggml_tensor * src1 = op->src[1];
@@ -1669,62 +1643,31 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
            from_float((float *) ((char *) src1->data + i11 * nb11), (void *) (wdata + i11 * nbw1), ne10);
        }

        // disable for NUMA
        const bool disable_chunking = ggml_is_numa();

        // 4x chunks per thread
        int64_t nr = ggml_nrows(op->src[0]);
        int nth_scaled = nth * 4;
        int64_t chunk_size = (nr + nth_scaled - 1) / nth_scaled;
        int64_t nchunk = (nr + chunk_size - 1) / chunk_size;

        // Ensure minimum chunk size to avoid alignment issues with high thread counts
        // Minimum chunk size should be at least NB_COLS to prevent overlapping chunks after alignment
        const int64_t min_chunk_size = NB_COLS;
        if (nchunk > 0 && (nr / nchunk) < min_chunk_size && nr >= min_chunk_size) {
            nchunk = (nr + min_chunk_size - 1) / min_chunk_size;
        }

        if (nth == 1 || nchunk < nth || disable_chunking) {
            nchunk = nth;
        }

        // Ensure nchunk doesn't exceed the number of rows divided by minimum chunk size
        // This prevents creating too many tiny chunks that could overlap after alignment
        const int64_t max_nchunk = (nr + min_chunk_size - 1) / min_chunk_size;
        if (nchunk > max_nchunk) {
            nchunk = max_nchunk;
        }

        if (ith == 0) {
            // Every thread starts at ith, so the first unprocessed chunk is nth. This saves a bit of coordination right at the start.
            ggml_threadpool_chunk_set(params->threadpool, nth);
        }

        ggml_barrier(params->threadpool);

        // The first chunk comes from our thread_id, the rest will get auto-assigned.
        int current_chunk = ith;
        const void * src1_wdata = params->wdata;
        const size_t src1_col_stride = ggml_row_size(PARAM_TYPE, ne10);
        int64_t src0_start = (ith * ne01) / nth;
        int64_t src0_end = ((ith + 1) * ne01) / nth;
        src0_start = (src0_start % NB_COLS) ? src0_start + NB_COLS - (src0_start % NB_COLS) : src0_start;
        src0_end = (src0_end % NB_COLS) ? src0_end + NB_COLS - (src0_end % NB_COLS) : src0_end;
        if (src0_start >= src0_end) {
            return;
        }

        while (current_chunk < nchunk) {
            int64_t src0_start = (current_chunk * ne01) / nchunk;
            int64_t src0_end = ((current_chunk + 1) * ne01) / nchunk;

            // Align boundaries to NB_COLS - round up to ensure all data is included
            // The chunk size limiting above ensures chunks are large enough to prevent overlaps
            src0_start = (src0_start % NB_COLS) ? src0_start + NB_COLS - (src0_start % NB_COLS) : src0_start;
            src0_end = (src0_end % NB_COLS) ? src0_end + NB_COLS - (src0_end % NB_COLS) : src0_end;
            if (src0_end > ne01) {
                src0_end = ne01;
            }

            if (src0_start >= src0_end) {
                break;
            }

            forward_mul_mat_one_chunk(params, dst, src0_start, src0_end);

            current_chunk = ggml_threadpool_chunk_add(params->threadpool, 1);
            // If there are more than three rows in src1, use gemm; otherwise, use gemv.
            if (ne11 > 3) {
                gemm<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00,
                    (float *) ((char *) dst->data) + src0_start, ne01,
                    (const char *) src0->data + src0_start * nb01,
                    (const char *) src1_wdata, ne11 - ne11 % 4, src0_end - src0_start);
            }
            for (int iter = ne11 - ne11 % 4; iter < ne11; iter++) {
                gemv<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00,
                    (float *) ((char *) dst->data + (iter * nb1)) + src0_start, ne01,
                    (const char *) src0->data + src0_start * nb01,
                    (const char *) src1_wdata + (src1_col_stride * iter), 1,
                    src0_end - src0_start);
            }
        }

@@ -1829,12 +1772,8 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
        int64_t src0_cur_start = (ith * ne01) / nth;
        int64_t src0_cur_end   = ((ith + 1) * ne01) / nth;

        // Align boundaries to NB_COLS - round up to ensure all data is included
        src0_cur_start = (src0_cur_start % NB_COLS) ? src0_cur_start + NB_COLS - (src0_cur_start % NB_COLS) : src0_cur_start;
        src0_cur_end = (src0_cur_end % NB_COLS) ? src0_cur_end + NB_COLS - (src0_cur_end % NB_COLS) : src0_cur_end;
        if (src0_cur_end > ne01) {
            src0_cur_end = ne01;
        }

        if (src0_cur_start >= src0_cur_end) {
            return;

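Chunk boundaries in this hunk are rounded up to a multiple of NB_COLS so that adjacent chunks never split an interleaved column group, with the last chunk clamped back to ne01. A small sketch of that rounding with an assumed NB_COLS of 8:

```cpp
// Sketch of the NB_COLS boundary alignment used above; NB_COLS, ne01 and
// nchunk are assumed example values, not taken from the repack templates.
#include <cstdint>
#include <cstdio>

int64_t align_up(int64_t v, int64_t nb_cols) {
    return (v % nb_cols) ? v + nb_cols - (v % nb_cols) : v;
}

int main() {
    const int64_t NB_COLS = 8, ne01 = 100, nchunk = 6;
    for (int64_t c = 0; c < nchunk; ++c) {
        int64_t start = align_up((c       * ne01) / nchunk, NB_COLS);
        int64_t end   = align_up(((c + 1) * ne01) / nchunk, NB_COLS);
        if (end > ne01) end = ne01;   // clamp the final chunk
        if (start >= end) continue;   // chunk fully absorbed by its predecessor
        std::printf("chunk %lld: rows [%lld, %lld)\n",
                    (long long) c, (long long) start, (long long) end);
    }
}
```

Because both ends of every chunk are rounded the same way, consecutive chunks share a boundary and cannot overlap, which is why the minimum-chunk-size guards above matter at high thread counts.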
@@ -956,7 +956,7 @@ do { \
|
||||
|
||||
#define GGML_F32Cx8 __m256
|
||||
#define GGML_F32Cx8_ZERO (__m256)__lasx_xvldi(0)
|
||||
#define GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplfr2vr_s((x))
|
||||
#define GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplgr2vr_w((x))
|
||||
|
||||
static inline __m256 __lasx_f32cx8_load(const ggml_fp16_t * x) {
|
||||
__m256i a;
|
||||
@@ -999,34 +999,34 @@ static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
|
||||
|
||||
#define GGML_F32x4 __m128
|
||||
#define GGML_F32x4_ZERO (__m128)__lsx_vldi(0)
|
||||
#define GGML_F32x4_SET1(x) (__m128)__lsx_vreplfr2vr_s((x))
|
||||
#define GGML_F32x4_SET1(x) (__m128)__lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
|
||||
#define GGML_F32x4_LOAD(x) (__m128)__lsx_vld((x), 0)
|
||||
#define GGML_F32x4_STORE(x, y) __lsx_vst(y, x, 0)
|
||||
#define GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a)
|
||||
#define GGML_F32x4_ADD __lsx_vfadd_s
|
||||
#define GGML_F32x4_MUL __lsx_vfmul_s
|
||||
|
||||
#define GGML_F32x4_REDUCE(res, x) \
|
||||
{ \
|
||||
int offset = GGML_F32_ARR >> 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \
|
||||
} \
|
||||
offset >>= 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \
|
||||
} \
|
||||
offset >>= 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \
|
||||
} \
|
||||
__m128i t0 = __lsx_vpickev_w((__m128i)x[0], (__m128i)x[0]); \
|
||||
__m128i t1 = __lsx_vpickod_w((__m128i)x[0], (__m128i)x[0]); \
|
||||
__m128 t2 = __lsx_vfadd_s((__m128)t0, (__m128)t1); \
|
||||
__m128i t3 = __lsx_vpickev_w((__m128i)t2, (__m128i)t2); \
|
||||
__m128i t4 = __lsx_vpickod_w((__m128i)t2, (__m128i)t2); \
|
||||
__m128 t5 = __lsx_vfadd_s((__m128)t3, (__m128)t4); \
|
||||
res = (ggml_float) ((v4f32)t5)[0]; \
|
||||
#define GGML_F32x4_REDUCE(res, x) \
|
||||
{ \
|
||||
int offset = GGML_F32_ARR >> 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
x[i] = __lsx_vfadd_s(x[i], x[offset + i]); \
|
||||
} \
|
||||
offset >>= 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
x[i] = __lsx_vfadd_s(x[i], x[offset + i]); \
|
||||
} \
|
||||
offset >>= 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
x[i] = __lsx_vfadd_s(x[i], x[offset + i]); \
|
||||
} \
|
||||
__m128i tmp = __lsx_vsrli_d((__m128i) x[0], 32); \
|
||||
tmp = (__m128i) __lsx_vfadd_s((__m128) tmp, x[0]); \
|
||||
tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
|
||||
const __m128 t0 = (__m128)__lsx_vshuf4i_w(tmp, 0x88); \
|
||||
tmp = __lsx_vsrli_d((__m128i) t0, 32); \
|
||||
tmp = (__m128i) __lsx_vfadd_s((__m128) tmp, t0); \
|
||||
tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
|
||||
res = (ggml_float) __lsx_vpickve2gr_w(__lsx_vshuf4i_w(tmp, 0x88), 0); \
|
||||
}
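
Both variants of GGML_F32x4_REDUCE implement the same two-stage reduction — pairwise-fold the GGML_F32_ARR accumulator vectors down to one, then sum that vector's four lanes horizontally — and differ only in the LSX shuffle sequence used for the final horizontal add. A scalar sketch of the logic (assuming four-lane vectors and a power-of-two GGML_F32_ARR):

// scalar sketch of the reduction implemented by GGML_F32x4_REDUCE
static float reduce_f32x4(float x[][4], int n_vec /* = GGML_F32_ARR */) {
    for (int offset = n_vec >> 1; offset > 0; offset >>= 1) {
        for (int i = 0; i < offset; ++i) {
            for (int lane = 0; lane < 4; ++lane) {
                x[i][lane] += x[offset + i][lane];   // one vector add per pair
            }
        }
    }
    return x[0][0] + x[0][1] + x[0][2] + x[0][3];    // horizontal add
}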

#define GGML_F32_VEC GGML_F32x4
@@ -1068,7 +1068,7 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {

#define GGML_F32Cx4 __m128
#define GGML_F32Cx4_ZERO (__m128)__lsx_vldi(0)
#define GGML_F32Cx4_SET1(x) (__m128)__lsx_vreplfr2vr_s((x))
#define GGML_F32Cx4_SET1(x) (__m128)__lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
#define GGML_F32Cx4_LOAD(x) (__m128)__lsx_f16x4_load(x)
#define GGML_F32Cx4_STORE(x, y) __lsx_f16x4_store(x, y)
#define GGML_F32Cx4_FMA GGML_F32x4_FMA

@@ -1,81 +1,5 @@
#include "argsort.cuh"

#ifdef GGML_CUDA_USE_CUB
# include <cub/cub.cuh>
using namespace cub;
#endif // GGML_CUDA_USE_CUB

static __global__ void init_indices(int * indices, const int ncols, const int nrows) {
const int col = blockIdx.x * blockDim.x + threadIdx.x;
const int row = blockIdx.y;

if (col < ncols && row < nrows) {
indices[row * ncols + col] = col;
}
}

static __global__ void init_offsets(int * offsets, const int ncols, const int nrows) {
const int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx <= nrows) {
offsets[idx] = idx * ncols;
}
}
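
init_offsets writes nrows + 1 monotone offsets so that segment i of the segmented sort below is the half-open range [offsets[i], offsets[i+1]); with fixed-width rows this is just i * ncols. The same layout on the host, for reference:

// host-side sketch: CSR-style segment boundaries for nrows rows of ncols items
// offsets[0] == 0, offsets[nrows] == ncols * nrows
for (int i = 0; i <= nrows; ++i) {
    offsets[i] = i * ncols;
}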

#ifdef GGML_CUDA_USE_CUB
static void argsort_f32_i32_cuda_cub(ggml_cuda_pool & pool,
const float * x,
int * dst,
const int ncols,
const int nrows,
ggml_sort_order order,
cudaStream_t stream) {
ggml_cuda_pool_alloc<int> temp_indices_alloc(pool, ncols * nrows);
ggml_cuda_pool_alloc<float> temp_keys_alloc(pool, ncols * nrows);
ggml_cuda_pool_alloc<int> offsets_alloc(pool, nrows + 1);

int * temp_indices = temp_indices_alloc.get();
float * temp_keys = temp_keys_alloc.get();
int * d_offsets = offsets_alloc.get();

static const int block_size = 256;
const dim3 grid_size((ncols + block_size - 1) / block_size, nrows);
init_indices<<<grid_size, block_size, 0, stream>>>(temp_indices, ncols, nrows);

const dim3 offset_grid((nrows + block_size - 1) / block_size);
init_offsets<<<offset_grid, block_size, 0, stream>>>(d_offsets, ncols, nrows);

cudaMemcpyAsync(temp_keys, x, ncols * nrows * sizeof(float), cudaMemcpyDeviceToDevice, stream);

size_t temp_storage_bytes = 0;

if (order == GGML_SORT_ORDER_ASC) {
DeviceSegmentedRadixSort::SortPairs(nullptr, temp_storage_bytes, temp_keys, temp_keys, // keys (in-place)
temp_indices, dst, // values (indices)
ncols * nrows, nrows, // num items, num segments
d_offsets, d_offsets + 1, 0, sizeof(float) * 8, // all bits
stream);
} else {
DeviceSegmentedRadixSort::SortPairsDescending(nullptr, temp_storage_bytes, temp_keys, temp_keys, temp_indices,
dst, ncols * nrows, nrows, d_offsets, d_offsets + 1, 0,
sizeof(float) * 8, stream);
}

ggml_cuda_pool_alloc<uint8_t> temp_storage_alloc(pool, temp_storage_bytes);
void * d_temp_storage = temp_storage_alloc.get();

if (order == GGML_SORT_ORDER_ASC) {
DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, temp_keys, temp_keys, temp_indices, dst,
ncols * nrows, nrows, d_offsets, d_offsets + 1, 0, sizeof(float) * 8,
stream);
} else {
DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, temp_keys, temp_keys,
temp_indices, dst, ncols * nrows, nrows, d_offsets, d_offsets + 1,
0, sizeof(float) * 8, stream);
}
}
#endif // GGML_CUDA_USE_CUB
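
The doubled DeviceSegmentedRadixSort calls above follow CUB's two-phase protocol: a first call with a null temp-storage pointer only reports the required scratch size, then the identical call is repeated with real storage. A self-contained sketch of the pattern (device pointers and a plain cudaMalloc assumed; the code above uses the pool allocator instead):

#include <cub/cub.cuh>

// sketch: CUB two-phase temp-storage protocol for a segmented pair sort
static void segmented_sort_pairs(const float * keys_in, float * keys_out,
                                 const int * vals_in, int * vals_out,
                                 int num_items, int num_segments,
                                 const int * d_offsets, cudaStream_t stream) {
    size_t temp_bytes = 0; // 1st call: nullptr -> only computes temp_bytes
    cub::DeviceSegmentedRadixSort::SortPairs(nullptr, temp_bytes,
        keys_in, keys_out, vals_in, vals_out, num_items, num_segments,
        d_offsets, d_offsets + 1, 0, sizeof(float) * 8, stream);
    void * d_temp = nullptr;
    cudaMalloc(&d_temp, temp_bytes);
    // 2nd call: same arguments, now with scratch space -> does the sort
    cub::DeviceSegmentedRadixSort::SortPairs(d_temp, temp_bytes,
        keys_in, keys_out, vals_in, vals_out, num_items, num_segments,
        d_offsets, d_offsets + 1, 0, sizeof(float) * 8, stream);
    cudaFree(d_temp);
}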

// Bitonic sort implementation
template<typename T>
static inline __device__ void ggml_cuda_swap(T & a, T & b) {
T tmp = a;
@@ -87,7 +11,7 @@ template<ggml_sort_order order>
static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int ncols, int ncols_pad) {
// bitonic sort
int col = threadIdx.x;
int row = blockIdx.x;
int row = blockIdx.y;

if (col >= ncols_pad) {
return;
@@ -141,28 +65,21 @@ static int next_power_of_2(int x) {
return n;
}
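
next_power_of_2 (only its tail survives the hunk) pads ncols up to the power-of-two problem size the bitonic network requires; the padding lanes exit early in the kernel. A common formulation, consistent with the `return n;` visible above:

// sketch: round x up to the next power of two (e.g. 1000 -> 1024)
static int next_power_of_2(int x) {
    int n = 1;
    while (n < x) {
        n *= 2;
    }
    return n;
}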

static void argsort_f32_i32_cuda_bitonic(const float * x,
int * dst,
const int ncols,
const int nrows,
ggml_sort_order order,
cudaStream_t stream) {
static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, const int nrows, ggml_sort_order order, cudaStream_t stream) {
// bitonic sort requires ncols to be power of 2
const int ncols_pad = next_power_of_2(ncols);

const dim3 block_dims(ncols_pad, 1, 1);
const dim3 block_nums(nrows, 1, 1);
const dim3 block_nums(1, nrows, 1);
const size_t shared_mem = ncols_pad * sizeof(int);

// FIXME: this limit could be raised by ~2-4x on Ampere or newer
GGML_ASSERT(shared_mem <= ggml_cuda_info().devices[ggml_cuda_get_device()].smpb);

if (order == GGML_SORT_ORDER_ASC) {
k_argsort_f32_i32<GGML_SORT_ORDER_ASC>
<<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
k_argsort_f32_i32<GGML_SORT_ORDER_ASC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
} else if (order == GGML_SORT_ORDER_DESC) {
k_argsort_f32_i32<GGML_SORT_ORDER_DESC>
<<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
k_argsort_f32_i32<GGML_SORT_ORDER_DESC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
} else {
GGML_ABORT("fatal error");
}
@@ -183,18 +100,5 @@ void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {

enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];

#ifdef GGML_CUDA_USE_CUB
const int ncols_pad = next_power_of_2(ncols);
const size_t shared_mem = ncols_pad * sizeof(int);
const size_t max_shared_mem = ggml_cuda_info().devices[ggml_cuda_get_device()].smpb;

if (shared_mem > max_shared_mem || ncols > 1024) {
ggml_cuda_pool & pool = ctx.pool();
argsort_f32_i32_cuda_cub(pool, src0_d, (int *) dst_d, ncols, nrows, order, stream);
} else {
argsort_f32_i32_cuda_bitonic(src0_d, (int *) dst_d, ncols, nrows, order, stream);
}
#else
argsort_f32_i32_cuda_bitonic(src0_d, (int *) dst_d, ncols, nrows, order, stream);
#endif
argsort_f32_i32_cuda(src0_d, (int *)dst_d, ncols, nrows, order, stream);
}

@@ -272,7 +272,7 @@ static void launch_bin_bcast_pack(const ggml_tensor * src0, const ggml_tensor *
const uint3 ne12 = init_fastdiv_values((uint32_t) cne1[2]);
const uint3 ne13 = init_fastdiv_values((uint32_t) cne1[3]);

if (block_nums.z > 65535 || block_nums.y > 65535) {
if (block_nums.z > 65535) {
int block_num = (ne0 * ne1 * ne2 * ne3 + block_size - 1) / block_size;
const uint3 prod_012 = init_fastdiv_values((uint32_t) (ne0 * ne1 * ne2));
const uint3 prod_01 = init_fastdiv_values((uint32_t) (ne0 * ne1));

@@ -224,11 +224,6 @@ static const char * cu_get_error_str(CUresult err) {
#define AMD_MFMA_AVAILABLE
#endif // defined(GGML_USE_HIP) && defined(CDNA) && !defined(GGML_HIP_NO_MMQ_MFMA)

// The Volta instructions are in principle available on Turing or newer but they are effectively unusable:
#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
#define VOLTA_MMA_AVAILABLE
#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA

#if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
#define TURING_MMA_AVAILABLE
#endif // !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
@@ -283,10 +278,7 @@ static bool amd_mfma_available(const int cc) {
#endif //!defined(GGML_HIP_NO_MMQ_MFMA)
}

static bool volta_mma_available(const int cc) {
return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) == GGML_CUDA_CC_VOLTA;
}

// Volta technically had FP16 tensor cores but they work very differently compared to Turing and later.
static bool turing_mma_available(const int cc) {
return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_TURING;
}
@@ -633,11 +625,8 @@ static __device__ __forceinline__ float ggml_cuda_e8m0_to_fp32(uint8_t x) {
// and a shift:
//
// n/d = (mulhi(n, mp) + n) >> L;
static const uint3 init_fastdiv_values(uint64_t d_64) {
GGML_ASSERT(d_64 != 0);
GGML_ASSERT(d_64 <= std::numeric_limits<uint32_t>::max());

uint32_t d = (uint32_t)d_64;
static const uint3 init_fastdiv_values(uint32_t d) {
GGML_ASSERT(d != 0);

// compute L = ceil(log2(d));
uint32_t L = 0;
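
The fastdiv trick in the comment replaces a 32-bit division with a multiply-high plus shift using the precomputed constants mp and L. A host-side sketch that derives the constants and applies the identity from the comment (illustrative only; the real init_fastdiv_values packs d, mp and L into a uint3 for the kernels, and the 64-bit product here assumes d fits comfortably below 2^31):

#include <assert.h>
#include <stdint.h>

// sketch: derive (mp, L) for divisor d, then n / d == (mulhi(n, mp) + n) >> L
static void fastdiv_constants(uint32_t d, uint32_t * mp, uint32_t * L) {
    *L = 0;
    while ((1ull << *L) < d) {
        ++*L;                                        // L = ceil(log2(d))
    }
    *mp = (uint32_t) (((1ull << 32) * ((1ull << *L) - d)) / d + 1);
}

static uint32_t fastdiv(uint32_t n, uint32_t mp, uint32_t L) {
    const uint32_t hi = (uint32_t) (((uint64_t) n * mp) >> 32); // mulhi(n, mp)
    return (hi + n) >> L;
}

// e.g.: fastdiv_constants(7, &mp, &L); assert(fastdiv(100, mp, L) == 100u / 7u);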

@@ -1016,16 +1005,3 @@ struct ggml_backend_cuda_context {
return pool(device);
}
};

struct ggml_cuda_mm_fusion_args_host {
const ggml_tensor * x_bias = nullptr;
const ggml_tensor * gate = nullptr;
const ggml_tensor * gate_bias = nullptr;
ggml_glu_op glu_op;
};
struct ggml_cuda_mm_fusion_args_device {
const void * x_bias = nullptr;
const void * gate = nullptr;
const void * gate_bias = nullptr;
ggml_glu_op glu_op;
};

@@ -1,4 +1,3 @@
#pragma once
#include "common.cuh"

#define CUDA_DEQUANTIZE_BLOCK_SIZE 256

@@ -7,10 +7,6 @@

typedef void (*cpy_kernel_t)(const char * cx, char * cdst);

const int CUDA_CPY_TILE_DIM_2D = 32; // 2D tile dimension for transposed blocks
const int CUDA_CPY_BLOCK_NM = 8; // block size of 3rd dimension if available
const int CUDA_CPY_BLOCK_ROWS = 8; // block dimension for marching through rows

template <cpy_kernel_t cpy_1>
static __global__ void cpy_flt(const char * cx, char * cdst, const int ne,
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
@@ -39,55 +35,6 @@ static __global__ void cpy_flt(const char * cx, char * cdst, const int ne,
cpy_1(cx + x_offset, cdst + dst_offset);
}

template <typename T>
static __global__ void cpy_flt_transpose(const char * cx, char * cdst, const int ne,
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
const int nb12, const int nb13) {

const T* src = reinterpret_cast<const T*>(cx);
T* dst = reinterpret_cast<T*>(cdst);

const int64_t nmat = ne / (ne00 * ne01);
const int64_t n = ne00 * ne01;

const int x = blockIdx.x * CUDA_CPY_TILE_DIM_2D + threadIdx.x;
const int y = blockIdx.y * CUDA_CPY_TILE_DIM_2D + threadIdx.y;
const int tx = blockIdx.y * CUDA_CPY_TILE_DIM_2D + threadIdx.x; // transpose block offset
const int ty = blockIdx.x * CUDA_CPY_TILE_DIM_2D + threadIdx.y;

__shared__ float tile[CUDA_CPY_TILE_DIM_2D][CUDA_CPY_TILE_DIM_2D+1];

#pragma unroll
for (int i = 0; i < CUDA_CPY_BLOCK_NM; ++i) {

const unsigned int imat = blockIdx.z * CUDA_CPY_BLOCK_NM + i;
if (imat >= nmat)
break;

#pragma unroll
for (int j = 0; j < CUDA_CPY_TILE_DIM_2D; j += CUDA_CPY_BLOCK_ROWS) {
if (x < ne01 && y + j < ne00) {
const int row = threadIdx.y+j;
const int col = threadIdx.x * sizeof(float)/sizeof(T);
T *tile2 = reinterpret_cast<T*>(tile[row]);
tile2[col] = src[imat*n + (y+j)*ne01 + x];
}
}

__syncthreads();

#pragma unroll
for (int j = 0; j < CUDA_CPY_TILE_DIM_2D; j += CUDA_CPY_BLOCK_ROWS) {
if (ty + j < ne01 && tx < ne00) {
const int col = (threadIdx.y+j)*sizeof(float)/sizeof(T);
const T *tile2 = reinterpret_cast<const T*>(tile[threadIdx.x]);
dst[imat*n + (ty+j)*ne00 + tx] = tile2[col];
}
}
}
}
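
cpy_flt_transpose above is the classic staged transpose: a 32x32 tile goes through shared memory so that both the global read and the global write are coalesced, and the tile is declared with one extra column so the transposed accesses don't land in the same shared-memory bank (the float-typed tile plus the sizeof scaling also lets one tile serve 2- and 4-byte element types). The bare pattern for float, stripped of the batching logic:

// sketch: coalesced matrix transpose via a padded shared-memory tile
#define TILE_DIM   32
#define BLOCK_ROWS 8

__global__ void transpose_f32(const float * src, float * dst, int w, int h) {
    __shared__ float tile[TILE_DIM][TILE_DIM + 1]; // +1 column avoids bank conflicts
    int x = blockIdx.x * TILE_DIM + threadIdx.x;
    int y = blockIdx.y * TILE_DIM + threadIdx.y;
    for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) {
        if (x < w && y + j < h) {
            tile[threadIdx.y + j][threadIdx.x] = src[(y + j) * w + x];
        }
    }
    __syncthreads();
    x = blockIdx.y * TILE_DIM + threadIdx.x;       // transposed block offset
    y = blockIdx.x * TILE_DIM + threadIdx.y;
    for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) {
        if (x < h && y + j < w) {
            dst[(y + j) * h + x] = tile[threadIdx.x][threadIdx.y + j];
        }
    }
}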

static __device__ void cpy_blck_q8_0_f32(const char * cxi, char * cdsti) {
float * cdstf = (float *)(cdsti);

@@ -166,61 +113,14 @@ static __global__ void cpy_q_f32(const char * cx, char * cdst, const int ne,
}

template<typename src_t, typename dst_t>
static __global__ void cpy_flt_contiguous(const char * cx, char * cdst, const int64_t ne) {
const int64_t i = blockDim.x*blockIdx.x + threadIdx.x;

if (i >= ne) {
return;
}

const src_t * x = (const src_t *) cx;
dst_t * dst = (dst_t *) cdst;

dst[i] = ggml_cuda_cast<dst_t>(x[i]);
}

template<typename src_t, typename dst_t>
static void ggml_cpy_flt_contiguous_cuda(
const char * cx, char * cdst, const int64_t ne,
cudaStream_t stream) {

const int64_t num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
cpy_flt_contiguous<src_t, dst_t><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
(cx, cdst, ne);
}

template<typename src_t, typename dst_t, bool transposed = false>
static void ggml_cpy_flt_cuda(
const char * cx, char * cdst, const int ne,
const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {

if (transposed) {
GGML_ASSERT(ne == ne00*ne01*ne02); // assumes ne[3] == 1
int ne00n, ne01n, ne02n;
if (nb00 < nb02) {
ne00n = ne00;
ne01n = ne01;
ne02n = ne02;
} else if (nb00 > nb02) {
ne00n = ne00;
ne01n = ne01*ne02;
ne02n = 1;
} else {
GGML_ASSERT(false);
}

dim3 dimGrid( (ne01n + CUDA_CPY_TILE_DIM_2D - 1) / CUDA_CPY_TILE_DIM_2D,
(ne00n + CUDA_CPY_TILE_DIM_2D - 1) / CUDA_CPY_TILE_DIM_2D,
(ne/(ne01n*ne00n) + CUDA_CPY_BLOCK_NM - 1) / CUDA_CPY_BLOCK_NM);
dim3 dimBlock(CUDA_CPY_TILE_DIM_2D, CUDA_CPY_BLOCK_ROWS, 1);
cpy_flt_transpose<dst_t><<<dimGrid, dimBlock, 0, stream>>>
(cx, cdst, ne, ne00n, ne01n, ne02n, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
} else {
const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
cpy_flt<cpy_1_flt<src_t, dst_t>><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}
const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
cpy_flt<cpy_1_flt<src_t, dst_t>><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
(cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
}

static void ggml_cpy_f32_q8_0_cuda(
@@ -385,10 +285,7 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
char * src0_ddc = (char *) src0->data;
char * src1_ddc = (char *) src1->data;

const bool contiguous_srcs = ggml_is_contiguous(src0) && ggml_is_contiguous(src1);
const bool can_be_transposed = nb01 == (int64_t)ggml_element_size(src0) && src0->ne[3] == 1;

if (src0->type == src1->type && contiguous_srcs) {
if (src0->type == src1->type && ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
GGML_ASSERT(ggml_nbytes(src0) == ggml_nbytes(src1));
#if defined(GGML_USE_MUSA) && defined(GGML_MUSA_MUDNN_COPY)
if (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) {
@@ -399,23 +296,11 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
}
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
if (can_be_transposed) {
ggml_cpy_flt_cuda<float, float, true> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else {
ggml_cpy_flt_cuda<float, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
}
ggml_cpy_flt_cuda<float, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_BF16) {
if (contiguous_srcs) {
ggml_cpy_flt_contiguous_cuda<float, nv_bfloat16> (src0_ddc, src1_ddc, ne, main_stream);
} else {
ggml_cpy_flt_cuda<float, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
}
ggml_cpy_flt_cuda<float, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
if (contiguous_srcs) {
ggml_cpy_flt_contiguous_cuda<float, half> (src0_ddc, src1_ddc, ne, main_stream);
} else {
ggml_cpy_flt_cuda<float, half> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
}
ggml_cpy_flt_cuda<float, half> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_Q8_0 && src1->type == GGML_TYPE_F32) {
@@ -442,53 +327,21 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
} else if (src0->type == GGML_TYPE_Q5_1 && src1->type == GGML_TYPE_F32) {
ggml_cpy_q5_1_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
if (can_be_transposed) {
ggml_cpy_flt_cuda<half, half, true> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else {
ggml_cpy_flt_cuda<half, half> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
}
ggml_cpy_flt_cuda<half, half> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_BF16) {
if (contiguous_srcs) {
ggml_cpy_flt_contiguous_cuda<half, nv_bfloat16> (src0_ddc, src1_ddc, ne, main_stream);
} else {
ggml_cpy_flt_cuda<half, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
}
ggml_cpy_flt_cuda<half, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
if (contiguous_srcs) {
ggml_cpy_flt_contiguous_cuda<half, float> (src0_ddc, src1_ddc, ne, main_stream);
} else {
ggml_cpy_flt_cuda<half, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
}
ggml_cpy_flt_cuda<half, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_BF16) {
if (can_be_transposed) {
ggml_cpy_flt_cuda<nv_bfloat16, nv_bfloat16, true> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else {
ggml_cpy_flt_cuda<nv_bfloat16, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
}
ggml_cpy_flt_cuda<nv_bfloat16, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F16) {
if (contiguous_srcs) {
ggml_cpy_flt_contiguous_cuda<nv_bfloat16, half> (src0_ddc, src1_ddc, ne, main_stream);
} else {
ggml_cpy_flt_cuda<nv_bfloat16, half> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
}
ggml_cpy_flt_cuda<nv_bfloat16, half> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F32) {
if (contiguous_srcs) {
ggml_cpy_flt_contiguous_cuda<nv_bfloat16, float> (src0_ddc, src1_ddc, ne, main_stream);
} else {
ggml_cpy_flt_cuda<nv_bfloat16, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
}
ggml_cpy_flt_cuda<nv_bfloat16, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_I32) {
if (contiguous_srcs) {
ggml_cpy_flt_contiguous_cuda<float, int32_t> (src0_ddc, src1_ddc, ne, main_stream);
} else {
ggml_cpy_flt_cuda<float, int32_t> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
}
ggml_cpy_flt_cuda<float, int32_t> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_F32) {
if (contiguous_srcs) {
ggml_cpy_flt_contiguous_cuda<int32_t, float> (src0_ddc, src1_ddc, ne, main_stream);
} else {
ggml_cpy_flt_cuda<int32_t, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
}
ggml_cpy_flt_cuda<int32_t, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
} else {
GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__,
ggml_type_name(src0->type), ggml_type_name(src1->type));

@@ -14,10 +14,6 @@ void ggml_cuda_flash_attn_ext_tile(ggml_backend_cuda_context & ctx, ggml_tensor
GGML_ASSERT(V->ne[0] == K->ne[0]);
ggml_cuda_flash_attn_ext_tile_case< 64, 64>(ctx, dst);
} break;
case 72: {
GGML_ASSERT(V->ne[0] == K->ne[0]);
ggml_cuda_flash_attn_ext_tile_case< 72, 72>(ctx, dst);
} break;
case 80: {
GGML_ASSERT(V->ne[0] == K->ne[0]);
ggml_cuda_flash_attn_ext_tile_case< 80, 80>(ctx, dst);

@@ -6,7 +6,7 @@
// nbatch_K == number of K columns to load in parallel for KQ calculation

// TODO optimize kernel parameters for FP16 NVIDIA (P100)
// TODO optimize kernel parameters for head sizes 40, 72, 80, 96, 112
// TODO optimize kernel parameters for head sizes 40, 80, 96, 112

// The ROCm compiler cannot handle templating in __launch_bounds__.
// As a workaround, define a macro to package the kernel parameters as uint32_t:
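
Packing the launch parameters into one uint32_t keeps __launch_bounds__ happy on ROCm because the kernel then sees a single non-template integral constant; individual fields come back out with shifts and masks. A generic sketch of the packing scheme (the field names and widths here are made up for illustration, not the ones the real macro uses):

// sketch: pack several small kernel parameters into a single uint32_t
static constexpr uint32_t pack_cfg(uint32_t nthreads, uint32_t occupancy, uint32_t nbatch) {
    return (nthreads & 0x3FF) | ((occupancy & 0xF) << 10) | ((nbatch & 0xFF) << 14);
}
static constexpr uint32_t cfg_nthreads (uint32_t c) { return  c        & 0x3FF; }
static constexpr uint32_t cfg_occupancy(uint32_t c) { return (c >> 10) & 0xF;   }
static constexpr uint32_t cfg_nbatch   (uint32_t c) { return (c >> 14) & 0xFF;  }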

@@ -32,12 +32,6 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_nv
GGML_CUDA_FATTN_TILE_CONFIG_CASE( 64, 64, 16, 256, 2, 64, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE( 64, 64, 32, 256, 2, 64, 64)

GGML_CUDA_FATTN_TILE_CONFIG_CASE( 72, 72, 2, 64, 2, 64, 72)
GGML_CUDA_FATTN_TILE_CONFIG_CASE( 72, 72, 4, 128, 2, 64, 72)
GGML_CUDA_FATTN_TILE_CONFIG_CASE( 72, 72, 8, 256, 2, 64, 72)
GGML_CUDA_FATTN_TILE_CONFIG_CASE( 72, 72, 16, 256, 2, 64, 72)
GGML_CUDA_FATTN_TILE_CONFIG_CASE( 72, 72, 32, 256, 2, 64, 72)

GGML_CUDA_FATTN_TILE_CONFIG_CASE( 80, 80, 2, 64, 2, 64, 40)
GGML_CUDA_FATTN_TILE_CONFIG_CASE( 80, 80, 4, 128, 2, 64, 40)
GGML_CUDA_FATTN_TILE_CONFIG_CASE( 80, 80, 8, 256, 2, 64, 40)
@@ -86,12 +80,6 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_nv
GGML_CUDA_FATTN_TILE_CONFIG_CASE( 64, 64, 16, 128, 3, 64, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE( 64, 64, 32, 256, 2, 64, 64)

GGML_CUDA_FATTN_TILE_CONFIG_CASE( 72, 72, 2, 64, 2, 32, 72)
GGML_CUDA_FATTN_TILE_CONFIG_CASE( 72, 72, 4, 128, 2, 32, 72)
GGML_CUDA_FATTN_TILE_CONFIG_CASE( 72, 72, 8, 256, 2, 32, 72)
GGML_CUDA_FATTN_TILE_CONFIG_CASE( 72, 72, 16, 256, 2, 32, 72)
GGML_CUDA_FATTN_TILE_CONFIG_CASE( 72, 72, 32, 256, 2, 32, 72)

GGML_CUDA_FATTN_TILE_CONFIG_CASE( 80, 80, 2, 64, 2, 32, 40)
GGML_CUDA_FATTN_TILE_CONFIG_CASE( 80, 80, 4, 128, 2, 32, 40)
GGML_CUDA_FATTN_TILE_CONFIG_CASE( 80, 80, 8, 256, 2, 32, 40)
@@ -142,13 +130,6 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_am
GGML_CUDA_FATTN_TILE_CONFIG_CASE( 64, 64, 32, 256, 2, 64, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE( 64, 64, 64, 256, 2, 64, 64)

GGML_CUDA_FATTN_TILE_CONFIG_CASE( 72, 72, 2, 64, 2, 32, 72)
GGML_CUDA_FATTN_TILE_CONFIG_CASE( 72, 72, 4, 128, 2, 32, 72)
GGML_CUDA_FATTN_TILE_CONFIG_CASE( 72, 72, 8, 256, 2, 32, 72)
GGML_CUDA_FATTN_TILE_CONFIG_CASE( 72, 72, 16, 256, 2, 32, 72)
GGML_CUDA_FATTN_TILE_CONFIG_CASE( 72, 72, 32, 256, 2, 32, 72)
GGML_CUDA_FATTN_TILE_CONFIG_CASE( 72, 72, 64, 256, 2, 32, 72)

GGML_CUDA_FATTN_TILE_CONFIG_CASE( 80, 80, 2, 64, 2, 32, 40)
GGML_CUDA_FATTN_TILE_CONFIG_CASE( 80, 80, 4, 128, 2, 32, 40)
GGML_CUDA_FATTN_TILE_CONFIG_CASE( 80, 80, 8, 256, 2, 32, 40)
@@ -204,13 +185,6 @@ static constexpr __host__ __device__ uint32_t ggml_cuda_fattn_tile_get_config_am
GGML_CUDA_FATTN_TILE_CONFIG_CASE( 64, 64, 32, 128, 4, 64, 64)
GGML_CUDA_FATTN_TILE_CONFIG_CASE( 64, 64, 64, 128, 5, 64, 64)

GGML_CUDA_FATTN_TILE_CONFIG_CASE( 72, 72, 2, 64, 2, 32, 72)
GGML_CUDA_FATTN_TILE_CONFIG_CASE( 72, 72, 4, 128, 2, 32, 72)
GGML_CUDA_FATTN_TILE_CONFIG_CASE( 72, 72, 8, 256, 2, 32, 72)
GGML_CUDA_FATTN_TILE_CONFIG_CASE( 72, 72, 16, 256, 2, 32, 72)
GGML_CUDA_FATTN_TILE_CONFIG_CASE( 72, 72, 32, 256, 2, 32, 72)
GGML_CUDA_FATTN_TILE_CONFIG_CASE( 72, 72, 64, 256, 2, 32, 72)

GGML_CUDA_FATTN_TILE_CONFIG_CASE( 80, 80, 2, 64, 2, 32, 40)
GGML_CUDA_FATTN_TILE_CONFIG_CASE( 80, 80, 4, 128, 2, 32, 40)
GGML_CUDA_FATTN_TILE_CONFIG_CASE( 80, 80, 8, 256, 2, 32, 40)
@@ -749,7 +723,7 @@ static __global__ void flash_attn_tile(

if (
#ifdef GGML_USE_WMMA_FATTN
(ncols2 != 1 && DV != 40 && DV != 72 && DV != 512) ||
(ncols2 != 1 && DV != 40 && DV != 512) ||
#endif // GGML_USE_WMMA_FATTN
(use_logit_softcap && !(DV == 128 || DV == 256))
) {
@@ -1224,7 +1198,6 @@ void ggml_cuda_flash_attn_ext_tile(ggml_backend_cuda_context & ctx, ggml_tensor

extern DECL_FATTN_TILE_CASE( 40, 40);
extern DECL_FATTN_TILE_CASE( 64, 64);
extern DECL_FATTN_TILE_CASE( 72, 72);
extern DECL_FATTN_TILE_CASE( 80, 80);
extern DECL_FATTN_TILE_CASE( 96, 96);
extern DECL_FATTN_TILE_CASE(112, 112);

@@ -223,7 +223,6 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
switch (K->ne[0]) {
case 40:
case 64:
case 72:
case 80:
case 96:
case 128:
@@ -276,7 +275,7 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
const bool can_use_vector_kernel = Q->ne[0] <= 256 && Q->ne[0] % 64 == 0 && K->ne[1] % FATTN_KQ_STRIDE == 0;

// If Turing tensor cores available, use them:
if (turing_mma_available(cc) && K->ne[1] % FATTN_KQ_STRIDE == 0 && Q->ne[0] != 40 && Q->ne[0] != 72) {
if (turing_mma_available(cc) && K->ne[1] % FATTN_KQ_STRIDE == 0 && Q->ne[0] != 40) {
if (can_use_vector_kernel) {
if (!ggml_is_quantized(K->type) && !ggml_is_quantized(V->type)) {
if (cc >= GGML_CUDA_CC_ADA_LOVELACE && Q->ne[1] == 1 && Q->ne[3] == 1 && !(gqa_ratio > 4 && K->ne[1] >= 8192)) {
@@ -302,7 +301,7 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
}

// Use the WMMA kernel if possible:
if (ggml_cuda_should_use_wmma_fattn(cc) && K->ne[1] % FATTN_KQ_STRIDE == 0 && Q->ne[0] != 40 && Q->ne[0] != 72 && Q->ne[0] != 576) {
if (ggml_cuda_should_use_wmma_fattn(cc) && K->ne[1] % FATTN_KQ_STRIDE == 0 && Q->ne[0] != 40 && Q->ne[0] != 576) {
if (can_use_vector_kernel && Q->ne[1] <= 2) {
return BEST_FATTN_KERNEL_VEC;
}

@@ -27,7 +27,6 @@
#include "ggml-cuda/mmq.cuh"
#include "ggml-cuda/mmvf.cuh"
#include "ggml-cuda/mmvq.cuh"
#include "ggml-cuda/moe-expert-reduce.cuh"
#include "ggml-cuda/norm.cuh"
#include "ggml-cuda/opt-step-adamw.cuh"
#include "ggml-cuda/opt-step-sgd.cuh"
@@ -51,7 +50,6 @@
#include "ggml-cuda/upscale.cuh"
#include "ggml-cuda/wkv.cuh"
#include "ggml-cuda/gla.cuh"
#include "ggml-cuda/set.cuh"
#include "ggml-cuda/set-rows.cuh"
#include "ggml-cuda/pad_reflect_1d.cuh"
#include "ggml.h"
@@ -1959,15 +1957,8 @@ static void ggml_cuda_mul_mat_batched_cublas_impl(ggml_backend_cuda_context & ct

size_t src1_stride_size = sizeof(cuda_t);

const int threads_x = 16;
const int threads_y = 16;
dim3 block_dims(threads_x, threads_y);

dim3 grid_dims(
(ne13 + threads_x - 1) / threads_x,
(ne12 + threads_y - 1) / threads_y
);
k_compute_batched_ptrs<<<grid_dims, block_dims, 0, main_stream>>>(
dim3 block_dims(ne13, ne12);
k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
src0_ptr, src1_ptr, dst_t,
ptrs_src.get(), ptrs_dst.get(),
ne12, ne13,
@@ -2016,164 +2007,6 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
}
}

static bool ggml_cuda_should_fuse_mul_mat(const ggml_tensor * ffn_up,
const ggml_tensor * ffn_gate,
const ggml_tensor * glu,
const ggml_tensor * ffn_up_bias = nullptr,
const ggml_tensor * ffn_gate_bias = nullptr) {
const bool has_bias = ffn_up_bias != nullptr || ffn_gate_bias != nullptr;

if (has_bias && (!ffn_up_bias || !ffn_gate_bias)) {
return false;
}

const bool is_mul_mat = ffn_up->op == GGML_OP_MUL_MAT && ffn_gate->op == GGML_OP_MUL_MAT && glu->op == GGML_OP_GLU;
const bool is_mul_mat_id = ffn_up->op == GGML_OP_MUL_MAT_ID && ffn_gate->op == GGML_OP_MUL_MAT_ID && glu->op == GGML_OP_GLU;

GGML_ASSERT(ffn_up && ffn_gate && glu);

if (!is_mul_mat && !is_mul_mat_id) {
return false;
}

const ggml_op expected_bias_op = is_mul_mat ? GGML_OP_ADD : GGML_OP_ADD_ID;

if (has_bias) {
if (ffn_up_bias->op != expected_bias_op || ffn_gate_bias->op != expected_bias_op) {
return false;
}

if (glu->src[0] != ffn_gate_bias || glu->src[1] != ffn_up_bias) {
return false;
}

if (expected_bias_op == GGML_OP_ADD) {
const bool up_has_mul = ffn_up_bias->src[0] == ffn_up || ffn_up_bias->src[1] == ffn_up;
const bool gate_has_mul = ffn_gate_bias->src[0] == ffn_gate || ffn_gate_bias->src[1] == ffn_gate;
if (!up_has_mul || !gate_has_mul) {
return false;
}
} else { // GGML_OP_ADD_ID
if (ffn_up_bias->src[0] != ffn_up || ffn_gate_bias->src[0] != ffn_gate) {
return false;
}
if (ffn_up_bias->src[2] != ffn_up->src[2] || ffn_gate_bias->src[2] != ffn_gate->src[2]) {
return false;
}
}
} else {
if (glu->src[0] != ffn_gate && glu->src[1] != ffn_up) {
return false;
}
}

if (ffn_up->src[0]->type != ffn_gate->src[0]->type || !ggml_are_same_shape(ffn_up->src[0], ffn_gate->src[0]) ||
!ggml_are_same_stride(ffn_up->src[0], ffn_gate->src[0])) {
return false;
}

if (ffn_up->src[1] != ffn_gate->src[1]) {
return false;
}

if (ffn_up->src[2] && (ffn_up->src[2] != ffn_gate->src[2])) {
return false;
}

static constexpr std::array<ggml_glu_op, 3> valid_glu_ops = { GGML_GLU_OP_SWIGLU, GGML_GLU_OP_GEGLU, GGML_GLU_OP_SWIGLU_OAI };

if (std::find(valid_glu_ops.begin(), valid_glu_ops.end(), ggml_get_glu_op(glu)) == valid_glu_ops.end()) {
return false;
}

if (const bool swapped = ggml_get_op_params_i32(glu, 1); swapped) {
return false;
}

const bool split = ggml_backend_buft_is_cuda_split(ffn_up->src[0]->buffer->buft) ||
ggml_backend_buft_is_cuda_split(ffn_gate->src[0]->buffer->buft);

//TODO: add support for fusion for split buffers
if (split) {
return false;
}

return true;
}

static bool ggml_cuda_should_fuse_mul_mat_vec_f(const ggml_tensor * tensor) {
ggml_tensor * src0 = tensor->src[0];
ggml_tensor * src1 = tensor->src[1];
const ggml_tensor * dst = tensor;

const bool is_mul_mat_id = tensor->op == GGML_OP_MUL_MAT_ID;

bool use_mul_mat_vec_f =
(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16) &&
src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;

const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, is_mul_mat_id ? src1->ne[2] : src1->ne[1]);

const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft) ||
ggml_backend_buft_is_cuda_split(src1->buffer->buft);

//TODO: add support for fusion for split buffers
if (split) {
return false;
}

//we only support fusion for ncols_dst = 1
if (tensor->op == GGML_OP_MUL_MAT && dst->ne[1] != 1) {
return false;
}

if (tensor->op == GGML_OP_MUL_MAT_ID && dst->ne[2] != 1) {
return false;
}

return use_mul_mat_vec_f;
}

static bool ggml_cuda_should_fuse_mul_mat_vec_q(const ggml_tensor * tensor) {
ggml_tensor * src0 = tensor->src[0];
ggml_tensor * src1 = tensor->src[1];
const ggml_tensor * dst = tensor;

const bool bad_padding_clear = ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE &&
ggml_nbytes(src0) != ggml_backend_buffer_get_alloc_size(src0->buffer, src0) &&
src0->view_src;

bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) && !bad_padding_clear && src1->type == GGML_TYPE_F32 &&
dst->type == GGML_TYPE_F32 && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;

// fusion is not universally faster on Pascal
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
if (cc <= GGML_CUDA_CC_PASCAL) {
return false;
}
//we only support fusion for ncols_dst = 1
if (tensor->op == GGML_OP_MUL_MAT && dst->ne[1] != 1) {
return false;
}

if (tensor->op == GGML_OP_MUL_MAT_ID && dst->ne[2] != 1) {
return false;
}

const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft) ||
ggml_backend_buft_is_cuda_split(src1->buffer->buft);

//TODO: add support for fusion for split buffers
if (split) {
return false;
}

return use_mul_mat_vec_q;
}

static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft);

@@ -2435,9 +2268,6 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_SET_ROWS:
ggml_cuda_op_set_rows(ctx, dst);
break;
case GGML_OP_SET:
ggml_cuda_op_set(ctx, dst);
break;
case GGML_OP_DUP:
ggml_cuda_dup(ctx, dst);
break;
@@ -2516,18 +2346,6 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_UNARY_OP_XIELU:
ggml_cuda_op_xielu(ctx, dst);
break;
case GGML_UNARY_OP_FLOOR:
ggml_cuda_op_floor(ctx, dst);
break;
case GGML_UNARY_OP_CEIL:
ggml_cuda_op_ceil(ctx, dst);
break;
case GGML_UNARY_OP_ROUND:
ggml_cuda_op_round(ctx, dst);
break;
case GGML_UNARY_OP_TRUNC:
ggml_cuda_op_trunc(ctx, dst);
break;
default:
return false;
}
@@ -2927,7 +2745,7 @@ static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_gra
}
}

if ((node->op == GGML_OP_SCALE || node->op == GGML_OP_GLU) &&
if (node->op == GGML_OP_SCALE &&
memcmp(graph_node_properties->op_params, node->op_params, GGML_MAX_OP_PARAMS) != 0) {
return false;
}
@@ -3008,9 +2826,9 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
ggml_cuda_topk_moe_ops(/*with_norm=*/false, /*delayed_softmax=*/true);

if (ops.size() == topk_moe_ops_with_norm.size() &&
ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 3, node_idx + 9 })) {
ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 3, node_idx + 8 })) {
ggml_tensor * softmax = cgraph->nodes[node_idx];
ggml_tensor * weights = cgraph->nodes[node_idx + 9];
ggml_tensor * weights = cgraph->nodes[node_idx+8];

if (ggml_cuda_should_use_topk_moe(softmax, weights)) {
return true;
@@ -3020,14 +2838,14 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
if (ops.size() == topk_moe_ops.size() &&
ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 3, node_idx + 4 })) {
ggml_tensor * softmax = cgraph->nodes[node_idx];
ggml_tensor * weights = cgraph->nodes[node_idx + 4];
ggml_tensor * weights = cgraph->nodes[node_idx+4];
if (ggml_cuda_should_use_topk_moe(softmax, weights)) {
return true;
}
}

if (ops.size() == topk_moe_ops_delayed_softmax.size() &&
ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 1, node_idx + 5 })) {
ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 2, node_idx + 5 })) {
ggml_tensor * softmax = cgraph->nodes[node_idx + 4];
ggml_tensor * weights = cgraph->nodes[node_idx + 5];

@@ -3036,38 +2854,6 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
}
}

std::initializer_list<enum ggml_op> mul_mat_bias_glu_ops = { GGML_OP_MUL_MAT, GGML_OP_ADD, GGML_OP_MUL_MAT, GGML_OP_ADD, GGML_OP_GLU };
std::initializer_list<enum ggml_op> mul_mat_id_bias_glu_ops = { GGML_OP_MUL_MAT_ID, GGML_OP_ADD_ID, GGML_OP_MUL_MAT_ID, GGML_OP_ADD_ID, GGML_OP_GLU };

std::initializer_list<enum ggml_op> mul_mat_id_glu_ops = { GGML_OP_MUL_MAT_ID, GGML_OP_MUL_MAT_ID, GGML_OP_GLU };
std::initializer_list<enum ggml_op> mul_mat_glu_ops = { GGML_OP_MUL_MAT, GGML_OP_MUL_MAT, GGML_OP_GLU };

if (ops.size() == 5 && (ggml_can_fuse_subgraph(cgraph, node_idx, ops, {node_idx + 4}) ||
ggml_can_fuse_subgraph(cgraph, node_idx, ops, {node_idx + 4}))) {

const ggml_tensor * ffn_gate = cgraph->nodes[node_idx];
const ggml_tensor * ffn_gate_bias = cgraph->nodes[node_idx + 1];
const ggml_tensor * ffn_up = cgraph->nodes[node_idx + 2];
const ggml_tensor * ffn_up_bias = cgraph->nodes[node_idx + 3];
const ggml_tensor * glu = cgraph->nodes[node_idx + 4];

if (ggml_cuda_should_fuse_mul_mat(ffn_up, ffn_gate, glu, ffn_up_bias, ffn_gate_bias)) {
return true;
}
}

if (ops.size() == 3 && (ggml_can_fuse_subgraph(cgraph, node_idx, ops, {node_idx + 2}) ||
ggml_can_fuse_subgraph(cgraph, node_idx, ops, {node_idx + 2}))) {

const ggml_tensor * ffn_gate = cgraph->nodes[node_idx];
const ggml_tensor * ffn_up = cgraph->nodes[node_idx + 1];
const ggml_tensor * glu = cgraph->nodes[node_idx + 2];

if (ggml_cuda_should_fuse_mul_mat(ffn_up, ffn_gate, glu)) {
return true;
}
}

if (!ggml_can_fuse(cgraph, node_idx, ops)) {
return false;
}
@@ -3148,20 +2934,9 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
// With the use of CUDA graphs, the execution will be performed by the graph launch.
if (!use_cuda_graph || cuda_graph_update_required) {

[[maybe_unused]] int prev_i = 0;

for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_tensor * node = cgraph->nodes[i];

#ifdef GGML_CUDA_DEBUG
const int nodes_fused = i - prev_i - 1;
prev_i = i;
if (nodes_fused > 0) {
GGML_LOG_INFO("nodes_fused: %d\n", nodes_fused);
}
#endif

if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
continue;
}
@@ -3170,18 +2945,17 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
if (!disable_fusion) {

if (ggml_cuda_can_fuse(cgraph, i, ggml_cuda_topk_moe_ops(/*with norm*/ true), {})) {
ggml_tensor * weights = cgraph->nodes[i + 9];
ggml_tensor * selected_experts = cgraph->nodes[i + 3];
ggml_tensor * clamp = cgraph->nodes[i + 7];
ggml_tensor * weights = cgraph->nodes[i+8];
ggml_tensor * selected_experts = cgraph->nodes[i+3];
ggml_cuda_op_topk_moe(*cuda_ctx, node->src[0], weights, selected_experts, /*with norm*/ true,
/*delayed softmax*/ false, clamp);
i += 9;
/*delayed softmax*/ false);
i += 8;
continue;
}

if (ggml_cuda_can_fuse(cgraph, i, ggml_cuda_topk_moe_ops(/*with norm*/ false), {})) {
ggml_tensor * weights = cgraph->nodes[i + 4];
ggml_tensor * selected_experts = cgraph->nodes[i + 3];
ggml_tensor * weights = cgraph->nodes[i+4];
ggml_tensor * selected_experts = cgraph->nodes[i+3];
ggml_cuda_op_topk_moe(*cuda_ctx, node->src[0], weights, selected_experts, /*with norm*/ false,
/*delayed softmax*/ false);
i += 4;
@@ -3199,31 +2973,6 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
continue;
}

if (node->op == GGML_OP_MUL) {
int current_node = i + 1;
int num_views = 0;
int num_adds = 0;
while (current_node < cgraph->n_nodes && cgraph->nodes[current_node]->op == GGML_OP_VIEW) {
num_views++;
current_node++;
}

while (current_node < cgraph->n_nodes && cgraph->nodes[current_node]->op == GGML_OP_ADD &&
num_adds < num_views - 1) {
num_adds++;
current_node++;
}

if (num_adds == num_views - 1 && num_views > 0) {
ggml_tensor * dst_node = cgraph->nodes[current_node - 1];
if (ggml_cuda_should_use_moe_expert_reduce(cgraph, i, current_node)) {
ggml_cuda_op_moe_expert_reduce(*cuda_ctx, node->src[0], node->src[1], dst_node);
i += num_views + num_adds;
continue;
}
}
}

if (node->op == GGML_OP_ADD) {
int n_fuse = 0;
ggml_op ops[8];
@@ -3255,184 +3004,6 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
}
}

bool fused_mul_mat_vec = false;
int fused_node_count = 0;

for (ggml_op op : { GGML_OP_MUL_MAT, GGML_OP_MUL_MAT_ID }) {
const ggml_op bias_op = op == GGML_OP_MUL_MAT ? GGML_OP_ADD : GGML_OP_ADD_ID;

if (ggml_cuda_can_fuse(cgraph, i, { op, bias_op, op, bias_op, GGML_OP_GLU }, {})) {
ggml_tensor * glu = cgraph->nodes[i + 4];
ggml_tensor * gate_bias_n = glu->src[0];
ggml_tensor * up_bias_n = glu->src[1];

// we don't assume the order of {gate, up}; instead, infer it from the bias tensors
ggml_tensor * gate_n = nullptr;
ggml_tensor * up_n = nullptr;

if (gate_bias_n->src[0] == cgraph->nodes[i] || gate_bias_n->src[1] == cgraph->nodes[i]) {
gate_n = cgraph->nodes[i];
up_n = cgraph->nodes[i + 2];
} else if (gate_bias_n->src[0] == cgraph->nodes[i + 2] || gate_bias_n->src[1] == cgraph->nodes[i + 2]) {
gate_n = cgraph->nodes[i + 2];
up_n = cgraph->nodes[i];
} else {
continue;
}

auto get_bias_tensor = [](const ggml_tensor * bias_node, const ggml_tensor * mul_node, ggml_op op_bias) {
if (op_bias == GGML_OP_ADD) {
if (bias_node->src[0] == mul_node) {
return bias_node->src[1];
}
if (bias_node->src[1] == mul_node) {
return bias_node->src[0];
}
return (ggml_tensor *) nullptr;
}
GGML_ASSERT(op_bias == GGML_OP_ADD_ID);
GGML_ASSERT(bias_node->src[0] == mul_node);
return bias_node->src[1];
};

ggml_tensor * up_bias_tensor = get_bias_tensor(up_bias_n, up_n, bias_op);
ggml_tensor * gate_bias_tensor = get_bias_tensor(gate_bias_n, gate_n, bias_op);

if (!up_bias_tensor || !gate_bias_tensor) {
continue;
}

const ggml_tensor * src0 = up_n->src[0];
const ggml_tensor * src1 = up_n->src[1];
const ggml_tensor * ids = up_n->src[2];

if (ggml_cuda_should_fuse_mul_mat_vec_f(up_n)) {
ggml_cuda_mm_fusion_args_host fusion_data{};
fusion_data.gate = gate_n->src[0];
fusion_data.x_bias = up_bias_tensor;
fusion_data.gate_bias = gate_bias_tensor;
fusion_data.glu_op = ggml_get_glu_op(glu);

ggml_cuda_mul_mat_vec_f(*cuda_ctx, src0, src1, ids, glu, &fusion_data);
fused_mul_mat_vec = true;
fused_node_count = 5;
break;
}

if (ggml_cuda_should_fuse_mul_mat_vec_q(up_n)) {
ggml_cuda_mm_fusion_args_host fusion_data{};
fusion_data.gate = gate_n->src[0];
fusion_data.x_bias = up_bias_tensor;
fusion_data.gate_bias = gate_bias_tensor;
fusion_data.glu_op = ggml_get_glu_op(glu);

ggml_cuda_mul_mat_vec_q(*cuda_ctx, src0, src1, ids, glu, &fusion_data);
fused_mul_mat_vec = true;
fused_node_count = 5;
break;
}
} else if (ggml_cuda_can_fuse(cgraph, i, { op, op, GGML_OP_GLU }, {})) {
ggml_tensor * glu = cgraph->nodes[i + 2];
ggml_tensor * gate = glu->src[0];
ggml_tensor * up = glu->src[1];

bool ok = (gate == cgraph->nodes[i] && up == cgraph->nodes[i + 1])
|| (gate == cgraph->nodes[i + 1] && up == cgraph->nodes[i]);

if (!ok) continue;

const ggml_tensor * src0 = up->src[0];
const ggml_tensor * src1 = up->src[1];
const ggml_tensor * ids = up->src[2];

if (ggml_cuda_should_fuse_mul_mat_vec_f(up)) {
ggml_cuda_mm_fusion_args_host fusion_data{};
fusion_data.gate = gate->src[0];
fusion_data.glu_op = ggml_get_glu_op(glu);

ggml_cuda_mul_mat_vec_f(*cuda_ctx, src0, src1, ids, glu, &fusion_data);
fused_mul_mat_vec = true;
fused_node_count = 3;
break;
}

if (ggml_cuda_should_fuse_mul_mat_vec_q(up)) {
ggml_cuda_mm_fusion_args_host fusion_data{};
fusion_data.gate = gate->src[0];
fusion_data.glu_op = ggml_get_glu_op(glu);

ggml_cuda_mul_mat_vec_q(*cuda_ctx, src0, src1, ids, glu, &fusion_data);
fused_mul_mat_vec = true;
fused_node_count = 3;
break;
}
}
}

if (fused_mul_mat_vec) {
i += fused_node_count - 1;
continue;
}

fused_mul_mat_vec = false;
fused_node_count = 0;

for (ggml_op op : { GGML_OP_MUL_MAT, GGML_OP_MUL_MAT_ID }) {
const ggml_op bias_op = op == GGML_OP_MUL_MAT ? GGML_OP_ADD : GGML_OP_ADD_ID;

if (!ggml_can_fuse(cgraph, i, { op, bias_op })) {
continue;
}

ggml_tensor * mm_node = cgraph->nodes[i];
ggml_tensor * bias_node = cgraph->nodes[i + 1];

ggml_tensor * bias_tensor = nullptr;
if (bias_op == GGML_OP_ADD) {
if (bias_node->src[0] == mm_node) {
bias_tensor = bias_node->src[1];
} else if (bias_node->src[1] == mm_node) {
bias_tensor = bias_node->src[0];
} else {
continue;
}
} else {
if (bias_node->src[0] != mm_node) {
continue;
}
bias_tensor = bias_node->src[1];
}

const ggml_tensor * src0 = mm_node->src[0];
const ggml_tensor * src1 = mm_node->src[1];
const ggml_tensor * ids = mm_node->src[2];

if (bias_op == GGML_OP_ADD_ID && bias_node->src[2] != ids) {
continue;
}

ggml_cuda_mm_fusion_args_host fusion_data{};
fusion_data.x_bias = bias_tensor;

if (ggml_cuda_should_fuse_mul_mat_vec_f(mm_node)) {
ggml_cuda_mul_mat_vec_f(*cuda_ctx, src0, src1, ids, bias_node, &fusion_data);
fused_mul_mat_vec = true;
fused_node_count = 2;
break;
}

if (ggml_cuda_should_fuse_mul_mat_vec_q(mm_node)) {
ggml_cuda_mul_mat_vec_q(*cuda_ctx, src0, src1, ids, bias_node, &fusion_data);
fused_mul_mat_vec = true;
fused_node_count = 2;
break;
}
}

if (fused_mul_mat_vec) {
i += fused_node_count - 1;
continue;
}

if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL, GGML_OP_ADD}, {})) {
ggml_cuda_op_rms_norm_fused_add(*cuda_ctx, node, cgraph->nodes[i+1], cgraph->nodes[i+2]);
@@ -3798,10 +3369,6 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
case GGML_UNARY_OP_TANH:
|
||||
case GGML_UNARY_OP_EXP:
|
||||
case GGML_UNARY_OP_ELU:
|
||||
case GGML_UNARY_OP_FLOOR:
|
||||
case GGML_UNARY_OP_CEIL:
|
||||
case GGML_UNARY_OP_ROUND:
|
||||
case GGML_UNARY_OP_TRUNC:
|
||||
return ggml_is_contiguous(op->src[0]);
|
||||
default:
|
||||
return false;
|
||||
@@ -3916,13 +3483,6 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
op->src[0]->type == GGML_TYPE_F32 &&
|
||||
(op->src[1]->type == GGML_TYPE_I64 || op->src[1]->type == GGML_TYPE_I32);
|
||||
} break;
|
||||
case GGML_OP_SET:
|
||||
{
|
||||
const ggml_type t = op->type;
|
||||
return (t == GGML_TYPE_F32 || t == GGML_TYPE_I32) &&
|
||||
t == op->src[0]->type &&
|
||||
t == op->src[1]->type;
|
||||
} break;
|
||||
case GGML_OP_CPY:
|
||||
{
|
||||
ggml_type src0_type = op->src[0]->type;
|
||||
@@ -4082,11 +3642,8 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
case GGML_OP_SUM:
|
||||
return ggml_is_contiguous_rows(op->src[0]);
|
||||
case GGML_OP_ARGSORT:
|
||||
#ifndef GGML_CUDA_USE_CUB
|
||||
// TODO: Support arbitrary column width
|
||||
return op->src[0]->ne[0] <= 1024;
|
||||
#else
|
||||
return true;
|
||||
#endif
|
||||
case GGML_OP_SUM_ROWS:
|
||||
case GGML_OP_MEAN:
|
||||
case GGML_OP_GROUP_NORM:
|
||||
|
||||
@@ -18,10 +18,6 @@
|
||||
|
||||
#include "common.cuh"
|
||||
|
||||
// On Volta each warp is doing 4 8x8 mma operations in parallel.
|
||||
// The basic memory layout for a 32x8 output tile is to stack 4 input tiles in I direction and to mirror the B tile.
|
||||
// However, the i indices in this file are by default permuted to simplify the index calculations.
|
||||
// #define GGML_CUDA_MMA_NO_VOLTA_PERM

#if CUDART_VERSION >= 11080

@@ -77,15 +73,6 @@ namespace ggml_cuda_mma {
        static constexpr int ne = I * J / 64;
        T x[ne] = {0};

        static constexpr __device__ bool supported() {
            if (I == 64 && J == 2) return true;
            if (I == 16 && J == 8) return true;
            if (I == 32 && J == 4) return true;
            if (I == 16 && J == 16) return true;
            if (I == 32 && J == 32) return true;
            return false;
        }

        static __device__ __forceinline__ int get_i(const int l) {
            if constexpr (I == 64 && J == 2) { // Special tile size to load <16, 4> as <16, 8>
                return threadIdx.x % 16;
@@ -98,8 +85,7 @@ namespace ggml_cuda_mma {
            } else if constexpr (I == 32 && J == 32) {
                return 4 * (threadIdx.x / 32) + 8 * (l / 4) + (l % 4);
            } else {
                NO_DEVICE_CODE;
                return -1;
                static_assert(I == -1 && J == -1, "template specialization not implemented");
            }
        }

@@ -115,67 +101,22 @@ namespace ggml_cuda_mma {
            } else if constexpr (I == 32 && J == 32) {
                return threadIdx.x % 32;
            } else {
                NO_DEVICE_CODE;
                return -1;
            }
        }
#elif __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
        static constexpr int ne = I * J / 32;
        T x[ne] = {0};

        static constexpr __device__ bool supported() {
            if (I == 32 && J == 8) return true;
            return false;
        }

        static __device__ __forceinline__ int get_i(const int l) {
            if constexpr (I == 32 && J == 8) {
#ifdef GGML_CUDA_MMA_NO_VOLTA_PERM
                return (((threadIdx.x % 16) / 4) * 8) | ((threadIdx.x / 16) * 4) | (l & 2) | (threadIdx.x % 2);
#else
                return (l & 2) | (threadIdx.x & ~2);
#endif // GGML_CUDA_MMA_NO_VOLTA_PERM
            } else {
                NO_DEVICE_CODE;
                return -1;
            }
        }

        static __device__ __forceinline__ int get_j(const int l) {
            if constexpr (I == 32 && J == 8) {
                return (threadIdx.x & 2) | (l & (4 + 1));
            } else {
                NO_DEVICE_CODE;
                return -1;
                static_assert(I == -1 && J == -1, "template specialization not implemented");
            }
        }
#else
        static constexpr int ne = I * J / 32;
        T x[ne] = {0};

        static constexpr __device__ bool supported() {
            if (I == 8 && J == 4) return true;
            if (I == 8 && J == 8) return true;
            if (I == 16 && J == 8) return true;
            if (I == 16 && J == 16) return true;
            if (I == 32 && J == 8) return true;
            return false;
        }

        static __device__ __forceinline__ int get_i(const int l) {
            if constexpr (I == 8 && J == 4) {
                return threadIdx.x / 4;
            } else if constexpr (I == 8 && J == 8) {
            if constexpr (I == 8 && (J == 4 || J == 8)) {
                return threadIdx.x / 4;
            } else if constexpr (I == 16 && J == 8) {
                return ((l / 2) * 8) | (threadIdx.x / 4);
                return (l / 2) * 8 + threadIdx.x / 4;
            } else if constexpr (I == 16 && J == 16) {
                return (((l / 2) % 2) * 8) | (threadIdx.x / 4);
            } else if constexpr (I == 32 && J == 8) {
                return tile<16, 8, T>::get_i(l); // Memory layout simply repeated with same pattern in i direction.
                return ((l / 2) % 2) * 8 + threadIdx.x / 4;
            } else {
                NO_DEVICE_CODE;
                return -1;
                static_assert(I == -1 && J == -1, "template specialization not implemented");
            }
        }

@@ -183,16 +124,13 @@ namespace ggml_cuda_mma {
            if constexpr (I == 8 && J == 4) {
                return threadIdx.x % 4;
            } else if constexpr (I == 8 && J == 8) {
                return (l * 4) | (threadIdx.x % 4);
                return 4 * l + threadIdx.x % 4;
            } else if constexpr (I == 16 && J == 8) {
                return ((threadIdx.x % 4) * 2) | (l % 2);
                return 2 * (threadIdx.x % 4) + l % 2;
            } else if constexpr (I == 16 && J == 16) {
                return ((l / 4) * 8) | ((threadIdx.x % 4) * 2) | (l % 2);
            } else if constexpr (I == 32 && J == 8) {
                return tile<16, 8, T>::get_j(l); // Memory layout simply repeated with same pattern in i direction.
                return 8 * (l / 4) + 2 * (threadIdx.x % 4) + l % 2;
            } else {
                NO_DEVICE_CODE;
                return -1;
                static_assert(I == -1 && J == -1, "template specialization not implemented");
            }
        }
#endif // defined(GGML_USE_HIP)
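
// Illustration (standalone sketch, not from this patch): this hunk repeatedly
// rewrites index compositions like `((l / 2) * 8) | (threadIdx.x / 4)` into
// `(l / 2) * 8 + threadIdx.x / 4`. The two forms are interchangeable because the
// OR'd operands occupy disjoint bit ranges: the left term is a multiple of 8 and
// the right term is in [0, 8) for a 32-thread warp. Exhaustive check:
#include <cassert>
int main() {
    for (int l = 0; l < 8; ++l) {
        for (int t = 0; t < 32; ++t) {
            assert((((l / 2) * 8) | (t / 4)) == ((l / 2) * 8 + t / 4));
            assert((( l      * 4) | (t % 4)) == ( l      * 4 + t % 4));
        }
    }
    return 0;
}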
@@ -202,83 +140,32 @@ namespace ggml_cuda_mma {
    struct tile<I_, J_, half2> {
        static constexpr int I = I_;
        static constexpr int J = J_;

#if __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
        static constexpr int ne = I == 8 && J == 8 ? I * J / (WARP_SIZE/4) : I * J / WARP_SIZE;
        half2 x[ne] = {{0.0f, 0.0f}};

        static constexpr __device__ bool supported() {
            if (I == 8 && J == 8) return true;
            if (I == 32 && J == 8) return true;
            return false;
        }

        static __device__ __forceinline__ int get_i(const int l) {
            if constexpr (I == 8 && J == 8) {
                return ((threadIdx.x / 16) * 4) | (threadIdx.x % 4);
            } else if constexpr (I == 32 && J == 8) {
#ifdef GGML_CUDA_MMA_NO_VOLTA_PERM
                return (((threadIdx.x % 16) / 4) * 8) | ((threadIdx.x / 16) * 4) | (threadIdx.x % 4);
#else
                return threadIdx.x;
#endif // GGML_CUDA_MMA_NO_VOLTA_PERM
            } else {
                NO_DEVICE_CODE;
                return -1;
            }
        }

        static __device__ __forceinline__ int get_j(const int l) {
            if constexpr ((I == 8 || I == 32) && J == 8) {
                return l;
            } else {
                NO_DEVICE_CODE;
                return -1;
            }
        }
#else
        static constexpr int ne = I * J / WARP_SIZE;
        half2 x[ne] = {{0.0f, 0.0f}};

        static constexpr __device__ bool supported() {
            if (I == 8 && J == 4) return true;
            if (I == 8 && J == 8) return true;
            if (I == 16 && J == 8) return true;
            if (I == 16 && J == 16) return true;
            if (I == 32 && J == 8) return true;
            return false;
        }

        static __device__ __forceinline__ int get_i(const int l) {
            if constexpr (I == 8 && J == 8) {
                return threadIdx.x / 4;
            } else if constexpr (I == 16 && J == 4) {
                return (l * 8) | (threadIdx.x / 4);
                return l * 8 + threadIdx.x / 4;
            } else if constexpr (I == 16 && J == 8) {
                return ((l % 2) * 8) | (threadIdx.x / 4);
            } else if constexpr (I == 32 && J == 8) {
                return ((l / 4) * 16) | ((l % 2) * 8) | (threadIdx.x / 4);
                return (l % 2) * 8 + threadIdx.x / 4;
            } else {
                NO_DEVICE_CODE;
                return -1;
                static_assert(I == -1 && J == -1, "template specialization not implemented");
            }
        }

        static __device__ __forceinline__ int get_j(const int l) {
            if constexpr (I == 8 && J == 8) {
                return (l * 4) | (threadIdx.x % 4);
                return l * 4 + threadIdx.x % 4;
            } else if constexpr (I == 16 && J == 4) {
                return threadIdx.x % 4;
            } else if constexpr (I == 16 && J == 8) {
                return ((l / 2) * 4) | (threadIdx.x % 4);
            } else if constexpr (I == 32 && J == 8) {
                return ((l & 2) * 2) | (threadIdx.x % 4);
                return (l / 2) * 4 + threadIdx.x % 4;
            } else {
                NO_DEVICE_CODE;
                return -1;
                static_assert(I == -1 && J == -1, "template specialization not implemented");
            }
        }
#endif // __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
    };

    template <int I_, int J_>
@@ -288,36 +175,27 @@ namespace ggml_cuda_mma {
        static constexpr int ne = I * J / WARP_SIZE;
        nv_bfloat162 x[ne] = {{0.0f, 0.0f}};

        static constexpr __device__ bool supported() {
            if (I == 8 && J == 8) return true;
            if (I == 16 && J == 4) return true;
            if (I == 16 && J == 8) return true;
            return false;
        }

        static __device__ __forceinline__ int get_i(const int l) {
            if constexpr (I == 8 && J == 8) {
                return threadIdx.x / 4;
            } else if constexpr (I == 16 && J == 4) {
                return (l * 8) | (threadIdx.x / 4);
                return l * 8 + threadIdx.x / 4;
            } else if constexpr (I == 16 && J == 8) {
                return ((l % 2) * 8) | (threadIdx.x / 4);
                return (l % 2) * 8 + threadIdx.x / 4;
            } else {
                NO_DEVICE_CODE;
                return -1;
                static_assert(I == -1 && J == -1, "template specialization not implemented");
            }
        }

        static __device__ __forceinline__ int get_j(const int l) {
            if constexpr (I == 8 && J == 8) {
                return (l * 4) | (threadIdx.x % 4);
                return l * 4 + threadIdx.x % 4;
            } else if constexpr (I == 16 && J == 4) {
                return threadIdx.x % 4;
            } else if constexpr (I == 16 && J == 8) {
                return ((l / 2) * 4) | (threadIdx.x % 4);
                return (l / 2) * 4 + threadIdx.x % 4;
            } else {
                NO_DEVICE_CODE;
                return -1;
                static_assert(I == -1 && J == -1, "template specialization not implemented");
            }
        }
    };
@@ -385,12 +263,8 @@ namespace ggml_cuda_mma {
            : "=r"(xi[0]), "=r"(xi[1])
            : "l"(xs));
#else
#if __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
        GGML_UNUSED_VARS(t, xs0, stride);
        NO_DEVICE_CODE;
#else
        load_generic(t, xs0, stride);
#endif // __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
        load_generic(xs0, stride);
        GGML_UNUSED(t);
#endif // TURING_MMA_AVAILABLE
    }

@@ -403,35 +277,11 @@ namespace ggml_cuda_mma {
        asm volatile("ldmatrix.sync.aligned.m8n8.x4.b16 {%0, %1, %2, %3}, [%4];"
            : "=r"(xi[0]), "=r"(xi[1]), "=r"(xi[2]), "=r"(xi[3])
            : "l"(xs));
#else
#if __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
        GGML_UNUSED_VARS(t, xs0, stride);
        NO_DEVICE_CODE;
#else
        load_generic(t, xs0, stride);
#endif // __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
#endif // TURING_MMA_AVAILABLE
    }

    template <typename T>
    static __device__ __forceinline__ void load_ldmatrix(
            tile<32, 8, T> & t, const T * __restrict__ xs0, const int stride) {
#if __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
#if 1
        // TODO: more generic handling
        static_assert(sizeof(T) == 4, "bad type size");
        ggml_cuda_memcpy_1<4*sizeof(T)>(t.x + 0, xs0 + t.get_i(0)*stride + 0);
        ggml_cuda_memcpy_1<4*sizeof(T)>(t.x + 4, xs0 + t.get_i(4)*stride + 4);
#else
        load_generic(t, xs0, stride);
#endif // 1
#else
        tile<16, 8, T> * t16 = (tile<16, 8, T> *) &t;
        load_ldmatrix(t16[0], xs0 +  0*stride, stride);
        load_ldmatrix(t16[1], xs0 + 16*stride, stride);
#endif // __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
    }
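
// Illustration (self-contained sketch with a stand-in struct, not the real tile):
// the reinterpret-cast split above is valid because tile<32, 8, T> stores exactly
// twice the register elements of tile<16, 8, T>, laid out contiguously, so its
// storage aliases two stacked 16-row tiles loaded 16 rows apart.
template <int I, int J, typename T> struct tile_sketch {
    static constexpr int ne = I * J / 32; // one 32-thread warp
    T x[ne];
};
static_assert(sizeof(tile_sketch<32, 8, float>) == 2 * sizeof(tile_sketch<16, 8, float>),
              "a 32x8 tile aliases two stacked 16x8 tiles");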

    template <typename T>
    static __device__ __forceinline__ void load_ldmatrix_trans(
            tile<16, 8, T> & t, const T * __restrict__ xs0, const int stride) {
@@ -696,43 +546,4 @@ namespace ggml_cuda_mma {
        NO_DEVICE_CODE;
#endif // AMD_MFMA_AVAILABLE
    }

    template <typename T1, typename T2, int J, int K>
    static __device__ __forceinline__ void mma(
            tile<32, J, T1> & D, const tile<32, K, T2> & A, const tile<J, K, T2> & B) {
        tile<16, J, T1> * D16 = (tile<16, J, T1> *) &D;
        tile<16, K, T2> * A16 = (tile<16, K, T2> *) &A;
        mma(D16[0], A16[0], B);
        mma(D16[1], A16[1], B);
    }

    static __device__ __forceinline__ void mma(
            tile<32, 8, float> & D, const tile<32, 8, half2> & A, const tile<8, 8, half2> & B) {
#if __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
        const int * Axi = (const int *) A.x;
        const int * Bxi = (const int *) B.x;
        int * Dxi = (int *) D.x;
        asm("mma.sync.aligned.m8n8k4.row.col.f32.f16.f16.f32 "
            "{%0, %1, %2, %3, %4, %5, %6, %7}, {%8, %9}, {%10, %11}, {%0, %1, %2, %3, %4, %5, %6, %7};"
            : "+r"(Dxi[0]), "+r"(Dxi[1]), "+r"(Dxi[2]), "+r"(Dxi[3]), "+r"(Dxi[4]), "+r"(Dxi[5]), "+r"(Dxi[6]), "+r"(Dxi[7])
            : "r"(Axi[0]), "r"(Axi[1]), "r"(Bxi[0]), "r"(Bxi[1]));
        asm("mma.sync.aligned.m8n8k4.row.col.f32.f16.f16.f32 "
            "{%0, %1, %2, %3, %4, %5, %6, %7}, {%8, %9}, {%10, %11}, {%0, %1, %2, %3, %4, %5, %6, %7};"
            : "+r"(Dxi[0]), "+r"(Dxi[1]), "+r"(Dxi[2]), "+r"(Dxi[3]), "+r"(Dxi[4]), "+r"(Dxi[5]), "+r"(Dxi[6]), "+r"(Dxi[7])
            : "r"(Axi[2]), "r"(Axi[3]), "r"(Bxi[2]), "r"(Bxi[3]));
        asm("mma.sync.aligned.m8n8k4.row.col.f32.f16.f16.f32 "
            "{%0, %1, %2, %3, %4, %5, %6, %7}, {%8, %9}, {%10, %11}, {%0, %1, %2, %3, %4, %5, %6, %7};"
            : "+r"(Dxi[0]), "+r"(Dxi[1]), "+r"(Dxi[2]), "+r"(Dxi[3]), "+r"(Dxi[4]), "+r"(Dxi[5]), "+r"(Dxi[6]), "+r"(Dxi[7])
            : "r"(Axi[4]), "r"(Axi[5]), "r"(Bxi[4]), "r"(Bxi[5]));
        asm("mma.sync.aligned.m8n8k4.row.col.f32.f16.f16.f32 "
            "{%0, %1, %2, %3, %4, %5, %6, %7}, {%8, %9}, {%10, %11}, {%0, %1, %2, %3, %4, %5, %6, %7};"
            : "+r"(Dxi[0]), "+r"(Dxi[1]), "+r"(Dxi[2]), "+r"(Dxi[3]), "+r"(Dxi[4]), "+r"(Dxi[5]), "+r"(Dxi[6]), "+r"(Dxi[7])
            : "r"(Axi[6]), "r"(Axi[7]), "r"(Bxi[6]), "r"(Bxi[7]));
#else
        tile<16, 8, float> * D16 = (tile<16, 8, float> *) &D;
        tile<16, 8, half2> * A16 = (tile<16, 8, half2> *) &A;
        mma(D16[0], A16[0], B);
        mma(D16[1], A16[1], B);
#endif // __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
    }
}
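
// Illustration (scalar stand-in, not the real PTX semantics): the four
// mma.m8n8k4 calls in the Volta branch above split the K dimension into 4
// slices and accumulate all of them into the same 8 output registers, i.e.
// D += A[:, k0:k0+4] * B[:, k0:k0+4]^T for k0 in {0, 4, 8, 12}. A reference
// host loop with the same accumulation structure (B accessed column-major,
// matching "row.col"):
static void mma_k_sliced(float * D, const float * A, const float * B,
                         int M, int N, int K) {
    for (int k0 = 0; k0 < K; k0 += 4) {              // one "mma" per K slice
        for (int m = 0; m < M; ++m) {
            for (int n = 0; n < N; ++n) {
                float sum = 0.0f;
                for (int k = k0; k < k0 + 4 && k < K; ++k) {
                    sum += A[m*K + k] * B[n*K + k];
                }
                D[m*N + n] += sum;                   // accumulate into the same output
            }
        }
    }
}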

@@ -148,7 +148,7 @@ bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const
        case GGML_TYPE_F32:
            return ampere_mma_available(cc);
        case GGML_TYPE_F16:
            return volta_mma_available(cc) || turing_mma_available(cc);
            return turing_mma_available(cc);
        case GGML_TYPE_BF16:
            return ampere_mma_available(cc);
        default:

@@ -28,19 +28,9 @@ static __global__ void mul_mat_f(
        const int channel_ratio, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst,
        const int sample_ratio, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst) {
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
    constexpr bool I_16_supported = tile<16, 8, T>::supported() && tile<16, 8, float>::supported();
    constexpr bool I_32_supported = tile<32, 8, T>::supported() && tile<32, 8, float>::supported();

    if (!I_16_supported && !I_32_supported) {
        NO_DEVICE_CODE;
        return;
    }

    constexpr int I_preferred = I_16_supported ? 16 : 32; // For Turing MMA both work but 16 is ~1% faster.

    typedef tile<I_preferred, 8, T> tile_A;
    typedef tile<8, 8, T> tile_B;
    typedef tile<I_preferred, 8, float> tile_C;
    typedef tile<16, 8, T> tile_A;
    typedef tile< 8, 8, T> tile_B;
    typedef tile<16, 8, float> tile_C;

    constexpr int warp_size = ggml_cuda_get_physical_warp_size();
    constexpr int tile_k_padded = warp_size + 4;
@@ -242,6 +232,7 @@ static __global__ void mul_mat_f(
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
}
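
// Illustration (reduced stand-in types, not the real tiles): the removed code
// above performs a compile-time capability probe: check which tile heights the
// target supports, bail out with NO_DEVICE_CODE when neither exists, otherwise
// prefer the 16-row tile. A minimal sketch of that selection pattern:
template <int I> struct probe { static constexpr bool supported() { return I == 16 || I == 32; } };

constexpr int pick_i() {
    constexpr bool ok16 = probe<16>::supported();
    constexpr bool ok32 = probe<32>::supported();
    static_assert(ok16 || ok32, "no usable tile layout for this target");
    return ok16 ? 16 : 32; // on Turing both work; 16 is ~1% faster per the comment above
}
static_assert(pick_i() == 16, "sketch sanity check");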


// This kernel is for larger batch sizes of mul_mat_id
template <typename T, int rows_per_block, int cols_per_block, int nwarps>
__launch_bounds__(ggml_cuda_get_physical_warp_size()*nwarps, 1)
@@ -254,19 +245,9 @@ static __global__ void mul_mat_f_ids(
        const int sample_ratio, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst,
        const uint3 sis1_fd, const uint3 nch_fd) {
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
    constexpr bool I_16_supported = tile<16, 8, T>::supported() && tile<16, 8, float>::supported();
    constexpr bool I_32_supported = tile<32, 8, T>::supported() && tile<32, 8, float>::supported();

    if (!I_16_supported && !I_32_supported) {
        NO_DEVICE_CODE;
        return;
    }

    constexpr int I_preferred = I_16_supported ? 16 : 32; // For Turing MMA both work but 16 is ~1% faster.

    typedef tile<I_preferred, 8, T> tile_A;
    typedef tile<8, 8, T> tile_B;
    typedef tile<I_preferred, 8, float> tile_C;
    typedef tile<16, 8, T> tile_A;
    typedef tile< 8, 8, T> tile_B;
    typedef tile<16, 8, float> tile_C;

    constexpr int warp_size = ggml_cuda_get_physical_warp_size();
    constexpr int tile_k_padded = warp_size + 4;
@@ -552,8 +533,7 @@ void mul_mat_f_cuda(
        const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x,
        const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst,
        cudaStream_t stream, const mmf_ids_data * ids_data) {
    typedef tile<16, 8, T> tile_A_16;
    typedef tile<32, 8, T> tile_A_32;
    typedef tile<16, 8, T> tile_A;
    typedef tile< 8, 8, T> tile_B;

    GGML_ASSERT(ncols_x % 2 == 0);
@@ -564,8 +544,7 @@ void mul_mat_f_cuda(
    const int64_t channel_ratio = nchannels_dst / nchannels_x;
    const int64_t sample_ratio = nsamples_dst / nsamples_x;

    const int device = ggml_cuda_get_device();
    const int cc = ggml_cuda_info().devices[device].cc;
    const int device = ggml_cuda_get_device();
    const int warp_size = ggml_cuda_info().devices[device].warp_size;

    int64_t nwarps_best = 1;
@@ -580,7 +559,7 @@ void mul_mat_f_cuda(
    }

    constexpr int rows_per_block = MMF_ROWS_PER_BLOCK;
    const int nbytes_shared_iter = nwarps_best * (volta_mma_available(cc) ? tile_A_32::I : tile_A_16::I) * (warp_size + 4) * 4;
    const int nbytes_shared_iter = nwarps_best * tile_A::I * (warp_size + 4) * 4;
    const int nbytes_shared_combine = GGML_PAD(cols_per_block, tile_B::I) * (nwarps_best*rows_per_block + 4) * 4;
    const int nbytes_shared = std::max(nbytes_shared_iter, nbytes_shared_combine);
    const int nbytes_slotmap = ids ? GGML_PAD(cols_per_block, 16) * sizeof(int) : 0;
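
// Worked example (my numbers, not from the patch): with nwarps_best = 4,
// tile_A::I = 16 and warp_size = 32, the iteration buffer above is
// 4 * 16 * (32 + 4) * 4 bytes = 9216 bytes of shared memory; the +4 column
// padding keeps consecutive rows out of the same shared-memory bank.
static_assert(4 * 16 * (32 + 4) * 4 == 9216, "nbytes_shared_iter example");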

@@ -1,12 +1,11 @@
#include "ggml.h"
#include "common.cuh"
#include "unary.cuh"
#include "mmvf.cuh"
#include "convert.cuh"
#include "mmvf.cuh"

template <typename T, typename type_acc, int ncols_dst, int block_size, bool has_fusion = false>
template <typename T, typename type_acc, int ncols_dst, int block_size>
static __global__ void mul_mat_vec_f(
        const T * __restrict__ x, const float * __restrict__ y, const int32_t * __restrict__ ids, const ggml_cuda_mm_fusion_args_device fusion, float * __restrict__ dst,
        const T * __restrict__ x, const float * __restrict__ y, const int32_t * __restrict__ ids, float * __restrict__ dst,
        const int ncols2, const int nchannels_y, const int stride_row, const int stride_col_y2, const int stride_col_dst,
        const uint3 channel_ratio, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst,
        const uint3 sample_ratio, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst) {
@@ -25,164 +24,58 @@ static __global__ void mul_mat_vec_f(
    y += int64_t(sample_y) *stride_sample_y + channel_y *stride_channel_y;
    dst += int64_t(sample_dst)*stride_sample_dst + channel_dst*stride_channel_dst;

    bool use_gate = false;
    bool use_bias = false;
    bool use_gate_bias = false;
    ggml_glu_op glu_op = ggml_glu_op::GGML_GLU_OP_SWIGLU;
    const T * gate_x = nullptr;
    const float * x_bias = nullptr;
    const float * gate_bias = nullptr;

    if constexpr (has_fusion) {
        use_gate = fusion.gate != nullptr;
        use_bias = fusion.x_bias != nullptr;
        use_gate_bias = fusion.gate_bias != nullptr;
        glu_op = fusion.glu_op;

        if (use_gate) {
            gate_x = static_cast<const T *>(fusion.gate);
        }
        if (use_bias) {
            x_bias = static_cast<const float *>(fusion.x_bias);
        }
        if (use_gate_bias) {
            gate_bias = static_cast<const float *>(fusion.gate_bias);
            use_gate_bias = use_gate;
        } else {
            use_gate_bias = false;
        }
    }

    if (use_gate) {
        gate_x += int64_t(sample_x) *stride_sample_x + channel_x *stride_channel_x + row*stride_row;
    }
    if constexpr (has_fusion) {
        const int channel_bias = ids ? channel_x : channel_dst;
        if (use_bias) {
            x_bias += int64_t(sample_dst)*stride_sample_dst + channel_bias*stride_channel_dst;
        }
        if (use_gate_bias) {
            gate_bias += int64_t(sample_dst)*stride_sample_dst + channel_bias*stride_channel_dst;
        }
    }

    const float2 * y2 = (const float2 *) y;

    extern __shared__ char data_mmv[];
    float * buf_iw = (float *) data_mmv;
    float * buf_iw_gate = nullptr;
    if constexpr (has_fusion) {
        buf_iw_gate = (float *) (data_mmv + warp_size*sizeof(float));
    }

    if (block_size > warp_size) {
        if (tid < warp_size) {
            buf_iw[tid] = 0.0f;
            if constexpr (has_fusion) {
                if (use_gate) {
                    buf_iw_gate[tid] = 0.0f;
                }
            }
        }
        __syncthreads();
    }

    float sumf[ncols_dst] = {0.0f};
    float sumf_gate[ncols_dst];
    if constexpr (has_fusion) {
#pragma unroll
        for (int j = 0; j < ncols_dst; ++j) {
            sumf_gate[j] = 0.0f;
        }
    }

    if constexpr (std::is_same_v<T, float>) {
        const float2 * x2 = (const float2 *) x;
        const float2 * gate_x2 = nullptr;
        if constexpr (has_fusion) {
            if (use_gate) {
                gate_x2 = (const float2 *) gate_x;
            }
        }

        for (int col2 = tid; col2 < ncols2; col2 += block_size) {
            const float2 tmpx = x2[col2];
            float2 tmpx_gate = make_float2(0.0f, 0.0f);
            if constexpr (has_fusion) {
                if (use_gate) {
                    tmpx_gate = gate_x2[col2];
                }
            }

#pragma unroll
            for (int j = 0; j < ncols_dst; ++j) {
                const float2 tmpy = y2[j*stride_col_y2 + col2];
                ggml_cuda_mad(sumf[j], tmpx.x, tmpy.x);
                ggml_cuda_mad(sumf[j], tmpx.y, tmpy.y);

                if constexpr (has_fusion) {
                    if (use_gate) {
                        ggml_cuda_mad(sumf_gate[j], tmpx_gate.x, tmpy.x);
                        ggml_cuda_mad(sumf_gate[j], tmpx_gate.y, tmpy.y);
                    }
                }
            }
        }
    } else if constexpr (std::is_same_v<T, half>) {
        const half2 * x2 = (const half2 *) x;
        const half2 * gate_x2 = nullptr;
        if constexpr (has_fusion) {
            if (use_gate) {
                gate_x2 = (const half2 *) gate_x;
            }
        }

        if (std::is_same_v<type_acc, float>) {
            for (int col2 = tid; col2 < ncols2; col2 += block_size) {
                const float2 tmpx = __half22float2(x2[col2]);
                float2 tmpx_gate = make_float2(0.0f, 0.0f);
                if constexpr (has_fusion) {
                    if (use_gate) {
                        tmpx_gate = __half22float2(gate_x2[col2]);
                    }
                }

#pragma unroll
                for (int j = 0; j < ncols_dst; ++j) {
                    const float2 tmpy = y2[j*stride_col_y2 + col2];
                    ggml_cuda_mad(sumf[j], tmpx.x, tmpy.x);
                    ggml_cuda_mad(sumf[j], tmpx.y, tmpy.y);

                    if constexpr (has_fusion) {
                        if (use_gate) {
                            ggml_cuda_mad(sumf_gate[j], tmpx_gate.x, tmpy.x);
                            ggml_cuda_mad(sumf_gate[j], tmpx_gate.y, tmpy.y);
                        }
                    }
                }
            }
        } else {
#ifdef FP16_AVAILABLE
            half2 sumh2[ncols_dst] = {{0.0f, 0.0f}};
            half2 sumh2_gate[ncols_dst] = {{0.0f, 0.0f}};

            for (int col2 = tid; col2 < ncols2; col2 += block_size) {
                const half2 tmpx = x2[col2];
                half2 tmpx_gate = make_half2(0.0f, 0.0f);
                if constexpr (has_fusion) {
                    if (use_gate) {
                        tmpx_gate = gate_x2[col2];
                    }
                }

#pragma unroll
                for (int j = 0; j < ncols_dst; ++j) {
                    const float2 tmpy = y2[j*stride_col_y2 + col2];
                    sumh2[j] += tmpx * make_half2(tmpy.x, tmpy.y);

                    if constexpr (has_fusion) {
                        if (use_gate) {
                            sumh2_gate[j] += tmpx_gate * make_half2(tmpy.x, tmpy.y);
                        }
                    }
                }
            }

@@ -190,15 +83,6 @@ static __global__ void mul_mat_vec_f(
            for (int j = 0; j < ncols_dst; ++j) {
                sumf[j] = __low2float(sumh2[j]) + __high2float(sumh2[j]);
            }

            if constexpr (has_fusion) {
                if (use_gate) {
#pragma unroll
                    for (int j = 0; j < ncols_dst; ++j) {
                        sumf_gate[j] = __low2float(sumh2_gate[j]) + __high2float(sumh2_gate[j]);
                    }
                }
            }
#else
            NO_DEVICE_CODE;
#endif // FP16_AVAILABLE
@@ -207,20 +91,8 @@ static __global__ void mul_mat_vec_f(
        // TODO: add support for ggml_cuda_mad for hip_bfloat162
#if defined(GGML_USE_HIP)
        const int * x2 = (const int *) x;
        const int * gate_x2 = nullptr;
        if constexpr (has_fusion) {
            if (use_gate) {
                gate_x2 = (const int *) gate_x;
            }
        }
        for (int col2 = tid; col2 < ncols2; col2 += block_size) {
            const int tmpx = x2[col2];
            int tmpx_gate = 0;
            if constexpr (has_fusion) {
                if (use_gate) {
                    tmpx_gate = gate_x2[col2];
                }
            }
#pragma unroll
            for (int j = 0; j < ncols_dst; ++j) {
                const float2 tmpy = y2[j*stride_col_y2 + col2];
@@ -228,45 +100,17 @@ static __global__ void mul_mat_vec_f(
                const float tmpx1 = ggml_cuda_cast<float>(reinterpret_cast<const nv_bfloat16 *>(&tmpx)[1]);
                ggml_cuda_mad(sumf[j], tmpx0, tmpy.x);
                ggml_cuda_mad(sumf[j], tmpx1, tmpy.y);

                if constexpr (has_fusion) {
                    if (use_gate) {
                        const float tmpx0_gate = ggml_cuda_cast<float>(reinterpret_cast<const nv_bfloat16 *>(&tmpx_gate)[0]);
                        const float tmpx1_gate = ggml_cuda_cast<float>(reinterpret_cast<const nv_bfloat16 *>(&tmpx_gate)[1]);
                        ggml_cuda_mad(sumf_gate[j], tmpx0_gate, tmpy.x);
                        ggml_cuda_mad(sumf_gate[j], tmpx1_gate, tmpy.y);
                    }
                }
            }
        }
#else
        const nv_bfloat162 * x2 = (const nv_bfloat162 *) x;
        const nv_bfloat162 * gate_x2 = nullptr;
        if constexpr (has_fusion) {
            if (use_gate) {
                gate_x2 = (const nv_bfloat162 *) gate_x;
            }
        }
        for (int col2 = tid; col2 < ncols2; col2 += block_size) {
            const nv_bfloat162 tmpx = x2[col2];
            nv_bfloat162 tmpx_gate;
            if constexpr (has_fusion) {
                if (use_gate) {
                    tmpx_gate = gate_x2[col2];
                }
            }
#pragma unroll
            for (int j = 0; j < ncols_dst; ++j) {
                const float2 tmpy = y2[j*stride_col_y2 + col2];
                ggml_cuda_mad(sumf[j], tmpx.x, tmpy.x);
                ggml_cuda_mad(sumf[j], tmpx.y, tmpy.y);

                if constexpr (has_fusion) {
                    if (use_gate) {
                        ggml_cuda_mad(sumf_gate[j], tmpx_gate.x, tmpy.x);
                        ggml_cuda_mad(sumf_gate[j], tmpx_gate.y, tmpy.y);
                    }
                }
            }
        }
#endif
@@ -278,31 +122,13 @@ static __global__ void mul_mat_vec_f(
    for (int j = 0; j < ncols_dst; ++j) {
        sumf[j] = warp_reduce_sum<warp_size>(sumf[j]);

        if constexpr (has_fusion) {
            if (use_gate) {
                sumf_gate[j] = warp_reduce_sum<warp_size>(sumf_gate[j]);
            }
        }

        if (block_size > warp_size) {
            buf_iw[tid/warp_size] = sumf[j];
            if constexpr (has_fusion) {
                if (use_gate) {
                    buf_iw_gate[tid/warp_size] = sumf_gate[j];
                }
            }
            __syncthreads();
            if (tid < warp_size) {
                sumf[j] = buf_iw[tid];
                sumf[j] = warp_reduce_sum<warp_size>(sumf[j]);
                if constexpr (has_fusion) {
                    if (use_gate) {
                        sumf_gate[j] = buf_iw_gate[tid];
                        sumf_gate[j] = warp_reduce_sum<warp_size>(sumf_gate[j]);
                    }
                }
            }

            if (j < ncols_dst) {
                __syncthreads();
            }
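
// Illustration of the two-stage reduction above (assumption: warp_size = 32 and
// a shuffle-based warp_reduce_sum, which is how ggml reduces on NVIDIA; the
// helper below is a simplified stand-in, not the real one). Stage 1: each warp
// reduces its partial sums and warp w writes its result to buf_iw[w]. Stage 2:
// after __syncthreads(), the first warp reloads the per-warp results and
// reduces them once more, yielding the block-wide total.
__device__ float warp_reduce_sum_sketch(float x) {
#pragma unroll
    for (int offset = 16; offset > 0; offset >>= 1) {
        x += __shfl_xor_sync(0xffffffff, x, offset, 32); // butterfly reduction
    }
    return x;
}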
@@ -313,74 +139,12 @@ static __global__ void mul_mat_vec_f(
        return;
    }

    float value = sumf[tid];

    if constexpr (has_fusion) {
        if (use_bias) {
            value += x_bias[tid*stride_col_dst + row];
        }

        if (use_gate) {
            float gate_value = sumf_gate[tid];
            if (use_gate_bias) {
                gate_value += gate_bias[tid*stride_col_dst + row];
            }
            switch (glu_op) {
                case GGML_GLU_OP_SWIGLU:
                    value *= ggml_cuda_op_silu_single(gate_value);
                    break;
                case GGML_GLU_OP_GEGLU:
                    value *= ggml_cuda_op_gelu_single(gate_value);
                    break;
                case GGML_GLU_OP_SWIGLU_OAI: {
                    value = ggml_cuda_op_swiglu_oai_single(gate_value, value);
                    break;
                }
                default:
                    break;
            }
        }
    }

    dst[tid*stride_col_dst + row] = value;

    if constexpr (!has_fusion) {
        GGML_UNUSED_VARS(use_gate, use_bias, use_gate_bias, glu_op, gate_x, x_bias, gate_bias, sumf_gate);
    }
}
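
// Illustration of the fused gating math above (assumption: these match the
// standard SiLU and tanh-approximated GELU definitions that ggml uses; the
// *_single helpers are per-element forms of the ops this diff removes along
// with the fusion path):
__device__ float silu_single_sketch(float x) {
    return x / (1.0f + expf(-x));            // SWIGLU: value *= silu(gate)
}
__device__ float gelu_single_sketch(float x) {
    const float GELU_COEF_A    = 0.044715f;
    const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
    // GEGLU: value *= gelu(gate)
    return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
}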

template<typename T, typename type_acc, int ncols_dst, int block_size>
static void mul_mat_vec_f_switch_fusion(
        const T * x, const float * y, const int32_t * ids, const ggml_cuda_mm_fusion_args_device fusion, float * dst,
        const int64_t ncols, const int64_t nrows,
        const int64_t stride_row, const int64_t stride_col_y, const int64_t stride_col_dst,
        const uint3 channel_ratio, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst,
        const uint3 sample_ratio, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst,
        const dim3 & block_dims, const dim3 & block_nums, const int nbytes_shared, const cudaStream_t stream) {

    const bool has_fusion = fusion.gate != nullptr || fusion.x_bias != nullptr || fusion.gate_bias != nullptr;
    if constexpr (ncols_dst == 1) {
        if (has_fusion) {
            mul_mat_vec_f<T, type_acc, ncols_dst, block_size, true><<<block_nums, block_dims, nbytes_shared, stream>>>
                (x, y, ids, fusion, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
                 channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
                 sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
            return;
        }
    }

    GGML_ASSERT(!has_fusion && "fusion only supported for ncols_dst=1");

    mul_mat_vec_f<T, type_acc, ncols_dst, block_size><<<block_nums, block_dims, nbytes_shared, stream>>>
        (x, y, ids, fusion, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
         channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
         sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);

    dst[tid*stride_col_dst + row] = sumf[tid];
}

template <typename T, typename type_acc, int ncols_dst>
void launch_mul_mat_vec_f_cuda(
        const T * x, const float * y, const int32_t * ids, const ggml_cuda_mm_fusion_args_device fusion, float * dst,
static void launch_mul_mat_vec_f_cuda(
        const T * x, const float * y, const int32_t * ids, float * dst,
        const int64_t ncols, const int64_t nrows,
        const int64_t stride_row, const int64_t stride_col_y, const int64_t stride_col_dst,
        const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst,
@@ -412,59 +176,57 @@ void launch_mul_mat_vec_f_cuda(
        }
    }

    const bool has_fusion = fusion.gate != nullptr || fusion.x_bias != nullptr || fusion.gate_bias != nullptr;

    const int nbytes_shared = warp_size*sizeof(float) + (has_fusion ? warp_size*sizeof(float) : 0);
    const int nbytes_shared = warp_size*sizeof(float);
    const dim3 block_nums(nrows, nchannels_dst, nsamples_dst);
    const dim3 block_dims(block_size_best, 1, 1);
    switch (block_size_best) {
        case 32: {
            mul_mat_vec_f_switch_fusion<T, type_acc, ncols_dst, 32>
                (x, y, ids, fusion, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
            mul_mat_vec_f<T, type_acc, ncols_dst, 32><<<block_nums, block_dims, nbytes_shared, stream>>>
                (x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
                 channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
                 sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, block_dims, block_nums, nbytes_shared, stream);
                 sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst);
        } break;
        case 64: {
            mul_mat_vec_f_switch_fusion<T, type_acc, ncols_dst, 64>
                (x, y, ids, fusion, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
            mul_mat_vec_f<T, type_acc, ncols_dst, 64><<<block_nums, block_dims, nbytes_shared, stream>>>
                (x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
                 channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
                 sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, block_dims, block_nums, nbytes_shared, stream);
                 sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst);
        } break;
        case 96: {
            mul_mat_vec_f_switch_fusion<T, type_acc, ncols_dst, 96>
                (x, y, ids, fusion, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
            mul_mat_vec_f<T, type_acc, ncols_dst, 96><<<block_nums, block_dims, nbytes_shared, stream>>>
                (x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
                 channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
                 sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, block_dims, block_nums, nbytes_shared, stream);
                 sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst);
        } break;
        case 128: {
            mul_mat_vec_f_switch_fusion<T, type_acc, ncols_dst, 128>
                (x, y, ids, fusion, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
            mul_mat_vec_f<T, type_acc, ncols_dst, 128><<<block_nums, block_dims, nbytes_shared, stream>>>
                (x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
                 channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
                 sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, block_dims, block_nums, nbytes_shared, stream);
                 sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst);
        } break;
        case 160: {
            mul_mat_vec_f_switch_fusion<T, type_acc, ncols_dst, 160>
                (x, y, ids, fusion, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
            mul_mat_vec_f<T, type_acc, ncols_dst, 160><<<block_nums, block_dims, nbytes_shared, stream>>>
                (x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
                 channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
                 sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, block_dims, block_nums, nbytes_shared, stream);
                 sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst);
        } break;
        case 192: {
            mul_mat_vec_f_switch_fusion<T, type_acc, ncols_dst, 192>
                (x, y, ids, fusion, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
            mul_mat_vec_f<T, type_acc, ncols_dst, 192><<<block_nums, block_dims, nbytes_shared, stream>>>
                (x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
                 channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
                 sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, block_dims, block_nums, nbytes_shared, stream);
                 sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst);
        } break;
        case 224: {
            mul_mat_vec_f_switch_fusion<T, type_acc, ncols_dst, 224>
                (x, y, ids, fusion, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
            mul_mat_vec_f<T, type_acc, ncols_dst, 224><<<block_nums, block_dims, nbytes_shared, stream>>>
                (x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
                 channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
                 sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, block_dims, block_nums, nbytes_shared, stream);
                 sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst);
        } break;
        case 256: {
            mul_mat_vec_f_switch_fusion<T, type_acc, ncols_dst, 256>
                (x, y, ids, fusion, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
            mul_mat_vec_f<T, type_acc, ncols_dst, 256><<<block_nums, block_dims, nbytes_shared, stream>>>
                (x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst,
                 channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
                 sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, block_dims, block_nums, nbytes_shared, stream);
                 sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst);
        } break;
        default: {
            GGML_ABORT("fatal error");
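
// Illustration (my refactor sketch, not part of the patch): the eight cases
// above differ only in the block-size template argument, so the same dispatch
// can be written once as a recursion over a compile-time list:
#include <cstdio>

template <int BS>
static void launch_one() {
    // stand-in for: mul_mat_vec_f<T, type_acc, ncols_dst, BS><<<...>>>(...)
    std::printf("launching with block_size = %d\n", BS);
}

template <int BS, int... Rest>
static void dispatch_block_size(int bs) {
    if (bs == BS) {
        launch_one<BS>();
        return;
    }
    if constexpr (sizeof...(Rest) > 0) {
        dispatch_block_size<Rest...>(bs);
    } else {
        // unreachable for valid inputs; mirrors the GGML_ABORT default above
    }
}

// usage: dispatch_block_size<32, 64, 96, 128, 160, 192, 224, 256>(block_size_best);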
|
||||
@@ -474,7 +236,7 @@ void launch_mul_mat_vec_f_cuda(
|
||||
|
||||
template <typename T, typename type_acc>
|
||||
static void mul_mat_vec_f_cuda_switch_ncols_dst(
|
||||
const T * x, const float * y, const int32_t * ids, const ggml_cuda_mm_fusion_args_device fusion, float * dst,
|
||||
const T * x, const float * y, const int32_t * ids, float * dst,
|
||||
const int64_t ncols, const int64_t nrows, const int64_t ncols_dst,
|
||||
const int64_t stride_row, const int64_t stride_col_y, const int64_t stride_col_dst,
|
||||
const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst,
|
||||
@@ -484,49 +246,49 @@ static void mul_mat_vec_f_cuda_switch_ncols_dst(
|
||||
switch (ncols_dst) {
|
||||
case 1:
|
||||
launch_mul_mat_vec_f_cuda<T, type_acc, 1>
|
||||
(x, y, ids, fusion, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
|
||||
(x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
|
||||
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
break;
|
||||
case 2:
|
||||
launch_mul_mat_vec_f_cuda<T, type_acc, 2>
|
||||
(x, y, ids, fusion, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
|
||||
(x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
|
||||
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
break;
|
||||
case 3:
|
||||
launch_mul_mat_vec_f_cuda<T, type_acc, 3>
|
||||
(x, y, ids, fusion, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
|
||||
(x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
|
||||
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
break;
|
||||
case 4:
|
||||
launch_mul_mat_vec_f_cuda<T, type_acc, 4>
|
||||
(x, y, ids, fusion, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
|
||||
(x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
|
||||
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
break;
|
||||
case 5:
|
||||
launch_mul_mat_vec_f_cuda<T, type_acc, 5>
|
||||
(x, y, ids, fusion, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
|
||||
(x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
|
||||
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
break;
|
||||
case 6:
|
||||
launch_mul_mat_vec_f_cuda<T, type_acc, 6>
|
||||
(x, y, ids, fusion, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
|
||||
(x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
|
||||
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
break;
|
||||
case 7:
|
||||
launch_mul_mat_vec_f_cuda<T, type_acc, 7>
|
||||
(x, y, ids, fusion, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
|
||||
(x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
|
||||
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
break;
|
||||
case 8:
|
||||
launch_mul_mat_vec_f_cuda<T, type_acc, 8>
|
||||
(x, y, ids, fusion, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
|
||||
(x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
|
||||
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
break;
|
||||
@@ -538,31 +300,29 @@ static void mul_mat_vec_f_cuda_switch_ncols_dst(
|
||||
|
||||
template<typename T>
|
||||
static void mul_mat_vec_f_cuda(
|
||||
const T * x, const float * y, const int32_t * ids, const ggml_cuda_mm_fusion_args_device fusion, float * dst,
|
||||
const T * x, const float * y, const int32_t * ids, float * dst,
|
||||
const int64_t ncols, const int64_t nrows, const int64_t ncols_dst,
|
||||
const int64_t stride_row, const int64_t stride_col_y, const int stride_col_dst,
|
||||
const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst,
|
||||
const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x,
|
||||
const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst,
|
||||
enum ggml_prec prec, cudaStream_t stream) {
|
||||
|
||||
if constexpr(std::is_same_v<T, half>) {
|
||||
if (prec == GGML_PREC_DEFAULT) {
|
||||
mul_mat_vec_f_cuda_switch_ncols_dst<T, half>
|
||||
(x, y, ids, fusion, dst, ncols, nrows, ncols_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
|
||||
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
(x, y, ids, dst, ncols, nrows, ncols_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
|
||||
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
return;
|
||||
}
|
||||
}
|
||||
mul_mat_vec_f_cuda_switch_ncols_dst<T, float>
|
||||
(x, y, ids, fusion, dst, ncols, nrows, ncols_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
|
||||
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
(x, y, ids, dst, ncols, nrows, ncols_dst, stride_row, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y,
|
||||
stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
|
||||
}
|
||||
|
||||
void ggml_cuda_mul_mat_vec_f(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst,
|
||||
const ggml_cuda_mm_fusion_args_host * fusion) {
|
||||
void ggml_cuda_mul_mat_vec_f(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) {
|
||||
GGML_ASSERT( src1->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(!ids || ids->type == GGML_TYPE_I32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
@@ -588,30 +348,6 @@ void ggml_cuda_mul_mat_vec_f(ggml_backend_cuda_context & ctx, const ggml_tensor
|
||||
const int32_t * ids_d = ids ? (const int32_t *) ids->data : nullptr;
|
||||
float * dst_d = (float *) dst->data;
|
||||
|
||||
ggml_cuda_mm_fusion_args_device fusion_local{};
|
||||
|
||||
if (fusion) {
|
||||
GGML_ASSERT( !ids || dst->ne[2] == 1);
|
||||
GGML_ASSERT( ids || dst->ne[1] == 1);
|
||||
if (fusion->x_bias) {
|
||||
GGML_ASSERT(fusion->x_bias->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(fusion->x_bias->ne[0] == dst->ne[0]);
|
||||
GGML_ASSERT(!ids || fusion->x_bias->ne[1] == src0->ne[2]);
|
||||
fusion_local.x_bias = fusion->x_bias->data;
|
||||
}
|
||||
if (fusion->gate) {
|
||||
GGML_ASSERT(fusion->gate->type == src0->type && ggml_are_same_stride(fusion->gate, src0));
|
||||
fusion_local.gate = fusion->gate->data;
|
||||
}
|
||||
if (fusion->gate_bias) {
|
||||
GGML_ASSERT(fusion->gate_bias->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(fusion->gate_bias->ne[0] == dst->ne[0]);
|
||||
GGML_ASSERT(!ids || fusion->gate_bias->ne[1] == src0->ne[2]);
|
||||
fusion_local.gate_bias = fusion->gate_bias->data;
|
||||
}
|
||||
fusion_local.glu_op = fusion->glu_op;
|
||||
}
|
||||
|
||||
const int64_t s01 = src0->nb[1] / ts_src0;
|
||||
const int64_t s11 = src1->nb[1] / ts_src1;
|
||||
const int64_t s1 = dst->nb[1] / ts_dst;
|
||||
@@ -634,19 +370,19 @@ void ggml_cuda_mul_mat_vec_f(ggml_backend_cuda_context & ctx, const ggml_tensor
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F32: {
|
||||
const float * src0_d = (const float *) src0->data;
|
||||
mul_mat_vec_f_cuda(src0_d, src1_d, ids_d, fusion_local, dst_d, ne00, ne01, ncols_dst, s01, s11, s1,
|
||||
mul_mat_vec_f_cuda(src0_d, src1_d, ids_d, dst_d, ne00, ne01, ncols_dst, s01, s11, s1,
|
||||
ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst,
|
||||
ne03, ne3, s03, s13, s3, prec, ctx.stream());
|
||||
} break;
|
||||
case GGML_TYPE_F16: {
|
||||
const half * src0_d = (const half *) src0->data;
|
||||
mul_mat_vec_f_cuda(src0_d, src1_d, ids_d, fusion_local, dst_d, ne00, ne01, ncols_dst, s01, s11, s1,
|
||||
mul_mat_vec_f_cuda(src0_d, src1_d, ids_d, dst_d, ne00, ne01, ncols_dst, s01, s11, s1,
|
||||
ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst,
|
||||
ne03, ne3, s03, s13, s3, prec, ctx.stream());
|
||||
} break;
|
||||
case GGML_TYPE_BF16: {
|
||||
const nv_bfloat16 * src0_d = (const nv_bfloat16 *) src0->data;
|
||||
mul_mat_vec_f_cuda(src0_d, src1_d, ids_d, fusion_local, dst_d, ne00, ne01, ncols_dst, s01, s11, s1,
|
||||
mul_mat_vec_f_cuda(src0_d, src1_d, ids_d, dst_d, ne00, ne01, ncols_dst, s01, s11, s1,
|
||||
ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst,
|
||||
ne03, ne3, s03, s13, s3, prec, ctx.stream());
|
||||
} break;
|
||||
@@ -673,6 +409,7 @@ void ggml_cuda_op_mul_mat_vec_f(
|
||||
const int cc = ggml_cuda_info().devices[id].cc;
|
||||
const enum ggml_prec prec = fast_fp16_available(cc) ? ggml_prec(dst->op_params[0]) : GGML_PREC_F32;
|
||||
|
||||
|
||||
// ggml_cuda_op provides single, contiguous matrices
|
||||
const int64_t stride_row = ne00;
|
||||
const int64_t stride_col_y = ne10;
|
||||
@@ -689,23 +426,22 @@ void ggml_cuda_op_mul_mat_vec_f(
|
||||
const int64_t stride_sample_y = 0;
|
||||
const int64_t stride_sample_dst = 0;
|
||||
|
||||
ggml_cuda_mm_fusion_args_device empty{};
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F32: {
|
||||
const float * src0_d = (const float *) src0_dd_i;
|
||||
mul_mat_vec_f_cuda(src0_d, src1_ddf_i, nullptr, empty, dst_dd_i, ne00, row_diff, src1_ncols, stride_row, stride_col_y, stride_col_dst,
|
||||
mul_mat_vec_f_cuda(src0_d, src1_ddf_i, nullptr, dst_dd_i, ne00, row_diff, src1_ncols, stride_row, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, prec, stream);
|
||||
} break;
|
||||
case GGML_TYPE_F16: {
|
||||
const half * src0_d = (const half *) src0_dd_i;
|
||||
mul_mat_vec_f_cuda(src0_d, src1_ddf_i, nullptr, empty, dst_dd_i, ne00, row_diff, src1_ncols, stride_row, stride_col_y, stride_col_dst,
|
||||
mul_mat_vec_f_cuda(src0_d, src1_ddf_i, nullptr, dst_dd_i, ne00, row_diff, src1_ncols, stride_row, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, prec, stream);
|
||||
} break;
|
||||
case GGML_TYPE_BF16: {
|
||||
const nv_bfloat16 * src0_d = (const nv_bfloat16 *) src0_dd_i;
|
||||
mul_mat_vec_f_cuda(src0_d, src1_ddf_i, nullptr, empty, dst_dd_i, ne00, row_diff, src1_ncols, stride_row, stride_col_y, stride_col_dst,
|
||||
mul_mat_vec_f_cuda(src0_d, src1_ddf_i, nullptr, dst_dd_i, ne00, row_diff, src1_ncols, stride_row, stride_col_y, stride_col_dst,
|
||||
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
|
||||
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, prec, stream);
|
||||
} break;
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
#include "common.cuh"
|
||||
|
||||
void ggml_cuda_mul_mat_vec_f(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst,
|
||||
const ggml_cuda_mm_fusion_args_host * fusion = nullptr);
|
||||
void ggml_cuda_mul_mat_vec_f(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst);
|
||||
|
||||
void ggml_cuda_op_mul_mat_vec_f(
|
||||
ggml_backend_cuda_context & ctx,
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
#include "mmvq.cuh"
|
||||
#include "quantize.cuh"
|
||||
#include "unary.cuh"
|
||||
#include "vecdotq.cuh"
|
||||
|
||||
#include <cstdint>
|
||||
@@ -83,7 +82,7 @@ static __host__ mmvq_parameter_table_id get_device_table_id(int cc) {
|
||||
return MMVQ_PARAMETERS_GENERIC;
|
||||
}
|
||||
|
||||
static constexpr __host__ __device__ int calc_nwarps(int ncols_dst, mmvq_parameter_table_id table_id) {
|
||||
static constexpr __host__ __device__ int calc_nwarps(int ncols_dst, mmvq_parameter_table_id table_id) {
|
||||
if (table_id == MMVQ_PARAMETERS_GENERIC) {
|
||||
switch (ncols_dst) {
|
||||
case 1:
|
||||
@@ -137,11 +136,11 @@ static constexpr __host__ __device__ int calc_rows_per_block(int ncols_dst, int
|
||||
return 1;
|
||||
}
|
||||
|
||||
template <ggml_type type, int ncols_dst>
|
||||
// tell the compiler to use as many registers as it wants, see nwarps definition below
|
||||
template <ggml_type type, int ncols_dst, bool has_fusion>
|
||||
__launch_bounds__(calc_nwarps(ncols_dst, get_device_table_id())*ggml_cuda_get_physical_warp_size(), 1)
|
||||
static __global__ void mul_mat_vec_q(
|
||||
const void * __restrict__ vx, const void * __restrict__ vy, const int32_t * __restrict__ ids, const ggml_cuda_mm_fusion_args_device fusion, float * __restrict__ dst,
|
||||
const void * __restrict__ vx, const void * __restrict__ vy, const int32_t * __restrict__ ids, float * __restrict__ dst,
|
||||
const uint32_t ncols_x, const uint3 nchannels_y, const uint32_t stride_row_x, const uint32_t stride_col_y,
|
||||
const uint32_t stride_col_dst, const uint3 channel_ratio, const uint32_t stride_channel_x,
|
||||
const uint32_t stride_channel_y, const uint32_t stride_channel_dst, const uint3 sample_ratio,
|
||||
@@ -170,56 +169,8 @@ static __global__ void mul_mat_vec_q(
|
||||
const uint32_t sample_x = fastdiv(sample_dst, sample_ratio);
|
||||
const uint32_t sample_y = sample_dst;
|
||||
|
||||
bool use_gate = false;
|
||||
bool use_bias = false;
|
||||
bool use_gate_bias = false;
|
||||
const void * vgate = nullptr;
|
||||
const float * x_bias = nullptr;
|
||||
const float * gate_bias = nullptr;
|
||||
ggml_glu_op active_glu;
|
||||
|
||||
if constexpr (has_fusion) {
|
||||
use_gate = fusion.gate != nullptr;
|
||||
use_bias = fusion.x_bias != nullptr;
|
||||
use_gate_bias = fusion.gate_bias != nullptr && use_gate;
|
||||
vgate = fusion.gate;
|
||||
x_bias = (const float *) fusion.x_bias;
|
||||
gate_bias = (const float *) fusion.gate_bias;
|
||||
active_glu = fusion.glu_op;
|
||||
}
|
||||
|
||||
const uint32_t channel_bias = ids ? channel_x : channel_dst;
|
||||
|
||||
float x_biases[ncols_dst] = { 0.0f };
|
||||
float gate_biases[ncols_dst] = { 0.0f };
|
||||
if constexpr (has_fusion) {
|
||||
if (use_bias) {
|
||||
x_bias = x_bias + sample_dst*stride_sample_dst + channel_bias*stride_channel_dst + row0;
|
||||
// 1. Hide latency by prefetching bias and gate here
|
||||
// 2. load only on threads that won't die after partial sum calculation
|
||||
if (threadIdx.x < rows_per_cuda_block && threadIdx.y == 0 &&
|
||||
(rows_per_cuda_block == 1 || uint32_t(row0 + threadIdx.x) < stride_col_dst)) {
#pragma unroll
for (int j = 0; j < ncols_dst; ++j) {
x_biases[j] = x_bias[j * stride_col_dst + threadIdx.x];
}
}
}
if (use_gate_bias) {
gate_bias = gate_bias + sample_dst*stride_sample_dst + channel_bias*stride_channel_dst + row0;
if (threadIdx.x < rows_per_cuda_block && threadIdx.y == 0 &&
(rows_per_cuda_block == 1 || uint32_t(row0 + threadIdx.x) < stride_col_dst)) {
#pragma unroll
for (int j = 0; j < ncols_dst; ++j) {
gate_biases[j] = gate_bias[j * stride_col_dst + threadIdx.x];
}
}
}
}

// partial sum for each thread
float tmp[ncols_dst][rows_per_cuda_block] = {{0.0f}};
float tmp_gate[ncols_dst][rows_per_cuda_block] = {{0.0f}};

const block_q8_1 * y = ((const block_q8_1 *) vy) + sample_y*stride_sample_y + channel_y*stride_channel_y;
const int kbx_offset = sample_x*stride_sample_x + channel_x*stride_channel_x + row0*stride_row_x;
@@ -236,35 +187,17 @@ static __global__ void mul_mat_vec_q(
for (int i = 0; i < rows_per_cuda_block; ++i) {
tmp[j][i] += vec_dot_q_cuda(
vx, &y[j*stride_col_y + kby], kbx_offset + i*stride_row_x + kbx, kqs);
if constexpr (has_fusion) {
if (use_gate) {
tmp_gate[j][i] += vec_dot_q_cuda(
vgate, &y[j*stride_col_y + kby], kbx_offset + i*stride_row_x + kbx, kqs);
}
}
}
}
}

__shared__ float tmp_shared[nwarps-1 > 0 ? nwarps-1 : 1][ncols_dst][rows_per_cuda_block][warp_size];
__shared__ float tmp_shared_gate[(has_fusion && (nwarps-1 > 0)) ? nwarps-1 : 1][ncols_dst][rows_per_cuda_block][warp_size];
if constexpr (!has_fusion) {
(void) tmp_shared_gate;
} else if (!use_gate) {
(void) tmp_shared_gate;
}

if (threadIdx.y > 0) {
#pragma unroll
for (int j = 0; j < ncols_dst; ++j) {
#pragma unroll
for (int i = 0; i < rows_per_cuda_block; ++i) {
tmp_shared[threadIdx.y-1][j][i][threadIdx.x] = tmp[j][i];
if constexpr (has_fusion) {
if (use_gate) {
tmp_shared_gate[threadIdx.y-1][j][i][threadIdx.x] = tmp_gate[j][i];
}
}
}
}
}
@@ -283,55 +216,14 @@ static __global__ void mul_mat_vec_q(
#pragma unroll
for (int l = 0; l < nwarps-1; ++l) {
tmp[j][i] += tmp_shared[l][j][i][threadIdx.x];
if constexpr (has_fusion) {
if (use_gate) {
tmp_gate[j][i] += tmp_shared_gate[l][j][i][threadIdx.x];
}
}
}
tmp[j][i] = warp_reduce_sum<warp_size>(tmp[j][i]);
if constexpr (has_fusion) {
if (use_gate) {
tmp_gate[j][i] = warp_reduce_sum<warp_size>(tmp_gate[j][i]);
}
}
}

if (threadIdx.x < rows_per_cuda_block && (rows_per_cuda_block == 1 || uint32_t(row0 + threadIdx.x) < stride_col_dst)) {
float result = tmp[j][threadIdx.x];
if constexpr (has_fusion) {
if (use_bias) {
result += x_biases[j];
}
if (use_gate) {
float gate_value = tmp_gate[j][threadIdx.x];
if (use_gate_bias) {
gate_value += gate_biases[j];
}
switch (active_glu) {
case GGML_GLU_OP_SWIGLU:
result *= ggml_cuda_op_silu_single(gate_value);
break;
case GGML_GLU_OP_GEGLU:
result *= ggml_cuda_op_gelu_single(gate_value);
break;
case GGML_GLU_OP_SWIGLU_OAI: {
result = ggml_cuda_op_swiglu_oai_single(gate_value, result);
break;
}
default:
result = result * gate_value;
break;
}
}
}
dst[j*stride_col_dst + threadIdx.x] = result;
dst[j*stride_col_dst + threadIdx.x] = tmp[j][threadIdx.x];
}
}

if constexpr (!has_fusion) {
GGML_UNUSED_VARS(use_gate, use_bias, use_gate_bias, active_glu, gate_bias, x_bias, tmp_gate);
}
}
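For reference, the three single-value helpers the fused GLU epilogue above dispatches on reduce to the same formulas as the standalone unary kernels further down in this diff. A minimal sketch — illustrative names, and alpha/limit defaults that are assumptions here rather than ggml's (the real helpers receive them as arguments):

// Sketch of the single-element activations used by the fused epilogue.
__device__ __forceinline__ float silu_single(float x) {
    return x / (1.0f + expf(-x));
}

__device__ __forceinline__ float gelu_single(float x) {
    const float GELU_COEF_A    = 0.044715f;
    const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
    // tanh approximation of GELU
    return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
}

__device__ __forceinline__ float swiglu_oai_single(float x, float g, float alpha = 1.702f, float limit = 7.0f) {
    x = fminf(x, limit);                // clamp the gated input from above
    g = fmaxf(fminf(g, limit), -limit); // clamp the linear input to [-limit, limit]
    const float out_glu = x / (1.0f + expf(-x * alpha));
    return out_glu * (1.0f + g);
}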
static std::pair<dim3, dim3> calc_launch_params(
@@ -343,37 +235,9 @@ static std::pair<dim3, dim3> calc_launch_params(
return {block_nums, block_dims};
}

template<ggml_type type, int c_ncols_dst>
static void mul_mat_vec_q_switch_fusion(
const void * vx, const void * vy, const int32_t * ids, const ggml_cuda_mm_fusion_args_device fusion, float * dst,
const uint32_t ncols_x, const uint3 nchannels_y, const uint32_t stride_row_x, const uint32_t stride_col_y,
const uint32_t stride_col_dst, const uint3 channel_ratio, const uint32_t stride_channel_x,
const uint32_t stride_channel_y, const uint32_t stride_channel_dst, const uint3 sample_ratio,
const uint32_t stride_sample_x, const uint32_t stride_sample_y, const uint32_t stride_sample_dst,
const dim3 & block_nums, const dim3 & block_dims, const int nbytes_shared, cudaStream_t stream) {

const bool has_fusion = fusion.gate != nullptr || fusion.x_bias != nullptr || fusion.gate_bias != nullptr;
if constexpr (c_ncols_dst == 1) {
if (has_fusion) {
mul_mat_vec_q<type, c_ncols_dst, true><<<block_nums, block_dims, nbytes_shared, stream>>>
(vx, vy, ids, fusion, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
return;
}
}

GGML_ASSERT(!has_fusion && "fusion only supported for ncols_dst=1");

mul_mat_vec_q<type, c_ncols_dst, false><<<block_nums, block_dims, nbytes_shared, stream>>>
(vx, vy, ids, fusion, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
}
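The launcher above turns the runtime has_fusion flag into the kernel's compile-time template parameter, so the non-fused instantiation carries none of the gate/bias register or shared-memory cost. A generic sketch of that dispatch pattern (illustrative names, not ggml API):

// Runtime flag -> compile-time parameter dispatch, in miniature.
template <bool has_fusion>
static __global__ void kernel_impl(float * dst) {
    if constexpr (has_fusion) {
        // fused epilogue path: only compiled into the <true> instantiation
    }
    dst[threadIdx.x] = 0.0f;
}

static void launch(bool fused, float * dst, dim3 grid, dim3 block, cudaStream_t stream) {
    if (fused) {
        kernel_impl<true ><<<grid, block, 0, stream>>>(dst);
    } else {
        kernel_impl<false><<<grid, block, 0, stream>>>(dst);
    }
}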
template <ggml_type type>
static void mul_mat_vec_q_switch_ncols_dst(
const void * vx, const void * vy, const int32_t * ids, const ggml_cuda_mm_fusion_args_device fusion, float * dst,
const void * vx, const void * vy, const int32_t * ids, float * dst,
const int ncols_x, const int nrows_x, const int ncols_dst,
const int stride_row_x, const int stride_col_y, const int stride_col_dst,
const int nchannels_x, const int nchannels_y, const int nchannels_dst,
@@ -392,83 +256,80 @@ static void mul_mat_vec_q_switch_ncols_dst(
const int warp_size = ggml_cuda_info().devices[device].warp_size;
const mmvq_parameter_table_id table_id = get_device_table_id(ggml_cuda_info().devices[device].cc);

const bool has_fusion = fusion.gate != nullptr || fusion.x_bias != nullptr || fusion.gate_bias != nullptr;

GGML_ASSERT(!ids || ncols_dst == 1);
switch (ncols_dst) {
case 1: {
constexpr int c_ncols_dst = 1;
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
mul_mat_vec_q<type, c_ncols_dst><<<dims.first, dims.second, 0, stream>>>
(vx, vy, ids, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
dims.first, dims.second, 0, stream);
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst);
} break;
case 2: {
constexpr int c_ncols_dst = 2;
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
mul_mat_vec_q<type, c_ncols_dst><<<dims.first, dims.second, 0, stream>>>
(vx, vy, ids, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
dims.first, dims.second, 0, stream);
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst);
} break;
case 3: {
constexpr int c_ncols_dst = 3;
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
mul_mat_vec_q<type, c_ncols_dst><<<dims.first, dims.second, 0, stream>>>
(vx, vy, ids, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
dims.first, dims.second, 0, stream);
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst);
} break;
case 4: {
constexpr int c_ncols_dst = 4;
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
mul_mat_vec_q<type, c_ncols_dst><<<dims.first, dims.second, 0, stream>>>
(vx, vy, ids, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
dims.first, dims.second, 0, stream);
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst);
} break;
case 5: {
constexpr int c_ncols_dst = 5;
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
mul_mat_vec_q<type, c_ncols_dst><<<dims.first, dims.second, 0, stream>>>
(vx, vy, ids, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
dims.first, dims.second, 0, stream);
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst);
} break;
case 6: {
constexpr int c_ncols_dst = 6;
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
mul_mat_vec_q<type, c_ncols_dst><<<dims.first, dims.second, 0, stream>>>
(vx, vy, ids, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
dims.first, dims.second, 0, stream);
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst);
} break;
case 7: {
constexpr int c_ncols_dst = 7;
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
mul_mat_vec_q<type, c_ncols_dst><<<dims.first, dims.second, 0, stream>>>
(vx, vy, ids, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
dims.first, dims.second, 0, stream);
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst);
} break;
case 8: {
constexpr int c_ncols_dst = 8;
std::pair<dim3, dim3> dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id);
mul_mat_vec_q_switch_fusion<type, c_ncols_dst>(vx, vy, ids, fusion, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
mul_mat_vec_q<type, c_ncols_dst><<<dims.first, dims.second, 0, stream>>>
(vx, vy, ids, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst,
channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst,
dims.first, dims.second, 0, stream);
sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst);
} break;
default:
GGML_ABORT("fatal error");
break;
}

GGML_UNUSED(has_fusion);
}
static void mul_mat_vec_q_switch_type(
const void * vx, const ggml_type type_x, const void * vy, const int32_t * ids, const ggml_cuda_mm_fusion_args_device fusion, float * dst,
const void * vx, const ggml_type type_x, const void * vy, const int32_t * ids, float * dst,
const int ncols_x, const int nrows_x, const int ncols_dst,
const int stride_row_x, const int stride_col_y, const int stride_col_dst,
const int nchannels_x, const int nchannels_y, const int nchannels_dst,
@@ -478,123 +339,143 @@ static void mul_mat_vec_q_switch_type(
switch (type_x) {
case GGML_TYPE_Q4_0:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q4_0>
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
(vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
stream);
break;
case GGML_TYPE_Q4_1:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q4_1>
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
(vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
stream);
break;
case GGML_TYPE_Q5_0:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q5_0>
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
(vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
stream);
break;
case GGML_TYPE_Q5_1:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q5_1>
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
(vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
stream);
break;
case GGML_TYPE_Q8_0:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q8_0>
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
(vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
stream);
break;
case GGML_TYPE_MXFP4:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_MXFP4>
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
(vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
stream);
break;
case GGML_TYPE_Q2_K:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q2_K>
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
(vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
stream);
break;
case GGML_TYPE_Q3_K:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q3_K>
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
(vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
stream);
break;
case GGML_TYPE_Q4_K:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q4_K>
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
(vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
stream);
break;
case GGML_TYPE_Q5_K:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q5_K>
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
(vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
stream);
break;
case GGML_TYPE_Q6_K:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_Q6_K>
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
(vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
stream);
break;
case GGML_TYPE_IQ2_XXS:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ2_XXS>
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
(vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
stream);
break;
case GGML_TYPE_IQ2_XS:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ2_XS>
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
(vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
stream);
break;
case GGML_TYPE_IQ2_S:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ2_S>
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
(vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
stream);
break;
case GGML_TYPE_IQ3_XXS:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ3_XXS>
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
(vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
stream);
break;
case GGML_TYPE_IQ1_S:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ1_S>
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
(vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
stream);
break;
case GGML_TYPE_IQ1_M:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ1_M>
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
(vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
stream);
break;
case GGML_TYPE_IQ4_NL:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ4_NL>
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
(vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
stream);
break;
case GGML_TYPE_IQ4_XS:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ4_XS>
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
(vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
stream);
break;
case GGML_TYPE_IQ3_S:
mul_mat_vec_q_switch_ncols_dst<GGML_TYPE_IQ3_S>
(vx, vy, ids, fusion, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
(vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst,
nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst,
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream);
nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst,
stream);
break;
default:
GGML_ABORT("fatal error");
@@ -603,8 +484,7 @@ static void mul_mat_vec_q_switch_type(
}

void ggml_cuda_mul_mat_vec_q(
ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst,
const ggml_cuda_mm_fusion_args_host * fusion) {
ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) {
GGML_ASSERT( src1->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
GGML_ASSERT(!ids || ids->type == GGML_TYPE_I32); // Optional, used for batched GGML_MUL_MAT_ID.
@@ -628,31 +508,6 @@ void ggml_cuda_mul_mat_vec_q(
const int32_t * ids_d = ids ? (const int32_t *) ids->data : nullptr;
float * dst_d = (float *) dst->data;

ggml_cuda_mm_fusion_args_device fusion_local{};

if (fusion) {
GGML_ASSERT( !ids || dst->ne[2] == 1);
GGML_ASSERT( ids || dst->ne[1] == 1);

if (fusion->x_bias) {
GGML_ASSERT(fusion->x_bias->type == GGML_TYPE_F32);
GGML_ASSERT(fusion->x_bias->ne[0] == dst->ne[0]);
GGML_ASSERT(!ids || fusion->x_bias->ne[1] == src0->ne[2]);
fusion_local.x_bias = fusion->x_bias->data;
}
if (fusion->gate) {
GGML_ASSERT(fusion->gate->type == src0->type && ggml_are_same_stride(fusion->gate, src0));
fusion_local.gate = fusion->gate->data;
}
if (fusion->gate_bias) {
GGML_ASSERT(fusion->gate_bias->type == GGML_TYPE_F32);
GGML_ASSERT(fusion->gate_bias->ne[0] == dst->ne[0]);
GGML_ASSERT(!ids || fusion->gate_bias->ne[1] == src0->ne[2]);
fusion_local.gate_bias = fusion->gate_bias->data;
}
fusion_local.glu_op = fusion->glu_op;
}

// If src0 is a temporary compute buffer, clear any potential padding.
if (ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE) {
const size_t size_data = ggml_nbytes(src0);
@@ -694,10 +549,10 @@ void ggml_cuda_mul_mat_vec_q(
const int64_t stride_channel_y = ids ? s11 : s12;

mul_mat_vec_q_switch_type(
src0->data, src0->type, src1_q8_1.get(), ids_d, fusion_local, dst_d, ne00,
src0->data, src0->type, src1_q8_1.get(), ids_d, dst_d, ne00,
ne01, ncols_dst, s01, stride_col_y, stride_col_dst,
ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst,
ne03, ne3, s03, s13, s3, stream);
ne03, ne3, s03, s13, s3, stream);
}

void ggml_cuda_op_mul_mat_vec_q(
@@ -723,9 +578,8 @@ void ggml_cuda_op_mul_mat_vec_q(
const int stride_row_x = ne00 / ggml_blck_size(src0->type);
const int stride_col_y = src1_padded_row_size / QK8_1;

ggml_cuda_mm_fusion_args_device fusion_local{};
mul_mat_vec_q_switch_type(
src0_dd_i, src0->type, src1_ddq_i, nullptr, fusion_local, dst_dd_i, ne00, row_diff, src1_ncols, stride_row_x, stride_col_y, nrows_dst,
src0_dd_i, src0->type, src1_ddq_i, nullptr, dst_dd_i, ne00, row_diff, src1_ncols, stride_row_x, stride_col_y, nrows_dst,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, stream);

GGML_UNUSED_VARS(src1, dst, src1_ddf_i, src1_ncols, src1_padded_row_size);

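For context, a hypothetical call site populating the host-side fusion struct might look as follows; the surrounding tensor variables are illustrative, only the field names and the final signature come from this diff:

// Sketch: fuse an up-projection, its gate and both biases into one launch.
ggml_cuda_mm_fusion_args_host fusion{};
fusion.gate      = gate_tensor;      // must match src0's quant type and strides
fusion.x_bias    = up_bias_tensor;   // F32, ne[0] == dst->ne[0]
fusion.gate_bias = gate_bias_tensor; // F32, ne[0] == dst->ne[0]
fusion.glu_op    = GGML_GLU_OP_SWIGLU;
ggml_cuda_mul_mat_vec_q(ctx, src0, src1, /*ids=*/nullptr, dst, &fusion);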
@@ -3,7 +3,7 @@
#define MMVQ_MAX_BATCH_SIZE 8 // Max. batch size for which to use MMVQ kernels.

void ggml_cuda_mul_mat_vec_q(ggml_backend_cuda_context & ctx,
const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst, const ggml_cuda_mm_fusion_args_host * fusion = nullptr);
const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst);

void ggml_cuda_op_mul_mat_vec_q(
ggml_backend_cuda_context & ctx,

@@ -1,168 +0,0 @@
#include "moe-expert-reduce.cuh"

// This kernel is a fusion of the expert weight reduce, common in MoE models

template <int n_expert_used_template>
__global__ void moe_expert_reduce_cuda(const float * __restrict__ experts,
const float * __restrict__ weights,
float * __restrict__ dst,
const int n_expert_used,
const int n_cols) {
const int row = blockIdx.x;
const int col = blockIdx.y * blockDim.x + threadIdx.x;
if (col >= n_cols) {
return;
}

experts += row * n_cols * n_expert_used;
weights += row * n_expert_used;
dst += row * n_cols;

float acc = 0.f;
if constexpr (n_expert_used_template == 0) {
for (int expert = 0; expert < n_expert_used; ++expert) {
ggml_cuda_mad(acc, experts[col], weights[expert]);
experts += n_cols;
}
dst[col] = acc;
} else {
#pragma unroll
for (int i = 0; i < n_expert_used_template; ++i) {
ggml_cuda_mad(acc, experts[col], weights[i]);
experts += n_cols;
}
dst[col] = acc;
}
}

static void launch_moe_expert_reduce(ggml_backend_cuda_context & ctx,
const float * experts,
const float * weights,
float * dst,
const int n_expert_used,
const int n_cols,
const int n_rows) {
const int block_size = 32;

const int n_blocks_x = n_rows;
const int n_blocks_y = (n_cols + block_size - 1) / block_size;

dim3 block_dims(block_size);
dim3 grid_dims(n_blocks_x, n_blocks_y);

cudaStream_t stream = ctx.stream();
switch (n_expert_used) {
case 1:
moe_expert_reduce_cuda<1>
<<<grid_dims, block_dims, 0, stream>>>(experts, weights, dst, n_expert_used, n_cols);
break;
case 2:
moe_expert_reduce_cuda<2>
<<<grid_dims, block_dims, 0, stream>>>(experts, weights, dst, n_expert_used, n_cols);
break;
case 4:
moe_expert_reduce_cuda<4>
<<<grid_dims, block_dims, 0, stream>>>(experts, weights, dst, n_expert_used, n_cols);
break;
case 6:
moe_expert_reduce_cuda<6>
<<<grid_dims, block_dims, 0, stream>>>(experts, weights, dst, n_expert_used, n_cols);
break;
case 8:
moe_expert_reduce_cuda<8>
<<<grid_dims, block_dims, 0, stream>>>(experts, weights, dst, n_expert_used, n_cols);
break;
case 16:
moe_expert_reduce_cuda<16>
<<<grid_dims, block_dims, 0, stream>>>(experts, weights, dst, n_expert_used, n_cols);
break;
case 32:
moe_expert_reduce_cuda<32>
<<<grid_dims, block_dims, 0, stream>>>(experts, weights, dst, n_expert_used, n_cols);
break;
case 64:
moe_expert_reduce_cuda<64>
<<<grid_dims, block_dims, 0, stream>>>(experts, weights, dst, n_expert_used, n_cols);
break;
case 128:
moe_expert_reduce_cuda<128>
<<<grid_dims, block_dims, 0, stream>>>(experts, weights, dst, n_expert_used, n_cols);
break;
default:
moe_expert_reduce_cuda<0>
<<<grid_dims, block_dims, 0, stream>>>(experts, weights, dst, n_expert_used, n_cols);
break;
}
}

bool ggml_cuda_should_use_moe_expert_reduce(const ggml_cgraph * cgraph, int start_index, int end_index) {
const ggml_tensor * mul = cgraph->nodes[start_index];

if (mul->op != GGML_OP_MUL || !ggml_is_contiguous(mul->src[0]) || !ggml_is_contiguous(mul->src[1])) {
return false;
}

int current_node = start_index + 1;
size_t current_offset = 0;

std::vector<const ggml_tensor *> view_nodes;
// check if all are views of the expert in increasing order
while (current_node < end_index && cgraph->nodes[current_node]->op == GGML_OP_VIEW) {
const ggml_tensor * node = cgraph->nodes[current_node];
if (node->view_src != mul) {
return false;
}
if (node->view_offs < current_offset) {
return false;
}
current_offset = node->view_offs;
current_node++;
view_nodes.push_back(node);
}

// check if all the adds are in increasing order
const ggml_tensor * prev_add_src = view_nodes.empty() ? nullptr : view_nodes[0];
int num_adds = 0;
int num_views = view_nodes.size();
while (current_node < end_index && cgraph->nodes[current_node]->op == GGML_OP_ADD) {
const ggml_tensor * add_node = cgraph->nodes[current_node];

bool is_first_op_ok = num_views > num_adds ? add_node->src[0] == prev_add_src : false;
bool is_second_op_ok = num_views > num_adds ? add_node->src[1] == view_nodes[num_adds + 1] : false;

if (!is_first_op_ok || !is_second_op_ok) {
return false;
}
prev_add_src = add_node;

num_adds++;
current_node++;
}

if (num_views != num_adds + 1) {
return false;
}

return true;
}

void ggml_cuda_op_moe_expert_reduce(ggml_backend_cuda_context & ctx,
const ggml_tensor * experts,
const ggml_tensor * weights,
ggml_tensor * dst) {
const int n_rows = experts->ne[2];
const int n_expert_used = experts->ne[1];
const int n_cols = experts->ne[0];

GGML_ASSERT(experts->type == GGML_TYPE_F32);
GGML_ASSERT(weights->type == GGML_TYPE_F32);
GGML_ASSERT(ggml_is_contiguous(experts));
GGML_ASSERT(ggml_is_contiguous(weights));
GGML_ASSERT(dst->type == GGML_TYPE_F32);

const float * experts_d = (const float *) experts->data;
const float * weights_d = (const float *) weights->data;
float * dst_d = (float *) dst->data;

launch_moe_expert_reduce(ctx, experts_d, weights_d, dst_d, n_expert_used, n_cols, n_rows);
}
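The deleted kernel computes, per token row, a weighted sum of the selected experts' activations. A plain C++ reference of the same reduction (illustrative only, not part of ggml), matching the kernel's row-major indexing:

#include <vector>
// dst[row][col] = sum over e of weights[row][e] * experts[row][e][col]
static void moe_expert_reduce_ref(const std::vector<float> & experts, // [n_rows][n_expert_used][n_cols]
                                  const std::vector<float> & weights, // [n_rows][n_expert_used]
                                  std::vector<float> & dst,           // [n_rows][n_cols]
                                  int n_expert_used, int n_cols, int n_rows) {
    for (int row = 0; row < n_rows; ++row) {
        for (int col = 0; col < n_cols; ++col) {
            float acc = 0.0f;
            for (int e = 0; e < n_expert_used; ++e) {
                acc += experts[(row*n_expert_used + e)*n_cols + col] * weights[row*n_expert_used + e];
            }
            dst[row*n_cols + col] = acc;
        }
    }
}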
@@ -1,11 +0,0 @@
#include "common.cuh"
#include "ggml.h"

#include <initializer_list>

void ggml_cuda_op_moe_expert_reduce(ggml_backend_cuda_context & ctx,
const ggml_tensor * experts,
const ggml_tensor * weights,
ggml_tensor * dst);

bool ggml_cuda_should_use_moe_expert_reduce(const ggml_cgraph * cgraph, int start_index, int end_index);
@@ -125,7 +125,7 @@ template<bool forward, bool has_ff, typename T>
static __global__ void rope_multi(
const T * x, T * dst, const int ne0, const int ne1, const int ne2, const int s1, const int s2,
const int n_dims, const int32_t * pos, const float freq_scale, const float ext_factor, const float attn_factor,
const rope_corr_dims corr_dims, const float theta_scale, const float * freq_factors, const mrope_sections sections, const bool is_imrope) {
const rope_corr_dims corr_dims, const float theta_scale, const float * freq_factors, const mrope_sections sections) {
const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y);

if (i0 >= ne0) {
@@ -152,29 +152,17 @@ static __global__ void rope_multi(
const int sector = (i0 / 2) % sect_dims;

float theta_base = 0.0;
if (is_imrope) {
if (sector % 3 == 1 && sector < 3 * sections.v[1]) { // h
theta_base = pos[channel_x + ne2 * 1]*powf(theta_scale, i0/2.0f);
} else if (sector % 3 == 2 && sector < 3 * sections.v[2]) { // w
theta_base = pos[channel_x + ne2 * 2]*powf(theta_scale, i0/2.0f);
} else if (sector % 3 == 0 && sector < 3 * sections.v[0]) { // t
theta_base = pos[channel_x]*powf(theta_scale, i0/2.0f);
} else {
theta_base = pos[channel_x + ne2 * 3]*powf(theta_scale, i0/2.0f);
}
} else {
if (sector < sections.v[0]) {
theta_base = pos[channel_x]*powf(theta_scale, i0/2.0f);
}
else if (sector >= sections.v[0] && sector < sec_w) {
theta_base = pos[channel_x + ne2 * 1]*powf(theta_scale, i0/2.0f);
}
else if (sector >= sec_w && sector < sec_w + sections.v[2]) {
theta_base = pos[channel_x + ne2 * 2]*powf(theta_scale, i0/2.0f);
}
else if (sector >= sec_w + sections.v[2]) {
theta_base = pos[channel_x + ne2 * 3]*powf(theta_scale, i0/2.0f);
}
if (sector < sections.v[0]) {
theta_base = pos[channel_x]*powf(theta_scale, i0/2.0f);
}
else if (sector >= sections.v[0] && sector < sec_w) {
theta_base = pos[channel_x + ne2 * 1]*powf(theta_scale, i0/2.0f);
}
else if (sector >= sec_w && sector < sec_w + sections.v[2]) {
theta_base = pos[channel_x + ne2 * 2]*powf(theta_scale, i0/2.0f);
}
else if (sector >= sec_w + sections.v[2]) {
theta_base = pos[channel_x + ne2 * 3]*powf(theta_scale, i0/2.0f);
}

const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;
@@ -288,7 +276,7 @@ template<bool forward, typename T>
static void rope_multi_cuda(
const T * x, T * dst, const int ne0, const int ne1, const int ne2, const int s1, const int s2, const int n_dims, const int nr,
const int32_t * pos, const float freq_scale, const float freq_base, const float ext_factor, const float attn_factor,
const rope_corr_dims corr_dims, const float * freq_factors, const mrope_sections sections, const bool is_imrope, cudaStream_t stream) {
const rope_corr_dims corr_dims, const float * freq_factors, const mrope_sections sections, cudaStream_t stream) {
GGML_ASSERT(ne0 % 2 == 0);
const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
@@ -299,11 +287,11 @@ static void rope_multi_cuda(
if (freq_factors == nullptr) {
rope_multi<forward, false, T><<<block_nums, block_dims, 0, stream>>>(
x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor,
attn_factor, corr_dims, theta_scale, freq_factors, sections, is_imrope);
attn_factor, corr_dims, theta_scale, freq_factors, sections);
} else {
rope_multi<forward, true, T><<<block_nums, block_dims, 0, stream>>>(
x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor,
attn_factor, corr_dims, theta_scale, freq_factors, sections, is_imrope);
attn_factor, corr_dims, theta_scale, freq_factors, sections);
}
}

@@ -381,7 +369,6 @@ void ggml_cuda_op_rope_impl(ggml_backend_cuda_context & ctx, ggml_tensor * dst)

const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;
const bool is_imrope = mode == GGML_ROPE_TYPE_IMROPE;
const bool is_vision = mode == GGML_ROPE_TYPE_VISION;

if (is_mrope) {
@@ -419,11 +406,11 @@ void ggml_cuda_op_rope_impl(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
if (src0->type == GGML_TYPE_F32) {
rope_multi_cuda<forward>(
(const float *) src0_d, (float *) dst_d, ne00, ne01, ne02, s01, s02, n_dims, nr, pos, freq_scale,
freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, is_imrope, stream);
freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream);
} else if (src0->type == GGML_TYPE_F16) {
rope_multi_cuda<forward>(
(const half *) src0_d, (half *) dst_d, ne00, ne01, ne02, s01, s02, n_dims, nr, pos, freq_scale,
freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, is_imrope, stream);
freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, stream);
} else {
GGML_ABORT("fatal error");
}

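The removed is_imrope branch implements interleaved multimodal RoPE: consecutive rotary sectors cycle through the t/h/w position streams round-robin rather than occupying contiguous [t | h | w | extra] blocks as in the surviving else-branch. Isolating just the stream-selection logic from the deleted code gives this standalone sketch (illustrative names):

// Map a rotary "sector" to a position stream, mirroring the removed
// interleaved-mrope branch. Streams 0..3 = t/h/w/extra; sections[] holds
// the per-stream dimension counts.
static int imrope_stream(int sector, const int sections[4]) {
    if (sector % 3 == 1 && sector < 3*sections[1]) return 1; // h
    if (sector % 3 == 2 && sector < 3*sections[2]) return 2; // w
    if (sector % 3 == 0 && sector < 3*sections[0]) return 0; // t
    return 3;                                                // extra
}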
@@ -4,53 +4,30 @@
typedef void (*set_rows_kernel_t)(const char * src, char * dst);

// Generic quantized set_rows kernel template
template <typename idx_t, typename block_type, int qk, void (*quantize_func)(const float *, block_type *)>
static __global__ void k_set_rows_quant(const float * __restrict__ src0,
const idx_t * __restrict__ src1,
block_type * __restrict__ dst,
const int64_t ne_total,
const int64_t ne10,
const int64_t ne11,
const int64_t ne12,
const int64_t ne13,
const int64_t s01,
const int64_t s02,
const int64_t s03,
const int64_t s10,
const int64_t s11,
const int64_t s12,
const int64_t s1,
const int64_t s2,
const int64_t s3,
const uint3 ne00,
const uint3 ne01,
const uint3 ne02,
const uint3 ne11_fd,
const uint3 ne12_fd) {
template<typename idx_t, typename block_type, int qk, void (*quantize_func)(const float*, block_type*)>
static __global__ void k_set_rows_quant(
const float * __restrict__ src0, const idx_t * __restrict__ src1, block_type * __restrict__ dst,
const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t ne13,
const int64_t s01, const int64_t s02, const int64_t s03,
const int64_t s10, const int64_t s11, const int64_t s12,
const int64_t s1, const int64_t s2, const int64_t s3) {

const int64_t i = int64_t(blockDim.x) * blockIdx.x + threadIdx.x;
const int64_t ne_total = (ne00 * ne01 * ne02 * ne03) / qk;

if (i >= ne_total) {
return;
}

const int64_t i_base = i * qk;
uint32_t tmp = (uint32_t) i_base;
uint2 div_mod;
const int64_t i03 = i_base / (ne00 * ne01 * ne02);
const int64_t i02 = (i_base - i03 * ne00 * ne01 * ne02) / (ne00 * ne01);
const int64_t i01 = (i_base - i03 * ne00 * ne01 * ne02 - i02 * ne00 * ne01) / ne00;
const int64_t i00 = i_base - i03 * ne00 * ne01 * ne02 - i02 * ne00 * ne01 - i01 * ne00;

div_mod = fast_div_modulo(tmp, ne00);
const int64_t i00 = div_mod.y;
tmp = div_mod.x;

div_mod = fast_div_modulo(tmp, ne01);
const int64_t i01 = div_mod.y;
tmp = div_mod.x;

div_mod = fast_div_modulo(tmp, ne02);
const int64_t i02 = div_mod.y;
const int64_t i03 = div_mod.x;

const int64_t i12 = fastmodulo((uint32_t) i03, ne12_fd);
const int64_t i11 = fastmodulo((uint32_t) i02, ne11_fd);
const int64_t i12 = i03 % ne12;
const int64_t i11 = i02 % ne11;
const int64_t i10 = i01;

const int64_t dst_row = *(src1 + i10*s10 + i11*s11 + i12*s12);
@@ -64,8 +41,6 @@ static __global__ void k_set_rows_quant(const float * __restrict__ src0,
quantize_func(src_block, dst_block);

GGML_UNUSED(ne10);
GGML_UNUSED(ne11);
GGML_UNUSED(ne12);
GGML_UNUSED(ne13);
}

@@ -96,65 +71,40 @@ static void set_rows_cuda_quant(
const int64_t s2 = nb2;
const int64_t s3 = nb3;

if (ne_total > 0 && ne00 > 0 && ne01 > 0 && ne02 > 0 && ne11 > 0 && ne12 > 0) {
const uint3 ne00_fd = init_fastdiv_values((uint32_t) ne00);
const uint3 ne01_fd = init_fastdiv_values((uint32_t) ne01);
const uint3 ne02_fd = init_fastdiv_values((uint32_t) ne02);
const uint3 ne11_fd = init_fastdiv_values((uint32_t) ne11);
const uint3 ne12_fd = init_fastdiv_values((uint32_t) ne12);

if (ne_total > 0) {
k_set_rows_quant<idx_t, block_type, qk, quantize_func><<<grid_size, block_size, 0, stream>>>(
src0_d, src1_d, dst_d, ne_total, ne10, ne11, ne12, ne13, s01, s02, s03, s10, s11, s12, s1, s2, s3, ne00_fd,
ne01_fd, ne02_fd, ne11_fd, ne12_fd);
src0_d, src1_d, dst_d,
ne00, ne01, ne02, ne03,
ne10, ne11, ne12, ne13,
s01, s02, s03,
s10, s11, s12,
s1, s2, s3);
}
}

template <typename src_t, typename idx_t, typename dst_t>
static __global__ void k_set_rows(const src_t * __restrict__ src0,
const idx_t * __restrict__ src1,
dst_t * __restrict__ dst,
const int64_t ne_total,
const int64_t ne10,
const int64_t ne11,
const int64_t ne12,
const int64_t ne13,
const int64_t s01,
const int64_t s02,
const int64_t s03,
const int64_t s10,
const int64_t s11,
const int64_t s12,
const int64_t s1,
const int64_t s2,
const int64_t s3,
const uint3 ne00,
const uint3 ne01,
const uint3 ne02,
const uint3 ne11_fd,
const uint3 ne12_fd) {
template<typename src_t, typename idx_t, typename dst_t>
static __global__ void k_set_rows(
const src_t * __restrict__ src0, const idx_t * __restrict__ src1, dst_t * __restrict__ dst,
const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t ne13,
const int64_t s01, const int64_t s02, const int64_t s03,
const int64_t s10, const int64_t s11, const int64_t s12,
const int64_t s1, const int64_t s2, const int64_t s3) {

const int64_t i = int64_t(blockDim.x) * blockIdx.x + threadIdx.x;
const int64_t ne_total = ne00 * ne01 * ne02 * ne03;

if (i >= ne_total) {
return;
}

uint32_t tmp = (uint32_t) i;
uint2 div_mod;
const int64_t i03 = i / (ne00 * ne01 * ne02);
const int64_t i02 = (i - i03 * ne00 * ne01 * ne02) / (ne00 * ne01);
const int64_t i01 = (i - i03 * ne00 * ne01 * ne02 - i02 * ne00 * ne01) / ne00;
const int64_t i00 = i - i03 * ne00 * ne01 * ne02 - i02 * ne00 * ne01 - i01 * ne00;

div_mod = fast_div_modulo(tmp, ne00);
const int64_t i00 = div_mod.y;
tmp = div_mod.x;

div_mod = fast_div_modulo(tmp, ne01);
const int64_t i01 = div_mod.y;
tmp = div_mod.x;

div_mod = fast_div_modulo(tmp, ne02);
const int64_t i02 = div_mod.y;
const int64_t i03 = div_mod.x;

const int64_t i12 = fastmodulo((uint32_t) i03, ne12_fd);
const int64_t i11 = fastmodulo((uint32_t) i02, ne11_fd);
const int64_t i12 = i03 % ne12;
const int64_t i11 = i02 % ne11;
const int64_t i10 = i01;

const int64_t dst_row = *(src1 + i10*s10 + i11*s11 + i12*s12);
@@ -165,8 +115,6 @@ static __global__ void k_set_rows(const src_t * __restrict__ src0,
dst_row_ptr[i00] = ggml_cuda_cast<dst_t>(src0_row[i00]);

GGML_UNUSED(ne10);
GGML_UNUSED(ne11);
GGML_UNUSED(ne12);
GGML_UNUSED(ne13);
}

@@ -196,16 +144,14 @@ static void set_rows_cuda(
const int64_t s2 = nb2/sizeof(dst_t);
const int64_t s3 = nb3/sizeof(dst_t);

if (ne_total > 0 && ne00 > 0 && ne01 > 0 && ne02 > 0 && ne11 > 0 && ne12 > 0) {
const uint3 ne00_fd = init_fastdiv_values((uint32_t) ne00);
const uint3 ne01_fd = init_fastdiv_values((uint32_t) ne01);
const uint3 ne02_fd = init_fastdiv_values((uint32_t) ne02);
const uint3 ne11_fd = init_fastdiv_values((uint32_t) ne11);
const uint3 ne12_fd = init_fastdiv_values((uint32_t) ne12);

k_set_rows<<<grid_size, block_size, 0, stream>>>(src0_d, src1_d, dst_d, ne_total, ne10, ne11, ne12, ne13, s01,
s02, s03, s10, s11, s12, s1, s2, s3, ne00_fd, ne01_fd, ne02_fd,
ne11_fd, ne12_fd);
if (ne_total > 0) {
k_set_rows<<<grid_size, block_size, 0, stream>>>(
src0_d, src1_d, dst_d,
ne00, ne01, ne02, ne03,
ne10, ne11, ne12, ne13,
s01, s02, s03,
s10, s11, s12,
s1, s2, s3);
}
}

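The fastdiv variants above keep the index math identical; they only replace 64-bit hardware division and modulo with a precomputed multiply-and-shift per divisor (the uint3 values built by init_fastdiv_values). A host-side reference of the decomposition itself, useful for checking the kernels against (illustrative only):

#include <cstdint>
// Split a flat element index into (i00, i01, i02, i03); the fastdiv kernels
// compute the same quotients/remainders via fast_div_modulo.
static void decompose_index(int64_t i, int64_t ne00, int64_t ne01, int64_t ne02,
                            int64_t & i00, int64_t & i01, int64_t & i02, int64_t & i03) {
    i00 = i % ne00; i /= ne00;
    i01 = i % ne01; i /= ne01;
    i02 = i % ne02;
    i03 = i / ne02;
}

The broadcast indices then follow as i11 = i02 % ne11 and i12 = i03 % ne12, which the new kernels compute with fastmodulo over the same precomputed constants.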
@@ -1,39 +0,0 @@
#include "set.cuh"
#include "cpy.cuh"

void ggml_cuda_op_set(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const ggml_tensor * src1 = dst->src[1];

GGML_ASSERT((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_I32));
GGML_ASSERT(src1->type == src0->type);
GGML_ASSERT(dst ->type == src0->type);

GGML_ASSERT(ggml_is_contiguous(dst));
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(ggml_is_contiguous(src1));

const size_t nb1 = ((int32_t *) dst->op_params)[0];
const size_t nb2 = ((int32_t *) dst->op_params)[1];
const size_t nb3 = ((int32_t *) dst->op_params)[2];
const size_t offset = ((int32_t *) dst->op_params)[3];
const bool inplace = (bool) ((int32_t *) dst->op_params)[4];

if (!inplace) {
ggml_cuda_cpy(ctx, src0, dst);
}

ggml_tensor dst_view = *dst;
dst_view.data = (void *)((char *)dst->data + offset);
dst_view.ne[0] = src1->ne[0];
dst_view.ne[1] = src1->ne[1];
dst_view.ne[2] = src1->ne[2];
dst_view.ne[3] = src1->ne[3];

dst_view.nb[0] = ggml_element_size(dst);
dst_view.nb[1] = nb1;
dst_view.nb[2] = nb2;
dst_view.nb[3] = nb3;

ggml_cuda_cpy(ctx, src1, &dst_view);
}
@@ -1,7 +0,0 @@
#pragma once

#include "common.cuh"

#define CUDA_SET_BLOCK_SIZE 256

void ggml_cuda_op_set(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
@@ -1,5 +0,0 @@
// This file has been autogenerated by generate_cu_files.py, do not edit manually.

#include "../fattn-tile.cuh"

DECL_FATTN_TILE_CASE(72, 72);
@@ -3,7 +3,7 @@
from glob import glob
import os

HEAD_SIZES_KQ = [40, 64, 72, 80, 96, 112, 128, 256, 576]
HEAD_SIZES_KQ = [40, 64, 80, 96, 112, 128, 256, 576]

TYPES_KV = ["GGML_TYPE_F16", "GGML_TYPE_Q4_0", "GGML_TYPE_Q4_1", "GGML_TYPE_Q5_0", "GGML_TYPE_Q5_1", "GGML_TYPE_Q8_0"]

@@ -81,8 +81,6 @@ for ncols in [8, 16, 32, 64]:
for head_size_kq in HEAD_SIZES_KQ:
if head_size_kq == 40:
continue
if head_size_kq == 72:
continue
if head_size_kq != 576 and ncols2 == 16:
continue
if head_size_kq == 576 and ncols2 != 16:

@@ -2,7 +2,6 @@
#include "ggml.h"
#include "topk-moe.cuh"

#include <cmath>
#include <initializer_list>

// Warp-local softmax used for both the pre-top-k logits and the post-top-k delayed path.
@@ -64,8 +63,7 @@ __launch_bounds__(4 * WARP_SIZE, 1) __global__ void topk_moe_cuda(const float *
float * weights,
int32_t * ids,
const int n_rows,
const int n_expert_used,
const float clamp_val) {
const int n_expert_used) {
const int row = blockIdx.x * blockDim.y + threadIdx.y;
if (row >= n_rows) {
return;
@@ -141,7 +139,6 @@ __launch_bounds__(4 * WARP_SIZE, 1) __global__ void topk_moe_cuda(const float *

if constexpr (with_norm) {
wt_sum = warp_reduce_sum(wt_sum);
wt_sum = max(wt_sum, clamp_val);
const float inv_sum = 1.0f / wt_sum;

for (int i = 0; i < experts_per_thread; i++) {
@@ -160,10 +157,6 @@ __launch_bounds__(4 * WARP_SIZE, 1) __global__ void topk_moe_cuda(const float *
weights[idx] = output_weights[i];
}
}

if (!with_norm) {
GGML_UNUSED(clamp_val);
}
}

template <bool with_norm, bool delayed_softmax = false>
@@ -173,9 +166,9 @@ static void launch_topk_moe_cuda(ggml_backend_cuda_context & ctx,
int32_t * ids,
const int n_rows,
const int n_expert,
const int n_expert_used,
const float clamp_val) {
const int n_expert_used) {
static_assert(!(with_norm && delayed_softmax), "delayed softmax is not supported with weight normalization");

const int rows_per_block = 4;
dim3 grid_dims((n_rows + rows_per_block - 1) / rows_per_block, 1, 1);
dim3 block_dims(WARP_SIZE, rows_per_block, 1);
@@ -184,43 +177,43 @@ static void launch_topk_moe_cuda(ggml_backend_cuda_context & ctx,
switch (n_expert) {
case 1:
topk_moe_cuda<1, with_norm, delayed_softmax>
<<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used, clamp_val);
<<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used);
break;
case 2:
topk_moe_cuda<2, with_norm, delayed_softmax>
<<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used, clamp_val);
<<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used);
break;
case 4:
topk_moe_cuda<4, with_norm, delayed_softmax>
<<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used, clamp_val);
<<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used);
break;
case 8:
topk_moe_cuda<8, with_norm, delayed_softmax>
<<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used, clamp_val);
<<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used);
break;
case 16:
topk_moe_cuda<16, with_norm, delayed_softmax>
<<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used, clamp_val);
<<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used);
break;
case 32:
topk_moe_cuda<32, with_norm, delayed_softmax>
<<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used, clamp_val);
<<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used);
break;
case 64:
topk_moe_cuda<64, with_norm, delayed_softmax>
<<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used, clamp_val);
<<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used);
break;
case 128:
topk_moe_cuda<128, with_norm, delayed_softmax>
<<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used, clamp_val);
<<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used);
break;
case 256:
topk_moe_cuda<256, with_norm, delayed_softmax>
<<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used, clamp_val);
<<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used);
break;
case 512:
topk_moe_cuda<512, with_norm, delayed_softmax>
<<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used, clamp_val);
<<<grid_dims, block_dims, 0, stream>>>(logits, weights, ids, n_rows, n_expert_used);
break;
default:
GGML_ASSERT(false && "fatal error");
@@ -233,8 +226,7 @@ void ggml_cuda_op_topk_moe(ggml_backend_cuda_context & ctx,
ggml_tensor * weights,
ggml_tensor * ids,
const bool with_norm,
const bool delayed_softmax,
ggml_tensor * clamp) {
const bool delayed_softmax) {
GGML_ASSERT(logits->type == GGML_TYPE_F32);
GGML_ASSERT(weights->type == GGML_TYPE_F32);
GGML_ASSERT(ids->type == GGML_TYPE_I32);
@@ -250,25 +242,18 @@ void ggml_cuda_op_topk_moe(ggml_backend_cuda_context & ctx,

const int n_expert_used = weights->ne[1];

float clamp_val = -INFINITY;
if (with_norm) {
if (clamp) {
clamp_val = ggml_get_op_params_f32(clamp, 0);
}
launch_topk_moe_cuda<true>(ctx, logits_d, weights_d, ids_d, n_rows, n_experts, n_expert_used, clamp_val);
launch_topk_moe_cuda<true>(ctx, logits_d, weights_d, ids_d, n_rows, n_experts, n_expert_used);
} else {
GGML_ASSERT(clamp == nullptr);
if (delayed_softmax) {
launch_topk_moe_cuda<false, true>(ctx, logits_d, weights_d, ids_d, n_rows, n_experts, n_expert_used,
clamp_val);
launch_topk_moe_cuda<false, true>(ctx, logits_d, weights_d, ids_d, n_rows, n_experts, n_expert_used);
} else {
launch_topk_moe_cuda<false, false>(ctx, logits_d, weights_d, ids_d, n_rows, n_experts, n_expert_used,
clamp_val);
launch_topk_moe_cuda<false, false>(ctx, logits_d, weights_d, ids_d, n_rows, n_experts, n_expert_used);
}
}
}

bool ggml_cuda_should_use_topk_moe(const ggml_tensor * softmax, const ggml_tensor * weights, const ggml_tensor * clamp) {
bool ggml_cuda_should_use_topk_moe(const ggml_tensor * softmax, const ggml_tensor * weights) {
float scale = 1.0f;
float max_bias = 0.0f;

@@ -294,26 +279,13 @@ bool ggml_cuda_should_use_topk_moe(const ggml_tensor * softmax, const ggml_tenso
return false;
}

if (clamp) {
if (clamp->op != GGML_OP_CLAMP) {
return false;
}
float max_val = ggml_get_op_params_f32(clamp, 1);

if (max_val != INFINITY) {
return false;
}
}

return true;
}

std::initializer_list<enum ggml_op> ggml_cuda_topk_moe_ops(bool norm, bool delayed_softmax) {
static std::initializer_list<enum ggml_op> norm_ops = { GGML_OP_SOFT_MAX, GGML_OP_RESHAPE, GGML_OP_ARGSORT,
GGML_OP_VIEW, GGML_OP_GET_ROWS, GGML_OP_RESHAPE,
GGML_OP_SUM_ROWS, GGML_OP_CLAMP, GGML_OP_DIV,
GGML_OP_RESHAPE };
GGML_OP_SUM_ROWS, GGML_OP_DIV, GGML_OP_RESHAPE };

static std::initializer_list<enum ggml_op> no_norm_ops = { GGML_OP_SOFT_MAX, GGML_OP_RESHAPE, GGML_OP_ARGSORT,
GGML_OP_VIEW, GGML_OP_GET_ROWS };

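The clamp being removed here folded the graph's GGML_OP_CLAMP on the summed top-k weights into the kernel, bounding the sum from below before normalization. A scalar sketch of that per-row step (illustrative, mirroring the wt_sum/inv_sum lines above):

#include <algorithm>
// Normalize the selected expert weights, applying the fused clamp's lower
// bound to the sum so the division cannot blow up on tiny sums.
static void normalize_topk_weights(float * w, int n_expert_used, float clamp_min) {
    float wt_sum = 0.0f;
    for (int i = 0; i < n_expert_used; ++i) {
        wt_sum += w[i];
    }
    wt_sum = std::max(wt_sum, clamp_min); // fused GGML_OP_CLAMP lower bound
    const float inv_sum = 1.0f / wt_sum;
    for (int i = 0; i < n_expert_used; ++i) {
        w[i] *= inv_sum;
    }
}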
@@ -8,9 +8,8 @@ void ggml_cuda_op_topk_moe(ggml_backend_cuda_context & ctx,
ggml_tensor * weights,
ggml_tensor * ids,
const bool with_norm,
const bool delayed_softmax = false,
ggml_tensor * weight_clamp = nullptr);
const bool delayed_softmax = false);

bool ggml_cuda_should_use_topk_moe(const ggml_tensor * softmax, const ggml_tensor * weights, const ggml_tensor * clamp = nullptr);
bool ggml_cuda_should_use_topk_moe(const ggml_tensor * softmax, const ggml_tensor * weights);

std::initializer_list<enum ggml_op> ggml_cuda_topk_moe_ops(bool with_norm, bool delayed_softmax = false);

@@ -18,7 +18,10 @@ static __device__ __forceinline__ float op_step(float x) {
}

static __device__ __forceinline__ float op_gelu(float x) {
    return ggml_cuda_op_gelu_single(x);
    const float GELU_COEF_A    = 0.044715f;
    const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;

    return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
}
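// Quick sanity value for the tanh approximation above:
//   op_gelu(1.0f) = 0.5f*(1.0f + tanhf(0.79788456f*(1.0f + 0.044715f))) ≈ 0.8412f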

static __device__ __forceinline__ float op_gelu_erf(float x) {
@@ -34,7 +37,7 @@ static __device__ __forceinline__ float op_gelu_quick(float x) {
}

static __device__ __forceinline__ float op_silu(float x) {
    return ggml_cuda_op_silu_single(x);
    return x / (1.0f + expf(-x));
}
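// Quick sanity value: op_silu(1.0f) = 1.0f / (1.0f + expf(-1.0f)) ≈ 0.7311f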

static __device__ __forceinline__ float op_tanh(float x) {
@@ -85,22 +88,6 @@ static __device__ __forceinline__ float op_elu(float x) {
    return (x > 0.f) ? x : expm1f(x);
}

static __device__ __forceinline__ float op_floor(float x) {
    return floorf(x);
}

static __device__ __forceinline__ float op_ceil(float x) {
    return ceilf(x);
}

static __device__ __forceinline__ float op_round(float x) {
    return round(x);
}

static __device__ __forceinline__ float op_trunc(float x) {
    return trunc(x);
}

template <float (*op)(float), typename T>
static __global__ void unary_op_kernel(const T * x, T * dst, const int k) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;
@@ -217,22 +204,6 @@ void ggml_cuda_op_log(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
void ggml_cuda_op_elu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_unary<op_elu>(ctx, dst);
}

void ggml_cuda_op_floor(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_unary<op_floor>(ctx, dst);
}

void ggml_cuda_op_ceil(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_unary<op_ceil>(ctx, dst);
}

void ggml_cuda_op_round(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_unary<op_round>(ctx, dst);
}

void ggml_cuda_op_trunc(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_unary<op_trunc>(ctx, dst);
}
/* gated ops */

template <float (*op)(float), typename T>
@@ -346,8 +317,13 @@ static __global__ void swiglu_oai_kernel(const T * x, const T * g, T * dst, cons

    float xi = x[j0];
    float gi = g[j1];
    xi = fminf(xi, limit);
    gi = fmaxf(fminf(gi, limit), -limit);

    dst[i] = ggml_cuda_op_swiglu_oai_single(xi, gi, alpha, limit);
    float out_glu = xi / (1.0f + expf(-xi * alpha));
    out_glu = out_glu * (1.0f + gi);

    dst[i] = out_glu;
}

template <typename T>

@@ -1,4 +1,3 @@
#pragma once
#include "common.cuh"

#define CUDA_NEG_BLOCK_SIZE 256
@@ -63,14 +62,6 @@ void ggml_cuda_op_log(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

void ggml_cuda_op_elu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

void ggml_cuda_op_floor(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

void ggml_cuda_op_ceil(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

void ggml_cuda_op_round(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

void ggml_cuda_op_trunc(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

void ggml_cuda_op_reglu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

void ggml_cuda_op_geglu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
@@ -84,23 +75,3 @@ void ggml_cuda_op_geglu_erf(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void ggml_cuda_op_geglu_quick(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

void ggml_cuda_op_xielu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

__device__ __forceinline__ float ggml_cuda_op_silu_single(float x) {
    return x / (1.0f + expf(-x));
}

__device__ __forceinline__ float ggml_cuda_op_gelu_single(float x) {
    const float GELU_COEF_A    = 0.044715f;
    const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;

    return 0.5f * x * (1.0f + tanhf(SQRT_2_OVER_PI * x * (1.0f + GELU_COEF_A * x * x)));
}

__device__ __forceinline__ float ggml_cuda_op_swiglu_oai_single(float x, float g, float alpha = 1.702f, float limit = 7.0f) {
    x = fminf(x, limit);
    g = fmaxf(fminf(g, limit), -limit);

    float out_glu = x / (1.0f + expf(-x * alpha));
    out_glu = out_glu * (1.0f + g);
    return out_glu;
}

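// Quick sanity value for the helper above, using the default alpha/limit:
//   ggml_cuda_op_swiglu_oai_single(1.0f, 1.0f) = (1.0f / (1.0f + expf(-1.702f))) * (1.0f + 1.0f) ≈ 1.69f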
@@ -126,8 +126,8 @@ void ggml_cuda_op_upscale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    } else if (mode == GGML_SCALE_MODE_BILINEAR) {
        float pixel_offset = 0.5f;
        if (mode_flags & GGML_SCALE_FLAG_ALIGN_CORNERS) {
            sf0 = dst->ne[0] > 1 && src0->ne[0] > 1 ? (float)(dst->ne[0] - 1) / (src0->ne[0] - 1) : sf0;
            sf1 = dst->ne[1] > 1 && src0->ne[1] > 1 ? (float)(dst->ne[1] - 1) / (src0->ne[1] - 1) : sf1;
            sf0 = (float)(dst->ne[0] - 1) / (src0->ne[0] - 1);
            sf1 = (float)(dst->ne[1] - 1) / (src0->ne[1] - 1);
            pixel_offset = 0.0f;
        }
        upscale_f32_bilinear_cuda(src0_d, dst_d, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],

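// e.g. upscaling ne[0] from 2 to 4 with align-corners gives sf0 = (4 - 1) / (2 - 1) = 3.0f
// (vs. 4 / 2 = 2.0f for the default half-pixel mapping); the guarded form keeps the
// precomputed sf when either extent is 1, avoiding a division by zero.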
@@ -211,15 +211,12 @@ static inline void hex_format_op_names(char * str, const struct ggml_tensor * t)
// ** backend sessions

struct ggml_hexagon_session {
    ggml_hexagon_session(int dev_id, ggml_backend_dev_t dev) noexcept(false);
    ggml_hexagon_session(int dev_id) noexcept(false);
    ~ggml_hexagon_session() noexcept(true);

    void allocate(int dev_id) noexcept(false);
    void release() noexcept(true);

    void enqueue(struct htp_general_req &req, struct dspqueue_buffer *bufs, uint32_t n_bufs, bool sync = false);
    void flush();

    ggml_backend_buffer_type buffer_type;
    ggml_backend_buffer_type repack_buffer_type;

@@ -240,37 +237,15 @@ struct ggml_hexagon_session {
    uint32_t prof_pkts;
};

void ggml_hexagon_session::enqueue(struct htp_general_req &req, struct dspqueue_buffer *bufs, uint32_t n_bufs, bool sync) {
    // Bump pending flag (cleared in session::flush once we get the response)
    this->op_pending++; // atomic inc

    int err = dspqueue_write(this->queue,
                             0,           // flags - the framework will autoset this
                             n_bufs,      // number of buffers
                             bufs,        // buffer references
                             sizeof(req),
                             (const uint8_t *) &req, // Message
                             1000000      // Timeout
    );

    if (err != 0) {
        GGML_ABORT("ggml-hex: %s dspqueue_write failed: 0x%08x\n", this->name.c_str(), (unsigned) err);
    }

    if (sync) {
        flush();
    }
}

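// Typical call site (a sketch based on the op handlers further below): stage the
// buffer descriptors for an op, then push one request, optionally blocking:
//   struct dspqueue_buffer bufs[3] = { /* weights, activations, output */ };
//   sess->enqueue(req, bufs, 3, opt_opsync);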
// Flush HTP response queue, i.e. wait for all outstanding requests to complete
void ggml_hexagon_session::flush() {
    dspqueue_t q = this->queue;
// Packet callback
static void htp_packet_callback(dspqueue_t queue, AEEResult error, void * context) {
    auto sess = static_cast<ggml_hexagon_session *>(context);

    // Repeatedly read packets from the queue until it's empty. We don't
    // necessarily get a separate callback for each packet, and new packets
    // may arrive while we're processing the previous one.

    while (this->op_pending) {
    while (1) {
        struct htp_general_rsp rsp;
        uint32_t rsp_size;
        uint32_t flags;
@@ -278,23 +253,22 @@ void ggml_hexagon_session::flush() {
        struct dspqueue_buffer bufs[HTP_MAX_PACKET_BUFFERS];
        uint32_t n_bufs;

        // Read response packet from queue
        int err = dspqueue_read(q, &flags,
                                HTP_MAX_PACKET_BUFFERS, // Maximum number of buffer references
                                &n_bufs,                // Number of buffer references
                                bufs,                   // Buffer references
                                sizeof(rsp),            // Max message length
                                &rsp_size,              // Message length
                                (uint8_t *) &rsp,
                                1000000);               // Timeout
        // Read packet from queue
        int err = dspqueue_read_noblock(queue, &flags,
                                        HTP_MAX_PACKET_BUFFERS, // Maximum number of buffer references
                                        &n_bufs,                // Number of buffer references
                                        bufs,                   // Buffer references
                                        sizeof(rsp),            // Max message length
                                        &rsp_size,              // Message length
                                        (uint8_t *) &rsp);

        if (err == AEE_EEXPIRED) {
            // TODO: might need to bail out if the HTP is stuck on something
            continue;
        if (err == AEE_EWOULDBLOCK) {
            // Consumed all packets available for now
            return;
        }

        if (err != 0) {
            GGML_ABORT("ggml-hex: dspqueue_read failed: 0x%08x\n", (unsigned) err);
            GGML_ABORT("ggml-hex: dspqueue_read_noblock failed: 0x%08x\n", (unsigned) err);
        }

        // Basic sanity checks
@@ -307,15 +281,21 @@ void ggml_hexagon_session::flush() {
            // TODO: handle errors
        }

        // TODO: update profiling implementation, currently only works for opt_opsync mode
        this->prof_usecs  = rsp.prof_usecs;
        this->prof_cycles = rsp.prof_cycles;
        this->prof_pkts   = rsp.prof_pkts;
        // FIXME: update profiling implementation
        sess->prof_usecs  = rsp.prof_usecs;
        sess->prof_cycles = rsp.prof_cycles;
        sess->prof_pkts   = rsp.prof_pkts;

        this->op_pending--; // atomic dec
        sess->op_pending--; // atomic dec
    }
}

// Error callback - simply terminates with an error. Used where we don't
// expect errors.
[[noreturn]] static void htp_error_callback(dspqueue_t queue, AEEResult error, void * context) {
    GGML_ABORT("ggml-hex: dspcall general error 0x%x: for queue %p\n", error, (void *) queue);
}

// ** backend buffers

struct ggml_backend_hexagon_buffer_type_context {
@@ -367,13 +347,7 @@ struct ggml_backend_hexagon_buffer_context {
    ggml_backend_hexagon_buffer_context(ggml_hexagon_session * sess, size_t size, bool repack) {
        size += 4 * 1024; // extra page for padding

        if (rpcmem_alloc2) {
            this->base = (uint8_t *) rpcmem_alloc2(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size);
        } else {
            GGML_LOG_INFO("ggml-hex: %s rpcmem_alloc2 not found, falling back to rpcmem_alloc\n", sess->name.c_str());
            this->base = (uint8_t *) rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size);
        }

        this->base = (uint8_t *) rpcmem_alloc2(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_HEAP_NOREG, size);
        if (!this->base) {
            GGML_LOG_ERROR("ggml-hex: %s failed to allocate buffer : size %zu\n", sess->name.c_str(), size);
            throw std::runtime_error("ggml-hex: rpcmem_alloc failed (see log for details)");
@@ -682,15 +656,6 @@ static void repack_q4_0_q4x4x2(ggml_tensor * t, const void * data, size_t size)
    size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q4_0x4x2)); // extra elements for the pad
    size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any)

    // Ensure we don't try to read more data than is available in the source buffer 'data'
    // or write more than the tensor can hold.
    const size_t total_tensor_size = (size_t)nrows * row_size;
    const size_t n_bytes_to_copy   = size < total_tensor_size ? size : total_tensor_size;

    // Calculate how many full rows and how many remaining bytes we need to process.
    const int64_t n_full_rows = n_bytes_to_copy / row_size;
    const size_t  n_rem_bytes = n_bytes_to_copy % row_size;

    void * buf_pd = ggml_aligned_malloc(row_size_pd);
    GGML_ASSERT(buf_pd != NULL);

@@ -702,8 +667,7 @@ static void repack_q4_0_q4x4x2(ggml_tensor * t, const void * data, size_t size)

    init_row_q4x4x2((block_q4_0 *) buf_pd, t->ne[0]); // init padded buffer to make sure the tail is all zeros

    // 1. Process all the full rows
    for (int64_t i = 0; i < n_full_rows; i++) {
    for (int64_t i = 0; i < nrows; i++) {
        const uint8_t * src = (const uint8_t *) data + (i * row_size);
        uint8_t * dst = (uint8_t *) t->data + (i * row_size);

@@ -712,25 +676,6 @@ static void repack_q4_0_q4x4x2(ggml_tensor * t, const void * data, size_t size)
        memcpy(dst, buf_rp, row_size);
    }

    // 2. Process the final, potentially partial, row
    if (n_rem_bytes > 0) {
        const int64_t i = n_full_rows;
        const uint8_t * src = (const uint8_t *) data + (i * row_size);
        uint8_t * dst = (uint8_t *) t->data + (i * row_size);

        // re-init the row because we are potentially copying a partial row
        init_row_q4x4x2((block_q4_0 *) buf_pd, t->ne[0]);

        // Copy only the remaining bytes from the source.
        memcpy(buf_pd, src, n_rem_bytes);

        // Repack the entire buffer
        repack_row_q4x4x2((uint8_t *) buf_rp, (const block_q4_0 *) buf_pd, t->ne[0]);

        // Write only the corresponding remaining bytes to the destination tensor.
        memcpy(dst, buf_rp, n_rem_bytes);
    }

    ggml_aligned_free(buf_pd, row_size_pd);
    ggml_aligned_free(buf_rp, row_size_rp);
}
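// Worked example for the full/partial split above: with Q4_0 rows of ne[0] = 256
// (row_size = 256/32 * 18 = 144 bytes), a transfer of size = 400 bytes gives
// n_full_rows = 400 / 144 = 2 and n_rem_bytes = 400 % 144 = 112, so two rows are
// repacked whole and the third only partially; the replacement loop instead
// assumes size always covers all nrows rows.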
@@ -743,14 +688,6 @@ static void repack_q4x4x2_q4_0(void * data, const ggml_tensor * t, size_t size)
    size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q4_0x4x2)); // extra elements for the pad
    size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any)

    // Ensure we don't try to copy more data than the tensor actually contains.
    const size_t total_tensor_size = (size_t)nrows * row_size;
    const size_t n_bytes_to_copy   = size < total_tensor_size ? size : total_tensor_size;

    // Calculate how many full rows and how many remaining bytes we need to process.
    const int64_t n_full_rows = n_bytes_to_copy / row_size;
    const size_t  n_rem_bytes = n_bytes_to_copy % row_size;

    void * buf_pd = ggml_aligned_malloc(row_size_pd);
    GGML_ASSERT(buf_pd != NULL);

@@ -762,8 +699,7 @@ static void repack_q4x4x2_q4_0(void * data, const ggml_tensor * t, size_t size)

    memset(buf_pd, 0, row_size_pd); // clear-out padded buffer to make sure the tail is all zeros

    // 1. Process all the full rows
    for (int64_t i = 0; i < n_full_rows; i++) {
    for (int64_t i = 0; i < nrows; i++) {
        const uint8_t * src = (const uint8_t *) t->data + (i * row_size);
        uint8_t * dst = (uint8_t *) data + (i * row_size);

@@ -772,20 +708,6 @@ static void repack_q4x4x2_q4_0(void * data, const ggml_tensor * t, size_t size)
        memcpy(dst, buf_rp, row_size);
    }

    // 2. Process the final, potentially partial, row
    if (n_rem_bytes > 0) {
        const int64_t i = n_full_rows;
        const uint8_t * src = (const uint8_t *) t->data + (i * row_size);
        uint8_t * dst = (uint8_t *) data + (i * row_size);

        // We still need to read and unpack the entire source row because quantization is block-based.
        memcpy(buf_pd, src, row_size);
        unpack_row_q4x4x2((block_q4_0 *) buf_rp, (const uint8_t *) buf_pd, t->ne[0]);

        // But we only copy the remaining number of bytes to the destination.
        memcpy(dst, buf_rp, n_rem_bytes);
    }

    ggml_aligned_free(buf_pd, row_size_pd);
    ggml_aligned_free(buf_rp, row_size_rp);
}
@@ -1008,15 +930,6 @@ static void repack_q8_0_q8x4x2(ggml_tensor * t, const void * data, size_t size)
    size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q8_0x4x2)); // extra elements for the pad
    size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any)

    // Ensure we don't try to read more data than is available in the source buffer 'data'
    // or write more than the tensor can hold.
    const size_t total_tensor_size = (size_t)nrows * row_size;
    const size_t n_bytes_to_copy   = size < total_tensor_size ? size : total_tensor_size;

    // Calculate how many full rows and how many remaining bytes we need to process.
    const int64_t n_full_rows = n_bytes_to_copy / row_size;
    const size_t  n_rem_bytes = n_bytes_to_copy % row_size;

    void * buf_pd = ggml_aligned_malloc(row_size_pd);
    GGML_ASSERT(buf_pd != NULL);

@@ -1028,8 +941,7 @@ static void repack_q8_0_q8x4x2(ggml_tensor * t, const void * data, size_t size)

    init_row_q8x4x2((block_q8_0 *) buf_pd, t->ne[0]); // init padded buffer to make sure the tail is all zeros

    // 1. Process all the full rows
    for (int64_t i = 0; i < n_full_rows; i++) {
    for (int64_t i = 0; i < nrows; i++) {
        const uint8_t * src = (const uint8_t *) data + (i * row_size);
        uint8_t * dst = (uint8_t *) t->data + (i * row_size);

@@ -1038,25 +950,6 @@ static void repack_q8_0_q8x4x2(ggml_tensor * t, const void * data, size_t size)
        memcpy(dst, buf_rp, row_size);
    }

    // 2. Process the final, potentially partial, row
    if (n_rem_bytes > 0) {
        const int64_t i = n_full_rows;
        const uint8_t * src = (const uint8_t *) data + (i * row_size);
        uint8_t * dst = (uint8_t *) t->data + (i * row_size);

        // re-init the row because we are potentially copying a partial row
        init_row_q8x4x2((block_q8_0 *) buf_pd, t->ne[0]);

        // Copy only the remaining bytes from the source.
        memcpy(buf_pd, src, n_rem_bytes);

        // Repack the entire buffer
        repack_row_q8x4x2((uint8_t *) buf_rp, (const block_q8_0 *) buf_pd, t->ne[0]);

        // Write only the corresponding remaining bytes to the destination tensor.
        memcpy(dst, buf_rp, n_rem_bytes);
    }

    ggml_aligned_free(buf_pd, row_size_pd);
    ggml_aligned_free(buf_rp, row_size_rp);
}
@@ -1069,14 +962,6 @@ static void repack_q8x4x2_q8_0(void * data, const ggml_tensor * t, size_t size)
    size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q8_0x4x2)); // extra elements for the pad
    size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any)

    // Ensure we don't try to copy more data than the tensor actually contains.
    const size_t total_tensor_size = (size_t)nrows * row_size;
    const size_t n_bytes_to_copy   = size < total_tensor_size ? size : total_tensor_size;

    // Calculate how many full rows and how many remaining bytes we need to process.
    const int64_t n_full_rows = n_bytes_to_copy / row_size;
    const size_t  n_rem_bytes = n_bytes_to_copy % row_size;

    void * buf_pd = ggml_aligned_malloc(row_size_pd);
    GGML_ASSERT(buf_pd != NULL);

@@ -1088,8 +973,7 @@ static void repack_q8x4x2_q8_0(void * data, const ggml_tensor * t, size_t size)

    memset(buf_pd, 0, row_size_pd); // clear-out padded buffer to make sure the tail is all zeros

    // 1. Process all the full rows
    for (int64_t i = 0; i < n_full_rows; i++) {
    for (int64_t i = 0; i < nrows; i++) {
        const uint8_t * src = (const uint8_t *) t->data + (i * row_size);
        uint8_t * dst = (uint8_t *) data + (i * row_size);

@@ -1098,20 +982,6 @@ static void repack_q8x4x2_q8_0(void * data, const ggml_tensor * t, size_t size)
        memcpy(dst, buf_rp, row_size);
    }

    // 2. Process the final, potentially partial, row
    if (n_rem_bytes > 0) {
        const int64_t i = n_full_rows;
        const uint8_t * src = (const uint8_t *) t->data + (i * row_size);
        uint8_t * dst = (uint8_t *) data + (i * row_size);

        // We still need to read and unpack the entire source row because quantization is block-based.
        memcpy(buf_pd, src, row_size);
        unpack_row_q8x4x2((block_q8_0 *) buf_rp, (const uint8_t *) buf_pd, t->ne[0]);

        // But we only copy the remaining number of bytes to the destination.
        memcpy(dst, buf_rp, n_rem_bytes);
    }

    ggml_aligned_free(buf_pd, row_size_pd);
    ggml_aligned_free(buf_rp, row_size_rp);
}
@@ -1359,15 +1229,6 @@ static void repack_mxfp4_mxfp4x4x2(ggml_tensor * t, const void * data, size_t si
    size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_MXFP4x4x2)); // extra elements for the pad
    size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any)

    // Ensure we don't try to read more data than is available in the source buffer 'data'
    // or write more than the tensor can hold.
    const size_t total_tensor_size = (size_t)nrows * row_size;
    const size_t n_bytes_to_copy   = size < total_tensor_size ? size : total_tensor_size;

    // Calculate how many full rows and how many remaining bytes we need to process.
    const int64_t n_full_rows = n_bytes_to_copy / row_size;
    const size_t  n_rem_bytes = n_bytes_to_copy % row_size;

    void * buf_pd = ggml_aligned_malloc(row_size_pd);
    GGML_ASSERT(buf_pd != NULL);

@@ -1379,8 +1240,7 @@ static void repack_mxfp4_mxfp4x4x2(ggml_tensor * t, const void * data, size_t si

    init_row_mxfp4x4x2((block_mxfp4 *) buf_pd, t->ne[0]); // init padded buffer to make sure the tail is all zeros

    // 1. Process all the full rows
    for (int64_t i = 0; i < n_full_rows; i++) {
    for (int64_t i = 0; i < nrows; i++) {
        const uint8_t * src = (const uint8_t *) data + (i * row_size);
        uint8_t * dst = (uint8_t *) t->data + (i * row_size);

@@ -1389,25 +1249,6 @@ static void repack_mxfp4_mxfp4x4x2(ggml_tensor * t, const void * data, size_t si
        memcpy(dst, buf_rp, row_size);
    }

    // 2. Process the final, potentially partial, row
    if (n_rem_bytes > 0) {
        const int64_t i = n_full_rows;
        const uint8_t * src = (const uint8_t *) data + (i * row_size);
        uint8_t * dst = (uint8_t *) t->data + (i * row_size);

        // re-init the row because we are potentially copying a partial row
        init_row_mxfp4x4x2((block_mxfp4 *) buf_pd, t->ne[0]);

        // Copy only the remaining bytes from the source.
        memcpy(buf_pd, src, n_rem_bytes);

        // Repack the entire buffer (partial data + zero padding).
        repack_row_mxfp4x4x2((uint8_t *) buf_rp, (const block_mxfp4 *) buf_pd, t->ne[0]);

        // Write only the corresponding remaining bytes to the destination tensor.
        memcpy(dst, buf_rp, n_rem_bytes);
    }

    ggml_aligned_free(buf_pd, row_size_pd);
    ggml_aligned_free(buf_rp, row_size_rp);
}
@@ -1420,14 +1261,6 @@ static void repack_mxfp4x4x2_mxfp4(void * data, const ggml_tensor * t, size_t si
    size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_MXFP4x4x2)); // extra elements for the pad
    size_t row_size_rp = row_size * 2; // extra space for tmp pad (if any)

    // Ensure we don't try to copy more data than the tensor actually contains.
    const size_t total_tensor_size = (size_t)nrows * row_size;
    const size_t n_bytes_to_copy   = size < total_tensor_size ? size : total_tensor_size;

    // Calculate how many full rows and how many remaining bytes we need to process.
    const int64_t n_full_rows = n_bytes_to_copy / row_size;
    const size_t  n_rem_bytes = n_bytes_to_copy % row_size;

    void * buf_pd = ggml_aligned_malloc(row_size_pd);
    GGML_ASSERT(buf_pd != NULL);

@@ -1439,8 +1272,7 @@ static void repack_mxfp4x4x2_mxfp4(void * data, const ggml_tensor * t, size_t si

    memset(buf_pd, 0, row_size_pd); // clear-out padded buffer to make sure the tail is all zeros

    // 1. Process all the full rows
    for (int64_t i = 0; i < n_full_rows; i++) {
    for (int64_t i = 0; i < nrows; i++) {
        const uint8_t * src = (const uint8_t *) t->data + (i * row_size);
        uint8_t * dst = (uint8_t *) data + (i * row_size);

@@ -1449,20 +1281,6 @@ static void repack_mxfp4x4x2_mxfp4(void * data, const ggml_tensor * t, size_t si
        memcpy(dst, buf_rp, row_size);
    }

    // 2. Process the final, potentially partial, row
    if (n_rem_bytes > 0) {
        const int64_t i = n_full_rows;
        const uint8_t * src = (const uint8_t *) t->data + (i * row_size);
        uint8_t * dst = (uint8_t *) data + (i * row_size);

        // We still need to read and unpack the entire source row because the format is block-based.
        memcpy(buf_pd, src, row_size);
        unpack_row_mxfp4x4x2((block_mxfp4 *) buf_rp, (const uint8_t *) buf_pd, t->ne[0]);

        // But we only copy the remaining number of bytes to the destination to respect the size limit.
        memcpy(dst, buf_rp, n_rem_bytes);
    }

    ggml_aligned_free(buf_pd, row_size_pd);
    ggml_aligned_free(buf_rp, row_size_rp);
}
@@ -1481,19 +1299,19 @@ static void ggml_backend_hexagon_buffer_set_tensor(ggml_backend_buffer_t buffer,
    switch (tensor->type) {
        case GGML_TYPE_Q4_0:
            GGML_ASSERT(offset == 0);
            GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
            GGML_ASSERT(size == ggml_nbytes(tensor));
            repack_q4_0_q4x4x2(tensor, data, size);
            break;

        case GGML_TYPE_Q8_0:
            GGML_ASSERT(offset == 0);
            GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
            GGML_ASSERT(size == ggml_nbytes(tensor));
            repack_q8_0_q8x4x2(tensor, data, size);
            break;

        case GGML_TYPE_MXFP4:
            GGML_ASSERT(offset == 0);
            GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
            GGML_ASSERT(size == ggml_nbytes(tensor));
            repack_mxfp4_mxfp4x4x2(tensor, data, size);
            break;

@@ -1517,19 +1335,19 @@ static void ggml_backend_hexagon_buffer_get_tensor(ggml_backend_buffer_t buffer,
    switch (tensor->type) {
        case GGML_TYPE_Q4_0:
            GGML_ASSERT(offset == 0);
            GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
            GGML_ASSERT(size == ggml_nbytes(tensor));
            repack_q4x4x2_q4_0(data, tensor, size);
            break;

        case GGML_TYPE_Q8_0:
            GGML_ASSERT(offset == 0);
            GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
            GGML_ASSERT(size == ggml_nbytes(tensor));
            repack_q8x4x2_q8_0(data, tensor, size);
            break;

        case GGML_TYPE_MXFP4:
            GGML_ASSERT(offset == 0);
            GGML_ASSERT(offset + size <= ggml_nbytes(tensor));
            GGML_ASSERT(size == ggml_nbytes(tensor));
            repack_mxfp4x4x2_mxfp4(data, tensor, size);
            break;

@@ -1685,13 +1503,12 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) {
    }

    // Get session URI
    char htp_uri[256];
    sprintf(htp_uri, "file:///libggml-htp-v%u.so?htp_iface_skel_handle_invoke&_modver=1.0", opt_arch);

    char session_uri[256];
    {
        char htp_uri[256];
        snprintf(htp_uri, sizeof(htp_uri), "file:///libggml-htp-v%u.so?htp_iface_skel_handle_invoke&_modver=1.0", opt_arch);

        struct remote_rpc_get_uri u = {};
        struct remote_rpc_get_uri u;
        u.session_id      = this->session_id;
        u.domain_name     = const_cast<char *>(CDSP_DOMAIN_NAME);
        u.domain_name_len = strlen(CDSP_DOMAIN_NAME);
@@ -1702,12 +1519,8 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) {

        int err = remote_session_control(FASTRPC_GET_URI, (void *) &u, sizeof(u));
        if (err != AEE_SUCCESS) {
            // fallback to single session uris
            int htp_URI_domain_len = strlen(htp_uri) + MAX_DOMAIN_NAMELEN;

            snprintf(session_uri, htp_URI_domain_len, "%s%s", htp_uri, my_domain->uri);

            GGML_LOG_WARN("ggml-hex: failed to get URI for session %d : error 0x%x. Falling back to single session URI: %s\n", dev_id, err, session_uri);
            GGML_LOG_ERROR("ggml-hex: failed to get URI for session %d : error 0x%x\n", dev_id, err);
            throw std::runtime_error("ggml-hex: remote_session_control(get-uri) failed (see log for details)");
        }
    }

@@ -1751,8 +1564,7 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) {
                          0,            // Flags
                          128 * 1024,   // Request queue size (in bytes)
                          64 * 1024,    // Response queue size (in bytes)
                          nullptr,      // Read packet callback (we handle reads explicitly)
                          nullptr,      // Error callback (we handle errors during reads)
                          htp_packet_callback, htp_error_callback,
                          (void *) this, // Callback context
                          &queue);
    if (err != 0) {
@@ -1819,13 +1631,10 @@ void ggml_hexagon_session::release() noexcept(true) {
    }
}

ggml_hexagon_session::ggml_hexagon_session(int dev_id, ggml_backend_dev_t dev) noexcept(false) {
ggml_hexagon_session::ggml_hexagon_session(int dev_id) noexcept(false) {
    buffer_type.context        = nullptr;
    repack_buffer_type.context = nullptr;

    buffer_type.device        = dev;
    repack_buffer_type.device = dev;

    try {
        allocate(dev_id);

@@ -2393,7 +2202,7 @@ static void ggml_hexagon_mul_mat(const struct ggml_tensor * op, uint32_t flags)
    bufs[0].ptr    = src0->data;
    bufs[0].offset = (uint8_t *) src0->data - src0_buf->base;
    bufs[0].size   = ggml_nbytes(src0);
    bufs[0].flags  = 0;
    bufs[0].flags  = DSPQUEUE_BUFFER_FLAG_REF;

    // Second buffer Input Activations. This is a buffer that the CPU
    // writes and the DSP reads, so we'll need to flush CPU caches and
@@ -2403,7 +2212,8 @@ static void ggml_hexagon_mul_mat(const struct ggml_tensor * op, uint32_t flags)
    bufs[1].ptr    = src1->data;
    bufs[1].offset = (uint8_t *) src1->data - src1_buf->base;
    bufs[1].size   = ggml_nbytes(src1);
    bufs[1].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         // Flush CPU
    bufs[1].flags  = (DSPQUEUE_BUFFER_FLAG_REF |                  // Take a reference
                      DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         // Flush CPU
                      DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP

    // Third buffer Output Activations. We'll handle DSP
@@ -2414,7 +2224,7 @@ static void ggml_hexagon_mul_mat(const struct ggml_tensor * op, uint32_t flags)
    bufs[2].ptr    = dst->data;
    bufs[2].offset = (uint8_t *) dst->data - dst_buf->base;
    bufs[2].size   = ggml_nbytes(dst);
    bufs[2].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
    bufs[2].flags  = (DSPQUEUE_BUFFER_FLAG_REF | DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);

    // Primary DSP session from the src0 (normally weight) tensor
    auto sess = src0_buf->sess;
@@ -2442,7 +2252,27 @@ static void ggml_hexagon_mul_mat(const struct ggml_tensor * op, uint32_t flags)
    }

    if ((opt_opmask & HTP_OPMASK_QUEUE)) {
        sess->enqueue(req, bufs, 3, opt_opsync);
        // Bump pending flag (cleared in the callback once we get the response)
        sess->op_pending++; // atomic inc

        int err = dspqueue_write(sess->queue,
                                 0,           // flags - the framework will autoset this
                                 3,           // number of buffers
                                 bufs,        // buffer references
                                 sizeof(req),
                                 (const uint8_t *) &req, // Message
                                 1000000      // Timeout
        );

        if (err != 0) {
            GGML_ABORT("ggml-hex: %s dspqueue_write failed: 0x%08x\n", sess->name.c_str(), (unsigned) err);
        }
    }

    if (opt_opsync) {
        while (sess->op_pending) {
            ;
        }
    }

    t2 = ggml_time_us();
@@ -2498,7 +2328,7 @@ static void ggml_hexagon_mul_mat_id(const struct ggml_tensor * op, uint32_t flag
    bufs[0].ptr    = src0->data;
    bufs[0].offset = (uint8_t *) src0->data - src0_buf->base;
    bufs[0].size   = ggml_nbytes(src0);
    bufs[0].flags  = 0;
    bufs[0].flags  = DSPQUEUE_BUFFER_FLAG_REF;

    // Second buffer Input Activations. This is a buffer that the CPU
    // writes and the DSP reads, so we'll need to flush CPU caches and
@@ -2508,7 +2338,8 @@ static void ggml_hexagon_mul_mat_id(const struct ggml_tensor * op, uint32_t flag
    bufs[1].ptr    = src1->data;
    bufs[1].offset = (uint8_t *) src1->data - src1_buf->base;
    bufs[1].size   = ggml_nbytes(src1);
    bufs[1].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         // Flush CPU
    bufs[1].flags  = (DSPQUEUE_BUFFER_FLAG_REF |                  // Take a reference
                      DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         // Flush CPU
                      DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP

    // Third buffer expert IDs. This is a buffer that the CPU
@@ -2519,7 +2350,8 @@ static void ggml_hexagon_mul_mat_id(const struct ggml_tensor * op, uint32_t flag
    bufs[2].ptr    = src2->data;
    bufs[2].offset = (uint8_t *) src2->data - src2_buf->base;
    bufs[2].size   = ggml_nbytes(src2);
    bufs[2].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         // Flush CPU
    bufs[2].flags  = (DSPQUEUE_BUFFER_FLAG_REF |                  // Take a reference
                      DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         // Flush CPU
                      DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP

    // Fourth buffer Output Activations. We'll handle DSP
@@ -2530,7 +2362,7 @@ static void ggml_hexagon_mul_mat_id(const struct ggml_tensor * op, uint32_t flag
    bufs[3].ptr    = dst->data;
    bufs[3].offset = (uint8_t *) dst->data - dst_buf->base;
    bufs[3].size   = ggml_nbytes(dst);
    bufs[3].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
    bufs[3].flags  = (DSPQUEUE_BUFFER_FLAG_REF | DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);

    // Primary DSP session from the src0 (normally weight) tensor
    auto sess = src0_buf->sess;
@@ -2559,7 +2391,27 @@ static void ggml_hexagon_mul_mat_id(const struct ggml_tensor * op, uint32_t flag
    }

    if ((opt_opmask & HTP_OPMASK_QUEUE)) {
        sess->enqueue(req, bufs, 4, opt_opsync);
        // Bump pending flag (cleared in the callback once we get the response)
        sess->op_pending++; // atomic inc

        int err = dspqueue_write(sess->queue,
                                 0,           // flags - the framework will autoset this
                                 4,           // number of buffers
                                 bufs,        // buffer references
                                 sizeof(req),
                                 (const uint8_t *) &req, // Message
                                 1000000      // Timeout
        );

        if (err != 0) {
            GGML_ABORT("ggml-hex: %s dspqueue_write failed: 0x%08x\n", sess->name.c_str(), (unsigned) err);
        }
    }

    if (opt_opsync) {
        while (sess->op_pending) {
            ;
        }
    }

    t2 = ggml_time_us();
@@ -2632,7 +2484,8 @@ static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) {
    bufs[0].ptr    = src0->data;
    bufs[0].offset = (uint8_t *) src0->data - src0_buf->base;
    bufs[0].size   = ggml_nbytes(src0);
    bufs[0].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         // Flush CPU
    bufs[0].flags  = (DSPQUEUE_BUFFER_FLAG_REF |                  // Take a reference
                      DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         // Flush CPU
                      DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP;

    // Second buffer = Second Operand of Binary op
@@ -2644,7 +2497,8 @@ static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) {
    bufs[1].ptr    = src1->data;
    bufs[1].offset = (uint8_t *) src1->data - src1_buf->base;
    bufs[1].size   = ggml_nbytes(src1);
    bufs[1].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         // Flush CPU
    bufs[1].flags  = (DSPQUEUE_BUFFER_FLAG_REF |                  // Take a reference
                      DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         // Flush CPU
                      DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP

    // Third buffer = Output Activations. We'll handle DSP
@@ -2655,7 +2509,7 @@ static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) {
    bufs[2].ptr    = dst->data;
    bufs[2].offset = (uint8_t *) dst->data - dst_buf->base;
    bufs[2].size   = ggml_nbytes(dst);
    bufs[2].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
    bufs[2].flags  = (DSPQUEUE_BUFFER_FLAG_REF | DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);

    // Primary DSP session from the src0 tensor
    ggml_hexagon_session * sess = src0_buf->sess;
@@ -2683,7 +2537,26 @@ static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) {
    }

    if ((opt_opmask & HTP_OPMASK_QUEUE)) {
        sess->enqueue(req, bufs, 3, opt_opsync);
        // Bump pending flag (cleared in the callback once we get the response)
        sess->op_pending++; // atomic inc

        int err = dspqueue_write(sess->queue,
                                 0,           // flags - the framework will autoset this
                                 3,           // number of buffers
                                 bufs,        // buffer references
                                 sizeof(req),
                                 (const uint8_t *) &req, // Message
                                 1000000);    // Timeout

        if (0 != err) {
            GGML_ABORT("ggml-hex: %s dspqueue_write failed: 0x%08x\n", sess->name.c_str(), (unsigned) err);
        }
    }

    if (opt_opsync) {
        while (sess->op_pending) {
            ;
        }
    }

    t2 = ggml_time_us();
@@ -2748,7 +2621,8 @@ static void ggml_hexagon_add_id(const struct ggml_tensor * op, uint32_t flags) {
    bufs[0].ptr    = src0->data;
    bufs[0].offset = (uint8_t *) src0->data - src0_buf->base;
    bufs[0].size   = ggml_nbytes(src0);
    bufs[0].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         // Flush CPU
    bufs[0].flags  = (DSPQUEUE_BUFFER_FLAG_REF |                  // Take a reference
                      DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         // Flush CPU
                      DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP;

    // Second buffer = experts bias
@@ -2756,7 +2630,8 @@ static void ggml_hexagon_add_id(const struct ggml_tensor * op, uint32_t flags) {
    bufs[1].ptr    = src1->data;
    bufs[1].offset = (uint8_t *) src1->data - src1_buf->base;
    bufs[1].size   = ggml_nbytes(src1);
    bufs[1].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         // Flush CPU
    bufs[1].flags  = (DSPQUEUE_BUFFER_FLAG_REF |                  // Take a reference
                      DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         // Flush CPU
                      DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP

    // Third buffer = activated experts
@@ -2764,7 +2639,8 @@ static void ggml_hexagon_add_id(const struct ggml_tensor * op, uint32_t flags) {
    bufs[2].ptr    = src2->data;
    bufs[2].offset = (uint8_t *) src2->data - src2_buf->base;
    bufs[2].size   = ggml_nbytes(src2);
    bufs[2].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         // Flush CPU
    bufs[2].flags  = (DSPQUEUE_BUFFER_FLAG_REF |                  // Take a reference
                      DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         // Flush CPU
                      DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP

    // Fourth buffer = output activations
@@ -2772,7 +2648,7 @@ static void ggml_hexagon_add_id(const struct ggml_tensor * op, uint32_t flags) {
    bufs[3].ptr    = dst->data;
    bufs[3].offset = (uint8_t *) dst->data - dst_buf->base;
    bufs[3].size   = ggml_nbytes(dst);
    bufs[3].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
    bufs[3].flags  = (DSPQUEUE_BUFFER_FLAG_REF | DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);

    // Primary DSP session from the src0 tensor
    ggml_hexagon_session * sess = src0_buf->sess;
@@ -2802,7 +2678,26 @@ static void ggml_hexagon_add_id(const struct ggml_tensor * op, uint32_t flags) {
    }

    if ((opt_opmask & HTP_OPMASK_QUEUE)) {
        sess->enqueue(req, bufs, 4, opt_opsync);
        // Bump pending flag (cleared in the callback once we get the response)
        sess->op_pending++; // atomic inc

        int err = dspqueue_write(sess->queue,
                                 0,           // flags - the framework will autoset this
                                 4,           // number of buffers
                                 bufs,        // buffer references
                                 sizeof(req),
                                 (const uint8_t *) &req, // Message
                                 1000000);    // Timeout

        if (0 != err) {
            GGML_ABORT("ggml-hex: %s dspqueue_write failed: 0x%08x\n", sess->name.c_str(), (unsigned) err);
        }
    }

    if (opt_opsync) {
        while (sess->op_pending) {
            ;
        }
    }

    t2 = ggml_time_us();
@@ -2900,7 +2795,8 @@ static void ggml_hexagon_unary(const struct ggml_tensor * op, uint32_t flags) {
    bufs[n_bufs].ptr    = src0->data;
    bufs[n_bufs].offset = (uint8_t *) src0->data - src0_buf->base;
    bufs[n_bufs].size   = ggml_nbytes(src0);
    bufs[n_bufs].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         // Flush CPU
    bufs[n_bufs].flags  = (DSPQUEUE_BUFFER_FLAG_REF |                  // Take a reference
                           DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         // Flush CPU
                           DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP;
    ++n_bufs;

@@ -2915,7 +2811,8 @@ static void ggml_hexagon_unary(const struct ggml_tensor * op, uint32_t flags) {
    bufs[n_bufs].ptr    = src1->data;
    bufs[n_bufs].offset = (uint8_t *) src1->data - src1_buf->base;
    bufs[n_bufs].size   = ggml_nbytes(src1);
    bufs[n_bufs].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         // Flush CPU
    bufs[n_bufs].flags  = (DSPQUEUE_BUFFER_FLAG_REF |                  // Take a reference
                           DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         // Flush CPU
                           DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
    ++n_bufs;
    }
@@ -2930,7 +2827,7 @@ static void ggml_hexagon_unary(const struct ggml_tensor * op, uint32_t flags) {
    bufs[n_bufs].ptr    = dst->data;
    bufs[n_bufs].offset = (uint8_t *) dst->data - dst_buf->base;
    bufs[n_bufs].size   = ggml_nbytes(dst);
    bufs[n_bufs].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
    bufs[n_bufs].flags  = (DSPQUEUE_BUFFER_FLAG_REF | DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
    ++n_bufs;

    // Primary DSP session from the src0 tensor
@@ -2963,7 +2860,26 @@ static void ggml_hexagon_unary(const struct ggml_tensor * op, uint32_t flags) {
    }

    if ((opt_opmask & HTP_OPMASK_QUEUE)) {
        sess->enqueue(req, bufs, n_bufs, opt_opsync);
        // Bump pending flag (cleared in the callback once we get the response)
        sess->op_pending++; // atomic inc

        int err = dspqueue_write(sess->queue,
                                 0,           // flags - the framework will autoset this
                                 n_bufs,      // number of buffers
                                 bufs,        // buffer references
                                 sizeof(req),
                                 (const uint8_t *) &req, // Message
                                 1000000);    // Timeout

        if (0 != err) {
            GGML_ABORT("ggml-hex: %s dspqueue_write failed: 0x%08x\n", sess->name.c_str(), (unsigned) err);
        }
    }

    if (opt_opsync) {
        while (sess->op_pending) {
            ;
        }
    }

    t2 = ggml_time_us();
@@ -3037,7 +2953,8 @@ static void ggml_hexagon_rope(const struct ggml_tensor * op, uint32_t flags) {
    bufs[n_bufs].ptr    = src0->data;
    bufs[n_bufs].offset = (uint8_t *) src0->data - src0_buf->base;
    bufs[n_bufs].size   = ggml_nbytes(src0);
    bufs[n_bufs].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         // Flush CPU
    bufs[n_bufs].flags  = (DSPQUEUE_BUFFER_FLAG_REF |                  // Take a reference
                           DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         // Flush CPU
                           DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP;
    ++n_bufs;

@@ -3051,7 +2968,8 @@ static void ggml_hexagon_rope(const struct ggml_tensor * op, uint32_t flags) {
    bufs[n_bufs].ptr    = src1->data;
    bufs[n_bufs].offset = (uint8_t *) src1->data - src1_buf->base;
    bufs[n_bufs].size   = ggml_nbytes(src1);
    bufs[n_bufs].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         // Flush CPU
    bufs[n_bufs].flags  = (DSPQUEUE_BUFFER_FLAG_REF |                  // Take a reference
                           DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         // Flush CPU
                           DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
    ++n_bufs;

@@ -3066,7 +2984,8 @@ static void ggml_hexagon_rope(const struct ggml_tensor * op, uint32_t flags) {
    bufs[n_bufs].ptr    = src2->data;
    bufs[n_bufs].offset = (uint8_t *) src2->data - src2_buf->base;
    bufs[n_bufs].size   = ggml_nbytes(src2);
    bufs[n_bufs].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         // Flush CPU
    bufs[n_bufs].flags  = (DSPQUEUE_BUFFER_FLAG_REF |                  // Take a reference
                           DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         // Flush CPU
                           DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate DSP
    ++n_bufs;
    }
@@ -3081,7 +3000,7 @@ static void ggml_hexagon_rope(const struct ggml_tensor * op, uint32_t flags) {
    bufs[n_bufs].ptr    = dst->data;
    bufs[n_bufs].offset = (uint8_t *) dst->data - dst_buf->base;
    bufs[n_bufs].size   = ggml_nbytes(dst);
    bufs[n_bufs].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
    bufs[n_bufs].flags  = (DSPQUEUE_BUFFER_FLAG_REF | DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
    ++n_bufs;

    // Primary DSP session from the src0 tensor
@@ -3114,7 +3033,26 @@ static void ggml_hexagon_rope(const struct ggml_tensor * op, uint32_t flags) {
    }

    if ((opt_opmask & HTP_OPMASK_QUEUE)) {
        sess->enqueue(req, bufs, n_bufs, opt_opsync);
        // Bump pending flag (cleared in the callback once we get the response)
        sess->op_pending++; // atomic inc

        int err = dspqueue_write(sess->queue,
                                 0,           // flags - the framework will autoset this
                                 n_bufs,      // number of buffers
                                 bufs,        // buffer references
                                 sizeof(req),
                                 (const uint8_t *) &req, // Message
                                 1000000);    // Timeout

        if (0 != err) {
            GGML_ABORT("ggml-hex: %s dspqueue_write failed: 0x%08x\n", sess->name.c_str(), (unsigned) err);
        }
    }

    if (opt_opsync) {
        while (sess->op_pending) {
            ;
        }
    }

    t2 = ggml_time_us();
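// Note on the opt_opsync busy-wait above: op_pending is the atomic counter bumped
// before dspqueue_write() and decremented in htp_packet_callback() when the
// response packet is consumed, so the spin loop drains exactly the ops queued by
// this session.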
@@ -3259,7 +3197,9 @@ static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, gg
    }

    // Wait until all pending ops complete
    sess->flush();
    while (sess->op_pending) {
        ;
    }

    return GGML_STATUS_SUCCESS;
}
@@ -3270,7 +3210,9 @@ static void ggml_backend_hexagon_synchronize(ggml_backend_t backend) {
    HEX_VERBOSE("ggml-hex: %s synchronize\n", sess->name.c_str());

    // Wait until all pending ops complete
    sess->flush();
    while (sess->op_pending) {
        ;
    }
}

struct node_info {
@@ -3679,11 +3621,6 @@ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) {
        }
    }

    if (opt_arch < 75) {
        opt_ndev = 1;
        GGML_LOG_WARN("ggml-hex: forcing ndev to 1 for SoCs archs lower than v75.\n");
    }

    GGML_LOG_INFO("ggml-hex: Hexagon Arch version v%d\n", opt_arch);

    // Create devices / sessions
@@ -3691,7 +3628,7 @@ ggml_hexagon_registry::ggml_hexagon_registry(ggml_backend_reg_t reg) {
        devices[i].iface = ggml_backend_hexagon_device_i;
        devices[i].reg   = reg;
        try {
            devices[i].context = new ggml_hexagon_session(i, &devices[i]);
            devices[i].context = new ggml_hexagon_session(i);
        } catch (std::exception const &exc) {
            GGML_LOG_ERROR("ggml-hex: failed to create device/session %zu\n", i);
            devices[i].context = nullptr;

@@ -64,7 +64,6 @@ extern "C" {
# pragma weak remote_handle64_control
# pragma weak fastrpc_mmap
# pragma weak fastrpc_munmap
# pragma weak rpcmem_alloc2
#endif

#if !defined(_WINDOWS)

@@ -395,14 +395,28 @@ static void proc_matmul_req(struct htp_context * ctx,
                            struct htp_general_req * req,
                            struct dspqueue_buffer * bufs,
                            size_t n_bufs) {
    struct dspqueue_buffer rsp_bufs[1];
    // Prep response buffer structs (needed for error responses, etc)
    struct dspqueue_buffer rsp_bufs[HTP_MAX_PACKET_BUFFERS];
    memset(rsp_bufs, 0, sizeof(rsp_bufs));
    rsp_bufs[0].fd     = bufs[0].fd;
    rsp_bufs[0].ptr    = bufs[0].ptr;
    rsp_bufs[0].size   = bufs[0].size;
    rsp_bufs[0].offset = bufs[0].offset;
    rsp_bufs[0].flags  = DSPQUEUE_BUFFER_FLAG_DEREF; // Release reference

    rsp_bufs[1].fd     = bufs[1].fd;
    rsp_bufs[1].ptr    = bufs[1].ptr;
    rsp_bufs[1].size   = bufs[1].size;
    rsp_bufs[1].offset = bufs[1].offset;
    rsp_bufs[1].flags  = DSPQUEUE_BUFFER_FLAG_DEREF; // Release reference

    // We wrote to the output buffer, so we also need to flush it
    rsp_bufs[0].fd     = bufs[2].fd;
    rsp_bufs[0].ptr    = bufs[2].ptr;
    rsp_bufs[0].size   = bufs[2].size;
    rsp_bufs[0].offset = bufs[2].offset;
    rsp_bufs[0].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         // Flush HTP
    rsp_bufs[2].fd     = bufs[2].fd;
    rsp_bufs[2].ptr    = bufs[2].ptr;
    rsp_bufs[2].size   = bufs[2].size;
    rsp_bufs[2].offset = bufs[2].offset;
    rsp_bufs[2].flags  = (DSPQUEUE_BUFFER_FLAG_DEREF |                // Release reference
                          DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         // Flush NSP
                          DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU

    // Setup Op context
@@ -430,21 +444,41 @@ static void proc_matmul_req(struct htp_context * ctx,
    }

    profile_stop(&prof);
    send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
    send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 3, &prof);
}

static void proc_matmul_id_req(struct htp_context * ctx,
                               struct htp_general_req * req,
                               struct dspqueue_buffer * bufs,
                               size_t n_bufs) {
    struct dspqueue_buffer rsp_bufs[1];
    // Prep response buffer structs (needed for error responses, etc)
    struct dspqueue_buffer rsp_bufs[HTP_MAX_PACKET_BUFFERS];
    memset(rsp_bufs, 0, sizeof(rsp_bufs));
    rsp_bufs[0].fd     = bufs[0].fd;
    rsp_bufs[0].ptr    = bufs[0].ptr;
    rsp_bufs[0].size   = bufs[0].size;
    rsp_bufs[0].offset = bufs[0].offset;
    rsp_bufs[0].flags  = DSPQUEUE_BUFFER_FLAG_DEREF; // Release reference

    rsp_bufs[1].fd     = bufs[1].fd;
    rsp_bufs[1].ptr    = bufs[1].ptr;
    rsp_bufs[1].size   = bufs[1].size;
    rsp_bufs[1].offset = bufs[1].offset;
    rsp_bufs[1].flags  = DSPQUEUE_BUFFER_FLAG_DEREF; // Release reference

    rsp_bufs[2].fd     = bufs[2].fd;
    rsp_bufs[2].ptr    = bufs[2].ptr;
    rsp_bufs[2].size   = bufs[2].size;
    rsp_bufs[2].offset = bufs[2].offset;
    rsp_bufs[2].flags  = DSPQUEUE_BUFFER_FLAG_DEREF; // Release reference

    // We wrote to the output buffer, so we also need to flush it
    rsp_bufs[0].fd     = bufs[3].fd;
    rsp_bufs[0].ptr    = bufs[3].ptr;
    rsp_bufs[0].size   = bufs[3].size;
    rsp_bufs[0].offset = bufs[3].offset;
    rsp_bufs[0].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         // Flush HTP
    rsp_bufs[3].fd     = bufs[3].fd;
    rsp_bufs[3].ptr    = bufs[3].ptr;
    rsp_bufs[3].size   = bufs[3].size;
    rsp_bufs[3].offset = bufs[3].offset;
    rsp_bufs[3].flags  = (DSPQUEUE_BUFFER_FLAG_DEREF |                // Release reference
                          DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         // Flush NSP
                          DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU

    // Setup Op context
@@ -474,18 +508,32 @@ static void proc_matmul_id_req(struct htp_context * ctx,
    }

    profile_stop(&prof);
    send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
    send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 4, &prof);
}

static void proc_binary_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) {
    struct dspqueue_buffer rsp_bufs[1];
    struct dspqueue_buffer rsp_bufs[HTP_MAX_PACKET_BUFFERS];
    memset(rsp_bufs, 0, sizeof(rsp_bufs));

    rsp_bufs[0].fd     = bufs[0].fd;
    rsp_bufs[0].ptr    = bufs[0].ptr;
    rsp_bufs[0].offset = bufs[0].offset;
    rsp_bufs[0].size   = bufs[0].size;
    rsp_bufs[0].flags  = DSPQUEUE_BUFFER_FLAG_DEREF; // Release reference

    rsp_bufs[1].fd     = bufs[1].fd;
    rsp_bufs[1].ptr    = bufs[1].ptr;
    rsp_bufs[1].offset = bufs[1].offset;
    rsp_bufs[1].size   = bufs[1].size;
    rsp_bufs[1].flags  = DSPQUEUE_BUFFER_FLAG_DEREF; // Release reference

    // We wrote to the output buffer, so we also need to flush it
    rsp_bufs[0].fd     = bufs[2].fd;
    rsp_bufs[0].ptr    = bufs[2].ptr;
    rsp_bufs[0].offset = bufs[2].offset;
    rsp_bufs[0].size   = bufs[2].size;
    rsp_bufs[0].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         // Flush HTP
    rsp_bufs[2].fd     = bufs[2].fd;
    rsp_bufs[2].ptr    = bufs[2].ptr;
    rsp_bufs[2].offset = bufs[2].offset;
    rsp_bufs[2].size   = bufs[2].size;
    rsp_bufs[2].flags  = (DSPQUEUE_BUFFER_FLAG_DEREF |                // Release reference
                          DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         // Flush NSP
                          DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU

    // Setup Op context
@@ -513,18 +561,38 @@ static void proc_binary_req(struct htp_context * ctx, struct htp_general_req * r
    }

    profile_stop(&prof);
    send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
    send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 3, &prof);
}

static void proc_add_id_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) {
    struct dspqueue_buffer rsp_bufs[1];
    struct dspqueue_buffer rsp_bufs[HTP_MAX_PACKET_BUFFERS];
    memset(rsp_bufs, 0, sizeof(rsp_bufs));

    rsp_bufs[0].fd     = bufs[0].fd;
    rsp_bufs[0].ptr    = bufs[0].ptr;
    rsp_bufs[0].offset = bufs[0].offset;
    rsp_bufs[0].size   = bufs[0].size;
    rsp_bufs[0].flags  = DSPQUEUE_BUFFER_FLAG_DEREF; // Release reference

    rsp_bufs[1].fd     = bufs[1].fd;
    rsp_bufs[1].ptr    = bufs[1].ptr;
    rsp_bufs[1].offset = bufs[1].offset;
    rsp_bufs[1].size   = bufs[1].size;
    rsp_bufs[1].flags  = DSPQUEUE_BUFFER_FLAG_DEREF; // Release reference

    rsp_bufs[2].fd     = bufs[2].fd;
    rsp_bufs[2].ptr    = bufs[2].ptr;
    rsp_bufs[2].offset = bufs[2].offset;
    rsp_bufs[2].size   = bufs[2].size;
    rsp_bufs[2].flags  = DSPQUEUE_BUFFER_FLAG_DEREF; // Release reference

    // We wrote to the output buffer, so we also need to flush it
    rsp_bufs[0].fd     = bufs[3].fd;
    rsp_bufs[0].ptr    = bufs[3].ptr;
    rsp_bufs[0].offset = bufs[3].offset;
    rsp_bufs[0].size   = bufs[3].size;
    rsp_bufs[0].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         // Flush HTP
    rsp_bufs[3].fd     = bufs[3].fd;
    rsp_bufs[3].ptr    = bufs[3].ptr;
    rsp_bufs[3].offset = bufs[3].offset;
    rsp_bufs[3].size   = bufs[3].size;
    rsp_bufs[3].flags  = (DSPQUEUE_BUFFER_FLAG_DEREF |                // Release reference
                          DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         // Flush NSP
                          DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU

    // Setup Op context
@@ -554,18 +622,26 @@ static void proc_add_id_req(struct htp_context * ctx, struct htp_general_req * r
|
||||
}
|
||||
|
||||
profile_stop(&prof);
|
||||
send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
|
||||
send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 4, &prof);
|
||||
}
|
||||
|
||||
static void proc_unary_req(struct htp_context * ctx, struct htp_general_req * req, struct dspqueue_buffer * bufs) {
|
||||
struct dspqueue_buffer rsp_bufs[HTP_MAX_PACKET_BUFFERS];
|
||||
memset(rsp_bufs, 0, sizeof(rsp_bufs));
|
||||
|
||||
rsp_bufs[0].fd = bufs[0].fd;
|
||||
rsp_bufs[0].ptr = bufs[0].ptr;
|
||||
rsp_bufs[0].offset = bufs[0].offset;
|
||||
rsp_bufs[0].size = bufs[0].size;
|
||||
rsp_bufs[0].flags = DSPQUEUE_BUFFER_FLAG_DEREF; // Release reference
|
||||
|
||||
// We had written to the output buffer, we'd also need to flush it
|
||||
rsp_bufs[0].fd = bufs[1].fd;
|
||||
rsp_bufs[0].ptr = bufs[1].ptr;
|
||||
rsp_bufs[0].offset = bufs[1].offset;
|
||||
rsp_bufs[0].size = bufs[1].size;
|
||||
rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
|
||||
rsp_bufs[1].fd = bufs[1].fd;
|
||||
rsp_bufs[1].ptr = bufs[1].ptr;
|
||||
rsp_bufs[1].offset = bufs[1].offset;
|
||||
rsp_bufs[1].size = bufs[1].size;
|
||||
rsp_bufs[1].flags = (DSPQUEUE_BUFFER_FLAG_DEREF | // Release reference
|
||||
DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush NSP
|
||||
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
|
||||
|
||||
// Setup Op context
|
||||
@@ -593,7 +669,7 @@ static void proc_unary_req(struct htp_context * ctx, struct htp_general_req * re
|
||||
}
|
||||
|
||||
profile_stop(&prof);
|
||||
send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
|
||||
send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 2, &prof);
|
||||
}
|
||||
|
||||
static void proc_activations_req(struct htp_context * ctx,
@@ -601,16 +677,33 @@ static void proc_activations_req(struct htp_context * ctx,
struct dspqueue_buffer * bufs,
uint32_t n_bufs) {
struct dspqueue_buffer rsp_bufs[HTP_MAX_PACKET_BUFFERS];
memset(rsp_bufs, 0, sizeof(rsp_bufs));

int write_idx = (n_bufs == 3) ? 2 : 1;
rsp_bufs[0].fd = bufs[0].fd;
rsp_bufs[0].ptr = bufs[0].ptr;
rsp_bufs[0].offset = bufs[0].offset;
rsp_bufs[0].size = bufs[0].size;
rsp_bufs[0].flags = DSPQUEUE_BUFFER_FLAG_DEREF; // Release reference

int write_idx = 1;
if (3 == n_bufs) {
rsp_bufs[1].fd = bufs[1].fd;
rsp_bufs[1].ptr = bufs[1].ptr;
rsp_bufs[1].offset = bufs[1].offset;
rsp_bufs[1].size = bufs[1].size;
rsp_bufs[1].flags = DSPQUEUE_BUFFER_FLAG_DEREF; // Release reference

write_idx = 2;
}

// We had written to the output buffer, we'd also need to flush it
rsp_bufs[0].fd = bufs[write_idx].fd;
rsp_bufs[0].ptr = bufs[write_idx].ptr;
rsp_bufs[0].offset = bufs[write_idx].offset;
rsp_bufs[0].size = bufs[write_idx].size;
rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
rsp_bufs[write_idx].fd = bufs[write_idx].fd;
rsp_bufs[write_idx].ptr = bufs[write_idx].ptr;
rsp_bufs[write_idx].offset = bufs[write_idx].offset;
rsp_bufs[write_idx].size = bufs[write_idx].size;
rsp_bufs[write_idx].flags = (DSPQUEUE_BUFFER_FLAG_DEREF | // Release reference
DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush NSP
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU

// Setup Op context
struct htp_ops_context octx = { 0 };
@@ -649,7 +742,7 @@ static void proc_activations_req(struct htp_context * ctx,
}

profile_stop(&prof);
send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, n_bufs, &prof);
}

static void proc_rope_req(struct htp_context * ctx,
@@ -657,16 +750,39 @@ static void proc_rope_req(struct htp_context * ctx,
struct dspqueue_buffer * bufs,
uint32_t n_bufs) {
struct dspqueue_buffer rsp_bufs[HTP_MAX_PACKET_BUFFERS];
memset(rsp_bufs, 0, sizeof(rsp_bufs));

int write_idx = (n_bufs == 4) ? 3 : 2;
rsp_bufs[0].fd = bufs[0].fd;
rsp_bufs[0].ptr = bufs[0].ptr;
rsp_bufs[0].offset = bufs[0].offset;
rsp_bufs[0].size = bufs[0].size;
rsp_bufs[0].flags = DSPQUEUE_BUFFER_FLAG_DEREF; // Release reference

rsp_bufs[1].fd = bufs[1].fd;
rsp_bufs[1].ptr = bufs[1].ptr;
rsp_bufs[1].offset = bufs[1].offset;
rsp_bufs[1].size = bufs[1].size;
rsp_bufs[1].flags = DSPQUEUE_BUFFER_FLAG_DEREF; // Release reference

int write_idx = 2;
if (4 == n_bufs) {
rsp_bufs[write_idx].fd = bufs[write_idx].fd;
rsp_bufs[write_idx].ptr = bufs[write_idx].ptr;
rsp_bufs[write_idx].offset = bufs[write_idx].offset;
rsp_bufs[write_idx].size = bufs[write_idx].size;
rsp_bufs[write_idx].flags = DSPQUEUE_BUFFER_FLAG_DEREF; // Release reference

write_idx++;
}

// We had written to the output buffer, we'd also need to flush it
rsp_bufs[0].fd = bufs[write_idx].fd;
rsp_bufs[0].ptr = bufs[write_idx].ptr;
rsp_bufs[0].offset = bufs[write_idx].offset;
rsp_bufs[0].size = bufs[write_idx].size;
rsp_bufs[0].flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush HTP
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU
rsp_bufs[write_idx].fd = bufs[write_idx].fd;
rsp_bufs[write_idx].ptr = bufs[write_idx].ptr;
rsp_bufs[write_idx].offset = bufs[write_idx].offset;
rsp_bufs[write_idx].size = bufs[write_idx].size;
rsp_bufs[write_idx].flags = (DSPQUEUE_BUFFER_FLAG_DEREF | // Release reference
DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | // Flush NSP
DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT); // Invalidate CPU

// Setup Op context
struct htp_ops_context octx = { 0 };
@@ -703,7 +819,7 @@ static void proc_rope_req(struct htp_context * ctx,
}

profile_stop(&prof);
send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, 1, &prof);
send_htp_rsp(ctx, req->op, rsp_status, rsp_bufs, n_bufs, &prof);
}

static void htp_packet_callback(dspqueue_t queue, int error, void * context) {

@@ -43,46 +43,46 @@ static inline int32_t htp_is_one_chunk(void * addr, uint32_t n, uint32_t chunk_s
}

static inline void htp_dump_int8_line(char * pref, const int8_t * x, int n) {
char str[1024], *p = str, *p_end = str + sizeof(str);
p += snprintf(p, p_end - p, "%s: ", pref);
for (int i = 0; i < n && p < p_end; i++) {
p += snprintf(p, p_end - p, "%d, ", x[i]);
char str[1024], *p = str;
p += sprintf(p, "%s: ", pref);
for (int i = 0; i < 16; i++) {
p += sprintf(p, "%d, ", x[i]);
}
FARF(HIGH, "%s\n", str);
}

static inline void htp_dump_uint8_line(char * pref, const uint8_t * x, uint32_t n) {
char str[1024], *p = str, *p_end = str + sizeof(str);
p += snprintf(p, p_end - p, "%s: ", pref);
for (int i = 0; i < n && p < p_end; i++) {
p += snprintf(p, p_end - p, "%d, ", x[i]);
char str[1024], *p = str;
p += sprintf(p, "%s: ", pref);
for (int i = 0; i < n; i++) {
p += sprintf(p, "%d, ", x[i]);
}
FARF(HIGH, "%s\n", str);
}

static inline void htp_dump_int32_line(char * pref, const int32_t * x, uint32_t n) {
char str[1024], *p = str, *p_end = str + sizeof(str);
p += snprintf(p, p_end - p, "%s: ", pref);
char str[1024], *p = str;
p += sprintf(p, "%s: ", pref);
for (int i = 0; i < n; i++) {
p += snprintf(p, p_end - p, "%d, ", (int) x[i]);
p += sprintf(p, "%d, ", (int) x[i]);
}
FARF(HIGH, "%s\n", str);
}

static inline void htp_dump_fp16_line(char * pref, const __fp16 * x, uint32_t n) {
char str[1024], *p = str, *p_end = str + sizeof(str);
p += snprintf(p, p_end - p, "%s: ", pref);
char str[1024], *p = str;
p += sprintf(p, "%s: ", pref);
for (int i = 0; i < n; i++) {
p += snprintf(p, p_end - p, "%.6f, ", (float) x[i]);
p += sprintf(p, "%.6f, ", (float) x[i]);
}
FARF(HIGH, "%s\n", str);
}

static inline void htp_dump_fp32_line(char * pref, const float * x, uint32_t n) {
char str[1024], *p = str, *p_end = str + sizeof(str);
p += snprintf(p, p_end - p, "%s: ", pref);
char str[1024], *p = str;
p += sprintf(p, "%s: ", pref);
for (int i = 0; i < n; i++) {
p += snprintf(p, p_end - p, "%.6f, ", x[i]);
p += sprintf(p, "%.6f, ", x[i]);
}
FARF(HIGH, "%s\n", str);
}

@@ -29,11 +29,10 @@ if (CXX_IS_HIPCC)
endif()
else()
# Forward (AMD)GPU_TARGETS to CMAKE_HIP_ARCHITECTURES.
if(AMDGPU_TARGETS AND NOT GPU_TARGETS)
set(GPU_TARGETS ${AMDGPU_TARGETS})
endif()
if(GPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES)
set(CMAKE_HIP_ARCHITECTURES ${GPU_TARGETS})
elseif(AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES)
set(CMAKE_HIP_ARCHITECTURES ${AMDGPU_TARGETS})
endif()
cmake_minimum_required(VERSION 3.21)
enable_language(HIP)

@@ -682,7 +682,6 @@ static inline bool ggml_can_fuse_subgraph(const struct ggml_cgraph * cgraph,
#endif

#ifdef __cplusplus
#include <array>
#include <initializer_list>
#include <vector>

@@ -698,21 +697,6 @@ inline bool ggml_can_fuse_subgraph(const struct ggml_cgraph * cgraph,
return ggml_can_fuse_subgraph(cgraph, start_idx, ops.size(), ops.begin(), outputs.begin(), outputs.size());
}

// Return true if the edges in the graph match expectations.
inline bool ggml_check_edges(const struct ggml_cgraph * cgraph,
int start_idx,
std::initializer_list<std::array<int, 3>> edges) {
for (const auto & edge : edges) {
int dst_node = edge[0];
int src_idx = edge[1];
int src_node = edge[2];
if (cgraph->nodes[start_idx + dst_node]->src[src_idx] != cgraph->nodes[start_idx + src_node]) {
return false;
}
}
return true;
}

// expose GGUF internals for test code
GGML_API size_t gguf_type_size(enum gguf_type type);
GGML_API struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params);

@@ -677,7 +677,7 @@ ggml_metal_pipeline_t ggml_metal_library_get_pipeline_mul_mm_id_map0(ggml_metal_
char name[256];

snprintf(base, 256, "kernel_mul_mm_id_map0_ne20_%d", ne20);
snprintf(name, 256, "%s_ne02=%d", base, ne02);
snprintf(name, 256, "%s", base);

ggml_metal_pipeline_t res = ggml_metal_library_get_pipeline(lib, name);
if (res) {
@@ -1332,12 +1332,11 @@ ggml_metal_pipeline_t ggml_metal_library_get_pipeline_rope(ggml_metal_library_t

const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;
const bool is_imrope = mode == GGML_ROPE_TYPE_IMROPE;
const bool is_vision = mode == GGML_ROPE_TYPE_VISION;

if (is_neox) {
snprintf(base, 256, "kernel_rope_neox_%s", ggml_type_name(op->src[0]->type));
} else if ((is_mrope || is_imrope) && !is_vision) {
} else if (is_mrope && !is_vision) {
GGML_ASSERT(op->src[1]->ne[0]*4 >= op->src[0]->ne[2]); // need at least 4 pos per token
snprintf(base, 256, "kernel_rope_multi_%s", ggml_type_name(op->src[0]->type));
} else if (is_vision) {
@@ -1347,20 +1346,14 @@ ggml_metal_pipeline_t ggml_metal_library_get_pipeline_rope(ggml_metal_library_t
snprintf(base, 256, "kernel_rope_norm_%s", ggml_type_name(op->src[0]->type));
}

snprintf(name, 256, "%s_imrope=%d", base, is_imrope ? 1 : 0);
snprintf(name, 256, "%s", base);

ggml_metal_pipeline_t res = ggml_metal_library_get_pipeline(lib, name);
if (res) {
return res;
}

ggml_metal_cv_t cv = ggml_metal_cv_init();

ggml_metal_cv_set_bool(cv, is_imrope, FC_ROPE + 0);

res = ggml_metal_library_compile_pipeline(lib, base, name, cv);

ggml_metal_cv_free(cv);
res = ggml_metal_library_compile_pipeline(lib, base, name, nullptr);

return res;
}

@@ -707,7 +707,6 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
if (op->src[0]->ne[0] != 32 &&
op->src[0]->ne[0] != 40 &&
op->src[0]->ne[0] != 64 &&
op->src[0]->ne[0] != 72 &&
op->src[0]->ne[0] != 80 &&
op->src[0]->ne[0] != 96 &&
op->src[0]->ne[0] != 112 &&

@@ -76,7 +76,6 @@
#define FC_FLASH_ATTN_EXT_VEC_REDUCE 500
#define FC_MUL_MV 600
#define FC_MUL_MM 700
#define FC_ROPE 800

// op-specific constants
#define OP_FLASH_ATTN_EXT_NQPTG 8

@@ -3709,8 +3709,6 @@ template [[host_name("kernel_mul_mv_bf16_f32_short")]] kernel mul_mv_t_t_short_
template [[host_name("kernel_mul_mv_bf16_bf16_short")]] kernel mul_mv_t_t_short_t kernel_mul_mv_t_t_short<bfloat, bfloat>;
#endif

constant bool FC_rope_is_imrope [[function_constant(FC_ROPE + 0)]];

static float rope_yarn_ramp(const float low, const float high, const int i0) {
const float y = (i0 / 2 - low) / max(0.001f, high - low);
return 1.0f - min(1.0f, max(0.0f, y));
@@ -3891,26 +3889,14 @@ kernel void kernel_rope_multi(
const int sector = ic % sect_dims;

float theta_base;
if (FC_rope_is_imrope) {
if (sector % 3 == 1 && sector < 3 * args.sect_1) { // h
theta_base = (float) pos[i2 + args.ne02 * 1];
} else if (sector % 3 == 2 && sector < 3 * args.sect_2) { // w
theta_base = (float) pos[i2 + args.ne02 * 2];
} else if (sector % 3 == 0 && sector < 3 * args.sect_0) { // t
theta_base = (float) pos[i2 + args.ne02 * 0];
} else { // e
theta_base = (float) pos[i2 + args.ne02 * 3];
}
if (sector < args.sect_0) {
theta_base = (float) pos[i2];
} else if (sector < sec_w01) {
theta_base = (float) pos[i2 + args.ne02];
} else if (sector < sec_w012) {
theta_base = (float) pos[i2 + args.ne02 * 2];
} else {
if (sector < args.sect_0) {
theta_base = (float) pos[i2];
} else if (sector < sec_w01) {
theta_base = (float) pos[i2 + args.ne02 * 1];
} else if (sector < sec_w012) {
theta_base = (float) pos[i2 + args.ne02 * 2];
} else {
theta_base = (float) pos[i2 + args.ne02 * 3];
}
theta_base = (float) pos[i2 + args.ne02 * 3];
}
// end of mrope

@@ -5362,7 +5348,6 @@ typedef decltype(kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, hal
template [[host_name("kernel_flash_attn_ext_f32_dk32_dv32" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_F32, float4x4, 1, dequantize_f32, float4x4, 1, dequantize_f32, 32, 32>;
template [[host_name("kernel_flash_attn_ext_f32_dk40_dv40" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_F32, float4x4, 1, dequantize_f32, float4x4, 1, dequantize_f32, 40, 40>;
template [[host_name("kernel_flash_attn_ext_f32_dk64_dv64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_F32, float4x4, 1, dequantize_f32, float4x4, 1, dequantize_f32, 64, 64>;
template [[host_name("kernel_flash_attn_ext_f32_dk72_dv72" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_F32, float4x4, 1, dequantize_f32, float4x4, 1, dequantize_f32, 72, 72>;
template [[host_name("kernel_flash_attn_ext_f32_dk80_dv80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_F32, float4x4, 1, dequantize_f32, float4x4, 1, dequantize_f32, 80, 80>;
template [[host_name("kernel_flash_attn_ext_f32_dk96_dv96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_F32, float4x4, 1, dequantize_f32, float4x4, 1, dequantize_f32, 96, 96>;
template [[host_name("kernel_flash_attn_ext_f32_dk112_dv112")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_F32, float4x4, 1, dequantize_f32, float4x4, 1, dequantize_f32, 112, 112>;
@@ -5375,7 +5360,6 @@ template [[host_name("kernel_flash_attn_ext_f32_dk576_dv512")]] kernel flash_at
template [[host_name("kernel_flash_attn_ext_f16_dk32_dv32" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 32, 32>;
template [[host_name("kernel_flash_attn_ext_f16_dk40_dv40" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 40, 40>;
template [[host_name("kernel_flash_attn_ext_f16_dk64_dv64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 64, 64>;
template [[host_name("kernel_flash_attn_ext_f16_dk72_dv72" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 72, 72>;
template [[host_name("kernel_flash_attn_ext_f16_dk80_dv80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 80, 80>;
template [[host_name("kernel_flash_attn_ext_f16_dk96_dv96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 96, 96>;
template [[host_name("kernel_flash_attn_ext_f16_dk112_dv112")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, half4x4, 1, dequantize_f16, half4x4, 1, dequantize_f16, 112, 112>;
@@ -5389,7 +5373,6 @@ template [[host_name("kernel_flash_attn_ext_f16_dk576_dv512")]] kernel flash_at
template [[host_name("kernel_flash_attn_ext_bf16_dk32_dv32" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_BF, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 32, 32>;
template [[host_name("kernel_flash_attn_ext_bf16_dk40_dv40" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_BF, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 40, 40>;
template [[host_name("kernel_flash_attn_ext_bf16_dk64_dv64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_BF, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 64, 64>;
template [[host_name("kernel_flash_attn_ext_bf16_dk72_dv72" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_BF, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 72, 72>;
template [[host_name("kernel_flash_attn_ext_bf16_dk80_dv80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_BF, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 80, 80>;
template [[host_name("kernel_flash_attn_ext_bf16_dk96_dv96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_BF, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 96, 96>;
template [[host_name("kernel_flash_attn_ext_bf16_dk112_dv112")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES_BF, bfloat4x4, 1, dequantize_bf16, bfloat4x4, 1, dequantize_bf16, 112, 112>;
@@ -5403,7 +5386,6 @@ template [[host_name("kernel_flash_attn_ext_bf16_dk576_dv512")]] kernel flash_at
template [[host_name("kernel_flash_attn_ext_q4_0_dk32_dv32" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 32, 32>;
template [[host_name("kernel_flash_attn_ext_q4_0_dk40_dv40" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 40, 40>;
template [[host_name("kernel_flash_attn_ext_q4_0_dk64_dv64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 64, 64>;
template [[host_name("kernel_flash_attn_ext_q4_0_dk72_dv72" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 72, 72>;
template [[host_name("kernel_flash_attn_ext_q4_0_dk80_dv80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 80, 80>;
template [[host_name("kernel_flash_attn_ext_q4_0_dk96_dv96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 96, 96>;
template [[host_name("kernel_flash_attn_ext_q4_0_dk112_dv112")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_0, 2, dequantize_q4_0, block_q4_0, 2, dequantize_q4_0, 112, 112>;
@@ -5416,7 +5398,6 @@ template [[host_name("kernel_flash_attn_ext_q4_0_dk576_dv512")]] kernel flash_at
template [[host_name("kernel_flash_attn_ext_q4_1_dk32_dv32" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 32, 32>;
template [[host_name("kernel_flash_attn_ext_q4_1_dk40_dv40" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 40, 40>;
template [[host_name("kernel_flash_attn_ext_q4_1_dk64_dv64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 64, 64>;
template [[host_name("kernel_flash_attn_ext_q4_1_dk72_dv72" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 72, 72>;
template [[host_name("kernel_flash_attn_ext_q4_1_dk80_dv80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 80, 80>;
template [[host_name("kernel_flash_attn_ext_q4_1_dk96_dv96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 96, 96>;
template [[host_name("kernel_flash_attn_ext_q4_1_dk112_dv112")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q4_1, 2, dequantize_q4_1, block_q4_1, 2, dequantize_q4_1, 112, 112>;
@@ -5429,7 +5410,6 @@ template [[host_name("kernel_flash_attn_ext_q4_1_dk576_dv512")]] kernel flash_at
template [[host_name("kernel_flash_attn_ext_q5_0_dk32_dv32" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 32, 32>;
template [[host_name("kernel_flash_attn_ext_q5_0_dk40_dv40" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 40, 40>;
template [[host_name("kernel_flash_attn_ext_q5_0_dk64_dv64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 64, 64>;
template [[host_name("kernel_flash_attn_ext_q5_0_dk72_dv72" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 72, 72>;
template [[host_name("kernel_flash_attn_ext_q5_0_dk80_dv80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 80, 80>;
template [[host_name("kernel_flash_attn_ext_q5_0_dk96_dv96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 96, 96>;
template [[host_name("kernel_flash_attn_ext_q5_0_dk112_dv112")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_0, 2, dequantize_q5_0, block_q5_0, 2, dequantize_q5_0, 112, 112>;
@@ -5442,7 +5422,6 @@ template [[host_name("kernel_flash_attn_ext_q5_0_dk576_dv512")]] kernel flash_at
template [[host_name("kernel_flash_attn_ext_q5_1_dk32_dv32" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 32, 32>;
template [[host_name("kernel_flash_attn_ext_q5_1_dk40_dv40" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 40, 40>;
template [[host_name("kernel_flash_attn_ext_q5_1_dk64_dv64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 64, 64>;
template [[host_name("kernel_flash_attn_ext_q5_1_dk72_dv72" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 72, 72>;
template [[host_name("kernel_flash_attn_ext_q5_1_dk80_dv80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 80, 80>;
template [[host_name("kernel_flash_attn_ext_q5_1_dk96_dv96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 96, 96>;
template [[host_name("kernel_flash_attn_ext_q5_1_dk112_dv112")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q5_1, 2, dequantize_q5_1, block_q5_1, 2, dequantize_q5_1, 112, 112>;
@@ -5455,7 +5434,6 @@ template [[host_name("kernel_flash_attn_ext_q5_1_dk576_dv512")]] kernel flash_at
template [[host_name("kernel_flash_attn_ext_q8_0_dk32_dv32" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 32, 32>;
template [[host_name("kernel_flash_attn_ext_q8_0_dk40_dv40" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 40, 40>;
template [[host_name("kernel_flash_attn_ext_q8_0_dk64_dv64" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 64, 64>;
template [[host_name("kernel_flash_attn_ext_q8_0_dk72_dv72" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 72, 72>;
template [[host_name("kernel_flash_attn_ext_q8_0_dk80_dv80" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 80, 80>;
template [[host_name("kernel_flash_attn_ext_q8_0_dk96_dv96" )]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 96, 96>;
template [[host_name("kernel_flash_attn_ext_q8_0_dk112_dv112")]] kernel flash_attn_ext_t kernel_flash_attn_ext<FA_TYPES, block_q8_0, 2, dequantize_q8_0, block_q8_0, 2, dequantize_q8_0, 112, 112>;

@@ -6156,8 +6156,8 @@ static void ggml_cl_upscale(ggml_backend_t backend, const ggml_tensor * src0, gg
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(float), &sf3));
} else if (mode == GGML_SCALE_MODE_BILINEAR) {
if (mode_flags & GGML_SCALE_FLAG_ALIGN_CORNERS) {
sf0 = ne0 > 1 && ne00 > 1 ? (float)(ne0 - 1) / (ne00 - 1) : sf0;
sf1 = ne1 > 1 && ne01 > 1 ? (float)(ne1 - 1) / (ne01 - 1) : sf1;
sf0 = (float)(ne0 - 1) / (ne00 - 1);
sf1 = (float)(ne1 - 1) / (ne01 - 1);
pixel_offset = 0.0f;
}

@@ -8399,7 +8399,6 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const
const bool is_neox = mode & 2;
const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;
const bool is_vision = mode == GGML_ROPE_TYPE_VISION;
const int is_imrope = mode == GGML_ROPE_TYPE_IMROPE;

if (is_mrope) {
GGML_ASSERT(sections[0] > 0 || sections[1] > 0 || sections[2] > 0);
@@ -8490,14 +8489,9 @@ static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const
CL_CHECK(clSetKernelArg(kernel, 30, sizeof(float), &attn_factor));
CL_CHECK(clSetKernelArg(kernel, 31, sizeof(float), &beta_fast));
CL_CHECK(clSetKernelArg(kernel, 32, sizeof(float), &beta_slow));
// both mrope and vision kernels have sections
if (is_mrope || is_vision) {
CL_CHECK(clSetKernelArg(kernel, 33, sizeof(int32_t)*4, &sections));
}
// only mrope has is_imrope
if (is_mrope && !is_vision) {
CL_CHECK(clSetKernelArg(kernel, 34, sizeof(int), &is_imrope));
}

size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03};
size_t local_work_size[] = {(size_t)nth, 1, 1};

@@ -79,8 +79,8 @@ kernel void kernel_mul_mm_f16_f32_l4_lm(

for (int block = 0; block < ne00; block += BK) {
for (int l = 0; l < BM; l += loadstride_a) {
if (ir*BM + loadc_a + l < ne01) {
const int idx = pos_a + (loadc_a + l) * stride_a / LOAD_VEC_A + loadr_a;
if (loadc_a + l < ne01) {
const int idx = pos_a + (loadc_a + l) * stride_a / LOAD_VEC_A + loadr_a;
buf_a[(loadr_a * LOAD_VEC_A + 0) * BM + loadc_a + l] = src0[idx].s0;
buf_a[(loadr_a * LOAD_VEC_A + 1) * BM + loadc_a + l] = src0[idx].s1;
buf_a[(loadr_a * LOAD_VEC_A + 2) * BM + loadc_a + l] = src0[idx].s2;
@@ -94,7 +94,7 @@ kernel void kernel_mul_mm_f16_f32_l4_lm(
}

for (int l = 0; l < BN; l += loadstride_b) {
if (ic*BN + loadc_b + l < ne11) {
if (loadc_b + l < ne11) {
const int idx = pos_b + (loadc_b + l) * stride_b / LOAD_VEC_B + loadr_b;
buf_b[(loadr_b * LOAD_VEC_B + 0) * BN + loadc_b + l] = src1[idx].s0;
buf_b[(loadr_b * LOAD_VEC_B + 1) * BN + loadc_b + l] = src1[idx].s1;

@@ -79,7 +79,7 @@ kernel void kernel_mul_mm_f32_f32_l4_lm(

for (int block = 0; block < ne00; block += BK) {
for (int l = 0; l < BM; l += loadstride_a) {
if (ir*BM + loadc_a + l < ne01) {
if (loadc_a + l < ne01) {
const int idx = pos_a + (loadc_a + l) * stride_a / LOAD_VEC_A + loadr_a;
buf_a[(loadr_a * LOAD_VEC_A + 0) * BM + loadc_a + l] = src0[idx].s0;
buf_a[(loadr_a * LOAD_VEC_A + 1) * BM + loadc_a + l] = src0[idx].s1;
@@ -94,7 +94,7 @@ kernel void kernel_mul_mm_f32_f32_l4_lm(
}

for (int l = 0; l < BN; l += loadstride_b) {
if (ic*BN + loadc_b + l < ne11) {
if (loadc_b + l < ne11) {
const int idx = pos_b + (loadc_b + l) * stride_b / LOAD_VEC_B + loadr_b;
buf_b[(loadr_b * LOAD_VEC_B + 0) * BN + loadc_b + l] = src1[idx].s0;
buf_b[(loadr_b * LOAD_VEC_B + 1) * BN + loadc_b + l] = src1[idx].s1;

@@ -78,7 +78,7 @@ kernel void kernel_mul_mm_q8_0_f32_l4_lm(

for (int block = 0; block < ne00; block += BK) {
for (int l = 0; l < BM; l += loadstride_a) {
if (ir*BM + loadc_a + l < ne01) {
if (loadc_a + l < ne01) {
int idx = pos_a + (loadc_a + l) * stride_a / LOAD_VEC_A + loadr_a;
int ib = idx / 8;
int iqs = idx % 8;
@@ -101,7 +101,7 @@ kernel void kernel_mul_mm_q8_0_f32_l4_lm(
}

for (int l = 0; l < BN; l += loadstride_b) {
if (ic*BN + loadc_b + l < ne11) {
if (loadc_b + l < ne11) {
int idx = pos_b + (loadc_b + l) * stride_b / LOAD_VEC_B + loadr_b;
buf_b[(loadr_b * LOAD_VEC_B + 0) * BN + loadc_b + l] = src1[idx].s0;
buf_b[(loadr_b * LOAD_VEC_B + 1) * BN + loadc_b + l] = src1[idx].s1;

@@ -392,8 +392,7 @@ kernel void kernel_rope_multi_f32(
float attn_factor,
float beta_fast,
float beta_slow,
int4 sections,
int is_imrope
int4 sections
) {
src0 = (global void*)((global char*)src0 + offset0);
src1 = (global int*)((global char*)src1 + offset1);
@@ -420,29 +419,17 @@ kernel void kernel_rope_multi_f32(
const int sector = (i0 / 2) % sect_dims;
float theta_base = 0.0f;

if (is_imrope) {
if (sector % 3 == 1 && sector < 3 * sections.s1) { // h
theta_base = (float) pos[i2 + ne02 * 1];
} else if (sector % 3 == 2 && sector < 3 * sections.s2) { // w
theta_base = (float) pos[i2 + ne02 * 2];
} else if (sector % 3 == 0 && sector < 3 * sections.s0) { // t
theta_base = (float) pos[i2 + ne02 * 0];
} else { // e
theta_base = (float) pos[i2 + ne02 * 3];
}
} else {
if (sector < sections.s0) {
theta_base = pos[i2];
}
else if (sector >= sections.s0 && sector < sec_w) {
theta_base = pos[i2 + ne2 * 1];
}
else if (sector >= sec_w && sector < sec_w + sections.s2) {
theta_base = pos[i2 + ne2 * 2];
}
else if (sector >= sec_w + sections.s2) {
theta_base = pos[i2 + ne2 * 3];
}
if (sector < sections.s0) {
theta_base = pos[i2];
}
else if (sector >= sections.s0 && sector < sec_w) {
theta_base = pos[i2 + ne2 * 1];
}
else if (sector >= sec_w && sector < sec_w + sections.s2) {
theta_base = pos[i2 + ne2 * 2];
}
else if (sector >= sec_w + sections.s2) {
theta_base = pos[i2 + ne2 * 3];
}

const float theta = theta_base * pow(freq_base, inv_ndims*i0);
@@ -503,8 +490,7 @@ kernel void kernel_rope_multi_f16(
float attn_factor,
float beta_fast,
float beta_slow,
int4 sections,
int is_imrope
int4 sections
) {
src0 = (global void*)((global char*)src0 + offset0);
src1 = (global int*)((global char*)src1 + offset1);
@@ -531,29 +517,17 @@ kernel void kernel_rope_multi_f16(
const int sector = (i0 / 2) % sect_dims;
float theta_base = 0.0f;

if (is_imrope) {
if (sector % 3 == 1 && sector < 3 * sections.s1) { // h
theta_base = (float) pos[i2 + ne02 * 1];
} else if (sector % 3 == 2 && sector < 3 * sections.s2) { // w
theta_base = (float) pos[i2 + ne02 * 2];
} else if (sector % 3 == 0 && sector < 3 * sections.s0) { // t
theta_base = (float) pos[i2 + ne02 * 0];
} else { // e
theta_base = (float) pos[i2 + ne02 * 3];
}
} else {
if (sector < sections.s0) {
theta_base = pos[i2];
}
else if (sector >= sections.s0 && sector < sec_w) {
theta_base = pos[i2 + ne2 * 1];
}
else if (sector >= sec_w && sector < sec_w + sections.s2) {
theta_base = pos[i2 + ne2 * 2];
}
else if (sector >= sec_w + sections.s2) {
theta_base = pos[i2 + ne2 * 3];
}
if (sector < sections.s0) {
theta_base = pos[i2];
}
else if (sector >= sections.s0 && sector < sec_w) {
theta_base = pos[i2 + ne2 * 1];
}
else if (sector >= sec_w && sector < sec_w + sections.s2) {
theta_base = pos[i2 + ne2 * 2];
}
else if (sector >= sec_w + sections.s2) {
theta_base = pos[i2 + ne2 * 3];
}

const float theta = theta_base * pow(freq_base, inv_ndims*i0);

@@ -32,10 +32,8 @@
#include "pad.hpp"
#include "quantize.hpp"
#include "quants.hpp"
#include "roll.hpp"
#include "rope.hpp"
#include "set_rows.hpp"
#include "ssm_conv.hpp"
#include "softmax.hpp"
#include "tsembd.hpp"
#include "wkv.hpp"

@@ -11,13 +11,9 @@
//

#include "concat.hpp"
#include "common.hpp"

static inline size_t elem_size(ggml_type t) {
return ggml_type_size(t) / ggml_blck_size(t);
}

template <typename T>
static void concat_T_dim0(const T *x, const T *y, T *dst,
static void concat_f32_dim0(const float *x, const float *y, float *dst,
const int ne0, const int ne00,
const sycl::nd_item<3> &item_ct1) {
int nidx = item_ct1.get_local_id(2) +
@@ -40,8 +36,7 @@ static void concat_T_dim0(const T *x, const T *y, T *dst,
}
}

template <typename T>
static void concat_T_dim1(const T *x, const T *y, T *dst,
static void concat_f32_dim1(const float *x, const float *y, float *dst,
const int ne0, const int ne01,
const sycl::nd_item<3> &item_ct1) {
int nidx = item_ct1.get_local_id(2) +
@@ -64,8 +59,7 @@ static void concat_T_dim1(const T *x, const T *y, T *dst,
}
}

template <typename T>
static void concat_T_dim2(const T *x, const T *y, T *dst,
static void concat_f32_dim2(const float *x, const float *y, float *dst,
const int ne0, const int ne02,
const sycl::nd_item<3> &item_ct1) {
int nidx = item_ct1.get_local_id(2) +
@@ -88,35 +82,45 @@ static void concat_T_dim2(const T *x, const T *y, T *dst,
}
}

template <typename T>
static void concat_T_sycl(const T *x, const T *y, T *dst,
static void concat_f32_sycl(const float *x, const float *y, float *dst,
int ne00, int ne01, int ne02, int ne0, int ne1,
int ne2, int dim, queue_ptr stream) {
int num_blocks = (ne0 + SYCL_CONCAT_BLOCK_SIZE - 1) / SYCL_CONCAT_BLOCK_SIZE;
sycl::range<3> gridDim(ne2, ne1, num_blocks);
switch (dim) {
case 0:
stream->parallel_for(sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE),
sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)),
[=](sycl::nd_item<3> item_ct1) { concat_T_dim0<T>(x, y, dst, ne0, ne00, item_ct1); });
break;
stream->parallel_for(
sycl::nd_range<3>(gridDim *
sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE),
sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)),
[=](sycl::nd_item<3> item_ct1) {
concat_f32_dim0(x, y, dst, ne0, ne00, item_ct1);
});
break;
case 1:
stream->parallel_for(sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE),
sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)),
[=](sycl::nd_item<3> item_ct1) { concat_T_dim1<T>(x, y, dst, ne0, ne01, item_ct1); });
break;
stream->parallel_for(
sycl::nd_range<3>(gridDim *
sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE),
sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)),
[=](sycl::nd_item<3> item_ct1) {
concat_f32_dim1(x, y, dst, ne0, ne01, item_ct1);
});
break;
// dim >=2 will be dispatched to the default path
default:
stream->parallel_for(sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE),
sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)),
[=](sycl::nd_item<3> item_ct1) { concat_T_dim2<T>(x, y, dst, ne0, ne02, item_ct1); });
break;
stream->parallel_for(
sycl::nd_range<3>(gridDim *
sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE),
sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)),
[=](sycl::nd_item<3> item_ct1) {
concat_f32_dim2(x, y, dst, ne0, ne02, item_ct1);
});
break;
}
}

// non-contiguous kernel (slow)
template<typename T>
static void concat_T_sycl_non_cont(
static void concat_f32_sycl_non_cont(
queue_ptr stream, const char *src0, const char *src1, char *dst,
int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne03, uint64_t nb00,
uint64_t nb01, uint64_t nb02, uint64_t nb03, int64_t /*ne10*/,
@@ -133,25 +137,24 @@ static void concat_T_sycl_non_cont(
int64_t o[4] = { 0, 0, 0, 0 };
o[dim] = dim == 0 ? ne00 : (dim == 1 ? ne01 : (dim == 2 ? ne02 : ne03));

const T * x;
const float * x;

for (int i0 = item_ct1.get_local_id(2); i0 < ne0; i0 += item_ct1.get_local_range(2)) {
if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
x = (const T *) (src0 + (i3) *nb03 + (i2) *nb02 + (i1) *nb01 + (i0) *nb00);
x = (const float *) (src0 + (i3) *nb03 + (i2) *nb02 + (i1) *nb01 + (i0) *nb00);
} else {
x = (const T *) (src1 + (i3 - o[3]) * nb13 + (i2 - o[2]) * nb12 + (i1 - o[1]) * nb11 +
x = (const float *) (src1 + (i3 - o[3]) * nb13 + (i2 - o[2]) * nb12 + (i1 - o[1]) * nb11 +
(i0 - o[0]) * nb10);
}

T *y = (T *)(dst + i3 * nb3 + i2 * nb2 + i1 * nb1 + i0 * nb0);
float *y = (float *)(dst + i3 * nb3 + i2 * nb2 + i1 * nb1 + i0 * nb0);

*y = *x;
}
});
}

template <typename T>
void concat_impl_sycl(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
const ggml_tensor * src0 = dst->src[0];
const ggml_tensor * src1 = dst->src[1];
@@ -160,14 +163,15 @@ void concat_impl_sycl(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
const int32_t dim = ((int32_t *) dst->op_params)[0];

if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
const T * src0_d = (const T *) src0->data;
const T * src1_d = (const T *) src1->data;
T * dst_d = (T *) dst->data;
size_t type_size = elem_size(dst->type);
const float * src0_d = (const float *) src0->data;
const float * src1_d = (const float *) src1->data;

float * dst_d = (float *) dst->data;

if (dim != 3) {
for (int i3 = 0; i3 < dst->ne[3]; i3++) {
concat_T_sycl<T>(src0_d + i3 * (src0->nb[3] / type_size), src1_d + i3 * (src1->nb[3] / type_size),
dst_d + i3 * (dst->nb[3] / type_size), src0->ne[0], src0->ne[1], src0->ne[2], dst->ne[0],
concat_f32_sycl(src0_d + i3 * (src0->nb[3] / 4), src1_d + i3 * (src1->nb[3] / 4),
dst_d + i3 * (dst->nb[3] / 4), src0->ne[0], src0->ne[1], src0->ne[2], dst->ne[0],
dst->ne[1], dst->ne[2], dim, stream);
}
} else {
@@ -175,28 +179,13 @@ void concat_impl_sycl(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
const size_t size1 = ggml_nbytes(src1);

SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(dst_d, src0_d, size0).wait()));
SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(dst_d + size0 / type_size, src1_d, size1).wait()));
SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(dst_d + size0 / 4, src1_d, size1).wait()));
}
} else {
concat_T_sycl_non_cont<T>(stream, (const char *) src0->data, (const char *) src1->data, (char *) dst->data,
concat_f32_sycl_non_cont(stream, (const char *) src0->data, (const char *) src1->data, (char *) dst->data,
src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0->nb[0], src0->nb[1],
src0->nb[2], src0->nb[3], src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3], dst->ne[0], dst->ne[1], dst->ne[2],
dst->ne[3], dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3], dim);
}
}

void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {

switch (dst->type) {
case GGML_TYPE_F32:
concat_impl_sycl<float>(ctx, dst);
break;
case GGML_TYPE_I32:
concat_impl_sycl<int32_t>(ctx, dst);
break;
default:
GGML_ASSERT(false && "ggml_sycl_op_concat: unsupported type");
break;
}
}

@@ -42,16 +42,13 @@
#include "ggml-sycl/backend.hpp"
#include "ggml-sycl/common.hpp"
#include "ggml-sycl/element_wise.hpp"
#include "ggml-sycl/norm.hpp"
#include "ggml-sycl/presets.hpp"
#include "ggml-sycl/gemm.hpp"
#include "ggml-sycl/set_rows.hpp"
#include "ggml-sycl/set.hpp"
#include "ggml-sycl/sycl_hw.hpp"
#include "ggml-sycl/getrows.hpp"
#include "ggml-sycl/repeat_back.hpp"
#include "ggml-sycl/quantize.hpp"
#include "ggml-sycl/ssm_conv.hpp"
#include "ggml.h"

static bool g_sycl_loaded = false;
@@ -2618,10 +2615,6 @@ catch (sycl::exception const &exc) {
std::exit(1);
}

static void ggml_sycl_repeat_back(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
ggml_sycl_op_repeat_back(ctx, dst);
}

static void ggml_sycl_get_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
@@ -2638,11 +2631,6 @@ static void ggml_sycl_rms_norm(ggml_backend_sycl_context & ctx, ggml_tensor * ds
ggml_sycl_op_rms_norm(ctx, dst);
}

static void ggml_sycl_rms_norm_back(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
ggml_sycl_op_rms_norm_back(ctx, dst);
}

static void ggml_sycl_l2_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
ggml_sycl_op_l2_norm(ctx, dst);
@@ -3691,9 +3679,6 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
case GGML_OP_REPEAT:
ggml_sycl_repeat(ctx, dst);
break;
case GGML_OP_REPEAT_BACK:
ggml_sycl_repeat_back(ctx, dst);
break;
case GGML_OP_GET_ROWS:
ggml_sycl_get_rows(ctx, dst);
break;
@@ -3833,9 +3818,6 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
case GGML_OP_LEAKY_RELU:
ggml_sycl_leaky_relu(ctx, dst);
break;
case GGML_OP_RMS_NORM_BACK:
ggml_sycl_rms_norm_back(ctx, dst);
break;
case GGML_OP_RMS_NORM:
ggml_sycl_rms_norm(ctx, dst);
break;
@@ -3931,11 +3913,6 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
case GGML_OP_GATED_LINEAR_ATTN:
ggml_sycl_op_gated_linear_attn(ctx, dst);
break;
case GGML_OP_SSM_CONV:
ggml_sycl_ssm_conv(ctx, dst);
case GGML_OP_ROLL:
ggml_sycl_roll(ctx, dst);
break;
case GGML_OP_ARANGE:
ggml_sycl_arange(ctx, dst);
break;
@@ -4534,12 +4511,11 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
}
return false;
}
case GGML_OP_REPEAT_BACK:
case GGML_OP_CONCAT:
{
ggml_type src0_type = op->src[0]->type;
return src0_type == GGML_TYPE_F32;
return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
}
case GGML_OP_CONCAT:
case GGML_OP_DUP:
case GGML_OP_ARGMAX:
case GGML_OP_NONE:
@@ -4576,8 +4552,6 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
return ggml_is_contiguous(op->src[0]);
case GGML_OP_RMS_NORM:
return ((op->src[0]->ne[0] % WARP_SIZE) == 0);
case GGML_OP_RMS_NORM_BACK:
return ((op->src[0]->ne[0] % WARP_SIZE) == 0);
case GGML_OP_SCALE:
return true;
case GGML_OP_CONT:
@@ -4612,12 +4586,6 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_RWKV_WKV7:
case GGML_OP_GATED_LINEAR_ATTN:
return true;
case GGML_OP_SSM_CONV:
return op->type == GGML_TYPE_F32 &&
op->src[0]->type == GGML_TYPE_F32 &&
op->src[1]->type == GGML_TYPE_F32;
case GGML_OP_ROLL:
return op->type == GGML_TYPE_F32;
case GGML_OP_ARANGE:
return op->type == GGML_TYPE_F32;
default:

@@ -480,162 +480,6 @@ void ggml_sycl_op_rms_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
rms_norm_f32_sycl(src0_dd, dst_dd, ne00, ne01, ne02, ne03, s01, s02, s03, eps, main_stream, ctx.device);
}

void ggml_sycl_op_rms_norm_back(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);

GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); // dz
GGML_ASSERT(dst->src[1]->type == GGML_TYPE_F32); // x
GGML_ASSERT(dst->type == GGML_TYPE_F32);

float eps = 1e-5f;
std::memcpy(&eps, dst->op_params, sizeof(float));
if (!(eps > 0.0f) || !std::isfinite(eps)) eps = 1e-5f;

const float * g_base = static_cast<const float *>(dst->src[0]->data); // dz
const float * x_base = static_cast<const float *>(dst->src[1]->data); // x
float * dx_base = static_cast< float *>(dst->data);

const int64_t D = dst->ne[0];
const int64_t n1 = dst->ne[1], n2 = dst->ne[2], n3 = dst->ne[3]; (void) n3;
const int64_t N = ggml_nrows(dst);
if (D == 0 || N == 0) return;

const ggml_tensor *G = dst->src[0];
const ggml_tensor *X = dst->src[1];
const int ts = (int) ggml_type_size(X->type);
GGML_ASSERT((size_t) X->nb[0] == (size_t) ts);
GGML_ASSERT((size_t) G->nb[0] == (size_t) ts);
GGML_ASSERT((size_t) dst->nb[0] == (size_t) ts);

const int64_t xs1 = X->nb[1] / ts, xs2 = X->nb[2] / ts, xs3 = X->nb[3] / ts;
const int64_t gs1 = G->nb[1] / ts, gs2 = G->nb[2] / ts, gs3 = G->nb[3] / ts;
const int64_t ds1 = dst->nb[1] / ts, ds2 = dst->nb[2] / ts, ds3 = dst->nb[3] / ts;

dpct::queue_ptr q = ctx.stream();

// work-group size: multiple of WARP_SIZE, capped by device and 256, and not larger than D
const int device_max_wg = ggml_sycl_info().max_work_group_sizes[ctx.device];
auto roundup = [](int v, int m) { return ((v + m - 1) / m) * m; };
int wg_cap = 256;
if (device_max_wg > 0) wg_cap = std::min(wg_cap, device_max_wg);
int WG = std::max(WARP_SIZE, std::min(roundup((int)std::min<int64_t>(D, wg_cap), WARP_SIZE), wg_cap));

// FP32 path: per-thread compensated accumulation + hierarchical reduction
q->submit([&](sycl::handler &cgh) {
const int nwarps_loc = std::max(1, WG / WARP_SIZE);
// store one partial value per warp (xx and xg) for cross-warp reduction
auto l_xx = sycl::local_accessor<sycl::float2, 1>(sycl::range<1>(nwarps_loc), cgh);
auto l_xg = sycl::local_accessor<sycl::float2, 1>(sycl::range<1>(nwarps_loc), cgh);

cgh.parallel_for(
sycl::nd_range<3>(sycl::range<3>(1, 1, N) * sycl::range<3>(1, 1, WG),
sycl::range<3>(1, 1, WG)),
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
const int row = item_ct1.get_group(2);
const int tid = item_ct1.get_local_id(2);

const int64_t i1 = row % n1;
const int64_t i2 = (row / n1) % n2;
const int64_t i3 = row / (n1 * n2);

const float *__restrict x_row = x_base + i3 * xs3 + i2 * xs2 + i1 * xs1;
const float *__restrict g_row = g_base + i3 * gs3 + i2 * gs2 + i1 * gs1;
float *__restrict d_row = dx_base + i3 * ds3 + i2 * ds2 + i1 * ds1;

// per-thread accumulation (compensated by default)
float sum_xx = 0.f, sum_xg = 0.f;
#ifndef GGML_SYCL_RMS_BACK_FAST
float c_xx = 0.f, c_xg = 0.f;
#endif
for (int64_t col = tid; col < D; col += WG) {
const float xv = x_row[col];
const float gv = g_row[col];
#ifdef GGML_SYCL_RMS_BACK_FAST
sum_xx += xv * xv;
sum_xg += xv * gv;
#else
float y1 = xv * xv - c_xx;
float t1 = sum_xx + y1;
c_xx = (t1 - sum_xx) - y1;
sum_xx = t1;

float y2 = xv * gv - c_xg;
float t2 = sum_xg + y2;
c_xg = (t2 - sum_xg) - y2;
sum_xg = t2;
#endif
}

// warp-level reduction
sycl::float2 xx = sycl::float2(sum_xx,
#ifndef GGML_SYCL_RMS_BACK_FAST
c_xx
#else
0.f
#endif
);
sycl::float2 xg = sycl::float2(sum_xg,
#ifndef GGML_SYCL_RMS_BACK_FAST
c_xg
#else
0.f
#endif
);
xx = warp_reduce_sum(xx, item_ct1);
xg = warp_reduce_sum(xg, item_ct1);

// cross-warp reduction using local memory (single barrier)
const auto sub_group = item_ct1.get_sub_group();
const auto sg_id = sub_group.get_group_linear_id();
const auto wi_in_sg = sub_group.get_local_linear_id();
const int nthreads = item_ct1.get_local_range(2);
const int nwarps = nthreads / WARP_SIZE;

sycl::float2 xx_total = xx;
sycl::float2 xg_total = xg;
if (nwarps > 1) {
if (wi_in_sg == 0) {
l_xx[sg_id] = xx;
l_xg[sg_id] = xg;
}
item_ct1.barrier(sycl::access::fence_space::local_space);

if (sg_id == 0) {
const unsigned wi_u = wi_in_sg;
sycl::float2 xx_first = (wi_u < static_cast<unsigned>(nwarps)) ? l_xx[wi_u] : sycl::float2(0.f, 0.f);
sycl::float2 xg_first = (wi_u < static_cast<unsigned>(nwarps)) ? l_xg[wi_u] : sycl::float2(0.f, 0.f);
xx_total = warp_reduce_sum(xx_first, item_ct1);
xg_total = warp_reduce_sum(xg_first, item_ct1);
} else {
// other subgroups keep their local totals; they'll be ignored
xx_total = xx;
xg_total = xg;
}
// ensure all threads see the first-subgroup result via broadcast below
}

// compute inv_r and coeff once per row and broadcast to the whole work-group
float inv_r = 0.f;
float coeff = 0.f;
if (tid == 0) {
const float sum_xx_f = xx_total.x() + xx_total.y();
const float sum_xdz_f = xg_total.x() + xg_total.y();
const float mean_eps = sum_xx_f / (float) D + eps;
const float sum_eps = sum_xx_f + eps * (float) D;
inv_r = sycl::rsqrt(mean_eps);
coeff = -sum_xdz_f / sum_eps;
}
inv_r = sycl::group_broadcast(item_ct1.get_group(), inv_r);
coeff = sycl::group_broadcast(item_ct1.get_group(), coeff);

for (int64_t col = tid; col < D; col += WG) {
d_row[col] = (g_row[col] + coeff * x_row[col]) * inv_r;
}
});
});

}

void ggml_sycl_op_l2_norm(ggml_backend_sycl_context& ctx, ggml_tensor* dst) {
|
||||
|
||||
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
|
||||
|
||||
@@ -19,8 +19,6 @@ void ggml_sycl_op_norm(ggml_backend_sycl_context& ctx, ggml_tensor* dst);
|
||||
|
||||
void ggml_sycl_op_rms_norm(ggml_backend_sycl_context& ctx, ggml_tensor* dst);
|
||||
|
||||
void ggml_sycl_op_rms_norm_back(ggml_backend_sycl_context& ctx, ggml_tensor* dst);
|
||||
|
||||
void ggml_sycl_op_group_norm(ggml_backend_sycl_context& ctx, ggml_tensor* dst);
|
||||
|
||||
void ggml_sycl_op_l2_norm(ggml_backend_sycl_context& ctx, ggml_tensor* dst);
|
||||
|
||||
@@ -1,76 +0,0 @@
#include "repeat_back.hpp"

#include "common.hpp"

#define GGML_ASSERT_TENSOR_FITS_INT(t) \
    GGML_ASSERT((t)->ne[0] < INT_MAX && (t)->ne[1] < INT_MAX && (t)->ne[2] < INT_MAX && (t)->ne[3] < INT_MAX)

void ggml_sycl_op_repeat_back(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->type == GGML_TYPE_F32);

    const float * src0_dd = (const float *) dst->src[0]->data;
    float * dst_dd = (float *) dst->data;

    GGML_ASSERT_TENSOR_FITS_INT(dst);
    GGML_ASSERT_TENSOR_FITS_INT(dst->src[0]);

    const int ne0 = dst->ne[0], ne1 = dst->ne[1], ne2 = dst->ne[2], ne3 = dst->ne[3];
    const int ne00 = dst->src[0]->ne[0], ne01 = dst->src[0]->ne[1], ne02 = dst->src[0]->ne[2],
              ne03 = dst->src[0]->ne[3];

    const int nr0 = ne00 / ne0;
    const int nr1 = ne01 / ne1;
    const int nr2 = ne02 / ne2;
    const int nr3 = ne03 / ne3;

    const int nb0 = dst->src[0]->nb[0];
    const int nb1 = dst->src[0]->nb[1];
    const int nb2 = dst->src[0]->nb[2];
    const int nb3 = dst->src[0]->nb[3];

    const char * base = (const char *) src0_dd;

    const size_t total = (size_t) ne0 * ne1 * ne2 * ne3;
    constexpr int BLOCK_SIZE = 256;
    const int num_blocks = (total + BLOCK_SIZE - 1) / BLOCK_SIZE;

    const float inv_ne0 = 1.0f / ne0;
    const float inv_ne_01 = 1.0f / (ne0 * ne1);
    const float inv_ne_012 = 1.0f / (ne0 * ne1 * ne2);
    const int repeat_count = nr0 * nr1 * nr2 * nr3;

    queue_ptr stream = ctx.stream();

    stream->parallel_for(
        sycl::nd_range<1>(sycl::range<1>(num_blocks * BLOCK_SIZE), sycl::range<1>(BLOCK_SIZE)),
        [=](sycl::nd_item<1> item_ct1) {
            const size_t i = item_ct1.get_global_linear_id();
            if (i >= total) {
                return;
            }

            const int i3 = (int) (i * inv_ne_012);
            const int i2 = (int) (i * inv_ne_01) - i3 * ne2;
            const int i1 = (int) (i * inv_ne0) - (int) (i * inv_ne_01) * ne1;
            const int i0 = i - (int) (i * inv_ne0) * ne0;

            int j0 = 0, j1 = 0, j2 = 0, j3 = 0;
            float acc = 0.0f;

            for (int j = 0; j < repeat_count; ++j) {
                const float * ptr = (const float *) (base + (i0 + j0 * ne0) * nb0 + (i1 + j1 * ne1) * nb1 +
                                                     (i2 + j2 * ne2) * nb2 + (i3 + j3 * ne3) * nb3);
                acc += *ptr;

                int carry = (++j0 >= nr0);
                j0 -= carry * nr0;
                carry = (carry && (++j1 >= nr1));
                j1 -= carry * nr1;
                carry = (carry && (++j2 >= nr2));
                j2 -= carry * nr2;
                j3 += carry;
            }
            dst_dd[i] = acc;
        });
}
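The inner loop of the deleted kernel enumerates every repeat offset (j0, j1, j2, j3) with a mixed-radix counter instead of a div/mod per step. A standalone sketch of just that counter (plain C++, hypothetical radices, not part of the diff):

#include <cstdio>

int main() {
    const int nr0 = 2, nr1 = 3, nr2 = 1, nr3 = 2;  // per-dimension repeat counts
    int j0 = 0, j1 = 0, j2 = 0, j3 = 0;
    for (int j = 0; j < nr0 * nr1 * nr2 * nr3; ++j) {
        std::printf("(%d, %d, %d, %d)\n", j0, j1, j2, j3);
        // increment with carry, exactly as in the kernel: ++j1 only runs
        // when j0 wrapped, thanks to && short-circuiting
        int carry = (++j0 >= nr0);
        j0 -= carry * nr0;
        carry = (carry && (++j1 >= nr1));
        j1 -= carry * nr1;
        carry = (carry && (++j2 >= nr2));
        j2 -= carry * nr2;
        j3 += carry;
    }
    return 0;
}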
@@ -1,8 +0,0 @@
#ifndef GGML_SYCL_REPEAT_BACK_HPP
#define GGML_SYCL_REPEAT_BACK_HPP

#include "common.hpp"

void ggml_sycl_op_repeat_back(ggml_backend_sycl_context & ctx, ggml_tensor * dst);

#endif // GGML_SYCL_REPEAT_BACK_HPP
@@ -1,122 +0,0 @@
#include "roll.hpp"
#include "common.hpp"

using namespace sycl;

static inline int wrap_add(int i, int shift, int n) {
    int s = i + shift;
    return (s >= n) ? (s - n) : s;
}

static void kernel_roll_fused_i0_i1(
    queue &q,
    const float *src_d,
    float *dst_d,
    int ne0, int ne1, int ne2, int ne3,
    int sh0, int sh1, int sh2, int sh3)
{
    if (ne0 == 0 || ne1 == 0 || ne2 == 0 || ne3 == 0) return;

    const int stride1 = ne0;
    const int stride2 = ne0 * ne1;
    const int stride3 = ne0 * ne1 * ne2;

    const int shNe0 = (ne0 - sh0) % ne0;
    const int shNe1 = (ne1 - sh1) % ne1;
    const int shNe2 = (ne2 - sh2) % ne2;
    const int shNe3 = (ne3 - sh3) % ne3;

    const size_t g0 = (size_t) ne3;
    const size_t g1 = (size_t) ne2;
    const size_t g2 = (size_t) (ne1 * ne0);

    const range<3> global{ g0, g1, g2 };

    q.submit([&](handler &h) {
        h.parallel_for(global, [=](id<3> idx) {
            const int i3 = (int) idx[0];
            const int i2 = (int) idx[1];

            const int fused = (int) idx[2];
            const int i1 = fused / ne0;
            const int i0 = fused - i1 * ne0; // fused % ne0

            const int idx_dst = i0
                              + i1 * stride1
                              + i2 * stride2
                              + i3 * stride3;

            const int s0 = wrap_add(i0, shNe0, ne0);
            const int s1 = wrap_add(i1, shNe1, ne1);
            const int s2 = wrap_add(i2, shNe2, ne2);
            const int s3 = wrap_add(i3, shNe3, ne3);

            const int idx_src = s0
                              + s1 * stride1
                              + s2 * stride2
                              + s3 * stride3;

            dst_d[idx_dst] = src_d[idx_src];
        });
    });
}

void ggml_sycl_roll(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
    GGML_ASSERT(dst->type == GGML_TYPE_F32);

    const ggml_tensor *src = dst->src[0];
    GGML_ASSERT(src && src->type == GGML_TYPE_F32);

    const int ne0 = (int) dst->ne[0];
    const int ne1 = (int) dst->ne[1];
    const int ne2 = (int) dst->ne[2];
    const int ne3 = (int) dst->ne[3];

    const int32_t *params = (const int32_t *) dst->op_params;
    int shift0 = params[0];
    int shift1 = params[1];
    int shift2 = params[2];
    int shift3 = params[3];

    if ((shift0 | shift1 | shift2 | shift3) == 0) {
        const size_t nb = ggml_nbytes(src);
        queue *q = ctx.stream();
        SYCL_CHECK(CHECK_TRY_ERROR(q->memcpy(dst->data, src->data, nb)));
        return;
    }

    auto norm = [](int sh, int n) -> int {
        if (n <= 0) return 0;
        sh %= n;
        if (sh < 0) sh += n;
        return sh;
    };
    shift0 = norm(shift0, ne0);
    shift1 = norm(shift1, ne1);
    shift2 = norm(shift2, ne2);
    shift3 = norm(shift3, ne3);

    try {
        queue *q = ctx.stream();

        const float *src_d = (const float *) src->data;
        float *dst_d = (float *) dst->data;
        GGML_ASSERT(src_d && dst_d);

        kernel_roll_fused_i0_i1(
            *q, src_d, dst_d,
            ne0, ne1, ne2, ne3,
            shift0, shift1, shift2, shift3
        );
    } catch (const std::exception &e) {
        std::fprintf(stderr, "[SYCL-ROLL] ERROR: %s\n", e.what());
        throw;
    }
}
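The mapping behind wrap_add and the shNe* complements: rolling a dimension of size n by shift means dst[i] = src[(i + n - shift) % n], and precomputing shNe = (n - shift) % n turns the modulo in the hot loop into a single compare. A one-dimensional sketch (plain C++, hypothetical sizes, not part of the diff):

#include <cstdio>

static inline int wrap_add(int i, int shift, int n) {
    const int s = i + shift;
    return (s >= n) ? (s - n) : s;  // valid because 0 <= shift < n
}

int main() {
    const int n = 5, shift = 2;        // roll right by 2
    const int shNe = (n - shift) % n;  // complement shift, as in the kernel
    const int src[5] = {0, 1, 2, 3, 4};
    for (int i = 0; i < n; ++i) {
        std::printf("%d ", src[wrap_add(i, shNe, n)]);  // prints: 3 4 0 1 2
    }
    std::printf("\n");
    return 0;
}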
@@ -1,20 +0,0 @@
//
// MIT license
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: MIT
//

//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//

#ifndef GGML_SYCL_ROLL_HPP
#define GGML_SYCL_ROLL_HPP

#include "common.hpp"

void ggml_sycl_roll(ggml_backend_sycl_context & ctx, ggml_tensor *dst);

#endif // GGML_SYCL_ROLL_HPP
@@ -119,7 +119,7 @@ static void rope_multi(const T * x, T * dst, const int ne0, const int ne1, const
                       const size_t s2, const int n_dims, const int32_t * pos, const float freq_scale,
                       const float ext_factor, const float attn_factor, const rope_corr_dims corr_dims,
                       const float theta_scale, const float * freq_factors, const mrope_sections sections,
                       const bool is_imrope, const sycl::nd_item<3> & item_ct1) {
                       const sycl::nd_item<3> & item_ct1) {
    // get index pos
    const int i0 = 2 * (item_ct1.get_group(1) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1));
    if (i0 >= ne0) {
@@ -143,29 +143,17 @@ static void rope_multi(const T * x, T * dst, const int ne0, const int ne1, const

    float theta_base = 0.0;
    if (is_imrope) {
        if (sector % 3 == 1 && sector < 3 * sections.v[1]) {
            theta_base = pos[channel_x + ne2 * 1]*sycl::pow(theta_scale, i0/2.0f);
        } else if (sector % 3 == 2 && sector < 3 * sections.v[2]) {
            theta_base = pos[channel_x + ne2 * 2]*sycl::pow(theta_scale, i0/2.0f);
        } else if (sector % 3 == 0 && sector < 3 * sections.v[0]) {
            theta_base = pos[channel_x]*sycl::pow(theta_scale, i0/2.0f);
        } else {
            theta_base = pos[channel_x + ne2 * 3]*sycl::pow(theta_scale, i0/2.0f);
        }
    } else {
        if (sector < sections.v[0]) {
            theta_base = pos[channel_x]*sycl::pow(theta_scale, i0/2.0f);
        }
        else if (sector >= sections.v[0] && sector < sec_w) {
            theta_base = pos[channel_x + ne2 * 1]*sycl::pow(theta_scale, i0/2.0f);
        }
        else if (sector >= sec_w && sector < sec_w + sections.v[2]) {
            theta_base = pos[channel_x + ne2 * 2]*sycl::pow(theta_scale, i0/2.0f);
        }
        else if (sector >= sec_w + sections.v[2]) {
            theta_base = pos[channel_x + ne2 * 3]*sycl::pow(theta_scale, i0/2.0f);
        }
    }
    if (sector < sections.v[0]) {
        theta_base = pos[channel_x]*sycl::pow(theta_scale, i0/2.0f);
    }
    else if (sector >= sections.v[0] && sector < sec_w) {
        theta_base = pos[channel_x + ne2 * 1]*sycl::pow(theta_scale, i0/2.0f);
    }
    else if (sector >= sec_w && sector < sec_w + sections.v[2]) {
        theta_base = pos[channel_x + ne2 * 2]*sycl::pow(theta_scale, i0/2.0f);
    }
    else if (sector >= sec_w + sections.v[2]) {
        theta_base = pos[channel_x + ne2 * 3]*sycl::pow(theta_scale, i0/2.0f);
    }

    const float freq_factor = has_ff ? freq_factors[i0 / 2] : 1.0f;
@@ -293,7 +281,7 @@ static void rope_multi_sycl(const T * x, T * dst, const int ne0, const int ne1,
                            const size_t s2, const int n_dims, const int nr, const int32_t * pos,
                            const float freq_scale, const float freq_base, const float ext_factor,
                            const float attn_factor, const rope_corr_dims corr_dims, const float * freq_factors,
                            const mrope_sections sections, const bool is_imrope, queue_ptr stream) {
                            const mrope_sections sections, queue_ptr stream) {
    GGML_ASSERT(ne0 % 2 == 0);
    const sycl::range<3> block_dims(1, SYCL_ROPE_BLOCK_SIZE, 1);
    const int n_blocks_y = ceil_div(ne0, (2 * SYCL_ROPE_BLOCK_SIZE));
@@ -309,12 +297,12 @@ static void rope_multi_sycl(const T * x, T * dst, const int ne0, const int ne1,
    if (freq_factors == nullptr) {
        stream->parallel_for(nd_range, [=](sycl::nd_item<3> item_ct1) {
            rope_multi<T, false>(x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor,
                                 corr_dims, theta_scale, freq_factors, sections, is_imrope, item_ct1);
                                 corr_dims, theta_scale, freq_factors, sections, item_ct1);
        });
    } else {
        stream->parallel_for(nd_range, [=](sycl::nd_item<3> item_ct1) {
            rope_multi<T, true>(x, dst, ne0, ne1, ne2, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor,
                                corr_dims, theta_scale, freq_factors, sections, is_imrope, item_ct1);
                                corr_dims, theta_scale, freq_factors, sections, item_ct1);
        });
    }
}
@@ -393,7 +381,6 @@ inline void ggml_sycl_op_rope(ggml_backend_sycl_context & ctx, ggml_tensor *dst)

    const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
    const bool is_mrope = mode & GGML_ROPE_TYPE_MROPE;
    const bool is_imrope = mode == GGML_ROPE_TYPE_IMROPE;
    const bool is_vision = mode == GGML_ROPE_TYPE_VISION;

    if (is_mrope) {
@@ -435,11 +422,11 @@ inline void ggml_sycl_op_rope(ggml_backend_sycl_context & ctx, ggml_tensor *dst)
        if (dst->src[0]->type == GGML_TYPE_F16) {
            rope_multi_sycl((const sycl::half *)dst->src[0]->data, (sycl::half *)dst->data, ne00, ne01, ne02, s01,
                            s02, n_dims, nr, pos, freq_scale, freq_base, ext_factor, attn_factor, corr_dims,
                            freq_factors, sections, is_imrope, main_stream);
                            freq_factors, sections, main_stream);
        } else if (dst->src[0]->type == GGML_TYPE_F32) {
            rope_multi_sycl((const float *) dst->src[0]->data, (float *) dst->data, ne00, ne01, ne02, s01, s02, n_dims,
                            nr, pos, freq_scale, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections,
                            is_imrope, main_stream);
                            main_stream);
        } else {
            GGML_ABORT("Fatal error: Tensor type unsupported!");
        }
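In rope_multi's non-imrope branch above, each half-dimension "sector" selects one of four position streams (pos, pos + ne2, pos + 2*ne2, pos + 3*ne2) by comparing against running sums of the section sizes. A sketch of just that selection (plain C++; the section values are hypothetical, and sec_w is assumed to be sections[0] + sections[1], as the comparisons suggest):

#include <cstdio>

int main() {
    const int sections[4] = {2, 3, 2, 1};         // hypothetical mrope sections
    const int sec_w = sections[0] + sections[1];  // assumed definition
    const int n_half = sections[0] + sections[1] + sections[2] + sections[3];
    for (int sector = 0; sector < n_half; ++sector) {
        int stream;  // which pos[channel_x + ne2*k] feeds theta_base
        if (sector < sections[0]) {
            stream = 0;
        } else if (sector < sec_w) {
            stream = 1;
        } else if (sector < sec_w + sections[2]) {
            stream = 2;
        } else {
            stream = 3;
        }
        std::printf("sector %d -> position stream %d\n", sector, stream);
    }
    return 0;
}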
@@ -1,127 +0,0 @@
#include "ssm_conv.hpp"
#include "common.hpp"

#include <cstdio>

using namespace sycl;

static void kernel_ssm_conv(
    queue &q,
    const float *src_data,
    const float *weights,
    float *dst_data,
    int d_conv,
    int d_inner,
    int n_t,
    int n_s,
    int ncs __attribute__((unused)),
    int src_stride_inner,
    int src_stride_seq,
    int dst_stride_token,
    int dst_stride_seq
) {
    const size_t total_work = static_cast<size_t>(d_inner) * static_cast<size_t>(n_t) * static_cast<size_t>(n_s);
    const size_t work_group_size = 256;
    const size_t num_work_groups = (total_work + work_group_size - 1) / work_group_size;

    const range<1> global_range(num_work_groups * work_group_size);
    const range<1> local_range(work_group_size);

    q.submit([&](handler &h) {
        h.parallel_for(
            nd_range<1>(global_range, local_range),
            [=](nd_item<1> item) {
                const size_t idx = item.get_global_id(0);
                if (idx >= total_work) {
                    return;
                }

                const int channel = static_cast<int>(idx % d_inner);
                const int token = static_cast<int>((idx / d_inner) % n_t);
                const int seq = static_cast<int>(idx / (static_cast<size_t>(d_inner) * static_cast<size_t>(n_t)));

                const float *s = src_data
                    + static_cast<size_t>(seq) * static_cast<size_t>(src_stride_seq)
                    + static_cast<size_t>(channel) * static_cast<size_t>(src_stride_inner)
                    + static_cast<size_t>(token);

                const float *c = weights + static_cast<size_t>(channel) * static_cast<size_t>(d_conv);

                float sumf = 0.0f;
                for (int i0 = 0; i0 < d_conv; ++i0) {
                    sumf += s[i0] * c[i0];
                }

                const size_t dst_idx =
                    static_cast<size_t>(seq) * static_cast<size_t>(dst_stride_seq) +
                    static_cast<size_t>(token) * static_cast<size_t>(dst_stride_token) +
                    static_cast<size_t>(channel);

                dst_data[dst_idx] = sumf;
            }
        );
    });
}

void ggml_sycl_ssm_conv(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    ggml_tensor * src0 = dst->src[0];
    ggml_tensor * src1 = dst->src[1];

    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT(src1->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->type == GGML_TYPE_F32);

    const int d_conv = src1->ne[0];
    const int ncs = src0->ne[0];
    const int d_inner = src0->ne[1];
    const int n_t = dst->ne[1];
    const int n_s = dst->ne[2];

    GGML_ASSERT(src0->ne[0] == d_conv - 1 + n_t);
    GGML_ASSERT(src0->ne[1] == d_inner);
    GGML_ASSERT(src1->ne[1] == d_inner);

    GGML_ASSERT(dst->ne[0] == d_inner);
    GGML_ASSERT(dst->ne[1] == n_t);
    GGML_ASSERT(dst->ne[2] == n_s);

    GGML_ASSERT(src0->nb[0] == sizeof(float));
    GGML_ASSERT(src1->nb[0] == sizeof(float));

    GGML_ASSERT(src0->nb[1] == src0->ne[0] * static_cast<int>(sizeof(float)));

    const int src_stride_inner = ncs;
    const int src_stride_seq = ncs * d_inner;
    const int dst_stride_token = d_inner;
    const int dst_stride_seq = d_inner * n_t;

    try {
        queue *q = ctx.stream();

        const float *src_data = static_cast<const float *>(src0->data);
        const float *weights = static_cast<const float *>(src1->data);
        float *dst_data = static_cast<float *>(dst->data);

        GGML_ASSERT(src_data && weights && dst_data);

        kernel_ssm_conv(
            *q,
            src_data,
            weights,
            dst_data,
            d_conv,
            d_inner,
            n_t,
            n_s,
            ncs,
            src_stride_inner,
            src_stride_seq,
            dst_stride_token,
            dst_stride_seq
        );

    } catch (const std::exception &e) {
        std::fprintf(stderr, "[SYCL-SSM_CONV] ERROR: %s\n", e.what());
        throw;
    }
}
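Each work item above computes one (seq, channel, token) output as a d_conv-tap dot product over a window starting `token` elements into the padded source row, which is why the code asserts src0->ne[0] == d_conv - 1 + n_t. A single-channel sketch (plain C++, hypothetical sizes, not part of the diff):

#include <cstdio>
#include <vector>

int main() {
    const int d_conv = 4, n_t = 3;
    const int ncs = d_conv - 1 + n_t;          // padded row length, as asserted above
    std::vector<float> s(ncs, 1.0f);           // one channel's padded input row
    std::vector<float> c(d_conv, 0.25f);       // that channel's conv weights
    for (int token = 0; token < n_t; ++token) {
        float sumf = 0.0f;
        for (int i0 = 0; i0 < d_conv; ++i0) {  // same dot product as the kernel
            sumf += s[token + i0] * c[i0];
        }
        std::printf("token %d -> %g\n", token, sumf);
    }
    return 0;
}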
@@ -1,5 +0,0 @@
#pragma once

#include "common.hpp"

void ggml_sycl_ssm_conv(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
File diff suppressed because it is too large
@@ -14,7 +14,6 @@ layout (binding = 1) buffer D {int data_d[];};

layout (push_constant) uniform parameter {
    uint ncols;
    uint nrows;
    uint order;
} p;

@@ -27,9 +26,10 @@ void swap(uint idx0, uint idx1) {
    dst_row[idx1] = tmp;
}

void argsort(bool needs_bounds_check, const uint row) {
void argsort(bool needs_bounds_check) {
    // bitonic sort
    const int col = int(gl_LocalInvocationID.x);
    const uint row = gl_WorkGroupID.y;

    const uint row_offset = row * p.ncols;

@@ -72,16 +72,8 @@ void argsort(bool needs_bounds_check, const uint row) {

void main() {
    if (p.ncols == BLOCK_SIZE) {
        uint row = gl_WorkGroupID.y;
        while (row < p.nrows) {
            argsort(false, row);
            row += gl_WorkGroupSize.y * gl_NumWorkGroups.y;
        }
        argsort(false);
    } else {
        uint row = gl_WorkGroupID.y;
        while (row < p.nrows) {
            argsort(true, row);
            row += gl_WorkGroupSize.y * gl_NumWorkGroups.y;
        }
        argsort(true);
    }
}

@@ -437,7 +437,7 @@ vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
#if defined(DATA_A_MXFP4)
vec2 dequantize(uint ib, uint iqs, uint a_offset) {
    const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
    return vec2(kvalues_mxfp4[vui & 0xF], kvalues_mxfp4[vui >> 4]) * 0.5;
    return vec2(kvalues_mxfp4[vui & 0xF], kvalues_mxfp4[vui >> 4]);
}
vec4 dequantize4(uint ib, uint iqs, uint a_offset) {
    vec2 v0 = dequantize(ib, iqs, a_offset);
@@ -488,9 +488,9 @@ vec2 dequantize(uint ib, uint iqs, uint a_offset) {

    const uvec2 qs = uvec2(data_a[a_offset + ib].qs[qsi], data_a[a_offset + ib].qs[qsi + 1]);
    const uint scales = data_a[a_offset + ib].scales[scalesi];
    const vec2 dm = vec2(data_a[a_offset + ib].dm);
    const vec2 d = vec2(data_a[a_offset + ib].d);

    return dm.x * float(scales & 0xF) * vec2((qs >> qsshift) & 3) - dm.y * float(scales >> 4);
    return d.x * float(scales & 0xF) * vec2((qs >> qsshift) & 3) - d.y * float(scales >> 4);
}
vec2 get_dm(uint ib, uint a_offset) {
    return vec2(1, 0);
@@ -529,7 +529,7 @@ vec2 dequantize(uint ib, uint iqs, uint a_offset) {
    const uint is = 2 * n + b; // 0..7
    const uint qsi = n * 32 + (iqs % 16) * 2; // 0,2,4..126

    const vec2 loadd = vec2(data_a[a_offset + ib].dm);
    const vec2 loadd = vec2(data_a[a_offset + ib].d);

    const uint scidx0 = (is < 4) ? is : (is + 4);
    const uint scidx1 = (is < 4) ? is : (is - 4);
@@ -567,7 +567,7 @@ vec2 dequantize(uint ib, uint iqs, uint a_offset) {

    const uint8_t hm = uint8_t(1 << (iqs / 16));

    const vec2 loadd = vec2(data_a[a_offset + ib].dm);
    const vec2 loadd = vec2(data_a[a_offset + ib].d);

    const uint scidx0 = (is < 4) ? is : (is + 4);
    const uint scidx1 = (is < 4) ? is : (is - 4);

@@ -120,7 +120,7 @@ layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ2
float16_t dequantFuncQ2_K(const in decodeBufQ2_K bl, const in uint blockCoords[2], const in uint coordInBlock[2])
{
    decodeBufQ2_K_packed16 bl16 = decodeBufQ2_K_packed16(bl);
    const f16vec2 dm = bl.block.dm;
    const f16vec2 d = bl.block.d;
    const uint idx = coordInBlock[1];

    const uint scalesi = (idx & 0xF0) >> 4; // 0..15
@@ -131,7 +131,7 @@ float16_t dequantFuncQ2_K(const in decodeBufQ2_K bl, const in uint blockCoords[2
    qs = unpack8(qs)[idx & 1];

    const uint scales = bl.block.scales[scalesi];
    float16_t ret = dm.x * float16_t(scales & 0xF) * float16_t(qs) - dm.y * float16_t(scales >> 4);
    float16_t ret = d.x * float16_t(scales & 0xF) * float16_t(qs) - d.y * float16_t(scales >> 4);
    return ret;
}

@@ -680,7 +680,7 @@ float16_t dequantFuncMXFP4(const in decodeBufMXFP4 bl, const in uint blockCoords
    uint32_t qs = bl.block.qs[iqs];
    qs >>= shift;
    qs &= 0xF;
    float16_t ret = float16_t(kvalues_mxfp4[qs] * d * 0.5);
    float16_t ret = float16_t(kvalues_mxfp4[qs] * d);
    return ret;
}
#endif

@@ -26,7 +26,7 @@ void main() {
    const float d = e8m0_to_fp32(data_a[ib].e);

    [[unroll]] for (uint l = 0; l < 8; ++l) {
        data_b[b_idx + l +  0] = D_TYPE(d * 0.5 * float(kvalues_mxfp4[data_a[ib].qs[q_idx + l] & 0xF]));
        data_b[b_idx + l + 16] = D_TYPE(d * 0.5 * float(kvalues_mxfp4[data_a[ib].qs[q_idx + l] >> 4]));
        data_b[b_idx + l +  0] = D_TYPE(d * kvalues_mxfp4[data_a[ib].qs[q_idx + l] & 0xF]);
        data_b[b_idx + l + 16] = D_TYPE(d * kvalues_mxfp4[data_a[ib].qs[q_idx + l] >> 4]);
    }
}

@@ -24,8 +24,8 @@ void main() {
    const uint ql_idx = 32 * ip + il;
    const uint8_t qs = data_a[i].qs[32 * ip + il];

    FLOAT_TYPE dall = FLOAT_TYPE(data_a[i].dm.x);
    FLOAT_TYPE dmin = FLOAT_TYPE(data_a[i].dm.y);
    FLOAT_TYPE dall = FLOAT_TYPE(data_a[i].d.x);
    FLOAT_TYPE dmin = FLOAT_TYPE(data_a[i].d.y);
    data_b[y_idx +  0] = D_TYPE(dall * FLOAT_TYPE((data_a[i].scales[is+0] & 0xF) * ((qs >> 0) & 3)) - dmin * FLOAT_TYPE(data_a[i].scales[is+0] >> 4));
    data_b[y_idx + 32] = D_TYPE(dall * FLOAT_TYPE((data_a[i].scales[is+2] & 0xF) * ((qs >> 2) & 3)) - dmin * FLOAT_TYPE(data_a[i].scales[is+2] >> 4));
    data_b[y_idx + 64] = D_TYPE(dall * FLOAT_TYPE((data_a[i].scales[is+4] & 0xF) * ((qs >> 4) & 3)) - dmin * FLOAT_TYPE(data_a[i].scales[is+4] >> 4));
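The dm/d renames above all feed the same Q2_K dequantization rule: each scale byte packs a multiplier in the low nibble (applied to the super-block scale) and a minimum in the high nibble (applied to the super-block min), so value = d * (sc & 0xF) * q - dmin * (sc >> 4). A worked example (plain C++, hypothetical values, not part of the diff):

#include <cstdint>
#include <cstdio>

int main() {
    const float   d      = 0.5f;   // super-block scale (dall / dm.x / d.x above)
    const float   dmin   = 0.25f;  // super-block min   (dmin / dm.y / d.y above)
    const uint8_t scales = 0x3A;   // low nibble 0xA = scale, high nibble 0x3 = min
    const int     q      = 2;      // a 2-bit quant in [0, 3]
    const float   v      = d * (scales & 0xF) * q - dmin * (scales >> 4);
    std::printf("%g\n", v);        // 0.5 * 10 * 2 - 0.25 * 3 = 9.25
    return 0;
}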
Some files were not shown because too many files have changed in this diff