cont

ggml-ci
ggml : trying stuff (wip)
2026-04-16 16:27:32 +03:00 · 2025-04-04 12:02:20 +03:00 · 2025-04-04 11:33:44 +03:00 · 2025-04-03 10:49:44 +03:00 · 2025-04-03 10:49:36 +03:00 · 2025-04-03 09:32:55 +02:00
192 changed files with 23784 additions and 20107 deletions
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -676,6 +676,35 @@ jobs:
            -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO

+  macOS-latest-cmake-visionos:
+    runs-on: macos-latest
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: Dependencies
+        id: depends
+        continue-on-error: true
+        run: |
+          brew update
+
+      - name: Build
+        id: cmake_build
+        run: |
+          sysctl -a
+          cmake -B build -G Xcode \
+            -DGGML_METAL_USE_BF16=ON \
+            -DGGML_METAL_EMBED_LIBRARY=ON \
+            -DLLAMA_BUILD_EXAMPLES=OFF \
+            -DLLAMA_BUILD_TESTS=OFF \
+            -DLLAMA_BUILD_SERVER=OFF \
+            -DCMAKE_SYSTEM_NAME=visionOS \
+            -DCMAKE_OSX_DEPLOYMENT_TARGET=1.0 \
+            -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
+          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
+
  macOS-latest-swift:
    runs-on: macos-latest

@@ -774,7 +803,7 @@ jobs:
    env:
      OPENBLAS_VERSION: 0.3.23
      SDE_VERSION: 9.33.0-2024-01-07
-      VULKAN_VERSION: 1.4.304.1
+      VULKAN_VERSION: 1.4.309.0

    strategy:
      matrix:
--- a/README.md
+++ b/README.md
@@ -112,6 +112,8 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - [x] [RWKV-6](https://github.com/BlinkDL/RWKV-LM)
 - [x] [QRWKV-6](https://huggingface.co/recursal/QRWKV6-32B-Instruct-Preview-v0.1)
 - [x] [GigaChat-20B-A3B](https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct)
+- [X] [Trillion-7B-preview](https://huggingface.co/trillionlabs/Trillion-7B-preview)
+- [x] [Ling models](https://huggingface.co/collections/inclusionAI/ling-67c51c85b34a7ea0aba94c32)

 #### Multimodal

--- a/build-xcframework.sh
+++ b/build-xcframework.sh
@@ -432,8 +432,8 @@ cmake -B build-visionos -G Xcode \
    -DCMAKE_SYSTEM_NAME=visionOS \
    -DCMAKE_OSX_SYSROOT=xros \
    -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xros \
-    -DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 -Du_int=unsigned\ int -Du_char=unsigned\ char -Du_short=unsigned\ short ${COMMON_C_FLAGS}" \
-    -DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 -Du_int=unsigned\ int -Du_char=unsigned\ char -Du_short=unsigned\ short ${COMMON_CXX_FLAGS}" \
+    -DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \
+    -DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \
    -S .
 cmake --build build-visionos --config Release -- -quiet

@@ -445,8 +445,8 @@ cmake -B build-visionos-sim -G Xcode \
    -DCMAKE_SYSTEM_NAME=visionOS \
    -DCMAKE_OSX_SYSROOT=xrsimulator \
    -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xrsimulator \
-    -DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 -Du_int=unsigned\ int -Du_char=unsigned\ char -Du_short=unsigned\ short ${COMMON_C_FLAGS}" \
-    -DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 -Du_int=unsigned\ int -Du_char=unsigned\ char -Du_short=unsigned\ short ${COMMON_CXX_FLAGS}" \
+    -DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \
+    -DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \
    -S .
 cmake --build build-visionos-sim --config Release -- -quiet

--- a/ci/README.md
+++ b/ci/README.md
@@ -26,4 +26,43 @@ GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 # with SYCL support
 source /opt/intel/oneapi/setvars.sh
 GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
+
+# with MUSA support
+GG_BUILD_MUSA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 ```
+
+## Running MUSA CI in a Docker Container
+
+Assuming `$PWD` is the root of the `llama.cpp` repository, follow these steps to set up and run MUSA CI in a Docker container:
+
+### 1. Create a local directory to store cached models, configuration files and venv:
+
+```bash
+mkdir -p $HOME/llama.cpp/ci-cache
+```
+
+### 2. Create a local directory to store CI run results:
+
+```bash
+mkdir -p $HOME/llama.cpp/ci-results
+```
+
+### 3. Start a Docker container and run the CI:
+
+```bash
+docker run --privileged -it \
+    -v $HOME/llama.cpp/ci-cache:/ci-cache \
+    -v $HOME/llama.cpp/ci-results:/ci-results \
+    -v $PWD:/ws -w /ws \
+    mthreads/musa:rc3.1.1-devel-ubuntu22.04
+```
+
+Inside the container, execute the following commands:
+
+```bash
+apt update -y && apt install -y bc cmake ccache git python3.10-venv time unzip wget
+git config --global --add safe.directory /ws
+GG_BUILD_MUSA=1 bash ./ci/run.sh /ci-results /ci-cache
+```
+
+This setup ensures that the CI runs within an isolated Docker environment while maintaining cached files and results across runs.
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -16,6 +16,9 @@
 # # with VULKAN support
 # GG_BUILD_VULKAN=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 #
+# # with MUSA support
+# GG_BUILD_MUSA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
+#

 if [ -z "$2" ]; then
    echo "usage: $0 <output-dir> <mnt-dir>"
@@ -52,13 +55,22 @@ if [ ! -z ${GG_BUILD_SYCL} ]; then
        echo "source /opt/intel/oneapi/setvars.sh"
        exit 1
    fi
-
+    # Use only main GPU
+    export ONEAPI_DEVICE_SELECTOR="level_zero:0"
+    # Enable sysman for correct memory reporting
+    export ZES_ENABLE_SYSMAN=1
    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_SYCL=1 -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON"
 fi

 if [ ! -z ${GG_BUILD_VULKAN} ]; then
    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_VULKAN=1"
 fi
+
+if [ ! -z ${GG_BUILD_MUSA} ]; then
+    # Use qy1 by default (MTT S80)
+    MUSA_ARCH=${MUSA_ARCH:-21}
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_MUSA=ON -DMUSA_ARCHITECTURES=${MUSA_ARCH}"
+fi
 ## helpers

 # download a file if it does not exist or if it is outdated
@@ -808,7 +820,7 @@ export LLAMA_LOG_PREFIX=1
 export LLAMA_LOG_TIMESTAMPS=1

 if [ -z ${GG_BUILD_LOW_PERF} ]; then
-    # Create symlink: ./llama.cpp/models-mnt -> $MNT/models/models-mnt
+    # Create symlink: ./llama.cpp/models-mnt -> $MNT/models
    rm -rf ${SRC}/models-mnt
    mnt_models=${MNT}/models
    mkdir -p ${mnt_models}
@@ -826,8 +838,10 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
 fi

 ret=0
-
-test $ret -eq 0 && gg_run ctest_debug
+if [ -z ${GG_BUILD_SYCL} ]; then
+    # SYCL build breaks with debug build flags
+    test $ret -eq 0 && gg_run ctest_debug
+fi
 test $ret -eq 0 && gg_run ctest_release

 if [ -z ${GG_BUILD_LOW_PERF} ]; then
@@ -835,7 +849,9 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
    test $ret -eq 0 && gg_run rerank_tiny

    if [ -z ${GG_BUILD_CLOUD} ] || [ ${GG_BUILD_EXTRA_TESTS_0} ]; then
-        test $ret -eq 0 && gg_run test_scripts_debug
+        if [ -z ${GG_BUILD_SYCL} ]; then
+            test $ret -eq 0 && gg_run test_scripts_debug
+        fi
        test $ret -eq 0 && gg_run test_scripts_release
    fi

@@ -846,7 +862,9 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
            test $ret -eq 0 && gg_run pythia_2_8b
            #test $ret -eq 0 && gg_run open_llama_7b_v2
        fi
-        test $ret -eq 0 && gg_run ctest_with_model_debug
+        if [ -z ${GG_BUILD_SYCL} ]; then
+            test $ret -eq 0 && gg_run ctest_with_model_debug
+        fi
        test $ret -eq 0 && gg_run ctest_with_model_release
    fi
 fi
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -114,8 +114,8 @@ if (LLAMA_LLGUIDANCE)

    ExternalProject_Add(llguidance_ext
        GIT_REPOSITORY https://github.com/guidance-ai/llguidance
-        # v0.6.12:
-        GIT_TAG ced1c9023d47ec194fa977932d35ce65c2ebfc09
+        # v0.7.10:
+        GIT_TAG 0309d2a6bf40abda35344a362edc71e06d5009f8
        PREFIX ${CMAKE_BINARY_DIR}/llguidance
        SOURCE_DIR ${LLGUIDANCE_SRC}
        BUILD_IN_SOURCE TRUE
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -1,9 +1,20 @@
+#include "gguf.h" // for reading GGUF splits
 #include "arg.h"

+#include "common.h"
 #include "log.h"
 #include "sampling.h"
 #include "chat.h"

+// fix problem with std::min and std::max
+#if defined(_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+#   define NOMINMAX
+#endif
+#include <windows.h>
+#endif
+
 #include <algorithm>
 #include <climits>
 #include <cstdarg>
@@ -14,6 +25,14 @@
 #include <thread>
 #include <vector>

+//#define LLAMA_USE_CURL
+
+#if defined(LLAMA_USE_CURL)
+#include <curl/curl.h>
+#include <curl/easy.h>
+#include <future>
+#endif
+
 #include "json-schema-to-grammar.h"

 using json = nlohmann::ordered_json;
@@ -125,47 +144,549 @@ std::string common_arg::to_string() {
    return ss.str();
 }

+//
+// downloader
+//
+
+struct common_hf_file_res {
+    std::string repo; // repo name with ":tag" removed
+    std::string ggufFile;
+    std::string mmprojFile;
+};
+
+#ifdef LLAMA_USE_CURL
+
+#ifdef __linux__
+#include <linux/limits.h>
+#elif defined(_WIN32)
+#   if !defined(PATH_MAX)
+#   define PATH_MAX MAX_PATH
+#   endif
+#else
+#include <sys/syslimits.h>
+#endif
+#define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
+
+//
+// CURL utils
+//
+
+using curl_ptr = std::unique_ptr<CURL, decltype(&curl_easy_cleanup)>;
+
+// cannot use unique_ptr for curl_slist, because we cannot update without destroying the old one
+struct curl_slist_ptr {
+    struct curl_slist * ptr = nullptr;
+    ~curl_slist_ptr() {
+        if (ptr) {
+            curl_slist_free_all(ptr);
+        }
+    }
+};
+
+#define CURL_MAX_RETRY 3
+#define CURL_RETRY_DELAY_SECONDS 2
+
+static bool curl_perform_with_retry(const std::string & url, CURL * curl, int max_attempts, int retry_delay_seconds) {
+    int remaining_attempts = max_attempts;
+
+    while (remaining_attempts > 0) {
+        LOG_INF("%s: Trying to download from %s (attempt %d of %d)...\n", __func__ , url.c_str(), max_attempts - remaining_attempts + 1, max_attempts);
+
+        CURLcode res = curl_easy_perform(curl);
+        if (res == CURLE_OK) {
+            return true;
+        }
+
+        int exponential_backoff_delay = std::pow(retry_delay_seconds, max_attempts - remaining_attempts) * 1000;
+        LOG_WRN("%s: curl_easy_perform() failed: %s, retrying after %d milliseconds...\n", __func__, curl_easy_strerror(res), exponential_backoff_delay);
+
+        remaining_attempts--;
+        std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay));
+    }
+
+    LOG_ERR("%s: curl_easy_perform() failed after %d attempts\n", __func__, max_attempts);
+
+    return false;
+}
+
+// download one single file from remote URL to local path
+static bool common_download_file_single(const std::string & url, const std::string & path, const std::string & bearer_token) {
+    // Initialize libcurl
+    curl_ptr       curl(curl_easy_init(), &curl_easy_cleanup);
+    curl_slist_ptr http_headers;
+    if (!curl) {
+        LOG_ERR("%s: error initializing libcurl\n", __func__);
+        return false;
+    }
+
+    bool force_download = false;
+
+    // Set the URL, allow to follow http redirection
+    curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
+    curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
+
+    // Check if hf-token or bearer-token was specified
+    if (!bearer_token.empty()) {
+        std::string auth_header = "Authorization: Bearer " + bearer_token;
+        http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
+        curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
+    }
+
+#if defined(_WIN32)
+    // CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
+    //   operating system. Currently implemented under MS-Windows.
+    curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
+#endif
+
+    // Check if the file already exists locally
+    auto file_exists = std::filesystem::exists(path);
+
+    // If the file exists, check its JSON metadata companion file.
+    std::string metadata_path = path + ".json";
+    nlohmann::json metadata;
+    std::string etag;
+    std::string last_modified;
+
+    if (file_exists) {
+        // Try and read the JSON metadata file (note: stream autoclosed upon exiting this block).
+        std::ifstream metadata_in(metadata_path);
+        if (metadata_in.good()) {
+            try {
+                metadata_in >> metadata;
+                LOG_INF("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
+                if (metadata.contains("url") && metadata.at("url").is_string()) {
+                    auto previous_url = metadata.at("url").get<std::string>();
+                    if (previous_url != url) {
+                        LOG_ERR("%s: Model URL mismatch: %s != %s\n", __func__, url.c_str(), previous_url.c_str());
+                        return false;
+                    }
+                }
+                if (metadata.contains("etag") && metadata.at("etag").is_string()) {
+                    etag = metadata.at("etag");
+                }
+                if (metadata.contains("lastModified") && metadata.at("lastModified").is_string()) {
+                    last_modified = metadata.at("lastModified");
+                }
+            } catch (const nlohmann::json::exception & e) {
+            LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
+                return false;
+            }
+        }
+    } else {
+        LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
+    }
+
+    // Send a HEAD request to retrieve the etag and last-modified headers
+    struct common_load_model_from_url_headers {
+        std::string etag;
+        std::string last_modified;
+    };
+
+    common_load_model_from_url_headers headers;
+
+    {
+        typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
+        auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
+            common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;
+
+            static std::regex header_regex("([^:]+): (.*)\r\n");
+            static std::regex etag_regex("ETag", std::regex_constants::icase);
+            static std::regex last_modified_regex("Last-Modified", std::regex_constants::icase);
+
+            std::string header(buffer, n_items);
+            std::smatch match;
+            if (std::regex_match(header, match, header_regex)) {
+                const std::string & key = match[1];
+                const std::string & value = match[2];
+                if (std::regex_match(key, match, etag_regex)) {
+                    headers->etag = value;
+                } else if (std::regex_match(key, match, last_modified_regex)) {
+                    headers->last_modified = value;
+                }
+            }
+            return n_items;
+        };
+
+        curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 1L); // will trigger the HEAD verb
+        curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); // hide head request progress
+        curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
+        curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
+
+        bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
+        if (!was_perform_successful) {
+            return false;
+        }
+
+        long http_code = 0;
+        curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
+        if (http_code != 200) {
+            // HEAD not supported, we don't know if the file has changed
+            // force trigger downloading
+            force_download = true;
+            LOG_ERR("%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
+        }
+    }
+
+    bool should_download = !file_exists || force_download;
+    if (!should_download) {
+        if (!etag.empty() && etag != headers.etag) {
+            LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(), headers.etag.c_str());
+            should_download = true;
+        } else if (!last_modified.empty() && last_modified != headers.last_modified) {
+            LOG_WRN("%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__, last_modified.c_str(), headers.last_modified.c_str());
+            should_download = true;
+        }
+    }
+    if (should_download) {
+        std::string path_temporary = path + ".downloadInProgress";
+        if (file_exists) {
+            LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
+            if (remove(path.c_str()) != 0) {
+                LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
+                return false;
+            }
+        }
+
+        // Set the output file
+
+        struct FILE_deleter {
+            void operator()(FILE * f) const {
+                fclose(f);
+            }
+        };
+
+        std::unique_ptr<FILE, FILE_deleter> outfile(fopen(path_temporary.c_str(), "wb"));
+        if (!outfile) {
+            LOG_ERR("%s: error opening local file for writing: %s\n", __func__, path.c_str());
+            return false;
+        }
+
+        typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * data, size_t size, size_t nmemb, void * fd);
+        auto write_callback = [](void * data, size_t size, size_t nmemb, void * fd) -> size_t {
+            return fwrite(data, size, nmemb, (FILE *)fd);
+        };
+        curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 0L);
+        curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
+        curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, outfile.get());
+
+        //  display download progress
+        curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 0L);
+
+        // helper function to hide password in URL
+        auto llama_download_hide_password_in_url = [](const std::string & url) -> std::string {
+            std::size_t protocol_pos = url.find("://");
+            if (protocol_pos == std::string::npos) {
+                return url;  // Malformed URL
+            }
+
+            std::size_t at_pos = url.find('@', protocol_pos + 3);
+            if (at_pos == std::string::npos) {
+                return url;  // No password in URL
+            }
+
+            return url.substr(0, protocol_pos + 3) + "********" + url.substr(at_pos);
+        };
+
+        // start the download
+        LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
+            llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str());
+        bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
+        if (!was_perform_successful) {
+            return false;
+        }
+
+        long http_code = 0;
+        curl_easy_getinfo (curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
+        if (http_code < 200 || http_code >= 400) {
+            LOG_ERR("%s: invalid http status code received: %ld\n", __func__, http_code);
+            return false;
+        }
+
+        // Causes file to be closed explicitly here before we rename it.
+        outfile.reset();
+
+        // Write the updated JSON metadata file.
+        metadata.update({
+            {"url", url},
+            {"etag", headers.etag},
+            {"lastModified", headers.last_modified}
+        });
+        std::ofstream(metadata_path) << metadata.dump(4);
+        LOG_INF("%s: file metadata saved: %s\n", __func__, metadata_path.c_str());
+
+        if (rename(path_temporary.c_str(), path.c_str()) != 0) {
+            LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
+            return false;
+        }
+    }
+
+    return true;
+}
+
+// download multiple files from remote URLs to local paths
+// the input is a vector of pairs <url, path>
+static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> & urls, const std::string & bearer_token) {
+    // Prepare download in parallel
+    std::vector<std::future<bool>> futures_download;
+    for (auto const & item : urls) {
+        futures_download.push_back(std::async(std::launch::async, [bearer_token](const std::pair<std::string, std::string> & it) -> bool {
+            return common_download_file_single(it.first, it.second, bearer_token);
+        }, item));
+    }
+
+    // Wait for all downloads to complete
+    for (auto & f : futures_download) {
+        if (!f.get()) {
+            return false;
+        }
+    }
+
+    return true;
+}
+
+static bool common_download_model(
+        const common_params_model & model,
+        const std::string & bearer_token) {
+    // Basic validation of the model.url
+    if (model.url.empty()) {
+        LOG_ERR("%s: invalid model url\n", __func__);
+        return false;
+    }
+
+    if (!common_download_file_single(model.url, model.path, bearer_token)) {
+        return false;
+    }
+
+    // check for additional GGUFs split to download
+    int n_split = 0;
+    {
+        struct gguf_init_params gguf_params = {
+            /*.no_alloc = */ true,
+            /*.ctx      = */ NULL,
+        };
+        auto * ctx_gguf = gguf_init_from_file(model.path.c_str(), gguf_params);
+        if (!ctx_gguf) {
+            LOG_ERR("\n%s:  failed to load input GGUF from %s\n", __func__, model.path.c_str());
+            return false;
+        }
+
+        auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT);
+        if (key_n_split >= 0) {
+            n_split = gguf_get_val_u16(ctx_gguf, key_n_split);
+        }
+
+        gguf_free(ctx_gguf);
+    }
+
+    if (n_split > 1) {
+        char split_prefix[PATH_MAX] = {0};
+        char split_url_prefix[LLAMA_CURL_MAX_URL_LENGTH] = {0};
+
+        // Verify the first split file format
+        // and extract split URL and PATH prefixes
+        {
+            if (!llama_split_prefix(split_prefix, sizeof(split_prefix), model.path.c_str(), 0, n_split)) {
+                LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, model.path.c_str(), n_split);
+                return false;
+            }
+
+            if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model.url.c_str(), 0, n_split)) {
+                LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model.url.c_str(), n_split);
+                return false;
+            }
+        }
+
+        std::vector<std::pair<std::string, std::string>> urls;
+        for (int idx = 1; idx < n_split; idx++) {
+            char split_path[PATH_MAX] = {0};
+            llama_split_path(split_path, sizeof(split_path), split_prefix, idx, n_split);
+
+            char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0};
+            llama_split_path(split_url, sizeof(split_url), split_url_prefix, idx, n_split);
+
+            if (std::string(split_path) == model.path) {
+                continue; // skip the already downloaded file
+            }
+
+            urls.push_back({split_url, split_path});
+        }
+
+        // Download in parallel
+        common_download_file_multiple(urls, bearer_token);
+    }
+
+    return true;
+}
+
+/**
+ * Allow getting the HF file from the HF repo with tag (like ollama), for example:
+ * - bartowski/Llama-3.2-3B-Instruct-GGUF:q4
+ * - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M
+ * - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s
+ * Tag is optional, default to "latest" (meaning it checks for Q4_K_M first, then Q4, then if not found, return the first GGUF file in repo)
+ *
+ * Return pair of <repo, file> (with "repo" already having tag removed)
+ *
+ * Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files.
+ */
+static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & bearer_token) {
+    auto parts = string_split<std::string>(hf_repo_with_tag, ':');
+    std::string tag = parts.size() > 1 ? parts.back() : "latest";
+    std::string hf_repo = parts[0];
+    if (string_split<std::string>(hf_repo, '/').size() != 2) {
+        throw std::invalid_argument("error: invalid HF repo format, expected <user>/<model>[:quant]\n");
+    }
+
+    // fetch model info from Hugging Face Hub API
+    curl_ptr       curl(curl_easy_init(), &curl_easy_cleanup);
+    curl_slist_ptr http_headers;
+    std::string res_str;
+    std::string url = "https://huggingface.co/v2/" + hf_repo + "/manifests/" + tag;
+    curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
+    curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L);
+    typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data);
+    auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t {
+        static_cast<std::string *>(data)->append((char * ) ptr, size * nmemb);
+        return size * nmemb;
+    };
+    curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
+    curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_str);
+#if defined(_WIN32)
+    curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
+#endif
+    if (!bearer_token.empty()) {
+        std::string auth_header = "Authorization: Bearer " + bearer_token;
+        http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
+    }
+    // Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response
+    http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
+    http_headers.ptr = curl_slist_append(http_headers.ptr, "Accept: application/json");
+    curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
+
+    CURLcode res = curl_easy_perform(curl.get());
+
+    if (res != CURLE_OK) {
+        throw std::runtime_error("error: cannot make GET request to HF API");
+    }
+
+    long res_code;
+    std::string ggufFile   = "";
+    std::string mmprojFile = "";
+    curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code);
+    if (res_code == 200) {
+        // extract ggufFile.rfilename in json, using regex
+        {
+            std::regex pattern("\"ggufFile\"[\\s\\S]*?\"rfilename\"\\s*:\\s*\"([^\"]+)\"");
+            std::smatch match;
+            if (std::regex_search(res_str, match, pattern)) {
+                ggufFile = match[1].str();
+            }
+        }
+        // extract mmprojFile.rfilename in json, using regex
+        {
+            std::regex pattern("\"mmprojFile\"[\\s\\S]*?\"rfilename\"\\s*:\\s*\"([^\"]+)\"");
+            std::smatch match;
+            if (std::regex_search(res_str, match, pattern)) {
+                mmprojFile = match[1].str();
+            }
+        }
+    } else if (res_code == 401) {
+        throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token");
+    } else {
+        throw std::runtime_error(string_format("error from HF API, response code: %ld, data: %s", res_code, res_str.c_str()));
+    }
+
+    // check response
+    if (ggufFile.empty()) {
+        throw std::runtime_error("error: model does not have ggufFile");
+    }
+
+    return { hf_repo, ggufFile, mmprojFile };
+}
+
+#else
+
+static bool common_download_file_single(const std::string &, const std::string &, const std::string &) {
+    LOG_ERR("error: built without CURL, cannot download model from internet\n");
+    return false;
+}
+
+static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> &, const std::string &) {
+    LOG_ERR("error: built without CURL, cannot download model from the internet\n");
+    return false;
+}
+
+static bool common_download_model(
+        const common_params_model &,
+        const std::string &) {
+    LOG_ERR("error: built without CURL, cannot download model from the internet\n");
+    return false;
+}
+
+static struct common_hf_file_res common_get_hf_file(const std::string &, const std::string &) {
+    LOG_ERR("error: built without CURL, cannot download model from the internet\n");
+    return {};
+}
+
+#endif // LLAMA_USE_CURL
+
 //
 // utils
 //

-static void common_params_handle_model_default(
-        std::string & model,
-        const std::string & model_url,
-        std::string & hf_repo,
-        std::string & hf_file,
-        const std::string & hf_token,
-        const std::string & model_default) {
-    if (!hf_repo.empty()) {
-        // short-hand to avoid specifying --hf-file -> default it to --model
-        if (hf_file.empty()) {
-            if (model.empty()) {
-                auto auto_detected = common_get_hf_file(hf_repo, hf_token);
-                if (auto_detected.first.empty() || auto_detected.second.empty()) {
-                    exit(1); // built without CURL, error message already printed
+static void common_params_handle_model(
+        struct common_params_model & model,
+        const std::string & bearer_token,
+        const std::string & model_path_default,
+        bool is_mmproj = false) { // TODO: move is_mmproj to an enum when we have more files?
+    // handle pre-fill default model path and url based on hf_repo and hf_file
+    {
+        if (!model.hf_repo.empty()) {
+            // short-hand to avoid specifying --hf-file -> default it to --model
+            if (model.hf_file.empty()) {
+                if (model.path.empty()) {
+                    auto auto_detected = common_get_hf_file(model.hf_repo, bearer_token);
+                    if (auto_detected.repo.empty() || auto_detected.ggufFile.empty()) {
+                        exit(1); // built without CURL, error message already printed
+                    }
+                    model.hf_repo = auto_detected.repo;
+                    model.hf_file = is_mmproj ? auto_detected.mmprojFile : auto_detected.ggufFile;
+                } else {
+                    model.hf_file = model.path;
                }
-                hf_repo = auto_detected.first;
-                hf_file = auto_detected.second;
-            } else {
-                hf_file = model;
            }
+
+            // TODO: allow custom host
+            model.url = "https://huggingface.co/" + model.hf_repo + "/resolve/main/" + model.hf_file;
+
+            // make sure model path is present (for caching purposes)
+            if (model.path.empty()) {
+                // this is to avoid different repo having same file name, or same file name in different subdirs
+                std::string filename = model.hf_repo + "_" + model.hf_file;
+                // to make sure we don't have any slashes in the filename
+                string_replace_all(filename, "/", "_");
+                model.path = fs_get_cache_file(filename);
+            }
+
+        } else if (!model.url.empty()) {
+            if (model.path.empty()) {
+                auto f = string_split<std::string>(model.url, '#').front();
+                f = string_split<std::string>(f, '?').front();
+                model.path = fs_get_cache_file(string_split<std::string>(f, '/').back());
+            }
+
+        } else if (model.path.empty()) {
+            model.path = model_path_default;
        }
-        // make sure model path is present (for caching purposes)
-        if (model.empty()) {
-            // this is to avoid different repo having same file name, or same file name in different subdirs
-            std::string filename = hf_repo + "_" + hf_file;
-            // to make sure we don't have any slashes in the filename
-            string_replace_all(filename, "/", "_");
-            model = fs_get_cache_file(filename);
+    }
+
+    // then, download it if needed
+    if (!model.url.empty()) {
+        bool ok = common_download_model(model, bearer_token);
+        if (!ok) {
+            LOG_ERR("error: failed to download model from %s\n", model.url.c_str());
+            exit(1);
        }
-    } else if (!model_url.empty()) {
-        if (model.empty()) {
-            auto f = string_split<std::string>(model_url, '#').front();
-            f = string_split<std::string>(f, '?').front();
-            model = fs_get_cache_file(string_split<std::string>(f, '/').back());
-        }
-    } else if (model.empty()) {
-        model = model_default;
    }
 }

@@ -300,10 +821,16 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
        throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
    }

-    // TODO: refactor model params in a common struct
-    common_params_handle_model_default(params.model,             params.model_url,             params.hf_repo,             params.hf_file,             params.hf_token, DEFAULT_MODEL_PATH);
-    common_params_handle_model_default(params.speculative.model, params.speculative.model_url, params.speculative.hf_repo, params.speculative.hf_file, params.hf_token, "");
-    common_params_handle_model_default(params.vocoder.model,     params.vocoder.model_url,     params.vocoder.hf_repo,     params.vocoder.hf_file,     params.hf_token, "");
+    common_params_handle_model(params.model,             params.hf_token, DEFAULT_MODEL_PATH);
+    common_params_handle_model(params.speculative.model, params.hf_token, "");
+    common_params_handle_model(params.vocoder.model,     params.hf_token, "");
+
+    // allow --mmproj to be set from -hf
+    // assuming that mmproj is always in the same repo as text model
+    if (!params.model.hf_repo.empty() && ctx_arg.ex == LLAMA_EXAMPLE_LLAVA) {
+        params.mmproj.hf_repo = params.model.hf_repo;
+    }
+    common_params_handle_model(params.mmproj,            params.hf_token, "", true);

    if (params.escape) {
        string_process_escapes(params.prompt);
@@ -322,6 +849,10 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
        params.kv_overrides.back().key[0] = 0;
    }

+    if (!params.tensor_buft_overrides.empty()) {
+        params.tensor_buft_overrides.push_back({nullptr, nullptr});
+    }
+
    if (params.reranking && params.embedding) {
        throw std::invalid_argument("error: either --embedding or --reranking can be specified, but not both");
    }
@@ -1561,7 +2092,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        {"--mmproj"}, "FILE",
        "path to a multimodal projector file for LLaVA. see examples/llava/README.md",
        [](common_params & params, const std::string & value) {
-            params.mmproj = value;
+            params.mmproj.path = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_LLAVA}));
+    add_opt(common_arg(
+        {"--mmproj-url"}, "URL",
+        "URL to a multimodal projector file for LLaVA. see examples/llava/README.md",
+        [](common_params & params, const std::string & value) {
+            params.mmproj.url = value;
        }
    ).set_examples({LLAMA_EXAMPLE_LLAVA}));
    add_opt(common_arg(
@@ -1647,6 +2185,41 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            exit(0);
        }
    ));
+    add_opt(common_arg(
+        {"--override-tensor", "-ot"}, "<tensor name pattern>=<buffer type>,...",
+        "override tensor buffer type", [](common_params & params, const std::string & value) {
+            /* static */ std::map<std::string, ggml_backend_buffer_type_t> buft_list;
+            if (buft_list.empty()) {
+                // enumerate all the devices and add their buffer types to the list
+                for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+                    auto * dev = ggml_backend_dev_get(i);
+                    auto * buft = ggml_backend_dev_buffer_type(dev);
+                    if (buft) {
+                        buft_list[ggml_backend_buft_name(buft)] = buft;
+                    }
+                }
+            }
+
+            for (const auto & override : string_split<std::string>(value, ',')) {
+                std::string::size_type pos = override.find('=');
+                if (pos == std::string::npos) {
+                    throw std::invalid_argument("invalid value");
+                }
+                std::string tensor_name = override.substr(0, pos);
+                std::string buffer_type = override.substr(pos + 1);
+
+                if (buft_list.find(buffer_type) == buft_list.end()) {
+                    printf("Available buffer types:\n");
+                    for (const auto & it : buft_list) {
+                        printf("  %s\n", ggml_backend_buft_name(it.second));
+                    }
+                    throw std::invalid_argument("unknown buffer type");
+                }
+                // FIXME: this leaks memory
+                params.tensor_buft_overrides.push_back({strdup(tensor_name.c_str()), buft_list.at(buffer_type)});
+            }
+        }
+    ));
    add_opt(common_arg(
        {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
        "number of layers to store in VRAM",
@@ -1790,14 +2363,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                "or `--model-url` if set, otherwise %s)", DEFAULT_MODEL_PATH
            ),
        [](common_params & params, const std::string & value) {
-            params.model = value;
+            params.model.path = value;
        }
    ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL"));
    add_opt(common_arg(
        {"-mu", "--model-url"}, "MODEL_URL",
        "model download url (default: unused)",
        [](common_params & params, const std::string & value) {
-            params.model_url = value;
+            params.model.url = value;
        }
    ).set_env("LLAMA_ARG_MODEL_URL"));
    add_opt(common_arg(
@@ -1806,35 +2379,35 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        "example: unsloth/phi-4-GGUF:q4_k_m\n"
        "(default: unused)",
        [](common_params & params, const std::string & value) {
-            params.hf_repo = value;
+            params.model.hf_repo = value;
        }
    ).set_env("LLAMA_ARG_HF_REPO"));
    add_opt(common_arg(
        {"-hfd", "-hfrd", "--hf-repo-draft"}, "<user>/<model>[:quant]",
        "Same as --hf-repo, but for the draft model (default: unused)",
        [](common_params & params, const std::string & value) {
-            params.speculative.hf_repo = value;
+            params.speculative.model.hf_repo = value;
        }
    ).set_env("LLAMA_ARG_HFD_REPO"));
    add_opt(common_arg(
        {"-hff", "--hf-file"}, "FILE",
        "Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)",
        [](common_params & params, const std::string & value) {
-            params.hf_file = value;
+            params.model.hf_file = value;
        }
    ).set_env("LLAMA_ARG_HF_FILE"));
    add_opt(common_arg(
        {"-hfv", "-hfrv", "--hf-repo-v"}, "<user>/<model>[:quant]",
        "Hugging Face model repository for the vocoder model (default: unused)",
        [](common_params & params, const std::string & value) {
-            params.vocoder.hf_repo = value;
+            params.vocoder.model.hf_repo = value;
        }
    ).set_env("LLAMA_ARG_HF_REPO_V"));
    add_opt(common_arg(
        {"-hffv", "--hf-file-v"}, "FILE",
        "Hugging Face model file for the vocoder model (default: unused)",
        [](common_params & params, const std::string & value) {
-            params.vocoder.hf_file = value;
+            params.vocoder.model.hf_file = value;
        }
    ).set_env("LLAMA_ARG_HF_FILE_V"));
    add_opt(common_arg(
@@ -1979,7 +2552,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
    ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
    add_opt(common_arg(
        {"--host"}, "HOST",
-        string_format("ip address to listen (default: %s)", params.hostname.c_str()),
+        string_format("ip address to listen, or bind to an UNIX socket if the address ends with .sock (default: %s)", params.hostname.c_str()),
        [](common_params & params, const std::string & value) {
            params.hostname = value;
        }
@@ -2454,7 +3027,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        {"-md", "--model-draft"}, "FNAME",
        "draft model for speculative decoding (default: unused)",
        [](common_params & params, const std::string & value) {
-            params.speculative.model = value;
+            params.speculative.model.path = value;
        }
    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));

@@ -2462,7 +3035,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        {"-mv", "--model-vocoder"}, "FNAME",
        "vocoder model for audio generation (default: unused)",
        [](common_params & params, const std::string & value) {
-            params.vocoder.model = value;
+            params.vocoder.model.path = value;
        }
    ).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
@@ -2485,10 +3058,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        {"--tts-oute-default"},
        string_format("use default OuteTTS models (note: can download weights from the internet)"),
        [](common_params & params) {
-            params.hf_repo = "OuteAI/OuteTTS-0.2-500M-GGUF";
-            params.hf_file = "OuteTTS-0.2-500M-Q8_0.gguf";
-            params.vocoder.hf_repo = "ggml-org/WavTokenizer";
-            params.vocoder.hf_file = "WavTokenizer-Large-75-F16.gguf";
+            params.model.hf_repo = "OuteAI/OuteTTS-0.2-500M-GGUF";
+            params.model.hf_file = "OuteTTS-0.2-500M-Q8_0.gguf";
+            params.vocoder.model.hf_repo = "ggml-org/WavTokenizer";
+            params.vocoder.model.hf_file = "WavTokenizer-Large-75-F16.gguf";
        }
    ).set_examples({LLAMA_EXAMPLE_TTS}));

@@ -2496,8 +3069,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        {"--embd-bge-small-en-default"},
        string_format("use default bge-small-en-v1.5 model (note: can download weights from the internet)"),
        [](common_params & params) {
-            params.hf_repo = "ggml-org/bge-small-en-v1.5-Q8_0-GGUF";
-            params.hf_file = "bge-small-en-v1.5-q8_0.gguf";
+            params.model.hf_repo = "ggml-org/bge-small-en-v1.5-Q8_0-GGUF";
+            params.model.hf_file = "bge-small-en-v1.5-q8_0.gguf";
            params.pooling_type = LLAMA_POOLING_TYPE_NONE;
            params.embd_normalize = 2;
            params.n_ctx = 512;
@@ -2510,8 +3083,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        {"--embd-e5-small-en-default"},
        string_format("use default e5-small-v2 model (note: can download weights from the internet)"),
        [](common_params & params) {
-            params.hf_repo = "ggml-org/e5-small-v2-Q8_0-GGUF";
-            params.hf_file = "e5-small-v2-q8_0.gguf";
+            params.model.hf_repo = "ggml-org/e5-small-v2-Q8_0-GGUF";
+            params.model.hf_file = "e5-small-v2-q8_0.gguf";
            params.pooling_type = LLAMA_POOLING_TYPE_NONE;
            params.embd_normalize = 2;
            params.n_ctx = 512;
@@ -2524,8 +3097,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        {"--embd-gte-small-default"},
        string_format("use default gte-small model (note: can download weights from the internet)"),
        [](common_params & params) {
-            params.hf_repo = "ggml-org/gte-small-Q8_0-GGUF";
-            params.hf_file = "gte-small-q8_0.gguf";
+            params.model.hf_repo = "ggml-org/gte-small-Q8_0-GGUF";
+            params.model.hf_file = "gte-small-q8_0.gguf";
            params.pooling_type = LLAMA_POOLING_TYPE_NONE;
            params.embd_normalize = 2;
            params.n_ctx = 512;
@@ -2538,8 +3111,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        {"--fim-qwen-1.5b-default"},
        string_format("use default Qwen 2.5 Coder 1.5B (note: can download weights from the internet)"),
        [](common_params & params) {
-            params.hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-Q8_0-GGUF";
-            params.hf_file = "qwen2.5-coder-1.5b-q8_0.gguf";
+            params.model.hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-Q8_0-GGUF";
+            params.model.hf_file = "qwen2.5-coder-1.5b-q8_0.gguf";
            params.port = 8012;
            params.n_gpu_layers = 99;
            params.flash_attn = true;
@@ -2554,8 +3127,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        {"--fim-qwen-3b-default"},
        string_format("use default Qwen 2.5 Coder 3B (note: can download weights from the internet)"),
        [](common_params & params) {
-            params.hf_repo = "ggml-org/Qwen2.5-Coder-3B-Q8_0-GGUF";
-            params.hf_file = "qwen2.5-coder-3b-q8_0.gguf";
+            params.model.hf_repo = "ggml-org/Qwen2.5-Coder-3B-Q8_0-GGUF";
+            params.model.hf_file = "qwen2.5-coder-3b-q8_0.gguf";
            params.port = 8012;
            params.n_gpu_layers = 99;
            params.flash_attn = true;
@@ -2570,8 +3143,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        {"--fim-qwen-7b-default"},
        string_format("use default Qwen 2.5 Coder 7B (note: can download weights from the internet)"),
        [](common_params & params) {
-            params.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
-            params.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
+            params.model.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
+            params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
            params.port = 8012;
            params.n_gpu_layers = 99;
            params.flash_attn = true;
@@ -2586,10 +3159,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        {"--fim-qwen-7b-spec"},
        string_format("use Qwen 2.5 Coder 7B + 0.5B draft for speculative decoding (note: can download weights from the internet)"),
        [](common_params & params) {
-            params.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
-            params.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
-            params.speculative.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
-            params.speculative.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
+            params.model.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
+            params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
+            params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
+            params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
            params.speculative.n_gpu_layers = 99;
            params.port = 8012;
            params.n_gpu_layers = 99;
@@ -2605,10 +3178,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        {"--fim-qwen-14b-spec"},
        string_format("use Qwen 2.5 Coder 14B + 0.5B draft for speculative decoding (note: can download weights from the internet)"),
        [](common_params & params) {
-            params.hf_repo = "ggml-org/Qwen2.5-Coder-14B-Q8_0-GGUF";
-            params.hf_file = "qwen2.5-coder-14b-q8_0.gguf";
-            params.speculative.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
-            params.speculative.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
+            params.model.hf_repo = "ggml-org/Qwen2.5-Coder-14B-Q8_0-GGUF";
+            params.model.hf_file = "qwen2.5-coder-14b-q8_0.gguf";
+            params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
+            params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
            params.speculative.n_gpu_layers = 99;
            params.port = 8012;
            params.n_gpu_layers = 99;
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -7,9 +7,6 @@

 #include "common.h"
 #include "log.h"
-// Change JSON_ASSERT from assert() to GGML_ASSERT:
-#define JSON_ASSERT GGML_ASSERT
-#include "json.hpp"
 #include "llama.h"

 #include <algorithm>
@@ -51,47 +48,11 @@
 #include <sys/stat.h>
 #include <unistd.h>
 #endif
-#if defined(LLAMA_USE_CURL)
-#include <curl/curl.h>
-#include <curl/easy.h>
-#include <future>
-#endif

 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif

-#if defined(LLAMA_USE_CURL)
-#ifdef __linux__
-#include <linux/limits.h>
-#elif defined(_WIN32)
-#   if !defined(PATH_MAX)
-#   define PATH_MAX MAX_PATH
-#   endif
-#else
-#include <sys/syslimits.h>
-#endif
-#define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
-
-//
-// CURL utils
-//
-
-using curl_ptr = std::unique_ptr<CURL, decltype(&curl_easy_cleanup)>;
-
-// cannot use unique_ptr for curl_slist, because we cannot update without destroying the old one
-struct curl_slist_ptr {
-    struct curl_slist * ptr = nullptr;
-    ~curl_slist_ptr() {
-        if (ptr) {
-            curl_slist_free_all(ptr);
-        }
-    }
-};
-#endif // LLAMA_USE_CURL
-
-using json = nlohmann::ordered_json;
-
 //
 // CPU utils
 //
@@ -900,22 +861,14 @@ std::string fs_get_cache_file(const std::string & filename) {
 //
 // Model utils
 //
+
 struct common_init_result common_init_from_params(common_params & params) {
    common_init_result iparams;
    auto mparams = common_model_params_to_llama(params);

-    llama_model * model = nullptr;
-
-    if (!params.hf_repo.empty() && !params.hf_file.empty()) {
-        model = common_load_model_from_hf(params.hf_repo, params.hf_file, params.model, params.hf_token, mparams);
-    } else if (!params.model_url.empty()) {
-        model = common_load_model_from_url(params.model_url, params.model, params.hf_token, mparams);
-    } else {
-        model = llama_model_load_from_file(params.model.c_str(), mparams);
-    }
-
+    llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
    if (model == NULL) {
-        LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.c_str());
+        LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.path.c_str());
        return iparams;
    }

@@ -950,7 +903,7 @@ struct common_init_result common_init_from_params(common_params & params) {

    llama_context * lctx = llama_init_from_model(model, cparams);
    if (lctx == NULL) {
-        LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str());
+        LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
        llama_model_free(model);
        return iparams;
    }
@@ -1089,15 +1042,18 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
    if (!params.devices.empty()) {
        mparams.devices = params.devices.data();
    }
+
    if (params.n_gpu_layers != -1) {
        mparams.n_gpu_layers = params.n_gpu_layers;
    }
+
    mparams.main_gpu        = params.main_gpu;
    mparams.split_mode      = params.split_mode;
    mparams.tensor_split    = params.tensor_split;
    mparams.use_mmap        = params.use_mmap;
    mparams.use_mlock       = params.use_mlock;
    mparams.check_tensors   = params.check_tensors;
+
    if (params.kv_overrides.empty()) {
        mparams.kv_overrides = NULL;
    } else {
@@ -1105,6 +1061,13 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
        mparams.kv_overrides = params.kv_overrides.data();
    }

+    if (params.tensor_buft_overrides.empty()) {
+        mparams.tensor_buft_overrides = NULL;
+    } else {
+        GGML_ASSERT(params.tensor_buft_overrides.back().pattern == nullptr && "Tensor buffer overrides not terminated with empty pattern");
+        mparams.tensor_buft_overrides = params.tensor_buft_overrides.data();
+    }
+
    return mparams;
 }

@@ -1164,451 +1127,6 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p
    return tpp;
 }

-#ifdef LLAMA_USE_CURL
-
-#define CURL_MAX_RETRY 3
-#define CURL_RETRY_DELAY_SECONDS 2
-
-static bool curl_perform_with_retry(const std::string & url, CURL * curl, int max_attempts, int retry_delay_seconds) {
-    int remaining_attempts = max_attempts;
-
-    while (remaining_attempts > 0) {
-        LOG_INF("%s: Trying to download from %s (attempt %d of %d)...\n", __func__ , url.c_str(), max_attempts - remaining_attempts + 1, max_attempts);
-
-        CURLcode res = curl_easy_perform(curl);
-        if (res == CURLE_OK) {
-            return true;
-        }
-
-        int exponential_backoff_delay = std::pow(retry_delay_seconds, max_attempts - remaining_attempts) * 1000;
-        LOG_WRN("%s: curl_easy_perform() failed: %s, retrying after %d milliseconds...\n", __func__, curl_easy_strerror(res), exponential_backoff_delay);
-
-        remaining_attempts--;
-        std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay));
-    }
-
-    LOG_ERR("%s: curl_easy_perform() failed after %d attempts\n", __func__, max_attempts);
-
-    return false;
-}
-
-static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
-    // Initialize libcurl
-    curl_ptr       curl(curl_easy_init(), &curl_easy_cleanup);
-    curl_slist_ptr http_headers;
-    if (!curl) {
-        LOG_ERR("%s: error initializing libcurl\n", __func__);
-        return false;
-    }
-
-    bool force_download = false;
-
-    // Set the URL, allow to follow http redirection
-    curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
-    curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
-
-    // Check if hf-token or bearer-token was specified
-    if (!hf_token.empty()) {
-        std::string auth_header = "Authorization: Bearer " + hf_token;
-        http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
-        curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
-    }
-
-#if defined(_WIN32)
-    // CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
-    //   operating system. Currently implemented under MS-Windows.
-    curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
-#endif
-
-    // Check if the file already exists locally
-    auto file_exists = std::filesystem::exists(path);
-
-    // If the file exists, check its JSON metadata companion file.
-    std::string metadata_path = path + ".json";
-    nlohmann::json metadata;
-    std::string etag;
-    std::string last_modified;
-
-    if (file_exists) {
-        // Try and read the JSON metadata file (note: stream autoclosed upon exiting this block).
-        std::ifstream metadata_in(metadata_path);
-        if (metadata_in.good()) {
-            try {
-                metadata_in >> metadata;
-                LOG_INF("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
-                if (metadata.contains("url") && metadata.at("url").is_string()) {
-                    auto previous_url = metadata.at("url").get<std::string>();
-                    if (previous_url != url) {
-                        LOG_ERR("%s: Model URL mismatch: %s != %s\n", __func__, url.c_str(), previous_url.c_str());
-                        return false;
-                    }
-                }
-                if (metadata.contains("etag") && metadata.at("etag").is_string()) {
-                    etag = metadata.at("etag");
-                }
-                if (metadata.contains("lastModified") && metadata.at("lastModified").is_string()) {
-                    last_modified = metadata.at("lastModified");
-                }
-            } catch (const nlohmann::json::exception & e) {
-            LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
-                return false;
-            }
-        }
-    } else {
-        LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
-    }
-
-    // Send a HEAD request to retrieve the etag and last-modified headers
-    struct common_load_model_from_url_headers {
-        std::string etag;
-        std::string last_modified;
-    };
-
-    common_load_model_from_url_headers headers;
-
-    {
-        typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
-        auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
-            common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;
-
-            static std::regex header_regex("([^:]+): (.*)\r\n");
-            static std::regex etag_regex("ETag", std::regex_constants::icase);
-            static std::regex last_modified_regex("Last-Modified", std::regex_constants::icase);
-
-            std::string header(buffer, n_items);
-            std::smatch match;
-            if (std::regex_match(header, match, header_regex)) {
-                const std::string & key = match[1];
-                const std::string & value = match[2];
-                if (std::regex_match(key, match, etag_regex)) {
-                    headers->etag = value;
-                } else if (std::regex_match(key, match, last_modified_regex)) {
-                    headers->last_modified = value;
-                }
-            }
-            return n_items;
-        };
-
-        curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 1L); // will trigger the HEAD verb
-        curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); // hide head request progress
-        curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
-        curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
-
-        bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
-        if (!was_perform_successful) {
-            return false;
-        }
-
-        long http_code = 0;
-        curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
-        if (http_code != 200) {
-            // HEAD not supported, we don't know if the file has changed
-            // force trigger downloading
-            force_download = true;
-            LOG_ERR("%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
-        }
-    }
-
-    bool should_download = !file_exists || force_download;
-    if (!should_download) {
-        if (!etag.empty() && etag != headers.etag) {
-            LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(), headers.etag.c_str());
-            should_download = true;
-        } else if (!last_modified.empty() && last_modified != headers.last_modified) {
-            LOG_WRN("%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__, last_modified.c_str(), headers.last_modified.c_str());
-            should_download = true;
-        }
-    }
-    if (should_download) {
-        std::string path_temporary = path + ".downloadInProgress";
-        if (file_exists) {
-            LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
-            if (remove(path.c_str()) != 0) {
-                LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
-                return false;
-            }
-        }
-
-        // Set the output file
-
-        struct FILE_deleter {
-            void operator()(FILE * f) const {
-                fclose(f);
-            }
-        };
-
-        std::unique_ptr<FILE, FILE_deleter> outfile(fopen(path_temporary.c_str(), "wb"));
-        if (!outfile) {
-            LOG_ERR("%s: error opening local file for writing: %s\n", __func__, path.c_str());
-            return false;
-        }
-
-        typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * data, size_t size, size_t nmemb, void * fd);
-        auto write_callback = [](void * data, size_t size, size_t nmemb, void * fd) -> size_t {
-            return fwrite(data, size, nmemb, (FILE *)fd);
-        };
-        curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 0L);
-        curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
-        curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, outfile.get());
-
-        //  display download progress
-        curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 0L);
-
-        // helper function to hide password in URL
-        auto llama_download_hide_password_in_url = [](const std::string & url) -> std::string {
-            std::size_t protocol_pos = url.find("://");
-            if (protocol_pos == std::string::npos) {
-                return url;  // Malformed URL
-            }
-
-            std::size_t at_pos = url.find('@', protocol_pos + 3);
-            if (at_pos == std::string::npos) {
-                return url;  // No password in URL
-            }
-
-            return url.substr(0, protocol_pos + 3) + "********" + url.substr(at_pos);
-        };
-
-        // start the download
-        LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
-            llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str());
-        bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
-        if (!was_perform_successful) {
-            return false;
-        }
-
-        long http_code = 0;
-        curl_easy_getinfo (curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
-        if (http_code < 200 || http_code >= 400) {
-            LOG_ERR("%s: invalid http status code received: %ld\n", __func__, http_code);
-            return false;
-        }
-
-        // Causes file to be closed explicitly here before we rename it.
-        outfile.reset();
-
-        // Write the updated JSON metadata file.
-        metadata.update({
-            {"url", url},
-            {"etag", headers.etag},
-            {"lastModified", headers.last_modified}
-        });
-        std::ofstream(metadata_path) << metadata.dump(4);
-        LOG_INF("%s: file metadata saved: %s\n", __func__, metadata_path.c_str());
-
-        if (rename(path_temporary.c_str(), path.c_str()) != 0) {
-            LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
-            return false;
-        }
-    }
-
-    return true;
-}
-
-struct llama_model * common_load_model_from_url(
-        const std::string & model_url,
-        const std::string & local_path,
-        const std::string & hf_token,
-        const struct llama_model_params & params) {
-    // Basic validation of the model_url
-    if (model_url.empty()) {
-        LOG_ERR("%s: invalid model_url\n", __func__);
-        return NULL;
-    }
-
-    if (!common_download_file(model_url, local_path, hf_token)) {
-        return NULL;
-    }
-
-    // check for additional GGUFs split to download
-    int n_split = 0;
-    {
-        struct gguf_init_params gguf_params = {
-            /*.no_alloc = */ true,
-            /*.ctx      = */ NULL,
-        };
-        auto * ctx_gguf = gguf_init_from_file(local_path.c_str(), gguf_params);
-        if (!ctx_gguf) {
-            LOG_ERR("\n%s:  failed to load input GGUF from %s\n", __func__, local_path.c_str());
-            return NULL;
-        }
-
-        auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT);
-        if (key_n_split >= 0) {
-            n_split = gguf_get_val_u16(ctx_gguf, key_n_split);
-        }
-
-        gguf_free(ctx_gguf);
-    }
-
-    if (n_split > 1) {
-        char split_prefix[PATH_MAX] = {0};
-        char split_url_prefix[LLAMA_CURL_MAX_URL_LENGTH] = {0};
-
-        // Verify the first split file format
-        // and extract split URL and PATH prefixes
-        {
-            if (!llama_split_prefix(split_prefix, sizeof(split_prefix), local_path.c_str(), 0, n_split)) {
-                LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, local_path.c_str(), n_split);
-                return NULL;
-            }
-
-            if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url.c_str(), 0, n_split)) {
-                LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url.c_str(), n_split);
-                return NULL;
-            }
-        }
-
-        // Prepare download in parallel
-        std::vector<std::future<bool>> futures_download;
-        for (int idx = 1; idx < n_split; idx++) {
-            futures_download.push_back(std::async(std::launch::async, [&split_prefix, &split_url_prefix, &n_split, hf_token](int download_idx) -> bool {
-                char split_path[PATH_MAX] = {0};
-                llama_split_path(split_path, sizeof(split_path), split_prefix, download_idx, n_split);
-
-                char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0};
-                llama_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split);
-
-                return common_download_file(split_url, split_path, hf_token);
-            }, idx));
-        }
-
-        // Wait for all downloads to complete
-        for (auto & f : futures_download) {
-            if (!f.get()) {
-                return NULL;
-            }
-        }
-    }
-
-    return llama_model_load_from_file(local_path.c_str(), params);
-}
-
-struct llama_model * common_load_model_from_hf(
-        const std::string & repo,
-        const std::string & remote_path,
-        const std::string & local_path,
-        const std::string & hf_token,
-        const struct llama_model_params & params) {
-    // construct hugging face model url:
-    //
-    //  --repo ggml-org/models --file tinyllama-1.1b/ggml-model-f16.gguf
-    //    https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf
-    //
-    //  --repo TheBloke/Mixtral-8x7B-v0.1-GGUF --file mixtral-8x7b-v0.1.Q4_K_M.gguf
-    //    https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q4_K_M.gguf
-    //
-
-    std::string model_url = "https://huggingface.co/";
-    model_url += repo;
-    model_url += "/resolve/main/";
-    model_url += remote_path;
-
-    return common_load_model_from_url(model_url, local_path, hf_token, params);
-}
-
-/**
- * Allow getting the HF file from the HF repo with tag (like ollama), for example:
- * - bartowski/Llama-3.2-3B-Instruct-GGUF:q4
- * - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M
- * - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s
- * Tag is optional, default to "latest" (meaning it checks for Q4_K_M first, then Q4, then if not found, return the first GGUF file in repo)
- *
- * Return pair of <repo, file> (with "repo" already having tag removed)
- *
- * Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files.
- */
-std::pair<std::string, std::string> common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & hf_token) {
-    auto parts = string_split<std::string>(hf_repo_with_tag, ':');
-    std::string tag = parts.size() > 1 ? parts.back() : "latest";
-    std::string hf_repo = parts[0];
-    if (string_split<std::string>(hf_repo, '/').size() != 2) {
-        throw std::invalid_argument("error: invalid HF repo format, expected <user>/<model>[:quant]\n");
-    }
-
-    // fetch model info from Hugging Face Hub API
-    json model_info;
-    curl_ptr       curl(curl_easy_init(), &curl_easy_cleanup);
-    curl_slist_ptr http_headers;
-    std::string res_str;
-    std::string url = "https://huggingface.co/v2/" + hf_repo + "/manifests/" + tag;
-    curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
-    curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L);
-    typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data);
-    auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t {
-        static_cast<std::string *>(data)->append((char * ) ptr, size * nmemb);
-        return size * nmemb;
-    };
-    curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
-    curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_str);
-#if defined(_WIN32)
-    curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
-#endif
-    if (!hf_token.empty()) {
-        std::string auth_header = "Authorization: Bearer " + hf_token;
-        http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
-    }
-    // Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response
-    http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
-    http_headers.ptr = curl_slist_append(http_headers.ptr, "Accept: application/json");
-    curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
-
-    CURLcode res = curl_easy_perform(curl.get());
-
-    if (res != CURLE_OK) {
-        throw std::runtime_error("error: cannot make GET request to HF API");
-    }
-
-    long res_code;
-    curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code);
-    if (res_code == 200) {
-        model_info = json::parse(res_str);
-    } else if (res_code == 401) {
-        throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token");
-    } else {
-        throw std::runtime_error(string_format("error from HF API, response code: %ld, data: %s", res_code, res_str.c_str()));
-    }
-
-    // check response
-    if (!model_info.contains("ggufFile")) {
-        throw std::runtime_error("error: model does not have ggufFile");
-    }
-    json & gguf_file = model_info.at("ggufFile");
-    if (!gguf_file.contains("rfilename")) {
-        throw std::runtime_error("error: ggufFile does not have rfilename");
-    }
-
-    return std::make_pair(hf_repo, gguf_file.at("rfilename"));
-}
-
-#else
-
-struct llama_model * common_load_model_from_url(
-        const std::string & /*model_url*/,
-        const std::string & /*local_path*/,
-        const std::string & /*hf_token*/,
-        const struct llama_model_params & /*params*/) {
-    LOG_WRN("%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
-    return nullptr;
-}
-
-struct llama_model * common_load_model_from_hf(
-        const std::string & /*repo*/,
-        const std::string & /*remote_path*/,
-        const std::string & /*local_path*/,
-        const std::string & /*hf_token*/,
-        const struct llama_model_params & /*params*/) {
-    LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
-    return nullptr;
-}
-
-std::pair<std::string, std::string> common_get_hf_file(const std::string &, const std::string &) {
-    LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
-    return std::make_pair("", "");
-}
-
-#endif // LLAMA_USE_CURL
-
 //
 // Batch utils
 //
@@ -2032,26 +1550,3 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c

    return result;
 }
-
-template <>
-json common_grammar_trigger::to_json() const {
-    json out {
-        {"type", (int) type},
-        {"value", value},
-    };
-    if (type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
-        out["token"] = (int) token;
-    }
-    return out;
-}
-
-template <>
-common_grammar_trigger common_grammar_trigger::from_json(const json & in) {
-    common_grammar_trigger out;
-    out.type = (common_grammar_trigger_type) in.at("type").get<int>();
-    out.value = in.at("value").get<std::string>();
-    if (out.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
-        out.token = (llama_token) in.at("token").get<int>();
-    }
-    return out;
-}
--- a/common/common.h
+++ b/common/common.h
@@ -121,10 +121,6 @@ struct common_grammar_trigger {
    common_grammar_trigger_type type;
    std::string value;
    llama_token token = LLAMA_TOKEN_NULL;
-
-    // T can only be nlohmann::ordered_json
-    template <class T> T to_json() const;
-    template <class T> static common_grammar_trigger from_json(const T & in);
 };

 // sampling parameters
@@ -184,6 +180,13 @@ struct common_params_sampling {
    std::string print() const;
 };

+struct common_params_model {
+    std::string path    = ""; // model local path                                           // NOLINT
+    std::string url     = ""; // model url to download                                      // NOLINT
+    std::string hf_repo = ""; // HF repo                                                    // NOLINT
+    std::string hf_file = ""; // HF file                                                    // NOLINT
+};
+
 struct common_params_speculative {
    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading

@@ -197,19 +200,11 @@ struct common_params_speculative {
    struct cpu_params cpuparams;
    struct cpu_params cpuparams_batch;

-    std::string hf_repo = ""; // HF repo                                                     // NOLINT
-    std::string hf_file = ""; // HF file                                                     // NOLINT
-
-    std::string model = "";     // draft model for speculative decoding                      // NOLINT
-    std::string model_url = ""; // model url to download                                     // NOLINT
+    struct common_params_model model;
 };

 struct common_params_vocoder {
-    std::string hf_repo = ""; // HF repo                                                     // NOLINT
-    std::string hf_file = ""; // HF file                                                     // NOLINT
-
-    std::string model     = ""; // model path                                                // NOLINT
-    std::string model_url = ""; // model url to download                                     // NOLINT
+    struct common_params_model model;

    std::string speaker_file = ""; // speaker file path                                      // NOLINT

@@ -267,12 +262,10 @@ struct common_params {
    struct common_params_speculative speculative;
    struct common_params_vocoder     vocoder;

-    std::string model                = ""; // model path                                                    // NOLINT
+    struct common_params_model model;
+
    std::string model_alias          = ""; // model alias                                                   // NOLINT
-    std::string model_url            = ""; // model url to download                                         // NOLINT
    std::string hf_token             = ""; // HF token                                                      // NOLINT
-    std::string hf_repo              = ""; // HF repo                                                       // NOLINT
-    std::string hf_file              = ""; // HF file                                                       // NOLINT
    std::string prompt               = "";                                                                  // NOLINT
    std::string system_prompt        = "";                                                                  // NOLINT
    std::string prompt_file          = ""; // store the external prompt file name                           // NOLINT
@@ -286,6 +279,7 @@ struct common_params {
    std::vector<std::string> in_files;   // all input files
    std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
    std::vector<llama_model_kv_override> kv_overrides;
+    std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;

    bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply)
    std::vector<common_adapter_lora_info> lora_adapters; // lora adapter path with user defined scale
@@ -347,7 +341,7 @@ struct common_params {
    common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;

    // multimodal models (see examples/llava)
-    std::string mmproj = "";        // path to multimodal projector                                         // NOLINT
+    struct common_params_model mmproj;
    std::vector<std::string> image; // path to image file(s)

    // embedding
@@ -546,23 +540,6 @@ struct llama_model_params     common_model_params_to_llama  (      common_params
 struct llama_context_params   common_context_params_to_llama(const common_params & params);
 struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);

-struct llama_model * common_load_model_from_url(
-    const std::string & model_url,
-    const std::string & local_path,
-    const std::string & hf_token,
-    const struct llama_model_params & params);
-
-struct llama_model * common_load_model_from_hf(
-    const std::string & repo,
-    const std::string & remote_path,
-    const std::string & local_path,
-    const std::string & hf_token,
-    const struct llama_model_params & params);
-
-std::pair<std::string, std::string> common_get_hf_file(
-    const std::string & hf_repo_with_tag,
-    const std::string & hf_token);
-
 // clear LoRA adapters from context, then apply new list of adapters
 void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);

--- a/common/llguidance.cpp
+++ b/common/llguidance.cpp
@@ -11,25 +11,24 @@ struct llama_sampler_llg {
    std::string         grammar_kind;
    std::string         grammar_data;
    LlgTokenizer *      tokenizer;
-    LlgConstraint *     grammar;
-    LlgMaskResult       llg_res;
-    bool                has_llg_res;
+    LlgMatcher *        grammar;
 };

-static LlgConstraint * llama_sampler_llg_new(LlgTokenizer * tokenizer, const char * grammar_kind,
-                                             const char * grammar_data) {
+static LlgMatcher * llama_sampler_llg_new(LlgTokenizer * tokenizer, const char * grammar_kind,
+                                          const char * grammar_data) {
    LlgConstraintInit cinit;
    llg_constraint_init_set_defaults(&cinit, tokenizer);
    const char * log_level = getenv("LLGUIDANCE_LOG_LEVEL");
    if (log_level && *log_level) {
        cinit.log_stderr_level = atoi(log_level);
    }
-    auto c = llg_new_constraint_any(&cinit, grammar_kind, grammar_data);
-    if (llg_get_error(c)) {
-        LOG_ERR("llg error: %s\n", llg_get_error(c));
-        llg_free_constraint(c);
+    auto c = llg_new_matcher(&cinit, grammar_kind, grammar_data);
+    if (llg_matcher_get_error(c)) {
+        LOG_ERR("llg error: %s\n", llg_matcher_get_error(c));
+        llg_free_matcher(c);
        return nullptr;
    }
+
    return c;
 }

@@ -40,39 +39,29 @@ static const char * llama_sampler_llg_name(const llama_sampler * /*smpl*/) {
 static void llama_sampler_llg_accept_impl(llama_sampler * smpl, llama_token token) {
    auto * ctx = (llama_sampler_llg *) smpl->ctx;
    if (ctx->grammar) {
-        LlgCommitResult res;
-        llg_commit_token(ctx->grammar, token, &res);
-        ctx->has_llg_res = false;
+        llg_matcher_consume_token(ctx->grammar, token);
    }
 }

 static void llama_sampler_llg_apply(llama_sampler * smpl, llama_token_data_array * cur_p) {
    auto * ctx = (llama_sampler_llg *) smpl->ctx;
    if (ctx->grammar) {
-        if (!ctx->has_llg_res) {
-            if (llg_compute_mask(ctx->grammar, &ctx->llg_res) == 0) {
-                ctx->has_llg_res = true;
+        const uint32_t * mask = llg_matcher_get_mask(ctx->grammar);
+        if (mask == nullptr) {
+            if (llg_matcher_compute_mask(ctx->grammar) == 0) {
+                mask = llg_matcher_get_mask(ctx->grammar);
            } else {
-                LOG_ERR("llg error: %s\n", llg_get_error(ctx->grammar));
-                llg_free_constraint(ctx->grammar);
+                LOG_ERR("llg error: %s\n", llg_matcher_get_error(ctx->grammar));
+                llg_free_matcher(ctx->grammar);
                ctx->grammar = nullptr;
+                return;
            }
        }
-        if (ctx->has_llg_res) {
-            if (ctx->llg_res.is_stop) {
-                for (size_t i = 0; i < cur_p->size; ++i) {
-                    if (!llama_vocab_is_eog(ctx->vocab, cur_p->data[i].id)) {
-                        cur_p->data[i].logit = -INFINITY;
-                    }
-                }
-            } else {
-                const uint32_t * mask = ctx->llg_res.sample_mask;
-                for (size_t i = 0; i < cur_p->size; ++i) {
-                    auto token = cur_p->data[i].id;
-                    if ((mask[token / 32] & (1 << (token % 32))) == 0) {
-                        cur_p->data[i].logit = -INFINITY;
-                    }
-                }
+
+        for (size_t i = 0; i < cur_p->size; ++i) {
+            auto token = cur_p->data[i].id;
+            if ((mask[token / 32] & (1 << (token % 32))) == 0) {
+                cur_p->data[i].logit = -INFINITY;
            }
        }
    }
@@ -80,14 +69,9 @@ static void llama_sampler_llg_apply(llama_sampler * smpl, llama_token_data_array

 static void llama_sampler_llg_reset(llama_sampler * smpl) {
    auto * ctx = (llama_sampler_llg *) smpl->ctx;
-    if (!ctx->grammar) {
-        return;
+    if (ctx->grammar) {
+        llg_matcher_reset(ctx->grammar);
    }
-
-    auto * grammar_new = llama_sampler_llg_new(ctx->tokenizer, ctx->grammar_kind.c_str(), ctx->grammar_data.c_str());
-    llg_free_constraint(ctx->grammar);
-    ctx->grammar     = grammar_new;
-    ctx->has_llg_res = false;
 }

 static llama_sampler * llama_sampler_llg_clone(const llama_sampler * smpl) {
@@ -102,7 +86,7 @@ static llama_sampler * llama_sampler_llg_clone(const llama_sampler * smpl) {
        if (ctx->grammar) {
            result_ctx->grammar_kind = ctx->grammar_kind;
            result_ctx->grammar_data = ctx->grammar_data;
-            result_ctx->grammar      = llg_clone_constraint(ctx->grammar);
+            result_ctx->grammar      = llg_clone_matcher(ctx->grammar);
            result_ctx->tokenizer    = llg_clone_tokenizer(ctx->tokenizer);
        }
    }
@@ -114,7 +98,7 @@ static void llama_sampler_llg_free(llama_sampler * smpl) {
    const auto * ctx = (llama_sampler_llg *) smpl->ctx;

    if (ctx->grammar) {
-        llg_free_constraint(ctx->grammar);
+        llg_free_matcher(ctx->grammar);
        llg_free_tokenizer(ctx->tokenizer);
    }

@@ -239,9 +223,11 @@ llama_sampler * llama_sampler_init_llg(const llama_vocab * vocab, const char * g
            /* .grammar_data = */ grammar_data,
            /* .tokenizer    = */ tokenizer,
            /* .grammar      = */ llama_sampler_llg_new(tokenizer, grammar_kind, grammar_data),
-            /* .llg_res      = */ {},
-            /* .has_llg_res  = */ false,
        };
+        if (ctx->grammar) {
+            GGML_ASSERT(((size_t) llama_vocab_n_tokens(vocab) + 31) / 32 * 4 ==
+                        llg_matcher_get_mask_byte_size(ctx->grammar));
+        }
    } else {
        *ctx = {
            /* .vocab        = */ vocab,
@@ -249,15 +235,12 @@ llama_sampler * llama_sampler_init_llg(const llama_vocab * vocab, const char * g
            /* .grammar_data = */ {},
            /* .tokenizer    = */ nullptr,
            /* .grammar      = */ nullptr,
-            /* .llg_res      = */ {},
-            /* .has_llg_res  = */ false,
        };
    }

    return llama_sampler_init(
        /* .iface = */ &llama_sampler_llg_i,
-        /* .ctx   = */ ctx
-    );
+        /* .ctx   = */ ctx);
 }

 #else
--- a/common/minja/minja.hpp
+++ b/common/minja/minja.hpp
@@ -240,7 +240,7 @@ public:
      auto index = key.get<int>();
      return array_->at(index < 0 ? array_->size() + index : index);
    } else if (object_) {
-      if (!key.is_hashable()) throw std::runtime_error("Unashable type: " + dump());
+      if (!key.is_hashable()) throw std::runtime_error("Unhashable type: " + dump());
      auto it = object_->find(key.primitive_);
      if (it == object_->end()) return Value();
      return it->second;
@@ -249,7 +249,7 @@ public:
  }
  void set(const Value& key, const Value& value) {
    if (!object_) throw std::runtime_error("Value is not an object: " + dump());
-    if (!key.is_hashable()) throw std::runtime_error("Unashable type: " + dump());
+    if (!key.is_hashable()) throw std::runtime_error("Unhashable type: " + dump());
    (*object_)[key.primitive_] = value;
  }
  Value call(const std::shared_ptr<Context> & context, ArgumentsValue & args) const {
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -208,6 +208,9 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
                                                        trigger_patterns_c.data(), trigger_patterns_c.size(),
                                                        trigger_tokens.data(), trigger_tokens.size())
             :      llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root");
+        if (!grmr) {
+            return nullptr;
+        }
    }

    auto * result = new common_sampler {
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -180,7 +180,8 @@ class Model:
            extra = sorted(tensor_names_from_parts.difference(self.tensor_names))
            missing_files = sorted(set(weight_map[n] for n in missing if n in weight_map))
            if len(extra) == 0 and len(missing_files) > 0:
-                raise ValueError(f"Missing or incomplete model files: {missing_files}")
+                raise ValueError(f"Missing or incomplete model files: {missing_files}\n"
+                                 f"Missing tensors: {missing}")
            else:
                raise ValueError("Mismatch between weight map and model parts for tensor names:\n"
                                 f"Missing tensors: {missing}\n"
@@ -528,6 +529,8 @@ class Model:
        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
        added_vocab = tokenizer.get_added_vocab()

+        added_tokens_decoder = tokenizer.added_tokens_decoder
+
        for i in range(vocab_size):
            if i not in reverse_vocab:
                tokens.append(f"[PAD{i}]")
@@ -537,13 +540,13 @@ class Model:
                if token in added_vocab:
                    # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized.
                    # To avoid unexpected issues - we make sure to normalize non-normalized tokens
-                    if not tokenizer.added_tokens_decoder[i].normalized:
+                    if not added_tokens_decoder[i].normalized:
                        previous_token = token
                        token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
                        if previous_token != token:
                            logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")

-                    if tokenizer.added_tokens_decoder[i].special or self.does_token_look_special(token):
+                    if added_tokens_decoder[i].special or self.does_token_look_special(token):
                        toktypes.append(gguf.TokenType.CONTROL)
                    else:
                        # NOTE: this was added for Gemma.
@@ -702,6 +705,15 @@ class Model:
        if chkhsh == "ccc2ef013c104be7bae2965776d611e1d7a8a2a9c547dd93a682c9a9fc80352e":
            # ref: https://huggingface.co/Xenova/gpt-4o
            res = "gpt-4o"
+        if chkhsh == "7dec86086fcc38b66b7bc1575a160ae21cf705be7718b9d5598190d7c12db76f":
+            # ref: https://huggingface.co/UW/OLMo2-8B-SuperBPE-t180k
+            res = "superbpe"
+        if chkhsh == "1994ffd01900cfb37395608534236ecd63f2bd5995d6cb1004dda1af50240f15":
+            # ref: https://huggingface.co/trillionlabs/Trillion-7B-preview
+            res = "trillion"
+        if chkhsh == "96a5f08be6259352137b512d4157e333e21df7edd3fcd152990608735a65b224":
+            # ref: https://huggingface.co/inclusionAI/Ling-lite
+            res = "bailingmoe"

        if res is None:
            logger.warning("\n")
@@ -1099,13 +1111,6 @@ class BloomModel(Model):

        tensors.append((self.map_tensor_name(name), data_torch))

-        if name == "word_embeddings.weight":
-            assert self.tensor_names is not None
-
-            # TODO: tie them at runtime, don't duplicate in the model file
-            if all(s not in self.tensor_names for s in ("lm_head.weight", "output.weight")):
-                tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch))
-
        return tensors


@@ -1747,6 +1752,25 @@ class LlamaModel(Model):
                raise ValueError(f"Unprocessed experts: {experts}")


+@Model.register("Mistral3ForConditionalGeneration")
+class Mistral3Model(LlamaModel):
+    model_arch = gguf.MODEL_ARCH.LLAMA
+
+    # we need to merge the text_config into the root level of hparams
+    def __init__(self, *args, **kwargs):
+        hparams = kwargs["hparams"] if "hparams" in kwargs else Model.load_hparams(args[0])
+        if "text_config" in hparams:
+            hparams = {**hparams, **hparams["text_config"]}
+            kwargs["hparams"] = hparams
+        super().__init__(*args, **kwargs)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
+        name = name.replace("language_model.", "")
+        if "multi_modal_projector" in name or "vision_tower" in name:
+            return []
+        return super().modify_tensors(data_torch, name, bid)
+
+
@Model.register("DeciLMForCausalLM")
 class DeciModel(Model):
    model_arch = gguf.MODEL_ARCH.DECI
@@ -2251,7 +2275,7 @@ class Qwen2Model(Model):
                self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])


-@Model.register("Qwen2VLForConditionalGeneration")
+@Model.register("Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration")
 class Qwen2VLModel(Model):
    model_arch = gguf.MODEL_ARCH.QWEN2VL

@@ -2404,10 +2428,6 @@ class GPT2Model(Model):

        tensors.append((new_name, data_torch))

-        # note: GPT2 output is tied to (same as) wte in original model
-        if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD):
-            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch))
-
        return tensors


@@ -2737,21 +2757,26 @@ class CodeShellModel(Model):
        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
        self.gguf_writer.add_rope_scaling_factor(1.0)

+    _has_tok_embd = False
+
    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        del bid  # unused

+        output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT)
+        tok_embd_name = self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD)
+
        new_name = self.map_tensor_name(name)

-        tensors: list[tuple[str, Tensor]] = [(new_name, data_torch)]
+        # assuming token_embd.weight is seen before output.weight
+        if not self._has_tok_embd and new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT):
+            # even though the tensor file(s) does not contain the word embeddings they are still in the weight map
+            if self.tensor_names and "transformer.wte.weight" in self.tensor_names:
+                logger.debug(f"{tok_embd_name} not found before {output_name}, assuming they are tied")
+                self.tensor_names.remove("transformer.wte.weight")
+        elif new_name == tok_embd_name:
+            self._has_tok_embd = True

-        if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD):
-            assert self.tensor_names is not None
-
-            if all(s not in self.tensor_names for s in ("lm_head.weight", "output.weight")):
-                # copy tok_embd.weight to output.weight
-                tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch))
-
-        return tensors
+        return [(new_name, data_torch)]


@Model.register("InternLM2ForCausalLM")
@@ -3366,7 +3391,7 @@ class Gemma3Model(Model):

    # we need to merge the text_config into the root level of hparams
    def __init__(self, *args, **kwargs):
-        hparams = Model.load_hparams(kwargs["dir_model"])
+        hparams = kwargs["hparams"] if "hparams" in kwargs else Model.load_hparams(args[0])
        if "text_config" in hparams:
            hparams = {**hparams, **hparams["text_config"]}
            kwargs["hparams"] = hparams
@@ -3532,8 +3557,8 @@ class RWKV6Qwen2Model(Rwkv6Model):
        head_size = hidden_size // num_attention_heads
        rms_norm_eps = self.hparams["rms_norm_eps"]
        intermediate_size = self.hparams["intermediate_size"]
-        time_mix_extra_dim = 64 if hidden_size >= 4096 else 32
-        time_decay_extra_dim = 128 if hidden_size >= 4096 else 64
+        time_mix_extra_dim = self.hparams.get("lora_rank_tokenshift", 64 if hidden_size >= 4096 else 32)
+        time_decay_extra_dim = self.hparams.get("lora_rank_decay", 128 if hidden_size >= 4096 else 64)

        # RWKV isn't context limited
        self.gguf_writer.add_context_length(1048576)
@@ -3784,8 +3809,6 @@ class MambaModel(Model):
    _tok_embd = None

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        del bid  # unused
-
        output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT)
        tok_embd_name = self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD)

@@ -3795,6 +3818,10 @@ class MambaModel(Model):
            logger.debug("A_log --> A ==> " + new_name)
            data_torch = -torch.exp(data_torch)

+        # [4 1 8192 1] -> [4 8192 1 1]
+        if self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_CONV1D, bid):
+            data_torch = data_torch.squeeze()
+
        # assuming token_embd.weight is seen before output.weight
        if self._tok_embd is not None and new_name == output_name:
            if torch.equal(self._tok_embd, data_torch):
@@ -4398,6 +4425,29 @@ class DeepseekV2Model(Model):
                raise ValueError(f"Unprocessed experts: {experts}")


+@Model.register("PLMForCausalLM")
+class PLMModel(Model):
+    model_arch = gguf.MODEL_ARCH.PLM
+
+    def set_vocab(self):
+        self._set_vocab_gpt2()
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+        self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"])
+        self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
+        self.gguf_writer.add_value_length(hparams["v_head_dim"])
+        self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+
@Model.register("T5WithLMHeadModel")
@Model.register("T5ForConditionalGeneration")
@Model.register("MT5ForConditionalGeneration")
@@ -5086,6 +5136,105 @@ class GraniteMoeModel(GraniteModel):
        return super().modify_tensors(data_torch, name, bid)


+@Model.register("BailingMoeForCausalLM")
+class BailingMoeModel(Model):
+    model_arch = gguf.MODEL_ARCH.BAILINGMOE
+
+    def set_vocab(self):
+        self._set_vocab_gpt2()
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        rope_dim = hparams.get("head_dim") or hparams["hidden_size"] // hparams["num_attention_heads"]
+
+        self.gguf_writer.add_rope_dimension_count(rope_dim)
+        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+        self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+        self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
+        self.gguf_writer.add_expert_weights_scale(1.0)
+        self.gguf_writer.add_expert_count(hparams["num_experts"])
+        self.gguf_writer.add_expert_shared_count(hparams["num_shared_experts"])
+        self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"])
+
+    _experts: list[dict[str, Tensor]] | None = None
+
+    @staticmethod
+    def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
+        if n_head_kv is not None and n_head != n_head_kv:
+            n_head = n_head_kv
+        return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
+                .swapaxes(1, 2)
+                .reshape(weights.shape))
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        n_head = self.hparams["num_attention_heads"]
+        n_kv_head = self.hparams.get("num_key_value_heads")
+        n_embd = self.hparams["hidden_size"]
+        head_dim = self.hparams.get("head_dim") or n_embd // n_head
+
+        output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT)
+
+        if name.endswith("attention.dense.weight"):
+            return [(self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, bid), data_torch)]
+        elif name.endswith("query_key_value.weight"):
+            q, k, v = data_torch.split([n_head * head_dim, n_kv_head * head_dim, n_kv_head * head_dim], dim=-2)
+
+            return [
+                (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), BailingMoeModel.permute(q, n_head, n_head)),
+                (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), BailingMoeModel.permute(k, n_head, n_kv_head)),
+                (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), v)
+            ]
+        elif name.find("mlp.experts") != -1:
+            n_experts = self.hparams["num_experts"]
+            assert bid is not None
+
+            tensors: list[tuple[str, Tensor]] = []
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                # merge the experts into a single 3d tensor
+                for w_name in ["down_proj", "gate_proj", "up_proj"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+
+                    merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+
+                    new_name = self.map_tensor_name(merged_name)
+
+                    tensors.append((new_name, data_torch))
+
+            return tensors
+
+        new_name = self.map_tensor_name(name)
+
+        if new_name == output_name and self.hparams.get("norm_head"):
+            data_torch = data_torch.float()
+            data_torch /= torch.norm(data_torch, p=2, dim=0, keepdim=True) + 1e-7
+
+        return [(new_name, data_torch)]
+
+    def prepare_tensors(self):
+        super().prepare_tensors()
+
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
+
@Model.register("ChameleonForConditionalGeneration")
@Model.register("ChameleonForCausalLM")  # obsolete
 class ChameleonModel(Model):
@@ -5339,7 +5488,7 @@ def main() -> None:
            logger.error(f"Model {model_architecture} is not supported")
            sys.exit(1)

-        model_instance = model_class(dir_model=dir_model, ftype=output_type, fname_out=fname_out,
+        model_instance = model_class(dir_model, output_type, fname_out,
                                     is_big_endian=args.bigendian, use_temp_file=args.use_temp_file,
                                     eager=args.no_lazy,
                                     metadata_override=args.metadata, model_name=args.model_name,
--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@@ -110,6 +110,9 @@ models = [
    {"name": "deepseek-v3",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-V3"},
    {"name": "deepseek-r1-qwen", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"},
    {"name": "gpt-4o",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Xenova/gpt-4o", },
+    {"name": "superbpe",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/UW/OLMo2-8B-SuperBPE-t180k", },
+    {"name": "trillion",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/trillionlabs/Trillion-7B-preview", },
+    {"name": "bailingmoe",       "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inclusionAI/Ling-lite", },
 ]


--- a/docs/backend/CUDA-FEDORA.md
+++ b/docs/backend/CUDA-FEDORA.md
@@ -14,9 +14,7 @@ In this guide we setup [Nvidia CUDA](https://docs.nvidia.com/cuda/) in a toolbox
 - [Creating a Fedora Toolbox Environment](#creating-a-fedora-toolbox-environment)
 - [Installing Essential Development Tools](#installing-essential-development-tools)
 - [Adding the CUDA Repository](#adding-the-cuda-repository)
- [Installing `nvidia-driver-libs`](#installing-nvidia-driver-libs)
- [Manually Resolving Package Conflicts](#manually-resolving-package-conflicts)
- [Finalizing the Installation of `nvidia-driver-libs`](#finalizing-the-installation-of-nvidia-driver-libs)
+- [Installing Nvidia Driver Libraries](#installing-nvidia-driver-libraries)
 - [Installing the CUDA Meta-Package](#installing-the-cuda-meta-package)
 - [Configuring the Environment](#configuring-the-environment)
 - [Verifying the Installation](#verifying-the-installation)
@@ -67,7 +65,7 @@ This guide focuses on Fedora hosts, but with small adjustments, it can work for
   sudo dnf distro-sync
   ```

-2. **Install the Default Text Editor (Optional):**
+2. **Install **Vim** the default text editor (Optional):**

   ```bash
   sudo dnf install vim-default-editor --allowerasing
@@ -97,36 +95,48 @@ After adding the repository, synchronize the package manager again:
 sudo dnf distro-sync
 ```

-## Installing `nvidia-driver-libs` and `nvidia-driver-cuda-libs`
+## Installing Nvidia Driver Libraries

-We need to detect if the host is supplying the [NVIDIA driver libraries into the toolbox](https://github.com/containers/toolbox/blob/main/src/pkg/nvidia/nvidia.go).
+First, we need to detect if the host is supplying the [NVIDIA driver libraries into the toolbox](https://github.com/containers/toolbox/blob/main/src/pkg/nvidia/nvidia.go):

 ```bash
 ls -la /usr/lib64/libcuda.so.1
 ```

+### If *`libcuda.so.1`* is missing:
+
+```
+ls: cannot access '/usr/lib64/libcuda.so.1': No such file or directory
+```
+
 **Explanation:**
+The host dose not supply the CUDA drivers, **install them now:**

- `nvidia-driver-libs` and `nvidia-driver-cuda-libs` contains necessary NVIDIA driver libraries required by CUDA,
-  on hosts with NVIDIA drivers installed the Fedora Container will supply the host libraries.
-
-### Install Nvidia Driver Libraries on Guest (if `libcuda.so.1` was NOT found).
+#### Install the Nvidia Driver Libraries on Guest:

 ```bash
-sudo dnf install nvidia-driver-libs nvidia-driver-cuda-libs
+sudo dnf install nvidia-driver-cuda nvidia-driver-libs nvidia-driver-cuda-libs nvidia-persistenced
 ```

-### Manually Updating the RPM database for host-supplied NVIDIA drivers (if `libcuda.so.1` was found).
+### If *`libcuda.so.1`* exists:
+```
+lrwxrwxrwx. 1 root root 21 Mar 24 11:26 /usr/lib64/libcuda.so.1 -> libcuda.so.570.133.07
+```

-If the installation fails due to conflicts, we'll manually download and install the required packages, excluding conflicting files.
+**Explanation:**
+The host is supply the CUDA drivers, **we need to update the guest RPM Database accordingly:**

-#### 1. Download `nvidia-driver-libs` and `nvidia-driver-cuda-libs` RPM's (with dependencies)
+#### Update the Toolbox RPM Database to include the Host-Supplied Libraries:
+
+Note: we do not actually install the libraries, we just update the DB so that the guest system knows they are supplied by the host.
+
+##### 1. Download `nvidia-` parts that are supplied by the host RPM's (with dependencies)

 ```bash
-sudo dnf download --destdir=/tmp/nvidia-driver-libs --resolve --arch x86_64 nvidia-driver-libs nvidia-driver-cuda-libs
+sudo dnf download --destdir=/tmp/nvidia-driver-libs --resolve --arch x86_64 nvidia-driver-cuda nvidia-driver-libs nvidia-driver-cuda-libs nvidia-persistenced
 ```

-#### 2. Update the RPM database to assume the installation of these packages.
+##### 2. Update the RPM database to assume the installation of these packages.

 ```bash
 sudo rpm --install --verbose --hash --justdb /tmp/nvidia-driver-libs/*
@@ -134,23 +144,26 @@ sudo rpm --install --verbose --hash --justdb /tmp/nvidia-driver-libs/*

 **Note:**

- The `--justdb` option only updates the RPM database, without touching the filesystem.
+- The `--justdb` option only updates the RPM database, without touching the filesystem elsewhere.

-#### Finalizing the Installation of `nvidia-driver-libs` and `nvidia-driver-cuda-libs`
+##### Check that the RPM Database has been correctly updated:
+
+**Note:** This is the same command as in the *"Install the Nvidia Driver Libraries on Guest"* for if *`libcuda.so.1`* was missing.

-After manually installing the dependencies, run:

 ```bash
-sudo dnf install nvidia-driver-libs nvidia-driver-cuda-libs
+sudo dnf install nvidia-driver-cuda nvidia-driver-libs nvidia-driver-cuda-libs nvidia-persistenced
 ```

-You should receive a message indicating the package is already installed:
+*(this time it will not install anything, as the database things that these packages are already installed)*

 ```
 Updating and loading repositories:
 Repositories loaded.
-Package "nvidia-driver-libs-3:570.86.10-1.fc41.x86_64" is already installed.
-Package "nvidia-driver-cuda-libs-3:570.86.10-1.fc41.x86_64" is already installed.
+Package "nvidia-driver-cuda-3:570.124.06-1.fc41.x86_64" is already installed.
+Package "nvidia-driver-libs-3:570.124.06-1.fc41.x86_64" is already installed.
+Package "nvidia-driver-cuda-libs-3:570.124.06-1.fc41.x86_64" is already installed.
+Package "nvidia-persistenced-3:570.124.06-1.fc41.x86_64" is already installed.

 Nothing to do.
 ```
@@ -207,9 +220,9 @@ You should see output similar to:
 ```
 nvcc: NVIDIA (R) Cuda compiler driver
 Copyright (c) 2005-2025 NVIDIA Corporation
-Built on Wed_Jan_15_19:20:09_PST_2025
-Cuda compilation tools, release 12.8, V12.8.61
-Build cuda_12.8.r12.8/compiler.35404655_0
+Built on Fri_Feb_21_20:23:50_PST_2025
+Cuda compilation tools, release 12.8, V12.8.93
+Build cuda_12.8.r12.8/compiler.35583870_0
 ```

 This output confirms that the CUDA compiler is accessible and indicates the installed version.
--- a/docs/backend/SYCL.md
+++ b/docs/backend/SYCL.md
@@ -20,7 +20,7 @@
 **oneAPI** is an open ecosystem and a standard-based specification, supporting multiple architectures including but not limited to intel CPUs, GPUs and FPGAs. The key components of the oneAPI ecosystem include:

 - **DPCPP** *(Data Parallel C++)*: The primary oneAPI SYCL implementation, which includes the icpx/icx Compilers.
- **oneAPI Libraries**: A set of highly optimized libraries targeting multiple domains *(e.g. oneMKL and oneDNN)*.
+- **oneAPI Libraries**: A set of highly optimized libraries targeting multiple domains *(e.g. Intel oneMKL, oneMath and oneDNN)*.
 - **oneAPI LevelZero**: A high performance low level interface for fine-grained control over intel iGPUs and dGPUs.
 - **Nvidia & AMD Plugins**: These are plugins extending oneAPI's DPCPP support to SYCL on Nvidia and AMD GPU targets.

@@ -227,30 +227,19 @@ Upon a successful installation, SYCL is enabled for the available intel devices,

 **oneAPI Plugin**: In order to enable SYCL support on Nvidia GPUs, please install the [Codeplay oneAPI Plugin for Nvidia GPUs](https://developer.codeplay.com/products/oneapi/nvidia/download). User should also make sure the plugin version matches the installed base toolkit one *(previous step)* for a seamless "oneAPI on Nvidia GPU" setup.

-
-**oneMKL for cuBlas**: The current oneMKL releases *(shipped with the oneAPI base-toolkit)* do not contain the cuBLAS backend. A build from source of the upstream [oneMKL](https://github.com/oneapi-src/oneMKL) with the *cuBLAS* backend enabled is thus required to run it on Nvidia GPUs.
+**oneDNN**: The current oneDNN releases *(shipped with the oneAPI base-toolkit)* do not include the NVIDIA backend. Therefore, oneDNN must be compiled from source to enable the NVIDIA target:

 ```sh
-git clone https://github.com/oneapi-src/oneMKL
-cd oneMKL
-cmake -B buildWithCublas -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENABLE_MKLGPU_BACKEND=OFF -DENABLE_MKLCPU_BACKEND=OFF -DENABLE_CUBLAS_BACKEND=ON -DTARGET_DOMAINS=blas
-cmake --build buildWithCublas --config Release
+git clone https://github.com/oneapi-src/oneDNN.git
+cd oneDNN
+cmake -GNinja -Bbuild-nvidia -DDNNL_CPU_RUNTIME=DPCPP -DDNNL_GPU_RUNTIME=DPCPP -DDNNL_GPU_VENDOR=NVIDIA -DONEDNN_BUILD_GRAPH=OFF -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+cmake --build build-nvidia --config Release
 ```

 - **Adding support to AMD GPUs**

 **oneAPI Plugin**: In order to enable SYCL support on AMD GPUs, please install the [Codeplay oneAPI Plugin for AMD GPUs](https://developer.codeplay.com/products/oneapi/amd/download). As with Nvidia GPUs, the user should also make sure the plugin version matches the installed base toolkit.

-**oneMKL for rocBlas**: The current oneMKL releases *(shipped with the oneAPI base-toolkit)* doesn't contain the rocBLAS backend. A build from source of the upstream [oneMKL](https://github.com/oneapi-src/oneMKL) with the *rocBLAS* backend enabled is thus required to run it on AMD GPUs.
-
-```sh
-git clone https://github.com/oneapi-src/oneMKL
-cd oneMKL
-# Find your HIPTARGET with rocminfo, under the key 'Name:'
-cmake -B buildWithrocBLAS -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENABLE_MKLGPU_BACKEND=OFF -DENABLE_MKLCPU_BACKEND=OFF -DENABLE_ROCBLAS_BACKEND=ON -DHIPTARGETS=${HIPTARGET} -DTARGET_DOMAINS=blas
-cmake --build buildWithrocBLAS --config Release
-```
-
 3. **Verify installation and environment**

 In order to check the available SYCL devices on the machine, please use the `sycl-ls` command.
@@ -315,22 +304,19 @@ cmake --build build --config Release -j -v

 #### Nvidia GPU

-```sh
-# Export relevant ENV variables
-export LD_LIBRARY_PATH=/path/to/oneMKL/buildWithCublas/lib:$LD_LIBRARY_PATH
-export LIBRARY_PATH=/path/to/oneMKL/buildWithCublas/lib:$LIBRARY_PATH
-export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithCublas/include:$CPLUS_INCLUDE_DIR
-export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR
+The SYCL backend depends on [oneMath](https://github.com/uxlfoundation/oneMath) for Nvidia and AMD devices.
+By default it is automatically built along with the project. A specific build can be provided by setting the CMake flag `-DoneMath_DIR=/path/to/oneMath/install/lib/cmake/oneMath`.

+```sh
 # Build LLAMA with Nvidia BLAS acceleration through SYCL
 # Setting GGML_SYCL_DEVICE_ARCH is optional but can improve performance
 GGML_SYCL_DEVICE_ARCH=sm_80 # Example architecture

 # Option 1: Use FP32 (recommended for better performance in most cases)
-cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DDNNL_DIR=/path/to/oneDNN/build-nvidia/install/lib/cmake/dnnl

 # Option 2: Use FP16
-cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
+cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON -DDNNL_DIR=/path/to/oneDNN/build-nvidia/install/lib/cmake/dnnl

 # build all binary
 cmake --build build --config Release -j -v
@@ -338,12 +324,10 @@ cmake --build build --config Release -j -v

 #### AMD GPU

-```sh
-# Export relevant ENV variables
-export LD_LIBRARY_PATH=/path/to/oneMKL/buildWithrocBLAS/lib:$LD_LIBRARY_PATH
-export LIBRARY_PATH=/path/to/oneMKL/buildWithrocBLAS/lib:$LIBRARY_PATH
-export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithrocBLAS/include:$CPLUS_INCLUDE_DIR
+The SYCL backend depends on [oneMath](https://github.com/uxlfoundation/oneMath) for Nvidia and AMD devices.
+By default it is automatically built along with the project. A specific build can be provided by setting the CMake flag `-DoneMath_DIR=/path/to/oneMath/install/lib/cmake/oneMath`.

+```sh
 # Build LLAMA with rocBLAS acceleration through SYCL

 ## AMD
--- a/docs/build.md
+++ b/docs/build.md
@@ -132,12 +132,14 @@ You may find the official downloads here: [NVIDIA developer site](https://develo


 #### Compile and run inside a Fedora Toolbox Container
-We also have a [guide](./cuda-fedora.md) for setting up CUDA toolkit in a Fedora [toolbox container](https://containertoolbx.org/).
+We also have a [guide](./backend/CUDA-FEDORA.md) for setting up CUDA toolkit in a Fedora [toolbox container](https://containertoolbx.org/).

 **Recommended for:**
-
- ***Particularly*** *convenient* for users of [Atomic Desktops for Fedora](https://fedoraproject.org/atomic-desktops/); such as: [Silverblue](https://fedoraproject.org/atomic-desktops/silverblue/) and [Kinoite](https://fedoraproject.org/atomic-desktops/kinoite/).
- Toolbox is installed by default: [Fedora Workstation](https://fedoraproject.org/workstation/) or [Fedora KDE Plasma Desktop](https://fedoraproject.org/spins/kde).
+- ***Necessary*** for users of [Atomic Desktops for Fedora](https://fedoraproject.org/atomic-desktops/); such as: [Silverblue](https://fedoraproject.org/atomic-desktops/silverblue/) and [Kinoite](https://fedoraproject.org/atomic-desktops/kinoite/).
+  - (there are no supported CUDA packages for these systems)
+- ***Necessary*** for users that have a host that is not a: [Supported Nvidia CUDA Release Platform](https://developer.nvidia.com/cuda-downloads).
+  - (for example, you may have [Fedora 42 Beta](https://fedoramagazine.org/announcing-fedora-linux-42-beta/) as your your host operating system)
+- ***Convenient*** For those running [Fedora Workstation](https://fedoraproject.org/workstation/) or [Fedora KDE Plasma Desktop](https://fedoraproject.org/spins/kde), and want to keep their host system clean.
 - *Optionally* toolbox packages are available: [Arch Linux](https://archlinux.org/), [Red Hat Enterprise Linux >= 8.5](https://www.redhat.com/en/technologies/linux-platforms/enterprise-linux), or [Ubuntu](https://ubuntu.com/download)


@@ -189,7 +191,7 @@ The following compilation options are also available to tweak performance:

 | Option                        | Legal values           | Default | Description                                                                                                                                                                                                                                                                             |
 |-------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| GGML_CUDA_FORCE_MMQ           | Boolean                | false   | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, RDNA3). MMQ kernels are enabled by default on GPUs with int8 tensor core support. With MMQ force enabled, speed for large batch sizes will be worse but VRAM consumption will be lower.                       |
+| GGML_CUDA_FORCE_MMQ           | Boolean                | false   | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, CDNA and RDNA3+). MMQ kernels are enabled by default on GPUs with int8 tensor core support. With MMQ force enabled, speed for large batch sizes will be worse but VRAM consumption will be lower.                       |
 | GGML_CUDA_FORCE_CUBLAS        | Boolean                | false   | Force the use of FP16 cuBLAS instead of custom matrix multiplication kernels for quantized models                                                                                                                                                                                       |
 | GGML_CUDA_F16                 | Boolean                | false   | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs.                                                           |
 | GGML_CUDA_PEER_MAX_BATCH_SIZE | Positive integer       | 128     | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial.                                                                         |
@@ -216,6 +218,7 @@ By default, all supported compute capabilities are enabled. To customize this be

 ```bash
 cmake -B build -DGGML_MUSA=ON -DMUSA_ARCHITECTURES="21"
+cmake --build build --config Release
 ```

 This configuration enables only compute capability `2.1` (MTT S80) during compilation, which can help reduce compilation time.
@@ -433,6 +436,26 @@ llama_new_context_with_model:       CANN compute buffer size =  1260.81 MiB

 For detailed info, such as model/device supports, CANN install, please refer to [llama.cpp for CANN](./backend/CANN.md).

+## Arm® KleidiAI™
+KleidiAI is a library of optimized microkernels for AI workloads, specifically designed for Arm CPUs. These microkernels enhance performance and can be enabled for use by the CPU backend.
+
+To enable KleidiAI, go to the llama.cpp directory and build using CMake
+```bash
+cmake -B build -DGGML_CPU_KLEIDIAI=ON
+cmake --build build --config Release
+```
+You can verify that KleidiAI is being used by running
+```bash
+./build/bin/llama-cli -m PATH_TO_MODEL -p "What is a car?"
+```
+If KleidiAI is enabled, the ouput will contain a line similar to:
+```
+load_tensors: CPU_KLEIDIAI model buffer size =  3474.00 MiB
+```
+KleidiAI's microkernels implement optimized tensor operations using Arm CPU features such as dotprod, int8mm and SME. llama.cpp selects the most efficient kernel based on runtime CPU feature detection. However, on platforms that support SME, you must manually enable SME microkernels by setting the environment variable `GGML_KLEIDIAI_SME=1`.
+
+Depending on your build target, other higher priority backends may be enabled by default. To ensure the CPU backend is used, you must disable the higher priority backends either at compile time, e.g. -DGGML_METAL=OFF, or during run-time using the command line option `--device none`.
+
 ## Android

 To read documentation for how to build on Android, [click here](./android.md)
--- a/docs/install.md
+++ b/docs/install.md
@@ -9,6 +9,13 @@ brew install llama.cpp
 ```
 The formula is automatically updated with new `llama.cpp` releases. More info: https://github.com/ggml-org/llama.cpp/discussions/7668

+## MacPorts
+
+```sh
+sudo port install llama.cpp
+```
+see also: https://ports.macports.org/port/llama.cpp/details/
+
 ## Nix

 On Mac and Linux, the Nix package manager can be used via
--- a/examples/batched-bench/batched-bench.cpp
+++ b/examples/batched-bench/batched-bench.cpp
@@ -38,7 +38,7 @@ int main(int argc, char ** argv) {

    llama_model_params model_params = common_model_params_to_llama(params);

-    llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params);
+    llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params);

    if (model == NULL) {
        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
--- a/examples/batched/batched.cpp
+++ b/examples/batched/batched.cpp
@@ -41,7 +41,7 @@ int main(int argc, char ** argv) {

    llama_model_params model_params = common_model_params_to_llama(params);

-    llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params);
+    llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params);

    if (model == NULL) {
        LOG_ERR("%s: error: unable to load model\n" , __func__);
--- a/examples/export-lora/export-lora.cpp
+++ b/examples/export-lora/export-lora.cpp
@@ -421,7 +421,7 @@ int main(int argc, char ** argv) {

    g_verbose = (params.verbosity > 1);
    try {
-        lora_merge_ctx ctx(params.model, params.lora_adapters, params.out_file, params.cpuparams.n_threads);
+        lora_merge_ctx ctx(params.model.path, params.lora_adapters, params.out_file, params.cpuparams.n_threads);
        ctx.run_merge();
    } catch (const std::exception & err) {
        fprintf(stderr, "%s\n", err.what());
--- a/examples/gritlm/gritlm.cpp
+++ b/examples/gritlm/gritlm.cpp
@@ -168,7 +168,7 @@ int main(int argc, char * argv[]) {

    llama_backend_init();

-    llama_model * model = llama_model_load_from_file(params.model.c_str(), mparams);
+    llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);

    // create generation context
    llama_context * ctx = llama_init_from_model(model, cparams);
--- a/examples/llava/README-gemma3.md
+++ b/examples/llava/README-gemma3.md
@@ -4,6 +4,26 @@
 >
 > This is very experimental, only used for demo purpose.

+## Quick started
+
+You can use pre-quantized model from [ggml-org](https://huggingface.co/ggml-org)'s Hugging Face account
+
+```bash
+# build
+cmake -B build
+cmake --build build --target llama-gemma3-cli
+
+# alternatively, install from brew (MacOS)
+brew install llama.cpp
+
+# run it
+llama-gemma3-cli -hf ggml-org/gemma-3-4b-it-GGUF
+llama-gemma3-cli -hf ggml-org/gemma-3-12b-it-GGUF
+llama-gemma3-cli -hf ggml-org/gemma-3-27b-it-GGUF
+
+# note: 1B model does not support vision
+```
+
 ## How to get mmproj.gguf?

 ```bash
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -1396,14 +1396,16 @@ struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_p
        const int n_kv = gguf_get_n_kv(ctx);
        const int ftype = get_u32(ctx, KEY_FTYPE);
        const std::string ftype_str = get_ftype(ftype);
-        const int idx_desc = get_key_idx(ctx, KEY_DESCRIPTION);
-        const std::string description = gguf_get_val_str(ctx, idx_desc);
        const int idx_name = gguf_find_key(ctx, KEY_NAME);
        if (idx_name != -1) { // make name optional temporarily as some of the uploaded models missing it due to a bug
            const std::string name = gguf_get_val_str(ctx, idx_name);
            LOG_INF("%s: model name:   %s\n", __func__, name.c_str());
        }
-        LOG_INF("%s: description:  %s\n", __func__, description.c_str());
+        const int idx_desc = gguf_find_key(ctx, KEY_DESCRIPTION);
+        if (idx_desc != -1) { // ditto
+            const std::string description = gguf_get_val_str(ctx, idx_desc);
+            LOG_INF("%s: description:  %s\n", __func__, description.c_str());
+        }
        LOG_INF("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx));
        LOG_INF("%s: alignment:    %zu\n", __func__, gguf_get_alignment(ctx));
        LOG_INF("%s: n_tensors:    %d\n", __func__, n_tensors);
@@ -2989,7 +2991,10 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
    assert(itype < GGML_TYPE_COUNT);
    ggml_type type = static_cast<ggml_type>(itype);

-    auto * ctx_clip = clip_model_load(fname_inp, 2);
+    auto * ctx_clip = clip_init(fname_inp, clip_context_params{
+        /* use_gpu */   false,
+        /* verbosity */ 2,
+    });

    const auto & ctx_src = ctx_clip->ctx_gguf;
    const auto & ctx_data = ctx_clip->ctx_data;
--- a/examples/llava/gemma3-cli.cpp
+++ b/examples/llava/gemma3-cli.cpp
@@ -78,7 +78,7 @@ struct gemma3_context {
    }

    void init_clip_model(common_params & params) {
-        const char * clip_path = params.mmproj.c_str();
+        const char * clip_path = params.mmproj.path.c_str();
        ctx_clip = clip_model_load(clip_path, params.verbosity > 1);
    }

@@ -232,13 +232,13 @@ int main(int argc, char ** argv) {

    common_init();

-    if (params.mmproj.empty()) {
+    if (params.mmproj.path.empty()) {
        show_additional_info(argc, argv);
        return 1;
    }

    gemma3_context ctx(params);
-    printf("%s: %s\n", __func__, params.model.c_str());
+    printf("%s: %s\n", __func__, params.model.path.c_str());

    bool is_single_turn = !params.prompt.empty() && !params.image.empty();

--- a/examples/llava/llava-cli.cpp
+++ b/examples/llava/llava-cli.cpp
@@ -225,7 +225,7 @@ static struct llama_model * llava_init(common_params * params) {

    llama_model_params model_params = common_model_params_to_llama(*params);

-    llama_model * model = llama_model_load_from_file(params->model.c_str(), model_params);
+    llama_model * model = llama_model_load_from_file(params->model.path.c_str(), model_params);
    if (model == NULL) {
        LOG_ERR("%s: unable to load model\n" , __func__);
        return NULL;
@@ -234,7 +234,7 @@ static struct llama_model * llava_init(common_params * params) {
 }

 static struct llava_context * llava_init_context(common_params * params, llama_model * model) {
-    const char * clip_path = params->mmproj.c_str();
+    const char * clip_path = params->mmproj.path.c_str();

    auto prompt = params->prompt;
    if (prompt.empty()) {
@@ -283,7 +283,7 @@ int main(int argc, char ** argv) {

    common_init();

-    if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
+    if (params.mmproj.path.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
        print_usage(argc, argv);
        return 1;
    }
--- a/examples/llava/minicpmv-cli.cpp
+++ b/examples/llava/minicpmv-cli.cpp
@@ -31,7 +31,7 @@ static struct llama_model * llava_init(common_params * params) {

    llama_model_params model_params = common_model_params_to_llama(*params);

-    llama_model * model = llama_model_load_from_file(params->model.c_str(), model_params);
+    llama_model * model = llama_model_load_from_file(params->model.path.c_str(), model_params);
    if (model == NULL) {
        LOG_ERR("%s: unable to load model\n" , __func__);
        return NULL;
@@ -80,7 +80,7 @@ static void llava_free(struct llava_context * ctx_llava) {
 }

 static struct clip_ctx * clip_init_context(common_params * params) {
-    const char * clip_path = params->mmproj.c_str();
+    const char * clip_path = params->mmproj.path.c_str();

    auto prompt = params->prompt;
    if (prompt.empty()) {
@@ -290,7 +290,7 @@ int main(int argc, char ** argv) {

    common_init();

-    if (params.mmproj.empty() || (params.image.empty())) {
+    if (params.mmproj.path.empty() || (params.image.empty())) {
        show_additional_info(argc, argv);
        return 1;
    }
--- a/examples/llava/qwen2vl-cli.cpp
+++ b/examples/llava/qwen2vl-cli.cpp
@@ -314,7 +314,7 @@ static struct llama_model * llava_init(common_params * params) {

    llama_model_params model_params = common_model_params_to_llama(*params);

-    llama_model * model = llama_model_load_from_file(params->model.c_str(), model_params);
+    llama_model * model = llama_model_load_from_file(params->model.path.c_str(), model_params);
    if (model == NULL) {
        LOG_ERR("%s: unable to load model\n" , __func__);
        return NULL;
@@ -323,7 +323,7 @@ static struct llama_model * llava_init(common_params * params) {
 }

 static struct llava_context * llava_init_context(common_params * params, llama_model * model) {
-    const char * clip_path = params->mmproj.c_str();
+    const char * clip_path = params->mmproj.path.c_str();

    auto prompt = params->prompt;
    if (prompt.empty()) {
@@ -524,7 +524,7 @@ int main(int argc, char ** argv) {

    common_init();

-    if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
+    if (params.mmproj.path.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
        print_usage(argc, argv);
        return 1;
    }
--- a/examples/parallel/parallel.cpp
+++ b/examples/parallel/parallel.cpp
@@ -106,6 +106,8 @@ int main(int argc, char ** argv) {

    common_params params;

+    params.n_predict = 128;
+
    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PARALLEL)) {
        return 1;
    }
@@ -405,7 +407,7 @@ int main(int argc, char ** argv) {
        params.prompt_file = "used built-in defaults";
    }
    LOG_INF("External prompt file: \033[32m%s\033[0m\n", params.prompt_file.c_str());
-    LOG_INF("Model and path used:  \033[32m%s\033[0m\n\n", params.model.c_str());
+    LOG_INF("Model and path used:  \033[32m%s\033[0m\n\n", params.model.path.c_str());

    LOG_INF("Total prompt tokens: %6d, speed: %5.2f t/s\n", n_total_prompt, (double) (n_total_prompt              ) / (t_main_end - t_main_start) * 1e6);
    LOG_INF("Total gen tokens:    %6d, speed: %5.2f t/s\n", n_total_gen,    (double) (n_total_gen                 ) / (t_main_end - t_main_start) * 1e6);
--- a/examples/passkey/passkey.cpp
+++ b/examples/passkey/passkey.cpp
@@ -64,7 +64,7 @@ int main(int argc, char ** argv) {

    llama_model_params model_params = common_model_params_to_llama(params);

-    llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params);
+    llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params);

    if (model == NULL) {
        LOG_ERR("%s: unable to load model\n" , __func__);
--- a/examples/rpc/CMakeLists.txt
+++ b/examples/rpc/CMakeLists.txt
@@ -1,2 +1,4 @@
-add_executable(rpc-server rpc-server.cpp)
-target_link_libraries(rpc-server PRIVATE ggml llama)
+set(TARGET rpc-server)
+add_executable(${TARGET} rpc-server.cpp)
+target_link_libraries(${TARGET} PRIVATE ggml)
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
--- a/examples/rpc/README.md
+++ b/examples/rpc/README.md
@@ -72,3 +72,14 @@ $ bin/llama-cli -m ../models/tinyllama-1b/ggml-model-f16.gguf -p "Hello, my name

 This way you can offload model layers to both local and remote devices.

+### Local cache
+
+The RPC server can use a local cache to store large tensors and avoid transferring them over the network.
+This can speed up model loading significantly, especially when using large models.
+To enable the cache, use the `-c` option:
+
+```bash
+$ bin/rpc-server -c
+```
+
+By default, the cache is stored in the `$HOME/.cache/llama.cpp/rpc` directory and can be controlled via the `LLAMA_CACHE` environment variable.
--- a/examples/rpc/rpc-server.cpp
+++ b/examples/rpc/rpc-server.cpp
@@ -1,3 +1,7 @@
+#if defined(_MSC_VER)
+#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
+#endif
+
 #include "ggml-cpu.h"

 #ifdef GGML_USE_CUDA
@@ -18,26 +22,142 @@

 #include "ggml-rpc.h"
 #ifdef _WIN32
+#  define DIRECTORY_SEPARATOR '\\'
+#  include <locale>
 #  include <windows.h>
+#  include <fcntl.h>
+#  include <io.h>
 #else
+#  define DIRECTORY_SEPARATOR '/'
 #  include <unistd.h>
+#  include <sys/stat.h>
 #endif
+#include <codecvt>
 #include <string>
 #include <stdio.h>
+#include <vector>
+#include <filesystem>
+
+namespace fs = std::filesystem;
+
+// NOTE: this is copied from common.cpp to avoid linking with libcommon
+// returns true if successful, false otherwise
+static bool fs_create_directory_with_parents(const std::string & path) {
+#ifdef _WIN32
+    std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
+    std::wstring wpath = converter.from_bytes(path);
+
+    // if the path already exists, check whether it's a directory
+    const DWORD attributes = GetFileAttributesW(wpath.c_str());
+    if ((attributes != INVALID_FILE_ATTRIBUTES) && (attributes & FILE_ATTRIBUTE_DIRECTORY)) {
+        return true;
+    }
+
+    size_t pos_slash = 0;
+
+    // process path from front to back, procedurally creating directories
+    while ((pos_slash = path.find('\\', pos_slash)) != std::string::npos) {
+        const std::wstring subpath = wpath.substr(0, pos_slash);
+        const wchar_t * test = subpath.c_str();
+
+        const bool success = CreateDirectoryW(test, NULL);
+        if (!success) {
+            const DWORD error = GetLastError();
+
+            // if the path already exists, ensure that it's a directory
+            if (error == ERROR_ALREADY_EXISTS) {
+                const DWORD attributes = GetFileAttributesW(subpath.c_str());
+                if (attributes == INVALID_FILE_ATTRIBUTES || !(attributes & FILE_ATTRIBUTE_DIRECTORY)) {
+                    return false;
+                }
+            } else {
+                return false;
+            }
+        }
+
+        pos_slash += 1;
+    }
+
+    return true;
+#else
+    // if the path already exists, check whether it's a directory
+    struct stat info;
+    if (stat(path.c_str(), &info) == 0) {
+        return S_ISDIR(info.st_mode);
+    }
+
+    size_t pos_slash = 1; // skip leading slashes for directory creation
+
+    // process path from front to back, procedurally creating directories
+    while ((pos_slash = path.find('/', pos_slash)) != std::string::npos) {
+        const std::string subpath = path.substr(0, pos_slash);
+        struct stat info;
+
+        // if the path already exists, ensure that it's a directory
+        if (stat(subpath.c_str(), &info) == 0) {
+            if (!S_ISDIR(info.st_mode)) {
+                return false;
+            }
+        } else {
+            // create parent directories
+            const int ret = mkdir(subpath.c_str(), 0755);
+            if (ret != 0) {
+                return false;
+            }
+        }
+
+        pos_slash += 1;
+    }
+
+    return true;
+#endif // _WIN32
+}
+
+// NOTE: this is copied from common.cpp to avoid linking with libcommon
+static std::string fs_get_cache_directory() {
+    std::string cache_directory = "";
+    auto ensure_trailing_slash = [](std::string p) {
+        // Make sure to add trailing slash
+        if (p.back() != DIRECTORY_SEPARATOR) {
+            p += DIRECTORY_SEPARATOR;
+        }
+        return p;
+    };
+    if (getenv("LLAMA_CACHE")) {
+        cache_directory = std::getenv("LLAMA_CACHE");
+    } else {
+#ifdef __linux__
+        if (std::getenv("XDG_CACHE_HOME")) {
+            cache_directory = std::getenv("XDG_CACHE_HOME");
+        } else {
+            cache_directory = std::getenv("HOME") + std::string("/.cache/");
+        }
+#elif defined(__APPLE__)
+        cache_directory = std::getenv("HOME") + std::string("/Library/Caches/");
+#elif defined(_WIN32)
+        cache_directory = std::getenv("LOCALAPPDATA");
+#endif // __linux__
+        cache_directory = ensure_trailing_slash(cache_directory);
+        cache_directory += "llama.cpp";
+    }
+    return ensure_trailing_slash(cache_directory);
+}

 struct rpc_server_params {
    std::string host        = "127.0.0.1";
    int         port        = 50052;
    size_t      backend_mem = 0;
+    bool        use_cache   = false;
 };

 static void print_usage(int /*argc*/, char ** argv, rpc_server_params params) {
    fprintf(stderr, "Usage: %s [options]\n\n", argv[0]);
    fprintf(stderr, "options:\n");
-    fprintf(stderr, "  -h, --help            show this help message and exit\n");
-    fprintf(stderr, "  -H HOST, --host HOST  host to bind to (default: %s)\n", params.host.c_str());
-    fprintf(stderr, "  -p PORT, --port PORT  port to bind to (default: %d)\n", params.port);
-    fprintf(stderr, "  -m MEM, --mem MEM     backend memory size (in MB)\n");
+    fprintf(stderr, "  -h, --help                show this help message and exit\n");
+    fprintf(stderr, "  -H HOST, --host HOST      host to bind to (default: %s)\n", params.host.c_str());
+    fprintf(stderr, "  -p PORT, --port PORT      port to bind to (default: %d)\n", params.port);
+    fprintf(stderr, "  -m MEM,  --mem MEM        backend memory size (in MB)\n");
+    fprintf(stderr, "  -c,      --cache          enable local file cache\n");
    fprintf(stderr, "\n");
 }

@@ -58,6 +178,8 @@ static bool rpc_server_params_parse(int argc, char ** argv, rpc_server_params &
            if (params.port <= 0 || params.port > 65535) {
                return false;
            }
+        } else if (arg == "-c" || arg == "--cache") {
+            params.use_cache = true;
        } else if (arg == "-m" || arg == "--mem") {
            if (++i >= argc) {
                return false;
@@ -164,8 +286,20 @@ int main(int argc, char * argv[]) {
    } else {
        get_backend_memory(&free_mem, &total_mem);
    }
-    printf("Starting RPC server on %s, backend memory: %zu MB\n", endpoint.c_str(), free_mem / (1024 * 1024));
-    ggml_backend_rpc_start_server(backend, endpoint.c_str(), free_mem, total_mem);
+    const char * cache_dir = nullptr;
+    std::string cache_dir_str = fs_get_cache_directory() + "rpc/";
+    if (params.use_cache) {
+        if (!fs_create_directory_with_parents(cache_dir_str)) {
+            fprintf(stderr, "Failed to create cache directory: %s\n", cache_dir_str.c_str());
+            return 1;
+        }
+        cache_dir = cache_dir_str.c_str();
+    }
+    printf("Starting RPC server\n");
+    printf("  endpoint       : %s\n", endpoint.c_str());
+    printf("  local cache    : %s\n", cache_dir ? cache_dir : "n/a");
+    printf("  backend memory : %zu MB\n", free_mem / (1024 * 1024));
+    ggml_backend_rpc_start_server(backend, endpoint.c_str(), cache_dir, free_mem, total_mem);
    ggml_backend_free(backend);
    return 0;
 }
--- a/examples/run/run.cpp
+++ b/examples/run/run.cpp
@@ -38,24 +38,6 @@
 }
 #endif

-GGML_ATTRIBUTE_FORMAT(1, 2)
-static std::string fmt(const char * fmt, ...) {
-    va_list ap;
-    va_list ap2;
-    va_start(ap, fmt);
-    va_copy(ap2, ap);
-    const int size = vsnprintf(NULL, 0, fmt, ap);
-    GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
-    std::string buf;
-    buf.resize(size);
-    const int size2 = vsnprintf(const_cast<char *>(buf.data()), buf.size() + 1, fmt, ap2);
-    GGML_ASSERT(size2 == size);
-    va_end(ap2);
-    va_end(ap);
-
-    return buf;
-}
-
 GGML_ATTRIBUTE_FORMAT(1, 2)
 static int printe(const char * fmt, ...) {
    va_list args;
@@ -525,11 +507,11 @@ class HttpClient {
        int secs = static_cast<int>(seconds) % 60;

        if (hrs > 0) {
-            return fmt("%dh %02dm %02ds", hrs, mins, secs);
+            return string_format("%dh %02dm %02ds", hrs, mins, secs);
        } else if (mins > 0) {
-            return fmt("%dm %02ds", mins, secs);
+            return string_format("%dm %02ds", mins, secs);
        } else {
-            return fmt("%ds", secs);
+            return string_format("%ds", secs);
        }
    }

@@ -544,7 +526,7 @@ class HttpClient {
            }
        }

-        return fmt("%.2f %s", dbl_size, suffix[i]);
+        return string_format("%.2f %s", dbl_size, suffix[i]);
    }

    static int update_progress(void * ptr, curl_off_t total_to_download, curl_off_t now_downloaded, curl_off_t,
@@ -578,7 +560,9 @@ class HttpClient {
        return (now_downloaded_plus_file_size * 100) / total_to_download;
    }

-    static std::string generate_progress_prefix(curl_off_t percentage) { return fmt("%3ld%% |", static_cast<long int>(percentage)); }
+    static std::string generate_progress_prefix(curl_off_t percentage) {
+        return string_format("%3ld%% |", static_cast<long int>(percentage));
+    }

    static double calculate_speed(curl_off_t now_downloaded, const std::chrono::steady_clock::time_point & start_time) {
        const auto                          now             = std::chrono::steady_clock::now();
@@ -589,9 +573,9 @@ class HttpClient {
    static std::string generate_progress_suffix(curl_off_t now_downloaded_plus_file_size, curl_off_t total_to_download,
                                                double speed, double estimated_time) {
        const int width = 10;
-        return fmt("%*s/%*s%*s/s%*s", width, human_readable_size(now_downloaded_plus_file_size).c_str(), width,
-                   human_readable_size(total_to_download).c_str(), width, human_readable_size(speed).c_str(), width,
-                   human_readable_time(estimated_time).c_str());
+        return string_format("%*s/%*s%*s/s%*s", width, human_readable_size(now_downloaded_plus_file_size).c_str(),
+                             width, human_readable_size(total_to_download).c_str(), width,
+                             human_readable_size(speed).c_str(), width, human_readable_time(estimated_time).c_str());
    }

    static int calculate_progress_bar_width(const std::string & progress_prefix, const std::string & progress_suffix) {
--- a/examples/server/httplib.h
+++ b/examples/server/httplib.h
--- a/examples/server/public/index.html.gz
+++ b/examples/server/public/index.html.gz
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -133,7 +133,8 @@ struct slot_params {

        auto grammar_triggers = json::array();
        for (const auto & trigger : sampling.grammar_triggers) {
-            grammar_triggers.push_back(trigger.to_json<json>());
+            server_grammar_trigger ct(std::move(trigger));
+            grammar_triggers.push_back(ct.to_json());
        }

        return json {
@@ -372,9 +373,9 @@ struct server_task {
            const auto grammar_triggers = data.find("grammar_triggers");
            if (grammar_triggers != data.end()) {
                for (const auto & t : *grammar_triggers) {
-                    auto ct = common_grammar_trigger::from_json(t);
-                    if (ct.type == COMMON_GRAMMAR_TRIGGER_TYPE_WORD) {
-                        const auto & word = ct.value;
+                    server_grammar_trigger ct(t);
+                    if (ct.value.type == COMMON_GRAMMAR_TRIGGER_TYPE_WORD) {
+                        const auto & word = ct.value.value;
                        auto ids = common_tokenize(vocab, word, /* add_special= */ false, /* parse_special= */ true);
                        if (ids.size() == 1) {
                            auto token = ids[0];
@@ -392,7 +393,7 @@ struct server_task {
                            params.sampling.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, word});
                        }
                    } else {
-                        params.sampling.grammar_triggers.push_back(ct);
+                        params.sampling.grammar_triggers.push_back(std::move(ct.value));
                    }
                }
            }
@@ -489,8 +490,12 @@ struct result_timings {
    double predicted_per_token_ms;
    double predicted_per_second;

+    // Optional speculative metrics - only included when > 0
+    int32_t draft_n = 0;
+    int32_t draft_n_accepted = 0;
+
    json to_json() const {
-        return {
+        json base = {
            {"prompt_n",               prompt_n},
            {"prompt_ms",              prompt_ms},
            {"prompt_per_token_ms",    prompt_per_token_ms},
@@ -501,6 +506,13 @@ struct result_timings {
            {"predicted_per_token_ms", predicted_per_token_ms},
            {"predicted_per_second",   predicted_per_second},
        };
+
+        if (draft_n > 0) {
+            base["draft_n"] = draft_n;
+            base["draft_n_accepted"] = draft_n_accepted;
+        }
+
+        return base;
    }
 };

@@ -830,6 +842,11 @@ struct server_task_result_cmpl_final : server_task_result {
            ret.push_back({"timings", timings.to_json()});
        }

+        // extra fields for debugging purposes
+        if (verbose) {
+            ret["__verbose"] = to_json_non_oaicompat();
+        }
+
        return ret;
    }
 };
@@ -1294,6 +1311,10 @@ struct server_slot {

    std::function<void(int)> callback_on_release;

+    // Speculative decoding stats
+    int32_t n_draft_total = 0;      // Total draft tokens generated
+    int32_t n_draft_accepted = 0;   // Draft tokens actually accepted
+
    void reset() {
        SLT_DBG(*this, "%s", "\n");

@@ -1310,6 +1331,10 @@ struct server_slot {

        generated_tokens.clear();
        generated_token_probs.clear();
+
+        // clear speculative decoding stats
+        n_draft_total = 0;
+        n_draft_accepted = 0;
    }

    bool is_non_causal() const {
@@ -1376,6 +1401,12 @@ struct server_slot {
        timings.predicted_per_token_ms = t_token_generation / n_decoded;
        timings.predicted_per_second = 1e3 / t_token_generation * n_decoded;

+        // Add speculative metrics
+        if (n_draft_total > 0) {
+            timings.draft_n = n_draft_total;
+            timings.draft_n_accepted = n_draft_accepted;
+        }
+
        return timings;
    }

@@ -1423,6 +1454,15 @@ struct server_slot {
                t_prompt_processing, n_prompt_tokens_processed, t_prompt, n_prompt_second,
                t_token_generation, n_decoded, t_gen, n_gen_second,
                t_prompt_processing + t_token_generation, n_prompt_tokens_processed + n_decoded);
+
+        if (n_draft_total > 0) {
+            const float draft_ratio = (float) n_draft_accepted / n_draft_total;
+            SLT_INF(*this,
+                    "\n"
+                    "draft acceptance rate = %0.5f (%5d accepted / %5d generated)\n",
+                    draft_ratio, n_draft_accepted, n_draft_total
+            );
+        }
    }

    json to_json() const {
@@ -1837,7 +1877,7 @@ struct server_context {
    }

    bool load_model(const common_params & params) {
-        SRV_INF("loading model '%s'\n", params.model.c_str());
+        SRV_INF("loading model '%s'\n", params.model.path.c_str());

        params_base = params;

@@ -1847,7 +1887,7 @@ struct server_context {
        ctx   = llama_init.context.get();

        if (model == nullptr) {
-            SRV_ERR("failed to load model, '%s'\n", params_base.model.c_str());
+            SRV_ERR("failed to load model, '%s'\n", params_base.model.path.c_str());
            return false;
        }

@@ -1858,16 +1898,13 @@ struct server_context {
        add_bos_token = llama_vocab_get_add_bos(vocab);
        has_eos_token = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;

-        if (!params_base.speculative.model.empty() || !params_base.speculative.hf_repo.empty()) {
-            SRV_INF("loading draft model '%s'\n", params_base.speculative.model.c_str());
+        if (!params_base.speculative.model.path.empty() || !params_base.speculative.model.hf_repo.empty()) {
+            SRV_INF("loading draft model '%s'\n", params_base.speculative.model.path.c_str());

            auto params_dft = params_base;

            params_dft.devices      = params_base.speculative.devices;
-            params_dft.hf_file      = params_base.speculative.hf_file;
-            params_dft.hf_repo      = params_base.speculative.hf_repo;
            params_dft.model        = params_base.speculative.model;
-            params_dft.model_url    = params_base.speculative.model_url;
            params_dft.n_ctx        = params_base.speculative.n_ctx == 0 ? params_base.n_ctx / params_base.n_parallel : params_base.speculative.n_ctx;
            params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers;
            params_dft.n_parallel   = 1;
@@ -1881,12 +1918,12 @@ struct server_context {
            model_dft = llama_init_dft.model.get();

            if (model_dft == nullptr) {
-                SRV_ERR("failed to load draft model, '%s'\n", params_base.speculative.model.c_str());
+                SRV_ERR("failed to load draft model, '%s'\n", params_base.speculative.model.path.c_str());
                return false;
            }

            if (!common_speculative_are_compatible(ctx, llama_init_dft.context.get())) {
-                SRV_ERR("the draft model '%s' is not compatible with the target model '%s'\n", params_base.speculative.model.c_str(), params_base.model.c_str());
+                SRV_ERR("the draft model '%s' is not compatible with the target model '%s'\n", params_base.speculative.model.path.c_str(), params_base.model.path.c_str());

                return false;
            }
@@ -3285,6 +3322,9 @@ struct server_context {

                llama_tokens draft = common_speculative_gen_draft(slot.spec, params_spec, slot.cache_tokens, id);

+                // keep track of total number of tokens generated in the draft
+                slot.n_draft_total += draft.size();
+
                // ignore small drafts
                if (slot.params.speculative.n_min > (int) draft.size()) {
                    SLT_DBG(slot, "ignoring small draft: %d < %d\n", (int) draft.size(), slot.params.speculative.n_min);
@@ -3310,6 +3350,9 @@ struct server_context {
                slot.n_past    += ids.size();
                slot.n_decoded += ids.size();

+                // update how many tokens out of draft was accepted
+                slot.n_draft_accepted += ids.size() - 1;
+
                slot.cache_tokens.push_back(id);
                slot.cache_tokens.insert(slot.cache_tokens.end(), ids.begin(), ids.end() - 1);

@@ -3820,7 +3863,7 @@ int main(int argc, char ** argv) {
        json data = {
            { "default_generation_settings", ctx_server.default_generation_settings_for_props },
            { "total_slots",                 ctx_server.params_base.n_parallel },
-            { "model_path",                  ctx_server.params_base.model },
+            { "model_path",                  ctx_server.params_base.model.path },
            { "chat_template",               common_chat_templates_source(ctx_server.chat_templates.get()) },
            { "bos_token",                   common_token_to_piece(ctx_server.ctx, llama_vocab_bos(ctx_server.vocab), /* special= */ true)},
            { "eos_token",                   common_token_to_piece(ctx_server.ctx, llama_vocab_eos(ctx_server.vocab), /* special= */ true)},
@@ -4086,7 +4129,7 @@ int main(int argc, char ** argv) {
            {"object", "list"},
            {"data", {
                {
-                    {"id",       params.model_alias.empty() ? params.model : params.model_alias},
+                    {"id",       params.model_alias.empty() ? params.model.path : params.model_alias},
                    {"object",   "model"},
                    {"created",  std::time(0)},
                    {"owned_by", "llamacpp"},
@@ -4454,15 +4497,24 @@ int main(int argc, char ** argv) {
        llama_backend_free();
    };

-    // bind HTTP listen port
    bool was_bound = false;
-    if (params.port == 0) {
-        int bound_port = svr->bind_to_any_port(params.hostname);
-        if ((was_bound = (bound_port >= 0))) {
-            params.port = bound_port;
-        }
+    if (string_ends_with(std::string(params.hostname), ".sock")) {
+        LOG_INF("%s: setting address family to AF_UNIX\n", __func__);
+        svr->set_address_family(AF_UNIX);
+        // bind_to_port requires a second arg, any value other than 0 should
+        // simply get ignored
+        was_bound = svr->bind_to_port(params.hostname, 8080);
    } else {
-        was_bound = svr->bind_to_port(params.hostname, params.port);
+        LOG_INF("%s: binding port with default address family\n", __func__);
+        // bind HTTP listen port
+        if (params.port == 0) {
+            int bound_port = svr->bind_to_any_port(params.hostname);
+            if ((was_bound = (bound_port >= 0))) {
+                params.port = bound_port;
+            }
+        } else {
+            was_bound = svr->bind_to_port(params.hostname, params.port);
+        }
    }

    if (!was_bound) {
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -58,6 +58,32 @@ static T json_value(const json & body, const std::string & key, const T & defaul

 const static std::string build_info("b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT);

+// thin wrapper around common_grammar_trigger with (de)serialization functions
+struct server_grammar_trigger {
+    common_grammar_trigger value;
+
+    server_grammar_trigger() = default;
+    server_grammar_trigger(const common_grammar_trigger & value) : value(value) {}
+    server_grammar_trigger(const json & in) {
+        value.type = (common_grammar_trigger_type) in.at("type").get<int>();
+        value.value = in.at("value").get<std::string>();
+        if (value.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
+            value.token = (llama_token) in.at("token").get<int>();
+        }
+    }
+
+    json to_json() const {
+        json out {
+            {"type", (int) value.type},
+            {"value", value.value},
+        };
+        if (value.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
+            out["token"] = (int) value.token;
+        }
+        return out;
+    }
+};
+
 //
 // tokenizer and input processing utils
 //
@@ -627,7 +653,8 @@ static json oaicompat_completion_params_parse(
    llama_params["grammar_lazy"]     = chat_params.grammar_lazy;
    auto grammar_triggers = json::array();
    for (const auto & trigger : chat_params.grammar_triggers) {
-        grammar_triggers.push_back(trigger.to_json<json>());
+        server_grammar_trigger ct(trigger);
+        grammar_triggers.push_back(ct.to_json());
    }
    llama_params["grammar_triggers"] = grammar_triggers;
    llama_params["preserved_tokens"] = chat_params.preserved_tokens;
--- a/examples/server/webui/src/components/ChatScreen.tsx
+++ b/examples/server/webui/src/components/ChatScreen.tsx
@@ -99,13 +99,9 @@ export default function ChatScreen() {
    canvasData,
    replaceMessageAndGenerate,
  } = useAppContext();
-  const [inputMsg, setInputMsg] = useState(prefilledMsg.content());
-  const inputRef = useRef<HTMLTextAreaElement>(null);
+  const textarea = useOptimizedTextarea(prefilledMsg.content());

-  const { extraContext, clearExtraContext } = useVSCodeContext(
-    inputRef,
-    setInputMsg
-  );
+  const { extraContext, clearExtraContext } = useVSCodeContext(textarea);
  // TODO: improve this when we have "upload file" feature
  const currExtra: Message['extra'] = extraContext ? [extraContext] : undefined;

@@ -135,9 +131,10 @@ export default function ChatScreen() {
  };

  const sendNewMessage = async () => {
-    if (inputMsg.trim().length === 0 || isGenerating(currConvId ?? '')) return;
-    const lastInpMsg = inputMsg;
-    setInputMsg('');
+    const lastInpMsg = textarea.value();
+    if (lastInpMsg.trim().length === 0 || isGenerating(currConvId ?? ''))
+      return;
+    textarea.setValue('');
    scrollToBottom(false);
    setCurrNodeId(-1);
    // get the last message node
@@ -146,13 +143,13 @@ export default function ChatScreen() {
      !(await sendMessage(
        currConvId,
        lastMsgNodeId,
-        inputMsg,
+        lastInpMsg,
        currExtra,
        onChunk
      ))
    ) {
      // restore the input message if failed
-      setInputMsg(lastInpMsg);
+      textarea.setValue(lastInpMsg);
    }
    // OK
    clearExtraContext();
@@ -195,16 +192,13 @@ export default function ChatScreen() {
      // send the prefilled message if needed
      sendNewMessage();
    } else {
-      // otherwise, focus on the input and move the cursor to the end
-      if (inputRef.current) {
-        inputRef.current.focus();
-        inputRef.current.selectionStart = inputRef.current.value.length;
-      }
+      // otherwise, focus on the input
+      textarea.focus();
    }
    prefilledMsg.clear();
    // no need to keep track of sendNewMessage
    // eslint-disable-next-line react-hooks/exhaustive-deps
-  }, [inputRef]);
+  }, [textarea.ref]);

  // due to some timing issues of StorageUtils.appendMsg(), we need to make sure the pendingMsg is not duplicated upon rendering (i.e. appears once in the saved conversation and once in the pendingMsg)
  const pendingMsgDisplay: MessageDisplay[] =
@@ -258,9 +252,7 @@ export default function ChatScreen() {
          <textarea
            className="textarea textarea-bordered w-full"
            placeholder="Type a message (Shift+Enter to add a new line)"
-            ref={inputRef}
-            value={inputMsg}
-            onChange={(e) => setInputMsg(e.target.value)}
+            ref={textarea.ref}
            onKeyDown={(e) => {
              if (e.nativeEvent.isComposing || e.keyCode === 229) return;
              if (e.key === 'Enter' && e.shiftKey) return;
@@ -280,11 +272,7 @@ export default function ChatScreen() {
              Stop
            </button>
          ) : (
-            <button
-              className="btn btn-primary ml-2"
-              onClick={sendNewMessage}
-              disabled={inputMsg.trim().length === 0}
-            >
+            <button className="btn btn-primary ml-2" onClick={sendNewMessage}>
              Send
            </button>
          )}
@@ -298,3 +286,43 @@ export default function ChatScreen() {
    </div>
  );
 }
+
+export interface OptimizedTextareaValue {
+  value: () => string;
+  setValue: (value: string) => void;
+  focus: () => void;
+  ref: React.RefObject<HTMLTextAreaElement>;
+}
+
+// This is a workaround to prevent the textarea from re-rendering when the inner content changes
+// See https://github.com/ggml-org/llama.cpp/pull/12299
+function useOptimizedTextarea(initValue: string): OptimizedTextareaValue {
+  const [savedInitValue, setSavedInitValue] = useState<string>(initValue);
+  const textareaRef = useRef<HTMLTextAreaElement>(null);
+
+  useEffect(() => {
+    if (textareaRef.current && savedInitValue) {
+      textareaRef.current.value = savedInitValue;
+      setSavedInitValue('');
+    }
+  }, [textareaRef, savedInitValue, setSavedInitValue]);
+
+  return {
+    value: () => {
+      return textareaRef.current?.value ?? savedInitValue;
+    },
+    setValue: (value: string) => {
+      if (textareaRef.current) {
+        textareaRef.current.value = value;
+      }
+    },
+    focus: () => {
+      if (textareaRef.current) {
+        // focus and move the cursor to the end
+        textareaRef.current.focus();
+        textareaRef.current.selectionStart = textareaRef.current.value.length;
+      }
+    },
+    ref: textareaRef,
+  };
+}
--- a/examples/server/webui/src/utils/llama-vscode.ts
+++ b/examples/server/webui/src/utils/llama-vscode.ts
@@ -1,5 +1,6 @@
 import { useEffect, useState } from 'react';
 import { MessageExtraContext } from './types';
+import { OptimizedTextareaValue } from '../components/ChatScreen';

 // Extra context when using llama.cpp WebUI from llama-vscode, inside an iframe
 // Ref: https://github.com/ggml-org/llama.cpp/pull/11940
@@ -14,10 +15,7 @@ interface SetTextEvData {
 * window.postMessage({ command: 'setText', text: 'Spot the syntax error', context: 'def test()\n  return 123' }, '*');
 */

-export const useVSCodeContext = (
-  inputRef: React.RefObject<HTMLTextAreaElement>,
-  setInputMsg: (text: string) => void
-) => {
+export const useVSCodeContext = (textarea: OptimizedTextareaValue) => {
  const [extraContext, setExtraContext] = useState<MessageExtraContext | null>(
    null
  );
@@ -27,20 +25,20 @@ export const useVSCodeContext = (
    const handleMessage = (event: MessageEvent) => {
      if (event.data?.command === 'setText') {
        const data: SetTextEvData = event.data;
-        setInputMsg(data?.text);
+        textarea.setValue(data?.text);
        if (data?.context && data.context.length > 0) {
          setExtraContext({
            type: 'context',
            content: data.context,
          });
        }
-        inputRef.current?.focus();
+        textarea.focus();
      }
    };

    window.addEventListener('message', handleMessage);
    return () => window.removeEventListener('message', handleMessage);
-  }, [inputRef, setInputMsg]);
+  }, [textarea]);

  // Add a keydown listener that sends the "escapePressed" message to the parent window
  useEffect(() => {
--- a/examples/speculative-simple/speculative-simple.cpp
+++ b/examples/speculative-simple/speculative-simple.cpp
@@ -24,7 +24,7 @@ int main(int argc, char ** argv) {

    common_init();

-    if (params.speculative.model.empty()) {
+    if (params.speculative.model.path.empty()) {
        LOG_ERR("%s: --model-draft is required\n", __func__);
        return 1;
    }
--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@@ -46,7 +46,7 @@ int main(int argc, char ** argv) {

    common_init();

-    if (params.speculative.model.empty()) {
+    if (params.speculative.model.path.empty()) {
        LOG_ERR("%s: --model-draft is required\n", __func__);
        return 1;
    }
@@ -331,11 +331,11 @@ int main(int argc, char ** argv) {
                        }

                        active_seqs.erase(s);
-                        for(int i = 0; i < n_seq_dft; i++) {
+                        for (int i = 0; i < n_seq_dft; i++) {
                            if (i == s) {
                                continue;
                            }
-                            if (drafts[i].tokens[i_dft] == drafts[s].tokens[i_dft]) {
+                            if (drafts[i].active && drafts[i].tokens[i_dft] == drafts[s].tokens[i_dft]) {
                                // synchronize active status for sequences with the same drafted token
                                drafts[i].active = drafts[i].active && accept;
                                if (!drafts[i].active) {
--- a/examples/tts/tts.cpp
+++ b/examples/tts/tts.cpp
@@ -571,14 +571,13 @@ int main(int argc, char ** argv) {
    model_ttc = llama_init_ttc.model.get();
    ctx_ttc   = llama_init_ttc.context.get();

+    if (model_ttc == nullptr || ctx_ttc == nullptr) {
+        return ENOENT;
+    }
+
    const llama_vocab * vocab = llama_model_get_vocab(model_ttc);

-    // TODO: refactor in a common struct
-    params.model     = params.vocoder.model;
-    params.model_url = params.vocoder.model_url;
-    params.hf_repo   = params.vocoder.hf_repo;
-    params.hf_file   = params.vocoder.hf_file;
-
+    params.model = params.vocoder.model;
    params.embedding = true;

    common_init_result llama_init_cts = common_init_from_params(params);
@@ -586,6 +585,10 @@ int main(int argc, char ** argv) {
    model_cts = llama_init_cts.model.get();
    ctx_cts   = llama_init_cts.context.get();

+    if (model_cts == nullptr || ctx_cts == nullptr) {
+        return ENOENT;
+    }
+
    std::vector<common_sampler *> smpl(n_parallel);
    for (int i = 0; i < n_parallel; ++i) {
        params.sampling.no_perf = (i != 0);
@@ -691,11 +694,13 @@ lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|14
            const std::string voice_data = audio_data;

            auto tmp = common_tokenize(vocab, voice_data, false, true);
-            printf("\n\n");
+
+            std::ostringstream tokens_oss;
            for (size_t i = 0; i < tmp.size(); ++i) {
-                printf("%d, ", tmp[i]);
+                tokens_oss << tmp[i] << ", ";
            }
-            printf("\n\n");
+            LOG_INF("\n\n%s: llama tokens: %s\n\n", __func__, tokens_oss.str().c_str());
+
            prompt_add(prompt_inp, tmp);
 #else
            prompt_add(prompt_inp, llama_tokens {
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@@ -100,6 +100,10 @@ else()
    set(INS_ENB ON)
 endif()

+message(DEBUG "GGML_NATIVE         : ${GGML_NATIVE}")
+message(DEBUG "GGML_NATIVE_DEFAULT : ${GGML_NATIVE_DEFAULT}")
+message(DEBUG "INS_ENB             : ${INS_ENB}")
+
 option(GGML_CPU_HBM          "ggml: use memkind for CPU HBM" OFF)
 option(GGML_CPU_AARCH64      "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)
 option(GGML_CPU_KLEIDIAI     "ggml: use KleidiAI optimized kernels if applicable" OFF)
@@ -123,10 +127,12 @@ endif()
 option(GGML_LASX             "ggml: enable lasx"             ON)
 option(GGML_LSX              "ggml: enable lsx"              ON)
 option(GGML_RVV              "ggml: enable rvv"              ON)
+option(GGML_RV_ZFH           "ggml: enable riscv zfh"        OFF)
 option(GGML_VXE              "ggml: enable vxe"              ON)

 option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
-set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")
+set(GGML_CPU_ARM_ARCH        "" CACHE STRING "ggml: CPU architecture for ARM")
+set(GGML_CPU_POWERPC_CPUTYPE "" CACHE STRING "ggml: CPU type for PowerPC")


 if (WIN32)
--- a/ggml/cmake/GitVars.cmake
+++ b/ggml/cmake/GitVars.cmake
@@ -0,0 +1,22 @@
+find_package(Git)
+
+# the commit's SHA1
+execute_process(COMMAND
+    "${GIT_EXECUTABLE}" describe --match=NeVeRmAtCh --always --abbrev=8
+    WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
+    OUTPUT_VARIABLE GIT_SHA1
+    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+
+# the date of the commit
+execute_process(COMMAND
+    "${GIT_EXECUTABLE}" log -1 --format=%ad --date=local
+    WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
+    OUTPUT_VARIABLE GIT_DATE
+    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+
+# the subject of the commit
+execute_process(COMMAND
+    "${GIT_EXECUTABLE}" log -1 --format=%s
+    WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
+    OUTPUT_VARIABLE GIT_COMMIT_SUBJECT
+    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
--- a/ggml/cmake/ggml-config.cmake.in
+++ b/ggml/cmake/ggml-config.cmake.in
@@ -5,7 +5,7 @@

 set_and_check(GGML_INCLUDE_DIR "@PACKAGE_GGML_INCLUDE_INSTALL_DIR@")
 set_and_check(GGML_LIB_DIR "@PACKAGE_GGML_LIB_INSTALL_DIR@")
-set_and_check(GGML_BIN_DIR "@PACKAGE_GGML_BIN_INSTALL_DIR@")
+#set_and_check(GGML_BIN_DIR "@PACKAGE_GGML_BIN_INSTALL_DIR@")

 find_package(Threads REQUIRED)

--- a/ggml/include/ggml-rpc.h
+++ b/ggml/include/ggml-rpc.h
@@ -17,7 +17,9 @@ GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const c

 GGML_BACKEND_API void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total);

-GGML_BACKEND_API void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem);
+GGML_BACKEND_API void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint,
+                                                    const char * cache_dir,
+                                                    size_t free_mem, size_t total_mem);

 GGML_BACKEND_API ggml_backend_reg_t ggml_backend_rpc_reg(void);

--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -1791,11 +1791,11 @@ extern "C" {

 #define GGML_KQ_MASK_PAD 64

-    // q:    [n_embd, n_batch,     n_head,    1]
-    // k:    [n_embd, n_kv,        n_head_kv, 1]
-    // v:    [n_embd, n_kv,        n_head_kv, 1] !! not transposed !!
-    // mask: [n_kv,   n_batch_pad, 1,         1] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
-    // res:  [n_embd, n_head,      n_batch,   1] !! permuted !!
+    // q:    [n_embd_k, n_batch,     n_head,    1]
+    // k:    [n_embd_k, n_kv,        n_head_kv, 1]
+    // v:    [n_embd_v, n_kv,        n_head_kv, 1] !! not transposed !!
+    // mask: [n_kv,     n_batch_pad, 1,         1] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
+    // res:  [n_embd_v, n_head,      n_batch,   1] !! permuted !!
    GGML_API struct ggml_tensor * ggml_flash_attn_ext(
            struct ggml_context * ctx,
            struct ggml_tensor  * q,
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -65,7 +65,7 @@ if (GGML_LTO)
    endif()
 endif()

-if (GGML_CCACHE)
+if (GGML_CCACHE AND NOT CMAKE_C_COMPILER_LAUNCHER AND NOT CMAKE_CXX_COMPILER_LAUNCHER)
    find_program(GGML_CCACHE_FOUND ccache)
    find_program(GGML_SCCACHE_FOUND sccache)

@@ -76,7 +76,11 @@ if (GGML_CCACHE)
            set(GGML_CCACHE_VARIANT sccache)
        endif()
        # TODO: should not be set globally
-        set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${GGML_CCACHE_VARIANT}")
+        if (GGML_SYCL AND GGML_CCACHE_FOUND AND WIN32)
+            set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "ccache compiler_type=icl")
+        else ()
+            set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${GGML_CCACHE_VARIANT}")
+        endif ()
        set(ENV{CCACHE_SLOPPINESS} time_macros)
        message(STATUS "${GGML_CCACHE_VARIANT} found, compilation results will be cached. Disable with GGML_CCACHE=OFF.")
    else()
@@ -325,6 +329,10 @@ if (CMAKE_SYSTEM_NAME MATCHES "Android")
    target_link_libraries(ggml-base PRIVATE dl)
 endif()

+if(CMAKE_SYSTEM_NAME MATCHES "visionOS")
+    target_compile_definitions(ggml-base PUBLIC _DARWIN_C_SOURCE)
+endif()
+
 if (BUILD_SHARED_LIBS)
    foreach (target ggml-base ggml)
        set_target_properties(${target} PROPERTIES POSITION_INDEPENDENT_CODE ON)
--- a/ggml/src/ggml-cann/.clang-format
+++ b/ggml/src/ggml-cann/.clang-format
@@ -1,168 +0,0 @@
---
-Language:        Cpp
-# BasedOnStyle:  Google
-AccessModifierOffset: -1
-AlignAfterOpenBracket: Align
-AlignConsecutiveMacros: false
-AlignConsecutiveAssignments: false
-AlignConsecutiveDeclarations: false
-AlignEscapedNewlines: Left
-AlignOperands:   true
-AlignTrailingComments: true
-AllowAllArgumentsOnNextLine: true
-AllowAllConstructorInitializersOnNextLine: true
-AllowAllParametersOfDeclarationOnNextLine: true
-AllowShortBlocksOnASingleLine: Never
-AllowShortCaseLabelsOnASingleLine: false
-AllowShortFunctionsOnASingleLine: All
-AllowShortLambdasOnASingleLine: All
-AllowShortIfStatementsOnASingleLine: WithoutElse
-AllowShortLoopsOnASingleLine: true
-AlwaysBreakAfterDefinitionReturnType: None
-AlwaysBreakAfterReturnType: None
-AlwaysBreakBeforeMultilineStrings: true
-AlwaysBreakTemplateDeclarations: Yes
-BinPackArguments: true
-BinPackParameters: true
-BraceWrapping:
-  AfterCaseLabel:  false
-  AfterClass:      false
-  AfterControlStatement: false
-  AfterEnum:       false
-  AfterFunction:   false
-  AfterNamespace:  false
-  AfterObjCDeclaration: false
-  AfterStruct:     false
-  AfterUnion:      false
-  AfterExternBlock: false
-  BeforeCatch:     false
-  BeforeElse:      false
-  IndentBraces:    false
-  SplitEmptyFunction: true
-  SplitEmptyRecord: true
-  SplitEmptyNamespace: true
-BreakBeforeBinaryOperators: None
-BreakBeforeBraces: Attach
-BreakBeforeInheritanceComma: false
-BreakInheritanceList: BeforeColon
-BreakBeforeTernaryOperators: true
-BreakConstructorInitializersBeforeComma: false
-BreakConstructorInitializers: BeforeColon
-BreakAfterJavaFieldAnnotations: false
-BreakStringLiterals: true
-ColumnLimit:     80
-CommentPragmas:  '^ IWYU pragma:'
-CompactNamespaces: false
-ConstructorInitializerAllOnOneLineOrOnePerLine: true
-ConstructorInitializerIndentWidth: 4
-ContinuationIndentWidth: 4
-Cpp11BracedListStyle: true
-DeriveLineEnding: true
-DerivePointerAlignment: true
-DisableFormat:   false
-ExperimentalAutoDetectBinPacking: false
-FixNamespaceComments: true
-ForEachMacros:
-  - foreach
-  - Q_FOREACH
-  - BOOST_FOREACH
-IncludeBlocks:   Regroup
-IncludeCategories:
-  - Regex:           '^<ext/.*\.h>'
-    Priority:        2
-    SortPriority:    0
-  - Regex:           '^<.*\.h>'
-    Priority:        1
-    SortPriority:    0
-  - Regex:           '^<.*'
-    Priority:        2
-    SortPriority:    0
-  - Regex:           '.*'
-    Priority:        3
-    SortPriority:    0
-IncludeIsMainRegex: '([-_](test|unittest))?$'
-IncludeIsMainSourceRegex: ''
-IndentCaseLabels: true
-IndentGotoLabels: true
-IndentPPDirectives: None
-IndentWidth:     4
-IndentWrappedFunctionNames: false
-JavaScriptQuotes: Leave
-JavaScriptWrapImports: true
-KeepEmptyLinesAtTheStartOfBlocks: false
-MacroBlockBegin: ''
-MacroBlockEnd:   ''
-MaxEmptyLinesToKeep: 1
-NamespaceIndentation: None
-ObjCBinPackProtocolList: Never
-ObjCBlockIndentWidth: 2
-ObjCSpaceAfterProperty: false
-ObjCSpaceBeforeProtocolList: true
-PenaltyBreakAssignment: 2
-PenaltyBreakBeforeFirstCallParameter: 1
-PenaltyBreakComment: 300
-PenaltyBreakFirstLessLess: 120
-PenaltyBreakString: 1000
-PenaltyBreakTemplateDeclaration: 10
-PenaltyExcessCharacter: 1000000
-PenaltyReturnTypeOnItsOwnLine: 200
-PointerAlignment: Left
-RawStringFormats:
-  - Language:        Cpp
-    Delimiters:
-      - cc
-      - CC
-      - cpp
-      - Cpp
-      - CPP
-      - 'c++'
-      - 'C++'
-    CanonicalDelimiter: ''
-    BasedOnStyle:    google
-  - Language:        TextProto
-    Delimiters:
-      - pb
-      - PB
-      - proto
-      - PROTO
-    EnclosingFunctions:
-      - EqualsProto
-      - EquivToProto
-      - PARSE_PARTIAL_TEXT_PROTO
-      - PARSE_TEST_PROTO
-      - PARSE_TEXT_PROTO
-      - ParseTextOrDie
-      - ParseTextProtoOrDie
-    CanonicalDelimiter: ''
-    BasedOnStyle:    google
-ReflowComments:  true
-SortIncludes:    true
-SortUsingDeclarations: true
-SpaceAfterCStyleCast: false
-SpaceAfterLogicalNot: false
-SpaceAfterTemplateKeyword: true
-SpaceBeforeAssignmentOperators: true
-SpaceBeforeCpp11BracedList: false
-SpaceBeforeCtorInitializerColon: true
-SpaceBeforeInheritanceColon: true
-SpaceBeforeParens: ControlStatements
-SpaceBeforeRangeBasedForLoopColon: true
-SpaceInEmptyBlock: false
-SpaceInEmptyParentheses: false
-SpacesBeforeTrailingComments: 2
-SpacesInAngles:  false
-SpacesInConditionalStatement: false
-SpacesInContainerLiterals: true
-SpacesInCStyleCastParentheses: false
-SpacesInParentheses: false
-SpacesInSquareBrackets: false
-SpaceBeforeSquareBrackets: false
-Standard:        Auto
-StatementMacros:
-  - Q_UNUSED
-  - QT_REQUIRE_VERSION
-TabWidth:        8
-UseCRLF:         false
-UseTab:          Never
-...
-
--- a/ggml/src/ggml-cann/CMakeLists.txt
+++ b/ggml/src/ggml-cann/CMakeLists.txt
@@ -51,13 +51,11 @@ if (CANN_INSTALL_DIR)
        ${CANN_INSTALL_DIR}/acllib/include
    )

-    add_subdirectory(kernels)
    list(APPEND CANN_LIBRARIES
        ascendcl
        nnopbase
        opapi
        acl_op_compiler
-        ascendc_kernels
    )

    file(GLOB GGML_SOURCES_CANN "*.cpp")
--- a/ggml/src/ggml-cann/acl_tensor.cpp
+++ b/ggml/src/ggml-cann/acl_tensor.cpp
@@ -54,9 +54,7 @@ aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne,
    // added.
    int64_t acl_ne[GGML_MAX_DIMS * 2], acl_stride[GGML_MAX_DIMS * 2];

-    int64_t acl_storage_len = 0;
    if (ne == nullptr) {
-        acl_storage_len = ggml_nbytes(tensor);
        for (int i = 0; i < GGML_MAX_DIMS; i++) {
            acl_ne[i] = tensor->ne[i];
            // The step size of acl is in elements.
@@ -65,14 +63,18 @@ aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne,
    } else {
        // With bcast
        for (int i = 0; i < dims; i++) {
-            acl_storage_len += (ne[i] - 1) * nb[i];
            acl_ne[i] = ne[i];
            acl_stride[i] = nb[i] / ggml_element_size(tensor);
        }
    }

-    // Reverse ne and stride.
    int64_t final_dims = (dims == 0 ? GGML_MAX_DIMS : dims);
+    int64_t acl_storage_len = 1;
+    for (int i = 0; i < final_dims; i++) {
+        acl_storage_len += (acl_ne[i] - 1) * acl_stride[i];
+    }
+
+    // Reverse ne and stride.
    std::reverse(acl_ne, acl_ne + final_dims);
    std::reverse(acl_stride, acl_stride + final_dims);

--- a/ggml/src/ggml-cann/acl_tensor.h
+++ b/ggml/src/ggml-cann/acl_tensor.h
@@ -101,14 +101,14 @@ aclTensor* ggml_cann_create_tensor(void* data_ptr, aclDataType dtype,
        tmp_stride[i] = nb[i] / type_size;
    }

+    int64_t acl_storage_len = 1;
+    for (int i = 0; i < dims; i++) {
+        acl_storage_len += (tmp_ne[i] - 1) * tmp_stride[i];
+    }
+
    std::reverse(tmp_ne, tmp_ne + dims);
    std::reverse(tmp_stride, tmp_stride + dims);

-    int64_t acl_storage_len = 0;
-    for (int i = 0; i < dims; i++) {
-        acl_storage_len += (ne[i] - 1) * nb[i];
-    }
-
    aclTensor* acl_tensor =
        aclCreateTensor(tmp_ne, dims, dtype, tmp_stride, offset / type_size,
                        format, &acl_storage_len, 1, data_ptr);
--- a/ggml/src/ggml-cann/aclnn_ops.cpp
+++ b/ggml/src/ggml-cann/aclnn_ops.cpp
@@ -30,6 +30,7 @@
 #include <aclnnop/aclnn_copy.h>
 #include <aclnnop/aclnn_cos.h>
 #include <aclnnop/aclnn_div.h>
+#include <aclnnop/aclnn_embedding.h>
 #include <aclnnop/aclnn_exp.h>
 #include <aclnnop/aclnn_fill_scalar.h>
 #include <aclnnop/aclnn_group_norm.h>
@@ -50,6 +51,7 @@
 #include <aclnnop/aclnn_triu.h>
 #include <aclnnop/aclnn_upsample_nearest_2d.h>
 #include <aclnnop/aclnn_weight_quant_batch_matmul_v2.h>
+#include <aclnnop/aclnn_argmax.h>
 #include <float.h>

 #include <cmath>
@@ -58,7 +60,6 @@
 #include <vector>

 #include "ggml-impl.h"
-#include "kernels/ascendc_kernels.h"

 #define GGML_COMMON_DECL_C

@@ -99,6 +100,35 @@ static void aclnn_repeat(ggml_backend_cann_context& ctx, aclTensor* acl_src,
    ACL_CHECK(aclDestroyIntArray(repeats));
 }

+/**
+ * @brief Casts the elements of a tensor to a specified data type using the CANN backend.
+ *
+ * @details This function performs a type conversion on the elements of the input tensor `acl_src`
+ *          and stores the results in the destination tensor `acl_dst`. The conversion type is
+ *          determined based on the `dst` tensor's data type.
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_src The source tensor whose elements will be cast.
+ * @param acl_dst The destination tensor that will store the casted elements.
+ * @param dst The ggml tensor specifying the target data type.
+ */
+static void aclnn_cast(ggml_backend_cann_context& ctx, aclTensor* acl_src,
+                    aclTensor* acl_dst, ggml_tensor* dst) {
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+    ACL_CHECK(aclnnCastGetWorkspaceSize(acl_src,
+                                        ggml_cann_type_mapping(dst->type),
+                                        acl_dst, &workspaceSize, &executor));
+
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+
+    ACL_CHECK(aclnnCast(workspaceAddr, workspaceSize, executor, ctx.stream()));
+}
+
 void ggml_cann_repeat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    ggml_tensor* src = dst->src[0];
    GGML_ASSERT(ggml_can_repeat(src, dst));
@@ -329,8 +359,6 @@ void ggml_cann_sqr(ggml_backend_cann_context& ctx, ggml_tensor* dst) {

 void ggml_cann_clamp(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    ggml_tensor* src = dst->src[0];
-    GGML_ASSERT(src->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);

    float min;
    float max;
@@ -889,173 +917,76 @@ static void cann_copy(ggml_backend_cann_context& ctx, aclTensor* acl_src,
 }

 void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
-    ggml_tensor* src = dst->src[0];
+    ggml_tensor* src0 = dst->src[0];

-    aclTensor* acl_src = ggml_cann_create_tensor(src);
+    aclTensor* acl_src = ggml_cann_create_tensor(src0);
    aclTensor* acl_dst = ggml_cann_create_tensor(dst);
-
-    ggml_cann_pool_alloc src_extra_allocator(ctx.pool(), sizeof(ggml_tensor));
-    ggml_cann_pool_alloc dst_extra_allocator(ctx.pool(), sizeof(ggml_tensor));
-    src->extra = src_extra_allocator.get();
-    dst->extra = dst_extra_allocator.get();
-    ACL_CHECK(aclrtMemcpyAsync(src->extra, sizeof(ggml_tensor), src,
-                               sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE,
-                               ctx.stream()));
-    ACL_CHECK(aclrtMemcpyAsync(dst->extra, sizeof(ggml_tensor), dst,
-                               sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE,
-                               ctx.stream()));
-
-    if ((dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32) &&
-        ggml_are_same_shape(src, dst)) {
-        cann_copy(ctx, acl_src, acl_dst);
-        ACL_CHECK(aclDestroyTensor(acl_src));
-        ACL_CHECK(aclDestroyTensor(acl_dst));
-        return;
-    }
-    // TODO: simplify
-    if (src->type == GGML_TYPE_F16) {
-        if (dst->type == GGML_TYPE_Q8_0) {
-            aclrtlaunch_ascendc_quantize_f16_q8_0(
-                24, ctx.stream(), src->data, dst->data,
-                ((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb,
-                ((ggml_tensor*)dst->extra)->ne);
-            return;
-        }
-        if (dst->type == GGML_TYPE_Q4_0) {
-            aclrtlaunch_ascendc_quantize_f16_to_q4_0(
-                24, ctx.stream(), src->data, dst->data,
-                ((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb,
-                ((ggml_tensor*)dst->extra)->ne);
-            return;
-        }
-        if (dst->type == GGML_TYPE_F16) {
-            if (ggml_are_same_shape(src, dst)) {
-                cann_copy(ctx, acl_src, acl_dst);
-                ACL_CHECK(aclDestroyTensor(acl_src));
-                ACL_CHECK(aclDestroyTensor(acl_dst));
-                return;
-            }
-            if (ggml_is_contiguous(dst)) {
-                const size_t src_type_size = ggml_type_size(src->type);
-                if (src->nb[0] == src_type_size) {
-                    // src0 is contigous on first dimension, copy by rows
-                    int64_t rows_num = ggml_nrows(src);
-
-                    aclrtlaunch_ascendc_dup_by_rows_fp16(
-                        rows_num, ctx.stream(), src->data, dst->data,
-                        ((ggml_tensor*)src->extra)->ne,
-                        ((ggml_tensor*)src->extra)->nb,
-                        ((ggml_tensor*)dst->extra)->ne,
-                        ((ggml_tensor*)dst->extra)->nb);
-                    return;
-                }
-                GGML_ABORT("fatal error");
-            }
-            GGML_ABORT("fatal error");
-        }
-        if (dst->type == GGML_TYPE_F32) {
-            if (ggml_are_same_shape(src, dst)) {
-                cann_copy(ctx, acl_src, acl_dst);
-                ACL_CHECK(aclDestroyTensor(acl_src));
-                ACL_CHECK(aclDestroyTensor(acl_dst));
-                return;
-            }
-            if (ggml_is_contiguous(dst)) {
-                const size_t src_type_size = ggml_type_size(src->type);
-                if (src->nb[0] == src_type_size) {
-                    // src0 is contigous on first dimension, copy by rows
-                    int64_t rows_num = ggml_nrows(src);
-                    aclrtlaunch_ascendc_dup_by_rows_fp16_to_fp32(
-                        rows_num, ctx.stream(), src->data, dst->data,
-                        ((ggml_tensor*)src->extra)->ne,
-                        ((ggml_tensor*)src->extra)->nb,
-                        ((ggml_tensor*)dst->extra)->ne,
-                        ((ggml_tensor*)dst->extra)->nb);
-                    return;
-                }
-                GGML_ABORT("fatal error");
-            }
-            GGML_ABORT("fatal error");
-        }
-        // TODO
-        GGML_ABORT("fatal error");
-    } else if (src->type == GGML_TYPE_F32) {
-        // TODO: if (src0->type == dst->type && ne00 == ne0 && nb00 == type_size
-        //          && nb0 == type_size)
-        if (dst->type == GGML_TYPE_Q8_0) {
-            aclrtlaunch_ascendc_quantize_f32_q8_0(
-                24, ctx.stream(), src->data, dst->data,
-                ((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb,
-                ((ggml_tensor*)dst->extra)->ne);
-            return;
-        }
-        if (dst->type == GGML_TYPE_Q4_0) {
-            aclrtlaunch_ascendc_quantize_f32_to_q4_0(
-                24, ctx.stream(), src->data, dst->data,
-                ((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb,
-                ((ggml_tensor*)dst->extra)->ne);
-            return;
-        }
-        if (dst->type == GGML_TYPE_F32) {
-            if (ggml_are_same_shape(src, dst)) {
-                cann_copy(ctx, acl_src, acl_dst);
-                ACL_CHECK(aclDestroyTensor(acl_src));
-                ACL_CHECK(aclDestroyTensor(acl_dst));
-                return;
-            }
-            if (ggml_is_contiguous(dst)) {
-                const size_t src_type_size = ggml_type_size(src->type);
-                if (src->nb[0] == src_type_size) {
-                    // src0 is contigous on first dimension, copy by rows
-                    int64_t rows_num = ggml_nrows(src);
-                    aclrtlaunch_ascendc_dup_by_rows_fp32(
-                        rows_num, ctx.stream(), src->data, dst->data,
-                        ((ggml_tensor*)src->extra)->ne,
-                        ((ggml_tensor*)src->extra)->nb,
-                        ((ggml_tensor*)dst->extra)->ne,
-                        ((ggml_tensor*)dst->extra)->nb);
-                    return;
-                }
-                GGML_ABORT("fatal error");
-            } else {
-                // TODO: dst not contiguous
-                GGML_ABORT("fatal error");
-            }
-        }
-        if (dst->type == GGML_TYPE_F16) {
-            if (ggml_are_same_shape(src, dst)) {
-                cann_copy(ctx, acl_src, acl_dst);
-                ACL_CHECK(aclDestroyTensor(acl_src));
-                ACL_CHECK(aclDestroyTensor(acl_dst));
-                return;
-            }
-            if (ggml_is_contiguous(dst)) {
-                const size_t src_type_size = ggml_type_size(src->type);
-                if (src->nb[0] == src_type_size) {
-                    // src0 is contigous on first dimension, copy by rows
-                    int64_t rows_num = ggml_nrows(src);
-                    aclrtlaunch_ascendc_dup_by_rows_fp32_to_fp16(
-                        rows_num, ctx.stream(), src->data, dst->data,
-                        ((ggml_tensor*)src->extra)->ne,
-                        ((ggml_tensor*)src->extra)->nb,
-                        ((ggml_tensor*)dst->extra)->ne,
-                        ((ggml_tensor*)dst->extra)->nb);
-                    return;
-                }
-                GGML_ABORT("fatal error");
-            }
-        }
-        // TODO
-        GGML_ABORT("fatal error");
-    } else {
-        if (ggml_are_same_shape(src, dst)) {
+    if (ggml_are_same_shape(src0, dst)) {
+        if (dst->type == src0->type) {
            cann_copy(ctx, acl_src, acl_dst);
-            ACL_CHECK(aclDestroyTensor(acl_src));
-            ACL_CHECK(aclDestroyTensor(acl_dst));
-            return;
+        } else {
+            aclnn_cast(ctx, acl_src, acl_dst, dst);
+        }
+    } else {
+        if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst)) {
+            if (dst->type == src0->type) {
+                size_t cpy_size = ggml_nbytes(dst);
+                ACL_CHECK(aclrtMemcpyAsync(
+                    dst->data, cpy_size, src0->data, cpy_size,
+                    ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream()));
+                return;
+            } else {
+                ggml_cann_pool_alloc src_buffer_allocator(
+                    ctx.pool(),
+                    ggml_nelements(dst) * ggml_type_size(dst->type));
+                void* src_trans_buffer = src_buffer_allocator.get();
+                size_t src_trans_nb[GGML_MAX_DIMS];
+                src_trans_nb[0] = ggml_type_size(dst->type);
+                for (int i = 1; i < GGML_MAX_DIMS; i++) {
+                    src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
+                }
+                aclTensor* src_trans_tensor = ggml_cann_create_tensor(
+                    src_trans_buffer, ggml_cann_type_mapping(dst->type),
+                    ggml_type_size(dst->type), src0->ne, src_trans_nb,
+                    GGML_MAX_DIMS);
+
+                aclnn_cast(ctx, acl_src, src_trans_tensor, dst);
+                size_t cpy_size = ggml_nbytes(dst);
+                ACL_CHECK(aclrtMemcpyAsync(
+                    dst->data, cpy_size, src_trans_buffer, cpy_size,
+                    ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream()));
+                ACL_CHECK(aclDestroyTensor(src_trans_tensor));
+                return;
+            }
+        } else if (ggml_is_contiguous(dst)) {
+            ggml_cann_pool_alloc src_buffer_allocator(
+                ctx.pool(), ggml_nelements(dst) * ggml_type_size(dst->type));
+            void* src_trans_buffer = src_buffer_allocator.get();
+            size_t src_trans_nb[GGML_MAX_DIMS];
+            src_trans_nb[0] = ggml_type_size(dst->type);
+            for (int i = 1; i < GGML_MAX_DIMS; i++) {
+                src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
+            }
+            aclTensor* src_trans_tensor = ggml_cann_create_tensor(
+                src_trans_buffer, ggml_cann_type_mapping(dst->type),
+                ggml_type_size(dst->type), src0->ne, src_trans_nb,
+                GGML_MAX_DIMS);
+
+            aclnn_cast(ctx, acl_src, src_trans_tensor, dst);
+
+            size_t cpy_size = ggml_nbytes(dst);
+            ACL_CHECK(aclrtMemcpyAsync(dst->data, cpy_size, src_trans_buffer,
+                                       cpy_size, ACL_MEMCPY_DEVICE_TO_DEVICE,
+                                       ctx.stream()));
+            ACL_CHECK(aclDestroyTensor(src_trans_tensor));
+            return;
+        } else {
+            GGML_ABORT("Unsupport dst is not tontiguous.");
        }
-        GGML_ABORT("fatal error");
    }
+
+    ACL_CHECK(aclDestroyTensor(acl_src));
+    ACL_CHECK(aclDestroyTensor(acl_dst));
 }

 #ifdef __cplusplus
@@ -1158,8 +1089,6 @@ void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    float eps;
    memcpy(&eps, dst->op_params, sizeof(float));

-    GGML_ASSERT(eps > 0.0f);
-
    uint64_t workspaceSize = 0;
    aclOpExecutor* executor;
    void* workspaceAddr = nullptr;
@@ -2378,85 +2307,168 @@ void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    ACL_CHECK(aclDestroyTensor(tmp_mask_tensor));
 }

-void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
-    ggml_tensor* src0 = dst->src[0];
-    ggml_tensor* src1 = dst->src[1];
+/**
+ * @brief Performs embedding operation on a 4D tensor using the CANN backend.
+ *
+ * This function extracts slices from the source tensor (`src_buffer`),
+ * index tensor (`index`), and destination tensor (`dst`), and performs an
+ * embedding operation on them. The embedding operation is applied by iterating
+ * over the last two dimensions of the source tensor, creating the necessary
+ * tensors for the source, index, and output, and executing the embedding operation.
+ *
+ * @param ctx The context for CANN backend operations.
+ * @param src_buffer The source buffer holding the data for the source tensor.
+ * @param src_ne The dimensions of the source tensor.
+ * @param src_nb The strides (byte offsets) of the source tensor.
+ * @param index The index tensor used in the embedding operation.
+ * @param dst The destination tensor where the result will be stored.
+ */
+static void aclnn_embedding_4d(ggml_backend_cann_context& ctx, void* src_buffer,
+                            int64_t* src_ne, size_t* src_nb, ggml_tensor* index,
+                            ggml_tensor* dst) {
+    for (int64_t i = 0; i < src_ne[3]; i++) {
+        for (int64_t j = 0; j < src_ne[2]; j++) {
+            // src
+            int64_t acl_src_ne[2] = {src_ne[0], src_ne[1]};
+            size_t acl_src_nb[2] = {src_nb[0], src_nb[1]};
+            aclTensor* acl_src_tensor = ggml_cann_create_tensor(
+                (char*)src_buffer + i * src_nb[3] + j * src_nb[2],
+                ggml_cann_type_mapping(dst->type), ggml_element_size(dst),
+                acl_src_ne, acl_src_nb, 2);

-    ggml_cann_pool_alloc src0_extra_allocator(ctx.pool(), sizeof(ggml_tensor));
-    ggml_cann_pool_alloc src1_extra_allocator(ctx.pool(), sizeof(ggml_tensor));
-    ggml_cann_pool_alloc dst_extra_allocator(ctx.pool(), sizeof(ggml_tensor));
-    src0->extra = src0_extra_allocator.get();
-    src1->extra = src1_extra_allocator.get();
-    dst->extra = dst_extra_allocator.get();
-    ACL_CHECK(aclrtMemcpyAsync(src0->extra, sizeof(ggml_tensor), src0,
-                               sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE,
-                               ctx.stream()));
-    ACL_CHECK(aclrtMemcpyAsync(src1->extra, sizeof(ggml_tensor), src1,
-                               sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE,
-                               ctx.stream()));
-    ACL_CHECK(aclrtMemcpyAsync(dst->extra, sizeof(ggml_tensor), dst,
-                               sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE,
-                               ctx.stream()));
+            // index
+            int64_t acl_index_ne[1] = {index->ne[0]};
+            size_t acl_index_nb[1] = {index->nb[0]};
+            aclTensor* acl_index = ggml_cann_create_tensor(
+                (char*)index->data + i * index->nb[2] + j * index->nb[1],
+                ggml_cann_type_mapping(index->type), ggml_element_size(index),
+                acl_index_ne, acl_index_nb, 1);
+
+            // out
+            int64_t acl_out_ne[2] = {dst->ne[0], dst->ne[1]};
+            size_t acl_out_nb[2] = {dst->nb[0], dst->nb[1]};
+            aclTensor* acl_out = ggml_cann_create_tensor(
+                (char*)dst->data + i * dst->nb[3] + j * dst->nb[2],
+                ggml_cann_type_mapping(dst->type), ggml_element_size(dst),
+                acl_out_ne, acl_out_nb, 2);
+
+            uint64_t workspaceSize = 0;
+            aclOpExecutor* executor;
+            void* workspaceAddr = nullptr;
+
+            ACL_CHECK(aclnnEmbeddingGetWorkspaceSize(
+                acl_src_tensor, acl_index, acl_out, &workspaceSize, &executor));
+
+            if (workspaceSize > 0) {
+                ggml_cann_pool_alloc workspace_allocator(ctx.pool(),
+                                                         workspaceSize);
+                workspaceAddr = workspace_allocator.get();
+            }
+
+            ACL_CHECK(aclnnEmbedding(workspaceAddr, workspaceSize, executor,
+                                     ctx.stream()));
+
+            ACL_CHECK(aclDestroyTensor(acl_src_tensor));
+            ACL_CHECK(aclDestroyTensor(acl_index));
+            ACL_CHECK(aclDestroyTensor(acl_out));
+        }
+    }
+}
+
+void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+    ggml_tensor* src0 = dst->src[0];  // src
+    ggml_tensor* src1 = dst->src[1];  // index

    switch (src0->type) {
        case GGML_TYPE_F32: {
-#ifdef ASCEND_310P
-            // Special operation for get_row_f32 kernel of 310P: clear the
-            // content of dest data buffer when row is not aligned to 32 bytes
-            if ((src0->ne[0] % 8) != 0) {
-                size_t dst_len = src1->ne[0] * src1->ne[1] * src1->ne[2] *
-                                 src0->ne[0] * ggml_type_size(GGML_TYPE_F32);
-                ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len));
-            }
-#endif
-            aclrtlaunch_ascendc_get_row_f32(
-                24, ctx.stream(), src0->data, src1->data, dst->data,
-                ((ggml_tensor*)src0->extra)->ne,
-                ((ggml_tensor*)src0->extra)->nb,
-                ((ggml_tensor*)src1->extra)->ne,
-                ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne,
-                ((ggml_tensor*)dst->extra)->nb);
+            aclnn_embedding_4d(ctx, src0->data, src0->ne, src0->nb, src1,
+                                   dst);
            break;
        }
        case GGML_TYPE_F16: {
-#ifdef ASCEND_310P
-            // Special operation for get_row_f16 kernel of 310P: clear the
-            // content of dest data buffer when row is not aligned to 32 bytes
-            if ((src0->ne[0] % 16) != 0) {
-                size_t dst_len =
-                    src1->ne[0] * src1->ne[1] * src1->ne[2] * src0->ne[0] *
-                    ggml_type_size(
-                        GGML_TYPE_F32);  // out is also f32, even input is f16
-                ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len));
+            aclTensor* acl_src0 = ggml_cann_create_tensor(src0);
+            ggml_cann_pool_alloc src_buffer_allocator(
+                ctx.pool(), ggml_nelements(src0) * sizeof(float_t));
+            void* src_trans_buffer = src_buffer_allocator.get();
+            size_t src_trans_nb[GGML_MAX_DIMS];
+            src_trans_nb[0] = sizeof(float_t);
+            for (int i = 1; i < GGML_MAX_DIMS; i++) {
+                src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
            }
-#endif
-            aclrtlaunch_ascendc_get_row_f16(
-                24, ctx.stream(), src0->data, src1->data, dst->data,
-                ((ggml_tensor*)src0->extra)->ne,
-                ((ggml_tensor*)src0->extra)->nb,
-                ((ggml_tensor*)src1->extra)->ne,
-                ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne,
-                ((ggml_tensor*)dst->extra)->nb);
+            aclTensor* src_trans_tensor = ggml_cann_create_tensor(
+                src_trans_buffer, ACL_FLOAT, ggml_type_size(dst->type),
+                src0->ne, src_trans_nb, GGML_MAX_DIMS);
+            aclnn_cast(ctx, acl_src0, src_trans_tensor, dst);
+            aclnn_embedding_4d(ctx, src_trans_buffer, src0->ne,
+                                   src_trans_nb, src1, dst);
+            ACL_CHECK(aclDestroyTensor(acl_src0));
+            ACL_CHECK(aclDestroyTensor(src_trans_tensor));
            break;
        }
-        case GGML_TYPE_Q4_0:
-            aclrtlaunch_ascendc_get_row_q4_0(
-                24, ctx.stream(), src0->data, src1->data, dst->data,
-                ((ggml_tensor*)src0->extra)->ne,
-                ((ggml_tensor*)src1->extra)->ne,
-                ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne,
-                ((ggml_tensor*)dst->extra)->nb);
-            break;
-        case GGML_TYPE_Q8_0:
-            aclrtlaunch_ascendc_get_row_q8_0(
-                24, ctx.stream(), src0->data, src1->data, dst->data,
-                ((ggml_tensor*)src0->extra)->ne,
-                ((ggml_tensor*)src1->extra)->ne,
-                ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne,
-                ((ggml_tensor*)dst->extra)->nb);
+        case GGML_TYPE_Q8_0: {
+            // add 1 dim for bcast mul.
+            size_t weight_nb[GGML_MAX_DIMS + 1], scale_nb[GGML_MAX_DIMS + 1],
+                dequant_nb[GGML_MAX_DIMS + 1];
+            int64_t weight_ne[GGML_MAX_DIMS + 1], scale_ne[GGML_MAX_DIMS + 1],
+                *dequant_ne;
+            int64_t scale_offset = 0;
+
+            // [3,4,5,64] -> [3,4,5,2,32]
+            weight_ne[0] = QK8_0;
+            weight_ne[1] = src0->ne[0] / QK8_0;
+            weight_nb[0] = sizeof(int8_t);
+            weight_nb[1] = weight_nb[0] * weight_ne[0];
+            for (int i = 2; i < GGML_MAX_DIMS + 1; i++) {
+                weight_ne[i] = src0->ne[i - 1];
+                weight_nb[i] = weight_nb[i - 1] * weight_ne[i - 1];
+            }
+
+            // [3,4,5,64] -> [3,4,5,2,1]
+            scale_ne[0] = 1;
+            scale_ne[1] = src0->ne[0] / QK8_0;
+            scale_nb[0] = sizeof(uint16_t);
+            scale_nb[1] = scale_nb[0] * scale_ne[0];
+            for (int i = 2; i < GGML_MAX_DIMS + 1; i++) {
+                scale_ne[i] = src0->ne[i - 1];
+                scale_nb[i] = scale_nb[i - 1] * scale_ne[i - 1];
+            }
+
+            // [3,4,5,64] -> [3,4,5,2,32]
+            dequant_ne = weight_ne;
+            dequant_nb[0] = sizeof(float_t);
+            for (int i = 1; i < GGML_MAX_DIMS + 1; i++) {
+                dequant_nb[i] = dequant_nb[i - 1] * dequant_ne[i - 1];
+            }
+
+            scale_offset = ggml_nelements(src0) * sizeof(int8_t);
+            ggml_cann_pool_alloc dequant_buffer_allocator(
+                ctx.pool(), ggml_nelements(src0) * sizeof(float_t));
+
+            aclTensor* acl_weight_tensor = ggml_cann_create_tensor(
+                src0->data, ACL_INT8, sizeof(int8_t), weight_ne, weight_nb,
+                GGML_MAX_DIMS + 1);
+            aclTensor* acl_scale_tensor = ggml_cann_create_tensor(
+                src0->data, ACL_FLOAT16, sizeof(float16_t), scale_ne, scale_nb,
+                GGML_MAX_DIMS + 1, ACL_FORMAT_ND, scale_offset);
+            aclTensor* dequant_tensor = ggml_cann_create_tensor(
+                dequant_buffer_allocator.get(), ACL_FLOAT, sizeof(float_t),
+                dequant_ne, dequant_nb, GGML_MAX_DIMS + 1);
+
+            aclnn_mul(ctx, acl_weight_tensor, acl_scale_tensor, dequant_tensor);
+            dequant_nb[0] = sizeof(float_t);
+            dequant_ne = src0->ne;
+            for (int i = 1; i < GGML_MAX_DIMS; i++) {
+                dequant_nb[i] = dequant_nb[i - 1] * src0->ne[i - 1];
+            }
+
+            aclnn_embedding_4d(ctx, dequant_buffer_allocator.get(),
+                                   dequant_ne, dequant_nb, src1, dst);
+
+            ACL_CHECK(aclDestroyTensor(dequant_tensor));
            break;
+        }
        default:
-            GGML_ABORT("fatal error");
+            GGML_ABORT("Unsupported tensor type for GGML_OP_GET_ROWS");
            break;
    }
 }
@@ -2797,8 +2809,8 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,

            ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize(
                acl_input_tensor, acl_weight_tensor, acl_scale_tensor, nullptr,
-                nullptr, nullptr, nullptr, antiquantGroupSize, acl_output_tensor,
-                &workspaceSize, &executor));
+                nullptr, nullptr, nullptr, antiquantGroupSize,
+                acl_output_tensor, &workspaceSize, &executor));
            if (workspaceAddr == nullptr) {
                workspaceAddr = workspace_allocator.alloc(workspaceSize);
            }
@@ -3137,7 +3149,7 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    // TODO: use ascendc
    // Only test with LLAMA model.
    ggml_tensor* src0 = dst->src[0];  // input
-    ggml_tensor* src2 = dst->src[2];  // freq_factors
+    // ggml_tensor* src2 = dst->src[2];  // freq_factors, not used now.

    // param
    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
@@ -3429,3 +3441,46 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    ACL_CHECK(aclDestroyTensor(acl_sin_reshape_tensor));
    ACL_CHECK(aclDestroyTensor(acl_dst));
 }
+
+
+ void ggml_cann_argmax(ggml_backend_cann_context& ctx, ggml_tensor* dst){
+    ggml_tensor * src0 = dst->src[0];
+
+    aclTensor* acl_src = ggml_cann_create_tensor(src0);
+    aclTensor* acl_dst = ggml_cann_create_tensor(dst, dst->ne, dst->nb, 3);
+
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+
+    ACL_CHECK(aclnnArgMaxGetWorkspaceSize(acl_src, 3, false, acl_dst,
+                     &workspaceSize, &executor));
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+    ACL_CHECK(aclnnArgMax(workspaceAddr, workspaceSize, executor, ctx.stream()));
+
+    ACL_CHECK(aclDestroyTensor(acl_src));
+    ACL_CHECK(aclDestroyTensor(acl_dst));
+}
+
+void ggml_cann_cos(ggml_backend_cann_context& ctx, ggml_tensor* dst){
+    ggml_tensor * src0 = dst->src[0];
+
+    aclTensor* acl_src = ggml_cann_create_tensor(src0);
+    aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+    aclnn_cos(ctx, acl_src, acl_dst);
+    ACL_CHECK(aclDestroyTensor(acl_src));
+    ACL_CHECK(aclDestroyTensor(acl_dst));
+}
+
+void ggml_cann_sin(ggml_backend_cann_context& ctx, ggml_tensor* dst){
+    ggml_tensor * src0 = dst->src[0];
+
+    aclTensor* acl_src = ggml_cann_create_tensor(src0);
+    aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+    aclnn_sin(ctx, acl_src, acl_dst);
+    ACL_CHECK(aclDestroyTensor(acl_src));
+    ACL_CHECK(aclDestroyTensor(acl_dst));
+}
--- a/ggml/src/ggml-cann/aclnn_ops.h
+++ b/ggml/src/ggml-cann/aclnn_ops.h
@@ -484,6 +484,47 @@ void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 */
 void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst);

+/**
+ * @brief   Computes the index of the maximum value along the specified dimension
+ *          of a ggml tensor using the CANN backend.
+ *
+ * @details This function performs an argmax operation on the input tensor.
+ *          It finds the index of the maximum value along the specified axis
+ *          and stores these indices in the destination tensor `dst`. The
+ *          operation is executed using the CANN backend for optimized performance.
+ *
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor where the indices of the maximum values will be stored.
+ *            dst->op is `GGML_OP_ARGMAX`.
+ */
+void ggml_cann_argmax(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+
+/**
+ * @brief   Computes the cosine of each element in a ggml tensor using the CANN backend.
+ *
+ * @details This function applies the cosine function element-wise to the input tensor.
+ *          The computed cosine values are stored in the destination tensor `dst`.
+ *          The operation is optimized using the CANN backend for improved performance.
+ *
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor where the cosine values will be stored.
+ *            dst->op is `GGML_OP_COS`.
+ */
+void ggml_cann_cos(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+
+/**
+ * @brief   Computes the sine of each element in a ggml tensor using the CANN backend.
+ *
+ * @details This function applies the sine function element-wise to the input tensor.
+ *          The computed sine values are stored in the destination tensor `dst`.
+ *          The operation is optimized using the CANN backend for improved performance.
+ *
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor where the sine values will be stored.
+ *            dst->op is `GGML_OP_SIN`.
+ */
+void ggml_cann_sin(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+
 template <aclnnStatus getWorkspaceSize(const aclTensor*, const aclTensor*,
                                       aclTensor*, uint64_t*, aclOpExecutor**),
          aclnnStatus execute(void*, uint64_t, aclOpExecutor*, aclrtStream)>
@@ -535,9 +576,6 @@ template <aclnnStatus getWorkspaceSize(const aclTensor*, aclTensor*, uint64_t*,
 void ggml_cann_activation(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    ggml_tensor* src = dst->src[0];

-    GGML_ASSERT(src->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-
    aclTensor* acl_src = ggml_cann_create_tensor(src);
    aclTensor* acl_dst = ggml_cann_create_tensor(dst);

@@ -566,9 +604,6 @@ template <aclnnStatus getWorkspaceSize(const aclTensor*, const aclTensor*,
 void ggml_cann_activation(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    ggml_tensor* src = dst->src[0];

-    GGML_ASSERT(src->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-
    aclTensor* acl_src = ggml_cann_create_tensor(src);
    aclTensor* acl_dst = ggml_cann_create_tensor(dst);

--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -1420,6 +1420,15 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
        case GGML_OP_ARGSORT:
            ggml_cann_argsort(ctx, dst);
            break;
+        case GGML_OP_ARGMAX:
+            ggml_cann_argmax(ctx, dst);
+            break;
+        case GGML_OP_COS:
+            ggml_cann_cos(ctx, dst);
+            break;
+        case GGML_OP_SIN:
+            ggml_cann_sin(ctx, dst);
+            break;
        default:
            return false;
    }
@@ -1458,11 +1467,6 @@ static void ggml_backend_cann_free(ggml_backend_t backend) {
    ACL_CHECK(aclrtSynchronizeDevice());
    ACL_CHECK(aclrtResetDevice(cann_ctx->device));

-    // finalize when last backend freed.
-    if (cann_ctx->device == ggml_backend_cann_get_device_count() - 1) {
-        ACL_CHECK(aclFinalize());
-    }
-
    delete cann_ctx;
    delete backend;
 }
@@ -1688,11 +1692,14 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
            }
        case GGML_OP_MUL_MAT: {
            switch (op->src[0]->type) {
-                case GGML_TYPE_Q8_0:
                case GGML_TYPE_F16:
                case GGML_TYPE_F32:
-                case GGML_TYPE_Q4_0:
                    return true;
+                case GGML_TYPE_Q8_0:
+                case GGML_TYPE_Q4_0:
+                    // only support contiguous for quantized types.
+                    return ggml_is_contiguous(op->src[0]) &&
+                            ggml_is_contiguous(op->src[1]);
                default:
                    return false;
            }
@@ -1704,7 +1711,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
            switch (op->src[0]->type) {
                case GGML_TYPE_F32:
                case GGML_TYPE_F16:
-                case GGML_TYPE_Q4_0:
                case GGML_TYPE_Q8_0:
                    return true;
                default:
@@ -1712,16 +1718,21 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
            }
        } break;
        case GGML_OP_CPY: {
-            switch (op->type) {
-                case GGML_TYPE_F32:
-                case GGML_TYPE_F16:
-                case GGML_TYPE_Q8_0:
-                case GGML_TYPE_Q4_0:
-                    return true;
-                default:
-                    return false;
+            ggml_tensor *src = op->src[0];
+            if ((op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_F16) ||
+                  (src->type != GGML_TYPE_F32 &&
+                    src->type != GGML_TYPE_F16)) {
+                // only support F32 and F16.
+                return false;
            }
-        }
+
+            if (!ggml_are_same_shape(op, src) && !ggml_is_contiguous(op)) {
+                // unsupport dst is not contiguous.
+                return false;
+            }
+
+            return true;
+        } break;
        case GGML_OP_CONT: {
            // TODO: support GGML_TYPE_BF16
            switch (op->src[0]->type) {
@@ -1734,13 +1745,14 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
        }
        case GGML_OP_ROPE: {
            // TODO: with ops-test v == 1
-            float * ext_factor = (float*)((int32_t*)op->op_params + 7);
+            float ext_factor = 0.0f;
+            memcpy(&ext_factor, (const float *) op->op_params + 7, sizeof(float));
            // TODO: n_dims <= ne0
            if (op->src[0]->ne[0] != op->op_params[1]) {
                return false;
            }
            // TODO: ext_factor != 0
-            if (*ext_factor != 0) {
+            if (ext_factor != 0) {
                return false;
            }

@@ -1762,9 +1774,19 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
            }
            return true;
        }
+        case GGML_OP_POOL_2D: {
+            const int32_t * opts = (const int32_t *) op->op_params;
+            const int       k0   = opts[1];
+            const int       k1   = opts[2];
+            const int       p0   = opts[5];
+            const int       p1   = opts[6];
+            // value of paddingH should be at most half of kernelH
+            // value of paddingW should be at most half of kernelW
+            return (p0 <= (k0 / 2)) && (p1 <= (k1 / 2));
+        }
+        case GGML_OP_DUP:
        case GGML_OP_IM2COL:
        case GGML_OP_CONCAT:
-        case GGML_OP_DUP:
        case GGML_OP_REPEAT:
        case GGML_OP_NONE:
        case GGML_OP_RESHAPE:
@@ -1781,7 +1803,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
        case GGML_OP_CLAMP:
        case GGML_OP_DIAG_MASK_INF:
        case GGML_OP_SOFT_MAX:
-        case GGML_OP_POOL_2D:
        case GGML_OP_SUM_ROWS:
        case GGML_OP_ARGSORT:
        case GGML_OP_ACC:
@@ -1790,6 +1811,9 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
        case GGML_OP_ARANGE:
        case GGML_OP_TIMESTEP_EMBEDDING:
        case GGML_OP_LEAKY_RELU:
+        case GGML_OP_ARGMAX:
+        case GGML_OP_COS:
+        case GGML_OP_SIN:
            return true;
        default:
            return false;
--- a/ggml/src/ggml-cann/kernels/CMakeLists.txt
+++ b/ggml/src/ggml-cann/kernels/CMakeLists.txt
@@ -1,30 +0,0 @@
-file(GLOB SRC_FILES
-    get_row_f32.cpp
-    get_row_f16.cpp
-    get_row_q4_0.cpp
-    get_row_q8_0.cpp
-    quantize_f32_q8_0.cpp
-    quantize_f16_q8_0.cpp
-    quantize_float_to_q4_0.cpp
-    dup.cpp
-)
-
-set(ASCEND_CANN_PACKAGE_PATH ${CANN_INSTALL_DIR})
-set(RUN_MODE "npu" CACHE STRING "run mode: npu/sim")
-
-if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
-    set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
-elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake)
-    set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake)
-else()
-    message(FATAL_ERROR "ascendc_kernel_cmake does not exist, please check whether the compiler package is installed.")
-endif()
-include(${ASCENDC_CMAKE_DIR}/ascendc.cmake)
-
-ascendc_library(ascendc_kernels STATIC
-    ${SRC_FILES}
-)
-
-message(STATUS "CANN: compile ascend kernels witch SOC_TYPE:${SOC_TYPE}, SOC_VERSION:${SOC_VERSION}, compile macro:-D${SOC_TYPE_COMPILE_OPTION}.")
-ascendc_compile_definitions(ascendc_kernels PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}")
-# ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP)
--- a/ggml/src/ggml-cann/kernels/ascendc_kernels.h
+++ b/ggml/src/ggml-cann/kernels/ascendc_kernels.h
@@ -1,19 +0,0 @@
-#ifndef ASCENDC_KERNELS_H
-#define ASCENDC_KERNELS_H
-
-#include "aclrtlaunch_ascendc_get_row_f32.h"
-#include "aclrtlaunch_ascendc_get_row_f16.h"
-#include "aclrtlaunch_ascendc_get_row_q8_0.h"
-#include "aclrtlaunch_ascendc_get_row_q4_0.h"
-
-#include "aclrtlaunch_ascendc_quantize_f32_q8_0.h"
-#include "aclrtlaunch_ascendc_quantize_f16_q8_0.h"
-#include "aclrtlaunch_ascendc_quantize_f16_to_q4_0.h"
-#include "aclrtlaunch_ascendc_quantize_f32_to_q4_0.h"
-
-#include "aclrtlaunch_ascendc_dup_by_rows_fp16.h"
-#include "aclrtlaunch_ascendc_dup_by_rows_fp32.h"
-#include "aclrtlaunch_ascendc_dup_by_rows_fp32_to_fp16.h"
-#include "aclrtlaunch_ascendc_dup_by_rows_fp16_to_fp32.h"
-
-#endif  // ASCENDC_KERNELS_H
--- a/ggml/src/ggml-cann/kernels/dup.cpp
+++ b/ggml/src/ggml-cann/kernels/dup.cpp
@@ -1,234 +0,0 @@
-#include "kernel_operator.h"
-
-using namespace AscendC;
-
-#define BUFFER_NUM 2
-const int64_t SUPPORTED_MAX_DIM = 65535;  // currently the limit of max block dim supportted by dup kernel is 65535template <typename SRC_T, typename DST_T>
-
-template <typename SRC_T, typename DST_T>
-class DupByRows {
-   public:
-    __aicore__ inline DupByRows() {}
-    __aicore__ inline void init(GM_ADDR src, GM_ADDR dst, int64_t *input_ne_ub,
-                                size_t *input_nb_ub) {
-        /* Dup by rows when src is contigous on first dimension and dst is
-        contiguous, each kernel process one row.
-        */
-
-        // Input has four dims.
-        int64_t op_block_num = GetBlockNum();
-        int64_t op_block_idx = GetBlockIdx();
-
-        // param
-        num_rows = input_ne_ub[1] * input_ne_ub[2] * input_ne_ub[3];
-        num_elem = input_ne_ub[0];
-
-        // index for (ne[1], ne[2], ne[3]): (idx_ne1, idx_ne2, idx_ne3)
-        idx_ne3 = op_block_idx / (input_ne_ub[1] * input_ne_ub[2]);
-        idx_ne2 = (op_block_idx - idx_ne3 * (input_ne_ub[1] * input_ne_ub[2]))
-                  / (input_ne_ub[1]);
-        idx_ne1 = op_block_idx - idx_ne3 * (input_ne_ub[1] * input_ne_ub[2])
-                - idx_ne2 * input_ne_ub[1];
-
-        // src may not contiguous in dim [1,2,3], so stride decited by ne&nb
-        src_stride = input_nb_ub[3] * idx_ne3 + input_nb_ub[2] * idx_ne2
-                     + input_nb_ub[1] * idx_ne1;
-
-        // dst is contiguous
-        dst_stride = op_block_idx * (input_ne_ub[0] * sizeof(DST_T));
-
-        src_gm.SetGlobalBuffer(reinterpret_cast<__gm__ SRC_T *>(src +
-                                                                src_stride));
-        dst_gm.SetGlobalBuffer(reinterpret_cast<__gm__ DST_T *>(dst +
-                                                                dst_stride));
-
-        pipe.InitBuffer(src_queue, BUFFER_NUM, (sizeof(SRC_T) * num_elem +
-                                                32 - 1) / 32 * 32);
-        pipe.InitBuffer(dst_queue, BUFFER_NUM, (sizeof(DST_T) * num_elem +
-                                                32 - 1) / 32 * 32);
-    }
-
-    __aicore__ inline void copy_in() {
-        LocalTensor<SRC_T> src_local = src_queue.AllocTensor<SRC_T>();
-        const size_t elem_per_block = 32 / sizeof(SRC_T);
-        size_t tail = num_elem % elem_per_block;
-        size_t cpy_elements_len = tail > 0 ? num_elem + 1 : num_elem;
-        DataCopy(src_local, src_gm, cpy_elements_len);
-        src_queue.EnQue(src_local);
-    }
-
-    __aicore__ inline void copy_out() {
-        LocalTensor<DST_T> dst_local = dst_queue.DeQue<DST_T>();
-#ifdef ASCEND_310P
-        const size_t elem_per_block = 32 / sizeof(DST_T);
-        size_t tail = num_elem % elem_per_block;
-        size_t len = num_elem & ~(elem_per_block - 1);
-        if (len > 0) {
-            DataCopy(dst_gm, dst_local, len);
-        }
-        if(tail != 0) {
-            for (size_t i = tail; i < elem_per_block; i++) {
-                dst_local[len + i].SetValue(0, 0);
-            }
-            SetAtomicAdd<float>();
-            DataCopy(dst_gm[len], dst_local[len], elem_per_block);
-            SetAtomicNone();
-        }
-#else
-        DataCopyExtParams dataCopyParams;
-        dataCopyParams.blockCount = 1;
-        dataCopyParams.blockLen = num_elem * sizeof(DST_T);
-        DataCopyPad(dst_gm, dst_local, dataCopyParams);
-#endif
-        dst_queue.FreeTensor(dst_local);
-    }
-
-    __aicore__ inline void dup() {
-        // main process, copy one row data from src to dst.
-        copy_in();
-
-        LocalTensor<SRC_T> src_local = src_queue.DeQue<SRC_T>();
-        LocalTensor<DST_T> dst_local = dst_queue.AllocTensor<DST_T>();
-
-        int32_t BLOCK_NUM = 32 / sizeof(DST_T);
-        DataCopy(dst_local, src_local, (num_elem + BLOCK_NUM - 1)
-                                        / BLOCK_NUM * BLOCK_NUM);
-        dst_queue.EnQue<DST_T>(dst_local);
-
-        src_queue.FreeTensor(src_local);
-        copy_out();
-    }
-
-    __aicore__ inline void dup_with_cast() {
-        // main process, copy one row data from src to dst.
-        // cast dtype from src to dst.
-        copy_in();
-
-        LocalTensor<SRC_T> src_local = src_queue.DeQue<SRC_T>();
-        LocalTensor<DST_T> dst_local = dst_queue.AllocTensor<DST_T>();
-
-        Cast(dst_local, src_local, RoundMode::CAST_NONE, num_elem);
-        dst_queue.EnQue<DST_T>(dst_local);
-
-        src_queue.FreeTensor(src_local);
-        copy_out();
-    }
-
-   private:
-
-    TPipe pipe;
-    GlobalTensor<SRC_T> src_gm;
-    GlobalTensor<DST_T> dst_gm;
-
-    int64_t num_rows;
-    int64_t num_elem;
-    int64_t idx_ne3;
-    int64_t idx_ne2;
-    int64_t idx_ne1;
-    int64_t src_stride;
-    int64_t dst_stride;
-
-    TQue<QuePosition::VECIN, BUFFER_NUM> src_queue;
-    TQue<QuePosition::VECOUT, BUFFER_NUM> dst_queue;
-};
-
-template <typename T>
-__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
-    auto gm_ptr = (__gm__ uint8_t *)gm;
-    auto ub_ptr = (uint8_t *)(ub);
-    for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
-        *ub_ptr = *gm_ptr;
-    }
-}
-
-extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp16(
-                                                        GM_ADDR src_gm,
-                                                        GM_ADDR dst_gm,
-                                                        GM_ADDR input_ne_gm,
-                                                        GM_ADDR input_nb_gm,
-                                                        GM_ADDR output_ne_gm,
-                                                        GM_ADDR output_nb_gm) {
-
-    int64_t input_ne_ub[4];
-    size_t input_nb_ub[4];
-    int64_t output_ne_ub[4];
-    size_t output_nb_ub[4];
-
-    copy_to_ub(input_ne_gm, input_ne_ub, 32);
-    copy_to_ub(input_nb_gm, input_nb_ub, 32);
-    copy_to_ub(output_ne_gm, output_ne_ub, 32);
-    copy_to_ub(output_nb_gm, output_nb_ub, 32);
-
-    DupByRows<half, half> op;
-    op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub);
-    op.dup();
-}
-
-extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp32(
-                                                        GM_ADDR src_gm,
-                                                        GM_ADDR dst_gm,
-                                                        GM_ADDR input_ne_gm,
-                                                        GM_ADDR input_nb_gm,
-                                                        GM_ADDR output_ne_gm,
-                                                        GM_ADDR output_nb_gm) {
-    int64_t input_ne_ub[4];
-    size_t input_nb_ub[4];
-    int64_t output_ne_ub[4];
-    size_t output_nb_ub[4];
-
-    copy_to_ub(input_ne_gm, input_ne_ub, 32);
-    copy_to_ub(input_nb_gm, input_nb_ub, 32);
-    copy_to_ub(output_ne_gm, output_ne_ub, 32);
-    copy_to_ub(output_nb_gm, output_nb_ub, 32);
-
-    DupByRows<float, float> op;
-    op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub);
-    op.dup();
-}
-
-extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp32_to_fp16(
-                                                        GM_ADDR src_gm,
-                                                        GM_ADDR dst_gm,
-                                                        GM_ADDR input_ne_gm,
-                                                        GM_ADDR input_nb_gm,
-                                                        GM_ADDR output_ne_gm,
-                                                        GM_ADDR output_nb_gm) {
-
-    int64_t input_ne_ub[4];
-    size_t input_nb_ub[4];
-    int64_t output_ne_ub[4];
-    size_t output_nb_ub[4];
-
-    copy_to_ub(input_ne_gm, input_ne_ub, 32);
-    copy_to_ub(input_nb_gm, input_nb_ub, 32);
-    copy_to_ub(output_ne_gm, output_ne_ub, 32);
-    copy_to_ub(output_nb_gm, output_nb_ub, 32);
-
-    DupByRows<float, half> op;
-    op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub);
-    op.dup_with_cast();
-}
-
-extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp16_to_fp32(
-                                                        GM_ADDR src_gm,
-                                                        GM_ADDR dst_gm,
-                                                        GM_ADDR input_ne_gm,
-                                                        GM_ADDR input_nb_gm,
-                                                        GM_ADDR output_ne_gm,
-                                                        GM_ADDR output_nb_gm) {
-
-    // copy params from gm to ub.
-    int64_t input_ne_ub[4];
-    size_t input_nb_ub[4];
-    int64_t output_ne_ub[4];
-    size_t output_nb_ub[4];
-
-    copy_to_ub(input_ne_gm, input_ne_ub, 32);
-    copy_to_ub(input_nb_gm, input_nb_ub, 32);
-    copy_to_ub(output_ne_gm, output_ne_ub, 32);
-    copy_to_ub(output_nb_gm, output_nb_ub, 32);
-
-    DupByRows<half, float> op;
-    op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub);
-    op.dup_with_cast();
-}
--- a/ggml/src/ggml-cann/kernels/get_row_f16.cpp
+++ b/ggml/src/ggml-cann/kernels/get_row_f16.cpp
@@ -1,197 +0,0 @@
-#include "kernel_operator.h"
-
-// optimize me. Use template to avoid copy code.
-using namespace AscendC;
-
-#define BUFFER_NUM 2
-
-class GET_ROW_F16 {
-   public:
-    __aicore__ inline GET_ROW_F16() {}
-    __aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output,
-                                int64_t *input_ne_ub, size_t *input_nb_ub,
-                                int64_t *indices_ne_ub, size_t *indices_nb_ub,
-                                int64_t *output_ne_ub, size_t *output_nb_ub) {
-        // TODO, use template for F16/f32
-        int64_t op_block_num = GetBlockNum();
-        op_block_idx = GetBlockIdx();
-
-        for (int i = 0; i < 4; i++) {
-            input_ne[i] = input_ne_ub[i];
-            input_stride[i] = input_nb_ub[i] / input_nb_ub[0];
-
-            indices_ne[i] = indices_ne_ub[i];
-            indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0];
-
-            output_ne[i] = output_ne_ub[i];
-            output_stride[i] = output_nb_ub[i] / output_nb_ub[0];
-        }
-
-        // Indices has two dims. n_elements = all rows should get.
-        // dr, all rows should this thread get.
-        uint64_t n_elements =
-            indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3];
-        dr = n_elements / op_block_num;
-
-        uint64_t tails = n_elements % op_block_num;
-        if (op_block_idx < tails) {
-            dr += 1;
-            ir = dr * op_block_idx;
-        } else {
-            ir = dr * op_block_idx + tails;
-        }
-
-        input_gm.SetGlobalBuffer((__gm__ half *)input);
-        indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices);
-        output_gm.SetGlobalBuffer((__gm__ float *)output);
-
-        uint64_t input_local_buffer_size = ((input_ne[0] * sizeof(half) + 31)
-                                             & ~31);
-        uint64_t output_local_buffer_size = ((input_ne[0] * sizeof(float) + 31)
-                                              & ~31);
-
-        local_buffer_elems = input_local_buffer_size / sizeof(half);
-
-        // TODO, consider long row that can't put in UB.
-        // All data should asign to 32. It's ok because all data is align to 32.
-        pipe.InitBuffer(input_queue, BUFFER_NUM, input_local_buffer_size);
-        pipe.InitBuffer(output_queue, BUFFER_NUM, output_local_buffer_size);
-    }
-
-    __aicore__ inline void copy_in(uint32_t offset, size_t len) {
-        size_t origin_len = len;
-        LocalTensor<half> input_local = input_queue.AllocTensor<half>();
-        const size_t elem_per_block = 32 / sizeof(half);
-        size_t tail = len % elem_per_block;
-        len = len & ~(elem_per_block - 1);
-        if(tail != 0) {
-            len += elem_per_block;
-        }
-        DataCopy(input_local, input_gm[offset], len);
-        input_queue.EnQue(input_local);
-    }
-
-    __aicore__ inline void copy_out(uint32_t offset, size_t len) {
-        LocalTensor<float> output_local = output_queue.DeQue<float>();
-        const size_t elem_per_block = 32 / sizeof(float);
-        size_t tail = len % elem_per_block;
-        len = len & ~(elem_per_block - 1);
-        if (len > 0) {
-            DataCopy(output_gm[offset], output_local, len);
-        }
-
-        if(tail != 0) {
-#ifdef ASCEND_310P
-            for (size_t i = tail; i < elem_per_block; i++) {
-                output_local[len + i].SetValue(0, 0);
-            }
-            SetAtomicAdd<float>();
-            DataCopy(output_gm[offset + len], output_local[len], elem_per_block);
-            SetAtomicNone();
-#else
-            DataCopyExtParams dataCopyParams;
-            dataCopyParams.blockCount = 1;
-            dataCopyParams.blockLen = tail * sizeof(float);
-            DataCopyPad(output_gm[offset + len], output_local[len],
-                        dataCopyParams);
-#endif
-        }
-        output_queue.FreeTensor(output_local);
-    }
-
-    __aicore__ inline void calculate_row(int64_t idx) {
-        const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]);
-        const int64_t indices_ne1_idx =
-            (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) /
-            indices_ne[0];
-        const int64_t indices_ne0_idx =
-            (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] -
-             indices_ne1_idx * indices_ne[0]);
-
-        const int64_t indices_offset = indices_ne0_idx * indices_stride[0] +
-                                       indices_ne1_idx * indices_stride[1] +
-                                       indices_ne2_idx * indices_stride[2];
-        const int32_t selected_row_idx = indices_gm.GetValue(indices_offset);
-
-        const int64_t input_offset = selected_row_idx * input_stride[1] +
-                                     indices_ne1_idx * input_stride[2] +
-                                     indices_ne2_idx * input_stride[3];
-
-        const int64_t output_offset = indices_ne0_idx * output_stride[1] +
-                                      indices_ne1_idx * output_stride[2] +
-                                      indices_ne2_idx * output_stride[3];
-
-        copy_in(input_offset, input_ne[0]);
-        LocalTensor<half> input_local = input_queue.DeQue<half>();
-        LocalTensor<float> output_local = output_queue.AllocTensor<float>();
-
-        Cast(output_local, input_local, RoundMode::CAST_NONE,
-             local_buffer_elems);
-        output_queue.EnQue(output_local);
-        copy_out(output_offset, input_ne[0]);
-
-        input_queue.FreeTensor(input_local);
-    }
-
-    __aicore__ inline void calculate() {
-        for (int64_t i = ir; i < ir + dr; i++) {
-            calculate_row(i);
-        }
-    }
-
-   private:
-    int64_t input_ne[4];
-    size_t input_stride[4];
-
-    int64_t indices_ne[4];
-    size_t indices_stride[4];
-
-    int64_t output_ne[4];
-    size_t output_stride[4];
-
-    size_t local_buffer_elems;
-
-    int64_t ir;
-    int64_t dr;
-
-    TPipe pipe;
-    GlobalTensor<half> input_gm;
-    GlobalTensor<int32_t> indices_gm;
-    GlobalTensor<float> output_gm;
-    TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
-    TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
-    int64_t op_block_idx;
-};
-
-template <typename T>
-__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
-    auto gm_ptr = (__gm__ uint8_t *)gm;
-    auto ub_ptr = (uint8_t *)(ub);
-    for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
-        *ub_ptr = *gm_ptr;
-    }
-}
-
-extern "C" __global__ __aicore__ void ascendc_get_row_f16(
-    GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
-    GM_ADDR input_ne_gm, GM_ADDR input_nb_gm, GM_ADDR indices_ne_gm,
-    GM_ADDR indices_nb_gm, GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
-    int64_t input_ne_ub[4];
-    size_t input_nb_ub[4];
-    int64_t indices_ne_ub[4];
-    size_t indices_nb_ub[4];
-    int64_t output_ne_ub[4];
-    size_t output_nb_ub[4];
-
-    copy_to_ub(input_ne_gm, input_ne_ub, 32);
-    copy_to_ub(input_nb_gm, input_nb_ub, 32);
-    copy_to_ub(indices_ne_gm, indices_ne_ub, 32);
-    copy_to_ub(indices_nb_gm, indices_nb_ub, 32);
-    copy_to_ub(output_ne_gm, output_ne_ub, 32);
-    copy_to_ub(output_nb_gm, output_nb_ub, 32);
-
-    GET_ROW_F16 op;
-    op.init(input_gm, indices_gm, output_gm, input_ne_ub, input_nb_ub,
-            indices_ne_ub, indices_nb_ub, output_ne_ub, output_nb_ub);
-    op.calculate();
-}
--- a/ggml/src/ggml-cann/kernels/get_row_f32.cpp
+++ b/ggml/src/ggml-cann/kernels/get_row_f32.cpp
@@ -1,190 +0,0 @@
-#include "kernel_operator.h"
-
-// optimize me. Use template to avoid copy code.
-using namespace AscendC;
-
-#define BUFFER_NUM 2
-
-class GET_ROW_F32 {
-   public:
-    __aicore__ inline GET_ROW_F32() {}
-    __aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output,
-                                int64_t *input_ne_ub, size_t *input_nb_ub,
-                                int64_t *indices_ne_ub, size_t *indices_nb_ub,
-                                int64_t *output_ne_ub, size_t *output_nb_ub) {
-        int64_t op_block_num = GetBlockNum();
-        op_block_idx = GetBlockIdx();
-
-        for (int i = 0; i < 4; i++) {
-            input_ne[i] = input_ne_ub[i];
-            input_stride[i] = input_nb_ub[i] / input_nb_ub[0];
-
-            indices_ne[i] = indices_ne_ub[i];
-            indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0];
-
-            output_ne[i] = output_ne_ub[i];
-            output_stride[i] = output_nb_ub[i] / output_nb_ub[0];
-        }
-
-        // Indices has two dims. n_elements = all rows should get.
-        // dr, all rows should this thread get.
-        uint64_t n_elements =
-            indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3];
-        dr = n_elements / op_block_num;
-
-        uint64_t tails = n_elements % op_block_num;
-        if (op_block_idx < tails) {
-            dr += 1;
-            ir = dr * op_block_idx;
-        } else {
-            ir = dr * op_block_idx + tails;
-        }
-
-        input_gm.SetGlobalBuffer((__gm__ float *)input);
-        indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices);
-        output_gm.SetGlobalBuffer((__gm__ float *)output);
-
-        uint64_t local_buffer_size = ((input_ne[0] * sizeof(float) + 31) & ~31);
-        local_buffer_elems = local_buffer_size / sizeof(float);
-
-        // TODO, consider long row that can't put in UB.
-        // All data should asign to 32. It's ok because all data is align to 32.
-        pipe.InitBuffer(input_queue, BUFFER_NUM, local_buffer_size);
-        pipe.InitBuffer(output_queue, BUFFER_NUM, local_buffer_size);
-    }
-
-    __aicore__ inline void copy_in(uint32_t offset, size_t len) {
-        LocalTensor<float> input_local = input_queue.AllocTensor<float>();
-        const size_t elem_per_block = 32 / sizeof(float);
-        size_t tail = len % elem_per_block;
-        len = len & ~(elem_per_block - 1);
-        if(tail != 0) {
-            len += elem_per_block;
-        }
-        DataCopy(input_local, input_gm[offset], len);
-        input_queue.EnQue(input_local);
-    }
-
-    __aicore__ inline void copy_out(uint32_t offset, size_t len) {
-        LocalTensor<float> output_local = output_queue.DeQue<float>();
-        const size_t elem_per_block = 32 / sizeof(float);
-        size_t tail = len % elem_per_block;
-        len = len & ~(elem_per_block - 1);
-        if (len > 0) {
-            DataCopy(output_gm[offset], output_local, len);
-        }
-
-        if(tail != 0) {
-#ifdef ASCEND_310P
-            for (size_t i = tail; i < elem_per_block; i++) {
-                output_local[len + i].SetValue(0, 0);
-            }
-            SetAtomicAdd<float>();
-            DataCopy(output_gm[offset + len], output_local[len], elem_per_block);
-            SetAtomicNone();
-#else
-            DataCopyExtParams dataCopyParams;
-            dataCopyParams.blockCount = 1;
-            dataCopyParams.blockLen = tail * sizeof(float);
-            DataCopyPad(output_gm[offset + len], output_local[len],
-                        dataCopyParams);
-#endif
-        }
-        output_queue.FreeTensor(output_local);
-    }
-
-    __aicore__ inline void calculate_row(int64_t idx) {
-        const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]);
-        const int64_t indices_ne1_idx =
-            (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) /
-            indices_ne[0];
-        const int64_t indices_ne0_idx =
-            (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] -
-             indices_ne1_idx * indices_ne[0]);
-
-        const int64_t indices_offset = indices_ne0_idx * indices_stride[0] +
-                                       indices_ne1_idx * indices_stride[1] +
-                                       indices_ne2_idx * indices_stride[2];
-        const int32_t selected_row_idx = indices_gm.GetValue(indices_offset);
-
-        const int64_t input_offset = selected_row_idx * input_stride[1] +
-                                     indices_ne1_idx * input_stride[2] +
-                                     indices_ne2_idx * input_stride[3];
-
-        const int64_t output_offset = indices_ne0_idx * output_stride[1] +
-                                      indices_ne1_idx * output_stride[2] +
-                                      indices_ne2_idx * output_stride[3];
-
-        copy_in(input_offset, input_ne[0]);
-        LocalTensor<float> input_local = input_queue.DeQue<float>();
-        LocalTensor<float> output_local = output_queue.AllocTensor<float>();
-
-        DataCopy(output_local, input_local, local_buffer_elems);
-        output_queue.EnQue(output_local);
-        copy_out(output_offset, input_ne[0]);
-
-        input_queue.FreeTensor(input_local);
-    }
-
-    __aicore__ inline void calculate() {
-        for (int64_t i = ir; i < ir + dr; i++) {
-            calculate_row(i);
-        }
-    }
-
-   private:
-    int64_t input_ne[4];
-    size_t input_stride[4];
-
-    int64_t indices_ne[4];
-    size_t indices_stride[4];
-
-    int64_t output_ne[4];
-    size_t output_stride[4];
-
-    size_t local_buffer_elems;
-
-    int64_t ir;
-    int64_t dr;
-
-    TPipe pipe;
-    GlobalTensor<float> input_gm;
-    GlobalTensor<int32_t> indices_gm;
-    GlobalTensor<float> output_gm;
-    TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
-    TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
-    int64_t op_block_idx;
-};
-
-template <typename T>
-__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
-    auto gm_ptr = (__gm__ uint8_t *)gm;
-    auto ub_ptr = (uint8_t *)(ub);
-    for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
-        *ub_ptr = *gm_ptr;
-    }
-}
-
-extern "C" __global__ __aicore__ void ascendc_get_row_f32(
-    GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
-    GM_ADDR input_ne_gm, GM_ADDR input_nb_gm, GM_ADDR indices_ne_gm,
-    GM_ADDR indices_nb_gm, GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
-    int64_t input_ne_ub[4];
-    size_t input_nb_ub[4];
-    int64_t indices_ne_ub[4];
-    size_t indices_nb_ub[4];
-    int64_t output_ne_ub[4];
-    size_t output_nb_ub[4];
-
-    copy_to_ub(input_ne_gm, input_ne_ub, 32);
-    copy_to_ub(input_nb_gm, input_nb_ub, 32);
-    copy_to_ub(indices_ne_gm, indices_ne_ub, 32);
-    copy_to_ub(indices_nb_gm, indices_nb_ub, 32);
-    copy_to_ub(output_ne_gm, output_ne_ub, 32);
-    copy_to_ub(output_nb_gm, output_nb_ub, 32);
-
-    GET_ROW_F32 op;
-    op.init(input_gm, indices_gm, output_gm, input_ne_ub, input_nb_ub,
-            indices_ne_ub, indices_nb_ub, output_ne_ub, output_nb_ub);
-    op.calculate();
-}
--- a/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp
+++ b/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp
@@ -1,204 +0,0 @@
-#include "kernel_operator.h"
-
-// optimize me. Use template to avoid copy code.
-using namespace AscendC;
-#ifdef ASCEND_310P // 310P not support 4bit get row
-    extern "C" __global__ __aicore__ void ascendc_get_row_q4_0(
-        GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
-        GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm,
-        GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
-        // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed.
-        printf("Ascend310P not support 4bit get row.\n");
-    }
-#else
-
-#define BUFFER_NUM 2
-
-#define QK4_0 32
-
-class GET_ROW_Q4_0 {
-   public:
-    __aicore__ inline GET_ROW_Q4_0() {}
-    __aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output,
-                                int64_t *input_ne_ub, int64_t *indices_ne_ub,
-                                size_t *indices_nb_ub, int64_t *output_ne_ub,
-                                size_t *output_nb_ub) {
-        int64_t op_block_num = GetBlockNum();
-        int64_t op_block_idx = GetBlockIdx();
-
-        for (int i = 0; i < 4; i++) {
-            input_ne[i] = input_ne_ub[i];
-            indices_ne[i] = indices_ne_ub[i];
-            indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0];
-            scale_ne[i] = input_ne_ub[i];
-            output_ne[i] = output_ne_ub[i];
-            output_stride[i] = output_nb_ub[i] / output_nb_ub[0];
-        }
-
-        // one scale for a group.
-        scale_ne[0] /= QK4_0;
-
-        input_stride[0] = 1;
-        scale_stride[0] = 1;
-        output_stride[0] = 1;
-        for (int i = 1; i < 4; i++) {
-            input_stride[i] = input_stride[i - 1] * input_ne[i - 1];
-            scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
-        }
-
-        group_size_in_row = input_ne[0] / QK4_0;
-        int64_t scale_offset = input_ne[0] * input_ne[1] * input_ne[2] *
-                               input_ne[3] / 2;
-
-        // Indices has two dims. n_elements = all rows should get.
-        // dr, all rows should this thread get.
-        uint64_t n_elements =
-            indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3];
-        dr = n_elements / op_block_num;
-
-        uint64_t tails = n_elements % op_block_num;
-        if (op_block_idx < tails) {
-            dr += 1;
-            ir = dr * op_block_idx;
-        } else {
-            ir = dr * op_block_idx + tails;
-        }
-
-        input_gm.SetGlobalBuffer((__gm__ int4b_t *)input);
-        scale_gm.SetGlobalBuffer((__gm__ half *)(input + scale_offset));
-        indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices);
-        output_gm.SetGlobalBuffer((__gm__ float *)output);
-
-        pipe.InitBuffer(input_queue, BUFFER_NUM, QK4_0 * sizeof(int4b_t));
-        pipe.InitBuffer(cast_queue, BUFFER_NUM, QK4_0 * sizeof(half));
-        pipe.InitBuffer(output_queue, BUFFER_NUM, QK4_0 * sizeof(float));
-    }
-
-    __aicore__ inline void copy_in(uint32_t offset) {
-        LocalTensor<int4b_t> input_local = input_queue.AllocTensor<int4b_t>();
-        // 32 * sizeof(int4b_t) = 16, which is not aligned to 32, why no error?
-        DataCopy(input_local, input_gm[offset], QK4_0);
-        input_queue.EnQue(input_local);
-    }
-
-    __aicore__ inline void copy_out(uint32_t offset) {
-        LocalTensor<float> output_local = output_queue.DeQue<float>();
-        DataCopy(output_gm[offset], output_local, QK4_0);
-        output_queue.FreeTensor(output_local);
-    }
-
-    __aicore__ inline void calculate_group(int64_t idx, int64_t group) {
-        const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]);
-        const int64_t indices_ne1_idx =
-            (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) /
-            indices_ne[0];
-        const int64_t indices_ne0_idx =
-            (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] -
-             indices_ne1_idx * indices_ne[0]);
-
-        const int64_t indices_offset = indices_ne0_idx * indices_stride[0] +
-                                       indices_ne1_idx * indices_stride[1] +
-                                       indices_ne2_idx * indices_stride[2];
-        const int32_t selected_row_idx = indices_gm.GetValue(indices_offset);
-
-        const int64_t input_offset = selected_row_idx * input_stride[1] +
-                                     indices_ne1_idx * input_stride[2] +
-                                     indices_ne2_idx * input_stride[3] +
-                                     group * QK4_0;
-        const int64_t scale_offset = selected_row_idx * scale_stride[1] +
-                                     indices_ne1_idx * scale_stride[2] +
-                                     indices_ne2_idx * scale_stride[3] + group;
-        const int64_t output_offset = indices_ne0_idx * output_stride[1] +
-                                      indices_ne1_idx * output_stride[2] +
-                                      indices_ne2_idx * output_stride[3] +
-                                      group * QK4_0;
-
-        copy_in(input_offset);
-        LocalTensor<int4b_t> input_local = input_queue.DeQue<int4b_t>();
-        LocalTensor<half> cast_local = cast_queue.AllocTensor<half>();
-        LocalTensor<float> output_local = output_queue.AllocTensor<float>();
-
-        // TODO: cast more data to speed up.
-        Cast(cast_local, input_local, RoundMode::CAST_NONE, QK4_0);
-        Cast(output_local, cast_local, RoundMode::CAST_NONE, QK4_0);
-
-        // Only mul need compile by group.
-        half scale = scale_gm.GetValue(scale_offset);
-
-        Muls(output_local, output_local, (float)scale, QK4_0);
-
-        input_queue.FreeTensor(input_local);
-        cast_queue.FreeTensor(cast_local);
-        output_queue.EnQue(output_local);
-
-        copy_out(output_offset);
-    }
-
-    __aicore__ inline void calculate() {
-        for (int64_t i = ir; i < ir + dr; i++) {
-            for (int64_t j = 0; j < group_size_in_row; j++) {
-                calculate_group(i, j);
-            }
-        }
-    }
-
-   private:
-    int64_t input_ne[4];
-    size_t input_stride[4];
-
-    int64_t scale_ne[4];
-    size_t scale_stride[4];
-
-    int64_t indices_ne[4];
-    size_t indices_stride[4];
-
-    int64_t output_ne[4];
-    size_t output_stride[4];
-
-    int64_t ir;
-    int64_t dr;
-
-    int64_t group_size_in_row;
-
-    TPipe pipe;
-    GlobalTensor<int4b_t> input_gm;
-    GlobalTensor<half> scale_gm;
-    GlobalTensor<int32_t> indices_gm;
-    GlobalTensor<float> output_gm;
-    TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
-    TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
-    TQue<QuePosition::VECIN, BUFFER_NUM> cast_queue;
-};
-
-template <typename T>
-__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
-    auto gm_ptr = (__gm__ uint8_t *)gm;
-    auto ub_ptr = (uint8_t *)(ub);
-    for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
-        *ub_ptr = *gm_ptr;
-    }
-}
-
-extern "C" __global__ __aicore__ void ascendc_get_row_q4_0(
-    GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
-    GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm,
-    GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
-    int64_t input_ne_ub[4];
-    int64_t indices_ne_ub[4];
-    size_t indices_nb_ub[4];
-    int64_t output_ne_ub[4];
-    size_t output_nb_ub[4];
-
-    copy_to_ub(input_ne_gm, input_ne_ub, 32);
-    copy_to_ub(indices_ne_gm, indices_ne_ub, 32);
-    copy_to_ub(indices_nb_gm, indices_nb_ub, 32);
-    copy_to_ub(output_ne_gm, output_ne_ub, 32);
-    copy_to_ub(output_nb_gm, output_nb_ub, 32);
-
-    GET_ROW_Q4_0 op;
-    op.init(input_gm, indices_gm, output_gm, input_ne_ub, indices_ne_ub,
-            indices_nb_ub, output_ne_ub, output_nb_ub);
-    op.calculate();
-}
-
-#endif // #ifdef ASCEND_310P
--- a/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp
+++ b/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp
@@ -1,191 +0,0 @@
-#include "kernel_operator.h"
-
-// optimize me. Use template to avoid copy code.
-using namespace AscendC;
-
-#define BUFFER_NUM 2
-
-#define QK8_0 32
-
-class GET_ROW_Q8_0 {
-   public:
-    __aicore__ inline GET_ROW_Q8_0() {}
-    __aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output,
-                                int64_t *input_ne_ub, int64_t *indices_ne_ub,
-                                size_t *indices_nb_ub, int64_t *output_ne_ub,
-                                size_t *output_nb_ub) {
-        int64_t op_block_num = GetBlockNum();
-        int64_t op_block_idx = GetBlockIdx();
-
-        for (int i = 0; i < 4; i++) {
-            input_ne[i] = input_ne_ub[i];
-            indices_ne[i] = indices_ne_ub[i];
-            indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0];
-            scale_ne[i] = input_ne_ub[i];
-            output_ne[i] = output_ne_ub[i];
-            output_stride[i] = output_nb_ub[i] / output_nb_ub[0];
-        }
-
-        // one scale for a group.
-        scale_ne[0] /= QK8_0;
-
-        input_stride[0] = 1;
-        scale_stride[0] = 1;
-        output_stride[0] = 1;
-        for (int i = 1; i < 4; i++) {
-            input_stride[i] = input_stride[i - 1] * input_ne[i - 1];
-            scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
-        }
-
-        group_size_in_row = input_ne[0] / QK8_0;
-        int64_t scale_offset = input_ne[0] * input_ne[1] * input_ne[2] *
-                               input_ne[3] * sizeof(int8_t);
-
-        // Indices has two dims. n_elements = all rows should get.
-        // dr, all rows should this thread get.
-        uint64_t n_elements =
-            indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3];
-        dr = n_elements / op_block_num;
-
-        uint64_t tails = n_elements % op_block_num;
-        if (op_block_idx < tails) {
-            dr += 1;
-            ir = dr * op_block_idx;
-        } else {
-            ir = dr * op_block_idx + tails;
-        }
-
-        input_gm.SetGlobalBuffer((__gm__ int8_t *)input);
-        scale_gm.SetGlobalBuffer((__gm__ half *)(input + scale_offset));
-        indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices);
-        output_gm.SetGlobalBuffer((__gm__ float *)output);
-
-        pipe.InitBuffer(input_queue, BUFFER_NUM, QK8_0 * sizeof(int8_t));
-        pipe.InitBuffer(cast_queue, BUFFER_NUM, QK8_0 * sizeof(half));
-        pipe.InitBuffer(output_queue, BUFFER_NUM, QK8_0 * sizeof(float));
-    }
-
-    __aicore__ inline void copy_in(uint32_t offset) {
-        LocalTensor<int8_t> input_local = input_queue.AllocTensor<int8_t>();
-        DataCopy(input_local, input_gm[offset], QK8_0);
-        input_queue.EnQue(input_local);
-    }
-
-    __aicore__ inline void copy_out(uint32_t offset) {
-        LocalTensor<float> output_local = output_queue.DeQue<float>();
-        DataCopy(output_gm[offset], output_local, QK8_0);
-        output_queue.FreeTensor(output_local);
-    }
-
-    __aicore__ inline void calculate_group(int64_t idx, int64_t group) {
-        const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]);
-        const int64_t indices_ne1_idx =
-            (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) /
-            indices_ne[0];
-        const int64_t indices_ne0_idx =
-            (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] -
-             indices_ne1_idx * indices_ne[0]);
-
-        const int64_t indices_offset = indices_ne0_idx * indices_stride[0] +
-                                       indices_ne1_idx * indices_stride[1] +
-                                       indices_ne2_idx * indices_stride[2];
-        const int32_t selected_row_idx = indices_gm.GetValue(indices_offset);
-
-        const int64_t input_offset = selected_row_idx * input_stride[1] +
-                                     indices_ne1_idx * input_stride[2] +
-                                     indices_ne2_idx * input_stride[3] +
-                                     group * QK8_0;
-        const int64_t scale_offset = selected_row_idx * scale_stride[1] +
-                                     indices_ne1_idx * scale_stride[2] +
-                                     indices_ne2_idx * scale_stride[3] + group;
-        const int64_t output_offset = indices_ne0_idx * output_stride[1] +
-                                      indices_ne1_idx * output_stride[2] +
-                                      indices_ne2_idx * output_stride[3] +
-                                      group * QK8_0;
-
-        copy_in(input_offset);
-        LocalTensor<int8_t> input_local = input_queue.DeQue<int8_t>();
-        LocalTensor<half> cast_local = cast_queue.AllocTensor<half>();
-        LocalTensor<float> output_local = output_queue.AllocTensor<float>();
-
-        // TODO: cast more data to speed up.
-        Cast(cast_local, input_local, RoundMode::CAST_NONE, QK8_0);
-        Cast(output_local, cast_local, RoundMode::CAST_NONE, QK8_0);
-
-        // Only mul need compile by group.
-        half scale = scale_gm.GetValue(scale_offset);
-        Muls(output_local, output_local, (float)scale, QK8_0);
-
-        input_queue.FreeTensor(input_local);
-        cast_queue.FreeTensor(cast_local);
-        output_queue.EnQue(output_local);
-
-        copy_out(output_offset);
-    }
-
-    __aicore__ inline void calculate() {
-        for (int64_t i = ir; i < ir + dr; i++) {
-            for (int64_t j = 0; j < group_size_in_row; j++) {
-                calculate_group(i, j);
-            }
-        }
-    }
-
-   private:
-    int64_t input_ne[4];
-    size_t input_stride[4];
-
-    int64_t scale_ne[4];
-    size_t scale_stride[4];
-
-    int64_t indices_ne[4];
-    size_t indices_stride[4];
-
-    int64_t output_ne[4];
-    size_t output_stride[4];
-
-    int64_t ir;
-    int64_t dr;
-
-    int64_t group_size_in_row;
-
-    TPipe pipe;
-    GlobalTensor<int8_t> input_gm;
-    GlobalTensor<half> scale_gm;
-    GlobalTensor<int32_t> indices_gm;
-    GlobalTensor<float> output_gm;
-    TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
-    TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
-    TQue<QuePosition::VECIN, BUFFER_NUM> cast_queue;
-};
-
-template <typename T>
-__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
-    auto gm_ptr = (__gm__ uint8_t *)gm;
-    auto ub_ptr = (uint8_t *)(ub);
-    for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
-        *ub_ptr = *gm_ptr;
-    }
-}
-
-extern "C" __global__ __aicore__ void ascendc_get_row_q8_0(
-    GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
-    GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm,
-    GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
-    int64_t input_ne_ub[4];
-    int64_t indices_ne_ub[4];
-    size_t indices_nb_ub[4];
-    int64_t output_ne_ub[4];
-    size_t output_nb_ub[4];
-
-    copy_to_ub(input_ne_gm, input_ne_ub, 32);
-    copy_to_ub(indices_ne_gm, indices_ne_ub, 32);
-    copy_to_ub(indices_nb_gm, indices_nb_ub, 32);
-    copy_to_ub(output_ne_gm, output_ne_ub, 32);
-    copy_to_ub(output_nb_gm, output_nb_ub, 32);
-
-    GET_ROW_Q8_0 op;
-    op.init(input_gm, indices_gm, output_gm, input_ne_ub, indices_ne_ub,
-            indices_nb_ub, output_ne_ub, output_nb_ub);
-    op.calculate();
-}
--- a/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp
+++ b/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp
@@ -1,218 +0,0 @@
-#include "kernel_operator.h"
-
-using namespace AscendC;
-#ifdef ASCEND_310P
-    extern "C" __global__ __aicore__ void ascendc_quantize_f16_q8_0(
-        GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
-        GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
-        // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed.
-        printf("Ascend310P not support f16->8bit quantization.\n");
-    }
-#else
-
-#define BUFFER_NUM 2
-#define QK8_0 32
-
-class QUANTIZE_F16_Q8_0 {
-   public:
-    __aicore__ inline QUANTIZE_F16_Q8_0() {}
-    __aicore__ inline void init(GM_ADDR input, GM_ADDR output,
-                                int64_t *input_ne_ub, size_t *input_nb_ub,
-                                int64_t *output_ne_ub) {
-        int64_t op_block_num = GetBlockNum();
-        int64_t op_block_idx = GetBlockIdx();
-
-        for (int i = 0; i < 4; i++) {
-            input_ne[i] = input_ne_ub[i];
-            input_stride[i] = input_nb_ub[i] / input_nb_ub[0];
-
-            output_ne[i] = output_ne_ub[i];
-        }
-
-        output_stride[0] = 1;
-        for (int i = 1; i < 4; i++) {
-            output_stride[i] = output_stride[i - 1] * output_ne[i - 1];
-        }
-
-        scale_ne = input_ne;
-        scale_stride[0] = 1;
-        scale_stride[1] = input_ne[0] / QK8_0;
-        for (int i = 2; i < 4; i++) {
-            scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
-        }
-
-        // split input tensor by rows.
-        uint64_t nr = input_ne[1] * input_ne[2] * input_ne[3];
-        dr = nr / op_block_num;
-
-        uint64_t tails = nr % op_block_num;
-        if (op_block_idx < tails) {
-            dr += 1;
-            ir = dr * op_block_idx;
-        } else {
-            ir = dr * op_block_idx + tails;
-        }
-
-        group_size_in_row = scale_stride[1];
-        int64_t output_size = output_ne[0] * output_ne[1] * output_ne[2] *
-                              output_ne[3] * sizeof(uint8_t);
-
-        input_gm.SetGlobalBuffer((__gm__ half *)input);
-        output_gm.SetGlobalBuffer((__gm__ int8_t *)output);
-        scale_gm.SetGlobalBuffer((__gm__ half *)(output + output_size + ir *
-                                                 group_size_in_row *
-                                                 sizeof(half)));
-
-        pipe.InitBuffer(input_queue, BUFFER_NUM, QK8_0 * sizeof(half));
-        pipe.InitBuffer(output_queue, BUFFER_NUM, QK8_0 * sizeof(int8_t));
-        pipe.InitBuffer(work_queue, 1, 32);
-        pipe.InitBuffer(max_queue, 1, 32);
-        pipe.InitBuffer(abs_queue, 1, QK8_0 * sizeof(float));
-        pipe.InitBuffer(scale_queue, 1, 32);
-        pipe.InitBuffer(cast_queue ,1 ,QK8_0 * sizeof(float));
-    }
-
-    __aicore__ inline void copy_in(uint32_t offset) {
-        LocalTensor<half> input_local = input_queue.AllocTensor<half>();
-        DataCopy(input_local, input_gm[offset], QK8_0);
-        input_queue.EnQue(input_local);
-    }
-
-    __aicore__ inline void copy_out(uint32_t offset) {
-        LocalTensor<int8_t> output_local = output_queue.DeQue<int8_t>();
-        DataCopy(output_gm[offset], output_local, QK8_0);
-        output_queue.FreeTensor(output_local);
-    }
-
-    __aicore__ inline half calculate_group(int64_t row, int64_t group) {
-        const int64_t i3 = row / (input_ne[1] * input_ne[2]);
-        const int64_t i2 = (row - i3 * input_ne[1] * input_ne[2]) / input_ne[1];
-        const int64_t i1 =
-            row - i3 * input_ne[1] * input_ne[2] - i2 * input_ne[1];
-
-        const int64_t input_offset = i1 * input_stride[1] +
-                                     i2 * input_stride[2] +
-                                     i3 * input_stride[3] + QK8_0 * group;
-
-        const int64_t output_offset = i1 * output_stride[1] +
-                                      i2 * output_stride[2] +
-                                      i3 * output_stride[3] + QK8_0 * group;
-
-        copy_in(input_offset);
-        LocalTensor<half> input_local = input_queue.DeQue<half>();
-        LocalTensor<int8_t> output_local = output_queue.AllocTensor<int8_t>();
-        LocalTensor<float> work_local = work_queue.AllocTensor<float>();
-        LocalTensor<float> abs_local = abs_queue.AllocTensor<float>();
-        LocalTensor<float> max_local = max_queue.AllocTensor<float>();
-        LocalTensor<float> cast_local = cast_queue.AllocTensor<float>();
-
-        Cast(cast_local, input_local, RoundMode::CAST_NONE, QK8_0);
-        Abs(abs_local, cast_local, QK8_0);
-        ReduceMax(max_local, abs_local, work_local, QK8_0);
-
-        pipe_barrier(PIPE_ALL);
-        float d = max_local.GetValue(0);
-        d = d / ((1 << 7) - 1);
-        if (d != 0) {
-            Muls(cast_local, cast_local, 1.0f / d, QK8_0);
-        }
-
-        Cast(cast_local, cast_local, RoundMode::CAST_ROUND, QK8_0);
-        Cast(input_local, cast_local, RoundMode::CAST_ROUND, QK8_0);
-        Cast(output_local, input_local, RoundMode::CAST_ROUND, QK8_0);
-        output_queue.EnQue(output_local);
-        copy_out(output_offset);
-
-        input_queue.FreeTensor(input_local);
-        work_queue.FreeTensor(work_local);
-        abs_queue.FreeTensor(abs_local);
-        max_queue.FreeTensor(max_local);
-        cast_queue.FreeTensor(cast_local);
-        return (half)d;
-    }
-
-    __aicore__ inline void calculate() {
-        LocalTensor<half> scale_local = scale_queue.AllocTensor<half>();
-        uint32_t scale_local_offset = 0;
-        uint32_t scale_global_offset = 0;
-        for (int64_t i = ir; i < ir + dr; i++) {
-            for (int64_t j = 0; j < group_size_in_row; j++) {
-                half scale = calculate_group(i, j);
-                scale_local.SetValue(scale_local_offset++, scale);
-                if (scale_local_offset == 16) {
-                    scale_local_offset = 0;
-                    // TODO: OPTIMIZE ME
-                    pipe_barrier(PIPE_ALL);
-                    DataCopy(scale_gm[scale_global_offset], scale_local, 16);
-                    pipe_barrier(PIPE_ALL);
-                    scale_global_offset += 16;
-                }
-            }
-        }
-
-        if (scale_local_offset != 0) {
-            pipe_barrier(PIPE_ALL);
-            DataCopyExtParams dataCopyParams;
-            dataCopyParams.blockCount = 1;
-            dataCopyParams.blockLen = scale_local_offset * sizeof(half);
-            DataCopyPad(scale_gm[scale_global_offset], scale_local,
-                        dataCopyParams);
-            pipe_barrier(PIPE_ALL);
-        }
-    }
-
-   private:
-    int64_t input_ne[4];
-    size_t input_stride[4];
-
-    int64_t *scale_ne;
-    size_t scale_stride[4];
-
-    int64_t output_ne[4];
-    size_t output_stride[4];
-
-    int64_t group_size_in_row;
-
-    int64_t ir;
-    int64_t dr;
-
-    TPipe pipe;
-    GlobalTensor<half> input_gm;
-    GlobalTensor<half> scale_gm;
-    GlobalTensor<int8_t> output_gm;
-    TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
-    TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
-    TQue<QuePosition::VECIN, 1> work_queue;
-    TQue<QuePosition::VECOUT, 1> max_queue;
-    TQue<QuePosition::VECIN, 1> abs_queue;
-    TQue<QuePosition::VECOUT, 1> scale_queue;
-    TQue<QuePosition::VECOUT, 1> cast_queue;
-
-};
-
-template <typename T>
-__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
-    auto gm_ptr = (__gm__ uint8_t *)gm;
-    auto ub_ptr = (uint8_t *)(ub);
-    for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
-        *ub_ptr = *gm_ptr;
-    }
-}
-
-extern "C" __global__ __aicore__ void ascendc_quantize_f16_q8_0(
-    GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
-    GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
-    int64_t input_ne_ub[4];
-    size_t input_nb_ub[4];
-    int64_t output_ne_ub[4];
-
-    copy_to_ub(input_ne_gm, input_ne_ub, 32);
-    copy_to_ub(input_nb_gm, input_nb_ub, 32);
-    copy_to_ub(output_ne_gm, output_ne_ub, 32);
-
-    QUANTIZE_F16_Q8_0 op;
-    op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
-    op.calculate();
-}
-
-#endif // #ifdef ASCEND_310P
--- a/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp
+++ b/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp
@@ -1,216 +0,0 @@
-#include "kernel_operator.h"
-
-using namespace AscendC;
-#ifdef ASCEND_310P // 310P not support f32->8bit quantization
-    extern "C" __global__ __aicore__ void ascendc_quantize_f32_q8_0(
-        GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
-        GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
-        // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed.
-        printf("Ascend310P not support f32->8bit quantization.\n");
-    }
-#else
-
-#define BUFFER_NUM 2
-#define QK8_0 32
-
-class QUANTIZE_F32_Q8_0 {
-   public:
-    __aicore__ inline QUANTIZE_F32_Q8_0() {}
-    __aicore__ inline void init(GM_ADDR input, GM_ADDR output,
-                                int64_t *input_ne_ub, size_t *input_nb_ub,
-                                int64_t *output_ne_ub) {
-        int64_t op_block_num = GetBlockNum();
-        int64_t op_block_idx = GetBlockIdx();
-
-        for (int i = 0; i < 4; i++) {
-            input_ne[i] = input_ne_ub[i];
-            input_stride[i] = input_nb_ub[i] / input_nb_ub[0];
-
-            output_ne[i] = output_ne_ub[i];
-        }
-
-        output_stride[0] = 1;
-        for (int i = 1; i < 4; i++) {
-            output_stride[i] = output_stride[i - 1] * output_ne[i - 1];
-        }
-
-        scale_ne = input_ne;
-        scale_stride[0] = 1;
-        scale_stride[1] = input_ne[0] / QK8_0;
-        for (int i = 2; i < 4; i++) {
-            scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
-        }
-
-        // split input tensor by rows.
-        uint64_t nr = input_ne[1] * input_ne[2] * input_ne[3];
-        dr = nr / op_block_num;
-
-        uint64_t tails = nr % op_block_num;
-        if (op_block_idx < tails) {
-            dr += 1;
-            ir = dr * op_block_idx;
-        } else {
-            ir = dr * op_block_idx + tails;
-        }
-
-        group_size_in_row = scale_stride[1];
-        int64_t output_size = output_ne[0] * output_ne[1] * output_ne[2] *
-                              output_ne[3] * sizeof(uint8_t);
-
-        input_gm.SetGlobalBuffer((__gm__ float *)input);
-        output_gm.SetGlobalBuffer((__gm__ int8_t *)output);
-        scale_gm.SetGlobalBuffer((__gm__ half *)(output + output_size +
-                                                 ir * group_size_in_row *
-                                                 sizeof(half)));
-
-        pipe.InitBuffer(input_queue, BUFFER_NUM, QK8_0 * sizeof(float));
-        pipe.InitBuffer(output_queue, BUFFER_NUM, QK8_0 * sizeof(int8_t));
-        pipe.InitBuffer(work_queue, 1, 32);
-        pipe.InitBuffer(max_queue, 1, 32);
-        pipe.InitBuffer(abs_queue, 1, QK8_0 * sizeof(float));
-        pipe.InitBuffer(cast_queue, 1, QK8_0 * sizeof(half));
-        pipe.InitBuffer(scale_queue, 1, 32);
-    }
-
-    __aicore__ inline void copy_in(uint32_t offset) {
-        LocalTensor<float> input_local = input_queue.AllocTensor<float>();
-        DataCopy(input_local, input_gm[offset], QK8_0);
-        input_queue.EnQue(input_local);
-    }
-
-    __aicore__ inline void copy_out(uint32_t offset) {
-        LocalTensor<int8_t> output_local = output_queue.DeQue<int8_t>();
-        DataCopy(output_gm[offset], output_local, QK8_0);
-        output_queue.FreeTensor(output_local);
-    }
-
-    __aicore__ inline half calculate_group(int64_t row, int64_t group) {
-        const int64_t i3 = row / (input_ne[1] * input_ne[2]);
-        const int64_t i2 = (row - i3 * input_ne[1] * input_ne[2]) / input_ne[1];
-        const int64_t i1 =
-            row - i3 * input_ne[1] * input_ne[2] - i2 * input_ne[1];
-
-        const int64_t input_offset = i1 * input_stride[1] +
-                                     i2 * input_stride[2] +
-                                     i3 * input_stride[3] + QK8_0 * group;
-
-        const int64_t output_offset = i1 * output_stride[1] +
-                                      i2 * output_stride[2] +
-                                      i3 * output_stride[3] + QK8_0 * group;
-
-        copy_in(input_offset);
-        LocalTensor<float> input_local = input_queue.DeQue<float>();
-        LocalTensor<int8_t> output_local = output_queue.AllocTensor<int8_t>();
-        LocalTensor<float> work_local = work_queue.AllocTensor<float>();
-        LocalTensor<float> abs_local = abs_queue.AllocTensor<float>();
-        LocalTensor<float> max_local = max_queue.AllocTensor<float>();
-        LocalTensor<half> cast_local = cast_queue.AllocTensor<half>();
-
-        Abs(abs_local, input_local, QK8_0);
-        ReduceMax(max_local, abs_local, work_local, QK8_0);
-        pipe_barrier(PIPE_ALL);
-        float d = max_local.GetValue(0);
-        d = d / ((1 << 7) - 1);
-        if (d != 0) {
-            Muls(input_local, input_local, 1.0f / d, QK8_0);
-        }
-
-        Cast(input_local, input_local, RoundMode::CAST_ROUND, QK8_0);
-        Cast(cast_local, input_local, RoundMode::CAST_ROUND, QK8_0);
-        Cast(output_local, cast_local, RoundMode::CAST_ROUND, QK8_0);
-        output_queue.EnQue(output_local);
-        copy_out(output_offset);
-
-        input_queue.FreeTensor(input_local);
-        work_queue.FreeTensor(work_local);
-        abs_queue.FreeTensor(abs_local);
-        max_queue.FreeTensor(max_local);
-        cast_queue.FreeTensor(cast_local);
-
-        return (half)d;
-    }
-
-    __aicore__ inline void calculate() {
-        LocalTensor<half> scale_local = scale_queue.AllocTensor<half>();
-        uint32_t scale_local_offset = 0;
-        uint32_t scale_global_offset = 0;
-        for (int64_t i = ir; i < ir + dr; i++) {
-            for (int64_t j = 0; j < group_size_in_row; j++) {
-                half scale = calculate_group(i, j);
-                scale_local.SetValue(scale_local_offset++, scale);
-                if (scale_local_offset == 16) {
-                    scale_local_offset = 0;
-                    // TODO: OPTIMIZE ME
-                    pipe_barrier(PIPE_ALL);
-                    DataCopy(scale_gm[scale_global_offset], scale_local, 16);
-                    pipe_barrier(PIPE_ALL);
-                    scale_global_offset += 16;
-                }
-            }
-        }
-
-        if (scale_local_offset != 0) {
-            pipe_barrier(PIPE_ALL);
-            DataCopyExtParams dataCopyParams;
-            dataCopyParams.blockCount = 1;
-            dataCopyParams.blockLen = scale_local_offset * sizeof(half);
-            DataCopyPad(scale_gm[scale_global_offset], scale_local,
-                        dataCopyParams);
-            pipe_barrier(PIPE_ALL);
-        }
-    }
-
-   private:
-    int64_t input_ne[4];
-    size_t input_stride[4];
-
-    int64_t *scale_ne;
-    size_t scale_stride[4];
-
-    int64_t output_ne[4];
-    size_t output_stride[4];
-
-    int64_t group_size_in_row;
-
-    int64_t ir;
-    int64_t dr;
-
-    TPipe pipe;
-    GlobalTensor<float> input_gm;
-    GlobalTensor<half> scale_gm;
-    GlobalTensor<int8_t> output_gm;
-    TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
-    TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
-    TQue<QuePosition::VECIN, 1> work_queue;
-    TQue<QuePosition::VECOUT, 1> max_queue;
-    TQue<QuePosition::VECIN, 1> abs_queue;
-    TQue<QuePosition::VECIN, 1> cast_queue;
-    TQue<QuePosition::VECOUT, 1> scale_queue;
-};
-
-template <typename T>
-__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
-    auto gm_ptr = (__gm__ uint8_t *)gm;
-    auto ub_ptr = (uint8_t *)(ub);
-    for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
-        *ub_ptr = *gm_ptr;
-    }
-}
-
-extern "C" __global__ __aicore__ void ascendc_quantize_f32_q8_0(
-    GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
-    GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
-    int64_t input_ne_ub[4];
-    size_t input_nb_ub[4];
-    int64_t output_ne_ub[4];
-
-    copy_to_ub(input_ne_gm, input_ne_ub, 32);
-    copy_to_ub(input_nb_gm, input_nb_ub, 32);
-    copy_to_ub(output_ne_gm, output_ne_ub, 32);
-
-    QUANTIZE_F32_Q8_0 op;
-    op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
-    op.calculate();
-}
-
-#endif // #ifdef ASCEND_310P
--- a/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp
+++ b/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp
@@ -1,295 +0,0 @@
-#include "kernel_operator.h"
-
-using namespace AscendC;
-#ifdef ASCEND_310P // 310P not support float->4bit quantization
-    extern "C" __global__ __aicore__ void ascendc_quantize_f32_to_q4_0(
-        GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
-        GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
-        // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed.
-        printf("Ascend310P not support f32->4bit quantization.\n");
-    }
-
-    extern "C" __global__ __aicore__ void ascendc_quantize_f16_to_q4_0(
-        GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
-        GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
-        // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed.
-        printf("Ascend310P not support f16->4bit quantization.\n");
-    }
-#else
-
-#define BUFFER_NUM 2
-#define Group_Size 32
-
-template <typename SRC_T>
-class QUANTIZE_FLOAT_TO_Q4_0 {
-   public:
-    __aicore__ inline QUANTIZE_FLOAT_TO_Q4_0() {}
-    __aicore__ inline void init(GM_ADDR input, GM_ADDR output,
-                                int64_t *input_ne_ub, size_t *input_nb_ub,
-                                int64_t *output_ne_ub) {
-        // TODO: fix test_case CPY(type_src=f16,type_dst=q4_0,ne=[256,4,4,4],
-        //                         permute=[0,0,0,0]):
-        // [CPY] NMSE = 0.000008343 > 0.000001000 FAIL
-        int64_t op_block_num = GetBlockNum();
-        int64_t op_block_idx = GetBlockIdx();
-
-        // input stride of data elements
-        for (int i = 0; i < 4; i++) {
-            input_ne[i] = input_ne_ub[i];
-            input_stride[i] = input_nb_ub[i] / input_nb_ub[0];
-            output_ne[i] = output_ne_ub[i];
-        }
-
-        // output stride of data elements
-        output_stride[0] = 1;
-        for (int i = 1; i < 4; i++) {
-            output_stride[i] = output_stride[i - 1] * output_ne[i - 1];
-        }
-
-        // scale saved one by one after data:. [group1_scale, group2_scale, ...]
-        scale_ne = input_ne;
-        scale_stride[0] = 1;
-        scale_stride[1] = input_ne[0] / Group_Size;
-        for (int i = 2; i < 4; i++) {
-            scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
-        }
-
-        // split input tensor by rows.
-        uint64_t nr = input_ne[1] * input_ne[2] * input_ne[3];
-        dr = nr / op_block_num;
-
-        uint64_t tails = nr % op_block_num;
-        if (op_block_idx < tails) {
-            dr += 1;
-            ir = dr * op_block_idx;
-        } else {
-            ir = dr * op_block_idx + tails;
-        }
-
-        group_size_in_row = scale_stride[1];
-        int64_t scale_offset = output_ne[0] * output_ne[1] * output_ne[2] *
-                              output_ne[3] * sizeof(uint8_t) / 2;
-
-        input_gm.SetGlobalBuffer((__gm__ SRC_T *)input);
-        output_gm.SetGlobalBuffer((__gm__ int8_t *)output);
-        scale_gm.SetGlobalBuffer((__gm__ half *)(output + scale_offset + ir *
-                                                 group_size_in_row *
-                                                 sizeof(half)));
-
-        pipe.InitBuffer(input_queue, BUFFER_NUM, Group_Size * sizeof(SRC_T));
-        pipe.InitBuffer(output_queue, BUFFER_NUM,
-                            Group_Size * sizeof(int8_t) / 2);
-        pipe.InitBuffer(cast_queue , 1, Group_Size * sizeof(float));
-        pipe.InitBuffer(work_queue, 1, Group_Size * sizeof(float));
-        pipe.InitBuffer(max_queue, 1, Group_Size * sizeof(float));
-        pipe.InitBuffer(min_queue, 1, Group_Size * sizeof(float));
-        pipe.InitBuffer(scale_queue, 1, Group_Size / 2 * sizeof(half));
-        pipe.InitBuffer(int8_queue, 1, Group_Size * sizeof(int8_t));
-        pipe.InitBuffer(half_queue, 1, Group_Size * sizeof(half));
-    }
-
-    __aicore__ inline void copy_in(uint32_t offset) {
-        LocalTensor<SRC_T> input_local = input_queue.AllocTensor<SRC_T>();
-        DataCopy(input_local, input_gm[offset], Group_Size);
-        input_queue.EnQue(input_local);
-    }
-
-    __aicore__ inline void copy_out(uint32_t offset) {
-        // reinterpretcast Group_Size(32) * int4b_t to Group_Size / 2 * int8_t,
-        // and using DataCopyPad to avoid 32 bits align.
-        LocalTensor<int4b_t> output_local = output_queue.DeQue<int4b_t>();
-        LocalTensor<int8_t> output_int8_local =
-                                    output_local.ReinterpretCast<int8_t>();
-
-        DataCopyExtParams dataCopyParams;
-        dataCopyParams.blockCount = 1;
-        dataCopyParams.blockLen = Group_Size / 2  * sizeof(int8_t);
-        DataCopyPad(output_gm[offset], output_int8_local, dataCopyParams);
-
-        output_queue.FreeTensor(output_local);
-    }
-
-    __aicore__ inline void input_to_cast(LocalTensor<float> cast_local,
-                                         LocalTensor<float> input_local) {
-        DataCopy(cast_local, input_local, Group_Size);
-    }
-
-    __aicore__ inline void input_to_cast(LocalTensor<float> cast_local,
-                                         LocalTensor<half> input_local) {
-        Cast(cast_local, input_local, RoundMode::CAST_NONE, Group_Size);
-    }
-
-    __aicore__ inline half calculate_group(int64_t row, int64_t group) {
-        const int64_t i3 = row / (input_ne[1] * input_ne[2]);
-        const int64_t i2 = (row - i3 * input_ne[1] * input_ne[2]) / input_ne[1];
-        const int64_t i1 =
-            row - i3 * input_ne[1] * input_ne[2] - i2 * input_ne[1];
-
-        const int64_t input_offset = i1 * input_stride[1] +
-                                     i2 * input_stride[2] +
-                                     i3 * input_stride[3] + Group_Size * group;
-
-        // output_offset is stride for output_gm which datatype is int8_t and
-        // divided by 2 is needed for int4b_t.
-        const int64_t output_offset = (i1 * output_stride[1] +
-                                       i2 * output_stride[2] +
-                                       i3 * output_stride[3] +
-                                       Group_Size * group) / 2;
-        copy_in(input_offset);
-
-        LocalTensor<SRC_T> input_local = input_queue.DeQue<SRC_T>();
-        LocalTensor<int4b_t> output_local = output_queue.AllocTensor<int4b_t>();
-        LocalTensor<float> cast_local = cast_queue.AllocTensor<float>();
-        LocalTensor<float> work_local = work_queue.AllocTensor<float>();
-        LocalTensor<float> max_local = max_queue.AllocTensor<float>();
-        LocalTensor<float> min_local = min_queue.AllocTensor<float>();
-        LocalTensor<int8_t> int8_local = int8_queue.AllocTensor<int8_t>();
-        LocalTensor<half> half_local = half_queue.AllocTensor<half>();
-
-        input_to_cast(cast_local, input_local);
-
-        ReduceMax(max_local, cast_local, work_local, Group_Size);
-        ReduceMin(min_local, cast_local, work_local, Group_Size);
-        const float max_value = max_local.GetValue(0);
-        const float min_value = min_local.GetValue(0);
-        float d = max_value;
-        if (min_value < 0 && (-1 * min_value) > max_value) {
-            d = min_value;
-        }
-
-        d = d / (-8);
-        if (d != 0) {
-            Muls(cast_local, cast_local, 1.0f / d, Group_Size);
-        }
-
-        // range: [-8,8] -> [0.5,16.5] -> [0,16] -> [0,15] -> [-8,7]
-        float scalar = 8.5f;
-        Adds(cast_local, cast_local, scalar, Group_Size);
-        Cast(cast_local, cast_local, RoundMode::CAST_FLOOR, Group_Size);
-        scalar = 15.0f;
-        Mins(cast_local, cast_local, scalar, Group_Size);
-        scalar = -8.0f;
-        Adds(cast_local, cast_local, scalar, Group_Size);
-
-        // float->half->int4b
-        Cast(half_local, cast_local, RoundMode::CAST_NONE, Group_Size);
-        Cast(output_local, half_local, RoundMode::CAST_NONE, Group_Size);
-
-        output_queue.EnQue(output_local);
-        copy_out(output_offset);
-
-        input_queue.FreeTensor(input_local);
-        work_queue.FreeTensor(work_local);
-        max_queue.FreeTensor(max_local);
-        min_queue.FreeTensor(min_local);
-        int8_queue.FreeTensor(int8_local);
-        half_queue.FreeTensor(half_local);
-        cast_queue.FreeTensor(cast_local);
-        return (half)d;
-    }
-
-    __aicore__ inline void calculate() {
-        LocalTensor<half> scale_local = scale_queue.AllocTensor<half>();
-        uint32_t scale_local_offset = 0;
-        uint32_t scale_global_offset = 0;
-        for (int64_t i = ir; i < ir + dr; i++) {
-            for (int64_t j = 0; j < group_size_in_row; j++) {
-                half scale = calculate_group(i, j);
-                scale_local.SetValue(scale_local_offset++, scale);
-                // Copy Group_Size/2 length data each time.
-                if (scale_local_offset == Group_Size / 2) {
-                    scale_local_offset = 0;
-                    // TODO: OPTIMIZE ME
-                    pipe_barrier(PIPE_ALL);
-                    DataCopy(scale_gm[scale_global_offset], scale_local,
-                                      Group_Size / 2);
-                    pipe_barrier(PIPE_ALL);
-                    scale_global_offset += Group_Size / 2;
-                }
-            }
-        }
-
-        if (scale_local_offset != 0) {
-            pipe_barrier(PIPE_ALL);
-            DataCopyExtParams dataCopyParams;
-            dataCopyParams.blockCount = 1;
-            dataCopyParams.blockLen = scale_local_offset * sizeof(half);
-            DataCopyPad(scale_gm[scale_global_offset], scale_local,
-                        dataCopyParams);
-            pipe_barrier(PIPE_ALL);
-        }
-        scale_queue.FreeTensor(scale_local);
-    }
-
-   private:
-    int64_t input_ne[4];
-    size_t input_stride[4];
-
-    int64_t *scale_ne;
-    size_t scale_stride[4];
-
-    int64_t output_ne[4];
-    size_t output_stride[4];
-
-    int64_t group_size_in_row;
-
-    int64_t ir;
-    int64_t dr;
-
-    TPipe pipe;
-    GlobalTensor<SRC_T> input_gm;
-    GlobalTensor<half> scale_gm;
-    GlobalTensor<int8_t> output_gm;
-    TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
-    TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
-    TQue<QuePosition::VECIN, BUFFER_NUM> work_queue;
-    TQue<QuePosition::VECOUT, BUFFER_NUM> max_queue;
-    TQue<QuePosition::VECOUT, BUFFER_NUM> min_queue;
-    TQue<QuePosition::VECOUT, BUFFER_NUM> scale_queue;
-    TQue<QuePosition::VECOUT, BUFFER_NUM> cast_queue;
-    TQue<QuePosition::VECOUT, BUFFER_NUM> int8_queue;
-    TQue<QuePosition::VECOUT, BUFFER_NUM> half_queue;
-};
-
-template <typename T>
-__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
-    auto gm_ptr = (__gm__ uint8_t *)gm;
-    auto ub_ptr = (uint8_t *)(ub);
-    for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
-        *ub_ptr = *gm_ptr;
-    }
-}
-
-extern "C" __global__ __aicore__ void ascendc_quantize_f16_to_q4_0(
-    GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
-    GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
-    int64_t input_ne_ub[4];
-    size_t input_nb_ub[4];
-    int64_t output_ne_ub[4];
-
-    copy_to_ub(input_ne_gm, input_ne_ub, 32);
-    copy_to_ub(input_nb_gm, input_nb_ub, 32);
-    copy_to_ub(output_ne_gm, output_ne_ub, 32);
-
-    QUANTIZE_FLOAT_TO_Q4_0<half> op;
-    op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
-    op.calculate();
-}
-
-extern "C" __global__ __aicore__ void ascendc_quantize_f32_to_q4_0(
-    GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
-    GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
-    int64_t input_ne_ub[4];
-    size_t input_nb_ub[4];
-    int64_t output_ne_ub[4];
-
-    copy_to_ub(input_ne_gm, input_ne_ub, 32);
-    copy_to_ub(input_nb_gm, input_nb_ub, 32);
-    copy_to_ub(output_ne_gm, output_ne_ub, 32);
-
-    QUANTIZE_FLOAT_TO_Q4_0<float> op;
-    op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
-    op.calculate();
-}
-
-#endif // #ifdef ASCEND_310P
--- a/ggml/src/ggml-common.h
+++ b/ggml/src/ggml-common.h
@@ -158,6 +158,12 @@ typedef sycl::half2 ggml_half2;

 #endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP

+#ifdef _MSC_VER
+#define GGML_EXTENSION
+#else // _MSC_VER
+#define GGML_EXTENSION __extension__
+#endif // _MSC_VER
+
 #define QK4_0 32
 typedef struct {
    ggml_half d;           // delta
@@ -167,7 +173,7 @@ static_assert(sizeof(block_q4_0) == sizeof(ggml_half) + QK4_0 / 2, "wrong q4_0 b

 #define QK4_1 32
 typedef struct {
-    union {
+    GGML_EXTENSION union {
        struct {
            ggml_half d; // delta
            ggml_half m; // min
@@ -188,7 +194,7 @@ static_assert(sizeof(block_q5_0) == sizeof(ggml_half) + sizeof(uint32_t) + QK5_0

 #define QK5_1 32
 typedef struct {
-    union {
+    GGML_EXTENSION union {
        struct {
            ggml_half d; // delta
            ggml_half m; // min
@@ -209,7 +215,7 @@ static_assert(sizeof(block_q8_0) == sizeof(ggml_half) + QK8_0, "wrong q8_0 block

 #define QK8_1 32
 typedef struct {
-    union {
+    GGML_EXTENSION union {
        struct {
            ggml_half d; // delta
            ggml_half s; // d * sum(qs[i])
@@ -250,7 +256,7 @@ static_assert(sizeof(block_tq2_0) == sizeof(ggml_half) + QK_K / 4, "wrong tq2_0
 typedef struct {
    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
    uint8_t qs[QK_K/4];      // quants
-    union {
+    GGML_EXTENSION union {
        struct {
            ggml_half d;    // super-block scale for quantized scales
            ggml_half dmin; // super-block scale for quantized mins
@@ -277,7 +283,7 @@ static_assert(sizeof(block_q3_K) == sizeof(ggml_half) + QK_K / 4 + QK_K / 8 + 12
 // weight is represented as x = a * q + b
 // Effectively 4.5 bits per weight
 typedef struct {
-    union {
+    GGML_EXTENSION union {
        struct {
            ggml_half d;    // super-block scale for quantized scales
            ggml_half dmin; // super-block scale for quantized mins
@@ -294,7 +300,7 @@ static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2,
 // weight is represented as x = a * q + b
 // Effectively 5.5 bits per weight
 typedef struct {
-    union {
+    GGML_EXTENSION union {
        struct {
            ggml_half d;    // super-block scale for quantized scales
            ggml_half dmin; // super-block scale for quantized mins
--- a/ggml/src/ggml-cpu/CMakeLists.txt
+++ b/ggml/src/ggml-cpu/CMakeLists.txt
@@ -23,6 +23,16 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
        ggml-cpu/amx/mmq.cpp
        ggml-cpu/amx/mmq.h
        ggml-cpu/ggml-cpu-impl.h
+        ggml-cpu/common.h
+        ggml-cpu/binary-ops.h
+        ggml-cpu/binary-ops.cpp
+        ggml-cpu/unary-ops.h
+        ggml-cpu/unary-ops.cpp
+        ggml-cpu/simd-mappings.h
+        ggml-cpu/vec.h
+        ggml-cpu/vec.cpp
+        ggml-cpu/ops.h
+        ggml-cpu/ops.cpp
        )

    target_compile_features(${GGML_CPU_NAME} PRIVATE c_std_11 cxx_std_17)
@@ -289,23 +299,29 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
        endif()
    elseif ("${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "ppc64le " OR "${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "powerpc ")
        message(STATUS "PowerPC detected")
-        if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
-            file(READ "/proc/cpuinfo" POWER10_M)
-        elseif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "powerpc")
-            execute_process(COMMAND bash -c "prtconf |grep 'Implementation' | head -n 1" OUTPUT_VARIABLE POWER10_M)
-        endif()
+        if (GGML_NATIVE)
+            if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
+                file(READ "/proc/cpuinfo" POWER10_M)
+            elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "powerpc")
+                execute_process(COMMAND bash -c "prtconf |grep 'Implementation' | head -n 1" OUTPUT_VARIABLE POWER10_M)
+            endif()

-        string(REGEX MATCHALL "POWER *([0-9]+)" MATCHED_STRING "${POWER10_M}")
-        string(REGEX REPLACE "POWER *([0-9]+)" "\\1" EXTRACTED_NUMBER "${MATCHED_STRING}")
+            string(REGEX MATCHALL "POWER *([0-9]+)" MATCHED_STRING "${POWER10_M}")
+            string(REGEX REPLACE "POWER *([0-9]+)" "\\1" EXTRACTED_NUMBER "${MATCHED_STRING}")

-        if (EXTRACTED_NUMBER GREATER_EQUAL 10)
-            list(APPEND ARCH_FLAGS -mcpu=power10 -mpowerpc64)
-        elseif (EXTRACTED_NUMBER EQUAL 9)
-            list(APPEND ARCH_FLAGS -mcpu=power9 -mpowerpc64)
-        elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
-            list(APPEND ARCH_FLAGS -mcpu=powerpc64le -mtune=native)
+            if (EXTRACTED_NUMBER GREATER_EQUAL 10)
+                list(APPEND ARCH_FLAGS -mcpu=power10 -mpowerpc64)
+            elseif (EXTRACTED_NUMBER EQUAL 9)
+                list(APPEND ARCH_FLAGS -mcpu=power9 -mpowerpc64)
+            elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
+                list(APPEND ARCH_FLAGS -mcpu=powerpc64le -mtune=native)
+            else()
+                list(APPEND ARCH_FLAGS -mcpu=native -mtune=native -mpowerpc64)
+            endif()
        else()
-            list(APPEND ARCH_FLAGS -mcpu=native -mtune=native -mpowerpc64)
+            if (GGML_CPU_POWERPC_CPUTYPE)
+                list(APPEND ARCH_FLAGS -mcpu=${GGML_CPU_POWERPC_CPUTYPE})
+            endif()
        endif()
    elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
        message(STATUS "loongarch64 detected")
@@ -320,7 +336,11 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
    elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "riscv64")
        message(STATUS "RISC-V detected")
        if (GGML_RVV)
-            list(APPEND ARCH_FLAGS -march=rv64gcv -mabi=lp64d)
+            if (GGML_RV_ZFH)
+                list(APPEND ARCH_FLAGS -march=rv64gcv_zfhmin -DGGML_RV_ZFH -mabi=lp64d)
+            else()
+                list(APPEND ARCH_FLAGS -march=rv64gcv -mabi=lp64d)
+            endif()
        endif()
    elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "s390x")
        message(STATUS "s390x detected")
@@ -359,9 +379,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)

        # Fetch KleidiAI sources:
        include(FetchContent)
-        set(KLEIDIAI_COMMIT_TAG "v1.3.0")
+        set(KLEIDIAI_COMMIT_TAG "v1.5.0")
        set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz")
-        set(KLEIDIAI_ARCHIVE_MD5  "060bd2dc64642b091f461cc8dd7426d9")
+        set(KLEIDIAI_ARCHIVE_MD5  "ea22e1aefb800e9bc8c74d91633cc58e")

        if (POLICY CMP0135)
            cmake_policy(SET CMP0135 NEW)
--- a/ggml/src/ggml-cpu/binary-ops.cpp
+++ b/ggml/src/ggml-cpu/binary-ops.cpp
@@ -0,0 +1,158 @@
+#include "binary-ops.h"
+
+#if defined(GGML_USE_ACCELERATE)
+#include <Accelerate/Accelerate.h>
+
+using vDSP_fn_t = void (*)(const float *, vDSP_Stride, const float *, vDSP_Stride, float *, vDSP_Stride, vDSP_Length);
+#endif
+
+static inline float op_add(float a, float b) {
+    return a + b;
+}
+
+static inline float op_sub(float a, float b) {
+    return a - b;
+}
+
+static inline float op_mul(float a, float b) {
+    return a * b;
+}
+
+static inline float op_div(float a, float b) {
+    return a / b;
+}
+
+template <float (*op)(float, float), typename src0_t, typename src1_t, typename dst_t>
+static inline void vec_binary_op_contiguous(const int64_t n, dst_t * z, const src0_t * x, const src1_t * y) {
+    constexpr auto src0_to_f32 = type_conversion_table<src0_t>::to_f32;
+    constexpr auto src1_to_f32 = type_conversion_table<src1_t>::to_f32;
+    constexpr auto f32_to_dst  = type_conversion_table<dst_t >::from_f32;
+
+    for (int i = 0; i < n; i++) {
+        z[i] = f32_to_dst(op(src0_to_f32(x[i]), src1_to_f32(y[i])));
+    }
+}
+
+template <float (*op)(float, float), typename src0_t, typename src1_t, typename dst_t>
+static inline void vec_binary_op_non_contiguous(const int64_t n, const int64_t ne10, const int64_t nb10, dst_t * z, const src0_t * x, const src1_t * y) {
+    constexpr auto src0_to_f32 = type_conversion_table<src0_t>::to_f32;
+    constexpr auto src1_to_f32 = type_conversion_table<src1_t>::to_f32;
+    constexpr auto f32_to_dst  = type_conversion_table<dst_t >::from_f32;
+
+    for (int i = 0; i < n; i++) {
+        int i10 = i % ne10;
+        const src1_t * y_ptr = (const src1_t *)((const char *)y + i10*nb10);
+        z[i] = f32_to_dst(op(src0_to_f32(x[i]), src1_to_f32(*y_ptr)));
+    }
+}
+
+template <float (*op)(float, float), typename src0_t, typename src1_t, typename dst_t>
+static void apply_binary_op(const ggml_compute_params * params, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    GGML_ASSERT( nb0 == sizeof(dst_t));
+    GGML_ASSERT(nb00 == sizeof(src0_t));
+
+    const auto [ir0, ir1] = get_thread_range(params, src0);
+    const bool is_src1_contiguous = (nb10 == sizeof(src1_t));
+
+    if (!is_src1_contiguous) { // broadcast not implemented yet for non-contiguous
+        GGML_ASSERT(ggml_are_same_shape(src0, src1));
+    }
+
+#ifdef GGML_USE_ACCELERATE
+    vDSP_fn_t vDSP_op = nullptr;
+    // TODO - avoid the f32-only check using type 'trait' lookup tables and row-based src-to-float conversion functions
+    if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+        if (op == op_add) {
+            vDSP_op = vDSP_vadd;
+        } else if (op == op_sub) {
+            vDSP_op = vDSP_vsub;
+        } else if (op == op_mul) {
+            vDSP_op = vDSP_vmul;
+        } else if (op == op_div) {
+            vDSP_op = vDSP_vdiv;
+        }
+    }
+#endif
+
+    for (int64_t ir = ir0; ir < ir1; ++ir) {
+        const int64_t i03 = ir/(ne02*ne01);
+        const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
+        const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
+
+        const int64_t i13 = i03 % ne13;
+        const int64_t i12 = i02 % ne12;
+        const int64_t i11 = i01 % ne11;
+
+        dst_t        * dst_ptr  = (dst_t  *)       ((char *)       dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
+        const src0_t * src0_ptr = (const src0_t *) ((const char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
+        const src1_t * src1_ptr = (const src1_t *) ((const char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
+
+        if (is_src1_contiguous) {
+            // src1 is broadcastable across src0 and dst in i1, i2, i3
+            const int64_t nr0 = ne00 / ne10;
+
+            for (int64_t r = 0; r < nr0; ++r) {
+#ifdef GGML_USE_ACCELERATE
+                if constexpr (std::is_same_v<src0_t, float> && std::is_same_v<src1_t, float> && std::is_same_v<dst_t, float>) {
+                    if (vDSP_op != nullptr) {
+                        vDSP_op(src1_ptr, 1, src0_ptr + r*ne10, 1, dst_ptr + r*ne10, 1, ne10);
+                        continue;
+                    }
+                }
+#endif
+                vec_binary_op_contiguous<op>(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
+            }
+        } else {
+            vec_binary_op_non_contiguous<op>(ne0, ne10, nb10, dst_ptr, src0_ptr, src1_ptr);
+        }
+    }
+}
+
+// TODO: Use the 'traits' lookup table (for type conversion fns), instead of a mass of 'if' conditions with long templates
+template <float (*op)(float, float)>
+static void binary_op(const ggml_compute_params * params, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+
+    /*  */ if (src0->type == GGML_TYPE_F32  && src1->type == GGML_TYPE_F32  && dst->type == GGML_TYPE_F32) { // all f32
+        apply_binary_op<op, float, float, float>(params, dst);
+    } else if (src0->type == GGML_TYPE_F16  && src1->type == GGML_TYPE_F16  && dst->type == GGML_TYPE_F16) { // all f16
+        apply_binary_op<op, ggml_fp16_t, ggml_fp16_t, ggml_fp16_t>(params, dst);
+    } else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_BF16 && dst->type == GGML_TYPE_BF16) { // all bf16
+        apply_binary_op<op, ggml_bf16_t, ggml_bf16_t, ggml_bf16_t>(params, dst);
+    } else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F32  && dst->type == GGML_TYPE_BF16) {
+        apply_binary_op<op, ggml_bf16_t, float, ggml_bf16_t>(params, dst);
+    } else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F32  && dst->type == GGML_TYPE_F32) {
+        apply_binary_op<op, ggml_bf16_t, float, float>(params, dst);
+    } else if (src0->type == GGML_TYPE_F16  && src1->type == GGML_TYPE_F32  && dst->type == GGML_TYPE_F16) {
+        apply_binary_op<op, ggml_fp16_t, float, ggml_fp16_t>(params, dst);
+    } else if (src0->type == GGML_TYPE_F16  && src1->type == GGML_TYPE_F32  && dst->type == GGML_TYPE_F32) {
+        apply_binary_op<op, ggml_fp16_t, float, float>(params, dst);
+    } else {
+        GGML_ABORT("%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
+            ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
+    }
+}
+
+void ggml_compute_forward_add_non_quantized(const ggml_compute_params * params, ggml_tensor * dst) {
+    binary_op<op_add>(params, dst);
+}
+
+void ggml_compute_forward_sub(const ggml_compute_params * params, ggml_tensor * dst) {
+    binary_op<op_sub>(params, dst);
+}
+
+void ggml_compute_forward_mul(const ggml_compute_params * params, ggml_tensor * dst) {
+    binary_op<op_mul>(params, dst);
+}
+
+void ggml_compute_forward_div(const ggml_compute_params * params, ggml_tensor * dst) {
+    binary_op<op_div>(params, dst);
+}
--- a/ggml/src/ggml-cpu/binary-ops.h
+++ b/ggml/src/ggml-cpu/binary-ops.h
@@ -0,0 +1,16 @@
+#pragma once
+
+#include "common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void ggml_compute_forward_add_non_quantized(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_sub(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_mul(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_div(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+
+#ifdef __cplusplus
+}
+#endif
--- a/ggml/src/ggml-cpu/common.h
+++ b/ggml/src/ggml-cpu/common.h
@@ -0,0 +1,72 @@
+#pragma once
+
+#include "ggml.h"
+#include "ggml-cpu-traits.h"
+#include "ggml-cpu-impl.h"
+#include "ggml-impl.h"
+
+#ifdef __cplusplus
+
+#include <utility>
+
+// convenience functions/macros for use in template calls
+// note: these won't be required after the 'traits' lookup table is used.
+static inline ggml_fp16_t f32_to_f16(float x) {
+    return GGML_FP32_TO_FP16(x);
+}
+
+static inline float f16_to_f32(ggml_fp16_t x) {
+    return GGML_FP16_TO_FP32(x);
+}
+
+static inline ggml_bf16_t f32_to_bf16(float x) {
+    return GGML_FP32_TO_BF16(x);
+}
+
+static inline float bf16_to_f32(ggml_bf16_t x) {
+    return GGML_BF16_TO_FP32(x);
+}
+
+static inline float f32_to_f32(float x) {
+    return x;
+}
+
+// TODO - merge this into the traits table, after using row-based conversions
+template <class T>
+struct type_conversion_table;
+
+template <>
+struct type_conversion_table<ggml_fp16_t> {
+    static constexpr float (*to_f32)(ggml_fp16_t) = f16_to_f32;
+    static constexpr ggml_fp16_t (*from_f32)(float) = f32_to_f16;
+};
+
+template <>
+struct type_conversion_table<float> {
+    static constexpr float (*to_f32)(float) = f32_to_f32;
+    static constexpr float (*from_f32)(float) = f32_to_f32;
+};
+
+template <>
+struct type_conversion_table<ggml_bf16_t> {
+    static constexpr float (*to_f32)(ggml_bf16_t) = bf16_to_f32;
+    static constexpr ggml_bf16_t (*from_f32)(float) = f32_to_bf16;
+};
+
+static std::pair<int64_t, int64_t> get_thread_range(const struct ggml_compute_params * params, const struct ggml_tensor * src0) {
+    const int64_t ith = params->ith;
+    const int64_t nth = params->nth;
+
+    const int64_t nr  = ggml_nrows(src0);
+
+    // rows per thread
+    const int64_t dr = (nr + nth - 1)/nth;
+
+    // row range for this thread
+    const int64_t ir0 = dr*ith;
+    const int64_t ir1 = MIN(ir0 + dr, nr);
+
+    return {ir0, ir1};
+}
+
+#endif
--- a/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp
+++ b/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp
--- a/ggml/src/ggml-cpu/ggml-cpu-impl.h
+++ b/ggml/src/ggml-cpu/ggml-cpu-impl.h
@@ -4,13 +4,13 @@

 #include "ggml.h"
 #include "ggml-impl.h"
+
 #include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
 //#include <stddef.h>
 #include <stdbool.h>
 #include <string.h> // memcpy
 #include <math.h>   // fabsf

-
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -69,33 +69,16 @@ struct ggml_compute_params {
 #endif

 #if defined(__ARM_FEATURE_SVE)
-#include <arm_sve.h>
 #include <sys/prctl.h>
 #endif

-// 16-bit float
-// on Arm, we use __fp16
-// on x86, we use uint16_t
 #if defined(__ARM_NEON)

-// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
-//
-//   $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
-//
-#include <arm_neon.h>
-
+// ref: https://github.com/ggml-org/llama.cpp/pull/5404
 #ifdef _MSC_VER
-
-typedef uint16_t ggml_fp16_internal_t;
-
 #define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) }
-
 #else
-
-typedef __fp16 ggml_fp16_internal_t;
-
 #define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) }
-
 #endif // _MSC_VER

 #if !defined(__aarch64__)
--- a/ggml/src/ggml-cpu/ggml-cpu-quants.c
+++ b/ggml/src/ggml-cpu/ggml-cpu-quants.c
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
--- a/ggml/src/ggml-cpu/kleidiai/kernels.cpp
+++ b/ggml/src/ggml-cpu/kleidiai/kernels.cpp
@@ -51,11 +51,10 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
            /* .run_kernel            = */ kai_run_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot,
        },
        /* .lhs_info = */ {
-            /* .get_offset            = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32,
-            /* .get_packed_offset     = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32,
+            /* .get_offset            = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32_neon,
+            /* .get_packed_offset     = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32_neon,
            /* .packed_size           = */ kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32_neon,
            /* .pack_func             = */ kai_run_lhs_quant_pack_qsi8d32p_f32_neon,
-            /* .require_aligned_m_idx = */ true,
        },
        /* .rhs_info = */ {
            /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon,
@@ -100,7 +99,6 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
            /* .get_packed_offset     = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32,
            /* .packed_size           = */ kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32,
            /* .pack_func             = */ kai_run_lhs_quant_pack_qsi8d32p_f32,
-            /* .require_aligned_m_idx = */ false,
        },
        /* .rhs_info = */ {
            /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
@@ -144,7 +142,6 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
            /* .get_packed_offset     = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32,
            /* .packed_size           = */ kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32,
            /* .pack_func             = */ kai_run_lhs_quant_pack_qsi8d32p_f32,
-            /* .require_aligned_m_idx = */ false,
        },
        /* .rhs_info = */ {
            /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
@@ -189,7 +186,6 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
            /* .get_packed_offset     = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32,
            /* .packed_size           = */ kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32,
            /* .pack_func             = */ kai_run_lhs_quant_pack_qsi8d32p_f32,
-            /* .require_aligned_m_idx = */ false,
        },
        /* .rhs_info = */ {
            /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
@@ -233,7 +229,6 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
            /* .get_packed_offset     = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32,
            /* .packed_size           = */ kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32,
            /* .pack_func             = */ kai_run_lhs_quant_pack_qsi8d32p_f32,
-            /* .require_aligned_m_idx = */ false,
        },
        /* .rhs_info = */ {
            /* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
--- a/ggml/src/ggml-cpu/kleidiai/kernels.h
+++ b/ggml/src/ggml-cpu/kleidiai/kernels.h
@@ -40,7 +40,6 @@ struct lhs_packing_info {
    size_t (*packed_size)(size_t m, size_t k, size_t bl, size_t mr, size_t kr, size_t sr);
    void (*pack_func)(size_t m, size_t k, size_t bl, size_t mr, size_t kr, size_t sr, size_t m_idx_start, const float* lhs,
                      size_t lhs_stride, void* lhs_packed);
-    bool require_aligned_m_idx;
 };

 struct rhs_packing_info {
--- a/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp
+++ b/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp
@@ -124,8 +124,7 @@ class tensor_traits : public ggml::cpu::tensor_traits {
            size_t sr = kernel->get_sr();

            // Calculate number of columns to be processed per thread
-            const bool use_multithread = lhs_info->require_aligned_m_idx && m <= mr ? false : true;
-            const size_t num_m_per_thread = use_multithread ? kai_roundup(m, nth) / nth : m;
+            const size_t num_m_per_thread = kai_roundup(m, mr * nth) / nth;
            const size_t m_start = ith * num_m_per_thread;
            size_t m_to_process = num_m_per_thread;
            if ((m_start + m_to_process) > m) {
@@ -135,11 +134,11 @@ class tensor_traits : public ggml::cpu::tensor_traits {
            if(m_start < m) {
                // Transform LHS
                const size_t src_stride        = src1->nb[1];
-                const float * src_ptr          = reinterpret_cast<const float *>(lhs + lhs_info->get_offset(0, dst->src[1]->nb[1]));
+                const float * src_ptr          = reinterpret_cast<const float *>(lhs + lhs_info->get_offset(m_start, dst->src[1]->nb[1]));
                const size_t lhs_packed_offset = lhs_info->get_packed_offset(m_start, k, QK4_0, mr, kr, sr);
                void * lhs_packed_ptr          = static_cast<void *>(lhs_packed + lhs_packed_offset);

-                lhs_info->pack_func(m_to_process, k, QK4_0, mr, kr, sr, m_start, src_ptr, src_stride, lhs_packed_ptr);
+                lhs_info->pack_func(m_to_process, k, QK4_0, mr, kr, sr, 0, src_ptr, src_stride, lhs_packed_ptr);
            }

            ggml_barrier(params->threadpool);
--- a/ggml/src/ggml-cpu/llamafile/sgemm.cpp
+++ b/ggml/src/ggml-cpu/llamafile/sgemm.cpp
@@ -55,6 +55,7 @@

 #include <atomic>
 #include <array>
+#include <type_traits>

 #ifdef _MSC_VER
 #define NOINLINE __declspec(noinline)
@@ -1092,13 +1093,403 @@ class tinyBLAS_Q0_PPC {
       }
    }

-    template<typename VA, typename VB>
-    void packNormal(const TA* a, int64_t lda, int rows, int cols, VA* vec, bool flip) {
+    template<typename VA, typename VB, int size>
+    void packNormalInt4(const TA* a, int64_t lda, int rows, int cols, VA* vec, std::array<int, size>& comparray) {
        int64_t i, j;
        TA *aoffset = NULL;
        VA *vecOffset = NULL;
        TA *aoffset1 = NULL, *aoffset2 = NULL, *aoffset3 = NULL, *aoffset4 = NULL;
        TA *aoffset5 = NULL, *aoffset6 = NULL, *aoffset7 = NULL, *aoffset8 = NULL;
+        VB c1[2] = {0}, c2[2] = {0}, c3[2] = {0}, c4[2] = {0};
+        VB c5[2] = {0}, c6[2] = {0}, c7[2] = {0}, c8[2] = {0};
+        VB t1, t2, t3, t4, t5, t6, t7, t8;
+        const vector signed char lowMask = vec_splats((signed char)0xF);
+        const vector unsigned char v4 = vec_splats((unsigned char)0x4);
+        const vector signed char v8 = vec_splats((signed char)0x8);
+        aoffset = const_cast<TA*>(a);
+        vecOffset = vec;
+        vector unsigned char swiz1 = {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23};
+        vector unsigned char swiz2 = {8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31};
+        vector unsigned char swiz3 = {0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27};
+        vector unsigned char swiz4 = {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31};
+        vector signed int vsum = {0};
+        vector signed int vsum2 = {0};
+
+        j = (rows >> 3);
+        if (j > 0) {
+            do {
+                aoffset1 = aoffset;
+                aoffset2 = aoffset1 + lda;
+                aoffset3 = aoffset2 + lda;
+                aoffset4 = aoffset3 + lda;
+                aoffset5 = aoffset4 + lda;
+                aoffset6 = aoffset5 + lda;
+                aoffset7 = aoffset6 + lda;
+                aoffset8 = aoffset7 + lda;
+                aoffset += 8 * lda;
+
+                i = (cols >> 2);
+                if (i > 0) {
+                    do {
+                        c1[1] = reinterpret_cast<VB>(vec_xl(0, aoffset1->qs));
+                        c2[1] = reinterpret_cast<VB>(vec_xl(0, aoffset2->qs));
+                        c3[1] = reinterpret_cast<VB>(vec_xl(0, aoffset3->qs));
+                        c4[1] = reinterpret_cast<VB>(vec_xl(0, aoffset4->qs));
+                        c5[1] = reinterpret_cast<VB>(vec_xl(0, aoffset5->qs));
+                        c6[1] = reinterpret_cast<VB>(vec_xl(0, aoffset6->qs));
+                        c7[1] = reinterpret_cast<VB>(vec_xl(0, aoffset7->qs));
+                        c8[1] = reinterpret_cast<VB>(vec_xl(0, aoffset8->qs));
+
+                        c1[0] = vec_and(c1[1], lowMask);
+                        c1[1] = vec_sr(c1[1], v4);
+                        c1[0] = vec_sub(c1[0], v8);
+                        c1[1] = vec_sub(c1[1], v8);
+                        vsum = vec_sum4s(c1[0], vsum);
+                        vsum2 = vec_sum4s(c1[1], vsum2);
+                        vsum = vec_add(vsum, vsum2);
+                        comparray[0] = vsum[0] + vsum[1] + vsum[2] + vsum[3];
+                        vsum = vec_splats(0);
+                        vsum2 = vec_splats(0);
+
+                        c2[0] = vec_and(c2[1], lowMask);
+                        c2[1] = vec_sr(c2[1], v4);
+                        c2[0] = vec_sub(c2[0], v8);
+                        c2[1] = vec_sub(c2[1], v8);
+                        vsum = vec_sum4s(c2[0], vsum);
+                        vsum2 = vec_sum4s(c2[1], vsum2);
+                        vsum = vec_add(vsum, vsum2);
+                        comparray[1] = vsum[0] + vsum[1] + vsum[2] + vsum[3];
+                        vsum = vec_splats(0);
+                        vsum2 = vec_splats(0);
+
+                        c3[0] = vec_and(c3[1], lowMask);
+                        c3[1] = vec_sr(c3[1], v4);
+                        c3[0] = vec_sub(c3[0], v8);
+                        c3[1] = vec_sub(c3[1], v8);
+                        vsum = vec_sum4s(c3[0], vsum);
+                        vsum2 = vec_sum4s(c3[1], vsum2);
+                        vsum = vec_add(vsum, vsum2);
+                        comparray[2] = vsum[0] + vsum[1] + vsum[2] + vsum[3];
+                        vsum = vec_splats(0);
+                        vsum2 = vec_splats(0);
+
+                        c4[0] = vec_and(c4[1], lowMask);
+                        c4[1] = vec_sr(c4[1], v4);
+                        c4[0] = vec_sub(c4[0], v8);
+                        c4[1] = vec_sub(c4[1], v8);
+                        vsum = vec_sum4s(c4[0], vsum);
+                        vsum2 = vec_sum4s(c4[1], vsum2);
+                        vsum = vec_add(vsum, vsum2);
+                        comparray[3] = vsum[0] + vsum[1] + vsum[2] + vsum[3];
+                        vsum = vec_splats(0);
+                        vsum2 = vec_splats(0);
+
+                        c5[0] = vec_and(c5[1], lowMask);
+                        c5[1] = vec_sr(c5[1], v4);
+                        c5[0] = vec_sub(c5[0], v8);
+                        c5[1] = vec_sub(c5[1], v8);
+                        vsum = vec_sum4s(c5[0], vsum);
+                        vsum2 = vec_sum4s(c5[1], vsum2);
+                        vsum = vec_add(vsum, vsum2);
+                        comparray[4] = vsum[0] + vsum[1] + vsum[2] + vsum[3];
+                        vsum = vec_splats(0);
+                        vsum2 = vec_splats(0);
+
+                        c6[0] = vec_and(c6[1], lowMask);
+                        c6[1] = vec_sr(c6[1], v4);
+                        c6[0] = vec_sub(c6[0], v8);
+                        c6[1] = vec_sub(c6[1], v8);
+                        vsum = vec_sum4s(c6[0], vsum);
+                        vsum2 = vec_sum4s(c6[1], vsum2);
+                        vsum = vec_add(vsum, vsum2);
+                        comparray[5] = vsum[0] + vsum[1] + vsum[2] + vsum[3];
+                        vsum = vec_splats(0);
+                        vsum2 = vec_splats(0);
+
+                        c7[0] = vec_and(c7[1], lowMask);
+                        c7[1] = vec_sr(c7[1], v4);
+                        c7[0] = vec_sub(c7[0], v8);
+                        c7[1] = vec_sub(c7[1], v8);
+                        vsum = vec_sum4s(c7[0], vsum);
+                        vsum2 = vec_sum4s(c7[1], vsum2);
+                        vsum = vec_add(vsum, vsum2);
+                        comparray[6] = vsum[0] + vsum[1] + vsum[2] + vsum[3];
+                        vsum = vec_splats(0);
+                        vsum2 = vec_splats(0);
+
+                        c8[0] = vec_and(c8[1], lowMask);
+                        c8[1] = vec_sr(c8[1], v4);
+                        c8[0] = vec_sub(c8[0], v8);
+                        c8[1] = vec_sub(c8[1], v8);
+                        vsum = vec_sum4s(c8[0], vsum);
+                        vsum2 = vec_sum4s(c8[1], vsum2);
+                        vsum = vec_add(vsum, vsum2);
+                        comparray[7] = vsum[0] + vsum[1] + vsum[2] + vsum[3];
+                        vsum = vec_splats(0);
+                        vsum2 = vec_splats(0);
+
+                        t1 = vec_perm(c1[0], c2[0], swiz1);
+                        t2 = vec_perm(c1[0], c2[0], swiz2);
+                        t3 = vec_perm(c3[0], c4[0], swiz1);
+                        t4 = vec_perm(c3[0], c4[0], swiz2);
+                        t5 = vec_perm(t1, t3, swiz3);
+                        t6 = vec_perm(t1, t3, swiz4);
+                        t7 = vec_perm(t2, t4, swiz3);
+                        t8 = vec_perm(t2, t4, swiz4);
+                        vec_xst(t5, 0, vecOffset);
+                        vec_xst(t6, 0, vecOffset+16);
+                        vec_xst(t7, 0, vecOffset+32);
+                        vec_xst(t8, 0, vecOffset+48);
+
+                        t1 = vec_perm(c1[1], c2[1], swiz1);
+                        t2 = vec_perm(c1[1], c2[1], swiz2);
+                        t3 = vec_perm(c3[1], c4[1], swiz1);
+                        t4 = vec_perm(c3[1], c4[1], swiz2);
+                        t5 = vec_perm(t1, t3, swiz3);
+                        t6 = vec_perm(t1, t3, swiz4);
+                        t7 = vec_perm(t2, t4, swiz3);
+                        t8 = vec_perm(t2, t4, swiz4);
+                        vec_xst(t5, 0, vecOffset+64);
+                        vec_xst(t6, 0, vecOffset+80);
+                        vec_xst(t7, 0, vecOffset+96);
+                        vec_xst(t8, 0, vecOffset+112);
+
+                        t1 = vec_perm(c5[0], c6[0], swiz1);
+                        t2 = vec_perm(c5[0], c6[0], swiz2);
+                        t3 = vec_perm(c7[0], c8[0], swiz1);
+                        t4 = vec_perm(c7[0], c8[0], swiz2);
+                        t5 = vec_perm(t1, t3, swiz3);
+                        t6 = vec_perm(t1, t3, swiz4);
+                        t7 = vec_perm(t2, t4, swiz3);
+                        t8 = vec_perm(t2, t4, swiz4);
+                        vec_xst(t5, 0, vecOffset+128);
+                        vec_xst(t6, 0, vecOffset+144);
+                        vec_xst(t7, 0, vecOffset+160);
+                        vec_xst(t8, 0, vecOffset+176);
+
+                        t1 = vec_perm(c5[1], c6[1], swiz1);
+                        t2 = vec_perm(c5[1], c6[1], swiz2);
+                        t3 = vec_perm(c7[1], c8[1], swiz1);
+                        t4 = vec_perm(c7[1], c8[1], swiz2);
+                        t5 = vec_perm(t1, t3, swiz3);
+                        t6 = vec_perm(t1, t3, swiz4);
+                        t7 = vec_perm(t2, t4, swiz3);
+                        t8 = vec_perm(t2, t4, swiz4);
+                        vec_xst(t5, 0, vecOffset+192);
+                        vec_xst(t6, 0, vecOffset+208);
+                        vec_xst(t7, 0, vecOffset+224);
+                        vec_xst(t8, 0, vecOffset+240);
+
+                        aoffset1 += lda;
+                        aoffset2 += lda;
+                        aoffset3 += lda;
+                        aoffset4 += lda;
+                        aoffset5 += lda;
+                        aoffset6 += lda;
+                        aoffset7 += lda;
+                        aoffset8 += lda;
+                        vecOffset += 256;
+                        i--;
+                    } while (i > 0);
+                }
+                j--;
+            } while (j > 0);
+        }
+
+        if (rows & 4) {
+            aoffset1 = aoffset;
+            aoffset2 = aoffset1 + lda;
+            aoffset3 = aoffset2 + lda;
+            aoffset4 = aoffset3 + lda;
+            aoffset += 4 * lda;
+
+            i = (cols >> 2);
+            if (i > 0) {
+                do {
+                    c1[1] = reinterpret_cast<VB>(vec_xl(0, aoffset1->qs));
+                    c2[1] = reinterpret_cast<VB>(vec_xl(0, aoffset2->qs));
+                    c3[1] = reinterpret_cast<VB>(vec_xl(0, aoffset3->qs));
+                    c4[1] = reinterpret_cast<VB>(vec_xl(0, aoffset4->qs));
+
+                    c1[0] = vec_and(c1[1], lowMask);
+                    c1[1] = vec_sr(c1[1], v4);
+                    c1[0] = vec_sub(c1[0], v8);
+                    c1[1] = vec_sub(c1[1], v8);
+                    vsum = vec_sum4s(c1[0], vsum);
+                    vsum2 = vec_sum4s(c1[1], vsum2);
+                    vsum = vec_add(vsum, vsum2);
+                    comparray[0] = vsum[0] + vsum[1] + vsum[2] + vsum[3];
+                    vsum = vec_splats(0);
+                    vsum2 = vec_splats(0);
+
+                    c2[0] = vec_and(c2[1], lowMask);
+                    c2[1] = vec_sr(c2[1], v4);
+                    c2[0] = vec_sub(c2[0], v8);
+                    c2[1] = vec_sub(c2[1], v8);
+                    vsum = vec_sum4s(c2[0], vsum);
+                    vsum2 = vec_sum4s(c2[1], vsum2);
+                    vsum = vec_add(vsum, vsum2);
+                    comparray[1] = vsum[0] + vsum[1] + vsum[2] + vsum[3];
+                    vsum = vec_splats(0);
+                    vsum2 = vec_splats(0);
+
+                    c3[0] = vec_and(c3[1], lowMask);
+                    c3[1] = vec_sr(c3[1], v4);
+                    c3[0] = vec_sub(c3[0], v8);
+                    c3[1] = vec_sub(c3[1], v8);
+                    vsum = vec_sum4s(c3[0], vsum);
+                    vsum2 = vec_sum4s(c3[1], vsum2);
+                    vsum = vec_add(vsum, vsum2);
+                    comparray[2] = vsum[0] + vsum[1] + vsum[2] + vsum[3];
+                    vsum = vec_splats(0);
+                    vsum2 = vec_splats(0);
+
+                    c4[0] = vec_and(c4[1], lowMask);
+                    c4[1] = vec_sr(c4[1], v4);
+                    c4[0] = vec_sub(c4[0], v8);
+                    c4[1] = vec_sub(c4[1], v8);
+                    vsum = vec_sum4s(c4[0], vsum);
+                    vsum2 = vec_sum4s(c4[1], vsum2);
+                    vsum = vec_add(vsum, vsum2);
+                    comparray[3] = vsum[0] + vsum[1] + vsum[2] + vsum[3];
+                    vsum = vec_splats(0);
+                    vsum2 = vec_splats( 0);
+
+                    t1 = vec_perm(c1[0], c2[0], swiz1);
+                    t2 = vec_perm(c1[0], c2[0], swiz2);
+                    t3 = vec_perm(c3[0], c4[0], swiz1);
+                    t4 = vec_perm(c3[0], c4[0], swiz2);
+                    t5 = vec_perm(t1, t3, swiz3);
+                    t6 = vec_perm(t1, t3, swiz4);
+                    t7 = vec_perm(t2, t4, swiz3);
+                    t8 = vec_perm(t2, t4, swiz4);
+                    vec_xst(t5, 0, vecOffset);
+                    vec_xst(t6, 0, vecOffset+16);
+                    vec_xst(t7, 0, vecOffset+32);
+                    vec_xst(t8, 0, vecOffset+48);
+
+                    t1 = vec_perm(c1[1], c2[1], swiz1);
+                    t2 = vec_perm(c1[1], c2[1], swiz2);
+                    t3 = vec_perm(c3[1], c4[1], swiz1);
+                    t4 = vec_perm(c3[1], c4[1], swiz2);
+                    t5 = vec_perm(t1, t3, swiz3);
+                    t6 = vec_perm(t1, t3, swiz4);
+                    t7 = vec_perm(t2, t4, swiz3);
+                    t8 = vec_perm(t2, t4, swiz4);
+                    vec_xst(t5, 0, vecOffset+64);
+                    vec_xst(t6, 0, vecOffset+80);
+                    vec_xst(t7, 0, vecOffset+96);
+                    vec_xst(t8, 0, vecOffset+112);
+
+                    aoffset1 += lda;
+                    aoffset2 += lda;
+                    aoffset3 += lda;
+                    aoffset4 += lda;
+                    vecOffset += 128;
+                    i--;
+                } while (i > 0);
+            }
+        }
+
+        if (rows & 3) {
+            aoffset1 = aoffset;
+            aoffset2 = aoffset1 + lda;
+            aoffset3 = aoffset2 + lda;
+            i = (cols >> 2);
+            if (i > 0) {
+                do {
+                    switch(rows) {
+                        case 3: c3[1] = reinterpret_cast<VB>(vec_xl(0, aoffset3->qs));
+                        case 2: c2[1] = reinterpret_cast<VB>(vec_xl(0, aoffset2->qs));
+                        case 1: c1[1] = reinterpret_cast<VB>(vec_xl(0, aoffset1->qs));
+                            break;
+                    }
+                    c1[0] = vec_and(c1[1], lowMask);
+                    c1[1] = vec_sr(c1[1], v4);
+                    c1[0] = vec_sub(c1[0], v8);
+                    c1[1] = vec_sub(c1[1], v8);
+                    vsum = vec_sum4s(c1[0], vsum);
+                    vsum2 = vec_sum4s(c1[1], vsum2);
+                    vsum = vec_add(vsum, vsum2);
+                    comparray[0] = vsum[0] + vsum[1] + vsum[2] + vsum[3];
+                    vsum = vec_splats(0);
+                    vsum2 = vec_splats(0);
+
+                    c2[0] = vec_and(c2[1], lowMask);
+                    c2[1] = vec_sr(c2[1], v4);
+                    c2[0] = vec_sub(c2[0], v8);
+                    c2[1] = vec_sub(c2[1], v8);
+                    vsum = vec_sum4s(c2[0], vsum);
+                    vsum2 = vec_sum4s(c2[1], vsum2);
+                    vsum = vec_add(vsum, vsum2);
+                    comparray[1] = vsum[0] + vsum[1] + vsum[2] + vsum[3];
+                    vsum = vec_splats(0);
+                    vsum2 = vec_splats(0);
+
+                    c3[0] = vec_and(c3[1], lowMask);
+                    c3[1] = vec_sr(c3[1], v4);
+                    c3[0] = vec_sub(c3[0], v8);
+                    c3[1] = vec_sub(c3[1], v8);
+                    vsum = vec_sum4s(c3[0], vsum);
+                    vsum2 = vec_sum4s(c3[1], vsum2);
+                    vsum = vec_add(vsum, vsum2);
+                    comparray[2] = vsum[0] + vsum[1] + vsum[2] + vsum[3];
+                    vsum = vec_splats(0);
+                    vsum2 = vec_splats(0);
+
+                    c4[0] = vec_and(c4[1], lowMask);
+                    c4[1] = vec_sr(c4[1], v4);
+                    c4[0] = vec_sub(c4[0], v8);
+                    c4[1] = vec_sub(c4[1], v8);
+                    vsum = vec_sum4s(c4[0], vsum);
+                    vsum2 = vec_sum4s(c4[1], vsum2);
+                    vsum = vec_add(vsum, vsum2);
+                    comparray[3] = vsum[0] + vsum[1] + vsum[2] + vsum[3];
+                    vsum = vec_splats(0);
+                    vsum2 = vec_splats(0);
+
+                    t1 = vec_perm(c1[0], c2[0], swiz1);
+                    t2 = vec_perm(c1[0], c2[0], swiz2);
+                    t3 = vec_perm(c3[0], c4[0], swiz1);
+                    t4 = vec_perm(c3[0], c4[0], swiz2);
+                    t5 = vec_perm(t1, t3, swiz3);
+                    t6 = vec_perm(t1, t3, swiz4);
+                    t7 = vec_perm(t2, t4, swiz3);
+                    t8 = vec_perm(t2, t4, swiz4);
+                    vec_xst(t5, 0, vecOffset);
+                    vec_xst(t6, 0, vecOffset+16);
+                    vec_xst(t7, 0, vecOffset+32);
+                    vec_xst(t8, 0, vecOffset+48);
+
+                    t1 = vec_perm(c1[1], c2[1], swiz1);
+                    t2 = vec_perm(c1[1], c2[1], swiz2);
+                    t3 = vec_perm(c3[1], c4[1], swiz1);
+                    t4 = vec_perm(c3[1], c4[1], swiz2);
+                    t5 = vec_perm(t1, t3, swiz3);
+                    t6 = vec_perm(t1, t3, swiz4);
+                    t7 = vec_perm(t2, t4, swiz3);
+                    t8 = vec_perm(t2, t4, swiz4);
+                    vec_xst(t5, 0, vecOffset+64);
+                    vec_xst(t6, 0, vecOffset+80);
+                    vec_xst(t7, 0, vecOffset+96);
+                    vec_xst(t8, 0, vecOffset+112);
+                    aoffset1 += lda;
+                    aoffset2 += lda;
+                    aoffset3 += lda;
+                    vecOffset += 128;
+                    i--;
+                } while(i > 0);
+            }
+        }
+    }
+
+    template<typename VA, typename VB>
+    void packNormal(const TB* a, int64_t lda, int rows, int cols, VA* vec, bool flip) {
+        int64_t i, j;
+        TB *aoffset = NULL;
+        VA *vecOffset = NULL;
+        TB *aoffset1 = NULL, *aoffset2 = NULL, *aoffset3 = NULL, *aoffset4 = NULL;
+        TB *aoffset5 = NULL, *aoffset6 = NULL, *aoffset7 = NULL, *aoffset8 = NULL;
        __vector_pair C1, C2, C3, C4, C5, C6, C7, C8;
        VB c1[2] = {0}, c2[2] = {0}, c3[2] = {0}, c4[2]={0};
        VB c5[2] = {0}, c6[2] = {0}, c7[2] = {0}, c8[2]={0};
@@ -1111,24 +1502,24 @@ class tinyBLAS_Q0_PPC {
        vector unsigned char swiz3 = {0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27};
        vector unsigned char swiz4 = {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31};

-        aoffset = const_cast<TA*>(a);
+        aoffset = const_cast<TB*>(a);
        vecOffset = vec;
        j = (rows >> 3);
        if (j > 0) {
            do {
-            aoffset1 = aoffset;
-            aoffset2 = aoffset1 + lda;
-            aoffset3 = aoffset2 + lda;
-            aoffset4 = aoffset3 + lda;
-            aoffset5 = aoffset4 + lda;
-            aoffset6 = aoffset5 + lda;
-            aoffset7 = aoffset6 + lda;
-            aoffset8 = aoffset7 + lda;
-            aoffset += 8 * lda;
+                aoffset1 = aoffset;
+                aoffset2 = aoffset1 + lda;
+                aoffset3 = aoffset2 + lda;
+                aoffset4 = aoffset3 + lda;
+                aoffset5 = aoffset4 + lda;
+                aoffset6 = aoffset5 + lda;
+                aoffset7 = aoffset6 + lda;
+                aoffset8 = aoffset7 + lda;
+                aoffset += 8 * lda;

-            i = (cols >> 3);
-            if (i > 0) {
-               do {
+                i = (cols >> 3);
+                if (i > 0) {
+                do {
                    C1 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset1->qs);
                    C2 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset2->qs);
                    C3 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset3->qs);
@@ -1156,10 +1547,10 @@ class tinyBLAS_Q0_PPC {
                    t7 = vec_perm(t2, t4, swiz3);
                    t8 = vec_perm(t2, t4, swiz4);
                    if (flip == true) {
-                       t5 = vec_xor(t5, xor_vector);
-                       t6 = vec_xor(t6, xor_vector);
-                       t7 = vec_xor(t7, xor_vector);
-                       t8 = vec_xor(t8, xor_vector);
+                        t5 = vec_xor(t5, xor_vector);
+                        t6 = vec_xor(t6, xor_vector);
+                        t7 = vec_xor(t7, xor_vector);
+                        t8 = vec_xor(t8, xor_vector);
                    }
                    vec_xst(t5, 0, vecOffset);
                    vec_xst(t6, 0, vecOffset+16);
@@ -1175,10 +1566,10 @@ class tinyBLAS_Q0_PPC {
                    t7 = vec_perm(t2, t4, swiz3);
                    t8 = vec_perm(t2, t4, swiz4);
                    if (flip == true) {
-                       t5 = vec_xor(t5, xor_vector);
-                       t6 = vec_xor(t6, xor_vector);
-                       t7 = vec_xor(t7, xor_vector);
-                       t8 = vec_xor(t8, xor_vector);
+                        t5 = vec_xor(t5, xor_vector);
+                        t6 = vec_xor(t6, xor_vector);
+                        t7 = vec_xor(t7, xor_vector);
+                        t8 = vec_xor(t8, xor_vector);
                    }
                    vec_xst(t5, 0, vecOffset+64);
                    vec_xst(t6, 0, vecOffset+80);
@@ -1194,10 +1585,10 @@ class tinyBLAS_Q0_PPC {
                    t7 = vec_perm(t2, t4, swiz3);
                    t8 = vec_perm(t2, t4, swiz4);
                    if (flip == true) {
-                       t5 = vec_xor(t5, xor_vector);
-                       t6 = vec_xor(t6, xor_vector);
-                       t7 = vec_xor(t7, xor_vector);
-                       t8 = vec_xor(t8, xor_vector);
+                        t5 = vec_xor(t5, xor_vector);
+                        t6 = vec_xor(t6, xor_vector);
+                        t7 = vec_xor(t7, xor_vector);
+                        t8 = vec_xor(t8, xor_vector);
                    }
                    vec_xst(t5, 0, vecOffset+128);
                    vec_xst(t6, 0, vecOffset+144);
@@ -1213,10 +1604,10 @@ class tinyBLAS_Q0_PPC {
                    t7 = vec_perm(t2, t4, swiz3);
                    t8 = vec_perm(t2, t4, swiz4);
                    if (flip == true) {
-                       t5 = vec_xor(t5, xor_vector);
-                       t6 = vec_xor(t6, xor_vector);
-                       t7 = vec_xor(t7, xor_vector);
-                       t8 = vec_xor(t8, xor_vector);
+                        t5 = vec_xor(t5, xor_vector);
+                        t6 = vec_xor(t6, xor_vector);
+                        t7 = vec_xor(t7, xor_vector);
+                        t8 = vec_xor(t8, xor_vector);
                    }
                    vec_xst(t5, 0, vecOffset+192);
                    vec_xst(t6, 0, vecOffset+208);
@@ -1240,11 +1631,11 @@ class tinyBLAS_Q0_PPC {
    }

    if (rows & 4) {
-            aoffset1 = aoffset;
-            aoffset2 = aoffset1 + lda;
-            aoffset3 = aoffset2 + lda;
-            aoffset4 = aoffset3 + lda;
-            aoffset += 4 * lda;
+        aoffset1 = aoffset;
+        aoffset2 = aoffset1 + lda;
+        aoffset3 = aoffset2 + lda;
+        aoffset4 = aoffset3 + lda;
+        aoffset += 4 * lda;

        i = (cols >> 3);
            if (i > 0) {
@@ -1311,7 +1702,7 @@ class tinyBLAS_Q0_PPC {
            aoffset2 = aoffset1 + lda;
            aoffset3 = aoffset2 + lda;
            i = (cols >> 3);
-        if (i > 0) {
+            if (i > 0) {
                do {
                    switch(rows) {
                        case 3: C3 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset3->qs);
@@ -1527,13 +1918,18 @@ class tinyBLAS_Q0_PPC {
    void KERNEL_4x8(int64_t ii, int64_t jj) {
        vec_t vec_A[8], vec_B[16] = {0};
        acc_t acc_0, acc_1;
-        std::array<int, 4> comparray;
+        std::array<int, 4> comparray {};
        vector float fin_res[8] = {0};
        vector float vs[8] = {0};
+        bool isAblock_q4 = std::is_same_v<TA, block_q4_0>;
        for (int l = 0; l < k; l++) {
            __builtin_mma_xxsetaccz(&acc_0);
            __builtin_mma_xxsetaccz(&acc_1);
-            packNormal<int8_t, vector signed char>((A+(ii*lda)+l), lda, 4, 8, (int8_t*)vec_A, false);
+            if (std::is_same_v<TA, block_q4_0>) {
+               packNormalInt4<int8_t, vector signed char, 4>((A+(ii*lda)+l), lda, 4, 4, (int8_t*)vec_A, comparray);
+            } else {
+               packNormal<int8_t, vector signed char>((const TB*)(A+(ii*lda)+l), lda, 4, 8, (int8_t*)vec_A, false);
+            }
            packNormal<uint8_t, vector unsigned char>((B+(jj*ldb)+l), ldb, 8, 8, (uint8_t*)vec_B, true);
            for(int x = 0; x < 8; x++) {
                __builtin_mma_xvi8ger4pp(&acc_0, vec_A[x], vec_B[x]);
@@ -1545,15 +1941,17 @@ class tinyBLAS_Q0_PPC {
                    *((float*)&vs[I+4]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J+4)*ldb)+l)->d));
                }
            }
-            auto aoffset = A+(ii*lda)+l;
-            for (int i = 0; i < 4; i++) {
-                comparray[i] = 0;
-                int ca = 0;
-                const int8_t *at = aoffset->qs;
-                for (int j = 0; j < 32; j++)
-                    ca += (int)*at++;
-                comparray[i] = ca;
-                aoffset += lda;
+            if (!isAblock_q4) {
+                auto aoffset = A+(ii*lda)+l;
+                for (int i = 0; i < 4; i++) {
+                    comparray[i] = 0;
+                    int ca = 0;
+                    auto *at = aoffset->qs;
+                    for (int j = 0; j < 32; j++)
+                        ca += (int)*at++;
+                    comparray[i] = ca;
+                    aoffset += lda;
+                }
            }
            compute<4>(&acc_0, 0, 0, comparray, vs, fin_res);
            compute<4>(&acc_1, 0, 4, comparray, vs, fin_res);
@@ -1565,13 +1963,18 @@ class tinyBLAS_Q0_PPC {
    void KERNEL_8x4(int64_t ii, int64_t jj) {
        vec_t vec_A[16], vec_B[8] = {0};
        acc_t acc_0, acc_1;
-        std::array<int, 8> comparray;
+        std::array<int, 8> comparray {};
        vector float fin_res[8] = {0};
        vector float vs[8] = {0};
+        bool isAblock_q4 = std::is_same_v<TA, block_q4_0>;
        for (int l = 0; l < k; l++) {
            __builtin_mma_xxsetaccz(&acc_0);
            __builtin_mma_xxsetaccz(&acc_1);
-            packNormal<int8_t, vector signed char>((A+(ii*lda)+l), lda, 8, 8, (int8_t*)vec_A, false);
+            if (std::is_same_v<TA, block_q4_0>) {
+               packNormalInt4<int8_t, vector signed char, 8>((A+(ii*lda)+l), lda, 8, 4, (int8_t*)vec_A, comparray);
+            } else {
+               packNormal<int8_t, vector signed char>((const TB*)(A+(ii*lda)+l), lda, 8, 8, (int8_t*)vec_A, false);
+            }
            packNormal<uint8_t, vector unsigned char>((B+(jj*ldb)+l), ldb, 4, 8, (uint8_t*)vec_B, true);
            for(int x = 0; x < 8; x++) {
                __builtin_mma_xvi8ger4pp(&acc_0, vec_A[x], vec_B[x]);
@@ -1582,15 +1985,17 @@ class tinyBLAS_Q0_PPC {
                    *((float*)&vs[I]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J)*ldb)+l)->d));
                }
            }
-            auto aoffset = A+(ii*lda)+l;
-            for (int i = 0; i < 8; i++) {
-                comparray[i] = 0;
-                int ca = 0;
-                const int8_t *at = aoffset->qs;
-                for (int j = 0; j < 32; j++)
-                    ca += (int)*at++;
-                comparray[i] = ca;
-                aoffset += lda;
+            if (!isAblock_q4) {
+                auto aoffset = A+(ii*lda)+l;
+                for (int i = 0; i < 8; i++) {
+                    comparray[i] = 0;
+                    int ca = 0;
+                    auto *at = aoffset->qs;
+                    for (int j = 0; j < 32; j++)
+                        ca += (int)*at++;
+                    comparray[i] = ca;
+                    aoffset += lda;
+                }
            }
            compute<8>(&acc_0, 0, 0, comparray, vs, fin_res);
            compute<8>(&acc_1, 4, 4, comparray, vs, fin_res);
@@ -1602,15 +2007,20 @@ class tinyBLAS_Q0_PPC {
    void KERNEL_8x8(int64_t ii, int64_t jj) {
        vec_t vec_A[16], vec_B[16] = {0};
        acc_t acc_0, acc_1, acc_2, acc_3;
-        std::array<int, 8> comparray;
+        std::array<int, 8> comparray {};
        vector float fin_res[16] = {0};
        vector float vs[16] = {0};
+        bool isAblock_q4 = std::is_same_v<TA, block_q4_0>;
        for (int l = 0; l < k; l++) {
            __builtin_mma_xxsetaccz(&acc_0);
            __builtin_mma_xxsetaccz(&acc_1);
            __builtin_mma_xxsetaccz(&acc_2);
            __builtin_mma_xxsetaccz(&acc_3);
-            packNormal<int8_t, vector signed char>((A+(ii*lda)+l), lda, 8, 8, (int8_t*)vec_A, false);
+            if (std::is_same_v<TA, block_q4_0>) {
+               packNormalInt4<int8_t, vector signed char, 8>((A+(ii*lda)+l), lda, 8, 4, (int8_t*)vec_A, comparray);
+            } else {
+               packNormal<int8_t, vector signed char>((const TB*)(A+(ii*lda)+l), lda, 8, 8, (int8_t*)vec_A, false);
+            }
            packNormal<uint8_t, vector unsigned char>((B+(jj*ldb)+l), ldb, 8, 8, (uint8_t*)vec_B, true);
            for(int x = 0; x < 8; x++) {
                __builtin_mma_xvi8ger4pp(&acc_0, vec_A[x], vec_B[x]);
@@ -1624,15 +2034,17 @@ class tinyBLAS_Q0_PPC {
                    *((float*)&vs[I+8]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J+4)*ldb)+l)->d));
                }
            }
-            auto aoffset = A+(ii*lda)+l;
-            for (int i = 0; i < 8; i++) {
-                comparray[i] = 0;
-                int ca = 0;
-                const int8_t *at = aoffset->qs;
-                for (int j = 0; j < 32; j++)
-                    ca += (int)*at++;
-                comparray[i] = ca;
-                aoffset += lda;
+            if (!isAblock_q4) {
+                auto aoffset = A+(ii*lda)+l;
+                for (int i = 0; i < 8; i++) {
+                    comparray[i] = 0;
+                    int ca = 0;
+                    auto *at = aoffset->qs;
+                    for (int j = 0; j < 32; j++)
+                        ca += (int)*at++;
+                    comparray[i] = ca;
+                    aoffset += lda;
+                }
            }
            compute<8>(&acc_0, 0, 0, comparray, vs, fin_res);
            compute<8>(&acc_1, 4, 4, comparray, vs, fin_res);
@@ -1653,16 +2065,17 @@ class tinyBLAS_Q0_PPC {
        int64_t duty = (tiles + nth - 1) / nth;
        int64_t start = duty * ith;
        int64_t end = start + duty;
-        vec_t vec_A[8], vec_B[8] = {0};
+        vec_t vec_A[8] = {0}, vec_B[8] = {0};
        vector signed int vec_C[4];
        acc_t acc_0;
+        bool isAblock_q4 = std::is_same_v<TA, block_q4_0>;

        if (end > tiles)
            end = tiles;
        for (int64_t job = start; job < end; ++job) {
            int64_t ii = m0 + job / xtiles * RM;
            int64_t jj = n0 + job % xtiles * RN;
-            std::array<int, RM> comparray;
+            std::array<int, 4> comparray{};
            vector float res[4] = {0};
            vector float fin_res[4] = {0};
            vector float vs[4] = {0};
@@ -1673,7 +2086,11 @@ class tinyBLAS_Q0_PPC {
                __builtin_prefetch((A+(ii*lda)+(l+1))->qs, 0, 1); // prefetch one loop ahead
                __builtin_prefetch((B+(jj*ldb)+(l+1))->qs, 0, 1); // prefetch one loop ahead
                __builtin_mma_xxsetaccz(&acc_0);
-                packNormal<int8_t, vector signed char>((A+(ii*lda)+l), lda, RM, 8, (int8_t*)vec_A, false);
+                if (isAblock_q4) {
+                   packNormalInt4<int8_t, vector signed char, 4>((A+(ii*lda)+l), lda, RM, 4, (int8_t*)vec_A, comparray);
+                } else {
+                   packNormal<int8_t, vector signed char>((const TB*)(A+(ii*lda)+l), lda, RM, 8, (int8_t*)vec_A, false);
+                }
                packNormal<uint8_t, vector unsigned char>((B+(jj*ldb)+l), ldb, RN, 8, (uint8_t*)vec_B, true);
                for(int x = 0; x < 8; x+=4) {
                    __builtin_mma_xvi8ger4pp(&acc_0, vec_A[x], vec_B[x]);
@@ -1687,17 +2104,18 @@ class tinyBLAS_Q0_PPC {
                    }
                }
                __builtin_mma_disassemble_acc(vec_C, &acc_0);
-                auto aoffset = A+(ii*lda)+l;
-                for (int i = 0; i < RM; i++) {
-                    comparray[i] = 0;
-                    int ca = 0;
-                    const int8_t *at = aoffset->qs;
-                    for (int j = 0; j < 32; j++)
-                        ca += (int)*at++;
-                    comparray[i] = ca;
-                    aoffset += lda;
+                if (!isAblock_q4) {
+                    auto aoffset = A+(ii*lda)+l;
+                    for (int i = 0; i < RM; i++) {
+                        comparray[i] = 0;
+                        int ca = 0;
+                        auto *at = aoffset->qs;
+                        for (int j = 0; j < 32; j++)
+                            ca += (int)*at++;
+                        comparray[i] = ca;
+                        aoffset += lda;
+                    }
                }
-
                for (int i = 0; i < RM; i++) {
                    CA[i] = vec_splats((float)(((double)comparray[i]) * -128.0));
                    res[i] = vec_add(vec_ctf(vec_C[i], 0), CA[i]);
@@ -2013,6 +2431,7 @@ class tinyBLAS_PPC {
            }
        }
    }
+
    void KERNEL_4x4(int64_t ii, int64_t jj) {
        vec_t vec_A[4], vec_B[4], vec_C[4];
        acc_t acc_0;
@@ -2259,15 +2678,27 @@ class tinyBLAS_PPC {
            vec_t vec_C[4];
            acc_t acc_0;
            __builtin_mma_xxsetaccz(&acc_0);
-            vec_t vec_A[4], vec_B[4];
+            vec_t vec_A[4] {0}, vec_B[4] = {0};
            for (int l=0; l<k; l+=4) {
-                if (RN >= 4 && RM == 1) {
+                /* 'GEMV Forwarding' concept is used in first two conditional loops.
+                 * when one of the matrix has a single row/column, the elements are
+                 * broadcasted, instead of using packing routine to prepack the
+                 * matrix elements.
+                 */
+                if (RM == 1) {
                    TA* a = const_cast<TA*>(A+(ii)*lda+l);
-                    packTranspose<vector float>(B+(jj*ldb)+l, ldb, 4, 4, (TA*)vec_B);
+                    packTranspose<vector float>(B+(jj*ldb)+l, ldb, RN, 4, (TA*)vec_B);
                    vec_A[0] = (vec_t)vec_xl(0,a);
                    vec_A[1] = (vec_t)vec_splats(*((TA*)&vec_A+1));
                    vec_A[2] = (vec_t)vec_splats(*((TA*)&vec_A+2));
                    vec_A[3] = (vec_t)vec_splats(*((TA*)&vec_A+3));
+                } else if (RN == 1) {
+                    packTranspose<vector float>(A+(ii*lda)+l, lda, RM, 4, (TA*)vec_A);
+                    TB* b = const_cast<TB*>(B+(jj)*ldb+l);
+                    vec_B[0] = (vec_t)vec_xl(0,b);
+                    vec_B[1] = (vec_t)vec_splats(*((TB*)&vec_B+1));
+                    vec_B[2] = (vec_t)vec_splats(*((TB*)&vec_B+2));
+                    vec_B[3] = (vec_t)vec_splats(*((TB*)&vec_B+3));
                } else {
                    packTranspose<vector float>(A+(ii*lda)+l, lda, RM, 4, (TA*)vec_A);
                    packTranspose<vector float>(B+(jj*ldb)+l, ldb, RN, 4, (TA*)vec_B);
@@ -2371,8 +2802,10 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64
    assert(params->ith < params->nth);

    // only enable sgemm for prompt processing
+#if !defined(__MMA__)
    if (n < 2)
        return false;
+#endif

    if (Ctype != GGML_TYPE_F32)
        return false;
@@ -2503,8 +2936,8 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64
            params->ith, params->nth};
        tb.matmul(m, n);
        return true;
-
 #elif defined(__MMA__)
+    //TO-DO: Remove this condition once gemv forwarding is enabled.
        if (n < 8 && n != 4)
           return false;
        if (m < 8 && m != 4)
@@ -2516,7 +2949,6 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64
            params->ith, params->nth};
        tb.matmul(m, n);
        return true;
-
 #else
        return false;
 #endif
@@ -2541,6 +2973,19 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64
            params->ith, params->nth};
        tb.matmul(m, n);
        return true;
+#elif defined(__MMA__)
+    //TO-DO: Remove this condition once gemv forwarding is enabled.
+        if (n < 8 && n != 4)
+           return false;
+        if (m < 8 && m != 4)
+           return false;
+        tinyBLAS_Q0_PPC<block_q4_0, block_q8_0, float> tb{
+            k, (const block_q4_0 *)A, lda,
+            (const block_q8_0 *)B, ldb,
+            (float *)C, ldc,
+            params->ith, params->nth};
+        tb.matmul(m, n);
+        return true;
 #else
        return false;
 #endif
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
--- a/ggml/src/ggml-cpu/ops.h
+++ b/ggml/src/ggml-cpu/ops.h
@@ -0,0 +1,128 @@
+#pragma once
+
+#include "ggml.h"
+
+//
+// cache line
+//
+
+#if defined(__cpp_lib_hardware_interference_size)
+#define CACHE_LINE_SIZE std::hardware_destructive_interference_size
+#else
+#if defined(__POWER9_VECTOR__)
+#define CACHE_LINE_SIZE 128
+#elif defined(__VXE__) || defined(__VXE2__)
+#define CACHE_LINE_SIZE 256
+#else
+#define CACHE_LINE_SIZE 64
+#endif
+#endif
+
+static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void ggml_compute_forward_dup(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_add(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_add1(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_acc(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_sum(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_sum_rows(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_mean(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_argmax(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_count_equal(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_repeat(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_repeat_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_concat(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_silu_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_norm(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_rms_norm(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_rms_norm_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_group_norm(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_l2_norm(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_out_prod(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_scale(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_set(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_cpy(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_cont(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_reshape(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_view(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_permute(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_transpose(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_get_rows(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_get_rows_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_diag(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_diag_mask_inf(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_diag_mask_zero(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_soft_max(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_soft_max_ext_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_rope(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_rope_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_clamp(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_conv_transpose_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_im2col(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_im2col_back_f32(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_conv_transpose_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_pool_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_pool_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_upscale(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_pad(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_pad_reflect_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_arange(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_timestep_embedding(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_argsort(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_leaky_relu(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_flash_attn_ext(
+    const struct ggml_compute_params * params,
+    const struct ggml_tensor * q,
+    const struct ggml_tensor * k,
+    const struct ggml_tensor * v,
+    const struct ggml_tensor * mask,
+    struct ggml_tensor * dst);
+void ggml_compute_forward_flash_attn_back(
+        const struct ggml_compute_params * params,
+        const bool masked,
+        struct ggml_tensor * dst);
+void ggml_compute_forward_ssm_conv(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_ssm_scan(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_win_part(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_win_unpart(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_unary(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_get_rel_pos(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_add_rel_pos(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_rwkv_wkv6(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_rwkv_wkv7(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_gla(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_map_unary(
+    const struct ggml_compute_params * params,
+    struct ggml_tensor * dst,
+    const ggml_unary_op_f32_t fun);
+void ggml_compute_forward_map_binary(
+    const struct ggml_compute_params * params,
+    struct ggml_tensor * dst,
+    const ggml_binary_op_f32_t fun);
+void ggml_compute_forward_map_custom1_f32(
+    const struct ggml_compute_params * params,
+    struct ggml_tensor * dst,
+    const ggml_custom1_op_f32_t fun);
+void ggml_compute_forward_map_custom2_f32(
+    const struct ggml_compute_params * params,
+    struct ggml_tensor * dst,
+    const ggml_custom2_op_f32_t fun);
+void ggml_compute_forward_map_custom3_f32(
+    const struct ggml_compute_params * params,
+    struct ggml_tensor * dst,
+    const ggml_custom3_op_f32_t fun);
+void ggml_compute_forward_map_custom1(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_map_custom2(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_map_custom3(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_cross_entropy_loss(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_cross_entropy_loss_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_opt_step_adamw(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+
+#ifdef __cplusplus
+}
+#endif
--- a/ggml/src/ggml-cpu/simd-mappings.h
+++ b/ggml/src/ggml-cpu/simd-mappings.h
@@ -0,0 +1,884 @@
+#pragma once
+
+#include "ggml-cpu-impl.h"
+
+//
+// simd mappings
+//
+
+// we define a common set of C macros which map to specific intrinsics based on the current architecture
+// we then implement the fundamental computation operations below using only these macros
+// adding support for new architectures requires to define the corresponding SIMD macros
+//
+// GGML_F32_STEP / GGML_F16_STEP
+//   number of elements to process in a single step
+//
+// GGML_F32_EPR / GGML_F16_EPR
+//   number of elements to fit in a single register
+//
+
+#if defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA)
+
+#define GGML_SIMD
+
+// F32 NEON
+
+#define GGML_F32_STEP 16
+#define GGML_F32_EPR  4
+
+#define GGML_F32x4              float32x4_t
+#define GGML_F32x4_ZERO         vdupq_n_f32(0.0f)
+#define GGML_F32x4_SET1(x)      vdupq_n_f32(x)
+#define GGML_F32x4_LOAD         vld1q_f32
+#define GGML_F32x4_STORE        vst1q_f32
+#define GGML_F32x4_FMA(a, b, c) vfmaq_f32(a, b, c)
+#define GGML_F32x4_ADD          vaddq_f32
+#define GGML_F32x4_MUL          vmulq_f32
+#define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x)
+#define GGML_F32x4_REDUCE(res, x)                       \
+{                                                       \
+    int offset = GGML_F32_ARR >> 1;                     \
+    for (int i = 0; i < offset; ++i) {                  \
+        (x)[i] = vaddq_f32((x)[i], (x)[offset+i]);      \
+    }                                                   \
+    offset >>= 1;                                       \
+    for (int i = 0; i < offset; ++i) {                  \
+        (x)[i] = vaddq_f32((x)[i], (x)[offset+i]);      \
+    }                                                   \
+    offset >>= 1;                                       \
+    for (int i = 0; i < offset; ++i) {                  \
+        (x)[i] = vaddq_f32((x)[i], (x)[offset+i]);      \
+    }                                                   \
+    (res) = (ggml_float) GGML_F32x4_REDUCE_ONE((x)[0]); \
+}
+
+#define GGML_F32_VEC        GGML_F32x4
+#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
+#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
+#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
+#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
+#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
+#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
+#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
+#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
+
+// F16 NEON
+
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+    #define GGML_F16_STEP 32
+    #define GGML_F16_EPR  8
+
+    #define GGML_F16x8              float16x8_t
+    #define GGML_F16x8_ZERO         vdupq_n_f16(0.0f)
+    #define GGML_F16x8_SET1(x)      vdupq_n_f16(x)
+    #define GGML_F16x8_LOAD(x)      vld1q_f16((const __fp16 *)(x))
+    #define GGML_F16x8_STORE        vst1q_f16
+    #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
+    #define GGML_F16x8_ADD          vaddq_f16
+    #define GGML_F16x8_MUL          vmulq_f16
+    #define GGML_F16x8_REDUCE(res, x)                               \
+    do {                                                            \
+        int offset = GGML_F16_ARR >> 1;                             \
+        for (int i = 0; i < offset; ++i) {                          \
+            (x)[i] = vaddq_f16((x)[i], (x)[offset+i]);              \
+        }                                                           \
+        offset >>= 1;                                               \
+        for (int i = 0; i < offset; ++i) {                          \
+            (x)[i] = vaddq_f16((x)[i], (x)[offset+i]);              \
+        }                                                           \
+        offset >>= 1;                                               \
+        for (int i = 0; i < offset; ++i) {                          \
+            (x)[i] = vaddq_f16((x)[i], (x)[offset+i]);              \
+        }                                                           \
+        const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 ((x)[0])); \
+        const float32x4_t t1 = vcvt_f32_f16(vget_high_f16((x)[0])); \
+        (res) = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1));         \
+    } while (0)
+
+    #define GGML_F16_VEC                GGML_F16x8
+    #define GGML_F16_VEC_ZERO           GGML_F16x8_ZERO
+    #define GGML_F16_VEC_SET1           GGML_F16x8_SET1
+    #define GGML_F16_VEC_LOAD(p, i)     GGML_F16x8_LOAD(p)
+    #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((__fp16 *)(p), (r)[i])
+    #define GGML_F16_VEC_FMA            GGML_F16x8_FMA
+    #define GGML_F16_VEC_ADD            GGML_F16x8_ADD
+    #define GGML_F16_VEC_MUL            GGML_F16x8_MUL
+    #define GGML_F16_VEC_REDUCE         GGML_F16x8_REDUCE
+#else
+    // if FP16 vector arithmetic is not supported, we use FP32 instead
+    // and take advantage of the vcvt_ functions to convert to/from FP16
+
+    #define GGML_F16_STEP 16
+    #define GGML_F16_EPR  4
+
+    #define GGML_F32Cx4              float32x4_t
+    #define GGML_F32Cx4_ZERO         vdupq_n_f32(0.0f)
+    #define GGML_F32Cx4_SET1(x)      vdupq_n_f32(x)
+    #define GGML_F32Cx4_LOAD(x)      vcvt_f32_f16(vld1_f16((const __fp16 *)(x)))
+    #define GGML_F32Cx4_STORE(x, y)  vst1_f16(x, vcvt_f16_f32(y))
+    #define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c)
+    #define GGML_F32Cx4_ADD          vaddq_f32
+    #define GGML_F32Cx4_MUL          vmulq_f32
+    #define GGML_F32Cx4_REDUCE       GGML_F32x4_REDUCE
+
+    #define GGML_F16_VEC                GGML_F32Cx4
+    #define GGML_F16_VEC_ZERO           GGML_F32Cx4_ZERO
+    #define GGML_F16_VEC_SET1           GGML_F32Cx4_SET1
+    #define GGML_F16_VEC_LOAD(p, i)     GGML_F32Cx4_LOAD(p)
+    #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE((__fp16 *)(p), r[i])
+    #define GGML_F16_VEC_FMA            GGML_F32Cx4_FMA
+    #define GGML_F16_VEC_ADD            GGML_F32Cx4_ADD
+    #define GGML_F16_VEC_MUL            GGML_F32Cx4_MUL
+    #define GGML_F16_VEC_REDUCE         GGML_F32Cx4_REDUCE
+#endif
+
+#elif defined(__AVX512F__)
+
+#define GGML_SIMD
+
+// F32 AVX512
+
+#define GGML_F32_STEP 64
+#define GGML_F32_EPR  16
+
+#define GGML_F32x16         __m512
+#define GGML_F32x16_ZERO    _mm512_setzero_ps()
+#define GGML_F32x16_SET1(x) _mm512_set1_ps(x)
+#define GGML_F32x16_LOAD    _mm512_loadu_ps
+#define GGML_F32x16_STORE   _mm512_storeu_ps
+// _mm512_fmadd_ps is defined in AVX512F so no guard is required
+#define GGML_F32x16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
+#define GGML_F32x16_ADD     _mm512_add_ps
+#define GGML_F32x16_MUL     _mm512_mul_ps
+#define GGML_F32x16_REDUCE(res, x)                                    \
+do {                                                                  \
+    int offset = GGML_F32_ARR >> 1;                                   \
+    for (int i = 0; i < offset; ++i) {                                \
+        x[i] = _mm512_add_ps(x[i], x[offset+i]);                      \
+    }                                                                 \
+    offset >>= 1;                                                     \
+    for (int i = 0; i < offset; ++i) {                                \
+        x[i] = _mm512_add_ps(x[i], x[offset+i]);                      \
+    }                                                                 \
+    offset >>= 1;                                                     \
+    for (int i = 0; i < offset; ++i) {                                \
+        x[i] = _mm512_add_ps(x[i], x[offset+i]);                      \
+    }                                                                 \
+    res = (ggml_float) _mm512_reduce_add_ps(x[0]);                    \
+} while (0)
+
+// TODO: is this optimal ?
+
+#define GGML_F32_VEC        GGML_F32x16
+#define GGML_F32_VEC_ZERO   GGML_F32x16_ZERO
+#define GGML_F32_VEC_SET1   GGML_F32x16_SET1
+#define GGML_F32_VEC_LOAD   GGML_F32x16_LOAD
+#define GGML_F32_VEC_STORE  GGML_F32x16_STORE
+#define GGML_F32_VEC_FMA    GGML_F32x16_FMA
+#define GGML_F32_VEC_ADD    GGML_F32x16_ADD
+#define GGML_F32_VEC_MUL    GGML_F32x16_MUL
+#define GGML_F32_VEC_REDUCE GGML_F32x16_REDUCE
+
+// F16 AVX512
+
+// F16 AVX
+
+#define GGML_F16_STEP 64
+#define GGML_F16_EPR  16
+
+// AVX512 has FP16 extension (AVX512_FP16) but I don't have it on my machine so I use FP32 instead
+
+#define GGML_F32Cx16             __m512
+#define GGML_F32Cx16_ZERO        _mm512_setzero_ps()
+#define GGML_F32Cx16_SET1(x)     _mm512_set1_ps(x)
+
+// unlike  _mm256_cvt intrinsics that require F16C, _mm512_cvt is defined in AVX512F
+// so F16C guard isn't required
+#define GGML_F32Cx16_LOAD(x)     _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(x)))
+#define GGML_F32Cx16_STORE(x, y) _mm256_storeu_si256((__m256i *)(x), _mm512_cvtps_ph(y, 0))
+
+#define GGML_F32Cx16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
+#define GGML_F32Cx16_ADD         _mm512_add_ps
+#define GGML_F32Cx16_MUL         _mm512_mul_ps
+#define GGML_F32Cx16_REDUCE(res, x)                               \
+do {                                                              \
+    int offset = GGML_F32_ARR >> 1;                               \
+    for (int i = 0; i < offset; ++i) {                            \
+        x[i] = _mm512_add_ps(x[i], x[offset+i]);                  \
+    }                                                             \
+    offset >>= 1;                                                 \
+    for (int i = 0; i < offset; ++i) {                            \
+        x[i] = _mm512_add_ps(x[i], x[offset+i]);                  \
+    }                                                             \
+    offset >>= 1;                                                 \
+    for (int i = 0; i < offset; ++i) {                            \
+        x[i] = _mm512_add_ps(x[i], x[offset+i]);                  \
+    }                                                             \
+    res = (ggml_float) _mm512_reduce_add_ps(x[0]);                \
+} while (0)
+
+#define GGML_F16_VEC                GGML_F32Cx16
+#define GGML_F16_VEC_ZERO           GGML_F32Cx16_ZERO
+#define GGML_F16_VEC_SET1           GGML_F32Cx16_SET1
+#define GGML_F16_VEC_LOAD(p, i)     GGML_F32Cx16_LOAD(p)
+#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx16_STORE(p, r[i])
+#define GGML_F16_VEC_FMA            GGML_F32Cx16_FMA
+#define GGML_F16_VEC_ADD            GGML_F32Cx16_ADD
+#define GGML_F16_VEC_MUL            GGML_F32Cx16_MUL
+
+#define GGML_F16_VEC_REDUCE         GGML_F32Cx16_REDUCE
+#elif defined(__AVX__)
+
+#define GGML_SIMD
+
+// F32 AVX
+
+#define GGML_F32_STEP 32
+#define GGML_F32_EPR  8
+
+#define GGML_F32x8         __m256
+#define GGML_F32x8_ZERO    _mm256_setzero_ps()
+#define GGML_F32x8_SET1(x) _mm256_set1_ps(x)
+#define GGML_F32x8_LOAD    _mm256_loadu_ps
+#define GGML_F32x8_STORE   _mm256_storeu_ps
+#if defined(__FMA__)
+    #define GGML_F32x8_FMA(a, b, c) _mm256_fmadd_ps(b, c, a)
+#else
+    #define GGML_F32x8_FMA(a, b, c) _mm256_add_ps(_mm256_mul_ps(b, c), a)
+#endif
+#define GGML_F32x8_ADD     _mm256_add_ps
+#define GGML_F32x8_MUL     _mm256_mul_ps
+#define GGML_F32x8_REDUCE(res, x)                                 \
+do {                                                              \
+    int offset = GGML_F32_ARR >> 1;                               \
+    for (int i = 0; i < offset; ++i) {                            \
+        x[i] = _mm256_add_ps(x[i], x[offset+i]);                  \
+    }                                                             \
+    offset >>= 1;                                                 \
+    for (int i = 0; i < offset; ++i) {                            \
+        x[i] = _mm256_add_ps(x[i], x[offset+i]);                  \
+    }                                                             \
+    offset >>= 1;                                                 \
+    for (int i = 0; i < offset; ++i) {                            \
+        x[i] = _mm256_add_ps(x[i], x[offset+i]);                  \
+    }                                                             \
+    const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(x[0]),    \
+                                 _mm256_extractf128_ps(x[0], 1)); \
+    const __m128 t1 = _mm_hadd_ps(t0, t0);                        \
+    res = (ggml_float) _mm_cvtss_f32(_mm_hadd_ps(t1, t1));        \
+} while (0)
+// TODO: is this optimal ?
+
+#define GGML_F32_VEC        GGML_F32x8
+#define GGML_F32_VEC_ZERO   GGML_F32x8_ZERO
+#define GGML_F32_VEC_SET1   GGML_F32x8_SET1
+#define GGML_F32_VEC_LOAD   GGML_F32x8_LOAD
+#define GGML_F32_VEC_STORE  GGML_F32x8_STORE
+#define GGML_F32_VEC_FMA    GGML_F32x8_FMA
+#define GGML_F32_VEC_ADD    GGML_F32x8_ADD
+#define GGML_F32_VEC_MUL    GGML_F32x8_MUL
+#define GGML_F32_VEC_REDUCE GGML_F32x8_REDUCE
+
+// F16 AVX
+
+#define GGML_F16_STEP 32
+#define GGML_F16_EPR  8
+
+// F16 arithmetic is not supported by AVX, so we use F32 instead
+
+#define GGML_F32Cx8             __m256
+#define GGML_F32Cx8_ZERO        _mm256_setzero_ps()
+#define GGML_F32Cx8_SET1(x)     _mm256_set1_ps(x)
+
+#if defined(__F16C__)
+// the  _mm256_cvt intrinsics require F16C
+#define GGML_F32Cx8_LOAD(x)     _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(x)))
+#define GGML_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0))
+#else
+static inline __m256 __avx_f32cx8_load(const ggml_fp16_t * x) {
+    float tmp[8];
+
+    for (int i = 0; i < 8; i++) {
+        tmp[i] = GGML_FP16_TO_FP32(x[i]);
+    }
+
+    return _mm256_loadu_ps(tmp);
+}
+static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
+    float arr[8];
+
+    _mm256_storeu_ps(arr, y);
+
+    for (int i = 0; i < 8; i++)
+        x[i] = GGML_FP32_TO_FP16(arr[i]);
+}
+#define GGML_F32Cx8_LOAD(x)     __avx_f32cx8_load(x)
+#define GGML_F32Cx8_STORE(x, y) __avx_f32cx8_store(x, y)
+#endif
+
+#define GGML_F32Cx8_FMA         GGML_F32x8_FMA
+#define GGML_F32Cx8_ADD         _mm256_add_ps
+#define GGML_F32Cx8_MUL         _mm256_mul_ps
+#define GGML_F32Cx8_REDUCE      GGML_F32x8_REDUCE
+
+#define GGML_F16_VEC                GGML_F32Cx8
+#define GGML_F16_VEC_ZERO           GGML_F32Cx8_ZERO
+#define GGML_F16_VEC_SET1           GGML_F32Cx8_SET1
+#define GGML_F16_VEC_LOAD(p, i)     GGML_F32Cx8_LOAD(p)
+#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx8_STORE(p, r[i])
+#define GGML_F16_VEC_FMA            GGML_F32Cx8_FMA
+#define GGML_F16_VEC_ADD            GGML_F32Cx8_ADD
+#define GGML_F16_VEC_MUL            GGML_F32Cx8_MUL
+#define GGML_F16_VEC_REDUCE         GGML_F32Cx8_REDUCE
+
+#elif defined(__POWER9_VECTOR__)
+
+#define GGML_SIMD
+
+// F32 POWER9
+
+#define GGML_F32_STEP 32
+#define GGML_F32_EPR  4
+
+#define GGML_F32x4              vector float
+#define GGML_F32x4_ZERO         0.0f
+#define GGML_F32x4_SET1         vec_splats
+#define GGML_F32x4_LOAD(p)      vec_xl(0, p)
+#define GGML_F32x4_STORE(p, r)  vec_xst(r, 0, p)
+#define GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a)
+#define GGML_F32x4_ADD          vec_add
+#define GGML_F32x4_MUL          vec_mul
+#define GGML_F32x4_REDUCE(res, x)              \
+{                                              \
+    int offset = GGML_F32_ARR >> 1;            \
+    for (int i = 0; i < offset; ++i) {         \
+        x[i] = vec_add(x[i], x[offset+i]);     \
+    }                                          \
+    offset >>= 1;                              \
+    for (int i = 0; i < offset; ++i) {         \
+        x[i] = vec_add(x[i], x[offset+i]);     \
+    }                                          \
+    offset >>= 1;                              \
+    for (int i = 0; i < offset; ++i) {         \
+        x[i] = vec_add(x[i], x[offset+i]);     \
+    }                                          \
+    res = vec_extract(x[0], 0) +               \
+          vec_extract(x[0], 1) +               \
+          vec_extract(x[0], 2) +               \
+          vec_extract(x[0], 3);                \
+}
+
+#define GGML_F32_VEC        GGML_F32x4
+#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
+#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
+#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
+#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
+#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
+#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
+#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
+#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
+
+// F16 POWER9
+#define GGML_F16_STEP       GGML_F32_STEP
+#define GGML_F16_EPR        GGML_F32_EPR
+#define GGML_F16_VEC        GGML_F32x4
+#define GGML_F16_VEC_ZERO   GGML_F32x4_ZERO
+#define GGML_F16_VEC_SET1   GGML_F32x4_SET1
+#define GGML_F16_VEC_FMA    GGML_F32x4_FMA
+#define GGML_F16_VEC_ADD    GGML_F32x4_ADD
+#define GGML_F16_VEC_MUL    GGML_F32x4_MUL
+#define GGML_F16_VEC_REDUCE GGML_F32x4_REDUCE
+// Use vec_xl, not vec_ld, in case the load address is not aligned.
+#define GGML_F16_VEC_LOAD(p, i) (i & 0x1) ?                   \
+  vec_extract_fp32_from_shorth(vec_xl(0, p - GGML_F16_EPR)) : \
+  vec_extract_fp32_from_shortl(vec_xl(0, p))
+#define GGML_ENDIAN_BYTE(i) ((unsigned char *)&(uint16_t){1})[i]
+#define GGML_F16_VEC_STORE(p, r, i)                             \
+  if (i & 0x1)                                                  \
+    vec_xst(vec_pack_to_short_fp32(r[i - GGML_ENDIAN_BYTE(1)],  \
+                                   r[i - GGML_ENDIAN_BYTE(0)]), \
+            0, p - GGML_F16_EPR)
+
+#elif defined(__wasm_simd128__)
+
+#define GGML_SIMD
+
+// F32 WASM
+
+#define GGML_F32_STEP 16
+#define GGML_F32_EPR  4
+
+#define GGML_F32x4              v128_t
+#define GGML_F32x4_ZERO         wasm_f32x4_splat(0.0f)
+#define GGML_F32x4_SET1(x)      wasm_f32x4_splat(x)
+#define GGML_F32x4_LOAD         wasm_v128_load
+#define GGML_F32x4_STORE        wasm_v128_store
+#define GGML_F32x4_FMA(a, b, c) wasm_f32x4_add(wasm_f32x4_mul(b, c), a)
+#define GGML_F32x4_ADD          wasm_f32x4_add
+#define GGML_F32x4_MUL          wasm_f32x4_mul
+#define GGML_F32x4_REDUCE(res, x)                  \
+{                                                  \
+    int offset = GGML_F32_ARR >> 1;                \
+    for (int i = 0; i < offset; ++i) {             \
+        x[i] = wasm_f32x4_add(x[i], x[offset+i]);  \
+    }                                              \
+    offset >>= 1;                                  \
+    for (int i = 0; i < offset; ++i) {             \
+        x[i] = wasm_f32x4_add(x[i], x[offset+i]);  \
+    }                                              \
+    offset >>= 1;                                  \
+    for (int i = 0; i < offset; ++i) {             \
+        x[i] = wasm_f32x4_add(x[i], x[offset+i]);  \
+    }                                              \
+    res = wasm_f32x4_extract_lane(x[0], 0) +       \
+          wasm_f32x4_extract_lane(x[0], 1) +       \
+          wasm_f32x4_extract_lane(x[0], 2) +       \
+          wasm_f32x4_extract_lane(x[0], 3);        \
+}
+
+#define GGML_F32_VEC        GGML_F32x4
+#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
+#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
+#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
+#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
+#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
+#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
+#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
+#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
+
+// F16 WASM
+
+#define GGML_F16_STEP 16
+#define GGML_F16_EPR  4
+
+inline static v128_t __wasm_f16x4_load(const ggml_fp16_t * p) {
+    float tmp[4];
+
+    tmp[0] = GGML_FP16_TO_FP32(p[0]);
+    tmp[1] = GGML_FP16_TO_FP32(p[1]);
+    tmp[2] = GGML_FP16_TO_FP32(p[2]);
+    tmp[3] = GGML_FP16_TO_FP32(p[3]);
+
+    return wasm_v128_load(tmp);
+}
+
+inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
+    float tmp[4];
+
+    wasm_v128_store(tmp, x);
+
+    p[0] = GGML_FP32_TO_FP16(tmp[0]);
+    p[1] = GGML_FP32_TO_FP16(tmp[1]);
+    p[2] = GGML_FP32_TO_FP16(tmp[2]);
+    p[3] = GGML_FP32_TO_FP16(tmp[3]);
+}
+
+#define GGML_F16x4             v128_t
+#define GGML_F16x4_ZERO        wasm_f32x4_splat(0.0f)
+#define GGML_F16x4_SET1(x)     wasm_f32x4_splat(x)
+#define GGML_F16x4_LOAD(x)     __wasm_f16x4_load(x)
+#define GGML_F16x4_STORE(x, y) __wasm_f16x4_store(x, y)
+#define GGML_F16x4_FMA         GGML_F32x4_FMA
+#define GGML_F16x4_ADD         wasm_f32x4_add
+#define GGML_F16x4_MUL         wasm_f32x4_mul
+#define GGML_F16x4_REDUCE(res, x)                           \
+{                                                           \
+    int offset = GGML_F16_ARR >> 1;                         \
+    for (int i = 0; i < offset; ++i) {                      \
+        x[i] = wasm_f32x4_add(x[i], x[offset+i]);           \
+    }                                                       \
+    offset >>= 1;                                           \
+    for (int i = 0; i < offset; ++i) {                      \
+        x[i] = wasm_f32x4_add(x[i], x[offset+i]);           \
+    }                                                       \
+    offset >>= 1;                                           \
+    for (int i = 0; i < offset; ++i) {                      \
+        x[i] = wasm_f32x4_add(x[i], x[offset+i]);           \
+    }                                                       \
+    res = (ggml_float) (wasm_f32x4_extract_lane(x[0], 0) +  \
+          wasm_f32x4_extract_lane(x[0], 1) +                \
+          wasm_f32x4_extract_lane(x[0], 2) +                \
+          wasm_f32x4_extract_lane(x[0], 3));                \
+}
+
+#define GGML_F16_VEC                GGML_F16x4
+#define GGML_F16_VEC_ZERO           GGML_F16x4_ZERO
+#define GGML_F16_VEC_SET1           GGML_F16x4_SET1
+#define GGML_F16_VEC_LOAD(p, i)     GGML_F16x4_LOAD(p)
+#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x4_STORE(p, r[i])
+#define GGML_F16_VEC_FMA            GGML_F16x4_FMA
+#define GGML_F16_VEC_ADD            GGML_F16x4_ADD
+#define GGML_F16_VEC_MUL            GGML_F16x4_MUL
+#define GGML_F16_VEC_REDUCE         GGML_F16x4_REDUCE
+
+#elif defined(__SSE3__)
+
+#define GGML_SIMD
+
+// F32 SSE
+
+#define GGML_F32_STEP 32
+#define GGML_F32_EPR  4
+
+#define GGML_F32x4         __m128
+#define GGML_F32x4_ZERO    _mm_setzero_ps()
+#define GGML_F32x4_SET1(x) _mm_set1_ps(x)
+#define GGML_F32x4_LOAD    _mm_loadu_ps
+#define GGML_F32x4_STORE   _mm_storeu_ps
+#if defined(__FMA__)
+    // TODO: Does this work?
+    #define GGML_F32x4_FMA(a, b, c) _mm_fmadd_ps(b, c, a)
+#else
+    #define GGML_F32x4_FMA(a, b, c) _mm_add_ps(_mm_mul_ps(b, c), a)
+#endif
+#define GGML_F32x4_ADD     _mm_add_ps
+#define GGML_F32x4_MUL     _mm_mul_ps
+#define GGML_F32x4_REDUCE(res, x)                                 \
+{                                                                 \
+    int offset = GGML_F32_ARR >> 1;                               \
+    for (int i = 0; i < offset; ++i) {                            \
+        x[i] = _mm_add_ps(x[i], x[offset+i]);                     \
+    }                                                             \
+    offset >>= 1;                                                 \
+    for (int i = 0; i < offset; ++i) {                            \
+        x[i] = _mm_add_ps(x[i], x[offset+i]);                     \
+    }                                                             \
+    offset >>= 1;                                                 \
+    for (int i = 0; i < offset; ++i) {                            \
+        x[i] = _mm_add_ps(x[i], x[offset+i]);                     \
+    }                                                             \
+    const __m128 t0 = _mm_hadd_ps(x[0], x[0]);                    \
+    res = (ggml_float) _mm_cvtss_f32(_mm_hadd_ps(t0, t0));        \
+}
+// TODO: is this optimal ?
+
+#define GGML_F32_VEC        GGML_F32x4
+#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
+#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
+#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
+#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
+#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
+#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
+#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
+#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
+
+// F16 SSE
+
+#define GGML_F16_STEP 32
+#define GGML_F16_EPR  4
+
+static inline __m128 __sse_f16x4_load(const ggml_fp16_t * x) {
+    float tmp[4];
+
+    tmp[0] = GGML_FP16_TO_FP32(x[0]);
+    tmp[1] = GGML_FP16_TO_FP32(x[1]);
+    tmp[2] = GGML_FP16_TO_FP32(x[2]);
+    tmp[3] = GGML_FP16_TO_FP32(x[3]);
+
+    return _mm_loadu_ps(tmp);
+}
+
+static inline void __sse_f16x4_store(ggml_fp16_t * x, __m128 y) {
+    float arr[4];
+
+    _mm_storeu_ps(arr, y);
+
+    x[0] = GGML_FP32_TO_FP16(arr[0]);
+    x[1] = GGML_FP32_TO_FP16(arr[1]);
+    x[2] = GGML_FP32_TO_FP16(arr[2]);
+    x[3] = GGML_FP32_TO_FP16(arr[3]);
+}
+
+#define GGML_F32Cx4             __m128
+#define GGML_F32Cx4_ZERO        _mm_setzero_ps()
+#define GGML_F32Cx4_SET1(x)     _mm_set1_ps(x)
+#define GGML_F32Cx4_LOAD(x)     __sse_f16x4_load(x)
+#define GGML_F32Cx4_STORE(x, y) __sse_f16x4_store(x, y)
+#define GGML_F32Cx4_FMA         GGML_F32x4_FMA
+#define GGML_F32Cx4_ADD         _mm_add_ps
+#define GGML_F32Cx4_MUL         _mm_mul_ps
+#define GGML_F32Cx4_REDUCE      GGML_F32x4_REDUCE
+
+#define GGML_F16_VEC                 GGML_F32Cx4
+#define GGML_F16_VEC_ZERO            GGML_F32Cx4_ZERO
+#define GGML_F16_VEC_SET1            GGML_F32Cx4_SET1
+#define GGML_F16_VEC_LOAD(p, i)      GGML_F32Cx4_LOAD(p)
+#define GGML_F16_VEC_STORE(p, r, i)  GGML_F32Cx4_STORE(p, r[i])
+#define GGML_F16_VEC_FMA             GGML_F32Cx4_FMA
+#define GGML_F16_VEC_ADD             GGML_F32Cx4_ADD
+#define GGML_F16_VEC_MUL             GGML_F32Cx4_MUL
+#define GGML_F16_VEC_REDUCE          GGML_F32Cx4_REDUCE
+
+#elif defined(__loongarch_asx)
+
+#define GGML_SIMD
+
+// F32 LASX
+#define GGML_F32_STEP 32
+#define GGML_F32_EPR  8
+
+#define GGML_F32x8         __m256
+#define GGML_F32x8_ZERO    (__m256)__lasx_xvldi(0)
+#define GGML_F32x8_SET1(x) (__m256)__lasx_xvreplfr2vr_s((x))
+#define GGML_F32x8_LOAD(x) (__m256)__lasx_xvld((x), 0)
+#define GGML_F32x8_STORE(x,y)   __lasx_xvst((y), (x), 0)
+#define GGML_F32x8_FMA(a, b, c) __lasx_xvfmadd_s(b, c, a)
+#define GGML_F32x8_ADD     __lasx_xvfadd_s
+#define GGML_F32x8_MUL     __lasx_xvfmul_s
+#define GGML_F32x8_REDUCE(res, x)                                 \
+do {                                                              \
+    int offset = GGML_F32_ARR >> 1;                               \
+    for (int i = 0; i < offset; ++i) {                            \
+        x[i] = __lasx_xvfadd_s(x[i], x[offset+i]);                  \
+    }                                                             \
+    offset >>= 1;                                                 \
+    for (int i = 0; i < offset; ++i) {                            \
+        x[i] = __lasx_xvfadd_s(x[i], x[offset+i]);                  \
+    }                                                             \
+    offset >>= 1;                                                 \
+    for (int i = 0; i < offset; ++i) {                            \
+        x[i] = __lasx_xvfadd_s(x[i], x[offset+i]);                  \
+    }                                                             \
+    float *tmp_p = (float *)&x[0]; \
+    res = tmp_p[0] + tmp_p[1] + tmp_p[2] + tmp_p[3] + tmp_p[4] + tmp_p[5] + tmp_p[6] + tmp_p[7];  \
+} while (0)
+// TODO: is this optimal ?
+
+#define GGML_F32_VEC        GGML_F32x8
+#define GGML_F32_VEC_ZERO   GGML_F32x8_ZERO
+#define GGML_F32_VEC_SET1   GGML_F32x8_SET1
+#define GGML_F32_VEC_LOAD   GGML_F32x8_LOAD
+#define GGML_F32_VEC_STORE  GGML_F32x8_STORE
+#define GGML_F32_VEC_FMA    GGML_F32x8_FMA
+#define GGML_F32_VEC_ADD    GGML_F32x8_ADD
+#define GGML_F32_VEC_MUL    GGML_F32x8_MUL
+#define GGML_F32_VEC_REDUCE GGML_F32x8_REDUCE
+
+// F16 LASX
+
+#define GGML_F16_STEP 32
+#define GGML_F16_EPR  8
+
+// F16 arithmetic is not supported by LASX, so we use F32 instead
+
+#define GGML_F32Cx8          __m256
+#define GGML_F32Cx8_ZERO    (__m256)__lasx_xvldi(0)
+#define GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplgr2vr_w((x))
+
+static inline __m256 __lasx_f32cx8_load(const ggml_fp16_t * x) {
+    __m256i a;
+    memcpy(&a, x, sizeof(ggml_fp16_t) * 8);
+    a = __lasx_xvpermi_d(a, 0 | (1 << 4));
+    return __lasx_xvfcvtl_s_h(a);
+}
+
+static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
+    __m256i a = __lasx_xvfcvt_h_s(y, y);
+    a = __lasx_xvpermi_d(a, 0 | (2 << 2));
+    memcpy(x, &a, sizeof(ggml_fp16_t) * 8);
+}
+#define GGML_F32Cx8_LOAD(x)     __lasx_f32cx8_load(x)
+#define GGML_F32Cx8_STORE(x, y) __lasx_f32cx8_store(x, y)
+
+#define GGML_F32Cx8_FMA         GGML_F32x8_FMA
+#define GGML_F32Cx8_ADD         __lasx_xvfadd_s
+#define GGML_F32Cx8_MUL         __lasx_xvfmul_s
+#define GGML_F32Cx8_REDUCE      GGML_F32x8_REDUCE
+
+#define GGML_F16_VEC                GGML_F32Cx8
+#define GGML_F16_VEC_ZERO           GGML_F32Cx8_ZERO
+#define GGML_F16_VEC_SET1           GGML_F32Cx8_SET1
+#define GGML_F16_VEC_LOAD(p, i)     GGML_F32Cx8_LOAD(p)
+#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx8_STORE(p, r[i])
+#define GGML_F16_VEC_FMA            GGML_F32Cx8_FMA
+#define GGML_F16_VEC_ADD            GGML_F32Cx8_ADD
+#define GGML_F16_VEC_MUL            GGML_F32Cx8_MUL
+#define GGML_F16_VEC_REDUCE         GGML_F32Cx8_REDUCE
+
+#elif defined(__loongarch_sx)
+
+#define GGML_SIMD
+
+// F32 LSX
+
+#define GGML_F32_STEP 32
+#define GGML_F32_EPR  4
+
+#define GGML_F32x4         __m128
+#define GGML_F32x4_ZERO    __lsx_vldi(0)
+#define GGML_F32x4_SET1(x) __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
+#define GGML_F32x4_LOAD(x) __lsx_vld((x), 0)
+#define GGML_F32x4_STORE((x),(y))   __lsx_vst((y), (x), 0)
+#define GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a)
+#define GGML_F32x4_ADD     __lsx_vfadd_s
+#define GGML_F32x4_MUL     __lsx_vfmul_s
+#define GGML_F32x4_REDUCE(res, x)                                                     \
+{                                                                                     \
+    int offset = GGML_F32_ARR >> 1;                                                   \
+    for (int i = 0; i < offset; ++i) {                                                \
+        x[i] = __lsx_vfadd_s(x[i], x[offset + i]);                                    \
+    }                                                                                 \
+    offset >>= 1;                                                                     \
+    for (int i = 0; i < offset; ++i) {                                                \
+        x[i] = __lsx_vfadd_s(x[i], x[offset + i]);                                    \
+    }                                                                                 \
+    offset >>= 1;                                                                     \
+    for (int i = 0; i < offset; ++i) {                                                \
+        x[i] = __lsx_vfadd_s(x[i], x[offset + i]);                                    \
+    }                                                                                 \
+    __m128i tmp     = __lsx_vsrli_d((__m128i) x[0], 32);                              \
+    tmp             = (__m128i) __lsx_vfadd_s((__m128) tmp, x[0]);                    \
+    tmp             = __lsx_vpickev_w(__lsx_vldi(0), tmp);                            \
+    const __m128 t0 = __lsx_vshuf4i_w(tmp, 0x88);                                     \
+    tmp             = __lsx_vsrli_d((__m128i) t0, 32);                                \
+    tmp             = (__m128i) __lsx_vfadd_s((__m128) tmp, t0);                      \
+    tmp             = __lsx_vpickev_w(__lsx_vldi(0), tmp);                            \
+    res             = (ggml_float) __lsx_vpickve2gr_w(__lsx_vshuf4i_w(tmp, 0x88), 0); \
+}
+
+#define GGML_F32_VEC        GGML_F32x4
+#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
+#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
+#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
+#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
+#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
+#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
+#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
+#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
+
+// F16 LSX
+
+#define GGML_F16_STEP 32
+#define GGML_F16_EPR  4
+
+static inline __m128 __lsx_f16x4_load(const ggml_fp16_t * x) {
+    float tmp[4];
+
+    tmp[0] = GGML_FP16_TO_FP32(x[0]);
+    tmp[1] = GGML_FP16_TO_FP32(x[1]);
+    tmp[2] = GGML_FP16_TO_FP32(x[2]);
+    tmp[3] = GGML_FP16_TO_FP32(x[3]);
+
+    return __lsx_vld(tmp, 0);
+}
+
+static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
+    float arr[4];
+
+    __lsx_vst(y, arr, 0);
+
+    x[0] = GGML_FP32_TO_FP16(arr[0]);
+    x[1] = GGML_FP32_TO_FP16(arr[1]);
+    x[2] = GGML_FP32_TO_FP16(arr[2]);
+    x[3] = GGML_FP32_TO_FP16(arr[3]);
+}
+
+#define GGML_F32Cx4             __m128
+#define GGML_F32Cx4_ZERO        __lsx_vldi(0)
+#define GGML_F32Cx4_SET1(x)     __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
+#define GGML_F32Cx4_LOAD(x)     __lsx_f16x4_load(x)
+#define GGML_F32Cx4_STORE(x, y) __lsx_f16x4_store(x, y)
+#define GGML_F32Cx4_FMA         GGML_F32x4_FMA
+#define GGML_F32Cx4_ADD         __lsx_vfadd_s
+#define GGML_F32Cx4_MUL         __lsx_vfmul_s
+#define GGML_F32Cx4_REDUCE      GGML_F32x4_REDUCE
+
+#define GGML_F16_VEC                 GGML_F32Cx4
+#define GGML_F16_VEC_ZERO            GGML_F32Cx4_ZERO
+#define GGML_F16_VEC_SET1            GGML_F32Cx4_SET1
+#define GGML_F16_VEC_LOAD(p, i)      GGML_F32Cx4_LOAD(p)
+#define GGML_F16_VEC_STORE(p, r, i)  GGML_F32Cx4_STORE(p, r[i])
+#define GGML_F16_VEC_FMA             GGML_F32Cx4_FMA
+#define GGML_F16_VEC_ADD             GGML_F32Cx4_ADD
+#define GGML_F16_VEC_MUL             GGML_F32Cx4_MUL
+#define GGML_F16_VEC_REDUCE          GGML_F32Cx4_REDUCE
+
+#elif defined(__VXE__) || defined(__VXE2__)
+
+#define GGML_SIMD
+
+// F32 s390x
+
+#define GGML_F32_STEP 32
+#define GGML_F32_EPR  4
+
+#define GGML_F32x4              __vector float
+#define GGML_F32x4_ZERO         vec_splats(0.0f)
+#define GGML_F32x4_SET1         vec_splats
+#define GGML_F32x4_LOAD(p)      vec_xl(0, p)
+#define GGML_F32x4_STORE(p, r)  vec_xst(r, 0, p)
+#define GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a)
+#define GGML_F32x4_ADD          vec_add
+#define GGML_F32x4_MUL          vec_mul
+#define GGML_F32x4_REDUCE(res, x)                   \
+{                                                   \
+    int offset = GGML_F32_ARR >> 1;                 \
+    for (int i = 0; i < offset; ++i) {              \
+        x[i] = vec_add(x[i], x[offset + i]);        \
+    }                                               \
+    offset >>= 1;                                   \
+    for (int i = 0; i < offset; ++i) {              \
+        x[i] = vec_add(x[i], x[offset + i]);        \
+    }                                               \
+    offset >>= 1;                                   \
+    for (int i = 0; i < offset; ++i) {              \
+        x[i] = vec_add(x[i], x[offset + i]);        \
+    }                                               \
+    res = vec_extract(x[0], 0) +                    \
+          vec_extract(x[0], 1) +                    \
+          vec_extract(x[0], 2) +                    \
+          vec_extract(x[0], 3);                     \
+}
+
+#define GGML_F32_VEC        GGML_F32x4
+#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
+#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
+#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
+#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
+#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
+#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
+#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
+#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
+
+// F16 s390x
+#define GGML_F16_STEP GGML_F32_STEP
+#define GGML_F16_EPR  GGML_F32_EPR
+
+static inline __vector float __lzs_f16cx4_load(const ggml_fp16_t * x) {
+    float tmp[4];
+
+    for (int i = 0; i < 4; i++) {
+        tmp[i] = GGML_FP16_TO_FP32(x[i]);
+    }
+
+    return vec_xl(0, tmp);
+}
+
+static inline void __lzs_f16cx4_store(ggml_fp16_t * x, __vector float y) {
+    float arr[4];
+
+    vec_xst(y, 0, arr);
+
+    for (int i = 0; i < 4; i++) {
+        x[i] = GGML_FP32_TO_FP16(arr[i]);
+    }
+}
+
+#define GGML_F16_VEC                GGML_F32x4
+#define GGML_F16_VEC_ZERO           GGML_F32x4_ZERO
+#define GGML_F16_VEC_SET1           GGML_F32x4_SET1
+#define GGML_F16_VEC_LOAD(p, i)     __lzs_f16cx4_load(p)
+#define GGML_F16_VEC_STORE(p, r, i) __lzs_f16cx4_store(p, r[i])
+#define GGML_F16_VEC_FMA            GGML_F32x4_FMA
+#define GGML_F16_VEC_ADD            GGML_F32x4_ADD
+#define GGML_F16_VEC_MUL            GGML_F32x4_MUL
+#define GGML_F16_VEC_REDUCE         GGML_F32x4_REDUCE
+
+#endif
+
+// GGML_F32_ARR / GGML_F16_ARR
+//   number of registers to use per step
+#ifdef GGML_SIMD
+#define GGML_F32_ARR (GGML_F32_STEP/GGML_F32_EPR)
+#define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR)
+#endif
--- a/ggml/src/ggml-cpu/unary-ops.cpp
+++ b/ggml/src/ggml-cpu/unary-ops.cpp
@@ -0,0 +1,186 @@
+#include "unary-ops.h"
+
+static inline float op_abs(float x) {
+    return fabsf(x);
+}
+
+static inline float op_sgn(float x) {
+    return (x > 0.f) ? 1.f : ((x < 0.f) ? -1.f : 0.f);
+}
+
+static inline float op_neg(float x) {
+    return -x;
+}
+
+static inline float op_step(float x) {
+    return (x > 0.f) ? 1.f : 0.f;
+}
+
+static inline float op_tanh(float x) {
+    return tanhf(x);
+}
+
+static inline float op_elu(float x) {
+    return (x > 0.f) ? x : expm1f(x);
+}
+
+static inline float op_relu(float x) {
+    return (x > 0.f) ? x : 0.f;
+}
+
+static inline float op_sigmoid(float x) {
+    return 1.f / (1.f + expf(-x));
+}
+
+static inline float op_hardsigmoid(float x) {
+    return fminf(1.0f, fmaxf(0.0f, (x + 3.0f) / 6.0f));
+}
+
+static inline float op_exp(float x) {
+    return expf(x);
+}
+
+static inline float op_hardswish(float x) {
+    return x * fminf(1.0f, fmaxf(0.0f, (x + 3.0f) / 6.0f));
+}
+
+static inline float op_sqr(float x) {
+    return x * x;
+}
+
+static inline float op_sqrt(float x) {
+    return sqrtf(x);
+}
+
+static inline float op_sin(float x) {
+    return sinf(x);
+}
+
+static inline float op_cos(float x) {
+    return cosf(x);
+}
+
+static inline float op_log(float x) {
+    return logf(x);
+}
+
+template <float (*op)(float), typename src0_t, typename dst_t>
+static inline void vec_unary_op(int64_t n, dst_t * y, const src0_t * x) {
+    constexpr auto src0_to_f32 = type_conversion_table<src0_t>::to_f32;
+    constexpr auto f32_to_dst  = type_conversion_table<dst_t >::from_f32;
+
+    for (int i = 0; i < n; i++) {
+        y[i] = f32_to_dst(op(src0_to_f32(x[i])));
+    }
+}
+
+template <float (*op)(float), typename src0_t, typename dst_t>
+static void apply_unary_op(const ggml_compute_params * params, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+
+    GGML_ASSERT(ggml_is_contiguous_1(src0) && ggml_is_contiguous_1(dst) && ggml_are_same_shape(src0, dst));
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    GGML_ASSERT( nb0 == sizeof(dst_t));
+    GGML_ASSERT(nb00 == sizeof(src0_t));
+
+    const auto [ir0, ir1] = get_thread_range(params, src0);
+
+    for (int64_t ir = ir0; ir < ir1; ++ir) {
+        const int64_t i03 = ir/(ne02*ne01);
+        const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
+        const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
+
+        dst_t        * dst_ptr  = (dst_t  *)       ((char *)       dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
+        const src0_t * src0_ptr = (const src0_t *) ((const char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
+
+        vec_unary_op<op>(ne0, dst_ptr, src0_ptr);
+    }
+}
+
+// TODO: Use the 'traits' lookup table (for type conversion fns), instead of a mass of 'if' conditions with long templates
+template <float (*op)(float)>
+static void unary_op(const ggml_compute_params * params, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+
+    /*  */ if (src0->type == GGML_TYPE_F32  && dst->type == GGML_TYPE_F32) { // all f32
+        apply_unary_op<op, float, float>(params, dst);
+    } else if (src0->type == GGML_TYPE_F16  && dst->type == GGML_TYPE_F16) { // all f16
+        apply_unary_op<op, ggml_fp16_t, ggml_fp16_t>(params, dst);
+    } else if (src0->type == GGML_TYPE_BF16 && dst->type == GGML_TYPE_BF16) { // all bf16
+        apply_unary_op<op, ggml_bf16_t, ggml_bf16_t>(params, dst);
+    } else if (src0->type == GGML_TYPE_BF16 && dst->type == GGML_TYPE_F32) {
+        apply_unary_op<op, ggml_bf16_t, float>(params, dst);
+    } else if (src0->type == GGML_TYPE_F16  && dst->type == GGML_TYPE_F32) {
+        apply_unary_op<op, ggml_fp16_t, float>(params, dst);
+    } else {
+        fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s\n", __func__,
+            ggml_type_name(dst->type), ggml_type_name(src0->type));
+        GGML_ABORT("fatal error");
+    }
+}
+
+void ggml_compute_forward_abs(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_abs>(params, dst);
+}
+
+void ggml_compute_forward_sgn(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_sgn>(params, dst);
+}
+
+void ggml_compute_forward_neg(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_neg>(params, dst);
+}
+
+void ggml_compute_forward_step(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_step>(params, dst);
+}
+
+void ggml_compute_forward_tanh(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_tanh>(params, dst);
+}
+
+void ggml_compute_forward_elu(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_elu>(params, dst);
+}
+
+void ggml_compute_forward_relu(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_relu>(params, dst);
+}
+
+void ggml_compute_forward_sigmoid(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_sigmoid>(params, dst);
+}
+
+void ggml_compute_forward_hardsigmoid(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_hardsigmoid>(params, dst);
+}
+
+void ggml_compute_forward_exp(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_exp>(params, dst);
+}
+
+void ggml_compute_forward_hardswish(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_hardswish>(params, dst);
+}
+
+void ggml_compute_forward_sqr(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_sqr>(params, dst);
+}
+
+void ggml_compute_forward_sqrt(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_sqrt>(params, dst);
+}
+
+void ggml_compute_forward_sin(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_sin>(params, dst);
+}
+
+void ggml_compute_forward_cos(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_cos>(params, dst);
+}
+
+void ggml_compute_forward_log(const ggml_compute_params * params, ggml_tensor * dst) {
+    unary_op<op_log>(params, dst);
+}
--- a/ggml/src/ggml-cpu/unary-ops.h
+++ b/ggml/src/ggml-cpu/unary-ops.h
@@ -0,0 +1,28 @@
+#pragma once
+
+#include "common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void ggml_compute_forward_abs(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_sgn(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_neg(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_step(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_tanh(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_elu(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_relu(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_sigmoid(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_hardsigmoid(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_exp(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_hardswish(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_sqr(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_sqrt(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_sin(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_cos(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_log(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+
+#ifdef __cplusplus
+}
+#endif
--- a/ggml/src/ggml-cpu/vec.cpp
+++ b/ggml/src/ggml-cpu/vec.cpp
@@ -0,0 +1,258 @@
+#include "vec.h"
+
+#include <cassert>
+
+#if defined(_MSC_VER)
+// disable "possible loss of data" to avoid hundreds of casts
+// we should just be careful :)
+#pragma warning(disable: 4244 4267)
+#endif
+
+// precomputed gelu table for f16 (128 KB)
+ggml_fp16_t ggml_table_gelu_f16[1 << 16];
+
+// precomputed quick gelu table for f16 (128 KB)
+ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16];
+
+void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc) {
+   assert(nrc == 1);
+   GGML_UNUSED(nrc);
+   GGML_UNUSED(bx);
+   GGML_UNUSED(by);
+   GGML_UNUSED(bs);
+
+#if defined(GGML_SIMD)
+    float sumf = 0.0f;
+    const int np = (n & ~(GGML_F32_STEP - 1));
+
+    GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };
+
+    GGML_F32_VEC ax[GGML_F32_ARR];
+    GGML_F32_VEC ay[GGML_F32_ARR];
+
+    for (int i = 0; i < np; i += GGML_F32_STEP) {
+        for (int j = 0; j < GGML_F32_ARR; j++) {
+            ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
+            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
+
+            sum[j] = GGML_F32_VEC_FMA(sum[j], ax[j], ay[j]);
+        }
+    }
+
+    // reduce sum0..sum3 to sum0
+    GGML_F32_VEC_REDUCE(sumf, sum);
+
+    // leftovers
+    for (int i = np; i < n; ++i) {
+        sumf += x[i]*y[i];
+    }
+#else
+    // scalar
+    ggml_float sumf = 0.0;
+    for (int i = 0; i < n; ++i) {
+        sumf += (ggml_float)(x[i]*y[i]);
+    }
+#endif
+
+    *s = sumf;
+}
+
+void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc) {
+    assert(nrc == 1);
+    GGML_UNUSED(nrc);
+    GGML_UNUSED(bx);
+    GGML_UNUSED(by);
+    GGML_UNUSED(bs);
+    int i = 0;
+    ggml_float sumf = 0;
+
+#if defined(__AVX512BF16__)
+    __m512 c1 = _mm512_setzero_ps();
+    __m512 c2 = _mm512_setzero_ps();
+    for (; i + 64 <= n; i += 64) {
+        c1 = _mm512_dpbf16_ps(c1, m512bh(_mm512_loadu_si512((x + i))),
+                             m512bh(_mm512_loadu_si512((y + i))));
+        c2 = _mm512_dpbf16_ps(c2, m512bh(_mm512_loadu_si512((x + i + 32))),
+                             m512bh(_mm512_loadu_si512((y + i + 32))));
+    }
+    sumf += (ggml_float)_mm512_reduce_add_ps(c1);
+    sumf += (ggml_float)_mm512_reduce_add_ps(c2);
+
+#elif defined(__AVX512F__)
+#define LOAD(p) _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i *)(p))), 16))
+    __m512 c1 = _mm512_setzero_ps();
+    __m512 c2 = _mm512_setzero_ps();
+    for (; i + 32 <= n; i += 32) {
+        c1 = _mm512_add_ps(_mm512_mul_ps(LOAD(x + i), LOAD(y + i)), c1);
+        c2 = _mm512_add_ps(_mm512_mul_ps(LOAD(x + i + 16), LOAD(y + i + 16)), c2);
+    }
+    sumf += (ggml_float)_mm512_reduce_add_ps(c1);
+    sumf += (ggml_float)_mm512_reduce_add_ps(c2);
+
+#undef LOAD
+#elif defined(__AVX2__) || defined(__AVX__)
+#if defined(__AVX2__)
+#define LOAD(p) _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)(p))), 16))
+#else
+#define LOAD(p) _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_epi32(_mm_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)(p))), 16)), (_mm_slli_epi32(_mm_cvtepu16_epi32(_mm_bsrli_si128(_mm_loadu_si128((const __m128i *)(p)), 8)), 16)), 1))
+#endif
+    __m256 c1 = _mm256_setzero_ps();
+    __m256 c2 = _mm256_setzero_ps();
+    __m256 c3 = _mm256_setzero_ps();
+    __m256 c4 = _mm256_setzero_ps();
+    for (; i + 32 <= n; i += 32) {
+        c1 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i), LOAD(y + i)), c1);
+        c2 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i + 8), LOAD(y + i + 8)), c2);
+        c3 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i + 16), LOAD(y + i + 16)), c3);
+        c4 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i + 24), LOAD(y + i + 24)), c4);
+    }
+    __m128 g;
+    c1 = _mm256_add_ps(_mm256_add_ps(c1, c3),
+                       _mm256_add_ps(c2, c4));
+    g = _mm_add_ps(_mm256_extractf128_ps(c1, 1),
+                   _mm256_castps256_ps128(c1));
+    g = _mm_add_ps(g, _mm_movehl_ps(g, g));
+    g = _mm_add_ss(g, _mm_movehdup_ps(g));
+    sumf += (ggml_float)_mm_cvtss_f32(g);
+
+#undef LOAD
+#endif
+
+    for (; i < n; ++i) {
+        sumf += (ggml_float)(GGML_BF16_TO_FP32(x[i]) *
+                             GGML_BF16_TO_FP32(y[i]));
+    }
+    *s = sumf;
+}
+
+void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc) {
+    assert(nrc == 1);
+    GGML_UNUSED(nrc);
+    GGML_UNUSED(bx);
+    GGML_UNUSED(by);
+    GGML_UNUSED(bs);
+
+    ggml_float sumf = 0.0;
+
+#if defined(GGML_SIMD)
+    const int np = (n & ~(GGML_F16_STEP - 1));
+
+    GGML_F16_VEC sum[GGML_F16_ARR] = { GGML_F16_VEC_ZERO };
+
+    GGML_F16_VEC ax[GGML_F16_ARR];
+    GGML_F16_VEC ay[GGML_F16_ARR];
+
+    for (int i = 0; i < np; i += GGML_F16_STEP) {
+        for (int j = 0; j < GGML_F16_ARR; j++) {
+            ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
+            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
+
+            sum[j] = GGML_F16_VEC_FMA(sum[j], ax[j], ay[j]);
+        }
+    }
+
+    // reduce sum0..sum3 to sum0
+    GGML_F16_VEC_REDUCE(sumf, sum);
+
+    // leftovers
+    for (int i = np; i < n; ++i) {
+        sumf += (ggml_float)(GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]));
+    }
+#else
+    for (int i = 0; i < n; ++i) {
+        sumf += (ggml_float)(GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]));
+    }
+#endif
+
+    *s = sumf;
+}
+
+void ggml_vec_silu_f32(const int n, float * y, const float * x) {
+    int i = 0;
+#if defined(__AVX512F__) && defined(__AVX512DQ__)
+    for (; i + 15 < n; i += 16) {
+        _mm512_storeu_ps(y + i, ggml_v_silu(_mm512_loadu_ps(x + i)));
+    }
+#elif defined(__AVX2__) && defined(__FMA__)
+    for (; i + 7 < n; i += 8) {
+        _mm256_storeu_ps(y + i, ggml_v_silu(_mm256_loadu_ps(x + i)));
+    }
+#elif defined(__SSE2__)
+    for (; i + 3 < n; i += 4) {
+        _mm_storeu_ps(y + i, ggml_v_silu(_mm_loadu_ps(x + i)));
+    }
+#elif defined(__ARM_NEON) && defined(__aarch64__)
+    for (; i + 3 < n; i += 4) {
+        vst1q_f32(y + i, ggml_v_silu(vld1q_f32(x + i)));
+    }
+#endif
+    for (; i < n; ++i) {
+        y[i] = ggml_silu_f32(x[i]);
+    }
+}
+
+ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max) {
+    int i = 0;
+    ggml_float sum = 0;
+#if defined(__AVX512F__) && defined(__AVX512DQ__)
+    for (; i + 15 < n; i += 16) {
+        __m512 val = ggml_v_expf(_mm512_sub_ps(_mm512_loadu_ps(x + i),
+                                               _mm512_set1_ps(max)));
+        _mm512_storeu_ps(y + i, val);
+        sum += (ggml_float)_mm512_reduce_add_ps(val);
+    }
+#elif defined(__AVX2__) && defined(__FMA__)
+    for (; i + 7 < n; i += 8) {
+        __m256 val = ggml_v_expf(_mm256_sub_ps(_mm256_loadu_ps(x + i),
+                                               _mm256_set1_ps(max)));
+        _mm256_storeu_ps(y + i, val);
+        __m128 val2 = _mm_add_ps(_mm256_extractf128_ps(val, 1),
+                                 _mm256_castps256_ps128(val));
+        val2 = _mm_add_ps(val2, _mm_movehl_ps(val2, val2));
+        val2 = _mm_add_ss(val2, _mm_movehdup_ps(val2));
+        sum += (ggml_float)_mm_cvtss_f32(val2);
+    }
+#elif defined(__SSE2__)
+    for (; i + 3 < n; i += 4) {
+        __m128 val = ggml_v_expf(_mm_sub_ps(_mm_loadu_ps(x + i),
+                                            _mm_set1_ps(max)));
+        _mm_storeu_ps(y + i, val);
+#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__)
+        val = _mm_add_ps(val, _mm_movehl_ps(val, val));
+        val = _mm_add_ss(val, _mm_movehdup_ps(val));
+#else
+        __m128 tmp = _mm_shuffle_ps(val, val, _MM_SHUFFLE(2, 3, 0, 1));
+        val = _mm_add_ps(val, tmp);
+        tmp = _mm_movehl_ps(tmp, val);
+        val = _mm_add_ss(val, tmp);
+#endif
+        sum += (ggml_float)_mm_cvtss_f32(val);
+    }
+#elif defined(__ARM_NEON) && defined(__aarch64__)
+    for (; i + 3 < n; i += 4) {
+        float32x4_t val = ggml_v_expf(vsubq_f32(vld1q_f32(x + i),
+                                                vdupq_n_f32(max)));
+        vst1q_f32(y + i, val);
+        sum += (ggml_float)vaddvq_f32(val);
+    }
+#endif
+    for (; i < n; ++i) {
+        float val = expf(x[i] - max);
+        sum += (ggml_float)val;
+        y[i] = val;
+    }
+    return sum;
+}
+
+ggml_float ggml_vec_log_soft_max_f32(const int n, float * y, const float * x, float max) {
+    // log(soft_max) = log(soft_max_i / soft_max_sum) = log(soft_max_i) - log(soft_max_sum) = (logit_i - max) - log(soft_max_i)
+
+    int i = 0;
+    ggml_float sum = 0;
+    for (; i < n; ++i) {
+        float val = x[i] - max;
+        y[i] = val;
+        sum += (ggml_float)expf(val);
+    }
+    return sum = (ggml_float)logf(sum);
+}
--- a/ggml/src/ggml-cpu/vec.h
+++ b/ggml/src/ggml-cpu/vec.h
@@ -0,0 +1,802 @@
+// Vectorized functions for fundamental operations
+
+#pragma once
+
+#include "ggml-impl.h"
+#include "simd-mappings.h"
+#include "ggml.h"
+
+#if defined(GGML_USE_ACCELERATE)
+#include <Accelerate/Accelerate.h>
+#endif
+
+// floating point type used to accumulate sums
+typedef double ggml_float;
+
+#define GGML_GELU_FP16
+#define GGML_GELU_QUICK_FP16
+
+#define GGML_SOFT_MAX_UNROLL 4
+#define GGML_VEC_DOT_UNROLL  2
+#define GGML_VEC_MAD_UNROLL  32
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//
+// global data
+//
+
+// precomputed gelu table for f16 (128 KB)
+extern ggml_fp16_t ggml_table_gelu_f16[1 << 16];
+
+// precomputed quick gelu table for f16 (128 KB)
+extern ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16];
+
+//
+// fundamental operations
+//
+
+void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc);
+void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc);
+void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc);
+
+void ggml_vec_silu_f32(const int n, float * y, const float * x);
+ggml_float ggml_vec_soft_max_f32(const int n, float * y, const float * x, float max);
+ggml_float ggml_vec_log_soft_max_f32(const int n, float * y, const float * x, float max);
+
+inline static void ggml_vec_set_i8(const int n, int8_t * x, const int8_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
+inline static void ggml_vec_set_i16(const int n, int16_t * x, const int16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
+
+inline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t   v) { for (int i = 0; i < n; ++i) x[i] = v;    }
+inline static void ggml_vec_cpy_i32(const int n, int32_t * y, const int32_t * x) { for (int i = 0; i < n; ++i) y[i] = x[i]; }
+
+inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const ggml_fp16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
+inline static void ggml_vec_set_bf16(const int n, ggml_bf16_t * x, const ggml_bf16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
+inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i]  = x[i] + y[i]; }
+inline static void ggml_vec_add_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
+    for (int i = 0; i < n; ++i) {
+        z[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(x[i]) + GGML_FP16_TO_FP32(y[i]));
+    }
+}
+inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, const float   v) { for (int i = 0; i < n; ++i) z[i]  = x[i] + v;    }
+inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x)                  { for (int i = 0; i < n; ++i) y[i] += x[i];        }
+inline static void ggml_vec_acc1_f32(const int n, float * y, const float   v)                  { for (int i = 0; i < n; ++i) y[i] += v;           }
+inline static void ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i]  = x[i] - y[i]; }
+inline static void ggml_vec_sub_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
+    for (int i = 0; i < n; ++i) {
+        z[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(x[i]) - GGML_FP16_TO_FP32(y[i]));
+    }
+}
+inline static void ggml_vec_set_f32 (const int n, float * x, const float   v)                  { for (int i = 0; i < n; ++i) x[i]  = v;           }
+inline static void ggml_vec_cpy_f32 (const int n, float * y, const float * x)                  { for (int i = 0; i < n; ++i) y[i]  = x[i];        }
+inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x)                  { for (int i = 0; i < n; ++i) y[i]  = -x[i];       }
+inline static void ggml_vec_neg_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+    for (int i = 0; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16(-GGML_FP16_TO_FP32(x[i]));
+    }
+}
+
+inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i]  = x[i]*y[i];   }
+inline static void ggml_vec_mul_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
+    for (int i = 0; i < n; ++i) {
+        z[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(x[i]) * GGML_FP16_TO_FP32(y[i]));
+    }
+}
+inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i]  = x[i]/y[i];   }
+inline static void ggml_vec_div_f16 (const int n, ggml_fp16_t * z, const ggml_fp16_t * x, const ggml_fp16_t * y) {
+    for (int i = 0; i < n; ++i) {
+        z[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(x[i]) / GGML_FP16_TO_FP32(y[i]));
+    }
+}
+
+// compute GGML_VEC_DOT_UNROLL dot products at once
+// xs - x row stride in bytes
+inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GGML_RESTRICT s, void * GGML_RESTRICT xv, ggml_fp16_t * GGML_RESTRICT y) {
+    ggml_float sumf[GGML_VEC_DOT_UNROLL] = { 0.0 };
+
+    ggml_fp16_t * GGML_RESTRICT x[GGML_VEC_DOT_UNROLL];
+
+    for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) {
+        x[i] = (ggml_fp16_t *) ((char *) xv + i*xs);
+    }
+
+#if defined(GGML_SIMD)
+    const int np = (n & ~(GGML_F16_STEP - 1));
+
+    GGML_F16_VEC sum[GGML_VEC_DOT_UNROLL][GGML_F16_ARR] = { { GGML_F16_VEC_ZERO } };
+
+    GGML_F16_VEC ax[GGML_F16_ARR];
+    GGML_F16_VEC ay[GGML_F16_ARR];
+
+    for (int i = 0; i < np; i += GGML_F16_STEP) {
+        for (int j = 0; j < GGML_F16_ARR; j++) {
+            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
+
+            for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) {
+                ax[j] = GGML_F16_VEC_LOAD(x[k] + i + j*GGML_F16_EPR, j);
+
+                sum[k][j] = GGML_F16_VEC_FMA(sum[k][j], ax[j], ay[j]);
+            }
+        }
+    }
+
+    // reduce sum0..sum3 to sum0
+    for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) {
+        GGML_F16_VEC_REDUCE(sumf[k], sum[k]);
+    }
+
+    // leftovers
+    for (int i = np; i < n; ++i) {
+        for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
+            sumf[j] += (ggml_float)(GGML_FP16_TO_FP32(x[j][i])*GGML_FP16_TO_FP32(y[i]));
+        }
+    }
+#else
+    for (int i = 0; i < n; ++i) {
+        for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
+            sumf[j] += (ggml_float)(GGML_FP16_TO_FP32(x[j][i])*GGML_FP16_TO_FP32(y[i]));
+        }
+    }
+#endif
+
+    for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) {
+        s[i] = (float)sumf[i];
+    }
+}
+
+inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const float * GGML_RESTRICT x, const float v) {
+#if defined(GGML_SIMD)
+    const int np = (n & ~(GGML_F32_STEP - 1));
+
+    GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
+
+    GGML_F32_VEC ax[GGML_F32_ARR];
+    GGML_F32_VEC ay[GGML_F32_ARR];
+
+    for (int i = 0; i < np; i += GGML_F32_STEP) {
+        for (int j = 0; j < GGML_F32_ARR; j++) {
+            ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);
+            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
+            ay[j] = GGML_F32_VEC_FMA(ay[j], ax[j], vx);
+
+            GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
+        }
+    }
+
+    // leftovers
+    for (int i = np; i < n; ++i) {
+        y[i] += x[i]*v;
+    }
+#else
+    // scalar
+    for (int i = 0; i < n; ++i) {
+        y[i] += x[i]*v;
+    }
+#endif
+}
+
+inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y, const ggml_fp16_t * GGML_RESTRICT x, const float v) {
+#if defined(GGML_SIMD)
+    const int np = (n & ~(GGML_F16_STEP - 1));
+
+    GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
+
+    GGML_F16_VEC ax[GGML_F16_ARR];
+    GGML_F16_VEC ay[GGML_F16_ARR];
+
+    for (int i = 0; i < np; i += GGML_F16_STEP) {
+        for (int j = 0; j < GGML_F16_ARR; j++) {
+            ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
+            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
+            ay[j] = GGML_F16_VEC_FMA(ay[j], ax[j], vx);
+
+            GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
+        }
+    }
+
+    // leftovers
+    for (int i = np; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
+    }
+#else
+    // scalar
+    for (int i = 0; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
+    }
+#endif
+}
+
+// xs and vs are byte strides of x and v
+inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int vs, float * GGML_RESTRICT y, const float * GGML_RESTRICT xv, const float * GGML_RESTRICT vv) {
+
+    const float * GGML_RESTRICT x[GGML_VEC_MAD_UNROLL];
+    const float * GGML_RESTRICT v[GGML_VEC_MAD_UNROLL];
+
+    for (int i = 0; i < GGML_VEC_MAD_UNROLL; ++i) {
+        x[i] = (const float *) ((const char *) xv + i*xs);
+        v[i] = (const float *) ((const char *) vv + i*vs);
+    }
+
+#if defined(GGML_SIMD)
+    const int np = (n & ~(GGML_F32_STEP - 1));
+
+    GGML_F32_VEC vx[GGML_VEC_MAD_UNROLL];
+
+    for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
+        vx[k] = GGML_F32_VEC_SET1(v[k][0]);
+    }
+
+    GGML_F32_VEC ax[GGML_VEC_MAD_UNROLL][GGML_F32_ARR];
+    GGML_F32_VEC ay[GGML_F32_ARR];
+
+    for (int i = 0; i < np; i += GGML_F32_STEP) {
+        for (int j = 0; j < GGML_F32_ARR; j++) {
+            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
+
+            for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
+                ax[k][j] = GGML_F32_VEC_LOAD(x[k] + i + j*GGML_F32_EPR);
+                ay[j] = GGML_F32_VEC_FMA(ay[j], ax[k][j], vx[k]);
+            }
+
+            GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
+        }
+    }
+
+    // leftovers
+    for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
+        for (int i = np; i < n; ++i) {
+            y[i] += x[k][i]*v[k][0];
+        }
+    }
+#else
+    // scalar
+    for (int k = 0; k < GGML_VEC_MAD_UNROLL; ++k) {
+        for (int i = 0; i < n; ++i) {
+            y[i] += x[k][i]*v[k][0];
+        }
+    }
+#endif
+}
+
+//inline static void ggml_vec_scale_f32(const int n, float * y, const float   v) { for (int i = 0; i < n; ++i) y[i] *= v;          }
+inline static void ggml_vec_scale_f32(const int n, float * y, const float   v) {
+#if defined(GGML_USE_ACCELERATE)
+    vDSP_vsmul(y, 1, &v, y, 1, n);
+#elif defined(GGML_SIMD)
+    const int np = (n & ~(GGML_F32_STEP - 1));
+
+    GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);
+
+    GGML_F32_VEC ay[GGML_F32_ARR];
+
+    for (int i = 0; i < np; i += GGML_F32_STEP) {
+        for (int j = 0; j < GGML_F32_ARR; j++) {
+            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);
+            ay[j] = GGML_F32_VEC_MUL(ay[j], vx);
+
+            GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);
+        }
+    }
+
+    // leftovers
+    for (int i = np; i < n; ++i) {
+        y[i] *= v;
+    }
+#else
+    // scalar
+    for (int i = 0; i < n; ++i) {
+        y[i] *= v;
+    }
+#endif
+}
+
+inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float v) {
+#if defined(GGML_SIMD)
+    const int np = (n & ~(GGML_F16_STEP - 1));
+
+    GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
+
+    GGML_F16_VEC ay[GGML_F16_ARR];
+
+    for (int i = 0; i < np; i += GGML_F16_STEP) {
+        for (int j = 0; j < GGML_F16_ARR; j++) {
+            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
+            ay[j] = GGML_F16_VEC_MUL(ay[j], vx);
+
+            GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
+        }
+    }
+
+    // leftovers
+    for (int i = np; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i])*v);
+    }
+#else
+    // scalar
+    for (int i = 0; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i])*v);
+    }
+#endif
+}
+
+inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, 0, x, 0, x, 0, 1); *s = sqrtf(*s);   }
+inline static void ggml_vec_sqr_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i];   }
+inline static void ggml_vec_sqr_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+    for (int i = 0; i < n; ++i) {
+        float v = GGML_FP16_TO_FP32(x[i]);
+        y[i] = GGML_FP32_TO_FP16(v*v);
+    }
+}
+inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); }
+inline static void ggml_vec_sqrt_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+    for (int i = 0; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16(sqrtf(GGML_FP16_TO_FP32(x[i])));
+    }
+}
+inline static void ggml_vec_log_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]);  }
+inline static void ggml_vec_log_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+    for (int i = 0; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16(logf(GGML_FP16_TO_FP32(x[i])));
+    }
+}
+inline static void ggml_vec_sin_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sinf(x[i]);  }
+inline static void ggml_vec_sin_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+    for (int i = 0; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16(sinf(GGML_FP16_TO_FP32(x[i])));
+    }
+}
+inline static void ggml_vec_cos_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = cosf(x[i]);  }
+inline static void ggml_vec_cos_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+    for (int i = 0; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16(cosf(GGML_FP16_TO_FP32(x[i])));
+    }
+}
+inline static void ggml_vec_abs_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fabsf(x[i]); }
+inline static void ggml_vec_abs_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+    for (int i = 0; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16(fabsf(GGML_FP16_TO_FP32(x[i])));
+    }
+}
+inline static void ggml_vec_sgn_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); }
+inline static void ggml_vec_sgn_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+    for (int i = 0; i < n; ++i) {
+        float v = GGML_FP16_TO_FP32(x[i]);
+        y[i] = GGML_FP32_TO_FP16((v > 0.f) ? 1.f : ((v < 0.f) ? -1.f : 0.f));
+    }
+}
+inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; }
+inline static void ggml_vec_step_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+    for (int i = 0; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16((GGML_FP16_TO_FP32(x[i]) > 0.f) ? 1.f : 0.f);
+    }
+}
+inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]);  }
+inline static void ggml_vec_tanh_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+    for (int i = 0; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16(tanhf(GGML_FP16_TO_FP32(x[i])));
+    }
+}
+inline static void ggml_vec_elu_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expm1f(x[i]); }
+inline static void ggml_vec_elu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+    for (int i = 0; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16(expm1f(GGML_FP16_TO_FP32(x[i])));
+    }
+}
+inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
+inline static void ggml_vec_relu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+    for (int i = 0; i < n; ++i) {
+        float v = GGML_FP16_TO_FP32(x[i]);
+        y[i] = GGML_FP32_TO_FP16((v > 0.f) ? v : 0.f);
+    }
+}
+inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
+inline static void ggml_vec_leaky_relu_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x, const float ns) {
+    for (int i = 0; i < n; ++i) {
+        float v = GGML_FP16_TO_FP32(x[i]);
+        y[i] = GGML_FP32_TO_FP16(((v > 0.f) ? v : 0.f) + ns * ((v < 0.0f) ? v : 0.f));
+    }
+}
+inline static void ggml_vec_sigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = 1.f / (1.f + expf(-x[i])); }
+inline static void ggml_vec_sigmoid_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+    for (int i = 0; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16(1.f / (1.f + expf(-GGML_FP16_TO_FP32(x[i]))));
+    }
+}
+// TODO: optimize performance
+inline static void ggml_vec_hardswish_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
+inline static void ggml_vec_hardswish_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+    for (int i = 0; i < n; ++i) {
+        float v = GGML_FP16_TO_FP32(x[i]);
+        y[i] = GGML_FP32_TO_FP16(v * fminf(1.0f, fmaxf(0.0f, (v + 3.0f) / 6.0f)));
+    }
+}
+inline static void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
+inline static void ggml_vec_hardsigmoid_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+    for (int i = 0; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16(fminf(1.0f, fmaxf(0.0f, (GGML_FP16_TO_FP32(x[i]) + 3.0f) / 6.0f)));
+    }
+}
+inline static void ggml_vec_exp_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = expf(x[i]); }
+inline static void ggml_vec_exp_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+    for (int i = 0; i < n; ++i) {
+        y[i] = GGML_FP32_TO_FP16(expf(GGML_FP16_TO_FP32(x[i])));
+    }
+}
+
+static const float GELU_COEF_A     = 0.044715f;
+static const float GELU_QUICK_COEF = -1.702f;
+static const float SQRT_2_OVER_PI  = 0.79788456080286535587989211986876f;
+
+inline static float ggml_gelu_f32(float x) {
+    return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
+}
+
+inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+    const uint16_t * i16 = (const uint16_t *) x;
+    for (int i = 0; i < n; ++i) {
+        y[i] = ggml_table_gelu_f16[i16[i]];
+    }
+}
+
+#ifdef GGML_GELU_FP16
+inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
+    uint16_t t;
+    for (int i = 0; i < n; ++i) {
+        if (x[i] <= -10.0f) {
+            y[i] = 0.0f;
+        } else if (x[i] >= 10.0f) {
+            y[i] = x[i];
+        } else {
+            ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
+            memcpy(&t, &fp16, sizeof(uint16_t));
+            y[i] = GGML_FP16_TO_FP32(ggml_table_gelu_f16[t]);
+        }
+    }
+}
+#else
+inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {
+    for (int i = 0; i < n; ++i) {
+        y[i] = ggml_gelu_f32(x[i]);
+    }
+}
+#endif
+
+inline static float ggml_gelu_quick_f32(float x) {
+    return x*(1.0f/(1.0f+expf(GELU_QUICK_COEF*x)));
+}
+
+//inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+//    const uint16_t * i16 = (const uint16_t *) x;
+//    for (int i = 0; i < n; ++i) {
+//        y[i] = ggml_table_gelu_quick_f16[i16[i]];
+//    }
+//}
+
+#ifdef GGML_GELU_QUICK_FP16
+inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
+    uint16_t t;
+    for (int i = 0; i < n; ++i) {
+        ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);
+        memcpy(&t, &fp16, sizeof(uint16_t));
+        y[i] = GGML_FP16_TO_FP32(ggml_table_gelu_quick_f16[t]);
+    }
+}
+#else
+inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) {
+    for (int i = 0; i < n; ++i) {
+        y[i] = ggml_gelu_quick_f32(x[i]);
+    }
+}
+#endif
+
+inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+    for (int i = 0; i < n; ++i) {
+        float v = GGML_FP16_TO_FP32(x[i]);
+        y[i] = GGML_FP32_TO_FP16(v*(1.0f/(1.0f+expf(GELU_QUICK_COEF*v))));
+    }
+}
+
+// Sigmoid Linear Unit (SiLU) function
+inline static float ggml_silu_f32(float x) {
+    return x/(1.0f + expf(-x));
+}
+inline static ggml_fp16_t ggml_silu_f16(ggml_fp16_t x) {
+    float v = GGML_FP16_TO_FP32(x);
+    return GGML_FP32_TO_FP16(v/(1.0f + expf(-v)));
+}
+
+#if __FINITE_MATH_ONLY__
+#error "some routines in ggml.c require non-finite math arithmetics -- pass -fno-finite-math-only to the compiler to fix"
+#error "ref: https://github.com/ggml-org/llama.cpp/pull/7154#issuecomment-2143844461"
+#endif
+
+#if defined(__ARM_NEON) && defined(__aarch64__)
+
+// adapted from arm limited optimized routine
+// the maximum error is 1.45358 plus 0.5 ulps
+// numbers above 88.38 will flush to infinity
+// numbers beneath -103.97 will flush to zero
+inline static float32x4_t ggml_v_expf(float32x4_t x) {
+    const float32x4_t r = vdupq_n_f32(0x1.8p23f);
+    const float32x4_t z = vfmaq_f32(r, x, vdupq_n_f32(0x1.715476p+0f));
+    const float32x4_t n = vsubq_f32(z, r);
+    const float32x4_t b = vfmsq_f32(vfmsq_f32(x, n, vdupq_n_f32(0x1.62e4p-1f)), n,
+                                    vdupq_n_f32(0x1.7f7d1cp-20f));
+    const uint32x4_t e = vshlq_n_u32(vreinterpretq_u32_f32(z), 23);
+    const float32x4_t k = vreinterpretq_f32_u32(vaddq_u32(e, vreinterpretq_u32_f32(vdupq_n_f32(1))));
+    const uint32x4_t c = vcagtq_f32(n, vdupq_n_f32(126));
+    const float32x4_t u = vmulq_f32(b, b);
+    const float32x4_t j = vfmaq_f32(
+        vmulq_f32(vdupq_n_f32(0x1.ffffecp-1f), b),
+        vfmaq_f32(vfmaq_f32(vdupq_n_f32(0x1.fffdb6p-2f), vdupq_n_f32(0x1.555e66p-3f), b),
+                  vfmaq_f32(vdupq_n_f32(0x1.573e2ep-5f), vdupq_n_f32(0x1.0e4020p-7f), b), u), u);
+    if (!vpaddd_u64(vreinterpretq_u64_u32(c)))
+        return vfmaq_f32(k, j, k);
+    const uint32x4_t d = vandq_u32(vclezq_f32(n), vdupq_n_u32(0x82000000));
+    const float32x4_t s1 = vreinterpretq_f32_u32(vaddq_u32(d, vdupq_n_u32(0x7f000000)));
+    const float32x4_t s2 = vreinterpretq_f32_u32(vsubq_u32(e, d));
+    return vbslq_f32(vcagtq_f32(n, vdupq_n_f32(192)), vmulq_f32(s1, s1),
+                     vbslq_f32(c, vmulq_f32(vfmaq_f32(s2, s2, j), s1), vfmaq_f32(k, k, j)));
+}
+
+// computes silu x/(1+exp(-x)) in single precision vector
+inline static float32x4_t ggml_v_silu(float32x4_t x) {
+    const float32x4_t one = vdupq_n_f32(1.0f);
+    const float32x4_t zero = vdupq_n_f32(0.0f);
+    const float32x4_t neg_x = vsubq_f32(zero, x);
+    const float32x4_t exp_neg_x = ggml_v_expf(neg_x);
+    const float32x4_t one_plus_exp_neg_x = vaddq_f32(one, exp_neg_x);
+    return vdivq_f32(x, one_plus_exp_neg_x);
+}
+
+#elif defined(__AVX512F__) && defined(__AVX512DQ__)
+
+// adapted from arm limited optimized routine
+// the maximum error is 1.45358 plus 0.5 ulps
+// numbers above 88.38 will flush to infinity
+// numbers beneath -103.97 will flush to zero
+inline static __m512 ggml_v_expf(__m512 x) {
+  const __m512 r = _mm512_set1_ps(0x1.8p23f);
+  const __m512 z = _mm512_fmadd_ps(x, _mm512_set1_ps(0x1.715476p+0f), r);
+  const __m512 n = _mm512_sub_ps(z, r);
+  const __m512 b =
+      _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.7f7d1cp-20f),
+                       _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.62e4p-1f), x));
+  const __mmask16 d =
+      _mm512_cmp_ps_mask(_mm512_abs_ps(n), _mm512_set1_ps(192), _CMP_GT_OQ);
+  const __m512 u = _mm512_mul_ps(b, b);
+  const __m512 j = _mm512_fmadd_ps(
+      _mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_set1_ps(0x1.0e4020p-7f), b,
+                                      _mm512_set1_ps(0x1.573e2ep-5f)),
+                      u,
+                      _mm512_fmadd_ps(_mm512_set1_ps(0x1.555e66p-3f), b,
+                                      _mm512_set1_ps(0x1.fffdb6p-2f))),
+      u,
+      _mm512_fmadd_ps(_mm512_set1_ps(0x1.ffffecp-1f), b, _mm512_set1_ps(1.0F)));
+  const __m512 res = _mm512_scalef_ps(j, n);
+  if (_mm512_kortestz(d, d))
+    return res;
+  const __m512 zero = _mm512_setzero_ps();
+  const __m512 alt = _mm512_mask_blend_ps(
+      _mm512_cmp_ps_mask(n, zero, _CMP_LE_OQ), _mm512_set1_ps(INFINITY), zero);
+  return _mm512_mask_blend_ps(d, res, alt);
+}
+
+// computes silu x/(1+exp(-x)) in single precision vector
+inline static __m512 ggml_v_silu(__m512 x) {
+    const __m512 one = _mm512_set1_ps(1);
+    const __m512 zero = _mm512_setzero_ps();
+    const __m512 neg_x = _mm512_sub_ps(zero, x);
+    const __m512 exp_neg_x = ggml_v_expf(neg_x);
+    const __m512 one_plus_exp_neg_x = _mm512_add_ps(one, exp_neg_x);
+    return _mm512_div_ps(x, one_plus_exp_neg_x);
+}
+
+#elif defined(__AVX2__) && defined(__FMA__)
+
+// adapted from arm limited optimized routine
+// the maximum error is 1.45358 plus 0.5 ulps
+// numbers above 88.38 will flush to infinity
+// numbers beneath -103.97 will flush to zero
+inline static __m256 ggml_v_expf(__m256 x) {
+  const __m256 r = _mm256_set1_ps(0x1.8p23f);
+  const __m256 z = _mm256_fmadd_ps(x, _mm256_set1_ps(0x1.715476p+0f), r);
+  const __m256 n = _mm256_sub_ps(z, r);
+  const __m256 b = _mm256_fnmadd_ps(n, _mm256_set1_ps(0x1.7f7d1cp-20f),
+                                    _mm256_fnmadd_ps(n, _mm256_set1_ps(0x1.62e4p-1f), x));
+  const __m256i e = _mm256_slli_epi32(_mm256_castps_si256(z), 23);
+  const __m256 k = _mm256_castsi256_ps(
+      _mm256_add_epi32(e, _mm256_castps_si256(_mm256_set1_ps(1))));
+  const __m256i c = _mm256_castps_si256(
+      _mm256_cmp_ps(_mm256_andnot_ps(_mm256_set1_ps(-0.f), n),
+                    _mm256_set1_ps(126), _CMP_GT_OQ));
+  const __m256 u = _mm256_mul_ps(b, b);
+  const __m256 j = _mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_set1_ps(0x1.0e4020p-7f), b,
+                                                                   _mm256_set1_ps(0x1.573e2ep-5f)), u,
+                                                   _mm256_fmadd_ps(_mm256_set1_ps(0x1.555e66p-3f), b,
+                                                                   _mm256_set1_ps(0x1.fffdb6p-2f))),
+                                   u, _mm256_mul_ps(_mm256_set1_ps(0x1.ffffecp-1f), b));
+  if (!_mm256_movemask_ps(_mm256_castsi256_ps(c)))
+    return _mm256_fmadd_ps(j, k, k);
+  const __m256i g = _mm256_and_si256(
+      _mm256_castps_si256(_mm256_cmp_ps(n, _mm256_setzero_ps(), _CMP_LE_OQ)),
+      _mm256_set1_epi32(0x82000000u));
+  const __m256 s1 =
+      _mm256_castsi256_ps(_mm256_add_epi32(g, _mm256_set1_epi32(0x7f000000u)));
+  const __m256 s2 = _mm256_castsi256_ps(_mm256_sub_epi32(e, g));
+  const __m256i d = _mm256_castps_si256(
+      _mm256_cmp_ps(_mm256_andnot_ps(_mm256_set1_ps(-0.f), n),
+                    _mm256_set1_ps(192), _CMP_GT_OQ));
+  return _mm256_or_ps(
+      _mm256_and_ps(_mm256_castsi256_ps(d), _mm256_mul_ps(s1, s1)),
+      _mm256_andnot_ps(
+          _mm256_castsi256_ps(d),
+          _mm256_or_ps(
+              _mm256_and_ps(_mm256_castsi256_ps(c),
+                            _mm256_mul_ps(_mm256_fmadd_ps(s2, j, s2), s1)),
+              _mm256_andnot_ps(_mm256_castsi256_ps(c), _mm256_fmadd_ps(k, j, k)))));
+}
+
+// computes silu x/(1+exp(-x)) in single precision vector
+inline static __m256 ggml_v_silu(__m256 x) {
+    const __m256 one = _mm256_set1_ps(1);
+    const __m256 zero = _mm256_setzero_ps();
+    const __m256 neg_x = _mm256_sub_ps(zero, x);
+    const __m256 exp_neg_x = ggml_v_expf(neg_x);
+    const __m256 one_plus_exp_neg_x = _mm256_add_ps(one, exp_neg_x);
+    return _mm256_div_ps(x, one_plus_exp_neg_x);
+}
+
+#elif defined(__SSE2__) // __AVX2__ / __ARM_NEON
+
+#if defined(__FMA__)
+#define MADD128(x, y, z) _mm_fmadd_ps(x, y, z)
+#define NMADD128(x, y, z) _mm_fnmadd_ps(x, y, z)
+#else
+#define MADD128(x, y, z) _mm_add_ps(_mm_mul_ps(x, y), z)
+#define NMADD128(x, y, z) _mm_sub_ps(z, _mm_mul_ps(x, y))
+#endif
+
+// adapted from arm limited optimized routine
+// the maximum error is 1.45358 plus 0.5 ulps
+// numbers above 88.38 will flush to infinity
+// numbers beneath -103.97 will flush to zero
+inline static __m128 ggml_v_expf(__m128 x) {
+    const __m128 r = _mm_set1_ps(0x1.8p23f);
+    const __m128 z = MADD128(x, _mm_set1_ps(0x1.715476p+0f), r);
+    const __m128 n = _mm_sub_ps(z, r);
+    const __m128 b =
+        NMADD128(n, _mm_set1_ps(0x1.7f7d1cp-20f), NMADD128(n, _mm_set1_ps(0x1.62e4p-1f), x));
+    const __m128i e = _mm_slli_epi32(_mm_castps_si128(z), 23);
+    const __m128 k = _mm_castsi128_ps(_mm_add_epi32(e, _mm_castps_si128(_mm_set1_ps(1))));
+    const __m128i c =
+        _mm_castps_si128(_mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.f), n), _mm_set1_ps(126)));
+    const __m128 u = _mm_mul_ps(b, b);
+    const __m128 j =
+        MADD128(MADD128(MADD128(_mm_set1_ps(0x1.0e4020p-7f), b, _mm_set1_ps(0x1.573e2ep-5f)), u,
+                        MADD128(_mm_set1_ps(0x1.555e66p-3f), b, _mm_set1_ps(0x1.fffdb6p-2f))),
+                u, _mm_mul_ps(_mm_set1_ps(0x1.ffffecp-1f), b));
+    if (!_mm_movemask_epi8(c))
+        return MADD128(j, k, k);
+    const __m128i g = _mm_and_si128(_mm_castps_si128(_mm_cmple_ps(n, _mm_setzero_ps())),
+                                    _mm_set1_epi32(0x82000000u));
+    const __m128 s1 = _mm_castsi128_ps(_mm_add_epi32(g, _mm_set1_epi32(0x7f000000u)));
+    const __m128 s2 = _mm_castsi128_ps(_mm_sub_epi32(e, g));
+    const __m128i d =
+        _mm_castps_si128(_mm_cmpgt_ps(_mm_andnot_ps(_mm_set1_ps(-0.f), n), _mm_set1_ps(192)));
+    return _mm_or_ps(
+        _mm_and_ps(_mm_castsi128_ps(d), _mm_mul_ps(s1, s1)),
+        _mm_andnot_ps(_mm_castsi128_ps(d),
+                      _mm_or_ps(_mm_and_ps(_mm_castsi128_ps(c), _mm_mul_ps(MADD128(s2, j, s2), s1)),
+                                _mm_andnot_ps(_mm_castsi128_ps(c), MADD128(k, j, k)))));
+}
+
+// computes silu x/(1+exp(-x)) in single precision vector
+inline static __m128 ggml_v_silu(__m128 x) {
+    const __m128 one = _mm_set1_ps(1);
+    const __m128 zero = _mm_setzero_ps();
+    const __m128 neg_x = _mm_sub_ps(zero, x);
+    const __m128 exp_neg_x = ggml_v_expf(neg_x);
+    const __m128 one_plus_exp_neg_x = _mm_add_ps(one, exp_neg_x);
+    return _mm_div_ps(x, one_plus_exp_neg_x);
+}
+
+#endif // __ARM_NEON / __AVX2__ / __SSE2__
+
+inline static void ggml_vec_silu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
+    for (int i = 0; i < n; ++i) {
+        y[i] = ggml_silu_f16(x[i]);
+    }
+}
+
+inline static float ggml_silu_backward_f32(float x, float dy) {
+    const float s = 1.0f/(1.0f + expf(-x));
+    return dy*s*(1.0f + x*(1.0f - s));
+}
+
+inline static ggml_fp16_t ggml_silu_backward_f16(ggml_fp16_t x, ggml_fp16_t dy) {
+    const float v = GGML_FP16_TO_FP32(x);
+    const float s = 1.0f/(1.0f + expf(-v));
+    return GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(dy)*s*(1.0f + v*(1.0f - s)));
+}
+
+inline static void ggml_vec_silu_backward_f32(const int n, float * dx, const float * x, const float * dy) {
+    for (int i = 0; i < n; ++i) {
+        dx[i] = ggml_silu_backward_f32(x[i], dy[i]);
+    }
+}
+
+inline static void ggml_vec_silu_backward_f16(const int n, ggml_fp16_t * dx, const ggml_fp16_t * x, const ggml_fp16_t * dy) {
+    for (int i = 0; i < n; ++i) {
+        dx[i] = ggml_silu_backward_f16(x[i], dy[i]);
+    }
+}
+
+inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) {
+#ifndef GGML_USE_ACCELERATE
+    ggml_float sum = 0.0;
+    for (int i = 0; i < n; ++i) {
+        sum += (ggml_float)x[i];
+    }
+    *s = (float)sum;
+#else
+    vDSP_sve(x, 1, s, n);
+#endif
+}
+
+inline static void ggml_vec_sum_f32_ggf(const int n, ggml_float * s, const float * x) {
+    ggml_float sum = 0.0;
+    for (int i = 0; i < n; ++i) {
+        sum += (ggml_float)x[i];
+    }
+    *s = sum;
+}
+
+inline static void ggml_vec_sum_f16_ggf(const int n, float * s, const ggml_fp16_t * x) {
+    float sum = 0.0f;
+    for (int i = 0; i < n; ++i) {
+        sum += GGML_FP16_TO_FP32(x[i]);
+    }
+    *s = sum;
+}
+
+inline static void ggml_vec_sum_bf16_ggf(const int n, float * s, const ggml_bf16_t * x) {
+    float sum = 0.0f;
+    for (int i = 0; i < n; ++i) {
+        sum += GGML_BF16_TO_FP32(x[i]);
+    }
+    *s = sum;
+}
+
+inline static void ggml_vec_max_f32(const int n, float * s, const float * x) {
+#ifndef GGML_USE_ACCELERATE
+    float max = -INFINITY;
+    for (int i = 0; i < n; ++i) {
+        max = MAX(max, x[i]);
+    }
+    *s = max;
+#else
+    vDSP_maxv(x, 1, s, n);
+#endif
+}
+
+inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x) {
+    ggml_vec_norm_f32(n, s, x);
+    *s = 1.f/(*s);
+}
+
+inline static void ggml_vec_argmax_f32(const int n, int * s, const float * x) {
+    float max = -INFINITY;
+    int idx = 0;
+    for (int i = 0; i < n; ++i) {
+        max = MAX(max, x[i]);
+        if (max == x[i]) { idx = i; }
+    }
+    *s = idx;
+}
+
+#ifdef __cplusplus
+}
+#endif
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -41,15 +41,18 @@
 #define CUDART_HMAX   11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)
 #define CUDART_HMASK  12000 // CUDA 12.0, min. ver. for half2 -> uint mask comparisons

-#define GGML_CUDA_CC_PASCAL       600
-#define GGML_CUDA_CC_DP4A         610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
-#define GGML_CUDA_CC_VOLTA        700
-#define GGML_CUDA_CC_TURING       750
-#define GGML_CUDA_CC_AMPERE       800
-#define GGML_CUDA_CC_ADA_LOVELACE 890
-#define GGML_CUDA_CC_OFFSET_AMD   0x1000000
+#define GGML_CUDA_CC_PASCAL          600
+#define GGML_CUDA_CC_DP4A            610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
+#define GGML_CUDA_CC_VOLTA           700
+#define GGML_CUDA_CC_TURING          750
+#define GGML_CUDA_CC_AMPERE          800
+#define GGML_CUDA_CC_ADA_LOVELACE    890
+#define GGML_CUDA_CC_OFFSET_AMD      0x1000000
+#define GGML_CUDA_CC_OFFSET_MTHREADS 0x0100000
+#define GGML_CUDA_CC_IS_NVIDIA(cc)   (cc < GGML_CUDA_CC_OFFSET_MTHREADS)

-// GCN/CNDA, wave size is 64
+// AMD
+// GCN/CDNA, wave size is 64
 #define GGML_CUDA_CC_GCN4       (GGML_CUDA_CC_OFFSET_AMD + 0x803)  // Tonga, Fiji, Polaris, minimum for fast fp16
 #define GGML_CUDA_CC_VEGA       (GGML_CUDA_CC_OFFSET_AMD + 0x900)  // Vega56/64, minimum for fp16 dual issue
 #define GGML_CUDA_CC_VEGA20     (GGML_CUDA_CC_OFFSET_AMD + 0x906)  // MI50/Radeon VII, minimum for dp4a
@@ -57,21 +60,32 @@
 #define GGML_CUDA_CC_CDNA2      (GGML_CUDA_CC_OFFSET_AMD + 0x910)  // MI210, minimum acc register renameing
 #define GGML_CUDA_CC_CDNA3      (GGML_CUDA_CC_OFFSET_AMD + 0x942)  // MI300

-// RNDA removes MFMA, dp4a, xnack, acc registers, wave size is 32
+// RDNA removes MFMA, dp4a, xnack, acc registers, wave size is 32
 #define GGML_CUDA_CC_RDNA1      (GGML_CUDA_CC_OFFSET_AMD + 0x1010) // RX 5000
 #define GGML_CUDA_CC_RDNA2      (GGML_CUDA_CC_OFFSET_AMD + 0x1030) // RX 6000, minimum for dp4a
 #define GGML_CUDA_CC_RDNA3      (GGML_CUDA_CC_OFFSET_AMD + 0x1100) // RX 7000, minimum for WMMA
+#define GGML_CUDA_CC_RDNA4      (GGML_CUDA_CC_OFFSET_AMD + 0x1200) // RX 9000

 #define GGML_CUDA_CC_IS_AMD(cc)   (cc >= GGML_CUDA_CC_OFFSET_AMD)
 #define GGML_CUDA_CC_IS_RDNA(cc)  (cc >= GGML_CUDA_CC_RDNA1)
 #define GGML_CUDA_CC_IS_RDNA1(cc) (cc >= GGML_CUDA_CC_RDNA1 && cc < GGML_CUDA_CC_RDNA2)
 #define GGML_CUDA_CC_IS_RDNA2(cc) (cc >= GGML_CUDA_CC_RDNA2 && cc < GGML_CUDA_CC_RDNA3)
-#define GGML_CUDA_CC_IS_RDNA3(cc) (cc >= GGML_CUDA_CC_RDNA3)
+#define GGML_CUDA_CC_IS_RDNA3(cc) (cc >= GGML_CUDA_CC_RDNA3 && cc < GGML_CUDA_CC_RDNA4)
+#define GGML_CUDA_CC_IS_RDNA4(cc) (cc >= GGML_CUDA_CC_RDNA4)
 #define GGML_CUDA_CC_IS_GCN(cc)   (cc > GGML_CUDA_CC_OFFSET_AMD && cc < GGML_CUDA_CC_CDNA)
 #define GGML_CUDA_CC_IS_CDNA(cc)  (cc >= GGML_CUDA_CC_CDNA && cc < GGML_CUDA_CC_RDNA1)

-#define GGML_CUDA_CC_QY1        210
-#define GGML_CUDA_CC_QY2        220
+// Moore Threads
+#define GGML_CUDA_MUSA_ARCH_IS_QY1 (__MUSA_ARCH__ <= 210)
+
+#define GGML_CUDA_CC_QY1  (GGML_MUSA_CC_OFFSET_MTHREADS + 0x210) // MTT S80, MTT S3000
+#define GGML_CUDA_CC_QY2  (GGML_MUSA_CC_OFFSET_MTHREADS + 0x220) // MTT S4000
+#define GGML_CUDA_CC_NG   (GGML_MUSA_CC_OFFSET_MTHREADS + 0x310) // TBD
+
+#define GGML_CUDA_CC_IS_MTHREADS(cc) (cc >= GGML_CUDA_CC_OFFSET_MTHREADS && cc < GGML_CUDA_CC_OFFSET_AMD)
+#define GGML_CUDA_CC_IS_QY1(cc)      (cc >= GGML_CUDA_CC_QY1 && cc < GGML_CUDA_CC_QY2)
+#define GGML_CUDA_CC_IS_QY2(cc)      (cc >= GGML_CUDA_CC_QY2 && cc < GGML_CUDA_CC_NEXT)
+#define GGML_CUDA_CC_IS_NG(cc)       (cc >= GGML_CUDA_CC_NG)

 #ifdef __CUDA_ARCH_LIST__
 constexpr bool ggml_cuda_has_arch_impl(int) {
@@ -197,9 +211,9 @@ typedef float2 dfloat2;
 #define FP16_MMA_AVAILABLE
 #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_VOLTA

-#if defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3))
+#if defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || defined(RDNA4))
 #define FP16_MMA_AVAILABLE
-#endif // defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3))
+#endif // defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || defined(RDNA4))

 #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
 #define NEW_MMA_AVAILABLE
@@ -209,21 +223,21 @@ typedef float2 dfloat2;
 #define CP_ASYNC_AVAILABLE
 #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE

-#if !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= GGML_CUDA_CC_QY1)
+#if !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && GGML_CUDA_MUSA_ARCH_IS_QY1)
 #define FLASH_ATTN_AVAILABLE
-#endif // !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= GGML_CUDA_CC_QY1)
+#endif // !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && GGML_CUDA_MUSA_ARCH_IS_QY1)

 static bool fp16_available(const int cc) {
    return ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_PASCAL;
 }

 static bool fast_fp16_available(const int cc) {
-    return fp16_available(cc) && cc != 610;
+    return (GGML_CUDA_CC_IS_NVIDIA(cc) && fp16_available(cc) && cc != 610) || GGML_CUDA_CC_IS_AMD(cc);
 }

 // To be used for feature selection of external libraries, e.g. cuBLAS.
 static bool fast_fp16_hardware_available(const int cc) {
-    return cc >= GGML_CUDA_CC_PASCAL && cc != 610;
+    return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_PASCAL && cc != 610) || GGML_CUDA_CC_IS_AMD(cc);
 }

 // Any FP16 tensor core instructions are available for ggml code.
@@ -231,20 +245,20 @@ static bool fp16_mma_available(const int cc) {
 #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN)
    return false;
 #else
-    return cc < GGML_CUDA_CC_OFFSET_AMD && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA ||
-        GGML_CUDA_CC_IS_CDNA(cc) || cc >= GGML_CUDA_CC_RDNA3;
+    return (GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA) ||
+        GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc);
 #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN)
 }

 // To be used for feature selection of external libraries, e.g. cuBLAS.
 static bool fp16_mma_hardware_available(const int cc) {
-    return cc < GGML_CUDA_CC_OFFSET_AMD && cc >= GGML_CUDA_CC_VOLTA ||
-        GGML_CUDA_CC_IS_CDNA(cc) || cc >= GGML_CUDA_CC_RDNA3;
+    return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_VOLTA) ||
+        GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc);
 }

 // Volta technically had FP16 tensor cores but they work very differently compared to Turing and later.
 static bool new_mma_available(const int cc) {
-    return cc < GGML_CUDA_CC_OFFSET_AMD && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_TURING;
+    return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_TURING;
 }

 static bool cp_async_available(const int cc) {
@@ -274,6 +288,10 @@ static __device__ void no_device_code(
    __trap();

    GGML_UNUSED(no_device_code); // suppress unused function warning
+
+#if defined(GGML_USE_MUSA)
+    __builtin_unreachable();
+#endif // defined(GGML_USE_MUSA)
 }

 #ifdef __CUDA_ARCH__
@@ -397,7 +415,7 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i
 #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
 #if defined(CDNA) || defined(RDNA2) || defined(__gfx906__)
    c = __builtin_amdgcn_sdot4(a, b, c, false);
-#elif defined(RDNA3)
+#elif defined(RDNA3) || defined(RDNA4)
    c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
 #elif defined(RDNA1) || defined(__gfx900__)
    int tmp1;
@@ -711,7 +729,13 @@ struct ggml_cuda_graph {
    bool disable_due_to_failed_graph_capture = false;
    int number_consecutive_updates = 0;
    std::vector<ggml_graph_node_properties> ggml_graph_properties;
-    std::vector<char **> updated_kernel_arg;
+    bool use_cpy_indirection = false;
+    std::vector<char *> cpy_dest_ptrs;
+    char ** dest_ptrs_d;
+    int dest_ptrs_size = 0;
+    // Index to allow each cpy kernel to be aware of it's position within the graph
+    // relative to other cpy nodes.
+    int graph_cpynode_index = -1;
 #endif
 };

--- a/ggml/src/ggml-cuda/concat.cu
+++ b/ggml/src/ggml-cuda/concat.cu
@@ -38,7 +38,7 @@ static __global__ void concat_f32_dim1(const float * x, const float * y, float *
        blockIdx.y * ne0 +
        blockIdx.z * ne0 * gridDim.y;

-    if (blockIdx.y < ne01) { // src0
+    if (blockIdx.y < (unsigned)ne01) { // src0
        int offset_src =
            nidx +
            blockIdx.y * ne0 +
@@ -64,7 +64,7 @@ static __global__ void concat_f32_dim2(const float * x, const float * y, float *
        blockIdx.y * ne0 +
        blockIdx.z * ne0 * gridDim.y;

-    if (blockIdx.z < ne02) { // src0
+    if (blockIdx.z < (unsigned)ne02) { // src0
        int offset_src =
            nidx +
            blockIdx.y * ne0 +
--- a/ggml/src/ggml-cuda/conv-transpose-1d.cu
+++ b/ggml/src/ggml-cuda/conv-transpose-1d.cu
@@ -34,6 +34,10 @@ static  __global__ void conv_transpose_1d_kernel(
        }
    }
    dst[global_index] = accumulator;
+    GGML_UNUSED(p0); GGML_UNUSED(d0); GGML_UNUSED(src0_ne3);
+    GGML_UNUSED(src1_ne3); GGML_UNUSED(dst_ne3);
+    GGML_UNUSED(src1_ne1); GGML_UNUSED(dst_ne1);
+    GGML_UNUSED(src1_ne2); GGML_UNUSED(dst_ne2);
 }

 static void conv_transpose_1d_f32_f32_cuda(
@@ -75,8 +79,6 @@ void ggml_cuda_op_conv_transpose_1d(ggml_backend_cuda_context & ctx, ggml_tensor
    const int p0 = 0;//opts[3];
    const int d0 = 1;//opts[4];

-    const int64_t kernel_size = ggml_nelements(src0);
-    const int64_t input_size = ggml_nelements(src1);
    const int64_t output_size = ggml_nelements(dst);

    conv_transpose_1d_f32_f32_cuda(s0, p0, d0, output_size,
--- a/ggml/src/ggml-cuda/convert.cu
+++ b/ggml/src/ggml-cuda/convert.cu
@@ -577,7 +577,7 @@ static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __res
        return;
    }

-    const src_t * x = (src_t *) vx;
+    const src_t * x = (const src_t *) vx;

    y[i] = x[i];
 }
--- a/ggml/src/ggml-cuda/cpy.cu
+++ b/ggml/src/ggml-cuda/cpy.cu
@@ -32,16 +32,18 @@ static __device__ void cpy_1_f16_f32(const char * cxi, char * cdsti) {
 }

 template <cpy_kernel_t cpy_1>
-static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne,
+static __global__ void cpy_f32_f16(const char * cx, char * cdst_direct, const int ne,
                                   const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
                                   const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
-                                   const int nb12, const int nb13) {
+                                   const int nb12, const int nb13, char ** cdst_indirect, int graph_cpynode_index) {
    const int64_t i = blockDim.x*blockIdx.x + threadIdx.x;

    if (i >= ne) {
        return;
    }

+    char * cdst = (cdst_indirect != nullptr) ? cdst_indirect[graph_cpynode_index]: cdst_direct;
+
    // determine indices i03/i13, i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor
    // then combine those indices with the corresponding byte offsets to get the total offsets
    const int64_t i03 = i/(ne00 * ne01 * ne02);
@@ -288,16 +290,18 @@ static __device__ void cpy_blck_f32_iq4_nl(const char * cxi, char * cdsti) {
 }

 template <cpy_kernel_t cpy_blck, int qk>
-static __global__ void cpy_f32_q(const char * cx, char * cdst, const int ne,
+static __global__ void cpy_f32_q(const char * cx, char * cdst_direct, const int ne,
                                 const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
                                 const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
-                                 const int nb12, const int nb13) {
+                                 const int nb12, const int nb13, char ** cdst_indirect, int graph_cpynode_index) {
    const int i = (blockDim.x*blockIdx.x + threadIdx.x)*qk;

    if (i >= ne) {
        return;
    }

+    char * cdst = (cdst_indirect != nullptr) ? cdst_indirect[graph_cpynode_index]: cdst_direct;
+
    const int i03 = i/(ne00 * ne01 * ne02);
    const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
    const int i01 = (i - i03*ne00*ne01*ne02  -  i02*ne01*ne00) / ne00;
@@ -314,16 +318,18 @@ static __global__ void cpy_f32_q(const char * cx, char * cdst, const int ne,
 }

 template <cpy_kernel_t cpy_blck, int qk>
-static __global__ void cpy_q_f32(const char * cx, char * cdst, const int ne,
+static __global__ void cpy_q_f32(const char * cx, char * cdst_direct, const int ne,
                                 const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
                                 const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
-                                 const int nb12, const int nb13) {
+                                 const int nb12, const int nb13, char ** cdst_indirect, int graph_cpynode_index) {
    const int i = (blockDim.x*blockIdx.x + threadIdx.x)*qk;

    if (i >= ne) {
        return;
    }

+    char * cdst = (cdst_indirect != nullptr) ? cdst_indirect[graph_cpynode_index]: cdst_direct;
+
    const int i03 = i/(ne00 * ne01 * ne02);
    const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
    const int i01 = (i - i03*ne00*ne01*ne02  -  i02*ne01*ne00) / ne00;
@@ -339,66 +345,84 @@ static __global__ void cpy_q_f32(const char * cx, char * cdst, const int ne,
    cpy_blck(cx + x_offset, cdst + dst_offset);
 }

+// Copy destination pointers to GPU to be available when pointer indirection is in use
+
+void ggml_cuda_cpy_dest_ptrs_copy(ggml_cuda_graph * cuda_graph, char ** host_dest_ptrs, const int host_dest_ptrs_size, cudaStream_t stream) {
+#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)
+    if (cuda_graph->dest_ptrs_size < host_dest_ptrs_size) { // (re-)allocate GPU memory for destination pointers
+        CUDA_CHECK(cudaStreamSynchronize(stream));
+        if (cuda_graph->dest_ptrs_d != nullptr) {
+            CUDA_CHECK(cudaFree(cuda_graph->dest_ptrs_d));
+        }
+        CUDA_CHECK(cudaMalloc(&cuda_graph->dest_ptrs_d, host_dest_ptrs_size*sizeof(char *)));
+        cuda_graph->dest_ptrs_size = host_dest_ptrs_size;
+    }
+    // copy destination pointers to GPU
+    CUDA_CHECK(cudaMemcpyAsync(cuda_graph->dest_ptrs_d, host_dest_ptrs, host_dest_ptrs_size*sizeof(char *), cudaMemcpyHostToDevice, stream));
+    cuda_graph->graph_cpynode_index = 0; // reset index
+#endif
+}
+
 static void ggml_cpy_f16_f32_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {

    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
    cpy_f32_f16<cpy_1_f16_f32><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
-        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
 }

 static void ggml_cpy_f32_f32_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {

    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
    cpy_f32_f16<cpy_1_f32_f32><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
-        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
 }

 static void ggml_cpy_f32_f16_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {

    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
    cpy_f32_f16<cpy_1_f32_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
-        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
 }

 static void ggml_cpy_f32_q8_0_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {

    GGML_ASSERT(ne % QK8_0 == 0);
    const int num_blocks = ne / QK8_0;
    cpy_f32_q<cpy_blck_f32_q8_0, QK8_0><<<num_blocks, 1, 0, stream>>>
-        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
 }

 static void ggml_cpy_q8_0_f32_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {

    const int num_blocks = ne;
    cpy_q_f32<cpy_blck_q8_0_f32, QK8_0><<<num_blocks, 1, 0, stream>>>
-        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
 }

 static void ggml_cpy_f32_q4_0_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {

    GGML_ASSERT(ne % QK4_0 == 0);
    const int num_blocks = ne / QK4_0;
    cpy_f32_q<cpy_blck_f32_q4_0, QK4_0><<<num_blocks, 1, 0, stream>>>
-        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
 }

 static void ggml_cpy_q4_0_f32_cuda(
@@ -407,22 +431,22 @@ static void ggml_cpy_q4_0_f32_cuda(
    const int nb00, const int nb01, const int nb02,
    const int nb03, const int ne10, const int ne11, const int ne12,
    const int nb10, const int nb11, const int nb12, const int nb13,
-    cudaStream_t stream) {
+    cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
    const int num_blocks = ne;
    cpy_q_f32<cpy_blck_q_f32<dequantize_q4_0, QK4_0>, QK4_0><<<num_blocks, 1, 0, stream>>>(
        cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
-         ne10, ne11, ne12, nb10, nb11, nb12, nb13);
+         ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
 }

 static void ggml_cpy_f32_q4_1_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {

    GGML_ASSERT(ne % QK4_1 == 0);
    const int num_blocks = ne / QK4_1;
    cpy_f32_q<cpy_blck_f32_q4_1, QK4_1><<<num_blocks, 1, 0, stream>>>
-        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
 }

 static void ggml_cpy_q4_1_f32_cuda(
@@ -431,22 +455,22 @@ static void ggml_cpy_q4_1_f32_cuda(
    const int nb00, const int nb01, const int nb02,
    const int nb03, const int ne10, const int ne11, const int ne12,
    const int nb10, const int nb11, const int nb12, const int nb13,
-    cudaStream_t stream) {
+    cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
    const int num_blocks = ne;
    cpy_q_f32<cpy_blck_q_f32<dequantize_q4_1, QK4_1>, QK4_1><<<num_blocks, 1, 0, stream>>>(
        cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
-         ne10, ne11, ne12, nb10, nb11, nb12, nb13);
+         ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
 }

 static void ggml_cpy_f32_q5_0_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {

    GGML_ASSERT(ne % QK5_0 == 0);
    const int num_blocks = ne / QK5_0;
    cpy_f32_q<cpy_blck_f32_q5_0, QK5_0><<<num_blocks, 1, 0, stream>>>
-        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
 }

 static void ggml_cpy_q5_0_f32_cuda(
@@ -455,22 +479,22 @@ static void ggml_cpy_q5_0_f32_cuda(
    const int nb00, const int nb01, const int nb02,
    const int nb03, const int ne10, const int ne11, const int ne12,
    const int nb10, const int nb11, const int nb12, const int nb13,
-    cudaStream_t stream) {
+    cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
    const int num_blocks = ne;
    cpy_q_f32<cpy_blck_q_f32<dequantize_q5_0, QK5_0>, QK5_0><<<num_blocks, 1, 0, stream>>>(
        cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
-        ne10, ne11, ne12, nb10, nb11, nb12, nb13);
+        ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
 }

 static void ggml_cpy_f32_q5_1_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {

    GGML_ASSERT(ne % QK5_1 == 0);
    const int num_blocks = ne / QK5_1;
    cpy_f32_q<cpy_blck_f32_q5_1, QK5_1><<<num_blocks, 1, 0, stream>>>
-        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
 }

 static void ggml_cpy_q5_1_f32_cuda(
@@ -479,32 +503,32 @@ static void ggml_cpy_q5_1_f32_cuda(
    const int nb00, const int nb01, const int nb02,
    const int nb03, const int ne10, const int ne11, const int ne12,
    const int nb10, const int nb11, const int nb12, const int nb13,
-    cudaStream_t stream) {
+    cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
    const int num_blocks = ne;
    cpy_q_f32<cpy_blck_q_f32<dequantize_q5_1, QK5_1>, QK5_1><<<num_blocks, 1, 0, stream>>>(
        cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
-        ne10, ne11, ne12, nb10, nb11, nb12, nb13);
+        ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
 }

 static void ggml_cpy_f32_iq4_nl_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {

    GGML_ASSERT(ne % QK4_NL == 0);
    const int num_blocks = ne / QK4_NL;
    cpy_f32_q<cpy_blck_f32_iq4_nl, QK4_NL><<<num_blocks, 1, 0, stream>>>
-        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
 }

 static void ggml_cpy_f16_f16_cuda(
    const char * cx, char * cdst, const int ne,
    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {

    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
    cpy_f32_f16<cpy_1_f16_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
-        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
+        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
 }

 void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1) {
@@ -541,46 +565,60 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
    char * src0_ddc = (char *) src0->data;
    char * src1_ddc = (char *) src1->data;

+    char ** dest_ptrs_d = nullptr;
+    int graph_cpynode_index = -1;
+#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)
+    if(ctx.cuda_graph->use_cpy_indirection) {
+        dest_ptrs_d = ctx.cuda_graph->dest_ptrs_d;
+        graph_cpynode_index = ctx.cuda_graph->graph_cpynode_index;
+    }
+#endif
    if (src0->type == src1->type && ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
        GGML_ASSERT(ggml_nbytes(src0) == ggml_nbytes(src1));
        CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
-        ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+        ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
-        ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+        ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
-        ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+        ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
    } else if (src0->type == GGML_TYPE_Q8_0 && src1->type == GGML_TYPE_F32) {
-        ggml_cpy_q8_0_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+        ggml_cpy_q8_0_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
-        ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+        ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
    } else if (src0->type == GGML_TYPE_Q4_0 && src1->type == GGML_TYPE_F32) {
        ggml_cpy_q4_0_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02,
-            nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+            nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
-        ggml_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+        ggml_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
    } else if (src0->type == GGML_TYPE_Q4_1 && src1->type == GGML_TYPE_F32) {
        ggml_cpy_q4_1_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02,
-            nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+            nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_0) {
-        ggml_cpy_f32_q5_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+        ggml_cpy_f32_q5_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
    } else if (src0->type == GGML_TYPE_Q5_0 && src1->type == GGML_TYPE_F32) {
        ggml_cpy_q5_0_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02,
-            nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+            nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_IQ4_NL) {
-        ggml_cpy_f32_iq4_nl_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+        ggml_cpy_f32_iq4_nl_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_1) {
-        ggml_cpy_f32_q5_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+        ggml_cpy_f32_q5_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
    } else if (src0->type == GGML_TYPE_Q5_1 && src1->type == GGML_TYPE_F32) {
-        ggml_cpy_q5_1_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+        ggml_cpy_q5_1_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
-        ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+        ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
-        ggml_cpy_f16_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+        ggml_cpy_f16_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
    } else {
        GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__,
                ggml_type_name(src0->type), ggml_type_name(src1->type));
    }
+#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)
+    if(ctx.cuda_graph->use_cpy_indirection) {
+        ctx.cuda_graph->graph_cpynode_index = graph_cpynode_index;
+    }
+#endif
+
 }

 void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
--- a/ggml/src/ggml-cuda/cpy.cuh
+++ b/ggml/src/ggml-cuda/cpy.cuh
@@ -7,3 +7,5 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
 void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

 void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1);
+
+void ggml_cuda_cpy_dest_ptrs_copy(ggml_cuda_graph * cuda_graph, char ** host_dest_ptrs, const int host_dest_ptrs_size, cudaStream_t stream);
--- a/ggml/src/ggml-cuda/fattn-common.cuh
+++ b/ggml/src/ggml-cuda/fattn-common.cuh
@@ -315,14 +315,14 @@ static __device__ __forceinline__ void quantize_q8_1_to_shared(

    float vals[sizeof(int)] = {0.0f};
 #pragma unroll
-    for (int l = 0; l < sizeof(int); ++l) {
+    for (int l = 0; l < int(sizeof(int)); ++l) {
        vals[l] = scale * x[4*threadIdx.x + l];
    }

    float amax = fabsf(vals[0]);
    float sum  = vals[0];
 #pragma unroll
-    for (int l = 1; l < sizeof(int); ++l) {
+    for (int l = 1; l < int(sizeof(int)); ++l) {
        amax = fmaxf(amax, fabsf(vals[l]));
        sum += vals[l];
    }
@@ -338,7 +338,7 @@ static __device__ __forceinline__ void quantize_q8_1_to_shared(

    if (d != 0.0f) {
 #pragma unroll
-        for (int l = 0; l < sizeof(int); ++l) {
+        for (int l = 0; l < int(sizeof(int)); ++l) {
            q8[l] = roundf(vals[l] / d);
        }
    }
@@ -606,50 +606,50 @@ static __global__ void flash_attn_stream_k_fixup(
    *dst = dst_val / rowsum;
 }

-template<int D, int parallel_blocks> // D == head size
+template<int D> // D == head size
 #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
 __launch_bounds__(D, 1)
 #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
 static __global__ void flash_attn_combine_results(
        const float  * __restrict__ VKQ_parts,
        const float2 * __restrict__ VKQ_meta,
-        float * __restrict__ dst) {
-    VKQ_parts += parallel_blocks*D * gridDim.y*blockIdx.x;
-    VKQ_meta  += parallel_blocks   * gridDim.y*blockIdx.x;
-    dst       +=                 D * gridDim.y*blockIdx.x;
+        float * __restrict__ dst,
+        const int parallel_blocks) {
+    VKQ_parts += parallel_blocks*D * gridDim.z*blockIdx.x;
+    VKQ_meta  += parallel_blocks   * gridDim.z*blockIdx.x;
+    dst       +=                 D * gridDim.z*blockIdx.x;

    const int tid = threadIdx.x;
    __builtin_assume(tid < D);

-    __shared__ float2 meta[parallel_blocks];
+    extern __shared__ float2 meta[];
    if (tid < 2*parallel_blocks) {
-        ((float *) meta)[threadIdx.x] = ((const float *)VKQ_meta) [blockIdx.y*(2*parallel_blocks) + tid];
+        ((float *) meta)[threadIdx.x] = ((const float *)VKQ_meta) [blockIdx.z*(2*parallel_blocks) + tid];
    }

    __syncthreads();

    float kqmax = meta[0].x;
-#pragma unroll
    for (int l = 1; l < parallel_blocks; ++l) {
        kqmax = max(kqmax, meta[l].x);
    }

    float VKQ_numerator   = 0.0f;
    float VKQ_denominator = 0.0f;
-#pragma unroll
    for (int l = 0; l < parallel_blocks; ++l) {
        const float diff = meta[l].x - kqmax;
-        const float KQ_max_scale = expf(diff);
+        float KQ_max_scale = expf(diff);
        const uint32_t ftz_mask = 0xFFFFFFFF * (diff > SOFTMAX_FTZ_THRESHOLD);
        *((uint32_t *) &KQ_max_scale) &= ftz_mask;

-        VKQ_numerator   += KQ_max_scale * VKQ_parts[l*gridDim.y*D + blockIdx.y*D + tid];
+        VKQ_numerator   += KQ_max_scale * VKQ_parts[l*gridDim.z*D + blockIdx.z*D + tid];
        VKQ_denominator += KQ_max_scale * meta[l].y;
    }

-    dst[blockIdx.y*D + tid] = VKQ_numerator / VKQ_denominator;
+    dst[blockIdx.z*D + tid] = VKQ_numerator / VKQ_denominator;
 }

+[[noreturn]]
 static void on_no_fattn_vec_case(const int D) {
    if (D == 64) {
        fprintf(stderr, "Unsupported KV type combination for head_size 64.\n");
@@ -671,12 +671,10 @@ static void on_no_fattn_vec_case(const int D) {
    }
 }

-// parallel_blocks == 0 is stream-k decomposition
-template <int D, int ncols1, int ncols2, int parallel_blocks, int KQ_stride>
+template <int D, int ncols1, int ncols2, int KQ_stride>
 void launch_fattn(
-    ggml_backend_cuda_context & ctx, ggml_tensor * dst, fattn_kernel_t fattn_kernel,
-    const int nwarps, const size_t nbytes_shared, const bool need_f16_K, const bool need_f16_V,
-    const int warp_size = WARP_SIZE
+    ggml_backend_cuda_context & ctx, ggml_tensor * dst, fattn_kernel_t fattn_kernel, const int nwarps, const size_t nbytes_shared,
+    const int KQ_row_granularity, const bool need_f16_K, const bool need_f16_V, const bool stream_k, const int warp_size = WARP_SIZE
 ) {
    constexpr int ncols = ncols1 * ncols2;

@@ -748,12 +746,14 @@ void launch_fattn(
        nb23 = nb23*bs*sizeof(half)/ts;
    }

+    int parallel_blocks = 1;
+
    const int ntiles_x = ((Q->ne[1] + ncols1 - 1) / ncols1);
    const int ntiles_total = ntiles_x * (Q->ne[2] / ncols2) * Q->ne[3];

    const dim3 block_dim(warp_size, nwarps, 1);
    dim3 blocks_num;
-    if (parallel_blocks == 0) {
+    if (stream_k) {
        // For short contexts it can be faster to have the SMs work on whole tiles because this lets us skip the fixup.
        const int max_blocks = 2*nsm;
        const int tiles_nwaves = (ntiles_total + max_blocks - 1) / max_blocks;
@@ -769,9 +769,43 @@ void launch_fattn(

        dst_tmp_meta.alloc(blocks_num.x*ncols * (2*2 + D) * sizeof(float));
    } else {
-        blocks_num.x = parallel_blocks*ntiles_x;
-        blocks_num.y = Q->ne[2];
-        blocks_num.z = Q->ne[3];
+        GGML_ASSERT(K->ne[1] % KQ_row_granularity == 0);
+        const int ntiles_KQ = K->ne[1] / KQ_row_granularity; // Max. number of parallel blocks limited by tensor size.
+
+        int max_blocks_per_sm = 1; // Max. number of active blocks limited by occupancy.
+        CUDA_CHECK(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks_per_sm, fattn_kernel, block_dim.x * block_dim.y * block_dim.z, nbytes_shared));
+
+        // parallel_blocks should be at least large enough to achieve max. occupancy for a single wave:
+        parallel_blocks = std::max((nsm * max_blocks_per_sm) / ntiles_total, 1);
+
+        // parallel_blocks must not be larger than what the tensor size allows:
+        parallel_blocks = std::min(parallel_blocks, ntiles_KQ);
+
+        // If ntiles_total % blocks_per_wave != 0 then some efficiency is lost due to tail effects.
+        // Test whether parallel_blocks can be set to a higher value for better efficiency.
+        const int blocks_per_wave = nsm * max_blocks_per_sm;
+        int nwaves_best = 0;
+        int efficiency_percent_best = 0;
+        for (int parallel_blocks_test = parallel_blocks; parallel_blocks_test <= ntiles_KQ; ++parallel_blocks_test) {
+            const int nblocks_total = ntiles_total * parallel_blocks_test;
+            const int nwaves = (nblocks_total + blocks_per_wave - 1) / blocks_per_wave;
+            const int efficiency_percent = 100 * nblocks_total / (nwaves*blocks_per_wave);
+
+            // Stop trying configurations with more waves if we already have good efficiency to avoid excessive overhead.
+            if (efficiency_percent_best >= 90 && nwaves > nwaves_best) {
+                break;
+            }
+
+            if (efficiency_percent > efficiency_percent_best) {
+                nwaves_best = nwaves;
+                efficiency_percent_best = efficiency_percent;
+                parallel_blocks = parallel_blocks_test;
+            }
+        }
+
+        blocks_num.x = ntiles_x;
+        blocks_num.y = parallel_blocks;
+        blocks_num.z = Q->ne[2]*Q->ne[3];

        if (parallel_blocks > 1) {
            dst_tmp.alloc(parallel_blocks*ggml_nelements(KQV));
@@ -803,7 +837,7 @@ void launch_fattn(
        K_data,
        V_data,
        mask ? ((const char *) mask->data) : nullptr,
-        (parallel_blocks) > 1 ? dst_tmp.ptr : (float *) KQV->data, dst_tmp_meta.ptr,
+        !stream_k && parallel_blocks > 1 ? dst_tmp.ptr : (float *) KQV->data, dst_tmp_meta.ptr,
        scale, max_bias, m0, m1, n_head_log2, logit_softcap,
        Q->ne[0], Q->ne[1], Q->ne[2], Q->ne[3],
        K->ne[0], K->ne[1], K->ne[2], K->ne[3],
@@ -815,7 +849,7 @@ void launch_fattn(
    );
    CUDA_CHECK(cudaGetLastError());

-    if constexpr (parallel_blocks == 0) {
+    if (stream_k) {
        if (ntiles_total % blocks_num.x != 0) { // Fixup is only needed if the SMs work on fractional tiles.
            const dim3 block_dim_combine(D, 1, 1);
            const dim3 blocks_num_combine = {blocks_num.x, ncols1, ncols2};
@@ -824,13 +858,14 @@ void launch_fattn(
                <<<blocks_num_combine, block_dim_combine, 0, main_stream>>>
                ((float *) KQV->data, dst_tmp_meta.ptr, Q->ne[1], Q->ne[2], K->ne[1]);
        }
-    } else if constexpr (parallel_blocks > 1) {
+    } else if (parallel_blocks > 1) {
        const dim3 block_dim_combine(D, 1, 1);
-        const dim3 blocks_num_combine(Q->ne[1], blocks_num.y, blocks_num.z);
+        const dim3 blocks_num_combine(Q->ne[1], 1, blocks_num.z);
+        const size_t nbytes_shared_combine = parallel_blocks*sizeof(float2);

-        flash_attn_combine_results<D, parallel_blocks>
-            <<<blocks_num_combine, block_dim_combine, 0, main_stream>>>
-            (dst_tmp.ptr, dst_tmp_meta.ptr, (float *) KQV->data);
+        flash_attn_combine_results<D>
+            <<<blocks_num_combine, block_dim_combine, nbytes_shared_combine, main_stream>>>
+            (dst_tmp.ptr, dst_tmp_meta.ptr, (float *) KQV->data, parallel_blocks);
    }
    CUDA_CHECK(cudaGetLastError());
 }
--- a/ggml/src/ggml-cuda/fattn-mma-f16.cuh
+++ b/ggml/src/ggml-cuda/fattn-mma-f16.cuh
@@ -406,6 +406,15 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
 #endif // CP_ASYNC_AVAILABLE

 #else
+    GGML_UNUSED(Q_f2); GGML_UNUSED(K_h2); GGML_UNUSED(V_h2);
+    GGML_UNUSED(mask_h2); GGML_UNUSED(dstk); GGML_UNUSED(dstk_fixup);
+    GGML_UNUSED(scale); GGML_UNUSED(slope); GGML_UNUSED(logit_softcap);
+    GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(stride_KV);
+    GGML_UNUSED(stride_mask); GGML_UNUSED(jt); GGML_UNUSED(tile_K);
+    GGML_UNUSED(stride_mask); GGML_UNUSED(jt); GGML_UNUSED(tile_K);
+    GGML_UNUSED(tile_V); GGML_UNUSED(tile_mask); GGML_UNUSED(Q_B);
+    GGML_UNUSED(VKQ_C); GGML_UNUSED(KQ_max); GGML_UNUSED(KQ_rowsum);
+    GGML_UNUSED(kb0);
    NO_DEVICE_CODE;
 #endif // NEW_MMA_AVAILABLE
 }
@@ -797,6 +806,12 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
        __syncthreads();
    }
 #else
+    GGML_UNUSED(Q_f2); GGML_UNUSED(K_h2); GGML_UNUSED(V_h2);
+    GGML_UNUSED(mask_h2); GGML_UNUSED(dstk); GGML_UNUSED(dstk_fixup);
+    GGML_UNUSED(scale); GGML_UNUSED(slope); GGML_UNUSED(logit_softcap);
+    GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(stride_Q1);
+    GGML_UNUSED(stride_Q2); GGML_UNUSED(stride_KV); GGML_UNUSED(stride_mask);
+    GGML_UNUSED(jt); GGML_UNUSED(kb0_start); GGML_UNUSED(kb0_stop);
    NO_DEVICE_CODE;
 #endif // NEW_MMA_AVAILABLE
 }
@@ -931,6 +946,16 @@ static __global__ void flash_attn_ext_f16(
        (Q_f2, K_h2, V_h2, mask_h2, dstk, dst_meta, scale, slope, logit_softcap,
         ne01, ne02, stride_Q1, stride_Q2, stride_KV, stride_mask, jt, kb0_start_kernel, kb0_stop_kernel);
 #else
+    GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
+    GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
+    GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
+    GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap); GGML_UNUSED(ne00);
+    GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03); GGML_UNUSED(ne10);
+    GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31);
+    GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03);
+    GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13); GGML_UNUSED(nb21);
+    GGML_UNUSED(nb22); GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
+    GGML_UNUSED(ne2); GGML_UNUSED(ne3);
    NO_DEVICE_CODE;
 #endif // defined(FLASH_ATTN_AVAILABLE) && defined(NEW_MMA_AVAILABLE)
 }
@@ -970,7 +995,8 @@ void ggml_cuda_flash_attn_ext_mma_f16_case(ggml_backend_cuda_context & ctx, ggml
        fattn_kernel = flash_attn_ext_f16<D, ncols1, ncols2, nwarps, KQ_per_iter, ntiles, use_logit_softcap>;
    }

-    launch_fattn<D, ncols1, ncols2, 0, KQ_per_iter>(ctx, dst, fattn_kernel, nwarps, nbytes_shared_total, true, true);
+    launch_fattn<D, ncols1, ncols2, KQ_per_iter>
+        (ctx, dst, fattn_kernel, nwarps, nbytes_shared_total, FATTN_KQ_STRIDE, true, true, true);
 }


@@ -984,38 +1010,38 @@ void ggml_cuda_flash_attn_ext_mma_f16_case(ggml_backend_cuda_context & ctx, ggml
    extern DECL_FATTN_MMA_F16_CASE(D, (ncols)/4, 4); \
    extern DECL_FATTN_MMA_F16_CASE(D, (ncols)/8, 8); \

-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64,   8);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80,   8);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96,   8);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112,   8);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128,   8);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256,   8);
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64,   8)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80,   8)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96,   8)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112,   8)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128,   8)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256,   8)

-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64,  16);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80,  16);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96,  16);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112,  16);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128,  16);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256,  16);
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64,  16)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80,  16)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96,  16)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112,  16)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128,  16)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256,  16)

-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64,  32);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80,  32);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96,  32);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112,  32);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128,  32);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256,  32);
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64,  32)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80,  32)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96,  32)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112,  32)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128,  32)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256,  32)

-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64,  64);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80,  64);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96,  64);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112,  64);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128,  64);
-DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256,  64);
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64,  64)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80,  64)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96,  64)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112,  64)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128,  64)
+DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256,  64)

 // Kernels with ncols == 128 are only 4% faster due to register pressure.
-// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 128);
-// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 128);
-// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 128);
-// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 128);
-// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 128);
-// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 128); // Needs too much shared memory.
+// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 64, 128)
+// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 80, 128)
+// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2( 96, 128)
+// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(112, 128)
+// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(128, 128)
+// DECL_FATTN_MMA_F16_CASE_ALL_NCOLS2(256, 128) // Needs too much shared memory.
--- a/ggml/src/ggml-cuda/fattn-tile-f16.cu
+++ b/ggml/src/ggml-cuda/fattn-tile-f16.cu
@@ -4,7 +4,7 @@

 #define FATTN_KQ_STRIDE_TILE_F16 64

-template<int D, int ncols, int nwarps, int parallel_blocks, bool use_logit_softcap> // D == head size
+template<int D, int ncols, int nwarps, bool use_logit_softcap> // D == head size
 #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
 __launch_bounds__(nwarps*WARP_SIZE, 1)
 #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
@@ -58,18 +58,17 @@ static __global__ void flash_attn_tile_ext_f16(

    //In this kernel Q, K, V are matrices while i, j, k are matrix indices.

-    const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on.
-    const int ip  =  blockIdx.x % parallel_blocks; // Index in group of blocks running for the same column in parallel.
+    const int ic0 = blockIdx.x * ncols; // Index of the Q/QKV column to work on.

    const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
-    const float2 * Q_f2  = (const float2 *) (Q    + nb02* blockIdx.y              + nb01*ic0);
-    const half2  * K_h2  = (const half2  *) (K    + nb12*(blockIdx.y / gqa_ratio));
-    const half2  * V_h2  = (const half2  *) (V    + nb12*(blockIdx.y / gqa_ratio)); // K and V have same shape
+    const float2 * Q_f2  = (const float2 *) (Q    + nb02* blockIdx.z              + nb01*ic0);
+    const half2  * K_h2  = (const half2  *) (K    + nb12*(blockIdx.z / gqa_ratio));
+    const half2  * V_h2  = (const half2  *) (V    + nb12*(blockIdx.z / gqa_ratio)); // K and V have same shape
    const half   * maskh = (const half   *)  mask + ne11*ic0;

    const int stride_KV2 = nb11 / sizeof(half2);

-    const float slopef = get_alibi_slope(max_bias, blockIdx.y, n_head_log2, m0, m1);
+    const float slopef = get_alibi_slope(max_bias, blockIdx.z, n_head_log2, m0, m1);
    const half  slopeh = __float2half(slopef);

    static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64.");
@@ -105,8 +104,7 @@ static __global__ void flash_attn_tile_ext_f16(

    __syncthreads();

-    const int k_start = parallel_blocks == 1 ? 0 : ip*FATTN_KQ_STRIDE_TILE_F16;
-    for (int k_VKQ_0 = k_start; k_VKQ_0 < ne11; k_VKQ_0 += parallel_blocks*FATTN_KQ_STRIDE_TILE_F16) {
+    for (int k_VKQ_0 = blockIdx.y*FATTN_KQ_STRIDE_TILE_F16; k_VKQ_0 < ne11; k_VKQ_0 += gridDim.y*FATTN_KQ_STRIDE_TILE_F16) {
        // Calculate KQ tile and keep track of new maximum KQ values:

        half kqmax_new[ncols/nwarps];
@@ -271,24 +269,36 @@ static __global__ void flash_attn_tile_ext_f16(
            const int i0 = i00 + 2*threadIdx.x;

            half2 dst_val = VKQ[j_VKQ_0/nwarps][i0/(2*WARP_SIZE)];
-            if (parallel_blocks == 1) {
+            if (gridDim.y == 1) {
                dst_val /= __half2half2(kqsum_j);
            }
-            const int j_dst = (ic0 + j_VKQ)*parallel_blocks + ip;
-            dst[j_dst*D*gridDim.y + D*blockIdx.y + i0 + 0] =  __low2float(dst_val);
-            dst[j_dst*D*gridDim.y + D*blockIdx.y + i0 + 1] = __high2float(dst_val);
+            const int j_dst = (ic0 + j_VKQ)*gridDim.y + blockIdx.y;
+            dst[j_dst*D*gridDim.z + D*blockIdx.z + i0 + 0] =  __low2float(dst_val);
+            dst[j_dst*D*gridDim.z + D*blockIdx.z + i0 + 1] = __high2float(dst_val);
        }

-        if (parallel_blocks != 1 && threadIdx.x == 0) {
-            dst_meta[(ic0 + j_VKQ)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[j_VKQ_0/nwarps], kqsum_j);
+        if (gridDim.y != 1 && threadIdx.x == 0) {
+            dst_meta[((ic0 + j_VKQ)*gridDim.z + blockIdx.z) * gridDim.y + blockIdx.y] = make_float2(kqmax[j_VKQ_0/nwarps], kqsum_j);
        }
    }
 #else
-   NO_DEVICE_CODE;
+    GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
+    GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
+    GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
+    GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
+    GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02);
+    GGML_UNUSED(ne03); GGML_UNUSED(ne10); GGML_UNUSED(ne11);
+    GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31);
+    GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
+    GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12);
+    GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22);
+    GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
+    GGML_UNUSED(ne2); GGML_UNUSED(ne3);
+    NO_DEVICE_CODE;
 #endif // defined(FLASH_ATTN_AVAILABLE) && defined(FP16_AVAILABLE)
 }

-template <int cols_per_block, int parallel_blocks, bool use_logit_softcap>
+template <int cols_per_block, bool use_logit_softcap>
 void launch_fattn_tile_f16_64_128(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * Q = dst->src[0];
    switch (Q->ne[0]) {
@@ -296,15 +306,17 @@ void launch_fattn_tile_f16_64_128(ggml_backend_cuda_context & ctx, ggml_tensor *
            constexpr int    D             = 64;
            constexpr int    nwarps        = 8;
            constexpr size_t nbytes_shared = 0;
-            fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f16<D, cols_per_block, nwarps, parallel_blocks, use_logit_softcap>;
-            launch_fattn<D, cols_per_block, 1, parallel_blocks, -1>(ctx, dst, fattn_kernel, nwarps, nbytes_shared, true, true);
+            fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f16<D, cols_per_block, nwarps, use_logit_softcap>;
+            launch_fattn<D, cols_per_block, 1, -1>
+                (ctx, dst, fattn_kernel, nwarps, nbytes_shared, FATTN_KQ_STRIDE_TILE_F16, true, true, false);
        } break;
        case 128: {
            constexpr int    D             = 128;
            constexpr int    nwarps        = 8;
            constexpr size_t nbytes_shared = 0;
-            fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f16<D, cols_per_block, nwarps, parallel_blocks, use_logit_softcap>;
-            launch_fattn<D, cols_per_block, 1, parallel_blocks, -1>(ctx, dst, fattn_kernel, nwarps, nbytes_shared, true, true);
+            fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f16<D, cols_per_block, nwarps, use_logit_softcap>;
+            launch_fattn<D, cols_per_block, 1, -1>
+                (ctx, dst, fattn_kernel, nwarps, nbytes_shared, FATTN_KQ_STRIDE_TILE_F16, true, true, false);
        } break;
        default: {
            GGML_ABORT("FlashAttention without tensor cores only supports head sizes 64 and 128.");
@@ -324,37 +336,22 @@ void ggml_cuda_flash_attn_ext_tile_f16(ggml_backend_cuda_context & ctx, ggml_ten

    if (Q->ne[1] <= 16) {
        constexpr int cols_per_block = 16;
-        constexpr int parallel_blocks = 4;
        if (logit_softcap == 0.0f) {
            constexpr bool use_logit_softcap = false;
-            launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
+            launch_fattn_tile_f16_64_128<cols_per_block, use_logit_softcap>(ctx, dst);
        } else {
            constexpr bool use_logit_softcap = true;
-            launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
-        }
-        return;
-    }
-
-    if (Q->ne[1] <= 32) {
-        constexpr int cols_per_block = 32;
-        constexpr int parallel_blocks = 4;
-        if (logit_softcap == 0.0f) {
-            constexpr bool use_logit_softcap = false;
-            launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
-        } else {
-            constexpr bool use_logit_softcap = true;
-            launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
+            launch_fattn_tile_f16_64_128<cols_per_block, use_logit_softcap>(ctx, dst);
        }
        return;
    }

    constexpr int cols_per_block = 32;
-    constexpr int parallel_blocks = 1;
    if (logit_softcap == 0.0f) {
        constexpr bool use_logit_softcap = false;
-        launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
+        launch_fattn_tile_f16_64_128<cols_per_block, use_logit_softcap>(ctx, dst);
    } else {
        constexpr bool use_logit_softcap = true;
-        launch_fattn_tile_f16_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
+        launch_fattn_tile_f16_64_128<cols_per_block, use_logit_softcap>(ctx, dst);
    }
 }
--- a/ggml/src/ggml-cuda/fattn-tile-f32.cu
+++ b/ggml/src/ggml-cuda/fattn-tile-f32.cu
@@ -4,7 +4,7 @@

 #define FATTN_KQ_STRIDE_TILE_F32 32

-template<int D, int ncols, int nwarps, int parallel_blocks, bool use_logit_softcap> // D == head size
+template<int D, int ncols, int nwarps, bool use_logit_softcap> // D == head size
 #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
 __launch_bounds__(nwarps*WARP_SIZE, 1)
 #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
@@ -58,18 +58,17 @@ static __global__ void flash_attn_tile_ext_f32(

    // In this kernel Q, K, V are matrices while i, j, k are matrix indices.

-    const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on.
-    const int ip  =  blockIdx.x % parallel_blocks; // Index in group of blocks running for the same column in parallel.
+    const int ic0 = blockIdx.x * ncols; // Index of the Q/QKV column to work on.

    const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
-    const float2 * Q_f2  = (const float2 *) (Q    + nb02* blockIdx.y              + nb01*ic0);
-    const half2  * K_h2  = (const half2  *) (K    + nb12*(blockIdx.y / gqa_ratio));
-    const half2  * V_h2  = (const half2  *) (V    + nb12*(blockIdx.y / gqa_ratio)); // K and V have same shape
+    const float2 * Q_f2  = (const float2 *) (Q    + nb02* blockIdx.z              + nb01*ic0);
+    const half2  * K_h2  = (const half2  *) (K    + nb12*(blockIdx.z / gqa_ratio));
+    const half2  * V_h2  = (const half2  *) (V    + nb12*(blockIdx.z / gqa_ratio)); // K and V have same shape
    const half   * maskh = (const half   *)  mask + ne11*ic0;

    const int stride_KV2 = nb11 / sizeof(half2);

-    const float slope = get_alibi_slope(max_bias, blockIdx.y, n_head_log2, m0, m1);
+    const float slope = get_alibi_slope(max_bias, blockIdx.z, n_head_log2, m0, m1);

    static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64.");

@@ -103,8 +102,7 @@ static __global__ void flash_attn_tile_ext_f32(

    __syncthreads();

-    const int k_start = parallel_blocks == 1 ? 0 : ip*FATTN_KQ_STRIDE_TILE_F32;
-    for (int k_VKQ_0 = k_start; k_VKQ_0 < ne11; k_VKQ_0 += parallel_blocks*FATTN_KQ_STRIDE_TILE_F32) {
+    for (int k_VKQ_0 = blockIdx.y*FATTN_KQ_STRIDE_TILE_F32; k_VKQ_0 < ne11; k_VKQ_0 += gridDim.y*FATTN_KQ_STRIDE_TILE_F32) {
        // Calculate KQ tile and keep track of new maximum KQ values:

        float kqmax_new[ncols/nwarps];
@@ -269,25 +267,37 @@ static __global__ void flash_attn_tile_ext_f32(
            const int i0 = i00 + 2*threadIdx.x;

            float2 dst_val = VKQ[j_VKQ_0/nwarps][i0/(2*WARP_SIZE)];
-            if (parallel_blocks == 1) {
+            if (gridDim.y == 1) {
                dst_val.x /= kqsum_j;
                dst_val.y /= kqsum_j;
            }
-            const int j_dst = (ic0 + j_VKQ)*parallel_blocks + ip;
-            dst[j_dst*D*gridDim.y + D*blockIdx.y + i0 + 0] = dst_val.x;
-            dst[j_dst*D*gridDim.y + D*blockIdx.y + i0 + 1] = dst_val.y;
+            const int j_dst = (ic0 + j_VKQ)*gridDim.y + blockIdx.y;
+            dst[j_dst*D*gridDim.z + D*blockIdx.z + i0 + 0] = dst_val.x;
+            dst[j_dst*D*gridDim.z + D*blockIdx.z + i0 + 1] = dst_val.y;
        }

-        if (parallel_blocks != 1 && threadIdx.x == 0) {
-            dst_meta[(ic0 + j_VKQ)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[j_VKQ_0/nwarps], kqsum_j);
+        if (gridDim.y != 1 && threadIdx.x == 0) {
+            dst_meta[((ic0 + j_VKQ)*gridDim.z + blockIdx.z) * gridDim.y + blockIdx.y] = make_float2(kqmax[j_VKQ_0/nwarps], kqsum_j);
        }
    }
 #else
+    GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
+    GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
+    GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
+    GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
+    GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02);
+    GGML_UNUSED(ne03); GGML_UNUSED(ne10); GGML_UNUSED(ne11);
+    GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31);
+    GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
+    GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12);
+    GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22);
+    GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
+    GGML_UNUSED(ne2); GGML_UNUSED(ne3);
    NO_DEVICE_CODE;
 #endif // FLASH_ATTN_AVAILABLE
 }

-template <int cols_per_block, int parallel_blocks, bool use_logit_softcap>
+template <int cols_per_block, bool use_logit_softcap>
 void launch_fattn_tile_f32_64_128(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * Q = dst->src[0];
    switch (Q->ne[0]) {
@@ -295,15 +305,17 @@ void launch_fattn_tile_f32_64_128(ggml_backend_cuda_context & ctx, ggml_tensor *
            constexpr int    D             = 64;
            constexpr int    nwarps        = 8;
            constexpr size_t nbytes_shared = 0;
-            fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f32<D, cols_per_block, nwarps, parallel_blocks, use_logit_softcap>;
-            launch_fattn<D, cols_per_block, 1, parallel_blocks, -1>(ctx, dst, fattn_kernel, nwarps, nbytes_shared, true, true);
+            fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f32<D, cols_per_block, nwarps, use_logit_softcap>;
+            launch_fattn<D, cols_per_block, 1, -1>
+                (ctx, dst, fattn_kernel, nwarps, nbytes_shared, FATTN_KQ_STRIDE_TILE_F32, true, true, false);
        } break;
        case 128: {
            constexpr int    D             = 128;
            constexpr int    nwarps        = 8;
            constexpr size_t nbytes_shared = 0;
-            fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f32<D, cols_per_block, nwarps, parallel_blocks, use_logit_softcap>;
-            launch_fattn<D, cols_per_block, 1, parallel_blocks, -1>(ctx, dst, fattn_kernel, nwarps, nbytes_shared, true, true);
+            fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f32<D, cols_per_block, nwarps, use_logit_softcap>;
+            launch_fattn<D, cols_per_block, 1, -1>
+                (ctx, dst, fattn_kernel, nwarps, nbytes_shared, FATTN_KQ_STRIDE_TILE_F32, true, true, false);
        } break;
        default: {
            GGML_ABORT("FlashAttention without tensor cores only supports head sizes 64 and 128.");
@@ -320,37 +332,22 @@ void ggml_cuda_flash_attn_ext_tile_f32(ggml_backend_cuda_context & ctx, ggml_ten

    if (Q->ne[1] <= 16) {
        constexpr int cols_per_block = 16;
-        constexpr int parallel_blocks = 4;
        if (logit_softcap == 0.0f) {
            constexpr bool use_logit_softcap = false;
-            launch_fattn_tile_f32_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
+            launch_fattn_tile_f32_64_128<cols_per_block, use_logit_softcap>(ctx, dst);
        } else {
            constexpr bool use_logit_softcap = true;
-            launch_fattn_tile_f32_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
-        }
-        return;
-    }
-
-    if (Q->ne[1] <= 32) {
-        constexpr int cols_per_block = 32;
-        constexpr int parallel_blocks = 4;
-        if (logit_softcap == 0.0f) {
-            constexpr bool use_logit_softcap = false;
-            launch_fattn_tile_f32_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
-        } else {
-            constexpr bool use_logit_softcap = true;
-            launch_fattn_tile_f32_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
+            launch_fattn_tile_f32_64_128<cols_per_block, use_logit_softcap>(ctx, dst);
        }
        return;
    }

    constexpr int cols_per_block = 32;
-    constexpr int parallel_blocks = 1;
    if (logit_softcap == 0.0f) {
        constexpr bool use_logit_softcap = false;
-        launch_fattn_tile_f32_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
+        launch_fattn_tile_f32_64_128<cols_per_block, use_logit_softcap>(ctx, dst);
    } else {
        constexpr bool use_logit_softcap = true;
-        launch_fattn_tile_f32_64_128<cols_per_block, parallel_blocks, use_logit_softcap>(ctx, dst);
+        launch_fattn_tile_f32_64_128<cols_per_block, use_logit_softcap>(ctx, dst);
    }
 }
--- a/ggml/src/ggml-cuda/fattn-vec-f16.cuh
+++ b/ggml/src/ggml-cuda/fattn-vec-f16.cuh
@@ -1,7 +1,7 @@
 #include "common.cuh"
 #include "fattn-common.cuh"

-template<int D, int ncols, int parallel_blocks, ggml_type type_K, ggml_type type_V, bool use_logit_softcap> // D == head size
+template<int D, int ncols, ggml_type type_K, ggml_type type_V, bool use_logit_softcap> // D == head size
 #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
 __launch_bounds__(D, 1)
 #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
@@ -55,17 +55,16 @@ static __global__ void flash_attn_vec_ext_f16(
    constexpr bool Q_q8_1 = type_K != GGML_TYPE_F16;
    constexpr dequantize_1_f16_t dequantize_1_v = get_dequantize_1_f16(type_V);

-    const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on.
-    const int ip  =  blockIdx.x % parallel_blocks; // Index in group of blocks running for the same column in parallel.
+    const int ic0 = blockIdx.x * ncols; // Index of the Q/QKV column to work on.

    const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
-    Q += nb02* blockIdx.y              + nb01*ic0;
-    K += nb12*(blockIdx.y / gqa_ratio);
-    V += nb22*(blockIdx.y / gqa_ratio);
+    Q += nb02* blockIdx.z              + nb01*ic0;
+    K += nb12*(blockIdx.z / gqa_ratio);
+    V += nb22*(blockIdx.z / gqa_ratio);

    const half * maskh = (const half   *)  mask + ne11*ic0;

-    const float slopef = get_alibi_slope(max_bias, blockIdx.y, n_head_log2, m0, m1);
+    const float slopef = get_alibi_slope(max_bias, blockIdx.z, n_head_log2, m0, m1);
    const half  slopeh = __float2half(slopef);

    static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64.");
@@ -172,8 +171,7 @@ static __global__ void flash_attn_vec_ext_f16(

    half2 VKQ[ncols] = {{0.0f, 0.0f}};

-    const int k_start = parallel_blocks == 1 ? 0 : ip*D;
-    for (int k_VKQ_0 = k_start; k_VKQ_0 < ne11; k_VKQ_0 += parallel_blocks*D) {
+    for (int k_VKQ_0 = blockIdx.y*D; k_VKQ_0 < ne11; k_VKQ_0 += gridDim.y*D) {
        // Calculate KQ tile and keep track of new maximum KQ values:

        // For unknown reasons using a half array of size 1 for kqmax_new causes a performance regression,
@@ -283,29 +281,41 @@ static __global__ void flash_attn_vec_ext_f16(
        kqsum[j_VKQ] = warp_reduce_sum((float)kqsum[j_VKQ]);

        half dst_val = (__low2half(VKQ[j_VKQ]) + __high2half(VKQ[j_VKQ]));
-        if (parallel_blocks == 1) {
+        if (gridDim.y == 1) {
            dst_val /= kqsum[j_VKQ];
        }
-        const int j_dst = (ic0 + j_VKQ)*parallel_blocks + ip;
-        dst[j_dst*D*gridDim.y + D*blockIdx.y + tid] = dst_val;
+        const int j_dst = (ic0 + j_VKQ)*gridDim.y + blockIdx.y;
+        dst[j_dst*D*gridDim.z + D*blockIdx.z + tid] = dst_val;
    }

-    if (parallel_blocks != 1 && tid < ncols && (ncols <= 2 || ic0 + tid < ne01)) {
-        dst_meta[(ic0 + tid)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[tid], kqsum[tid]);
+    if (gridDim.y != 1 && tid < ncols && (ncols <= 2 || ic0 + tid < ne01)) {
+        dst_meta[((ic0 + tid)*gridDim.z + blockIdx.z) * gridDim.y + blockIdx.y] = make_float2(kqmax[tid], kqsum[tid]);
    }
 #else
-   NO_DEVICE_CODE;
+    GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
+    GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
+    GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
+    GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
+    GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02);
+    GGML_UNUSED(ne03); GGML_UNUSED(ne10); GGML_UNUSED(ne11);
+    GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31);
+    GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
+    GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12);
+    GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22);
+    GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
+    GGML_UNUSED(ne2); GGML_UNUSED(ne3);
+    NO_DEVICE_CODE;
 #endif // defined(FLASH_ATTN_AVAILABLE) && defined(FP16_AVAILABLE)
 }

-template <int D, int cols_per_block, int parallel_blocks, ggml_type type_K, ggml_type type_V, bool use_logit_softcap>
+template <int D, int cols_per_block, ggml_type type_K, ggml_type type_V, bool use_logit_softcap>
 void ggml_cuda_flash_attn_ext_vec_f16_case_impl(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    constexpr int nwarps = D/WARP_SIZE;
-    fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f16<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>;
+    fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f16<D, cols_per_block, type_K, type_V, use_logit_softcap>;
    constexpr bool need_f16_K = D != 128;
    constexpr bool need_f16_V = D != 128 && D != 64;
    constexpr size_t nbytes_shared = 0;
-    launch_fattn<D, cols_per_block, 1, parallel_blocks, -1>(ctx, dst, fattn_kernel, nwarps, nbytes_shared, need_f16_K, need_f16_V);
+    launch_fattn<D, cols_per_block, 1, -1>(ctx, dst, fattn_kernel, nwarps, nbytes_shared, D, need_f16_K, need_f16_V, false);
 }

 template <int D, ggml_type type_K, ggml_type type_V>
@@ -325,65 +335,48 @@ void ggml_cuda_flash_attn_ext_vec_f16_case(ggml_backend_cuda_context & ctx, ggml
    memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));

    if (Q->ne[1] == 1) {
-        constexpr int cols_per_block  = 1;
-        constexpr int parallel_blocks = 4;
+        constexpr int cols_per_block = 1;
        if (logit_softcap == 0.0f) {
            constexpr bool use_logit_softcap = false;
-            ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
+            ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
        } else {
            constexpr bool use_logit_softcap = true;
-            ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
+            ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
        }
        return;
    }

    if (Q->ne[1] == 2) {
-        constexpr int cols_per_block  = 2;
-        constexpr int parallel_blocks = 4;
+        constexpr int cols_per_block = 2;
        if (logit_softcap == 0.0f) {
            constexpr bool use_logit_softcap = false;
-            ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
+            ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
        } else {
            constexpr bool use_logit_softcap = true;
-            ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
+            ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
        }
        return;
    }

    if (Q->ne[1] <= 4) {
-        constexpr int cols_per_block  = 4;
-        constexpr int parallel_blocks = 4;
+        constexpr int cols_per_block = 4;
        if (logit_softcap == 0.0f) {
            constexpr bool use_logit_softcap = false;
-            ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
+            ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
        } else {
            constexpr bool use_logit_softcap = true;
-            ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
+            ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
        }
        return;
    }

-    if (Q->ne[1] <= 8) {
-        constexpr int cols_per_block  = 8;
-        constexpr int parallel_blocks = 4;
-        if (logit_softcap == 0.0f) {
-            constexpr bool use_logit_softcap = false;
-            ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
-        } else {
-            constexpr bool use_logit_softcap = true;
-            ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
-        }
-        return;
-    }
-
-    constexpr int cols_per_block  = 8;
-    constexpr int parallel_blocks = 1;
+    constexpr int cols_per_block = 8;
    if (logit_softcap == 0.0f) {
        constexpr bool use_logit_softcap = false;
-        ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
+        ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
    } else {
        constexpr bool use_logit_softcap = true;
-        ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
+        ggml_cuda_flash_attn_ext_vec_f16_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
    }
 }

--- a/ggml/src/ggml-cuda/fattn-vec-f32.cuh
+++ b/ggml/src/ggml-cuda/fattn-vec-f32.cuh
@@ -1,7 +1,7 @@
 #include "common.cuh"
 #include "fattn-common.cuh"

-template<int D, int ncols, int parallel_blocks, ggml_type type_K, ggml_type type_V, bool use_logit_softcap> // D == head size
+template<int D, int ncols, ggml_type type_K, ggml_type type_V, bool use_logit_softcap> // D == head size
 #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
 __launch_bounds__(D, 1)
 #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
@@ -55,16 +55,15 @@ static __global__ void flash_attn_vec_ext_f32(
    constexpr bool Q_q8_1 = type_K != GGML_TYPE_F16;
    constexpr dequantize_1_f32_t dequantize_1_v = get_dequantize_1_f32(type_V);

-    const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on.
-    const int ip  =  blockIdx.x % parallel_blocks; // Index in group of blocks running for the same column in parallel.
+    const int ic0 = blockIdx.x * ncols; // Index of the Q/QKV column to work on.

    const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
-    Q += nb02* blockIdx.y              + nb01*ic0;
-    K += nb12*(blockIdx.y / gqa_ratio);
-    V += nb22*(blockIdx.y / gqa_ratio); // K and V have same shape
+    Q += nb02* blockIdx.z              + nb01*ic0;
+    K += nb12*(blockIdx.z / gqa_ratio);
+    V += nb22*(blockIdx.z / gqa_ratio); // K and V have same shape
    const half * maskh = (const half   *)  mask + ne11*ic0;

-    const float slope = get_alibi_slope(max_bias, blockIdx.y, n_head_log2, m0, m1);
+    const float slope = get_alibi_slope(max_bias, blockIdx.z, n_head_log2, m0, m1);

    static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64.");
    constexpr int nwarps = D / WARP_SIZE;
@@ -167,8 +166,7 @@ static __global__ void flash_attn_vec_ext_f32(

    float VKQ[ncols] = {0.0f};

-    const int k_start = parallel_blocks == 1 ? 0 : ip*D;
-    for (int k_VKQ_0 = k_start; k_VKQ_0 < ne11; k_VKQ_0 += parallel_blocks*D) {
+    for (int k_VKQ_0 = blockIdx.y*D; k_VKQ_0 < ne11; k_VKQ_0 += gridDim.y*D) {
        // Calculate KQ tile and keep track of new maximum KQ values:

        float kqmax_new_arr[ncols];
@@ -268,29 +266,39 @@ static __global__ void flash_attn_vec_ext_f32(
        kqsum[j_VKQ] = warp_reduce_sum(kqsum[j_VKQ]);

        float dst_val = VKQ[j_VKQ];
-        if (parallel_blocks == 1) {
+        if (gridDim.y == 1) {
            dst_val /= kqsum[j_VKQ];
        }
-        const int j_dst = (ic0 + j_VKQ)*parallel_blocks + ip;
-        dst[j_dst*D*gridDim.y + D*blockIdx.y + tid] = dst_val;
+        const int j_dst = (ic0 + j_VKQ)*gridDim.y + blockIdx.y;
+        dst[j_dst*D*gridDim.z + D*blockIdx.z + tid] = dst_val;
    }

-    if (parallel_blocks != 1 && tid < ncols && (ncols <= 2 || ic0 + tid < ne01)) {
-        dst_meta[(ic0 + tid)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[tid], kqsum[tid]);
+    if (gridDim.y != 1 && tid < ncols && (ncols <= 2 || ic0 + tid < ne01)) {
+        dst_meta[((ic0 + tid)*gridDim.z + blockIdx.z) * gridDim.y + blockIdx.y] = make_float2(kqmax[tid], kqsum[tid]);
    }
 #else
+    GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
+    GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
+    GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
+    GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap); GGML_UNUSED(ne00);
+    GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03); GGML_UNUSED(ne10);
+    GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31);
+    GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03);
+    GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13); GGML_UNUSED(nb21);
+    GGML_UNUSED(nb22); GGML_UNUSED(nb23); GGML_UNUSED(ne0); GGML_UNUSED(ne1);
+    GGML_UNUSED(ne2); GGML_UNUSED(ne3);
    NO_DEVICE_CODE;
 #endif // FLASH_ATTN_AVAILABLE
 }

-template <int D, int cols_per_block, int parallel_blocks, ggml_type type_K, ggml_type type_V, bool use_logit_softcap>
+template <int D, int cols_per_block, ggml_type type_K, ggml_type type_V, bool use_logit_softcap>
 void ggml_cuda_flash_attn_ext_vec_f32_case_impl(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    constexpr int nwarps = D/WARP_SIZE;
-    fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f32<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>;
+    fattn_kernel_t fattn_kernel = flash_attn_vec_ext_f32<D, cols_per_block, type_K, type_V, use_logit_softcap>;
    constexpr bool need_f16_K = D != 128;
    constexpr bool need_f16_V = D != 128 && D != 64;
    constexpr size_t nbytes_shared = 0;
-    launch_fattn<D, cols_per_block, 1, parallel_blocks, -1>(ctx, dst, fattn_kernel, nwarps, nbytes_shared, need_f16_K, need_f16_V);
+    launch_fattn<D, cols_per_block, 1, -1>(ctx, dst, fattn_kernel, nwarps, nbytes_shared, D, need_f16_K, need_f16_V, false);
 }

 template <int D, ggml_type type_K, ggml_type type_V>
@@ -307,65 +315,48 @@ void ggml_cuda_flash_attn_ext_vec_f32_case(ggml_backend_cuda_context & ctx, ggml
    memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));

    if (Q->ne[1] == 1) {
-        constexpr int cols_per_block  = 1;
-        constexpr int parallel_blocks = 4;
+        constexpr int cols_per_block = 1;
        if (logit_softcap == 0.0f) {
            constexpr bool use_logit_softcap = false;
-            ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
+            ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
        } else {
            constexpr bool use_logit_softcap = true;
-            ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
+            ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
        }
        return;
    }

    if (Q->ne[1] == 2) {
-        constexpr int cols_per_block  = 2;
-        constexpr int parallel_blocks = 4;
+        constexpr int cols_per_block = 2;
        if (logit_softcap == 0.0f) {
            constexpr bool use_logit_softcap = false;
-            ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
+            ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
        } else {
            constexpr bool use_logit_softcap = true;
-            ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
+            ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
        }
        return;
    }

    if (Q->ne[1] <= 4) {
-        constexpr int cols_per_block  = 4;
-        constexpr int parallel_blocks = 4;
+        constexpr int cols_per_block = 4;
        if (logit_softcap == 0.0f) {
            constexpr bool use_logit_softcap = false;
-            ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
+            ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
        } else {
            constexpr bool use_logit_softcap = true;
-            ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
+            ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
        }
        return;
    }

-    if (Q->ne[1] <= 8) {
-        constexpr int cols_per_block  = 8;
-        constexpr int parallel_blocks = 4;
-        if (logit_softcap == 0.0f) {
-            constexpr bool use_logit_softcap = false;
-            ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
-        } else {
-            constexpr bool use_logit_softcap = true;
-            ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
-        }
-        return;
-    }
-
-    constexpr int cols_per_block  = 8;
-    constexpr int parallel_blocks = 1;
+    constexpr int cols_per_block = 8;
    if (logit_softcap == 0.0f) {
        constexpr bool use_logit_softcap = false;
-        ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
+        ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
    } else {
        constexpr bool use_logit_softcap = true;
-        ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, parallel_blocks, type_K, type_V, use_logit_softcap>(ctx, dst);
+        ggml_cuda_flash_attn_ext_vec_f32_case_impl<D, cols_per_block, type_K, type_V, use_logit_softcap>(ctx, dst);
    }
 }

--- a/ggml/src/ggml-cuda/fattn-wmma-f16.cu
+++ b/ggml/src/ggml-cuda/fattn-wmma-f16.cu
@@ -18,7 +18,7 @@ namespace wmma = rocwmma;
 #endif // FP16_MMA_AVAILABLE

 // D == head size, VKQ_stride == num VKQ rows calculated in parallel:
-template<int D, int ncols, int nwarps, int VKQ_stride, int parallel_blocks, typename KQ_acc_t, bool use_logit_softcap>
+template<int D, int ncols, int nwarps, int VKQ_stride, typename KQ_acc_t, bool use_logit_softcap>
 __launch_bounds__(nwarps*ggml_cuda_get_physical_warp_size(), 1)
 static __global__ void flash_attn_ext_f16(
        const char * __restrict__ Q,
@@ -67,8 +67,7 @@ static __global__ void flash_attn_ext_f16(

    constexpr int warp_size = ggml_cuda_get_physical_warp_size();

-    const int ic0 = ncols*(blockIdx.x / parallel_blocks); // Index of the first Q/QKV column to work on.
-    const int ip  =        blockIdx.x % parallel_blocks;  // Index in group of blocks running for the same column in parallel.
+    const int ic0 = ncols*blockIdx.x; // Index of the first Q/QKV column to work on.

    static_assert(D <= FATTN_KQ_STRIDE, "D must be <= FATTN_KQ_STRIDE.");
    static_assert(ncols == 8 || ncols % 16 == 0, "ncols must be 8 or a multiple of 16.");
@@ -91,16 +90,16 @@ static __global__ void flash_attn_ext_f16(
    constexpr int kqar = sizeof(KQ_acc_t)/sizeof(half);

    const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
-    const float * Q_f   = (const float *) (Q + nb02* blockIdx.y              + nb01*ic0);
-    const half  * K_h   = (const half  *) (K + nb12*(blockIdx.y / gqa_ratio));
-    const half  * V_h   = (const half  *) (V + nb12*(blockIdx.y / gqa_ratio)); // K and V have same shape
+    const float * Q_f   = (const float *) (Q + nb02* blockIdx.z              + nb01*ic0);
+    const half  * K_h   = (const half  *) (K + nb12*(blockIdx.z / gqa_ratio));
+    const half  * V_h   = (const half  *) (V + nb12*(blockIdx.z / gqa_ratio)); // K and V have same shape
    const half  * maskh = (const half  *)  mask + (nb31/sizeof(half))* ic0;
    const half2 * mask2 = (const half2 *)  mask + (nb31/sizeof(half))*(ic0/2);

    const int stride_Q  = nb01 / sizeof(float);
    const int stride_KV = nb11 / sizeof(half);

-    const float slopef = get_alibi_slope(max_bias, blockIdx.y, n_head_log2, m0, m1);
+    const float slopef = get_alibi_slope(max_bias, blockIdx.z, n_head_log2, m0, m1);
    const half  slopeh = __float2half(slopef);
    const half2 slope2 = make_half2(slopef, slopef);

@@ -176,7 +175,7 @@ static __global__ void flash_attn_ext_f16(
    __syncthreads();

    // Iterate over ne11 == previous tokens:
-    for (int k_VKQ_0 = ip*FATTN_KQ_STRIDE; k_VKQ_0 < ne11; k_VKQ_0 += parallel_blocks*FATTN_KQ_STRIDE) {
+    for (int k_VKQ_0 = blockIdx.y*FATTN_KQ_STRIDE; k_VKQ_0 < ne11; k_VKQ_0 += gridDim.y*FATTN_KQ_STRIDE) {
        // Calculate tile of KQ:
 #pragma unroll
        for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE; i_KQ_0 += KQ_stride_tc) {
@@ -395,7 +394,7 @@ static __global__ void flash_attn_ext_f16(
        if (ic0 + j_VKQ >= ne01) {
            return;
        }
-        const int j_dst = (ic0 + j_VKQ)*parallel_blocks + ip;
+        const int j_dst = (ic0 + j_VKQ)*gridDim.y + blockIdx.y;

        float KQ_rowsum_j;
        if (std::is_same<KQ_acc_t, float>::value) {
@@ -411,13 +410,13 @@ static __global__ void flash_attn_ext_f16(
                break;
            }
            float dst_val = VKQ[j_VKQ*D_padded + i];
-            if (parallel_blocks == 1) {
+            if (gridDim.y == 1) {
                dst_val /= KQ_rowsum_j;
            }
-            dst[j_dst*gridDim.y*D + blockIdx.y*D + i] = dst_val;
+            dst[j_dst*gridDim.z*D + blockIdx.z*D + i] = dst_val;
        }

-        if (parallel_blocks == 1 || threadIdx.x != 0) {
+        if (gridDim.y == 1 || threadIdx.x != 0) {
            continue;
        }

@@ -428,10 +427,20 @@ static __global__ void flash_attn_ext_f16(
            dst_meta_val.x = __low2float(KQ_max_h2[j0/nwarps]);
        }
        dst_meta_val.y = KQ_rowsum_j;
-        dst_meta[(ic0 + j_VKQ)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = dst_meta_val;
+        dst_meta[((ic0 + j_VKQ)*gridDim.z + blockIdx.z) * gridDim.y + blockIdx.y] = dst_meta_val;
    }
 #else
-   NO_DEVICE_CODE;
+    GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
+    GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
+    GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
+    GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
+    GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03);
+    GGML_UNUSED(ne10); GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13);
+    GGML_UNUSED(ne31); GGML_UNUSED(nb31); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
+    GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13);
+    GGML_UNUSED(nb21); GGML_UNUSED(nb22); GGML_UNUSED(nb23);
+    GGML_UNUSED(ne0); GGML_UNUSED(ne1); GGML_UNUSED(ne2); GGML_UNUSED(ne3);
+    NO_DEVICE_CODE;
 #endif // defined(FLASH_ATTN_AVAILABLE) && (__CUDA_ARCH__ == GGML_CUDA_CC_VOLTA || (defined(GGML_HIP_ROCWMMA_FATTN) && defined(FP16_MMA_AVAILABLE)))
 }

@@ -462,60 +471,26 @@ static_assert(get_VKQ_stride( 80, 4, 16) ==  16, "Test failed.");
 template <int D, int cols_per_block, typename KQ_acc_t>
 void ggml_cuda_flash_attn_ext_wmma_f16_case(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * KQV = dst;
-    const ggml_tensor * Q   = dst->src[0];

    constexpr int nwarps = 4;

    constexpr int frag_m = cols_per_block == 8 && D % 32 == 0 ? 32 : 16;
-    const int blocks_num_pb1 = ((Q->ne[1] + cols_per_block - 1) / cols_per_block)*Q->ne[2]*Q->ne[3];
-    const int nsm = ggml_cuda_info().devices[ggml_cuda_get_device()].nsm;
    const int warp_size = ggml_cuda_info().devices[ggml_cuda_get_device()].warp_size;

    float logit_softcap;
    memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));

-    if (4*blocks_num_pb1 < 2*nsm) {
-        constexpr int parallel_blocks = 4;
-        fattn_kernel_t fattn_kernel;
-        if (logit_softcap == 0.0f) {
-            constexpr bool use_logit_softcap = false;
-            fattn_kernel = flash_attn_ext_f16<
-                D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t, use_logit_softcap>;
-        } else {
-            constexpr bool use_logit_softcap = true;
-            fattn_kernel = flash_attn_ext_f16<
-                D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t, use_logit_softcap>;
-        }
-        launch_fattn<D, cols_per_block, 1, parallel_blocks, -1>(ctx, dst, fattn_kernel, nwarps, 0, true, true, warp_size);
-        return;
-    }
-    if (2*blocks_num_pb1 < 2*nsm) {
-        constexpr int parallel_blocks = 2;
-        fattn_kernel_t fattn_kernel;
-        if (logit_softcap == 0.0f) {
-            constexpr bool use_logit_softcap = false;
-            fattn_kernel = flash_attn_ext_f16<
-                D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t, use_logit_softcap>;
-        } else {
-            constexpr bool use_logit_softcap = true;
-            fattn_kernel = flash_attn_ext_f16<
-                D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t, use_logit_softcap>;
-        }
-        launch_fattn<D, cols_per_block, 1, parallel_blocks, -1>(ctx, dst, fattn_kernel, nwarps, 0, true, true, warp_size);
-        return;
-    }
-    constexpr int parallel_blocks = 1;
    fattn_kernel_t fattn_kernel;
    if (logit_softcap == 0.0f) {
        constexpr bool use_logit_softcap = false;
        fattn_kernel = flash_attn_ext_f16<
-            D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t, use_logit_softcap>;
+            D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), KQ_acc_t, use_logit_softcap>;
    } else {
        constexpr bool use_logit_softcap = true;
        fattn_kernel = flash_attn_ext_f16<
-            D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t, use_logit_softcap>;
+            D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), KQ_acc_t, use_logit_softcap>;
    }
-    launch_fattn<D, cols_per_block, 1, parallel_blocks, -1>(ctx, dst, fattn_kernel, nwarps, 0, true, true, warp_size);
+    launch_fattn<D, cols_per_block, 1, -1>(ctx, dst, fattn_kernel, nwarps, 0, FATTN_KQ_STRIDE, true, true, false, warp_size);
 }

 void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
--- a/ggml/src/ggml-cuda/fattn.cu
+++ b/ggml/src/ggml-cuda/fattn.cu
@@ -253,7 +253,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
    const int warp_size = ggml_cuda_info().devices[ggml_cuda_get_device()].warp_size;
    const enum ggml_prec prec = ggml_flash_attn_ext_get_prec(KQV);

-    if (cc >= GGML_CUDA_CC_OFFSET_AMD) {
+    if (GGML_CUDA_CC_IS_AMD(cc)) {
 #if defined(GGML_HIP_ROCWMMA_FATTN)
        if (fp16_mma_available(cc)) {
            ggml_cuda_flash_attn_ext_wmma_f16(ctx, dst);
@@ -281,13 +281,13 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst

    if (!fp16_mma_available(cc)) {
        if (prec == GGML_PREC_DEFAULT) {
-            if (Q->ne[1] <= 8) {
+            if (Q->ne[1] <= 8 || Q->ne[0] == 256) {
                ggml_cuda_flash_attn_ext_vec_f16(ctx, dst);
            } else {
                ggml_cuda_flash_attn_ext_tile_f16(ctx, dst);
            }
        } else {
-            if (Q->ne[1] <= 8) {
+            if (Q->ne[1] <= 8 || Q->ne[0] == 256) {
                ggml_cuda_flash_attn_ext_vec_f32(ctx, dst);
            } else {
                ggml_cuda_flash_attn_ext_tile_f32(ctx, dst);
@@ -296,17 +296,17 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
        return;
    }

-    const int gqa_ratio = Q->ne[2] / K->ne[2];
-    const bool mma_fast_for_bs1 = fp16_mma_available(cc) && gqa_ratio % 2 == 0 &&
-        K->type == GGML_TYPE_F16 && V->type == GGML_TYPE_F16 && mask;
-    if (Q->ne[1] == 1 && Q->ne[0] % (2*warp_size) == 0 && !mma_fast_for_bs1) {
+    const bool gqa_opt_applies = ((Q->ne[2] / K->ne[2]) % 2 == 0) && mask; // The mma-based kernels have GQA-specific optimizations
+    const bool mma_needs_data_conversion = K->type != GGML_TYPE_F16 || V->type != GGML_TYPE_F16;
+    const bool mma_faster_for_bs1 = new_mma_available(cc) && gqa_opt_applies && cc < GGML_CUDA_CC_ADA_LOVELACE && !mma_needs_data_conversion;
+    const bool can_use_vector_kernel = (Q->ne[0] % (2*warp_size) == 0) && (prec == GGML_PREC_DEFAULT || Q->ne[0] <= 128);
+    if (Q->ne[1] == 1 && can_use_vector_kernel && !mma_faster_for_bs1) {
        if (prec == GGML_PREC_DEFAULT) {
            ggml_cuda_flash_attn_ext_vec_f16(ctx, dst);
-            return;
-        } else if(Q->ne[0] <= 128) {
+        } else {
            ggml_cuda_flash_attn_ext_vec_f32(ctx, dst);
-            return;
        }
+        return;
    }

    // The MMA implementation needs Turing or newer, use the old WMMA code for Volta:
--- a/Show More
+++ b/Show More