mirror of https://github.com/ggerganov/llama.cpp.git (synced 2026-02-12 14:03:20 +02:00)

Compare commits: b5061 ... xsn/privat (36 commits)
| Author | SHA1 | Date |
|---|---|---|
| | 7a3c178d78 | |
| | dc4bb64290 | |
| | eab5606d7b | |
| | de788e071b | |
| | 624a683c6f | |
| | 116b9a1662 | |
| | eaffba0f2e | |
| | 8e7714fa77 | |
| | a363251fac | |
| | ba79369615 | |
| | 07d84fa3c2 | |
| | 32940369d3 | |
| | 5e6a6d4e1c | |
| | bfdddbc150 | |
| | 54566ad95d | |
| | 04f8641815 | |
| | c3dd79007b | |
| | 65f0184517 | |
| | 9fb2d81eab | |
| | 47086fa82d | |
| | 4aabf4e8f4 | |
| | 86973cb14a | |
| | 17f954c8e2 | |
| | 46596caf6d | |
| | 1d6ba97789 | |
| | 1170135dfb | |
| | 40989f4116 | |
| | 9e75c49d35 | |
| | f0ffd81130 | |
| | a1b1dea33b | |
| | 4bf7ca3943 | |
| | aed4a8e980 | |
| | 85ef80cbe9 | |
| | 17d3658b5f | |
| | f2e59a8eb9 | |
| | 4ed4fe75ed | |
`.github/workflows/build-linux-cross.yml` (vendored, 121 changes)
@@ -1,121 +0,0 @@
name: Build on Linux using cross-compiler
on:
  workflow_dispatch:
  workflow_call:

jobs:
  ubuntu-latest-riscv64-cpu-cross:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4
      - name: Setup Riscv
        run: |
          sudo dpkg --add-architecture riscv64
          sudo sed -i 's|http://azure.archive.ubuntu.com/ubuntu|http://ports.ubuntu.com/ubuntu-ports|g' \
              /etc/apt/sources.list /etc/apt/apt-mirrors.txt
          sudo apt-get clean
          sudo apt-get update
          sudo apt-get install -y --no-install-recommends \
              build-essential \
              gcc-14-riscv64-linux-gnu \
              g++-14-riscv64-linux-gnu

      - name: Build
        run: |
          cmake -B build -DCMAKE_BUILD_TYPE=Release \
              -DGGML_OPENMP=OFF \
              -DLLAMA_BUILD_EXAMPLES=ON \
              -DLLAMA_BUILD_TESTS=OFF \
              -DCMAKE_SYSTEM_NAME=Linux \
              -DCMAKE_SYSTEM_PROCESSOR=riscv64 \
              -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
              -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
              -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
              -DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \
              -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
              -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
              -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH

          cmake --build build --config Release -j $(nproc)

  ubuntu-latest-riscv64-vulkan-cross:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Setup Riscv
        run: |
          sudo dpkg --add-architecture riscv64
          sudo sed -i 's|http://azure.archive.ubuntu.com/ubuntu|http://ports.ubuntu.com/ubuntu-ports|g' \
              /etc/apt/sources.list /etc/apt/apt-mirrors.txt
          sudo apt-get clean
          sudo apt-get update
          sudo apt-get install -y --no-install-recommends \
              build-essential \
              glslc \
              gcc-14-riscv64-linux-gnu \
              g++-14-riscv64-linux-gnu \
              libvulkan-dev:riscv64

      - name: Build
        run: |
          cmake -B build -DCMAKE_BUILD_TYPE=Release \
              -DGGML_VULKAN=ON \
              -DGGML_OPENMP=OFF \
              -DLLAMA_BUILD_EXAMPLES=ON \
              -DLLAMA_BUILD_TESTS=OFF \
              -DCMAKE_SYSTEM_NAME=Linux \
              -DCMAKE_SYSTEM_PROCESSOR=riscv64 \
              -DCMAKE_C_COMPILER=riscv64-linux-gnu-gcc-14 \
              -DCMAKE_CXX_COMPILER=riscv64-linux-gnu-g++-14 \
              -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
              -DCMAKE_FIND_ROOT_PATH=/usr/lib/riscv64-linux-gnu \
              -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
              -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
              -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH

          cmake --build build --config Release -j $(nproc)

  ubuntu-latest-arm64-vulkan-cross:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Setup Arm64
        run: |
          sudo dpkg --add-architecture arm64
          sudo sed -i 's|http://azure.archive.ubuntu.com/ubuntu|http://ports.ubuntu.com/ubuntu-ports|g' \
              /etc/apt/sources.list /etc/apt/apt-mirrors.txt
          sudo apt-get clean
          sudo apt-get update
          sudo apt-get install -y --no-install-recommends \
              build-essential \
              glslc \
              crossbuild-essential-arm64 \
              libvulkan-dev:arm64

      - name: Build
        run: |
          cmake -B build -DCMAKE_BUILD_TYPE=Release \
              -DGGML_VULKAN=ON \
              -DGGML_OPENMP=OFF \
              -DLLAMA_BUILD_EXAMPLES=ON \
              -DLLAMA_BUILD_TESTS=OFF \
              -DCMAKE_SYSTEM_NAME=Linux \
              -DCMAKE_SYSTEM_PROCESSOR=aarch64 \
              -DCMAKE_C_COMPILER=aarch64-linux-gnu-gcc \
              -DCMAKE_CXX_COMPILER=aarch64-linux-gnu-g++ \
              -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
              -DCMAKE_FIND_ROOT_PATH=/usr/lib/aarch64-linux-gnu \
              -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
              -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
              -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH

          cmake --build build --config Release -j $(nproc)
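The jobs above only verify that the cross build compiles; nothing in the workflow executes the riscv64 binaries. A minimal local smoke test is sketched below, assuming `qemu-user-static` and the same gcc-14 riscv64 toolchain are installed; the `qemu-riscv64-static` invocation and the `build/bin/llama-cli` path are illustrative and not part of the workflow.

```bash
# Hypothetical follow-up to the riscv64 CPU job above: run the cross-compiled
# binary under user-mode QEMU instead of real hardware.
sudo apt-get install -y qemu-user-static

# -L points QEMU at the riscv64 sysroot so the dynamic loader and libstdc++ resolve.
qemu-riscv64-static -L /usr/riscv64-linux-gnu ./build/bin/llama-cli --help
```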
`.github/workflows/build.yml` (vendored, 36 changes)
@@ -10,7 +10,7 @@ on:
  push:
    branches:
      - master
    paths: ['.github/workflows/build.yml', '.github/workflows/build-linux-cross.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp']
    paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp']
  pull_request:
    types: [opened, synchronize, reopened]
    paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal', '**/*.comp']

@@ -606,9 +606,6 @@ jobs:
            -DGGML_SYCL_F16=ON
          cmake --build build --config Release -j $(nproc)

  build-linux-cross:
    uses: ./.github/workflows/build-linux-cross.yml

  macOS-latest-cmake-ios:
    runs-on: macos-latest

@@ -679,35 +676,6 @@ jobs:
            -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO

  macOS-latest-cmake-visionos:
    runs-on: macos-latest

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4

      - name: Dependencies
        id: depends
        continue-on-error: true
        run: |
          brew update

      - name: Build
        id: cmake_build
        run: |
          sysctl -a
          cmake -B build -G Xcode \
            -DGGML_METAL_USE_BF16=ON \
            -DGGML_METAL_EMBED_LIBRARY=ON \
            -DLLAMA_BUILD_EXAMPLES=OFF \
            -DLLAMA_BUILD_TESTS=OFF \
            -DLLAMA_BUILD_SERVER=OFF \
            -DCMAKE_SYSTEM_NAME=visionOS \
            -DCMAKE_OSX_DEPLOYMENT_TARGET=1.0 \
            -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
          cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO

  macOS-latest-swift:
    runs-on: macos-latest

@@ -806,7 +774,7 @@ jobs:
    env:
      OPENBLAS_VERSION: 0.3.23
      SDE_VERSION: 9.33.0-2024-01-07
      VULKAN_VERSION: 1.4.309.0
      VULKAN_VERSION: 1.4.304.1

    strategy:
      matrix:
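Besides being invoked from `build.yml` through the `build-linux-cross` job above, the cross-compile workflow also declares `workflow_dispatch`, so it can be started by hand. A hedged sketch using the GitHub CLI, assuming `gh` is installed and authenticated against a checkout that contains the workflow file:

```bash
# Manually dispatch the cross-compile workflow on a chosen ref.
# The ref name is an example; any branch containing the workflow file works.
gh workflow run build-linux-cross.yml --ref master
```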
`README.md` (31 changes)

@@ -112,8 +112,6 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
- [x] [RWKV-6](https://github.com/BlinkDL/RWKV-LM)
- [x] [QRWKV-6](https://huggingface.co/recursal/QRWKV6-32B-Instruct-Preview-v0.1)
- [x] [GigaChat-20B-A3B](https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct)
- [X] [Trillion-7B-preview](https://huggingface.co/trillionlabs/Trillion-7B-preview)
- [x] [Ling models](https://huggingface.co/collections/inclusionAI/ling-67c51c85b34a7ea0aba94c32)

#### Multimodal

@@ -530,35 +528,6 @@ If your issue is with model generation quality, then please at least scan the fo
- [Aligning language models to follow instructions](https://openai.com/research/instruction-following)
- [Training language models to follow instructions with human feedback](https://arxiv.org/abs/2203.02155)

## XCFramework
The XCFramework is a precompiled version of the library for iOS, visionOS, tvOS,
and macOS. It can be used in Swift projects without the need to compile the
library from source. For example:
```swift
// swift-tools-version: 5.10
// The swift-tools-version declares the minimum version of Swift required to build this package.

import PackageDescription

let package = Package(
    name: "MyLlamaPackage",
    targets: [
        .executableTarget(
            name: "MyLlamaPackage",
            dependencies: [
                "LlamaFramework"
            ]),
        .binaryTarget(
            name: "LlamaFramework",
            url: "https://github.com/ggml-org/llama.cpp/releases/download/b5046/llama-b5046-xcframework.zip",
            checksum: "c19be78b5f00d8d29a25da41042cb7afa094cbf6280a225abe614b03b20029ab"
        )
    ]
)
```
The above example is using an intermediate build `b5046` of the library. This can be modified
to use a different version by changing the URL and checksum.
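As a usage note for the package manifest above: when switching to a different release, the new `checksum` value can be computed locally with the standard Swift toolchain. A small sketch (the URL simply repeats the `b5046` example already shown):

```bash
# Download the XCFramework for the release you want and print the SwiftPM checksum
# expected by the .binaryTarget entry in Package.swift.
curl -L -O https://github.com/ggml-org/llama.cpp/releases/download/b5046/llama-b5046-xcframework.zip
swift package compute-checksum llama-b5046-xcframework.zip
```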

## Completions

Command-line completion is available for some environments.
@@ -432,8 +432,8 @@ cmake -B build-visionos -G Xcode \
    -DCMAKE_SYSTEM_NAME=visionOS \
    -DCMAKE_OSX_SYSROOT=xros \
    -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xros \
    -DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \
    -DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \
    -DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 -Du_int=unsigned\ int -Du_char=unsigned\ char -Du_short=unsigned\ short ${COMMON_C_FLAGS}" \
    -DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 -Du_int=unsigned\ int -Du_char=unsigned\ char -Du_short=unsigned\ short ${COMMON_CXX_FLAGS}" \
    -S .
cmake --build build-visionos --config Release -- -quiet

@@ -445,8 +445,8 @@ cmake -B build-visionos-sim -G Xcode \
    -DCMAKE_SYSTEM_NAME=visionOS \
    -DCMAKE_OSX_SYSROOT=xrsimulator \
    -DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xrsimulator \
    -DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \
    -DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \
    -DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 -Du_int=unsigned\ int -Du_char=unsigned\ char -Du_short=unsigned\ short ${COMMON_C_FLAGS}" \
    -DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 -Du_int=unsigned\ int -Du_char=unsigned\ char -Du_short=unsigned\ short ${COMMON_CXX_FLAGS}" \
    -S .
cmake --build build-visionos-sim --config Release -- -quiet
`ci/README.md` (39 changes)
@@ -26,43 +26,4 @@ GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
# with SYCL support
source /opt/intel/oneapi/setvars.sh
GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt

# with MUSA support
GG_BUILD_MUSA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
```

## Running MUSA CI in a Docker Container

Assuming `$PWD` is the root of the `llama.cpp` repository, follow these steps to set up and run MUSA CI in a Docker container:

### 1. Create a local directory to store cached models, configuration files and venv:

```bash
mkdir -p $HOME/llama.cpp/ci-cache
```

### 2. Create a local directory to store CI run results:

```bash
mkdir -p $HOME/llama.cpp/ci-results
```

### 3. Start a Docker container and run the CI:

```bash
docker run --privileged -it \
    -v $HOME/llama.cpp/ci-cache:/ci-cache \
    -v $HOME/llama.cpp/ci-results:/ci-results \
    -v $PWD:/ws -w /ws \
    mthreads/musa:rc3.1.1-devel-ubuntu22.04
```

Inside the container, execute the following commands:

```bash
apt update -y && apt install -y bc cmake ccache git python3.10-venv time unzip wget
git config --global --add safe.directory /ws
GG_BUILD_MUSA=1 bash ./ci/run.sh /ci-results /ci-cache
```

This setup ensures that the CI runs within an isolated Docker environment while maintaining cached files and results across runs.
`ci/run.sh` (32 changes)
@@ -16,9 +16,6 @@
# # with VULKAN support
# GG_BUILD_VULKAN=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
#
# # with MUSA support
# GG_BUILD_MUSA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
#

if [ -z "$2" ]; then
    echo "usage: $0 <output-dir> <mnt-dir>"

@@ -55,24 +52,13 @@ if [ ! -z ${GG_BUILD_SYCL} ]; then
        echo "source /opt/intel/oneapi/setvars.sh"
        exit 1
    fi
    # Use only main GPU
    export ONEAPI_DEVICE_SELECTOR="level_zero:0"
    # Enable sysman for correct memory reporting
    export ZES_ENABLE_SYSMAN=1
    # to circumvent precision issues on CPY operations
    export SYCL_PROGRAM_COMPILE_OPTIONS="-cl-fp32-correctly-rounded-divide-sqrt"

    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_SYCL=1 -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON"
fi

if [ ! -z ${GG_BUILD_VULKAN} ]; then
    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_VULKAN=1"
fi

if [ ! -z ${GG_BUILD_MUSA} ]; then
    # Use qy1 by default (MTT S80)
    MUSA_ARCH=${MUSA_ARCH:-21}
    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_MUSA=ON -DMUSA_ARCHITECTURES=${MUSA_ARCH}"
fi
## helpers

# download a file if it does not exist or if it is outdated
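A usage note on the MUSA branch above: `MUSA_ARCH` is read from the environment with a default of 21, so a different MTT GPU architecture can be selected when invoking the CI script. A small sketch (the value 22 is purely illustrative):

```bash
# Override the default MUSA architecture (21 = qy1 / MTT S80) for this CI run.
MUSA_ARCH=22 GG_BUILD_MUSA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
```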
@@ -822,7 +808,7 @@ export LLAMA_LOG_PREFIX=1
export LLAMA_LOG_TIMESTAMPS=1

if [ -z ${GG_BUILD_LOW_PERF} ]; then
    # Create symlink: ./llama.cpp/models-mnt -> $MNT/models
    # Create symlink: ./llama.cpp/models-mnt -> $MNT/models/models-mnt
    rm -rf ${SRC}/models-mnt
    mnt_models=${MNT}/models
    mkdir -p ${mnt_models}

@@ -840,10 +826,8 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
fi

ret=0
if [ -z ${GG_BUILD_SYCL} ]; then
    # SYCL build breaks with debug build flags
    test $ret -eq 0 && gg_run ctest_debug
fi

test $ret -eq 0 && gg_run ctest_debug
test $ret -eq 0 && gg_run ctest_release

if [ -z ${GG_BUILD_LOW_PERF} ]; then

@@ -851,9 +835,7 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
    test $ret -eq 0 && gg_run rerank_tiny

    if [ -z ${GG_BUILD_CLOUD} ] || [ ${GG_BUILD_EXTRA_TESTS_0} ]; then
        if [ -z ${GG_BUILD_SYCL} ]; then
            test $ret -eq 0 && gg_run test_scripts_debug
        fi
        test $ret -eq 0 && gg_run test_scripts_debug
        test $ret -eq 0 && gg_run test_scripts_release
    fi

@@ -864,9 +846,7 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
        test $ret -eq 0 && gg_run pythia_2_8b
        #test $ret -eq 0 && gg_run open_llama_7b_v2
    fi
    if [ -z ${GG_BUILD_SYCL} ]; then
        test $ret -eq 0 && gg_run ctest_with_model_debug
    fi
    test $ret -eq 0 && gg_run ctest_with_model_debug
    test $ret -eq 0 && gg_run ctest_with_model_release
fi
fi
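The `GG_BUILD_LOW_PERF` checks above gate the models-mnt setup and the heavier model-based test stages, so setting the variable keeps a run limited to the plain ctest stages. A minimal sketch of such a reduced run:

```bash
# Low-performance mode: skips the stages guarded by
# 'if [ -z ${GG_BUILD_LOW_PERF} ]' in ci/run.sh.
GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
```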
@@ -114,8 +114,8 @@ if (LLAMA_LLGUIDANCE)

    ExternalProject_Add(llguidance_ext
        GIT_REPOSITORY https://github.com/guidance-ai/llguidance
        # v0.7.10:
        GIT_TAG 0309d2a6bf40abda35344a362edc71e06d5009f8
        # v0.6.12:
        GIT_TAG ced1c9023d47ec194fa977932d35ce65c2ebfc09
        PREFIX ${CMAKE_BINARY_DIR}/llguidance
        SOURCE_DIR ${LLGUIDANCE_SRC}
        BUILD_IN_SOURCE TRUE
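The pinned `GIT_TAG` above is what `ExternalProject_Add` checks out when llguidance support is enabled; the feature is off unless requested at configure time. A minimal sketch of turning it on in a local build (the flag name is taken from the `if (LLAMA_LLGUIDANCE)` guard above):

```bash
# Configure with llguidance support; CMake fetches and builds the pinned tag automatically.
cmake -B build -DLLAMA_LLGUIDANCE=ON
cmake --build build --config Release -j $(nproc)
```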
`common/arg.cpp` (722 changes)
@@ -1,24 +1,12 @@
|
||||
#include "gguf.h" // for reading GGUF splits
|
||||
#include "arg.h"
|
||||
|
||||
#include "common.h"
|
||||
#include "log.h"
|
||||
#include "sampling.h"
|
||||
#include "chat.h"
|
||||
|
||||
// fix problem with std::min and std::max
|
||||
#if defined(_WIN32)
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#ifndef NOMINMAX
|
||||
# define NOMINMAX
|
||||
#endif
|
||||
#include <windows.h>
|
||||
#endif
|
||||
|
||||
#include <algorithm>
|
||||
#include <climits>
|
||||
#include <cstdarg>
|
||||
#include <filesystem>
|
||||
#include <fstream>
|
||||
#include <regex>
|
||||
#include <set>
|
||||
@@ -26,14 +14,6 @@
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
|
||||
//#define LLAMA_USE_CURL
|
||||
|
||||
#if defined(LLAMA_USE_CURL)
|
||||
#include <curl/curl.h>
|
||||
#include <curl/easy.h>
|
||||
#include <future>
|
||||
#endif
|
||||
|
||||
#include "json-schema-to-grammar.h"
|
||||
|
||||
using json = nlohmann::ordered_json;
|
||||
@@ -145,553 +125,47 @@ std::string common_arg::to_string() {
|
||||
return ss.str();
|
||||
}
|
||||
|
||||
//
|
||||
// downloader
|
||||
//
|
||||
|
||||
struct common_hf_file_res {
|
||||
std::string repo; // repo name with ":tag" removed
|
||||
std::string ggufFile;
|
||||
std::string mmprojFile;
|
||||
};
|
||||
|
||||
#ifdef LLAMA_USE_CURL
|
||||
|
||||
#ifdef __linux__
|
||||
#include <linux/limits.h>
|
||||
#elif defined(_WIN32)
|
||||
# if !defined(PATH_MAX)
|
||||
# define PATH_MAX MAX_PATH
|
||||
# endif
|
||||
#else
|
||||
#include <sys/syslimits.h>
|
||||
#endif
|
||||
#define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
|
||||
|
||||
//
|
||||
// CURL utils
|
||||
//
|
||||
|
||||
using curl_ptr = std::unique_ptr<CURL, decltype(&curl_easy_cleanup)>;
|
||||
|
||||
// cannot use unique_ptr for curl_slist, because we cannot update without destroying the old one
|
||||
struct curl_slist_ptr {
|
||||
struct curl_slist * ptr = nullptr;
|
||||
~curl_slist_ptr() {
|
||||
if (ptr) {
|
||||
curl_slist_free_all(ptr);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
#define CURL_MAX_RETRY 3
|
||||
#define CURL_RETRY_DELAY_SECONDS 2
|
||||
|
||||
static bool curl_perform_with_retry(const std::string & url, CURL * curl, int max_attempts, int retry_delay_seconds) {
|
||||
int remaining_attempts = max_attempts;
|
||||
|
||||
while (remaining_attempts > 0) {
|
||||
LOG_INF("%s: Trying to download from %s (attempt %d of %d)...\n", __func__ , url.c_str(), max_attempts - remaining_attempts + 1, max_attempts);
|
||||
|
||||
CURLcode res = curl_easy_perform(curl);
|
||||
if (res == CURLE_OK) {
|
||||
return true;
|
||||
}
|
||||
|
||||
int exponential_backoff_delay = std::pow(retry_delay_seconds, max_attempts - remaining_attempts) * 1000;
|
||||
LOG_WRN("%s: curl_easy_perform() failed: %s, retrying after %d milliseconds...\n", __func__, curl_easy_strerror(res), exponential_backoff_delay);
|
||||
|
||||
remaining_attempts--;
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay));
|
||||
}
|
||||
|
||||
LOG_ERR("%s: curl_easy_perform() failed after %d attempts\n", __func__, max_attempts);
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
// download one single file from remote URL to local path
|
||||
static bool common_download_file_single(const std::string & url, const std::string & path, const std::string & bearer_token) {
|
||||
// Initialize libcurl
|
||||
curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
|
||||
curl_slist_ptr http_headers;
|
||||
if (!curl) {
|
||||
LOG_ERR("%s: error initializing libcurl\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
bool force_download = false;
|
||||
|
||||
// Set the URL, allow to follow http redirection
|
||||
curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
|
||||
curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
|
||||
|
||||
// Check if hf-token or bearer-token was specified
|
||||
if (!bearer_token.empty()) {
|
||||
std::string auth_header = "Authorization: Bearer " + bearer_token;
|
||||
http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
|
||||
curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
|
||||
}
|
||||
|
||||
#if defined(_WIN32)
|
||||
// CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
|
||||
// operating system. Currently implemented under MS-Windows.
|
||||
curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
|
||||
#endif
|
||||
|
||||
// Check if the file already exists locally
|
||||
auto file_exists = std::filesystem::exists(path);
|
||||
|
||||
// If the file exists, check its JSON metadata companion file.
|
||||
std::string metadata_path = path + ".json";
|
||||
nlohmann::json metadata;
|
||||
std::string etag;
|
||||
std::string last_modified;
|
||||
|
||||
if (file_exists) {
|
||||
// Try and read the JSON metadata file (note: stream autoclosed upon exiting this block).
|
||||
std::ifstream metadata_in(metadata_path);
|
||||
if (metadata_in.good()) {
|
||||
try {
|
||||
metadata_in >> metadata;
|
||||
LOG_INF("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
|
||||
if (metadata.contains("url") && metadata.at("url").is_string()) {
|
||||
auto previous_url = metadata.at("url").get<std::string>();
|
||||
if (previous_url != url) {
|
||||
LOG_ERR("%s: Model URL mismatch: %s != %s\n", __func__, url.c_str(), previous_url.c_str());
|
||||
return false;
|
||||
}
|
||||
}
|
||||
if (metadata.contains("etag") && metadata.at("etag").is_string()) {
|
||||
etag = metadata.at("etag");
|
||||
}
|
||||
if (metadata.contains("lastModified") && metadata.at("lastModified").is_string()) {
|
||||
last_modified = metadata.at("lastModified");
|
||||
}
|
||||
} catch (const nlohmann::json::exception & e) {
|
||||
LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
|
||||
return false;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
|
||||
}
|
||||
|
||||
// Send a HEAD request to retrieve the etag and last-modified headers
|
||||
struct common_load_model_from_url_headers {
|
||||
std::string etag;
|
||||
std::string last_modified;
|
||||
};
|
||||
|
||||
common_load_model_from_url_headers headers;
|
||||
|
||||
{
|
||||
typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
|
||||
auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
|
||||
common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;
|
||||
|
||||
static std::regex header_regex("([^:]+): (.*)\r\n");
|
||||
static std::regex etag_regex("ETag", std::regex_constants::icase);
|
||||
static std::regex last_modified_regex("Last-Modified", std::regex_constants::icase);
|
||||
|
||||
std::string header(buffer, n_items);
|
||||
std::smatch match;
|
||||
if (std::regex_match(header, match, header_regex)) {
|
||||
const std::string & key = match[1];
|
||||
const std::string & value = match[2];
|
||||
if (std::regex_match(key, match, etag_regex)) {
|
||||
headers->etag = value;
|
||||
} else if (std::regex_match(key, match, last_modified_regex)) {
|
||||
headers->last_modified = value;
|
||||
}
|
||||
}
|
||||
return n_items;
|
||||
};
|
||||
|
||||
curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 1L); // will trigger the HEAD verb
|
||||
curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); // hide head request progress
|
||||
curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
|
||||
curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
|
||||
|
||||
bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
|
||||
if (!was_perform_successful) {
|
||||
return false;
|
||||
}
|
||||
|
||||
long http_code = 0;
|
||||
curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
|
||||
if (http_code != 200) {
|
||||
// HEAD not supported, we don't know if the file has changed
|
||||
// force trigger downloading
|
||||
force_download = true;
|
||||
LOG_ERR("%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
|
||||
}
|
||||
}
|
||||
|
||||
bool should_download = !file_exists || force_download;
|
||||
if (!should_download) {
|
||||
if (!etag.empty() && etag != headers.etag) {
|
||||
LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(), headers.etag.c_str());
|
||||
should_download = true;
|
||||
} else if (!last_modified.empty() && last_modified != headers.last_modified) {
|
||||
LOG_WRN("%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__, last_modified.c_str(), headers.last_modified.c_str());
|
||||
should_download = true;
|
||||
}
|
||||
}
|
||||
if (should_download) {
|
||||
std::string path_temporary = path + ".downloadInProgress";
|
||||
if (file_exists) {
|
||||
LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
|
||||
if (remove(path.c_str()) != 0) {
|
||||
LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Set the output file
|
||||
|
||||
struct FILE_deleter {
|
||||
void operator()(FILE * f) const {
|
||||
fclose(f);
|
||||
}
|
||||
};
|
||||
|
||||
std::unique_ptr<FILE, FILE_deleter> outfile(fopen(path_temporary.c_str(), "wb"));
|
||||
if (!outfile) {
|
||||
LOG_ERR("%s: error opening local file for writing: %s\n", __func__, path.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * data, size_t size, size_t nmemb, void * fd);
|
||||
auto write_callback = [](void * data, size_t size, size_t nmemb, void * fd) -> size_t {
|
||||
return fwrite(data, size, nmemb, (FILE *)fd);
|
||||
};
|
||||
curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 0L);
|
||||
curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
|
||||
curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, outfile.get());
|
||||
|
||||
// display download progress
|
||||
curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 0L);
|
||||
|
||||
// helper function to hide password in URL
|
||||
auto llama_download_hide_password_in_url = [](const std::string & url) -> std::string {
|
||||
std::size_t protocol_pos = url.find("://");
|
||||
if (protocol_pos == std::string::npos) {
|
||||
return url; // Malformed URL
|
||||
}
|
||||
|
||||
std::size_t at_pos = url.find('@', protocol_pos + 3);
|
||||
if (at_pos == std::string::npos) {
|
||||
return url; // No password in URL
|
||||
}
|
||||
|
||||
return url.substr(0, protocol_pos + 3) + "********" + url.substr(at_pos);
|
||||
};
|
||||
|
||||
// start the download
|
||||
LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
|
||||
llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str());
|
||||
bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
|
||||
if (!was_perform_successful) {
|
||||
return false;
|
||||
}
|
||||
|
||||
long http_code = 0;
|
||||
curl_easy_getinfo (curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
|
||||
if (http_code < 200 || http_code >= 400) {
|
||||
LOG_ERR("%s: invalid http status code received: %ld\n", __func__, http_code);
|
||||
return false;
|
||||
}
|
||||
|
||||
// Causes file to be closed explicitly here before we rename it.
|
||||
outfile.reset();
|
||||
|
||||
// Write the updated JSON metadata file.
|
||||
metadata.update({
|
||||
{"url", url},
|
||||
{"etag", headers.etag},
|
||||
{"lastModified", headers.last_modified}
|
||||
});
|
||||
std::ofstream(metadata_path) << metadata.dump(4);
|
||||
LOG_INF("%s: file metadata saved: %s\n", __func__, metadata_path.c_str());
|
||||
|
||||
if (rename(path_temporary.c_str(), path.c_str()) != 0) {
|
||||
LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// download multiple files from remote URLs to local paths
|
||||
// the input is a vector of pairs <url, path>
|
||||
static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> & urls, const std::string & bearer_token) {
|
||||
// Prepare download in parallel
|
||||
std::vector<std::future<bool>> futures_download;
|
||||
for (auto const & item : urls) {
|
||||
futures_download.push_back(std::async(std::launch::async, [bearer_token](const std::pair<std::string, std::string> & it) -> bool {
|
||||
return common_download_file_single(it.first, it.second, bearer_token);
|
||||
}, item));
|
||||
}
|
||||
|
||||
// Wait for all downloads to complete
|
||||
for (auto & f : futures_download) {
|
||||
if (!f.get()) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool common_download_model(
|
||||
const common_params_model & model,
|
||||
const std::string & bearer_token) {
|
||||
// Basic validation of the model.url
|
||||
if (model.url.empty()) {
|
||||
LOG_ERR("%s: invalid model url\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!common_download_file_single(model.url, model.path, bearer_token)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// check for additional GGUFs split to download
|
||||
int n_split = 0;
|
||||
{
|
||||
struct gguf_init_params gguf_params = {
|
||||
/*.no_alloc = */ true,
|
||||
/*.ctx = */ NULL,
|
||||
};
|
||||
auto * ctx_gguf = gguf_init_from_file(model.path.c_str(), gguf_params);
|
||||
if (!ctx_gguf) {
|
||||
LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, model.path.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT);
|
||||
if (key_n_split >= 0) {
|
||||
n_split = gguf_get_val_u16(ctx_gguf, key_n_split);
|
||||
}
|
||||
|
||||
gguf_free(ctx_gguf);
|
||||
}
|
||||
|
||||
if (n_split > 1) {
|
||||
char split_prefix[PATH_MAX] = {0};
|
||||
char split_url_prefix[LLAMA_CURL_MAX_URL_LENGTH] = {0};
|
||||
|
||||
// Verify the first split file format
|
||||
// and extract split URL and PATH prefixes
|
||||
{
|
||||
if (!llama_split_prefix(split_prefix, sizeof(split_prefix), model.path.c_str(), 0, n_split)) {
|
||||
LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, model.path.c_str(), n_split);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model.url.c_str(), 0, n_split)) {
|
||||
LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model.url.c_str(), n_split);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<std::pair<std::string, std::string>> urls;
|
||||
for (int idx = 1; idx < n_split; idx++) {
|
||||
char split_path[PATH_MAX] = {0};
|
||||
llama_split_path(split_path, sizeof(split_path), split_prefix, idx, n_split);
|
||||
|
||||
char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0};
|
||||
llama_split_path(split_url, sizeof(split_url), split_url_prefix, idx, n_split);
|
||||
|
||||
if (std::string(split_path) == model.path) {
|
||||
continue; // skip the already downloaded file
|
||||
}
|
||||
|
||||
urls.push_back({split_url, split_path});
|
||||
}
|
||||
|
||||
// Download in parallel
|
||||
common_download_file_multiple(urls, bearer_token);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Allow getting the HF file from the HF repo with tag (like ollama), for example:
|
||||
* - bartowski/Llama-3.2-3B-Instruct-GGUF:q4
|
||||
* - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M
|
||||
* - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s
|
||||
* Tag is optional, default to "latest" (meaning it checks for Q4_K_M first, then Q4, then if not found, return the first GGUF file in repo)
|
||||
*
|
||||
* Return pair of <repo, file> (with "repo" already having tag removed)
|
||||
*
|
||||
* Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files.
|
||||
*/
|
||||
static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & bearer_token) {
|
||||
auto parts = string_split<std::string>(hf_repo_with_tag, ':');
|
||||
std::string tag = parts.size() > 1 ? parts.back() : "latest";
|
||||
std::string hf_repo = parts[0];
|
||||
if (string_split<std::string>(hf_repo, '/').size() != 2) {
|
||||
throw std::invalid_argument("error: invalid HF repo format, expected <user>/<model>[:quant]\n");
|
||||
}
|
||||
|
||||
// fetch model info from Hugging Face Hub API
|
||||
curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
|
||||
curl_slist_ptr http_headers;
|
||||
std::string res_str;
|
||||
std::string url = "https://huggingface.co/v2/" + hf_repo + "/manifests/" + tag;
|
||||
curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
|
||||
curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L);
|
||||
typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data);
|
||||
auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t {
|
||||
static_cast<std::string *>(data)->append((char * ) ptr, size * nmemb);
|
||||
return size * nmemb;
|
||||
};
|
||||
curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
|
||||
curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_str);
|
||||
#if defined(_WIN32)
|
||||
curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
|
||||
#endif
|
||||
if (!bearer_token.empty()) {
|
||||
std::string auth_header = "Authorization: Bearer " + bearer_token;
|
||||
http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
|
||||
}
|
||||
// Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response
|
||||
http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
|
||||
http_headers.ptr = curl_slist_append(http_headers.ptr, "Accept: application/json");
|
||||
curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
|
||||
|
||||
CURLcode res = curl_easy_perform(curl.get());
|
||||
|
||||
if (res != CURLE_OK) {
|
||||
throw std::runtime_error("error: cannot make GET request to HF API");
|
||||
}
|
||||
|
||||
long res_code;
|
||||
std::string ggufFile = "";
|
||||
std::string mmprojFile = "";
|
||||
curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code);
|
||||
if (res_code == 200) {
|
||||
// extract ggufFile.rfilename in json, using regex
|
||||
{
|
||||
std::regex pattern("\"ggufFile\"[\\s\\S]*?\"rfilename\"\\s*:\\s*\"([^\"]+)\"");
|
||||
std::smatch match;
|
||||
if (std::regex_search(res_str, match, pattern)) {
|
||||
ggufFile = match[1].str();
|
||||
}
|
||||
}
|
||||
// extract mmprojFile.rfilename in json, using regex
|
||||
{
|
||||
std::regex pattern("\"mmprojFile\"[\\s\\S]*?\"rfilename\"\\s*:\\s*\"([^\"]+)\"");
|
||||
std::smatch match;
|
||||
if (std::regex_search(res_str, match, pattern)) {
|
||||
mmprojFile = match[1].str();
|
||||
}
|
||||
}
|
||||
} else if (res_code == 401) {
|
||||
throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token");
|
||||
} else {
|
||||
throw std::runtime_error(string_format("error from HF API, response code: %ld, data: %s", res_code, res_str.c_str()));
|
||||
}
|
||||
|
||||
// check response
|
||||
if (ggufFile.empty()) {
|
||||
throw std::runtime_error("error: model does not have ggufFile");
|
||||
}
|
||||
|
||||
return { hf_repo, ggufFile, mmprojFile };
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
static bool common_download_file_single(const std::string &, const std::string &, const std::string &) {
|
||||
LOG_ERR("error: built without CURL, cannot download model from internet\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> &, const std::string &) {
|
||||
LOG_ERR("error: built without CURL, cannot download model from the internet\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
static bool common_download_model(
|
||||
const common_params_model &,
|
||||
const std::string &) {
|
||||
LOG_ERR("error: built without CURL, cannot download model from the internet\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
static struct common_hf_file_res common_get_hf_file(const std::string &, const std::string &) {
|
||||
LOG_ERR("error: built without CURL, cannot download model from the internet\n");
|
||||
return {};
|
||||
}
|
||||
|
||||
#endif // LLAMA_USE_CURL
|
||||
|
||||
//
|
||||
// utils
|
||||
//
|
||||
|
||||
static void common_params_handle_model(
|
||||
struct common_params_model & model,
|
||||
const std::string & bearer_token,
|
||||
const std::string & model_path_default,
|
||||
bool is_mmproj = false) { // TODO: move is_mmproj to an enum when we have more files?
|
||||
// handle pre-fill default model path and url based on hf_repo and hf_file
|
||||
{
|
||||
if (!model.hf_repo.empty()) {
|
||||
// short-hand to avoid specifying --hf-file -> default it to --model
|
||||
if (model.hf_file.empty()) {
|
||||
if (model.path.empty()) {
|
||||
auto auto_detected = common_get_hf_file(model.hf_repo, bearer_token);
|
||||
if (auto_detected.repo.empty() || auto_detected.ggufFile.empty()) {
|
||||
exit(1); // built without CURL, error message already printed
|
||||
}
|
||||
model.hf_repo = auto_detected.repo;
|
||||
model.hf_file = is_mmproj ? auto_detected.mmprojFile : auto_detected.ggufFile;
|
||||
} else {
|
||||
model.hf_file = model.path;
|
||||
static void common_params_handle_model_default(
|
||||
std::string & model,
|
||||
const std::string & model_url,
|
||||
std::string & hf_repo,
|
||||
std::string & hf_file,
|
||||
const std::string & hf_token,
|
||||
const std::string & model_default) {
|
||||
if (!hf_repo.empty()) {
|
||||
// short-hand to avoid specifying --hf-file -> default it to --model
|
||||
if (hf_file.empty()) {
|
||||
if (model.empty()) {
|
||||
auto auto_detected = common_get_hf_file(hf_repo, hf_token);
|
||||
if (auto_detected.first.empty() || auto_detected.second.empty()) {
|
||||
exit(1); // built without CURL, error message already printed
|
||||
}
|
||||
hf_repo = auto_detected.first;
|
||||
hf_file = auto_detected.second;
|
||||
} else {
|
||||
hf_file = model;
|
||||
}
|
||||
|
||||
std::string hf_endpoint = "https://huggingface.co/";
|
||||
const char * hf_endpoint_env = getenv("HF_ENDPOINT");
|
||||
if (hf_endpoint_env) {
|
||||
hf_endpoint = hf_endpoint_env;
|
||||
if (hf_endpoint.back() != '/') hf_endpoint += '/';
|
||||
}
|
||||
model.url = hf_endpoint + model.hf_repo + "/resolve/main/" + model.hf_file;
|
||||
// make sure model path is present (for caching purposes)
|
||||
if (model.path.empty()) {
|
||||
// this is to avoid different repo having same file name, or same file name in different subdirs
|
||||
std::string filename = model.hf_repo + "_" + model.hf_file;
|
||||
// to make sure we don't have any slashes in the filename
|
||||
string_replace_all(filename, "/", "_");
|
||||
model.path = fs_get_cache_file(filename);
|
||||
}
|
||||
|
||||
} else if (!model.url.empty()) {
|
||||
if (model.path.empty()) {
|
||||
auto f = string_split<std::string>(model.url, '#').front();
|
||||
f = string_split<std::string>(f, '?').front();
|
||||
model.path = fs_get_cache_file(string_split<std::string>(f, '/').back());
|
||||
}
|
||||
|
||||
} else if (model.path.empty()) {
|
||||
model.path = model_path_default;
|
||||
}
|
||||
}
|
||||
|
||||
// then, download it if needed
|
||||
if (!model.url.empty()) {
|
||||
bool ok = common_download_model(model, bearer_token);
|
||||
if (!ok) {
|
||||
LOG_ERR("error: failed to download model from %s\n", model.url.c_str());
|
||||
exit(1);
|
||||
// make sure model path is present (for caching purposes)
|
||||
if (model.empty()) {
|
||||
// this is to avoid different repo having same file name, or same file name in different subdirs
|
||||
std::string filename = hf_repo + "_" + hf_file;
|
||||
// to make sure we don't have any slashes in the filename
|
||||
string_replace_all(filename, "/", "_");
|
||||
model = fs_get_cache_file(filename);
|
||||
}
|
||||
} else if (!model_url.empty()) {
|
||||
if (model.empty()) {
|
||||
auto f = string_split<std::string>(model_url, '#').front();
|
||||
f = string_split<std::string>(f, '?').front();
|
||||
model = fs_get_cache_file(string_split<std::string>(f, '/').back());
|
||||
}
|
||||
} else if (model.empty()) {
|
||||
model = model_default;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -826,16 +300,10 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
|
||||
throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
|
||||
}
|
||||
|
||||
common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH);
|
||||
common_params_handle_model(params.speculative.model, params.hf_token, "");
|
||||
common_params_handle_model(params.vocoder.model, params.hf_token, "");
|
||||
|
||||
// allow --mmproj to be set from -hf
|
||||
// assuming that mmproj is always in the same repo as text model
|
||||
if (!params.model.hf_repo.empty() && ctx_arg.ex == LLAMA_EXAMPLE_LLAVA) {
|
||||
params.mmproj.hf_repo = params.model.hf_repo;
|
||||
}
|
||||
common_params_handle_model(params.mmproj, params.hf_token, "", true);
|
||||
// TODO: refactor model params in a common struct
|
||||
common_params_handle_model_default(params.model, params.model_url, params.hf_repo, params.hf_file, params.hf_token, DEFAULT_MODEL_PATH);
|
||||
common_params_handle_model_default(params.speculative.model, params.speculative.model_url, params.speculative.hf_repo, params.speculative.hf_file, params.hf_token, "");
|
||||
common_params_handle_model_default(params.vocoder.model, params.vocoder.model_url, params.vocoder.hf_repo, params.vocoder.hf_file, params.hf_token, "");
|
||||
|
||||
if (params.escape) {
|
||||
string_process_escapes(params.prompt);
|
||||
@@ -854,10 +322,6 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
|
||||
params.kv_overrides.back().key[0] = 0;
|
||||
}
|
||||
|
||||
if (!params.tensor_buft_overrides.empty()) {
|
||||
params.tensor_buft_overrides.push_back({nullptr, nullptr});
|
||||
}
|
||||
|
||||
if (params.reranking && params.embedding) {
|
||||
throw std::invalid_argument("error: either --embedding or --reranking can be specified, but not both");
|
||||
}
|
||||
@@ -2097,14 +1561,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
{"--mmproj"}, "FILE",
|
||||
"path to a multimodal projector file for LLaVA. see examples/llava/README.md",
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.mmproj.path = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_LLAVA}));
|
||||
add_opt(common_arg(
|
||||
{"--mmproj-url"}, "URL",
|
||||
"URL to a multimodal projector file for LLaVA. see examples/llava/README.md",
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.mmproj.url = value;
|
||||
params.mmproj = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_LLAVA}));
|
||||
add_opt(common_arg(
|
||||
@@ -2190,41 +1647,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
exit(0);
|
||||
}
|
||||
));
|
||||
add_opt(common_arg(
|
||||
{"--override-tensor", "-ot"}, "<tensor name pattern>=<buffer type>,...",
|
||||
"override tensor buffer type", [](common_params & params, const std::string & value) {
|
||||
/* static */ std::map<std::string, ggml_backend_buffer_type_t> buft_list;
|
||||
if (buft_list.empty()) {
|
||||
// enumerate all the devices and add their buffer types to the list
|
||||
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
|
||||
auto * dev = ggml_backend_dev_get(i);
|
||||
auto * buft = ggml_backend_dev_buffer_type(dev);
|
||||
if (buft) {
|
||||
buft_list[ggml_backend_buft_name(buft)] = buft;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (const auto & override : string_split<std::string>(value, ',')) {
|
||||
std::string::size_type pos = override.find('=');
|
||||
if (pos == std::string::npos) {
|
||||
throw std::invalid_argument("invalid value");
|
||||
}
|
||||
std::string tensor_name = override.substr(0, pos);
|
||||
std::string buffer_type = override.substr(pos + 1);
|
||||
|
||||
if (buft_list.find(buffer_type) == buft_list.end()) {
|
||||
printf("Available buffer types:\n");
|
||||
for (const auto & it : buft_list) {
|
||||
printf(" %s\n", ggml_backend_buft_name(it.second));
|
||||
}
|
||||
throw std::invalid_argument("unknown buffer type");
|
||||
}
|
||||
// FIXME: this leaks memory
|
||||
params.tensor_buft_overrides.push_back({strdup(tensor_name.c_str()), buft_list.at(buffer_type)});
|
||||
}
|
||||
}
|
||||
));
|
||||
add_opt(common_arg(
|
||||
{"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
|
||||
"number of layers to store in VRAM",
|
||||
@@ -2368,14 +1790,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
"or `--model-url` if set, otherwise %s)", DEFAULT_MODEL_PATH
|
||||
),
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.model.path = value;
|
||||
params.model = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL"));
|
||||
add_opt(common_arg(
|
||||
{"-mu", "--model-url"}, "MODEL_URL",
|
||||
"model download url (default: unused)",
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.model.url = value;
|
||||
params.model_url = value;
|
||||
}
|
||||
).set_env("LLAMA_ARG_MODEL_URL"));
|
||||
add_opt(common_arg(
|
||||
@@ -2384,35 +1806,35 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
"example: unsloth/phi-4-GGUF:q4_k_m\n"
|
||||
"(default: unused)",
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.model.hf_repo = value;
|
||||
params.hf_repo = value;
|
||||
}
|
||||
).set_env("LLAMA_ARG_HF_REPO"));
|
||||
add_opt(common_arg(
|
||||
{"-hfd", "-hfrd", "--hf-repo-draft"}, "<user>/<model>[:quant]",
|
||||
"Same as --hf-repo, but for the draft model (default: unused)",
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.speculative.model.hf_repo = value;
|
||||
params.speculative.hf_repo = value;
|
||||
}
|
||||
).set_env("LLAMA_ARG_HFD_REPO"));
|
||||
add_opt(common_arg(
|
||||
{"-hff", "--hf-file"}, "FILE",
|
||||
"Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)",
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.model.hf_file = value;
|
||||
params.hf_file = value;
|
||||
}
|
||||
).set_env("LLAMA_ARG_HF_FILE"));
|
||||
add_opt(common_arg(
|
||||
{"-hfv", "-hfrv", "--hf-repo-v"}, "<user>/<model>[:quant]",
|
||||
"Hugging Face model repository for the vocoder model (default: unused)",
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.vocoder.model.hf_repo = value;
|
||||
params.vocoder.hf_repo = value;
|
||||
}
|
||||
).set_env("LLAMA_ARG_HF_REPO_V"));
|
||||
add_opt(common_arg(
|
||||
{"-hffv", "--hf-file-v"}, "FILE",
|
||||
"Hugging Face model file for the vocoder model (default: unused)",
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.vocoder.model.hf_file = value;
|
||||
params.vocoder.hf_file = value;
|
||||
}
|
||||
).set_env("LLAMA_ARG_HF_FILE_V"));
|
||||
add_opt(common_arg(
|
||||
@@ -2557,7 +1979,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
|
||||
add_opt(common_arg(
|
||||
{"--host"}, "HOST",
|
||||
string_format("ip address to listen, or bind to an UNIX socket if the address ends with .sock (default: %s)", params.hostname.c_str()),
|
||||
string_format("ip address to listen (default: %s)", params.hostname.c_str()),
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.hostname = value;
|
||||
}
|
||||
@@ -3032,7 +2454,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
{"-md", "--model-draft"}, "FNAME",
|
||||
"draft model for speculative decoding (default: unused)",
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.speculative.model.path = value;
|
||||
params.speculative.model = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
|
||||
|
||||
@@ -3040,7 +2462,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
{"-mv", "--model-vocoder"}, "FNAME",
|
||||
"vocoder model for audio generation (default: unused)",
|
||||
[](common_params & params, const std::string & value) {
|
||||
params.vocoder.model.path = value;
|
||||
params.vocoder.model = value;
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER}));
|
||||
add_opt(common_arg(
|
||||
@@ -3063,10 +2485,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
{"--tts-oute-default"},
|
||||
string_format("use default OuteTTS models (note: can download weights from the internet)"),
|
||||
[](common_params & params) {
|
||||
params.model.hf_repo = "OuteAI/OuteTTS-0.2-500M-GGUF";
|
||||
params.model.hf_file = "OuteTTS-0.2-500M-Q8_0.gguf";
|
||||
params.vocoder.model.hf_repo = "ggml-org/WavTokenizer";
|
||||
params.vocoder.model.hf_file = "WavTokenizer-Large-75-F16.gguf";
|
||||
params.hf_repo = "OuteAI/OuteTTS-0.2-500M-GGUF";
|
||||
params.hf_file = "OuteTTS-0.2-500M-Q8_0.gguf";
|
||||
params.vocoder.hf_repo = "ggml-org/WavTokenizer";
|
||||
params.vocoder.hf_file = "WavTokenizer-Large-75-F16.gguf";
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_TTS}));
|
||||
|
||||
@@ -3074,8 +2496,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
{"--embd-bge-small-en-default"},
|
||||
string_format("use default bge-small-en-v1.5 model (note: can download weights from the internet)"),
|
||||
[](common_params & params) {
|
||||
params.model.hf_repo = "ggml-org/bge-small-en-v1.5-Q8_0-GGUF";
|
||||
params.model.hf_file = "bge-small-en-v1.5-q8_0.gguf";
|
||||
params.hf_repo = "ggml-org/bge-small-en-v1.5-Q8_0-GGUF";
|
||||
params.hf_file = "bge-small-en-v1.5-q8_0.gguf";
|
||||
params.pooling_type = LLAMA_POOLING_TYPE_NONE;
|
||||
params.embd_normalize = 2;
|
||||
params.n_ctx = 512;
|
||||
@@ -3088,8 +2510,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
{"--embd-e5-small-en-default"},
|
||||
string_format("use default e5-small-v2 model (note: can download weights from the internet)"),
|
||||
[](common_params & params) {
|
||||
params.model.hf_repo = "ggml-org/e5-small-v2-Q8_0-GGUF";
|
||||
params.model.hf_file = "e5-small-v2-q8_0.gguf";
|
||||
params.hf_repo = "ggml-org/e5-small-v2-Q8_0-GGUF";
|
||||
params.hf_file = "e5-small-v2-q8_0.gguf";
|
||||
params.pooling_type = LLAMA_POOLING_TYPE_NONE;
|
||||
params.embd_normalize = 2;
|
||||
params.n_ctx = 512;
|
||||
@@ -3102,8 +2524,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
{"--embd-gte-small-default"},
|
||||
string_format("use default gte-small model (note: can download weights from the internet)"),
|
||||
[](common_params & params) {
|
||||
params.model.hf_repo = "ggml-org/gte-small-Q8_0-GGUF";
|
||||
params.model.hf_file = "gte-small-q8_0.gguf";
|
||||
params.hf_repo = "ggml-org/gte-small-Q8_0-GGUF";
|
||||
params.hf_file = "gte-small-q8_0.gguf";
|
||||
params.pooling_type = LLAMA_POOLING_TYPE_NONE;
|
||||
params.embd_normalize = 2;
|
||||
params.n_ctx = 512;
|
||||
@@ -3116,8 +2538,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
{"--fim-qwen-1.5b-default"},
|
||||
string_format("use default Qwen 2.5 Coder 1.5B (note: can download weights from the internet)"),
|
||||
[](common_params & params) {
|
||||
params.model.hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-Q8_0-GGUF";
|
||||
params.model.hf_file = "qwen2.5-coder-1.5b-q8_0.gguf";
|
||||
params.hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-Q8_0-GGUF";
|
||||
params.hf_file = "qwen2.5-coder-1.5b-q8_0.gguf";
|
||||
params.port = 8012;
|
||||
params.n_gpu_layers = 99;
|
||||
params.flash_attn = true;
|
||||
@@ -3132,8 +2554,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
{"--fim-qwen-3b-default"},
|
||||
string_format("use default Qwen 2.5 Coder 3B (note: can download weights from the internet)"),
|
||||
[](common_params & params) {
|
||||
params.model.hf_repo = "ggml-org/Qwen2.5-Coder-3B-Q8_0-GGUF";
|
||||
params.model.hf_file = "qwen2.5-coder-3b-q8_0.gguf";
|
||||
params.hf_repo = "ggml-org/Qwen2.5-Coder-3B-Q8_0-GGUF";
|
||||
params.hf_file = "qwen2.5-coder-3b-q8_0.gguf";
|
||||
params.port = 8012;
|
||||
params.n_gpu_layers = 99;
|
||||
params.flash_attn = true;
|
||||
@@ -3148,8 +2570,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
{"--fim-qwen-7b-default"},
|
||||
string_format("use default Qwen 2.5 Coder 7B (note: can download weights from the internet)"),
|
||||
[](common_params & params) {
|
||||
params.model.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
|
||||
params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
|
||||
params.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
|
||||
params.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
|
||||
params.port = 8012;
|
||||
params.n_gpu_layers = 99;
|
||||
params.flash_attn = true;
|
||||
@@ -3164,10 +2586,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
{"--fim-qwen-7b-spec"},
|
||||
string_format("use Qwen 2.5 Coder 7B + 0.5B draft for speculative decoding (note: can download weights from the internet)"),
|
||||
[](common_params & params) {
|
||||
params.model.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
|
||||
params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
|
||||
params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
|
||||
params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
|
||||
params.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
|
||||
params.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
|
||||
params.speculative.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
|
||||
params.speculative.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
|
||||
params.speculative.n_gpu_layers = 99;
|
||||
params.port = 8012;
|
||||
params.n_gpu_layers = 99;
|
||||
@@ -3183,10 +2605,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
||||
{"--fim-qwen-14b-spec"},
|
||||
string_format("use Qwen 2.5 Coder 14B + 0.5B draft for speculative decoding (note: can download weights from the internet)"),
|
||||
[](common_params & params) {
|
||||
params.model.hf_repo = "ggml-org/Qwen2.5-Coder-14B-Q8_0-GGUF";
|
||||
params.model.hf_file = "qwen2.5-coder-14b-q8_0.gguf";
|
||||
params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
|
||||
params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
|
||||
params.hf_repo = "ggml-org/Qwen2.5-Coder-14B-Q8_0-GGUF";
|
||||
params.hf_file = "qwen2.5-coder-14b-q8_0.gguf";
|
||||
params.speculative.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
|
||||
params.speculative.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
|
||||
params.speculative.n_gpu_layers = 99;
|
||||
params.port = 8012;
|
||||
params.n_gpu_layers = 99;
|
||||
|
||||
@@ -7,6 +7,9 @@

#include "common.h"
#include "log.h"
// Change JSON_ASSERT from assert() to GGML_ASSERT:
#define JSON_ASSERT GGML_ASSERT
#include "json.hpp"
#include "llama.h"

#include <algorithm>
@@ -48,11 +51,47 @@
#include <sys/stat.h>
#include <unistd.h>
#endif
#if defined(LLAMA_USE_CURL)
#include <curl/curl.h>
#include <curl/easy.h>
#include <future>
#endif

#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif

#if defined(LLAMA_USE_CURL)
#ifdef __linux__
#include <linux/limits.h>
#elif defined(_WIN32)
# if !defined(PATH_MAX)
#  define PATH_MAX MAX_PATH
# endif
#else
#include <sys/syslimits.h>
#endif
#define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083

//
// CURL utils
//

using curl_ptr = std::unique_ptr<CURL, decltype(&curl_easy_cleanup)>;

// cannot use unique_ptr for curl_slist, because we cannot update without destroying the old one
struct curl_slist_ptr {
    struct curl_slist * ptr = nullptr;
    ~curl_slist_ptr() {
        if (ptr) {
            curl_slist_free_all(ptr);
        }
    }
};
#endif // LLAMA_USE_CURL

using json = nlohmann::ordered_json;

//
// CPU utils
//
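The comment on curl_slist_ptr is easiest to see in a small usage sketch (illustrative only, not part of the patch; the function name is made up). curl_slist_append() returns the possibly-new list head, so the caller keeps reassigning ptr, and the destructor frees whichever head is left:

static CURLcode example_get_json(CURL * curl, const char * url) {
    curl_slist_ptr headers; // freed by ~curl_slist_ptr(), whatever the final head is
    headers.ptr = curl_slist_append(headers.ptr, "Accept: application/json");
    headers.ptr = curl_slist_append(headers.ptr, "User-Agent: llama-cpp");
    curl_easy_setopt(curl, CURLOPT_URL, url);
    curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers.ptr);
    return curl_easy_perform(curl); // the list must outlive the transfer, hence the scoped wrapper
}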
@@ -543,41 +582,6 @@ std::string string_from(const struct llama_context * ctx, const std::vector<llam
    return buf.str();
}

std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch) {
    std::stringstream buf;

    buf << "[ ";

    bool first = true;
    for (int i = 0; i < batch.n_tokens; ++i) {
        if (!first) {
            buf << ", ";
        } else {
            first = false;
        }

        auto detokenized = common_token_to_piece(ctx, batch.token[i]);

        detokenized.erase(
            std::remove_if(
                detokenized.begin(),
                detokenized.end(),
                [](const unsigned char c) { return !std::isprint(c); }),
            detokenized.end());

        buf << "\n" << std::to_string(i)
            << ", token '" << detokenized << "'"
            << ", pos " << std::to_string(batch.pos[i])
            << ", n_seq_id " << std::to_string(batch.n_seq_id[i])
            << ", seq_id " << std::to_string(batch.seq_id[i][0])
            << ", logits " << std::to_string(batch.logits[i]);
    }

    buf << " ]";

    return buf.str();
}

void string_process_escapes(std::string & input) {
    std::size_t input_len = input.length();
    std::size_t output_idx = 0;
@@ -861,14 +865,22 @@ std::string fs_get_cache_file(const std::string & filename) {
//
// Model utils
//

struct common_init_result common_init_from_params(common_params & params) {
    common_init_result iparams;
    auto mparams = common_model_params_to_llama(params);

    llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
    llama_model * model = nullptr;

    if (!params.hf_repo.empty() && !params.hf_file.empty()) {
        model = common_load_model_from_hf(params.hf_repo, params.hf_file, params.model, params.hf_token, mparams);
    } else if (!params.model_url.empty()) {
        model = common_load_model_from_url(params.model_url, params.model, params.hf_token, mparams);
    } else {
        model = llama_model_load_from_file(params.model.c_str(), mparams);
    }

    if (model == NULL) {
        LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.path.c_str());
        LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.c_str());
        return iparams;
    }

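Put differently, the if/else chain above gives model sources a fixed precedence: an HF repo/file pair first, then a direct model URL, then the plain local path. A hedged sketch of that precedence in isolation (the enum and helper are hypothetical; field names follow the flat common_params layout used on one side of this diff):

enum class example_model_source { HF_REPO, URL, LOCAL_FILE };

static example_model_source example_pick_model_source(const common_params & params) {
    if (!params.hf_repo.empty() && !params.hf_file.empty()) {
        return example_model_source::HF_REPO;   // fetched (or reused from cache) via the HF helpers
    }
    if (!params.model_url.empty()) {
        return example_model_source::URL;       // fetched (or reused from cache) via libcurl
    }
    return example_model_source::LOCAL_FILE;    // loaded directly with llama_model_load_from_file()
}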
@@ -903,7 +915,7 @@ struct common_init_result common_init_from_params(common_params & params) {

    llama_context * lctx = llama_init_from_model(model, cparams);
    if (lctx == NULL) {
        LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str());
        LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str());
        llama_model_free(model);
        return iparams;
    }
@@ -1004,7 +1016,8 @@ struct common_init_result common_init_from_params(common_params & params) {
    }

    if (llama_model_has_encoder(model)) {
        llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size()));
        llama_batch_ext_ptr batch(llama_batch_ext_init_from_text(tmp.data(), tmp.size(), 0, 0, true));
        llama_encode_ext(lctx, batch.get());
        llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
        if (decoder_start_token_id == LLAMA_TOKEN_NULL) {
            decoder_start_token_id = bos;
@@ -1013,7 +1026,8 @@ struct common_init_result common_init_from_params(common_params & params) {
        tmp.push_back(decoder_start_token_id);
    }
    if (llama_model_has_decoder(model)) {
        llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)));
        llama_batch_ext_ptr batch(llama_batch_ext_init_from_text(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0, true));
        llama_decode_ext(lctx, batch.get());
    }
    llama_kv_self_clear(lctx);
    llama_synchronize(lctx);
@@ -1042,18 +1056,15 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
    if (!params.devices.empty()) {
        mparams.devices = params.devices.data();
    }

    if (params.n_gpu_layers != -1) {
        mparams.n_gpu_layers = params.n_gpu_layers;
    }

    mparams.main_gpu = params.main_gpu;
    mparams.split_mode = params.split_mode;
    mparams.tensor_split = params.tensor_split;
    mparams.use_mmap = params.use_mmap;
    mparams.use_mlock = params.use_mlock;
    mparams.check_tensors = params.check_tensors;

    if (params.kv_overrides.empty()) {
        mparams.kv_overrides = NULL;
    } else {
@@ -1061,13 +1072,6 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
        mparams.kv_overrides = params.kv_overrides.data();
    }

    if (params.tensor_buft_overrides.empty()) {
        mparams.tensor_buft_overrides = NULL;
    } else {
        GGML_ASSERT(params.tensor_buft_overrides.back().pattern == nullptr && "Tensor buffer overrides not terminated with empty pattern");
        mparams.tensor_buft_overrides = params.tensor_buft_overrides.data();
    }

    return mparams;
}

@@ -1127,14 +1131,461 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p
    return tpp;
}

#ifdef LLAMA_USE_CURL

#define CURL_MAX_RETRY 3
#define CURL_RETRY_DELAY_SECONDS 2

static bool curl_perform_with_retry(const std::string & url, CURL * curl, int max_attempts, int retry_delay_seconds) {
    int remaining_attempts = max_attempts;

    while (remaining_attempts > 0) {
        LOG_INF("%s: Trying to download from %s (attempt %d of %d)...\n", __func__ , url.c_str(), max_attempts - remaining_attempts + 1, max_attempts);

        CURLcode res = curl_easy_perform(curl);
        if (res == CURLE_OK) {
            return true;
        }

        int exponential_backoff_delay = std::pow(retry_delay_seconds, max_attempts - remaining_attempts) * 1000;
        LOG_WRN("%s: curl_easy_perform() failed: %s, retrying after %d milliseconds...\n", __func__, curl_easy_strerror(res), exponential_backoff_delay);

        remaining_attempts--;
        std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay));
    }

    LOG_ERR("%s: curl_easy_perform() failed after %d attempts\n", __func__, max_attempts);

    return false;
}

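The delay above is retry_delay_seconds raised to the number of failures so far, converted to milliseconds. A short worked example with the defaults (the helper mirrors the expression above and is illustrative only):

// With CURL_MAX_RETRY = 3 and CURL_RETRY_DELAY_SECONDS = 2 the schedule is:
//   after the 1st failed attempt: pow(2, 0) * 1000 = 1000 ms
//   after the 2nd failed attempt: pow(2, 1) * 1000 = 2000 ms
//   after the 3rd failed attempt: pow(2, 2) * 1000 = 4000 ms, then the helper returns false
static int example_backoff_ms(int retry_delay_seconds, int failures_so_far) {
    return (int) std::pow(retry_delay_seconds, failures_so_far) * 1000;
}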
static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
|
||||
// Initialize libcurl
|
||||
curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
|
||||
curl_slist_ptr http_headers;
|
||||
if (!curl) {
|
||||
LOG_ERR("%s: error initializing libcurl\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
bool force_download = false;
|
||||
|
||||
// Set the URL, allow to follow http redirection
|
||||
curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
|
||||
curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L);
|
||||
|
||||
// Check if hf-token or bearer-token was specified
|
||||
if (!hf_token.empty()) {
|
||||
std::string auth_header = "Authorization: Bearer " + hf_token;
|
||||
http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
|
||||
curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
|
||||
}
|
||||
|
||||
#if defined(_WIN32)
|
||||
// CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of
|
||||
// operating system. Currently implemented under MS-Windows.
|
||||
curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
|
||||
#endif
|
||||
|
||||
// Check if the file already exists locally
|
||||
auto file_exists = std::filesystem::exists(path);
|
||||
|
||||
// If the file exists, check its JSON metadata companion file.
|
||||
std::string metadata_path = path + ".json";
|
||||
nlohmann::json metadata;
|
||||
std::string etag;
|
||||
std::string last_modified;
|
||||
|
||||
if (file_exists) {
|
||||
// Try and read the JSON metadata file (note: stream autoclosed upon exiting this block).
|
||||
std::ifstream metadata_in(metadata_path);
|
||||
if (metadata_in.good()) {
|
||||
try {
|
||||
metadata_in >> metadata;
|
||||
LOG_INF("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
|
||||
if (metadata.contains("url") && metadata.at("url").is_string()) {
|
||||
auto previous_url = metadata.at("url").get<std::string>();
|
||||
if (previous_url != url) {
|
||||
LOG_ERR("%s: Model URL mismatch: %s != %s\n", __func__, url.c_str(), previous_url.c_str());
|
||||
return false;
|
||||
}
|
||||
}
|
||||
if (metadata.contains("etag") && metadata.at("etag").is_string()) {
|
||||
etag = metadata.at("etag");
|
||||
}
|
||||
if (metadata.contains("lastModified") && metadata.at("lastModified").is_string()) {
|
||||
last_modified = metadata.at("lastModified");
|
||||
}
|
||||
} catch (const nlohmann::json::exception & e) {
|
||||
LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
|
||||
return false;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
|
||||
}
|
||||
|
||||
// Send a HEAD request to retrieve the etag and last-modified headers
|
||||
struct common_load_model_from_url_headers {
|
||||
std::string etag;
|
||||
std::string last_modified;
|
||||
};
|
||||
|
||||
common_load_model_from_url_headers headers;
|
||||
|
||||
{
|
||||
typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
|
||||
auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
|
||||
common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;
|
||||
|
||||
static std::regex header_regex("([^:]+): (.*)\r\n");
|
||||
static std::regex etag_regex("ETag", std::regex_constants::icase);
|
||||
static std::regex last_modified_regex("Last-Modified", std::regex_constants::icase);
|
||||
|
||||
std::string header(buffer, n_items);
|
||||
std::smatch match;
|
||||
if (std::regex_match(header, match, header_regex)) {
|
||||
const std::string & key = match[1];
|
||||
const std::string & value = match[2];
|
||||
if (std::regex_match(key, match, etag_regex)) {
|
||||
headers->etag = value;
|
||||
} else if (std::regex_match(key, match, last_modified_regex)) {
|
||||
headers->last_modified = value;
|
||||
}
|
||||
}
|
||||
return n_items;
|
||||
};
|
||||
|
||||
curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 1L); // will trigger the HEAD verb
|
||||
curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); // hide head request progress
|
||||
curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
|
||||
curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers);
|
||||
|
||||
bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
|
||||
if (!was_perform_successful) {
|
||||
return false;
|
||||
}
|
||||
|
||||
long http_code = 0;
|
||||
curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
|
||||
if (http_code != 200) {
|
||||
// HEAD not supported, we don't know if the file has changed
|
||||
// force trigger downloading
|
||||
force_download = true;
|
||||
LOG_ERR("%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
|
||||
}
|
||||
}
|
||||
|
||||
bool should_download = !file_exists || force_download;
|
||||
if (!should_download) {
|
||||
if (!etag.empty() && etag != headers.etag) {
|
||||
LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(), headers.etag.c_str());
|
||||
should_download = true;
|
||||
} else if (!last_modified.empty() && last_modified != headers.last_modified) {
|
||||
LOG_WRN("%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__, last_modified.c_str(), headers.last_modified.c_str());
|
||||
should_download = true;
|
||||
}
|
||||
}
|
||||
if (should_download) {
|
||||
std::string path_temporary = path + ".downloadInProgress";
|
||||
if (file_exists) {
|
||||
LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str());
|
||||
if (remove(path.c_str()) != 0) {
|
||||
LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str());
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Set the output file
|
||||
|
||||
struct FILE_deleter {
|
||||
void operator()(FILE * f) const {
|
||||
fclose(f);
|
||||
}
|
||||
};
|
||||
|
||||
std::unique_ptr<FILE, FILE_deleter> outfile(fopen(path_temporary.c_str(), "wb"));
|
||||
if (!outfile) {
|
||||
LOG_ERR("%s: error opening local file for writing: %s\n", __func__, path.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * data, size_t size, size_t nmemb, void * fd);
|
||||
auto write_callback = [](void * data, size_t size, size_t nmemb, void * fd) -> size_t {
|
||||
return fwrite(data, size, nmemb, (FILE *)fd);
|
||||
};
|
||||
curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 0L);
|
||||
curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
|
||||
curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, outfile.get());
|
||||
|
||||
// display download progress
|
||||
curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 0L);
|
||||
|
||||
// helper function to hide password in URL
|
||||
auto llama_download_hide_password_in_url = [](const std::string & url) -> std::string {
|
||||
std::size_t protocol_pos = url.find("://");
|
||||
if (protocol_pos == std::string::npos) {
|
||||
return url; // Malformed URL
|
||||
}
|
||||
|
||||
std::size_t at_pos = url.find('@', protocol_pos + 3);
|
||||
if (at_pos == std::string::npos) {
|
||||
return url; // No password in URL
|
||||
}
|
||||
|
||||
return url.substr(0, protocol_pos + 3) + "********" + url.substr(at_pos);
|
||||
};
|
||||
|
||||
// start the download
|
||||
LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
|
||||
llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str());
|
||||
bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS);
|
||||
if (!was_perform_successful) {
|
||||
return false;
|
||||
}
|
||||
|
||||
long http_code = 0;
|
||||
curl_easy_getinfo (curl.get(), CURLINFO_RESPONSE_CODE, &http_code);
|
||||
if (http_code < 200 || http_code >= 400) {
|
||||
LOG_ERR("%s: invalid http status code received: %ld\n", __func__, http_code);
|
||||
return false;
|
||||
}
|
||||
|
||||
// Causes file to be closed explicitly here before we rename it.
|
||||
outfile.reset();
|
||||
|
||||
// Write the updated JSON metadata file.
|
||||
metadata.update({
|
||||
{"url", url},
|
||||
{"etag", headers.etag},
|
||||
{"lastModified", headers.last_modified}
|
||||
});
|
||||
std::ofstream(metadata_path) << metadata.dump(4);
|
||||
LOG_INF("%s: file metadata saved: %s\n", __func__, metadata_path.c_str());
|
||||
|
||||
if (rename(path_temporary.c_str(), path.c_str()) != 0) {
|
||||
LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str());
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
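For reference, the metadata companion file maintained by common_download_file() is a small JSON document stored next to the model file; the field values below are illustrative only:

// Shape of "<model path>.json" (example values, not real ones):
//
//   {
//       "url":          "https://huggingface.co/<repo>/resolve/main/<file>.gguf",
//       "etag":         "\"0123456789abcdef\"",
//       "lastModified": "Tue, 01 Apr 2025 00:00:00 GMT"
//   }
//
// On the next run these fields are compared against the HEAD response headers and
// the file is re-downloaded only when the ETag or Last-Modified value changed
// (or when the HEAD request itself did not return 200).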
|
||||
|
||||
struct llama_model * common_load_model_from_url(
|
||||
const std::string & model_url,
|
||||
const std::string & local_path,
|
||||
const std::string & hf_token,
|
||||
const struct llama_model_params & params) {
|
||||
// Basic validation of the model_url
|
||||
if (model_url.empty()) {
|
||||
LOG_ERR("%s: invalid model_url\n", __func__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (!common_download_file(model_url, local_path, hf_token)) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// check for additional GGUFs split to download
|
||||
int n_split = 0;
|
||||
{
|
||||
struct gguf_init_params gguf_params = {
|
||||
/*.no_alloc = */ true,
|
||||
/*.ctx = */ NULL,
|
||||
};
|
||||
auto * ctx_gguf = gguf_init_from_file(local_path.c_str(), gguf_params);
|
||||
if (!ctx_gguf) {
|
||||
LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, local_path.c_str());
|
||||
return NULL;
|
||||
}
|
||||
|
||||
auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT);
|
||||
if (key_n_split >= 0) {
|
||||
n_split = gguf_get_val_u16(ctx_gguf, key_n_split);
|
||||
}
|
||||
|
||||
gguf_free(ctx_gguf);
|
||||
}
|
||||
|
||||
if (n_split > 1) {
|
||||
char split_prefix[PATH_MAX] = {0};
|
||||
char split_url_prefix[LLAMA_CURL_MAX_URL_LENGTH] = {0};
|
||||
|
||||
// Verify the first split file format
|
||||
// and extract split URL and PATH prefixes
|
||||
{
|
||||
if (!llama_split_prefix(split_prefix, sizeof(split_prefix), local_path.c_str(), 0, n_split)) {
|
||||
LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, local_path.c_str(), n_split);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url.c_str(), 0, n_split)) {
|
||||
LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url.c_str(), n_split);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
// Prepare download in parallel
|
||||
std::vector<std::future<bool>> futures_download;
|
||||
for (int idx = 1; idx < n_split; idx++) {
|
||||
futures_download.push_back(std::async(std::launch::async, [&split_prefix, &split_url_prefix, &n_split, hf_token](int download_idx) -> bool {
|
||||
char split_path[PATH_MAX] = {0};
|
||||
llama_split_path(split_path, sizeof(split_path), split_prefix, download_idx, n_split);
|
||||
|
||||
char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0};
|
||||
llama_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split);
|
||||
|
||||
return common_download_file(split_url, split_path, hf_token);
|
||||
}, idx));
|
||||
}
|
||||
|
||||
// Wait for all downloads to complete
|
||||
for (auto & f : futures_download) {
|
||||
if (!f.get()) {
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return llama_model_load_from_file(local_path.c_str(), params);
|
||||
}
|
||||
|
||||
struct llama_model * common_load_model_from_hf(
        const std::string & repo,
        const std::string & remote_path,
        const std::string & local_path,
        const std::string & hf_token,
        const struct llama_model_params & params) {
    // construct hugging face model url:
    //
    //  --repo ggml-org/models --file tinyllama-1.1b/ggml-model-f16.gguf
    //    https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf
    //
    //  --repo TheBloke/Mixtral-8x7B-v0.1-GGUF --file mixtral-8x7b-v0.1.Q4_K_M.gguf
    //    https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q4_K_M.gguf
    //

    std::string model_url = "https://huggingface.co/";
    model_url += repo;
    model_url += "/resolve/main/";
    model_url += remote_path;

    return common_load_model_from_url(model_url, local_path, hf_token, params);
}
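A hedged usage sketch for the URL construction above (repo and file names are taken from the comment; the local path is made up):

//   llama_model_params mparams = llama_model_default_params();
//   llama_model * model = common_load_model_from_hf(
//       /* repo        = */ "ggml-org/models",
//       /* remote_path = */ "tinyllama-1.1b/ggml-model-f16.gguf",
//       /* local_path  = */ "models/ggml-model-f16.gguf",
//       /* hf_token    = */ "",
//       /* params      = */ mparams);
//   // downloads from:
//   //   https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf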

/**
 * Allow getting the HF file from the HF repo with tag (like ollama), for example:
 * - bartowski/Llama-3.2-3B-Instruct-GGUF:q4
 * - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M
 * - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s
 * Tag is optional, default to "latest" (meaning it checks for Q4_K_M first, then Q4, then if not found, return the first GGUF file in repo)
 *
 * Return pair of <repo, file> (with "repo" already having tag removed)
 *
 * Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files.
 */
std::pair<std::string, std::string> common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & hf_token) {
|
||||
auto parts = string_split<std::string>(hf_repo_with_tag, ':');
|
||||
std::string tag = parts.size() > 1 ? parts.back() : "latest";
|
||||
std::string hf_repo = parts[0];
|
||||
if (string_split<std::string>(hf_repo, '/').size() != 2) {
|
||||
throw std::invalid_argument("error: invalid HF repo format, expected <user>/<model>[:quant]\n");
|
||||
}
|
||||
|
||||
// fetch model info from Hugging Face Hub API
|
||||
json model_info;
|
||||
curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
|
||||
curl_slist_ptr http_headers;
|
||||
std::string res_str;
|
||||
std::string url = "https://huggingface.co/v2/" + hf_repo + "/manifests/" + tag;
|
||||
curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
|
||||
curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L);
|
||||
typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data);
|
||||
auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t {
|
||||
static_cast<std::string *>(data)->append((char * ) ptr, size * nmemb);
|
||||
return size * nmemb;
|
||||
};
|
||||
curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
|
||||
curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_str);
|
||||
#if defined(_WIN32)
|
||||
curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
|
||||
#endif
|
||||
if (!hf_token.empty()) {
|
||||
std::string auth_header = "Authorization: Bearer " + hf_token;
|
||||
http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
|
||||
}
|
||||
// Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response
|
||||
http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
|
||||
http_headers.ptr = curl_slist_append(http_headers.ptr, "Accept: application/json");
|
||||
curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
|
||||
|
||||
CURLcode res = curl_easy_perform(curl.get());
|
||||
|
||||
if (res != CURLE_OK) {
|
||||
throw std::runtime_error("error: cannot make GET request to HF API");
|
||||
}
|
||||
|
||||
long res_code;
|
||||
curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code);
|
||||
if (res_code == 200) {
|
||||
model_info = json::parse(res_str);
|
||||
} else if (res_code == 401) {
|
||||
throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token");
|
||||
} else {
|
||||
throw std::runtime_error(string_format("error from HF API, response code: %ld, data: %s", res_code, res_str.c_str()));
|
||||
}
|
||||
|
||||
// check response
|
||||
if (!model_info.contains("ggufFile")) {
|
||||
throw std::runtime_error("error: model does not have ggufFile");
|
||||
}
|
||||
json & gguf_file = model_info.at("ggufFile");
|
||||
if (!gguf_file.contains("rfilename")) {
|
||||
throw std::runtime_error("error: ggufFile does not have rfilename");
|
||||
}
|
||||
|
||||
return std::make_pair(hf_repo, gguf_file.at("rfilename"));
|
||||
}
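A usage sketch for the tag resolution described above (repo and tag come from the comment; the returned file name depends on what the Hub manifest actually reports):

//   auto [repo, file] = common_get_hf_file("bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M", /* hf_token = */ "");
//   // repo == "bartowski/Llama-3.2-3B-Instruct-GGUF"   (tag stripped)
//   // file == the "ggufFile.rfilename" reported by the manifest for that quant
//   // omitting ":Q4_K_M" is equivalent to requesting the "latest" tag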
|
||||
|
||||
#else
|
||||
|
||||
struct llama_model * common_load_model_from_url(
|
||||
const std::string & /*model_url*/,
|
||||
const std::string & /*local_path*/,
|
||||
const std::string & /*hf_token*/,
|
||||
const struct llama_model_params & /*params*/) {
|
||||
LOG_WRN("%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
struct llama_model * common_load_model_from_hf(
|
||||
const std::string & /*repo*/,
|
||||
const std::string & /*remote_path*/,
|
||||
const std::string & /*local_path*/,
|
||||
const std::string & /*hf_token*/,
|
||||
const struct llama_model_params & /*params*/) {
|
||||
LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
std::pair<std::string, std::string> common_get_hf_file(const std::string &, const std::string &) {
|
||||
LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
|
||||
return std::make_pair("", "");
|
||||
}
|
||||
|
||||
#endif // LLAMA_USE_CURL
|
||||
|
||||
//
|
||||
// Batch utils
|
||||
//
|
||||
|
||||
// DEPRECATED
|
||||
void common_batch_clear(struct llama_batch & batch) {
|
||||
batch.n_tokens = 0;
|
||||
}
|
||||
|
||||
// DEPRECATED
|
||||
void common_batch_add(
|
||||
struct llama_batch & batch,
|
||||
llama_token id,
|
||||
@@ -1550,3 +2001,26 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
|
||||
|
||||
return result;
|
||||
}

template <>
json common_grammar_trigger::to_json() const {
    json out {
        {"type", (int) type},
        {"value", value},
    };
    if (type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
        out["token"] = (int) token;
    }
    return out;
}

template <>
common_grammar_trigger common_grammar_trigger::from_json(const json & in) {
    common_grammar_trigger out;
    out.type = (common_grammar_trigger_type) in.at("type").get<int>();
    out.value = in.at("value").get<std::string>();
    if (out.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
        out.token = (llama_token) in.at("token").get<int>();
    }
    return out;
}
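A minimal round-trip sketch for the two specializations above (illustrative only; the trigger values are made up):

static void example_grammar_trigger_roundtrip() {
    common_grammar_trigger trig;
    trig.type  = COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN;
    trig.value = "<tool_call>";
    trig.token = 42; // hypothetical token id

    const json j = trig.to_json<json>();
    const common_grammar_trigger back = common_grammar_trigger::from_json(j);

    GGML_ASSERT(back.type  == trig.type);
    GGML_ASSERT(back.value == trig.value);
    GGML_ASSERT(back.token == trig.token);
}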
common/common.h (112 changed lines)
@@ -121,6 +121,10 @@ struct common_grammar_trigger {
|
||||
common_grammar_trigger_type type;
|
||||
std::string value;
|
||||
llama_token token = LLAMA_TOKEN_NULL;
|
||||
|
||||
// T can only be nlohmann::ordered_json
|
||||
template <class T> T to_json() const;
|
||||
template <class T> static common_grammar_trigger from_json(const T & in);
|
||||
};
|
||||
|
||||
// sampling parameters
|
||||
@@ -180,13 +184,6 @@ struct common_params_sampling {
|
||||
std::string print() const;
|
||||
};
|
||||
|
||||
struct common_params_model {
|
||||
std::string path = ""; // model local path // NOLINT
|
||||
std::string url = ""; // model url to download // NOLINT
|
||||
std::string hf_repo = ""; // HF repo // NOLINT
|
||||
std::string hf_file = ""; // HF file // NOLINT
|
||||
};
|
||||
|
||||
struct common_params_speculative {
|
||||
std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
|
||||
|
||||
@@ -200,11 +197,19 @@ struct common_params_speculative {
|
||||
struct cpu_params cpuparams;
|
||||
struct cpu_params cpuparams_batch;
|
||||
|
||||
struct common_params_model model;
|
||||
std::string hf_repo = ""; // HF repo // NOLINT
|
||||
std::string hf_file = ""; // HF file // NOLINT
|
||||
|
||||
std::string model = ""; // draft model for speculative decoding // NOLINT
|
||||
std::string model_url = ""; // model url to download // NOLINT
|
||||
};
|
||||
|
||||
struct common_params_vocoder {
|
||||
struct common_params_model model;
|
||||
std::string hf_repo = ""; // HF repo // NOLINT
|
||||
std::string hf_file = ""; // HF file // NOLINT
|
||||
|
||||
std::string model = ""; // model path // NOLINT
|
||||
std::string model_url = ""; // model url to download // NOLINT
|
||||
|
||||
std::string speaker_file = ""; // speaker file path // NOLINT
|
||||
|
||||
@@ -262,10 +267,12 @@ struct common_params {
|
||||
struct common_params_speculative speculative;
|
||||
struct common_params_vocoder vocoder;
|
||||
|
||||
struct common_params_model model;
|
||||
|
||||
std::string model = ""; // model path // NOLINT
|
||||
std::string model_alias = ""; // model alias // NOLINT
|
||||
std::string model_url = ""; // model url to download // NOLINT
|
||||
std::string hf_token = ""; // HF token // NOLINT
|
||||
std::string hf_repo = ""; // HF repo // NOLINT
|
||||
std::string hf_file = ""; // HF file // NOLINT
|
||||
std::string prompt = ""; // NOLINT
|
||||
std::string system_prompt = ""; // NOLINT
|
||||
std::string prompt_file = ""; // store the external prompt file name // NOLINT
|
||||
@@ -279,7 +286,6 @@ struct common_params {
|
||||
std::vector<std::string> in_files; // all input files
|
||||
std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
|
||||
std::vector<llama_model_kv_override> kv_overrides;
|
||||
std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
|
||||
|
||||
bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply)
|
||||
std::vector<common_adapter_lora_info> lora_adapters; // lora adapter path with user defined scale
|
||||
@@ -341,7 +347,7 @@ struct common_params {
|
||||
common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;
|
||||
|
||||
// multimodal models (see examples/llava)
|
||||
struct common_params_model mmproj;
|
||||
std::string mmproj = ""; // path to multimodal projector // NOLINT
|
||||
std::vector<std::string> image; // path to image file(s)
|
||||
|
||||
// embedding
|
||||
@@ -510,7 +516,6 @@ void string_process_escapes(std::string & input);
|
||||
std::string string_from(bool value);
|
||||
std::string string_from(const std::vector<int> & values);
|
||||
std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens);
|
||||
std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch);
|
||||
|
||||
//
|
||||
// Filesystem utils
|
||||
@@ -540,6 +545,23 @@ struct llama_model_params common_model_params_to_llama ( common_params
|
||||
struct llama_context_params common_context_params_to_llama(const common_params & params);
|
||||
struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);
|
||||
|
||||
struct llama_model * common_load_model_from_url(
|
||||
const std::string & model_url,
|
||||
const std::string & local_path,
|
||||
const std::string & hf_token,
|
||||
const struct llama_model_params & params);
|
||||
|
||||
struct llama_model * common_load_model_from_hf(
|
||||
const std::string & repo,
|
||||
const std::string & remote_path,
|
||||
const std::string & local_path,
|
||||
const std::string & hf_token,
|
||||
const struct llama_model_params & params);
|
||||
|
||||
std::pair<std::string, std::string> common_get_hf_file(
|
||||
const std::string & hf_repo_with_tag,
|
||||
const std::string & hf_token);
|
||||
|
||||
// clear LoRA adapters from context, then apply new list of adapters
|
||||
void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);
|
||||
|
||||
@@ -547,8 +569,10 @@ void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adap
|
||||
// Batch utils
|
||||
//
|
||||
|
||||
// DEPRECATED
|
||||
void common_batch_clear(struct llama_batch & batch);
|
||||
|
||||
// DEPRECATED
|
||||
void common_batch_add(
|
||||
struct llama_batch & batch,
|
||||
llama_token id,
|
||||
@@ -556,6 +580,66 @@ void common_batch_add(
|
||||
const std::vector<llama_seq_id> & seq_ids,
|
||||
bool logits);
|
||||
|
||||
// convenient wrapper around llama_batch_ext, to provide a way to get embeddings positions
|
||||
// this is meant to be temporary
|
||||
struct common_batch {
|
||||
llama_batch_ext_ptr batch;
|
||||
struct batch_token {
|
||||
llama_token token;
|
||||
llama_seq_id seq_id; // only support single seq for now
|
||||
bool logits;
|
||||
};
|
||||
std::vector<batch_token> tokens;
|
||||
int n_outputs = 0;
|
||||
common_batch() = default;
|
||||
common_batch(int32_t n_tokens, int32_t n_seq_max) {
|
||||
batch.reset(llama_batch_ext_init(n_tokens, n_seq_max));
|
||||
tokens.reserve(n_tokens);
|
||||
}
|
||||
void clear() {
|
||||
llama_batch_ext_clear(batch.get());
|
||||
tokens.clear();
|
||||
}
|
||||
void add_text(llama_token token, llama_pos pos, llama_seq_id seq_id, bool logits) {
|
||||
llama_batch_ext_add_text(batch.get(), token, pos, &seq_id, 1, logits);
|
||||
tokens.push_back({token, seq_id, logits});
|
||||
if (logits) {
|
||||
n_outputs++;
|
||||
}
|
||||
}
|
||||
void add_text_multi_seq(llama_token token, llama_pos pos, std::vector<llama_seq_id> seq_ids, bool logits) {
|
||||
llama_batch_ext_add_text(batch.get(), token, pos, seq_ids.data(), seq_ids.size(), logits);
|
||||
tokens.push_back({token, seq_ids[0], logits});
|
||||
if (logits) {
|
||||
n_outputs++;
|
||||
}
|
||||
}
|
||||
void set_logits_last() {
|
||||
if (!tokens.empty()) {
|
||||
llama_batch_ext_set_output_last(batch.get());
|
||||
tokens.back().logits = true;
|
||||
}
|
||||
}
|
||||
int32_t get_n_tokens() const {
|
||||
return (int32_t)tokens.size();
|
||||
}
|
||||
llama_batch_ext * get() {
|
||||
return batch.get();
|
||||
}
|
||||
common_batch get_view(int32_t offset, int32_t n_tokens) {
|
||||
common_batch view;
|
||||
view.batch = llama_batch_ext_ptr(llama_batch_ext_get_view(batch.get(), offset, n_tokens));
|
||||
view.tokens.reserve(n_tokens);
|
||||
for (int32_t i = 0; i < n_tokens; i++) {
|
||||
view.tokens.push_back(tokens[offset + i]);
|
||||
if (tokens[offset + i].logits) {
|
||||
view.n_outputs++;
|
||||
}
|
||||
}
|
||||
return view;
|
||||
}
|
||||
};
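A hedged usage sketch for the common_batch wrapper above (variable names are illustrative; error handling omitted):

//   common_batch batch(/* n_tokens = */ 512, /* n_seq_max = */ 1);
//   for (size_t i = 0; i < prompt_tokens.size(); ++i) {
//       batch.add_text(prompt_tokens[i], /* pos = */ i, /* seq_id = */ 0, /* logits = */ false);
//   }
//   batch.set_logits_last();               // request logits only for the last token
//   llama_decode_ext(lctx, batch.get());   // decode through the llama_batch_ext API
//   // batch.n_outputs now reflects how many positions will have logits (here: 1)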
|
||||
|
||||
//
|
||||
// Token utils
|
||||
//
|
||||
|
||||
@@ -11,24 +11,25 @@ struct llama_sampler_llg {
|
||||
std::string grammar_kind;
|
||||
std::string grammar_data;
|
||||
LlgTokenizer * tokenizer;
|
||||
LlgMatcher * grammar;
|
||||
LlgConstraint * grammar;
|
||||
LlgMaskResult llg_res;
|
||||
bool has_llg_res;
|
||||
};
|
||||
|
||||
static LlgMatcher * llama_sampler_llg_new(LlgTokenizer * tokenizer, const char * grammar_kind,
|
||||
const char * grammar_data) {
|
||||
static LlgConstraint * llama_sampler_llg_new(LlgTokenizer * tokenizer, const char * grammar_kind,
|
||||
const char * grammar_data) {
|
||||
LlgConstraintInit cinit;
|
||||
llg_constraint_init_set_defaults(&cinit, tokenizer);
|
||||
const char * log_level = getenv("LLGUIDANCE_LOG_LEVEL");
|
||||
if (log_level && *log_level) {
|
||||
cinit.log_stderr_level = atoi(log_level);
|
||||
}
|
||||
auto c = llg_new_matcher(&cinit, grammar_kind, grammar_data);
|
||||
if (llg_matcher_get_error(c)) {
|
||||
LOG_ERR("llg error: %s\n", llg_matcher_get_error(c));
|
||||
llg_free_matcher(c);
|
||||
auto c = llg_new_constraint_any(&cinit, grammar_kind, grammar_data);
|
||||
if (llg_get_error(c)) {
|
||||
LOG_ERR("llg error: %s\n", llg_get_error(c));
|
||||
llg_free_constraint(c);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
return c;
|
||||
}
|
||||
|
||||
@@ -39,29 +40,39 @@ static const char * llama_sampler_llg_name(const llama_sampler * /*smpl*/) {
|
||||
static void llama_sampler_llg_accept_impl(llama_sampler * smpl, llama_token token) {
|
||||
auto * ctx = (llama_sampler_llg *) smpl->ctx;
|
||||
if (ctx->grammar) {
|
||||
llg_matcher_consume_token(ctx->grammar, token);
|
||||
LlgCommitResult res;
|
||||
llg_commit_token(ctx->grammar, token, &res);
|
||||
ctx->has_llg_res = false;
|
||||
}
|
||||
}
|
||||
|
||||
static void llama_sampler_llg_apply(llama_sampler * smpl, llama_token_data_array * cur_p) {
|
||||
auto * ctx = (llama_sampler_llg *) smpl->ctx;
|
||||
if (ctx->grammar) {
|
||||
const uint32_t * mask = llg_matcher_get_mask(ctx->grammar);
|
||||
if (mask == nullptr) {
|
||||
if (llg_matcher_compute_mask(ctx->grammar) == 0) {
|
||||
mask = llg_matcher_get_mask(ctx->grammar);
|
||||
if (!ctx->has_llg_res) {
|
||||
if (llg_compute_mask(ctx->grammar, &ctx->llg_res) == 0) {
|
||||
ctx->has_llg_res = true;
|
||||
} else {
|
||||
LOG_ERR("llg error: %s\n", llg_matcher_get_error(ctx->grammar));
|
||||
llg_free_matcher(ctx->grammar);
|
||||
LOG_ERR("llg error: %s\n", llg_get_error(ctx->grammar));
|
||||
llg_free_constraint(ctx->grammar);
|
||||
ctx->grammar = nullptr;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < cur_p->size; ++i) {
|
||||
auto token = cur_p->data[i].id;
|
||||
if ((mask[token / 32] & (1 << (token % 32))) == 0) {
|
||||
cur_p->data[i].logit = -INFINITY;
|
||||
if (ctx->has_llg_res) {
|
||||
if (ctx->llg_res.is_stop) {
|
||||
for (size_t i = 0; i < cur_p->size; ++i) {
|
||||
if (!llama_vocab_is_eog(ctx->vocab, cur_p->data[i].id)) {
|
||||
cur_p->data[i].logit = -INFINITY;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
const uint32_t * mask = ctx->llg_res.sample_mask;
|
||||
for (size_t i = 0; i < cur_p->size; ++i) {
|
||||
auto token = cur_p->data[i].id;
|
||||
if ((mask[token / 32] & (1 << (token % 32))) == 0) {
|
||||
cur_p->data[i].logit = -INFINITY;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -69,9 +80,14 @@ static void llama_sampler_llg_apply(llama_sampler * smpl, llama_token_data_array
|
||||
|
||||
static void llama_sampler_llg_reset(llama_sampler * smpl) {
|
||||
auto * ctx = (llama_sampler_llg *) smpl->ctx;
|
||||
if (ctx->grammar) {
|
||||
llg_matcher_reset(ctx->grammar);
|
||||
if (!ctx->grammar) {
|
||||
return;
|
||||
}
|
||||
|
||||
auto * grammar_new = llama_sampler_llg_new(ctx->tokenizer, ctx->grammar_kind.c_str(), ctx->grammar_data.c_str());
|
||||
llg_free_constraint(ctx->grammar);
|
||||
ctx->grammar = grammar_new;
|
||||
ctx->has_llg_res = false;
|
||||
}
|
||||
|
||||
static llama_sampler * llama_sampler_llg_clone(const llama_sampler * smpl) {
|
||||
@@ -86,7 +102,7 @@ static llama_sampler * llama_sampler_llg_clone(const llama_sampler * smpl) {
|
||||
if (ctx->grammar) {
|
||||
result_ctx->grammar_kind = ctx->grammar_kind;
|
||||
result_ctx->grammar_data = ctx->grammar_data;
|
||||
result_ctx->grammar = llg_clone_matcher(ctx->grammar);
|
||||
result_ctx->grammar = llg_clone_constraint(ctx->grammar);
|
||||
result_ctx->tokenizer = llg_clone_tokenizer(ctx->tokenizer);
|
||||
}
|
||||
}
|
||||
@@ -98,7 +114,7 @@ static void llama_sampler_llg_free(llama_sampler * smpl) {
|
||||
const auto * ctx = (llama_sampler_llg *) smpl->ctx;
|
||||
|
||||
if (ctx->grammar) {
|
||||
llg_free_matcher(ctx->grammar);
|
||||
llg_free_constraint(ctx->grammar);
|
||||
llg_free_tokenizer(ctx->tokenizer);
|
||||
}
|
||||
|
||||
@@ -223,11 +239,9 @@ llama_sampler * llama_sampler_init_llg(const llama_vocab * vocab, const char * g
|
||||
/* .grammar_data = */ grammar_data,
|
||||
/* .tokenizer = */ tokenizer,
|
||||
/* .grammar = */ llama_sampler_llg_new(tokenizer, grammar_kind, grammar_data),
|
||||
/* .llg_res = */ {},
|
||||
/* .has_llg_res = */ false,
|
||||
};
|
||||
if (ctx->grammar) {
|
||||
GGML_ASSERT(((size_t) llama_vocab_n_tokens(vocab) + 31) / 32 * 4 ==
|
||||
llg_matcher_get_mask_byte_size(ctx->grammar));
|
||||
}
|
||||
} else {
|
||||
*ctx = {
|
||||
/* .vocab = */ vocab,
|
||||
@@ -235,12 +249,15 @@ llama_sampler * llama_sampler_init_llg(const llama_vocab * vocab, const char * g
|
||||
/* .grammar_data = */ {},
|
||||
/* .tokenizer = */ nullptr,
|
||||
/* .grammar = */ nullptr,
|
||||
/* .llg_res = */ {},
|
||||
/* .has_llg_res = */ false,
|
||||
};
|
||||
}
|
||||
|
||||
return llama_sampler_init(
|
||||
/* .iface = */ &llama_sampler_llg_i,
|
||||
/* .ctx = */ ctx);
|
||||
/* .ctx = */ ctx
|
||||
);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
@@ -9,19 +9,10 @@
|
||||
#pragma once
|
||||
|
||||
#include "minja.hpp"
|
||||
|
||||
#include <chrono>
|
||||
#include <cstddef>
|
||||
#include <cstdio>
|
||||
#include <exception>
|
||||
#include <iomanip>
|
||||
#include <memory>
|
||||
#include <sstream>
|
||||
#include <json.hpp>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include <json.hpp>
|
||||
|
||||
using json = nlohmann::ordered_json;
|
||||
|
||||
namespace minja {
|
||||
@@ -434,7 +425,7 @@ class chat_template {
|
||||
auto obj = json {
|
||||
{"tool_calls", tool_calls},
|
||||
};
|
||||
if (!content.is_null() && !content.empty()) {
|
||||
if (!content.is_null() && content != "") {
|
||||
obj["content"] = content;
|
||||
}
|
||||
message["content"] = obj.dump(2);
|
||||
@@ -444,12 +435,13 @@ class chat_template {
|
||||
if (polyfill_tool_responses && role == "tool") {
|
||||
message["role"] = "user";
|
||||
auto obj = json {
|
||||
{"tool_response", json::object()},
|
||||
{"tool_response", {
|
||||
{"content", message.at("content")},
|
||||
}},
|
||||
};
|
||||
if (message.contains("name")) {
|
||||
obj["tool_response"]["tool"] = message.at("name");
|
||||
obj["tool_response"]["name"] = message.at("name");
|
||||
}
|
||||
obj["tool_response"]["content"] = message.at("content");
|
||||
if (message.contains("tool_call_id")) {
|
||||
obj["tool_response"]["tool_call_id"] = message.at("tool_call_id");
|
||||
}
|
||||
@@ -518,7 +510,7 @@ class chat_template {
|
||||
static nlohmann::ordered_json add_system(const nlohmann::ordered_json & messages, const std::string & system_prompt) {
|
||||
json messages_with_system = messages;
|
||||
|
||||
if (!messages_with_system.empty() && messages_with_system[0].at("role") == "system") {
|
||||
if (messages_with_system.size() > 0 && messages_with_system[0].at("role") == "system") {
|
||||
std::string existing_system = messages_with_system.at(0).at("content");
|
||||
messages_with_system[0] = json {
|
||||
{"role", "system"},
|
||||
|
||||
@@ -8,26 +8,14 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
#pragma once
|
||||
|
||||
#include <algorithm>
|
||||
#include <cctype>
|
||||
#include <cstddef>
|
||||
#include <cmath>
|
||||
#include <exception>
|
||||
#include <functional>
|
||||
#include <iostream>
|
||||
#include <iterator>
|
||||
#include <limits>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <regex>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <stdexcept>
|
||||
#include <unordered_map>
|
||||
#include <unordered_set>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include <regex>
|
||||
#include <memory>
|
||||
#include <stdexcept>
|
||||
#include <sstream>
|
||||
#include <unordered_set>
|
||||
#include <json.hpp>
|
||||
|
||||
using json = nlohmann::ordered_json;
|
||||
@@ -743,51 +731,51 @@ public:
|
||||
|
||||
struct TextTemplateToken : public TemplateToken {
|
||||
std::string text;
|
||||
TextTemplateToken(const Location & loc, SpaceHandling pre, SpaceHandling post, const std::string& t) : TemplateToken(Type::Text, loc, pre, post), text(t) {}
|
||||
TextTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post, const std::string& t) : TemplateToken(Type::Text, location, pre, post), text(t) {}
|
||||
};
|
||||
|
||||
struct ExpressionTemplateToken : public TemplateToken {
|
||||
std::shared_ptr<Expression> expr;
|
||||
ExpressionTemplateToken(const Location & loc, SpaceHandling pre, SpaceHandling post, std::shared_ptr<Expression> && e) : TemplateToken(Type::Expression, loc, pre, post), expr(std::move(e)) {}
|
||||
ExpressionTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post, std::shared_ptr<Expression> && e) : TemplateToken(Type::Expression, location, pre, post), expr(std::move(e)) {}
|
||||
};
|
||||
|
||||
struct IfTemplateToken : public TemplateToken {
|
||||
std::shared_ptr<Expression> condition;
|
||||
IfTemplateToken(const Location & loc, SpaceHandling pre, SpaceHandling post, std::shared_ptr<Expression> && c) : TemplateToken(Type::If, loc, pre, post), condition(std::move(c)) {}
|
||||
IfTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post, std::shared_ptr<Expression> && c) : TemplateToken(Type::If, location, pre, post), condition(std::move(c)) {}
|
||||
};
|
||||
|
||||
struct ElifTemplateToken : public TemplateToken {
|
||||
std::shared_ptr<Expression> condition;
|
||||
ElifTemplateToken(const Location & loc, SpaceHandling pre, SpaceHandling post, std::shared_ptr<Expression> && c) : TemplateToken(Type::Elif, loc, pre, post), condition(std::move(c)) {}
|
||||
ElifTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post, std::shared_ptr<Expression> && c) : TemplateToken(Type::Elif, location, pre, post), condition(std::move(c)) {}
|
||||
};
|
||||
|
||||
struct ElseTemplateToken : public TemplateToken {
|
||||
ElseTemplateToken(const Location & loc, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::Else, loc, pre, post) {}
|
||||
ElseTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::Else, location, pre, post) {}
|
||||
};
|
||||
|
||||
struct EndIfTemplateToken : public TemplateToken {
|
||||
EndIfTemplateToken(const Location & loc, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::EndIf, loc, pre, post) {}
|
||||
EndIfTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::EndIf, location, pre, post) {}
|
||||
};
|
||||
|
||||
struct MacroTemplateToken : public TemplateToken {
|
||||
std::shared_ptr<VariableExpr> name;
|
||||
Expression::Parameters params;
|
||||
MacroTemplateToken(const Location & loc, SpaceHandling pre, SpaceHandling post, std::shared_ptr<VariableExpr> && n, Expression::Parameters && p)
|
||||
: TemplateToken(Type::Macro, loc, pre, post), name(std::move(n)), params(std::move(p)) {}
|
||||
MacroTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post, std::shared_ptr<VariableExpr> && n, Expression::Parameters && p)
|
||||
: TemplateToken(Type::Macro, location, pre, post), name(std::move(n)), params(std::move(p)) {}
|
||||
};
|
||||
|
||||
struct EndMacroTemplateToken : public TemplateToken {
|
||||
EndMacroTemplateToken(const Location & loc, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::EndMacro, loc, pre, post) {}
|
||||
EndMacroTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::EndMacro, location, pre, post) {}
|
||||
};
|
||||
|
||||
struct FilterTemplateToken : public TemplateToken {
|
||||
std::shared_ptr<Expression> filter;
|
||||
FilterTemplateToken(const Location & loc, SpaceHandling pre, SpaceHandling post, std::shared_ptr<Expression> && filter)
|
||||
: TemplateToken(Type::Filter, loc, pre, post), filter(std::move(filter)) {}
|
||||
FilterTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post, std::shared_ptr<Expression> && filter)
|
||||
: TemplateToken(Type::Filter, location, pre, post), filter(std::move(filter)) {}
|
||||
};
|
||||
|
||||
struct EndFilterTemplateToken : public TemplateToken {
|
||||
EndFilterTemplateToken(const Location & loc, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::EndFilter, loc, pre, post) {}
|
||||
EndFilterTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::EndFilter, location, pre, post) {}
|
||||
};
|
||||
|
||||
struct ForTemplateToken : public TemplateToken {
|
||||
@@ -795,38 +783,38 @@ struct ForTemplateToken : public TemplateToken {
|
||||
std::shared_ptr<Expression> iterable;
|
||||
std::shared_ptr<Expression> condition;
|
||||
bool recursive;
|
||||
ForTemplateToken(const Location & loc, SpaceHandling pre, SpaceHandling post, const std::vector<std::string> & vns, std::shared_ptr<Expression> && iter,
|
||||
ForTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post, const std::vector<std::string> & vns, std::shared_ptr<Expression> && iter,
|
||||
std::shared_ptr<Expression> && c, bool r)
|
||||
: TemplateToken(Type::For, loc, pre, post), var_names(vns), iterable(std::move(iter)), condition(std::move(c)), recursive(r) {}
|
||||
: TemplateToken(Type::For, location, pre, post), var_names(vns), iterable(std::move(iter)), condition(std::move(c)), recursive(r) {}
|
||||
};
|
||||
|
||||
struct EndForTemplateToken : public TemplateToken {
|
||||
EndForTemplateToken(const Location & loc, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::EndFor, loc, pre, post) {}
|
||||
EndForTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::EndFor, location, pre, post) {}
|
||||
};
|
||||
|
||||
struct GenerationTemplateToken : public TemplateToken {
|
||||
GenerationTemplateToken(const Location & loc, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::Generation, loc, pre, post) {}
|
||||
GenerationTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::Generation, location, pre, post) {}
|
||||
};
|
||||
|
||||
struct EndGenerationTemplateToken : public TemplateToken {
|
||||
EndGenerationTemplateToken(const Location & loc, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::EndGeneration, loc, pre, post) {}
|
||||
EndGenerationTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::EndGeneration, location, pre, post) {}
|
||||
};
|
||||
|
||||
struct SetTemplateToken : public TemplateToken {
|
||||
std::string ns;
|
||||
std::vector<std::string> var_names;
|
||||
std::shared_ptr<Expression> value;
|
||||
SetTemplateToken(const Location & loc, SpaceHandling pre, SpaceHandling post, const std::string & ns, const std::vector<std::string> & vns, std::shared_ptr<Expression> && v)
|
||||
: TemplateToken(Type::Set, loc, pre, post), ns(ns), var_names(vns), value(std::move(v)) {}
|
||||
SetTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post, const std::string & ns, const std::vector<std::string> & vns, std::shared_ptr<Expression> && v)
|
||||
: TemplateToken(Type::Set, location, pre, post), ns(ns), var_names(vns), value(std::move(v)) {}
|
||||
};
|
||||
|
||||
struct EndSetTemplateToken : public TemplateToken {
|
||||
EndSetTemplateToken(const Location & loc, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::EndSet, loc, pre, post) {}
|
||||
EndSetTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post) : TemplateToken(Type::EndSet, location, pre, post) {}
|
||||
};
|
||||
|
||||
struct CommentTemplateToken : public TemplateToken {
|
||||
std::string text;
|
||||
CommentTemplateToken(const Location & loc, SpaceHandling pre, SpaceHandling post, const std::string& t) : TemplateToken(Type::Comment, loc, pre, post), text(t) {}
|
||||
CommentTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post, const std::string& t) : TemplateToken(Type::Comment, location, pre, post), text(t) {}
|
||||
};
|
||||
|
||||
enum class LoopControlType { Break, Continue };
|
||||
@@ -842,7 +830,7 @@ public:
|
||||
|
||||
struct LoopControlTemplateToken : public TemplateToken {
|
||||
LoopControlType control_type;
|
||||
LoopControlTemplateToken(const Location & loc, SpaceHandling pre, SpaceHandling post, LoopControlType control_type) : TemplateToken(Type::Break, loc, pre, post), control_type(control_type) {}
|
||||
LoopControlTemplateToken(const Location & location, SpaceHandling pre, SpaceHandling post, LoopControlType control_type) : TemplateToken(Type::Break, location, pre, post), control_type(control_type) {}
|
||||
};
|
||||
|
||||
class TemplateNode {
|
||||
@@ -880,8 +868,8 @@ public:
|
||||
class SequenceNode : public TemplateNode {
|
||||
std::vector<std::shared_ptr<TemplateNode>> children;
|
||||
public:
|
||||
SequenceNode(const Location & loc, std::vector<std::shared_ptr<TemplateNode>> && c)
|
||||
: TemplateNode(loc), children(std::move(c)) {}
|
||||
SequenceNode(const Location & location, std::vector<std::shared_ptr<TemplateNode>> && c)
|
||||
: TemplateNode(location), children(std::move(c)) {}
|
||||
void do_render(std::ostringstream & out, const std::shared_ptr<Context> & context) const override {
|
||||
for (const auto& child : children) child->render(out, context);
|
||||
}
|
||||
@@ -890,7 +878,7 @@ public:
|
||||
class TextNode : public TemplateNode {
|
||||
std::string text;
|
||||
public:
|
||||
TextNode(const Location & loc, const std::string& t) : TemplateNode(loc), text(t) {}
|
||||
TextNode(const Location & location, const std::string& t) : TemplateNode(location), text(t) {}
|
||||
void do_render(std::ostringstream & out, const std::shared_ptr<Context> &) const override {
|
||||
out << text;
|
||||
}
|
||||
@@ -899,7 +887,7 @@ public:
|
||||
class ExpressionNode : public TemplateNode {
|
||||
std::shared_ptr<Expression> expr;
|
||||
public:
|
||||
ExpressionNode(const Location & loc, std::shared_ptr<Expression> && e) : TemplateNode(loc), expr(std::move(e)) {}
|
||||
ExpressionNode(const Location & location, std::shared_ptr<Expression> && e) : TemplateNode(location), expr(std::move(e)) {}
|
||||
void do_render(std::ostringstream & out, const std::shared_ptr<Context> & context) const override {
|
||||
if (!expr) throw std::runtime_error("ExpressionNode.expr is null");
|
||||
auto result = expr->evaluate(context);
|
||||
@@ -916,8 +904,8 @@ public:
|
||||
class IfNode : public TemplateNode {
|
||||
std::vector<std::pair<std::shared_ptr<Expression>, std::shared_ptr<TemplateNode>>> cascade;
|
||||
public:
|
||||
IfNode(const Location & loc, std::vector<std::pair<std::shared_ptr<Expression>, std::shared_ptr<TemplateNode>>> && c)
|
||||
: TemplateNode(loc), cascade(std::move(c)) {}
|
||||
IfNode(const Location & location, std::vector<std::pair<std::shared_ptr<Expression>, std::shared_ptr<TemplateNode>>> && c)
|
||||
: TemplateNode(location), cascade(std::move(c)) {}
|
||||
void do_render(std::ostringstream & out, const std::shared_ptr<Context> & context) const override {
|
||||
for (const auto& branch : cascade) {
|
||||
auto enter_branch = true;
|
||||
@@ -936,7 +924,7 @@ public:
|
||||
class LoopControlNode : public TemplateNode {
|
||||
LoopControlType control_type_;
|
||||
public:
|
||||
LoopControlNode(const Location & loc, LoopControlType control_type) : TemplateNode(loc), control_type_(control_type) {}
|
||||
LoopControlNode(const Location & location, LoopControlType control_type) : TemplateNode(location), control_type_(control_type) {}
|
||||
void do_render(std::ostringstream &, const std::shared_ptr<Context> &) const override {
|
||||
throw LoopControlException(control_type_);
|
||||
}
|
||||
@@ -950,9 +938,9 @@ class ForNode : public TemplateNode {
|
||||
bool recursive;
|
||||
std::shared_ptr<TemplateNode> else_body;
|
||||
public:
|
||||
ForNode(const Location & loc, std::vector<std::string> && var_names, std::shared_ptr<Expression> && iterable,
|
||||
ForNode(const Location & location, std::vector<std::string> && var_names, std::shared_ptr<Expression> && iterable,
|
||||
std::shared_ptr<Expression> && condition, std::shared_ptr<TemplateNode> && body, bool recursive, std::shared_ptr<TemplateNode> && else_body)
|
||||
: TemplateNode(loc), var_names(var_names), iterable(std::move(iterable)), condition(std::move(condition)), body(std::move(body)), recursive(recursive), else_body(std::move(else_body)) {}
|
||||
: TemplateNode(location), var_names(var_names), iterable(std::move(iterable)), condition(std::move(condition)), body(std::move(body)), recursive(recursive), else_body(std::move(else_body)) {}
|
||||
|
||||
void do_render(std::ostringstream & out, const std::shared_ptr<Context> & context) const override {
|
||||
// https://jinja.palletsprojects.com/en/3.0.x/templates/#for
|
||||
@@ -1037,8 +1025,8 @@ class MacroNode : public TemplateNode {
|
||||
std::shared_ptr<TemplateNode> body;
|
||||
std::unordered_map<std::string, size_t> named_param_positions;
|
||||
public:
|
||||
MacroNode(const Location & loc, std::shared_ptr<VariableExpr> && n, Expression::Parameters && p, std::shared_ptr<TemplateNode> && b)
|
||||
: TemplateNode(loc), name(std::move(n)), params(std::move(p)), body(std::move(b)) {
|
||||
MacroNode(const Location & location, std::shared_ptr<VariableExpr> && n, Expression::Parameters && p, std::shared_ptr<TemplateNode> && b)
|
||||
: TemplateNode(location), name(std::move(n)), params(std::move(p)), body(std::move(b)) {
|
||||
for (size_t i = 0; i < params.size(); ++i) {
|
||||
const auto & name = params[i].first;
|
||||
if (!name.empty()) {
|
||||
@@ -1084,8 +1072,8 @@ class FilterNode : public TemplateNode {
|
||||
std::shared_ptr<TemplateNode> body;
|
||||
|
||||
public:
|
||||
FilterNode(const Location & loc, std::shared_ptr<Expression> && f, std::shared_ptr<TemplateNode> && b)
|
||||
: TemplateNode(loc), filter(std::move(f)), body(std::move(b)) {}
|
||||
FilterNode(const Location & location, std::shared_ptr<Expression> && f, std::shared_ptr<TemplateNode> && b)
|
||||
: TemplateNode(location), filter(std::move(f)), body(std::move(b)) {}
|
||||
|
||||
void do_render(std::ostringstream & out, const std::shared_ptr<Context> & context) const override {
|
||||
if (!filter) throw std::runtime_error("FilterNode.filter is null");
|
||||
@@ -1107,8 +1095,8 @@ class SetNode : public TemplateNode {
|
||||
std::vector<std::string> var_names;
|
||||
std::shared_ptr<Expression> value;
|
||||
public:
|
||||
SetNode(const Location & loc, const std::string & ns, const std::vector<std::string> & vns, std::shared_ptr<Expression> && v)
|
||||
: TemplateNode(loc), ns(ns), var_names(vns), value(std::move(v)) {}
|
||||
SetNode(const Location & location, const std::string & ns, const std::vector<std::string> & vns, std::shared_ptr<Expression> && v)
|
||||
: TemplateNode(location), ns(ns), var_names(vns), value(std::move(v)) {}
|
||||
void do_render(std::ostringstream &, const std::shared_ptr<Context> & context) const override {
|
||||
if (!value) throw std::runtime_error("SetNode.value is null");
|
||||
if (!ns.empty()) {
|
||||
@@ -1130,8 +1118,8 @@ class SetTemplateNode : public TemplateNode {
|
||||
std::string name;
|
||||
std::shared_ptr<TemplateNode> template_value;
|
||||
public:
|
||||
SetTemplateNode(const Location & loc, const std::string & name, std::shared_ptr<TemplateNode> && tv)
|
||||
: TemplateNode(loc), name(name), template_value(std::move(tv)) {}
|
||||
SetTemplateNode(const Location & location, const std::string & name, std::shared_ptr<TemplateNode> && tv)
|
||||
: TemplateNode(location), name(name), template_value(std::move(tv)) {}
|
||||
void do_render(std::ostringstream &, const std::shared_ptr<Context> & context) const override {
|
||||
if (!template_value) throw std::runtime_error("SetTemplateNode.template_value is null");
|
||||
Value value { template_value->render(context) };
|
||||
@@ -1144,8 +1132,8 @@ class IfExpr : public Expression {
|
||||
std::shared_ptr<Expression> then_expr;
|
||||
std::shared_ptr<Expression> else_expr;
|
||||
public:
|
||||
IfExpr(const Location & loc, std::shared_ptr<Expression> && c, std::shared_ptr<Expression> && t, std::shared_ptr<Expression> && e)
|
||||
: Expression(loc), condition(std::move(c)), then_expr(std::move(t)), else_expr(std::move(e)) {}
|
||||
IfExpr(const Location & location, std::shared_ptr<Expression> && c, std::shared_ptr<Expression> && t, std::shared_ptr<Expression> && e)
|
||||
: Expression(location), condition(std::move(c)), then_expr(std::move(t)), else_expr(std::move(e)) {}
|
||||
Value do_evaluate(const std::shared_ptr<Context> & context) const override {
|
||||
if (!condition) throw std::runtime_error("IfExpr.condition is null");
|
||||
if (!then_expr) throw std::runtime_error("IfExpr.then_expr is null");
|
||||
@@ -1162,16 +1150,16 @@ public:
|
||||
class LiteralExpr : public Expression {
|
||||
Value value;
|
||||
public:
|
||||
LiteralExpr(const Location & loc, const Value& v)
|
||||
: Expression(loc), value(v) {}
|
||||
LiteralExpr(const Location & location, const Value& v)
|
||||
: Expression(location), value(v) {}
|
||||
Value do_evaluate(const std::shared_ptr<Context> &) const override { return value; }
|
||||
};
|
||||
|
||||
class ArrayExpr : public Expression {
|
||||
std::vector<std::shared_ptr<Expression>> elements;
|
||||
public:
|
||||
ArrayExpr(const Location & loc, std::vector<std::shared_ptr<Expression>> && e)
|
||||
: Expression(loc), elements(std::move(e)) {}
|
||||
ArrayExpr(const Location & location, std::vector<std::shared_ptr<Expression>> && e)
|
||||
: Expression(location), elements(std::move(e)) {}
|
||||
Value do_evaluate(const std::shared_ptr<Context> & context) const override {
|
||||
auto result = Value::array();
|
||||
for (const auto& e : elements) {
|
||||
@@ -1185,8 +1173,8 @@ public:
|
||||
class DictExpr : public Expression {
|
||||
std::vector<std::pair<std::shared_ptr<Expression>, std::shared_ptr<Expression>>> elements;
|
||||
public:
|
||||
DictExpr(const Location & loc, std::vector<std::pair<std::shared_ptr<Expression>, std::shared_ptr<Expression>>> && e)
|
||||
: Expression(loc), elements(std::move(e)) {}
|
||||
DictExpr(const Location & location, std::vector<std::pair<std::shared_ptr<Expression>, std::shared_ptr<Expression>>> && e)
|
||||
: Expression(location), elements(std::move(e)) {}
|
||||
Value do_evaluate(const std::shared_ptr<Context> & context) const override {
|
||||
auto result = Value::object();
|
||||
for (const auto& [key, value] : elements) {
|
||||
@@ -1201,8 +1189,8 @@ public:
|
||||
class SliceExpr : public Expression {
|
||||
public:
|
||||
std::shared_ptr<Expression> start, end;
|
||||
SliceExpr(const Location & loc, std::shared_ptr<Expression> && s, std::shared_ptr<Expression> && e)
|
||||
: Expression(loc), start(std::move(s)), end(std::move(e)) {}
|
||||
SliceExpr(const Location & location, std::shared_ptr<Expression> && s, std::shared_ptr<Expression> && e)
|
||||
: Expression(location), start(std::move(s)), end(std::move(e)) {}
|
||||
Value do_evaluate(const std::shared_ptr<Context> &) const override {
|
||||
throw std::runtime_error("SliceExpr not implemented");
|
||||
}
|
||||
@@ -1212,8 +1200,8 @@ class SubscriptExpr : public Expression {
|
||||
std::shared_ptr<Expression> base;
|
||||
std::shared_ptr<Expression> index;
|
||||
public:
|
||||
SubscriptExpr(const Location & loc, std::shared_ptr<Expression> && b, std::shared_ptr<Expression> && i)
|
||||
: Expression(loc), base(std::move(b)), index(std::move(i)) {}
|
||||
SubscriptExpr(const Location & location, std::shared_ptr<Expression> && b, std::shared_ptr<Expression> && i)
|
||||
: Expression(location), base(std::move(b)), index(std::move(i)) {}
|
||||
Value do_evaluate(const std::shared_ptr<Context> & context) const override {
|
||||
if (!base) throw std::runtime_error("SubscriptExpr.base is null");
|
||||
if (!index) throw std::runtime_error("SubscriptExpr.index is null");
|
||||
@@ -1255,8 +1243,8 @@ public:
|
||||
enum class Op { Plus, Minus, LogicalNot, Expansion, ExpansionDict };
|
||||
std::shared_ptr<Expression> expr;
|
||||
Op op;
|
||||
UnaryOpExpr(const Location & loc, std::shared_ptr<Expression> && e, Op o)
|
||||
: Expression(loc), expr(std::move(e)), op(o) {}
|
||||
UnaryOpExpr(const Location & location, std::shared_ptr<Expression> && e, Op o)
|
||||
: Expression(location), expr(std::move(e)), op(o) {}
|
||||
Value do_evaluate(const std::shared_ptr<Context> & context) const override {
|
||||
if (!expr) throw std::runtime_error("UnaryOpExpr.expr is null");
|
||||
auto e = expr->evaluate(context);
|
||||
@@ -1281,8 +1269,8 @@ private:
|
||||
std::shared_ptr<Expression> right;
|
||||
Op op;
|
||||
public:
|
||||
BinaryOpExpr(const Location & loc, std::shared_ptr<Expression> && l, std::shared_ptr<Expression> && r, Op o)
|
||||
: Expression(loc), left(std::move(l)), right(std::move(r)), op(o) {}
|
||||
BinaryOpExpr(const Location & location, std::shared_ptr<Expression> && l, std::shared_ptr<Expression> && r, Op o)
|
||||
: Expression(location), left(std::move(l)), right(std::move(r)), op(o) {}
|
||||
Value do_evaluate(const std::shared_ptr<Context> & context) const override {
|
||||
if (!left) throw std::runtime_error("BinaryOpExpr.left is null");
|
||||
if (!right) throw std::runtime_error("BinaryOpExpr.right is null");
|
||||
@@ -1439,8 +1427,8 @@ class MethodCallExpr : public Expression {
|
||||
std::shared_ptr<VariableExpr> method;
|
||||
ArgumentsExpression args;
|
||||
public:
|
||||
MethodCallExpr(const Location & loc, std::shared_ptr<Expression> && obj, std::shared_ptr<VariableExpr> && m, ArgumentsExpression && a)
|
||||
: Expression(loc), object(std::move(obj)), method(std::move(m)), args(std::move(a)) {}
|
||||
MethodCallExpr(const Location & location, std::shared_ptr<Expression> && obj, std::shared_ptr<VariableExpr> && m, ArgumentsExpression && a)
|
||||
: Expression(location), object(std::move(obj)), method(std::move(m)), args(std::move(a)) {}
|
||||
Value do_evaluate(const std::shared_ptr<Context> & context) const override {
|
||||
if (!object) throw std::runtime_error("MethodCallExpr.object is null");
|
||||
if (!method) throw std::runtime_error("MethodCallExpr.method is null");
|
||||
@@ -1538,8 +1526,8 @@ class CallExpr : public Expression {
|
||||
public:
|
||||
std::shared_ptr<Expression> object;
|
||||
ArgumentsExpression args;
|
||||
CallExpr(const Location & loc, std::shared_ptr<Expression> && obj, ArgumentsExpression && a)
|
||||
: Expression(loc), object(std::move(obj)), args(std::move(a)) {}
|
||||
CallExpr(const Location & location, std::shared_ptr<Expression> && obj, ArgumentsExpression && a)
|
||||
: Expression(location), object(std::move(obj)), args(std::move(a)) {}
|
||||
Value do_evaluate(const std::shared_ptr<Context> & context) const override {
|
||||
if (!object) throw std::runtime_error("CallExpr.object is null");
|
||||
auto obj = object->evaluate(context);
|
||||
@@ -1554,8 +1542,8 @@ public:
|
||||
class FilterExpr : public Expression {
|
||||
std::vector<std::shared_ptr<Expression>> parts;
|
||||
public:
|
||||
FilterExpr(const Location & loc, std::vector<std::shared_ptr<Expression>> && p)
|
||||
: Expression(loc), parts(std::move(p)) {}
|
||||
FilterExpr(const Location & location, std::vector<std::shared_ptr<Expression>> && p)
|
||||
: Expression(location), parts(std::move(p)) {}
|
||||
Value do_evaluate(const std::shared_ptr<Context> & context) const override {
|
||||
Value result;
|
||||
bool first = true;
|
||||
@@ -2472,7 +2460,7 @@ private:
|
||||
static std::regex leading_space_regex(R"(^\s+)");
|
||||
text = std::regex_replace(text, leading_space_regex, "");
|
||||
} else if (options.trim_blocks && (it - 1) != begin && !dynamic_cast<ExpressionTemplateToken*>((*(it - 2)).get())) {
|
||||
if (!text.empty() && text[0] == '\n') {
|
||||
if (text.length() > 0 && text[0] == '\n') {
|
||||
text.erase(0, 1);
|
||||
}
|
||||
}
|
||||
@@ -2550,7 +2538,7 @@ public:
|
||||
TemplateTokenIterator begin = tokens.begin();
|
||||
auto it = begin;
|
||||
TemplateTokenIterator end = tokens.end();
|
||||
return parser.parseTemplate(begin, it, end, /* fully= */ true);
|
||||
return parser.parseTemplate(begin, it, end, /* full= */ true);
|
||||
}
|
||||
};
|
||||
|
||||
@@ -2589,7 +2577,7 @@ inline std::shared_ptr<Context> Context::builtins() {
|
||||
throw std::runtime_error(args.at("message").get<std::string>());
|
||||
}));
|
||||
globals.set("tojson", simple_function("tojson", { "value", "indent" }, [](const std::shared_ptr<Context> &, Value & args) {
|
||||
return Value(args.at("value").dump(args.get<int64_t>("indent", -1), /* to_json= */ true));
|
||||
return Value(args.at("value").dump(args.get<int64_t>("indent", -1), /* tojson= */ true));
|
||||
}));
|
||||
globals.set("items", simple_function("items", { "object" }, [](const std::shared_ptr<Context> &, Value & args) {
|
||||
auto items = Value::array();
|
||||
@@ -2611,25 +2599,21 @@ inline std::shared_ptr<Context> Context::builtins() {
|
||||
globals.set("last", simple_function("last", { "items" }, [](const std::shared_ptr<Context> &, Value & args) {
|
||||
auto items = args.at("items");
|
||||
if (!items.is_array()) throw std::runtime_error("object is not a list");
|
||||
if (items.empty()) return Value();
|
||||
if (items.size() == 0) return Value();
|
||||
return items.at(items.size() - 1);
|
||||
}));
|
||||
globals.set("trim", simple_function("trim", { "text" }, [](const std::shared_ptr<Context> &, Value & args) {
|
||||
auto & text = args.at("text");
|
||||
return text.is_null() ? text : Value(strip(text.get<std::string>()));
|
||||
}));
|
||||
auto char_transform_function = [](const std::string & name, const std::function<char(char)> & fn) {
|
||||
return simple_function(name, { "text" }, [=](const std::shared_ptr<Context> &, Value & args) {
|
||||
auto text = args.at("text");
|
||||
if (text.is_null()) return text;
|
||||
std::string res;
|
||||
auto str = text.get<std::string>();
|
||||
std::transform(str.begin(), str.end(), std::back_inserter(res), fn);
|
||||
return Value(res);
|
||||
});
|
||||
};
|
||||
globals.set("lower", char_transform_function("lower", ::tolower));
|
||||
globals.set("upper", char_transform_function("upper", ::toupper));
|
||||
globals.set("lower", simple_function("lower", { "text" }, [](const std::shared_ptr<Context> &, Value & args) {
|
||||
auto text = args.at("text");
|
||||
if (text.is_null()) return text;
|
||||
std::string res;
|
||||
auto str = text.get<std::string>();
|
||||
std::transform(str.begin(), str.end(), std::back_inserter(res), ::tolower);
|
||||
return Value(res);
|
||||
}));
|
||||
globals.set("default", Value::callable([=](const std::shared_ptr<Context> &, ArgumentsValue & args) {
|
||||
args.expectArgs("default", {2, 3}, {0, 1});
|
||||
auto & value = args.args[0];
|
||||
@@ -2759,17 +2743,12 @@ inline std::shared_ptr<Context> Context::builtins() {
|
||||
return Value::callable([=](const std::shared_ptr<Context> & context, ArgumentsValue & args) {
|
||||
args.expectArgs(is_select ? "select" : "reject", {2, (std::numeric_limits<size_t>::max)()}, {0, 0});
|
||||
auto & items = args.args[0];
|
||||
if (items.is_null()) {
|
||||
if (items.is_null())
|
||||
return Value::array();
|
||||
}
|
||||
if (!items.is_array()) {
|
||||
throw std::runtime_error("object is not iterable: " + items.dump());
|
||||
}
|
||||
if (!items.is_array()) throw std::runtime_error("object is not iterable: " + items.dump());
|
||||
|
||||
auto filter_fn = context->get(args.args[1]);
|
||||
if (filter_fn.is_null()) {
|
||||
throw std::runtime_error("Undefined filter: " + args.args[1].dump());
|
||||
}
|
||||
if (filter_fn.is_null()) throw std::runtime_error("Undefined filter: " + args.args[1].dump());
|
||||
|
||||
auto filter_args = Value::array();
|
||||
for (size_t i = 2, n = args.args.size(); i < n; i++) {
|
||||
@@ -2891,25 +2870,20 @@ inline std::shared_ptr<Context> Context::builtins() {
|
||||
auto v = arg.get<int64_t>();
|
||||
startEndStep[i] = v;
|
||||
param_set[i] = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
for (auto & [name, value] : args.kwargs) {
|
||||
size_t i;
|
||||
if (name == "start") {
|
||||
i = 0;
|
||||
} else if (name == "end") {
|
||||
i = 1;
|
||||
} else if (name == "step") {
|
||||
i = 2;
|
||||
} else {
|
||||
throw std::runtime_error("Unknown argument " + name + " for function range");
|
||||
}
|
||||
for (auto & [name, value] : args.kwargs) {
|
||||
size_t i;
|
||||
if (name == "start") i = 0;
|
||||
else if (name == "end") i = 1;
|
||||
else if (name == "step") i = 2;
|
||||
else throw std::runtime_error("Unknown argument " + name + " for function range");
|
||||
|
||||
if (param_set[i]) {
|
||||
throw std::runtime_error("Duplicate argument " + name + " for function range");
|
||||
}
|
||||
startEndStep[i] = value.get<int64_t>();
|
||||
param_set[i] = true;
|
||||
if (param_set[i]) {
|
||||
throw std::runtime_error("Duplicate argument " + name + " for function range");
|
||||
}
|
||||
startEndStep[i] = value.get<int64_t>();
|
||||
param_set[i] = true;
|
||||
}
|
||||
if (!param_set[1]) {
|
||||
throw std::runtime_error("Missing required argument 'end' for function range");
@@ -208,9 +208,6 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
|
||||
trigger_patterns_c.data(), trigger_patterns_c.size(),
|
||||
trigger_tokens.data(), trigger_tokens.size())
|
||||
: llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root");
|
||||
if (!grmr) {
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
auto * result = new common_sampler {
|
||||
|
||||
@@ -14,7 +14,7 @@ struct common_speculative {
|
||||
struct llama_context * ctx;
|
||||
struct common_sampler * smpl;
|
||||
|
||||
llama_batch batch;
|
||||
llama_batch_ext_ptr batch;
|
||||
llama_tokens prompt;
|
||||
};
|
||||
|
||||
@@ -23,7 +23,7 @@ struct common_speculative * common_speculative_init(
|
||||
auto * result = new common_speculative {
|
||||
/* .ctx = */ ctx_dft,
|
||||
/* .smpl = */ nullptr,
|
||||
/* .batch = */ llama_batch_init(llama_n_batch(ctx_dft), 0, 1),
|
||||
/* .batch = */ llama_batch_ext_ptr(llama_batch_ext_init(llama_n_batch(ctx_dft), 1)),
|
||||
/* .prompt = */ {},
|
||||
};
|
||||
|
||||
@@ -69,8 +69,6 @@ void common_speculative_free(struct common_speculative * spec) {
|
||||
|
||||
common_sampler_free(spec->smpl);
|
||||
|
||||
llama_batch_free(spec->batch);
|
||||
|
||||
delete spec;
|
||||
}
|
||||
|
||||
@@ -151,6 +149,8 @@ llama_tokens common_speculative_gen_draft(
|
||||
|
||||
const int i_start = std::max<int>(0, (int) prompt_tgt.size() - n_ctx);
|
||||
|
||||
const llama_seq_id seq_id = 0;
|
||||
|
||||
// reuse as much as possible from the old draft context
|
||||
// ideally, the draft context should be as big as the target context and we will always reuse the entire prompt
|
||||
for (int i = 0; i < (int) prompt.size(); ++i) {
|
||||
@@ -206,40 +206,40 @@ llama_tokens common_speculative_gen_draft(
|
||||
}
|
||||
|
||||
// prepare a batch to evaluate any new tokens in the prompt
|
||||
common_batch_clear(batch);
|
||||
llama_batch_ext_clear(batch.get());
|
||||
|
||||
for (size_t i = i_start + reuse_n; i < prompt_tgt.size(); ++i) {
|
||||
//LOG_DBG("i = %d, i_start = %d, reuse_n = %d, i - i_start = %d, id = %6d\n", i, i_start, reuse_n, i - i_start, prompt_tgt[i]);
|
||||
common_batch_add(batch, prompt_tgt[i], i - i_start, { 0 }, false);
|
||||
llama_batch_ext_add_text(batch.get(), prompt_tgt[i], i - i_start, &seq_id, 1, false);
|
||||
|
||||
prompt.push_back(prompt_tgt[i]);
|
||||
}
|
||||
|
||||
// we should rarely end-up here during normal decoding
|
||||
if (batch.n_tokens > 0) {
|
||||
if (llama_batch_ext_get_n_tokens(batch.get()) > 0) {
|
||||
//LOG_DBG("%s: draft prompt batch: %s\n", __func__, string_from(ctx, batch).c_str());
|
||||
|
||||
llama_decode(ctx, batch);
|
||||
llama_decode_ext(ctx, batch.get());
|
||||
}
|
||||
|
||||
const llama_pos n_past = prompt.size();
|
||||
|
||||
LOG_DBG("%s: n_past = %d\n", __func__, n_past);
|
||||
|
||||
common_batch_clear(batch);
|
||||
common_batch_add (batch, id_last, n_past, { 0 }, true);
|
||||
llama_batch_ext_clear(batch.get());
|
||||
llama_batch_ext_add_text(batch.get(), id_last, n_past, &seq_id, 1, true);
|
||||
|
||||
prompt.push_back(id_last);
|
||||
|
||||
//LOG_DBG("%s: draft prompt: %s\n", __func__, string_from(ctx, prompt).c_str());
|
||||
|
||||
llama_decode(ctx, batch);
|
||||
llama_decode_ext(ctx, batch.get());
|
||||
|
||||
common_sampler_reset(smpl);
|
||||
|
||||
// sample n_draft tokens from the draft model
|
||||
for (int i = 0; i < params.n_draft; ++i) {
|
||||
common_batch_clear(batch);
|
||||
llama_batch_ext_clear(batch.get());
|
||||
|
||||
common_sampler_sample(smpl, ctx, 0, true);
|
||||
|
||||
@@ -266,10 +266,10 @@ llama_tokens common_speculative_gen_draft(
|
||||
break;
|
||||
}
|
||||
|
||||
common_batch_add(batch, id, n_past + i + 1, { 0 }, true);
|
||||
llama_batch_ext_add_text(batch.get(), id, n_past + i + 1, &seq_id, 1, true);
|
||||
|
||||
// evaluate the drafted tokens on the draft model
|
||||
llama_decode(ctx, batch);
|
||||
llama_decode_ext(ctx, batch.get());
|
||||
|
||||
prompt.push_back(id);
|
||||
}
|
||||
|
||||
@@ -180,8 +180,7 @@ class Model:
|
||||
extra = sorted(tensor_names_from_parts.difference(self.tensor_names))
|
||||
missing_files = sorted(set(weight_map[n] for n in missing if n in weight_map))
|
||||
if len(extra) == 0 and len(missing_files) > 0:
|
||||
raise ValueError(f"Missing or incomplete model files: {missing_files}\n"
|
||||
f"Missing tensors: {missing}")
|
||||
raise ValueError(f"Missing or incomplete model files: {missing_files}")
|
||||
else:
|
||||
raise ValueError("Mismatch between weight map and model parts for tensor names:\n"
|
||||
f"Missing tensors: {missing}\n"
|
||||
@@ -529,8 +528,6 @@ class Model:
|
||||
reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
|
||||
added_vocab = tokenizer.get_added_vocab()
|
||||
|
||||
added_tokens_decoder = tokenizer.added_tokens_decoder
|
||||
|
||||
for i in range(vocab_size):
|
||||
if i not in reverse_vocab:
|
||||
tokens.append(f"[PAD{i}]")
|
||||
@@ -540,13 +537,13 @@ class Model:
|
||||
if token in added_vocab:
|
||||
# The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized.
|
||||
# To avoid unexpected issues - we make sure to normalize non-normalized tokens
|
||||
if not added_tokens_decoder[i].normalized:
|
||||
if not tokenizer.added_tokens_decoder[i].normalized:
|
||||
previous_token = token
|
||||
token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
|
||||
if previous_token != token:
|
||||
logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")
|
||||
|
||||
if added_tokens_decoder[i].special or self.does_token_look_special(token):
|
||||
if tokenizer.added_tokens_decoder[i].special or self.does_token_look_special(token):
|
||||
toktypes.append(gguf.TokenType.CONTROL)
|
||||
else:
|
||||
# NOTE: this was added for Gemma.
|
||||
@@ -705,15 +702,6 @@ class Model:
|
||||
if chkhsh == "ccc2ef013c104be7bae2965776d611e1d7a8a2a9c547dd93a682c9a9fc80352e":
|
||||
# ref: https://huggingface.co/Xenova/gpt-4o
|
||||
res = "gpt-4o"
|
||||
if chkhsh == "7dec86086fcc38b66b7bc1575a160ae21cf705be7718b9d5598190d7c12db76f":
|
||||
# ref: https://huggingface.co/UW/OLMo2-8B-SuperBPE-t180k
|
||||
res = "superbpe"
|
||||
if chkhsh == "1994ffd01900cfb37395608534236ecd63f2bd5995d6cb1004dda1af50240f15":
|
||||
# ref: https://huggingface.co/trillionlabs/Trillion-7B-preview
|
||||
res = "trillion"
|
||||
if chkhsh == "96a5f08be6259352137b512d4157e333e21df7edd3fcd152990608735a65b224":
|
||||
# ref: https://huggingface.co/inclusionAI/Ling-lite
|
||||
res = "bailingmoe"
|
||||
|
||||
if res is None:
|
||||
logger.warning("\n")
|
||||
@@ -1111,6 +1099,13 @@ class BloomModel(Model):
|
||||
|
||||
tensors.append((self.map_tensor_name(name), data_torch))
|
||||
|
||||
if name == "word_embeddings.weight":
|
||||
assert self.tensor_names is not None
|
||||
|
||||
# TODO: tie them at runtime, don't duplicate in the model file
|
||||
if all(s not in self.tensor_names for s in ("lm_head.weight", "output.weight")):
|
||||
tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch))
|
||||
|
||||
return tensors
|
||||
|
||||
|
||||
@@ -1752,25 +1747,6 @@ class LlamaModel(Model):
|
||||
raise ValueError(f"Unprocessed experts: {experts}")
|
||||
|
||||
|
||||
@Model.register("Mistral3ForConditionalGeneration")
|
||||
class Mistral3Model(LlamaModel):
|
||||
model_arch = gguf.MODEL_ARCH.LLAMA
|
||||
|
||||
# we need to merge the text_config into the root level of hparams
|
||||
def __init__(self, *args, **kwargs):
|
||||
hparams = kwargs["hparams"] if "hparams" in kwargs else Model.load_hparams(args[0])
|
||||
if "text_config" in hparams:
|
||||
hparams = {**hparams, **hparams["text_config"]}
|
||||
kwargs["hparams"] = hparams
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
|
||||
name = name.replace("language_model.", "")
|
||||
if "multi_modal_projector" in name or "vision_tower" in name:
|
||||
return []
|
||||
return super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
@Model.register("DeciLMForCausalLM")
|
||||
class DeciModel(Model):
|
||||
model_arch = gguf.MODEL_ARCH.DECI
|
||||
@@ -2275,7 +2251,7 @@ class Qwen2Model(Model):
|
||||
self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
|
||||
|
||||
|
||||
@Model.register("Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration")
|
||||
@Model.register("Qwen2VLForConditionalGeneration")
|
||||
class Qwen2VLModel(Model):
|
||||
model_arch = gguf.MODEL_ARCH.QWEN2VL
|
||||
|
||||
@@ -2428,6 +2404,10 @@ class GPT2Model(Model):
|
||||
|
||||
tensors.append((new_name, data_torch))
|
||||
|
||||
# note: GPT2 output is tied to (same as) wte in original model
|
||||
if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD):
|
||||
tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch))
|
||||
|
||||
return tensors
|
||||
|
||||
|
||||
@@ -2757,26 +2737,21 @@ class CodeShellModel(Model):
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
||||
self.gguf_writer.add_rope_scaling_factor(1.0)
|
||||
|
||||
_has_tok_embd = False
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
del bid # unused
|
||||
|
||||
output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT)
|
||||
tok_embd_name = self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD)
|
||||
|
||||
new_name = self.map_tensor_name(name)
|
||||
|
||||
# assuming token_embd.weight is seen before output.weight
|
||||
if not self._has_tok_embd and new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT):
|
||||
# even though the tensor file(s) does not contain the word embeddings they are still in the weight map
|
||||
if self.tensor_names and "transformer.wte.weight" in self.tensor_names:
|
||||
logger.debug(f"{tok_embd_name} not found before {output_name}, assuming they are tied")
|
||||
self.tensor_names.remove("transformer.wte.weight")
|
||||
elif new_name == tok_embd_name:
|
||||
self._has_tok_embd = True
|
||||
tensors: list[tuple[str, Tensor]] = [(new_name, data_torch)]
|
||||
|
||||
return [(new_name, data_torch)]
|
||||
if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD):
|
||||
assert self.tensor_names is not None
|
||||
|
||||
if all(s not in self.tensor_names for s in ("lm_head.weight", "output.weight")):
|
||||
# copy tok_embd.weight to output.weight
|
||||
tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch))
|
||||
|
||||
return tensors
|
||||
|
||||
|
||||
@Model.register("InternLM2ForCausalLM")
|
||||
@@ -3391,7 +3366,7 @@ class Gemma3Model(Model):
|
||||
|
||||
# we need to merge the text_config into the root level of hparams
|
||||
def __init__(self, *args, **kwargs):
|
||||
hparams = kwargs["hparams"] if "hparams" in kwargs else Model.load_hparams(args[0])
|
||||
hparams = Model.load_hparams(kwargs["dir_model"])
|
||||
if "text_config" in hparams:
|
||||
hparams = {**hparams, **hparams["text_config"]}
|
||||
kwargs["hparams"] = hparams
|
||||
@@ -3557,8 +3532,8 @@ class RWKV6Qwen2Model(Rwkv6Model):
|
||||
head_size = hidden_size // num_attention_heads
|
||||
rms_norm_eps = self.hparams["rms_norm_eps"]
|
||||
intermediate_size = self.hparams["intermediate_size"]
|
||||
time_mix_extra_dim = self.hparams.get("lora_rank_tokenshift", 64 if hidden_size >= 4096 else 32)
|
||||
time_decay_extra_dim = self.hparams.get("lora_rank_decay", 128 if hidden_size >= 4096 else 64)
|
||||
time_mix_extra_dim = 64 if hidden_size >= 4096 else 32
|
||||
time_decay_extra_dim = 128 if hidden_size >= 4096 else 64
|
||||
|
||||
# RWKV isn't context limited
|
||||
self.gguf_writer.add_context_length(1048576)
|
||||
@@ -3809,6 +3784,8 @@ class MambaModel(Model):
|
||||
_tok_embd = None
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
del bid # unused
|
||||
|
||||
output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT)
|
||||
tok_embd_name = self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD)
|
||||
|
||||
@@ -3818,10 +3795,6 @@ class MambaModel(Model):
|
||||
logger.debug("A_log --> A ==> " + new_name)
|
||||
data_torch = -torch.exp(data_torch)
|
||||
|
||||
# [4 1 8192 1] -> [4 8192 1 1]
|
||||
if self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_CONV1D, bid):
|
||||
data_torch = data_torch.squeeze()
|
||||
|
||||
# assuming token_embd.weight is seen before output.weight
|
||||
if self._tok_embd is not None and new_name == output_name:
|
||||
if torch.equal(self._tok_embd, data_torch):
|
||||
@@ -4425,29 +4398,6 @@ class DeepseekV2Model(Model):
|
||||
raise ValueError(f"Unprocessed experts: {experts}")
|
||||
|
||||
|
||||
@Model.register("PLMForCausalLM")
|
||||
class PLMModel(Model):
|
||||
model_arch = gguf.MODEL_ARCH.PLM
|
||||
|
||||
def set_vocab(self):
|
||||
self._set_vocab_gpt2()
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
hparams = self.hparams
|
||||
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
|
||||
self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"])
|
||||
self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
|
||||
self.gguf_writer.add_value_length(hparams["v_head_dim"])
|
||||
self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
return [(self.map_tensor_name(name), data_torch)]
|
||||
|
||||
def prepare_tensors(self):
|
||||
super().prepare_tensors()
|
||||
|
||||
|
||||
@Model.register("T5WithLMHeadModel")
|
||||
@Model.register("T5ForConditionalGeneration")
|
||||
@Model.register("MT5ForConditionalGeneration")
|
||||
@@ -5136,105 +5086,6 @@ class GraniteMoeModel(GraniteModel):
|
||||
return super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
@Model.register("BailingMoeForCausalLM")
|
||||
class BailingMoeModel(Model):
|
||||
model_arch = gguf.MODEL_ARCH.BAILINGMOE
|
||||
|
||||
def set_vocab(self):
|
||||
self._set_vocab_gpt2()
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
hparams = self.hparams
|
||||
rope_dim = hparams.get("head_dim") or hparams["hidden_size"] // hparams["num_attention_heads"]
|
||||
|
||||
self.gguf_writer.add_rope_dimension_count(rope_dim)
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
|
||||
self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
|
||||
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
|
||||
self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
|
||||
self.gguf_writer.add_expert_weights_scale(1.0)
|
||||
self.gguf_writer.add_expert_count(hparams["num_experts"])
|
||||
self.gguf_writer.add_expert_shared_count(hparams["num_shared_experts"])
|
||||
self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"])
|
||||
|
||||
_experts: list[dict[str, Tensor]] | None = None
|
||||
|
||||
@staticmethod
|
||||
def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
|
||||
if n_head_kv is not None and n_head != n_head_kv:
|
||||
n_head = n_head_kv
|
||||
return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
|
||||
.swapaxes(1, 2)
|
||||
.reshape(weights.shape))
|
||||
|
||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||
n_head = self.hparams["num_attention_heads"]
|
||||
n_kv_head = self.hparams.get("num_key_value_heads")
|
||||
n_embd = self.hparams["hidden_size"]
|
||||
head_dim = self.hparams.get("head_dim") or n_embd // n_head
|
||||
|
||||
output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT)
|
||||
|
||||
if name.endswith("attention.dense.weight"):
|
||||
return [(self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, bid), data_torch)]
|
||||
elif name.endswith("query_key_value.weight"):
|
||||
q, k, v = data_torch.split([n_head * head_dim, n_kv_head * head_dim, n_kv_head * head_dim], dim=-2)
|
||||
|
||||
return [
|
||||
(self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), BailingMoeModel.permute(q, n_head, n_head)),
|
||||
(self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), BailingMoeModel.permute(k, n_head, n_kv_head)),
|
||||
(self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), v)
|
||||
]
|
||||
elif name.find("mlp.experts") != -1:
|
||||
n_experts = self.hparams["num_experts"]
|
||||
assert bid is not None
|
||||
|
||||
tensors: list[tuple[str, Tensor]] = []
|
||||
|
||||
if self._experts is None:
|
||||
self._experts = [{} for _ in range(self.block_count)]
|
||||
|
||||
self._experts[bid][name] = data_torch
|
||||
|
||||
if len(self._experts[bid]) >= n_experts * 3:
|
||||
# merge the experts into a single 3d tensor
|
||||
for w_name in ["down_proj", "gate_proj", "up_proj"]:
|
||||
datas: list[Tensor] = []
|
||||
|
||||
for xid in range(n_experts):
|
||||
ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
|
||||
datas.append(self._experts[bid][ename])
|
||||
del self._experts[bid][ename]
|
||||
|
||||
data_torch = torch.stack(datas, dim=0)
|
||||
|
||||
merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
|
||||
|
||||
new_name = self.map_tensor_name(merged_name)
|
||||
|
||||
tensors.append((new_name, data_torch))
|
||||
|
||||
return tensors
|
||||
|
||||
new_name = self.map_tensor_name(name)
|
||||
|
||||
if new_name == output_name and self.hparams.get("norm_head"):
|
||||
data_torch = data_torch.float()
|
||||
data_torch /= torch.norm(data_torch, p=2, dim=0, keepdim=True) + 1e-7
|
||||
|
||||
return [(new_name, data_torch)]
|
||||
|
||||
def prepare_tensors(self):
|
||||
super().prepare_tensors()
|
||||
|
||||
if self._experts is not None:
|
||||
# flatten `list[dict[str, Tensor]]` into `list[str]`
|
||||
experts = [k for d in self._experts for k in d.keys()]
|
||||
if len(experts) > 0:
|
||||
raise ValueError(f"Unprocessed experts: {experts}")
|
||||
|
||||
|
||||
@Model.register("ChameleonForConditionalGeneration")
|
||||
@Model.register("ChameleonForCausalLM") # obsolete
|
||||
class ChameleonModel(Model):
|
||||
@@ -5488,7 +5339,7 @@ def main() -> None:
|
||||
logger.error(f"Model {model_architecture} is not supported")
|
||||
sys.exit(1)
|
||||
|
||||
model_instance = model_class(dir_model, output_type, fname_out,
|
||||
model_instance = model_class(dir_model=dir_model, ftype=output_type, fname_out=fname_out,
|
||||
is_big_endian=args.bigendian, use_temp_file=args.use_temp_file,
|
||||
eager=args.no_lazy,
|
||||
metadata_override=args.metadata, model_name=args.model_name,
@@ -110,9 +110,6 @@ models = [
|
||||
{"name": "deepseek-v3", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-V3"},
|
||||
{"name": "deepseek-r1-qwen", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"},
|
||||
{"name": "gpt-4o", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Xenova/gpt-4o", },
|
||||
{"name": "superbpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/UW/OLMo2-8B-SuperBPE-t180k", },
|
||||
{"name": "trillion", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/trillionlabs/Trillion-7B-preview", },
|
||||
{"name": "bailingmoe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inclusionAI/Ling-lite", },
|
||||
]
@@ -145,13 +145,8 @@ A Snapdragon X Elite device with Windows 11 Arm64 is used. Make sure the followi
|
||||
* Clang 19
|
||||
* Ninja
|
||||
* Visual Studio 2022
|
||||
* Powershell 7
|
||||
|
||||
Visual Studio provides necessary headers and libraries although it is not directly used for building.
|
||||
Alternatively, Visual Studio Build Tools can be installed instead of the full Visual Studio.
|
||||
|
||||
Powershell 7 is used for the following commands.
|
||||
If an older version of Powershell is used, these commands may not work as they are.
|
||||
Powershell is used for the following instructions.
|
||||
|
||||
### I. Setup Environment
|
||||
|
||||
@@ -201,9 +196,10 @@ ninja
|
||||
|
||||
## Known Issues
|
||||
|
||||
- Currently OpenCL backend does not work on Adreno 6xx GPUs.
|
||||
- Qwen2.5 0.5B model produces gibberish output with Adreno kernels.
|
||||
|
||||
## TODO
|
||||
|
||||
- Fix Qwen2.5 0.5B
|
||||
- Optimization for Q6_K
|
||||
- Support and optimization for Q4_K
|
||||
|
||||
@@ -20,7 +20,7 @@
|
||||
**oneAPI** is an open ecosystem and a standard-based specification, supporting multiple architectures including but not limited to intel CPUs, GPUs and FPGAs. The key components of the oneAPI ecosystem include:
|
||||
|
||||
- **DPCPP** *(Data Parallel C++)*: The primary oneAPI SYCL implementation, which includes the icpx/icx Compilers.
|
||||
- **oneAPI Libraries**: A set of highly optimized libraries targeting multiple domains *(e.g. Intel oneMKL, oneMath and oneDNN)*.
|
||||
- **oneAPI Libraries**: A set of highly optimized libraries targeting multiple domains *(e.g. oneMKL and oneDNN)*.
|
||||
- **oneAPI LevelZero**: A high performance low level interface for fine-grained control over intel iGPUs and dGPUs.
|
||||
- **Nvidia & AMD Plugins**: These are plugins extending oneAPI's DPCPP support to SYCL on Nvidia and AMD GPU targets.
|
||||
|
||||
@@ -227,19 +227,30 @@ Upon a successful installation, SYCL is enabled for the available intel devices,
|
||||
|
||||
**oneAPI Plugin**: In order to enable SYCL support on Nvidia GPUs, please install the [Codeplay oneAPI Plugin for Nvidia GPUs](https://developer.codeplay.com/products/oneapi/nvidia/download). The user should also make sure the plugin version matches that of the installed base toolkit *(previous step)* for a seamless "oneAPI on Nvidia GPU" setup.
|
||||
|
||||
**oneDNN**: The current oneDNN releases *(shipped with the oneAPI base-toolkit)* do not include the NVIDIA backend. Therefore, oneDNN must be compiled from source to enable the NVIDIA target:
|
||||
|
||||
**oneMKL for cuBlas**: The current oneMKL releases *(shipped with the oneAPI base-toolkit)* do not contain the cuBLAS backend. A build from source of the upstream [oneMKL](https://github.com/oneapi-src/oneMKL) with the *cuBLAS* backend enabled is thus required to run it on Nvidia GPUs.
|
||||
|
||||
```sh
|
||||
git clone https://github.com/oneapi-src/oneDNN.git
|
||||
cd oneDNN
|
||||
cmake -GNinja -Bbuild-nvidia -DDNNL_CPU_RUNTIME=DPCPP -DDNNL_GPU_RUNTIME=DPCPP -DDNNL_GPU_VENDOR=NVIDIA -DONEDNN_BUILD_GRAPH=OFF -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
|
||||
cmake --build build-nvidia --config Release
|
||||
git clone https://github.com/oneapi-src/oneMKL
|
||||
cd oneMKL
|
||||
cmake -B buildWithCublas -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENABLE_MKLGPU_BACKEND=OFF -DENABLE_MKLCPU_BACKEND=OFF -DENABLE_CUBLAS_BACKEND=ON -DTARGET_DOMAINS=blas
|
||||
cmake --build buildWithCublas --config Release
|
||||
```
|
||||
|
||||
- **Adding support to AMD GPUs**
|
||||
|
||||
**oneAPI Plugin**: In order to enable SYCL support on AMD GPUs, please install the [Codeplay oneAPI Plugin for AMD GPUs](https://developer.codeplay.com/products/oneapi/amd/download). As with Nvidia GPUs, the user should also make sure the plugin version matches the installed base toolkit.
|
||||
|
||||
**oneMKL for rocBlas**: The current oneMKL releases *(shipped with the oneAPI base-toolkit)* do not contain the rocBLAS backend. A build from source of the upstream [oneMKL](https://github.com/oneapi-src/oneMKL) with the *rocBLAS* backend enabled is thus required to run it on AMD GPUs.
|
||||
|
||||
```sh
|
||||
git clone https://github.com/oneapi-src/oneMKL
|
||||
cd oneMKL
|
||||
# Find your HIPTARGET with rocminfo, under the key 'Name:'
|
||||
cmake -B buildWithrocBLAS -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENABLE_MKLGPU_BACKEND=OFF -DENABLE_MKLCPU_BACKEND=OFF -DENABLE_ROCBLAS_BACKEND=ON -DHIPTARGETS=${HIPTARGET} -DTARGET_DOMAINS=blas
|
||||
cmake --build buildWithrocBLAS --config Release
|
||||
```
|
||||
|
||||
3. **Verify installation and environment**
|
||||
|
||||
In order to check the available SYCL devices on the machine, please use the `sycl-ls` command.
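For example, on Linux this typically looks like the following (the `setvars.sh` path assumes the default oneAPI install location):

```sh
# Load the oneAPI environment (assumed default install prefix), then list the SYCL devices
source /opt/intel/oneapi/setvars.sh
sycl-ls
```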
|
||||
@@ -302,39 +313,37 @@ cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -
|
||||
cmake --build build --config Release -j -v
|
||||
```
|
||||
|
||||
It is possible to come across some precision issues when running tests that stem from using faster
|
||||
instructions, which can be circumvented by setting the environment variable `SYCL_PROGRAM_COMPILE_OPTIONS`
|
||||
to `-cl-fp32-correctly-rounded-divide-sqrt`.
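For instance, a minimal way to apply this before re-running the affected tests (a sketch; the test binary name is an assumption):

```sh
# Request correctly rounded fp32 divide/sqrt from the SYCL program compiler
export SYCL_PROGRAM_COMPILE_OPTIONS="-cl-fp32-correctly-rounded-divide-sqrt"
./build/bin/test-backend-ops   # assumed test binary; re-run whichever tests showed precision issues
```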
|
||||
|
||||
#### Nvidia GPU
|
||||
|
||||
The SYCL backend depends on [oneMath](https://github.com/uxlfoundation/oneMath) for Nvidia and AMD devices.
|
||||
By default it is automatically built along with the project. A specific build can be provided by setting the CMake flag `-DoneMath_DIR=/path/to/oneMath/install/lib/cmake/oneMath`.
|
||||
|
||||
```sh
|
||||
# Export relevant ENV variables
|
||||
export LD_LIBRARY_PATH=/path/to/oneMKL/buildWithCublas/lib:$LD_LIBRARY_PATH
|
||||
export LIBRARY_PATH=/path/to/oneMKL/buildWithCublas/lib:$LIBRARY_PATH
|
||||
export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithCublas/include:$CPLUS_INCLUDE_DIR
|
||||
export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR
|
||||
|
||||
# Build LLAMA with Nvidia BLAS acceleration through SYCL
|
||||
# Setting GGML_SYCL_DEVICE_ARCH is optional but can improve performance
|
||||
GGML_SYCL_DEVICE_ARCH=sm_80 # Example architecture
|
||||
|
||||
# Option 1: Use FP32 (recommended for better performance in most cases)
|
||||
cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DDNNL_DIR=/path/to/oneDNN/build-nvidia/install/lib/cmake/dnnl
|
||||
cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
|
||||
|
||||
# Option 2: Use FP16
|
||||
cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON -DDNNL_DIR=/path/to/oneDNN/build-nvidia/install/lib/cmake/dnnl
|
||||
cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
|
||||
|
||||
# build all binary
|
||||
cmake --build build --config Release -j -v
|
||||
```
|
||||
|
||||
It is possible to come across some precision issues when running tests that stem from using faster
|
||||
instructions, which can be circumvented by passing the `-fno-fast-math` flag to the compiler.
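One way to pass that flag is through the CMake configuration shown above (a sketch that assumes the standard `CMAKE_CXX_FLAGS` mechanism; adjust to your build):

```sh
# Re-configure the Nvidia SYCL build with fast-math disabled for the C++ compiler
cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA \
      -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx \
      -DCMAKE_CXX_FLAGS="-fno-fast-math"
cmake --build build --config Release -j -v
```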
|
||||
|
||||
#### AMD GPU
|
||||
|
||||
The SYCL backend depends on [oneMath](https://github.com/uxlfoundation/oneMath) for Nvidia and AMD devices.
|
||||
By default it is automatically built along with the project. A specific build can be provided by setting the CMake flag `-DoneMath_DIR=/path/to/oneMath/install/lib/cmake/oneMath`.
|
||||
|
||||
```sh
|
||||
# Export relevant ENV variables
|
||||
export LD_LIBRARY_PATH=/path/to/oneMKL/buildWithrocBLAS/lib:$LD_LIBRARY_PATH
|
||||
export LIBRARY_PATH=/path/to/oneMKL/buildWithrocBLAS/lib:$LIBRARY_PATH
|
||||
export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithrocBLAS/include:$CPLUS_INCLUDE_DIR
|
||||
|
||||
# Build LLAMA with rocBLAS acceleration through SYCL
|
||||
|
||||
## AMD
|
||||
@@ -475,12 +484,6 @@ b. Enable oneAPI running environment:
|
||||
"C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64
|
||||
```
|
||||
|
||||
- if you are using Powershell, enable the runtime environment with the following:
|
||||
|
||||
```
|
||||
cmd.exe "/K" '"C:\Program Files (x86)\Intel\oneAPI\setvars.bat" && powershell'
|
||||
```
|
||||
|
||||
c. Verify installation
|
||||
|
||||
In the oneAPI command line, run the following to print the available SYCL devices:
|
||||
@@ -511,13 +514,13 @@ You could download the release package for Windows directly, which including bin
|
||||
|
||||
Choose one of the following methods to build from source code.
|
||||
|
||||
#### 1. Script
|
||||
1. Script
|
||||
|
||||
```sh
|
||||
.\examples\sycl\win-build-sycl.bat
|
||||
```
|
||||
|
||||
#### 2. CMake
|
||||
2. CMake
|
||||
|
||||
On the oneAPI command line window, step into the llama.cpp main directory and run the following:
|
||||
|
||||
@@ -546,84 +549,13 @@ cmake --preset x64-windows-sycl-debug
|
||||
cmake --build build-x64-windows-sycl-debug -j --target llama-cli
|
||||
```
|
||||
|
||||
#### 3. Visual Studio
|
||||
3. Visual Studio
|
||||
|
||||
You have two options to use Visual Studio to build llama.cpp:
|
||||
- As CMake Project using CMake presets.
|
||||
- Creating a Visual Studio solution to handle the project.
|
||||
|
||||
**Note**:
|
||||
|
||||
All following commands are executed in PowerShell.
|
||||
|
||||
##### - Open as a CMake Project
|
||||
|
||||
You can use Visual Studio to open the `llama.cpp` folder directly as a CMake project. Before compiling, select one of the SYCL CMake presets:
|
||||
|
||||
- `x64-windows-sycl-release`
|
||||
|
||||
- `x64-windows-sycl-debug`
|
||||
You can use Visual Studio to open llama.cpp folder as a CMake project. Choose the sycl CMake presets (`x64-windows-sycl-release` or `x64-windows-sycl-debug`) before you compile the project.
|
||||
|
||||
*Notes:*
|
||||
- For a minimal experimental setup, you can build only the inference executable using:
|
||||
|
||||
```Powershell
|
||||
cmake --build build --config Release -j --target llama-cli
|
||||
```
|
||||
|
||||
##### - Generating a Visual Studio Solution
|
||||
|
||||
You can use a Visual Studio solution to build and work on llama.cpp on Windows. To do so, you need to convert the CMake project into a `.sln` file.
|
||||
|
||||
If you want to use the Intel C++ Compiler for the entire `llama.cpp` project, run the following command:
|
||||
|
||||
```Powershell
|
||||
cmake -B build -G "Visual Studio 17 2022" -T "Intel C++ Compiler 2025" -A x64 -DGGML_SYCL=ON -DCMAKE_BUILD_TYPE=Release
|
||||
```
|
||||
|
||||
If you prefer to use the Intel C++ Compiler only for `ggml-sycl`, ensure that `ggml` and its backend libraries are built as shared libraries (i.e. `-DBUILD_SHARED_LIBS=ON`, which is the default behaviour):
|
||||
|
||||
```Powershell
|
||||
cmake -B build -G "Visual Studio 17 2022" -A x64 -DGGML_SYCL=ON -DCMAKE_BUILD_TYPE=Release \
|
||||
-DSYCL_INCLUDE_DIR="C:\Program Files (x86)\Intel\oneAPI\compiler\latest\include" \
|
||||
-DSYCL_LIBRARY_DIR="C:\Program Files (x86)\Intel\oneAPI\compiler\latest\lib"
|
||||
```
|
||||
|
||||
If successful, the build files will have been written to *path/to/llama.cpp/build*.
|
||||
Open the project file **build/llama.cpp.sln** with Visual Studio.
|
||||
|
||||
Once the Visual Studio solution is created, follow these steps:
|
||||
|
||||
1. Open the solution in Visual Studio.
|
||||
|
||||
2. Right-click on `ggml-sycl` and select **Properties**.
|
||||
|
||||
3. In the left column, expand **C/C++** and select **DPC++**.
|
||||
|
||||
4. In the right panel, find **Enable SYCL Offload** and set it to `Yes`.
|
||||
|
||||
5. Apply the changes and save.
|
||||
|
||||
|
||||
*Navigation Path:*
|
||||
|
||||
```
|
||||
Properties -> C/C++ -> DPC++ -> Enable SYCL Offload (Yes)
|
||||
```
|
||||
|
||||
Now, you can build `llama.cpp` with the SYCL backend as a Visual Studio project.
|
||||
To do this from the menu: `Build -> Build Solution`.
Once the build completes, the final results will be in **build/Release/bin**.
|
||||
|
||||
*Additional Note*
|
||||
|
||||
- You can avoid specifying `SYCL_INCLUDE_DIR` and `SYCL_LIBRARY_DIR` in the CMake command by setting the environment variables:
|
||||
|
||||
- `SYCL_INCLUDE_DIR_HINT`
|
||||
|
||||
- `SYCL_LIBRARY_DIR_HINT`
|
||||
|
||||
- The above instructions have been tested with Visual Studio 17 Community edition and oneAPI 2025.0. We expect them to also work with future versions if the instructions are adapted accordingly.
|
||||
- In case of a minimal experimental setup, the user can build the inference executable only through `cmake --build build --config Release -j --target llama-cli`.
|
||||
|
||||
### III. Run the inference

docs/build.md
|
||||
|
||||
|
||||
#### Compile and run inside a Fedora Toolbox Container
|
||||
We also have a [guide](./backend/CUDA-FEDORA.md) for setting up CUDA toolkit in a Fedora [toolbox container](https://containertoolbx.org/).
|
||||
We also have a [guide](./cuda-fedora.md) for setting up CUDA toolkit in a Fedora [toolbox container](https://containertoolbx.org/).
|
||||
|
||||
**Recommended for:**
|
||||
- ***Necessary*** for users of [Atomic Desktops for Fedora](https://fedoraproject.org/atomic-desktops/); such as: [Silverblue](https://fedoraproject.org/atomic-desktops/silverblue/) and [Kinoite](https://fedoraproject.org/atomic-desktops/kinoite/).
|
||||
- (there are no supported CUDA packages for these systems)
|
||||
- ***Necessary*** for users that have a host that is not a: [Supported Nvidia CUDA Release Platform](https://developer.nvidia.com/cuda-downloads).
|
||||
- (for example, you may have [Fedora 42 Beta](https://fedoramagazine.org/announcing-fedora-linux-42-beta/) as your host operating system)
|
||||
- ***Convenient*** for those running [Fedora Workstation](https://fedoraproject.org/workstation/) or [Fedora KDE Plasma Desktop](https://fedoraproject.org/spins/kde) who want to keep their host system clean.
|
||||
|
||||
- ***Particularly*** *convenient* for users of [Atomic Desktops for Fedora](https://fedoraproject.org/atomic-desktops/); such as: [Silverblue](https://fedoraproject.org/atomic-desktops/silverblue/) and [Kinoite](https://fedoraproject.org/atomic-desktops/kinoite/).
|
||||
- Toolbox is installed by default: [Fedora Workstation](https://fedoraproject.org/workstation/) or [Fedora KDE Plasma Desktop](https://fedoraproject.org/spins/kde).
|
||||
- *Optionally* toolbox packages are available: [Arch Linux](https://archlinux.org/), [Red Hat Enterprise Linux >= 8.5](https://www.redhat.com/en/technologies/linux-platforms/enterprise-linux), or [Ubuntu](https://ubuntu.com/download)
|
||||
|
||||
|
||||
@@ -191,7 +189,7 @@ The following compilation options are also available to tweak performance:
|
||||
|
||||
| Option | Legal values | Default | Description |
|
||||
|-------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
||||
| GGML_CUDA_FORCE_MMQ | Boolean | false | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, CDNA and RDNA3+). MMQ kernels are enabled by default on GPUs with int8 tensor core support. With MMQ force enabled, speed for large batch sizes will be worse but VRAM consumption will be lower. |
|
||||
| GGML_CUDA_FORCE_MMQ | Boolean | false | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, RDNA3). MMQ kernels are enabled by default on GPUs with int8 tensor core support. With MMQ force enabled, speed for large batch sizes will be worse but VRAM consumption will be lower. |
|
||||
| GGML_CUDA_FORCE_CUBLAS | Boolean | false | Force the use of FP16 cuBLAS instead of custom matrix multiplication kernels for quantized models |
|
||||
| GGML_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
|
||||
| GGML_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
|
||||
@@ -218,7 +216,6 @@ By default, all supported compute capabilities are enabled. To customize this be
|
||||
|
||||
```bash
|
||||
cmake -B build -DGGML_MUSA=ON -DMUSA_ARCHITECTURES="21"
|
||||
cmake --build build --config Release
|
||||
```
|
||||
|
||||
This configuration enables only compute capability `2.1` (MTT S80) during compilation, which can help reduce compilation time.
|
||||
@@ -436,116 +433,6 @@ llama_new_context_with_model: CANN compute buffer size = 1260.81 MiB
|
||||
|
||||
For detailed info, such as model/device supports, CANN install, please refer to [llama.cpp for CANN](./backend/CANN.md).
|
||||
|
||||
## Arm® KleidiAI™
|
||||
KleidiAI is a library of optimized microkernels for AI workloads, specifically designed for Arm CPUs. These microkernels enhance performance and can be enabled for use by the CPU backend.
|
||||
|
||||
To enable KleidiAI, go to the llama.cpp directory and build using CMake:
|
||||
```bash
|
||||
cmake -B build -DGGML_CPU_KLEIDIAI=ON
|
||||
cmake --build build --config Release
|
||||
```
|
||||
You can verify that KleidiAI is being used by running:
|
||||
```bash
|
||||
./build/bin/llama-cli -m PATH_TO_MODEL -p "What is a car?"
|
||||
```
|
||||
If KleidiAI is enabled, the output will contain a line similar to:
|
||||
```
|
||||
load_tensors: CPU_KLEIDIAI model buffer size = 3474.00 MiB
|
||||
```
|
||||
KleidiAI's microkernels implement optimized tensor operations using Arm CPU features such as dotprod, int8mm and SME. llama.cpp selects the most efficient kernel based on runtime CPU feature detection. However, on platforms that support SME, you must manually enable SME microkernels by setting the environment variable `GGML_KLEIDIAI_SME=1`.
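For example, to enable the SME microkernels for a single run (reusing the verification command above; the model path is a placeholder):

```bash
# Enable KleidiAI SME microkernels for this invocation (SME-capable CPUs only)
GGML_KLEIDIAI_SME=1 ./build/bin/llama-cli -m PATH_TO_MODEL -p "What is a car?"
```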
|
||||
|
||||
Depending on your build target, other higher priority backends may be enabled by default. To ensure the CPU backend is used, you must disable the higher priority backends either at compile time, e.g. -DGGML_METAL=OFF, or during run-time using the command line option `--device none`.
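A sketch of both approaches, using the flag and option named above (which backend to disable depends on your platform):

```bash
# Option 1: disable a higher-priority backend at compile time (Metal shown as an example)
cmake -B build -DGGML_CPU_KLEIDIAI=ON -DGGML_METAL=OFF
cmake --build build --config Release

# Option 2: keep the build as-is and select the CPU backend at run time
./build/bin/llama-cli -m PATH_TO_MODEL -p "What is a car?" --device none
```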
|
||||
|
||||
## OpenCL
|
||||
|
||||
This provides GPU acceleration through OpenCL on recent Adreno GPUs.
More information about the OpenCL backend can be found in [OPENCL.md](./backend/OPENCL.md).
|
||||
|
||||
### Android
|
||||
|
||||
Assume NDK is available in `$ANDROID_NDK`. First, install OpenCL headers and ICD loader library if not available,
|
||||
|
||||
```sh
|
||||
mkdir -p ~/dev/llm
|
||||
cd ~/dev/llm
|
||||
|
||||
git clone https://github.com/KhronosGroup/OpenCL-Headers && \
|
||||
cd OpenCL-Headers && \
|
||||
cp -r CL $ANDROID_NDK/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include
|
||||
|
||||
cd ~/dev/llm
|
||||
|
||||
git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader && \
|
||||
cd OpenCL-ICD-Loader && \
|
||||
mkdir build_ndk && cd build_ndk && \
|
||||
cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release \
|
||||
-DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
|
||||
-DOPENCL_ICD_LOADER_HEADERS_DIR=$ANDROID_NDK/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include \
|
||||
-DANDROID_ABI=arm64-v8a \
|
||||
-DANDROID_PLATFORM=24 \
|
||||
-DANDROID_STL=c++_shared && \
|
||||
ninja && \
|
||||
cp libOpenCL.so $ANDROID_NDK/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/lib/aarch64-linux-android
|
||||
```
|
||||
|
||||
Then build llama.cpp with OpenCL enabled,
|
||||
|
||||
```sh
|
||||
cd ~/dev/llm
|
||||
|
||||
git clone https://github.com/ggml-org/llama.cpp && \
|
||||
cd llama.cpp && \
|
||||
mkdir build-android && cd build-android
|
||||
|
||||
cmake .. -G Ninja \
|
||||
-DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
|
||||
-DANDROID_ABI=arm64-v8a \
|
||||
-DANDROID_PLATFORM=android-28 \
|
||||
-DBUILD_SHARED_LIBS=OFF \
|
||||
-DGGML_OPENCL=ON
|
||||
|
||||
ninja
|
||||
```
|
||||
|
||||
### Windows Arm64
|
||||
|
||||
First, install OpenCL headers and ICD loader library if not available,
|
||||
|
||||
```powershell
|
||||
mkdir -p ~/dev/llm
|
||||
|
||||
cd ~/dev/llm
|
||||
git clone https://github.com/KhronosGroup/OpenCL-Headers && cd OpenCL-Headers
|
||||
mkdir build && cd build
|
||||
cmake .. -G Ninja `
|
||||
-DBUILD_TESTING=OFF `
|
||||
-DOPENCL_HEADERS_BUILD_TESTING=OFF `
|
||||
-DOPENCL_HEADERS_BUILD_CXX_TESTS=OFF `
|
||||
-DCMAKE_INSTALL_PREFIX="$HOME/dev/llm/opencl"
|
||||
cmake --build . --target install
|
||||
|
||||
cd ~/dev/llm
|
||||
git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader && cd OpenCL-ICD-Loader
|
||||
mkdir build && cd build
|
||||
cmake .. -G Ninja `
|
||||
-DCMAKE_BUILD_TYPE=Release `
|
||||
-DCMAKE_PREFIX_PATH="$HOME/dev/llm/opencl" `
|
||||
-DCMAKE_INSTALL_PREFIX="$HOME/dev/llm/opencl"
|
||||
cmake --build . --target install
|
||||
```

Then build llama.cpp with OpenCL enabled:

```powershell
cmake .. -G Ninja `
  -DCMAKE_TOOLCHAIN_FILE="$HOME/dev/llm/llama.cpp/cmake/arm64-windows-llvm.cmake" `
  -DCMAKE_BUILD_TYPE=Release `
  -DCMAKE_PREFIX_PATH="$HOME/dev/llm/opencl" `
  -DBUILD_SHARED_LIBS=OFF `
  -DGGML_OPENCL=ON
ninja
```

## Android

Documentation for building on Android can be found in [android.md](./android.md).

@@ -14,7 +14,9 @@ In this guide we setup [Nvidia CUDA](https://docs.nvidia.com/cuda/) in a toolbox

- [Creating a Fedora Toolbox Environment](#creating-a-fedora-toolbox-environment)
- [Installing Essential Development Tools](#installing-essential-development-tools)
- [Adding the CUDA Repository](#adding-the-cuda-repository)
- [Installing Nvidia Driver Libraries](#installing-nvidia-driver-libraries)
  - [Installing `nvidia-driver-libs`](#installing-nvidia-driver-libs)
  - [Manually Resolving Package Conflicts](#manually-resolving-package-conflicts)
  - [Finalizing the Installation of `nvidia-driver-libs`](#finalizing-the-installation-of-nvidia-driver-libs)
- [Installing the CUDA Meta-Package](#installing-the-cuda-meta-package)
- [Configuring the Environment](#configuring-the-environment)
- [Verifying the Installation](#verifying-the-installation)

@@ -65,7 +67,7 @@ This guide focuses on Fedora hosts, but with small adjustments, it can work for

sudo dnf distro-sync
```

2. **Install **Vim** the default text editor (Optional):**
2. **Install the Default Text Editor (Optional):**

```bash
sudo dnf install vim-default-editor --allowerasing

@@ -95,48 +97,36 @@ After adding the repository, synchronize the package manager again:

sudo dnf distro-sync
```

## Installing Nvidia Driver Libraries
## Installing `nvidia-driver-libs` and `nvidia-driver-cuda-libs`

First, we need to detect if the host is supplying the [NVIDIA driver libraries into the toolbox](https://github.com/containers/toolbox/blob/main/src/pkg/nvidia/nvidia.go):
We need to detect if the host is supplying the [NVIDIA driver libraries into the toolbox](https://github.com/containers/toolbox/blob/main/src/pkg/nvidia/nvidia.go).

```bash
ls -la /usr/lib64/libcuda.so.1
```

### If *`libcuda.so.1`* is missing:

```
ls: cannot access '/usr/lib64/libcuda.so.1': No such file or directory
```

**Explanation:**
The host dose not supply the CUDA drivers, **install them now:**

#### Install the Nvidia Driver Libraries on Guest:
- `nvidia-driver-libs` and `nvidia-driver-cuda-libs` contains necessary NVIDIA driver libraries required by CUDA,
on hosts with NVIDIA drivers installed the Fedora Container will supply the host libraries.

### Install Nvidia Driver Libraries on Guest (if `libcuda.so.1` was NOT found).

```bash
sudo dnf install nvidia-driver-cuda nvidia-driver-libs nvidia-driver-cuda-libs nvidia-persistenced
sudo dnf install nvidia-driver-libs nvidia-driver-cuda-libs
```

### If *`libcuda.so.1`* exists:

```
lrwxrwxrwx. 1 root root 21 Mar 24 11:26 /usr/lib64/libcuda.so.1 -> libcuda.so.570.133.07
```

### Manually Updating the RPM database for host-supplied NVIDIA drivers (if `libcuda.so.1` was found).

**Explanation:**
The host is supply the CUDA drivers, **we need to update the guest RPM Database accordingly:**
If the installation fails due to conflicts, we'll manually download and install the required packages, excluding conflicting files.

#### Update the Toolbox RPM Database to include the Host-Supplied Libraries:

Note: we do not actually install the libraries, we just update the DB so that the guest system knows they are supplied by the host.

##### 1. Download `nvidia-` parts that are supplied by the host RPM's (with dependencies)
#### 1. Download `nvidia-driver-libs` and `nvidia-driver-cuda-libs` RPM's (with dependencies)

```bash
sudo dnf download --destdir=/tmp/nvidia-driver-libs --resolve --arch x86_64 nvidia-driver-cuda nvidia-driver-libs nvidia-driver-cuda-libs nvidia-persistenced
sudo dnf download --destdir=/tmp/nvidia-driver-libs --resolve --arch x86_64 nvidia-driver-libs nvidia-driver-cuda-libs
```

##### 2. Update the RPM database to assume the installation of these packages.
#### 2. Update the RPM database to assume the installation of these packages.

```bash
sudo rpm --install --verbose --hash --justdb /tmp/nvidia-driver-libs/*

@@ -144,26 +134,23 @@ sudo rpm --install --verbose --hash --justdb /tmp/nvidia-driver-libs/*

**Note:**

- The `--justdb` option only updates the RPM database, without touching the filesystem elsewhere.
- The `--justdb` option only updates the RPM database, without touching the filesystem.
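
As an additional quick sanity check (a sketch only; adjust the package names to the RPMs you actually downloaded), the database entries can be queried directly:

```bash
rpm -q nvidia-driver-libs nvidia-driver-cuda-libs
```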

##### Check that the RPM Database has been correctly updated:

**Note:** This is the same command as in the *"Install the Nvidia Driver Libraries on Guest"* for if *`libcuda.so.1`* was missing.
#### Finalizing the Installation of `nvidia-driver-libs` and `nvidia-driver-cuda-libs`

After manually installing the dependencies, run:

```bash
sudo dnf install nvidia-driver-cuda nvidia-driver-libs nvidia-driver-cuda-libs nvidia-persistenced
sudo dnf install nvidia-driver-libs nvidia-driver-cuda-libs
```

*(this time it will not install anything, as the database things that these packages are already installed)*
You should receive a message indicating the package is already installed:

```
Updating and loading repositories:
Repositories loaded.
Package "nvidia-driver-cuda-3:570.124.06-1.fc41.x86_64" is already installed.
Package "nvidia-driver-libs-3:570.124.06-1.fc41.x86_64" is already installed.
Package "nvidia-driver-cuda-libs-3:570.124.06-1.fc41.x86_64" is already installed.
Package "nvidia-persistenced-3:570.124.06-1.fc41.x86_64" is already installed.
Package "nvidia-driver-libs-3:570.86.10-1.fc41.x86_64" is already installed.
Package "nvidia-driver-cuda-libs-3:570.86.10-1.fc41.x86_64" is already installed.

Nothing to do.
```

@@ -220,9 +207,9 @@ You should see output similar to:

```
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2025 NVIDIA Corporation
Built on Fri_Feb_21_20:23:50_PST_2025
Cuda compilation tools, release 12.8, V12.8.93
Build cuda_12.8.r12.8/compiler.35583870_0
Built on Wed_Jan_15_19:20:09_PST_2025
Cuda compilation tools, release 12.8, V12.8.61
Build cuda_12.8.r12.8/compiler.35404655_0
```

This output confirms that the CUDA compiler is accessible and indicates the installed version.
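
With `nvcc` accessible inside the toolbox, building llama.cpp against CUDA is then a matter of enabling the backend at configure time. A minimal sketch, assuming the repository has already been cloned into the current directory:

```bash
cmake -B build -DGGML_CUDA=ON
cmake --build build --config Release -j "$(nproc)"
```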

@@ -9,13 +9,6 @@ brew install llama.cpp
```
The formula is automatically updated with new `llama.cpp` releases. More info: https://github.com/ggml-org/llama.cpp/discussions/7668

## MacPorts

```sh
sudo port install llama.cpp
```
see also: https://ports.macports.org/port/llama.cpp/details/

## Nix

On Mac and Linux, the Nix package manager can be used via
@@ -38,7 +38,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
llama_model_params model_params = common_model_params_to_llama(params);
|
||||
|
||||
llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params);
|
||||
llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params);
|
||||
|
||||
if (model == NULL) {
|
||||
fprintf(stderr , "%s: error: unable to load model\n" , __func__);
|
||||
@@ -59,24 +59,17 @@ int main(int argc, char ** argv) {
|
||||
|
||||
const int32_t n_kv_max = llama_n_ctx(ctx);
|
||||
|
||||
llama_batch batch = llama_batch_init(n_kv_max, 0, 1);
|
||||
llama_batch_ext * batch = llama_batch_ext_init(n_kv_max, 1);
|
||||
|
||||
// decode in batches of ctx_params.n_batch tokens
|
||||
auto decode_helper = [](llama_context * ctx, llama_batch & batch, int32_t n_batch) {
|
||||
for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
|
||||
const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
|
||||
auto decode_helper = [](llama_context * ctx, llama_batch_ext * batch, int32_t n_batch) {
|
||||
const int32_t n_batch_tokens = llama_batch_ext_get_n_tokens(batch);
|
||||
for (int32_t i = 0; i < (int32_t) n_batch_tokens; i += n_batch) {
|
||||
const int32_t n_tokens = std::min(n_batch, (int32_t) (n_batch_tokens - i));
|
||||
|
||||
llama_batch batch_view = {
|
||||
n_tokens,
|
||||
batch.token + i,
|
||||
nullptr,
|
||||
batch.pos + i,
|
||||
batch.n_seq_id + i,
|
||||
batch.seq_id + i,
|
||||
batch.logits + i,
|
||||
};
|
||||
llama_batch_ext_ptr batch_view = llama_batch_ext_ptr(llama_batch_ext_get_view(batch, i, n_tokens));
|
||||
|
||||
const int ret = llama_decode(ctx, batch_view);
|
||||
const int ret = llama_decode_ext(ctx, batch_view.get());
|
||||
if (ret != 0) {
|
||||
LOG_ERR("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
|
||||
return false;
|
||||
@@ -91,7 +84,8 @@ int main(int argc, char ** argv) {
|
||||
// warm up
|
||||
{
|
||||
for (int i = 0; i < 16; ++i) {
|
||||
common_batch_add(batch, 0, i, { 0 }, false);
|
||||
const llama_seq_id seq_id = 0;
|
||||
llama_batch_ext_add_text(batch, 0, i, &seq_id, 1, false);
|
||||
}
|
||||
|
||||
if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
|
||||
@@ -121,14 +115,14 @@ int main(int argc, char ** argv) {
|
||||
continue;
|
||||
}
|
||||
|
||||
common_batch_clear(batch);
|
||||
llama_batch_ext_clear(batch);
|
||||
|
||||
for (int i = 0; i < pp; ++i) {
|
||||
for (int j = 0; j < (is_pp_shared ? 1 : pl); ++j) {
|
||||
common_batch_add(batch, 0, i, { j }, false);
|
||||
llama_batch_ext_add_text(batch, 0, i, &j, 1, false);
|
||||
}
|
||||
}
|
||||
batch.logits[batch.n_tokens - 1] = true;
|
||||
llama_batch_ext_set_output_last(batch);
|
||||
|
||||
const auto t_pp_start = ggml_time_us();
|
||||
|
||||
@@ -150,10 +144,10 @@ int main(int argc, char ** argv) {
|
||||
const auto t_tg_start = ggml_time_us();
|
||||
|
||||
for (int i = 0; i < tg; ++i) {
|
||||
common_batch_clear(batch);
|
||||
llama_batch_ext_clear(batch);
|
||||
|
||||
for (int j = 0; j < pl; ++j) {
|
||||
common_batch_add(batch, 0, pp + i, { j }, true);
|
||||
llama_batch_ext_add_text(batch, 0, pp + i, &j, 1, true);
|
||||
}
|
||||
|
||||
if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
|
||||
@@ -191,7 +185,7 @@ int main(int argc, char ** argv) {
|
||||
LOG("\n");
|
||||
llama_perf_context_print(ctx);
|
||||
|
||||
llama_batch_free(batch);
|
||||
llama_batch_ext_free(batch);
|
||||
|
||||
llama_free(ctx);
|
||||
llama_model_free(model);
|
||||
|
||||
@@ -41,7 +41,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
llama_model_params model_params = common_model_params_to_llama(params);
|
||||
|
||||
llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params);
|
||||
llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params);
|
||||
|
||||
if (model == NULL) {
|
||||
LOG_ERR("%s: error: unable to load model\n" , __func__);
|
||||
@@ -102,7 +102,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// create a llama_batch
|
||||
// we use this object to submit token data for decoding
|
||||
llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t) n_parallel), 0, n_parallel);
|
||||
llama_batch_ext * batch = llama_batch_ext_init(std::max(tokens_list.size(), (size_t) n_parallel), n_parallel);
|
||||
|
||||
std::vector<llama_seq_id> seq_ids(n_parallel, 0);
|
||||
for (int32_t i = 0; i < n_parallel; ++i) {
|
||||
@@ -111,12 +111,12 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// evaluate the initial prompt
|
||||
for (size_t i = 0; i < tokens_list.size(); ++i) {
|
||||
common_batch_add(batch, tokens_list[i], i, seq_ids, false);
|
||||
llama_batch_ext_add_text(batch, tokens_list[i], i, seq_ids.data(), seq_ids.size(), false);
|
||||
}
|
||||
GGML_ASSERT(batch.n_tokens == (int) tokens_list.size());
|
||||
GGML_ASSERT(llama_batch_ext_get_n_tokens(batch) == (int) tokens_list.size());
|
||||
|
||||
if (llama_model_has_encoder(model)) {
|
||||
if (llama_encode(ctx, batch)) {
|
||||
if (llama_encode_ext(ctx, batch)) {
|
||||
LOG_ERR("%s : failed to eval\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
@@ -126,14 +126,14 @@ int main(int argc, char ** argv) {
|
||||
decoder_start_token_id = llama_vocab_bos(vocab);
|
||||
}
|
||||
|
||||
common_batch_clear(batch);
|
||||
common_batch_add(batch, decoder_start_token_id, 0, seq_ids, false);
|
||||
llama_batch_ext_clear(batch);
|
||||
llama_batch_ext_add_text(batch, decoder_start_token_id, 0, seq_ids.data(), seq_ids.size(), false);
|
||||
}
|
||||
|
||||
// llama_decode will output logits only for the last token of the prompt
|
||||
batch.logits[batch.n_tokens - 1] = true;
|
||||
llama_batch_ext_set_output_last(batch);
|
||||
|
||||
if (llama_decode(ctx, batch) != 0) {
|
||||
if (llama_decode_ext(ctx, batch) != 0) {
|
||||
LOG_ERR("%s: llama_decode() failed\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
@@ -155,16 +155,16 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// remember the batch index of the last token for each parallel sequence
|
||||
// we need this to determine which logits to sample from
|
||||
std::vector<int32_t> i_batch(n_parallel, batch.n_tokens - 1);
|
||||
std::vector<int32_t> i_batch(n_parallel, llama_batch_ext_get_n_tokens(batch) - 1);
|
||||
|
||||
int n_cur = batch.n_tokens;
|
||||
int n_cur = llama_batch_ext_get_n_tokens(batch);
|
||||
int n_decode = 0;
|
||||
|
||||
const auto t_main_start = ggml_time_us();
|
||||
|
||||
while (n_cur <= n_predict) {
|
||||
// prepare the next batch
|
||||
common_batch_clear(batch);
|
||||
llama_batch_ext_clear(batch);
|
||||
|
||||
// sample the next token for each parallel sequence / stream
|
||||
for (int32_t i = 0; i < n_parallel; ++i) {
|
||||
@@ -193,23 +193,23 @@ int main(int argc, char ** argv) {
|
||||
|
||||
streams[i] += common_token_to_piece(ctx, new_token_id);
|
||||
|
||||
i_batch[i] = batch.n_tokens;
|
||||
i_batch[i] = llama_batch_ext_get_n_tokens(batch);
|
||||
|
||||
// push this new token for next evaluation
|
||||
common_batch_add(batch, new_token_id, n_cur, { i }, true);
|
||||
llama_batch_ext_add_text(batch, new_token_id, n_cur, &i, 1, true);
|
||||
|
||||
n_decode += 1;
|
||||
}
|
||||
|
||||
// all streams are finished
|
||||
if (batch.n_tokens == 0) {
|
||||
if (llama_batch_ext_get_n_tokens(batch) == 0) {
|
||||
break;
|
||||
}
|
||||
|
||||
n_cur += 1;
|
||||
|
||||
// evaluate the current batch with the transformer model
|
||||
if (llama_decode(ctx, batch)) {
|
||||
if (llama_decode_ext(ctx, batch)) {
|
||||
LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1);
|
||||
return 1;
|
||||
}
|
||||
@@ -234,7 +234,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
fprintf(stderr, "\n");
|
||||
|
||||
llama_batch_free(batch);
|
||||
llama_batch_ext_free(batch);
|
||||
|
||||
llama_sampler_free(smpl);
|
||||
llama_free(ctx);
|
||||
|
||||
@@ -343,7 +343,8 @@ static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
|
||||
|
||||
static bool get_hidden_layers(llama_context * ctx, std::vector<llama_token> & tokens) {
|
||||
llama_kv_self_clear(ctx);
|
||||
if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) {
|
||||
auto batch = llama_batch_ext_ptr::init_from_text(tokens.data(), tokens.size(), 0, 0, true);
|
||||
if (llama_decode_ext(ctx, batch.get())) {
|
||||
fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -26,14 +26,14 @@ static std::vector<std::string> split_lines(const std::string & s, const std::st
|
||||
return lines;
|
||||
}
|
||||
|
||||
static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, llama_seq_id seq_id) {
|
||||
static void batch_add_seq(common_batch & batch, const std::vector<int32_t> & tokens, llama_seq_id seq_id) {
|
||||
size_t n_tokens = tokens.size();
|
||||
for (size_t i = 0; i < n_tokens; i++) {
|
||||
common_batch_add(batch, tokens[i], i, { seq_id }, true);
|
||||
batch.add_text(tokens[i], i, seq_id, true);
|
||||
}
|
||||
}
|
||||
|
||||
static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd, int embd_norm) {
|
||||
static void batch_decode(llama_context * ctx, common_batch & batch, float * output, int n_seq, int n_embd, int embd_norm) {
|
||||
const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
|
||||
const struct llama_model * model = llama_get_model(ctx);
|
||||
|
||||
@@ -41,21 +41,21 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
|
||||
llama_kv_self_clear(ctx);
|
||||
|
||||
// run model
|
||||
LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
|
||||
LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, llama_batch_ext_get_n_tokens(batch.get()), n_seq);
|
||||
if (llama_model_has_encoder(model) && !llama_model_has_decoder(model)) {
|
||||
// encoder-only model
|
||||
if (llama_encode(ctx, batch) < 0) {
|
||||
if (llama_encode_ext(ctx, batch.get()) < 0) {
|
||||
LOG_ERR("%s : failed to encode\n", __func__);
|
||||
}
|
||||
} else if (!llama_model_has_encoder(model) && llama_model_has_decoder(model)) {
|
||||
// decoder-only model
|
||||
if (llama_decode(ctx, batch) < 0) {
|
||||
if (llama_decode_ext(ctx, batch.get()) < 0) {
|
||||
LOG_ERR("%s : failed to decode\n", __func__);
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < batch.n_tokens; i++) {
|
||||
if (!batch.logits[i]) {
|
||||
for (int i = 0; i < llama_batch_ext_get_n_tokens(batch.get()); i++) {
|
||||
if (!batch.tokens[i].logits) {
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -69,8 +69,8 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
|
||||
GGML_ASSERT(embd != NULL && "failed to get token embeddings");
|
||||
} else {
|
||||
// try to get sequence embeddings - supported only when pooling_type is not NONE
|
||||
embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
|
||||
embd_pos = batch.seq_id[i][0];
|
||||
embd = llama_get_embeddings_seq(ctx, batch.tokens[i].seq_id);
|
||||
embd_pos = batch.tokens[i].seq_id;
|
||||
GGML_ASSERT(embd != NULL && "failed to get sequence embeddings");
|
||||
}
|
||||
|
||||
@@ -171,7 +171,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// initialize batch
|
||||
const int n_prompts = prompts.size();
|
||||
struct llama_batch batch = llama_batch_init(n_batch, 0, 1);
|
||||
struct common_batch batch = common_batch(n_batch, 1);
|
||||
|
||||
// count number of embeddings
|
||||
int n_embd_count = 0;
|
||||
@@ -198,12 +198,12 @@ int main(int argc, char ** argv) {
|
||||
const uint64_t n_toks = inp.size();
|
||||
|
||||
// encode if at capacity
|
||||
if (batch.n_tokens + n_toks > n_batch) {
|
||||
if (batch.get_n_tokens() + n_toks > n_batch) {
|
||||
float * out = emb + e * n_embd;
|
||||
batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
|
||||
e += pooling_type == LLAMA_POOLING_TYPE_NONE ? batch.n_tokens : s;
|
||||
e += pooling_type == LLAMA_POOLING_TYPE_NONE ? batch.get_n_tokens() : s;
|
||||
s = 0;
|
||||
common_batch_clear(batch);
|
||||
batch.clear();
|
||||
}
|
||||
|
||||
// add to batch
|
||||
@@ -319,7 +319,6 @@ int main(int argc, char ** argv) {
|
||||
llama_perf_context_print(ctx);
|
||||
|
||||
// clean up
|
||||
llama_batch_free(batch);
|
||||
llama_backend_free();
|
||||
|
||||
return 0;
|
||||
|
||||
@@ -134,7 +134,8 @@ static bool run(llama_context * ctx, const common_params & params) {
|
||||
|
||||
std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, add_bos);
|
||||
|
||||
if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) {
|
||||
auto batch = llama_batch_ext_ptr::init_from_text(tokens.data(), tokens.size(), 0, 0, true);
|
||||
if (llama_decode_ext(ctx, batch.get())) {
|
||||
LOG_ERR("%s : failed to eval\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -421,7 +421,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
g_verbose = (params.verbosity > 1);
|
||||
try {
|
||||
lora_merge_ctx ctx(params.model.path, params.lora_adapters, params.out_file, params.cpuparams.n_threads);
|
||||
lora_merge_ctx ctx(params.model, params.lora_adapters, params.out_file, params.cpuparams.n_threads);
|
||||
ctx.run_merge();
|
||||
} catch (const std::exception & err) {
|
||||
fprintf(stderr, "%s\n", err.what());
|
||||
|
||||
@@ -408,6 +408,8 @@ static void gguf_merge(const split_params & split_params) {
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
std::ofstream fout(split_params.output.c_str(), std::ios::binary);
|
||||
fout.exceptions(std::ofstream::failbit); // fail fast on write errors
|
||||
|
||||
auto * ctx_out = gguf_init_empty();
|
||||
|
||||
@@ -451,6 +453,7 @@ static void gguf_merge(const split_params & split_params) {
|
||||
gguf_free(ctx_gguf);
|
||||
ggml_free(ctx_meta);
|
||||
gguf_free(ctx_out);
|
||||
fout.close();
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
@@ -463,6 +466,7 @@ static void gguf_merge(const split_params & split_params) {
|
||||
gguf_free(ctx_gguf);
|
||||
ggml_free(ctx_meta);
|
||||
gguf_free(ctx_out);
|
||||
fout.close();
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
@@ -475,6 +479,7 @@ static void gguf_merge(const split_params & split_params) {
|
||||
gguf_free(ctx_gguf);
|
||||
ggml_free(ctx_meta);
|
||||
gguf_free(ctx_out);
|
||||
fout.close();
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
@@ -495,11 +500,9 @@ static void gguf_merge(const split_params & split_params) {
|
||||
|
||||
fprintf(stderr, "\033[3Ddone\n");
|
||||
}
|
||||
std::ofstream fout;
|
||||
if (!split_params.dry_run) {
|
||||
fout.open(split_params.output.c_str(), std::ios::binary);
|
||||
fout.exceptions(std::ofstream::failbit); // fail fast on write errors
|
||||
// placeholder for the meta data
|
||||
|
||||
// placeholder for the meta data
|
||||
{
|
||||
auto meta_size = gguf_get_meta_size(ctx_out);
|
||||
::zeros(fout, meta_size);
|
||||
}
|
||||
@@ -515,9 +518,7 @@ static void gguf_merge(const split_params & split_params) {
|
||||
ggml_free(ctx_metas[i]);
|
||||
}
|
||||
gguf_free(ctx_out);
|
||||
if (!split_params.dry_run) {
|
||||
fout.close();
|
||||
}
|
||||
fout.close();
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
fprintf(stderr, "%s: writing tensors %s ...", __func__, split_path);
|
||||
@@ -539,11 +540,10 @@ static void gguf_merge(const split_params & split_params) {
|
||||
auto offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i_tensor);
|
||||
f_input.seekg(offset);
|
||||
f_input.read((char *)read_data.data(), n_bytes);
|
||||
if (!split_params.dry_run) {
|
||||
// write tensor data + padding
|
||||
fout.write((const char *)read_data.data(), n_bytes);
|
||||
zeros(fout, GGML_PAD(n_bytes, GGUF_DEFAULT_ALIGNMENT) - n_bytes);
|
||||
}
|
||||
|
||||
// write tensor data + padding
|
||||
fout.write((const char *)read_data.data(), n_bytes);
|
||||
zeros(fout, GGML_PAD(n_bytes, GGUF_DEFAULT_ALIGNMENT) - n_bytes);
|
||||
}
|
||||
|
||||
gguf_free(ctx_gguf);
|
||||
@@ -552,15 +552,16 @@ static void gguf_merge(const split_params & split_params) {
|
||||
fprintf(stderr, "\033[3Ddone\n");
|
||||
}
|
||||
|
||||
if (!split_params.dry_run) {
|
||||
{
|
||||
// go back to beginning of file and write the updated metadata
|
||||
fout.seekp(0);
|
||||
std::vector<uint8_t> data(gguf_get_meta_size(ctx_out));
|
||||
gguf_get_meta_data(ctx_out, data.data());
|
||||
fout.write((const char *)data.data(), data.size());
|
||||
|
||||
fout.close();
|
||||
gguf_free(ctx_out);
|
||||
}
|
||||
gguf_free(ctx_out);
|
||||
|
||||
fprintf(stderr, "%s: %s merged from %d split with %d tensors.\n",
|
||||
__func__, split_params.output.c_str(), n_split, total_tensors);
|
||||
|
||||
@@ -13,10 +13,10 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
|
||||
const llama_model * model = llama_get_model(ctx);
|
||||
const llama_vocab * vocab = llama_model_get_vocab(model);
|
||||
|
||||
llama_batch batch = llama_batch_init(llama_n_batch(ctx), 0, 1);
|
||||
llama_batch_ext * batch = llama_batch_ext_init(llama_n_batch(ctx), 1);
|
||||
|
||||
for (uint64_t i = 0; i < sentences.size(); i++) {
|
||||
common_batch_clear(batch);
|
||||
llama_batch_ext_clear(batch);
|
||||
|
||||
const std::string input_string = instruction + sentences[i];
|
||||
|
||||
@@ -41,7 +41,8 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
|
||||
|
||||
// add input to batch (this increments n_tokens)
|
||||
for (int32_t j = 0; j < n_toks; j++) {
|
||||
common_batch_add(batch, inputs[j], j, { 0 }, j >= n_inst);
|
||||
const llama_seq_id seq_id = 0;
|
||||
llama_batch_ext_add_text(batch, inputs[j], j, &seq_id, 1 , j >= n_inst);
|
||||
}
|
||||
|
||||
// clear previous kv_cache values (irrelevant for embeddings)
|
||||
@@ -50,7 +51,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
|
||||
llama_set_causal_attn(ctx, false);
|
||||
|
||||
// run model
|
||||
llama_decode(ctx, batch);
|
||||
llama_decode_ext(ctx, batch);
|
||||
|
||||
// get embedding dimensions
|
||||
uint64_t n_embd = llama_model_n_embd(model);
|
||||
@@ -89,7 +90,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
|
||||
#endif
|
||||
}
|
||||
|
||||
llama_batch_free(batch);
|
||||
llama_batch_ext_free(batch);
|
||||
|
||||
return result;
|
||||
}
|
||||
@@ -106,25 +107,26 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std
|
||||
llama_set_embeddings(ctx, false);
|
||||
llama_set_causal_attn(ctx, true);
|
||||
|
||||
llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1);
|
||||
llama_batch_ext * bat = llama_batch_ext_init(llama_n_batch(ctx), 1);
|
||||
|
||||
std::vector<llama_token> inputs = common_tokenize(vocab, prompt, false, true);
|
||||
int32_t i_current_token = 0;
|
||||
|
||||
while (true) {
|
||||
common_batch_clear(bat);
|
||||
llama_batch_ext_clear(bat);
|
||||
{
|
||||
const int32_t n_inputs = inputs.size();
|
||||
|
||||
for (int32_t i = 0; i < n_inputs; i++) {
|
||||
common_batch_add(bat, inputs[i], i_current_token++, { 0 }, i == n_inputs - 1);
|
||||
const llama_seq_id seq_id = 0;
|
||||
llama_batch_ext_add_text(bat, inputs[i], i_current_token++, &seq_id, 1, i == n_inputs - 1);
|
||||
}
|
||||
}
|
||||
inputs.clear();
|
||||
|
||||
llama_decode(ctx, bat);
|
||||
llama_decode_ext(ctx, bat);
|
||||
|
||||
llama_token token = llama_sampler_sample(smpl, ctx, bat.n_tokens - 1);
|
||||
llama_token token = llama_sampler_sample(smpl, ctx, llama_batch_ext_get_n_tokens(bat) - 1);
|
||||
|
||||
if (token == eos_token) {
|
||||
break;
|
||||
@@ -145,7 +147,7 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std
|
||||
std::printf("\n");
|
||||
}
|
||||
|
||||
llama_batch_free(bat);
|
||||
llama_batch_ext_free(bat);
|
||||
|
||||
return result;
|
||||
}
|
||||
@@ -168,7 +170,7 @@ int main(int argc, char * argv[]) {
|
||||
|
||||
llama_backend_init();
|
||||
|
||||
llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
|
||||
llama_model * model = llama_model_load_from_file(params.model.c_str(), mparams);
|
||||
|
||||
// create generation context
|
||||
llama_context * ctx = llama_init_from_model(model, cparams);
|
||||
|
||||
@@ -497,7 +497,7 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) {
|
||||
// clear the KV cache
|
||||
llama_kv_self_clear(ctx);
|
||||
|
||||
llama_batch batch = llama_batch_init(n_batch, 0, 1);
|
||||
llama_batch_ext * batch = llama_batch_ext_init(n_batch, 1);
|
||||
|
||||
for (int j = 0; j < num_batches; ++j) {
|
||||
const int batch_start = start + j * n_batch;
|
||||
@@ -511,14 +511,15 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) {
|
||||
tokens[batch_start] = llama_vocab_bos(vocab);
|
||||
}
|
||||
|
||||
common_batch_clear(batch);
|
||||
llama_batch_ext_clear(batch);
|
||||
for (int i = 0; i < batch_size; i++) {
|
||||
common_batch_add(batch, tokens[batch_start + i], j*n_batch + i, {0}, true);
|
||||
const llama_seq_id seq_id = 0;
|
||||
llama_batch_ext_add_text(batch, tokens[batch_start + i], j*n_batch + i, &seq_id, 1, true);
|
||||
}
|
||||
|
||||
if (llama_decode(ctx, batch)) {
|
||||
if (llama_decode_ext(ctx, batch)) {
|
||||
LOG_ERR("%s : failed to eval\n", __func__);
|
||||
llama_batch_free(batch);
|
||||
llama_batch_ext_free(batch);
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -531,7 +532,7 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) {
|
||||
}
|
||||
}
|
||||
|
||||
llama_batch_free(batch);
|
||||
llama_batch_ext_free(batch);
|
||||
|
||||
const auto t_end = std::chrono::high_resolution_clock::now();
|
||||
|
||||
|
||||
@@ -353,7 +353,8 @@ int main(int argc, char ** argv) {
|
||||
|
||||
LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str());
|
||||
|
||||
if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval))) {
|
||||
auto batch = llama_batch_ext_ptr::init_from_text(&embd[i], n_eval, n_past, 0, true);
|
||||
if (llama_decode_ext(ctx, batch.get())) {
|
||||
LOG_ERR("%s : failed to eval\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -1427,7 +1427,7 @@ struct sql_printer : public printer {
|
||||
}
|
||||
};
|
||||
|
||||
static void test_prompt(llama_context * ctx, int n_prompt, int n_batch, int n_threads) {
|
||||
static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_batch, int n_threads) {
|
||||
llama_set_n_threads(ctx, n_threads, n_threads);
|
||||
|
||||
const llama_model * model = llama_get_model(ctx);
|
||||
@@ -1444,14 +1444,15 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_batch, int n_th
|
||||
for (int i = 1; i < n_tokens; i++) {
|
||||
tokens[i] = std::rand() % n_vocab;
|
||||
}
|
||||
llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens));
|
||||
auto batch = llama_batch_ext_ptr::init_from_text(tokens.data(), n_tokens, n_past + n_processed, 0, true);
|
||||
llama_decode_ext(ctx, batch.get());
|
||||
n_processed += n_tokens;
|
||||
}
|
||||
|
||||
llama_synchronize(ctx);
|
||||
}
|
||||
|
||||
static void test_gen(llama_context * ctx, int n_gen, int n_threads) {
|
||||
static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads) {
|
||||
llama_set_n_threads(ctx, n_threads, n_threads);
|
||||
|
||||
const llama_model * model = llama_get_model(ctx);
|
||||
@@ -1461,7 +1462,8 @@ static void test_gen(llama_context * ctx, int n_gen, int n_threads) {
|
||||
llama_token token = llama_vocab_get_add_bos(vocab) ? llama_vocab_bos(vocab) : std::rand() % n_vocab;
|
||||
|
||||
for (int i = 0; i < n_gen; i++) {
|
||||
llama_decode(ctx, llama_batch_get_one(&token, 1));
|
||||
auto batch = llama_batch_ext_ptr::init_from_text(&token, 1, n_past + i, 0, true);
|
||||
llama_decode_ext(ctx, batch.get());
|
||||
llama_synchronize(ctx);
|
||||
token = std::rand() % n_vocab;
|
||||
}
|
||||
@@ -1608,13 +1610,13 @@ int main(int argc, char ** argv) {
|
||||
fprintf(stderr, "llama-bench: benchmark %d/%zu: warmup prompt run\n", params_idx, params_count);
|
||||
}
|
||||
//test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads);
|
||||
test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
|
||||
test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
|
||||
}
|
||||
if (t.n_gen > 0) {
|
||||
if (params.progress) {
|
||||
fprintf(stderr, "llama-bench: benchmark %d/%zu: warmup generation run\n", params_idx, params_count);
|
||||
}
|
||||
test_gen(ctx, 1, t.n_threads);
|
||||
test_gen(ctx, 1, 0, t.n_threads);
|
||||
}
|
||||
|
||||
for (int i = 0; i < params.reps; i++) {
|
||||
@@ -1627,14 +1629,14 @@ int main(int argc, char ** argv) {
|
||||
fprintf(stderr, "llama-bench: benchmark %d/%zu: prompt run %d/%d\n", params_idx, params_count,
|
||||
i + 1, params.reps);
|
||||
}
|
||||
test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
|
||||
test_prompt(ctx, t.n_prompt, 0, t.n_batch, t.n_threads);
|
||||
}
|
||||
if (t.n_gen > 0) {
|
||||
if (params.progress) {
|
||||
fprintf(stderr, "llama-bench: benchmark %d/%zu: generation run %d/%d\n", params_idx, params_count,
|
||||
i + 1, params.reps);
|
||||
}
|
||||
test_gen(ctx, t.n_gen, t.n_threads);
|
||||
test_gen(ctx, t.n_gen, t.n_prompt, t.n_threads);
|
||||
}
|
||||
|
||||
uint64_t t_ns = get_time_ns() - t_start;
|
||||
|
||||
@@ -4,26 +4,6 @@
>
> This is very experimental, only used for demo purpose.

## Quick started

You can use pre-quantized model from [ggml-org](https://huggingface.co/ggml-org)'s Hugging Face account

```bash
# build
cmake -B build
cmake --build build --target llama-gemma3-cli

# alternatively, install from brew (MacOS)
brew install llama.cpp

# run it
llama-gemma3-cli -hf ggml-org/gemma-3-4b-it-GGUF
llama-gemma3-cli -hf ggml-org/gemma-3-12b-it-GGUF
llama-gemma3-cli -hf ggml-org/gemma-3-27b-it-GGUF

# note: 1B model does not support vision
```

## How to get mmproj.gguf?

```bash
@@ -1,273 +0,0 @@
|
||||
#include "ggml.h"
|
||||
#include "gguf.h"
|
||||
|
||||
#include <climits>
|
||||
#include <cstdarg>
|
||||
#include <string>
|
||||
#include <map>
|
||||
#include <sstream>
|
||||
#include <vector>
|
||||
|
||||
// Internal header for clip.cpp
|
||||
|
||||
#define KEY_FTYPE "general.file_type"
|
||||
#define KEY_NAME "general.name"
|
||||
#define KEY_DESCRIPTION "general.description"
|
||||
#define KEY_HAS_TEXT_ENC "clip.has_text_encoder"
|
||||
#define KEY_HAS_VIS_ENC "clip.has_vision_encoder"
|
||||
#define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector"
|
||||
#define KEY_HAS_MINICPMV_PROJ "clip.has_minicpmv_projector"
|
||||
#define KEY_HAS_GLM_PROJ "clip.has_glm_projector"
|
||||
#define KEY_MINICPMV_VERSION "clip.minicpmv_version"
|
||||
#define KEY_HAS_QWEN2VL_MERGER "clip.has_qwen2vl_merger"
|
||||
#define KEY_USE_GELU "clip.use_gelu"
|
||||
#define KEY_USE_SILU "clip.use_silu"
|
||||
#define KEY_N_EMBD "clip.%s.embedding_length"
|
||||
#define KEY_N_FF "clip.%s.feed_forward_length"
|
||||
#define KEY_N_BLOCK "clip.%s.block_count"
|
||||
#define KEY_N_HEAD "clip.%s.attention.head_count"
|
||||
#define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon"
|
||||
#define KEY_PROJ_DIM "clip.%s.projection_dim"
|
||||
#define KEY_TOKENS "tokenizer.ggml.tokens"
|
||||
#define KEY_N_POSITIONS "clip.text.context_length"
|
||||
#define KEY_IMAGE_SIZE "clip.vision.image_size"
|
||||
#define KEY_PATCH_SIZE "clip.vision.patch_size"
|
||||
#define KEY_IMAGE_MEAN "clip.vision.image_mean"
|
||||
#define KEY_IMAGE_STD "clip.vision.image_std"
|
||||
#define KEY_PROJ_TYPE "clip.projector_type"
|
||||
#define KEY_FEATURE_LAYER "clip.vision.feature_layer"
|
||||
|
||||
#define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type"
|
||||
#define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints"
|
||||
#define KEY_IMAGE_CROP_RESOLUTION "clip.vision.image_crop_resolution"
|
||||
|
||||
|
||||
//
|
||||
// tensor name constants
|
||||
//
|
||||
|
||||
#define TN_TOKEN_EMBD "%s.token_embd.weight"
|
||||
#define TN_POS_EMBD "%s.position_embd.weight"
|
||||
#define TN_CLASS_EMBD "v.class_embd"
|
||||
#define TN_PATCH_EMBD "v.patch_embd.weight" // not rename tensor with ".0" postfix for backwrad compat
|
||||
#define TN_PATCH_EMBD_1 "v.patch_embd.weight.1"
|
||||
#define TN_PATCH_BIAS "v.patch_embd.bias"
|
||||
#define TN_ATTN_K "%s.blk.%d.attn_k.%s"
|
||||
#define TN_ATTN_Q "%s.blk.%d.attn_q.%s"
|
||||
#define TN_ATTN_V "%s.blk.%d.attn_v.%s"
|
||||
#define TN_ATTN_OUTPUT "%s.blk.%d.attn_out.%s"
|
||||
#define TN_FFN_DOWN "%s.blk.%d.ffn_down.%s"
|
||||
#define TN_FFN_UP "%s.blk.%d.ffn_up.%s"
|
||||
#define TN_LN_1 "%s.blk.%d.ln1.%s"
|
||||
#define TN_LN_2 "%s.blk.%d.ln2.%s"
|
||||
#define TN_LN_PRE "%s.pre_ln.%s"
|
||||
#define TN_LN_POST "%s.post_ln.%s"
|
||||
#define TN_TEXT_PROJ "text_projection.weight"
|
||||
#define TN_VIS_PROJ "visual_projection.weight"
|
||||
#define TN_LLAVA_PROJ "mm.%d.%s"
|
||||
#define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s"
|
||||
#define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
|
||||
#define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s"
|
||||
#define TN_IMAGE_NEWLINE "model.image_newline"
|
||||
#define TN_MM_INP_PROJ "mm.input_projection.weight" // gemma3
|
||||
#define TN_MM_SOFT_EMB_N "mm.soft_emb_norm.weight" // gemma3
|
||||
|
||||
// mimicpmv
|
||||
#define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k"
|
||||
#define TN_MINICPMV_QUERY "resampler.query"
|
||||
#define TN_MINICPMV_PROJ "resampler.proj.weight"
|
||||
#define TN_MINICPMV_KV_PROJ "resampler.kv.weight"
|
||||
#define TN_MINICPMV_ATTN "resampler.attn.%s.%s"
|
||||
#define TN_MINICPMV_LN "resampler.ln_%s.%s"
|
||||
|
||||
#define TN_GLM_ADAPER_CONV "adapter.conv.%s"
|
||||
#define TN_GLM_ADAPTER_LINEAR "adapter.linear.linear.%s"
|
||||
#define TN_GLM_ADAPTER_NORM_1 "adapter.linear.norm1.%s"
|
||||
#define TN_GLM_ADAPTER_D_H_2_4H "adapter.linear.dense_h_to_4h.%s"
|
||||
#define TN_GLM_ADAPTER_GATE "adapter.linear.gate.%s"
|
||||
#define TN_GLM_ADAPTER_D_4H_2_H "adapter.linear.dense_4h_to_h.%s"
|
||||
#define TN_GLM_BOI_W "adapter.boi"
|
||||
#define TN_GLM_EOI_W "adapter.eoi"
|
||||
|
||||
enum projector_type {
|
||||
PROJECTOR_TYPE_MLP,
|
||||
PROJECTOR_TYPE_MLP_NORM,
|
||||
PROJECTOR_TYPE_LDP,
|
||||
PROJECTOR_TYPE_LDPV2,
|
||||
PROJECTOR_TYPE_RESAMPLER,
|
||||
PROJECTOR_TYPE_GLM_EDGE,
|
||||
PROJECTOR_TYPE_MERGER,
|
||||
PROJECTOR_TYPE_GEMMA3,
|
||||
PROJECTOR_TYPE_UNKNOWN,
|
||||
};
|
||||
|
||||
static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
|
||||
{ PROJECTOR_TYPE_MLP, "mlp" },
|
||||
{ PROJECTOR_TYPE_LDP, "ldp" },
|
||||
{ PROJECTOR_TYPE_LDPV2, "ldpv2"},
|
||||
{ PROJECTOR_TYPE_RESAMPLER, "resampler"},
|
||||
{ PROJECTOR_TYPE_GLM_EDGE, "adapter"},
|
||||
{ PROJECTOR_TYPE_MERGER, "qwen2vl_merger"},
|
||||
{ PROJECTOR_TYPE_GEMMA3, "gemma3"},
|
||||
};
|
||||
|
||||
static projector_type clip_projector_type_from_string(const std::string & str) {
|
||||
for (const auto & pair : PROJECTOR_TYPE_NAMES) {
|
||||
if (pair.second == str) {
|
||||
return pair.first;
|
||||
}
|
||||
}
|
||||
return PROJECTOR_TYPE_UNKNOWN;
|
||||
}
|
||||
|
||||
//
|
||||
// logging
|
||||
//
|
||||
|
||||
static void clip_log_callback_default(enum ggml_log_level level, const char * text, void * user_data) {
|
||||
(void) level;
|
||||
(void) user_data;
|
||||
fputs(text, stderr);
|
||||
fflush(stderr);
|
||||
}
|
||||
|
||||
struct clip_logger_state {
|
||||
ggml_log_level verbosity_thold;
|
||||
ggml_log_callback log_callback;
|
||||
void * log_callback_user_data;
|
||||
};
|
||||
|
||||
extern struct clip_logger_state g_logger_state;
|
||||
|
||||
static void clip_log_internal_v(enum ggml_log_level level, const char * format, va_list args) {
|
||||
if (format == NULL) {
|
||||
return;
|
||||
}
|
||||
va_list args_copy;
|
||||
va_copy(args_copy, args);
|
||||
char buffer[128];
|
||||
int len = vsnprintf(buffer, 128, format, args);
|
||||
if (len < 128) {
|
||||
g_logger_state.log_callback(level, buffer, g_logger_state.log_callback_user_data);
|
||||
} else {
|
||||
char * buffer2 = (char *) calloc(len + 1, sizeof(char));
|
||||
vsnprintf(buffer2, len + 1, format, args_copy);
|
||||
buffer2[len] = 0;
|
||||
g_logger_state.log_callback(level, buffer2, g_logger_state.log_callback_user_data);
|
||||
free(buffer2);
|
||||
}
|
||||
va_end(args_copy);
|
||||
}
|
||||
|
||||
static void clip_log_internal(enum ggml_log_level level, const char * format, ...) {
|
||||
va_list args;
|
||||
va_start(args, format);
|
||||
clip_log_internal_v(level, format, args);
|
||||
va_end(args);
|
||||
}
|
||||
|
||||
#define LOG_TMPL(level, ...) \
|
||||
do { \
|
||||
if ((level) >= g_logger_state.verbosity_thold) { \
|
||||
clip_log_internal((level), __VA_ARGS__); \
|
||||
} \
|
||||
} while (0)
|
||||
#define LOG_INF(...) LOG_TMPL(GGML_LOG_LEVEL_INFO, __VA_ARGS__)
|
||||
#define LOG_WRN(...) LOG_TMPL(GGML_LOG_LEVEL_WARN, __VA_ARGS__)
|
||||
#define LOG_ERR(...) LOG_TMPL(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
|
||||
#define LOG_DBG(...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
|
||||
#define LOG_CNT(...) LOG_TMPL(GGML_LOG_LEVEL_CONT, __VA_ARGS__)
|
||||
|
||||
//
|
||||
// common utils
|
||||
//
|
||||
|
||||
static std::string string_format(const char * fmt, ...) {
|
||||
va_list ap;
|
||||
va_list ap2;
|
||||
va_start(ap, fmt);
|
||||
va_copy(ap2, ap);
|
||||
int size = vsnprintf(NULL, 0, fmt, ap);
|
||||
GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
|
||||
std::vector<char> buf(size + 1);
|
||||
int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
|
||||
GGML_ASSERT(size2 == size);
|
||||
va_end(ap2);
|
||||
va_end(ap);
|
||||
return std::string(buf.data(), buf.size());
|
||||
}
|
||||
|
||||
static void string_replace_all(std::string & s, const std::string & search, const std::string & replace) {
|
||||
if (search.empty()) {
|
||||
return;
|
||||
}
|
||||
std::string builder;
|
||||
builder.reserve(s.length());
|
||||
size_t pos = 0;
|
||||
size_t last_pos = 0;
|
||||
while ((pos = s.find(search, last_pos)) != std::string::npos) {
|
||||
builder.append(s, last_pos, pos - last_pos);
|
||||
builder.append(replace);
|
||||
last_pos = pos + search.length();
|
||||
}
|
||||
builder.append(s, last_pos, std::string::npos);
|
||||
s = std::move(builder);
|
||||
}
|
||||
|
||||
//
|
||||
// gguf utils
|
||||
//
|
||||
|
||||
static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) {
|
||||
switch (type) {
|
||||
case GGUF_TYPE_UINT8: return std::to_string(((const uint8_t *)data)[i]);
|
||||
case GGUF_TYPE_INT8: return std::to_string(((const int8_t *)data)[i]);
|
||||
case GGUF_TYPE_UINT16: return std::to_string(((const uint16_t *)data)[i]);
|
||||
case GGUF_TYPE_INT16: return std::to_string(((const int16_t *)data)[i]);
|
||||
case GGUF_TYPE_UINT32: return std::to_string(((const uint32_t *)data)[i]);
|
||||
case GGUF_TYPE_INT32: return std::to_string(((const int32_t *)data)[i]);
|
||||
case GGUF_TYPE_UINT64: return std::to_string(((const uint64_t *)data)[i]);
|
||||
case GGUF_TYPE_INT64: return std::to_string(((const int64_t *)data)[i]);
|
||||
case GGUF_TYPE_FLOAT32: return std::to_string(((const float *)data)[i]);
|
||||
case GGUF_TYPE_FLOAT64: return std::to_string(((const double *)data)[i]);
|
||||
case GGUF_TYPE_BOOL: return ((const bool *)data)[i] ? "true" : "false";
|
||||
default: return string_format("unknown type %d", type);
|
||||
}
|
||||
}
|
||||
|
||||
static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
|
||||
const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
|
||||
|
||||
switch (type) {
|
||||
case GGUF_TYPE_STRING:
|
||||
return gguf_get_val_str(ctx_gguf, i);
|
||||
case GGUF_TYPE_ARRAY:
|
||||
{
|
||||
const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i);
|
||||
int arr_n = gguf_get_arr_n(ctx_gguf, i);
|
||||
const void * data = arr_type == GGUF_TYPE_STRING ? nullptr : gguf_get_arr_data(ctx_gguf, i);
|
||||
std::stringstream ss;
|
||||
ss << "[";
|
||||
for (int j = 0; j < arr_n; j++) {
|
||||
if (arr_type == GGUF_TYPE_STRING) {
|
||||
std::string val = gguf_get_arr_str(ctx_gguf, i, j);
|
||||
// escape quotes
|
||||
string_replace_all(val, "\\", "\\\\");
|
||||
string_replace_all(val, "\"", "\\\"");
|
||||
ss << '"' << val << '"';
|
||||
} else if (arr_type == GGUF_TYPE_ARRAY) {
|
||||
ss << "???";
|
||||
} else {
|
||||
ss << gguf_data_to_str(arr_type, data, j);
|
||||
}
|
||||
if (j < arr_n - 1) {
|
||||
ss << ", ";
|
||||
}
|
||||
}
|
||||
ss << "]";
|
||||
return ss.str();
|
||||
}
|
||||
default:
|
||||
return gguf_data_to_str(type, gguf_get_val_data(ctx_gguf, i), 0);
|
||||
}
|
||||
}
|
||||
File diff suppressed because it is too large
@@ -1,7 +1,6 @@
|
||||
#ifndef CLIP_H
|
||||
#define CLIP_H
|
||||
|
||||
#include "ggml.h"
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
@@ -42,7 +41,7 @@ struct clip_image_f32_batch {
|
||||
|
||||
struct clip_context_params {
|
||||
bool use_gpu;
|
||||
ggml_log_level verbosity;
|
||||
int verbosity;
|
||||
};
|
||||
|
||||
// deprecated, use clip_init
|
||||
|
||||
@@ -5,12 +5,13 @@
|
||||
#include "clip.h"
|
||||
#include "stb_image.h"
|
||||
#include "llama.h"
|
||||
#include "llama-cpp.h"
|
||||
#include "ggml.h"
|
||||
#include "console.h"
|
||||
|
||||
#include <vector>
|
||||
#include <limits.h>
|
||||
#include <cinttypes>
|
||||
#include <inttypes.h>
|
||||
|
||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
|
||||
#include <signal.h>
|
||||
@@ -63,7 +64,7 @@ struct gemma3_context {
|
||||
llama_model * model;
|
||||
llama_context * lctx;
|
||||
const llama_vocab * vocab;
|
||||
llama_batch batch;
|
||||
llama_batch_ext_ptr batch;
|
||||
|
||||
int n_threads = 1;
|
||||
llama_pos n_past = 0;
|
||||
@@ -73,17 +74,13 @@ struct gemma3_context {
|
||||
lctx = llama_init.context.get();
|
||||
vocab = llama_model_get_vocab(model);
|
||||
n_threads = params.cpuparams.n_threads;
|
||||
batch = llama_batch_init(params.n_batch, 0, 1);
|
||||
batch.reset(llama_batch_ext_init(params.n_batch, 1));
|
||||
init_clip_model(params);
|
||||
}
|
||||
|
||||
void init_clip_model(common_params & params) {
|
||||
const char * clip_path = params.mmproj.path.c_str();
|
||||
ctx_clip = clip_model_load(clip_path, GGML_LOG_LEVEL_INFO);
|
||||
if (!ctx_clip) {
|
||||
LOG_ERR("Failed to load CLIP model from %s\n", clip_path);
|
||||
exit(1);
|
||||
}
|
||||
const char * clip_path = params.mmproj.c_str();
|
||||
ctx_clip = clip_model_load(clip_path, params.verbosity > 1);
|
||||
}
|
||||
|
||||
~gemma3_context() {
|
||||
@@ -91,50 +88,18 @@ struct gemma3_context {
|
||||
}
|
||||
};
|
||||
|
||||
struct decode_embd_batch {
|
||||
std::vector<llama_pos> pos;
|
||||
std::vector<int32_t> n_seq_id;
|
||||
std::vector<llama_seq_id> seq_id_0;
|
||||
std::vector<llama_seq_id *> seq_ids;
|
||||
std::vector<int8_t> logits;
|
||||
llama_batch batch;
|
||||
decode_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
|
||||
pos .resize(n_tokens);
|
||||
n_seq_id.resize(n_tokens);
|
||||
seq_ids .resize(n_tokens + 1);
|
||||
logits .resize(n_tokens);
|
||||
seq_id_0.resize(1);
|
||||
seq_id_0[0] = seq_id;
|
||||
seq_ids [n_tokens] = nullptr;
|
||||
batch = {
|
||||
/*n_tokens =*/ n_tokens,
|
||||
/*tokens =*/ nullptr,
|
||||
/*embd =*/ embd,
|
||||
/*pos =*/ pos.data(),
|
||||
/*n_seq_id =*/ n_seq_id.data(),
|
||||
/*seq_id =*/ seq_ids.data(),
|
||||
/*logits =*/ logits.data(),
|
||||
};
|
||||
for (int i = 0; i < n_tokens; i++) {
|
||||
batch.pos [i] = pos_0 + i;
|
||||
batch.n_seq_id[i] = 1;
|
||||
batch.seq_id [i] = seq_id_0.data();
|
||||
batch.logits [i] = false;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
static int eval_text(gemma3_context & ctx, std::string input, bool logits_last = false) {
|
||||
llama_tokens tokens = common_tokenize(ctx.lctx, input, false, true);
|
||||
common_batch_clear(ctx.batch);
|
||||
llama_batch_ext_clear(ctx.batch.get());
|
||||
for (llama_token & t : tokens) {
|
||||
common_batch_add(ctx.batch, t, ctx.n_past++, {0}, false);
|
||||
llama_seq_id seq_id = 0;
|
||||
llama_batch_ext_add_text(ctx.batch.get(), t, ctx.n_past++, &seq_id, 1, false);
|
||||
}
|
||||
if (logits_last) {
|
||||
ctx.batch.logits[ctx.batch.n_tokens - 1] = true;
|
||||
llama_batch_ext_set_output_last(ctx.batch.get());
|
||||
}
|
||||
// LOG("eval_text (n_tokens = %d): %s\n", (int)tokens.size(), input.c_str());
|
||||
if (llama_decode(ctx.lctx, ctx.batch)) {
|
||||
if (llama_decode_ext(ctx.lctx, ctx.batch.get())) {
|
||||
LOG_ERR("Failed to decode text\n");
|
||||
return 1;
|
||||
}
|
||||
@@ -183,8 +148,8 @@ static int eval_image(gemma3_context & ctx, std::string & fname) {
|
||||
int64_t t1 = ggml_time_ms();
|
||||
eval_text(ctx, "<start_of_image>");
|
||||
llama_set_causal_attn(ctx.lctx, false);
|
||||
decode_embd_batch batch_img(image_embd_v.data(), n_tokens, ctx.n_past, 0);
|
||||
if (llama_decode(ctx.lctx, batch_img.batch)) {
|
||||
llama_batch_ext_ptr batch_img(llama_batch_ext_init_from_embd(image_embd_v.data(), n_tokens, n_embd, ctx.n_past, 0));
|
||||
if (llama_decode_ext(ctx.lctx, batch_img.get())) {
|
||||
LOG_ERR("failed to decode image\n");
|
||||
return 1;
|
||||
}
|
||||
@@ -214,9 +179,10 @@ static int generate_response(gemma3_context & ctx, common_sampler * smpl, int n_
|
||||
fflush(stdout);
|
||||
|
||||
// eval the token
|
||||
common_batch_clear(ctx.batch);
|
||||
common_batch_add(ctx.batch, token_id, ctx.n_past++, {0}, true);
|
||||
if (llama_decode(ctx.lctx, ctx.batch)) {
|
||||
llama_batch_ext_clear(ctx.batch.get());
|
||||
llama_seq_id seq_id = 0;
|
||||
llama_batch_ext_add_text(ctx.batch.get(), token_id, ctx.n_past++, &seq_id, 1, true);
|
||||
if (llama_decode_ext(ctx.lctx, ctx.batch.get())) {
|
||||
LOG_ERR("failed to decode token\n");
|
||||
return 1;
|
||||
}
|
||||
@@ -236,13 +202,13 @@ int main(int argc, char ** argv) {
|
||||
|
||||
common_init();
|
||||
|
||||
if (params.mmproj.path.empty()) {
|
||||
if (params.mmproj.empty()) {
|
||||
show_additional_info(argc, argv);
|
||||
return 1;
|
||||
}
|
||||
|
||||
gemma3_context ctx(params);
|
||||
printf("%s: %s\n", __func__, params.model.path.c_str());
|
||||
printf("%s: %s\n", __func__, params.model.c_str());
|
||||
|
||||
bool is_single_turn = !params.prompt.empty() && !params.image.empty();
|
||||
|
||||
|
||||
@@ -20,7 +20,8 @@ static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_toke
|
||||
if (n_eval > n_batch) {
|
||||
n_eval = n_batch;
|
||||
}
|
||||
if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval))) {
|
||||
auto batch = llama_batch_ext_ptr::init_from_text(&tokens[i], n_eval, *n_past, 0, true);
|
||||
if (llama_decode_ext(ctx_llama, batch.get())) {
|
||||
LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
|
||||
return false;
|
||||
}
|
||||
@@ -225,7 +226,7 @@ static struct llama_model * llava_init(common_params * params) {
|
||||
|
||||
llama_model_params model_params = common_model_params_to_llama(*params);
|
||||
|
||||
llama_model * model = llama_model_load_from_file(params->model.path.c_str(), model_params);
|
||||
llama_model * model = llama_model_load_from_file(params->model.c_str(), model_params);
|
||||
if (model == NULL) {
|
||||
LOG_ERR("%s: unable to load model\n" , __func__);
|
||||
return NULL;
|
||||
@@ -234,14 +235,14 @@ static struct llama_model * llava_init(common_params * params) {
|
||||
}
|
||||
|
||||
static struct llava_context * llava_init_context(common_params * params, llama_model * model) {
|
||||
const char * clip_path = params->mmproj.path.c_str();
|
||||
const char * clip_path = params->mmproj.c_str();
|
||||
|
||||
auto prompt = params->prompt;
|
||||
if (prompt.empty()) {
|
||||
prompt = "describe the image in detail.";
|
||||
}
|
||||
|
||||
auto ctx_clip = clip_model_load(clip_path, GGML_LOG_LEVEL_INFO);
|
||||
auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
|
||||
|
||||
llama_context_params ctx_params = common_context_params_to_llama(*params);
|
||||
ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings
|
||||
@@ -283,7 +284,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
common_init();
|
||||
|
||||
if (params.mmproj.path.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
|
||||
if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
|
||||
print_usage(argc, argv);
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
#include "llava.h"
|
||||
|
||||
#include "llama.h"
|
||||
#include "llama-cpp.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cerrno>
|
||||
@@ -438,39 +439,6 @@ bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, co
|
||||
return true;
|
||||
}
|
||||
|
||||
struct llava_embd_batch {
|
||||
std::vector<llama_pos> pos;
|
||||
std::vector<int32_t> n_seq_id;
|
||||
std::vector<llama_seq_id> seq_id_0;
|
||||
std::vector<llama_seq_id *> seq_ids;
|
||||
std::vector<int8_t> logits;
|
||||
llama_batch batch;
|
||||
llava_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
|
||||
pos .resize(n_tokens);
|
||||
n_seq_id.resize(n_tokens);
|
||||
seq_ids .resize(n_tokens + 1);
|
||||
logits .resize(n_tokens);
|
||||
seq_id_0.resize(1);
|
||||
seq_id_0[0] = seq_id;
|
||||
seq_ids [n_tokens] = nullptr;
|
||||
batch = {
|
||||
/*n_tokens =*/ n_tokens,
|
||||
/*tokens =*/ nullptr,
|
||||
/*embd =*/ embd,
|
||||
/*pos =*/ pos.data(),
|
||||
/*n_seq_id =*/ n_seq_id.data(),
|
||||
/*seq_id =*/ seq_ids.data(),
|
||||
/*logits =*/ logits.data(),
|
||||
};
|
||||
for (int i = 0; i < n_tokens; i++) {
|
||||
batch.pos [i] = pos_0 + i;
|
||||
batch.n_seq_id[i] = 1;
|
||||
batch.seq_id [i] = seq_id_0.data();
|
||||
batch.logits [i] = false;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed, int n_batch, int * n_past) {
|
||||
int n_embd = llama_model_n_embd(llama_get_model(ctx_llama));
|
||||
|
||||
@@ -480,8 +448,8 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
|
||||
n_eval = n_batch;
|
||||
}
|
||||
float * embd = image_embed->embed+i*n_embd;
|
||||
llava_embd_batch llava_batch = llava_embd_batch(embd, n_eval, *n_past, 0);
|
||||
if (llama_decode(ctx_llama, llava_batch.batch)) {
|
||||
auto batch = llama_batch_ext_ptr::init_from_embd(embd, n_eval, n_embd, 0, 0);
|
||||
if (llama_decode_ext(ctx_llama, batch.get())) {
|
||||
LOG_ERR("%s : failed to eval\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -31,7 +31,7 @@ static struct llama_model * llava_init(common_params * params) {
|
||||
|
||||
llama_model_params model_params = common_model_params_to_llama(*params);
|
||||
|
||||
llama_model * model = llama_model_load_from_file(params->model.path.c_str(), model_params);
|
||||
llama_model * model = llama_model_load_from_file(params->model.c_str(), model_params);
|
||||
if (model == NULL) {
|
||||
LOG_ERR("%s: unable to load model\n" , __func__);
|
||||
return NULL;
|
||||
@@ -80,7 +80,7 @@ static void llava_free(struct llava_context * ctx_llava) {
|
||||
}
|
||||
|
||||
static struct clip_ctx * clip_init_context(common_params * params) {
|
||||
const char * clip_path = params->mmproj.path.c_str();
|
||||
const char * clip_path = params->mmproj.c_str();
|
||||
|
||||
auto prompt = params->prompt;
|
||||
if (prompt.empty()) {
|
||||
@@ -88,7 +88,7 @@ static struct clip_ctx * clip_init_context(common_params * params) {
|
||||
}
|
||||
struct clip_context_params clip_params = {
|
||||
/* use_gpu */ params->n_gpu_layers != 0,
|
||||
/* verbosity */ GGML_LOG_LEVEL_INFO, // TODO: make this configurable
|
||||
/* verbosity */ params->verbosity,
|
||||
};
|
||||
auto * ctx_clip = clip_init(clip_path, clip_params);
|
||||
return ctx_clip;
|
||||
@@ -101,7 +101,8 @@ static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_toke
|
||||
if (n_eval > n_batch) {
|
||||
n_eval = n_batch;
|
||||
}
|
||||
if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval))) {
|
||||
auto batch = llama_batch_ext_ptr::init_from_text(&tokens[i], n_eval, *n_past, 0, true);
|
||||
if (llama_decode_ext(ctx_llama, batch.get())) {
|
||||
LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
|
||||
return false;
|
||||
}
|
||||
@@ -290,7 +291,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
common_init();
|
||||
|
||||
if (params.mmproj.path.empty() || (params.image.empty())) {
|
||||
if (params.mmproj.empty() || (params.image.empty())) {
|
||||
show_additional_info(argc, argv);
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -66,17 +66,11 @@ static bool qwen2vl_eval_image_embed(llama_context * ctx_llama, const struct lla
memcpy(&batch_mrope_pos[n_eval * 2], &mrope_pos[img_tokens * 2 + processed], n_eval * sizeof(llama_pos));
memcpy(&batch_mrope_pos[n_eval * 3], &mrope_pos[img_tokens * 3 + processed], n_eval * sizeof(llama_pos));

llama_batch batch = {
int32_t(n_eval), // n_tokens
nullptr, // token
(image_embed->embed+i*n_embd), // embed
batch_mrope_pos.data(), // pos
nullptr, // n_seq_id
nullptr, // seq_id
nullptr, // logits
};
float * batch_embd = image_embed->embed+i*n_embd;
auto batch = llama_batch_ext_ptr::init_from_embd(batch_embd, n_eval, n_embd, 0, 0);
llama_batch_ext_set_pos(batch.get(), batch_mrope_pos.data(), n_eval);

if (llama_decode(ctx_llama, batch)) {
if (llama_decode_ext(ctx_llama, batch.get())) {
LOG_ERR("%s : failed to eval\n", __func__);
return false;
}
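
Qwen2-VL uses M-RoPE, so each token carries four position components; with the extended batch they are applied after construction via `llama_batch_ext_set_pos` instead of through the raw `llama_batch::pos` field. The sketch below mirrors the hunk above; the section-by-section layout of the position buffer and the meaning of the count argument are assumptions drawn from these call sites, and the helper name is illustrative.

```cpp
#include "llama.h"
#include "llama-cpp.h"

#include <vector>

// Hedged sketch: decode one chunk of image embeddings with 4-section M-RoPE positions.
// Assumes batch_mrope_pos holds 4 * n_eval entries laid out section by section
// ([sec0][sec1][sec2][sec3]), matching the memcpy pattern above, and that the last
// argument of llama_batch_ext_set_pos is the token count, as in the hunk above.
static bool decode_embd_mrope(llama_context * ctx, const float * embd, int n_eval, int n_embd,
                              std::vector<llama_pos> & batch_mrope_pos) {
    auto batch = llama_batch_ext_ptr::init_from_embd(embd, n_eval, n_embd, 0, 0);
    llama_batch_ext_set_pos(batch.get(), batch_mrope_pos.data(), n_eval);

    return llama_decode_ext(ctx, batch.get()) == 0;
}
```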
|
||||
@@ -95,16 +89,24 @@ static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_toke
|
||||
if (n_eval > n_batch) {
|
||||
n_eval = n_batch;
|
||||
}
|
||||
auto batch = llama_batch_get_one(&tokens[i], n_eval);
|
||||
// TODO: add mrope pos ids somewhere else
|
||||
pos.resize(batch.n_tokens * 4);
|
||||
std::fill(pos.begin(), pos.end(), 0);
|
||||
for (int j = 0; j < batch.n_tokens * 3; j ++) {
|
||||
pos[j] = *st_pos_id + (j % batch.n_tokens);
|
||||
}
|
||||
batch.pos = pos.data();
|
||||
|
||||
if (llama_decode(ctx_llama, batch)) {
|
||||
// TODO: add mrope pos ids somewhere else
|
||||
int n_tokens = n_eval;
|
||||
pos.resize(n_tokens * 4);
|
||||
std::fill(pos.begin(), pos.end(), 0);
|
||||
for (int j = 0; j < n_tokens * 3; j ++) {
|
||||
pos[j] = *st_pos_id + (j % n_tokens);
|
||||
}
|
||||
|
||||
llama_batch_ext_ptr batch(llama_batch_ext_init(n_eval, 1));
|
||||
for (int j = 0; j < n_eval; j++) {
|
||||
llama_token token = tokens[i + j];
|
||||
llama_seq_id seq_id = 0;
|
||||
llama_batch_ext_add_text(batch.get(), token, pos[j], &seq_id, 1, false);
|
||||
}
|
||||
llama_batch_ext_set_output_last(batch.get());
|
||||
|
||||
if (llama_decode_ext(ctx_llama, batch.get())) {
|
||||
LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
|
||||
return false;
|
||||
}
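
The token path drops `llama_batch_get_one` in favour of appending tokens one by one and marking only the last one for logits. Below is a minimal sketch of that pattern, using only calls that appear in this diff; the helper name is illustrative and the second argument of `llama_batch_ext_init` is assumed to be the per-token sequence-id capacity, as the call sites suggest.

```cpp
#include "llama.h"
#include "llama-cpp.h"

// Hedged sketch: evaluate a span of tokens, requesting logits only for the last one.
static bool eval_tokens_chunk(llama_context * ctx, const llama_token * tokens, int n_eval, llama_pos pos_0) {
    llama_batch_ext_ptr batch(llama_batch_ext_init(n_eval, 1)); // capacity: n_eval tokens, 1 seq id per token (assumed)
    for (int j = 0; j < n_eval; j++) {
        llama_seq_id seq_id = 0;
        llama_batch_ext_add_text(batch.get(), tokens[j], pos_0 + j, &seq_id, 1, /*output=*/ false);
    }
    llama_batch_ext_set_output_last(batch.get()); // logits for the final token only

    return llama_decode_ext(ctx, batch.get()) == 0;
}
```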
|
||||
@@ -314,7 +316,7 @@ static struct llama_model * llava_init(common_params * params) {
|
||||
|
||||
llama_model_params model_params = common_model_params_to_llama(*params);
|
||||
|
||||
llama_model * model = llama_model_load_from_file(params->model.path.c_str(), model_params);
|
||||
llama_model * model = llama_model_load_from_file(params->model.c_str(), model_params);
|
||||
if (model == NULL) {
|
||||
LOG_ERR("%s: unable to load model\n" , __func__);
|
||||
return NULL;
|
||||
@@ -323,14 +325,14 @@ static struct llama_model * llava_init(common_params * params) {
|
||||
}
|
||||
|
||||
static struct llava_context * llava_init_context(common_params * params, llama_model * model) {
|
||||
const char * clip_path = params->mmproj.path.c_str();
|
||||
const char * clip_path = params->mmproj.c_str();
|
||||
|
||||
auto prompt = params->prompt;
|
||||
if (prompt.empty()) {
|
||||
prompt = "describe the image in detail.";
|
||||
}
|
||||
|
||||
auto ctx_clip = clip_model_load(clip_path, GGML_LOG_LEVEL_INFO);
|
||||
auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
|
||||
|
||||
llama_context_params ctx_params = common_context_params_to_llama(*params);
|
||||
ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings
|
||||
@@ -524,7 +526,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
common_init();
|
||||
|
||||
if (params.mmproj.path.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
|
||||
if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
|
||||
print_usage(argc, argv);
|
||||
return 1;
|
||||
}
|
||||
|
||||
Binary file not shown (image removed, previously 121 KiB).
@@ -1,81 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
# make sure we are in the right directory
|
||||
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
|
||||
cd $SCRIPT_DIR
|
||||
|
||||
#export LLAMA_CACHE="$SCRIPT_DIR/tmp"
|
||||
|
||||
set -eux
|
||||
|
||||
mkdir -p $SCRIPT_DIR/output
|
||||
|
||||
PROJ_ROOT="$SCRIPT_DIR/../.."
|
||||
cd $PROJ_ROOT
|
||||
|
||||
###############
|
||||
|
||||
arr_bin=()
|
||||
arr_hf=()
|
||||
|
||||
add_test() {
|
||||
local bin=$1
|
||||
local hf=$2
|
||||
arr_bin+=("$bin")
|
||||
arr_hf+=("$hf")
|
||||
}
|
||||
|
||||
add_test "llama-gemma3-cli" "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M"
|
||||
add_test "llama-llava-cli" "cmp-nct/Yi-VL-6B-GGUF:Q5_K"
|
||||
add_test "llama-llava-cli" "guinmoon/MobileVLM-3B-GGUF:Q4_K_M"
|
||||
add_test "llama-llava-cli" "THUDM/glm-edge-v-5b-gguf:Q4_K_M"
|
||||
add_test "llama-llava-cli" "second-state/Llava-v1.5-7B-GGUF:Q2_K"
|
||||
add_test "llama-llava-cli" "cjpais/llava-1.6-mistral-7b-gguf:Q3_K"
|
||||
add_test "llama-llava-cli" "ibm-research/granite-vision-3.2-2b-GGUF:Q4_K_M"
|
||||
add_test "llama-minicpmv-cli" "second-state/MiniCPM-Llama3-V-2_5-GGUF:Q2_K" # model from openbmb is corrupted
|
||||
add_test "llama-minicpmv-cli" "openbmb/MiniCPM-V-2_6-gguf:Q2_K"
|
||||
add_test "llama-minicpmv-cli" "openbmb/MiniCPM-o-2_6-gguf:Q4_0"
|
||||
add_test "llama-qwen2vl-cli" "bartowski/Qwen2-VL-2B-Instruct-GGUF:Q4_K_M"
|
||||
|
||||
###############
|
||||
|
||||
cmake --build build -j --target "${arr_bin[@]}"
|
||||
|
||||
arr_res=()
|
||||
|
||||
for i in "${!arr_bin[@]}"; do
|
||||
bin="${arr_bin[$i]}"
|
||||
hf="${arr_hf[$i]}"
|
||||
|
||||
echo "Running test with binary: $bin and HF model: $hf"
|
||||
echo ""
|
||||
echo ""
|
||||
|
||||
output=$("$PROJ_ROOT/build/bin/$bin" -hf "$hf" --image $SCRIPT_DIR/test-1.jpeg -p "what is the publisher name of the newspaper?" --temp 0 2>&1 | tee /dev/tty)
|
||||
|
||||
echo "$output" > $SCRIPT_DIR/output/$bin-$(echo "$hf" | tr '/' '-').log
|
||||
|
||||
if echo "$output" | grep -iq "new york"; then
|
||||
result="\033[32mOK\033[0m: $bin $hf"
|
||||
else
|
||||
result="\033[31mFAIL\033[0m: $bin $hf"
|
||||
fi
|
||||
echo -e "$result"
|
||||
arr_res+=("$result")
|
||||
|
||||
echo ""
|
||||
echo ""
|
||||
echo ""
|
||||
echo "#################################################"
|
||||
echo "#################################################"
|
||||
echo ""
|
||||
echo ""
|
||||
done
|
||||
|
||||
set +x
|
||||
|
||||
for i in "${!arr_res[@]}"; do
|
||||
echo -e "${arr_res[$i]}"
|
||||
done
|
||||
echo ""
|
||||
echo "Output logs are saved in $SCRIPT_DIR/output"
|
||||
@@ -92,8 +92,10 @@ int main(int argc, char ** argv) {
|
||||
const auto t_enc_start = ggml_time_us();
|
||||
|
||||
// eval the prompt
|
||||
llama_decode(ctx, llama_batch_get_one( inp.data(), n_input - 1));
|
||||
llama_decode(ctx, llama_batch_get_one(&inp.back(), 1));
|
||||
llama_batch_ext_ptr batch0(llama_batch_ext_init_from_text( inp.data(), n_input - 1, 0, 0, true));
|
||||
llama_batch_ext_ptr batch1(llama_batch_ext_init_from_text(&inp.back(), 1, n_input - 1, 0, true));
|
||||
llama_decode_ext(ctx, batch0.get());
|
||||
llama_decode_ext(ctx, batch1.get());
|
||||
|
||||
for (int s = 1; s < W + G + 1; ++s) {
|
||||
llama_kv_self_seq_cp(ctx, 0, s, -1, -1);
|
||||
@@ -115,7 +117,7 @@ int main(int argc, char ** argv) {
|
||||
// seq_id == 0 : the current input token
|
||||
// seq_id [1, W] : tokens from the past N - 1 Jacobi iterations
|
||||
// seq_id [W + 1, W + G] : verification n-grams
|
||||
llama_batch batch = llama_batch_init(params.n_ctx, 0, W + G + 1);
|
||||
llama_batch_ext * batch = llama_batch_ext_init(params.n_ctx, W + G + 1);
|
||||
|
||||
// target model sampling context
|
||||
struct common_sampler * smpl = common_sampler_init(model, params.sampling);
|
||||
@@ -204,10 +206,10 @@ int main(int argc, char ** argv) {
|
||||
// V V V V V V
|
||||
// id
|
||||
{
|
||||
common_batch_clear(batch);
|
||||
llama_batch_ext_clear(batch);
|
||||
|
||||
// current token - first token of the first level
|
||||
common_batch_add(batch, id, n_past, seq_id_all, true);
|
||||
llama_batch_ext_add_text(batch, id, n_past, seq_id_all.data(), seq_id_all.size(), true);
|
||||
|
||||
// verification n-grams - queue this before the lookahead tokens for less KV cache fragmentation
|
||||
{
|
||||
@@ -230,9 +232,10 @@ int main(int argc, char ** argv) {
|
||||
const llama_token t = ngrams_observed.tokens[idx + j];
|
||||
|
||||
ngrams_cur[g].tokens [j + 1] = t;
|
||||
ngrams_cur[g].i_batch[j + 1] = batch.n_tokens;
|
||||
ngrams_cur[g].i_batch[j + 1] = llama_batch_ext_get_n_tokens(batch);
|
||||
|
||||
common_batch_add(batch, t, n_past + j + 1, { W + 1 + g }, true);
|
||||
llama_seq_id seq_id = W + 1 + g;
|
||||
llama_batch_ext_add_text(batch, t, n_past + j + 1, &seq_id, 1, true);
|
||||
}
|
||||
}
|
||||
}
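
The lookahead decoder attaches the current token to every sequence at once (the input sequence, the W lookahead branches and the G verification branches) by handing the full sequence-id array to a single `llama_batch_ext_add_text` call. A hedged sketch of that broadcast, assuming the batch was created with `W + G + 1` sequences as above; the helper name is illustrative.

```cpp
#include "llama.h"

#include <vector>

// Hedged sketch: add one token that belongs to all sequences at once.
static void add_token_to_all_seqs(llama_batch_ext * batch, llama_token id, llama_pos pos, int n_seqs) {
    std::vector<llama_seq_id> seq_id_all(n_seqs);
    for (int s = 0; s < n_seqs; ++s) {
        seq_id_all[s] = s;
    }
    // one entry in the batch, visible to every sequence, with logits requested
    llama_batch_ext_add_text(batch, id, pos, seq_id_all.data(), seq_id_all.size(), true);
}
```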
|
||||
@@ -244,18 +247,20 @@ int main(int argc, char ** argv) {
|
||||
seq_id_look[j] = i + j + 1;
|
||||
}
|
||||
|
||||
common_batch_add(batch, tokens_j[0][i], n_past + i, seq_id_look, false);
|
||||
llama_batch_ext_add_text(batch, tokens_j[0][i], n_past + i,
|
||||
seq_id_look.data(), seq_id_look.size(), false);
|
||||
}
|
||||
|
||||
// fill the rest of the levels
|
||||
for (int j = 1; j < N - 1; j++) {
|
||||
for (int i = 0; i < W; i++) {
|
||||
common_batch_add(batch, tokens_j[j][i], n_past + j + i, { i + 1 }, j == N - 2);
|
||||
llama_seq_id seq_id = i + 1;
|
||||
llama_batch_ext_add_text(batch, tokens_j[j][i], n_past + j + i, &seq_id, 1, j == N - 2);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (llama_decode(ctx, batch) != 0) {
|
||||
if (llama_decode_ext(ctx, batch) != 0) {
|
||||
LOG_ERR("\n\n%s: llama_decode failed - increase KV cache size\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
@@ -475,7 +480,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
llama_kv_cache_view_free(&kvc_view);
|
||||
|
||||
llama_batch_free(batch);
|
||||
llama_batch_ext_free(batch);
|
||||
|
||||
llama_backend_free();
|
||||
|
||||
|
||||
@@ -91,8 +91,10 @@ int main(int argc, char ** argv){
|
||||
|
||||
const auto t_enc_start = ggml_time_us();
|
||||
|
||||
llama_decode(ctx, llama_batch_get_one( inp.data(), n_input - 1));
|
||||
llama_decode(ctx, llama_batch_get_one(&inp.back(), 1));
|
||||
llama_batch_ext_ptr batch0(llama_batch_ext_init_from_text( inp.data(), n_input - 1, 0, 0, true));
|
||||
llama_batch_ext_ptr batch1(llama_batch_ext_init_from_text(&inp.back(), 1, n_input - 1, 0, true));
|
||||
llama_decode_ext(ctx, batch0.get());
|
||||
llama_decode_ext(ctx, batch1.get());
|
||||
|
||||
const auto t_enc_end = ggml_time_us();
|
||||
|
||||
@@ -108,7 +110,7 @@ int main(int argc, char ** argv){
|
||||
|
||||
std::vector<llama_token> draft;
|
||||
|
||||
llama_batch batch_tgt = llama_batch_init(params.n_ctx, 0, 1);
|
||||
llama_batch_ext * batch_tgt = llama_batch_ext_init(params.n_ctx, 1);
|
||||
|
||||
// debug
|
||||
struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, 1);
|
||||
@@ -194,8 +196,9 @@ int main(int argc, char ** argv){
|
||||
// clean the cache of draft tokens that weren't accepted
|
||||
llama_kv_self_seq_rm(ctx, 0, n_past, -1);
|
||||
|
||||
common_batch_clear(batch_tgt);
|
||||
common_batch_add(batch_tgt, draft[0], n_past, { 0 }, true);
|
||||
const llama_seq_id seq_id = 0;
|
||||
llama_batch_ext_clear(batch_tgt);
|
||||
llama_batch_ext_add_text(batch_tgt, draft[0], n_past, &seq_id, 1, true);
|
||||
|
||||
// Draft already contains a single token sampled from the model:
|
||||
GGML_ASSERT(draft.size() == 1);
|
||||
@@ -205,13 +208,13 @@ int main(int argc, char ** argv){
|
||||
common_ngram_cache_draft(inp, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, ngram_cache_context, ngram_cache_dynamic, ngram_cache_static);
|
||||
|
||||
for (size_t i = 1; i < draft.size(); ++i) {
|
||||
common_batch_add(batch_tgt, draft[i], n_past + i, { 0 }, true);
|
||||
llama_batch_ext_add_text(batch_tgt, draft[i], n_past + i, &seq_id, 1, true);
|
||||
}
|
||||
|
||||
t_draft_us += ggml_time_us() - t_start_draft_us;
|
||||
n_drafted += draft.size() - 1;
|
||||
|
||||
llama_decode(ctx, batch_tgt);
|
||||
llama_decode_ext(ctx, batch_tgt);
|
||||
++n_past;
|
||||
|
||||
draft.erase(draft.begin());
|
||||
@@ -243,7 +246,7 @@ int main(int argc, char ** argv){
|
||||
|
||||
common_sampler_free(smpl);
|
||||
|
||||
llama_batch_free(batch_tgt);
|
||||
llama_batch_ext_free(batch_tgt);
|
||||
|
||||
llama_backend_free();
|
||||
|
||||
|
||||
@@ -548,7 +548,8 @@ int main(int argc, char ** argv) {
|
||||
int enc_input_size = embd_inp.size();
|
||||
llama_token * enc_input_buf = embd_inp.data();
|
||||
|
||||
if (llama_encode(ctx, llama_batch_get_one(enc_input_buf, enc_input_size))) {
|
||||
auto batch = llama_batch_ext_ptr::init_from_text(enc_input_buf, enc_input_size, 0, 0, true);
|
||||
if (llama_decode_ext(ctx, batch.get())) {
|
||||
LOG_ERR("%s : failed to eval\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
@@ -668,7 +669,8 @@ int main(int argc, char ** argv) {
|
||||
|
||||
LOG_DBG("eval: %s\n", string_from(ctx, embd).c_str());
|
||||
|
||||
if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval))) {
|
||||
auto batch = llama_batch_ext_ptr::init_from_text(&embd[i], n_eval, n_past, 0, true);
|
||||
if (llama_decode_ext(ctx, batch.get())) {
|
||||
LOG_ERR("%s : failed to eval\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -106,8 +106,6 @@ int main(int argc, char ** argv) {
|
||||
|
||||
common_params params;
|
||||
|
||||
params.n_predict = 128;
|
||||
|
||||
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PARALLEL)) {
|
||||
return 1;
|
||||
}
|
||||
@@ -176,7 +174,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// the max batch size is as large as the context to handle cases where we get very long input prompt from multiple
|
||||
// users. regardless of the size, the main loop will chunk the batch into a maximum of params.n_batch tokens at a time
|
||||
llama_batch batch = llama_batch_init(n_ctx, 0, 1);
|
||||
llama_batch_ext * batch = llama_batch_ext_init(n_ctx, 1);
|
||||
|
||||
int32_t n_total_prompt = 0;
|
||||
int32_t n_total_gen = 0;
|
||||
@@ -194,10 +192,11 @@ int main(int argc, char ** argv) {
|
||||
LOG_INF("%s: Evaluating the system prompt ...\n", __func__);
|
||||
|
||||
for (int32_t i = 0; i < n_tokens_system; ++i) {
|
||||
common_batch_add(batch, tokens_system[i], i, { 0 }, false);
|
||||
llama_seq_id seq_id = 0;
|
||||
llama_batch_ext_add_text(batch, tokens_system[i], i, &seq_id, 1, false);
|
||||
}
|
||||
|
||||
if (llama_decode(ctx, batch) != 0) {
|
||||
if (llama_decode_ext(ctx, batch) != 0) {
|
||||
LOG_ERR("%s: llama_decode() failed\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
@@ -218,7 +217,7 @@ int main(int argc, char ** argv) {
|
||||
common_kv_cache_dump_view_seqs(kvc_view, 40);
|
||||
}
|
||||
|
||||
common_batch_clear(batch);
|
||||
llama_batch_ext_clear(batch);
|
||||
|
||||
// decode any currently ongoing sequences
|
||||
for (auto & client : clients) {
|
||||
@@ -226,14 +225,15 @@ int main(int argc, char ** argv) {
|
||||
continue;
|
||||
}
|
||||
|
||||
client.i_batch = batch.n_tokens;
|
||||
client.i_batch = llama_batch_ext_get_n_tokens(batch);
|
||||
|
||||
common_batch_add(batch, client.sampled, n_tokens_system + client.n_prompt + client.n_decoded, { client.id + 1 }, true);
|
||||
llama_seq_id seq_id = client.id + 1;
|
||||
llama_batch_ext_add_text(batch, client.sampled, n_tokens_system + client.n_prompt + client.n_decoded, &seq_id, 1, true);
|
||||
|
||||
client.n_decoded += 1;
|
||||
}
|
||||
|
||||
if (batch.n_tokens == 0) {
|
||||
if (llama_batch_ext_get_n_tokens(batch) == 0) {
|
||||
// all sequences have ended - clear the entire KV cache
|
||||
for (int i = 1; i <= n_clients; ++i) {
|
||||
llama_kv_self_seq_rm(ctx, i, -1, -1);
|
||||
@@ -245,7 +245,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
// insert new sequences for decoding
|
||||
if (cont_batching || batch.n_tokens == 0) {
|
||||
if (cont_batching || llama_batch_ext_get_n_tokens(batch) == 0) {
|
||||
for (auto & client : clients) {
|
||||
if (client.seq_id == -1 && g_seq_id < n_seq) {
|
||||
client.seq_id = g_seq_id;
|
||||
@@ -264,17 +264,18 @@ int main(int argc, char ** argv) {
|
||||
tokens_prompt = common_tokenize(ctx, client.prompt, false);
|
||||
|
||||
for (size_t i = 0; i < tokens_prompt.size(); ++i) {
|
||||
common_batch_add(batch, tokens_prompt[i], i + n_tokens_system, { client.id + 1 }, false);
|
||||
llama_seq_id seq_id = client.id + 1;
|
||||
llama_batch_ext_add_text(batch, tokens_prompt[i], i + n_tokens_system, &seq_id, 1, false);
|
||||
}
|
||||
|
||||
// extract the logits only for the last token
|
||||
if (batch.n_tokens > 0) {
|
||||
batch.logits[batch.n_tokens - 1] = true;
|
||||
if (llama_batch_ext_get_n_tokens(batch) > 0) {
|
||||
llama_batch_ext_set_output_last(batch);
|
||||
}
|
||||
|
||||
client.n_prompt = tokens_prompt.size();
|
||||
client.n_decoded = 0;
|
||||
client.i_batch = batch.n_tokens - 1;
|
||||
client.i_batch = llama_batch_ext_get_n_tokens(batch) - 1;
|
||||
|
||||
LOG_INF("\033[31mClient %3d, seq %4d, started decoding ...\033[0m\n", client.id, client.seq_id);
|
||||
|
||||
@@ -288,14 +289,15 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
}
|
||||
|
||||
if (batch.n_tokens == 0) {
|
||||
if (llama_batch_ext_get_n_tokens(batch) == 0) {
|
||||
break;
|
||||
}
|
||||
|
||||
// process in chunks of params.n_batch
|
||||
int32_t n_batch = params.n_batch;
|
||||
|
||||
for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
|
||||
int32_t n_tokens_in_batch = llama_batch_ext_get_n_tokens(batch);
|
||||
for (int32_t i = 0; i < (int32_t) n_tokens_in_batch; i += n_batch) {
|
||||
// experiment: process in powers of 2
|
||||
//if (i + n_batch > (int32_t) batch.n_tokens && n_batch > 32) {
|
||||
// n_batch /= 2;
|
||||
@@ -303,19 +305,11 @@ int main(int argc, char ** argv) {
|
||||
// continue;
|
||||
//}
|
||||
|
||||
const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
|
||||
const int32_t n_tokens = std::min(n_batch, (int32_t) (n_tokens_in_batch - i));
|
||||
|
||||
llama_batch batch_view = {
|
||||
n_tokens,
|
||||
batch.token + i,
|
||||
nullptr,
|
||||
batch.pos + i,
|
||||
batch.n_seq_id + i,
|
||||
batch.seq_id + i,
|
||||
batch.logits + i,
|
||||
};
|
||||
|
||||
const int ret = llama_decode(ctx, batch_view);
|
||||
llama_batch_ext * batch_view = llama_batch_ext_get_view(batch, i, n_tokens);
|
||||
const int ret = llama_decode_ext(ctx, batch_view);
|
||||
llama_batch_ext_free(batch_view);
|
||||
if (ret != 0) {
|
||||
if (n_batch == 1 || ret < 0) {
|
||||
// if you get here, it means the KV cache is full - try increasing it via the context size
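
Chunked decoding no longer slices the raw `llama_batch` fields by hand; it asks for a view of the extended batch instead. A minimal sketch of the loop, assuming (as the hunk above does) that a view must be freed separately while the parent batch stays alive; the function name is illustrative.

```cpp
#include "llama.h"

#include <algorithm>
#include <cstdint>

// Hedged sketch: decode an already-filled extended batch in chunks of n_batch tokens.
static int decode_in_chunks(llama_context * ctx, llama_batch_ext * batch, int32_t n_batch) {
    const int32_t n_tokens_all = llama_batch_ext_get_n_tokens(batch);

    for (int32_t i = 0; i < n_tokens_all; i += n_batch) {
        const int32_t n_tokens = std::min(n_batch, n_tokens_all - i);

        llama_batch_ext * view = llama_batch_ext_get_view(batch, i, n_tokens);
        const int ret = llama_decode_ext(ctx, view);
        llama_batch_ext_free(view); // the view is freed, the parent batch stays alive

        if (ret != 0) {
            return ret; // e.g. KV cache full; the caller may retry with a smaller n_batch
        }
    }
    return 0;
}
```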
|
||||
@@ -407,7 +401,7 @@ int main(int argc, char ** argv) {
|
||||
params.prompt_file = "used built-in defaults";
|
||||
}
|
||||
LOG_INF("External prompt file: \033[32m%s\033[0m\n", params.prompt_file.c_str());
|
||||
LOG_INF("Model and path used: \033[32m%s\033[0m\n\n", params.model.path.c_str());
|
||||
LOG_INF("Model and path used: \033[32m%s\033[0m\n\n", params.model.c_str());
|
||||
|
||||
LOG_INF("Total prompt tokens: %6d, speed: %5.2f t/s\n", n_total_prompt, (double) (n_total_prompt ) / (t_main_end - t_main_start) * 1e6);
|
||||
LOG_INF("Total gen tokens: %6d, speed: %5.2f t/s\n", n_total_gen, (double) (n_total_gen ) / (t_main_end - t_main_start) * 1e6);
|
||||
@@ -419,7 +413,7 @@ int main(int argc, char ** argv) {
|
||||
// TODO: print sampling/grammar timings for all clients
|
||||
llama_perf_context_print(ctx);
|
||||
|
||||
llama_batch_free(batch);
|
||||
llama_batch_ext_free(batch);
|
||||
|
||||
llama_backend_free();
|
||||
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
#include "common.h"
|
||||
#include "log.h"
|
||||
#include "llama.h"
|
||||
#include "llama-cpp.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
@@ -64,7 +65,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
llama_model_params model_params = common_model_params_to_llama(params);
|
||||
|
||||
llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params);
|
||||
llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params);
|
||||
|
||||
if (model == NULL) {
|
||||
LOG_ERR("%s: unable to load model\n" , __func__);
|
||||
@@ -122,7 +123,7 @@ int main(int argc, char ** argv) {
|
||||
LOG_INF("prompt tokens: %d\n", n_tokens_all);
|
||||
//LOG_INF("prompt: %s\n", params.prompt.c_str());
|
||||
|
||||
llama_batch batch = llama_batch_init(params.n_batch, 0, 1);
|
||||
llama_batch_ext_ptr batch(llama_batch_ext_init(params.n_batch, 1));
|
||||
|
||||
int n_past = 0;
|
||||
|
||||
@@ -140,17 +141,18 @@ int main(int argc, char ** argv) {
|
||||
n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1;
|
||||
}
|
||||
|
||||
common_batch_clear(batch);
|
||||
llama_batch_ext_clear(batch.get());
|
||||
|
||||
for (int j = 0; j < n_batch && i + j < n_tokens_all; j++) {
|
||||
common_batch_add(batch, tokens_list[i + j], n_past++, { 0 }, false);
|
||||
llama_seq_id seq_id = 0;
|
||||
llama_batch_ext_add_text(batch.get(), tokens_list[i + j], n_past++, &seq_id, 1, false);
|
||||
}
|
||||
|
||||
if (i + n_batch >= n_tokens_all) {
|
||||
batch.logits[batch.n_tokens - 1] = true;
|
||||
llama_batch_ext_set_output_last(batch.get());
|
||||
}
|
||||
|
||||
if (llama_decode(ctx, batch) != 0) {
|
||||
if (llama_decode_ext(ctx, batch.get()) != 0) {
|
||||
LOG_INF("%s: llama_decode() failed\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
@@ -174,17 +176,18 @@ int main(int argc, char ** argv) {
|
||||
|
||||
n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1;
|
||||
|
||||
common_batch_clear(batch);
|
||||
llama_batch_ext_clear(batch.get());
|
||||
|
||||
for (int j = 0; j < n_batch && i + j < n_tokens_all; j++) {
|
||||
common_batch_add(batch, tokens_list[i + j], n_past++, { 0 }, false);
|
||||
llama_seq_id seq_id = 0;
|
||||
llama_batch_ext_add_text(batch.get(), tokens_list[i + j], n_past++, &seq_id, 1, false);
|
||||
}
|
||||
|
||||
if (i + n_batch >= n_tokens_all) {
|
||||
batch.logits[batch.n_tokens - 1] = true;
|
||||
llama_batch_ext_set_output_last(batch.get());
|
||||
}
|
||||
|
||||
if (llama_decode(ctx, batch) != 0) {
|
||||
if (llama_decode_ext(ctx, batch.get()) != 0) {
|
||||
LOG_ERR("%s: llama_decode() failed\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
@@ -223,7 +226,7 @@ int main(int argc, char ** argv) {
|
||||
while (n_cur <= n_len) {
|
||||
// sample the next token
|
||||
{
|
||||
const llama_token new_token_id = llama_sampler_sample(smpl, ctx, batch.n_tokens - 1);
|
||||
const llama_token new_token_id = llama_sampler_sample(smpl, ctx, llama_batch_ext_get_n_tokens(batch.get()) - 1);
|
||||
|
||||
// is it an end of generation?
|
||||
if (llama_vocab_is_eog(vocab, new_token_id) || n_cur == n_len) {
|
||||
@@ -237,16 +240,17 @@ int main(int argc, char ** argv) {
|
||||
n_decode += 1;
|
||||
|
||||
// prepare the next batch
|
||||
common_batch_clear(batch);
|
||||
llama_batch_ext_clear(batch.get());
|
||||
|
||||
// push this new token for next evaluation
|
||||
common_batch_add(batch, new_token_id, n_past++, { 0 }, true);
|
||||
llama_seq_id seq_id = 0;
|
||||
llama_batch_ext_add_text(batch.get(), new_token_id, n_past++, &seq_id, 1, true);
|
||||
}
|
||||
|
||||
n_cur += 1;
|
||||
|
||||
// evaluate the current batch with the transformer model
|
||||
if (llama_decode(ctx, batch)) {
|
||||
if (llama_decode_ext(ctx, batch.get())) {
|
||||
LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1);
|
||||
return 1;
|
||||
}
|
||||
@@ -266,8 +270,6 @@ int main(int argc, char ** argv) {
|
||||
|
||||
llama_sampler_free(smpl);
|
||||
|
||||
llama_batch_free(batch);
|
||||
|
||||
llama_free(ctx);
|
||||
llama_model_free(model);
|
||||
|
||||
|
||||
@@ -363,21 +363,20 @@ static results_perplexity perplexity_v2(llama_context * ctx, const common_params
|
||||
// clear the KV cache
|
||||
llama_kv_self_clear(ctx);
|
||||
|
||||
llama_batch batch = llama_batch_init(n_batch, 0, 1);
|
||||
common_batch batch(n_batch, 1);
|
||||
|
||||
for (int j = 0; j < num_batches; ++j) {
|
||||
const int batch_start = start + j * n_batch;
|
||||
const int batch_size = std::min(end - batch_start, n_batch);
|
||||
|
||||
common_batch_clear(batch);
|
||||
batch.clear();
|
||||
for (int i = 0; i < batch_size; i++) {
|
||||
common_batch_add(batch, tokens[batch_start + i], j*n_batch + i, {0}, true);
|
||||
batch.add_text(tokens[batch_start + i], j*n_batch + i, 0, true);
|
||||
}
|
||||
|
||||
//LOG_DBG(" Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch);
|
||||
if (llama_decode(ctx, batch)) {
|
||||
if (llama_decode_ext(ctx, batch.get())) {
|
||||
//LOG_ERR("%s : failed to eval\n", __func__);
|
||||
llama_batch_free(batch);
|
||||
return {tokens, -1, logit_history, prob_history};
|
||||
}
|
||||
|
||||
@@ -397,8 +396,6 @@ static results_perplexity perplexity_v2(llama_context * ctx, const common_params
|
||||
}
|
||||
}
|
||||
|
||||
llama_batch_free(batch);
|
||||
|
||||
const auto t_end = std::chrono::high_resolution_clock::now();
|
||||
|
||||
if (i == 0) {
|
||||
@@ -504,7 +501,7 @@ static results_perplexity perplexity(llama_context * ctx, const common_params &
|
||||
GGML_ASSERT(n_batch < n_ctx || n_batch % n_ctx == 0);
|
||||
GGML_ASSERT(params.n_ctx == n_seq * n_ctx);
|
||||
|
||||
llama_batch batch = llama_batch_init(std::min(n_batch, n_ctx*n_seq), 0, 1);
|
||||
common_batch batch(std::min(n_batch, n_ctx*n_seq), 1);
|
||||
|
||||
std::vector<float> logits;
|
||||
if (num_batches > 1) {
|
||||
@@ -555,7 +552,7 @@ static results_perplexity perplexity(llama_context * ctx, const common_params &
|
||||
|
||||
int n_outputs = 0;
|
||||
|
||||
batch.n_tokens = 0;
|
||||
batch.clear();
|
||||
for (int seq = 0; seq < n_seq_batch; seq++) {
|
||||
int seq_start = batch_start + seq*n_ctx;
|
||||
|
||||
@@ -568,22 +565,18 @@ static results_perplexity perplexity(llama_context * ctx, const common_params &
|
||||
}
|
||||
|
||||
for (int k = 0; k < batch_size; ++k) {
|
||||
const int idx = seq*n_ctx + k;
|
||||
batch.token [idx] = tokens[seq_start + k];
|
||||
batch.pos [idx] = j*n_batch + k;
|
||||
batch.n_seq_id[idx] = 1;
|
||||
batch.seq_id [idx][0] = seq;
|
||||
batch.logits [idx] = batch.pos[idx] >= first ? 1 : 0;
|
||||
const llama_pos pos = j*n_batch + k;
|
||||
bool output = pos >= first;
|
||||
batch.add_text(tokens[seq_start + k], pos, seq, output);
|
||||
|
||||
n_outputs += batch.logits[idx] != 0;
|
||||
n_outputs += output ? 1 : 0;
|
||||
}
|
||||
batch.n_tokens += batch_size;
|
||||
|
||||
// restore the original token in case it was set to BOS
|
||||
tokens[seq_start] = token_org;
|
||||
}
|
||||
|
||||
if (llama_decode(ctx, batch)) {
|
||||
if (llama_decode_ext(ctx, batch.get())) {
|
||||
LOG_INF("%s : failed to eval\n", __func__);
|
||||
return {tokens, -1, logit_history, prob_history};
|
||||
}
|
||||
@@ -653,36 +646,23 @@ static results_perplexity perplexity(llama_context * ctx, const common_params &
|
||||
LOG_ERR("Unexpected negative standard deviation of log(prob)\n");
|
||||
}
|
||||
|
||||
llama_batch_free(batch);
|
||||
|
||||
return {tokens, ppl, logit_history, prob_history};
|
||||
}
|
||||
|
||||
static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector<float> & batch_logits, int n_batch, int n_vocab) {
|
||||
static bool decode_helper(llama_context * ctx, common_batch & batch, std::vector<float> & batch_logits, int n_batch, int n_vocab) {
|
||||
int prev_outputs = 0;
|
||||
for (int i = 0; i < (int) batch.n_tokens; i += n_batch) {
|
||||
const int n_tokens = std::min<int>(n_batch, batch.n_tokens - i);
|
||||
for (int i = 0; i < (int) batch.get_n_tokens(); i += n_batch) {
|
||||
const int n_tokens = std::min<int>(n_batch, batch.get_n_tokens() - i);
|
||||
|
||||
llama_batch batch_view = {
|
||||
n_tokens,
|
||||
batch.token + i,
|
||||
nullptr,
|
||||
batch.pos + i,
|
||||
batch.n_seq_id + i,
|
||||
batch.seq_id + i,
|
||||
batch.logits + i,
|
||||
};
|
||||
common_batch batch_view = batch.get_view(i, n_tokens);
|
||||
|
||||
const int ret = llama_decode(ctx, batch_view);
|
||||
const int ret = llama_decode_ext(ctx, batch_view.get());
|
||||
if (ret != 0) {
|
||||
LOG_ERR("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
|
||||
return false;
|
||||
}
|
||||
|
||||
int n_outputs = 0;
|
||||
for (int i = 0; i < n_tokens; ++i) {
|
||||
n_outputs += batch_view.logits[i] != 0;
|
||||
}
|
||||
int n_outputs = batch_view.n_outputs;
|
||||
|
||||
memcpy(batch_logits.data() + size_t(prev_outputs)*n_vocab, llama_get_logits(ctx), size_t(n_outputs)*n_vocab*sizeof(float));
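
The perplexity tool now routes everything through a `common_batch` helper whose implementation is not part of this excerpt. The sketch below is therefore only an illustration of the interface implied by the call sites above (`clear`, `add_text`, `get`, `n_outputs`); the member semantics and the header providing `common_batch` are inferred, and the function name is illustrative.

```cpp
#include "common.h" // assumed home of common_batch
#include "llama.h"

#include <cstring>
#include <vector>

// Hedged usage sketch of the inferred common_batch interface.
static bool score_tokens(llama_context * ctx, const std::vector<llama_token> & tokens,
                         int n_batch, std::vector<float> & logits_out, int n_vocab) {
    common_batch batch(n_batch, /*n_seq_max=*/ 1);

    batch.clear();
    for (size_t i = 0; i < tokens.size() && (int) i < n_batch; ++i) {
        // only the last token of the chunk needs logits here
        const bool need_logits = (i + 1 == tokens.size()) || ((int) i + 1 == n_batch);
        batch.add_text(tokens[i], /*pos=*/ (llama_pos) i, /*seq_id=*/ 0, need_logits);
    }

    if (llama_decode_ext(ctx, batch.get()) != 0) {
        return false;
    }

    // n_outputs is assumed to count the tokens that requested logits, as in decode_helper above
    logits_out.resize((size_t) batch.n_outputs * n_vocab);
    memcpy(logits_out.data(), llama_get_logits(ctx), logits_out.size() * sizeof(float));
    return true;
}
```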
|
||||
|
||||
@@ -863,7 +843,7 @@ static void hellaswag_score(llama_context * ctx, const common_params & params) {
|
||||
const int max_tasks_per_batch = 32;
|
||||
const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_seq_max(ctx));
|
||||
|
||||
llama_batch batch = llama_batch_init(n_ctx, 0, 4);
|
||||
common_batch batch(n_ctx, 4);
|
||||
|
||||
std::vector<float> tok_logits(n_vocab);
|
||||
// TODO: this could be made smaller; it's currently the worst-case size
|
||||
@@ -879,7 +859,7 @@ static void hellaswag_score(llama_context * ctx, const common_params & params) {
|
||||
size_t i1 = i0;
|
||||
size_t i_logits = 0; // this tells us how many logits were needed before this point in the batch
|
||||
|
||||
common_batch_clear(batch);
|
||||
batch.clear();
|
||||
|
||||
// batch as much tasks as possible into the available context
|
||||
// each task has 4 unique sequence ids - one for each ending
|
||||
@@ -895,9 +875,9 @@ static void hellaswag_score(llama_context * ctx, const common_params & params) {
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < hs_cur.common_prefix; ++i) {
|
||||
common_batch_add(batch, hs_cur.seq_tokens[0][i], i, { s0 + 0, s0 + 1, s0 + 2, s0 + 3 }, false);
|
||||
batch.add_text_multi_seq(hs_cur.seq_tokens[0][i], i, { s0 + 0, s0 + 1, s0 + 2, s0 + 3 }, false);
|
||||
}
|
||||
batch.logits[batch.n_tokens - 1] = true; // we need logits for the last token of the common prefix
|
||||
llama_batch_ext_set_output_last(batch.get());
|
||||
n_logits += 1;
|
||||
|
||||
for (int s = 0; s < 4; ++s) {
|
||||
@@ -905,7 +885,7 @@ static void hellaswag_score(llama_context * ctx, const common_params & params) {
|
||||
// TODO: don't evaluate the last token of each sequence
|
||||
for (size_t i = hs_cur.common_prefix; i < seq_tokens_size; ++i) {
|
||||
const bool needs_logits = i < seq_tokens_size - 1;
|
||||
common_batch_add(batch, hs_cur.seq_tokens[s][i], i, { s0 + s }, needs_logits);
|
||||
batch.add_text_multi_seq(hs_cur.seq_tokens[s][i], i, { s0 + s }, needs_logits);
|
||||
n_logits += needs_logits;
|
||||
}
|
||||
}
|
||||
@@ -992,8 +972,6 @@ static void hellaswag_score(llama_context * ctx, const common_params & params) {
|
||||
i0 = i1 - 1;
|
||||
}
|
||||
|
||||
llama_batch_free(batch);
|
||||
|
||||
LOG("\n");
|
||||
}
|
||||
|
||||
@@ -1147,7 +1125,7 @@ static void winogrande_score(llama_context * ctx, const common_params & params)
|
||||
const int max_tasks_per_batch = 128;
|
||||
const int max_seq = std::min(2*max_tasks_per_batch, (int) llama_n_seq_max(ctx));
|
||||
|
||||
llama_batch batch = llama_batch_init(n_ctx, 0, 2);
|
||||
common_batch batch(n_ctx, 2);
|
||||
|
||||
std::vector<float> tok_logits(n_vocab);
|
||||
// TODO: this could be made smaller; it's currently the worst-case size
|
||||
@@ -1166,7 +1144,7 @@ static void winogrande_score(llama_context * ctx, const common_params & params)
|
||||
size_t i1 = i0;
|
||||
size_t i_logits = 0;
|
||||
|
||||
common_batch_clear(batch);
|
||||
batch.clear();
|
||||
|
||||
while (n_cur + (int) data[i1].required_tokens <= n_ctx) {
|
||||
int n_logits = 0;
|
||||
@@ -1176,15 +1154,15 @@ static void winogrande_score(llama_context * ctx, const common_params & params)
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < data[i1].common_prefix; ++i) {
|
||||
common_batch_add(batch, data[i1].seq_tokens[0][i], i, { s0 + 0, s0 + 1 }, false);
|
||||
batch.add_text_multi_seq(data[i1].seq_tokens[0][i], i, { s0 + 0, s0 + 1 }, false);
|
||||
}
|
||||
batch.logits[batch.n_tokens - 1] = true;
|
||||
llama_batch_ext_set_output_last(batch.get());
|
||||
n_logits += 1;
|
||||
|
||||
for (int s = 0; s < 2; ++s) {
|
||||
// TODO: end before the last token, no need to predict past the end of the sequences
|
||||
for (size_t i = data[i1].common_prefix; i < data[i1].seq_tokens[s].size(); ++i) {
|
||||
common_batch_add(batch, data[i1].seq_tokens[s][i], i, { s0 + s }, true);
|
||||
batch.add_text_multi_seq(data[i1].seq_tokens[s][i], i, { s0 + s }, true);
|
||||
n_logits += 1;
|
||||
}
|
||||
}
|
||||
@@ -1501,7 +1479,7 @@ static void multiple_choice_score(llama_context * ctx, const common_params & par
|
||||
const int max_tasks_per_batch = 32;
|
||||
const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_seq_max(ctx));
|
||||
|
||||
llama_batch batch = llama_batch_init(n_ctx, 0, max_seq);
|
||||
common_batch batch(n_ctx, max_seq);
|
||||
|
||||
std::vector<float> tok_logits(n_vocab);
|
||||
std::vector<float> batch_logits(size_t(n_ctx)*n_vocab);
|
||||
@@ -1521,7 +1499,7 @@ static void multiple_choice_score(llama_context * ctx, const common_params & par
|
||||
size_t i1 = i0;
|
||||
size_t i_logits = 0; // this tells us how many logits were needed before this point in the batch
|
||||
|
||||
common_batch_clear(batch);
|
||||
batch.clear();
|
||||
|
||||
// batch as much tasks as possible into the available context
|
||||
// each task has 4 unique sequence ids - one for each ending
|
||||
@@ -1544,9 +1522,9 @@ static void multiple_choice_score(llama_context * ctx, const common_params & par
|
||||
|
||||
for (size_t i = 0; i < cur_task.common_prefix; ++i) {
|
||||
//llama_batch_add(batch, cur_task.seq_tokens[0][i], i, { s0 + 0, s0 + 1, s0 + 2, s0 + 3}, false);
|
||||
common_batch_add(batch, cur_task.seq_tokens[0][i], i, batch_indeces, false);
|
||||
batch.add_text_multi_seq(cur_task.seq_tokens[0][i], i, batch_indeces, false);
|
||||
}
|
||||
batch.logits[batch.n_tokens - 1] = true; // we need logits for the last token of the common prefix
|
||||
llama_batch_ext_set_output_last(batch.get()); // we need logits for the last token of the common prefix
|
||||
n_logits += 1;
|
||||
|
||||
for (int s = 0; s < int(cur_task.seq_tokens.size()); ++s) {
|
||||
@@ -1554,7 +1532,7 @@ static void multiple_choice_score(llama_context * ctx, const common_params & par
|
||||
// TODO: don't evaluate the last token of each sequence
|
||||
for (size_t i = cur_task.common_prefix; i < seq_tokens_size; ++i) {
|
||||
const bool needs_logits = i < seq_tokens_size - 1;
|
||||
common_batch_add(batch, cur_task.seq_tokens[s][i], i, { s0 + s }, needs_logits);
|
||||
batch.add_text_multi_seq(cur_task.seq_tokens[s][i], i, { s0 + s }, needs_logits);
|
||||
n_logits += needs_logits;
|
||||
}
|
||||
}
|
||||
@@ -1653,8 +1631,6 @@ static void multiple_choice_score(llama_context * ctx, const common_params & par
|
||||
i0 = i1 - 1;
|
||||
}
|
||||
|
||||
llama_batch_free(batch);
|
||||
|
||||
if (n_done < 100 && (params.multiple_choice_tasks != 0 && params.multiple_choice_tasks < (size_t)n_task)) return;
|
||||
|
||||
float p = 1.f*n_correct/n_done;
|
||||
@@ -1767,7 +1743,7 @@ static void kl_divergence(llama_context * ctx, const common_params & params) {
|
||||
// clear the KV cache
|
||||
llama_kv_self_clear(ctx);
|
||||
|
||||
llama_batch batch = llama_batch_init(n_batch, 0, 1);
|
||||
common_batch batch(n_batch, 1);
|
||||
|
||||
for (int j = 0; j < num_batches; ++j) {
|
||||
const int batch_start = start + j * n_batch;
|
||||
@@ -1781,14 +1757,13 @@ static void kl_divergence(llama_context * ctx, const common_params & params) {
|
||||
tokens[batch_start] = llama_vocab_bos(vocab);
|
||||
}
|
||||
|
||||
common_batch_clear(batch);
|
||||
batch.clear();
|
||||
for (int i = 0; i < batch_size; i++) {
|
||||
common_batch_add(batch, tokens[batch_start + i], j*n_batch + i, {0}, true);
|
||||
batch.add_text_multi_seq(tokens[batch_start + i], j*n_batch + i, {0}, true);
|
||||
}
|
||||
|
||||
if (llama_decode(ctx, batch)) {
|
||||
if (llama_decode_ext(ctx, batch.get())) {
|
||||
LOG_ERR("%s : failed to eval\n", __func__);
|
||||
llama_batch_free(batch);
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -1801,8 +1776,6 @@ static void kl_divergence(llama_context * ctx, const common_params & params) {
|
||||
}
|
||||
}
|
||||
|
||||
llama_batch_free(batch);
|
||||
|
||||
const auto t_end = std::chrono::high_resolution_clock::now();
|
||||
|
||||
if (i == 0) {
|
||||
|
||||
@@ -74,40 +74,56 @@ static std::vector<chunk> chunk_file(const std::string & filename, int chunk_siz
|
||||
return chunks;
|
||||
}
|
||||
|
||||
static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, llama_seq_id seq_id) {
|
||||
static void batch_add_seq(common_batch & batch, const std::vector<int32_t> & tokens, llama_seq_id seq_id) {
|
||||
size_t n_tokens = tokens.size();
|
||||
for (size_t i = 0; i < n_tokens; i++) {
|
||||
common_batch_add(batch, tokens[i], i, { seq_id }, true);
|
||||
batch.add_text(tokens[i], i, seq_id, true);
|
||||
}
|
||||
}
|
||||
|
||||
static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) {
|
||||
static void batch_decode(llama_context * ctx, common_batch & batch, float * output, int n_seq, int n_embd, int embd_norm = 2) {
|
||||
const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
|
||||
const struct llama_model * model = llama_get_model(ctx);
|
||||
|
||||
// clear previous kv_cache values (irrelevant for embeddings)
|
||||
llama_kv_self_clear(ctx);
|
||||
|
||||
// run model
|
||||
LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
|
||||
if (llama_decode(ctx, batch) < 0) {
|
||||
LOG_ERR("%s : failed to decode\n", __func__);
|
||||
LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, llama_batch_ext_get_n_tokens(batch.get()), n_seq);
|
||||
if (llama_model_has_encoder(model) && !llama_model_has_decoder(model)) {
|
||||
// encoder-only model
|
||||
if (llama_encode_ext(ctx, batch.get()) < 0) {
|
||||
LOG_ERR("%s : failed to encode\n", __func__);
|
||||
}
|
||||
} else if (!llama_model_has_encoder(model) && llama_model_has_decoder(model)) {
|
||||
// decoder-only model
|
||||
if (llama_decode_ext(ctx, batch.get()) < 0) {
|
||||
LOG_ERR("%s : failed to decode\n", __func__);
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < batch.n_tokens; i++) {
|
||||
if (!batch.logits[i]) {
|
||||
for (int i = 0; i < llama_batch_ext_get_n_tokens(batch.get()); i++) {
|
||||
if (!batch.tokens[i].logits) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// try to get sequence embeddings - supported only when pooling_type is not NONE
|
||||
const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
|
||||
if (embd == NULL) {
|
||||
const float * embd = nullptr;
|
||||
int embd_pos = 0;
|
||||
|
||||
if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
|
||||
// try to get token embeddings
|
||||
embd = llama_get_embeddings_ith(ctx, i);
|
||||
if (embd == NULL) {
|
||||
LOG_ERR("%s: failed to get embeddings for token %d\n", __func__, i);
|
||||
continue;
|
||||
}
|
||||
embd_pos = i;
|
||||
GGML_ASSERT(embd != NULL && "failed to get token embeddings");
|
||||
} else {
|
||||
// try to get sequence embeddings - supported only when pooling_type is not NONE
|
||||
embd = llama_get_embeddings_seq(ctx, batch.tokens[i].seq_id);
|
||||
embd_pos = batch.tokens[i].seq_id;
|
||||
GGML_ASSERT(embd != NULL && "failed to get sequence embeddings");
|
||||
}
|
||||
|
||||
float * out = output + batch.seq_id[i][0] * n_embd;
|
||||
common_embd_normalize(embd, out, n_embd, 2);
|
||||
float * out = output + embd_pos * n_embd;
|
||||
common_embd_normalize(embd, out, n_embd, embd_norm);
|
||||
}
|
||||
}
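
batch_decode now checks the model architecture before running the batch: encoder-only models go through `llama_encode_ext`, decoder-only models through `llama_decode_ext`. A condensed sketch of that dispatch, with the pooling and normalization steps omitted; the function name is illustrative.

```cpp
#include "llama.h"

// Hedged sketch: run an embedding batch on either an encoder-only or a decoder-only model.
static bool run_embd_batch(llama_context * ctx, llama_batch_ext * batch) {
    const llama_model * model = llama_get_model(ctx);

    llama_kv_self_clear(ctx); // previous KV cache values are irrelevant for embeddings

    if (llama_model_has_encoder(model) && !llama_model_has_decoder(model)) {
        return llama_encode_ext(ctx, batch) >= 0; // encoder-only model
    }
    if (!llama_model_has_encoder(model) && llama_model_has_decoder(model)) {
        return llama_decode_ext(ctx, batch) >= 0; // decoder-only model
    }
    return false; // encoder-decoder models are not handled in this sketch
}
```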
|
||||
|
||||
@@ -214,7 +230,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// initialize batch
|
||||
const int n_chunks = chunks.size();
|
||||
struct llama_batch batch = llama_batch_init(n_batch, 0, 1);
|
||||
struct common_batch batch = common_batch(n_batch, 1);
|
||||
|
||||
// allocate output
|
||||
const int n_embd = llama_model_n_embd(model);
|
||||
@@ -231,10 +247,10 @@ int main(int argc, char ** argv) {
|
||||
const uint64_t n_toks = inp.size();
|
||||
|
||||
// encode if at capacity
|
||||
if (batch.n_tokens + n_toks > n_batch) {
|
||||
if (llama_batch_ext_get_n_tokens(batch.get()) + n_toks > n_batch) {
|
||||
float * out = emb + p * n_embd;
|
||||
batch_decode(ctx, batch, out, s, n_embd);
|
||||
common_batch_clear(batch);
|
||||
batch.clear();
|
||||
p += s;
|
||||
s = 0;
|
||||
}
|
||||
@@ -255,7 +271,7 @@ int main(int argc, char ** argv) {
|
||||
chunks[i].tokens.clear();
|
||||
}
|
||||
|
||||
struct llama_batch query_batch = llama_batch_init(n_batch, 0, 1);
|
||||
struct common_batch query_batch = common_batch(n_batch, 1);
|
||||
|
||||
// start loop, receive query and return top k similar chunks based on cosine similarity
|
||||
std::string query;
|
||||
@@ -269,7 +285,7 @@ int main(int argc, char ** argv) {
|
||||
std::vector<float> query_emb(n_embd, 0);
|
||||
batch_decode(ctx, query_batch, query_emb.data(), 1, n_embd);
|
||||
|
||||
common_batch_clear(query_batch);
|
||||
query_batch.clear();
|
||||
|
||||
// compute cosine similarities
|
||||
{
|
||||
@@ -299,6 +315,5 @@ int main(int argc, char ** argv) {
|
||||
llama_perf_context_print(ctx);
|
||||
|
||||
// clean up
|
||||
llama_batch_free(query_batch);
|
||||
llama_backend_free();
|
||||
}
|
||||
|
||||
@@ -1,4 +1,2 @@
set(TARGET rpc-server)
add_executable(${TARGET} rpc-server.cpp)
target_link_libraries(${TARGET} PRIVATE ggml)
target_compile_features(${TARGET} PRIVATE cxx_std_17)
add_executable(rpc-server rpc-server.cpp)
target_link_libraries(rpc-server PRIVATE ggml llama)
|
||||
|
||||
@@ -72,14 +72,3 @@ $ bin/llama-cli -m ../models/tinyllama-1b/ggml-model-f16.gguf -p "Hello, my name

This way you can offload model layers to both local and remote devices.

### Local cache

The RPC server can use a local cache to store large tensors and avoid transferring them over the network.
This can speed up model loading significantly, especially when using large models.
To enable the cache, use the `-c` option:

```bash
$ bin/rpc-server -c
```

By default, the cache is stored in the `$HOME/.cache/llama.cpp/rpc` directory and can be controlled via the `LLAMA_CACHE` environment variable.
|
||||
|
||||
@@ -1,7 +1,3 @@
|
||||
#if defined(_MSC_VER)
|
||||
#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
|
||||
#endif
|
||||
|
||||
#include "ggml-cpu.h"
|
||||
|
||||
#ifdef GGML_USE_CUDA
|
||||
@@ -22,142 +18,26 @@
|
||||
|
||||
#include "ggml-rpc.h"
|
||||
#ifdef _WIN32
|
||||
# define DIRECTORY_SEPARATOR '\\'
|
||||
# include <locale>
|
||||
# include <windows.h>
|
||||
# include <fcntl.h>
|
||||
# include <io.h>
|
||||
#else
|
||||
# define DIRECTORY_SEPARATOR '/'
|
||||
# include <unistd.h>
|
||||
# include <sys/stat.h>
|
||||
#endif
|
||||
#include <codecvt>
|
||||
#include <string>
|
||||
#include <stdio.h>
|
||||
#include <vector>
|
||||
#include <filesystem>
|
||||
|
||||
namespace fs = std::filesystem;
|
||||
|
||||
// NOTE: this is copied from common.cpp to avoid linking with libcommon
|
||||
// returns true if successful, false otherwise
|
||||
static bool fs_create_directory_with_parents(const std::string & path) {
|
||||
#ifdef _WIN32
|
||||
std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
|
||||
std::wstring wpath = converter.from_bytes(path);
|
||||
|
||||
// if the path already exists, check whether it's a directory
|
||||
const DWORD attributes = GetFileAttributesW(wpath.c_str());
|
||||
if ((attributes != INVALID_FILE_ATTRIBUTES) && (attributes & FILE_ATTRIBUTE_DIRECTORY)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
size_t pos_slash = 0;
|
||||
|
||||
// process path from front to back, procedurally creating directories
|
||||
while ((pos_slash = path.find('\\', pos_slash)) != std::string::npos) {
|
||||
const std::wstring subpath = wpath.substr(0, pos_slash);
|
||||
const wchar_t * test = subpath.c_str();
|
||||
|
||||
const bool success = CreateDirectoryW(test, NULL);
|
||||
if (!success) {
|
||||
const DWORD error = GetLastError();
|
||||
|
||||
// if the path already exists, ensure that it's a directory
|
||||
if (error == ERROR_ALREADY_EXISTS) {
|
||||
const DWORD attributes = GetFileAttributesW(subpath.c_str());
|
||||
if (attributes == INVALID_FILE_ATTRIBUTES || !(attributes & FILE_ATTRIBUTE_DIRECTORY)) {
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
pos_slash += 1;
|
||||
}
|
||||
|
||||
return true;
|
||||
#else
|
||||
// if the path already exists, check whether it's a directory
|
||||
struct stat info;
|
||||
if (stat(path.c_str(), &info) == 0) {
|
||||
return S_ISDIR(info.st_mode);
|
||||
}
|
||||
|
||||
size_t pos_slash = 1; // skip leading slashes for directory creation
|
||||
|
||||
// process path from front to back, procedurally creating directories
|
||||
while ((pos_slash = path.find('/', pos_slash)) != std::string::npos) {
|
||||
const std::string subpath = path.substr(0, pos_slash);
|
||||
struct stat info;
|
||||
|
||||
// if the path already exists, ensure that it's a directory
|
||||
if (stat(subpath.c_str(), &info) == 0) {
|
||||
if (!S_ISDIR(info.st_mode)) {
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
// create parent directories
|
||||
const int ret = mkdir(subpath.c_str(), 0755);
|
||||
if (ret != 0) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
pos_slash += 1;
|
||||
}
|
||||
|
||||
return true;
|
||||
#endif // _WIN32
|
||||
}
|
||||
|
||||
// NOTE: this is copied from common.cpp to avoid linking with libcommon
|
||||
static std::string fs_get_cache_directory() {
|
||||
std::string cache_directory = "";
|
||||
auto ensure_trailing_slash = [](std::string p) {
|
||||
// Make sure to add trailing slash
|
||||
if (p.back() != DIRECTORY_SEPARATOR) {
|
||||
p += DIRECTORY_SEPARATOR;
|
||||
}
|
||||
return p;
|
||||
};
|
||||
if (getenv("LLAMA_CACHE")) {
|
||||
cache_directory = std::getenv("LLAMA_CACHE");
|
||||
} else {
|
||||
#ifdef __linux__
|
||||
if (std::getenv("XDG_CACHE_HOME")) {
|
||||
cache_directory = std::getenv("XDG_CACHE_HOME");
|
||||
} else {
|
||||
cache_directory = std::getenv("HOME") + std::string("/.cache/");
|
||||
}
|
||||
#elif defined(__APPLE__)
|
||||
cache_directory = std::getenv("HOME") + std::string("/Library/Caches/");
|
||||
#elif defined(_WIN32)
|
||||
cache_directory = std::getenv("LOCALAPPDATA");
|
||||
#endif // __linux__
|
||||
cache_directory = ensure_trailing_slash(cache_directory);
|
||||
cache_directory += "llama.cpp";
|
||||
}
|
||||
return ensure_trailing_slash(cache_directory);
|
||||
}
|
||||
|
||||
struct rpc_server_params {
|
||||
std::string host = "127.0.0.1";
|
||||
int port = 50052;
|
||||
size_t backend_mem = 0;
|
||||
bool use_cache = false;
|
||||
};
|
||||
|
||||
static void print_usage(int /*argc*/, char ** argv, rpc_server_params params) {
|
||||
fprintf(stderr, "Usage: %s [options]\n\n", argv[0]);
|
||||
fprintf(stderr, "options:\n");
|
||||
fprintf(stderr, " -h, --help show this help message and exit\n");
|
||||
fprintf(stderr, " -H HOST, --host HOST host to bind to (default: %s)\n", params.host.c_str());
|
||||
fprintf(stderr, " -p PORT, --port PORT port to bind to (default: %d)\n", params.port);
|
||||
fprintf(stderr, " -m MEM, --mem MEM backend memory size (in MB)\n");
|
||||
fprintf(stderr, " -c, --cache enable local file cache\n");
|
||||
fprintf(stderr, " -h, --help show this help message and exit\n");
|
||||
fprintf(stderr, " -H HOST, --host HOST host to bind to (default: %s)\n", params.host.c_str());
|
||||
fprintf(stderr, " -p PORT, --port PORT port to bind to (default: %d)\n", params.port);
|
||||
fprintf(stderr, " -m MEM, --mem MEM backend memory size (in MB)\n");
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
||||
@@ -178,8 +58,6 @@ static bool rpc_server_params_parse(int argc, char ** argv, rpc_server_params &
|
||||
if (params.port <= 0 || params.port > 65535) {
|
||||
return false;
|
||||
}
|
||||
} else if (arg == "-c" || arg == "--cache") {
|
||||
params.use_cache = true;
|
||||
} else if (arg == "-m" || arg == "--mem") {
|
||||
if (++i >= argc) {
|
||||
return false;
|
||||
@@ -286,20 +164,8 @@ int main(int argc, char * argv[]) {
|
||||
} else {
|
||||
get_backend_memory(&free_mem, &total_mem);
|
||||
}
|
||||
const char * cache_dir = nullptr;
|
||||
std::string cache_dir_str = fs_get_cache_directory() + "rpc/";
|
||||
if (params.use_cache) {
|
||||
if (!fs_create_directory_with_parents(cache_dir_str)) {
|
||||
fprintf(stderr, "Failed to create cache directory: %s\n", cache_dir_str.c_str());
|
||||
return 1;
|
||||
}
|
||||
cache_dir = cache_dir_str.c_str();
|
||||
}
|
||||
printf("Starting RPC server\n");
|
||||
printf(" endpoint : %s\n", endpoint.c_str());
|
||||
printf(" local cache : %s\n", cache_dir ? cache_dir : "n/a");
|
||||
printf(" backend memory : %zu MB\n", free_mem / (1024 * 1024));
|
||||
ggml_backend_rpc_start_server(backend, endpoint.c_str(), cache_dir, free_mem, total_mem);
|
||||
printf("Starting RPC server on %s, backend memory: %zu MB\n", endpoint.c_str(), free_mem / (1024 * 1024));
|
||||
ggml_backend_rpc_start_server(backend, endpoint.c_str(), free_mem, total_mem);
|
||||
ggml_backend_free(backend);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -38,6 +38,24 @@
|
||||
}
|
||||
#endif
|
||||
|
||||
GGML_ATTRIBUTE_FORMAT(1, 2)
|
||||
static std::string fmt(const char * fmt, ...) {
|
||||
va_list ap;
|
||||
va_list ap2;
|
||||
va_start(ap, fmt);
|
||||
va_copy(ap2, ap);
|
||||
const int size = vsnprintf(NULL, 0, fmt, ap);
|
||||
GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
|
||||
std::string buf;
|
||||
buf.resize(size);
|
||||
const int size2 = vsnprintf(const_cast<char *>(buf.data()), buf.size() + 1, fmt, ap2);
|
||||
GGML_ASSERT(size2 == size);
|
||||
va_end(ap2);
|
||||
va_end(ap);
|
||||
|
||||
return buf;
|
||||
}
|
||||
|
||||
GGML_ATTRIBUTE_FORMAT(1, 2)
|
||||
static int printe(const char * fmt, ...) {
|
||||
va_list args;
|
||||
@@ -507,11 +525,11 @@ class HttpClient {
|
||||
int secs = static_cast<int>(seconds) % 60;
|
||||
|
||||
if (hrs > 0) {
|
||||
return string_format("%dh %02dm %02ds", hrs, mins, secs);
|
||||
return fmt("%dh %02dm %02ds", hrs, mins, secs);
|
||||
} else if (mins > 0) {
|
||||
return string_format("%dm %02ds", mins, secs);
|
||||
return fmt("%dm %02ds", mins, secs);
|
||||
} else {
|
||||
return string_format("%ds", secs);
|
||||
return fmt("%ds", secs);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -526,7 +544,7 @@ class HttpClient {
|
||||
}
|
||||
}
|
||||
|
||||
return string_format("%.2f %s", dbl_size, suffix[i]);
|
||||
return fmt("%.2f %s", dbl_size, suffix[i]);
|
||||
}
|
||||
|
||||
static int update_progress(void * ptr, curl_off_t total_to_download, curl_off_t now_downloaded, curl_off_t,
|
||||
@@ -560,9 +578,7 @@ class HttpClient {
|
||||
return (now_downloaded_plus_file_size * 100) / total_to_download;
|
||||
}
|
||||
|
||||
static std::string generate_progress_prefix(curl_off_t percentage) {
|
||||
return string_format("%3ld%% |", static_cast<long int>(percentage));
|
||||
}
|
||||
static std::string generate_progress_prefix(curl_off_t percentage) { return fmt("%3ld%% |", static_cast<long int>(percentage)); }
|
||||
|
||||
static double calculate_speed(curl_off_t now_downloaded, const std::chrono::steady_clock::time_point & start_time) {
|
||||
const auto now = std::chrono::steady_clock::now();
|
||||
@@ -573,9 +589,9 @@ class HttpClient {
|
||||
static std::string generate_progress_suffix(curl_off_t now_downloaded_plus_file_size, curl_off_t total_to_download,
|
||||
double speed, double estimated_time) {
|
||||
const int width = 10;
|
||||
return string_format("%*s/%*s%*s/s%*s", width, human_readable_size(now_downloaded_plus_file_size).c_str(),
|
||||
width, human_readable_size(total_to_download).c_str(), width,
|
||||
human_readable_size(speed).c_str(), width, human_readable_time(estimated_time).c_str());
|
||||
return fmt("%*s/%*s%*s/s%*s", width, human_readable_size(now_downloaded_plus_file_size).c_str(), width,
|
||||
human_readable_size(total_to_download).c_str(), width, human_readable_size(speed).c_str(), width,
|
||||
human_readable_time(estimated_time).c_str());
|
||||
}
|
||||
|
||||
static int calculate_progress_bar_width(const std::string & progress_prefix, const std::string & progress_suffix) {
|
||||
@@ -624,6 +640,7 @@ class LlamaData {
|
||||
std::vector<llama_chat_message> messages; // TODO: switch to common_chat_msg
|
||||
std::list<std::string> msg_strs;
|
||||
std::vector<char> fmtted;
|
||||
llama_pos n_past = 0;
|
||||
|
||||
int init(Opt & opt) {
|
||||
model = initialize_model(opt);
|
||||
@@ -934,10 +951,10 @@ static int tokenize_prompt(const llama_vocab * vocab, const std::string & prompt
|
||||
}
|
||||
|
||||
// Check if we have enough space in the context to evaluate this batch
|
||||
static int check_context_size(const llama_context_ptr & ctx, const llama_batch & batch) {
|
||||
static int check_context_size(const llama_context_ptr & ctx, const llama_batch_ext_ptr & batch) {
|
||||
const int n_ctx = llama_n_ctx(ctx.get());
|
||||
const int n_ctx_used = llama_kv_self_used_cells(ctx.get());
|
||||
if (n_ctx_used + batch.n_tokens > n_ctx) {
|
||||
if (n_ctx_used + llama_batch_ext_get_n_tokens(batch.get()) > n_ctx) {
|
||||
printf(LOG_COL_DEFAULT "\n");
|
||||
printe("context size exceeded\n");
|
||||
return 1;
|
||||
@@ -975,15 +992,17 @@ static int generate(LlamaData & llama_data, const std::string & prompt, std::str
|
||||
}
|
||||
|
||||
// prepare a batch for the prompt
|
||||
llama_batch batch = llama_batch_get_one(tokens.data(), tokens.size());
|
||||
auto batch = llama_batch_ext_ptr::init_from_text(tokens.data(), tokens.size(), llama_data.n_past, 0, true);
|
||||
llama_token new_token_id;
|
||||
while (true) {
|
||||
check_context_size(llama_data.context, batch);
|
||||
if (llama_decode(llama_data.context.get(), batch)) {
|
||||
if (llama_decode_ext(llama_data.context.get(), batch.get())) {
|
||||
printe("failed to decode\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
llama_data.n_past += llama_batch_ext_get_n_tokens(batch.get());
|
||||
|
||||
// sample the next token, check is it an end of generation?
|
||||
new_token_id = llama_sampler_sample(llama_data.sampler.get(), llama_data.context.get(), -1);
|
||||
if (llama_vocab_is_eog(vocab, new_token_id)) {
|
||||
@@ -998,7 +1017,7 @@ static int generate(LlamaData & llama_data, const std::string & prompt, std::str
|
||||
print_word_and_concatenate_to_response(piece, response);
|
||||
|
||||
// prepare the next batch with the sampled token
|
||||
batch = llama_batch_get_one(&new_token_id, 1);
|
||||
batch.reset(llama_batch_ext_init_from_text(&new_token_id, 1, llama_data.n_past, 0, true));
|
||||
}
|
||||
|
||||
printf(LOG_COL_DEFAULT);
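
Because the extended batch no longer exposes its positions directly, the run example keeps an explicit `n_past` counter and advances it by `llama_batch_ext_get_n_tokens` after every decode. A hedged sketch of the resulting loop follows, with sampler setup and detokenization omitted; the `generate_sketch` name and signature are illustrative, not taken from the source.

```cpp
#include "llama.h"
#include "llama-cpp.h"

#include <vector>

// Hedged sketch: prompt evaluation plus generation loop with explicit n_past bookkeeping.
static int generate_sketch(llama_context * ctx, llama_sampler * smpl, const llama_vocab * vocab,
                           std::vector<llama_token> & tokens, int max_new_tokens) {
    llama_pos n_past = 0;

    // prompt batch: positions start at n_past, sequence 0, logits for the last token
    auto batch = llama_batch_ext_ptr::init_from_text(tokens.data(), tokens.size(), n_past, 0, true);

    for (int n_gen = 0; n_gen < max_new_tokens; ++n_gen) {
        if (llama_decode_ext(ctx, batch.get())) {
            return 1; // decode failed
        }
        n_past += llama_batch_ext_get_n_tokens(batch.get());

        const llama_token new_token_id = llama_sampler_sample(smpl, ctx, -1);
        if (llama_vocab_is_eog(vocab, new_token_id)) {
            break;
        }

        // next batch holds just the sampled token at the updated position
        batch.reset(llama_batch_ext_init_from_text(&new_token_id, 1, n_past, 0, true));
    }
    return 0;
}
```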
|
||||
|
||||
@@ -48,15 +48,11 @@ int main(int argc, char ** argv) {
|
||||
auto tokens = common_tokenize(ctx, params.prompt, true);
|
||||
|
||||
// prepare the batch
|
||||
llama_batch batch = llama_batch_init(tokens.size(), 0, 1);
|
||||
for (size_t i = 0; i < tokens.size(); i++) {
|
||||
common_batch_add(batch, tokens[i], i, {0}, false);
|
||||
}
|
||||
batch.logits[batch.n_tokens - 1] = true; // generate next token
|
||||
llama_batch_ext * batch = llama_batch_ext_init_from_text(tokens.data(), tokens.size(), 0, 0, true);
|
||||
|
||||
// evaluate prompt
|
||||
llama_decode(ctx, batch);
|
||||
n_past += batch.n_tokens;
|
||||
llama_decode_ext(ctx, batch);
|
||||
n_past += llama_batch_ext_get_n_tokens(batch);
|
||||
|
||||
// save state (rng, logits, embedding and kv_cache) to file
|
||||
{
|
||||
@@ -83,12 +79,13 @@ int main(int argc, char ** argv) {
|
||||
printf("%s", next_token_str.c_str());
|
||||
result0 += next_token_str;
|
||||
|
||||
common_batch_clear(batch);
|
||||
common_batch_add(batch, next_token, n_past, {0}, true);
|
||||
llama_batch_ext_clear(batch);
|
||||
llama_seq_id seq_id = 0;
|
||||
llama_batch_ext_add_text(batch, next_token, 0, &seq_id, 1, true);
|
||||
|
||||
if (llama_decode(ctx, batch)) {
|
||||
if (llama_decode_ext(ctx, batch)) {
|
||||
fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
|
||||
llama_batch_free(batch);
|
||||
llama_batch_ext_free(batch);
|
||||
return 1;
|
||||
}
|
||||
n_past += 1;
|
||||
@@ -135,12 +132,13 @@ int main(int argc, char ** argv) {
|
||||
printf("%s", next_token_str.c_str());
|
||||
result1 += next_token_str;
|
||||
|
||||
common_batch_clear(batch);
|
||||
common_batch_add(batch, next_token, n_past, {0}, true);
|
||||
llama_batch_ext_clear(batch);
|
||||
llama_seq_id seq_id = 0;
|
||||
llama_batch_ext_add_text(batch, next_token, 0, &seq_id, 1, true);
|
||||
|
||||
if (llama_decode(ctx2, batch)) {
|
||||
if (llama_decode_ext(ctx2, batch)) {
|
||||
fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
|
||||
llama_batch_free(batch);
|
||||
llama_batch_ext_free(batch);
|
||||
return 1;
|
||||
}
|
||||
n_past += 1;
|
||||
@@ -216,12 +214,13 @@ int main(int argc, char ** argv) {
|
||||
printf("%s", next_token_str.c_str());
|
||||
result2 += next_token_str;
|
||||
|
||||
common_batch_clear(batch);
|
||||
common_batch_add(batch, next_token, n_past, {1}, true);
|
||||
llama_batch_ext_clear(batch);
|
||||
llama_seq_id seq_id = 1; // seq 1 instead of 0
|
||||
llama_batch_ext_add_text(batch, next_token, 0, &seq_id, 1, true);
|
||||
|
||||
if (llama_decode(ctx3, batch)) {
|
||||
if (llama_decode_ext(ctx3, batch)) {
|
||||
fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
|
||||
llama_batch_free(batch);
|
||||
llama_batch_ext_free(batch);
|
||||
return 1;
|
||||
}
|
||||
n_past += 1;
|
||||
@@ -233,7 +232,7 @@ int main(int argc, char ** argv) {
|
||||
llama_sampler_free(smpl2);
|
||||
llama_sampler_free(smpl3);
|
||||
|
||||
llama_batch_free(batch);
|
||||
llama_batch_ext_free(batch);
|
||||
|
||||
if (result0 != result2) {
|
||||
fprintf(stderr, "\n%s : error : the seq restore generation is different\n", __func__);
File diff suppressed because it is too large
Binary file not shown.
@@ -133,8 +133,7 @@ struct slot_params {
|
||||
|
||||
auto grammar_triggers = json::array();
|
||||
for (const auto & trigger : sampling.grammar_triggers) {
|
||||
server_grammar_trigger ct(std::move(trigger));
|
||||
grammar_triggers.push_back(ct.to_json());
|
||||
grammar_triggers.push_back(trigger.to_json<json>());
|
||||
}
|
||||
|
||||
return json {
|
||||
@@ -373,9 +372,9 @@ struct server_task {
|
||||
const auto grammar_triggers = data.find("grammar_triggers");
|
||||
if (grammar_triggers != data.end()) {
|
||||
for (const auto & t : *grammar_triggers) {
|
||||
server_grammar_trigger ct(t);
|
||||
if (ct.value.type == COMMON_GRAMMAR_TRIGGER_TYPE_WORD) {
|
||||
const auto & word = ct.value.value;
|
||||
auto ct = common_grammar_trigger::from_json(t);
|
||||
if (ct.type == COMMON_GRAMMAR_TRIGGER_TYPE_WORD) {
|
||||
const auto & word = ct.value;
|
||||
auto ids = common_tokenize(vocab, word, /* add_special= */ false, /* parse_special= */ true);
|
||||
if (ids.size() == 1) {
|
||||
auto token = ids[0];
|
||||
@@ -393,7 +392,7 @@ struct server_task {
|
||||
params.sampling.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, word});
|
||||
}
|
||||
} else {
|
||||
params.sampling.grammar_triggers.push_back(std::move(ct.value));
|
||||
params.sampling.grammar_triggers.push_back(ct);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -490,12 +489,8 @@ struct result_timings {
|
||||
double predicted_per_token_ms;
|
||||
double predicted_per_second;
|
||||
|
||||
// Optional speculative metrics - only included when > 0
|
||||
int32_t draft_n = 0;
|
||||
int32_t draft_n_accepted = 0;
|
||||
|
||||
json to_json() const {
|
||||
json base = {
|
||||
return {
|
||||
{"prompt_n", prompt_n},
|
||||
{"prompt_ms", prompt_ms},
|
||||
{"prompt_per_token_ms", prompt_per_token_ms},
|
||||
@@ -506,13 +501,6 @@ struct result_timings {
|
||||
{"predicted_per_token_ms", predicted_per_token_ms},
|
||||
{"predicted_per_second", predicted_per_second},
|
||||
};
|
||||
|
||||
if (draft_n > 0) {
|
||||
base["draft_n"] = draft_n;
|
||||
base["draft_n_accepted"] = draft_n_accepted;
|
||||
}
|
||||
|
||||
return base;
|
||||
}
|
||||
};
|
||||
|
||||
@@ -842,11 +830,6 @@ struct server_task_result_cmpl_final : server_task_result {
|
||||
ret.push_back({"timings", timings.to_json()});
|
||||
}
|
||||
|
||||
// extra fields for debugging purposes
|
||||
if (verbose) {
|
||||
ret["__verbose"] = to_json_non_oaicompat();
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
};
|
||||
@@ -1241,7 +1224,7 @@ struct server_slot {
|
||||
// only used for completion/embedding/infill/rerank
|
||||
server_task_type task_type = SERVER_TASK_TYPE_COMPLETION;
|
||||
|
||||
llama_batch batch_spec = {};
|
||||
common_batch batch_spec;
|
||||
|
||||
llama_context * ctx = nullptr;
|
||||
llama_context * ctx_dft = nullptr;
|
||||
@@ -1311,10 +1294,6 @@ struct server_slot {
|
||||
|
||||
std::function<void(int)> callback_on_release;
|
||||
|
||||
// Speculative decoding stats
|
||||
int32_t n_draft_total = 0; // Total draft tokens generated
|
||||
int32_t n_draft_accepted = 0; // Draft tokens actually accepted
|
||||
|
||||
void reset() {
|
||||
SLT_DBG(*this, "%s", "\n");
|
||||
|
||||
@@ -1331,10 +1310,6 @@ struct server_slot {
|
||||
|
||||
generated_tokens.clear();
|
||||
generated_token_probs.clear();
|
||||
|
||||
// clear speculative decoding stats
|
||||
n_draft_total = 0;
|
||||
n_draft_accepted = 0;
|
||||
}
|
||||
|
||||
bool is_non_causal() const {
|
||||
@@ -1401,12 +1376,6 @@ struct server_slot {
|
||||
timings.predicted_per_token_ms = t_token_generation / n_decoded;
|
||||
timings.predicted_per_second = 1e3 / t_token_generation * n_decoded;
|
||||
|
||||
// Add speculative metrics
|
||||
if (n_draft_total > 0) {
|
||||
timings.draft_n = n_draft_total;
|
||||
timings.draft_n_accepted = n_draft_accepted;
|
||||
}
|
||||
|
||||
return timings;
|
||||
}
|
||||
|
||||
@@ -1454,15 +1423,6 @@ struct server_slot {
|
||||
t_prompt_processing, n_prompt_tokens_processed, t_prompt, n_prompt_second,
|
||||
t_token_generation, n_decoded, t_gen, n_gen_second,
|
||||
t_prompt_processing + t_token_generation, n_prompt_tokens_processed + n_decoded);
|
||||
|
||||
if (n_draft_total > 0) {
|
||||
const float draft_ratio = (float) n_draft_accepted / n_draft_total;
|
||||
SLT_INF(*this,
|
||||
"\n"
|
||||
"draft acceptance rate = %0.5f (%5d accepted / %5d generated)\n",
|
||||
draft_ratio, n_draft_accepted, n_draft_total
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
json to_json() const {
|
||||
@@ -1836,7 +1796,7 @@ struct server_context {
|
||||
|
||||
llama_context_params cparams_dft;
|
||||
|
||||
llama_batch batch = {};
|
||||
common_batch batch;
|
||||
|
||||
bool clean_kv_cache = true;
|
||||
bool add_bos_token = true;
|
||||
@@ -1869,15 +1829,11 @@ struct server_context {
|
||||
|
||||
common_speculative_free(slot.spec);
|
||||
slot.spec = nullptr;
|
||||
|
||||
llama_batch_free(slot.batch_spec);
|
||||
}
|
||||
|
||||
llama_batch_free(batch);
|
||||
}
|
||||
|
||||
bool load_model(const common_params & params) {
|
||||
SRV_INF("loading model '%s'\n", params.model.path.c_str());
|
||||
SRV_INF("loading model '%s'\n", params.model.c_str());
|
||||
|
||||
params_base = params;
|
||||
|
||||
@@ -1887,7 +1843,7 @@ struct server_context {
|
||||
ctx = llama_init.context.get();
|
||||
|
||||
if (model == nullptr) {
|
||||
SRV_ERR("failed to load model, '%s'\n", params_base.model.path.c_str());
|
||||
SRV_ERR("failed to load model, '%s'\n", params_base.model.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -1898,13 +1854,16 @@ struct server_context {
|
||||
add_bos_token = llama_vocab_get_add_bos(vocab);
|
||||
has_eos_token = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
|
||||
|
||||
if (!params_base.speculative.model.path.empty() || !params_base.speculative.model.hf_repo.empty()) {
|
||||
SRV_INF("loading draft model '%s'\n", params_base.speculative.model.path.c_str());
|
||||
if (!params_base.speculative.model.empty() || !params_base.speculative.hf_repo.empty()) {
|
||||
SRV_INF("loading draft model '%s'\n", params_base.speculative.model.c_str());
|
||||
|
||||
auto params_dft = params_base;
|
||||
|
||||
params_dft.devices = params_base.speculative.devices;
|
||||
params_dft.hf_file = params_base.speculative.hf_file;
|
||||
params_dft.hf_repo = params_base.speculative.hf_repo;
|
||||
params_dft.model = params_base.speculative.model;
|
||||
params_dft.model_url = params_base.speculative.model_url;
|
||||
params_dft.n_ctx = params_base.speculative.n_ctx == 0 ? params_base.n_ctx / params_base.n_parallel : params_base.speculative.n_ctx;
|
||||
params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers;
|
||||
params_dft.n_parallel = 1;
|
||||
@@ -1918,12 +1877,12 @@ struct server_context {
|
||||
model_dft = llama_init_dft.model.get();
|
||||
|
||||
if (model_dft == nullptr) {
|
||||
SRV_ERR("failed to load draft model, '%s'\n", params_base.speculative.model.path.c_str());
|
||||
SRV_ERR("failed to load draft model, '%s'\n", params_base.speculative.model.c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!common_speculative_are_compatible(ctx, llama_init_dft.context.get())) {
|
||||
SRV_ERR("the draft model '%s' is not compatible with the target model '%s'\n", params_base.speculative.model.path.c_str(), params_base.model.path.c_str());
|
||||
SRV_ERR("the draft model '%s' is not compatible with the target model '%s'\n", params_base.speculative.model.c_str(), params_base.model.c_str());
|
||||
|
||||
return false;
|
||||
}
|
||||
@@ -1963,7 +1922,7 @@ struct server_context {
|
||||
slot.n_predict = params_base.n_predict;
|
||||
|
||||
if (model_dft) {
|
||||
slot.batch_spec = llama_batch_init(params_base.speculative.n_max + 1, 0, 1);
|
||||
slot.batch_spec = common_batch(params_base.speculative.n_max + 1, 1);
|
||||
|
||||
slot.ctx_dft = llama_init_from_model(model_dft, cparams_dft);
|
||||
if (slot.ctx_dft == nullptr) {
|
||||
@@ -1988,7 +1947,7 @@ struct server_context {
|
||||
|
||||
slot.reset();
|
||||
|
||||
slots.push_back(slot);
|
||||
slots.push_back(std::move(slot));
|
||||
}
|
||||
|
||||
default_generation_settings_for_props = slots[0].to_json();
|
||||
@@ -1999,7 +1958,7 @@ struct server_context {
|
||||
const int32_t n_batch = llama_n_batch(ctx);
|
||||
|
||||
// only a single seq_id per token is needed
|
||||
batch = llama_batch_init(std::max(n_batch, params_base.n_parallel), 0, 1);
|
||||
batch = common_batch(std::max(n_batch, params_base.n_parallel), 1);
|
||||
}
|
||||
|
||||
metrics.init();
|
||||
@@ -2134,9 +2093,7 @@ struct server_context {
|
||||
}
|
||||
|
||||
if (slot.ctx_dft) {
|
||||
llama_batch_free(slot.batch_spec);
|
||||
|
||||
slot.batch_spec = llama_batch_init(slot.params.speculative.n_max + 1, 0, 1);
|
||||
slot.batch_spec = common_batch(slot.params.speculative.n_max + 1, 1);
|
||||
}
|
||||
|
||||
slot.state = SLOT_STATE_STARTED;
|
||||
@@ -2444,7 +2401,7 @@ struct server_context {
|
||||
queue_results.send(std::move(res));
|
||||
}
|
||||
|
||||
void send_embedding(const server_slot & slot, const llama_batch & batch) {
|
||||
void send_embedding(const server_slot & slot, common_batch & batch) {
|
||||
auto res = std::make_unique<server_task_result_embd>();
|
||||
res->id = slot.id_task;
|
||||
res->index = slot.index;
|
||||
@@ -2455,18 +2412,19 @@ struct server_context {
|
||||
|
||||
std::vector<float> embd_res(n_embd, 0.0f);
|
||||
|
||||
for (int i = 0; i < batch.n_tokens; ++i) {
|
||||
if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) {
|
||||
for (int i = 0; i < batch.get_n_tokens(); ++i) {
|
||||
auto tok = batch.tokens[i];
|
||||
if (!tok.logits || tok.seq_id != slot.id) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
|
||||
const float * embd = llama_get_embeddings_seq(ctx, tok.seq_id);
|
||||
if (embd == NULL) {
|
||||
embd = llama_get_embeddings_ith(ctx, i);
|
||||
}
|
||||
|
||||
if (embd == NULL) {
|
||||
SLT_ERR(slot, "failed to get embeddings, token = %d, seq_id = %d\n", batch.token[i], batch.seq_id[i][0]);
|
||||
SLT_ERR(slot, "failed to get embeddings, token = %d, seq_id = %d\n", tok.token, tok.seq_id);
|
||||
|
||||
res->embedding.push_back(std::vector<float>(n_embd, 0.0f));
|
||||
continue;
|
||||
@@ -2487,24 +2445,25 @@ struct server_context {
|
||||
queue_results.send(std::move(res));
|
||||
}
|
||||
|
||||
void send_rerank(const server_slot & slot, const llama_batch & batch) {
|
||||
void send_rerank(const server_slot & slot, common_batch & batch) {
|
||||
auto res = std::make_unique<server_task_result_rerank>();
|
||||
res->id = slot.id_task;
|
||||
res->index = slot.index;
|
||||
res->n_tokens = slot.n_prompt_tokens;
|
||||
|
||||
for (int i = 0; i < batch.n_tokens; ++i) {
|
||||
if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) {
|
||||
for (int i = 0; i < batch.get_n_tokens(); ++i) {
|
||||
auto tok = batch.tokens[i];
|
||||
if (!tok.logits || tok.seq_id != slot.id) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
|
||||
const float * embd = llama_get_embeddings_seq(ctx, tok.seq_id);
|
||||
if (embd == NULL) {
|
||||
embd = llama_get_embeddings_ith(ctx, i);
|
||||
}
|
||||
|
||||
if (embd == NULL) {
|
||||
SLT_ERR(slot, "failed to get embeddings, token = %d, seq_id = %d\n", batch.token[i], batch.seq_id[i][0]);
|
||||
SLT_ERR(slot, "failed to get embeddings, token = %d, seq_id = %d\n", tok.token, tok.seq_id);
|
||||
|
||||
res->score = -1e6;
|
||||
continue;
|
||||
@@ -2895,7 +2854,7 @@ struct server_context {
|
||||
}
|
||||
|
||||
// start populating the batch for this iteration
|
||||
common_batch_clear(batch);
|
||||
batch.clear();
|
||||
|
||||
// track if given slot can be batched with slots already in the batch
|
||||
server_slot * slot_batched = nullptr;
|
||||
@@ -2917,9 +2876,9 @@ struct server_context {
|
||||
continue;
|
||||
}
|
||||
|
||||
slot.i_batch = batch.n_tokens;
|
||||
slot.i_batch = batch.get_n_tokens();
|
||||
|
||||
common_batch_add(batch, slot.sampled, slot.n_past, { slot.id }, true);
|
||||
batch.add_text(slot.sampled, slot.n_past, slot.id, true);
|
||||
|
||||
slot.n_past += 1;
|
||||
|
||||
@@ -2936,7 +2895,7 @@ struct server_context {
|
||||
int32_t n_ubatch = llama_n_ubatch(ctx);
|
||||
|
||||
// next, batch any pending prompts without exceeding n_batch
|
||||
if (params_base.cont_batching || batch.n_tokens == 0) {
|
||||
if (params_base.cont_batching || batch.get_n_tokens() == 0) {
|
||||
for (auto & slot : slots) {
|
||||
// check if we can batch this slot with the previous one
|
||||
if (slot.is_processing()) {
|
||||
@@ -3102,7 +3061,7 @@ struct server_context {
|
||||
// non-causal tasks require to fit the entire prompt in the physical batch
|
||||
if (slot.is_non_causal()) {
|
||||
// cannot fit the prompt in the current batch - will try next iter
|
||||
if (batch.n_tokens + slot.n_prompt_tokens > n_batch) {
|
||||
if (batch.get_n_tokens() + slot.n_prompt_tokens > n_batch) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
@@ -3122,11 +3081,11 @@ struct server_context {
|
||||
slot.cache_tokens.resize(slot.n_past);
|
||||
|
||||
// add prompt tokens for processing in the current batch
|
||||
while (slot.n_past < slot.n_prompt_tokens && batch.n_tokens < n_batch) {
|
||||
while (slot.n_past < slot.n_prompt_tokens && batch.get_n_tokens() < n_batch) {
|
||||
// without pooling, we want to output the embeddings for all the tokens in the batch
|
||||
const bool need_embd = slot.task_type == SERVER_TASK_TYPE_EMBEDDING && llama_pooling_type(slot.ctx) == LLAMA_POOLING_TYPE_NONE;
|
||||
|
||||
common_batch_add(batch, prompt_tokens[slot.n_past], slot.n_past, { slot.id }, need_embd);
|
||||
batch.add_text(prompt_tokens[slot.n_past], slot.n_past, slot.id, need_embd);
|
||||
|
||||
if (slot.params.cache_prompt) {
|
||||
slot.cache_tokens.push_back(prompt_tokens[slot.n_past]);
|
||||
@@ -3136,13 +3095,13 @@ struct server_context {
|
||||
slot.n_past++;
|
||||
}
|
||||
|
||||
SLT_INF(slot, "prompt processing progress, n_past = %d, n_tokens = %d, progress = %f\n", slot.n_past, batch.n_tokens, (float) slot.n_prompt_tokens_processed / slot.n_prompt_tokens);
|
||||
SLT_INF(slot, "prompt processing progress, n_past = %d, n_tokens = %d, progress = %f\n", slot.n_past, batch.get_n_tokens(), (float) slot.n_prompt_tokens_processed / slot.n_prompt_tokens);
|
||||
|
||||
// entire prompt has been processed
|
||||
if (slot.n_past == slot.n_prompt_tokens) {
|
||||
slot.state = SLOT_STATE_DONE_PROMPT;
|
||||
|
||||
GGML_ASSERT(batch.n_tokens > 0);
|
||||
GGML_ASSERT(batch.get_n_tokens() > 0);
|
||||
|
||||
common_sampler_reset(slot.smpl);
|
||||
|
||||
@@ -3152,27 +3111,27 @@ struct server_context {
|
||||
}
|
||||
|
||||
// extract the logits only for the last token
|
||||
batch.logits[batch.n_tokens - 1] = true;
|
||||
batch.set_logits_last();
|
||||
|
||||
slot.n_decoded = 0;
|
||||
slot.i_batch = batch.n_tokens - 1;
|
||||
slot.i_batch = batch.get_n_tokens() - 1;
|
||||
|
||||
SLT_INF(slot, "prompt done, n_past = %d, n_tokens = %d\n", slot.n_past, batch.n_tokens);
|
||||
SLT_INF(slot, "prompt done, n_past = %d, n_tokens = %d\n", slot.n_past, batch.get_n_tokens());
|
||||
}
|
||||
}
|
||||
|
||||
if (batch.n_tokens >= n_batch) {
|
||||
if (batch.get_n_tokens() >= n_batch) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (batch.n_tokens == 0) {
|
||||
if (batch.get_n_tokens() == 0) {
|
||||
SRV_WRN("%s", "no tokens to decode\n");
|
||||
return;
|
||||
}
|
||||
|
||||
SRV_DBG("decoding batch, n_tokens = %d\n", batch.n_tokens);
|
||||
SRV_DBG("decoding batch, n_tokens = %d\n", batch.get_n_tokens());
|
||||
|
||||
if (slot_batched) {
|
||||
// make sure we're in the right embedding mode
|
||||
@@ -3182,20 +3141,12 @@ struct server_context {
|
||||
}
|
||||
|
||||
// process the created batch of tokens
|
||||
for (int32_t i = 0; i < batch.n_tokens; i += n_batch) {
|
||||
const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i);
|
||||
for (int32_t i = 0; i < batch.get_n_tokens(); i += n_batch) {
|
||||
const int32_t n_tokens = std::min(n_batch, batch.get_n_tokens() - i);
|
||||
|
||||
llama_batch batch_view = {
|
||||
n_tokens,
|
||||
batch.token + i,
|
||||
nullptr,
|
||||
batch.pos + i,
|
||||
batch.n_seq_id + i,
|
||||
batch.seq_id + i,
|
||||
batch.logits + i,
|
||||
};
|
||||
common_batch batch_view = batch.get_view(i, n_tokens);
|
||||
|
||||
const int ret = llama_decode(ctx, batch_view);
|
||||
const int ret = llama_decode_ext(ctx, batch_view.get());
|
||||
metrics.on_decoded(slots);
|
||||
|
||||
if (ret != 0) {
|
||||
@@ -3322,9 +3273,6 @@ struct server_context {
|
||||
|
||||
llama_tokens draft = common_speculative_gen_draft(slot.spec, params_spec, slot.cache_tokens, id);
|
||||
|
||||
// keep track of total number of tokens generated in the draft
|
||||
slot.n_draft_total += draft.size();
|
||||
|
||||
// ignore small drafts
|
||||
if (slot.params.speculative.n_min > (int) draft.size()) {
|
||||
SLT_DBG(slot, "ignoring small draft: %d < %d\n", (int) draft.size(), slot.params.speculative.n_min);
|
||||
@@ -3333,16 +3281,16 @@ struct server_context {
|
||||
}
|
||||
|
||||
// construct the speculation batch
common_batch_clear(slot.batch_spec);
common_batch_add (slot.batch_spec, id, slot.n_past, { slot.id }, true);
slot.batch_spec.clear();
slot.batch_spec.add_text(id, slot.n_past, slot.id, true);

for (size_t i = 0; i < draft.size(); ++i) {
common_batch_add(slot.batch_spec, draft[i], slot.n_past + 1 + i, { slot.id }, true);
slot.batch_spec.add_text(draft[i], slot.n_past + 1 + i, slot.id, true);
}

SLT_DBG(slot, "decoding speculative batch, size = %d\n", slot.batch_spec.n_tokens);
SLT_DBG(slot, "decoding speculative batch, size = %d\n", slot.batch_spec.get_n_tokens());

llama_decode(ctx, slot.batch_spec);
llama_decode_ext(ctx, slot.batch_spec.get());

// the accepted tokens from the speculation
const auto ids = common_sampler_sample_and_accept_n(slot.smpl, ctx, draft);
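The server hunks use the same extended-batch API through the small common_batch wrapper added on this branch. Below is a hedged sketch, not part of the diff, of how a prompt could flow through that wrapper; only members visible in the hunks are used (clear, add_text, set_logits_last, get_n_tokens, get_view, get), and the function name and variables are illustrative assumptions.

// hedged sketch: prompt processing via the common_batch wrapper, mirroring the server changes
// ctx is assumed valid; prompt holds the tokenized prompt for sequence seq_id
// note: the server itself caps each batch at n_batch tokens per update; here the whole
// prompt is queued once and decoded in n_batch-sized views
static bool process_prompt_sketch(llama_context * ctx, const std::vector<llama_token> & prompt, llama_seq_id seq_id) {
    const int32_t n_batch = llama_n_batch(ctx);

    common_batch batch((int32_t) prompt.size(), /* n_seq_max */ 1);
    batch.clear();
    for (size_t i = 0; i < prompt.size(); ++i) {
        batch.add_text(prompt[i], (llama_pos) i, seq_id, /* output logits */ false);
    }
    batch.set_logits_last(); // only the last prompt token needs logits

    // decode in n_batch-sized views, as in the server decode loop above
    for (int32_t i = 0; i < batch.get_n_tokens(); i += n_batch) {
        const int32_t n_tokens = std::min(n_batch, batch.get_n_tokens() - i);
        common_batch batch_view = batch.get_view(i, n_tokens);
        if (llama_decode_ext(ctx, batch_view.get()) != 0) {
            return false; // decode failed
        }
    }
    return true;
}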
@@ -3350,9 +3298,6 @@ struct server_context {
|
||||
slot.n_past += ids.size();
|
||||
slot.n_decoded += ids.size();
|
||||
|
||||
// update how many tokens out of draft was accepted
|
||||
slot.n_draft_accepted += ids.size() - 1;
|
||||
|
||||
slot.cache_tokens.push_back(id);
|
||||
slot.cache_tokens.insert(slot.cache_tokens.end(), ids.begin(), ids.end() - 1);
|
||||
|
||||
@@ -3863,7 +3808,7 @@ int main(int argc, char ** argv) {
|
||||
json data = {
|
||||
{ "default_generation_settings", ctx_server.default_generation_settings_for_props },
|
||||
{ "total_slots", ctx_server.params_base.n_parallel },
|
||||
{ "model_path", ctx_server.params_base.model.path },
|
||||
{ "model_path", ctx_server.params_base.model },
|
||||
{ "chat_template", common_chat_templates_source(ctx_server.chat_templates.get()) },
|
||||
{ "bos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_bos(ctx_server.vocab), /* special= */ true)},
|
||||
{ "eos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_eos(ctx_server.vocab), /* special= */ true)},
|
||||
@@ -4129,7 +4074,7 @@ int main(int argc, char ** argv) {
|
||||
{"object", "list"},
|
||||
{"data", {
|
||||
{
|
||||
{"id", params.model_alias.empty() ? params.model.path : params.model_alias},
|
||||
{"id", params.model_alias.empty() ? params.model : params.model_alias},
|
||||
{"object", "model"},
|
||||
{"created", std::time(0)},
|
||||
{"owned_by", "llamacpp"},
|
||||
@@ -4497,24 +4442,15 @@ int main(int argc, char ** argv) {
|
||||
llama_backend_free();
|
||||
};
|
||||
|
||||
// bind HTTP listen port
|
||||
bool was_bound = false;
|
||||
if (string_ends_with(std::string(params.hostname), ".sock")) {
|
||||
LOG_INF("%s: setting address family to AF_UNIX\n", __func__);
|
||||
svr->set_address_family(AF_UNIX);
|
||||
// bind_to_port requires a second arg, any value other than 0 should
|
||||
// simply get ignored
|
||||
was_bound = svr->bind_to_port(params.hostname, 8080);
|
||||
} else {
|
||||
LOG_INF("%s: binding port with default address family\n", __func__);
|
||||
// bind HTTP listen port
|
||||
if (params.port == 0) {
|
||||
int bound_port = svr->bind_to_any_port(params.hostname);
|
||||
if ((was_bound = (bound_port >= 0))) {
|
||||
params.port = bound_port;
|
||||
}
|
||||
} else {
|
||||
was_bound = svr->bind_to_port(params.hostname, params.port);
|
||||
if (params.port == 0) {
|
||||
int bound_port = svr->bind_to_any_port(params.hostname);
|
||||
if ((was_bound = (bound_port >= 0))) {
|
||||
params.port = bound_port;
|
||||
}
|
||||
} else {
|
||||
was_bound = svr->bind_to_port(params.hostname, params.port);
|
||||
}
|
||||
|
||||
if (!was_bound) {
|
||||
|
||||
@@ -58,32 +58,6 @@ static T json_value(const json & body, const std::string & key, const T & defaul
|
||||
|
||||
const static std::string build_info("b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT);
|
||||
|
||||
// thin wrapper around common_grammar_trigger with (de)serialization functions
|
||||
struct server_grammar_trigger {
|
||||
common_grammar_trigger value;
|
||||
|
||||
server_grammar_trigger() = default;
|
||||
server_grammar_trigger(const common_grammar_trigger & value) : value(value) {}
|
||||
server_grammar_trigger(const json & in) {
|
||||
value.type = (common_grammar_trigger_type) in.at("type").get<int>();
|
||||
value.value = in.at("value").get<std::string>();
|
||||
if (value.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
|
||||
value.token = (llama_token) in.at("token").get<int>();
|
||||
}
|
||||
}
|
||||
|
||||
json to_json() const {
|
||||
json out {
|
||||
{"type", (int) value.type},
|
||||
{"value", value.value},
|
||||
};
|
||||
if (value.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) {
|
||||
out["token"] = (int) value.token;
|
||||
}
|
||||
return out;
|
||||
}
|
||||
};
|
||||
|
||||
//
|
||||
// tokenizer and input processing utils
|
||||
//
|
||||
@@ -653,8 +627,7 @@ static json oaicompat_completion_params_parse(
|
||||
llama_params["grammar_lazy"] = chat_params.grammar_lazy;
|
||||
auto grammar_triggers = json::array();
|
||||
for (const auto & trigger : chat_params.grammar_triggers) {
|
||||
server_grammar_trigger ct(trigger);
|
||||
grammar_triggers.push_back(ct.to_json());
|
||||
grammar_triggers.push_back(trigger.to_json<json>());
|
||||
}
|
||||
llama_params["grammar_triggers"] = grammar_triggers;
|
||||
llama_params["preserved_tokens"] = chat_params.preserved_tokens;
|
||||
1438 examples/server/webui/package-lock.json (generated)
File diff suppressed because it is too large
@@ -13,11 +13,9 @@
|
||||
"dependencies": {
|
||||
"@heroicons/react": "^2.2.0",
|
||||
"@sec-ant/readable-stream": "^0.6.0",
|
||||
"@tailwindcss/postcss": "^4.1.1",
|
||||
"@tailwindcss/vite": "^4.1.1",
|
||||
"@vscode/markdown-it-katex": "^1.1.1",
|
||||
"autoprefixer": "^10.4.20",
|
||||
"daisyui": "^5.0.12",
|
||||
"daisyui": "^4.12.14",
|
||||
"dexie": "^4.0.11",
|
||||
"highlight.js": "^11.10.0",
|
||||
"katex": "^0.16.15",
|
||||
@@ -31,7 +29,7 @@
|
||||
"remark-breaks": "^4.0.0",
|
||||
"remark-gfm": "^4.0.0",
|
||||
"remark-math": "^6.0.0",
|
||||
"tailwindcss": "^4.1.1",
|
||||
"tailwindcss": "^3.4.15",
|
||||
"textlinestream": "^1.1.1",
|
||||
"vite-plugin-singlefile": "^2.0.3"
|
||||
},
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
export default {
|
||||
plugins: {
|
||||
"@tailwindcss/postcss": {},
|
||||
tailwindcss: {},
|
||||
autoprefixer: {},
|
||||
},
|
||||
}
|
||||
|
||||
@@ -28,7 +28,7 @@ function AppLayout() {
|
||||
<>
|
||||
<Sidebar />
|
||||
<div
|
||||
className="drawer-content grow flex flex-col h-screen w-screen mx-auto px-4 overflow-auto bg-base-100"
|
||||
className="drawer-content grow flex flex-col h-screen w-screen mx-auto px-4 overflow-auto"
|
||||
id="main-scroll"
|
||||
>
|
||||
<Header />
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
import daisyuiThemes from 'daisyui/theme/object';
|
||||
import daisyuiThemes from 'daisyui/src/theming/themes';
|
||||
import { isNumeric } from './utils/misc';
|
||||
|
||||
export const isDev = import.meta.env.MODE === 'development';
|
||||
|
||||
@@ -99,9 +99,13 @@ export default function ChatScreen() {
|
||||
canvasData,
|
||||
replaceMessageAndGenerate,
|
||||
} = useAppContext();
|
||||
const textarea = useOptimizedTextarea(prefilledMsg.content());
|
||||
const [inputMsg, setInputMsg] = useState(prefilledMsg.content());
|
||||
const inputRef = useRef<HTMLTextAreaElement>(null);
|
||||
|
||||
const { extraContext, clearExtraContext } = useVSCodeContext(textarea);
|
||||
const { extraContext, clearExtraContext } = useVSCodeContext(
|
||||
inputRef,
|
||||
setInputMsg
|
||||
);
|
||||
// TODO: improve this when we have "upload file" feature
|
||||
const currExtra: Message['extra'] = extraContext ? [extraContext] : undefined;
|
||||
|
||||
@@ -131,10 +135,9 @@ export default function ChatScreen() {
|
||||
};
|
||||
|
||||
const sendNewMessage = async () => {
|
||||
const lastInpMsg = textarea.value();
|
||||
if (lastInpMsg.trim().length === 0 || isGenerating(currConvId ?? ''))
|
||||
return;
|
||||
textarea.setValue('');
|
||||
if (inputMsg.trim().length === 0 || isGenerating(currConvId ?? '')) return;
|
||||
const lastInpMsg = inputMsg;
|
||||
setInputMsg('');
|
||||
scrollToBottom(false);
|
||||
setCurrNodeId(-1);
|
||||
// get the last message node
|
||||
@@ -143,13 +146,13 @@ export default function ChatScreen() {
|
||||
!(await sendMessage(
|
||||
currConvId,
|
||||
lastMsgNodeId,
|
||||
lastInpMsg,
|
||||
inputMsg,
|
||||
currExtra,
|
||||
onChunk
|
||||
))
|
||||
) {
|
||||
// restore the input message if failed
|
||||
textarea.setValue(lastInpMsg);
|
||||
setInputMsg(lastInpMsg);
|
||||
}
|
||||
// OK
|
||||
clearExtraContext();
|
||||
@@ -192,13 +195,16 @@ export default function ChatScreen() {
|
||||
// send the prefilled message if needed
|
||||
sendNewMessage();
|
||||
} else {
|
||||
// otherwise, focus on the input
|
||||
textarea.focus();
|
||||
// otherwise, focus on the input and move the cursor to the end
|
||||
if (inputRef.current) {
|
||||
inputRef.current.focus();
|
||||
inputRef.current.selectionStart = inputRef.current.value.length;
|
||||
}
|
||||
}
|
||||
prefilledMsg.clear();
|
||||
// no need to keep track of sendNewMessage
|
||||
// eslint-disable-next-line react-hooks/exhaustive-deps
|
||||
}, [textarea.ref]);
|
||||
}, [inputRef]);
|
||||
|
||||
// due to some timing issues of StorageUtils.appendMsg(), we need to make sure the pendingMsg is not duplicated upon rendering (i.e. appears once in the saved conversation and once in the pendingMsg)
|
||||
const pendingMsgDisplay: MessageDisplay[] =
|
||||
@@ -252,7 +258,9 @@ export default function ChatScreen() {
|
||||
<textarea
|
||||
className="textarea textarea-bordered w-full"
|
||||
placeholder="Type a message (Shift+Enter to add a new line)"
|
||||
ref={textarea.ref}
|
||||
ref={inputRef}
|
||||
value={inputMsg}
|
||||
onChange={(e) => setInputMsg(e.target.value)}
|
||||
onKeyDown={(e) => {
|
||||
if (e.nativeEvent.isComposing || e.keyCode === 229) return;
|
||||
if (e.key === 'Enter' && e.shiftKey) return;
|
||||
@@ -272,7 +280,11 @@ export default function ChatScreen() {
|
||||
Stop
|
||||
</button>
|
||||
) : (
|
||||
<button className="btn btn-primary ml-2" onClick={sendNewMessage}>
|
||||
<button
|
||||
className="btn btn-primary ml-2"
|
||||
onClick={sendNewMessage}
|
||||
disabled={inputMsg.trim().length === 0}
|
||||
>
|
||||
Send
|
||||
</button>
|
||||
)}
|
||||
@@ -286,43 +298,3 @@ export default function ChatScreen() {
|
||||
</div>
|
||||
);
|
||||
}
|
||||
|
||||
export interface OptimizedTextareaValue {
|
||||
value: () => string;
|
||||
setValue: (value: string) => void;
|
||||
focus: () => void;
|
||||
ref: React.RefObject<HTMLTextAreaElement>;
|
||||
}
|
||||
|
||||
// This is a workaround to prevent the textarea from re-rendering when the inner content changes
|
||||
// See https://github.com/ggml-org/llama.cpp/pull/12299
|
||||
function useOptimizedTextarea(initValue: string): OptimizedTextareaValue {
|
||||
const [savedInitValue, setSavedInitValue] = useState<string>(initValue);
|
||||
const textareaRef = useRef<HTMLTextAreaElement>(null);
|
||||
|
||||
useEffect(() => {
|
||||
if (textareaRef.current && savedInitValue) {
|
||||
textareaRef.current.value = savedInitValue;
|
||||
setSavedInitValue('');
|
||||
}
|
||||
}, [textareaRef, savedInitValue, setSavedInitValue]);
|
||||
|
||||
return {
|
||||
value: () => {
|
||||
return textareaRef.current?.value ?? savedInitValue;
|
||||
},
|
||||
setValue: (value: string) => {
|
||||
if (textareaRef.current) {
|
||||
textareaRef.current.value = value;
|
||||
}
|
||||
},
|
||||
focus: () => {
|
||||
if (textareaRef.current) {
|
||||
// focus and move the cursor to the end
|
||||
textareaRef.current.focus();
|
||||
textareaRef.current.selectionStart = textareaRef.current.value.length;
|
||||
}
|
||||
},
|
||||
ref: textareaRef,
|
||||
};
|
||||
}
|
||||
|
||||
@@ -2,7 +2,7 @@ import { useEffect, useState } from 'react';
|
||||
import StorageUtils from '../utils/storage';
|
||||
import { useAppContext } from '../utils/app.context';
|
||||
import { classNames } from '../utils/misc';
|
||||
import daisyuiThemes from 'daisyui/theme/object';
|
||||
import daisyuiThemes from 'daisyui/src/theming/themes';
|
||||
import { THEMES } from '../Config';
|
||||
import { useNavigate } from 'react-router';
|
||||
|
||||
@@ -20,6 +20,7 @@ export default function Header() {
|
||||
document.body.setAttribute('data-theme', selectedTheme);
|
||||
document.body.setAttribute(
|
||||
'data-color-scheme',
|
||||
// @ts-expect-error daisyuiThemes complains about index type, but it should work
|
||||
daisyuiThemes[selectedTheme]?.['color-scheme'] ?? 'auto'
|
||||
);
|
||||
}, [selectedTheme]);
|
||||
|
||||
@@ -1,13 +1,8 @@
|
||||
@use 'sass:meta';
|
||||
@use 'tailwindcss';
|
||||
|
||||
@plugin 'daisyui' {
|
||||
themes: all;
|
||||
}
|
||||
|
||||
html {
|
||||
scrollbar-gutter: auto;
|
||||
}
|
||||
@tailwind base;
|
||||
@tailwind components;
|
||||
@tailwind utilities;
|
||||
|
||||
.markdown {
|
||||
h1,
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
import { useEffect, useState } from 'react';
|
||||
import { MessageExtraContext } from './types';
|
||||
import { OptimizedTextareaValue } from '../components/ChatScreen';
|
||||
|
||||
// Extra context when using llama.cpp WebUI from llama-vscode, inside an iframe
|
||||
// Ref: https://github.com/ggml-org/llama.cpp/pull/11940
|
||||
@@ -15,7 +14,10 @@ interface SetTextEvData {
|
||||
* window.postMessage({ command: 'setText', text: 'Spot the syntax error', context: 'def test()\n return 123' }, '*');
|
||||
*/
|
||||
|
||||
export const useVSCodeContext = (textarea: OptimizedTextareaValue) => {
|
||||
export const useVSCodeContext = (
|
||||
inputRef: React.RefObject<HTMLTextAreaElement>,
|
||||
setInputMsg: (text: string) => void
|
||||
) => {
|
||||
const [extraContext, setExtraContext] = useState<MessageExtraContext | null>(
|
||||
null
|
||||
);
|
||||
@@ -25,20 +27,20 @@ export const useVSCodeContext = (textarea: OptimizedTextareaValue) => {
|
||||
const handleMessage = (event: MessageEvent) => {
|
||||
if (event.data?.command === 'setText') {
|
||||
const data: SetTextEvData = event.data;
|
||||
textarea.setValue(data?.text);
|
||||
setInputMsg(data?.text);
|
||||
if (data?.context && data.context.length > 0) {
|
||||
setExtraContext({
|
||||
type: 'context',
|
||||
content: data.context,
|
||||
});
|
||||
}
|
||||
textarea.focus();
|
||||
inputRef.current?.focus();
|
||||
}
|
||||
};
|
||||
|
||||
window.addEventListener('message', handleMessage);
|
||||
return () => window.removeEventListener('message', handleMessage);
|
||||
}, [textarea]);
|
||||
}, [inputRef, setInputMsg]);
|
||||
|
||||
// Add a keydown listener that sends the "escapePressed" message to the parent window
|
||||
useEffect(() => {
|
||||
|
||||
@@ -108,19 +108,22 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
// prepare a batch for the prompt
|
||||
llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());
|
||||
llama_pos n_past = 0;
|
||||
llama_batch_ext * batch = llama_batch_ext_init_from_text(prompt_tokens.data(), prompt_tokens.size(), n_past, 0, true);
|
||||
n_past += llama_batch_ext_get_n_tokens(batch);
|
||||
|
||||
llama_token new_token_id;
|
||||
while (true) {
|
||||
// check if we have enough space in the context to evaluate this batch
|
||||
int n_ctx = llama_n_ctx(ctx);
|
||||
int n_ctx_used = llama_kv_self_used_cells(ctx);
|
||||
if (n_ctx_used + batch.n_tokens > n_ctx) {
|
||||
if (n_ctx_used + llama_batch_ext_get_n_tokens(batch) > n_ctx) {
|
||||
printf("\033[0m\n");
|
||||
fprintf(stderr, "context size exceeded\n");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
if (llama_decode(ctx, batch)) {
|
||||
if (llama_decode_ext(ctx, batch)) {
|
||||
GGML_ABORT("failed to decode\n");
|
||||
}
|
||||
|
||||
@@ -144,9 +147,14 @@ int main(int argc, char ** argv) {
|
||||
response += piece;
|
||||
|
||||
// prepare the next batch with the sampled token
|
||||
batch = llama_batch_get_one(&new_token_id, 1);
|
||||
llama_batch_ext_clear(batch);
|
||||
llama_seq_id seq_id = 0;
|
||||
llama_batch_ext_add_text(batch, new_token_id, n_past, &seq_id, 1, true);
|
||||
n_past++;
|
||||
}
|
||||
|
||||
llama_batch_ext_free(batch);
|
||||
|
||||
return response;
|
||||
};
|
||||
|
||||
|
||||
@@ -143,7 +143,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// prepare a batch for the prompt
|
||||
|
||||
llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());
|
||||
llama_batch_ext * batch = llama_batch_ext_init_from_text(prompt_tokens.data(), prompt_tokens.size(), 0, 0, true);
|
||||
|
||||
// main loop
|
||||
|
||||
@@ -151,14 +151,14 @@ int main(int argc, char ** argv) {
|
||||
int n_decode = 0;
|
||||
llama_token new_token_id;
|
||||
|
||||
for (int n_pos = 0; n_pos + batch.n_tokens < n_prompt + n_predict; ) {
|
||||
for (int n_pos = 0; n_pos + llama_batch_ext_get_n_tokens(batch) < n_prompt + n_predict; ) {
|
||||
// evaluate the current batch with the transformer model
|
||||
if (llama_decode(ctx, batch)) {
|
||||
if (llama_decode_ext(ctx, batch)) {
|
||||
fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
|
||||
return 1;
|
||||
}
|
||||
|
||||
n_pos += batch.n_tokens;
|
||||
n_pos += llama_batch_ext_get_n_tokens(batch);
|
||||
|
||||
// sample the next token
|
||||
{
|
||||
@@ -180,7 +180,9 @@ int main(int argc, char ** argv) {
|
||||
fflush(stdout);
|
||||
|
||||
// prepare the next batch with the sampled token
|
||||
batch = llama_batch_get_one(&new_token_id, 1);
|
||||
llama_batch_ext_clear(batch);
|
||||
llama_seq_id seq_id = 0;
|
||||
llama_batch_ext_add_text(batch, new_token_id, n_pos, &seq_id, 1, true);
|
||||
|
||||
n_decode += 1;
|
||||
}
|
||||
@@ -198,6 +200,7 @@ int main(int argc, char ** argv) {
|
||||
llama_perf_context_print(ctx);
|
||||
fprintf(stderr, "\n");
|
||||
|
||||
llama_batch_ext_free(batch);
|
||||
llama_sampler_free(smpl);
|
||||
llama_free(ctx);
|
||||
llama_model_free(model);
|
||||
|
||||
@@ -24,7 +24,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
common_init();
|
||||
|
||||
if (params.speculative.model.path.empty()) {
|
||||
if (params.speculative.model.empty()) {
|
||||
LOG_ERR("%s: --model-draft is required\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
@@ -113,7 +113,8 @@ int main(int argc, char ** argv) {
|
||||
struct common_sampler * smpl = common_sampler_init(model_tgt, params.sampling);
|
||||
|
||||
// eval the prompt
|
||||
llama_decode(ctx_tgt, llama_batch_get_one(inp.data(), inp.size() - 1));
|
||||
auto batch = llama_batch_ext_ptr::init_from_text(inp.data(), inp.size() - 1, 0, 0, true);
|
||||
llama_decode_ext(ctx_tgt, batch.get());
|
||||
|
||||
// note: keep the last token separate!
|
||||
llama_token id_last = inp.back();
|
||||
@@ -132,7 +133,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
struct common_speculative * spec = common_speculative_init(ctx_dft);
|
||||
|
||||
llama_batch batch_tgt = llama_batch_init(llama_n_batch(ctx_tgt), 0, 1);
|
||||
llama_batch_ext * batch_tgt = llama_batch_ext_init(llama_n_batch(ctx_tgt), 1);
|
||||
|
||||
const auto t_enc_end = ggml_time_us();
|
||||
|
||||
@@ -151,8 +152,9 @@ int main(int argc, char ** argv) {
|
||||
//LOG_DBG("draft: %s\n", string_from(ctx_dft, draft).c_str());
|
||||
|
||||
// always have a token to evaluate from before - id_last
|
||||
common_batch_clear(batch_tgt);
|
||||
common_batch_add (batch_tgt, id_last, n_past++, { 0 }, true);
|
||||
llama_batch_ext_clear(batch_tgt);
|
||||
llama_seq_id seq_id = 0;
|
||||
llama_batch_ext_add_text(batch_tgt, id_last, n_past++, &seq_id, 1, true);
|
||||
|
||||
// evaluate the target model on [id_last, draft0, draft1, ..., draftN-1]
|
||||
{
|
||||
@@ -162,12 +164,12 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < draft.size(); ++i) {
|
||||
common_batch_add(batch_tgt, draft[i], n_past + i, { 0 }, true);
|
||||
llama_batch_ext_add_text(batch_tgt, draft[i], n_past + i, &seq_id, 1, true);
|
||||
}
|
||||
|
||||
//LOG_DBG("target batch: %s\n", string_from(ctx_tgt, batch_tgt).c_str());
|
||||
|
||||
llama_decode(ctx_tgt, batch_tgt);
|
||||
llama_decode_ext(ctx_tgt, batch_tgt);
|
||||
}
|
||||
|
||||
// sample from the full target batch and return the accepted tokens based on the target sampler
|
||||
@@ -253,6 +255,7 @@ int main(int argc, char ** argv) {
|
||||
common_sampler_free(smpl);
|
||||
common_speculative_free(spec);
|
||||
|
||||
llama_batch_ext_free(batch_tgt);
|
||||
llama_backend_free();
|
||||
|
||||
LOG("\n\n");
|
||||
|
||||
@@ -45,8 +45,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
common_init();
|
||||
|
||||
if (params.speculative.model.path.empty()) {
|
||||
if (params.speculative.model.empty()) {
|
||||
LOG_ERR("%s: --model-draft is required\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
@@ -166,9 +165,12 @@ int main(int argc, char ** argv) {
|
||||
const auto t_enc_start = ggml_time_us();
|
||||
|
||||
// eval the prompt with both models
|
||||
llama_decode(ctx_tgt, llama_batch_get_one( inp.data(), n_input - 1));
|
||||
llama_decode(ctx_tgt, llama_batch_get_one(&inp.back(), 1));
|
||||
llama_decode(ctx_dft, llama_batch_get_one( inp.data(), n_input));
|
||||
llama_batch_ext_ptr batch0(llama_batch_ext_init_from_text( inp.data(), n_input - 1, 0, 0, true));
|
||||
llama_batch_ext_ptr batch1(llama_batch_ext_init_from_text(&inp.back(), 1, n_input - 1, 0, true));
|
||||
llama_batch_ext_ptr batch2(llama_batch_ext_init_from_text( inp.data(), n_input , 0, 0, true));
|
||||
llama_decode_ext(ctx_tgt, batch0.get());
|
||||
llama_decode_ext(ctx_tgt, batch1.get());
|
||||
llama_decode_ext(ctx_dft, batch2.get());
|
||||
|
||||
const auto t_enc_end = ggml_time_us();
|
||||
|
||||
@@ -199,8 +201,8 @@ int main(int argc, char ** argv) {
|
||||
drafts[s].smpl = common_sampler_init(model_dft, params.sampling);
|
||||
}
|
||||
|
||||
llama_batch batch_dft = llama_batch_init(llama_n_batch(ctx_dft), 0, 1);
|
||||
llama_batch batch_tgt = llama_batch_init(llama_n_batch(ctx_tgt), 0, n_seq_dft);
|
||||
llama_batch_ext * batch_dft = llama_batch_ext_init(llama_n_batch(ctx_dft), 1);
|
||||
llama_batch_ext * batch_tgt = llama_batch_ext_init(llama_n_batch(ctx_tgt), n_seq_dft);
|
||||
|
||||
const auto t_dec_start = ggml_time_us();
|
||||
|
||||
@@ -331,7 +333,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
active_seqs.erase(s);
|
||||
for (int i = 0; i < n_seq_dft; i++) {
|
||||
for(int i = 0; i < n_seq_dft; i++) {
|
||||
if (i == s) {
|
||||
continue;
|
||||
}
|
||||
@@ -441,12 +443,13 @@ int main(int argc, char ** argv) {
|
||||
drafts[0].dists.push_back(std::vector<llama_token_data>());
|
||||
drafts[0].i_batch_tgt.push_back(0);
|
||||
|
||||
common_batch_clear(batch_dft);
|
||||
common_batch_add (batch_dft, token_id, n_past_dft, { 0 }, true);
|
||||
llama_batch_ext_clear(batch_dft);
|
||||
llama_seq_id seq_id = 0;
|
||||
llama_batch_ext_add_text(batch_dft, token_id, n_past_dft, &seq_id, 1, true);
|
||||
|
||||
llama_kv_self_seq_rm(ctx_dft, 0, n_past_dft, -1);
|
||||
// LOG_DBG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str());
|
||||
llama_decode(ctx_dft, batch_dft);
|
||||
llama_decode_ext(ctx_dft, batch_dft);
|
||||
|
||||
++n_past_dft;
|
||||
}
|
||||
@@ -471,12 +474,19 @@ int main(int argc, char ** argv) {
|
||||
drafts[0].drafting = true;
|
||||
drafts[0].i_batch_dft = 0;
|
||||
|
||||
common_batch_clear(batch_tgt);
|
||||
common_batch_add (batch_tgt, drafts[0].tokens[0], n_past_tgt, { 0 }, true);
|
||||
struct batch_info {
|
||||
llama_token id;
|
||||
llama_pos pos;
|
||||
std::vector<llama_seq_id> seq_id;
|
||||
};
|
||||
|
||||
std::vector<batch_info> batch_tgt_data;
|
||||
|
||||
batch_tgt_data.push_back({ drafts[0].tokens[0], n_past_tgt, {0} });
|
||||
|
||||
// sample n_draft tokens from the draft model using tree-based sampling
|
||||
for (int i = 0; i < n_draft; ++i) {
|
||||
batch_dft.n_tokens = 0;
|
||||
llama_batch_ext_clear(batch_dft);
|
||||
|
||||
for (int s = 0; s < n_seq_dft; ++s) {
|
||||
drafts[s].skip = false;
|
||||
@@ -507,11 +517,10 @@ int main(int argc, char ** argv) {
|
||||
llama_kv_self_seq_cp(ctx_dft, s, n_seq_cur, -1, -1);
|
||||
|
||||
// all previous tokens from this branch are now also part of the new branch
|
||||
for (int t = 0; t < batch_tgt.n_tokens; ++t) {
|
||||
for (int p = 0; p < batch_tgt.n_seq_id[t]; ++p) {
|
||||
if (batch_tgt.seq_id[t][p] == s) {
|
||||
batch_tgt.seq_id[t][batch_tgt.n_seq_id[t]] = n_seq_cur;
|
||||
batch_tgt.n_seq_id[t]++;
|
||||
for (int t = 0; t < (int) batch_tgt_data.size(); ++t) {
|
||||
for (int p = 0; p < (int) batch_tgt_data[t].seq_id.size(); ++p) {
|
||||
if (batch_tgt_data[t].seq_id[p] == s) {
|
||||
batch_tgt_data[t].seq_id.push_back(n_seq_cur);
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -553,32 +562,30 @@ int main(int argc, char ** argv) {
|
||||
drafts[s].dists.push_back({cur_p->data, cur_p->data + cur_p->size});
|
||||
|
||||
// add unique drafted tokens to the target batch
|
||||
drafts[s].i_batch_tgt.push_back(batch_tgt.n_tokens);
|
||||
drafts[s].i_batch_tgt.push_back(batch_tgt_data.size());
|
||||
|
||||
common_batch_add(batch_tgt, id, n_past_tgt + i + 1, { s }, true);
|
||||
batch_tgt_data.push_back({ id, n_past_tgt + i + 1, { s }});
|
||||
|
||||
// add the token to the batch for batched decoding with the draft model
|
||||
drafts[s].i_batch_dft = batch_dft.n_tokens;
|
||||
drafts[s].i_batch_dft = llama_batch_ext_add_text(batch_dft, id, n_past_cur, &s, 1, true);
|
||||
|
||||
common_batch_add(batch_dft, id, n_past_cur, { s }, true);
|
||||
|
||||
if (batch_tgt.n_tokens > n_draft) {
|
||||
if (batch_tgt_data.size() > (size_t) n_draft) {
|
||||
drafts[s].drafting = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// no sequence is drafting anymore
|
||||
if (batch_dft.n_tokens == 0) {
|
||||
if (llama_batch_ext_get_n_tokens(batch_dft) == 0) {
|
||||
break;
|
||||
}
|
||||
|
||||
// evaluate the drafted tokens on the draft model
|
||||
llama_decode(ctx_dft, batch_dft);
|
||||
llama_decode_ext(ctx_dft, batch_dft);
|
||||
++n_past_cur;
|
||||
++n_drafted;
|
||||
|
||||
if (batch_tgt.n_tokens > n_draft) {
|
||||
if (batch_tgt_data.size() > (size_t) n_draft) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -590,8 +597,15 @@ int main(int argc, char ** argv) {
|
||||
llama_kv_self_seq_cp(ctx_tgt, 0, s, -1, -1);
|
||||
}
|
||||
|
||||
llama_batch_ext_clear(batch_tgt);
|
||||
for (int i = 0; i < (int) batch_tgt_data.size(); ++i) {
|
||||
const auto & data = batch_tgt_data[i];
|
||||
|
||||
llama_batch_ext_add_text(batch_tgt, data.id, data.pos, data.seq_id.data(), data.seq_id.size(), true);
|
||||
}
|
||||
|
||||
// LOG_DBG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str());
|
||||
llama_decode(ctx_tgt, batch_tgt);
|
||||
llama_decode_ext(ctx_tgt, batch_tgt);
|
||||
++n_past_tgt;
|
||||
}
|
||||
|
||||
@@ -634,7 +648,8 @@ int main(int argc, char ** argv) {
|
||||
common_sampler_free(drafts[s].smpl);
|
||||
}
|
||||
|
||||
llama_batch_free(batch_dft);
|
||||
llama_batch_ext_free(batch_dft);
|
||||
llama_batch_ext_free(batch_tgt);
|
||||
|
||||
llama_backend_free();
|
||||
|
||||
|
||||
@@ -571,13 +571,14 @@ int main(int argc, char ** argv) {
|
||||
model_ttc = llama_init_ttc.model.get();
|
||||
ctx_ttc = llama_init_ttc.context.get();
|
||||
|
||||
if (model_ttc == nullptr || ctx_ttc == nullptr) {
|
||||
return ENOENT;
|
||||
}
|
||||
|
||||
const llama_vocab * vocab = llama_model_get_vocab(model_ttc);
|
||||
|
||||
params.model = params.vocoder.model;
|
||||
// TODO: refactor in a common struct
|
||||
params.model = params.vocoder.model;
|
||||
params.model_url = params.vocoder.model_url;
|
||||
params.hf_repo = params.vocoder.hf_repo;
|
||||
params.hf_file = params.vocoder.hf_file;
|
||||
|
||||
params.embedding = true;
|
||||
|
||||
common_init_result llama_init_cts = common_init_from_params(params);
|
||||
@@ -585,10 +586,6 @@ int main(int argc, char ** argv) {
|
||||
model_cts = llama_init_cts.model.get();
|
||||
ctx_cts = llama_init_cts.context.get();
|
||||
|
||||
if (model_cts == nullptr || ctx_cts == nullptr) {
|
||||
return ENOENT;
|
||||
}
|
||||
|
||||
std::vector<common_sampler *> smpl(n_parallel);
|
||||
for (int i = 0; i < n_parallel; ++i) {
|
||||
params.sampling.no_perf = (i != 0);
|
||||
@@ -694,13 +691,11 @@ lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|14
|
||||
const std::string voice_data = audio_data;
|
||||
|
||||
auto tmp = common_tokenize(vocab, voice_data, false, true);
|
||||
|
||||
std::ostringstream tokens_oss;
|
||||
printf("\n\n");
|
||||
for (size_t i = 0; i < tmp.size(); ++i) {
|
||||
tokens_oss << tmp[i] << ", ";
|
||||
printf("%d, ", tmp[i]);
|
||||
}
|
||||
LOG_INF("\n\n%s: llama tokens: %s\n\n", __func__, tokens_oss.str().c_str());
|
||||
|
||||
printf("\n\n");
|
||||
prompt_add(prompt_inp, tmp);
|
||||
#else
|
||||
prompt_add(prompt_inp, llama_tokens {
|
||||
@@ -823,7 +818,7 @@ lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|14
|
||||
|
||||
// create a llama_batch
|
||||
// we use this object to submit token data for decoding
|
||||
llama_batch batch = llama_batch_init(std::max(prompt_inp.size(), (size_t) n_parallel), 0, n_parallel);
|
||||
llama_batch_ext * batch = llama_batch_ext_init(std::max(prompt_inp.size(), (size_t) n_parallel), n_parallel);
|
||||
|
||||
std::vector<llama_seq_id> seq_ids(n_parallel, 0);
|
||||
for (int32_t i = 0; i < n_parallel; ++i) {
|
||||
@@ -832,14 +827,14 @@ lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|14
|
||||
|
||||
// evaluate the initial prompt
|
||||
for (size_t i = 0; i < prompt_inp.size(); ++i) {
|
||||
common_batch_add(batch, prompt_inp[i], i, seq_ids, false);
|
||||
llama_batch_ext_add_text(batch, prompt_inp[i], i, seq_ids.data(), seq_ids.size(), false);
|
||||
}
|
||||
GGML_ASSERT(batch.n_tokens == (int) prompt_inp.size());
|
||||
GGML_ASSERT(llama_batch_ext_get_n_tokens(batch) == (int) prompt_inp.size());
|
||||
|
||||
// llama_decode will output logits only for the last token of the prompt
|
||||
batch.logits[batch.n_tokens - 1] = true;
|
||||
llama_batch_ext_set_output_last(batch);
|
||||
|
||||
if (llama_decode(ctx_ttc, batch) != 0) {
|
||||
if (llama_decode_ext(ctx_ttc, batch) != 0) {
|
||||
LOG_ERR("%s: llama_decode() failed\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
@@ -858,16 +853,16 @@ lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|14
|
||||
|
||||
// remember the batch index of the last token for each parallel sequence
|
||||
// we need this to determine which logits to sample from
|
||||
std::vector<int32_t> i_batch(n_parallel, batch.n_tokens - 1);
|
||||
std::vector<int32_t> i_batch(n_parallel, llama_batch_ext_get_n_tokens(batch) - 1);
|
||||
|
||||
int n_past = batch.n_tokens;
|
||||
int n_past = llama_batch_ext_get_n_tokens(batch);
|
||||
int n_decode = 0;
|
||||
|
||||
bool next_token_uses_guide_token = true;
|
||||
|
||||
while (n_decode <= n_predict) {
|
||||
// prepare the next batch
|
||||
common_batch_clear(batch);
|
||||
llama_batch_ext_clear(batch);
|
||||
|
||||
// sample the next token for each parallel sequence / stream
|
||||
for (int32_t i = 0; i < n_parallel; ++i) {
|
||||
@@ -923,14 +918,14 @@ lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|14
|
||||
//LOG_CNT("%d", i);
|
||||
}
|
||||
|
||||
i_batch[i] = batch.n_tokens;
|
||||
i_batch[i] = llama_batch_ext_get_n_tokens(batch);
|
||||
|
||||
// push this new token for next evaluation
|
||||
common_batch_add(batch, new_token_id, n_past, { i }, true);
|
||||
llama_batch_ext_add_text(batch, new_token_id, n_past, &i, 1, true);
|
||||
}
|
||||
|
||||
// all streams are finished
|
||||
if (batch.n_tokens == 0) {
|
||||
if (llama_batch_ext_get_n_tokens(batch) == 0) {
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -938,13 +933,13 @@ lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|14
|
||||
n_past += 1;
|
||||
|
||||
// evaluate the current batch with the transformer model
|
||||
if (llama_decode(ctx_ttc, batch)) {
|
||||
if (llama_decode_ext(ctx_ttc, batch)) {
|
||||
LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
llama_batch_free(batch);
|
||||
llama_batch_ext_free(batch);
|
||||
|
||||
LOG("\n");
|
||||
LOG_INF("%s: time for decoder: %.3f ms\n", __func__, (ggml_time_us() - t_dec_start) / 1000.0f);
|
||||
@@ -1013,14 +1008,15 @@ lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|14
|
||||
|
||||
const int n_codes = codes.size();
|
||||
|
||||
llama_batch batch = llama_batch_init(n_codes, 0, 1);
|
||||
llama_batch_ext * batch = llama_batch_ext_init(n_codes, 1);
|
||||
|
||||
for (size_t i = 0; i < codes.size(); ++i) {
|
||||
common_batch_add(batch, codes[i], i, { 0 }, true); // TODO: all logits?
|
||||
llama_seq_id seq_id = 0;
|
||||
llama_batch_ext_add_text(batch, codes[i], i, &seq_id, 1, true); // TODO: all logits?
|
||||
}
|
||||
GGML_ASSERT(batch.n_tokens == n_codes);
|
||||
GGML_ASSERT(llama_batch_ext_get_n_tokens(batch) == n_codes);
|
||||
|
||||
if (llama_decode(ctx_cts, batch) != 0) {
|
||||
if (llama_decode_ext(ctx_cts, batch) != 0) {
|
||||
LOG_ERR("%s: llama_decode() failed\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
@@ -1084,6 +1080,7 @@ lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|14
|
||||
retval = ENOENT;
|
||||
}
|
||||
|
||||
llama_batch_ext_free(batch);
|
||||
llama_backend_free();
|
||||
|
||||
return retval;
|
||||
|
||||
@@ -100,10 +100,6 @@ else()
|
||||
set(INS_ENB ON)
|
||||
endif()
|
||||
|
||||
message(DEBUG "GGML_NATIVE : ${GGML_NATIVE}")
|
||||
message(DEBUG "GGML_NATIVE_DEFAULT : ${GGML_NATIVE_DEFAULT}")
|
||||
message(DEBUG "INS_ENB : ${INS_ENB}")
|
||||
|
||||
option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF)
|
||||
option(GGML_CPU_AARCH64 "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)
|
||||
option(GGML_CPU_KLEIDIAI "ggml: use KleidiAI optimized kernels if applicable" OFF)
|
||||
@@ -127,12 +123,10 @@ endif()
|
||||
option(GGML_LASX "ggml: enable lasx" ON)
|
||||
option(GGML_LSX "ggml: enable lsx" ON)
|
||||
option(GGML_RVV "ggml: enable rvv" ON)
|
||||
option(GGML_RV_ZFH "ggml: enable riscv zfh" OFF)
|
||||
option(GGML_VXE "ggml: enable vxe" ON)
|
||||
|
||||
option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
|
||||
set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")
|
||||
set(GGML_CPU_POWERPC_CPUTYPE "" CACHE STRING "ggml: CPU type for PowerPC")
|
||||
set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")
|
||||
|
||||
|
||||
if (WIN32)
|
||||
|
||||
@@ -1,22 +0,0 @@
|
||||
find_package(Git)
|
||||
|
||||
# the commit's SHA1
|
||||
execute_process(COMMAND
|
||||
"${GIT_EXECUTABLE}" describe --match=NeVeRmAtCh --always --abbrev=8
|
||||
WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
|
||||
OUTPUT_VARIABLE GIT_SHA1
|
||||
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
|
||||
|
||||
# the date of the commit
|
||||
execute_process(COMMAND
|
||||
"${GIT_EXECUTABLE}" log -1 --format=%ad --date=local
|
||||
WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
|
||||
OUTPUT_VARIABLE GIT_DATE
|
||||
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
|
||||
|
||||
# the subject of the commit
|
||||
execute_process(COMMAND
|
||||
"${GIT_EXECUTABLE}" log -1 --format=%s
|
||||
WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
|
||||
OUTPUT_VARIABLE GIT_COMMIT_SUBJECT
|
||||
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
|
||||
@@ -5,7 +5,7 @@
|
||||
|
||||
set_and_check(GGML_INCLUDE_DIR "@PACKAGE_GGML_INCLUDE_INSTALL_DIR@")
|
||||
set_and_check(GGML_LIB_DIR "@PACKAGE_GGML_LIB_INSTALL_DIR@")
|
||||
#set_and_check(GGML_BIN_DIR "@PACKAGE_GGML_BIN_INSTALL_DIR@")
|
||||
set_and_check(GGML_BIN_DIR "@PACKAGE_GGML_BIN_INSTALL_DIR@")
|
||||
|
||||
find_package(Threads REQUIRED)
|
||||
|
||||
|
||||
@@ -17,9 +17,7 @@ GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const c
|
||||
|
||||
GGML_BACKEND_API void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total);
|
||||
|
||||
GGML_BACKEND_API void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint,
|
||||
const char * cache_dir,
|
||||
size_t free_mem, size_t total_mem);
|
||||
GGML_BACKEND_API void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem);
|
||||
|
||||
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_rpc_reg(void);
|
||||
|
||||
|
||||
@@ -1791,11 +1791,11 @@ extern "C" {
|
||||
|
||||
#define GGML_KQ_MASK_PAD 64
|
||||
|
||||
// q: [n_embd_k, n_batch, n_head, 1]
|
||||
// k: [n_embd_k, n_kv, n_head_kv, 1]
|
||||
// v: [n_embd_v, n_kv, n_head_kv, 1] !! not transposed !!
|
||||
// mask: [n_kv, n_batch_pad, 1, 1] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
|
||||
// res: [n_embd_v, n_head, n_batch, 1] !! permuted !!
|
||||
// q: [n_embd, n_batch, n_head, 1]
|
||||
// k: [n_embd, n_kv, n_head_kv, 1]
|
||||
// v: [n_embd, n_kv, n_head_kv, 1] !! not transposed !!
|
||||
// mask: [n_kv, n_batch_pad, 1, 1] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
|
||||
// res: [n_embd, n_head, n_batch, 1] !! permuted !!
|
||||
GGML_API struct ggml_tensor * ggml_flash_attn_ext(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * q,
|
||||
|
||||
@@ -65,7 +65,7 @@ if (GGML_LTO)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (GGML_CCACHE AND NOT CMAKE_C_COMPILER_LAUNCHER AND NOT CMAKE_CXX_COMPILER_LAUNCHER)
|
||||
if (GGML_CCACHE)
|
||||
find_program(GGML_CCACHE_FOUND ccache)
|
||||
find_program(GGML_SCCACHE_FOUND sccache)
|
||||
|
||||
@@ -76,11 +76,7 @@ if (GGML_CCACHE AND NOT CMAKE_C_COMPILER_LAUNCHER AND NOT CMAKE_CXX_COMPILER_LAU
|
||||
set(GGML_CCACHE_VARIANT sccache)
|
||||
endif()
|
||||
# TODO: should not be set globally
|
||||
if (GGML_SYCL AND GGML_CCACHE_FOUND AND WIN32)
|
||||
set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "ccache compiler_type=icl")
|
||||
else ()
|
||||
set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${GGML_CCACHE_VARIANT}")
|
||||
endif ()
|
||||
set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${GGML_CCACHE_VARIANT}")
|
||||
set(ENV{CCACHE_SLOPPINESS} time_macros)
|
||||
message(STATUS "${GGML_CCACHE_VARIANT} found, compilation results will be cached. Disable with GGML_CCACHE=OFF.")
|
||||
else()
|
||||
@@ -329,10 +325,6 @@ if (CMAKE_SYSTEM_NAME MATCHES "Android")
|
||||
target_link_libraries(ggml-base PRIVATE dl)
|
||||
endif()
|
||||
|
||||
if(CMAKE_SYSTEM_NAME MATCHES "visionOS")
|
||||
target_compile_definitions(ggml-base PUBLIC _DARWIN_C_SOURCE)
|
||||
endif()
|
||||
|
||||
if (BUILD_SHARED_LIBS)
|
||||
foreach (target ggml-base ggml)
|
||||
set_target_properties(${target} PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
|
||||
168
ggml/src/ggml-cann/.clang-format
Normal file
@@ -0,0 +1,168 @@
|
||||
---
|
||||
Language: Cpp
|
||||
# BasedOnStyle: Google
|
||||
AccessModifierOffset: -1
|
||||
AlignAfterOpenBracket: Align
|
||||
AlignConsecutiveMacros: false
|
||||
AlignConsecutiveAssignments: false
|
||||
AlignConsecutiveDeclarations: false
|
||||
AlignEscapedNewlines: Left
|
||||
AlignOperands: true
|
||||
AlignTrailingComments: true
|
||||
AllowAllArgumentsOnNextLine: true
|
||||
AllowAllConstructorInitializersOnNextLine: true
|
||||
AllowAllParametersOfDeclarationOnNextLine: true
|
||||
AllowShortBlocksOnASingleLine: Never
|
||||
AllowShortCaseLabelsOnASingleLine: false
|
||||
AllowShortFunctionsOnASingleLine: All
|
||||
AllowShortLambdasOnASingleLine: All
|
||||
AllowShortIfStatementsOnASingleLine: WithoutElse
|
||||
AllowShortLoopsOnASingleLine: true
|
||||
AlwaysBreakAfterDefinitionReturnType: None
|
||||
AlwaysBreakAfterReturnType: None
|
||||
AlwaysBreakBeforeMultilineStrings: true
|
||||
AlwaysBreakTemplateDeclarations: Yes
|
||||
BinPackArguments: true
|
||||
BinPackParameters: true
|
||||
BraceWrapping:
|
||||
AfterCaseLabel: false
|
||||
AfterClass: false
|
||||
AfterControlStatement: false
|
||||
AfterEnum: false
|
||||
AfterFunction: false
|
||||
AfterNamespace: false
|
||||
AfterObjCDeclaration: false
|
||||
AfterStruct: false
|
||||
AfterUnion: false
|
||||
AfterExternBlock: false
|
||||
BeforeCatch: false
|
||||
BeforeElse: false
|
||||
IndentBraces: false
|
||||
SplitEmptyFunction: true
|
||||
SplitEmptyRecord: true
|
||||
SplitEmptyNamespace: true
|
||||
BreakBeforeBinaryOperators: None
|
||||
BreakBeforeBraces: Attach
|
||||
BreakBeforeInheritanceComma: false
|
||||
BreakInheritanceList: BeforeColon
|
||||
BreakBeforeTernaryOperators: true
|
||||
BreakConstructorInitializersBeforeComma: false
|
||||
BreakConstructorInitializers: BeforeColon
|
||||
BreakAfterJavaFieldAnnotations: false
|
||||
BreakStringLiterals: true
|
||||
ColumnLimit: 80
|
||||
CommentPragmas: '^ IWYU pragma:'
|
||||
CompactNamespaces: false
|
||||
ConstructorInitializerAllOnOneLineOrOnePerLine: true
|
||||
ConstructorInitializerIndentWidth: 4
|
||||
ContinuationIndentWidth: 4
|
||||
Cpp11BracedListStyle: true
|
||||
DeriveLineEnding: true
|
||||
DerivePointerAlignment: true
|
||||
DisableFormat: false
|
||||
ExperimentalAutoDetectBinPacking: false
|
||||
FixNamespaceComments: true
|
||||
ForEachMacros:
|
||||
- foreach
|
||||
- Q_FOREACH
|
||||
- BOOST_FOREACH
|
||||
IncludeBlocks: Regroup
|
||||
IncludeCategories:
|
||||
- Regex: '^<ext/.*\.h>'
|
||||
Priority: 2
|
||||
SortPriority: 0
|
||||
- Regex: '^<.*\.h>'
|
||||
Priority: 1
|
||||
SortPriority: 0
|
||||
- Regex: '^<.*'
|
||||
Priority: 2
|
||||
SortPriority: 0
|
||||
- Regex: '.*'
|
||||
Priority: 3
|
||||
SortPriority: 0
|
||||
IncludeIsMainRegex: '([-_](test|unittest))?$'
|
||||
IncludeIsMainSourceRegex: ''
|
||||
IndentCaseLabels: true
|
||||
IndentGotoLabels: true
|
||||
IndentPPDirectives: None
|
||||
IndentWidth: 4
|
||||
IndentWrappedFunctionNames: false
|
||||
JavaScriptQuotes: Leave
|
||||
JavaScriptWrapImports: true
|
||||
KeepEmptyLinesAtTheStartOfBlocks: false
|
||||
MacroBlockBegin: ''
|
||||
MacroBlockEnd: ''
|
||||
MaxEmptyLinesToKeep: 1
|
||||
NamespaceIndentation: None
|
||||
ObjCBinPackProtocolList: Never
|
||||
ObjCBlockIndentWidth: 2
|
||||
ObjCSpaceAfterProperty: false
|
||||
ObjCSpaceBeforeProtocolList: true
|
||||
PenaltyBreakAssignment: 2
|
||||
PenaltyBreakBeforeFirstCallParameter: 1
|
||||
PenaltyBreakComment: 300
|
||||
PenaltyBreakFirstLessLess: 120
|
||||
PenaltyBreakString: 1000
|
||||
PenaltyBreakTemplateDeclaration: 10
|
||||
PenaltyExcessCharacter: 1000000
|
||||
PenaltyReturnTypeOnItsOwnLine: 200
|
||||
PointerAlignment: Left
|
||||
RawStringFormats:
|
||||
- Language: Cpp
|
||||
Delimiters:
|
||||
- cc
|
||||
- CC
|
||||
- cpp
|
||||
- Cpp
|
||||
- CPP
|
||||
- 'c++'
|
||||
- 'C++'
|
||||
CanonicalDelimiter: ''
|
||||
BasedOnStyle: google
|
||||
- Language: TextProto
|
||||
Delimiters:
|
||||
- pb
|
||||
- PB
|
||||
- proto
|
||||
- PROTO
|
||||
EnclosingFunctions:
|
||||
- EqualsProto
|
||||
- EquivToProto
|
||||
- PARSE_PARTIAL_TEXT_PROTO
|
||||
- PARSE_TEST_PROTO
|
||||
- PARSE_TEXT_PROTO
|
||||
- ParseTextOrDie
|
||||
- ParseTextProtoOrDie
|
||||
CanonicalDelimiter: ''
|
||||
BasedOnStyle: google
|
||||
ReflowComments: true
|
||||
SortIncludes: true
|
||||
SortUsingDeclarations: true
|
||||
SpaceAfterCStyleCast: false
|
||||
SpaceAfterLogicalNot: false
|
||||
SpaceAfterTemplateKeyword: true
|
||||
SpaceBeforeAssignmentOperators: true
|
||||
SpaceBeforeCpp11BracedList: false
|
||||
SpaceBeforeCtorInitializerColon: true
|
||||
SpaceBeforeInheritanceColon: true
|
||||
SpaceBeforeParens: ControlStatements
|
||||
SpaceBeforeRangeBasedForLoopColon: true
|
||||
SpaceInEmptyBlock: false
|
||||
SpaceInEmptyParentheses: false
|
||||
SpacesBeforeTrailingComments: 2
|
||||
SpacesInAngles: false
|
||||
SpacesInConditionalStatement: false
|
||||
SpacesInContainerLiterals: true
|
||||
SpacesInCStyleCastParentheses: false
|
||||
SpacesInParentheses: false
|
||||
SpacesInSquareBrackets: false
|
||||
SpaceBeforeSquareBrackets: false
|
||||
Standard: Auto
|
||||
StatementMacros:
|
||||
- Q_UNUSED
|
||||
- QT_REQUIRE_VERSION
|
||||
TabWidth: 8
|
||||
UseCRLF: false
|
||||
UseTab: Never
|
||||
...
|
||||
|
||||
@@ -51,11 +51,13 @@ if (CANN_INSTALL_DIR)
|
||||
${CANN_INSTALL_DIR}/acllib/include
|
||||
)
|
||||
|
||||
add_subdirectory(kernels)
|
||||
list(APPEND CANN_LIBRARIES
|
||||
ascendcl
|
||||
nnopbase
|
||||
opapi
|
||||
acl_op_compiler
|
||||
ascendc_kernels
|
||||
)
|
||||
|
||||
file(GLOB GGML_SOURCES_CANN "*.cpp")
|
||||
|
||||
@@ -54,7 +54,9 @@ aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne,
|
||||
// added.
|
||||
int64_t acl_ne[GGML_MAX_DIMS * 2], acl_stride[GGML_MAX_DIMS * 2];
|
||||
|
||||
int64_t acl_storage_len = 0;
|
||||
if (ne == nullptr) {
|
||||
acl_storage_len = ggml_nbytes(tensor);
|
||||
for (int i = 0; i < GGML_MAX_DIMS; i++) {
|
||||
acl_ne[i] = tensor->ne[i];
|
||||
// The step size of acl is in elements.
|
||||
@@ -63,18 +65,14 @@ aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne,
|
||||
} else {
|
||||
// With bcast
|
||||
for (int i = 0; i < dims; i++) {
|
||||
acl_storage_len += (ne[i] - 1) * nb[i];
|
||||
acl_ne[i] = ne[i];
|
||||
acl_stride[i] = nb[i] / ggml_element_size(tensor);
|
||||
}
|
||||
}
|
||||
|
||||
int64_t final_dims = (dims == 0 ? GGML_MAX_DIMS : dims);
|
||||
int64_t acl_storage_len = 1;
|
||||
for (int i = 0; i < final_dims; i++) {
|
||||
acl_storage_len += (acl_ne[i] - 1) * acl_stride[i];
|
||||
}
|
||||
|
||||
// Reverse ne and stride.
|
||||
int64_t final_dims = (dims == 0 ? GGML_MAX_DIMS : dims);
|
||||
std::reverse(acl_ne, acl_ne + final_dims);
|
||||
std::reverse(acl_stride, acl_stride + final_dims);
|
||||
|
||||
|
||||
@@ -101,14 +101,14 @@ aclTensor* ggml_cann_create_tensor(void* data_ptr, aclDataType dtype,
|
||||
tmp_stride[i] = nb[i] / type_size;
|
||||
}
|
||||
|
||||
int64_t acl_storage_len = 1;
|
||||
for (int i = 0; i < dims; i++) {
|
||||
acl_storage_len += (tmp_ne[i] - 1) * tmp_stride[i];
|
||||
}
|
||||
|
||||
std::reverse(tmp_ne, tmp_ne + dims);
|
||||
std::reverse(tmp_stride, tmp_stride + dims);
|
||||
|
||||
int64_t acl_storage_len = 0;
|
||||
for (int i = 0; i < dims; i++) {
|
||||
acl_storage_len += (ne[i] - 1) * nb[i];
|
||||
}
|
||||
|
||||
aclTensor* acl_tensor =
|
||||
aclCreateTensor(tmp_ne, dims, dtype, tmp_stride, offset / type_size,
|
||||
format, &acl_storage_len, 1, data_ptr);
|
||||
|
||||
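A standalone sketch of the storage-length formula used in the two hunks above: for a (possibly non-contiguous) view with element counts ne[] and element strides stride[], the last addressable element sits at offset sum_i (ne[i]-1)*stride[i], so the required storage length is that plus one. The concrete shape below is an arbitrary example, not backend code.

#include <cstdint>
#include <cstdio>

int main() {
    const int     dims      = 2;
    const int64_t ne[2]     = { 4, 3 };   // 4 x 3 view
    const int64_t stride[2] = { 1, 8 };   // rows padded to 8 elements

    int64_t storage_len = 1;
    for (int i = 0; i < dims; i++) {
        storage_len += (ne[i] - 1) * stride[i];
    }
    std::printf("storage_len = %lld\n", (long long) storage_len); // 1 + 3*1 + 2*8 = 20
    return 0;
}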
@@ -30,7 +30,6 @@
|
||||
#include <aclnnop/aclnn_copy.h>
|
||||
#include <aclnnop/aclnn_cos.h>
|
||||
#include <aclnnop/aclnn_div.h>
|
||||
#include <aclnnop/aclnn_embedding.h>
|
||||
#include <aclnnop/aclnn_exp.h>
|
||||
#include <aclnnop/aclnn_fill_scalar.h>
|
||||
#include <aclnnop/aclnn_group_norm.h>
|
||||
@@ -51,7 +50,6 @@
|
||||
#include <aclnnop/aclnn_triu.h>
|
||||
#include <aclnnop/aclnn_upsample_nearest_2d.h>
|
||||
#include <aclnnop/aclnn_weight_quant_batch_matmul_v2.h>
|
||||
#include <aclnnop/aclnn_argmax.h>
|
||||
#include <float.h>
|
||||
|
||||
#include <cmath>
|
||||
@@ -60,6 +58,7 @@
|
||||
#include <vector>
|
||||
|
||||
#include "ggml-impl.h"
|
||||
#include "kernels/ascendc_kernels.h"
|
||||
|
||||
#define GGML_COMMON_DECL_C
|
||||
|
||||
@@ -100,35 +99,6 @@ static void aclnn_repeat(ggml_backend_cann_context& ctx, aclTensor* acl_src,
|
||||
ACL_CHECK(aclDestroyIntArray(repeats));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Casts the elements of a tensor to a specified data type using the CANN backend.
|
||||
*
|
||||
* @details This function performs a type conversion on the elements of the input tensor `acl_src`
|
||||
* and stores the results in the destination tensor `acl_dst`. The conversion type is
|
||||
* determined based on the `dst` tensor's data type.
|
||||
*
|
||||
* @param ctx The context for the CANN backend operations.
|
||||
* @param acl_src The source tensor whose elements will be cast.
|
||||
* @param acl_dst The destination tensor that will store the casted elements.
|
||||
* @param dst The ggml tensor specifying the target data type.
|
||||
*/
|
||||
static void aclnn_cast(ggml_backend_cann_context& ctx, aclTensor* acl_src,
|
||||
aclTensor* acl_dst, ggml_tensor* dst) {
|
||||
uint64_t workspaceSize = 0;
|
||||
aclOpExecutor* executor;
|
||||
void* workspaceAddr = nullptr;
|
||||
ACL_CHECK(aclnnCastGetWorkspaceSize(acl_src,
|
||||
ggml_cann_type_mapping(dst->type),
|
||||
acl_dst, &workspaceSize, &executor));
|
||||
|
||||
if (workspaceSize > 0) {
|
||||
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
|
||||
workspaceAddr = workspace_allocator.get();
|
||||
}
|
||||
|
||||
ACL_CHECK(aclnnCast(workspaceAddr, workspaceSize, executor, ctx.stream()));
|
||||
}
|
||||
|
||||
void ggml_cann_repeat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
ggml_tensor* src = dst->src[0];
|
||||
GGML_ASSERT(ggml_can_repeat(src, dst));
|
||||
@@ -359,6 +329,8 @@ void ggml_cann_sqr(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
|
||||
void ggml_cann_clamp(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
ggml_tensor* src = dst->src[0];
|
||||
GGML_ASSERT(src->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||
|
||||
float min;
|
||||
float max;
|
||||
@@ -917,76 +889,173 @@ static void cann_copy(ggml_backend_cann_context& ctx, aclTensor* acl_src,
|
||||
}
|
||||
|
||||
void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
ggml_tensor* src0 = dst->src[0];
|
||||
ggml_tensor* src = dst->src[0];
|
||||
|
||||
aclTensor* acl_src = ggml_cann_create_tensor(src0);
|
||||
aclTensor* acl_src = ggml_cann_create_tensor(src);
|
||||
aclTensor* acl_dst = ggml_cann_create_tensor(dst);
|
||||
if (ggml_are_same_shape(src0, dst)) {
|
||||
if (dst->type == src0->type) {
|
||||
cann_copy(ctx, acl_src, acl_dst);
|
||||
} else {
|
||||
aclnn_cast(ctx, acl_src, acl_dst, dst);
|
||||
}
|
||||
} else {
|
||||
if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst)) {
|
||||
if (dst->type == src0->type) {
|
||||
size_t cpy_size = ggml_nbytes(dst);
|
||||
ACL_CHECK(aclrtMemcpyAsync(
|
||||
dst->data, cpy_size, src0->data, cpy_size,
|
||||
ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream()));
|
||||
return;
|
||||
} else {
|
||||
ggml_cann_pool_alloc src_buffer_allocator(
|
||||
ctx.pool(),
|
||||
ggml_nelements(dst) * ggml_type_size(dst->type));
|
||||
void* src_trans_buffer = src_buffer_allocator.get();
|
||||
size_t src_trans_nb[GGML_MAX_DIMS];
|
||||
src_trans_nb[0] = ggml_type_size(dst->type);
|
||||
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||
src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
|
||||
}
|
||||
aclTensor* src_trans_tensor = ggml_cann_create_tensor(
|
||||
src_trans_buffer, ggml_cann_type_mapping(dst->type),
|
||||
ggml_type_size(dst->type), src0->ne, src_trans_nb,
|
||||
GGML_MAX_DIMS);
|
||||
|
||||
aclnn_cast(ctx, acl_src, src_trans_tensor, dst);
|
||||
size_t cpy_size = ggml_nbytes(dst);
|
||||
ACL_CHECK(aclrtMemcpyAsync(
|
||||
dst->data, cpy_size, src_trans_buffer, cpy_size,
|
||||
ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream()));
|
||||
ACL_CHECK(aclDestroyTensor(src_trans_tensor));
|
||||
return;
|
||||
}
|
||||
} else if (ggml_is_contiguous(dst)) {
|
||||
ggml_cann_pool_alloc src_buffer_allocator(
|
||||
ctx.pool(), ggml_nelements(dst) * ggml_type_size(dst->type));
|
||||
void* src_trans_buffer = src_buffer_allocator.get();
|
||||
size_t src_trans_nb[GGML_MAX_DIMS];
|
||||
src_trans_nb[0] = ggml_type_size(dst->type);
|
||||
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||
src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
|
||||
}
|
||||
aclTensor* src_trans_tensor = ggml_cann_create_tensor(
|
||||
src_trans_buffer, ggml_cann_type_mapping(dst->type),
|
||||
ggml_type_size(dst->type), src0->ne, src_trans_nb,
|
||||
GGML_MAX_DIMS);
|
||||
ggml_cann_pool_alloc src_extra_allocator(ctx.pool(), sizeof(ggml_tensor));
|
||||
ggml_cann_pool_alloc dst_extra_allocator(ctx.pool(), sizeof(ggml_tensor));
|
||||
src->extra = src_extra_allocator.get();
|
||||
dst->extra = dst_extra_allocator.get();
|
||||
ACL_CHECK(aclrtMemcpyAsync(src->extra, sizeof(ggml_tensor), src,
|
||||
sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE,
|
||||
ctx.stream()));
|
||||
ACL_CHECK(aclrtMemcpyAsync(dst->extra, sizeof(ggml_tensor), dst,
|
||||
sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE,
|
||||
ctx.stream()));
|
||||
|
||||
aclnn_cast(ctx, acl_src, src_trans_tensor, dst);
|
||||
|
||||
size_t cpy_size = ggml_nbytes(dst);
|
||||
ACL_CHECK(aclrtMemcpyAsync(dst->data, cpy_size, src_trans_buffer,
|
||||
cpy_size, ACL_MEMCPY_DEVICE_TO_DEVICE,
|
||||
ctx.stream()));
|
||||
ACL_CHECK(aclDestroyTensor(src_trans_tensor));
|
||||
return;
|
||||
} else {
|
||||
GGML_ABORT("Unsupport dst is not tontiguous.");
|
||||
}
|
||||
if ((dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32) &&
|
||||
ggml_are_same_shape(src, dst)) {
|
||||
cann_copy(ctx, acl_src, acl_dst);
|
||||
ACL_CHECK(aclDestroyTensor(acl_src));
|
||||
ACL_CHECK(aclDestroyTensor(acl_dst));
|
||||
return;
|
||||
}
|
||||
// TODO: simplify
|
||||
if (src->type == GGML_TYPE_F16) {
|
||||
if (dst->type == GGML_TYPE_Q8_0) {
|
||||
aclrtlaunch_ascendc_quantize_f16_q8_0(
|
||||
24, ctx.stream(), src->data, dst->data,
|
||||
((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb,
|
||||
((ggml_tensor*)dst->extra)->ne);
|
||||
return;
|
||||
}
|
||||
if (dst->type == GGML_TYPE_Q4_0) {
|
||||
aclrtlaunch_ascendc_quantize_f16_to_q4_0(
|
||||
24, ctx.stream(), src->data, dst->data,
|
||||
((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb,
|
||||
((ggml_tensor*)dst->extra)->ne);
|
||||
return;
|
||||
}
|
||||
if (dst->type == GGML_TYPE_F16) {
|
||||
if (ggml_are_same_shape(src, dst)) {
|
||||
cann_copy(ctx, acl_src, acl_dst);
|
||||
ACL_CHECK(aclDestroyTensor(acl_src));
|
||||
ACL_CHECK(aclDestroyTensor(acl_dst));
|
||||
return;
|
||||
}
|
||||
if (ggml_is_contiguous(dst)) {
|
||||
const size_t src_type_size = ggml_type_size(src->type);
|
||||
if (src->nb[0] == src_type_size) {
|
||||
// src is contiguous in the first dimension, copy row by row
|
||||
int64_t rows_num = ggml_nrows(src);
|
||||
|
||||
ACL_CHECK(aclDestroyTensor(acl_src));
|
||||
ACL_CHECK(aclDestroyTensor(acl_dst));
|
||||
aclrtlaunch_ascendc_dup_by_rows_fp16(
|
||||
rows_num, ctx.stream(), src->data, dst->data,
|
||||
((ggml_tensor*)src->extra)->ne,
|
||||
((ggml_tensor*)src->extra)->nb,
|
||||
((ggml_tensor*)dst->extra)->ne,
|
||||
((ggml_tensor*)dst->extra)->nb);
|
||||
return;
|
||||
}
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
if (dst->type == GGML_TYPE_F32) {
|
||||
if (ggml_are_same_shape(src, dst)) {
|
||||
cann_copy(ctx, acl_src, acl_dst);
|
||||
ACL_CHECK(aclDestroyTensor(acl_src));
|
||||
ACL_CHECK(aclDestroyTensor(acl_dst));
|
||||
return;
|
||||
}
|
||||
if (ggml_is_contiguous(dst)) {
|
||||
const size_t src_type_size = ggml_type_size(src->type);
|
||||
if (src->nb[0] == src_type_size) {
|
||||
// src is contiguous in the first dimension, copy row by row
|
||||
int64_t rows_num = ggml_nrows(src);
|
||||
aclrtlaunch_ascendc_dup_by_rows_fp16_to_fp32(
|
||||
rows_num, ctx.stream(), src->data, dst->data,
|
||||
((ggml_tensor*)src->extra)->ne,
|
||||
((ggml_tensor*)src->extra)->nb,
|
||||
((ggml_tensor*)dst->extra)->ne,
|
||||
((ggml_tensor*)dst->extra)->nb);
|
||||
return;
|
||||
}
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
// TODO
|
||||
GGML_ABORT("fatal error");
|
||||
} else if (src->type == GGML_TYPE_F32) {
|
||||
// TODO: if (src0->type == dst->type && ne00 == ne0 && nb00 == type_size
|
||||
// && nb0 == type_size)
|
||||
if (dst->type == GGML_TYPE_Q8_0) {
|
||||
aclrtlaunch_ascendc_quantize_f32_q8_0(
|
||||
24, ctx.stream(), src->data, dst->data,
|
||||
((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb,
|
||||
((ggml_tensor*)dst->extra)->ne);
|
||||
return;
|
||||
}
|
||||
if (dst->type == GGML_TYPE_Q4_0) {
|
||||
aclrtlaunch_ascendc_quantize_f32_to_q4_0(
|
||||
24, ctx.stream(), src->data, dst->data,
|
||||
((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb,
|
||||
((ggml_tensor*)dst->extra)->ne);
|
||||
return;
|
||||
}
|
||||
if (dst->type == GGML_TYPE_F32) {
|
||||
if (ggml_are_same_shape(src, dst)) {
|
||||
cann_copy(ctx, acl_src, acl_dst);
|
||||
ACL_CHECK(aclDestroyTensor(acl_src));
|
||||
ACL_CHECK(aclDestroyTensor(acl_dst));
|
||||
return;
|
||||
}
|
||||
if (ggml_is_contiguous(dst)) {
|
||||
const size_t src_type_size = ggml_type_size(src->type);
|
||||
if (src->nb[0] == src_type_size) {
|
||||
// src is contiguous in the first dimension, copy row by row
|
||||
int64_t rows_num = ggml_nrows(src);
|
||||
aclrtlaunch_ascendc_dup_by_rows_fp32(
|
||||
rows_num, ctx.stream(), src->data, dst->data,
|
||||
((ggml_tensor*)src->extra)->ne,
|
||||
((ggml_tensor*)src->extra)->nb,
|
||||
((ggml_tensor*)dst->extra)->ne,
|
||||
((ggml_tensor*)dst->extra)->nb);
|
||||
return;
|
||||
}
|
||||
GGML_ABORT("fatal error");
|
||||
} else {
|
||||
// TODO: dst not contiguous
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
}
|
||||
if (dst->type == GGML_TYPE_F16) {
|
||||
if (ggml_are_same_shape(src, dst)) {
|
||||
cann_copy(ctx, acl_src, acl_dst);
|
||||
ACL_CHECK(aclDestroyTensor(acl_src));
|
||||
ACL_CHECK(aclDestroyTensor(acl_dst));
|
||||
return;
|
||||
}
|
||||
if (ggml_is_contiguous(dst)) {
|
||||
const size_t src_type_size = ggml_type_size(src->type);
|
||||
if (src->nb[0] == src_type_size) {
|
||||
// src is contiguous in the first dimension, copy row by row
|
||||
int64_t rows_num = ggml_nrows(src);
|
||||
aclrtlaunch_ascendc_dup_by_rows_fp32_to_fp16(
|
||||
rows_num, ctx.stream(), src->data, dst->data,
|
||||
((ggml_tensor*)src->extra)->ne,
|
||||
((ggml_tensor*)src->extra)->nb,
|
||||
((ggml_tensor*)dst->extra)->ne,
|
||||
((ggml_tensor*)dst->extra)->nb);
|
||||
return;
|
||||
}
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
}
|
||||
// TODO
|
||||
GGML_ABORT("fatal error");
|
||||
} else {
|
||||
if (ggml_are_same_shape(src, dst)) {
|
||||
cann_copy(ctx, acl_src, acl_dst);
|
||||
ACL_CHECK(aclDestroyTensor(acl_src));
|
||||
ACL_CHECK(aclDestroyTensor(acl_dst));
|
||||
return;
|
||||
}
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
@@ -1089,6 +1158,8 @@ void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
float eps;
|
||||
memcpy(&eps, dst->op_params, sizeof(float));
|
||||
|
||||
GGML_ASSERT(eps > 0.0f);
|
||||
|
||||
uint64_t workspaceSize = 0;
|
||||
aclOpExecutor* executor;
|
||||
void* workspaceAddr = nullptr;
|
||||
@@ -2307,168 +2378,85 @@ void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
ACL_CHECK(aclDestroyTensor(tmp_mask_tensor));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Performs embedding operation on a 4D tensor using the CANN backend.
|
||||
*
|
||||
* This function extracts slices from the source tensor (`src_buffer`),
|
||||
* index tensor (`index`), and destination tensor (`dst`), and performs an
|
||||
* embedding operation on them. The embedding operation is applied by iterating
|
||||
* over the last two dimensions of the source tensor, creating the necessary
|
||||
* tensors for the source, index, and output, and executing the embedding operation.
|
||||
*
|
||||
* @param ctx The context for CANN backend operations.
|
||||
* @param src_buffer The source buffer holding the data for the source tensor.
|
||||
* @param src_ne The dimensions of the source tensor.
|
||||
* @param src_nb The strides (byte offsets) of the source tensor.
|
||||
* @param index The index tensor used in the embedding operation.
|
||||
* @param dst The destination tensor where the result will be stored.
|
||||
*/
|
||||
static void aclnn_embedding_4d(ggml_backend_cann_context& ctx, void* src_buffer,
|
||||
int64_t* src_ne, size_t* src_nb, ggml_tensor* index,
|
||||
ggml_tensor* dst) {
|
||||
for (int64_t i = 0; i < src_ne[3]; i++) {
|
||||
for (int64_t j = 0; j < src_ne[2]; j++) {
|
||||
// src
|
||||
int64_t acl_src_ne[2] = {src_ne[0], src_ne[1]};
|
||||
size_t acl_src_nb[2] = {src_nb[0], src_nb[1]};
|
||||
aclTensor* acl_src_tensor = ggml_cann_create_tensor(
|
||||
(char*)src_buffer + i * src_nb[3] + j * src_nb[2],
|
||||
ggml_cann_type_mapping(dst->type), ggml_element_size(dst),
|
||||
acl_src_ne, acl_src_nb, 2);
|
||||
|
||||
// index
|
||||
int64_t acl_index_ne[1] = {index->ne[0]};
|
||||
size_t acl_index_nb[1] = {index->nb[0]};
|
||||
aclTensor* acl_index = ggml_cann_create_tensor(
|
||||
(char*)index->data + i * index->nb[2] + j * index->nb[1],
|
||||
ggml_cann_type_mapping(index->type), ggml_element_size(index),
|
||||
acl_index_ne, acl_index_nb, 1);
|
||||
|
||||
// out
|
||||
int64_t acl_out_ne[2] = {dst->ne[0], dst->ne[1]};
|
||||
size_t acl_out_nb[2] = {dst->nb[0], dst->nb[1]};
|
||||
aclTensor* acl_out = ggml_cann_create_tensor(
|
||||
(char*)dst->data + i * dst->nb[3] + j * dst->nb[2],
|
||||
ggml_cann_type_mapping(dst->type), ggml_element_size(dst),
|
||||
acl_out_ne, acl_out_nb, 2);
|
||||
|
||||
uint64_t workspaceSize = 0;
|
||||
aclOpExecutor* executor;
|
||||
void* workspaceAddr = nullptr;
|
||||
|
||||
ACL_CHECK(aclnnEmbeddingGetWorkspaceSize(
|
||||
acl_src_tensor, acl_index, acl_out, &workspaceSize, &executor));
|
||||
|
||||
if (workspaceSize > 0) {
|
||||
ggml_cann_pool_alloc workspace_allocator(ctx.pool(),
|
||||
workspaceSize);
|
||||
workspaceAddr = workspace_allocator.get();
|
||||
}
|
||||
|
||||
ACL_CHECK(aclnnEmbedding(workspaceAddr, workspaceSize, executor,
|
||||
ctx.stream()));
|
||||
|
||||
ACL_CHECK(aclDestroyTensor(acl_src_tensor));
|
||||
ACL_CHECK(aclDestroyTensor(acl_index));
|
||||
ACL_CHECK(aclDestroyTensor(acl_out));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
ggml_tensor* src0 = dst->src[0]; // src
|
||||
ggml_tensor* src1 = dst->src[1]; // index
|
||||
ggml_tensor* src0 = dst->src[0];
|
||||
ggml_tensor* src1 = dst->src[1];
|
||||
|
||||
ggml_cann_pool_alloc src0_extra_allocator(ctx.pool(), sizeof(ggml_tensor));
|
||||
ggml_cann_pool_alloc src1_extra_allocator(ctx.pool(), sizeof(ggml_tensor));
|
||||
ggml_cann_pool_alloc dst_extra_allocator(ctx.pool(), sizeof(ggml_tensor));
|
||||
src0->extra = src0_extra_allocator.get();
|
||||
src1->extra = src1_extra_allocator.get();
|
||||
dst->extra = dst_extra_allocator.get();
|
||||
ACL_CHECK(aclrtMemcpyAsync(src0->extra, sizeof(ggml_tensor), src0,
|
||||
sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE,
|
||||
ctx.stream()));
|
||||
ACL_CHECK(aclrtMemcpyAsync(src1->extra, sizeof(ggml_tensor), src1,
|
||||
sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE,
|
||||
ctx.stream()));
|
||||
ACL_CHECK(aclrtMemcpyAsync(dst->extra, sizeof(ggml_tensor), dst,
|
||||
sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE,
|
||||
ctx.stream()));
|
||||
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F32: {
|
||||
aclnn_embedding_4d(ctx, src0->data, src0->ne, src0->nb, src1,
|
||||
dst);
|
||||
#ifdef ASCEND_310P
|
||||
// Special operation for get_row_f32 kernel of 310P: clear the
|
||||
// content of dest data buffer when row is not aligned to 32 bytes
|
||||
if ((src0->ne[0] % 8) != 0) {
|
||||
size_t dst_len = src1->ne[0] * src1->ne[1] * src1->ne[2] *
|
||||
src0->ne[0] * ggml_type_size(GGML_TYPE_F32);
|
||||
ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len));
|
||||
}
|
||||
#endif
|
||||
aclrtlaunch_ascendc_get_row_f32(
|
||||
24, ctx.stream(), src0->data, src1->data, dst->data,
|
||||
((ggml_tensor*)src0->extra)->ne,
|
||||
((ggml_tensor*)src0->extra)->nb,
|
||||
((ggml_tensor*)src1->extra)->ne,
|
||||
((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne,
|
||||
((ggml_tensor*)dst->extra)->nb);
|
||||
break;
|
||||
}
|
||||
case GGML_TYPE_F16: {
|
||||
aclTensor* acl_src0 = ggml_cann_create_tensor(src0);
|
||||
ggml_cann_pool_alloc src_buffer_allocator(
|
||||
ctx.pool(), ggml_nelements(src0) * sizeof(float_t));
|
||||
void* src_trans_buffer = src_buffer_allocator.get();
|
||||
size_t src_trans_nb[GGML_MAX_DIMS];
|
||||
src_trans_nb[0] = sizeof(float_t);
|
||||
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||
src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
|
||||
#ifdef ASCEND_310P
|
||||
// Special operation for get_row_f16 kernel of 310P: clear the
|
||||
// content of dest data buffer when row is not aligned to 32 bytes
|
||||
if ((src0->ne[0] % 16) != 0) {
|
||||
size_t dst_len =
|
||||
src1->ne[0] * src1->ne[1] * src1->ne[2] * src0->ne[0] *
|
||||
ggml_type_size(
|
||||
GGML_TYPE_F32); // out is also f32, even if the input is f16
|
||||
ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len));
|
||||
}
|
||||
aclTensor* src_trans_tensor = ggml_cann_create_tensor(
|
||||
src_trans_buffer, ACL_FLOAT, ggml_type_size(dst->type),
|
||||
src0->ne, src_trans_nb, GGML_MAX_DIMS);
|
||||
aclnn_cast(ctx, acl_src0, src_trans_tensor, dst);
|
||||
aclnn_embedding_4d(ctx, src_trans_buffer, src0->ne,
|
||||
src_trans_nb, src1, dst);
|
||||
ACL_CHECK(aclDestroyTensor(acl_src0));
|
||||
ACL_CHECK(aclDestroyTensor(src_trans_tensor));
|
||||
#endif
|
||||
aclrtlaunch_ascendc_get_row_f16(
|
||||
24, ctx.stream(), src0->data, src1->data, dst->data,
|
||||
((ggml_tensor*)src0->extra)->ne,
|
||||
((ggml_tensor*)src0->extra)->nb,
|
||||
((ggml_tensor*)src1->extra)->ne,
|
||||
((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne,
|
||||
((ggml_tensor*)dst->extra)->nb);
|
||||
break;
|
||||
}
|
||||
case GGML_TYPE_Q8_0: {
|
||||
// add 1 dim for bcast mul.
|
||||
size_t weight_nb[GGML_MAX_DIMS + 1], scale_nb[GGML_MAX_DIMS + 1],
|
||||
dequant_nb[GGML_MAX_DIMS + 1];
|
||||
int64_t weight_ne[GGML_MAX_DIMS + 1], scale_ne[GGML_MAX_DIMS + 1],
|
||||
*dequant_ne;
|
||||
int64_t scale_offset = 0;
|
||||
|
||||
// [3,4,5,64] -> [3,4,5,2,32]
|
||||
weight_ne[0] = QK8_0;
|
||||
weight_ne[1] = src0->ne[0] / QK8_0;
|
||||
weight_nb[0] = sizeof(int8_t);
|
||||
weight_nb[1] = weight_nb[0] * weight_ne[0];
|
||||
for (int i = 2; i < GGML_MAX_DIMS + 1; i++) {
|
||||
weight_ne[i] = src0->ne[i - 1];
|
||||
weight_nb[i] = weight_nb[i - 1] * weight_ne[i - 1];
|
||||
}
|
||||
|
||||
// [3,4,5,64] -> [3,4,5,2,1]
|
||||
scale_ne[0] = 1;
|
||||
scale_ne[1] = src0->ne[0] / QK8_0;
|
||||
scale_nb[0] = sizeof(uint16_t);
|
||||
scale_nb[1] = scale_nb[0] * scale_ne[0];
|
||||
for (int i = 2; i < GGML_MAX_DIMS + 1; i++) {
|
||||
scale_ne[i] = src0->ne[i - 1];
|
||||
scale_nb[i] = scale_nb[i - 1] * scale_ne[i - 1];
|
||||
}
|
||||
|
||||
// [3,4,5,64] -> [3,4,5,2,32]
|
||||
dequant_ne = weight_ne;
|
||||
dequant_nb[0] = sizeof(float_t);
|
||||
for (int i = 1; i < GGML_MAX_DIMS + 1; i++) {
|
||||
dequant_nb[i] = dequant_nb[i - 1] * dequant_ne[i - 1];
|
||||
}
|
||||
|
||||
scale_offset = ggml_nelements(src0) * sizeof(int8_t);
|
||||
ggml_cann_pool_alloc dequant_buffer_allocator(
|
||||
ctx.pool(), ggml_nelements(src0) * sizeof(float_t));
|
||||
|
||||
aclTensor* acl_weight_tensor = ggml_cann_create_tensor(
|
||||
src0->data, ACL_INT8, sizeof(int8_t), weight_ne, weight_nb,
|
||||
GGML_MAX_DIMS + 1);
|
||||
aclTensor* acl_scale_tensor = ggml_cann_create_tensor(
|
||||
src0->data, ACL_FLOAT16, sizeof(float16_t), scale_ne, scale_nb,
|
||||
GGML_MAX_DIMS + 1, ACL_FORMAT_ND, scale_offset);
|
||||
aclTensor* dequant_tensor = ggml_cann_create_tensor(
|
||||
dequant_buffer_allocator.get(), ACL_FLOAT, sizeof(float_t),
|
||||
dequant_ne, dequant_nb, GGML_MAX_DIMS + 1);
|
||||
|
||||
aclnn_mul(ctx, acl_weight_tensor, acl_scale_tensor, dequant_tensor);
|
||||
dequant_nb[0] = sizeof(float_t);
|
||||
dequant_ne = src0->ne;
|
||||
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||
dequant_nb[i] = dequant_nb[i - 1] * src0->ne[i - 1];
|
||||
}
|
||||
|
||||
aclnn_embedding_4d(ctx, dequant_buffer_allocator.get(),
|
||||
dequant_ne, dequant_nb, src1, dst);
|
||||
|
||||
ACL_CHECK(aclDestroyTensor(dequant_tensor));
|
||||
case GGML_TYPE_Q4_0:
|
||||
aclrtlaunch_ascendc_get_row_q4_0(
|
||||
24, ctx.stream(), src0->data, src1->data, dst->data,
|
||||
((ggml_tensor*)src0->extra)->ne,
|
||||
((ggml_tensor*)src1->extra)->ne,
|
||||
((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne,
|
||||
((ggml_tensor*)dst->extra)->nb);
|
||||
break;
|
||||
case GGML_TYPE_Q8_0:
|
||||
aclrtlaunch_ascendc_get_row_q8_0(
|
||||
24, ctx.stream(), src0->data, src1->data, dst->data,
|
||||
((ggml_tensor*)src0->extra)->ne,
|
||||
((ggml_tensor*)src1->extra)->ne,
|
||||
((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne,
|
||||
((ggml_tensor*)dst->extra)->nb);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
GGML_ABORT("Unsupported tensor type for GGML_OP_GET_ROWS");
|
||||
GGML_ABORT("fatal error");
|
||||
break;
|
||||
}
|
||||
}
|
||||
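To make the [3,4,5,64] -> [3,4,5,2,32] / [3,4,5,2,1] reshaping in the Q8_0 branch above easier to follow, here is a standalone sketch (not backend code) that reproduces the weight and scale view shapes and byte strides for src0->ne = {64, 5, 4, 3}; recall that ggml stores ne[0] innermost.

#include <cstdint>
#include <cstdio>

int main() {
    const int     QK8_0    = 32;               // Q8_0 block size
    const int     MAX_DIMS = 4;                // stand-in for GGML_MAX_DIMS
    const int64_t ne[4]    = { 64, 5, 4, 3 };  // example src0 shape, ne[0] innermost

    int64_t weight_ne[5], scale_ne[5];
    size_t  weight_nb[5], scale_nb[5];

    // one extra leading dim: QK8_0 int8 weights and one fp16 scale per block
    weight_ne[0] = QK8_0;         weight_nb[0] = sizeof(int8_t);
    weight_ne[1] = ne[0] / QK8_0; weight_nb[1] = weight_nb[0] * weight_ne[0];
    scale_ne[0]  = 1;             scale_nb[0]  = sizeof(uint16_t); // fp16 scale
    scale_ne[1]  = ne[0] / QK8_0; scale_nb[1]  = scale_nb[0] * scale_ne[0];

    for (int i = 2; i < MAX_DIMS + 1; i++) {
        weight_ne[i] = ne[i - 1]; weight_nb[i] = weight_nb[i - 1] * weight_ne[i - 1];
        scale_ne[i]  = ne[i - 1]; scale_nb[i]  = scale_nb[i - 1] * scale_ne[i - 1];
    }

    for (int i = 0; i < MAX_DIMS + 1; i++) {
        std::printf("dim %d: weight ne=%lld nb=%zu | scale ne=%lld nb=%zu\n",
                    i, (long long) weight_ne[i], weight_nb[i],
                    (long long) scale_ne[i], scale_nb[i]);
    }
    return 0;
}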
@@ -2809,8 +2797,8 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
|
||||
|
||||
ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize(
|
||||
acl_input_tensor, acl_weight_tensor, acl_scale_tensor, nullptr,
|
||||
nullptr, nullptr, nullptr, antiquantGroupSize,
|
||||
acl_output_tensor, &workspaceSize, &executor));
|
||||
nullptr, nullptr, nullptr, antiquantGroupSize, acl_output_tensor,
|
||||
&workspaceSize, &executor));
|
||||
if (workspaceAddr == nullptr) {
|
||||
workspaceAddr = workspace_allocator.alloc(workspaceSize);
|
||||
}
|
||||
@@ -3149,7 +3137,7 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
// TODO: use ascendc
|
||||
// Only tested with the LLAMA model.
|
||||
ggml_tensor* src0 = dst->src[0]; // input
|
||||
// ggml_tensor* src2 = dst->src[2]; // freq_factors, not used now.
|
||||
ggml_tensor* src2 = dst->src[2]; // freq_factors
|
||||
|
||||
// param
|
||||
float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
|
||||
@@ -3441,46 +3429,3 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
ACL_CHECK(aclDestroyTensor(acl_sin_reshape_tensor));
|
||||
ACL_CHECK(aclDestroyTensor(acl_dst));
|
||||
}
|
||||
|
||||
|
||||
void ggml_cann_argmax(ggml_backend_cann_context& ctx, ggml_tensor* dst){
|
||||
ggml_tensor * src0 = dst->src[0];
|
||||
|
||||
aclTensor* acl_src = ggml_cann_create_tensor(src0);
|
||||
aclTensor* acl_dst = ggml_cann_create_tensor(dst, dst->ne, dst->nb, 3);
|
||||
|
||||
uint64_t workspaceSize = 0;
|
||||
aclOpExecutor* executor;
|
||||
void* workspaceAddr = nullptr;
|
||||
|
||||
ACL_CHECK(aclnnArgMaxGetWorkspaceSize(acl_src, 3, false, acl_dst,
|
||||
&workspaceSize, &executor));
|
||||
if (workspaceSize > 0) {
|
||||
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
|
||||
workspaceAddr = workspace_allocator.get();
|
||||
}
|
||||
ACL_CHECK(aclnnArgMax(workspaceAddr, workspaceSize, executor, ctx.stream()));
|
||||
|
||||
ACL_CHECK(aclDestroyTensor(acl_src));
|
||||
ACL_CHECK(aclDestroyTensor(acl_dst));
|
||||
}
|
||||
|
||||
void ggml_cann_cos(ggml_backend_cann_context& ctx, ggml_tensor* dst){
|
||||
ggml_tensor * src0 = dst->src[0];
|
||||
|
||||
aclTensor* acl_src = ggml_cann_create_tensor(src0);
|
||||
aclTensor* acl_dst = ggml_cann_create_tensor(dst);
|
||||
aclnn_cos(ctx, acl_src, acl_dst);
|
||||
ACL_CHECK(aclDestroyTensor(acl_src));
|
||||
ACL_CHECK(aclDestroyTensor(acl_dst));
|
||||
}
|
||||
|
||||
void ggml_cann_sin(ggml_backend_cann_context& ctx, ggml_tensor* dst){
|
||||
ggml_tensor * src0 = dst->src[0];
|
||||
|
||||
aclTensor* acl_src = ggml_cann_create_tensor(src0);
|
||||
aclTensor* acl_dst = ggml_cann_create_tensor(dst);
|
||||
aclnn_sin(ctx, acl_src, acl_dst);
|
||||
ACL_CHECK(aclDestroyTensor(acl_src));
|
||||
ACL_CHECK(aclDestroyTensor(acl_dst));
|
||||
}
|
||||
|
||||
@@ -484,47 +484,6 @@ void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst);
|
||||
*/
|
||||
void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst);
|
||||
|
||||
/**
|
||||
* @brief Computes the index of the maximum value along the specified dimension
|
||||
* of a ggml tensor using the CANN backend.
|
||||
*
|
||||
* @details This function performs an argmax operation on the input tensor.
|
||||
* It finds the index of the maximum value along the specified axis
|
||||
* and stores these indices in the destination tensor `dst`. The
|
||||
* operation is executed using the CANN backend for optimized performance.
|
||||
*
|
||||
* @param ctx The CANN context used for operations.
|
||||
* @param dst The destination tensor where the indices of the maximum values will be stored.
|
||||
* dst->op is `GGML_OP_ARGMAX`.
|
||||
*/
|
||||
void ggml_cann_argmax(ggml_backend_cann_context& ctx, ggml_tensor* dst);
|
||||
|
||||
/**
|
||||
* @brief Computes the cosine of each element in a ggml tensor using the CANN backend.
|
||||
*
|
||||
* @details This function applies the cosine function element-wise to the input tensor.
|
||||
* The computed cosine values are stored in the destination tensor `dst`.
|
||||
* The operation is optimized using the CANN backend for improved performance.
|
||||
*
|
||||
* @param ctx The CANN context used for operations.
|
||||
* @param dst The destination tensor where the cosine values will be stored.
|
||||
* dst->op is `GGML_OP_COS`.
|
||||
*/
|
||||
void ggml_cann_cos(ggml_backend_cann_context& ctx, ggml_tensor* dst);
|
||||
|
||||
/**
|
||||
* @brief Computes the sine of each element in a ggml tensor using the CANN backend.
|
||||
*
|
||||
* @details This function applies the sine function element-wise to the input tensor.
|
||||
* The computed sine values are stored in the destination tensor `dst`.
|
||||
* The operation is optimized using the CANN backend for improved performance.
|
||||
*
|
||||
* @param ctx The CANN context used for operations.
|
||||
* @param dst The destination tensor where the sine values will be stored.
|
||||
* dst->op is `GGML_OP_SIN`.
|
||||
*/
|
||||
void ggml_cann_sin(ggml_backend_cann_context& ctx, ggml_tensor* dst);
|
||||
|
||||
template <aclnnStatus getWorkspaceSize(const aclTensor*, const aclTensor*,
|
||||
aclTensor*, uint64_t*, aclOpExecutor**),
|
||||
aclnnStatus execute(void*, uint64_t, aclOpExecutor*, aclrtStream)>
|
||||
@@ -576,6 +535,9 @@ template <aclnnStatus getWorkspaceSize(const aclTensor*, aclTensor*, uint64_t*,
|
||||
void ggml_cann_activation(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
ggml_tensor* src = dst->src[0];
|
||||
|
||||
GGML_ASSERT(src->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||
|
||||
aclTensor* acl_src = ggml_cann_create_tensor(src);
|
||||
aclTensor* acl_dst = ggml_cann_create_tensor(dst);
|
||||
|
||||
@@ -604,6 +566,9 @@ template <aclnnStatus getWorkspaceSize(const aclTensor*, const aclTensor*,
|
||||
void ggml_cann_activation(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
ggml_tensor* src = dst->src[0];
|
||||
|
||||
GGML_ASSERT(src->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||
|
||||
aclTensor* acl_src = ggml_cann_create_tensor(src);
|
||||
aclTensor* acl_dst = ggml_cann_create_tensor(dst);
|
||||
|
||||
|
||||
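The two hunks above rely on passing the aclnn GetWorkspaceSize/Execute entry points as non-type template parameters, so one wrapper covers every unary (and binary) activation. Below is a self-contained illustration of the same dispatch pattern with stand-in types; nothing in it is CANN API.

#include <cstdint>
#include <cstdio>

using Status = int;

struct Tensor   {};  // stand-in for aclTensor
struct Executor {};  // stand-in for aclOpExecutor

static Status fake_relu_workspace(const Tensor*, Tensor*, uint64_t* ws, Executor**) {
    *ws = 0;  // pretend no scratch memory is needed
    return 0;
}

static Status fake_relu_execute(void*, uint64_t, Executor*, void*) {
    std::puts("execute relu");
    return 0;
}

template <Status getWorkspaceSize(const Tensor*, Tensor*, uint64_t*, Executor**),
          Status execute(void*, uint64_t, Executor*, void*)>
void unary_op(const Tensor* src, Tensor* dst) {
    uint64_t  ws   = 0;
    Executor* exec = nullptr;
    getWorkspaceSize(src, dst, &ws, &exec);  // query scratch size
    execute(nullptr, ws, exec, nullptr);     // launch the kernel
}

int main() {
    Tensor src, dst;
    unary_op<fake_relu_workspace, fake_relu_execute>(&src, &dst);
    return 0;
}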
@@ -1420,15 +1420,6 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
|
||||
case GGML_OP_ARGSORT:
|
||||
ggml_cann_argsort(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_ARGMAX:
|
||||
ggml_cann_argmax(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_COS:
|
||||
ggml_cann_cos(ctx, dst);
|
||||
break;
|
||||
case GGML_OP_SIN:
|
||||
ggml_cann_sin(ctx, dst);
|
||||
break;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
@@ -1467,6 +1458,11 @@ static void ggml_backend_cann_free(ggml_backend_t backend) {
|
||||
ACL_CHECK(aclrtSynchronizeDevice());
|
||||
ACL_CHECK(aclrtResetDevice(cann_ctx->device));
|
||||
|
||||
// finalize when last backend freed.
|
||||
if (cann_ctx->device == ggml_backend_cann_get_device_count() - 1) {
|
||||
ACL_CHECK(aclFinalize());
|
||||
}
|
||||
|
||||
delete cann_ctx;
|
||||
delete backend;
|
||||
}
|
||||
@@ -1692,14 +1688,11 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
||||
}
|
||||
case GGML_OP_MUL_MAT: {
|
||||
switch (op->src[0]->type) {
|
||||
case GGML_TYPE_Q8_0:
|
||||
case GGML_TYPE_F16:
|
||||
case GGML_TYPE_F32:
|
||||
return true;
|
||||
case GGML_TYPE_Q8_0:
|
||||
case GGML_TYPE_Q4_0:
|
||||
// only support contiguous for quantized types.
|
||||
return ggml_is_contiguous(op->src[0]) &&
|
||||
ggml_is_contiguous(op->src[1]);
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
@@ -1711,6 +1704,7 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
||||
switch (op->src[0]->type) {
|
||||
case GGML_TYPE_F32:
|
||||
case GGML_TYPE_F16:
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q8_0:
|
||||
return true;
|
||||
default:
|
||||
@@ -1718,21 +1712,16 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
||||
}
|
||||
} break;
|
||||
case GGML_OP_CPY: {
|
||||
ggml_tensor *src = op->src[0];
|
||||
if ((op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_F16) ||
|
||||
(src->type != GGML_TYPE_F32 &&
|
||||
src->type != GGML_TYPE_F16)) {
|
||||
// only support F32 and F16.
|
||||
return false;
|
||||
switch (op->type) {
|
||||
case GGML_TYPE_F32:
|
||||
case GGML_TYPE_F16:
|
||||
case GGML_TYPE_Q8_0:
|
||||
case GGML_TYPE_Q4_0:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!ggml_are_same_shape(op, src) && !ggml_is_contiguous(op)) {
|
||||
// unsupported: dst is not contiguous.
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
} break;
|
||||
}
|
||||
case GGML_OP_CONT: {
|
||||
// TODO: support GGML_TYPE_BF16
|
||||
switch (op->src[0]->type) {
|
||||
@@ -1745,14 +1734,13 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
||||
}
|
||||
case GGML_OP_ROPE: {
|
||||
// TODO: with ops-test v == 1
|
||||
float ext_factor = 0.0f;
|
||||
memcpy(&ext_factor, (const float *) op->op_params + 7, sizeof(float));
|
||||
float * ext_factor = (float*)((int32_t*)op->op_params + 7);
|
||||
// TODO: n_dims <= ne0
|
||||
if (op->src[0]->ne[0] != op->op_params[1]) {
|
||||
return false;
|
||||
}
|
||||
// TODO: ext_factor != 0
|
||||
if (ext_factor != 0) {
|
||||
if (*ext_factor != 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -1774,19 +1762,9 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
||||
}
|
||||
return true;
|
||||
}
|
||||
case GGML_OP_POOL_2D: {
|
||||
const int32_t * opts = (const int32_t *) op->op_params;
|
||||
const int k0 = opts[1];
|
||||
const int k1 = opts[2];
|
||||
const int p0 = opts[5];
|
||||
const int p1 = opts[6];
|
||||
// value of paddingH should be at most half of kernelH
|
||||
// value of paddingW should be at most half of kernelW
|
||||
return (p0 <= (k0 / 2)) && (p1 <= (k1 / 2));
|
||||
}
|
||||
case GGML_OP_DUP:
|
||||
case GGML_OP_IM2COL:
|
||||
case GGML_OP_CONCAT:
|
||||
case GGML_OP_DUP:
|
||||
case GGML_OP_REPEAT:
|
||||
case GGML_OP_NONE:
|
||||
case GGML_OP_RESHAPE:
|
||||
@@ -1803,6 +1781,7 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
||||
case GGML_OP_CLAMP:
|
||||
case GGML_OP_DIAG_MASK_INF:
|
||||
case GGML_OP_SOFT_MAX:
|
||||
case GGML_OP_POOL_2D:
|
||||
case GGML_OP_SUM_ROWS:
|
||||
case GGML_OP_ARGSORT:
|
||||
case GGML_OP_ACC:
|
||||
@@ -1811,9 +1790,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
||||
case GGML_OP_ARANGE:
|
||||
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||
case GGML_OP_LEAKY_RELU:
|
||||
case GGML_OP_ARGMAX:
|
||||
case GGML_OP_COS:
|
||||
case GGML_OP_SIN:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
|
||||
30
ggml/src/ggml-cann/kernels/CMakeLists.txt
Normal file
@@ -0,0 +1,30 @@
|
||||
file(GLOB SRC_FILES
|
||||
get_row_f32.cpp
|
||||
get_row_f16.cpp
|
||||
get_row_q4_0.cpp
|
||||
get_row_q8_0.cpp
|
||||
quantize_f32_q8_0.cpp
|
||||
quantize_f16_q8_0.cpp
|
||||
quantize_float_to_q4_0.cpp
|
||||
dup.cpp
|
||||
)
|
||||
|
||||
set(ASCEND_CANN_PACKAGE_PATH ${CANN_INSTALL_DIR})
|
||||
set(RUN_MODE "npu" CACHE STRING "run mode: npu/sim")
|
||||
|
||||
if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
|
||||
set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
|
||||
elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake)
|
||||
set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake)
|
||||
else()
|
||||
message(FATAL_ERROR "ascendc_kernel_cmake does not exist, please check whether the compiler package is installed.")
|
||||
endif()
|
||||
include(${ASCENDC_CMAKE_DIR}/ascendc.cmake)
|
||||
|
||||
ascendc_library(ascendc_kernels STATIC
|
||||
${SRC_FILES}
|
||||
)
|
||||
|
||||
message(STATUS "CANN: compile ascend kernels witch SOC_TYPE:${SOC_TYPE}, SOC_VERSION:${SOC_VERSION}, compile macro:-D${SOC_TYPE_COMPILE_OPTION}.")
|
||||
ascendc_compile_definitions(ascendc_kernels PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}")
|
||||
# ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP)
|
||||
19
ggml/src/ggml-cann/kernels/ascendc_kernels.h
Normal file
@@ -0,0 +1,19 @@
|
||||
#ifndef ASCENDC_KERNELS_H
|
||||
#define ASCENDC_KERNELS_H
|
||||
|
||||
#include "aclrtlaunch_ascendc_get_row_f32.h"
|
||||
#include "aclrtlaunch_ascendc_get_row_f16.h"
|
||||
#include "aclrtlaunch_ascendc_get_row_q8_0.h"
|
||||
#include "aclrtlaunch_ascendc_get_row_q4_0.h"
|
||||
|
||||
#include "aclrtlaunch_ascendc_quantize_f32_q8_0.h"
|
||||
#include "aclrtlaunch_ascendc_quantize_f16_q8_0.h"
|
||||
#include "aclrtlaunch_ascendc_quantize_f16_to_q4_0.h"
|
||||
#include "aclrtlaunch_ascendc_quantize_f32_to_q4_0.h"
|
||||
|
||||
#include "aclrtlaunch_ascendc_dup_by_rows_fp16.h"
|
||||
#include "aclrtlaunch_ascendc_dup_by_rows_fp32.h"
|
||||
#include "aclrtlaunch_ascendc_dup_by_rows_fp32_to_fp16.h"
|
||||
#include "aclrtlaunch_ascendc_dup_by_rows_fp16_to_fp32.h"
|
||||
|
||||
#endif // ASCENDC_KERNELS_H
|
||||
234
ggml/src/ggml-cann/kernels/dup.cpp
Normal file
@@ -0,0 +1,234 @@
|
||||
#include "kernel_operator.h"
|
||||
|
||||
using namespace AscendC;
|
||||
|
||||
#define BUFFER_NUM 2
|
||||
const int64_t SUPPORTED_MAX_DIM = 65535; // currently the max block dim supported by the dup kernel is 65535
|
||||
|
||||
template <typename SRC_T, typename DST_T>
|
||||
class DupByRows {
|
||||
public:
|
||||
__aicore__ inline DupByRows() {}
|
||||
__aicore__ inline void init(GM_ADDR src, GM_ADDR dst, int64_t *input_ne_ub,
|
||||
size_t *input_nb_ub) {
|
||||
/* Dup by rows when src is contiguous in the first dimension and dst is
|
||||
contiguous; each kernel processes one row.
|
||||
*/
|
||||
|
||||
// Input has four dims.
|
||||
int64_t op_block_num = GetBlockNum();
|
||||
int64_t op_block_idx = GetBlockIdx();
|
||||
|
||||
// param
|
||||
num_rows = input_ne_ub[1] * input_ne_ub[2] * input_ne_ub[3];
|
||||
num_elem = input_ne_ub[0];
|
||||
|
||||
// index for (ne[1], ne[2], ne[3]): (idx_ne1, idx_ne2, idx_ne3)
|
||||
idx_ne3 = op_block_idx / (input_ne_ub[1] * input_ne_ub[2]);
|
||||
idx_ne2 = (op_block_idx - idx_ne3 * (input_ne_ub[1] * input_ne_ub[2]))
|
||||
/ (input_ne_ub[1]);
|
||||
idx_ne1 = op_block_idx - idx_ne3 * (input_ne_ub[1] * input_ne_ub[2])
|
||||
- idx_ne2 * input_ne_ub[1];
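// Worked example of the index decomposition above (added for clarity):
// for input_ne = {n, 4, 3, 2} and op_block_idx = 17:
//   idx_ne3 = 17 / (4*3) = 1, remainder 5
//   idx_ne2 = 5 / 4      = 1
//   idx_ne1 = 5 - 1*4    = 1
// i.e. block 17 copies the row at logical position (ne1 = 1, ne2 = 1, ne3 = 1).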
|
||||
|
||||
// src may not be contiguous in dims [1,2,3], so the stride is derived from ne & nb
|
||||
src_stride = input_nb_ub[3] * idx_ne3 + input_nb_ub[2] * idx_ne2
|
||||
+ input_nb_ub[1] * idx_ne1;
|
||||
|
||||
// dst is contiguous
|
||||
dst_stride = op_block_idx * (input_ne_ub[0] * sizeof(DST_T));
|
||||
|
||||
src_gm.SetGlobalBuffer(reinterpret_cast<__gm__ SRC_T *>(src +
|
||||
src_stride));
|
||||
dst_gm.SetGlobalBuffer(reinterpret_cast<__gm__ DST_T *>(dst +
|
||||
dst_stride));
|
||||
|
||||
pipe.InitBuffer(src_queue, BUFFER_NUM, (sizeof(SRC_T) * num_elem +
|
||||
32 - 1) / 32 * 32);
|
||||
pipe.InitBuffer(dst_queue, BUFFER_NUM, (sizeof(DST_T) * num_elem +
|
||||
32 - 1) / 32 * 32);
|
||||
}
|
||||
|
||||
__aicore__ inline void copy_in() {
|
||||
LocalTensor<SRC_T> src_local = src_queue.AllocTensor<SRC_T>();
|
||||
const size_t elem_per_block = 32 / sizeof(SRC_T);
|
||||
size_t tail = num_elem % elem_per_block;
|
||||
size_t cpy_elements_len = tail > 0 ? num_elem + 1 : num_elem;
|
||||
DataCopy(src_local, src_gm, cpy_elements_len);
|
||||
src_queue.EnQue(src_local);
|
||||
}
|
||||
|
||||
__aicore__ inline void copy_out() {
|
||||
LocalTensor<DST_T> dst_local = dst_queue.DeQue<DST_T>();
|
||||
#ifdef ASCEND_310P
|
||||
const size_t elem_per_block = 32 / sizeof(DST_T);
|
||||
size_t tail = num_elem % elem_per_block;
|
||||
size_t len = num_elem & ~(elem_per_block - 1);
|
||||
if (len > 0) {
|
||||
DataCopy(dst_gm, dst_local, len);
|
||||
}
|
||||
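// Tail handling (note added for clarity): the last partial block is padded with
// zeros up to a full 32-byte block and written with atomic add, so bytes past
// the row end are only incremented by zero; this assumes the destination was
// zero-initialised beforehand (see the 310P memset on the host side).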
if(tail != 0) {
|
||||
for (size_t i = tail; i < elem_per_block; i++) {
|
||||
dst_local[len + i].SetValue(0, 0);
|
||||
}
|
||||
SetAtomicAdd<float>();
|
||||
DataCopy(dst_gm[len], dst_local[len], elem_per_block);
|
||||
SetAtomicNone();
|
||||
}
|
||||
#else
|
||||
DataCopyExtParams dataCopyParams;
|
||||
dataCopyParams.blockCount = 1;
|
||||
dataCopyParams.blockLen = num_elem * sizeof(DST_T);
|
||||
DataCopyPad(dst_gm, dst_local, dataCopyParams);
|
||||
#endif
|
||||
dst_queue.FreeTensor(dst_local);
|
||||
}
|
||||
|
||||
__aicore__ inline void dup() {
|
||||
// main process: copy one row of data from src to dst.
|
||||
copy_in();
|
||||
|
||||
LocalTensor<SRC_T> src_local = src_queue.DeQue<SRC_T>();
|
||||
LocalTensor<DST_T> dst_local = dst_queue.AllocTensor<DST_T>();
|
||||
|
||||
int32_t BLOCK_NUM = 32 / sizeof(DST_T);
|
||||
DataCopy(dst_local, src_local, (num_elem + BLOCK_NUM - 1)
|
||||
/ BLOCK_NUM * BLOCK_NUM);
|
||||
dst_queue.EnQue<DST_T>(dst_local);
|
||||
|
||||
src_queue.FreeTensor(src_local);
|
||||
copy_out();
|
||||
}
|
||||
|
||||
__aicore__ inline void dup_with_cast() {
|
||||
// main process: copy one row of data from src to dst,
|
||||
// cast dtype from src to dst.
|
||||
copy_in();
|
||||
|
||||
LocalTensor<SRC_T> src_local = src_queue.DeQue<SRC_T>();
|
||||
LocalTensor<DST_T> dst_local = dst_queue.AllocTensor<DST_T>();
|
||||
|
||||
Cast(dst_local, src_local, RoundMode::CAST_NONE, num_elem);
|
||||
dst_queue.EnQue<DST_T>(dst_local);
|
||||
|
||||
src_queue.FreeTensor(src_local);
|
||||
copy_out();
|
||||
}
|
||||
|
||||
private:
|
||||
|
||||
TPipe pipe;
|
||||
GlobalTensor<SRC_T> src_gm;
|
||||
GlobalTensor<DST_T> dst_gm;
|
||||
|
||||
int64_t num_rows;
|
||||
int64_t num_elem;
|
||||
int64_t idx_ne3;
|
||||
int64_t idx_ne2;
|
||||
int64_t idx_ne1;
|
||||
int64_t src_stride;
|
||||
int64_t dst_stride;
|
||||
|
||||
TQue<QuePosition::VECIN, BUFFER_NUM> src_queue;
|
||||
TQue<QuePosition::VECOUT, BUFFER_NUM> dst_queue;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
|
||||
auto gm_ptr = (__gm__ uint8_t *)gm;
|
||||
auto ub_ptr = (uint8_t *)(ub);
|
||||
for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
|
||||
*ub_ptr = *gm_ptr;
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp16(
|
||||
GM_ADDR src_gm,
|
||||
GM_ADDR dst_gm,
|
||||
GM_ADDR input_ne_gm,
|
||||
GM_ADDR input_nb_gm,
|
||||
GM_ADDR output_ne_gm,
|
||||
GM_ADDR output_nb_gm) {
|
||||
|
||||
int64_t input_ne_ub[4];
|
||||
size_t input_nb_ub[4];
|
||||
int64_t output_ne_ub[4];
|
||||
size_t output_nb_ub[4];
|
||||
|
||||
copy_to_ub(input_ne_gm, input_ne_ub, 32);
|
||||
copy_to_ub(input_nb_gm, input_nb_ub, 32);
|
||||
copy_to_ub(output_ne_gm, output_ne_ub, 32);
|
||||
copy_to_ub(output_nb_gm, output_nb_ub, 32);
|
||||
|
||||
DupByRows<half, half> op;
|
||||
op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub);
|
||||
op.dup();
|
||||
}
|
||||
|
||||
extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp32(
|
||||
GM_ADDR src_gm,
|
||||
GM_ADDR dst_gm,
|
||||
GM_ADDR input_ne_gm,
|
||||
GM_ADDR input_nb_gm,
|
||||
GM_ADDR output_ne_gm,
|
||||
GM_ADDR output_nb_gm) {
|
||||
int64_t input_ne_ub[4];
|
||||
size_t input_nb_ub[4];
|
||||
int64_t output_ne_ub[4];
|
||||
size_t output_nb_ub[4];
|
||||
|
||||
copy_to_ub(input_ne_gm, input_ne_ub, 32);
|
||||
copy_to_ub(input_nb_gm, input_nb_ub, 32);
|
||||
copy_to_ub(output_ne_gm, output_ne_ub, 32);
|
||||
copy_to_ub(output_nb_gm, output_nb_ub, 32);
|
||||
|
||||
DupByRows<float, float> op;
|
||||
op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub);
|
||||
op.dup();
|
||||
}
|
||||
|
||||
extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp32_to_fp16(
|
||||
GM_ADDR src_gm,
|
||||
GM_ADDR dst_gm,
|
||||
GM_ADDR input_ne_gm,
|
||||
GM_ADDR input_nb_gm,
|
||||
GM_ADDR output_ne_gm,
|
||||
GM_ADDR output_nb_gm) {
|
||||
|
||||
int64_t input_ne_ub[4];
|
||||
size_t input_nb_ub[4];
|
||||
int64_t output_ne_ub[4];
|
||||
size_t output_nb_ub[4];
|
||||
|
||||
copy_to_ub(input_ne_gm, input_ne_ub, 32);
|
||||
copy_to_ub(input_nb_gm, input_nb_ub, 32);
|
||||
copy_to_ub(output_ne_gm, output_ne_ub, 32);
|
||||
copy_to_ub(output_nb_gm, output_nb_ub, 32);
|
||||
|
||||
DupByRows<float, half> op;
|
||||
op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub);
|
||||
op.dup_with_cast();
|
||||
}
|
||||
|
||||
extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp16_to_fp32(
|
||||
GM_ADDR src_gm,
|
||||
GM_ADDR dst_gm,
|
||||
GM_ADDR input_ne_gm,
|
||||
GM_ADDR input_nb_gm,
|
||||
GM_ADDR output_ne_gm,
|
||||
GM_ADDR output_nb_gm) {
|
||||
|
||||
// copy params from gm to ub.
|
||||
int64_t input_ne_ub[4];
|
||||
size_t input_nb_ub[4];
|
||||
int64_t output_ne_ub[4];
|
||||
size_t output_nb_ub[4];
|
||||
|
||||
copy_to_ub(input_ne_gm, input_ne_ub, 32);
|
||||
copy_to_ub(input_nb_gm, input_nb_ub, 32);
|
||||
copy_to_ub(output_ne_gm, output_ne_ub, 32);
|
||||
copy_to_ub(output_nb_gm, output_nb_ub, 32);
|
||||
|
||||
DupByRows<half, float> op;
|
||||
op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub);
|
||||
op.dup_with_cast();
|
||||
}
|
||||
197
ggml/src/ggml-cann/kernels/get_row_f16.cpp
Normal file
@@ -0,0 +1,197 @@
|
||||
#include "kernel_operator.h"
|
||||
|
||||
// TODO: optimize by using a template to avoid duplicated code.
|
||||
using namespace AscendC;
|
||||
|
||||
#define BUFFER_NUM 2
|
||||
|
||||
class GET_ROW_F16 {
|
||||
public:
|
||||
__aicore__ inline GET_ROW_F16() {}
|
||||
__aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output,
|
||||
int64_t *input_ne_ub, size_t *input_nb_ub,
|
||||
int64_t *indices_ne_ub, size_t *indices_nb_ub,
|
||||
int64_t *output_ne_ub, size_t *output_nb_ub) {
|
||||
// TODO: use a template for F16/F32.
|
||||
int64_t op_block_num = GetBlockNum();
|
||||
op_block_idx = GetBlockIdx();
|
||||
|
||||
for (int i = 0; i < 4; i++) {
|
||||
input_ne[i] = input_ne_ub[i];
|
||||
input_stride[i] = input_nb_ub[i] / input_nb_ub[0];
|
||||
|
||||
indices_ne[i] = indices_ne_ub[i];
|
||||
indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0];
|
||||
|
||||
output_ne[i] = output_ne_ub[i];
|
||||
output_stride[i] = output_nb_ub[i] / output_nb_ub[0];
|
||||
}
|
||||
|
||||
// Indices has two dims. n_elements = total number of rows to gather.
|
||||
// dr = number of rows this block should process.
|
||||
uint64_t n_elements =
|
||||
indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3];
|
||||
dr = n_elements / op_block_num;
|
||||
|
||||
uint64_t tails = n_elements % op_block_num;
|
||||
if (op_block_idx < tails) {
|
||||
dr += 1;
|
||||
ir = dr * op_block_idx;
|
||||
} else {
|
||||
ir = dr * op_block_idx + tails;
|
||||
}
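// Worked example of the row partitioning above (added for clarity):
// n_elements = 10 rows over op_block_num = 4 blocks gives dr = 2, tails = 2;
// blocks 0 and 1 take 3 rows each (ir = 0, 3), blocks 2 and 3 take 2 rows
// each (ir = 6, 8), covering all 10 rows exactly once.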
|
||||
|
||||
input_gm.SetGlobalBuffer((__gm__ half *)input);
|
||||
indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices);
|
||||
output_gm.SetGlobalBuffer((__gm__ float *)output);
|
||||
|
||||
uint64_t input_local_buffer_size = ((input_ne[0] * sizeof(half) + 31)
|
||||
& ~31);
|
||||
uint64_t output_local_buffer_size = ((input_ne[0] * sizeof(float) + 31)
|
||||
& ~31);
|
||||
|
||||
local_buffer_elems = input_local_buffer_size / sizeof(half);
|
||||
|
||||
// TODO: handle rows that are too long to fit in the UB.
|
||||
// All buffer sizes are rounded up to 32 bytes; this is fine because the data is 32-byte aligned.
|
||||
pipe.InitBuffer(input_queue, BUFFER_NUM, input_local_buffer_size);
|
||||
pipe.InitBuffer(output_queue, BUFFER_NUM, output_local_buffer_size);
|
||||
}
|
||||
|
||||
__aicore__ inline void copy_in(uint32_t offset, size_t len) {
|
||||
size_t origin_len = len;
|
||||
LocalTensor<half> input_local = input_queue.AllocTensor<half>();
|
||||
const size_t elem_per_block = 32 / sizeof(half);
|
||||
size_t tail = len % elem_per_block;
|
||||
len = len & ~(elem_per_block - 1);
|
||||
if(tail != 0) {
|
||||
len += elem_per_block;
|
||||
}
|
||||
DataCopy(input_local, input_gm[offset], len);
|
||||
input_queue.EnQue(input_local);
|
||||
}
|
||||
|
||||
__aicore__ inline void copy_out(uint32_t offset, size_t len) {
|
||||
LocalTensor<float> output_local = output_queue.DeQue<float>();
|
||||
const size_t elem_per_block = 32 / sizeof(float);
|
||||
size_t tail = len % elem_per_block;
|
||||
len = len & ~(elem_per_block - 1);
|
||||
if (len > 0) {
|
||||
DataCopy(output_gm[offset], output_local, len);
|
||||
}
|
||||
|
||||
if(tail != 0) {
|
||||
#ifdef ASCEND_310P
|
||||
for (size_t i = tail; i < elem_per_block; i++) {
|
||||
output_local[len + i].SetValue(0, 0);
|
||||
}
|
||||
SetAtomicAdd<float>();
|
||||
DataCopy(output_gm[offset + len], output_local[len], elem_per_block);
|
||||
SetAtomicNone();
|
||||
#else
|
||||
DataCopyExtParams dataCopyParams;
|
||||
dataCopyParams.blockCount = 1;
|
||||
dataCopyParams.blockLen = tail * sizeof(float);
|
||||
DataCopyPad(output_gm[offset + len], output_local[len],
|
||||
dataCopyParams);
|
||||
#endif
|
||||
}
|
||||
output_queue.FreeTensor(output_local);
|
||||
}
|
||||
|
||||
__aicore__ inline void calculate_row(int64_t idx) {
|
||||
const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]);
|
||||
const int64_t indices_ne1_idx =
|
||||
(idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) /
|
||||
indices_ne[0];
|
||||
const int64_t indices_ne0_idx =
|
||||
(idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] -
|
||||
indices_ne1_idx * indices_ne[0]);
|
||||
|
||||
const int64_t indices_offset = indices_ne0_idx * indices_stride[0] +
|
||||
indices_ne1_idx * indices_stride[1] +
|
||||
indices_ne2_idx * indices_stride[2];
|
||||
const int32_t selected_row_idx = indices_gm.GetValue(indices_offset);
|
||||
|
||||
const int64_t input_offset = selected_row_idx * input_stride[1] +
|
||||
indices_ne1_idx * input_stride[2] +
|
||||
indices_ne2_idx * input_stride[3];
|
||||
|
||||
const int64_t output_offset = indices_ne0_idx * output_stride[1] +
|
||||
indices_ne1_idx * output_stride[2] +
|
||||
indices_ne2_idx * output_stride[3];
|
||||
|
||||
copy_in(input_offset, input_ne[0]);
|
||||
LocalTensor<half> input_local = input_queue.DeQue<half>();
|
||||
LocalTensor<float> output_local = output_queue.AllocTensor<float>();
|
||||
|
||||
Cast(output_local, input_local, RoundMode::CAST_NONE,
|
||||
local_buffer_elems);
|
||||
output_queue.EnQue(output_local);
|
||||
copy_out(output_offset, input_ne[0]);
|
||||
|
||||
input_queue.FreeTensor(input_local);
|
||||
}
|
||||
|
||||
__aicore__ inline void calculate() {
|
||||
for (int64_t i = ir; i < ir + dr; i++) {
|
||||
calculate_row(i);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
int64_t input_ne[4];
|
||||
size_t input_stride[4];
|
||||
|
||||
int64_t indices_ne[4];
|
||||
size_t indices_stride[4];
|
||||
|
||||
int64_t output_ne[4];
|
||||
size_t output_stride[4];
|
||||
|
||||
size_t local_buffer_elems;
|
||||
|
||||
int64_t ir;
|
||||
int64_t dr;
|
||||
|
||||
TPipe pipe;
|
||||
GlobalTensor<half> input_gm;
|
||||
GlobalTensor<int32_t> indices_gm;
|
||||
GlobalTensor<float> output_gm;
|
||||
TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
|
||||
TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
|
||||
int64_t op_block_idx;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
|
||||
auto gm_ptr = (__gm__ uint8_t *)gm;
|
||||
auto ub_ptr = (uint8_t *)(ub);
|
||||
for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
|
||||
*ub_ptr = *gm_ptr;
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" __global__ __aicore__ void ascendc_get_row_f16(
|
||||
GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
|
||||
GM_ADDR input_ne_gm, GM_ADDR input_nb_gm, GM_ADDR indices_ne_gm,
|
||||
GM_ADDR indices_nb_gm, GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
|
||||
int64_t input_ne_ub[4];
|
||||
size_t input_nb_ub[4];
|
||||
int64_t indices_ne_ub[4];
|
||||
size_t indices_nb_ub[4];
|
||||
int64_t output_ne_ub[4];
|
||||
size_t output_nb_ub[4];
|
||||
|
||||
copy_to_ub(input_ne_gm, input_ne_ub, 32);
|
||||
copy_to_ub(input_nb_gm, input_nb_ub, 32);
|
||||
copy_to_ub(indices_ne_gm, indices_ne_ub, 32);
|
||||
copy_to_ub(indices_nb_gm, indices_nb_ub, 32);
|
||||
copy_to_ub(output_ne_gm, output_ne_ub, 32);
|
||||
copy_to_ub(output_nb_gm, output_nb_ub, 32);
|
||||
|
||||
GET_ROW_F16 op;
|
||||
op.init(input_gm, indices_gm, output_gm, input_ne_ub, input_nb_ub,
|
||||
indices_ne_ub, indices_nb_ub, output_ne_ub, output_nb_ub);
|
||||
op.calculate();
|
||||
}
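
Reviewer note: every kernel in this patch splits its rows across AI cores with the same dr/ir scheme; a minimal host-side C++ sketch of that partitioning (names are illustrative, not taken from the diff) is:

#include <cstdint>
#include <utility>

// Returns {ir, dr}: the first row and the number of rows assigned to block
// `block_idx` when `n_rows` rows are split across `block_num` blocks.
// Blocks with index < (n_rows % block_num) take one extra row, mirroring the
// `tails` handling in the kernels above.
static std::pair<int64_t, int64_t> split_rows(int64_t n_rows, int64_t block_num, int64_t block_idx) {
    int64_t dr    = n_rows / block_num;
    int64_t tails = n_rows % block_num;
    int64_t ir;
    if (block_idx < tails) {
        dr += 1;
        ir  = dr * block_idx;
    } else {
        ir  = dr * block_idx + tails;
    }
    return {ir, dr};
}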
|
||||
ggml/src/ggml-cann/kernels/get_row_f32.cpp (new file, 190 lines)
@@ -0,0 +1,190 @@
|
||||
#include "kernel_operator.h"
|
||||
|
||||
// TODO: optimize; use a template to avoid duplicating this code.
|
||||
using namespace AscendC;
|
||||
|
||||
#define BUFFER_NUM 2
|
||||
|
||||
class GET_ROW_F32 {
|
||||
public:
|
||||
__aicore__ inline GET_ROW_F32() {}
|
||||
__aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output,
|
||||
int64_t *input_ne_ub, size_t *input_nb_ub,
|
||||
int64_t *indices_ne_ub, size_t *indices_nb_ub,
|
||||
int64_t *output_ne_ub, size_t *output_nb_ub) {
|
||||
int64_t op_block_num = GetBlockNum();
|
||||
op_block_idx = GetBlockIdx();
|
||||
|
||||
for (int i = 0; i < 4; i++) {
|
||||
input_ne[i] = input_ne_ub[i];
|
||||
input_stride[i] = input_nb_ub[i] / input_nb_ub[0];
|
||||
|
||||
indices_ne[i] = indices_ne_ub[i];
|
||||
indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0];
|
||||
|
||||
output_ne[i] = output_ne_ub[i];
|
||||
output_stride[i] = output_nb_ub[i] / output_nb_ub[0];
|
||||
}
|
||||
|
||||
// The indices tensor has two meaningful dims; n_elements is the total number of rows to gather.
// dr is the number of rows this core should process.
|
||||
uint64_t n_elements =
|
||||
indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3];
|
||||
dr = n_elements / op_block_num;
|
||||
|
||||
uint64_t tails = n_elements % op_block_num;
|
||||
if (op_block_idx < tails) {
|
||||
dr += 1;
|
||||
ir = dr * op_block_idx;
|
||||
} else {
|
||||
ir = dr * op_block_idx + tails;
|
||||
}
|
||||
|
||||
input_gm.SetGlobalBuffer((__gm__ float *)input);
|
||||
indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices);
|
||||
output_gm.SetGlobalBuffer((__gm__ float *)output);
|
||||
|
||||
uint64_t local_buffer_size = ((input_ne[0] * sizeof(float) + 31) & ~31);
|
||||
local_buffer_elems = local_buffer_size / sizeof(float);
|
||||
|
||||
// TODO: handle long rows that do not fit in the UB.
// Buffer sizes are rounded up to 32 bytes; this is safe because the data itself is 32-byte aligned.
|
||||
pipe.InitBuffer(input_queue, BUFFER_NUM, local_buffer_size);
|
||||
pipe.InitBuffer(output_queue, BUFFER_NUM, local_buffer_size);
|
||||
}
|
||||
|
||||
__aicore__ inline void copy_in(uint32_t offset, size_t len) {
|
||||
LocalTensor<float> input_local = input_queue.AllocTensor<float>();
|
||||
const size_t elem_per_block = 32 / sizeof(float);
|
||||
size_t tail = len % elem_per_block;
|
||||
len = len & ~(elem_per_block - 1);
|
||||
if(tail != 0) {
|
||||
len += elem_per_block;
|
||||
}
|
||||
DataCopy(input_local, input_gm[offset], len);
|
||||
input_queue.EnQue(input_local);
|
||||
}
|
||||
|
||||
__aicore__ inline void copy_out(uint32_t offset, size_t len) {
|
||||
LocalTensor<float> output_local = output_queue.DeQue<float>();
|
||||
const size_t elem_per_block = 32 / sizeof(float);
|
||||
size_t tail = len % elem_per_block;
|
||||
len = len & ~(elem_per_block - 1);
|
||||
if (len > 0) {
|
||||
DataCopy(output_gm[offset], output_local, len);
|
||||
}
|
||||
|
||||
if(tail != 0) {
|
||||
#ifdef ASCEND_310P
|
||||
for (size_t i = tail; i < elem_per_block; i++) {
|
||||
output_local[len + i].SetValue(0, 0);
|
||||
}
|
||||
SetAtomicAdd<float>();
|
||||
DataCopy(output_gm[offset + len], output_local[len], elem_per_block);
|
||||
SetAtomicNone();
|
||||
#else
|
||||
DataCopyExtParams dataCopyParams;
|
||||
dataCopyParams.blockCount = 1;
|
||||
dataCopyParams.blockLen = tail * sizeof(float);
|
||||
DataCopyPad(output_gm[offset + len], output_local[len],
|
||||
dataCopyParams);
|
||||
#endif
|
||||
}
|
||||
output_queue.FreeTensor(output_local);
|
||||
}
|
||||
|
||||
__aicore__ inline void calculate_row(int64_t idx) {
|
||||
const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]);
|
||||
const int64_t indices_ne1_idx =
|
||||
(idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) /
|
||||
indices_ne[0];
|
||||
const int64_t indices_ne0_idx =
|
||||
(idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] -
|
||||
indices_ne1_idx * indices_ne[0]);
|
||||
|
||||
const int64_t indices_offset = indices_ne0_idx * indices_stride[0] +
|
||||
indices_ne1_idx * indices_stride[1] +
|
||||
indices_ne2_idx * indices_stride[2];
|
||||
const int32_t selected_row_idx = indices_gm.GetValue(indices_offset);
|
||||
|
||||
const int64_t input_offset = selected_row_idx * input_stride[1] +
|
||||
indices_ne1_idx * input_stride[2] +
|
||||
indices_ne2_idx * input_stride[3];
|
||||
|
||||
const int64_t output_offset = indices_ne0_idx * output_stride[1] +
|
||||
indices_ne1_idx * output_stride[2] +
|
||||
indices_ne2_idx * output_stride[3];
|
||||
|
||||
copy_in(input_offset, input_ne[0]);
|
||||
LocalTensor<float> input_local = input_queue.DeQue<float>();
|
||||
LocalTensor<float> output_local = output_queue.AllocTensor<float>();
|
||||
|
||||
DataCopy(output_local, input_local, local_buffer_elems);
|
||||
output_queue.EnQue(output_local);
|
||||
copy_out(output_offset, input_ne[0]);
|
||||
|
||||
input_queue.FreeTensor(input_local);
|
||||
}
|
||||
|
||||
__aicore__ inline void calculate() {
|
||||
for (int64_t i = ir; i < ir + dr; i++) {
|
||||
calculate_row(i);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
int64_t input_ne[4];
|
||||
size_t input_stride[4];
|
||||
|
||||
int64_t indices_ne[4];
|
||||
size_t indices_stride[4];
|
||||
|
||||
int64_t output_ne[4];
|
||||
size_t output_stride[4];
|
||||
|
||||
size_t local_buffer_elems;
|
||||
|
||||
int64_t ir;
|
||||
int64_t dr;
|
||||
|
||||
TPipe pipe;
|
||||
GlobalTensor<float> input_gm;
|
||||
GlobalTensor<int32_t> indices_gm;
|
||||
GlobalTensor<float> output_gm;
|
||||
TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
|
||||
TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
|
||||
int64_t op_block_idx;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
|
||||
auto gm_ptr = (__gm__ uint8_t *)gm;
|
||||
auto ub_ptr = (uint8_t *)(ub);
|
||||
for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
|
||||
*ub_ptr = *gm_ptr;
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" __global__ __aicore__ void ascendc_get_row_f32(
|
||||
GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
|
||||
GM_ADDR input_ne_gm, GM_ADDR input_nb_gm, GM_ADDR indices_ne_gm,
|
||||
GM_ADDR indices_nb_gm, GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
|
||||
int64_t input_ne_ub[4];
|
||||
size_t input_nb_ub[4];
|
||||
int64_t indices_ne_ub[4];
|
||||
size_t indices_nb_ub[4];
|
||||
int64_t output_ne_ub[4];
|
||||
size_t output_nb_ub[4];
|
||||
|
||||
copy_to_ub(input_ne_gm, input_ne_ub, 32);
|
||||
copy_to_ub(input_nb_gm, input_nb_ub, 32);
|
||||
copy_to_ub(indices_ne_gm, indices_ne_ub, 32);
|
||||
copy_to_ub(indices_nb_gm, indices_nb_ub, 32);
|
||||
copy_to_ub(output_ne_gm, output_ne_ub, 32);
|
||||
copy_to_ub(output_nb_gm, output_nb_ub, 32);
|
||||
|
||||
GET_ROW_F32 op;
|
||||
op.init(input_gm, indices_gm, output_gm, input_ne_ub, input_nb_ub,
|
||||
indices_ne_ub, indices_nb_ub, output_ne_ub, output_nb_ub);
|
||||
op.calculate();
|
||||
}
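
The flat-index decomposition in calculate_row can be checked against this host-side sketch (it assumes the fourth indices dim is 1, as the kernels do; illustrative only, not code from the diff):

#include <cstdint>

// Decompose a flat row index into (ne0, ne1, ne2) coordinates, matching the
// indices_ne*_idx computation in calculate_row above.
static void unflatten_index(int64_t idx, const int64_t ne[4], int64_t out[3]) {
    out[2] = idx / (ne[0] * ne[1]);                          // indices_ne2_idx
    out[1] = (idx - out[2] * ne[0] * ne[1]) / ne[0];         // indices_ne1_idx
    out[0] = idx - out[2] * ne[0] * ne[1] - out[1] * ne[0];  // indices_ne0_idx
}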
|
||||
ggml/src/ggml-cann/kernels/get_row_q4_0.cpp (new file, 204 lines)
@@ -0,0 +1,204 @@
|
||||
#include "kernel_operator.h"
|
||||
|
||||
// TODO: optimize; use a template to avoid duplicating this code.
|
||||
using namespace AscendC;
|
||||
#ifdef ASCEND_310P // 310P not support 4bit get row
|
||||
extern "C" __global__ __aicore__ void ascendc_get_row_q4_0(
|
||||
GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
|
||||
GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm,
|
||||
GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
|
||||
// Print an error so that the remaining test cases can keep running; any test case that calls this operator will simply fail.
printf("Ascend310P does not support 4-bit get_row.\n");
|
||||
}
|
||||
#else
|
||||
|
||||
#define BUFFER_NUM 2
|
||||
|
||||
#define QK4_0 32
|
||||
|
||||
class GET_ROW_Q4_0 {
|
||||
public:
|
||||
__aicore__ inline GET_ROW_Q4_0() {}
|
||||
__aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output,
|
||||
int64_t *input_ne_ub, int64_t *indices_ne_ub,
|
||||
size_t *indices_nb_ub, int64_t *output_ne_ub,
|
||||
size_t *output_nb_ub) {
|
||||
int64_t op_block_num = GetBlockNum();
|
||||
int64_t op_block_idx = GetBlockIdx();
|
||||
|
||||
for (int i = 0; i < 4; i++) {
|
||||
input_ne[i] = input_ne_ub[i];
|
||||
indices_ne[i] = indices_ne_ub[i];
|
||||
indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0];
|
||||
scale_ne[i] = input_ne_ub[i];
|
||||
output_ne[i] = output_ne_ub[i];
|
||||
output_stride[i] = output_nb_ub[i] / output_nb_ub[0];
|
||||
}
|
||||
|
||||
// one scale for a group.
|
||||
scale_ne[0] /= QK4_0;
|
||||
|
||||
input_stride[0] = 1;
|
||||
scale_stride[0] = 1;
|
||||
output_stride[0] = 1;
|
||||
for (int i = 1; i < 4; i++) {
|
||||
input_stride[i] = input_stride[i - 1] * input_ne[i - 1];
|
||||
scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
|
||||
}
|
||||
|
||||
group_size_in_row = input_ne[0] / QK4_0;
|
||||
int64_t scale_offset = input_ne[0] * input_ne[1] * input_ne[2] *
|
||||
input_ne[3] / 2;
|
||||
|
||||
// The indices tensor has two meaningful dims; n_elements is the total number of rows to gather.
// dr is the number of rows this core should process.
|
||||
uint64_t n_elements =
|
||||
indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3];
|
||||
dr = n_elements / op_block_num;
|
||||
|
||||
uint64_t tails = n_elements % op_block_num;
|
||||
if (op_block_idx < tails) {
|
||||
dr += 1;
|
||||
ir = dr * op_block_idx;
|
||||
} else {
|
||||
ir = dr * op_block_idx + tails;
|
||||
}
|
||||
|
||||
input_gm.SetGlobalBuffer((__gm__ int4b_t *)input);
|
||||
scale_gm.SetGlobalBuffer((__gm__ half *)(input + scale_offset));
|
||||
indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices);
|
||||
output_gm.SetGlobalBuffer((__gm__ float *)output);
|
||||
|
||||
pipe.InitBuffer(input_queue, BUFFER_NUM, QK4_0 * sizeof(int4b_t));
|
||||
pipe.InitBuffer(cast_queue, BUFFER_NUM, QK4_0 * sizeof(half));
|
||||
pipe.InitBuffer(output_queue, BUFFER_NUM, QK4_0 * sizeof(float));
|
||||
}
|
||||
|
||||
__aicore__ inline void copy_in(uint32_t offset) {
|
||||
LocalTensor<int4b_t> input_local = input_queue.AllocTensor<int4b_t>();
|
||||
// 32 * sizeof(int4b_t) = 16 bytes, which is not 32-byte aligned; it is unclear why DataCopy does not report an error here.
|
||||
DataCopy(input_local, input_gm[offset], QK4_0);
|
||||
input_queue.EnQue(input_local);
|
||||
}
|
||||
|
||||
__aicore__ inline void copy_out(uint32_t offset) {
|
||||
LocalTensor<float> output_local = output_queue.DeQue<float>();
|
||||
DataCopy(output_gm[offset], output_local, QK4_0);
|
||||
output_queue.FreeTensor(output_local);
|
||||
}
|
||||
|
||||
__aicore__ inline void calculate_group(int64_t idx, int64_t group) {
|
||||
const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]);
|
||||
const int64_t indices_ne1_idx =
|
||||
(idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) /
|
||||
indices_ne[0];
|
||||
const int64_t indices_ne0_idx =
|
||||
(idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] -
|
||||
indices_ne1_idx * indices_ne[0]);
|
||||
|
||||
const int64_t indices_offset = indices_ne0_idx * indices_stride[0] +
|
||||
indices_ne1_idx * indices_stride[1] +
|
||||
indices_ne2_idx * indices_stride[2];
|
||||
const int32_t selected_row_idx = indices_gm.GetValue(indices_offset);
|
||||
|
||||
const int64_t input_offset = selected_row_idx * input_stride[1] +
|
||||
indices_ne1_idx * input_stride[2] +
|
||||
indices_ne2_idx * input_stride[3] +
|
||||
group * QK4_0;
|
||||
const int64_t scale_offset = selected_row_idx * scale_stride[1] +
|
||||
indices_ne1_idx * scale_stride[2] +
|
||||
indices_ne2_idx * scale_stride[3] + group;
|
||||
const int64_t output_offset = indices_ne0_idx * output_stride[1] +
|
||||
indices_ne1_idx * output_stride[2] +
|
||||
indices_ne2_idx * output_stride[3] +
|
||||
group * QK4_0;
|
||||
|
||||
copy_in(input_offset);
|
||||
LocalTensor<int4b_t> input_local = input_queue.DeQue<int4b_t>();
|
||||
LocalTensor<half> cast_local = cast_queue.AllocTensor<half>();
|
||||
LocalTensor<float> output_local = output_queue.AllocTensor<float>();
|
||||
|
||||
// TODO: cast more data at a time to speed this up.
|
||||
Cast(cast_local, input_local, RoundMode::CAST_NONE, QK4_0);
|
||||
Cast(output_local, cast_local, RoundMode::CAST_NONE, QK4_0);
|
||||
|
||||
// Only the multiply has to be applied per group.
|
||||
half scale = scale_gm.GetValue(scale_offset);
|
||||
|
||||
Muls(output_local, output_local, (float)scale, QK4_0);
|
||||
|
||||
input_queue.FreeTensor(input_local);
|
||||
cast_queue.FreeTensor(cast_local);
|
||||
output_queue.EnQue(output_local);
|
||||
|
||||
copy_out(output_offset);
|
||||
}
|
||||
|
||||
__aicore__ inline void calculate() {
|
||||
for (int64_t i = ir; i < ir + dr; i++) {
|
||||
for (int64_t j = 0; j < group_size_in_row; j++) {
|
||||
calculate_group(i, j);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
int64_t input_ne[4];
|
||||
size_t input_stride[4];
|
||||
|
||||
int64_t scale_ne[4];
|
||||
size_t scale_stride[4];
|
||||
|
||||
int64_t indices_ne[4];
|
||||
size_t indices_stride[4];
|
||||
|
||||
int64_t output_ne[4];
|
||||
size_t output_stride[4];
|
||||
|
||||
int64_t ir;
|
||||
int64_t dr;
|
||||
|
||||
int64_t group_size_in_row;
|
||||
|
||||
TPipe pipe;
|
||||
GlobalTensor<int4b_t> input_gm;
|
||||
GlobalTensor<half> scale_gm;
|
||||
GlobalTensor<int32_t> indices_gm;
|
||||
GlobalTensor<float> output_gm;
|
||||
TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
|
||||
TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
|
||||
TQue<QuePosition::VECIN, BUFFER_NUM> cast_queue;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
|
||||
auto gm_ptr = (__gm__ uint8_t *)gm;
|
||||
auto ub_ptr = (uint8_t *)(ub);
|
||||
for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
|
||||
*ub_ptr = *gm_ptr;
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" __global__ __aicore__ void ascendc_get_row_q4_0(
|
||||
GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
|
||||
GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm,
|
||||
GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
|
||||
int64_t input_ne_ub[4];
|
||||
int64_t indices_ne_ub[4];
|
||||
size_t indices_nb_ub[4];
|
||||
int64_t output_ne_ub[4];
|
||||
size_t output_nb_ub[4];
|
||||
|
||||
copy_to_ub(input_ne_gm, input_ne_ub, 32);
|
||||
copy_to_ub(indices_ne_gm, indices_ne_ub, 32);
|
||||
copy_to_ub(indices_nb_gm, indices_nb_ub, 32);
|
||||
copy_to_ub(output_ne_gm, output_ne_ub, 32);
|
||||
copy_to_ub(output_nb_gm, output_nb_ub, 32);
|
||||
|
||||
GET_ROW_Q4_0 op;
|
||||
op.init(input_gm, indices_gm, output_gm, input_ne_ub, indices_ne_ub,
|
||||
indices_nb_ub, output_ne_ub, output_nb_ub);
|
||||
op.calculate();
|
||||
}
|
||||
|
||||
#endif // #ifdef ASCEND_310P
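
For reference, the q4_0 gather above effectively computes the following per 32-element group. This is a hedged host-side sketch; the nibble packing order and sign convention are assumptions for illustration, while the device code relies on int4b_t casts instead:

#include <cstdint>

// Dequantize one QK4_0 (32-element) group: widen each signed 4-bit quant and
// multiply by the group's scale, mirroring the Cast + Muls sequence above.
static void dequant_group_q4_0(const uint8_t *packed, float scale, float *out) {
    for (int i = 0; i < 32; ++i) {
        const uint8_t byte = packed[i / 2];
        int8_t q = (i % 2 == 0) ? (int8_t)(byte & 0x0F) : (int8_t)(byte >> 4);
        if (q > 7) {
            q -= 16;  // sign-extend the 4-bit value
        }
        out[i] = (float)q * scale;
    }
}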
|
||||
ggml/src/ggml-cann/kernels/get_row_q8_0.cpp (new file, 191 lines)
@@ -0,0 +1,191 @@
|
||||
#include "kernel_operator.h"
|
||||
|
||||
// TODO: optimize; use a template to avoid duplicating this code.
|
||||
using namespace AscendC;
|
||||
|
||||
#define BUFFER_NUM 2
|
||||
|
||||
#define QK8_0 32
|
||||
|
||||
class GET_ROW_Q8_0 {
|
||||
public:
|
||||
__aicore__ inline GET_ROW_Q8_0() {}
|
||||
__aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output,
|
||||
int64_t *input_ne_ub, int64_t *indices_ne_ub,
|
||||
size_t *indices_nb_ub, int64_t *output_ne_ub,
|
||||
size_t *output_nb_ub) {
|
||||
int64_t op_block_num = GetBlockNum();
|
||||
int64_t op_block_idx = GetBlockIdx();
|
||||
|
||||
for (int i = 0; i < 4; i++) {
|
||||
input_ne[i] = input_ne_ub[i];
|
||||
indices_ne[i] = indices_ne_ub[i];
|
||||
indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0];
|
||||
scale_ne[i] = input_ne_ub[i];
|
||||
output_ne[i] = output_ne_ub[i];
|
||||
output_stride[i] = output_nb_ub[i] / output_nb_ub[0];
|
||||
}
|
||||
|
||||
// one scale for a group.
|
||||
scale_ne[0] /= QK8_0;
|
||||
|
||||
input_stride[0] = 1;
|
||||
scale_stride[0] = 1;
|
||||
output_stride[0] = 1;
|
||||
for (int i = 1; i < 4; i++) {
|
||||
input_stride[i] = input_stride[i - 1] * input_ne[i - 1];
|
||||
scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
|
||||
}
|
||||
|
||||
group_size_in_row = input_ne[0] / QK8_0;
|
||||
int64_t scale_offset = input_ne[0] * input_ne[1] * input_ne[2] *
|
||||
input_ne[3] * sizeof(int8_t);
|
||||
|
||||
// The indices tensor has two meaningful dims; n_elements is the total number of rows to gather.
// dr is the number of rows this core should process.
|
||||
uint64_t n_elements =
|
||||
indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3];
|
||||
dr = n_elements / op_block_num;
|
||||
|
||||
uint64_t tails = n_elements % op_block_num;
|
||||
if (op_block_idx < tails) {
|
||||
dr += 1;
|
||||
ir = dr * op_block_idx;
|
||||
} else {
|
||||
ir = dr * op_block_idx + tails;
|
||||
}
|
||||
|
||||
input_gm.SetGlobalBuffer((__gm__ int8_t *)input);
|
||||
scale_gm.SetGlobalBuffer((__gm__ half *)(input + scale_offset));
|
||||
indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices);
|
||||
output_gm.SetGlobalBuffer((__gm__ float *)output);
|
||||
|
||||
pipe.InitBuffer(input_queue, BUFFER_NUM, QK8_0 * sizeof(int8_t));
|
||||
pipe.InitBuffer(cast_queue, BUFFER_NUM, QK8_0 * sizeof(half));
|
||||
pipe.InitBuffer(output_queue, BUFFER_NUM, QK8_0 * sizeof(float));
|
||||
}
|
||||
|
||||
__aicore__ inline void copy_in(uint32_t offset) {
|
||||
LocalTensor<int8_t> input_local = input_queue.AllocTensor<int8_t>();
|
||||
DataCopy(input_local, input_gm[offset], QK8_0);
|
||||
input_queue.EnQue(input_local);
|
||||
}
|
||||
|
||||
__aicore__ inline void copy_out(uint32_t offset) {
|
||||
LocalTensor<float> output_local = output_queue.DeQue<float>();
|
||||
DataCopy(output_gm[offset], output_local, QK8_0);
|
||||
output_queue.FreeTensor(output_local);
|
||||
}
|
||||
|
||||
__aicore__ inline void calculate_group(int64_t idx, int64_t group) {
|
||||
const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]);
|
||||
const int64_t indices_ne1_idx =
|
||||
(idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) /
|
||||
indices_ne[0];
|
||||
const int64_t indices_ne0_idx =
|
||||
(idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] -
|
||||
indices_ne1_idx * indices_ne[0]);
|
||||
|
||||
const int64_t indices_offset = indices_ne0_idx * indices_stride[0] +
|
||||
indices_ne1_idx * indices_stride[1] +
|
||||
indices_ne2_idx * indices_stride[2];
|
||||
const int32_t selected_row_idx = indices_gm.GetValue(indices_offset);
|
||||
|
||||
const int64_t input_offset = selected_row_idx * input_stride[1] +
|
||||
indices_ne1_idx * input_stride[2] +
|
||||
indices_ne2_idx * input_stride[3] +
|
||||
group * QK8_0;
|
||||
const int64_t scale_offset = selected_row_idx * scale_stride[1] +
|
||||
indices_ne1_idx * scale_stride[2] +
|
||||
indices_ne2_idx * scale_stride[3] + group;
|
||||
const int64_t output_offset = indices_ne0_idx * output_stride[1] +
|
||||
indices_ne1_idx * output_stride[2] +
|
||||
indices_ne2_idx * output_stride[3] +
|
||||
group * QK8_0;
|
||||
|
||||
copy_in(input_offset);
|
||||
LocalTensor<int8_t> input_local = input_queue.DeQue<int8_t>();
|
||||
LocalTensor<half> cast_local = cast_queue.AllocTensor<half>();
|
||||
LocalTensor<float> output_local = output_queue.AllocTensor<float>();
|
||||
|
||||
// TODO: cast more data at a time to speed this up.
|
||||
Cast(cast_local, input_local, RoundMode::CAST_NONE, QK8_0);
|
||||
Cast(output_local, cast_local, RoundMode::CAST_NONE, QK8_0);
|
||||
|
||||
// Only the multiply has to be applied per group.
|
||||
half scale = scale_gm.GetValue(scale_offset);
|
||||
Muls(output_local, output_local, (float)scale, QK8_0);
|
||||
|
||||
input_queue.FreeTensor(input_local);
|
||||
cast_queue.FreeTensor(cast_local);
|
||||
output_queue.EnQue(output_local);
|
||||
|
||||
copy_out(output_offset);
|
||||
}
|
||||
|
||||
__aicore__ inline void calculate() {
|
||||
for (int64_t i = ir; i < ir + dr; i++) {
|
||||
for (int64_t j = 0; j < group_size_in_row; j++) {
|
||||
calculate_group(i, j);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
int64_t input_ne[4];
|
||||
size_t input_stride[4];
|
||||
|
||||
int64_t scale_ne[4];
|
||||
size_t scale_stride[4];
|
||||
|
||||
int64_t indices_ne[4];
|
||||
size_t indices_stride[4];
|
||||
|
||||
int64_t output_ne[4];
|
||||
size_t output_stride[4];
|
||||
|
||||
int64_t ir;
|
||||
int64_t dr;
|
||||
|
||||
int64_t group_size_in_row;
|
||||
|
||||
TPipe pipe;
|
||||
GlobalTensor<int8_t> input_gm;
|
||||
GlobalTensor<half> scale_gm;
|
||||
GlobalTensor<int32_t> indices_gm;
|
||||
GlobalTensor<float> output_gm;
|
||||
TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
|
||||
TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
|
||||
TQue<QuePosition::VECIN, BUFFER_NUM> cast_queue;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
|
||||
auto gm_ptr = (__gm__ uint8_t *)gm;
|
||||
auto ub_ptr = (uint8_t *)(ub);
|
||||
for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
|
||||
*ub_ptr = *gm_ptr;
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" __global__ __aicore__ void ascendc_get_row_q8_0(
|
||||
GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
|
||||
GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm,
|
||||
GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
|
||||
int64_t input_ne_ub[4];
|
||||
int64_t indices_ne_ub[4];
|
||||
size_t indices_nb_ub[4];
|
||||
int64_t output_ne_ub[4];
|
||||
size_t output_nb_ub[4];
|
||||
|
||||
copy_to_ub(input_ne_gm, input_ne_ub, 32);
|
||||
copy_to_ub(indices_ne_gm, indices_ne_ub, 32);
|
||||
copy_to_ub(indices_nb_gm, indices_nb_ub, 32);
|
||||
copy_to_ub(output_ne_gm, output_ne_ub, 32);
|
||||
copy_to_ub(output_nb_gm, output_nb_ub, 32);
|
||||
|
||||
GET_ROW_Q8_0 op;
|
||||
op.init(input_gm, indices_gm, output_gm, input_ne_ub, indices_ne_ub,
|
||||
indices_nb_ub, output_ne_ub, output_nb_ub);
|
||||
op.calculate();
|
||||
}
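
The q8_0 gather reduces to the following per-group reference (a host-side sketch for review, not part of the diff; the per-group half scales are read from after the quantized data, see scale_offset in init):

#include <cstdint>

// Dequantize one QK8_0 (32-element) group: widen each signed 8-bit quant and
// multiply by the group's scale, matching the Cast + Muls sequence above.
static void dequant_group_q8_0(const int8_t *qs, float scale, float *out) {
    for (int i = 0; i < 32; ++i) {
        out[i] = (float)qs[i] * scale;
    }
}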
|
||||
ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp (new file, 218 lines)
@@ -0,0 +1,218 @@
|
||||
#include "kernel_operator.h"
|
||||
|
||||
using namespace AscendC;
|
||||
#ifdef ASCEND_310P
|
||||
extern "C" __global__ __aicore__ void ascendc_quantize_f16_q8_0(
|
||||
GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
|
||||
GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
|
||||
// Print an error so that the remaining test cases can keep running; any test case that calls this operator will simply fail.
printf("Ascend310P does not support f16->8bit quantization.\n");
|
||||
}
|
||||
#else
|
||||
|
||||
#define BUFFER_NUM 2
|
||||
#define QK8_0 32
|
||||
|
||||
class QUANTIZE_F16_Q8_0 {
|
||||
public:
|
||||
__aicore__ inline QUANTIZE_F16_Q8_0() {}
|
||||
__aicore__ inline void init(GM_ADDR input, GM_ADDR output,
|
||||
int64_t *input_ne_ub, size_t *input_nb_ub,
|
||||
int64_t *output_ne_ub) {
|
||||
int64_t op_block_num = GetBlockNum();
|
||||
int64_t op_block_idx = GetBlockIdx();
|
||||
|
||||
for (int i = 0; i < 4; i++) {
|
||||
input_ne[i] = input_ne_ub[i];
|
||||
input_stride[i] = input_nb_ub[i] / input_nb_ub[0];
|
||||
|
||||
output_ne[i] = output_ne_ub[i];
|
||||
}
|
||||
|
||||
output_stride[0] = 1;
|
||||
for (int i = 1; i < 4; i++) {
|
||||
output_stride[i] = output_stride[i - 1] * output_ne[i - 1];
|
||||
}
|
||||
|
||||
scale_ne = input_ne;
|
||||
scale_stride[0] = 1;
|
||||
scale_stride[1] = input_ne[0] / QK8_0;
|
||||
for (int i = 2; i < 4; i++) {
|
||||
scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
|
||||
}
|
||||
|
||||
// split input tensor by rows.
|
||||
uint64_t nr = input_ne[1] * input_ne[2] * input_ne[3];
|
||||
dr = nr / op_block_num;
|
||||
|
||||
uint64_t tails = nr % op_block_num;
|
||||
if (op_block_idx < tails) {
|
||||
dr += 1;
|
||||
ir = dr * op_block_idx;
|
||||
} else {
|
||||
ir = dr * op_block_idx + tails;
|
||||
}
|
||||
|
||||
group_size_in_row = scale_stride[1];
|
||||
int64_t output_size = output_ne[0] * output_ne[1] * output_ne[2] *
|
||||
output_ne[3] * sizeof(uint8_t);
|
||||
|
||||
input_gm.SetGlobalBuffer((__gm__ half *)input);
|
||||
output_gm.SetGlobalBuffer((__gm__ int8_t *)output);
|
||||
scale_gm.SetGlobalBuffer((__gm__ half *)(output + output_size + ir *
|
||||
group_size_in_row *
|
||||
sizeof(half)));
|
||||
|
||||
pipe.InitBuffer(input_queue, BUFFER_NUM, QK8_0 * sizeof(half));
|
||||
pipe.InitBuffer(output_queue, BUFFER_NUM, QK8_0 * sizeof(int8_t));
|
||||
pipe.InitBuffer(work_queue, 1, 32);
|
||||
pipe.InitBuffer(max_queue, 1, 32);
|
||||
pipe.InitBuffer(abs_queue, 1, QK8_0 * sizeof(float));
|
||||
pipe.InitBuffer(scale_queue, 1, 32);
|
||||
pipe.InitBuffer(cast_queue, 1, QK8_0 * sizeof(float));
|
||||
}
|
||||
|
||||
__aicore__ inline void copy_in(uint32_t offset) {
|
||||
LocalTensor<half> input_local = input_queue.AllocTensor<half>();
|
||||
DataCopy(input_local, input_gm[offset], QK8_0);
|
||||
input_queue.EnQue(input_local);
|
||||
}
|
||||
|
||||
__aicore__ inline void copy_out(uint32_t offset) {
|
||||
LocalTensor<int8_t> output_local = output_queue.DeQue<int8_t>();
|
||||
DataCopy(output_gm[offset], output_local, QK8_0);
|
||||
output_queue.FreeTensor(output_local);
|
||||
}
|
||||
|
||||
__aicore__ inline half calculate_group(int64_t row, int64_t group) {
|
||||
const int64_t i3 = row / (input_ne[1] * input_ne[2]);
|
||||
const int64_t i2 = (row - i3 * input_ne[1] * input_ne[2]) / input_ne[1];
|
||||
const int64_t i1 =
|
||||
row - i3 * input_ne[1] * input_ne[2] - i2 * input_ne[1];
|
||||
|
||||
const int64_t input_offset = i1 * input_stride[1] +
|
||||
i2 * input_stride[2] +
|
||||
i3 * input_stride[3] + QK8_0 * group;
|
||||
|
||||
const int64_t output_offset = i1 * output_stride[1] +
|
||||
i2 * output_stride[2] +
|
||||
i3 * output_stride[3] + QK8_0 * group;
|
||||
|
||||
copy_in(input_offset);
|
||||
LocalTensor<half> input_local = input_queue.DeQue<half>();
|
||||
LocalTensor<int8_t> output_local = output_queue.AllocTensor<int8_t>();
|
||||
LocalTensor<float> work_local = work_queue.AllocTensor<float>();
|
||||
LocalTensor<float> abs_local = abs_queue.AllocTensor<float>();
|
||||
LocalTensor<float> max_local = max_queue.AllocTensor<float>();
|
||||
LocalTensor<float> cast_local = cast_queue.AllocTensor<float>();
|
||||
|
||||
Cast(cast_local, input_local, RoundMode::CAST_NONE, QK8_0);
|
||||
Abs(abs_local, cast_local, QK8_0);
|
||||
ReduceMax(max_local, abs_local, work_local, QK8_0);
|
||||
|
||||
pipe_barrier(PIPE_ALL);
|
||||
float d = max_local.GetValue(0);
|
||||
d = d / ((1 << 7) - 1);
|
||||
if (d != 0) {
|
||||
Muls(cast_local, cast_local, 1.0f / d, QK8_0);
|
||||
}
|
||||
|
||||
Cast(cast_local, cast_local, RoundMode::CAST_ROUND, QK8_0);
|
||||
Cast(input_local, cast_local, RoundMode::CAST_ROUND, QK8_0);
|
||||
Cast(output_local, input_local, RoundMode::CAST_ROUND, QK8_0);
|
||||
output_queue.EnQue(output_local);
|
||||
copy_out(output_offset);
|
||||
|
||||
input_queue.FreeTensor(input_local);
|
||||
work_queue.FreeTensor(work_local);
|
||||
abs_queue.FreeTensor(abs_local);
|
||||
max_queue.FreeTensor(max_local);
|
||||
cast_queue.FreeTensor(cast_local);
|
||||
return (half)d;
|
||||
}
|
||||
|
||||
__aicore__ inline void calculate() {
|
||||
LocalTensor<half> scale_local = scale_queue.AllocTensor<half>();
|
||||
uint32_t scale_local_offset = 0;
|
||||
uint32_t scale_global_offset = 0;
|
||||
for (int64_t i = ir; i < ir + dr; i++) {
|
||||
for (int64_t j = 0; j < group_size_in_row; j++) {
|
||||
half scale = calculate_group(i, j);
|
||||
scale_local.SetValue(scale_local_offset++, scale);
|
||||
if (scale_local_offset == 16) {
|
||||
scale_local_offset = 0;
|
||||
// TODO: OPTIMIZE ME
|
||||
pipe_barrier(PIPE_ALL);
|
||||
DataCopy(scale_gm[scale_global_offset], scale_local, 16);
|
||||
pipe_barrier(PIPE_ALL);
|
||||
scale_global_offset += 16;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (scale_local_offset != 0) {
|
||||
pipe_barrier(PIPE_ALL);
|
||||
DataCopyExtParams dataCopyParams;
|
||||
dataCopyParams.blockCount = 1;
|
||||
dataCopyParams.blockLen = scale_local_offset * sizeof(half);
|
||||
DataCopyPad(scale_gm[scale_global_offset], scale_local,
|
||||
dataCopyParams);
|
||||
pipe_barrier(PIPE_ALL);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
int64_t input_ne[4];
|
||||
size_t input_stride[4];
|
||||
|
||||
int64_t *scale_ne;
|
||||
size_t scale_stride[4];
|
||||
|
||||
int64_t output_ne[4];
|
||||
size_t output_stride[4];
|
||||
|
||||
int64_t group_size_in_row;
|
||||
|
||||
int64_t ir;
|
||||
int64_t dr;
|
||||
|
||||
TPipe pipe;
|
||||
GlobalTensor<half> input_gm;
|
||||
GlobalTensor<half> scale_gm;
|
||||
GlobalTensor<int8_t> output_gm;
|
||||
TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
|
||||
TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
|
||||
TQue<QuePosition::VECIN, 1> work_queue;
|
||||
TQue<QuePosition::VECOUT, 1> max_queue;
|
||||
TQue<QuePosition::VECIN, 1> abs_queue;
|
||||
TQue<QuePosition::VECOUT, 1> scale_queue;
|
||||
TQue<QuePosition::VECOUT, 1> cast_queue;
|
||||
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
|
||||
auto gm_ptr = (__gm__ uint8_t *)gm;
|
||||
auto ub_ptr = (uint8_t *)(ub);
|
||||
for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
|
||||
*ub_ptr = *gm_ptr;
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" __global__ __aicore__ void ascendc_quantize_f16_q8_0(
|
||||
GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
|
||||
GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
|
||||
int64_t input_ne_ub[4];
|
||||
size_t input_nb_ub[4];
|
||||
int64_t output_ne_ub[4];
|
||||
|
||||
copy_to_ub(input_ne_gm, input_ne_ub, 32);
|
||||
copy_to_ub(input_nb_gm, input_nb_ub, 32);
|
||||
copy_to_ub(output_ne_gm, output_ne_ub, 32);
|
||||
|
||||
QUANTIZE_F16_Q8_0 op;
|
||||
op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
|
||||
op.calculate();
|
||||
}
|
||||
|
||||
#endif // #ifdef ASCEND_310P
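
The per-group math of this quantization kernel corresponds to the following host-side reference (a sketch assuming exact float arithmetic; not code from the diff):

#include <cmath>
#include <cstdint>

// Quantize one QK8_0 (32-element) group: the scale is the group's absolute
// maximum divided by 127, and each value is divided by the scale and rounded,
// matching the Abs/ReduceMax/Muls/Cast sequence above.
static float quant_group_q8_0(const float *x, int8_t *qs) {
    float amax = 0.0f;
    for (int i = 0; i < 32; ++i) {
        amax = std::fmax(amax, std::fabs(x[i]));
    }
    const float d  = amax / 127.0f;                 // ((1 << 7) - 1) in the kernel
    const float id = d != 0.0f ? 1.0f / d : 0.0f;
    for (int i = 0; i < 32; ++i) {
        qs[i] = (int8_t)std::lround(x[i] * id);
    }
    return d;                                       // stored as half in scale_gm
}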
|
||||
ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp (new file, 216 lines)
@@ -0,0 +1,216 @@
|
||||
#include "kernel_operator.h"
|
||||
|
||||
using namespace AscendC;
|
||||
#ifdef ASCEND_310P // 310P not support f32->8bit quantization
|
||||
extern "C" __global__ __aicore__ void ascendc_quantize_f32_q8_0(
|
||||
GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
|
||||
GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
|
||||
// Print an error so that the remaining test cases can keep running; any test case that calls this operator will simply fail.
printf("Ascend310P does not support f32->8bit quantization.\n");
|
||||
}
|
||||
#else
|
||||
|
||||
#define BUFFER_NUM 2
|
||||
#define QK8_0 32
|
||||
|
||||
class QUANTIZE_F32_Q8_0 {
|
||||
public:
|
||||
__aicore__ inline QUANTIZE_F32_Q8_0() {}
|
||||
__aicore__ inline void init(GM_ADDR input, GM_ADDR output,
|
||||
int64_t *input_ne_ub, size_t *input_nb_ub,
|
||||
int64_t *output_ne_ub) {
|
||||
int64_t op_block_num = GetBlockNum();
|
||||
int64_t op_block_idx = GetBlockIdx();
|
||||
|
||||
for (int i = 0; i < 4; i++) {
|
||||
input_ne[i] = input_ne_ub[i];
|
||||
input_stride[i] = input_nb_ub[i] / input_nb_ub[0];
|
||||
|
||||
output_ne[i] = output_ne_ub[i];
|
||||
}
|
||||
|
||||
output_stride[0] = 1;
|
||||
for (int i = 1; i < 4; i++) {
|
||||
output_stride[i] = output_stride[i - 1] * output_ne[i - 1];
|
||||
}
|
||||
|
||||
scale_ne = input_ne;
|
||||
scale_stride[0] = 1;
|
||||
scale_stride[1] = input_ne[0] / QK8_0;
|
||||
for (int i = 2; i < 4; i++) {
|
||||
scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
|
||||
}
|
||||
|
||||
// split input tensor by rows.
|
||||
uint64_t nr = input_ne[1] * input_ne[2] * input_ne[3];
|
||||
dr = nr / op_block_num;
|
||||
|
||||
uint64_t tails = nr % op_block_num;
|
||||
if (op_block_idx < tails) {
|
||||
dr += 1;
|
||||
ir = dr * op_block_idx;
|
||||
} else {
|
||||
ir = dr * op_block_idx + tails;
|
||||
}
|
||||
|
||||
group_size_in_row = scale_stride[1];
|
||||
int64_t output_size = output_ne[0] * output_ne[1] * output_ne[2] *
|
||||
output_ne[3] * sizeof(uint8_t);
|
||||
|
||||
input_gm.SetGlobalBuffer((__gm__ float *)input);
|
||||
output_gm.SetGlobalBuffer((__gm__ int8_t *)output);
|
||||
scale_gm.SetGlobalBuffer((__gm__ half *)(output + output_size +
|
||||
ir * group_size_in_row *
|
||||
sizeof(half)));
|
||||
|
||||
pipe.InitBuffer(input_queue, BUFFER_NUM, QK8_0 * sizeof(float));
|
||||
pipe.InitBuffer(output_queue, BUFFER_NUM, QK8_0 * sizeof(int8_t));
|
||||
pipe.InitBuffer(work_queue, 1, 32);
|
||||
pipe.InitBuffer(max_queue, 1, 32);
|
||||
pipe.InitBuffer(abs_queue, 1, QK8_0 * sizeof(float));
|
||||
pipe.InitBuffer(cast_queue, 1, QK8_0 * sizeof(half));
|
||||
pipe.InitBuffer(scale_queue, 1, 32);
|
||||
}
|
||||
|
||||
__aicore__ inline void copy_in(uint32_t offset) {
|
||||
LocalTensor<float> input_local = input_queue.AllocTensor<float>();
|
||||
DataCopy(input_local, input_gm[offset], QK8_0);
|
||||
input_queue.EnQue(input_local);
|
||||
}
|
||||
|
||||
__aicore__ inline void copy_out(uint32_t offset) {
|
||||
LocalTensor<int8_t> output_local = output_queue.DeQue<int8_t>();
|
||||
DataCopy(output_gm[offset], output_local, QK8_0);
|
||||
output_queue.FreeTensor(output_local);
|
||||
}
|
||||
|
||||
__aicore__ inline half calculate_group(int64_t row, int64_t group) {
|
||||
const int64_t i3 = row / (input_ne[1] * input_ne[2]);
|
||||
const int64_t i2 = (row - i3 * input_ne[1] * input_ne[2]) / input_ne[1];
|
||||
const int64_t i1 =
|
||||
row - i3 * input_ne[1] * input_ne[2] - i2 * input_ne[1];
|
||||
|
||||
const int64_t input_offset = i1 * input_stride[1] +
|
||||
i2 * input_stride[2] +
|
||||
i3 * input_stride[3] + QK8_0 * group;
|
||||
|
||||
const int64_t output_offset = i1 * output_stride[1] +
|
||||
i2 * output_stride[2] +
|
||||
i3 * output_stride[3] + QK8_0 * group;
|
||||
|
||||
copy_in(input_offset);
|
||||
LocalTensor<float> input_local = input_queue.DeQue<float>();
|
||||
LocalTensor<int8_t> output_local = output_queue.AllocTensor<int8_t>();
|
||||
LocalTensor<float> work_local = work_queue.AllocTensor<float>();
|
||||
LocalTensor<float> abs_local = abs_queue.AllocTensor<float>();
|
||||
LocalTensor<float> max_local = max_queue.AllocTensor<float>();
|
||||
LocalTensor<half> cast_local = cast_queue.AllocTensor<half>();
|
||||
|
||||
Abs(abs_local, input_local, QK8_0);
|
||||
ReduceMax(max_local, abs_local, work_local, QK8_0);
|
||||
pipe_barrier(PIPE_ALL);
|
||||
float d = max_local.GetValue(0);
|
||||
d = d / ((1 << 7) - 1);
|
||||
if (d != 0) {
|
||||
Muls(input_local, input_local, 1.0f / d, QK8_0);
|
||||
}
|
||||
|
||||
Cast(input_local, input_local, RoundMode::CAST_ROUND, QK8_0);
|
||||
Cast(cast_local, input_local, RoundMode::CAST_ROUND, QK8_0);
|
||||
Cast(output_local, cast_local, RoundMode::CAST_ROUND, QK8_0);
|
||||
output_queue.EnQue(output_local);
|
||||
copy_out(output_offset);
|
||||
|
||||
input_queue.FreeTensor(input_local);
|
||||
work_queue.FreeTensor(work_local);
|
||||
abs_queue.FreeTensor(abs_local);
|
||||
max_queue.FreeTensor(max_local);
|
||||
cast_queue.FreeTensor(cast_local);
|
||||
|
||||
return (half)d;
|
||||
}
|
||||
|
||||
__aicore__ inline void calculate() {
|
||||
LocalTensor<half> scale_local = scale_queue.AllocTensor<half>();
|
||||
uint32_t scale_local_offset = 0;
|
||||
uint32_t scale_global_offset = 0;
|
||||
for (int64_t i = ir; i < ir + dr; i++) {
|
||||
for (int64_t j = 0; j < group_size_in_row; j++) {
|
||||
half scale = calculate_group(i, j);
|
||||
scale_local.SetValue(scale_local_offset++, scale);
|
||||
if (scale_local_offset == 16) {
|
||||
scale_local_offset = 0;
|
||||
// TODO: OPTIMIZE ME
|
||||
pipe_barrier(PIPE_ALL);
|
||||
DataCopy(scale_gm[scale_global_offset], scale_local, 16);
|
||||
pipe_barrier(PIPE_ALL);
|
||||
scale_global_offset += 16;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (scale_local_offset != 0) {
|
||||
pipe_barrier(PIPE_ALL);
|
||||
DataCopyExtParams dataCopyParams;
|
||||
dataCopyParams.blockCount = 1;
|
||||
dataCopyParams.blockLen = scale_local_offset * sizeof(half);
|
||||
DataCopyPad(scale_gm[scale_global_offset], scale_local,
|
||||
dataCopyParams);
|
||||
pipe_barrier(PIPE_ALL);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
int64_t input_ne[4];
|
||||
size_t input_stride[4];
|
||||
|
||||
int64_t *scale_ne;
|
||||
size_t scale_stride[4];
|
||||
|
||||
int64_t output_ne[4];
|
||||
size_t output_stride[4];
|
||||
|
||||
int64_t group_size_in_row;
|
||||
|
||||
int64_t ir;
|
||||
int64_t dr;
|
||||
|
||||
TPipe pipe;
|
||||
GlobalTensor<float> input_gm;
|
||||
GlobalTensor<half> scale_gm;
|
||||
GlobalTensor<int8_t> output_gm;
|
||||
TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
|
||||
TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
|
||||
TQue<QuePosition::VECIN, 1> work_queue;
|
||||
TQue<QuePosition::VECOUT, 1> max_queue;
|
||||
TQue<QuePosition::VECIN, 1> abs_queue;
|
||||
TQue<QuePosition::VECIN, 1> cast_queue;
|
||||
TQue<QuePosition::VECOUT, 1> scale_queue;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
|
||||
auto gm_ptr = (__gm__ uint8_t *)gm;
|
||||
auto ub_ptr = (uint8_t *)(ub);
|
||||
for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
|
||||
*ub_ptr = *gm_ptr;
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" __global__ __aicore__ void ascendc_quantize_f32_q8_0(
|
||||
GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
|
||||
GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
|
||||
int64_t input_ne_ub[4];
|
||||
size_t input_nb_ub[4];
|
||||
int64_t output_ne_ub[4];
|
||||
|
||||
copy_to_ub(input_ne_gm, input_ne_ub, 32);
|
||||
copy_to_ub(input_nb_gm, input_nb_ub, 32);
|
||||
copy_to_ub(output_ne_gm, output_ne_ub, 32);
|
||||
|
||||
QUANTIZE_F32_Q8_0 op;
|
||||
op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
|
||||
op.calculate();
|
||||
}
|
||||
|
||||
#endif // #ifdef ASCEND_310P
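
Both q8_0 quantize kernels place the per-group half scales directly after the int8 data, in row order. A byte-offset sketch of that layout assumption (names are illustrative, not from the diff):

#include <cstddef>
#include <cstdint>

// Byte offset of the scale for (row, group): all int8 quants come first, then
// one 2-byte half scale per 32-element group. `nrows` is ne1 * ne2 * ne3.
static size_t q8_0_scale_byte_offset(int64_t row, int64_t group,
                                     int64_t ne0, int64_t nrows) {
    const int64_t groups_per_row = ne0 / 32;                 // QK8_0
    const size_t  data_bytes     = (size_t)ne0 * (size_t)nrows;
    return data_bytes + (size_t)(row * groups_per_row + group) * sizeof(uint16_t);
}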
|
||||
ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp (new file, 295 lines)
@@ -0,0 +1,295 @@
|
||||
#include "kernel_operator.h"
|
||||
|
||||
using namespace AscendC;
|
||||
#ifdef ASCEND_310P // 310P not support float->4bit quantization
|
||||
extern "C" __global__ __aicore__ void ascendc_quantize_f32_to_q4_0(
|
||||
GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
|
||||
GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
|
||||
// Print an error so that the remaining test cases can keep running; any test case that calls this operator will simply fail.
printf("Ascend310P does not support f32->4bit quantization.\n");
|
||||
}
|
||||
|
||||
extern "C" __global__ __aicore__ void ascendc_quantize_f16_to_q4_0(
|
||||
GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
|
||||
GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
|
||||
// Print an error so that the remaining test cases can keep running; any test case that calls this operator will simply fail.
printf("Ascend310P does not support f16->4bit quantization.\n");
|
||||
}
|
||||
#else
|
||||
|
||||
#define BUFFER_NUM 2
|
||||
#define Group_Size 32
|
||||
|
||||
template <typename SRC_T>
|
||||
class QUANTIZE_FLOAT_TO_Q4_0 {
|
||||
public:
|
||||
__aicore__ inline QUANTIZE_FLOAT_TO_Q4_0() {}
|
||||
__aicore__ inline void init(GM_ADDR input, GM_ADDR output,
|
||||
int64_t *input_ne_ub, size_t *input_nb_ub,
|
||||
int64_t *output_ne_ub) {
|
||||
// TODO: fix test_case CPY(type_src=f16,type_dst=q4_0,ne=[256,4,4,4],
|
||||
// permute=[0,0,0,0]):
|
||||
// [CPY] NMSE = 0.000008343 > 0.000001000 FAIL
|
||||
int64_t op_block_num = GetBlockNum();
|
||||
int64_t op_block_idx = GetBlockIdx();
|
||||
|
||||
// input stride of data elements
|
||||
for (int i = 0; i < 4; i++) {
|
||||
input_ne[i] = input_ne_ub[i];
|
||||
input_stride[i] = input_nb_ub[i] / input_nb_ub[0];
|
||||
output_ne[i] = output_ne_ub[i];
|
||||
}
|
||||
|
||||
// output stride of data elements
|
||||
output_stride[0] = 1;
|
||||
for (int i = 1; i < 4; i++) {
|
||||
output_stride[i] = output_stride[i - 1] * output_ne[i - 1];
|
||||
}
|
||||
|
||||
// Scales are stored contiguously after the quantized data: [group1_scale, group2_scale, ...]
|
||||
scale_ne = input_ne;
|
||||
scale_stride[0] = 1;
|
||||
scale_stride[1] = input_ne[0] / Group_Size;
|
||||
for (int i = 2; i < 4; i++) {
|
||||
scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
|
||||
}
|
||||
|
||||
// split input tensor by rows.
|
||||
uint64_t nr = input_ne[1] * input_ne[2] * input_ne[3];
|
||||
dr = nr / op_block_num;
|
||||
|
||||
uint64_t tails = nr % op_block_num;
|
||||
if (op_block_idx < tails) {
|
||||
dr += 1;
|
||||
ir = dr * op_block_idx;
|
||||
} else {
|
||||
ir = dr * op_block_idx + tails;
|
||||
}
|
||||
|
||||
group_size_in_row = scale_stride[1];
|
||||
int64_t scale_offset = output_ne[0] * output_ne[1] * output_ne[2] *
|
||||
output_ne[3] * sizeof(uint8_t) / 2;
|
||||
|
||||
input_gm.SetGlobalBuffer((__gm__ SRC_T *)input);
|
||||
output_gm.SetGlobalBuffer((__gm__ int8_t *)output);
|
||||
scale_gm.SetGlobalBuffer((__gm__ half *)(output + scale_offset + ir *
|
||||
group_size_in_row *
|
||||
sizeof(half)));
|
||||
|
||||
pipe.InitBuffer(input_queue, BUFFER_NUM, Group_Size * sizeof(SRC_T));
|
||||
pipe.InitBuffer(output_queue, BUFFER_NUM,
|
||||
Group_Size * sizeof(int8_t) / 2);
|
||||
pipe.InitBuffer(cast_queue, 1, Group_Size * sizeof(float));
|
||||
pipe.InitBuffer(work_queue, 1, Group_Size * sizeof(float));
|
||||
pipe.InitBuffer(max_queue, 1, Group_Size * sizeof(float));
|
||||
pipe.InitBuffer(min_queue, 1, Group_Size * sizeof(float));
|
||||
pipe.InitBuffer(scale_queue, 1, Group_Size / 2 * sizeof(half));
|
||||
pipe.InitBuffer(int8_queue, 1, Group_Size * sizeof(int8_t));
|
||||
pipe.InitBuffer(half_queue, 1, Group_Size * sizeof(half));
|
||||
}
|
||||
|
||||
__aicore__ inline void copy_in(uint32_t offset) {
|
||||
LocalTensor<SRC_T> input_local = input_queue.AllocTensor<SRC_T>();
|
||||
DataCopy(input_local, input_gm[offset], Group_Size);
|
||||
input_queue.EnQue(input_local);
|
||||
}
|
||||
|
||||
__aicore__ inline void copy_out(uint32_t offset) {
|
||||
// Reinterpret Group_Size (32) int4b_t values as Group_Size / 2 int8_t values,
// and use DataCopyPad to avoid the 32-byte alignment requirement.
|
||||
LocalTensor<int4b_t> output_local = output_queue.DeQue<int4b_t>();
|
||||
LocalTensor<int8_t> output_int8_local =
|
||||
output_local.ReinterpretCast<int8_t>();
|
||||
|
||||
DataCopyExtParams dataCopyParams;
|
||||
dataCopyParams.blockCount = 1;
|
||||
dataCopyParams.blockLen = Group_Size / 2 * sizeof(int8_t);
|
||||
DataCopyPad(output_gm[offset], output_int8_local, dataCopyParams);
|
||||
|
||||
output_queue.FreeTensor(output_local);
|
||||
}
|
||||
|
||||
__aicore__ inline void input_to_cast(LocalTensor<float> cast_local,
|
||||
LocalTensor<float> input_local) {
|
||||
DataCopy(cast_local, input_local, Group_Size);
|
||||
}
|
||||
|
||||
__aicore__ inline void input_to_cast(LocalTensor<float> cast_local,
|
||||
LocalTensor<half> input_local) {
|
||||
Cast(cast_local, input_local, RoundMode::CAST_NONE, Group_Size);
|
||||
}
|
||||
|
||||
__aicore__ inline half calculate_group(int64_t row, int64_t group) {
|
||||
const int64_t i3 = row / (input_ne[1] * input_ne[2]);
|
||||
const int64_t i2 = (row - i3 * input_ne[1] * input_ne[2]) / input_ne[1];
|
||||
const int64_t i1 =
|
||||
row - i3 * input_ne[1] * input_ne[2] - i2 * input_ne[1];
|
||||
|
||||
const int64_t input_offset = i1 * input_stride[1] +
|
||||
i2 * input_stride[2] +
|
||||
i3 * input_stride[3] + Group_Size * group;
|
||||
|
||||
// output_offset indexes output_gm, whose element type is int8_t, so it
// must be divided by 2 to address int4b_t elements.
|
||||
const int64_t output_offset = (i1 * output_stride[1] +
|
||||
i2 * output_stride[2] +
|
||||
i3 * output_stride[3] +
|
||||
Group_Size * group) / 2;
|
||||
copy_in(input_offset);
|
||||
|
||||
LocalTensor<SRC_T> input_local = input_queue.DeQue<SRC_T>();
|
||||
LocalTensor<int4b_t> output_local = output_queue.AllocTensor<int4b_t>();
|
||||
LocalTensor<float> cast_local = cast_queue.AllocTensor<float>();
|
||||
LocalTensor<float> work_local = work_queue.AllocTensor<float>();
|
||||
LocalTensor<float> max_local = max_queue.AllocTensor<float>();
|
||||
LocalTensor<float> min_local = min_queue.AllocTensor<float>();
|
||||
LocalTensor<int8_t> int8_local = int8_queue.AllocTensor<int8_t>();
|
||||
LocalTensor<half> half_local = half_queue.AllocTensor<half>();
|
||||
|
||||
input_to_cast(cast_local, input_local);
|
||||
|
||||
ReduceMax(max_local, cast_local, work_local, Group_Size);
|
||||
ReduceMin(min_local, cast_local, work_local, Group_Size);
|
||||
const float max_value = max_local.GetValue(0);
|
||||
const float min_value = min_local.GetValue(0);
|
||||
float d = max_value;
|
||||
if (min_value < 0 && (-1 * min_value) > max_value) {
|
||||
d = min_value;
|
||||
}
|
||||
|
||||
d = d / (-8);
|
||||
if (d != 0) {
|
||||
Muls(cast_local, cast_local, 1.0f / d, Group_Size);
|
||||
}
|
||||
|
||||
// range: [-8,8] -> [0.5,16.5] -> [0,16] -> [0,15] -> [-8,7]
|
||||
float scalar = 8.5f;
|
||||
Adds(cast_local, cast_local, scalar, Group_Size);
|
||||
Cast(cast_local, cast_local, RoundMode::CAST_FLOOR, Group_Size);
|
||||
scalar = 15.0f;
|
||||
Mins(cast_local, cast_local, scalar, Group_Size);
|
||||
scalar = -8.0f;
|
||||
Adds(cast_local, cast_local, scalar, Group_Size);
|
||||
|
||||
// float->half->int4b
|
||||
Cast(half_local, cast_local, RoundMode::CAST_NONE, Group_Size);
|
||||
Cast(output_local, half_local, RoundMode::CAST_NONE, Group_Size);
|
||||
|
||||
output_queue.EnQue(output_local);
|
||||
copy_out(output_offset);
|
||||
|
||||
input_queue.FreeTensor(input_local);
|
||||
work_queue.FreeTensor(work_local);
|
||||
max_queue.FreeTensor(max_local);
|
||||
min_queue.FreeTensor(min_local);
|
||||
int8_queue.FreeTensor(int8_local);
|
||||
half_queue.FreeTensor(half_local);
|
||||
cast_queue.FreeTensor(cast_local);
|
||||
return (half)d;
|
||||
}
|
||||
|
||||
__aicore__ inline void calculate() {
|
||||
LocalTensor<half> scale_local = scale_queue.AllocTensor<half>();
|
||||
uint32_t scale_local_offset = 0;
|
||||
uint32_t scale_global_offset = 0;
|
||||
for (int64_t i = ir; i < ir + dr; i++) {
|
||||
for (int64_t j = 0; j < group_size_in_row; j++) {
|
||||
half scale = calculate_group(i, j);
|
||||
scale_local.SetValue(scale_local_offset++, scale);
|
||||
// Copy Group_Size / 2 scales at a time.
|
||||
if (scale_local_offset == Group_Size / 2) {
|
||||
scale_local_offset = 0;
|
||||
// TODO: OPTIMIZE ME
|
||||
pipe_barrier(PIPE_ALL);
|
||||
DataCopy(scale_gm[scale_global_offset], scale_local,
|
||||
Group_Size / 2);
|
||||
pipe_barrier(PIPE_ALL);
|
||||
scale_global_offset += Group_Size / 2;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (scale_local_offset != 0) {
|
||||
pipe_barrier(PIPE_ALL);
|
||||
DataCopyExtParams dataCopyParams;
|
||||
dataCopyParams.blockCount = 1;
|
||||
dataCopyParams.blockLen = scale_local_offset * sizeof(half);
|
||||
DataCopyPad(scale_gm[scale_global_offset], scale_local,
|
||||
dataCopyParams);
|
||||
pipe_barrier(PIPE_ALL);
|
||||
}
|
||||
scale_queue.FreeTensor(scale_local);
|
||||
}
|
||||
|
||||
private:
|
||||
int64_t input_ne[4];
|
||||
size_t input_stride[4];
|
||||
|
||||
int64_t *scale_ne;
|
||||
size_t scale_stride[4];
|
||||
|
||||
int64_t output_ne[4];
|
||||
size_t output_stride[4];
|
||||
|
||||
int64_t group_size_in_row;
|
||||
|
||||
int64_t ir;
|
||||
int64_t dr;
|
||||
|
||||
TPipe pipe;
|
||||
GlobalTensor<SRC_T> input_gm;
|
||||
GlobalTensor<half> scale_gm;
|
||||
GlobalTensor<int8_t> output_gm;
|
||||
TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
|
||||
TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
|
||||
TQue<QuePosition::VECIN, BUFFER_NUM> work_queue;
|
||||
TQue<QuePosition::VECOUT, BUFFER_NUM> max_queue;
|
||||
TQue<QuePosition::VECOUT, BUFFER_NUM> min_queue;
|
||||
TQue<QuePosition::VECOUT, BUFFER_NUM> scale_queue;
|
||||
TQue<QuePosition::VECOUT, BUFFER_NUM> cast_queue;
|
||||
TQue<QuePosition::VECOUT, BUFFER_NUM> int8_queue;
|
||||
TQue<QuePosition::VECOUT, BUFFER_NUM> half_queue;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
|
||||
auto gm_ptr = (__gm__ uint8_t *)gm;
|
||||
auto ub_ptr = (uint8_t *)(ub);
|
||||
for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) {
|
||||
*ub_ptr = *gm_ptr;
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" __global__ __aicore__ void ascendc_quantize_f16_to_q4_0(
|
||||
GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
|
||||
GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
|
||||
int64_t input_ne_ub[4];
|
||||
size_t input_nb_ub[4];
|
||||
int64_t output_ne_ub[4];
|
||||
|
||||
copy_to_ub(input_ne_gm, input_ne_ub, 32);
|
||||
copy_to_ub(input_nb_gm, input_nb_ub, 32);
|
||||
copy_to_ub(output_ne_gm, output_ne_ub, 32);
|
||||
|
||||
QUANTIZE_FLOAT_TO_Q4_0<half> op;
|
||||
op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
|
||||
op.calculate();
|
||||
}
|
||||
|
||||
extern "C" __global__ __aicore__ void ascendc_quantize_f32_to_q4_0(
|
||||
GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
|
||||
GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
|
||||
int64_t input_ne_ub[4];
|
||||
size_t input_nb_ub[4];
|
||||
int64_t output_ne_ub[4];
|
||||
|
||||
copy_to_ub(input_ne_gm, input_ne_ub, 32);
|
||||
copy_to_ub(input_nb_gm, input_nb_ub, 32);
|
||||
copy_to_ub(output_ne_gm, output_ne_ub, 32);
|
||||
|
||||
QUANTIZE_FLOAT_TO_Q4_0<float> op;
|
||||
op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
|
||||
op.calculate();
|
||||
}
|
||||
|
||||
#endif // #ifdef ASCEND_310P
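
The float->q4_0 mapping above (scale by -max/8, add 8.5, floor, clamp, recenter) corresponds to this host-side sketch (illustrative only; it writes unpacked 4-bit values for clarity, whereas the kernel packs them via int4b_t):

#include <cmath>
#include <cstdint>

// Quantize one Group_Size (32) group: d is the signed extreme divided by -8,
// then each value is scaled, shifted by 8.5, floored, clamped to [0,15] and
// re-centered to [-8,7], matching the Muls/Adds/Cast/Mins/Adds sequence above.
static float quant_group_q4_0(const float *x, int8_t *q /* 32 unpacked values */) {
    float maxv = x[0], minv = x[0];
    for (int i = 1; i < 32; ++i) {
        maxv = std::fmax(maxv, x[i]);
        minv = std::fmin(minv, x[i]);
    }
    float d = (minv < 0.0f && -minv > maxv) ? minv : maxv;
    d /= -8.0f;
    const float id = d != 0.0f ? 1.0f / d : 0.0f;
    for (int i = 0; i < 32; ++i) {
        float v = std::floor(x[i] * id + 8.5f);
        if (v > 15.0f) {
            v = 15.0f;
        }
        q[i] = (int8_t)(v - 8.0f);
    }
    return d;   // stored as half after the packed 4-bit data
}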
|
||||
@@ -158,12 +158,6 @@ typedef sycl::half2 ggml_half2;
|
||||
|
||||
#endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define GGML_EXTENSION
|
||||
#else // _MSC_VER
|
||||
#define GGML_EXTENSION __extension__
|
||||
#endif // _MSC_VER
|
||||
|
||||
#define QK4_0 32
|
||||
typedef struct {
|
||||
ggml_half d; // delta
|
||||
@@ -173,7 +167,7 @@ static_assert(sizeof(block_q4_0) == sizeof(ggml_half) + QK4_0 / 2, "wrong q4_0 b
|
||||
|
||||
#define QK4_1 32
|
||||
typedef struct {
|
||||
GGML_EXTENSION union {
|
||||
union {
|
||||
struct {
|
||||
ggml_half d; // delta
|
||||
ggml_half m; // min
|
||||
@@ -194,7 +188,7 @@ static_assert(sizeof(block_q5_0) == sizeof(ggml_half) + sizeof(uint32_t) + QK5_0
|
||||
|
||||
#define QK5_1 32
|
||||
typedef struct {
|
||||
GGML_EXTENSION union {
|
||||
union {
|
||||
struct {
|
||||
ggml_half d; // delta
|
||||
ggml_half m; // min
|
||||
@@ -215,7 +209,7 @@ static_assert(sizeof(block_q8_0) == sizeof(ggml_half) + QK8_0, "wrong q8_0 block
|
||||
|
||||
#define QK8_1 32
|
||||
typedef struct {
|
||||
GGML_EXTENSION union {
|
||||
union {
|
||||
struct {
|
||||
ggml_half d; // delta
|
||||
ggml_half s; // d * sum(qs[i])
|
||||
@@ -256,7 +250,7 @@ static_assert(sizeof(block_tq2_0) == sizeof(ggml_half) + QK_K / 4, "wrong tq2_0
|
||||
typedef struct {
|
||||
uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
|
||||
uint8_t qs[QK_K/4]; // quants
|
||||
GGML_EXTENSION union {
|
||||
union {
|
||||
struct {
|
||||
ggml_half d; // super-block scale for quantized scales
|
||||
ggml_half dmin; // super-block scale for quantized mins
|
||||
@@ -283,7 +277,7 @@ static_assert(sizeof(block_q3_K) == sizeof(ggml_half) + QK_K / 4 + QK_K / 8 + 12
|
||||
// weight is represented as x = a * q + b
|
||||
// Effectively 4.5 bits per weight
|
||||
typedef struct {
|
||||
GGML_EXTENSION union {
|
||||
union {
|
||||
struct {
|
||||
ggml_half d; // super-block scale for quantized scales
|
||||
ggml_half dmin; // super-block scale for quantized mins
|
||||
@@ -300,7 +294,7 @@ static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2,
|
||||
// weight is represented as x = a * q + b
|
||||
// Effectively 5.5 bits per weight
|
||||
typedef struct {
|
||||
GGML_EXTENSION union {
|
||||
union {
|
||||
struct {
|
||||
ggml_half d; // super-block scale for quantized scales
|
||||
ggml_half dmin; // super-block scale for quantized mins
|
||||
|
||||
@@ -23,11 +23,6 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
ggml-cpu/amx/mmq.cpp
|
||||
ggml-cpu/amx/mmq.h
|
||||
ggml-cpu/ggml-cpu-impl.h
|
||||
ggml-cpu/common.h
|
||||
ggml-cpu/binary-ops.h
|
||||
ggml-cpu/binary-ops.cpp
|
||||
ggml-cpu/unary-ops.h
|
||||
ggml-cpu/unary-ops.cpp
|
||||
)
|
||||
|
||||
target_compile_features(${GGML_CPU_NAME} PRIVATE c_std_11 cxx_std_17)
|
||||
@@ -294,29 +289,23 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
endif()
elseif ("${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "ppc64le " OR "${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "powerpc ")
message(STATUS "PowerPC detected")
if (GGML_NATIVE)
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
file(READ "/proc/cpuinfo" POWER10_M)
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "powerpc")
execute_process(COMMAND bash -c "prtconf |grep 'Implementation' | head -n 1" OUTPUT_VARIABLE POWER10_M)
endif()
if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
file(READ "/proc/cpuinfo" POWER10_M)
elseif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "powerpc")
execute_process(COMMAND bash -c "prtconf |grep 'Implementation' | head -n 1" OUTPUT_VARIABLE POWER10_M)
endif()

string(REGEX MATCHALL "POWER *([0-9]+)" MATCHED_STRING "${POWER10_M}")
string(REGEX REPLACE "POWER *([0-9]+)" "\\1" EXTRACTED_NUMBER "${MATCHED_STRING}")
string(REGEX MATCHALL "POWER *([0-9]+)" MATCHED_STRING "${POWER10_M}")
string(REGEX REPLACE "POWER *([0-9]+)" "\\1" EXTRACTED_NUMBER "${MATCHED_STRING}")

if (EXTRACTED_NUMBER GREATER_EQUAL 10)
list(APPEND ARCH_FLAGS -mcpu=power10 -mpowerpc64)
elseif (EXTRACTED_NUMBER EQUAL 9)
list(APPEND ARCH_FLAGS -mcpu=power9 -mpowerpc64)
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
list(APPEND ARCH_FLAGS -mcpu=powerpc64le -mtune=native)
else()
list(APPEND ARCH_FLAGS -mcpu=native -mtune=native -mpowerpc64)
endif()
if (EXTRACTED_NUMBER GREATER_EQUAL 10)
list(APPEND ARCH_FLAGS -mcpu=power10 -mpowerpc64)
elseif (EXTRACTED_NUMBER EQUAL 9)
list(APPEND ARCH_FLAGS -mcpu=power9 -mpowerpc64)
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
list(APPEND ARCH_FLAGS -mcpu=powerpc64le -mtune=native)
else()
if (GGML_CPU_POWERPC_CPUTYPE)
list(APPEND ARCH_FLAGS -mcpu=${GGML_CPU_POWERPC_CPUTYPE})
endif()
list(APPEND ARCH_FLAGS -mcpu=native -mtune=native -mpowerpc64)
endif()
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
message(STATUS "loongarch64 detected")
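The PowerPC branch above picks -mcpu by scraping the POWER generation out of /proc/cpuinfo (Linux) or prtconf output (AIX) with the "POWER *([0-9]+)" pattern and then comparing the extracted number. A minimal C++ sketch of that same extraction logic, using an invented sample cpuinfo line instead of a real probe:

#include <iostream>
#include <regex>
#include <string>

int main() {
    // Assumed sample line; on a real system this text would come from /proc/cpuinfo or prtconf.
    const std::string cpuinfo = "cpu             : POWER10 (architected), altivec supported";

    std::smatch m;
    if (std::regex_search(cpuinfo, m, std::regex("POWER *([0-9]+)"))) {
        const int power = std::stoi(m.str(1));      // plays the role of EXTRACTED_NUMBER
        const char * mcpu = power >= 10 ? "power10"
                          : power == 9  ? "power9"
                          : "native";               // fallback mirrors the else() branch
        std::cout << "-mcpu=" << mcpu << "\n";      // prints: -mcpu=power10
    }
}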
@@ -331,11 +320,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "riscv64")
message(STATUS "RISC-V detected")
if (GGML_RVV)
if (GGML_RV_ZFH)
list(APPEND ARCH_FLAGS -march=rv64gcv_zfhmin -DGGML_RV_ZFH -mabi=lp64d)
else()
list(APPEND ARCH_FLAGS -march=rv64gcv -mabi=lp64d)
endif()
list(APPEND ARCH_FLAGS -march=rv64gcv -mabi=lp64d)
endif()
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "s390x")
message(STATUS "s390x detected")
@@ -374,9 +359,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)

# Fetch KleidiAI sources:
include(FetchContent)
set(KLEIDIAI_COMMIT_TAG "v1.5.0")
set(KLEIDIAI_COMMIT_TAG "v1.3.0")
set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz")
set(KLEIDIAI_ARCHIVE_MD5 "ea22e1aefb800e9bc8c74d91633cc58e")
set(KLEIDIAI_ARCHIVE_MD5 "060bd2dc64642b091f461cc8dd7426d9")

if (POLICY CMP0135)
cmake_policy(SET CMP0135 NEW)
@@ -1,158 +0,0 @@
#include "binary-ops.h"

#if defined(GGML_USE_ACCELERATE)
#include <Accelerate/Accelerate.h>

using vDSP_fn_t = void (*)(const float *, vDSP_Stride, const float *, vDSP_Stride, float *, vDSP_Stride, vDSP_Length);
#endif

static inline float op_add(float a, float b) {
    return a + b;
}

static inline float op_sub(float a, float b) {
    return a - b;
}

static inline float op_mul(float a, float b) {
    return a * b;
}

static inline float op_div(float a, float b) {
    return a / b;
}

template <float (*op)(float, float), typename src0_t, typename src1_t, typename dst_t>
static inline void vec_binary_op_contiguous(const int64_t n, dst_t * z, const src0_t * x, const src1_t * y) {
    constexpr auto src0_to_f32 = type_conversion_table<src0_t>::to_f32;
    constexpr auto src1_to_f32 = type_conversion_table<src1_t>::to_f32;
    constexpr auto f32_to_dst  = type_conversion_table<dst_t >::from_f32;

    for (int i = 0; i < n; i++) {
        z[i] = f32_to_dst(op(src0_to_f32(x[i]), src1_to_f32(y[i])));
    }
}

template <float (*op)(float, float), typename src0_t, typename src1_t, typename dst_t>
static inline void vec_binary_op_non_contiguous(const int64_t n, const int64_t ne10, const int64_t nb10, dst_t * z, const src0_t * x, const src1_t * y) {
    constexpr auto src0_to_f32 = type_conversion_table<src0_t>::to_f32;
    constexpr auto src1_to_f32 = type_conversion_table<src1_t>::to_f32;
    constexpr auto f32_to_dst  = type_conversion_table<dst_t >::from_f32;

    for (int i = 0; i < n; i++) {
        int i10 = i % ne10;
        const src1_t * y_ptr = (const src1_t *)((const char *)y + i10*nb10);
        z[i] = f32_to_dst(op(src0_to_f32(x[i]), src1_to_f32(*y_ptr)));
    }
}

template <float (*op)(float, float), typename src0_t, typename src1_t, typename dst_t>
static void apply_binary_op(const ggml_compute_params * params, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const ggml_tensor * src1 = dst->src[1];

    GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));

    GGML_TENSOR_BINARY_OP_LOCALS

    GGML_ASSERT( nb0 == sizeof(dst_t));
    GGML_ASSERT(nb00 == sizeof(src0_t));

    const auto [ir0, ir1] = get_thread_range(params, src0);
    const bool is_src1_contiguous = (nb10 == sizeof(src1_t));

    if (!is_src1_contiguous) { // broadcast not implemented yet for non-contiguous
        GGML_ASSERT(ggml_are_same_shape(src0, src1));
    }

#ifdef GGML_USE_ACCELERATE
    vDSP_fn_t vDSP_op = nullptr;
    // TODO - avoid the f32-only check using type 'trait' lookup tables and row-based src-to-float conversion functions
    if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
        if (op == op_add) {
            vDSP_op = vDSP_vadd;
        } else if (op == op_sub) {
            vDSP_op = vDSP_vsub;
        } else if (op == op_mul) {
            vDSP_op = vDSP_vmul;
        } else if (op == op_div) {
            vDSP_op = vDSP_vdiv;
        }
    }
#endif

    for (int64_t ir = ir0; ir < ir1; ++ir) {
        const int64_t i03 = ir/(ne02*ne01);
        const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
        const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);

        const int64_t i13 = i03 % ne13;
        const int64_t i12 = i02 % ne12;
        const int64_t i11 = i01 % ne11;

        dst_t        * dst_ptr  = (dst_t        *) ((char *)       dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
        const src0_t * src0_ptr = (const src0_t *) ((const char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
        const src1_t * src1_ptr = (const src1_t *) ((const char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);

        if (is_src1_contiguous) {
            // src1 is broadcastable across src0 and dst in i1, i2, i3
            const int64_t nr0 = ne00 / ne10;

            for (int64_t r = 0; r < nr0; ++r) {
#ifdef GGML_USE_ACCELERATE
                if constexpr (std::is_same_v<src0_t, float> && std::is_same_v<src1_t, float> && std::is_same_v<dst_t, float>) {
                    if (vDSP_op != nullptr) {
                        vDSP_op(src1_ptr, 1, src0_ptr + r*ne10, 1, dst_ptr + r*ne10, 1, ne10);
                        continue;
                    }
                }
#endif
                vec_binary_op_contiguous<op>(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
            }
        } else {
            vec_binary_op_non_contiguous<op>(ne0, ne10, nb10, dst_ptr, src0_ptr, src1_ptr);
        }
    }
}

// TODO: Use the 'traits' lookup table (for type conversion fns), instead of a mass of 'if' conditions with long templates
template <float (*op)(float, float)>
static void binary_op(const ggml_compute_params * params, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const ggml_tensor * src1 = dst->src[1];

    /* */ if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { // all f32
        apply_binary_op<op, float, float, float>(params, dst);
    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { // all f16
        apply_binary_op<op, ggml_fp16_t, ggml_fp16_t, ggml_fp16_t>(params, dst);
    } else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_BF16 && dst->type == GGML_TYPE_BF16) { // all bf16
        apply_binary_op<op, ggml_bf16_t, ggml_bf16_t, ggml_bf16_t>(params, dst);
    } else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_BF16) {
        apply_binary_op<op, ggml_bf16_t, float, ggml_bf16_t>(params, dst);
    } else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
        apply_binary_op<op, ggml_bf16_t, float, float>(params, dst);
    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F16) {
        apply_binary_op<op, ggml_fp16_t, float, ggml_fp16_t>(params, dst);
    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
        apply_binary_op<op, ggml_fp16_t, float, float>(params, dst);
    } else {
        GGML_ABORT("%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
            ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
    }
}

void ggml_compute_forward_add_non_quantized(const ggml_compute_params * params, ggml_tensor * dst) {
    binary_op<op_add>(params, dst);
}

void ggml_compute_forward_sub(const ggml_compute_params * params, ggml_tensor * dst) {
    binary_op<op_sub>(params, dst);
}

void ggml_compute_forward_mul(const ggml_compute_params * params, ggml_tensor * dst) {
    binary_op<op_mul>(params, dst);
}

void ggml_compute_forward_div(const ggml_compute_params * params, ggml_tensor * dst) {
    binary_op<op_div>(params, dst);
}
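The core of apply_binary_op above is the row-broadcast loop: when src1 is contiguous, each row of src0 is processed in nr0 = ne00/ne10 chunks that all reuse the same src1 row. A stripped-down, self-contained sketch of that pattern on plain float arrays (function name and shapes invented for illustration, not part of the ggml API):

#include <cstdint>
#include <cstdio>

// z[i] = x[i] + y[i % ny] over one row, repeating y when it is shorter than x.
// This mirrors the nr0 = ne00/ne10 chunking in apply_binary_op (contiguous src1 case).
static void add_broadcast_row(int64_t nx, int64_t ny, float * z, const float * x, const float * y) {
    const int64_t nr0 = nx / ny; // assumes nx is a multiple of ny, as ggml_can_repeat() guarantees
    for (int64_t r = 0; r < nr0; ++r) {
        for (int64_t i = 0; i < ny; ++i) {
            z[r*ny + i] = x[r*ny + i] + y[i];
        }
    }
}

int main() {
    float x[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    float y[4] = {10, 20, 30, 40}; // broadcast across x in chunks of 4
    float z[8];

    add_broadcast_row(8, 4, z, x, y);

    for (float v : z) {
        std::printf("%g ", v); // 11 22 33 44 15 26 37 48
    }
    std::printf("\n");
}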
Some files were not shown because too many files have changed in this diff.