test

ggml-ci
2026-04-23 16:37:33 +03:00 · 2025-04-10 12:35:16 +03:00
31 changed files with 1226 additions and 2261 deletions
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -36,13 +36,13 @@ jobs:
      matrix:
        config:
          # Multi-stage build
-          - { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: false }
-          - { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
-          - { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: true }
-          - { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
-          - { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, free_disk_space: false }
+          - { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, freediskspace: false}
+          - { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
+          - { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: true}
+          - { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
+          - { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
          # Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete
-          #- {tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, free_disk_space: true }
+          #- {tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, freediskspace: true }
    steps:
      - name: Check out the repo
        uses: actions/checkout@v4
--- a/build-xcframework.sh
+++ b/build-xcframework.sh
@@ -41,11 +41,6 @@ COMMON_CMAKE_ARGS=(
    -DGGML_OPENMP=${GGML_OPENMP}
 )

-XCODE_VERSION=$(xcodebuild -version 2>/dev/null | head -n1 | awk '{ print $2 }')
-MAJOR_VERSION=$(echo $XCODE_VERSION | cut -d. -f1)
-MINOR_VERSION=$(echo $XCODE_VERSION | cut -d. -f2)
-echo "Detected Xcode version: $XCODE_VERSION"
-
 check_required_tool() {
    local tool=$1
    local install_message=$2
@@ -330,28 +325,21 @@ combine_static_libraries() {

    # Platform-specific post-processing for device builds
    if [[ "$is_simulator" == "false" ]]; then
-        if command -v xcrun vtool &>/dev/null; then
+        if command -v vtool &>/dev/null; then
            case "$platform" in
                "ios")
                    echo "Marking binary as a framework binary for iOS..."
-                    xcrun vtool -set-build-version ios ${IOS_MIN_OS_VERSION} ${IOS_MIN_OS_VERSION} -replace \
+                    vtool -set-build-version ios ${IOS_MIN_OS_VERSION} ${IOS_MIN_OS_VERSION} -replace \
                        -output "${base_dir}/${output_lib}" "${base_dir}/${output_lib}"
                    ;;
                "visionos")
                    echo "Marking binary as a framework binary for visionOS..."
-                    if [[ "$MAJOR_VERSION" -gt 16 ]] || [[ "$MAJOR_VERSION" -eq 16 && "$MINOR_VERSION" -gt 2 ]]; then
-                        echo "Xcode version greater than 16.2, using visionOS."
-                        VISION_OS_BUILD_VERSION="visionos"
-                    else
-                        echo "Xcode version less than or equal to 16.2, using xros."
-                        VISION_OS_BUILD_VERSION="xros"
-                    fi
-                    xcrun vtool -set-build-version ${VISION_OS_BUILD_VERSION} ${VISIONOS_MIN_OS_VERSION} ${VISIONOS_MIN_OS_VERSION} -replace \
+                    vtool -set-build-version xros ${VISIONOS_MIN_OS_VERSION} ${VISIONOS_MIN_OS_VERSION} -replace \
                        -output "${base_dir}/${output_lib}" "${base_dir}/${output_lib}"
                    ;;
                "tvos")
                    echo "Marking binary as a framework binary for tvOS..."
-                    xcrun vtool -set-build-version tvos ${TVOS_MIN_OS_VERSION} ${TVOS_MIN_OS_VERSION} -replace \
+                    vtool -set-build-version tvos ${TVOS_MIN_OS_VERSION} ${TVOS_MIN_OS_VERSION} -replace \
                        -output "${base_dir}/${output_lib}" "${base_dir}/${output_lib}"
                    ;;
            esac
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -65,7 +65,6 @@ class Model:
    model_name: str | None
    metadata_override: Path | None
    dir_model_card: Path
-    remote_hf_model_id: str | None

    # subclasses should define this!
    model_arch: gguf.MODEL_ARCH
@@ -74,7 +73,7 @@ class Model:
                 use_temp_file: bool = False, eager: bool = False,
                 metadata_override: Path | None = None, model_name: str | None = None,
                 split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False,
-                 small_first_shard: bool = False, hparams: dict[str, Any] | None = None, remote_hf_model_id: str | None = None):
+                 small_first_shard: bool = False, hparams: dict[str, Any] | None = None):
        if type(self) is Model:
            raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")

@@ -84,24 +83,11 @@ class Model:
        self.is_big_endian = is_big_endian
        self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
        self.use_temp_file = use_temp_file
-        self.lazy = not eager or (remote_hf_model_id is not None)
-        self.remote_hf_model_id = remote_hf_model_id
-        if remote_hf_model_id is not None:
-            self.is_safetensors = True
-
-            def get_remote_tensors() -> Iterator[tuple[str, Tensor]]:
-                logger.info(f"Using remote model with HuggingFace id: {remote_hf_model_id}")
-                remote_tensors = gguf.utility.SafetensorRemote.get_list_tensors_hf_model(remote_hf_model_id)
-                self.tensor_names = set(name for name in remote_tensors.keys())
-                for name, remote_tensor in gguf.utility.SafetensorRemote.get_list_tensors_hf_model(remote_hf_model_id).items():
-                    yield (name, LazyTorchTensor.from_remote_tensor(remote_tensor))
-
-            self.get_tensors = get_remote_tensors
-        else:
-            self.part_names = Model.get_model_part_names(self.dir_model, "model", ".safetensors")
-            self.is_safetensors = len(self.part_names) > 0
-            if not self.is_safetensors:
-                self.part_names = Model.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
+        self.lazy = not eager
+        self.part_names = Model.get_model_part_names(self.dir_model, "model", ".safetensors")
+        self.is_safetensors = len(self.part_names) > 0
+        if not self.is_safetensors:
+            self.part_names = Model.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
        self.hparams = Model.load_hparams(self.dir_model) if hparams is None else hparams
        self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"])
        self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
@@ -407,10 +393,6 @@ class Model:

        self.metadata = gguf.Metadata.load(self.metadata_override, self.dir_model_card, self.model_name, total_params)

-        # If we are using HF model id, set the metadata name to the model id
-        if self.remote_hf_model_id:
-            self.metadata.name = self.remote_hf_model_id
-
        # Fallback to model directory name if metadata name is still missing
        if self.metadata.name is None:
            self.metadata.name = self.dir_model.name
@@ -1750,7 +1732,7 @@ class LlamaModel(Model):

                low_freq_wavelen = old_context_len / low_freq_factor
                high_freq_wavelen = old_context_len / high_freq_factor
-                # assert low_freq_wavelen != high_freq_wavelen # Errors for Llama4
+                assert low_freq_wavelen != high_freq_wavelen

                rope_factors = []
                for freq in freqs:
@@ -1806,6 +1788,10 @@ class Llama4Model(LlamaModel):
        self.gguf_writer.add_expert_feed_forward_length(self.hparams["intermediate_size_moe"])

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
+        name = name.replace("language_model.", "")
+        name = name.replace("feed_forward.", "mlp.") # a bit hacky for now
+        name = name.replace(".router.weight", ".gate.weight") # a bit hacky for now
+
        # split the gate_up into gate and up
        if "gate_up_proj" in name:
            name_up = name.replace("gate_up_proj", "up_proj.weight")
@@ -5417,14 +5403,6 @@ class LazyTorchTensor(gguf.LazyBase):
        lazy = cls(meta=cls.meta_with_dtype_and_shape(dtype, shape), args=(st_slice,), func=lambda s: s[:])
        return cast(torch.Tensor, lazy)

-    @classmethod
-    def from_remote_tensor(cls, remote_tensor: gguf.utility.RemoteTensor):
-        dtype = cls._dtype_str_map[remote_tensor.dtype]
-        shape = remote_tensor.shape
-        meta = cls.meta_with_dtype_and_shape(dtype, shape)
-        lazy = cls(meta=meta, args=(remote_tensor,), func=lambda r: torch.frombuffer(r.data(), dtype=dtype).reshape(shape))
-        return cast(torch.Tensor, lazy)
-
    @classmethod
    def __torch_function__(cls, func, types, args=(), kwargs=None):
        del types  # unused
@@ -5502,10 +5480,6 @@ def parse_args() -> argparse.Namespace:
        "--print-supported-models", action="store_true",
        help="Print the supported models"
    )
-    parser.add_argument(
-        "--remote", action="store_true",
-        help="(Experimental) Read safetensors file remotely without downloading to disk. Config and tokenizer files will still be downloaded. To use this feature, you need to specify Hugging Face model repo name instead of a local directory. For example: 'HuggingFaceTB/SmolLM2-1.7B-Instruct'. Note: To access gated repo, set HF_TOKEN environment variable to your Hugging Face token.",
-    )

    args = parser.parse_args()
    if not args.print_supported_models and args.model is None:
@@ -5546,14 +5520,6 @@ def main() -> None:

    dir_model = args.model

-    if args.remote:
-        from huggingface_hub import snapshot_download
-        local_dir = snapshot_download(
-            repo_id=str(dir_model),
-            allow_patterns=["LICENSE", "*.json", "*.md", "*.txt", "tokenizer.model"])
-        dir_model = Path(local_dir)
-        logger.info(f"Downloaded config and tokenizer to {local_dir}")
-
    if not dir_model.is_dir():
        logger.error(f'Error: {args.model} is not a directory')
        sys.exit(1)
@@ -5575,9 +5541,6 @@ def main() -> None:

    if args.outfile is not None:
        fname_out = args.outfile
-    elif args.remote:
-        # if remote, use the model ID as the output file name
-        fname_out = Path("./" + str(args.model).replace("/", "-") + "-{ftype}.gguf")
    else:
        fname_out = dir_model

@@ -5601,8 +5564,7 @@ def main() -> None:
                                     metadata_override=args.metadata, model_name=args.model_name,
                                     split_max_tensors=args.split_max_tensors,
                                     split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run,
-                                     small_first_shard=args.no_tensor_first_split,
-                                     remote_hf_model_id=str(args.model) if args.remote else None)
+                                     small_first_shard=args.no_tensor_first_split)

        if args.vocab_only:
            logger.info("Exporting model vocab...")
--- a/examples/llava/CMakeLists.txt
+++ b/examples/llava/CMakeLists.txt
@@ -1,5 +1,3 @@
-# llava (legacy)
-
 add_library(llava OBJECT
            llava.cpp
            llava.h
@@ -24,41 +22,12 @@ if (BUILD_SHARED_LIBS)
    install(TARGETS llava_shared LIBRARY)
 endif()

-# mtmd
-
-add_library(mtmd OBJECT
-            mtmd.cpp
-            mtmd.h
-            clip.cpp
-            clip.h
-            clip-impl.h
-            )
-
-target_link_libraries(mtmd PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
-
-target_include_directories(mtmd PUBLIC .)
-target_include_directories(mtmd PRIVATE ../..)
-target_include_directories(mtmd PRIVATE ../../common) # for stb_image.h
-
-target_compile_features(mtmd PRIVATE cxx_std_17)
-
-add_library(mtmd_static STATIC $<TARGET_OBJECTS:mtmd>)
-if (BUILD_SHARED_LIBS)
-    set_target_properties(mtmd PROPERTIES POSITION_INDEPENDENT_CODE ON)
-    target_compile_definitions(mtmd PRIVATE LLAMA_SHARED LLAMA_BUILD)
-    add_library(mtmd_shared SHARED $<TARGET_OBJECTS:mtmd>)
-    target_link_libraries(mtmd_shared PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
-    install(TARGETS mtmd_shared LIBRARY)
-endif()
-
 if (NOT MSVC)
    target_compile_options(llava PRIVATE -Wno-cast-qual) # stb_image.h
-    target_compile_options(mtmd PRIVATE -Wno-cast-qual) # stb_image.h
 endif()

 if(TARGET BUILD_INFO)
    add_dependencies(llava BUILD_INFO)
-    add_dependencies(mtmd BUILD_INFO)
 endif()

 set(TARGET llama-llava-cli)
@@ -86,7 +55,7 @@ set(TARGET llama-gemma3-cli)
 add_executable(${TARGET} gemma3-cli.cpp)
 set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-gemma3-cli)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common mtmd ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)

 set(TARGET llama-llava-clip-quantize-cli)
--- a/examples/llava/clip-impl.h
+++ b/examples/llava/clip-impl.h
@@ -1,8 +1,5 @@
 #include "ggml.h"
 #include "gguf.h"
-#include "clip.h"
-
-#include "clip.h"

 #include <climits>
 #include <cstdarg>
@@ -10,7 +7,6 @@
 #include <map>
 #include <sstream>
 #include <vector>
-#include <memory>

 // Internal header for clip.cpp

@@ -124,23 +120,6 @@ static projector_type clip_projector_type_from_string(const std::string & str) {
    return PROJECTOR_TYPE_UNKNOWN;
 }

-// RGB uint8 image
-struct clip_image_u8 {
-    int nx;
-    int ny;
-
-    std::vector<uint8_t> buf;
-};
-
-// RGB float32 image (NHWC)
-// Memory layout: RGBRGBRGB...
-struct clip_image_f32 {
-    int nx;
-    int ny;
-
-    std::vector<float> buf;
-};
-
 //
 // logging
 //
@@ -199,36 +178,6 @@ static void clip_log_internal(enum ggml_log_level level, const char * format, ..
 #define LOG_DBG(...) LOG_TMPL(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
 #define LOG_CNT(...) LOG_TMPL(GGML_LOG_LEVEL_CONT,  __VA_ARGS__)

-//
-// cpp wrappers
-//
-
-// wrapper for clip_image_size
-struct clip_image_size_deleter {
-    void operator()(clip_image_size * val) { clip_image_size_free(val); }
-};
-typedef std::unique_ptr<clip_image_size, clip_image_size_deleter> clip_image_size_ptr;
-
-// wrapper for clip_image_u8
-struct clip_image_u8_deleter {
-    void operator()(clip_image_u8 * val) { clip_image_u8_free(val); }
-};
-typedef std::unique_ptr<clip_image_u8, clip_image_u8_deleter> clip_image_u8_ptr;
-
-// wrapper for clip_image_f32
-struct clip_image_f32_deleter {
-    void operator()(clip_image_f32 * val) { clip_image_f32_free(val); }
-};
-typedef std::unique_ptr<clip_image_f32, clip_image_f32_deleter> clip_image_f32_ptr;
-
-struct clip_image_u8_batch {
-    std::vector<clip_image_u8_ptr> entries;
-};
-
-struct clip_image_f32_batch {
-    std::vector<clip_image_f32_ptr> entries;
-};
-
 //
 // common utils
 //
@@ -265,20 +214,6 @@ static void string_replace_all(std::string & s, const std::string & search, cons
    s = std::move(builder);
 }

-// split string by a `std::string delim` instead of `char delim`
-static std::vector<std::string> string_split_str(std::string s, const std::string & delimiter) {
-    std::vector<std::string> tokens;
-    size_t pos = 0;
-    std::string token;
-    while ((pos = s.find(delimiter)) != std::string::npos) {
-        token = s.substr(0, pos);
-        tokens.push_back(token);
-        s.erase(0, pos + delimiter.length());
-    }
-    tokens.push_back(s);
-    return tokens;
-}
-
 //
 // gguf utils
 //
@@ -336,9 +271,3 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
            return gguf_data_to_str(type, gguf_get_val_data(ctx_gguf, i), 0);
    }
 }
-
-//
-// API used internally with mtmd
-//
-
-projector_type clip_get_projector_type(const struct clip_ctx * ctx);
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -32,6 +32,23 @@ struct clip_logger_state g_logger_state = {GGML_LOG_LEVEL_CONT, clip_log_callbac

 //#define CLIP_DEBUG_FUNCTIONS

+// RGB uint8 image
+struct clip_image_u8 {
+    int nx;
+    int ny;
+
+    std::vector<uint8_t> buf;
+};
+
+// RGB float32 image (NHWC)
+// Memory layout: RGBRGBRGB...
+struct clip_image_f32 {
+    int nx;
+    int ny;
+
+    std::vector<float> buf;
+};
+
 #ifdef CLIP_DEBUG_FUNCTIONS
 static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) {
    std::ofstream file(filename, std::ios::binary);
@@ -315,47 +332,58 @@ struct clip_ctx {
    bool use_gelu = false;
    bool use_silu = false;

-    gguf_context_ptr ctx_gguf;
-    ggml_context_ptr ctx_data;
+    struct gguf_context * ctx_gguf = nullptr;
+    struct ggml_context * ctx_data = nullptr;

    std::vector<uint8_t> buf_compute_meta;

    std::vector<ggml_backend_t> backend_ptrs;
    std::vector<ggml_backend_buffer_type_t> backend_buft;

-    ggml_backend_ptr backend;
-    ggml_backend_ptr backend_cpu;
-    ggml_backend_buffer_ptr buf;
+    ggml_backend_t backend     = nullptr;
+    ggml_backend_t backend_cpu = nullptr;
+    ggml_backend_buffer_t buf  = nullptr;

    ggml_backend_sched_ptr sched;

-    clip_image_size load_image_size;
+    struct clip_image_size * load_image_size = nullptr;

    clip_ctx(clip_context_params & ctx_params) {
-        backend_cpu = ggml_backend_ptr(ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr));
-        backend     = ggml_backend_ptr(ctx_params.use_gpu
+        backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
+        backend     = ctx_params.use_gpu
                        ? ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_GPU, nullptr)
-                        : nullptr);
+                        : nullptr;

        if (backend) {
-            LOG_INF("%s: CLIP using %s backend\n", __func__, ggml_backend_name(backend.get()));
-            backend_ptrs.push_back(backend.get());
-            backend_buft.push_back(ggml_backend_get_default_buffer_type(backend.get()));
+            LOG_INF("%s: CLIP using %s backend\n", __func__, ggml_backend_name(backend));
+            backend_ptrs.push_back(backend);
+            backend_buft.push_back(ggml_backend_get_default_buffer_type(backend));
        } else {
-            backend = std::move(backend_cpu);
+            backend = backend_cpu;
            LOG_INF("%s: CLIP using CPU backend\n", __func__);
        }

-        backend_ptrs.push_back(backend_cpu.get());
-        backend_buft.push_back(ggml_backend_get_default_buffer_type(backend_cpu.get()));
+        backend_ptrs.push_back(backend_cpu);
+        backend_buft.push_back(ggml_backend_get_default_buffer_type(backend_cpu));

        sched.reset(
            ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), 8192, false)
        );
    }
+
+    ~clip_ctx() {
+        ggml_free(ctx_data);
+        gguf_free(ctx_gguf);
+        ggml_backend_buffer_free(buf);
+        ggml_backend_free(backend);
+        if (backend_cpu != backend) {
+            ggml_backend_free(backend_cpu);
+        }
+        clip_image_size_free(load_image_size);
+    }
 };

-static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_image_f32_batch & imgs) {
+static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_image_f32_batch * imgs) {
    const auto & model = ctx->vision_model;
    const auto & hparams = model.hparams;

@@ -371,7 +399,7 @@ static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_im
    const int n_layer              = hparams.n_layer;
    const float eps                = hparams.eps;

-    GGML_ASSERT(imgs.entries.size() == 1); // batch_size == 1
+    GGML_ASSERT(imgs->size == 1); // batch_size == 1

    struct ggml_init_params params = {
        /*.mem_size   =*/ ctx->buf_compute_meta.size(),
@@ -379,9 +407,7 @@ static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_im
        /*.no_alloc   =*/ true,
    };

-    ggml_context_ptr ctx0_ptr(ggml_init(params));
-    auto ctx0 = ctx0_ptr.get();
-
+    struct ggml_context * ctx0 = ggml_init(params);
    struct ggml_cgraph * gf = ggml_new_graph(ctx0);

    // input raw
@@ -503,10 +529,12 @@ static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_im
    // build the graph
    ggml_build_forward_expand(gf, embeddings);

+    ggml_free(ctx0);
+
    return gf;
 }

-static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_image_f32_batch & imgs, struct clip_image_size load_image_size, bool is_inf = false) {
+static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) {
    if (!ctx->has_vision_encoder) {
        LOG_ERR("This gguf file seems to have no vision encoder\n");
        return nullptr;
@@ -519,20 +547,23 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
    int image_size_width  = image_size;
    int image_size_height = image_size;
    if (ctx->has_minicpmv_projector) {
-        LOG_DBG("%s: %d %d\n", __func__, load_image_size.width, load_image_size.height);
-        image_size_width  = load_image_size.width;
-        image_size_height = load_image_size.height;
+        if (load_image_size == nullptr) {
+            load_image_size = clip_image_size_init();
+        }
+        LOG_DBG("%s: %d %d\n", __func__, load_image_size->width, load_image_size->height);
+        image_size_width  = load_image_size->width;
+        image_size_height = load_image_size->height;
        if (is_inf) {
-            image_size_width  = imgs.entries[0]->nx;
-            image_size_height = imgs.entries[0]->ny;
+            image_size_width  = imgs->data->nx;
+            image_size_height = imgs->data->ny;
        }
    }
    else if (ctx->has_qwen2vl_merger) {
        // use the image's native resolution when image is avaible
        if (is_inf) {
        // if (imgs->data->nx && imgs->data->ny) {
-            image_size_width  = imgs.entries[0]->nx;
-            image_size_height = imgs.entries[0]->ny;
+            image_size_width  = imgs->data->nx;
+            image_size_height = imgs->data->ny;
        }
    }
    const int patch_size           = hparams.patch_size;
@@ -547,7 +578,7 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
    const float eps                = hparams.eps;
    int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};

-    const int batch_size = imgs.entries.size();
+    const int batch_size = imgs->size;

    if (ctx->has_llava_projector || ctx->has_minicpmv_projector || ctx->has_glm_projector) {
        GGML_ASSERT(batch_size == 1);
@@ -559,9 +590,7 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
        /*.no_alloc   =*/ true,
    };

-    ggml_context_ptr ctx0_ptr(ggml_init(params));
-    auto ctx0 = ctx0_ptr.get();
-
+    struct ggml_context * ctx0 = ggml_init(params);
    struct ggml_cgraph * gf = ggml_new_graph(ctx0);

    struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, 3, batch_size);
@@ -1049,7 +1078,7 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
                embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, embeddings);
            }
        } else {
-            GGML_ABORT("fatal error");
+            GGML_ABORT("fatal error");
        }
    }
    else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) {
@@ -1069,10 +1098,12 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
    // build the graph
    ggml_build_forward_expand(gf, embeddings);

+    ggml_free(ctx0);
+
    return gf;
 }

-static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch & imgs, struct clip_image_size load_image_size, bool is_inf = false) {
+static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs, struct clip_image_size * load_image_size, bool is_inf = false) {
    if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
        return clip_image_build_graph_siglip(ctx, imgs);
    } else {
@@ -1243,7 +1274,7 @@ struct clip_model_loader {
            /*.mem_buffer =*/ NULL,
            /*.no_alloc =*/ true,
        };
-        ctx_clip.ctx_data.reset(ggml_init(params));
+        ctx_clip.ctx_data = ggml_init(params);
        if (!ctx_clip.ctx_data) {
            throw std::runtime_error(string_format("%s: failed to init ggml context\n", __func__));
        }
@@ -1257,7 +1288,7 @@ struct clip_model_loader {
            if (cur) {
                tensors_to_load.push_back(cur);
                // add tensors to context
-                struct ggml_tensor * data_tensor = ggml_dup_tensor(ctx_clip.ctx_data.get(), cur);
+                struct ggml_tensor * data_tensor = ggml_dup_tensor(ctx_clip.ctx_data, cur);
                ggml_set_name(data_tensor, cur->name);
                cur = data_tensor;
            }
@@ -1428,11 +1459,11 @@ struct clip_model_loader {
            }

            // alloc memory and offload data
-            ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(ctx_clip.backend.get());
-            ctx_clip.buf.reset(ggml_backend_alloc_ctx_tensors_from_buft(ctx_clip.ctx_data.get(), buft));
-            ggml_backend_buffer_set_usage(ctx_clip.buf.get(), GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
+            ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(ctx_clip.backend);
+            ctx_clip.buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx_clip.ctx_data, buft);
+            ggml_backend_buffer_set_usage(ctx_clip.buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
            for (auto & t : tensors_to_load) {
-                struct ggml_tensor * cur = ggml_get_tensor(ctx_clip.ctx_data.get(), t->name);
+                struct ggml_tensor * cur = ggml_get_tensor(ctx_clip.ctx_data, t->name);
                const size_t offset = tensor_offset[t->name];
                fin.seekg(offset, std::ios::beg);
                if (!fin) {
@@ -1457,20 +1488,10 @@ struct clip_model_loader {

    void alloc_compute_meta() {
        ctx_clip.buf_compute_meta.resize(GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead());
-
-        // create a fake batch
        clip_image_f32_batch batch;
-        clip_image_f32_ptr img(clip_image_f32_init());
-        clip_image_size image_size;
-        image_size.width  = clip_get_image_size(&ctx_clip);
-        image_size.height = clip_get_image_size(&ctx_clip);
-        int n_patches = clip_get_image_size(&ctx_clip) / image_size.width;
-        img->nx = n_patches;
-        img->ny = n_patches;
-        img->buf.resize(n_patches * image_size.width * image_size.height * 3);
-        batch.entries.push_back(std::move(img));
-
-        ggml_cgraph * gf = clip_image_build_graph(&ctx_clip, batch, image_size, false);
+        batch.size = 1;
+        batch.data = nullptr;
+        ggml_cgraph * gf = clip_image_build_graph(&ctx_clip, &batch, nullptr, false);
        ggml_backend_sched_reserve(ctx_clip.sched.get(), gf);
        for (size_t i = 0; i < ctx_clip.backend_ptrs.size(); ++i) {
            ggml_backend_t backend = ctx_clip.backend_ptrs[i];
@@ -1571,11 +1592,11 @@ struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_p
 }

 void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size) {
-    ctx_clip->load_image_size = *load_image_size; // copy
+    ctx_clip->load_image_size = load_image_size;
 }

 struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip) {
-    return &ctx_clip->load_image_size;
+    return ctx_clip->load_image_size;
 }

 struct clip_image_size * clip_image_size_init() {
@@ -1593,53 +1614,25 @@ struct clip_image_f32 * clip_image_f32_init() {
    return new clip_image_f32();
 }

-struct clip_image_f32_batch * clip_image_f32_batch_init() {
-    return new clip_image_f32_batch();
-}
-
-unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny) {
-    if (nx) *nx = img->nx;
-    if (ny) *ny = img->ny;
-    return img->buf.data();
-}
-
 void clip_image_size_free(struct clip_image_size * load_image_size) {
    if (load_image_size == nullptr) {
        return;
    }
    delete load_image_size;
 }
-void clip_image_u8_free(struct clip_image_u8  * img) { if (img) delete img; }
-void clip_image_f32_free(struct clip_image_f32 * img) { if (img) delete img; }
-void clip_image_u8_batch_free(struct clip_image_u8_batch * batch) { if (batch) delete batch; }
-void clip_image_f32_batch_free(struct clip_image_f32_batch * batch) { if (batch) delete batch; }
-
-size_t clip_image_f32_batch_n_images(const struct clip_image_f32_batch * batch) {
-    return batch->entries.size();
-}
-
-size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int idx) {
-    if (idx < 0 || idx >= (int)batch->entries.size()) {
-        LOG_ERR("%s: invalid index %d\n", __func__, idx);
-        return 0;
+void clip_image_u8_free(struct clip_image_u8  * img) { delete img; }
+void clip_image_f32_free(struct clip_image_f32 * img) { delete img; }
+void clip_image_u8_batch_free(struct clip_image_u8_batch  * batch) {
+    if (batch->size > 0) {
+        delete[] batch->data;
+        batch->size = 0;
    }
-    return batch->entries[idx]->nx;
 }
-
-size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx) {
-    if (idx < 0 || idx >= (int)batch->entries.size()) {
-        LOG_ERR("%s: invalid index %d\n", __func__, idx);
-        return 0;
+void clip_image_f32_batch_free(struct clip_image_f32_batch  * batch) {
+    if (batch->size > 0) {
+        delete[] batch->data;
+        batch->size = 0;
    }
-    return batch->entries[idx]->ny;
-}
-
-clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx) {
-    if (idx < 0 || idx >= (int)batch->entries.size()) {
-        LOG_ERR("%s: invalid index %d\n", __func__, idx);
-        return nullptr;
-    }
-    return batch->entries[idx].get();
 }

 void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, clip_image_u8 * img) {
@@ -1713,15 +1706,14 @@ static void bilinear_resize(const clip_image_u8& src, clip_image_u8& dst, int ta
 }

 // Normalize image to float32 - careful with pytorch .to(model.device, dtype=torch.float16) - this sometimes reduces precision (32>16>32), sometimes not
-static void normalize_image_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst, const float mean[3], const float std[3]) {
-    dst.nx = src.nx;
-    dst.ny = src.ny;
-    dst.buf.resize(src.buf.size());
+static void normalize_image_u8_to_f32(const clip_image_u8* src, clip_image_f32* dst, const float mean[3], const float std[3]) {
+    dst->nx = src->nx;
+    dst->ny = src->ny;
+    dst->buf.resize(src->buf.size());

-    // TODO @ngxson : seems like this could be done more efficiently on cgraph
-    for (size_t i = 0; i < src.buf.size(); ++i) {
+    for (size_t i = 0; i < src->buf.size(); ++i) {
        int c = i % 3; // rgb
-        dst.buf[i] = (static_cast<float>(src.buf[i]) / 255.0f - mean[c]) / std[c];
+        dst->buf[i] = (static_cast<float>(src->buf[i]) / 255.0f - mean[c]) / std[c];
    }
 }

@@ -1729,7 +1721,7 @@ inline int clip(int x, int lower, int upper) {
    return std::max(lower, std::min(x, upper));
 }

-static bool bicubic_resize(const clip_image_u8 & img, clip_image_u8 & dst, int target_width, int target_height) {
+static bool bicubic_resize(const clip_image_u8 &img, clip_image_u8 &dst, int target_width, int target_height) {
    const int nx = img.nx;
    const int ny = img.ny;

@@ -1867,13 +1859,13 @@ static std::pair<int, int> select_best_resolution(const std::pair<int, int> & or
    return best_fit;
 }

-static std::vector<clip_image_u8_ptr> divide_to_patches_u8(const clip_image_u8 & image, int patch_size) {
-    std::vector<clip_image_u8_ptr> patches;
+static std::vector<clip_image_u8*> divide_to_patches_u8(const clip_image_u8 & image, int patch_size) {
+    std::vector<clip_image_u8*> patches;
    int width = image.nx;
    int height = image.ny;
    for (int i = 0; i < height; i += patch_size) {
        for (int j = 0; j < width; j += patch_size) {
-            clip_image_u8_ptr patch(clip_image_u8_init());
+            clip_image_u8 *patch = clip_image_u8_init();
            patch->nx = std::min(patch_size, width - j);
            patch->ny = std::min(patch_size, height - i);
            patch->buf.resize(3 * patch->nx * patch->ny);
@@ -1884,7 +1876,7 @@ static std::vector<clip_image_u8_ptr> divide_to_patches_u8(const clip_image_u8 &
                    }
                }
            }
-            patches.push_back(std::move(patch));
+            patches.push_back(patch);
        }
    }
    return patches;
@@ -1965,7 +1957,7 @@ static std::pair<int, int> uhd_best_grid(const int max_slice_nums, const int mul
 //    -> https://arxiv.org/pdf/2403.11703
 //    -> https://github.com/thunlp/LLaVA-UHD
 //    -> https://github.com/thunlp/LLaVA-UHD/blob/302301bc2175f7e717fb8548516188e89f649753/llava_uhd/train/llava-uhd/slice_logic.py#L118
-static std::vector<std::vector<clip_image_u8_ptr>> uhd_slice_image(const clip_image_u8 * img, const int max_slice_nums=9, const int scale_resolution=448, const int patch_size=14) {
+static std::vector<std::vector<clip_image_u8 *>> uhd_slice_image(const clip_image_u8 * img, const int max_slice_nums=9, const int scale_resolution=448, const int patch_size=14) {
    const std::pair<int, int> original_size={img->nx,img->ny};
    const int original_width = img->nx;
    const int original_height = img->ny;
@@ -1973,30 +1965,30 @@ static std::vector<std::vector<clip_image_u8_ptr>> uhd_slice_image(const clip_im
    const float ratio = 1.0 * original_width * original_height/ (scale_resolution * scale_resolution);
    const int multiple = fmin(ceil(ratio), max_slice_nums);

-    std::vector<std::vector<clip_image_u8_ptr>> images;
+    std::vector<std::vector<clip_image_u8 *>> images;
    LOG_DBG("%s: multiple %d\n", __func__, multiple);
-    images.push_back(std::vector<clip_image_u8_ptr>());
+    images.push_back(std::vector<clip_image_u8 *>());

    if (multiple <= 1) {
        auto best_size = uhd_find_best_resize(original_size, scale_resolution, patch_size, true);
-        clip_image_u8_ptr source_image(clip_image_u8_init());
+        clip_image_u8 * source_image = clip_image_u8_init();
        bicubic_resize(*img, *source_image, best_size.first, best_size.second);
        // source_image = image.resize(best_size, Image.Resampling.BICUBIC)
-        images.back().push_back(std::move(source_image));
+        images[images.size()-1].push_back(source_image);
    }
    else if (multiple > 1) {
        auto best_size = uhd_find_best_resize(original_size, scale_resolution, patch_size);
-        clip_image_u8_ptr source_image(clip_image_u8_init());
+        clip_image_u8 * source_image = clip_image_u8_init();
        bicubic_resize(*img, *source_image, best_size.first, best_size.second);
        // source_image = image.copy().resize(best_resize, Image.Resampling.BICUBIC)
        LOG_DBG("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img->nx, img->ny, best_size.first, best_size.second);
-        images.back().push_back(std::move(source_image));
+        images[images.size()-1].push_back(source_image);

        std::pair<int, int> best_grid = uhd_best_grid(max_slice_nums, multiple, log_ratio);
        LOG_DBG("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img->nx, img->ny, best_grid.first, best_grid.second);

        auto refine_size = uhd_get_refine_size(original_size, best_grid, scale_resolution, patch_size, true);
-        clip_image_u8_ptr refine_image(clip_image_u8_init());
+        clip_image_u8 * refine_image = clip_image_u8_init();
        bicubic_resize(*img, *refine_image, refine_size.first, refine_size.second);

        LOG_DBG("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, refine_image->nx, refine_image->ny, refine_size.first, refine_size.second);
@@ -2007,9 +1999,9 @@ static std::vector<std::vector<clip_image_u8_ptr>> uhd_slice_image(const clip_im
        int grid_x = int(width / best_grid.first);
        int grid_y = int(height / best_grid.second);
        for (int patches_i = 0, ic = 0; patches_i < height && ic < best_grid.second; patches_i += grid_y, ic += 1){
-            images.push_back(std::vector<clip_image_u8_ptr>());
+            images.push_back(std::vector<clip_image_u8 *>());
            for(int patches_j = 0, jc = 0; patches_j < width && jc < best_grid.first; patches_j += grid_x, jc += 1){
-                clip_image_u8_ptr patch(clip_image_u8_init());
+                clip_image_u8 * patch = clip_image_u8_init();
                patch->nx = grid_x;
                patch->ny = grid_y;
                patch->buf.resize(3 * patch->nx * patch->ny);
@@ -2022,9 +2014,10 @@ static std::vector<std::vector<clip_image_u8_ptr>> uhd_slice_image(const clip_im
                        patch->buf[j+2] = refine_image->buf[i+2];
                    }
                }
-                images.back().push_back(std::move(patch));
+                images[images.size()-1].push_back(patch);
            }
        }
+        clip_image_u8_free(refine_image);
    }
    return images;
 }
@@ -2032,8 +2025,8 @@ static std::vector<std::vector<clip_image_u8_ptr>> uhd_slice_image(const clip_im
 int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip) {
    const int max_slice_nums=9;
    const int scale_resolution=448;
-    const int original_width = ctx_clip->load_image_size.width;
-    const int original_height = ctx_clip->load_image_size.height;
+    const int original_width = ctx_clip->load_image_size->width;
+    const int original_height = ctx_clip->load_image_size->height;
    const float log_ratio = log(1.0*original_width/original_height);
    const float ratio = 1.0 * original_width * original_height/ (scale_resolution * scale_resolution);
    const int multiple = fmin(ceil(ratio), max_slice_nums);
@@ -2043,44 +2036,64 @@ int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip) {

 // returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector
 // res_imgs memory is being allocated here, previous allocations will be freed if found
-bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, struct clip_image_f32_batch * res_imgs) {
+bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch * res_imgs) {

-    if (clip_is_minicpmv(ctx)) {
+    if(clip_is_minicpmv(ctx)){
        int max_slice_nums = 9;
-        std::vector<std::vector<clip_image_u8_ptr>> imgs = uhd_slice_image(img, max_slice_nums);
+        std::vector<std::vector<clip_image_u8 *>> imgs = uhd_slice_image(img, max_slice_nums);
+        res_imgs->size = 0;
+        for (size_t i = 0; i < imgs.size(); ++i){
+            res_imgs->size += imgs[i].size();
+        }
+        res_imgs->data = new clip_image_f32[res_imgs->size];
+        int idx = 0;
        for (size_t i = 0; i < imgs.size(); ++i) {
            for (size_t j = 0; j < imgs[i].size(); ++j) {
                LOG_DBG("%s: %d %d\n", __func__,imgs[i][j]->nx,imgs[i][j]->ny);
-                clip_image_f32_ptr res(clip_image_f32_init());
-                normalize_image_u8_to_f32(*imgs[i][j], *res, ctx->image_mean, ctx->image_std);
-                res_imgs->entries.push_back(std::move(res));
+                clip_image_f32 * res = clip_image_f32_init();
+                normalize_image_u8_to_f32(imgs[i][j], res, ctx->image_mean, ctx->image_std);
+                res_imgs->data[idx++] = *res;
+                clip_image_f32_free(res);
+            }
+        }
+        for (size_t i = 0; i < imgs.size(); ++i) {
+            for (size_t j = 0; j < imgs[i].size(); ++j) {
+                if (imgs[i][j] != nullptr) {
+                    clip_image_u8_free(imgs[i][j]);
+                }
            }
        }
        return true;
    }
    else if (ctx->has_qwen2vl_merger) {
-        clip_image_u8 resized;
-        auto patch_size = clip_get_patch_size(ctx) * 2;
+        clip_image_u8 * resized = clip_image_u8_init();
+        auto patch_size = clip_patch_size(ctx) * 2;
        int nx = ceil((float)img->nx / patch_size) * patch_size;
        int ny = ceil((float)img->ny / patch_size) * patch_size;
-        bicubic_resize(*img, resized, nx, ny);
+        bicubic_resize(*img, *resized, nx, ny);

-        clip_image_f32_ptr img_f32(clip_image_f32_init());
-        // clip_image_f32_ptr res(clip_image_f32_init());
-        normalize_image_u8_to_f32(resized, *img_f32, ctx->image_mean, ctx->image_std);
+        res_imgs->data = new clip_image_f32[1];
+        // clip_image_f32 * res = clip_image_f32_init();
+        normalize_image_u8_to_f32(resized, res_imgs->data, ctx->image_mean, ctx->image_std);
        // res_imgs->data[0] = *res;
-        res_imgs->entries.push_back(std::move(img_f32));
+        res_imgs->size = 1;
+
+        // clip_image_f32_free(res);
+        clip_image_u8_free(resized);
        return true;
    }

    if (ctx->has_glm_projector || ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
+        res_imgs->size = 1;
+        res_imgs->data = new clip_image_f32[res_imgs->size];
        clip_image_u8 resized_image;
        int32_t sz=ctx->vision_model.hparams.image_size;
        bicubic_resize(*img, resized_image,sz,sz);
-        clip_image_f32_ptr img_f32(clip_image_f32_init());
+        clip_image_f32 * res = clip_image_f32_init();
        //clip_image_save_to_bmp(resized_image, "resized.bmp");
-        normalize_image_u8_to_f32(resized_image, *img_f32, ctx->image_mean, ctx->image_std);
-        res_imgs->entries.push_back(std::move(img_f32));
+        normalize_image_u8_to_f32(&resized_image, res, ctx->image_mean, ctx->image_std);
+        res_imgs->data[0] = *res;
+        clip_image_f32_free(res);
        return true;
    }

@@ -2095,12 +2108,16 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
        pad_to_square = false;
    }
    // free the previous res_imgs if any set
-    res_imgs->entries.clear();
+    if (res_imgs->size > 0) {
+        clip_image_f32_batch_free(res_imgs);
+    }
+    res_imgs->data = nullptr;
+    res_imgs->size = 0;

    // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)
    // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156

-    clip_image_u8_ptr temp(clip_image_u8_init()); // we will keep the input image data here temporarily
+    clip_image_u8 * temp = clip_image_u8_init(); // we will keep the input image data here temporarily
    if (pad_to_square && img->nx != img->ny) {
        int longer_side = std::max(img->nx, img->ny);
        temp->nx = longer_side;
@@ -2143,18 +2160,28 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
            //     clip_image_u8_free(temp2);
            // }

-            std::vector<clip_image_u8_ptr> patches = divide_to_patches_u8(*temp, params.image_size); // prepare spatial sorted main patches of image_size each (336 in llava-1.6)
+            std::vector<clip_image_u8 *> patches = divide_to_patches_u8(*temp, params.image_size); // prepare spatial sorted main patches of image_size each (336 in llava-1.6)

-            clip_image_u8_ptr image_original_resize(clip_image_u8_init());
+            clip_image_u8 *image_original_resize = clip_image_u8_init();
            // bilinear_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square
            bicubic_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square
-            patches.insert(patches.begin(), std::move(image_original_resize));
-            for (auto & patch : patches) {
-                clip_image_f32_ptr res(clip_image_f32_init());
-                normalize_image_u8_to_f32(*patch, *res, ctx->image_mean, ctx->image_std);
-                res_imgs->entries.push_back(std::move(res));
+            patches.insert(patches.begin(), image_original_resize);
+            // clip_image_f32_batch_init(patches.size());
+            res_imgs->size = patches.size();
+            res_imgs->data = new clip_image_f32[res_imgs->size];
+            int num=0;
+            for (auto& patch : patches) {
+                normalize_image_u8_to_f32(patch, &res_imgs->data[num], ctx->image_mean, ctx->image_std);
+                num++;
            }

+            for (size_t i = 0; i < patches.size(); i++) {
+                // LOG_DBG("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny);
+                clip_image_u8_free(patches[i]);
+            }
+
+            clip_image_u8_free(temp);
+
            return true;
        } else {
            temp->nx = img->nx;
@@ -2170,7 +2197,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str

    const int nx2 = ctx->vision_model.hparams.image_size;
    const int ny2 = ctx->vision_model.hparams.image_size;
-    clip_image_f32_ptr res(clip_image_f32_init());
+    clip_image_f32 * res = clip_image_f32_init();
    res->nx = nx2;
    res->ny = ny2;
    res->buf.resize(3 * nx2 * ny2);
@@ -2222,6 +2249,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
            }
        }
    }
+    clip_image_u8_free(temp);

    // {
    //     clip_image_u8 * temp2 = clip_image_u8_init();
@@ -2231,7 +2259,10 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
    // }
    // res_imgs.push_back(res);

-    res_imgs->entries.push_back(std::move(res));
+    res_imgs->size = 1;
+    res_imgs->data = new clip_image_f32[res_imgs->size];
+    res_imgs->data[0] = *res;
+    clip_image_f32_free(res);

    return true;
 }
@@ -2259,15 +2290,15 @@ size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w
    return clip_n_patches_by_img(ctx, &img) * clip_n_mmproj_embd(ctx) * sizeof(float);
 }

-int32_t clip_get_image_size(const struct clip_ctx * ctx) {
+int32_t clip_image_size(const struct clip_ctx * ctx) {
    return ctx->vision_model.hparams.image_size;
 }

-int32_t clip_get_patch_size(const struct clip_ctx * ctx) {
+int32_t clip_patch_size(const struct clip_ctx * ctx) {
    return ctx->vision_model.hparams.patch_size;
 }

-int32_t clip_get_hidden_size(const struct clip_ctx * ctx) {
+int32_t clip_hidden_size(const struct clip_ctx * ctx) {
    return ctx->vision_model.hparams.hidden_size;
 }

@@ -2315,8 +2346,6 @@ int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * i
        int x_patch = img->nx / patch_size + (int)(img->nx % patch_size > 0);
        int y_patch = img->ny / patch_size + (int)(img->ny % patch_size > 0);
        n_patches = x_patch * y_patch;
-    } else if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
-        n_patches = 256;
    }

    return n_patches;
@@ -2414,23 +2443,19 @@ bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f3
        return false;
    }

-    clip_image_f32_batch imgs;
-    clip_image_f32_ptr img_copy(clip_image_f32_init());
-    *img_copy = *img;
-    imgs.entries.push_back(std::move(img_copy));
-
+    clip_image_f32_batch imgs{};
+    imgs.size = 1;
+    imgs.data = img;
    return clip_image_batch_encode(ctx, n_threads, &imgs, vec);
 }

-bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, float * vec) {
-    const clip_image_f32_batch & imgs = *imgs_c_ptr;
-
+bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) {
    if (!ctx->has_vision_encoder) {
        LOG_ERR("%s: This gguf file seems to have no vision encoder\n", __func__);
        return false;
    }

-    int batch_size = imgs.entries.size();
+    int batch_size = imgs->size;
    if (ctx->has_llava_projector) {
        GGML_ASSERT(batch_size == 1); // TODO: support multiple images
    }
@@ -2457,22 +2482,25 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
    int image_size_width  = image_size;
    int image_size_height = image_size;
    if (ctx->has_minicpmv_projector | ctx->has_qwen2vl_merger) {
-        image_size_width  = imgs.entries[0]->nx;
-        image_size_height = imgs.entries[0]->ny;
+        image_size_width  = imgs->data[0].nx;
+        image_size_height = imgs->data[0].ny;
    }
    const int patch_size    = hparams.patch_size;
    const int num_patches   = ((image_size_width / patch_size) * (image_size_height / patch_size));
    const int num_positions = num_patches + (model.class_embedding ? 1 : 0);
-    const int pos_w = ctx->load_image_size.width / patch_size;
-    const int pos_h = ctx->load_image_size.height / patch_size;
+    if(ctx->load_image_size==nullptr){
+        ctx->load_image_size= clip_image_size_init();
+    }
+    const int pos_w = ctx->load_image_size->width/patch_size;
+    const int pos_h = ctx->load_image_size->height/patch_size;

    {
        struct ggml_tensor * inp_raw = ggml_graph_get_tensor(gf, "inp_raw");
        float * data = (float *)malloc(ggml_nbytes(inp_raw));

-        for (size_t i = 0; i < imgs.entries.size(); i++) {
-            const int nx = imgs.entries[i]->nx;
-            const int ny = imgs.entries[i]->ny;
+        for (size_t i = 0; i < imgs->size; i++) {
+            const int nx = imgs->data[i].nx;
+            const int ny = imgs->data[i].ny;
            if (!(ctx->has_minicpmv_projector | ctx->has_qwen2vl_merger)) {
                GGML_ASSERT(nx == image_size && ny == image_size);
            }
@@ -2483,7 +2511,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
                for (int k = 0; k < 3; k++) {
                    for (int y = 0; y < ny; y++) {
                        for (int x = 0; x < nx; x++) {
-                            data[(b * 3 * n) + k * n + y * nx + x] = imgs.entries[b]->buf[3 * (y * nx + x) + k];
+                            data[(b * 3 * n) + k * n + y * nx + x] = imgs->data[b].buf[3 * (y * nx + x) + k];
                        }
                    }
                }
@@ -2610,7 +2638,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
        }
    }

-    ggml_backend_cpu_set_n_threads(ctx->backend_cpu.get(), n_threads);
+    ggml_backend_cpu_set_n_threads(ctx->backend_cpu, n_threads);

    auto status = ggml_backend_sched_graph_compute(ctx->sched.get(), gf);
    if (status != GGML_STATUS_SUCCESS) {
@@ -2643,8 +2671,8 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
        /* verbosity */ GGML_LOG_LEVEL_ERROR,
    });

-    const auto & ctx_src = ctx_clip->ctx_gguf.get();
-    const auto & ctx_data = ctx_clip->ctx_data.get();
+    const auto & ctx_src = ctx_clip->ctx_gguf;
+    const auto & ctx_data = ctx_clip->ctx_data;

    auto * ctx_out = gguf_init_empty();
    gguf_set_kv(ctx_out, ctx_src);
@@ -2865,11 +2893,3 @@ bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img,
    clip_image_encode(ctx, n_threads, &clip_img, vec);
    return true;
 }
-
-//
-// API used internally with mtmd
-//
-
-projector_type clip_get_projector_type(const struct clip_ctx * ctx) {
-    return ctx->proj_type;
-}
--- a/examples/llava/clip.h
+++ b/examples/llava/clip.h
@@ -30,8 +30,15 @@ struct clip_image_size {
    int height;
 };

-struct clip_image_u8_batch;
-struct clip_image_f32_batch;
+struct clip_image_u8_batch {
+    struct clip_image_u8 * data; // clip_image_u8_batch_free calls delete[] on this when size > 0
+    size_t size;
+};
+
+struct clip_image_f32_batch {
+    struct clip_image_f32 * data; // array of `size` images; clip_image_f32_batch_free calls delete[] on it when size > 0
+    size_t size; // number of elements in `data`
+};

 struct clip_context_params {
    bool use_gpu;
@@ -48,9 +55,9 @@ CLIP_API void clip_free(struct clip_ctx * ctx);
 CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx);
 CLIP_API size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w);

-CLIP_API int32_t clip_get_image_size (const struct clip_ctx * ctx);
-CLIP_API int32_t clip_get_patch_size (const struct clip_ctx * ctx);
-CLIP_API int32_t clip_get_hidden_size(const struct clip_ctx * ctx);
+CLIP_API int32_t clip_image_size (const struct clip_ctx * ctx);
+CLIP_API int32_t clip_patch_size (const struct clip_ctx * ctx);
+CLIP_API int32_t clip_hidden_size(const struct clip_ctx * ctx);

 // TODO: should be enum, not string
 CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx);
@@ -66,13 +73,9 @@ CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip);
 CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size);
 CLIP_API struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip);

-CLIP_API struct clip_image_size      * clip_image_size_init();
-CLIP_API struct clip_image_u8        * clip_image_u8_init ();
-CLIP_API struct clip_image_f32       * clip_image_f32_init();
-CLIP_API struct clip_image_f32_batch * clip_image_f32_batch_init(); // only used by libllava
-
-// nx, ny are the output image dimensions
-CLIP_API unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny);
+CLIP_API struct clip_image_size * clip_image_size_init();
+CLIP_API struct clip_image_u8  * clip_image_u8_init ();
+CLIP_API struct clip_image_f32 * clip_image_f32_init();

 CLIP_API void clip_image_size_free (struct clip_image_size * img_size);
 CLIP_API void clip_image_u8_free (struct clip_image_u8  * img);
@@ -80,12 +83,6 @@ CLIP_API void clip_image_f32_free(struct clip_image_f32 * img);
 CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch  * batch);
 CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch * batch);

-// use for accessing underlay data of clip_image_f32_batch
-CLIP_API size_t clip_image_f32_batch_n_images(const struct clip_image_f32_batch * batch); // equivalent to batch->size()
-CLIP_API size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->nx
-CLIP_API size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->ny
-CLIP_API clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->data
-
 /**
 * Build image from pixels decoded by other libraries instead of stb_image.h for better performance.
 * The memory layout is RGBRGBRGB..., input buffer length must be 3*nx*ny bytes
--- a/examples/llava/gemma3-cli.cpp
+++ b/examples/llava/gemma3-cli.cpp
@@ -2,11 +2,11 @@
 #include "log.h"
 #include "common.h"
 #include "sampling.h"
+#include "clip.h"
+#include "stb_image.h"
 #include "llama.h"
 #include "ggml.h"
 #include "console.h"
-#include "chat.h"
-#include "mtmd.h"

 #include <vector>
 #include <limits.h>
@@ -57,18 +57,13 @@ static void sigint_handler(int signo) {
 #endif

 struct gemma3_context {
-    mtmd_context_ptr ctx_vision;
-    common_init_result llama_init;
+    struct clip_ctx    * ctx_clip = NULL;
+    common_init_result   llama_init;

    llama_model       * model;
    llama_context     * lctx;
    const llama_vocab * vocab;
    llama_batch         batch;
-    int                 n_batch;
-
-    // note: we know that gemma3 template is "linear", meaning each turn is completely separated to another
-    // so here we don't need to keep track of chat history
-    common_chat_templates_ptr tmpls;

    int n_threads    = 1;
    llama_pos n_past = 0;
@@ -79,24 +74,21 @@ struct gemma3_context {
        vocab = llama_model_get_vocab(model);
        n_threads = params.cpuparams.n_threads;
        batch = llama_batch_init(params.n_batch, 0, 1);
-        n_batch = params.n_batch;
-        tmpls = common_chat_templates_init(model, params.chat_template);
-        init_vision_context(params);
+        init_clip_model(params);
    }

-    void init_vision_context(common_params & params) {
+    void init_clip_model(common_params & params) {
        const char * clip_path = params.mmproj.path.c_str();
-        ctx_vision.reset(mtmd_init_from_file(clip_path, model, mtmd_context_params{
-            /* use_gpu */   true,
-            /* timings */   true,
-            /* n_threads */ params.cpuparams.n_threads,
-            /* verbosity */ GGML_LOG_LEVEL_INFO,
-        }));
-        if (!ctx_vision.get()) {
-            LOG_ERR("Failed to load vision model from %s\n", clip_path);
+        ctx_clip = clip_model_load(clip_path, GGML_LOG_LEVEL_INFO);
+        if (!ctx_clip) {
+            LOG_ERR("Failed to load CLIP model from %s\n", clip_path);
            exit(1);
        }
    }
+
+    ~gemma3_context() {
+        clip_free(ctx_clip);
+    }
 };

 struct decode_embd_batch {
@@ -132,6 +124,77 @@ struct decode_embd_batch {
    }
 };

+// Tokenize `input` (add_special=false, parse_special=true) and decode it starting at ctx.n_past; returns 0 on success, 1 on failure.
+static int eval_text(gemma3_context & ctx, std::string input, bool logits_last = false) {
+    llama_tokens tokens = common_tokenize(ctx.lctx, input, false, true);
+    common_batch_clear(ctx.batch);
+    for (llama_token & t : tokens) {
+        common_batch_add(ctx.batch, t, ctx.n_past++, {0}, false);
+    }
+    if (logits_last && ctx.batch.n_tokens > 0) { // guard: with no tokens the index below would be -1
+        ctx.batch.logits[ctx.batch.n_tokens - 1] = true;
+    }
+    if (llama_decode(ctx.lctx, ctx.batch)) {
+        LOG_ERR("Failed to decode text\n");
+        return 1;
+    }
+    return 0;
+}
+
+// Encode image `fname` with CLIP and decode its embeddings; returns 0 on success, 2 if the image cannot be loaded, 1 on other errors.
+static int eval_image(gemma3_context & ctx, std::string & fname) {
+    std::vector<float> image_embd_v;
+    int n_embd = llama_model_n_embd(ctx.model);
+    int n_tokens = 256;
+    image_embd_v.resize(n_tokens * n_embd);
+
+    struct clip_image_u8 * img_u8 = clip_image_u8_init();
+    if (!clip_image_load_from_file(fname.c_str(), img_u8)) {
+        LOG_ERR("Unable to load image %s\n", fname.c_str());
+        clip_image_u8_free(img_u8);
+        return 2; // non-fatal error
+    }
+
+    clip_image_f32_batch batch_f32{}; // zero-init: preprocess and batch_free read `size` before writing it
+    bool ok = clip_image_preprocess(ctx.ctx_clip, img_u8, &batch_f32);
+    clip_image_u8_free(img_u8); // the u8 source is not used after preprocessing
+    if (!ok) {
+        LOG_ERR("Unable to preprocess image\n");
+        clip_image_f32_batch_free(&batch_f32);
+        return 1;
+    }
+
+    int64_t t0 = ggml_time_ms();
+    LOG("Encoding image %s\n", fname.c_str());
+    ok = clip_image_batch_encode(ctx.ctx_clip, ctx.n_threads, &batch_f32, image_embd_v.data());
+    clip_image_f32_batch_free(&batch_f32); // the encoder output was written to image_embd_v
+    if (!ok) {
+        LOG_ERR("Unable to encode image\n");
+        return 1;
+    }
+    LOG("Image encoded in %" PRId64 " ms\n", ggml_time_ms() - t0);
+
+    // decode image embeddings
+    int64_t t1 = ggml_time_ms();
+    if (eval_text(ctx, "<start_of_image>")) {
+        return 1;
+    }
+    llama_set_causal_attn(ctx.lctx, false);
+    decode_embd_batch batch_img(image_embd_v.data(), n_tokens, ctx.n_past, 0);
+    const int decode_res = llama_decode(ctx.lctx, batch_img.batch);
+    llama_set_causal_attn(ctx.lctx, true); // restore causal attention even when decoding failed
+    if (decode_res) {
+        LOG_ERR("failed to decode image\n");
+        return 1;
+    }
+    ctx.n_past += n_tokens;
+    if (eval_text(ctx, "<end_of_image>")) {
+        return 1;
+    }
+    LOG("Image decoded in %" PRId64 " ms\n", ggml_time_ms() - t1);
+    return 0;
+}
+
 static int generate_response(gemma3_context & ctx, common_sampler * smpl, int n_predict) {
    for (int i = 0; i < n_predict; i++) {
        if (i > n_predict || !g_is_generating) {
@@ -161,45 +224,6 @@ static int generate_response(gemma3_context & ctx, common_sampler * smpl, int n_
    return 0;
 }

-static int eval_message(gemma3_context & ctx, common_chat_msg & msg, std::vector<std::string> & images_fname, bool add_bos = false) {
-    std::vector<mtmd_bitmap> bitmaps;
-
-    common_chat_templates_inputs tmpl_inputs;
-    tmpl_inputs.messages = {msg};
-    tmpl_inputs.add_generation_prompt = true;
-    tmpl_inputs.use_jinja = false; // jinja is buggy here
-    auto formatted_chat = common_chat_templates_apply(ctx.tmpls.get(), tmpl_inputs);
-    LOG_DBG("formatted_chat.prompt: %s\n", formatted_chat.prompt.c_str());
-
-    for (auto & fname : images_fname) {
-        mtmd_bitmap bitmap;
-        if (mtmd_helper_bitmap_init_from_file(fname.c_str(), bitmap)) {
-            LOG_ERR("Unable to load image %s\n", fname.c_str());
-            return 2; // image not found
-        }
-        bitmaps.push_back(std::move(bitmap));
-    }
-
-    mtmd_input_text text;
-    text.text          = formatted_chat.prompt;
-    text.add_special   = add_bos;
-    text.parse_special = true;
-    mtmd_input_chunks_ptr chunks(mtmd_tokenize(ctx.ctx_vision.get(), text, bitmaps));
-    if (chunks == nullptr) {
-        LOG_ERR("Unable to tokenize prompt\n");
-        return 1;
-    }
-
-    if (mtmd_helper_eval(ctx.ctx_vision.get(), ctx.lctx, chunks.get(), ctx.n_past, 0, ctx.n_batch)) {
-        LOG_ERR("Unable to eval prompt\n");
-        return 1;
-    }
-
-    ctx.n_past += mtmd_helper_get_n_tokens(chunks.get());
-
-    return 0;
-}
-
 int main(int argc, char ** argv) {
    ggml_time_init();

@@ -241,15 +265,21 @@ int main(int argc, char ** argv) {
 #endif
    }

+    if (eval_text(ctx, "<bos>")) {
+        return 1;
+    }
+
    if (is_single_turn) {
        g_is_generating = true;
-        if (params.prompt.find("<__image__>") == std::string::npos) {
-            params.prompt += " <__image__>";
+        if (eval_text(ctx, "<start_of_turn>user\n")) {
+            return 1;
        }
-        common_chat_msg msg;
-        msg.role = "user";
-        msg.content = params.prompt;
-        if (eval_message(ctx, msg, params.image, true)) {
+        for (auto & fname : params.image) {
+            if (eval_image(ctx, fname)) {
+                return 1;
+            }
+        }
+        if (eval_text(ctx, params.prompt + "<end_of_turn><start_of_turn>model\n", true)) {
            return 1;
        }
        if (generate_response(ctx, smpl, n_predict)) {
@@ -263,9 +293,9 @@ int main(int argc, char ** argv) {
        LOG("\n   /quit or /exit   exit the program");
        LOG("\n");

-        bool is_first_msg = true;
-        std::vector<std::string> images_fname;
-        std::string content;
+        if (eval_text(ctx, "<start_of_turn>user\n")) {
+            return 1;
+        }

        while (true) {
            g_is_generating = false;
@@ -290,31 +320,26 @@ int main(int argc, char ** argv) {
            g_is_generating = true;
            if (line.find("/image") == 0) {
-                std::string image = line.substr(7);
-                images_fname.push_back(string_strip(image));
-                content += "<__image__>";
-                continue;
-            } else {
-                content += line;
-            }
-            common_chat_msg msg;
-            msg.role = "user";
-            msg.content = content;
-            int ret = eval_message(ctx, msg, images_fname, is_first_msg);
-            if (ret == 2) {
-                // non-fatal error
-                images_fname.clear();
-                content.clear();
+                // a bare "/image" is shorter than the 7-char "/image " prefix, and
+                // substr(7) would throw std::out_of_range; use an empty name instead
+                std::string image = line.size() > 7 ? line.substr(7) : std::string();
+                int res = eval_image(ctx, image);
+                if (res == 2) {
+                    continue; // image not found
+                }
+                if (res) {
+                    return 1;
+                }
                continue;
            }
-            if (ret) {
+            if (eval_text(ctx, line + "<end_of_turn><start_of_turn>model\n", true)) {
                return 1;
            }
            if (generate_response(ctx, smpl, n_predict)) {
                return 1;
            }
-            images_fname.clear();
-            content.clear();
-            is_first_msg = false;
+            if (eval_text(ctx, "<end_of_turn><start_of_turn>user\n")) {
+                return 1;
+            }
        }
    }

--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -10,7 +10,6 @@
 #include <cstring>
 #include <limits>
 #include <vector>
-#include <memory>

 #if defined(LLAVA_LOG_OFF)
 #   define LOG_INF(...)
@@ -46,17 +45,6 @@ struct clip_image_grid_shape {
    int second;
 };

-// convenience cpp wrapper
-struct clip_image_f32_batch_deleter {
-    void operator()(clip_image_f32_batch * val) { clip_image_f32_batch_free(val); }
-};
-typedef std::unique_ptr<clip_image_f32_batch, clip_image_f32_batch_deleter> clip_image_f32_batch_ptr;
-
-struct clip_image_size_deleter {
-    void operator()(clip_image_f32_batch * val) { clip_image_f32_batch_free(val); }
-};
-typedef std::unique_ptr<clip_image_size, clip_image_size_deleter> clip_image_size_ptr;
-
 /**
 * Selects the best resolution from a list of possible resolutions based on the original size.
 *
@@ -117,8 +105,8 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
        struct ggml_context * ctx;
    } model;

-    const int32_t image_size = clip_get_image_size(ctx_clip);
-    const int32_t patch_size = clip_get_patch_size(ctx_clip);
+    const int32_t image_size = clip_image_size(ctx_clip);
+    const int32_t patch_size = clip_patch_size(ctx_clip);

    int32_t num_patches_per_side = image_size / patch_size; // 336 / 14 = 24 - used for embedding-patching boxes (24*24 = 576 patches)

@@ -258,9 +246,12 @@ static clip_image_f32 * reshape_by_patch(clip_image_f32 * image, int patch_size)

 static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float * image_embd, int * n_img_pos) {
    // std::vector<clip_image_f32*> img_res_v; // format VectN x H x W x RGB (N x 336 x 336 x 3), so interleaved RGB - different to the python implementation which is N x 3 x 336 x 336
-    clip_image_f32_batch_ptr img_res_v(clip_image_f32_batch_init());
-    if (!clip_image_preprocess(ctx_clip, img, img_res_v.get())) {
+    clip_image_f32_batch img_res_v;
+    img_res_v.size = 0;
+    img_res_v.data = nullptr;
+    if (!clip_image_preprocess(ctx_clip, img, &img_res_v)) {
        LOG_ERR("%s: unable to preprocess image\n", __func__);
+        delete[] img_res_v.data;
        return false;
    }

@@ -268,72 +259,71 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli

    const char * mm_patch_merge_type = clip_patch_merge_type(ctx_clip);

-    const size_t n_imgs = clip_image_f32_batch_n_images(img_res_v.get());
-
    if (clip_is_minicpmv(ctx_clip) || clip_is_qwen2vl(ctx_clip)) {
        std::vector<float *> image_embd_v;
-        image_embd_v.resize(n_imgs);
-        clip_image_size load_image_size;
+        image_embd_v.resize(img_res_v.size);
+        struct clip_image_size * load_image_size = clip_image_size_init();

-        for (size_t i = 0; i < n_imgs; i++) {
+        for (size_t i = 0; i < img_res_v.size; i++) {
            const int64_t t_img_enc_step_start_us = ggml_time_us();
-            int nx = clip_image_f32_batch_nx(img_res_v.get(), i);
-            int ny = clip_image_f32_batch_ny(img_res_v.get(), i);
-            image_embd_v[i] = (float *)malloc(clip_embd_nbytes_by_img(ctx_clip, nx, ny));
-            int patch_size = 14;
-            load_image_size.width = nx;
-            load_image_size.height = ny;
-            clip_add_load_image_size(ctx_clip, &load_image_size);
+            image_embd_v[i] = (float *)malloc(clip_embd_nbytes_by_img(ctx_clip, img_res_v.data[i].nx, img_res_v.data[i].ny));
+            int patch_size=14;
+            load_image_size->width = img_res_v.data[i].nx;
+            load_image_size->height = img_res_v.data[i].ny;
+            clip_add_load_image_size(ctx_clip, load_image_size);

            bool encoded = false;
-            clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), i);
            if (clip_is_qwen2vl(ctx_clip)) {
-                encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd_v[i]);
+                encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
            }
            else {
-                encoded = clip_image_encode(ctx_clip, n_threads, reshape_by_patch(img_res, patch_size), image_embd_v[i]);
+                encoded = clip_image_encode(ctx_clip, n_threads, reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]);
            }

            if (!encoded) {
-                LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) n_imgs);
+                LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
+                // img_res_v is no longer an owning pointer: release the per-image embedding
+                // buffers (unallocated slots are nullptr, free() ignores them) and the batch
+                for (size_t j = 0; j < image_embd_v.size(); j++) {
+                    free(image_embd_v[j]);
+                }
+                delete[] img_res_v.data;
                return false;
            }
            const int64_t t_img_enc_steop_batch_us = ggml_time_us();
-            LOG_INF("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)n_imgs, (t_img_enc_steop_batch_us - t_img_enc_step_start_us) / 1000.0);
+            LOG_INF("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)img_res_v.size, (t_img_enc_steop_batch_us - t_img_enc_step_start_us) / 1000.0);
        }
        const int64_t t_img_enc_batch_us = ggml_time_us();
-        LOG_INF("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)n_imgs, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
+        LOG_INF("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);

        int n_img_pos_out = 0;
        for (size_t i = 0; i < image_embd_v.size(); i++) {
-            int nx = clip_image_f32_batch_nx(img_res_v.get(), i);
-            int ny = clip_image_f32_batch_ny(img_res_v.get(), i);
-            clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), i);
            std::memcpy(
                image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip),
                image_embd_v[i],
-                clip_embd_nbytes_by_img(ctx_clip, nx, ny));
-            n_img_pos_out += clip_n_patches_by_img(ctx_clip, img_res);
+                clip_embd_nbytes_by_img(ctx_clip, img_res_v.data[i].nx, img_res_v.data[i].ny));
+            n_img_pos_out += clip_n_patches_by_img(ctx_clip, &img_res_v.data[i]);
        }
        *n_img_pos = n_img_pos_out;
        for (size_t i = 0; i < image_embd_v.size(); i++) {
            free(image_embd_v[i]);
        }
        image_embd_v.clear();
-        load_image_size.width = img->nx;
-        load_image_size.height = img->ny;
-        clip_add_load_image_size(ctx_clip, &load_image_size);
-        LOG_INF("%s: load_image_size %d %d\n", __func__, load_image_size.width, load_image_size.height);
+        load_image_size->width = img->nx;
+        load_image_size->height = img->ny;
+        clip_add_load_image_size(ctx_clip, load_image_size);
+        LOG_INF("%s: load_image_size %d %d\n", __func__, load_image_size->width, load_image_size->height);
+        delete[] img_res_v.data;
+        img_res_v.size = 0;
+        img_res_v.data = nullptr;
    }
    else if (clip_is_glm(ctx_clip)){
        struct clip_image_size * load_image_size = clip_image_size_init();
-        load_image_size->width  = clip_image_f32_batch_nx(img_res_v.get(), 0);
-        load_image_size->height = clip_image_f32_batch_ny(img_res_v.get(), 0);
+        load_image_size->width = img_res_v.data[0].nx;
+        load_image_size->height = img_res_v.data[0].ny;
        clip_add_load_image_size(ctx_clip, load_image_size);

-        clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), 0);
-        bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd);
-        int pos = int(load_image_size->width/clip_get_patch_size(ctx_clip)/2);
+        bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd);
+        int pos = int(load_image_size->width/clip_patch_size(ctx_clip)/2);
        *n_img_pos = (pos * pos + 2);
        if (!encoded){
            LOG_ERR("Unable to encode image \n");
@@ -343,8 +328,8 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
    else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
        // flat / default llava-1.5 type embedding
        *n_img_pos = clip_n_patches(ctx_clip);
-        clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), 0);
-        bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd); // image_embd shape is 576 x 4096
+        bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); // image_embd shape is 576 x 4096
+        delete[] img_res_v.data;
        if (!encoded) {
            LOG_ERR("Unable to encode image\n");

@@ -355,18 +340,22 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
        // spatial_unpad llava-1.6 type embedding
        // TODO: CLIP needs batching support - in HF the llm projection is separate after encoding, which might be a solution to quickly get batching working
        std::vector<float *> image_embd_v;
-        image_embd_v.resize(n_imgs);
-        for (size_t i = 0; i < n_imgs; i++) {
-            clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), i);
+        image_embd_v.resize(img_res_v.size);
+        for (size_t i = 0; i < img_res_v.size; i++) {
            image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184
-            const bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside
+            const bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside
            if (!encoded) {
-                LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) n_imgs);
+                LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
+                // img_res_v is no longer an owning pointer: release the per-image embedding
+                // buffers (unallocated slots are nullptr, free() ignores them) and the batch
+                for (size_t j = 0; j < image_embd_v.size(); j++) {
+                    free(image_embd_v[j]);
+                }
+                delete[] img_res_v.data;
                return false;
            }
        }
        const int64_t t_img_enc_batch_us = ggml_time_us();
-        LOG_INF("%s: %d segments encoded in %8.2f ms\n", __func__, (int)n_imgs, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
+        LOG_INF("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);

        const int32_t * image_grid = clip_image_grid(ctx_clip);
        const size_t num_gridpoints = get_clip_image_grid_size(ctx_clip);
@@ -376,7 +360,12 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
            grid_pinpoints.push_back({image_grid[i], image_grid[i+1]});
        }

-        const int32_t image_size = clip_get_image_size(ctx_clip);
+        // free all img_res_v - not needed anymore
+        delete[] img_res_v.data;
+        img_res_v.size = 0;
+        img_res_v.data = nullptr;
+
+        const int32_t image_size = clip_image_size(ctx_clip);

        struct clip_image_grid_shape grid_shape = get_anyres_image_grid_shape({img->nx,img->ny}, grid_pinpoints, image_size);

--- a/examples/llava/mtmd.cpp
+++ b/examples/llava/mtmd.cpp
@@ -1,341 +0,0 @@
-#include "clip.h"
-#include "clip-impl.h"
-#include "mtmd.h"
-
-#include "llama.h"
-
-#include <algorithm>
-#include <cerrno>
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include <limits>
-#include <vector>
-
-struct mtmd_context {
-    struct clip_ctx * ctx_clip;
-    const struct llama_model * text_model;
-    std::vector<float> image_embd_v; // image embedding vector
-    bool print_timings;
-    int n_threads;
-    std::string image_marker;
-
-    // TODO @ngxson : add timings
-
-    mtmd_context(const char * mmproj_fname,
-                   const llama_model * text_model,
-                   const mtmd_context_params & ctx_params) : print_timings(ctx_params.print_timings), n_threads(ctx_params.n_threads), image_marker(ctx_params.image_marker) {
-        clip_context_params ctx_clip_params;
-        ctx_clip_params.use_gpu   = ctx_params.use_gpu;
-        ctx_clip_params.verbosity = ctx_params.verbosity;
-        ctx_clip = clip_init(mmproj_fname, ctx_clip_params);
-        if (!ctx_clip) {
-            throw std::runtime_error(string_format("Failed to load CLIP model from %s\n", mmproj_fname));
-        }
-        this->text_model = text_model;
-    }
-
-    ~mtmd_context() {
-        clip_free(ctx_clip);
-    }
-};
-
-struct mtmd_image_tokens_data {
-    clip_image_f32_batch batch_f32; // preprocessed image patches
-};
-
-struct mtmd_image_tokens {
-    uint32_t nx; // number of tokens in x direction
-    uint32_t ny; // number of tokens in y direction
-    uint32_t n_tokens() const { return nx * ny; }
-    clip_image_f32_batch batch_f32; // preprocessed image patches
-};
-
-mtmd_context * mtmd_init_from_file(const char * mmproj_fname,
-        const struct llama_model * text_model,
-        const struct mtmd_context_params ctx_params) {
-    try {
-        return new mtmd_context(mmproj_fname, text_model, ctx_params);
-    } catch (const std::exception & e) {
-        LOG_ERR("%s: error: %s\n", __func__, e.what());
-        return nullptr;
-    }
-}
-
-void mtmd_free(mtmd_context * ctx) {
-    if (ctx) {
-        delete ctx;
-    }
-}
-
-// copied from common_tokenize
-static std::vector<llama_token> mtmd_tokenize_text_internal(
-    const struct llama_vocab * vocab,
-           const std::string & text,
-                        bool   add_special,
-                        bool   parse_special) {
-    // upper limit for the number of tokens
-    int n_tokens = text.length() + 2 * add_special;
-    std::vector<llama_token> result(n_tokens);
-    n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
-    if (n_tokens < 0) {
-        result.resize(-n_tokens);
-        int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
-        GGML_ASSERT(check == -n_tokens);
-    } else {
-        result.resize(n_tokens);
-    }
-    return result;
-}
-
-mtmd_input_chunks * mtmd_tokenize(mtmd_context * ctx,
-                                const mtmd_input_text & text,
-                                const std::vector<mtmd_bitmap> & bitmaps) {
-    mtmd_input_chunks * output = new mtmd_input_chunks;
-    auto vocab = llama_model_get_vocab(ctx->text_model);
-
-    std::string prompt_modified(text.text);
-    std::string marker_modified(ctx->image_marker);
-    projector_type proj_type = clip_get_projector_type(ctx->ctx_clip);
-    // a bit hacky here, but works for now
-    // for some models, we need to add prefix and suffix to the image embeddings
-    if (proj_type == PROJECTOR_TYPE_GEMMA3) {
-        // <start_of_image> ... (image embeddings) ... <end_of_image>
-        marker_modified = "<start_of_image>" + ctx->image_marker + "<end_of_image>";
-        string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
-    }
-
-    std::vector<std::string> parts = string_split_str(text.text, ctx->image_marker);
-    output->clear();
-    output->reserve(parts.size());
-
-    size_t i_img = 0;
-
-    for (const auto & part : parts) {
-        //printf("tokenizing part: %s\n", part.c_str());
-        bool add_bos = &parts.front() == &part;
-        auto tokens = mtmd_tokenize_text_internal(vocab, part, text.add_special && add_bos, text.parse_special);
-        if (tokens.empty()) {
-            continue;
-        }
-        mtmd_input_chunk chunk{
-            MTMD_INPUT_CHUNK_TYPE_TEXT,
-            std::move(tokens),
-            {},
-        };
-        output->emplace_back(std::move(chunk));
-
-        if (&parts.back() != &part) {
-            // add image token to middle of 2 parts
-
-            if (i_img >= bitmaps.size()) {
-                LOG_ERR("%s: error: not enough images for %d parts\n", __func__, (int)parts.size());
-                return nullptr;
-            }
-
-            // shim layer
-            clip_image_u8_ptr img_u8(clip_image_u8_init());
-            img_u8->nx = bitmaps[i_img].nx;
-            img_u8->ny = bitmaps[i_img].ny;
-            img_u8->buf.resize(bitmaps[i_img].data.size());
-            std::memcpy(img_u8->buf.data(), bitmaps[i_img].data.data(), img_u8->nx * img_u8->ny * 3);
-
-            // preprocess image
-            clip_image_f32_batch batch_f32;
-            bool ok = clip_image_preprocess(ctx->ctx_clip, img_u8.get(), &batch_f32);
-            if (!ok) {
-                LOG_ERR("Unable to preprocess image\n");
-                return nullptr;
-            }
-
-            mtmd_image_tokens * image_tokens = new mtmd_image_tokens;
-            image_tokens->nx = clip_n_patches(ctx->ctx_clip); // TODO @ngxson : use clip_n_patches_by_image
-            image_tokens->ny = 1; // TODO
-            image_tokens->batch_f32 = std::move(batch_f32);
-
-            mtmd_input_chunk chunk{
-                MTMD_INPUT_CHUNK_TYPE_IMAGE,
-                {},
-                image_tokens,
-            };
-            output->emplace_back(std::move(chunk));
-            i_img++;
-        }
-    }
-
-    return output;
-}
-
-void mtmd_input_chunks_free(mtmd_input_chunks * chunks) {
-    for (auto & chunk : *chunks) {
-        if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE && chunk.tokens_image) {
-            delete chunk.tokens_image;
-        }
-    }
-    delete chunks;
-}
-
-int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) {
-    int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip);
-    ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd);
-    bool ok = clip_image_batch_encode(
-        ctx->ctx_clip,
-        ctx->n_threads,
-        &image_tokens->batch_f32,
-        ctx->image_embd_v.data());
-    return ok ? 0 : 1;
-}
-
-float * mtmd_get_output_embd(mtmd_context * ctx) {
-    return ctx->image_embd_v.data();
-}
-
-size_t mtmd_helper_get_n_tokens(mtmd_input_chunks * chunks) {
-    size_t n_tokens = 0;
-    for (auto & chunk : *chunks) {
-        if (chunk.type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
-            n_tokens += chunk.tokens_text.size();
-        } else if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
-            n_tokens += chunk.tokens_image->n_tokens();
-        } else {
-            GGML_ASSERT(false && "chunk type not supported");
-        }
-    }
-    return n_tokens;
-}
-
-// helper struct to make working with embd batch easier
-// note: this will be removed after llama_batch_ext refactoring
-struct decode_embd_batch {
-    std::vector<llama_pos>      pos;
-    std::vector<int32_t>        n_seq_id;
-    std::vector<llama_seq_id>   seq_id_0;
-    std::vector<llama_seq_id *> seq_ids;
-    std::vector<int8_t>         logits;
-    llama_batch batch;
-    decode_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
-        pos     .resize(n_tokens);
-        n_seq_id.resize(n_tokens);
-        seq_ids .resize(n_tokens + 1);
-        logits  .resize(n_tokens);
-        seq_id_0.resize(1);
-        seq_id_0[0] = seq_id;
-        seq_ids [n_tokens] = nullptr;
-        batch = {
-            /*n_tokens       =*/ n_tokens,
-            /*tokens         =*/ nullptr,
-            /*embd           =*/ embd,
-            /*pos            =*/ pos.data(),
-            /*n_seq_id       =*/ n_seq_id.data(),
-            /*seq_id         =*/ seq_ids.data(),
-            /*logits         =*/ logits.data(),
-        };
-        for (int i = 0; i < n_tokens; i++) {
-            batch.pos     [i] = pos_0 + i;
-            batch.n_seq_id[i] = 1;
-            batch.seq_id  [i] = seq_id_0.data();
-            batch.logits  [i] = false;
-        }
-    }
-};
-
-int32_t mtmd_helper_eval(mtmd_context * ctx,
-        llama_context * lctx,
-        mtmd_input_chunks * chunks,
-        llama_pos pos0,
-        llama_seq_id seq_id,
-        int32_t n_batch) {
-    int32_t ret;
-    llama_pos n_past = pos0;
-    llama_batch text_batch = llama_batch_init(n_batch, 0, 1);
-
-    for (auto & chunk : *chunks) {
-        bool is_last = &chunk == &chunks->back();
-        if (chunk.type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
-            // TODO @ngxson : may need to split into smaller batches
-            text_batch.n_tokens = chunk.tokens_text.size();
-            for (size_t i = 0; i < chunk.tokens_text.size(); i++) {
-                text_batch.token   [i]    = chunk.tokens_text[i];
-                text_batch.pos     [i]    = n_past++;
-                text_batch.n_seq_id[i]    = 1;
-                text_batch.seq_id  [i][0] = seq_id;
-                text_batch.logits  [i]    = false;
-            }
-            if (is_last) {
-                // always get logits for last input chunk
-                text_batch.logits[text_batch.n_tokens - 1] = true;
-            }
-            ret = llama_decode(lctx, text_batch);
-            if (ret != 0) {
-                LOG_ERR("failed to decode text\n");
-                llama_batch_free(text_batch);
-                return ret;
-            }
-
-        } else if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
-            GGML_ASSERT(!is_last && "logits for last image chunk is not yet support");
-            GGML_ASSERT(chunk.tokens_image != nullptr);
-            int64_t t0 = ggml_time_ms();
-            if (ctx->print_timings) {
-                LOG_INF("encoding image...\n");
-            }
-            ret = mtmd_encode(ctx, chunk.tokens_image);
-            if (ret != 0) {
-                LOG_ERR("failed to encode image\n");
-                llama_batch_free(text_batch);
-                return ret;
-            }
-            if (ctx->print_timings) {
-                LOG_INF("image encoded in %" PRId64 " ms\n", ggml_time_ms() - t0);
-            }
-
-            int32_t n_tokens = chunk.tokens_image->n_tokens();
-            float * embd = mtmd_get_output_embd(ctx);
-            decode_embd_batch batch_img(embd, n_tokens, n_past, 0);
-            int64_t t1 = ggml_time_ms();
-            ret = llama_decode(lctx, batch_img.batch);
-            if (ret != 0) {
-                LOG_ERR("failed to decode image\n");
-                llama_batch_free(text_batch);
-                return ret;
-            }
-            if (ctx->print_timings) {
-                LOG_INF("image decoded in %" PRId64 " ms\n", ggml_time_ms() - t1);
-            }
-
-            n_past += n_tokens;
-
-        } else {
-            GGML_ASSERT(false && "chunk type not supported");
-        }
-    }
-
-    llama_batch_free(text_batch);
-    return 0;
-}
-
-int32_t mtmd_helper_bitmap_init_from_buf(const unsigned char * buf, size_t len, mtmd_bitmap & output) {
-    clip_image_u8_ptr img_u8(clip_image_u8_init());
-    bool ok = clip_image_load_from_bytes(buf, len, img_u8.get());
-    if (!ok) {
-        LOG_ERR("Unable to load image from buffer\n");
-        return 1;
-    }
-    unsigned char * data = clip_image_u8_get_data(img_u8.get(), &output.nx, &output.ny);
-    output.data.resize(output.nx * output.ny * 3);
-    std::memcpy(output.data.data(), data, output.nx * output.ny * 3);
-    return 0;
-}
-
-int32_t mtmd_helper_bitmap_init_from_file(const char * fname, mtmd_bitmap & output) {
-    clip_image_u8_ptr img_u8(clip_image_u8_init());
-    bool ok = clip_image_load_from_file(fname, img_u8.get());
-    if (!ok) {
-        LOG_ERR("Unable to load image %s\n", fname);
-        return 1;
-    }
-    unsigned char * data = clip_image_u8_get_data(img_u8.get(), &output.nx, &output.ny);
-    output.data.resize(output.nx * output.ny * 3);
-    std::memcpy(output.data.data(), data, output.nx * output.ny * 3);
-    return 0;
-}
--- a/examples/llava/mtmd.h
+++ b/examples/llava/mtmd.h
@@ -1,146 +0,0 @@
-#ifndef MTMD_H
-#define MTMD_H
-
-#include "ggml.h"
-#include "llama.h"
-#include "clip.h"
-
-#include <vector>
-#include <cinttypes>
-#include <memory>
-
-#ifdef LLAMA_SHARED
-#    if defined(_WIN32) && !defined(__MINGW32__)
-#        ifdef LLAMA_BUILD
-#            define MTMD_API __declspec(dllexport)
-#        else
-#            define MTMD_API __declspec(dllimport)
-#        endif
-#    else
-#        define MTMD_API __attribute__ ((visibility ("default")))
-#    endif
-#else
-#    define MTMD_API
-#endif
-
-#ifdef __cplusplus
-
-enum mtmd_input_chunk_type {
-    MTMD_INPUT_CHUNK_TYPE_TEXT,
-    MTMD_INPUT_CHUNK_TYPE_IMAGE,
-};
-
-struct mtmd_context;
-struct mtmd_image_tokens;
-
-// represents raw image data, layout is RGBRGBRGB...
-// length of data must be nx * ny * 3
-struct mtmd_bitmap {
-    uint32_t nx;
-    uint32_t ny;
-    std::vector<unsigned char> data;
-};
-
-struct mtmd_input_chunk {
-    mtmd_input_chunk_type type;
-    std::vector<llama_token> tokens_text;
-    mtmd_image_tokens * tokens_image = nullptr;
-};
-
-using mtmd_input_chunks = std::vector<mtmd_input_chunk>;
-
-struct mtmd_context_params {
-    bool use_gpu = true;
-    bool print_timings = true;
-    int n_threads = 4;
-    enum ggml_log_level verbosity = GGML_LOG_LEVEL_INFO;
-    const char * image_marker = "<__image__>";
-};
-
-struct mtmd_input_text {
-    std::string text;
-    bool add_special;
-    bool parse_special;
-};
-
-// initialize the mtmd context
-// return nullptr on failure
-MTMD_API mtmd_context * mtmd_init_from_file(const char * mmproj_fname,
-                                                const llama_model * text_model,
-                                                const mtmd_context_params ctx_params);
-
-MTMD_API void mtmd_free(mtmd_context * ctx);
-
-// tokenize an input text prompt and an image
-// the prompt must have the input image marker (default: "<__image__>") in it
-// the marker will be replaced with the image tokens
-// for example:
-//   "here is an image: <__image__>\ndescribe it in detail."
-//   this will gives 3 chunks:
-//   1. "here is an image: <start_of_image>"
-//   2. (image tokens)
-//   3. "<end_of_image>\ndescribe it in detail."
-// number of bitmaps must be equal to the number of image markers in the prompt
-// this function is thread-safe (shared ctx)
-MTMD_API mtmd_input_chunks * mtmd_tokenize(mtmd_context * ctx,
-                                const mtmd_input_text & text,
-                                const std::vector<mtmd_bitmap> & bitmaps);
-
-// free image chunk data
-MTMD_API void mtmd_input_chunks_free(mtmd_input_chunks * chunks);
-
-// returns 0 on success
-MTMD_API int32_t mtmd_encode(mtmd_context * ctx,
-                            const mtmd_image_tokens * image_tokens);
-
-// get output embeddings from the last encode pass
-MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx);
-
-//
-// helper functions (can be implemented based on other functions)
-//
-
-// helper to count the total number of tokens from a list of chunks, useful to keep track of n_past
-MTMD_API size_t mtmd_helper_get_n_tokens(mtmd_input_chunks * chunks);
-
-// helper function that automatically:
-// 1. run llama_decode() on text chunks
-// 2. run mtmd_encode() on image chunks, then mtmd_get_output_embd() and then llama_decode()
-// if any of the mtmd_encode() or llama_decode() calls return non-zero, stop and forward the error
-// otherwise, returns 0 on success
-MTMD_API int32_t mtmd_helper_eval(mtmd_context * ctx,
-                                llama_context * lctx,
-                                mtmd_input_chunks * chunks,
-                                llama_pos pos0,
-                                llama_seq_id seq_id,
-                                int32_t n_batch);
-
-// helper function to construct a mtmd_bitmap from a file
-// returns 0 on success
-// this function is thread-safe
-MTMD_API int32_t mtmd_helper_bitmap_init_from_file(const char * fname, mtmd_bitmap & output);
-
-// helper function to construct a mtmd_bitmap from a buffer
-// the buffer must be an image in format supported by stb_image (jpg, png, bmp, gif, etc.)
-// returns 0 on success
-// this function is thread-safe
-MTMD_API int32_t mtmd_helper_bitmap_init_from_buf(const unsigned char * buf, size_t len, mtmd_bitmap & output);
-
-// convenient unique_ptr wrappers
-struct mtmd_context_deleter {
-    void operator()(mtmd_context * val) { mtmd_free(val); }
-};
-using mtmd_context_ptr = std::unique_ptr<mtmd_context, mtmd_context_deleter>;
-
-struct mtmd_input_chunks_deleter {
-    void operator()(mtmd_input_chunks * val) { mtmd_input_chunks_free(val); }
-};
-using mtmd_input_chunks_ptr = std::unique_ptr<mtmd_input_chunks, mtmd_input_chunks_deleter>;
-
-#else
-
-static_assert(false && "C header is not yet supported by this library");
-
-#endif
-
-#endif
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -507,12 +507,17 @@ extern "C" {

        GGML_OP_UNARY,

+        GGML_OP_MAP_UNARY,
+        GGML_OP_MAP_BINARY,
+
+        GGML_OP_MAP_CUSTOM1_F32,
+        GGML_OP_MAP_CUSTOM2_F32,
+        GGML_OP_MAP_CUSTOM3_F32,
+
        GGML_OP_MAP_CUSTOM1,
        GGML_OP_MAP_CUSTOM2,
        GGML_OP_MAP_CUSTOM3,

-        GGML_OP_CUSTOM,
-
        GGML_OP_CROSS_ENTROPY_LOSS,
        GGML_OP_CROSS_ENTROPY_LOSS_BACK,
        GGML_OP_OPT_STEP_ADAMW,
@@ -1717,29 +1722,24 @@ extern "C" {
            float                 p0,
            float                 p1);

-    enum ggml_scale_mode {
-        GGML_SCALE_MODE_NEAREST  = 0,
-        GGML_SCALE_MODE_BILINEAR = 1,
-    };
-
-    // interpolate
+    // nearest-neighbor interpolation
    // multiplies ne0 and ne1 by scale factor
+    // used in stable-diffusion
    GGML_API struct ggml_tensor * ggml_upscale(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
-            int                   scale_factor,
-            enum ggml_scale_mode  mode);
+            int                   scale_factor);

-    // interpolate
-    // interpolate scale to specified dimensions
+    // nearest-neighbor interpolation
+    // resizes to the specified dimensions (ne0, ne1, ne2, ne3)
+    // used in tortoise.cpp
    GGML_API struct ggml_tensor * ggml_upscale_ext(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            int                   ne0,
            int                   ne1,
            int                   ne2,
-            int                   ne3,
-            enum ggml_scale_mode  mode);
+            int                   ne3);

    // pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0]
    GGML_API struct ggml_tensor * ggml_pad(
@@ -1916,6 +1916,83 @@ extern "C" {

    // custom operators

+    typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *);
+    typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
+
+    typedef void (*ggml_custom1_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *);
+    typedef void (*ggml_custom2_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
+    typedef void (*ggml_custom3_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
+
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_unary_f32(
+            struct ggml_context        * ctx,
+            struct ggml_tensor         * a,
+                   ggml_unary_op_f32_t   fun),
+        "use ggml_map_custom1 instead");
+
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_unary_inplace_f32(
+            struct ggml_context        * ctx,
+            struct ggml_tensor         * a,
+                   ggml_unary_op_f32_t   fun),
+        "use ggml_map_custom1_inplace instead");
+
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_binary_f32(
+            struct ggml_context         * ctx,
+            struct ggml_tensor          * a,
+            struct ggml_tensor          * b,
+                   ggml_binary_op_f32_t   fun),
+        "use ggml_map_custom2 instead");
+
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_binary_inplace_f32(
+            struct ggml_context         * ctx,
+            struct ggml_tensor          * a,
+            struct ggml_tensor          * b,
+                   ggml_binary_op_f32_t   fun),
+        "use ggml_map_custom2_inplace instead");
+
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom1_f32(
+            struct ggml_context          * ctx,
+            struct ggml_tensor           * a,
+                   ggml_custom1_op_f32_t   fun),
+        "use ggml_map_custom1 instead");
+
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom1_inplace_f32(
+            struct ggml_context          * ctx,
+            struct ggml_tensor           * a,
+                   ggml_custom1_op_f32_t   fun),
+        "use ggml_map_custom1_inplace instead");
+
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom2_f32(
+            struct ggml_context          * ctx,
+            struct ggml_tensor           * a,
+            struct ggml_tensor           * b,
+                   ggml_custom2_op_f32_t   fun),
+        "use ggml_map_custom2 instead");
+
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom2_inplace_f32(
+            struct ggml_context          * ctx,
+            struct ggml_tensor           * a,
+            struct ggml_tensor           * b,
+                   ggml_custom2_op_f32_t   fun),
+        "use ggml_map_custom2_inplace instead");
+
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom3_f32(
+            struct ggml_context          * ctx,
+            struct ggml_tensor           * a,
+            struct ggml_tensor           * b,
+            struct ggml_tensor           * c,
+                   ggml_custom3_op_f32_t   fun),
+        "use ggml_map_custom3 instead");
+
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom3_inplace_f32(
+            struct ggml_context          * ctx,
+            struct ggml_tensor           * a,
+            struct ggml_tensor           * b,
+            struct ggml_tensor           * c,
+                   ggml_custom3_op_f32_t   fun),
+        "use ggml_map_custom3_inplace instead");
+
+    // custom operators v2
+
    typedef void (*ggml_custom1_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, int ith, int nth, void * userdata);
    typedef void (*ggml_custom2_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata);
    typedef void (*ggml_custom3_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata);
@@ -1971,30 +2048,6 @@ extern "C" {
            int                     n_tasks,
            void                  * userdata);

-    typedef void (*ggml_custom_op_t)(struct ggml_tensor * dst , int ith, int nth, void * userdata);
-
-    GGML_API struct ggml_tensor * ggml_custom_4d(
-            struct ggml_context * ctx,
-            enum ggml_type        type,
-            int64_t               ne0,
-            int64_t               ne1,
-            int64_t               ne2,
-            int64_t               ne3,
-            struct ggml_tensor ** args,
-            int                   n_args,
-            ggml_custom_op_t      fun,
-            int                   n_tasks,
-            void                * userdata);
-
-    GGML_API struct ggml_tensor * ggml_custom_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor ** args,
-            int                   n_args,
-            ggml_custom_op_t      fun,
-            int                   n_tasks,
-            void                * userdata);
-
    // loss function

    GGML_API struct ggml_tensor * ggml_cross_entropy_loss(
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -1824,9 +1824,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
            if (op->src[0]->ne[2] * op->ne[3] != op->src[0]->ne[3] * op->ne[2]) {
                return false;
            }
-            if (op->op_params[0] != GGML_SCALE_MODE_NEAREST) {
-                return false;
-            }
            return true;
        }
        case GGML_OP_POOL_2D: {
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -2027,6 +2027,41 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
            {
                ggml_compute_forward_rwkv_wkv7(params, tensor);
            } break;
+        case GGML_OP_MAP_UNARY:
+            {
+                ggml_unary_op_f32_t fun;
+                memcpy(&fun, tensor->op_params, sizeof(fun));
+                ggml_compute_forward_map_unary(params, tensor, fun);
+            }
+            break;
+        case GGML_OP_MAP_BINARY:
+            {
+                ggml_binary_op_f32_t fun;
+                memcpy(&fun, tensor->op_params, sizeof(fun));
+                ggml_compute_forward_map_binary(params, tensor, fun);
+            }
+            break;
+        case GGML_OP_MAP_CUSTOM1_F32:
+            {
+                ggml_custom1_op_f32_t fun;
+                memcpy(&fun, tensor->op_params, sizeof(fun));
+                ggml_compute_forward_map_custom1_f32(params, tensor, fun);
+            }
+            break;
+        case GGML_OP_MAP_CUSTOM2_F32:
+            {
+                ggml_custom2_op_f32_t fun;
+                memcpy(&fun, tensor->op_params, sizeof(fun));
+                ggml_compute_forward_map_custom2_f32(params, tensor, fun);
+            }
+            break;
+        case GGML_OP_MAP_CUSTOM3_F32:
+            {
+                ggml_custom3_op_f32_t fun;
+                memcpy(&fun, tensor->op_params, sizeof(fun));
+                ggml_compute_forward_map_custom3_f32(params, tensor, fun);
+            }
+            break;
        case GGML_OP_MAP_CUSTOM1:
            {
                ggml_compute_forward_map_custom1(params, tensor);
@@ -2042,11 +2077,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
                ggml_compute_forward_map_custom3(params, tensor);
            }
            break;
-        case GGML_OP_CUSTOM:
-            {
-                ggml_compute_forward_custom(params, tensor);
-            }
-            break;
        case GGML_OP_CROSS_ENTROPY_LOSS:
            {
                ggml_compute_forward_cross_entropy_loss(params, tensor);
@@ -2298,6 +2328,11 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
        case GGML_OP_WIN_PART:
        case GGML_OP_WIN_UNPART:
        case GGML_OP_GET_REL_POS:
+        case GGML_OP_MAP_UNARY:
+        case GGML_OP_MAP_BINARY:
+        case GGML_OP_MAP_CUSTOM1_F32:
+        case GGML_OP_MAP_CUSTOM2_F32:
+        case GGML_OP_MAP_CUSTOM3_F32:
            {
                n_tasks = 1;
            } break;
@@ -2331,16 +2366,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
                    n_tasks = MIN(p.n_tasks, n_threads);
                }
            } break;
-        case GGML_OP_CUSTOM:
-            {
-                struct ggml_custom_op_params p;
-                memcpy(&p, node->op_params, sizeof(p));
-                if (p.n_tasks == GGML_N_TASKS_MAX) {
-                    n_tasks = n_threads;
-                } else {
-                    n_tasks = MIN(p.n_tasks, n_threads);
-                }
-            } break;
        case GGML_OP_CROSS_ENTROPY_LOSS:
        case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
        case GGML_OP_OPT_STEP_ADAMW:
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@@ -6351,72 +6351,24 @@ static void ggml_compute_forward_upscale_f32(
    const float sf2 = (float)ne2/src0->ne[2];
    const float sf3 = (float)ne3/src0->ne[3];

-    const ggml_scale_mode mode = (ggml_scale_mode) ggml_get_op_params_i32(dst, 0);
+    // TODO: optimize

-    if (mode == GGML_SCALE_MODE_NEAREST) {
-        for (int64_t i3 = 0; i3 < ne3; i3++) {
-            const int64_t i03 = i3 / sf3;
-            for (int64_t i2 = ith; i2 < ne2; i2 += nth) {
-                const int64_t i02 = i2 / sf2;
-                for (int64_t i1 = 0; i1 < ne1; i1++) {
-                    const int64_t i01 = i1 / sf1;
-                    for (int64_t i0 = 0; i0 < ne0; i0++) {
-                        const int64_t i00 = i0 / sf0;
+    for (int64_t i3 = 0; i3 < ne3; i3++) {
+        const int64_t i03 = i3 / sf3;
+        for (int64_t i2 = ith; i2 < ne2; i2 += nth) {
+            const int64_t i02 = i2 / sf2;
+            for (int64_t i1 = 0; i1 < ne1; i1++) {
+                const int64_t i01 = i1 / sf1;
+                for (int64_t i0 = 0; i0 < ne0; i0++) {
+                    const int64_t i00 = i0 / sf0;

-                        const float * x = (float *)((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
-                              float * y = (float *)((char *)  dst->data +  i0*nb0  +  i1*nb1  +  i2*nb2  +  i3*nb3);
+                    const float * x = (float *)((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+                          float * y = (float *)((char *)  dst->data +  i0*nb0  +  i1*nb1  +  i2*nb2  +  i3*nb3);

-                        *y = *x;
-                    }
+                    *y = *x;
                }
            }
        }
-    } else if (mode == GGML_SCALE_MODE_BILINEAR) {
-        // setting a pixel offset of 0 would replicate the behavior of pytorch interpolate with align_corners=True
-        const float pixel_offset = 0.5f;
-
-        for (int64_t i3 = 0; i3 < ne3; i3++) {
-            const int64_t i03 = i3 / sf3;
-            for (int64_t i2 = ith; i2 < ne2; i2 += nth) {
-                const int64_t i02 = i2 / sf2;
-                for (int64_t i1 = 0; i1 < ne1; i1++) {
-                    const float y = ((float)i1 + pixel_offset) / sf1 - pixel_offset;
-                    int64_t y0 = (int64_t)floorf(y);
-                    int64_t y1 = y0 + 1;
-
-                    y0 = std::max(int64_t(0), std::min(y0, ne01 - 1));
-                    y1 = std::max(int64_t(0), std::min(y1, ne01 - 1));
-
-                    float dy = y - (float)y0;
-                    dy = std::max(0.0f, std::min(dy, 1.0f));
-
-                    for (int64_t i0 = 0; i0 < ne0; i0++) {
-                        const float x = ((float)i0 + pixel_offset) / sf0 - pixel_offset;
-                        int64_t x0 = (int64_t)floorf(x);
-                        int64_t x1 = x0 + 1;
-
-                        x0 = std::max(int64_t(0), std::min(x0, ne00 - 1));
-                        x1 = std::max(int64_t(0), std::min(x1, ne00 - 1));
-
-                        float dx = x - (float)x0;
-                        dx = std::max(0.0f, std::min(dx, 1.0f));
-
-                        // fetch the four surrounding pixel values and interpolate
-                        const float a = *(const float *)((const char *)src0->data + x0*nb00 + y0*nb01 + i02*nb02 + i03*nb03);
-                        const float b = *(const float *)((const char *)src0->data + x1*nb00 + y0*nb01 + i02*nb02 + i03*nb03);
-                        const float c = *(const float *)((const char *)src0->data + x0*nb00 + y1*nb01 + i02*nb02 + i03*nb03);
-                        const float d = *(const float *)((const char *)src0->data + x1*nb00 + y1*nb01 + i02*nb02 + i03*nb03);
-
-                        const float val = a*(1 - dx)*(1 - dy) + b*dx*(1 - dy) + c*(1 - dx)*dy + d*dx*dy;
-
-                        float * y_dst = (float *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3);
-                        *y_dst = val;
-                    }
-                }
-            }
-        }
-    } else {
-        GGML_ABORT("unsupported upscale mode");
    }
 }

@@ -8316,6 +8268,152 @@ void ggml_compute_forward_rwkv_wkv7(
    }
 }

+// ggml_compute_forward_map_unary
+
+static void ggml_compute_forward_map_unary_f32(
+        const ggml_compute_params * params,
+        ggml_tensor * dst,
+        const ggml_unary_op_f32_t fun) {
+
+    const ggml_tensor * src0 = dst->src[0];
+
+    if (params->ith != 0) { // only thread 0 applies the callback; every other thread returns at once
+        return;
+    }
+
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, dst)); // fun writes one dst row per src0 row, so shapes must match
+
+    const int n  = ggml_nrows(src0); // number of rows the callback is applied to
+    const int nc = src0->ne[0];      // elements per row, passed to fun as its count argument
+
+    for (int i = 0; i < n; i++) {
+        fun(nc, // fun(count, dst_row, src_row); row i is located through the nb[1] byte stride
+                (float *) ((char *) dst->data  + i*( dst->nb[1])),
+                (float *) ((char *) src0->data + i*(src0->nb[1])));
+    }
+}
+
+void ggml_compute_forward_map_unary(
+        const ggml_compute_params * params,
+        ggml_tensor * dst,
+        const ggml_unary_op_f32_t fun) {
+
+    const ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) { // dispatch on the element type of the input tensor
+        case GGML_TYPE_F32: // the only input type with an implementation here
+            {
+                ggml_compute_forward_map_unary_f32(params, dst, fun);
+            } break;
+        default: // every other input type aborts
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_map_binary
+
+static void ggml_compute_forward_map_binary_f32(
+        const ggml_compute_params * params,
+        ggml_tensor * dst,
+        const ggml_binary_op_f32_t fun) {
+
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+
+    if (params->ith != 0) { // only thread 0 applies the callback; every other thread returns at once
+        return;
+    }
+
+    assert(ggml_is_contiguous_1(src0));
+    assert(ggml_is_contiguous_1(src1));
+    assert(ggml_is_contiguous_1(dst));
+    assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); // no broadcasting: all three shapes match
+
+    const int n  = ggml_nrows(src0); // number of rows the callback is applied to
+    const int nc = src0->ne[0];      // elements per row, passed to fun as its count argument
+
+    for (int i = 0; i < n; i++) {
+        fun(nc, // fun(count, dst_row, src0_row, src1_row); row i is located through each tensor's nb[1]
+                (float *) ((char *) dst->data  + i*( dst->nb[1])),
+                (float *) ((char *) src0->data + i*(src0->nb[1])),
+                (float *) ((char *) src1->data + i*(src1->nb[1])));
+    }
+}
+
+void ggml_compute_forward_map_binary(
+        const ggml_compute_params * params,
+        ggml_tensor * dst,
+        const ggml_binary_op_f32_t fun) {
+
+    const ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) { // dispatch on the element type of the first input only
+        case GGML_TYPE_F32: // the only input type with an implementation here
+            {
+                ggml_compute_forward_map_binary_f32(params, dst, fun);
+            } break;
+        default: // every other input type aborts
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+// ggml_compute_forward_map_custom1_f32
+
+void ggml_compute_forward_map_custom1_f32(
+        const ggml_compute_params * params,
+        ggml_tensor * dst,
+        const ggml_custom1_op_f32_t fun) {
+
+    const ggml_tensor * a = dst->src[0]; // single input operand
+
+    if (params->ith != 0) { // the callback is invoked exactly once, by thread 0
+        return;
+    }
+
+    fun(dst, a); // the callback receives whole tensors and fills dst itself
+}
+
+// ggml_compute_forward_map_custom2_f32
+
+void ggml_compute_forward_map_custom2_f32(
+        const ggml_compute_params * params,
+        ggml_tensor * dst,
+        const ggml_custom2_op_f32_t fun) {
+
+    const ggml_tensor * a = dst->src[0]; // first input operand
+    const ggml_tensor * b = dst->src[1]; // second input operand
+
+    if (params->ith != 0) { // the callback is invoked exactly once, by thread 0
+        return;
+    }
+
+    fun(dst, a, b); // the callback receives whole tensors and fills dst itself
+}
+
+// ggml_compute_forward_map_custom3_f32
+
+void ggml_compute_forward_map_custom3_f32(
+        const ggml_compute_params * params,
+        ggml_tensor * dst,
+        const ggml_custom3_op_f32_t fun) {
+    // Invokes the three-input f32 callback `fun` on dst and its operands src[0..2].
+    const ggml_tensor * a = dst->src[0];
+    const ggml_tensor * b = dst->src[1];
+    const ggml_tensor * c = dst->src[2]; // third operand has its own slot; src[1] would pass b twice
+
+    if (params->ith != 0) { // the callback is invoked exactly once, by thread 0
+        return;
+    }
+
+    fun(dst, a, b, c);
+}
+
 // ggml_compute_forward_map_custom1

 void ggml_compute_forward_map_custom1(
@@ -8361,18 +8459,6 @@ void ggml_compute_forward_map_custom3(
    p.fun(dst, a, b, c, params->ith, params->nth, p.userdata);
 }

-// ggml_compute_forward_custom
-
-void ggml_compute_forward_custom(
-    const struct ggml_compute_params * params,
-          struct ggml_tensor * dst) {
-
-    struct ggml_custom_op_params p;
-    memcpy(&p, dst->op_params, sizeof(p));
-
-    p.fun(dst, params->ith, params->nth, p.userdata);
-}
-
 // ggml_compute_forward_cross_entropy_loss

 static void ggml_compute_forward_cross_entropy_loss_f32(
--- a/ggml/src/ggml-cpu/ops.h
+++ b/ggml/src/ggml-cpu/ops.h
@@ -96,10 +96,29 @@ void ggml_compute_forward_add_rel_pos(const struct ggml_compute_params * params,
 void ggml_compute_forward_rwkv_wkv6(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_rwkv_wkv7(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_gla(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_map_unary(
+    const struct ggml_compute_params * params,
+    struct ggml_tensor * dst,
+    const ggml_unary_op_f32_t fun);
+void ggml_compute_forward_map_binary(
+    const struct ggml_compute_params * params,
+    struct ggml_tensor * dst,
+    const ggml_binary_op_f32_t fun);
+void ggml_compute_forward_map_custom1_f32(
+    const struct ggml_compute_params * params,
+    struct ggml_tensor * dst,
+    const ggml_custom1_op_f32_t fun);
+void ggml_compute_forward_map_custom2_f32(
+    const struct ggml_compute_params * params,
+    struct ggml_tensor * dst,
+    const ggml_custom2_op_f32_t fun);
+void ggml_compute_forward_map_custom3_f32(
+    const struct ggml_compute_params * params,
+    struct ggml_tensor * dst,
+    const ggml_custom3_op_f32_t fun);
 void ggml_compute_forward_map_custom1(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_map_custom2(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_map_custom3(const struct ggml_compute_params * params, struct ggml_tensor * dst);
-void ggml_compute_forward_custom(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_cross_entropy_loss(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_cross_entropy_loss_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_opt_step_adamw(const struct ggml_compute_params * params, struct ggml_tensor * dst);
--- a/ggml/src/ggml-cpu/simd-mappings.h
+++ b/ggml/src/ggml-cpu/simd-mappings.h
@@ -855,17 +855,13 @@ static inline __vector float __lzs_f16cx4_load(const ggml_fp16_t * x) {
        tmp[i] = GGML_FP16_TO_FP32(x[i]);
    }

-    // note: keep type-cast here to prevent compiler bugs
-    // see: https://github.com/ggml-org/llama.cpp/issues/12846
-    return vec_xl(0, (const float *)(tmp));
+    return vec_xl(0, tmp);
 }

 static inline void __lzs_f16cx4_store(ggml_fp16_t * x, __vector float y) {
    float arr[4];

-    // note: keep type-cast here to prevent compiler bugs
-    // see: https://github.com/ggml-org/llama.cpp/issues/12846
-    vec_xst(y, 0, (float *)(arr));
+    vec_xst(y, 0, arr);

    for (int i = 0; i < 4; i++) {
        x[i] = GGML_FP32_TO_FP16(arr[i]);
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -3216,7 +3216,6 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
        case GGML_OP_GROUP_NORM:
            return ggml_is_contiguous(op->src[0]);
        case GGML_OP_UPSCALE:
-            return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST;
        case GGML_OP_PAD:
        case GGML_OP_ARANGE:
        case GGML_OP_TIMESTEP_EMBEDDING:
--- a/ggml/src/ggml-impl.h
+++ b/ggml/src/ggml-impl.h
@@ -148,14 +148,8 @@ struct ggml_map_custom2_op_params {

 struct ggml_map_custom3_op_params {
    ggml_custom3_op_t fun;
-    int               n_tasks;
-    void            * userdata;
-};
-
-struct ggml_custom_op_params {
-    ggml_custom_op_t fun;
-    int              n_tasks;
-    void           * userdata;
+    int n_tasks;
+    void * userdata;
 };

 // bitset
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -1334,9 +1334,8 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
            return op->src[0]->type == GGML_TYPE_F16;
        case GGML_OP_POOL_1D:
            return false;
-        case GGML_OP_UPSCALE:
-            return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST;
        case GGML_OP_POOL_2D:
+        case GGML_OP_UPSCALE:
        case GGML_OP_PAD:
        case GGML_OP_PAD_REFLECT_1D:
        case GGML_OP_TIMESTEP_EMBEDDING:
--- a/ggml/src/ggml-sycl/element_wise.cpp
+++ b/ggml/src/ggml-sycl/element_wise.cpp
--- a/ggml/src/ggml-sycl/element_wise.hpp
+++ b/ggml/src/ggml-sycl/element_wise.hpp
@@ -2,13 +2,6 @@
 #define GGML_SYCL_ELEMENTWISE_HPP

 #include "common.hpp"
-#include "ggml.h"
-#include <limits.h>
-
-template <typename T>
-T neg_infinity() {
-    return -std::numeric_limits<T>::infinity();
-}

 static __dpct_inline__ float op_repeat(const float a, const float b) {
    return b;
@@ -31,19 +24,6 @@ static __dpct_inline__ float op_div(const float a, const float b) {
    return a / b;
 }

-template<typename T>
-struct typed_data {
-    const T * src;
-    T * dst;
-};
-
-template<typename T>
-typed_data<T> cast_data(ggml_tensor * dst) {
-    return {
-        /* .src = */ static_cast<const T *>(dst->src[0]->data),
-        /* .dst = */ static_cast<T *>(dst->data)
-    };
-}

 void ggml_sycl_sqrt(ggml_backend_sycl_context & ctx, ggml_tensor * dst);

@@ -85,10 +65,6 @@ void ggml_sycl_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst);

 void ggml_sycl_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst);

-void ggml_sycl_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
-
-// ---------
-
 void ggml_sycl_add(ggml_backend_sycl_context & ctx, ggml_tensor * dst);

 void ggml_sycl_sub(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -1617,6 +1617,17 @@ static void scale_f32(const float * x, float * dst, const float scale, const int
    dst[i] = scale * x[i];
 }

+static void clamp_f32(const float * x, float * dst, const float min, const float max, const int k,
+                      const sycl::nd_item<3> &item_ct1) {
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + // flat element index of this work-item:
+                  item_ct1.get_local_id(2);                              // group offset plus local id along dimension 2
+
+    if (i >= k) { // work-items past the k elements do nothing
+        return;
+    }
+
+    dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]); // limit x[i] to the range [min, max]
+}

 template <typename Ti, typename To>
 static  void pool2d_nchw_kernel(
@@ -1757,6 +1768,18 @@ static void scale_f32_sycl(const float *x, float *dst, const float scale,
        });
 }

+static void clamp_f32_sycl(const float *x, float *dst, const float min,
+                           const float max, const int k,
+                           queue_ptr stream) {
+    const int num_blocks = (k + SYCL_CLAMP_BLOCK_SIZE - 1) / SYCL_CLAMP_BLOCK_SIZE; // k / block size, rounded up
+    stream->parallel_for( // launch num_blocks work-groups of SYCL_CLAMP_BLOCK_SIZE work-items along dimension 2
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                              sycl::range<3>(1, 1, SYCL_CLAMP_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, SYCL_CLAMP_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            clamp_f32(x, dst, min, max, k, item_ct1); // each work-item clamps at most one element
+        });
+}

 static void sum_rows_f32_sycl(const float *x, float *dst, const int ncols,
                              const int nrows, queue_ptr stream) {
@@ -2235,6 +2258,26 @@ inline void ggml_sycl_op_scale(ggml_backend_sycl_context & ctx, ggml_tensor *dst
    SYCL_CHECK(0);
 }

+inline void ggml_sycl_op_clamp(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
+
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); // only f32 input and f32 output are handled
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
+    float *       dst_dd  = static_cast<float *>(dst->data);
+
+    float min;
+    float max;
+    memcpy(&min, dst->op_params, sizeof(float));               // first float stored in op_params
+    memcpy(&max, (float *) dst->op_params + 1, sizeof(float)); // second float stored in op_params
+
+    clamp_f32_sycl(src0_dd, dst_dd, min, max, ggml_nelements(dst->src[0]), ctx.stream()); // one kernel over all input elements
+    /*
+    DPCT1010:88: SYCL uses exceptions to report errors and does not use the
+    error codes. The call was replaced with 0. You need to rewrite this code.
+    */
+    SYCL_CHECK(0);
+}
+
 static void ggml_sycl_set_peer_access(const int n_tokens, int main_device) {
    static bool peer_access_enabled = false;

@@ -3175,6 +3218,10 @@ static void ggml_sycl_scale(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
    ggml_sycl_op_scale(ctx, dst);
 }

+static void ggml_sycl_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    ggml_sycl_op_clamp(ctx, dst); // thin wrapper: forwards directly to the clamp op implementation
+}
+
 static void ggml_sycl_diag_mask_inf(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    ggml_sycl_op_diag_mask_inf(ctx, dst);
 }
@@ -3853,11 +3900,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
                case GGML_UNARY_OP_GELU_QUICK:
                case GGML_UNARY_OP_TANH:
                case GGML_UNARY_OP_EXP:
-#if defined (GGML_SYCL_F16)
-                    return ggml_is_contiguous(op->src[0]) && (op->type == op->src[0]->type);
-#else
-                    return ggml_is_contiguous(op->src[0]) && (op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32) && (op->type == op->src[0]->type);
-#endif
+                    return ggml_is_contiguous(op->src[0]) && (op->src[0]->type == GGML_TYPE_F32);
                default:
                    return false;
            }
@@ -3979,18 +4022,13 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
        case GGML_OP_SUB:
        case GGML_OP_MUL:
        case GGML_OP_DIV:
-            return (op->src[0]->type == GGML_TYPE_F32);
        case GGML_OP_SQR:
        case GGML_OP_SQRT:
        case GGML_OP_SIN:
        case GGML_OP_COS:
        case GGML_OP_CLAMP:
        case GGML_OP_LOG:
-#if defined (GGML_SYCL_F16)
-            return ((op->type == GGML_TYPE_F32 || op->type == GGML_SYCL_F16) && (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_SYCL_F16) && (op->type == op->src[0]->type));
-#else
-            return (op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32) && (op->type == op->src[0]->type);
-#endif
+            return (op->src[0]->type == GGML_TYPE_F32);
        case GGML_OP_NORM:
        case GGML_OP_RMS_NORM:
        case GGML_OP_L2_NORM:
@@ -4017,13 +4055,12 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
        case GGML_OP_IM2COL:
            // TODO: add support for the new F32 operations
            return op->src[0]->type == GGML_TYPE_F16;
-        case GGML_OP_UPSCALE:
-            return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST;
        case GGML_OP_POOL_2D:
        case GGML_OP_SUM:
        case GGML_OP_SUM_ROWS:
        case GGML_OP_ARGSORT:
        case GGML_OP_ACC:
+        case GGML_OP_UPSCALE:
        case GGML_OP_PAD:
        case GGML_OP_LEAKY_RELU:
        case GGML_OP_TIMESTEP_EMBEDDING:
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -5749,7 +5749,7 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
        }
        return nullptr;
    case GGML_OP_UPSCALE:
-        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 && dst->op_params[0] == GGML_SCALE_MODE_NEAREST) {
+        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
            return ctx->device->pipeline_upscale_f32;
        }
        return nullptr;
@@ -9404,10 +9404,9 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
        case GGML_OP_COS:
        case GGML_OP_CLAMP:
            return op->src[0]->type == GGML_TYPE_F32;
-        case GGML_OP_UPSCALE:
-            return op->op_params[0] == GGML_SCALE_MODE_NEAREST;
        case GGML_OP_ACC:
        case GGML_OP_CONCAT:
+        case GGML_OP_UPSCALE:
        case GGML_OP_SCALE:
        case GGML_OP_PAD:
        case GGML_OP_DIAG_MASK_INF:
@@ -9775,7 +9774,7 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) {
    } else if (tensor->op == GGML_OP_CONCAT) {
        tensor_clone = ggml_concat(ggml_ctx, src_clone[0], src_clone[1], *(int *)tensor->op_params);
    } else if (tensor->op == GGML_OP_UPSCALE) {
-        tensor_clone = ggml_upscale_ext(ggml_ctx, src_clone[0], tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], tensor->op_params[0], tensor->op_params[1], (ggml_scale_mode) tensor->op_params[0]);
+        tensor_clone = ggml_upscale_ext(ggml_ctx, src_clone[0], tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]);
    } else if (tensor->op == GGML_OP_SCALE) {
        const float * params = (const float *)tensor->op_params;
        tensor_clone = ggml_scale(ggml_ctx, src_clone[0], params[0]);
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -982,18 +982,23 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {

    "UNARY",

+    "MAP_UNARY",
+    "MAP_BINARY",
+
+    "MAP_CUSTOM1_F32",
+    "MAP_CUSTOM2_F32",
+    "MAP_CUSTOM3_F32",
+
    "MAP_CUSTOM1",
    "MAP_CUSTOM2",
    "MAP_CUSTOM3",

-    "CUSTOM",
-
    "CROSS_ENTROPY_LOSS",
    "CROSS_ENTROPY_LOSS_BACK",
    "OPT_STEP_ADAMW",
 };

-static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
+static_assert(GGML_OP_COUNT == 85, "GGML_OP_COUNT != 85");

 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
    "none",
@@ -1076,18 +1081,23 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {

    "unary(x)",

-    "map_custom(x)",
-    "map_custom(x,y)",
-    "map_custom(x,y,z)",
+    "f(x)",
+    "f(x,y)",
+
+    "custom_f32(x)",
+    "custom_f32(x,y)",
+    "custom_f32(x,y,z)",

    "custom(x)",
+    "custom(x,y)",
+    "custom(x,y,z)",

    "cross_entropy_loss(x,y)",
    "cross_entropy_loss_back(x,y)",
    "adamw(x)",
 };

-static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81");
+static_assert(GGML_OP_COUNT == 85, "GGML_OP_COUNT != 85");

 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");

@@ -4174,8 +4184,7 @@ static struct ggml_tensor * ggml_upscale_impl(
        int                   ne0,
        int                   ne1,
        int                   ne2,
-        int                   ne3,
-        enum ggml_scale_mode  mode) {
+        int                   ne3) {
    GGML_ASSERT(a->ne[0] <= ne0);
    GGML_ASSERT(a->ne[1] <= ne1);
    GGML_ASSERT(a->ne[2] <= ne2);
@@ -4183,8 +4192,6 @@ static struct ggml_tensor * ggml_upscale_impl(

    struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, ne0, ne1, ne2, ne3);

-    ggml_set_op_params_i32(result, 0, mode);
-
    result->op     = GGML_OP_UPSCALE;
    result->src[0] = a;

@@ -4194,9 +4201,8 @@ static struct ggml_tensor * ggml_upscale_impl(
 struct ggml_tensor * ggml_upscale(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
-        int                   scale_factor,
-        enum ggml_scale_mode  mode) {
-    return ggml_upscale_impl(ctx, a, a->ne[0] * scale_factor, a->ne[1] * scale_factor, a->ne[2], a->ne[3], mode);
+        int                   scale_factor) {
+    return ggml_upscale_impl(ctx, a, a->ne[0] * scale_factor, a->ne[1] * scale_factor, a->ne[2], a->ne[3]);
 }

 struct ggml_tensor * ggml_upscale_ext(
@@ -4205,9 +4211,8 @@ struct ggml_tensor * ggml_upscale_ext(
        int                   ne0,
        int                   ne1,
        int                   ne2,
-        int                   ne3,
-        enum ggml_scale_mode  mode) {
-    return ggml_upscale_impl(ctx, a, ne0, ne1, ne2, ne3, mode);
+        int                   ne3) {
+    return ggml_upscale_impl(ctx, a, ne0, ne1, ne2, ne3);
 }

 // ggml_pad
@@ -4837,6 +4842,179 @@ struct ggml_tensor * ggml_unary_inplace(
    return ggml_unary_impl(ctx, a, op, true);
 }

+// ggml_map_unary: build a GGML_OP_MAP_UNARY node with `a` as src[0] and the callback `fun` recorded in the op params
+// (the result is made with ggml_view_tensor(ctx, a) when `inplace` is true, otherwise with ggml_dup_tensor(ctx, a))
+static struct ggml_tensor * ggml_map_unary_impl_f32(
+        struct ggml_context        * ctx,
+        struct ggml_tensor         * a,
+        const  ggml_unary_op_f32_t   fun,
+        bool                         inplace) {
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    ggml_set_op_params(result, (const void *) &fun, sizeof(fun)); // passes the address and size of the `fun` parameter itself
+
+    result->op     = GGML_OP_MAP_UNARY;
+    result->src[0] = a;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_map_unary_f32( // non-inplace variant of the unary f32 map op
+        struct ggml_context        * ctx,
+        struct ggml_tensor         * a,
+        const  ggml_unary_op_f32_t   fun) {
+    return ggml_map_unary_impl_f32(ctx, a, fun, false); // inplace = false
+}
+
+struct ggml_tensor * ggml_map_unary_inplace_f32( // inplace variant of the unary f32 map op
+        struct ggml_context        * ctx,
+        struct ggml_tensor         * a,
+        const  ggml_unary_op_f32_t   fun) {
+    return ggml_map_unary_impl_f32(ctx, a, fun, true); // inplace = true
+}
+
+// ggml_map_binary: build a GGML_OP_MAP_BINARY node with `a` as src[0], `b` as src[1] and `fun` recorded in the op params
+// (the result is made with ggml_view_tensor(ctx, a) when `inplace` is true, otherwise with ggml_dup_tensor(ctx, a))
+static struct ggml_tensor * ggml_map_binary_impl_f32(
+        struct ggml_context         * ctx,
+        struct ggml_tensor          * a,
+        struct ggml_tensor          * b,
+        const  ggml_binary_op_f32_t   fun,
+        bool                          inplace) {
+    GGML_ASSERT(ggml_are_same_shape(a, b)); // both operands must have the same shape; asserted before anything is allocated
+
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    ggml_set_op_params(result, (const void *) &fun, sizeof(fun)); // passes the address and size of the `fun` parameter itself
+
+    result->op     = GGML_OP_MAP_BINARY;
+    result->src[0] = a;
+    result->src[1] = b;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_map_binary_f32( // non-inplace variant of the binary f32 map op
+        struct ggml_context         * ctx,
+        struct ggml_tensor          * a,
+        struct ggml_tensor          * b,
+        const  ggml_binary_op_f32_t   fun) {
+    return ggml_map_binary_impl_f32(ctx, a, b, fun, false); // inplace = false
+}
+
+struct ggml_tensor * ggml_map_binary_inplace_f32( // inplace variant of the binary f32 map op
+        struct ggml_context         * ctx,
+        struct ggml_tensor          * a,
+        struct ggml_tensor          * b,
+        const  ggml_binary_op_f32_t   fun) {
+    return ggml_map_binary_impl_f32(ctx, a, b, fun, true); // inplace = true
+}
+
+// ggml_map_custom1_f32: build a GGML_OP_MAP_CUSTOM1_F32 node with `a` as src[0] and `fun` recorded in the op params
+// (the result is made with ggml_view_tensor(ctx, a) when `inplace` is true, otherwise with ggml_dup_tensor(ctx, a))
+static struct ggml_tensor * ggml_map_custom1_impl_f32(
+        struct ggml_context          * ctx,
+        struct ggml_tensor           * a,
+        const  ggml_custom1_op_f32_t   fun,
+        bool                           inplace) {
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    ggml_set_op_params(result, (const void *) &fun, sizeof(fun)); // passes the address and size of the `fun` parameter itself
+
+    result->op     = GGML_OP_MAP_CUSTOM1_F32;
+    result->src[0] = a;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_map_custom1_f32( // non-inplace variant of the one-source custom f32 op
+        struct ggml_context          * ctx,
+        struct ggml_tensor           * a,
+        const  ggml_custom1_op_f32_t   fun) {
+    return ggml_map_custom1_impl_f32(ctx, a, fun, false); // inplace = false
+}
+
+struct ggml_tensor * ggml_map_custom1_inplace_f32( // inplace variant of the one-source custom f32 op
+        struct ggml_context          * ctx,
+        struct ggml_tensor           * a,
+        const  ggml_custom1_op_f32_t   fun) {
+    return ggml_map_custom1_impl_f32(ctx, a, fun, true); // inplace = true
+}
+
+// ggml_map_custom2_f32: build a GGML_OP_MAP_CUSTOM2_F32 node with `a` as src[0], `b` as src[1] and `fun` in the op params
+// (the result is made with ggml_view_tensor(ctx, a) when `inplace` is true, otherwise with ggml_dup_tensor(ctx, a))
+static struct ggml_tensor * ggml_map_custom2_impl_f32(
+        struct ggml_context          * ctx,
+        struct ggml_tensor           * a,
+        struct ggml_tensor           * b,
+        const  ggml_custom2_op_f32_t   fun,
+        bool                           inplace) {
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); // no shape check on `b` is done in this function
+
+    ggml_set_op_params(result, (const void *) &fun, sizeof(fun)); // passes the address and size of the `fun` parameter itself
+
+    result->op     = GGML_OP_MAP_CUSTOM2_F32;
+    result->src[0] = a;
+    result->src[1] = b;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_map_custom2_f32( // non-inplace variant of the two-source custom f32 op
+        struct ggml_context          * ctx,
+        struct ggml_tensor           * a,
+        struct ggml_tensor           * b,
+        const  ggml_custom2_op_f32_t   fun) {
+    return ggml_map_custom2_impl_f32(ctx, a, b, fun, false); // inplace = false
+}
+
+struct ggml_tensor * ggml_map_custom2_inplace_f32( // inplace variant of the two-source custom f32 op
+        struct ggml_context          * ctx,
+        struct ggml_tensor           * a,
+        struct ggml_tensor           * b,
+        const  ggml_custom2_op_f32_t   fun) {
+    return ggml_map_custom2_impl_f32(ctx, a, b, fun, true); // inplace = true
+}
+
+// ggml_map_custom3_f32: build a GGML_OP_MAP_CUSTOM3_F32 node with `a`, `b`, `c` as src[0..2] and `fun` in the op params
+// (the result is made with ggml_view_tensor(ctx, a) when `inplace` is true, otherwise with ggml_dup_tensor(ctx, a))
+static struct ggml_tensor * ggml_map_custom3_impl_f32(
+        struct ggml_context          * ctx,
+        struct ggml_tensor           * a,
+        struct ggml_tensor           * b,
+        struct ggml_tensor           * c,
+        const  ggml_custom3_op_f32_t   fun,
+        bool                           inplace) {
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); // no shape check on `b` or `c` is done in this function
+
+    ggml_set_op_params(result, (const void *) &fun, sizeof(fun)); // passes the address and size of the `fun` parameter itself
+
+    result->op     = GGML_OP_MAP_CUSTOM3_F32;
+    result->src[0] = a;
+    result->src[1] = b;
+    result->src[2] = c;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_map_custom3_f32( // non-inplace variant of the three-source custom f32 op
+        struct ggml_context          * ctx,
+        struct ggml_tensor           * a,
+        struct ggml_tensor           * b,
+        struct ggml_tensor           * c,
+        const  ggml_custom3_op_f32_t   fun) {
+    return ggml_map_custom3_impl_f32(ctx, a, b, c, fun, false); // inplace = false
+}
+
+struct ggml_tensor * ggml_map_custom3_inplace_f32( // inplace variant of the three-source custom f32 op
+        struct ggml_context          * ctx,
+        struct ggml_tensor           * a,
+        struct ggml_tensor           * b,
+        struct ggml_tensor           * c,
+        const  ggml_custom3_op_f32_t   fun) {
+    return ggml_map_custom3_impl_f32(ctx, a, b, c, fun, true); // inplace = true
+}
+
 // ggml_map_custom1

 static struct ggml_tensor * ggml_map_custom1_impl(
@@ -4855,7 +5033,7 @@ static struct ggml_tensor * ggml_map_custom1_impl(
        /*.n_tasks  =*/ n_tasks,
        /*.userdata =*/ userdata
    };
-    ggml_set_op_params(result, &params, sizeof(params));
+    ggml_set_op_params(result, (const void *) &params, sizeof(params));

    result->op     = GGML_OP_MAP_CUSTOM1;
    result->src[0] = a;
@@ -4900,7 +5078,7 @@ static struct ggml_tensor * ggml_map_custom2_impl(
        /*.n_tasks  =*/ n_tasks,
        /*.userdata =*/ userdata
    };
-    ggml_set_op_params(result, &params, sizeof(params));
+    ggml_set_op_params(result, (const void *) &params, sizeof(params));

    result->op     = GGML_OP_MAP_CUSTOM2;
    result->src[0] = a;
@@ -4949,7 +5127,7 @@ static struct ggml_tensor * ggml_map_custom3_impl(
        /*.n_tasks  =*/ n_tasks,
        /*.userdata =*/ userdata
    };
-    ggml_set_op_params(result, &params, sizeof(params));
+    ggml_set_op_params(result, (const void *) &params, sizeof(params));

    result->op     = GGML_OP_MAP_CUSTOM3;
    result->src[0] = a;
@@ -4981,66 +5159,6 @@ struct ggml_tensor * ggml_map_custom3_inplace(
    return ggml_map_custom3_impl(ctx, a, b, c, fun, n_tasks, userdata, true);
 }

-struct ggml_tensor * ggml_custom_4d(
-        struct ggml_context * ctx,
-        enum ggml_type        type,
-        int64_t               ne0,
-        int64_t               ne1,
-        int64_t               ne2,
-        int64_t               ne3,
-        struct ggml_tensor ** args,
-        int                   n_args,
-        ggml_custom_op_t      fun,
-        int                   n_tasks,
-        void                * userdata) {
-
-    GGML_ASSERT(n_args < GGML_MAX_SRC);
-
-    struct ggml_tensor * result = ggml_new_tensor_4d(ctx, type, ne0, ne1, ne2, ne3);
-
-    struct ggml_custom_op_params params = {
-        /*.fun      =*/ fun,
-        /*.n_tasks  =*/ n_tasks,
-        /*.userdata =*/ userdata
-    };
-    ggml_set_op_params(result, &params, sizeof(params));
-
-    result->op = GGML_OP_CUSTOM;
-    for (int i = 0; i < n_args; i++) {
-        result->src[i] = args[i];
-    }
-
-    return result;
-}
-
-struct ggml_tensor * ggml_custom_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor ** args,
-        int                   n_args,
-        ggml_custom_op_t      fun,
-        int                   n_tasks,
-        void                * userdata) {
-
-    GGML_ASSERT(n_args < GGML_MAX_SRC - 1);
-
-    struct ggml_tensor * result = ggml_view_tensor(ctx, a);
-
-    struct ggml_custom_op_params params = {
-        /*.fun      =*/ fun,
-        /*.n_tasks  =*/ n_tasks,
-        /*.userdata =*/ userdata
-    };
-    ggml_set_op_params(result, &params, sizeof(params));
-
-    result->op = GGML_OP_CUSTOM;
-    result->src[0] = a;
-    for (int i = 0; i < n_args; i++) {
-        result->src[i + 1] = args[i];
-    }
-
-    return result;
-}
 // ggml_cross_entropy_loss

 struct ggml_tensor * ggml_cross_entropy_loss(
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -30,7 +30,6 @@ class TensorNameMap:
            "rwkv.embeddings",                           # rwkv6
            "model.embeddings",                          # rwkv7
            "model.word_embeddings",                     # bailingmoe
-            "language_model.model.embed_tokens",         # llama4
        ),

        # Token type embeddings
@@ -68,7 +67,6 @@ class TensorNameMap:
            "output_layer",              # chatglm
            "head",                      # rwkv
            "head.out",                  # wavtokenizer
-            "language_model.lm_head",    # llama4
        ),

        # Output norm
@@ -91,7 +89,6 @@ class TensorNameMap:
            "rwkv.ln_out",                             # rwkv6
            "model.ln_out",                            # rwkv7
            "backbone.final_layer_norm",               # wavtokenizer
-            "language_model.model.norm",               # llama4
        ),

        # Rope frequencies
@@ -133,7 +130,6 @@ class TensorNameMap:
            "transformer.layers.{bid}.attn_norm",                   # openelm
            "rwkv.blocks.{bid}.ln1",                                # rwkv6
            "model.layers.{bid}.ln1",                               # rwkv7
-            "language_model.model.layers.{bid}.input_layernorm",    # llama4
        ),

        # Attention norm 2
@@ -173,7 +169,6 @@ class TensorNameMap:
            "model.layers.{bid}.attention.wq",                           # internlm2
            "transformer.decoder_layer.{bid}.multi_head_attention.query",# Grok
            "transformer.h.{bid}.attn.attention.q_proj",                 # exaone
-            "language_model.model.layers.{bid}.self_attn.q_proj",        # llama4
        ),

        # Attention key
@@ -188,7 +183,6 @@ class TensorNameMap:
            "model.layers.{bid}.attention.wk",                         # internlm2
            "transformer.decoder_layer.{bid}.multi_head_attention.key",# Grok
            "transformer.h.{bid}.attn.attention.k_proj",               # exaone
-            "language_model.model.layers.{bid}.self_attn.k_proj",      # llama4
        ),

        # Attention value
@@ -202,7 +196,6 @@ class TensorNameMap:
            "model.layers.{bid}.attention.wv",                           # internlm2
            "transformer.decoder_layer.{bid}.multi_head_attention.value",# Grok
            "transformer.h.{bid}.attn.attention.v_proj",                 # exaone
-            "language_model.model.layers.{bid}.self_attn.v_proj",        # llama4
        ),

        # Attention output
@@ -229,7 +222,6 @@ class TensorNameMap:
            "encoder.layers.{bid}.self_attention.dense",                    # chatglm
            "transformer.layers.{bid}.attn.out_proj",                       # openelm
            "transformer.h.{bid}.attn.attention.out_proj",                  # exaone
-            "language_model.model.layers.{bid}.self_attn.o_proj",           # llama4
        ),

        # Attention output norm
@@ -267,7 +259,6 @@ class TensorNameMap:
            "transformer.decoder_layer.{bid}.rms_norm_2",                    # Grok
            "encoder.layers.{bid}.post_attention_layernorm",                 # chatglm
            "transformer.layers.{bid}.ffn_norm",                             # openelm
-            "language_model.model.layers.{bid}.post_attention_layernorm",    # llama4
        ),

        # Post feed-forward norm
@@ -287,7 +278,6 @@ class TensorNameMap:
            "transformer.decoder_layer.{bid}.router",           # Grok
            "transformer.blocks.{bid}.ffn.router.layer",        # dbrx
            "model.layers.{bid}.block_sparse_moe.router.layer", # granitemoe
-            "language_model.model.layers.{bid}.feed_forward.router", # llama4
        ),

        MODEL_TENSOR.FFN_GATE_INP_SHEXP: (
@@ -325,7 +315,6 @@ class TensorNameMap:
            "model.layers.{bid}.residual_mlp.w3",                     # arctic
            "encoder.layers.{bid}.mlp.dense_h_to_4h",                 # chatglm
            "transformer.h.{bid}.mlp.c_fc_1",                         # exaone
-            "language_model.model.layers.{bid}.feed_forward.up_proj", # llama4
        ),

        MODEL_TENSOR.FFN_UP_EXP: (
@@ -334,13 +323,11 @@ class TensorNameMap:
            "transformer.blocks.{bid}.ffn.experts.mlp.v1",    # dbrx
            "model.layers.{bid}.mlp.experts.up_proj",         # qwen2moe olmoe (merged)
            "model.layers.{bid}.block_sparse_moe.experts.w3", # phimoe (merged)
-            "language_model.model.layers.{bid}.feed_forward.experts.up_proj", # llama4
        ),

        MODEL_TENSOR.FFN_UP_SHEXP: (
            "model.layers.{bid}.mlp.shared_expert.up_proj",  # qwen2moe
            "model.layers.{bid}.mlp.shared_experts.up_proj", # deepseek deepseek2
-            "language_model.model.layers.{bid}.feed_forward.shared_expert.up_proj", # llama4
        ),

        # AWQ-activation gate
@@ -361,7 +348,6 @@ class TensorNameMap:
            "transformer.h.{bid}.mlp.linear_1",           # refact
            "model.layers.{bid}.residual_mlp.w1",         # arctic
            "transformer.h.{bid}.mlp.c_fc_0",             # exaone
-            "language_model.model.layers.{bid}.feed_forward.gate_proj", # llama4
        ),

        MODEL_TENSOR.FFN_GATE_EXP: (
@@ -370,13 +356,11 @@ class TensorNameMap:
            "transformer.blocks.{bid}.ffn.experts.mlp.w1",    # dbrx
            "model.layers.{bid}.mlp.experts.gate_proj",       # qwen2moe olmoe (merged)
            "model.layers.{bid}.block_sparse_moe.experts.w1", # phimoe (merged)
-            "language_model.model.layers.{bid}.feed_forward.experts.gate_proj", # llama4
        ),

        MODEL_TENSOR.FFN_GATE_SHEXP: (
            "model.layers.{bid}.mlp.shared_expert.gate_proj",  # qwen2moe
            "model.layers.{bid}.mlp.shared_experts.gate_proj", # deepseek deepseek2
-            "language_model.model.layers.{bid}.feed_forward.shared_expert.gate_proj", # llama4
        ),

        # Feed-forward down
@@ -405,7 +389,6 @@ class TensorNameMap:
            "encoder.layer.{bid}.mlp.down_layer",                     # jina-bert-v2
            "encoder.layers.{bid}.mlp.dense_4h_to_h",                 # chatglm
            "model.layers.h.{bid}.mlp.c_proj",                        # exaone
-            "language_model.model.layers.{bid}.feed_forward.down_proj", # llama4
        ),

        MODEL_TENSOR.FFN_DOWN_EXP: (
@@ -415,13 +398,11 @@ class TensorNameMap:
            "model.layers.{bid}.mlp.experts.down_proj",          # qwen2moe olmoe (merged)
            "model.layers.{bid}.block_sparse_moe.output_linear", # granitemoe
            "model.layers.{bid}.block_sparse_moe.experts.w2",    # phimoe (merged)
-            "language_model.model.layers.{bid}.feed_forward.experts.down_proj", # llama4
        ),

        MODEL_TENSOR.FFN_DOWN_SHEXP: (
            "model.layers.{bid}.mlp.shared_expert.down_proj",  # qwen2moe
            "model.layers.{bid}.mlp.shared_experts.down_proj", # deepseek deepseek2
-            "language_model.model.layers.{bid}.feed_forward.shared_expert.down_proj", # llama4
        ),

        MODEL_TENSOR.ATTN_Q_NORM: (
--- a/gguf-py/gguf/utility.py
+++ b/gguf-py/gguf/utility.py
@@ -1,11 +1,7 @@
 from __future__ import annotations

-from dataclasses import dataclass
 from typing import Literal

-import os
-import json
-

 def fill_templated_filename(filename: str, output_type: str | None) -> str:
    # Given a file name fill in any type templates e.g. 'some-model-name.{ftype}.gguf'
@@ -71,194 +67,3 @@ def naming_convention(model_name: str | None, base_name: str | None, finetune_st
    kind = f"-{model_type.strip().replace(' ', '-')}" if model_type is not None else ""

    return f"{name}{parameters}{finetune}{version}{encoding}{kind}"
-
-
-@dataclass
-class RemoteTensor:
-    dtype: str
-    shape: tuple[int, ...]
-    offset_start: int
-    size: int
-    url: str
-
-    def data(self) -> bytearray:
-        # TODO: handle request errors (maybe with limited retries?)
-        # NOTE: using a bytearray, otherwise PyTorch complains the buffer is not writeable
-        data = bytearray(SafetensorRemote.get_data_by_range(url=self.url, start=self.offset_start, size=self.size))
-        return data
-
-
-class SafetensorRemote:
-    """
-    Uility class to handle remote safetensor files.
-    This class is designed to work with Hugging Face model repositories.
-
-    Example (one model has single safetensor file, the other has multiple):
-        for model_id in ["ngxson/TEST-Tiny-Llama4", "Qwen/Qwen2.5-7B-Instruct"]:
-            tensors = SafetensorRemote.get_list_tensors_hf_model(model_id)
-            print(tensors)
-
-    Example reading tensor data:
-        tensors = SafetensorRemote.get_list_tensors_hf_model(model_id)
-        for name, meta in tensors.items():
-            dtype, shape, offset_start, size, remote_safetensor_url = meta
-            # read the tensor data
-            data = SafetensorRemote.get_data_by_range(remote_safetensor_url, offset_start, size)
-            print(data)
-    """
-
-    BASE_DOMAIN = "https://huggingface.co"
-    ALIGNMENT = 8 # bytes
-
-    @classmethod
-    def get_list_tensors_hf_model(cls, model_id: str) -> dict[str, RemoteTensor]:
-        """
-        Get list of tensors from a Hugging Face model repository.
-
-        Returns a dictionary of tensor names and their metadata.
-        Each tensor is represented as a tuple of (dtype, shape, offset_start, size, remote_safetensor_url)
-        """
-        # case 1: model has only one single model.safetensor file
-        is_single_file = cls.check_file_exist(f"{cls.BASE_DOMAIN}/{model_id}/resolve/main/model.safetensors")
-        if is_single_file:
-            url = f"{cls.BASE_DOMAIN}/{model_id}/resolve/main/model.safetensors"
-            return cls.get_list_tensors(url)
-
-        # case 2: model has multiple files
-        index_url = f"{cls.BASE_DOMAIN}/{model_id}/resolve/main/model.safetensors.index.json"
-        is_multiple_files = cls.check_file_exist(index_url)
-        if is_multiple_files:
-            # read the index file
-            index_data = cls.get_data_by_range(index_url, 0)
-            index_str = index_data.decode('utf-8')
-            index_json = json.loads(index_str)
-            assert index_json.get("weight_map") is not None, "weight_map not found in index file"
-            weight_map = index_json["weight_map"]
-            # get the list of files
-            all_files = list(set(weight_map.values()))
-            all_files.sort() # make sure we load shard files in order
-            # get the list of tensors
-            tensors: dict[str, RemoteTensor] = {}
-            for file in all_files:
-                url = f"{cls.BASE_DOMAIN}/{model_id}/resolve/main/{file}"
-                for key, val in cls.get_list_tensors(url).items():
-                    tensors[key] = val
-            return tensors
-
-        raise ValueError(f"Model {model_id} does not have any safetensor files")
-
-    @classmethod
-    def get_list_tensors(cls, url: str) -> dict[str, RemoteTensor]:
-        """
-        Get list of tensors from a remote safetensor file.
-
-        Returns a dictionary of tensor names and their metadata.
-        Each tensor is represented as a tuple of (dtype, shape, offset_start, size)
-        """
-        metadata, data_start_offset = cls.get_metadata(url)
-        res: dict[str, RemoteTensor] = {}
-
-        for name, meta in metadata.items():
-            if name == "__metadata__":
-                continue
-            if not isinstance(meta, dict):
-                raise ValueError(f"Invalid metadata for tensor '{name}': {meta}")
-            try:
-                dtype = meta["dtype"]
-                shape = meta["shape"]
-                offset_start_relative, offset_end_relative = meta["data_offsets"]
-                size = offset_end_relative - offset_start_relative
-                offset_start = data_start_offset + offset_start_relative
-                res[name] = RemoteTensor(dtype=dtype, shape=tuple(shape), offset_start=offset_start, size=size, url=url)
-            except KeyError as e:
-                raise ValueError(f"Missing key in metadata for tensor '{name}': {e}, meta = {meta}")
-
-        return res
-
-    @classmethod
-    def get_metadata(cls, url: str) -> tuple[dict, int]:
-        """
-        Get JSON metadata from a remote safetensor file.
-
-        Returns tuple of (metadata, data_start_offset)
-        """
-        # Request first 5MB of the file (hopefully enough for metadata)
-        read_size = 5 * 1024 * 1024
-        raw_data = cls.get_data_by_range(url, 0, read_size)
-
-        # Parse header
-        # First 8 bytes contain the metadata length as u64 little-endian
-        if len(raw_data) < 8:
-            raise ValueError("Not enough data to read metadata size")
-        metadata_length = int.from_bytes(raw_data[:8], byteorder='little')
-
-        # Calculate the data start offset
-        data_start_offset = 8 + metadata_length
-        alignment = SafetensorRemote.ALIGNMENT
-        if data_start_offset % alignment != 0:
-            data_start_offset += alignment - (data_start_offset % alignment)
-
-        # Check if we have enough data to read the metadata
-        if len(raw_data) < 8 + metadata_length:
-            raise ValueError(f"Could not read complete metadata. Need {8 + metadata_length} bytes, got {len(raw_data)}")
-
-        # Extract metadata bytes and parse as JSON
-        metadata_bytes = raw_data[8:8 + metadata_length]
-        metadata_str = metadata_bytes.decode('utf-8')
-        try:
-            metadata = json.loads(metadata_str)
-            return metadata, data_start_offset
-        except json.JSONDecodeError as e:
-            raise ValueError(f"Failed to parse safetensor metadata as JSON: {e}")
-
-    @classmethod
-    def get_data_by_range(cls, url: str, start: int, size: int = -1) -> bytes:
-        """
-        Get raw byte data from a remote file by range.
-        If size is not specified, it will read the entire file.
-        """
-        import requests
-        from urllib.parse import urlparse
-
-        parsed_url = urlparse(url)
-        if not parsed_url.scheme or not parsed_url.netloc:
-            raise ValueError(f"Invalid URL: {url}")
-
-        headers = cls._get_request_headers()
-        if size > -1:
-            headers["Range"] = f"bytes={start}-{start + size}"
-        response = requests.get(url, allow_redirects=True, headers=headers)
-        response.raise_for_status()
-
-        # Get raw byte data
-        return response.content[:size]
-
-    @classmethod
-    def check_file_exist(cls, url: str) -> bool:
-        """
-        Check if a file exists at the given URL.
-        Returns True if the file exists, False otherwise.
-        """
-        import requests
-        from urllib.parse import urlparse
-
-        parsed_url = urlparse(url)
-        if not parsed_url.scheme or not parsed_url.netloc:
-            raise ValueError(f"Invalid URL: {url}")
-
-        try:
-            headers = cls._get_request_headers()
-            headers["Range"] = "bytes=0-0"
-            response = requests.head(url, allow_redirects=True, headers=headers)
-            # Success (2xx) or redirect (3xx)
-            return 200 <= response.status_code < 400
-        except requests.RequestException:
-            return False
-
-    @classmethod
-    def _get_request_headers(cls) -> dict[str, str]:
-        """Prepare common headers for requests."""
-        headers = {"User-Agent": "convert_hf_to_gguf"}
-        if os.environ.get("HF_TOKEN"):
-            headers["Authorization"] = f"Bearer {os.environ['HF_TOKEN']}"
-        return headers
--- a/scripts/sync-ggml-am.sh
+++ b/scripts/sync-ggml-am.sh
@@ -158,13 +158,13 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
    # scripts/gen-authors.sh -> scripts/gen-authors.sh

    cat ggml-src.patch | sed -E \
-        -e 's/([[:space:]]| [ab]\/)CMakeLists.txt/\1ggml\/CMakeLists.txt/g' \
-        -e 's/([[:space:]]| [ab]\/)src\/CMakeLists.txt/\1ggml\/src\/CMakeLists.txt/g' \
-        -e 's/([[:space:]]| [ab]\/)cmake\/BuildTypes.cmake/\1ggml\/cmake\/BuildTypes.cmake/g' \
-        -e 's/([[:space:]]| [ab]\/)cmake\/GitVars.cmake/\1ggml\/cmake\/GitVars.cmake/g' \
-        -e 's/([[:space:]]| [ab]\/)cmake\/common.cmake/\1ggml\/cmake\/common.cmake/g' \
-        -e 's/([[:space:]]| [ab]\/)cmake\/ggml-config.cmake.in/\1ggml\/cmake\/ggml-config.cmake.in/g' \
-        -e 's/([[:space:]]| [ab]\/)src\/ggml-cpu\/cmake\/FindSIMD.cmake/\1ggml\/src\/ggml-cpu\/cmake\/FindSIMD.cmake/g' \
+        -e 's/(^[[:space:]]| [ab]\/)CMakeLists.txt/\1ggml\/CMakeLists.txt/g' \
+        -e 's/(^[[:space:]]| [ab]\/)src\/CMakeLists.txt/\1ggml\/src\/CMakeLists.txt/g' \
+        -e 's/(^[[:space:]]| [ab]\/)cmake\/BuildTypes.cmake/\1ggml\/cmake\/BuildTypes.cmake/g' \
+        -e 's/(^[[:space:]]| [ab]\/)cmake\/GitVars.cmake/\1ggml\/cmake\/GitVars.cmake/g' \
+        -e 's/(^[[:space:]]| [ab]\/)cmake\/common.cmake/\1ggml\/cmake\/common.cmake/g' \
+        -e 's/(^[[:space:]]| [ab]\/)cmake\/ggml-config.cmake.in/\1ggml\/cmake\/ggml-config.cmake.in/g' \
+        -e 's/(^[[:space:]]| [ab]\/)src\/ggml-cpu\/cmake\/FindSIMD.cmake/\1ggml\/src\/ggml-cpu\/cmake\/FindSIMD.cmake/g' \
        -e 's/([[:space:]]| [ab]\/)src\/ggml(.*)\.c/\1ggml\/src\/ggml\2.c/g' \
        -e 's/([[:space:]]| [ab]\/)src\/ggml(.*)\.cpp/\1ggml\/src\/ggml\2.cpp/g' \
        -e 's/([[:space:]]| [ab]\/)src\/ggml(.*)\.h/\1ggml\/src\/ggml\2.h/g' \
@@ -180,11 +180,11 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
        -e 's/([[:space:]]| [ab]\/)src\/ggml-rpc\//\1ggml\/src\/ggml-rpc\//g' \
        -e 's/([[:space:]]| [ab]\/)src\/ggml-sycl\//\1ggml\/src\/ggml-sycl\//g' \
        -e 's/([[:space:]]| [ab]\/)src\/ggml-vulkan\//\1ggml\/src\/ggml-vulkan\//g' \
-        -e 's/([[:space:]]| [ab]\/)include\/ggml(.*)\.h/\1ggml\/include\/ggml\2.h/g' \
-        -e 's/([[:space:]]| [ab]\/)include\/gguf(.*)\.h/\1ggml\/include\/gguf\2.h/g' \
-        -e 's/([[:space:]]| [ab]\/)tests\/(.*)\.cpp/\1tests\/\2.cpp/g' \
-        -e 's/([[:space:]]| [ab]\/)LICENSE/\1LICENSE/g' \
-        -e 's/([[:space:]]| [ab]\/)scripts\/gen-authors\.sh/\1scripts\/gen-authors.sh/g' \
+        -e 's/^([[:space:]]| [ab]\/)include\/ggml(.*)\.h/\1ggml\/include\/ggml\2.h/g' \
+        -e 's/^([[:space:]]| [ab]\/)include\/gguf(.*)\.h/\1ggml\/include\/gguf\2.h/g' \
+        -e 's/^([[:space:]]| [ab]\/)tests\/(.*)\.cpp/\1tests\/\2.cpp/g' \
+        -e 's/^([[:space:]]| [ab]\/)LICENSE/\1LICENSE/g' \
+        -e 's/^([[:space:]]| [ab]\/)scripts\/gen-authors\.sh/\1scripts\/gen-authors.sh/g' \
        > ggml-src.patch.tmp
    mv ggml-src.patch.tmp ggml-src.patch

--- a/scripts/sync-ggml.last
+++ b/scripts/sync-ggml.last
@@ -1 +1 @@
-2abf606f098844faebee578996cae9c6d63a40e2
+70e85f61f1fdcd1064a1e032ff564d5b5e67560c
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -4440,8 +4440,8 @@ struct llm_build_llama : public llm_graph_context {

                if (arch == LLM_ARCH_LLAMA4 && use_rope && hparams.use_kq_norm) {
                    // Llama4TextL2Norm
-                    Qcur = ggml_rms_norm(ctx0, Qcur, hparams.f_norm_rms_eps);
-                    Kcur = ggml_rms_norm(ctx0, Kcur, hparams.f_norm_rms_eps);
+                    Qcur = ggml_rms_norm(ctx0, Qcur, 1e-6);
+                    Kcur = ggml_rms_norm(ctx0, Kcur, 1e-6);
                    cb(Qcur, "Qcur_normed", il);
                    cb(Kcur, "Kcur_normed", il);
                }
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -271,14 +271,6 @@ static std::string var_to_str(ggml_op_pool pool) {
    }
 }

-static std::string var_to_str(ggml_scale_mode mode) {
-    switch (mode) {
-        case GGML_SCALE_MODE_NEAREST:  return "nearest";
-        case GGML_SCALE_MODE_BILINEAR: return "bilinear";
-        default:                      return std::to_string(mode);
-    }
-}
-
 #define VAR_TO_STR(x) (#x "=" + var_to_str(x))

 #define VARS_TO_STR1(a) VAR_TO_STR(a)
@@ -2956,16 +2948,15 @@ struct test_upscale : public test_case {
    const std::array<int64_t, 4> ne;
    const int32_t scale_factor;
    const bool transpose;
-    const ggml_scale_mode mode;

    std::string vars() override {
-        return VARS_TO_STR5(type, ne, scale_factor, mode, transpose);
+        return VARS_TO_STR4(type, ne, scale_factor, transpose);
    }

    test_upscale(ggml_type type = GGML_TYPE_F32,
            std::array<int64_t, 4> ne = {512, 512, 3, 1},
-            int32_t scale_factor = 2, ggml_scale_mode mode = GGML_SCALE_MODE_NEAREST, bool transpose = false)
-        : type(type), ne(ne), scale_factor(scale_factor), transpose(transpose), mode(mode) {}
+            int32_t scale_factor = 2, bool transpose = false)
+        : type(type), ne(ne), scale_factor(scale_factor), transpose(transpose) {}

    ggml_tensor * build_graph(ggml_context * ctx) override {
        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
@@ -2976,7 +2967,7 @@ struct test_upscale : public test_case {
            ggml_set_name(a, "a_transposed");
        }

-        ggml_tensor * out = ggml_upscale(ctx, a, scale_factor, mode);
+        ggml_tensor * out = ggml_upscale(ctx, a, scale_factor);
        ggml_set_name(out, "out");

        return out;
@@ -2988,23 +2979,21 @@ struct test_upscale_ext : public test_case {
    const ggml_type type;
    const std::array<int64_t, 4> ne;
    const std::array<int64_t, 4> ne_tgt;
-    const ggml_scale_mode mode = GGML_SCALE_MODE_NEAREST;

    std::string vars() override {
-        return VARS_TO_STR4(type, ne, ne_tgt, mode);
+        return VARS_TO_STR3(type, ne, ne_tgt);
    }

    test_upscale_ext(ggml_type type = GGML_TYPE_F32,
            std::array<int64_t, 4> ne     = {2, 5,  7, 11},
-            std::array<int64_t, 4> ne_tgt = {5, 7, 11, 13},
-            ggml_scale_mode mode = GGML_SCALE_MODE_NEAREST)
-        : type(type), ne(ne), ne_tgt(ne_tgt), mode(mode) {}
+            std::array<int64_t, 4> ne_tgt = {5, 7, 11, 13})
+        : type(type), ne(ne), ne_tgt(ne_tgt) {}

    ggml_tensor * build_graph(ggml_context * ctx) override {
        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
        ggml_set_name(a, "a");

-        ggml_tensor * out = ggml_upscale_ext(ctx, a, ne_tgt[0], ne_tgt[1],ne_tgt[2], ne_tgt[3], mode);
+        ggml_tensor * out = ggml_upscale_ext(ctx, a, ne_tgt[0], ne_tgt[1],ne_tgt[2], ne_tgt[3]);
        ggml_set_name(out, "out");

        return out;
@@ -4410,15 +4399,12 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
        test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {60, 10, 10, 10}, order)); // qwen
    }

-    for (ggml_scale_mode mode : {GGML_SCALE_MODE_NEAREST, GGML_SCALE_MODE_BILINEAR}) {
-        test_cases.emplace_back(new test_upscale(GGML_TYPE_F32, {512, 512, 3, 2}, 2, mode));
-        test_cases.emplace_back(new test_upscale(GGML_TYPE_F32, {512, 512, 3, 2}, 2, mode, true));
-        test_cases.emplace_back(new test_upscale_ext(GGML_TYPE_F32, {2, 5,  7, 11}, {5, 7, 11, 13}, mode));
-    }
-
    test_cases.emplace_back(new test_sum());
    test_cases.emplace_back(new test_sum_rows());
    test_cases.emplace_back(new test_mean());
+    test_cases.emplace_back(new test_upscale());
+    test_cases.emplace_back(new test_upscale(GGML_TYPE_F32, { 512, 512, 3, 1 }, 2, true));
+    test_cases.emplace_back(new test_upscale_ext());
    test_cases.emplace_back(new test_group_norm(GGML_TYPE_F32, {64, 64, 320, 1}));
    test_cases.emplace_back(new test_group_norm(GGML_TYPE_F32, {9, 9, 1280, 1}));
    test_cases.emplace_back(new test_acc());