Compare commits

...

3 Commits

Author SHA1 Message Date
Xuan Son Nguyen
c5b682b25c various clean up 2026-04-13 17:39:14 +02:00
Xuan Son Nguyen
f558360b32 Merge branch 'master' into video-support 2026-04-13 15:40:05 +02:00
andrewmd5
573f2cf58e feat: add video support for Qwen3.5 2026-03-06 21:37:07 +09:00
9 changed files with 191 additions and 26 deletions

View File

@@ -32,6 +32,9 @@ struct clip_graph {
float kq_scale; // TODO: maybe move this to hparams
const clip_flash_attn_type flash_attn_type;
// TODO [QWEN_VIDEO]: improve this in the future
int nt = 1; // number of temporal dim, to be used by Qwen-VL models
ggml_context_ptr ctx0_ptr;
ggml_context * ctx0;
ggml_cgraph * gf;

View File

@@ -448,6 +448,7 @@ struct clip_image_u8_batch {
struct clip_image_f32_batch {
std::vector<clip_image_f32_ptr> entries;
bool is_audio = false;
bool is_seq = true;
// for llava-uhd style models, we need to know the grid size
// note: entries.size() == grid_x * grid_y + 1 (one overview image)
@@ -458,6 +459,7 @@ struct clip_image_f32_batch {
clip_image_f32_batch new_batch{
/* entries */ {},
/* is_audio */ is_audio,
/* is_seq */ is_seq,
/* grid_x */ grid_x,
/* grid_y */ grid_y,
};

View File

@@ -515,7 +515,7 @@ ggml_tensor * clip_graph::build_inp() {
}
ggml_tensor * clip_graph::build_inp_raw(int channels) {
ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, img.nx, img.ny, channels);
ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, img.nx, img.ny, channels, nt);
ggml_set_name(inp_raw, "inp_raw");
ggml_set_input(inp_raw);
return inp_raw;
@@ -951,6 +951,9 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
GGML_ABORT("missing cgraph builder");
}
// TODO [QWEN_VIDEO]: improve this in the future
builder->nt = imgs.entries.size();
return builder->build();
}
@@ -3042,10 +3045,11 @@ bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f3
bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, float * vec) {
const clip_image_f32_batch & imgs = *imgs_c_ptr;
int batch_size = imgs.entries.size();
bool support_seq = clip_model_supports_seq_input(ctx);
// TODO @ngxson : implement batch size > 1 as a loop
// we don't need true batching support because the cgraph is going to be big anyway
if (batch_size != 1) {
if (batch_size != 1 && !support_seq) {
return false; // only support batch size of 1
}
@@ -3117,6 +3121,8 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
// └─────┘ │
// ──────┘ x B
// IMPORTANT: [QWEN_VIDEO] the batch dim is currently used for temporal dim in Qwen-VL models
for (size_t i = 0; i < imgs.entries.size(); i++) {
const int nx = imgs.entries[i]->nx;
const int ny = imgs.entries[i]->ny;
@@ -3747,6 +3753,17 @@ bool clip_has_whisper_encoder(const struct clip_ctx * ctx) {
}
}
// true if the model's graph accepts a temporal (sequence-of-frames) input dim;
// currently only the Qwen-VL family supports this
bool clip_model_supports_seq_input(const struct clip_ctx * ctx) {
    const auto proj = ctx->proj_type();
    return proj == PROJECTOR_TYPE_QWEN2VL
        || proj == PROJECTOR_TYPE_QWEN25VL
        || proj == PROJECTOR_TYPE_QWEN3VL;
}
bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
clip_image_f32 clip_img;
clip_img.buf.resize(h * w * 3);

View File

@@ -116,3 +116,6 @@ void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel
bool clip_has_vision_encoder(const struct clip_ctx * ctx);
bool clip_has_audio_encoder(const struct clip_ctx * ctx);
bool clip_has_whisper_encoder(const struct clip_ctx * ctx);
// true if model graph support image->nt (temporal dimension) as input
bool clip_model_supports_seq_input(const struct clip_ctx * ctx);

View File

@@ -26,10 +26,11 @@ struct clip_graph_pixtral : clip_graph {
struct clip_graph_qwen2vl : clip_graph {
clip_graph_qwen2vl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
ggml_tensor * build_inp_with_temporal_merge();
};
struct clip_graph_qwen3vl : clip_graph {
clip_graph_qwen3vl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
struct clip_graph_qwen3vl : clip_graph_qwen2vl {
clip_graph_qwen3vl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph_qwen2vl(ctx, img) {}
ggml_cgraph * build() override;
};

View File

@@ -1,5 +1,31 @@
#include "models.h"
// Build the patch-embedding input for Qwen-VL, merging the temporal dimension.
// - nt == 1 (still image): both patch-embedding convs run over the same frame.
// - nt == 2 (video pair):  conv 0 runs over frame 0, conv 1 over frame 1, and
//   the results are summed — Qwen-VL's pairwise temporal merge.
// Returns the merged conv output; aborts for nt > 2 (not supported yet).
ggml_tensor * clip_graph_qwen2vl::build_inp_with_temporal_merge() {
    ggml_tensor * inp_raw = build_inp_raw();

    // patches are merged 2x2 spatially, so each dim must divide by 2*patch_size
    GGML_ASSERT(img.nx % (patch_size * 2) == 0);
    GGML_ASSERT(img.ny % (patch_size * 2) == 0);

    if (nt == 1) {
        // still image input
        return ggml_add(ctx0,
            ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1),
            ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1));
    }

    if (nt == 2) {
        // 2 frames input (video input)
        // strides are only needed here, so compute them in this branch
        const size_t nb1 = ggml_row_size(inp_raw->type, img.nx);
        const size_t nb2 = nb1 * img.ny;
        // frame f lives at byte offset f * (3 channel planes) in inp_raw
        ggml_tensor * inp_0 = ggml_view_3d(ctx0, inp_raw, img.nx, img.ny, 3, nb1, nb2, 0);
        ggml_tensor * inp_1 = ggml_view_3d(ctx0, inp_raw, img.nx, img.ny, 3, nb1, nb2, nb2 * 3);
        return ggml_add(ctx0,
            ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_0, patch_size, patch_size, 0, 0, 1, 1),
            ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_1, patch_size, patch_size, 0, 0, 1, 1));
    }

    // consistent with the rest of the file (see clip_image_build_graph)
    GGML_ABORT("nt > 2 is not supported");
}
ggml_cgraph * clip_graph_qwen2vl::build() {
GGML_ASSERT(model.patch_bias == nullptr);
GGML_ASSERT(model.class_embedding == nullptr);
@@ -16,17 +42,10 @@ ggml_cgraph * clip_graph_qwen2vl::build() {
int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
ggml_tensor * inp_raw = build_inp_raw();
ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
GGML_ASSERT(img.nx % (patch_size * 2) == 0);
GGML_ASSERT(img.ny % (patch_size * 2) == 0);
ggml_tensor * inp = build_inp_with_temporal_merge();
// second conv dimension
{
auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
inp = ggml_add(ctx0, inp, inp_1);
inp = ggml_permute(ctx0, inp, 1, 2, 0, 3); // [w, h, c, b] -> [c, w, h, b]
inp = ggml_cont_4d(
ctx0, inp,

View File

@@ -13,17 +13,10 @@ ggml_cgraph * clip_graph_qwen3vl::build() {
int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
ggml_tensor * inp_raw = build_inp_raw();
ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
ggml_tensor * inp = build_inp_with_temporal_merge();
GGML_ASSERT(img.nx % (patch_size * 2) == 0);
GGML_ASSERT(img.ny % (patch_size * 2) == 0);
// second conv dimension
// spatial merge
{
auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
inp = ggml_add(ctx0, inp, inp_1);
inp = ggml_permute(ctx0, inp, 1, 2, 0, 3); // [w, h, c, b] -> [c, w, h, b]
inp = ggml_cont_4d(
ctx0, inp,

View File

@@ -25,9 +25,11 @@
// represents raw image data, layout is RGBRGBRGB...
// length of data must be nx * ny * 3
// for sequence of images (i.e. video): data is nt sequential RGB frames, each nx * ny * 3 bytes
struct mtmd_bitmap {
uint32_t nx;
uint32_t ny;
uint32_t nt = 1; // 1 for single images, >= 2 (even) for sequence
std::vector<unsigned char> data;
std::string id; // optional user-defined id, for ex: can be set to image hash, useful for KV cache tracking
bool is_audio = false; // true if the bitmap is audio
@@ -37,8 +39,8 @@ struct mtmd_image_tokens {
uint32_t nx; // number of tokens in x direction
uint32_t ny; // number of tokens in y direction
bool use_mrope_pos = false; // use M-RoPE position counting (the whole image is 1 temporal position)
uint32_t n_tokens() const { return nx * ny; }
clip_image_f32_batch batch_f32; // preprocessed image patches
uint32_t n_tokens() const { return nx * ny; } // TODO [QWEN_VIDEO]: we don't count nt here to be compatible with Qwen-VL, but other models in the future might have different logic
std::string id; // optional user-defined ID, useful for KV cache tracking
mtmd_image_tokens clone() {
@@ -875,6 +877,73 @@ struct mtmd_tokenizer {
return 0;
}
// Tokenize a multi-frame bitmap (video): preprocess each frame separately,
// pack all frames into a single f32 batch (the batch dim doubles as the
// temporal dim for Qwen-VL — see clip_image_batch_encode), and append the
// resulting image chunk. Returns 0 on success, 2 on failure.
int32_t add_seq_image(const mtmd_bitmap * bitmap) {
GGML_ASSERT(ctx->ctx_v);
GGML_ASSERT(bitmap->nt > 1);
// TODO [QWEN_VIDEO]: we only support even frames (Qwen-VL style) for now
GGML_ASSERT(bitmap->nt % 2 == 0);
bool support_seq = clip_model_supports_seq_input(ctx->ctx_v);
if (!support_seq) {
LOG_ERR("%s: error: model does not support sequential image input (usually requires Qwen-VL style models)\n", __func__);
return 2;
}
const uint32_t n_frames = bitmap->nt;
// byte size of one RGB frame (nx * ny * 3 channels)
const size_t frame_bytes = (size_t)bitmap->nx * bitmap->ny * 3;
// preprocess each frame individually
clip_image_f32_batch all_frames;
all_frames.is_seq = true;
all_frames.grid_x = 0; // currently, we don't support tiling for video input
all_frames.grid_y = 0; // currently, we don't support tiling for video input
for (uint32_t f = 0; f < n_frames; f++) {
// copy frame f out of the packed bitmap data into a standalone u8 image
clip_image_u8_ptr img_u8(clip_image_u8_init());
img_u8->nx = bitmap->nx;
img_u8->ny = bitmap->ny;
img_u8->buf.resize(frame_bytes);
std::memcpy(img_u8->buf.data(), bitmap->data.data() + f * frame_bytes, frame_bytes);
clip_image_f32_batch frame_batch;
bool ok = ctx->image_preproc->preprocess(*img_u8, frame_batch);
if (!ok) {
LOG_ERR("Unable to preprocess image\n");
return 2;
}
// no tiling for video input, so each frame preprocesses to exactly one entry
GGML_ASSERT(frame_batch.entries.size() == 1);
all_frames.entries.push_back(std::move(frame_batch.entries[0]));
}
mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
if (mtmd_decode_use_mrope(ctx)) {
// for Qwen2VL, we need this information for M-RoPE decoding positions
// all frames share nx/ny, so the first frame's token grid is representative
image_tokens->nx = clip_n_output_tokens_x(ctx->ctx_v, all_frames.entries[0].get());
image_tokens->ny = clip_n_output_tokens_y(ctx->ctx_v, all_frames.entries[0].get());
image_tokens->use_mrope_pos = true;
} else {
GGML_ASSERT(false && "not supported");
}
image_tokens->batch_f32 = std::move(all_frames);
image_tokens->id = bitmap->id; // optional
LOG_DBG("seq_image: nt=%u, nx=%u, ny=%u, n_tokens=%u\n",
bitmap->nt, image_tokens->nx, image_tokens->ny, image_tokens->n_tokens());
mtmd_input_chunk chunk{
MTMD_INPUT_CHUNK_TYPE_IMAGE,
{}, // text tokens
std::move(image_tokens),
nullptr, // audio tokens
};
cur.entries.emplace_back(std::move(chunk));
// NOTE(review): only img_end is appended here; presumably img_beg was added
// by the caller before the image data — confirm against add_image()
if (!ctx->img_end.empty()) {
add_text(ctx->img_end, true);
}
return 0;
}
std::vector<mtmd_input_chunk> split_batch_to_chunk(clip_image_f32_batch && batch_f32, const std::string & id) {
std::vector<mtmd_input_chunk> chunks;
@@ -993,6 +1062,7 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens)
|| clip_is_glm(ctx_clip)
|| proj_type == PROJECTOR_TYPE_INTERNVL) {
// TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
// video: each entry is one frame pair, encoded with per-frame attention
const auto & entries = image_tokens->batch_f32.entries;
for (size_t i = 0; i < entries.size(); i++) {
int n_tokens_per_image = clip_n_output_tokens(ctx_clip, entries[i].get());
@@ -1075,17 +1145,54 @@ mtmd_bitmap * mtmd_bitmap_init(uint32_t nx,
mtmd_bitmap * bitmap = new mtmd_bitmap;
bitmap->nx = nx;
bitmap->ny = ny;
bitmap->nt = 1;
size_t data_size = (size_t)nx * ny * 3;
bitmap->data.resize(data_size);
std::memcpy(bitmap->data.data(), data, data_size);
return bitmap;
}
// Create a bitmap holding a sequence of nt RGB frames (i.e. video input).
// `data` must contain nt frames of nx*ny*3 bytes each, stored sequentially.
// Returns nullptr if nt == 0; falls back to a single image if nt == 1.
mtmd_bitmap * mtmd_bitmap_init_from_seq(uint32_t nx,
                                        uint32_t ny,
                                        uint32_t nt,
                                        const unsigned char * data) {
    if (nt == 0) {
        LOG_ERR("%s: error: nt must be greater than 0 for sequence input\n", __func__);
        return nullptr;
    }
    if (nt == 1) {
        // if nt == 1, it's not really a sequence, we can treat it as a single image
        return mtmd_bitmap_init(nx, ny, data);
    }
    // TODO [QWEN_VIDEO]: we only support Qwen-VL style for now, which requires even number of frames
    // therefore, we duplicate the last frame if nt is odd, to avoid issues in video preprocessing
    const bool is_odd = (nt % 2 == 1);
    const uint32_t n_src_frames = nt; // frames actually present in the caller's buffer
    if (is_odd) {
        nt += 1;
    }
    const size_t frame_size = (size_t)nx * ny * 3;
    mtmd_bitmap * bitmap = new mtmd_bitmap;
    bitmap->nx = nx;
    bitmap->ny = ny;
    bitmap->nt = nt;
    bitmap->data.resize(frame_size * nt);
    // copy only the frames the caller provided; copying frame_size * nt bytes
    // here would over-read the source buffer by one frame when nt was odd
    std::memcpy(bitmap->data.data(), data, frame_size * n_src_frames);
    if (is_odd) {
        // duplicate the last provided frame into the padding slot
        std::memcpy(bitmap->data.data() + (size_t)(nt - 1) * frame_size,
                    data + (size_t)(n_src_frames - 1) * frame_size,
                    frame_size);
    }
    return bitmap;
}
mtmd_bitmap * mtmd_bitmap_init_from_audio(size_t n_samples,
const float * data) {
mtmd_bitmap * bitmap = new mtmd_bitmap;
bitmap->nx = n_samples;
bitmap->ny = 1;
bitmap->nt = 1;
bitmap->is_audio = true;
size_t data_size = n_samples * sizeof(float);
bitmap->data.resize(data_size);
@@ -1101,6 +1208,10 @@ uint32_t mtmd_bitmap_get_ny(const mtmd_bitmap * bitmap) {
return bitmap->ny;
}
// number of frames in the bitmap: 1 for a still image, >= 2 for a sequence
uint32_t mtmd_bitmap_get_nt(const mtmd_bitmap * bitmap) {
return bitmap->nt;
}
// raw pixel/sample data; layout depends on the bitmap kind (see mtmd_bitmap)
const unsigned char * mtmd_bitmap_get_data(const mtmd_bitmap * bitmap) {
return bitmap->data.data();
}
@@ -1113,6 +1224,10 @@ bool mtmd_bitmap_is_audio(const mtmd_bitmap * bitmap) {
return bitmap->is_audio;
}
// true if the bitmap holds a frame sequence (video), i.e. more than one frame
bool mtmd_bitmap_is_seq(const mtmd_bitmap * bitmap) {
return bitmap->nt >= 2;
}
// optional user-defined id (e.g. content hash), empty string if unset
const char * mtmd_bitmap_get_id(const mtmd_bitmap * bitmap) {
return bitmap->id.c_str();
}
@@ -1255,8 +1370,8 @@ const char * mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens) {
llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens) {
if (image_tokens->use_mrope_pos) {
// for M-RoPE, temporal dimension = max(t,h,w)
// t is omitted as we don't support video input
// for M-RoPE, n_pos = max(t, h, w)
// t is omitted as we don't support batching
return std::max(image_tokens->nx, image_tokens->ny);
}
return image_tokens->n_tokens();

View File

@@ -135,16 +135,23 @@ MTMD_API int mtmd_get_audio_sample_rate(mtmd_context * ctx);
// if bitmap is image:
// length of data must be nx * ny * 3
// the data is in RGBRGBRGB... format
// if bitmap is sequence of images (i.e. video):
// nt is the number of frames
// length of data must be nx * ny * 3 * nt
// frames are sequential RGB, each nx * ny * 3 bytes
// if bitmap is audio:
// length of data must be n_samples * sizeof(float)
// the data is in float format (PCM F32)
MTMD_API mtmd_bitmap * mtmd_bitmap_init (uint32_t nx, uint32_t ny, const unsigned char * data);
MTMD_API mtmd_bitmap * mtmd_bitmap_init_from_seq (uint32_t nx, uint32_t ny, uint32_t nt, const unsigned char * data);
MTMD_API mtmd_bitmap * mtmd_bitmap_init_from_audio(size_t n_samples, const float * data);
MTMD_API uint32_t mtmd_bitmap_get_nx (const mtmd_bitmap * bitmap);
MTMD_API uint32_t mtmd_bitmap_get_ny (const mtmd_bitmap * bitmap);
MTMD_API uint32_t mtmd_bitmap_get_nt (const mtmd_bitmap * bitmap);
MTMD_API const unsigned char * mtmd_bitmap_get_data (const mtmd_bitmap * bitmap);
MTMD_API size_t mtmd_bitmap_get_n_bytes(const mtmd_bitmap * bitmap);
MTMD_API bool mtmd_bitmap_is_audio (const mtmd_bitmap * bitmap);
MTMD_API bool mtmd_bitmap_is_seq (const mtmd_bitmap * bitmap);
MTMD_API void mtmd_bitmap_free (mtmd_bitmap * bitmap);
// bitmap ID is optional, but useful for KV cache tracking
// these getters/setters are dedicated functions, so you can for example calculate the hash of the image based on mtmd_bitmap_get_data()
@@ -277,9 +284,14 @@ struct bitmap {
bitmap(uint32_t nx, uint32_t ny, const unsigned char * data) {
ptr.reset(mtmd_bitmap_init(nx, ny, data));
}
bitmap(uint32_t nx, uint32_t ny, uint32_t nt, const unsigned char * data) {
ptr.reset(mtmd_bitmap_init_from_seq(nx, ny, nt, data));
}
~bitmap() = default;
uint32_t nx() const { return mtmd_bitmap_get_nx(ptr.get()); }
uint32_t ny() const { return mtmd_bitmap_get_ny(ptr.get()); }
uint32_t nx() const { return mtmd_bitmap_get_nx(ptr.get()); }
uint32_t ny() const { return mtmd_bitmap_get_ny(ptr.get()); }
uint32_t nt() const { return mtmd_bitmap_get_nt(ptr.get()); }
bool is_seq() const { return mtmd_bitmap_is_seq(ptr.get()); }
const unsigned char * data() const { return mtmd_bitmap_get_data(ptr.get()); }
size_t n_bytes() const { return mtmd_bitmap_get_n_bytes(ptr.get()); }
std::string id() const { return mtmd_bitmap_get_id(ptr.get()); }