Compare commits

...

3 Commits

Author SHA1 Message Date
Xuan Son Nguyen
c5b682b25c various clean up 2026-04-13 17:39:14 +02:00
Xuan Son Nguyen
f558360b32 Merge branch 'master' into video-support 2026-04-13 15:40:05 +02:00
andrewmd5
573f2cf58e feat: add video support for Qwen3.5 2026-03-06 21:37:07 +09:00
9 changed files with 191 additions and 26 deletions

View File

@@ -32,6 +32,9 @@ struct clip_graph {
float kq_scale; // TODO: maybe move this to hparams
const clip_flash_attn_type flash_attn_type;
// TODO [QWEN_VIDEO]: improve this in the future
int nt = 1; // number of temporal dim, to be used by Qwen-VL models
ggml_context_ptr ctx0_ptr;
ggml_context * ctx0;
ggml_cgraph * gf;

View File

@@ -448,6 +448,7 @@ struct clip_image_u8_batch {
struct clip_image_f32_batch {
std::vector<clip_image_f32_ptr> entries;
bool is_audio = false;
bool is_seq = true;
// for llava-uhd style models, we need to know the grid size
// note: entries.size() == grid_x * grid_y + 1 (one overview image)
@@ -458,6 +459,7 @@ struct clip_image_f32_batch {
clip_image_f32_batch new_batch{
/* entries */ {},
/* is_audio */ is_audio,
/* is_seq */ is_seq,
/* grid_x */ grid_x,
/* grid_y */ grid_y,
};

View File

@@ -515,7 +515,7 @@ ggml_tensor * clip_graph::build_inp() {
}
ggml_tensor * clip_graph::build_inp_raw(int channels) {
ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, img.nx, img.ny, channels);
ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, img.nx, img.ny, channels, nt);
ggml_set_name(inp_raw, "inp_raw");
ggml_set_input(inp_raw);
return inp_raw;
@@ -951,6 +951,9 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
GGML_ABORT("missing cgraph builder");
}
// TODO [QWEN_VIDEO]: improve this in the future
builder->nt = imgs.entries.size();
return builder->build();
}
@@ -3042,10 +3045,11 @@ bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f3
bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, float * vec) {
const clip_image_f32_batch & imgs = *imgs_c_ptr;
int batch_size = imgs.entries.size();
bool support_seq = clip_model_supports_seq_input(ctx);
// TODO @ngxson : implement batch size > 1 as a loop
// we don't need true batching support because the cgraph is going to be big anyway
if (batch_size != 1) {
if (batch_size != 1 && !support_seq) {
return false; // only support batch size of 1
}
@@ -3117,6 +3121,8 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
// └─────┘ │
// ──────┘ x B
// IMPORTANT: [QWEN_VIDEO] the batch dim is currently used for temporal dim in Qwen-VL models
for (size_t i = 0; i < imgs.entries.size(); i++) {
const int nx = imgs.entries[i]->nx;
const int ny = imgs.entries[i]->ny;
@@ -3747,6 +3753,17 @@ bool clip_has_whisper_encoder(const struct clip_ctx * ctx) {
}
}
// true if the model's graph accepts a temporal (sequence-of-frames) input dim;
// currently only the Qwen-VL family supports this
bool clip_model_supports_seq_input(const struct clip_ctx * ctx) {
    const auto proj = ctx->proj_type();
    return proj == PROJECTOR_TYPE_QWEN2VL
        || proj == PROJECTOR_TYPE_QWEN25VL
        || proj == PROJECTOR_TYPE_QWEN3VL;
}
bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
clip_image_f32 clip_img;
clip_img.buf.resize(h * w * 3);

View File

@@ -116,3 +116,6 @@ void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel
bool clip_has_vision_encoder(const struct clip_ctx * ctx);
bool clip_has_audio_encoder(const struct clip_ctx * ctx);
bool clip_has_whisper_encoder(const struct clip_ctx * ctx);
// true if model graph support image->nt (temporal dimension) as input
bool clip_model_supports_seq_input(const struct clip_ctx * ctx);

View File

@@ -26,10 +26,11 @@ struct clip_graph_pixtral : clip_graph {
struct clip_graph_qwen2vl : clip_graph {
clip_graph_qwen2vl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
ggml_tensor * build_inp_with_temporal_merge();
};
struct clip_graph_qwen3vl : clip_graph {
clip_graph_qwen3vl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
struct clip_graph_qwen3vl : clip_graph_qwen2vl {
clip_graph_qwen3vl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph_qwen2vl(ctx, img) {}
ggml_cgraph * build() override;
};

View File

@@ -1,5 +1,31 @@
#include "models.h"
// Build the patch-embedding input for Qwen-VL, merging the temporal dimension.
// - nt == 1 (still image): both patch-embedding convs run over the same frame.
// - nt == 2 (video pair):  conv 0 runs over frame 0, conv 1 over frame 1, and
//   the results are summed — Qwen-VL's pairwise temporal merge.
// Returns the merged conv output; aborts for nt > 2 (not supported yet).
ggml_tensor * clip_graph_qwen2vl::build_inp_with_temporal_merge() {
    ggml_tensor * inp_raw = build_inp_raw();

    // patches are merged 2x2 spatially, so each dim must divide by 2*patch_size
    GGML_ASSERT(img.nx % (patch_size * 2) == 0);
    GGML_ASSERT(img.ny % (patch_size * 2) == 0);

    if (nt == 1) {
        // still image input
        return ggml_add(ctx0,
            ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1),
            ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1));
    }

    if (nt == 2) {
        // 2 frames input (video input)
        // strides are only needed here, so compute them in this branch
        const size_t nb1 = ggml_row_size(inp_raw->type, img.nx);
        const size_t nb2 = nb1 * img.ny;
        // frame f lives at byte offset f * (3 channel planes) in inp_raw
        ggml_tensor * inp_0 = ggml_view_3d(ctx0, inp_raw, img.nx, img.ny, 3, nb1, nb2, 0);
        ggml_tensor * inp_1 = ggml_view_3d(ctx0, inp_raw, img.nx, img.ny, 3, nb1, nb2, nb2 * 3);
        return ggml_add(ctx0,
            ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_0, patch_size, patch_size, 0, 0, 1, 1),
            ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_1, patch_size, patch_size, 0, 0, 1, 1));
    }

    // consistent with the rest of the file (see clip_image_build_graph)
    GGML_ABORT("nt > 2 is not supported");
}
ggml_cgraph * clip_graph_qwen2vl::build() {
GGML_ASSERT(model.patch_bias == nullptr);
GGML_ASSERT(model.class_embedding == nullptr);
@@ -16,17 +42,10 @@ ggml_cgraph * clip_graph_qwen2vl::build() {
int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
ggml_tensor * inp_raw = build_inp_raw();
ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
GGML_ASSERT(img.nx % (patch_size * 2) == 0);
GGML_ASSERT(img.ny % (patch_size * 2) == 0);
ggml_tensor * inp = build_inp_with_temporal_merge();
// second conv dimension
{
auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
inp = ggml_add(ctx0, inp, inp_1);
inp = ggml_permute(ctx0, inp, 1, 2, 0, 3); // [w, h, c, b] -> [c, w, h, b]
inp = ggml_cont_4d(
ctx0, inp,

View File

@@ -13,17 +13,10 @@ ggml_cgraph * clip_graph_qwen3vl::build() {
int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
ggml_tensor * inp_raw = build_inp_raw();
ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
ggml_tensor * inp = build_inp_with_temporal_merge();
GGML_ASSERT(img.nx % (patch_size * 2) == 0);
GGML_ASSERT(img.ny % (patch_size * 2) == 0);
// second conv dimension
// spatial merge
{
auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
inp = ggml_add(ctx0, inp, inp_1);
inp = ggml_permute(ctx0, inp, 1, 2, 0, 3); // [w, h, c, b] -> [c, w, h, b]
inp = ggml_cont_4d(
ctx0, inp,

View File

@@ -25,9 +25,11 @@
// represents raw image data, layout is RGBRGBRGB...
// length of data must be nx * ny * 3
// for sequence of images (i.e. video): data is nt sequential RGB frames, each nx * ny * 3 bytes
struct mtmd_bitmap {
uint32_t nx;
uint32_t ny;
uint32_t nt = 1; // 1 for single images, >= 2 (even) for sequence
std::vector<unsigned char> data;
std::string id; // optional user-defined id, for ex: can be set to image hash, useful for KV cache tracking
bool is_audio = false; // true if the bitmap is audio
@@ -37,8 +39,8 @@ struct mtmd_image_tokens {
uint32_t nx; // number of tokens in x direction
uint32_t ny; // number of tokens in y direction
bool use_mrope_pos = false; // use M-RoPE position counting (the whole image is 1 temporal position)
uint32_t n_tokens() const { return nx * ny; }
clip_image_f32_batch batch_f32; // preprocessed image patches
uint32_t n_tokens() const { return nx * ny; } // TODO [QWEN_VIDEO]: we don't count nt here to be compatible with Qwen-VL, but other models in the future might have different logic
std::string id; // optional user-defined ID, useful for KV cache tracking
mtmd_image_tokens clone() {
@@ -875,6 +877,73 @@ struct mtmd_tokenizer {
return 0;
}
// Tokenize a multi-frame bitmap (video): preprocess each frame separately,
// pack all frames into a single f32 batch (the batch dim doubles as the
// temporal dim for Qwen-VL — see clip_image_batch_encode), and append the
// resulting image chunk. Returns 0 on success, 2 on failure.
int32_t add_seq_image(const mtmd_bitmap * bitmap) {
GGML_ASSERT(ctx->ctx_v);
GGML_ASSERT(bitmap->nt > 1);
// TODO [QWEN_VIDEO]: we only support even frames (Qwen-VL style) for now
GGML_ASSERT(bitmap->nt % 2 == 0);
bool support_seq = clip_model_supports_seq_input(ctx->ctx_v);
if (!support_seq) {
LOG_ERR("%s: error: model does not support sequential image input (usually requires Qwen-VL style models)\n", __func__);
return 2;
}
const uint32_t n_frames = bitmap->nt;
// byte size of one RGB frame (nx * ny * 3 channels)
const size_t frame_bytes = (size_t)bitmap->nx * bitmap->ny * 3;
// preprocess each frame individually
clip_image_f32_batch all_frames;
all_frames.is_seq = true;
all_frames.grid_x = 0; // currently, we don't support tiling for video input
all_frames.grid_y = 0; // currently, we don't support tiling for video input
for (uint32_t f = 0; f < n_frames; f++) {
// copy frame f out of the packed bitmap data into a standalone u8 image
clip_image_u8_ptr img_u8(clip_image_u8_init());
img_u8->nx = bitmap->nx;
img_u8->ny = bitmap->ny;
img_u8->buf.resize(frame_bytes);
std::memcpy(img_u8->buf.data(), bitmap->data.data() + f * frame_bytes, frame_bytes);
clip_image_f32_batch frame_batch;
bool ok = ctx->image_preproc->preprocess(*img_u8, frame_batch);
if (!ok) {
LOG_ERR("Unable to preprocess image\n");
return 2;
}
// no tiling for video input, so each frame preprocesses to exactly one entry
GGML_ASSERT(frame_batch.entries.size() == 1);
all_frames.entries.push_back(std::move(frame_batch.entries[0]));
}
mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
if (mtmd_decode_use_mrope(ctx)) {
// for Qwen2VL, we need this information for M-RoPE decoding positions
// all frames share nx/ny, so the first frame's token grid is representative
image_tokens->nx = clip_n_output_tokens_x(ctx->ctx_v, all_frames.entries[0].get());
image_tokens->ny = clip_n_output_tokens_y(ctx->ctx_v, all_frames.entries[0].get());
image_tokens->use_mrope_pos = true;
} else {
GGML_ASSERT(false && "not supported");
}
image_tokens->batch_f32 = std::move(all_frames);
image_tokens->id = bitmap->id; // optional
LOG_DBG("seq_image: nt=%u, nx=%u, ny=%u, n_tokens=%u\n",
bitmap->nt, image_tokens->nx, image_tokens->ny, image_tokens->n_tokens());
mtmd_input_chunk chunk{
MTMD_INPUT_CHUNK_TYPE_IMAGE,
{}, // text tokens
std::move(image_tokens),
nullptr, // audio tokens
};
cur.entries.emplace_back(std::move(chunk));
// NOTE(review): only img_end is appended here; presumably img_beg was added
// by the caller before the image data — confirm against add_image()
if (!ctx->img_end.empty()) {
add_text(ctx->img_end, true);
}
return 0;
}
std::vector<mtmd_input_chunk> split_batch_to_chunk(clip_image_f32_batch && batch_f32, const std::string & id) {
std::vector<mtmd_input_chunk> chunks;
@@ -993,6 +1062,7 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens)
|| clip_is_glm(ctx_clip)
|| proj_type == PROJECTOR_TYPE_INTERNVL) {
// TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
// video: each entry is one frame pair, encoded with per-frame attention
const auto & entries = image_tokens->batch_f32.entries;
for (size_t i = 0; i < entries.size(); i++) {
int n_tokens_per_image = clip_n_output_tokens(ctx_clip, entries[i].get());
@@ -1075,17 +1145,54 @@ mtmd_bitmap * mtmd_bitmap_init(uint32_t nx,
mtmd_bitmap * bitmap = new mtmd_bitmap;
bitmap->nx = nx;
bitmap->ny = ny;
bitmap->nt = 1;
size_t data_size = (size_t)nx * ny * 3;
bitmap->data.resize(data_size);
std::memcpy(bitmap->data.data(), data, data_size);
return bitmap;
}
// Create a bitmap holding a sequence of nt RGB frames (i.e. video input).
// `data` must contain nt frames of nx*ny*3 bytes each, stored sequentially.
// Returns nullptr if nt == 0; falls back to a single image if nt == 1.
mtmd_bitmap * mtmd_bitmap_init_from_seq(uint32_t nx,
                                        uint32_t ny,
                                        uint32_t nt,
                                        const unsigned char * data) {
    if (nt == 0) {
        LOG_ERR("%s: error: nt must be greater than 0 for sequence input\n", __func__);
        return nullptr;
    }
    if (nt == 1) {
        // if nt == 1, it's not really a sequence, we can treat it as a single image
        return mtmd_bitmap_init(nx, ny, data);
    }
    // TODO [QWEN_VIDEO]: we only support Qwen-VL style for now, which requires even number of frames
    // therefore, we duplicate the last frame if nt is odd, to avoid issues in video preprocessing
    const bool is_odd = (nt % 2 == 1);
    const uint32_t n_src_frames = nt; // frames actually present in the caller's buffer
    if (is_odd) {
        nt += 1;
    }
    const size_t frame_size = (size_t)nx * ny * 3;
    mtmd_bitmap * bitmap = new mtmd_bitmap;
    bitmap->nx = nx;
    bitmap->ny = ny;
    bitmap->nt = nt;
    bitmap->data.resize(frame_size * nt);
    // copy only the frames the caller provided; copying frame_size * nt bytes
    // here would over-read the source buffer by one frame when nt was odd
    std::memcpy(bitmap->data.data(), data, frame_size * n_src_frames);
    if (is_odd) {
        // duplicate the last provided frame into the padding slot
        std::memcpy(bitmap->data.data() + (size_t)(nt - 1) * frame_size,
                    data + (size_t)(n_src_frames - 1) * frame_size,
                    frame_size);
    }
    return bitmap;
}
mtmd_bitmap * mtmd_bitmap_init_from_audio(size_t n_samples,
const float * data) {
mtmd_bitmap * bitmap = new mtmd_bitmap;
bitmap->nx = n_samples;
bitmap->ny = 1;
bitmap->nt = 1;
bitmap->is_audio = true;
size_t data_size = n_samples * sizeof(float);
bitmap->data.resize(data_size);
@@ -1101,6 +1208,10 @@ uint32_t mtmd_bitmap_get_ny(const mtmd_bitmap * bitmap) {
return bitmap->ny;
}
// number of frames in the bitmap: 1 for a still image, >= 2 for a sequence
uint32_t mtmd_bitmap_get_nt(const mtmd_bitmap * bitmap) {
return bitmap->nt;
}
// raw pixel/sample data; layout depends on the bitmap kind (see mtmd_bitmap)
const unsigned char * mtmd_bitmap_get_data(const mtmd_bitmap * bitmap) {
return bitmap->data.data();
}
@@ -1113,6 +1224,10 @@ bool mtmd_bitmap_is_audio(const mtmd_bitmap * bitmap) {
return bitmap->is_audio;
}
// true if the bitmap holds a frame sequence (video), i.e. more than one frame
bool mtmd_bitmap_is_seq(const mtmd_bitmap * bitmap) {
return bitmap->nt >= 2;
}
// optional user-defined id (e.g. content hash), empty string if unset
const char * mtmd_bitmap_get_id(const mtmd_bitmap * bitmap) {
return bitmap->id.c_str();
}
@@ -1255,8 +1370,8 @@ const char * mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens) {
llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens) {
if (image_tokens->use_mrope_pos) {
// for M-RoPE, temporal dimension = max(t,h,w)
// t is omitted as we don't support video input
// for M-RoPE, n_pos = max(t, h, w)
// t is omitted as we don't support batching
return std::max(image_tokens->nx, image_tokens->ny);
}
return image_tokens->n_tokens();

View File

@@ -135,16 +135,23 @@ MTMD_API int mtmd_get_audio_sample_rate(mtmd_context * ctx);
// if bitmap is image:
// length of data must be nx * ny * 3
// the data is in RGBRGBRGB... format
// if bitmap is sequence of images (i.e. video):
// nt is the number of frames
// length of data must be nx * ny * 3 * nt
// frames are sequential RGB, each nx * ny * 3 bytes
// if bitmap is audio:
// length of data must be n_samples * sizeof(float)
// the data is in float format (PCM F32)
MTMD_API mtmd_bitmap * mtmd_bitmap_init (uint32_t nx, uint32_t ny, const unsigned char * data);
MTMD_API mtmd_bitmap * mtmd_bitmap_init_from_seq (uint32_t nx, uint32_t ny, uint32_t nt, const unsigned char * data);
MTMD_API mtmd_bitmap * mtmd_bitmap_init_from_audio(size_t n_samples, const float * data);
MTMD_API uint32_t mtmd_bitmap_get_nx (const mtmd_bitmap * bitmap);
MTMD_API uint32_t mtmd_bitmap_get_ny (const mtmd_bitmap * bitmap);
MTMD_API uint32_t mtmd_bitmap_get_nt (const mtmd_bitmap * bitmap);
MTMD_API const unsigned char * mtmd_bitmap_get_data (const mtmd_bitmap * bitmap);
MTMD_API size_t mtmd_bitmap_get_n_bytes(const mtmd_bitmap * bitmap);
MTMD_API bool mtmd_bitmap_is_audio (const mtmd_bitmap * bitmap);
MTMD_API bool mtmd_bitmap_is_seq (const mtmd_bitmap * bitmap);
MTMD_API void mtmd_bitmap_free (mtmd_bitmap * bitmap);
// bitmap ID is optional, but useful for KV cache tracking
// these getters/setters are dedicated functions, so you can for example calculate the hash of the image based on mtmd_bitmap_get_data()
@@ -277,9 +284,14 @@ struct bitmap {
bitmap(uint32_t nx, uint32_t ny, const unsigned char * data) {
ptr.reset(mtmd_bitmap_init(nx, ny, data));
}
bitmap(uint32_t nx, uint32_t ny, uint32_t nt, const unsigned char * data) {
ptr.reset(mtmd_bitmap_init_from_seq(nx, ny, nt, data));
}
~bitmap() = default;
uint32_t nx() const { return mtmd_bitmap_get_nx(ptr.get()); }
uint32_t ny() const { return mtmd_bitmap_get_ny(ptr.get()); }
uint32_t nx() const { return mtmd_bitmap_get_nx(ptr.get()); }
uint32_t ny() const { return mtmd_bitmap_get_ny(ptr.get()); }
uint32_t nt() const { return mtmd_bitmap_get_nt(ptr.get()); }
bool is_seq() const { return mtmd_bitmap_is_seq(ptr.get()); }
const unsigned char * data() const { return mtmd_bitmap_get_data(ptr.get()); }
size_t n_bytes() const { return mtmd_bitmap_get_n_bytes(ptr.get()); }
std::string id() const { return mtmd_bitmap_get_id(ptr.get()); }