diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 3d6cf6fd84..ceb4326cbe 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -3615,9 +3615,17 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima // set input pixel values if (!imgs.is_audio) { + // detect number of channels from the buffer size + const int nx = imgs.entries[0]->nx; + const int ny = imgs.entries[0]->ny; + const int n = nx * ny; + const size_t buf_size = imgs.entries[0]->buf.size(); + const int n_channels = (int)(buf_size / n); + GGML_ASSERT(n_channels == 3 || n_channels == 6); + size_t nelem = 0; for (const auto & img : imgs.entries) { - nelem += img->nx * img->ny * 3; + nelem += img->nx * img->ny * n_channels; } std::vector<float> inp_raw(nelem); @@ -3631,21 +3639,21 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima // │ H │ channel = B // └─────┘ │ // ──────┘ x B + // + // for 6-channel video input, same layout but with 6 planar channels - for (size_t i = 0; i < imgs.entries.size(); i++) { - const int nx = imgs.entries[i]->nx; - const int ny = imgs.entries[i]->ny; - const int n = nx * ny; + for (int b = 0; b < batch_size; b++) { + const int cur_nx = imgs.entries[b]->nx; + const int cur_ny = imgs.entries[b]->ny; + const int cur_n = cur_nx * cur_ny; - for (int b = 0; b < batch_size; b++) { - float * batch_entry = inp_raw.data() + b * (3*n); - for (int y = 0; y < ny; y++) { - for (int x = 0; x < nx; x++) { - size_t base_src = 3*(y * nx + x); // idx of the first channel - size_t base_dst = y * nx + x; // idx of the first channel - batch_entry[ base_dst] = imgs.entries[b]->buf[base_src ]; - batch_entry[1*n + base_dst] = imgs.entries[b]->buf[base_src + 1]; - batch_entry[2*n + base_dst] = imgs.entries[b]->buf[base_src + 2]; + float * batch_entry = inp_raw.data() + b * (n_channels * cur_n); + for (int y = 0; y < cur_ny; y++) { + for (int x = 0; x < cur_nx; x++) { + size_t base_src = n_channels * (y * cur_nx + x); + size_t 
base_dst = y * cur_nx + x; + for (int c = 0; c < n_channels; c++) { + batch_entry[c * cur_n + base_dst] = imgs.entries[b]->buf[base_src + c]; } } } diff --git a/tools/mtmd/models/qwen3vl.cpp b/tools/mtmd/models/qwen3vl.cpp index 5ecb10fe43..0669e5af14 100644 --- a/tools/mtmd/models/qwen3vl.cpp +++ b/tools/mtmd/models/qwen3vl.cpp @@ -13,16 +13,34 @@ ggml_cgraph * clip_graph_qwen3vl::build() { int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4}; - ggml_tensor * inp_raw = build_inp_raw(); - ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1); + // detect video: 6-channel input means interleaved frame pairs (even_rgb + odd_rgb) + // for images (3ch), both Conv2Ds receive the same input (original behavior) + // for video (6ch), Conv2D_0 gets even frames (ch 0-2), Conv2D_1 gets odd frames (ch 3-5) + const bool is_video = (img.buf.size() == (size_t)img.nx * img.ny * 6); + const int n_channels = is_video ? 6 : 3; + + ggml_tensor * inp_raw = build_inp_raw(n_channels); + + ggml_tensor * inp; + if (is_video) { + const size_t nb1 = ggml_row_size(inp_raw->type, img.nx); + const size_t nb2 = nb1 * img.ny; + ggml_tensor * inp_even = ggml_view_3d(ctx0, inp_raw, img.nx, img.ny, 3, nb1, nb2, 0); + ggml_tensor * inp_odd = ggml_view_3d(ctx0, inp_raw, img.nx, img.ny, 3, nb1, nb2, nb2 * 3); + inp = ggml_add(ctx0, + ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_even, patch_size, patch_size, 0, 0, 1, 1), + ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_odd, patch_size, patch_size, 0, 0, 1, 1)); + } else { + inp = ggml_add(ctx0, + ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1), + ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1)); + } GGML_ASSERT(img.nx % (patch_size * 2) == 0); GGML_ASSERT(img.ny % (patch_size * 2) == 0); - // second conv dimension + // spatial merge { - auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, 
inp_raw, patch_size, patch_size, 0, 0, 1, 1); - inp = ggml_add(ctx0, inp, inp_1); inp = ggml_permute(ctx0, inp, 1, 2, 0, 3); // [w, h, c, b] -> [c, w, h, b] inp = ggml_cont_4d( diff --git a/tools/mtmd/mtmd-helper.cpp b/tools/mtmd/mtmd-helper.cpp index 5bcb7ec1bc..f2e50d005c 100644 --- a/tools/mtmd/mtmd-helper.cpp +++ b/tools/mtmd/mtmd-helper.cpp @@ -174,6 +174,28 @@ struct decode_embd_batch { } } + // M-RoPE for video: 3D positions [temporal, height, width] + void set_position_mrope_3d(llama_pos pos_0, int nx, int ny, int nt, llama_seq_id seq_id) { + GGML_ASSERT(n_pos_per_embd == 4); + seq_id_0[0] = seq_id; + for (int t = 0; t < nt; t++) { + for (int y = 0; y < ny; y++) { + for (int x = 0; x < nx; x++) { + int i = t * ny * nx + y * nx + x; + pos[i ] = pos_0 + t; + pos[i + batch.n_tokens ] = pos_0 + y; + pos[i + batch.n_tokens * 2] = pos_0 + x; + pos[i + batch.n_tokens * 3] = 0; + } + } + } + for (int i = 0; i < batch.n_tokens; i++) { + batch.n_seq_id[i] = 1; + batch.seq_id [i] = seq_id_0.data(); + batch.logits [i] = false; + } + } + // M-RoPE for audio void set_position_mrope_1d(llama_pos pos_0, llama_seq_id seq_id) { GGML_ASSERT(n_pos_per_embd == 4); @@ -260,7 +282,12 @@ int32_t mtmd_helper_decode_image_chunk( } const int nx = mtmd_image_tokens_get_nx(image_tokens); const int ny = mtmd_image_tokens_get_ny(image_tokens); - batch_embd.set_position_mrope_2d(n_past, nx, ny, seq_id); + const int nt = mtmd_image_tokens_get_nt(image_tokens); + if (nt > 1) { + batch_embd.set_position_mrope_3d(n_past, nx, ny, nt, seq_id); + } else { + batch_embd.set_position_mrope_2d(n_past, nx, ny, seq_id); + } } else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_AUDIO) { batch_embd.set_position_mrope_1d(n_past, seq_id); } else { diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index f66c07345e..fc76d5d175 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -24,9 +24,11 @@ // represents raw image data, layout is RGBRGBRGB... 
// length of data must be nx * ny * 3 +// for video: data is n_frames sequential RGB frames, each nx * ny * 3 bytes struct mtmd_bitmap { uint32_t nx; uint32_t ny; + uint32_t n_frames = 0; // 0 for single images, >= 2 (even) for video std::vector<unsigned char> data; std::string id; // optional user-defined id, for ex: can be set to image hash, useful for KV cache tracking bool is_audio = false; // true if the bitmap is audio @@ -35,8 +37,9 @@ struct mtmd_bitmap { struct mtmd_image_tokens { uint32_t nx; // number of tokens in x direction uint32_t ny; // number of tokens in y direction + uint32_t nt = 1; // number of temporal positions (1 for images, > 1 for video) bool use_mrope_pos = false; // use M-RoPE position counting (the whole image is 1 temporal position) - uint32_t n_tokens() const { return nx * ny; } + uint32_t n_tokens() const { return nt * nx * ny; } clip_image_f32_batch batch_f32; // preprocessed image patches std::string id; // optional user-defined ID, useful for KV cache tracking @@ -44,6 +47,7 @@ struct mtmd_image_tokens { return mtmd_image_tokens{ nx, ny, + nt, use_mrope_pos, batch_f32.clone(), id @@ -553,6 +557,10 @@ struct mtmd_tokenizer { } int32_t add_media(const mtmd_bitmap * bitmap) { + if (bitmap->n_frames >= 2) { + return add_video(bitmap); + } + if (!bitmap->is_audio) { // handle image @@ -743,6 +751,102 @@ struct mtmd_tokenizer { return 0; } + // preprocess video frames and create an image chunk with temporal dimension + // frames are paired (even+odd), each pair becomes one 6-channel image + // each pair is encoded independently through the ViT (per-frame attention) + int32_t add_video(const mtmd_bitmap * bitmap) { + if (!ctx->ctx_v) { + LOG_ERR("%s: error: model does not support vision input\n", __func__); + return 2; + } + + const uint32_t n_frames = bitmap->n_frames; + const uint32_t n_pairs = n_frames / 2; + const size_t frame_bytes = (size_t)bitmap->nx * bitmap->ny * 3; + + if (!ctx->img_beg.empty()) { + add_text(ctx->img_beg, true); + } + + // 
preprocess each frame individually + clip_image_f32_batch all_frames; + for (uint32_t f = 0; f < n_frames; f++) { + clip_image_u8_ptr img_u8(clip_image_u8_init()); + img_u8->nx = bitmap->nx; + img_u8->ny = bitmap->ny; + img_u8->buf.resize(frame_bytes); + std::memcpy(img_u8->buf.data(), bitmap->data.data() + f * frame_bytes, frame_bytes); + + clip_image_f32_batch frame_batch; + bool ok = clip_image_preprocess(ctx->ctx_v, img_u8.get(), &frame_batch); + if (!ok) { + LOG_ERR("Unable to preprocess video frame %u\n", f); + return 2; + } + GGML_ASSERT(frame_batch.entries.size() == 1); + all_frames.entries.push_back(std::move(frame_batch.entries[0])); + } + + const int frame_nx = all_frames.entries[0]->nx; + const int frame_ny = all_frames.entries[0]->ny; + const int n_pixels = frame_nx * frame_ny; + + // interleave frame pairs into 6-channel images (even_rgb + odd_rgb) + // each pair is a separate batch entry, encoded independently + clip_image_f32_batch pair_batch; + for (uint32_t p = 0; p < n_pairs; p++) { + const auto & even = all_frames.entries[p * 2]; + const auto & odd = all_frames.entries[p * 2 + 1]; + GGML_ASSERT(even->nx == frame_nx && even->ny == frame_ny); + GGML_ASSERT(odd->nx == frame_nx && odd->ny == frame_ny); + + clip_image_f32_ptr pair(clip_image_f32_init()); + pair->nx = frame_nx; + pair->ny = frame_ny; + pair->buf.resize((size_t)n_pixels * 6); + + for (int i = 0; i < n_pixels; i++) { + const int dst = i * 6; + const int src = i * 3; + pair->buf[dst + 0] = even->buf[src + 0]; + pair->buf[dst + 1] = even->buf[src + 1]; + pair->buf[dst + 2] = even->buf[src + 2]; + pair->buf[dst + 3] = odd->buf[src + 0]; + pair->buf[dst + 4] = odd->buf[src + 1]; + pair->buf[dst + 5] = odd->buf[src + 2]; + } + pair_batch.entries.push_back(std::move(pair)); + } + + const uint32_t tokens_x = clip_n_output_tokens_x(ctx->ctx_v, pair_batch.entries[0].get()); + const uint32_t tokens_y = clip_n_output_tokens_y(ctx->ctx_v, pair_batch.entries[0].get()); + + mtmd_image_tokens_ptr 
image_tokens(new mtmd_image_tokens); + image_tokens->nx = tokens_x; + image_tokens->ny = tokens_y; + image_tokens->nt = n_pairs; + image_tokens->use_mrope_pos = true; + image_tokens->batch_f32 = std::move(pair_batch); + image_tokens->id = bitmap->id; + + LOG_DBG("video: nt=%u, nx=%u, ny=%u, n_tokens=%u\n", + image_tokens->nt, image_tokens->nx, image_tokens->ny, image_tokens->n_tokens()); + + mtmd_input_chunk chunk{ + MTMD_INPUT_CHUNK_TYPE_IMAGE, + {}, // text tokens + std::move(image_tokens), + nullptr, // audio tokens + }; + cur.entries.emplace_back(std::move(chunk)); + + if (!ctx->img_end.empty()) { + add_text(ctx->img_end, true); + } + + return 0; + } + std::vector<mtmd_input_chunk> split_batch_to_chunk(clip_image_f32_batch && batch_f32, const std::string & id) { std::vector<mtmd_input_chunk> chunks; @@ -855,10 +959,13 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd); bool ok = false; - if (clip_is_llava(ctx_clip) + if (image_tokens->nt > 1 + || clip_is_llava(ctx_clip) || clip_is_minicpmv(ctx_clip) || clip_is_glm(ctx_clip)) { - // TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode() + // encode each batch entry independently + // video: each entry is one frame pair, encoded with per-frame attention + // llava/minicpmv/glm: does not support batched encoding const auto & entries = image_tokens->batch_f32.entries; for (size_t i = 0; i < entries.size(); i++) { int n_tokens_per_image = clip_n_output_tokens(ctx_clip, entries[i].get()); @@ -938,6 +1045,21 @@ mtmd_bitmap * mtmd_bitmap_init(uint32_t nx, return bitmap; } +mtmd_bitmap * mtmd_bitmap_init_from_video(uint32_t nx, + uint32_t ny, + uint32_t n_frames, + const unsigned char * data) { + GGML_ASSERT(n_frames >= 2 && n_frames % 2 == 0); + mtmd_bitmap * bitmap = new mtmd_bitmap; + bitmap->nx = nx; + bitmap->ny = ny; + bitmap->n_frames = n_frames; + size_t data_size = (size_t)nx * ny * 3 * 
n_frames; + bitmap->data.resize(data_size); + std::memcpy(bitmap->data.data(), data, data_size); + return bitmap; +} + mtmd_bitmap * mtmd_bitmap_init_from_audio(size_t n_samples, const float * data) { mtmd_bitmap * bitmap = new mtmd_bitmap; @@ -970,6 +1092,14 @@ bool mtmd_bitmap_is_audio(const mtmd_bitmap * bitmap) { return bitmap->is_audio; } +bool mtmd_bitmap_is_video(const mtmd_bitmap * bitmap) { + return bitmap->n_frames >= 2; +} + +uint32_t mtmd_bitmap_get_n_frames(const mtmd_bitmap * bitmap) { + return bitmap->n_frames; +} + const char * mtmd_bitmap_get_id(const mtmd_bitmap * bitmap) { return bitmap->id.c_str(); } @@ -1106,15 +1236,18 @@ size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens) { return image_tokens->ny; } +size_t mtmd_image_tokens_get_nt(const mtmd_image_tokens * image_tokens) { + return image_tokens->nt; +} + const char * mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens) { return image_tokens->id.c_str(); } llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens) { if (image_tokens->use_mrope_pos) { - // for M-RoPE, temporal dimension = max(t,h,w) - // t is omitted as we don't support video input - return std::max(image_tokens->nx, image_tokens->ny); + // for M-RoPE, n_pos = max(t, h, w) + return (llama_pos)std::max({image_tokens->nt, image_tokens->nx, image_tokens->ny}); } return image_tokens->n_tokens(); } diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h index ebb4a18fb3..a8592e7d20 100644 --- a/tools/mtmd/mtmd.h +++ b/tools/mtmd/mtmd.h @@ -134,17 +134,24 @@ MTMD_API int mtmd_get_audio_sample_rate(mtmd_context * ctx); // if bitmap is image: // length of data must be nx * ny * 3 // the data is in RGBRGBRGB... 
format +// if bitmap is video: +// length of data must be nx * ny * 3 * n_frames +// n_frames must be >= 2 and even +// frames are sequential RGB, each nx * ny * 3 bytes // if bitmap is audio: // length of data must be n_samples * sizeof(float) // the data is in float format (PCM F32) MTMD_API mtmd_bitmap * mtmd_bitmap_init (uint32_t nx, uint32_t ny, const unsigned char * data); +MTMD_API mtmd_bitmap * mtmd_bitmap_init_from_video(uint32_t nx, uint32_t ny, uint32_t n_frames, const unsigned char * data); MTMD_API mtmd_bitmap * mtmd_bitmap_init_from_audio(size_t n_samples, const float * data); -MTMD_API uint32_t mtmd_bitmap_get_nx (const mtmd_bitmap * bitmap); -MTMD_API uint32_t mtmd_bitmap_get_ny (const mtmd_bitmap * bitmap); -MTMD_API const unsigned char * mtmd_bitmap_get_data (const mtmd_bitmap * bitmap); -MTMD_API size_t mtmd_bitmap_get_n_bytes(const mtmd_bitmap * bitmap); -MTMD_API bool mtmd_bitmap_is_audio (const mtmd_bitmap * bitmap); -MTMD_API void mtmd_bitmap_free (mtmd_bitmap * bitmap); +MTMD_API uint32_t mtmd_bitmap_get_nx (const mtmd_bitmap * bitmap); +MTMD_API uint32_t mtmd_bitmap_get_ny (const mtmd_bitmap * bitmap); +MTMD_API const unsigned char * mtmd_bitmap_get_data (const mtmd_bitmap * bitmap); +MTMD_API size_t mtmd_bitmap_get_n_bytes (const mtmd_bitmap * bitmap); +MTMD_API bool mtmd_bitmap_is_audio (const mtmd_bitmap * bitmap); +MTMD_API bool mtmd_bitmap_is_video (const mtmd_bitmap * bitmap); +MTMD_API uint32_t mtmd_bitmap_get_n_frames(const mtmd_bitmap * bitmap); +MTMD_API void mtmd_bitmap_free (mtmd_bitmap * bitmap); // bitmap ID is optional, but useful for KV cache tracking // these getters/setters are dedicated functions, so you can for example calculate the hash of the image based on mtmd_bitmap_get_data() MTMD_API const char * mtmd_bitmap_get_id(const mtmd_bitmap * bitmap); @@ -187,6 +194,7 @@ MTMD_API void mtmd_input_chunk_free(mtmd_input_chunk * chunk); MTMD_API size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens); // 
TODO: deprecate MTMD_API size_t mtmd_image_tokens_get_nx (const mtmd_image_tokens * image_tokens); MTMD_API size_t mtmd_image_tokens_get_ny (const mtmd_image_tokens * image_tokens); +MTMD_API size_t mtmd_image_tokens_get_nt (const mtmd_image_tokens * image_tokens); MTMD_API const char * mtmd_image_tokens_get_id (const mtmd_image_tokens * image_tokens); // TODO: deprecate // number of temporal positions (equals to max(t,h,w) for M-RoPE; equals to n_tokens otherwise) MTMD_API llama_pos mtmd_image_tokens_get_n_pos (const mtmd_image_tokens * image_tokens); // TODO: deprecate @@ -276,9 +284,14 @@ struct bitmap { bitmap(uint32_t nx, uint32_t ny, const unsigned char * data) { ptr.reset(mtmd_bitmap_init(nx, ny, data)); } + bitmap(uint32_t nx, uint32_t ny, uint32_t n_frames, const unsigned char * data) { + ptr.reset(mtmd_bitmap_init_from_video(nx, ny, n_frames, data)); + } ~bitmap() = default; - uint32_t nx() const { return mtmd_bitmap_get_nx(ptr.get()); } - uint32_t ny() const { return mtmd_bitmap_get_ny(ptr.get()); } + uint32_t nx() const { return mtmd_bitmap_get_nx(ptr.get()); } + uint32_t ny() const { return mtmd_bitmap_get_ny(ptr.get()); } + uint32_t n_frames() const { return mtmd_bitmap_get_n_frames(ptr.get()); } + bool is_video() const { return mtmd_bitmap_is_video(ptr.get()); } const unsigned char * data() const { return mtmd_bitmap_get_data(ptr.get()); } size_t n_bytes() const { return mtmd_bitmap_get_n_bytes(ptr.get()); } std::string id() const { return mtmd_bitmap_get_id(ptr.get()); }