This commit is contained in:
Andrew Sampson 2026-03-16 02:08:16 +08:00 committed by GitHub
commit bb941c5269
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 233 additions and 34 deletions

View File

@ -3615,9 +3615,17 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
// set input pixel values
if (!imgs.is_audio) {
// detect number of channels from the buffer size
const int nx = imgs.entries[0]->nx;
const int ny = imgs.entries[0]->ny;
const int n = nx * ny;
const size_t buf_size = imgs.entries[0]->buf.size();
const int n_channels = (int)(buf_size / n);
GGML_ASSERT(n_channels == 3 || n_channels == 6);
size_t nelem = 0;
for (const auto & img : imgs.entries) {
nelem += img->nx * img->ny * 3;
nelem += img->nx * img->ny * n_channels;
}
std::vector<float> inp_raw(nelem);
@ -3631,21 +3639,21 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
// │ H │ channel = B
// └─────┘ │
// ──────┘ x B
//
// for 6-channel video input, same layout but with 6 planar channels
for (size_t i = 0; i < imgs.entries.size(); i++) {
const int nx = imgs.entries[i]->nx;
const int ny = imgs.entries[i]->ny;
const int n = nx * ny;
for (int b = 0; b < batch_size; b++) {
const int cur_nx = imgs.entries[b]->nx;
const int cur_ny = imgs.entries[b]->ny;
const int cur_n = cur_nx * cur_ny;
for (int b = 0; b < batch_size; b++) {
float * batch_entry = inp_raw.data() + b * (3*n);
for (int y = 0; y < ny; y++) {
for (int x = 0; x < nx; x++) {
size_t base_src = 3*(y * nx + x); // idx of the first channel
size_t base_dst = y * nx + x; // idx of the first channel
batch_entry[ base_dst] = imgs.entries[b]->buf[base_src ];
batch_entry[1*n + base_dst] = imgs.entries[b]->buf[base_src + 1];
batch_entry[2*n + base_dst] = imgs.entries[b]->buf[base_src + 2];
float * batch_entry = inp_raw.data() + b * (n_channels * cur_n);
for (int y = 0; y < cur_ny; y++) {
for (int x = 0; x < cur_nx; x++) {
size_t base_src = n_channels * (y * cur_nx + x);
size_t base_dst = y * cur_nx + x;
for (int c = 0; c < n_channels; c++) {
batch_entry[c * cur_n + base_dst] = imgs.entries[b]->buf[base_src + c];
}
}
}

View File

@ -13,16 +13,34 @@ ggml_cgraph * clip_graph_qwen3vl::build() {
int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
ggml_tensor * inp_raw = build_inp_raw();
ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
// detect video: 6-channel input means interleaved frame pairs (even_rgb + odd_rgb)
// for images (3ch), both Conv2Ds receive the same input (original behavior)
// for video (6ch), Conv2D_0 gets even frames (ch 0-2), Conv2D_1 gets odd frames (ch 3-5)
const bool is_video = (img.buf.size() == (size_t)img.nx * img.ny * 6);
const int n_channels = is_video ? 6 : 3;
ggml_tensor * inp_raw = build_inp_raw(n_channels);
ggml_tensor * inp;
if (is_video) {
const size_t nb1 = ggml_row_size(inp_raw->type, img.nx);
const size_t nb2 = nb1 * img.ny;
ggml_tensor * inp_even = ggml_view_3d(ctx0, inp_raw, img.nx, img.ny, 3, nb1, nb2, 0);
ggml_tensor * inp_odd = ggml_view_3d(ctx0, inp_raw, img.nx, img.ny, 3, nb1, nb2, nb2 * 3);
inp = ggml_add(ctx0,
ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_even, patch_size, patch_size, 0, 0, 1, 1),
ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_odd, patch_size, patch_size, 0, 0, 1, 1));
} else {
inp = ggml_add(ctx0,
ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1),
ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1));
}
GGML_ASSERT(img.nx % (patch_size * 2) == 0);
GGML_ASSERT(img.ny % (patch_size * 2) == 0);
// second conv dimension
// spatial merge
{
auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
inp = ggml_add(ctx0, inp, inp_1);
inp = ggml_permute(ctx0, inp, 1, 2, 0, 3); // [w, h, c, b] -> [c, w, h, b]
inp = ggml_cont_4d(

View File

@ -174,6 +174,28 @@ struct decode_embd_batch {
}
}
// M-RoPE for video: 3D positions [temporal, height, width]
// M-RoPE for video: 3D positions [temporal, height, width].
// Positions are written planar into `pos`: section 0 holds the temporal
// coordinate, section 1 the row, section 2 the column; the 4th section is
// unused and zeroed (n_pos_per_embd == 4).
// pos_0   : first position value; each coordinate is offset from it
// nx, ny  : token grid width / height per temporal step
// nt      : number of temporal steps (frame pairs)
// seq_id  : sequence id assigned to every token in the batch
void set_position_mrope_3d(llama_pos pos_0, int nx, int ny, int nt, llama_seq_id seq_id) {
    GGML_ASSERT(n_pos_per_embd == 4);
    // the planar writes below index up to (n_tokens - 1) + 3*n_tokens;
    // a grid/batch mismatch would write out of bounds, so check it up front
    GGML_ASSERT(nt * ny * nx == batch.n_tokens);
    seq_id_0[0] = seq_id;
    for (int t = 0; t < nt; t++) {
        for (int y = 0; y < ny; y++) {
            for (int x = 0; x < nx; x++) {
                int i = t * ny * nx + y * nx + x; // flat token index
                pos[i                     ] = pos_0 + t; // temporal section
                pos[i + batch.n_tokens    ] = pos_0 + y; // height section
                pos[i + batch.n_tokens * 2] = pos_0 + x; // width section
                pos[i + batch.n_tokens * 3] = 0;         // unused 4th section
            }
        }
    }
    for (int i = 0; i < batch.n_tokens; i++) {
        batch.n_seq_id[i] = 1;
        batch.seq_id [i] = seq_id_0.data();
        batch.logits [i] = false; // embeddings are input-only, no logits needed
    }
}
// M-RoPE for audio
void set_position_mrope_1d(llama_pos pos_0, llama_seq_id seq_id) {
GGML_ASSERT(n_pos_per_embd == 4);
@ -260,7 +282,12 @@ int32_t mtmd_helper_decode_image_chunk(
}
const int nx = mtmd_image_tokens_get_nx(image_tokens);
const int ny = mtmd_image_tokens_get_ny(image_tokens);
batch_embd.set_position_mrope_2d(n_past, nx, ny, seq_id);
const int nt = mtmd_image_tokens_get_nt(image_tokens);
if (nt > 1) {
batch_embd.set_position_mrope_3d(n_past, nx, ny, nt, seq_id);
} else {
batch_embd.set_position_mrope_2d(n_past, nx, ny, seq_id);
}
} else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
batch_embd.set_position_mrope_1d(n_past, seq_id);
} else {

View File

@ -24,9 +24,11 @@
// represents raw image data, layout is RGBRGBRGB...
// length of data must be nx * ny * 3
// for video: data is n_frames sequential RGB frames, each nx * ny * 3 bytes
struct mtmd_bitmap {
uint32_t nx;
uint32_t ny;
uint32_t n_frames = 0; // 0 for single images, >= 2 (even) for video
std::vector<unsigned char> data;
std::string id; // optional user-defined id, for ex: can be set to image hash, useful for KV cache tracking
bool is_audio = false; // true if the bitmap is audio
@ -35,8 +37,9 @@ struct mtmd_bitmap {
struct mtmd_image_tokens {
uint32_t nx; // number of tokens in x direction
uint32_t ny; // number of tokens in y direction
uint32_t nt = 1; // number of temporal positions (1 for images, > 1 for video)
bool use_mrope_pos = false; // use M-RoPE position counting (the whole image is 1 temporal position)
uint32_t n_tokens() const { return nx * ny; }
uint32_t n_tokens() const { return nt * nx * ny; }
clip_image_f32_batch batch_f32; // preprocessed image patches
std::string id; // optional user-defined ID, useful for KV cache tracking
@ -44,6 +47,7 @@ struct mtmd_image_tokens {
return mtmd_image_tokens{
nx,
ny,
nt,
use_mrope_pos,
batch_f32.clone(),
id
@ -553,6 +557,10 @@ struct mtmd_tokenizer {
}
int32_t add_media(const mtmd_bitmap * bitmap) {
if (bitmap->n_frames >= 2) {
return add_video(bitmap);
}
if (!bitmap->is_audio) {
// handle image
@ -743,6 +751,102 @@ struct mtmd_tokenizer {
return 0;
}
// Preprocess video frames and create a single image chunk with a temporal
// dimension (nt = number of frame pairs).
// Frames are paired (even + odd); each pair is interleaved into one 6-channel
// planar image so the two patch-embedding Conv2Ds can split them again.
// Each pair is encoded independently through the ViT (per-frame attention).
// Returns 0 on success, 2 on error (no vision context / preprocess failure).
// NOTE(review): if n_frames is odd, the trailing frame is silently dropped
// (n_pairs = n_frames / 2); callers are expected to pass an even count, as
// asserted in mtmd_bitmap_init_from_video — confirm no other entry point.
int32_t add_video(const mtmd_bitmap * bitmap) {
    if (!ctx->ctx_v) {
        LOG_ERR("%s: error: model does not support vision input\n", __func__);
        return 2;
    }
    const uint32_t n_frames = bitmap->n_frames;
    const uint32_t n_pairs = n_frames / 2;
    // raw RGB size of one frame inside bitmap->data
    const size_t frame_bytes = (size_t)bitmap->nx * bitmap->ny * 3;
    // opening marker text (e.g. image-begin token), if the model defines one
    if (!ctx->img_beg.empty()) {
        add_text(ctx->img_beg, true);
    }
    // preprocess each frame individually (resize/normalize to f32)
    clip_image_f32_batch all_frames;
    for (uint32_t f = 0; f < n_frames; f++) {
        clip_image_u8_ptr img_u8(clip_image_u8_init());
        img_u8->nx = bitmap->nx;
        img_u8->ny = bitmap->ny;
        img_u8->buf.resize(frame_bytes);
        // copy frame f out of the contiguous frame sequence
        std::memcpy(img_u8->buf.data(), bitmap->data.data() + f * frame_bytes, frame_bytes);
        clip_image_f32_batch frame_batch;
        bool ok = clip_image_preprocess(ctx->ctx_v, img_u8.get(), &frame_batch);
        if (!ok) {
            LOG_ERR("Unable to preprocess video frame %u\n", f);
            return 2;
        }
        // video path expects exactly one preprocessed entry per frame
        // (no slicing/tiling)
        GGML_ASSERT(frame_batch.entries.size() == 1);
        all_frames.entries.push_back(std::move(frame_batch.entries[0]));
    }
    // post-preprocess frame geometry; all frames must agree (asserted below)
    const int frame_nx = all_frames.entries[0]->nx;
    const int frame_ny = all_frames.entries[0]->ny;
    const int n_pixels = frame_nx * frame_ny;
    // interleave frame pairs into 6-channel images (even_rgb + odd_rgb),
    // pixel-interleaved: [Re,Ge,Be,Ro,Go,Bo] per pixel;
    // each pair is a separate batch entry, encoded independently
    clip_image_f32_batch pair_batch;
    for (uint32_t p = 0; p < n_pairs; p++) {
        const auto & even = all_frames.entries[p * 2];
        const auto & odd = all_frames.entries[p * 2 + 1];
        GGML_ASSERT(even->nx == frame_nx && even->ny == frame_ny);
        GGML_ASSERT(odd->nx == frame_nx && odd->ny == frame_ny);
        clip_image_f32_ptr pair(clip_image_f32_init());
        pair->nx = frame_nx;
        pair->ny = frame_ny;
        pair->buf.resize((size_t)n_pixels * 6);
        for (int i = 0; i < n_pixels; i++) {
            const int dst = i * 6; // 6 channels per output pixel
            const int src = i * 3; // 3 channels per source pixel
            pair->buf[dst + 0] = even->buf[src + 0];
            pair->buf[dst + 1] = even->buf[src + 1];
            pair->buf[dst + 2] = even->buf[src + 2];
            pair->buf[dst + 3] = odd->buf[src + 0];
            pair->buf[dst + 4] = odd->buf[src + 1];
            pair->buf[dst + 5] = odd->buf[src + 2];
        }
        pair_batch.entries.push_back(std::move(pair));
    }
    // token grid of one encoded pair (same for all pairs, same geometry)
    const uint32_t tokens_x = clip_n_output_tokens_x(ctx->ctx_v, pair_batch.entries[0].get());
    const uint32_t tokens_y = clip_n_output_tokens_y(ctx->ctx_v, pair_batch.entries[0].get());
    mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
    image_tokens->nx = tokens_x;
    image_tokens->ny = tokens_y;
    image_tokens->nt = n_pairs;           // temporal dimension = frame pairs
    image_tokens->use_mrope_pos = true;   // whole video advances pos by max(t,h,w)
    image_tokens->batch_f32 = std::move(pair_batch);
    image_tokens->id = bitmap->id;
    LOG_DBG("video: nt=%u, nx=%u, ny=%u, n_tokens=%u\n",
            image_tokens->nt, image_tokens->nx, image_tokens->ny, image_tokens->n_tokens());
    mtmd_input_chunk chunk{
        MTMD_INPUT_CHUNK_TYPE_IMAGE,
        {}, // text tokens
        std::move(image_tokens),
        nullptr, // audio tokens
    };
    cur.entries.emplace_back(std::move(chunk));
    // closing marker text, if the model defines one
    if (!ctx->img_end.empty()) {
        add_text(ctx->img_end, true);
    }
    return 0;
}
std::vector<mtmd_input_chunk> split_batch_to_chunk(clip_image_f32_batch && batch_f32, const std::string & id) {
std::vector<mtmd_input_chunk> chunks;
@ -855,10 +959,13 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens)
ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd);
bool ok = false;
if (clip_is_llava(ctx_clip)
if (image_tokens->nt > 1
|| clip_is_llava(ctx_clip)
|| clip_is_minicpmv(ctx_clip)
|| clip_is_glm(ctx_clip)) {
// TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
// encode each batch entry independently
// video: each entry is one frame pair, encoded with per-frame attention
// llava/minicpmv/glm: does not support batched encoding
const auto & entries = image_tokens->batch_f32.entries;
for (size_t i = 0; i < entries.size(); i++) {
int n_tokens_per_image = clip_n_output_tokens(ctx_clip, entries[i].get());
@ -938,6 +1045,21 @@ mtmd_bitmap * mtmd_bitmap_init(uint32_t nx,
return bitmap;
}
// Create a bitmap holding a sequence of RGB video frames.
// data must contain n_frames consecutive frames, each nx*ny*3 bytes;
// n_frames must be even and at least 2 (frames are consumed in pairs).
// The data is copied; the caller keeps ownership of `data`.
mtmd_bitmap * mtmd_bitmap_init_from_video(uint32_t nx,
                                          uint32_t ny,
                                          uint32_t n_frames,
                                          const unsigned char * data) {
    GGML_ASSERT(n_frames >= 2 && n_frames % 2 == 0);
    mtmd_bitmap * bitmap = new mtmd_bitmap;
    bitmap->nx       = nx;
    bitmap->ny       = ny;
    bitmap->n_frames = n_frames;
    const size_t n_bytes = (size_t)nx * ny * 3 * n_frames;
    bitmap->data.assign(data, data + n_bytes);
    return bitmap;
}
mtmd_bitmap * mtmd_bitmap_init_from_audio(size_t n_samples,
const float * data) {
mtmd_bitmap * bitmap = new mtmd_bitmap;
@ -970,6 +1092,14 @@ bool mtmd_bitmap_is_audio(const mtmd_bitmap * bitmap) {
return bitmap->is_audio;
}
// A bitmap is a video when it carries at least one frame pair
// (n_frames is 0 for still images, >= 2 and even for video).
bool mtmd_bitmap_is_video(const mtmd_bitmap * bitmap) {
    return bitmap->n_frames > 1;
}
// Number of video frames stored in the bitmap (0 for still images).
uint32_t mtmd_bitmap_get_n_frames(const mtmd_bitmap * bitmap) {
    return bitmap->n_frames;
}
const char * mtmd_bitmap_get_id(const mtmd_bitmap * bitmap) {
return bitmap->id.c_str();
}
@ -1106,15 +1236,18 @@ size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens) {
return image_tokens->ny;
}
// Number of temporal positions (1 for images, > 1 for video frame pairs).
size_t mtmd_image_tokens_get_nt(const mtmd_image_tokens * image_tokens) {
    return image_tokens->nt;
}
// Optional user-defined ID (e.g. an image hash), useful for KV cache tracking.
// Returns an empty string if no ID was set; pointer is owned by image_tokens.
const char * mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens) {
    return image_tokens->id.c_str();
}
// Number of positions the media occupies in the sequence.
// For M-RoPE the whole media advances the position counter by max(t, h, w);
// otherwise every token consumes one position.
// NOTE: the previous revision returned max(nx, ny) here and left a stale,
// unreachable duplicate return in place; only the video-aware value is kept.
llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens) {
    if (image_tokens->use_mrope_pos) {
        // for M-RoPE, n_pos = max(t, h, w)
        return (llama_pos)std::max({image_tokens->nt, image_tokens->nx, image_tokens->ny});
    }
    return image_tokens->n_tokens();
}

View File

@ -134,17 +134,24 @@ MTMD_API int mtmd_get_audio_sample_rate(mtmd_context * ctx);
// if bitmap is image:
// length of data must be nx * ny * 3
// the data is in RGBRGBRGB... format
// if bitmap is video:
// length of data must be nx * ny * 3 * n_frames
// n_frames must be >= 2 and even
// frames are sequential RGB, each nx * ny * 3 bytes
// if bitmap is audio:
// length of data must be n_samples * sizeof(float)
// the data is in float format (PCM F32)
MTMD_API mtmd_bitmap * mtmd_bitmap_init (uint32_t nx, uint32_t ny, const unsigned char * data);
MTMD_API mtmd_bitmap * mtmd_bitmap_init_from_video(uint32_t nx, uint32_t ny, uint32_t n_frames, const unsigned char * data);
MTMD_API mtmd_bitmap * mtmd_bitmap_init_from_audio(size_t n_samples, const float * data);
MTMD_API uint32_t mtmd_bitmap_get_nx (const mtmd_bitmap * bitmap);
MTMD_API uint32_t mtmd_bitmap_get_ny (const mtmd_bitmap * bitmap);
MTMD_API const unsigned char * mtmd_bitmap_get_data (const mtmd_bitmap * bitmap);
MTMD_API size_t mtmd_bitmap_get_n_bytes(const mtmd_bitmap * bitmap);
MTMD_API bool mtmd_bitmap_is_audio (const mtmd_bitmap * bitmap);
MTMD_API void mtmd_bitmap_free (mtmd_bitmap * bitmap);
MTMD_API uint32_t mtmd_bitmap_get_nx (const mtmd_bitmap * bitmap);
MTMD_API uint32_t mtmd_bitmap_get_ny (const mtmd_bitmap * bitmap);
MTMD_API const unsigned char * mtmd_bitmap_get_data (const mtmd_bitmap * bitmap);
MTMD_API size_t mtmd_bitmap_get_n_bytes (const mtmd_bitmap * bitmap);
MTMD_API bool mtmd_bitmap_is_audio (const mtmd_bitmap * bitmap);
MTMD_API bool mtmd_bitmap_is_video (const mtmd_bitmap * bitmap);
MTMD_API uint32_t mtmd_bitmap_get_n_frames(const mtmd_bitmap * bitmap);
MTMD_API void mtmd_bitmap_free (mtmd_bitmap * bitmap);
// bitmap ID is optional, but useful for KV cache tracking
// these getters/setters are dedicated functions, so you can for example calculate the hash of the image based on mtmd_bitmap_get_data()
MTMD_API const char * mtmd_bitmap_get_id(const mtmd_bitmap * bitmap);
@ -187,6 +194,7 @@ MTMD_API void mtmd_input_chunk_free(mtmd_input_chunk * chunk);
MTMD_API size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens); // TODO: deprecate
MTMD_API size_t mtmd_image_tokens_get_nx (const mtmd_image_tokens * image_tokens);
MTMD_API size_t mtmd_image_tokens_get_ny (const mtmd_image_tokens * image_tokens);
MTMD_API size_t mtmd_image_tokens_get_nt (const mtmd_image_tokens * image_tokens);
MTMD_API const char * mtmd_image_tokens_get_id (const mtmd_image_tokens * image_tokens); // TODO: deprecate
// number of temporal positions (equals to max(t,h,w) for M-RoPE; equals to n_tokens otherwise)
MTMD_API llama_pos mtmd_image_tokens_get_n_pos (const mtmd_image_tokens * image_tokens); // TODO: deprecate
@ -276,9 +284,14 @@ struct bitmap {
bitmap(uint32_t nx, uint32_t ny, const unsigned char * data) {
ptr.reset(mtmd_bitmap_init(nx, ny, data));
}
bitmap(uint32_t nx, uint32_t ny, uint32_t n_frames, const unsigned char * data) {
ptr.reset(mtmd_bitmap_init_from_video(nx, ny, n_frames, data));
}
~bitmap() = default;
uint32_t nx() const { return mtmd_bitmap_get_nx(ptr.get()); }
uint32_t ny() const { return mtmd_bitmap_get_ny(ptr.get()); }
uint32_t nx() const { return mtmd_bitmap_get_nx(ptr.get()); }
uint32_t ny() const { return mtmd_bitmap_get_ny(ptr.get()); }
uint32_t n_frames() const { return mtmd_bitmap_get_n_frames(ptr.get()); }
bool is_video() const { return mtmd_bitmap_is_video(ptr.get()); }
const unsigned char * data() const { return mtmd_bitmap_get_data(ptr.get()); }
size_t n_bytes() const { return mtmd_bitmap_get_n_bytes(ptr.get()); }
std::string id() const { return mtmd_bitmap_get_id(ptr.get()); }