This commit is contained in:
Andrew Sampson 2026-03-16 02:08:16 +08:00 committed by GitHub
commit bb941c5269
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 233 additions and 34 deletions

View File

@ -3615,9 +3615,17 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
// set input pixel values
if (!imgs.is_audio) {
// detect number of channels from the buffer size
const int nx = imgs.entries[0]->nx;
const int ny = imgs.entries[0]->ny;
const int n = nx * ny;
const size_t buf_size = imgs.entries[0]->buf.size();
const int n_channels = (int)(buf_size / n);
GGML_ASSERT(n_channels == 3 || n_channels == 6);
size_t nelem = 0;
for (const auto & img : imgs.entries) {
nelem += img->nx * img->ny * 3;
nelem += img->nx * img->ny * n_channels;
}
std::vector<float> inp_raw(nelem);
@ -3631,21 +3639,21 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
// │ H │ channel = B
// └─────┘ │
// ──────┘ x B
//
// for 6-channel video input, same layout but with 6 planar channels
for (size_t i = 0; i < imgs.entries.size(); i++) {
const int nx = imgs.entries[i]->nx;
const int ny = imgs.entries[i]->ny;
const int n = nx * ny;
for (int b = 0; b < batch_size; b++) {
const int cur_nx = imgs.entries[b]->nx;
const int cur_ny = imgs.entries[b]->ny;
const int cur_n = cur_nx * cur_ny;
for (int b = 0; b < batch_size; b++) {
float * batch_entry = inp_raw.data() + b * (3*n);
for (int y = 0; y < ny; y++) {
for (int x = 0; x < nx; x++) {
size_t base_src = 3*(y * nx + x); // idx of the first channel
size_t base_dst = y * nx + x; // idx of the first channel
batch_entry[ base_dst] = imgs.entries[b]->buf[base_src ];
batch_entry[1*n + base_dst] = imgs.entries[b]->buf[base_src + 1];
batch_entry[2*n + base_dst] = imgs.entries[b]->buf[base_src + 2];
float * batch_entry = inp_raw.data() + b * (n_channels * cur_n);
for (int y = 0; y < cur_ny; y++) {
for (int x = 0; x < cur_nx; x++) {
size_t base_src = n_channels * (y * cur_nx + x);
size_t base_dst = y * cur_nx + x;
for (int c = 0; c < n_channels; c++) {
batch_entry[c * cur_n + base_dst] = imgs.entries[b]->buf[base_src + c];
}
}
}

View File

@ -13,16 +13,34 @@ ggml_cgraph * clip_graph_qwen3vl::build() {
int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
ggml_tensor * inp_raw = build_inp_raw();
ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
// detect video: 6-channel input means interleaved frame pairs (even_rgb + odd_rgb)
// for images (3ch), both Conv2Ds receive the same input (original behavior)
// for video (6ch), Conv2D_0 gets even frames (ch 0-2), Conv2D_1 gets odd frames (ch 3-5)
const bool is_video = (img.buf.size() == (size_t)img.nx * img.ny * 6);
const int n_channels = is_video ? 6 : 3;
ggml_tensor * inp_raw = build_inp_raw(n_channels);
ggml_tensor * inp;
if (is_video) {
const size_t nb1 = ggml_row_size(inp_raw->type, img.nx);
const size_t nb2 = nb1 * img.ny;
ggml_tensor * inp_even = ggml_view_3d(ctx0, inp_raw, img.nx, img.ny, 3, nb1, nb2, 0);
ggml_tensor * inp_odd = ggml_view_3d(ctx0, inp_raw, img.nx, img.ny, 3, nb1, nb2, nb2 * 3);
inp = ggml_add(ctx0,
ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_even, patch_size, patch_size, 0, 0, 1, 1),
ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_odd, patch_size, patch_size, 0, 0, 1, 1));
} else {
inp = ggml_add(ctx0,
ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1),
ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1));
}
GGML_ASSERT(img.nx % (patch_size * 2) == 0);
GGML_ASSERT(img.ny % (patch_size * 2) == 0);
// second conv dimension
// spatial merge
{
auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
inp = ggml_add(ctx0, inp, inp_1);
inp = ggml_permute(ctx0, inp, 1, 2, 0, 3); // [w, h, c, b] -> [c, w, h, b]
inp = ggml_cont_4d(

View File

@ -174,6 +174,28 @@ struct decode_embd_batch {
}
}
// M-RoPE for video: 3D positions [temporal, height, width]
// M-RoPE for video: 3D positions [temporal, height, width].
// Positions are written planar into `pos`: section 0 holds the temporal
// coordinate, section 1 the row, section 2 the column; the 4th section is
// unused and zeroed (n_pos_per_embd == 4).
// pos_0   : first position value; each coordinate is offset from it
// nx, ny  : token grid width / height per temporal step
// nt      : number of temporal steps (frame pairs)
// seq_id  : sequence id assigned to every token in the batch
void set_position_mrope_3d(llama_pos pos_0, int nx, int ny, int nt, llama_seq_id seq_id) {
    GGML_ASSERT(n_pos_per_embd == 4);
    // the planar writes below index up to (n_tokens - 1) + 3*n_tokens;
    // a grid/batch mismatch would write out of bounds, so check it up front
    GGML_ASSERT(nt * ny * nx == batch.n_tokens);
    seq_id_0[0] = seq_id;
    for (int t = 0; t < nt; t++) {
        for (int y = 0; y < ny; y++) {
            for (int x = 0; x < nx; x++) {
                int i = t * ny * nx + y * nx + x; // flat token index
                pos[i                     ] = pos_0 + t; // temporal section
                pos[i + batch.n_tokens    ] = pos_0 + y; // height section
                pos[i + batch.n_tokens * 2] = pos_0 + x; // width section
                pos[i + batch.n_tokens * 3] = 0;         // unused 4th section
            }
        }
    }
    for (int i = 0; i < batch.n_tokens; i++) {
        batch.n_seq_id[i] = 1;
        batch.seq_id [i] = seq_id_0.data();
        batch.logits [i] = false; // embeddings are input-only, no logits needed
    }
}
// M-RoPE for audio
void set_position_mrope_1d(llama_pos pos_0, llama_seq_id seq_id) {
GGML_ASSERT(n_pos_per_embd == 4);
@ -260,7 +282,12 @@ int32_t mtmd_helper_decode_image_chunk(
}
const int nx = mtmd_image_tokens_get_nx(image_tokens);
const int ny = mtmd_image_tokens_get_ny(image_tokens);
batch_embd.set_position_mrope_2d(n_past, nx, ny, seq_id);
const int nt = mtmd_image_tokens_get_nt(image_tokens);
if (nt > 1) {
batch_embd.set_position_mrope_3d(n_past, nx, ny, nt, seq_id);
} else {
batch_embd.set_position_mrope_2d(n_past, nx, ny, seq_id);
}
} else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
batch_embd.set_position_mrope_1d(n_past, seq_id);
} else {

View File

@ -24,9 +24,11 @@
// represents raw image data, layout is RGBRGBRGB...
// length of data must be nx * ny * 3
// for video: data is n_frames sequential RGB frames, each nx * ny * 3 bytes
struct mtmd_bitmap {
uint32_t nx;
uint32_t ny;
uint32_t n_frames = 0; // 0 for single images, >= 2 (even) for video
std::vector<unsigned char> data;
std::string id; // optional user-defined id, for ex: can be set to image hash, useful for KV cache tracking
bool is_audio = false; // true if the bitmap is audio
@ -35,8 +37,9 @@ struct mtmd_bitmap {
struct mtmd_image_tokens {
uint32_t nx; // number of tokens in x direction
uint32_t ny; // number of tokens in y direction
uint32_t nt = 1; // number of temporal positions (1 for images, > 1 for video)
bool use_mrope_pos = false; // use M-RoPE position counting (the whole image is 1 temporal position)
uint32_t n_tokens() const { return nx * ny; }
uint32_t n_tokens() const { return nt * nx * ny; }
clip_image_f32_batch batch_f32; // preprocessed image patches
std::string id; // optional user-defined ID, useful for KV cache tracking
@ -44,6 +47,7 @@ struct mtmd_image_tokens {
return mtmd_image_tokens{
nx,
ny,
nt,
use_mrope_pos,
batch_f32.clone(),
id
@ -553,6 +557,10 @@ struct mtmd_tokenizer {
}
int32_t add_media(const mtmd_bitmap * bitmap) {
if (bitmap->n_frames >= 2) {
return add_video(bitmap);
}
if (!bitmap->is_audio) {
// handle image
@ -743,6 +751,102 @@ struct mtmd_tokenizer {
return 0;
}
// Preprocess video frames and create a single image chunk with a temporal
// dimension (nt = number of frame pairs).
// Frames are paired (even + odd); each pair is interleaved into one 6-channel
// planar image so the two patch-embedding Conv2Ds can split them again.
// Each pair is encoded independently through the ViT (per-frame attention).
// Returns 0 on success, 2 on error (no vision context / preprocess failure).
// NOTE(review): if n_frames is odd, the trailing frame is silently dropped
// (n_pairs = n_frames / 2); callers are expected to pass an even count, as
// asserted in mtmd_bitmap_init_from_video — confirm no other entry point.
int32_t add_video(const mtmd_bitmap * bitmap) {
    if (!ctx->ctx_v) {
        LOG_ERR("%s: error: model does not support vision input\n", __func__);
        return 2;
    }
    const uint32_t n_frames = bitmap->n_frames;
    const uint32_t n_pairs = n_frames / 2;
    // raw RGB size of one frame inside bitmap->data
    const size_t frame_bytes = (size_t)bitmap->nx * bitmap->ny * 3;
    // opening marker text (e.g. image-begin token), if the model defines one
    if (!ctx->img_beg.empty()) {
        add_text(ctx->img_beg, true);
    }
    // preprocess each frame individually (resize/normalize to f32)
    clip_image_f32_batch all_frames;
    for (uint32_t f = 0; f < n_frames; f++) {
        clip_image_u8_ptr img_u8(clip_image_u8_init());
        img_u8->nx = bitmap->nx;
        img_u8->ny = bitmap->ny;
        img_u8->buf.resize(frame_bytes);
        // copy frame f out of the contiguous frame sequence
        std::memcpy(img_u8->buf.data(), bitmap->data.data() + f * frame_bytes, frame_bytes);
        clip_image_f32_batch frame_batch;
        bool ok = clip_image_preprocess(ctx->ctx_v, img_u8.get(), &frame_batch);
        if (!ok) {
            LOG_ERR("Unable to preprocess video frame %u\n", f);
            return 2;
        }
        // video path expects exactly one preprocessed entry per frame
        // (no slicing/tiling)
        GGML_ASSERT(frame_batch.entries.size() == 1);
        all_frames.entries.push_back(std::move(frame_batch.entries[0]));
    }
    // post-preprocess frame geometry; all frames must agree (asserted below)
    const int frame_nx = all_frames.entries[0]->nx;
    const int frame_ny = all_frames.entries[0]->ny;
    const int n_pixels = frame_nx * frame_ny;
    // interleave frame pairs into 6-channel images (even_rgb + odd_rgb),
    // pixel-interleaved: [Re,Ge,Be,Ro,Go,Bo] per pixel;
    // each pair is a separate batch entry, encoded independently
    clip_image_f32_batch pair_batch;
    for (uint32_t p = 0; p < n_pairs; p++) {
        const auto & even = all_frames.entries[p * 2];
        const auto & odd = all_frames.entries[p * 2 + 1];
        GGML_ASSERT(even->nx == frame_nx && even->ny == frame_ny);
        GGML_ASSERT(odd->nx == frame_nx && odd->ny == frame_ny);
        clip_image_f32_ptr pair(clip_image_f32_init());
        pair->nx = frame_nx;
        pair->ny = frame_ny;
        pair->buf.resize((size_t)n_pixels * 6);
        for (int i = 0; i < n_pixels; i++) {
            const int dst = i * 6; // 6 channels per output pixel
            const int src = i * 3; // 3 channels per source pixel
            pair->buf[dst + 0] = even->buf[src + 0];
            pair->buf[dst + 1] = even->buf[src + 1];
            pair->buf[dst + 2] = even->buf[src + 2];
            pair->buf[dst + 3] = odd->buf[src + 0];
            pair->buf[dst + 4] = odd->buf[src + 1];
            pair->buf[dst + 5] = odd->buf[src + 2];
        }
        pair_batch.entries.push_back(std::move(pair));
    }
    // token grid of one encoded pair (same for all pairs, same geometry)
    const uint32_t tokens_x = clip_n_output_tokens_x(ctx->ctx_v, pair_batch.entries[0].get());
    const uint32_t tokens_y = clip_n_output_tokens_y(ctx->ctx_v, pair_batch.entries[0].get());
    mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
    image_tokens->nx = tokens_x;
    image_tokens->ny = tokens_y;
    image_tokens->nt = n_pairs;           // temporal dimension = frame pairs
    image_tokens->use_mrope_pos = true;   // whole video advances pos by max(t,h,w)
    image_tokens->batch_f32 = std::move(pair_batch);
    image_tokens->id = bitmap->id;
    LOG_DBG("video: nt=%u, nx=%u, ny=%u, n_tokens=%u\n",
            image_tokens->nt, image_tokens->nx, image_tokens->ny, image_tokens->n_tokens());
    mtmd_input_chunk chunk{
        MTMD_INPUT_CHUNK_TYPE_IMAGE,
        {}, // text tokens
        std::move(image_tokens),
        nullptr, // audio tokens
    };
    cur.entries.emplace_back(std::move(chunk));
    // closing marker text, if the model defines one
    if (!ctx->img_end.empty()) {
        add_text(ctx->img_end, true);
    }
    return 0;
}
std::vector<mtmd_input_chunk> split_batch_to_chunk(clip_image_f32_batch && batch_f32, const std::string & id) {
std::vector<mtmd_input_chunk> chunks;
@ -855,10 +959,13 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens)
ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd);
bool ok = false;
if (clip_is_llava(ctx_clip)
if (image_tokens->nt > 1
|| clip_is_llava(ctx_clip)
|| clip_is_minicpmv(ctx_clip)
|| clip_is_glm(ctx_clip)) {
// TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
// encode each batch entry independently
// video: each entry is one frame pair, encoded with per-frame attention
// llava/minicpmv/glm: does not support batched encoding
const auto & entries = image_tokens->batch_f32.entries;
for (size_t i = 0; i < entries.size(); i++) {
int n_tokens_per_image = clip_n_output_tokens(ctx_clip, entries[i].get());
@ -938,6 +1045,21 @@ mtmd_bitmap * mtmd_bitmap_init(uint32_t nx,
return bitmap;
}
// Create a bitmap holding a sequence of RGB video frames.
// data must contain n_frames consecutive frames, each nx*ny*3 bytes;
// n_frames must be even and at least 2 (frames are consumed in pairs).
// The data is copied; the caller keeps ownership of `data`.
mtmd_bitmap * mtmd_bitmap_init_from_video(uint32_t nx,
                                          uint32_t ny,
                                          uint32_t n_frames,
                                          const unsigned char * data) {
    GGML_ASSERT(n_frames >= 2 && n_frames % 2 == 0);
    mtmd_bitmap * bitmap = new mtmd_bitmap;
    bitmap->nx       = nx;
    bitmap->ny       = ny;
    bitmap->n_frames = n_frames;
    const size_t n_bytes = (size_t)nx * ny * 3 * n_frames;
    bitmap->data.assign(data, data + n_bytes);
    return bitmap;
}
mtmd_bitmap * mtmd_bitmap_init_from_audio(size_t n_samples,
const float * data) {
mtmd_bitmap * bitmap = new mtmd_bitmap;
@ -970,6 +1092,14 @@ bool mtmd_bitmap_is_audio(const mtmd_bitmap * bitmap) {
return bitmap->is_audio;
}
// A bitmap is a video when it carries at least one frame pair
// (n_frames is 0 for still images, >= 2 and even for video).
bool mtmd_bitmap_is_video(const mtmd_bitmap * bitmap) {
    return bitmap->n_frames > 1;
}
// Number of video frames stored in the bitmap (0 for still images).
uint32_t mtmd_bitmap_get_n_frames(const mtmd_bitmap * bitmap) {
    return bitmap->n_frames;
}
const char * mtmd_bitmap_get_id(const mtmd_bitmap * bitmap) {
return bitmap->id.c_str();
}
@ -1106,15 +1236,18 @@ size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens) {
return image_tokens->ny;
}
// Number of temporal positions (1 for images, > 1 for video frame pairs).
size_t mtmd_image_tokens_get_nt(const mtmd_image_tokens * image_tokens) {
    return image_tokens->nt;
}
// Optional user-defined ID (e.g. an image hash), useful for KV cache tracking.
// Returns an empty string if no ID was set; pointer is owned by image_tokens.
const char * mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens) {
    return image_tokens->id.c_str();
}
// Number of positions the media occupies in the sequence.
// For M-RoPE the whole media advances the position counter by max(t, h, w);
// otherwise every token consumes one position.
// NOTE: the previous revision returned max(nx, ny) here and left a stale,
// unreachable duplicate return in place; only the video-aware value is kept.
llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens) {
    if (image_tokens->use_mrope_pos) {
        // for M-RoPE, n_pos = max(t, h, w)
        return (llama_pos)std::max({image_tokens->nt, image_tokens->nx, image_tokens->ny});
    }
    return image_tokens->n_tokens();
}

View File

@ -134,17 +134,24 @@ MTMD_API int mtmd_get_audio_sample_rate(mtmd_context * ctx);
// if bitmap is image:
// length of data must be nx * ny * 3
// the data is in RGBRGBRGB... format
// if bitmap is video:
// length of data must be nx * ny * 3 * n_frames
// n_frames must be >= 2 and even
// frames are sequential RGB, each nx * ny * 3 bytes
// if bitmap is audio:
// length of data must be n_samples * sizeof(float)
// the data is in float format (PCM F32)
MTMD_API mtmd_bitmap * mtmd_bitmap_init (uint32_t nx, uint32_t ny, const unsigned char * data);
MTMD_API mtmd_bitmap * mtmd_bitmap_init_from_video(uint32_t nx, uint32_t ny, uint32_t n_frames, const unsigned char * data);
MTMD_API mtmd_bitmap * mtmd_bitmap_init_from_audio(size_t n_samples, const float * data);
MTMD_API uint32_t mtmd_bitmap_get_nx (const mtmd_bitmap * bitmap);
MTMD_API uint32_t mtmd_bitmap_get_ny (const mtmd_bitmap * bitmap);
MTMD_API const unsigned char * mtmd_bitmap_get_data (const mtmd_bitmap * bitmap);
MTMD_API size_t mtmd_bitmap_get_n_bytes(const mtmd_bitmap * bitmap);
MTMD_API bool mtmd_bitmap_is_audio (const mtmd_bitmap * bitmap);
MTMD_API void mtmd_bitmap_free (mtmd_bitmap * bitmap);
MTMD_API uint32_t mtmd_bitmap_get_nx (const mtmd_bitmap * bitmap);
MTMD_API uint32_t mtmd_bitmap_get_ny (const mtmd_bitmap * bitmap);
MTMD_API const unsigned char * mtmd_bitmap_get_data (const mtmd_bitmap * bitmap);
MTMD_API size_t mtmd_bitmap_get_n_bytes (const mtmd_bitmap * bitmap);
MTMD_API bool mtmd_bitmap_is_audio (const mtmd_bitmap * bitmap);
MTMD_API bool mtmd_bitmap_is_video (const mtmd_bitmap * bitmap);
MTMD_API uint32_t mtmd_bitmap_get_n_frames(const mtmd_bitmap * bitmap);
MTMD_API void mtmd_bitmap_free (mtmd_bitmap * bitmap);
// bitmap ID is optional, but useful for KV cache tracking
// these getters/setters are dedicated functions, so you can for example calculate the hash of the image based on mtmd_bitmap_get_data()
MTMD_API const char * mtmd_bitmap_get_id(const mtmd_bitmap * bitmap);
@ -187,6 +194,7 @@ MTMD_API void mtmd_input_chunk_free(mtmd_input_chunk * chunk);
MTMD_API size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens); // TODO: deprecate
MTMD_API size_t mtmd_image_tokens_get_nx (const mtmd_image_tokens * image_tokens);
MTMD_API size_t mtmd_image_tokens_get_ny (const mtmd_image_tokens * image_tokens);
MTMD_API size_t mtmd_image_tokens_get_nt (const mtmd_image_tokens * image_tokens);
MTMD_API const char * mtmd_image_tokens_get_id (const mtmd_image_tokens * image_tokens); // TODO: deprecate
// number of temporal positions (equals to max(t,h,w) for M-RoPE; equals to n_tokens otherwise)
MTMD_API llama_pos mtmd_image_tokens_get_n_pos (const mtmd_image_tokens * image_tokens); // TODO: deprecate
@ -276,9 +284,14 @@ struct bitmap {
bitmap(uint32_t nx, uint32_t ny, const unsigned char * data) {
ptr.reset(mtmd_bitmap_init(nx, ny, data));
}
bitmap(uint32_t nx, uint32_t ny, uint32_t n_frames, const unsigned char * data) {
ptr.reset(mtmd_bitmap_init_from_video(nx, ny, n_frames, data));
}
~bitmap() = default;
uint32_t nx() const { return mtmd_bitmap_get_nx(ptr.get()); }
uint32_t ny() const { return mtmd_bitmap_get_ny(ptr.get()); }
uint32_t nx() const { return mtmd_bitmap_get_nx(ptr.get()); }
uint32_t ny() const { return mtmd_bitmap_get_ny(ptr.get()); }
uint32_t n_frames() const { return mtmd_bitmap_get_n_frames(ptr.get()); }
bool is_video() const { return mtmd_bitmap_is_video(ptr.get()); }
const unsigned char * data() const { return mtmd_bitmap_get_data(ptr.get()); }
size_t n_bytes() const { return mtmd_bitmap_get_n_bytes(ptr.get()); }
std::string id() const { return mtmd_bitmap_get_id(ptr.get()); }