From 871f1a2d2f54059d66ccb985aa43ff4dbabab972 Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Fri, 27 Mar 2026 11:00:52 +0100 Subject: [PATCH] mtmd: add more sanity checks (#21047) --- tools/mtmd/clip.cpp | 10 ++++++++++ tools/mtmd/mtmd-audio.cpp | 18 ++++++++++-------- tools/mtmd/mtmd-audio.h | 4 ++-- tools/mtmd/mtmd-helper.cpp | 10 ++++++++++ tools/mtmd/mtmd-image.cpp | 8 ++++++-- tools/mtmd/mtmd.cpp | 11 ++++++++++- 6 files changed, 48 insertions(+), 13 deletions(-) diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index fd1cb0dfea..2947fcf9a3 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -1377,6 +1377,16 @@ struct clip_model_loader { // sanity check { + if (hparams.image_size < 0) { + // note: some models having hparams.image_size == 0, which means the image size is dynamic + throw std::runtime_error(string_format("%s: image_size (%d) cannot be negative\n", __func__, hparams.image_size)); + } + if (hparams.patch_size <= 0) { + throw std::runtime_error(string_format("%s: patch_size (%d) must be greater than 0\n", __func__, hparams.patch_size)); + } + if (hparams.n_embd <= 0) { + throw std::runtime_error(string_format("%s: n_embd (%d) must be greater than 0\n", __func__, hparams.n_embd)); + } if (hparams.image_max_pixels < hparams.image_min_pixels) { throw std::runtime_error(string_format("%s: image_max_pixels (%d) is less than image_min_pixels (%d)\n", __func__, hparams.image_max_pixels, hparams.image_min_pixels)); } diff --git a/tools/mtmd/mtmd-audio.cpp b/tools/mtmd/mtmd-audio.cpp index 447f61aaa4..e68387c273 100644 --- a/tools/mtmd/mtmd-audio.cpp +++ b/tools/mtmd/mtmd-audio.cpp @@ -13,23 +13,20 @@ constexpr bool DEBUG = false; -void mtmd_audio_cache::fill_sin_cos_table(int n) { +void mtmd_audio_cache::fill_sin_cos_table(uint32_t n) { sin_vals.resize(n); cos_vals.resize(n); - for (int i = 0; i < n; i++) { + for (uint32_t i = 0; i < n; i++) { double theta = (2 * M_PI * i) / n; sin_vals[i] = sinf(theta); cos_vals[i] = cosf(theta); } } -void mtmd_audio_cache::fill_hann_window(int length, bool periodic) { +void mtmd_audio_cache::fill_hann_window(uint32_t length, bool periodic) { hann_window.resize(length); - int offset = -1; - if (periodic) { - offset = 0; - } - for (int i = 0; i < length; i++) { + int offset = periodic ? 0 : -1; + for (uint32_t i = 0; i < length; i++) { hann_window[i] = 0.5 * (1.0 - cosf((2.0 * M_PI * i) / (length + offset))); } } @@ -165,6 +162,7 @@ static void dft_impl(const mtmd_audio_cache & cache, const float * in, int N, fl // false = input is complex-valued (interleaved real/imag, stride 2) template static void fft_impl(const mtmd_audio_cache & cache, float * in, int N, float * out) { + GGML_ASSERT(N > 0); const int n_sin_cos_vals = cache.sin_vals.size(); if (N == 1) { @@ -407,6 +405,8 @@ static bool log_mel_spectrogram( } + GGML_ASSERT(params.n_fft_bins > 0); + GGML_ASSERT(params.hop_length > 0); out.n_mel = params.n_mel; out.n_len = (n_samples - frame_size) / frame_step + 1; // TODO: handle these checks better @@ -438,6 +438,7 @@ static bool log_mel_spectrogram( const int effective_n_len = n_samples_in / frame_step; if (params.norm_per_feature) { + GGML_ASSERT(effective_n_len > 1); for (int i = 0; i < out.n_mel; i++) { double mean = 0; for (int j = 0; j < effective_n_len; ++j) { @@ -639,6 +640,7 @@ mtmd_audio_streaming_istft::mtmd_audio_streaming_istft(int n_fft, int hop_length padding_to_remove((n_fft - hop_length) / 2), ifft_in(n_fft * 2 * 4, 0.0f), // extra space for recursive IFFT ifft_out(n_fft * 2 * 4, 0.0f) { + GGML_ASSERT(n_fft > 0 && hop_length > 0 && hop_length <= n_fft); cache.fill_sin_cos_table(n_fft); cache.fill_hann_window(n_fft, true); } diff --git a/tools/mtmd/mtmd-audio.h b/tools/mtmd/mtmd-audio.h index 016c7392e4..53857a2eb5 100644 --- a/tools/mtmd/mtmd-audio.h +++ b/tools/mtmd/mtmd-audio.h @@ -33,9 +33,9 @@ struct mtmd_audio_cache { mtmd_audio_mel_filters filters; - void fill_sin_cos_table(int n); + void fill_sin_cos_table(uint32_t n); - void fill_hann_window(int length, bool periodic); + void fill_hann_window(uint32_t length, bool periodic); // Build mel filterbank matrix [n_mel × n_fft_bins] at runtime. // n_fft_bins must be (N_fft / 2 + 1). Example: if N_fft=512 -> n_fft_bins=257. diff --git a/tools/mtmd/mtmd-helper.cpp b/tools/mtmd/mtmd-helper.cpp index 5bcb7ec1bc..778aacb61d 100644 --- a/tools/mtmd/mtmd-helper.cpp +++ b/tools/mtmd/mtmd-helper.cpp @@ -127,6 +127,7 @@ struct decode_embd_batch { std::vector logits; llama_batch batch; decode_embd_batch(float * embd, int32_t n_tokens, int n_pos_per_embd, int n_mmproj_embd) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) { + GGML_ASSERT(n_tokens > 0 && n_pos_per_embd > 0 && n_mmproj_embd > 0); pos .resize(n_tokens * n_pos_per_embd); n_seq_id.resize(n_tokens); seq_ids .resize(n_tokens + 1); @@ -157,6 +158,7 @@ struct decode_embd_batch { // M-RoPE for image void set_position_mrope_2d(llama_pos pos_0, int nx, int ny, llama_seq_id seq_id) { GGML_ASSERT(n_pos_per_embd == 4); + GGML_ASSERT(nx > 0 && ny > 0 && nx * ny == batch.n_tokens); seq_id_0[0] = seq_id; for (int y = 0; y < ny; y++) { for (int x = 0; x < nx; x++) { @@ -192,6 +194,7 @@ struct decode_embd_batch { } llama_batch get_view(int offset, int n_tokens) { + GGML_ASSERT(offset >= 0 && n_tokens > 0 && offset + n_tokens <= batch.n_tokens); llama_pos * pos_ptr; pos_view.clear(); pos_view.reserve(n_tokens * n_pos_per_embd); @@ -235,6 +238,7 @@ int32_t mtmd_helper_decode_image_chunk( llama_seq_id seq_id, int32_t n_batch, llama_pos * new_n_past) { + GGML_ASSERT(n_batch > 0); auto chunk_type = mtmd_input_chunk_get_type(chunk); const char * name = chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE ? "image" : "audio"; if (chunk_type == MTMD_INPUT_CHUNK_TYPE_TEXT) { @@ -312,6 +316,7 @@ int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx, int32_t n_batch, bool logits_last, llama_pos * new_n_past) { + GGML_ASSERT(n_batch > 0); int32_t ret; llama_batch text_batch = llama_batch_init(n_batch, 0, 1); auto chunk_type = mtmd_input_chunk_get_type(chunk); @@ -508,6 +513,11 @@ mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fseek(f, 0, SEEK_END); long file_size = ftell(f); fseek(f, 0, SEEK_SET); + if (file_size < 0) { + LOG_ERR("Failed to get file size of %s\n", fname); + fclose(f); + return nullptr; + } buf.resize(file_size); size_t n_read = fread(buf.data(), 1, file_size, f); diff --git a/tools/mtmd/mtmd-image.cpp b/tools/mtmd/mtmd-image.cpp index b446437192..a2166622b7 100644 --- a/tools/mtmd/mtmd-image.cpp +++ b/tools/mtmd/mtmd-image.cpp @@ -99,6 +99,8 @@ struct img_tool { } static void crop(const clip_image_u8 & image, clip_image_u8 & dst, int x, int y, int w, int h) { + GGML_ASSERT(x >= 0 && y >= 0 && w > 0 && h > 0); + GGML_ASSERT(x + w <= image.nx && y + h <= image.ny); dst.nx = w; dst.ny = h; dst.buf.resize(3 * w * h); @@ -196,6 +198,7 @@ struct img_tool { private: // Bilinear resize function static void resize_bilinear(const clip_image_u8 & src, clip_image_u8 & dst, int target_width, int target_height) { + GGML_ASSERT(src.nx >= 2 && src.ny >= 2); dst.nx = target_width; dst.ny = target_height; dst.buf.resize(3 * target_width * target_height); @@ -207,8 +210,8 @@ private: for (int x = 0; x < target_width; x++) { float px = x_ratio * x; float py = y_ratio * y; - int x_floor = static_cast(px); - int y_floor = static_cast(py); + int x_floor = std::min(static_cast(px), src.nx - 2); + int y_floor = std::min(static_cast(py), src.ny - 2); float x_lerp = px - x_floor; float y_lerp = py - y_floor; @@ -347,6 +350,7 @@ private: // Returns: kernel size (ksize) - number of input pixels that contribute to each output pixel auto precompute_weights = [&](int inSize, int outSize, std::vector & bounds, std::vector & weights) -> int { + GGML_ASSERT(inSize > 0 && outSize > 0); double support, scale, filterscale; double center, ww, ss; int xx, x, ksize, xmin, xmax, xcnt; diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index d078120f76..9c400ce104 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -641,6 +641,11 @@ struct mtmd_tokenizer { add_text(ctx->img_beg, true); // add image begin token } + // sanity check + GGML_ASSERT(bitmap->nx > 0 && bitmap->ny > 0); + GGML_ASSERT(bitmap->data.size() == (size_t)bitmap->nx * bitmap->ny * 3); + GGML_ASSERT(ctx->image_preproc != nullptr); + // convert mtmd_bitmap to clip_image_u8 clip_image_u8_ptr img_u8(clip_image_u8_init()); img_u8->nx = bitmap->nx; @@ -649,7 +654,6 @@ struct mtmd_tokenizer { std::memcpy(img_u8->buf.data(), bitmap->data.data(), img_u8->nx * img_u8->ny * 3); // preprocess image - GGML_ASSERT(ctx->image_preproc != nullptr); clip_image_f32_batch batch_f32; bool ok = ctx->image_preproc->preprocess(*img_u8, batch_f32); if (!ok) { @@ -773,6 +777,11 @@ struct mtmd_tokenizer { add_text(ctx->aud_beg, true); // add audio begin token } + // sanity check + GGML_ASSERT(ctx->audio_preproc != nullptr); + GGML_ASSERT(bitmap->data.size() > sizeof(float)); + GGML_ASSERT(bitmap->data.size() % sizeof(float) == 0); + // preprocess audio std::vector mel_spec_chunks; const float * samples = (const float *)bitmap->data.data();