From 871f1a2d2f54059d66ccb985aa43ff4dbabab972 Mon Sep 17 00:00:00 2001
From: Xuan-Son Nguyen <son@huggingface.co>
Date: Fri, 27 Mar 2026 11:00:52 +0100
Subject: [PATCH] mtmd: add more sanity checks (#21047)

---
 tools/mtmd/clip.cpp        | 10 ++++++++++
 tools/mtmd/mtmd-audio.cpp  | 18 ++++++++++--------
 tools/mtmd/mtmd-audio.h    |  4 ++--
 tools/mtmd/mtmd-helper.cpp | 10 ++++++++++
 tools/mtmd/mtmd-image.cpp  |  8 ++++++--
 tools/mtmd/mtmd.cpp        | 11 ++++++++++-
 6 files changed, 48 insertions(+), 13 deletions(-)
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index fd1cb0dfea..2947fcf9a3 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -1377,6 +1377,16 @@ struct clip_model_loader {
 
             // sanity check
             {
+                if (hparams.image_size < 0) {
+                    // note: some models have hparams.image_size == 0, which means the image size is dynamic
+                    throw std::runtime_error(string_format("%s: image_size (%d) cannot be negative\n", __func__, hparams.image_size));
+                }
+                if (hparams.patch_size <= 0) {
+                    throw std::runtime_error(string_format("%s: patch_size (%d) must be greater than 0\n", __func__, hparams.patch_size));
+                }
+                if (hparams.n_embd <= 0) {
+                    throw std::runtime_error(string_format("%s: n_embd (%d) must be greater than 0\n", __func__, hparams.n_embd));
+                }
                 if (hparams.image_max_pixels < hparams.image_min_pixels) {
                     throw std::runtime_error(string_format("%s: image_max_pixels (%d) is less than image_min_pixels (%d)\n", __func__, hparams.image_max_pixels, hparams.image_min_pixels));
                 }
diff --git a/tools/mtmd/mtmd-audio.cpp b/tools/mtmd/mtmd-audio.cpp
index 447f61aaa4..e68387c273 100644
--- a/tools/mtmd/mtmd-audio.cpp
+++ b/tools/mtmd/mtmd-audio.cpp
@@ -13,23 +13,20 @@
 
 constexpr bool DEBUG = false;
 
-void mtmd_audio_cache::fill_sin_cos_table(int n) {
+void mtmd_audio_cache::fill_sin_cos_table(uint32_t n) {
     sin_vals.resize(n);
     cos_vals.resize(n);
-    for (int i = 0; i < n; i++) {
+    for (uint32_t i = 0; i < n; i++) {
         double theta = (2 * M_PI * i) / n;
         sin_vals[i]  = sinf(theta);
         cos_vals[i]  = cosf(theta);
     }
 }
 
-void mtmd_audio_cache::fill_hann_window(int length, bool periodic) {
+void mtmd_audio_cache::fill_hann_window(uint32_t length, bool periodic) {
     hann_window.resize(length);
-    int offset = -1;
-    if (periodic) {
-        offset = 0;
-    }
-    for (int i = 0; i < length; i++) {
+    int offset = periodic ? 0 : -1;
+    for (uint32_t i = 0; i < length; i++) {
         hann_window[i] = 0.5 * (1.0 - cosf((2.0 * M_PI * i) / (length + offset)));
     }
 }
@@ -165,6 +162,7 @@ static void dft_impl(const mtmd_audio_cache & cache, const float * in, int N, fl
 //              false = input is complex-valued (interleaved real/imag, stride 2)
 template <bool Inverse, bool RealInput>
 static void fft_impl(const mtmd_audio_cache & cache, float * in, int N, float * out) {
+    GGML_ASSERT(N > 0);
     const int n_sin_cos_vals = cache.sin_vals.size();
 
     if (N == 1) {
@@ -407,6 +405,8 @@ static bool log_mel_spectrogram(
     }
 
 
+    GGML_ASSERT(params.n_fft_bins > 0);
+    GGML_ASSERT(params.hop_length > 0);
     out.n_mel = params.n_mel;
     out.n_len = (n_samples - frame_size) / frame_step + 1;
     // TODO: handle these checks better
@@ -438,6 +438,7 @@ static bool log_mel_spectrogram(
 
     const int effective_n_len = n_samples_in / frame_step;
     if (params.norm_per_feature) {
+        GGML_ASSERT(effective_n_len > 1);
         for (int i = 0; i < out.n_mel; i++) {
             double mean = 0;
             for (int j = 0; j < effective_n_len; ++j) {
@@ -639,6 +640,7 @@ mtmd_audio_streaming_istft::mtmd_audio_streaming_istft(int n_fft, int hop_length
     padding_to_remove((n_fft - hop_length) / 2),
     ifft_in(n_fft * 2 * 4, 0.0f),  // extra space for recursive IFFT
     ifft_out(n_fft * 2 * 4, 0.0f) {
+    GGML_ASSERT(n_fft > 0 && hop_length > 0 && hop_length <= n_fft);
     cache.fill_sin_cos_table(n_fft);
     cache.fill_hann_window(n_fft, true);
 }
diff --git a/tools/mtmd/mtmd-audio.h b/tools/mtmd/mtmd-audio.h
index 016c7392e4..53857a2eb5 100644
--- a/tools/mtmd/mtmd-audio.h
+++ b/tools/mtmd/mtmd-audio.h
@@ -33,9 +33,9 @@ struct mtmd_audio_cache {
 
     mtmd_audio_mel_filters filters;
 
-    void fill_sin_cos_table(int n);
+    void fill_sin_cos_table(uint32_t n);
 
-    void fill_hann_window(int length, bool periodic);
+    void fill_hann_window(uint32_t length, bool periodic);
 
     // Build mel filterbank matrix [n_mel × n_fft_bins] at runtime.
     // n_fft_bins must be (N_fft / 2 + 1). Example: if N_fft=512 -> n_fft_bins=257.
diff --git a/tools/mtmd/mtmd-helper.cpp b/tools/mtmd/mtmd-helper.cpp
index 5bcb7ec1bc..778aacb61d 100644
--- a/tools/mtmd/mtmd-helper.cpp
+++ b/tools/mtmd/mtmd-helper.cpp
@@ -127,6 +127,7 @@ struct decode_embd_batch {
     std::vector<int8_t>         logits;
     llama_batch batch;
     decode_embd_batch(float * embd, int32_t n_tokens, int n_pos_per_embd, int n_mmproj_embd) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) {
+        GGML_ASSERT(n_tokens > 0 && n_pos_per_embd > 0 && n_mmproj_embd > 0);
         pos     .resize(n_tokens * n_pos_per_embd);
         n_seq_id.resize(n_tokens);
         seq_ids .resize(n_tokens + 1);
@@ -157,6 +158,7 @@ struct decode_embd_batch {
     // M-RoPE for image
     void set_position_mrope_2d(llama_pos pos_0, int nx, int ny, llama_seq_id seq_id) {
         GGML_ASSERT(n_pos_per_embd == 4);
+        GGML_ASSERT(nx > 0 && ny > 0 && nx * ny == batch.n_tokens);
         seq_id_0[0] = seq_id;
         for (int y = 0; y < ny; y++) {
             for (int x = 0; x < nx; x++) {
@@ -192,6 +194,7 @@ struct decode_embd_batch {
     }
 
     llama_batch get_view(int offset, int n_tokens) {
+        GGML_ASSERT(offset >= 0 && n_tokens > 0 && offset + n_tokens <= batch.n_tokens);
         llama_pos * pos_ptr;
         pos_view.clear();
         pos_view.reserve(n_tokens * n_pos_per_embd);
@@ -235,6 +238,7 @@ int32_t mtmd_helper_decode_image_chunk(
         llama_seq_id seq_id,
         int32_t n_batch,
         llama_pos * new_n_past) {
+    GGML_ASSERT(n_batch > 0);
     auto chunk_type = mtmd_input_chunk_get_type(chunk);
     const char * name = chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE ? "image" : "audio";
     if (chunk_type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
@@ -312,6 +316,7 @@ int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
         int32_t n_batch,
         bool logits_last,
         llama_pos * new_n_past) {
+    GGML_ASSERT(n_batch > 0);
     int32_t ret;
     llama_batch text_batch = llama_batch_init(n_batch, 0, 1);
     auto chunk_type = mtmd_input_chunk_get_type(chunk);
@@ -508,6 +513,11 @@ mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char *
     fseek(f, 0, SEEK_END);
     long file_size = ftell(f);
     fseek(f, 0, SEEK_SET);
+    if (file_size < 0) {
+        LOG_ERR("Failed to get file size of %s\n", fname);
+        fclose(f);
+        return nullptr;
+    }
     buf.resize(file_size);
 
     size_t n_read = fread(buf.data(), 1, file_size, f);
diff --git a/tools/mtmd/mtmd-image.cpp b/tools/mtmd/mtmd-image.cpp
index b446437192..a2166622b7 100644
--- a/tools/mtmd/mtmd-image.cpp
+++ b/tools/mtmd/mtmd-image.cpp
@@ -99,6 +99,8 @@ struct img_tool {
     }
 
     static void crop(const clip_image_u8 & image, clip_image_u8 & dst, int x, int y, int w, int h) {
+        GGML_ASSERT(x >= 0 && y >= 0 && w > 0 && h > 0);
+        GGML_ASSERT(x + w <= image.nx && y + h <= image.ny);
         dst.nx = w;
         dst.ny = h;
         dst.buf.resize(3 * w * h);
@@ -196,6 +198,7 @@ struct img_tool {
 private:
     // Bilinear resize function
     static void resize_bilinear(const clip_image_u8 & src, clip_image_u8 & dst, int target_width, int target_height) {
+        GGML_ASSERT(src.nx >= 2 && src.ny >= 2);
         dst.nx = target_width;
         dst.ny = target_height;
         dst.buf.resize(3 * target_width * target_height);
@@ -207,8 +210,8 @@ private:
             for (int x = 0; x < target_width; x++) {
                 float px = x_ratio * x;
                 float py = y_ratio * y;
-                int x_floor = static_cast<int>(px);
-                int y_floor = static_cast<int>(py);
+                int x_floor = std::min(static_cast<int>(px), src.nx - 2);
+                int y_floor = std::min(static_cast<int>(py), src.ny - 2);
                 float x_lerp = px - x_floor;
                 float y_lerp = py - y_floor;
 
@@ -347,6 +350,7 @@ private:
         // Returns: kernel size (ksize) - number of input pixels that contribute to each output pixel
         auto precompute_weights = [&](int inSize, int outSize,
                                      std::vector<int> & bounds, std::vector<int32_t> & weights) -> int {
+            GGML_ASSERT(inSize > 0 && outSize > 0);
             double support, scale, filterscale;
             double center, ww, ss;
             int xx, x, ksize, xmin, xmax, xcnt;
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index d078120f76..9c400ce104 100644
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -641,6 +641,11 @@ struct mtmd_tokenizer {
                 add_text(ctx->img_beg, true); // add image begin token
             }
 
+            // sanity check
+            GGML_ASSERT(bitmap->nx > 0 && bitmap->ny > 0);
+            GGML_ASSERT(bitmap->data.size() == (size_t)bitmap->nx * bitmap->ny * 3);
+            GGML_ASSERT(ctx->image_preproc != nullptr);
+
             // convert mtmd_bitmap to clip_image_u8
             clip_image_u8_ptr img_u8(clip_image_u8_init());
             img_u8->nx = bitmap->nx;
@@ -649,7 +654,6 @@ struct mtmd_tokenizer {
             std::memcpy(img_u8->buf.data(), bitmap->data.data(), img_u8->nx * img_u8->ny * 3);
 
             // preprocess image
-            GGML_ASSERT(ctx->image_preproc != nullptr);
             clip_image_f32_batch batch_f32;
             bool ok = ctx->image_preproc->preprocess(*img_u8, batch_f32);
             if (!ok) {
@@ -773,6 +777,11 @@ struct mtmd_tokenizer {
                 add_text(ctx->aud_beg, true); // add audio begin token
             }
 
+            // sanity check
+            GGML_ASSERT(ctx->audio_preproc != nullptr);
+            GGML_ASSERT(bitmap->data.size() > sizeof(float));
+            GGML_ASSERT(bitmap->data.size() % sizeof(float) == 0);
+
             // preprocess audio
             std::vector<mtmd_audio_mel> mel_spec_chunks;
             const float * samples = (const float *)bitmap->data.data();