From 7b8d735c901666d91f211f380ca2edc625fd72c1 Mon Sep 17 00:00:00 2001 From: bluebread Date: Fri, 21 Nov 2025 18:04:01 +0000 Subject: [PATCH] mtmd: fixed the wrong scaler for get_rel_pos --- tools/mtmd/clip.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index a4bf717d0b..f291894b6e 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -2529,11 +2529,14 @@ private: ); // [q_size, k_size] k_coord = ggml_cont(ctx, ggml_repeat(ctx, k_coord, rel)); // [q_size, k_size] + float q_scale = std::max((float)k_size/q_size, 1.0f); + float k_scale = std::max((float)q_size/k_size, 1.0f); + // This wouldn't be triggered in DeepSeek-OCR. Just for compatibility with // the original implementation. if (q_size != k_size) { - q_coord = ggml_scale_inplace(ctx, q_coord, std::max((float)k_size/q_size, 1.0f)); - k_coord = ggml_scale_inplace(ctx, k_coord, std::max((float)q_size/k_size, 1.0f)); + q_coord = ggml_scale_inplace(ctx, q_coord, q_scale); + k_coord = ggml_scale_inplace(ctx, k_coord, k_scale); } // ------------------------------------------------- @@ -2541,7 +2544,7 @@ private: // ------------------------------------------------- rel = ggml_sub(ctx, q_coord, k_coord); // [q_size, k_size] - rel = ggml_scale_bias(ctx, rel, 1.0f, static_cast<float>(k_size) - 1.0f); // [q_size, k_size] + rel = ggml_scale_bias(ctx, rel, 1.0f, (k_size - 1.0f)*k_scale); // [q_size, k_size] // Clamp to [0, L-1] range for valid indexing rel = ggml_clamp(ctx, rel, 0.0f, static_cast<float>(rel_pos->ne[1] - 1));