From d0d1062e7f5ea21f722a4b85cbe99fa900e87e74 Mon Sep 17 00:00:00 2001
From: Aes Sedai <7980540+AesSedai@users.noreply.github.com>
Date: Sun, 8 Feb 2026 02:15:12 -0800
Subject: [PATCH] Kimi-K2.5: fix min / max pixel

---
 convert_hf_to_gguf.py |  7 +++---
 tools/mtmd/clip.cpp   | 53 ++++++++++++++++++++++++++++++++++++-------
 2 files changed, 48 insertions(+), 12 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 83c2cd6923..c58dd91d9d 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -11107,11 +11107,10 @@ class KimiK25Model(MmprojModel):
         self.gguf_writer.add_vision_projector_scale_factor(self.merge_kernel_size[0])
 
         # Image size limits
-        # These are used to set token limits: tokens = pixels / (patch_size ^ 2)
-        in_patch_limit = self.preprocessor_config.get("in_patch_limit_each_frame",
-                                                      self.preprocessor_config.get("in_patch_limit", 4096))
+        # Note: in_patch_limit is for images, in_patch_limit_each_frame is for video (not supported yet)
+        in_patch_limit = self.preprocessor_config.get("in_patch_limit", 16384)
         min_patches = 8  # reasonable minimum
-        pixels_per_patch = self.patch_size * self.patch_size
+        pixels_per_patch = self.patch_size ** 2
         self.gguf_writer.add_vision_min_pixels(min_patches * pixels_per_patch)
         self.gguf_writer.add_vision_max_pixels(in_patch_limit * pixels_per_patch)
 
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 168341edf0..dae17c6fb0 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -1153,20 +1153,16 @@ struct clip_model_loader {
                     hparams.rope_theta = 10000.0f;
                     get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
 
-                    // Read min/max pixels from GGUF and convert to token limits
                     int min_pixels = 0, max_pixels = 0;
                     get_u32(KEY_IMAGE_MIN_PIXELS, min_pixels, false);
                     get_u32(KEY_IMAGE_MAX_PIXELS, max_pixels, false);
                     if (min_pixels > 0 && max_pixels > 0) {
-                        const int pixels_per_patch = hparams.patch_size * hparams.patch_size;
-                        const int min_tokens = min_pixels / pixels_per_patch;
-                        const int max_tokens = max_pixels / pixels_per_patch;
-                        hparams.set_limit_image_tokens(min_tokens, max_tokens);
+                        hparams.image_min_pixels = min_pixels;
+                        hparams.image_max_pixels = max_pixels;
+                        hparams.warmup_image_size = static_cast<int>(std::sqrt(max_pixels));
                     } else {
-                        // Fallback to hardcoded defaults
-                        hparams.set_limit_image_tokens(8, 4096);
+                        hparams.set_limit_image_tokens(2, 4096);
                     }
-                    hparams.set_warmup_n_tokens(256);
                 } break;
             case PROJECTOR_TYPE_GEMMA3:
                 {
@@ -3773,6 +3769,47 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
     }
 
+    // Debug: dump final embeddings if MTMD_DEBUG_EMBEDDINGS is set
+    if (std::getenv("MTMD_DEBUG_EMBEDDINGS") != nullptr) {
+        const int64_t n_embd = embeddings->ne[0];
+        const int64_t n_tokens = embeddings->ne[1];
+        std::vector<float> emb_data(n_embd * n_tokens);
+        ggml_backend_tensor_get(embeddings, emb_data.data(), 0, ggml_nbytes(embeddings));
+
+        LOG_INF("\n=== MTMD_DEBUG_EMBEDDINGS ===\n");
+        LOG_INF("Shape: [%lld, %lld]\n", (long long)n_embd, (long long)n_tokens);
+
+        // Print first few values of first token
+        LOG_INF("Token 0 (first 16 values): ");
+        for (int i = 0; i < std::min((int64_t)16, n_embd); i++) {
+            LOG_INF("%.6f ", emb_data[i]);
+        }
+        LOG_INF("\n");
+
+        // Print last few values of first token
+        if (n_embd > 16) {
+            LOG_INF("Token 0 (last 16 values): ");
+            for (int64_t i = n_embd - 16; i < n_embd; i++) {
+                LOG_INF("%.6f ", emb_data[i]);
+            }
+            LOG_INF("\n");
+        }
+
+        // Compute and print statistics
+        float sum = 0.0f, sum_sq = 0.0f, min_val = emb_data[0], max_val = emb_data[0];
+        for (size_t i = 0; i < emb_data.size(); i++) {
+            sum += emb_data[i];
+            sum_sq += emb_data[i] * emb_data[i];
+            min_val = std::min(min_val, emb_data[i]);
+            max_val = std::max(max_val, emb_data[i]);
+        }
+        float mean = sum / emb_data.size();
+        float variance = (sum_sq / emb_data.size()) - (mean * mean);
+        LOG_INF("Stats: mean=%.6f, std=%.6f, min=%.6f, max=%.6f, sum=%.6f\n",
+                mean, sqrtf(variance), min_val, max_val, sum);
+        LOG_INF("=== END MTMD_DEBUG_EMBEDDINGS ===\n\n");
+    }
+
     return true;
 }
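
Note (illustrative, not part of the patch): the converter side now stores pixel limits derived from a patch budget, and clip.cpp consumes those pixel limits directly instead of converting them back into token limits. A minimal sketch of that arithmetic, assuming patch_size = 14 and the patch's fallback in_patch_limit = 16384 (both example values are assumptions, not read from any specific model config):

    # hypothetical example values, not taken from a real preprocessor_config
    patch_size     = 14        # assumed ViT patch edge length
    in_patch_limit = 16384     # converter default when the config key is missing
    min_patches    = 8         # "reasonable minimum" used by the patch

    pixels_per_patch = patch_size ** 2                  # 196
    min_pixels = min_patches * pixels_per_patch         # 1568    -> add_vision_min_pixels
    max_pixels = in_patch_limit * pixels_per_patch      # 3211264 -> add_vision_max_pixels

    # clip.cpp keeps the pixel limits as-is and derives a warmup size from them
    warmup_image_size = int(max_pixels ** 0.5)          # 1792, mirrors static_cast<int>(std::sqrt(max_pixels))

    print(min_pixels, max_pixels, warmup_image_size)    # 1568 3211264 1792

The MTMD_DEBUG_EMBEDDINGS block is gated only on the environment variable, so setting e.g. MTMD_DEBUG_EMBEDDINGS=1 before running an mtmd-based tool should print the embedding shape, head/tail values, and summary statistics (exact tool invocation not shown here).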