Kimi-K2.5: pre-convert vision QK to use build_rope_2d

2026-02-07 23:18:45 -08:00 · 2026-02-07 23:18:45 -08:00 · 052fda6c5d
parent be1b0c3554
commit 052fda6c5d
3 changed files with 105 additions and 3 deletions
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@ -11096,7 +11096,7 @@ class KimiK25Model(MmprojModel):

        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.KIMIK25)

-        # Position embedding parameters (for interpolation) - KimiK25-specific
+        # Position embedding parameters (for interpolation)
        self.gguf_writer.add_uint32("vision.pos_emb_height", self.hparams_vision.get("init_pos_emb_height", 64))
        self.gguf_writer.add_uint32("vision.pos_emb_width", self.hparams_vision.get("init_pos_emb_width", 64))
        self.gguf_writer.add_uint32("vision.pos_emb_time", self.hparams_vision.get("init_pos_emb_time", 4))
@ -11106,6 +11106,43 @@ class KimiK25Model(MmprojModel):
        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams_vision.get("projector_ln_eps", 1e-5))
        self.gguf_writer.add_vision_projector_scale_factor(self.merge_kernel_size[0])

+        # Image size limits (from preprocessor_config.json media_proc_cfg)
+        # These are used to set token limits: tokens = pixels / (patch_size²)
+        in_patch_limit = self.preprocessor_config.get("in_patch_limit_each_frame",
+                         self.preprocessor_config.get("in_patch_limit", 4096))
+        min_patches = 8  # reasonable minimum
+        pixels_per_patch = self.patch_size * self.patch_size
+        self.gguf_writer.add_vision_min_pixels(min_patches * pixels_per_patch)
+        self.gguf_writer.add_vision_max_pixels(in_patch_limit * pixels_per_patch)
+
+    @staticmethod
+    def _permute_rope_interleaved_to_split(weights: Tensor, n_head: int) -> Tensor:
+        """Permute Q/K weights from interleaved to split RoPE format.
+
+        Kimi-K2.5 uses interleaved 2D RoPE pattern (per head):
+            [x0_re, x0_im, y0_re, y0_im, x1_re, x1_im, y1_re, y1_im, ...]
+            i.e., groups of 4: (x_pair, y_pair) repeated
+
+        llama.cpp build_rope_2d expects split format (per head):
+            [x0_re, x0_im, x1_re, x1_im, ..., y0_re, y0_im, y1_re, y1_im, ...]
+            i.e., first half is all X pairs, second half is all Y pairs
+
+        This permutation is applied at conversion time so we can use build_rope_2d at runtime.
+        """
+        out_dim, in_dim = weights.shape
+        head_dim = out_dim // n_head
+        # Reshape to expose the interleaved structure:
+        # [n_head, head_dim//4, 2, 2, in_dim]
+        # where: head_dim//4 = number of (x,y) frequency pairs
+        #        first 2 = x_or_y (0=x, 1=y)
+        #        second 2 = re_or_im (real, imaginary parts of complex rotation)
+        w = weights.reshape(n_head, head_dim // 4, 2, 2, in_dim)
+        # Permute to split format: [n_head, 2, head_dim//4, 2, in_dim]
+        # Now dim 1 separates X (index 0) from Y (index 1)
+        w = w.permute(0, 2, 1, 3, 4)
+        # Reshape back: [out_dim, in_dim]
+        return w.reshape(out_dim, in_dim)
+
    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        # Only process vision and projector tensors
        is_vision = any(x in name for x in ["vision_tower", "mm_projector"])
@ -11113,6 +11150,28 @@ class KimiK25Model(MmprojModel):
        if not is_vision:
            return

+        assert self.hparams_vision is not None
+        n_head = self.hparams_vision.get("num_attention_heads", 16)
+
+        # Permute Q/K weights/biases from interleaved to split RoPE format
+        # This allows using build_rope_2d at runtime
+        if "wqkv" in name:
+            out_dim = data_torch.shape[0]
+            qkv_dim = out_dim // 3
+            head_dim = qkv_dim // n_head
+
+            if "weight" in name:
+                wq, wk, wv = data_torch[:qkv_dim, :], data_torch[qkv_dim:2*qkv_dim, :], data_torch[2*qkv_dim:, :]
+                wq = self._permute_rope_interleaved_to_split(wq, n_head)
+                wk = self._permute_rope_interleaved_to_split(wk, n_head)
+                data_torch = torch.cat([wq, wk, wv], dim=0)
+            elif "bias" in name:
+                bq, bk, bv = data_torch[:qkv_dim], data_torch[qkv_dim:2*qkv_dim], data_torch[2*qkv_dim:]
+                # Same permutation as weights: [n_head, head_dim//4, 2, 2] -> [n_head, 2, head_dim//4, 2]
+                bq = bq.reshape(n_head, head_dim // 4, 2, 2).permute(0, 2, 1, 3).reshape(-1)
+                bk = bk.reshape(n_head, head_dim // 4, 2, 2).permute(0, 2, 1, 3).reshape(-1)
+                data_torch = torch.cat([bq, bk, bv], dim=0)
+
        # Temporal embeddings: (T, 1, C) → (T, C)
        if "pos_emb.time_weight" in name:
            T, _, C = data_torch.shape
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@ -655,6 +655,11 @@ ggml_tensor * clip_graph::build_rope_2d(
    const int64_t n_head = cur->ne[1];
    const int64_t n_pos  = cur->ne[2];

+    // Ensure input is contiguous (needed when using merged QKV with ggml_view)
+    if (!ggml_is_contiguous(cur)) {
+        cur = ggml_cont(ctx0, cur);
+    }
+
    // for example, if we have cur tensor of shape (n_dim=8, n_head, n_pos)
    // we will have a list of 4 inv_freq: 1e-0, 1e-1, 1e-2, 1e-3
    // first half of cur will use 1e-0, 1e-2 (even)
@ -1229,7 +1234,20 @@ struct clip_model_loader {
                    {
                        hparams.rope_theta = 10000.0f;
                        get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
+
+                        // Read min/max pixels from GGUF and convert to token limits
+                        int min_pixels = 0, max_pixels = 0;
+                        get_u32(KEY_IMAGE_MIN_PIXELS, min_pixels, false);
+                        get_u32(KEY_IMAGE_MAX_PIXELS, max_pixels, false);
+                        if (min_pixels > 0 && max_pixels > 0) {
+                            const int pixels_per_patch = hparams.patch_size * hparams.patch_size;
+                            const int min_tokens = min_pixels / pixels_per_patch;
+                            const int max_tokens = max_pixels / pixels_per_patch;
+                            hparams.set_limit_image_tokens(min_tokens, max_tokens);
+                        } else {
+                            // Fallback to hardcoded defaults
                        hparams.set_limit_image_tokens(8, 4096);
+                        }
                        hparams.set_warmup_n_tokens(256);
                    } break;
                case PROJECTOR_TYPE_GEMMA3:
--- a/tools/mtmd/models/kimik25.cpp
+++ b/tools/mtmd/models/kimik25.cpp
@ -42,9 +42,34 @@ ggml_cgraph * clip_graph_kimik25::build() {

    ggml_tensor * learned_pos_embd = resize_position_embeddings_3d(GGML_SCALE_MODE_BICUBIC);

-    // Kimi-K2.5 uses INTERLEAVED frequency pattern: [x_freq0, y_freq0, x_freq1, y_freq1, ...]
+    // Kimi-K2.5 uses interleaved 2D RoPE pattern: [x0_re, x0_im, y0_re, y0_im, x1_re, x1_im, ...]
+    // Q/K weights are permuted during conversion from interleaved to split format.
+    // build_rope_2d expects split format and outputs split format.
+    // We need to convert the output back to interleaved format for the attention mechanism.
    auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
-        return build_rope_2d_interleaved(ctx0, cur, pos_w, pos_h, hparams.rope_theta);
+        const int64_t n_dim  = cur->ne[0];
+        const int64_t n_head = cur->ne[1];
+        const int64_t n_pos  = cur->ne[2];
+
+        // Apply RoPE in split format
+        cur = build_rope_2d(ctx0, cur, pos_w, pos_h, hparams.rope_theta, false);
+
+        // Convert output from split format back to interleaved format
+        // Split:       [x0_re, x0_im, x1_re, x1_im, ..., y0_re, y0_im, y1_re, y1_im, ...]
+        // Interleaved: [x0_re, x0_im, y0_re, y0_im, x1_re, x1_im, y1_re, y1_im, ...]
+        //
+        // Reshape to [2, n_dim/4, 2, n_head*n_pos] (heads and positions flattened) where:
+        //   - first dim 2 = re/im pair
+        //   - n_dim/4 = number of frequency pairs per axis
+        //   - second dim 2 = X half (0) vs Y half (1)
+        // Then permute to interleave X and Y
+        // Finally reshape back to [n_dim, n_head, n_pos]
+        cur = ggml_reshape_4d(ctx0, cur, 2, n_dim/4, 2, n_head * n_pos);
+        cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);  // [2, 2, n_dim/4, n_head*n_pos]
+        cur = ggml_cont(ctx0, cur);
+        cur = ggml_reshape_3d(ctx0, cur, n_dim, n_head, n_pos);
+
+        return cur;
    };

    ggml_tensor * inp = build_inp();