From 0c50dd9fe4e43e598edda17f37956df3ba2a377a Mon Sep 17 00:00:00 2001 From: Aes Sedai <7980540+AesSedai@users.noreply.github.com> Date: Sun, 8 Feb 2026 01:19:18 -0800 Subject: [PATCH] Kimi-K2.5: support non-interleaved rope for vision --- convert_hf_to_gguf.py | 51 ++++++++++----------- gguf-py/gguf/tensor_mapping.py | 1 + tools/mtmd/clip-graph.h | 11 ----- tools/mtmd/clip.cpp | 82 ---------------------------------- tools/mtmd/models/kimik25.cpp | 30 +++---------- 5 files changed, 29 insertions(+), 146 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 070b22fcd9..83c2cd6923 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -11106,8 +11106,8 @@ class KimiK25Model(MmprojModel): self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams_vision.get("projector_ln_eps", 1e-5)) self.gguf_writer.add_vision_projector_scale_factor(self.merge_kernel_size[0]) - # Image size limits (from preprocessor_config.json media_proc_cfg) - # These are used to set token limits: tokens = pixels / (patch_size²) + # Image size limits + # These are used to set token limits: tokens = pixels / (patch_size ^ 2) in_patch_limit = self.preprocessor_config.get("in_patch_limit_each_frame", self.preprocessor_config.get("in_patch_limit", 4096)) min_patches = 8 # reasonable minimum @@ -11116,31 +11116,19 @@ class KimiK25Model(MmprojModel): self.gguf_writer.add_vision_max_pixels(in_patch_limit * pixels_per_patch) @staticmethod - def _permute_rope_interleaved_to_split(weights: Tensor, n_head: int) -> Tensor: - """Permute Q/K weights from interleaved to split RoPE format. - - Kimi-K2.5 uses interleaved 2D RoPE pattern (per head): - [x0_re, x0_im, y0_re, y0_im, x1_re, x1_im, y1_re, y1_im, ...] - i.e., groups of 4: (x_pair, y_pair) repeated - - llama.cpp build_rope_2d expects split format (per head): - [x0_re, x0_im, x1_re, x1_im, ..., y0_re, y0_im, y1_re, y1_im, ...] - i.e., first half is all X pairs, second half is all Y pairs - - This permutation is applied at conversion time so we can use build_rope_2d at runtime. - """ + def _permute_kqv(weights: Tensor, n_head: int) -> Tensor: out_dim, in_dim = weights.shape head_dim = out_dim // n_head - # Reshape to expose the interleaved structure: - # [n_head, head_dim//4, 2, 2, in_dim] - # where: head_dim//4 = number of (x,y) frequency pairs - # first 2 = x_or_y (0=x, 1=y) - # second 2 = re_or_im (real, imaginary parts of complex rotation) w = weights.reshape(n_head, head_dim // 4, 2, 2, in_dim) - # Permute to split format: [n_head, 2, head_dim//4, 2, in_dim] - # Now dim 1 separates X (index 0) from Y (index 1) w = w.permute(0, 2, 1, 3, 4) - # Reshape back: [out_dim, in_dim] + return w.reshape(out_dim, in_dim) + + @staticmethod + def _permute_output_proj(weights: Tensor, n_head: int) -> Tensor: + out_dim, in_dim = weights.shape + head_dim = in_dim // n_head + w = weights.reshape(out_dim, n_head, head_dim // 4, 2, 2) + w = w.permute(0, 1, 3, 2, 4) return w.reshape(out_dim, in_dim) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: @@ -11153,8 +11141,10 @@ class KimiK25Model(MmprojModel): assert self.hparams_vision is not None n_head = self.hparams_vision.get("num_attention_heads", 16) - # Permute Q/K weights/biases from interleaved to split RoPE format - # This allows using the build_rope_2d at runtime + # Permute Q/K/V weights/biases from interleaved to split RoPE format + # This allows using build_rope_2d at runtime without post-permutation. 
+ # V is also permuted so the attention output is in split format, + # which is then handled by the permuted output projection. if "wqkv" in name: out_dim = data_torch.shape[0] qkv_dim = out_dim // 3 @@ -11162,16 +11152,21 @@ class KimiK25Model(MmprojModel): if "weight" in name: wq, wk, wv = data_torch[:qkv_dim, :], data_torch[qkv_dim:2*qkv_dim, :], data_torch[2*qkv_dim:, :] - wq = self._permute_rope_interleaved_to_split(wq, n_head) - wk = self._permute_rope_interleaved_to_split(wk, n_head) + wq = self._permute_kqv(wq, n_head) + wk = self._permute_kqv(wk, n_head) + wv = self._permute_kqv(wv, n_head) data_torch = torch.cat([wq, wk, wv], dim=0) elif "bias" in name: bq, bk, bv = data_torch[:qkv_dim], data_torch[qkv_dim:2*qkv_dim], data_torch[2*qkv_dim:] - # Same permutation as weights: [n_head, head_dim//4, 2, 2] -> [n_head, 2, head_dim//4, 2] bq = bq.reshape(n_head, head_dim // 4, 2, 2).permute(0, 2, 1, 3).reshape(-1) bk = bk.reshape(n_head, head_dim // 4, 2, 2).permute(0, 2, 1, 3).reshape(-1) + bv = bv.reshape(n_head, head_dim // 4, 2, 2).permute(0, 2, 1, 3).reshape(-1) data_torch = torch.cat([bq, bk, bv], dim=0) + # Permute output projection from interleaved to split RoPE format + if "wo.weight" in name: + data_torch = self._permute_output_proj(data_torch, n_head) + # Temporal embeddings: (T, 1, C) → (T, C) if "pos_emb.time_weight" in name: T, _, C = data_torch.shape diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index ba4f644dc2..548b035964 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -1358,6 +1358,7 @@ class TensorNameMap: MODEL_TENSOR.V_ENC_ATTN_QKV: ( "visual.blocks.{bid}.attn.qkv", # qwen3vl "model.vision.transformer.layers.{bid}.attention.query_key_value", # cogvlm + "vision_tower.encoder.blocks.{bid}.wqkv" # Kimi-K2.5 ), MODEL_TENSOR.V_ENC_ATTN_Q: ( diff --git a/tools/mtmd/clip-graph.h b/tools/mtmd/clip-graph.h index 8c9d56c8cb..4c7f7504cf 100644 --- a/tools/mtmd/clip-graph.h +++ b/tools/mtmd/clip-graph.h @@ -107,17 +107,6 @@ struct clip_graph { const bool interleave_freq ); - // 2D RoPE with interleaved frequency - // Pattern: [x_freq0, y_freq0, x_freq1, y_freq1, ...] - // build_rope_2d uses split pattern: [x_freq0, x_freq1, ..., y_freq0, y_freq1, ...] - ggml_tensor * build_rope_2d_interleaved( - ggml_context * ctx0, - ggml_tensor * cur, // [n_dim, n_head, n_pos] - ggml_tensor * pos_w, // [n_pos] - X/width positions - ggml_tensor * pos_h, // [n_pos] - Y/height positions - const float freq_base - ); - // aka pixel_shuffle / pixel_unshuffle / patch_merger (Kimi-VL) // support dynamic resolution ggml_tensor * build_patch_merge_permute(ggml_tensor * cur, int scale_factor); diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index eb174e4b17..168341edf0 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -715,88 +715,6 @@ ggml_tensor * clip_graph::build_rope_2d( return cur; } -// 2D RoPE with interleaved frequency -// Pattern: [x_freq0, y_freq0, x_freq1, y_freq1, ...] -// build_rope_2d uses split pattern: [x_freq0, x_freq1, ..., y_freq0, y_freq1, ...] 
-ggml_tensor * clip_graph::build_rope_2d_interleaved( - ggml_context * ctx0, - ggml_tensor * cur, // [n_dim, n_head, n_pos] - ggml_tensor * pos_w, // [n_pos] - X/width positions - ggml_tensor * pos_h, // [n_pos] - Y/height positions - const float freq_base -) { - const int64_t n_dim = cur->ne[0]; - const int64_t n_head = cur->ne[1]; - const int64_t n_pos = cur->ne[2]; - - GGML_ASSERT(n_dim % 4 == 0); // Must be divisible by 4 for interleaved x,y pairs - - // Ensure input is contiguous (needed when using merged QKV with ggml_view) - if (!ggml_is_contiguous(cur)) { - cur = ggml_cont(ctx0, cur); - } - - // Step 1: Reshape to expose interleaved structure - // cur: [n_dim, n_head, n_pos] -> [4, n_dim/4, n_head, n_pos] - ggml_tensor * reshaped = ggml_reshape_4d(ctx0, cur, 4, n_dim/4, n_head, n_pos); - - // Step 2: Extract X pairs (elements 0,1 of each group of 4) - // x_pairs: [2, n_dim/4, n_head, n_pos] - ggml_tensor * x_pairs = ggml_view_4d(ctx0, reshaped, - 2, n_dim/4, n_head, n_pos, - reshaped->nb[1], reshaped->nb[2], reshaped->nb[3], - 0); - - // Step 3: Extract Y pairs (elements 2,3 of each group of 4) - // y_pairs: [2, n_dim/4, n_head, n_pos] - ggml_tensor * y_pairs = ggml_view_4d(ctx0, reshaped, - 2, n_dim/4, n_head, n_pos, - reshaped->nb[1], reshaped->nb[2], reshaped->nb[3], - 2 * ggml_element_size(reshaped)); - - // Step 4: Make contiguous and reshape for rope_ext - // [2, n_dim/4, n_head, n_pos] -> [n_dim/2, n_head, n_pos] - x_pairs = ggml_cont(ctx0, x_pairs); - x_pairs = ggml_reshape_3d(ctx0, x_pairs, n_dim/2, n_head, n_pos); - - y_pairs = ggml_cont(ctx0, y_pairs); - y_pairs = ggml_reshape_3d(ctx0, y_pairs, n_dim/2, n_head, n_pos); - - // Step 5: Apply RoPE to X pairs using pos_w, Y pairs using pos_h - x_pairs = ggml_rope_ext( - ctx0, - x_pairs, - pos_w, - nullptr, - n_dim/2, - 0, 0, freq_base, - 1.0f, 0.0f, 1.0f, 0.0f, 0.0f - ); - - y_pairs = ggml_rope_ext( - ctx0, - y_pairs, - pos_h, - nullptr, - n_dim/2, - 0, 0, freq_base, - 1.0f, 0.0f, 1.0f, 0.0f, 0.0f - ); - - // Step 6: Reshape back to [2, n_dim/4, n_head, n_pos] for interleaving - x_pairs = ggml_reshape_4d(ctx0, x_pairs, 2, n_dim/4, n_head, n_pos); - y_pairs = ggml_reshape_4d(ctx0, y_pairs, 2, n_dim/4, n_head, n_pos); - - // Step 7: Interleave X and Y pairs back together - // Concatenate along dimension 0: [4, n_dim/4, n_head, n_pos] - ggml_tensor * result = ggml_concat(ctx0, x_pairs, y_pairs, 0); - - // Step 8: Reshape back to original: [n_dim, n_head, n_pos] - result = ggml_reshape_3d(ctx0, result, n_dim, n_head, n_pos); - - return result; -} - // Generic function to stack frames for audio processing // Abstracts out the StackAudioFrames logic used by ultravox ggml_tensor * clip_graph::build_stack(ggml_tensor * cur, int32_t stack_factor, int32_t n_embed) { diff --git a/tools/mtmd/models/kimik25.cpp b/tools/mtmd/models/kimik25.cpp index ceb7b848f9..5f5cd9b7ed 100644 --- a/tools/mtmd/models/kimik25.cpp +++ b/tools/mtmd/models/kimik25.cpp @@ -42,33 +42,13 @@ ggml_cgraph * clip_graph_kimik25::build() { ggml_tensor * learned_pos_embd = resize_position_embeddings_3d(GGML_SCALE_MODE_BICUBIC); - // Kimi-K2.5 uses interleaved 2D RoPE pattern: [x0_re, x0_im, y0_re, y0_im, x1_re, x1_im, ...] - // Q/K weights are permuted during conversion from interleaved to split format. - // build_rope_2d expects split format and outputs split format. - // We need to convert the output back to interleaved format for the attention mechanism. 
+    // Kimi-K2.5 natively uses an interleaved 2D RoPE pattern, but all attention weights
+    // (Q, K, V, O) are permuted during conversion to use split format throughout.
+    // This allows using build_rope_2d without any runtime format conversion.
+    // Q and K receive the same permutation (attention scores are unchanged), and the
+    // V permutation is undone by the permuted output projection, so results are identical.
     auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
-        const int64_t n_dim = cur->ne[0];
-        const int64_t n_head = cur->ne[1];
-        const int64_t n_pos = cur->ne[2];
-
-        // Apply RoPE in split format
         cur = build_rope_2d(ctx0, cur, pos_w, pos_h, hparams.rope_theta, false);
-
-        // Convert output from split format back to interleaved format
-        // Split: [x0_re, x0_im, x1_re, x1_im, ..., y0_re, y0_im, y1_re, y1_im, ...]
-        // Interleaved: [x0_re, x0_im, y0_re, y0_im, x1_re, x1_im, y1_re, y1_im, ...]
-        //
-        // Reshape to [2, n_dim/4, 2, n_head, n_pos] where:
-        // - first dim 2 = re/im pair
-        // - n_dim/4 = number of frequency pairs per axis
-        // - second dim 2 = X half (0) vs Y half (1)
-        // Then permute to interleave X and Y
-        // Finally reshape back to [n_dim, n_head, n_pos]
-        cur = ggml_reshape_4d(ctx0, cur, 2, n_dim/4, 2, n_head * n_pos);
-        cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); // [2, 2, n_dim/4, n_head*n_pos]
-        cur = ggml_cont(ctx0, cur);
-        cur = ggml_reshape_3d(ctx0, cur, n_dim, n_head, n_pos);
-
         return cur;
     };
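
Note (not part of the patch): the equivalence argument in the kimik25.cpp comment can be
sanity-checked outside llama.cpp. The minimal PyTorch sketch below re-implements the same
reshape/permute as _permute_kqv and _permute_output_proj on random tensors and verifies that
a toy attention block (RoPE omitted) produces the same output before and after the reordering.
The helper names, shapes, and the toy attention function are illustrative assumptions, not
code from this patch.

import torch

torch.set_default_dtype(torch.float64)  # keep summation-reordering error far below tolerance
torch.manual_seed(0)

def permute_rows_interleaved_to_split(w: torch.Tensor, n_head: int) -> torch.Tensor:
    # Rows per head are ordered (pair, x/y, re/im); reorder to (x/y, pair, re/im).
    # Same operation as _permute_kqv in convert_hf_to_gguf.py, re-implemented for illustration.
    out_dim, in_dim = w.shape
    head_dim = out_dim // n_head
    w = w.reshape(n_head, head_dim // 4, 2, 2, in_dim).permute(0, 2, 1, 3, 4)
    return w.reshape(out_dim, in_dim)

def permute_cols_interleaved_to_split(w: torch.Tensor, n_head: int) -> torch.Tensor:
    # The same reordering applied to the columns (input dim) of the output projection,
    # mirroring _permute_output_proj.
    out_dim, in_dim = w.shape
    head_dim = in_dim // n_head
    w = w.reshape(out_dim, n_head, head_dim // 4, 2, 2).permute(0, 1, 3, 2, 4)
    return w.reshape(out_dim, in_dim)

n_head, head_dim, n_pos, d_model = 4, 16, 7, 64
wq, wk, wv = (torch.randn(n_head * head_dim, d_model) for _ in range(3))
wo = torch.randn(d_model, n_head * head_dim)
x = torch.randn(n_pos, d_model)

def attention(wq, wk, wv, wo):
    # Toy multi-head attention: project, score per head, mix V, then output-project.
    q = (x @ wq.T).reshape(n_pos, n_head, head_dim).transpose(0, 1)
    k = (x @ wk.T).reshape(n_pos, n_head, head_dim).transpose(0, 1)
    v = (x @ wv.T).reshape(n_pos, n_head, head_dim).transpose(0, 1)
    scores = torch.softmax(q @ k.transpose(-1, -2) / head_dim ** 0.5, dim=-1)
    out = (scores @ v).transpose(0, 1).reshape(n_pos, n_head * head_dim)
    return out @ wo.T

ref = attention(wq, wk, wv, wo)
split = attention(
    permute_rows_interleaved_to_split(wq, n_head),
    permute_rows_interleaved_to_split(wk, n_head),
    permute_rows_interleaved_to_split(wv, n_head),
    permute_cols_interleaved_to_split(wo, n_head),
)
print(torch.allclose(ref, split))  # True: scores and final output are unchanged

What this does not check: that build_rope_2d applied to the split layout reproduces the
original interleaved rotation; that part depends on the runtime graph in
tools/mtmd/models/kimik25.cpp and the converted GGUF weights.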