Kimi-K2.5: support non-interleaved rope for vision
This commit is contained in:
parent
052fda6c5d
commit
0c50dd9fe4
|
|
@ -11106,8 +11106,8 @@ class KimiK25Model(MmprojModel):
|
||||||
self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams_vision.get("projector_ln_eps", 1e-5))
|
self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams_vision.get("projector_ln_eps", 1e-5))
|
||||||
self.gguf_writer.add_vision_projector_scale_factor(self.merge_kernel_size[0])
|
self.gguf_writer.add_vision_projector_scale_factor(self.merge_kernel_size[0])
|
||||||
|
|
||||||
# Image size limits (from preprocessor_config.json media_proc_cfg)
|
# Image size limits
|
||||||
# These are used to set token limits: tokens = pixels / (patch_size²)
|
# These are used to set token limits: tokens = pixels / (patch_size ** 2)
|
||||||
in_patch_limit = self.preprocessor_config.get("in_patch_limit_each_frame",
|
in_patch_limit = self.preprocessor_config.get("in_patch_limit_each_frame",
|
||||||
self.preprocessor_config.get("in_patch_limit", 4096))
|
self.preprocessor_config.get("in_patch_limit", 4096))
|
||||||
min_patches = 8 # reasonable minimum
|
min_patches = 8 # reasonable minimum
|
||||||
|
|
@ -11116,31 +11116,19 @@ class KimiK25Model(MmprojModel):
|
||||||
self.gguf_writer.add_vision_max_pixels(in_patch_limit * pixels_per_patch)
|
self.gguf_writer.add_vision_max_pixels(in_patch_limit * pixels_per_patch)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _permute_rope_interleaved_to_split(weights: Tensor, n_head: int) -> Tensor:
|
def _permute_kqv(weights: Tensor, n_head: int) -> Tensor:
|
||||||
"""Permute Q/K weights from interleaved to split RoPE format.
|
|
||||||
|
|
||||||
Kimi-K2.5 uses interleaved 2D RoPE pattern (per head):
|
|
||||||
[x0_re, x0_im, y0_re, y0_im, x1_re, x1_im, y1_re, y1_im, ...]
|
|
||||||
i.e., groups of 4: (x_pair, y_pair) repeated
|
|
||||||
|
|
||||||
llama.cpp build_rope_2d expects split format (per head):
|
|
||||||
[x0_re, x0_im, x1_re, x1_im, ..., y0_re, y0_im, y1_re, y1_im, ...]
|
|
||||||
i.e., first half is all X pairs, second half is all Y pairs
|
|
||||||
|
|
||||||
This permutation is applied at conversion time so we can use build_rope_2d at runtime.
|
|
||||||
"""
|
|
||||||
out_dim, in_dim = weights.shape
|
out_dim, in_dim = weights.shape
|
||||||
head_dim = out_dim // n_head
|
head_dim = out_dim // n_head
|
||||||
# Reshape to expose the interleaved structure:
|
|
||||||
# [n_head, head_dim//4, 2, 2, in_dim]
|
|
||||||
# where: head_dim//4 = number of (x,y) frequency pairs
|
|
||||||
# first 2 = x_or_y (0=x, 1=y)
|
|
||||||
# second 2 = re_or_im (real, imaginary parts of complex rotation)
|
|
||||||
w = weights.reshape(n_head, head_dim // 4, 2, 2, in_dim)
|
w = weights.reshape(n_head, head_dim // 4, 2, 2, in_dim)
|
||||||
# Permute to split format: [n_head, 2, head_dim//4, 2, in_dim]
|
|
||||||
# Now dim 1 separates X (index 0) from Y (index 1)
|
|
||||||
w = w.permute(0, 2, 1, 3, 4)
|
w = w.permute(0, 2, 1, 3, 4)
|
||||||
# Reshape back: [out_dim, in_dim]
|
return w.reshape(out_dim, in_dim)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _permute_output_proj(weights: Tensor, n_head: int) -> Tensor:
|
||||||
|
out_dim, in_dim = weights.shape
|
||||||
|
head_dim = in_dim // n_head
|
||||||
|
w = weights.reshape(out_dim, n_head, head_dim // 4, 2, 2)
|
||||||
|
w = w.permute(0, 1, 3, 2, 4)
|
||||||
return w.reshape(out_dim, in_dim)
|
return w.reshape(out_dim, in_dim)
|
||||||
|
|
||||||
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||||
|
|
@ -11153,8 +11141,10 @@ class KimiK25Model(MmprojModel):
|
||||||
assert self.hparams_vision is not None
|
assert self.hparams_vision is not None
|
||||||
n_head = self.hparams_vision.get("num_attention_heads", 16)
|
n_head = self.hparams_vision.get("num_attention_heads", 16)
|
||||||
|
|
||||||
# Permute Q/K weights/biases from interleaved to split RoPE format
|
# Permute Q/K/V weights/biases from interleaved to split RoPE format
|
||||||
# This allows using the build_rope_2d at runtime
|
# This allows using build_rope_2d at runtime without permuting its output back to the interleaved layout.
|
||||||
|
# V is also permuted so the attention output is in split format,
|
||||||
|
# which is then handled by the permuted output projection.
|
||||||
if "wqkv" in name:
|
if "wqkv" in name:
|
||||||
out_dim = data_torch.shape[0]
|
out_dim = data_torch.shape[0]
|
||||||
qkv_dim = out_dim // 3
|
qkv_dim = out_dim // 3
|
||||||
|
|
@ -11162,16 +11152,21 @@ class KimiK25Model(MmprojModel):
|
||||||
|
|
||||||
if "weight" in name:
|
if "weight" in name:
|
||||||
wq, wk, wv = data_torch[:qkv_dim, :], data_torch[qkv_dim:2*qkv_dim, :], data_torch[2*qkv_dim:, :]
|
wq, wk, wv = data_torch[:qkv_dim, :], data_torch[qkv_dim:2*qkv_dim, :], data_torch[2*qkv_dim:, :]
|
||||||
wq = self._permute_rope_interleaved_to_split(wq, n_head)
|
wq = self._permute_kqv(wq, n_head)
|
||||||
wk = self._permute_rope_interleaved_to_split(wk, n_head)
|
wk = self._permute_kqv(wk, n_head)
|
||||||
|
wv = self._permute_kqv(wv, n_head)
|
||||||
data_torch = torch.cat([wq, wk, wv], dim=0)
|
data_torch = torch.cat([wq, wk, wv], dim=0)
|
||||||
elif "bias" in name:
|
elif "bias" in name:
|
||||||
bq, bk, bv = data_torch[:qkv_dim], data_torch[qkv_dim:2*qkv_dim], data_torch[2*qkv_dim:]
|
bq, bk, bv = data_torch[:qkv_dim], data_torch[qkv_dim:2*qkv_dim], data_torch[2*qkv_dim:]
|
||||||
# Same permutation as weights: [n_head, head_dim//4, 2, 2] -> [n_head, 2, head_dim//4, 2]
|
|
||||||
bq = bq.reshape(n_head, head_dim // 4, 2, 2).permute(0, 2, 1, 3).reshape(-1)
|
bq = bq.reshape(n_head, head_dim // 4, 2, 2).permute(0, 2, 1, 3).reshape(-1)
|
||||||
bk = bk.reshape(n_head, head_dim // 4, 2, 2).permute(0, 2, 1, 3).reshape(-1)
|
bk = bk.reshape(n_head, head_dim // 4, 2, 2).permute(0, 2, 1, 3).reshape(-1)
|
||||||
|
bv = bv.reshape(n_head, head_dim // 4, 2, 2).permute(0, 2, 1, 3).reshape(-1)
|
||||||
data_torch = torch.cat([bq, bk, bv], dim=0)
|
data_torch = torch.cat([bq, bk, bv], dim=0)
|
||||||
|
|
||||||
|
# Permute output projection from interleaved to split RoPE format
|
||||||
|
if "wo.weight" in name:
|
||||||
|
data_torch = self._permute_output_proj(data_torch, n_head)
|
||||||
|
|
||||||
# Temporal embeddings: (T, 1, C) → (T, C)
|
# Temporal embeddings: (T, 1, C) → (T, C)
|
||||||
if "pos_emb.time_weight" in name:
|
if "pos_emb.time_weight" in name:
|
||||||
T, _, C = data_torch.shape
|
T, _, C = data_torch.shape
|
||||||
|
|
|
||||||
|
|
@ -1358,6 +1358,7 @@ class TensorNameMap:
|
||||||
MODEL_TENSOR.V_ENC_ATTN_QKV: (
|
MODEL_TENSOR.V_ENC_ATTN_QKV: (
|
||||||
"visual.blocks.{bid}.attn.qkv", # qwen3vl
|
"visual.blocks.{bid}.attn.qkv", # qwen3vl
|
||||||
"model.vision.transformer.layers.{bid}.attention.query_key_value", # cogvlm
|
"model.vision.transformer.layers.{bid}.attention.query_key_value", # cogvlm
|
||||||
|
"vision_tower.encoder.blocks.{bid}.wqkv" # Kimi-K2.5
|
||||||
),
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.V_ENC_ATTN_Q: (
|
MODEL_TENSOR.V_ENC_ATTN_Q: (
|
||||||
|
|
|
||||||
|
|
@ -107,17 +107,6 @@ struct clip_graph {
|
||||||
const bool interleave_freq
|
const bool interleave_freq
|
||||||
);
|
);
|
||||||
|
|
||||||
// 2D RoPE with interleaved frequency
|
|
||||||
// Pattern: [x_freq0, y_freq0, x_freq1, y_freq1, ...]
|
|
||||||
// build_rope_2d uses split pattern: [x_freq0, x_freq1, ..., y_freq0, y_freq1, ...]
|
|
||||||
ggml_tensor * build_rope_2d_interleaved(
|
|
||||||
ggml_context * ctx0,
|
|
||||||
ggml_tensor * cur, // [n_dim, n_head, n_pos]
|
|
||||||
ggml_tensor * pos_w, // [n_pos] - X/width positions
|
|
||||||
ggml_tensor * pos_h, // [n_pos] - Y/height positions
|
|
||||||
const float freq_base
|
|
||||||
);
|
|
||||||
|
|
||||||
// aka pixel_shuffle / pixel_unshuffle / patch_merger (Kimi-VL)
|
// aka pixel_shuffle / pixel_unshuffle / patch_merger (Kimi-VL)
|
||||||
// support dynamic resolution
|
// support dynamic resolution
|
||||||
ggml_tensor * build_patch_merge_permute(ggml_tensor * cur, int scale_factor);
|
ggml_tensor * build_patch_merge_permute(ggml_tensor * cur, int scale_factor);
|
||||||
|
|
|
||||||
|
|
@ -715,88 +715,6 @@ ggml_tensor * clip_graph::build_rope_2d(
|
||||||
return cur;
|
return cur;
|
||||||
}
|
}
|
||||||
|
|
||||||
// 2D RoPE with interleaved frequency
|
|
||||||
// Pattern: [x_freq0, y_freq0, x_freq1, y_freq1, ...]
|
|
||||||
// build_rope_2d uses split pattern: [x_freq0, x_freq1, ..., y_freq0, y_freq1, ...]
|
|
||||||
ggml_tensor * clip_graph::build_rope_2d_interleaved(
|
|
||||||
ggml_context * ctx0,
|
|
||||||
ggml_tensor * cur, // [n_dim, n_head, n_pos]
|
|
||||||
ggml_tensor * pos_w, // [n_pos] - X/width positions
|
|
||||||
ggml_tensor * pos_h, // [n_pos] - Y/height positions
|
|
||||||
const float freq_base
|
|
||||||
) {
|
|
||||||
const int64_t n_dim = cur->ne[0];
|
|
||||||
const int64_t n_head = cur->ne[1];
|
|
||||||
const int64_t n_pos = cur->ne[2];
|
|
||||||
|
|
||||||
GGML_ASSERT(n_dim % 4 == 0); // Must be divisible by 4 for interleaved x,y pairs
|
|
||||||
|
|
||||||
// Ensure input is contiguous (needed when using merged QKV with ggml_view)
|
|
||||||
if (!ggml_is_contiguous(cur)) {
|
|
||||||
cur = ggml_cont(ctx0, cur);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Step 1: Reshape to expose interleaved structure
|
|
||||||
// cur: [n_dim, n_head, n_pos] -> [4, n_dim/4, n_head, n_pos]
|
|
||||||
ggml_tensor * reshaped = ggml_reshape_4d(ctx0, cur, 4, n_dim/4, n_head, n_pos);
|
|
||||||
|
|
||||||
// Step 2: Extract X pairs (elements 0,1 of each group of 4)
|
|
||||||
// x_pairs: [2, n_dim/4, n_head, n_pos]
|
|
||||||
ggml_tensor * x_pairs = ggml_view_4d(ctx0, reshaped,
|
|
||||||
2, n_dim/4, n_head, n_pos,
|
|
||||||
reshaped->nb[1], reshaped->nb[2], reshaped->nb[3],
|
|
||||||
0);
|
|
||||||
|
|
||||||
// Step 3: Extract Y pairs (elements 2,3 of each group of 4)
|
|
||||||
// y_pairs: [2, n_dim/4, n_head, n_pos]
|
|
||||||
ggml_tensor * y_pairs = ggml_view_4d(ctx0, reshaped,
|
|
||||||
2, n_dim/4, n_head, n_pos,
|
|
||||||
reshaped->nb[1], reshaped->nb[2], reshaped->nb[3],
|
|
||||||
2 * ggml_element_size(reshaped));
|
|
||||||
|
|
||||||
// Step 4: Make contiguous and reshape for rope_ext
|
|
||||||
// [2, n_dim/4, n_head, n_pos] -> [n_dim/2, n_head, n_pos]
|
|
||||||
x_pairs = ggml_cont(ctx0, x_pairs);
|
|
||||||
x_pairs = ggml_reshape_3d(ctx0, x_pairs, n_dim/2, n_head, n_pos);
|
|
||||||
|
|
||||||
y_pairs = ggml_cont(ctx0, y_pairs);
|
|
||||||
y_pairs = ggml_reshape_3d(ctx0, y_pairs, n_dim/2, n_head, n_pos);
|
|
||||||
|
|
||||||
// Step 5: Apply RoPE to X pairs using pos_w, Y pairs using pos_h
|
|
||||||
x_pairs = ggml_rope_ext(
|
|
||||||
ctx0,
|
|
||||||
x_pairs,
|
|
||||||
pos_w,
|
|
||||||
nullptr,
|
|
||||||
n_dim/2,
|
|
||||||
0, 0, freq_base,
|
|
||||||
1.0f, 0.0f, 1.0f, 0.0f, 0.0f
|
|
||||||
);
|
|
||||||
|
|
||||||
y_pairs = ggml_rope_ext(
|
|
||||||
ctx0,
|
|
||||||
y_pairs,
|
|
||||||
pos_h,
|
|
||||||
nullptr,
|
|
||||||
n_dim/2,
|
|
||||||
0, 0, freq_base,
|
|
||||||
1.0f, 0.0f, 1.0f, 0.0f, 0.0f
|
|
||||||
);
|
|
||||||
|
|
||||||
// Step 6: Reshape back to [2, n_dim/4, n_head, n_pos] for interleaving
|
|
||||||
x_pairs = ggml_reshape_4d(ctx0, x_pairs, 2, n_dim/4, n_head, n_pos);
|
|
||||||
y_pairs = ggml_reshape_4d(ctx0, y_pairs, 2, n_dim/4, n_head, n_pos);
|
|
||||||
|
|
||||||
// Step 7: Interleave X and Y pairs back together
|
|
||||||
// Concatenate along dimension 0: [4, n_dim/4, n_head, n_pos]
|
|
||||||
ggml_tensor * result = ggml_concat(ctx0, x_pairs, y_pairs, 0);
|
|
||||||
|
|
||||||
// Step 8: Reshape back to original: [n_dim, n_head, n_pos]
|
|
||||||
result = ggml_reshape_3d(ctx0, result, n_dim, n_head, n_pos);
|
|
||||||
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Generic function to stack frames for audio processing
|
// Generic function to stack frames for audio processing
|
||||||
// Abstracts out the StackAudioFrames logic used by ultravox
|
// Abstracts out the StackAudioFrames logic used by ultravox
|
||||||
ggml_tensor * clip_graph::build_stack(ggml_tensor * cur, int32_t stack_factor, int32_t n_embed) {
|
ggml_tensor * clip_graph::build_stack(ggml_tensor * cur, int32_t stack_factor, int32_t n_embed) {
|
||||||
|
|
|
||||||
|
|
@ -42,33 +42,13 @@ ggml_cgraph * clip_graph_kimik25::build() {
|
||||||
|
|
||||||
ggml_tensor * learned_pos_embd = resize_position_embeddings_3d(GGML_SCALE_MODE_BICUBIC);
|
ggml_tensor * learned_pos_embd = resize_position_embeddings_3d(GGML_SCALE_MODE_BICUBIC);
|
||||||
|
|
||||||
// Kimi-K2.5 uses interleaved 2D RoPE pattern: [x0_re, x0_im, y0_re, y0_im, x1_re, x1_im, ...]
|
// Kimi-K2.5 uses interleaved 2D RoPE pattern natively, but all attention weights
|
||||||
// Q/K weights are permuted during conversion from interleaved to split format.
|
// (Q, K, V, O) are permuted during conversion to use split format throughout.
|
||||||
// build_rope_2d expects split format and outputs split format.
|
// This allows using build_rope_2d without any runtime format conversion.
|
||||||
// We need to convert the output back to interleaved format for the attention mechanism.
|
// The Q·K dot product is unchanged when Q and K share the same channel permutation, so keeping everything in
|
||||||
|
// split format produces mathematically equivalent results.
|
||||||
auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
|
auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
|
||||||
const int64_t n_dim = cur->ne[0];
|
|
||||||
const int64_t n_head = cur->ne[1];
|
|
||||||
const int64_t n_pos = cur->ne[2];
|
|
||||||
|
|
||||||
// Apply RoPE in split format
|
|
||||||
cur = build_rope_2d(ctx0, cur, pos_w, pos_h, hparams.rope_theta, false);
|
cur = build_rope_2d(ctx0, cur, pos_w, pos_h, hparams.rope_theta, false);
|
||||||
|
|
||||||
// Convert output from split format back to interleaved format
|
|
||||||
// Split: [x0_re, x0_im, x1_re, x1_im, ..., y0_re, y0_im, y1_re, y1_im, ...]
|
|
||||||
// Interleaved: [x0_re, x0_im, y0_re, y0_im, x1_re, x1_im, y1_re, y1_im, ...]
|
|
||||||
//
|
|
||||||
// Reshape to [2, n_dim/4, 2, n_head, n_pos] where:
|
|
||||||
// - first dim 2 = re/im pair
|
|
||||||
// - n_dim/4 = number of frequency pairs per axis
|
|
||||||
// - second dim 2 = X half (0) vs Y half (1)
|
|
||||||
// Then permute to interleave X and Y
|
|
||||||
// Finally reshape back to [n_dim, n_head, n_pos]
|
|
||||||
cur = ggml_reshape_4d(ctx0, cur, 2, n_dim/4, 2, n_head * n_pos);
|
|
||||||
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); // [2, 2, n_dim/4, n_head*n_pos]
|
|
||||||
cur = ggml_cont(ctx0, cur);
|
|
||||||
cur = ggml_reshape_3d(ctx0, cur, n_dim, n_head, n_pos);
|
|
||||||
|
|
||||||
return cur;
|
return cur;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue