From 96294c6ad955962964ae72c8b80426c4f65f2071 Mon Sep 17 00:00:00 2001
From: suhyun-hwang
Date: Sat, 10 Jan 2026 20:56:26 +0900
Subject: [PATCH] refactor: simplify partial RoPE with weight reordering

---
 convert_hf_to_gguf.py |  9 +++++
 src/llama-model.cpp   | 14 +++----
 src/models/vaetki.cpp | 86 +++++++++++++++++++------------------------
 3 files changed, 51 insertions(+), 58 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 52ad5ac3e2..c48385da83 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -7843,6 +7843,15 @@ class VaetkiModel(TextModel):
         elif name.startswith("language_model."):
             name = name.replace("language_model.", "model.")
 
+        if name.endswith("q_b_proj.weight"):
+            n_head = self.hparams["num_attention_heads"]
+            qk_nope_head_dim = self.hparams["qk_nope_head_dim"]
+            qk_rope_head_dim = self.hparams["qk_rope_head_dim"]
+            qk_head_dim = qk_nope_head_dim + qk_rope_head_dim
+            data_torch = data_torch.view(n_head, qk_head_dim, -1)
+            data_torch = torch.cat([data_torch[:, qk_nope_head_dim:, :], data_torch[:, :qk_nope_head_dim, :]], dim=1)
+            data_torch = data_torch.reshape(n_head * qk_head_dim, -1)
+
         # VAETKI WBLRMSNorm: add 1 to weights for standard RMSNorm compatibility
         norm_weight_patterns = [
             "input_layernorm.weight",
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 9315651993..506368fd1b 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1153,15 +1153,11 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
                 }
 
-                {
-                    uint32_t n_swa_temp = 0;
-                    ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, n_swa_temp, false);
-                    if (n_swa_temp > 0) {
-                        hparams.n_swa = n_swa_temp;
-                        ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
-                        ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer);
-                        hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
-                    }
+                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
+                ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
+                if (hparams.n_swa > 0) {
+                    hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+                    ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.swa_layers, hparams.n_layer);
                 }
 
                 switch (hparams.n_layer) {
diff --git a/src/models/vaetki.cpp b/src/models/vaetki.cpp
index 9621220462..8ddd39821e 100644
--- a/src/models/vaetki.cpp
+++ b/src/models/vaetki.cpp
@@ -20,7 +20,7 @@ llm_build_vaetki::llm_build_vaetki(const llama_model & model, const llm_graph_pa
 
     ggml_tensor * inp_pos = build_inp_pos();
 
-    llm_graph_input_attn_kv_iswa * inp_attn = build_attn_inp_kv_iswa();
+    auto * inp_attn = build_attn_inp_kv_iswa();
 
     ggml_tensor * inp_out_ids = build_inp_out_ids();
 
@@ -44,76 +44,64 @@ llm_build_vaetki::llm_build_vaetki(const llama_model & model, const llm_graph_pa
         q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
         cb(q, "q", il);
 
-        ggml_tensor * q_nope =
-            ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens, ggml_row_size(q->type, n_embd_head_k_mla),
-                         ggml_row_size(q->type, n_embd_head_k_mla) * n_head, 0);
-        cb(q_nope, "q_nope", il);
-
-        ggml_tensor * q_pe = ggml_view_3d(
-            ctx0, q, n_embd_head_qk_rope, n_head, n_tokens, ggml_row_size(q->type, n_embd_head_k_mla),
-            ggml_row_size(q->type, n_embd_head_k_mla) * n_head, ggml_row_size(q->type, n_embd_head_qk_nope));
-        cb(q_pe, "q_pe", il);
+        // q is now [rope | nope] after weight reordering in conversion
+        // reshape to {n_embd_head_k_mla, n_head, n_tokens}
+        q = ggml_reshape_3d(ctx0, q, n_embd_head_k_mla, n_head, n_tokens);
 
         ggml_tensor * kv_cmpr_pe = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
         cb(kv_cmpr_pe, "kv_cmpr_pe", il);
 
-        ggml_tensor * kv_cmpr =
-            ggml_view_2d(ctx0, kv_cmpr_pe, kv_lora_rank, n_tokens,
-                         ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope), 0);
+        // {kv_lora_rank, n_tokens}
+        ggml_tensor * kv_cmpr = ggml_view_2d(ctx0, kv_cmpr_pe,
+                kv_lora_rank, n_tokens,
+                ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope), 0);
         cb(kv_cmpr, "kv_cmpr", il);
 
-        ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_cmpr_pe, n_embd_head_qk_rope, 1, n_tokens,
-                                          ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
-                                          ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
-                                          ggml_row_size(kv_cmpr_pe->type, kv_lora_rank));
+        // {n_embd_head_qk_rope, 1, n_tokens}
+        ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_cmpr_pe,
+                n_embd_head_qk_rope, 1, n_tokens,
+                ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
+                ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope),
+                ggml_row_size(kv_cmpr_pe->type, kv_lora_rank));
         cb(k_pe, "k_pe", il);
 
-        q_pe = ggml_rope_ext(ctx0, q_pe, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
-                             ext_factor, attn_factor, beta_fast, beta_slow);
-        cb(q_pe, "q_pe_rope", il);
+        // apply rope - rotates first n_rot dims, copies rest unchanged
+        ggml_tensor * Qcur = ggml_rope_ext(ctx0, q, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
+                freq_base_l, freq_scale_l, ext_factor, attn_factor, beta_fast, beta_slow);
+        cb(Qcur, "Qcur", il);
 
-        k_pe = ggml_rope_ext(ctx0, k_pe, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base_l, freq_scale_l,
-                             ext_factor, attn_factor, beta_fast, beta_slow);
+        k_pe = ggml_rope_ext(ctx0, k_pe, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
+                freq_base_l, freq_scale_l, ext_factor, attn_factor, beta_fast, beta_slow);
         cb(k_pe, "k_pe_rope", il);
 
-        // convert interleaved RoPE to split format
-        q_pe = ggml_reshape_4d(ctx0, q_pe, 2, n_embd_head_qk_rope/2, n_head, n_tokens);
-        q_pe = ggml_permute(ctx0, q_pe, 1, 0, 2, 3);
-        q_pe = ggml_cont(ctx0, q_pe);
-        q_pe = ggml_reshape_3d(ctx0, q_pe, n_embd_head_qk_rope, n_head, n_tokens);
-        cb(q_pe, "q_pe_split", il);
-
-        k_pe = ggml_reshape_4d(ctx0, k_pe, 2, n_embd_head_qk_rope/2, 1, n_tokens);
-        k_pe = ggml_permute(ctx0, k_pe, 1, 0, 2, 3);
-        k_pe = ggml_cont(ctx0, k_pe);
-        k_pe = ggml_reshape_3d(ctx0, k_pe, n_embd_head_qk_rope, 1, n_tokens);
-        cb(k_pe, "k_pe_split", il);
-
         kv_cmpr = build_norm(kv_cmpr, model.layers[il].attn_kv_a_norm, nullptr, LLM_NORM_RMS, il);
         cb(kv_cmpr, "kv_cmpr_norm", il);
 
         ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_cmpr);
         cb(kv, "kv", il);
 
-        ggml_tensor * k_nope =
-            ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
-                         ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v_mla),
-                         ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v_mla) * n_head, 0);
-        cb(k_nope, "k_nope_view", il);
+        // {n_embd_head_qk_nope, n_head, n_tokens}
+        ggml_tensor * k_nope = ggml_view_3d(ctx0, kv,
+                n_embd_head_qk_nope, n_head, n_tokens,
+                ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v_mla),
+                ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v_mla) * n_head, 0);
+        cb(k_nope, "k_nope", il);
 
-        ggml_tensor * Vcur = ggml_view_3d(ctx0, kv, n_embd_head_v_mla, n_head, n_tokens,
-                                          ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v_mla),
-                                          ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v_mla) * n_head,
-                                          ggml_row_size(kv->type, n_embd_head_qk_nope));
-        cb(Vcur, "Vcur_view", il);
+        // {n_embd_head_v_mla, n_head, n_tokens}
+        ggml_tensor * Vcur = ggml_view_3d(ctx0, kv,
+                n_embd_head_v_mla, n_head, n_tokens,
+                ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v_mla),
+                ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v_mla) * n_head,
+                ggml_row_size(kv->type, n_embd_head_qk_nope));
+        cb(Vcur, "Vcur", il);
 
         Vcur = ggml_cont(ctx0, Vcur);
         cb(Vcur, "Vcur_cont", il);
 
-        ggml_tensor * Qcur = ggml_concat(ctx0, q_nope, q_pe, 0);
-        cb(Qcur, "Qcur", il);
-
-        ggml_tensor * Kcur = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0);
+        ggml_tensor * q_pe_ref = ggml_view_3d(ctx0, Qcur,
+                n_embd_head_qk_rope, n_head, n_tokens,
+                Qcur->nb[1], Qcur->nb[2], 0);
+        ggml_tensor * Kcur = ggml_concat(ctx0, ggml_repeat(ctx0, k_pe, q_pe_ref), k_nope, 0);
         cb(Kcur, "Kcur", il);
 
         cur = build_attn(inp_attn,