llama: dynamic head_dim and n_rot for SWA (#20301)

* llama: dynamic head_dim and n_rot for SWA * also add gguf_writer wrappers * fix build * build_rope_shift arg reorder
2026-03-09 22:22:39 +01:00 · 2026-03-09 22:22:39 +01:00 · 59db9a357d
parent 23fbfcb1ad
commit 59db9a357d
112 changed files with 419 additions and 346 deletions
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@ -177,6 +177,8 @@ class Keys:
        TEMPERATURE_LENGTH           = "{arch}.attention.temperature_length"
        KEY_LENGTH_MLA               = "{arch}.attention.key_length_mla"
        VALUE_LENGTH_MLA             = "{arch}.attention.value_length_mla"
+        KEY_LENGTH_SWA               = "{arch}.attention.key_length_swa"
+        VALUE_LENGTH_SWA             = "{arch}.attention.value_length_swa"
        SHARED_KV_LAYERS             = "{arch}.attention.shared_kv_layers"
        SLIDING_WINDOW_PATTERN       = "{arch}.attention.sliding_window_pattern"
        TEMPERATURE_SCALE            = "{arch}.attention.temperature_scale"
@ -188,6 +190,7 @@ class Keys:

    class Rope:
        DIMENSION_COUNT           = "{arch}.rope.dimension_count"
+        DIMENSION_COUNT_SWA       = "{arch}.rope.dimension_count_swa"
        DIMENSION_SECTIONS        = "{arch}.rope.dimension_sections"
        FREQ_BASE                 = "{arch}.rope.freq_base"
        FREQ_BASE_SWA             = "{arch}.rope.freq_base_swa"
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@ -773,6 +773,12 @@ class GGUFWriter:
    def add_value_length_mla(self, length: int) -> None:
        self.add_uint32(Keys.Attention.VALUE_LENGTH_MLA.format(arch=self.arch), length)

+    def add_key_length_swa(self, length: int) -> None:
+        self.add_uint32(Keys.Attention.KEY_LENGTH_SWA.format(arch=self.arch), length)
+
+    def add_value_length_swa(self, length: int) -> None:
+        self.add_uint32(Keys.Attention.VALUE_LENGTH_SWA.format(arch=self.arch), length)
+
    def add_indexer_head_count(self, count: int) -> None:
        self.add_uint32(Keys.Attention.Indexer.HEAD_COUNT.format(arch=self.arch), count)

@ -946,6 +952,9 @@ class GGUFWriter:
    def add_rope_dimension_count(self, count: int) -> None:
        self.add_uint32(Keys.Rope.DIMENSION_COUNT.format(arch=self.arch), count)

+    def add_rope_dimension_count_swa(self, count: int) -> None:
+        self.add_uint32(Keys.Rope.DIMENSION_COUNT_SWA.format(arch=self.arch), count)
+
    def add_rope_dimension_sections(self, dims: Sequence[int]) -> None:
        self.add_array(Keys.Rope.DIMENSION_SECTIONS.format(arch=self.arch), dims)

--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@ -230,11 +230,14 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
    { LLM_KV_ATTENTION_TEMPERATURE_SCALE,            "%s.attention.temperature_scale"            },
    { LLM_KV_ATTENTION_KEY_LENGTH_MLA,               "%s.attention.key_length_mla"               },
    { LLM_KV_ATTENTION_VALUE_LENGTH_MLA,             "%s.attention.value_length_mla"             },
+    { LLM_KV_ATTENTION_KEY_LENGTH_SWA,               "%s.attention.key_length_swa"               },
+    { LLM_KV_ATTENTION_VALUE_LENGTH_SWA,             "%s.attention.value_length_swa"             },
    { LLM_KV_ATTENTION_INDEXER_HEAD_COUNT,           "%s.attention.indexer.head_count"           },
    { LLM_KV_ATTENTION_INDEXER_KEY_LENGTH,           "%s.attention.indexer.key_length"           },
    { LLM_KV_ATTENTION_INDEXER_TOP_K,                "%s.attention.indexer.top_k"                },

    { LLM_KV_ROPE_DIMENSION_COUNT,           "%s.rope.dimension_count"                 },
+    { LLM_KV_ROPE_DIMENSION_COUNT_SWA,       "%s.rope.dimension_count_swa"             },
    { LLM_KV_ROPE_DIMENSION_SECTIONS,        "%s.rope.dimension_sections"              },
    { LLM_KV_ROPE_FREQ_BASE,                 "%s.rope.freq_base"                       },
    { LLM_KV_ROPE_FREQ_BASE_SWA,             "%s.rope.freq_base_swa"                   },
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@ -234,11 +234,14 @@ enum llm_kv {
    LLM_KV_ATTENTION_TEMPERATURE_SCALE,
    LLM_KV_ATTENTION_KEY_LENGTH_MLA,
    LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
+    LLM_KV_ATTENTION_KEY_LENGTH_SWA,
+    LLM_KV_ATTENTION_VALUE_LENGTH_SWA,
    LLM_KV_ATTENTION_INDEXER_HEAD_COUNT,
    LLM_KV_ATTENTION_INDEXER_KEY_LENGTH,
    LLM_KV_ATTENTION_INDEXER_TOP_K,

    LLM_KV_ROPE_DIMENSION_COUNT,
+    LLM_KV_ROPE_DIMENSION_COUNT_SWA,
    LLM_KV_ROPE_DIMENSION_SECTIONS,
    LLM_KV_ROPE_FREQ_BASE,
    LLM_KV_ROPE_FREQ_BASE_SWA,
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@ -2876,19 +2876,23 @@ llama_context * llama_init_from_model(

    if (params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO && ggml_is_quantized(params.type_k)) {
        const uint32_t blck_size = ggml_blck_size(params.type_k);
-        if (model->hparams.n_embd_head_k % blck_size != 0) {
-            LLAMA_LOG_ERROR("%s: K cache type %s with block size %u does not divide n_embd_head_k=%u\n",
-                __func__, ggml_type_name(params.type_k), blck_size, model->hparams.n_embd_head_k);
-            return nullptr;
+        for (uint32_t il = 0; il < model->hparams.n_layer; ++il) {
+            if (model->hparams.n_embd_head_k(il) % blck_size != 0) {
+                LLAMA_LOG_ERROR("%s: K cache type %s with block size %u does not divide n_embd_head_k=%u\n",
+                    __func__, ggml_type_name(params.type_k), blck_size, model->hparams.n_embd_head_k(il));
+                return nullptr;
+            }
        }
    }

    if (params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO && ggml_is_quantized(params.type_v)) {
        const uint32_t blck_size = ggml_blck_size(params.type_v);
-        if (model->hparams.n_embd_head_v % blck_size != 0) {
-            LLAMA_LOG_ERROR("%s: V cache type %s with block size %u does not divide n_embd_head_k=%u\n",
-                __func__, ggml_type_name(params.type_v), blck_size, model->hparams.n_embd_head_v);
-            return nullptr;
+        for (uint32_t il = 0; il < model->hparams.n_layer; ++il) {
+            if (model->hparams.n_embd_head_v(il) % blck_size != 0) {
+                LLAMA_LOG_ERROR("%s: V cache type %s with block size %u does not divide n_embd_head_v=%u\n",
+                    __func__, ggml_type_name(params.type_v), blck_size, model->hparams.n_embd_head_v(il));
+                return nullptr;
+            }
        }
    }

--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@ -849,13 +849,13 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
    ubatch           (params.ubatch),
    n_embd           (hparams.n_embd),
    n_layer          (hparams.n_layer),
-    n_rot            (hparams.n_rot),
+    n_rot            (hparams.n_rot()),
    n_ctx            (cparams.n_ctx),
    n_head           (hparams.n_head()),
    n_head_kv        (hparams.n_head_kv()),
-    n_embd_head_k    (hparams.n_embd_head_k),
+    n_embd_head_k    (hparams.n_embd_head_k()),
    n_embd_k_gqa     (hparams.n_embd_k_gqa()),
-    n_embd_head_v    (hparams.n_embd_head_v),
+    n_embd_head_v    (hparams.n_embd_head_v()),
    n_embd_v_gqa     (hparams.n_embd_v_gqa()),
    n_expert         (hparams.n_expert),
    n_expert_used    (cparams.warmup ? hparams.n_expert : hparams.n_expert_used),
--- a/src/llama-hparams.cpp
+++ b/src/llama-hparams.cpp
@ -62,6 +62,14 @@ uint32_t llama_hparams::n_gqa(uint32_t il) const {
    return n_head/n_head_kv;
 }

+uint32_t llama_hparams::n_rot(uint32_t il) const {
+    if (il < n_layer) {
+        return is_swa(il) ? n_rot_swa : n_rot_full;
+    }
+
+    GGML_ABORT("fatal error");
+}
+
 uint32_t llama_hparams::n_embd_inp() const {
    uint32_t n_embd_inp = n_embd;

@ -76,16 +84,32 @@ uint32_t llama_hparams::n_embd_out() const {
    return n_embd_out_impl > 0 ? n_embd_out_impl : n_embd;
 }

+uint32_t llama_hparams::n_embd_head_k(uint32_t il) const {
+    if (il < n_layer) {
+        return is_swa(il) ? n_embd_head_k_swa : n_embd_head_k_full;
+    }
+
+    GGML_ABORT("fatal error");
+}
+
+uint32_t llama_hparams::n_embd_head_v(uint32_t il) const {
+    if (il < n_layer) {
+        return is_swa(il) ? n_embd_head_v_swa : n_embd_head_v_full;
+    }
+
+    GGML_ABORT("fatal error");
+}
+
 uint32_t llama_hparams::n_embd_k_gqa(uint32_t il) const {
    const uint32_t n_head_kv = this->n_head_kv(il);

-    return n_embd_head_k * n_head_kv;
+    return n_embd_head_k(il) * n_head_kv;
 }

 uint32_t llama_hparams::n_embd_v_gqa(uint32_t il) const {
    const uint32_t n_head_kv = this->n_head_kv(il);

-    return n_embd_head_v * n_head_kv;
+    return n_embd_head_v(il) * n_head_kv;
 }

 bool llama_hparams::is_n_embd_k_gqa_variable() const {
@ -197,11 +221,11 @@ bool llama_hparams::is_mla() const {
 }

 uint32_t llama_hparams::n_embd_head_k_mla() const {
-    return is_mla() ? n_embd_head_k_mla_impl : n_embd_head_k;
+    return is_mla() ? n_embd_head_k_mla_impl : n_embd_head_k();
 }

 uint32_t llama_hparams::n_embd_head_v_mla() const {
-    return is_mla() ? n_embd_head_v_mla_impl : n_embd_head_v;
+    return is_mla() ? n_embd_head_v_mla_impl : n_embd_head_v();
 }

 bool llama_hparams::has_kv(uint32_t il) const {
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@ -44,13 +44,20 @@ struct llama_hparams {
    uint32_t n_embd;
    uint32_t n_layer;
    int32_t n_layer_kv_from_start = -1; // if non-negative, the first n_layer_kv_from_start layers have KV cache
-    uint32_t n_rot;
-    uint32_t n_embd_head_k; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
-    uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head
    uint32_t n_expert = 0;
    uint32_t n_expert_used = 0;
    uint32_t n_rel_attn_bkts = 0;

+    // different head size for full_attention and SWA layers
+    uint32_t n_embd_head_k_full; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
+    uint32_t n_embd_head_v_full; // dimension of values (d_v) aka n_embd_head
+    uint32_t n_embd_head_k_swa;
+    uint32_t n_embd_head_v_swa;
+
+    // different RoPE dimensions for full_attention and SWA layers
+    uint32_t n_rot_full;
+    uint32_t n_rot_swa;
+
    // note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA
    uint32_t n_embd_head_k_mla_impl = 0;
    uint32_t n_embd_head_v_mla_impl = 0;
@ -247,12 +254,18 @@ struct llama_hparams {

    uint32_t n_gqa(uint32_t il = 0) const;

+    uint32_t n_rot(uint32_t il = 0) const;
+
    // dimension of main + auxiliary input embeddings
    uint32_t n_embd_inp() const;

    // dimension of output embeddings
    uint32_t n_embd_out() const;

+    // dimension of key/value embeddings for each head (per layer)
+    uint32_t n_embd_head_k(uint32_t il = 0) const;
+    uint32_t n_embd_head_v(uint32_t il = 0) const;
+
    // dimension of key embeddings across all k-v heads
    uint32_t n_embd_k_gqa(uint32_t il = 0) const;

--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@ -1033,8 +1033,8 @@ ggml_tensor * llama_kv_cache::get_k(ggml_context * ctx, int32_t il, uint32_t n_k
    const uint32_t ns = sinfo.s1 - sinfo.s0 + 1;

    return ggml_view_4d(ctx, k,
-            hparams.n_embd_head_k, hparams.n_head_kv(il), n_kv, ns,
-            ggml_row_size(k->type, hparams.n_embd_head_k),
+            hparams.n_embd_head_k(il), hparams.n_head_kv(il), n_kv, ns,
+            ggml_row_size(k->type, hparams.n_embd_head_k(il)),
            ggml_row_size(k->type, n_embd_k_gqa),
            ggml_row_size(k->type, n_embd_k_gqa*kv_size),
            ggml_row_size(k->type, n_embd_k_gqa*kv_size)*sinfo.s0);
@ -1056,8 +1056,8 @@ ggml_tensor * llama_kv_cache::get_v(ggml_context * ctx, int32_t il, uint32_t n_k
    if (!v_trans) {
        // note: v->nb[1] <= v->nb[2]
        return ggml_view_4d(ctx, v,
-                hparams.n_embd_head_v, hparams.n_head_kv(il), n_kv, ns,
-                ggml_row_size(v->type, hparams.n_embd_head_v),          // v->nb[1]
+                hparams.n_embd_head_v(il), hparams.n_head_kv(il), n_kv, ns,
+                ggml_row_size(v->type, hparams.n_embd_head_v(il)),          // v->nb[1]
                ggml_row_size(v->type, n_embd_v_gqa),                   // v->nb[2]
                ggml_row_size(v->type, n_embd_v_gqa*kv_size),           // v->nb[3]
                ggml_row_size(v->type, n_embd_v_gqa*kv_size)*sinfo.s0);
@ -1065,8 +1065,8 @@ ggml_tensor * llama_kv_cache::get_v(ggml_context * ctx, int32_t il, uint32_t n_k

    // note: v->nb[1] > v->nb[2]
    return ggml_view_4d(ctx, v,
-            n_kv, hparams.n_head_kv(il), hparams.n_embd_head_v, ns,
-            ggml_row_size(v->type, kv_size*hparams.n_embd_head_v),  // v->nb[1]
+            n_kv, hparams.n_head_kv(il), hparams.n_embd_head_v(il), ns,
+            ggml_row_size(v->type, kv_size*hparams.n_embd_head_v(il)),  // v->nb[1]
            ggml_row_size(v->type, kv_size),                        // v->nb[2]
            ggml_row_size(v->type, kv_size*n_embd_v_gqa),           // v->nb[3]
            ggml_row_size(v->type, kv_size*n_embd_v_gqa)*sinfo.s0);
@ -1544,7 +1544,8 @@ ggml_tensor * llama_kv_cache::build_rope_shift(
                ggml_tensor * shift,
                ggml_tensor * factors,
                      float   freq_base,
-                      float   freq_scale) const {
+                      float   freq_scale,
+                   uint32_t   il) const {
    const auto & n_ctx_orig = cparams.n_ctx_orig_yarn;

    const auto & yarn_ext_factor  = cparams.yarn_ext_factor;
@ -1552,7 +1553,7 @@ ggml_tensor * llama_kv_cache::build_rope_shift(
    const auto & yarn_beta_slow   = cparams.yarn_beta_slow;
    const auto & yarn_attn_factor = cparams.yarn_attn_factor;

-    const auto & n_rot     = hparams.n_rot;
+    const auto & n_rot     = hparams.n_rot(il);
    const auto & rope_type = hparams.rope_type == LLAMA_ROPE_TYPE_MROPE || hparams.rope_type == LLAMA_ROPE_TYPE_IMROPE
                                // @ngxson : this is a workaround
                                // for M-RoPE, we want to rotate the whole vector when doing KV shift
@ -1606,13 +1607,6 @@ ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_co
    auto * ctx = res->get_ctx();
    auto * gf  = res->get_gf();

-    const auto & n_embd_head_k = hparams.n_embd_head_k;
-  //const auto & n_embd_head_v = hparams.n_embd_head_v;
-
-    const auto & n_rot = hparams.n_rot;
-
-    const auto n_embd_nope = hparams.n_lora_kv > 0 ? n_embd_head_k - n_rot : 0;
-
    auto inp = std::make_unique<llm_graph_input_k_shift>(this);

    inp->k_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, (int64_t) get_size()*n_stream);
@ -1626,6 +1620,10 @@ ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_co
        const int64_t n_head_kv    = hparams.n_head_kv(il);
        const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);

+        const auto n_rot         = hparams.n_rot(il);
+        const auto n_embd_head_k = hparams.n_embd_head_k(il);
+        const auto n_embd_nope   = hparams.n_lora_kv > 0 ? n_embd_head_k - n_rot : 0;
+
        const float freq_base_l  = model.get_rope_freq_base (cparams, il);
        const float freq_scale_l = model.get_rope_freq_scale(cparams, il);

@ -1638,7 +1636,7 @@ ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_co
                ggml_row_size(layer.k->type, n_embd_k_gqa),
                ggml_row_size(layer.k->type, n_embd_nope));

-        ggml_tensor * cur = build_rope_shift(cparams, ctx, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l);
+        ggml_tensor * cur = build_rope_shift(cparams, ctx, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l, il);

        ggml_build_forward_expand(gf, cur);
    }
--- a/src/llama-kv-cache.h
+++ b/src/llama-kv-cache.h
@ -264,7 +264,8 @@ private:
                    ggml_tensor * shift,
                    ggml_tensor * factors,
                          float   freq_base,
-                          float   freq_scale) const;
+                          float   freq_scale,
+                       uint32_t   il) const;

    ggml_cgraph * build_graph_shift(
               llm_graph_result * res,
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@ -918,7 +918,7 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w
            } break;
        case GGML_OP_ROPE:
            {
-                const int n_embd_head = hparams.n_embd_head_v;
+                const int n_embd_head = hparams.n_embd_head_v();
                const int n_head = hparams.n_head();
                ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd_head, n_head, 512);
                ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512);
--- a/src/llama-model-saver.cpp
+++ b/src/llama-model-saver.cpp
@ -186,8 +186,10 @@ void llama_model_saver::add_kv_from_model() {
    add_kv(LLM_KV_ATTENTION_HEAD_COUNT_KV,           hparams.n_head_kv_arr, true);
    add_kv(LLM_KV_ATTENTION_MAX_ALIBI_BIAS,          hparams.f_max_alibi_bias);
    add_kv(LLM_KV_ATTENTION_CLAMP_KQV,               hparams.f_clamp_kqv);
-    add_kv(LLM_KV_ATTENTION_KEY_LENGTH,              hparams.n_embd_head_k);
-    add_kv(LLM_KV_ATTENTION_VALUE_LENGTH,            hparams.n_embd_head_v);
+    add_kv(LLM_KV_ATTENTION_KEY_LENGTH,              hparams.n_embd_head_k_full);
+    add_kv(LLM_KV_ATTENTION_VALUE_LENGTH,            hparams.n_embd_head_v_full);
+    add_kv(LLM_KV_ATTENTION_KEY_LENGTH_SWA,          hparams.n_embd_head_k_swa);
+    add_kv(LLM_KV_ATTENTION_VALUE_LENGTH_SWA,        hparams.n_embd_head_v_swa);
    add_kv(LLM_KV_ATTENTION_LAYERNORM_EPS,           hparams.f_norm_eps);
    add_kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,       hparams.f_norm_rms_eps);
    add_kv(LLM_KV_ATTENTION_CAUSAL,                  hparams.causal_attn);
@ -199,7 +201,8 @@ void llama_model_saver::add_kv_from_model() {

    const float rope_scaling_factor = hparams.rope_freq_scale_train == 1.0f ? 0.0f : 1.0f/hparams.rope_freq_scale_train;

-    add_kv(LLM_KV_ROPE_DIMENSION_COUNT,              hparams.n_rot);
+    add_kv(LLM_KV_ROPE_DIMENSION_COUNT,              hparams.n_rot_full);
+    add_kv(LLM_KV_ROPE_DIMENSION_COUNT_SWA,          hparams.n_rot_swa);
    add_kv(LLM_KV_ROPE_FREQ_BASE,                    hparams.rope_freq_base_train);
    // add_kv(LLM_KV_ROPE_SCALE_LINEAR,                 rope_scaling_factor); // old name
    add_kv(LLM_KV_ROPE_SCALING_TYPE,                 llama_rope_scaling_type_name(hparams.rope_scaling_type_train));
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@ -459,26 +459,37 @@ void llama_model::load_hparams(llama_model_loader & ml) {
        // gpt-neox n_rot = rotary_pct * (n_embd / n_head)
        // gpt-j n_rot = rotary_dim

-        hparams.n_embd_head_k = hparams.n_embd / hparams.n_head();
-        ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false);
+        hparams.n_embd_head_k_full = hparams.n_embd / hparams.n_head();
+        ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k_full, false);

-        hparams.n_embd_head_v = hparams.n_embd / hparams.n_head();
-        ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);
+        hparams.n_embd_head_v_full = hparams.n_embd / hparams.n_head();
+        ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v_full, false);

        // sanity check for n_rot (optional)
-        hparams.n_rot = hparams.n_embd_head_k;
+        hparams.n_rot_full = hparams.n_embd_head_k_full;

-        ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
+        ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot_full, false);

        if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON || arch == LLM_ARCH_LLAMA_EMBED) {
-            if (hparams.n_rot != hparams.n_embd_head_k) {
-                throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k));
+            if (hparams.n_rot_full != hparams.n_embd_head_k_full) {
+                throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot_full, hparams.n_embd_head_k_full));
            }
        }
    } else {
-        hparams.n_rot = 0;
-        hparams.n_embd_head_k = 0;
-        hparams.n_embd_head_v = 0;
+        hparams.n_rot_full = 0;
+        hparams.n_embd_head_k_full = 0;
+        hparams.n_embd_head_v_full = 0;
+    }
+
+    // head size and n_rot for SWA layers
+    {
+        hparams.n_embd_head_k_swa = hparams.n_embd_head_k_full;
+        hparams.n_embd_head_v_swa = hparams.n_embd_head_v_full;
+        ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_SWA, hparams.n_embd_head_k_swa, false);
+        ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_SWA, hparams.n_embd_head_v_swa, false);
+
+        hparams.n_rot_swa = hparams.n_rot_full;
+        ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT_SWA, hparams.n_rot_swa, false);
    }

    // for differentiating model types
@ -1114,10 +1125,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                        break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
-
-                // Load attention parameters
-                ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH,   hparams.n_embd_head_k, false);
-                ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false);
            } break;
        case LLM_ARCH_PLAMO3:
            {
@ -1212,7 +1219,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/config.py#L173
                hparams.f_attention_scale = type == LLM_TYPE_27B
                    ? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
-                    : 1.0f / std::sqrt(float(hparams.n_embd_head_k));
+                    : 1.0f / std::sqrt(float(hparams.n_embd_head_k()));
            } break;
        case LLM_ARCH_GEMMA3:
            {
@ -1245,7 +1252,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/config.py#L289
                hparams.f_attention_scale = type == LLM_TYPE_27B
                    ? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
-                    : 1.0f / std::sqrt(float(hparams.n_embd_head_k));
+                    : 1.0f / std::sqrt(float(hparams.n_embd_head_k()));
            } break;
        case LLM_ARCH_GEMMA3N:
            {
@ -1294,7 +1301,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                    case 24: type = LLM_TYPE_0_3B; break;
                    default: type = LLM_TYPE_UNKNOWN;
                }
-                hparams.f_attention_scale = 1.0f / std::sqrt(float(hparams.n_embd_head_k));
+                hparams.f_attention_scale = 1.0f / std::sqrt(float(hparams.n_embd_head_k()));

            } break;
        case LLM_ARCH_STARCODER2:
@ -2487,7 +2494,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA,    hparams.n_embd_head_k_mla_impl);
                ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA,  hparams.n_embd_head_v_mla_impl);
                ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK,      hparams.n_lora_kv);
-                ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT,        hparams.n_rot);
                ml.get_key(LLM_KV_SSM_CONV_KERNEL,             hparams.ssm_d_conv);
                ml.get_key(LLM_KV_KDA_HEAD_DIM,                hparams.n_embd_head_kda);

@ -2518,6 +2524,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {

                hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;

+                // full_attention layer only use half of the RoPE dimensions
+                hparams.n_rot_full = hparams.n_rot_full / 2;
+
                // MoE + SWA parameters
                ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,        hparams.n_ff_exp);
                ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false);
@ -2661,13 +2670,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
        const int64_t n_embd        = hparams.n_embd;
        const int64_t n_embd_k_gqa  = hparams.n_embd_k_gqa();
        const int64_t n_embd_v_gqa  = hparams.n_embd_v_gqa();
-        const int64_t n_embd_head_k = hparams.n_embd_head_k;
-        const int64_t n_embd_head_v = hparams.n_embd_head_v;
+        const int64_t n_embd_head_k = hparams.n_embd_head_k();
+        const int64_t n_embd_head_v = hparams.n_embd_head_v();
        const int64_t n_ff          = hparams.n_ff();
        const int64_t n_embd_gqa    = n_embd_v_gqa;
        const int64_t n_vocab       = vocab.n_tokens();
        const int64_t n_token_types = vocab.n_token_types();
-        const int64_t n_rot         = hparams.n_rot;
+        const int64_t n_rot         = hparams.n_rot();
        const int64_t n_expert      = hparams.n_expert;
        const int64_t n_expert_used = hparams.n_expert_used;
        const int64_t n_ctx_train   = hparams.n_ctx_train;
@ -2967,8 +2976,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                } break;
            case LLM_ARCH_MINICPM3:
                {
-                    const int64_t n_embd_head_qk_rope = hparams.n_rot;
-                    const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
+                    const int64_t n_embd_head_qk_rope = hparams.n_rot();
+                    const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k() - hparams.n_rot();

                    const int64_t q_lora_rank  = hparams.n_lora_q;
                    const int64_t kv_lora_rank = hparams.n_lora_kv;
@ -3840,8 +3849,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                    const int64_t dt_dim              = std::max(64, int(hparams.n_embd / 16));

                    // attention parameters
-                    const uint32_t qk_dim = hparams.n_embd_head_k;
-                    const uint32_t v_dim  = hparams.n_embd_head_v;
+                    const uint32_t qk_dim = hparams.n_embd_head_k();
+                    const uint32_t v_dim  = hparams.n_embd_head_v();

                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

@ -3901,8 +3910,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                } break;
            case LLM_ARCH_PLAMO3:
                {
-                    const int64_t head_dim_q = hparams.n_embd_head_k;
-                    const int64_t head_dim_v = hparams.n_embd_head_v;
+                    const int64_t head_dim_q = hparams.n_embd_head_k();
+                    const int64_t head_dim_v = hparams.n_embd_head_v();

                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

@ -4649,7 +4658,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                } break;
            case LLM_ARCH_SEED_OSS:
                {
-                    const uint32_t head_dim             = hparams.n_embd_head_k;
+                    const uint32_t head_dim             = hparams.n_embd_head_k();
                    const int64_t n_qo_dim              = n_head * head_dim;
                    const int64_t n_kv_dim              = n_head_kv * head_dim;

@ -4878,7 +4887,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                    const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla();
                    const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla();

-                    const int64_t n_embd_head_qk_rope = hparams.n_rot;
+                    const int64_t n_embd_head_qk_rope = hparams.n_rot();
                    const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope;
                    GGML_ASSERT(n_embd_head_qk_nope >= 1);

@ -4957,8 +4966,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                } break;
            case LLM_ARCH_PLM:
                {
-                    const int64_t n_embd_head_qk_rope = hparams.n_rot;
-                    const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
+                    const int64_t n_embd_head_qk_rope = hparams.n_rot();
+                    const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k() - hparams.n_rot();
                    const int64_t kv_lora_rank = hparams.n_lora_kv;

                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
@ -5396,7 +5405,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                    const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla();
                    const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla();

-                    const int64_t n_embd_head_qk_rope = hparams.n_rot;
+                    const int64_t n_embd_head_qk_rope = hparams.n_rot();
                    const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope;

                    const int64_t q_lora_rank  = hparams.n_lora_q;
@ -5680,7 +5689,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                    const int64_t n_expert       = hparams.n_expert;
                    const int64_t n_expert_used  = hparams.n_expert_used;
                    const int64_t n_ff_shexp     = hparams.n_ff_shexp > 0 ? hparams.n_ff_shexp : n_ff_exp;
-                    const int64_t head_dim       = hparams.n_embd_head_k;
+                    const int64_t head_dim       = hparams.n_embd_head_k();
                    const int64_t n_qo_dim       = n_head * head_dim;
                    const int64_t n_kv_dim       = n_head_kv * head_dim;

@ -6968,7 +6977,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {

                             // Kimi: qk_rope_head_dim = 64 (actual RoPE dimension for MLA)
                             // Note: hparams.n_rot may be 72 (from conversion) but actual is 64
-                             const int64_t qk_rope_head_dim = hparams.n_rot;  // From config: qk_rope_head_dim
+                             const int64_t qk_rope_head_dim = hparams.n_rot();  // From config: qk_rope_head_dim
                             layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + qk_rope_head_dim}, 0);
                             // Support Legacy GGUFs that don't split wkv_b (MLA KV cache disabled)
                             layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i),
@ -7339,7 +7348,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                    // ("rope_freqs.weight") and ggml uses only the first (n_rot_l/2) entries per layer.
                    uint32_t n_rot_max = 0;
                    for (int i = 0; i < n_layer; ++i) {
-                        n_rot_max = std::max(n_rot_max, hparams.n_rot);
+                        n_rot_max = std::max(n_rot_max, hparams.n_rot());
                    }
                    if (n_rot_max == 0) {
                        n_rot_max = n_rot;
@ -7674,11 +7683,11 @@ void llama_model::print_info() const {
        LLAMA_LOG_INFO("%s: n_layer               = %u\n",     __func__, hparams.n_layer);
        LLAMA_LOG_INFO("%s: n_head                = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_head(il);    }, hparams.n_layer).c_str());
        LLAMA_LOG_INFO("%s: n_head_kv             = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str());
-        LLAMA_LOG_INFO("%s: n_rot                 = %u\n",     __func__, hparams.n_rot);
+        LLAMA_LOG_INFO("%s: n_rot                 = %u\n",     __func__, hparams.n_rot_full);
        LLAMA_LOG_INFO("%s: n_swa                 = %u\n",     __func__, hparams.n_swa);
        LLAMA_LOG_INFO("%s: is_swa_any            = %u\n",     __func__, hparams.is_swa_any());
-        LLAMA_LOG_INFO("%s: n_embd_head_k         = %u\n",     __func__, hparams.n_embd_head_k);
-        LLAMA_LOG_INFO("%s: n_embd_head_v         = %u\n",     __func__, hparams.n_embd_head_v);
+        LLAMA_LOG_INFO("%s: n_embd_head_k         = %u\n",     __func__, hparams.n_embd_head_k_full);
+        LLAMA_LOG_INFO("%s: n_embd_head_v         = %u\n",     __func__, hparams.n_embd_head_v_full);
        LLAMA_LOG_INFO("%s: n_gqa                 = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il);        }, hparams.n_layer).c_str());
        LLAMA_LOG_INFO("%s: n_embd_k_gqa          = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_embd_k_gqa(il); }, hparams.n_layer).c_str());
        LLAMA_LOG_INFO("%s: n_embd_v_gqa          = %s\n",     __func__, print_f([&](uint32_t il) { return hparams.n_embd_v_gqa(il); }, hparams.n_layer).c_str());
@ -7702,6 +7711,9 @@ void llama_model::print_info() const {
        if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) {
            LLAMA_LOG_INFO("%s: freq_base_swa         = %.1f\n",   __func__, hparams.rope_freq_base_train_swa);
            LLAMA_LOG_INFO("%s: freq_scale_swa        = %g\n",     __func__, hparams.rope_freq_scale_train_swa);
+            LLAMA_LOG_INFO("%s: n_embd_head_k_swa     = %u\n",     __func__, hparams.n_embd_head_k_swa);
+            LLAMA_LOG_INFO("%s: n_embd_head_v_swa     = %u\n",     __func__, hparams.n_embd_head_v_swa);
+            LLAMA_LOG_INFO("%s: n_rot_swa             = %u\n",     __func__, hparams.n_rot_swa);
        }
        LLAMA_LOG_INFO("%s: n_ctx_orig_yarn       = %u\n",     __func__, hparams.n_ctx_orig_yarn);
        LLAMA_LOG_INFO("%s: rope_yarn_log_mul     = %.4f\n",   __func__, hparams.rope_yarn_log_mul);
--- a/src/models/afmoe.cpp
+++ b/src/models/afmoe.cpp
@ -1,8 +1,8 @@
 #include "models.h"

 llm_build_afmoe::llm_build_afmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_v;
-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    const int64_t n_embd_head = hparams.n_embd_head_v();
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());

    ggml_tensor * cur;
    ggml_tensor * inpL;
--- a/src/models/apertus.cpp
+++ b/src/models/apertus.cpp
@ -3,10 +3,10 @@


 llm_build_apertus::llm_build_apertus(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_head = hparams.n_embd_head_v();

-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-    GGML_ASSERT(n_embd_head == hparams.n_rot);
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+    GGML_ASSERT(n_embd_head == n_rot);

    ggml_tensor * cur;
    ggml_tensor * inpL;
--- a/src/models/arcee.cpp
+++ b/src/models/arcee.cpp
@ -2,10 +2,10 @@


 llm_build_arcee::llm_build_arcee(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_head = hparams.n_embd_head_v();

-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-    GGML_ASSERT(n_embd_head == hparams.n_rot);
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+    GGML_ASSERT(n_embd_head == n_rot);

    ggml_tensor * cur;
    ggml_tensor * inpL;
--- a/src/models/arctic.cpp
+++ b/src/models/arctic.cpp
@ -1,10 +1,10 @@
 #include "models.h"

 llm_build_arctic::llm_build_arctic(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_head = hparams.n_embd_head_v();

-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-    GGML_ASSERT(n_embd_head == hparams.n_rot);
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+    GGML_ASSERT(n_embd_head == n_rot);

    ggml_tensor * cur;
    ggml_tensor * inpL;
--- a/src/models/baichuan.cpp
+++ b/src/models/baichuan.cpp
@ -2,10 +2,10 @@


 llm_build_baichuan::llm_build_baichuan(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_head = hparams.n_embd_head_v();

-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-    GGML_ASSERT(n_embd_head == hparams.n_rot);
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+    GGML_ASSERT(n_embd_head == n_rot);

    ggml_tensor * cur;
    ggml_tensor * inpL;
--- a/src/models/bailingmoe2.cpp
+++ b/src/models/bailingmoe2.cpp
@ -2,10 +2,10 @@

 llm_build_bailingmoe2::llm_build_bailingmoe2(const llama_model & model, const llm_graph_params & params) :
    llm_graph_context(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_head = hparams.n_embd_head_v();
    const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();

-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());

    ggml_tensor * cur;
    ggml_tensor * inpL;
--- a/src/models/bert.cpp
+++ b/src/models/bert.cpp
@ -1,10 +1,10 @@
 #include "models.h"

 llm_build_bert::llm_build_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_head = hparams.n_embd_head_v();
    const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();

-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());

    ggml_tensor * cur;
    ggml_tensor * inpL;
--- a/src/models/bitnet.cpp
+++ b/src/models/bitnet.cpp
@ -2,9 +2,9 @@


 llm_build_bitnet::llm_build_bitnet(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_head = hparams.n_embd_head_v();

-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());

    ggml_tensor * cur;
    ggml_tensor * inpL;
--- a/src/models/bloom.cpp
+++ b/src/models/bloom.cpp
@ -1,10 +1,10 @@
 #include "models.h"

 llm_build_bloom::llm_build_bloom(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_head = hparams.n_embd_head_v();
    const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();

-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());

    ggml_tensor * cur;
    ggml_tensor * inpL;
--- a/src/models/chameleon.cpp
+++ b/src/models/chameleon.cpp
@ -3,10 +3,10 @@
 #include <float.h>

 llm_build_chameleon::llm_build_chameleon(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_head = hparams.n_embd_head_v();

-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-    GGML_ASSERT(n_embd_head == hparams.n_rot);
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+    GGML_ASSERT(n_embd_head == n_rot);

    ggml_tensor * cur;
    ggml_tensor * inpL;
--- a/src/models/chatglm.cpp
+++ b/src/models/chatglm.cpp
@ -2,10 +2,10 @@


 llm_build_chatglm::llm_build_chatglm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_head = hparams.n_embd_head_v();
    const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();

-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());

    ggml_tensor * cur;
    ggml_tensor * inpL;
--- a/src/models/codeshell.cpp
+++ b/src/models/codeshell.cpp
@ -1,11 +1,11 @@
 #include "models.h"

 llm_build_codeshell::llm_build_codeshell(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_head = hparams.n_embd_head_v();
    const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();

-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-    GGML_ASSERT(n_embd_head == hparams.n_rot);
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+    GGML_ASSERT(n_embd_head == n_rot);

    ggml_tensor * cur;
    ggml_tensor * inpL;
--- a/src/models/cogvlm.cpp
+++ b/src/models/cogvlm.cpp
@ -2,11 +2,11 @@

 llm_build_cogvlm::llm_build_cogvlm(const llama_model & model, const llm_graph_params & params) :
    llm_graph_context(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_head = hparams.n_embd_head_v();
    const float   kq_scale    = 1.0f / sqrtf(float(n_embd_head));

-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-    GGML_ASSERT(n_embd_head == hparams.n_rot);
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+    GGML_ASSERT(n_embd_head == n_rot);

    ggml_tensor * inpL;
    ggml_tensor * cur;
--- a/src/models/cohere2-iswa.cpp
+++ b/src/models/cohere2-iswa.cpp
@ -1,9 +1,9 @@
 #include "models.h"

 llm_build_cohere2_iswa::llm_build_cohere2_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_head = hparams.n_embd_head_v();

-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());

    const float f_logit_scale = hparams.f_logit_scale;

--- a/src/models/command-r.cpp
+++ b/src/models/command-r.cpp
@ -4,9 +4,9 @@

 llm_build_command_r::llm_build_command_r(const llama_model & model, const llm_graph_params & params) :
    llm_graph_context(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_head = hparams.n_embd_head_v();

-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());

    const float f_logit_scale = hparams.f_logit_scale;

--- a/src/models/dbrx.cpp
+++ b/src/models/dbrx.cpp
@ -1,11 +1,11 @@
 #include "models.h"

 llm_build_dbrx::llm_build_dbrx(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_head = hparams.n_embd_head_v();
    const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();

-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-    GGML_ASSERT(n_embd_head == hparams.n_rot);
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+    GGML_ASSERT(n_embd_head == n_rot);

    ggml_tensor * cur;
    ggml_tensor * inpL;
--- a/src/models/deci.cpp
+++ b/src/models/deci.cpp
@ -3,10 +3,10 @@


 llm_build_deci::llm_build_deci(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_head = hparams.n_embd_head_v();

-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-    GGML_ASSERT(n_embd_head == hparams.n_rot);
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+    GGML_ASSERT(n_embd_head == n_rot);

    ggml_tensor * cur;
    ggml_tensor * inpL;
--- a/src/models/deepseek.cpp
+++ b/src/models/deepseek.cpp
@ -2,10 +2,10 @@

 llm_build_deepseek::llm_build_deepseek(const llama_model & model, const llm_graph_params & params) :
    llm_graph_context(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_head = hparams.n_embd_head_v();

-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-    GGML_ASSERT(n_embd_head == hparams.n_rot);
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+    GGML_ASSERT(n_embd_head == n_rot);

    ggml_tensor * cur;
    ggml_tensor * inpL;
--- a/src/models/deepseek2.cpp
+++ b/src/models/deepseek2.cpp
@ -8,7 +8,7 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
    const int64_t n_embd_head_k = hparams.n_embd_head_k_mla();
    const int64_t n_embd_head_v = hparams.n_embd_head_v_mla();

-    const int64_t n_embd_head_qk_rope = hparams.n_rot;
+    const int64_t n_embd_head_qk_rope = hparams.n_rot();
    const int64_t n_embd_head_qk_nope = n_embd_head_k - n_embd_head_qk_rope;

    const uint32_t kv_lora_rank = hparams.n_lora_kv;
--- a/src/models/dots1.cpp
+++ b/src/models/dots1.cpp
@ -2,10 +2,10 @@

 llm_build_dots1::llm_build_dots1(const llama_model & model, const llm_graph_params & params) :
    llm_graph_context(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_head = hparams.n_embd_head_v();

-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-    GGML_ASSERT(n_embd_head == hparams.n_rot);
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+    GGML_ASSERT(n_embd_head == n_rot);

    ggml_tensor * cur;
    ggml_tensor * inpL;
--- a/src/models/dream.cpp
+++ b/src/models/dream.cpp
@ -5,10 +5,10 @@
 llm_build_dream::llm_build_dream(const llama_model & model, const llm_graph_params & params) :
    llm_graph_context(params) {
    //copied from qwen2
-    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_head = hparams.n_embd_head_v();

-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-    GGML_ASSERT(n_embd_head == hparams.n_rot);
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+    GGML_ASSERT(n_embd_head == n_rot);

    ggml_tensor * cur;
    ggml_tensor * inpL;
--- a/src/models/ernie4-5-moe.cpp
+++ b/src/models/ernie4-5-moe.cpp
@ -2,10 +2,10 @@

 llm_build_ernie4_5_moe::llm_build_ernie4_5_moe(const llama_model & model, const llm_graph_params & params) :
    llm_graph_context(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_head = hparams.n_embd_head_v();

-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-    GGML_ASSERT(n_embd_head == hparams.n_rot);
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+    GGML_ASSERT(n_embd_head == n_rot);

    ggml_tensor * cur;
    ggml_tensor * inpL;
--- a/src/models/ernie4-5.cpp
+++ b/src/models/ernie4-5.cpp
@ -2,10 +2,10 @@

 llm_build_ernie4_5::llm_build_ernie4_5(const llama_model & model, const llm_graph_params & params) :
    llm_graph_context(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_head = hparams.n_embd_head_v();

-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-    GGML_ASSERT(n_embd_head == hparams.n_rot);
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+    GGML_ASSERT(n_embd_head == n_rot);

    ggml_tensor * cur;
    ggml_tensor * inpL;
--- a/src/models/eurobert.cpp
+++ b/src/models/eurobert.cpp
@ -1,9 +1,9 @@
 #include "models.h"

 llm_build_eurobert::llm_build_eurobert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_head = hparams.n_embd_head_v();

-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());

    ggml_tensor * cur;
    ggml_tensor * inpL;
--- a/src/models/exaone-moe.cpp
+++ b/src/models/exaone-moe.cpp
@ -2,10 +2,10 @@

 llm_build_exaone_moe::llm_build_exaone_moe(const llama_model & model, const llm_graph_params & params) :
    llm_graph_context(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_k;
+    const int64_t n_embd_head = hparams.n_embd_head_k();

-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_v);
-    GGML_ASSERT(n_embd_head == hparams.n_rot);
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_v());
+    GGML_ASSERT(n_embd_head == n_rot);

    ggml_tensor * cur;
    ggml_tensor * inpL;
--- a/src/models/exaone.cpp
+++ b/src/models/exaone.cpp
@ -4,10 +4,10 @@

 llm_build_exaone::llm_build_exaone(const llama_model & model, const llm_graph_params & params) :
    llm_graph_context(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_head = hparams.n_embd_head_v();

-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-    GGML_ASSERT(n_embd_head == hparams.n_rot);
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+    GGML_ASSERT(n_embd_head == n_rot);

    ggml_tensor * cur;
    ggml_tensor * inpL;
--- a/src/models/exaone4.cpp
+++ b/src/models/exaone4.cpp
@ -4,10 +4,10 @@
 template <bool iswa>
 llm_build_exaone4<iswa>::llm_build_exaone4(const llama_model & model, const llm_graph_params & params) :
    llm_graph_context(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_k;
+    const int64_t n_embd_head = hparams.n_embd_head_k();

-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_v);
-    GGML_ASSERT(n_embd_head == hparams.n_rot);
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_v());
+    GGML_ASSERT(n_embd_head == n_rot);

    ggml_tensor * cur;
    ggml_tensor * inpL;
--- a/src/models/falcon-h1.cpp
+++ b/src/models/falcon-h1.cpp
@ -2,7 +2,7 @@

 llm_build_falcon_h1::llm_build_falcon_h1(const llama_model & model, const llm_graph_params & params) :
    llm_build_mamba_base(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_head = hparams.n_embd_head_v();

    ggml_tensor * cur;
    ggml_tensor * inpL;
--- a/src/models/falcon.cpp
+++ b/src/models/falcon.cpp
@ -2,11 +2,11 @@


 llm_build_falcon::llm_build_falcon(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_head = hparams.n_embd_head_v();
    const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();

-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-    GGML_ASSERT(n_embd_head == hparams.n_rot);
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+    GGML_ASSERT(n_embd_head == n_rot);

    ggml_tensor * cur;
    ggml_tensor * inpL;
--- a/src/models/gemma-embedding.cpp
+++ b/src/models/gemma-embedding.cpp
@ -2,7 +2,7 @@

 llm_build_gemma_embedding::llm_build_gemma_embedding(const llama_model & model, const llm_graph_params & params) :
    llm_graph_context(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_k;
+    const int64_t n_embd_head = hparams.n_embd_head_k();

    ggml_tensor * cur;
    ggml_tensor * inpL;
--- a/src/models/gemma.cpp
+++ b/src/models/gemma.cpp
@ -2,7 +2,7 @@


 llm_build_gemma::llm_build_gemma(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_head = hparams.n_embd_head_v();

    ggml_tensor * cur;
    ggml_tensor * inpL;
--- a/src/models/gemma2-iswa.cpp
+++ b/src/models/gemma2-iswa.cpp
@ -1,7 +1,7 @@
 #include "models.h"

 llm_build_gemma2_iswa::llm_build_gemma2_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_k;
+    const int64_t n_embd_head = hparams.n_embd_head_k();

    ggml_tensor * cur;
    ggml_tensor * inpL;
--- a/src/models/gemma3.cpp
+++ b/src/models/gemma3.cpp
@ -2,7 +2,7 @@

 template <bool iswa>
 llm_build_gemma3<iswa>::llm_build_gemma3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_k;
+    const int64_t n_embd_head = hparams.n_embd_head_k();

    ggml_tensor * cur;
    ggml_tensor * inpL;
--- a/src/models/gemma3n-iswa.cpp
+++ b/src/models/gemma3n-iswa.cpp
@ -3,7 +3,7 @@
 llm_build_gemma3n_iswa::llm_build_gemma3n_iswa(const llama_model & model, const llm_graph_params & params) :
    llm_graph_context(params),
    model(model),
-    n_embd_head(model.hparams.n_embd_head_k),
+    n_embd_head(model.hparams.n_embd_head_k()),
    n_embd_altup(model.hparams.n_embd_altup),
    n_altup(model.hparams.n_altup),
    i_altup_act(model.hparams.i_altup_act) {
--- a/src/models/glm4-moe.cpp
+++ b/src/models/glm4-moe.cpp
@ -1,9 +1,9 @@
 #include "models.h"

 llm_build_glm4_moe::llm_build_glm4_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_head = hparams.n_embd_head_v();

-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());

    int sections[4];
    std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
--- a/src/models/glm4.cpp
+++ b/src/models/glm4.cpp
@ -3,10 +3,10 @@


 llm_build_glm4::llm_build_glm4(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_head = hparams.n_embd_head_v();
    const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();

-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());

    int sections[4];
    std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
--- a/src/models/gpt2.cpp
+++ b/src/models/gpt2.cpp
@ -1,10 +1,10 @@
 #include "models.h"

 llm_build_gpt2::llm_build_gpt2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_head = hparams.n_embd_head_v();
    const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();

-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());

    ggml_tensor * cur;
    ggml_tensor * pos;
--- a/src/models/gptneox.cpp
+++ b/src/models/gptneox.cpp
@ -2,10 +2,10 @@


 llm_build_gptneox::llm_build_gptneox(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_head = hparams.n_embd_head_v();
    const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();

-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());

    ggml_tensor * cur;
    ggml_tensor * inpL;
--- a/src/models/granite-hybrid.cpp
+++ b/src/models/granite-hybrid.cpp
@ -2,8 +2,8 @@

 llm_build_granite_hybrid::llm_build_granite_hybrid(const llama_model & model, const llm_graph_params & params) :
    llm_build_mamba_base(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_v;
-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    const int64_t n_embd_head = hparams.n_embd_head_v();
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());

    ggml_tensor * cur;
    ggml_tensor * inpL;
--- a/src/models/granite.cpp
+++ b/src/models/granite.cpp
@ -5,10 +5,10 @@ llm_build_granite::llm_build_granite(
    const llm_graph_params & params)
    : llm_graph_context(params) {

-    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_head = hparams.n_embd_head_v();

-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-    GGML_ASSERT(n_embd_head == hparams.n_rot);
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+    GGML_ASSERT(n_embd_head == n_rot);

    ggml_tensor * cur;
    ggml_tensor * inpL;
--- a/src/models/grok.cpp
+++ b/src/models/grok.cpp
@ -1,10 +1,10 @@
 #include "models.h"

 llm_build_grok::llm_build_grok(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_head = hparams.n_embd_head_v();

-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-    GGML_ASSERT(n_embd_head == hparams.n_rot);
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+    GGML_ASSERT(n_embd_head == n_rot);

    ggml_tensor * cur;
    ggml_tensor * inpL;
--- a/src/models/grovemoe.cpp
+++ b/src/models/grovemoe.cpp
@ -2,11 +2,11 @@

 llm_build_grovemoe::llm_build_grovemoe(const llama_model & model, const llm_graph_params & params) :
    llm_graph_context(params) {
-    const int64_t n_embd_head    = hparams.n_embd_head_v;
+    const int64_t n_embd_head    = hparams.n_embd_head_v();
    const int64_t n_chunk_expert = n_expert / hparams.n_group_experts;

-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-    GGML_ASSERT(n_embd_head == hparams.n_rot);
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+    GGML_ASSERT(n_embd_head == n_rot);

    ggml_tensor * cur;
    ggml_tensor * inpL;
--- a/src/models/hunyuan-dense.cpp
+++ b/src/models/hunyuan-dense.cpp
@ -1,10 +1,10 @@
 #include "models.h"

 llm_build_hunyuan_dense::llm_build_hunyuan_dense(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_head = hparams.n_embd_head_v();

-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-    GGML_ASSERT(n_embd_head == hparams.n_rot);
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+    GGML_ASSERT(n_embd_head == n_rot);

    ggml_tensor * cur;
    ggml_tensor * inpL;
--- a/src/models/hunyuan-moe.cpp
+++ b/src/models/hunyuan-moe.cpp
@ -1,10 +1,10 @@
 #include "models.h"

 llm_build_hunyuan_moe::llm_build_hunyuan_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_head = hparams.n_embd_head_v();

-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-    GGML_ASSERT(n_embd_head == hparams.n_rot);
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+    GGML_ASSERT(n_embd_head == n_rot);

    ggml_tensor * cur;
    ggml_tensor * inpL;
--- a/src/models/internlm2.cpp
+++ b/src/models/internlm2.cpp
@ -1,10 +1,10 @@
 #include "models.h"

 llm_build_internlm2::llm_build_internlm2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_head = hparams.n_embd_head_v();

-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-    GGML_ASSERT(n_embd_head == hparams.n_rot);
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+    GGML_ASSERT(n_embd_head == n_rot);

    ggml_tensor * cur;
    ggml_tensor * inpL;
--- a/src/models/jais.cpp
+++ b/src/models/jais.cpp
@ -1,10 +1,10 @@
 #include "models.h"

 llm_build_jais::llm_build_jais(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_head = hparams.n_embd_head_v();
    const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();

-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());

    ggml_tensor * cur;
    ggml_tensor * inpL;
--- a/src/models/jais2.cpp
+++ b/src/models/jais2.cpp
@ -3,10 +3,10 @@
 // JAIS-2 model graph builder
 // Uses: LayerNorm (not RMSNorm), relu2 activation, separate Q/K/V, RoPE embeddings
 llm_build_jais2::llm_build_jais2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_head = hparams.n_embd_head_v();

-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-    GGML_ASSERT(n_embd_head == hparams.n_rot);
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+    GGML_ASSERT(n_embd_head == n_rot);

    ggml_tensor * cur;
    ggml_tensor * inpL;
--- a/src/models/jamba.cpp
+++ b/src/models/jamba.cpp
@ -1,7 +1,7 @@
 #include "models.h"

 llm_build_jamba::llm_build_jamba(const llama_model & model, const llm_graph_params & params) : llm_build_mamba_base(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_head = hparams.n_embd_head_v();

    ggml_tensor * cur;
    ggml_tensor * inpL;
--- a/src/models/kimi-linear.cpp
+++ b/src/models/kimi-linear.cpp
@ -102,7 +102,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll
    const int64_t kv_lora_rank = hparams.n_lora_kv;
    // qk_rope_head_dim = 64 (from Kimi config) which is hparams.n_rot
    // Confirmed from tensor shape: wkv_a_mqa [2304, 576] = [n_embd, kv_lora_rank + qk_rope_head_dim]
-    const int64_t n_embd_head_qk_rope = hparams.n_rot;  // config.qk_rope_head_dim
+    const int64_t n_embd_head_qk_rope = hparams.n_rot();  // config.qk_rope_head_dim
    const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope;  // 192 - 64 = 128
    // Attention scale for MLA
    const float kq_scale_mla = 1.0f / sqrtf((float)n_embd_head_k_mla);
--- a/src/models/lfm2.cpp
+++ b/src/models/lfm2.cpp
@ -39,7 +39,7 @@ llm_build_lfm2<iswa>::llm_build_lfm2(const llama_model & model, const llm_graph_
                                           inp_attn_type * inp_attn,
                                           int             il) -> ggml_tensor * {
        GGML_ASSERT(hparams.n_embd_v_gqa(il) == hparams.n_embd_k_gqa(il));
-        const auto n_embd_head = hparams.n_embd_head_v;
+        const auto n_embd_head = hparams.n_embd_head_v();
        const auto n_head_kv   = hparams.n_head_kv(il);

        auto * q = build_lora_mm(model.layers[il].wq, cur);
--- a/src/models/llada-moe.cpp
+++ b/src/models/llada-moe.cpp
@ -1,10 +1,10 @@
 #include "models.h"

 llm_build_llada_moe::llm_build_llada_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_head = hparams.n_embd_head_v();

-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-    GGML_ASSERT(n_embd_head == hparams.n_rot);
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+    GGML_ASSERT(n_embd_head == n_rot);

    ggml_tensor * cur;
    ggml_tensor * inpL;
--- a/src/models/llada.cpp
+++ b/src/models/llada.cpp
@ -2,10 +2,10 @@

 llm_build_llada::llm_build_llada(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
    // LLaDA is similar to LLaMA but uses non-causal attention for diffusion
-    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_head = hparams.n_embd_head_v();

-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-    GGML_ASSERT(n_embd_head == hparams.n_rot);
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+    GGML_ASSERT(n_embd_head == n_rot);

    ggml_tensor * cur;
    ggml_tensor * inpL;
--- a/src/models/llama-iswa.cpp
+++ b/src/models/llama-iswa.cpp
@ -1,10 +1,10 @@
 #include "models.h"

 llm_build_llama_iswa::llm_build_llama_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_head = hparams.n_embd_head_v();

-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-    GGML_ASSERT(n_embd_head == hparams.n_rot);
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+    GGML_ASSERT(n_embd_head == n_rot);

    ggml_tensor * cur;
    ggml_tensor * inpL;
--- a/src/models/llama.cpp
+++ b/src/models/llama.cpp
@ -2,10 +2,10 @@

 template <bool embed>
 llm_build_llama<embed>::llm_build_llama(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_head = hparams.n_embd_head_v();

-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-    GGML_ASSERT(n_embd_head == hparams.n_rot);
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+    GGML_ASSERT(n_embd_head == n_rot);

    ggml_tensor * cur;
    ggml_tensor * inpL;
--- a/src/models/maincoder.cpp
+++ b/src/models/maincoder.cpp
@ -1,10 +1,10 @@
 #include "models.h"

 llm_build_maincoder::llm_build_maincoder(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_head = hparams.n_embd_head_v();

-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-    GGML_ASSERT(n_embd_head == hparams.n_rot);
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+    GGML_ASSERT(n_embd_head == n_rot);

    ggml_tensor * cur;
    ggml_tensor * inpL;
--- a/src/models/minicpm3.cpp
+++ b/src/models/minicpm3.cpp
@ -5,10 +5,10 @@ llm_build_minicpm3::llm_build_minicpm3(const llama_model & model, const llm_grap
    const int64_t n_embd_base = 256;
    const float scale_embd  = 12.0f;
    const float scale_depth = 1.4f;
-    const float kq_scale = 1.0f / sqrtf(float(hparams.n_embd_head_k));
+    const float kq_scale = 1.0f / sqrtf(float(hparams.n_embd_head_k()));

-    const uint32_t n_embd_head_qk_rope = hparams.n_rot;
-    const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
+    const uint32_t n_embd_head_qk_rope = hparams.n_rot();
+    const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k() - hparams.n_rot();

    const uint32_t kv_lora_rank = hparams.n_lora_kv;

@ -51,21 +51,21 @@ llm_build_minicpm3::llm_build_minicpm3(const llama_model & model, const llm_grap
                    LLM_NORM_RMS, il);
            cb(q, "q", il);

-            // {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens}
+            // {q_lora_rank, n_head * hparams.n_embd_head_k()} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k(), n_tokens}
            q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
            cb(q, "q", il);

            // split into {n_head * n_embd_head_qk_nope, n_tokens}
            ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
-                    ggml_row_size(q->type, hparams.n_embd_head_k),
-                    ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
+                    ggml_row_size(q->type, hparams.n_embd_head_k()),
+                    ggml_row_size(q->type, hparams.n_embd_head_k() * n_head),
                    0);
            cb(q_nope, "q_nope", il);

            // and {n_head * n_embd_head_qk_rope, n_tokens}
            ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
-                    ggml_row_size(q->type, hparams.n_embd_head_k),
-                    ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
+                    ggml_row_size(q->type, hparams.n_embd_head_k()),
+                    ggml_row_size(q->type, hparams.n_embd_head_k() * n_head),
                    ggml_row_size(q->type, n_embd_head_qk_nope));
            cb(q_pe, "q_pe", il);

@ -97,15 +97,15 @@ llm_build_minicpm3::llm_build_minicpm3(const llama_model & model, const llm_grap

            // split into {n_head * n_embd_head_qk_nope, n_tokens}
            ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
-                    ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
-                    ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
+                    ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v()),
+                    ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v())),
                    0);
            cb(k_nope, "k_nope", il);

            // and {n_head * n_embd_head_v, n_tokens}
-            ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
-                    ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
-                    ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
+            ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v(), n_head, n_tokens,
+                    ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v())),
+                    ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v())*n_head),
                    ggml_row_size(kv->type, (n_embd_head_qk_nope)));
            cb(v_states, "v_states", il);

--- a/src/models/minimax-m2.cpp
+++ b/src/models/minimax-m2.cpp
@ -1,10 +1,10 @@
 #include "models.h"

 llm_build_minimax_m2::llm_build_minimax_m2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_head = hparams.n_embd_head_v();

-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-    // GGML_ASSERT(n_embd_head == hparams.n_rot); this is wrong in case of minimax, head_dim = 128, n_rot = 64
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+    // GGML_ASSERT(n_embd_head == n_rot); this is wrong in case of minimax, head_dim = 128, n_rot = 64

    ggml_tensor * cur;
    ggml_tensor * inpL;
--- a/src/models/mistral3.cpp
+++ b/src/models/mistral3.cpp
@ -1,10 +1,10 @@
 #include "models.h"

 llm_build_mistral3::llm_build_mistral3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_head = hparams.n_embd_head_v();

-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-    GGML_ASSERT(n_embd_head == hparams.n_rot);
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+    GGML_ASSERT(n_embd_head == n_rot);

    ggml_tensor * cur;
    ggml_tensor * inpL;
--- a/src/models/modern-bert.cpp
+++ b/src/models/modern-bert.cpp
@ -1,10 +1,10 @@
 #include "models.h"

 llm_build_modern_bert::llm_build_modern_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_head = hparams.n_embd_head_v();
    const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();

-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());

    ggml_tensor * cur;
    ggml_tensor * inpL;
--- a/src/models/mpt.cpp
+++ b/src/models/mpt.cpp
@ -3,10 +3,10 @@


 llm_build_mpt::llm_build_mpt(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_head = hparams.n_embd_head_v();
    const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();

-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());

    ggml_tensor * cur;
    ggml_tensor * pos;
--- a/src/models/nemotron-h.cpp
+++ b/src/models/nemotron-h.cpp
@ -2,8 +2,8 @@

 llm_build_nemotron_h::llm_build_nemotron_h(const llama_model & model, const llm_graph_params & params) :
    llm_build_mamba_base(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_v;
-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    const int64_t n_embd_head = hparams.n_embd_head_v();
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());

    ggml_tensor * cur;
    ggml_tensor * inpL;
--- a/src/models/nemotron.cpp
+++ b/src/models/nemotron.cpp
@ -1,10 +1,10 @@
 #include "models.h"

 llm_build_nemotron::llm_build_nemotron(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_head = hparams.n_embd_head_v();

-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-    //GGML_ASSERT(n_embd_head == hparams.n_rot);
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+    //GGML_ASSERT(n_embd_head == n_rot);

    ggml_tensor * cur;
    ggml_tensor * inpL;
--- a/src/models/neo-bert.cpp
+++ b/src/models/neo-bert.cpp
@ -1,10 +1,10 @@
 #include "models.h"

 llm_build_neo_bert::llm_build_neo_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_head = hparams.n_embd_head_v();
    const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();

-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());

    ggml_tensor * cur;
    ggml_tensor * inpL;
--- a/src/models/olmo.cpp
+++ b/src/models/olmo.cpp
@ -1,10 +1,10 @@
 #include "models.h"

 llm_build_olmo::llm_build_olmo(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_head = hparams.n_embd_head_v();

-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-    GGML_ASSERT(n_embd_head == hparams.n_rot);
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+    GGML_ASSERT(n_embd_head == n_rot);

    ggml_tensor * cur;
    ggml_tensor * inpL;
--- a/src/models/olmo2.cpp
+++ b/src/models/olmo2.cpp
@ -2,10 +2,10 @@

 template <bool iswa>
 llm_build_olmo2<iswa>::llm_build_olmo2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_head = hparams.n_embd_head_v();

-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-    GGML_ASSERT(n_embd_head == hparams.n_rot);
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+    GGML_ASSERT(n_embd_head == n_rot);

    ggml_tensor * cur;
    ggml_tensor * inpL;
--- a/src/models/olmoe.cpp
+++ b/src/models/olmoe.cpp
@ -1,10 +1,10 @@
 #include "models.h"

 llm_build_olmoe::llm_build_olmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_head = hparams.n_embd_head_v();

-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-    GGML_ASSERT(n_embd_head == hparams.n_rot);
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+    GGML_ASSERT(n_embd_head == n_rot);

    ggml_tensor * cur;
    ggml_tensor * inpL;
--- a/src/models/openelm.cpp
+++ b/src/models/openelm.cpp
@ -1,9 +1,9 @@
 #include "models.h"

 llm_build_openelm::llm_build_openelm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_head = hparams.n_embd_head_v();

-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());

    ggml_tensor * cur;
    ggml_tensor * inpL;
--- a/src/models/orion.cpp
+++ b/src/models/orion.cpp
@ -1,10 +1,10 @@
 #include "models.h"

 llm_build_orion::llm_build_orion(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_head = hparams.n_embd_head_v();

-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-    GGML_ASSERT(n_embd_head == hparams.n_rot);
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+    GGML_ASSERT(n_embd_head == n_rot);

    ggml_tensor * cur;
    ggml_tensor * inpL;
--- a/src/models/paddleocr.cpp
+++ b/src/models/paddleocr.cpp
@ -5,10 +5,10 @@ llm_build_paddleocr::llm_build_paddleocr(const llama_model & model, const llm_gr

    // NOTE: same with qwen2vl.cpp, but bias tensors are optional

-    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_head = hparams.n_embd_head_v();

-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-    GGML_ASSERT(n_embd_head == hparams.n_rot);
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+    GGML_ASSERT(n_embd_head == n_rot);

    ggml_tensor * cur;
    ggml_tensor * inpL;
--- a/src/models/pangu-embedded.cpp
+++ b/src/models/pangu-embedded.cpp
@ -2,10 +2,10 @@


 llm_build_pangu_embedded::llm_build_pangu_embedded(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_head = hparams.n_embd_head_v();

-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-    GGML_ASSERT(n_embd_head == hparams.n_rot);
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+    GGML_ASSERT(n_embd_head == n_rot);

    ggml_tensor * cur;
    ggml_tensor * inpL;
--- a/src/models/phi2.cpp
+++ b/src/models/phi2.cpp
@ -2,10 +2,10 @@


 llm_build_phi2::llm_build_phi2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_head = hparams.n_embd_head_v();
    const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();

-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());

    ggml_tensor * cur;
    ggml_tensor * attn_norm_output;
--- a/src/models/phi3.cpp
+++ b/src/models/phi3.cpp
@ -2,10 +2,10 @@

 template<bool iswa>
 llm_build_phi3<iswa>::llm_build_phi3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_head = hparams.n_embd_head_v();
    const int64_t n_embd_gqa = hparams.n_embd_v_gqa();

-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());

    ggml_tensor * cur;
    ggml_tensor * inpL;
--- a/src/models/plamo.cpp
+++ b/src/models/plamo.cpp
@ -1,10 +1,10 @@
 #include "models.h"

 llm_build_plamo::llm_build_plamo(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_head = hparams.n_embd_head_v();

-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-    GGML_ASSERT(n_embd_head == hparams.n_rot);
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+    GGML_ASSERT(n_embd_head == n_rot);

    ggml_tensor * cur;
    ggml_tensor * inpL;
--- a/src/models/plamo2.cpp
+++ b/src/models/plamo2.cpp
@ -106,9 +106,9 @@ ggml_tensor * llm_build_plamo2::build_plamo2_attn_layer(llm_graph_input_attn_kv
        cb(qkv, "wqkv", il);

        // split QKV tensor into Q, K, V
-        const int64_t n_embd_head_q = hparams.n_embd_head_k;
-        const int64_t n_embd_head_k = hparams.n_embd_head_k;
-        const int64_t n_embd_head_v = hparams.n_embd_head_v;
+        const int64_t n_embd_head_q = hparams.n_embd_head_k();
+        const int64_t n_embd_head_k = hparams.n_embd_head_k();
+        const int64_t n_embd_head_v = hparams.n_embd_head_v();
        int32_t       n_head        = hparams.n_head(il);
        int32_t       n_head_kv     = hparams.n_head_kv(il);

--- a/src/models/plamo3.cpp
+++ b/src/models/plamo3.cpp
@ -3,8 +3,8 @@
 template <bool iswa>
 llm_build_plamo3<iswa>::llm_build_plamo3(const llama_model & model, const llm_graph_params & params) :
    llm_graph_context(params) {
-    const int64_t head_dim_q = hparams.n_embd_head_k;
-    const int64_t head_dim_v = hparams.n_embd_head_v;
+    const int64_t head_dim_q = hparams.n_embd_head_k();
+    const int64_t head_dim_v = hparams.n_embd_head_v();

    ggml_tensor * cur;
    ggml_tensor * inpL = build_inp_embd(model.tok_embd);
--- a/src/models/plm.cpp
+++ b/src/models/plm.cpp
@ -1,10 +1,10 @@
 #include "models.h"

 llm_build_plm::llm_build_plm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-    const float kq_scale = 1.0f/sqrtf(float(hparams.n_embd_head_k));
+    const float kq_scale = 1.0f/sqrtf(float(hparams.n_embd_head_k()));

-    const uint32_t n_embd_head_qk_rope = hparams.n_rot;
-    const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;
+    const uint32_t n_embd_head_qk_rope = hparams.n_rot();
+    const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k() - hparams.n_rot();

    const uint32_t kv_lora_rank = hparams.n_lora_kv;

@ -38,15 +38,15 @@ llm_build_plm::llm_build_plm(const llama_model & model, const llm_graph_params &

            // split into {n_head * n_embd_head_qk_nope, n_tokens}
            ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
-                    ggml_row_size(q->type, hparams.n_embd_head_k),
-                    ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
+                    ggml_row_size(q->type, hparams.n_embd_head_k()),
+                    ggml_row_size(q->type, hparams.n_embd_head_k() * n_head),
                    0);
            cb(q_nope, "q_nope", il);

            // and {n_head * n_embd_head_qk_rope, n_tokens}
            ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
-                    ggml_row_size(q->type, hparams.n_embd_head_k),
-                    ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
+                    ggml_row_size(q->type, hparams.n_embd_head_k()),
+                    ggml_row_size(q->type, hparams.n_embd_head_k() * n_head),
                    ggml_row_size(q->type, n_embd_head_qk_nope));
            cb(q_pe, "q_pe", il);

@ -78,23 +78,23 @@ llm_build_plm::llm_build_plm(const llama_model & model, const llm_graph_params &

            // split into {n_head * n_embd_head_qk_nope, n_tokens}
            ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
-                    ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
-                    ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
+                    ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v()),
+                    ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v())),
                    0);
            cb(k_nope, "k_nope", il);

            // and {n_head * n_embd_head_v, n_tokens}
-            ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
-                    ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
-                    ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
+            ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v(), n_head, n_tokens,
+                    ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v())),
+                    ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v())*n_head),
                    ggml_row_size(kv->type, (n_embd_head_qk_nope)));
            cb(v_states, "v_states", il);

            v_states = ggml_cont(ctx0, v_states);
            cb(v_states, "v_states", il);

-            v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens,
-                    ggml_row_size(kv->type, hparams.n_embd_head_v * n_head),
+            v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v() * n_head, n_tokens,
+                    ggml_row_size(kv->type, hparams.n_embd_head_v() * n_head),
                    0);
            cb(v_states, "v_states", il);

--- a/src/models/qwen.cpp
+++ b/src/models/qwen.cpp
@ -2,9 +2,9 @@


 llm_build_qwen::llm_build_qwen(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_head = hparams.n_embd_head_v();

-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());

    ggml_tensor * cur;
    ggml_tensor * inpL;
--- a/src/models/qwen2.cpp
+++ b/src/models/qwen2.cpp
@ -1,10 +1,10 @@
 #include "models.h"

 llm_build_qwen2::llm_build_qwen2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_head = hparams.n_embd_head_v();

-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-    GGML_ASSERT(n_embd_head == hparams.n_rot);
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+    GGML_ASSERT(n_embd_head == n_rot);

    ggml_tensor * cur;
    ggml_tensor * inpL;
--- a/src/models/qwen2moe.cpp
+++ b/src/models/qwen2moe.cpp
@ -1,10 +1,10 @@
 #include "models.h"

 llm_build_qwen2moe::llm_build_qwen2moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_head = hparams.n_embd_head_v();

-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-    GGML_ASSERT(n_embd_head == hparams.n_rot);
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+    GGML_ASSERT(n_embd_head == n_rot);

    ggml_tensor * cur;
    ggml_tensor * inpL;
--- a/src/models/qwen2vl.cpp
+++ b/src/models/qwen2vl.cpp
@ -1,10 +1,10 @@
 #include "models.h"

 llm_build_qwen2vl::llm_build_qwen2vl(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_head = hparams.n_embd_head_v();

-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-    GGML_ASSERT(n_embd_head == hparams.n_rot);
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+    GGML_ASSERT(n_embd_head == n_rot);

    ggml_tensor * cur;
    ggml_tensor * inpL;
--- a/src/models/qwen3.cpp
+++ b/src/models/qwen3.cpp
@ -1,10 +1,10 @@
 #include "models.h"

 llm_build_qwen3::llm_build_qwen3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_head = hparams.n_embd_head_v();

-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-    GGML_ASSERT(n_embd_head == hparams.n_rot);
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+    GGML_ASSERT(n_embd_head == n_rot);

    ggml_tensor * cur;
    ggml_tensor * inpL;
--- a/src/models/qwen35.cpp
+++ b/src/models/qwen35.cpp
@ -4,9 +4,9 @@

 llm_build_qwen35::llm_build_qwen35(const llama_model & model, const llm_graph_params & params) :
    llm_build_delta_net_base(params), model(model) {
-    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_head = hparams.n_embd_head_v();

-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());

    int sections[4];
    std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
@ -117,8 +117,8 @@ ggml_tensor * llm_build_qwen35::build_layer_attn(
        ggml_tensor *             inp_pos,
        int *                     sections,
        int                       il) {
-    const int64_t n_embd_head = hparams.n_embd_head_v;
-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    const int64_t n_embd_head = hparams.n_embd_head_v();
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());

    // Order: joint QG projection, QG split, Q norm, KV projection, K norm, RoPE, attention

--- a/src/models/qwen35moe.cpp
+++ b/src/models/qwen35moe.cpp
@ -4,9 +4,9 @@

 llm_build_qwen35moe::llm_build_qwen35moe(const llama_model & model, const llm_graph_params & params) :
    llm_build_delta_net_base(params), model(model) {
-    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_head = hparams.n_embd_head_v();

-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());

    int sections[4];
    std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
@ -117,8 +117,8 @@ ggml_tensor * llm_build_qwen35moe ::build_layer_attn(
        ggml_tensor *             inp_pos,
        int *                     sections,
        int                       il) {
-    const int64_t n_embd_head = hparams.n_embd_head_v;
-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    const int64_t n_embd_head = hparams.n_embd_head_v();
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());

    // Order: joint QG projection, QG split, Q norm, KV projection, K norm, RoPE, attention

--- a/src/models/qwen3moe.cpp
+++ b/src/models/qwen3moe.cpp
@ -1,10 +1,10 @@
 #include "models.h"

 llm_build_qwen3moe::llm_build_qwen3moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) {
-    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_head = hparams.n_embd_head_v();

-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-    GGML_ASSERT(n_embd_head == hparams.n_rot);
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+    GGML_ASSERT(n_embd_head == n_rot);

    ggml_tensor * cur;
    ggml_tensor * inpL;
--- a/src/models/qwen3next.cpp
+++ b/src/models/qwen3next.cpp
@ -100,8 +100,8 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn(
        ggml_tensor *             cur,
        ggml_tensor *             inp_pos,
        int                       il) {
-    const int64_t n_embd_head = hparams.n_embd_head_v;
-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+    const int64_t n_embd_head = hparams.n_embd_head_v();
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());

    // Order: joint QG projection, QG split, Q norm, KV projection, K norm, RoPE, attention

--- a/src/models/qwen3vl-moe.cpp
+++ b/src/models/qwen3vl-moe.cpp
@ -4,10 +4,10 @@ llm_build_qwen3vlmoe::llm_build_qwen3vlmoe(const llama_model & model, const llm_
    const size_t n_deepstack_layers = hparams.n_deepstack_layers;

    const int64_t n_embd      = hparams.n_embd;
-    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_head = hparams.n_embd_head_v();

-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-    GGML_ASSERT(n_embd_head == hparams.n_rot);
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+    GGML_ASSERT(n_embd_head == n_rot);

    ggml_tensor * cur;
    ggml_tensor * inpL;
--- a/src/models/qwen3vl.cpp
+++ b/src/models/qwen3vl.cpp
@ -4,10 +4,10 @@ llm_build_qwen3vl::llm_build_qwen3vl(const llama_model & model, const llm_graph_
    const size_t n_deepstack_layers = hparams.n_deepstack_layers;

    const int64_t n_embd      = hparams.n_embd;
-    const int64_t n_embd_head = hparams.n_embd_head_v;
+    const int64_t n_embd_head = hparams.n_embd_head_v();

-    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-    GGML_ASSERT(n_embd_head == hparams.n_rot);
+    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k());
+    GGML_ASSERT(n_embd_head == n_rot);

    ggml_tensor * cur;
    ggml_tensor * inpL;
--- a/Show More
+++ b/Show More