diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 839c6e787f..c5f5469506 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -177,6 +177,8 @@ class Keys: TEMPERATURE_LENGTH = "{arch}.attention.temperature_length" KEY_LENGTH_MLA = "{arch}.attention.key_length_mla" VALUE_LENGTH_MLA = "{arch}.attention.value_length_mla" + KEY_LENGTH_SWA = "{arch}.attention.key_length_swa" + VALUE_LENGTH_SWA = "{arch}.attention.value_length_swa" SHARED_KV_LAYERS = "{arch}.attention.shared_kv_layers" SLIDING_WINDOW_PATTERN = "{arch}.attention.sliding_window_pattern" TEMPERATURE_SCALE = "{arch}.attention.temperature_scale" @@ -188,6 +190,7 @@ class Keys: class Rope: DIMENSION_COUNT = "{arch}.rope.dimension_count" + DIMENSION_COUNT_SWA = "{arch}.rope.dimension_count_swa" DIMENSION_SECTIONS = "{arch}.rope.dimension_sections" FREQ_BASE = "{arch}.rope.freq_base" FREQ_BASE_SWA = "{arch}.rope.freq_base_swa" diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 9ee3ac9e8f..e790be9533 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -773,6 +773,12 @@ class GGUFWriter: def add_value_length_mla(self, length: int) -> None: self.add_uint32(Keys.Attention.VALUE_LENGTH_MLA.format(arch=self.arch), length) + def add_key_length_swa(self, length: int) -> None: + self.add_uint32(Keys.Attention.KEY_LENGTH_SWA.format(arch=self.arch), length) + + def add_value_length_swa(self, length: int) -> None: + self.add_uint32(Keys.Attention.VALUE_LENGTH_SWA.format(arch=self.arch), length) + def add_indexer_head_count(self, count: int) -> None: self.add_uint32(Keys.Attention.Indexer.HEAD_COUNT.format(arch=self.arch), count) @@ -946,6 +952,9 @@ class GGUFWriter: def add_rope_dimension_count(self, count: int) -> None: self.add_uint32(Keys.Rope.DIMENSION_COUNT.format(arch=self.arch), count) + def add_rope_dimension_count_swa(self, count: int) -> None: + self.add_uint32(Keys.Rope.DIMENSION_COUNT_SWA.format(arch=self.arch), count) + def add_rope_dimension_sections(self, dims: Sequence[int]) -> None: self.add_array(Keys.Rope.DIMENSION_SECTIONS.format(arch=self.arch), dims) diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 9d8eb88d0b..ce49bbd988 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -230,11 +230,14 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_ATTENTION_TEMPERATURE_SCALE, "%s.attention.temperature_scale" }, { LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" }, { LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" }, + { LLM_KV_ATTENTION_KEY_LENGTH_SWA, "%s.attention.key_length_swa" }, + { LLM_KV_ATTENTION_VALUE_LENGTH_SWA, "%s.attention.value_length_swa" }, { LLM_KV_ATTENTION_INDEXER_HEAD_COUNT, "%s.attention.indexer.head_count" }, { LLM_KV_ATTENTION_INDEXER_KEY_LENGTH, "%s.attention.indexer.key_length" }, { LLM_KV_ATTENTION_INDEXER_TOP_K, "%s.attention.indexer.top_k" }, { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" }, + { LLM_KV_ROPE_DIMENSION_COUNT_SWA, "%s.rope.dimension_count_swa" }, { LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" }, { LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" }, { LLM_KV_ROPE_FREQ_BASE_SWA, "%s.rope.freq_base_swa" }, diff --git a/src/llama-arch.h b/src/llama-arch.h index 07aac40aa1..28dd1ffac7 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -234,11 +234,14 @@ enum llm_kv { LLM_KV_ATTENTION_TEMPERATURE_SCALE, LLM_KV_ATTENTION_KEY_LENGTH_MLA, LLM_KV_ATTENTION_VALUE_LENGTH_MLA, + LLM_KV_ATTENTION_KEY_LENGTH_SWA, + LLM_KV_ATTENTION_VALUE_LENGTH_SWA, LLM_KV_ATTENTION_INDEXER_HEAD_COUNT, LLM_KV_ATTENTION_INDEXER_KEY_LENGTH, LLM_KV_ATTENTION_INDEXER_TOP_K, LLM_KV_ROPE_DIMENSION_COUNT, + LLM_KV_ROPE_DIMENSION_COUNT_SWA, LLM_KV_ROPE_DIMENSION_SECTIONS, LLM_KV_ROPE_FREQ_BASE, LLM_KV_ROPE_FREQ_BASE_SWA, diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 009d07e00e..ee2669c154 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -2876,19 +2876,23 @@ llama_context * llama_init_from_model( if (params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO && ggml_is_quantized(params.type_k)) { const uint32_t blck_size = ggml_blck_size(params.type_k); - if (model->hparams.n_embd_head_k % blck_size != 0) { - LLAMA_LOG_ERROR("%s: K cache type %s with block size %u does not divide n_embd_head_k=%u\n", - __func__, ggml_type_name(params.type_k), blck_size, model->hparams.n_embd_head_k); - return nullptr; + for (uint32_t il = 0; il < model->hparams.n_layer; ++il) { + if (model->hparams.n_embd_head_k(il) % blck_size != 0) { + LLAMA_LOG_ERROR("%s: K cache type %s with block size %u does not divide n_embd_head_k=%u\n", + __func__, ggml_type_name(params.type_k), blck_size, model->hparams.n_embd_head_k(il)); + return nullptr; + } } } if (params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO && ggml_is_quantized(params.type_v)) { const uint32_t blck_size = ggml_blck_size(params.type_v); - if (model->hparams.n_embd_head_v % blck_size != 0) { - LLAMA_LOG_ERROR("%s: V cache type %s with block size %u does not divide n_embd_head_k=%u\n", - __func__, ggml_type_name(params.type_v), blck_size, model->hparams.n_embd_head_v); - return nullptr; + for (uint32_t il = 0; il < model->hparams.n_layer; ++il) { + if (model->hparams.n_embd_head_v(il) % blck_size != 0) { + LLAMA_LOG_ERROR("%s: V cache type %s with block size %u does not divide n_embd_head_v=%u\n", + __func__, ggml_type_name(params.type_v), blck_size, model->hparams.n_embd_head_v(il)); + return nullptr; + } } } diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index f4cb7dce15..5f875136a1 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -849,13 +849,13 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) : ubatch (params.ubatch), n_embd (hparams.n_embd), n_layer (hparams.n_layer), - n_rot (hparams.n_rot), + n_rot (hparams.n_rot()), n_ctx (cparams.n_ctx), n_head (hparams.n_head()), n_head_kv (hparams.n_head_kv()), - n_embd_head_k (hparams.n_embd_head_k), + n_embd_head_k (hparams.n_embd_head_k()), n_embd_k_gqa (hparams.n_embd_k_gqa()), - n_embd_head_v (hparams.n_embd_head_v), + n_embd_head_v (hparams.n_embd_head_v()), n_embd_v_gqa (hparams.n_embd_v_gqa()), n_expert (hparams.n_expert), n_expert_used (cparams.warmup ? hparams.n_expert : hparams.n_expert_used), diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp index 756dda1a7a..002d15d415 100644 --- a/src/llama-hparams.cpp +++ b/src/llama-hparams.cpp @@ -62,6 +62,14 @@ uint32_t llama_hparams::n_gqa(uint32_t il) const { return n_head/n_head_kv; } +uint32_t llama_hparams::n_rot(uint32_t il) const { + if (il < n_layer) { + return is_swa(il) ? n_rot_swa : n_rot_full; + } + + GGML_ABORT("fatal error"); +} + uint32_t llama_hparams::n_embd_inp() const { uint32_t n_embd_inp = n_embd; @@ -76,16 +84,32 @@ uint32_t llama_hparams::n_embd_out() const { return n_embd_out_impl > 0 ? n_embd_out_impl : n_embd; } +uint32_t llama_hparams::n_embd_head_k(uint32_t il) const { + if (il < n_layer) { + return is_swa(il) ? n_embd_head_k_swa : n_embd_head_k_full; + } + + GGML_ABORT("fatal error"); +} + +uint32_t llama_hparams::n_embd_head_v(uint32_t il) const { + if (il < n_layer) { + return is_swa(il) ? n_embd_head_v_swa : n_embd_head_v_full; + } + + GGML_ABORT("fatal error"); +} + uint32_t llama_hparams::n_embd_k_gqa(uint32_t il) const { const uint32_t n_head_kv = this->n_head_kv(il); - return n_embd_head_k * n_head_kv; + return n_embd_head_k(il) * n_head_kv; } uint32_t llama_hparams::n_embd_v_gqa(uint32_t il) const { const uint32_t n_head_kv = this->n_head_kv(il); - return n_embd_head_v * n_head_kv; + return n_embd_head_v(il) * n_head_kv; } bool llama_hparams::is_n_embd_k_gqa_variable() const { @@ -197,11 +221,11 @@ bool llama_hparams::is_mla() const { } uint32_t llama_hparams::n_embd_head_k_mla() const { - return is_mla() ? n_embd_head_k_mla_impl : n_embd_head_k; + return is_mla() ? n_embd_head_k_mla_impl : n_embd_head_k(); } uint32_t llama_hparams::n_embd_head_v_mla() const { - return is_mla() ? n_embd_head_v_mla_impl : n_embd_head_v; + return is_mla() ? n_embd_head_v_mla_impl : n_embd_head_v(); } bool llama_hparams::has_kv(uint32_t il) const { diff --git a/src/llama-hparams.h b/src/llama-hparams.h index c4b2a99da5..abfd7f2c4b 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -44,13 +44,20 @@ struct llama_hparams { uint32_t n_embd; uint32_t n_layer; int32_t n_layer_kv_from_start = -1; // if non-negative, the first n_layer_kv_from_start layers have KV cache - uint32_t n_rot; - uint32_t n_embd_head_k; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads - uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head uint32_t n_expert = 0; uint32_t n_expert_used = 0; uint32_t n_rel_attn_bkts = 0; + // different head size for full_attention and SWA layers + uint32_t n_embd_head_k_full; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads + uint32_t n_embd_head_v_full; // dimension of values (d_v) aka n_embd_head + uint32_t n_embd_head_k_swa; + uint32_t n_embd_head_v_swa; + + // different RoPE dimensions for full_attention and SWA layers + uint32_t n_rot_full; + uint32_t n_rot_swa; + // note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA uint32_t n_embd_head_k_mla_impl = 0; uint32_t n_embd_head_v_mla_impl = 0; @@ -247,12 +254,18 @@ struct llama_hparams { uint32_t n_gqa(uint32_t il = 0) const; + uint32_t n_rot(uint32_t il = 0) const; + // dimension of main + auxiliary input embeddings uint32_t n_embd_inp() const; // dimension of output embeddings uint32_t n_embd_out() const; + // dimension of key/value embeddings for each head (per layer) + uint32_t n_embd_head_k(uint32_t il = 0) const; + uint32_t n_embd_head_v(uint32_t il = 0) const; + // dimension of key embeddings across all k-v heads uint32_t n_embd_k_gqa(uint32_t il = 0) const; diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index d80e8a70bc..82fe58fac4 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -1033,8 +1033,8 @@ ggml_tensor * llama_kv_cache::get_k(ggml_context * ctx, int32_t il, uint32_t n_k const uint32_t ns = sinfo.s1 - sinfo.s0 + 1; return ggml_view_4d(ctx, k, - hparams.n_embd_head_k, hparams.n_head_kv(il), n_kv, ns, - ggml_row_size(k->type, hparams.n_embd_head_k), + hparams.n_embd_head_k(il), hparams.n_head_kv(il), n_kv, ns, + ggml_row_size(k->type, hparams.n_embd_head_k(il)), ggml_row_size(k->type, n_embd_k_gqa), ggml_row_size(k->type, n_embd_k_gqa*kv_size), ggml_row_size(k->type, n_embd_k_gqa*kv_size)*sinfo.s0); @@ -1056,8 +1056,8 @@ ggml_tensor * llama_kv_cache::get_v(ggml_context * ctx, int32_t il, uint32_t n_k if (!v_trans) { // note: v->nb[1] <= v->nb[2] return ggml_view_4d(ctx, v, - hparams.n_embd_head_v, hparams.n_head_kv(il), n_kv, ns, - ggml_row_size(v->type, hparams.n_embd_head_v), // v->nb[1] + hparams.n_embd_head_v(il), hparams.n_head_kv(il), n_kv, ns, + ggml_row_size(v->type, hparams.n_embd_head_v(il)), // v->nb[1] ggml_row_size(v->type, n_embd_v_gqa), // v->nb[2] ggml_row_size(v->type, n_embd_v_gqa*kv_size), // v->nb[3] ggml_row_size(v->type, n_embd_v_gqa*kv_size)*sinfo.s0); @@ -1065,8 +1065,8 @@ ggml_tensor * llama_kv_cache::get_v(ggml_context * ctx, int32_t il, uint32_t n_k // note: v->nb[1] > v->nb[2] return ggml_view_4d(ctx, v, - n_kv, hparams.n_head_kv(il), hparams.n_embd_head_v, ns, - ggml_row_size(v->type, kv_size*hparams.n_embd_head_v), // v->nb[1] + n_kv, hparams.n_head_kv(il), hparams.n_embd_head_v(il), ns, + ggml_row_size(v->type, kv_size*hparams.n_embd_head_v(il)), // v->nb[1] ggml_row_size(v->type, kv_size), // v->nb[2] ggml_row_size(v->type, kv_size*n_embd_v_gqa), // v->nb[3] ggml_row_size(v->type, kv_size*n_embd_v_gqa)*sinfo.s0); @@ -1544,7 +1544,8 @@ ggml_tensor * llama_kv_cache::build_rope_shift( ggml_tensor * shift, ggml_tensor * factors, float freq_base, - float freq_scale) const { + float freq_scale, + uint32_t il) const { const auto & n_ctx_orig = cparams.n_ctx_orig_yarn; const auto & yarn_ext_factor = cparams.yarn_ext_factor; @@ -1552,7 +1553,7 @@ ggml_tensor * llama_kv_cache::build_rope_shift( const auto & yarn_beta_slow = cparams.yarn_beta_slow; const auto & yarn_attn_factor = cparams.yarn_attn_factor; - const auto & n_rot = hparams.n_rot; + const auto & n_rot = hparams.n_rot(il); const auto & rope_type = hparams.rope_type == LLAMA_ROPE_TYPE_MROPE || hparams.rope_type == LLAMA_ROPE_TYPE_IMROPE // @ngxson : this is a workaround // for M-RoPE, we want to rotate the whole vector when doing KV shift @@ -1606,13 +1607,6 @@ ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_co auto * ctx = res->get_ctx(); auto * gf = res->get_gf(); - const auto & n_embd_head_k = hparams.n_embd_head_k; - //const auto & n_embd_head_v = hparams.n_embd_head_v; - - const auto & n_rot = hparams.n_rot; - - const auto n_embd_nope = hparams.n_lora_kv > 0 ? n_embd_head_k - n_rot : 0; - auto inp = std::make_unique(this); inp->k_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, (int64_t) get_size()*n_stream); @@ -1626,6 +1620,10 @@ ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_co const int64_t n_head_kv = hparams.n_head_kv(il); const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); + const auto n_rot = hparams.n_rot(il); + const auto n_embd_head_k = hparams.n_embd_head_k(il); + const auto n_embd_nope = hparams.n_lora_kv > 0 ? n_embd_head_k - n_rot : 0; + const float freq_base_l = model.get_rope_freq_base (cparams, il); const float freq_scale_l = model.get_rope_freq_scale(cparams, il); @@ -1638,7 +1636,7 @@ ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_co ggml_row_size(layer.k->type, n_embd_k_gqa), ggml_row_size(layer.k->type, n_embd_nope)); - ggml_tensor * cur = build_rope_shift(cparams, ctx, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l); + ggml_tensor * cur = build_rope_shift(cparams, ctx, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l, il); ggml_build_forward_expand(gf, cur); } diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h index e194bf3e26..33c78c5f21 100644 --- a/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h @@ -264,7 +264,8 @@ private: ggml_tensor * shift, ggml_tensor * factors, float freq_base, - float freq_scale) const; + float freq_scale, + uint32_t il) const; ggml_cgraph * build_graph_shift( llm_graph_result * res, diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index 2a6196eff3..623a3455dd 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -918,7 +918,7 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w } break; case GGML_OP_ROPE: { - const int n_embd_head = hparams.n_embd_head_v; + const int n_embd_head = hparams.n_embd_head_v(); const int n_head = hparams.n_head(); ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd_head, n_head, 512); ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 512); diff --git a/src/llama-model-saver.cpp b/src/llama-model-saver.cpp index 9f677b40cf..6f6538aecc 100644 --- a/src/llama-model-saver.cpp +++ b/src/llama-model-saver.cpp @@ -186,8 +186,10 @@ void llama_model_saver::add_kv_from_model() { add_kv(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, true); add_kv(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias); add_kv(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv); - add_kv(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k); - add_kv(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v); + add_kv(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k_full); + add_kv(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v_full); + add_kv(LLM_KV_ATTENTION_KEY_LENGTH_SWA, hparams.n_embd_head_k_swa); + add_kv(LLM_KV_ATTENTION_VALUE_LENGTH_SWA, hparams.n_embd_head_v_swa); add_kv(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); add_kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); add_kv(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn); @@ -199,7 +201,8 @@ void llama_model_saver::add_kv_from_model() { const float rope_scaling_factor = hparams.rope_freq_scale_train == 1.0f ? 0.0f : 1.0f/hparams.rope_freq_scale_train; - add_kv(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot); + add_kv(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot_full); + add_kv(LLM_KV_ROPE_DIMENSION_COUNT_SWA, hparams.n_rot_swa); add_kv(LLM_KV_ROPE_FREQ_BASE, hparams.rope_freq_base_train); // add_kv(LLM_KV_ROPE_SCALE_LINEAR, rope_scaling_factor); // old name add_kv(LLM_KV_ROPE_SCALING_TYPE, llama_rope_scaling_type_name(hparams.rope_scaling_type_train)); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index e18cca0524..349f70a611 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -459,26 +459,37 @@ void llama_model::load_hparams(llama_model_loader & ml) { // gpt-neox n_rot = rotary_pct * (n_embd / n_head) // gpt-j n_rot = rotary_dim - hparams.n_embd_head_k = hparams.n_embd / hparams.n_head(); - ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false); + hparams.n_embd_head_k_full = hparams.n_embd / hparams.n_head(); + ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k_full, false); - hparams.n_embd_head_v = hparams.n_embd / hparams.n_head(); - ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false); + hparams.n_embd_head_v_full = hparams.n_embd / hparams.n_head(); + ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v_full, false); // sanity check for n_rot (optional) - hparams.n_rot = hparams.n_embd_head_k; + hparams.n_rot_full = hparams.n_embd_head_k_full; - ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false); + ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot_full, false); if (arch == LLM_ARCH_LLAMA || arch == LLM_ARCH_DECI || arch == LLM_ARCH_FALCON || arch == LLM_ARCH_LLAMA_EMBED) { - if (hparams.n_rot != hparams.n_embd_head_k) { - throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k)); + if (hparams.n_rot_full != hparams.n_embd_head_k_full) { + throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot_full, hparams.n_embd_head_k_full)); } } } else { - hparams.n_rot = 0; - hparams.n_embd_head_k = 0; - hparams.n_embd_head_v = 0; + hparams.n_rot_full = 0; + hparams.n_embd_head_k_full = 0; + hparams.n_embd_head_v_full = 0; + } + + // head size and n_rot for SWA layers + { + hparams.n_embd_head_k_swa = hparams.n_embd_head_k_full; + hparams.n_embd_head_v_swa = hparams.n_embd_head_v_full; + ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_SWA, hparams.n_embd_head_k_swa, false); + ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_SWA, hparams.n_embd_head_v_swa, false); + + hparams.n_rot_swa = hparams.n_rot_full; + ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT_SWA, hparams.n_rot_swa, false); } // for differentiating model types @@ -1114,10 +1125,6 @@ void llama_model::load_hparams(llama_model_loader & ml) { break; default: type = LLM_TYPE_UNKNOWN; } - - // Load attention parameters - ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false); - ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false); } break; case LLM_ARCH_PLAMO3: { @@ -1212,7 +1219,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/config.py#L173 hparams.f_attention_scale = type == LLM_TYPE_27B ? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0))) - : 1.0f / std::sqrt(float(hparams.n_embd_head_k)); + : 1.0f / std::sqrt(float(hparams.n_embd_head_k())); } break; case LLM_ARCH_GEMMA3: { @@ -1245,7 +1252,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { // ref: https://github.com/google/gemma_pytorch/blob/014acb7ac4563a5f77c76d7ff98f31b568c16508/gemma/config.py#L289 hparams.f_attention_scale = type == LLM_TYPE_27B ? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0))) - : 1.0f / std::sqrt(float(hparams.n_embd_head_k)); + : 1.0f / std::sqrt(float(hparams.n_embd_head_k())); } break; case LLM_ARCH_GEMMA3N: { @@ -1294,7 +1301,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { case 24: type = LLM_TYPE_0_3B; break; default: type = LLM_TYPE_UNKNOWN; } - hparams.f_attention_scale = 1.0f / std::sqrt(float(hparams.n_embd_head_k)); + hparams.f_attention_scale = 1.0f / std::sqrt(float(hparams.n_embd_head_k())); } break; case LLM_ARCH_STARCODER2: @@ -2487,7 +2494,6 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA, hparams.n_embd_head_k_mla_impl); ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla_impl); ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv); - ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot); ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv); ml.get_key(LLM_KV_KDA_HEAD_DIM, hparams.n_embd_head_kda); @@ -2518,6 +2524,9 @@ void llama_model::load_hparams(llama_model_loader & ml) { hparams.swa_type = LLAMA_SWA_TYPE_STANDARD; + // full_attention layer only use half of the RoPE dimensions + hparams.n_rot_full = hparams.n_rot_full / 2; + // MoE + SWA parameters ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp); ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false); @@ -2661,13 +2670,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) { const int64_t n_embd = hparams.n_embd; const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(); const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(); - const int64_t n_embd_head_k = hparams.n_embd_head_k; - const int64_t n_embd_head_v = hparams.n_embd_head_v; + const int64_t n_embd_head_k = hparams.n_embd_head_k(); + const int64_t n_embd_head_v = hparams.n_embd_head_v(); const int64_t n_ff = hparams.n_ff(); const int64_t n_embd_gqa = n_embd_v_gqa; const int64_t n_vocab = vocab.n_tokens(); const int64_t n_token_types = vocab.n_token_types(); - const int64_t n_rot = hparams.n_rot; + const int64_t n_rot = hparams.n_rot(); const int64_t n_expert = hparams.n_expert; const int64_t n_expert_used = hparams.n_expert_used; const int64_t n_ctx_train = hparams.n_ctx_train; @@ -2967,8 +2976,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } break; case LLM_ARCH_MINICPM3: { - const int64_t n_embd_head_qk_rope = hparams.n_rot; - const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot; + const int64_t n_embd_head_qk_rope = hparams.n_rot(); + const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k() - hparams.n_rot(); const int64_t q_lora_rank = hparams.n_lora_q; const int64_t kv_lora_rank = hparams.n_lora_kv; @@ -3840,8 +3849,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { const int64_t dt_dim = std::max(64, int(hparams.n_embd / 16)); // attention parameters - const uint32_t qk_dim = hparams.n_embd_head_k; - const uint32_t v_dim = hparams.n_embd_head_v; + const uint32_t qk_dim = hparams.n_embd_head_k(); + const uint32_t v_dim = hparams.n_embd_head_v(); tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); @@ -3901,8 +3910,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } break; case LLM_ARCH_PLAMO3: { - const int64_t head_dim_q = hparams.n_embd_head_k; - const int64_t head_dim_v = hparams.n_embd_head_v; + const int64_t head_dim_q = hparams.n_embd_head_k(); + const int64_t head_dim_v = hparams.n_embd_head_v(); tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); @@ -4649,7 +4658,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } break; case LLM_ARCH_SEED_OSS: { - const uint32_t head_dim = hparams.n_embd_head_k; + const uint32_t head_dim = hparams.n_embd_head_k(); const int64_t n_qo_dim = n_head * head_dim; const int64_t n_kv_dim = n_head_kv * head_dim; @@ -4878,7 +4887,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla(); const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla(); - const int64_t n_embd_head_qk_rope = hparams.n_rot; + const int64_t n_embd_head_qk_rope = hparams.n_rot(); const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope; GGML_ASSERT(n_embd_head_qk_nope >= 1); @@ -4957,8 +4966,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } break; case LLM_ARCH_PLM: { - const int64_t n_embd_head_qk_rope = hparams.n_rot; - const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot; + const int64_t n_embd_head_qk_rope = hparams.n_rot(); + const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k() - hparams.n_rot(); const int64_t kv_lora_rank = hparams.n_lora_kv; tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); @@ -5396,7 +5405,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { const int64_t n_embd_head_k_mla = hparams.n_embd_head_k_mla(); const int64_t n_embd_head_v_mla = hparams.n_embd_head_v_mla(); - const int64_t n_embd_head_qk_rope = hparams.n_rot; + const int64_t n_embd_head_qk_rope = hparams.n_rot(); const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope; const int64_t q_lora_rank = hparams.n_lora_q; @@ -5680,7 +5689,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { const int64_t n_expert = hparams.n_expert; const int64_t n_expert_used = hparams.n_expert_used; const int64_t n_ff_shexp = hparams.n_ff_shexp > 0 ? hparams.n_ff_shexp : n_ff_exp; - const int64_t head_dim = hparams.n_embd_head_k; + const int64_t head_dim = hparams.n_embd_head_k(); const int64_t n_qo_dim = n_head * head_dim; const int64_t n_kv_dim = n_head_kv * head_dim; @@ -6968,7 +6977,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { // Kimi: qk_rope_head_dim = 64 (actual RoPE dimension for MLA) // Note: hparams.n_rot may be 72 (from conversion) but actual is 64 - const int64_t qk_rope_head_dim = hparams.n_rot; // From config: qk_rope_head_dim + const int64_t qk_rope_head_dim = hparams.n_rot(); // From config: qk_rope_head_dim layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + qk_rope_head_dim}, 0); // Support Legacy GGUFs that don't split wkv_b (MLA KV cache disabled) layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), @@ -7339,7 +7348,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { // ("rope_freqs.weight") and ggml uses only the first (n_rot_l/2) entries per layer. uint32_t n_rot_max = 0; for (int i = 0; i < n_layer; ++i) { - n_rot_max = std::max(n_rot_max, hparams.n_rot); + n_rot_max = std::max(n_rot_max, hparams.n_rot()); } if (n_rot_max == 0) { n_rot_max = n_rot; @@ -7674,11 +7683,11 @@ void llama_model::print_info() const { LLAMA_LOG_INFO("%s: n_layer = %u\n", __func__, hparams.n_layer); LLAMA_LOG_INFO("%s: n_head = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head(il); }, hparams.n_layer).c_str()); LLAMA_LOG_INFO("%s: n_head_kv = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_head_kv(il); }, hparams.n_layer).c_str()); - LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot); + LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot_full); LLAMA_LOG_INFO("%s: n_swa = %u\n", __func__, hparams.n_swa); LLAMA_LOG_INFO("%s: is_swa_any = %u\n", __func__, hparams.is_swa_any()); - LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k); - LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v); + LLAMA_LOG_INFO("%s: n_embd_head_k = %u\n", __func__, hparams.n_embd_head_k_full); + LLAMA_LOG_INFO("%s: n_embd_head_v = %u\n", __func__, hparams.n_embd_head_v_full); LLAMA_LOG_INFO("%s: n_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_gqa(il); }, hparams.n_layer).c_str()); LLAMA_LOG_INFO("%s: n_embd_k_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_k_gqa(il); }, hparams.n_layer).c_str()); LLAMA_LOG_INFO("%s: n_embd_v_gqa = %s\n", __func__, print_f([&](uint32_t il) { return hparams.n_embd_v_gqa(il); }, hparams.n_layer).c_str()); @@ -7702,6 +7711,9 @@ void llama_model::print_info() const { if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) { LLAMA_LOG_INFO("%s: freq_base_swa = %.1f\n", __func__, hparams.rope_freq_base_train_swa); LLAMA_LOG_INFO("%s: freq_scale_swa = %g\n", __func__, hparams.rope_freq_scale_train_swa); + LLAMA_LOG_INFO("%s: n_embd_head_k_swa = %u\n", __func__, hparams.n_embd_head_k_swa); + LLAMA_LOG_INFO("%s: n_embd_head_v_swa = %u\n", __func__, hparams.n_embd_head_v_swa); + LLAMA_LOG_INFO("%s: n_rot_swa = %u\n", __func__, hparams.n_rot_swa); } LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn); LLAMA_LOG_INFO("%s: rope_yarn_log_mul = %.4f\n", __func__, hparams.rope_yarn_log_mul); diff --git a/src/models/afmoe.cpp b/src/models/afmoe.cpp index 11f5ea2c27..9aabe25c96 100644 --- a/src/models/afmoe.cpp +++ b/src/models/afmoe.cpp @@ -1,8 +1,8 @@ #include "models.h" llm_build_afmoe::llm_build_afmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + const int64_t n_embd_head = hparams.n_embd_head_v(); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/apertus.cpp b/src/models/apertus.cpp index 9af19c1bfe..4d65614e46 100644 --- a/src/models/apertus.cpp +++ b/src/models/apertus.cpp @@ -3,10 +3,10 @@ llm_build_apertus::llm_build_apertus(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); + GGML_ASSERT(n_embd_head == n_rot); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/arcee.cpp b/src/models/arcee.cpp index aa6167dba1..20b9ffd49e 100644 --- a/src/models/arcee.cpp +++ b/src/models/arcee.cpp @@ -2,10 +2,10 @@ llm_build_arcee::llm_build_arcee(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); + GGML_ASSERT(n_embd_head == n_rot); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/arctic.cpp b/src/models/arctic.cpp index d7db06de1d..b712e08cbd 100644 --- a/src/models/arctic.cpp +++ b/src/models/arctic.cpp @@ -1,10 +1,10 @@ #include "models.h" llm_build_arctic::llm_build_arctic(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); + GGML_ASSERT(n_embd_head == n_rot); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/baichuan.cpp b/src/models/baichuan.cpp index d5c6528531..abd03cd0b9 100644 --- a/src/models/baichuan.cpp +++ b/src/models/baichuan.cpp @@ -2,10 +2,10 @@ llm_build_baichuan::llm_build_baichuan(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); + GGML_ASSERT(n_embd_head == n_rot); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/bailingmoe2.cpp b/src/models/bailingmoe2.cpp index 81906cecb5..4209862466 100644 --- a/src/models/bailingmoe2.cpp +++ b/src/models/bailingmoe2.cpp @@ -2,10 +2,10 @@ llm_build_bailingmoe2::llm_build_bailingmoe2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/bert.cpp b/src/models/bert.cpp index 17efdafec3..8733179141 100644 --- a/src/models/bert.cpp +++ b/src/models/bert.cpp @@ -1,10 +1,10 @@ #include "models.h" llm_build_bert::llm_build_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/bitnet.cpp b/src/models/bitnet.cpp index 331a3f1119..d47638498d 100644 --- a/src/models/bitnet.cpp +++ b/src/models/bitnet.cpp @@ -2,9 +2,9 @@ llm_build_bitnet::llm_build_bitnet(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/bloom.cpp b/src/models/bloom.cpp index 2c552d1d15..b1c19bb58a 100644 --- a/src/models/bloom.cpp +++ b/src/models/bloom.cpp @@ -1,10 +1,10 @@ #include "models.h" llm_build_bloom::llm_build_bloom(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/chameleon.cpp b/src/models/chameleon.cpp index 184511aed4..2f24105fa1 100644 --- a/src/models/chameleon.cpp +++ b/src/models/chameleon.cpp @@ -3,10 +3,10 @@ #include llm_build_chameleon::llm_build_chameleon(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); + GGML_ASSERT(n_embd_head == n_rot); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/chatglm.cpp b/src/models/chatglm.cpp index 2685d4fbcb..5887ed22e7 100644 --- a/src/models/chatglm.cpp +++ b/src/models/chatglm.cpp @@ -2,10 +2,10 @@ llm_build_chatglm::llm_build_chatglm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/codeshell.cpp b/src/models/codeshell.cpp index 0b3bdbff52..e8e13e143f 100644 --- a/src/models/codeshell.cpp +++ b/src/models/codeshell.cpp @@ -1,11 +1,11 @@ #include "models.h" llm_build_codeshell::llm_build_codeshell(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); + GGML_ASSERT(n_embd_head == n_rot); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/cogvlm.cpp b/src/models/cogvlm.cpp index 0ceae3aaeb..2ef2b6e389 100644 --- a/src/models/cogvlm.cpp +++ b/src/models/cogvlm.cpp @@ -2,11 +2,11 @@ llm_build_cogvlm::llm_build_cogvlm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); const float kq_scale = 1.0f / sqrtf(float(n_embd_head)); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); + GGML_ASSERT(n_embd_head == n_rot); ggml_tensor * inpL; ggml_tensor * cur; diff --git a/src/models/cohere2-iswa.cpp b/src/models/cohere2-iswa.cpp index 9334b5e426..7c71a59ae7 100644 --- a/src/models/cohere2-iswa.cpp +++ b/src/models/cohere2-iswa.cpp @@ -1,9 +1,9 @@ #include "models.h" llm_build_cohere2_iswa::llm_build_cohere2_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); const float f_logit_scale = hparams.f_logit_scale; diff --git a/src/models/command-r.cpp b/src/models/command-r.cpp index 4d3b643b44..ba1230f041 100644 --- a/src/models/command-r.cpp +++ b/src/models/command-r.cpp @@ -4,9 +4,9 @@ llm_build_command_r::llm_build_command_r(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); const float f_logit_scale = hparams.f_logit_scale; diff --git a/src/models/dbrx.cpp b/src/models/dbrx.cpp index 5c7f108437..73eb5cd24e 100644 --- a/src/models/dbrx.cpp +++ b/src/models/dbrx.cpp @@ -1,11 +1,11 @@ #include "models.h" llm_build_dbrx::llm_build_dbrx(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); + GGML_ASSERT(n_embd_head == n_rot); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/deci.cpp b/src/models/deci.cpp index 7410a3a46d..ac448bfcaa 100644 --- a/src/models/deci.cpp +++ b/src/models/deci.cpp @@ -3,10 +3,10 @@ llm_build_deci::llm_build_deci(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); + GGML_ASSERT(n_embd_head == n_rot); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/deepseek.cpp b/src/models/deepseek.cpp index 57cb1724f2..3432359e03 100644 --- a/src/models/deepseek.cpp +++ b/src/models/deepseek.cpp @@ -2,10 +2,10 @@ llm_build_deepseek::llm_build_deepseek(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); + GGML_ASSERT(n_embd_head == n_rot); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/deepseek2.cpp b/src/models/deepseek2.cpp index abd54b7656..d437fe29e7 100644 --- a/src/models/deepseek2.cpp +++ b/src/models/deepseek2.cpp @@ -8,7 +8,7 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr const int64_t n_embd_head_k = hparams.n_embd_head_k_mla(); const int64_t n_embd_head_v = hparams.n_embd_head_v_mla(); - const int64_t n_embd_head_qk_rope = hparams.n_rot; + const int64_t n_embd_head_qk_rope = hparams.n_rot(); const int64_t n_embd_head_qk_nope = n_embd_head_k - n_embd_head_qk_rope; const uint32_t kv_lora_rank = hparams.n_lora_kv; diff --git a/src/models/dots1.cpp b/src/models/dots1.cpp index 0bcf3fe0de..07236dd27c 100644 --- a/src/models/dots1.cpp +++ b/src/models/dots1.cpp @@ -2,10 +2,10 @@ llm_build_dots1::llm_build_dots1(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); + GGML_ASSERT(n_embd_head == n_rot); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/dream.cpp b/src/models/dream.cpp index 2aafbae139..4edc8530cb 100644 --- a/src/models/dream.cpp +++ b/src/models/dream.cpp @@ -5,10 +5,10 @@ llm_build_dream::llm_build_dream(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { //copied from qwen2 - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); + GGML_ASSERT(n_embd_head == n_rot); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/ernie4-5-moe.cpp b/src/models/ernie4-5-moe.cpp index ed781d5076..63baf152c4 100644 --- a/src/models/ernie4-5-moe.cpp +++ b/src/models/ernie4-5-moe.cpp @@ -2,10 +2,10 @@ llm_build_ernie4_5_moe::llm_build_ernie4_5_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); + GGML_ASSERT(n_embd_head == n_rot); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/ernie4-5.cpp b/src/models/ernie4-5.cpp index 99aead5328..d548de0547 100644 --- a/src/models/ernie4-5.cpp +++ b/src/models/ernie4-5.cpp @@ -2,10 +2,10 @@ llm_build_ernie4_5::llm_build_ernie4_5(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); + GGML_ASSERT(n_embd_head == n_rot); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/eurobert.cpp b/src/models/eurobert.cpp index 86e3176edc..e8628d165d 100644 --- a/src/models/eurobert.cpp +++ b/src/models/eurobert.cpp @@ -1,9 +1,9 @@ #include "models.h" llm_build_eurobert::llm_build_eurobert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/exaone-moe.cpp b/src/models/exaone-moe.cpp index a7396829ca..ea75701c52 100644 --- a/src/models/exaone-moe.cpp +++ b/src/models/exaone-moe.cpp @@ -2,10 +2,10 @@ llm_build_exaone_moe::llm_build_exaone_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_k; + const int64_t n_embd_head = hparams.n_embd_head_k(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_v); - GGML_ASSERT(n_embd_head == hparams.n_rot); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_v()); + GGML_ASSERT(n_embd_head == n_rot); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/exaone.cpp b/src/models/exaone.cpp index 62602b284d..d4eea58e2f 100644 --- a/src/models/exaone.cpp +++ b/src/models/exaone.cpp @@ -4,10 +4,10 @@ llm_build_exaone::llm_build_exaone(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); + GGML_ASSERT(n_embd_head == n_rot); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/exaone4.cpp b/src/models/exaone4.cpp index 8b7e3dc06e..755af3b747 100644 --- a/src/models/exaone4.cpp +++ b/src/models/exaone4.cpp @@ -4,10 +4,10 @@ template llm_build_exaone4::llm_build_exaone4(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_k; + const int64_t n_embd_head = hparams.n_embd_head_k(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_v); - GGML_ASSERT(n_embd_head == hparams.n_rot); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_v()); + GGML_ASSERT(n_embd_head == n_rot); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/falcon-h1.cpp b/src/models/falcon-h1.cpp index 785a7e5e66..ff842d93a4 100644 --- a/src/models/falcon-h1.cpp +++ b/src/models/falcon-h1.cpp @@ -2,7 +2,7 @@ llm_build_falcon_h1::llm_build_falcon_h1(const llama_model & model, const llm_graph_params & params) : llm_build_mamba_base(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/falcon.cpp b/src/models/falcon.cpp index db1ccdb500..9fcba50887 100644 --- a/src/models/falcon.cpp +++ b/src/models/falcon.cpp @@ -2,11 +2,11 @@ llm_build_falcon::llm_build_falcon(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); + GGML_ASSERT(n_embd_head == n_rot); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/gemma-embedding.cpp b/src/models/gemma-embedding.cpp index 944c198bf9..98110d45e3 100644 --- a/src/models/gemma-embedding.cpp +++ b/src/models/gemma-embedding.cpp @@ -2,7 +2,7 @@ llm_build_gemma_embedding::llm_build_gemma_embedding(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_k; + const int64_t n_embd_head = hparams.n_embd_head_k(); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/gemma.cpp b/src/models/gemma.cpp index 4893d9af4b..1869efd389 100644 --- a/src/models/gemma.cpp +++ b/src/models/gemma.cpp @@ -2,7 +2,7 @@ llm_build_gemma::llm_build_gemma(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/gemma2-iswa.cpp b/src/models/gemma2-iswa.cpp index 7a9198193a..3927ddd297 100644 --- a/src/models/gemma2-iswa.cpp +++ b/src/models/gemma2-iswa.cpp @@ -1,7 +1,7 @@ #include "models.h" llm_build_gemma2_iswa::llm_build_gemma2_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_k; + const int64_t n_embd_head = hparams.n_embd_head_k(); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/gemma3.cpp b/src/models/gemma3.cpp index dec3fc4b8b..bbb4d9a81e 100644 --- a/src/models/gemma3.cpp +++ b/src/models/gemma3.cpp @@ -2,7 +2,7 @@ template llm_build_gemma3::llm_build_gemma3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_k; + const int64_t n_embd_head = hparams.n_embd_head_k(); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/gemma3n-iswa.cpp b/src/models/gemma3n-iswa.cpp index 7db6d3bf4e..8ce2ae39c2 100644 --- a/src/models/gemma3n-iswa.cpp +++ b/src/models/gemma3n-iswa.cpp @@ -3,7 +3,7 @@ llm_build_gemma3n_iswa::llm_build_gemma3n_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params), model(model), - n_embd_head(model.hparams.n_embd_head_k), + n_embd_head(model.hparams.n_embd_head_k()), n_embd_altup(model.hparams.n_embd_altup), n_altup(model.hparams.n_altup), i_altup_act(model.hparams.i_altup_act) { diff --git a/src/models/glm4-moe.cpp b/src/models/glm4-moe.cpp index 97a65f4116..7938545ed8 100644 --- a/src/models/glm4-moe.cpp +++ b/src/models/glm4-moe.cpp @@ -1,9 +1,9 @@ #include "models.h" llm_build_glm4_moe::llm_build_glm4_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); int sections[4]; std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections); diff --git a/src/models/glm4.cpp b/src/models/glm4.cpp index bcd837b30d..b6ad8febed 100644 --- a/src/models/glm4.cpp +++ b/src/models/glm4.cpp @@ -3,10 +3,10 @@ llm_build_glm4::llm_build_glm4(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); int sections[4]; std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections); diff --git a/src/models/gpt2.cpp b/src/models/gpt2.cpp index 60761c8e76..cb1238f2d3 100644 --- a/src/models/gpt2.cpp +++ b/src/models/gpt2.cpp @@ -1,10 +1,10 @@ #include "models.h" llm_build_gpt2::llm_build_gpt2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); ggml_tensor * cur; ggml_tensor * pos; diff --git a/src/models/gptneox.cpp b/src/models/gptneox.cpp index 2151b14e93..1c8fe6c836 100644 --- a/src/models/gptneox.cpp +++ b/src/models/gptneox.cpp @@ -2,10 +2,10 @@ llm_build_gptneox::llm_build_gptneox(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/granite-hybrid.cpp b/src/models/granite-hybrid.cpp index d9b1c06da8..9b54a38c38 100644 --- a/src/models/granite-hybrid.cpp +++ b/src/models/granite-hybrid.cpp @@ -2,8 +2,8 @@ llm_build_granite_hybrid::llm_build_granite_hybrid(const llama_model & model, const llm_graph_params & params) : llm_build_mamba_base(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + const int64_t n_embd_head = hparams.n_embd_head_v(); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/granite.cpp b/src/models/granite.cpp index fd97116ed3..7a7e1664c2 100644 --- a/src/models/granite.cpp +++ b/src/models/granite.cpp @@ -5,10 +5,10 @@ llm_build_granite::llm_build_granite( const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); + GGML_ASSERT(n_embd_head == n_rot); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/grok.cpp b/src/models/grok.cpp index 24232604b3..580d63e36a 100644 --- a/src/models/grok.cpp +++ b/src/models/grok.cpp @@ -1,10 +1,10 @@ #include "models.h" llm_build_grok::llm_build_grok(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); + GGML_ASSERT(n_embd_head == n_rot); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/grovemoe.cpp b/src/models/grovemoe.cpp index 2081f06856..aa60d3e938 100644 --- a/src/models/grovemoe.cpp +++ b/src/models/grovemoe.cpp @@ -2,11 +2,11 @@ llm_build_grovemoe::llm_build_grovemoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); const int64_t n_chunk_expert = n_expert / hparams.n_group_experts; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); + GGML_ASSERT(n_embd_head == n_rot); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/hunyuan-dense.cpp b/src/models/hunyuan-dense.cpp index 7d5dcc7828..6a51707c85 100644 --- a/src/models/hunyuan-dense.cpp +++ b/src/models/hunyuan-dense.cpp @@ -1,10 +1,10 @@ #include "models.h" llm_build_hunyuan_dense::llm_build_hunyuan_dense(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); + GGML_ASSERT(n_embd_head == n_rot); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/hunyuan-moe.cpp b/src/models/hunyuan-moe.cpp index cee2b541b7..806c30b366 100644 --- a/src/models/hunyuan-moe.cpp +++ b/src/models/hunyuan-moe.cpp @@ -1,10 +1,10 @@ #include "models.h" llm_build_hunyuan_moe::llm_build_hunyuan_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); + GGML_ASSERT(n_embd_head == n_rot); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/internlm2.cpp b/src/models/internlm2.cpp index 387e821127..441d250268 100644 --- a/src/models/internlm2.cpp +++ b/src/models/internlm2.cpp @@ -1,10 +1,10 @@ #include "models.h" llm_build_internlm2::llm_build_internlm2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); + GGML_ASSERT(n_embd_head == n_rot); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/jais.cpp b/src/models/jais.cpp index 3e3376e6a6..135bf288ba 100644 --- a/src/models/jais.cpp +++ b/src/models/jais.cpp @@ -1,10 +1,10 @@ #include "models.h" llm_build_jais::llm_build_jais(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/jais2.cpp b/src/models/jais2.cpp index a69fcaa3bb..2cfe484eb5 100644 --- a/src/models/jais2.cpp +++ b/src/models/jais2.cpp @@ -3,10 +3,10 @@ // JAIS-2 model graph builder // Uses: LayerNorm (not RMSNorm), relu2 activation, separate Q/K/V, RoPE embeddings llm_build_jais2::llm_build_jais2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); + GGML_ASSERT(n_embd_head == n_rot); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/jamba.cpp b/src/models/jamba.cpp index 1d482e425a..c0c89de187 100644 --- a/src/models/jamba.cpp +++ b/src/models/jamba.cpp @@ -1,7 +1,7 @@ #include "models.h" llm_build_jamba::llm_build_jamba(const llama_model & model, const llm_graph_params & params) : llm_build_mamba_base(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/kimi-linear.cpp b/src/models/kimi-linear.cpp index 8d4b95e519..063b17a2f6 100644 --- a/src/models/kimi-linear.cpp +++ b/src/models/kimi-linear.cpp @@ -102,7 +102,7 @@ llm_build_kimi_linear::llm_build_kimi_linear(const llama_model & model, const ll const int64_t kv_lora_rank = hparams.n_lora_kv; // qk_rope_head_dim = 64 (from Kimi config) which is hparams.n_rot // Confirmed from tensor shape: wkv_a_mqa [2304, 576] = [n_embd, kv_lora_rank + qk_rope_head_dim] - const int64_t n_embd_head_qk_rope = hparams.n_rot; // config.qk_rope_head_dim + const int64_t n_embd_head_qk_rope = hparams.n_rot(); // config.qk_rope_head_dim const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope; // 192 - 64 = 128 // Attention scale for MLA const float kq_scale_mla = 1.0f / sqrtf((float)n_embd_head_k_mla); diff --git a/src/models/lfm2.cpp b/src/models/lfm2.cpp index 8ca8e6c8e2..dfa322166b 100644 --- a/src/models/lfm2.cpp +++ b/src/models/lfm2.cpp @@ -39,7 +39,7 @@ llm_build_lfm2::llm_build_lfm2(const llama_model & model, const llm_graph_ inp_attn_type * inp_attn, int il) -> ggml_tensor * { GGML_ASSERT(hparams.n_embd_v_gqa(il) == hparams.n_embd_k_gqa(il)); - const auto n_embd_head = hparams.n_embd_head_v; + const auto n_embd_head = hparams.n_embd_head_v(); const auto n_head_kv = hparams.n_head_kv(il); auto * q = build_lora_mm(model.layers[il].wq, cur); diff --git a/src/models/llada-moe.cpp b/src/models/llada-moe.cpp index 3bb9943f49..18de88fde1 100644 --- a/src/models/llada-moe.cpp +++ b/src/models/llada-moe.cpp @@ -1,10 +1,10 @@ #include "models.h" llm_build_llada_moe::llm_build_llada_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); + GGML_ASSERT(n_embd_head == n_rot); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/llada.cpp b/src/models/llada.cpp index 857033660a..0dac9d616a 100644 --- a/src/models/llada.cpp +++ b/src/models/llada.cpp @@ -2,10 +2,10 @@ llm_build_llada::llm_build_llada(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { // LLaDA is similar to LLaMA but uses non-causal attention for diffusion - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); + GGML_ASSERT(n_embd_head == n_rot); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/llama-iswa.cpp b/src/models/llama-iswa.cpp index 40dc2427a8..67cb9a10ec 100644 --- a/src/models/llama-iswa.cpp +++ b/src/models/llama-iswa.cpp @@ -1,10 +1,10 @@ #include "models.h" llm_build_llama_iswa::llm_build_llama_iswa(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); + GGML_ASSERT(n_embd_head == n_rot); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/llama.cpp b/src/models/llama.cpp index 279f2e301a..ca4beac51f 100644 --- a/src/models/llama.cpp +++ b/src/models/llama.cpp @@ -2,10 +2,10 @@ template llm_build_llama::llm_build_llama(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); + GGML_ASSERT(n_embd_head == n_rot); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/maincoder.cpp b/src/models/maincoder.cpp index da57308167..a72b7790a1 100644 --- a/src/models/maincoder.cpp +++ b/src/models/maincoder.cpp @@ -1,10 +1,10 @@ #include "models.h" llm_build_maincoder::llm_build_maincoder(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); + GGML_ASSERT(n_embd_head == n_rot); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/minicpm3.cpp b/src/models/minicpm3.cpp index 297cc34ba5..89dd710515 100644 --- a/src/models/minicpm3.cpp +++ b/src/models/minicpm3.cpp @@ -5,10 +5,10 @@ llm_build_minicpm3::llm_build_minicpm3(const llama_model & model, const llm_grap const int64_t n_embd_base = 256; const float scale_embd = 12.0f; const float scale_depth = 1.4f; - const float kq_scale = 1.0f / sqrtf(float(hparams.n_embd_head_k)); + const float kq_scale = 1.0f / sqrtf(float(hparams.n_embd_head_k())); - const uint32_t n_embd_head_qk_rope = hparams.n_rot; - const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot; + const uint32_t n_embd_head_qk_rope = hparams.n_rot(); + const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k() - hparams.n_rot(); const uint32_t kv_lora_rank = hparams.n_lora_kv; @@ -51,21 +51,21 @@ llm_build_minicpm3::llm_build_minicpm3(const llama_model & model, const llm_grap LLM_NORM_RMS, il); cb(q, "q", il); - // {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens} + // {q_lora_rank, n_head * hparams.n_embd_head_k()} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k(), n_tokens} q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q); cb(q, "q", il); // split into {n_head * n_embd_head_qk_nope, n_tokens} ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens, - ggml_row_size(q->type, hparams.n_embd_head_k), - ggml_row_size(q->type, hparams.n_embd_head_k * n_head), + ggml_row_size(q->type, hparams.n_embd_head_k()), + ggml_row_size(q->type, hparams.n_embd_head_k() * n_head), 0); cb(q_nope, "q_nope", il); // and {n_head * n_embd_head_qk_rope, n_tokens} ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens, - ggml_row_size(q->type, hparams.n_embd_head_k), - ggml_row_size(q->type, hparams.n_embd_head_k * n_head), + ggml_row_size(q->type, hparams.n_embd_head_k()), + ggml_row_size(q->type, hparams.n_embd_head_k() * n_head), ggml_row_size(q->type, n_embd_head_qk_nope)); cb(q_pe, "q_pe", il); @@ -97,15 +97,15 @@ llm_build_minicpm3::llm_build_minicpm3(const llama_model & model, const llm_grap // split into {n_head * n_embd_head_qk_nope, n_tokens} ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens, - ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v), - ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)), + ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v()), + ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v())), 0); cb(k_nope, "k_nope", il); // and {n_head * n_embd_head_v, n_tokens} - ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens, - ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)), - ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head), + ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v(), n_head, n_tokens, + ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v())), + ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v())*n_head), ggml_row_size(kv->type, (n_embd_head_qk_nope))); cb(v_states, "v_states", il); diff --git a/src/models/minimax-m2.cpp b/src/models/minimax-m2.cpp index fbeed8eab9..83d0916c08 100644 --- a/src/models/minimax-m2.cpp +++ b/src/models/minimax-m2.cpp @@ -1,10 +1,10 @@ #include "models.h" llm_build_minimax_m2::llm_build_minimax_m2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - // GGML_ASSERT(n_embd_head == hparams.n_rot); this is wrong in case of minimax, head_dim = 128, n_rot = 64 + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); + // GGML_ASSERT(n_embd_head == n_rot); this is wrong in case of minimax, head_dim = 128, n_rot = 64 ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/mistral3.cpp b/src/models/mistral3.cpp index 49734989df..42a5117ff0 100644 --- a/src/models/mistral3.cpp +++ b/src/models/mistral3.cpp @@ -1,10 +1,10 @@ #include "models.h" llm_build_mistral3::llm_build_mistral3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); + GGML_ASSERT(n_embd_head == n_rot); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/modern-bert.cpp b/src/models/modern-bert.cpp index 32066c712b..26020584c6 100644 --- a/src/models/modern-bert.cpp +++ b/src/models/modern-bert.cpp @@ -1,10 +1,10 @@ #include "models.h" llm_build_modern_bert::llm_build_modern_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/mpt.cpp b/src/models/mpt.cpp index 2328e027a7..ce44a805f5 100644 --- a/src/models/mpt.cpp +++ b/src/models/mpt.cpp @@ -3,10 +3,10 @@ llm_build_mpt::llm_build_mpt(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); ggml_tensor * cur; ggml_tensor * pos; diff --git a/src/models/nemotron-h.cpp b/src/models/nemotron-h.cpp index fa2b55a284..6358215050 100644 --- a/src/models/nemotron-h.cpp +++ b/src/models/nemotron-h.cpp @@ -2,8 +2,8 @@ llm_build_nemotron_h::llm_build_nemotron_h(const llama_model & model, const llm_graph_params & params) : llm_build_mamba_base(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + const int64_t n_embd_head = hparams.n_embd_head_v(); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/nemotron.cpp b/src/models/nemotron.cpp index fcead041f0..34aa6fa5ec 100644 --- a/src/models/nemotron.cpp +++ b/src/models/nemotron.cpp @@ -1,10 +1,10 @@ #include "models.h" llm_build_nemotron::llm_build_nemotron(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - //GGML_ASSERT(n_embd_head == hparams.n_rot); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); + //GGML_ASSERT(n_embd_head == n_rot); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/neo-bert.cpp b/src/models/neo-bert.cpp index 7c32bfca5f..2fdf4a3692 100644 --- a/src/models/neo-bert.cpp +++ b/src/models/neo-bert.cpp @@ -1,10 +1,10 @@ #include "models.h" llm_build_neo_bert::llm_build_neo_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/olmo.cpp b/src/models/olmo.cpp index bbd623f111..26f4b6ee62 100644 --- a/src/models/olmo.cpp +++ b/src/models/olmo.cpp @@ -1,10 +1,10 @@ #include "models.h" llm_build_olmo::llm_build_olmo(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); + GGML_ASSERT(n_embd_head == n_rot); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/olmo2.cpp b/src/models/olmo2.cpp index 713552dab8..5076359e3f 100644 --- a/src/models/olmo2.cpp +++ b/src/models/olmo2.cpp @@ -2,10 +2,10 @@ template llm_build_olmo2::llm_build_olmo2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); + GGML_ASSERT(n_embd_head == n_rot); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/olmoe.cpp b/src/models/olmoe.cpp index a05b892c70..83a56a0b3b 100644 --- a/src/models/olmoe.cpp +++ b/src/models/olmoe.cpp @@ -1,10 +1,10 @@ #include "models.h" llm_build_olmoe::llm_build_olmoe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); + GGML_ASSERT(n_embd_head == n_rot); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/openelm.cpp b/src/models/openelm.cpp index fbf682ec83..5df6fe3e3c 100644 --- a/src/models/openelm.cpp +++ b/src/models/openelm.cpp @@ -1,9 +1,9 @@ #include "models.h" llm_build_openelm::llm_build_openelm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/orion.cpp b/src/models/orion.cpp index bb02273bfe..48c01efe36 100644 --- a/src/models/orion.cpp +++ b/src/models/orion.cpp @@ -1,10 +1,10 @@ #include "models.h" llm_build_orion::llm_build_orion(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); + GGML_ASSERT(n_embd_head == n_rot); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/paddleocr.cpp b/src/models/paddleocr.cpp index 39a368df53..340455c2d5 100644 --- a/src/models/paddleocr.cpp +++ b/src/models/paddleocr.cpp @@ -5,10 +5,10 @@ llm_build_paddleocr::llm_build_paddleocr(const llama_model & model, const llm_gr // NOTE: same with qwen2vl.cpp, but bias tensors are optional - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); + GGML_ASSERT(n_embd_head == n_rot); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/pangu-embedded.cpp b/src/models/pangu-embedded.cpp index 664572a500..1cf0938e68 100644 --- a/src/models/pangu-embedded.cpp +++ b/src/models/pangu-embedded.cpp @@ -2,10 +2,10 @@ llm_build_pangu_embedded::llm_build_pangu_embedded(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); + GGML_ASSERT(n_embd_head == n_rot); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/phi2.cpp b/src/models/phi2.cpp index 22dbf61076..32d40d71fb 100644 --- a/src/models/phi2.cpp +++ b/src/models/phi2.cpp @@ -2,10 +2,10 @@ llm_build_phi2::llm_build_phi2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); ggml_tensor * cur; ggml_tensor * attn_norm_output; diff --git a/src/models/phi3.cpp b/src/models/phi3.cpp index 803e374aa5..3d11a9459c 100644 --- a/src/models/phi3.cpp +++ b/src/models/phi3.cpp @@ -2,10 +2,10 @@ template llm_build_phi3::llm_build_phi3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/plamo.cpp b/src/models/plamo.cpp index 04ff709f9c..b7a7121104 100644 --- a/src/models/plamo.cpp +++ b/src/models/plamo.cpp @@ -1,10 +1,10 @@ #include "models.h" llm_build_plamo::llm_build_plamo(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); + GGML_ASSERT(n_embd_head == n_rot); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/plamo2.cpp b/src/models/plamo2.cpp index 276d3829b1..f02acbc186 100644 --- a/src/models/plamo2.cpp +++ b/src/models/plamo2.cpp @@ -106,9 +106,9 @@ ggml_tensor * llm_build_plamo2::build_plamo2_attn_layer(llm_graph_input_attn_kv cb(qkv, "wqkv", il); // split QKV tensor into Q, K, V - const int64_t n_embd_head_q = hparams.n_embd_head_k; - const int64_t n_embd_head_k = hparams.n_embd_head_k; - const int64_t n_embd_head_v = hparams.n_embd_head_v; + const int64_t n_embd_head_q = hparams.n_embd_head_k(); + const int64_t n_embd_head_k = hparams.n_embd_head_k(); + const int64_t n_embd_head_v = hparams.n_embd_head_v(); int32_t n_head = hparams.n_head(il); int32_t n_head_kv = hparams.n_head_kv(il); diff --git a/src/models/plamo3.cpp b/src/models/plamo3.cpp index 55c8064679..32af6e0466 100644 --- a/src/models/plamo3.cpp +++ b/src/models/plamo3.cpp @@ -3,8 +3,8 @@ template llm_build_plamo3::llm_build_plamo3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t head_dim_q = hparams.n_embd_head_k; - const int64_t head_dim_v = hparams.n_embd_head_v; + const int64_t head_dim_q = hparams.n_embd_head_k(); + const int64_t head_dim_v = hparams.n_embd_head_v(); ggml_tensor * cur; ggml_tensor * inpL = build_inp_embd(model.tok_embd); diff --git a/src/models/plm.cpp b/src/models/plm.cpp index 612a487c56..bcb651ce54 100644 --- a/src/models/plm.cpp +++ b/src/models/plm.cpp @@ -1,10 +1,10 @@ #include "models.h" llm_build_plm::llm_build_plm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const float kq_scale = 1.0f/sqrtf(float(hparams.n_embd_head_k)); + const float kq_scale = 1.0f/sqrtf(float(hparams.n_embd_head_k())); - const uint32_t n_embd_head_qk_rope = hparams.n_rot; - const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot; + const uint32_t n_embd_head_qk_rope = hparams.n_rot(); + const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k() - hparams.n_rot(); const uint32_t kv_lora_rank = hparams.n_lora_kv; @@ -38,15 +38,15 @@ llm_build_plm::llm_build_plm(const llama_model & model, const llm_graph_params & // split into {n_head * n_embd_head_qk_nope, n_tokens} ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens, - ggml_row_size(q->type, hparams.n_embd_head_k), - ggml_row_size(q->type, hparams.n_embd_head_k * n_head), + ggml_row_size(q->type, hparams.n_embd_head_k()), + ggml_row_size(q->type, hparams.n_embd_head_k() * n_head), 0); cb(q_nope, "q_nope", il); // and {n_head * n_embd_head_qk_rope, n_tokens} ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens, - ggml_row_size(q->type, hparams.n_embd_head_k), - ggml_row_size(q->type, hparams.n_embd_head_k * n_head), + ggml_row_size(q->type, hparams.n_embd_head_k()), + ggml_row_size(q->type, hparams.n_embd_head_k() * n_head), ggml_row_size(q->type, n_embd_head_qk_nope)); cb(q_pe, "q_pe", il); @@ -78,23 +78,23 @@ llm_build_plm::llm_build_plm(const llama_model & model, const llm_graph_params & // split into {n_head * n_embd_head_qk_nope, n_tokens} ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens, - ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v), - ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)), + ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v()), + ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v())), 0); cb(k_nope, "k_nope", il); // and {n_head * n_embd_head_v, n_tokens} - ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens, - ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)), - ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head), + ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v(), n_head, n_tokens, + ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v())), + ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v())*n_head), ggml_row_size(kv->type, (n_embd_head_qk_nope))); cb(v_states, "v_states", il); v_states = ggml_cont(ctx0, v_states); cb(v_states, "v_states", il); - v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens, - ggml_row_size(kv->type, hparams.n_embd_head_v * n_head), + v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v() * n_head, n_tokens, + ggml_row_size(kv->type, hparams.n_embd_head_v() * n_head), 0); cb(v_states, "v_states", il); diff --git a/src/models/qwen.cpp b/src/models/qwen.cpp index 31fd9b7376..7390f1320b 100644 --- a/src/models/qwen.cpp +++ b/src/models/qwen.cpp @@ -2,9 +2,9 @@ llm_build_qwen::llm_build_qwen(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/qwen2.cpp b/src/models/qwen2.cpp index 3da4dea3c1..58c1062250 100644 --- a/src/models/qwen2.cpp +++ b/src/models/qwen2.cpp @@ -1,10 +1,10 @@ #include "models.h" llm_build_qwen2::llm_build_qwen2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); + GGML_ASSERT(n_embd_head == n_rot); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/qwen2moe.cpp b/src/models/qwen2moe.cpp index e19061334c..60761789dc 100644 --- a/src/models/qwen2moe.cpp +++ b/src/models/qwen2moe.cpp @@ -1,10 +1,10 @@ #include "models.h" llm_build_qwen2moe::llm_build_qwen2moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); + GGML_ASSERT(n_embd_head == n_rot); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/qwen2vl.cpp b/src/models/qwen2vl.cpp index 9be38675cf..9004bab9db 100644 --- a/src/models/qwen2vl.cpp +++ b/src/models/qwen2vl.cpp @@ -1,10 +1,10 @@ #include "models.h" llm_build_qwen2vl::llm_build_qwen2vl(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); + GGML_ASSERT(n_embd_head == n_rot); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/qwen3.cpp b/src/models/qwen3.cpp index a5cfffa531..be4811aba1 100644 --- a/src/models/qwen3.cpp +++ b/src/models/qwen3.cpp @@ -1,10 +1,10 @@ #include "models.h" llm_build_qwen3::llm_build_qwen3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); + GGML_ASSERT(n_embd_head == n_rot); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/qwen35.cpp b/src/models/qwen35.cpp index afc5a1aad7..ba096a5a7b 100644 --- a/src/models/qwen35.cpp +++ b/src/models/qwen35.cpp @@ -4,9 +4,9 @@ llm_build_qwen35::llm_build_qwen35(const llama_model & model, const llm_graph_params & params) : llm_build_delta_net_base(params), model(model) { - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); int sections[4]; std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections); @@ -117,8 +117,8 @@ ggml_tensor * llm_build_qwen35::build_layer_attn( ggml_tensor * inp_pos, int * sections, int il) { - const int64_t n_embd_head = hparams.n_embd_head_v; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + const int64_t n_embd_head = hparams.n_embd_head_v(); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); // Order: joint QG projection, QG split, Q norm, KV projection, K norm, RoPE, attention diff --git a/src/models/qwen35moe.cpp b/src/models/qwen35moe.cpp index fe081af212..fe382286e9 100644 --- a/src/models/qwen35moe.cpp +++ b/src/models/qwen35moe.cpp @@ -4,9 +4,9 @@ llm_build_qwen35moe::llm_build_qwen35moe(const llama_model & model, const llm_graph_params & params) : llm_build_delta_net_base(params), model(model) { - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); int sections[4]; std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections); @@ -117,8 +117,8 @@ ggml_tensor * llm_build_qwen35moe ::build_layer_attn( ggml_tensor * inp_pos, int * sections, int il) { - const int64_t n_embd_head = hparams.n_embd_head_v; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + const int64_t n_embd_head = hparams.n_embd_head_v(); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); // Order: joint QG projection, QG split, Q norm, KV projection, K norm, RoPE, attention diff --git a/src/models/qwen3moe.cpp b/src/models/qwen3moe.cpp index 9cf1ec03c6..5912a71582 100644 --- a/src/models/qwen3moe.cpp +++ b/src/models/qwen3moe.cpp @@ -1,10 +1,10 @@ #include "models.h" llm_build_qwen3moe::llm_build_qwen3moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); + GGML_ASSERT(n_embd_head == n_rot); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/qwen3next.cpp b/src/models/qwen3next.cpp index 9b8164ddfa..30912fd5e3 100644 --- a/src/models/qwen3next.cpp +++ b/src/models/qwen3next.cpp @@ -100,8 +100,8 @@ ggml_tensor * llm_build_qwen3next::build_layer_attn( ggml_tensor * cur, ggml_tensor * inp_pos, int il) { - const int64_t n_embd_head = hparams.n_embd_head_v; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + const int64_t n_embd_head = hparams.n_embd_head_v(); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); // Order: joint QG projection, QG split, Q norm, KV projection, K norm, RoPE, attention diff --git a/src/models/qwen3vl-moe.cpp b/src/models/qwen3vl-moe.cpp index f347c5d6fb..195daea66c 100644 --- a/src/models/qwen3vl-moe.cpp +++ b/src/models/qwen3vl-moe.cpp @@ -4,10 +4,10 @@ llm_build_qwen3vlmoe::llm_build_qwen3vlmoe(const llama_model & model, const llm_ const size_t n_deepstack_layers = hparams.n_deepstack_layers; const int64_t n_embd = hparams.n_embd; - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); + GGML_ASSERT(n_embd_head == n_rot); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/qwen3vl.cpp b/src/models/qwen3vl.cpp index 0f8315b324..bbd5f42ba5 100644 --- a/src/models/qwen3vl.cpp +++ b/src/models/qwen3vl.cpp @@ -4,10 +4,10 @@ llm_build_qwen3vl::llm_build_qwen3vl(const llama_model & model, const llm_graph_ const size_t n_deepstack_layers = hparams.n_deepstack_layers; const int64_t n_embd = hparams.n_embd; - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); + GGML_ASSERT(n_embd_head == n_rot); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/refact.cpp b/src/models/refact.cpp index ff5eb2841d..140700d9e2 100644 --- a/src/models/refact.cpp +++ b/src/models/refact.cpp @@ -1,9 +1,9 @@ #include "models.h" llm_build_refact::llm_build_refact(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/rnd1.cpp b/src/models/rnd1.cpp index de9ab1c652..c8e1f43400 100644 --- a/src/models/rnd1.cpp +++ b/src/models/rnd1.cpp @@ -2,10 +2,10 @@ // RND1 is a Qwen3Moe AR model converted to diffusion model. llm_build_rnd1::llm_build_rnd1(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); + GGML_ASSERT(n_embd_head == n_rot); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/seed-oss.cpp b/src/models/seed-oss.cpp index 0dc33c50ba..a4d0b75d84 100644 --- a/src/models/seed-oss.cpp +++ b/src/models/seed-oss.cpp @@ -1,10 +1,10 @@ #include "models.h" llm_build_seed_oss::llm_build_seed_oss(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); + GGML_ASSERT(n_embd_head == n_rot); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/smallthinker.cpp b/src/models/smallthinker.cpp index 8723905e83..e2155aacef 100644 --- a/src/models/smallthinker.cpp +++ b/src/models/smallthinker.cpp @@ -2,10 +2,10 @@ template llm_build_smallthinker::llm_build_smallthinker(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params){ - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); + GGML_ASSERT(n_embd_head == n_rot); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/smollm3.cpp b/src/models/smollm3.cpp index 97c30deed5..e267fd8f32 100644 --- a/src/models/smollm3.cpp +++ b/src/models/smollm3.cpp @@ -1,10 +1,10 @@ #include "models.h" llm_build_smollm3::llm_build_smollm3(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); + GGML_ASSERT(n_embd_head == n_rot); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/stablelm.cpp b/src/models/stablelm.cpp index bed1915c00..ff5aced93b 100644 --- a/src/models/stablelm.cpp +++ b/src/models/stablelm.cpp @@ -1,9 +1,9 @@ #include "models.h" llm_build_stablelm::llm_build_stablelm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/starcoder.cpp b/src/models/starcoder.cpp index e197af4a8c..941cee9821 100644 --- a/src/models/starcoder.cpp +++ b/src/models/starcoder.cpp @@ -1,10 +1,10 @@ #include "models.h" llm_build_starcoder::llm_build_starcoder(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/starcoder2.cpp b/src/models/starcoder2.cpp index e40ef2cb74..a5965aceb3 100644 --- a/src/models/starcoder2.cpp +++ b/src/models/starcoder2.cpp @@ -1,10 +1,10 @@ #include "models.h" llm_build_starcoder2::llm_build_starcoder2(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); + GGML_ASSERT(n_embd_head == n_rot); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/step35-iswa.cpp b/src/models/step35-iswa.cpp index aa8e98b737..176209cd93 100644 --- a/src/models/step35-iswa.cpp +++ b/src/models/step35-iswa.cpp @@ -52,7 +52,7 @@ llm_build_step35_iswa::llm_build_step35_iswa(const llama_model & model, const ll // RoPE (partial rotary factors per layer) const bool is_swa = hparams.is_swa(il); ggml_tensor * rope_factors = is_swa ? nullptr : model.get_rope_factors(cparams, il); - const int64_t n_rot_l = is_swa ? hparams.n_rot : (hparams.n_rot / 2); + const int64_t n_rot_l = hparams.n_rot(il); Qcur = ggml_rope_ext( ctx0, Qcur, inp_pos, rope_factors, n_rot_l, rope_type, n_ctx_orig, freq_base_l, freq_scale_l, diff --git a/src/models/t5-dec.cpp b/src/models/t5-dec.cpp index 297e450de7..8ca8372bd4 100644 --- a/src/models/t5-dec.cpp +++ b/src/models/t5-dec.cpp @@ -1,10 +1,10 @@ #include "models.h" llm_build_t5_dec::llm_build_t5_dec(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); //const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/t5-enc.cpp b/src/models/t5-enc.cpp index 70e1d80dcd..395dfb5104 100644 --- a/src/models/t5-enc.cpp +++ b/src/models/t5-enc.cpp @@ -1,9 +1,9 @@ #include "models.h" llm_build_t5_enc::llm_build_t5_enc(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); ggml_tensor * cur; ggml_tensor * inpL; diff --git a/src/models/xverse.cpp b/src/models/xverse.cpp index 364797dd31..3a8dfafcce 100644 --- a/src/models/xverse.cpp +++ b/src/models/xverse.cpp @@ -1,10 +1,10 @@ #include "models.h" llm_build_xverse::llm_build_xverse(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_head = hparams.n_embd_head_v(); - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - GGML_ASSERT(n_embd_head == hparams.n_rot); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); + GGML_ASSERT(n_embd_head == n_rot); ggml_tensor * cur; ggml_tensor * inpL;