From da0604a5487ffd830ec85616f0c6005462d8a913 Mon Sep 17 00:00:00 2001 From: ryan-mangeno Date: Fri, 12 Sep 2025 16:50:15 -0400 Subject: [PATCH] fix alternating RoPE: hparams.rope_freq_base_train and hparams.rope_freq_base_train_swa were identical, so set them to the correct values --- src/llama-model.cpp | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 7a05491868..448c7320ac 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -763,7 +763,9 @@ void llama_model::load_hparams(llama_model_loader & ml) { hparams.swa_type = LLAMA_SWA_TYPE_LOCAL; hparams.set_swa_pattern(3, 0); - hparams.n_swa = 128; + hparams.rope_freq_base_train_swa = 10000.f; + hparams.rope_freq_base_train = 160000.f; + hparams.n_swa = 128; ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); @@ -7553,8 +7555,10 @@ struct llm_build_bert : public llm_graph_context { template struct llm_build_modern_bert : public llm_graph_context { llm_build_modern_bert(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - const int64_t n_embd_head = hparams.n_embd_head_v; - const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + const float rope_theta_global = hparams.rope_freq_base_train; + const float rope_theta_local = hparams.rope_freq_base_train_swa; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); @@ -7580,7 +7584,7 @@ struct llm_build_modern_bert : public llm_graph_context { ggml_tensor * Kcur; ggml_tensor * Vcur; - float rope_theta = il % 3 == 0 ? hparams.rope_freq_base_train : hparams.rope_freq_base_train_swa; + const float rope_theta = il % 3 == 0 ? rope_theta_global : rope_theta_local; // attention layer norm if (model.layers[il].attn_norm) {