Sigbjørn Skjæret 2026-01-02 21:34:17 +00:00 committed by GitHub
commit 87a9297cb8
3 changed files with 44 additions and 17 deletions


@@ -771,9 +771,14 @@ class TextModel(ModelBase):
self.rope_parameters = self.hparams.get("rope_parameters", self.hparams.get("rope_scaling")) or {}
rope_theta = self.find_hparam(["rope_theta", "global_rope_theta", "rotary_emb_base"], optional=True)
local_rope_theta = self.find_hparam(["local_rope_theta", "rope_local_theta", "swa_rope_theta", "rope_local_base_freq"], optional=True)
# Ensure "rope_theta" and "rope_type" are mirrored in rope_parameters
if "full_attention" not in self.rope_parameters and "sliding_attention" not in self.rope_parameters:
if "rope_theta" not in self.rope_parameters and (rope_theta := self.find_hparam(["rope_theta", "global_rope_theta", "rotary_emb_base"], optional=True)) is not None:
if local_rope_theta is not None:
self.rope_parameters["sliding_attention"] = {"rope_theta": local_rope_theta}
if "rope_theta" not in self.rope_parameters and rope_theta is not None:
self.rope_parameters["rope_theta"] = rope_theta
if "rope_type" not in self.rope_parameters and (rope_type := self.rope_parameters.get("type")) is not None:
self.rope_parameters["rope_type"] = rope_type
@@ -885,6 +890,9 @@ class TextModel(ModelBase):
if (rope_theta := rope_params.get("rope_theta")) is not None:
self.gguf_writer.add_rope_freq_base(rope_theta)
logger.info(f"gguf: rope theta = {rope_theta}")
if (local_rope_theta := self.rope_parameters.get("sliding_attention", {}).get("rope_theta")) is not None:
self.gguf_writer.add_rope_freq_base_swa(local_rope_theta)
logger.info(f"gguf: rope theta swa = {local_rope_theta}")
if (f_rms_eps := self.find_hparam(["rms_norm_eps", "norm_eps"], optional=True)) is not None:
self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps)
logger.info(f"gguf: rms norm epsilon = {f_rms_eps}")
@@ -5004,7 +5012,6 @@ class Plamo3Model(TextModel):
if (sliding_window := self.find_hparam(["window_size", "sliding_window"], optional=True)) is not None:
self.gguf_writer.add_sliding_window(sliding_window)
self.gguf_writer.add_sliding_window_pattern(self.hparams["sliding_window_pattern"])
self.gguf_writer.add_rope_freq_base_swa(self.rope_parameters.get("sliding_attention", {"rope_theta": self.hparams.get("rope_local_theta")})["rope_theta"])
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
@@ -7480,7 +7487,6 @@ class MimoV2Model(TextModel):
self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
self.gguf_writer.add_sliding_window_pattern(self.hparams["hybrid_layer_pattern"])
self.gguf_writer.add_rope_freq_base_swa(self.hparams["swa_rope_theta"])
self.gguf_writer.add_value_length(self.hparams["v_head_dim"])
self.gguf_writer.add_expert_count(self.hparams["n_routed_experts"])
self.gguf_writer.add_expert_feed_forward_length(self.hparams["moe_intermediate_size"])
@@ -10218,7 +10224,6 @@ class ModernBertModel(BertModel):
self.gguf_writer.add_sliding_window(self.hparams["local_attention"])
if (sliding_window_pattern := self.hparams.get("global_attn_every_n_layers")) is not None:
self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern)
self.gguf_writer.add_rope_freq_base_swa(self.rope_parameters.get("sliding_attention", {"rope_theta": self.hparams.get("local_rope_theta")})["rope_theta"])
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
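The three removals above lean on the base-class change at the top of the file: each model's local theta hyperparameter name already appears in the generic lookup list, so the same sliding_attention entry is produced without per-model code. Checking with the mirror_rope_parameters sketch from earlier (assumed configs, illustrative values):

for cfg in ({"rope_theta": 10000.0, "rope_local_theta": 1000000.0},   # Plamo3-style key
            {"rope_theta": 640000.0, "swa_rope_theta": 10000.0},      # MimoV2-style key
            {"rope_theta": 160000.0, "local_rope_theta": 10000.0}):   # ModernBERT-style key
    print(mirror_rope_parameters(cfg)["sliding_attention"])
# each prints a {"rope_theta": ...} entry holding that model's local value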


@@ -105,9 +105,9 @@ struct llama_hparams {
float rope_attn_factor = 1.0f;
float rope_freq_base_train;
float rope_freq_base_train_swa;
float rope_freq_base_train_swa = 10000.0f;
float rope_freq_scale_train;
float rope_freq_scale_train_swa;
float rope_freq_scale_train_swa = 1.0f;
uint32_t n_ctx_orig_yarn;
float rope_yarn_log_mul = 0.0f;


@@ -586,10 +586,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
}
hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;
// by default assume that the sliding-window layers use the same scaling type as the non-sliding-window layers
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
ml.get_key(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor, false);
// non-transformer models do not have attention heads
@@ -677,6 +673,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
hparams.f_attn_temp_scale = 0.1f;
hparams.f_attn_temp_offset = 1.0f;
hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
}
switch (hparams.n_expert) {
@@ -722,6 +722,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
if (hparams.n_swa > 0) {
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
hparams.set_swa_pattern(4);
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
} else {
hparams.swa_type = LLAMA_SWA_TYPE_NONE;
}
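The same three-line pattern recurs throughout this file: seed the SWA values from the full-attention training values, then let an optional metadata key override only the base frequency. A Python sketch of that precedence (not the C++ loader; "rope.freq_base_swa" is an assumed stand-in for whatever key LLM_KV_ROPE_FREQ_BASE_SWA maps to):

def resolve_swa_rope(gguf_kv: dict, freq_base_train: float, freq_scale_train: float):
    # llama_hparams now seeds these fields with 10000.0 / 1.0; architectures that
    # enable SWA immediately overwrite them with the full-attention values ...
    freq_base_swa  = freq_base_train
    freq_scale_swa = freq_scale_train
    # ... and then accept an optional per-model override for the base frequency only,
    # mirroring ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false)
    freq_base_swa = gguf_kv.get("rope.freq_base_swa", freq_base_swa)
    return freq_base_swa, freq_scale_swa

print(resolve_swa_rope({}, 160000.0, 1.0))                               # (160000.0, 1.0)
print(resolve_swa_rope({"rope.freq_base_swa": 10000.0}, 160000.0, 1.0))  # (10000.0, 1.0)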
@@ -1243,7 +1247,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
if (found_swa && hparams.n_swa > 0) {
uint32_t swa_period = 8;
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
hparams.rope_freq_scale_train_swa = 1.0f;
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false);
hparams.set_swa_pattern(swa_period);
@@ -1309,7 +1312,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
hparams.n_swa = 4096; // default value of gemma 2
hparams.set_swa_pattern(2);
hparams.attn_soft_cap = true;
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false);
@@ -1334,8 +1340,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
hparams.set_swa_pattern(6);
hparams.rope_freq_base_train_swa = 10000.0f;
hparams.rope_freq_scale_train_swa = 1.0f;
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
} else {
hparams.swa_type = LLAMA_SWA_TYPE_NONE;
}
@@ -1365,10 +1370,9 @@ void llama_model::load_hparams(llama_model_loader & ml) {
hparams.set_swa_pattern(5);
hparams.n_layer_kv_from_start = 20;
hparams.rope_freq_base_train_swa = 10000.0f;
hparams.rope_freq_scale_train_swa = 1.0f;
hparams.f_attention_scale = 1.0f;
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -1384,9 +1388,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
hparams.set_swa_pattern(6);
hparams.causal_attn = false; // embeddings do not use causal attention
hparams.rope_freq_base_train_swa = 10000.0f;
hparams.rope_freq_scale_train_swa = 1.0f;
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
@@ -1525,7 +1528,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
{
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
hparams.set_swa_pattern(4);
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
@@ -1564,6 +1570,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
if (found_swa && hparams.n_swa > 0) {
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
hparams.set_swa_pattern(4);
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
} else {
hparams.swa_type = LLAMA_SWA_TYPE_NONE;
}
@@ -1906,6 +1916,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
hparams.n_swa = 4096;
hparams.set_swa_pattern(4);
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
}
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
@@ -2208,6 +2222,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
hparams.set_swa_pattern(2);
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
switch (hparams.n_layer) {
case 24: type = LLM_TYPE_20B; break;
case 36: type = LLM_TYPE_120B; break;
@@ -2252,6 +2270,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
hparams.n_swa = 4096;
hparams.set_swa_pattern(4, true);
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
} else {
hparams.swa_type = LLAMA_SWA_TYPE_NONE;
hparams.n_no_rope_layer_step = hparams.n_layer;