refactor: simplify VaetkiModel set_gguf_parameters

suhyun-hwang 2026-01-13 23:52:51 +09:00
parent ab233049dc
commit 487909ae0e
1 changed file with 12 additions and 26 deletions


@@ -7676,12 +7676,11 @@ class VaetkiModel(TextModel):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Flatten text_config parameters to top level
        if "text_config" in self.hparams:
            text_config = self.hparams["text_config"]
            for key, value in text_config.items():
                if key not in self.hparams:
                    self.hparams[key] = value
        # Set rope_parameters for hybrid attention (transformers 5.0 format)
        self.rope_parameters = {
            "full_attention": {"rope_theta": self.hparams.get("rope_theta_global", 1000000.0)},
            "sliding_attention": {"rope_theta": self.hparams.get("rope_theta", 10000.0)}
        }

    def set_vocab(self):
        # VAETKI uses Metaspace-based BPE tokenizer, load vocab from tokenizer.json
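
Note on the text_config flattening above: a minimal standalone sketch of the copy-if-absent behavior, using a made-up config dict (the values are illustrative only, not taken from any real checkpoint):

# Nested text_config keys are hoisted to the top level only when the key
# is not already present there, so explicit top-level values take precedence.
hparams = {
    "rope_theta": 10000.0,      # already at top level, kept as-is
    "text_config": {
        "hidden_size": 4096,    # copied up
        "rope_theta": 999.0,    # ignored: top-level key wins
    },
}
if "text_config" in hparams:
    for key, value in hparams["text_config"].items():
        if key not in hparams:
            hparams[key] = value
assert hparams["hidden_size"] == 4096
assert hparams["rope_theta"] == 10000.0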
@@ -7765,15 +7764,9 @@ class VaetkiModel(TextModel):
        super().set_gguf_parameters()
        hparams = self.hparams
        self.gguf_writer.add_block_count(hparams["num_hidden_layers"])
        self.gguf_writer.add_context_length(hparams.get("max_position_embeddings", 32768))
        self.gguf_writer.add_embedding_length(hparams["hidden_size"])
        self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
        self.gguf_writer.add_head_count(hparams["num_attention_heads"])
        # For MLA without absorption, n_head_kv = n_head (full MHA after decompression)
        self.gguf_writer.add_head_count_kv(hparams["num_attention_heads"])
        self.gguf_writer.add_layer_norm_rms_eps(hparams.get("rms_norm_eps", 1e-5))
        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
        # MLA parameters (like DeepSeek2)
        self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"])
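
Why n_head_kv equals n_head here: without MLA absorption the compressed KV latent is decompressed into full per-head K and V at conversion time, so attention runs as plain MHA. A toy sketch of the arithmetic, with hypothetical head dimensions (not read from the model config):

# Hypothetical MLA-style dimensions, for illustration only.
num_attention_heads = 32
qk_nope_head_dim = 128
qk_rope_head_dim = 64
v_head_dim = 128
kv_lora_rank = 512                  # compressed KV latent width per token

n_head_kv = num_attention_heads     # full MHA after decompression

# Per-token KV cache size (element counts, ignoring dtype):
decompressed = n_head_kv * (qk_nope_head_dim + qk_rope_head_dim + v_head_dim)
latent = kv_lora_rank + qk_rope_head_dim   # what absorbed MLA would cache instead
print(decompressed, latent)                # 10240 vs. 576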
@@ -7789,25 +7782,18 @@ class VaetkiModel(TextModel):
        self.gguf_writer.add_value_length_mla(hparams["v_head_dim"])
        self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
        # VAETKI uses hybrid attention with different rope_theta per layer type:
        # - sliding_attention layers use rope_theta (local, default 10000.0)
        # - full_attention layers use rope_theta_global (global, default 1000000.0)
        # In llama.cpp: rope_freq_base is for non-SWA (full), rope_freq_base_swa is for SWA (sliding)
        rope_theta_local = hparams.get("rope_theta", 10000.0)
        rope_theta_global = hparams.get("rope_theta_global", 1000000.0)
        self.gguf_writer.add_rope_freq_base(rope_theta_global)  # for full_attention layers
        self.gguf_writer.add_rope_freq_base_swa(rope_theta_local)  # for sliding_attention layers
        self.rope_parameters = {
            "full_attention": {"rope_theta": self.hparams.get("rope_theta_global", 1000000.0)},
            "sliding_attention": {"rope_theta": self.hparams.get("rope_theta", 10000.0)}
        }
        # MoE parameters
        self.gguf_writer.add_leading_dense_block_count(hparams.get("first_k_dense_replace", 1))
        self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
        self.gguf_writer.add_expert_used_count(hparams["num_experts_per_tok"])
        self.gguf_writer.add_expert_shared_count(hparams.get("n_shared_experts", 1))
        self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
        self.gguf_writer.add_expert_weights_scale(hparams.get("routed_scaling_factor", 1.0))
        # VAETKI uses sigmoid gating function (WBLTopkRouter uses router_logits.sigmoid())
        self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
        # Normalize top-k probabilities (norm_topk_prob=true in config)
        if (routed_scale := hparams.get("routed_scaling_factor")) is not None:
            self.gguf_writer.add_expert_weights_scale(routed_scale)
        if hparams.get("norm_topk_prob", False):
            self.gguf_writer.add_expert_weights_norm(True)
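
Illustrative sketch of the routing behaviour the MoE metadata above encodes: sigmoid gating, top-k expert selection, optional normalization of the selected weights (norm_topk_prob), and scaling by routed_scaling_factor. This is a toy router written for clarity, not the model's actual WBLTopkRouter:

import torch

def route(router_logits: torch.Tensor, top_k: int,
          norm_topk_prob: bool, routed_scaling_factor: float):
    scores = router_logits.sigmoid()                     # expert_gating_func = SIGMOID
    topk_scores, topk_idx = scores.topk(top_k, dim=-1)   # expert_used_count = top_k
    if norm_topk_prob:                                    # expert_weights_norm
        topk_scores = topk_scores / topk_scores.sum(dim=-1, keepdim=True)
    topk_scores = topk_scores * routed_scaling_factor     # expert_weights_scale
    return topk_scores, topk_idx

# Example: 2 tokens routed over 8 experts, 2 experts per token.
weights, experts = route(torch.randn(2, 8), top_k=2,
                         norm_topk_prob=True, routed_scaling_factor=1.0)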