conversion now working for the SWA pattern (dense attention every N layers)

ryan-mangeno 2025-11-29 11:04:11 -05:00
parent b66c2fd1cd
commit a9441fb70b
8 changed files with 20 additions and 7 deletions

View File

@@ -9343,6 +9343,7 @@ class ModernBertModel(BertModel):
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
+        self.gguf_writer.add_dense_every_n_layers(self.hparams["global_attn_every_n_layers"])
         self.gguf_writer.add_sliding_window(self.hparams["local_attention"])
         self.gguf_writer.add_rope_freq_base(self.hparams["global_rope_theta"])
         self.gguf_writer.add_rope_freq_base_swa(self.hparams["local_rope_theta"])
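For orientation, here is a minimal sketch of the mapping these calls produce. The config values are typical ModernBERT-style settings and the "{arch}"-prefixed RoPE key strings follow the usual gguf-py naming; both are assumptions for illustration, not part of this commit.

# Hypothetical illustration only: HF config fields read above and the GGUF
# metadata keys they end up in. Values are assumed, not taken from this diff.
hf_config = {
    "global_attn_every_n_layers": 3,        # every 3rd layer uses dense (global) attention
    "local_attention":            128,      # sliding-window size for the remaining layers
    "global_rope_theta":          160000.0, # RoPE base for dense layers
    "local_rope_theta":           10000.0,  # RoPE base for SWA layers
}

gguf_metadata = {
    "{arch}.attention.dense_every_n_layers": hf_config["global_attn_every_n_layers"],
    "{arch}.attention.sliding_window":       hf_config["local_attention"],
    "{arch}.rope.freq_base":                 hf_config["global_rope_theta"],
    "{arch}.rope.freq_base_swa":             hf_config["local_rope_theta"],
}
print(gguf_metadata)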

View File

@@ -160,6 +160,7 @@ class Keys:
         VALUE_LENGTH_MLA = "{arch}.attention.value_length_mla"
         SHARED_KV_LAYERS = "{arch}.attention.shared_kv_layers"
         SLIDING_WINDOW_PATTERN = "{arch}.attention.sliding_window_pattern"
+        DENSE_EVERY_N_LAYERS = "{arch}.attention.dense_every_n_layers"

     class Rope:
         DIMENSION_COUNT = "{arch}.rope.dimension_count"

View File

@@ -729,6 +729,9 @@ class GGUFWriter:
     def add_sliding_window_pattern(self, value: Sequence[bool]) -> None:
         self.add_array(Keys.Attention.SLIDING_WINDOW_PATTERN.format(arch=self.arch), value)

+    def add_dense_every_n_layers(self, value: int) -> None:
+        self.add_uint32(Keys.Attention.DENSE_EVERY_N_LAYERS.format(arch=self.arch), value)
+
     def add_dense_features_dims(self, dense:str, in_f:int, out_f:int) -> None:
         self.add_uint32(Keys.LLM.DENSE_FEAT_IN_SIZE.format(arch=self.arch, dense=dense), in_f)
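A minimal usage sketch for the new writer method, assuming the gguf-py package in this repo; the arch string "modern-bert" and the output path are hypothetical, and the full header/tensor write flow is omitted.

# Hypothetical usage sketch (not part of the commit).
from gguf import GGUFWriter
from gguf.constants import Keys

writer = GGUFWriter("model.gguf", "modern-bert")  # arch string is an assumption
writer.add_dense_every_n_layers(3)
# Equivalent to writing a uint32 under the per-arch key:
#   Keys.Attention.DENSE_EVERY_N_LAYERS.format(arch="modern-bert")
#   -> "modern-bert.attention.dense_every_n_layers"
print(Keys.Attention.DENSE_EVERY_N_LAYERS.format(arch="modern-bert"))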

View File

@@ -181,6 +181,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_GATE_LORA_RANK, "%s.attention.gate_lora_rank" },
     { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
     { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
+    { LLM_KV_ATTENTION_DENSE_EVERY_N_LAYERS, "%s.attention.dense_every_n_layers" },
     { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
     { LLM_KV_ATTENTION_OUTPUT_SCALE, "%s.attention.output_scale" },
     { LLM_KV_ATTENTION_TEMPERATURE_LENGTH, "%s.attention.temperature_length" },

View File

@@ -165,6 +165,7 @@ enum llm_kv {
     LLM_KV_EMBEDDING_SCALE,
     LLM_KV_TOKEN_SHIFT_COUNT,
     LLM_KV_INTERLEAVE_MOE_LAYER_STEP,
+    LLM_KV_DENSE_EVERY_N_LAYERS,

     LLM_KV_ATTENTION_HEAD_COUNT,
     LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -185,6 +186,7 @@ enum llm_kv {
     LLM_KV_ATTENTION_GATE_LORA_RANK,
     LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
     LLM_KV_ATTENTION_SLIDING_WINDOW,
+    LLM_KV_ATTENTION_DENSE_EVERY_N_LAYERS,
     LLM_KV_ATTENTION_SCALE,
     LLM_KV_ATTENTION_OUTPUT_SCALE,
     LLM_KV_ATTENTION_TEMPERATURE_LENGTH,

View File

@@ -121,6 +121,7 @@ struct llama_hparams {
     llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;

     // the size of the sliding window (0 - no SWA)
     uint32_t n_swa = 0;
+    uint32_t n_swa_pattern = 1;
     // if swa_layers[il] == true, then layer il is SWA
     // if swa_layers[il] == false, then layer il is dense (i.e. non-SWA)
     // by default, all layers are dense

View File

@@ -182,6 +182,7 @@ void llama_model_saver::add_kv_from_model() {
     add_kv(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
     add_kv(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);
     add_kv(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
+    add_kv(LLM_KV_ATTENTION_DENSE_EVERY_N_LAYERS, hparams.n_swa_pattern);
     add_kv(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale);

     const float rope_scaling_factor = hparams.rope_freq_scale_train == 1.0f ? 0.0f : 1.0f/hparams.rope_freq_scale_train;

View File

@@ -840,13 +840,15 @@ void llama_model::load_hparams(llama_model_loader & ml) {
            {
                hparams.swa_type = LLAMA_SWA_TYPE_SYMMETRIC;
-               hparams.set_swa_pattern(3);
-               ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
-               ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
-               ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
-               ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
-               ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
+               ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
+               ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
+               ml.get_key(LLM_KV_ATTENTION_DENSE_EVERY_N_LAYERS, hparams.n_swa_pattern);
+               ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+               ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+               ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
+
+               hparams.set_swa_pattern(hparams.n_swa_pattern);

                switch (hparams.n_layer) {
                    case 12:
@@ -8204,6 +8206,7 @@ struct llm_build_modern_bert : public llm_graph_context {
        const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
        const float rope_theta_global = hparams.rope_freq_base_train;
        const float rope_theta_local = hparams.rope_freq_base_train_swa;
+       const uint32_t n_swa_pattern = hparams.n_swa_pattern;

        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
@@ -8230,7 +8233,7 @@ struct llm_build_modern_bert : public llm_graph_context {
            ggml_tensor * Kcur = nullptr;
            ggml_tensor * Vcur = nullptr;

-           const float rope_theta = (il % 3 == 0) ? rope_theta_global : rope_theta_local;
+           const float rope_theta = (il % n_swa_pattern == 0) ? rope_theta_global : rope_theta_local;

            // attention layer norm
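To make the generalized check concrete, here is a small sketch of the resulting per-layer schedule. It mirrors the "il % n_swa_pattern == 0" convention above (a layer is dense when the remainder is zero); the layer count and RoPE bases are illustrative, and this is not a transcription of set_swa_pattern.

# Hypothetical sketch: which layers are dense (global attention, global RoPE base)
# vs. SWA (local attention, local RoPE base) for n_swa_pattern = 3, n_layer = 12.
def layer_plan(n_layer: int, n_swa_pattern: int,
               theta_global: float, theta_local: float) -> list[tuple[str, float]]:
    # Mirrors the check in llm_build_modern_bert: layer il is dense when
    # il % n_swa_pattern == 0, otherwise it uses the sliding window.
    return [("dense", theta_global) if il % n_swa_pattern == 0 else ("swa", theta_local)
            for il in range(n_layer)]

for il, (kind, theta) in enumerate(layer_plan(12, 3, 160000.0, 10000.0)):
    print(f"layer {il:2d}: {kind:5s} rope_theta = {theta}")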