conversion now working for swa pattern - dense every n layers
parent b66c2fd1cd
commit a9441fb70b

@@ -9343,6 +9343,7 @@ class ModernBertModel(BertModel):
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
+        self.gguf_writer.add_dense_every_n_layers(self.hparams["global_attn_every_n_layers"])
         self.gguf_writer.add_sliding_window(self.hparams["local_attention"])
         self.gguf_writer.add_rope_freq_base(self.hparams["global_rope_theta"])
         self.gguf_writer.add_rope_freq_base_swa(self.hparams["local_rope_theta"])

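For reference, a minimal Python sketch of the config-to-metadata mapping this hunk performs. The ModernBERT config values shown (3 / 128 / 160000 / 10000) are illustrative assumptions, not values taken from this commit.

```python
# Sketch of the HF config -> GGUF metadata mapping done by set_gguf_parameters above.
hparams = {
    "global_attn_every_n_layers": 3,    # every 3rd layer uses global (dense) attention
    "local_attention": 128,             # sliding-window size for the remaining layers
    "global_rope_theta": 160000.0,      # RoPE base for global layers
    "local_rope_theta": 10000.0,        # RoPE base for sliding-window layers
}

gguf_kv = {
    "dense_every_n_layers": hparams["global_attn_every_n_layers"],
    "sliding_window":       hparams["local_attention"],
    "rope_freq_base":       hparams["global_rope_theta"],
    "rope_freq_base_swa":   hparams["local_rope_theta"],
}
print(gguf_kv)
```
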
@@ -160,6 +160,7 @@ class Keys:
         VALUE_LENGTH_MLA = "{arch}.attention.value_length_mla"
         SHARED_KV_LAYERS = "{arch}.attention.shared_kv_layers"
         SLIDING_WINDOW_PATTERN = "{arch}.attention.sliding_window_pattern"
+        DENSE_EVERY_N_LAYERS = "{arch}.attention.dense_every_n_layers"

     class Rope:
         DIMENSION_COUNT = "{arch}.rope.dimension_count"

@@ -729,6 +729,9 @@ class GGUFWriter:
     def add_sliding_window_pattern(self, value: Sequence[bool]) -> None:
         self.add_array(Keys.Attention.SLIDING_WINDOW_PATTERN.format(arch=self.arch), value)

+    def add_dense_every_n_layers(self, value: int) -> None:
+        self.add_uint32(Keys.Attention.DENSE_EVERY_N_LAYERS.format(arch=self.arch), value)
+
     def add_dense_features_dims(self, dense:str, in_f:int, out_f:int) -> None:
         self.add_uint32(Keys.LLM.DENSE_FEAT_IN_SIZE.format(arch=self.arch, dense=dense), in_f)

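A minimal stand-in sketch (not the real GGUFWriter) of what the new helper does: format the per-architecture key and record a uint32 value. The "modern-bert" architecture string is an assumption used only for illustration.

```python
# Sketch only: mimics how add_dense_every_n_layers resolves the key template
# and stores the value; the real implementation calls self.add_uint32(...).
DENSE_EVERY_N_LAYERS = "{arch}.attention.dense_every_n_layers"

class SketchWriter:
    def __init__(self, arch: str):
        self.arch = arch
        self.kv: dict[str, int] = {}

    def add_dense_every_n_layers(self, value: int) -> None:
        self.kv[DENSE_EVERY_N_LAYERS.format(arch=self.arch)] = value

w = SketchWriter("modern-bert")   # arch string is illustrative
w.add_dense_every_n_layers(3)
print(w.kv)  # {'modern-bert.attention.dense_every_n_layers': 3}
```
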
@@ -181,6 +181,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_GATE_LORA_RANK, "%s.attention.gate_lora_rank" },
     { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" },
     { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" },
+    { LLM_KV_ATTENTION_DENSE_EVERY_N_LAYERS, "%s.attention.dense_every_n_layers" },
     { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
     { LLM_KV_ATTENTION_OUTPUT_SCALE, "%s.attention.output_scale" },
     { LLM_KV_ATTENTION_TEMPERATURE_LENGTH, "%s.attention.temperature_length" },

@@ -165,6 +165,7 @@ enum llm_kv {
     LLM_KV_EMBEDDING_SCALE,
     LLM_KV_TOKEN_SHIFT_COUNT,
     LLM_KV_INTERLEAVE_MOE_LAYER_STEP,
+    LLM_KV_DENSE_EVERY_N_LAYERS,

     LLM_KV_ATTENTION_HEAD_COUNT,
     LLM_KV_ATTENTION_HEAD_COUNT_KV,

@@ -185,6 +186,7 @@ enum llm_kv {
     LLM_KV_ATTENTION_GATE_LORA_RANK,
     LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT,
     LLM_KV_ATTENTION_SLIDING_WINDOW,
+    LLM_KV_ATTENTION_DENSE_EVERY_N_LAYERS,
     LLM_KV_ATTENTION_SCALE,
     LLM_KV_ATTENTION_OUTPUT_SCALE,
     LLM_KV_ATTENTION_TEMPERATURE_LENGTH,

@@ -121,6 +121,7 @@ struct llama_hparams {
     llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
     // the size of the sliding window (0 - no SWA)
     uint32_t n_swa = 0;
+    uint32_t n_swa_pattern = 1;
     // if swa_layers[il] == true, then layer il is SWA
     // if swa_layers[il] == false, then layer il is dense (i.e. non-SWA)
     // by default, all layers are dense

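A sketch of what n_swa_pattern encodes, following the convention used later in this commit (layer il is dense/global when il % n_swa_pattern == 0): the default of 1 keeps every layer dense, matching the comment above, while a value of 3 gives the ModernBERT-style interleaving.

```python
# Sketch: which layers end up with sliding-window vs dense attention for a
# given n_swa_pattern. True = SWA layer, False = dense (global) layer.
def swa_layers(n_layer: int, n_swa_pattern: int) -> list[bool]:
    return [il % n_swa_pattern != 0 for il in range(n_layer)]

print(swa_layers(12, 1))  # default pattern 1 -> all layers dense
print(swa_layers(12, 3))  # dense every 3rd layer: 0, 3, 6, 9 dense, the rest SWA
```
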
@@ -182,6 +182,7 @@ void llama_model_saver::add_kv_from_model() {
     add_kv(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
     add_kv(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);
     add_kv(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
+    add_kv(LLM_KV_ATTENTION_DENSE_EVERY_N_LAYERS, hparams.n_swa_pattern);
     add_kv(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale);

     const float rope_scaling_factor = hparams.rope_freq_scale_train == 1.0f ? 0.0f : 1.0f/hparams.rope_freq_scale_train;

@@ -840,13 +840,15 @@ void llama_model::load_hparams(llama_model_loader & ml) {
             {
                 hparams.swa_type = LLAMA_SWA_TYPE_SYMMETRIC;
-                hparams.set_swa_pattern(3);
-
-                ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
-                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
-                ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
-                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
+
+                ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa);
+                ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
+                ml.get_key(LLM_KV_ATTENTION_DENSE_EVERY_N_LAYERS, hparams.n_swa_pattern);
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+                ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
+                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
+
+                hparams.set_swa_pattern(hparams.n_swa_pattern);

                 switch (hparams.n_layer) {
                     case 12:

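A Python stand-in contrasting the old and new behaviour of this block: the pattern was previously hard-coded to 3 and is now read from the dense_every_n_layers metadata before set_swa_pattern is applied. The key name is shown without its architecture prefix for brevity; this mirrors the control flow above, not the C++ API.

```python
# Sketch only; mirrors the load-time ordering, not the llama.cpp implementation.
def swa_pattern_old(kv: dict) -> int:
    return 3                                        # previously hard-coded for ModernBERT

def swa_pattern_new(kv: dict) -> int:
    return kv["attention.dense_every_n_layers"]     # now taken from the GGUF metadata

kv = {"attention.dense_every_n_layers": 3}
assert swa_pattern_old(kv) == swa_pattern_new(kv)   # same result for stock ModernBERT,
                                                    # but the new path follows the converted model
```
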
@@ -8204,6 +8206,7 @@ struct llm_build_modern_bert : public llm_graph_context {
         const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
         const float rope_theta_global = hparams.rope_freq_base_train;
         const float rope_theta_local = hparams.rope_freq_base_train_swa;
+        const uint32_t n_swa_pattern = hparams.n_swa_pattern;

         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);

@@ -8230,7 +8233,7 @@ struct llm_build_modern_bert : public llm_graph_context {
             ggml_tensor * Kcur = nullptr;
             ggml_tensor * Vcur = nullptr;

-            const float rope_theta = (il % 3 == 0) ? rope_theta_global : rope_theta_local;
+            const float rope_theta = (il % n_swa_pattern == 0) ? rope_theta_global : rope_theta_local;

            // attention layer norm

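The same change seen from the per-layer RoPE base selection: the divisor is no longer the literal 3 but the n_swa_pattern loaded from the model metadata. A small sketch with illustrative theta values (160000 / 10000 are assumptions, not from this commit).

```python
# Sketch of the per-layer RoPE base selection after this change.
def rope_theta_for_layer(il: int, n_swa_pattern: int,
                         theta_global: float, theta_local: float) -> float:
    return theta_global if il % n_swa_pattern == 0 else theta_local

# illustrative values: 12 layers, dense (global) attention every 3rd layer
thetas = [rope_theta_for_layer(il, 3, 160000.0, 10000.0) for il in range(12)]
print(thetas)  # layers 0, 3, 6, 9 -> 160000.0 (global base), others -> 10000.0 (local base)
```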