Set safe defaults for unknown SWA RoPE hyperparameters (fall back to the non-SWA training values)
This commit is contained in:
parent
3039150973
commit
fb1aa17d86
|
|
@ -673,6 +673,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||||
hparams.f_attn_temp_scale = 0.1f;
|
hparams.f_attn_temp_scale = 0.1f;
|
||||||
hparams.f_attn_temp_offset = 1.0f;
|
hparams.f_attn_temp_offset = 1.0f;
|
||||||
hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full
|
hparams.set_swa_pattern(4); // pattern: 3 chunked - 1 full
|
||||||
|
|
||||||
|
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
|
||||||
|
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
|
||||||
|
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
switch (hparams.n_expert) {
|
switch (hparams.n_expert) {
|
||||||
|
|
@ -718,6 +722,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||||
if (hparams.n_swa > 0) {
|
if (hparams.n_swa > 0) {
|
||||||
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
|
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
|
||||||
hparams.set_swa_pattern(4);
|
hparams.set_swa_pattern(4);
|
||||||
|
|
||||||
|
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
|
||||||
|
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
|
||||||
|
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
|
||||||
} else {
|
} else {
|
||||||
hparams.swa_type = LLAMA_SWA_TYPE_NONE;
|
hparams.swa_type = LLAMA_SWA_TYPE_NONE;
|
||||||
}
|
}
|
||||||
|
|
@ -1304,7 +1312,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||||
hparams.n_swa = 4096; // default value of gemma 2
|
hparams.n_swa = 4096; // default value of gemma 2
|
||||||
hparams.set_swa_pattern(2);
|
hparams.set_swa_pattern(2);
|
||||||
hparams.attn_soft_cap = true;
|
hparams.attn_soft_cap = true;
|
||||||
|
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
|
||||||
|
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
|
||||||
|
|
||||||
|
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
|
||||||
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
|
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
|
||||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||||
ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false);
|
ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false);
|
||||||
|
|
@ -1517,7 +1528,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||||
{
|
{
|
||||||
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
|
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
|
||||||
hparams.set_swa_pattern(4);
|
hparams.set_swa_pattern(4);
|
||||||
|
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
|
||||||
|
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
|
||||||
|
|
||||||
|
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
|
||||||
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
|
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
|
||||||
ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
|
ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
|
||||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||||
|
|
@ -1556,6 +1570,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||||
if (found_swa && hparams.n_swa > 0) {
|
if (found_swa && hparams.n_swa > 0) {
|
||||||
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
|
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
|
||||||
hparams.set_swa_pattern(4);
|
hparams.set_swa_pattern(4);
|
||||||
|
|
||||||
|
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
|
||||||
|
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
|
||||||
|
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
|
||||||
} else {
|
} else {
|
||||||
hparams.swa_type = LLAMA_SWA_TYPE_NONE;
|
hparams.swa_type = LLAMA_SWA_TYPE_NONE;
|
||||||
}
|
}
|
||||||
|
|
@ -1898,6 +1916,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||||
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
|
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
|
||||||
hparams.n_swa = 4096;
|
hparams.n_swa = 4096;
|
||||||
hparams.set_swa_pattern(4);
|
hparams.set_swa_pattern(4);
|
||||||
|
|
||||||
|
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
|
||||||
|
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
|
||||||
|
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
|
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);
|
||||||
|
|
@ -2200,6 +2222,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||||
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
|
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
|
||||||
hparams.set_swa_pattern(2);
|
hparams.set_swa_pattern(2);
|
||||||
|
|
||||||
|
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
|
||||||
|
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
|
||||||
|
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
|
||||||
|
|
||||||
switch (hparams.n_layer) {
|
switch (hparams.n_layer) {
|
||||||
case 24: type = LLM_TYPE_20B; break;
|
case 24: type = LLM_TYPE_20B; break;
|
||||||
case 36: type = LLM_TYPE_120B; break;
|
case 36: type = LLM_TYPE_120B; break;
|
||||||
|
|
@ -2244,6 +2270,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
|
||||||
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
|
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
|
||||||
hparams.n_swa = 4096;
|
hparams.n_swa = 4096;
|
||||||
hparams.set_swa_pattern(4, true);
|
hparams.set_swa_pattern(4, true);
|
||||||
|
|
||||||
|
hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train;
|
||||||
|
hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
|
||||||
|
ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false);
|
||||||
} else {
|
} else {
|
||||||
hparams.swa_type = LLAMA_SWA_TYPE_NONE;
|
hparams.swa_type = LLAMA_SWA_TYPE_NONE;
|
||||||
hparams.n_no_rope_layer_step = hparams.n_layer;
|
hparams.n_no_rope_layer_step = hparams.n_layer;
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue