working through the previous attempt: implemented a more accurate conversion and added local sliding-window attention that alternates every third layer

ryan-mangeno 2025-09-03 14:32:39 -04:00
parent ca353d37b4
commit 6d86944cb4
4 changed files with 102 additions and 126 deletions


@@ -8308,37 +8308,32 @@ class SmallThinkerModel(TextModel):
raise ValueError(f"Unprocessed experts: {experts}")
@ModelBase.register("ModernBertModel")
class ModernBertModel(TextModel):
@ModelBase.register("ModernBertModel", "ModernBertForMaskedLM", "ModernBertForSequenceClassification")
class ModernBertModel(BertModel):
model_arch = gguf.MODEL_ARCH.MODERN_BERT
def set_gguf_parameters(self) -> None:
# Determine block count (number of hidden layers)
block_count = self.hparams.get("num_hidden_layers") or self.hparams.get("num_hidden_layers_alt")
if block_count is None:
raise ValueError("Could not determine number of hidden layers from hparams")
def set_vocab(self):
self._set_vocab_gpt2()
self.gguf_writer.add_add_bos_token(True)
self.gguf_writer.add_add_eos_token(True)
# Attention heads and dimensions
n_head = self.hparams.get("num_attention_heads")
if n_head is None:
raise ValueError("Missing 'num_attention_heads' in hparams")
hidden_size = self.hparams["hidden_size"]
head_dim = hidden_size // n_head
ffn_dim = self.hparams.get("intermediate_size", 4 * hidden_size)
# GGUF parameter assignment
self.gguf_writer.add_context_length(self.hparams.get("max_position_embeddings", 512))
self.gguf_writer.add_embedding_length(hidden_size)
self.gguf_writer.add_feed_forward_length(ffn_dim)
self.gguf_writer.add_block_count(block_count)
self.gguf_writer.add_head_count(n_head)
self.gguf_writer.add_layer_norm_eps(self.hparams.get("layer_norm_eps", 1e-12))
self.gguf_writer.add_file_type(self.ftype)
def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_sliding_window(self.hparams["local_attention"])
self.gguf_writer.add_rope_freq_base(self.hparams["global_rope_theta"])
self.gguf_writer.add_rope_freq_base_swa(self.hparams["local_rope_theta"])
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
# Directly map tensor names without QKV splitting or reordering
return [(self.map_tensor_name(name), data_torch)]
# These layers act as MLM head, so we don't need them
if name.startswith("decoder."):
return []
if name.startswith("model."):
name = name[6:]
return super().modify_tensors(data_torch, name, bid)
###### CONVERSION LOGIC ######
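For context, here is a minimal sketch of the hparams the new set_gguf_parameters() reads on top of what BertModel already handles. The field names follow the Hugging Face ModernBERT config; the numeric values are placeholders rather than values from any particular checkpoint:

# hypothetical ModernBERT-style config.json contents (placeholder values)
hparams = {
    "num_hidden_layers": 12,
    "hidden_size": 384,
    "num_attention_heads": 6,
    "max_position_embeddings": 8192,
    "vocab_size": 50368,
    "local_attention": 128,         # -> gguf_writer.add_sliding_window(...)
    "global_rope_theta": 160000.0,  # -> gguf_writer.add_rope_freq_base(...)
    "local_rope_theta": 10000.0,    # -> gguf_writer.add_rope_freq_base_swa(...)
    # "global_attn_every_n_layers": 3 in the upstream config corresponds to the
    # every-third-layer pattern hard-coded via set_swa_pattern(3, ...) below
}

# the SWA-specific keys are the ones the Bert base class does not cover,
# which is why the subclass adds them after calling super()
for key in ("local_attention", "global_rope_theta", "local_rope_theta"):
    print(key, "=", hparams[key])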


@@ -19,6 +19,7 @@ enum llama_swa_type {
LLAMA_SWA_TYPE_NONE = 0,
LLAMA_SWA_TYPE_STANDARD = 1,
LLAMA_SWA_TYPE_CHUNKED = 2,
LLAMA_SWA_TYPE_LOCAL = 3,
};
struct llama_hparams_posnet {


@@ -1807,6 +1807,18 @@ bool llama_kv_cache_unified::is_masked_swa(llama_pos p0, llama_pos p1) const {
return true;
}
} break;
case LLAMA_SWA_TYPE_LOCAL:
{
const int32_t half_n_swa = (int32_t) n_swa / 2;
const int32_t pos_diff = p1 - p0;
// mask if outside the window
if (pos_diff < -half_n_swa || pos_diff > half_n_swa) {
return true;
}
} break;
}
return false;
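To make the LLAMA_SWA_TYPE_LOCAL case concrete: a position pair is kept only if it lies within a symmetric window of n_swa / 2 tokens on either side, i.e. the mask is non-causal. A small Python sketch of the same test, using the n_swa of 128 that load_hparams sets below:

def is_masked_local_swa(p0: int, p1: int, n_swa: int) -> bool:
    # mirror of the C++ check: mask when the pair falls outside the symmetric window
    half_n_swa = n_swa // 2
    pos_diff = p1 - p0
    return pos_diff < -half_n_swa or pos_diff > half_n_swa

n_swa = 128
assert not is_masked_local_swa(100, 160, n_swa)  # 60 <= 64 tokens apart -> attended
assert is_masked_local_swa(100, 165, n_swa)      # 65 > 64 tokens apart -> masked
assert not is_masked_local_swa(200, 140, n_swa)  # window is symmetric, not causal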


@@ -759,11 +759,20 @@ void llama_model::load_hparams(llama_model_loader & ml) {
} break;
case LLM_ARCH_MODERN_BERT:
{
//ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
LLAMA_LOG_INFO("Switching Modern Bert Arch\n");
hparams.swa_type = LLAMA_SWA_TYPE_LOCAL;
hparams.set_swa_pattern(3, 0);
hparams.n_swa = 128;
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
switch (hparams.n_layer) {
case 12:
type = LLM_TYPE_47M; break; // granite-embeddings-mall
type = LLM_TYPE_47M; break; // granite-embeddings-small
default: type = LLM_TYPE_UNKNOWN;
}
} break;
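The commit message describes this as sliding-window attention that alternates every third layer, configured by hparams.set_swa_pattern(3, 0) above. A rough Python sketch of that layout follows; which layer inside each group of three gets full (global) attention depends on llama.cpp's pattern convention, so the phase used here (full attention when il % 3 == 2) is an assumption for illustration only:

def swa_layout(n_layer: int, pattern: int = 3) -> list[str]:
    # assumed convention: the last layer in every group of `pattern` layers uses
    # full attention, the other two use the local sliding window
    return ["full" if il % pattern == pattern - 1 else "swa" for il in range(n_layer)]

print(swa_layout(12))
# ['swa', 'swa', 'full', 'swa', 'swa', 'full', 'swa', 'swa', 'full', 'swa', 'swa', 'full']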
@@ -7544,152 +7553,111 @@ struct llm_build_bert : public llm_graph_context {
struct llm_build_modern_bert : public llm_graph_context {
llm_build_modern_bert(const llama_model & model, const llm_graph_params & params)
: llm_graph_context(params) {
const int64_t n_embd = hparams.n_embd;
const int64_t n_layer = hparams.n_layer;
const int64_t n_head = hparams.n_head();
const int64_t n_head_kv = hparams.n_head_kv();
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
const int64_t n_tokens = ubatch.n_tokens;
const int64_t n_embd = hparams.n_embd;
const int64_t n_layer = hparams.n_layer;
const int64_t n_head = hparams.n_head();
const int64_t n_head_kv = hparams.n_head_kv();
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
const int64_t n_tokens = ubatch.n_tokens;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
// RoPE params
const int32_t rope_type = LLAMA_ROPE_TYPE_NEOX; // uses rotary
const int32_t n_rot = hparams.n_rot;
const int32_t n_ctx_orig = hparams.n_ctx_train;
// rope params
const int32_t rope_type = LLAMA_ROPE_TYPE_NEOX;
const int32_t n_rot = hparams.n_rot;
const int32_t n_ctx_orig = hparams.n_ctx_train;
const float freq_base = hparams.rope_freq_base_train;
const float freq_scale = hparams.rope_freq_scale_train;
const float attn_factor = 1.0f;
const float ext_factor = 1.0f;
const float beta_fast = 0.0f;
const float beta_slow = 0.0f;
ggml_tensor * cur;
ggml_tensor * inpL;
ggml_tensor * inp_pos = nullptr;
// needs positions for RoPE
inp_pos = build_inp_pos();
// embeddings (token + optional type), NO absolute pos embed
inpL = build_inp_embd(model.tok_embd);
ggml_tensor * inp_pos = build_inp_pos();
ggml_tensor * inpL = build_inp_embd(model.tok_embd);
if (model.type_embd) {
ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0);
inpL = ggml_add(ctx0, inpL, type_row0);
inpL = ggml_add(ctx0, inpL, ggml_view_1d(ctx0, model.type_embd, n_embd, 0));
}
cb(inpL, "inp_embd", -1);
// embeddings LayerNorm (embeddings.norm)
inpL = build_norm(inpL, model.tok_norm, model.tok_norm_b, LLM_NORM, -1);
cb(inpL, "inp_norm", -1);
auto * inp_attn = build_attn_inp_no_cache();
auto * inp_attn = build_attn_inp_no_cache();
ggml_tensor * inp_out_ids = build_inp_out_ids();
for (int il = 0; il < n_layer; ++il) {
ggml_tensor * x = inpL;
// pre attention norm (attn_norm). Layer 0 may be Identity() -> nullptr
// Pre attention Layer norm
ggml_tensor * x_attn_in = x;
if (model.layers[il].attn_norm) {
x_attn_in = build_norm(x,
model.layers[il].attn_norm,
model.layers[il].attn_norm_b,
LLM_NORM, il);
cb(x_attn_in, "attn_pre_norm", il);
} else {
cb(x_attn_in, "attn_pre_norm_identity", il);
x_attn_in = build_norm(x, model.layers[il].attn_norm, model.layers[il].attn_norm_b, LLM_NORM, il);
}
// Attention: fused Wqkv -> split -> heads -> RoPE(Q,K) -> attn -> Wo
ggml_tensor * qkv = nullptr;
ggml_tensor * Qcur;
ggml_tensor * Kcur;
ggml_tensor * Vcur;
GGML_ASSERT(model.layers[il].wqkv); // fused QKV
qkv = build_lora_mm(model.layers[il].wqkv, x_attn_in);
cb(qkv, "wqkv", il);
// fused qkv
GGML_ASSERT(model.layers[il].wqkv);
ggml_tensor * qkv = build_lora_mm(model.layers[il].wqkv, x_attn_in);
if (model.layers[il].bqkv) {
qkv = ggml_add(ctx0, qkv, model.layers[il].bqkv);
cb(qkv, "bqkv", il);
}
// Fused layout: [ (n_embd + 2*n_embd_gqa), n_tokens ]
Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, qkv, n_embd, n_tokens, qkv->nb[1], 0*sizeof(float)*(n_embd)));
Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, qkv, n_embd_gqa, n_tokens, qkv->nb[1], 1*sizeof(float)*(n_embd)));
Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, qkv, n_embd_gqa, n_tokens, qkv->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, qkv, n_embd, n_tokens, qkv->nb[1], 0));
ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, qkv, n_embd_gqa, n_tokens, qkv->nb[1], n_embd));
ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, qkv, n_embd_gqa, n_tokens, qkv->nb[1], n_embd + n_embd_gqa));
// optional per Q/K
if (model.layers[il].attn_q_norm) {
Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, model.layers[il].attn_q_norm_b, LLM_NORM, il);
}
if (model.layers[il].attn_k_norm) {
Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, model.layers[il].attn_k_norm_b, LLM_NORM, il);
}
// optional q/k LayerNorm
if (model.layers[il].attn_q_norm) Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, model.layers[il].attn_q_norm_b, LLM_NORM, il);
if (model.layers[il].attn_k_norm) Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, model.layers[il].attn_k_norm_b, LLM_NORM, il);
// heads
// reshape for multi head
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
// RoPE (NEOX ... maybe?) on Q and K
// rope embedding
Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow);
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow);
Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr,
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow);
cb(Qcur, "Qcur_rope", il);
cb(Kcur, "Kcur_rope", il);
cb(Vcur, "Vcur", il);
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow);
ggml_tensor * attn_out = build_attn(
inp_attn,
model.layers[il].wo, model.layers[il].bo, // Wo, optional bias
model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur,
/*K_cache*/ nullptr,
/*V_cache*/ nullptr,
/*k cache*/ nullptr,
/*v cache*/ nullptr,
1.0f / sqrtf(float(n_embd_head)),
il);
cb(attn_out, "attn_out", il);
il
);
// residual after attention
ggml_tensor * cur_attn = ggml_add(ctx0, attn_out, x);
// if we subselect outputs, do it at the last layer after attn resid
// optional subselect output tokens (inp_out_ids)
if (il == n_layer - 1 && inp_out_ids) {
cur_attn = ggml_get_rows(ctx0, cur_attn, inp_out_ids);
x = ggml_get_rows(ctx0, x, inp_out_ids);
cur_attn = ggml_get_rows(ctx0, cur_attn, inp_out_ids);
inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
}
// pre mlp norm
ggml_tensor * h = build_norm(cur_attn,
model.layers[il].ffn_norm,
model.layers[il].ffn_norm_b,
LLM_NORM, il);
cb(h, "mlp_pre_norm", il);
// pre mlp LayerNorm
ggml_tensor * h = build_norm(cur_attn, model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, LLM_NORM, il);
// GEGLU because we will split ffn_up which has shape [n_embd, n_ff * 2] and ffn_down has shape [n_ff, n_embd]
// geglu FFN
ggml_tensor * mlp_out = build_ffn(
h,
model.layers[il].ffn_up, /*up_b*/ NULL, /*up_shexp*/ NULL,
/*gate*/ NULL , /*gate_b*/ NULL, /*gate_shexp*/ NULL,
model.layers[il].ffn_down, /*down_b*/ NULL, /*down_shexp*/ NULL,
/*act_scales*/ NULL,
model.layers[il].ffn_up, NULL, NULL,
NULL, NULL, NULL,
model.layers[il].ffn_down, NULL, NULL,
NULL,
LLM_FFN_GEGLU, LLM_FFN_PAR, il
);
cb(mlp_out, "ffn_out_geglu", il);
// Residual after MLP
ggml_tensor * cur_layer = ggml_add(ctx0, mlp_out, cur_attn);
// feed into next layer
inpL = cur_layer;
// resid addition
inpL = ggml_add(ctx0, mlp_out, cur_attn);
}
// final model norm (final_norm)
cur = build_norm(inpL, model.output_norm, model.output_norm_b, LLM_NORM, -1);
cb(cur, "final_norm", -1);
ggml_tensor * cur = build_norm(inpL, model.output_norm, model.output_norm_b, LLM_NORM, -1);
res->t_embd = cur;
ggml_build_forward_expand(gf, cur);
}
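As a plain-math companion to the graph code above, the NumPy sketch below shows the two splits the builder relies on: slicing the fused Wqkv output into Q, K, V at feature offsets 0, n_embd, and n_embd + n_embd_gqa, and the GEGLU feed-forward where ffn_up produces 2*n_ff features that are split into a GELU-gated half and a linear half before ffn_down. Shapes and names are illustrative only, not the ggml API, and which half acts as the gate varies between implementations; the split itself is the point.

import numpy as np

def gelu(x):
    # tanh approximation of GELU
    return 0.5 * x * (1.0 + np.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * x**3)))

# illustrative sizes, not taken from the model
n_tokens, n_embd, n_embd_gqa, n_ff = 4, 8, 8, 16

x = np.random.randn(n_tokens, n_embd)

# fused QKV: one matmul, then slice the feature axis at n_embd and n_embd + n_embd_gqa
w_qkv = np.random.randn(n_embd, n_embd + 2 * n_embd_gqa)
qkv = x @ w_qkv
q = qkv[:, :n_embd]
k = qkv[:, n_embd:n_embd + n_embd_gqa]
v = qkv[:, n_embd + n_embd_gqa:]

# GEGLU FFN: ffn_up yields 2*n_ff features; the GELU'd half gates the linear half
w_up   = np.random.randn(n_embd, 2 * n_ff)
w_down = np.random.randn(n_ff, n_embd)
up = x @ w_up
gate, lin = up[:, :n_ff], up[:, n_ff:]
ffn_out = (gelu(gate) * lin) @ w_down

print(q.shape, k.shape, v.shape, ffn_out.shape)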