model: support GLM-OCR (#19677)

* model: support GLM-OCR

* Update convert_hf_to_gguf.py

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
This commit is contained in:
Xuan-Son Nguyen 2026-02-18 17:51:40 +01:00 committed by GitHub
parent e99f1083a0
commit eeef3cfced
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 122 additions and 43 deletions

View File

@ -4584,7 +4584,7 @@ class Qwen3VLVisionModel(MmprojModel):
yield from super().modify_tensors(data_torch, name, bid) yield from super().modify_tensors(data_torch, name, bid)
@ModelBase.register("Glm4vForConditionalGeneration", "Glm4vMoeForConditionalGeneration") @ModelBase.register("Glm4vForConditionalGeneration", "Glm4vMoeForConditionalGeneration", "GlmOcrForConditionalGeneration")
class Glm4VVisionModel(Qwen3VLVisionModel): class Glm4VVisionModel(Qwen3VLVisionModel):
def set_gguf_parameters(self): def set_gguf_parameters(self):
MmprojModel.set_gguf_parameters(self) # skip Qwen3VLVisionModel parameters MmprojModel.set_gguf_parameters(self) # skip Qwen3VLVisionModel parameters
@ -8776,7 +8776,7 @@ class Glm4Model(TextModel):
n_head = self.hparams["num_attention_heads"] n_head = self.hparams["num_attention_heads"]
n_kv_head = self.hparams["num_key_value_heads"] n_kv_head = self.hparams["num_key_value_heads"]
n_embd = self.hparams["hidden_size"] n_embd = self.hparams["hidden_size"]
head_dim = n_embd // n_head head_dim = self.hparams.get("head_dim", n_embd // n_head)
# because llama.cpp M-RoPE kernel only supports Neox ordering, we have to permute the weights here # because llama.cpp M-RoPE kernel only supports Neox ordering, we have to permute the weights here
if name.endswith(("q_proj.weight", "q_proj.bias")): if name.endswith(("q_proj.weight", "q_proj.bias")):
data_torch = Glm4Model.normal_to_neox(data_torch, n_head, n_head, head_dim, self.partial_rotary_factor) data_torch = Glm4Model.normal_to_neox(data_torch, n_head, n_head, head_dim, self.partial_rotary_factor)
@ -8785,6 +8785,27 @@ class Glm4Model(TextModel):
yield from super().modify_tensors(data_torch, name, bid) yield from super().modify_tensors(data_torch, name, bid)
@ModelBase.register("GlmOcrForConditionalGeneration")
class GlmOCRModel(Glm4Model):
    """Text-model converter for GLM-OCR.

    Note: GLM-OCR is the same as GLM4, but with an extra NextN/MTP
    prediction layer appended after the regular transformer layers,
    so it reuses the GLM4 architecture and only adjusts the layer
    count and the NextN metadata key.
    """

    model_arch = gguf.MODEL_ARCH.GLM4
    # GLM-OCR's text tower does not use M-RoPE, unlike the Glm4v models
    # handled by the parent class hierarchy.
    use_mrope = False
    # Only half of each head's dimensions are rotary (rotary factor 0.5).
    partial_rotary_factor = 0.5

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # GLM-OCR has num_hidden_layers + num_nextn_predict_layers actual
        # layers (the trailing NextN/MTP layer(s) included); rebuild the
        # tensor name map with the corrected block count so the NextN
        # layer tensors resolve.
        self.block_count = self.hparams["num_hidden_layers"] + self.hparams.get("num_nextn_predict_layers", 0)
        self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)

    def set_gguf_parameters(self):
        super().set_gguf_parameters()
        # Record the number of NextN/MTP prediction layers so the loader
        # can skip them in the forward pass (preserved but unused for now).
        if (num_nextn_predict_layers := self.hparams.get("num_nextn_predict_layers")) is not None:
            self.gguf_writer.add_nextn_predict_layers(num_nextn_predict_layers)
@ModelBase.register("Glm4MoeForCausalLM", "Glm4vMoeForConditionalGeneration") @ModelBase.register("Glm4MoeForCausalLM", "Glm4vMoeForConditionalGeneration")
class Glm4MoeModel(TextModel): class Glm4MoeModel(TextModel):
model_arch = gguf.MODEL_ARCH.GLM4_MOE model_arch = gguf.MODEL_ARCH.GLM4_MOE

View File

@ -2660,6 +2660,13 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.FFN_UP, MODEL_TENSOR.FFN_UP,
MODEL_TENSOR.ATTN_POST_NORM, MODEL_TENSOR.ATTN_POST_NORM,
MODEL_TENSOR.FFN_POST_NORM, MODEL_TENSOR.FFN_POST_NORM,
# NextN/MTP tensors - preserved but unused
MODEL_TENSOR.NEXTN_EH_PROJ,
MODEL_TENSOR.NEXTN_EMBED_TOKENS,
MODEL_TENSOR.NEXTN_ENORM,
MODEL_TENSOR.NEXTN_HNORM,
MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD,
MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM,
], ],
MODEL_ARCH.GLM4_MOE: [ MODEL_ARCH.GLM4_MOE: [
MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.TOKEN_EMBD,

View File

@ -1404,6 +1404,7 @@ class TensorNameMap:
MODEL_TENSOR.V_ENC_ATTN_Q_NORM: ( MODEL_TENSOR.V_ENC_ATTN_Q_NORM: (
"vision_tower.vision_model.encoder.layers.{bid}.attn.q_norm", # InternVL "vision_tower.vision_model.encoder.layers.{bid}.attn.q_norm", # InternVL
"model.vision_tower.encoder.layer.{bid}.attention.q_norm", # Intern-S1 "model.vision_tower.encoder.layer.{bid}.attention.q_norm", # Intern-S1
"visual.blocks.{bid}.attn.q_norm", # GLM-OCR
), ),
MODEL_TENSOR.V_ENC_ATTN_K: ( MODEL_TENSOR.V_ENC_ATTN_K: (
@ -1422,6 +1423,7 @@ class TensorNameMap:
MODEL_TENSOR.V_ENC_ATTN_K_NORM: ( MODEL_TENSOR.V_ENC_ATTN_K_NORM: (
"vision_tower.vision_model.encoder.layers.{bid}.attn.k_norm", # InternVL "vision_tower.vision_model.encoder.layers.{bid}.attn.k_norm", # InternVL
"model.vision_tower.encoder.layer.{bid}.attention.k_norm", # Intern-S1 "model.vision_tower.encoder.layer.{bid}.attention.k_norm", # Intern-S1
"visual.blocks.{bid}.attn.k_norm", # GLM-OCR
), ),
MODEL_TENSOR.V_ENC_ATTN_V: ( MODEL_TENSOR.V_ENC_ATTN_V: (

View File

@ -1633,6 +1633,12 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
LLM_TENSOR_FFN_DOWN, LLM_TENSOR_FFN_DOWN,
LLM_TENSOR_ATTN_POST_NORM, LLM_TENSOR_ATTN_POST_NORM,
LLM_TENSOR_FFN_POST_NORM, LLM_TENSOR_FFN_POST_NORM,
LLM_TENSOR_NEXTN_EH_PROJ,
LLM_TENSOR_NEXTN_EMBED_TOKENS,
LLM_TENSOR_NEXTN_ENORM,
LLM_TENSOR_NEXTN_HNORM,
LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD,
LLM_TENSOR_NEXTN_SHARED_HEAD_NORM,
}; };
case LLM_ARCH_GLM4_MOE: case LLM_ARCH_GLM4_MOE:
return { return {

View File

@ -1784,7 +1784,15 @@ void llama_model::load_hparams(llama_model_loader & ml) {
{ {
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false); ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false);
// NextN/MTP parameters (GLM-OCR)
ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);
// TODO: when MTP is implemented, this should probably be updated if needed
hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
switch (hparams.n_layer) { switch (hparams.n_layer) {
case 17: type = LLM_TYPE_1B; break; // GLM-OCR
case 40: type = LLM_TYPE_9B; break; case 40: type = LLM_TYPE_9B; break;
case 61: type = LLM_TYPE_32B; break; case 61: type = LLM_TYPE_32B; break;
default: type = LLM_TYPE_UNKNOWN; default: type = LLM_TYPE_UNKNOWN;
@ -5410,30 +5418,48 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
} }
for (int i = 0; i < n_layer; ++i) { for (int i = 0; i < n_layer; ++i) {
auto & layer = layers[i]; int flags = 0;
if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); // skip all tensors in the NextN layers
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED); flags |= TENSOR_SKIP;
layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED);
if (layer.wqkv == nullptr) {
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
} }
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0); auto & layer = layers[i];
layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0); layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, flags);
layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, flags | TENSOR_NOT_REQUIRED);
layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, flags | TENSOR_NOT_REQUIRED);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); if (layer.wqkv == nullptr) {
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, flags);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff * 2}, 0); layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, flags);
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, flags);
layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, flags | TENSOR_NOT_REQUIRED);
layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, flags | TENSOR_NOT_REQUIRED);
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, flags | TENSOR_NOT_REQUIRED);
}
layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0); layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, flags);
layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, flags);
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, flags);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, flags);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff * 2}, flags);
layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, flags);
// NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers
if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) {
layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags);
layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags);
layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags);
// Optional tensors
layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED);
layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED);
layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, flags | TENSOR_NOT_REQUIRED);
}
} }
} break; } break;
case LLM_ARCH_GLM4_MOE: case LLM_ARCH_GLM4_MOE:

View File

@ -29,7 +29,10 @@ llm_build_glm4::llm_build_glm4(const llama_model & model, const llm_graph_params
ggml_tensor * inp_out_ids = build_inp_out_ids(); ggml_tensor * inp_out_ids = build_inp_out_ids();
for (int il = 0; il < n_layer; ++il) { // Only process up to last layer (skip final NextN layer)
// Final layer tensors are loaded but not processed in forward pass
const int n_transformer_layers = n_layer - hparams.nextn_predict_layers;
for (int il = 0; il < n_transformer_layers; ++il) {
ggml_tensor * inpSA = inpL; ggml_tensor * inpSA = inpL;
// Pre-attention norm // Pre-attention norm
@ -100,7 +103,7 @@ llm_build_glm4::llm_build_glm4(const llama_model & model, const llm_graph_params
model.layers[il].wo, NULL, model.layers[il].wo, NULL,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il); Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
} }
if (il == n_layer - 1 && inp_out_ids) { if (il == n_transformer_layers - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids); cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
} }
@ -130,9 +133,13 @@ llm_build_glm4::llm_build_glm4(const llama_model & model, const llm_graph_params
cur = build_norm(cur, model.layers[il].ffn_post_norm, NULL, LLM_NORM_RMS, il); cur = build_norm(cur, model.layers[il].ffn_post_norm, NULL, LLM_NORM_RMS, il);
cb(cur, "post_mlp_norm", il); cb(cur, "post_mlp_norm", il);
} }
// Add residual connection after post-MLP norm cur = ggml_add(ctx0, cur, ffn_inp);
inpL = ggml_add(ctx0, cur, ffn_inp);
cb(inpL, "l_out", il); cur = build_cvec(cur, il);
cb(cur, "l_out", il);
// input for next layer
inpL = cur;
} }
// Final norm // Final norm
cur = build_norm(inpL, model.output_norm, NULL, LLM_NORM_RMS, -1); cur = build_norm(inpL, model.output_norm, NULL, LLM_NORM_RMS, -1);

View File

@ -342,9 +342,17 @@ ggml_tensor * clip_graph::build_vit(
/* nb2 */ cur->nb[1], /* nb2 */ cur->nb[1],
/* offset */ ggml_row_size(cur->type, 2 * n_embd)); /* offset */ ggml_row_size(cur->type, 2 * n_embd));
// TODO: q/k norm requires row size == n_embd, while here it's d_head if (layer.q_norm) {
// we can add support in the future if needed GGML_ASSERT(layer.q_norm->ne[0] == Qcur->ne[0]);
GGML_ASSERT(layer.q_norm == nullptr && layer.k_norm == nullptr); Qcur = build_norm(Qcur, layer.q_norm, NULL, norm_t, eps, il);
cb(Qcur, "Qcur_norm", il);
}
if (layer.k_norm) {
GGML_ASSERT(layer.k_norm->ne[0] == Kcur->ne[0]);
Kcur = build_norm(Kcur, layer.k_norm, NULL, norm_t, eps, il);
cb(Kcur, "Kcur_norm", il);
}
} else { } else {
// separate q, k, v // separate q, k, v

View File

@ -2,7 +2,6 @@
ggml_cgraph * clip_graph_glm4v::build() { ggml_cgraph * clip_graph_glm4v::build() {
GGML_ASSERT(model.patch_bias != nullptr); GGML_ASSERT(model.patch_bias != nullptr);
GGML_ASSERT(model.position_embeddings != nullptr);
GGML_ASSERT(model.class_embedding == nullptr); GGML_ASSERT(model.class_embedding == nullptr);
const int batch_size = 1; const int batch_size = 1;
@ -45,19 +44,22 @@ ggml_cgraph * clip_graph_glm4v::build() {
// pos-conv norm // pos-conv norm
inp = build_norm(inp, model.norm_embd_w, model.norm_embd_b, norm_t, eps, -1); inp = build_norm(inp, model.norm_embd_w, model.norm_embd_b, norm_t, eps, -1);
// calculate absolute position embedding and apply ggml_tensor * learned_pos_embd = nullptr;
ggml_tensor * learned_pos_embd = resize_position_embeddings(GGML_SCALE_MODE_BICUBIC); // Note: GLM-OCR does not have learned position embeddings
learned_pos_embd = ggml_cont_4d( if (model.position_embeddings != nullptr) {
ctx0, learned_pos_embd, learned_pos_embd = resize_position_embeddings(GGML_SCALE_MODE_BICUBIC);
n_embd * 2, n_patches_x / 2, n_patches_y, batch_size); learned_pos_embd = ggml_cont_4d(
learned_pos_embd = ggml_reshape_4d( ctx0, learned_pos_embd,
ctx0, learned_pos_embd, n_embd * 2, n_patches_x / 2, n_patches_y, batch_size);
n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2)); learned_pos_embd = ggml_reshape_4d(
learned_pos_embd = ggml_permute(ctx0, learned_pos_embd, 0, 2, 1, 3); ctx0, learned_pos_embd,
learned_pos_embd = ggml_cont_3d( n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2));
ctx0, learned_pos_embd, learned_pos_embd = ggml_permute(ctx0, learned_pos_embd, 0, 2, 1, 3);
n_embd, n_patches_x * n_patches_y, batch_size); learned_pos_embd = ggml_cont_3d(
cb(learned_pos_embd, "learned_pos_embd", -1); ctx0, learned_pos_embd,
n_embd, n_patches_x * n_patches_y, batch_size);
cb(learned_pos_embd, "learned_pos_embd", -1);
}
auto add_pos = [&](ggml_tensor * cur, const clip_layer &) { auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
return ggml_rope_multi( return ggml_rope_multi(