diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 0e5d0f8589..7446e0df13 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -4584,7 +4584,7 @@ class Qwen3VLVisionModel(MmprojModel): yield from super().modify_tensors(data_torch, name, bid) -@ModelBase.register("Glm4vForConditionalGeneration", "Glm4vMoeForConditionalGeneration") +@ModelBase.register("Glm4vForConditionalGeneration", "Glm4vMoeForConditionalGeneration", "GlmOcrForConditionalGeneration") class Glm4VVisionModel(Qwen3VLVisionModel): def set_gguf_parameters(self): MmprojModel.set_gguf_parameters(self) # skip Qwen3VLVisionModel parameters @@ -8776,7 +8776,7 @@ class Glm4Model(TextModel): n_head = self.hparams["num_attention_heads"] n_kv_head = self.hparams["num_key_value_heads"] n_embd = self.hparams["hidden_size"] - head_dim = n_embd // n_head + head_dim = self.hparams.get("head_dim", n_embd // n_head) # because llama.cpp M-RoPE kernel only supports Neox ordering, we have to permute the weights here if name.endswith(("q_proj.weight", "q_proj.bias")): data_torch = Glm4Model.normal_to_neox(data_torch, n_head, n_head, head_dim, self.partial_rotary_factor) @@ -8785,6 +8785,27 @@ class Glm4Model(TextModel): yield from super().modify_tensors(data_torch, name, bid) +@ModelBase.register("GlmOcrForConditionalGeneration") +class GlmOCRModel(Glm4Model): + model_arch = gguf.MODEL_ARCH.GLM4 + use_mrope = False + partial_rotary_factor = 0.5 + + # Note: GLM-OCR is the same as GLM4, but with an extra NextN/MTP prediction layer + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # GLM-OCR has num_hidden_layers + 1 actual layers (including NextN layer) + self.block_count = self.hparams["num_hidden_layers"] + self.hparams.get("num_nextn_predict_layers", 0) + self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + # NextN/MTP prediction layers + if (num_nextn_predict_layers := 
self.hparams.get("num_nextn_predict_layers")) is not None: + self.gguf_writer.add_nextn_predict_layers(num_nextn_predict_layers) + + @ModelBase.register("Glm4MoeForCausalLM", "Glm4vMoeForConditionalGeneration") class Glm4MoeModel(TextModel): model_arch = gguf.MODEL_ARCH.GLM4_MOE diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index ea0fde920e..727e4dd96e 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -2660,6 +2660,13 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.FFN_UP, MODEL_TENSOR.ATTN_POST_NORM, MODEL_TENSOR.FFN_POST_NORM, + # NextN/MTP tensors - preserved but unused + MODEL_TENSOR.NEXTN_EH_PROJ, + MODEL_TENSOR.NEXTN_EMBED_TOKENS, + MODEL_TENSOR.NEXTN_ENORM, + MODEL_TENSOR.NEXTN_HNORM, + MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD, + MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM, ], MODEL_ARCH.GLM4_MOE: [ MODEL_TENSOR.TOKEN_EMBD, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index b5c6e4bc58..228ba70aa3 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -1404,6 +1404,7 @@ class TensorNameMap: MODEL_TENSOR.V_ENC_ATTN_Q_NORM: ( "vision_tower.vision_model.encoder.layers.{bid}.attn.q_norm", # InternVL "model.vision_tower.encoder.layer.{bid}.attention.q_norm", # Intern-S1 + "visual.blocks.{bid}.attn.q_norm", # GLM-OCR ), MODEL_TENSOR.V_ENC_ATTN_K: ( @@ -1422,6 +1423,7 @@ class TensorNameMap: MODEL_TENSOR.V_ENC_ATTN_K_NORM: ( "vision_tower.vision_model.encoder.layers.{bid}.attn.k_norm", # InternVL "model.vision_tower.encoder.layer.{bid}.attention.k_norm", # Intern-S1 + "visual.blocks.{bid}.attn.k_norm", # GLM-OCR ), MODEL_TENSOR.V_ENC_ATTN_V: ( diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 416c17463e..7e4da4e78c 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -1633,6 +1633,12 @@ static std::set llm_get_tensor_names(llm_arch arch) { LLM_TENSOR_FFN_DOWN, LLM_TENSOR_ATTN_POST_NORM, LLM_TENSOR_FFN_POST_NORM, + 
LLM_TENSOR_NEXTN_EH_PROJ, + LLM_TENSOR_NEXTN_EMBED_TOKENS, + LLM_TENSOR_NEXTN_ENORM, + LLM_TENSOR_NEXTN_HNORM, + LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, + LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, }; case LLM_ARCH_GLM4_MOE: return { diff --git a/src/llama-model.cpp b/src/llama-model.cpp index c26584aa67..279a4d5ced 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1784,7 +1784,15 @@ void llama_model::load_hparams(llama_model_loader & ml) { { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); ml.get_key_or_arr(LLM_KV_ROPE_DIMENSION_SECTIONS, hparams.rope_sections, 4, false); + + // NextN/MTP parameters (GLM-OCR) + ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false); + + // TODO: when MTP is implemented, this should probably be updated if needed + hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers; + switch (hparams.n_layer) { + case 17: type = LLM_TYPE_1B; break; // GLM-OCR case 40: type = LLM_TYPE_9B; break; case 61: type = LLM_TYPE_32B; break; default: type = LLM_TYPE_UNKNOWN; @@ -5410,30 +5418,48 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } for (int i = 0; i < n_layer; ++i) { - auto & layer = layers[i]; - - layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); - layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED); - layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, TENSOR_NOT_REQUIRED); - - if (layer.wqkv == nullptr) { - layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0); - layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0); - layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0); - layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED); - layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, 
"bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED); - layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED); + int flags = 0; + if (hparams.nextn_predict_layers > 0 && static_cast(i) >= n_layer - hparams.nextn_predict_layers) { + // skip all tensors in the NextN layers + flags |= TENSOR_SKIP; } - layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0); + auto & layer = layers[i]; - layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0); + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, flags); + layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, flags | TENSOR_NOT_REQUIRED); + layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, flags | TENSOR_NOT_REQUIRED); - layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); - layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); - layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff * 2}, 0); + if (layer.wqkv == nullptr) { + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, flags); + layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, flags); + layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, flags); + layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, flags | TENSOR_NOT_REQUIRED); + layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, flags | TENSOR_NOT_REQUIRED); + layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, flags | TENSOR_NOT_REQUIRED); + } - layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, flags); + + 
layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, flags); + + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, flags); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, flags); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff * 2}, flags); + + layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, flags); + + // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers + if (hparams.nextn_predict_layers > 0 && static_cast<uint32_t>(i) >= n_layer - hparams.nextn_predict_layers) { + layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags); + layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags); + layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags); + + // Optional tensors + layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED); + layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, flags | TENSOR_NOT_REQUIRED); + layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, flags | TENSOR_NOT_REQUIRED); + } } } break; case LLM_ARCH_GLM4_MOE: diff --git a/src/models/glm4.cpp b/src/models/glm4.cpp index 204aa3932a..bcd837b30d 100644 --- a/src/models/glm4.cpp +++ b/src/models/glm4.cpp @@ -29,7 +29,10 @@ llm_build_glm4::llm_build_glm4(const llama_model & model, const llm_graph_params ggml_tensor * inp_out_ids = build_inp_out_ids(); - for (int il = 0; il < n_layer; ++il) { + // Only process up to last layer (skip final NextN layer) + // Final layer tensors are loaded but not processed in forward pass + const int n_transformer_layers = n_layer - 
hparams.nextn_predict_layers; + for (int il = 0; il < n_transformer_layers; ++il) { ggml_tensor * inpSA = inpL; // Pre-attention norm @@ -100,7 +103,7 @@ llm_build_glm4::llm_build_glm4(const llama_model & model, const llm_graph_params model.layers[il].wo, NULL, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il); } - if (il == n_layer - 1 && inp_out_ids) { + if (il == n_transformer_layers - 1 && inp_out_ids) { cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } @@ -130,9 +133,13 @@ llm_build_glm4::llm_build_glm4(const llama_model & model, const llm_graph_params cur = build_norm(cur, model.layers[il].ffn_post_norm, NULL, LLM_NORM_RMS, il); cb(cur, "post_mlp_norm", il); } - // Add residual connection after post-MLP norm - inpL = ggml_add(ctx0, cur, ffn_inp); - cb(inpL, "l_out", il); + cur = ggml_add(ctx0, cur, ffn_inp); + + cur = build_cvec(cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; } // Final norm cur = build_norm(inpL, model.output_norm, NULL, LLM_NORM_RMS, -1); diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 597289b7b6..0796644f08 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -342,9 +342,17 @@ ggml_tensor * clip_graph::build_vit( /* nb2 */ cur->nb[1], /* offset */ ggml_row_size(cur->type, 2 * n_embd)); - // TODO: q/k norm requires row size == n_embd, while here it's d_head - // we can add support in the future if needed - GGML_ASSERT(layer.q_norm == nullptr && layer.k_norm == nullptr); + if (layer.q_norm) { + GGML_ASSERT(layer.q_norm->ne[0] == Qcur->ne[0]); + Qcur = build_norm(Qcur, layer.q_norm, NULL, norm_t, eps, il); + cb(Qcur, "Qcur_norm", il); + } + + if (layer.k_norm) { + GGML_ASSERT(layer.k_norm->ne[0] == Kcur->ne[0]); + Kcur = build_norm(Kcur, layer.k_norm, NULL, norm_t, eps, il); + cb(Kcur, "Kcur_norm", il); + } } else { // separate q, k, v diff --git a/tools/mtmd/models/glm4v.cpp b/tools/mtmd/models/glm4v.cpp index 
f39b6922eb..6f52df41ab 100644 --- a/tools/mtmd/models/glm4v.cpp +++ b/tools/mtmd/models/glm4v.cpp @@ -2,7 +2,6 @@ ggml_cgraph * clip_graph_glm4v::build() { GGML_ASSERT(model.patch_bias != nullptr); - GGML_ASSERT(model.position_embeddings != nullptr); GGML_ASSERT(model.class_embedding == nullptr); const int batch_size = 1; @@ -45,19 +44,22 @@ ggml_cgraph * clip_graph_glm4v::build() { // pos-conv norm inp = build_norm(inp, model.norm_embd_w, model.norm_embd_b, norm_t, eps, -1); - // calculate absolute position embedding and apply - ggml_tensor * learned_pos_embd = resize_position_embeddings(GGML_SCALE_MODE_BICUBIC); - learned_pos_embd = ggml_cont_4d( - ctx0, learned_pos_embd, - n_embd * 2, n_patches_x / 2, n_patches_y, batch_size); - learned_pos_embd = ggml_reshape_4d( - ctx0, learned_pos_embd, - n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2)); - learned_pos_embd = ggml_permute(ctx0, learned_pos_embd, 0, 2, 1, 3); - learned_pos_embd = ggml_cont_3d( - ctx0, learned_pos_embd, - n_embd, n_patches_x * n_patches_y, batch_size); - cb(learned_pos_embd, "learned_pos_embd", -1); + ggml_tensor * learned_pos_embd = nullptr; + // Note: GLM-OCR does not have learned position embeddings + if (model.position_embeddings != nullptr) { + learned_pos_embd = resize_position_embeddings(GGML_SCALE_MODE_BICUBIC); + learned_pos_embd = ggml_cont_4d( + ctx0, learned_pos_embd, + n_embd * 2, n_patches_x / 2, n_patches_y, batch_size); + learned_pos_embd = ggml_reshape_4d( + ctx0, learned_pos_embd, + n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2)); + learned_pos_embd = ggml_permute(ctx0, learned_pos_embd, 0, 2, 1, 3); + learned_pos_embd = ggml_cont_3d( + ctx0, learned_pos_embd, + n_embd, n_patches_x * n_patches_y, batch_size); + cb(learned_pos_embd, "learned_pos_embd", -1); + } auto add_pos = [&](ggml_tensor * cur, const clip_layer &) { return ggml_rope_multi(