diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index adce4f8390..b5e56f87ca 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -3777,7 +3777,14 @@ class QwenModel(TextModel):
         self._set_vocab_qwen()
 
 
-@ModelBase.register("Qwen2Model", "Qwen2ForCausalLM", "Qwen2AudioForConditionalGeneration", "KORMoForCausalLM", "AudioFlamingo3ForConditionalGeneration")
+@ModelBase.register(
+    "Qwen2Model",
+    "Qwen2ForCausalLM",
+    "Qwen2AudioForConditionalGeneration",
+    "KORMoForCausalLM",
+    "AudioFlamingo3ForConditionalGeneration",
+    "DotsOCRForCausalLM",
+)
 class Qwen2Model(TextModel):
     model_arch = gguf.MODEL_ARCH.QWEN2
 
@@ -3798,7 +3805,8 @@ class Qwen2Model(TextModel):
             name = name.replace("language_model.", "") # for InternVL
         if name.startswith("mlp") or name.startswith("multi_modal_projector") \
                 or name.startswith("vision_model") or name.startswith("audio_tower") \
-                or name.startswith("model.vision_tower") or name.startswith("model.multi_modal_projector"):
+                or name.startswith("model.vision_tower") or name.startswith("model.multi_modal_projector") \
+                or name.startswith("vision_tower."):
             # skip vision and audio tensors
             return
         yield from super().modify_tensors(data_torch, name, bid)
@@ -12819,6 +12827,37 @@ class SolarOpenModel(Glm4MoeModel):
         special_vocab.add_to_gguf(self.gguf_writer)
 
 
+@ModelBase.register("DotsOCRForCausalLM")
+class DotsOCRVisionModel(MmprojModel):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        assert self.hparams_vision is not None
+        self.hparams_vision["image_size"] = 0  # dynamic resolution
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.DOTSOCR)
+        self.gguf_writer.add_vision_min_pixels(self.preprocessor_config["min_pixels"])
+        self.gguf_writer.add_vision_max_pixels(self.preprocessor_config["max_pixels"])
+        self.gguf_writer.add_vision_attention_layernorm_eps(self.find_vparam(["rms_norm_eps"]))
+        self.gguf_writer.add_vision_projector_scale_factor(self.find_vparam(["spatial_merge_size"]))
+        self.gguf_writer.add_vision_use_silu(True)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if name.startswith("vision_tower."):
+            if "vision_tower.blocks." in name and ".mlp." in name:
+                # note: to avoid naming conflicts in tensor_mapping.py, we need to handle FFN renaming here
+                #   x = F.silu(self.fc1(x)) * self.fc3(x)
+                #   x = self.fc2(x)
+                # fc1 -> gate, fc2 -> down, fc3 -> up
+                # mapping original names to the Qwen2.5 naming scheme
+                name = name.replace("vision_tower.blocks.", "visual.blocks.")
+                name = name.replace(".fc1", ".gate_proj")
+                name = name.replace(".fc2", ".down_proj")
+                name = name.replace(".fc3", ".up_proj")
+            yield from super().modify_tensors(data_torch, name, bid)
+
+
 ###### CONVERSION LOGIC ######
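Note: the fc1/fc2/fc3 -> gate/down/up renaming above is easy to get backwards, so here is a quick sanity check of the equivalence, sketched in PyTorch under the assumption (stated in the comment above) that dots.ocr's vision FFN computes x = fc2(silu(fc1(x)) * fc3(x)). The function names dots_ocr_ffn and qwen25_style_ffn are illustrative, not code from either repo.

import torch
import torch.nn.functional as F

def dots_ocr_ffn(x, fc1, fc2, fc3):
    # original dots.ocr formulation: x = F.silu(fc1(x)) * fc3(x); x = fc2(x)
    return fc2(F.silu(fc1(x)) * fc3(x))

def qwen25_style_ffn(x, gate_proj, up_proj, down_proj):
    # llama.cpp / Qwen2.5 SwiGLU naming: down(silu(gate(x)) * up(x))
    return down_proj(F.silu(gate_proj(x)) * up_proj(x))

fc1, fc2, fc3 = torch.nn.Linear(8, 16), torch.nn.Linear(16, 8), torch.nn.Linear(8, 16)
x = torch.randn(2, 8)
# identical once gate_proj = fc1, up_proj = fc3, down_proj = fc2
assert torch.allclose(dots_ocr_ffn(x, fc1, fc2, fc3),
                      qwen25_style_ffn(x, fc1, fc3, fc2))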
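Note: the tensor_mapping.py entries above are what let the converter translate dots.ocr's HF tensor names into the generic mmproj names; each "{bid}" pattern is expanded once per block. A minimal sketch of how a new pattern resolves, assuming the gguf-py package from this repo (get_tensor_name_map and MODEL_ARCH.MMPROJ are the entry points the converter itself uses; the block count of 42 and the printed name are illustrative):

import gguf

# build the HF-name -> GGUF-name map for the mmproj arch, 42 vision blocks assumed
name_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.MMPROJ, 42)
print(name_map.get_name("vision_tower.blocks.0.attn.qkv.weight",
                        try_suffixes=(".weight", ".bias")))
# expected per TENSOR_NAMES in constants.py: something like "v.blk.0.attn_qkv.weight"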
"vision_model.transformer.resblocks.{bid}.attn.out_proj", # Step3-VL ), @@ -1567,6 +1573,7 @@ class TensorNameMap: "siglip2.vision_model.encoder.layers.{bid}.layer_norm2", "vision_model.radio_model.model.blocks.{bid}.norm2", # Nemotron Nano v2 VL "vision_model.model.layers.{bid}.pre_feedforward_layernorm", # gemma4 + "vision_tower.blocks.{bid}.norm2", # dots.ocr "vision_model.transformer.resblocks.{bid}.ln_2", # Step3-VL ), @@ -1649,6 +1656,7 @@ class TensorNameMap: "vision_encoder.ln_pre", # pixtral "vision_model.layernorm_pre", # llama4 "model.vision_model.pre_layrnorm", # Deepseek-OCR CLIP + "vision_tower.patch_embed.patchifier.norm", # dots.ocr "vision_model.ln_pre", # Step3-VL ), @@ -1664,6 +1672,7 @@ class TensorNameMap: MODEL_TENSOR.V_MM_POST_NORM: ( "visual.merger.post_projection_norm", # glm4v + "vision_tower.post_trunk_norm", # dots.ocr "vit.perceive.after_rms", # HunyuanOCR ), @@ -1680,6 +1689,7 @@ class TensorNameMap: "model.vision.linear_proj.norm1", # cogvlm "mlp_AR.pre_norm", # PaddleOCR-VL "merger.ln_q", + "vision_tower.merger.ln_q", # dots.ocr ), MODEL_TENSOR.V_MM_SOFT_EMB_NORM: ( diff --git a/tools/mtmd/CMakeLists.txt b/tools/mtmd/CMakeLists.txt index 151c15d704..6a4267d2e1 100644 --- a/tools/mtmd/CMakeLists.txt +++ b/tools/mtmd/CMakeLists.txt @@ -17,6 +17,7 @@ add_library(mtmd models/models.h models/cogvlm.cpp models/conformer.cpp + models/dotsocr.cpp models/gemma4v.cpp models/glm4v.cpp models/hunyuanocr.cpp diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index 0c3e60e1a8..c812e6c4b5 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -266,6 +266,7 @@ enum projector_type { PROJECTOR_TYPE_LIGHTONOCR, PROJECTOR_TYPE_COGVLM, PROJECTOR_TYPE_JANUS_PRO, + PROJECTOR_TYPE_DOTS_OCR, PROJECTOR_TYPE_DEEPSEEKOCR, PROJECTOR_TYPE_LFM2A, PROJECTOR_TYPE_GLM4V, @@ -308,6 +309,7 @@ static std::map PROJECTOR_TYPE_NAMES = { { PROJECTOR_TYPE_LIGHTONOCR,"lightonocr"}, { PROJECTOR_TYPE_COGVLM, "cogvlm"}, { PROJECTOR_TYPE_JANUS_PRO, "janus_pro"}, + { PROJECTOR_TYPE_DOTS_OCR, "dots_ocr"}, { PROJECTOR_TYPE_DEEPSEEKOCR,"deepseekocr"}, { PROJECTOR_TYPE_LFM2A, "lfm2a"}, { PROJECTOR_TYPE_GLM4V, "glm4v"}, diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 9c886bc890..b947a4183e 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -853,6 +853,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 { builder = std::make_unique(ctx, img); } break; + case PROJECTOR_TYPE_DOTS_OCR: + { + builder = std::make_unique(ctx, img); + } break; case PROJECTOR_TYPE_QWEN2VL: case PROJECTOR_TYPE_QWEN25VL: { @@ -1269,6 +1273,14 @@ struct clip_model_loader { get_u32(KEY_PREPROC_IMAGE_SIZE, hparams.image_longest_edge, false); hparams.set_warmup_n_tokens(256); // avoid OOM on warmup } break; + case PROJECTOR_TYPE_DOTS_OCR: + { + hparams.rope_theta = 10000.0f; + get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge); + get_u32(KEY_IMAGE_MIN_PIXELS, hparams.image_min_pixels); + get_u32(KEY_IMAGE_MAX_PIXELS, hparams.image_max_pixels); + hparams.set_warmup_n_tokens(46*46); // avoid OOM on warmup + } break; case PROJECTOR_TYPE_KIMIVL: { hparams.image_resize_algo = RESIZE_ALGO_BILINEAR; @@ -1983,6 +1995,17 @@ struct clip_model_loader { model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false); model.mm_patch_merger_w = get_tensor(string_format(TN_MM_PATCH_MERGER, "weight"), false); } break; + case PROJECTOR_TYPE_DOTS_OCR: + { + model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight")); + model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, 
"bias")); + model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight")); + model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias")); + model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM); + model.mm_input_norm_b = get_tensor(TN_MM_INP_NORM_B); + // post_trunk_norm: applied after all ViT blocks, before the merger + model.post_ln_w = get_tensor(string_format(TN_MM_POST_NORM, "weight")); + } break; case PROJECTOR_TYPE_ULTRAVOX: { model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight")); @@ -2763,6 +2786,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im n_patches = x_patch * y_patch; } break; case PROJECTOR_TYPE_PADDLEOCR: + case PROJECTOR_TYPE_DOTS_OCR: { // dynamic size int n_merge = ctx->model.hparams.n_merge; @@ -3071,6 +3095,28 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima } } + set_input_i32("positions", positions); + } break; + case PROJECTOR_TYPE_DOTS_OCR: + { + const int pw = image_size_width / patch_size; + const int ph = image_size_height / patch_size; + const int n_pos = ph * pw; + std::vector positions(n_pos * 4); + int ptr = 0; + + // flat layout: [h, w, h, w] for each patch + // patches are in raster order (matching conv2d output) + for (int y = 0; y < ph; y++) { + for (int x = 0; x < pw; x++) { + positions[ ptr] = y; + positions[ n_pos + ptr] = x; + positions[2*n_pos + ptr] = y; + positions[3*n_pos + ptr] = x; + ptr++; + } + } + set_input_i32("positions", positions); } break; case PROJECTOR_TYPE_QWEN25VL: @@ -3388,6 +3434,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) { case PROJECTOR_TYPE_PHI4: case PROJECTOR_TYPE_PIXTRAL: case PROJECTOR_TYPE_LIGHTONOCR: + case PROJECTOR_TYPE_DOTS_OCR: return ctx->model.mm_2_w->ne[1]; case PROJECTOR_TYPE_MLP_NORM: return ctx->model.mm_3_b->ne[0]; diff --git a/tools/mtmd/models/dotsocr.cpp b/tools/mtmd/models/dotsocr.cpp new file mode 100644 index 0000000000..92974bb670 --- /dev/null +++ b/tools/mtmd/models/dotsocr.cpp @@ -0,0 +1,49 @@ +#include "models.h" + +ggml_cgraph * clip_graph_dotsocr::build() { + const int n_pos = n_patches; + const int num_position_ids = n_pos * 4; // m-rope requires 4 dim per position + + // note: similar to PaddleOCR + int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4}; + + ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids); + ggml_set_name(positions, "positions"); + ggml_set_input(positions); + + auto add_pos = [&](ggml_tensor * cur, const clip_layer &) { + return ggml_rope_multi( + ctx0, cur, positions, nullptr, + d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, + 32768, 10000, 1, 0, 1, 32, 1); + }; + + ggml_tensor * inp = build_inp(); + ggml_tensor * cur = build_vit( + inp, n_patches, + NORM_TYPE_RMS, + hparams.ffn_op, + nullptr, + add_pos); + + cb(cur, "vit_out", -1); + + // dots.ocr patch merger + projector + { + GGML_ASSERT(hparams.n_merge > 0); + cur = build_norm(cur, model.mm_input_norm_w, model.mm_input_norm_b, NORM_TYPE_NORMAL, 1e-6, -1); + cur = build_patch_merge_permute(cur, hparams.n_merge); + cb(cur, "after_patch_merger", -1); + cur = build_ffn(cur, + model.mm_0_w, model.mm_0_b, + nullptr, nullptr, // no gate + model.mm_2_w, model.mm_2_b, + FFN_GELU_ERF, -1); // nn.GELU() defaults to exact erf-based GELU + cb(cur, "after_projector", -1); + } + + // build the graph + ggml_build_forward_expand(gf, cur); + + return gf; +} diff --git a/tools/mtmd/models/models.h b/tools/mtmd/models/models.h index 47e2cde2b9..5f5b76040d 100644 --- 
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index 4cbb3301ea..41c5211375 100644
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -375,6 +375,13 @@ struct mtmd_context {
                     img_end = "<|im_end|>";
                     image_preproc = std::make_unique(ctx_v);
                 } break;
+            case PROJECTOR_TYPE_DOTS_OCR:
+                {
+                    // <|img|> ... (image embeddings) ... <|endofimg|>
+                    img_beg = "<|img|>";
+                    img_end = "<|endofimg|>";
+                    image_preproc = std::make_unique(ctx_v);
+                } break;
             case PROJECTOR_TYPE_NEMOTRON_V2_VL:
                 {
                     image_preproc = std::make_unique(ctx_v);
diff --git a/tools/mtmd/tests.sh b/tools/mtmd/tests.sh
index d6a6b03c85..651f7a6271 100755
--- a/tools/mtmd/tests.sh
+++ b/tools/mtmd/tests.sh
@@ -89,6 +89,7 @@ add_test_vision "ggml-org/LFM2-VL-450M-GGUF:Q8_0"
 add_test_vision "ggml-org/granite-docling-258M-GGUF:Q8_0"
 add_test_vision "ggml-org/LightOnOCR-1B-1025-GGUF:Q8_0"
 add_test_vision "ggml-org/DeepSeek-OCR-GGUF:Q8_0" -p "Free OCR." --chat-template deepseek-ocr
+add_test_vision "ggml-org/dots.ocr-GGUF:Q8_0" -p "OCR"
 add_test_vision "ggml-org/HunyuanOCR-GGUF:Q8_0" -p "OCR"
 
 add_test_audio  "ggml-org/ultravox-v0_5-llama-3_2-1b-GGUF:Q8_0"
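Note: at the prompt level, the mtmd.cpp change means dots.ocr image embeddings are wrapped in <|img|> ... <|endofimg|>. A sketch of what happens to mtmd's generic media marker ("<__media__>" is mtmd's default marker string; the text replacement below is illustrative, since the real embeddings are injected at eval time rather than spliced in as text):

MTMD_DEFAULT_MEDIA_MARKER = "<__media__>"

def expand_marker(prompt: str,
                  img_beg: str = "<|img|>",
                  img_end: str = "<|endofimg|>") -> str:
    # the image embeddings land between img_beg and img_end at eval time
    return prompt.replace(MTMD_DEFAULT_MEDIA_MARKER,
                          img_beg + "<image embeddings>" + img_end)

print(expand_marker("<__media__>\nOCR"))  # matches the tests.sh prompt above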