mtmd: support dots.ocr (#17575)
* convert gguf * clip impl * fix conversion * wip * corrections * update docs * add gguf to test script
This commit is contained in:
parent
0ec191e1d7
commit
501aeed18f
|
|
@ -3777,7 +3777,14 @@ class QwenModel(TextModel):
|
|||
self._set_vocab_qwen()
|
||||
|
||||
|
||||
@ModelBase.register("Qwen2Model", "Qwen2ForCausalLM", "Qwen2AudioForConditionalGeneration", "KORMoForCausalLM", "AudioFlamingo3ForConditionalGeneration")
|
||||
@ModelBase.register(
|
||||
"Qwen2Model",
|
||||
"Qwen2ForCausalLM",
|
||||
"Qwen2AudioForConditionalGeneration",
|
||||
"KORMoForCausalLM",
|
||||
"AudioFlamingo3ForConditionalGeneration",
|
||||
"DotsOCRForCausalLM",
|
||||
)
|
||||
class Qwen2Model(TextModel):
|
||||
model_arch = gguf.MODEL_ARCH.QWEN2
|
||||
|
||||
|
|
@ -3798,7 +3805,8 @@ class Qwen2Model(TextModel):
|
|||
name = name.replace("language_model.", "") # for InternVL
|
||||
if name.startswith("mlp") or name.startswith("multi_modal_projector") \
|
||||
or name.startswith("vision_model") or name.startswith("audio_tower") \
|
||||
or name.startswith("model.vision_tower") or name.startswith("model.multi_modal_projector"):
|
||||
or name.startswith("model.vision_tower") or name.startswith("model.multi_modal_projector") \
|
||||
or name.startswith("vision_tower."):
|
||||
# skip vision and audio tensors
|
||||
return
|
||||
yield from super().modify_tensors(data_torch, name, bid)
|
||||
|
|
@ -12819,6 +12827,37 @@ class SolarOpenModel(Glm4MoeModel):
|
|||
special_vocab.add_to_gguf(self.gguf_writer)
|
||||
|
||||
|
||||
@ModelBase.register("DotsOCRForCausalLM")
class DotsOCRVisionModel(MmprojModel):
    """Multimodal-projector converter for the dots.ocr vision tower.

    Only tensors whose name starts with ``vision_tower.`` are emitted by this
    class; every other tensor of the checkpoint is ignored here.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        assert self.hparams_vision is not None
        # dynamic resolution
        self.hparams_vision["image_size"] = 0

    def set_gguf_parameters(self):
        """Write the dots.ocr specific vision metadata on top of the base keys."""
        super().set_gguf_parameters()
        writer = self.gguf_writer
        writer.add_clip_projector_type(gguf.VisionProjectorType.DOTSOCR)
        # pixel budget comes straight from the preprocessor config
        writer.add_vision_min_pixels(self.preprocessor_config["min_pixels"])
        writer.add_vision_max_pixels(self.preprocessor_config["max_pixels"])
        writer.add_vision_attention_layernorm_eps(self.find_vparam(["rms_norm_eps"]))
        writer.add_vision_projector_scale_factor(self.find_vparam(["spatial_merge_size"]))
        writer.add_vision_use_silu(True)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        """Yield the mapped vision tensors, renaming the FFN weights of each block."""
        if not name.startswith("vision_tower."):
            # not a vision tensor: nothing to emit
            return

        if "vision_tower.blocks." in name and ".mlp." in name:
            # note: to avoid naming conflicts in tensor_mapping.py, the FFN renaming is handled here
            #   x = F.silu(self.fc1(x)) * self.fc3(x)
            #   x = self.fc2(x)
            # fc1 -> gate, fc2 -> down, fc3 -> up
            # (original names are mapped to the Qwen2.5 naming scheme)
            renames = (
                ("vision_tower.blocks.", "visual.blocks."),
                (".fc1", ".gate_proj"),
                (".fc2", ".down_proj"),
                (".fc3", ".up_proj"),
            )
            for old, new in renames:
                name = name.replace(old, new)

        yield from super().modify_tensors(data_torch, name, bid)
|
||||
|
||||
|
||||
###### CONVERSION LOGIC ######
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -37,6 +37,7 @@ llama-server -hf ggml-org/gemma-3-4b-it-GGUF --no-mmproj-offload
|
|||
> - PaddleOCR-VL: https://github.com/ggml-org/llama.cpp/pull/18825
|
||||
> - GLM-OCR: https://github.com/ggml-org/llama.cpp/pull/19677
|
||||
> - Deepseek-OCR: https://github.com/ggml-org/llama.cpp/pull/17400
|
||||
> - Dots.OCR: https://github.com/ggml-org/llama.cpp/pull/17575
|
||||
> - HunyuanOCR: https://github.com/ggml-org/llama.cpp/pull/21395
|
||||
|
||||
## Pre-quantized models
|
||||
|
|
|
|||
|
|
@ -4122,6 +4122,7 @@ class VisionProjectorType:
|
|||
LIGHTONOCR = "lightonocr"
|
||||
COGVLM = "cogvlm"
|
||||
JANUS_PRO = "janus_pro"
|
||||
DOTSOCR = "dots_ocr"
|
||||
DEEPSEEKOCR = "deepseekocr"
|
||||
LFM2A = "lfm2a" # audio
|
||||
MUSIC_FLAMINGO = "musicflamingo" # audio
|
||||
|
|
|
|||
|
|
@ -1359,6 +1359,7 @@ class TensorNameMap:
|
|||
"visual.merger.mlp.{bid}", # qwen2vl
|
||||
"mlp_AR.linear_{bid}", # PaddleOCR-VL
|
||||
"merger.mlp.{bid}",
|
||||
"vision_tower.merger.mlp.{bid}", # dots.ocr
|
||||
"vit.perceive.proj.{bid}", # HunyuanOCR (proj.0 = conv1, proj.2 = conv2)
|
||||
),
|
||||
|
||||
|
|
@ -1406,11 +1407,13 @@ class TensorNameMap:
|
|||
"siglip2.vision_model.embeddings.patch_embedding",
|
||||
"vision_model.radio_model.model.patch_generator.embedder", # Nemotron Nano v2 VL
|
||||
"model.vision_tower.patch_embedder.input_proj", # gemma4
|
||||
"vision_tower.patch_embed.patchifier.proj", # dots.ocr
|
||||
"vision_model.conv1", # Step3-VL
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_ENC_EMBD_NORM: (
|
||||
"visual.post_conv_layernorm", # glm4v
|
||||
"vision_tower.patch_embed.patchifier.norm", # dots.ocr
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_ENC_EMBD_POS: (
|
||||
|
|
@ -1441,6 +1444,7 @@ class TensorNameMap:
|
|||
|
||||
MODEL_TENSOR.V_ENC_ATTN_QKV: (
|
||||
"visual.blocks.{bid}.attn.qkv", # qwen3vl
|
||||
"vision_tower.blocks.{bid}.attn.qkv", # dots.ocr
|
||||
"model.vision.transformer.layers.{bid}.attention.query_key_value", # cogvlm
|
||||
"model.vision_model.transformer.layers.{bid}.self_attn.qkv_proj", # Deepseek-OCR CLIP
|
||||
"vision_tower.encoder.blocks.{bid}.wqkv", # Kimi-K2.5
|
||||
|
|
@ -1526,6 +1530,7 @@ class TensorNameMap:
|
|||
"model.vision_model.transformer.layers.{bid}.layer_norm1", # Deepseek-OCR CLIP
|
||||
"siglip2.vision_model.encoder.layers.{bid}.layer_norm1",
|
||||
"vision_model.radio_model.model.blocks.{bid}.norm1", # Nemotron Nano v2 VL
|
||||
"vision_tower.blocks.{bid}.norm1", # dots.ocr
|
||||
"vision_model.transformer.resblocks.{bid}.ln_1", # Step3-VL
|
||||
),
|
||||
|
||||
|
|
@ -1547,6 +1552,7 @@ class TensorNameMap:
|
|||
"siglip2.vision_model.encoder.layers.{bid}.self_attn.out_proj", # youtuvl
|
||||
"vision_model.radio_model.model.blocks.{bid}.attn.proj", # Nemotron Nano v2 VL
|
||||
"vision_model.model.layers.{bid}.self_attn.o_proj.linear", # gemma4
|
||||
"vision_tower.blocks.{bid}.attn.proj", # dots.ocr
|
||||
"vision_model.transformer.resblocks.{bid}.attn.out_proj", # Step3-VL
|
||||
),
|
||||
|
||||
|
|
@ -1567,6 +1573,7 @@ class TensorNameMap:
|
|||
"siglip2.vision_model.encoder.layers.{bid}.layer_norm2",
|
||||
"vision_model.radio_model.model.blocks.{bid}.norm2", # Nemotron Nano v2 VL
|
||||
"vision_model.model.layers.{bid}.pre_feedforward_layernorm", # gemma4
|
||||
"vision_tower.blocks.{bid}.norm2", # dots.ocr
|
||||
"vision_model.transformer.resblocks.{bid}.ln_2", # Step3-VL
|
||||
),
|
||||
|
||||
|
|
@ -1649,6 +1656,7 @@ class TensorNameMap:
|
|||
"vision_encoder.ln_pre", # pixtral
|
||||
"vision_model.layernorm_pre", # llama4
|
||||
"model.vision_model.pre_layrnorm", # Deepseek-OCR CLIP
|
||||
"vision_tower.patch_embed.patchifier.norm", # dots.ocr
|
||||
"vision_model.ln_pre", # Step3-VL
|
||||
),
|
||||
|
||||
|
|
@ -1664,6 +1672,7 @@ class TensorNameMap:
|
|||
|
||||
MODEL_TENSOR.V_MM_POST_NORM: (
|
||||
"visual.merger.post_projection_norm", # glm4v
|
||||
"vision_tower.post_trunk_norm", # dots.ocr
|
||||
"vit.perceive.after_rms", # HunyuanOCR
|
||||
),
|
||||
|
||||
|
|
@ -1680,6 +1689,7 @@ class TensorNameMap:
|
|||
"model.vision.linear_proj.norm1", # cogvlm
|
||||
"mlp_AR.pre_norm", # PaddleOCR-VL
|
||||
"merger.ln_q",
|
||||
"vision_tower.merger.ln_q", # dots.ocr
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_MM_SOFT_EMB_NORM: (
|
||||
|
|
|
|||
|
|
@ -17,6 +17,7 @@ add_library(mtmd
|
|||
models/models.h
|
||||
models/cogvlm.cpp
|
||||
models/conformer.cpp
|
||||
models/dotsocr.cpp
|
||||
models/gemma4v.cpp
|
||||
models/glm4v.cpp
|
||||
models/hunyuanocr.cpp
|
||||
|
|
|
|||
|
|
@ -266,6 +266,7 @@ enum projector_type {
|
|||
PROJECTOR_TYPE_LIGHTONOCR,
|
||||
PROJECTOR_TYPE_COGVLM,
|
||||
PROJECTOR_TYPE_JANUS_PRO,
|
||||
PROJECTOR_TYPE_DOTS_OCR,
|
||||
PROJECTOR_TYPE_DEEPSEEKOCR,
|
||||
PROJECTOR_TYPE_LFM2A,
|
||||
PROJECTOR_TYPE_GLM4V,
|
||||
|
|
@ -308,6 +309,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
|
|||
{ PROJECTOR_TYPE_LIGHTONOCR,"lightonocr"},
|
||||
{ PROJECTOR_TYPE_COGVLM, "cogvlm"},
|
||||
{ PROJECTOR_TYPE_JANUS_PRO, "janus_pro"},
|
||||
{ PROJECTOR_TYPE_DOTS_OCR, "dots_ocr"},
|
||||
{ PROJECTOR_TYPE_DEEPSEEKOCR,"deepseekocr"},
|
||||
{ PROJECTOR_TYPE_LFM2A, "lfm2a"},
|
||||
{ PROJECTOR_TYPE_GLM4V, "glm4v"},
|
||||
|
|
|
|||
|
|
@ -853,6 +853,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
|||
{
|
||||
builder = std::make_unique<clip_graph_pixtral>(ctx, img);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_DOTS_OCR:
|
||||
{
|
||||
builder = std::make_unique<clip_graph_dotsocr>(ctx, img);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_QWEN2VL:
|
||||
case PROJECTOR_TYPE_QWEN25VL:
|
||||
{
|
||||
|
|
@ -1269,6 +1273,14 @@ struct clip_model_loader {
|
|||
get_u32(KEY_PREPROC_IMAGE_SIZE, hparams.image_longest_edge, false);
|
||||
hparams.set_warmup_n_tokens(256); // avoid OOM on warmup
|
||||
} break;
|
||||
case PROJECTOR_TYPE_DOTS_OCR:
|
||||
{
|
||||
hparams.rope_theta = 10000.0f;
|
||||
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge);
|
||||
get_u32(KEY_IMAGE_MIN_PIXELS, hparams.image_min_pixels);
|
||||
get_u32(KEY_IMAGE_MAX_PIXELS, hparams.image_max_pixels);
|
||||
hparams.set_warmup_n_tokens(46*46); // avoid OOM on warmup
|
||||
} break;
|
||||
case PROJECTOR_TYPE_KIMIVL:
|
||||
{
|
||||
hparams.image_resize_algo = RESIZE_ALGO_BILINEAR;
|
||||
|
|
@ -1983,6 +1995,17 @@ struct clip_model_loader {
|
|||
model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false);
|
||||
model.mm_patch_merger_w = get_tensor(string_format(TN_MM_PATCH_MERGER, "weight"), false);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_DOTS_OCR:
|
||||
{
|
||||
model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));
|
||||
model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"));
|
||||
model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
|
||||
model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
|
||||
model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM);
|
||||
model.mm_input_norm_b = get_tensor(TN_MM_INP_NORM_B);
|
||||
// post_trunk_norm: applied after all ViT blocks, before the merger
|
||||
model.post_ln_w = get_tensor(string_format(TN_MM_POST_NORM, "weight"));
|
||||
} break;
|
||||
case PROJECTOR_TYPE_ULTRAVOX:
|
||||
{
|
||||
model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
|
||||
|
|
@ -2763,6 +2786,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
|
|||
n_patches = x_patch * y_patch;
|
||||
} break;
|
||||
case PROJECTOR_TYPE_PADDLEOCR:
|
||||
case PROJECTOR_TYPE_DOTS_OCR:
|
||||
{
|
||||
// dynamic size
|
||||
int n_merge = ctx->model.hparams.n_merge;
|
||||
|
|
@ -3071,6 +3095,28 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
|
|||
}
|
||||
}
|
||||
|
||||
set_input_i32("positions", positions);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_DOTS_OCR:
|
||||
{
|
||||
const int pw = image_size_width / patch_size;
|
||||
const int ph = image_size_height / patch_size;
|
||||
const int n_pos = ph * pw;
|
||||
std::vector<int> positions(n_pos * 4);
|
||||
int ptr = 0;
|
||||
|
||||
// flat layout: [h, w, h, w] for each patch
|
||||
// patches are in raster order (matching conv2d output)
|
||||
for (int y = 0; y < ph; y++) {
|
||||
for (int x = 0; x < pw; x++) {
|
||||
positions[ ptr] = y;
|
||||
positions[ n_pos + ptr] = x;
|
||||
positions[2*n_pos + ptr] = y;
|
||||
positions[3*n_pos + ptr] = x;
|
||||
ptr++;
|
||||
}
|
||||
}
|
||||
|
||||
set_input_i32("positions", positions);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_QWEN25VL:
|
||||
|
|
@ -3388,6 +3434,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
|
|||
case PROJECTOR_TYPE_PHI4:
|
||||
case PROJECTOR_TYPE_PIXTRAL:
|
||||
case PROJECTOR_TYPE_LIGHTONOCR:
|
||||
case PROJECTOR_TYPE_DOTS_OCR:
|
||||
return ctx->model.mm_2_w->ne[1];
|
||||
case PROJECTOR_TYPE_MLP_NORM:
|
||||
return ctx->model.mm_3_b->ne[0];
|
||||
|
|
|
|||
|
|
@ -0,0 +1,49 @@
|
|||
#include "models.h"
|
||||
|
||||
// Builds the dots.ocr vision graph: ViT with multi-section rope on the
// attention inputs, followed by the patch merger and a two-layer projector.
ggml_cgraph * clip_graph_dotsocr::build() {
    const int n_pos            = n_patches;
    const int num_position_ids = 4 * n_pos; // m-rope requires 4 dim per position

    // note: similar to PaddleOCR
    const int section = d_head / 4;
    int mrope_sections[4] = {section, section, section, section};

    // graph input filled by the caller under the name "positions"
    ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids);
    ggml_set_name(positions, "positions");
    ggml_set_input(positions);

    // positional encoding hook handed to build_vit; the layer argument is unused
    auto apply_rope = [&](ggml_tensor * x, const clip_layer &) {
        ggml_tensor * rotated = ggml_rope_multi(
            ctx0, x, positions, nullptr,
            d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION,
            32768, 10000, 1, 0, 1, 32, 1);
        return rotated;
    };

    ggml_tensor * inp     = build_inp();
    ggml_tensor * vit_out = build_vit(
        inp, n_patches,
        NORM_TYPE_RMS,
        hparams.ffn_op,
        nullptr,
        apply_rope);

    cb(vit_out, "vit_out", -1);

    // dots.ocr patch merger + projector
    GGML_ASSERT(hparams.n_merge > 0);

    ggml_tensor * normed = build_norm(
        vit_out,
        model.mm_input_norm_w, model.mm_input_norm_b,
        NORM_TYPE_NORMAL, 1e-6, -1);

    ggml_tensor * merged = build_patch_merge_permute(normed, hparams.n_merge);
    cb(merged, "after_patch_merger", -1);

    ggml_tensor * projected = build_ffn(
        merged,
        model.mm_0_w, model.mm_0_b,
        nullptr, nullptr,           // no gate
        model.mm_2_w, model.mm_2_b,
        FFN_GELU_ERF, -1);          // nn.GELU() defaults to exact erf-based GELU
    cb(projected, "after_projector", -1);

    // build the graph
    ggml_build_forward_expand(gf, projected);

    return gf;
}
|
||||
|
|
@ -73,6 +73,11 @@ struct clip_graph_paddleocr : clip_graph {
|
|||
ggml_cgraph * build() override;
|
||||
};
|
||||
|
||||
struct clip_graph_dotsocr : clip_graph {
|
||||
clip_graph_dotsocr(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
|
||||
ggml_cgraph * build() override;
|
||||
};
|
||||
|
||||
struct clip_graph_cogvlm : clip_graph {
|
||||
clip_graph_cogvlm(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
|
||||
ggml_cgraph * build() override;
|
||||
|
|
|
|||
|
|
@ -375,6 +375,13 @@ struct mtmd_context {
|
|||
img_end = "<|im_end|>";
|
||||
image_preproc = std::make_unique<mtmd_image_preprocessor_longest_edge>(ctx_v);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_DOTS_OCR:
|
||||
{
|
||||
// <|img|> ... (image embeddings) ... <|endofimg|>
|
||||
img_beg = "<|img|>";
|
||||
img_end = "<|endofimg|>";
|
||||
image_preproc = std::make_unique<mtmd_image_preprocessor_dyn_size>(ctx_v);
|
||||
} break;
|
||||
case PROJECTOR_TYPE_NEMOTRON_V2_VL:
|
||||
{
|
||||
image_preproc = std::make_unique<mtmd_image_preprocessor_fixed_size>(ctx_v);
|
||||
|
|
|
|||
|
|
@ -89,6 +89,7 @@ add_test_vision "ggml-org/LFM2-VL-450M-GGUF:Q8_0"
|
|||
add_test_vision "ggml-org/granite-docling-258M-GGUF:Q8_0"
|
||||
add_test_vision "ggml-org/LightOnOCR-1B-1025-GGUF:Q8_0"
|
||||
add_test_vision "ggml-org/DeepSeek-OCR-GGUF:Q8_0" -p "Free OCR." --chat-template deepseek-ocr
|
||||
add_test_vision "ggml-org/dots.ocr-GGUF:Q8_0" -p "OCR"
|
||||
add_test_vision "ggml-org/HunyuanOCR-GGUF:Q8_0" -p "OCR"
|
||||
|
||||
add_test_audio "ggml-org/ultravox-v0_5-llama-3_2-1b-GGUF:Q8_0"
|
||||
|
|
|
|||
Loading…
Reference in New Issue