diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 6d07b9acdb..390edfe864 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -5790,16 +5790,16 @@ class Gemma3VisionModel(MmprojModel):
 
 @ModelBase.register("DeepseekOCRForCausalLM")
 class DeepseekOCRVisionModel(MmprojModel):
-    def __init__(self, *args, **kwargs):
+    def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-
+
         proc_fname = self.dir_model / "processor_config.json"
-
+
         if proc_fname.is_file():
             with open(proc_fname, "r") as f:
                 self.preprocessor_config = json.load(f)
-
-
+
+
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
         hparams = self.hparams
@@ -5857,7 +5857,7 @@ class DeepseekOCRVisionModel(MmprojModel):
             return [(self.map_tensor_name(name, try_suffixes=("",)), data_torch)]
 
         return [(self.map_tensor_name(name), data_torch)]
-
+
 
 @ModelBase.register("Gemma3nForConditionalGeneration")
 class Gemma3NModel(Gemma3Model):
diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h
index 4cb2808c26..520e0cf508 100644
--- a/tools/mtmd/clip-impl.h
+++ b/tools/mtmd/clip-impl.h
@@ -130,18 +130,18 @@
 #define TN_TOK_EOI          "v.eoi"
 
 // deepseek-ocr
-#define TN_SAM_POS_EMBD     "sam.pos_embd"
-#define TN_SAM_PATCH_EMBD   "sam.patch_embd.%s"
-#define TN_SAM_PRE_NORM     "sam.blk.%d.pre_ln.%s"
-#define TN_SAM_POST_NORM    "sam.blk.%d.post_ln"
-#define TN_SAM_ATTN_POS_H   "sam.blk.%d.attn.pos_h"
-#define TN_SAM_ATTN_POS_W   "sam.blk.%d.attn.pos_w"
-#define TN_SAM_ATTN_QKV     "sam.blk.%d.attn.qkv.%s"
-#define TN_SAM_ATTN_OUT     "sam.blk.%d.attn.out.%s"
-#define TN_SAM_FFN_UP       "sam.blk.%d.mlp.lin1.%s"
-#define TN_SAM_FFN_DOWN     "sam.blk.%d.mlp.lin2.%s"
-#define TN_SAM_NECK         "sam.neck.%d.%s"
-#define TN_SAM_NET          "sam.net_%d.%s"
+#define TN_SAM_POS_EMBD     "v.sam.pos_embd"
+#define TN_SAM_PATCH_EMBD   "v.sam.patch_embd.%s"
+#define TN_SAM_PRE_NORM     "v.sam.blk.%d.pre_ln.%s"
+#define TN_SAM_POST_NORM    "v.sam.blk.%d.post_ln"
+#define TN_SAM_ATTN_POS_H   "v.sam.blk.%d.attn.pos_h"
+#define TN_SAM_ATTN_POS_W   "v.sam.blk.%d.attn.pos_w"
+#define TN_SAM_ATTN_QKV     "v.sam.blk.%d.attn.qkv.%s"
+#define TN_SAM_ATTN_OUT     "v.sam.blk.%d.attn.out.%s"
+#define TN_SAM_FFN_UP       "v.sam.blk.%d.mlp.lin1.%s"
+#define TN_SAM_FFN_DOWN     "v.sam.blk.%d.mlp.lin2.%s"
+#define TN_SAM_NECK         "v.sam.neck.%d.%s"
+#define TN_SAM_NET          "v.sam.net_%d.%s"
 
 // align x to upper multiple of n
 #define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n))
@@ -170,7 +170,7 @@ enum projector_type {
     PROJECTOR_TYPE_LIGHTONOCR,
     PROJECTOR_TYPE_COGVLM,
     PROJECTOR_TYPE_JANUS_PRO,
-    PROJECTOR_TYPE_DEEPSEEK_OCR,
+    PROJECTOR_TYPE_DEEPSEEKOCR,
     PROJECTOR_TYPE_UNKNOWN,
 };
 
@@ -197,7 +197,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
     { PROJECTOR_TYPE_LIGHTONOCR,"lightonocr"},
     { PROJECTOR_TYPE_COGVLM,    "cogvlm"},
     { PROJECTOR_TYPE_JANUS_PRO, "janus_pro"},
-    { PROJECTOR_TYPE_DEEPSEEK_OCR,"deepseek_orc"},
+    { PROJECTOR_TYPE_DEEPSEEKOCR,"deepseekocr"},
 };
 
 static projector_type clip_projector_type_from_string(const std::string & str) {
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index d94d05b2f2..5d4257ac84 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -682,8 +682,8 @@ struct clip_graph {
 
         const int enc_n_patches = enc_image_size / enc_patch_size; // 64
 
-        ggml_tensor * inpL = build_enc_inp(inp_raw, enc_patch_size, enc_image_size, enc_n_embd);
-        ggml_tensor * cur = ggml_add(ctx0, inpL, model.position_embeddings);
+        ggml_tensor * inpL = build_enc_inp(inp_raw, enc_patch_size, enc_n_patches, enc_n_embd);
+        ggml_tensor * cur = ggml_add(ctx0, inpL, model.pos_embed);
 
         // loop over layers
         for (int il = 0; il < _depth; il++) {
@@ -842,7 +842,7 @@ struct clip_graph {
 
         ggml_tensor * inp_raw = build_inp_raw();
 
-        ggml_tensor * global_features_1 = build_sam_enc(inp_raw);
+        ggml_tensor * global_features_1 = build_sam_enc(inp_raw, std::max(img.nx, img.ny));
 
         ggml_tensor * global_features_2 = build_dp_ocr_clip(inp_raw, global_features_1);
 
@@ -2862,6 +2862,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             {
                 res = graph.build_cogvlm();
             } break;
+        case PROJECTOR_TYPE_DEEPSEEKOCR:
+            {
+                res = graph.build_deepseek_ocr();
+            } break;
         default:
             {
                 res = graph.build_llava();
@@ -3187,6 +3191,11 @@ struct clip_model_loader {
                     hparams.ffn_op = FFN_GELU_ERF;
                     log_ffn_op = "gelu_erf"; // temporary solution for logging
                 } break;
+            case PROJECTOR_TYPE_DEEPSEEKOCR:
+                {
+                    hparams.set_limit_image_tokens(8, 1024);
+                    hparams.set_warmup_n_tokens(256); // avoid OOM on warmup
+                } break;
             default:
                 break;
         }
@@ -3574,7 +3583,7 @@ struct clip_model_loader {
                     model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
                     model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"));
                 } break;
-            case PROJECTOR_TYPE_DEEPSEEK_OCR:
+            case PROJECTOR_TYPE_DEEPSEEKOCR:
                 {
                     model.pos_embed = get_tensor(TN_SAM_POS_EMBD);
                     model.patch_embed_proj_w = get_tensor(string_format(TN_SAM_PATCH_EMBD, "weight"));
@@ -4830,7 +4839,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
                 }
             }
         } break;
-    case PROJECTOR_TYPE_DEEPSEEK_OCR:
+    case PROJECTOR_TYPE_DEEPSEEKOCR:
         {
             // configurable, or read from params
            const int min_num = 2;
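
Note for reviewers: the clip-impl.h hunks do two related renames. The SAM tensor names gain a "v." prefix, presumably to match the naming convention of the other vision tensors (compare "v.eoi" just above the block), and the projector string is fixed from the misspelled "deepseek_orc" to "deepseekocr", in sync with the enum rename PROJECTOR_TYPE_DEEPSEEK_OCR -> PROJECTOR_TYPE_DEEPSEEKOCR. Below is a minimal, self-contained sketch (not code from this patch) of how the projector string stored in GGUF metadata maps back to the enum; the real clip_projector_type_from_string() sits in clip-impl.h as shown in the diff context, and its table has many more entries. The main() driver is illustrative only.

// sketch.cpp -- trimmed-down model of the string -> enum lookup in
// tools/mtmd/clip-impl.h; the table here keeps only the entry this diff touches.
#include <cstdio>
#include <map>
#include <string>

enum projector_type {
    PROJECTOR_TYPE_DEEPSEEKOCR,
    PROJECTOR_TYPE_UNKNOWN,
};

static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
    { PROJECTOR_TYPE_DEEPSEEKOCR, "deepseekocr" }, // was the misspelled "deepseek_orc"
};

// Mirrors the shape of the real lookup: scan the table for a matching string.
static projector_type clip_projector_type_from_string(const std::string & str) {
    for (const auto & kv : PROJECTOR_TYPE_NAMES) {
        if (kv.second == str) {
            return kv.first;
        }
    }
    return PROJECTOR_TYPE_UNKNOWN;
}

int main() {
    // "deepseekocr" resolves; the old "deepseek_orc" string no longer does,
    // so mmproj GGUFs converted before this change would presumably need
    // to be re-converted.
    std::printf("%d\n", clip_projector_type_from_string("deepseekocr"));  // 0 (DEEPSEEKOCR)
    std::printf("%d\n", clip_projector_type_from_string("deepseek_orc")); // 1 (UNKNOWN)
}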