diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 8ba3788814..db119a2770 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -6032,6 +6032,7 @@ class DeepseekOCRVisionModel(MmprojModel):
         sam_hparams = hparams['sam']
         self.gguf_writer.add_vision_sam_layers_count(sam_hparams['layers'])
         self.gguf_writer.add_vision_sam_embedding_length(sam_hparams['width'])
+        self.gguf_writer.add_vision_sam_head_count(sam_hparams['heads'])
 
     def get_vision_config(self) -> dict[str, Any]:
         vision_config: dict[str, Any] | None = self.global_config.get("vision_config")
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 9fa88bab2b..c8a1ea805f 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -306,6 +306,7 @@ class Keys:
         class SAM:
             BLOCK_COUNT      = "clip.vision.sam.block_count"
             EMBEDDING_LENGTH = "clip.vision.sam.embedding_length"
+            HEAD_COUNT       = "clip.vision.sam.head_count"
 
     class ClipAudio:
         NUM_MEL_BINS = "clip.audio.num_mel_bins"
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index 95de763628..1e10af5f9f 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -1135,6 +1135,9 @@ class GGUFWriter:
 
     def add_vision_sam_embedding_length(self, value: int) -> None:
         self.add_uint32(Keys.ClipVision.SAM.EMBEDDING_LENGTH, value)
+
+    def add_vision_sam_head_count(self, value: int) -> None:
+        self.add_uint32(Keys.ClipVision.SAM.HEAD_COUNT, value)
 
     # audio models
 
diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h
index b8bf5ac899..a90e67f360 100644
--- a/tools/mtmd/clip-impl.h
+++ b/tools/mtmd/clip-impl.h
@@ -49,7 +49,9 @@
 #define KEY_ATTN_WINDOW_SIZE    "clip.vision.window_size"
 #define KEY_MINICPMV_VERSION    "clip.minicpmv_version"
 #define KEY_MINICPMV_QUERY_NUM  "clip.minicpmv_query_num"
-
+#define KEY_SAM_N_HEAD          "clip.vision.sam.head_count"
+#define KEY_SAM_N_BLOCK         "clip.vision.sam.block_count"
+#define KEY_SAM_N_EMBD          "clip.vision.sam.embedding_length"
 // audio-specific
 #define KEY_AUDIO_PROJ_TYPE     "clip.audio.projector_type" // for models with mixed modalities
 #define KEY_A_NUM_MEL_BINS      "clip.audio.num_mel_bins"
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 02526da02a..bca08b745f 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -193,6 +193,11 @@ struct clip_hparams {
     int32_t attn_window_size = 0;
     int32_t n_wa_pattern = 0;
 
+    // deepseek-ocr (sam)
+    int32_t sam_n_layer = 0;
+    int32_t sam_n_head = 0;
+    int32_t sam_n_embd = 0;
+
     // audio
     int32_t n_mel_bins = 0; // whisper preprocessor
     int32_t proj_stack_factor = 0; // ultravox
@@ -2676,9 +2681,9 @@ private:
     }
 
     ggml_tensor * build_sam(ggml_tensor * inp_raw) {
-        const int n_embd = 768;
-        const int _depth = 12;
-        const int n_heads = 12;
+        const int n_embd = hparams.sam_n_embd;
+        const int n_layer = hparams.sam_n_layer;
+        const int n_heads = hparams.sam_n_head;
         const int d_heads = n_embd / n_heads;
 
         const int window = hparams.attn_window_size;
@@ -2721,7 +2726,7 @@ private:
         }
 
         // loop over layers
-        for (int il = 0; il < _depth; il++) {
+        for (int il = 0; il < n_layer; il++) {
             auto & layer = model.sam_layers[il];
 
             ggml_tensor * shortcut = cur;
@@ -3286,6 +3291,10 @@
                     hparams.patch_size = 16;
                     hparams.image_size = 1024;
                     hparams.warmup_image_size = 1024;
+
+                    get_u32(KEY_SAM_N_BLOCK, hparams.sam_n_layer, true);
+                    get_u32(KEY_SAM_N_HEAD, hparams.sam_n_head, true);
+                    get_u32(KEY_SAM_N_EMBD, hparams.sam_n_embd, true);
                     get_u32(KEY_ATTN_WINDOW_SIZE, hparams.attn_window_size, true);
                 } break;
             default:
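As a quick sanity check after running the converter, the three SAM keys can be read back with gguf-py's `GGUFReader`. This is a minimal sketch, assuming gguf-py from this branch is importable; the mmproj file name is illustrative:

```python
# Minimal sketch: read back the SAM metadata the converter now writes.
# Assumes gguf-py from this branch; the file name below is illustrative.
from gguf import GGUFReader

reader = GGUFReader("mmproj-deepseek-ocr.gguf")

for key in ("clip.vision.sam.block_count",
            "clip.vision.sam.embedding_length",
            "clip.vision.sam.head_count"):
    field = reader.get_field(key)
    if field is None:
        print(f"{key}: MISSING")
    else:
        # for a scalar uint32 field, the last part holds the value
        print(f"{key}: {int(field.parts[-1][0])}")
```

Note that the C++ loader passes `true` (the `required` flag) to `get_u32` for all three keys, so an mmproj file converted before this change should be rejected at load time rather than silently falling back to the previously hard-coded 768/12/12 SAM dimensions.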