diff --git a/tools/mtmd/clip-model.h b/tools/mtmd/clip-model.h index 46577a2a4a..9a62e1980b 100644 --- a/tools/mtmd/clip-model.h +++ b/tools/mtmd/clip-model.h @@ -60,11 +60,11 @@ struct clip_hparams { std::unordered_set vision_feature_layer; int32_t attn_window_size = 0; int32_t n_wa_pattern = 0; - + // deepseek-ocr (sam) int32_t sam_n_layer = 0; - int32_t sam_n_head = 0; - int32_t sam_n_embd = 0; + int32_t sam_n_head = 0; + int32_t sam_n_embd = 0; // audio int32_t n_mel_bins = 0; // whisper preprocessor @@ -154,7 +154,7 @@ struct clip_layer { ggml_tensor * deepstack_fc1_b = nullptr; ggml_tensor * deepstack_fc2_w = nullptr; ggml_tensor * deepstack_fc2_b = nullptr; - + // sam rel_pos ggml_tensor * rel_pos_w = nullptr; ggml_tensor * rel_pos_h = nullptr; @@ -293,11 +293,11 @@ struct clip_model { ggml_tensor * mm_4h_to_h_w = nullptr; ggml_tensor * mm_boi = nullptr; ggml_tensor * mm_eoi = nullptr; - + // deepseek ocr sam ggml_tensor * patch_embed_proj_w = nullptr; ggml_tensor * patch_embed_proj_b = nullptr; - ggml_tensor * pos_embed = nullptr; + ggml_tensor * pos_embed = nullptr; ggml_tensor * neck_0_w; ggml_tensor * neck_1_w; diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 507c2d3407..e86f31f6b4 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -753,8 +753,6 @@ ggml_tensor * clip_graph::build_patch_merge_permute(ggml_tensor * cur, int scale return cur; } - - static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch & imgs) { GGML_ASSERT(imgs.entries.size() == 1 && "n_batch > 1 is not supported");